static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
                                        const uint8_t* top) {
  if (left != NULL) {
    if (top != NULL) {
      int j;
      v8i16 d1, d2;
      const v16i8 zero = { 0 };
      const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
      const v16u8 T = LD_UB(top);
      ILVRL_B2_SH(zero, T, d1, d2);
      SUB2(d1, TL, d2, TL, d1, d2);
      for (j = 0; j < 16; j += 4) {
        v16i8 t0, t1, t2, t3;
        v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
        const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]);
        const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]);
        const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]);
        const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]);
        ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
        ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
        CLIP_SH4_0_255(r0, r1, r2, r3);
        CLIP_SH4_0_255(r4, r5, r6, r7);
        PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
        ST_SB4(t0, t1, t2, t3, dst, BPS);
        dst += 4 * BPS;
      }
    } else {
      HorizontalPred16x16(dst, left);
    }
  } else {
    if (top != NULL) {
      VerticalPred16x16(dst, top);
    } else {
      const v16u8 out = (v16u8)__msa_fill_b(0x81);
      STORE16x16(out, dst);
    }
  }
}
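/* A scalar reference sketch of the TrueMotion rule the MSA kernel above
 * vectorizes: dst[y][x] = clip(left[y] + top[x] - left[-1]).
 * TrueMotion16x16_C and Clip8 are illustrative names, not from the
 * original source; BPS is the row stride used throughout this file.
 * The one-sided contexts fall back to HorizontalPred16x16 /
 * VerticalPred16x16 (or the 0x81 fill) as in the branches above and
 * are omitted here. */
static WEBP_INLINE uint8_t Clip8(int v) {
  return (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v;
}

static void TrueMotion16x16_C(uint8_t* dst, const uint8_t* left,
                              const uint8_t* top) {
  int x, y;
  const int top_left = left[-1];
  for (y = 0; y < 16; ++y) {
    for (x = 0; x < 16; ++x) {
      dst[y * BPS + x] = Clip8(left[y] + top[x] - top_left);
    }
  }
}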
static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
{
    uint8_t *temp_dst = dst;
    uint64_t dst0, dst1, dst2, dst3;
    v2i64 dst_vec0 = { 0 };
    v2i64 dst_vec1 = { 0 };
    v8i16 dst_r0, dst_l0, dst_r1, dst_l1;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16u8 zeros = { 0 };

    LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
    temp_dst += (4 * stride);

    INSERT_D2_SD(dst0, dst1, dst_vec0);
    INSERT_D2_SD(dst2, dst3, dst_vec1);
    ILVRL_B2_SH(zeros, dst_vec0, dst_r0, dst_l0);
    ILVRL_B2_SH(zeros, dst_vec1, dst_r1, dst_l1);
    ADD4(dst_r0, in0, dst_l0, in1, dst_r1, in2, dst_l1, in3,
         dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
    ST8x4_UB(dst_r0, dst_r1, dst, stride);
    dst += (4 * stride);

    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
    INSERT_D2_SD(dst0, dst1, dst_vec0);
    INSERT_D2_SD(dst2, dst3, dst_vec1);
    UNPCK_UB_SH(dst_vec0, dst_r0, dst_l0);
    UNPCK_UB_SH(dst_vec1, dst_r1, dst_l1);
    ADD4(dst_r0, in4, dst_l0, in5, dst_r1, in6, dst_l1, in7,
         dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
    ST8x4_UB(dst_r0, dst_r1, dst, stride);
}
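/* A minimal scalar sketch of what hevc_addblk_8x8_msa computes: add a
 * size x size block of 16-bit residuals to the destination pixels and
 * clip the sums to [0, 255]. hevc_addblk_c is a hypothetical name for
 * illustration; the same rule is what the 32x32 variant further below
 * implements with software pipelining. */
static void hevc_addblk_c(int16_t *coeffs, uint8_t *dst, int32_t stride,
                          int32_t size)
{
    int32_t x, y;

    for (y = 0; y < size; y++) {
        for (x = 0; x < size; x++) {
            const int32_t sum = dst[x] + coeffs[x];

            dst[x] = (sum < 0) ? 0 : (sum > 255) ? 255 : (uint8_t)sum;
        }
        coeffs += size;
        dst += stride;
    }
}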
static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
  const v16i8 zero = { 0 };
  const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);
  const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);
  const v8i16 L1 = (v8i16)__msa_fill_h(top[-3]);
  const v8i16 L2 = (v8i16)__msa_fill_h(top[-4]);
  const v8i16 L3 = (v8i16)__msa_fill_h(top[-5]);
  const v16u8 T1 = LD_UB(top);
  const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
  const v8i16 d = T - TL;
  v8i16 r0, r1, r2, r3;
  ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
  CLIP_SH4_0_255(r0, r1, r2, r3);
  PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
}
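/* Scalar sketch of TM4: the same TrueMotion rule on a 4x4 block. The
 * L0..L3 fills above read top[-2]..top[-5], i.e. in this context-buffer
 * layout the left column is stored in reverse order just before the
 * top-left sample, so left[y] == top[-2 - y]. TM4_C is an illustrative
 * name, not from the original source. */
static WEBP_INLINE void TM4_C(uint8_t* dst, const uint8_t* top) {
  int x, y;
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      const int v = top[-2 - y] + top[x] - top[-1];
      dst[y * BPS + x] = (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v;
    }
  }
}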
static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src,
                                  int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
    v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
    v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
    v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
    v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 zeros = { 0 };

    p_src[ 0 ] += 32;
    LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );

    vec0 = src0 + src4;
    vec1 = src0 - src4;
    vec2 = src2 >> 1;
    vec2 = vec2 - src6;
    vec3 = src6 >> 1;
    vec3 = src2 + vec3;
    BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 );

    vec0 = src7 >> 1;
    vec0 = src5 - vec0 - src3 - src7;
    vec1 = src3 >> 1;
    vec1 = src1 - vec1 + src7 - src3;
    vec2 = src5 >> 1;
    vec2 = vec2 - src1 + src7 + src5;
    vec3 = src1 >> 1;
    vec3 = vec3 + src3 + src5 + src1;
    tmp4 = vec3 >> 2;
    tmp4 += vec0;
    tmp5 = vec2 >> 2;
    tmp5 += vec1;
    tmp6 = vec1 >> 2;
    tmp6 -= vec2;
    tmp7 = vec0 >> 2;
    tmp7 = vec3 - tmp7;

    BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                 res0, res1, res2, res3, res4, res5, res6, res7 );
    TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7,
                        res0, res1, res2, res3, res4, res5, res6, res7 );
    UNPCK_SH_SW( res0, tmp0_r, tmp0_l );
    UNPCK_SH_SW( res1, tmp1_r, tmp1_l );
    UNPCK_SH_SW( res2, tmp2_r, tmp2_l );
    UNPCK_SH_SW( res3, tmp3_r, tmp3_l );
    UNPCK_SH_SW( res4, tmp4_r, tmp4_l );
    UNPCK_SH_SW( res5, tmp5_r, tmp5_l );
    UNPCK_SH_SW( res6, tmp6_r, tmp6_l );
    UNPCK_SH_SW( res7, tmp7_r, tmp7_l );
    BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r,
                 vec0_r, vec0_l, vec1_l, vec1_r );

    vec2_r = tmp2_r >> 1;
    vec2_l = tmp2_l >> 1;
    vec2_r -= tmp6_r;
    vec2_l -= tmp6_l;
    vec3_r = tmp6_r >> 1;
    vec3_l = tmp6_l >> 1;
    vec3_r += tmp2_r;
    vec3_l += tmp2_l;
    BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r,
                 tmp0_r, tmp2_r, tmp4_r, tmp6_r );
    BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l,
                 tmp0_l, tmp2_l, tmp4_l, tmp6_l );

    vec0_r = tmp7_r >> 1;
    vec0_l = tmp7_l >> 1;
    vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
    vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
    vec1_r = tmp3_r >> 1;
    vec1_l = tmp3_l >> 1;
    vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
    vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
    vec2_r = tmp5_r >> 1;
    vec2_l = tmp5_l >> 1;
    vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
    vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
    vec3_r = tmp1_r >> 1;
    vec3_l = tmp1_l >> 1;
    vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
    vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
    tmp1_r = vec3_r >> 2;
    tmp1_l = vec3_l >> 2;
    tmp1_r += vec0_r;
    tmp1_l += vec0_l;
    tmp3_r = vec2_r >> 2;
    tmp3_l = vec2_l >> 2;
    tmp3_r += vec1_r;
    tmp3_l += vec1_l;
    tmp5_r = vec1_r >> 2;
    tmp5_l = vec1_l >> 2;
    tmp5_r -= vec2_r;
    tmp5_l -= vec2_l;
    tmp7_r = vec0_r >> 2;
    tmp7_l = vec0_l >> 2;
    tmp7_r = vec3_r - tmp7_r;
    tmp7_l = vec3_l - tmp7_l;

    BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r,
                 res0_r, res0_l, res7_l, res7_r );
    BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r,
                 res1_r, res1_l, res6_l, res6_r );
    BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r,
                 res2_r, res2_l, res5_l, res5_r );
    BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r,
                 res3_r, res3_l, res4_l, res4_r );
    SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 );
    SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 );
    SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 );
    SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 );
    PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r,
                 res2_l, res2_r, res3_l, res3_r,
                 res0, res1, res2, res3 );
    PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r,
                 res6_l, res6_r, res7_l, res7_r,
                 res4, res5, res6, res7 );
    LD_SB8( p_dst, i_dst_stride,
            dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7 );
    ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
                tmp0, tmp1, tmp2, tmp3 );
    ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
                tmp4, tmp5, tmp6, tmp7 );
    ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
          res0, res1, res2, res3 );
    ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
          res4, res5, res6, res7 );
    CLIP_SH4_0_255( res0, res1, res2, res3 );
    CLIP_SH4_0_255( res4, res5, res6, res7 );
    PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
                 dst0, dst1, dst2, dst3 );
    ST8x4_UB( dst0, dst1, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    ST8x4_UB( dst2, dst3, p_dst, i_dst_stride );
}
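/* A scalar sketch of the 1-D 8-point H.264 inverse transform that both
 * passes above implement: the row pass in 16-bit lanes before the
 * transpose, and the column pass in 32-bit _r/_l halves after it. The
 * even part is two nested butterflies on b[0,2,4,6]; the odd part
 * combines b[1,3,5,7] with >>1 and >>2 terms. avc_idct8_1d_c is an
 * illustrative name, not from the original source. The 2-D transform
 * applies this to each row, then each column, then adds (x >> 6) to the
 * prediction with clipping; the +32 rounding bias is folded into
 * p_src[ 0 ] above. */
static void avc_idct8_1d_c( int32_t b[8] )
{
    const int32_t a0 = b[0] + b[4];
    const int32_t a1 = b[0] - b[4];
    const int32_t a2 = ( b[2] >> 1 ) - b[6];
    const int32_t a3 = b[2] + ( b[6] >> 1 );
    /* even-part butterfly, as in the first BUTTERFLY_4 above */
    const int32_t e0 = a0 + a3, e1 = a1 + a2;
    const int32_t e2 = a1 - a2, e3 = a0 - a3;
    /* odd part, matching the vec0..vec3 expressions above */
    const int32_t o0 = b[5] - ( b[7] >> 1 ) - b[3] - b[7];
    const int32_t o1 = b[1] - ( b[3] >> 1 ) + b[7] - b[3];
    const int32_t o2 = ( b[5] >> 1 ) - b[1] + b[7] + b[5];
    const int32_t o3 = ( b[1] >> 1 ) + b[3] + b[5] + b[1];
    const int32_t f4 = o0 + ( o3 >> 2 );
    const int32_t f5 = o1 + ( o2 >> 2 );
    const int32_t f6 = ( o1 >> 2 ) - o2;
    const int32_t f7 = o3 - ( o0 >> 2 );
    /* final butterfly, as in BUTTERFLY_8 above */
    b[0] = e0 + f7;  b[7] = e0 - f7;
    b[1] = e1 + f6;  b[6] = e1 - f6;
    b[2] = e2 + f5;  b[5] = e2 - f5;
    b[3] = e3 + f4;  b[4] = e3 - f4;
}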
static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
{
    uint8_t loop_cnt;
    uint8_t *temp_dst = dst;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    /* Pre-load for next iteration */
    LD_UB2(temp_dst, 16, dst4, dst5);
    temp_dst += stride;
    LD_UB2(temp_dst, 16, dst6, dst7);
    temp_dst += stride;
    LD_SH4(coeffs, 16, in0, in2, in4, in6);
    LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
    coeffs += 64;

    for (loop_cnt = 14; loop_cnt--;) {
        UNPCK_UB_SH(dst4, dst_r0, dst_l0);
        UNPCK_UB_SH(dst5, dst_r1, dst_l1);
        UNPCK_UB_SH(dst6, dst_r2, dst_l2);
        UNPCK_UB_SH(dst7, dst_r3, dst_l3);

        dst_r0 += in0;
        dst_l0 += in1;
        dst_r1 += in2;
        dst_l1 += in3;
        dst_r2 += in4;
        dst_l2 += in5;
        dst_r3 += in6;
        dst_l3 += in7;

        /* Pre-load for next iteration */
        LD_UB2(temp_dst, 16, dst4, dst5);
        temp_dst += stride;
        LD_UB2(temp_dst, 16, dst6, dst7);
        temp_dst += stride;
        LD_SH4(coeffs, 16, in0, in2, in4, in6);
        LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
        coeffs += 64;

        CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
        CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
        PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                    dst_r3, dst0, dst1, dst2, dst3);
        ST_UB2(dst0, dst1, dst, 16);
        dst += stride;
        ST_UB2(dst2, dst3, dst, 16);
        dst += stride;
    }

    UNPCK_UB_SH(dst4, dst_r0, dst_l0);
    UNPCK_UB_SH(dst5, dst_r1, dst_l1);
    UNPCK_UB_SH(dst6, dst_r2, dst_l2);
    UNPCK_UB_SH(dst7, dst_r3, dst_l3);

    dst_r0 += in0;
    dst_l0 += in1;
    dst_r1 += in2;
    dst_l1 += in3;
    dst_r2 += in4;
    dst_l2 += in5;
    dst_r3 += in6;
    dst_l3 += in7;

    /* Pre-load for next iteration */
    LD_UB2(temp_dst, 16, dst4, dst5);
    temp_dst += stride;
    LD_UB2(temp_dst, 16, dst6, dst7);
    temp_dst += stride;
    LD_SH4(coeffs, 16, in0, in2, in4, in6);
    LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);

    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
    PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                dst_r3, dst0, dst1, dst2, dst3);
    ST_UB2(dst0, dst1, dst, 16);
    dst += stride;
    ST_UB2(dst2, dst3, dst, 16);
    dst += stride;

    UNPCK_UB_SH(dst4, dst_r0, dst_l0);
    UNPCK_UB_SH(dst5, dst_r1, dst_l1);
    UNPCK_UB_SH(dst6, dst_r2, dst_l2);
    UNPCK_UB_SH(dst7, dst_r3, dst_l3);

    dst_r0 += in0;
    dst_l0 += in1;
    dst_r1 += in2;
    dst_l1 += in3;
    dst_r2 += in4;
    dst_l2 += in5;
    dst_r3 += in6;
    dst_l3 += in7;

    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
    PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                dst_r3, dst0, dst1, dst2, dst3);
    ST_UB2(dst0, dst1, dst, 16);
    dst += stride;
    ST_UB2(dst2, dst3, dst, 16);
}
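/* Note on the structure above: hevc_addblk_32x32_msa applies the same
 * add-and-clip rule as the scalar hevc_addblk_c sketch earlier, but is
 * software-pipelined to hide load latency. Each step handles two
 * 32-pixel rows (64 residuals): the prologue loads the first pair of
 * rows, each of the 14 loop iterations stores the pair loaded on the
 * previous step while pre-loading the next pair, and the two epilogue
 * blocks drain the pipeline, for 14 * 2 + 2 + 2 = 32 rows in total. */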