static void avc_idct4x4dc_msa( int16_t *p_src, int32_t i_src_stride,
                               int16_t *p_dst, int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3;
    v4i32 src0_r, src1_r, src2_r, src3_r;
    v4i32 hres0, hres1, hres2, hres3;
    v8i16 vres0, vres1, vres2, vres3;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v2i64 res0, res1;

    LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
    UNPCK_R_SH_SW( src0, src0_r );
    UNPCK_R_SH_SW( src1, src1_r );
    UNPCK_R_SH_SW( src2, src2_r );
    UNPCK_R_SH_SW( src3, src3_r );
    BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, vec0, vec3, vec2, vec1 );
    BUTTERFLY_4( vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1 );
    TRANSPOSE4x4_SW_SW( hres0, hres1, hres2, hres3,
                        hres0, hres1, hres2, hres3 );
    BUTTERFLY_4( hres0, hres2, hres3, hres1, vec0, vec3, vec2, vec1 );
    BUTTERFLY_4( vec0, vec1, vec2, vec3, vec4, vec7, vec6, vec5 );
    PCKEV_H4_SH( vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
                 vres0, vres1, vres2, vres3 );
    PCKOD_D2_SD( vres1, vres0, vres3, vres2, res0, res1 );
    ST8x4_UB( res0, res1, p_dst, i_dst_stride * 2 );
}

static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst,
                              int32_t i_src_stride )
{
    v8i16 src0, src1, src2, src3, ver_res0, ver_res1, ver_res2, ver_res3;
    v4i32 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
    v4i32 hor_res0, hor_res1, hor_res2, hor_res3;
    v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r;

    LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
    UNPCK_R_SH_SW( src0, src0_r );
    UNPCK_R_SH_SW( src1, src1_r );
    UNPCK_R_SH_SW( src2, src2_r );
    UNPCK_R_SH_SW( src3, src3_r );
    BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, tmp0, tmp3, tmp2, tmp1 );
    BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
                 hor_res0, hor_res3, hor_res2, hor_res1 );
    TRANSPOSE4x4_SW_SW( hor_res0, hor_res1, hor_res2, hor_res3,
                        hor_res0, hor_res1, hor_res2, hor_res3 );
    BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1,
                 tmp0, tmp3, tmp2, tmp1 );
    BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
                 ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r );
    SRARI_W4_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 );
    PCKEV_H4_SH( ver_res0_r, ver_res0_r, ver_res1_r, ver_res1_r,
                 ver_res2_r, ver_res2_r, ver_res3_r, ver_res3_r,
                 ver_res0, ver_res1, ver_res2, ver_res3 );
    PCKOD_D2_SH( ver_res1, ver_res0, ver_res3, ver_res2,
                 ver_res0, ver_res2 );
    ST_SH2( ver_res0, ver_res2, p_dst, 8 );
}
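
/* Reference only: a minimal scalar sketch of the 4x4 DC Hadamard transform
 * that avc_dct4x4dc_msa() above vectorizes (butterflies, transpose,
 * butterflies, then a rounded halving that corresponds to the
 * SRARI_W4_SW(..., 1) step). avc_idct4x4dc_msa() applies the same butterfly
 * network without that final rounding. dct4x4dc_ref() and its flat d[16]
 * layout are illustrative assumptions, not part of this file's API; the
 * strided load/store handling of the vector versions is omitted here. */
static void dct4x4dc_ref( int16_t d[16] )
{
    int16_t tmp[16];
    for( int i = 0; i < 4; i++ )
    {
        int s01 = d[i*4+0] + d[i*4+1];
        int d01 = d[i*4+0] - d[i*4+1];
        int s23 = d[i*4+2] + d[i*4+3];
        int d23 = d[i*4+2] - d[i*4+3];
        /* first pass, written transposed so the second pass reads rows */
        tmp[0*4+i] = s01 + s23;
        tmp[1*4+i] = s01 - s23;
        tmp[2*4+i] = d01 - d23;
        tmp[3*4+i] = d01 + d23;
    }
    for( int i = 0; i < 4; i++ )
    {
        int s01 = tmp[i*4+0] + tmp[i*4+1];
        int d01 = tmp[i*4+0] - tmp[i*4+1];
        int s23 = tmp[i*4+2] + tmp[i*4+3];
        int d23 = tmp[i*4+2] - tmp[i*4+3];
        /* rounded halving; the inverse transform omits this step */
        d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
        d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
        d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
        d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
    }
}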

static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  uint64_t out0, out1, out2, out3;
  uint32_t in0, in1, in2, in3;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  v8i16 t0, t1, t2, t3;
  v16u8 srcl0, srcl1, src0, src1;
  const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
  const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
  const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
  const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
  const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };

  LW4(src, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src0);
  LW4(ref, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src1);
  ILVRL_B2_UB(src0, src1, srcl0, srcl1);
  HSUB_UB2_SH(srcl0, srcl1, t0, t1);
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  t0 = SRLI_H(t0, 3);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  FILL_W2_SW(1812, 937, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 9);
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
  SRAI_W2_SW(tmp0, tmp2, 4);
  FILL_W2_SW(12000, 51000, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 16);
  UNPCK_R_SH_SW(t1, tmp4);
  tmp5 = __msa_ceqi_w(tmp4, 0);
  tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
  tmp5 = __msa_fill_w(1);
  tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
  tmp1 += tmp5;
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
  out0 = __msa_copy_s_d((v2i64)t0, 0);
  out1 = __msa_copy_s_d((v2i64)t0, 1);
  out2 = __msa_copy_s_d((v2i64)t1, 0);
  out3 = __msa_copy_s_d((v2i64)t1, 1);
  SD4(out0, out1, out2, out3, out, 8);
}
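
/* Reference only: a scalar sketch of the 4x4 forward transform that the
 * vector kernel above corresponds to, written from the constants visible in
 * it (the 2217/5352 multipliers, the 1812/937 and 12000/51000 rounders, the
 * 9- and 16-bit shifts, the +7 >> 4 step, and the +1 correction applied when
 * the low-frequency term is nonzero, which the ceqi/nor/and sequence
 * implements). FTransform_ref() is an illustrative name, not part of the
 * library API; BPS is assumed to be the prediction-buffer stride already
 * used by the vector code. This is a sketch of the transform, not a claim
 * about the exact instruction-level mapping. */
static void FTransform_ref(const uint8_t* src, const uint8_t* ref,
                           int16_t* out) {
  int i;
  int tmp[16];
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
    const int d0 = src[0] - ref[0];   /* pixel differences, horizontal pass */
    const int d1 = src[1] - ref[1];
    const int d2 = src[2] - ref[2];
    const int d3 = src[3] - ref[3];
    const int a0 = d0 + d3;
    const int a1 = d1 + d2;
    const int a2 = d1 - d2;
    const int a3 = d0 - d3;
    tmp[0 + i * 4] = (a0 + a1) * 8;
    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;
    tmp[2 + i * 4] = (a0 - a1) * 8;
    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 937) >> 9;
  }
  for (i = 0; i < 4; ++i) {           /* vertical pass */
    const int a0 = tmp[0 + i] + tmp[12 + i];
    const int a1 = tmp[4 + i] + tmp[8 + i];
    const int a2 = tmp[4 + i] - tmp[8 + i];
    const int a3 = tmp[0 + i] - tmp[12 + i];
    out[0 + i]  = (a0 + a1 + 7) >> 4;
    out[4 + i]  = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
    out[8 + i]  = (a0 - a1 + 7) >> 4;
    out[12 + i] = (a3 * 2217 - a2 * 5352 + 51000) >> 16;
  }
}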

void aom_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
                            int32_t dst_stride) {
  v8i16 in0, in1, in2, in3;
  v4i32 in0_r, in1_r, in2_r, in3_r, in4_r;

  /* load vector elements of 4x4 block */
  LD4x4_SH(input, in0, in2, in3, in1);
  TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
  UNPCK_R_SH_SW(in0, in0_r);
  UNPCK_R_SH_SW(in2, in2_r);
  UNPCK_R_SH_SW(in3, in3_r);
  UNPCK_R_SH_SW(in1, in1_r);
  SRA_4V(in0_r, in1_r, in2_r, in3_r, UNIT_QUANT_SHIFT);

  in0_r += in2_r;
  in3_r -= in1_r;
  in4_r = (in0_r - in3_r) >> 1;
  in1_r = in4_r - in1_r;
  in2_r = in4_r - in2_r;
  in0_r -= in1_r;
  in3_r += in2_r;

  TRANSPOSE4x4_SW_SW(in0_r, in1_r, in2_r, in3_r, in0_r, in1_r, in2_r, in3_r);

  in0_r += in1_r;
  in2_r -= in3_r;
  in4_r = (in0_r - in2_r) >> 1;
  in3_r = in4_r - in3_r;
  in1_r = in4_r - in1_r;
  in0_r -= in3_r;
  in2_r += in1_r;

  PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0,
              in1, in2, in3);
  ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride);
}
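
/* Reference only: a scalar sketch of the inverse 4x4 Walsh-Hadamard
 * transform implemented by aom_iwht4x4_16_add_msa() above. It follows the
 * same butterfly steps (a += c; d -= b; e = (a - d) >> 1; ...) on rows and
 * then columns, with the UNIT_QUANT_SHIFT descaling and a clipped add into
 * the destination block. iwht4x4_16_add_ref() and clip_uint8() are
 * illustrative helpers, not part of the library API, and the 16-bit
 * intermediate wrapping of the library reference is omitted. */
static uint8_t clip_uint8(int32_t v) {
  return (uint8_t)((v < 0) ? 0 : (v > 255) ? 255 : v);
}

static void iwht4x4_16_add_ref(const int16_t *input, uint8_t *dst,
                               int32_t dst_stride) {
  int32_t tmp[16];
  int32_t i;

  /* row pass */
  for (i = 0; i < 4; ++i) {
    int32_t a = input[4 * i + 0] >> UNIT_QUANT_SHIFT;
    int32_t c = input[4 * i + 1] >> UNIT_QUANT_SHIFT;
    int32_t d = input[4 * i + 2] >> UNIT_QUANT_SHIFT;
    int32_t b = input[4 * i + 3] >> UNIT_QUANT_SHIFT;
    int32_t e;
    a += c;
    d -= b;
    e = (a - d) >> 1;
    b = e - b;
    c = e - c;
    a -= b;
    d += c;
    tmp[4 * i + 0] = a;
    tmp[4 * i + 1] = b;
    tmp[4 * i + 2] = c;
    tmp[4 * i + 3] = d;
  }

  /* column pass, added into the destination block */
  for (i = 0; i < 4; ++i) {
    int32_t a = tmp[4 * 0 + i];
    int32_t c = tmp[4 * 1 + i];
    int32_t d = tmp[4 * 2 + i];
    int32_t b = tmp[4 * 3 + i];
    int32_t e;
    a += c;
    d -= b;
    e = (a - d) >> 1;
    b = e - b;
    c = e - c;
    a -= b;
    d += c;
    dst[dst_stride * 0 + i] = clip_uint8(dst[dst_stride * 0 + i] + a);
    dst[dst_stride * 1 + i] = clip_uint8(dst[dst_stride * 1 + i] + b);
    dst[dst_stride * 2 + i] = clip_uint8(dst[dst_stride * 2 + i] + c);
    dst[dst_stride * 3 + i] = clip_uint8(dst[dst_stride * 3 + i] + d);
  }
}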