void vpx_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v4i32 vec_w;

  // Load the eight 8x16-bit rows of the block.
  LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7);
  // Reduce eight rows to two with pairwise vector adds.
  ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
  ADD2(in0, in2, in4, in6, in0, in4);
  // Pairwise-add adjacent 16-bit lanes into 32-bit accumulators.
  vec_w = __msa_hadd_s_w(in0, in0);
  vec_w += __msa_hadd_s_w(in4, in4);
  // The DC coefficient is the sum of all 64 samples; the rest stay zero.
  out[0] = HADD_SW_S32(vec_w);
  out[1] = 0;
}
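/* For reference, a minimal scalar sketch of what the vector code above
 * computes (fdct8x8_1_ref is a hypothetical name for illustration only):
 * the "DC-only" 8x8 forward transform reduces to summing all 64 samples. */
static void fdct8x8_1_ref(const int16_t *input, int16_t *out, int32_t stride) {
  int32_t sum = 0;
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) sum += input[r * stride + c];
  }
  out[0] = (int16_t)sum;  /* DC coefficient */
  out[1] = 0;             /* remaining coefficients are implicitly zero */
}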
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  uint64_t out0, out1, out2, out3;
  uint32_t in0, in1, in2, in3;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  v8i16 t0, t1, t2, t3;
  v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
  // mask0/mask1 transpose the 4x4 residual block; mask2/mask3 interleave
  // lanes for the horizontal add/sub and dot-product steps below.
  const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
  const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
  const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
  // Interleaved DCT constants for the odd-coefficient dot products.
  const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
  const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };

  // Load the 4x4 source and reference blocks, one 32-bit row at a time.
  LW4(src, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src0);
  LW4(ref, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src1);
  // Interleave src/ref bytes, then horizontal-subtract to get the
  // 16-bit residuals d = src - ref.
  ILVRL_B2_UB(src0, src1, srcl0, srcl1);
  HSUB_UB2_SH(srcl0, srcl1, t0, t1);
  // Horizontal pass.
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  t0 = SLLI_H(t0, 3);  // scale (d0 + d3) and (d1 + d2) by 8
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  FILL_W2_SW(1812, 937, tmp1, tmp3);  // rounders for the odd coefficients
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 9);
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
  // Vertical pass.
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);  // round the even coefficients
  SRAI_W2_SW(tmp0, tmp2, 4);
  FILL_W2_SW(12000, 51000, tmp1, tmp3);  // rounders for the odd coefficients
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 16);
  // Add 1 to each second-row coefficient whose difference term is non-zero.
  UNPCK_R_SH_SW(t1, tmp4);
  tmp5 = __msa_ceqi_w(tmp4, 0);
  tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
  tmp5 = __msa_fill_w(1);
  tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
  tmp1 += tmp5;
  // Pack back to 16 bits and store the four output rows.
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
  out0 = __msa_copy_s_d((v2i64)t0, 0);
  out1 = __msa_copy_s_d((v2i64)t0, 1);
  out2 = __msa_copy_s_d((v2i64)t1, 0);
  out3 = __msa_copy_s_d((v2i64)t1, 1);
  SD4(out0, out1, out2, out3, out, 8);
}
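/* A scalar sketch of the 4x4 forward transform above, mirroring the plain-C
 * reference that ships alongside these SIMD versions. FTransform_ref is a
 * hypothetical name for illustration; BPS is the same prediction-buffer
 * stride used by the vector code. It shows where the 2217/5352 constants
 * and the 1812/937/12000/51000 rounders fit. */
static void FTransform_ref(const uint8_t* src, const uint8_t* ref,
                           int16_t* out) {
  int i;
  int tmp[16];
  /* Horizontal pass on the src - ref residuals. */
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
    const int d0 = src[0] - ref[0];
    const int d1 = src[1] - ref[1];
    const int d2 = src[2] - ref[2];
    const int d3 = src[3] - ref[3];
    const int a0 = d0 + d3, a1 = d1 + d2;
    const int a2 = d1 - d2, a3 = d0 - d3;
    tmp[0 + i * 4] = (a0 + a1) * 8;
    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;
    tmp[2 + i * 4] = (a0 - a1) * 8;
    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 937) >> 9;
  }
  /* Vertical pass, with final rounding and the (a3 != 0) correction. */
  for (i = 0; i < 4; ++i) {
    const int a0 = tmp[0 + i] + tmp[12 + i];
    const int a1 = tmp[4 + i] + tmp[8 + i];
    const int a2 = tmp[4 + i] - tmp[8 + i];
    const int a3 = tmp[0 + i] - tmp[12 + i];
    out[0 + i] = (a0 + a1 + 7) >> 4;
    out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
    out[8 + i] = (a0 - a1 + 7) >> 4;
    out[12 + i] = (a3 * 2217 - a2 * 5352 + 51000) >> 16;
  }
}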
void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
  int sum, i;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v4i32 vec_w = { 0 };

  // Accumulate four rows per iteration; each LD_SH2 pair loads one
  // full 16-sample row.
  for (i = 0; i < 4; ++i) {
    LD_SH2(input, 8, in0, in1);
    input += stride;
    LD_SH2(input, 8, in2, in3);
    input += stride;
    LD_SH2(input, 8, in4, in5);
    input += stride;
    LD_SH2(input, 8, in6, in7);
    input += stride;
    ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
    ADD2(in0, in2, in4, in6, in0, in4);
    vec_w += __msa_hadd_s_w(in0, in0);
    vec_w += __msa_hadd_s_w(in4, in4);
  }
  // DC coefficient of the "DC-only" 16x16 transform: half the total sum.
  sum = HADD_SW_S32(vec_w);
  out[0] = (int16_t)(sum >> 1);
}
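/* For reference, a scalar sketch of the same reduction (fdct16x16_1_ref is a
 * hypothetical name for illustration): sum all 256 samples, then halve. */
static void fdct16x16_1_ref(const int16_t *input, int16_t *out,
                            int32_t stride) {
  int32_t sum = 0;
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) sum += input[r * stride + c];
  }
  out[0] = (int16_t)(sum >> 1);
}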