static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { uint64_t out0, out1, out2, out3; uint32_t in0, in1, in2, in3; v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; v8i16 t0, t1, t2, t3; v16u8 srcl0, srcl1, src0, src1; const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 }; const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 }; const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 }; const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 }; const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 }; const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 }; LW4(src, BPS, in0, in1, in2, in3); INSERT_W4_UB(in0, in1, in2, in3, src0); LW4(ref, BPS, in0, in1, in2, in3); INSERT_W4_UB(in0, in1, in2, in3, src1); ILVRL_B2_UB(src0, src1, srcl0, srcl1); HSUB_UB2_SH(srcl0, srcl1, t0, t1); VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3); ADDSUB2(t2, t3, t0, t1); t0 = SRLI_H(t0, 3); VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2); tmp0 = __msa_hadd_s_w(t3, t3); tmp2 = __msa_hsub_s_w(t3, t3); FILL_W2_SW(1812, 937, tmp1, tmp3); DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1); SRAI_W2_SW(tmp1, tmp3, 9); PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1); VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3); ADDSUB2(t2, t3, t0, t1); VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2); tmp0 = __msa_hadd_s_w(t3, t3); tmp2 = __msa_hsub_s_w(t3, t3); ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2); SRAI_W2_SW(tmp0, tmp2, 4); FILL_W2_SW(12000, 51000, tmp1, tmp3); DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1); SRAI_W2_SW(tmp1, tmp3, 16); UNPCK_R_SH_SW(t1, tmp4); tmp5 = __msa_ceqi_w(tmp4, 0); tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5); tmp5 = __msa_fill_w(1); tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4); tmp1 += tmp5; PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1); out0 = __msa_copy_s_d((v2i64)t0, 0); out1 = __msa_copy_s_d((v2i64)t0, 1); out2 = __msa_copy_s_d((v2i64)t1, 0); out3 = __msa_copy_s_d((v2i64)t1, 1); SD4(out0, out1, out2, out3, out, 8); }
void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch) { v8i16 in0, in1, in2, in3; v8i16 temp0, temp1; v8i16 const0, const1; v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 }; v4i32 out0, out1, out2, out3; v8i16 zero = { 0 }; LD_SH4(input, pitch / 2, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); SLLI_4V(temp0, temp1, in1, in3, 3); in0 = temp0 + temp1; in2 = temp0 - temp1; SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1); temp0 = __msa_ilvr_h(in3, in1); in1 = __msa_splati_h(coeff, 3); out0 = (v4i32)__msa_ilvev_h(zero, in1); coeff = __msa_ilvl_h(zero, coeff); out1 = __msa_splati_w((v4i32)coeff, 0); DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1); out0 >>= 12; out1 >>= 12; PCKEV_H2_SH(out0, out0, out1, out1, in1, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); in0 = temp0 + temp1 + 7; in2 = temp0 - temp1 + 7; in0 >>= 4; in2 >>= 4; ILVR_H2_SW(zero, in0, zero, in2, out0, out2); temp1 = RET_1_IF_NZERO_H(in3); ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0); SPLATI_W2_SW(coeff, 2, out3, out1); out3 += out1; out1 = __msa_splati_w((v4i32)coeff, 1); DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3); out1 >>= 16; out3 >>= 16; out1 += (v4i32)temp1; PCKEV_H2_SH(out1, out0, out3, out2, in0, in2); ST_SH2(in0, in2, output, 8); }
static void hevc_idct_8x32_column_msa(int16_t *coeffs, uint8_t buf_pitch, uint8_t round) { uint8_t i; const int16_t *filter_ptr0 = >32x32_cnst0[0]; const int16_t *filter_ptr1 = >32x32_cnst1[0]; const int16_t *filter_ptr2 = >32x32_cnst2[0]; const int16_t *filter_ptr3 = >8x8_cnst[0]; int16_t *src0 = (coeffs + buf_pitch); int16_t *src1 = (coeffs + 2 * buf_pitch); int16_t *src2 = (coeffs + 4 * buf_pitch); int16_t *src3 = (coeffs); int32_t cnst0, cnst1; int32_t tmp_buf[8 * 32 + 15]; int32_t *tmp_buf_ptr = tmp_buf + 15; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l; v8i16 filt0, filter0, filter1, filter2, filter3; v4i32 sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l; /* Align pointer to 64 byte boundary */ tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63); /* process coeff 4, 12, 20, 28 */ LD_SH4(src2, 8 * buf_pitch, in0, in1, in2, in3); ILVR_H2_SH(in1, in0, in3, in2, src0_r, src1_r); ILVL_H2_SH(in1, in0, in3, in2, src0_l, src1_l); LD_SH2(src3, 16 * buf_pitch, in4, in6); LD_SH2((src3 + 8 * buf_pitch), 16 * buf_pitch, in5, in7); ILVR_H2_SH(in6, in4, in7, in5, src2_r, src3_r); ILVL_H2_SH(in6, in4, in7, in5, src2_l, src3_l); /* loop for all columns of constants */ for (i = 0; i < 2; i++) { /* processing single column of constants */ cnst0 = LW(filter_ptr2); cnst1 = LW(filter_ptr2 + 2); filter0 = (v8i16) __msa_fill_w(cnst0); filter1 = (v8i16) __msa_fill_w(cnst1); DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l); DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l); ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + 2 * i * 8), 4); /* processing single column of constants */ cnst0 = LW(filter_ptr2 + 4); cnst1 = LW(filter_ptr2 + 6); filter0 = (v8i16) __msa_fill_w(cnst0); filter1 = (v8i16) __msa_fill_w(cnst1); DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l); DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l); ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + (2 * i + 1) * 8), 4); filter_ptr2 += 8; } /* process coeff 0, 8, 16, 24 */ /* loop for all columns of constants */ for (i = 0; i < 2; i++) { /* processing first column of filter constants */ cnst0 = LW(filter_ptr3); cnst1 = LW(filter_ptr3 + 2); filter0 = (v8i16) __msa_fill_w(cnst0); filter1 = (v8i16) __msa_fill_w(cnst1); DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter0, filter0, filter1, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l); sum1_r = sum0_r - tmp1_r; sum1_l = sum0_l - tmp1_l; sum0_r = sum0_r + tmp1_r; sum0_l = sum0_l + tmp1_l; HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, i, (7 - i)); HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, (3 - i), (4 + i)); filter_ptr3 += 8; } /* process coeff 2 6 10 14 18 22 26 30 */ LD_SH8(src1, 4 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7); ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src0_r, src1_r, src2_r, src3_r); ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src0_l, src1_l, src2_l, src3_l); /* loop for all columns of constants */ for (i = 0; i < 8; i++) { /* processing single column of constants */ filt0 = LD_SH(filter_ptr1); SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l); DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2, filter2, sum0_r, sum0_l, sum0_r, sum0_l); DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l); LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l); tmp1_r = tmp0_r; tmp1_l = tmp0_l; tmp0_r += sum0_r; tmp0_l += sum0_l; ST_SW2(tmp0_r, tmp0_l, (tmp_buf_ptr + i * 8), 4); tmp1_r -= sum0_r; tmp1_l -= sum0_l; ST_SW2(tmp1_r, tmp1_l, (tmp_buf_ptr + (15 - i) * 8), 4); filter_ptr1 += 8; } /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */ LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7); src0 += 16 * buf_pitch; ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src0_r, src1_r, src2_r, src3_r); ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src0_l, src1_l, src2_l, src3_l); LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7); ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src4_r, src5_r, src6_r, src7_r); ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src4_l, src5_l, src6_l, src7_l); /* loop for all columns of filter constants */ for (i = 0; i < 16; i++) { /* processing single column of constants */ filt0 = LD_SH(filter_ptr0); SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l); DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2, filter2, sum0_r, sum0_l, sum0_r, sum0_l); DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l); tmp1_r = sum0_r; tmp1_l = sum0_l; filt0 = LD_SH(filter_ptr0 + 8); SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); DOTP_SH2_SW(src4_r, src4_l, filter0, filter0, sum0_r, sum0_l); DPADD_SH4_SW(src5_r, src5_l, src6_r, src6_l, filter1, filter1, filter2, filter2, sum0_r, sum0_l, sum0_r, sum0_l); DPADD_SH2_SW(src7_r, src7_l, filter3, filter3, sum0_r, sum0_l); sum0_r += tmp1_r; sum0_l += tmp1_l; LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l); tmp1_r = tmp0_r; tmp1_l = tmp0_l; tmp0_r += sum0_r; tmp0_l += sum0_l; sum1_r = __msa_fill_w(round); SRAR_W2_SW(tmp0_r, tmp0_l, sum1_r); SAT_SW2_SW(tmp0_r, tmp0_l, 15); in0 = __msa_pckev_h((v8i16) tmp0_l, (v8i16) tmp0_r); ST_SH(in0, (coeffs + i * buf_pitch)); tmp1_r -= sum0_r; tmp1_l -= sum0_l; SRAR_W2_SW(tmp1_r, tmp1_l, sum1_r); SAT_SW2_SW(tmp1_r, tmp1_l, 15); in0 = __msa_pckev_h((v8i16) tmp1_l, (v8i16) tmp1_r); ST_SH(in0, (coeffs + (31 - i) * buf_pitch)); filter_ptr0 += 16; } }