void vp9_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride, int32_t tx_type) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); SLLI_4V(in0, in1, in2, in3, 2); SLLI_4V(in4, in5, in6, in7, 2); switch (tx_type) { case DCT_DCT: VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); break; case ADST_DCT: VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); break; case DCT_ADST: VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); break; case ADST_ADST: VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); break; default: assert(0); break; } TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); }
/* 4x4 forward DCT (MSA): two 1-D FDCT4 passes with a transpose between,
 * plus the standard libvpx pre-process tweak on element (0,0).
 * 'src_stride' is in int16_t elements; output is 16 coefficients. */
void vpx_fdct4x4_msa(const int16_t *input, int16_t *output,
                     int32_t src_stride) {
  v8i16 in0, in1, in2, in3;

  LD_SH4(input, src_stride, in0, in1, in2, in3);

  /* fdct4 pre-process */
  {
    v8i16 vec, mask;
    v16i8 zero = { 0 };
    v16i8 one = __msa_ldi_b(1);

    /* mask = {1, 0, 0, ...}: only the first lane survives the AND below. */
    mask = (v8i16)__msa_sldi_b(zero, one, 15);
    /* Pre-scale input by 16 (<< 4). */
    SLLI_4V(in0, in1, in2, in3, 4);
    /* vec lane = 0xFFFF where in0 lane == 0, else 0. */
    vec = __msa_ceqi_h(in0, 0);
    /* Flip low byte; combined with mask this yields 1 in lane 0 exactly
     * when in0[0] != 0 (0x00FF & 1 == 1, 0xFF00 & 1 == 0). */
    vec = vec ^ 255;
    vec = mask & vec;
    /* Bump the DC input by 1 when it is nonzero (reference-matching tweak). */
    in0 += vec;
  }

  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
  /* Round each coefficient: (x + 1) >> 2. */
  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
  SRA_4V(in0, in1, in2, in3, 2);
  /* Pack the four 4-element rows into two vectors and store contiguously. */
  PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
  ST_SH2(in0, in2, output, 8);
}
/* 4x4 Walsh-Hadamard transform for VP8 (MSA), computed in 32-bit lanes.
 * 'pitch' is in bytes (divided by 2 for int16_t element stride). */
void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
  v8i16 in0_h, in1_h, in2_h, in3_h;
  v4i32 in0_w, in1_w, in2_w, in3_w, temp0, temp1, temp2, temp3;

  LD_SH4(input, pitch / 2, in0_h, in1_h, in2_h, in3_h);
  TRANSPOSE4x4_SH_SH(in0_h, in1_h, in2_h, in3_h, in0_h, in1_h, in2_h, in3_h);

  /* Widen the low four halfwords of each row to 32-bit lanes. */
  UNPCK_R_SH_SW(in0_h, in0_w);
  UNPCK_R_SH_SW(in1_h, in1_w);
  UNPCK_R_SH_SW(in2_h, in2_w);
  UNPCK_R_SH_SW(in3_h, in3_w);
  /* First 1-D pass: butterflies with a x4 (<< 2) scale in between. */
  BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
  SLLI_4V(temp0, temp1, temp2, temp3, 2);
  BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);

  /* Add 1 to lanes of in0_w where temp0 is nonzero (reference tweak). */
  temp0 = RET_1_IF_NZERO_W(temp0);
  in0_w += temp0;

  TRANSPOSE4x4_SW_SW(in0_w, in1_w, in2_w, in3_w, in0_w, in1_w, in2_w, in3_w);

  /* Second 1-D pass. */
  BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
  BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
  /* Bias negative lanes by +1 so the arithmetic shift below rounds
   * toward zero, then apply (x + 3) >> 3. */
  in0_w += RET_1_IF_NEG_W(in0_w);
  in1_w += RET_1_IF_NEG_W(in1_w);
  in2_w += RET_1_IF_NEG_W(in2_w);
  in3_w += RET_1_IF_NEG_W(in3_w);
  ADD4(in0_w, 3, in1_w, 3, in2_w, 3, in3_w, 3, in0_w, in1_w, in2_w, in3_w);
  SRA_4V(in0_w, in1_w, in2_w, in3_w, 3);
  /* Narrow back to 16 bits and store the 16 results. */
  PCKEV_H2_SH(in1_w, in0_w, in3_w, in2_w, in0_h, in1_h);
  ST_SH2(in0_h, in1_h, output, 8);
}
/* 4x4 forward Walsh-Hadamard transform for VP9 lossless mode (MSA).
 * Two 1-D passes; note the deliberate row permutation carried through the
 * transposes (in0, in2, in3, in1) — the variable order is significant.
 * 'src_stride' is in int16_t elements. */
void vp9_fwht4x4_msa(const int16_t *input, int16_t *output,
                     int32_t src_stride) {
  v8i16 in0, in1, in2, in3, in4;

  LD_SH4(input, src_stride, in0, in1, in2, in3);

  /* First 1-D WHT pass (rows). */
  in0 += in1;
  in3 -= in2;
  in4 = (in0 - in3) >> 1;
  SUB2(in4, in1, in4, in2, in1, in2);
  in0 -= in2;
  in3 += in1;

  /* Transpose with the pass-specific row order (0, 2, 3, 1). */
  TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);

  /* Second 1-D WHT pass (columns); roles of in1..in3 are permuted. */
  in0 += in2;
  in1 -= in3;
  in4 = (in0 - in1) >> 1;
  SUB2(in4, in2, in4, in3, in2, in3);
  in0 -= in3;
  in1 += in2;

  /* Scale results by 4 (<< 2). */
  SLLI_4V(in0, in1, in2, in3, 2);

  /* Final transpose with order (0, 3, 1, 2), then store 4 rows of 4. */
  TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2);

  ST4x2_UB(in0, output, 4);
  ST4x2_UB(in3, output + 4, 4);
  ST4x2_UB(in1, output + 8, 4);
  ST4x2_UB(in2, output + 12, 4);
}
/* 8x8 forward DCT (MSA): column pass, transpose, row pass, transpose,
 * then the final rounding shift.  'src_stride' is in int16_t elements;
 * the 64 coefficients are stored row-major at 'output'. */
void vpx_fdct8x8_msa(const int16_t *input, int16_t *output,
                     int32_t src_stride) {
  v8i16 r0, r1, r2, r3, r4, r5, r6, r7;

  /* Load the eight input rows. */
  LD_SH8(input, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);

  /* Pre-scale every sample by 4 (<< 2) for precision through the passes. */
  SLLI_4V(r0, r1, r2, r3, 2);
  SLLI_4V(r4, r5, r6, r7, 2);

  /* First 1-D pass. */
  VP9_FDCT8(r0, r1, r2, r3, r4, r5, r6, r7, r0, r1, r2, r3, r4, r5, r6, r7);
  TRANSPOSE8x8_SH_SH(r0, r1, r2, r3, r4, r5, r6, r7, r0, r1, r2, r3, r4, r5,
                     r6, r7);

  /* Second 1-D pass, then transpose back to row-major order. */
  VP9_FDCT8(r0, r1, r2, r3, r4, r5, r6, r7, r0, r1, r2, r3, r4, r5, r6, r7);
  TRANSPOSE8x8_SH_SH(r0, r1, r2, r3, r4, r5, r6, r7, r0, r1, r2, r3, r4, r5,
                     r6, r7);

  /* Final rounding shift of all coefficients. */
  SRLI_AVE_S_4V_H(r0, r1, r2, r3, r4, r5, r6, r7);

  ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, output, 8);
}
/* VP8 forward DCT on two side-by-side 4x4 blocks (8x4 region, MSA).
 * 'pitch' is in bytes (divided by 2 for the int16_t stride); writes
 * 32 coefficients (two 4x4 blocks) to 'output'.
 * NOTE(review): this uses TRANSPOSE4x4_H while vp8_short_fdct4x4_msa in
 * this file uses TRANSPOSE4x4_SH_SH — confirm both macro names exist and
 * are equivalent in the project headers. */
void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
  v8i16 in0, in1, in2, in3;
  v8i16 temp0, temp1, tmp0, tmp1;
  v8i16 const0, const1, const2;
  /* VP8 DCT constants: dot-product multipliers and rounding offsets. */
  v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
  v8i16 zero = { 0 };
  v4i32 vec0_w, vec1_w, vec2_w, vec3_w;

  LD_SH4(input, pitch / 2, in0, in1, in2, in3);
  TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);

  /* First 1-D pass: butterflies, x8 scale, then odd outputs via
   * dot products with rounding and >> 12. */
  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  SLLI_4V(temp0, temp1, in1, in3, 3);
  in0 = temp0 + temp1;
  in2 = temp0 - temp1;
  SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
  temp0 = __msa_splati_h(coeff, 3);
  vec1_w = (v4i32)__msa_ilvev_h(zero, temp0);
  coeff = __msa_ilvl_h(zero, coeff);
  vec3_w = __msa_splati_w((v4i32)coeff, 0);
  ILVRL_H2_SH(in3, in1, tmp1, tmp0);
  vec0_w = vec1_w;
  vec2_w = vec3_w;
  DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2,
               vec0_w, vec1_w, vec2_w, vec3_w);
  SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12);
  PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);

  /* Second 1-D pass: even outputs rounded as (x + 7) >> 4, odd outputs via
   * dot products with >> 16; in3-nonzero lanes bump in1 afterwards. */
  TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  in0 = temp0 + temp1 + 7;
  in2 = temp0 - temp1 + 7;
  in0 >>= 4;
  in2 >>= 4;
  SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w);
  vec3_w += vec1_w;
  vec1_w = __msa_splati_w((v4i32)coeff, 1);
  const0 = RET_1_IF_NZERO_H(in3);
  ILVRL_H2_SH(in3, in1, tmp1, tmp0);
  vec0_w = vec1_w;
  vec2_w = vec3_w;
  DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2,
               vec0_w, vec1_w, vec2_w, vec3_w);
  SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16);
  PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
  in1 += const0;

  /* De-interleave the two 4x4 blocks: even doublewords to output[0..15],
   * odd doublewords to output[16..31]. */
  PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1);
  ST_SH2(temp0, temp1, output, 8);
  PCKOD_D2_SH(in1, in0, in3, in2, in0, in2);
  ST_SH2(in0, in2, output + 16, 8);
}
/* 4x4 forward hybrid transform (MSA): DCT/ADST pair selected by tx_type.
 * 'stride' is in int16_t elements; writes 16 coefficients to 'output'. */
void vp9_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride,
                    int32_t tx_type) {
  v8i16 in0, in1, in2, in3;

  LD_SH4(input, stride, in0, in1, in2, in3);

  /* fdct4 pre-process */
  {
    v8i16 temp, mask;
    v16i8 zero = { 0 };
    v16i8 one = __msa_ldi_b(1);

    /* mask = {1, 0, 0, ...}: only the first lane survives the AND below. */
    mask = (v8i16)__msa_sldi_b(zero, one, 15);
    /* Pre-scale input by 16 (<< 4). */
    SLLI_4V(in0, in1, in2, in3, 4);
    /* temp lane = 0xFFFF where in0 lane == 0, else 0; invert and mask so
     * lane 0 becomes 1 exactly when in0[0] != 0. */
    temp = __msa_ceqi_h(in0, 0);
    temp = (v8i16)__msa_xori_b((v16u8)temp, 255);
    temp = mask & temp;
    /* Bump the DC input by 1 when nonzero (reference-matching tweak). */
    in0 += temp;
  }

  /* Column transform, transpose, row transform. */
  switch (tx_type) {
    case DCT_DCT:
      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
      break;
    case ADST_DCT:
      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
      break;
    case DCT_ADST:
      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
      break;
    case ADST_ADST:
      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
      break;
    default: assert(0); break;
  }

  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
  /* Round each coefficient: (x + 1) >> 2, then pack and store. */
  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
  SRA_4V(in0, in1, in2, in3, 2);
  PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
  ST_SH2(in0, in2, output, 8);
}
/* First stage of the 8x32 column transform (MSA): loads the 32 input rows in
 * mirrored pairs (row k with row 31-k), scales each sample by 4 (<< 2),
 * applies one butterfly stage, and stores sums into rows 0..15 and
 * differences into rows 16..31 of temp_buff (8 int16_t per row).
 * 'src_stride' is in int16_t elements. */
static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
                                              int32_t src_stride,
                                              int16_t *temp_buff) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 step0, step1, step2, step3;
  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
  v8i16 step0_1, step1_1, step2_1, step3_1;

  /* 1st and 2nd set: rows 0..3 with 28..31, rows 4..7 with 24..27. */
  LD_SH4(input, src_stride, in0, in1, in2, in3);
  LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7);
  LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
  LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);

  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);

  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
              step3, in4, in5, in6, in7);
  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
              step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);

  ST_SH4(step0, step1, step2, step3, temp_buff, 8);
  ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8);
  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8);
  ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8);

  /* 3rd and 4th set: rows 8..11 with 20..23, rows 12..15 with 16..19. */
  LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3);
  LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7);
  LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
  LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);

  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);

  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
              step3, in4, in5, in6, in7);
  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
              step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);

  ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8);
  ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8);
  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8);
  /* Normalized from "(15 * 8) + 8" to the (k * 8) pattern used by every
   * other store; the offset value (128) is unchanged. */
  ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (16 * 8), 8);
}
/* VP8 forward DCT on one 4x4 block (MSA).
 * 'pitch' is in bytes (divided by 2 for the int16_t stride); writes
 * 16 coefficients to 'output'.  Note the heavy register reuse: in1/temp0
 * are repurposed as scratch for the dot-product constants mid-function. */
void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
  v8i16 in0, in1, in2, in3;
  v8i16 temp0, temp1;
  v8i16 const0, const1;
  /* VP8 DCT constants: dot-product multipliers and rounding offsets. */
  v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
  v4i32 out0, out1, out2, out3;
  v8i16 zero = { 0 };

  LD_SH4(input, pitch / 2, in0, in1, in2, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);

  /* First 1-D pass: butterflies, x8 scale, odd outputs via dot products
   * with rounding and >> 12. */
  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  SLLI_4V(temp0, temp1, in1, in3, 3);
  in0 = temp0 + temp1;
  in2 = temp0 - temp1;
  SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
  temp0 = __msa_ilvr_h(in3, in1);
  in1 = __msa_splati_h(coeff, 3);
  out0 = (v4i32)__msa_ilvev_h(zero, in1);
  coeff = __msa_ilvl_h(zero, coeff);
  out1 = __msa_splati_w((v4i32)coeff, 0);
  DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1);
  out0 >>= 12;
  out1 >>= 12;
  PCKEV_H2_SH(out0, out0, out1, out1, in1, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);

  /* Second 1-D pass: even outputs rounded as (x + 7) >> 4; odd outputs via
   * dot products with >> 16, plus +1 on lanes where in3 is nonzero. */
  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  in0 = temp0 + temp1 + 7;
  in2 = temp0 - temp1 + 7;
  in0 >>= 4;
  in2 >>= 4;
  ILVR_H2_SW(zero, in0, zero, in2, out0, out2);
  temp1 = RET_1_IF_NZERO_H(in3);
  ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0);
  SPLATI_W2_SW(coeff, 2, out3, out1);
  out3 += out1;
  out1 = __msa_splati_w((v4i32)coeff, 1);
  DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3);
  out1 >>= 16;
  out3 >>= 16;
  out1 += (v4i32)temp1;
  /* Pack even/odd results back to 16 bits and store the block. */
  PCKEV_H2_SH(out1, out0, out3, out2, in0, in2);
  ST_SH2(in0, in2, output, 8);
}
/* 1-D column pass of the 8x16 forward DCT (MSA).
 * Processes 16 rows of 8 samples: the even half goes through FDCT8x16_EVEN
 * and is stored at tmp_ptr with stride 32; the odd half is computed through
 * the stp* butterfly stages and scattered to tmp_ptr+16 at fixed offsets
 * (0, 32, 64, ..., 224), interleaving with the even outputs.
 * 'src_stride' is in int16_t elements.  The cnst*/vec* shuffling below is
 * order-critical: each __msa_ilvev_h pairs two cospi constants for the
 * following DOT_SHIFT_RIGHT_PCK_H dot product. */
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
                        int32_t src_stride) {
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30;
  v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
  v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64,
                  -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
  v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64,
                   cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 };
  v8i16 coeff2 = { -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64,
                   0, 0, 0, 0 };

  LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9,
          in10, in11, in12, in13, in14, in15);
  /* Pre-scale all samples by 4 (<< 2). */
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  SLLI_4V(in8, in9, in10, in11, 2);
  SLLI_4V(in12, in13, in14, in15, 2);
  /* Even half: sums of mirrored row pairs (k, 15-k). */
  ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
  ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);
  /* Odd half: differences of the mirrored row pairs. */
  SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
  SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);
  tmp_ptr += 16;
  /* stp 1 */
  ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4);
  ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5);
  cnst4 = __msa_splati_h(coeff, 0);
  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4);
  cnst5 = __msa_splati_h(coeff, 1);
  cnst5 = __msa_ilvev_h(cnst5, cnst4);
  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5);
  stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4);
  stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5);
  /* stp2 */
  BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
  BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
  ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4);
  ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5);
  SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0);
  cnst0 = __msa_splati_h(coeff, 4);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);
  stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1);
  BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
  /* Final dot products; each result vector is stored straight to its slot
   * in the interleaved output (offsets relative to tmp_ptr + 16). */
  ILVRL_H2_SH(in15, in8, vec1, vec0);
  SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr);
  cnst0 = __msa_splati_h(coeff2, 0);
  cnst0 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 224);
  ILVRL_H2_SH(in14, in9, vec1, vec0);
  SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
  ST_SH(in8, tmp_ptr + 128);
  cnst1 = __msa_splati_h(coeff2, 2);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 96);
  SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);
  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
  cnst1 = __msa_splati_h(coeff, 3);
  cnst1 = __msa_ilvev_h(cnst0, cnst1);
  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
  /* stp4 */
  ADD2(stp34, stp25, stp33, stp22, in13, in10);
  ILVRL_H2_SH(in13, in10, vec1, vec0);
  SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 64);
  cnst0 = __msa_splati_h(coeff2, 1);
  cnst0 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 160);
  SUB2(stp34, stp25, stp33, stp22, in12, in11);
  ILVRL_H2_SH(in12, in11, vec1, vec0);
  SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
  ST_SH(in8, tmp_ptr + 192);
  cnst1 = __msa_splati_h(coeff2, 3);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 32);
}
/* First step of the 16-column forward ADST (MSA).
 * Loads the 16 input rows (scaled by 4, << 2), runs MADD_BF butterfly
 * stages against constant pairs read from 'const0' (v4i32 pairs at 4-word
 * granularity), and writes intermediate results to 'int_buf' at fixed row
 * offsets for the next step.  The store layout (which result lands in which
 * int_buf row) is position-critical.  'stride' is in int16_t elements. */
static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride,
                                   const int32_t *const0, int16_t *int_buf) {
  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
  v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
  v4i32 k0, k1, k2, k3;

  /* load input data */
  r0 = LD_SH(input);
  r15 = LD_SH(input + 15 * stride);
  r7 = LD_SH(input + 7 * stride);
  r8 = LD_SH(input + 8 * stride);
  SLLI_4V(r0, r15, r7, r8, 2);

  /* stage 1 */
  LD_SW2(const0, 4, k0, k1);
  LD_SW2(const0 + 8, 4, k2, k3);
  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);

  r3 = LD_SH(input + 3 * stride);
  r4 = LD_SH(input + 4 * stride);
  r11 = LD_SH(input + 11 * stride);
  r12 = LD_SH(input + 12 * stride);
  SLLI_4V(r3, r4, r11, r12, 2);
  LD_SW2(const0 + 4 * 4, 4, k0, k1);
  LD_SW2(const0 + 4 * 6, 4, k2, k3);
  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);

  /* stage 2 */
  BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
  ST_SH2(tp0, tp2, int_buf, 8);
  ST_SH2(tp1, tp3, int_buf + 4 * 8, 8);

  LD_SW2(const0 + 4 * 8, 4, k0, k1);
  k2 = LD_SW(const0 + 4 * 10);
  /* Note: k0 is reused as the fourth constant here (three distinct loads). */
  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
  ST_SH2(h0, h1, int_buf + 8 * 8, 8);
  /* h3 is stored before h2 — swapped order is intentional for step 2. */
  ST_SH2(h3, h2, int_buf + 12 * 8, 8);

  r9 = LD_SH(input + 9 * stride);
  r6 = LD_SH(input + 6 * stride);
  r1 = LD_SH(input + stride);
  r14 = LD_SH(input + 14 * stride);
  SLLI_4V(r9, r6, r1, r14, 2);
  LD_SW2(const0 + 4 * 11, 4, k0, k1);
  LD_SW2(const0 + 4 * 13, 4, k2, k3);
  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
  ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);

  r13 = LD_SH(input + 13 * stride);
  r2 = LD_SH(input + 2 * stride);
  r5 = LD_SH(input + 5 * stride);
  r10 = LD_SH(input + 10 * stride);
  SLLI_4V(r13, r2, r5, r10, 2);
  LD_SW2(const0 + 4 * 15, 4, k0, k1);
  LD_SW2(const0 + 4 * 17, 4, k2, k3);
  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
  ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);

  BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
  ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
}
/* In-place 8x8 AVC dequantization (MSA).
 * Multiplies the 64 coefficients in p_dct by the per-position dequant
 * factors pi_dequant_mf[i_qp % 6][k], then scales by 2^(i_qp/6 - 6):
 * a left shift when q_bits >= 0 (done in 16-bit lanes), otherwise a
 * rounded right shift (done in widened 32-bit lanes to avoid losing
 * precision before the shift). */
static void avc_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
                                 int32_t i_qp )
{
    const int32_t i_mf = i_qp % 6;
    const int32_t q_bits = i_qp / 6 - 6;
    v8i16 dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7;
    v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3;
    v4i32 dequant_m_f4, dequant_m_f5, dequant_m_f6, dequant_m_f7;
    v4i32 dequant_m_f8, dequant_m_f9, dequant_m_f10, dequant_m_f11;
    v4i32 dequant_m_f12, dequant_m_f13, dequant_m_f14, dequant_m_f15;

    LD_SH8( p_dct, 8, dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7 );
    /* Load all 64 32-bit dequant factors for this QP remainder. */
    LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 );
    LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 );
    LD_SW2( pi_dequant_mf[i_mf] + 16, 4, dequant_m_f4, dequant_m_f5 );
    LD_SW2( pi_dequant_mf[i_mf] + 24, 4, dequant_m_f6, dequant_m_f7 );
    LD_SW2( pi_dequant_mf[i_mf] + 32, 4, dequant_m_f8, dequant_m_f9 );
    LD_SW2( pi_dequant_mf[i_mf] + 40, 4, dequant_m_f10, dequant_m_f11 );
    LD_SW2( pi_dequant_mf[i_mf] + 48, 4, dequant_m_f12, dequant_m_f13 );
    LD_SW2( pi_dequant_mf[i_mf] + 56, 4, dequant_m_f14, dequant_m_f15 );

    if( q_bits >= 0 )
    {
        /* Left-shift path: factors fit 16 bits here, so narrow them and
         * do the multiply and shift entirely in halfword lanes. */
        v8i16 q_bits_vec;
        v8i16 dequant_mf_h0, dequant_mf_h1, dequant_mf_h2, dequant_mf_h3;
        v8i16 dequant_mf_h4, dequant_mf_h5, dequant_mf_h6, dequant_mf_h7;

        q_bits_vec = __msa_fill_h( q_bits );

        PCKEV_H4_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2,
                     dequant_m_f5, dequant_m_f4, dequant_m_f7, dequant_m_f6,
                     dequant_mf_h0, dequant_mf_h1,
                     dequant_mf_h2, dequant_mf_h3 );
        PCKEV_H4_SH( dequant_m_f9, dequant_m_f8, dequant_m_f11, dequant_m_f10,
                     dequant_m_f13, dequant_m_f12, dequant_m_f15,
                     dequant_m_f14, dequant_mf_h4, dequant_mf_h5,
                     dequant_mf_h6, dequant_mf_h7 );

        dct0 *= dequant_mf_h0;
        dct1 *= dequant_mf_h1;
        dct2 *= dequant_mf_h2;
        dct3 *= dequant_mf_h3;
        dct4 *= dequant_mf_h4;
        dct5 *= dequant_mf_h5;
        dct6 *= dequant_mf_h6;
        dct7 *= dequant_mf_h7;

        SLLI_4V( dct0, dct1, dct2, dct3, q_bits_vec );
        SLLI_4V( dct4, dct5, dct6, dct7, q_bits_vec );

        ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 );
    }
    else
    {
        /* Right-shift path: widen to 32-bit lanes, multiply, add the
         * rounding bias 1 << (-q_bits - 1), shift right by -q_bits,
         * then narrow back to 16 bits. */
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
        v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7;
        v4i32 dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11;
        v4i32 dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15;
        v4i32 q_bits_vec, q_bits_vec_add;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
        UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
        UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
        UNPCK_SH_SW( dct4, dct_signed_w8, dct_signed_w9 );
        UNPCK_SH_SW( dct5, dct_signed_w10, dct_signed_w11 );
        UNPCK_SH_SW( dct6, dct_signed_w12, dct_signed_w13 );
        UNPCK_SH_SW( dct7, dct_signed_w14, dct_signed_w15 );

        dct_signed_w0 *= dequant_m_f0;
        dct_signed_w1 *= dequant_m_f1;
        dct_signed_w2 *= dequant_m_f2;
        dct_signed_w3 *= dequant_m_f3;
        dct_signed_w4 *= dequant_m_f4;
        dct_signed_w5 *= dequant_m_f5;
        dct_signed_w6 *= dequant_m_f6;
        dct_signed_w7 *= dequant_m_f7;
        dct_signed_w8 *= dequant_m_f8;
        dct_signed_w9 *= dequant_m_f9;
        dct_signed_w10 *= dequant_m_f10;
        dct_signed_w11 *= dequant_m_f11;
        dct_signed_w12 *= dequant_m_f12;
        dct_signed_w13 *= dequant_m_f13;
        dct_signed_w14 *= dequant_m_f14;
        dct_signed_w15 *= dequant_m_f15;

        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;
        dct_signed_w4 += q_bits_vec_add;
        dct_signed_w5 += q_bits_vec_add;
        dct_signed_w6 += q_bits_vec_add;
        dct_signed_w7 += q_bits_vec_add;
        dct_signed_w8 += q_bits_vec_add;
        dct_signed_w9 += q_bits_vec_add;
        dct_signed_w10 += q_bits_vec_add;
        dct_signed_w11 += q_bits_vec_add;
        dct_signed_w12 += q_bits_vec_add;
        dct_signed_w13 += q_bits_vec_add;
        dct_signed_w14 += q_bits_vec_add;
        dct_signed_w15 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        SRA_4V( dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7,
                q_bits_vec );
        SRA_4V( dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11,
                q_bits_vec );
        SRA_4V( dct_signed_w12, dct_signed_w13, dct_signed_w14,
                dct_signed_w15, q_bits_vec );

        PCKEV_H4_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3,
                     dct_signed_w2, dct_signed_w5, dct_signed_w4,
                     dct_signed_w7, dct_signed_w6,
                     dct0, dct1, dct2, dct3 );
        PCKEV_H4_SH( dct_signed_w9, dct_signed_w8, dct_signed_w11,
                     dct_signed_w10, dct_signed_w13, dct_signed_w12,
                     dct_signed_w15, dct_signed_w14,
                     dct4, dct5, dct6, dct7 );

        ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 );
    }
}