/*
 * Transpose helper used by the 16-point ADST path.
 *
 * The work is done in two identical halves of 128 int16_t values each.  In
 * every half, sixteen vectors are fetched from the source at a stride of 8;
 * the even-numbered ones land in l0..l7 and the odd-numbered ones in
 * l8..l15.  Both groups go through TRANSPOSE8x8_SH_SH and the results are
 * written back at a stride of 8 in the interleaved order r0, r8, r1, r9, ...
 *
 * input: source coefficients (two groups of 128 values are read).
 * out:   destination (two groups of 128 values are written).
 */
static void fadst16_transpose_msa(int16_t *input, int16_t *out) {
  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
  int32_t half;

  for (half = 0; half < 2; ++half) {
    /* Each half covers 128 source and 16 * 8 destination elements. */
    int16_t *src = input + half * 128;
    int16_t *dst = out + half * (16 * 8);

    /* load input data */
    LD_SH16(src, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6,
            l14, l7, l15);

    TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5,
                       r6, r7);
    TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
                       r12, r13, r14, r15);

    /* Interleave the two transposed tiles on the way out. */
    ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, dst, 8);
    ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (dst + 64), 8);
  }
}
/*
 * Row pass of the 16-point forward DCT over a strip of coefficients.
 *
 * Two groups of eight vectors are read from `input` (stride 16, the second
 * group starting 8 elements in), transposed, and every lane is adjusted as
 * (x + 1) >> 2.  BUTTERFLY_16 splits the data into the halves consumed by
 * FDCT8x16_EVEN and FDCT8x16_ODD; the results are transposed again and
 * written to `output` at stride 16 (second group at output + 8).
 *
 * input:  source coefficients.  NOTE: it is also used as spill space -- the
 *         odd-half butterfly outputs are stored back to `input` (stride 16)
 *         and reloaded after the even half has been computed, so the caller's
 *         data is overwritten.
 * output: destination for the transformed coefficients.
 */
void fdct16x8_1d_row(int16_t *input, int16_t *output) {
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;

  LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(input + 8, 16, in8, in9, in10, in11, in12, in13, in14, in15);

  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
                     in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);

  /* (x + 1) >> 2 on every vector, four vectors at a time. */
  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
  SRA_4V(in0, in1, in2, in3, 2);
  ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
  SRA_4V(in4, in5, in6, in7, 2);
  ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11);
  SRA_4V(in8, in9, in10, in11, 2);
  ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15);
  SRA_4V(in12, in13, in14, in15, 2);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5,
               tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15);

  /* Park the odd-half inputs in `input` while the even half is computed. */
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);

  /* Bring the odd-half inputs back and finish that half. */
  LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15);
  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2,
               in3, in4, in5, in6, in7);

  /* Even (tmpN) and odd (inN) results alternate in the output order. */
  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
                     tmp1, in1, tmp2, in2, tmp3, in3);
  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16);

  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
                     tmp5, in5, tmp6, in6, tmp7, in7);
  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16);
}
/*
 * First stage of the 32-point row transform: load, transpose and butterfly.
 *
 * `temp_buff` is read in 8-element-wide groups at a stride of 32.  The groups
 * at offsets 0 and 24 are paired first, then the groups at offsets 8 and 16.
 * Each pair is transposed and fed to BUTTERFLY_16; the first eight butterfly
 * outputs and the last eight are written to separate 64-element regions of
 * `output` (stride 8):
 *
 *   pair (0, 24):  step0..7 -> output + 0,      in8..15 -> output + 24 * 8
 *   pair (8, 16):  step0..7 -> output + 8 * 8,  in8..15 -> output + 16 * 8
 *
 * temp_buff: source coefficients.
 * output:    destination; 4 * 64 int16_t values are written.
 */
static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
                                           int16_t *output) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 step0, step1, step2, step3, step4, step5, step6, step7;

  /* 1st set: groups at offsets 0 and 24 */
  LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
                     in3, in4, in5, in6, in7);
  LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, step0, step1, step2, step3, step4,
               step5, step6, step7, in8, in9, in10, in11, in12, in13, in14,
               in15);
  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);

  /* 2nd set: groups at offsets 8 and 16 */
  LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
                     in3, in4, in5, in6, in7);
  LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, step0, step1, step2, step3, step4,
               step5, step6, step7, in8, in9, in10, in11, in12, in13, in14,
               in15);
  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
         (output + 8 * 8), 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
}
/*
 * Transpose plus post-processing helper for the 16-point ADST path.
 *
 * Four groups of eight vectors are handled one after another.  Each group is
 * loaded from `input` at a stride of 16, transposed, passed pairwise through
 * FDCT_POSTPROC_2V_NEG_H and stored to `out` at a stride of 8.
 *
 *   group 0: input + 0    -> out + 0
 *   group 1: input + 8    -> out + 64
 *   group 2: input + 128  -> out + 128
 *   group 3: input + 136  -> out + 192
 *
 * input: source coefficients.
 * out:   destination; 4 * 64 int16_t values are written.
 */
static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) {
  v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
  v8i16 l0, l1, l2, l3, l4, l5, l6, l7;
  int32_t tile;

  for (tile = 0; tile < 4; ++tile) {
    /* Odd tiles start 8 elements in; tiles 2 and 3 sit 128 elements on. */
    int16_t *src = input + (tile >> 1) * 128 + (tile & 1) * 8;
    int16_t *dst = out + tile * 64;

    /* load input data */
    LD_SH8(src, 16, l0, l1, l2, l3, l4, l5, l6, l7);
    TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5,
                       r6, r7);

    FDCT_POSTPROC_2V_NEG_H(r0, r1);
    FDCT_POSTPROC_2V_NEG_H(r2, r3);
    FDCT_POSTPROC_2V_NEG_H(r4, r5);
    FDCT_POSTPROC_2V_NEG_H(r6, r7);

    ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, dst, 8);
  }
}
/*
 * DC-only 8x8 inverse transform, performed in place.
 *
 * The value is derived from coeffs[0] alone in two rounding steps,
 * (c + 1) >> 1 followed by (v + 32) >> 6, then replicated into every lane of
 * a vector that is stored eight times at a stride of 8.
 *
 * coeffs: coefficient block; coeffs[0] is read and the block is overwritten.
 */
static void hevc_idct_dc_8x8_msa(int16_t *coeffs)
{
    int32_t dc;
    v8i16 dc_vec;

    dc = (coeffs[0] + 1) >> 1;
    dc = (dc + 32) >> 6;

    dc_vec = __msa_fill_h(dc);

    ST_SH8(dc_vec, dc_vec, dc_vec, dc_vec, dc_vec, dc_vec, dc_vec, dc_vec,
           coeffs, 8);
}
/*
 * Row pass of the 16-point forward DCT with a post-processing step up front.
 *
 * Two groups of eight vectors are read from `intermediate` (stride 16, the
 * second group starting 8 elements in), transposed and run pairwise through
 * FDCT_POSTPROC_2V_NEG_H.  BUTTERFLY_16 then splits the data into the halves
 * consumed by FDCT8x16_EVEN and FDCT8x16_ODD.  The results are transposed
 * and written to `output` at stride 16 (second group at output + 8).
 *
 * intermediate: source coefficients.  NOTE: also used as spill space -- the
 *               odd-half butterfly outputs are stored back here (stride 16)
 *               and reloaded later, so the caller's data is overwritten.
 * output:       destination for the transformed coefficients.
 */
static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) {
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;

  LD_SH8(intermediate, 16, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(intermediate + 8, 16, in8, in9, in10, in11, in12, in13, in14, in15);

  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
                     in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);

  FDCT_POSTPROC_2V_NEG_H(in0, in1);
  FDCT_POSTPROC_2V_NEG_H(in2, in3);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  FDCT_POSTPROC_2V_NEG_H(in6, in7);
  FDCT_POSTPROC_2V_NEG_H(in8, in9);
  FDCT_POSTPROC_2V_NEG_H(in10, in11);
  FDCT_POSTPROC_2V_NEG_H(in12, in13);
  FDCT_POSTPROC_2V_NEG_H(in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5,
               tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15);

  /* Park the odd-half inputs while the even half is computed. */
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, intermediate, 16);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);

  /* Bring them back and finish the odd half. */
  LD_SH8(intermediate, 16, in8, in9, in10, in11, in12, in13, in14, in15);
  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2,
               in3, in4, in5, in6, in7);

  /* Even (tmpN) and odd (inN) results alternate in the output order. */
  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
                     tmp1, in1, tmp2, in2, tmp3, in3);
  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16);

  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
                     tmp5, in5, tmp6, in6, tmp7, in7);
  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16);
}
/*
 * Re-lays out a block stored as four consecutive 8x8 tiles into rows that
 * are 32 elements apart, transposing every tile on the way.
 *
 * Tile t is read from tmp_buf + t * 64 (eight vectors, stride 8), transposed
 * with TRANSPOSE8x8_SH_SH, and written to coeffs + t * 8 with a stride of 32.
 *
 * tmp_buf: source; 4 * 64 int16_t values are read.
 * coeffs:  destination, addressed with a row pitch of 32 elements.
 */
static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs)
{
    int32_t tile;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (tile = 0; tile < 4; tile++) {
        int16_t *src = tmp_buf + tile * 8 * 8;
        int16_t *dst = coeffs + tile * 8;

        LD_SH8(src, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                           in0, in1, in2, in3, in4, in5, in6, in7);
        ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, dst, 32);
    }
}
void vp9_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride, int32_t tx_type) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); SLLI_4V(in0, in1, in2, in3, 2); SLLI_4V(in4, in5, in6, in7, 2); switch (tx_type) { case DCT_DCT: VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); break; case ADST_DCT: VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); break; case DCT_ADST: VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); break; case ADST_ADST: VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); break; default: assert(0); break; } TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); }
static void hevc_idct_8x8_msa(int16_t *coeffs) { const int16_t *filter = >8x8_cnst[0]; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7); HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs, 8); }
/*
 * DC-only 32x32 inverse transform, performed in place.
 *
 * The value is derived from coeffs[0] alone in two rounding steps,
 * (c + 1) >> 1 followed by (v + 32) >> 6, then replicated into every lane of
 * a vector.  Sixteen chunks of 8 * 8 elements are filled with it, eight
 * stride-8 stores per chunk.
 *
 * coeffs: coefficient block; coeffs[0] is read and 16 * 64 values are
 *         overwritten.
 */
static void hevc_idct_dc_32x32_msa(int16_t *coeffs)
{
    int32_t chunk;
    int32_t dc;
    v8i16 dc_vec;

    dc = (coeffs[0] + 1) >> 1;
    dc = (dc + 32) >> 6;

    dc_vec = __msa_fill_h(dc);

    for (chunk = 0; chunk < 16; chunk++) {
        ST_SH8(dc_vec, dc_vec, dc_vec, dc_vec, dc_vec, dc_vec, dc_vec, dc_vec,
               coeffs + chunk * (8 * 8), 8);
    }
}
/*
 * 8x8 forward DCT.
 *
 * Eight vectors are loaded from `input` at a stride of `src_stride` and
 * shifted left by 2.  VP9_FDCT8 is applied, the block is transposed, and
 * VP9_FDCT8 is applied again.  After a final transpose the block goes
 * through SRLI_AVE_S_4V_H and is stored to `output` with a stride of 8.
 *
 * input:      source samples.
 * output:     destination; 64 int16_t values are written.
 * src_stride: distance between source rows, in elements.
 */
void vpx_fdct8x8_msa(const int16_t *input, int16_t *output,
                     int32_t src_stride) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

  /* Load and pre-scale. */
  LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);

  /* First 1-D pass. */
  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
            in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
                     in3, in4, in5, in6, in7);

  /* Second 1-D pass. */
  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
            in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
                     in3, in4, in5, in6, in7);

  /* Final scaling and store. */
  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
}
/*
 * Column pass of a 16-point forward DCT, eight lanes at a time.
 *
 * Sixteen vectors are loaded from `input` at a stride of `src_stride` and
 * shifted left by 2.  The sums in[k] + in[15 - k] feed FDCT8x16_EVEN, whose
 * eight results are stored to `tmp_ptr` at a stride of 32.  The differences
 * in[k] - in[15 - k] are then taken through the odd-half stages below, and
 * each of the eight odd results is stored individually.
 *
 * input:      source samples.
 * tmp_ptr:    destination.  Even-half results go to tmp_ptr + {0, 32, ...,
 *             224}; after `tmp_ptr += 16` the odd-half results go to the same
 *             offsets relative to the advanced pointer, i.e. in between the
 *             even ones.
 * src_stride: distance between source rows, in elements.
 *
 * NOTE(review): cnst0/cnst1/cnst4/cnst5 are overwritten and re-read many
 * times; the statement order in the odd half is significant.
 */
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride) {
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30;
  v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
  /* Constant tables; individual lanes are broadcast with the splat helpers. */
  v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
  v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 };
  v8i16 coeff2 = { -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 };

  /* Load all sixteen rows and pre-scale by << 2. */
  LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15);
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  SLLI_4V(in8, in9, in10, in11, 2);
  SLLI_4V(in12, in13, in14, in15, 2);

  /* Even half: sums of mirrored rows, transformed and stored at stride 32. */
  ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
  ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);

  /* Odd half: differences of mirrored rows, written back into in8..in15. */
  SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
  SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);

  /* Odd results are addressed relative to this advanced pointer. */
  tmp_ptr += 16;

  /* stp 1 */
  ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4);
  ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5);

  cnst4 = __msa_splati_h(coeff, 0);
  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4);

  cnst5 = __msa_splati_h(coeff, 1);
  cnst5 = __msa_ilvev_h(cnst5, cnst4);
  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5);
  stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4);
  stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5);

  /* stp2 */
  BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
  BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
  /* vec4/vec5 produced here are consumed further down for stp25/stp22. */
  ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4);
  ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5);
  SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0);

  cnst0 = __msa_splati_h(coeff, 4);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);
  stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1);

  /* Outputs stored at offsets 0 and 224. */
  BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
  ILVRL_H2_SH(in15, in8, vec1, vec0);
  SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);

  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr);

  cnst0 = __msa_splati_h(coeff2, 0);
  cnst0 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 224);

  /* Outputs stored at offsets 128 and 96. */
  ILVRL_H2_SH(in14, in9, vec1, vec0);
  SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);

  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
  ST_SH(in8, tmp_ptr + 128);

  cnst1 = __msa_splati_h(coeff2, 2);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 96);

  /* Second use of vec4/vec5 from the stp2 interleave above. */
  SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);

  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);

  cnst1 = __msa_splati_h(coeff, 3);
  cnst1 = __msa_ilvev_h(cnst0, cnst1);
  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);

  /* stp4 */
  /* Outputs stored at offsets 64 and 160. */
  ADD2(stp34, stp25, stp33, stp22, in13, in10);

  ILVRL_H2_SH(in13, in10, vec1, vec0);
  SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 64);

  cnst0 = __msa_splati_h(coeff2, 1);
  cnst0 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 160);

  /* Outputs stored at offsets 192 and 32. */
  SUB2(stp34, stp25, stp33, stp22, in12, in11);
  ILVRL_H2_SH(in12, in11, vec1, vec0);
  SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);

  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
  ST_SH(in8, tmp_ptr + 192);

  cnst1 = __msa_splati_h(coeff2, 3);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 32);
}
/*
 * Final stage of the 32-point row transform: gather, transpose and store.
 *
 * Four sets of eight vectors are picked out of `temp` at fixed offsets,
 * transposed with TRANSPOSE8x8_SH_SH, and written to `output` at a stride
 * of 32.  Set n lands at output + 8 * n.  Two register banks (inN / altN)
 * are alternated so that the loads of the next set are issued before the
 * store of the previous one.
 *
 * temp:   source vectors, read at offsets in the range 0..248.
 * output: destination, addressed with a row pitch of 32 elements.
 */
static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7;

  /* 1st set */
  in0 = LD_SH(temp);
  in1 = LD_SH(temp + 128);
  in2 = LD_SH(temp + 64);
  in3 = LD_SH(temp + 192);
  in4 = LD_SH(temp + 32);
  in5 = LD_SH(temp + 216);
  in6 = LD_SH(temp + 96);
  in7 = LD_SH(temp + 152);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
                     in3, in4, in5, in6, in7);

  /* 2nd set: loaded before the 1st set is written out */
  alt0 = LD_SH(temp + 16);
  alt1 = LD_SH(temp + 232);
  alt2 = LD_SH(temp + 80);
  alt3 = LD_SH(temp + 168);
  alt4 = LD_SH(temp + 48);
  alt5 = LD_SH(temp + 176);
  alt6 = LD_SH(temp + 112);
  alt7 = LD_SH(temp + 240);
  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32);
  TRANSPOSE8x8_SH_SH(alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7, alt0,
                     alt1, alt2, alt3, alt4, alt5, alt6, alt7);

  /* 3rd set: loaded before the 2nd set is written out */
  in0 = LD_SH(temp + 8);
  in1 = LD_SH(temp + 136);
  in2 = LD_SH(temp + 72);
  in3 = LD_SH(temp + 200);
  in4 = LD_SH(temp + 40);
  in5 = LD_SH(temp + 208);
  in6 = LD_SH(temp + 104);
  in7 = LD_SH(temp + 144);
  ST_SH8(alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7, output + 8, 32);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
                     in3, in4, in5, in6, in7);
  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);

  /* 4th set */
  alt0 = LD_SH(temp + 24);
  alt1 = LD_SH(temp + 224);
  alt2 = LD_SH(temp + 88);
  alt3 = LD_SH(temp + 160);
  alt4 = LD_SH(temp + 56);
  alt5 = LD_SH(temp + 184);
  alt6 = LD_SH(temp + 120);
  alt7 = LD_SH(temp + 248);
  TRANSPOSE8x8_SH_SH(alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7, alt0,
                     alt1, alt2, alt3, alt4, alt5, alt6, alt7);
  ST_SH8(alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7, output + 24, 32);
}
/*
 * Even half of the 32-point row transform, with part of the arithmetic done
 * in 32-bit lanes.
 *
 * Sixteen vectors are read from `input` (stride 8) and passed through
 * BUTTERFLY_16.  Both butterfly halves are saved to `interm_ptr` (128 values)
 * so they can be reloaded once the registers have been reused.  The first
 * four output vectors are computed in 32-bit precision (v4i32) and narrowed
 * with PCKEV_H2_SH; the remaining twelve are computed in 16-bit lanes.
 *
 * input:      source vectors; 128 int16_t values are read.
 * interm_ptr: scratch buffer; 128 int16_t values are written and read back.
 * out:        destination; sixteen vectors are stored at offsets 0..120 in
 *             steps of 8, though not in ascending order.
 *
 * NOTE(review): registers are heavily reused between stages (e.g. vec0_r..
 * vec3_r are overwritten with sums of the *_l halves); statement order is
 * significant.
 */
static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
  v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
  v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;

  /* fdct32 even */
  /* stage 2 */
  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, in15);
  /* Save both butterfly halves; they are reloaded further down. */
  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);

  /* Stage 3 */
  /* Widen vec0..vec7 into pairs of 32-bit vectors (_l / _r). */
  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
  ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w, tmp1_w, tmp2_w, tmp3_w);
  BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
  /* From here on vec0_r..vec3_r hold sums of the *_l halves. */
  ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r, vec1_r, vec2_r, vec3_r);

  /* Hand-written 4-point butterfly on the *_l sums. */
  tmp3_w = vec0_r + vec3_r;
  vec0_r = vec0_r - vec3_r;
  vec3_r = vec1_r + vec2_r;
  vec1_r = vec1_r - vec2_r;

  /* Outputs stored at out + 0 and out + 8. */
  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out, 8);

  /* Outputs stored at out + 16 and out + 24. */
  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out + 16, 8);

  /* Reload the first butterfly half saved above (16-bit path from here). */
  LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 32);
  ST_SH(in5, out + 56);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 40);
  ST_SH(in5, out + 48);

  /* Reload the second butterfly half saved above. */
  LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 64);
  ST_SH(in5, out + 120);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 72);
  ST_SH(in5, out + 112);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  /* Note the negated first operand. */
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 80);
  ST_SH(in5, out + 104);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  /* Output order is (in4, in5) here, unlike the (in5, in4) pairs above. */
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 96);
  ST_SH(in5, out + 88);
}
static void hevc_idct_16x16_msa(int16_t *coeffs) { int16_t i, j, k; int16_t buf[256]; int16_t *buf_ptr = &buf[0]; int16_t *src = coeffs; const int16_t *filter = >16x16_cnst[0]; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l; for (i = 2; i--;) { LD_SH16(src, 16, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15); ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10, src0_r, src1_r, src2_r, src3_r); ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15, src4_r, src5_r, src6_r, src7_r); ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10, src0_l, src1_l, src2_l, src3_l); ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15, src4_l, src5_l, src6_l, src7_l); HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l, 7); src += 8; buf_ptr = (&buf[0] + 8); filter = >16x16_cnst[0]; } src = &buf[0]; buf_ptr = coeffs; filter = >16x16_cnst[0]; for (i = 2; i--;) { LD_SH16(src, 8, in0, in8, in1, in9, in2, in10, in3, in11, in4, in12, in5, in13, in6, in14, in7, in15); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, in10, in11, in12, in13, in14, in15); ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10, src0_r, src1_r, src2_r, src3_r); ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15, src4_r, src5_r, src6_r, src7_r); ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10, src0_l, src1_l, src2_l, src3_l); ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15, src4_l, src5_l, src6_l, src7_l); HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r, src0_l, src1_l, 
src2_l, src3_l, src4_l, src5_l, src6_l, src7_l, 12); src += 128; buf_ptr = coeffs + 8; filter = >16x16_cnst[0]; } LD_SH8(coeffs, 16, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, coeffs, 16); LD_SH8((coeffs + 8), 16, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); LD_SH8((coeffs + 128), 16, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 128), 16); TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 8), 16); LD_SH8((coeffs + 136), 16, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 136), 16); }
/*
 * In-place dequantisation of an 8x8 block of coefficients.
 *
 * The scaling row is pi_dequant_mf[i_qp % 6] and the shift amount is
 * q_bits = i_qp / 6 - 6.
 *
 *   q_bits >= 0: the 32-bit scaling factors are narrowed to 16 bits
 *                (PCKEV_H4_SH), multiplied into the coefficients, and the
 *                products are shifted left by q_bits.
 *   q_bits <  0: the coefficients are widened to 32 bits, multiplied by the
 *                scaling factors, rounded by adding 1 << (-q_bits - 1),
 *                shifted right by -q_bits (SRA_4V), and narrowed back to
 *                16 bits.
 *
 * p_dct:         64 coefficients; read and overwritten.
 * pi_dequant_mf: dequantisation tables, one 64-entry row per i_qp % 6.
 * i_qp:          quantiser parameter.
 */
static void avc_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
                                 int32_t i_qp )
{
    const int32_t i_mf = i_qp % 6;
    const int32_t q_bits = i_qp / 6 - 6;
    int32_t *p_mf = pi_dequant_mf[i_mf];
    v8i16 dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7;
    v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3;
    v4i32 dequant_m_f4, dequant_m_f5, dequant_m_f6, dequant_m_f7;
    v4i32 dequant_m_f8, dequant_m_f9, dequant_m_f10, dequant_m_f11;
    v4i32 dequant_m_f12, dequant_m_f13, dequant_m_f14, dequant_m_f15;

    LD_SH8( p_dct, 8, dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7 );

    /* Sixteen 4-lane vectors cover the 64 scaling factors of the row. */
    LD_SW2( p_mf, 4, dequant_m_f0, dequant_m_f1 );
    LD_SW2( p_mf + 8, 4, dequant_m_f2, dequant_m_f3 );
    LD_SW2( p_mf + 16, 4, dequant_m_f4, dequant_m_f5 );
    LD_SW2( p_mf + 24, 4, dequant_m_f6, dequant_m_f7 );
    LD_SW2( p_mf + 32, 4, dequant_m_f8, dequant_m_f9 );
    LD_SW2( p_mf + 40, 4, dequant_m_f10, dequant_m_f11 );
    LD_SW2( p_mf + 48, 4, dequant_m_f12, dequant_m_f13 );
    LD_SW2( p_mf + 56, 4, dequant_m_f14, dequant_m_f15 );

    if( q_bits >= 0 )
    {
        v8i16 q_bits_vec;
        v8i16 dequant_mf_h0, dequant_mf_h1, dequant_mf_h2, dequant_mf_h3;
        v8i16 dequant_mf_h4, dequant_mf_h5, dequant_mf_h6, dequant_mf_h7;

        q_bits_vec = __msa_fill_h( q_bits );

        /* Narrow the scaling factors so they line up with the 16-bit rows. */
        PCKEV_H4_SH( dequant_m_f1, dequant_m_f0,
                     dequant_m_f3, dequant_m_f2,
                     dequant_m_f5, dequant_m_f4,
                     dequant_m_f7, dequant_m_f6,
                     dequant_mf_h0, dequant_mf_h1,
                     dequant_mf_h2, dequant_mf_h3 );
        PCKEV_H4_SH( dequant_m_f9, dequant_m_f8,
                     dequant_m_f11, dequant_m_f10,
                     dequant_m_f13, dequant_m_f12,
                     dequant_m_f15, dequant_m_f14,
                     dequant_mf_h4, dequant_mf_h5,
                     dequant_mf_h6, dequant_mf_h7 );

        dct0 *= dequant_mf_h0;
        dct1 *= dequant_mf_h1;
        dct2 *= dequant_mf_h2;
        dct3 *= dequant_mf_h3;
        dct4 *= dequant_mf_h4;
        dct5 *= dequant_mf_h5;
        dct6 *= dequant_mf_h6;
        dct7 *= dequant_mf_h7;

        SLLI_4V( dct0, dct1, dct2, dct3, q_bits_vec );
        SLLI_4V( dct4, dct5, dct6, dct7, q_bits_vec );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
        v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7;
        v4i32 dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11;
        v4i32 dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15;
        v4i32 q_bits_vec, q_bits_vec_add;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        /* Widen every row to two 32-bit vectors. */
        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
        UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
        UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
        UNPCK_SH_SW( dct4, dct_signed_w8, dct_signed_w9 );
        UNPCK_SH_SW( dct5, dct_signed_w10, dct_signed_w11 );
        UNPCK_SH_SW( dct6, dct_signed_w12, dct_signed_w13 );
        UNPCK_SH_SW( dct7, dct_signed_w14, dct_signed_w15 );

        /* Scale and add the rounding term. */
        dct_signed_w0 = dct_signed_w0 * dequant_m_f0 + q_bits_vec_add;
        dct_signed_w1 = dct_signed_w1 * dequant_m_f1 + q_bits_vec_add;
        dct_signed_w2 = dct_signed_w2 * dequant_m_f2 + q_bits_vec_add;
        dct_signed_w3 = dct_signed_w3 * dequant_m_f3 + q_bits_vec_add;
        dct_signed_w4 = dct_signed_w4 * dequant_m_f4 + q_bits_vec_add;
        dct_signed_w5 = dct_signed_w5 * dequant_m_f5 + q_bits_vec_add;
        dct_signed_w6 = dct_signed_w6 * dequant_m_f6 + q_bits_vec_add;
        dct_signed_w7 = dct_signed_w7 * dequant_m_f7 + q_bits_vec_add;
        dct_signed_w8 = dct_signed_w8 * dequant_m_f8 + q_bits_vec_add;
        dct_signed_w9 = dct_signed_w9 * dequant_m_f9 + q_bits_vec_add;
        dct_signed_w10 = dct_signed_w10 * dequant_m_f10 + q_bits_vec_add;
        dct_signed_w11 = dct_signed_w11 * dequant_m_f11 + q_bits_vec_add;
        dct_signed_w12 = dct_signed_w12 * dequant_m_f12 + q_bits_vec_add;
        dct_signed_w13 = dct_signed_w13 * dequant_m_f13 + q_bits_vec_add;
        dct_signed_w14 = dct_signed_w14 * dequant_m_f14 + q_bits_vec_add;
        dct_signed_w15 = dct_signed_w15 * dequant_m_f15 + q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        SRA_4V( dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7,
                q_bits_vec );
        SRA_4V( dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11,
                q_bits_vec );
        SRA_4V( dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15,
                q_bits_vec );

        /* Narrow back to eight 16-bit rows. */
        PCKEV_H4_SH( dct_signed_w1, dct_signed_w0,
                     dct_signed_w3, dct_signed_w2,
                     dct_signed_w5, dct_signed_w4,
                     dct_signed_w7, dct_signed_w6,
                     dct0, dct1, dct2, dct3 );
        PCKEV_H4_SH( dct_signed_w9, dct_signed_w8,
                     dct_signed_w11, dct_signed_w10,
                     dct_signed_w13, dct_signed_w12,
                     dct_signed_w15, dct_signed_w14,
                     dct4, dct5, dct6, dct7 );
    }

    /* Both paths leave the result in dct0..dct7. */
    ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 );
}