void fdct16x8_1d_row(int16_t *input, int16_t *output) { v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, in10, in11, in12, in13, in14, in15); ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11); ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15); SRA_4V(in0, in1, in2, in3, 2); SRA_4V(in4, in5, in6, in7, 2); SRA_4V(in8, in9, in10, in11, 2); SRA_4V(in12, in13, in14, in15, 2); BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16); FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15); FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3); ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16); TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7); ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16); }
static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, int16_t *output) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 step0, step1, step2, step3, step4, step5, step6, step7; LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, in10, in11, in12, in13, in14, in15); BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8); /* 2nd set */ LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, in10, in11, in12, in13, in14, in15); BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, (output + 8 * 8), 8); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8); }
static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) { v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; /* load input data */ LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7); TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, r7); FDCT_POSTPROC_2V_NEG_H(r0, r1); FDCT_POSTPROC_2V_NEG_H(r2, r3); FDCT_POSTPROC_2V_NEG_H(r4, r5); FDCT_POSTPROC_2V_NEG_H(r6, r7); ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8); out += 64; LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15); TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, r12, r13, r14, r15); FDCT_POSTPROC_2V_NEG_H(r8, r9); FDCT_POSTPROC_2V_NEG_H(r10, r11); FDCT_POSTPROC_2V_NEG_H(r12, r13); FDCT_POSTPROC_2V_NEG_H(r14, r15); ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8); out += 64; /* load input data */ input += 128; LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7); TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, r7); FDCT_POSTPROC_2V_NEG_H(r0, r1); FDCT_POSTPROC_2V_NEG_H(r2, r3); FDCT_POSTPROC_2V_NEG_H(r4, r5); FDCT_POSTPROC_2V_NEG_H(r6, r7); ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8); out += 64; LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15); TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, r12, r13, r14, r15); FDCT_POSTPROC_2V_NEG_H(r8, r9); FDCT_POSTPROC_2V_NEG_H(r10, r11); FDCT_POSTPROC_2V_NEG_H(r12, r13); FDCT_POSTPROC_2V_NEG_H(r14, r15); ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8); }
static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) { int16_t *temp = intermediate; int16_t *out = output; v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11; v8i16 in12, in13, in14, in15; LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7); temp = intermediate + 8; LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, in10, in11, in12, in13, in14, in15); FDCT_POSTPROC_2V_NEG_H(in0, in1); FDCT_POSTPROC_2V_NEG_H(in2, in3); FDCT_POSTPROC_2V_NEG_H(in4, in5); FDCT_POSTPROC_2V_NEG_H(in6, in7); FDCT_POSTPROC_2V_NEG_H(in8, in9); FDCT_POSTPROC_2V_NEG_H(in10, in11); FDCT_POSTPROC_2V_NEG_H(in12, in13); FDCT_POSTPROC_2V_NEG_H(in14, in15); BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15); temp = intermediate; ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16); FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); temp = intermediate; LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3); ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16); TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7); out = output + 8; ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16); }
static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs) { uint8_t i; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; for (i = 0; i < 4; i++) { LD_SH8(tmp_buf + i * 8 * 8, 8, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs + i * 8, 32); } }
void vpx_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v4i32 vec_w; LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); ADD2(in0, in2, in4, in6, in0, in4); vec_w = __msa_hadd_s_w(in0, in0); vec_w += __msa_hadd_s_w(in4, in4); out[0] = HADD_SW_S32(vec_w); out[1] = 0; }
void vp9_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride, int32_t tx_type) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); SLLI_4V(in0, in1, in2, in3, 2); SLLI_4V(in4, in5, in6, in7, 2); switch (tx_type) { case DCT_DCT: VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); break; case ADST_DCT: VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); break; case DCT_ADST: VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); break; case ADST_ADST: VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); break; default: assert(0); break; } TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); }
static void hevc_idct_8x8_msa(int16_t *coeffs) { const int16_t *filter = >8x8_cnst[0]; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7); HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs, 8); }
void vpx_fdct8x8_msa(const int16_t *input, int16_t *output, int32_t src_stride) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7); SLLI_4V(in0, in1, in2, in3, 2); SLLI_4V(in4, in5, in6, in7, 2); VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); }
static int32_t avc_coeff_last64_msa( int16_t *p_src ) { uint32_t u_res; v8i16 src0, src1, src2, src3, src4, src5, src6, src7; v8i16 tmp_h0, tmp_h1, tmp_h2, tmp_h3, tmp_h4, tmp_h5, tmp_h6, tmp_h7; v16u8 tmp0, tmp1, tmp2, tmp3; v8u16 vec0, vec1, vec2, vec3; v4i32 out0; v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 ); tmp_h0 = __msa_ceqi_h( src0, 0 ); tmp_h1 = __msa_ceqi_h( src1, 0 ); tmp_h2 = __msa_ceqi_h( src2, 0 ); tmp_h3 = __msa_ceqi_h( src3, 0 ); tmp_h4 = __msa_ceqi_h( src4, 0 ); tmp_h5 = __msa_ceqi_h( src5, 0 ); tmp_h6 = __msa_ceqi_h( src6, 0 ); tmp_h7 = __msa_ceqi_h( src7, 0 ); PCKEV_B4_UB( tmp_h1, tmp_h0, tmp_h3, tmp_h2, tmp_h5, tmp_h4, tmp_h7, tmp_h6, tmp0, tmp1, tmp2, tmp3 ); tmp0 = tmp0 & mask; tmp1 = tmp1 & mask; tmp2 = tmp2 & mask; tmp3 = tmp3 & mask; HADD_UB4_UH( tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3 ); PCKEV_B2_UB( vec1, vec0, vec3, vec2, tmp0, tmp1 ); HADD_UB2_UH( tmp0, tmp1, vec0, vec1 ); tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec1, ( v16i8 ) vec0 ); vec0 = __msa_hadd_u_h( tmp0, tmp0 ); tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec0, ( v16i8 ) vec0 ); out0 = ( v4i32 ) __msa_nloc_d( ( v2i64 ) tmp0 ); u_res = __msa_copy_u_w( out0, 0 ); return ( 63 - u_res ); }
static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride) { uint8_t *temp_dst = dst; uint64_t dst0, dst1, dst2, dst3; v2i64 dst_vec0 = { 0 }; v2i64 dst_vec1 = { 0 }; v8i16 dst_r0, dst_l0, dst_r1, dst_l1; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v16u8 zeros = { 0 }; LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD4(temp_dst, stride, dst0, dst1, dst2, dst3); temp_dst += (4 * stride); INSERT_D2_SD(dst0, dst1, dst_vec0); INSERT_D2_SD(dst2, dst3, dst_vec1); ILVRL_B2_SH(zeros, dst_vec0, dst_r0, dst_l0); ILVRL_B2_SH(zeros, dst_vec1, dst_r1, dst_l1); ADD4(dst_r0, in0, dst_l0, in1, dst_r1, in2, dst_l1, in3, dst_r0, dst_l0, dst_r1, dst_l1); CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1); PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1); ST8x4_UB(dst_r0, dst_r1, dst, stride); dst += (4 * stride); LD4(temp_dst, stride, dst0, dst1, dst2, dst3); INSERT_D2_SD(dst0, dst1, dst_vec0); INSERT_D2_SD(dst2, dst3, dst_vec1); UNPCK_UB_SH(dst_vec0, dst_r0, dst_l0); UNPCK_UB_SH(dst_vec1, dst_r1, dst_l1); ADD4(dst_r0, in4, dst_l0, in5, dst_r1, in6, dst_l1, in7, dst_r0, dst_l0, dst_r1, dst_l1); CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1); PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1); ST8x4_UB(dst_r0, dst_r1, dst, stride); }
static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, int16_t *out) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w; /* fdct32 even */ /* stage 2 */ LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8); /* Stage 3 */ UNPCK_SH_SW(vec0, vec0_l, vec0_r); UNPCK_SH_SW(vec1, vec1_l, vec1_r); UNPCK_SH_SW(vec2, vec2_l, vec2_r); UNPCK_SH_SW(vec3, vec3_l, vec3_r); UNPCK_SH_SW(vec4, vec4_l, vec4_r); UNPCK_SH_SW(vec5, vec5_l, vec5_r); UNPCK_SH_SW(vec6, vec6_l, vec6_r); UNPCK_SH_SW(vec7, vec7_l, vec7_r); ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w, tmp1_w, tmp2_w, tmp3_w); BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r); ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r, vec1_r, vec2_r, vec3_r); tmp3_w = vec0_r + vec3_r; vec0_r = vec0_r - vec3_r; vec3_r = vec1_r + vec2_r; vec1_r = vec1_r - vec2_r; DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r); FDCT32_POSTPROC_NEG_W(vec4_r); FDCT32_POSTPROC_NEG_W(tmp3_w); FDCT32_POSTPROC_NEG_W(vec6_r); FDCT32_POSTPROC_NEG_W(vec3_r); PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); ST_SH2(vec5, vec4, out, 8); DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r); FDCT32_POSTPROC_NEG_W(vec4_r); FDCT32_POSTPROC_NEG_W(tmp3_w); FDCT32_POSTPROC_NEG_W(vec6_r); FDCT32_POSTPROC_NEG_W(vec3_r); PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); ST_SH2(vec5, vec4, out + 16, 8); LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); ADD2(vec4, vec5, vec7, vec6, vec0, vec1); DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 32); ST_SH(in5, out + 56); SUB2(vec4, vec5, vec7, vec6, vec4, vec7); DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 40); ST_SH(in5, out + 48); LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); ADD2(in0, in1, in2, in3, vec0, vec7); DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 64); ST_SH(in5, out + 120); SUB2(in0, in1, in2, in3, in0, in2); DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 72); ST_SH(in5, out + 112); SUB2(in9, vec2, in14, vec5, vec2, vec5); DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 80); ST_SH(in5, out + 104); ADD2(in3, in2, in0, in1, vec3, vec4); DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 96); ST_SH(in5, out + 88); }
static void hevc_idct_16x16_msa(int16_t *coeffs) { int16_t i, j, k; int16_t buf[256]; int16_t *buf_ptr = &buf[0]; int16_t *src = coeffs; const int16_t *filter = >16x16_cnst[0]; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l; for (i = 2; i--;) { LD_SH16(src, 16, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15); ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10, src0_r, src1_r, src2_r, src3_r); ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15, src4_r, src5_r, src6_r, src7_r); ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10, src0_l, src1_l, src2_l, src3_l); ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15, src4_l, src5_l, src6_l, src7_l); HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l, 7); src += 8; buf_ptr = (&buf[0] + 8); filter = >16x16_cnst[0]; } src = &buf[0]; buf_ptr = coeffs; filter = >16x16_cnst[0]; for (i = 2; i--;) { LD_SH16(src, 8, in0, in8, in1, in9, in2, in10, in3, in11, in4, in12, in5, in13, in6, in14, in7, in15); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, in10, in11, in12, in13, in14, in15); ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10, src0_r, src1_r, src2_r, src3_r); ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15, src4_r, src5_r, src6_r, src7_r); ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10, src0_l, src1_l, src2_l, src3_l); ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15, src4_l, src5_l, src6_l, src7_l); HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l, 12); src += 128; buf_ptr = coeffs + 8; filter = >16x16_cnst[0]; } LD_SH8(coeffs, 16, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, coeffs, 16); LD_SH8((coeffs + 8), 16, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); LD_SH8((coeffs + 128), 16, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 128), 16); TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 8), 16); LD_SH8((coeffs + 136), 16, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 136), 16); }
static void hevc_idct_8x32_column_msa(int16_t *coeffs, uint8_t buf_pitch, uint8_t round) { uint8_t i; const int16_t *filter_ptr0 = >32x32_cnst0[0]; const int16_t *filter_ptr1 = >32x32_cnst1[0]; const int16_t *filter_ptr2 = >32x32_cnst2[0]; const int16_t *filter_ptr3 = >8x8_cnst[0]; int16_t *src0 = (coeffs + buf_pitch); int16_t *src1 = (coeffs + 2 * buf_pitch); int16_t *src2 = (coeffs + 4 * buf_pitch); int16_t *src3 = (coeffs); int32_t cnst0, cnst1; int32_t tmp_buf[8 * 32 + 15]; int32_t *tmp_buf_ptr = tmp_buf + 15; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l; v8i16 filt0, filter0, filter1, filter2, filter3; v4i32 sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l; /* Align pointer to 64 byte boundary */ tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63); /* process coeff 4, 12, 20, 28 */ LD_SH4(src2, 8 * buf_pitch, in0, in1, in2, in3); ILVR_H2_SH(in1, in0, in3, in2, src0_r, src1_r); ILVL_H2_SH(in1, in0, in3, in2, src0_l, src1_l); LD_SH2(src3, 16 * buf_pitch, in4, in6); LD_SH2((src3 + 8 * buf_pitch), 16 * buf_pitch, in5, in7); ILVR_H2_SH(in6, in4, in7, in5, src2_r, src3_r); ILVL_H2_SH(in6, in4, in7, in5, src2_l, src3_l); /* loop for all columns of constants */ for (i = 0; i < 2; i++) { /* processing single column of constants */ cnst0 = LW(filter_ptr2); cnst1 = LW(filter_ptr2 + 2); filter0 = (v8i16) __msa_fill_w(cnst0); filter1 = (v8i16) __msa_fill_w(cnst1); DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l); DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l); ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + 2 * i * 8), 4); /* processing single column of constants */ cnst0 = LW(filter_ptr2 + 4); cnst1 = LW(filter_ptr2 + 6); filter0 = (v8i16) __msa_fill_w(cnst0); filter1 = (v8i16) __msa_fill_w(cnst1); DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l); DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l); ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + (2 * i + 1) * 8), 4); filter_ptr2 += 8; } /* process coeff 0, 8, 16, 24 */ /* loop for all columns of constants */ for (i = 0; i < 2; i++) { /* processing first column of filter constants */ cnst0 = LW(filter_ptr3); cnst1 = LW(filter_ptr3 + 2); filter0 = (v8i16) __msa_fill_w(cnst0); filter1 = (v8i16) __msa_fill_w(cnst1); DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter0, filter0, filter1, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l); sum1_r = sum0_r - tmp1_r; sum1_l = sum0_l - tmp1_l; sum0_r = sum0_r + tmp1_r; sum0_l = sum0_l + tmp1_l; HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, i, (7 - i)); HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, (3 - i), (4 + i)); filter_ptr3 += 8; } /* process coeff 2 6 10 14 18 22 26 30 */ LD_SH8(src1, 4 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7); ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src0_r, src1_r, src2_r, src3_r); ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src0_l, src1_l, src2_l, src3_l); /* loop for all columns of constants */ for (i = 0; i < 8; i++) { /* processing single column of constants */ filt0 = LD_SH(filter_ptr1); SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l); DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2, filter2, sum0_r, sum0_l, sum0_r, sum0_l); DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l); LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l); tmp1_r = tmp0_r; tmp1_l = tmp0_l; tmp0_r += sum0_r; tmp0_l += sum0_l; ST_SW2(tmp0_r, tmp0_l, (tmp_buf_ptr + i * 8), 4); tmp1_r -= sum0_r; tmp1_l -= sum0_l; ST_SW2(tmp1_r, tmp1_l, (tmp_buf_ptr + (15 - i) * 8), 4); filter_ptr1 += 8; } /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */ LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7); src0 += 16 * buf_pitch; ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src0_r, src1_r, src2_r, src3_r); ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src0_l, src1_l, src2_l, src3_l); LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7); ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src4_r, src5_r, src6_r, src7_r); ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src4_l, src5_l, src6_l, src7_l); /* loop for all columns of filter constants */ for (i = 0; i < 16; i++) { /* processing single column of constants */ filt0 = LD_SH(filter_ptr0); SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l); DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2, filter2, sum0_r, sum0_l, sum0_r, sum0_l); DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l); tmp1_r = sum0_r; tmp1_l = sum0_l; filt0 = LD_SH(filter_ptr0 + 8); SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); DOTP_SH2_SW(src4_r, src4_l, filter0, filter0, sum0_r, sum0_l); DPADD_SH4_SW(src5_r, src5_l, src6_r, src6_l, filter1, filter1, filter2, filter2, sum0_r, sum0_l, sum0_r, sum0_l); DPADD_SH2_SW(src7_r, src7_l, filter3, filter3, sum0_r, sum0_l); sum0_r += tmp1_r; sum0_l += tmp1_l; LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l); tmp1_r = tmp0_r; tmp1_l = tmp0_l; tmp0_r += sum0_r; tmp0_l += sum0_l; sum1_r = __msa_fill_w(round); SRAR_W2_SW(tmp0_r, tmp0_l, sum1_r); SAT_SW2_SW(tmp0_r, tmp0_l, 15); in0 = __msa_pckev_h((v8i16) tmp0_l, (v8i16) tmp0_r); ST_SH(in0, (coeffs + i * buf_pitch)); tmp1_r -= sum0_r; tmp1_l -= sum0_l; SRAR_W2_SW(tmp1_r, tmp1_l, sum1_r); SAT_SW2_SW(tmp1_r, tmp1_l, 15); in0 = __msa_pckev_h((v8i16) tmp1_l, (v8i16) tmp1_r); ST_SH(in0, (coeffs + (31 - i) * buf_pitch)); filter_ptr0 += 16; } }
static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src, int32_t i_dst_stride ) { v8i16 src0, src1, src2, src3, src4, src5, src6, src7; v8i16 vec0, vec1, vec2, vec3; v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v8i16 res0, res1, res2, res3, res4, res5, res6, res7; v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r; v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l; v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l; v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r; v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l; v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v16i8 zeros = { 0 }; p_src[ 0 ] += 32; LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 ); vec0 = src0 + src4; vec1 = src0 - src4; vec2 = src2 >> 1; vec2 = vec2 - src6; vec3 = src6 >> 1; vec3 = src2 + vec3; BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 ); vec0 = src7 >> 1; vec0 = src5 - vec0 - src3 - src7; vec1 = src3 >> 1; vec1 = src1 - vec1 + src7 - src3; vec2 = src5 >> 1; vec2 = vec2 - src1 + src7 + src5; vec3 = src1 >> 1; vec3 = vec3 + src3 + src5 + src1; tmp4 = vec3 >> 2; tmp4 += vec0; tmp5 = vec2 >> 2; tmp5 += vec1; tmp6 = vec1 >> 2; tmp6 -= vec2; tmp7 = vec0 >> 2; tmp7 = vec3 - tmp7; BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, res0, res1, res2, res3, res4, res5, res6, res7 ); TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7, res0, res1, res2, res3, res4, res5, res6, res7 ); UNPCK_SH_SW( res0, tmp0_r, tmp0_l ); UNPCK_SH_SW( res1, tmp1_r, tmp1_l ); UNPCK_SH_SW( res2, tmp2_r, tmp2_l ); UNPCK_SH_SW( res3, tmp3_r, tmp3_l ); UNPCK_SH_SW( res4, tmp4_r, tmp4_l ); UNPCK_SH_SW( res5, tmp5_r, tmp5_l ); UNPCK_SH_SW( res6, tmp6_r, tmp6_l ); UNPCK_SH_SW( res7, tmp7_r, tmp7_l ); BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r, vec0_r, vec0_l, vec1_l, vec1_r ); vec2_r = tmp2_r >> 1; vec2_l = tmp2_l >> 1; vec2_r -= tmp6_r; vec2_l -= tmp6_l; vec3_r = tmp6_r >> 1; vec3_l = tmp6_l >> 1; vec3_r += tmp2_r; vec3_l += tmp2_l; BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r, tmp0_r, tmp2_r, tmp4_r, tmp6_r ); BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l, tmp0_l, tmp2_l, tmp4_l, tmp6_l ); vec0_r = tmp7_r >> 1; vec0_l = tmp7_l >> 1; vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r; vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l; vec1_r = tmp3_r >> 1; vec1_l = tmp3_l >> 1; vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r; vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l; vec2_r = tmp5_r >> 1; vec2_l = tmp5_l >> 1; vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r; vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l; vec3_r = tmp1_r >> 1; vec3_l = tmp1_l >> 1; vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r; vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l; tmp1_r = vec3_r >> 2; tmp1_l = vec3_l >> 2; tmp1_r += vec0_r; tmp1_l += vec0_l; tmp3_r = vec2_r >> 2; tmp3_l = vec2_l >> 2; tmp3_r += vec1_r; tmp3_l += vec1_l; tmp5_r = vec1_r >> 2; tmp5_l = vec1_l >> 2; tmp5_r -= vec2_r; tmp5_l -= vec2_l; tmp7_r = vec0_r >> 2; tmp7_l = vec0_l >> 2; tmp7_r = vec3_r - tmp7_r; tmp7_l = vec3_l - tmp7_l; BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r, res0_r, res0_l, res7_l, res7_r ); BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r, res1_r, res1_l, res6_l, res6_r ); BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r, res2_r, res2_l, res5_l, res5_r ); BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r, res3_r, res3_l, res4_l, res4_r ); SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 ); SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 ); SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 ); SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 ); PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r, res0, res1, res2, res3 ); PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r, res4, res5, res6, res7 ); LD_SB8( p_dst, i_dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7 ); ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3, tmp0, tmp1, tmp2, tmp3 ); ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7, tmp4, tmp5, tmp6, tmp7 ); ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3, res0, res1, res2, res3 ); ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7, res4, res5, res6, res7 ); CLIP_SH4_0_255( res0, res1, res2, res3 ); CLIP_SH4_0_255( res4, res5, res6, res7 ); PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6, dst0, dst1, dst2, dst3 ); ST8x4_UB( dst0, dst1, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); ST8x4_UB( dst2, dst3, p_dst, i_dst_stride ); }
void vp10_iht8x8_64_add_msa(const int16_t *input, uint8_t *dst, int32_t dst_stride, int32_t tx_type) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; /* load vector elements of 8x8 block */ LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); switch (tx_type) { case DCT_DCT: /* DCT in horizontal */ VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); /* DCT in vertical */ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); break; case ADST_DCT: /* DCT in horizontal */ VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); /* ADST in vertical */ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); break; case DCT_ADST: /* ADST in horizontal */ VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); /* DCT in vertical */ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); break; case ADST_ADST: /* ADST in horizontal */ VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); /* ADST in vertical */ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); break; default: assert(0); break; } /* final rounding (add 2^4, divide by 2^5) and shift */ SRARI_H4_SH(in0, in1, in2, in3, 5); SRARI_H4_SH(in4, in5, in6, in7, 5); /* add block and store 8x8 */ VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); dst += (4 * dst_stride); VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); }
static void avc_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64], int32_t i_qp ) { const int32_t i_mf = i_qp % 6; const int32_t q_bits = i_qp / 6 - 6; v8i16 dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7; v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3; v4i32 dequant_m_f4, dequant_m_f5, dequant_m_f6, dequant_m_f7; v4i32 dequant_m_f8, dequant_m_f9, dequant_m_f10, dequant_m_f11; v4i32 dequant_m_f12, dequant_m_f13, dequant_m_f14, dequant_m_f15; LD_SH8( p_dct, 8, dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7 ); LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 ); LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 ); LD_SW2( pi_dequant_mf[i_mf] + 16, 4, dequant_m_f4, dequant_m_f5 ); LD_SW2( pi_dequant_mf[i_mf] + 24, 4, dequant_m_f6, dequant_m_f7 ); LD_SW2( pi_dequant_mf[i_mf] + 32, 4, dequant_m_f8, dequant_m_f9 ); LD_SW2( pi_dequant_mf[i_mf] + 40, 4, dequant_m_f10, dequant_m_f11 ); LD_SW2( pi_dequant_mf[i_mf] + 48, 4, dequant_m_f12, dequant_m_f13 ); LD_SW2( pi_dequant_mf[i_mf] + 56, 4, dequant_m_f14, dequant_m_f15 ); if( q_bits >= 0 ) { v8i16 q_bits_vec; v8i16 dequant_mf_h0, dequant_mf_h1, dequant_mf_h2, dequant_mf_h3; v8i16 dequant_mf_h4, dequant_mf_h5, dequant_mf_h6, dequant_mf_h7; q_bits_vec = __msa_fill_h( q_bits ); PCKEV_H4_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2, dequant_m_f5, dequant_m_f4, dequant_m_f7, dequant_m_f6, dequant_mf_h0, dequant_mf_h1, dequant_mf_h2, dequant_mf_h3 ); PCKEV_H4_SH( dequant_m_f9, dequant_m_f8, dequant_m_f11, dequant_m_f10, dequant_m_f13, dequant_m_f12, dequant_m_f15, dequant_m_f14, dequant_mf_h4, dequant_mf_h5, dequant_mf_h6, dequant_mf_h7 ); dct0 *= dequant_mf_h0; dct1 *= dequant_mf_h1; dct2 *= dequant_mf_h2; dct3 *= dequant_mf_h3; dct4 *= dequant_mf_h4; dct5 *= dequant_mf_h5; dct6 *= dequant_mf_h6; dct7 *= dequant_mf_h7; SLLI_4V( dct0, dct1, dct2, dct3, q_bits_vec ); SLLI_4V( dct4, dct5, dct6, dct7, q_bits_vec ); ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 ); } else { const int32_t q_bits_add = 1 << ( -q_bits - 1 ); v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7; v4i32 dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11; v4i32 dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15; v4i32 q_bits_vec, q_bits_vec_add; q_bits_vec_add = __msa_fill_w( q_bits_add ); q_bits_vec = __msa_fill_w( -q_bits ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 ); UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 ); UNPCK_SH_SW( dct4, dct_signed_w8, dct_signed_w9 ); UNPCK_SH_SW( dct5, dct_signed_w10, dct_signed_w11 ); UNPCK_SH_SW( dct6, dct_signed_w12, dct_signed_w13 ); UNPCK_SH_SW( dct7, dct_signed_w14, dct_signed_w15 ); dct_signed_w0 *= dequant_m_f0; dct_signed_w1 *= dequant_m_f1; dct_signed_w2 *= dequant_m_f2; dct_signed_w3 *= dequant_m_f3; dct_signed_w4 *= dequant_m_f4; dct_signed_w5 *= dequant_m_f5; dct_signed_w6 *= dequant_m_f6; dct_signed_w7 *= dequant_m_f7; dct_signed_w8 *= dequant_m_f8; dct_signed_w9 *= dequant_m_f9; dct_signed_w10 *= dequant_m_f10; dct_signed_w11 *= dequant_m_f11; dct_signed_w12 *= dequant_m_f12; dct_signed_w13 *= dequant_m_f13; dct_signed_w14 *= dequant_m_f14; dct_signed_w15 *= dequant_m_f15; dct_signed_w0 += q_bits_vec_add; dct_signed_w1 += q_bits_vec_add; dct_signed_w2 += q_bits_vec_add; dct_signed_w3 += q_bits_vec_add; dct_signed_w4 += q_bits_vec_add; dct_signed_w5 += q_bits_vec_add; dct_signed_w6 += q_bits_vec_add; dct_signed_w7 += q_bits_vec_add; dct_signed_w8 += q_bits_vec_add; dct_signed_w9 += q_bits_vec_add; dct_signed_w10 += q_bits_vec_add; dct_signed_w11 += q_bits_vec_add; dct_signed_w12 += q_bits_vec_add; dct_signed_w13 += q_bits_vec_add; dct_signed_w14 += q_bits_vec_add; dct_signed_w15 += q_bits_vec_add; SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3, q_bits_vec ); SRA_4V( dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7, q_bits_vec ); SRA_4V( dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11, q_bits_vec ); SRA_4V( dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15, q_bits_vec ); PCKEV_H4_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2, dct_signed_w5, dct_signed_w4, dct_signed_w7, dct_signed_w6, dct0, dct1, dct2, dct3 ); PCKEV_H4_SH( dct_signed_w9, dct_signed_w8, dct_signed_w11, dct_signed_w10, dct_signed_w13, dct_signed_w12, dct_signed_w15, dct_signed_w14, dct4, dct5, dct6, dct7 ); ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 ); } }
static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; /* fdct32 even */ /* stage 2 */ LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, in15); FDCT_POSTPROC_2V_NEG_H(vec0, vec1); FDCT_POSTPROC_2V_NEG_H(vec2, vec3); FDCT_POSTPROC_2V_NEG_H(vec4, vec5); FDCT_POSTPROC_2V_NEG_H(vec6, vec7); FDCT_POSTPROC_2V_NEG_H(in8, in9); FDCT_POSTPROC_2V_NEG_H(in10, in11); FDCT_POSTPROC_2V_NEG_H(in12, in13); FDCT_POSTPROC_2V_NEG_H(in14, in15); /* Stage 3 */ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); temp0 = in0 + in3; in0 = in0 - in3; in3 = in1 + in2; in1 = in1 - in2; DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); ST_SH(temp0, out); ST_SH(temp1, out + 8); DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); ST_SH(temp0, out + 16); ST_SH(temp1, out + 24); SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); ADD2(vec4, vec5, vec7, vec6, vec0, vec1); DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); ST_SH(temp0, out + 32); ST_SH(temp1, out + 56); SUB2(vec4, vec5, vec7, vec6, vec4, vec7); DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); ST_SH(temp0, out + 40); ST_SH(temp1, out + 48); DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); ADD2(in0, in1, in2, in3, vec0, vec7); DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); ST_SH(temp0, out + 64); ST_SH(temp1, out + 120); SUB2(in0, in1, in2, in3, in0, in2); DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); ST_SH(temp0, out + 72); ST_SH(temp1, out + 112); SUB2(in9, vec2, in14, vec5, vec2, vec5); DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); ST_SH(temp0, out + 80); ST_SH(temp1, out + 104); ADD2(in3, in2, in0, in1, vec3, vec4); DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); ST_SH(temp0, out + 96); ST_SH(temp1, out + 88); }