static void avc_idct4x4dc_msa( int16_t *p_src, int32_t i_src_stride, int16_t *p_dst, int32_t i_dst_stride ) { v8i16 src0, src1, src2, src3; v4i32 src0_r, src1_r, src2_r, src3_r; v4i32 hres0, hres1, hres2, hres3; v8i16 vres0, vres1, vres2, vres3; v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v2i64 res0, res1; LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 ); UNPCK_R_SH_SW( src0, src0_r ); UNPCK_R_SH_SW( src1, src1_r ); UNPCK_R_SH_SW( src2, src2_r ); UNPCK_R_SH_SW( src3, src3_r ); BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, vec0, vec3, vec2, vec1 ); BUTTERFLY_4( vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1 ); TRANSPOSE4x4_SW_SW( hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3 ); BUTTERFLY_4( hres0, hres2, hres3, hres1, vec0, vec3, vec2, vec1 ); BUTTERFLY_4( vec0, vec1, vec2, vec3, vec4, vec7, vec6, vec5 ); PCKEV_H4_SH( vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, vres0, vres1, vres2, vres3 ); PCKOD_D2_SD( vres1, vres0, vres3, vres2, res0, res1 ); ST8x4_UB( res0, res1, p_dst, i_dst_stride * 2 ); }
static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst, int32_t i_src_stride ) { v8i16 src0, src1, src2, src3, ver_res0, ver_res1, ver_res2, ver_res3; v4i32 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3; v4i32 hor_res0, hor_res1, hor_res2, hor_res3; v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r; LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 ); UNPCK_R_SH_SW( src0, src0_r ); UNPCK_R_SH_SW( src1, src1_r ); UNPCK_R_SH_SW( src2, src2_r ); UNPCK_R_SH_SW( src3, src3_r ); BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, tmp0, tmp3, tmp2, tmp1 ); BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3, hor_res0, hor_res3, hor_res2, hor_res1 ); TRANSPOSE4x4_SW_SW( hor_res0, hor_res1, hor_res2, hor_res3, hor_res0, hor_res1, hor_res2, hor_res3 ); BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1, tmp0, tmp3, tmp2, tmp1 ); BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3, ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r ); SRARI_W4_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 ); PCKEV_H4_SH( ver_res0_r, ver_res0_r, ver_res1_r, ver_res1_r, ver_res2_r, ver_res2_r, ver_res3_r, ver_res3_r, ver_res0, ver_res1, ver_res2, ver_res3 ); PCKOD_D2_SH( ver_res1, ver_res0, ver_res3, ver_res2, ver_res0, ver_res2 ); ST_SH2( ver_res0, ver_res2, p_dst, 8 ); }
static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0, int16_t *int_buf) { v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3; v4i32 k0, k1, k2, k3; /* load input data */ r0 = LD_SH(input); r7 = LD_SH(input + 7 * 8); r8 = LD_SH(input + 8 * 8); r15 = LD_SH(input + 15 * 8); /* stage 1 */ LD_SW2(const0, 4, k0, k1); LD_SW2(const0 + 4 * 2, 4, k2, k3); MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); r3 = LD_SH(input + 3 * 8); r4 = LD_SH(input + 4 * 8); r11 = LD_SH(input + 11 * 8); r12 = LD_SH(input + 12 * 8); LD_SW2(const0 + 4 * 4, 4, k0, k1); LD_SW2(const0 + 4 * 6, 4, k2, k3); MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); /* stage 2 */ BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1); ST_SH2(tp0, tp1, int_buf, 4 * 8); ST_SH2(tp2, tp3, int_buf + 8, 4 * 8); LD_SW2(const0 + 4 * 8, 4, k0, k1); k2 = LD_SW(const0 + 4 * 10); MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8); ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8); r1 = LD_SH(input + 8); r6 = LD_SH(input + 6 * 8); r9 = LD_SH(input + 9 * 8); r14 = LD_SH(input + 14 * 8); LD_SW2(const0 + 4 * 11, 4, k0, k1); LD_SW2(const0 + 4 * 13, 4, k2, k3); MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3); ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8); r2 = LD_SH(input + 2 * 8); r5 = LD_SH(input + 5 * 8); r10 = LD_SH(input + 10 * 8); r13 = LD_SH(input + 13 * 8); LD_SW2(const0 + 4 * 15, 4, k0, k1); LD_SW2(const0 + 4 * 17, 4, k2, k3); MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3); ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8); BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3); ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8); }
void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch) { v8i16 in0, in1, in2, in3; v8i16 temp0, temp1, tmp0, tmp1; v8i16 const0, const1, const2; v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 }; v8i16 zero = { 0 }; v4i32 vec0_w, vec1_w, vec2_w, vec3_w; LD_SH4(input, pitch / 2, in0, in1, in2, in3); TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); SLLI_4V(temp0, temp1, in1, in3, 3); in0 = temp0 + temp1; in2 = temp0 - temp1; SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2); temp0 = __msa_splati_h(coeff, 3); vec1_w = (v4i32)__msa_ilvev_h(zero, temp0); coeff = __msa_ilvl_h(zero, coeff); vec3_w = __msa_splati_w((v4i32)coeff, 0); ILVRL_H2_SH(in3, in1, tmp1, tmp0); vec0_w = vec1_w; vec2_w = vec3_w; DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w, vec1_w, vec2_w, vec3_w); SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12); PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3); TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); in0 = temp0 + temp1 + 7; in2 = temp0 - temp1 + 7; in0 >>= 4; in2 >>= 4; SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w); vec3_w += vec1_w; vec1_w = __msa_splati_w((v4i32)coeff, 1); const0 = RET_1_IF_NZERO_H(in3); ILVRL_H2_SH(in3, in1, tmp1, tmp0); vec0_w = vec1_w; vec2_w = vec3_w; DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w, vec1_w, vec2_w, vec3_w); SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16); PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3); in1 += const0; PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1); ST_SH2(temp0, temp1, output, 8); PCKOD_D2_SH(in1, in0, in3, in2, in0, in2); ST_SH2(in0, in2, output + 16, 8); }
void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch) { v8i16 in0, in1, in2, in3; v8i16 temp0, temp1; v8i16 const0, const1; v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 }; v4i32 out0, out1, out2, out3; v8i16 zero = { 0 }; LD_SH4(input, pitch / 2, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); SLLI_4V(temp0, temp1, in1, in3, 3); in0 = temp0 + temp1; in2 = temp0 - temp1; SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1); temp0 = __msa_ilvr_h(in3, in1); in1 = __msa_splati_h(coeff, 3); out0 = (v4i32)__msa_ilvev_h(zero, in1); coeff = __msa_ilvl_h(zero, coeff); out1 = __msa_splati_w((v4i32)coeff, 0); DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1); out0 >>= 12; out1 >>= 12; PCKEV_H2_SH(out0, out0, out1, out1, in1, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); in0 = temp0 + temp1 + 7; in2 = temp0 - temp1 + 7; in0 >>= 4; in2 >>= 4; ILVR_H2_SW(zero, in0, zero, in2, out0, out2); temp1 = RET_1_IF_NZERO_H(in3); ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0); SPLATI_W2_SW(coeff, 2, out3, out1); out3 += out1; out1 = __msa_splati_w((v4i32)coeff, 1); DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3); out1 >>= 16; out3 >>= 16; out1 += (v4i32)temp1; PCKEV_H2_SH(out1, out0, out3, out2, in0, in2); ST_SH2(in0, in2, output, 8); }
static void avc_sub4x4_dct_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref, int32_t i_dst_stride, int16_t *p_dst ) { uint32_t i_src0, i_src1, i_src2, i_src3; uint32_t i_ref0, i_ref1, i_ref2, i_ref3; v16i8 src = { 0 }; v16i8 ref = { 0 }; v16u8 inp0, inp1; v8i16 diff0, diff1, diff2, diff3; v8i16 temp0, temp1, temp2, temp3; LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 ); LW4( p_ref, i_dst_stride, i_ref0, i_ref1, i_ref2, i_ref3 ); INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src ); INSERT_W4_SB( i_ref0, i_ref1, i_ref2, i_ref3, ref ); ILVRL_B2_UB( src, ref, inp0, inp1 ); HSUB_UB2_SH( inp0, inp1, diff0, diff2 ); diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 ); diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 ); BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 ); diff0 = temp0 + temp1; diff1 = ( temp3 << 1 ) + temp2; diff2 = temp0 - temp1; diff3 = temp3 - ( temp2 << 1 ); TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 ); BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 ); temp0 = diff0 + diff1; temp1 = ( diff3 << 1 ) + diff2; temp2 = diff0 - diff1; temp3 = diff3 - ( diff2 << 1 ); ILVR_D2_UB( temp1, temp0, temp3, temp2, inp0, inp1 ); ST_UB2( inp0, inp1, p_dst, 8 ); }
void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4], uint8_t *p_pix1, uint8_t *p_pix2 ) { int32_t d0, d1, d2, d3; pi_dct[0] = subtract_sum4x4_msa( &p_pix1[0], FENC_STRIDE, &p_pix2[0], FDEC_STRIDE ); pi_dct[1] = subtract_sum4x4_msa( &p_pix1[4], FENC_STRIDE, &p_pix2[4], FDEC_STRIDE ); pi_dct[2] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 0], FENC_STRIDE, &p_pix2[4 * FDEC_STRIDE + 0], FDEC_STRIDE ); pi_dct[3] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 4], FENC_STRIDE, &p_pix2[4 * FDEC_STRIDE + 4], FDEC_STRIDE ); BUTTERFLY_4( pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1], d0, d1, d3, d2 ); BUTTERFLY_4( d0, d2, d3, d1, pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1] ); }
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride) { v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30; v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; v8i16 coeff2 = { -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 }; LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15); SLLI_4V(in0, in1, in2, in3, 2); SLLI_4V(in4, in5, in6, in7, 2); SLLI_4V(in8, in9, in10, in11, 2); SLLI_4V(in12, in13, in14, in15, 2); ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3); ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7); FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32); SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12); SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8); tmp_ptr += 16; /* stp 1 */ ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4); ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5); cnst4 = __msa_splati_h(coeff, 0); stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4); cnst5 = __msa_splati_h(coeff, 1); cnst5 = __msa_ilvev_h(cnst5, cnst4); stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5); stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4); stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5); /* stp2 */ BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33); BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34); ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4); ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5); SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1); cnst0 = __msa_ilvev_h(cnst0, cnst1); stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0); cnst0 = __msa_splati_h(coeff, 4); cnst1 = __msa_ilvev_h(cnst1, cnst0); stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1); BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9); ILVRL_H2_SH(in15, in8, vec1, vec0); SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1); cnst0 = __msa_ilvev_h(cnst0, cnst1); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr); cnst0 = __msa_splati_h(coeff2, 0); cnst0 = __msa_ilvev_h(cnst1, cnst0); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr + 224); ILVRL_H2_SH(in14, in9, vec1, vec0); SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1); cnst1 = __msa_ilvev_h(cnst1, cnst0); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); ST_SH(in8, tmp_ptr + 128); cnst1 = __msa_splati_h(coeff2, 2); cnst0 = __msa_ilvev_h(cnst0, cnst1); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr + 96); SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1); cnst1 = __msa_ilvev_h(cnst1, cnst0); stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); cnst1 = __msa_splati_h(coeff, 3); cnst1 = __msa_ilvev_h(cnst0, cnst1); stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); /* stp4 */ ADD2(stp34, stp25, stp33, stp22, in13, in10); ILVRL_H2_SH(in13, in10, vec1, vec0); SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1); cnst0 = __msa_ilvev_h(cnst0, cnst1); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr + 64); cnst0 = __msa_splati_h(coeff2, 1); cnst0 = __msa_ilvev_h(cnst1, cnst0); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr + 160); SUB2(stp34, stp25, stp33, stp22, in12, in11); ILVRL_H2_SH(in12, in11, vec1, vec0); SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1); cnst1 = __msa_ilvev_h(cnst1, cnst0); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); ST_SH(in8, tmp_ptr + 192); cnst1 = __msa_splati_h(coeff2, 3); cnst0 = __msa_ilvev_h(cnst0, cnst1); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr + 32); }
static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src, int32_t i_dst_stride ) { v8i16 src0, src1, src2, src3, src4, src5, src6, src7; v8i16 vec0, vec1, vec2, vec3; v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v8i16 res0, res1, res2, res3, res4, res5, res6, res7; v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r; v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l; v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l; v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r; v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l; v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v16i8 zeros = { 0 }; p_src[ 0 ] += 32; LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 ); vec0 = src0 + src4; vec1 = src0 - src4; vec2 = src2 >> 1; vec2 = vec2 - src6; vec3 = src6 >> 1; vec3 = src2 + vec3; BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 ); vec0 = src7 >> 1; vec0 = src5 - vec0 - src3 - src7; vec1 = src3 >> 1; vec1 = src1 - vec1 + src7 - src3; vec2 = src5 >> 1; vec2 = vec2 - src1 + src7 + src5; vec3 = src1 >> 1; vec3 = vec3 + src3 + src5 + src1; tmp4 = vec3 >> 2; tmp4 += vec0; tmp5 = vec2 >> 2; tmp5 += vec1; tmp6 = vec1 >> 2; tmp6 -= vec2; tmp7 = vec0 >> 2; tmp7 = vec3 - tmp7; BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, res0, res1, res2, res3, res4, res5, res6, res7 ); TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7, res0, res1, res2, res3, res4, res5, res6, res7 ); UNPCK_SH_SW( res0, tmp0_r, tmp0_l ); UNPCK_SH_SW( res1, tmp1_r, tmp1_l ); UNPCK_SH_SW( res2, tmp2_r, tmp2_l ); UNPCK_SH_SW( res3, tmp3_r, tmp3_l ); UNPCK_SH_SW( res4, tmp4_r, tmp4_l ); UNPCK_SH_SW( res5, tmp5_r, tmp5_l ); UNPCK_SH_SW( res6, tmp6_r, tmp6_l ); UNPCK_SH_SW( res7, tmp7_r, tmp7_l ); BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r, vec0_r, vec0_l, vec1_l, vec1_r ); vec2_r = tmp2_r >> 1; vec2_l = tmp2_l >> 1; vec2_r -= tmp6_r; vec2_l -= tmp6_l; vec3_r = tmp6_r >> 1; vec3_l = tmp6_l >> 1; vec3_r += tmp2_r; vec3_l += tmp2_l; BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r, tmp0_r, tmp2_r, tmp4_r, tmp6_r ); BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l, tmp0_l, tmp2_l, tmp4_l, tmp6_l ); vec0_r = tmp7_r >> 1; vec0_l = tmp7_l >> 1; vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r; vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l; vec1_r = tmp3_r >> 1; vec1_l = tmp3_l >> 1; vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r; vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l; vec2_r = tmp5_r >> 1; vec2_l = tmp5_l >> 1; vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r; vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l; vec3_r = tmp1_r >> 1; vec3_l = tmp1_l >> 1; vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r; vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l; tmp1_r = vec3_r >> 2; tmp1_l = vec3_l >> 2; tmp1_r += vec0_r; tmp1_l += vec0_l; tmp3_r = vec2_r >> 2; tmp3_l = vec2_l >> 2; tmp3_r += vec1_r; tmp3_l += vec1_l; tmp5_r = vec1_r >> 2; tmp5_l = vec1_l >> 2; tmp5_r -= vec2_r; tmp5_l -= vec2_l; tmp7_r = vec0_r >> 2; tmp7_l = vec0_l >> 2; tmp7_r = vec3_r - tmp7_r; tmp7_l = vec3_l - tmp7_l; BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r, res0_r, res0_l, res7_l, res7_r ); BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r, res1_r, res1_l, res6_l, res6_r ); BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r, res2_r, res2_l, res5_l, res5_r ); BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r, res3_r, res3_l, res4_l, res4_r ); SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 ); SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 ); SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 ); SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 ); PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r, res0, res1, res2, res3 ); PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r, res4, res5, res6, res7 ); LD_SB8( p_dst, i_dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7 ); ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3, tmp0, tmp1, tmp2, tmp3 ); ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7, tmp4, tmp5, tmp6, tmp7 ); ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3, res0, res1, res2, res3 ); ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7, res4, res5, res6, res7 ); CLIP_SH4_0_255( res0, res1, res2, res3 ); CLIP_SH4_0_255( res4, res5, res6, res7 ); PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6, dst0, dst1, dst2, dst3 ); ST8x4_UB( dst0, dst1, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); ST8x4_UB( dst2, dst3, p_dst, i_dst_stride ); }
static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0, int16_t *out) { int16_t *out_ptr = out + 128; v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15; v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11; v8i16 out0, out1, out2, out3, out4, out5, out6, out7; v8i16 out8, out9, out10, out11, out12, out13, out14, out15; v4i32 k0, k1, k2, k3; LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15); LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7); LD_SW2(const0 + 4 * 19, 4, k0, k1); k2 = LD_SW(const0 + 4 * 21); MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); tp0 = LD_SH(int_buf + 4 * 8); tp1 = LD_SH(int_buf + 5 * 8); tp3 = LD_SH(int_buf + 10 * 8); tp2 = LD_SH(int_buf + 14 * 8); LD_SW2(const0 + 4 * 22, 4, k0, k1); k2 = LD_SW(const0 + 4 * 24); MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7); out4 = -out4; ST_SH(out4, (out + 3 * 16)); ST_SH(out5, (out_ptr + 4 * 16)); h1 = LD_SH(int_buf + 9 * 8); h3 = LD_SH(int_buf + 12 * 8); MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); out13 = -out13; ST_SH(out12, (out + 2 * 16)); ST_SH(out13, (out_ptr + 5 * 16)); tp0 = LD_SH(int_buf); tp1 = LD_SH(int_buf + 8); tp2 = LD_SH(int_buf + 2 * 8); tp3 = LD_SH(int_buf + 6 * 8); BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10); out1 = -out1; ST_SH(out0, (out)); ST_SH(out1, (out_ptr + 7 * 16)); h0 = LD_SH(int_buf + 8 * 8); h2 = LD_SH(int_buf + 13 * 8); BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); out8 = -out8; ST_SH(out8, (out + 16)); ST_SH(out9, (out_ptr + 6 * 16)); /* stage 4 */ LD_SW2(const0 + 4 * 25, 4, k0, k1); LD_SW2(const0 + 4 * 27, 4, k2, k3); MADD_SHORT(h10, h11, k1, k2, out2, out3); ST_SH(out2, (out + 7 * 16)); ST_SH(out3, (out_ptr)); MADD_SHORT(out6, out7, k0, k3, out6, out7); ST_SH(out6, (out + 4 * 16)); ST_SH(out7, (out_ptr + 3 * 16)); MADD_SHORT(out10, out11, k0, k3, out10, out11); ST_SH(out10, (out + 6 * 16)); ST_SH(out11, (out_ptr + 16)); MADD_SHORT(out14, out15, k1, k2, out14, out15); ST_SH(out14, (out + 5 * 16)); ST_SH(out15, (out_ptr + 2 * 16)); }
static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8i16 temp0, temp1; /* fdct even */ LD_SH4(input, 8, in0, in1, in2, in3); LD_SH4(input + 96, 8, in12, in13, in14, in15); BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2, vec3, in12, in13, in14, in15); LD_SH4(input + 32, 8, in4, in5, in6, in7); LD_SH4(input + 64, 8, in8, in9, in10, in11); BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7, in8, in9, in10, in11); /* Stage 3 */ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp); ST_SH(temp1, temp + 512); DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 256); ST_SH(temp1, temp + 768); SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4); DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); ADD2(vec4, vec5, vec7, vec6, vec0, vec1); DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 128); ST_SH(temp1, temp + 896); SUB2(vec4, vec5, vec7, vec6, vec4, vec7); DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 640); ST_SH(temp1, temp + 384); DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); ADD2(in0, in1, in2, in3, vec0, vec7); DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 64); ST_SH(temp1, temp + 960); SUB2(in0, in1, in2, in3, in0, in2); DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 576); ST_SH(temp1, temp + 448); SUB2(in9, vec2, in14, vec5, vec2, vec5); DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 320); ST_SH(temp1, temp + 704); ADD2(in3, in2, in0, in1, vec3, vec4); DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 192); ST_SH(temp1, temp + 832); }
static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; /* fdct32 even */ /* stage 2 */ LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, in15); /* Stage 3 */ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); FDCT_POSTPROC_2V_NEG_H(temp0, temp1); ST_SH(temp0, out); ST_SH(temp1, out + 8); DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); FDCT_POSTPROC_2V_NEG_H(temp0, temp1); ST_SH(temp0, out + 16); ST_SH(temp1, out + 24); SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); ADD2(vec4, vec5, vec7, vec6, vec0, vec1); DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); FDCT_POSTPROC_2V_NEG_H(temp0, temp1); ST_SH(temp0, out + 32); ST_SH(temp1, out + 56); SUB2(vec4, vec5, vec7, vec6, vec4, vec7); DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); FDCT_POSTPROC_2V_NEG_H(temp0, temp1); ST_SH(temp0, out + 40); ST_SH(temp1, out + 48); DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); ADD2(in0, in1, in2, in3, vec0, vec7); DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); FDCT_POSTPROC_2V_NEG_H(temp0, temp1); ST_SH(temp0, out + 64); ST_SH(temp1, out + 120); SUB2(in0, in1, in2, in3, in0, in2); DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); FDCT_POSTPROC_2V_NEG_H(temp0, temp1); ST_SH(temp0, out + 72); ST_SH(temp1, out + 112); SUB2(in9, vec2, in14, vec5, vec2, vec5); DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5) DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); FDCT_POSTPROC_2V_NEG_H(temp0, temp1); ST_SH(temp0, out + 80); ST_SH(temp1, out + 104); ADD2(in3, in2, in0, in1, vec3, vec4); DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); FDCT_POSTPROC_2V_NEG_H(temp0, temp1); ST_SH(temp0, out + 96); ST_SH(temp1, out + 88); }
static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, int16_t *out) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w; /* fdct32 even */ /* stage 2 */ LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8); /* Stage 3 */ UNPCK_SH_SW(vec0, vec0_l, vec0_r); UNPCK_SH_SW(vec1, vec1_l, vec1_r); UNPCK_SH_SW(vec2, vec2_l, vec2_r); UNPCK_SH_SW(vec3, vec3_l, vec3_r); UNPCK_SH_SW(vec4, vec4_l, vec4_r); UNPCK_SH_SW(vec5, vec5_l, vec5_r); UNPCK_SH_SW(vec6, vec6_l, vec6_r); UNPCK_SH_SW(vec7, vec7_l, vec7_r); ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w, tmp1_w, tmp2_w, tmp3_w); BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r); ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r, vec1_r, vec2_r, vec3_r); tmp3_w = vec0_r + vec3_r; vec0_r = vec0_r - vec3_r; vec3_r = vec1_r + vec2_r; vec1_r = vec1_r - vec2_r; DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r); FDCT32_POSTPROC_NEG_W(vec4_r); FDCT32_POSTPROC_NEG_W(tmp3_w); FDCT32_POSTPROC_NEG_W(vec6_r); FDCT32_POSTPROC_NEG_W(vec3_r); PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); ST_SH2(vec5, vec4, out, 8); DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r); FDCT32_POSTPROC_NEG_W(vec4_r); FDCT32_POSTPROC_NEG_W(tmp3_w); FDCT32_POSTPROC_NEG_W(vec6_r); FDCT32_POSTPROC_NEG_W(vec3_r); PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); ST_SH2(vec5, vec4, out + 16, 8); LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); ADD2(vec4, vec5, vec7, vec6, vec0, vec1); DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 32); ST_SH(in5, out + 56); SUB2(vec4, vec5, vec7, vec6, vec4, vec7); DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 40); ST_SH(in5, out + 48); LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); ADD2(in0, in1, in2, in3, vec0, vec7); DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 64); ST_SH(in5, out + 120); SUB2(in0, in1, in2, in3, in0, in2); DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 72); ST_SH(in5, out + 112); SUB2(in9, vec2, in14, vec5, vec2, vec5); DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 80); ST_SH(in5, out + 104); ADD2(in3, in2, in0, in1, vec3, vec4); DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 96); ST_SH(in5, out + 88); }