C++ (Cpp) BUTTERFLY_4の例

コード例 #1

0

ファイルを表示

ファイル: dct-c.c プロジェクト: 12307/PushRTMPStreamSync

static void avc_idct4x4dc_msa( int16_t *p_src, int32_t i_src_stride,
                               int16_t *p_dst, int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3;
    v4i32 src0_r, src1_r, src2_r, src3_r;
    v4i32 hres0, hres1, hres2, hres3;
    v8i16 vres0, vres1, vres2, vres3;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v2i64 res0, res1;

    LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
    UNPCK_R_SH_SW( src0, src0_r );
    UNPCK_R_SH_SW( src1, src1_r );
    UNPCK_R_SH_SW( src2, src2_r );
    UNPCK_R_SH_SW( src3, src3_r );
    BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, vec0, vec3, vec2, vec1 );
    BUTTERFLY_4( vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1 );
    TRANSPOSE4x4_SW_SW( hres0, hres1, hres2, hres3,
                        hres0, hres1, hres2, hres3 );
    BUTTERFLY_4( hres0, hres2, hres3, hres1, vec0, vec3, vec2, vec1 );
    BUTTERFLY_4( vec0, vec1, vec2, vec3, vec4, vec7, vec6, vec5 );
    PCKEV_H4_SH( vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
                 vres0, vres1, vres2, vres3 );
    PCKOD_D2_SD( vres1, vres0, vres3, vres2, res0, res1 );
    ST8x4_UB( res0, res1, p_dst, i_dst_stride * 2 );
}

コード例 #2

0

ファイルを表示

ファイル: dct-c.c プロジェクト: MasterNobody/x264

static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst,
                              int32_t i_src_stride )
{
    v8i16 src0, src1, src2, src3, ver_res0, ver_res1, ver_res2, ver_res3;
    v4i32 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
    v4i32 hor_res0, hor_res1, hor_res2, hor_res3;
    v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r;

    LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
    UNPCK_R_SH_SW( src0, src0_r );
    UNPCK_R_SH_SW( src1, src1_r );
    UNPCK_R_SH_SW( src2, src2_r );
    UNPCK_R_SH_SW( src3, src3_r );
    BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r,
                 tmp0, tmp3, tmp2, tmp1 );
    BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
                 hor_res0, hor_res3, hor_res2, hor_res1 );
    TRANSPOSE4x4_SW_SW( hor_res0, hor_res1, hor_res2, hor_res3,
                        hor_res0, hor_res1, hor_res2, hor_res3 );
    BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1,
                 tmp0, tmp3, tmp2, tmp1 );
    BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
                 ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r );
    SRARI_W4_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 );
    PCKEV_H4_SH( ver_res0_r, ver_res0_r, ver_res1_r, ver_res1_r,
                 ver_res2_r, ver_res2_r, ver_res3_r, ver_res3_r,
                 ver_res0, ver_res1, ver_res2, ver_res3 );
    PCKOD_D2_SH( ver_res1, ver_res0, ver_res3, ver_res2, ver_res0, ver_res2 );
    ST_SH2( ver_res0, ver_res2, p_dst, 8 );
}

コード例 #3

0

ファイルを表示

ファイル: fdct16x16_msa.c プロジェクト: nwgat/Mirror-aom

static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0,
                                   int16_t *int_buf) {
  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
  v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
  v4i32 k0, k1, k2, k3;

  /* load input data */
  r0 = LD_SH(input);
  r7 = LD_SH(input + 7 * 8);
  r8 = LD_SH(input + 8 * 8);
  r15 = LD_SH(input + 15 * 8);

  /* stage 1 */
  LD_SW2(const0, 4, k0, k1);
  LD_SW2(const0 + 4 * 2, 4, k2, k3);
  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);

  r3 = LD_SH(input + 3 * 8);
  r4 = LD_SH(input + 4 * 8);
  r11 = LD_SH(input + 11 * 8);
  r12 = LD_SH(input + 12 * 8);

  LD_SW2(const0 + 4 * 4, 4, k0, k1);
  LD_SW2(const0 + 4 * 6, 4, k2, k3);
  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);

  /* stage 2 */
  BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
  ST_SH2(tp0, tp1, int_buf, 4 * 8);
  ST_SH2(tp2, tp3, int_buf + 8, 4 * 8);

  LD_SW2(const0 + 4 * 8, 4, k0, k1);
  k2 = LD_SW(const0 + 4 * 10);
  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
  ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8);
  ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8);

  r1 = LD_SH(input + 8);
  r6 = LD_SH(input + 6 * 8);
  r9 = LD_SH(input + 9 * 8);
  r14 = LD_SH(input + 14 * 8);

  LD_SW2(const0 + 4 * 11, 4, k0, k1);
  LD_SW2(const0 + 4 * 13, 4, k2, k3);
  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
  ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);

  r2 = LD_SH(input + 2 * 8);
  r5 = LD_SH(input + 5 * 8);
  r10 = LD_SH(input + 10 * 8);
  r13 = LD_SH(input + 13 * 8);

  LD_SW2(const0 + 4 * 15, 4, k0, k1);
  LD_SW2(const0 + 4 * 17, 4, k2, k3);
  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
  ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
  BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
  ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
}

コード例 #4

0

ファイルを表示

ファイル: dct_msa.c プロジェクト: Corax26/libvpx

void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
  v8i16 in0, in1, in2, in3;
  v8i16 temp0, temp1, tmp0, tmp1;
  v8i16 const0, const1, const2;
  v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
  v8i16 zero = { 0 };
  v4i32 vec0_w, vec1_w, vec2_w, vec3_w;

  LD_SH4(input, pitch / 2, in0, in1, in2, in3);
  TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);

  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  SLLI_4V(temp0, temp1, in1, in3, 3);
  in0 = temp0 + temp1;
  in2 = temp0 - temp1;
  SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
  temp0 = __msa_splati_h(coeff, 3);
  vec1_w = (v4i32)__msa_ilvev_h(zero, temp0);
  coeff = __msa_ilvl_h(zero, coeff);
  vec3_w = __msa_splati_w((v4i32)coeff, 0);
  ILVRL_H2_SH(in3, in1, tmp1, tmp0);
  vec0_w = vec1_w;
  vec2_w = vec3_w;
  DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w,
               vec1_w, vec2_w, vec3_w);
  SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12);
  PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
  TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);

  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  in0 = temp0 + temp1 + 7;
  in2 = temp0 - temp1 + 7;
  in0 >>= 4;
  in2 >>= 4;
  SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w);
  vec3_w += vec1_w;
  vec1_w = __msa_splati_w((v4i32)coeff, 1);
  const0 = RET_1_IF_NZERO_H(in3);
  ILVRL_H2_SH(in3, in1, tmp1, tmp0);
  vec0_w = vec1_w;
  vec2_w = vec3_w;
  DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w,
               vec1_w, vec2_w, vec3_w);
  SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16);
  PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
  in1 += const0;
  PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1);
  ST_SH2(temp0, temp1, output, 8);

  PCKOD_D2_SH(in1, in0, in3, in2, in0, in2);
  ST_SH2(in0, in2, output + 16, 8);
}

コード例 #5

0

ファイルを表示

ファイル: dct_msa.c プロジェクト: Corax26/libvpx

void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
  v8i16 in0, in1, in2, in3;
  v8i16 temp0, temp1;
  v8i16 const0, const1;
  v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
  v4i32 out0, out1, out2, out3;
  v8i16 zero = { 0 };

  LD_SH4(input, pitch / 2, in0, in1, in2, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);

  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  SLLI_4V(temp0, temp1, in1, in3, 3);
  in0 = temp0 + temp1;
  in2 = temp0 - temp1;
  SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
  temp0 = __msa_ilvr_h(in3, in1);
  in1 = __msa_splati_h(coeff, 3);
  out0 = (v4i32)__msa_ilvev_h(zero, in1);
  coeff = __msa_ilvl_h(zero, coeff);
  out1 = __msa_splati_w((v4i32)coeff, 0);
  DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1);
  out0 >>= 12;
  out1 >>= 12;
  PCKEV_H2_SH(out0, out0, out1, out1, in1, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);

  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  in0 = temp0 + temp1 + 7;
  in2 = temp0 - temp1 + 7;
  in0 >>= 4;
  in2 >>= 4;
  ILVR_H2_SW(zero, in0, zero, in2, out0, out2);
  temp1 = RET_1_IF_NZERO_H(in3);
  ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0);
  SPLATI_W2_SW(coeff, 2, out3, out1);
  out3 += out1;
  out1 = __msa_splati_w((v4i32)coeff, 1);
  DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3);
  out1 >>= 16;
  out3 >>= 16;
  out1 += (v4i32)temp1;
  PCKEV_H2_SH(out1, out0, out3, out2, in0, in2);
  ST_SH2(in0, in2, output, 8);
}

コード例 #6

0

ファイルを表示

ファイル: dct-c.c プロジェクト: 0x0B501E7E/x264

static void avc_sub4x4_dct_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref, int32_t i_dst_stride,
                                int16_t *p_dst )
{
    uint32_t i_src0, i_src1, i_src2, i_src3;
    uint32_t i_ref0, i_ref1, i_ref2, i_ref3;
    v16i8 src = { 0 };
    v16i8 ref = { 0 };
    v16u8 inp0, inp1;
    v8i16 diff0, diff1, diff2, diff3;
    v8i16 temp0, temp1, temp2, temp3;

    LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 );
    LW4( p_ref, i_dst_stride, i_ref0, i_ref1, i_ref2, i_ref3 );

    INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src );
    INSERT_W4_SB( i_ref0, i_ref1, i_ref2, i_ref3, ref );

    ILVRL_B2_UB( src, ref, inp0, inp1 );

    HSUB_UB2_SH( inp0, inp1, diff0, diff2 );

    diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 );
    diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 );

    BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 );

    diff0 = temp0 + temp1;
    diff1 = ( temp3 << 1 ) + temp2;
    diff2 = temp0 - temp1;
    diff3 = temp3 - ( temp2 << 1 );

    TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3,
                        temp0, temp1, temp2, temp3 );
    BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 );

    temp0 = diff0 + diff1;
    temp1 = ( diff3 << 1 ) + diff2;
    temp2 = diff0 - diff1;
    temp3 = diff3 - ( diff2 << 1 );

    ILVR_D2_UB( temp1, temp0, temp3, temp2, inp0, inp1 );
    ST_UB2( inp0, inp1, p_dst, 8 );
}

コード例 #7

0

ファイルを表示

ファイル: dct-c.c プロジェクト: 0x0B501E7E/x264

void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4],
                             uint8_t *p_pix1, uint8_t *p_pix2 )
{
    int32_t d0, d1, d2, d3;

    pi_dct[0] = subtract_sum4x4_msa( &p_pix1[0], FENC_STRIDE,
                                     &p_pix2[0], FDEC_STRIDE );
    pi_dct[1] = subtract_sum4x4_msa( &p_pix1[4], FENC_STRIDE,
                                     &p_pix2[4], FDEC_STRIDE );
    pi_dct[2] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 0], FENC_STRIDE,
                                     &p_pix2[4 * FDEC_STRIDE + 0],
                                     FDEC_STRIDE );
    pi_dct[3] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 4], FENC_STRIDE,
                                     &p_pix2[4 * FDEC_STRIDE + 4],
                                     FDEC_STRIDE );

    BUTTERFLY_4( pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1], d0, d1, d3, d2 );
    BUTTERFLY_4( d0, d2, d3, d1, pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1] );
}

コード例 #8

0

ファイルを表示

ファイル: fwd_txfm_msa.c プロジェクト: MIPS/external-libvpx

void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
                        int32_t src_stride) {
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30;
  v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
  v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64,  cospi_24_64,
                  -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
  v8i16 coeff1 = { cospi_2_64,  cospi_30_64, cospi_14_64, cospi_18_64,
                   cospi_10_64, cospi_22_64, cospi_6_64,  cospi_26_64 };
  v8i16 coeff2 = {
    -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0
  };

  LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9,
          in10, in11, in12, in13, in14, in15);
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  SLLI_4V(in8, in9, in10, in11, 2);
  SLLI_4V(in12, in13, in14, in15, 2);
  ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
  ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);
  SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
  SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);

  tmp_ptr += 16;

  /* stp 1 */
  ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4);
  ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5);

  cnst4 = __msa_splati_h(coeff, 0);
  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4);

  cnst5 = __msa_splati_h(coeff, 1);
  cnst5 = __msa_ilvev_h(cnst5, cnst4);
  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5);
  stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4);
  stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5);

  /* stp2 */
  BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
  BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
  ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4);
  ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5);
  SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0);

  cnst0 = __msa_splati_h(coeff, 4);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);
  stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1);

  BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
  ILVRL_H2_SH(in15, in8, vec1, vec0);
  SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);

  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr);

  cnst0 = __msa_splati_h(coeff2, 0);
  cnst0 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 224);

  ILVRL_H2_SH(in14, in9, vec1, vec0);
  SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);

  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
  ST_SH(in8, tmp_ptr + 128);

  cnst1 = __msa_splati_h(coeff2, 2);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 96);

  SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);

  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);

  cnst1 = __msa_splati_h(coeff, 3);
  cnst1 = __msa_ilvev_h(cnst0, cnst1);
  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);

  /* stp4 */
  ADD2(stp34, stp25, stp33, stp22, in13, in10);

  ILVRL_H2_SH(in13, in10, vec1, vec0);
  SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 64);

  cnst0 = __msa_splati_h(coeff2, 1);
  cnst0 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 160);

  SUB2(stp34, stp25, stp33, stp22, in12, in11);
  ILVRL_H2_SH(in12, in11, vec1, vec0);
  SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);

  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
  ST_SH(in8, tmp_ptr + 192);

  cnst1 = __msa_splati_h(coeff2, 3);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 32);
}

コード例 #9

0

ファイルを表示

ファイル: dct-c.c プロジェクト: 0x0B501E7E/x264

static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src,
                                  int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
    v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
    v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
    v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
    v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 zeros = { 0 };

    p_src[ 0 ] += 32;

    LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );

    vec0 = src0 + src4;
    vec1 = src0 - src4;
    vec2 = src2 >> 1;
    vec2 = vec2 - src6;
    vec3 = src6 >> 1;
    vec3 = src2 + vec3;

    BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 );

    vec0 = src7 >> 1;
    vec0 = src5 - vec0 - src3 - src7;
    vec1 = src3 >> 1;
    vec1 = src1 - vec1 + src7 - src3;
    vec2 = src5 >> 1;
    vec2 = vec2 - src1 + src7 + src5;
    vec3 = src1 >> 1;
    vec3 = vec3 + src3 + src5 + src1;
    tmp4 = vec3 >> 2;
    tmp4 += vec0;
    tmp5 = vec2 >> 2;
    tmp5 += vec1;
    tmp6 = vec1 >> 2;
    tmp6 -= vec2;
    tmp7 = vec0 >> 2;
    tmp7 = vec3 - tmp7;

    BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                 res0, res1, res2, res3, res4, res5, res6, res7 );
    TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7,
                        res0, res1, res2, res3, res4, res5, res6, res7 );
    UNPCK_SH_SW( res0, tmp0_r, tmp0_l );
    UNPCK_SH_SW( res1, tmp1_r, tmp1_l );
    UNPCK_SH_SW( res2, tmp2_r, tmp2_l );
    UNPCK_SH_SW( res3, tmp3_r, tmp3_l );
    UNPCK_SH_SW( res4, tmp4_r, tmp4_l );
    UNPCK_SH_SW( res5, tmp5_r, tmp5_l );
    UNPCK_SH_SW( res6, tmp6_r, tmp6_l );
    UNPCK_SH_SW( res7, tmp7_r, tmp7_l );
    BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r,
                 vec0_r, vec0_l, vec1_l, vec1_r );

    vec2_r = tmp2_r >> 1;
    vec2_l = tmp2_l >> 1;
    vec2_r -= tmp6_r;
    vec2_l -= tmp6_l;
    vec3_r = tmp6_r >> 1;
    vec3_l = tmp6_l >> 1;
    vec3_r += tmp2_r;
    vec3_l += tmp2_l;

    BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r,
                 tmp0_r, tmp2_r, tmp4_r, tmp6_r );
    BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l,
                 tmp0_l, tmp2_l, tmp4_l, tmp6_l );

    vec0_r = tmp7_r >> 1;
    vec0_l = tmp7_l >> 1;
    vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
    vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
    vec1_r = tmp3_r >> 1;
    vec1_l = tmp3_l >> 1;
    vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
    vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
    vec2_r = tmp5_r >> 1;
    vec2_l = tmp5_l >> 1;
    vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
    vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
    vec3_r = tmp1_r >> 1;
    vec3_l = tmp1_l >> 1;
    vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
    vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
    tmp1_r = vec3_r >> 2;
    tmp1_l = vec3_l >> 2;
    tmp1_r += vec0_r;
    tmp1_l += vec0_l;
    tmp3_r = vec2_r >> 2;
    tmp3_l = vec2_l >> 2;
    tmp3_r += vec1_r;
    tmp3_l += vec1_l;
    tmp5_r = vec1_r >> 2;
    tmp5_l = vec1_l >> 2;
    tmp5_r -= vec2_r;
    tmp5_l -= vec2_l;
    tmp7_r = vec0_r >> 2;
    tmp7_l = vec0_l >> 2;
    tmp7_r = vec3_r - tmp7_r;
    tmp7_l = vec3_l - tmp7_l;

    BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r,
                 res0_r, res0_l, res7_l, res7_r );
    BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r,
                 res1_r, res1_l, res6_l, res6_r );
    BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r,
                 res2_r, res2_l, res5_l, res5_r );
    BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r,
                 res3_r, res3_l, res4_l, res4_r );
    SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 );
    SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 );
    SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 );
    SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 );
    PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
                 res0, res1, res2, res3 );
    PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
                 res4, res5, res6, res7 );
    LD_SB8( p_dst, i_dst_stride,
            dst0, dst1, dst2, dst3,
            dst4, dst5, dst6, dst7 );
    ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
                tmp0, tmp1, tmp2, tmp3 );
    ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
                tmp4, tmp5, tmp6, tmp7 );
    ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
          res0, res1, res2, res3 );
    ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
          res4, res5, res6, res7 );
    CLIP_SH4_0_255( res0, res1, res2, res3 );
    CLIP_SH4_0_255( res4, res5, res6, res7 );
    PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
                 dst0, dst1, dst2, dst3 );
    ST8x4_UB( dst0, dst1, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    ST8x4_UB( dst2, dst3, p_dst, i_dst_stride );
}

コード例 #10

0

ファイルを表示

ファイル: fdct16x16_msa.c プロジェクト: nwgat/Mirror-aom

static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0,
                                   int16_t *out) {
  int16_t *out_ptr = out + 128;
  v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
  v4i32 k0, k1, k2, k3;

  LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15);
  LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7);
  LD_SW2(const0 + 4 * 19, 4, k0, k1);
  k2 = LD_SW(const0 + 4 * 21);
  MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);

  tp0 = LD_SH(int_buf + 4 * 8);
  tp1 = LD_SH(int_buf + 5 * 8);
  tp3 = LD_SH(int_buf + 10 * 8);
  tp2 = LD_SH(int_buf + 14 * 8);
  LD_SW2(const0 + 4 * 22, 4, k0, k1);
  k2 = LD_SW(const0 + 4 * 24);
  MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
  out4 = -out4;
  ST_SH(out4, (out + 3 * 16));
  ST_SH(out5, (out_ptr + 4 * 16));

  h1 = LD_SH(int_buf + 9 * 8);
  h3 = LD_SH(int_buf + 12 * 8);
  MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
  out13 = -out13;
  ST_SH(out12, (out + 2 * 16));
  ST_SH(out13, (out_ptr + 5 * 16));

  tp0 = LD_SH(int_buf);
  tp1 = LD_SH(int_buf + 8);
  tp2 = LD_SH(int_buf + 2 * 8);
  tp3 = LD_SH(int_buf + 6 * 8);

  BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
  out1 = -out1;
  ST_SH(out0, (out));
  ST_SH(out1, (out_ptr + 7 * 16));

  h0 = LD_SH(int_buf + 8 * 8);
  h2 = LD_SH(int_buf + 13 * 8);

  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
  out8 = -out8;
  ST_SH(out8, (out + 16));
  ST_SH(out9, (out_ptr + 6 * 16));

  /* stage 4 */
  LD_SW2(const0 + 4 * 25, 4, k0, k1);
  LD_SW2(const0 + 4 * 27, 4, k2, k3);
  MADD_SHORT(h10, h11, k1, k2, out2, out3);
  ST_SH(out2, (out + 7 * 16));
  ST_SH(out3, (out_ptr));

  MADD_SHORT(out6, out7, k0, k3, out6, out7);
  ST_SH(out6, (out + 4 * 16));
  ST_SH(out7, (out_ptr + 3 * 16));

  MADD_SHORT(out10, out11, k0, k3, out10, out11);
  ST_SH(out10, (out + 6 * 16));
  ST_SH(out11, (out_ptr + 16));

  MADD_SHORT(out14, out15, k1, k2, out14, out15);
  ST_SH(out14, (out + 5 * 16));
  ST_SH(out15, (out_ptr + 2 * 16));
}

コード例 #11

0

ファイルを表示

ファイル: fwd_dct32x32_msa.c プロジェクト: ALEJANDROJ19/VTW-server

static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 temp0, temp1;

  /* fdct even */
  LD_SH4(input, 8, in0, in1, in2, in3);
  LD_SH4(input + 96, 8, in12, in13, in14, in15);
  BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15,
              vec0, vec1, vec2, vec3, in12, in13, in14, in15);
  LD_SH4(input + 32, 8, in4, in5, in6, in7);
  LD_SH4(input + 64, 8, in8, in9, in10, in11);
  BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11,
              vec4, vec5, vec6, vec7, in8, in9, in10, in11);

  /* Stage 3 */
  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp);
  ST_SH(temp1, temp + 512);

  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 256);
  ST_SH(temp1, temp + 768);

  SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 128);
  ST_SH(temp1, temp + 896);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 640);
  ST_SH(temp1, temp + 384);

  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 64);
  ST_SH(temp1, temp + 960);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 576);
  ST_SH(temp1, temp + 448);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 320);
  ST_SH(temp1, temp + 704);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 192);
  ST_SH(temp1, temp + 832);
}

コード例 #12

0

ファイルを表示

ファイル: fwd_dct32x32_msa.c プロジェクト: ALEJANDROJ19/VTW-server

static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;

  /* fdct32 even */
  /* stage 2 */
  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
               in8, in9, in10, in11, in12, in13, in14, in15,
               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
               in8, in9, in10, in11, in12, in13, in14, in15);

  /* Stage 3 */
  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
  ST_SH(temp0, out);
  ST_SH(temp1, out + 8);

  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
  ST_SH(temp0, out + 16);
  ST_SH(temp1, out + 24);

  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
  ST_SH(temp0, out + 32);
  ST_SH(temp1, out + 56);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
  ST_SH(temp0, out + 40);
  ST_SH(temp1, out + 48);

  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
  ST_SH(temp0, out + 64);
  ST_SH(temp1, out + 120);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
  ST_SH(temp0, out + 72);
  ST_SH(temp1, out + 112);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5)
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
  ST_SH(temp0, out + 80);
  ST_SH(temp1, out + 104);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
  ST_SH(temp0, out + 96);
  ST_SH(temp1, out + 88);
}

コード例 #13

0

ファイルを表示

ファイル: fwd_dct32x32_msa.c プロジェクト: ALEJANDROJ19/VTW-server

static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
                                    int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
  v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
  v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;

  /* fdct32 even */
  /* stage 2 */
  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
               in8, in9, in10, in11, in12, in13, in14, in15,
               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
               in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);

  /* Stage 3 */
  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
  ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r,
       tmp0_w, tmp1_w, tmp2_w, tmp3_w);
  BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
  ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l,
       vec0_r, vec1_r, vec2_r, vec3_r);

  tmp3_w = vec0_r + vec3_r;
  vec0_r = vec0_r - vec3_r;
  vec3_r = vec1_r + vec2_r;
  vec1_r = vec1_r - vec2_r;

  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64,
                    cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out, 8);

  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64,
                    cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out + 16, 8);

  LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 32);
  ST_SH(in5, out + 56);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 40);
  ST_SH(in5, out + 48);

  LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 64);
  ST_SH(in5, out + 120);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 72);
  ST_SH(in5, out + 112);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 80);
  ST_SH(in5, out + 104);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 96);
  ST_SH(in5, out + 88);
}