Exemple #1
0
void fdct16x8_1d_row(int16_t *input, int16_t *output) {
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;

  LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);
  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
  ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
  ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11);
  ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15);
  SRA_4V(in0, in1, in2, in3, 2);
  SRA_4V(in4, in5, in6, in7, 2);
  SRA_4V(in8, in9, in10, in11, 2);
  SRA_4V(in12, in13, in14, in15, 2);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6,
               tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15);
  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
               in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
                     tmp1, in1, tmp2, in2, tmp3, in3);
  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16);
  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
                     tmp5, in5, tmp6, in6, tmp7, in7);
  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16);
}
static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
                                           int16_t *output) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 step0, step1, step2, step3, step4, step5, step6, step7;

  LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
                     in8, in9, in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
               in8, in9, in10, in11, in12, in13, in14, in15,
               step0, step1, step2, step3, step4, step5, step6, step7,
               in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);

  /* 2nd set */
  LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
                     in8, in9, in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
               in8, in9, in10, in11, in12, in13, in14, in15,
               step0, step1, step2, step3, step4, step5, step6, step7,
               in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
         (output + 8 * 8), 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
}
Exemple #3
0
static void fadst16_transpose_msa(int16_t *input, int16_t *out) {
  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;

  /* load input data */
  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
          l7, l15);
  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
                     r7);
  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
                     r12, r13, r14, r15);
  ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
  ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
  out += 16 * 8;

  /* load input data */
  input += 128;
  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
          l7, l15);
  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
                     r7);
  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
                     r12, r13, r14, r15);
  ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
  ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
}
void vp9_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride,
                    int32_t tx_type) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

  LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7);
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);

  switch (tx_type) {
    case DCT_DCT:
      VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
                in0, in1, in2, in3, in4, in5, in6, in7);
      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                         in0, in1, in2, in3, in4, in5, in6, in7);
      VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
                in0, in1, in2, in3, in4, in5, in6, in7);
      break;
    case ADST_DCT:
      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
                in0, in1, in2, in3, in4, in5, in6, in7);
      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                         in0, in1, in2, in3, in4, in5, in6, in7);
      VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
                in0, in1, in2, in3, in4, in5, in6, in7);
      break;
    case DCT_ADST:
      VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
                in0, in1, in2, in3, in4, in5, in6, in7);
      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                         in0, in1, in2, in3, in4, in5, in6, in7);
      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
                in0, in1, in2, in3, in4, in5, in6, in7);
      break;
    case ADST_ADST:
      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
                in0, in1, in2, in3, in4, in5, in6, in7);
      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                         in0, in1, in2, in3, in4, in5, in6, in7);
      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
                in0, in1, in2, in3, in4, in5, in6, in7);
      break;
    default:
      assert(0);
      break;
  }

  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  VP9_SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
}
Exemple #5
0
static void hevc_idct_8x8_msa(int16_t *coeffs)
{
    const int16_t *filter = &gt8x8_cnst[0];
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs, 8);
}
Exemple #6
0
static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) {
  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;

  /* load input data */
  LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
                     r7);
  FDCT_POSTPROC_2V_NEG_H(r0, r1);
  FDCT_POSTPROC_2V_NEG_H(r2, r3);
  FDCT_POSTPROC_2V_NEG_H(r4, r5);
  FDCT_POSTPROC_2V_NEG_H(r6, r7);
  ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
  out += 64;

  LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
                     r12, r13, r14, r15);
  FDCT_POSTPROC_2V_NEG_H(r8, r9);
  FDCT_POSTPROC_2V_NEG_H(r10, r11);
  FDCT_POSTPROC_2V_NEG_H(r12, r13);
  FDCT_POSTPROC_2V_NEG_H(r14, r15);
  ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
  out += 64;

  /* load input data */
  input += 128;
  LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
                     r7);
  FDCT_POSTPROC_2V_NEG_H(r0, r1);
  FDCT_POSTPROC_2V_NEG_H(r2, r3);
  FDCT_POSTPROC_2V_NEG_H(r4, r5);
  FDCT_POSTPROC_2V_NEG_H(r6, r7);
  ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
  out += 64;

  LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
                     r12, r13, r14, r15);
  FDCT_POSTPROC_2V_NEG_H(r8, r9);
  FDCT_POSTPROC_2V_NEG_H(r10, r11);
  FDCT_POSTPROC_2V_NEG_H(r12, r13);
  FDCT_POSTPROC_2V_NEG_H(r14, r15);
  ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
}
Exemple #7
0
void vpx_fdct8x8_msa(const int16_t *input, int16_t *output,
                     int32_t src_stride) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

  LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
            in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
            in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
}
Exemple #8
0
static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) {
  int16_t *temp = intermediate;
  int16_t *out = output;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11;
  v8i16 in12, in13, in14, in15;

  LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7);
  temp = intermediate + 8;
  LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);
  FDCT_POSTPROC_2V_NEG_H(in0, in1);
  FDCT_POSTPROC_2V_NEG_H(in2, in3);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  FDCT_POSTPROC_2V_NEG_H(in6, in7);
  FDCT_POSTPROC_2V_NEG_H(in8, in9);
  FDCT_POSTPROC_2V_NEG_H(in10, in11);
  FDCT_POSTPROC_2V_NEG_H(in12, in13);
  FDCT_POSTPROC_2V_NEG_H(in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6,
               tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
  temp = intermediate;
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  temp = intermediate;
  LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
               in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
                     tmp1, in1, tmp2, in2, tmp3, in3);
  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16);
  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
                     tmp5, in5, tmp6, in6, tmp7, in7);
  out = output + 8;
  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16);
}
Exemple #9
0
static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs)
{
    uint8_t i;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (i = 0; i < 4; i++) {
        LD_SH8(tmp_buf + i * 8 * 8, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                           in0, in1, in2, in3, in4, in5, in6, in7);
        ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs + i * 8, 32);
    }
}
Exemple #10
0
static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src,
                                  int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
    v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
    v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
    v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
    v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 zeros = { 0 };

    p_src[ 0 ] += 32;

    LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );

    vec0 = src0 + src4;
    vec1 = src0 - src4;
    vec2 = src2 >> 1;
    vec2 = vec2 - src6;
    vec3 = src6 >> 1;
    vec3 = src2 + vec3;

    BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 );

    vec0 = src7 >> 1;
    vec0 = src5 - vec0 - src3 - src7;
    vec1 = src3 >> 1;
    vec1 = src1 - vec1 + src7 - src3;
    vec2 = src5 >> 1;
    vec2 = vec2 - src1 + src7 + src5;
    vec3 = src1 >> 1;
    vec3 = vec3 + src3 + src5 + src1;
    tmp4 = vec3 >> 2;
    tmp4 += vec0;
    tmp5 = vec2 >> 2;
    tmp5 += vec1;
    tmp6 = vec1 >> 2;
    tmp6 -= vec2;
    tmp7 = vec0 >> 2;
    tmp7 = vec3 - tmp7;

    BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                 res0, res1, res2, res3, res4, res5, res6, res7 );
    TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7,
                        res0, res1, res2, res3, res4, res5, res6, res7 );
    UNPCK_SH_SW( res0, tmp0_r, tmp0_l );
    UNPCK_SH_SW( res1, tmp1_r, tmp1_l );
    UNPCK_SH_SW( res2, tmp2_r, tmp2_l );
    UNPCK_SH_SW( res3, tmp3_r, tmp3_l );
    UNPCK_SH_SW( res4, tmp4_r, tmp4_l );
    UNPCK_SH_SW( res5, tmp5_r, tmp5_l );
    UNPCK_SH_SW( res6, tmp6_r, tmp6_l );
    UNPCK_SH_SW( res7, tmp7_r, tmp7_l );
    BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r,
                 vec0_r, vec0_l, vec1_l, vec1_r );

    vec2_r = tmp2_r >> 1;
    vec2_l = tmp2_l >> 1;
    vec2_r -= tmp6_r;
    vec2_l -= tmp6_l;
    vec3_r = tmp6_r >> 1;
    vec3_l = tmp6_l >> 1;
    vec3_r += tmp2_r;
    vec3_l += tmp2_l;

    BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r,
                 tmp0_r, tmp2_r, tmp4_r, tmp6_r );
    BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l,
                 tmp0_l, tmp2_l, tmp4_l, tmp6_l );

    vec0_r = tmp7_r >> 1;
    vec0_l = tmp7_l >> 1;
    vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
    vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
    vec1_r = tmp3_r >> 1;
    vec1_l = tmp3_l >> 1;
    vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
    vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
    vec2_r = tmp5_r >> 1;
    vec2_l = tmp5_l >> 1;
    vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
    vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
    vec3_r = tmp1_r >> 1;
    vec3_l = tmp1_l >> 1;
    vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
    vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
    tmp1_r = vec3_r >> 2;
    tmp1_l = vec3_l >> 2;
    tmp1_r += vec0_r;
    tmp1_l += vec0_l;
    tmp3_r = vec2_r >> 2;
    tmp3_l = vec2_l >> 2;
    tmp3_r += vec1_r;
    tmp3_l += vec1_l;
    tmp5_r = vec1_r >> 2;
    tmp5_l = vec1_l >> 2;
    tmp5_r -= vec2_r;
    tmp5_l -= vec2_l;
    tmp7_r = vec0_r >> 2;
    tmp7_l = vec0_l >> 2;
    tmp7_r = vec3_r - tmp7_r;
    tmp7_l = vec3_l - tmp7_l;

    BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r,
                 res0_r, res0_l, res7_l, res7_r );
    BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r,
                 res1_r, res1_l, res6_l, res6_r );
    BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r,
                 res2_r, res2_l, res5_l, res5_r );
    BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r,
                 res3_r, res3_l, res4_l, res4_r );
    SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 );
    SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 );
    SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 );
    SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 );
    PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
                 res0, res1, res2, res3 );
    PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
                 res4, res5, res6, res7 );
    LD_SB8( p_dst, i_dst_stride,
            dst0, dst1, dst2, dst3,
            dst4, dst5, dst6, dst7 );
    ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
                tmp0, tmp1, tmp2, tmp3 );
    ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
                tmp4, tmp5, tmp6, tmp7 );
    ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
          res0, res1, res2, res3 );
    ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
          res4, res5, res6, res7 );
    CLIP_SH4_0_255( res0, res1, res2, res3 );
    CLIP_SH4_0_255( res4, res5, res6, res7 );
    PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
                 dst0, dst1, dst2, dst3 );
    ST8x4_UB( dst0, dst1, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    ST8x4_UB( dst2, dst3, p_dst, i_dst_stride );
}
static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;

  /* 1st set */
  in0 = LD_SH(temp);
  in4 = LD_SH(temp + 32);
  in2 = LD_SH(temp + 64);
  in6 = LD_SH(temp + 96);
  in1 = LD_SH(temp + 128);
  in7 = LD_SH(temp + 152);
  in3 = LD_SH(temp + 192);
  in5 = LD_SH(temp + 216);

  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);

  /* 2nd set */
  in0_1 = LD_SH(temp + 16);
  in1_1 = LD_SH(temp + 232);
  in2_1 = LD_SH(temp + 80);
  in3_1 = LD_SH(temp + 168);
  in4_1 = LD_SH(temp + 48);
  in5_1 = LD_SH(temp + 176);
  in6_1 = LD_SH(temp + 112);
  in7_1 = LD_SH(temp + 240);

  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32);
  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);

  /* 3rd set */
  in0 = LD_SH(temp + 8);
  in1 = LD_SH(temp + 136);
  in2 = LD_SH(temp + 72);
  in3 = LD_SH(temp + 200);
  in4 = LD_SH(temp + 40);
  in5 = LD_SH(temp + 208);
  in6 = LD_SH(temp + 104);
  in7 = LD_SH(temp + 144);

  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
         output + 8, 32);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);

  /* 4th set */
  in0_1 = LD_SH(temp + 24);
  in1_1 = LD_SH(temp + 224);
  in2_1 = LD_SH(temp + 88);
  in3_1 = LD_SH(temp + 160);
  in4_1 = LD_SH(temp + 56);
  in5_1 = LD_SH(temp + 184);
  in6_1 = LD_SH(temp + 120);
  in7_1 = LD_SH(temp + 248);

  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
         output + 24, 32);
}
void vp10_iht8x8_64_add_msa(const int16_t *input, uint8_t *dst,
                           int32_t dst_stride, int32_t tx_type) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

  /* load vector elements of 8x8 block */
  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);

  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);

  switch (tx_type) {
    case DCT_DCT:
      /* DCT in horizontal */
      VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
      /* DCT in vertical */
      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                         in0, in1, in2, in3, in4, in5, in6, in7);
      VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
      break;
    case ADST_DCT:
      /* DCT in horizontal */
      VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
      /* ADST in vertical */
      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                         in0, in1, in2, in3, in4, in5, in6, in7);
      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
                in0, in1, in2, in3, in4, in5, in6, in7);
      break;
    case DCT_ADST:
      /* ADST in horizontal */
      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
                in0, in1, in2, in3, in4, in5, in6, in7);
      /* DCT in vertical */
      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                         in0, in1, in2, in3, in4, in5, in6, in7);
      VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
      break;
    case ADST_ADST:
      /* ADST in horizontal */
      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
                in0, in1, in2, in3, in4, in5, in6, in7);
      /* ADST in vertical */
      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                         in0, in1, in2, in3, in4, in5, in6, in7);
      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
                in0, in1, in2, in3, in4, in5, in6, in7);
      break;
    default:
      assert(0);
      break;
  }

  /* final rounding (add 2^4, divide by 2^5) and shift */
  SRARI_H4_SH(in0, in1, in2, in3, 5);
  SRARI_H4_SH(in4, in5, in6, in7, 5);

  /* add block and store 8x8 */
  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
  dst += (4 * dst_stride);
  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
}
Exemple #13
0
static void hevc_idct_16x16_msa(int16_t *coeffs)
{
    int16_t i, j, k;
    int16_t buf[256];
    int16_t *buf_ptr = &buf[0];
    int16_t *src = coeffs;
    const int16_t *filter = &gt16x16_cnst[0];
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;

    for (i = 2; i--;) {
        LD_SH16(src, 16, in0, in1, in2, in3, in4, in5, in6, in7,
                in8, in9, in10, in11, in12, in13, in14, in15);

        ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
                   src0_r, src1_r, src2_r, src3_r);
        ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
                   src4_r, src5_r, src6_r, src7_r);
        ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
                   src0_l, src1_l, src2_l, src3_l);
        ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
                   src4_l, src5_l, src6_l, src7_l);
        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
                           src4_l, src5_l, src6_l, src7_l, 7);

        src += 8;
        buf_ptr = (&buf[0] + 8);
        filter = &gt16x16_cnst[0];
    }

    src = &buf[0];
    buf_ptr = coeffs;
    filter = &gt16x16_cnst[0];

    for (i = 2; i--;) {
        LD_SH16(src, 8, in0, in8, in1, in9, in2, in10, in3, in11,
                in4, in12, in5, in13, in6, in14, in7, in15);
        TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                           in0, in1, in2, in3, in4, in5, in6, in7);
        TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
                           in8, in9, in10, in11, in12, in13, in14, in15);
        ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
                   src0_r, src1_r, src2_r, src3_r);
        ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
                   src4_r, src5_r, src6_r, src7_r);
        ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
                   src0_l, src1_l, src2_l, src3_l);
        ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
                   src4_l, src5_l, src6_l, src7_l);
        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
                           src4_l, src5_l, src6_l, src7_l, 12);

        src += 128;
        buf_ptr = coeffs + 8;
        filter = &gt16x16_cnst[0];
    }

    LD_SH8(coeffs, 16, in0, in1, in2, in3, in4, in5, in6, in7);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, coeffs, 16);

    LD_SH8((coeffs + 8), 16, in0, in1, in2, in3, in4, in5, in6, in7);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    LD_SH8((coeffs + 128), 16, in8, in9, in10, in11, in12, in13, in14, in15);
    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 128), 16);
    TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 8), 16);

    LD_SH8((coeffs + 136), 16, in0, in1, in2, in3, in4, in5, in6, in7);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 136), 16);
}