Ejemplo n.º 1
0
static void avc_idct4x4dc_msa( int16_t *p_src, int32_t i_src_stride,
                               int16_t *p_dst, int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3;
    v4i32 src0_r, src1_r, src2_r, src3_r;
    v4i32 hres0, hres1, hres2, hres3;
    v8i16 vres0, vres1, vres2, vres3;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v2i64 res0, res1;

    LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
    UNPCK_R_SH_SW( src0, src0_r );
    UNPCK_R_SH_SW( src1, src1_r );
    UNPCK_R_SH_SW( src2, src2_r );
    UNPCK_R_SH_SW( src3, src3_r );
    BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, vec0, vec3, vec2, vec1 );
    BUTTERFLY_4( vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1 );
    TRANSPOSE4x4_SW_SW( hres0, hres1, hres2, hres3,
                        hres0, hres1, hres2, hres3 );
    BUTTERFLY_4( hres0, hres2, hres3, hres1, vec0, vec3, vec2, vec1 );
    BUTTERFLY_4( vec0, vec1, vec2, vec3, vec4, vec7, vec6, vec5 );
    PCKEV_H4_SH( vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
                 vres0, vres1, vres2, vres3 );
    PCKOD_D2_SD( vres1, vres0, vres3, vres2, res0, res1 );
    ST8x4_UB( res0, res1, p_dst, i_dst_stride * 2 );
}
Ejemplo n.º 2
0
static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
                                     const uint8_t *src_left,
                                     uint8_t *dst, int32_t dst_stride) {
  uint64_t val;
  uint8_t top_left = src_top_ptr[-1];
  uint32_t loop_cnt;
  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
  v8u16 src_top_left, vec0, vec1, vec2, vec3;
  v16u8 src0, src1, src2, src3;

  val = LD(src_top_ptr);
  src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 2; loop_cnt--;) {
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
               src_left3, src_top, src0, src1, src2, src3);
    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
Ejemplo n.º 3
0
static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
{
    uint8_t *temp_dst = dst;
    uint64_t dst0, dst1, dst2, dst3;
    v2i64 dst_vec0 = { 0 };
    v2i64 dst_vec1 = { 0 };
    v8i16 dst_r0, dst_l0, dst_r1, dst_l1;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16u8 zeros = { 0 };

    LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
    temp_dst += (4 * stride);

    INSERT_D2_SD(dst0, dst1, dst_vec0);
    INSERT_D2_SD(dst2, dst3, dst_vec1);
    ILVRL_B2_SH(zeros, dst_vec0, dst_r0, dst_l0);
    ILVRL_B2_SH(zeros, dst_vec1, dst_r1, dst_l1);
    ADD4(dst_r0, in0, dst_l0, in1, dst_r1, in2, dst_l1, in3,
         dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
    ST8x4_UB(dst_r0, dst_r1, dst, stride);
    dst += (4 * stride);

    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
    INSERT_D2_SD(dst0, dst1, dst_vec0);
    INSERT_D2_SD(dst2, dst3, dst_vec1);
    UNPCK_UB_SH(dst_vec0, dst_r0, dst_l0);
    UNPCK_UB_SH(dst_vec1, dst_r1, dst_l1);
    ADD4(dst_r0, in4, dst_l0, in5, dst_r1, in6, dst_l1, in7,
         dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
    ST8x4_UB(dst_r0, dst_r1, dst, stride);
}
Ejemplo n.º 4
0
static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src,
                                  int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
    v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
    v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
    v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
    v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 zeros = { 0 };

    p_src[ 0 ] += 32;

    LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );

    vec0 = src0 + src4;
    vec1 = src0 - src4;
    vec2 = src2 >> 1;
    vec2 = vec2 - src6;
    vec3 = src6 >> 1;
    vec3 = src2 + vec3;

    BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 );

    vec0 = src7 >> 1;
    vec0 = src5 - vec0 - src3 - src7;
    vec1 = src3 >> 1;
    vec1 = src1 - vec1 + src7 - src3;
    vec2 = src5 >> 1;
    vec2 = vec2 - src1 + src7 + src5;
    vec3 = src1 >> 1;
    vec3 = vec3 + src3 + src5 + src1;
    tmp4 = vec3 >> 2;
    tmp4 += vec0;
    tmp5 = vec2 >> 2;
    tmp5 += vec1;
    tmp6 = vec1 >> 2;
    tmp6 -= vec2;
    tmp7 = vec0 >> 2;
    tmp7 = vec3 - tmp7;

    BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                 res0, res1, res2, res3, res4, res5, res6, res7 );
    TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7,
                        res0, res1, res2, res3, res4, res5, res6, res7 );
    UNPCK_SH_SW( res0, tmp0_r, tmp0_l );
    UNPCK_SH_SW( res1, tmp1_r, tmp1_l );
    UNPCK_SH_SW( res2, tmp2_r, tmp2_l );
    UNPCK_SH_SW( res3, tmp3_r, tmp3_l );
    UNPCK_SH_SW( res4, tmp4_r, tmp4_l );
    UNPCK_SH_SW( res5, tmp5_r, tmp5_l );
    UNPCK_SH_SW( res6, tmp6_r, tmp6_l );
    UNPCK_SH_SW( res7, tmp7_r, tmp7_l );
    BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r,
                 vec0_r, vec0_l, vec1_l, vec1_r );

    vec2_r = tmp2_r >> 1;
    vec2_l = tmp2_l >> 1;
    vec2_r -= tmp6_r;
    vec2_l -= tmp6_l;
    vec3_r = tmp6_r >> 1;
    vec3_l = tmp6_l >> 1;
    vec3_r += tmp2_r;
    vec3_l += tmp2_l;

    BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r,
                 tmp0_r, tmp2_r, tmp4_r, tmp6_r );
    BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l,
                 tmp0_l, tmp2_l, tmp4_l, tmp6_l );

    vec0_r = tmp7_r >> 1;
    vec0_l = tmp7_l >> 1;
    vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
    vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
    vec1_r = tmp3_r >> 1;
    vec1_l = tmp3_l >> 1;
    vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
    vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
    vec2_r = tmp5_r >> 1;
    vec2_l = tmp5_l >> 1;
    vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
    vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
    vec3_r = tmp1_r >> 1;
    vec3_l = tmp1_l >> 1;
    vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
    vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
    tmp1_r = vec3_r >> 2;
    tmp1_l = vec3_l >> 2;
    tmp1_r += vec0_r;
    tmp1_l += vec0_l;
    tmp3_r = vec2_r >> 2;
    tmp3_l = vec2_l >> 2;
    tmp3_r += vec1_r;
    tmp3_l += vec1_l;
    tmp5_r = vec1_r >> 2;
    tmp5_l = vec1_l >> 2;
    tmp5_r -= vec2_r;
    tmp5_l -= vec2_l;
    tmp7_r = vec0_r >> 2;
    tmp7_l = vec0_l >> 2;
    tmp7_r = vec3_r - tmp7_r;
    tmp7_l = vec3_l - tmp7_l;

    BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r,
                 res0_r, res0_l, res7_l, res7_r );
    BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r,
                 res1_r, res1_l, res6_l, res6_r );
    BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r,
                 res2_r, res2_l, res5_l, res5_r );
    BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r,
                 res3_r, res3_l, res4_l, res4_r );
    SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 );
    SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 );
    SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 );
    SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 );
    PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
                 res0, res1, res2, res3 );
    PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
                 res4, res5, res6, res7 );
    LD_SB8( p_dst, i_dst_stride,
            dst0, dst1, dst2, dst3,
            dst4, dst5, dst6, dst7 );
    ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
                tmp0, tmp1, tmp2, tmp3 );
    ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
                tmp4, tmp5, tmp6, tmp7 );
    ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
          res0, res1, res2, res3 );
    ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
          res4, res5, res6, res7 );
    CLIP_SH4_0_255( res0, res1, res2, res3 );
    CLIP_SH4_0_255( res4, res5, res6, res7 );
    PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
                 dst0, dst1, dst2, dst3 );
    ST8x4_UB( dst0, dst1, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    ST8x4_UB( dst2, dst3, p_dst, i_dst_stride );
}