コード例 #1
0
ファイル: enc_msa.c プロジェクト: garrettmoon/libwebp
static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
                                        const uint8_t* top) {
  if (left != NULL) {
    if (top != NULL) {
      int j;
      v8i16 d1, d2;
      const v16i8 zero = { 0 };
      const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
      const v16u8 T = LD_UB(top);
      ILVRL_B2_SH(zero, T, d1, d2);
      SUB2(d1, TL, d2, TL, d1, d2);
      for (j = 0; j < 16; j += 4) {
        v16i8 t0, t1, t2, t3;
        v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
        const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]);
        const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]);
        const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]);
        const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]);
        ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
        ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
        CLIP_SH4_0_255(r0, r1, r2, r3);
        CLIP_SH4_0_255(r4, r5, r6, r7);
        PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
        ST_SB4(t0, t1, t2, t3, dst, BPS);
        dst += 4 * BPS;
      }
    } else {
      HorizontalPred16x16(dst, left);
    }
  } else {
    if (top != NULL) {
      VerticalPred16x16(dst, top);
    } else {
      const v16u8 out = (v16u8)__msa_fill_b(0x81);
      STORE16x16(out, dst);
    }
  }
}
コード例 #2
0
ファイル: hevc_idct_msa.c プロジェクト: DeHackEd/FFmpeg
static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
{
    uint8_t *temp_dst = dst;
    uint64_t dst0, dst1, dst2, dst3;
    v2i64 dst_vec0 = { 0 };
    v2i64 dst_vec1 = { 0 };
    v8i16 dst_r0, dst_l0, dst_r1, dst_l1;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16u8 zeros = { 0 };

    LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
    temp_dst += (4 * stride);

    INSERT_D2_SD(dst0, dst1, dst_vec0);
    INSERT_D2_SD(dst2, dst3, dst_vec1);
    ILVRL_B2_SH(zeros, dst_vec0, dst_r0, dst_l0);
    ILVRL_B2_SH(zeros, dst_vec1, dst_r1, dst_l1);
    ADD4(dst_r0, in0, dst_l0, in1, dst_r1, in2, dst_l1, in3,
         dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
    ST8x4_UB(dst_r0, dst_r1, dst, stride);
    dst += (4 * stride);

    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
    INSERT_D2_SD(dst0, dst1, dst_vec0);
    INSERT_D2_SD(dst2, dst3, dst_vec1);
    UNPCK_UB_SH(dst_vec0, dst_r0, dst_l0);
    UNPCK_UB_SH(dst_vec1, dst_r1, dst_l1);
    ADD4(dst_r0, in4, dst_l0, in5, dst_r1, in6, dst_l1, in7,
         dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
    ST8x4_UB(dst_r0, dst_r1, dst, stride);
}
コード例 #3
0
ファイル: enc_msa.c プロジェクト: garrettmoon/libwebp
static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
  const v16i8 zero = { 0 };
  const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);
  const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);
  const v8i16 L1 = (v8i16)__msa_fill_h(top[-3]);
  const v8i16 L2 = (v8i16)__msa_fill_h(top[-4]);
  const v8i16 L3 = (v8i16)__msa_fill_h(top[-5]);
  const v16u8 T1 = LD_UB(top);
  const v8i16 T  = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
  const v8i16 d = T - TL;
  v8i16 r0, r1, r2, r3;
  ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
  CLIP_SH4_0_255(r0, r1, r2, r3);
  PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
}
コード例 #4
0
ファイル: dct-c.c プロジェクト: 0x0B501E7E/x264
static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src,
                                  int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
    v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
    v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
    v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
    v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 zeros = { 0 };

    p_src[ 0 ] += 32;

    LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );

    vec0 = src0 + src4;
    vec1 = src0 - src4;
    vec2 = src2 >> 1;
    vec2 = vec2 - src6;
    vec3 = src6 >> 1;
    vec3 = src2 + vec3;

    BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 );

    vec0 = src7 >> 1;
    vec0 = src5 - vec0 - src3 - src7;
    vec1 = src3 >> 1;
    vec1 = src1 - vec1 + src7 - src3;
    vec2 = src5 >> 1;
    vec2 = vec2 - src1 + src7 + src5;
    vec3 = src1 >> 1;
    vec3 = vec3 + src3 + src5 + src1;
    tmp4 = vec3 >> 2;
    tmp4 += vec0;
    tmp5 = vec2 >> 2;
    tmp5 += vec1;
    tmp6 = vec1 >> 2;
    tmp6 -= vec2;
    tmp7 = vec0 >> 2;
    tmp7 = vec3 - tmp7;

    BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                 res0, res1, res2, res3, res4, res5, res6, res7 );
    TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7,
                        res0, res1, res2, res3, res4, res5, res6, res7 );
    UNPCK_SH_SW( res0, tmp0_r, tmp0_l );
    UNPCK_SH_SW( res1, tmp1_r, tmp1_l );
    UNPCK_SH_SW( res2, tmp2_r, tmp2_l );
    UNPCK_SH_SW( res3, tmp3_r, tmp3_l );
    UNPCK_SH_SW( res4, tmp4_r, tmp4_l );
    UNPCK_SH_SW( res5, tmp5_r, tmp5_l );
    UNPCK_SH_SW( res6, tmp6_r, tmp6_l );
    UNPCK_SH_SW( res7, tmp7_r, tmp7_l );
    BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r,
                 vec0_r, vec0_l, vec1_l, vec1_r );

    vec2_r = tmp2_r >> 1;
    vec2_l = tmp2_l >> 1;
    vec2_r -= tmp6_r;
    vec2_l -= tmp6_l;
    vec3_r = tmp6_r >> 1;
    vec3_l = tmp6_l >> 1;
    vec3_r += tmp2_r;
    vec3_l += tmp2_l;

    BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r,
                 tmp0_r, tmp2_r, tmp4_r, tmp6_r );
    BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l,
                 tmp0_l, tmp2_l, tmp4_l, tmp6_l );

    vec0_r = tmp7_r >> 1;
    vec0_l = tmp7_l >> 1;
    vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
    vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
    vec1_r = tmp3_r >> 1;
    vec1_l = tmp3_l >> 1;
    vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
    vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
    vec2_r = tmp5_r >> 1;
    vec2_l = tmp5_l >> 1;
    vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
    vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
    vec3_r = tmp1_r >> 1;
    vec3_l = tmp1_l >> 1;
    vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
    vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
    tmp1_r = vec3_r >> 2;
    tmp1_l = vec3_l >> 2;
    tmp1_r += vec0_r;
    tmp1_l += vec0_l;
    tmp3_r = vec2_r >> 2;
    tmp3_l = vec2_l >> 2;
    tmp3_r += vec1_r;
    tmp3_l += vec1_l;
    tmp5_r = vec1_r >> 2;
    tmp5_l = vec1_l >> 2;
    tmp5_r -= vec2_r;
    tmp5_l -= vec2_l;
    tmp7_r = vec0_r >> 2;
    tmp7_l = vec0_l >> 2;
    tmp7_r = vec3_r - tmp7_r;
    tmp7_l = vec3_l - tmp7_l;

    BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r,
                 res0_r, res0_l, res7_l, res7_r );
    BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r,
                 res1_r, res1_l, res6_l, res6_r );
    BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r,
                 res2_r, res2_l, res5_l, res5_r );
    BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r,
                 res3_r, res3_l, res4_l, res4_r );
    SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 );
    SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 );
    SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 );
    SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 );
    PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
                 res0, res1, res2, res3 );
    PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
                 res4, res5, res6, res7 );
    LD_SB8( p_dst, i_dst_stride,
            dst0, dst1, dst2, dst3,
            dst4, dst5, dst6, dst7 );
    ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
                tmp0, tmp1, tmp2, tmp3 );
    ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
                tmp4, tmp5, tmp6, tmp7 );
    ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
          res0, res1, res2, res3 );
    ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
          res4, res5, res6, res7 );
    CLIP_SH4_0_255( res0, res1, res2, res3 );
    CLIP_SH4_0_255( res4, res5, res6, res7 );
    PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
                 dst0, dst1, dst2, dst3 );
    ST8x4_UB( dst0, dst1, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    ST8x4_UB( dst2, dst3, p_dst, i_dst_stride );
}
コード例 #5
0
ファイル: hevc_idct_msa.c プロジェクト: DeHackEd/FFmpeg
static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
{
    uint8_t loop_cnt;
    uint8_t *temp_dst = dst;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    /* Pre-load for next iteration */
    LD_UB2(temp_dst, 16, dst4, dst5);
    temp_dst += stride;
    LD_UB2(temp_dst, 16, dst6, dst7);
    temp_dst += stride;
    LD_SH4(coeffs, 16, in0, in2, in4, in6);
    LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
    coeffs += 64;

    for (loop_cnt = 14; loop_cnt--;) {
        UNPCK_UB_SH(dst4, dst_r0, dst_l0);
        UNPCK_UB_SH(dst5, dst_r1, dst_l1);
        UNPCK_UB_SH(dst6, dst_r2, dst_l2);
        UNPCK_UB_SH(dst7, dst_r3, dst_l3);

        dst_r0 += in0;
        dst_l0 += in1;
        dst_r1 += in2;
        dst_l1 += in3;
        dst_r2 += in4;
        dst_l2 += in5;
        dst_r3 += in6;
        dst_l3 += in7;

        /* Pre-load for next iteration */
        LD_UB2(temp_dst, 16, dst4, dst5);
        temp_dst += stride;
        LD_UB2(temp_dst, 16, dst6, dst7);
        temp_dst += stride;
        LD_SH4(coeffs, 16, in0, in2, in4, in6);
        LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
        coeffs += 64;

        CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
        CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
        PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                    dst_r3, dst0, dst1, dst2, dst3);
        ST_UB2(dst0, dst1, dst, 16);
        dst += stride;
        ST_UB2(dst2, dst3, dst, 16);
        dst += stride;
    }

    UNPCK_UB_SH(dst4, dst_r0, dst_l0);
    UNPCK_UB_SH(dst5, dst_r1, dst_l1);
    UNPCK_UB_SH(dst6, dst_r2, dst_l2);
    UNPCK_UB_SH(dst7, dst_r3, dst_l3);

    dst_r0 += in0;
    dst_l0 += in1;
    dst_r1 += in2;
    dst_l1 += in3;
    dst_r2 += in4;
    dst_l2 += in5;
    dst_r3 += in6;
    dst_l3 += in7;

    /* Pre-load for next iteration */
    LD_UB2(temp_dst, 16, dst4, dst5);
    temp_dst += stride;
    LD_UB2(temp_dst, 16, dst6, dst7);
    temp_dst += stride;
    LD_SH4(coeffs, 16, in0, in2, in4, in6);
    LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);

    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
    PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                dst_r3, dst0, dst1, dst2, dst3);
    ST_UB2(dst0, dst1, dst, 16);
    dst += stride;
    ST_UB2(dst2, dst3, dst, 16);
    dst += stride;

    UNPCK_UB_SH(dst4, dst_r0, dst_l0);
    UNPCK_UB_SH(dst5, dst_r1, dst_l1);
    UNPCK_UB_SH(dst6, dst_r2, dst_l2);
    UNPCK_UB_SH(dst7, dst_r3, dst_l3);

    dst_r0 += in0;
    dst_l0 += in1;
    dst_r1 += in2;
    dst_l1 += in3;
    dst_r2 += in4;
    dst_l2 += in5;
    dst_r3 += in6;
    dst_l3 += in7;

    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
    PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                dst_r3, dst0, dst1, dst2, dst3);
    ST_UB2(dst0, dst1, dst, 16);
    dst += stride;
    ST_UB2(dst2, dst3, dst, 16);
}