Example #1
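/* Gradient filter (libwebp, MSA): each output byte is the input minus the
 * predictor clip(left + top - top_left, 0, 255). The vector loop handles
 * 16 pixels at a time; the scalar tail covers any remaining width. */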
static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
                                            const uint8_t* ppred,
                                            uint8_t* poutput, int stride,
                                            int size) {
  int w;
  const v16i8 zero = { 0 };
  while (size >= 16) {
    v16u8 pred0, dst0;
    v8i16 a0, a1, b0, b1, c0, c1;
    const v16u8 tmp0 = LD_UB(ppred - 1);
    const v16u8 tmp1 = LD_UB(ppred - stride);
    const v16u8 tmp2 = LD_UB(ppred - stride - 1);
    const v16u8 src0 = LD_UB(pinput);
    ILVRL_B2_SH(zero, tmp0, a0, a1);
    ILVRL_B2_SH(zero, tmp1, b0, b1);
    ILVRL_B2_SH(zero, tmp2, c0, c1);
    ADD2(a0, b0, a1, b1, a0, a1);
    SUB2(a0, c0, a1, c1, a0, a1);
    CLIP_SH2_0_255(a0, a1);
    pred0 = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0);
    dst0 = src0 - pred0;
    ST_UB(dst0, poutput);
    ppred += 16;
    pinput += 16;
    poutput += 16;
    size -= 16;
  }
  for (w = 0; w < size; ++w) {
    const int pred = ppred[w - 1] + ppred[w - stride] - ppred[w - stride - 1];
    poutput[w] = pinput[w] - (pred < 0 ? 0 : pred > 255 ? 255 : pred);
  }
}
Example #2
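/* TTransform (libwebp, MSA): 4x4 Walsh-Hadamard-style transform of a block
 * read with stride BPS, followed by a dot product of the absolute
 * coefficients against the weight table w. */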
static int TTransform(const uint8_t* in, const uint16_t* w) {
  int sum;
  uint32_t in0_m, in1_m, in2_m, in3_m;
  v16i8 src0;
  v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
  v4i32 dst0, dst1;
  const v16i8 zero = { 0 };
  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
  const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };

  LW4(in, BPS, in0_m, in1_m, in2_m, in3_m);
  INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0);
  ILVRL_B2_SH(zero, src0, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
  ADDSUB2(in0, in1, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
  ADDSUB2(tmp2, tmp3, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
  ADDSUB2(in0, in1, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
  ADDSUB2(tmp2, tmp3, tmp0, tmp1);
  tmp0 = __msa_add_a_h(tmp0, (v8i16)zero);
  tmp1 = __msa_add_a_h(tmp1, (v8i16)zero);
  LD_SH2(w, 8, tmp2, tmp3);
  DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1);
  dst0 = dst0 + dst1;
  sum = HADD_SW_S32(dst0);
  return sum;
}
Example #3
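/* VP8 simple loop filter, vertical edge (libvpx, MSA): loads 16 rows around
 * the column edge, transposes them into p1/p0/q0/q1 vectors, applies the
 * simple filter, then re-interleaves and stores the two modified columns. */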
void vp8_loop_filter_simple_vertical_edge_msa(uint8_t *src, int32_t pitch,
                                              const uint8_t *b_limit_ptr) {
  uint8_t *temp_src;
  v16u8 p1, p0, q1, q0;
  v16u8 mask, b_limit;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16u8 row9, row10, row11, row12, row13, row14, row15;
  v8i16 tmp0, tmp1;

  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  temp_src = src - 2;
  LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  temp_src += (8 * pitch);
  LD_UB8(temp_src, pitch, row8, row9, row10, row11, row12, row13, row14, row15);
  TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p1, p0,
                      q0, q1);
  VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
  VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
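  /* interleave the filtered p0/q0 columns back into pixel order for the
     2-byte-wide transposed stores below */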
  ILVRL_B2_SH(q0, p0, tmp1, tmp0);

  src -= 1;
  ST2x4_UB(tmp1, 0, src, pitch);
  src += 4 * pitch;
  ST2x4_UB(tmp1, 4, src, pitch);
  src += 4 * pitch;
  ST2x4_UB(tmp0, 0, src, pitch);
  src += 4 * pitch;
  ST2x4_UB(tmp0, 4, src, pitch);
  src += 4 * pitch;
}
Example #4
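/* HEVC add-residual 8x8 (FFmpeg, MSA): widens four dst rows at a time to
 * 16 bits, adds the decoded coefficients, clamps to [0, 255] and packs the
 * result back to bytes. */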
static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
{
    uint8_t *temp_dst = dst;
    uint64_t dst0, dst1, dst2, dst3;
    v2i64 dst_vec0 = { 0 };
    v2i64 dst_vec1 = { 0 };
    v8i16 dst_r0, dst_l0, dst_r1, dst_l1;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16u8 zeros = { 0 };

    LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
    temp_dst += (4 * stride);

    INSERT_D2_SD(dst0, dst1, dst_vec0);
    INSERT_D2_SD(dst2, dst3, dst_vec1);
    ILVRL_B2_SH(zeros, dst_vec0, dst_r0, dst_l0);
    ILVRL_B2_SH(zeros, dst_vec1, dst_r1, dst_l1);
    ADD4(dst_r0, in0, dst_l0, in1, dst_r1, in2, dst_l1, in3,
         dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
    ST8x4_UB(dst_r0, dst_r1, dst, stride);
    dst += (4 * stride);

    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
    INSERT_D2_SD(dst0, dst1, dst_vec0);
    INSERT_D2_SD(dst2, dst3, dst_vec1);
    UNPCK_UB_SH(dst_vec0, dst_r0, dst_l0);
    UNPCK_UB_SH(dst_vec1, dst_r1, dst_l1);
    ADD4(dst_r0, in4, dst_l0, in5, dst_r1, in6, dst_l1, in7,
         dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
    ST8x4_UB(dst_r0, dst_r1, dst, stride);
}
Example #5
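/* HEVC add-residual 4x4 (FFmpeg, MSA): same add-clamp-pack scheme as the
 * 8x8 variant above, but a single 16-pixel batch covers all four rows. */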
static void hevc_addblk_4x4_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
{
    uint32_t dst0, dst1, dst2, dst3;
    v8i16 dst_r0, dst_l0, in0, in1;
    v4i32 dst_vec = { 0 };
    v16u8 zeros = { 0 };

    LD_SH2(coeffs, 8, in0, in1);
    LW4(dst, stride, dst0, dst1, dst2, dst3);
    INSERT_W4_SW(dst0, dst1, dst2, dst3, dst_vec);
    ILVRL_B2_SH(zeros, dst_vec, dst_r0, dst_l0);
    ADD2(dst_r0, in0, dst_l0, in1, dst_r0, dst_l0);
    CLIP_SH2_0_255(dst_r0, dst_l0);
    dst_vec = (v4i32) __msa_pckev_b((v16i8) dst_l0, (v16i8) dst_r0);
    ST4x4_UB(dst_vec, dst_vec, 0, 1, 2, 3, dst, stride);
}
Example #6
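/* TrueMotion 16x16 predictor (libwebp, MSA): dst = clip(left[j] + top[i]
 * - top_left). The (top - top_left) term is hoisted out of the row loop;
 * missing neighbors fall back to horizontal/vertical prediction or a
 * constant 0x81 fill. */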
static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
                                        const uint8_t* top) {
  if (left != NULL) {
    if (top != NULL) {
      int j;
      v8i16 d1, d2;
      const v16i8 zero = { 0 };
      const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
      const v16u8 T = LD_UB(top);
      ILVRL_B2_SH(zero, T, d1, d2);
      SUB2(d1, TL, d2, TL, d1, d2);
      for (j = 0; j < 16; j += 4) {
        v16i8 t0, t1, t2, t3;
        v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
        const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]);
        const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]);
        const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]);
        const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]);
        ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
        ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
        CLIP_SH4_0_255(r0, r1, r2, r3);
        CLIP_SH4_0_255(r4, r5, r6, r7);
        PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
        ST_SB4(t0, t1, t2, t3, dst, BPS);
        dst += 4 * BPS;
      }
    } else {
      HorizontalPred16x16(dst, left);
    }
  } else {
    if (top != NULL) {
      VerticalPred16x16(dst, top);
    } else {
      const v16u8 out = (v16u8)__msa_fill_b(0x81);
      STORE16x16(out, dst);
    }
  }
}
Example #7
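/* VP9 8-pixel loop filter, vertical edge, dual (libvpx, MSA): filters two
 * adjacent 8-row edges in one pass. Rows are transposed into pixel vectors,
 * and the flat mask selects per pixel between the filter4 and filter8
 * results before the transposed store. */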
void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0,
                                 const uint8_t *limit0,
                                 const uint8_t *thresh0,
                                 const uint8_t *b_limit1,
                                 const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *temp_src;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

  temp_src = src - 4;

  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
  temp_src += (8 * pitch);
  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

  /* transpose 16x8 matrix into 8x16 */
  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
                      q3, q2, q1, q0, row12, row13, row14, row15,
                      p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh0);
  vec0 = (v8i16)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  vec0 = (v8i16)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  vec0 = (v8i16)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

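  /* flat is all zero: no pixel takes the filter8 path, so only the four
     filter4 outputs need to be stored */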
  if (__msa_test_bz_v(flat)) {
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src -= 2;
    ST4x8_UB(vec2, vec3, src, pitch);
    src += 8 * pitch;
    ST4x8_UB(vec4, vec5, src, pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);

    /* filter8 */
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    src -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 4, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 4, src + 4, pitch);
  }
}
Example #8
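/* VP8 temporal filter, 16-wide blocks (libvpx, MSA): for each pixel the
 * modifier is 16 - min(16, round((3 * diff^2) >> strength)) scaled by the
 * filter weight; it is added to cnt, and modifier * pixel is accumulated
 * into acc. */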
static void temporal_filter_apply_16size_msa(uint8_t *frame1_ptr,
                                             uint32_t stride,
                                             uint8_t *frame2_ptr,
                                             int32_t strength_in,
                                             int32_t filter_wt_in,
                                             uint32_t *acc, uint16_t *cnt)
{
    uint32_t row;
    v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b;
    v16u8 frame_l, frame_h;
    v16i8 zero = { 0 };
    v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
    v8i16 diff0, diff1, cnt0, cnt1;
    v4i32 const3, const16, filter_wt, strength;
    v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
    v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
    v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
    v4i32 acc0, acc1, acc2, acc3;

    filter_wt = __msa_fill_w(filter_wt_in);
    strength = __msa_fill_w(strength_in);
    const3 = __msa_ldi_w(3);
    const16 = __msa_ldi_w(16);

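    /* each iteration handles two 16-pixel rows: diff = frame1 - frame2 is
       squared, scaled and clamped into the modifier, then cnt and acc are
       updated for both rows */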
    for (row = 8; row--;)
    {
        frame1_0_b = LD_SB(frame1_ptr);
        frame2_0_b = LD_SB(frame2_ptr);
        frame1_ptr += stride;
        frame2_ptr += 16;
        frame1_1_b = LD_SB(frame1_ptr);
        frame2_1_b = LD_SB(frame2_ptr);
        LD_SW2(acc, 4, acc0, acc1);
        LD_SW2(acc + 8, 4, acc2, acc3);
        LD_SH2(cnt, 8, cnt0, cnt1);
        ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h);
        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
        diff0_r = (mod0_w < const16);
        diff0_l = (mod1_w < const16);
        diff1_r = (mod2_w < const16);
        diff1_l = (mod3_w < const16);
        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
             mod0_w, mod1_w, mod2_w, mod3_w);
        mod0_w = diff0_r & mod0_w;
        mod1_w = diff0_l & mod1_w;
        mod2_w = diff1_r & mod2_w;
        mod3_w = diff1_l & mod3_w;
        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
        ST_SH2(mod0_h, mod1_h, cnt, 8);
        cnt += 16;
        ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h);
        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        ST_SW2(mod0_w, mod1_w, acc, 4);
        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
        acc += 16;
        LD_SW2(acc, 4, acc0, acc1);
        LD_SW2(acc + 8, 4, acc2, acc3);
        LD_SH2(cnt, 8, cnt0, cnt1);
        ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h);
        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
        diff0_r = (mod0_w < const16);
        diff0_l = (mod1_w < const16);
        diff1_r = (mod2_w < const16);
        diff1_l = (mod3_w < const16);
        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
             mod0_w, mod1_w, mod2_w, mod3_w);
        mod0_w = diff0_r & mod0_w;
        mod1_w = diff0_l & mod1_w;
        mod2_w = diff1_r & mod2_w;
        mod3_w = diff1_l & mod3_w;
        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
        ST_SH2(mod0_h, mod1_h, cnt, 8);
        cnt += 16;

        UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h);
        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        ST_SW2(mod0_w, mod1_w, acc, 4);
        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
        acc += 16;
        frame1_ptr += stride;
        frame2_ptr += 16;
    }
}
Example #9
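/* VP8 macroblock loop filter, vertical edge, chroma (libvpx, MSA): the U
 * and V edges are loaded as one 16-row batch, transposed and filtered
 * together, then written back 6 bytes per row via VP8_ST6x1_UB. */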
static void mbloop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                               int32_t pitch,
                                               const uint8_t b_limit_in,
                                               const uint8_t limit_in,
                                               const uint8_t thresh_in) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16u8 row9, row10, row11, row12, row13, row14, row15;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  b_limit = (v16u8)__msa_fill_b(b_limit_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  thresh = (v16u8)__msa_fill_b(thresh_in);

  LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  LD_UB8(src_v - 4, pitch, row8, row9, row10, row11, row12, row13, row14,
         row15);
  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
                      p1, p0, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

  ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
  ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
  ILVRL_B2_SH(q2, q1, tmp2, tmp5);

  src_u -= 3;
  VP8_ST6x1_UB(tmp3, 0, tmp2, 0, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp3, 1, tmp2, 1, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp3, 2, tmp2, 2, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp3, 3, tmp2, 3, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp4, 0, tmp2, 4, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp4, 1, tmp2, 5, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp4, 2, tmp2, 6, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp4, 3, tmp2, 7, src_u, 4);

  src_v -= 3;
  VP8_ST6x1_UB(tmp6, 0, tmp5, 0, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp6, 1, tmp5, 1, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp6, 2, tmp5, 2, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp6, 3, tmp5, 3, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp7, 0, tmp5, 4, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp7, 1, tmp5, 5, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp7, 2, tmp5, 6, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp7, 3, tmp5, 7, src_v, 4);
}
Example #10
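/* VP8 macroblock loop filter, horizontal pass over a vertical 16-row edge
 * (FFmpeg, MSA): same transpose-filter-store scheme as the chroma version
 * above, writing 6 bytes per row across the edge. */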
void ff_vp8_h_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
                                int limit_in, int thresh_in)
{
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);
    temp_src = src - 4;
    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
    ILVRL_B2_SH(q2, q1, tmp2, tmp5);

    temp_src = src - 3;
    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4);
}