/* Example 1 */
/* VP9 4-tap horizontal loop filter over an 8-pixel-wide edge (MIPS MSA).
 * src points at the first row below the edge; pitch is the row stride in
 * bytes.  b_limit/limit/thresh are scalar filter parameters broadcast to
 * every vector lane.  count is unused (kept for signature compatibility).
 */
void vp9_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr,
                              int32_t count) {
  uint64_t p1_d, p0_d, q0_d, q1_d;
  v16u8 mask, hev, flat, thresh, b_limit, limit;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;

  (void)count;

  /* load vector elements: 4 rows above the edge (p3..p0), 4 below (q0..q3) */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* broadcast the scalar filter limits to all 16 byte lanes */
  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* compute filter-enable and high-edge-variance masks, then apply the
     4-tap filter (macros defined elsewhere in this module's headers) */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  /* store the four filtered rows: low 8 bytes of each output vector */
  p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
  p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
  q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
  q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
  SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
}
/* Example 2 */
static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
                                     const uint8_t *src_left, uint8_t *dst,
                                     int32_t dst_stride) {
  uint64_t val0, val1;
  v16i8 store;
  v16u8 src = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  val0 = LD(src_top);
  val1 = LD(src_left);
  INSERT_D2_UB(val0, val1, src);
  sum_h = __msa_hadd_u_h(src, src);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_d((v2i64)store, 0);

  SD4(val0, val0, val0, val0, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(val0, val0, val0, val0, dst, dst_stride);
}
/* Example 3 */
static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
  uint64_t out;
  const v16i8 store = __msa_ldi_b(128);

  out = __msa_copy_u_d((v2i64)store, 0);

  SD4(out, out, out, out, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(out, out, out, out, dst, dst_stride);
}
/* Example 4 */
/* Fill an 8-pixel-wide column with a constant byte value: broadcast
 * `val` across a vector, extract the low 8 bytes as a 64-bit word, and
 * (in the loop below) store it down the rows, four rows per iteration.
 * NOTE(review): this definition is TRUNCATED in the visible source —
 * the `for` body is missing and the next function definition begins
 * immediately after the loop header.  Recover the full body from the
 * original file before building.
 */
static void copy_8bit_value_width8_msa(uint8_t *src, uint8_t val,
                                       int32_t src_stride, int32_t height)
{
    int32_t cnt;
    uint64_t dst0;
    v16u8 val0;

    /* broadcast val to all 16 byte lanes; keep only the low 64 bits */
    val0 = (v16u8) __msa_fill_b(val);
    dst0 = __msa_copy_u_d((v2i64) val0, 0);

    /* four rows per iteration (body missing — see note above) */
    for (cnt = (height >> 2); cnt--;) {
/* VP8 4-tap horizontal loop filter applied to the U and V chroma planes
 * simultaneously (MIPS MSA).  Eight U pixels go in the low half and
 * eight V pixels in the high half of each 16-byte vector, so one filter
 * pass covers both planes.  b_limit_in/limit_in/thresh_in are scalar
 * filter parameters broadcast to all lanes.
 */
static void loop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                               int32_t pitch,
                                               const uint8_t b_limit_in,
                                               const uint8_t limit_in,
                                               const uint8_t thresh_in) {
  uint64_t p1_d, p0_d, q0_d, q1_d;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;
  v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
  v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

  thresh = (v16u8)__msa_fill_b(thresh_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  b_limit = (v16u8)__msa_fill_b(b_limit_in);

  /* load 4 rows above and 4 rows below the edge for each plane; the
     pointers end up 1 row past the edge (−4 rows + 5 rows), which the
     negative-pitch stores below rely on */
  src_u = src_u - (pitch << 2);
  LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
  src_u += (5 * pitch);
  src_v = src_v - (pitch << 2);
  LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
  src_v += (5 * pitch);

  /* right 8 elements of p3 are u pixels and
     left 8 elements of p3 are v pixels */
  ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
  ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

  /* store the U plane: low 64 bits of each vector, written upward from
     q1's row via the negative pitch (q1, q0, p0, p1) */
  p1_d = __msa_copy_u_d((v2i64)p1, 0);
  p0_d = __msa_copy_u_d((v2i64)p0, 0);
  q0_d = __msa_copy_u_d((v2i64)q0, 0);
  q1_d = __msa_copy_u_d((v2i64)q1, 0);
  SD4(q1_d, q0_d, p0_d, p1_d, src_u, (-pitch));

  /* store the V plane: high 64 bits of each vector */
  p1_d = __msa_copy_u_d((v2i64)p1, 1);
  p0_d = __msa_copy_u_d((v2i64)p0, 1);
  q0_d = __msa_copy_u_d((v2i64)q0, 1);
  q1_d = __msa_copy_u_d((v2i64)q1, 1);
  SD4(q1_d, q0_d, p0_d, p1_d, src_v, (-pitch));
}
/* Example 6 */
static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint64_t val0;
  v16i8 store;
  v16u8 data = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  val0 = LD(src);
  data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
  sum_h = __msa_hadd_u_h(data, data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_d((v2i64)store, 0);

  SD4(val0, val0, val0, val0, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(val0, val0, val0, val0, dst, dst_stride);
}
/* Example 7 */
/* VP9 8-tap horizontal loop filter over an 8-pixel-wide edge (MIPS MSA).
 * Applies the 4-tap filter everywhere, and additionally the stronger
 * 8-tap "filter8" on pixels whose flat mask is set.  src points at the
 * first row below the edge; pitch is the row stride in bytes.
 */
void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr) {
  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
  v16u8 mask, hev, flat, thresh, b_limit, limit;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8;
  v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
  v16i8 zero = { 0 };

  /* load vector elements: 4 rows above the edge (p3..p0), 4 below (q0..q3) */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* broadcast the scalar filter limits to all 16 byte lanes */
  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  /* keep only the low 64 bits of the flat mask (this path filters 8 pixels) */
  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    /* no flat pixels: store the 4-tap results only */
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
  } else {
    /* widen the 8 pixels of each row to 16 bits for the 8-tap filter */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
               q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
                zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
                q0_filter8);
    PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

    /* per-lane select: 8-tap result where flat is set, 4-tap (or the
       original p2/q2) elsewhere */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

    /* store the six filtered rows p2..q2 */
    p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

    src -= 3 * pitch;

    SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
    src += (4 * pitch);
    SD(q1_d, src);
    src += pitch;
    SD(q2_d, src);
  }
}
/* Example 8 */
/* Copy an 8-pixel-wide column of `height` rows from src to dst using
 * 64-bit moves extracted from MSA vector loads.  Height is consumed in
 * chunks of 12, 8, or 4 rows depending on divisibility.
 * NOTE(review): this definition is TRUNCATED in the visible source —
 * it ends inside the `height % 4` branch and the next function begins
 * immediately after.  Recover the full body from the original file
 * before building.
 */
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride, int32_t height) {
  int32_t cnt;
  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

  if (0 == height % 12) {
    /* 12 rows per iteration: 8 rows, then 4 more */
    for (cnt = (height / 12); cnt--;) {
      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
      src += (8 * src_stride);

      /* low 64 bits of each vector hold the 8 pixels of one row */
      out0 = __msa_copy_u_d((v2i64)src0, 0);
      out1 = __msa_copy_u_d((v2i64)src1, 0);
      out2 = __msa_copy_u_d((v2i64)src2, 0);
      out3 = __msa_copy_u_d((v2i64)src3, 0);
      out4 = __msa_copy_u_d((v2i64)src4, 0);
      out5 = __msa_copy_u_d((v2i64)src5, 0);
      out6 = __msa_copy_u_d((v2i64)src6, 0);
      out7 = __msa_copy_u_d((v2i64)src7, 0);

      SD4(out0, out1, out2, out3, dst, dst_stride);
      dst += (4 * dst_stride);
      SD4(out4, out5, out6, out7, dst, dst_stride);
      dst += (4 * dst_stride);

      LD_UB4(src, src_stride, src0, src1, src2, src3);
      src += (4 * src_stride);

      out0 = __msa_copy_u_d((v2i64)src0, 0);
      out1 = __msa_copy_u_d((v2i64)src1, 0);
      out2 = __msa_copy_u_d((v2i64)src2, 0);
      out3 = __msa_copy_u_d((v2i64)src3, 0);
      SD4(out0, out1, out2, out3, dst, dst_stride);
      dst += (4 * dst_stride);
    }
  } else if (0 == height % 8) {
    /* 8 rows per iteration */
    for (cnt = height >> 3; cnt--;) {
      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
      src += (8 * src_stride);

      out0 = __msa_copy_u_d((v2i64)src0, 0);
      out1 = __msa_copy_u_d((v2i64)src1, 0);
      out2 = __msa_copy_u_d((v2i64)src2, 0);
      out3 = __msa_copy_u_d((v2i64)src3, 0);
      out4 = __msa_copy_u_d((v2i64)src4, 0);
      out5 = __msa_copy_u_d((v2i64)src5, 0);
      out6 = __msa_copy_u_d((v2i64)src6, 0);
      out7 = __msa_copy_u_d((v2i64)src7, 0);

      SD4(out0, out1, out2, out3, dst, dst_stride);
      dst += (4 * dst_stride);
      SD4(out4, out5, out6, out7, dst, dst_stride);
      dst += (4 * dst_stride);
    }
  } else if (0 == height % 4) {
    /* 4 rows per iteration (body missing — see note above) */
/* VP8 macroblock horizontal loop filter (VP8_MBFILTER) applied to the
 * U and V chroma planes simultaneously (MIPS MSA).  Eight U pixels are
 * packed into the low half and eight V pixels into the high half of
 * each vector, so a single filter pass covers both planes.  Writes the
 * six filtered rows p2..q2 back to each plane.
 */
static void mbloop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                                 int32_t pitch,
                                                 const uint8_t b_limit_in,
                                                 const uint8_t limit_in,
                                                 const uint8_t thresh_in) {
  uint8_t *temp_src;
  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;
  v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
  v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

  /* broadcast the scalar filter limits to all 16 byte lanes */
  b_limit = (v16u8)__msa_fill_b(b_limit_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  thresh = (v16u8)__msa_fill_b(thresh_in);

  /* load 4 rows above and 4 rows below the edge for each plane */
  temp_src = src_u - (pitch << 2);
  LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
  temp_src = src_v - (pitch << 2);
  LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);

  /* interleave: low 8 bytes = U pixels, high 8 bytes = V pixels */
  ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
  ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

  /* store the U plane: low 64 bits (element 0) of each filtered vector */
  p2_d = __msa_copy_u_d((v2i64)p2, 0);
  p1_d = __msa_copy_u_d((v2i64)p1, 0);
  p0_d = __msa_copy_u_d((v2i64)p0, 0);
  q0_d = __msa_copy_u_d((v2i64)q0, 0);
  q1_d = __msa_copy_u_d((v2i64)q1, 0);
  q2_d = __msa_copy_u_d((v2i64)q2, 0);
  src_u -= (pitch * 3);
  SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
  src_u += 4 * pitch;
  SD(q1_d, src_u);
  src_u += pitch;
  SD(q2_d, src_u);

  /* store the V plane: high 64 bits (element 1) of each filtered vector */
  p2_d = __msa_copy_u_d((v2i64)p2, 1);
  p1_d = __msa_copy_u_d((v2i64)p1, 1);
  p0_d = __msa_copy_u_d((v2i64)p0, 1);
  q0_d = __msa_copy_u_d((v2i64)q0, 1);
  q1_d = __msa_copy_u_d((v2i64)q1, 1);
  q2_d = __msa_copy_u_d((v2i64)q2, 1);
  src_v -= (pitch * 3);
  SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
  src_v += 4 * pitch;
  SD(q1_d, src_v);
  src_v += pitch;
  SD(q2_d, src_v);
}
/* Example 10 */
/* FFmpeg entry point: VP8 macroblock vertical-direction (horizontal edge)
 * loop filter for an 8-pixel-wide U/V chroma pair (MIPS MSA).  U pixels
 * occupy the low half and V pixels the high half of each vector, so one
 * VP8_MBFILTER pass covers both planes; the six filtered rows p2..q2
 * are written back to each plane.
 */
void ff_vp8_v_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v,
                                 ptrdiff_t pitch, int b_limit_in, int limit_in,
                                 int thresh_in)
{
    uint8_t *temp_src;
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

    /* broadcast the scalar filter limits to all 16 byte lanes */
    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);

    /* load 4 rows above and 4 rows below the edge for each plane */
    temp_src = src_u - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
    temp_src = src_v - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);

    /* right 8 elements of p3 are u pixels; left 8 elements are v pixels */
    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    /* store the U plane: low 64 bits (element 0) of each filtered vector */
    p2_d = __msa_copy_u_d((v2i64) p2, 0);
    p1_d = __msa_copy_u_d((v2i64) p1, 0);
    p0_d = __msa_copy_u_d((v2i64) p0, 0);
    q0_d = __msa_copy_u_d((v2i64) q0, 0);
    q1_d = __msa_copy_u_d((v2i64) q1, 0);
    q2_d = __msa_copy_u_d((v2i64) q2, 0);
    src_u -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
    src_u += 4 * pitch;
    SD(q1_d, src_u);
    src_u += pitch;
    SD(q2_d, src_u);

    /* store the V plane: high 64 bits (element 1) of each filtered vector */
    p2_d = __msa_copy_u_d((v2i64) p2, 1);
    p1_d = __msa_copy_u_d((v2i64) p1, 1);
    p0_d = __msa_copy_u_d((v2i64) p0, 1);
    q0_d = __msa_copy_u_d((v2i64) q0, 1);
    q1_d = __msa_copy_u_d((v2i64) q1, 1);
    q2_d = __msa_copy_u_d((v2i64) q2, 1);
    src_v -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
    src_v += 4 * pitch;
    SD(q1_d, src_v);
    src_v += pitch;
    SD(q2_d, src_v);
}