コード例 #1
0
ファイル: vp8_lpf_msa.c プロジェクト: 0day-ci/FFmpeg
/**
 * VP8 inner-edge loop filter across a vertical edge, 16 rows per call.
 *
 * Sixteen rows are loaded starting 4 bytes to the left of src and
 * transposed so that each vector holds one pixel column (p3 .. q3).
 * After filtering, the columns p1, p0, q0, q1 are re-interleaved and
 * written back starting 2 bytes to the left of src.
 *
 * @param src    pointer into the frame; the loaded columns straddle it
 *               (4 bytes before, 4 bytes from src onward)
 * @param pitch  byte distance between consecutive rows
 * @param e      scalar broadcast as the b_limit input of LPF_MASK_HEV
 * @param i      scalar broadcast as the limit input of LPF_MASK_HEV
 * @param h      scalar broadcast as the thresh input of LPF_MASK_HEV
 */
void ff_vp8_h_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
                                      int32_t e, int32_t i, int32_t h)
{
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 b_limit, limit, thresh;
    v16u8 hev, mask, flat;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    uint8_t *out_ptr = src - 2;

    /* Broadcast the scalar filter parameters to every byte lane. */
    b_limit = (v16u8) __msa_fill_b(e);
    limit = (v16u8) __msa_fill_b(i);
    thresh = (v16u8) __msa_fill_b(h);

    /* Rows 0..7, then rows 8..15, each starting 4 bytes left of src. */
    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src - 4 + (8 * pitch), pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);

    /* Turn rows into columns: one vector per pixel position p3 .. q3. */
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    /* Filters p1, p0, q0, q1 in place. */
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

    /* Re-interleave the four columns back into row order:
     * right (low) halves feed tmp2/tmp3, left (high) halves tmp4/tmp5. */
    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);

    /* First eight rows, then the following eight. */
    ST4x8_UB(tmp2, tmp3, out_ptr, pitch);
    out_ptr += (8 * pitch);
    ST4x8_UB(tmp4, tmp5, out_ptr, pitch);
}
コード例 #2
0
/* TM-style intra prediction for an 8x8 block.
 *
 * src_top_ptr : 8 pixels above the block; src_top_ptr[-1] is read as the
 *               top-left pixel.
 * src_left    : 8 pixels to the left of the block, one per output row.
 * dst         : output block, rows dst_stride bytes apart.
 *
 * Each output row interleaves its left pixel with the top row, sums the
 * byte pairs, subtracts the top-left value and saturates before packing
 * back to bytes (judging by the helper macro names; their definitions are
 * outside this block). */
static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
                                     const uint8_t *src_left,
                                     uint8_t *dst, int32_t dst_stride) {
  const uint8_t top_left = src_top_ptr[-1];
  const uint64_t top_row = LD(src_top_ptr);
  uint32_t pass;
  v16i8 src_top = { 0 };
  v16i8 left0, left1, left2, left3;
  v16i8 tmp0, tmp1;
  v16u8 src0, src1, src2, src3;
  v8u16 src_top_left, vec0, vec1, vec2, vec3;

  /* Place the 8 top pixels in the low doubleword of an otherwise zero
   * vector and broadcast the top-left pixel to halfword lanes. */
  src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, top_row);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  /* Two passes of four rows each. */
  for (pass = 0; pass < 2; ++pass) {
    left0 = __msa_fill_b(src_left[0]);
    left1 = __msa_fill_b(src_left[1]);
    left2 = __msa_fill_b(src_left[2]);
    left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    ILVR_B4_UB(left0, src_top, left1, src_top, left2, src_top, left3, src_top,
               src0, src1, src2, src3);
    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
コード例 #3
0
ファイル: intrapred_msa.c プロジェクト: jfiguinha/Regards
/* Horizontal intra prediction for a 32x32 block: every output row is
 * filled with a single byte taken from src (src[r] for row r).
 *
 * src        : 32 source bytes, one per output row.
 * dst        : output block; each row receives two 16-byte vector stores.
 * dst_stride : byte distance between output rows. */
static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  uint32_t row;
  v16u8 fill0, fill1, fill2, fill3;

  /* Four rows per iteration, 32 rows in total. */
  for (row = 0; row < 32; row += 4) {
    /* All four source bytes are read before anything is stored. */
    fill0 = (v16u8)__msa_fill_b(src[0]);
    fill1 = (v16u8)__msa_fill_b(src[1]);
    fill2 = (v16u8)__msa_fill_b(src[2]);
    fill3 = (v16u8)__msa_fill_b(src[3]);
    src += 4;

    /* Same vector twice, at offsets 0 and 16, covers the 32-byte row. */
    ST_UB2(fill0, fill0, dst, 16);
    dst += dst_stride;
    ST_UB2(fill1, fill1, dst, 16);
    dst += dst_stride;
    ST_UB2(fill2, fill2, dst, 16);
    dst += dst_stride;
    ST_UB2(fill3, fill3, dst, 16);
    dst += dst_stride;
  }
}
コード例 #4
0
/*
 * Horizontal intra prediction for a 16x16 block.
 *
 * For each of the 16 rows, one byte is read from src (advancing by
 * src_stride per row) and replicated across a 16-byte output row.
 *
 * src        - first source byte; later rows are src_stride bytes apart
 * src_stride - byte step between successive source bytes
 * dst        - output block
 * dst_stride - byte distance between output rows
 */
static void intra_predict_horiz_16x16_msa(uint8_t *src, int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    uint32_t group;
    v16u8 fill0, fill1, fill2, fill3;

    /* Four groups of four rows. */
    for (group = 0; group < 4; ++group)
    {
        /* Read the four source bytes first, then store the four rows. */
        fill0 = (v16u8)__msa_fill_b(src[0]);
        src += src_stride;
        fill1 = (v16u8)__msa_fill_b(src[0]);
        src += src_stride;
        fill2 = (v16u8)__msa_fill_b(src[0]);
        src += src_stride;
        fill3 = (v16u8)__msa_fill_b(src[0]);
        src += src_stride;

        ST_UB4(fill0, fill1, fill2, fill3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
コード例 #5
0
/* 4-tap loop filter across a horizontal edge, 8 pixels wide.
 *
 * src          : first row below the edge; rows src-4*pitch .. src+3*pitch
 *                are loaded as p3 .. q3.
 * pitch        : byte distance between rows.
 * b_limit_ptr, limit_ptr, thresh_ptr :
 *                each points at a single byte that is broadcast to all
 *                lanes and passed to LPF_MASK_HEV.
 * count        : unused.
 *
 * Only the low 8 bytes of each filtered row (p1, p0, q0, q1) are written
 * back, to rows src-2*pitch .. src+pitch. */
void vp9_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr,
                              int32_t count) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 b_limit, limit, thresh;
  v16u8 hev, mask, flat;
  uint64_t p1_d, p0_d, q0_d, q1_d;

  (void)count;

  /* Broadcast the three scalar parameters. */
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);
  thresh = (v16u8)__msa_fill_b(*thresh_ptr);

  /* Four rows above and four rows from src downward. */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  /* Extract doubleword 0 (the low 8 bytes) of each result row. */
  p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
  p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
  q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
  q1_d = __msa_copy_u_d((v2i64)q1_out, 0);

  SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
}
コード例 #6
0
/* 4-tap loop filter across a vertical edge, 8 rows tall.
 *
 * src          : pointer into the frame; 8 rows are loaded starting
 *                4 bytes to its left.
 * pitch        : byte distance between rows.
 * b_limit_ptr, limit_ptr, thresh_ptr :
 *                each points at a single byte that is broadcast to all
 *                lanes and passed to LPF_MASK_HEV.
 * count        : unused.
 *
 * The loaded rows are transposed in place into columns p3 .. q3,
 * filtered, re-interleaved, and the 4 filtered bytes per row are stored
 * starting 2 bytes to the left of src. */
void vp9_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
                            const uint8_t *b_limit_ptr,
                            const uint8_t *limit_ptr,
                            const uint8_t *thresh_ptr,
                            int32_t count) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 b_limit, limit, thresh;
  v16u8 hev, mask, flat;
  v8i16 vec0, vec1, vec2, vec3;
  uint8_t *out_ptr = src - 2;

  (void)count;

  /* Broadcast the three scalar parameters. */
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);
  thresh = (v16u8)__msa_fill_b(*thresh_ptr);

  /* At this point the vectors hold rows, not columns. */
  LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* In-place transpose: afterwards each vector is one pixel column. */
  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
                     p3, p2, p1, p0, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

  /* Interleave p1/p0 and q0/q1 back into per-row 4-byte groups. */
  ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
  ILVRL_H2_SH(vec1, vec0, vec2, vec3);

  /* Rows 0..3 from vec2, rows 4..7 from vec3. */
  ST4x4_UB(vec2, vec2, 0, 1, 2, 3, out_ptr, pitch);
  out_ptr += 4 * pitch;
  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, out_ptr, pitch);
}
コード例 #7
0
/* TM-style intra prediction for a 4x4 block.
 *
 * src_top_ptr : 4 pixels above the block; src_top_ptr[-1] is read as the
 *               top-left pixel.
 * src_left    : 4 pixels to the left of the block, one per output row.
 * dst         : output block, rows dst_stride bytes apart.
 *
 * Each row interleaves its left pixel with the top row, sums the byte
 * pairs, subtracts the top-left value and saturates before packing back
 * to bytes (judging by the helper macro names; their definitions are
 * outside this block). */
static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
                                     const uint8_t *src_left,
                                     uint8_t *dst, int32_t dst_stride) {
  const uint8_t top_left = src_top_ptr[-1];
  const uint32_t top_row = LW(src_top_ptr);
  v16i8 src_top = { 0 };
  v16i8 left0, left1, left2, left3;
  v16i8 tmp0, tmp1;
  v16u8 src0, src1, src2, src3;
  v8u16 src_top_left, vec0, vec1, vec2, vec3;

  /* Top pixels go into word 0 of an otherwise zero vector. */
  src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, top_row);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  /* One broadcast vector per row's left pixel. */
  left0 = __msa_fill_b(src_left[0]);
  left1 = __msa_fill_b(src_left[1]);
  left2 = __msa_fill_b(src_left[2]);
  left3 = __msa_fill_b(src_left[3]);

  ILVR_B4_UB(left0, src_top, left1, src_top, left2, src_top, left3, src_top,
             src0, src1, src2, src3);
  HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
  SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);

  /* Words 0 and 2 of tmp0, then words 0 and 2 of tmp1, form rows 0..3. */
  ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
}
コード例 #8
0
/* TM-style intra prediction for a 32x32 block.
 *
 * src_top  : 32 pixels above the block (loaded as two 16-byte vectors);
 *            src_top[-1] is read as the top-left pixel.
 * src_left : 32 pixels to the left of the block, one per output row.
 * dst      : output block, rows dst_stride bytes apart.
 *
 * The loop runs 8 times and each iteration emits 4 rows with the same
 * five-step sequence, differing only in which left-pixel vector is used.
 * NOTE(review): the arithmetic (left + top - top_left, saturated) is
 * inferred from the helper macro names; confirm against their definitions. */
static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t dst_stride) {
  uint8_t top_left = src_top[-1];
  uint32_t loop_cnt;
  v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
  v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;

  /* Top row: bytes 0..15 in src_top0, bytes 16..31 in src_top1. */
  LD_SB2(src_top, 16, src_top0, src_top1);
  /* Top-left pixel broadcast to halfword lanes. */
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 8; loop_cnt--;) {
    /* The four left pixels for this group are read before any row is
     * stored. */
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    /* Row 0 of the group: interleave left pixel with both top halves,
     * sum byte pairs, subtract top-left, saturate, pack and store the
     * two 16-byte halves of the row. */
    ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    /* Row 1 of the group (same sequence with src_left1). */
    ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    /* Row 2 of the group (same sequence with src_left2). */
    ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    /* Row 3 of the group (same sequence with src_left3). */
    ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;
  }
}
コード例 #9
0
/* Macroblock-edge loop filter across a horizontal edge for both chroma
 * planes at once.
 *
 * src_u, src_v : first row below the edge in the U and V planes.
 * pitch        : byte distance between rows (shared by both planes).
 * b_limit_in, limit_in, thresh_in :
 *                scalars broadcast to all lanes for LPF_MASK_HEV.
 *
 * The low 8 bytes of each U row and each V row are packed into one
 * 16-byte vector (U in doubleword 0, V in doubleword 1) so a single
 * filter pass handles both planes. Six rows per plane (p2 .. q2) are
 * written back, starting 3 rows above the edge. */
static void mbloop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                                 int32_t pitch,
                                                 const uint8_t b_limit_in,
                                                 const uint8_t limit_in,
                                                 const uint8_t thresh_in) {
  v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
  v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 b_limit, limit, thresh;
  v16u8 hev, mask, flat;
  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
  uint8_t *row_ptr;

  /* Eight rows from each plane, beginning four rows above the edge. */
  row_ptr = src_u - (pitch << 2);
  LD_UB8(row_ptr, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
  row_ptr = src_v - (pitch << 2);
  LD_UB8(row_ptr, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);

  thresh = (v16u8)__msa_fill_b(thresh_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  b_limit = (v16u8)__msa_fill_b(b_limit_in);

  /* Combine planes: U occupies doubleword 0, V doubleword 1. */
  ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
  ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* Filters p2 .. q2 in place. */
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

  /* U plane: doubleword 0 of every filtered row. */
  p2_d = __msa_copy_u_d((v2i64)p2, 0);
  p1_d = __msa_copy_u_d((v2i64)p1, 0);
  p0_d = __msa_copy_u_d((v2i64)p0, 0);
  q0_d = __msa_copy_u_d((v2i64)q0, 0);
  q1_d = __msa_copy_u_d((v2i64)q1, 0);
  q2_d = __msa_copy_u_d((v2i64)q2, 0);
  row_ptr = src_u - (pitch * 3);
  SD4(p2_d, p1_d, p0_d, q0_d, row_ptr, pitch);
  row_ptr += 4 * pitch;
  SD(q1_d, row_ptr);
  row_ptr += pitch;
  SD(q2_d, row_ptr);

  /* V plane: doubleword 1 of every filtered row. */
  p2_d = __msa_copy_u_d((v2i64)p2, 1);
  p1_d = __msa_copy_u_d((v2i64)p1, 1);
  p0_d = __msa_copy_u_d((v2i64)p0, 1);
  q0_d = __msa_copy_u_d((v2i64)q0, 1);
  q1_d = __msa_copy_u_d((v2i64)q1, 1);
  q2_d = __msa_copy_u_d((v2i64)q2, 1);
  row_ptr = src_v - (pitch * 3);
  SD4(p2_d, p1_d, p0_d, q0_d, row_ptr, pitch);
  row_ptr += 4 * pitch;
  SD(q1_d, row_ptr);
  row_ptr += pitch;
  SD(q2_d, row_ptr);
}
コード例 #10
0
ファイル: vp8_lpf_msa.c プロジェクト: 0day-ci/FFmpeg
/**
 * VP8 macroblock-edge loop filter across a horizontal edge, applied to
 * the U and V planes together (8 pixels wide each).
 *
 * @param src_u      first row below the edge in the U plane
 * @param src_v      first row below the edge in the V plane
 * @param pitch      byte distance between rows; row offsets are computed
 *                   by multiplication so a negative pitch stays
 *                   well-defined
 * @param b_limit_in scalar broadcast as the b_limit input of LPF_MASK_HEV
 * @param limit_in   scalar broadcast as the limit input of LPF_MASK_HEV
 * @param thresh_in  scalar broadcast as the thresh input of LPF_MASK_HEV
 *
 * Six rows per plane (p2 .. q2) are written back, starting three rows
 * above the edge.
 */
void ff_vp8_v_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v,
                                 ptrdiff_t pitch, int b_limit_in, int limit_in,
                                 int thresh_in)
{
    uint8_t *temp_src;
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);

    /* Load eight rows per plane, starting four rows above the edge.
     * "4 * pitch" is used instead of "pitch << 2": left-shifting a
     * negative signed value is undefined behaviour in C. */
    temp_src = src_u - 4 * pitch;
    LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
    temp_src = src_v - 4 * pitch;
    LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);

    /* Pack both planes into one vector per row: the right (low) 8
     * elements hold the U pixels, the left (high) 8 elements the V pixels. */
    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    /* Filters p2 .. q2 in place. */
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    /* U plane: doubleword 0 of each filtered row. */
    p2_d = __msa_copy_u_d((v2i64) p2, 0);
    p1_d = __msa_copy_u_d((v2i64) p1, 0);
    p0_d = __msa_copy_u_d((v2i64) p0, 0);
    q0_d = __msa_copy_u_d((v2i64) q0, 0);
    q1_d = __msa_copy_u_d((v2i64) q1, 0);
    q2_d = __msa_copy_u_d((v2i64) q2, 0);
    src_u -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
    src_u += 4 * pitch;
    SD(q1_d, src_u);
    src_u += pitch;
    SD(q2_d, src_u);

    /* V plane: doubleword 1 of each filtered row. */
    p2_d = __msa_copy_u_d((v2i64) p2, 1);
    p1_d = __msa_copy_u_d((v2i64) p1, 1);
    p0_d = __msa_copy_u_d((v2i64) p0, 1);
    q0_d = __msa_copy_u_d((v2i64) q0, 1);
    q1_d = __msa_copy_u_d((v2i64) q1, 1);
    q2_d = __msa_copy_u_d((v2i64) q2, 1);
    src_v -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
    src_v += 4 * pitch;
    SD(q1_d, src_v);
    src_v += pitch;
    SD(q2_d, src_v);
}
コード例 #11
0
/* TM-style intra prediction for a 16x16 block.
 *
 * src_top_ptr : 16 pixels above the block; src_top_ptr[-1] is read as the
 *               top-left pixel.
 * src_left    : 16 pixels to the left of the block, one per output row.
 * dst         : output block, rows dst_stride bytes apart.
 *
 * Rows are produced in groups of four. Each row interleaves its left
 * pixel with the top row, sums the byte pairs, subtracts the top-left
 * value, saturates and packs back to bytes (judging by the helper macro
 * names; their definitions are outside this block). */
static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t dst_stride) {
  const uint8_t top_left = src_top_ptr[-1];
  uint32_t group, k;
  v16i8 src_top, left_vec[4];
  v8u16 src_top_left, res_r, res_l;

  src_top = LD_SB(src_top_ptr);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (group = 0; group < 4; ++group) {
    /* Fetch the group's four left pixels up front, before any row of the
     * group is stored. */
    for (k = 0; k < 4; ++k) {
      left_vec[k] = __msa_fill_b(src_left[k]);
    }
    src_left += 4;

    /* One output row per left pixel. */
    for (k = 0; k < 4; ++k) {
      ILVRL_B2_UH(left_vec[k], src_top, res_r, res_l);
      HADD_UB2_UH(res_r, res_l, res_r, res_l);
      IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
      SAT_UH2_UH(res_r, res_l, 7);
      PCKEV_ST_SB(res_r, res_l, dst);
      dst += dst_stride;
    }
  }
}
コード例 #12
0
/* VP8 "simple" loop filter across a vertical edge, 16 rows tall.
 *
 * src         : pointer into the frame; 16 rows are loaded starting
 *               2 bytes to its left.
 * pitch       : byte distance between rows.
 * b_limit_ptr : points at the single limit byte, broadcast to all lanes.
 *
 * The rows are transposed into the four columns p1, p0, q0, q1; after
 * filtering, p0 and q0 are interleaved and written back as 2 bytes per
 * row starting 1 byte to the left of src. */
void vp8_loop_filter_simple_vertical_edge_msa(uint8_t *src, int32_t pitch,
                                              const uint8_t *b_limit_ptr) {
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16u8 row9, row10, row11, row12, row13, row14, row15;
  v16u8 p1, p0, q1, q0;
  v16u8 mask, b_limit;
  v8i16 tmp0, tmp1;
  uint8_t *load_ptr = src - 2;
  uint8_t *out_ptr = src - 1;

  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);

  /* Rows 0..7, then rows 8..15. */
  LD_UB8(load_ptr, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  load_ptr += (8 * pitch);
  LD_UB8(load_ptr, pitch, row8, row9, row10, row11, row12, row13, row14, row15);

  TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p1, p0,
                      q0, q1);
  VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
  VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);

  /* tmp1 receives the right-half interleave, tmp0 the left-half one. */
  ILVRL_B2_SH(q0, p0, tmp1, tmp0);

  /* Four stores of four rows each cover all 16 rows. */
  ST2x4_UB(tmp1, 0, out_ptr, pitch);
  out_ptr += 4 * pitch;
  ST2x4_UB(tmp1, 4, out_ptr, pitch);
  out_ptr += 4 * pitch;
  ST2x4_UB(tmp0, 0, out_ptr, pitch);
  out_ptr += 4 * pitch;
  ST2x4_UB(tmp0, 4, out_ptr, pitch);
}
コード例 #13
0
static void intra_predict_dc_8x8_msa(uint8_t *src_top, uint8_t *src_left,
                                     int32_t src_stride_left,
                                     uint8_t *dst, int32_t dst_stride,
                                     uint8_t is_above, uint8_t is_left)
{
    uint32_t row, addition = 0;
    uint64_t out;
    v16u8 src_above, store;
    v8u16 sum_above;
    v4u32 sum_top;
    v2u64 sum;

    if (is_left && is_above)
    {
        src_above = LD_UB(src_top);

        sum_above = __msa_hadd_u_h(src_above, src_above);
        sum_top = __msa_hadd_u_w(sum_above, sum_above);
        sum = __msa_hadd_u_d(sum_top, sum_top);
        addition = __msa_copy_u_w((v4i32)sum, 0);

        for (row = 0; row < 8; ++row)
        {
            addition += src_left[row * src_stride_left];
        }

        addition = (addition + 8) >> 4;
        store = (v16u8)__msa_fill_b(addition);
    }
コード例 #14
0
/* Clamp a plane of pixels and add noise to it, two rows at a time.
 *
 * start_ptr  : top-left pixel of the plane; modified in place.
 * noise      : noise table; each row starts at a random offset of
 *              0..255 bytes (rand() & 0xff) into it.
 * blackclamp : only element [0] is read (lower clamp).
 * whiteclamp : only element [0] is read (used for the upper clamp).
 * bothclamp  : not referenced in this function.
 * width      : pixels per row; processed in chunks of 16, so the trailing
 *              width % 16 pixels are left untouched.
 * height     : number of rows; processed in pairs, so the last row is
 *              left untouched when height is odd.
 * pitch      : byte distance between rows.
 *
 * NOTE(review): the noise buffer must be readable for at least
 * 255 + 16 * (width / 16) bytes given the offsets used here — confirm
 * against the caller. */
void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
                             char blackclamp[16], char whiteclamp[16],
                             char bothclamp[16], uint32_t width,
                             uint32_t height, int32_t pitch) {
  uint32_t i, j;

  for (i = 0; i < height / 2; ++i) {
    /* Rows 2*i and 2*i+1, each with its own random start in the noise
     * table. */
    uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
    int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff));
    uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
    int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff));
    for (j = width / 16; j--;) {
      v16i8 temp00_s, temp01_s;
      v16u8 temp00, temp01, black_clamp, white_clamp;
      v16u8 pos0, ref0, pos1, ref1;
      v16i8 const127 = __msa_ldi_b(127);

      /* 16 pixels and 16 noise bytes for each of the two rows. */
      pos0 = LD_UB(pos0_ptr);
      ref0 = LD_UB(ref0_ptr);
      pos1 = LD_UB(pos1_ptr);
      ref1 = LD_UB(ref1_ptr);
      black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
      white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
      /* Lower clamp: lanes below black_clamp are replaced by it. */
      temp00 = (pos0 < black_clamp);
      pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
      temp01 = (pos1 < black_clamp);
      pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
      /* NOTE(review): XORI_B2_128_UB presumably flips the top bit of each
       * byte so the upper clamp can be done with signed arithmetic; it is
       * applied again after the clamp. Confirm against the macro. */
      XORI_B2_128_UB(pos0, pos1);
      /* Upper clamp: limit is the saturating signed sum white_clamp + 127;
       * lanes selected by the comparison are replaced by that limit. */
      temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
      temp00 = (v16u8)(temp00_s < pos0);
      pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
      temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
      temp01 = (temp01_s < pos1);
      pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
      XORI_B2_128_UB(pos0, pos1);
      /* Add the noise (plain, non-saturating vector add) and store. */
      pos0 += ref0;
      ST_UB(pos0, pos0_ptr);
      pos1 += ref1;
      ST_UB(pos1, pos1_ptr);
      pos0_ptr += 16;
      pos1_ptr += 16;
      ref0_ptr += 16;
      ref1_ptr += 16;
    }
  }
}
コード例 #15
0
ファイル: enc_msa.c プロジェクト: garrettmoon/libwebp
// Vertical prediction for a 16x16 block: every row of 'dst' is a copy of
// the 16 bytes at 'top'. When 'top' is NULL the block is filled with the
// constant 0x7f instead.
static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
  const v16u8 out = (top != NULL) ? LD_UB(top) : (v16u8)__msa_fill_b(0x7f);
  STORE16x16(out, dst);
}
コード例 #16
0
ファイル: enc_msa.c プロジェクト: garrettmoon/libwebp
static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
                                            const uint8_t* left) {
  if (left != NULL) {
    int j;
    for (j = 0; j < 16; j += 4) {
      const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
      const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
      const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
      const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
      ST_UB4(L0, L1, L2, L3, dst, BPS);
      dst += 4 * BPS;
      left += 4;
    }
  } else {
    const v16u8 out = (v16u8)__msa_fill_b(0x81);
    STORE16x16(out, dst);
  }
}
コード例 #17
0
ファイル: vp8_lpf_msa.c プロジェクト: 0day-ci/FFmpeg
/**
 * VP8 inner-edge loop filter across a horizontal edge, 16 pixels wide.
 *
 * Rows src-4*pitch .. src+3*pitch are loaded as p3 .. q3; the filtered
 * rows p1, p0, q0, q1 are stored back to src-2*pitch .. src+pitch.
 *
 * @param src    first row below the edge
 * @param pitch  byte distance between rows
 * @param e      scalar broadcast as the b_limit input of LPF_MASK_HEV
 * @param i      scalar broadcast as the limit input of LPF_MASK_HEV
 * @param h      scalar broadcast as the thresh input of LPF_MASK_HEV
 */
void ff_vp8_v_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
                                      int32_t e, int32_t i, int32_t h)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 b_limit, limit, thresh;
    v16u8 hev, mask, flat;

    /* Broadcast the scalar filter parameters. */
    b_limit = (v16u8) __msa_fill_b(e);
    limit = (v16u8) __msa_fill_b(i);
    thresh = (v16u8) __msa_fill_b(h);

    /* Four rows above the edge and four rows from src downward. */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    /* Filters p1, p0, q0, q1 in place. */
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}
コード例 #18
0
/* Inner loop filter across a horizontal edge for both chroma planes.
 *
 * src_u, src_v : first row below the edge in the U and V planes.
 * pitch        : byte distance between rows (shared by both planes).
 * b_limit_in, limit_in, thresh_in :
 *                scalars broadcast to all lanes for LPF_MASK_HEV.
 *
 * The low 8 bytes of each U and V row are packed into one vector (U in
 * doubleword 0, V in doubleword 1) and filtered together. The four
 * result rows are stored with a negative stride, i.e. starting at the q1
 * row (one row below the edge row) and walking upwards to p1. */
static void loop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                               int32_t pitch,
                                               const uint8_t b_limit_in,
                                               const uint8_t limit_in,
                                               const uint8_t thresh_in) {
  v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
  v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 b_limit, limit, thresh;
  v16u8 hev, mask, flat;
  uint64_t p1_d, p0_d, q0_d, q1_d;
  uint8_t *u_rows = src_u - (pitch << 2);
  uint8_t *v_rows = src_v - (pitch << 2);

  b_limit = (v16u8)__msa_fill_b(b_limit_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  thresh = (v16u8)__msa_fill_b(thresh_in);

  /* Eight rows per plane, beginning four rows above the edge; afterwards
   * each pointer is moved to the q1 row (five rows further down). */
  LD_UB8(u_rows, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
  u_rows += (5 * pitch);
  LD_UB8(v_rows, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
  v_rows += (5 * pitch);

  /* Combine planes: U in the right (low) 8 elements, V in the left
   * (high) 8 elements of each vector. */
  ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
  ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* Filters p1, p0, q0, q1 in place. */
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

  /* U plane: doubleword 0, written bottom-up (q1, q0, p0, p1). */
  p1_d = __msa_copy_u_d((v2i64)p1, 0);
  p0_d = __msa_copy_u_d((v2i64)p0, 0);
  q0_d = __msa_copy_u_d((v2i64)q0, 0);
  q1_d = __msa_copy_u_d((v2i64)q1, 0);
  SD4(q1_d, q0_d, p0_d, p1_d, u_rows, (-pitch));

  /* V plane: doubleword 1, same order. */
  p1_d = __msa_copy_u_d((v2i64)p1, 1);
  p0_d = __msa_copy_u_d((v2i64)p0, 1);
  q0_d = __msa_copy_u_d((v2i64)q0, 1);
  q1_d = __msa_copy_u_d((v2i64)q1, 1);
  SD4(q1_d, q0_d, p0_d, p1_d, v_rows, (-pitch));
}
コード例 #19
0
/* Inner loop filter across a vertical edge for both chroma planes.
 *
 * src_u, src_v : pointers into the U and V planes; 8 rows are loaded
 *                from each, starting 4 bytes to the left.
 * pitch        : byte distance between rows (shared by both planes).
 * b_limit_in, limit_in, thresh_in :
 *                scalars broadcast to all lanes for LPF_MASK_HEV.
 *
 * The 8 U rows and 8 V rows are transposed together into eight column
 * vectors p3 .. q3, filtered in one pass, and the four filtered bytes per
 * row are written back starting 2 bytes to the left of each pointer. */
static void loop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                             int32_t pitch,
                                             const uint8_t b_limit_in,
                                             const uint8_t limit_in,
                                             const uint8_t thresh_in) {
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16u8 row9, row10, row11, row12, row13, row14, row15;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 b_limit, limit, thresh;
  v16u8 hev, mask, flat;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  uint8_t *out_ptr;

  /* U rows become rows 0..7, V rows become rows 8..15. */
  LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  LD_UB8(src_v - 4, pitch, row8, row9, row10, row11, row12, row13, row14,
         row15);
  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
                      p1, p0, q0, q1, q2, q3);

  b_limit = (v16u8)__msa_fill_b(b_limit_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  thresh = (v16u8)__msa_fill_b(thresh_in);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* Filters p1, p0, q0, q1 in place. */
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

  /* Right-half interleave -> tmp2/tmp3 (stored to the U plane below). */
  ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
  ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
  /* Left-half interleave -> tmp4/tmp5 (stored to the V plane below). */
  tmp0 = (v4i32)__msa_ilvl_b((v16i8)p0, (v16i8)p1);
  tmp1 = (v4i32)__msa_ilvl_b((v16i8)q1, (v16i8)q0);
  ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);

  /* U plane: rows 0..3, then rows 4..7. */
  out_ptr = src_u - 2;
  ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, out_ptr, pitch);
  out_ptr += 4 * pitch;
  ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, out_ptr, pitch);

  /* V plane: rows 0..3, then rows 4..7. */
  out_ptr = src_v - 2;
  ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, out_ptr, pitch);
  out_ptr += 4 * pitch;
  ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, out_ptr, pitch);
}
コード例 #20
0
ファイル: blockdsp_msa.c プロジェクト: DeHackEd/FFmpeg
static void copy_8bit_value_width8_msa(uint8_t *src, uint8_t val,
                                       int32_t src_stride, int32_t height)
{
    int32_t cnt;
    uint64_t dst0;
    v16u8 val0;

    val0 = (v16u8) __msa_fill_b(val);
    dst0 = __msa_copy_u_d((v2i64) val0, 0);

    for (cnt = (height >> 2); cnt--;) {
コード例 #21
0
/* VP8 "simple" loop filter across a horizontal edge, 16 pixels wide.
 *
 * src         : first row below the edge.
 * pitch       : byte distance between rows.
 * b_limit_ptr : points at the single limit byte, broadcast to all lanes.
 *
 * Rows src-2*pitch .. src+pitch are loaded as p1, p0, q0, q1; only the
 * two rows adjacent to the edge (p0 and q0) are written back. */
void vp8_loop_filter_simple_horizontal_edge_msa(uint8_t *src, int32_t pitch,
                                                const uint8_t *b_limit_ptr) {
  v16u8 p1, p0, q0, q1;
  v16u8 b_limit, mask;
  uint8_t *first_row = src - (pitch << 1);

  LD_UB4(first_row, pitch, p1, p0, q0, q1);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);

  VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
  VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);

  ST_UB2(p0, q0, (src - pitch), pitch);
}
コード例 #22
0
/* Macroblock-edge loop filter across a horizontal edge of the luma
 * plane, 16 pixels wide.
 *
 * src   : first row below the edge.
 * pitch : byte distance between rows.
 * b_limit_in, limit_in, thresh_in :
 *         scalars broadcast to all lanes for LPF_MASK_HEV.
 *
 * Rows src-4*pitch .. src+3*pitch are loaded as p3 .. q3; the six
 * filtered rows p2 .. q2 are stored back to src-3*pitch .. src+2*pitch. */
static void mbloop_filter_horizontal_edge_y_msa(uint8_t *src, int32_t pitch,
                                                const uint8_t b_limit_in,
                                                const uint8_t limit_in,
                                                const uint8_t thresh_in) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 b_limit, limit, thresh;
  v16u8 hev, mask, flat;
  uint8_t *row_ptr = src - (pitch << 2);

  LD_UB8(row_ptr, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(thresh_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  b_limit = (v16u8)__msa_fill_b(b_limit_in);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* Filters p2 .. q2 in place. */
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

  /* Four rows above/at the edge, then the remaining two below. */
  row_ptr = src - 3 * pitch;
  ST_UB4(p2, p1, p0, q0, row_ptr, pitch);
  row_ptr += (4 * pitch);
  ST_UB2(q1, q2, row_ptr, pitch);
}
コード例 #23
0
/* 4-tap loop filter across a horizontal edge, 16 pixels wide, with
 * separate parameters for the two 8-pixel halves.
 *
 * src   : first row below the edge.
 * pitch : byte distance between rows.
 * b_limit0_ptr, limit0_ptr, thresh0_ptr :
 *         parameters placed in the low doubleword of each parameter vector.
 * b_limit1_ptr, limit1_ptr, thresh1_ptr :
 *         parameters placed in the high doubleword.
 *
 * Rows src-4*pitch .. src+3*pitch are loaded as p3 .. q3; the filtered
 * rows p1, p0, q0, q1 are stored back to src-2*pitch .. src+pitch. */
void vp9_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit0_ptr,
                                   const uint8_t *limit0_ptr,
                                   const uint8_t *thresh0_ptr,
                                   const uint8_t *b_limit1_ptr,
                                   const uint8_t *limit1_ptr,
                                   const uint8_t *thresh1_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 b_limit0, b_limit1, limit0, limit1, thresh0, thresh1;
  v16u8 b_limit, limit, thresh;
  v16u8 hev, mask, flat;

  /* Broadcast each scalar, then merge the two halves' low doublewords:
   * set 0 ends up in doubleword 0, set 1 in doubleword 1. */
  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);

  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
  limit = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);

  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
  thresh = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);

  /* Four rows above the edge and four rows from src downward. */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}
コード例 #24
0
/* Inner loop filter across a horizontal edge, 16 pixels wide, with
 * separate parameters for the two 8-pixel halves.
 *
 * src   : first row below the edge.
 * pitch : byte distance between rows.
 * b_limit0_ptr, limit0_ptr, thresh0_ptr :
 *         parameters placed in the low doubleword of each parameter vector.
 * b_limit1_ptr, limit1_ptr, thresh1_ptr :
 *         parameters placed in the high doubleword.
 *
 * Rows src-4*pitch .. src+3*pitch are loaded as p3 .. q3; the filtered
 * rows p1, p0, q0, q1 are stored back to src-2*pitch .. src+pitch. */
static void loop_filter_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
                                              const uint8_t *b_limit0_ptr,
                                              const uint8_t *limit0_ptr,
                                              const uint8_t *thresh0_ptr,
                                              const uint8_t *b_limit1_ptr,
                                              const uint8_t *limit1_ptr,
                                              const uint8_t *thresh1_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 b_limit0, b_limit1, limit0, limit1, thresh0, thresh1;
  v16u8 b_limit, limit, thresh;
  v16u8 hev, mask, flat;

  /* Broadcast each scalar, then merge so that parameter set 0 sits in
   * doubleword 0 and parameter set 1 in doubleword 1. */
  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);

  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
  limit = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);

  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
  thresh = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);

  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* Filters p1, p0, q0, q1 in place. */
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}
コード例 #25
0
ファイル: vp8_lpf_msa.c プロジェクト: 0day-ci/FFmpeg
/**
 * VP8 macroblock-edge loop filter across a horizontal edge, 16 pixels
 * wide.
 *
 * Rows src-4*pitch .. src+3*pitch are loaded as p3 .. q3; the six
 * filtered rows p2 .. q2 are stored back to src-3*pitch .. src+2*pitch.
 *
 * @param src        first row below the edge
 * @param pitch      byte distance between rows; row offsets are computed
 *                   by multiplication so a negative pitch stays
 *                   well-defined
 * @param b_limit_in scalar broadcast as the b_limit input of LPF_MASK_HEV
 * @param limit_in   scalar broadcast as the limit input of LPF_MASK_HEV
 * @param thresh_in  scalar broadcast as the thresh input of LPF_MASK_HEV
 */
void ff_vp8_v_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
                                int limit_in, int thresh_in)
{
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);

    /* Load vector elements. "4 * pitch" replaces "pitch << 2": a left
     * shift of a negative signed value is undefined behaviour in C. */
    temp_src = src - 4 * pitch;
    LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    /* Filters p2 .. q2 in place. */
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    /* Store vector elements: four rows, then the remaining two. */
    temp_src = src - 3 * pitch;
    ST_UB4(p2, p1, p0, q0, temp_src, pitch);
    temp_src += (4 * pitch);
    ST_UB2(q1, q2, temp_src, pitch);
}
コード例 #26
0
ファイル: vp8_lpf_msa.c プロジェクト: 0day-ci/FFmpeg
/**
 * VP8 "simple" loop filter across a horizontal edge, 16 pixels wide.
 *
 * Rows src-2*pitch .. src+pitch are loaded as p1, p0, q0, q1; only the
 * two rows adjacent to the edge (p0 and q0) are written back.
 *
 * @param src         first row below the edge
 * @param pitch       byte distance between rows; row offsets are computed
 *                    by multiplication so a negative pitch stays
 *                    well-defined
 * @param b_limit_ptr despite its name this is the limit value itself (an
 *                    int, not a pointer); it is broadcast to all lanes
 */
void ff_vp8_v_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch,
                                     int b_limit_ptr)
{
    v16u8 p1, p0, q1, q0;
    v16u8 mask, b_limit;

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);

    /* Load vector elements. "2 * pitch" replaces "pitch << 1": a left
     * shift of a negative signed value is undefined behaviour in C. */
    LD_UB4(src - 2 * pitch, pitch, p1, p0, q0, q1);
    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
    ST_UB2(p0, q0, (src - pitch), pitch);
}
コード例 #27
0
/* 4-tap loop filter across a vertical edge, processing 16 rows at once with
 * two independent parameter sets (one per 64-bit half of the vectors).
 *
 * Sixteen rows are read starting four bytes left of src, transposed so that
 * each of p3..q3 holds one pixel column, filtered, and the four modified
 * columns p1, p0, q0, q1 are transposed back and written starting two bytes
 * left of src. */
void vp9_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0_ptr,
                                 const uint8_t *limit0_ptr,
                                 const uint8_t *thresh0_ptr,
                                 const uint8_t *b_limit1_ptr,
                                 const uint8_t *limit1_ptr,
                                 const uint8_t *thresh1_ptr) {
  v16u8 p3, p2, p1, p0, q0, q1, q2, q3;
  v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
  v16u8 r8, r9, r10, r11, r12, r13, r14, r15;
  v16u8 thr, blim, lim;
  v16u8 hev, mask, flat;
  v8i16 ilv_a, ilv_b, cols_0, cols_1, cols_2, cols_3;
  uint8_t *load_ptr = src - 4;
  uint8_t *store_ptr = src - 2;

  /* broadcast each scalar and merge the two sets into a single vector */
  thr = (v16u8)__msa_ilvr_d((v2i64)__msa_fill_b(*thresh1_ptr),
                            (v2i64)__msa_fill_b(*thresh0_ptr));
  blim = (v16u8)__msa_ilvr_d((v2i64)__msa_fill_b(*b_limit1_ptr),
                             (v2i64)__msa_fill_b(*b_limit0_ptr));
  lim = (v16u8)__msa_ilvr_d((v2i64)__msa_fill_b(*limit1_ptr),
                            (v2i64)__msa_fill_b(*limit0_ptr));

  /* rows 0..7, then rows 8..15 */
  LD_UB8(load_ptr, pitch, r0, r1, r2, r3, r4, r5, r6, r7);
  load_ptr += 8 * pitch;
  LD_UB8(load_ptr, pitch, r8, r9, r10, r11, r12, r13, r14, r15);

  /* turn the sixteen rows into eight pixel columns */
  TRANSPOSE16x8_UB_UB(r0, r1, r2, r3, r4, r5, r6, r7,
                      r8, r9, r10, r11, r12, r13, r14, r15,
                      p3, p2, p1, p0, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, lim, blim, thr,
               hev, mask, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

  /* interleave p1/p0/q0/q1 back into row order for the 4-byte stores */
  ILVR_B2_SH(p0, p1, q1, q0, ilv_a, ilv_b);
  ILVRL_H2_SH(ilv_b, ilv_a, cols_0, cols_1);
  ILVL_B2_SH(p0, p1, q1, q0, ilv_a, ilv_b);
  ILVRL_H2_SH(ilv_b, ilv_a, cols_2, cols_3);

  /* first eight rows, then the following eight */
  ST4x8_UB(cols_0, cols_1, store_ptr, pitch);
  store_ptr += (8 * pitch);
  ST4x8_UB(cols_2, cols_3, store_ptr, pitch);
}
コード例 #28
0
ファイル: enc_msa.c プロジェクト: garrettmoon/libwebp
static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
                                        const uint8_t* top) {
  if (left != NULL) {
    if (top != NULL) {
      int j;
      v8i16 d1, d2;
      const v16i8 zero = { 0 };
      const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
      const v16u8 T = LD_UB(top);
      ILVRL_B2_SH(zero, T, d1, d2);
      SUB2(d1, TL, d2, TL, d1, d2);
      for (j = 0; j < 16; j += 4) {
        v16i8 t0, t1, t2, t3;
        v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
        const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]);
        const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]);
        const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]);
        const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]);
        ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
        ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
        CLIP_SH4_0_255(r0, r1, r2, r3);
        CLIP_SH4_0_255(r4, r5, r6, r7);
        PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
        ST_SB4(t0, t1, t2, t3, dst, BPS);
        dst += 4 * BPS;
      }
    } else {
      HorizontalPred16x16(dst, left);
    }
  } else {
    if (top != NULL) {
      VerticalPred16x16(dst, top);
    } else {
      const v16u8 out = (v16u8)__msa_fill_b(0x81);
      STORE16x16(out, dst);
    }
  }
}
コード例 #29
0
/* 8-tap loop filter across a horizontal edge, 16 pixels wide, with two
 * independent parameter sets (each 64-bit half of the parameter vectors
 * carries one set).
 *
 * src points at the first row below the edge (q0); rows are pitch bytes
 * apart. The eight rows p3..q3 are loaded, the 4-tap filter is always
 * evaluated, and the 8-tap filter is evaluated only if at least one lane is
 * flagged flat. Without any flat lane the four rows p1..q1 are written;
 * otherwise the six rows p2..q2 are written. */
void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *b_limit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* broadcast each scalar and merge the two parameter sets per vector */
  thresh = (v16u8)__msa_fill_b(*thresh0);
  tmp = (v16u8)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  tmp = (v16u8)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  tmp = (v16u8)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    /* no lane is flat: the 4-tap result is final */
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
  } else {
    /* widen the right (low) half of every row to 16 bits and filter it */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
               q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* same for the left (high) half */
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* per lane: take the 8-tap result where flat is set, otherwise keep the
     * 4-tap result (or the unfiltered pixel for p2 / q2) */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* store pixel values: p2..q0 first, then q1 and q2 */
    src -= 3 * pitch;

    ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1_out, q2_out, src, pitch);
  }
}
コード例 #30
0
/* 8-tap loop filter across a vertical edge, processing 16 rows at once with
 * two independent parameter sets (each 64-bit half of the parameter vectors
 * carries one set).
 *
 * Sixteen rows are read starting four bytes left of src and transposed so
 * that each of p3..q3 holds one pixel column. The 4-tap filter is always
 * evaluated; the 8-tap filter only if at least one lane is flagged flat.
 * Without any flat lane four bytes per row are written starting at src - 2;
 * otherwise six bytes per row are written starting at src - 3. */
void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0,
                                 const uint8_t *limit0,
                                 const uint8_t *thresh0,
                                 const uint8_t *b_limit1,
                                 const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *temp_src;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

  temp_src = src - 4;

  /* Rows 0..7, then rows 8..15. The p*/q* variables are used here as plain
   * row storage (rows 0..3 in p0..p3, rows 8..11 in q3..q0); they only take
   * on their pixel-column meaning after the transpose below. */
  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
  temp_src += (8 * pitch);
  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

  /* transpose 16x8 matrix into 8x16 */
  /* NOTE(review): inputs and outputs share the p*/q* variables, so this
   * presumably relies on the order in which the macro reads its inputs and
   * writes its outputs -- confirm against the macro before reordering any
   * argument. */
  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
                      q3, q2, q1, q0, row12, row13, row14, row15,
                      p3, p2, p1, p0, q0, q1, q2, q3);

  /* Broadcast each scalar and merge the two parameter sets per vector;
   * vec0 is only used as scratch for the second set here. */
  thresh = (v16u8)__msa_fill_b(*thresh0);
  vec0 = (v8i16)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  vec0 = (v8i16)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  vec0 = (v8i16)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    /* No lane is flat: the 4-tap result is final. Interleave the four
     * modified columns back into row order. */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    /* write p1..q1 for the first eight rows, then the following eight */
    src -= 2;
    ST4x8_UB(vec2, vec3, src, pitch);
    src += 8 * pitch;
    ST4x8_UB(vec4, vec5, src, pitch);
  } else {
    /* widen the right (low) half of every column to 16 bits and filter it */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* same for the left (high) half */
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);

    /* filter8 */
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* per lane: take the 8-tap result where flat is set, otherwise keep the
     * 4-tap result (or the unfiltered pixel for p2 / q2) */
    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* Interleave back into row order: vec3/vec4/vec6/vec7 carry the
     * p2,p1,p0,q0 bytes of four rows each, vec2/vec5 the q1,q2 byte pairs. */
    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    /* Four groups of four rows; each row gets a 4-byte store at src - 3
     * followed by a 2-byte store four bytes further right. */
    src -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 4, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 4, src + 4, pitch);
  }
}