예제 #1
0
void vp8_loop_filter_simple_vertical_edge_msa(uint8_t *src, int32_t pitch,
                                              const uint8_t *b_limit_ptr) {
  uint8_t *temp_src;
  v16u8 p1, p0, q1, q0;
  v16u8 mask, b_limit;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16u8 row9, row10, row11, row12, row13, row14, row15;
  v8i16 tmp0, tmp1;

  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  temp_src = src - 2;
  LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  temp_src += (8 * pitch);
  LD_UB8(temp_src, pitch, row8, row9, row10, row11, row12, row13, row14, row15);
  TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p1, p0,
                      q0, q1);
  VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
  VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
  ILVRL_B2_SH(q0, p0, tmp1, tmp0);

  src -= 1;
  ST2x4_UB(tmp1, 0, src, pitch);
  src += 4 * pitch;
  ST2x4_UB(tmp1, 4, src, pitch);
  src += 4 * pitch;
  ST2x4_UB(tmp0, 0, src, pitch);
  src += 4 * pitch;
  ST2x4_UB(tmp0, 4, src, pitch);
  src += 4 * pitch;
}
예제 #2
0
void ff_vp8_h_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
                                      int32_t e, int32_t i, int32_t h)
{
    v16u8 mask, hev, flat;
    v16u8 thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src - 4 + (8 * pitch), pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(h);
    b_limit = (v16u8) __msa_fill_b(e);
    limit = (v16u8) __msa_fill_b(i);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);

    src -= 2;
    ST4x8_UB(tmp2, tmp3, src, pitch);
    src += (8 * pitch);
    ST4x8_UB(tmp4, tmp5, src, pitch);
}
예제 #3
0
static int32_t sum_u8src_16width_msa(uint8_t *src, int32_t stride)
{
    uint32_t sum = 0;
    v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
    v16u8 in8, in9, in10, in11, in12, in13, in14, in15;

    LD_UB8(src, stride, in0, in1, in2, in3, in4, in5, in6, in7);
    src += (8 * stride);
    LD_UB8(src, stride, in8, in9, in10, in11, in12, in13, in14, in15);

    HADD_UB4_UB(in0, in1, in2, in3, in0, in1, in2, in3);
    HADD_UB4_UB(in4, in5, in6, in7, in4, in5, in6, in7);
    HADD_UB4_UB(in8, in9, in10, in11, in8, in9, in10, in11);
    HADD_UB4_UB(in12, in13, in14, in15, in12, in13, in14, in15);

    sum = HADD_UH_U32(in0);
    sum += HADD_UH_U32(in1);
    sum += HADD_UH_U32(in2);
    sum += HADD_UH_U32(in3);
    sum += HADD_UH_U32(in4);
    sum += HADD_UH_U32(in5);
    sum += HADD_UH_U32(in6);
    sum += HADD_UH_U32(in7);
    sum += HADD_UH_U32(in8);
    sum += HADD_UH_U32(in9);
    sum += HADD_UH_U32(in10);
    sum += HADD_UH_U32(in11);
    sum += HADD_UH_U32(in12);
    sum += HADD_UH_U32(in13);
    sum += HADD_UH_U32(in14);
    sum += HADD_UH_U32(in15);

    return sum;
}
예제 #4
0
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride, int32_t height) {
  int32_t cnt;
  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

  if (0 == height % 12) {
    for (cnt = (height / 12); cnt--;) {
      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
      src += (8 * src_stride);

      out0 = __msa_copy_u_d((v2i64)src0, 0);
      out1 = __msa_copy_u_d((v2i64)src1, 0);
      out2 = __msa_copy_u_d((v2i64)src2, 0);
      out3 = __msa_copy_u_d((v2i64)src3, 0);
      out4 = __msa_copy_u_d((v2i64)src4, 0);
      out5 = __msa_copy_u_d((v2i64)src5, 0);
      out6 = __msa_copy_u_d((v2i64)src6, 0);
      out7 = __msa_copy_u_d((v2i64)src7, 0);

      SD4(out0, out1, out2, out3, dst, dst_stride);
      dst += (4 * dst_stride);
      SD4(out4, out5, out6, out7, dst, dst_stride);
      dst += (4 * dst_stride);

      LD_UB4(src, src_stride, src0, src1, src2, src3);
      src += (4 * src_stride);

      out0 = __msa_copy_u_d((v2i64)src0, 0);
      out1 = __msa_copy_u_d((v2i64)src1, 0);
      out2 = __msa_copy_u_d((v2i64)src2, 0);
      out3 = __msa_copy_u_d((v2i64)src3, 0);
      SD4(out0, out1, out2, out3, dst, dst_stride);
      dst += (4 * dst_stride);
    }
  } else if (0 == height % 8) {
    for (cnt = height >> 3; cnt--;) {
      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
      src += (8 * src_stride);

      out0 = __msa_copy_u_d((v2i64)src0, 0);
      out1 = __msa_copy_u_d((v2i64)src1, 0);
      out2 = __msa_copy_u_d((v2i64)src2, 0);
      out3 = __msa_copy_u_d((v2i64)src3, 0);
      out4 = __msa_copy_u_d((v2i64)src4, 0);
      out5 = __msa_copy_u_d((v2i64)src5, 0);
      out6 = __msa_copy_u_d((v2i64)src6, 0);
      out7 = __msa_copy_u_d((v2i64)src7, 0);

      SD4(out0, out1, out2, out3, dst, dst_stride);
      dst += (4 * dst_stride);
      SD4(out4, out5, out6, out7, dst, dst_stride);
      dst += (4 * dst_stride);
    }
  } else if (0 == height % 4) {
예제 #5
0
void ff_vp8_v_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v,
                                 ptrdiff_t pitch, int b_limit_in, int limit_in,
                                 int thresh_in)
{
    uint8_t *temp_src;
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);

    temp_src = src_u - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
    temp_src = src_v - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);

    /* rht 8 element of p3 are u pixel and left 8 element of p3 are v pixel */
    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    p2_d = __msa_copy_u_d((v2i64) p2, 0);
    p1_d = __msa_copy_u_d((v2i64) p1, 0);
    p0_d = __msa_copy_u_d((v2i64) p0, 0);
    q0_d = __msa_copy_u_d((v2i64) q0, 0);
    q1_d = __msa_copy_u_d((v2i64) q1, 0);
    q2_d = __msa_copy_u_d((v2i64) q2, 0);
    src_u -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
    src_u += 4 * pitch;
    SD(q1_d, src_u);
    src_u += pitch;
    SD(q2_d, src_u);

    p2_d = __msa_copy_u_d((v2i64) p2, 1);
    p1_d = __msa_copy_u_d((v2i64) p1, 1);
    p0_d = __msa_copy_u_d((v2i64) p0, 1);
    q0_d = __msa_copy_u_d((v2i64) q0, 1);
    q1_d = __msa_copy_u_d((v2i64) q1, 1);
    q2_d = __msa_copy_u_d((v2i64) q2, 1);
    src_v -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
    src_v += 4 * pitch;
    SD(q1_d, src_v);
    src_v += pitch;
    SD(q2_d, src_v);
}
예제 #6
0
static void mbloop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                                 int32_t pitch,
                                                 const uint8_t b_limit_in,
                                                 const uint8_t limit_in,
                                                 const uint8_t thresh_in) {
  uint8_t *temp_src;
  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;
  v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
  v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

  b_limit = (v16u8)__msa_fill_b(b_limit_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  thresh = (v16u8)__msa_fill_b(thresh_in);

  temp_src = src_u - (pitch << 2);
  LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
  temp_src = src_v - (pitch << 2);
  LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);

  ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
  ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

  p2_d = __msa_copy_u_d((v2i64)p2, 0);
  p1_d = __msa_copy_u_d((v2i64)p1, 0);
  p0_d = __msa_copy_u_d((v2i64)p0, 0);
  q0_d = __msa_copy_u_d((v2i64)q0, 0);
  q1_d = __msa_copy_u_d((v2i64)q1, 0);
  q2_d = __msa_copy_u_d((v2i64)q2, 0);
  src_u -= (pitch * 3);
  SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
  src_u += 4 * pitch;
  SD(q1_d, src_u);
  src_u += pitch;
  SD(q2_d, src_u);

  p2_d = __msa_copy_u_d((v2i64)p2, 1);
  p1_d = __msa_copy_u_d((v2i64)p1, 1);
  p0_d = __msa_copy_u_d((v2i64)p0, 1);
  q0_d = __msa_copy_u_d((v2i64)q0, 1);
  q1_d = __msa_copy_u_d((v2i64)q1, 1);
  q2_d = __msa_copy_u_d((v2i64)q2, 1);
  src_v -= (pitch * 3);
  SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
  src_v += 4 * pitch;
  SD(q1_d, src_v);
  src_v += pitch;
  SD(q2_d, src_v);
}
예제 #7
0
void vp9_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0_ptr,
                                 const uint8_t *limit0_ptr,
                                 const uint8_t *thresh0_ptr,
                                 const uint8_t *b_limit1_ptr,
                                 const uint8_t *limit1_ptr,
                                 const uint8_t *thresh1_ptr) {
  v16u8 mask, hev, flat;
  v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

  LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  LD_UB8(src - 4 + (8 * pitch), pitch,
         row8, row9, row10, row11, row12, row13, row14, row15);

  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                      row8, row9, row10, row11, row12, row13, row14, row15,
                      p3, p2, p1, p0, q0, q1, q2, q3);

  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);

  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);

  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
               hev, mask, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
  ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
  ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);

  src -= 2;

  ST4x8_UB(tmp2, tmp3, src, pitch);
  src += (8 * pitch);
  ST4x8_UB(tmp4, tmp5, src, pitch);
}
예제 #8
0
static void loop_filter_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
                                              const uint8_t *b_limit0_ptr,
                                              const uint8_t *limit0_ptr,
                                              const uint8_t *thresh0_ptr,
                                              const uint8_t *b_limit1_ptr,
                                              const uint8_t *limit1_ptr,
                                              const uint8_t *thresh1_ptr) {
  v16u8 mask, hev, flat;
  v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);

  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);

  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
               mask, flat);
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}
예제 #9
0
void vp9_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr,
                              int32_t count) {
  uint64_t p1_d, p0_d, q0_d, q1_d;
  v16u8 mask, hev, flat, thresh, b_limit, limit;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;

  (void)count;

  /* load vector elements */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
  p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
  q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
  q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
  SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
}
예제 #10
0
void vp9_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit0_ptr,
                                   const uint8_t *limit0_ptr,
                                   const uint8_t *thresh0_ptr,
                                   const uint8_t *b_limit1_ptr,
                                   const uint8_t *limit1_ptr,
                                   const uint8_t *thresh1_ptr) {
  v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

  /* load vector elements */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);

  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);

  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
               hev, mask, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}
예제 #11
0
void vp9_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
                            const uint8_t *b_limit_ptr,
                            const uint8_t *limit_ptr,
                            const uint8_t *thresh_ptr,
                            int32_t count) {
  v16u8 mask, hev, flat, limit, thresh, b_limit;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v8i16 vec0, vec1, vec2, vec3;

  (void)count;

  LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
                     p3, p2, p1, p0, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
  ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
  ILVRL_H2_SH(vec1, vec0, vec2, vec3);

  src -= 2;
  ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
  src += 4 * pitch;
  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
}
예제 #12
0
static void loop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                               int32_t pitch,
                                               const uint8_t b_limit_in,
                                               const uint8_t limit_in,
                                               const uint8_t thresh_in) {
  uint64_t p1_d, p0_d, q0_d, q1_d;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;
  v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
  v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

  thresh = (v16u8)__msa_fill_b(thresh_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  b_limit = (v16u8)__msa_fill_b(b_limit_in);

  src_u = src_u - (pitch << 2);
  LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
  src_u += (5 * pitch);
  src_v = src_v - (pitch << 2);
  LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
  src_v += (5 * pitch);

  /* right 8 element of p3 are u pixel and
     left 8 element of p3 are v pixel */
  ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
  ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

  p1_d = __msa_copy_u_d((v2i64)p1, 0);
  p0_d = __msa_copy_u_d((v2i64)p0, 0);
  q0_d = __msa_copy_u_d((v2i64)q0, 0);
  q1_d = __msa_copy_u_d((v2i64)q1, 0);
  SD4(q1_d, q0_d, p0_d, p1_d, src_u, (-pitch));

  p1_d = __msa_copy_u_d((v2i64)p1, 1);
  p0_d = __msa_copy_u_d((v2i64)p0, 1);
  q0_d = __msa_copy_u_d((v2i64)q0, 1);
  q1_d = __msa_copy_u_d((v2i64)q1, 1);
  SD4(q1_d, q0_d, p0_d, p1_d, src_v, (-pitch));
}
예제 #13
0
static void loop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                             int32_t pitch,
                                             const uint8_t b_limit_in,
                                             const uint8_t limit_in,
                                             const uint8_t thresh_in) {
  uint8_t *temp_src_u, *temp_src_v;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16u8 row9, row10, row11, row12, row13, row14, row15;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

  thresh = (v16u8)__msa_fill_b(thresh_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  b_limit = (v16u8)__msa_fill_b(b_limit_in);

  LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  LD_UB8(src_v - 4, pitch, row8, row9, row10, row11, row12, row13, row14,
         row15);
  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
                      p1, p0, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
  ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
  ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
  tmp0 = (v4i32)__msa_ilvl_b((v16i8)p0, (v16i8)p1);
  tmp1 = (v4i32)__msa_ilvl_b((v16i8)q1, (v16i8)q0);
  ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);

  temp_src_u = src_u - 2;
  ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch);
  temp_src_u += 4 * pitch;
  ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch);

  temp_src_v = src_v - 2;
  ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch);
  temp_src_v += 4 * pitch;
  ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch);
}
예제 #14
0
void ff_vp8_v_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
                                      int32_t e, int32_t i, int32_t h)
{
    v16u8 mask, hev, flat;
    v16u8 thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
    thresh = (v16u8) __msa_fill_b(h);
    b_limit = (v16u8) __msa_fill_b(e);
    limit = (v16u8) __msa_fill_b(i);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}
예제 #15
0
uint32_t vp10_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
  uint32_t sum_out;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
  v4u32 sum = { 0 };

  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3);
  HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7);
  ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6);
  ADD2(sum0, sum2, sum4, sum6, sum0, sum4);
  sum0 += sum4;

  sum = __msa_hadd_u_w(sum0, sum0);
  sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum);
  sum = __msa_hadd_u_w(sum0, sum0);
  sum = (v4u32)__msa_srari_w((v4i32)sum, 6);
  sum_out = __msa_copy_u_w((v4i32)sum, 0);

  return sum_out;
}
예제 #16
0
static void mbloop_filter_horizontal_edge_y_msa(uint8_t *src, int32_t pitch,
                                                const uint8_t b_limit_in,
                                                const uint8_t limit_in,
                                                const uint8_t thresh_in) {
  uint8_t *temp_src;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;

  b_limit = (v16u8)__msa_fill_b(b_limit_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  thresh = (v16u8)__msa_fill_b(thresh_in);
  temp_src = src - (pitch << 2);
  LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
  temp_src = src - 3 * pitch;
  ST_UB4(p2, p1, p0, q0, temp_src, pitch);
  temp_src += (4 * pitch);
  ST_UB2(q1, q2, temp_src, pitch);
}
예제 #17
0
void ff_vp8_v_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
                                int limit_in, int thresh_in)
{
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);
    /* load vector elements */
    temp_src = src - (pitch << 2);
    LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
    /* store vector elements */
    temp_src = src - 3 * pitch;
    ST_UB4(p2, p1, p0, q0, temp_src, pitch);
    temp_src += (4 * pitch);
    ST_UB2(q1, q2, temp_src, pitch);
}
예제 #18
0
void ff_vp8_h_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
                                int limit_in, int thresh_in)
{
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);
    temp_src = src - 4;
    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
    ILVRL_B2_SH(q2, q1, tmp2, tmp5);

    temp_src = src - 3;
    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4);
}
예제 #19
0
void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *b_limit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh0);
  tmp = (v16u8)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  tmp = (v16u8)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  tmp = (v16u8)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
               q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    src -= 3 * pitch;

    ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1_out, q2_out, src, pitch);
    src += (2 * pitch);
  }
}
예제 #20
0
void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0,
                                 const uint8_t *limit0,
                                 const uint8_t *thresh0,
                                 const uint8_t *b_limit1,
                                 const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *temp_src;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

  temp_src = src - 4;

  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
  temp_src += (8 * pitch);
  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

  /* transpose 16x8 matrix into 8x16 */
  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
                      q3, q2, q1, q0, row12, row13, row14, row15,
                      p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh0);
  vec0 = (v8i16)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  vec0 = (v8i16)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  vec0 = (v8i16)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src -= 2;
    ST4x8_UB(vec2, vec3, src, pitch);
    src += 8 * pitch;
    ST4x8_UB(vec4, vec5, src, pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);

    /* filter8 */
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    src -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 4, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 4, src + 4, pitch);
  }
}
예제 #21
0
void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
                            const uint8_t *b_limit_ptr,
                            const uint8_t *limit_ptr,
                            const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v16u8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4;

  /* load vector elements */
  LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
                     p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    /* Store 4 pixels p1-_q1 */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);

    src -= 2;
    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
    src += 4 * pitch;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
                p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* Store 6 pixels p2-_q2 */
    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);

    src -= 3;
    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec4, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec4, 4, src + 4, pitch);
  }
}
예제 #22
0
void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr) {
  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
  v16u8 mask, hev, flat, thresh, b_limit, limit;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8;
  v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
  v16i8 zero = { 0 };

  /* load vector elements */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
               q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
                zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
                q0_filter8);
    PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

    p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

    src -= 3 * pitch;

    SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
    src += (4 * pitch);
    SD(q1_d, src);
    src += pitch;
    SD(q2_d, src);
  }
}
예제 #23
0
static void mbloop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                               int32_t pitch,
                                               const uint8_t b_limit_in,
                                               const uint8_t limit_in,
                                               const uint8_t thresh_in) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16u8 row9, row10, row11, row12, row13, row14, row15;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  b_limit = (v16u8)__msa_fill_b(b_limit_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  thresh = (v16u8)__msa_fill_b(thresh_in);

  LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  LD_UB8(src_v - 4, pitch, row8, row9, row10, row11, row12, row13, row14,
         row15);
  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
                      p1, p0, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

  ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
  ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
  ILVRL_B2_SH(q2, q1, tmp2, tmp5);

  src_u -= 3;
  VP8_ST6x1_UB(tmp3, 0, tmp2, 0, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp3, 1, tmp2, 1, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp3, 2, tmp2, 2, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp3, 3, tmp2, 3, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp4, 0, tmp2, 4, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp4, 1, tmp2, 5, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp4, 2, tmp2, 6, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp4, 3, tmp2, 7, src_u, 4);

  src_v -= 3;
  VP8_ST6x1_UB(tmp6, 0, tmp5, 0, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp6, 1, tmp5, 1, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp6, 2, tmp5, 2, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp6, 3, tmp5, 3, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp7, 0, tmp5, 4, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp7, 1, tmp5, 5, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp7, 2, tmp5, 6, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp7, 3, tmp5, 7, src_v, 4);
}