Ejemplo n.º 1
0
static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
                                 int src_width, int do_store) {
  int i;
  for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {
    const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);
    const uint16x8_t R = vpaddlq_u8(RGB.val[2]);  // pair-wise adds
    const uint16x8_t G = vpaddlq_u8(RGB.val[1]);
    const uint16x8_t B = vpaddlq_u8(RGB.val[0]);
    int16x8_t U_tmp, V_tmp;
    CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp);
    {
      const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1);
      const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1);
      if (do_store) {
        vst1_u8(u, U);
        vst1_u8(v, V);
      } else {
        const uint8x8_t prev_u = vld1_u8(u);
        const uint8x8_t prev_v = vld1_u8(v);
        vst1_u8(u, vrhadd_u8(U, prev_u));
        vst1_u8(v, vrhadd_u8(V, prev_v));
      }
    }
  }
  if (i < src_width) {  // left-over
    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
  }
}
Ejemplo n.º 2
0
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
  const uint32x2_t zero = vdup_n_u32(0);
  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
}
Ejemplo n.º 3
0
void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
  static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
  const uint8x8_t sh_12345677 = vld1_u8(shuffle1);
  const uint8x8_t sh_23456777 = vld1_u8(shuffle2);
  const uint8x8_t A0 = vld1_u8(above);  // top row
  const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677);
  const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777);
  const uint8x8_t avg1 = vhadd_u8(A0, A2);
  uint8x8_t row = vrhadd_u8(avg1, A1);
  int i;
  (void)left;
  for (i = 0; i < 7; ++i) {
    vst1_u8(dst + i * stride, row);
    row = vtbl1_u8(row, sh_12345677);
  }
  vst1_u8(dst + i * stride, row);
}
Ejemplo n.º 4
0
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above));  // top row
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
  const uint64x1_t A2 = vshr_n_u64(A0, 16);
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  (void)left;
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
  dst[3 * stride + 3] = above[7];
}
Ejemplo n.º 5
0
void UpsampleRgbaLinePairNEON(const uint8_t *top_y, const uint8_t *bottom_y, const uint8_t *top_u, const uint8_t *top_v, const uint8_t *cur_u, const uint8_t *cur_v, uint8_t *top_dst, uint8_t *bottom_dst, int len)
{
    int block;
    uint8_t uv_buf[2 * 32 + 15];
    uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);
    const int uv_len = (len + 1) >> 1;
    const int num_blocks = (uv_len - 1) >> 3;
    const int leftover = uv_len - num_blocks * 8;
    const int last_pos = 1 + 16 * num_blocks;
    const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;
    const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;
    const int16x4_t cf16 = vld1_s16(coef);
    const int32x2_t cf32 = vmov_n_s32(76283);
    const uint8x8_t u16 = vmov_n_u8(16);
    const uint8x8_t u128 = vmov_n_u8(128);
    for (block = 0; block < num_blocks; ++block) {
        {
            uint8x8_t a = vld1_u8(top_u);
            uint8x8_t b = vld1_u8(top_u + 1);
            uint8x8_t c = vld1_u8(cur_u);
            uint8x8_t d = vld1_u8(cur_u + 1);
            uint16x8_t al = vshll_n_u8(a, 1);
            uint16x8_t bl = vshll_n_u8(b, 1);
            uint16x8_t cl = vshll_n_u8(c, 1);
            uint16x8_t dl = vshll_n_u8(d, 1);
            uint8x8_t diag1, diag2;
            uint16x8_t sl;
            sl = vaddl_u8(a, b);
            sl = vaddw_u8(sl, c);
            sl = vaddw_u8(sl, d);
            al = vaddq_u16(sl, al);
            bl = vaddq_u16(sl, bl);
            al = vaddq_u16(al, dl);
            bl = vaddq_u16(bl, cl);
            diag2 = vshrn_n_u16(al, 3);
            diag1 = vshrn_n_u16(bl, 3);
            a = vrhadd_u8(a, diag1);
            b = vrhadd_u8(b, diag2);
            c = vrhadd_u8(c, diag2);
            d = vrhadd_u8(d, diag1);
            {
                const uint8x8x2_t a_b = {{ a, b }};
                const uint8x8x2_t c_d = {{ c, d }};
                vst2_u8(r_uv, a_b);
                vst2_u8(r_uv + 32, c_d);
            }
        }
        {
            uint8x8_t a = vld1_u8(top_v);
            uint8x8_t b = vld1_u8(top_v + 1);
            uint8x8_t c = vld1_u8(cur_v);
            uint8x8_t d = vld1_u8(cur_v + 1);
            uint16x8_t al = vshll_n_u8(a, 1);
            uint16x8_t bl = vshll_n_u8(b, 1);
            uint16x8_t cl = vshll_n_u8(c, 1);
            uint16x8_t dl = vshll_n_u8(d, 1);
            uint8x8_t diag1, diag2;
            uint16x8_t sl;
            sl = vaddl_u8(a, b);
            sl = vaddw_u8(sl, c);
            sl = vaddw_u8(sl, d);
            al = vaddq_u16(sl, al);
            bl = vaddq_u16(sl, bl);
            al = vaddq_u16(al, dl);
            bl = vaddq_u16(bl, cl);
            diag2 = vshrn_n_u16(al, 3);
            diag1 = vshrn_n_u16(bl, 3);
            a = vrhadd_u8(a, diag1);
            b = vrhadd_u8(b, diag2);
            c = vrhadd_u8(c, diag2);
            d = vrhadd_u8(d, diag1);
            {
                const uint8x8x2_t a_b = {{ a, b }};
                const uint8x8x2_t c_d = {{ c, d }};
                vst2_u8(r_uv + 16, a_b);
                vst2_u8(r_uv + 16 + 32, c_d);
            }
        }
        {
            if (top_y) {
                {
                    int i;
                    for (i = 0; i < 16; i += 8) {
                        int off = ((16 * block + 1) + i) * 4;
                        uint8x8_t y = vld1_u8(top_y + (16 * block + 1) + i);
                        uint8x8_t u = vld1_u8((r_uv) + i);
                        uint8x8_t v = vld1_u8((r_uv) + i + 16);
                        int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));
                        int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));
                        int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));
                        int16x8_t ud = vshlq_n_s16(uu, 1);
                        int16x8_t vd = vshlq_n_s16(vv, 1);
                        int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), vget_low_s16(vd), cf16, 0);
                        int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), vget_high_s16(vd), cf16, 0);
                        int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16));
                        int32x4_t vl = vmovl_s16(vget_low_s16(vv));
                        int32x4_t vh = vmovl_s16(vget_high_s16(vv));
                        int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1);
                        int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);
                        int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2);
                        int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);
                        int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16));
                        int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), vget_low_s16(ud), cf16, 3);
                        int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), vget_high_s16(ud), cf16, 3);
                        int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16));
                        int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr));
                        int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));
                        int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc));
                        int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));
                        int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub));
                        int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));
                        rl = vmulq_lane_s32(rl, cf32, 0);
                        rh = vmulq_lane_s32(rh, cf32, 0);
                        gl = vmulq_lane_s32(gl, cf32, 0);
                        gh = vmulq_lane_s32(gh, cf32, 0);
                        bl = vmulq_lane_s32(bl, cf32, 0);
                        bh = vmulq_lane_s32(bh, cf32, 0);
                        y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16)));
                        u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16)));
                        v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16)));
                        do {
                            const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }};
                            vst4_u8(top_dst + off, r_g_b_v255); 
                        } while (0);
                    }
                }
            }
            if (bottom_y) {
                {
                    int i;
                    for (i = 0; i < 16; i += 8) {
                        int off = ((16 * block + 1) + i) * 4;
                        uint8x8_t y = vld1_u8(bottom_y + (16 * block + 1) + i);
                        uint8x8_t u = vld1_u8(((r_uv) + 32) + i);
                        uint8x8_t v = vld1_u8(((r_uv) + 32) + i + 16);
                        int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));
                        int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));
                        int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));
                        int16x8_t ud = vshlq_n_s16(uu, 1);
                        int16x8_t vd = vshlq_n_s16(vv, 1);
                        int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), vget_low_s16(vd), cf16, 0);
                        int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), vget_high_s16(vd), cf16, 0);
                        int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16));
                        int32x4_t vl = vmovl_s16(vget_low_s16(vv));
                        int32x4_t vh = vmovl_s16(vget_high_s16(vv));
                        int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1);
                        int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);
                        int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2);
                        int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);
                        int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16));
                        int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), vget_low_s16(ud), cf16, 3);
                        int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), vget_high_s16(ud), cf16, 3);
                        int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16));
                        int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr));
                        int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));
                        int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc));
                        int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));
                        int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub));
                        int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));
                        rl = vmulq_lane_s32(rl, cf32, 0);
                        rh = vmulq_lane_s32(rh, cf32, 0);
                        gl = vmulq_lane_s32(gl, cf32, 0);
                        gh = vmulq_lane_s32(gh, cf32, 0);
                        bl = vmulq_lane_s32(bl, cf32, 0);
                        bh = vmulq_lane_s32(bh, cf32, 0);
                        y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16)));
                        u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16)));
                        v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16)));
                        do {
                            const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }};
                            vst4_u8(bottom_dst + off, r_g_b_v255);
                        } while (0);
                    }
                }
            }
        }
    }
}