Example #1
void vpx_lpf_horizontal_4_dual_neon(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
  uint8x16_t qblimit, qlimit, qthresh;
  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;

  dblimit0 = vld1_u8(blimit0);
  dlimit0 = vld1_u8(limit0);
  dthresh0 = vld1_u8(thresh0);
  dblimit1 = vld1_u8(blimit1);
  dlimit1 = vld1_u8(limit1);
  dthresh1 = vld1_u8(thresh1);
  qblimit = vcombine_u8(dblimit0, dblimit1);
  qlimit = vcombine_u8(dlimit0, dlimit1);
  qthresh = vcombine_u8(dthresh0, dthresh1);

  s -= (p << 2);

  q3u8 = vld1q_u8(s);
  s += p;
  q4u8 = vld1q_u8(s);
  s += p;
  q5u8 = vld1q_u8(s);
  s += p;
  q6u8 = vld1q_u8(s);
  s += p;
  q7u8 = vld1q_u8(s);
  s += p;
  q8u8 = vld1q_u8(s);
  s += p;
  q9u8 = vld1q_u8(s);
  s += p;
  q10u8 = vld1q_u8(s);

  loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8,
                      q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8);

  s -= (p * 5);
  vst1q_u8(s, q5u8);
  s += p;
  vst1q_u8(s, q6u8);
  s += p;
  vst1q_u8(s, q7u8);
  s += p;
  vst1q_u8(s, q8u8);
  return;
}
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
                                       int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const uint8_t *filter) {
  const uint8x8_t f0 = vmov_n_u8(filter[0]);
  const uint8x8_t f1 = vmov_n_u8(filter[1]);
  unsigned int i, j;
  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; j += 16) {
      const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
      const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
      const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
      const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
      const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
      const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
      const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
      const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
      vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
    }
    // Next row...
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}
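For reference, a scalar sketch of the per-pixel arithmetic the function above vectorizes; FILTER_BITS is assumed to be the library's filter precision (7 for a 128-scale bilinear kernel), and the rounding shift below corresponds to vrshrn_n_u16 in the NEON code.

/* Scalar sketch of the bilinear step above: each output byte is a
 * rounded, right-shifted blend of two source bytes. FILTER_BITS is
 * assumed to be defined by the surrounding library. Illustrative only. */
static void var_filter_block2d_bil_scalar(const uint8_t *src, uint8_t *dst,
                                          unsigned int src_stride,
                                          int pixel_step,
                                          unsigned int height,
                                          unsigned int width,
                                          const uint8_t *filter) {
  unsigned int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const unsigned int sum =
          src[j] * filter[0] + src[j + pixel_step] * filter[1];
      dst[j] = (uint8_t)((sum + (1u << (FILTER_BITS - 1))) >> FILTER_BITS);
    }
    src += src_stride;
    dst += width;
  }
}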
Example #3
static v16
mulby(uint8_t x, v16 v)
{
#ifdef LIBRS_USE_NEON

#define uint8x16_to_8x8x2(v) ((uint8x8x2_t) { vget_low_u8(v), vget_high_u8(v) })

    v16 lo, hi;

    lo = v & VEC16(0x0f);
    hi = vshrq_n_u8(v, 4);
    lo = vcombine_u8(
            vtbl2_u8(uint8x16_to_8x8x2(rs_nibmul[x].lo), vget_low_u8(lo)),
            vtbl2_u8(uint8x16_to_8x8x2(rs_nibmul[x].lo), vget_high_u8(lo)));
    hi = vcombine_u8(
            vtbl2_u8(uint8x16_to_8x8x2(rs_nibmul[x].hi), vget_low_u8(hi)),
            vtbl2_u8(uint8x16_to_8x8x2(rs_nibmul[x].hi), vget_high_u8(hi)));
    return lo ^ hi;

#elif defined(LIBRS_USE_SSSE3)

    v16 lo, hi;

    lo = v & VEC16(0x0f);
    hi = __builtin_ia32_psrawi128(v, 4);
    hi &= VEC16(0x0f);
    lo = __builtin_ia32_pshufb128(rs_nibmul[x].lo, lo);
    hi = __builtin_ia32_pshufb128(rs_nibmul[x].hi, hi);
    return lo ^ hi;

#else

    v16 vv = VEC16(0);

    while (x != 0) {
        if (x & 1)
            vv ^= v;
        x >>= 1;
        v = mul2(v);
    }
    return vv;

#endif
}
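Both table-lookup branches above rely on the same nibble decomposition of a GF(2^8) multiply. A scalar sketch of that identity follows, assuming rs_nibmul[x].lo and rs_nibmul[x].hi hold the 16-entry low- and high-nibble product tables that the vector code indexes.

/* Scalar sketch of the nibble decomposition used above. In GF(2^8),
 * x * v == x * (v & 0x0f) ^ x * (v & 0xf0), so a multiply by x reduces
 * to two 16-entry table lookups and an XOR. The byte views of
 * rs_nibmul[x].lo / .hi are assumed to hold those tables, as in the
 * vector code above. Illustrative only. */
static uint8_t
mulby_scalar(uint8_t x, uint8_t v)
{
    const uint8_t *lo = (const uint8_t *)&rs_nibmul[x].lo;
    const uint8_t *hi = (const uint8_t *)&rs_nibmul[x].hi;

    return lo[v & 0x0f] ^ hi[v >> 4];
}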
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const uint32_t* const end = argb_data + (num_pixels & ~3);
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
  for (; argb_data < end; argb_data += 4) {
    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
    const uint8x16_t greens =
        vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
                    vtbl1_u8(vget_high_u8(argb), shuffle));
    vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens));
  }
  // fallthrough and finish off with plain-C
  VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
}
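A scalar sketch of the transform itself: the green channel is added to the red and blue channels of each pixel, wrapping mod 256 exactly as vaddq_u8 does above. The 0xAARRGGBB word layout is an assumption that matches the byte shuffle used in the NEON loop.

// Scalar sketch of AddGreenToBlueAndRed: add the green channel to the
// red and blue channels of each pixel, wrapping mod 256. Assumes
// 0xAARRGGBB packing in each uint32_t. Illustrative only.
static void AddGreenToBlueAndRed_Scalar(uint32_t* argb_data, int num_pixels) {
  int i;
  for (i = 0; i < num_pixels; ++i) {
    const uint32_t argb = argb_data[i];
    const uint32_t green = (argb >> 8) & 0xff;
    uint32_t red_blue = argb & 0x00ff00ffu;
    red_blue += (green << 16) | green;  // add green to R and B
    red_blue &= 0x00ff00ffu;            // wrap each channel
    argb_data[i] = (argb & 0xff00ff00u) | red_blue;
  }
}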
static INLINE void scaledconvolve_vert_w16(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int x, y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  y = h;
  do {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    if (y_q4 & SUBPEL_MASK) {
      x = 0;
      do {
        const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
        uint8x16_t ss[8];
        uint8x8_t s[8], d[2];
        load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
                     &ss[5], &ss[6], &ss[7]);
        s[0] = vget_low_u8(ss[0]);
        s[1] = vget_low_u8(ss[1]);
        s[2] = vget_low_u8(ss[2]);
        s[3] = vget_low_u8(ss[3]);
        s[4] = vget_low_u8(ss[4]);
        s[5] = vget_low_u8(ss[5]);
        s[6] = vget_low_u8(ss[6]);
        s[7] = vget_low_u8(ss[7]);
        d[0] = scale_filter_8(s, filters);

        s[0] = vget_high_u8(ss[0]);
        s[1] = vget_high_u8(ss[1]);
        s[2] = vget_high_u8(ss[2]);
        s[3] = vget_high_u8(ss[3]);
        s[4] = vget_high_u8(ss[4]);
        s[5] = vget_high_u8(ss[5]);
        s[6] = vget_high_u8(ss[6]);
        s[7] = vget_high_u8(ss[7]);
        d[1] = scale_filter_8(s, filters);
        vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
        src_y += 16;
        x += 16;
      } while (x < w);
    } else {
      memcpy(dst, &src_y[3 * src_stride], w);
    }
    dst += dst_stride;
    y_q4 += y_step_q4;
  } while (--y);
}
static inline uint8x16_t condense_float_rgbas(float32x4_t rgba0,
					      float32x4_t rgba1,  
					      float32x4_t rgba2,
					      float32x4_t rgba3)
{
  uint8x16_t retval = {0};  /* 16 bytes as 4 4-byte RGBAs  */
  int32x4_t i32pixels0, i32pixels1, i32pixels2, i32pixels3;
  int16x4_t i16pixels0, i16pixels1, i16pixels2, i16pixels3;
  int16x8_t i16pixels01, i16pixels23;
  uint8x8_t u8pixels0, u8pixels1;
  
  /* the choice of saturating conversions here will turn the elements  */
  /* of the rgbaN vectors into unsigned chars (0 - 255), so no max/min  */
  /* is required here.   */


  /* first float to int  */
  i32pixels0 = vcvtq_s32_f32(rgba0);
  i32pixels1 = vcvtq_s32_f32(rgba1);
  i32pixels2 = vcvtq_s32_f32(rgba2);
  i32pixels3 = vcvtq_s32_f32(rgba3);

  /* then int to short  */
  i16pixels0 = vqmovn_s32(i32pixels0);
  i16pixels1 = vqmovn_s32(i32pixels1);
  i16pixels2 = vqmovn_s32(i32pixels2);
  i16pixels3 = vqmovn_s32(i32pixels3);
  
  i16pixels01 = vcombine_s16(i16pixels0, i16pixels1);
  i16pixels23 = vcombine_s16(i16pixels2, i16pixels3);

  /* now short to unsigned char. saturation takes care of the boundary cases  */
  u8pixels0 = vqmovun_s16(i16pixels01);
  u8pixels1 = vqmovun_s16(i16pixels23);
  
  retval = vcombine_u8(u8pixels0, u8pixels1);

  return(retval);
}
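A hypothetical caller of the helper above, packing four float RGBA pixels (components already scaled to 0.0-255.0) into 16 output bytes:

/* Hypothetical usage of the helper above: 'rgba' is assumed to point at
 * 16 consecutive floats (four RGBA pixels, each component in 0.0-255.0);
 * the packed result is stored to 'out'. */
static void store_four_float_pixels(uint8_t *out, const float *rgba)
{
  const uint8x16_t packed = condense_float_rgbas(vld1q_f32(rgba + 0),
                                                 vld1q_f32(rgba + 4),
                                                 vld1q_f32(rgba + 8),
                                                 vld1q_f32(rgba + 12));
  vst1q_u8(out, packed);
}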
void vp8_mbloop_filter_horizontal_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    u -= (pitch << 2);
    v -= (pitch << 2);

    d6 = vld1_u8(u);
    u += pitch;
    d7 = vld1_u8(v);
    v += pitch;
    d8 = vld1_u8(u);
    u += pitch;
    d9 = vld1_u8(v);
    v += pitch;
    d10 = vld1_u8(u);
    u += pitch;
    d11 = vld1_u8(v);
    v += pitch;
    d12 = vld1_u8(u);
    u += pitch;
    d13 = vld1_u8(v);
    v += pitch;
    d14 = vld1_u8(u);
    u += pitch;
    d15 = vld1_u8(v);
    v += pitch;
    d16 = vld1_u8(u);
    u += pitch;
    d17 = vld1_u8(v);
    v += pitch;
    d18 = vld1_u8(u);
    u += pitch;
    d19 = vld1_u8(v);
    v += pitch;
    d20 = vld1_u8(u);
    d21 = vld1_u8(v);

    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);

    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q4, &q5, &q6, &q7, &q8, &q9);

    u -= (pitch * 6);
    v -= (pitch * 6);
    vst1_u8(u, vget_low_u8(q4));
    u += pitch;
    vst1_u8(v, vget_high_u8(q4));
    v += pitch;
    vst1_u8(u, vget_low_u8(q5));
    u += pitch;
    vst1_u8(v, vget_high_u8(q5));
    v += pitch;
    vst1_u8(u, vget_low_u8(q6));
    u += pitch;
    vst1_u8(v, vget_high_u8(q6));
    v += pitch;
    vst1_u8(u, vget_low_u8(q7));
    u += pitch;
    vst1_u8(v, vget_high_u8(q7));
    v += pitch;
    vst1_u8(u, vget_low_u8(q8));
    u += pitch;
    vst1_u8(v, vget_high_u8(q8));
    v += pitch;
    vst1_u8(u, vget_low_u8(q9));
    vst1_u8(v, vget_high_u8(q9));
    return;
}
Example #8
// about twice as fast as generic
void MipMap_24_neon( int width, int height, unsigned char *source, unsigned char *target )
{
    if(width < 32) {
        MipMap_24_generic(width, height, source, target);
        return;
    }

    int newwidth = width / 2;
    int newheight = height / 2;
    int stride = width * 3;
    
    unsigned char *s = target;
    unsigned char *t = source;
    unsigned char *u = t+stride;

    int y, x;
    for( y = 0; y < newheight; y++ ) {
        for( x = 0; x < newwidth; x+=16 ) {
            uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab;

            memcpy(&a0, t,    16);
            memcpy(&a1, t+16, 16);
            memcpy(&a2, t+32, 16);
            memcpy(&a3, t+48, 16);
            memcpy(&a4, t+64, 16);
            memcpy(&a5, t+80, 16);
            memcpy(&a6, u,    16);
            memcpy(&a7, u+16, 16);
            memcpy(&a8, u+32, 16);
            memcpy(&a9, u+48, 16);
            memcpy(&aa, u+64, 16);
            memcpy(&ab, u+80, 16);

            // average first and second scan lines
            a0 = vhaddq_u8(a0, a6);
            a1 = vhaddq_u8(a1, a7);
            a2 = vhaddq_u8(a2, a8);
            a3 = vhaddq_u8(a3, a9);
            a4 = vhaddq_u8(a4, aa);
            a5 = vhaddq_u8(a5, ab);

            // at this point, we have averaged the two scanlines,
            // in 24bit it's a bit of a pain to average the pixels
            // because they are aligned to 3 bytes
            
#if 0  // simple readable version finishing without neon (slower)
            uint8_t b[96], c[96];
            memcpy(b,    &a0, 16);
            memcpy(b+16, &a1, 16);
            memcpy(b+32, &a2, 16);
            memcpy(b+48, &a3, 16);
            memcpy(b+64, &a4, 16);
            memcpy(b+80, &a5, 16);

            int i,j;
            for( i=0; i<16; i++)
                for( j=0; j<3; j++)
                    s[3*i+j] = (b[3*2*i+j] + b[3*(2*i+1)+j]) / 2;
#else
            // full neon version with swizzle (ugly but fast)
            uint8x8_t r0, r1;
            
            uint8x8x4_t z;
            uint8x8x2_t *z2 = (uint8x8x2_t*)&z, *z3 = (uint8x8x2_t*)&z+1;

//          a00 a01 a02 a06 a07 a08 a0c a0d a0e a12 a13 a14 a18 a19 a1a a1e
//          a03 a04 a05 a09 a0a a0b a0f a10 a11 a15 a16 a17 a1b a1c a1d a21
#define int8x16_to_8x8x2(v) ((int8x8x2_t) { vget_low_s8(v), vget_high_s8(v) })
            
            uint8x8_t s0l = {0, 1, 2, 6, 7, 8, 12, 13};
            memcpy(&z, &a0, sizeof a0);
            r0 = vtbl2_u8(*z2, s0l);

            memcpy(z3, &a1, sizeof a1);
            uint8x8_t s0h = {14, 16+2, 16+3, 16+4, 16+8, 16+9, 16+10, 16+14};
            r1 = vtbl4_u8(z, s0h);
            a6 = vcombine_u8 (r0, r1);

            uint8x8_t s1l = {3, 4, 5, 9, 10, 11, 15, 16+0};
            r0 = vtbl4_u8(z, s1l);
            memcpy(&z, &a2, sizeof a2);
            uint8x8_t s1h = {16+1, 16+5, 16+6, 16+7, 16+11, 16+12, 16+13, 1};
            r1 = vtbl4_u8(z, s1h);
            a7 = vcombine_u8 (r0, r1);

            a0 = vhaddq_u8(a6, a7);

//          a1f a20 a24 a25 a26 a2a a2b a2c a30 a31 a32 a36 a37 a38 a3c a3d
//          a22 a23 a27 a28 a29 a2d a2e a2f a33 a34 a35 a39 a3a a3b a3f a40

            uint8x8_t s2l = {16+15, 0, 4, 5, 6, 10, 11, 12};
            r0 = vtbl4_u8(z, s2l);
            uint8x8_t s2h = {0, 1, 2, 6, 7, 8, 12, 13};
            memcpy(z3, &a3, sizeof a3);
            r1 = vtbl2_u8(*z3, s2h);
            a6 = vcombine_u8 (r0, r1);

            uint8x8_t s3l = {2, 3, 7, 8, 9, 13, 14, 15};
            r0 = vtbl2_u8(*z2, s3l);
            memcpy(&z, &a4, sizeof a4);
            uint8x8_t s3h = {16+3, 16+4, 16+5, 16+9, 16+10, 16+11, 16+15, 0};
            r1 = vtbl4_u8(z, s3h);
            a7 = vcombine_u8 (r0, r1);

            a1 = vhaddq_u8(a6, a7);

//          a3e a42 a43 a44 a48 a49 a4a a4e a4f a50 a54 a55 a56 a5a a5b a5c
//          a41 a45 a46 a47 a4b a4c a4d a51 a52 a53 a57 a58 a59 a5d a5e a5f

            uint8x8_t s4l = {16+14, 2, 3, 4, 8, 9, 10, 14};
            r0 = vtbl4_u8(z, s4l);
            memcpy(z3, &a5, sizeof a5);
            uint8x8_t s4h = {15, 16+0, 16+4, 16+5, 16+6, 16+10, 16+11, 16+12};
            r1 = vtbl4_u8(z, s4h);
            a6 = vcombine_u8 (r0, r1);

            uint8x8_t s5l = {1, 5, 6, 7, 11, 12, 13, 16+1};
            r0 = vtbl4_u8(z, s5l);
            uint8x8_t s5h = {2, 3, 7, 8, 9, 13, 14, 15};
            r1 = vtbl2_u8(*z3, s5h);
            a7 = vcombine_u8 (r0, r1);

            a2 = vhaddq_u8(a6, a7);

            memcpy(s,    &a0, 16);
            memcpy(s+16, &a1, 16);
            memcpy(s+32, &a2, 16);
#endif
            s+=48;
            t+=96;
            u+=96;
        }
        t += stride;
        u += stride;
    }
}
Example #9
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
                                             const uint8x8_t shuffle) {
  return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
                     vtbl1_u8(vget_high_u8(argb), shuffle));
}
void vp9_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
  uint8x16_t q0u8, q1u8, q2u8;
  int16x8_t q12s16, q13s16, q14s16, q15s16;
  uint16x4_t d6u16;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;

  q0u8 = vld1q_dup_u8(above - 1);
  q1u8 = vld1q_u8(above);
  q2u8 = vld1q_u8(above + 16);
  q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
  q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
  q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
  q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
  for (k = 0; k < 4; k++, left += 8) {
    d26u8 = vld1_u8(left);
    q3u16 = vmovl_u8(d26u8);
    d6u16 = vget_low_u16(q3u16);
    for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
      q0u16 = vdupq_lane_u16(d6u16, 0);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 1);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 2);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 3);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;
    }
  }
}
static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
    unsigned char *s,
    int p,
    const unsigned char *blimit) {
    unsigned char *src1;
    uint8x16_t qblimit, q0u8;
    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
    int16x8_t q2s16, q13s16, q11s16;
    int8x8_t d28s8, d29s8;
    int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
    uint8x8x4_t d0u8x4;  // d6, d7, d8, d9
    uint8x8x4_t d1u8x4;  // d10, d11, d12, d13
    uint8x8x2_t d2u8x2;  // d12, d13
    uint8x8x2_t d3u8x2;  // d14, d15

    qblimit = vdupq_n_u8(*blimit);

    src1 = s - 2;
    d0u8x4 = read_4x8(src1, p);
    src1 += p * 8;
    d1u8x4 = read_4x8(src1, p);

    q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]);  // d6 d10
    q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]);  // d8 d12
    q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]);  // d7 d11
    q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]);  // d9 d13

    q15u8 = vabdq_u8(q5u8, q4u8);
    q14u8 = vabdq_u8(q3u8, q6u8);

    q15u8 = vqaddq_u8(q15u8, q15u8);
    q14u8 = vshrq_n_u8(q14u8, 1);
    q0u8 = vdupq_n_u8(0x80);
    q11s16 = vdupq_n_s16(3);
    q15u8 = vqaddq_u8(q15u8, q14u8);

    q3u8 = veorq_u8(q3u8, q0u8);
    q4u8 = veorq_u8(q4u8, q0u8);
    q5u8 = veorq_u8(q5u8, q0u8);
    q6u8 = veorq_u8(q6u8, q0u8);

    q15u8 = vcgeq_u8(qblimit, q15u8);

    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)),
                     vget_low_s8(vreinterpretq_s8_u8(q5u8)));
    q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)),
                      vget_high_s8(vreinterpretq_s8_u8(q5u8)));

    q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8),
                      vreinterpretq_s8_u8(q6u8));

    q2s16 = vmulq_s16(q2s16, q11s16);
    q13s16 = vmulq_s16(q13s16, q11s16);

    q11u8 = vdupq_n_u8(3);
    q12u8 = vdupq_n_u8(4);

    q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8));
    q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8));

    d28s8 = vqmovn_s16(q2s16);
    d29s8 = vqmovn_s16(q13s16);
    q14s8 = vcombine_s8(d28s8, d29s8);

    q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8));

    q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8));
    q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8));
    q2s8 = vshrq_n_s8(q2s8, 3);
    q14s8 = vshrq_n_s8(q3s8, 3);

    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8);
    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8);

    q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
    q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);

    d2u8x2.val[0] = vget_low_u8(q6u8);   // d12
    d2u8x2.val[1] = vget_low_u8(q7u8);   // d14
    d3u8x2.val[0] = vget_high_u8(q6u8);  // d13
    d3u8x2.val[1] = vget_high_u8(q7u8);  // d15

    src1 = s - 1;
    write_2x8(src1, p, d2u8x2, d3u8x2);
}
// CHECK-LABEL: define <16 x i8> @test_vcombine_u8(<8 x i8> %low, <8 x i8> %high) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %low, <8 x i8> %high, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vcombine_u8(uint8x8_t low, uint8x8_t high) {
  return vcombine_u8(low, high);
}
void vp8_mbloop_filter_vertical_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    unsigned char *us, *ud;
    unsigned char *vs, *vd;
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    us = u - 4;
    vs = v - 4;
    d6 = vld1_u8(us);
    us += pitch;
    d7 = vld1_u8(vs);
    vs += pitch;
    d8 = vld1_u8(us);
    us += pitch;
    d9 = vld1_u8(vs);
    vs += pitch;
    d10 = vld1_u8(us);
    us += pitch;
    d11 = vld1_u8(vs);
    vs += pitch;
    d12 = vld1_u8(us);
    us += pitch;
    d13 = vld1_u8(vs);
    vs += pitch;
    d14 = vld1_u8(us);
    us += pitch;
    d15 = vld1_u8(vs);
    vs += pitch;
    d16 = vld1_u8(us);
    us += pitch;
    d17 = vld1_u8(vs);
    vs += pitch;
    d18 = vld1_u8(us);
    us += pitch;
    d19 = vld1_u8(vs);
    vs += pitch;
    d20 = vld1_u8(us);
    d21 = vld1_u8(vs);

    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);

    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                       vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                       vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q4, &q5, &q6, &q7, &q8, &q9);

    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                       vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                       vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    ud = u - 4;
    vst1_u8(ud, vget_low_u8(q3));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q4));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q5));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q6));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q7));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q8));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q9));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q10));

    vd = v - 4;
    vst1_u8(vd, vget_high_u8(q3));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q4));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q5));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q6));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q7));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q8));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q9));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q10));
    return;
}
void  yuv422rgb_neon_int(const unsigned char * sourcep, int source_byte_count,
			 unsigned char * destp)
{
  const unsigned char *source_endp;
  const unsigned char *vector_endp;
  int remainder;
  const int16x8_t u_coeff = {0, -22, 113, 0, 0, -22, 113, 0};
  const int16x8_t v_coeff = {90, -46, 0,  0, 90, -46, 0,  0};
  const uint8x8_t zeroalpha = {0x0, 0x0, 0x0, 0xFF, 0x0, 0x0, 0x0, 0xFF};
  const int16x8_t uvbias = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; 
  int16x8_t mp0_rgba;  /* macropixel 0's resulting RGBA RGBA pixels  */
  int16x8_t mp1_rgba; /* macropixel 1's resulting RGBA RGBA pixels  */
  uint8x8_t rawpixels; /* source pixels as {[YUYV]0 [YUYV]1}   */
  uint8x8_t rgba0, rgba1; /* rgba values as bytes  */
  uint8x16_t bothrgba;
  uint8_t * destinationp; /* pointer into output buffer destp  */
  int16x8_t widerpixels; /*  rawpixels promoted to shorts per component */
  const uint8x8_t yselect = {0xff, 0xff, 0xff, 0xff,
			     0x00, 0x00, 0x00, 0x00};
  
  
  /* we're working with things in 4-byte macropixels  */
  remainder = source_byte_count % 4;

  source_endp = sourcep + source_byte_count;
  vector_endp = source_endp - remainder;
  destinationp = (uint8_t *)destp;

  while (sourcep < vector_endp)
    {
     /* pull YUYV from 2 four byte macropixels starting at sourcep. */
      /* we'll increment sourcep as we go to save the array dereference */
      /* and separate increment instruction at the end of the loop  */

      /* load rawpixels with {[YUYV]0 [YUYV]1 } with byte components */
      rawpixels = vld1_u8(sourcep);
      sourcep += sizeof(rawpixels);

      widerpixels = vreinterpretq_s16_u16(vmovl_u8(rawpixels));
 


      
      /* ---------- process macropixel 0 --------------- */
      /* take macropixel zero ([YUYV]0) from rawpixels and   */
      /* compute the two RGBA pixels that come from it. store  */
      /* those two pixels in mp0_rgba  */
      {
	int16x8_t wider_yalpha;
	int16x8_t u_vec, v_vec, uv_vec;
	uint8x8_t narrow_yalpha;
	uint8x8_t y0_vec, y1_vec;
	int16x4_t yuyv;

	/* narrow_yalpha is drawn from [YUYV]0 and formed into */
	/* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha}   */
	/* this would have been a nice place for vtbx1_u8, but I  */
	/* can't get it to work, so I'll have to use vbsl_u8 instead.  */

	y0_vec = vdup_lane_u8(rawpixels, MP0_Y0);
	y1_vec = vdup_lane_u8(rawpixels, MP0_Y1);
	narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec);

	/* store ALPHA in elements 3 and 7 (after the RGB components)  */
	narrow_yalpha =  vset_lane_u8(ALPHA, narrow_yalpha, 3);
	narrow_yalpha =  vset_lane_u8(ALPHA, narrow_yalpha, 7);

	/* use vmovl_u8 to go from unsigned 8-bit to unsigned 16-bit,  */
	/* then use vreinterpretq_s16_u16 to change the interpretation  */
	/* from unsigned 16-bit to signed 16-bit.   */
	wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha));

	yuyv = vget_low_s16(widerpixels);
	
	/* form a vector of the U component from MP0  */
	u_vec = vdupq_lane_s16(yuyv, MP0_U);
	
	/* subtract uvbias from u_vec */
	u_vec = vsubq_s16(u_vec, uvbias);

	/* form a vector of the V component from MP0  */
	v_vec = vdupq_lane_s16(yuyv, MP0_V);
	
	/* subtract uvbias from v_vec */
	v_vec = vsubq_s16(v_vec, uvbias);

		
	/* Multiply eight 16-bit values in u_vec by eight 16-bit */
	/* values in u_coeff and store the results in u_vec.  */


	u_vec = vmulq_s16(u_vec, u_coeff);

	/* likewise multiply eight 16-bit values in v_vec by   */
	/* v_coeff and store the results in  v_vec */
	
	v_vec = vmulq_s16(v_vec, v_coeff);

	/* form uv_vec as the sum of u_vec & v_vec, then shift 6 places   */
	/* (dividing by 64)  */
	uv_vec = vaddq_s16(u_vec, v_vec);
	  
	uv_vec = vshrq_n_s16(uv_vec, 6);

	/* now mp0_rgba = y_vec + u_vec + v_vec  */
	mp0_rgba = vaddq_s16(wider_yalpha, uv_vec);

      }

      /* ---------- process macropixel 1 --------------- */
      /* take macropixel one ([YUYV]1) from rawpixels and   */
      /* compute the two RGBA pixels that come from it. store  */
      /* those two pixels in mp1_rgba  */      
      {
	int16x8_t wider_yalpha;
	int16x8_t u_vec, v_vec, uv_vec;
	uint8x8_t narrow_yalpha;
	uint8x8_t y0_vec, y1_vec;
	int16x4_t yuyv;

	/* narrow_yalpha is drawn from [YUYV]1 and formed into */
	/* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha}   */
	/* this would have been a nice place for vtbx1_u8, but I  */
	/* can't get it to work, so I'll have to use vbsl_u8 instead.  */

	y0_vec = vdup_lane_u8(rawpixels, MP1_Y0);
	y1_vec = vdup_lane_u8(rawpixels, MP1_Y1);
	narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec);
	  
	narrow_yalpha =  vset_lane_u8(ALPHA, narrow_yalpha, 3);
	narrow_yalpha =  vset_lane_u8(ALPHA, narrow_yalpha, 7);

	/* use vmovl_u8 to go from unsigned 8-bit to unsigned 16-bit,  */
	/* then use vreinterpretq_s16_u16 to reinterpret as signed 16-bit.  */


	wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha));

	yuyv = vget_high_s16(widerpixels);
	u_vec = vdupq_lane_s16(yuyv, 1);
	u_vec = vsubq_s16(u_vec, uvbias);
	
	v_vec = vdupq_lane_s16(yuyv, 3);
	v_vec = vsubq_s16(v_vec, uvbias);

		
	/* Multiply eight 16-bit values in u_vec by eight 16-bit */
	/* values in u_coeff and store the results in u_vec.  */


	u_vec = vmulq_s16(u_vec, u_coeff);

	/* likewise multiply eight 16-bit values in v_vec by   */
	/* v_coeff and store the results in  v_vec */
	
	v_vec = vmulq_s16(v_vec, v_coeff);
     
	/* form uv_vec as the sum of u_vec & v_vec, then shift 6 places   */
	/* (dividing by 64)  */
	uv_vec  = vaddq_s16(u_vec, v_vec);
	uv_vec = vshrq_n_s16(uv_vec, 6);


	/* now mp1_rgba = y_vec + u_vec + v_vec  */
	mp1_rgba = vaddq_s16(wider_yalpha, uv_vec);
      }
      

      /* turn mp0_rgba and mp1_rgba from vectors of shorts to vectors  */
      /* of unsigned chars. this will saturate, clipping the values  */
      /* between 0 and 255.   */
      
      rgba0 = vqmovun_s16(mp0_rgba);
      rgba1 = vqmovun_s16(mp1_rgba);

      /* make it faster to copy these back out of vector registers into  */
      /* memory by combining rgba0 and rgba1 into the larger bothrgba.   */
      /* then store that back into memory at destinationp.               */

      bothrgba = vcombine_u8(rgba0, rgba1);
      
      vst1q_u8(destinationp, bothrgba);
      destinationp += 16;
      
      
    }
}
Example #15
inline   uint8x16_t vcombine(const uint8x8_t   & v0, const uint8x8_t   & v1) { return vcombine_u8 (v0, v1); }
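A minimal sketch of the split/widen/narrow/recombine pattern this overload supports, the same shape most examples in this listing follow; the +16 offset is purely illustrative.

#include <arm_neon.h>

// Minimal sketch: split a 16-byte vector into halves, widen to 16 bits,
// add an illustrative offset, narrow with saturation, and recombine
// using the vcombine overload above. Illustrative only.
inline uint8x16_t add_offset_saturating(const uint8x16_t &v) {
  const uint16x8_t lo = vaddq_u16(vmovl_u8(vget_low_u8(v)), vdupq_n_u16(16));
  const uint16x8_t hi = vaddq_u16(vmovl_u8(vget_high_u8(v)), vdupq_n_u16(16));
  return vcombine(vqmovn_u16(lo), vqmovn_u16(hi));
}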
Example #16
void phase(const Size2D &size,
           const s16 * src0Base, ptrdiff_t src0Stride,
           const s16 * src1Base, ptrdiff_t src1Stride,
           u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    FASTATAN2CONST(256.0f / 360.0f)
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    float32x4_t v_05 = vdupq_n_f32(0.5f);

    for (size_t i = 0; i < size.height; ++i)
    {
        const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        for (; j < roiw16; j += 16)
        {
            internal::prefetch(src0 + j);
            internal::prefetch(src1 + j);

            int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
            int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);

            // 0
            float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
            float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
            float32x4_t v_dst32f0;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)

            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
            float32x4_t v_dst32f1;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)

            uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
                                                vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));

            // 1
            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src11)));
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)

            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src01)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11)));
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)

            uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
                                                vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));

            vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0),
                                          vmovn_u16(v_dst16s1)));
        }
        for (; j < roiw8; j += 8)
        {
            int16x8_t v_src0 = vld1q_s16(src0 + j);
            int16x8_t v_src1 = vld1q_s16(src1 + j);

            float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src0)));
            float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1)));
            float32x4_t v_dst32f0;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)

            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src0)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1)));
            float32x4_t v_dst32f1;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)

            uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
                                            vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));

            vst1_u8(dst + j, vmovn_u16(v_dst));
        }

        for (; j < size.width; j++)
        {
            f32 x = src0[j], y = src1[j];
            f32 a;
            FASTATAN2SCALAR(y, x, a)
            dst[j] = (u8)(s32)floor(a + 0.5f);
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
unsigned int vp8_sub_pixel_variance16x16_neon_func(
        const unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        const unsigned char *dst_ptr,
        int dst_pixels_per_line,
        unsigned int *sse) {
    int i;
    DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528);
    unsigned char *tmpp;
    unsigned char *tmpp2;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
    uint8x8_t d19u8, d20u8, d21u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64, d2s64, d3s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
    uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
    uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
    uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    tmpp2 = tmp + 272;
    tmpp = tmp;
    if (xoffset == 0) {  // secondpass_bfilter16x16_only
        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);

        q11u8 = vld1q_u8(src_ptr);
        src_ptr += src_pixels_per_line;
        for (i = 4; i > 0; i--) {
            q12u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q13u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q14u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q15u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;

            __builtin_prefetch(src_ptr);
            __builtin_prefetch(src_ptr + src_pixels_per_line);
            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);

            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);

            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);

            d2u8 = vqrshrn_n_u16(q1u16, 7);
            d3u8 = vqrshrn_n_u16(q2u16, 7);
            d4u8 = vqrshrn_n_u16(q3u16, 7);
            d5u8 = vqrshrn_n_u16(q4u16, 7);
            d6u8 = vqrshrn_n_u16(q5u16, 7);
            d7u8 = vqrshrn_n_u16(q6u16, 7);
            d8u8 = vqrshrn_n_u16(q7u16, 7);
            d9u8 = vqrshrn_n_u16(q8u16, 7);

            q1u8 = vcombine_u8(d2u8, d3u8);
            q2u8 = vcombine_u8(d4u8, d5u8);
            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);

            q11u8 = q15u8;

            vst1q_u8((uint8_t *)tmpp2, q1u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q2u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q3u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q4u8);
            tmpp2 += 16;
        }
    } else if (yoffset == 0) {  // firstpass_bfilter16x16_only
        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);

        for (i = 4; i > 0 ; i--) {
            d2u8 = vld1_u8(src_ptr);
            d3u8 = vld1_u8(src_ptr + 8);
            d4u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d5u8 = vld1_u8(src_ptr);
            d6u8 = vld1_u8(src_ptr + 8);
            d7u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d8u8 = vld1_u8(src_ptr);
            d9u8 = vld1_u8(src_ptr + 8);
            d10u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d11u8 = vld1_u8(src_ptr);
            d12u8 = vld1_u8(src_ptr + 8);
            d13u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;

            __builtin_prefetch(src_ptr);
            __builtin_prefetch(src_ptr + src_pixels_per_line);
            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);

            q7u16  = vmull_u8(d2u8, d0u8);
            q8u16  = vmull_u8(d3u8, d0u8);
            q9u16  = vmull_u8(d5u8, d0u8);
            q10u16 = vmull_u8(d6u8, d0u8);
            q11u16 = vmull_u8(d8u8, d0u8);
            q12u16 = vmull_u8(d9u8, d0u8);
            q13u16 = vmull_u8(d11u8, d0u8);
            q14u16 = vmull_u8(d12u8, d0u8);

            d2u8  = vext_u8(d2u8, d3u8, 1);
            d5u8  = vext_u8(d5u8, d6u8, 1);
            d8u8  = vext_u8(d8u8, d9u8, 1);
            d11u8 = vext_u8(d11u8, d12u8, 1);

            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);

            d3u8  = vext_u8(d3u8, d4u8, 1);
            d6u8  = vext_u8(d6u8, d7u8, 1);
            d9u8  = vext_u8(d9u8, d10u8, 1);
            d12u8 = vext_u8(d12u8, d13u8, 1);

            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);

            d14u8 = vqrshrn_n_u16(q7u16, 7);
            d15u8 = vqrshrn_n_u16(q8u16, 7);
            d16u8 = vqrshrn_n_u16(q9u16, 7);
            d17u8 = vqrshrn_n_u16(q10u16, 7);
            d18u8 = vqrshrn_n_u16(q11u16, 7);
            d19u8 = vqrshrn_n_u16(q12u16, 7);
            d20u8 = vqrshrn_n_u16(q13u16, 7);
            d21u8 = vqrshrn_n_u16(q14u16, 7);

            q7u8  = vcombine_u8(d14u8, d15u8);
            q8u8  = vcombine_u8(d16u8, d17u8);
            q9u8  = vcombine_u8(d18u8, d19u8);
            q10u8 = vcombine_u8(d20u8, d21u8);

            vst1q_u8((uint8_t *)tmpp2, q7u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q8u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q9u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q10u8);
            tmpp2 += 16;
        }
    } else {
        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);

        d2u8 = vld1_u8(src_ptr);
        d3u8 = vld1_u8(src_ptr + 8);
        d4u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d5u8 = vld1_u8(src_ptr);
        d6u8 = vld1_u8(src_ptr + 8);
        d7u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d8u8 = vld1_u8(src_ptr);
        d9u8 = vld1_u8(src_ptr + 8);
        d10u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d11u8 = vld1_u8(src_ptr);
        d12u8 = vld1_u8(src_ptr + 8);
        d13u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;

        // First Pass: output_height lines x output_width columns (17x16)
        for (i = 3; i > 0; i--) {
            q7u16  = vmull_u8(d2u8, d0u8);
            q8u16  = vmull_u8(d3u8, d0u8);
            q9u16  = vmull_u8(d5u8, d0u8);
            q10u16 = vmull_u8(d6u8, d0u8);
            q11u16 = vmull_u8(d8u8, d0u8);
            q12u16 = vmull_u8(d9u8, d0u8);
            q13u16 = vmull_u8(d11u8, d0u8);
            q14u16 = vmull_u8(d12u8, d0u8);

            d2u8  = vext_u8(d2u8, d3u8, 1);
            d5u8  = vext_u8(d5u8, d6u8, 1);
            d8u8  = vext_u8(d8u8, d9u8, 1);
            d11u8 = vext_u8(d11u8, d12u8, 1);

            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);

            d3u8  = vext_u8(d3u8, d4u8, 1);
            d6u8  = vext_u8(d6u8, d7u8, 1);
            d9u8  = vext_u8(d9u8, d10u8, 1);
            d12u8 = vext_u8(d12u8, d13u8, 1);

            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);

            d14u8 = vqrshrn_n_u16(q7u16, 7);
            d15u8 = vqrshrn_n_u16(q8u16, 7);
            d16u8 = vqrshrn_n_u16(q9u16, 7);
            d17u8 = vqrshrn_n_u16(q10u16, 7);
            d18u8 = vqrshrn_n_u16(q11u16, 7);
            d19u8 = vqrshrn_n_u16(q12u16, 7);
            d20u8 = vqrshrn_n_u16(q13u16, 7);
            d21u8 = vqrshrn_n_u16(q14u16, 7);

            d2u8 = vld1_u8(src_ptr);
            d3u8 = vld1_u8(src_ptr + 8);
            d4u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d5u8 = vld1_u8(src_ptr);
            d6u8 = vld1_u8(src_ptr + 8);
            d7u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d8u8 = vld1_u8(src_ptr);
            d9u8 = vld1_u8(src_ptr + 8);
            d10u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d11u8 = vld1_u8(src_ptr);
            d12u8 = vld1_u8(src_ptr + 8);
            d13u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;

            q7u8 = vcombine_u8(d14u8, d15u8);
            q8u8 = vcombine_u8(d16u8, d17u8);
            q9u8 = vcombine_u8(d18u8, d19u8);
            q10u8 = vcombine_u8(d20u8, d21u8);

            vst1q_u8((uint8_t *)tmpp, q7u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q8u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q9u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q10u8);
            tmpp += 16;
        }

        // First-pass filtering for the remaining 5 lines
        d14u8 = vld1_u8(src_ptr);
        d15u8 = vld1_u8(src_ptr + 8);
        d16u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;

        q9u16  = vmull_u8(d2u8, d0u8);
        q10u16 = vmull_u8(d3u8, d0u8);
        q11u16 = vmull_u8(d5u8, d0u8);
        q12u16 = vmull_u8(d6u8, d0u8);
        q13u16 = vmull_u8(d8u8, d0u8);
        q14u16 = vmull_u8(d9u8, d0u8);

        d2u8  = vext_u8(d2u8, d3u8, 1);
        d5u8  = vext_u8(d5u8, d6u8, 1);
        d8u8  = vext_u8(d8u8, d9u8, 1);

        q9u16  = vmlal_u8(q9u16, d2u8, d1u8);
        q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
        q13u16 = vmlal_u8(q13u16, d8u8, d1u8);

        d3u8  = vext_u8(d3u8, d4u8, 1);
        d6u8  = vext_u8(d6u8, d7u8, 1);
        d9u8  = vext_u8(d9u8, d10u8, 1);

        q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
        q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
        q14u16 = vmlal_u8(q14u16, d9u8, d1u8);

        q1u16 = vmull_u8(d11u8, d0u8);
        q2u16 = vmull_u8(d12u8, d0u8);
        q3u16 = vmull_u8(d14u8, d0u8);
        q4u16 = vmull_u8(d15u8, d0u8);

        d11u8 = vext_u8(d11u8, d12u8, 1);
        d14u8 = vext_u8(d14u8, d15u8, 1);

        q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
        q3u16 = vmlal_u8(q3u16, d14u8, d1u8);

        d12u8 = vext_u8(d12u8, d13u8, 1);
        d15u8 = vext_u8(d15u8, d16u8, 1);

        q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
        q4u16 = vmlal_u8(q4u16, d15u8, d1u8);

        d10u8 = vqrshrn_n_u16(q9u16, 7);
        d11u8 = vqrshrn_n_u16(q10u16, 7);
        d12u8 = vqrshrn_n_u16(q11u16, 7);
        d13u8 = vqrshrn_n_u16(q12u16, 7);
        d14u8 = vqrshrn_n_u16(q13u16, 7);
        d15u8 = vqrshrn_n_u16(q14u16, 7);
        d16u8 = vqrshrn_n_u16(q1u16, 7);
        d17u8 = vqrshrn_n_u16(q2u16, 7);
        d18u8 = vqrshrn_n_u16(q3u16, 7);
        d19u8 = vqrshrn_n_u16(q4u16, 7);

        q5u8 = vcombine_u8(d10u8, d11u8);
        q6u8 = vcombine_u8(d12u8, d13u8);
        q7u8 = vcombine_u8(d14u8, d15u8);
        q8u8 = vcombine_u8(d16u8, d17u8);
        q9u8 = vcombine_u8(d18u8, d19u8);

        vst1q_u8((uint8_t *)tmpp, q5u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q6u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q7u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q8u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q9u8);

        // secondpass_filter
        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);

        tmpp = tmp;
        tmpp2 = tmpp + 272;
        q11u8 = vld1q_u8(tmpp);
        tmpp += 16;
        for (i = 4; i > 0; i--) {
            q12u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q13u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q14u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q15u8 = vld1q_u8(tmpp);
            tmpp += 16;

            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);

            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);

            d2u8 = vqrshrn_n_u16(q1u16, 7);
            d3u8 = vqrshrn_n_u16(q2u16, 7);
            d4u8 = vqrshrn_n_u16(q3u16, 7);
            d5u8 = vqrshrn_n_u16(q4u16, 7);
            d6u8 = vqrshrn_n_u16(q5u16, 7);
            d7u8 = vqrshrn_n_u16(q6u16, 7);
            d8u8 = vqrshrn_n_u16(q7u16, 7);
            d9u8 = vqrshrn_n_u16(q8u16, 7);

            q1u8 = vcombine_u8(d2u8, d3u8);
            q2u8 = vcombine_u8(d4u8, d5u8);
            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);

            q11u8 = q15u8;

            vst1q_u8((uint8_t *)tmpp2, q1u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q2u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q3u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q4u8);
            tmpp2 += 16;
        }
    }

    // sub_pixel_variance16x16_neon
    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    tmpp = tmp + 272;
    for (i = 0; i < 8; i++) {  // sub_pixel_variance16x16_neon_loop
        q0u8 = vld1q_u8(tmpp);
        tmpp += 16;
        q1u8 = vld1q_u8(tmpp);
        tmpp += 16;
        q2u8 = vld1q_u8(dst_ptr);
        dst_ptr += dst_pixels_per_line;
        q3u8 = vld1q_u8(dst_ptr);
        dst_ptr += dst_pixels_per_line;

        d0u8 = vget_low_u8(q0u8);
        d1u8 = vget_high_u8(q0u8);
        d2u8 = vget_low_u8(q1u8);
        d3u8 = vget_high_u8(q1u8);

        q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8));
        q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8));
        q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8));
        q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vget_low_s64(q0s64);
    d1s64 = vget_high_s64(q0s64);
    d2s64 = vget_low_s64(q1s64);
    d3s64 = vget_high_s64(q1s64);
    d0s64 = vadd_s64(d0s64, d1s64);
    d1s64 = vadd_s64(d2s64, d3s64);

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
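The reduction at the end of the function above is the usual block-variance identity, variance = SSE - sum^2 / N with N = 256 pixels (the >> 8 in the vector code). A scalar sketch for comparison:

/* Scalar sketch of the final reduction above: accumulate the sum and the
 * sum of squared differences over a 16x16 block, then return
 * SSE - (sum * sum) / 256. Illustrative only. */
static unsigned int variance_16x16_scalar(const unsigned char *a, int a_stride,
                                          const unsigned char *b, int b_stride,
                                          unsigned int *sse) {
    int i, j;
    long long sum = 0;
    unsigned int sse_local = 0;
    for (i = 0; i < 16; ++i) {
        for (j = 0; j < 16; ++j) {
            const int diff = a[j] - b[j];
            sum += diff;
            sse_local += (unsigned int)(diff * diff);
        }
        a += a_stride;
        b += b_stride;
    }
    *sse = sse_local;
    return sse_local - (unsigned int)((sum * sum) >> 8);
}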
Example #18
void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y,  // unused
                              int y_step_q4,            // unused
                              int w, int h) {
  int width;
  const uint8_t *s, *psrc;
  uint8_t *d, *pdst;
  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
  uint8x16_t q12u8, q13u8, q14u8, q15u8;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;
  uint16x8x2_t q0x2u16;
  uint8x8x2_t d0x2u8, d1x2u8;
  uint32x2x2_t d0x2u32;
  uint16x4x2_t d0x2u16, d1x2u16;
  uint32x4x2_t q0x2u32;

  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_y;

  q0s16 = vld1q_s16(filter_x);

  src -= 3;  // adjust for taps
  for (; h > 0; h -= 4, src += src_stride * 4,
                dst += dst_stride * 4) {  // loop_horiz_v
    s = src;
    d24u8 = vld1_u8(s);
    s += src_stride;
    d25u8 = vld1_u8(s);
    s += src_stride;
    d26u8 = vld1_u8(s);
    s += src_stride;
    d27u8 = vld1_u8(s);

    q12u8 = vcombine_u8(d24u8, d25u8);
    q13u8 = vcombine_u8(d26u8, d27u8);

    q0x2u16 =
        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
    d0x2u8 = vtrn_u8(d24u8, d25u8);
    d1x2u8 = vtrn_u8(d26u8, d27u8);

    __builtin_prefetch(src + src_stride * 4);
    __builtin_prefetch(src + src_stride * 5);
    __builtin_prefetch(src + src_stride * 6);

    q8u16 = vmovl_u8(d0x2u8.val[0]);
    q9u16 = vmovl_u8(d0x2u8.val[1]);
    q10u16 = vmovl_u8(d1x2u8.val[0]);
    q11u16 = vmovl_u8(d1x2u8.val[1]);

    d16u16 = vget_low_u16(q8u16);
    d17u16 = vget_high_u16(q8u16);
    d18u16 = vget_low_u16(q9u16);
    d19u16 = vget_high_u16(q9u16);
    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
    q9u16 = vcombine_u16(d17u16, d19u16);

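    // q8u16..q11u16 hold the widened source samples for the current window;
    // the inner loop slides this window four output columns per iteration.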
    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
    for (width = w, psrc = src + 7, pdst = dst; width > 0;
         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
      s = psrc;
      d28u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d29u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d31u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d30u32 = vld1_dup_u32((const uint32_t *)s);

      __builtin_prefetch(psrc + 64);

      d0x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
      d1x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30

      __builtin_prefetch(psrc + 64 + src_stride);

      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
      q0x2u32 =
          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));

      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
      q12u16 = vmovl_u8(d28u8);
      q13u16 = vmovl_u8(d29u8);

      __builtin_prefetch(psrc + 64 + src_stride * 2);

      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

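      // MULTIPLY_BY_Q0 (a helper not shown in this excerpt) multiply-accumulates
      // the eight 4-lane inputs against the 8-tap filter in q0s16, yielding
      // four 32-bit results per call.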
      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
                             d23s16, d24s16, q0s16);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
                             d24s16, d26s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
                              d26s16, d27s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      __builtin_prefetch(psrc + 60 + src_stride * 3);

      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);

      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
                         vreinterpret_u32_u16(d0x2u16.val[1]));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
                       vreinterpret_u8_u32(d0x2u32.val[1]));

      d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
      d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);

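      // Store the transposed 4x4 block of filtered pixels, four bytes per
      // destination row.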
      d = pdst;
      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);

      q8u16 = q9u16;
      d20s16 = d23s16;
      q11u16 = q12u16;
      q9u16 = q13u16;
      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    }
  }
  return;
}
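For reference, a minimal caller sketch for the kernel above. The wrapper name, filter taps, and block size are illustrative only; real callers pass one of the library's filter tables, and x_step_q4 must be 16 as the assert requires. Source rows must be readable from src - 3 onward, since the 8-tap window spans src[x - 3]..src[x + 4].
#include <stddef.h>
#include <stdint.h>

// Hypothetical wrapper: horizontally filter a 64x64 block with an 8-tap
// filter whose taps sum to 128 (Q7), matching the rounding shift of 7 used
// by aom_convolve8_horiz_neon above.
void filter_block_horiz(const uint8_t *src, ptrdiff_t src_stride,
                        uint8_t *dst, ptrdiff_t dst_stride) {
  static const int16_t taps[8] = { -1, 3, -10, 122, 18, -7, 3, 0 };
  aom_convolve8_horiz_neon(src, src_stride, dst, dst_stride,
                           taps, /*x_step_q4=*/16,
                           taps, /*y_step_q4=*/16,  // vertical args are unused here
                           /*w=*/64, /*h=*/64);
}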
Example #19
void vp8_sixtap_predict16x16_neon(
    unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *dst_ptr,
    int dst_pitch) {
    unsigned char *src, *src_tmp, *dst, *tmpp;
    unsigned char tmp[336];
    int i, j;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
    uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
    uint8x8_t d28u8, d29u8, d30u8, d31u8;
    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    uint8x16_t q3u8, q4u8;
    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
    uint16x8_t q11u16, q12u16, q13u16, q15u16;
    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
    int16x8_t q11s16, q12s16, q13s16, q15s16;

    if (xoffset == 0) {  // secondpass_filter16x16_only (two 8-wide columns)
        // load second_pass filter
        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
        d0s8 = vdup_lane_s8(dtmps8, 0);
        d1s8 = vdup_lane_s8(dtmps8, 1);
        d2s8 = vdup_lane_s8(dtmps8, 2);
        d3s8 = vdup_lane_s8(dtmps8, 3);
        d4s8 = vdup_lane_s8(dtmps8, 4);
        d5s8 = vdup_lane_s8(dtmps8, 5);
        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

        // load src data
        src_tmp = src_ptr - src_pixels_per_line * 2;
        for (i = 0; i < 2; i++) {  // left and right 8-pixel-wide halves
            src = src_tmp + i * 8;
            dst = dst_ptr + i * 8;
            d18u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d19u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d20u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d21u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d22u8 = vld1_u8(src);
            src += src_pixels_per_line;
            for (j = 0; j < 4; j++) {  // four rows of output per iteration, 16 in total
                d23u8 = vld1_u8(src);
                src += src_pixels_per_line;
                d24u8 = vld1_u8(src);
                src += src_pixels_per_line;
                d25u8 = vld1_u8(src);
                src += src_pixels_per_line;
                d26u8 = vld1_u8(src);
                src += src_pixels_per_line;

                q3u16 = vmull_u8(d18u8, d0u8);
                q4u16 = vmull_u8(d19u8, d0u8);
                q5u16 = vmull_u8(d20u8, d0u8);
                q6u16 = vmull_u8(d21u8, d0u8);

                q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
                q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
                q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
                q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

                q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
                q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
                q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
                q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

                q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
                q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
                q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
                q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

                q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
                q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
                q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
                q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

                q7u16 = vmull_u8(d21u8, d3u8);
                q8u16 = vmull_u8(d22u8, d3u8);
                q9u16 = vmull_u8(d23u8, d3u8);
                q10u16 = vmull_u8(d24u8, d3u8);

                q3s16 = vreinterpretq_s16_u16(q3u16);
                q4s16 = vreinterpretq_s16_u16(q4u16);
                q5s16 = vreinterpretq_s16_u16(q5u16);
                q6s16 = vreinterpretq_s16_u16(q6u16);
                q7s16 = vreinterpretq_s16_u16(q7u16);
                q8s16 = vreinterpretq_s16_u16(q8u16);
                q9s16 = vreinterpretq_s16_u16(q9u16);
                q10s16 = vreinterpretq_s16_u16(q10u16);

                q7s16 = vqaddq_s16(q7s16, q3s16);
                q8s16 = vqaddq_s16(q8s16, q4s16);
                q9s16 = vqaddq_s16(q9s16, q5s16);
                q10s16 = vqaddq_s16(q10s16, q6s16);

                d6u8 = vqrshrun_n_s16(q7s16, 7);
                d7u8 = vqrshrun_n_s16(q8s16, 7);
                d8u8 = vqrshrun_n_s16(q9s16, 7);
                d9u8 = vqrshrun_n_s16(q10s16, 7);

                d18u8 = d22u8;
                d19u8 = d23u8;
                d20u8 = d24u8;
                d21u8 = d25u8;
                d22u8 = d26u8;

                vst1_u8(dst, d6u8);
                dst += dst_pitch;
                vst1_u8(dst, d7u8);
                dst += dst_pitch;
                vst1_u8(dst, d8u8);
                dst += dst_pitch;
                vst1_u8(dst, d9u8);
                dst += dst_pitch;
            }
        }
        return;
    }

    // load first_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    // First pass: horizontal 6-tap filtering.
    if (yoffset == 0) {  // firstpass_filter16x16_only
        src = src_ptr - 2;
        dst = dst_ptr;
        for (i = 0; i < 8; i++) {  // two rows of output per iteration, 16 in total
            d6u8 = vld1_u8(src);
            d7u8 = vld1_u8(src + 8);
            d8u8 = vld1_u8(src + 16);
            src += src_pixels_per_line;
            d9u8 = vld1_u8(src);
            d10u8 = vld1_u8(src + 8);
            d11u8 = vld1_u8(src + 16);
            src += src_pixels_per_line;

            __builtin_prefetch(src);
            __builtin_prefetch(src + src_pixels_per_line);

            q6u16 = vmull_u8(d6u8, d0u8);
            q7u16 = vmull_u8(d7u8, d0u8);
            q8u16 = vmull_u8(d9u8, d0u8);
            q9u16 = vmull_u8(d10u8, d0u8);

            d20u8 = vext_u8(d6u8, d7u8, 1);
            d21u8 = vext_u8(d9u8, d10u8, 1);
            d22u8 = vext_u8(d7u8, d8u8, 1);
            d23u8 = vext_u8(d10u8, d11u8, 1);
            d24u8 = vext_u8(d6u8, d7u8, 4);
            d25u8 = vext_u8(d9u8, d10u8, 4);
            d26u8 = vext_u8(d7u8, d8u8, 4);
            d27u8 = vext_u8(d10u8, d11u8, 4);
            d28u8 = vext_u8(d6u8, d7u8, 5);
            d29u8 = vext_u8(d9u8, d10u8, 5);

            q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
            q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
            q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
            q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
            q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
            q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
            q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
            q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
            q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
            q8u16 = vmlal_u8(q8u16, d29u8, d5u8);

            d20u8 = vext_u8(d7u8, d8u8, 5);
            d21u8 = vext_u8(d10u8, d11u8, 5);
            d22u8 = vext_u8(d6u8, d7u8, 2);
            d23u8 = vext_u8(d9u8, d10u8, 2);
            d24u8 = vext_u8(d7u8, d8u8, 2);
            d25u8 = vext_u8(d10u8, d11u8, 2);
            d26u8 = vext_u8(d6u8, d7u8, 3);
            d27u8 = vext_u8(d9u8, d10u8, 3);
            d28u8 = vext_u8(d7u8, d8u8, 3);
            d29u8 = vext_u8(d10u8, d11u8, 3);

            q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
            q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
            q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
            q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
            q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
            q9u16 = vmlal_u8(q9u16, d25u8, d2u8);

            q10u16 = vmull_u8(d26u8, d3u8);
            q11u16 = vmull_u8(d27u8, d3u8);
            q12u16 = vmull_u8(d28u8, d3u8);
            q15u16 = vmull_u8(d29u8, d3u8);

            q6s16 = vreinterpretq_s16_u16(q6u16);
            q7s16 = vreinterpretq_s16_u16(q7u16);
            q8s16 = vreinterpretq_s16_u16(q8u16);
            q9s16 = vreinterpretq_s16_u16(q9u16);
            q10s16 = vreinterpretq_s16_u16(q10u16);
            q11s16 = vreinterpretq_s16_u16(q11u16);
            q12s16 = vreinterpretq_s16_u16(q12u16);
            q15s16 = vreinterpretq_s16_u16(q15u16);

            q6s16 = vqaddq_s16(q6s16, q10s16);
            q8s16 = vqaddq_s16(q8s16, q11s16);
            q7s16 = vqaddq_s16(q7s16, q12s16);
            q9s16 = vqaddq_s16(q9s16, q15s16);

            d6u8 = vqrshrun_n_s16(q6s16, 7);
            d7u8 = vqrshrun_n_s16(q7s16, 7);
            d8u8 = vqrshrun_n_s16(q8s16, 7);
            d9u8 = vqrshrun_n_s16(q9s16, 7);

            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);
            vst1q_u8(dst, q3u8);
            dst += dst_pitch;
            vst1q_u8(dst, q4u8);
            dst += dst_pitch;
        }
        return;
    }

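    // First pass (two-pass case): filter 21 rows x 16 columns horizontally and
    // store the intermediate result in tmp (21 * 16 = 336 bytes).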
    src = src_ptr - 2 - src_pixels_per_line * 2;
    tmpp = tmp;
    for (i = 0; i < 7; i++) {  // three rows of intermediate output per iteration, 21 in total
        d6u8 = vld1_u8(src);
        d7u8 = vld1_u8(src + 8);
        d8u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;
        d9u8 = vld1_u8(src);
        d10u8 = vld1_u8(src + 8);
        d11u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;
        d12u8 = vld1_u8(src);
        d13u8 = vld1_u8(src + 8);
        d14u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;

        __builtin_prefetch(src);
        __builtin_prefetch(src + src_pixels_per_line);
        __builtin_prefetch(src + src_pixels_per_line * 2);

        q8u16 = vmull_u8(d6u8, d0u8);
        q9u16 = vmull_u8(d7u8, d0u8);
        q10u16 = vmull_u8(d9u8, d0u8);
        q11u16 = vmull_u8(d10u8, d0u8);
        q12u16 = vmull_u8(d12u8, d0u8);
        q13u16 = vmull_u8(d13u8, d0u8);

        d28u8 = vext_u8(d6u8, d7u8, 1);
        d29u8 = vext_u8(d9u8, d10u8, 1);
        d30u8 = vext_u8(d12u8, d13u8, 1);
        q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
        q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
        q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
        d28u8 = vext_u8(d7u8, d8u8, 1);
        d29u8 = vext_u8(d10u8, d11u8, 1);
        d30u8 = vext_u8(d13u8, d14u8, 1);
        q9u16  = vmlsl_u8(q9u16, d28u8, d1u8);
        q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
        q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);

        d28u8 = vext_u8(d6u8, d7u8, 4);
        d29u8 = vext_u8(d9u8, d10u8, 4);
        d30u8 = vext_u8(d12u8, d13u8, 4);
        q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
        q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
        q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
        d28u8 = vext_u8(d7u8, d8u8, 4);
        d29u8 = vext_u8(d10u8, d11u8, 4);
        d30u8 = vext_u8(d13u8, d14u8, 4);
        q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
        q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
        q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);

        d28u8 = vext_u8(d6u8, d7u8, 5);
        d29u8 = vext_u8(d9u8, d10u8, 5);
        d30u8 = vext_u8(d12u8, d13u8, 5);
        q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
        q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
        q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
        d28u8 = vext_u8(d7u8, d8u8, 5);
        d29u8 = vext_u8(d10u8, d11u8, 5);
        d30u8 = vext_u8(d13u8, d14u8, 5);
        q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
        q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
        q13u16 = vmlal_u8(q13u16, d30u8, d5u8);

        d28u8 = vext_u8(d6u8, d7u8, 2);
        d29u8 = vext_u8(d9u8, d10u8, 2);
        d30u8 = vext_u8(d12u8, d13u8, 2);
        q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
        q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
        q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
        d28u8 = vext_u8(d7u8, d8u8, 2);
        d29u8 = vext_u8(d10u8, d11u8, 2);
        d30u8 = vext_u8(d13u8, d14u8, 2);
        q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
        q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
        q13u16 = vmlal_u8(q13u16, d30u8, d2u8);

        d28u8 = vext_u8(d6u8, d7u8, 3);
        d29u8 = vext_u8(d9u8, d10u8, 3);
        d30u8 = vext_u8(d12u8, d13u8, 3);
        d15u8 = vext_u8(d7u8, d8u8, 3);
        d31u8 = vext_u8(d10u8, d11u8, 3);
        d6u8  = vext_u8(d13u8, d14u8, 3);
        q4u16 = vmull_u8(d28u8, d3u8);
        q5u16 = vmull_u8(d29u8, d3u8);
        q6u16 = vmull_u8(d30u8, d3u8);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);
        q12s16 = vreinterpretq_s16_u16(q12u16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q10s16 = vqaddq_s16(q10s16, q5s16);
        q12s16 = vqaddq_s16(q12s16, q6s16);

        q6u16 = vmull_u8(d15u8, d3u8);
        q7u16 = vmull_u8(d31u8, d3u8);
        q3u16 = vmull_u8(d6u8, d3u8);
        q3s16 = vreinterpretq_s16_u16(q3u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q11s16 = vreinterpretq_s16_u16(q11u16);
        q13s16 = vreinterpretq_s16_u16(q13u16);
        q9s16 = vqaddq_s16(q9s16, q6s16);
        q11s16 = vqaddq_s16(q11s16, q7s16);
        q13s16 = vqaddq_s16(q13s16, q3s16);

        d6u8 = vqrshrun_n_s16(q8s16, 7);
        d7u8 = vqrshrun_n_s16(q9s16, 7);
        d8u8 = vqrshrun_n_s16(q10s16, 7);
        d9u8 = vqrshrun_n_s16(q11s16, 7);
        d10u8 = vqrshrun_n_s16(q12s16, 7);
        d11u8 = vqrshrun_n_s16(q13s16, 7);

        vst1_u8(tmpp, d6u8);
        tmpp += 8;
        vst1_u8(tmpp, d7u8);
        tmpp += 8;
        vst1_u8(tmpp, d8u8);
        tmpp += 8;
        vst1_u8(tmpp, d9u8);
        tmpp += 8;
        vst1_u8(tmpp, d10u8);
        tmpp += 8;
        vst1_u8(tmpp, d11u8);
        tmpp += 8;
    }

    // Second pass: 16x16
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    for (i = 0; i < 2; i++) {  // left and right 8-pixel-wide halves
        dst = dst_ptr + 8 * i;
        tmpp = tmp + 8 * i;
        d18u8 = vld1_u8(tmpp);
        tmpp += 16;
        d19u8 = vld1_u8(tmpp);
        tmpp += 16;
        d20u8 = vld1_u8(tmpp);
        tmpp += 16;
        d21u8 = vld1_u8(tmpp);
        tmpp += 16;
        d22u8 = vld1_u8(tmpp);
        tmpp += 16;
        for (j = 0; j < 4; j++) {  // four rows of output per iteration, 16 in total
            d23u8 = vld1_u8(tmpp);
            tmpp += 16;
            d24u8 = vld1_u8(tmpp);
            tmpp += 16;
            d25u8 = vld1_u8(tmpp);
            tmpp += 16;
            d26u8 = vld1_u8(tmpp);
            tmpp += 16;

            q3u16 = vmull_u8(d18u8, d0u8);
            q4u16 = vmull_u8(d19u8, d0u8);
            q5u16 = vmull_u8(d20u8, d0u8);
            q6u16 = vmull_u8(d21u8, d0u8);

            q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
            q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
            q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
            q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

            q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
            q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
            q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
            q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

            q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
            q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
            q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
            q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

            q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
            q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
            q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
            q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

            q7u16 = vmull_u8(d21u8, d3u8);
            q8u16 = vmull_u8(d22u8, d3u8);
            q9u16 = vmull_u8(d23u8, d3u8);
            q10u16 = vmull_u8(d24u8, d3u8);

            q3s16 = vreinterpretq_s16_u16(q3u16);
            q4s16 = vreinterpretq_s16_u16(q4u16);
            q5s16 = vreinterpretq_s16_u16(q5u16);
            q6s16 = vreinterpretq_s16_u16(q6u16);
            q7s16 = vreinterpretq_s16_u16(q7u16);
            q8s16 = vreinterpretq_s16_u16(q8u16);
            q9s16 = vreinterpretq_s16_u16(q9u16);
            q10s16 = vreinterpretq_s16_u16(q10u16);

            q7s16 = vqaddq_s16(q7s16, q3s16);
            q8s16 = vqaddq_s16(q8s16, q4s16);
            q9s16 = vqaddq_s16(q9s16, q5s16);
            q10s16 = vqaddq_s16(q10s16, q6s16);

            d6u8 = vqrshrun_n_s16(q7s16, 7);
            d7u8 = vqrshrun_n_s16(q8s16, 7);
            d8u8 = vqrshrun_n_s16(q9s16, 7);
            d9u8 = vqrshrun_n_s16(q10s16, 7);

            d18u8 = d22u8;
            d19u8 = d23u8;
            d20u8 = d24u8;
            d21u8 = d25u8;
            d22u8 = d26u8;

            vst1_u8(dst, d6u8);
            dst += dst_pitch;
            vst1_u8(dst, d7u8);
            dst += dst_pitch;
            vst1_u8(dst, d8u8);
            dst += dst_pitch;
            vst1_u8(dst, d9u8);
            dst += dst_pitch;
        }
    }
    return;
}
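A minimal caller sketch for the predictor above; the wrapper and buffer names are placeholders. xoffset and yoffset index vp8_sub_pel_filters and lie in [0, 7]; when both passes run, the reference buffer must supply two rows/columns of context before the 16x16 block and three after, since the 6-tap window spans offsets -2..+3.
// Hypothetical usage: build a 16x16 sub-pel prediction from a reference frame.
// ref points at the top-left pixel of the block inside a buffer that has the
// border described above.
void predict16x16(unsigned char *ref, int ref_stride,
                  unsigned char *pred, int pred_stride,
                  int xoffset, int yoffset) {
    vp8_sixtap_predict16x16_neon(ref, ref_stride, xoffset, yoffset,
                                 pred, pred_stride);
}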