static void variance_neon_w8(const uint8_t *a, int a_stride,
                             const uint8_t *b, int b_stride,
                             int w, int h, unsigned int *sse, int *sum) {
  int i, j;
  int16x8_t v_sum = vdupq_n_s16(0);
  int32x4_t v_sse_lo = vdupq_n_s32(0);
  int32x4_t v_sse_hi = vdupq_n_s32(0);

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; j += 8) {
      const uint8x8_t v_a = vld1_u8(&a[j]);
      const uint8x8_t v_b = vld1_u8(&b[j]);
      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
      v_sum = vaddq_s16(v_sum, sv_diff);
      v_sse_lo = vmlal_s16(v_sse_lo,
                           vget_low_s16(sv_diff),
                           vget_low_s16(sv_diff));
      v_sse_hi = vmlal_s16(v_sse_hi,
                           vget_high_s16(sv_diff),
                           vget_high_s16(sv_diff));
    }
    a += a_stride;
    b += b_stride;
  }

  *sum = horizontal_add_s16x8(v_sum);
  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
}
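The variance kernel above relies on two horizontal-add reduction helpers that are not shown in this excerpt. A minimal sketch, assuming the usual pairwise-add reduction (the project's own definitions may differ):

static INLINE int horizontal_add_s16x8(const int16x8_t v) {
  const int32x4_t a = vpaddlq_s16(v);
  const int64x2_t b = vpaddlq_s32(a);
  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
                               vreinterpret_s32_s64(vget_high_s64(b)));
  return vget_lane_s32(c, 0);
}

static INLINE int horizontal_add_s32x4(const int32x4_t v) {
  const int64x2_t a = vpaddlq_s32(v);
  const int32x2_t b = vadd_s32(vreinterpret_s32_s64(vget_low_s64(a)),
                               vreinterpret_s32_s64(vget_high_s64(a)));
  return vget_lane_s32(b, 0);
}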
Example #2
static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
                                 int src_width, int do_store) {
  int i;
  for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {
    const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);
    const uint16x8_t R = vpaddlq_u8(RGB.val[2]);  // pair-wise adds
    const uint16x8_t G = vpaddlq_u8(RGB.val[1]);
    const uint16x8_t B = vpaddlq_u8(RGB.val[0]);
    int16x8_t U_tmp, V_tmp;
    CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp);
    {
      const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1);
      const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1);
      if (do_store) {
        vst1_u8(u, U);
        vst1_u8(v, V);
      } else {
        const uint8x8_t prev_u = vld1_u8(u);
        const uint8x8_t prev_v = vld1_u8(v);
        vst1_u8(u, vrhadd_u8(U, prev_u));
        vst1_u8(v, vrhadd_u8(V, prev_v));
      }
    }
  }
  if (i < src_width) {  // left-over
    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
  }
}
Example #3
#include <arm_neon.h>
#include <stdlib.h>   /* abort() */

int main(void)
{
    uint8_t v1_init[8] = {1, 1, 1, 1, 1, 1, 1, 1};
    uint8_t v2_init[8] = {2, 2, 2, 2, 2, 2, 2, 2};
    uint8x8_t v1 = vld1_u8 (v1_init);
    uint8x8_t v2 = vld1_u8 (v2_init);
    uint8x8x2_t vd1, vd2;
    union {uint8x8_t v; uint8_t buf[8];} d1, d2, d3, d4;
    int i;
    uint8_t odd, even;

    vd1 = vzip_u8(v1, vdup_n_u8(0));
    vd2 = vzip_u8(v2, vdup_n_u8(0));

    vst1_u8(d1.buf, vd1.val[0]);
    vst1_u8(d2.buf, vd1.val[1]);
    vst1_u8(d3.buf, vd2.val[0]);
    vst1_u8(d4.buf, vd2.val[1]);

#ifdef __ARMEL__
    odd = 1;
    even = 0;
#else
    odd = 0;
    even = 1;
#endif

    for (i = 0; i < 8; i++)
      if ((i % 2 == even && d4.buf[i] != 2)
          || (i % 2 == odd && d4.buf[i] != 0))
         abort ();

    return 0;
}
void byte2float48_neon(const uint8_t *t, const int pitch, float *p) {
    uint16x8_t m0, m1, m2, m3, m4, m5;
    /* Zero-initialize before the single-lane loads below; vld1_lane on an
       uninitialized vector is undefined behavior. */
    uint32x2_t temp1 = vdup_n_u32(0), temp4 = vdup_n_u32(0);

    m0 = vmovl_u8(vld1_u8(t));
    temp1 = vld1_lane_u32((const uint32_t *)(t + 8), temp1, 0);
    temp1 = vld1_lane_u32((const uint32_t *)(t + pitch * 2), temp1, 1);
    m1 = vmovl_u8(vreinterpret_u8_u32(temp1));
    m2 = vmovl_u8(vld1_u8(t + pitch * 2 + 4));

    t += pitch * 4;

    m3 = vmovl_u8(vld1_u8(t));
    temp4 = vld1_lane_u32((const uint32_t *)(t + 8), temp4, 0);
    temp4 = vld1_lane_u32((const uint32_t *)(t + pitch * 2), temp4, 1);
    m4 = vmovl_u8(vreinterpret_u8_u32(temp4));
    m5 = vmovl_u8(vld1_u8(t + pitch * 2 + 4));

    vst1q_f32(p, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m0))));
    vst1q_f32(p + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m0))));
    vst1q_f32(p + 8, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m1))));
    vst1q_f32(p + 12, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m1))));
    vst1q_f32(p + 16, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m2))));
    vst1q_f32(p + 20, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m2))));
    vst1q_f32(p + 24, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m3))));
    vst1q_f32(p + 28, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m3))));
    vst1q_f32(p + 32, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m4))));
    vst1q_f32(p + 36, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m4))));
    vst1q_f32(p + 40, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m5))));
    vst1q_f32(p + 44, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m5))));
}
static void ConvertBGRAToRGBA(const uint32_t* src,
                              int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~1);
  const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
  for (; src < end; src += 2) {
    const uint8x8_t pixels = vld1_u8((uint8_t*)src);
    vst1_u8(dst, vtbl1_u8(pixels, shuffle));
    dst += 8;
  }
  VP8LConvertBGRAToRGBA_C(src, num_pixels & 1, dst);  // left-overs
}
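ConvertBGRAToRGBA above indexes a shuffle table this excerpt does not include. A plausible definition, assuming the usual little-endian BGRA byte layout (B, G, R, A per pixel) and two pixels per 8-byte vector:

static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };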
unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
  uint8x8_t v_s0 = vld1_u8(s);
  const uint8x8_t v_s1 = vld1_u8(s + p);
  uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);

  v_s0 = vld1_u8(s + 2 * p);
  v_sum = vaddw_u8(v_sum, v_s0);

  v_s0 = vld1_u8(s + 3 * p);
  v_sum = vaddw_u8(v_sum, v_s0);

  v_s0 = vld1_u8(s + 4 * p);
  v_sum = vaddw_u8(v_sum, v_s0);

  v_s0 = vld1_u8(s + 5 * p);
  v_sum = vaddw_u8(v_sum, v_s0);

  v_s0 = vld1_u8(s + 6 * p);
  v_sum = vaddw_u8(v_sum, v_s0);

  v_s0 = vld1_u8(s + 7 * p);
  v_sum = vaddw_u8(v_sum, v_s0);

  return (horizontal_add_u16x8(v_sum) + 32) >> 6;
}
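vpx_avg_8x8_neon above uses another reduction helper that is not shown. A sketch along the same lines as the signed variants earlier (the library's own version may differ):

static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v) {
  const uint32x4_t a = vpaddlq_u16(v);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}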
static INLINE
uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
    uint8x8x4_t x;
    const uint8x8_t a = vld1_u8(src);
    const uint8x8_t b = vld1_u8(src + pitch * 1);
    const uint8x8_t c = vld1_u8(src + pitch * 2);
    const uint8x8_t d = vld1_u8(src + pitch * 3);
    const uint8x8_t e = vld1_u8(src + pitch * 4);
    const uint8x8_t f = vld1_u8(src + pitch * 5);
    const uint8x8_t g = vld1_u8(src + pitch * 6);
    const uint8x8_t h = vld1_u8(src + pitch * 7);
    const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
                                          vreinterpret_u32_u8(e));
    const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
                                          vreinterpret_u32_u8(f));
    const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
                                          vreinterpret_u32_u8(g));
    const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
                                          vreinterpret_u32_u8(h));
    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
                                          vreinterpret_u16_u32(r26_u32.val[0]));
    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
                                          vreinterpret_u16_u32(r37_u32.val[0]));
    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
                                       vreinterpret_u8_u16(r13_u16.val[0]));
    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
                                       vreinterpret_u8_u16(r13_u16.val[1]));
    /*
     * after vtrn_u32
    00 01 02 03 | 40 41 42 43
    10 11 12 13 | 50 51 52 53
    20 21 22 23 | 60 61 62 63
    30 31 32 33 | 70 71 72 73
    ---
    * after vtrn_u16
    00 01 20 21 | 40 41 60 61
    02 03 22 23 | 42 43 62 63
    10 11 30 31 | 50 51 70 71
    12 13 32 33 | 52 53 72 73

    00 01 20 21 | 40 41 60 61
    10 11 30 31 | 50 51 70 71
    02 03 22 23 | 42 43 62 63
    12 13 32 33 | 52 53 72 73
    ---
    * after vtrn_u8
    00 10 20 30 | 40 50 60 70
    01 11 21 31 | 41 51 61 71
    02 12 22 32 | 42 52 62 72
    03 13 23 33 | 43 53 63 73
    */
    x.val[0] = r01_u8.val[0];
    x.val[1] = r01_u8.val[1];
    x.val[2] = r23_u8.val[0];
    x.val[3] = r23_u8.val[1];

    return x;
}
static inline void char_to_float_vectors(const unsigned char * sourcep,
			   float32x4_t *mp0, float32x4_t * mp1)
{
 uint8x8_t rawpixels; /* source pixels as {[YUYV]0 [YUYV]1}   */
 int16x8_t widerpixels; /*  rawpixels promoted to shorts per component */
 int16x4_t high16, low16;
 int32x4_t high32, low32;
 const  int16x8_t uvbias = {0, 128, 0, 128, 0, 128, 0, 128};
 
 rawpixels = vld1_u8(sourcep);
 widerpixels = vreinterpretq_s16_u16(vmovl_u8(rawpixels));

 /* subtract uvbias from widerpixels  */
 widerpixels = vsubq_s16(widerpixels, uvbias);

 /* now take widerpixels apart into (low16, high16) and   */
 /* then expand those into (low32, high32)    */
 low16 = vget_low_s16(widerpixels);
 high16 = vget_high_s16(widerpixels);
 high32 = vmovl_s16(high16);
 low32  = vmovl_s16(low16);

 /* now convert low32 and high32 into floats and store them in   */
 /*  *mp0,  *mp1 */

 *mp0 = vcvtq_f32_s32(low32);
 *mp1 = vcvtq_f32_s32(high32);
  
}
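A hypothetical caller for char_to_float_vectors, just to show the data flow: each call consumes 8 interleaved YUYV bytes and produces 8 floats. The helper name and loop bounds below are illustrative, not from the original source:

static void yuyv_row_to_float(const unsigned char *row, float *out,
                              int row_bytes /* multiple of 8 assumed */) {
  int i;
  for (i = 0; i + 8 <= row_bytes; i += 8) {
    float32x4_t lo, hi;
    char_to_float_vectors(row + i, &lo, &hi);
    vst1q_f32(out + i,     lo);  /* floats for bytes 0..3 of this group */
    vst1q_f32(out + i + 4, hi);  /* floats for bytes 4..7 */
  }
}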
Example #9
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
  const uint32x2_t zero = vdup_n_u32(0);
  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
}
Example #10
void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x8_t d0u8 = vdup_n_u8(0);
  (void)left;

  d0u8 = vld1_u8(above);
  for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
}
Example #11
void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
  static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
  const uint8x8_t sh_12345677 = vld1_u8(shuffle1);
  const uint8x8_t sh_23456777 = vld1_u8(shuffle2);
  const uint8x8_t A0 = vld1_u8(above);  // top row
  const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677);
  const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777);
  const uint8x8_t avg1 = vhadd_u8(A0, A2);
  uint8x8_t row = vrhadd_u8(avg1, A1);
  int i;
  (void)left;
  for (i = 0; i < 7; ++i) {
    vst1_u8(dst + i * stride, row);
    row = vtbl1_u8(row, sh_12345677);
  }
  vst1_u8(dst + i * stride, row);
}
void byte2word64_neon(const uint8_t *t, const int pitch, float *pf) {
    uint16_t *p = (uint16_t *)pf;

    vst1q_u16(p, vmovl_u8(vld1_u8(t)));
    vst1q_u16(p + 8, vmovl_u8(vld1_u8(t + 8)));
    vst1q_u16(p + 16, vmovl_u8(vld1_u8(t + pitch * 2)));
    vst1q_u16(p + 24, vmovl_u8(vld1_u8(t + pitch * 2 + 8)));
    vst1q_u16(p + 32, vmovl_u8(vld1_u8(t + pitch * 4)));
    vst1q_u16(p + 40, vmovl_u8(vld1_u8(t + pitch * 4 + 8)));
    vst1q_u16(p + 48, vmovl_u8(vld1_u8(t + pitch * 6)));
    vst1q_u16(p + 56, vmovl_u8(vld1_u8(t + pitch * 6 + 8)));
}
Example #13
unsigned int vpx_get4x4sse_cs_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride) {
    int16x4_t d22s16, d24s16, d26s16, d28s16;
    int64x1_t d0s64;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int32x4_t q7s32, q8s32, q9s32, q10s32;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int64x2_t q1s64;

    d0u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d4u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d1u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d5u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d2u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d6u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d3u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d7u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;

    q11u16 = vsubl_u8(d0u8, d4u8);
    q12u16 = vsubl_u8(d1u8, d5u8);
    q13u16 = vsubl_u8(d2u8, d6u8);
    q14u16 = vsubl_u8(d3u8, d7u8);

    d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
    d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
    d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
    d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));

    q7s32 = vmull_s16(d22s16, d22s16);
    q8s32 = vmull_s16(d24s16, d24s16);
    q9s32 = vmull_s16(d26s16, d26s16);
    q10s32 = vmull_s16(d28s16, d28s16);

    q7s32 = vaddq_s32(q7s32, q8s32);
    q9s32 = vaddq_s32(q9s32, q10s32);
    q9s32 = vaddq_s32(q7s32, q9s32);

    q1s64 = vpaddlq_s32(q9s32);
    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
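For reference, a scalar equivalent of the 4x4 sum of squared differences computed above (only the low four pixels of each row contribute). This is a hypothetical helper, useful as a cross-check:

static unsigned int get4x4sse_cs_c(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride) {
  unsigned int sse = 0;
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      const int diff = src[c] - ref[c];
      sse += (unsigned int)(diff * diff);
    }
    src += src_stride;
    ref += ref_stride;
  }
  return sse;
}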
// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *above, const uint8_t *left,
                          int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x8_t A = vld1_u8(above);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
  }

  if (do_left) {
    const uint8x8_t L = vld1_u8(left);  // left border
    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_left = vcombine_u16(p2, p2);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc));
    }
  }
}
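The do_above/do_left flags let thin wrappers specialize dc_8x8 at compile time. A sketch of two such wrappers, modeled on how libvpx names its DC predictors (treat the exact names as assumptions):

void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  dc_8x8(dst, stride, above, left, 1, 1);
}

void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  dc_8x8(dst, stride, above, NULL, 1, 0);
}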
// Process a block exactly 8 wide and any height.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      const uint8_t *filter) {
  const uint8x8_t f0 = vdup_n_u8(filter[0]);
  const uint8x8_t f1 = vdup_n_u8(filter[1]);
  unsigned int i;
  for (i = 0; i < output_height; ++i) {
    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
    const uint16x8_t a = vmull_u8(src_0, f0);
    const uint16x8_t b = vmlal_u8(a, src_1, f1);
    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
    vst1_u8(output_ptr, out);
    src_ptr += src_pixels_per_line;
    output_ptr += 8;
  }
}
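A hedged usage sketch for the bilinear helper above: a half-pel (64/64) horizontal pass into a temporary block, then a vertical pass, assuming FILTER_BITS is 7 (i.e. filter taps sum to 128). The wrapper name is illustrative:

static void half_pel_filter_8x8(const uint8_t *src, int src_stride,
                                uint8_t *dst /* 8x8 block, stride 8 */) {
  static const uint8_t kHalfPel[2] = { 64, 64 };
  uint8_t tmp[8 * 9];  /* one extra row feeds the vertical pass */
  var_filter_block2d_bil_w8(src, tmp, src_stride, 1, 9, kHalfPel);
  var_filter_block2d_bil_w8(tmp, dst, 8, 8, 8, kHalfPel);
}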
void vp9_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  int j;
  uint16x8_t q0u16, q3u16, q10u16;
  int16x8_t q0s16;
  uint16x4_t d20u16;
  uint8x8_t d0u8, d2u8, d30u8;

  d0u8 = vld1_dup_u8(above - 1);
  d30u8 = vld1_u8(left);
  d2u8 = vld1_u8(above);
  q10u16 = vmovl_u8(d30u8);
  q3u16 = vsubl_u8(d2u8, d0u8);
  d20u16 = vget_low_u16(q10u16);
  for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
    q0u16 = vdupq_lane_u16(d20u16, 0);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 1);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 2);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 3);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
  }
}
Example #17
void vp8_copy_mem8x8_neon(unsigned char *src, int src_stride,
                          unsigned char *dst, int dst_stride) {
  uint8x8_t vtmp;
  int r;

  for (r = 0; r < 8; ++r) {
    vtmp = vld1_u8(src);
    vst1_u8(dst, vtmp);
    src += src_stride;
    dst += dst_stride;
  }
}
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const uint16_t *vpx_filter) {
  const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]);
  const uint8x8_t f1 = vmov_n_u8((uint8_t)vpx_filter[1]);
  unsigned int i;
  for (i = 0; i < output_height; ++i) {
    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
    const uint16x8_t a = vmull_u8(src_0, f0);
    const uint16x8_t b = vmlal_u8(a, src_1, f1);
    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
    vst1_u8(&output_ptr[0], out);
    // Next row...
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const uint32_t* const end = argb_data + (num_pixels & ~3);
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
  for (; argb_data < end; argb_data += 4) {
    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
    const uint8x16_t greens =
        vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
                    vtbl1_u8(vget_high_u8(argb), shuffle));
    vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens));
  }
  // fallthrough and finish off with plain-C
  VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
}
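AddGreenToBlueAndRed assumes an 8-byte shuffle that copies each pixel's green byte into the blue and red lanes and zeroes the rest (out-of-range vtbl indices read as zero). A plausible table, assuming the usual little-endian B, G, R, A byte layout:

static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };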
void byte2word48_neon(const uint8_t *t, const int pitch, float *pf) {
    uint16_t *p = (uint16_t *)pf;

    uint8x8_t m0, m2, m3, m5;
    /* m1 and m4 are filled by single-lane loads below, so zero-initialize
       them first to avoid reading uninitialized vectors. */
    uint8x8_t m1 = vdup_n_u8(0), m4 = vdup_n_u8(0);

    m0 = vld1_u8(t);
    m1 = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)(t + 8), vreinterpret_u32_u8(m1), 0));
    m1 = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)(t + pitch * 2), vreinterpret_u32_u8(m1), 1));
    m2 = vld1_u8(t + pitch * 2 + 4);

    t += pitch * 4;

    m3 = vld1_u8(t);
    m4 = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)(t + 8), vreinterpret_u32_u8(m4), 0));
    m4 = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)(t + pitch * 2), vreinterpret_u32_u8(m4), 1));
    m5 = vld1_u8(t + pitch * 2 + 4);

    vst1q_u16(p, vmovl_u8(m0));
    vst1q_u16(p + 8, vmovl_u8(m1));
    vst1q_u16(p + 16, vmovl_u8(m2));
    vst1q_u16(p + 24, vmovl_u8(m3));
    vst1q_u16(p + 32, vmovl_u8(m4));
    vst1q_u16(p + 40, vmovl_u8(m5));
}
Example #21
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const uint32_t* const end = argb_data + (num_pixels & ~3);
#ifdef USE_VTBLQ
  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
#else
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
#endif
  for (; argb_data < end; argb_data += 4) {
    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
    vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
  }
  // fallthrough and finish off with plain-C
  VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
}
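SubtractGreenFromBlueAndRed relies on a DoGreenShuffle helper not shown here. A minimal sketch of the non-VTBLQ variant, reusing the same per-half table lookup as the AddGreen example above:

static inline uint8x16_t DoGreenShuffle(const uint8x16_t argb,
                                        const uint8x8_t shuffle) {
  return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
                     vtbl1_u8(vget_high_u8(argb), shuffle));
}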
Example #22
int
test_vdupb_lane_u8 ()
{
  uint8x8_t a;
  uint8_t b;
  uint8_t c[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };

  a = vld1_u8 (c);
  b = wrap_vdupb_lane_u8_0 (a, a);
  if (c[0] != b)
    return 1;
  b = wrap_vdupb_lane_u8_1 (a);
  if (c[1] != b)
    return 1;
  return 0;
}
Example #23
void vpx_lpf_horizontal_4_dual_neon(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
  uint8x16_t qblimit, qlimit, qthresh;
  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;

  dblimit0 = vld1_u8(blimit0);
  dlimit0 = vld1_u8(limit0);
  dthresh0 = vld1_u8(thresh0);
  dblimit1 = vld1_u8(blimit1);
  dlimit1 = vld1_u8(limit1);
  dthresh1 = vld1_u8(thresh1);
  qblimit = vcombine_u8(dblimit0, dblimit1);
  qlimit = vcombine_u8(dlimit0, dlimit1);
  qthresh = vcombine_u8(dthresh0, dthresh1);

  s -= (p << 2);

  q3u8 = vld1q_u8(s);
  s += p;
  q4u8 = vld1q_u8(s);
  s += p;
  q5u8 = vld1q_u8(s);
  s += p;
  q6u8 = vld1q_u8(s);
  s += p;
  q7u8 = vld1q_u8(s);
  s += p;
  q8u8 = vld1q_u8(s);
  s += p;
  q9u8 = vld1q_u8(s);
  s += p;
  q10u8 = vld1q_u8(s);

  loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8,
                      q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8);

  s -= (p * 5);
  vst1q_u8(s, q5u8);
  s += p;
  vst1q_u8(s, q6u8);
  s += p;
  vst1q_u8(s, q7u8);
  s += p;
  vst1q_u8(s, q8u8);
  return;
}
static void ConvertBGRAToBGR(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~7);
  const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
  const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
  const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]);
  for (; src < end; src += 8) {
    uint8x8x4_t pixels;
    INIT_VECTOR4(pixels,
                 vld1_u8((const uint8_t*)(src + 0)),
                 vld1_u8((const uint8_t*)(src + 2)),
                 vld1_u8((const uint8_t*)(src + 4)),
                 vld1_u8((const uint8_t*)(src + 6)));
    vst1_u8(dst +  0, vtbl4_u8(pixels, shuffle0));
    vst1_u8(dst +  8, vtbl4_u8(pixels, shuffle1));
    vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
    dst += 8 * 3;
  }
  VP8LConvertBGRAToBGR_C(src, num_pixels & 7, dst);  // left-overs
}
Example #25
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above));  // top row
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
  const uint64x1_t A2 = vshr_n_u64(A0, 16);
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  (void)left;
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
  dst[3 * stride + 3] = above[7];
}
void aom_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  int i;
  uint8_t *s, *psrc;
  uint8x8_t dblimit, dlimit, dthresh;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
  uint8x8_t d16u8, d17u8, d18u8;

  dblimit = vld1_u8(blimit);
  dlimit = vld1_u8(limit);
  dthresh = vld1_u8(thresh);

  psrc = src - (pitch << 2);
  for (i = 0; i < 1; i++) {
    s = psrc + i * 8;

    d3u8 = vld1_u8(s);
    s += pitch;
    d4u8 = vld1_u8(s);
    s += pitch;
    d5u8 = vld1_u8(s);
    s += pitch;
    d6u8 = vld1_u8(s);
    s += pitch;
    d7u8 = vld1_u8(s);
    s += pitch;
    d16u8 = vld1_u8(s);
    s += pitch;
    d17u8 = vld1_u8(s);
    s += pitch;
    d18u8 = vld1_u8(s);

    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
                       &d5u8);

    s -= (pitch * 6);
    vst1_u8(s, d0u8);
    s += pitch;
    vst1_u8(s, d1u8);
    s += pitch;
    vst1_u8(s, d2u8);
    s += pitch;
    vst1_u8(s, d3u8);
    s += pitch;
    vst1_u8(s, d4u8);
    s += pitch;
    vst1_u8(s, d5u8);
  }
  return;
}
void aom_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  int i;
  uint8_t *s;
  uint8x8_t dblimit, dlimit, dthresh;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
  uint8x8_t d16u8, d17u8, d18u8;
  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
  uint8x8x4_t d4Result;
  uint8x8x2_t d2Result;

  dblimit = vld1_u8(blimit);
  dlimit = vld1_u8(limit);
  dthresh = vld1_u8(thresh);

  for (i = 0; i < 1; i++) {
    s = src + (i * (pitch << 3)) - 4;

    d3u8 = vld1_u8(s);
    s += pitch;
    d4u8 = vld1_u8(s);
    s += pitch;
    d5u8 = vld1_u8(s);
    s += pitch;
    d6u8 = vld1_u8(s);
    s += pitch;
    d7u8 = vld1_u8(s);
    s += pitch;
    d16u8 = vld1_u8(s);
    s += pitch;
    d17u8 = vld1_u8(s);
    s += pitch;
    d18u8 = vld1_u8(s);

    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));

    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
                      vreinterpret_u16_u32(d2tmp2.val[0]));
    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
                      vreinterpret_u16_u32(d2tmp3.val[0]));
    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
                      vreinterpret_u16_u32(d2tmp2.val[1]));
    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
                      vreinterpret_u16_u32(d2tmp3.val[1]));

    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
                     vreinterpret_u8_u16(d2tmp5.val[0]));
    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
                     vreinterpret_u8_u16(d2tmp5.val[1]));
    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
                      vreinterpret_u8_u16(d2tmp7.val[0]));
    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
                      vreinterpret_u8_u16(d2tmp7.val[1]));

    d3u8 = d2tmp8.val[0];
    d4u8 = d2tmp8.val[1];
    d5u8 = d2tmp9.val[0];
    d6u8 = d2tmp9.val[1];
    d7u8 = d2tmp10.val[0];
    d16u8 = d2tmp10.val[1];
    d17u8 = d2tmp11.val[0];
    d18u8 = d2tmp11.val[1];

    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
                       &d5u8);

    d4Result.val[0] = d0u8;
    d4Result.val[1] = d1u8;
    d4Result.val[2] = d2u8;
    d4Result.val[3] = d3u8;

    d2Result.val[0] = d4u8;
    d2Result.val[1] = d5u8;

    s = src - 3;
    vst4_lane_u8(s, d4Result, 0);
    s += pitch;
    vst4_lane_u8(s, d4Result, 1);
    s += pitch;
    vst4_lane_u8(s, d4Result, 2);
    s += pitch;
    vst4_lane_u8(s, d4Result, 3);
    s += pitch;
    vst4_lane_u8(s, d4Result, 4);
    s += pitch;
    vst4_lane_u8(s, d4Result, 5);
    s += pitch;
    vst4_lane_u8(s, d4Result, 6);
    s += pitch;
    vst4_lane_u8(s, d4Result, 7);

    s = src + 1;
    vst2_lane_u8(s, d2Result, 0);
    s += pitch;
    vst2_lane_u8(s, d2Result, 1);
    s += pitch;
    vst2_lane_u8(s, d2Result, 2);
    s += pitch;
    vst2_lane_u8(s, d2Result, 3);
    s += pitch;
    vst2_lane_u8(s, d2Result, 4);
    s += pitch;
    vst2_lane_u8(s, d2Result, 5);
    s += pitch;
    vst2_lane_u8(s, d2Result, 6);
    s += pitch;
    vst2_lane_u8(s, d2Result, 7);
  }
  return;
}
Example #28
int crypto_stream_xor(
        unsigned char *c,
  const unsigned char *m,unsigned long long mlen,
  const unsigned char *n,
  const unsigned char *k
)
{
  const uint32x4_t abab = {-1,0,-1,0};
  const uint64x1_t nextblock = {1};
  uint32x4_t k0k1k2k3 = (uint32x4_t) vld1q_u8((uint8_t *) k);
  uint32x4_t k4k5k6k7 = (uint32x4_t) vld1q_u8((uint8_t *) (k + 16));
  uint32x4_t start0 = (uint32x4_t) vld1q_u8((uint8_t *) sigma);
  uint32x2_t n0n1 = (uint32x2_t) vld1_u8((uint8_t *) n);
  uint32x2_t n2n3 = {0,0};
  uint32x2_t k0k1 = vget_low_u32(k0k1k2k3);
  uint32x2_t k2k3 = vget_high_u32(k0k1k2k3);
  uint32x2_t k4k5 = vget_low_u32(k4k5k6k7);
  uint32x2_t k6k7 = vget_high_u32(k4k5k6k7);
  uint32x2_t n1n0 = vext_u32(n0n1,n0n1,1);
  uint32x2_t n3n2;
  uint32x2_t n0k4 = vext_u32(n1n0,k4k5,1);
  uint32x2_t k5k0 = vext_u32(k4k5,k0k1,1);
  uint32x2_t k1n1 = vext_u32(k0k1,n1n0,1);
  uint32x2_t n2k6;
  uint32x2_t k7k2 = vext_u32(k6k7,k2k3,1);
  uint32x2_t k3n3;
  uint32x4_t start1 = vcombine_u32(k5k0,n0k4);
  uint32x4_t start2;
  uint32x4_t start3;
  register uint32x4_t diag0;
  register uint32x4_t diag1;
  register uint32x4_t diag2;
  register uint32x4_t diag3;
  uint32x4_t next_start2;
  uint32x4_t next_start3;
  register uint32x4_t next_diag0;
  register uint32x4_t next_diag1;
  register uint32x4_t next_diag2;
  register uint32x4_t next_diag3;
  uint32x4_t x0x5x10x15;
  uint32x4_t x12x1x6x11;
  uint32x4_t x8x13x2x7;
  uint32x4_t x4x9x14x3;
  uint32x4_t x0x1x10x11;
  uint32x4_t x12x13x6x7;
  uint32x4_t x8x9x2x3;
  uint32x4_t x4x5x14x15;
  uint32x4_t x0x1x2x3;
  uint32x4_t x4x5x6x7;
  uint32x4_t x8x9x10x11;
  uint32x4_t x12x13x14x15;
  uint32x4_t m0m1m2m3;
  uint32x4_t m4m5m6m7;
  uint32x4_t m8m9m10m11;
  uint32x4_t m12m13m14m15;
  register uint32x4_t a0;
  register uint32x4_t a1;
  register uint32x4_t a2;
  register uint32x4_t a3;
  register uint32x4_t b0;
  register uint32x4_t b1;
  register uint32x4_t b2;
  register uint32x4_t b3;
  register uint32x4_t next_a0;
  register uint32x4_t next_a1;
  register uint32x4_t next_a2;
  register uint32x4_t next_a3;
  register uint32x4_t next_b0;
  register uint32x4_t next_b1;
  register uint32x4_t next_b2;
  register uint32x4_t next_b3;
  unsigned char block[64];
  unsigned char *savec;
  int i;
  int flagm = (m != 0);

  if (!mlen) return 0;
  if (mlen < 128) goto mlenatleast1;

  mlenatleast128:

  n3n2 = vext_u32(n2n3,n2n3,1);
  n2k6 = vext_u32(n3n2,k6k7,1);
  k3n3 = vext_u32(k2k3,n3n2,1);
  start2 = vcombine_u32(n2k6,k1n1);
  start3 = vcombine_u32(k3n3,k7k2);

  n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3);

  diag0 = start0;
  diag1 = start1;
  diag2 = start2;
  diag3 = start3;

  n3n2 = vext_u32(n2n3,n2n3,1);
  n2k6 = vext_u32(n3n2,k6k7,1);
  k3n3 = vext_u32(k2k3,n3n2,1);
  next_start2 = vcombine_u32(n2k6,k1n1);
  next_start3 = vcombine_u32(k3n3,k7k2);

  n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3);

  next_diag0 = start0;
  next_diag1 = start1;
  next_diag2 = next_start2;
  next_diag3 = next_start3;

  for (i = ROUNDS;i > 0;i -= 2) {
    a0 = diag1 + diag0;
    b0 = vshlq_n_u32(a0,7);
                                        next_a0 = next_diag1 + next_diag0;
    a0 = vsriq_n_u32(b0,a0,25);
                                        next_b0 = vshlq_n_u32(next_a0,7);
    diag3 ^= a0;
                                        next_a0 = vsriq_n_u32(next_b0,next_a0,25);
    a1 = diag0 + diag3;
                                        next_diag3 ^= next_a0;
    b1 = vshlq_n_u32(a1,9);
                                        next_a1 = next_diag0 + next_diag3;
    a1 = vsriq_n_u32(b1,a1,23);
                                        next_b1 = vshlq_n_u32(next_a1,9);
    diag2 ^= a1;
                                        next_a1 = vsriq_n_u32(next_b1,next_a1,23);
    a2 = diag3 + diag2;
      diag3 = vextq_u32(diag3,diag3,3);
                                        next_diag2 ^= next_a1;
    b2 = vshlq_n_u32(a2,13);
                                        next_a2 = next_diag3 + next_diag2;
                                          next_diag3 = vextq_u32(next_diag3,next_diag3,3);
    a2 = vsriq_n_u32(b2,a2,19);
                                        next_b2 = vshlq_n_u32(next_a2,13);
    diag1 ^= a2;
                                        next_a2 = vsriq_n_u32(next_b2,next_a2,19);
    a3 = diag2 + diag1;
      diag2 = vextq_u32(diag2,diag2,2);
                                        next_diag1 ^= next_a2;
    b3 = vshlq_n_u32(a3,18);
      diag1 = vextq_u32(diag1,diag1,1);
                                        next_a3 = next_diag2 + next_diag1;
                                          next_diag2 = vextq_u32(next_diag2,next_diag2,2);
    a3 = vsriq_n_u32(b3,a3,14);
                                        next_b3 = vshlq_n_u32(next_a3,18);
                                          next_diag1 = vextq_u32(next_diag1,next_diag1,1);
    diag0 ^= a3;
                                        next_a3 = vsriq_n_u32(next_b3,next_a3,14);
    a0 = diag3 + diag0;
                                        next_diag0 ^= next_a3;
    b0 = vshlq_n_u32(a0,7);
                                        next_a0 = next_diag3 + next_diag0;
    a0 = vsriq_n_u32(b0,a0,25);
                                        next_b0 = vshlq_n_u32(next_a0,7);
    diag1 ^= a0;
                                        next_a0 = vsriq_n_u32(next_b0,next_a0,25);
    a1 = diag0 + diag1;
                                        next_diag1 ^= next_a0;
    b1 = vshlq_n_u32(a1,9);
                                        next_a1 = next_diag0 + next_diag1;
    a1 = vsriq_n_u32(b1,a1,23);
                                        next_b1 = vshlq_n_u32(next_a1,9);
    diag2 ^= a1;
                                        next_a1 = vsriq_n_u32(next_b1,next_a1,23);
    a2 = diag1 + diag2;
      diag1 = vextq_u32(diag1,diag1,3);
                                        next_diag2 ^= next_a1;
    b2 = vshlq_n_u32(a2,13);
                                        next_a2 = next_diag1 + next_diag2;
                                          next_diag1 = vextq_u32(next_diag1,next_diag1,3);
    a2 = vsriq_n_u32(b2,a2,19);
                                        next_b2 = vshlq_n_u32(next_a2,13);
    diag3 ^= a2;
                                        next_a2 = vsriq_n_u32(next_b2,next_a2,19);
    a3 = diag2 + diag3;
      diag2 = vextq_u32(diag2,diag2,2);
                                        next_diag3 ^= next_a2;
    b3 = vshlq_n_u32(a3,18);
      diag3 = vextq_u32(diag3,diag3,1);
                                        next_a3 = next_diag2 + next_diag3;
                                          next_diag2 = vextq_u32(next_diag2,next_diag2,2);
    a3 = vsriq_n_u32(b3,a3,14);
                                        next_b3 = vshlq_n_u32(next_a3,18);
                                          next_diag3 = vextq_u32(next_diag3,next_diag3,1);
    diag0 ^= a3;
                                        next_a3 = vsriq_n_u32(next_b3,next_a3,14);
                                        next_diag0 ^= next_a3;
  }

  x0x5x10x15 = diag0 + start0;
  x12x1x6x11 = diag1 + start1;
  x8x13x2x7 = diag2 + start2;
  x4x9x14x3 = diag3 + start3;

  if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8((uint8_t *) m);
  if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(16 + (uint8_t *) m);
  if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(32 + (uint8_t *) m);
  if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(48 + (uint8_t *) m);

  x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
  x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
  x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
  x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);

  x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
  x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
  x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
  x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));

  if (flagm) x0x1x2x3 ^= m0m1m2m3;
  if (flagm) x4x5x6x7 ^= m4m5m6m7;
  if (flagm) x8x9x10x11 ^= m8m9m10m11;
  if (flagm) x12x13x14x15 ^= m12m13m14m15;

  vst1q_u8((uint8_t *) c,(uint8x16_t) x0x1x2x3);
  vst1q_u8(16 + (uint8_t *) c,(uint8x16_t) x4x5x6x7);
  vst1q_u8(32 + (uint8_t *) c,(uint8x16_t) x8x9x10x11);
  vst1q_u8(48 + (uint8_t *) c,(uint8x16_t) x12x13x14x15);

  x0x5x10x15 = next_diag0 + start0;
  x12x1x6x11 = next_diag1 + start1;
  x8x13x2x7 = next_diag2 + next_start2;
  x4x9x14x3 = next_diag3 + next_start3;

  if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8(64 + (uint8_t *) m);
  if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(80 + (uint8_t *) m);
  if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(96 + (uint8_t *) m);
  if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(112 + (uint8_t *) m);

  x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
  x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
  x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
  x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);

  x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
  x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
  x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
  x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));

  if (flagm) x0x1x2x3 ^= m0m1m2m3;
  if (flagm) x4x5x6x7 ^= m4m5m6m7;
  if (flagm) x8x9x10x11 ^= m8m9m10m11;
  if (flagm) x12x13x14x15 ^= m12m13m14m15;

  vst1q_u8(64 + (uint8_t *) c,(uint8x16_t) x0x1x2x3);
  vst1q_u8(80 + (uint8_t *) c,(uint8x16_t) x4x5x6x7);
  vst1q_u8(96 + (uint8_t *) c,(uint8x16_t) x8x9x10x11);
  vst1q_u8(112 + (uint8_t *) c,(uint8x16_t) x12x13x14x15);

  mlen -= 128;
  c += 128;
  if (flagm) m += 128;

  if (mlen >= 128) goto mlenatleast128;

  mlenatleast1:

  if (mlen < 64) {
    if (flagm) for (i = 0;i < 64;++i) block[i] = 0;
    if (flagm) for (i = 0;i < mlen;++i) block[i] = m[i];
    savec = c;
    c = block;
    if (flagm) m = block;
  }

  n3n2 = vext_u32(n2n3,n2n3,1);
  n2k6 = vext_u32(n3n2,k6k7,1);
  k3n3 = vext_u32(k2k3,n3n2,1);
  start2 = vcombine_u32(n2k6,k1n1);
  start3 = vcombine_u32(k3n3,k7k2);

  diag0 = start0;
  diag1 = start1;
  diag2 = start2;
  diag3 = start3;

  for (i = ROUNDS;i > 0;i -= 2) {
    a0 = diag1 + diag0;
    b0 = vshlq_n_u32(a0,7);
    a0 = vsriq_n_u32(b0,a0,25);
    diag3 ^= a0;
    a1 = diag0 + diag3;
    b1 = vshlq_n_u32(a1,9);
    a1 = vsriq_n_u32(b1,a1,23);
    diag2 ^= a1;
    a2 = diag3 + diag2;
      diag3 = vextq_u32(diag3,diag3,3);
    b2 = vshlq_n_u32(a2,13);
    a2 = vsriq_n_u32(b2,a2,19);
    diag1 ^= a2;
    a3 = diag2 + diag1;
      diag2 = vextq_u32(diag2,diag2,2);
    b3 = vshlq_n_u32(a3,18);
      diag1 = vextq_u32(diag1,diag1,1);
    a3 = vsriq_n_u32(b3,a3,14);
    diag0 ^= a3;

    a0 = diag3 + diag0;
    b0 = vshlq_n_u32(a0,7);
    a0 = vsriq_n_u32(b0,a0,25);
    diag1 ^= a0;
    a1 = diag0 + diag1;
    b1 = vshlq_n_u32(a1,9);
    a1 = vsriq_n_u32(b1,a1,23);
    diag2 ^= a1;
    a2 = diag1 + diag2;
      diag1 = vextq_u32(diag1,diag1,3);
    b2 = vshlq_n_u32(a2,13);
    a2 = vsriq_n_u32(b2,a2,19);
    diag3 ^= a2;
    a3 = diag2 + diag3;
      diag2 = vextq_u32(diag2,diag2,2);
    b3 = vshlq_n_u32(a3,18);
      diag3 = vextq_u32(diag3,diag3,1);
    a3 = vsriq_n_u32(b3,a3,14);
    diag0 ^= a3;
  }

  x0x5x10x15 = diag0 + start0;
  x12x1x6x11 = diag1 + start1;
  x8x13x2x7 = diag2 + start2;
  x4x9x14x3 = diag3 + start3;

  if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8((uint8_t *) m);
  if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(16 + (uint8_t *) m);
  if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(32 + (uint8_t *) m);
  if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(48 + (uint8_t *) m);

  x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
  x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
  x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
  x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);

  x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
  x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
  x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
  x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));

  if (flagm) x0x1x2x3 ^= m0m1m2m3;
  if (flagm) x4x5x6x7 ^= m4m5m6m7;
  if (flagm) x8x9x10x11 ^= m8m9m10m11;
  if (flagm) x12x13x14x15 ^= m12m13m14m15;

  vst1q_u8((uint8_t *) c,(uint8x16_t) x0x1x2x3);
  vst1q_u8(16 + (uint8_t *) c,(uint8x16_t) x4x5x6x7);
  vst1q_u8(32 + (uint8_t *) c,(uint8x16_t) x8x9x10x11);
  vst1q_u8(48 + (uint8_t *) c,(uint8x16_t) x12x13x14x15);

  if (mlen < 64) {
    for (i = 0;i < mlen;++i) savec[i] = c[i];
  }
  if (mlen <= 64) return 0;

  n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3);

  mlen -= 64;
  c += 64;
  if (flagm) m += 64;

  goto mlenatleast1;
}
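The Salsa20 stream routine above references ROUNDS and sigma without defining them; the standard choices (offered here as an assumption about this build) are:

#define ROUNDS 20
static const unsigned char sigma[16] = "expand 32-byte k";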
Example #29
void ne10_img_hresize_4channels_linear_neon (const unsigned char** src, int** dst, int count,
        const int* xofs, const short* alpha,
        int swidth, int dwidth, int cn, int xmin, int xmax)
{
    int dx, k;
    int dx0 = 0;

    int16x4x2_t alpha_vec;

    uint8x8_t dS0_vec, dS1_vec;
    int16x8_t qS0_vec, qS1_vec;
    int16x4_t dS0_0123, dS0_4567, dS1_0123, dS1_4567;

    int32x4_t qT0_vec, qT1_vec;

    int16x4_t dCoeff;
    dCoeff = vdup_n_s16 (INTER_RESIZE_COEF_SCALE);

    for (k = 0; k <= count - 2; k++)
    {
        const unsigned char *S0 = src[k], *S1 = src[k + 1];
        int *D0 = dst[k], *D1 = dst[k + 1];

        for (dx = dx0; dx < xmax; dx += 4)
        {
            int sx = xofs[dx];

            alpha_vec = vld2_s16 (&alpha[dx * 2]);

            dS0_vec = vld1_u8 (&S0[sx]);
            dS1_vec = vld1_u8 (&S1[sx]);

            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS0_4567 = vget_high_s16 (qS0_vec);
            dS1_0123 = vget_low_s16 (qS1_vec);
            dS1_4567 = vget_high_s16 (qS1_vec);

            qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]);
            qT1_vec = vmull_s16 (dS1_0123, alpha_vec.val[0]);
            qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]);
            qT1_vec = vmlal_s16 (qT1_vec, dS1_4567, alpha_vec.val[1]);

            vst1q_s32 (&D0[dx], qT0_vec);
            vst1q_s32 (&D1[dx], qT1_vec);
        }

        for (; dx < dwidth; dx += 4)
        {
            int sx = xofs[dx];

            dS0_vec = vld1_u8 (&S0[sx]);
            dS1_vec = vld1_u8 (&S1[sx]);

            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS1_0123 = vget_low_s16 (qS1_vec);

            qT0_vec = vmull_s16 (dS0_0123, dCoeff);
            qT1_vec = vmull_s16 (dS1_0123, dCoeff);

            vst1q_s32 (&D0[dx], qT0_vec);
            vst1q_s32 (&D1[dx], qT1_vec);
        }
    }

    for (; k < count; k++)
    {
        const unsigned char *S = src[k];
        int *D = dst[k];
        for (dx = 0; dx < xmax; dx += 4)
        {
            int sx = xofs[dx];

            alpha_vec = vld2_s16 (&alpha[dx * 2]);

            dS0_vec = vld1_u8 (&S[sx]);
            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS0_4567 = vget_high_s16 (qS0_vec);

            qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]);
            qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]);

            vst1q_s32 (&D[dx], qT0_vec);
        }

        for (; dx < dwidth; dx += 4)
        {
            int sx = xofs[dx];

            dS0_vec = vld1_u8 (&S[sx]);
            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            dS0_0123 = vget_low_s16 (qS0_vec);
            qT0_vec = vmull_s16 (dS0_0123, dCoeff);

            vst1q_s32 (&D[dx], qT0_vec);
        }
    }
}
Example #30
void ne10_img_vresize_linear_neon (const int** src, unsigned char* dst, const short* beta, int width)
{
    const int *S0 = src[0], *S1 = src[1];

    int32x4_t qS0_0123, qS0_4567, qS1_0123, qS1_4567;
    int32x4_t qT_0123, qT_4567;
    int16x4_t dT_0123, dT_4567;
    uint16x8_t qT_01234567;
    uint8x8_t dT_01234567, dDst_01234567;

    int32x2_t dBeta = vdup_n_s32 (0);  /* initialize before the lane sets below */
    dBeta = vset_lane_s32 ( (int) (beta[0]), dBeta, 0);
    dBeta = vset_lane_s32 ( (int) (beta[1]), dBeta, 1);

    int32x4_t qDelta, qMin, qMax;
    qDelta = vdupq_n_s32 (DELTA);
    qMin = vdupq_n_s32 (0);
    qMax = vdupq_n_s32 (255);

    int x = 0;
    for (; x <= width - 8; x += 8)
    {
        qS0_0123 = vld1q_s32 (&S0[x]);
        qS0_4567 = vld1q_s32 (&S0[x + 4]);
        qS1_0123 = vld1q_s32 (&S1[x]);
        qS1_4567 = vld1q_s32 (&S1[x + 4]);

        qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0);
        qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0);
        qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1);
        qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1);

        qT_0123 = vaddq_s32 (qT_0123, qDelta);
        qT_4567 = vaddq_s32 (qT_4567, qDelta);

        qT_0123 = vshrq_n_s32 (qT_0123, BITS);
        qT_4567 = vshrq_n_s32 (qT_4567, BITS);

        qT_0123 = vmaxq_s32 (qT_0123, qMin);
        qT_4567 = vmaxq_s32 (qT_4567, qMin);
        qT_0123 = vminq_s32 (qT_0123, qMax);
        qT_4567 = vminq_s32 (qT_4567, qMax);

        dT_0123 = vmovn_s32 (qT_0123);
        dT_4567 = vmovn_s32 (qT_4567);
        qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567));
        dT_01234567 = vmovn_u16 (qT_01234567);

        vst1_u8 (&dst[x], dT_01234567);
    }

    if (x < width)
    {
        uint8x8_t dMask;
        dMask = vld1_u8 ( (uint8_t *) (&ne10_img_vresize_linear_mask_residual_table[ (width - x - 1)]));
        dDst_01234567 = vld1_u8 (&dst[x]);

        qS0_0123 = vld1q_s32 (&S0[x]);
        qS0_4567 = vld1q_s32 (&S0[x + 4]);
        qS1_0123 = vld1q_s32 (&S1[x]);
        qS1_4567 = vld1q_s32 (&S1[x + 4]);

        qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0);
        qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0);
        qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1);
        qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1);

        qT_0123 = vaddq_s32 (qT_0123, qDelta);
        qT_4567 = vaddq_s32 (qT_4567, qDelta);

        qT_0123 = vshrq_n_s32 (qT_0123, BITS);
        qT_4567 = vshrq_n_s32 (qT_4567, BITS);

        qT_0123 = vmaxq_s32 (qT_0123, qMin);
        qT_4567 = vmaxq_s32 (qT_4567, qMin);
        qT_0123 = vminq_s32 (qT_0123, qMax);
        qT_4567 = vminq_s32 (qT_4567, qMax);

        dT_0123 = vmovn_s32 (qT_0123);
        dT_4567 = vmovn_s32 (qT_4567);
        qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567));
        dT_01234567 = vmovn_u16 (qT_01234567);

        dMask = vbsl_u8 (dMask, dT_01234567, dDst_01234567);
        vst1_u8 (&dst[x], dMask);
    }
}
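Both Ne10 resize routines above depend on fixed-point constants defined elsewhere in the library. Plausible values, matching the usual 11-bit coefficient scaling (treat these as assumptions rather than the library's exact definitions):

#define INTER_RESIZE_COEF_BITS  11
#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
#define BITS                    (INTER_RESIZE_COEF_BITS * 2)
#define DELTA                   (1 << (BITS - 1))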