// 'do_above' and 'do_left' facilitate branch removal when inlined. static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int do_above, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; if (do_above) { const uint8x16_t A0 = vld1q_u8(above); // top row const uint8x16_t A1 = vld1q_u8(above + 16); const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top const uint16x8_t p1 = vpaddlq_u8(A1); const uint16x8_t p2 = vaddq_u16(p0, p1); const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); const uint16x4_t p4 = vpadd_u16(p3, p3); const uint16x4_t p5 = vpadd_u16(p4, p4); sum_top = vcombine_u16(p5, p5); } if (do_left) { const uint8x16_t L0 = vld1q_u8(left); // left row const uint8x16_t L1 = vld1q_u8(left + 16); const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left const uint16x8_t p1 = vpaddlq_u8(L1); const uint16x8_t p2 = vaddq_u16(p0, p1); const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); const uint16x4_t p4 = vpadd_u16(p3, p3); const uint16x4_t p5 = vpadd_u16(p4, p4); sum_left = vcombine_u16(p5, p5); } if (do_above && do_left) { const uint16x8_t sum = vaddq_u16(sum_left, sum_top); dc0 = vrshrn_n_u16(sum, 6); } else if (do_above) { dc0 = vrshrn_n_u16(sum_top, 5); } else if (do_left) { dc0 = vrshrn_n_u16(sum_left, 5); } else { dc0 = vdup_n_u8(0x80); } { const uint8x16_t dc = vdupq_lane_u8(dc0, 0); int i; for (i = 0; i < 32; ++i) { vst1q_u8(dst + i * stride, dc); vst1q_u8(dst + i * stride + 16, dc); } } }
//Note: it takes size and offset in units of byte static inline int compute_ham_similarity_64(unsigned short* ref, unsigned short* circ_array, int size){ const uint8_t* ref_c=(uint8_t*) ref; const uint8_t* circ_c=(uint8_t*) circ_array; register uint8x16_t a,b; register uint8x16_t c,d,temp; register uint16x8_t acc; register uint i=0,count=0; int j=0; int shift=size&0xF; for(i=0;i<=size-16; i+=16){ j++; a=vld1q_u8(&ref_c[i]); b=vld1q_u8(&circ_c[i]); c=veorq_u8(a,b); acc=vaddq_u16(acc,vpaddlq_u8(vcntq_u8(c))); } count=setbits(acc); a=vld1q_u8(&ref_c[i]); b=vld1q_u8(&circ_c[i]); c=veorq_u8(a,b); c=vcntq_u8(c); for(i=0;i<shift;i++){ count=count+vgetq_lane_u8 (c,i); } return size*8-count; }
/* u16x8 mv mul */ void mw_neon_mv_mul_u16x8(unsigned short * A, int Row, int T, unsigned short * B, unsigned short * C) { int i = 0; int k = 0; uint16x8_t neon_b, neon_c; uint16x8_t neon_a0, neon_a1, neon_a2, neon_a3, neon_a4, neon_a5, neon_a6, neon_a7; uint16x8_t neon_b0, neon_b1, neon_b2, neon_b3, neon_b4, neon_b5, neon_b6, neon_b7; for (i = 0; i < Row; i+=8) { neon_c = vmovq_n_u16(0); for (k = 0; k < T; k+=8) { int j = k * T + i; neon_a0 = vld1q_u16(A + j); j+=Row; neon_a1 = vld1q_u16(A + j); j+=Row; neon_a2 = vld1q_u16(A + j); j+=Row; neon_a3 = vld1q_u16(A + j); j+=Row; neon_a4 = vld1q_u16(A + j); j+=Row; neon_a5 = vld1q_u16(A + j); j+=Row; neon_a6 = vld1q_u16(A + j); j+=Row; neon_a7 = vld1q_u16(A + j); neon_b = vld1q_u16(B + k); neon_b0 = vdupq_n_u16(vgetq_lane_u16(neon_b, 0)); neon_b1 = vdupq_n_u16(vgetq_lane_u16(neon_b, 1)); neon_b2 = vdupq_n_u16(vgetq_lane_u16(neon_b, 2)); neon_b3 = vdupq_n_u16(vgetq_lane_u16(neon_b, 3)); neon_b4 = vdupq_n_u16(vgetq_lane_u16(neon_b, 4)); neon_b5 = vdupq_n_u16(vgetq_lane_u16(neon_b, 5)); neon_b6 = vdupq_n_u16(vgetq_lane_u16(neon_b, 6)); neon_b7 = vdupq_n_u16(vgetq_lane_u16(neon_b, 7)); neon_c = vaddq_u16(vmulq_u16(neon_a0, neon_b0), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a1, neon_b1), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a2, neon_b2), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a3, neon_b3), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a4, neon_b4), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a5, neon_b5), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a6, neon_b6), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a7, neon_b7), neon_c); } vst1q_u16(C + i, neon_c); } }
void test_vaddQu16 (void) { uint16x8_t out_uint16x8_t; uint16x8_t arg0_uint16x8_t; uint16x8_t arg1_uint16x8_t; out_uint16x8_t = vaddq_u16 (arg0_uint16x8_t, arg1_uint16x8_t); }
// 'do_above' and 'do_left' facilitate branch removal when inlined. static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int do_above, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; if (do_above) { const uint8x16_t A = vld1q_u8(above); // top row const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); const uint16x4_t p2 = vpadd_u16(p1, p1); const uint16x4_t p3 = vpadd_u16(p2, p2); sum_top = vcombine_u16(p3, p3); } if (do_left) { const uint8x16_t L = vld1q_u8(left); // left row const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); const uint16x4_t p2 = vpadd_u16(p1, p1); const uint16x4_t p3 = vpadd_u16(p2, p2); sum_left = vcombine_u16(p3, p3); } if (do_above && do_left) { const uint16x8_t sum = vaddq_u16(sum_left, sum_top); dc0 = vrshrn_n_u16(sum, 5); } else if (do_above) { dc0 = vrshrn_n_u16(sum_top, 4); } else if (do_left) { dc0 = vrshrn_n_u16(sum_left, 4); } else { dc0 = vdup_n_u8(0x80); } { const uint8x16_t dc = vdupq_lane_u8(dc0, 0); int i; for (i = 0; i < 16; ++i) { vst1q_u8(dst + i * stride, dc); } } }
// 'do_above' and 'do_left' facilitate branch removal when inlined. static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int do_above, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; if (do_above) { const uint8x8_t A = vld1_u8(above); // top row const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top const uint16x4_t p1 = vpadd_u16(p0, p0); const uint16x4_t p2 = vpadd_u16(p1, p1); sum_top = vcombine_u16(p2, p2); } if (do_left) { const uint8x8_t L = vld1_u8(left); // left border const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left const uint16x4_t p1 = vpadd_u16(p0, p0); const uint16x4_t p2 = vpadd_u16(p1, p1); sum_left = vcombine_u16(p2, p2); } if (do_above && do_left) { const uint16x8_t sum = vaddq_u16(sum_left, sum_top); dc0 = vrshrn_n_u16(sum, 4); } else if (do_above) { dc0 = vrshrn_n_u16(sum_top, 3); } else if (do_left) { dc0 = vrshrn_n_u16(sum_left, 3); } else { dc0 = vdup_n_u8(0x80); } { const uint8x8_t dc = vdup_lane_u8(dc0, 0); int i; for (i = 0; i < 8; ++i) { vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc)); } } }
/* u16x8 add */ void mw_neon_mm_add_u16x8(unsigned short * A, int Row, int Col, unsigned short * B, unsigned short * C) { uint16x8_t neon_a, neon_b, neon_c; int size = Row * Col; int i = 0; int k = 0; for (i = 8; i <= size ; i+=8) { k = i - 8; neon_a = vld1q_u16(A + k); neon_b = vld1q_u16(B + k); neon_c = vaddq_u16(neon_a, neon_b); vst1q_u16(C + k, neon_c); } k = i - 8; for (i = 0; i < size % 8; i++) { C[k + i] = A[k + i] + B[k + i]; } }
static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R, const uint8x8_t G, const uint8x8_t B) { const uint16x8_t r = vmovl_u8(R); const uint16x8_t g = vmovl_u8(G); const uint16x8_t b = vmovl_u8(B); const uint16x4_t r_lo = vget_low_u16(r); const uint16x4_t r_hi = vget_high_u16(r); const uint16x4_t g_lo = vget_low_u16(g); const uint16x4_t g_hi = vget_high_u16(g); const uint16x4_t b_lo = vget_low_u16(b); const uint16x4_t b_hi = vget_high_u16(b); const uint32x4_t tmp0_lo = vmull_n_u16( r_lo, 16839u); const uint32x4_t tmp0_hi = vmull_n_u16( r_hi, 16839u); const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, 33059u); const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, 33059u); const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, 6420u); const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, 6420u); const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, 16), vrshrn_n_u32(tmp2_hi, 16)); const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(16)); return vqmovn_u16(Y2); }
void UpsampleRgbaLinePairNEON(const uint8_t *top_y, const uint8_t *bottom_y, const uint8_t *top_u, const uint8_t *top_v, const uint8_t *cur_u, const uint8_t *cur_v, uint8_t *top_dst, uint8_t *bottom_dst, int len) { int block; uint8_t uv_buf[2 * 32 + 15]; uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); const int uv_len = (len + 1) >> 1; const int num_blocks = (uv_len - 1) >> 3; const int leftover = uv_len - num_blocks * 8; const int last_pos = 1 + 16 * num_blocks; const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; const int16x4_t cf16 = vld1_s16(coef); const int32x2_t cf32 = vmov_n_s32(76283); const uint8x8_t u16 = vmov_n_u8(16); const uint8x8_t u128 = vmov_n_u8(128); for (block = 0; block < num_blocks; ++block) { { uint8x8_t a = vld1_u8(top_u); uint8x8_t b = vld1_u8(top_u + 1); uint8x8_t c = vld1_u8(cur_u); uint8x8_t d = vld1_u8(cur_u + 1); uint16x8_t al = vshll_n_u8(a, 1); uint16x8_t bl = vshll_n_u8(b, 1); uint16x8_t cl = vshll_n_u8(c, 1); uint16x8_t dl = vshll_n_u8(d, 1); uint8x8_t diag1, diag2; uint16x8_t sl; sl = vaddl_u8(a, b); sl = vaddw_u8(sl, c); sl = vaddw_u8(sl, d); al = vaddq_u16(sl, al); bl = vaddq_u16(sl, bl); al = vaddq_u16(al, dl); bl = vaddq_u16(bl, cl); diag2 = vshrn_n_u16(al, 3); diag1 = vshrn_n_u16(bl, 3); a = vrhadd_u8(a, diag1); b = vrhadd_u8(b, diag2); c = vrhadd_u8(c, diag2); d = vrhadd_u8(d, diag1); { const uint8x8x2_t a_b = {{ a, b }}; const uint8x8x2_t c_d = {{ c, d }}; vst2_u8(r_uv, a_b); vst2_u8(r_uv + 32, c_d); } } { uint8x8_t a = vld1_u8(top_v); uint8x8_t b = vld1_u8(top_v + 1); uint8x8_t c = vld1_u8(cur_v); uint8x8_t d = vld1_u8(cur_v + 1); uint16x8_t al = vshll_n_u8(a, 1); uint16x8_t bl = vshll_n_u8(b, 1); uint16x8_t cl = vshll_n_u8(c, 1); uint16x8_t dl = vshll_n_u8(d, 1); uint8x8_t diag1, diag2; uint16x8_t sl; sl = vaddl_u8(a, b); sl = vaddw_u8(sl, c); sl = vaddw_u8(sl, d); al = vaddq_u16(sl, al); bl = vaddq_u16(sl, bl); al = vaddq_u16(al, dl); bl = vaddq_u16(bl, cl); diag2 = vshrn_n_u16(al, 3); diag1 = vshrn_n_u16(bl, 3); a = vrhadd_u8(a, diag1); b = vrhadd_u8(b, diag2); c = vrhadd_u8(c, diag2); d = vrhadd_u8(d, diag1); { const uint8x8x2_t a_b = {{ a, b }}; const uint8x8x2_t c_d = {{ c, d }}; vst2_u8(r_uv + 16, a_b); vst2_u8(r_uv + 16 + 32, c_d); } } { if (top_y) { { int i; for (i = 0; i < 16; i += 8) { int off = ((16 * block + 1) + i) * 4; uint8x8_t y = vld1_u8(top_y + (16 * block + 1) + i); uint8x8_t u = vld1_u8((r_uv) + i); uint8x8_t v = vld1_u8((r_uv) + i + 16); int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); int16x8_t ud = vshlq_n_s16(uu, 1); int16x8_t vd = vshlq_n_s16(vv, 1); int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), vget_low_s16(vd), cf16, 0); int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), vget_high_s16(vd), cf16, 0); int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16)); int32x4_t vl = vmovl_s16(vget_low_s16(vv)); int32x4_t vh = vmovl_s16(vget_high_s16(vv)); int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1); int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1); int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2); int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2); int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16)); int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), vget_low_s16(ud), cf16, 3); int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), vget_high_s16(ud), cf16, 3); int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16)); int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr)); int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr)); int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc)); int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc)); int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub)); int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub)); rl = vmulq_lane_s32(rl, cf32, 0); rh = vmulq_lane_s32(rh, cf32, 0); gl = vmulq_lane_s32(gl, cf32, 0); gh = vmulq_lane_s32(gh, cf32, 0); bl = vmulq_lane_s32(bl, cf32, 0); bh = vmulq_lane_s32(bh, cf32, 0); y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16))); u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16))); v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16))); do { const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }}; vst4_u8(top_dst + off, r_g_b_v255); } while (0); } } } if (bottom_y) { { int i; for (i = 0; i < 16; i += 8) { int off = ((16 * block + 1) + i) * 4; uint8x8_t y = vld1_u8(bottom_y + (16 * block + 1) + i); uint8x8_t u = vld1_u8(((r_uv) + 32) + i); uint8x8_t v = vld1_u8(((r_uv) + 32) + i + 16); int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); int16x8_t ud = vshlq_n_s16(uu, 1); int16x8_t vd = vshlq_n_s16(vv, 1); int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), vget_low_s16(vd), cf16, 0); int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), vget_high_s16(vd), cf16, 0); int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16)); int32x4_t vl = vmovl_s16(vget_low_s16(vv)); int32x4_t vh = vmovl_s16(vget_high_s16(vv)); int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1); int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1); int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2); int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2); int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16)); int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), vget_low_s16(ud), cf16, 3); int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), vget_high_s16(ud), cf16, 3); int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16)); int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr)); int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr)); int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc)); int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc)); int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub)); int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub)); rl = vmulq_lane_s32(rl, cf32, 0); rh = vmulq_lane_s32(rh, cf32, 0); gl = vmulq_lane_s32(gl, cf32, 0); gh = vmulq_lane_s32(gh, cf32, 0); bl = vmulq_lane_s32(bl, cf32, 0); bh = vmulq_lane_s32(bh, cf32, 0); y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16))); u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16))); v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16))); do { const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }}; vst4_u8(bottom_dst + off, r_g_b_v255); } while (0); } } } } } }
/* u16x8 mm mul */ void mw_neon_mm_mul_u16x8(unsigned short * A, int Row, int T, unsigned short * B, int Col, unsigned short * C) { int i, k, j; uint16x8_t neon_b, neon_c; uint16x8_t neon_a0, neon_a1, neon_a2, neon_a3, neon_a4, neon_a5, neon_a6, neon_a7; uint16x8_t neon_b0, neon_b1, neon_b2, neon_b3, neon_b4, neon_b5, neon_b6, neon_b7; for (i = 0; i < Row; i+=8) { for (k = 0; k < Col; k+=1) { neon_c = vmovq_n_u16(0); for (j = 0; j < T; j+=8) { int j_T = j * T + i; int k_Row = k * Row; neon_a0 = vld1q_u16(A + j_T); j_T+=Row; neon_a1 = vld1q_u16(A + j_T); j_T+=Row; neon_a2 = vld1q_u16(A + j_T); j_T+=Row; neon_a3 = vld1q_u16(A + j_T); j_T+=Row; neon_a4 = vld1q_u16(A + j_T); j_T+=Row; neon_a5 = vld1q_u16(A + j_T); j_T+=Row; neon_a6 = vld1q_u16(A + j_T); j_T+=Row; neon_a7 = vld1q_u16(A + j_T); neon_b = vld1q_u16(B + k_Row + j); neon_b0 = vdupq_n_u16(vgetq_lane_u16(neon_b, 0)); neon_b1 = vdupq_n_u16(vgetq_lane_u16(neon_b, 1)); neon_b2 = vdupq_n_u16(vgetq_lane_u16(neon_b, 2)); neon_b3 = vdupq_n_u16(vgetq_lane_u16(neon_b, 3)); neon_b4 = vdupq_n_u16(vgetq_lane_u16(neon_b, 4)); neon_b5 = vdupq_n_u16(vgetq_lane_u16(neon_b, 5)); neon_b6 = vdupq_n_u16(vgetq_lane_u16(neon_b, 6)); neon_b7 = vdupq_n_u16(vgetq_lane_u16(neon_b, 7)); neon_c = vaddq_u16(vmulq_u16(neon_a0, neon_b0), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a1, neon_b1), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a2, neon_b2), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a3, neon_b3), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a4, neon_b4), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a5, neon_b5), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a6, neon_b6), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a7, neon_b7), neon_c); vst1q_lane_u16(C + k_Row + i, neon_c, 0); vst1q_lane_u16(C + k_Row + i + 1, neon_c, 1); vst1q_lane_u16(C + k_Row + i + 2, neon_c, 2); vst1q_lane_u16(C + k_Row + i + 3, neon_c, 3); vst1q_lane_u16(C + k_Row + i + 4, neon_c, 4); vst1q_lane_u16(C + k_Row + i + 5, neon_c, 5); vst1q_lane_u16(C + k_Row + i + 6, neon_c, 6); vst1q_lane_u16(C + k_Row + i + 7, neon_c, 7); } } } }
SIMD_INLINE uint16x8_t ReduceColBody(const uint8_t *src) { const uint8x8x2_t t01 = vld2_u8(src - 1); const uint8x8x2_t t23 = vld2_u8(src + 1); return vaddq_u16(vaddl_u8(t01.val[0], t23.val[1]), vmulq_u16(vaddl_u8(t01.val[1], t23.val[0]), K16_0003)); }
template<bool align> SIMD_INLINE uint16x8_t ReduceColBody(const uint8_t * src) { uint8x16_t t01 = Load<false>(src - 1); uint8x16_t t12 = Load<align>(src); return vaddq_u16(vpaddlq_u8(t01), vpaddlq_u8(t12)); }
template<bool align> SIMD_INLINE uint16x8_t ReduceColNose(const uint8_t * src) { uint8x16_t t12 = Load<align>(src); uint8x16_t t01 = LoadBeforeFirst<1>(t12); return vaddq_u16(vpaddlq_u8(t01), vpaddlq_u8(t12)); }
template <> SIMD_INLINE uint16x8_t DivideBy16<true>(uint16x8_t value) { return vshrq_n_u16(vaddq_u16(value, K16_0008), 4); }
SIMD_INLINE uint16x8_t DivideBy64(uint16x8_t value) { return vshrq_n_u16(vaddq_u16(value, K16_0020), 6); }
SIMD_INLINE uint16x8_t ReduceColNose(const uint8_t * src) { const uint8x8x2_t t01 = Deinterleave(LoadBeforeFirst<1>(vld1q_u8(src))); const uint8x8x2_t t23 = vld2_u8(src + 1); return vaddq_u16(vaddl_u8(t01.val[0], t23.val[1]), vmulq_u16(vaddl_u8(t01.val[1], t23.val[0]), K16_0003)); }
inline uint16x8_t vaddq(const uint16x8_t & v0, const uint16x8_t & v1) { return vaddq_u16(v0, v1); }
template <> SIMD_INLINE uint16x8_t ReduceColTail<false>(const uint8_t *src) { const uint8x8x2_t t01 = vld2_u8(src - 1); const uint8x8x2_t t23 = Deinterleave(LoadAfterLast<1>(LoadAfterLast<1>(vld1q_u8(src - 1)))); return vaddq_u16(vaddl_u8(t01.val[0], t23.val[1]), vmulq_u16(vaddl_u8(t01.val[1], t23.val[0]), K16_0003)); }
int neon_new(DATA32* _p0, DATA32* _p1, DATA32* _p2, DATA32* _p3, DATA32* _ax, DATA32 _ay, DATA32* result, int len) { int ay = _ay; int i; DATA32* pbuf = result; uint16x4_t ay_16x4; uint16x4_t p0_16x4; uint16x4_t p2_16x4; uint16x8_t ax_16x8; uint16x8_t p0_p2_16x8; uint16x8_t p1_p3_16x8; uint16x8_t x255_16x8; uint32x2_t p0_p2_32x2; uint32x2_t p1_p3_32x2; uint32x2_t res_32x2; uint8x8_t p0_p2_8x8; uint8x8_t p1_p3_8x8; uint8x8_t p2_8x8; uint16x4_t temp_16x4; ay_16x4 = vdup_n_u16(ay); x255_16x8 = vdupq_n_u16(0xff); for(i = 0; i < len; i++) { DATA32 p0 = *_p0++; DATA32 p1 = *_p1++; DATA32 p2 = *_p2++; DATA32 p3 = *_p3++; int ax = *_ax++; if (p0 | p1 | p2 | p3) { ax_16x8 = vdupq_n_u16(ax); p0_p2_32x2 = vset_lane_u32(p0, p0_p2_32x2, 0); p0_p2_32x2 = vset_lane_u32(p2, p0_p2_32x2, 1); p1_p3_32x2 = vset_lane_u32(p1, p1_p3_32x2, 0); p1_p3_32x2 = vset_lane_u32(p3, p1_p3_32x2, 1); p0_p2_8x8 = vreinterpret_u8_u32(p0_p2_32x2); p1_p3_8x8 = vreinterpret_u8_u32(p1_p3_32x2); p1_p3_16x8 = vmovl_u8(p1_p3_8x8); p0_p2_16x8 = vmovl_u8(p0_p2_8x8); p1_p3_16x8 = vsubq_u16(p1_p3_16x8, p0_p2_16x8); p1_p3_16x8 = vmulq_u16(p1_p3_16x8, ax_16x8); p1_p3_16x8 = vshrq_n_u16(p1_p3_16x8, 8); p1_p3_16x8 = vaddq_u16(p1_p3_16x8, p0_p2_16x8); p1_p3_16x8 = vandq_u16(p1_p3_16x8, x255_16x8); p0_16x4 = vget_low_u16(p1_p3_16x8); p2_16x4 = vget_high_u16(p1_p3_16x8); p2_16x4 = vsub_u16(p2_16x4, p0_16x4); p2_16x4 = vmul_u16(p2_16x4, ay_16x4); p2_16x4 = vshr_n_u16(p2_16x4, 8); p2_16x4 = vadd_u16(p2_16x4, p0_16x4); p1_p3_16x8 = vcombine_u16(temp_16x4, p2_16x4); p2_8x8 = vmovn_u16(p1_p3_16x8); res_32x2 = vreinterpret_u32_u8(p2_8x8); vst1_lane_u32(pbuf++, res_32x2, 1); } else *pbuf++ = p0; } return 0; }