static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
                                 int src_width, int do_store) {
  int i;
  for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {
    const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);
    const uint16x8_t R = vpaddlq_u8(RGB.val[2]);  // pair-wise adds
    const uint16x8_t G = vpaddlq_u8(RGB.val[1]);
    const uint16x8_t B = vpaddlq_u8(RGB.val[0]);
    int16x8_t U_tmp, V_tmp;
    CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp);
    {
      const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1);
      const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1);
      if (do_store) {
        vst1_u8(u, U);
        vst1_u8(v, V);
      } else {
        const uint8x8_t prev_u = vld1_u8(u);
        const uint8x8_t prev_v = vld1_u8(v);
        vst1_u8(u, vrhadd_u8(U, prev_u));
        vst1_u8(v, vrhadd_u8(V, prev_v));
      }
    }
  }
  if (i < src_width) {  // left-over
    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
  }
}
// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A0 = vld1q_u8(above);  // top row
    const uint8x16_t A1 = vld1q_u8(above + 16);
    const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top
    const uint16x8_t p1 = vpaddlq_u8(A1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_top = vcombine_u16(p5, p5);
  }

  if (do_left) {
    const uint8x16_t L0 = vld1q_u8(left);  // left column
    const uint8x16_t L1 = vld1q_u8(left + 16);
    const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left
    const uint16x8_t p1 = vpaddlq_u8(L1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_left = vcombine_u16(p5, p5);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 6);  // round(sum / 64)
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 5);  // round(sum / 32)
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 5);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 32; ++i) {
      vst1q_u8(dst + i * stride, dc);
      vst1q_u8(dst + i * stride + 16, dc);
    }
  }
}
/* Compile-only test (in the style of GCC's NEON intrinsics test suite):
 * the argument is deliberately left uninitialized because only the types
 * and the call signature of vpaddlq_u8 are being checked. */
void test_vpaddlQu8 (void)
{
  uint16x8_t out_uint16x8_t;
  uint8x16_t arg0_uint8x16_t;

  out_uint16x8_t = vpaddlq_u8 (arg0_uint8x16_t);
}
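The compile test above exercises only the intrinsic's signature. As a minimal runnable sketch (assuming an ARM toolchain with `arm_neon.h`; the program itself is ours, not from the original source), the following prints the eight widened 16-bit sums that `vpaddlq_u8` produces from sixteen input bytes:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t in[16] = {1, 2,  3,  4,  5,  6,  7,  8,
                          9, 10, 11, 12, 13, 14, 15, 16};
  const uint8x16_t v = vld1q_u8(in);
  const uint16x8_t sums = vpaddlq_u8(v);  // adjacent byte pairs, widened to u16
  uint16_t out[8];
  vst1q_u16(out, sums);
  for (int i = 0; i < 8; ++i)
    printf("%u ", out[i]);  // prints: 3 7 11 15 19 23 27 31
  printf("\n");
  return 0;
}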
inline int v_signmask(const v_uint8x16& a)
{
    // Per-lane shift amounts 0..7, repeated for both halves of the vector.
    int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
    // Move each lane's sign bit to bit position (lane index & 7).
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
    // The bits are disjoint, so cascaded pairwise adds merge them:
    // each 64-bit half ends up holding one byte of the mask.
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
}
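For reference, a scalar sketch of what the lane shuffling above computes (the function name and loop are illustrative, not part of OpenCV):

/* Scalar equivalent of v_signmask: bit i of the result is the MSB of byte i. */
static int signmask_scalar(const unsigned char v[16]) {
  int mask = 0;
  for (int i = 0; i < 16; ++i)
    mask |= (v[i] >> 7) << i;
  return mask;
}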
// Note: 'size' is in bytes; the final 'size & 0xF' bytes are handled as a tail.
static inline int compute_ham_similarity_64(unsigned short* ref,
                                            unsigned short* circ_array,
                                            int size) {
  const uint8_t* ref_c = (const uint8_t*)ref;
  const uint8_t* circ_c = (const uint8_t*)circ_array;
  uint8x16_t a, b, c;
  // Must start at zero: it accumulates pair-wise popcounts. Each 16-bit lane
  // grows by at most 16 per iteration, so this is safe for inputs < 64 KiB.
  uint16x8_t acc = vdupq_n_u16(0);
  int i, count;
  const int shift = size & 0xF;  // tail length in bytes

  for (i = 0; i + 16 <= size; i += 16) {
    a = vld1q_u8(&ref_c[i]);
    b = vld1q_u8(&circ_c[i]);
    c = veorq_u8(a, b);  // differing bits
    acc = vaddq_u16(acc, vpaddlq_u8(vcntq_u8(c)));
  }
  count = setbits(acc);  // horizontal sum of the eight 16-bit lanes

  if (shift) {
    // The tail load still reads a full 16 bytes, so the caller must ensure
    // the buffers are padded; only the first 'shift' lanes are summed.
    // (vgetq_lane_u8 needs a compile-time lane index, so spill to memory.)
    uint8_t tail[16];
    a = vld1q_u8(&ref_c[i]);
    b = vld1q_u8(&circ_c[i]);
    vst1q_u8(tail, vcntq_u8(veorq_u8(a, b)));
    for (i = 0; i < shift; i++) count += tail[i];
  }
  return size * 8 - count;  // similarity = total bits minus differing bits
}
uint32x2_t FORCE_INLINE popcnt_neon_qreg(const uint8x16_t reg) {
  const uint8x16_t pcnt = vcntq_u8(reg);   // per-byte popcount
  const uint16x8_t t0 = vpaddlq_u8(pcnt);  // widen: 16 x u8 -> 8 x u16
  const uint32x4_t t1 = vpaddlq_u16(t0);   // widen: 8 x u16 -> 4 x u32
  const uint32x2_t t2 = vadd_u32(vget_low_u32(t1), vget_high_u32(t1));
  return t2;
}
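A caller still has to collapse the returned uint32x2_t to a scalar. One way to finish the reduction (a sketch; the wrapper name is made up here) is a final vpadd_u32:

/* Hypothetical wrapper, not from the original source: popcount of 16 bytes. */
static inline uint32_t popcnt_u128(const uint8_t bytes[16]) {
  const uint32x2_t halves = popcnt_neon_qreg(vld1q_u8(bytes));
  const uint32x2_t total = vpadd_u32(halves, halves);  // fold the two lanes
  return vget_lane_u32(total, 0);
}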
// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A = vld1q_u8(above);  // top row
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
  }

  if (do_left) {
    const uint8x16_t L = vld1q_u8(left);  // left column
    const uint16x8_t p0 = vpaddlq_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_left = vcombine_u16(p3, p3);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);  // round(sum / 32)
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 4);  // round(sum / 16)
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * stride, dc);
    }
  }
}
uint64_t popcnt_neon_vcnt(const uint8_t* data, const size_t size)
{
    const size_t chunk_size = 16 * 4 * 2;

    const uint8_t* ptr = data;

    const size_t n = size / chunk_size;
    const size_t k = size % chunk_size;

    uint32x4_t sum = vcombine_u32(vcreate_u32(0), vcreate_u32(0));

    for (size_t i = 0; i < n; i++, ptr += chunk_size) {
        uint8x16x4_t input0 = vld4q_u8(ptr + 0 * 16 * 4);
        uint8x16x4_t input1 = vld4q_u8(ptr + 1 * 16 * 4);

        uint8x16_t t0 = vcntq_u8(input0.val[0]);
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[1]));
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[2]));
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[3]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[0]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[1]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[2]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[3]));

        const uint16x8_t t1 = vpaddlq_u8(t0);
        sum = vpadalq_u16(sum, t1);
    }

    uint32_t scalar = 0;
    uint32_t tmp[4];
    vst1q_u32(tmp, sum);
    for (int i = 0; i < 4; i++) {
        scalar += tmp[i];
    }

    for (size_t j = 0; j < k; j++) {
        scalar += lookup8bit[ptr[j]];
    }

    return scalar;
}
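The tail loop relies on a lookup8bit table that is not shown above. Its original definition is elided; one plausible form (an assumption, and any 256-entry byte-popcount table would do) can be built once at startup:

/* Assumed companion table: popcount of every byte value. This sketch is
 * ours; the original source defines lookup8bit elsewhere. */
static uint8_t lookup8bit[256];

static void init_lookup8bit(void) {
  for (int i = 0; i < 256; i++) {
    int bits = 0;
    for (int v = i; v != 0; v >>= 1) bits += v & 1;
    lookup8bit[i] = (uint8_t)bits;
  }
}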
size_t mempopcnt(const void *s, size_t len)
{
    uint8x16_t v_0;
    uint8x16_t c;
    uint32x4_t v_sum;
    uint32x2_t v_tsum;
    unsigned char *p;
    size_t r;
    unsigned shift;

    prefetch(s);

    // TODO: do this in 64 bit? the mem model seems more that way...
    v_0 = (uint8x16_t){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
    v_sum = (uint32x4_t)v_0;
    p = (unsigned char *)ALIGN_DOWN(s, SOVUCQ);
    shift = ALIGN_DOWN_DIFF(s, SOVUCQ);
    c = *(const uint8x16_t *)p;
    if (HOST_IS_BIGENDIAN)
        c = neon_simple_alignq(v_0, c, SOVUCQ - shift);
    else
        c = neon_simple_alignq(c, v_0, shift);

    if (len >= SOVUCQ || len + shift >= SOVUCQ) {
        p += SOVUCQ;
        len -= SOVUCQ - shift;
        v_sum = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));

        while (len >= SOVUCQ * 2) {
            uint8x16_t v_sumb = v_0;

            r = len / (SOVUCQ * 2);
            r = r > 15 ? 15 : r;
            len -= r * SOVUCQ * 2;
            /*
             * NEON has a vector popcnt instruction, so no compression.
             * We trust the speed given in the handbook (adding more
             * instructions would not make it faster), 1-2 cycles.
             */
            for (; r; r--, p += SOVUCQ * 2) {
                c = *(const uint8x16_t *)p;
                v_sumb = vaddq_u8(v_sumb, vcntq_u8(c));
                c = *((const uint8x16_t *)(p + SOVUCQ));
                v_sumb = vaddq_u8(v_sumb, vcntq_u8(c));
            }
            v_sum = vpadalq_u16(v_sum, vpaddlq_u8(v_sumb));
        }
        if (len >= SOVUCQ) {
            c = *(const uint8x16_t *)p;
            p += SOVUCQ;
            v_sum = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));
            len -= SOVUCQ;
        }
        if (len)
            c = *(const uint8x16_t *)p;
    }
    if (len) {
        if (HOST_IS_BIGENDIAN)
            c = neon_simple_alignq(c, v_0, SOVUCQ - len);
        else
            c = neon_simple_alignq(v_0, c, len);
        v_sum = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));
    }

    v_tsum = vpadd_u32(vget_high_u32(v_sum), vget_low_u32(v_sum));
    v_tsum = vpadd_u32(v_tsum, v_tsum);
    return vget_lane_u32(v_tsum, 0);
}
template<bool align> SIMD_INLINE uint16x8_t ReduceColBody(const uint8_t * src)
{
    uint8x16_t t01 = Load<false>(src - 1);
    uint8x16_t t12 = Load<align>(src);
    return vaddq_u16(vpaddlq_u8(t01), vpaddlq_u8(t12));
}
template<bool align> SIMD_INLINE uint16x8_t ReduceColNose(const uint8_t * src)
{
    uint8x16_t t12 = Load<align>(src);
    uint8x16_t t01 = LoadBeforeFirst<1>(t12);
    return vaddq_u16(vpaddlq_u8(t01), vpaddlq_u8(t12));
}
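Together these two helpers compute the horizontal part of a [1, 2, 1] reduction at even pixel positions: the two pairwise-add results overlap by one byte, so each 16-bit lane i ends up holding src[2i-1] + 2*src[2i] + src[2i+1]. A scalar sketch of the same computation (the function and its names are illustrative only):

/* Scalar equivalent of ReduceColBody for one vector of 8 outputs.
 * Like ReduceColBody, it reads one byte before 'src'. */
static void reduce_col_scalar(uint16_t dst[8], const uint8_t* src) {
  for (int i = 0; i < 8; ++i)
    dst[i] = (uint16_t)(src[2 * i - 1] + 2 * src[2 * i] + src[2 * i + 1]);
}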
inline ResultType operator()(Iterator1 a, Iterator2 b, size_t size) const
{
    ResultType result = 0;
#if (defined __GNUC__ || defined __clang__) && defined USE_SSE
#ifdef __ARM_NEON__
    {
        uint32x4_t bits = vmovq_n_u32(0);
        for (size_t i = 0; i < size; i += 16) {
            uint8x16_t A_vec = vld1q_u8(a + i);
            uint8x16_t B_vec = vld1q_u8(b + i);
            uint8x16_t AxorB = veorq_u8(A_vec, B_vec);
            uint8x16_t bitsSet = vcntq_u8(AxorB);
            uint16x8_t bitSet8 = vpaddlq_u8(bitsSet);
            uint32x4_t bitSet4 = vpaddlq_u16(bitSet8);
            bits = vaddq_u32(bits, bitSet4);
        }
        uint64x2_t bitSet2 = vpaddlq_u32(bits);
        result = vgetq_lane_s32(vreinterpretq_s32_u64(bitSet2), 0);
        result += vgetq_lane_s32(vreinterpretq_s32_u64(bitSet2), 2);
    }
#else
    {
        // For portability, just use an unsigned long long and
        // __builtin_popcountll (see the GCC docs for __builtin_popcountll).
        typedef unsigned long long pop_t;
        const size_t modulo = size % sizeof(pop_t);
        const pop_t* a2 = reinterpret_cast<const pop_t*>(a);
        const pop_t* b2 = reinterpret_cast<const pop_t*>(b);
        const pop_t* a2_end = a2 + (size / sizeof(pop_t));
        for (; a2 != a2_end; ++a2, ++b2) result += __builtin_popcountll((*a2) ^ (*b2));
        if (modulo) {
            // When size is not divisible by sizeof(pop_t),
            // mask off the stray bits at the end.
            pop_t a_final = 0, b_final = 0;
            memcpy(&a_final, a2, modulo);
            memcpy(&b_final, b2, modulo);
            result += __builtin_popcountll(a_final ^ b_final);
        }
    }
#endif  // NEON
    return result;
#endif
#ifdef PLATFORM_64_BIT
    if (size % 64 == 0) {
        const uint64_t* pa = reinterpret_cast<const uint64_t*>(a);
        const uint64_t* pb = reinterpret_cast<const uint64_t*>(b);
        size /= (sizeof(uint64_t) / sizeof(unsigned char));
        for (size_t i = 0; i < size; ++i, ++pa, ++pb) {
            result += popcnt64(*pa ^ *pb);
        }
    }
    else {
        const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
        const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
        size /= (sizeof(uint32_t) / sizeof(unsigned char));
        for (size_t i = 0; i < size; ++i, ++pa, ++pb) {
            result += popcnt32(*pa ^ *pb);
        }
    }
#else
    const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
    const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
    size /= (sizeof(uint32_t) / sizeof(unsigned char));
    for (size_t i = 0; i < size; ++i, ++pa, ++pb) {
        result += popcnt32(*pa ^ *pb);
    }
#endif
    return result;
}
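For testing any of the vectorized Hamming paths above, a plain scalar reference is useful. The following sketch (ours, not from the original source) counts differing bits byte by byte:

/* Scalar reference for validating the NEON/popcount paths (illustrative). */
static unsigned hamming_ref(const unsigned char* a, const unsigned char* b,
                            size_t size) {
  unsigned bits = 0;
  for (size_t i = 0; i < size; ++i) {
    unsigned char x = a[i] ^ b[i];
    while (x) { bits += x & 1u; x >>= 1; }
  }
  return bits;
}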