// Note: size is given in bytes.
static inline int compute_ham_similarity_64(unsigned short* ref, unsigned short* circ_array, int size)
{
    const uint8_t* ref_c  = (const uint8_t*) ref;
    const uint8_t* circ_c = (const uint8_t*) circ_array;
    uint8x16_t a, b, c;
    uint16x8_t acc = vdupq_n_u16(0);   /* accumulator must start at zero */
    unsigned int i = 0, count = 0;
    int shift = size & 0xF;            /* bytes left over after the 16-byte main loop */

    /* Main loop: XOR 16 bytes at a time, popcount per byte, widen and accumulate. */
    for (i = 0; i + 16 <= (unsigned int)size; i += 16) {
        a = vld1q_u8(&ref_c[i]);
        b = vld1q_u8(&circ_c[i]);
        c = veorq_u8(a, b);
        acc = vaddq_u16(acc, vpaddlq_u8(vcntq_u8(c)));
    }
    count = setbits(acc);              /* horizontal sum of the accumulator */

    /* Tail: the last (size % 16) bytes. The lane index of vgetq_lane_u8 must be a
     * compile-time constant, so spill the per-byte counts to memory and sum them in
     * scalar code. Note that this still loads a full 16-byte vector at offset i. */
    if (shift) {
        uint8_t tail[16];
        a = vld1q_u8(&ref_c[i]);
        b = vld1q_u8(&circ_c[i]);
        vst1q_u8(tail, vcntq_u8(veorq_u8(a, b)));
        for (int t = 0; t < shift; t++)
            count += tail[t];
    }
    /* Similarity: total number of bits minus the number of differing bits. */
    return size * 8 - count;
}
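The setbits() helper called above is not shown. A minimal sketch of what such a horizontal sum over the uint16x8_t accumulator could look like (the name and signature are taken from the call site; the body is an assumption):

#include <arm_neon.h>

/* Hypothetical body for the setbits() helper assumed by compute_ham_similarity_64:
 * widen the eight 16-bit partial counts pairwise, then add the two 64-bit lanes. */
static inline unsigned int setbits(uint16x8_t acc)
{
    uint32x4_t s32 = vpaddlq_u16(acc);   /* 8 x u16 -> 4 x u32 */
    uint64x2_t s64 = vpaddlq_u32(s32);   /* 4 x u32 -> 2 x u64 */
    return (unsigned int)(vgetq_lane_u64(s64, 0) + vgetq_lane_u64(s64, 1));
}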
uint32x2_t FORCE_INLINE popcnt_neon_qreg(const uint8x16_t reg)
{
    const uint8x16_t pcnt = vcntq_u8(reg);
    const uint16x8_t t0   = vpaddlq_u8(pcnt);
    const uint32x4_t t1   = vpaddlq_u16(t0);
    const uint32x2_t t2   = vadd_u32(vget_low_u32(t1), vget_high_u32(t1));
    return t2;
}
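A usage sketch for the routine above (it assumes popcnt_neon_qreg and FORCE_INLINE are visible in the same translation unit; the test values are illustrative only). The returned vector holds two 32-bit partial sums, so the caller adds the lanes:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint8_t block[16];
    for (int i = 0; i < 16; i++)
        block[i] = 0x0F;                                  /* 4 bits set per byte, 64 in total */
    uint32x2_t pair = popcnt_neon_qreg(vld1q_u8(block));  /* two 32-bit partial sums */
    uint32_t bits = vget_lane_u32(pair, 0) + vget_lane_u32(pair, 1);
    printf("%u\n", bits);                                 /* prints 64 */
    return 0;
}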
uint64_t popcnt_neon_vcnt(const uint8_t* data, const size_t size)
{
    const size_t chunk_size = 16 * 4 * 2;   /* 128 bytes per iteration */

    uint8_t* ptr = const_cast<uint8_t*>(data);

    const size_t n = size / chunk_size;
    const size_t k = size % chunk_size;

    uint32x4_t sum = vcombine_u32(vcreate_u32(0), vcreate_u32(0));

    for (size_t i = 0; i < n; i++, ptr += chunk_size) {
        uint8x16x4_t input0 = vld4q_u8(ptr + 0 * 16 * 4);
        uint8x16x4_t input1 = vld4q_u8(ptr + 1 * 16 * 4);

        /* Accumulate eight per-byte popcounts in 8-bit lanes; the maximum per lane
         * is 8 * 8 = 64, so this cannot overflow a uint8_t. */
        uint8x16_t t0 = vcntq_u8(input0.val[0]);
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[1]));
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[2]));
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[3]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[0]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[1]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[2]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[3]));

        /* Widen pairwise to 16-bit, then fold into the 32-bit accumulator. */
        const uint16x8_t t1 = vpaddlq_u8(t0);
        sum = vpadalq_u16(sum, t1);
    }

    /* Horizontal sum of the four 32-bit lanes. */
    uint32_t scalar = 0;
    uint32_t tmp[4];
    vst1q_u32(tmp, sum);
    for (int i = 0; i < 4; i++) {
        scalar += tmp[i];
    }

    /* Tail: the remaining size % 128 bytes go through a per-byte lookup table. */
    for (size_t j = 0; j < k; j++) {
        scalar += lookup8bit[ptr[j]];
    }

    return scalar;
}
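The tail loop above relies on a 256-entry per-byte popcount table named lookup8bit that is not shown. A sketch of how such a table could be filled (only the name comes from the code above; the initializer is an assumption):

#include <stdint.h>

static uint8_t lookup8bit[256];   /* lookup8bit[v] = number of set bits in byte value v */

static void init_lookup8bit(void)
{
    for (int v = 0; v < 256; v++) {
        uint8_t n = 0;
        for (int b = 0; b < 8; b++)
            n += (uint8_t)((v >> b) & 1);   /* count the set bits of v */
        lookup8bit[v] = n;
    }
}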
/*
 * Helpers assumed by this function (not defined here):
 *   SOVUCQ                           - presumably sizeof(uint8x16_t), i.e. 16
 *   ALIGN_DOWN(p, n), ALIGN_DOWN_DIFF(p, n)
 *                                    - presumably round p down to an n-byte boundary,
 *                                      and the number of bytes p lies above it
 *   neon_simple_alignq(a, b, n)      - presumably a byte-granular align/extract of two q registers
 *   prefetch, HOST_IS_BIGENDIAN      - platform hint / configuration macros
 */
size_t mempopcnt(const void *s, size_t len)
{
    uint8x16_t v_0;
    uint8x16_t c;
    uint32x4_t v_sum;
    uint32x2_t v_tsum;
    unsigned char *p;
    size_t r;
    unsigned shift;

    prefetch(s);

    // TODO: do this in 64 bit? the mem model seems more that way...
    v_0 = (uint8x16_t){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
    v_sum = (uint32x4_t)v_0;

    /* Align the start pointer down and mask off the bytes before s. */
    p = (unsigned char *)ALIGN_DOWN(s, SOVUCQ);
    shift = ALIGN_DOWN_DIFF(s, SOVUCQ);
    c = *(const uint8x16_t *)p;
    if(HOST_IS_BIGENDIAN)
        c = neon_simple_alignq(v_0, c, SOVUCQ - shift);
    else
        c = neon_simple_alignq(c, v_0, shift);

    if(len >= SOVUCQ || len + shift >= SOVUCQ)
    {
        p += SOVUCQ;
        len -= SOVUCQ - shift;
        v_sum = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));

        while(len >= SOVUCQ * 2)
        {
            uint8x16_t v_sumb = v_0;

            /* Process at most 15 double-vector blocks per inner run so the
             * 8-bit accumulator v_sumb cannot overflow. */
            r = len / (SOVUCQ * 2);
            r = r > 15 ? 15 : r;
            len -= r * SOVUCQ * 2;
            /*
             * NEON has a vector popcnt instruction, so no compression.
             * We trust the speed given in the handbook (adding more
             * instructions would not make it faster), 1-2 cycles.
             */
            for(; r; r--, p += SOVUCQ * 2) {
                c = *(const uint8x16_t *)p;
                v_sumb = vaddq_u8(v_sumb, vcntq_u8(c));
                c = *((const uint8x16_t *)(p + SOVUCQ));
                v_sumb = vaddq_u8(v_sumb, vcntq_u8(c));
            }
            v_sum = vpadalq_u16(v_sum, vpaddlq_u8(v_sumb));
        }
        if(len >= SOVUCQ) {
            c = *(const uint8x16_t *)p;
            p += SOVUCQ;
            v_sum = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));
            len -= SOVUCQ;
        }
        if(len)
            c = *(const uint8x16_t *)p;
    }
    /* Trailing partial vector: mask off the bytes past the end, then count. */
    if(len) {
        if(HOST_IS_BIGENDIAN)
            c = neon_simple_alignq(c, v_0, SOVUCQ - len);
        else
            c = neon_simple_alignq(v_0, c, len);
        v_sum = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));
    }

    /* Horizontal reduction of the four 32-bit partial sums. */
    v_tsum = vpadd_u32(vget_high_u32(v_sum), vget_low_u32(v_sum));
    v_tsum = vpadd_u32(v_tsum, v_tsum);
    return vget_lane_u32(v_tsum, 0);
}
inline ResultType operator()(Iterator1 a, Iterator2 b, size_t size) const
{
    ResultType result = 0;
#if (defined __GNUC__ || defined __clang__) && defined USE_SSE
#ifdef __ARM_NEON__
    {
        uint32x4_t bits = vmovq_n_u32(0);
        for (size_t i = 0; i < size; i += 16) {
            uint8x16_t A_vec   = vld1q_u8(a + i);
            uint8x16_t B_vec   = vld1q_u8(b + i);
            uint8x16_t AxorB   = veorq_u8(A_vec, B_vec);
            uint8x16_t bitsSet = vcntq_u8(AxorB);
            uint16x8_t bitSet8 = vpaddlq_u8(bitsSet);
            uint32x4_t bitSet4 = vpaddlq_u16(bitSet8);
            bits = vaddq_u32(bits, bitSet4);
        }
        uint64x2_t bitSet2 = vpaddlq_u32(bits);
        // Lanes 0 and 2 of the reinterpreted vector hold the low halves of the two 64-bit sums.
        result  = vgetq_lane_s32(vreinterpretq_s32_u64(bitSet2), 0);
        result += vgetq_lane_s32(vreinterpretq_s32_u64(bitSet2), 2);
    }
#else
    {
        // for portability just use unsigned long long -- and use the
        // __builtin_popcountll (see docs for __builtin_popcountll)
        typedef unsigned long long pop_t;
        const size_t modulo = size % sizeof(pop_t);
        const pop_t* a2 = reinterpret_cast<const pop_t*>(a);
        const pop_t* b2 = reinterpret_cast<const pop_t*>(b);
        const pop_t* a2_end = a2 + (size / sizeof(pop_t));

        for (; a2 != a2_end; ++a2, ++b2)
            result += __builtin_popcountll((*a2) ^ (*b2));

        if (modulo) {
            // in the case where size is not divisible by sizeof(pop_t),
            // mask off the bits at the end
            pop_t a_final = 0, b_final = 0;
            memcpy(&a_final, a2, modulo);
            memcpy(&b_final, b2, modulo);
            result += __builtin_popcountll(a_final ^ b_final);
        }
    }
#endif // NEON
    return result;
#endif
#ifdef PLATFORM_64_BIT
    if (size % 64 == 0) {
        const uint64_t* pa = reinterpret_cast<const uint64_t*>(a);
        const uint64_t* pb = reinterpret_cast<const uint64_t*>(b);
        size /= (sizeof(uint64_t) / sizeof(unsigned char));
        for (size_t i = 0; i < size; ++i, ++pa, ++pb) {
            result += popcnt64(*pa ^ *pb);
        }
    }
    else {
        const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
        const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
        size /= (sizeof(uint32_t) / sizeof(unsigned char));
        for (size_t i = 0; i < size; ++i, ++pa, ++pb) {
            result += popcnt32(*pa ^ *pb);
        }
    }
#else
    const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
    const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
    size /= (sizeof(uint32_t) / sizeof(unsigned char));
    for (size_t i = 0; i < size; ++i, ++pa, ++pb) {
        result += popcnt32(*pa ^ *pb);
    }
#endif
    return result;
}
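When validating any of the NEON paths above, the portable __builtin_popcountll fallback already present in the functor can serve as a scalar reference. A self-contained sketch of such a cross-check (GCC/Clang builtins; the function and buffer names are illustrative only):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <cstdio>

// Scalar reference: Hamming distance over byte buffers, mirroring the portable fallback above.
static unsigned long long hamming_ref(const unsigned char* a, const unsigned char* b, size_t size)
{
    typedef unsigned long long pop_t;
    unsigned long long result = 0;
    size_t i = 0;
    for (; i + sizeof(pop_t) <= size; i += sizeof(pop_t)) {
        pop_t x, y;
        std::memcpy(&x, a + i, sizeof x);
        std::memcpy(&y, b + i, sizeof y);
        result += __builtin_popcountll(x ^ y);
    }
    for (; i < size; ++i)                                  // leftover bytes
        result += __builtin_popcountll((pop_t)(a[i] ^ b[i]));
    return result;
}

int main()
{
    unsigned char a[24], b[24];
    std::memset(a, 0xFF, sizeof a);
    std::memset(b, 0x0F, sizeof b);
    std::printf("%llu\n", hamming_ref(a, b, sizeof a));    // expect 24 * 4 = 96
    return 0;
}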