/* Horizontally add the four 32-bit lanes of v and store the scalar sum at *ptr. */
static inline void AddLanes(uint32_t* ptr, uint32x4_t v) {
#if defined(WEBRTC_ARCH_ARM64)
  /* AArch64 provides a single add-across-vector instruction. */
  *(ptr) = vaddvq_u32(v);
#else
  /* ARMv7 fallback: add the high and low halves, then pairwise-add the result. */
  uint32x2_t tmp_v;
  tmp_v = vadd_u32(vget_low_u32(v), vget_high_u32(v));
  tmp_v = vpadd_u32(tmp_v, tmp_v);
  *(ptr) = vget_lane_u32(tmp_v, 0);
#endif
}
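A minimal usage sketch (not part of the WebRTC source; the input values and the printf are illustrative only, and it assumes AddLanes is visible in the same translation unit):

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  const uint32_t data[4] = {1, 2, 3, 4};
  uint32x4_t v = vld1q_u32(data);   /* v = {1, 2, 3, 4} */
  uint32_t sum;
  AddLanes(&sum, v);                /* sum = 1 + 2 + 3 + 4 = 10 */
  printf("sum = %u\n", sum);        /* prints "sum = 10" */
  return 0;
}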
/* Thin wrapper around the pairwise-add intrinsic:
 * returns { __a[0] + __a[1], __b[0] + __b[1] }. */
uint32x2_t test_vpadd_u32 (uint32x2_t __a, uint32x2_t __b) {
  return vpadd_u32(__a, __b);
}
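For illustration (an assumption, not from the original test file), a standalone sketch showing how vpadd_u32 concatenates the pairwise sums of both operands:

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  const uint32_t av[2] = {1, 2}, bv[2] = {10, 20};
  /* r = { av[0] + av[1], bv[0] + bv[1] } = { 3, 30 } */
  uint32x2_t r = test_vpadd_u32(vld1_u32(av), vld1_u32(bv));
  printf("%u %u\n", vget_lane_u32(r, 0), vget_lane_u32(r, 1));
  return 0;
}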
/* Count the set bits in the len bytes starting at s. */
size_t mempopcnt(const void *s, size_t len)
{
	uint8x16_t v_0;
	uint8x16_t c;
	uint32x4_t v_sum;
	uint32x2_t v_tsum;
	unsigned char *p;
	size_t r;
	unsigned shift;

	prefetch(s);

	// TODO: do this in 64 bit? the mem model seems more that way...
	v_0   = (uint8x16_t){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
	v_sum = (uint32x4_t)v_0;
	/* Round s down to a 16-byte boundary, load the aligned block that
	 * contains s, and shift out the bytes that lie before the buffer. */
	p     = (unsigned char *)ALIGN_DOWN(s, SOVUCQ);
	shift = ALIGN_DOWN_DIFF(s, SOVUCQ);
	c     = *(const uint8x16_t *)p;
	if(HOST_IS_BIGENDIAN)
		c = neon_simple_alignq(v_0, c, SOVUCQ - shift);
	else
		c = neon_simple_alignq(c, v_0, shift);
	if(len >= SOVUCQ || len + shift >= SOVUCQ)
	{
		p    += SOVUCQ;
		len  -= SOVUCQ - shift;
		/* Widen the per-byte counts and accumulate them into 32-bit lanes. */
		v_sum = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));

		while(len >= SOVUCQ * 2)
		{
			uint8x16_t v_sumb = v_0;

			/* Cap the inner loop so the 8-bit per-lane counters cannot
			 * overflow: 15 iterations * 2 loads * at most 8 bits per
			 * byte = 240 <= 255. */
			r    = len / (SOVUCQ * 2);
			r    = r > 15 ? 15 : r;
			len -= r * SOVUCQ * 2;
			/*
			 * NEON has a vector popcnt instruction, so no compression.
			 * We trust the speed given in the handbook (adding more
			 * instructions would not make it faster), 1-2 cycles.
			 */
			for(; r; r--, p += SOVUCQ * 2) {
				c      = *(const uint8x16_t *)p;
				v_sumb = vaddq_u8(v_sumb, vcntq_u8(c));
				c      = *((const uint8x16_t *)(p + SOVUCQ));
				v_sumb = vaddq_u8(v_sumb, vcntq_u8(c));
			}
			v_sum = vpadalq_u16(v_sum, vpaddlq_u8(v_sumb));
		}
		if(len >= SOVUCQ) {
			c      = *(const uint8x16_t *)p;
			p     += SOVUCQ;
			v_sum  = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));
			len   -= SOVUCQ;
		}
		if(len)
			c = *(const uint8x16_t *)p;
	}
	if(len) {
		/* Tail: mask off the bytes past the end of the buffer before counting. */
		if(HOST_IS_BIGENDIAN)
			c = neon_simple_alignq(c, v_0, SOVUCQ - len);
		else
			c = neon_simple_alignq(v_0, c, len);
		v_sum = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));
	}
	/* Fold the four 32-bit partial sums into a single scalar. */
	v_tsum = vpadd_u32(vget_high_u32(v_sum), vget_low_u32(v_sum));
	v_tsum = vpadd_u32(v_tsum, v_tsum);
	return vget_lane_u32(v_tsum, 0);
}
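As a sanity check, a plain scalar routine (a hedged addition, not part of the original library; the name mempopcnt_ref is made up) that the NEON mempopcnt above should agree with for any (s, len):

#include <stddef.h>

/* Scalar reference: count set bits in the len bytes starting at s. */
static size_t mempopcnt_ref(const void *s, size_t len)
{
	const unsigned char *p = s;
	size_t r = 0;

	while(len--) {
		unsigned char b = *p++;
		while(b) {
			b &= b - 1;	/* Kernighan's trick: clear the lowest set bit */
			r++;
		}
	}
	return r;
}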