Esempio n. 1
0
static inline void AddLanes(uint32_t* ptr, uint32x4_t v) {
#if defined(WEBRTC_ARCH_ARM64)
  *(ptr) = vaddvq_u32(v);
#else
  uint32x2_t tmp_v;
  tmp_v = vadd_u32(vget_low_u32(v), vget_high_u32(v));
  tmp_v = vpadd_u32(tmp_v, tmp_v);
  *(ptr) = vget_lane_u32(tmp_v, 0);
#endif
}
Esempio n. 2
0
uint32x2_t
test_vpadd_u32 (uint32x2_t __a, uint32x2_t __b)
{
  return vpadd_u32(__a, __b);
}
Esempio n. 3
0
size_t mempopcnt(const void *s, size_t len)
{
	uint8x16_t v_0;
	uint8x16_t c;
	uint32x4_t v_sum;
	uint32x2_t v_tsum;
	unsigned char *p;
	size_t r;
	unsigned shift;

	prefetch(s);

// TODO: do this in 64 bit? the mem model seems more that way...
	v_0   = (uint8x16_t){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
	v_sum = (uint32x4_t)v_0;
	p = (unsigned char *)ALIGN_DOWN(s, SOVUCQ);
	shift = ALIGN_DOWN_DIFF(s, SOVUCQ);
	c = *(const uint8x16_t *)p;
	if(HOST_IS_BIGENDIAN)
		c = neon_simple_alignq(v_0, c, SOVUCQ - shift);
	else
		c = neon_simple_alignq(c, v_0, shift);
	if(len >= SOVUCQ || len + shift >= SOVUCQ)
	{
		p    += SOVUCQ;
		len  -= SOVUCQ - shift;
		v_sum = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));

		while(len >= SOVUCQ * 2)
		{
			uint8x16_t v_sumb = v_0;

			r    = len / (SOVUCQ * 2);
			r    = r > 15 ? 15 : r;
			len -= r * SOVUCQ * 2;
			/*
			 * NEON has a vector popcnt instruction, so no compression.
			 * We trust the speed given in the handbook (adding more
			 * instructions would not make it faster), 1-2 cycles.
			 */
			for(; r; r--, p += SOVUCQ * 2) {
				c      = *(const uint8x16_t *)p;
				v_sumb = vaddq_u8(v_sumb, vcntq_u8(c));
				c      = *((const uint8x16_t *)(p + SOVUCQ));
				v_sumb = vaddq_u8(v_sumb, vcntq_u8(c));
			}
			v_sum = vpadalq_u16(v_sum, vpaddlq_u8(v_sumb));
		}
		if(len >= SOVUCQ) {
			c     = *(const uint8x16_t *)p;
			p    += SOVUCQ;
			v_sum = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));
			len  -= SOVUCQ;
		}

		if(len)
			c = *(const uint8x16_t *)p;
	}
	if(len) {
		if(HOST_IS_BIGENDIAN)
			c      = neon_simple_alignq(c, v_0, SOVUCQ - len);
		else
			c      = neon_simple_alignq(v_0, c, len);
		v_sum  = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));
	}

	v_tsum = vpadd_u32(vget_high_u32(v_sum), vget_low_u32(v_sum));
	v_tsum = vpadd_u32(v_tsum, v_tsum);
	return vget_lane_u32(v_tsum, 0);
}