Esempio n. 1
0
static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
                                 int src_width, int do_store) {
  int i;
  for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {
    const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);
    const uint16x8_t R = vpaddlq_u8(RGB.val[2]);  // pair-wise adds
    const uint16x8_t G = vpaddlq_u8(RGB.val[1]);
    const uint16x8_t B = vpaddlq_u8(RGB.val[0]);
    int16x8_t U_tmp, V_tmp;
    CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp);
    {
      const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1);
      const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1);
      if (do_store) {
        vst1_u8(u, U);
        vst1_u8(v, V);
      } else {
        const uint8x8_t prev_u = vld1_u8(u);
        const uint8x8_t prev_v = vld1_u8(v);
        vst1_u8(u, vrhadd_u8(U, prev_u));
        vst1_u8(v, vrhadd_u8(V, prev_v));
      }
    }
  }
  if (i < src_width) {  // left-over
    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
  }
}
Esempio n. 2
0
// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A0 = vld1q_u8(above);  // top row
    const uint8x16_t A1 = vld1q_u8(above + 16);
    const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top
    const uint16x8_t p1 = vpaddlq_u8(A1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_top = vcombine_u16(p5, p5);
  }

  if (do_left) {
    const uint8x16_t L0 = vld1q_u8(left);  // left row
    const uint8x16_t L1 = vld1q_u8(left + 16);
    const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left
    const uint16x8_t p1 = vpaddlq_u8(L1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_left = vcombine_u16(p5, p5);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 6);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 5);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 5);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 32; ++i) {
      vst1q_u8(dst + i * stride, dc);
      vst1q_u8(dst + i * stride + 16, dc);
    }
  }
}
void test_vpaddlQu8 (void)
{
  uint16x8_t out_uint16x8_t;
  uint8x16_t arg0_uint8x16_t;

  out_uint16x8_t = vpaddlq_u8 (arg0_uint8x16_t);
}
Esempio n. 4
0
inline int v_signmask(const v_uint8x16& a)
{
    int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
}
//Note: it takes size and offset in units of byte
static inline int compute_ham_similarity_64(unsigned short* ref, unsigned short* circ_array, 
                                int size){
	
    const uint8_t*         	ref_c=(uint8_t*) ref;
    const uint8_t*         	circ_c=(uint8_t*) circ_array;
    register uint8x16_t     a,b;
    register uint8x16_t     c,d,temp;
    register uint16x8_t     acc;
    register uint           i=0,count=0;
	int 					j=0;
	int 					shift=size&0xF;
	for(i=0;i<=size-16; i+=16){
		j++;
		a=vld1q_u8(&ref_c[i]);
		b=vld1q_u8(&circ_c[i]);
		c=veorq_u8(a,b);
		acc=vaddq_u16(acc,vpaddlq_u8(vcntq_u8(c)));
	}	
	count=setbits(acc);
	a=vld1q_u8(&ref_c[i]);
	b=vld1q_u8(&circ_c[i]);
	c=veorq_u8(a,b);
	c=vcntq_u8(c);
	
	for(i=0;i<shift;i++){
		count=count+vgetq_lane_u8 (c,i);
	}
    return size*8-count;
}
Esempio n. 6
0
uint32x2_t FORCE_INLINE popcnt_neon_qreg(const uint8x16_t reg) {
    
    const uint8x16_t pcnt = vcntq_u8(reg);
    const uint16x8_t t0 = vpaddlq_u8(pcnt);
    const uint32x4_t t1 = vpaddlq_u16(t0);
    const uint32x2_t t2 = vadd_u32(vget_low_u32(t1), vget_high_u32(t1));

    return t2;
}
Esempio n. 7
0
// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A = vld1q_u8(above);  // top row
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
  }

  if (do_left) {
    const uint8x16_t L = vld1q_u8(left);  // left row
    const uint16x8_t p0 = vpaddlq_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_left = vcombine_u16(p3, p3);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * stride, dc);
    }
  }
}
Esempio n. 8
0
uint64_t popcnt_neon_vcnt(const uint8_t* data, const size_t size)
{
    const size_t chunk_size = 16 * 4 * 2;

    uint8_t* ptr = const_cast<uint8_t*>(data);

    const size_t n = size / chunk_size;
    const size_t k = size % chunk_size;

    uint32x4_t sum = vcombine_u32(vcreate_u32(0), vcreate_u32(0));

    for (size_t i=0; i < n; i++, ptr += chunk_size) {

        uint8x16x4_t input0 = vld4q_u8(ptr + 0 * 16 * 4);
        uint8x16x4_t input1 = vld4q_u8(ptr + 1 * 16 * 4);

        uint8x16_t t0   = vcntq_u8(input0.val[0]);
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[1]));
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[2]));
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[3]));

        t0 = vaddq_u8(t0, vcntq_u8(input1.val[0]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[1]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[2]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[3]));

        const uint16x8_t t1 = vpaddlq_u8(t0);

        sum = vpadalq_u16(sum, t1);
    }

    uint32_t scalar = 0;
    uint32_t tmp[4];

    vst1q_u32(tmp, sum);
    for (int i=0; i < 4; i++) {
        scalar += tmp[i];
    }

    for (size_t j=0; j < k; j++) {
        scalar += lookup8bit[ptr[j]];
    }

    return scalar;
}
Esempio n. 9
0
size_t mempopcnt(const void *s, size_t len)
{
	uint8x16_t v_0;
	uint8x16_t c;
	uint32x4_t v_sum;
	uint32x2_t v_tsum;
	unsigned char *p;
	size_t r;
	unsigned shift;

	prefetch(s);

// TODO: do this in 64 bit? the mem model seems more that way...
	v_0   = (uint8x16_t){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
	v_sum = (uint32x4_t)v_0;
	p = (unsigned char *)ALIGN_DOWN(s, SOVUCQ);
	shift = ALIGN_DOWN_DIFF(s, SOVUCQ);
	c = *(const uint8x16_t *)p;
	if(HOST_IS_BIGENDIAN)
		c = neon_simple_alignq(v_0, c, SOVUCQ - shift);
	else
		c = neon_simple_alignq(c, v_0, shift);
	if(len >= SOVUCQ || len + shift >= SOVUCQ)
	{
		p    += SOVUCQ;
		len  -= SOVUCQ - shift;
		v_sum = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));

		while(len >= SOVUCQ * 2)
		{
			uint8x16_t v_sumb = v_0;

			r    = len / (SOVUCQ * 2);
			r    = r > 15 ? 15 : r;
			len -= r * SOVUCQ * 2;
			/*
			 * NEON has a vector popcnt instruction, so no compression.
			 * We trust the speed given in the handbook (adding more
			 * instructions would not make it faster), 1-2 cycles.
			 */
			for(; r; r--, p += SOVUCQ * 2) {
				c      = *(const uint8x16_t *)p;
				v_sumb = vaddq_u8(v_sumb, vcntq_u8(c));
				c      = *((const uint8x16_t *)(p + SOVUCQ));
				v_sumb = vaddq_u8(v_sumb, vcntq_u8(c));
			}
			v_sum = vpadalq_u16(v_sum, vpaddlq_u8(v_sumb));
		}
		if(len >= SOVUCQ) {
			c     = *(const uint8x16_t *)p;
			p    += SOVUCQ;
			v_sum = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));
			len  -= SOVUCQ;
		}

		if(len)
			c = *(const uint8x16_t *)p;
	}
	if(len) {
		if(HOST_IS_BIGENDIAN)
			c      = neon_simple_alignq(c, v_0, SOVUCQ - len);
		else
			c      = neon_simple_alignq(v_0, c, len);
		v_sum  = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));
	}

	v_tsum = vpadd_u32(vget_high_u32(v_sum), vget_low_u32(v_sum));
	v_tsum = vpadd_u32(v_tsum, v_tsum);
	return vget_lane_u32(v_tsum, 0);
}
Esempio n. 10
0
        template<bool align> SIMD_INLINE uint16x8_t ReduceColBody(const uint8_t * src)
        {
			uint8x16_t t01 = Load<false>(src - 1);
			uint8x16_t t12 = Load<align>(src);
			return vaddq_u16(vpaddlq_u8(t01), vpaddlq_u8(t12));
		}
Esempio n. 11
0
        template<bool align> SIMD_INLINE uint16x8_t ReduceColNose(const uint8_t * src)
        {
			uint8x16_t t12 = Load<align>(src);
			uint8x16_t t01 = LoadBeforeFirst<1>(t12);
            return vaddq_u16(vpaddlq_u8(t01), vpaddlq_u8(t12));
        }
Esempio n. 12
0
  inline ResultType operator()(Iterator1 a, Iterator2 b, size_t size) const
  {
    ResultType result = 0;
#if (defined __GNUC__ || defined __clang__) && defined USE_SSE
#ifdef __ARM_NEON__
    {
      uint32x4_t bits = vmovq_n_u32(0);
      for (size_t i = 0; i < size; i += 16) {
        uint8x16_t A_vec = vld1q_u8 (a + i);
        uint8x16_t B_vec = vld1q_u8 (b + i);
        uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
        uint8x16_t bitsSet = vcntq_u8 (AxorB);
        uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
        uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
        bits = vaddq_u32(bits, bitSet4);
      }
      uint64x2_t bitSet2 = vpaddlq_u32 (bits);
      result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
      result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
    }
#else
    {
      //for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
      typedef unsigned long long pop_t;
      const size_t modulo = size % sizeof(pop_t);
      const pop_t* a2 = reinterpret_cast<const pop_t*> (a);
      const pop_t* b2 = reinterpret_cast<const pop_t*> (b);
      const pop_t* a2_end = a2 + (size / sizeof(pop_t));

      for (; a2 != a2_end; ++a2, ++b2) result += __builtin_popcountll((*a2) ^ (*b2));

      if (modulo) {
        //in the case where size is not dividable by sizeof(pop_t)
        //need to mask off the bits at the end
        pop_t a_final = 0, b_final = 0;
        memcpy(&a_final, a2, modulo);
        memcpy(&b_final, b2, modulo);
        result += __builtin_popcountll(a_final ^ b_final);
      }
    }
#endif //NEON
    return result;
#endif
#ifdef PLATFORM_64_BIT
    if(size%64 == 0)
    {
      const uint64_t* pa = reinterpret_cast<const uint64_t*>(a);
      const uint64_t* pb = reinterpret_cast<const uint64_t*>(b);
      size /= (sizeof(uint64_t)/sizeof(unsigned char));
      for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
        result += popcnt64(*pa ^ *pb);
      }
    }
    else
    {
      const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
      const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
      size /= (sizeof(uint32_t)/sizeof(unsigned char));
      for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
        result += popcnt32(*pa ^ *pb);
      }
    }
#else
    const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
    const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
    size /= (sizeof(uint32_t)/sizeof(unsigned char));
    for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
      result += popcnt32(*pa ^ *pb);
    }
#endif
    return result;
  }