Exemplo n.º 1
0
int main(int, char**)
{
    /* AVX: clear every YMM register, then build an all-zero 256-bit integer vector. */
    _mm256_zeroall();
    __m256i zeros = _mm256_setzero_si256();

    /* AVX2: 256-bit integer AND followed by a byte-wise add. */
    __m256i anded = _mm256_and_si256(zeros, zeros);
    __m256i sum = _mm256_add_epi8(zeros, anded);

    /* The value itself is irrelevant; the cast only marks it as used. */
    (void)sum;
    return 0;
}
Exemplo n.º 2
0
/* Check _mm256_add_epi8 against a scalar byte-by-byte addition of the
   same two 32-byte operands; abort if check_union256i_b reports a
   non-zero result.  */
void static
avx2_test (void)
{
  union256i_b actual, lhs, rhs;
  char expected[32];
  unsigned idx;

  lhs.x = _mm256_set_epi8 (10, 74, 50, 4, 6, 99, 1, 4, 87, 83, 84,
			   29, 81, 79, 1, 3, 1, 5, 2, 47, 20, 2, 72,
			   92, 9, 4, 23, 17, 99, 43, 72, 17);

  rhs.x = _mm256_set_epi8 (88, 44, 33, 20, 56, 99, 2, 90, 38, 4, 200,
			   17, 3, 39, 2, 37, 27, 95, 17, 74, 72, 43,
			   27, 112, 71, 50, 32, 72, 84, 17, 27, 96);

  /* Scalar reference result, one byte at a time.  */
  for (idx = 0; idx < 32; ++idx)
    expected[idx] = lhs.a[idx] + rhs.a[idx];

  /* Vector result under test.  */
  actual.x = _mm256_add_epi8 (lhs.x, rhs.x);

  if (check_union256i_b (actual, expected))
    abort ();
}
/*
 * Per-byte population count of a 256-bit vector.
 *
 * Each byte is split into its low and high nibble; both nibbles are used as
 * indices into a 16-entry table of bit counts, and the two looked-up counts
 * are added. Every byte of the result holds the number of set bits (0..8)
 * of the corresponding byte of `vec`.
 */
static __m256i avx2_popcount(const __m256i vec) {
    /* Bit counts of the nibble values 0x0..0xf. The table is repeated for
       the upper half because _mm256_shuffle_epi8 indexes within each
       128-bit lane separately. */
    const __m256i nibble_bits = _mm256_setr_epi8(
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
    const __m256i nibble_mask = _mm256_set1_epi8(0x0f);

    /* The 16-bit shift drags bits across byte boundaries; the mask drops them. */
    const __m256i high_nibbles = _mm256_and_si256(_mm256_srli_epi16(vec, 4), nibble_mask);
    const __m256i low_nibbles = _mm256_and_si256(vec, nibble_mask);

    return _mm256_add_epi8(_mm256_shuffle_epi8(nibble_bits, low_nibbles),
                           _mm256_shuffle_epi8(nibble_bits, high_nibbles));
}
Exemplo n.º 4
0
// Thin wrapper around _mm256_add_epi8: returns the byte-wise sum of a and b.
// NOTE(review): the "CHECK:" comment inside the body looks like an LLVM
// FileCheck directive matched against the compiler's output for this
// function; keep its text intact unless the expected output changes.
__m256i test_mm256_add_epi8(__m256i a, __m256i b) {
  // CHECK: add <32 x i8>
  return _mm256_add_epi8(a, b);
}
Exemplo n.º 5
0
 /*!
  * \brief Compute the element-wise sum of two byte vectors.
  * \param lhs The left-hand side operand
  * \param rhs The right-hand side operand
  * \return The result of _mm256_add_epi8 applied to both operands' values
  */
 ETL_STATIC_INLINE(avx_simd_byte) add(avx_simd_byte lhs, avx_simd_byte rhs) {
     const auto sum = _mm256_add_epi8(lhs.value, rhs.value);
     return sum;
 }
Exemplo n.º 6
0
	/* set 4: 62, "+" */
	s4mask = _mm256_cmpeq_epi8(res, _mm256_set1_epi8(62));
	blockmask = _mm256_or_si256(blockmask, s4mask);

	/* set 3: 52..61, "0123456789" */
	s3mask = _mm256_andnot_si256(blockmask, _mm256_cmpgt_epi8(res, _mm256_set1_epi8(51)));
	blockmask = _mm256_or_si256(blockmask, s3mask);

	/* set 2: 26..51, "abcdefghijklmnopqrstuvwxyz" */
	s2mask = _mm256_andnot_si256(blockmask, _mm256_cmpgt_epi8(res, _mm256_set1_epi8(25)));
	blockmask = _mm256_or_si256(blockmask, s2mask);

	/* set 1: 0..25, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	 * Everything that is not blockmasked */

	/* Create the masked character sets: */
	str = _mm256_and_si256(_mm256_set1_epi8('/'), s5mask);
	str = _mm256_blendv_epi8(str, _mm256_set1_epi8('+'), s4mask);
	str = _mm256_blendv_epi8(str, _mm256_add_epi8(res, _mm256_set1_epi8('0' - 52)), s3mask);
	str = _mm256_blendv_epi8(str, _mm256_add_epi8(res, _mm256_set1_epi8('a' - 26)), s2mask);
	str = _mm256_blendv_epi8(_mm256_add_epi8(res, _mm256_set1_epi8('A')), str, blockmask);

	/* Blend all the sets together and store: */
	_mm256_storeu_si256((__m256i *)o, str);

	c += 24;	/* 6 * 4 bytes of input  */
	o += 32;	/* 8 * 4 bytes of output */
	outl += 32;
	srclen -= 24;
}
Exemplo n.º 7
0
// Accumulates, over the n bytes of a and b, the per-byte value looked up for
// a[i] ^ b[i] and returns the total. In the AVX2 stage that value is the
// number of set bits (nibble lookup table below); popCountTable is defined
// elsewhere and is presumably the byte-wide equivalent -- TODO confirm.
//
// The function is a chain of optional, preprocessor-selected stages that all
// share the running index `i`: each enabled stage consumes as many full
// chunks as it can and leaves the remainder to the stages after it. The final
// byte-by-byte loop always runs and handles whatever is left.
int normHamming(const uchar* a, const uchar* b, int n)
{
    CV_AVX_GUARD;

    int i = 0;
    int result = 0;
#if CV_AVX2
    {
        // Stage 1: 32 bytes per iteration.
        __m256i _r0 = _mm256_setzero_si256();
        __m256i _0 = _mm256_setzero_si256();
        // Bit counts of the nibble values 0..15, repeated for both 128-bit
        // lanes because _mm256_shuffle_epi8 indexes within each lane.
        __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
                                                 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
        __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);

        for(; i <= n - 32; i+= 32)
        {
            __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
            __m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i));

            __m256i _xor = _mm256_xor_si256(_a0, _b0);

            // Look up the bit count of the low and of the high nibble of each byte.
            __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask));
            __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
                             _mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask));

            // SAD against zero sums each group of 8 byte counts into one
            // 64-bit lane; those partial sums are accumulated in _r0.
            _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
        }
        // Horizontal reduction: first fold the two 64-bit partial sums inside
        // each 128-bit lane into 32-bit element 0 of that lane, then add the
        // swapped 128-bit halves so element 0 holds the grand total.
        _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
        // NOTE(review): _mm256_extract_epi32_ is defined elsewhere; presumably
        // it extracts 32-bit element 0 here -- confirm.
        result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
    }
#endif // CV_AVX2

#if CV_POPCNT
    {
        // Stage 2: 8 bytes (when CV_POPCNT_U64 is defined) and then 4 bytes
        // per iteration through the CV_POPCNT_* macros.
#  if defined CV_POPCNT_U64
        for(; i <= n - 8; i += 8)
        {
            result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i));
        }
#  endif
        for(; i <= n - 4; i += 4)
        {
            result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
        }
    }
#endif // CV_POPCNT

#if CV_SIMD128
    {
        // Stage 3: v_uint8x16::nlanes bytes per iteration; the partial
        // results are summed in t and reduced once after the loop.
        v_uint32x4 t = v_setzero_u32();
        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
        {
            t += v_popcount(v_load(a + i) ^ v_load(b + i));
        }
        result += v_reduce_sum(t);
    }
#endif // CV_SIMD128
#if CV_ENABLE_UNROLLED
    // Stage 4: table lookups, four bytes per iteration.
    for(; i <= n - 4; i += 4)
    {
        result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
                popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
    }
#endif
    // Tail: one byte at a time for everything the stages above did not consume.
    for(; i < n; i++)
    {
        result += popCountTable[a[i] ^ b[i]];
    }
    return result;
}
Exemplo n.º 8
0
// Count genotype sum and number of calls, not requiring 16-aligned p.
//
// Scans the n bytes starting at p. A byte is counted as a call when its
// value is <= 2; out_num receives the number of such bytes and out_sum the
// sum of their values. Bytes with a value of 3 or more are ignored.
//
//   p        first byte to scan
//   n        number of bytes to scan
//   out_sum  receives the sum of all bytes whose value is <= 2
//   out_num  receives the number of bytes whose value is <= 2
//
// Returns the pointer just past the last byte scanned.
COREARRAY_DLL_DEFAULT C_UInt8* vec_u8_geno_count(C_UInt8 *p,
	size_t n, C_Int32 &out_sum, C_Int32 &out_num)
{
	C_Int32 sum=0, num=0;

#if defined(COREARRAY_SIMD_AVX2)

	const __m256i three = _mm256_set1_epi8(3);
	const __m256i zero = _mm256_setzero_si256();
	__m256i sum32 = zero, num32 = zero;
	// number of iterations accumulated in the 8-bit lanes since the last flush
	size_t limit_by_U8 = 0;

	for (; n >= 32; )
	{
		__m256i v = _mm256_loadu_si256((__m256i const*)p);
		p += 32;
		// clamp to [0,3] first so the signed compare is safe for bytes >= 0x80;
		// m is 0xFF in every byte whose value is < 3, 0x00 elsewhere
		__m256i m = _mm256_cmpgt_epi8(three, _mm256_min_epu8(v, three));
		sum32 = _mm256_add_epi8(sum32, _mm256_and_si256(v, m));
		// subtracting 0xFF (i.e. -1) adds one to the per-byte counter
		num32 = _mm256_sub_epi8(num32, m);
		n -= 32;
		limit_by_U8 ++;
		// each iteration adds at most 2 to a sum byte and 1 to a count byte,
		// so 127 iterations keep both within the unsigned 8-bit range (<= 254)
		if ((limit_by_U8 >= 127) || (n < 32))
		{
			// add to sum: SAD against zero yields four 64-bit partial sums,
			// which the two permute/add steps fold into the lowest element
			sum32 = _mm256_sad_epu8(sum32, zero);
			sum32 = _mm256_add_epi32(sum32,
				_mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(1,0,3,2)));
			sum32 = _mm256_add_epi32(sum32,
				_mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(0,0,0,1)));
			sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(sum32));
			// add to num
			num32 = _mm256_sad_epu8(num32, zero);
			num32 = _mm256_add_epi32(num32,
				_mm256_permute4x64_epi64(num32, _MM_SHUFFLE(1,0,3,2)));
			num32 = _mm256_add_epi32(num32,
				_mm256_permute4x64_epi64(num32, _MM_SHUFFLE(0,0,0,1)));
			num += _mm_cvtsi128_si32(_mm256_castsi256_si128(num32));
			// reset
			sum32 = num32 = zero;
			limit_by_U8 = 0;
		}
	}

#elif defined(COREARRAY_SIMD_SSE2)

	// header, 16-byte aligned: handle leading bytes one at a time until p
	// is aligned, because the vector loop below uses aligned loads
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p++)
		if (*p <= 2) { sum += *p; num++; }

	const __m128i three = _mm_set1_epi8(3);
	const __m128i zero = _mm_setzero_si128();
	__m128i sum16=zero, num16=zero;
	// number of iterations accumulated in the 8-bit lanes since the last flush
	size_t limit_by_U8 = 0;

	for (; n >= 16; )
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		p += 16;
		// m is 0xFF in every byte whose value is < 3, 0x00 elsewhere
		__m128i m = _mm_cmpgt_epi8(three, _mm_min_epu8(v, three));
		// use the intrinsic rather than operator& on __m128i, which is a
		// compiler-specific vector extension
		sum16 = _mm_add_epi8(sum16, _mm_and_si128(v, m));
		num16 = _mm_sub_epi8(num16, m);
		n -= 16;
		limit_by_U8 ++;
		// flush before the 8-bit accumulators can overflow (see AVX2 branch)
		if ((limit_by_U8 >= 127) || (n < 16))
		{
			// add to sum: SAD leaves one partial sum in each 64-bit half
			sum16 = _mm_sad_epu8(sum16, zero);
			sum += _mm_cvtsi128_si32(sum16);
			sum += _mm_cvtsi128_si32(_mm_shuffle_epi32(sum16, 2));
			// add to num
			num16 = _mm_sad_epu8(num16, zero);
			num += _mm_cvtsi128_si32(num16);
			num += _mm_cvtsi128_si32(_mm_shuffle_epi32(num16, 2));
			// reset
			sum16 = num16 = zero;
			limit_by_U8 = 0;
		}
	}

#endif

	// tail: remaining bytes (or all of them when no SIMD path is compiled in)
	for (; n > 0; n--, p++)
		if (*p <= 2) { sum += *p; num++; }
	out_sum = sum;
	out_num = num;
	return p;
}