Example 1
void
mulrc16_shuffle_avx2(uint8_t *region, uint8_t constant, size_t length)
{
	uint8_t *end;
	register __m256i in, out, t1, t2, m1, m2, l, h;
	register __m128i bc;

	if (constant == 0) {
		memset(region, 0, length);
		return;
	}

	if (constant == 1)
		return;

	bc = _mm_load_si128((void *)tl[constant]);
	t1 = __builtin_ia32_vbroadcastsi256(bc);
	bc = _mm_load_si128((void *)th[constant]);
	t2 = __builtin_ia32_vbroadcastsi256(bc);
	m1 = _mm256_set1_epi8(0x0f);
	m2 = _mm256_set1_epi8(0xf0);

	for (end=region+length; region<end; region+=32) {
		in = _mm256_load_si256((void *)region);
		l = _mm256_and_si256(in, m1);
		l = _mm256_shuffle_epi8(t1, l);
		h = _mm256_and_si256(in, m2);
		h = _mm256_srli_epi64(h, 4);
		h = _mm256_shuffle_epi8(t2, h);
		out = _mm256_xor_si256(h, l);
		_mm256_store_si256((void *)region, out);
	}
}
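For reference, the loop above is the vector form of a per-byte nibble-table multiply: each source byte is split into its low and high nibble, each nibble indexes a 16-entry product table (the tl/th rows the code broadcasts), and the two lookups are XORed. A scalar sketch of the same operation, under that reading of tl/th (illustrative only, not part of the original source):

/* Illustrative scalar equivalent of the loop above, assuming tl[c][x] and
 * th[c][x] hold the products of constant c with a low/high nibble x. */
static void
mulrc16_shuffle_scalar(uint8_t *region, uint8_t constant, size_t length)
{
	for (size_t i = 0; i < length; i++) {
		uint8_t r = region[i];
		region[i] = tl[constant][r & 0x0f] ^ th[constant][r >> 4];
	}
}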
        template <bool align> void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, 
            uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride)
        {
            assert(width >= A);
            if(align)
            {
                assert(Aligned(backgroundValue) && Aligned(backgroundValueStride));
                assert(Aligned(backgroundCount) && Aligned(backgroundCountStride));
                assert(Aligned(mask) && Aligned(maskStride));
            }

            const __m256i _threshold = _mm256_set1_epi8((char)threshold);
            size_t alignedWidth = AlignLo(width, A);
            __m256i tailMask = SetMask<uint8_t>(0, A - width + alignedWidth, 1);
            for(size_t row = 0; row < height; ++row)
            {
                for(size_t col = 0; col < alignedWidth; col += A)
                    EdgeBackgroundAdjustRangeMasked<align>(backgroundCount, backgroundValue, mask, col, _threshold, K8_01);
                if(alignedWidth != width)
                    EdgeBackgroundAdjustRangeMasked<false>(backgroundCount, backgroundValue, mask, width - A, _threshold, tailMask);
                backgroundValue += backgroundValueStride;
                backgroundCount += backgroundCountStride;
                mask += maskStride;
            }		
        }
Example 3
void* xmemchr(const void* src, int c, size_t n)
{

	if (n < 32) {
		return xmemchr_tiny(src, c, n);
	}


	__m256i ymm0 = _mm256_set1_epi8((char)c), ymm1;
	int mask;

	size_t rem = n % 32;
	n /= 32;

	for (size_t i = 0; i < n; i++) {
		ymm1 = _mm256_loadu_si256((const __m256i*)src + i);
		ymm1 = _mm256_cmpeq_epi8(ymm1, ymm0);
		mask = _mm256_movemask_epi8(ymm1);
		if (mask) {
			__asm__("bsfl %0, %0\n\t"
			        :"=r"(mask)
			        :"0"(mask)
			       );
			return (void*)((unsigned long)((const __m256i*)src + i) + mask);
		}
	}

	/* n was divided by 32 above, so the tail starts n*32 bytes past src. */
	return xmemchr_tiny((const void*)((unsigned long)src + n * 32), c, rem);
}
static inline __m256i
enc_translate (const __m256i in)
{
	// LUT contains Absolute offset for all ranges:
	const __m256i lut = _mm256_setr_epi8(65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
	                                     65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
	// Translate values 0..63 to the Base64 alphabet. There are five sets:
	// #  From      To         Abs    Index  Characters
	// 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
	// 2  [52..61]  [48..57]    -4  [2..11]  0123456789
	// 3  [62]      [43]       -19       12  +
	// 4  [63]      [47]       -16       13  /

	// Create LUT indices from input:
	// the index for range #0 is right, others are 1 less than expected:
	__m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));

	// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
	__m256i mask = CMPGT(in, 25);

	// subtract -1 (i.e. add 1) to the indices for range #[1..4]; all indices are now correct:
	indices = _mm256_sub_epi8(indices, mask);

	// Add offsets to input values:
	__m256i out = _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));

	return out;
}
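The range table in the comment also reads directly as scalar code; a plain-C rendering of the same translation (for illustration, not part of the library) makes the offsets concrete:

/* Illustrative scalar version of the same mapping: add the per-range offset
 * from the table above to a 6-bit value 0..63. */
static inline char
enc_translate_scalar(uint8_t v)
{
	if (v <= 25) return (char)(v + 65);  /* 'A'..'Z' */
	if (v <= 51) return (char)(v + 71);  /* 'a'..'z' */
	if (v <= 61) return (char)(v - 4);   /* '0'..'9' */
	if (v == 62) return (char)(v - 19);  /* '+'      */
	return (char)(v - 16);               /* '/'      */
}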
Example 5
void
maddrc16_imul_avx2(uint8_t* region1, const uint8_t* region2,
					uint8_t constant, size_t length)
{
	uint8_t *end;
	register __m256i reg1, reg2, ri[4], sp[4], mi[4];
	const uint8_t *p = pt[constant];

	if (constant == 0)
		return;

	if (constant == 1) {
		xorr_avx2(region1, region2, length);
		return;
	}
	
	mi[0] = _mm256_set1_epi8(0x11);
	mi[1] = _mm256_set1_epi8(0x22);
	mi[2] = _mm256_set1_epi8(0x44);
	mi[3] = _mm256_set1_epi8(0x88);
	sp[0] = _mm256_set1_epi16(p[0]);
	sp[1] = _mm256_set1_epi16(p[1]);
	sp[2] = _mm256_set1_epi16(p[2]);
	sp[3] = _mm256_set1_epi16(p[3]);

	for (end=region1+length; region1<end; region1+=32, region2+=32) {
		reg2 = _mm256_load_si256((void *)region2);
		reg1 = _mm256_load_si256((void *)region1);
		ri[0] = _mm256_and_si256(reg2, mi[0]);
		ri[1] = _mm256_and_si256(reg2, mi[1]);
		ri[2] = _mm256_and_si256(reg2, mi[2]);
		ri[3] = _mm256_and_si256(reg2, mi[3]);
		ri[1] = _mm256_srli_epi16(ri[1], 1);
		ri[2] = _mm256_srli_epi16(ri[2], 2);
		ri[3] = _mm256_srli_epi16(ri[3], 3);
		ri[0] = _mm256_mullo_epi16(ri[0], sp[0]);
		ri[1] = _mm256_mullo_epi16(ri[1], sp[1]);
		ri[2] = _mm256_mullo_epi16(ri[2], sp[2]);
		ri[3] = _mm256_mullo_epi16(ri[3], sp[3]);
		ri[0] = _mm256_xor_si256(ri[0], ri[1]);
		ri[2] = _mm256_xor_si256(ri[2], ri[3]);
		ri[0] = _mm256_xor_si256(ri[0], ri[2]);
		ri[0] = _mm256_xor_si256(ri[0], reg1);
		_mm256_store_si256((void *)region1, ri[0]);
	}
}
Example 6
uint64_t avx2_count_byte_popcount(const uint8_t* data, size_t size, uint8_t byte) {

    const __m256i v = _mm256_set1_epi8(byte);

    const uint8_t* end = data + size;
    const uint8_t* ptr = data;

    uint64_t result = 0;

    // 1. blocks of 8 registers
    while (ptr + 8*32 < end) {
        const __m256i eq0 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 0*32)));
        const __m256i eq1 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 1*32)));
        const __m256i eq2 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 2*32)));
        const __m256i eq3 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 3*32)));
        const __m256i eq4 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 4*32)));
        const __m256i eq5 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 5*32)));
        const __m256i eq6 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 6*32)));
        const __m256i eq7 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 7*32)));

        const __m256i eq0bit = _mm256_and_si256(eq0, _mm256_set1_epi8(0x01));
        const __m256i eq1bit = _mm256_and_si256(eq1, _mm256_set1_epi8(0x02));
        const __m256i eq2bit = _mm256_and_si256(eq2, _mm256_set1_epi8(0x04));
        const __m256i eq3bit = _mm256_and_si256(eq3, _mm256_set1_epi8(0x08));
        const __m256i eq4bit = _mm256_and_si256(eq4, _mm256_set1_epi8(0x10));
        const __m256i eq5bit = _mm256_and_si256(eq5, _mm256_set1_epi8(0x20));
        const __m256i eq6bit = _mm256_and_si256(eq6, _mm256_set1_epi8(0x40));
        const __m256i eq7bit = _mm256_and_si256(eq7, _mm256_set1_epi8(int8_t(0x80)));

        const __m256i m01    = _mm256_or_si256(eq0bit, eq1bit);
        const __m256i m23    = _mm256_or_si256(eq2bit, eq3bit);
        const __m256i m45    = _mm256_or_si256(eq4bit, eq5bit);
        const __m256i m67    = _mm256_or_si256(eq6bit, eq7bit);

        const __m256i m0123  = _mm256_or_si256(m01, m23);
        const __m256i m4567  = _mm256_or_si256(m45, m67);

        const __m256i merged = _mm256_or_si256(m0123, m4567);

        result += __builtin_popcountll(_mm256_extract_epi64(merged, 0));
        result += __builtin_popcountll(_mm256_extract_epi64(merged, 1));
        result += __builtin_popcountll(_mm256_extract_epi64(merged, 2));
        result += __builtin_popcountll(_mm256_extract_epi64(merged, 3));

        ptr += 8 * 32;
    }

    return result + scalar_count_bytes(ptr, end - ptr, byte);
}
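The kernel above (and avx2_count_byte below) hands whatever is left to a scalar_count_bytes() helper that is not included in these excerpts; a minimal sketch of what such a helper presumably looks like:

/* Presumed scalar fallback for the tail (the real helper is not shown in the
 * excerpts): count occurrences of 'byte' one element at a time. */
static uint64_t scalar_count_bytes(const uint8_t* data, size_t size, uint8_t byte) {
    uint64_t result = 0;
    for (size_t i = 0; i < size; i++)
        result += (data[i] == byte);
    return result;
}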
Example 7
uint64_t avx2_count_byte(const uint8_t* data, size_t size, uint8_t byte) {

    const __m256i v = _mm256_set1_epi8(byte);

    const uint8_t* end = data + size;
    const uint8_t* ptr = data;

    __m256i global_sum = _mm256_setzero_si256();
    __m256i local_sum;

    // 1. blocks of 255 registers (the most an 8-bit per-lane counter can hold)
    while (ptr + 255*32 < end) {
        local_sum = _mm256_setzero_si256();

        // update 32 x 8-bit counter
        for (int i=0; i < 255; i++, ptr += 32) {
            const __m256i in = _mm256_loadu_si256((const __m256i*)ptr);
            const __m256i eq = _mm256_cmpeq_epi8(in, v); // 0 or -1

            local_sum = _mm256_sub_epi8(local_sum, eq);
        }

        // update the global accumulator 2 x 64-bit
        const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256());
        global_sum = _mm256_add_epi64(global_sum, tmp);
    }


    // 2. tail of < 256 registers
    local_sum = _mm256_setzero_si256();
    while (ptr + 32 < end) {
        const __m256i in = _mm256_loadu_si256((const __m256i*)ptr);
        const __m256i eq = _mm256_cmpeq_epi8(in, v);

        local_sum = _mm256_sub_epi8(local_sum, eq);

        ptr += 32;
    }

    const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256());
    global_sum = _mm256_add_epi64(global_sum, tmp);


    // 3. process tail < 32 bytes
    uint64_t result = _mm256_extract_epi64(global_sum, 0)
                    + _mm256_extract_epi64(global_sum, 1)
                    + _mm256_extract_epi64(global_sum, 2)
                    + _mm256_extract_epi64(global_sum, 3);

    return result + scalar_count_bytes(ptr, end - ptr, byte);
}
Example 8
uint8_t
get_xE_avx(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om)
{
#ifdef HAVE_AVX2
  __m256i xEv_AVX;		           /* E state: keeps max for Mk->E as we go                     */
  __m256i beginv_AVX;                  /* begin scores                                              */
  uint8_t retval_AVX;
  int q_AVX;			   /* counter over vectors 0..nq-1                              */
  int Q_AVX        = P7_NVB_AVX(om->M);   /* segment length: # of vectors                              */

  int bands_AVX;                       /* the number of bands (rounds) to use                       */
  beginv_AVX =  _mm256_set1_epi8(128);
  xEv_AVX    =  beginv_AVX;
  /* function pointers for the various number of vectors to use */
  __m256i (*fs_AVX[MAX_BANDS + 1]) (const ESL_DSQ *, int, const P7_OPROFILE *, int, register __m256i, __m256i)
    = {NULL
       , calc_band_1_AVX,  calc_band_2_AVX,  calc_band_3_AVX,  calc_band_4_AVX,  calc_band_5_AVX,  calc_band_6_AVX
#if MAX_BANDS > 6
       , calc_band_7_AVX,  calc_band_8_AVX,  calc_band_9_AVX,  calc_band_10_AVX, calc_band_11_AVX, calc_band_12_AVX
       , calc_band_13_AVX, calc_band_14_AVX
#endif
#if MAX_BANDS > 14
       , calc_band_15_AVX, calc_band_16_AVX, calc_band_17_AVX, calc_band_18_AVX
#endif
  };

  int last_q;                  /* for saving the last q value to find band width            */
  int i;                           /* counter for bands                                         */

  last_q = 0; // reset in case we also ran SSE code
  /* Use the highest number of bands but no more than MAX_BANDS */
  bands_AVX = (Q_AVX + MAX_BANDS - 1) / MAX_BANDS;
  for (i = 0; i < bands_AVX; i++) {
    q_AVX = (Q_AVX * (i + 1)) / bands_AVX;

    xEv_AVX = fs_AVX[q_AVX-last_q](dsq, L, om, last_q, beginv_AVX, xEv_AVX);

    last_q = q_AVX;
  }

 
  retval_AVX = esl_avx_hmax_epu8(xEv_AVX);

  return retval_AVX;
#endif // HAVE_AVX2
#ifndef HAVE_AVX2
  	return 0;  // Stub so there's something to link if we don't have AVX2 support
#endif
}
Example 9
void
maddrc16_shuffle_avx2(uint8_t* region1, const uint8_t* region2,
					uint8_t constant, size_t length)
{
	uint8_t *end;
	register __m256i in1, in2, out, t1, t2, m1, m2, l, h;
	register __m128i bc;

	if (constant == 0)
		return;

	if (constant == 1) {
		xorr_avx2(region1, region2, length);
		return;
	}

	bc = _mm_load_si128((void *)tl[constant]);
	t1 = __builtin_ia32_vbroadcastsi256(bc);
	bc = _mm_load_si128((void *)th[constant]);
	t2 = __builtin_ia32_vbroadcastsi256(bc);
	m1 = _mm256_set1_epi8(0x0f);
	m2 = _mm256_set1_epi8(0xf0);

	for (end=region1+length; region1<end; region1+=32, region2+=32) {
		in2 = _mm256_load_si256((void *)region2);
		in1 = _mm256_load_si256((void *)region1);
		l = _mm256_and_si256(in2, m1);
		l = _mm256_shuffle_epi8(t1, l);
		h = _mm256_and_si256(in2, m2);
		h = _mm256_srli_epi64(h, 4);
		h = _mm256_shuffle_epi8(t2, h);
		out = _mm256_xor_si256(h,l);
		out = _mm256_xor_si256(out, in1);
		_mm256_store_si256((void *)region1, out);
	}
}
Example 10
static INLINE unsigned int masked_sad32xh_avx2(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  int x, y;
  __m256i res = _mm256_setzero_si256();
  const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m256i round_scale =
      _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 32) {
      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
      const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]);
      const __m256i m_inv = _mm256_sub_epi8(mask_max, m);

      // Calculate 16 predicted pixels.
      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
      // is 64 * 255, so we have plenty of space to add rounding constants.
      const __m256i data_l = _mm256_unpacklo_epi8(a, b);
      const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
      __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
      pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);

      const __m256i data_r = _mm256_unpackhi_epi8(a, b);
      const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
      __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
      pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);

      const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
      res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
  res = _mm256_shuffle_epi32(res, 0xd8);
  res = _mm256_permute4x64_epi64(res, 0xd8);
  res = _mm256_hadd_epi32(res, res);
  res = _mm256_hadd_epi32(res, res);
  int32_t sad = _mm256_extract_epi32(res, 0);
  return (sad + 31) >> 6;
}
		template <bool align> void SquaredDifferenceSumMasked(
			const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, 
			const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum)
		{
			assert(width < 0x10000);
			if(align)
			{
				assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride));
				assert(Aligned(mask) && Aligned(maskStride));
			}

			size_t bodyWidth = AlignLo(width, A);
			__m256i tailMask = SetMask<uint8_t>(0, A - width + bodyWidth, 0xFF);
			__m256i fullSum = _mm256_setzero_si256();
			__m256i index_= _mm256_set1_epi8(index);
			for(size_t row = 0; row < height; ++row)
			{
				__m256i rowSum = _mm256_setzero_si256();
				for(size_t col = 0; col < bodyWidth; col += A)
				{
					const __m256i mask_ = LoadMaskI8<align>((__m256i*)(mask + col), index_);
					const __m256i a_ = _mm256_and_si256(mask_, Load<align>((__m256i*)(a + col)));
					const __m256i b_ = _mm256_and_si256(mask_, Load<align>((__m256i*)(b + col))); 
					rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_));
				}
				if(width - bodyWidth)
				{
					const __m256i mask_ = _mm256_and_si256(tailMask, LoadMaskI8<false>((__m256i*)(mask + width - A), index_));
					const __m256i a_ = _mm256_and_si256(mask_, Load<false>((__m256i*)(a + width - A)));
					const __m256i b_ = _mm256_and_si256(mask_, Load<false>((__m256i*)(b + width - A))); 
					rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_));
				}
				fullSum = _mm256_add_epi64(fullSum, HorizontalSum32(rowSum));
				a += aStride;
				b += bStride;
				mask += maskStride;
			}
			*sum = ExtractSum<uint64_t>(fullSum);
		}
Example 12
static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
                                            const __m256i a,
                                            uint8_t *comp_pred) {
  const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS;
  const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits));

  const __m256i ma = _mm256_sub_epi8(alpha_max, a);

  const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1);
  const __m256i aaAL = _mm256_unpacklo_epi8(a, ma);
  const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1);
  const __m256i aaAH = _mm256_unpackhi_epi8(a, ma);

  const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL);
  const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH);
  const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset);
  const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset);

  const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH);
  _mm256_storeu_si256((__m256i *)(comp_pred), roundA);
}
static __m256i avx2_popcount(const __m256i vec) {

    const __m256i lookup = _mm256_setr_epi8(
        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4,

        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4
    );

    const __m256i low_mask = _mm256_set1_epi8(0x0f);

    const __m256i lo  = _mm256_and_si256(vec, low_mask);
    const __m256i hi  = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask);
    const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo);
    const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi);

    return _mm256_add_epi8(popcnt1, popcnt2);
}
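avx2_popcount returns 32 per-byte counts (each at most 8); a caller typically reduces them to a scalar total with _mm256_sad_epu8. A hypothetical reduction helper along those lines (not from the original source):

/* Hypothetical helper: sum the per-byte popcounts of one 256-bit vector
 * into a single 64-bit total. */
static uint64_t avx2_popcount_sum(const __m256i vec) {
    /* sad_epu8 against zero adds each group of 8 byte counts into a 64-bit lane. */
    const __m256i sums = _mm256_sad_epu8(avx2_popcount(vec), _mm256_setzero_si256());
    return (uint64_t)_mm256_extract_epi64(sums, 0)
         + (uint64_t)_mm256_extract_epi64(sums, 1)
         + (uint64_t)_mm256_extract_epi64(sums, 2)
         + (uint64_t)_mm256_extract_epi64(sums, 3);
}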
Example 14
        template<bool align> void HistogramMasked(const uint8_t * src, size_t srcStride, size_t width, size_t height, 
            const uint8_t * mask, size_t maskStride, uint8_t index, uint32_t * histogram)
        {
            Buffer<uint16_t> buffer(AlignHi(width, A), HISTOGRAM_SIZE + 8);
            size_t widthAligned4 = Simd::AlignLo(width, 4);
            size_t widthAlignedA = Simd::AlignLo(width, A);
            size_t widthAlignedDA = Simd::AlignLo(width, DA);
            __m256i _index = _mm256_set1_epi8(index);
            for(size_t row = 0; row < height; ++row)
            {
                size_t col = 0;
                for(; col < widthAlignedDA; col += DA)
                {
                    MaskSrc<align, true>(src, mask, _index, col, buffer.v);
                    MaskSrc<align, true>(src, mask, _index, col + A, buffer.v);
                }
                for(; col < widthAlignedA; col += A)
                    MaskSrc<align, true>(src, mask, _index, col, buffer.v);
                if(width != widthAlignedA)
                    MaskSrc<false, false>(src, mask, _index, width - A, buffer.v);

                for(col = 0; col < widthAligned4; col += 4)
                {
                    ++buffer.h[0][buffer.v[col + 0]];
                    ++buffer.h[1][buffer.v[col + 1]];
                    ++buffer.h[2][buffer.v[col + 2]];
                    ++buffer.h[3][buffer.v[col + 3]];
                }
                for(; col < width; ++col)
                    ++buffer.h[0][buffer.v[col]];

                src += srcStride;
                mask += maskStride;
            }

            SumHistograms(buffer.h[0], 8, histogram);
        }
Example 15
int normHamming(const uchar* a, const uchar* b, int n)
{
    CV_AVX_GUARD;

    int i = 0;
    int result = 0;
#if CV_AVX2
    {
        __m256i _r0 = _mm256_setzero_si256();
        __m256i _0 = _mm256_setzero_si256();
        __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
                                                 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
        __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);

        for(; i <= n - 32; i+= 32)
        {
            __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
            __m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i));

            __m256i _xor = _mm256_xor_si256(_a0, _b0);

            __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask));
            __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
                             _mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask));

            _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
        }
        _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
        result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
    }
#endif // CV_AVX2

#if CV_POPCNT
    {
#  if defined CV_POPCNT_U64
        for(; i <= n - 8; i += 8)
        {
            result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i));
        }
#  endif
        for(; i <= n - 4; i += 4)
        {
            result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
        }
    }
#endif // CV_POPCNT

#if CV_SIMD128
    {
        v_uint32x4 t = v_setzero_u32();
        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
        {
            t += v_popcount(v_load(a + i) ^ v_load(b + i));
        }
        result += v_reduce_sum(t);
    }
#endif // CV_SIMD128
#if CV_ENABLE_UNROLLED
    for(; i <= n - 4; i += 4)
    {
        result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
                popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
    }
#endif
    for(; i < n; i++)
    {
        result += popCountTable[a[i] ^ b[i]];
    }
    return result;
}
Example 16
/* Function:  p7_MSVFilter()
 * Synopsis:  Calculates MSV score, vewy vewy fast, in limited precision.
 *
 * Purpose:   Calculates an approximation of the MSV score for sequence
 *            <dsq> of length <L> residues, using optimized profile <om>,
 *            and the one-row DP matrix <ox>. Return the 
 *            estimated MSV score (in nats) in <ret_sc>.
 *            
 *            Score may overflow (and will, on high-scoring
 *            sequences), but will not underflow.
 *            
 *            <ox> will be resized if needed. It's fine if it was
 *            just <_Reuse()'d> from a previous, smaller profile.
 *            
 *            The model may be in any mode, because only its match
 *            emission scores will be used. The MSV filter inherently
 *            assumes a multihit local mode, and uses its own special
 *            state transition scores, not the scores in the profile.
 *
 * Args:      dsq     - digital target sequence, 1..L
 *            L       - length of dsq in residues          
 *            om      - optimized profile
 *            ox      - filter DP matrix (one row)
 *            ret_sc  - RETURN: MSV score (in nats)          
 *                      
 * Returns:   <eslOK> on success.
 *            <eslERANGE> if the score overflows the limited range; in
 *            this case, this is a high-scoring hit.
 *            <ox> may have been resized.
 *
 * Throws:    <eslEMEM> if <ox> reallocation fails.
 */
int
p7_MSVFilter_avx(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_FILTERMX *ox, float *ret_sc)
{
#ifdef HAVE_AVX2 
  uint8_t  xJ;                     /* special states' scores                  */
  register __m256i mpv_AVX;            /* previous row values                                       */
  register __m256i xEv_AVX;      /* E state: keeps max for Mk->E as we go                     */
  register __m256i xBv_AVX;      /* B state: splatted vector of B[i-1] for B->Mk calculations */
  register __m256i sv_AVX;       /* temp storage of 1 curr row value in progress              */
  register __m256i biasv_AVX;    /* emission bias in a vector                                 */
  __m256i *dp_AVX;               /* the dp row memory                                         */
  __m256i *rsc_AVX;        /* will point at om->rbv[x] for residue x[i]                 */
  __m256i xJv_AVX;                     /* vector for states score                                   */
  __m256i tjbmv_AVX;                   /* vector for cost of moving {JN}->B->M                      */
  __m256i tecv_AVX;                    /* vector for E->C  cost                                     */
  __m256i basev_AVX;                   /* offset for scores                                         */
  __m256i ceilingv_AVX;                /* saturated simd value used to test for overflow            */
  __m256i tempv_AVX;                   /* work vector                                               */
  int Q_AVX        = P7_NVB_AVX(om->M);    /* segment length: # of vectors                              */
  int q_AVX;         /* counter over vectors 0..nq-1                              */

  int i;         /* counter over sequence positions 1..L                      */
  int     cmp;
  int     status;
//printf("Starting MSVFilter\n");
  /* Contract checks */
  ESL_DASSERT1(( om->mode == p7_LOCAL )); /* Production code assumes multilocal mode w/ length model <L> */
  ESL_DASSERT1(( om->L    == L ));	  /*  ... and it's easy to forget to set <om> that way           */
  ESL_DASSERT1(( om->nj   == 1.0f ));	  /*  ... hence the check                                        */
                                          /*  ... which you can disable, if you're playing w/ config     */
  /* note however that it makes no sense to run MSV w/ a model in glocal mode                            */

  /* Try highly optimized Knudsen SSV filter first. 
   * Note that SSV doesn't use any main memory (from <ox>) at all! 
  */
 //extern uint64_t SSV_time;
 uint64_t filter_start_time = __rdtsc();
   status = p7_SSVFilter_avx(dsq, L, om, ret_sc);
    uint64_t filter_end_time = __rdtsc();
  //SSV_time += (filter_end_time - filter_start_time);   
   if (status != eslENORESULT) return status;
   extern uint64_t full_MSV_calls;
  
  full_MSV_calls++;
 
  /* Resize the filter mx as needed */
  if (( status = p7_filtermx_GrowTo(ox, om->M))    != eslOK) ESL_EXCEPTION(status, "Reallocation of MSV filter matrix failed");

  dp_AVX = ox->dp_AVX;  /* ditto this */

  /* Matrix type and size must be set early, not late: debugging dump functions need this information. */
  ox->M    = om->M;
  ox->type = p7F_MSVFILTER;

  /* Initialization. In offset unsigned arithmetic, -infinity is 0, and 0 is om->base.  */

  biasv_AVX = _mm256_set1_epi8((int8_t) om->bias_b); /* yes, you can set1() an unsigned char vector this way */

  for (q_AVX = 0; q_AVX < Q_AVX; q_AVX++) dp_AVX[q_AVX] = _mm256_setzero_si256(); 
  /* saturate simd register for overflow test */

  ceilingv_AVX = _mm256_cmpeq_epi8(biasv_AVX, biasv_AVX);
  basev_AVX    = _mm256_set1_epi8((int8_t) om->base_b);
  tjbmv_AVX    = _mm256_set1_epi8((int8_t) om->tjb_b + (int8_t) om->tbm_b);
  tecv_AVX     = _mm256_set1_epi8((int8_t) om->tec_b);
  xJv_AVX      = _mm256_subs_epu8(biasv_AVX, biasv_AVX);
  xBv_AVX      = _mm256_subs_epu8(basev_AVX, tjbmv_AVX);

#ifdef p7_DEBUGGING
  if (ox->do_dumping)
    {
      uint8_t xB;
      xB = _mm_extract_epi16(xBv, 0);
      xJ = _mm_extract_epi16(xJv, 0);
      p7_filtermx_DumpMFRow(ox, 0, 0, 0, xJ, xB, xJ);
    }
#endif


  for (i = 1; i <= L; i++)  /* Outer loop over residues*/
    {

      rsc_AVX = om->rbv_AVX[dsq[i]];
      xEv_AVX = _mm256_setzero_si256();      

      /* Right shifts by 1 byte. 4,8,12,x becomes x,4,8,12. 
       * Because ia32 is little-endian, this means a left bit shift.
       * Zeros shift on automatically, which is our -infinity.
       */
       __m256i dp_temp_AVX = dp_AVX[Q_AVX -1];
      mpv_AVX = esl_avx_leftshift_one(dp_temp_AVX);
      
      for (q_AVX = 0; q_AVX < Q_AVX; q_AVX++)
      {
        /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */
        sv_AVX   = _mm256_max_epu8(mpv_AVX, xBv_AVX);
        sv_AVX   = _mm256_adds_epu8(sv_AVX, biasv_AVX);
        sv_AVX   = _mm256_subs_epu8(sv_AVX, *rsc_AVX);   rsc_AVX++;
        xEv_AVX  = _mm256_max_epu8(xEv_AVX, sv_AVX);

        mpv_AVX   = dp_AVX[q_AVX];      /* Load {MDI}(i-1,q) into mpv */
        dp_AVX[q_AVX] = sv_AVX;           /* Do delayed store of M(i,q) now that memory is usable */
      }

      /* test for the overflow condition */
      tempv_AVX = _mm256_adds_epu8(xEv_AVX, biasv_AVX);
      tempv_AVX = _mm256_cmpeq_epi8(tempv_AVX, ceilingv_AVX);
      cmp   = _mm256_movemask_epi8(tempv_AVX);

      /* Now the "special" states, which start from Mk->E (->C, ->J->B)
       * Use shuffles instead of shifts so when the last max has completed,
       * the last four elements of the simd register will contain the
       * max value.  Then the last shuffle will broadcast the max value
       * to all simd elements.
       */

      xEv_AVX = _mm256_set1_epi8(esl_avx_hmax_epu8(xEv_AVX));  // broadcast the max byte from original xEv_AVX
      // to all bytes of xEv_AVX

      /* immediately detect overflow */
      if (cmp != 0x0000) {
   //            MSV_end_time = __rdtsc();
   //     MSV_time += (MSV_end_time - MSV_start_time);
        *ret_sc = eslINFINITY; return eslERANGE; }

      xEv_AVX = _mm256_subs_epu8(xEv_AVX, tecv_AVX);
      xJv_AVX = _mm256_max_epu8(xJv_AVX,xEv_AVX);
      xBv_AVX = _mm256_max_epu8(basev_AVX, xJv_AVX);
      xBv_AVX = _mm256_subs_epu8(xBv_AVX, tjbmv_AVX);

#ifdef p7_DEBUGGING
      if (ox->do_dumping)
	{
	  uint8_t xB, xE;
	  xB = _mm_extract_epi16(xBv, 0);
	  xE = _mm_extract_epi16(xEv, 0);
	  xJ = _mm_extract_epi16(xJv, 0);
	  p7_filtermx_DumpMFRow(ox, i, xE, 0, xJ, xB, xJ);
	}
#endif
    } /* end loop over sequence residues 1..L */

  /* finally C->T, and add our missing precision on the NN,CC,JJ back */
  xJ =  _mm256_extract_epi8(xJv_AVX, 0);

  *ret_sc = ((float) (xJ - om->tjb_b) - (float) om->base_b);
  *ret_sc /= om->scale_b;
  *ret_sc -= 3.0; /* that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ */
    /*     MSV_end_time = __rdtsc();
        MSV_time += (MSV_end_time - MSV_start_time); */

  return eslOK;
  #endif
  #ifndef HAVE_AVX2
  return eslENORESULT;  // Stub so we have something to link if we build without AVX2 support
  #endif
  }
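The header comment documents the only two statuses a caller needs to distinguish; a minimal hypothetical call site, assuming dsq, L, om and ox have already been set up as elsewhere in this code:

/* Hypothetical usage sketch (setup of dsq, L, om, ox not shown). */
float msv_sc;
int   status = p7_MSVFilter_avx(dsq, L, om, ox, &msv_sc);
if      (status == eslOK)     { /* msv_sc holds the MSV score in nats */ }
else if (status == eslERANGE) { /* score overflowed: treat as a high-scoring hit */ }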
Example 17
size_t vec_i8_count(const char *p, size_t n, char val)
{
	size_t num = 0;

#ifdef COREARRAY_SIMD_SSE2

	// header 1, 16-byte aligned
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--)
		if (*p++ == val) num++;

#   ifdef COREARRAY_SIMD_AVX2
	// body, AVX2
	const __m128i zeros = _mm_setzero_si128();
	const __m256i mask = _mm256_set1_epi8(val);
	__m256i sum = _mm256_setzero_si256();
	size_t offset = 0;

	// header 2, 32-byte aligned
	if ((n >= 16) && ((size_t)p & 0x10))
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask));
		sum = MM_SET_M128(_mm_sub_epi8(zeros, c1), zeros);
		n -= 16; p += 16;
	}

	for (; n >= 128; n-=128)
	{
		__m256i v = _mm256_load_si256((__m256i const*)p); p += 32;
		sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
		v = _mm256_load_si256((__m256i const*)p); p += 32;
		sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
		v = _mm256_load_si256((__m256i const*)p); p += 32;
		sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
		v = _mm256_load_si256((__m256i const*)p); p += 32;
		sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
		offset += 4;
		if (offset >= 252)
		{
			num += vec_avx_sum_u8(sum);
			sum = _mm256_setzero_si256();
			offset = 0;
		}
	}
	for (; n >= 32; n-=32)
	{
		__m256i v = _mm256_load_si256((__m256i const*)p); p += 32;
		sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
		if ((++offset) >= 252)
		{
			num += vec_avx_sum_u8(sum);
			sum = _mm256_setzero_si256();
			offset = 0;
		}
	}
	if (n >= 16)
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask));
		sum = _mm256_sub_epi8(sum, MM_SET_M128(zeros, c1));
		n -= 16; p += 16;
	}

	if (offset > 0)
		num += vec_avx_sum_u8(sum);

#   else
	// body, SSE2
	const __m128i mask = _mm_set1_epi8(val);
	__m128i sum = _mm_setzero_si128();
	size_t offset = 0;

	for (; n >= 64; n-=64)
	{
		__m128i v = _mm_load_si128((__m128i const*)p); p += 16;
		sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
		v = _mm_load_si128((__m128i const*)p); p += 16;
		sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
		v = _mm_load_si128((__m128i const*)p); p += 16;
		sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
		v = _mm_load_si128((__m128i const*)p); p += 16;
		sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
		offset += 4;
		if (offset >= 252)
		{
			num += vec_sum_u8(sum);
			sum = _mm_setzero_si128();
			offset = 0;
		}
	}
	for (; n >= 16; n-=16, p+=16)
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
		if ((++offset) >= 252)
		{
			num += vec_sum_u8(sum);
			sum = _mm_setzero_si128();
			offset = 0;
		}
	}

	if (offset > 0)
		num += vec_sum_u8(sum);
#endif

#endif

	// tail
	for (; n > 0; n--)
		if (*p++ == val) num++;
	return num;
}
Example 18
 /*!
  * \brief Fill a packed vector by replicating a value
  */
 ETL_STATIC_INLINE(avx_simd_byte) set(int8_t value) {
     return _mm256_set1_epi8(value);
 }
Example 19
void nibble_sort_beekman1(uint64_t *buf) {
  // already in the right order
  //__m256i
  // shuf0={0x1716151413121110ULL,0x1f1e1d1c1b1a1918ULL,0x0706050403020100ULL,0x0f0e0d0c0b0a0908ULL};
  __m256i shuf1 = {0x1e161c141a121810ULL, 0x1f171d151b131911ULL,
                   0x0e060c040a020800ULL, 0x0f070d050b030901ULL};
  __m256i shuf2 = {0x1d1c151419181110ULL, 0x1f1e17161b1a1312ULL,
                   0x0d0c050409080100ULL, 0x0f0e07060b0a0302ULL};
  // use fewer instructions below
  //__m256i
  // shuf3={0x1b1a191813121110ULL,0x1f1e1d1c17161514ULL,0x0b0a090803020100ULL,0x0f0e0d0c07060504ULL};
  __m256i shuf4 = {0x101d171615141311ULL, 0x1f1e1b191a181c12ULL,
                   0x000d070605040301ULL, 0x0f0e0b090a080c02ULL};
  __m256i shuf5 = {0x171d151413111810ULL, 0x1f1e16191c1b1a12ULL,
                   0x070d050403010800ULL, 0x0f0e06090c0b0a02ULL};
  __m256i shuf6 = {0x1e17161a15141211ULL, 0x1f101d1c1b191318ULL,
                   0x0e07060a05040201ULL, 0x0f000d0c0b090308ULL};
  __m256i shuf7 = {0x171510161b131911ULL, 0x1f1d181e1c141a12ULL,
                   0x070500060b030901ULL, 0x0f0d080e0c040a02ULL};
  __m256i shuf8 = {0x1715141613121110ULL, 0x1f1e1c1b1a19181dULL,
                   0x0705040603020100ULL, 0x0f0e0c0b0a09080dULL};
  __m256i shuf9 = {0x171c1b1a19181615ULL, 0x1f1e14131211101dULL,
                   0x070c0b0a09080605ULL, 0x0f0e04030201000dULL};
  __m256i nibblemask = _mm256_set1_epi8(0x0f);
  for (uint32_t i = 0; i < (1024 / 4); i += 1) {
    __m256i r0 = _mm256_loadu_si256(((__m256i *)buf) + i), r1 = r0, r2;
    r0 &= nibblemask;
    r1 ^= r0;
    r1 = _mm256_srli_epi64(r1, 4);

#define sort_and_shuffle(n)                                                    \
  r2 = _mm256_max_epi8(r0, r1);                                                \
  r0 = _mm256_min_epi8(r0, r1);                                                \
  r1 = (__m256i)_mm256_shuffle_pd((__m256d)r0, (__m256d)r2, 0b0000);           \
  r2 = (__m256i)_mm256_shuffle_pd((__m256d)r0, (__m256d)r2, 0b1111);           \
  r1 = _mm256_shuffle_epi8(r1, shuf##n);                                       \
  r2 = _mm256_shuffle_epi8(r2, shuf##n);                                       \
  r0 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b0000);           \
  r1 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b1111)

    sort_and_shuffle(1);
    sort_and_shuffle(2);
    { // sort_and_shuffle(3);
      r2 = _mm256_max_epi8(r0, r1);
      r0 = _mm256_min_epi8(r0, r1);
      r1 = (__m256i)_mm256_unpacklo_ps((__m256)r0, (__m256)r2);
      r2 = (__m256i)_mm256_unpackhi_ps((__m256)r0, (__m256)r2);
      r0 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b1111);
      r1 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b0000);
    }
    sort_and_shuffle(4);
    sort_and_shuffle(5);
    sort_and_shuffle(6);
    sort_and_shuffle(7);
    sort_and_shuffle(8);
    sort_and_shuffle(9);

    r1 = _mm256_slli_epi64(r1, 4);
    _mm256_storeu_si256(((__m256i *)buf) + i, r1 | r0);
  }
}
/*
 * Do or undo the 'E8' preprocessing used in LZX.  Before compression, the
 * uncompressed data is preprocessed by changing the targets of x86 CALL
 * instructions from relative offsets to absolute offsets.  After decompression,
 * the translation is undone by changing the targets of x86 CALL instructions
 * from absolute offsets to relative offsets.
 *
 * Note that despite its intent, E8 preprocessing can be done on any data even
 * if it is not actually x86 machine code.  In fact, E8 preprocessing appears to
 * always be used in LZX-compressed resources in WIM files; there is no bit to
 * indicate whether it is used or not, unlike in the LZX compressed format as
 * used in cabinet files, where a bit is reserved for that purpose.
 *
 * E8 preprocessing is disabled in the last 6 bytes of the uncompressed data,
 * which really means the 5-byte call instruction cannot start in the last 10
 * bytes of the uncompressed data.  This is one of the errors in the LZX
 * documentation.
 *
 * E8 preprocessing does not appear to be disabled after the 32768th chunk of a
 * WIM resource, which apparently is another difference from the LZX compression
 * used in cabinet files.
 *
 * E8 processing is supposed to take the file size as a parameter, as it is used
 * in calculating the translated jump targets.  But in WIM files, this file size
 * is always the same (LZX_WIM_MAGIC_FILESIZE == 12000000).
 */
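Before the scanner itself, here is a sketch of the per-call-site rewrite described above: the job of the process_target callback that lzx_e8_filter receives. The function and its read_le32/write_le32 helpers are illustrative assumptions, not code from this library:

/* Illustration only: translate the 32-bit offset that follows an E8 byte at
 * position input_pos from relative to absolute form (compression direction),
 * using the fixed LZX_WIM_MAGIC_FILESIZE mentioned above.  read_le32() and
 * write_le32() are assumed little-endian load/store helpers. */
static void
example_translate_target(void *target, s32 input_pos)
{
	s32 rel_offset = read_le32(target);

	if (rel_offset >= -input_pos && rel_offset < LZX_WIM_MAGIC_FILESIZE) {
		s32 abs_offset = (rel_offset < LZX_WIM_MAGIC_FILESIZE - input_pos)
				 ? rel_offset + input_pos               /* normal case */
				 : rel_offset - LZX_WIM_MAGIC_FILESIZE; /* wrap-around case */
		write_le32(target, abs_offset);
	}
}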
static void
lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32))
{

#if !defined(__SSE2__) && !defined(__AVX2__)
	/*
	 * A worthwhile optimization is to push the end-of-buffer check into the
	 * relatively rare E8 case.  This is possible if we replace the last six
	 * bytes of data with E8 bytes; then we are guaranteed to hit an E8 byte
	 * before reaching end-of-buffer.  In addition, this scheme guarantees
	 * that no translation can begin following an E8 byte in the last 10
	 * bytes because a 4-byte offset containing E8 as its high byte is a
	 * large negative number that is not valid for translation.  That is
	 * exactly what we need.
	 */
	u8 *tail;
	u8 saved_bytes[6];
	u8 *p;

	if (size <= 10)
		return;

	tail = &data[size - 6];
	memcpy(saved_bytes, tail, 6);
	memset(tail, 0xE8, 6);
	p = data;
	for (;;) {
		while (*p != 0xE8)
			p++;
		if (p >= tail)
			break;
		(*process_target)(p + 1, p - data);
		p += 5;
	}
	memcpy(tail, saved_bytes, 6);
#else
	/* SSE2 or AVX-2 optimized version for x86_64  */

	u8 *p = data;
	u64 valid_mask = ~0;

	if (size <= 10)
		return;
#ifdef __AVX2__
#  define ALIGNMENT_REQUIRED 32
#else
#  define ALIGNMENT_REQUIRED 16
#endif

	/* Process one byte at a time until the pointer is properly aligned.  */
	while ((uintptr_t)p % ALIGNMENT_REQUIRED != 0) {
		if (p >= data + size - 10)
			return;
		if (*p == 0xE8 && (valid_mask & 1)) {
			(*process_target)(p + 1, p - data);
			valid_mask &= ~0x1F;
		}
		p++;
		valid_mask >>= 1;
		valid_mask |= (u64)1 << 63;
	}

	if (data + size - p >= 64) {

		/* Vectorized processing  */

		/* Note: we use a "trap" E8 byte to eliminate the need to check
		 * for end-of-buffer in the inner loop.  This byte is carefully
		 * positioned so that it will never be changed by a previous
		 * translation before it is detected.  */

		u8 *trap = p + ((data + size - p) & ~31) - 32 + 4;
		u8 saved_byte = *trap;
		*trap = 0xE8;

		for (;;) {
			u32 e8_mask;
			u8 *orig_p = p;
		#ifdef __AVX2__
			const __m256i e8_bytes = _mm256_set1_epi8(0xE8);
			for (;;) {
				__m256i bytes = *(const __m256i *)p;
				__m256i cmpresult = _mm256_cmpeq_epi8(bytes, e8_bytes);
				e8_mask = _mm256_movemask_epi8(cmpresult);
				if (e8_mask)
					break;
				p += 32;
			}
		#else
			const __m128i e8_bytes = _mm_set1_epi8(0xE8);
			for (;;) {
				/* Read the next 32 bytes of data and test them
				 * for E8 bytes.  */
				__m128i bytes1 = *(const __m128i *)p;
				__m128i bytes2 = *(const __m128i *)(p + 16);
				__m128i cmpresult1 = _mm_cmpeq_epi8(bytes1, e8_bytes);
				__m128i cmpresult2 = _mm_cmpeq_epi8(bytes2, e8_bytes);
				u32 mask1 = _mm_movemask_epi8(cmpresult1);
				u32 mask2 = _mm_movemask_epi8(cmpresult2);
				/* The masks have a bit set for each E8 byte.
				 * We stay in this fast inner loop as long as
				 * there are no E8 bytes.  */
				if (mask1 | mask2) {
					e8_mask = mask1 | (mask2 << 16);
					break;
				}
				p += 32;
			}
		#endif

			/* Did we pass over data with no E8 bytes?  */
			if (p != orig_p)
				valid_mask = ~0;

			/* Are we nearing end-of-buffer?  */
			if (p == trap - 4)
				break;

			/* Process the E8 bytes.  However, the AND with
			 * 'valid_mask' ensures we never process an E8 byte that
			 * was itself part of a translation target.  */
			while ((e8_mask &= valid_mask)) {
				unsigned bit = bsf32(e8_mask);
				(*process_target)(p + bit + 1, p + bit - data);
				valid_mask &= ~((u64)0x1F << bit);
			}

			valid_mask >>= 32;
			valid_mask |= 0xFFFFFFFF00000000;
			p += 32;
		}

		*trap = saved_byte;
	}
#else
    parasail_result_t *result = parasail_result_new_stats();
#endif
#endif
    int32_t i = 0;
    int32_t j = 0;
    int32_t end_query = 0;
    int32_t end_ref = 0;
    const int8_t NEG_LIMIT = (-open < matrix->min ?
        INT8_MIN + open : INT8_MIN - matrix->min) + 1;
    const int8_t POS_LIMIT = INT8_MAX - matrix->max - 1;
    int8_t score = NEG_LIMIT;
    int8_t matches = NEG_LIMIT;
    int8_t similar = NEG_LIMIT;
    int8_t length = NEG_LIMIT;
    __m256i vNegLimit = _mm256_set1_epi8(NEG_LIMIT);
    __m256i vPosLimit = _mm256_set1_epi8(POS_LIMIT);
    __m256i vSaturationCheckMin = vPosLimit;
    __m256i vSaturationCheckMax = vNegLimit;
    __m256i vNegInf = _mm256_set1_epi8(NEG_LIMIT);
    __m256i vNegInf0 = _mm256_srli_si256_rpl(vNegInf, 1); /* shift in a 0 */
    __m256i vOpen = _mm256_set1_epi8(open);
    __m256i vGap  = _mm256_set1_epi8(gap);
    __m256i vZero = _mm256_set1_epi8(0);
    __m256i vOne = _mm256_set1_epi8(1);
    __m256i vOne16 = _mm256_set1_epi16(1);
    __m256i vNegOne16 = _mm256_set1_epi16(-1);
    __m256i vN16 = _mm256_set1_epi16(N);
    __m256i vILo16 = _mm256_set_epi16(16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31);
    __m256i vIHi16 = _mm256_set_epi16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
    __m256i vJresetLo16 = _mm256_set_epi16(-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
Example 22
	      _mm256_setr_epi8(
			 3,  2,  1,  0,
			 7,  6,  5,  4,
			11, 10,  9,  8,
			15, 14, 13, 12,
			 3,  2,  1,  0,
			 7,  6,  5,  4,
			11, 10,  9,  8,
			15, 14, 13, 12));

	/* The bits have now been shifted to the right locations;
	 * translate their values 0..63 to the Base64 alphabet.
	 * Because AVX2 can only compare 'greater than', start from end of alphabet: */

	/* set 5: 63, "/" */
	s5mask = _mm256_cmpeq_epi8(res, _mm256_set1_epi8(63));
	blockmask = s5mask;

	/* set 4: 62, "+" */
	s4mask = _mm256_cmpeq_epi8(res, _mm256_set1_epi8(62));
	blockmask = _mm256_or_si256(blockmask, s4mask);

	/* set 3: 52..61, "0123456789" */
	s3mask = _mm256_andnot_si256(blockmask, _mm256_cmpgt_epi8(res, _mm256_set1_epi8(51)));
	blockmask = _mm256_or_si256(blockmask, s3mask);

	/* set 2: 26..51, "abcdefghijklmnopqrstuvwxyz" */
	s2mask = _mm256_andnot_si256(blockmask, _mm256_cmpgt_epi8(res, _mm256_set1_epi8(25)));
	blockmask = _mm256_or_si256(blockmask, s2mask);

	/* set 1: 0..25, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
Example 23
void vec_i8_count3(const char *p, size_t n, char val1, char val2, char val3,
	size_t *out_n1, size_t *out_n2, size_t *out_n3)
{
	size_t n1 = 0, n2 = 0, n3 = 0;

#ifdef COREARRAY_SIMD_SSE2

	// header 1, 16-byte aligned
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--)
	{
		char v = *p++;
		if (v == val1) n1++;
		if (v == val2) n2++;
		if (v == val3) n3++;
	}

#   ifdef COREARRAY_SIMD_AVX2
	// body, AVX2
	const __m128i zeros = _mm_setzero_si128();
	const __m256i mask1 = _mm256_set1_epi8(val1);
	const __m256i mask2 = _mm256_set1_epi8(val2);
	const __m256i mask3 = _mm256_set1_epi8(val3);
	__m256i sum1, sum2, sum3;
	sum1 = sum2 = sum3 = _mm256_setzero_si256();
	size_t offset = 0;

	// header 2, 32-byte aligned
	if ((n >= 16) && ((size_t)p & 0x10))
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask1));
		sum1 = MM_SET_M128(_mm_sub_epi8(zeros, c1), zeros);
		__m128i c2 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask2));
		sum2 = MM_SET_M128(_mm_sub_epi8(zeros, c2), zeros);
		__m128i c3 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask3));
		sum3 = MM_SET_M128(_mm_sub_epi8(zeros, c3), zeros);
		n -= 16; p += 16;
	}

	for (; n >= 32; n-=32, p+=32)
	{
		__m256i v = _mm256_load_si256((__m256i const*)p);
		sum1 = _mm256_sub_epi8(sum1, _mm256_cmpeq_epi8(v, mask1));
		sum2 = _mm256_sub_epi8(sum2, _mm256_cmpeq_epi8(v, mask2));
		sum3 = _mm256_sub_epi8(sum3, _mm256_cmpeq_epi8(v, mask3));
		if ((++offset) >= 252)
		{
			n1 += vec_avx_sum_u8(sum1);
			n2 += vec_avx_sum_u8(sum2);
			n3 += vec_avx_sum_u8(sum3);
			sum1 = sum2 = sum3 = _mm256_setzero_si256();
			offset = 0;
		}
	}

	if (n >= 16)
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask1));
		sum1 = _mm256_sub_epi8(sum1, MM_SET_M128(c1, zeros));
		__m128i c2 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask2));
		sum2 = _mm256_sub_epi8(sum2, MM_SET_M128(c2, zeros));
		__m128i c3 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask3));
		sum3 = _mm256_sub_epi8(sum3, MM_SET_M128(c3, zeros));
		n -= 16; p += 16;
	}

	if (offset > 0)
	{
		n1 += vec_avx_sum_u8(sum1);
		n2 += vec_avx_sum_u8(sum2);
		n3 += vec_avx_sum_u8(sum3);
	}

#   else
	// body, SSE2
	const __m128i mask1 = _mm_set1_epi8(val1);
	const __m128i mask2 = _mm_set1_epi8(val2);
	const __m128i mask3 = _mm_set1_epi8(val3);
	__m128i sum1, sum2, sum3;
	sum1 = sum2 = sum3 = _mm_setzero_si128();
	size_t offset = 0;

	for (; n >= 16; n-=16, p+=16)
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		sum1 = _mm_sub_epi8(sum1, _mm_cmpeq_epi8(v, mask1));
		sum2 = _mm_sub_epi8(sum2, _mm_cmpeq_epi8(v, mask2));
		sum3 = _mm_sub_epi8(sum3, _mm_cmpeq_epi8(v, mask3));
		if ((++offset) >= 252)
		{
			n1 += vec_sum_u8(sum1);
			n2 += vec_sum_u8(sum2);
			n3 += vec_sum_u8(sum3);
			sum1 = sum2 = sum3 = _mm_setzero_si128();
			offset = 0;
		}
	}

	if (offset > 0)
	{
		n1 += vec_sum_u8(sum1);
		n2 += vec_sum_u8(sum2);
		n3 += vec_sum_u8(sum3);
	}
#endif

#endif

	// tail
	for (; n > 0; n--)
	{
		char v = *p++;
		if (v == val1) n1++;
		if (v == val2) n2++;
		if (v == val3) n3++;
	}

	if (out_n1) *out_n1 = n1;
	if (out_n2) *out_n2 = n2;
	if (out_n3) *out_n3 = n3;
}
Example 24
void vec_i8_replace(int8_t *p, size_t n, int8_t val, int8_t substitute)
{
#ifdef COREARRAY_SIMD_SSE2

	// header 1, 16-byte aligned
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p++)
		if (*p == val) *p = substitute;

	// body, SSE2
	const __m128i mask = _mm_set1_epi8(val);
	const __m128i sub  = _mm_set1_epi8(substitute);

#   ifdef COREARRAY_SIMD_AVX2

	// header 2, 32-byte aligned
	if ((n >= 16) && ((size_t)p & 0x10))
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c = _mm_cmpeq_epi8(v, mask);
		if (_mm_movemask_epi8(c))
		{
			_mm_store_si128((__m128i *)p,
				_mm_or_si128(_mm_and_si128(c, sub), _mm_andnot_si128(c, v)));
		}
		n -= 16; p += 16;
	}

	const __m256i mask2 = _mm256_set1_epi8(val);
	const __m256i sub32 = _mm256_set1_epi8(substitute);
	const __m256i zero = _mm256_setzero_si256();
	const __m256i ones = _mm256_cmpeq_epi64(zero, zero);

	for (; n >= 32; n-=32, p+=32)
	{
		__m256i v = _mm256_load_si256((__m256i const*)p);
		__m256i c = _mm256_cmpeq_epi8(v, mask2);
		if (_mm256_movemask_epi8(c))
		{
			// TODO
			_mm256_store_si256((__m256i *)p,
				_mm256_or_si256(_mm256_and_si256(c, sub32),
				_mm256_andnot_si256(c, v)));
		}
	}

#   endif

	for (; n >= 16; n-=16, p+=16)
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c = _mm_cmpeq_epi8(v, mask);
		if (_mm_movemask_epi8(c))
			_mm_maskmoveu_si128(sub, c, (char*)p);
	}

#endif

	// tail
	for (; n > 0; n--, p++)
		if (*p == val) *p = substitute;
}
Example 25
void vec_i8_cnt_dosage2(const int8_t *p, int8_t *out, size_t n, int8_t val,
	int8_t missing, int8_t missing_substitute)
{
#ifdef COREARRAY_SIMD_SSE2

	// header 1, 16-byte aligned
	size_t h = (16 - ((size_t)out & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p+=2)
	{
		*out ++ = ((p[0] == missing) || (p[1] == missing)) ?
			missing_substitute :
			(p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0);
	}

	// body, SSE2
	const __m128i val16  = _mm_set1_epi8(val);
	const __m128i miss16 = _mm_set1_epi8(missing);
	const __m128i sub16  = _mm_set1_epi8(missing_substitute);
	const __m128i mask   = _mm_set1_epi16(0x00FF);

#   ifdef COREARRAY_SIMD_AVX2

	// header 2, 32-byte aligned
	if ((n >= 16) && ((size_t)out & 0x10))
	{
		__m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16;
		__m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16;

		__m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask));
		__m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8));

		__m128i c = _mm_setzero_si128();
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16));
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16));

		w1 = _mm_cmpeq_epi8(v1, miss16);
		w2 = _mm_cmpeq_epi8(v2, miss16);
		__m128i w  = _mm_or_si128(w1, w2);
		c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c));

		_mm_store_si128((__m128i *)out, c);
		n -= 16; out += 16;
	}

	const __m256i val32  = _mm256_set1_epi8(val);
	const __m256i miss32 = _mm256_set1_epi8(missing);
	const __m256i sub32  = _mm256_set1_epi8(missing_substitute);
	const __m256i mask2  = _mm256_set1_epi16(0x00FF);

	for (; n >= 32; n-=32)
	{
		__m256i w1 = MM_LOADU_256((__m256i const*)p); p += 32;
		__m256i w2 = MM_LOADU_256((__m256i const*)p); p += 32;

		__m256i v1 = _mm256_packus_epi16(_mm256_and_si256(w1, mask2), _mm256_and_si256(w2, mask2));
		__m256i v2 = _mm256_packus_epi16(_mm256_srli_epi16(w1, 8), _mm256_srli_epi16(w2, 8));

		__m256i c = _mm256_setzero_si256();
		c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v1, val32));
		c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v2, val32));

		w1 = _mm256_cmpeq_epi8(v1, miss32);
		w2 = _mm256_cmpeq_epi8(v2, miss32);
		__m256i w = _mm256_or_si256(w1, w2);
		c = _mm256_or_si256(_mm256_and_si256(w, sub32), _mm256_andnot_si256(w, c));

		c = _mm256_permute4x64_epi64(c, 0xD8);
		_mm256_store_si256((__m256i *)out, c);
		out += 32;
	}

#   endif

	// SSE2 only
	for (; n >= 16; n-=16)
	{
		__m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16;
		__m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16;

		__m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask));
		__m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8));

		__m128i c = _mm_setzero_si128();
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16));
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16));

		w1 = _mm_cmpeq_epi8(v1, miss16);
		w2 = _mm_cmpeq_epi8(v2, miss16);
		__m128i w = _mm_or_si128(w1, w2);
		c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c));

		_mm_store_si128((__m128i *)out, c);
		out += 16;
	}

#endif

	// tail
	for (; n > 0; n--, p+=2)
	{
		*out ++ = ((p[0] == missing) || (p[1] == missing)) ?
			missing_substitute :
			(p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0);
	}
}
Example 26
    int8_t * const restrict del_pr = _del_pr+PAD;
#ifdef PARASAIL_TABLE
    parasail_result_t *result = parasail_result_new_table1(s1Len, s2Len);
#else
#ifdef PARASAIL_ROWCOL
    parasail_result_t *result = parasail_result_new_rowcol1(s1Len, s2Len);
#else
    parasail_result_t *result = parasail_result_new();
#endif
#endif
    int32_t i = 0;
    int32_t j = 0;
    int32_t end_query = 0;
    int32_t end_ref = 0;
    int8_t score = NEG_INF;
    __m256i vNegInf = _mm256_set1_epi8(NEG_INF);
    __m256i vNegInf0 = _mm256_srli_si256_rpl(vNegInf, 1); /* shift in a 0 */
    __m256i vOpen = _mm256_set1_epi8(open);
    __m256i vGap  = _mm256_set1_epi8(gap);
    __m256i vZero = _mm256_set1_epi8(0);
    __m256i vOne16 = _mm256_set1_epi16(1);
    __m256i vNegOne16 = _mm256_set1_epi16(-1);
    __m256i vN16 = _mm256_set1_epi16(N);
    __m256i vILo16 = _mm256_set_epi16(16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31);
    __m256i vIHi16 = _mm256_set_epi16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
    __m256i vJresetLo16 = _mm256_set_epi16(-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
    __m256i vJresetHi16 = _mm256_set_epi16(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15);
    __m256i vMax = vNegInf;
    __m256i vEndILo = vNegInf;
    __m256i vEndIHi = vNegInf;
    __m256i vEndJLo = vNegInf;
Example 27
void*  xmemset(void* dest, int c, size_t n)
{
    void* ret = dest;
    if (n < 16) {
        xmemset_lt16(dest, c, n);
        return ret;
    }

    __m256i mm = _mm256_set1_epi8((char)c);

    if (((unsigned long)dest & 31) == 0) {
        for ( ; n >= 256; n -= 256) {
            _mm256_store_si256((__m256i*)dest, mm);
            _mm256_store_si256((__m256i*)dest + 1, mm);
            _mm256_store_si256((__m256i*)dest + 2, mm);
            _mm256_store_si256((__m256i*)dest + 3, mm);
            _mm256_store_si256((__m256i*)dest + 4, mm);
            _mm256_store_si256((__m256i*)dest + 5, mm);
            _mm256_store_si256((__m256i*)dest + 6, mm);
            _mm256_store_si256((__m256i*)dest + 7, mm); // 8
            dest = (void*)((__m256i*)dest + 8);
        }

        if (n >= 128) {
            _mm256_store_si256((__m256i*)dest, mm);
            _mm256_store_si256((__m256i*)dest + 1, mm);
            _mm256_store_si256((__m256i*)dest + 2, mm);
            _mm256_store_si256((__m256i*)dest + 3, mm);
            dest = (void*)((__m256i*)dest + 4);
            n -= 128;
        }

        if (n >= 64) {
            _mm256_store_si256((__m256i*)dest, mm);
            _mm256_store_si256((__m256i*)dest + 1, mm);
            dest = (void*)((__m256i*)dest + 2);
            n -= 64;
        }

        if (n >= 32) {
            _mm256_store_si256((__m256i*)dest, mm);
            dest = (void*)((__m256i*)dest + 1);
            n -= 32;
        }

        if (n >= 16) {
            _mm_store_si128((__m128i*)dest, _mm_set1_epi8((char)c));
            dest = (void*)((__m128i*)dest + 1);
            n -= 16;
        }

    } else {
        for ( ; n >= 256; n -= 256) {
            _mm256_storeu_si256((__m256i*)dest, mm);
            _mm256_storeu_si256((__m256i*)dest + 1, mm);
            _mm256_storeu_si256((__m256i*)dest + 2, mm);
            _mm256_storeu_si256((__m256i*)dest + 3, mm);
            _mm256_storeu_si256((__m256i*)dest + 4, mm);
            _mm256_storeu_si256((__m256i*)dest + 5, mm);
            _mm256_storeu_si256((__m256i*)dest + 6, mm);
            _mm256_storeu_si256((__m256i*)dest + 7, mm); // 8
            dest = (void*)((__m256i*)dest + 8);
        }

        if (n >= 128) {
            _mm256_storeu_si256((__m256i*)dest, mm);
            _mm256_storeu_si256((__m256i*)dest + 1, mm);
            _mm256_storeu_si256((__m256i*)dest + 2, mm);
            _mm256_storeu_si256((__m256i*)dest + 3, mm);
            dest = (void*)((__m256i*)dest + 4);
            n -= 128;
        }

        if (n >= 64) {
            _mm256_storeu_si256((__m256i*)dest, mm);
            _mm256_storeu_si256((__m256i*)dest + 1, mm);
            dest = (void*)((__m256i*)dest + 2);
            n -= 64;
        }

        if (n >= 32) {
            _mm256_storeu_si256((__m256i*)dest, mm);
            dest = (void*)((__m256i*)dest + 1);
            n -= 32;
        }

        if (n >= 16) {
            _mm_storeu_si128((__m128i*)dest, _mm_set1_epi8((char)c));
            dest = (void*)((__m128i*)dest + 1);
            n -= 16;
        }
    }

    xmemset_lt16(dest, c, n);
    return ret;
}
Example 28
// count genotype sum and number of calls, not requiring 16-aligned p
COREARRAY_DLL_DEFAULT C_UInt8* vec_u8_geno_count(C_UInt8 *p,
	size_t n, C_Int32 &out_sum, C_Int32 &out_num)
{
	C_Int32 sum=0, num=0;

#if defined(COREARRAY_SIMD_AVX2)

	const __m256i three = _mm256_set1_epi8(3);
	const __m256i zero = _mm256_setzero_si256();
	__m256i sum32 = zero, num32 = zero;
	size_t limit_by_U8 = 0;

	for (; n >= 32; )
	{
		__m256i v = _mm256_loadu_si256((__m256i const*)p);
		p += 32;
		__m256i m = _mm256_cmpgt_epi8(three, _mm256_min_epu8(v, three));
		sum32 = _mm256_add_epi8(sum32, _mm256_and_si256(v, m));
		num32 = _mm256_sub_epi8(num32, m);
		n -= 32;
		limit_by_U8 ++;
		if ((limit_by_U8 >= 127) || (n < 32))
		{
			// add to sum
			sum32 = _mm256_sad_epu8(sum32, zero);
			sum32 = _mm256_add_epi32(sum32,
				_mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(1,0,3,2)));
			sum32 = _mm256_add_epi32(sum32,
				_mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(0,0,0,1)));
			sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(sum32));
			// add to num
			num32 = _mm256_sad_epu8(num32, zero);
			num32 = _mm256_add_epi32(num32,
				_mm256_permute4x64_epi64(num32, _MM_SHUFFLE(1,0,3,2)));
			num32 = _mm256_add_epi32(num32,
				_mm256_permute4x64_epi64(num32, _MM_SHUFFLE(0,0,0,1)));
			num += _mm_cvtsi128_si32(_mm256_castsi256_si128(num32));
			// reset
			sum32 = num32 = zero;
			limit_by_U8 = 0;
		}
	}

#elif defined(COREARRAY_SIMD_SSE2)

	// header, 16-byte aligned
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p++)
		if (*p <= 2) { sum += *p; num++; }

	const __m128i three = _mm_set1_epi8(3);
	const __m128i zero = _mm_setzero_si128();
	__m128i sum16=zero, num16=zero;
	size_t limit_by_U8 = 0;

	for (; n >= 16; )
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		p += 16;
		__m128i m = _mm_cmpgt_epi8(three, _mm_min_epu8(v, three));
		sum16 = _mm_add_epi8(sum16, v & m);
		num16 = _mm_sub_epi8(num16, m);
		n -= 16;
		limit_by_U8 ++;
		if ((limit_by_U8 >= 127) || (n < 16))
		{
			// add to sum
			sum16 = _mm_sad_epu8(sum16, zero);
			sum += _mm_cvtsi128_si32(sum16);
			sum += _mm_cvtsi128_si32(_mm_shuffle_epi32(sum16, 2));
			// add to num
			num16 = _mm_sad_epu8(num16, zero);
			num += _mm_cvtsi128_si32(num16);
			num += _mm_cvtsi128_si32(_mm_shuffle_epi32(num16, 2));
			// reset
			sum16 = num16 = zero;
			limit_by_U8 = 0;
		}
	}

#endif

	for (; n > 0; n--, p++)
		if (*p <= 2) { sum += *p; num++; }
	out_sum = sum;
	out_num = num;
	return p;
}