Example 1
uint64_t avx2_count_byte(const uint8_t* data, size_t size, uint8_t byte) {

    const __m256i v = _mm256_set1_epi8(byte);

    const uint8_t* end = data + size;
    const uint8_t* ptr = data;

    __m256i global_sum = _mm256_setzero_si256();
    __m256i local_sum;

    // 1. blocks of 255 registers (so the 8-bit counters cannot overflow)
    while (ptr + 255*32 < end) {
        local_sum = _mm256_setzero_si256();

        // update 32 x 8-bit counters
        for (int i=0; i < 255; i++, ptr += 32) {
            const __m256i in = _mm256_loadu_si256((const __m256i*)ptr);
            const __m256i eq = _mm256_cmpeq_epi8(in, v); // 0 or -1

            local_sum = _mm256_sub_epi8(local_sum, eq);
        }

        // fold the 8-bit counters into the global 4 x 64-bit accumulator
        const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256());
        global_sum = _mm256_add_epi64(global_sum, tmp);
    }


    // 2. tail of < 255 registers
    local_sum = _mm256_setzero_si256();
    while (ptr + 32 < end) {
        const __m256i in = _mm256_loadu_si256((const __m256i*)ptr);
        const __m256i eq = _mm256_cmpeq_epi8(in, v);

        local_sum = _mm256_sub_epi8(local_sum, eq);

        ptr += 32;
    }

    const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256());
    global_sum = _mm256_add_epi64(global_sum, tmp);


    // 3. reduce the 4 x 64-bit accumulator, then count the remaining < 32 bytes with scalar code
    uint64_t result = _mm256_extract_epi64(global_sum, 0)
                    + _mm256_extract_epi64(global_sum, 1)
                    + _mm256_extract_epi64(global_sum, 2)
                    + _mm256_extract_epi64(global_sum, 3);

    return result + scalar_count_bytes(ptr, end - ptr, byte);
}
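The function above leaves the final partial block (fewer than 32 bytes) to a scalar helper, scalar_count_bytes, which is not part of this excerpt. A minimal sketch of what such a helper presumably looks like:

// Assumed helper: count matching bytes one at a time (used for the < 32-byte tail).
static uint64_t scalar_count_bytes(const uint8_t* data, size_t size, uint8_t byte) {
    uint64_t count = 0;
    for (size_t i = 0; i < size; i++)
        count += (data[i] == byte);
    return count;
}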
Example 2
static inline __m256i
enc_translate (const __m256i in)
{
	// The LUT contains the absolute offset for each range:
	const __m256i lut = _mm256_setr_epi8(65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
	                                     65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
	// Translate values 0..63 to the Base64 alphabet. There are five sets:
	// #  From      To         Abs    Index  Characters
	// 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
	// 2  [52..61]  [48..57]    -4  [2..11]  0123456789
	// 3  [62]      [43]       -19       12  +
	// 4  [63]      [47]       -16       13  /

	// Create LUT indices from input:
	// the index for range #0 is correct, the others are one less than expected:
	__m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));

	// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
	__m256i mask = CMPGT(in, 25);

	// subtract -1 (i.e. add 1) from the indices for ranges #[1..4]; all indices are now correct:
	indices = _mm256_sub_epi8(indices, mask);

	// Add offsets to input values:
	__m256i out = _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));

	return out;
}
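CMPGT is a macro from the surrounding base64 codec and is not defined in this excerpt. Judging from its use above, it presumably performs a signed byte-wise "greater than" comparison against a broadcast constant, along these lines:

// Assumed definition of the CMPGT helper used in enc_translate:
#define CMPGT(s, n)  _mm256_cmpgt_epi8((s), _mm256_set1_epi8(n))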
Example 3
static INLINE unsigned int masked_sad32xh_avx2(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  int x, y;
  __m256i res = _mm256_setzero_si256();
  const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m256i round_scale =
      _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 32) {
      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
      const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]);
      const __m256i m_inv = _mm256_sub_epi8(mask_max, m);

      // Calculate 32 predicted pixels (16 in 'pred_l' and 16 in 'pred_r').
      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
      // is 64 * 255, so we have plenty of space to add rounding constants.
      const __m256i data_l = _mm256_unpacklo_epi8(a, b);
      const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
      __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
      pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);

      const __m256i data_r = _mm256_unpackhi_epi8(a, b);
      const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
      __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
      pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);

      const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
      res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs in lanes 0, 2, 4 and 6 of 'res'.
  res = _mm256_shuffle_epi32(res, 0xd8);
  res = _mm256_permute4x64_epi64(res, 0xd8);
  res = _mm256_hadd_epi32(res, res);
  res = _mm256_hadd_epi32(res, res);
  int32_t sad = _mm256_extract_epi32(res, 0);
  return (sad + 31) >> 6;
}
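For reference, the maddubs/mulhrs pair above implements, per pixel, the AOM_BLEND_A64 blend before the absolute difference against the source is accumulated. A scalar sketch of that per-pixel step (illustration only; it assumes AOM_BLEND_A64_ROUND_BITS is 6 and does not reproduce the final scaling in the return statement above):

// Scalar sketch of one pixel of the masked SAD inner loop:
// pred = (m*a + (64 - m)*b + 32) >> 6, then |pred - src| is accumulated.
static inline unsigned masked_sad_pixel_sketch(uint8_t src, uint8_t a, uint8_t b,
                                               uint8_t m) {
  const int round_bits = 6;              // assumed value of AOM_BLEND_A64_ROUND_BITS
  const int mask_max = 1 << round_bits;  // 64
  const int pred = (m * a + (mask_max - m) * b + (1 << (round_bits - 1))) >> round_bits;
  const int diff = pred - src;
  return (unsigned)(diff < 0 ? -diff : diff);
}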
Example 4
static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
                                            const __m256i a,
                                            uint8_t *comp_pred) {
  const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS;
  const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits));

  const __m256i ma = _mm256_sub_epi8(alpha_max, a);

  const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1);
  const __m256i aaAL = _mm256_unpacklo_epi8(a, ma);
  const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1);
  const __m256i aaAH = _mm256_unpackhi_epi8(a, ma);

  const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL);
  const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH);
  const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset);
  const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset);

  const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH);
  _mm256_storeu_si256((__m256i *)(comp_pred), roundA);
}
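A hypothetical caller, only to show the expected data layout: s0 and s1 carry 32 pixels from the two predictors, a carries the corresponding 32 mask values in [0, AOM_BLEND_A64_MAX_ALPHA], and 32 blended pixels are written to comp_pred. The names and loads below are illustrative and not part of the original source:

// Illustrative only: blend one 32-pixel row of two predictors under a mask.
static void blend_row_example(const uint8_t *pred0, const uint8_t *pred1,
                              const uint8_t *mask, uint8_t *comp_pred) {
  const __m256i s0 = _mm256_loadu_si256((const __m256i *)pred0);
  const __m256i s1 = _mm256_loadu_si256((const __m256i *)pred1);
  const __m256i a = _mm256_loadu_si256((const __m256i *)mask);
  comp_mask_pred_line_avx2(s0, s1, a, comp_pred);
}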
Example 5
__m256i test_mm256_sub_epi8(__m256i a, __m256i b) {
  // CHECK: sub <32 x i8>
  return _mm256_sub_epi8(a, b);
}
Example 6
 /*!
  * \brief Subtract rhs from lhs (byte-wise) and return the result.
  */
 ETL_STATIC_INLINE(avx_simd_byte) sub(avx_simd_byte lhs, avx_simd_byte rhs) {
     return _mm256_sub_epi8(lhs.value, rhs.value);
 }
Example 7
void extern
avx2_test (void)
{
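  /* 'x' is presumably a global __m256i declared elsewhere in this compiler test. */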
  x = _mm256_sub_epi8 (x, x);
}
Example 8
void vec_i8_cnt_dosage2(const int8_t *p, int8_t *out, size_t n, int8_t val,
	int8_t missing, int8_t missing_substitute)
{
#ifdef COREARRAY_SIMD_SSE2

	// header 1, 16-byte aligned
	size_t h = (16 - ((size_t)out & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p+=2)
	{
		*out ++ = ((p[0] == missing) || (p[1] == missing)) ?
			missing_substitute :
			(p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0);
	}

	// body, SSE2
	const __m128i val16  = _mm_set1_epi8(val);
	const __m128i miss16 = _mm_set1_epi8(missing);
	const __m128i sub16  = _mm_set1_epi8(missing_substitute);
	const __m128i mask   = _mm_set1_epi16(0x00FF);

#   ifdef COREARRAY_SIMD_AVX2

	// header 2, 32-byte aligned
	if ((n >= 16) && ((size_t)out & 0x10))
	{
		__m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16;
		__m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16;

		__m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask));
		__m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8));

		__m128i c = _mm_setzero_si128();
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16));
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16));

		w1 = _mm_cmpeq_epi8(v1, miss16);
		w2 = _mm_cmpeq_epi8(v2, miss16);
		__m128i w  = _mm_or_si128(w1, w2);
		c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c));

		_mm_store_si128((__m128i *)out, c);
		n -= 16; out += 16;
	}

	const __m256i val32  = _mm256_set1_epi8(val);
	const __m256i miss32 = _mm256_set1_epi8(missing);
	const __m256i sub32  = _mm256_set1_epi8(missing_substitute);
	const __m256i mask2  = _mm256_set1_epi16(0x00FF);

	for (; n >= 32; n-=32)
	{
		__m256i w1 = MM_LOADU_256((__m256i const*)p); p += 32;
		__m256i w2 = MM_LOADU_256((__m256i const*)p); p += 32;

		__m256i v1 = _mm256_packus_epi16(_mm256_and_si256(w1, mask2), _mm256_and_si256(w2, mask2));
		__m256i v2 = _mm256_packus_epi16(_mm256_srli_epi16(w1, 8), _mm256_srli_epi16(w2, 8));

		__m256i c = _mm256_setzero_si256();
		c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v1, val32));
		c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v2, val32));

		w1 = _mm256_cmpeq_epi8(v1, miss32);
		w2 = _mm256_cmpeq_epi8(v2, miss32);
		__m256i w = _mm256_or_si256(w1, w2);
		c = _mm256_or_si256(_mm256_and_si256(w, sub32), _mm256_andnot_si256(w, c));

		c = _mm256_permute4x64_epi64(c, 0xD8);
		_mm256_store_si256((__m256i *)out, c);
		out += 32;
	}

#   endif

	// SSE2 only
	for (; n >= 16; n-=16)
	{
		__m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16;
		__m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16;

		__m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask));
		__m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8));

		__m128i c = _mm_setzero_si128();
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16));
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16));

		w1 = _mm_cmpeq_epi8(v1, miss16);
		w2 = _mm_cmpeq_epi8(v2, miss16);
		__m128i w = _mm_or_si128(w1, w2);
		c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c));

		_mm_store_si128((__m128i *)out, c);
		out += 16;
	}

#endif

	// tail
	for (; n > 0; n--, p+=2)
	{
		*out ++ = ((p[0] == missing) || (p[1] == missing)) ?
			missing_substitute :
			(p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0);
	}
}
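MM_LOADU_128 and MM_LOADU_256 are macros from the library's vectorization header and are not shown in this excerpt. They are presumably thin wrappers over the unaligned-load intrinsics, something like:

// Assumed definitions of the unaligned-load wrappers used above:
#define MM_LOADU_128(p)  _mm_loadu_si128(p)
#define MM_LOADU_256(p)  _mm256_loadu_si256(p)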
Example 9
void vec_i8_count3(const char *p, size_t n, char val1, char val2, char val3,
	size_t *out_n1, size_t *out_n2, size_t *out_n3)
{
	size_t n1 = 0, n2 = 0, n3 = 0;

#ifdef COREARRAY_SIMD_SSE2

	// header 1, 16-byte aligned
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--)
	{
		char v = *p++;
		if (v == val1) n1++;
		if (v == val2) n2++;
		if (v == val3) n3++;
	}

#   ifdef COREARRAY_SIMD_AVX2
	// body, AVX2
	const __m128i zeros = _mm_setzero_si128();
	const __m256i mask1 = _mm256_set1_epi8(val1);
	const __m256i mask2 = _mm256_set1_epi8(val2);
	const __m256i mask3 = _mm256_set1_epi8(val3);
	__m256i sum1, sum2, sum3;
	sum1 = sum2 = sum3 = _mm256_setzero_si256();
	size_t offset = 0;

	// header 2, 32-byte aligned
	if ((n >= 16) && ((size_t)p & 0x10))
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask1));
		sum1 = MM_SET_M128(_mm_sub_epi8(zeros, c1), zeros);
		__m128i c2 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask2));
		sum2 = MM_SET_M128(_mm_sub_epi8(zeros, c2), zeros);
		__m128i c3 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask3));
		sum3 = MM_SET_M128(_mm_sub_epi8(zeros, c3), zeros);
		n -= 16; p += 16;
	}

	for (; n >= 32; n-=32, p+=32)
	{
		__m256i v = _mm256_load_si256((__m256i const*)p);
		sum1 = _mm256_sub_epi8(sum1, _mm256_cmpeq_epi8(v, mask1));
		sum2 = _mm256_sub_epi8(sum2, _mm256_cmpeq_epi8(v, mask2));
		sum3 = _mm256_sub_epi8(sum3, _mm256_cmpeq_epi8(v, mask3));
		if ((++offset) >= 252)
		{
			n1 += vec_avx_sum_u8(sum1);
			n2 += vec_avx_sum_u8(sum2);
			n3 += vec_avx_sum_u8(sum3);
			sum1 = sum2 = sum3 = _mm256_setzero_si256();
			offset = 0;
		}
	}

	if (n >= 16)
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask1));
		sum1 = _mm256_sub_epi8(sum1, MM_SET_M128(c1, zeros));
		__m128i c2 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask2));
		sum2 = _mm256_sub_epi8(sum2, MM_SET_M128(c2, zeros));
		__m128i c3 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask3));
		sum3 = _mm256_sub_epi8(sum3, MM_SET_M128(c3, zeros));
		n -= 16; p += 16;
	}

	if (offset > 0)
	{
		n1 += vec_avx_sum_u8(sum1);
		n2 += vec_avx_sum_u8(sum2);
		n3 += vec_avx_sum_u8(sum3);
	}

#   else
	// body, SSE2
	const __m128i mask1 = _mm_set1_epi8(val1);
	const __m128i mask2 = _mm_set1_epi8(val2);
	const __m128i mask3 = _mm_set1_epi8(val3);
	__m128i sum1, sum2, sum3;
	sum1 = sum2 = sum3 = _mm_setzero_si128();
	size_t offset = 0;

	for (; n >= 16; n-=16, p+=16)
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		sum1 = _mm_sub_epi8(sum1, _mm_cmpeq_epi8(v, mask1));
		sum2 = _mm_sub_epi8(sum2, _mm_cmpeq_epi8(v, mask2));
		sum3 = _mm_sub_epi8(sum3, _mm_cmpeq_epi8(v, mask3));
		if ((++offset) >= 252)
		{
			n1 += vec_sum_u8(sum1);
			n2 += vec_sum_u8(sum2);
			n3 += vec_sum_u8(sum3);
			sum1 = sum2 = sum3 = _mm_setzero_si128();
			offset = 0;
		}
	}

	if (offset > 0)
	{
		n1 += vec_sum_u8(sum1);
		n2 += vec_sum_u8(sum2);
		n3 += vec_sum_u8(sum3);
	}
#endif

#endif

	// tail
	for (; n > 0; n--)
	{
		char v = *p++;
		if (v == val1) n1++;
		if (v == val2) n2++;
		if (v == val3) n3++;
	}

	if (out_n1) *out_n1 = n1;
	if (out_n2) *out_n2 = n2;
	if (out_n3) *out_n3 = n3;
}
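MM_SET_M128 and vec_avx_sum_u8 also come from the library's vectorization header and are not defined in this excerpt. A plausible sketch, under the assumptions that MM_SET_M128 builds a 256-bit vector from two 128-bit halves and that vec_avx_sum_u8 returns the sum of the 32 unsigned byte counters:

// Assumed: combine two __m128i values into one __m256i (first argument in the upper lane).
#define MM_SET_M128(hi, lo) \
	_mm256_inserti128_si256(_mm256_castsi128_si256(lo), (hi), 1)

// Assumed: horizontal sum of 32 unsigned 8-bit counters.
static inline size_t vec_avx_sum_u8(__m256i s)
{
	// fold groups of 8 bytes into 4 x 64-bit partial sums, then add them up
	s = _mm256_sad_epu8(s, _mm256_setzero_si256());
	__m128i t = _mm_add_epi64(_mm256_castsi256_si128(s), _mm256_extracti128_si256(s, 1));
	t = _mm_add_epi64(t, _mm_srli_si128(t, 8));
	return (size_t)_mm_cvtsi128_si32(t);
}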
Example 10
size_t vec_i8_count(const char *p, size_t n, char val)
{
	size_t num = 0;

#ifdef COREARRAY_SIMD_SSE2

	// header 1, 16-byte aligned
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--)
		if (*p++ == val) num++;

#   ifdef COREARRAY_SIMD_AVX2
	// body, AVX2
	const __m128i zeros = _mm_setzero_si128();
	const __m256i mask = _mm256_set1_epi8(val);
	__m256i sum = _mm256_setzero_si256();
	size_t offset = 0;

	// header 2, 32-byte aligned
	if ((n >= 16) && ((size_t)p & 0x10))
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask));
		sum = MM_SET_M128(_mm_sub_epi8(zeros, c1), zeros);
		n -= 16; p += 16;
	}

	for (; n >= 128; n-=128)
	{
		__m256i v = _mm256_load_si256((__m256i const*)p); p += 32;
		sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
		v = _mm256_load_si256((__m256i const*)p); p += 32;
		sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
		v = _mm256_load_si256((__m256i const*)p); p += 32;
		sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
		v = _mm256_load_si256((__m256i const*)p); p += 32;
		sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
		offset += 4;
		if (offset >= 252)
		{
			num += vec_avx_sum_u8(sum);
			sum = _mm256_setzero_si256();
			offset = 0;
		}
	}
	for (; n >= 32; n-=32)
	{
		__m256i v = _mm256_load_si256((__m256i const*)p); p += 32;
		sum = _mm256_sub_epi8(sum, _mm256_cmpeq_epi8(v, mask));
		if ((++offset) >= 252)
		{
			num += vec_avx_sum_u8(sum);
			sum = _mm256_setzero_si256();
			offset = 0;
		}
	}
	if (n >= 16)
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c1 = _mm_cmpeq_epi8(v, _mm256_castsi256_si128(mask));
		sum = _mm256_sub_epi8(sum, MM_SET_M128(zeros, c1));
		n -= 16; p += 16;
	}

	if (offset > 0)
		num += vec_avx_sum_u8(sum);

#   else
	// body, SSE2
	const __m128i mask = _mm_set1_epi8(val);
	__m128i sum = _mm_setzero_si128();
	size_t offset = 0;

	for (; n >= 64; n-=64)
	{
		__m128i v = _mm_load_si128((__m128i const*)p); p += 16;
		sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
		v = _mm_load_si128((__m128i const*)p); p += 16;
		sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
		v = _mm_load_si128((__m128i const*)p); p += 16;
		sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
		v = _mm_load_si128((__m128i const*)p); p += 16;
		sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
		offset += 4;
		if (offset >= 252)
		{
			num += vec_sum_u8(sum);
			sum = _mm_setzero_si128();
			offset = 0;
		}
	}
	for (; n >= 16; n-=16, p+=16)
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		sum = _mm_sub_epi8(sum, _mm_cmpeq_epi8(v, mask));
		if ((++offset) >= 252)
		{
			num += vec_sum_u8(sum);
			sum = _mm_setzero_si128();
			offset = 0;
		}
	}

	if (offset > 0)
		num += vec_sum_u8(sum);
#endif

#endif

	// tail
	for (; n > 0; n--)
		if (*p++ == val) num++;
	return num;
}
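vec_sum_u8, used by the SSE2 branches of the last two examples, is not shown either; a plausible SSE2-only sketch:

// Assumed: horizontal sum of 16 unsigned 8-bit counters (SSE2).
static inline size_t vec_sum_u8(__m128i s)
{
	// _mm_sad_epu8 leaves two partial sums in the low halves of the two 64-bit lanes
	s = _mm_sad_epu8(s, _mm_setzero_si128());
	return (size_t)(_mm_cvtsi128_si32(s) + _mm_cvtsi128_si32(_mm_srli_si128(s, 8)));
}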
Example 11
// count genotype sum and number of calls, not requiring 16-aligned p
COREARRAY_DLL_DEFAULT C_UInt8* vec_u8_geno_count(C_UInt8 *p,
	size_t n, C_Int32 &out_sum, C_Int32 &out_num)
{
	C_Int32 sum=0, num=0;

#if defined(COREARRAY_SIMD_AVX2)

	const __m256i three = _mm256_set1_epi8(3);
	const __m256i zero = _mm256_setzero_si256();
	__m256i sum32 = zero, num32 = zero;
	size_t limit_by_U8 = 0;

	for (; n >= 32; )
	{
		__m256i v = _mm256_loadu_si256((__m256i const*)p);
		p += 32;
		__m256i m = _mm256_cmpgt_epi8(three, _mm256_min_epu8(v, three));
		sum32 = _mm256_add_epi8(sum32, _mm256_and_si256(v, m));
		num32 = _mm256_sub_epi8(num32, m);
		n -= 32;
		limit_by_U8 ++;
		if ((limit_by_U8 >= 127) || (n < 32))
		{
			// add to sum
			sum32 = _mm256_sad_epu8(sum32, zero);
			sum32 = _mm256_add_epi32(sum32,
				_mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(1,0,3,2)));
			sum32 = _mm256_add_epi32(sum32,
				_mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(0,0,0,1)));
			sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(sum32));
			// add to num
			num32 = _mm256_sad_epu8(num32, zero);
			num32 = _mm256_add_epi32(num32,
				_mm256_permute4x64_epi64(num32, _MM_SHUFFLE(1,0,3,2)));
			num32 = _mm256_add_epi32(num32,
				_mm256_permute4x64_epi64(num32, _MM_SHUFFLE(0,0,0,1)));
			num += _mm_cvtsi128_si32(_mm256_castsi256_si128(num32));
			// reset
			sum32 = num32 = zero;
			limit_by_U8 = 0;
		}
	}

#elif defined(COREARRAY_SIMD_SSE2)

	// header, 16-byte aligned
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p++)
		if (*p <= 2) { sum += *p; num++; }

	const __m128i three = _mm_set1_epi8(3);
	const __m128i zero = _mm_setzero_si128();
	__m128i sum16=zero, num16=zero;
	size_t limit_by_U8 = 0;

	for (; n >= 16; )
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		p += 16;
		__m128i m = _mm_cmpgt_epi8(three, _mm_min_epu8(v, three));
		sum16 = _mm_add_epi8(sum16, v & m);
		num16 = _mm_sub_epi8(num16, m);
		n -= 16;
		limit_by_U8 ++;
		if ((limit_by_U8 >= 127) || (n < 16))
		{
			// add to sum
			sum16 = _mm_sad_epu8(sum16, zero);
			sum += _mm_cvtsi128_si32(sum16);
			sum += _mm_cvtsi128_si32(_mm_shuffle_epi32(sum16, 2));
			// add to num
			num16 = _mm_sad_epu8(num16, zero);
			num += _mm_cvtsi128_si32(num16);
			num += _mm_cvtsi128_si32(_mm_shuffle_epi32(num16, 2));
			// reset
			sum16 = num16 = zero;
			limit_by_U8 = 0;
		}
	}

#endif

	for (; n > 0; n--, p++)
		if (*p <= 2) { sum += *p; num++; }
	out_sum = sum;
	out_num = num;
	return p;
}
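A minimal usage sketch for the routine above, assuming the CoreArray typedefs C_UInt8 and C_Int32 correspond to uint8_t and int32_t; the genotype buffer and its contents are invented for illustration:

#include <cstdio>

// Illustrative only: sum of valid genotypes (0..2) and number of valid calls.
void geno_count_example()
{
	C_UInt8 geno[100];
	for (size_t i = 0; i < 100; i++)
		geno[i] = (C_UInt8)(i % 4);   // the value 3 plays the role of a missing call
	C_Int32 sum, num;
	vec_u8_geno_count(geno, 100, sum, num);
	// expected: num == 75 (25 each of 0, 1, 2), sum == 75 (25*0 + 25*1 + 25*2)
	printf("sum=%d num=%d\n", (int)sum, (int)num);
}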