コード例 #1
0
static uint64_t avx2_sum_epu64(const __m256i v) {
    
    return _mm256_extract_epi64(v, 0)
         + _mm256_extract_epi64(v, 1)
         + _mm256_extract_epi64(v, 2)
         + _mm256_extract_epi64(v, 3);
}
コード例 #2
0
ファイル: avx2.cpp プロジェクト: WojciechMula/toys
uint64_t avx2_count_byte(const uint8_t* data, size_t size, uint8_t byte) {

    const __m256i v = _mm256_set1_epi8(byte);

    const uint8_t* end = data + size;
    const uint8_t* ptr = data;

    __m256i global_sum = _mm256_setzero_si256();
    __m256i local_sum;

    // 1. blocks of 256 registers
    while (ptr + 255*32 < end) {
        local_sum = _mm256_setzero_si256();

        // update 32 x 8-bit counter
        for (int i=0; i < 255; i++, ptr += 32) {
            const __m256i in = _mm256_loadu_si256((const __m256i*)ptr);
            const __m256i eq = _mm256_cmpeq_epi8(in, v); // 0 or -1

            local_sum = _mm256_sub_epi8(local_sum, eq);
        }

        // update the global accumulator 2 x 64-bit
        const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256());
        global_sum = _mm256_add_epi64(global_sum, tmp);
    }


    // 2. tail of < 256 registers
    local_sum = _mm256_setzero_si256();
    while (ptr + 32 < end) {
        const __m256i in = _mm256_loadu_si256((const __m256i*)ptr);
        const __m256i eq = _mm256_cmpeq_epi8(in, v);

        local_sum = _mm256_sub_epi8(local_sum, eq);

        ptr += 32;
    }

    const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256());
    global_sum = _mm256_add_epi64(global_sum, tmp);


    // 3. process tail < 32 bytes
    uint64_t result = _mm256_extract_epi64(global_sum, 0)
                    + _mm256_extract_epi64(global_sum, 1)
                    + _mm256_extract_epi64(global_sum, 2)
                    + _mm256_extract_epi64(global_sum, 3);

    return result + scalar_count_bytes(ptr, end - ptr, byte);
}
コード例 #3
0
ファイル: avx2.cpp プロジェクト: WojciechMula/toys
uint64_t avx2_count_byte_popcount(const uint8_t* data, size_t size, uint8_t byte) {

    const __m256i v = _mm256_set1_epi8(byte);

    const uint8_t* end = data + size;
    const uint8_t* ptr = data;

    uint64_t result = 0;

    // 1. blocks of 8 registers
    while (ptr + 8*32 < end) {
        const __m256i eq0 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 0*32)));
        const __m256i eq1 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 1*32)));
        const __m256i eq2 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 2*32)));
        const __m256i eq3 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 3*32)));
        const __m256i eq4 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 4*32)));
        const __m256i eq5 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 5*32)));
        const __m256i eq6 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 6*32)));
        const __m256i eq7 = _mm256_cmpeq_epi8(v, _mm256_loadu_si256((const __m256i*)(ptr + 7*32)));

        const __m256i eq0bit = _mm256_and_si256(eq0, _mm256_set1_epi8(0x01));
        const __m256i eq1bit = _mm256_and_si256(eq1, _mm256_set1_epi8(0x02));
        const __m256i eq2bit = _mm256_and_si256(eq2, _mm256_set1_epi8(0x04));
        const __m256i eq3bit = _mm256_and_si256(eq3, _mm256_set1_epi8(0x08));
        const __m256i eq4bit = _mm256_and_si256(eq4, _mm256_set1_epi8(0x10));
        const __m256i eq5bit = _mm256_and_si256(eq5, _mm256_set1_epi8(0x20));
        const __m256i eq6bit = _mm256_and_si256(eq6, _mm256_set1_epi8(0x40));
        const __m256i eq7bit = _mm256_and_si256(eq7, _mm256_set1_epi8(int8_t(0x80)));

        const __m256i m01    = _mm256_or_si256(eq0bit, eq1bit);
        const __m256i m23    = _mm256_or_si256(eq2bit, eq3bit);
        const __m256i m45    = _mm256_or_si256(eq4bit, eq5bit);
        const __m256i m67    = _mm256_or_si256(eq6bit, eq7bit);

        const __m256i m0123  = _mm256_or_si256(m01, m23);
        const __m256i m4567  = _mm256_or_si256(m45, m67);

        const __m256i merged = _mm256_or_si256(m0123, m4567);

        result += __builtin_popcountll(_mm256_extract_epi64(merged, 0));
        result += __builtin_popcountll(_mm256_extract_epi64(merged, 1));
        result += __builtin_popcountll(_mm256_extract_epi64(merged, 2));
        result += __builtin_popcountll(_mm256_extract_epi64(merged, 3));

        ptr += 8 * 32;
    }

    return result + scalar_count_bytes(ptr, end - ptr, byte);
}
コード例 #4
0
ファイル: pr51393.c プロジェクト: 0day-ci/gcc
avx_test (void)
{
  long long in = 0x800000000ll;
  long long out;

  __m256i zero = _mm256_setzero_si256();
  __m256i tmp  = _mm256_insert_epi64 (zero, in, 0);
  out = _mm256_extract_epi64(tmp, 0);

  if (in != out)
    abort ();
}
コード例 #5
0
 std::uint64_t _mm256_hsum_epi64(__m256i v) {
     return _mm256_extract_epi64(v, 0)
          + _mm256_extract_epi64(v, 1)
          + _mm256_extract_epi64(v, 2)
          + _mm256_extract_epi64(v, 3);
 }
コード例 #6
0
ファイル: dotproductavx.cpp プロジェクト: bhanu475/tesseract
// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel AVX intrinsics to access the SIMD instruction set.
double DotProductAVX(const double* u, const double* v, int n) {
  int max_offset = n - 4;
  int offset = 0;
  // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
  // v, and multiplying them together in parallel.
  __m256d sum = _mm256_setzero_pd();
  if (offset <= max_offset) {
    offset = 4;
    // Aligned load is reputedly faster but requires 32 byte aligned input.
    if ((reinterpret_cast<const uintptr_t>(u) & 31) == 0 &&
        (reinterpret_cast<const uintptr_t>(v) & 31) == 0) {
      // Use aligned load.
      __m256d floats1 = _mm256_load_pd(u);
      __m256d floats2 = _mm256_load_pd(v);
      // Multiply.
      sum = _mm256_mul_pd(floats1, floats2);
      while (offset <= max_offset) {
        floats1 = _mm256_load_pd(u + offset);
        floats2 = _mm256_load_pd(v + offset);
        offset += 4;
        __m256d product = _mm256_mul_pd(floats1, floats2);
        sum = _mm256_add_pd(sum, product);
      }
    } else {
      // Use unaligned load.
      __m256d floats1 = _mm256_loadu_pd(u);
      __m256d floats2 = _mm256_loadu_pd(v);
      // Multiply.
      sum = _mm256_mul_pd(floats1, floats2);
      while (offset <= max_offset) {
        floats1 = _mm256_loadu_pd(u + offset);
        floats2 = _mm256_loadu_pd(v + offset);
        offset += 4;
        __m256d product = _mm256_mul_pd(floats1, floats2);
        sum = _mm256_add_pd(sum, product);
      }
    }
  }
  // Add the 4 product sums together horizontally. Not so easy as with sse, as
  // there is no add across the upper/lower 128 bit boundary, so permute to
  // move the upper 128 bits to lower in another register.
  __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1);
  sum = _mm256_hadd_pd(sum, sum2);
  sum = _mm256_hadd_pd(sum, sum);
  double result;
  // _mm256_extract_f64 doesn't exist, but resist the temptation to use an sse
  // instruction, as that introduces a 70 cycle delay. All this casting is to
  // fool the instrinsics into thinking we are extracting the bottom int64.
  auto cast_sum = _mm256_castpd_si256(sum);
  *(reinterpret_cast<inT64*>(&result)) =
#if defined(_WIN32) || defined(__i386__)
      // This is a very simple workaround that is activated
      // for all platforms that do not have _mm256_extract_epi64.
      // _mm256_extract_epi64(X, Y) == ((uint64_t*)&X)[Y]
      ((uint64_t*)&cast_sum)[0]
#else
      _mm256_extract_epi64(cast_sum, 0)
#endif
      ;
  while (offset < n) {
    result += u[offset] * v[offset];
    ++offset;
  }
  return result;
}
コード例 #7
0
ファイル: Primate.c プロジェクト: opolo/Bitsliced-AEAD
void mixcolumns_inv_80bit(__m256i (*state)[2]) {
	/*
	6       22      31      1       15
	15      19      8       1       20
	20      5       30      5       11
	11      3       19      8       18
	18      2       2       18      1
	*/

	__m256i T2_regs[5][2];
	__m256i T3_regs[5][2];
	__m256i T4_regs[5][2];
	__m256i T5_regs[5];
	__m256i T6_regs[5];
	__m256i T8_regs[5][2];
	__m256i T11_regs[5][2];
	__m256i T15_regs[5][2];
	__m256i T16_regs[5][2];
	__m256i T18_regs[5][2];
	__m256i T19_regs[5];
	__m256i T20_regs[5][2];
	__m256i T22_regs[5];
	__m256i T30_regs[5];
	__m256i T31_regs[5];

	T2(state, T2_regs); //T2
	T2(T2_regs, T4_regs); //T4
	T2(T4_regs, T8_regs); //T8
	T2(T8_regs, T16_regs); //T16

	for (int i = 0; i < 5; i++) {
		T3_regs[i][0] = XOR(state[i][0], T2_regs[i][0]);
		T3_regs[i][1] = XOR(state[i][1], T2_regs[i][1]);

		T5_regs[i] = XOR(state[i][0], T4_regs[i][0]);

		T6_regs[i] = XOR(T2_regs[i][0], T4_regs[i][0]);

		T11_regs[i][0] = XOR(T3_regs[i][0], T8_regs[i][0]);
		T11_regs[i][1] = XOR(T3_regs[i][1], T8_regs[i][1]);

		T15_regs[i][0] = XOR(T11_regs[i][0], T4_regs[i][0]);
		T15_regs[i][1] = XOR(T11_regs[i][1], T4_regs[i][1]);

		T18_regs[i][0] = XOR(T16_regs[i][0], T2_regs[i][0]);
		T18_regs[i][1] = XOR(T16_regs[i][1], T2_regs[i][1]);

		T19_regs[i] = XOR(T16_regs[i][0], T3_regs[i][0]);

		T20_regs[i][0] = XOR(T16_regs[i][0], T4_regs[i][0]);
		T20_regs[i][1] = XOR(T16_regs[i][1], T4_regs[i][1]);

		T22_regs[i] = XOR(T16_regs[i][0], T6_regs[i]);

		T30_regs[i] = XOR(T22_regs[i], T8_regs[i][0]);

		T31_regs[i] = XOR(T30_regs[i], state[i][0]);
	}

	for (int i = 0; i < 5; i++) {

		state[i][0] = XOR5(
			_mm256_setr_epi64x(_mm256_extract_epi64(T6_regs[i], 0), _mm256_extract_epi64(T15_regs[i][0], 0), _mm256_extract_epi64(T20_regs[i][0], 0), _mm256_extract_epi64(T11_regs[i][0], 0)),
			_mm256_setr_epi64x(_mm256_extract_epi64(T22_regs[i], 1), _mm256_extract_epi64(T19_regs[i], 1), _mm256_extract_epi64(T5_regs[i], 1), _mm256_extract_epi64(T3_regs[i][0], 1)),
			_mm256_setr_epi64x(_mm256_extract_epi64(T31_regs[i], 2), _mm256_extract_epi64(T8_regs[i][0], 2), _mm256_extract_epi64(T30_regs[i], 2), _mm256_extract_epi64(T19_regs[i], 2)),
			_mm256_setr_epi64x(_mm256_extract_epi64(state[i][0], 3), _mm256_extract_epi64(state[i][0], 3), _mm256_extract_epi64(T5_regs[i], 3), _mm256_extract_epi64(T8_regs[i][0], 3)),
			_mm256_setr_epi64x(_mm256_extract_epi64(T15_regs[i][1], 0), _mm256_extract_epi64(T20_regs[i][1], 0), _mm256_extract_epi64(T11_regs[i][1], 0), _mm256_extract_epi64(T18_regs[i][1], 0)));

		state[i][1] = XOR5(
			_mm256_setr_epi64x(_mm256_extract_epi64(T18_regs[i][0], 0), 0, 0, 0),
			_mm256_setr_epi64x(_mm256_extract_epi64(T2_regs[i][0], 1), 0, 0, 0),
			_mm256_setr_epi64x(_mm256_extract_epi64(T2_regs[i][0], 2), 0, 0, 0),
			_mm256_setr_epi64x(_mm256_extract_epi64(T18_regs[i][0], 3), 0, 0, 0),
			_mm256_setr_epi64x(_mm256_extract_epi64(state[i][1], 0), 0, 0, 0));
	}
}
コード例 #8
0
ファイル: Primate.c プロジェクト: opolo/Bitsliced-AEAD
void test_primates() {

	////////////////////TEST 80 BIT//////////////
	//Prepare test vectors
	YMM YMM_p1_input_80bit[5][2];

	for (int i = 0; i < 5; i++) {
		YMM_p1_input_80bit[i][0] = _mm256_setzero_si256();
		YMM_p1_input_80bit[i][1] = _mm256_setzero_si256();
	}

	//use test vectors
	p1(YMM_p1_input_80bit);
	p1_inv(YMM_p1_input_80bit);

	//test if vectors are zero again... Dont test last 192 bits of section 2, as they are not part of the state (and sub elements turn the 0s there to 1s...)
	if (_mm256_extract_epi64(YMM_p1_input_80bit[0][0], 0) != 0 || _mm256_extract_epi64(YMM_p1_input_80bit[0][0], 1) != 0 ||
		_mm256_extract_epi64(YMM_p1_input_80bit[0][0], 2) != 0 || _mm256_extract_epi64(YMM_p1_input_80bit[0][0], 3) != 0 ||
		_mm256_extract_epi64(YMM_p1_input_80bit[1][0], 0) != 0 || _mm256_extract_epi64(YMM_p1_input_80bit[1][0], 1) != 0 ||
		_mm256_extract_epi64(YMM_p1_input_80bit[1][0], 2) != 0 || _mm256_extract_epi64(YMM_p1_input_80bit[1][0], 3) != 0 ||
		_mm256_extract_epi64(YMM_p1_input_80bit[2][0], 0) != 0 || _mm256_extract_epi64(YMM_p1_input_80bit[2][0], 1) != 0 ||
		_mm256_extract_epi64(YMM_p1_input_80bit[2][0], 2) != 0 || _mm256_extract_epi64(YMM_p1_input_80bit[2][0], 3) != 0 ||
		_mm256_extract_epi64(YMM_p1_input_80bit[3][0], 0) != 0 || _mm256_extract_epi64(YMM_p1_input_80bit[3][0], 1) != 0 ||
		_mm256_extract_epi64(YMM_p1_input_80bit[3][0], 2) != 0 || _mm256_extract_epi64(YMM_p1_input_80bit[3][0], 3) != 0 ||
		_mm256_extract_epi64(YMM_p1_input_80bit[4][0], 0) != 0 || _mm256_extract_epi64(YMM_p1_input_80bit[4][0], 1) != 0 ||
		_mm256_extract_epi64(YMM_p1_input_80bit[4][0], 2) != 0 || _mm256_extract_epi64(YMM_p1_input_80bit[4][0], 3) != 0 ||

		_mm256_extract_epi64(YMM_p1_input_80bit[0][1], 0) != 0 ||
		_mm256_extract_epi64(YMM_p1_input_80bit[1][1], 0) != 0 ||
		_mm256_extract_epi64(YMM_p1_input_80bit[2][1], 0) != 0 ||
		_mm256_extract_epi64(YMM_p1_input_80bit[3][1], 0) != 0 ||
		_mm256_extract_epi64(YMM_p1_input_80bit[4][1], 0) != 0) {
		printf("P1 inv not working \n");
	}
}