Esempio n. 1
0
void bitmask_avx2(uint32_t* ptr, size_t n, uint32_t key, uint8_t* out) {

    uint32_t* output = (uint32_t*)out;

    const size_t N = 8*4; // unrolled 4 times
    const size_t chunks = n / N;
    const size_t tail   = n % N;

    const __m256i vkey = _mm256_set1_epi32(key);
    
    for (size_t i=0; i < chunks; i++) {
        
        const __m256i in0 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 0*8));
        const __m256i in1 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 1*8));
        const __m256i in2 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 2*8));
        const __m256i in3 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 3*8));

        const __m256i eq0 = _mm256_cmpeq_epi32(in0, vkey);
        const __m256i eq1 = _mm256_cmpeq_epi32(in1, vkey);
        const __m256i eq2 = _mm256_cmpeq_epi32(in2, vkey);
        const __m256i eq3 = _mm256_cmpeq_epi32(in3, vkey);

        // eq0 = [a0 a1 a2 a3 a4 a5 a6 a7] (packed dword)
        // eq1 = [b0 b1 b2 b3 b4 b5 b6 b7] (packed dword)
        // eq2 = [c0 c1 c2 c3 c4 c5 c6 c7] (packed dword)
        // eq3 = [d0 d1 d2 d3 d4 d5 d6 d7] (packed dword)

        //  t0 = [a0 a1 a2 a3 c0 c1 c2 c3 a4 a5 a6 a7 c4 c5 c6 c7] (packed word)
        const __m256i t0  = _mm256_packs_epi32(eq0, eq2);
        // m02 = [a0 a1 a2 a3 a4 a5 a6 a7 c0 c1 c2 c3 c4 c5 c6 c7] (packed word)
        const __m256i m02 = _mm256_permutevar8x32_epi32(t0,
                                _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7));

        //  t0 = [b0 b1 b2 b3 d0 d1 d2 d3 b4 b5 b6 b7 d4 d5 d6 d7] (packed word)
        const __m256i t1 = _mm256_packs_epi32(eq1, eq3);
        // m13 = [b0 b1 b2 b3 b4 b5 b6 b7 d0 d1 d2 d3 d4 d5 d6 d7] (packed word)
        const __m256i m13 = _mm256_permutevar8x32_epi32(t1,
                                _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7));

        // m   = [a0..7 b0..7 c0..7 d0..7] (packed byte)
        const __m256i m   = _mm256_packs_epi16(m02, m13);

        *output++ = _mm256_movemask_epi8(m);
    }

    if (tail > 0) {
        bitmask_better_2(ptr + chunks*N, tail, key, out + chunks*N);
    }
}
Esempio n. 2
0
bool is_sorted_avx2(int32_t* a, size_t n) {

    const __m256i shuffle_pattern = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 7);

    size_t i = 0;
    while (i < n - 8) {
        // curr = [ a0 | a1 | a2 | a3 | a4 | a5 | a6 | a7 ]
        const __m256i curr = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i));
        // next = [ a1 | a2 | a3 | a4 | a5 | a6 | a7 | a7 ]
        const __m256i next = _mm256_permutevar8x32_epi32(curr, shuffle_pattern);

        // Note: the last element of curr and next is a7, thus for this element 
        //       the comparison result is always zero.
        //
        // In fact, the first 7 elements are being tested.
        const __m256i mask = _mm256_cmpgt_epi32(curr, next);
        if (!_mm256_testz_si256(mask, mask)) {
            return false;
        }

        i += 7;
    }

    for (/**/; i + 1 < n; i++) {
        if (a[i] > a[i + 1])
            return false;
    }

    return true;
}
Esempio n. 3
0
// Computes part of matrix.vector v = Wu. Computes N=8 results.
// For details see PartialMatrixDotVector64 with N=8.
static void PartialMatrixDotVector8(const int8_t* wi, const double* scales,
                                    const int8_t* u, int num_in, int num_out,
                                    double* v) {
  // Register containing 16-bit ones for horizontal add with 16->32 bit
  // conversion.
  __m256i ones =
      _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
  // Initialize all the results to 0.
  __m256i result0 = _mm256_setzero_si256();
  // Iterate over the input (u), one registerful at a time.
  for (int j = 0; j < num_in;) {
    __m256i inputs =
        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(u + j));
    // Inputs are processed in groups of kNumInputsPerGroup, replicated
    // kNumInputGroups times.
    for (int ig = 0; ig < kNumInputGroups && j < num_in;
         ++ig, j += kNumInputsPerGroup) {
      // Replicate the low 32 bits (4 inputs) 8 times.
      __m256i rep_input =
          _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
      __m256i weights, reps;
      // Mul-add, with horizontal add of the 4 inputs to each of the results.
      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
    }
  }
  ExtractResults(result0, shift_id, wi, scales, num_out, v);
}
Esempio n. 4
0
// Extracts and converts 8x32-bit results from result, adding the bias from wi
// and scaling by scales, before storing in *v. Note that wi, scales and v are
// expected to contain 8 consecutive elements or num_out if less.
inline void ExtractResults(__m256i& result, __m256i& shift_id,
                           const int8_t*& wi, const double*& scales,
                           int num_out, double*& v) {
  for (int out = 0; out < num_out; ++out) {
    int32_t res = _mm256_extract_epi32(result, 0);
    *v++ = (static_cast<double>(res) / MAX_INT8 + *wi++) * *scales++;
    // Rotate the results in int32_t units, so the next result is ready.
    result = _mm256_permutevar8x32_epi32(result, shift_id);
  }
}
Esempio n. 5
0
bool is_sorted_avx2_unrolled4(int32_t* a, size_t n) {

    const __m256i shuffle_pattern = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 7);

    size_t i = 0;
    while (i < n - (4*7 + 1)) {
        const __m256i curr0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i + 0*7));
        const __m256i curr1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i + 1*7));
        const __m256i curr2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i + 2*7));
        const __m256i curr3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i + 3*7));

        const __m256i next0 = _mm256_permutevar8x32_epi32(curr0, shuffle_pattern);
        const __m256i next1 = _mm256_permutevar8x32_epi32(curr1, shuffle_pattern);
        const __m256i next2 = _mm256_permutevar8x32_epi32(curr2, shuffle_pattern);
        const __m256i next3 = _mm256_permutevar8x32_epi32(curr3, shuffle_pattern);

        const __m256i mask0 = _mm256_cmpgt_epi32(curr0, next0);
        const __m256i mask1 = _mm256_cmpgt_epi32(curr1, next1);
        const __m256i mask2 = _mm256_cmpgt_epi32(curr2, next2);
        const __m256i mask3 = _mm256_cmpgt_epi32(curr3, next3);

        const __m256i mask = _mm256_or_si256(mask0, 
                             _mm256_or_si256(mask1, 
                             _mm256_or_si256(mask2, mask3)));

        if (!_mm256_testz_si256(mask, mask)) {
            return false;
        }

        i += 7*4;
    }

    for (/**/; i + 1 < n; i++) {
        if (a[i] > a[i + 1])
            return false;
    }

    return true;
}
Esempio n. 6
0
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void
shuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 4;
  size_t i;
  int j;
  __m256i ymm0[4], ymm1[4];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i mask = _mm256_set_epi32(
    0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (128 bytes) then transpose bytes and words. */
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i))));
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8);
      ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d);
      ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]);
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x04e);
      ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]);
    }
    /* Transpose double words */
    for (j = 0; j < 2; j++) {
      ymm1[j*2] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
      ymm1[j*2+1] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Transpose quad words */
    for (j = 0; j < 2; j++) {
      ymm0[j*2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j+2]);
      ymm0[j*2+1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j+2]);
    }
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_ith_element = dest + i;
    for (j = 0; j < 4; j++) {
      _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]);
    }
  }
}
static void
avx2_test (void)
{
  union256i_d src1, src2, dst;
  int dst_ref[8];
  int i;

  for (i = 0; i < NUM; i++)
    {
      init_permd (src1.a, src2.a, i);

      dst.x = _mm256_permutevar8x32_epi32 (src1.x, src2.x);
      calc_permd (src1.a, src2.a, dst_ref);

      if (check_union256i_d (dst, dst_ref))
	abort ();
    }
}
static inline __m256i
enc_reshuffle (__m256i in)
{
	// Spread out 32-bit words over both halves of the input register:
	in = _mm256_permutevar8x32_epi32(in, _mm256_setr_epi32(
		0, 1, 2, -1,
		3, 4, 5, -1));

	// Slice into 32-bit chunks and operate on all chunks in parallel.
	// All processing is done within the 32-bit chunk. First, shuffle:
	// before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb]
	// after:  [00000000|aaaaaabb|bbbbcccc|ccdddddd]
	in = _mm256_shuffle_epi8(in, _mm256_set_epi8(
		-1, 9, 10, 11,
		-1, 6,  7,  8,
		-1, 3,  4,  5,
		-1, 0,  1,  2,
		-1, 9, 10, 11,
		-1, 6,  7,  8,
		-1, 3,  4,  5,
		-1, 0,  1,  2));

	// cd      = [00000000|00000000|0000cccc|ccdddddd]
	const __m256i cd = _mm256_and_si256(in, _mm256_set1_epi32(0x00000FFF));

	// ab      = [0000aaaa|aabbbbbb|00000000|00000000]
	const __m256i ab = _mm256_and_si256(_mm256_slli_epi32(in, 4), _mm256_set1_epi32(0x0FFF0000));

	// merged  = [0000aaaa|aabbbbbb|0000cccc|ccdddddd]
	const __m256i merged = _mm256_or_si256(ab, cd);

	// bd      = [00000000|00bbbbbb|00000000|00dddddd]
	const __m256i bd = _mm256_and_si256(merged, _mm256_set1_epi32(0x003F003F));

	// ac      = [00aaaaaa|00000000|00cccccc|00000000]
	const __m256i ac = _mm256_and_si256(_mm256_slli_epi32(merged, 2), _mm256_set1_epi32(0x3F003F00));

	// indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
	const __m256i indices = _mm256_or_si256(ac, bd);

	// return  = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
	return _mm256_bswap_epi32(indices);
}
static inline __m256i
dec_reshuffle (__m256i in)
{
	// Shuffle bytes to 32-bit bigendian:
	in = _mm256_bswap_epi32(in);

	// Mask in a single byte per shift:
	__m256i mask = _mm256_set1_epi32(0x3F000000);

	// Pack bytes together:
	__m256i out = _mm256_slli_epi32(_mm256_and_si256(in, mask), 2);
	mask = _mm256_srli_epi32(mask, 8);

	out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 4));
	mask = _mm256_srli_epi32(mask, 8);

	out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 6));
	mask = _mm256_srli_epi32(mask, 8);

	out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 8));

	// Pack bytes together within 32-bit words, discarding words 3 and 7:
	out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
		 3,  2,  1,
		 7,  6,  5,
		11, 10,  9,
		15, 14, 13,
		-1, -1, -1, -1,
		 3,  2,  1,
		 7,  6,  5,
		11, 10,  9,
		15, 14, 13,
		-1, -1, -1, -1));

	// Pack 32-bit words together, squashing empty words 3 and 7:
	return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(
		0, 1, 2, 4, 5, 6, -1, -1));
}
Esempio n. 10
0
__m256i test_mm256_permutevar8x32_epi32(__m256i a, __m256i b) {
  // CHECK: @llvm.x86.avx2.permd
  return _mm256_permutevar8x32_epi32(a, b);
}
Esempio n. 11
0
__m256i test_mm256_permutevar8x32_epi32(__m256i a, __m256i b) {
  // CHECK-LABEL: test_mm256_permutevar8x32_epi32
  // CHECK: call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
  return _mm256_permutevar8x32_epi32(a, b);
}
Esempio n. 12
0
void extern
avx2_test (void)
{
  x = _mm256_permutevar8x32_epi32 (x, x);
}
Esempio n. 13
0
 SIMD_INLINE __m256i PermuteAndShiffle(__m256i bgr, __m256i permute, __m256i shuffle)
 {
     return _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(bgr, permute), shuffle);
 }