/*
 * Logically right-shift every 32-bit element of `array`, in place, by
 * `shiftamount` bits.
 *
 * Groups of eight elements are processed with AVX2 variable shifts (four
 * groups per iteration in the first loop, one group per iteration in the
 * second); the remaining length % 8 elements are handled by a scalar loop.
 *
 * array       - elements to shift; read and written with unaligned accesses.
 * length      - number of uint32_t elements in array.
 * shiftamount - shift count applied to every element. A count outside 0..31
 *               (negative values included) sets every element to 0, which is
 *               what _mm256_srlv_epi32 produces for such counts.
 *
 * Always returns 0.
 */
size_t variablevectorshift_unrolled(uint32_t *array, size_t length, int shiftamount) {
  const size_t nvec = length / 8; /* number of complete 8-element groups */
  size_t k = 0;
  __m256i *a = (__m256i *) array;
  const __m256i s = _mm256_set1_epi32(shiftamount);

  /* Main loop: four 256-bit vectors (32 elements) per iteration. */
  for (; k + 3 < nvec; k += 4, a += 4) {
    __m256i v1 = _mm256_loadu_si256(a);
    __m256i v2 = _mm256_loadu_si256(a + 1);
    __m256i v3 = _mm256_loadu_si256(a + 2);
    __m256i v4 = _mm256_loadu_si256(a + 3);

    v1 = _mm256_srlv_epi32(v1, s);
    v2 = _mm256_srlv_epi32(v2, s);
    v3 = _mm256_srlv_epi32(v3, s);
    v4 = _mm256_srlv_epi32(v4, s);

    _mm256_storeu_si256(a, v1);
    _mm256_storeu_si256(a + 1, v2);
    _mm256_storeu_si256(a + 2, v3);
    _mm256_storeu_si256(a + 3, v4);
  }

  /* Up to three complete vectors left over from the unrolled loop. */
  for (; k < nvec; k++, a++) {
    __m256i v = _mm256_loadu_si256(a);
    v = _mm256_srlv_epi32(v, s);
    _mm256_storeu_si256(a, v);
  }

  /*
   * Scalar tail. A C shift by a negative count or by >= 32 is undefined,
   * whereas the vector shift above yields 0 for those counts, so mirror the
   * vector result explicitly instead of evaluating the undefined shift.
   */
  const int count_in_range = shiftamount >= 0 && shiftamount < 32;
  for (k *= 8; k < length; ++k) {
    array[k] = count_in_range ? array[k] >> shiftamount : 0;
  }
  return 0;
}
Beispiel #2
0
/* Exercise _mm256_srlv_epi32 over ten rounds of generated inputs.

   Each round fills eight 32-bit values (alternating in sign) and eight
   small per-element shift counts, runs the intrinsic, and hands the same
   inputs to compute_psrlvd256, which presumably fills in the reference
   result (defined elsewhere; confirm there).  The return values of
   check_union256i_d are summed, and a non-zero total aborts the program.  */
static void
avx2_test (void)
{
  union256i_d values, counts, actual;
  int expected[8];
  int sign = 1;
  int mismatches = 0;
  int round = 0;

  while (round < 10)
    {
      int lane;

      /* The sign flips after every element and carries over between
	 rounds.  */
      for (lane = 0; lane < 8; lane++, sign = -sign)
	{
	  values.a[lane] = lane * round * sign;
	  counts.a[lane] = (lane + round) >> 2;
	}

      actual.x = _mm256_srlv_epi32 (values.x, counts.x);
      compute_psrlvd256 (values.a, counts.a, expected);
      mismatches += check_union256i_d (actual, expected);

      round++;
    }

  if (mismatches)
    abort ();
}
/*
 * Logically right-shift every 32-bit element of `array`, in place, by
 * `shiftamount` bits.
 *
 * Complete groups of eight elements are processed with one AVX2 variable
 * shift each; the remaining length % 8 elements are handled by a scalar loop.
 *
 * array       - elements to shift; read and written with unaligned accesses.
 * length      - number of uint32_t elements in array.
 * shiftamount - shift count applied to every element. A count outside 0..31
 *               (negative values included) sets every element to 0, which is
 *               what _mm256_srlv_epi32 produces for such counts.
 *
 * Always returns 0.
 */
size_t variablevectorshift(uint32_t *array, size_t length, int shiftamount) {
  const size_t nvec = length / 8; /* number of complete 8-element groups */
  size_t k = 0;
  __m256i *a = (__m256i *) array;
  const __m256i s = _mm256_set1_epi32(shiftamount);

  for (; k < nvec; k++, a++) {
    __m256i v = _mm256_loadu_si256(a);
    v = _mm256_srlv_epi32(v, s);
    _mm256_storeu_si256(a, v);
  }

  /*
   * Scalar tail. A C shift by a negative count or by >= 32 is undefined,
   * whereas the vector shift above yields 0 for those counts, so mirror the
   * vector result explicitly instead of evaluating the undefined shift.
   */
  const int count_in_range = shiftamount >= 0 && shiftamount < 32;
  for (k *= 8; k < length; ++k) {
    array[k] = count_in_range ? array[k] >> shiftamount : 0;
  }
  return 0;
}
Beispiel #4
0
/*
 * Split the input bytes held in `chunk` into eight 12-bit values and pass
 * them, four at a time, to do_encode_6bytes.
 *
 * Within each 128-bit lane the byte shuffle builds two identical 32-bit
 * words per 3-byte group, with the group's first byte most significant
 * (i.e. the three bytes read as a big-endian 24-bit number). The first word
 * of each pair is shifted right by 12 and the second is left unshifted; after
 * masking to 12 bits they hold the upper and lower halves of that number.
 *
 * The low lane uses its bytes 0..5 and the high lane uses its own bytes
 * 6..11 (the byte shuffle never crosses lanes), so the caller presumably
 * places the same 12 input bytes in both lanes - TODO confirm at call site.
 *
 * alphabet - lookup table of 2-character entries, forwarded unchanged.
 * out      - output buffer; the low lane is encoded at out + 0 and the high
 *            lane at out + 8.
 * chunk    - input bytes as described above.
 */
static inline void do_encode_12bytes(const char (*alphabet)[2], char *out, __m256i chunk)
{
    // Listed lowest byte first; -1 makes the shuffle write a zero byte.
    const __m256i byte_picks = _mm256_setr_epi8(
        2, 1, 0, -1,
        2, 1, 0, -1,
        5, 4, 3, -1,
        5, 4, 3, -1,
        8, 7, 6, -1,
        8, 7, 6, -1,
        11, 10, 9, -1,
        11, 10, 9, -1
    );
    // Even-numbered words keep the upper 12 bits, odd-numbered the lower 12.
    const __m256i shift_counts = _mm256_setr_epi32(12, 0, 12, 0, 12, 0, 12, 0);
    const __m256i low12 = _mm256_set1_epi32(0xFFF);

    // big-endian byte triples -> pairs of 24-bit words -> 12-bit values
    const __m256i words = _mm256_shuffle_epi8(chunk, byte_picks);
    const __m256i shifted = _mm256_srlv_epi32(words, shift_counts);
    const __m256i values = _mm256_and_si256(shifted, low12);

    // hand each 128-bit half to the 6-byte encoder
    do_encode_6bytes(alphabet, out, _mm256_extracti128_si256(values, 0));
    do_encode_6bytes(alphabet, out + 8, _mm256_extracti128_si256(values, 1));
}
Beispiel #5
0
/*
 * Thin wrapper around _mm256_srlv_epi32: returns `a` with each 32-bit
 * element shifted right by the count in the matching element of `b`.
 * The line comment inside the body looks like an LLVM FileCheck directive
 * naming the builtin the call should lower to, so it is kept verbatim.
 */
__m256i test_mm256_srlv_epi32(__m256i a, __m256i b) {
  // CHECK: @llvm.x86.avx2.psrlv.d.256
  __m256i shifted = _mm256_srlv_epi32(a, b);
  return shifted;
}
/*
 * Thin wrapper around _mm256_srlv_epi32: returns `a` with each 32-bit
 * element shifted right by the count in the matching element of `b`.
 * The two line comments inside the body look like LLVM FileCheck directives
 * (a label for this function and the expected builtin call), so they are
 * kept verbatim.
 */
__m256i test_mm256_srlv_epi32(__m256i a, __m256i b) {
  // CHECK-LABEL: test_mm256_srlv_epi32
  // CHECK: call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
  __m256i shifted = _mm256_srlv_epi32(a, b);
  return shifted;
}