/*
 * Logically shift every 32-bit element of `array` right by `shiftamount`
 * bits, in place.  Whole groups of 8 elements are processed with AVX2
 * variable shifts, with the main loop unrolled four vectors (32 elements)
 * per iteration; the remaining length % 8 elements are handled in scalar
 * code.
 *
 * array       - buffer of `length` uint32_t values, overwritten with the
 *               shifted values.  Unaligned loads/stores are used, so no
 *               particular alignment is required.
 * length      - number of elements in `array`.
 * shiftamount - shift count applied to every element.  A count outside
 *               0..31 (negative values included) clears the element: that
 *               is what _mm256_srlv_epi32 does per lane, and the scalar
 *               tail is written to match it instead of evaluating an
 *               out-of-range C shift (undefined behaviour).
 *
 * Always returns 0.
 */
size_t variablevectorshift_unrolled(uint32_t *array, size_t length, int shiftamount) {
    const size_t vectors = length / 8;  /* number of whole 8-element vectors */
    const __m256i s = _mm256_set1_epi32(shiftamount);
    __m256i *a = (__m256i *) array;
    size_t k = 0;

    /* Main loop: four independent vectors per iteration. */
    for (; k + 3 < vectors; k += 4, a += 4) {
        __m256i v1 = _mm256_loadu_si256(a);
        __m256i v2 = _mm256_loadu_si256(a + 1);
        __m256i v3 = _mm256_loadu_si256(a + 2);
        __m256i v4 = _mm256_loadu_si256(a + 3);
        v1 = _mm256_srlv_epi32(v1, s);
        v2 = _mm256_srlv_epi32(v2, s);
        v3 = _mm256_srlv_epi32(v3, s);
        v4 = _mm256_srlv_epi32(v4, s);
        _mm256_storeu_si256(a, v1);
        _mm256_storeu_si256(a + 1, v2);
        _mm256_storeu_si256(a + 2, v3);
        _mm256_storeu_si256(a + 3, v4);
    }

    /* Up to three whole vectors left over from the unrolled loop. */
    for (; k < vectors; k++, a++) {
        __m256i v = _mm256_loadu_si256(a);
        v = _mm256_srlv_epi32(v, s);
        _mm256_storeu_si256(a, v);
    }

    /*
     * Scalar tail for the last length % 8 elements.  `x >> n` is undefined
     * in C when n is negative or >= 32, so emulate the vector instruction
     * explicitly: out-of-range counts produce 0.
     */
    const int in_range = (shiftamount >= 0 && shiftamount < 32);
    for (k *= 8; k < length; ++k) {
        array[k] = in_range ? (array[k] >> shiftamount) : 0;
    }
    return 0;
}
/* Self-test for _mm256_srlv_epi32: runs ten rounds of eight-lane inputs
   through the intrinsic and through compute_psrlvd256 (presumably the
   scalar reference implementation -- defined elsewhere), and aborts if
   check_union256i_d reported a non-zero result for any round.  */
static void
avx2_test (void)
{
  union256i_d values, counts, actual;
  int expected[8];
  int round, lane;
  int sign = 1;
  int failures = 0;

  for (round = 0; round < 10; round++)
    {
      /* Fill the lanes: values alternate in sign from one lane to the
         next, shift counts are (lane + round) / 4, i.e. 0..4.  */
      for (lane = 0; lane < 8; lane++)
        {
          values.a[lane] = lane * round * sign;
          counts.a[lane] = (lane + round) >> 2;
          sign = -sign;
        }

      actual.x = _mm256_srlv_epi32 (values.x, counts.x);
      compute_psrlvd256 (values.a, counts.a, expected);

      /* Accumulate whatever the checker returns; any non-zero total is
         treated as failure below.  */
      failures += check_union256i_d (actual, expected);
    }

  if (failures != 0)
    abort ();
}
/*
 * Logically shift every 32-bit element of `array` right by `shiftamount`
 * bits, in place.  Whole groups of 8 elements are processed with AVX2
 * variable shifts; the remaining length % 8 elements are handled in scalar
 * code.
 *
 * array       - buffer of `length` uint32_t values, overwritten with the
 *               shifted values.  Unaligned loads/stores are used, so no
 *               particular alignment is required.
 * length      - number of elements in `array`.
 * shiftamount - shift count applied to every element.  A count outside
 *               0..31 (negative values included) clears the element: that
 *               is what _mm256_srlv_epi32 does per lane, and the scalar
 *               tail is written to match it instead of evaluating an
 *               out-of-range C shift (undefined behaviour).
 *
 * Always returns 0.
 */
size_t variablevectorshift(uint32_t *array, size_t length, int shiftamount) {
    const size_t vectors = length / 8;  /* number of whole 8-element vectors */
    const __m256i s = _mm256_set1_epi32(shiftamount);
    __m256i *a = (__m256i *) array;
    size_t k = 0;

    /* Vector part: one 8-element vector per iteration. */
    for (; k < vectors; k++, a++) {
        __m256i v = _mm256_loadu_si256(a);
        v = _mm256_srlv_epi32(v, s);
        _mm256_storeu_si256(a, v);
    }

    /*
     * Scalar tail for the last length % 8 elements.  `x >> n` is undefined
     * in C when n is negative or >= 32, so emulate the vector instruction
     * explicitly: out-of-range counts produce 0.
     */
    const int in_range = (shiftamount >= 0 && shiftamount < 32);
    for (k *= 8; k < length; ++k) {
        array[k] = in_range ? (array[k] >> shiftamount) : 0;
    }
    return 0;
}
/*
 * Split bytes of `chunk` into 12-bit values (one per 32-bit lane) and pass
 * each 128-bit half of the result to do_encode_6bytes().
 *
 * alphabet - forwarded unchanged to do_encode_6bytes(); presumably a table
 *            of 2-character entries indexed by a 12-bit value -- confirm
 *            against do_encode_6bytes().
 * out      - destination; the lower half is encoded at out, the upper half
 *            at out + 8.
 * chunk    - input bytes.  NOTE(review): _mm256_shuffle_epi8 selects bytes
 *            within each 128-bit half independently, so the lower half
 *            supplies its bytes 0..5 and the upper half supplies ITS bytes
 *            6..11.  The caller must lay out `chunk` accordingly -- verify
 *            at the call site.
 */
static inline void do_encode_12bytes(const char (*alphabet)[2], char *out, __m256i chunk) {
    /*
     * Shuffle control, listed in memory order (lowest byte first).  Every
     * 32-bit lane receives three source bytes in reversed order with a
     * zeroed top byte (-1 selects zero), turning a big-endian 24-bit group
     * into a native integer.  Each 3-byte group is duplicated into two
     * adjacent lanes.
     */
    const __m256i byte_order = _mm256_setr_epi8(
         2,  1, 0, -1,    2,  1, 0, -1,
         5,  4, 3, -1,    5,  4, 3, -1,
         8,  7, 6, -1,    8,  7, 6, -1,
        11, 10, 9, -1,   11, 10, 9, -1);

    /* Even lanes keep the upper 12 bits of the group, odd lanes the lower 12. */
    const __m256i lane_shift = _mm256_setr_epi32(12, 0, 12, 0, 12, 0, 12, 0);
    const __m256i low12 = _mm256_set1_epi32(4095);

    __m256i groups = _mm256_shuffle_epi8(chunk, byte_order);
    groups = _mm256_and_si256(_mm256_srlv_epi32(groups, lane_shift), low12);

    /* Hand the two 128-bit halves to the 6-byte encoder. */
    const __m128i lower = _mm256_extracti128_si256(groups, 0);
    const __m128i upper = _mm256_extracti128_si256(groups, 1);
    do_encode_6bytes(alphabet, out, lower);
    do_encode_6bytes(alphabet, out + 8, upper);
}
/*
 * Thin wrapper that forwards both operands to _mm256_srlv_epi32 and returns
 * its result.  The `//` line inside the body looks like a directive read by
 * an external test tool, so it is kept verbatim.
 */
__m256i test_mm256_srlv_epi32(__m256i a, __m256i b) {
  // CHECK: @llvm.x86.avx2.psrlv.d.256
  __m256i shifted = _mm256_srlv_epi32(a, b);
  return shifted;
}
/*
 * Thin wrapper that forwards both operands to _mm256_srlv_epi32 and returns
 * its result.  The two `//` lines inside the body look like directives read
 * by an external test tool, so they are kept verbatim and in order.
 */
__m256i test_mm256_srlv_epi32(__m256i a, __m256i b) {
  // CHECK-LABEL: test_mm256_srlv_epi32
  // CHECK: call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
  __m256i shifted = _mm256_srlv_epi32(a, b);
  return shifted;
}