/* Transpose bits within bytes. */ int64_t bshuf_trans_bit_byte_AVX(void* in, void* out, const size_t size, const size_t elem_size) { size_t ii, kk; char* in_b = (char*) in; char* out_b = (char*) out; int32_t* out_i32; size_t nbyte = elem_size * size; int64_t count; __m256i ymm; int32_t bt; for (ii = 0; ii + 31 < nbyte; ii += 32) { ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]); for (kk = 0; kk < 8; kk++) { bt = _mm256_movemask_epi8(ymm); ymm = _mm256_slli_epi16(ymm, 1); out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; *out_i32 = bt; } } count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size, nbyte - nbyte % 32); return count; }
/* Transpose bits within bytes. */ int64_t bshuf_trans_bit_byte_sse2(void* in, void* out, const size_t size, const size_t elem_size) { char* in_b = (char*) in; char* out_b = (char*) out; uint16_t* out_ui16; int64_t count; size_t nbyte = elem_size * size; __m128i xmm; int32_t bt; size_t ii, kk; CHECK_MULT_EIGHT(nbyte); for (ii = 0; ii + 15 < nbyte; ii += 16) { xmm = _mm_loadu_si128((__m128i *) &in_b[ii]); for (kk = 0; kk < 8; kk++) { bt = _mm_movemask_epi8(xmm); xmm = _mm_slli_epi16(xmm, 1); out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; *out_ui16 = bt; } } count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size, nbyte - nbyte % 16); return count; }
/* Transpose bits within bytes. */ int64_t bshuf_trans_bit_byte_scal(void* in, void* out, const size_t size, const size_t elem_size) { return bshuf_trans_bit_byte_remainder(in, out, size, elem_size, 0); }