/* Shuffle bits within the bytes of eight element blocks. */ int64_t bshuf_shuffle_bit_eightelem_sse2(void* in, void* out, const size_t size, const size_t elem_size) { /* With a bit of care, this could be written such that such that it is */ /* in_buf = out_buf safe. */ char* in_b = (char*) in; uint16_t* out_ui16 = (uint16_t*) out; size_t nbyte = elem_size * size; __m128i xmm; int32_t bt; size_t ii, jj, kk; size_t ind; CHECK_MULT_EIGHT(size); if (elem_size % 2) { bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size); } else { for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) { for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) { xmm = _mm_loadu_si128((__m128i *) &in_b[ii + jj]); for (kk = 0; kk < 8; kk++) { bt = _mm_movemask_epi8(xmm); xmm = _mm_slli_epi16(xmm, 1); ind = (ii + jj / 8 + (7 - kk) * elem_size); out_ui16[ind / 2] = bt; } } } } return size * elem_size; }
/* Untranspose bits within elements. */ int64_t bshuf_untrans_bit_elem_scal(void* in, void* out, const size_t size, const size_t elem_size, void* tmp_buf) { int64_t count; CHECK_MULT_EIGHT(size); count = bshuf_trans_byte_bitrow_scal(in, tmp_buf, size, elem_size); CHECK_ERR(count); count = bshuf_shuffle_bit_eightelem_scal(tmp_buf, out, size, elem_size); return count; }
/* Untranspose bits within elements. */ int64_t bshuf_untrans_bit_elem_scal(void* in, void* out, const size_t size, const size_t elem_size) { int64_t count; CHECK_MULT_EIGHT(size); void* tmp_buf = malloc(size * elem_size); if (tmp_buf == NULL) return -1; count = bshuf_trans_byte_bitrow_scal(in, tmp_buf, size, elem_size); CHECK_ERR_FREE(count, tmp_buf); count = bshuf_shuffle_bit_eightelem_scal(tmp_buf, out, size, elem_size); free(tmp_buf); return count; }