/* Transpose bytes within elements using best SSE algorithm available. */ int64_t bshuf_trans_byte_elem_SSE(void* in, void* out, const size_t size, const size_t elem_size) { int64_t count; // Trivial cases: power of 2 bytes. switch (elem_size) { case 1: count = bshuf_copy(in, out, size, elem_size); return count; case 2: count = bshuf_trans_byte_elem_SSE_16(in, out, size); return count; case 4: count = bshuf_trans_byte_elem_SSE_32(in, out, size); return count; case 8: count = bshuf_trans_byte_elem_SSE_64(in, out, size); return count; } // Worst case: odd number of bytes. Turns out that this is faster for // (odd * 2) byte elements as well (hence % 4). if (elem_size % 4) { count = bshuf_trans_byte_elem_scal(in, out, size, elem_size); return count; } // Multiple of power of 2: transpose hierarchically. { size_t nchunk_elem; void* tmp_buf = malloc(size * elem_size); if (tmp_buf == NULL) return -1; if ((elem_size % 8) == 0) { nchunk_elem = elem_size / 8; TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t); count = bshuf_trans_byte_elem_SSE_64(out, tmp_buf, size * nchunk_elem); bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size); } else if ((elem_size % 4) == 0) { nchunk_elem = elem_size / 4; TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t); count = bshuf_trans_byte_elem_SSE_32(out, tmp_buf, size * nchunk_elem); bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size); } else { // Not used since scalar algorithm is faster. nchunk_elem = elem_size / 2; TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t); count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf, size * nchunk_elem); bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size); } free(tmp_buf); return count; } }
/* Transpose bits within elements. */ int64_t bshuf_trans_bit_elem_scal(void* in, void* out, const size_t size, const size_t elem_size, void* tmp_buf) { int64_t count; CHECK_MULT_EIGHT(size); count = bshuf_trans_byte_elem_scal(in, out, size, elem_size); CHECK_ERR(count); count = bshuf_trans_bit_byte_scal(out, tmp_buf, size, elem_size); CHECK_ERR(count); count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size); return count; }
/* Transpose bits within elements. */ int64_t bshuf_trans_bit_elem_scal(void* in, void* out, const size_t size, const size_t elem_size) { int64_t count; CHECK_MULT_EIGHT(size); void* tmp_buf = malloc(size * elem_size); if (tmp_buf == NULL) return -1; count = bshuf_trans_byte_elem_scal(in, out, size, elem_size); CHECK_ERR_FREE(count, tmp_buf); count = bshuf_trans_bit_byte_scal(out, tmp_buf, size, elem_size); CHECK_ERR_FREE(count, tmp_buf); count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size); free(tmp_buf); return count; }