/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_remainder(void* in, void* out, const size_t size,
        const size_t elem_size, const size_t start_byte) {

    uint64_t* in_b = (uint64_t*) in;
    uint8_t* out_b = (uint8_t*) out;

    uint64_t x, t;

    /* size_t (not int) so the comparison against nbyte_bitrow is not a
     * signed/unsigned mismatch and large buffers cannot overflow the index. */
    size_t ii, kk;
    size_t nbyte = elem_size * size;
    size_t nbyte_bitrow = nbyte / 8;

    CHECK_MULT_EIGHT(nbyte);
    CHECK_MULT_EIGHT(start_byte);

    for (ii = start_byte / 8; ii < nbyte_bitrow; ii ++) {
        x = in_b[ii];
        TRANS_BIT_8X8(x, t);
        for (kk = 0; kk < 8; kk ++) {
            out_b[kk * nbyte_bitrow + ii] = x;
            x = x >> 8;
        }
    }
    return size * elem_size;
}
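/* For reference: the helper macros used throughout this section are defined
 * elsewhere in this file. A minimal sketch, assuming the standard bitshuffle
 * definitions (TRANS_BIT_8X8 is the classic Hacker's Delight delta-swap
 * transpose of an 8x8 bit matrix packed into a uint64_t):
 *
 *     #define CHECK_MULT_EIGHT(n) if ((n) % 8) return -80;
 *     #define CHECK_ERR(count) if ((count) < 0) { return count; }
 *     #define CHECK_ERR_FREE(count, buf) \
 *             if ((count) < 0) { free(buf); return count; }
 *
 *     #define TRANS_BIT_8X8(x, t) {                          \
 *             t = (x ^ (x >> 7)) & 0x00AA00AA00AA00AALL;     \
 *             x = x ^ t ^ (t << 7);                          \
 *             t = (x ^ (x >> 14)) & 0x0000CCCC0000CCCCLL;    \
 *             x = x ^ t ^ (t << 14);                         \
 *             t = (x ^ (x >> 28)) & 0x00000000F0F0F0F0LL;    \
 *             x = x ^ t ^ (t << 28);                         \
 *         }
 */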
/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_sse2(void* in, void* out, const size_t size,
        const size_t elem_size) {
    /* With a bit of care, this could be written such that it is */
    /* in_buf = out_buf safe. */
    char* in_b = (char*) in;
    uint16_t* out_ui16 = (uint16_t*) out;

    size_t nbyte = elem_size * size;

    __m128i xmm;
    int32_t bt;
    size_t ii, jj, kk;
    size_t ind;

    CHECK_MULT_EIGHT(size);

    if (elem_size % 2) {
        return bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
    }
    for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) {
        for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
            xmm = _mm_loadu_si128((__m128i *) &in_b[ii + jj]);
            for (kk = 0; kk < 8; kk++) {
                bt = _mm_movemask_epi8(xmm);
                xmm = _mm_slli_epi16(xmm, 1);
                ind = (ii + jj / 8 + (7 - kk) * elem_size);
                out_ui16[ind / 2] = bt;
            }
        }
    }
    return size * elem_size;
}
/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_sse2(void* in, void* out, const size_t size,
        const size_t elem_size) {

    char* in_b = (char*) in;
    char* out_b = (char*) out;
    uint16_t* out_ui16;

    int64_t count;

    size_t nbyte = elem_size * size;

    __m128i xmm;
    int32_t bt;
    size_t ii, kk;

    CHECK_MULT_EIGHT(nbyte);

    for (ii = 0; ii + 15 < nbyte; ii += 16) {
        xmm = _mm_loadu_si128((__m128i *) &in_b[ii]);
        for (kk = 0; kk < 8; kk++) {
            bt = _mm_movemask_epi8(xmm);
            xmm = _mm_slli_epi16(xmm, 1);
            out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
            *out_ui16 = bt;
        }
    }
    count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
            nbyte - nbyte % 16);
    return count;
}
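/* The movemask/slli loop above peels off one bit plane per pass:
 * _mm_movemask_epi8 gathers the MSB of each of the 16 bytes (byte i into
 * bit i of the result), and _mm_slli_epi16 promotes the next lower bit to
 * the MSB. A minimal scalar sketch of what one 16-byte iteration computes
 * for bit plane `bit` (illustrative helper only, not part of the library): */
static uint16_t bshuf_example_bit_plane(const uint8_t bytes[16], int bit) {
    /* Collect bit `bit` of each input byte into a 16-bit mask. */
    uint16_t mask = 0;
    int i;
    for (i = 0; i < 16; i++)
        mask |= (uint16_t) ((bytes[i] >> bit) & 1) << i;
    return mask;
}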
/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_AVX(void* in, void* out, const size_t size,
        const size_t elem_size) {

    CHECK_MULT_EIGHT(size);

    // With a bit of care, this could be written such that it is
    // in_buf = out_buf safe.
    char* in_b = (char*) in;
    char* out_b = (char*) out;

    size_t ii, jj, kk;
    size_t nbyte = elem_size * size;

    __m256i ymm;
    int32_t bt;

    if (elem_size % 4) {
        return bshuf_shuffle_bit_eightelem_sse2(in, out, size, elem_size);
    }
    for (jj = 0; jj + 31 < 8 * elem_size; jj += 32) {
        for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) {
            ymm = _mm256_loadu_si256((__m256i *) &in_b[ii + jj]);
            for (kk = 0; kk < 8; kk++) {
                bt = _mm256_movemask_epi8(ymm);
                ymm = _mm256_slli_epi16(ymm, 1);
                size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
                * (int32_t *) &out_b[ind] = bt;
            }
        }
    }
    return size * elem_size;
}
/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_scal(void* in, void* out, const size_t size,
        const size_t elem_size) {

    CHECK_MULT_EIGHT(size);

    size_t ii, jj, kk;
    char* in_b = (char*) in;
    char* out_b = (char*) out;
    size_t nbyte = elem_size * size;

    uint64_t x, t;

    for (jj = 0; jj < 8 * elem_size; jj += 8) {
        for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) {
            x = *((uint64_t*) &in_b[ii + jj]);
            TRANS_BIT_8X8(x, t);
            for (kk = 0; kk < 8; kk++) {
                *((uint8_t*) &out_b[ii + jj / 8 + kk * elem_size]) = x;
                x = x >> 8;
            }
        }
    }
    return size * elem_size;
}
/* Transpose bytes within elements, starting partway through input. */
int64_t bshuf_trans_byte_elem_remainder(void* in, void* out, const size_t size,
        const size_t elem_size, const size_t start) {

    size_t ii, jj, kk;
    char* in_b = (char*) in;
    char* out_b = (char*) out;

    CHECK_MULT_EIGHT(start);

    if (size > start) {
        // ii loop separated into 2 loops so the compiler can unroll
        // the inner one.
        for (ii = start; ii + 7 < size; ii += 8) {
            for (jj = 0; jj < elem_size; jj++) {
                for (kk = 0; kk < 8; kk++) {
                    out_b[jj * size + ii + kk] =
                            in_b[ii * elem_size + kk * elem_size + jj];
                }
            }
        }
        for (ii = size - size % 8; ii < size; ii ++) {
            for (jj = 0; jj < elem_size; jj++) {
                out_b[jj * size + ii] = in_b[ii * elem_size + jj];
            }
        }
    }
    return size * elem_size;
}
/* Transpose rows of shuffled bits (size / 8 bytes) within groups of 8. */
int64_t bshuf_trans_bitrow_eight(void* in, void* out, const size_t size,
        const size_t elem_size) {

    size_t nbyte_bitrow = size / 8;

    CHECK_MULT_EIGHT(size);

    return bshuf_trans_elem(in, out, 8, elem_size, nbyte_bitrow);
}
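/* bshuf_trans_elem is defined elsewhere in this file. A minimal sketch under
 * the assumed signature (a general lda-by-ldb transpose of elem_size-byte
 * elements, one memcpy per element); named _sketch to avoid clashing with
 * the real definition: */
#include <string.h>

int64_t bshuf_trans_elem_sketch(void* in, void* out, const size_t lda,
        const size_t ldb, const size_t elem_size) {
    size_t ii, jj;
    char* in_b = (char*) in;
    char* out_b = (char*) out;
    for (ii = 0; ii < lda; ii++) {
        for (jj = 0; jj < ldb; jj++) {
            /* Element (ii, jj) of the input becomes (jj, ii) of the output. */
            memcpy(&out_b[(jj * lda + ii) * elem_size],
                   &in_b[(ii * ldb + jj) * elem_size], elem_size);
        }
    }
    return lda * ldb * elem_size;
}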
/* Untranspose bits within elements. */
int64_t bshuf_untrans_bit_elem_sse2(void* in, void* out, const size_t size,
        const size_t elem_size, void* tmp_buf) {

    int64_t count;

    CHECK_MULT_EIGHT(size);

    count = bshuf_trans_byte_bitrow_sse2(in, tmp_buf, size, elem_size);
    CHECK_ERR(count);
    count = bshuf_shuffle_bit_eightelem_sse2(tmp_buf, out, size, elem_size);

    return count;
}
/* Transpose bits within elements. */
int64_t bshuf_trans_bit_elem_scal(void* in, void* out, const size_t size,
        const size_t elem_size, void* tmp_buf) {

    int64_t count;

    CHECK_MULT_EIGHT(size);

    count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
    CHECK_ERR(count);
    count = bshuf_trans_bit_byte_scal(out, tmp_buf, size, elem_size);
    CHECK_ERR(count);
    count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);

    return count;
}
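/* Illustrative usage of the scalar pipeline above (hypothetical buffers and
 * sizes; error handling is minimal): `size` must be a multiple of 8 and
 * `tmp_buf` must hold size * elem_size bytes, the same as the input and
 * output buffers. */
#include <stdlib.h>

void bshuf_example_usage(void) {
    const size_t size = 64, elem_size = 4;
    char* in  = calloc(size, elem_size);
    char* out = malloc(size * elem_size);
    char* tmp = malloc(size * elem_size);
    if (in != NULL && out != NULL && tmp != NULL) {
        /* Returns size * elem_size on success, negative on error. */
        int64_t count = bshuf_trans_bit_elem_scal(in, out, size, elem_size, tmp);
        (void) count;
    }
    free(in);
    free(out);
    free(tmp);
}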
/* Untranspose bits within elements. */
int64_t bshuf_untrans_bit_elem_AVX(void* in, void* out, const size_t size,
        const size_t elem_size) {

    int64_t count;

    CHECK_MULT_EIGHT(size);

    void* tmp_buf = malloc(size * elem_size);
    if (tmp_buf == NULL) return -1;

    count = bshuf_trans_byte_bitrow_AVX(in, tmp_buf, size, elem_size);
    CHECK_ERR_FREE(count, tmp_buf);
    count = bshuf_shuffle_bit_eightelem_AVX(tmp_buf, out, size, elem_size);

    free(tmp_buf);
    return count;
}
/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes. */
int64_t bshuf_trans_byte_bitrow_scal(void* in, void* out, const size_t size,
        const size_t elem_size) {

    size_t ii, jj, kk;
    char* in_b = (char*) in;
    char* out_b = (char*) out;

    size_t nbyte_row = size / 8;

    CHECK_MULT_EIGHT(size);

    for (jj = 0; jj < elem_size; jj++) {
        for (ii = 0; ii < nbyte_row; ii++) {
            for (kk = 0; kk < 8; kk++) {
                out_b[ii * 8 * elem_size + jj * 8 + kk] =
                        in_b[(jj * 8 + kk) * nbyte_row + ii];
            }
        }
    }
    return size * elem_size;
}
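/* Layout note for the scalar bitrow transpose above: the input holds
 * 8 * elem_size bit-rows of nbyte_row = size / 8 bytes each; the output
 * interleaves them so the 8 bit-rows belonging to one byte position of one
 * element land in 8 consecutive bytes. E.g. for elem_size = 2 and size = 16
 * (nbyte_row = 2), output byte ii*16 + jj*8 + kk comes from input byte
 * (jj*8 + kk)*2 + ii. */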
/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes. */
int64_t bshuf_trans_byte_bitrow_sse2(void* in, void* out, const size_t size,
        const size_t elem_size) {

    char* in_b = (char*) in;
    char* out_b = (char*) out;

    size_t nrows = 8 * elem_size;
    size_t nbyte_row = size / 8;
    size_t ii, jj;

    __m128i a0, b0, c0, d0, e0, f0, g0, h0;
    __m128i a1, b1, c1, d1, e1, f1, g1, h1;
    __m128 *as, *bs, *cs, *ds, *es, *fs, *gs, *hs;

    CHECK_MULT_EIGHT(size);

    for (ii = 0; ii + 7 < nrows; ii += 8) {
        for (jj = 0; jj + 15 < nbyte_row; jj += 16) {
            a0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 0)*nbyte_row + jj]);
            b0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 1)*nbyte_row + jj]);
            c0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 2)*nbyte_row + jj]);
            d0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 3)*nbyte_row + jj]);
            e0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 4)*nbyte_row + jj]);
            f0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 5)*nbyte_row + jj]);
            g0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 6)*nbyte_row + jj]);
            h0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 7)*nbyte_row + jj]);

            a1 = _mm_unpacklo_epi8(a0, b0);
            b1 = _mm_unpacklo_epi8(c0, d0);
            c1 = _mm_unpacklo_epi8(e0, f0);
            d1 = _mm_unpacklo_epi8(g0, h0);
            e1 = _mm_unpackhi_epi8(a0, b0);
            f1 = _mm_unpackhi_epi8(c0, d0);
            g1 = _mm_unpackhi_epi8(e0, f0);
            h1 = _mm_unpackhi_epi8(g0, h0);

            a0 = _mm_unpacklo_epi16(a1, b1);
            b0 = _mm_unpacklo_epi16(c1, d1);
            c0 = _mm_unpackhi_epi16(a1, b1);
            d0 = _mm_unpackhi_epi16(c1, d1);
            e0 = _mm_unpacklo_epi16(e1, f1);
            f0 = _mm_unpacklo_epi16(g1, h1);
            g0 = _mm_unpackhi_epi16(e1, f1);
            h0 = _mm_unpackhi_epi16(g1, h1);

            a1 = _mm_unpacklo_epi32(a0, b0);
            b1 = _mm_unpackhi_epi32(a0, b0);
            c1 = _mm_unpacklo_epi32(c0, d0);
            d1 = _mm_unpackhi_epi32(c0, d0);
            e1 = _mm_unpacklo_epi32(e0, f0);
            f1 = _mm_unpackhi_epi32(e0, f0);
            g1 = _mm_unpacklo_epi32(g0, h0);
            h1 = _mm_unpackhi_epi32(g0, h0);

            /* There is no storeh instruction for integers, so reinterpret */
            /* the registers as floats; a storel does exist for integers */
            /* (_mm_storel_epi64). */
            as = (__m128 *) &a1;
            bs = (__m128 *) &b1;
            cs = (__m128 *) &c1;
            ds = (__m128 *) &d1;
            es = (__m128 *) &e1;
            fs = (__m128 *) &f1;
            gs = (__m128 *) &g1;
            hs = (__m128 *) &h1;

            _mm_storel_pi((__m64 *) &out_b[(jj + 0) * nrows + ii], *as);
            _mm_storel_pi((__m64 *) &out_b[(jj + 2) * nrows + ii], *bs);
            _mm_storel_pi((__m64 *) &out_b[(jj + 4) * nrows + ii], *cs);
            _mm_storel_pi((__m64 *) &out_b[(jj + 6) * nrows + ii], *ds);
            _mm_storel_pi((__m64 *) &out_b[(jj + 8) * nrows + ii], *es);
            _mm_storel_pi((__m64 *) &out_b[(jj + 10) * nrows + ii], *fs);
            _mm_storel_pi((__m64 *) &out_b[(jj + 12) * nrows + ii], *gs);
            _mm_storel_pi((__m64 *) &out_b[(jj + 14) * nrows + ii], *hs);

            _mm_storeh_pi((__m64 *) &out_b[(jj + 1) * nrows + ii], *as);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 3) * nrows + ii], *bs);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 5) * nrows + ii], *cs);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 7) * nrows + ii], *ds);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 9) * nrows + ii], *es);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 11) * nrows + ii], *fs);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 13) * nrows + ii], *gs);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 15) * nrows + ii], *hs);
        }
        for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) {
            out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj];
            out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj];
            out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj];
            out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj];
            out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj];
            out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj];
            out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj];
            out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj];
        }
    }
    return size * elem_size;
}
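/* The pointer casts above alias the same 128-bit registers as floats purely
 * to reach _mm_storeh_pi. A minimal equivalent sketch using
 * _mm_castsi128_ps (a no-op reinterpret intrinsic) instead of pointer
 * aliasing; illustrative only, not a drop-in replacement for the block
 * above: */
static void bshuf_example_store_halves(char* dst_lo, char* dst_hi, __m128i x) {
    __m128 xf = _mm_castsi128_ps(x);      /* reinterpret, emits no instruction */
    _mm_storel_pi((__m64 *) dst_lo, xf);  /* low 8 bytes  */
    _mm_storeh_pi((__m64 *) dst_hi, xf);  /* high 8 bytes */
}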
/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes. */
int64_t bshuf_trans_byte_bitrow_AVX(void* in, void* out, const size_t size,
        const size_t elem_size) {

    size_t hh, ii, jj, kk, mm;
    char* in_b = (char*) in;
    char* out_b = (char*) out;

    CHECK_MULT_EIGHT(size);

    size_t nrows = 8 * elem_size;
    size_t nbyte_row = size / 8;

    if (elem_size % 4)
        return bshuf_trans_byte_bitrow_sse2(in, out, size, elem_size);

    __m256i ymm_0[8];
    __m256i ymm_1[8];
    __m256i ymm_storage[8][4];

    for (jj = 0; jj + 31 < nbyte_row; jj += 32) {
        for (ii = 0; ii + 3 < elem_size; ii += 4) {
            for (hh = 0; hh < 4; hh ++) {

                for (kk = 0; kk < 8; kk ++){
                    ymm_0[kk] = _mm256_loadu_si256((__m256i *) &in_b[
                            (ii * 8 + hh * 8 + kk) * nbyte_row + jj]);
                }

                for (kk = 0; kk < 4; kk ++){
                    ymm_1[kk] = _mm256_unpacklo_epi8(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                    ymm_1[kk + 4] = _mm256_unpackhi_epi8(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                }

                for (kk = 0; kk < 2; kk ++){
                    for (mm = 0; mm < 2; mm ++){
                        ymm_0[kk * 4 + mm] = _mm256_unpacklo_epi16(
                                ymm_1[kk * 4 + mm * 2],
                                ymm_1[kk * 4 + mm * 2 + 1]);
                        ymm_0[kk * 4 + mm + 2] = _mm256_unpackhi_epi16(
                                ymm_1[kk * 4 + mm * 2],
                                ymm_1[kk * 4 + mm * 2 + 1]);
                    }
                }

                for (kk = 0; kk < 4; kk ++){
                    ymm_1[kk * 2] = _mm256_unpacklo_epi32(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                    ymm_1[kk * 2 + 1] = _mm256_unpackhi_epi32(ymm_0[kk * 2],
                            ymm_0[kk * 2 + 1]);
                }

                for (kk = 0; kk < 8; kk ++){
                    ymm_storage[kk][hh] = ymm_1[kk];
                }
            }

            for (mm = 0; mm < 8; mm ++) {

                for (kk = 0; kk < 4; kk ++){
                    ymm_0[kk] = ymm_storage[mm][kk];
                }

                ymm_1[0] = _mm256_unpacklo_epi64(ymm_0[0], ymm_0[1]);
                ymm_1[1] = _mm256_unpacklo_epi64(ymm_0[2], ymm_0[3]);
                ymm_1[2] = _mm256_unpackhi_epi64(ymm_0[0], ymm_0[1]);
                ymm_1[3] = _mm256_unpackhi_epi64(ymm_0[2], ymm_0[3]);

                /* Immediate 32 (0x20) selects the low 128-bit lanes of both */
                /* inputs; 49 (0x31) selects the high lanes. */
                ymm_0[0] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 32);
                ymm_0[1] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 32);
                ymm_0[2] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 49);
                ymm_0[3] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 49);

                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 0 * 16) * nrows + ii * 8], ymm_0[0]);
                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 0 * 16 + 1) * nrows + ii * 8], ymm_0[1]);
                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 1 * 16) * nrows + ii * 8], ymm_0[2]);
                _mm256_storeu_si256((__m256i *) &out_b[
                        (jj + mm * 2 + 1 * 16 + 1) * nrows + ii * 8], ymm_0[3]);
            }
        }
    }
    for (ii = 0; ii < nrows; ii ++ ) {
        for (jj = nbyte_row - nbyte_row % 32; jj < nbyte_row; jj ++) {
            out_b[jj * nrows + ii] = in_b[ii * nbyte_row + jj];
        }
    }
    return size * elem_size;
}