/* Transpose bytes within elements for 16 bit elements. */ int64_t bshuf_trans_byte_elem_SSE_16(void* in, void* out, const size_t size) { char* in_b = (char*) in; char* out_b = (char*) out; __m128i a0, b0, a1, b1; size_t ii; for (ii=0; ii + 15 < size; ii += 16) { a0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 0*16]); b0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 1*16]); a1 = _mm_unpacklo_epi8(a0, b0); b1 = _mm_unpackhi_epi8(a0, b0); a0 = _mm_unpacklo_epi8(a1, b1); b0 = _mm_unpackhi_epi8(a1, b1); a1 = _mm_unpacklo_epi8(a0, b0); b1 = _mm_unpackhi_epi8(a0, b0); a0 = _mm_unpacklo_epi8(a1, b1); b0 = _mm_unpackhi_epi8(a1, b1); _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0); _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); } return bshuf_trans_byte_elem_remainder(in, out, size, 2, size - size % 16); }
/* Transpose bytes within elements for 32 bit elements. */ int64_t bshuf_trans_byte_elem_SSE_32(void* in, void* out, const size_t size) { char* in_b = (char*) in; char* out_b = (char*) out; __m128i a0, b0, c0, d0, a1, b1, c1, d1; size_t ii; for (ii=0; ii + 15 < size; ii += 16) { a0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 0*16]); b0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 1*16]); c0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 2*16]); d0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 3*16]); a1 = _mm_unpacklo_epi8(a0, b0); b1 = _mm_unpackhi_epi8(a0, b0); c1 = _mm_unpacklo_epi8(c0, d0); d1 = _mm_unpackhi_epi8(c0, d0); a0 = _mm_unpacklo_epi8(a1, b1); b0 = _mm_unpackhi_epi8(a1, b1); c0 = _mm_unpacklo_epi8(c1, d1); d0 = _mm_unpackhi_epi8(c1, d1); a1 = _mm_unpacklo_epi8(a0, b0); b1 = _mm_unpackhi_epi8(a0, b0); c1 = _mm_unpacklo_epi8(c0, d0); d1 = _mm_unpackhi_epi8(c0, d0); a0 = _mm_unpacklo_epi64(a1, c1); b0 = _mm_unpackhi_epi64(a1, c1); c0 = _mm_unpacklo_epi64(b1, d1); d0 = _mm_unpackhi_epi64(b1, d1); _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0); _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0); _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0); } return bshuf_trans_byte_elem_remainder(in, out, size, 4, size - size % 16); }
/* Transpose bytes within elements for 64 bit elements. */ int64_t bshuf_trans_byte_elem_SSE_64(void* in, void* out, const size_t size) { char* in_b = (char*) in; char* out_b = (char*) out; __m128i a0, b0, c0, d0, e0, f0, g0, h0; __m128i a1, b1, c1, d1, e1, f1, g1, h1; size_t ii; for (ii=0; ii + 15 < size; ii += 16) { a0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 0*16]); b0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 1*16]); c0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 2*16]); d0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 3*16]); e0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 4*16]); f0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 5*16]); g0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 6*16]); h0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 7*16]); a1 = _mm_unpacklo_epi8(a0, b0); b1 = _mm_unpackhi_epi8(a0, b0); c1 = _mm_unpacklo_epi8(c0, d0); d1 = _mm_unpackhi_epi8(c0, d0); e1 = _mm_unpacklo_epi8(e0, f0); f1 = _mm_unpackhi_epi8(e0, f0); g1 = _mm_unpacklo_epi8(g0, h0); h1 = _mm_unpackhi_epi8(g0, h0); a0 = _mm_unpacklo_epi8(a1, b1); b0 = _mm_unpackhi_epi8(a1, b1); c0 = _mm_unpacklo_epi8(c1, d1); d0 = _mm_unpackhi_epi8(c1, d1); e0 = _mm_unpacklo_epi8(e1, f1); f0 = _mm_unpackhi_epi8(e1, f1); g0 = _mm_unpacklo_epi8(g1, h1); h0 = _mm_unpackhi_epi8(g1, h1); a1 = _mm_unpacklo_epi32(a0, c0); b1 = _mm_unpackhi_epi32(a0, c0); c1 = _mm_unpacklo_epi32(b0, d0); d1 = _mm_unpackhi_epi32(b0, d0); e1 = _mm_unpacklo_epi32(e0, g0); f1 = _mm_unpackhi_epi32(e0, g0); g1 = _mm_unpacklo_epi32(f0, h0); h1 = _mm_unpackhi_epi32(f0, h0); a0 = _mm_unpacklo_epi64(a1, e1); b0 = _mm_unpackhi_epi64(a1, e1); c0 = _mm_unpacklo_epi64(b1, f1); d0 = _mm_unpackhi_epi64(b1, f1); e0 = _mm_unpacklo_epi64(c1, g1); f0 = _mm_unpackhi_epi64(c1, g1); g0 = _mm_unpacklo_epi64(d1, h1); h0 = _mm_unpackhi_epi64(d1, h1); _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0); _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0); _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0); _mm_storeu_si128((__m128i *) &out_b[4*size + ii], e0); _mm_storeu_si128((__m128i *) &out_b[5*size + ii], f0); _mm_storeu_si128((__m128i *) &out_b[6*size + ii], g0); _mm_storeu_si128((__m128i *) &out_b[7*size + ii], h0); } return bshuf_trans_byte_elem_remainder(in, out, size, 8, size - size % 16); }
/* Transpose bytes within elements. */ int64_t bshuf_trans_byte_elem_scal(void* in, void* out, const size_t size, const size_t elem_size) { return bshuf_trans_byte_elem_remainder(in, out, size, elem_size, 0); }