static void hadamard_col8x2_avx2(__m256i *in, int iter) { __m256i a0 = in[0]; __m256i a1 = in[1]; __m256i a2 = in[2]; __m256i a3 = in[3]; __m256i a4 = in[4]; __m256i a5 = in[5]; __m256i a6 = in[6]; __m256i a7 = in[7]; __m256i b0 = _mm256_add_epi16(a0, a1); __m256i b1 = _mm256_sub_epi16(a0, a1); __m256i b2 = _mm256_add_epi16(a2, a3); __m256i b3 = _mm256_sub_epi16(a2, a3); __m256i b4 = _mm256_add_epi16(a4, a5); __m256i b5 = _mm256_sub_epi16(a4, a5); __m256i b6 = _mm256_add_epi16(a6, a7); __m256i b7 = _mm256_sub_epi16(a6, a7); a0 = _mm256_add_epi16(b0, b2); a1 = _mm256_add_epi16(b1, b3); a2 = _mm256_sub_epi16(b0, b2); a3 = _mm256_sub_epi16(b1, b3); a4 = _mm256_add_epi16(b4, b6); a5 = _mm256_add_epi16(b5, b7); a6 = _mm256_sub_epi16(b4, b6); a7 = _mm256_sub_epi16(b5, b7); if (iter == 0) { b0 = _mm256_add_epi16(a0, a4); b7 = _mm256_add_epi16(a1, a5); b3 = _mm256_add_epi16(a2, a6); b4 = _mm256_add_epi16(a3, a7); b2 = _mm256_sub_epi16(a0, a4); b6 = _mm256_sub_epi16(a1, a5); b1 = _mm256_sub_epi16(a2, a6); b5 = _mm256_sub_epi16(a3, a7); a0 = _mm256_unpacklo_epi16(b0, b1); a1 = _mm256_unpacklo_epi16(b2, b3); a2 = _mm256_unpackhi_epi16(b0, b1); a3 = _mm256_unpackhi_epi16(b2, b3); a4 = _mm256_unpacklo_epi16(b4, b5); a5 = _mm256_unpacklo_epi16(b6, b7); a6 = _mm256_unpackhi_epi16(b4, b5); a7 = _mm256_unpackhi_epi16(b6, b7); b0 = _mm256_unpacklo_epi32(a0, a1); b1 = _mm256_unpacklo_epi32(a4, a5); b2 = _mm256_unpackhi_epi32(a0, a1); b3 = _mm256_unpackhi_epi32(a4, a5); b4 = _mm256_unpacklo_epi32(a2, a3); b5 = _mm256_unpacklo_epi32(a6, a7); b6 = _mm256_unpackhi_epi32(a2, a3); b7 = _mm256_unpackhi_epi32(a6, a7); in[0] = _mm256_unpacklo_epi64(b0, b1); in[1] = _mm256_unpackhi_epi64(b0, b1); in[2] = _mm256_unpacklo_epi64(b2, b3); in[3] = _mm256_unpackhi_epi64(b2, b3); in[4] = _mm256_unpacklo_epi64(b4, b5); in[5] = _mm256_unpackhi_epi64(b4, b5); in[6] = _mm256_unpacklo_epi64(b6, b7); in[7] = _mm256_unpackhi_epi64(b6, b7); } else { in[0] = _mm256_add_epi16(a0, a4); in[7] = _mm256_add_epi16(a1, a5); in[3] = _mm256_add_epi16(a2, a6); in[4] = _mm256_add_epi16(a3, a7); in[2] = _mm256_sub_epi16(a0, a4); in[6] = _mm256_sub_epi16(a1, a5); in[1] = _mm256_sub_epi16(a2, a6); in[5] = _mm256_sub_epi16(a3, a7); } }
__m256i test_mm256_unpacklo_epi64(__m256i a, __m256i b) { // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6> return _mm256_unpacklo_epi64(a, b); }
/* For data organized into a row for each bit (8 * elem_size rows), transpose * the bytes. */ int64_t bshuf_trans_byte_bitrow_AVX(void* in, void* out, const size_t size, const size_t elem_size) { size_t hh, ii, jj, kk, mm; char* in_b = (char*) in; char* out_b = (char*) out; CHECK_MULT_EIGHT(size); size_t nrows = 8 * elem_size; size_t nbyte_row = size / 8; if (elem_size % 4) return bshuf_trans_byte_bitrow_SSE(in, out, size, elem_size); __m256i ymm_0[8]; __m256i ymm_1[8]; __m256i ymm_storeage[8][4]; for (jj = 0; jj + 31 < nbyte_row; jj += 32) { for (ii = 0; ii + 3 < elem_size; ii += 4) { for (hh = 0; hh < 4; hh ++) { for (kk = 0; kk < 8; kk ++){ ymm_0[kk] = _mm256_loadu_si256((__m256i *) &in_b[ (ii * 8 + hh * 8 + kk) * nbyte_row + jj]); } for (kk = 0; kk < 4; kk ++){ ymm_1[kk] = _mm256_unpacklo_epi8(ymm_0[kk * 2], ymm_0[kk * 2 + 1]); ymm_1[kk + 4] = _mm256_unpackhi_epi8(ymm_0[kk * 2], ymm_0[kk * 2 + 1]); } for (kk = 0; kk < 2; kk ++){ for (mm = 0; mm < 2; mm ++){ ymm_0[kk * 4 + mm] = _mm256_unpacklo_epi16( ymm_1[kk * 4 + mm * 2], ymm_1[kk * 4 + mm * 2 + 1]); ymm_0[kk * 4 + mm + 2] = _mm256_unpackhi_epi16( ymm_1[kk * 4 + mm * 2], ymm_1[kk * 4 + mm * 2 + 1]); } } for (kk = 0; kk < 4; kk ++){ ymm_1[kk * 2] = _mm256_unpacklo_epi32(ymm_0[kk * 2], ymm_0[kk * 2 + 1]); ymm_1[kk * 2 + 1] = _mm256_unpackhi_epi32(ymm_0[kk * 2], ymm_0[kk * 2 + 1]); } for (kk = 0; kk < 8; kk ++){ ymm_storeage[kk][hh] = ymm_1[kk]; } } for (mm = 0; mm < 8; mm ++) { for (kk = 0; kk < 4; kk ++){ ymm_0[kk] = ymm_storeage[mm][kk]; } ymm_1[0] = _mm256_unpacklo_epi64(ymm_0[0], ymm_0[1]); ymm_1[1] = _mm256_unpacklo_epi64(ymm_0[2], ymm_0[3]); ymm_1[2] = _mm256_unpackhi_epi64(ymm_0[0], ymm_0[1]); ymm_1[3] = _mm256_unpackhi_epi64(ymm_0[2], ymm_0[3]); ymm_0[0] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 32); ymm_0[1] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 32); ymm_0[2] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 49); ymm_0[3] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 49); _mm256_storeu_si256((__m256i *) &out_b[ (jj + mm * 2 + 0 * 16) * nrows + ii * 8], ymm_0[0]); _mm256_storeu_si256((__m256i *) &out_b[ (jj + mm * 2 + 0 * 16 + 1) * nrows + ii * 8], ymm_0[1]); _mm256_storeu_si256((__m256i *) &out_b[ (jj + mm * 2 + 1 * 16) * nrows + ii * 8], ymm_0[2]); _mm256_storeu_si256((__m256i *) &out_b[ (jj + mm * 2 + 1 * 16 + 1) * nrows + ii * 8], ymm_0[3]); } } } for (ii = 0; ii < nrows; ii ++ ) { for (jj = nbyte_row - nbyte_row % 32; jj < nbyte_row; jj ++) { out_b[jj * nrows + ii] = in_b[ii * nbyte_row + jj]; } } return size * elem_size; }