/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */ static void shuffle16_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 16; size_t j; int k, l; __m256i ymm0[16], ymm1[16]; /* Create the shuffle mask. NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from most to least significant (i.e., their order is reversed when compared to loading the mask from an array). */ const __m256i shmask = _mm256_set_epi8( 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00, 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00); for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { /* Fetch 32 elements (512 bytes) into 16 YMM registers. */ for (k = 0; k < 16; k++) { ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); } /* Transpose bytes */ for (k = 0, l = 0; k < 8; k++, l +=2) { ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]); ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]); } /* Transpose words */ for (k = 0, l = -2; k < 8; k++, l++) { if ((k%2) == 0) l += 2; ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]); ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]); } /* Transpose double words */ for (k = 0, l = -4; k < 8; k++, l++) { if ((k%4) == 0) l += 4; ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]); ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]); } /* Transpose quad words */ for (k = 0; k < 8; k++) { ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]); ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]); } for (k = 0; k < 16; k++) { ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8); ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask); } /* Store the result vectors */ uint8_t* const dest_for_jth_element = dest + j; for (k = 0; k < 16; k++) { _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]); } } }
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */ static void shuffle4_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 4; size_t i; int j; __m256i ymm0[4], ymm1[4]; /* Create the shuffle mask. NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from most to least significant (i.e., their order is reversed when compared to loading the mask from an array). */ const __m256i mask = _mm256_set_epi32( 0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { /* Fetch 32 elements (128 bytes) then transpose bytes and words. */ for (j = 0; j < 4; j++) { ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i)))); ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8); ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d); ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]); ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x04e); ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]); } /* Transpose double words */ for (j = 0; j < 2; j++) { ymm1[j*2] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]); ymm1[j*2+1] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]); } /* Transpose quad words */ for (j = 0; j < 2; j++) { ymm0[j*2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j+2]); ymm0[j*2+1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j+2]); } for (j = 0; j < 4; j++) { ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask); } /* Store the result vectors */ uint8_t* const dest_for_ith_element = dest + i; for (j = 0; j < 4; j++) { _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]); } } }
/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */ static void shuffle8_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 8; size_t j; int k, l; __m256i ymm0[8], ymm1[8]; for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { /* Fetch 32 elements (256 bytes) then transpose bytes. */ for (k = 0; k < 8; k++) { ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); ymm1[k] = _mm256_shuffle_epi32(ymm0[k], 0x4e); ymm1[k] = _mm256_unpacklo_epi8(ymm0[k], ymm1[k]); } /* Transpose words */ for (k = 0, l = 0; k < 4; k++, l +=2) { ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+1]); ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+1]); } /* Transpose double words */ for (k = 0, l = 0; k < 4; k++, l++) { if (k == 2) l += 2; ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+2]); ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+2]); } /* Transpose quad words */ for (k = 0; k < 4; k++) { ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+4]); ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+4]); } for(k = 0; k < 8; k++) { ymm1[k] = _mm256_permute4x64_epi64(ymm0[k], 0x72); ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xD8); ymm0[k] = _mm256_unpacklo_epi16(ymm0[k], ymm1[k]); } /* Store the result vectors */ uint8_t* const dest_for_jth_element = dest + j; for (k = 0; k < 8; k++) { _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]); } } }
__m256i test_mm256_unpackhi_epi64(__m256i a, __m256i b) { // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 3, i32 7> return _mm256_unpackhi_epi64(a, b); }
/* For data organized into a row for each bit (8 * elem_size rows), transpose * the bytes. */ int64_t bshuf_trans_byte_bitrow_AVX(void* in, void* out, const size_t size, const size_t elem_size) { size_t hh, ii, jj, kk, mm; char* in_b = (char*) in; char* out_b = (char*) out; CHECK_MULT_EIGHT(size); size_t nrows = 8 * elem_size; size_t nbyte_row = size / 8; if (elem_size % 4) return bshuf_trans_byte_bitrow_SSE(in, out, size, elem_size); __m256i ymm_0[8]; __m256i ymm_1[8]; __m256i ymm_storeage[8][4]; for (jj = 0; jj + 31 < nbyte_row; jj += 32) { for (ii = 0; ii + 3 < elem_size; ii += 4) { for (hh = 0; hh < 4; hh ++) { for (kk = 0; kk < 8; kk ++){ ymm_0[kk] = _mm256_loadu_si256((__m256i *) &in_b[ (ii * 8 + hh * 8 + kk) * nbyte_row + jj]); } for (kk = 0; kk < 4; kk ++){ ymm_1[kk] = _mm256_unpacklo_epi8(ymm_0[kk * 2], ymm_0[kk * 2 + 1]); ymm_1[kk + 4] = _mm256_unpackhi_epi8(ymm_0[kk * 2], ymm_0[kk * 2 + 1]); } for (kk = 0; kk < 2; kk ++){ for (mm = 0; mm < 2; mm ++){ ymm_0[kk * 4 + mm] = _mm256_unpacklo_epi16( ymm_1[kk * 4 + mm * 2], ymm_1[kk * 4 + mm * 2 + 1]); ymm_0[kk * 4 + mm + 2] = _mm256_unpackhi_epi16( ymm_1[kk * 4 + mm * 2], ymm_1[kk * 4 + mm * 2 + 1]); } } for (kk = 0; kk < 4; kk ++){ ymm_1[kk * 2] = _mm256_unpacklo_epi32(ymm_0[kk * 2], ymm_0[kk * 2 + 1]); ymm_1[kk * 2 + 1] = _mm256_unpackhi_epi32(ymm_0[kk * 2], ymm_0[kk * 2 + 1]); } for (kk = 0; kk < 8; kk ++){ ymm_storeage[kk][hh] = ymm_1[kk]; } } for (mm = 0; mm < 8; mm ++) { for (kk = 0; kk < 4; kk ++){ ymm_0[kk] = ymm_storeage[mm][kk]; } ymm_1[0] = _mm256_unpacklo_epi64(ymm_0[0], ymm_0[1]); ymm_1[1] = _mm256_unpacklo_epi64(ymm_0[2], ymm_0[3]); ymm_1[2] = _mm256_unpackhi_epi64(ymm_0[0], ymm_0[1]); ymm_1[3] = _mm256_unpackhi_epi64(ymm_0[2], ymm_0[3]); ymm_0[0] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 32); ymm_0[1] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 32); ymm_0[2] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 49); ymm_0[3] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 49); _mm256_storeu_si256((__m256i *) &out_b[ (jj + mm * 2 + 0 * 16) * nrows + ii * 8], ymm_0[0]); _mm256_storeu_si256((__m256i *) &out_b[ (jj + mm * 2 + 0 * 16 + 1) * nrows + ii * 8], ymm_0[1]); _mm256_storeu_si256((__m256i *) &out_b[ (jj + mm * 2 + 1 * 16) * nrows + ii * 8], ymm_0[2]); _mm256_storeu_si256((__m256i *) &out_b[ (jj + mm * 2 + 1 * 16 + 1) * nrows + ii * 8], ymm_0[3]); } } } for (ii = 0; ii < nrows; ii ++ ) { for (jj = nbyte_row - nbyte_row % 32; jj < nbyte_row; jj ++) { out_b[jj * nrows + ii] = in_b[ii * nbyte_row + jj]; } } return size * elem_size; }
/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */ static void unshuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype) { size_t i; int j; __m256i ymm0[16], ymm1[16]; const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i)); /* The unshuffle loops are inverted (compared to shuffle_tiled16_avx2) to optimize cache utilization. */ size_t offset_into_type; for (offset_into_type = 0; offset_into_type < bytesoftype; offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0 ? vecs_per_el.rem : sizeof(__m128i))) { for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */ const uint8_t* const src_for_ith_element = src + i; for (j = 0; j < 16; j++) { ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (total_elements * (offset_into_type + j)))); } /* Shuffle bytes */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]); /* Compute the hi 32 bytes */ ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]); } /* Shuffle 2-byte words */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]); /* Compute the hi 32 bytes */ ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]); } /* Shuffle 4-byte dwords */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]); /* Compute the hi 32 bytes */ ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]); } /* Shuffle 8-byte qwords */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]); /* Compute the hi 32 bytes */ ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]); } for (j = 0; j < 8; j++) { ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20); ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31); } /* Store the result vectors in proper order */ const uint8_t* const dest_with_offset = dest + offset_into_type; _mm256_storeu2_m128i( (__m128i*)(dest_with_offset + (i + 0x01) * bytesoftype), (__m128i*)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]); _mm256_storeu2_m128i( (__m128i*)(dest_with_offset + (i + 0x03) * bytesoftype), (__m128i*)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]); _mm256_storeu2_m128i( (__m128i*)(dest_with_offset + (i + 0x05) * bytesoftype), (__m128i*)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]); _mm256_storeu2_m128i( (__m128i*)(dest_with_offset + (i + 0x07) * bytesoftype), (__m128i*)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]); _mm256_storeu2_m128i( (__m128i*)(dest_with_offset + (i + 0x09) * bytesoftype), (__m128i*)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]); _mm256_storeu2_m128i( (__m128i*)(dest_with_offset + (i + 0x0b) * bytesoftype), (__m128i*)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]); _mm256_storeu2_m128i( (__m128i*)(dest_with_offset + (i + 0x0d) * bytesoftype), (__m128i*)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]); _mm256_storeu2_m128i( (__m128i*)(dest_with_offset + (i + 0x0f) * bytesoftype), (__m128i*)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]); _mm256_storeu2_m128i( (__m128i*)(dest_with_offset + (i + 0x11) * bytesoftype), (__m128i*)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]); _mm256_storeu2_m128i( (__m128i*)(dest_with_offset + (i + 0x13) * bytesoftype), (__m128i*)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]); _mm256_storeu2_m128i( (__m128i*)(dest_with_offset + (i + 0x15) * bytesoftype), (__m128i*)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]); _mm256_storeu2_m128i( (__m128i*)(dest_with_offset + (i + 0x17) * bytesoftype), (__m128i*)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]); _mm256_storeu2_m128i( (__m128i*)(dest_with_offset + (i + 0x19) * bytesoftype), (__m128i*)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]); _mm256_storeu2_m128i( (__m128i*)(dest_with_offset + (i + 0x1b) * bytesoftype), (__m128i*)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]); _mm256_storeu2_m128i( (__m128i*)(dest_with_offset + (i + 0x1d) * bytesoftype), (__m128i*)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]); _mm256_storeu2_m128i( (__m128i*)(dest_with_offset + (i + 0x1f) * bytesoftype), (__m128i*)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]); } } }
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */ static void unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 16; size_t i; int j; __m256i ymm0[16], ymm1[16]; for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { /* Fetch 32 elements (512 bytes) into 16 YMM registers. */ const uint8_t* const src_for_ith_element = src + i; for (j = 0; j < 16; j++) { ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); } /* Shuffle bytes */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]); /* Compute the hi 32 bytes */ ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]); } /* Shuffle 2-byte words */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]); /* Compute the hi 32 bytes */ ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]); } /* Shuffle 4-byte dwords */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]); /* Compute the hi 32 bytes */ ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]); } /* Shuffle 8-byte qwords */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]); /* Compute the hi 32 bytes */ ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]); } for (j = 0; j < 8; j++) { ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20); ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31); } /* Store the result vectors in proper order */ _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[2]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]); } }
/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */ static void shuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype) { size_t j; int k, l; __m256i ymm0[16], ymm1[16]; const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i)); /* Create the shuffle mask. NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from most to least significant (i.e., their order is reversed when compared to loading the mask from an array). */ const __m256i shmask = _mm256_set_epi8( 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00, 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00); for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { /* Advance the offset into the type by the vector size (in bytes), unless this is the initial iteration and the type size is not a multiple of the vector size. In that case, only advance by the number of bytes necessary so that the number of remaining bytes in the type will be a multiple of the vector size. */ size_t offset_into_type; for (offset_into_type = 0; offset_into_type < bytesoftype; offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0 ? vecs_per_el.rem : sizeof(__m128i))) { /* Fetch elements in groups of 512 bytes */ const uint8_t* const src_with_offset = src + offset_into_type; for (k = 0; k < 16; k++) { ymm0[k] = _mm256_loadu2_m128i( (__m128i*)(src_with_offset + (j + (2 * k) + 1) * bytesoftype), (__m128i*)(src_with_offset + (j + (2 * k)) * bytesoftype)); } /* Transpose bytes */ for (k = 0, l = 0; k < 8; k++, l +=2) { ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]); ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]); } /* Transpose words */ for (k = 0, l = -2; k < 8; k++, l++) { if ((k%2) == 0) l += 2; ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]); ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]); } /* Transpose double words */ for (k = 0, l = -4; k < 8; k++, l++) { if ((k%4) == 0) l += 4; ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]); ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]); } /* Transpose quad words */ for (k = 0; k < 8; k++) { ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]); ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]); } for (k = 0; k < 16; k++) { ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8); ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask); } /* Store the result vectors */ uint8_t* const dest_for_jth_element = dest + j; for (k = 0; k < 16; k++) { _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), ymm0[k]); } } } }
static void sfid_render_cache_rt_write_simd8_rgba_uint32_linear(struct thread *t, const struct sfid_render_cache_args *args) { const int slice_y = args->rt.minimum_array_element * args->rt.qpitch; const int x = t->grf[1].uw[4]; const int y = t->grf[1].uw[5] + slice_y; const struct reg *src = &t->grf[args->src]; __m128i *base0 = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride; __m128i *base1 = (void *) base0 + args->rt.stride; __m256i rg0145 = _mm256_unpacklo_epi32(src[0].ireg, src[1].ireg); __m256i rg2367 = _mm256_unpackhi_epi32(src[0].ireg, src[1].ireg); __m256i ba0145 = _mm256_unpacklo_epi32(src[2].ireg, src[3].ireg); __m256i ba2367 = _mm256_unpackhi_epi32(src[2].ireg, src[3].ireg); __m256i rgba04 = _mm256_unpacklo_epi64(rg0145, ba0145); __m256i rgba15 = _mm256_unpackhi_epi64(rg0145, ba0145); __m256i rgba26 = _mm256_unpacklo_epi64(rg2367, ba2367); __m256i rgba37 = _mm256_unpackhi_epi64(rg2367, ba2367); struct reg mask = { .ireg = t->mask_q1 }; if (mask.d[0] < 0) base0[0] = _mm256_extractf128_si256(rgba04, 0); if (mask.d[1] < 0) base0[1] = _mm256_extractf128_si256(rgba15, 0); if (mask.d[2] < 0) base1[0] = _mm256_extractf128_si256(rgba26, 0); if (mask.d[3] < 0) base1[1] = _mm256_extractf128_si256(rgba37, 0); if (mask.d[4] < 0) base0[2] = _mm256_extractf128_si256(rgba04, 1); if (mask.d[5] < 0) base0[3] = _mm256_extractf128_si256(rgba15, 1); if (mask.d[6] < 0) base1[2] = _mm256_extractf128_si256(rgba26, 1); if (mask.d[7] < 0) base1[3] = _mm256_extractf128_si256(rgba37, 1); } static void write_uint16_linear(struct thread *t, const struct sfid_render_cache_args *args, __m256i r, __m256i g, __m256i b, __m256i a) { const int slice_y = args->rt.minimum_array_element * args->rt.qpitch; const int x = t->grf[1].uw[4]; const int y = t->grf[1].uw[5] + slice_y; __m256i rg, ba; rg = _mm256_slli_epi32(g, 16); rg = _mm256_or_si256(rg, r); ba = _mm256_slli_epi32(a, 16); ba = _mm256_or_si256(ba, b); __m256i p0 = _mm256_unpacklo_epi32(rg, ba); __m256i m0 = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 0)); __m256i p1 = _mm256_unpackhi_epi32(rg, ba); __m256i m1 = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 1)); void *base = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride; _mm_maskstore_epi64(base, _mm256_extractf128_si256(m0, 0), _mm256_extractf128_si256(p0, 0)); _mm_maskstore_epi64((base + 16), _mm256_extractf128_si256(m1, 0), _mm256_extractf128_si256(p0, 1)); _mm_maskstore_epi64((base + args->rt.stride), _mm256_extractf128_si256(m0, 1), _mm256_extractf128_si256(p1, 0)); _mm_maskstore_epi64((base + args->rt.stride + 16), _mm256_extractf128_si256(m1, 1), _mm256_extractf128_si256(p1, 1)); } static void sfid_render_cache_rt_write_simd8_rgba_unorm16_linear(struct thread *t, const struct sfid_render_cache_args *args) { __m256i r, g, b, a; const __m256 scale = _mm256_set1_ps(65535.0f); const __m256 half = _mm256_set1_ps(0.5f); struct reg *src = &t->grf[args->src]; r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half)); g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half)); b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half)); a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half)); write_uint16_linear(t, args, r, g, b, a); }
void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) { __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; __m256i ref3_reg, ref3next_reg; __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; __m256i sum_mlow, sum_mhigh; int i; const uint8_t *ref0, *ref1, *ref2, *ref3; ref0 = ref[0]; ref1 = ref[1]; ref2 = ref[2]; ref3 = ref[3]; sum_ref0 = _mm256_set1_epi16(0); sum_ref1 = _mm256_set1_epi16(0); sum_ref2 = _mm256_set1_epi16(0); sum_ref3 = _mm256_set1_epi16(0); for (i = 0; i < 64; i++) { // load 64 bytes from src and all refs src_reg = _mm256_loadu_si256((const __m256i *)src); srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32)); ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32)); ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32)); ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32)); ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32)); // sum of the absolute differences between every ref-i to src ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg); ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg); ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg); ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg); // sum every ref-i sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg); sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg); src += src_stride; ref0 += ref_stride; ref1 += ref_stride; ref2 += ref_stride; ref3 += ref_stride; } { __m128i sum; // in sum_ref-i the result is saved in the first 4 bytes // the other 4 bytes are zeroed. // sum_ref1 and sum_ref3 are shifted left by 4 bytes sum_ref1 = _mm256_slli_si256(sum_ref1, 4); sum_ref3 = _mm256_slli_si256(sum_ref3, 4); // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); // merge every 64 bit from each sum_ref-i sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); // add the low 64 bit to the high 64 bit sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); // add the low 128 bit to the high 128 bit sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), _mm256_extractf128_si256(sum_mlow, 1)); _mm_storeu_si128((__m128i *)(res), sum); } _mm256_zeroupper(); }
static void hadamard_col8x2_avx2(__m256i *in, int iter) { __m256i a0 = in[0]; __m256i a1 = in[1]; __m256i a2 = in[2]; __m256i a3 = in[3]; __m256i a4 = in[4]; __m256i a5 = in[5]; __m256i a6 = in[6]; __m256i a7 = in[7]; __m256i b0 = _mm256_add_epi16(a0, a1); __m256i b1 = _mm256_sub_epi16(a0, a1); __m256i b2 = _mm256_add_epi16(a2, a3); __m256i b3 = _mm256_sub_epi16(a2, a3); __m256i b4 = _mm256_add_epi16(a4, a5); __m256i b5 = _mm256_sub_epi16(a4, a5); __m256i b6 = _mm256_add_epi16(a6, a7); __m256i b7 = _mm256_sub_epi16(a6, a7); a0 = _mm256_add_epi16(b0, b2); a1 = _mm256_add_epi16(b1, b3); a2 = _mm256_sub_epi16(b0, b2); a3 = _mm256_sub_epi16(b1, b3); a4 = _mm256_add_epi16(b4, b6); a5 = _mm256_add_epi16(b5, b7); a6 = _mm256_sub_epi16(b4, b6); a7 = _mm256_sub_epi16(b5, b7); if (iter == 0) { b0 = _mm256_add_epi16(a0, a4); b7 = _mm256_add_epi16(a1, a5); b3 = _mm256_add_epi16(a2, a6); b4 = _mm256_add_epi16(a3, a7); b2 = _mm256_sub_epi16(a0, a4); b6 = _mm256_sub_epi16(a1, a5); b1 = _mm256_sub_epi16(a2, a6); b5 = _mm256_sub_epi16(a3, a7); a0 = _mm256_unpacklo_epi16(b0, b1); a1 = _mm256_unpacklo_epi16(b2, b3); a2 = _mm256_unpackhi_epi16(b0, b1); a3 = _mm256_unpackhi_epi16(b2, b3); a4 = _mm256_unpacklo_epi16(b4, b5); a5 = _mm256_unpacklo_epi16(b6, b7); a6 = _mm256_unpackhi_epi16(b4, b5); a7 = _mm256_unpackhi_epi16(b6, b7); b0 = _mm256_unpacklo_epi32(a0, a1); b1 = _mm256_unpacklo_epi32(a4, a5); b2 = _mm256_unpackhi_epi32(a0, a1); b3 = _mm256_unpackhi_epi32(a4, a5); b4 = _mm256_unpacklo_epi32(a2, a3); b5 = _mm256_unpacklo_epi32(a6, a7); b6 = _mm256_unpackhi_epi32(a2, a3); b7 = _mm256_unpackhi_epi32(a6, a7); in[0] = _mm256_unpacklo_epi64(b0, b1); in[1] = _mm256_unpackhi_epi64(b0, b1); in[2] = _mm256_unpacklo_epi64(b2, b3); in[3] = _mm256_unpackhi_epi64(b2, b3); in[4] = _mm256_unpacklo_epi64(b4, b5); in[5] = _mm256_unpackhi_epi64(b4, b5); in[6] = _mm256_unpacklo_epi64(b6, b7); in[7] = _mm256_unpackhi_epi64(b6, b7); } else { in[0] = _mm256_add_epi16(a0, a4); in[7] = _mm256_add_epi16(a1, a5); in[3] = _mm256_add_epi16(a2, a6); in[4] = _mm256_add_epi16(a3, a7); in[2] = _mm256_sub_epi16(a0, a4); in[6] = _mm256_sub_epi16(a1, a5); in[1] = _mm256_sub_epi16(a2, a6); in[5] = _mm256_sub_epi16(a3, a7); } }