int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,
                             intptr_t block_size, int64_t *ssz) {
  __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
  __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
  __m256i sse_reg_64hi, ssz_reg_64hi;
  __m128i sse_reg128, ssz_reg128;
  int64_t sse;
  int i;
  const __m256i zero_reg = _mm256_set1_epi16(0);

  // init sse and ssz registers to zero
  sse_reg = _mm256_set1_epi16(0);
  ssz_reg = _mm256_set1_epi16(0);

  for (i = 0; i < block_size; i += 16) {
    // load 32 bytes from coeff and dqcoeff
    coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
    dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
    // dqcoeff - coeff
    dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
    // madd (dqcoeff - coeff)
    dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
    // madd coeff
    coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
    // expand each double word of madd (dqcoeff - coeff) to quad word
    exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
    exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
    // expand each double word of madd (coeff) to quad word
    exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
    exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
    // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
  }

  // save the higher 64 bit of each 128 bit lane
  sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
  ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
  // add the higher 64 bit to the low 64 bit
  sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
  ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);

  // add each 64 bit from each of the 128 bit lane of the 256 bit
  sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
                             _mm256_extractf128_si256(sse_reg, 1));
  ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
                             _mm256_extractf128_si256(ssz_reg, 1));

  // store the results
  _mm_storel_epi64((__m128i *)(&sse), sse_reg128);
  _mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
  return sse;
}
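/* The driver below is not part of libvpx; it is a minimal, hypothetical sketch
   (assumes <immintrin.h>, compilation with -mavx2, and a block_size that is a
   multiple of 16) showing how vp9_block_error_avx2 can be cross-checked against
   a plain scalar accumulation of the same sums. */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  int16_t coeff[256], dqcoeff[256];
  int64_t ssz_simd, sse_scalar = 0, ssz_scalar = 0;
  for (int i = 0; i < 256; i++) {
    coeff[i] = (int16_t)(i - 128);
    dqcoeff[i] = (int16_t)(i - 130);              /* small quantization error */
    int64_t d = (int64_t)dqcoeff[i] - coeff[i];
    sse_scalar += d * d;                          /* sum of squared errors */
    ssz_scalar += (int64_t)coeff[i] * coeff[i];   /* sum of squared coefficients */
  }
  int64_t sse_simd = vp9_block_error_avx2(coeff, dqcoeff, 256, &ssz_simd);
  printf("sse %lld/%lld  ssz %lld/%lld\n",
         (long long)sse_simd, (long long)sse_scalar,
         (long long)ssz_simd, (long long)ssz_scalar);
  return 0;
}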
/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
static void shuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
                           const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 16;
  size_t j;
  int k, l;
  __m256i ymm0[16], ymm1[16];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
    for (k = 0; k < 16; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
    }
    /* Transpose bytes */
    for (k = 0, l = 0; k < 8; k++, l += 2) {
      ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]);
      ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]);
    }
    /* Transpose words */
    for (k = 0, l = -2; k < 8; k++, l++) {
      if ((k % 2) == 0) l += 2;
      ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]);
      ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]);
    }
    /* Transpose double words */
    for (k = 0, l = -4; k < 8; k++, l++) {
      if ((k % 4) == 0) l += 4;
      ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]);
      ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]);
    }
    /* Transpose quad words */
    for (k = 0; k < 8; k++) {
      ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]);
      ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]);
    }
    for (k = 0; k < 16; k++) {
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
      ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 16; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
    }
  }
}
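/* Hypothetical scalar reference (not part of the library) for the byte transposition
   that the AVX2 shuffle routines implement: byte j of element i is written to stream j,
   i.e. dest[j * total_elements + i] = src[i * bytesoftype + j]. A sketch like this is
   handy for validating the vectorized paths on small inputs. */
#include <stddef.h>
#include <stdint.h>

static void shuffle_generic_scalar(uint8_t* const dest, const uint8_t* const src,
                                   const size_t total_elements, const size_t bytesoftype) {
  for (size_t i = 0; i < total_elements; i++) {
    for (size_t j = 0; j < bytesoftype; j++) {
      dest[j * total_elements + i] = src[i * bytesoftype + j];
    }
  }
}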
/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
static void unshuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
                            const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 8;
  size_t i;
  int j;
  __m256i ymm0[8], ymm1[8];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (256 bytes) into 8 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 8; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[4+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle words */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[4+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
    }
    for (j = 0; j < 8; j++) {
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[4+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
  }
}
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void shuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
                          const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 4;
  size_t i;
  int j;
  __m256i ymm0[4], ymm1[4];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i mask = _mm256_set_epi32(
      0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (128 bytes) then transpose bytes and words. */
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i))));
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8);
      ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d);
      ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]);
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x04e);
      ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]);
    }
    /* Transpose double words */
    for (j = 0; j < 2; j++) {
      ymm1[j*2] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
      ymm1[j*2+1] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Transpose quad words */
    for (j = 0; j < 2; j++) {
      ymm0[j*2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j+2]);
      ymm0[j*2+1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j+2]);
    }
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_ith_element = dest + i;
    for (j = 0; j < 4; j++) {
      _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]);
    }
  }
}
/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
static void shuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
                          const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 8;
  size_t j;
  int k, l;
  __m256i ymm0[8], ymm1[8];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (256 bytes) then transpose bytes. */
    for (k = 0; k < 8; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
      ymm1[k] = _mm256_shuffle_epi32(ymm0[k], 0x4e);
      ymm1[k] = _mm256_unpacklo_epi8(ymm0[k], ymm1[k]);
    }
    /* Transpose words */
    for (k = 0, l = 0; k < 4; k++, l += 2) {
      ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+1]);
      ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+1]);
    }
    /* Transpose double words */
    for (k = 0, l = 0; k < 4; k++, l++) {
      if (k == 2) l += 2;
      ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+2]);
      ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+2]);
    }
    /* Transpose quad words */
    for (k = 0; k < 4; k++) {
      ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+4]);
      ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+4]);
    }
    for (k = 0; k < 8; k++) {
      ymm1[k] = _mm256_permute4x64_epi64(ymm0[k], 0x72);
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xD8);
      ymm0[k] = _mm256_unpacklo_epi16(ymm0[k], ymm1[k]);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 8; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
    }
  }
}
__m256i test_mm256_unpackhi_epi32(__m256i a, __m256i b) {
  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  return _mm256_unpackhi_epi32(a, b);
}
/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes. */
int64_t bshuf_trans_byte_bitrow_AVX(void* in, void* out, const size_t size,
                                    const size_t elem_size) {
  size_t hh, ii, jj, kk, mm;
  char* in_b = (char*) in;
  char* out_b = (char*) out;

  CHECK_MULT_EIGHT(size);

  size_t nrows = 8 * elem_size;
  size_t nbyte_row = size / 8;

  if (elem_size % 4)
    return bshuf_trans_byte_bitrow_SSE(in, out, size, elem_size);

  __m256i ymm_0[8];
  __m256i ymm_1[8];
  __m256i ymm_storeage[8][4];

  for (jj = 0; jj + 31 < nbyte_row; jj += 32) {
    for (ii = 0; ii + 3 < elem_size; ii += 4) {
      for (hh = 0; hh < 4; hh++) {
        for (kk = 0; kk < 8; kk++) {
          ymm_0[kk] = _mm256_loadu_si256((__m256i *) &in_b[
              (ii * 8 + hh * 8 + kk) * nbyte_row + jj]);
        }
        for (kk = 0; kk < 4; kk++) {
          ymm_1[kk] = _mm256_unpacklo_epi8(ymm_0[kk * 2], ymm_0[kk * 2 + 1]);
          ymm_1[kk + 4] = _mm256_unpackhi_epi8(ymm_0[kk * 2], ymm_0[kk * 2 + 1]);
        }
        for (kk = 0; kk < 2; kk++) {
          for (mm = 0; mm < 2; mm++) {
            ymm_0[kk * 4 + mm] = _mm256_unpacklo_epi16(
                ymm_1[kk * 4 + mm * 2], ymm_1[kk * 4 + mm * 2 + 1]);
            ymm_0[kk * 4 + mm + 2] = _mm256_unpackhi_epi16(
                ymm_1[kk * 4 + mm * 2], ymm_1[kk * 4 + mm * 2 + 1]);
          }
        }
        for (kk = 0; kk < 4; kk++) {
          ymm_1[kk * 2] = _mm256_unpacklo_epi32(ymm_0[kk * 2], ymm_0[kk * 2 + 1]);
          ymm_1[kk * 2 + 1] = _mm256_unpackhi_epi32(ymm_0[kk * 2], ymm_0[kk * 2 + 1]);
        }
        for (kk = 0; kk < 8; kk++) {
          ymm_storeage[kk][hh] = ymm_1[kk];
        }
      }
      for (mm = 0; mm < 8; mm++) {
        for (kk = 0; kk < 4; kk++) {
          ymm_0[kk] = ymm_storeage[mm][kk];
        }
        ymm_1[0] = _mm256_unpacklo_epi64(ymm_0[0], ymm_0[1]);
        ymm_1[1] = _mm256_unpacklo_epi64(ymm_0[2], ymm_0[3]);
        ymm_1[2] = _mm256_unpackhi_epi64(ymm_0[0], ymm_0[1]);
        ymm_1[3] = _mm256_unpackhi_epi64(ymm_0[2], ymm_0[3]);

        ymm_0[0] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 32);
        ymm_0[1] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 32);
        ymm_0[2] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 49);
        ymm_0[3] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 49);

        _mm256_storeu_si256((__m256i *) &out_b[
            (jj + mm * 2 + 0 * 16) * nrows + ii * 8], ymm_0[0]);
        _mm256_storeu_si256((__m256i *) &out_b[
            (jj + mm * 2 + 0 * 16 + 1) * nrows + ii * 8], ymm_0[1]);
        _mm256_storeu_si256((__m256i *) &out_b[
            (jj + mm * 2 + 1 * 16) * nrows + ii * 8], ymm_0[2]);
        _mm256_storeu_si256((__m256i *) &out_b[
            (jj + mm * 2 + 1 * 16 + 1) * nrows + ii * 8], ymm_0[3]);
      }
    }
  }
  for (ii = 0; ii < nrows; ii++) {
    for (jj = nbyte_row - nbyte_row % 32; jj < nbyte_row; jj++) {
      out_b[jj * nrows + ii] = in_b[ii * nbyte_row + jj];
    }
  }
  return size * elem_size;
}
/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */
static void unshuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
                                   const size_t vectorizable_elements, const size_t total_elements,
                                   const size_t bytesoftype) {
  size_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));

  /* The unshuffle loops are inverted (compared to shuffle_tiled16_avx2) to
     optimize cache utilization. */
  size_t offset_into_type;
  for (offset_into_type = 0; offset_into_type < bytesoftype;
       offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0
                            ? vecs_per_el.rem : sizeof(__m128i))) {
    for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
      /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */
      const uint8_t* const src_for_ith_element = src + i;
      for (j = 0; j < 16; j++) {
        ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (total_elements * (offset_into_type + j))));
      }
      /* Shuffle bytes */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
      }
      /* Shuffle 2-byte words */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
      }
      /* Shuffle 4-byte dwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
      }
      /* Shuffle 8-byte qwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]);
        /* Compute the hi 32 bytes */
        ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]);
      }
      for (j = 0; j < 8; j++) {
        ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20);
        ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31);
      }
      /* Store the result vectors in proper order */
      const uint8_t* const dest_with_offset = dest + offset_into_type;
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x01) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x03) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x05) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x07) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x09) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x0b) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x0d) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x0f) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x11) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x13) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x15) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x17) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x19) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x1b) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x1d) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x1f) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]);
    }
  }
}
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
static void unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
                             const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 16;
  size_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 16; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle 8-byte qwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]);
    }
    for (j = 0; j < 8; j++) {
      ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20);
      ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31);
    }
    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]);
  }
}
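/* Hypothetical scalar reference (not part of the library) for the inverse operation
   performed by the unshuffle routines: byte j of element i is read back from stream j,
   i.e. dest[i * bytesoftype + j] = src[j * total_elements + i]. */
#include <stddef.h>
#include <stdint.h>

static void unshuffle_generic_scalar(uint8_t* const dest, const uint8_t* const src,
                                     const size_t total_elements, const size_t bytesoftype) {
  for (size_t i = 0; i < total_elements; i++) {
    for (size_t j = 0; j < bytesoftype; j++) {
      dest[i * bytesoftype + j] = src[j * total_elements + i];
    }
  }
}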
/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */
static void shuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
                                 const size_t vectorizable_elements, const size_t total_elements,
                                 const size_t bytesoftype) {
  size_t j;
  int k, l;
  __m256i ymm0[16], ymm1[16];

  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Advance the offset into the type by the vector size (in bytes), unless this is
       the initial iteration and the type size is not a multiple of the vector size.
       In that case, only advance by the number of bytes necessary so that the number
       of remaining bytes in the type will be a multiple of the vector size. */
    size_t offset_into_type;
    for (offset_into_type = 0; offset_into_type < bytesoftype;
         offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0
                              ? vecs_per_el.rem : sizeof(__m128i))) {
      /* Fetch elements in groups of 512 bytes */
      const uint8_t* const src_with_offset = src + offset_into_type;
      for (k = 0; k < 16; k++) {
        ymm0[k] = _mm256_loadu2_m128i(
            (__m128i*)(src_with_offset + (j + (2 * k) + 1) * bytesoftype),
            (__m128i*)(src_with_offset + (j + (2 * k)) * bytesoftype));
      }
      /* Transpose bytes */
      for (k = 0, l = 0; k < 8; k++, l += 2) {
        ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]);
        ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]);
      }
      /* Transpose words */
      for (k = 0, l = -2; k < 8; k++, l++) {
        if ((k % 2) == 0) l += 2;
        ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]);
        ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]);
      }
      /* Transpose double words */
      for (k = 0, l = -4; k < 8; k++, l++) {
        if ((k % 4) == 0) l += 4;
        ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]);
        ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]);
      }
      /* Transpose quad words */
      for (k = 0; k < 8; k++) {
        ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]);
        ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]);
      }
      for (k = 0; k < 16; k++) {
        ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
        ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
      }
      /* Store the result vectors */
      uint8_t* const dest_for_jth_element = dest + j;
      for (k = 0; k < 16; k++) {
        _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), ymm0[k]);
      }
    }
  }
}
void extern avx2_test(void) {
  x = _mm256_unpackhi_epi32(x, x);
}
static void sfid_render_cache_rt_write_simd8_rgba_uint32_linear(struct thread *t,
                                                                const struct sfid_render_cache_args *args) {
  const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
  const int x = t->grf[1].uw[4];
  const int y = t->grf[1].uw[5] + slice_y;
  const struct reg *src = &t->grf[args->src];

  __m128i *base0 = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride;
  __m128i *base1 = (void *) base0 + args->rt.stride;

  __m256i rg0145 = _mm256_unpacklo_epi32(src[0].ireg, src[1].ireg);
  __m256i rg2367 = _mm256_unpackhi_epi32(src[0].ireg, src[1].ireg);
  __m256i ba0145 = _mm256_unpacklo_epi32(src[2].ireg, src[3].ireg);
  __m256i ba2367 = _mm256_unpackhi_epi32(src[2].ireg, src[3].ireg);

  __m256i rgba04 = _mm256_unpacklo_epi64(rg0145, ba0145);
  __m256i rgba15 = _mm256_unpackhi_epi64(rg0145, ba0145);
  __m256i rgba26 = _mm256_unpacklo_epi64(rg2367, ba2367);
  __m256i rgba37 = _mm256_unpackhi_epi64(rg2367, ba2367);

  struct reg mask = { .ireg = t->mask_q1 };

  if (mask.d[0] < 0) base0[0] = _mm256_extractf128_si256(rgba04, 0);
  if (mask.d[1] < 0) base0[1] = _mm256_extractf128_si256(rgba15, 0);
  if (mask.d[2] < 0) base1[0] = _mm256_extractf128_si256(rgba26, 0);
  if (mask.d[3] < 0) base1[1] = _mm256_extractf128_si256(rgba37, 0);
  if (mask.d[4] < 0) base0[2] = _mm256_extractf128_si256(rgba04, 1);
  if (mask.d[5] < 0) base0[3] = _mm256_extractf128_si256(rgba15, 1);
  if (mask.d[6] < 0) base1[2] = _mm256_extractf128_si256(rgba26, 1);
  if (mask.d[7] < 0) base1[3] = _mm256_extractf128_si256(rgba37, 1);
}

static void write_uint16_linear(struct thread *t,
                                const struct sfid_render_cache_args *args,
                                __m256i r, __m256i g, __m256i b, __m256i a) {
  const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
  const int x = t->grf[1].uw[4];
  const int y = t->grf[1].uw[5] + slice_y;
  __m256i rg, ba;

  rg = _mm256_slli_epi32(g, 16);
  rg = _mm256_or_si256(rg, r);
  ba = _mm256_slli_epi32(a, 16);
  ba = _mm256_or_si256(ba, b);

  __m256i p0 = _mm256_unpacklo_epi32(rg, ba);
  __m256i m0 = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 0));

  __m256i p1 = _mm256_unpackhi_epi32(rg, ba);
  __m256i m1 = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 1));

  void *base = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride;

  _mm_maskstore_epi64(base,
                      _mm256_extractf128_si256(m0, 0),
                      _mm256_extractf128_si256(p0, 0));
  _mm_maskstore_epi64((base + 16),
                      _mm256_extractf128_si256(m1, 0),
                      _mm256_extractf128_si256(p0, 1));
  _mm_maskstore_epi64((base + args->rt.stride),
                      _mm256_extractf128_si256(m0, 1),
                      _mm256_extractf128_si256(p1, 0));
  _mm_maskstore_epi64((base + args->rt.stride + 16),
                      _mm256_extractf128_si256(m1, 1),
                      _mm256_extractf128_si256(p1, 1));
}

static void sfid_render_cache_rt_write_simd8_rgba_unorm16_linear(struct thread *t,
                                                                 const struct sfid_render_cache_args *args) {
  __m256i r, g, b, a;
  const __m256 scale = _mm256_set1_ps(65535.0f);
  const __m256 half = _mm256_set1_ps(0.5f);
  struct reg *src = &t->grf[args->src];

  r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half));
  g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half));
  b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half));
  a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half));

  write_uint16_linear(t, args, r, g, b, a);
}
static void hadamard_col8x2_avx2(__m256i *in, int iter) {
  __m256i a0 = in[0];
  __m256i a1 = in[1];
  __m256i a2 = in[2];
  __m256i a3 = in[3];
  __m256i a4 = in[4];
  __m256i a5 = in[5];
  __m256i a6 = in[6];
  __m256i a7 = in[7];

  __m256i b0 = _mm256_add_epi16(a0, a1);
  __m256i b1 = _mm256_sub_epi16(a0, a1);
  __m256i b2 = _mm256_add_epi16(a2, a3);
  __m256i b3 = _mm256_sub_epi16(a2, a3);
  __m256i b4 = _mm256_add_epi16(a4, a5);
  __m256i b5 = _mm256_sub_epi16(a4, a5);
  __m256i b6 = _mm256_add_epi16(a6, a7);
  __m256i b7 = _mm256_sub_epi16(a6, a7);

  a0 = _mm256_add_epi16(b0, b2);
  a1 = _mm256_add_epi16(b1, b3);
  a2 = _mm256_sub_epi16(b0, b2);
  a3 = _mm256_sub_epi16(b1, b3);
  a4 = _mm256_add_epi16(b4, b6);
  a5 = _mm256_add_epi16(b5, b7);
  a6 = _mm256_sub_epi16(b4, b6);
  a7 = _mm256_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm256_add_epi16(a0, a4);
    b7 = _mm256_add_epi16(a1, a5);
    b3 = _mm256_add_epi16(a2, a6);
    b4 = _mm256_add_epi16(a3, a7);
    b2 = _mm256_sub_epi16(a0, a4);
    b6 = _mm256_sub_epi16(a1, a5);
    b1 = _mm256_sub_epi16(a2, a6);
    b5 = _mm256_sub_epi16(a3, a7);

    a0 = _mm256_unpacklo_epi16(b0, b1);
    a1 = _mm256_unpacklo_epi16(b2, b3);
    a2 = _mm256_unpackhi_epi16(b0, b1);
    a3 = _mm256_unpackhi_epi16(b2, b3);
    a4 = _mm256_unpacklo_epi16(b4, b5);
    a5 = _mm256_unpacklo_epi16(b6, b7);
    a6 = _mm256_unpackhi_epi16(b4, b5);
    a7 = _mm256_unpackhi_epi16(b6, b7);

    b0 = _mm256_unpacklo_epi32(a0, a1);
    b1 = _mm256_unpacklo_epi32(a4, a5);
    b2 = _mm256_unpackhi_epi32(a0, a1);
    b3 = _mm256_unpackhi_epi32(a4, a5);
    b4 = _mm256_unpacklo_epi32(a2, a3);
    b5 = _mm256_unpacklo_epi32(a6, a7);
    b6 = _mm256_unpackhi_epi32(a2, a3);
    b7 = _mm256_unpackhi_epi32(a6, a7);

    in[0] = _mm256_unpacklo_epi64(b0, b1);
    in[1] = _mm256_unpackhi_epi64(b0, b1);
    in[2] = _mm256_unpacklo_epi64(b2, b3);
    in[3] = _mm256_unpackhi_epi64(b2, b3);
    in[4] = _mm256_unpacklo_epi64(b4, b5);
    in[5] = _mm256_unpackhi_epi64(b4, b5);
    in[6] = _mm256_unpacklo_epi64(b6, b7);
    in[7] = _mm256_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm256_add_epi16(a0, a4);
    in[7] = _mm256_add_epi16(a1, a5);
    in[3] = _mm256_add_epi16(a2, a6);
    in[4] = _mm256_add_epi16(a3, a7);
    in[2] = _mm256_sub_epi16(a0, a4);
    in[6] = _mm256_sub_epi16(a1, a5);
    in[1] = _mm256_sub_epi16(a2, a6);
    in[5] = _mm256_sub_epi16(a3, a7);
  }
}
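/* Hypothetical scalar sketch (not from libvpx) of the three butterfly stages used
   above, applied to a single column of 8 int16 samples. Output ordering follows the
   iter != 0 branch; the 16-lane layout and the transpose done when iter == 0 are
   deliberately omitted, so this is only an illustration of the add/sub pattern. */
#include <stdint.h>

static void hadamard_col8_scalar(const int16_t *a, int16_t *out) {
  /* Stage 1: pairwise sums and differences. */
  int16_t b0 = a[0] + a[1], b1 = a[0] - a[1];
  int16_t b2 = a[2] + a[3], b3 = a[2] - a[3];
  int16_t b4 = a[4] + a[5], b5 = a[4] - a[5];
  int16_t b6 = a[6] + a[7], b7 = a[6] - a[7];

  /* Stage 2: combine groups of four. */
  int16_t c0 = b0 + b2, c1 = b1 + b3, c2 = b0 - b2, c3 = b1 - b3;
  int16_t c4 = b4 + b6, c5 = b5 + b7, c6 = b4 - b6, c7 = b5 - b7;

  /* Stage 3: combine halves, written out in the same permuted order
     as the iter != 0 branch of hadamard_col8x2_avx2. */
  out[0] = c0 + c4; out[7] = c1 + c5;
  out[3] = c2 + c6; out[4] = c3 + c7;
  out[2] = c0 - c4; out[6] = c1 - c5;
  out[1] = c2 - c6; out[5] = c3 - c7;
}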