/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */ static void unshuffle2_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 2; size_t i; int j; __m256i ymm0[2], ymm1[2]; for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { /* Load 32 elements (64 bytes) into 2 YMM registers. */ const uint8_t* const src_for_ith_element = src + i; for (j = 0; j < 2; j++) { ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); } /* Shuffle bytes */ for (j = 0; j < 2; j++) { ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8); } /* Compute the low 64 bytes */ ymm1[0] = _mm256_unpacklo_epi8(ymm0[0], ymm0[1]); /* Compute the hi 64 bytes */ ymm1[1] = _mm256_unpackhi_epi8(ymm0[0], ymm0[1]); /* Store the result vectors in proper order */ _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[1]); } }
size_t varvectorshift_unrolled(uint32_t *array, size_t length, int shiftamount) { size_t k = 0; __m256i * a = (__m256i *) array; __m128i s = _mm_set1_epi32(shiftamount); for (; k + 3 < length / 8 ; k +=4, a+=4) { __m256i v1 = _mm256_loadu_si256(a); __m256i v2 = _mm256_loadu_si256(a + 1); __m256i v3 = _mm256_loadu_si256(a + 2); __m256i v4 = _mm256_loadu_si256(a + 3); v1 = _mm256_srl_epi32(v1,s); v2 = _mm256_srl_epi32(v2,s); v3 = _mm256_srl_epi32(v3,s); v4 = _mm256_srl_epi32(v4,s); _mm256_storeu_si256(a,v1); _mm256_storeu_si256(a + 1,v2); _mm256_storeu_si256(a + 2,v3); _mm256_storeu_si256(a + 3,v4); } for (; k < length / 8 ; k ++, a++) { __m256i v = _mm256_loadu_si256(a); v = _mm256_srl_epi32(v,s); _mm256_storeu_si256(a,v); } k *= 8; for (; k < length; ++k) { array[k] = array[k] >> shiftamount; } return 0; }
static void avx2_test (void) { int i; int ck[8]; int r[8]; unsigned int imm; int fail = 0; union256i_q s1, s2, d; for (i = 0; i < 256; i += 16) for (imm = 0; imm < 100; imm++) { /* Recompute the results for 256-bits */ compute_correct_result_256 (&vals[i + 0], &vals[i + 8], imm, ck); s1.x = _mm256_loadu_si256 ((__m256i *) & vals[i + 0]); s2.x = _mm256_loadu_si256 ((__m256i *) & vals[i + 8]); /* Run the 256-bit tests */ avx2_test_palignr256 (s1.x, s2.x, imm, &d.x); _mm256_storeu_si256 ((__m256i *) r, d.x); fail += checkVi (r, ck, 8); } if (fail != 0) abort (); }
static INLINE void hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff, int is_final) { #if CONFIG_VP9_HIGHBITDEPTH DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); int16_t *t_coeff = temp_coeff; #else int16_t *t_coeff = coeff; #endif int16_t *coeff16 = (int16_t *)coeff; int idx; for (idx = 0; idx < 2; ++idx) { const int16_t *src_ptr = src_diff + idx * 8 * src_stride; hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); } for (idx = 0; idx < 64; idx += 16) { const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); __m256i b0 = _mm256_add_epi16(coeff0, coeff1); __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); __m256i b2 = _mm256_add_epi16(coeff2, coeff3); __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); b0 = _mm256_srai_epi16(b0, 1); b1 = _mm256_srai_epi16(b1, 1); b2 = _mm256_srai_epi16(b2, 1); b3 = _mm256_srai_epi16(b3, 1); if (is_final) { store_tran_low(_mm256_add_epi16(b0, b2), coeff); store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64); store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); coeff += 16; } else { _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2)); _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3)); _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2)); _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3)); coeff16 += 16; } t_coeff += 16; } }
static void _mm256i_print(__m256i x) { int *data = new int[8]; _mm256_storeu_si256((__m256i*)&data[0],x); for(size_t i =0 ; i < 8; i++) { std::cout << "Data[" << i << "]: " << data[i] << std::endl; } delete[] data; }
/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */ static void shuffle16_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 16; size_t j; int k, l; __m256i ymm0[16], ymm1[16]; /* Create the shuffle mask. NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from most to least significant (i.e., their order is reversed when compared to loading the mask from an array). */ const __m256i shmask = _mm256_set_epi8( 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00, 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00); for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { /* Fetch 32 elements (512 bytes) into 16 YMM registers. */ for (k = 0; k < 16; k++) { ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); } /* Transpose bytes */ for (k = 0, l = 0; k < 8; k++, l +=2) { ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]); ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]); } /* Transpose words */ for (k = 0, l = -2; k < 8; k++, l++) { if ((k%2) == 0) l += 2; ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]); ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]); } /* Transpose double words */ for (k = 0, l = -4; k < 8; k++, l++) { if ((k%4) == 0) l += 4; ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]); ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]); } /* Transpose quad words */ for (k = 0; k < 8; k++) { ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]); ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]); } for (k = 0; k < 16; k++) { ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8); ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask); } /* Store the result vectors */ uint8_t* const dest_for_jth_element = dest + j; for (k = 0; k < 16; k++) { _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]); } } }
/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */ static void unshuffle8_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 8; size_t i; int j; __m256i ymm0[8], ymm1[8]; for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { /* Fetch 32 elements (256 bytes) into 8 YMM registers. */ const uint8_t* const src_for_ith_element = src + i; for (j = 0; j < 8; j++) { ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); } /* Shuffle bytes */ for (j = 0; j < 4; j++) { /* Compute the low 32 bytes */ ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]); /* Compute the hi 32 bytes */ ymm1[4+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]); } /* Shuffle words */ for (j = 0; j < 4; j++) { /* Compute the low 32 bytes */ ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]); /* Compute the hi 32 bytes */ ymm0[4+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]); } for (j = 0; j < 8; j++) { ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8); } /* Shuffle 4-byte dwords */ for (j = 0; j < 4; j++) { /* Compute the low 32 bytes */ ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]); /* Compute the hi 32 bytes */ ymm1[4+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]); } /* Store the result vectors in proper order */ _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[2]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[1]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[3]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[4]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[6]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[5]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]); } }
static INLINE void quantize(const __m256i *qp, __m256i *c, const int16_t *iscan_ptr, int log_scale, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { const __m256i abs_coeff = _mm256_abs_epi32(*c); __m256i q = _mm256_add_epi32(abs_coeff, qp[0]); __m256i q_lo = _mm256_mul_epi32(q, qp[1]); __m256i q_hi = _mm256_srli_epi64(q, 32); const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32); q_hi = _mm256_mul_epi32(q_hi, qp_hi); q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale); q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale); q_hi = _mm256_slli_epi64(q_hi, 32); q = _mm256_or_si256(q_lo, q_hi); const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale); const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s); q = _mm256_andnot_si256(mask, q); __m256i dq = _mm256_mullo_epi32(q, qp[2]); dq = _mm256_srai_epi32(dq, log_scale); q = _mm256_sign_epi32(q, *c); dq = _mm256_sign_epi32(dq, *c); _mm256_storeu_si256((__m256i *)qcoeff, q); _mm256_storeu_si256((__m256i *)dqcoeff, dq); const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr); const __m128i zr = _mm_setzero_si128(); const __m128i lo = _mm_unpacklo_epi16(isc, zr); const __m128i hi = _mm_unpackhi_epi16(isc, zr); const __m256i iscan = _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); const __m256i zero = _mm256_setzero_si256(); const __m256i zc = _mm256_cmpeq_epi32(dq, zero); const __m256i nz = _mm256_cmpeq_epi32(zc, zero); __m256i cur_eob = _mm256_sub_epi32(iscan, nz); cur_eob = _mm256_and_si256(cur_eob, nz); *eob = _mm256_max_epi32(cur_eob, *eob); }
void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int idx; tran_low_t *t_coeff = coeff; for (idx = 0; idx < 4; ++idx) { const int16_t *src_ptr = src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256); } for (idx = 0; idx < 256; idx += 8) { __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); __m256i b0 = _mm256_add_epi32(coeff0, coeff1); __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); __m256i b2 = _mm256_add_epi32(coeff2, coeff3); __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); b0 = _mm256_srai_epi32(b0, 2); b1 = _mm256_srai_epi32(b1, 2); b2 = _mm256_srai_epi32(b2, 2); b3 = _mm256_srai_epi32(b3, 2); coeff0 = _mm256_add_epi32(b0, b2); coeff1 = _mm256_add_epi32(b1, b3); coeff2 = _mm256_sub_epi32(b0, b2); coeff3 = _mm256_sub_epi32(b1, b3); _mm256_storeu_si256((__m256i *)coeff, coeff0); _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1); _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2); _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3); coeff += 8; t_coeff += 8; } }
void convert_f32_bf16(float* in, libxsmm_bfloat16* out, int len) { int i; #ifdef _OPENMP #pragma omp parallel for private(i) #endif for ( i = 0; i < len; i+=16 ) { __m512 vfp32 = gxm_fp32_to_bfp16_rne_adjustment_avx512f( _mm512_loadu_ps( in+i ) ); __m256i vbfp16 = gxm_fp32_to_bfp16_truncate_avx512f( vfp32 ); _mm256_storeu_si256( (__m256i*)(out+i), vbfp16 ); } }
// Reverse with intrinsics // First - reverse the values in the first half and second half // Second - copy the second half of temp array to the beginning of original one, and first half (from temp array) next to the first in original one // (swap the halfs) inline void reverse(char * bytes, int numChunks) { static char temp[64]; __m256i firstHalf, secondHalf; for (int i = 0; i < numChunks; ++i) { // Reverse first half. firstHalf = _mm256_set_epi8(bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], bytes[14], bytes[15], bytes[16], bytes[17], bytes[18], bytes[19], bytes[20], bytes[21], bytes[22], bytes[23], bytes[24], bytes[25], bytes[26], bytes[27], bytes[28], bytes[29], bytes[30], bytes[31]); // Reverse second half. secondHalf = _mm256_set_epi8(bytes[32], bytes[33], bytes[34], bytes[35], bytes[36], bytes[37], bytes[38], bytes[39], bytes[40], bytes[41], bytes[42], bytes[43], bytes[44], bytes[45], bytes[46], bytes[47], bytes[48], bytes[49], bytes[50], bytes[51], bytes[52], bytes[53], bytes[54], bytes[55], bytes[56], bytes[57], bytes[58], bytes[59], bytes[60], bytes[61], bytes[62], bytes[63]); // write the second half at the begining, and after it- first one. _mm256_storeu_si256((__m256i*)bytes, secondHalf); _mm256_storeu_si256((__m256i*)(bytes + 32), firstHalf); bytes += 64; } }
void cvtScale_s16s32f32Line_AVX2(const short* src, int* dst, float scale, float shift, int width) { int x = 0; __m256 scale256 = _mm256_set1_ps(scale); __m256 shift256 = _mm256_set1_ps(shift); const int shuffle = 0xD8; for (; x <= width - 16; x += 16) { __m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x)); v_src = _mm256_permute4x64_epi64(v_src, shuffle); __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16); __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16); __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256); __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256); _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0)); _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1)); } for (; x < width; x++) dst[x] = saturate_cast<int>(src[x] * scale + shift); }
inline void reverse(char* bytes, int numChunks) { static __m256i reversed1; static __m256i reversed2; for(size_t i = 0; i < numChunks; ++i) { reversed1 = _mm256_set_epi8(bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], bytes[14], bytes[15], bytes[16], bytes[17], bytes[18], bytes[19], bytes[20], bytes[21], bytes[22], bytes[23], bytes[24], bytes[25], bytes[26], bytes[27], bytes[28], bytes[29], bytes[30], bytes[31]); reversed2 = _mm256_set_epi8(bytes[32], bytes[33], bytes[34], bytes[35], bytes[36], bytes[37], bytes[38], bytes[39], bytes[40], bytes[41], bytes[42], bytes[43], bytes[44], bytes[45], bytes[46], bytes[47], bytes[48], bytes[49], bytes[50], bytes[51], bytes[52], bytes[53], bytes[54], bytes[55], bytes[56], bytes[57], bytes[58], bytes[59], bytes[60], bytes[61], bytes[62], bytes[63]); _mm256_storeu_si256((__m256i*)&bytes[0], reversed2); _mm256_storeu_si256((__m256i*)&bytes[32], reversed1); bytes += 64; } }
size_t vectorshift(uint32_t *array, size_t length, int shiftamount) { size_t k = 0; __m256i * a = (__m256i *) array; for (; k < length / 8 ; k ++, a++) { __m256i v = _mm256_loadu_si256(a); v = _mm256_srli_epi32(v,SHIFTAMOUNT); _mm256_storeu_si256(a,v); } k *= 8; for (; k < length; ++k) { array[k] = array[k] >> SHIFTAMOUNT; } return 0; }
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw) { const int AB_BITS = MAX(10, (int)INTER_BITS); int x1 = 0; __m256i fxy_mask = _mm256_set1_epi32(INTER_TAB_SIZE - 1); __m256i XX = _mm256_set1_epi32(X0), YY = _mm256_set1_epi32(Y0); for (; x1 <= bw - 16; x1 += 16) { __m256i tx0, tx1, ty0, ty1; tx0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1)), XX); ty0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1)), YY); tx1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1 + 8)), XX); ty1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1 + 8)), YY); tx0 = _mm256_srai_epi32(tx0, AB_BITS - INTER_BITS); ty0 = _mm256_srai_epi32(ty0, AB_BITS - INTER_BITS); tx1 = _mm256_srai_epi32(tx1, AB_BITS - INTER_BITS); ty1 = _mm256_srai_epi32(ty1, AB_BITS - INTER_BITS); __m256i fx_ = _mm256_packs_epi32(_mm256_and_si256(tx0, fxy_mask), _mm256_and_si256(tx1, fxy_mask)); __m256i fy_ = _mm256_packs_epi32(_mm256_and_si256(ty0, fxy_mask), _mm256_and_si256(ty1, fxy_mask)); tx0 = _mm256_packs_epi32(_mm256_srai_epi32(tx0, INTER_BITS), _mm256_srai_epi32(tx1, INTER_BITS)); ty0 = _mm256_packs_epi32(_mm256_srai_epi32(ty0, INTER_BITS), _mm256_srai_epi32(ty1, INTER_BITS)); fx_ = _mm256_adds_epi16(fx_, _mm256_slli_epi16(fy_, INTER_BITS)); fx_ = _mm256_permute4x64_epi64(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0); _mm256_storeu_si256((__m256i*)(xy + x1 * 2), _mm256_unpacklo_epi16(tx0, ty0)); _mm256_storeu_si256((__m256i*)(xy + x1 * 2 + 16), _mm256_unpackhi_epi16(tx0, ty0)); _mm256_storeu_si256((__m256i*)(alpha + x1), fx_); } _mm256_zeroupper(); return x1; }
size_t varvectorshift(uint32_t *array, size_t length, int shiftamount) { size_t k = 0; __m256i * a = (__m256i *) array; __m128i s = _mm_set1_epi32(shiftamount); for (; k < length / 8 ; k ++, a++) { __m256i v = _mm256_loadu_si256(a); v = _mm256_srl_epi32(v,s); _mm256_storeu_si256(a,v); } k *= 8; for (; k < length; ++k) { array[k] = array[k] >> shiftamount; } return 0; }
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */ static void shuffle4_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 4; size_t i; int j; __m256i ymm0[4], ymm1[4]; /* Create the shuffle mask. NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from most to least significant (i.e., their order is reversed when compared to loading the mask from an array). */ const __m256i mask = _mm256_set_epi32( 0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { /* Fetch 32 elements (128 bytes) then transpose bytes and words. */ for (j = 0; j < 4; j++) { ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i)))); ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8); ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d); ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]); ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x04e); ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]); } /* Transpose double words */ for (j = 0; j < 2; j++) { ymm1[j*2] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]); ymm1[j*2+1] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]); } /* Transpose quad words */ for (j = 0; j < 2; j++) { ymm0[j*2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j+2]); ymm0[j*2+1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j+2]); } for (j = 0; j < 4; j++) { ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask); } /* Store the result vectors */ uint8_t* const dest_for_ith_element = dest + i; for (j = 0; j < 4; j++) { _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]); } } }
void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { __m128i src16[8]; __m256i src32[8]; src16[0] = _mm_loadu_si128((const __m128i *)src_diff); src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src32[0] = _mm256_cvtepi16_epi32(src16[0]); src32[1] = _mm256_cvtepi16_epi32(src16[1]); src32[2] = _mm256_cvtepi16_epi32(src16[2]); src32[3] = _mm256_cvtepi16_epi32(src16[3]); src32[4] = _mm256_cvtepi16_epi32(src16[4]); src32[5] = _mm256_cvtepi16_epi32(src16[5]); src32[6] = _mm256_cvtepi16_epi32(src16[6]); src32[7] = _mm256_cvtepi16_epi32(src16[7]); highbd_hadamard_col8_avx2(src32, 0); highbd_hadamard_col8_avx2(src32, 1); _mm256_storeu_si256((__m256i *)coeff, src32[0]); coeff += 8; _mm256_storeu_si256((__m256i *)coeff, src32[1]); coeff += 8; _mm256_storeu_si256((__m256i *)coeff, src32[2]); coeff += 8; _mm256_storeu_si256((__m256i *)coeff, src32[3]); coeff += 8; _mm256_storeu_si256((__m256i *)coeff, src32[4]); coeff += 8; _mm256_storeu_si256((__m256i *)coeff, src32[5]); coeff += 8; _mm256_storeu_si256((__m256i *)coeff, src32[6]); coeff += 8; _mm256_storeu_si256((__m256i *)coeff, src32[7]); }
/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */ static void shuffle8_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 8; size_t j; int k, l; __m256i ymm0[8], ymm1[8]; for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { /* Fetch 32 elements (256 bytes) then transpose bytes. */ for (k = 0; k < 8; k++) { ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); ymm1[k] = _mm256_shuffle_epi32(ymm0[k], 0x4e); ymm1[k] = _mm256_unpacklo_epi8(ymm0[k], ymm1[k]); } /* Transpose words */ for (k = 0, l = 0; k < 4; k++, l +=2) { ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+1]); ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+1]); } /* Transpose double words */ for (k = 0, l = 0; k < 4; k++, l++) { if (k == 2) l += 2; ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+2]); ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+2]); } /* Transpose quad words */ for (k = 0; k < 4; k++) { ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+4]); ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+4]); } for(k = 0; k < 8; k++) { ymm1[k] = _mm256_permute4x64_epi64(ymm0[k], 0x72); ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xD8); ymm0[k] = _mm256_unpacklo_epi16(ymm0[k], ymm1[k]); } /* Store the result vectors */ uint8_t* const dest_for_jth_element = dest + j; for (k = 0; k < 8; k++) { _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]); } } }
static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { __m256i src[8]; src[0] = _mm256_loadu_si256((const __m256i *)src_diff); src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); hadamard_col8x2_avx2(src, 0); hadamard_col8x2_avx2(src, 1); _mm256_storeu_si256((__m256i *)coeff, _mm256_permute2x128_si256(src[0], src[1], 0x20)); coeff += 16; _mm256_storeu_si256((__m256i *)coeff, _mm256_permute2x128_si256(src[2], src[3], 0x20)); coeff += 16; _mm256_storeu_si256((__m256i *)coeff, _mm256_permute2x128_si256(src[4], src[5], 0x20)); coeff += 16; _mm256_storeu_si256((__m256i *)coeff, _mm256_permute2x128_si256(src[6], src[7], 0x20)); coeff += 16; _mm256_storeu_si256((__m256i *)coeff, _mm256_permute2x128_si256(src[0], src[1], 0x31)); coeff += 16; _mm256_storeu_si256((__m256i *)coeff, _mm256_permute2x128_si256(src[2], src[3], 0x31)); coeff += 16; _mm256_storeu_si256((__m256i *)coeff, _mm256_permute2x128_si256(src[4], src[5], 0x31)); coeff += 16; _mm256_storeu_si256((__m256i *)coeff, _mm256_permute2x128_si256(src[6], src[7], 0x31)); }
/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */ static void unshuffle4_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 4; size_t i; int j; __m256i ymm0[4], ymm1[4]; for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { /* Load 32 elements (128 bytes) into 4 YMM registers. */ const uint8_t* const src_for_ith_element = src + i; for (j = 0; j < 4; j++) { ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); } /* Shuffle bytes */ for (j = 0; j < 2; j++) { /* Compute the low 64 bytes */ ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]); /* Compute the hi 64 bytes */ ymm1[2+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]); } /* Shuffle 2-byte words */ for (j = 0; j < 2; j++) { /* Compute the low 64 bytes */ ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]); /* Compute the hi 64 bytes */ ymm0[2+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]); } ymm1[0] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x20); ymm1[1] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x20); ymm1[2] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x31); ymm1[3] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x31); /* Store the result vectors in proper order */ for (j = 0; j < 4; j++) { _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (j * sizeof(__m256i))), ymm1[j]); } } }
/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */ static void shuffle2_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 2; size_t j; int k; __m256i ymm0[2], ymm1[2]; /* Create the shuffle mask. NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from most to least significant (i.e., their order is reversed when compared to loading the mask from an array). */ const __m256i shmask = _mm256_set_epi8( 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00, 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00); for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { /* Fetch 32 elements (64 bytes) then transpose bytes, words and double words. */ for (k = 0; k < 2; k++) { ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); ymm1[k] = _mm256_shuffle_epi8(ymm0[k], shmask); } ymm0[0] = _mm256_permute4x64_epi64(ymm1[0], 0xd8); ymm0[1] = _mm256_permute4x64_epi64(ymm1[1], 0x8d); ymm1[0] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0xf0); ymm0[1] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0x0f); ymm1[1] = _mm256_permute4x64_epi64(ymm0[1], 0x4e); /* Store the result vectors */ uint8_t* const dest_for_jth_element = dest + j; for (k = 0; k < 2; k++) { _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm1[k]); } } }
static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, const __m256i a, uint8_t *comp_pred) { const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS; const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits)); const __m256i ma = _mm256_sub_epi8(alpha_max, a); const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1); const __m256i aaAL = _mm256_unpacklo_epi8(a, ma); const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1); const __m256i aaAH = _mm256_unpackhi_epi8(a, ma); const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL); const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH); const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset); const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset); const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH); _mm256_storeu_si256((__m256i *)(comp_pred), roundA); }
void* xmemset(void* dest, int c, size_t n) { void* ret = dest; if (n < 16) { xmemset_lt16(dest, c, n); return ret; } __m256i mm = _mm256_set1_epi8((char)c); if (((unsigned long)dest & 31) == 0) { for ( ; n >= 256; n -= 256) { _mm256_store_si256((__m256i*)dest, mm); _mm256_store_si256((__m256i*)dest + 1, mm); _mm256_store_si256((__m256i*)dest + 2, mm); _mm256_store_si256((__m256i*)dest + 3, mm); _mm256_store_si256((__m256i*)dest + 4, mm); _mm256_store_si256((__m256i*)dest + 5, mm); _mm256_store_si256((__m256i*)dest + 6, mm); _mm256_store_si256((__m256i*)dest + 7, mm); // 8 dest = (void*)((__m256i*)dest + 8); } if (n >= 128) { _mm256_store_si256((__m256i*)dest, mm); _mm256_store_si256((__m256i*)dest + 1, mm); _mm256_store_si256((__m256i*)dest + 2, mm); _mm256_store_si256((__m256i*)dest + 3, mm); dest = (void*)((__m256i*)dest + 4); n -= 128; } if (n >= 64) { _mm256_store_si256((__m256i*)dest, mm); _mm256_store_si256((__m256i*)dest + 1, mm); dest = (void*)((__m256i*)dest + 2); n -= 64; } if (n >= 32) { _mm256_store_si256((__m256i*)dest, mm); dest = (void*)((__m256i*)dest + 1); n -= 32; } if (n >= 16) { _mm_store_si128((__m128i*)dest, _mm_set1_epi8((char)c)); dest = (void*)((__m128i*)dest + 1); n -= 16; } } else { for ( ; n >= 256; n -= 256) { _mm256_storeu_si256((__m256i*)dest, mm); _mm256_storeu_si256((__m256i*)dest + 1, mm); _mm256_storeu_si256((__m256i*)dest + 2, mm); _mm256_storeu_si256((__m256i*)dest + 3, mm); _mm256_storeu_si256((__m256i*)dest + 4, mm); _mm256_storeu_si256((__m256i*)dest + 5, mm); _mm256_storeu_si256((__m256i*)dest + 6, mm); _mm256_storeu_si256((__m256i*)dest + 7, mm); // 8 dest = (void*)((__m256i*)dest + 8); } if (n >= 128) { _mm256_storeu_si256((__m256i*)dest, mm); _mm256_storeu_si256((__m256i*)dest + 1, mm); _mm256_storeu_si256((__m256i*)dest + 2, mm); _mm256_storeu_si256((__m256i*)dest + 3, mm); dest = (void*)((__m256i*)dest + 4); n -= 128; } if (n >= 64) { _mm256_storeu_si256((__m256i*)dest, mm); _mm256_storeu_si256((__m256i*)dest + 1, mm); dest = (void*)((__m256i*)dest + 2); n -= 64; } if (n >= 32) { _mm256_storeu_si256((__m256i*)dest, mm); dest = (void*)((__m256i*)dest + 1); n -= 32; } if (n >= 16) { _mm_storeu_si128((__m128i*)dest, _mm_set1_epi8((char)c)); dest = (void*)((__m128i*)dest + 1); n -= 16; } } xmemset_lt16(dest, c, n); return ret; }
void kvz_pixels_blit_avx2(const kvz_pixel * const orig, kvz_pixel * const dst, const unsigned width, const unsigned height, const unsigned orig_stride, const unsigned dst_stride) { unsigned y; //There is absolutely no reason to have a width greater than the source or the destination stride. assert(width <= orig_stride); assert(width <= dst_stride); #ifdef CHECKPOINTS char *buffer = malloc((3 * width + 1) * sizeof(char)); for (y = 0; y < height; ++y) { int p; for (p = 0; p < width; ++p) { sprintf((buffer + 3*p), "%02X ", orig[y*orig_stride]); } buffer[3*width] = 0; CHECKPOINT("kvz_pixels_blit_avx2: %04d: %s", y, buffer); } FREE_POINTER(buffer); #endif //CHECKPOINTS if (width == orig_stride && width == dst_stride) { memcpy(dst, orig, width * height * sizeof(kvz_pixel)); return; } int nxn_width = (width == height) ? width : 0; switch (nxn_width) { case 4: *(int32_t*)&dst[dst_stride*0] = *(int32_t*)&orig[orig_stride*0]; *(int32_t*)&dst[dst_stride*1] = *(int32_t*)&orig[orig_stride*1]; *(int32_t*)&dst[dst_stride*2] = *(int32_t*)&orig[orig_stride*2]; *(int32_t*)&dst[dst_stride*3] = *(int32_t*)&orig[orig_stride*3]; break; case 8: *(int64_t*)&dst[dst_stride*0] = *(int64_t*)&orig[orig_stride*0]; *(int64_t*)&dst[dst_stride*1] = *(int64_t*)&orig[orig_stride*1]; *(int64_t*)&dst[dst_stride*2] = *(int64_t*)&orig[orig_stride*2]; *(int64_t*)&dst[dst_stride*3] = *(int64_t*)&orig[orig_stride*3]; *(int64_t*)&dst[dst_stride*4] = *(int64_t*)&orig[orig_stride*4]; *(int64_t*)&dst[dst_stride*5] = *(int64_t*)&orig[orig_stride*5]; *(int64_t*)&dst[dst_stride*6] = *(int64_t*)&orig[orig_stride*6]; *(int64_t*)&dst[dst_stride*7] = *(int64_t*)&orig[orig_stride*7]; break; case 16: for (int i = 0; i < 16; ++i) { __m128i temp = _mm_loadu_si128((__m128i*)(orig + i * orig_stride)); _mm_storeu_si128((__m128i*)(dst + i * dst_stride), temp); } break; case 32: for (int i = 0; i < 32; ++i) { __m256i temp = _mm256_loadu_si256((__m256i*)(orig + i * orig_stride)); _mm256_storeu_si256((__m256i*)(dst + i * dst_stride), temp); } break; case 64: for (int i = 0; i < 64; ++i) { __m256i temp0 = _mm256_loadu_si256((__m256i*)(orig + i * orig_stride)); _mm256_storeu_si256((__m256i*)(dst + i * dst_stride), temp0); __m256i temp1 = _mm256_loadu_si256((__m256i*)(orig + i * orig_stride + sizeof(__m256))); _mm256_storeu_si256((__m256i*)(dst + i * dst_stride + sizeof(__m256)), temp1); } break; default: if (orig == dst) { //If we have the same array, then we should have the same stride assert(orig_stride == dst_stride); return; } assert(orig != dst || orig_stride == dst_stride); for (y = 0; y < height; ++y) { memcpy(&dst[y*dst_stride], &orig[y*orig_stride], width * sizeof(kvz_pixel)); } break; } }
/*! * \brief Unaligned store of the given packed vector at the * given memory position */ ETL_STATIC_INLINE(void) storeu(int64_t* memory, avx_simd_long value) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(memory), value.value); }
void nibble_sort_beekman1(uint64_t *buf) { // already in the right order //__m256i // shuf0={0x1716151413121110ULL,0x1f1e1d1c1b1a1918ULL,0x0706050403020100ULL,0x0f0e0d0c0b0a0908ULL}; __m256i shuf1 = {0x1e161c141a121810ULL, 0x1f171d151b131911ULL, 0x0e060c040a020800ULL, 0x0f070d050b030901ULL}; __m256i shuf2 = {0x1d1c151419181110ULL, 0x1f1e17161b1a1312ULL, 0x0d0c050409080100ULL, 0x0f0e07060b0a0302ULL}; // use less instructions below //__m256i // shuf3={0x1b1a191813121110ULL,0x1f1e1d1c17161514ULL,0x0b0a090803020100ULL,0x0f0e0d0c07060504ULL}; __m256i shuf4 = {0x101d171615141311ULL, 0x1f1e1b191a181c12ULL, 0x000d070605040301ULL, 0x0f0e0b090a080c02ULL}; __m256i shuf5 = {0x171d151413111810ULL, 0x1f1e16191c1b1a12ULL, 0x070d050403010800ULL, 0x0f0e06090c0b0a02ULL}; __m256i shuf6 = {0x1e17161a15141211ULL, 0x1f101d1c1b191318ULL, 0x0e07060a05040201ULL, 0x0f000d0c0b090308ULL}; __m256i shuf7 = {0x171510161b131911ULL, 0x1f1d181e1c141a12ULL, 0x070500060b030901ULL, 0x0f0d080e0c040a02ULL}; __m256i shuf8 = {0x1715141613121110ULL, 0x1f1e1c1b1a19181dULL, 0x0705040603020100ULL, 0x0f0e0c0b0a09080dULL}; __m256i shuf9 = {0x171c1b1a19181615ULL, 0x1f1e14131211101dULL, 0x070c0b0a09080605ULL, 0x0f0e04030201000dULL}; __m256i nibblemask = _mm256_set1_epi8(0x0f); for (uint32_t i = 0; i < (1024 / 4); i += 1) { __m256i r0 = _mm256_loadu_si256(((__m256i *)buf) + i), r1 = r0, r2; r0 &= nibblemask; r1 ^= r0; r1 = _mm256_srli_epi64(r1, 4); #define sort_and_shuffle(n) \ r2 = _mm256_max_epi8(r0, r1); \ r0 = _mm256_min_epi8(r0, r1); \ r1 = (__m256i)_mm256_shuffle_pd((__m256d)r0, (__m256d)r2, 0b0000); \ r2 = (__m256i)_mm256_shuffle_pd((__m256d)r0, (__m256d)r2, 0b1111); \ r1 = _mm256_shuffle_epi8(r1, shuf##n); \ r2 = _mm256_shuffle_epi8(r2, shuf##n); \ r0 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b0000); \ r1 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b1111) sort_and_shuffle(1); sort_and_shuffle(2); { // sort_and_shuffle(3); r2 = _mm256_max_epi8(r0, r1); r0 = _mm256_min_epi8(r0, r1); r1 = (__m256i)_mm256_unpacklo_ps((__m256)r0, (__m256)r2); r2 = (__m256i)_mm256_unpackhi_ps((__m256)r0, (__m256)r2); r0 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b1111); r1 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b0000); } sort_and_shuffle(4); sort_and_shuffle(5); sort_and_shuffle(6); sort_and_shuffle(7); sort_and_shuffle(8); sort_and_shuffle(9); r1 = _mm256_slli_epi64(r1, 4); _mm256_storeu_si256(((__m256i *)buf) + i, r1 | r0); } }
/* For data organized into a row for each bit (8 * elem_size rows), transpose * the bytes. */ int64_t bshuf_trans_byte_bitrow_AVX(void* in, void* out, const size_t size, const size_t elem_size) { size_t hh, ii, jj, kk, mm; char* in_b = (char*) in; char* out_b = (char*) out; CHECK_MULT_EIGHT(size); size_t nrows = 8 * elem_size; size_t nbyte_row = size / 8; if (elem_size % 4) return bshuf_trans_byte_bitrow_SSE(in, out, size, elem_size); __m256i ymm_0[8]; __m256i ymm_1[8]; __m256i ymm_storeage[8][4]; for (jj = 0; jj + 31 < nbyte_row; jj += 32) { for (ii = 0; ii + 3 < elem_size; ii += 4) { for (hh = 0; hh < 4; hh ++) { for (kk = 0; kk < 8; kk ++){ ymm_0[kk] = _mm256_loadu_si256((__m256i *) &in_b[ (ii * 8 + hh * 8 + kk) * nbyte_row + jj]); } for (kk = 0; kk < 4; kk ++){ ymm_1[kk] = _mm256_unpacklo_epi8(ymm_0[kk * 2], ymm_0[kk * 2 + 1]); ymm_1[kk + 4] = _mm256_unpackhi_epi8(ymm_0[kk * 2], ymm_0[kk * 2 + 1]); } for (kk = 0; kk < 2; kk ++){ for (mm = 0; mm < 2; mm ++){ ymm_0[kk * 4 + mm] = _mm256_unpacklo_epi16( ymm_1[kk * 4 + mm * 2], ymm_1[kk * 4 + mm * 2 + 1]); ymm_0[kk * 4 + mm + 2] = _mm256_unpackhi_epi16( ymm_1[kk * 4 + mm * 2], ymm_1[kk * 4 + mm * 2 + 1]); } } for (kk = 0; kk < 4; kk ++){ ymm_1[kk * 2] = _mm256_unpacklo_epi32(ymm_0[kk * 2], ymm_0[kk * 2 + 1]); ymm_1[kk * 2 + 1] = _mm256_unpackhi_epi32(ymm_0[kk * 2], ymm_0[kk * 2 + 1]); } for (kk = 0; kk < 8; kk ++){ ymm_storeage[kk][hh] = ymm_1[kk]; } } for (mm = 0; mm < 8; mm ++) { for (kk = 0; kk < 4; kk ++){ ymm_0[kk] = ymm_storeage[mm][kk]; } ymm_1[0] = _mm256_unpacklo_epi64(ymm_0[0], ymm_0[1]); ymm_1[1] = _mm256_unpacklo_epi64(ymm_0[2], ymm_0[3]); ymm_1[2] = _mm256_unpackhi_epi64(ymm_0[0], ymm_0[1]); ymm_1[3] = _mm256_unpackhi_epi64(ymm_0[2], ymm_0[3]); ymm_0[0] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 32); ymm_0[1] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 32); ymm_0[2] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 49); ymm_0[3] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 49); _mm256_storeu_si256((__m256i *) &out_b[ (jj + mm * 2 + 0 * 16) * nrows + ii * 8], ymm_0[0]); _mm256_storeu_si256((__m256i *) &out_b[ (jj + mm * 2 + 0 * 16 + 1) * nrows + ii * 8], ymm_0[1]); _mm256_storeu_si256((__m256i *) &out_b[ (jj + mm * 2 + 1 * 16) * nrows + ii * 8], ymm_0[2]); _mm256_storeu_si256((__m256i *) &out_b[ (jj + mm * 2 + 1 * 16 + 1) * nrows + ii * 8], ymm_0[3]); } } } for (ii = 0; ii < nrows; ii ++ ) { for (jj = nbyte_row - nbyte_row % 32; jj < nbyte_row; jj ++) { out_b[jj * nrows + ii] = in_b[ii * nbyte_row + jj]; } } return size * elem_size; }
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */ static void unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 16; size_t i; int j; __m256i ymm0[16], ymm1[16]; for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { /* Fetch 32 elements (512 bytes) into 16 YMM registers. */ const uint8_t* const src_for_ith_element = src + i; for (j = 0; j < 16; j++) { ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); } /* Shuffle bytes */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]); /* Compute the hi 32 bytes */ ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]); } /* Shuffle 2-byte words */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]); /* Compute the hi 32 bytes */ ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]); } /* Shuffle 4-byte dwords */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]); /* Compute the hi 32 bytes */ ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]); } /* Shuffle 8-byte qwords */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]); /* Compute the hi 32 bytes */ ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]); } for (j = 0; j < 8; j++) { ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20); ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31); } /* Store the result vectors in proper order */ _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[2]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]); _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]); } }
void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask) { int i = 0; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); const uint16_t *src0 = invert_mask ? pred : ref; const uint16_t *src1 = invert_mask ? ref : pred; const int stride0 = invert_mask ? width : ref_stride; const int stride1 = invert_mask ? ref_stride : width; const __m256i zero = _mm256_setzero_si256(); if (width == 8) { do { const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0); const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1); const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask); const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8)); __m256i m = _mm256_castsi128_si256(m_l); m = _mm256_insertf128_si256(m, m_h, 1); const __m256i m_16 = _mm256_unpacklo_epi8(m, zero); const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp)); _mm_storeu_si128((__m128i *)(comp_pred + width), _mm256_extractf128_si256(comp, 1)); src0 += (stride0 << 1); src1 += (stride1 << 1); mask += (mask_stride << 1); comp_pred += (width << 1); i += 2; } while (i < height); } else if (width == 16) { do { const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0)); const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1)); const __m256i m_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); _mm256_storeu_si256((__m256i *)comp_pred, comp); src0 += stride0; src1 += stride1; mask += mask_stride; comp_pred += width; i += 1; } while (i < height); } else if (width == 32) { do { const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0); const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16)); const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1); const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16)); const __m256i m01_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); const __m256i m23_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16))); const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16); const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16); _mm256_storeu_si256((__m256i *)comp_pred, comp); _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1); src0 += stride0; src1 += stride1; mask += mask_stride; comp_pred += width; i += 1; } while (i < height); } }