static void
avx2_test (void)
{
  int i;
  int ck[8];
  int r[8];
  unsigned int imm;
  int fail = 0;
  union256i_q s1, s2, d;

  for (i = 0; i < 256; i += 16)
    for (imm = 0; imm < 100; imm++)
      {
        /* Recompute the results for 256-bits */
        compute_correct_result_256 (&vals[i + 0], &vals[i + 8], imm, ck);

        s1.x = _mm256_loadu_si256 ((__m256i *) &vals[i + 0]);
        s2.x = _mm256_loadu_si256 ((__m256i *) &vals[i + 8]);

        /* Run the 256-bit tests */
        avx2_test_palignr256 (s1.x, s2.x, imm, &d.x);
        _mm256_storeu_si256 ((__m256i *) r, d.x);

        fail += checkVi (r, ck, 8);
      }

  if (fail != 0)
    abort ();
}
size_t varvectorshift_unrolled(uint32_t *array, size_t length, int shiftamount) {
    size_t k = 0;
    __m256i *a = (__m256i *) array;
    __m128i s = _mm_set1_epi32(shiftamount);
    for (; k + 3 < length / 8; k += 4, a += 4) {
        __m256i v1 = _mm256_loadu_si256(a);
        __m256i v2 = _mm256_loadu_si256(a + 1);
        __m256i v3 = _mm256_loadu_si256(a + 2);
        __m256i v4 = _mm256_loadu_si256(a + 3);
        v1 = _mm256_srl_epi32(v1, s);
        v2 = _mm256_srl_epi32(v2, s);
        v3 = _mm256_srl_epi32(v3, s);
        v4 = _mm256_srl_epi32(v4, s);
        _mm256_storeu_si256(a, v1);
        _mm256_storeu_si256(a + 1, v2);
        _mm256_storeu_si256(a + 2, v3);
        _mm256_storeu_si256(a + 3, v4);
    }
    for (; k < length / 8; k++, a++) {
        __m256i v = _mm256_loadu_si256(a);
        v = _mm256_srl_epi32(v, s);
        _mm256_storeu_si256(a, v);
    }
    k *= 8;
    for (; k < length; ++k) {
        array[k] = array[k] >> shiftamount;
    }
    return 0;
}
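/* A minimal driver for varvectorshift_unrolled above -- a sketch, assuming
   the routine is compiled with AVX2 enabled. The function always returns 0,
   so the result is checked in place; buffer size and values are
   illustrative only. */
#include <cstdint>
#include <cstddef>
#include <cstdio>

int main() {
    uint32_t data[100];
    for (size_t i = 0; i < 100; i++) data[i] = (uint32_t)(i * 1000);

    varvectorshift_unrolled(data, 100, 3);  // shift every element right by 3

    // spot-check: element i should now equal (i * 1000) >> 3
    std::printf("%u %u\n", data[1], data[99]);  // expect 125 and 12375
    return 0;
}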
static INLINE void variance32_kernel_avx2(const uint8_t *const src,
                                          const uint8_t *const ref,
                                          __m256i *const sse,
                                          __m256i *const sum) {
  const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
  const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
  variance_kernel_avx2(s, r, sse, sum);
}
/**
 * Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64
 * words of type uint64_t), using Blake2b's G function as the internal permutation
 *
 * @param state The current state of the sponge
 * @param in    The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words)
 */
void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) {
    //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
#if defined __AVX2__
    __m256i state_v[2], in_v[2];

    state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
    in_v   [0] = _mm256_loadu_si256( (__m256i*)(&in[0]) );
    state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
    in_v   [1] = _mm256_loadu_si256( (__m256i*)(&in[4]) );

    _mm256_store_si256( (__m256i*)(&state[0]), _mm256_xor_si256( state_v[0], in_v[0] ) );
    _mm256_store_si256( (__m256i*)(&state[4]), _mm256_xor_si256( state_v[1], in_v[1] ) );
#elif defined __AVX__
    __m128i state_v[4], in_v[4];

    state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
    state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
    state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
    state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );

    in_v[0] = _mm_loadu_si128( (__m128i*)(&in[0]) );
    in_v[1] = _mm_loadu_si128( (__m128i*)(&in[2]) );
    in_v[2] = _mm_loadu_si128( (__m128i*)(&in[4]) );
    in_v[3] = _mm_loadu_si128( (__m128i*)(&in[6]) );

    _mm_store_si128( (__m128i*)(&state[0]), _mm_xor_si128( state_v[0], in_v[0] ) );
    _mm_store_si128( (__m128i*)(&state[2]), _mm_xor_si128( state_v[1], in_v[1] ) );
    _mm_store_si128( (__m128i*)(&state[4]), _mm_xor_si128( state_v[2], in_v[2] ) );
    _mm_store_si128( (__m128i*)(&state[6]), _mm_xor_si128( state_v[3], in_v[3] ) );
#else
    state[0] ^= in[0];
    state[1] ^= in[1];
    state[2] ^= in[2];
    state[3] ^= in[3];
    state[4] ^= in[4];
    state[5] ^= in[5];
    state[6] ^= in[6];
    state[7] ^= in[7];
#endif

    //Applies the transformation f to the sponge's state
    blake2bLyra(state);
}
int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,
                             intptr_t block_size, int64_t *ssz) {
  __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
  __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
  __m256i sse_reg_64hi, ssz_reg_64hi;
  __m128i sse_reg128, ssz_reg128;
  int64_t sse;
  int i;
  const __m256i zero_reg = _mm256_set1_epi16(0);

  // init sse and ssz registers to zero
  sse_reg = _mm256_set1_epi16(0);
  ssz_reg = _mm256_set1_epi16(0);

  for (i = 0; i < block_size; i += 16) {
    // load 32 bytes from coeff and dqcoeff
    coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
    dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
    // dqcoeff - coeff
    dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
    // madd (dqcoeff - coeff)
    dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
    // madd coeff
    coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
    // expand each double word of madd (dqcoeff - coeff) to quad word
    exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
    exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
    // expand each double word of madd (coeff) to quad word
    exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
    exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
    // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
  }
  // save the higher 64 bit of each 128 bit lane
  sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
  ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
  // add the higher 64 bit to the low 64 bit
  sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
  ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);
  // add each 64 bit from each of the 128 bit lane of the 256 bit
  sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
                             _mm256_extractf128_si256(sse_reg, 1));
  ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
                             _mm256_extractf128_si256(ssz_reg, 1));
  // store the results
  _mm_storel_epi64((__m128i *)(&sse), sse_reg128);
  _mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
  return sse;
}
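/* For reference, a scalar model of what the routine above computes -- a
   sketch, not libvpx code: the return value is the sum of squared
   differences between dqcoeff and coeff, and *ssz is the sum of squared
   coeff values. */
#include <cstdint>
#include <cstddef>

int64_t block_error_scalar(const int16_t *coeff, const int16_t *dqcoeff,
                           intptr_t block_size, int64_t *ssz) {
    int64_t sse = 0, sz = 0;
    for (intptr_t i = 0; i < block_size; i++) {
        const int64_t diff = (int64_t)dqcoeff[i] - coeff[i];
        sse += diff * diff;                    // squared reconstruction error
        sz += (int64_t)coeff[i] * coeff[i];    // squared source coefficient
    }
    *ssz = sz;
    return sse;
}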
template <bool align, bool compensation> SIMD_INLINE __m256i MainRowX5x5(uint16_t * dst)
{
    __m256i t0 = _mm256_loadu_si256((__m256i*)(dst - 2));
    __m256i t1 = _mm256_loadu_si256((__m256i*)(dst - 1));
    __m256i t2 = Load<align>((__m256i*)dst);
    __m256i t3 = _mm256_loadu_si256((__m256i*)(dst + 1));
    __m256i t4 = _mm256_loadu_si256((__m256i*)(dst + 2));
    t2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_mullo_epi16(t2, K16_0006),
        _mm256_mullo_epi16(_mm256_add_epi16(t1, t3), K16_0004)),
        _mm256_add_epi16(t0, t4));
    return DivideBy256<compensation>(t2);
}
uint64_t avx2_count_byte(const uint8_t* data, size_t size, uint8_t byte) {
    const __m256i v = _mm256_set1_epi8(byte);

    const uint8_t* end = data + size;
    const uint8_t* ptr = data;

    __m256i global_sum = _mm256_setzero_si256();
    __m256i local_sum;

    // 1. blocks of 255 registers (the most an 8-bit counter can hold)
    while (ptr + 255*32 < end) {
        local_sum = _mm256_setzero_si256();

        // update 32 x 8-bit counter
        for (int i = 0; i < 255; i++, ptr += 32) {
            const __m256i in = _mm256_loadu_si256((const __m256i*)ptr);
            const __m256i eq = _mm256_cmpeq_epi8(in, v); // 0 or -1

            local_sum = _mm256_sub_epi8(local_sum, eq);
        }

        // update the global accumulator 2 x 64-bit
        const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256());
        global_sum = _mm256_add_epi64(global_sum, tmp);
    }

    // 2. tail of < 255 registers
    local_sum = _mm256_setzero_si256();
    while (ptr + 32 < end) {
        const __m256i in = _mm256_loadu_si256((const __m256i*)ptr);
        const __m256i eq = _mm256_cmpeq_epi8(in, v);

        local_sum = _mm256_sub_epi8(local_sum, eq);

        ptr += 32;
    }

    const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256());
    global_sum = _mm256_add_epi64(global_sum, tmp);

    // 3. process tail < 32 bytes
    uint64_t result = _mm256_extract_epi64(global_sum, 0)
                    + _mm256_extract_epi64(global_sum, 1)
                    + _mm256_extract_epi64(global_sum, 2)
                    + _mm256_extract_epi64(global_sum, 3);

    return result + scalar_count_bytes(ptr, end - ptr, byte);
}
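/* A plausible definition of the scalar helper referenced above -- an
   assumption, since the original scalar_count_bytes is not shown in this
   section: a plain byte loop over the remaining tail. */
#include <cstdint>
#include <cstddef>

uint64_t scalar_count_bytes(const uint8_t* data, size_t size, uint8_t byte) {
    uint64_t count = 0;
    for (size_t i = 0; i < size; i++)
        count += (data[i] == byte);  // branch-free accumulation
    return count;
}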
void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  __m256i sums[4];
  int i;
  const uint8_t *refs[4];

  refs[0] = ref_array[0];
  refs[1] = ref_array[1];
  refs[2] = ref_array[2];
  refs[3] = ref_array[3];
  sums[0] = _mm256_setzero_si256();
  sums[1] = _mm256_setzero_si256();
  sums[2] = _mm256_setzero_si256();
  sums[3] = _mm256_setzero_si256();

  for (i = 0; i < 64; i++) {
    __m256i r_lo[4], r_hi[4];
    // load 64 bytes from src and all ref[]
    const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr);
    const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32));
    r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
    r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32));
    r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
    r_hi[1] = _mm256_loadu_si256((const __m256i *)(refs[1] + 32));
    r_lo[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
    r_hi[2] = _mm256_loadu_si256((const __m256i *)(refs[2] + 32));
    r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
    r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32));

    // sum of the absolute differences between every ref[] to src
    r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo);
    r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo);
    r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo);
    r_lo[3] = _mm256_sad_epu8(r_lo[3], s_lo);
    r_hi[0] = _mm256_sad_epu8(r_hi[0], s_hi);
    r_hi[1] = _mm256_sad_epu8(r_hi[1], s_hi);
    r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi);
    r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi);

    // sum every ref[]
    sums[0] = _mm256_add_epi32(sums[0], r_lo[0]);
    sums[1] = _mm256_add_epi32(sums[1], r_lo[1]);
    sums[2] = _mm256_add_epi32(sums[2], r_lo[2]);
    sums[3] = _mm256_add_epi32(sums[3], r_lo[3]);
    sums[0] = _mm256_add_epi32(sums[0], r_hi[0]);
    sums[1] = _mm256_add_epi32(sums[1], r_hi[1]);
    sums[2] = _mm256_add_epi32(sums[2], r_hi[2]);
    sums[3] = _mm256_add_epi32(sums[3], r_hi[3]);

    src_ptr += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }

  calc_final(sums, sad_array);
}
void bitmask_avx2(uint32_t* ptr, size_t n, uint32_t key, uint8_t* out) {
    uint32_t* output = (uint32_t*)out;

    const size_t N = 8*4; // unrolled 4 times
    const size_t chunks = n / N;
    const size_t tail = n % N;

    const __m256i vkey = _mm256_set1_epi32(key);
    for (size_t i = 0; i < chunks; i++) {
        const __m256i in0 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 0*8));
        const __m256i in1 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 1*8));
        const __m256i in2 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 2*8));
        const __m256i in3 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 3*8));

        const __m256i eq0 = _mm256_cmpeq_epi32(in0, vkey);
        const __m256i eq1 = _mm256_cmpeq_epi32(in1, vkey);
        const __m256i eq2 = _mm256_cmpeq_epi32(in2, vkey);
        const __m256i eq3 = _mm256_cmpeq_epi32(in3, vkey);

        // eq0 = [a0 a1 a2 a3 a4 a5 a6 a7] (packed dword)
        // eq1 = [b0 b1 b2 b3 b4 b5 b6 b7] (packed dword)
        // eq2 = [c0 c1 c2 c3 c4 c5 c6 c7] (packed dword)
        // eq3 = [d0 d1 d2 d3 d4 d5 d6 d7] (packed dword)

        // t0 = [a0 a1 a2 a3 c0 c1 c2 c3 a4 a5 a6 a7 c4 c5 c6 c7] (packed word)
        const __m256i t0 = _mm256_packs_epi32(eq0, eq2);
        // m02 = [a0 a1 a2 a3 a4 a5 a6 a7 c0 c1 c2 c3 c4 c5 c6 c7] (packed word)
        const __m256i m02 = _mm256_permutevar8x32_epi32(t0,
                                _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7));

        // t1 = [b0 b1 b2 b3 d0 d1 d2 d3 b4 b5 b6 b7 d4 d5 d6 d7] (packed word)
        const __m256i t1 = _mm256_packs_epi32(eq1, eq3);
        // m13 = [b0 b1 b2 b3 b4 b5 b6 b7 d0 d1 d2 d3 d4 d5 d6 d7] (packed word)
        const __m256i m13 = _mm256_permutevar8x32_epi32(t1,
                                _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7));

        // m = [a0..7 b0..7 c0..7 d0..7] (packed byte)
        const __m256i m = _mm256_packs_epi16(m02, m13);

        *output++ = _mm256_movemask_epi8(m);
    }

    if (tail > 0) {
        // each chunk of N inputs emits N/8 mask bytes, so the tail's output
        // starts chunks*N/8 bytes into `out` (the flat chunks*N offset would
        // overshoot by a factor of 8)
        bitmask_better_2(ptr + chunks*N, tail, key, out + chunks*N/8);
    }
}
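/* A scalar stand-in for the tail handler -- an assumption: the real
   bitmask_better_2 is another vectorized variant that is not shown in this
   section. It packs one bit per element, least-significant bit first within
   each output byte, matching the _mm256_movemask_epi8 ordering above. */
#include <cstdint>
#include <cstddef>

void bitmask_scalar(const uint32_t* ptr, size_t n, uint32_t key, uint8_t* out) {
    for (size_t i = 0; i < n; i++) {
        if (i % 8 == 0) out[i / 8] = 0;                       // clear each new byte
        out[i / 8] |= (uint8_t)((ptr[i] == key) << (i % 8));  // set bit i mod 8
    }
}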
void av1_highbd_quantize_fp_avx2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, int log_scale) {
  (void)scan;
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  const unsigned int step = 8;
  __m256i qp[3], coeff;

  init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp);
  coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);

  __m256i eob = _mm256_setzero_si256();
  quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);

  coeff_ptr += step;
  qcoeff_ptr += step;
  dqcoeff_ptr += step;
  iscan += step;
  n_coeffs -= step;

  update_qp(qp);

  while (n_coeffs > 0) {
    coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
    quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);

    coeff_ptr += step;
    qcoeff_ptr += step;
    dqcoeff_ptr += step;
    iscan += step;
    n_coeffs -= step;
  }
  {
    __m256i eob_s;
    eob_s = _mm256_shuffle_epi32(eob, 0xe);
    eob = _mm256_max_epi16(eob, eob_s);
    eob_s = _mm256_shufflelo_epi16(eob, 0xe);
    eob = _mm256_max_epi16(eob, eob_s);
    eob_s = _mm256_shufflelo_epi16(eob, 1);
    eob = _mm256_max_epi16(eob, eob_s);
    const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
                                            _mm256_extractf128_si256(eob, 1));
    *eob_ptr = _mm_extract_epi16(final_eob, 0);
  }
}
inline void scatter(double *ptr, const int *offsets) const
{
    __m256i indices;
    indices = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(offsets));
    _mm512_i32scatter_pd(ptr, indices, val, 8);
}
// Computes part of matrix.vector v = Wu. Computes N=8 results.
// For details see PartialMatrixDotVector64 with N=8.
static void PartialMatrixDotVector8(const int8_t* wi, const double* scales,
                                    const int8_t* u, int num_in, int num_out,
                                    double* v) {
  // Register containing 16-bit ones for horizontal add with 16->32 bit
  // conversion.
  __m256i ones =
      _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
  // Initialize all the results to 0.
  __m256i result0 = _mm256_setzero_si256();
  // Iterate over the input (u), one registerful at a time.
  for (int j = 0; j < num_in;) {
    __m256i inputs =
        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(u + j));
    // Inputs are processed in groups of kNumInputsPerGroup, replicated
    // kNumInputGroups times.
    for (int ig = 0; ig < kNumInputGroups && j < num_in;
         ++ig, j += kNumInputsPerGroup) {
      // Replicate the low 32 bits (4 inputs) 8 times.
      __m256i rep_input =
          _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
      __m256i weights, reps;
      // Mul-add, with horizontal add of the 4 inputs to each of the results.
      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
    }
  }
  ExtractResults(result0, shift_id, wi, scales, num_out, v);
}
inline void gather(const double *ptr, const int *offsets)
{
    __m256i indices;
    indices = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(offsets));
    val = _mm512_i32gather_pd(indices, ptr, 8);
}
int32_t avx2_sumsignedbytes(int8_t* array, size_t size) {
    __m256i accumulator = _mm256_setzero_si256();

    for (size_t i = 0; i < size; i += 32) {
        const __m256i v = _mm256_loadu_si256((__m256i*)(array + i));
        const __m128i lo = _mm256_extracti128_si256(v, 0);
        const __m128i hi = _mm256_extracti128_si256(v, 1);

        const __m256i t0 = _mm256_cvtepi8_epi32(lo);
        const __m256i t1 = _mm256_cvtepi8_epi32(hi);
        const __m256i t2 = _mm256_cvtepi8_epi32(_mm_bsrli_si128(lo, 8));
        const __m256i t3 = _mm256_cvtepi8_epi32(_mm_bsrli_si128(hi, 8));

        accumulator = _mm256_add_epi32(accumulator, t0);
        accumulator = _mm256_add_epi32(accumulator, t1);
        accumulator = _mm256_add_epi32(accumulator, t2);
        accumulator = _mm256_add_epi32(accumulator, t3);
    }

    return int32_t(_mm256_extract_epi32(accumulator, 0)) +
           int32_t(_mm256_extract_epi32(accumulator, 1)) +
           int32_t(_mm256_extract_epi32(accumulator, 2)) +
           int32_t(_mm256_extract_epi32(accumulator, 3)) +
           int32_t(_mm256_extract_epi32(accumulator, 4)) +
           int32_t(_mm256_extract_epi32(accumulator, 5)) +
           int32_t(_mm256_extract_epi32(accumulator, 6)) +
           int32_t(_mm256_extract_epi32(accumulator, 7));
}
int32_t avx2_sumsignedbytes_variant2(int8_t* array, size_t size) {
    __m256i accumulator = _mm256_setzero_si256();

    for (size_t i = 0; i < size; i += 32) {
        const __m256i v = _mm256_loadu_si256((__m256i*)(array + i));

        // sign-extend each byte of every 32-bit word: a logical left shift
        // moves the byte into the top position, then an arithmetic right
        // shift by 24 brings it back down with the sign replicated
        const __m256i v0 = _mm256_srai_epi32(v, 3*8);
        const __m256i v1 = _mm256_srai_epi32(_mm256_slli_epi32(v, 1*8), 3*8);
        const __m256i v2 = _mm256_srai_epi32(_mm256_slli_epi32(v, 2*8), 3*8);
        const __m256i v3 = _mm256_srai_epi32(_mm256_slli_epi32(v, 3*8), 3*8);

        accumulator = _mm256_add_epi32(accumulator, v0);
        accumulator = _mm256_add_epi32(accumulator, v1);
        accumulator = _mm256_add_epi32(accumulator, v2);
        accumulator = _mm256_add_epi32(accumulator, v3);
    }

    return int32_t(_mm256_extract_epi32(accumulator, 0)) +
           int32_t(_mm256_extract_epi32(accumulator, 1)) +
           int32_t(_mm256_extract_epi32(accumulator, 2)) +
           int32_t(_mm256_extract_epi32(accumulator, 3)) +
           int32_t(_mm256_extract_epi32(accumulator, 4)) +
           int32_t(_mm256_extract_epi32(accumulator, 5)) +
           int32_t(_mm256_extract_epi32(accumulator, 6)) +
           int32_t(_mm256_extract_epi32(accumulator, 7));
}
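/* A scalar reference that both sumsignedbytes variants above should agree
   with -- a sketch; the vector loops additionally assume size is a multiple
   of 32. */
#include <cstdint>
#include <cstddef>

int32_t scalar_sumsignedbytes(int8_t* array, size_t size) {
    int32_t sum = 0;
    for (size_t i = 0; i < size; i++)
        sum += array[i];  // widen each signed byte and accumulate
    return sum;
}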
/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
static void unshuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
                            const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 2;
  size_t i;
  int j;
  __m256i ymm0[2], ymm1[2];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Load 32 elements (64 bytes) into 2 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 2; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
    }
    /* Compute the low 64 bytes */
    ymm1[0] = _mm256_unpacklo_epi8(ymm0[0], ymm0[1]);
    /* Compute the hi 64 bytes */
    ymm1[1] = _mm256_unpackhi_epi8(ymm0[0], ymm0[1]);
    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[1]);
  }
}
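/* Scalar model of the 2-byte unshuffle above -- a sketch of the byte-plane
   layout as I read the code: byte j of element i is read from plane j at
   index i. The function name is illustrative. */
#include <cstdint>
#include <cstddef>

void unshuffle2_scalar(uint8_t* dest, const uint8_t* src, size_t elements) {
    for (size_t i = 0; i < elements; i++) {
        dest[2 * i + 0] = src[0 * elements + i];  // low bytes live in plane 0
        dest[2 * i + 1] = src[1 * elements + i];  // high bytes live in plane 1
    }
}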
bool is_sorted_avx2(int32_t* a, size_t n) {
    const __m256i shuffle_pattern = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 7);
    size_t i = 0;
    // i + 8 < n avoids the unsigned wraparound that `i < n - 8` would hit
    // for n < 8, and still keeps the 8-element load in bounds
    while (i + 8 < n) {
        // curr = [ a0 | a1 | a2 | a3 | a4 | a5 | a6 | a7 ]
        const __m256i curr = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i));
        // next = [ a1 | a2 | a3 | a4 | a5 | a6 | a7 | a7 ]
        const __m256i next = _mm256_permutevar8x32_epi32(curr, shuffle_pattern);

        // Note: the last element of curr and next is a7, thus for this element
        // the comparison result is always zero.
        //
        // In fact, only the first 7 elements are being tested.
        const __m256i mask = _mm256_cmpgt_epi32(curr, next);
        if (!_mm256_testz_si256(mask, mask)) {
            return false;
        }

        i += 7;
    }

    for (/**/; i + 1 < n; i++) {
        if (a[i] > a[i + 1])
            return false;
    }

    return true;
}
void* xmemchr(const void* src, int c, size_t n) {
    if (n < 32) {
        return xmemchr_tiny(src, c, n);
    }

    __m256i ymm0 = _mm256_set1_epi8((char)c), ymm1;
    int mask;
    size_t rem = n % 32;
    n /= 32;

    for (size_t i = 0; i < n; i++) {
        ymm1 = _mm256_loadu_si256((const __m256i*)src + i);
        ymm1 = _mm256_cmpeq_epi8(ymm1, ymm0);
        mask = _mm256_movemask_epi8(ymm1);
        if (mask) {
            __asm__("bsfl %0, %0\n\t"
                    : "=r"(mask)
                    : "0"(mask));
            return (void*)((unsigned long)((const __m256i*)src + i) + mask);
        }
    }

    // n now counts 32-byte blocks, so the tail starts n * 32 bytes in
    // (adding just n would land inside the region already scanned)
    return xmemchr_tiny((const void*)((unsigned long)src + n * 32), c, rem);
}
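/* A plausible definition of the scalar fallback used above -- an assumption,
   since xmemchr_tiny is not shown in this section: a plain byte scan over
   fewer than 32 bytes (or the tail). */
#include <cstddef>

void* xmemchr_tiny(const void* src, int c, size_t n) {
    const unsigned char* p = (const unsigned char*)src;
    for (size_t i = 0; i < n; i++)
        if (p[i] == (unsigned char)c)
            return (void*)(p + i);
    return nullptr;
}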
template <> SIMD_INLINE __m256i ReduceColTail<false>(const uint8_t * src)
{
    const __m256i t0 = _mm256_loadu_si256((__m256i*)(src - 1));
    __m256i t1, t2;
    LoadAfterLast<false, 1>(src - 1, t1, t2);
    return BinomialSum16(t0, t2);
}
/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_AVX(void* in, void* out, const size_t size,
                                 const size_t elem_size) {
    size_t ii, kk;
    char* in_b = (char*) in;
    char* out_b = (char*) out;
    int32_t* out_i32;

    size_t nbyte = elem_size * size;

    int64_t count;

    __m256i ymm;
    int32_t bt;

    for (ii = 0; ii + 31 < nbyte; ii += 32) {
        ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]);
        for (kk = 0; kk < 8; kk++) {
            bt = _mm256_movemask_epi8(ymm);
            ymm = _mm256_slli_epi16(ymm, 1);
            out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
            *out_i32 = bt;
        }
    }

    count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
                                           nbyte - nbyte % 32);
    return count;
}
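/* Scalar model of the bit transpose above -- a sketch that I believe matches
   the vector code: bit p of input byte k lands at bit (p * nbyte + k) of the
   output bit-stream, packed LSB-first within each output byte. Assumes nbyte
   is a multiple of 8; the function name is illustrative. */
#include <cstdint>
#include <cstddef>

void trans_bit_byte_scalar(const uint8_t* in, uint8_t* out, size_t nbyte) {
    for (size_t i = 0; i < nbyte; i++) out[i] = 0;
    for (size_t k = 0; k < nbyte; k++) {
        for (int p = 0; p < 8; p++) {
            const size_t obit = (size_t)p * nbyte + k;  // destination bit index
            out[obit / 8] |= (uint8_t)(((in[k] >> p) & 1) << (obit % 8));
        }
    }
}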
/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_AVX(void* in, void* out, const size_t size,
                                        const size_t elem_size) {
    CHECK_MULT_EIGHT(size);

    // With a bit of care, this could be written such that it is
    // in_buf = out_buf safe.
    char* in_b = (char*) in;
    char* out_b = (char*) out;

    size_t ii, jj, kk;
    size_t nbyte = elem_size * size;

    __m256i ymm;
    int32_t bt;

    if (elem_size % 4) {
        return bshuf_shuffle_bit_eightelem_SSE(in, out, size, elem_size);
    } else {
        for (jj = 0; jj + 31 < 8 * elem_size; jj += 32) {
            for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) {
                ymm = _mm256_loadu_si256((__m256i *) &in_b[ii + jj]);
                for (kk = 0; kk < 8; kk++) {
                    bt = _mm256_movemask_epi8(ymm);
                    ymm = _mm256_slli_epi16(ymm, 1);
                    size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
                    * (int32_t *) &out_b[ind] = bt;
                }
            }
        }
    }
    return size * elem_size;
}
static INLINE void hadamard_16x16_avx2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
#if CONFIG_VP9_HIGHBITDEPTH
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 2; ++idx) {
    const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
    hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
  }

  for (idx = 0; idx < 64; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);

    b0 = _mm256_srai_epi16(b0, 1);
    b1 = _mm256_srai_epi16(b1, 1);
    b2 = _mm256_srai_epi16(b2, 1);
    b3 = _mm256_srai_epi16(b3, 1);
    if (is_final) {
      store_tran_low(_mm256_add_epi16(b0, b2), coeff);
      store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
      store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
      store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
      coeff += 16;
    } else {
      _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3));
      _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3));
      coeff16 += 16;
    }
    t_coeff += 16;
  }
}
void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  int i;
  const uint8_t *refs[4];
  __m256i sums[4];

  refs[0] = ref_array[0];
  refs[1] = ref_array[1];
  refs[2] = ref_array[2];
  refs[3] = ref_array[3];
  sums[0] = _mm256_setzero_si256();
  sums[1] = _mm256_setzero_si256();
  sums[2] = _mm256_setzero_si256();
  sums[3] = _mm256_setzero_si256();

  for (i = 0; i < 32; i++) {
    __m256i r[4];

    // load src and all ref[]
    const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
    r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
    r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
    r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);

    // sum of the absolute differences between every ref[] to src
    r[0] = _mm256_sad_epu8(r[0], s);
    r[1] = _mm256_sad_epu8(r[1], s);
    r[2] = _mm256_sad_epu8(r[2], s);
    r[3] = _mm256_sad_epu8(r[3], s);

    // sum every ref[]
    sums[0] = _mm256_add_epi32(sums[0], r[0]);
    sums[1] = _mm256_add_epi32(sums[1], r[1]);
    sums[2] = _mm256_add_epi32(sums[2], r[2]);
    sums[3] = _mm256_add_epi32(sums[3], r[3]);

    src_ptr += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }

  calc_final(sums, sad_array);
}
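/* The reduction helper calc_final is referenced by both x4d routines but not
   shown in this section. A sketch of what it must do, assuming each sums[i]
   holds four zero-padded 64-bit partial SADs as produced by _mm256_sad_epu8:
   reduce each of the four accumulators to one 32-bit total. */
static INLINE void calc_final_sketch(const __m256i *const sums,
                                     uint32_t sad_array[4]) {
  // hadd folds the zero-padded 64-bit partials back into 32-bit slots
  const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
  const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
  const __m256i t2 = _mm256_hadd_epi32(t0, t1);
  // add the two 128-bit lanes: lane sums land in order [ref0 ref1 ref2 ref3]
  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
                                    _mm256_extractf128_si256(t2, 1));
  _mm_storeu_si128((__m128i *)sad_array, sum);
}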
void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
#if CONFIG_VP9_HIGHBITDEPTH
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage.  Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    // src_diff: 9 bit, dynamic range [-255, 255]
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_avx2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

  for (idx = 0; idx < 256; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);

    b0 = _mm256_srai_epi16(b0, 2);
    b1 = _mm256_srai_epi16(b1, 2);
    b2 = _mm256_srai_epi16(b2, 2);
    b3 = _mm256_srai_epi16(b3, 2);

    store_tran_low(_mm256_add_epi16(b0, b2), coeff);
    store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
    store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512);
    store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);

    coeff += 16;
    t_coeff += 16;
  }
}
/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
static void shuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
                           const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 16;
  size_t j;
  int k, l;
  __m256i ymm0[16], ymm1[16];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered
     from most to least significant (i.e., their order is reversed when
     compared to loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
    for (k = 0; k < 16; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
    }
    /* Transpose bytes */
    for (k = 0, l = 0; k < 8; k++, l += 2) {
      ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]);
      ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]);
    }
    /* Transpose words */
    for (k = 0, l = -2; k < 8; k++, l++) {
      if ((k%2) == 0) l += 2;
      ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]);
      ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]);
    }
    /* Transpose double words */
    for (k = 0, l = -4; k < 8; k++, l++) {
      if ((k%4) == 0) l += 4;
      ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]);
      ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]);
    }
    /* Transpose quad words */
    for (k = 0; k < 8; k++) {
      ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]);
      ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]);
    }
    for (k = 0; k < 16; k++) {
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
      ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 16; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
    }
  }
}
void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
                                    ptrdiff_t src_stride, tran_low_t *coeff) {
  int idx;
  tran_low_t *t_coeff = coeff;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));

    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);

    b0 = _mm256_srai_epi32(b0, 2);
    b1 = _mm256_srai_epi32(b1, 2);
    b2 = _mm256_srai_epi32(b2, 2);
    b3 = _mm256_srai_epi32(b3, 2);

    coeff0 = _mm256_add_epi32(b0, b2);
    coeff1 = _mm256_add_epi32(b1, b3);
    coeff2 = _mm256_sub_epi32(b0, b2);
    coeff3 = _mm256_sub_epi32(b1, b3);

    _mm256_storeu_si256((__m256i *)coeff, coeff0);
    _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
    _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
    _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);

    coeff += 8;
    t_coeff += 8;
  }
}
static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *ref_ptr, int ref_stride,
                                     const int h, const uint8_t *second_pred,
                                     const int second_pred_stride) {
  int i, res;
  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
  __m256i sum_sad = _mm256_setzero_si256();
  __m256i sum_sad_h;
  __m128i sum_sad128;
  for (i = 0; i < h; i++) {
    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
    ref1_reg = _mm256_avg_epu8(
        ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));
    ref2_reg = _mm256_avg_epu8(
        ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32)));
    sad1_reg = _mm256_sad_epu8(
        ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
    sad2_reg = _mm256_sad_epu8(
        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
    ref_ptr += ref_stride;
    src_ptr += src_stride;
    second_pred += second_pred_stride;
  }
  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
  res = _mm_cvtsi128_si32(sum_sad128);
  return res;
}
/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
static void unshuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
                            const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 8;
  size_t i;
  int j;
  __m256i ymm0[8], ymm1[8];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (256 bytes) into 8 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 8; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[4+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Shuffle words */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm0[4+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
    }
    for (j = 0; j < 8; j++) {
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
      /* Compute the hi 32 bytes */
      ymm1[4+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
  }
}
static void
avx_test (void)
{
  int s1i[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  int s2i[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  int d;
  int e;
  int i;
  union256i_d s1, s2;

  s1.x = _mm256_loadu_si256 ((__m256i*)s1i);
  s2.x = _mm256_loadu_si256 ((__m256i*)s2i);
  d = _mm256_testc_si256 (s1.x, s2.x);

  e = 1;
  for (i = 0; i < 8; i++)
    if ((~s1i[i] & s2i[i]) != 0)
      e = 0;

  if (d != e)
    abort ();
}
bool is_sorted_avx2_unrolled4(int32_t* a, size_t n) {
    const __m256i shuffle_pattern = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 7);
    size_t i = 0;
    // i + (4*7 + 1) < n avoids the unsigned wraparound that `i < n - (4*7 + 1)`
    // would hit for small n, and keeps the furthest 8-element load in bounds
    while (i + (4*7 + 1) < n) {
        const __m256i curr0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i + 0*7));
        const __m256i curr1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i + 1*7));
        const __m256i curr2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i + 2*7));
        const __m256i curr3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i + 3*7));

        const __m256i next0 = _mm256_permutevar8x32_epi32(curr0, shuffle_pattern);
        const __m256i next1 = _mm256_permutevar8x32_epi32(curr1, shuffle_pattern);
        const __m256i next2 = _mm256_permutevar8x32_epi32(curr2, shuffle_pattern);
        const __m256i next3 = _mm256_permutevar8x32_epi32(curr3, shuffle_pattern);

        const __m256i mask0 = _mm256_cmpgt_epi32(curr0, next0);
        const __m256i mask1 = _mm256_cmpgt_epi32(curr1, next1);
        const __m256i mask2 = _mm256_cmpgt_epi32(curr2, next2);
        const __m256i mask3 = _mm256_cmpgt_epi32(curr3, next3);

        const __m256i mask = _mm256_or_si256(mask0,
                             _mm256_or_si256(mask1,
                             _mm256_or_si256(mask2, mask3)));

        if (!_mm256_testz_si256(mask, mask)) {
            return false;
        }

        i += 7*4;
    }

    for (/**/; i + 1 < n; i++) {
        if (a[i] > a[i + 1])
            return false;
    }

    return true;
}
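/* An illustrative cross-check of the two predicates above -- a sketch,
   assuming both functions live in the same translation unit and are compiled
   with AVX2 enabled; the data values are arbitrary. */
#include <cstdint>
#include <cstddef>
#include <cassert>
#include <vector>
#include <algorithm>

int main() {
    std::vector<int32_t> v(1000);
    for (size_t i = 0; i < v.size(); i++) v[i] = int32_t(i / 3);  // non-decreasing

    assert(is_sorted_avx2(v.data(), v.size()));
    assert(is_sorted_avx2_unrolled4(v.data(), v.size()));
    assert(std::is_sorted(v.begin(), v.end()));  // scalar cross-check

    v[500] = -1;  // break the order
    assert(!is_sorted_avx2(v.data(), v.size()));
    assert(!is_sorted_avx2_unrolled4(v.data(), v.size()));
    return 0;
}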