// Computes part of matrix.vector v = Wu. Computes N=16 results.
// For details see PartialMatrixDotVector64 with N=16.
static void PartialMatrixDotVector16(const int8_t* wi, const double* scales,
                                     const int8_t* u, int num_in, int num_out,
                                     double* v) {
  // Register containing 16-bit ones for horizontal add with 16->32 bit
  // conversion.
  __m256i ones =
      _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
  // Initialize all the results to 0.
  __m256i result0 = _mm256_setzero_si256();
  __m256i result1 = _mm256_setzero_si256();
  // Iterate over the input (u), one registerful at a time.
  for (int j = 0; j < num_in;) {
    __m256i inputs =
        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(u + j));
    // Inputs are processed in groups of kNumInputsPerGroup, replicated
    // kNumInputGroups times.
    for (int ig = 0; ig < kNumInputGroups && j < num_in;
         ++ig, j += kNumInputsPerGroup) {
      // Replicate the low 32 bits (4 inputs) 8 times.
      __m256i rep_input =
          _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
      __m256i weights, reps;
      // Mul-add, with horizontal add of the 4 inputs to each of the results.
      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
    }
  }
  ExtractResults(result0, shift_id, wi, scales, kNumOutputsPerRegister, v);
  num_out -= kNumOutputsPerRegister;
  ExtractResults(result1, shift_id, wi, scales,
                 std::min(kNumOutputsPerRegister, num_out), v);
}
/**
 * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder
 * receive Blake2b's IV as per Blake2b's specification. <b>Note:</b> Even though sponges
 * typically have their internal state initialized with zeros, Blake2b's G function
 * has a fixed point: if the internal state and message are both filled with zeros, the
 * resulting permutation will always be a block filled with zeros; this happens because
 * Blake2b does not use the constants originally employed in Blake2 inside its G function,
 * relying on the IV for avoiding possible fixed points.
 *
 * @param state The 1024-bit array to be initialized
 */
inline void initState(uint64_t state[/*16*/]) {
#ifdef __AVX2__
    (*(__m256i*)(&state[0])) = _mm256_setzero_si256();
    (*(__m256i*)(&state[4])) = _mm256_setzero_si256();
    (*(__m256i*)(&state[8])) = _mm256_set_epi64x(blake2b_IV[3], blake2b_IV[2],
                                                 blake2b_IV[1], blake2b_IV[0]);
    (*(__m256i*)(&state[12])) = _mm256_set_epi64x(blake2b_IV[7], blake2b_IV[6],
                                                  blake2b_IV[5], blake2b_IV[4]);
    //AVX is around the same number of instructions as unoptimized
    //#elif defined __AVX__
#else
    //First 512 bits are zeros
    memset(state, 0, 64);
    //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV
    state[8] = blake2b_IV[0];
    state[9] = blake2b_IV[1];
    state[10] = blake2b_IV[2];
    state[11] = blake2b_IV[3];
    state[12] = blake2b_IV[4];
    state[13] = blake2b_IV[5];
    state[14] = blake2b_IV[6];
    state[15] = blake2b_IV[7];
#endif
}
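/* blake2b_IV is referenced above but defined elsewhere in the source. For
 * reference, these are the standard Blake2b initialization-vector words
 * (the SHA-512 IV, per RFC 7693); shown here only as a reminder of what the
 * array is assumed to contain: */
static const uint64_t blake2b_IV[8] = {
    0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
    0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
    0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
    0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};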
template <bool align> void SquaredDifferenceSum(
    const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
    size_t width, size_t height, uint64_t * sum)
{
    assert(width < 0x10000);
    if(align)
    {
        assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride));
    }

    size_t bodyWidth = AlignLo(width, A);
    __m256i tailMask = SetMask<uint8_t>(0, A - width + bodyWidth, 0xFF);
    __m256i fullSum = _mm256_setzero_si256();
    for(size_t row = 0; row < height; ++row)
    {
        __m256i rowSum = _mm256_setzero_si256();
        for(size_t col = 0; col < bodyWidth; col += A)
        {
            const __m256i a_ = Load<align>((__m256i*)(a + col));
            const __m256i b_ = Load<align>((__m256i*)(b + col));
            rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_));
        }
        if(width - bodyWidth)
        {
            const __m256i a_ = _mm256_and_si256(tailMask, Load<false>((__m256i*)(a + width - A)));
            const __m256i b_ = _mm256_and_si256(tailMask, Load<false>((__m256i*)(b + width - A)));
            rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_));
        }
        fullSum = _mm256_add_epi64(fullSum, HorizontalSum32(rowSum));
        a += aStride;
        b += bStride;
    }
    *sum = ExtractSum<uint64_t>(fullSum);
}
void test_primates() {

    ////////////////////TEST 80 BIT//////////////
    //Prepare test vectors
    YMM YMM_p1_input_80bit[5][2];
    for (int i = 0; i < 5; i++) {
        YMM_p1_input_80bit[i][0] = _mm256_setzero_si256();
        YMM_p1_input_80bit[i][1] = _mm256_setzero_si256();
    }

    //use test vectors
    p1(YMM_p1_input_80bit);
    p1_inv(YMM_p1_input_80bit);

    //test if vectors are zero again... Don't test last 192 bits of section 2,
    //as they are not part of the state (and sub elements turn the 0s there to 1s...)
    if (_mm256_extract_epi64(YMM_p1_input_80bit[0][0], 0) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[0][0], 1) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[0][0], 2) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[0][0], 3) != 0 ||

        _mm256_extract_epi64(YMM_p1_input_80bit[1][0], 0) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[1][0], 1) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[1][0], 2) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[1][0], 3) != 0 ||

        _mm256_extract_epi64(YMM_p1_input_80bit[2][0], 0) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[2][0], 1) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[2][0], 2) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[2][0], 3) != 0 ||

        _mm256_extract_epi64(YMM_p1_input_80bit[3][0], 0) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[3][0], 1) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[3][0], 2) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[3][0], 3) != 0 ||

        _mm256_extract_epi64(YMM_p1_input_80bit[4][0], 0) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[4][0], 1) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[4][0], 2) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[4][0], 3) != 0 ||

        _mm256_extract_epi64(YMM_p1_input_80bit[0][1], 0) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[1][1], 0) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[2][1], 0) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[3][1], 0) != 0 ||
        _mm256_extract_epi64(YMM_p1_input_80bit[4][1], 0) != 0) {
        printf("P1 inv not working \n");
    }
}
void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  __m256i sums[4];
  int i;
  const uint8_t *refs[4];

  refs[0] = ref_array[0];
  refs[1] = ref_array[1];
  refs[2] = ref_array[2];
  refs[3] = ref_array[3];
  sums[0] = _mm256_setzero_si256();
  sums[1] = _mm256_setzero_si256();
  sums[2] = _mm256_setzero_si256();
  sums[3] = _mm256_setzero_si256();

  for (i = 0; i < 64; i++) {
    __m256i r_lo[4], r_hi[4];
    // load 64 bytes from src and all ref[]
    const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr);
    const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32));
    r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
    r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32));
    r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
    r_hi[1] = _mm256_loadu_si256((const __m256i *)(refs[1] + 32));
    r_lo[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
    r_hi[2] = _mm256_loadu_si256((const __m256i *)(refs[2] + 32));
    r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
    r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32));

    // sum of the absolute differences between every ref[] to src
    r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo);
    r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo);
    r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo);
    r_lo[3] = _mm256_sad_epu8(r_lo[3], s_lo);
    r_hi[0] = _mm256_sad_epu8(r_hi[0], s_hi);
    r_hi[1] = _mm256_sad_epu8(r_hi[1], s_hi);
    r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi);
    r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi);

    // sum every ref[]
    sums[0] = _mm256_add_epi32(sums[0], r_lo[0]);
    sums[1] = _mm256_add_epi32(sums[1], r_lo[1]);
    sums[2] = _mm256_add_epi32(sums[2], r_lo[2]);
    sums[3] = _mm256_add_epi32(sums[3], r_lo[3]);
    sums[0] = _mm256_add_epi32(sums[0], r_hi[0]);
    sums[1] = _mm256_add_epi32(sums[1], r_hi[1]);
    sums[2] = _mm256_add_epi32(sums[2], r_hi[2]);
    sums[3] = _mm256_add_epi32(sums[3], r_hi[3]);

    src_ptr += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }

  calc_final(sums, sad_array);
}
/* dot products: d1={dot(a1,b1),dot(a1,b2)},d2={dot(a2,b1),dot(a2,b2)} ---------
* args   : short  *a1       I   input short array
*          short  *a2       I   input short array
*          short  *b1       I   input short array
*          short  *b2       I   input short array
*          int    n         I   number of input data
*          double *d1       O   output double array (2 x 1)
*          double *d2       O   output double array (2 x 1)
* return : none
*-----------------------------------------------------------------------------*/
extern void dot_22(const short *a1, const short *a2, const short *b1,
                   const short *b2, int n, double *d1, double *d2)
{
    const short *p1=a1,*p2=a2,*q1=b1,*q2=b2;

#if defined(AVX2_ENABLE)
    __m256i xmm1,xmm2,xmm3,xmm4;

    n=16*(int)ceil((double)n/16); /* modification to multiples of 16 */

    xmm1=_mm256_setzero_si256();
    xmm2=_mm256_setzero_si256();
    xmm3=_mm256_setzero_si256();
    xmm4=_mm256_setzero_si256();

    for (;p1<a1+n;p1+=16,p2+=16,q1+=16,q2+=16) {
        MULADD_INT16_AVX(xmm1,p1,q1);
        MULADD_INT16_AVX(xmm2,p1,q2);
        MULADD_INT16_AVX(xmm3,p2,q1);
        MULADD_INT16_AVX(xmm4,p2,q2);
    }
    SUM_INT32_AVX(d1[0],xmm1);
    SUM_INT32_AVX(d1[1],xmm2);
    SUM_INT32_AVX(d2[0],xmm3);
    SUM_INT32_AVX(d2[1],xmm4);

#elif defined(SSE2_ENABLE)
    __m128i xmm1,xmm2,xmm3,xmm4;

    n=8*(int)ceil((double)n/8); /* modification to multiples of 8 */

    xmm1=_mm_setzero_si128();
    xmm2=_mm_setzero_si128();
    xmm3=_mm_setzero_si128();
    xmm4=_mm_setzero_si128();

    for (;p1<a1+n;p1+=8,p2+=8,q1+=8,q2+=8) {
        MULADD_INT16(xmm1,p1,q1);
        MULADD_INT16(xmm2,p1,q2);
        MULADD_INT16(xmm3,p2,q1);
        MULADD_INT16(xmm4,p2,q2);
    }
    SUM_INT32(d1[0],xmm1);
    SUM_INT32(d1[1],xmm2);
    SUM_INT32(d2[0],xmm3);
    SUM_INT32(d2[1],xmm4);
#else
    d1[0]=d1[1]=d2[0]=d2[1]=0.0;

    for (;p1<a1+n;p1++,p2++,q1++,q2++) {
        d1[0]+=(*p1)*(*q1);
        d1[1]+=(*p1)*(*q2);
        d2[0]+=(*p2)*(*q1);
        d2[1]+=(*p2)*(*q2);
    }
#endif
}
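/* MULADD_INT16_AVX() and SUM_INT32_AVX() (and their SSE2 counterparts) are
 * macros defined elsewhere in the library. A minimal sketch of what they are
 * assumed to do -- multiply-accumulate 16 int16 pairs into 8 int32 lanes with
 * vpmaddwd, then reduce those lanes into a double -- purely for illustration: */
#define MULADD_INT16_AVX(acc,p,q) \
    ((acc)=_mm256_add_epi32((acc), \
        _mm256_madd_epi16(_mm256_loadu_si256((const __m256i *)(p)), \
                          _mm256_loadu_si256((const __m256i *)(q)))))

#define SUM_INT32_AVX(d,acc) do { \
    int32_t buf_[8]; \
    _mm256_storeu_si256((__m256i *)buf_,(acc)); \
    (d)=(double)buf_[0]+buf_[1]+buf_[2]+buf_[3]+buf_[4]+buf_[5]+buf_[6]+buf_[7]; \
} while (0)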
inline void avx2_hexid_to_uv_ccw(const __m256i hexid, __m256i& u, __m256i& v)
{
  // if(hexid==0) { u = v = 0; return; }
  // unsigned ringid;
  // unsigned segid;
  // unsigned runid;
  // positive_hexid_to_ringid_segid_runid(hexid, ringid, segid, runid);
  // switch(segid)
  // {
  //   case 0: u = ringid-runid; v = runid;        break;
  //   case 1: u = -runid;       v = ringid;       break;
  //   case 2: u = -ringid;      v = ringid-runid; break;
  //   case 3: u = runid-ringid; v = -runid;       break;
  //   case 4: u = runid;        v = -ringid;      break;
  //   case 5: u = ringid;       v = runid-ringid; break;
  //   default: assert(0);
  // }
  const __m256i one = _mm256_set1_epi32(1);
  __m256i ringid = avx2_positive_hexid_to_ringid(hexid);
  __m256i iring = _mm256_sub_epi32(hexid,
    avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid,one)));

  u = ringid;
  v = _mm256_setzero_si256();

  __m256i irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  v = _mm256_add_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  v = _mm256_sub_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_add_epi32(u, irun);
  v = _mm256_sub_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_add_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  v = _mm256_add_epi32(v, iring);

  const __m256i mask = _mm256_cmpeq_epi32(hexid, _mm256_setzero_si256());
  u = _mm256_andnot_si256(mask, u);
  v = _mm256_andnot_si256(mask, v);
}
uint64_t avx2_count_byte(const uint8_t* data, size_t size, uint8_t byte) {

    const __m256i v = _mm256_set1_epi8(byte);

    const uint8_t* end = data + size;
    const uint8_t* ptr = data;

    __m256i global_sum = _mm256_setzero_si256();
    __m256i local_sum;

    // 1. blocks of 256 registers
    while (ptr + 255*32 < end) {
        local_sum = _mm256_setzero_si256();

        // update 32 x 8-bit counter
        for (int i=0; i < 255; i++, ptr += 32) {
            const __m256i in = _mm256_loadu_si256((const __m256i*)ptr);
            const __m256i eq = _mm256_cmpeq_epi8(in, v); // 0 or -1

            local_sum = _mm256_sub_epi8(local_sum, eq);
        }

        // update the global accumulator 2 x 64-bit
        const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256());
        global_sum = _mm256_add_epi64(global_sum, tmp);
    }

    // 2. tail of < 256 registers
    local_sum = _mm256_setzero_si256();
    while (ptr + 32 < end) {
        const __m256i in = _mm256_loadu_si256((const __m256i*)ptr);
        const __m256i eq = _mm256_cmpeq_epi8(in, v);

        local_sum = _mm256_sub_epi8(local_sum, eq);

        ptr += 32;
    }

    const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256());
    global_sum = _mm256_add_epi64(global_sum, tmp);

    // 3. process tail < 32 bytes
    uint64_t result = _mm256_extract_epi64(global_sum, 0)
                    + _mm256_extract_epi64(global_sum, 1)
                    + _mm256_extract_epi64(global_sum, 2)
                    + _mm256_extract_epi64(global_sum, 3);

    return result + scalar_count_bytes(ptr, end - ptr, byte);
}
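/* scalar_count_bytes() is called above but not shown here. A minimal sketch,
 * assuming it simply counts matching bytes one at a time over the remaining
 * tail (the exact helper in the original source may differ): */
static uint64_t scalar_count_bytes(const uint8_t* data, size_t size, uint8_t byte) {
    uint64_t result = 0;
    for (size_t i = 0; i < size; i++) {
        result += (data[i] == byte);
    }
    return result;
}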
inline void avx2_positive_hexid_to_ringid_segid_runid(
  const __m256i hexid, __m256i& ringid, __m256i& segid, __m256i& runid)
{
  // ringid = positive_hexid_to_ringid(hexid);
  // unsigned iring = hexid - ringid_to_nsites_contained(ringid-1);
  // segid = int(iring/ringid);
  // runid = iring - segid*ringid;
  const __m256i one = _mm256_set1_epi32(1);
  ringid = avx2_positive_hexid_to_ringid(hexid);
  runid = _mm256_sub_epi32(hexid,
    avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid,one)));
  segid = _mm256_setzero_si256();

  const __m256i ringid_minus_one = _mm256_sub_epi32(ringid, one);

  __m256i mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

  mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

  mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

  mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

  mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));
}
int32_t avx2_sumsignedbytes_variant2(int8_t* array, size_t size) {

    __m256i accumulator = _mm256_setzero_si256();

    for (size_t i=0; i < size; i += 32) {
        const __m256i v = _mm256_loadu_si256((__m256i*)(array + i));

        const __m256i v0 = _mm256_srai_epi32(v, 3*8);
        const __m256i v1 = _mm256_srai_epi32(_mm256_slli_epi32(v, 1*8), 3*8);
        const __m256i v2 = _mm256_srai_epi32(_mm256_slli_epi32(v, 2*8), 3*8);
        const __m256i v3 = _mm256_srai_epi32(_mm256_slli_epi32(v, 3*8), 3*8);

        accumulator = _mm256_add_epi32(accumulator, v0);
        accumulator = _mm256_add_epi32(accumulator, v1);
        accumulator = _mm256_add_epi32(accumulator, v2);
        accumulator = _mm256_add_epi32(accumulator, v3);
    }

    return int32_t(_mm256_extract_epi32(accumulator, 0)) +
           int32_t(_mm256_extract_epi32(accumulator, 1)) +
           int32_t(_mm256_extract_epi32(accumulator, 2)) +
           int32_t(_mm256_extract_epi32(accumulator, 3)) +
           int32_t(_mm256_extract_epi32(accumulator, 4)) +
           int32_t(_mm256_extract_epi32(accumulator, 5)) +
           int32_t(_mm256_extract_epi32(accumulator, 6)) +
           int32_t(_mm256_extract_epi32(accumulator, 7));
}
int32_t avx2_sumsignedbytes(int8_t* array, size_t size) {

    __m256i accumulator = _mm256_setzero_si256();

    for (size_t i=0; i < size; i += 32) {
        const __m256i v = _mm256_loadu_si256((__m256i*)(array + i));
        const __m128i lo = _mm256_extracti128_si256(v, 0);
        const __m128i hi = _mm256_extracti128_si256(v, 1);

        const __m256i t0 = _mm256_cvtepi8_epi32(lo);
        const __m256i t1 = _mm256_cvtepi8_epi32(hi);
        const __m256i t2 = _mm256_cvtepi8_epi32(_mm_bsrli_si128(lo, 8));
        const __m256i t3 = _mm256_cvtepi8_epi32(_mm_bsrli_si128(hi, 8));

        accumulator = _mm256_add_epi32(accumulator, t0);
        accumulator = _mm256_add_epi32(accumulator, t1);
        accumulator = _mm256_add_epi32(accumulator, t2);
        accumulator = _mm256_add_epi32(accumulator, t3);
    }

    return int32_t(_mm256_extract_epi32(accumulator, 0)) +
           int32_t(_mm256_extract_epi32(accumulator, 1)) +
           int32_t(_mm256_extract_epi32(accumulator, 2)) +
           int32_t(_mm256_extract_epi32(accumulator, 3)) +
           int32_t(_mm256_extract_epi32(accumulator, 4)) +
           int32_t(_mm256_extract_epi32(accumulator, 5)) +
           int32_t(_mm256_extract_epi32(accumulator, 6)) +
           int32_t(_mm256_extract_epi32(accumulator, 7));
}
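// Not part of the original code: a plain scalar reference that can be used to
// sanity-check either AVX2 variant above (for sizes that are multiples of 32).
int32_t scalar_sumsignedbytes(int8_t* array, size_t size) {
    int32_t sum = 0;
    for (size_t i = 0; i < size; i++) {
        sum += array[i];
    }
    return sum;
}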
static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *ref_ptr, int ref_stride,
                                     const int h, const uint8_t *second_pred,
                                     const int second_pred_stride) {
  int i, res;
  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
  __m256i sum_sad = _mm256_setzero_si256();
  __m256i sum_sad_h;
  __m128i sum_sad128;
  for (i = 0; i < h; i++) {
    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
    ref1_reg = _mm256_avg_epu8(
        ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));
    ref2_reg = _mm256_avg_epu8(
        ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32)));
    sad1_reg =
        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
    sad2_reg = _mm256_sad_epu8(
        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
    ref_ptr += ref_stride;
    src_ptr += src_stride;
    second_pred += second_pred_stride;
  }
  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
  res = _mm_cvtsi128_si32(sum_sad128);
  return res;
}
inline void avx2_hexid_to_uv_cw(const __m256i hexid, __m256i& u, __m256i& v)
{
#if 0 // This code is correct but it's not worth maintaining two versions
  const __m256i one = _mm256_set1_epi32(1);
  __m256i ringid = avx2_positive_hexid_to_ringid(hexid);
  __m256i iring = _mm256_sub_epi32(hexid,
    avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid,one)));

  u = ringid;
  v = _mm256_setzero_si256();

  __m256i irun = _mm256_min_epu32(iring, ringid);
  v = _mm256_sub_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  v = _mm256_add_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  v = _mm256_add_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_add_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  u = _mm256_add_epi32(u, irun);
  v = _mm256_add_epi32(v, iring);

  const __m256i mask = _mm256_cmpeq_epi32(hexid, _mm256_setzero_si256());
  u = _mm256_andnot_si256(mask, u);
  v = _mm256_andnot_si256(mask, v);
#else
  // hexid_to_uv_ccw(hexid, u, v);
  // u += v;
  // v = -v;
  avx2_hexid_to_uv_ccw(hexid, u, v);
  u = _mm256_add_epi32(u, v);
  v = _mm256_sign_epi32(v, _mm256_cmpeq_epi32(v, v));
#endif
}
void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  int i;
  const uint8_t *refs[4];
  __m256i sums[4];

  refs[0] = ref_array[0];
  refs[1] = ref_array[1];
  refs[2] = ref_array[2];
  refs[3] = ref_array[3];
  sums[0] = _mm256_setzero_si256();
  sums[1] = _mm256_setzero_si256();
  sums[2] = _mm256_setzero_si256();
  sums[3] = _mm256_setzero_si256();

  for (i = 0; i < 32; i++) {
    __m256i r[4];

    // load src and all ref[]
    const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
    r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
    r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
    r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);

    // sum of the absolute differences between every ref[] to src
    r[0] = _mm256_sad_epu8(r[0], s);
    r[1] = _mm256_sad_epu8(r[1], s);
    r[2] = _mm256_sad_epu8(r[2], s);
    r[3] = _mm256_sad_epu8(r[3], s);

    // sum every ref[]
    sums[0] = _mm256_add_epi32(sums[0], r[0]);
    sums[1] = _mm256_add_epi32(sums[1], r[1]);
    sums[2] = _mm256_add_epi32(sums[2], r[2]);
    sums[3] = _mm256_add_epi32(sums[3], r[3]);

    src_ptr += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }

  calc_final(sums, sad_array);
}
static __m256i popcount(const __m512i v) {
    const __m256i lo = _mm512_extracti64x4_epi64(v, 0);
    const __m256i hi = _mm512_extracti64x4_epi64(v, 1);

    const __m256i s  = _mm256_add_epi8(avx2_popcount(lo), avx2_popcount(hi));

    return _mm256_sad_epu8(s, _mm256_setzero_si256());
}
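/* avx2_popcount() above is assumed to be the usual nibble-lookup (vpshufb)
 * per-byte population count; a minimal sketch under that assumption, shown
 * here only so the snippet above is self-explanatory: */
static __m256i avx2_popcount(const __m256i v) {
    const __m256i lookup = _mm256_setr_epi8(
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
    const __m256i low_mask = _mm256_set1_epi8(0x0f);

    const __m256i lo  = _mm256_and_si256(v, low_mask);
    const __m256i hi  = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);
    const __m256i cnt_lo = _mm256_shuffle_epi8(lookup, lo);
    const __m256i cnt_hi = _mm256_shuffle_epi8(lookup, hi);

    // per-byte popcount = popcount(low nibble) + popcount(high nibble)
    return _mm256_add_epi8(cnt_lo, cnt_hi);
}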
inline __m256i avx2_positive_hexid_to_ringid_loop(const __m256i hexid)
{
  // This algorithm is relatively slow in comparison to the scalar version
  // but still faster overall considering we compute 8 ringids in one go
  const __m256i six = _mm256_set1_epi32(6);
  const __m256i one = _mm256_set1_epi32(1);
  __m256i ringid = _mm256_setzero_si256();
  __m256i nsites = one;
  __m256i nring = _mm256_setzero_si256();
  __m256i mask = _mm256_cmpgt_epi32(nsites, hexid);
  while(~_mm256_movemask_epi8(mask))
  {
    ringid = _mm256_blendv_epi8(_mm256_add_epi32(ringid, one), ringid, mask);
    nring = _mm256_add_epi32(nring, six);
    nsites = _mm256_add_epi32(nsites, nring);
    mask = _mm256_cmpgt_epi32(nsites, hexid);
  }
  return ringid;
}
static INLINE unsigned int highbd_masked_sad16xh_avx2(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int x, y;
  __m256i res = _mm256_setzero_si256();
  const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m256i round_const =
      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m256i one = _mm256_set1_epi16(1);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 16) {
      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
      // Zero-extend mask to 16 bits
      const __m256i m =
          _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x]));
      const __m256i m_inv = _mm256_sub_epi16(mask_max, m);

      const __m256i data_l = _mm256_unpacklo_epi16(a, b);
      const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
      __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
      pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
                                 AOM_BLEND_A64_ROUND_BITS);

      const __m256i data_r = _mm256_unpackhi_epi16(a, b);
      const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
      __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
      pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
                                 AOM_BLEND_A64_ROUND_BITS);

      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
      // so it is safe to do signed saturation here.
      const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
      // There is no 16-bit SAD instruction, so we have to synthesize
      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
      // and accumulating them at the end
      const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
      res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs stored in 'res'.
  res = _mm256_hadd_epi32(res, res);
  res = _mm256_hadd_epi32(res, res);
  int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
  return (sad + 31) >> 6;
}
inline __m256i avx2_ringid_segid_runid_to_hexid(
  const __m256i ringid, const __m256i segid, const __m256i runid)
{
  // return (ringid==0) ? 0 :
  //     positive_ringid_segid_runid_to_hexid(ringid, segid, runid);
  const __m256i mask = _mm256_cmpeq_epi32(ringid, _mm256_setzero_si256());
  return _mm256_andnot_si256(mask,
    avx2_positive_ringid_segid_runid_to_hexid(ringid, segid, runid));
}
inline avx_m256_t newsin_ps(avx_m256_t x) {
	avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask);
	x = _mm256_and_ps(x, _ps_inv_sign_mask);

	avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

	avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
	emm2 = _mm256_add_epi32(emm2, _pi32_1);
	emm2 = _mm256_and_si256(emm2, _pi32_inv1);
	y = _mm256_cvtepi32_ps(emm2);

	avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4);
	emm0 = _mm256_slli_epi32(emm0, 29);

	emm2 = _mm256_and_si256(emm2, _pi32_2);
	emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());

	avx_m256_t swap_sign_bit = _mm256_castsi256_ps(emm0);
	avx_m256_t poly_mask = _mm256_castsi256_ps(emm2);
	sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

	avx_m256_t temp = _ps_minus_cephes_DP123;
	temp = _mm256_mul_ps(y, temp);
	x = _mm256_add_ps(x, temp);

	avx_m256_t x2 = _mm256_mul_ps(x, x);
	avx_m256_t x3 = _mm256_mul_ps(x2, x);
	avx_m256_t x4 = _mm256_mul_ps(x2, x2);

	y = _ps_coscof_p0;
	avx_m256_t y2 = _ps_sincof_p0;
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p1);
	y2 = _mm256_add_ps(y2, _ps_sincof_p1);
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p2);
	y2 = _mm256_add_ps(y2, _ps_sincof_p2);
	y = _mm256_mul_ps(y, x4);
	y2 = _mm256_mul_ps(y2, x3);

	temp = _mm256_mul_ps(x2, _ps_0p5);
	temp = _mm256_sub_ps(temp, _ps_1);
	y = _mm256_sub_ps(y, temp);
	y2 = _mm256_add_ps(y2, x);

	y = _mm256_andnot_ps(poly_mask, y);
	y2 = _mm256_and_ps(poly_mask, y2);
	y = _mm256_add_ps(y, y2);

	y = _mm256_xor_ps(y, sign_bit);

	return y;
} // newsin_ps()
inline void avx2_hexid_to_ringid_segid_runid(
  const __m256i hexid, __m256i& ringid, __m256i& segid, __m256i& runid)
{
  // if(hexid==0) { ringid = segid = runid = 0; return; }
  // return positive_hexid_to_ringid_segid_runid(hexid, ringid, segid, runid);
  avx2_positive_hexid_to_ringid_segid_runid(hexid, ringid, segid, runid);
  const __m256i mask = _mm256_cmpeq_epi32(hexid, _mm256_setzero_si256());
  ringid = _mm256_andnot_si256(mask, ringid);
  segid = _mm256_andnot_si256(mask, segid);
  runid = _mm256_andnot_si256(mask, runid);
}
int main(int, char**) {
    /* AVX */
    _mm256_zeroall();
    __m256i a = _mm256_setzero_si256();

    /* AVX2 */
    __m256i b = _mm256_and_si256(a, a);
    __m256i result = _mm256_add_epi8(a, b);
    (void)result;
    return 0;
}
static void
avx_test (void)
{
  long long in = 0x800000000ll;
  long long out;

  __m256i zero = _mm256_setzero_si256();
  __m256i tmp = _mm256_insert_epi64 (zero, in, 0);
  out = _mm256_extract_epi64(tmp, 0);

  if (in != out)
    abort ();
}
static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
                                   const uint8_t *ref, const int ref_stride,
                                   const int h, __m256i *const vsse,
                                   __m256i *const vsum) {
  *vsum = _mm256_setzero_si256();

  for (int i = 0; i < h; i++) {
    variance32_kernel_avx2(src, ref, vsse, vsum);
    src += src_stride;
    ref += ref_stride;
  }
}
// credit: Harold Aptroot
uint32_t maskedvectorsum(uint32_t * z, uint32_t N, uint32_t * accesses,
                         uint32_t nmbr) {
  __m256i Nvec = _mm256_set1_epi32(N - 1);
  __m256i sum = _mm256_setzero_si256();
  for(uint32_t j = 0; j < nmbr ; j += 8) {
    __m256i indexes = _mm256_loadu_si256((__m256i*)(accesses + j));
    indexes = _mm256_and_si256(indexes, Nvec);
    __m256i fi = _mm256_i32gather_epi32((int*)z, indexes, 4);
    sum = _mm256_add_epi32(sum, fi);
  }
  __m128i sum128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 0),
                                 _mm256_extracti128_si256(sum, 1));
  sum128 = _mm_hadd_epi32(sum128, sum128);
  return _mm_extract_epi32(sum128, 0) + _mm_extract_epi32(sum128, 1);
}
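// Hypothetical usage sketch (not from the original source). It assumes N is a
// power of two, since indices are masked with N - 1, and that nmbr is a
// multiple of 8, since the loop consumes 8 indices per iteration; the name
// demo_maskedvectorsum and the index pattern are made up for illustration.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int demo_maskedvectorsum(void) {
  enum { N = 1024, M = 64 };
  uint32_t *z = malloc(N * sizeof(uint32_t));
  uint32_t *accesses = malloc(M * sizeof(uint32_t));
  for (uint32_t i = 0; i < N; i++) z[i] = i;            // table to gather from
  for (uint32_t j = 0; j < M; j++) accesses[j] = 7 * j; // arbitrary indices
  printf("%u\n", maskedvectorsum(z, N, accesses, M));
  free(z);
  free(accesses);
  return 0;
}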
void av1_highbd_quantize_fp_avx2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, int log_scale) {
  (void)scan;
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  const unsigned int step = 8;
  __m256i qp[3], coeff;

  init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp);
  coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);

  __m256i eob = _mm256_setzero_si256();
  quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);

  coeff_ptr += step;
  qcoeff_ptr += step;
  dqcoeff_ptr += step;
  iscan += step;
  n_coeffs -= step;

  update_qp(qp);
  while (n_coeffs > 0) {
    coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
    quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);

    coeff_ptr += step;
    qcoeff_ptr += step;
    dqcoeff_ptr += step;
    iscan += step;
    n_coeffs -= step;
  }
  {
    __m256i eob_s;
    eob_s = _mm256_shuffle_epi32(eob, 0xe);
    eob = _mm256_max_epi16(eob, eob_s);
    eob_s = _mm256_shufflelo_epi16(eob, 0xe);
    eob = _mm256_max_epi16(eob, eob_s);
    eob_s = _mm256_shufflelo_epi16(eob, 1);
    eob = _mm256_max_epi16(eob, eob_s);
    const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
                                            _mm256_extractf128_si256(eob, 1));
    *eob_ptr = _mm_extract_epi16(final_eob, 0);
  }
}
static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size )
{
   int i;
   for (i = 0; i < 25; i ++)
      kc->w[i] = _mm256_setzero_si256();

   // Initialization for the "lane complement".
   kc->w[ 1] = m256_neg1;
   kc->w[ 2] = m256_neg1;
   kc->w[ 8] = m256_neg1;
   kc->w[12] = m256_neg1;
   kc->w[17] = m256_neg1;
   kc->w[20] = m256_neg1;

   kc->ptr = 0;
   kc->lim = 200 - (out_size >> 2);
}
static uint64_t popcnt_harley_seal(const __m512i* data, const uint64_t size)
{
  __m256i total     = _mm256_setzero_si256();
  __m512i ones      = _mm512_setzero_si512();
  __m512i twos      = _mm512_setzero_si512();
  __m512i fours     = _mm512_setzero_si512();
  __m512i eights    = _mm512_setzero_si512();
  __m512i sixteens  = _mm512_setzero_si512();
  __m512i twosA, twosB, foursA, foursB, eightsA, eightsB;

  const uint64_t limit = size - size % 16;
  uint64_t i = 0;

  for(; i < limit; i += 16)
  {
    CSA(&twosA, &ones, ones, data[i+0], data[i+1]);
    CSA(&twosB, &ones, ones, data[i+2], data[i+3]);
    CSA(&foursA, &twos, twos, twosA, twosB);
    CSA(&twosA, &ones, ones, data[i+4], data[i+5]);
    CSA(&twosB, &ones, ones, data[i+6], data[i+7]);
    CSA(&foursB, &twos, twos, twosA, twosB);
    CSA(&eightsA,&fours, fours, foursA, foursB);
    CSA(&twosA, &ones, ones, data[i+8], data[i+9]);
    CSA(&twosB, &ones, ones, data[i+10], data[i+11]);
    CSA(&foursA, &twos, twos, twosA, twosB);
    CSA(&twosA, &ones, ones, data[i+12], data[i+13]);
    CSA(&twosB, &ones, ones, data[i+14], data[i+15]);
    CSA(&foursB, &twos, twos, twosA, twosB);
    CSA(&eightsB, &fours, fours, foursA, foursB);
    CSA(&sixteens, &eights, eights, eightsA, eightsB);

    total = _mm256_add_epi64(total, popcount(sixteens));
  }

  total = _mm256_slli_epi64(total, 4);     // * 16
  total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(eights), 3)); // += 8 * ...
  total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(fours),  2)); // += 4 * ...
  total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(twos),   1)); // += 2 * ...
  total = _mm256_add_epi64(total, popcount(ones));

  for(; i < size; i++) {
    total = _mm256_add_epi64(total, popcount(data[i]));
  }

  return avx2_sum_epu64(total);
}
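/* CSA() (carry-save adder) and avx2_sum_epu64() are used above but not shown.
 * A minimal sketch of both, assuming the (high, low, a, b, c) full-adder
 * convention implied by the calls above; an AVX-512 build would more likely
 * use _mm512_ternarylogic_epi32, but plain boolean ops are equivalent: */
static void CSA(__m512i* h, __m512i* l, __m512i a, __m512i b, __m512i c)
{
  const __m512i u = _mm512_xor_si512(a, b);
  *h = _mm512_or_si512(_mm512_and_si512(a, b), _mm512_and_si512(u, c)); // carry (majority)
  *l = _mm512_xor_si512(u, c);                                          // sum
}

static uint64_t avx2_sum_epu64(const __m256i v)
{
  // reduce the four 64-bit lanes of the accumulator to a scalar
  return (uint64_t)_mm256_extract_epi64(v, 0) + (uint64_t)_mm256_extract_epi64(v, 1)
       + (uint64_t)_mm256_extract_epi64(v, 2) + (uint64_t)_mm256_extract_epi64(v, 3);
}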
static INLINE unsigned int masked_sad32xh_avx2(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  int x, y;
  __m256i res = _mm256_setzero_si256();
  const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m256i round_scale =
      _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 32) {
      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
      const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]);
      const __m256i m_inv = _mm256_sub_epi8(mask_max, m);

      // Calculate 16 predicted pixels per unpack half.
      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
      // is 64 * 255, so we have plenty of space to add rounding constants.
      const __m256i data_l = _mm256_unpacklo_epi8(a, b);
      const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
      __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
      pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);

      const __m256i data_r = _mm256_unpackhi_epi8(a, b);
      const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
      __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
      pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);

      const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
      res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs in the even 32-bit lanes
  // of 'res'.
  res = _mm256_shuffle_epi32(res, 0xd8);
  res = _mm256_permute4x64_epi64(res, 0xd8);
  res = _mm256_hadd_epi32(res, res);
  res = _mm256_hadd_epi32(res, res);
  int32_t sad = _mm256_extract_epi32(res, 0);
  return (sad + 31) >> 6;
}
size_t xstrlen(const char* src)
{
    __m256i m0 = _mm256_setzero_si256();
    __m256i m1;
    int mask;

    for (size_t count = 0;; count += 32) {
        // Note: each load reads a full 32 bytes, so it may read past the
        // terminator (and can fault if that crosses into an unmapped page).
        m1 = _mm256_loadu_si256((const __m256i*)(src + count));
        m1 = _mm256_cmpeq_epi8(m1, m0);
        mask = _mm256_movemask_epi8(m1);
        if (mask != 0) {
            // index of the lowest set bit = offset of the first NUL byte
            __asm__("bsfl %0, %0\n\t"
                    : "+r"(mask));
            return count + (size_t)mask;
        }
    }
}
int
main (void)
{
  unsigned int eax, ebx, ecx, edx;

  /* Run AVX test only if AVX is supported.  */
  if (__get_cpuid (1, &eax, &ebx, &ecx, &edx)
      && (ecx & bit_AVX))
    {
      __m256i ymm = _mm256_setzero_si256 ();
      __m256i ret = audit_test (ymm, ymm, ymm, ymm, ymm, ymm, ymm, ymm);

      ymm = _mm256_set1_epi32 (0x12349876);
      if (memcmp (&ymm, &ret, sizeof (ret)))
        abort ();
    }
  return 0;
}