template <bool align> void SquaredDifferenceSum(
    const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
    size_t width, size_t height, uint64_t * sum)
{
    assert(width < 0x10000);
    if(align)
    {
        assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride));
    }

    size_t bodyWidth = AlignLo(width, A);
    __m256i tailMask = SetMask<uint8_t>(0, A - width + bodyWidth, 0xFF);
    __m256i fullSum = _mm256_setzero_si256();
    for(size_t row = 0; row < height; ++row)
    {
        __m256i rowSum = _mm256_setzero_si256();
        for(size_t col = 0; col < bodyWidth; col += A)
        {
            const __m256i a_ = Load<align>((__m256i*)(a + col));
            const __m256i b_ = Load<align>((__m256i*)(b + col));
            rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_));
        }
        if(width - bodyWidth)
        {
            const __m256i a_ = _mm256_and_si256(tailMask, Load<false>((__m256i*)(a + width - A)));
            const __m256i b_ = _mm256_and_si256(tailMask, Load<false>((__m256i*)(b + width - A)));
            rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_));
        }
        fullSum = _mm256_add_epi64(fullSum, HorizontalSum32(rowSum));
        a += aStride;
        b += bStride;
    }
    *sum = ExtractSum<uint64_t>(fullSum);
}
#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

/* Assumed definition of the vector/scalar union used below; the original
   declaration is not part of this snippet. */
typedef union {
    __m256i m;
    uint32_t u[8];
} m256u_t;

int main(void) {
    /* Brute-force search for values 1000000*a + 1000*b + c that equal
       a^3 + b^3 + c^3, testing eight values of c per vector iteration. */
    for (int a = 0; a < 1000; a++) {
        for (int b = 0; b < 1000; b++) {
            uint32_t lhs_ab = 1000 * 1000 * a + 1000 * b;
            m256u_t lhs_ab_v = {.u = {lhs_ab, lhs_ab, lhs_ab, lhs_ab, lhs_ab, lhs_ab, lhs_ab, lhs_ab}};
            uint32_t rhs_ab = a * a * a + b * b * b;
            m256u_t rhs_ab_v = {.u = {rhs_ab, rhs_ab, rhs_ab, rhs_ab, rhs_ab, rhs_ab, rhs_ab, rhs_ab}};
            m256u_t c_v = {.u = {0, 1, 2, 3, 4, 5, 6, 7}};
            m256u_t c_inc_v = {.u = {8, 8, 8, 8, 8, 8, 8, 8}};
            m256u_t lhs_v, rhs_v, cmp_v;
            for (int c = 0; c < 1000; c += 8) {
                lhs_v.m = _mm256_add_epi32(lhs_ab_v.m, c_v.m);
                rhs_v.m = _mm256_mullo_epi32(c_v.m, c_v.m);
                rhs_v.m = _mm256_mullo_epi32(rhs_v.m, c_v.m);
                rhs_v.m = _mm256_add_epi32(rhs_v.m, rhs_ab_v.m);
                cmp_v.m = _mm256_cmpeq_epi32(lhs_v.m, rhs_v.m);
                if (_mm256_movemask_epi8(cmp_v.m)) {
                    for (int i = 0; i < 8; i++)
                        if (cmp_v.u[i] != 0)
                            printf("%09u\n", lhs_v.u[i]);
                }
                c_v.m = _mm256_add_epi32(c_v.m, c_inc_v.m);
            }
        }
    }
    return 0;
}
int32_t avx2_sumsignedbytes(int8_t* array, size_t size) {

    __m256i accumulator = _mm256_setzero_si256();

    for (size_t i=0; i < size; i += 32) {
        const __m256i v = _mm256_loadu_si256((__m256i*)(array + i));
        const __m128i lo = _mm256_extracti128_si256(v, 0);
        const __m128i hi = _mm256_extracti128_si256(v, 1);

        const __m256i t0 = _mm256_cvtepi8_epi32(lo);
        const __m256i t1 = _mm256_cvtepi8_epi32(hi);
        const __m256i t2 = _mm256_cvtepi8_epi32(_mm_bsrli_si128(lo, 8));
        const __m256i t3 = _mm256_cvtepi8_epi32(_mm_bsrli_si128(hi, 8));

        accumulator = _mm256_add_epi32(accumulator, t0);
        accumulator = _mm256_add_epi32(accumulator, t1);
        accumulator = _mm256_add_epi32(accumulator, t2);
        accumulator = _mm256_add_epi32(accumulator, t3);
    }

    return int32_t(_mm256_extract_epi32(accumulator, 0)) +
           int32_t(_mm256_extract_epi32(accumulator, 1)) +
           int32_t(_mm256_extract_epi32(accumulator, 2)) +
           int32_t(_mm256_extract_epi32(accumulator, 3)) +
           int32_t(_mm256_extract_epi32(accumulator, 4)) +
           int32_t(_mm256_extract_epi32(accumulator, 5)) +
           int32_t(_mm256_extract_epi32(accumulator, 6)) +
           int32_t(_mm256_extract_epi32(accumulator, 7));
}
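A minimal sketch of how the routine above might be exercised, assuming the buffer length is a multiple of 32 (the loop has no scalar tail); the test pattern and the helper name check_avx2_sumsignedbytes are illustrative, not part of the original source.

// Hypothetical driver: checks the AVX2 result against a plain scalar sum.
#include <cstdio>
#include <cstdint>
#include <vector>

int check_avx2_sumsignedbytes() {
    std::vector<int8_t> data(1024);                       // length is a multiple of 32
    for (size_t i = 0; i < data.size(); i++)
        data[i] = int8_t(int(i * 37 % 256) - 128);        // arbitrary test pattern

    int32_t scalar = 0;
    for (int8_t b : data) scalar += b;

    const int32_t vectorized = avx2_sumsignedbytes(data.data(), data.size());
    std::printf("scalar=%d avx2=%d\n", scalar, vectorized);
    return scalar == vectorized ? 0 : 1;
}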
inline void avx2_positive_hexid_to_ringid_segid_runid(
    const __m256i hexid, __m256i& ringid, __m256i& segid, __m256i& runid)
{
    // ringid = positive_hexid_to_ringid(hexid);
    // unsigned iring = hexid - ringid_to_nsites_contained(ringid-1);
    // segid = int(iring/ringid);
    // runid = iring - segid*ringid;
    const __m256i one = _mm256_set1_epi32(1);
    ringid = avx2_positive_hexid_to_ringid(hexid);
    runid = _mm256_sub_epi32(hexid,
        avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid,one)));
    segid = _mm256_setzero_si256();

    const __m256i ringid_minus_one = _mm256_sub_epi32(ringid, one);

    __m256i mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
    runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
    segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

    mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
    runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
    segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

    mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
    runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
    segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

    mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
    runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
    segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

    mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
    runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
    segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));
}
SIMD_INLINE __m256i BgraToGray32(__m256i bgra)
{
    const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(bgra, 1), K16_00FF);
    const __m256i b0r0 = _mm256_and_si256(bgra, K16_00FF);
    const __m256i weightedSum = _mm256_add_epi32(
        _mm256_madd_epi16(g0a0, K16_GREEN_0000),
        _mm256_madd_epi16(b0r0, K16_BLUE_RED));
    return _mm256_srli_epi32(_mm256_add_epi32(weightedSum, K32_ROUND_TERM),
        Base::BGR_TO_GRAY_AVERAGING_SHIFT);
}
static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *ref_ptr, int ref_stride,
                                     const int h, const uint8_t *second_pred,
                                     const int second_pred_stride) {
  int i, res;
  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
  __m256i sum_sad = _mm256_setzero_si256();
  __m256i sum_sad_h;
  __m128i sum_sad128;
  for (i = 0; i < h; i++) {
    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
    ref1_reg = _mm256_avg_epu8(
        ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));
    ref2_reg = _mm256_avg_epu8(
        ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32)));
    sad1_reg =
        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
    sad2_reg = _mm256_sad_epu8(
        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
    ref_ptr += ref_stride;
    src_ptr += src_stride;
    second_pred += second_pred_stride;
  }
  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
  res = _mm_cvtsi128_si32(sum_sad128);
  return res;
}
int32_t avx2_sumsignedbytes_variant2(int8_t* array, size_t size) {

    __m256i accumulator = _mm256_setzero_si256();

    for (size_t i=0; i < size; i += 32) {
        const __m256i v = _mm256_loadu_si256((__m256i*)(array + i));

        const __m256i v0 = _mm256_srai_epi32(v, 3*8);
        const __m256i v1 = _mm256_srai_epi32(_mm256_slli_epi32(v, 1*8), 3*8);
        const __m256i v2 = _mm256_srai_epi32(_mm256_slli_epi32(v, 2*8), 3*8);
        const __m256i v3 = _mm256_srai_epi32(_mm256_slli_epi32(v, 3*8), 3*8);

        accumulator = _mm256_add_epi32(accumulator, v0);
        accumulator = _mm256_add_epi32(accumulator, v1);
        accumulator = _mm256_add_epi32(accumulator, v2);
        accumulator = _mm256_add_epi32(accumulator, v3);
    }

    return int32_t(_mm256_extract_epi32(accumulator, 0)) +
           int32_t(_mm256_extract_epi32(accumulator, 1)) +
           int32_t(_mm256_extract_epi32(accumulator, 2)) +
           int32_t(_mm256_extract_epi32(accumulator, 3)) +
           int32_t(_mm256_extract_epi32(accumulator, 4)) +
           int32_t(_mm256_extract_epi32(accumulator, 5)) +
           int32_t(_mm256_extract_epi32(accumulator, 6)) +
           int32_t(_mm256_extract_epi32(accumulator, 7));
}
static INLINE unsigned int highbd_masked_sad16xh_avx2(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int x, y;
  __m256i res = _mm256_setzero_si256();
  const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m256i round_const =
      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m256i one = _mm256_set1_epi16(1);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 16) {
      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
      // Zero-extend mask to 16 bits
      const __m256i m =
          _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x]));
      const __m256i m_inv = _mm256_sub_epi16(mask_max, m);

      const __m256i data_l = _mm256_unpacklo_epi16(a, b);
      const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
      __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
      pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
                                 AOM_BLEND_A64_ROUND_BITS);

      const __m256i data_r = _mm256_unpackhi_epi16(a, b);
      const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
      __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
      pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
                                 AOM_BLEND_A64_ROUND_BITS);

      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
      // so it is safe to do signed saturation here.
      const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
      // There is no 16-bit SAD instruction, so we have to synthesize
      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
      // and accumulating them at the end
      const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
      res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
    }
    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs stored in 'res'.
  res = _mm256_hadd_epi32(res, res);
  res = _mm256_hadd_epi32(res, res);
  int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
  return (sad + 31) >> 6;
}
inline __m256i avx2_ringid_to_nsites_contained(const __m256i ringid)
{
    // return 3*ringid*(ringid+1)+1;
    const __m256i one = _mm256_set1_epi32(1);
    __m256i nsites = _mm256_add_epi32(ringid, one);
    nsites = _mm256_mullo_epi32(ringid, nsites);
    nsites = _mm256_sub_epi32(_mm256_slli_epi32(nsites, 2), nsites);
    nsites = _mm256_add_epi32(nsites, one);
    return nsites;
}
inline __m256i avx2_positive_ringid_segid_runid_to_hexid(
    const __m256i ringid, const __m256i segid, const __m256i runid)
{
    // return ringid_to_nsites_contained(ringid-1)+segid*ringid+runid;
    const __m256i one = _mm256_set1_epi32(1);
    __m256i nsites = avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid, one));
    nsites = _mm256_add_epi32(nsites, _mm256_mullo_epi32(segid, ringid));
    nsites = _mm256_add_epi32(nsites, runid);
    return nsites;
}
SIMD_INLINE void SumHistograms(uint32_t * src, size_t start, uint32_t * dst)
{
    uint32_t * src0 = src + start;
    uint32_t * src1 = src0 + start + HISTOGRAM_SIZE;
    uint32_t * src2 = src1 + start + HISTOGRAM_SIZE;
    uint32_t * src3 = src2 + start + HISTOGRAM_SIZE;
    for(size_t i = 0; i < HISTOGRAM_SIZE; i += 8)
        Store<false>((__m256i*)(dst + i), _mm256_add_epi32(
            _mm256_add_epi32(Load<true>((__m256i*)(src0 + i)), Load<true>((__m256i*)(src1 + i))),
            _mm256_add_epi32(Load<true>((__m256i*)(src2 + i)), Load<true>((__m256i*)(src3 + i)))));
}
inline void avx2_hexid_to_uv_ccw(const __m256i hexid, __m256i& u, __m256i& v)
{
    // if(hexid==0) { u = v = 0; return; }
    // unsigned ringid;
    // unsigned segid;
    // unsigned runid;
    // positive_hexid_to_ringid_segid_runid(hexid, ringid, segid, runid);
    // switch(segid)
    // {
    //   case 0: u = ringid-runid; v = runid; break;
    //   case 1: u = -runid; v = ringid; break;
    //   case 2: u = -ringid; v = ringid-runid; break;
    //   case 3: u = runid-ringid; v = -runid; break;
    //   case 4: u = runid; v = -ringid; break;
    //   case 5: u = ringid; v = runid-ringid; break;
    //   default: assert(0);
    // }
    const __m256i one = _mm256_set1_epi32(1);
    __m256i ringid = avx2_positive_hexid_to_ringid(hexid);
    __m256i iring = _mm256_sub_epi32(hexid,
        avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid,one)));

    u = ringid;
    v = _mm256_setzero_si256();

    __m256i irun = _mm256_min_epu32(iring, ringid);
    u = _mm256_sub_epi32(u, irun);
    v = _mm256_add_epi32(v, irun);
    iring = _mm256_sub_epi32(iring, irun);

    irun = _mm256_min_epu32(iring, ringid);
    u = _mm256_sub_epi32(u, irun);
    iring = _mm256_sub_epi32(iring, irun);

    irun = _mm256_min_epu32(iring, ringid);
    v = _mm256_sub_epi32(v, irun);
    iring = _mm256_sub_epi32(iring, irun);

    irun = _mm256_min_epu32(iring, ringid);
    u = _mm256_add_epi32(u, irun);
    v = _mm256_sub_epi32(v, irun);
    iring = _mm256_sub_epi32(iring, irun);

    irun = _mm256_min_epu32(iring, ringid);
    u = _mm256_add_epi32(u, irun);
    iring = _mm256_sub_epi32(iring, irun);

    v = _mm256_add_epi32(v, iring);

    const __m256i mask = _mm256_cmpeq_epi32(hexid, _mm256_setzero_si256());
    u = _mm256_andnot_si256(mask, u);
    v = _mm256_andnot_si256(mask, v);
}
void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  __m256i sums[4];
  int i;
  const uint8_t *refs[4];

  refs[0] = ref_array[0];
  refs[1] = ref_array[1];
  refs[2] = ref_array[2];
  refs[3] = ref_array[3];
  sums[0] = _mm256_setzero_si256();
  sums[1] = _mm256_setzero_si256();
  sums[2] = _mm256_setzero_si256();
  sums[3] = _mm256_setzero_si256();

  for (i = 0; i < 64; i++) {
    __m256i r_lo[4], r_hi[4];
    // load 64 bytes from src and all ref[]
    const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr);
    const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32));
    r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
    r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32));
    r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
    r_hi[1] = _mm256_loadu_si256((const __m256i *)(refs[1] + 32));
    r_lo[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
    r_hi[2] = _mm256_loadu_si256((const __m256i *)(refs[2] + 32));
    r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
    r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32));

    // sum of the absolute differences between every ref[] to src
    r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo);
    r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo);
    r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo);
    r_lo[3] = _mm256_sad_epu8(r_lo[3], s_lo);
    r_hi[0] = _mm256_sad_epu8(r_hi[0], s_hi);
    r_hi[1] = _mm256_sad_epu8(r_hi[1], s_hi);
    r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi);
    r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi);

    // sum every ref[]
    sums[0] = _mm256_add_epi32(sums[0], r_lo[0]);
    sums[1] = _mm256_add_epi32(sums[1], r_lo[1]);
    sums[2] = _mm256_add_epi32(sums[2], r_lo[2]);
    sums[3] = _mm256_add_epi32(sums[3], r_lo[3]);
    sums[0] = _mm256_add_epi32(sums[0], r_hi[0]);
    sums[1] = _mm256_add_epi32(sums[1], r_hi[1]);
    sums[2] = _mm256_add_epi32(sums[2], r_hi[2]);
    sums[3] = _mm256_add_epi32(sums[3], r_hi[3]);

    src_ptr += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }

  calc_final(sums, sad_array);
}
/**
 * \brief Calculate SAD for 16x16 bytes in continuous memory.
 */
static INLINE __m256i inline_8bit_sad_16x16_avx2(const __m256i *const a,
                                                 const __m256i *const b)
{
  const unsigned size_of_8x8 = 8 * 8 / sizeof(__m256i);

  // Calculate in 4 chunks of 16x4.
  __m256i sum0, sum1, sum2, sum3;
  sum0 = inline_8bit_sad_8x8_avx2(a + 0 * size_of_8x8, b + 0 * size_of_8x8);
  sum1 = inline_8bit_sad_8x8_avx2(a + 1 * size_of_8x8, b + 1 * size_of_8x8);
  sum2 = inline_8bit_sad_8x8_avx2(a + 2 * size_of_8x8, b + 2 * size_of_8x8);
  sum3 = inline_8bit_sad_8x8_avx2(a + 3 * size_of_8x8, b + 3 * size_of_8x8);

  sum0 = _mm256_add_epi32(sum0, sum1);
  sum2 = _mm256_add_epi32(sum2, sum3);

  return _mm256_add_epi32(sum0, sum2);
}
inline void avx2_xy_to_uv_f(__m256 x, __m256 y, __m256i& u, __m256i& v)
{
    // Convert X,Y first into U,V space then round to nearest
    // integer. That gets us close to correct answer, mapping XY to a
    // lozenge-shaped space rather than hexagonal. We then correct the
    // four regions that lie outside the hexagonal cell assigning them
    // to their correct neighboring cell.
    // Writer's note: see ~/Google Drive/Work/calin

    // double dv = y*c_vy_inv;
    // double du = x-dv*c_vx;
    // u = std::lround(du);
    // v = std::lround(dv);
    // du -= u;
    // dv -= v;
    y = _mm256_mul_ps(y, calin::math::simd::c_m256(_c_m256_vy_inv));
    x = _mm256_fnmadd_ps(y, calin::math::simd::c_m256(_c_m256_vx), x);
    u = _mm256_cvtps_epi32(x);
    v = _mm256_cvtps_epi32(y);
    x = _mm256_sub_ps(x, _mm256_cvtepi32_ps(u));
    y = _mm256_sub_ps(y, _mm256_cvtepi32_ps(v));

    // double c3 = dv-du;
    const __m256i c3 = _mm256_castps_si256(_mm256_sub_ps(y, x));

    __m256i uvshift;
    __m256i mask;

    // double c1 = du+0.5*dv;
    // double c2 = dv+0.5*du;
    // if(c3<0) {
    //   if(c1>=1) u++;
    //   else if(c2<-1) v--;
    // } else {
    //   if(c2>=1) v++;
    //   else if(c1<-1) u--;
    // }
    uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(y, calin::math::simd::c_m256(_c_m256_one_half), x));
    mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31);
    u = _mm256_blendv_epi8(u, _mm256_add_epi32(u, uvshift), mask);

    uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(x, calin::math::simd::c_m256(_c_m256_one_half), y));
    mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31);
    v = _mm256_blendv_epi8(_mm256_add_epi32(v, uvshift), v, mask);
}
void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  int i;
  const uint8_t *refs[4];
  __m256i sums[4];

  refs[0] = ref_array[0];
  refs[1] = ref_array[1];
  refs[2] = ref_array[2];
  refs[3] = ref_array[3];
  sums[0] = _mm256_setzero_si256();
  sums[1] = _mm256_setzero_si256();
  sums[2] = _mm256_setzero_si256();
  sums[3] = _mm256_setzero_si256();

  for (i = 0; i < 32; i++) {
    __m256i r[4];

    // load src and all ref[]
    const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
    r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
    r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
    r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);

    // sum of the absolute differences between every ref[] to src
    r[0] = _mm256_sad_epu8(r[0], s);
    r[1] = _mm256_sad_epu8(r[1], s);
    r[2] = _mm256_sad_epu8(r[2], s);
    r[3] = _mm256_sad_epu8(r[3], s);

    // sum every ref[]
    sums[0] = _mm256_add_epi32(sums[0], r[0]);
    sums[1] = _mm256_add_epi32(sums[1], r[1]);
    sums[2] = _mm256_add_epi32(sums[2], r[2]);
    sums[3] = _mm256_add_epi32(sums[3], r[3]);

    src_ptr += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }

  calc_final(sums, sad_array);
}
/**
 * \brief Calculate SAD for 8x8 bytes in continuous memory.
 */
static INLINE __m256i inline_8bit_sad_8x8_avx2(const __m256i *const a,
                                               const __m256i *const b)
{
  __m256i sum0, sum1;
  sum0 = _mm256_sad_epu8(_mm256_load_si256(a + 0), _mm256_load_si256(b + 0));
  sum1 = _mm256_sad_epu8(_mm256_load_si256(a + 1), _mm256_load_si256(b + 1));

  return _mm256_add_epi32(sum0, sum1);
}
static FORCE_INLINE __m256i lookup_double_AVX2(const int16_t *VXFull, const int16_t *VYFull, const PixelType *pref, int w, const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {
    // Sign-extend the 16-bit vectors to 32 bits before halving. Both VX and VY
    // can be negative, so both need cvtepi16 and a 32-bit arithmetic shift
    // (zero-extending VY and shifting it as 16-bit lanes mangles negative values).
    __m256i vx = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i *)&VXFull[w]));
    vx = _mm256_srai_epi32(vx, 1);
    __m256i vy = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i *)&VYFull[w]));
    vy = _mm256_srai_epi32(vy, 1);

    __m256i addr = _mm256_madd_epi16(vy, dwords_ref_pitch);
    addr = _mm256_add_epi32(addr, vx);
    addr = _mm256_add_epi32(addr, dwords_hoffsets);

    // It's okay to read two or three bytes more than needed. pref is always padded,
    // unless the user chooses a horizontal padding of 0, which would be stupid.
    __m256i gathered = _mm256_i32gather_epi32((const int *)pref, addr, sizeof(PixelType));
    gathered = _mm256_and_si256(gathered, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1));

    return gathered;
}
static FORCE_INLINE void FlowInterSimple_double_8px_AVX2(
        int w, PixelType *pdst, const PixelType *prefB, const PixelType *prefF,
        const int16_t *VXFullB, const int16_t *VXFullF,
        const int16_t *VYFullB, const int16_t *VYFullF,
        const uint8_t *MaskB, const uint8_t *MaskF, int nPelLog,
        const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {

    __m256i dwords_w = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets); /// maybe do it another way

    __m256i dstF = lookup_double_AVX2(VXFullF, VYFullF, prefF, w, dwords_ref_pitch, dwords_w);
    __m256i dstB = lookup_double_AVX2(VXFullB, VYFullB, prefB, w, dwords_ref_pitch, dwords_w);

    __m256i maskf = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w]));
    __m256i maskb = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w]));

    __m256i dstF_dstB = _mm256_add_epi32(dstF, dstB);
    dstF_dstB = _mm256_slli_epi32(dstF_dstB, 8);

    __m256i dst;
    if (sizeof(PixelType) == 1) {
        __m256i dstB_dstF = _mm256_sub_epi16(dstB, dstF);
        __m256i maskf_maskb = _mm256_sub_epi16(maskf, maskb);
        dst = _mm256_madd_epi16(dstB_dstF, maskf_maskb);
    } else {
        __m256i dstB_dstF = _mm256_sub_epi32(dstB, dstF);
        __m256i maskf_maskb = _mm256_sub_epi32(maskf, maskb);
        dst = _mm256_mullo_epi32(dstB_dstF, maskf_maskb);
    }

    dst = _mm256_add_epi32(dst, dstF_dstB);
    dst = _mm256_srai_epi32(dst, 9);

    dst = _mm256_packus_epi32(dst, dst);
    dst = _mm256_permute4x64_epi64(dst, 0xe8); // 0b11101000 - copy third qword to second qword

    __m128i dst128 = _mm256_castsi256_si128(dst);

    if (sizeof(PixelType) == 1) {
        dst128 = _mm_packus_epi16(dst128, dst128);
        _mm_storel_epi64((__m128i *)&pdst[w], dst128);
    } else {
        _mm_storeu_si128((__m128i *)&pdst[w], dst128);
    }
}
inline __m256i avx2_uv_to_ringid(const __m256i u, const __m256i v)
{
    // return static_cast<unsigned>(std::max({std::abs(u), std::abs(v),
    //   std::abs(u+v)}));
    __m256i ringid = _mm256_abs_epi32(u);
    ringid = _mm256_max_epu32(ringid, _mm256_abs_epi32(v));
    ringid = _mm256_max_epu32(ringid, _mm256_abs_epi32(_mm256_add_epi32(u,v)));
    return ringid;
}
inline __m256i avx2_uv_to_hexid_cw(__m256i u, __m256i v)
{
    // u += v;
    // v = -v;
    // return uv_to_hexid_ccw(u, v);
    u = _mm256_add_epi32(u, v);
    v = _mm256_sign_epi32(v, _mm256_cmpeq_epi32(v, v));
    return avx2_uv_to_hexid_ccw(u, v);
}
INLINE static void sum_block_dual_avx2(__m256i *ver_row, unsigned *sum0, unsigned *sum1)
{
  __m256i sad = _mm256_setzero_si256();
  haddwd_accumulate_dual_avx2(&sad, ver_row + 0);
  haddwd_accumulate_dual_avx2(&sad, ver_row + 1);
  haddwd_accumulate_dual_avx2(&sad, ver_row + 2);
  haddwd_accumulate_dual_avx2(&sad, ver_row + 3);
  haddwd_accumulate_dual_avx2(&sad, ver_row + 4);
  haddwd_accumulate_dual_avx2(&sad, ver_row + 5);
  haddwd_accumulate_dual_avx2(&sad, ver_row + 6);
  haddwd_accumulate_dual_avx2(&sad, ver_row + 7);

  sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, KVZ_PERMUTE(2, 3, 0, 1)));
  sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, KVZ_PERMUTE(1, 0, 1, 0)));

  *sum0 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sad, 0));
  *sum1 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sad, 1));
}
inline __m256i avx2_positive_hexid_to_ringid_loop(const __m256i hexid)
{
    // This algorithm is relatively slow in comparison to the scalar version,
    // but is still faster overall considering we compute 8 ringids in one go.
    const __m256i six = _mm256_set1_epi32(6);
    const __m256i one = _mm256_set1_epi32(1);
    __m256i ringid = _mm256_setzero_si256();
    __m256i nsites = one;
    __m256i nring = _mm256_setzero_si256();
    __m256i mask = _mm256_cmpgt_epi32(nsites, hexid);
    while(~_mm256_movemask_epi8(mask)) {
        ringid = _mm256_blendv_epi8(_mm256_add_epi32(ringid, one), ringid, mask);
        nring = _mm256_add_epi32(nring, six);
        nsites = _mm256_add_epi32(nsites, nring);
        mask = _mm256_cmpgt_epi32(nsites, hexid);
    }
    return ringid;
}
__m256i branchfree_search8_avx(int* source, size_t n, __m256i target) {
    __m256i offsets = _mm256_setzero_si256();
    if(n == 0) return offsets;

    __m256i ha = _mm256_set1_epi32(n>>1);
    while(n>1) {
        n -= n>>1;
        __m256i offsetsplushalf = _mm256_add_epi32(offsets,ha);
        ha = _mm256_sub_epi32(ha,_mm256_srli_epi32(ha,1));
        __m256i keys = _mm256_i32gather_epi32(source,offsetsplushalf,4);
        __m256i lt = _mm256_cmpgt_epi32(target,keys);
        offsets = _mm256_blendv_epi8(offsets,offsetsplushalf,lt);
    }
    __m256i lastkeys = _mm256_i32gather_epi32(source,offsets,4);
    __m256i lastlt = _mm256_cmpgt_epi32(target,lastkeys);
    __m256i oneswhereneeded = _mm256_srli_epi32(lastlt,31);
    __m256i answer = _mm256_add_epi32(offsets,oneswhereneeded);
    return answer;
}
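A hedged usage sketch for branchfree_search8_avx above: it gathers from a sorted int array and returns, per 32-bit lane, an index in the spirit of a branchless lower bound. The array contents, key values, and the helper name example_branchfree_search8 are illustrative assumptions, not part of the original source.

// Hypothetical usage sketch: search a sorted array for 8 keys at once
// and read back the per-lane result indices.
#include <cstdio>
#include <immintrin.h>

void example_branchfree_search8() {
    int sorted[64];
    for (int i = 0; i < 64; i++) sorted[i] = 3 * i;   // 0, 3, 6, ..., 189

    // Eight independent search keys, one per 32-bit lane.
    const __m256i targets = _mm256_setr_epi32(-7, 0, 5, 6, 50, 100, 189, 200);
    const __m256i idx = branchfree_search8_avx(sorted, 64, targets);

    int out[8];
    _mm256_storeu_si256((__m256i *)out, idx);
    for (int i = 0; i < 8; i++)
        std::printf("lane %d -> index %d\n", i, out[i]);
}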
static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
                                        __m256i *const sse,
                                        __m256i *const sum) {
  const __m256i adj_sub = _mm256_set1_epi16(0xff01);  // (1,-1)

  // unpack into pairs of source and reference values
  const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
  const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);

  // subtract adjacent elements using src*1 + ref*-1
  const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
  const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
  const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
  const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);

  // add to the running totals
  *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
  *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
}
v8sf exp256_ps(v8sf x) {
  v8sf tmp = _mm256_setzero_ps(), fx;
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
  x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
  fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);

  /* how to perform a floorf with SSE: just below */
  //imm0 = _mm256_cvttps_epi32(fx);
  //tmp  = _mm256_cvtepi32_ps(imm0);

  tmp = _mm256_floor_ps(fx);

  /* if greater, subtract 1 */
  //v8sf mask = _mm256_cmpgt_ps(tmp, fx);
  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
  mask = _mm256_and_ps(mask, one);
  fx = _mm256_sub_ps(tmp, mask);

  tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
  v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);

  z = _mm256_mul_ps(x,x);

  v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);

  /* build 2^n */
  imm0 = _mm256_cvttps_epi32(fx);
  // another two AVX2 instructions
  imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  imm0 = _mm256_slli_epi32(imm0, 23);
  v8sf pow2n = _mm256_castsi256_ps(imm0);
  y = _mm256_mul_ps(y, pow2n);
  return y;
}
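A small driver sketch for exp256_ps above, assuming the v8sf/v8si typedefs and the _ps256_* / _pi32_256_* constant tables defined alongside that function are in scope; the input values and the helper name example_exp256_ps are illustrative only.

// Hypothetical driver: evaluates eight exponentials in one call and prints them.
#include <cstdio>
#include <immintrin.h>

void example_exp256_ps() {
    const float in[8] = { -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f };
    float out[8];

    v8sf x = _mm256_loadu_ps(in);
    v8sf y = exp256_ps(x);                 // eight approximate exponentials per call
    _mm256_storeu_ps(out, y);

    for (int i = 0; i < 8; i++)
        std::printf("exp(%g) ~= %g\n", in[i], out[i]);
}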
__m256 mm256_exp_ps(__m256 x) {
  __m256 tmp = _mm256_setzero_ps(), fx;
  __m256i emm0;
  __m256 one = *(__m256*)m256_ps_1;

  x = _mm256_min_ps(x, *(__m256*)m256_ps_exp_hi);
  x = _mm256_max_ps(x, *(__m256*)m256_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_LOG2EF);
  fx = _mm256_add_ps(fx, *(__m256*)m256_ps_0p5);

  /* how to perform a floorf with SSE: just below */
  /* step 1 : cast to int */
  emm0 = _mm256_cvttps_epi32(fx);
  /* step 2 : cast back to float */
  tmp = _mm256_cvtepi32_ps(emm0);

  /* if greater, subtract 1 */
  __m256 mask = _mm256_cmp_ps( tmp, fx, _CMP_GT_OS );
  mask = _mm256_and_ps(mask, one);
  fx = _mm256_sub_ps(tmp, mask);

  tmp = _mm256_mul_ps(fx, *(__m256*)m256_ps_cephes_exp_C1);
  __m256 z = _mm256_mul_ps(fx, *(__m256*)m256_ps_cephes_exp_C2);
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);

  z = _mm256_mul_ps(x,x);

  __m256 y = *(__m256*)m256_ps_cephes_exp_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p5);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);

  /* build 2^n */
  emm0 = _mm256_cvttps_epi32(fx);
  emm0 = _mm256_add_epi32(emm0, *(__m256i*)m256_pi32_0x7f);
  emm0 = _mm256_slli_epi32(emm0, 23);
  __m256 pow2n = _mm256_castsi256_ps(emm0);

  y = _mm256_mul_ps(y, pow2n);
  _mm256_zeroupper();
  return y;
}
int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) {
  __m256i accum = _mm256_setzero_si256();
  int i;

  for (i = 0; i < length; i += 8, coeff += 8) {
    const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff);
    const __m256i abs = _mm256_abs_epi32(src_line);
    accum = _mm256_add_epi32(accum, abs);
  }

  {  // 32 bit horizontal add
    const __m256i a = _mm256_srli_si256(accum, 8);
    const __m256i b = _mm256_add_epi32(accum, a);
    const __m256i c = _mm256_srli_epi64(b, 32);
    const __m256i d = _mm256_add_epi32(b, c);
    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
                                            _mm256_extractf128_si256(d, 1));
    return _mm_cvtsi128_si32(accum_128);
  }
}
inline avx_m256_t newsin_ps(avx_m256_t x) {
    avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask);
    x = _mm256_and_ps(x, _ps_inv_sign_mask);

    avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

    avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
    emm2 = _mm256_add_epi32(emm2, _pi32_1);
    emm2 = _mm256_and_si256(emm2, _pi32_inv1);
    y = _mm256_cvtepi32_ps(emm2);

    avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4);
    emm0 = _mm256_slli_epi32(emm0, 29);

    emm2 = _mm256_and_si256(emm2, _pi32_2);
    emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());

    avx_m256_t swap_sign_bit = _mm256_castsi256_ps(emm0);
    avx_m256_t poly_mask = _mm256_castsi256_ps(emm2);
    sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

    avx_m256_t temp = _ps_minus_cephes_DP123;
    temp = _mm256_mul_ps(y, temp);
    x = _mm256_add_ps(x, temp);

    avx_m256_t x2 = _mm256_mul_ps(x, x);
    avx_m256_t x3 = _mm256_mul_ps(x2, x);
    avx_m256_t x4 = _mm256_mul_ps(x2, x2);

    y = _ps_coscof_p0;
    avx_m256_t y2 = _ps_sincof_p0;
    y = _mm256_mul_ps(y, x2);
    y2 = _mm256_mul_ps(y2, x2);
    y = _mm256_add_ps(y, _ps_coscof_p1);
    y2 = _mm256_add_ps(y2, _ps_sincof_p1);
    y = _mm256_mul_ps(y, x2);
    y2 = _mm256_mul_ps(y2, x2);
    y = _mm256_add_ps(y, _ps_coscof_p2);
    y2 = _mm256_add_ps(y2, _ps_sincof_p2);
    y = _mm256_mul_ps(y, x4);
    y2 = _mm256_mul_ps(y2, x3);

    temp = _mm256_mul_ps(x2, _ps_0p5);
    temp = _mm256_sub_ps(temp, _ps_1);
    y = _mm256_sub_ps(y, temp);
    y2 = _mm256_add_ps(y2, x);

    y = _mm256_andnot_ps(poly_mask, y);
    y2 = _mm256_and_ps(poly_mask, y2);
    y = _mm256_add_ps(y, y2);

    y = _mm256_xor_ps(y, sign_bit);

    return y;
} // newsin_ps()
void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
                                    ptrdiff_t src_stride, tran_low_t *coeff) {
  int idx;
  tran_low_t *t_coeff = coeff;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));

    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);

    b0 = _mm256_srai_epi32(b0, 2);
    b1 = _mm256_srai_epi32(b1, 2);
    b2 = _mm256_srai_epi32(b2, 2);
    b3 = _mm256_srai_epi32(b3, 2);

    coeff0 = _mm256_add_epi32(b0, b2);
    coeff1 = _mm256_add_epi32(b1, b3);
    coeff2 = _mm256_sub_epi32(b0, b2);
    coeff3 = _mm256_sub_epi32(b1, b3);

    _mm256_storeu_si256((__m256i *)coeff, coeff0);
    _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
    _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
    _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);

    coeff += 8;
    t_coeff += 8;
  }
}