int TestAVX2(long long int size) {
    long long int bytes = 1024ll * 1024ll * 4096ll;
    long long int count = bytes / (size * 1024ll);
    std::ostringstream oss;
    oss << "Size: " << size << "KB; speed:";
    void* mem = _aligned_malloc((size * 1024), 32);
    int limit = (size * 1024) / 32;
    __m256i dummy = _mm256_set1_epi32(0);
    {
        Util::Timer timer(oss.str().c_str(), bytes);
        for (int i = 0; i < count; ++i) {
            // AVX2 load & xor:
            const __m256i* data = (const __m256i*)mem;
            const __m256i* end = (const __m256i*)(((byte*)mem) + size * 1024);
            __m256i dummy2 = _mm256_set1_epi32(0);
            for (; data != end; ++data) {
                dummy2 = _mm256_load_si256(data);
            }
            dummy = _mm256_xor_si256(dummy2, dummy);
        }
    }
    _aligned_free(mem);
    return (int)(dummy.m256i_i32[0]);
}
static void TestAVX2MT2(MTTest *t) {
    long long int count = t->count;
    void* mem = t->mem;
    int size = t->size;
    __m256i dummy = _mm256_set1_epi32(0);
    for (int i = 0; i < count; ++i) {
        // AVX2 load & xor:
        const __m256i* data = (const __m256i*)mem;
        const __m256i* end = (const __m256i*)(((byte*)mem) + size * 1024);
        // We're attempting to get the compiler to make dummy2 a register. We need it because
        // otherwise the complete loop will get eliminated.
        __m256i dummy2 = _mm256_set1_epi32(0);
        for (; data != end; ++data) {
            dummy2 = _mm256_load_si256(data);
        }
        dummy = _mm256_xor_si256(dummy, dummy2);
    }
    t->dummy ^= dummy.m256i_i32[0];
}
unsigned calin::math::hex_array::test_avx2_uv_to_hexid_cw(int u, int v)
{
#if defined(__AVX2__) and defined(__FMA__)
  __m256i vu = _mm256_set1_epi32(u);
  __m256i vv = _mm256_set1_epi32(v);
  __m256i vhexid = avx2_uv_to_hexid_cw(vu, vv);
  return vhexid[0];
#else
  throw std::runtime_error("AVX2 and FMA not available at compile time");
#endif
}
unsigned calin::math::hex_array::test_avx2_ringid_segid_runid_to_hexid(
  unsigned ringid, unsigned segid, unsigned runid)
{
#if defined(__AVX2__) and defined(__FMA__)
  __m256i vringid = _mm256_set1_epi32(ringid);
  __m256i vsegid = _mm256_set1_epi32(segid);
  __m256i vrunid = _mm256_set1_epi32(runid);
  __m256i vhexid = avx2_ringid_segid_runid_to_hexid(vringid, vsegid, vrunid);
  return vhexid[0];
#else
  throw std::runtime_error("AVX2 and FMA not available at compile time");
#endif
}
static void FlowInterExtra_AVX2(
        uint8_t *pdst8, int dst_pitch,
        const uint8_t *prefB8, const uint8_t *prefF8, int ref_pitch,
        const int16_t *VXFullB, const int16_t *VXFullF,
        const int16_t *VYFullB, const int16_t *VYFullF,
        const uint8_t *MaskB, const uint8_t *MaskF,
        int VPitch, int width, int height, int time256, int nPel,
        const int16_t *VXFullBB, const int16_t *VXFullFF,
        const int16_t *VYFullBB, const int16_t *VYFullFF) {
    const PixelType *prefB = (const PixelType *)prefB8;
    const PixelType *prefF = (const PixelType *)prefF8;
    PixelType *pdst = (PixelType *)pdst8;

    ref_pitch /= sizeof(PixelType);
    dst_pitch /= sizeof(PixelType);

    int nPelLog = ilog2(nPel);

    const __m256i dwords_time256 = _mm256_set1_epi32(time256);
    const __m256i dwords_256_time256 = _mm256_set1_epi32(256 - time256);
    const __m256i dwords_ref_pitch = _mm256_set1_epi32(ref_pitch);
    const __m256i dwords_hoffsets = _mm256_set_epi32(7 << nPelLog, 6 << nPelLog, 5 << nPelLog, 4 << nPelLog,
                                                     3 << nPelLog, 2 << nPelLog, 1 << nPelLog, 0);

    const int pixels_per_iteration = 8;
    const int width_avx2 = width & ~(pixels_per_iteration - 1);

    for (int h = 0; h < height; h++) {
        for (int w = 0; w < width_avx2; w += pixels_per_iteration)
            FlowInterExtra_8px_AVX2(w, pdst, prefB, prefF, VXFullB, VXFullF, VYFullB, VYFullF, MaskB, MaskF, nPelLog, VXFullBB, VXFullFF, VYFullBB, VYFullFF, dwords_time256, dwords_256_time256, dwords_ref_pitch, dwords_hoffsets);

        if (width_avx2 < width)
            FlowInterExtra_8px_AVX2(width - pixels_per_iteration, pdst, prefB, prefF, VXFullB, VXFullF, VYFullB, VYFullF, MaskB, MaskF, nPelLog, VXFullBB, VXFullFF, VYFullBB, VYFullFF, dwords_time256, dwords_256_time256, dwords_ref_pitch, dwords_hoffsets);

        pdst += dst_pitch;
        prefB += ref_pitch << nPelLog;
        prefF += ref_pitch << nPelLog;
        VXFullB += VPitch;
        VYFullB += VPitch;
        VXFullF += VPitch;
        VYFullF += VPitch;
        MaskB += VPitch;
        MaskF += VPitch;
        VXFullBB += VPitch;
        VYFullBB += VPitch;
        VXFullFF += VPitch;
        VYFullFF += VPitch;
    }
}
void calin::math::hex_array::test_avx2_uv_to_xy_f(int u, int v, float& x, float& y)
{
#if defined(__AVX2__) and defined(__FMA__)
  __m256i vu = _mm256_set1_epi32(u);
  __m256i vv = _mm256_set1_epi32(v);
  __m256 vx;
  __m256 vy;
  avx2_uv_to_xy_f(vu, vv, vx, vy);
  x = vx[0];
  y = vy[0];
#else
  throw std::runtime_error("AVX2 and FMA not available at compile time");
#endif
}
/*!
 * \brief Multiply the two given vectors of bytes
 */
ETL_STATIC_INLINE(avx_simd_byte) mul(avx_simd_byte lhs, avx_simd_byte rhs) {
    // Isolate the odd bytes by shifting each 16-bit lane right by 8
    auto aodd = _mm256_srli_epi16(lhs.value, 8);
    auto bodd = _mm256_srli_epi16(rhs.value, 8);
    // 16-bit products; the low bytes of the even products land in the even positions
    auto muleven = _mm256_mullo_epi16(lhs.value, rhs.value);
    // 16-bit products of the odd bytes, shifted back into the odd positions
    auto mulodd = _mm256_slli_epi16(_mm256_mullo_epi16(aodd, bodd), 8);
    // Take even bytes from muleven and odd bytes from mulodd
    return _mm256_blendv_epi8(mulodd, muleven, _mm256_set1_epi32(0x00FF00FF));
}
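// A minimal standalone check of the same odd/even technique, outside the ETL
// wrapper types (mul_u8 and the harness below are illustrative sketches, not
// part of ETL): each result byte should equal the low byte of the product of
// the corresponding input bytes.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

static __m256i mul_u8(__m256i a, __m256i b) {
    __m256i aodd    = _mm256_srli_epi16(a, 8);
    __m256i bodd    = _mm256_srli_epi16(b, 8);
    __m256i muleven = _mm256_mullo_epi16(a, b);
    __m256i mulodd  = _mm256_slli_epi16(_mm256_mullo_epi16(aodd, bodd), 8);
    return _mm256_blendv_epi8(mulodd, muleven, _mm256_set1_epi32(0x00FF00FF));
}

int main() {
    uint8_t a[32], b[32], r[32];
    for (int i = 0; i < 32; i++) { a[i] = (uint8_t)(i * 7 + 3); b[i] = (uint8_t)(255 - i * 5); }
    _mm256_storeu_si256((__m256i*)r, mul_u8(_mm256_loadu_si256((const __m256i*)a),
                                            _mm256_loadu_si256((const __m256i*)b)));
    for (int i = 0; i < 32; i++)
        if (r[i] != (uint8_t)(a[i] * b[i])) { std::printf("mismatch at %d\n", i); return 1; }
    std::printf("ok\n");
    return 0;
}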
static inline void blend_unorm8_argb(struct reg *src, __m256i dst_argb)
{
    if (gt.blend.enable) {
        const __m256i mask = _mm256_set1_epi32(0xff);
        const __m256 scale = _mm256_set1_ps(1.0f / 255.0f);
        struct reg dst[4];

        /* Convert to float */
        dst[2].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[1].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[0].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[3].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);

        /* Blend, assuming src BLENDFACTOR_SRC_ALPHA, dst
         * BLENDFACTOR_INV_SRC_ALPHA, and BLENDFUNCTION_ADD. */
        const __m256 inv_alpha = _mm256_sub_ps(_mm256_set1_ps(1.0f), src[3].reg);
        src[0].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[0].reg),
                                   _mm256_mul_ps(inv_alpha, dst[0].reg));
        src[1].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[1].reg),
                                   _mm256_mul_ps(inv_alpha, dst[1].reg));
        src[2].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[2].reg),
                                   _mm256_mul_ps(inv_alpha, dst[2].reg));
        src[3].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[3].reg),
                                   _mm256_mul_ps(inv_alpha, dst[3].reg));
    }
}
size_t variablevectorshift_unrolled(uint32_t *array, size_t length, int shiftamount) {
    size_t k = 0;
    __m256i *a = (__m256i *)array;
    __m256i s = _mm256_set1_epi32(shiftamount);
    for (; k + 3 < length / 8; k += 4, a += 4) {
        __m256i v1 = _mm256_loadu_si256(a);
        __m256i v2 = _mm256_loadu_si256(a + 1);
        __m256i v3 = _mm256_loadu_si256(a + 2);
        __m256i v4 = _mm256_loadu_si256(a + 3);
        v1 = _mm256_srlv_epi32(v1, s);
        v2 = _mm256_srlv_epi32(v2, s);
        v3 = _mm256_srlv_epi32(v3, s);
        v4 = _mm256_srlv_epi32(v4, s);
        _mm256_storeu_si256(a, v1);
        _mm256_storeu_si256(a + 1, v2);
        _mm256_storeu_si256(a + 2, v3);
        _mm256_storeu_si256(a + 3, v4);
    }
    for (; k < length / 8; k++, a++) {
        __m256i v = _mm256_loadu_si256(a);
        v = _mm256_srlv_epi32(v, s);
        _mm256_storeu_si256(a, v);
    }
    k *= 8;
    for (; k < length; ++k) {
        array[k] = array[k] >> shiftamount;
    }
    return 0;
}
inline void avx2_positive_hexid_to_ringid_segid_runid(
  const __m256i hexid, __m256i& ringid, __m256i& segid, __m256i& runid)
{
  // ringid = positive_hexid_to_ringid(hexid);
  // unsigned iring = hexid - ringid_to_nsites_contained(ringid-1);
  // segid = int(iring/ringid);
  // runid = iring - segid*ringid;
  // The division iring/ringid is replaced below by five conditional
  // subtractions, since segid can only be 0..5.
  const __m256i one = _mm256_set1_epi32(1);
  ringid = avx2_positive_hexid_to_ringid(hexid);
  runid = _mm256_sub_epi32(hexid,
    avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid, one)));
  segid = _mm256_setzero_si256();

  const __m256i ringid_minus_one = _mm256_sub_epi32(ringid, one);

  __m256i mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

  mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

  mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

  mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

  mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));
}
inline __m256i avx2_positive_hexid_to_ringid_loop(const __m256i hexid)
{
  // This algorithm is relatively slow in comparison to the scalar version
  // but still faster overall considering we compute 8 ringids in one go
  const __m256i six = _mm256_set1_epi32(6);
  const __m256i one = _mm256_set1_epi32(1);
  __m256i ringid = _mm256_setzero_si256();
  __m256i nsites = one;
  __m256i nring = _mm256_setzero_si256();
  __m256i mask = _mm256_cmpgt_epi32(nsites, hexid);
  while(~_mm256_movemask_epi8(mask)) {
    ringid = _mm256_blendv_epi8(_mm256_add_epi32(ringid, one), ringid, mask);
    nring = _mm256_add_epi32(nring, six);
    nsites = _mm256_add_epi32(nsites, nring);
    mask = _mm256_cmpgt_epi32(nsites, hexid);
  }
  return ringid;
}
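// For reference, a scalar sketch of the per-lane search the vector loop above
// performs (illustrative only, not part of calin): grow nsites ring by ring
// until it exceeds hexid, using the fact that ring r contributes 6*r sites.
unsigned positive_hexid_to_ringid_scalar(unsigned hexid) {
    unsigned ringid = 0, nsites = 1, nring = 0;
    while (nsites <= hexid) {
        ++ringid;
        nring += 6;
        nsites += nring;
    }
    return ringid;
}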
inline __m256i avx2_positive_ringid_segid_runid_to_hexid(
  const __m256i ringid, const __m256i segid, const __m256i runid)
{
  // return ringid_to_nsites_contained(ringid-1)+segid*ringid+runid;
  const __m256i one = _mm256_set1_epi32(1);
  __m256i nsites = avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid, one));
  nsites = _mm256_add_epi32(nsites, _mm256_mullo_epi32(segid, ringid));
  nsites = _mm256_add_epi32(nsites, runid);
  return nsites;
}
void PriorityQueue_AVX2::clear()
{
    _size = _current = 0;
    _maxrank = INT_MAX;
    _isPopping = false;

    __m256i max = _mm256_set1_epi32(_maxrank);
    for (int i = 0; i < 5; ++i)
        _mm256_store_si256(_rv + i, max);
}
__SIMDi _SIMD_abs_epi32(__SIMDi a) {
#ifdef USE_SSE
    // Integers have no sign-bit mask shortcut (unlike floats, where andnot
    // with -0.0f clears the sign bit, an integer -0 is simply 0), so use the
    // dedicated abs intrinsics.
    return _mm_abs_epi32(a);    // SSSE3
#elif defined USE_AVX
    return _mm256_abs_epi32(a); // AVX2
#elif defined USE_IBM
    return vec_abs(a);
#endif
}
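// If the SSSE3/AVX2 abs intrinsics are unavailable, the usual branch-free
// fallback is the two's-complement identity |x| = (x ^ (x >> 31)) - (x >> 31).
// A sketch for plain SSE2 (the function name and the SSE2-only target are
// assumptions, not from the original source):
#include <immintrin.h>

static inline __m128i abs_epi32_sse2(__m128i a) {
    __m128i sign = _mm_srai_epi32(a, 31);               // 0 for x >= 0, -1 for x < 0
    return _mm_sub_epi32(_mm_xor_si128(a, sign), sign); // (x ^ sign) - sign
}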
static inline __m256i enc_reshuffle (__m256i in)
{
    // Spread out 32-bit words over both halves of the input register:
    in = _mm256_permutevar8x32_epi32(in, _mm256_setr_epi32(
        0, 1, 2, -1,
        3, 4, 5, -1));

    // Slice into 32-bit chunks and operate on all chunks in parallel.
    // All processing is done within the 32-bit chunk. First, shuffle:
    // before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb]
    // after:  [00000000|aaaaaabb|bbbbcccc|ccdddddd]
    in = _mm256_shuffle_epi8(in, _mm256_set_epi8(
        -1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2,
        -1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2));

    // cd      = [00000000|00000000|0000cccc|ccdddddd]
    const __m256i cd = _mm256_and_si256(in, _mm256_set1_epi32(0x00000FFF));

    // ab      = [0000aaaa|aabbbbbb|00000000|00000000]
    const __m256i ab = _mm256_and_si256(_mm256_slli_epi32(in, 4), _mm256_set1_epi32(0x0FFF0000));

    // merged  = [0000aaaa|aabbbbbb|0000cccc|ccdddddd]
    const __m256i merged = _mm256_or_si256(ab, cd);

    // bd      = [00000000|00bbbbbb|00000000|00dddddd]
    const __m256i bd = _mm256_and_si256(merged, _mm256_set1_epi32(0x003F003F));

    // ac      = [00aaaaaa|00000000|00cccccc|00000000]
    const __m256i ac = _mm256_and_si256(_mm256_slli_epi32(merged, 2), _mm256_set1_epi32(0x3F003F00));

    // indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
    const __m256i indices = _mm256_or_si256(ac, bd);

    // return  = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
    return _mm256_bswap_epi32(indices);
}
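// _mm256_bswap_epi32 above is a project-local helper, not an Intel intrinsic.
// A typical definition (a sketch; the name and exact form here are assumed)
// reverses the bytes of each 32-bit word with _mm256_shuffle_epi8, whose byte
// indices apply independently within each 128-bit lane:
static inline __m256i mm256_bswap_epi32_sketch(__m256i in) {
    return _mm256_shuffle_epi8(in, _mm256_setr_epi8(
        3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
        3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12));
}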
inline __m256i avx2_ringid_to_nsites_contained(const __m256i ringid)
{
  // return 3*ringid*(ringid+1)+1;
  const __m256i one = _mm256_set1_epi32(1);
  __m256i nsites = _mm256_add_epi32(ringid, one);
  nsites = _mm256_mullo_epi32(ringid, nsites);
  // 3*n computed as 4*n - n to avoid a second multiply
  nsites = _mm256_sub_epi32(_mm256_slli_epi32(nsites, 2), nsites);
  nsites = _mm256_add_epi32(nsites, one);
  return nsites;
}
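// Quick scalar check of the closed form (a sketch): 3r(r+1)+1 yields the
// centered hexagonal numbers 1, 7, 19, 37 for r = 0..3.
#include <cassert>

static void check_nsites_contained() {
    const unsigned expected[] = {1, 7, 19, 37};
    for (unsigned r = 0; r < 4; ++r)
        assert(3 * r * (r + 1) + 1 == expected[r]);
}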
void TransLut_FindIndexAvx2 <TransLut::MapperLin>::find_index (
    const TransLut::FloatIntMix val_arr [8], __m256i &index, __m256 &frac)
{
    assert (val_arr != 0);

    const __m256  scale   = _mm256_set1_ps (1 << LINLUT_RES_L2);
    const __m256i offset  = _mm256_set1_epi32 (-LINLUT_MIN_F * (1 << LINLUT_RES_L2));
    const __m256i val_min = _mm256_setzero_si256 ();
    const __m256i val_max = _mm256_set1_epi32 (LINLUT_SIZE_F - 2);

    const __m256  v         = _mm256_load_ps (reinterpret_cast <const float *> (val_arr));
    const __m256  val_scl   = _mm256_mul_ps (v, scale);
    const __m256i index_raw = _mm256_cvtps_epi32 (val_scl);
    __m256i       index_tmp = _mm256_add_epi32 (index_raw, offset);
    index_tmp = _mm256_min_epi32 (index_tmp, val_max);
    index     = _mm256_max_epi32 (index_tmp, val_min);
    frac      = _mm256_sub_ps (val_scl, _mm256_cvtepi32_ps (index_raw));
}
unsigned calin::math::hex_array::test_avx2_positive_hexid_to_ringid_root(unsigned hexid)
{
#if defined(__AVX2__) and defined(__FMA__)
  __m256i vhexid = _mm256_set1_epi32(hexid);
  __m256i vringid = avx2_positive_hexid_to_ringid_root(vhexid);
  return vringid[0];
#else
  throw std::runtime_error("AVX2 and FMA not available at compile time");
#endif
}
unsigned calin::math::hex_array::test_avx2_ringid_to_nsites_contained(unsigned ringid)
{
#if defined(__AVX2__) and defined(__FMA__)
  __m256i vringid = _mm256_set1_epi32(ringid);
  __m256i vnsites = avx2_ringid_to_nsites_contained(vringid);
  return vnsites[0];
#else
  throw std::runtime_error("AVX2 and FMA not available at compile time");
#endif
}
inline void avx2_hexid_to_uv_ccw(const __m256i hexid, __m256i& u, __m256i& v)
{
  // if(hexid==0) { u = v = 0; return; }
  // unsigned ringid;
  // unsigned segid;
  // unsigned runid;
  // positive_hexid_to_ringid_segid_runid(hexid, ringid, segid, runid);
  // switch(segid)
  // {
  //   case 0: u = ringid-runid; v = runid;        break;
  //   case 1: u = -runid;       v = ringid;       break;
  //   case 2: u = -ringid;      v = ringid-runid; break;
  //   case 3: u = runid-ringid; v = -runid;       break;
  //   case 4: u = runid;        v = -ringid;      break;
  //   case 5: u = ringid;       v = runid-ringid; break;
  //   default: assert(0);
  // }
  const __m256i one = _mm256_set1_epi32(1);
  __m256i ringid = avx2_positive_hexid_to_ringid(hexid);
  __m256i iring = _mm256_sub_epi32(hexid,
    avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid, one)));

  u = ringid;
  v = _mm256_setzero_si256();

  __m256i irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  v = _mm256_add_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  v = _mm256_sub_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_add_epi32(u, irun);
  v = _mm256_sub_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_add_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  v = _mm256_add_epi32(v, iring);

  const __m256i mask = _mm256_cmpeq_epi32(hexid, _mm256_setzero_si256());
  u = _mm256_andnot_si256(mask, u);
  v = _mm256_andnot_si256(mask, v);
}
void calin::math::hex_array::test_avx2_hexid_to_uv_cw(unsigned hexid, int& u, int& v)
{
#if defined(__AVX2__) and defined(__FMA__)
  __m256i vhexid = _mm256_set1_epi32(hexid);
  __m256i vu;
  __m256i vv;
  avx2_hexid_to_uv_cw(vhexid, vu, vv);
  u = vu[0];
  v = vv[0];
#else
  throw std::runtime_error("AVX2 and FMA not available at compile time");
#endif
}
void calin::math::hex_array::
test_avx2_hexid_to_xy_f(unsigned hexid, float& x, float& y, bool clockwise)
{
#if defined(__AVX2__) and defined(__FMA__)
  __m256i vhexid = _mm256_set1_epi32(hexid);
  __m256 vx;
  __m256 vy;
  avx2_hexid_to_xy_f(vhexid, vx, vy, clockwise);
  x = vx[0];
  y = vy[0];
#else
  throw std::runtime_error("AVX2 and FMA not available at compile time");
#endif
}
// credit: Harold Aptroot
uint32_t maskedvectorsum(uint32_t * z, uint32_t N, uint32_t * accesses, uint32_t nmbr) {
    // Masking with N - 1 always yields an in-bounds index, but it is a uniform
    // reduction into [0, N) only when N is a power of two.
    __m256i Nvec = _mm256_set1_epi32(N - 1);
    __m256i sum = _mm256_setzero_si256();
    for (uint32_t j = 0; j < nmbr; j += 8) {
        __m256i indexes = _mm256_loadu_si256((__m256i*)(accesses + j));
        indexes = _mm256_and_si256(indexes, Nvec);
        __m256i fi = _mm256_i32gather_epi32((int*)z, indexes, 4);
        sum = _mm256_add_epi32(sum, fi);
    }
    __m128i sum128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 0),
                                   _mm256_extracti128_si256(sum, 1));
    sum128 = _mm_hadd_epi32(sum128, sum128);
    return _mm_extract_epi32(sum128, 0) + _mm_extract_epi32(sum128, 1);
}
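// Usage sketch (illustrative; the sizes and index pattern are assumptions):
// N is chosen as a power of two so the & (N - 1) mask is a valid reduction,
// and nmbr is a multiple of 8 to match the gather loop.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const uint32_t N = 1 << 16, nmbr = 1 << 10;
    std::vector<uint32_t> z(N), accesses(nmbr);
    for (uint32_t i = 0; i < N; i++) z[i] = i;
    for (uint32_t i = 0; i < nmbr; i++) accesses[i] = i * 2654435761u; // scattered indexes
    std::printf("%u\n", maskedvectorsum(z.data(), N, accesses.data(), nmbr));
    return 0;
}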
// assume nmbr is divisible by 4
uint32_t vectorsum(uint32_t * z, uint32_t N, uint32_t * accesses, uint32_t nmbr) {
    __m256i Nvec = _mm256_set1_epi32(N);
    __m128i sum = _mm_setzero_si128();
    for (uint32_t j = 0; j < nmbr; j += 4) {
        // _mm256_mul_epu32 only multiplies the even 32-bit lanes of each load,
        // so successive loads overlap by four elements.
        __m256i fourints = _mm256_loadu_si256((const __m256i *)(accesses + j));
        __m256i four64bitsproducts = _mm256_mul_epu32(fourints, Nvec);
        __m256i fourtop32ints = _mm256_srli_epi64(four64bitsproducts, 32);
        __m128i four32ints = _mm256_i64gather_epi32((const int *)z, fourtop32ints, 4);
        sum = _mm_add_epi32(sum, four32ints);
    }
    uint32_t buffer[4];
    _mm_storeu_si128((__m128i *)buffer, sum);
    return buffer[0] + buffer[1] + buffer[2] + buffer[3];
}
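// Scalar form of the reduction vectorized above (a sketch): multiplying a
// 32-bit value x by N and keeping the high 32 bits of the 64-bit product maps
// x into [0, N) without a division, which is exactly what the
// mul_epu32/srli_epi64 pair computes per 64-bit lane before the gather.
#include <cstdint>

static inline uint32_t reduce_to_range(uint32_t x, uint32_t N) {
    return (uint32_t)(((uint64_t)x * N) >> 32);
}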
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
{
    const int AB_BITS = MAX(10, (int)INTER_BITS);
    int x1 = 0;

    __m256i fxy_mask = _mm256_set1_epi32(INTER_TAB_SIZE - 1);
    __m256i XX = _mm256_set1_epi32(X0), YY = _mm256_set1_epi32(Y0);
    for (; x1 <= bw - 16; x1 += 16)
    {
        __m256i tx0, tx1, ty0, ty1;
        tx0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1)), XX);
        ty0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1)), YY);
        tx1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1 + 8)), XX);
        ty1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1 + 8)), YY);

        tx0 = _mm256_srai_epi32(tx0, AB_BITS - INTER_BITS);
        ty0 = _mm256_srai_epi32(ty0, AB_BITS - INTER_BITS);
        tx1 = _mm256_srai_epi32(tx1, AB_BITS - INTER_BITS);
        ty1 = _mm256_srai_epi32(ty1, AB_BITS - INTER_BITS);

        __m256i fx_ = _mm256_packs_epi32(_mm256_and_si256(tx0, fxy_mask),
                                         _mm256_and_si256(tx1, fxy_mask));
        __m256i fy_ = _mm256_packs_epi32(_mm256_and_si256(ty0, fxy_mask),
                                         _mm256_and_si256(ty1, fxy_mask));
        tx0 = _mm256_packs_epi32(_mm256_srai_epi32(tx0, INTER_BITS),
                                 _mm256_srai_epi32(tx1, INTER_BITS));
        ty0 = _mm256_packs_epi32(_mm256_srai_epi32(ty0, INTER_BITS),
                                 _mm256_srai_epi32(ty1, INTER_BITS));

        fx_ = _mm256_adds_epi16(fx_, _mm256_slli_epi16(fy_, INTER_BITS));
        fx_ = _mm256_permute4x64_epi64(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0);

        _mm256_storeu_si256((__m256i*)(xy + x1 * 2), _mm256_unpacklo_epi16(tx0, ty0));
        _mm256_storeu_si256((__m256i*)(xy + x1 * 2 + 16), _mm256_unpackhi_epi16(tx0, ty0));
        _mm256_storeu_si256((__m256i*)(alpha + x1), fx_);
    }
    _mm256_zeroupper();
    return x1;
}
void bitmask_avx2(uint32_t* ptr, size_t n, uint32_t key, uint8_t* out) {
    uint32_t* output = (uint32_t*)out;

    const size_t N = 8*4; // unrolled 4 times
    const size_t chunks = n / N;
    const size_t tail = n % N;

    const __m256i vkey = _mm256_set1_epi32(key);

    for (size_t i=0; i < chunks; i++) {
        const __m256i in0 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 0*8));
        const __m256i in1 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 1*8));
        const __m256i in2 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 2*8));
        const __m256i in3 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 3*8));

        const __m256i eq0 = _mm256_cmpeq_epi32(in0, vkey);
        const __m256i eq1 = _mm256_cmpeq_epi32(in1, vkey);
        const __m256i eq2 = _mm256_cmpeq_epi32(in2, vkey);
        const __m256i eq3 = _mm256_cmpeq_epi32(in3, vkey);

        // eq0 = [a0 a1 a2 a3 a4 a5 a6 a7] (packed dword)
        // eq1 = [b0 b1 b2 b3 b4 b5 b6 b7] (packed dword)
        // eq2 = [c0 c1 c2 c3 c4 c5 c6 c7] (packed dword)
        // eq3 = [d0 d1 d2 d3 d4 d5 d6 d7] (packed dword)

        // t0 = [a0 a1 a2 a3 c0 c1 c2 c3 a4 a5 a6 a7 c4 c5 c6 c7] (packed word)
        const __m256i t0 = _mm256_packs_epi32(eq0, eq2);
        // m02 = [a0 a1 a2 a3 a4 a5 a6 a7 c0 c1 c2 c3 c4 c5 c6 c7] (packed word)
        const __m256i m02 = _mm256_permutevar8x32_epi32(t0,
                                _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7));

        // t1 = [b0 b1 b2 b3 d0 d1 d2 d3 b4 b5 b6 b7 d4 d5 d6 d7] (packed word)
        const __m256i t1 = _mm256_packs_epi32(eq1, eq3);
        // m13 = [b0 b1 b2 b3 b4 b5 b6 b7 d0 d1 d2 d3 d4 d5 d6 d7] (packed word)
        const __m256i m13 = _mm256_permutevar8x32_epi32(t1,
                                _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7));

        // m = [a0..7 b0..7 c0..7 d0..7] (packed byte)
        const __m256i m = _mm256_packs_epi16(m02, m13);

        *output++ = _mm256_movemask_epi8(m);
    }

    if (tail > 0) {
        // One output bit per input element, so each chunk of N inputs
        // produced N/8 bytes of output.
        bitmask_better_2(ptr + chunks*N, tail, key, out + chunks*N/8);
    }
}
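// Scalar reference for the same output layout (a sketch; tail handling
// omitted): bit i of each 32-bit output word flags ptr[i] == key, matching
// the order produced by the packs/permute/movemask sequence above.
#include <cstdint>
#include <cstddef>

void bitmask_scalar(const uint32_t *ptr, size_t n, uint32_t key, uint8_t *out) {
    uint32_t *output = (uint32_t *)out;
    for (size_t i = 0; i + 32 <= n; i += 32) {
        uint32_t w = 0;
        for (size_t j = 0; j < 32; j++)
            w |= (uint32_t)(ptr[i + j] == key) << j;
        *output++ = w;
    }
}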
size_t variablevectorshift(uint32_t *array, size_t length, int shiftamount) {
    size_t k = 0;
    __m256i *a = (__m256i *)array;
    __m256i s = _mm256_set1_epi32(shiftamount);
    for (; k < length / 8; k++, a++) {
        __m256i v = _mm256_loadu_si256(a);
        v = _mm256_srlv_epi32(v, s);
        _mm256_storeu_si256(a, v);
    }
    k *= 8;
    for (; k < length; ++k) {
        array[k] = array[k] >> shiftamount;
    }
    return 0;
}
void calin::math::hex_array::test_avx2_positive_hexid_to_ringid_segid_runid(unsigned hexid,
  unsigned& ringid, unsigned& segid, unsigned& runid)
{
#if defined(__AVX2__) and defined(__FMA__)
  __m256i vhexid = _mm256_set1_epi32(hexid);
  __m256i vringid;
  __m256i vsegid;
  __m256i vrunid;
  avx2_positive_hexid_to_ringid_segid_runid(vhexid, vringid, vsegid, vrunid);
  ringid = vringid[0];
  segid = vsegid[0];
  runid = vrunid[0];
#else
  throw std::runtime_error("AVX2 and FMA not available at compile time");
#endif
}
inline void avx2_hexid_to_uv_cw(const __m256i hexid, __m256i& u, __m256i& v)
{
#if 0
  // This code is correct but it's not worth maintaining two versions
  const __m256i one = _mm256_set1_epi32(1);
  __m256i ringid = avx2_positive_hexid_to_ringid(hexid);
  __m256i iring = _mm256_sub_epi32(hexid,
    avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid, one)));

  u = ringid;
  v = _mm256_setzero_si256();

  __m256i irun = _mm256_min_epu32(iring, ringid);
  v = _mm256_sub_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  v = _mm256_add_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  v = _mm256_add_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_add_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  u = _mm256_add_epi32(u, irun);
  v = _mm256_add_epi32(v, iring);

  const __m256i mask = _mm256_cmpeq_epi32(hexid, _mm256_setzero_si256());
  u = _mm256_andnot_si256(mask, u);
  v = _mm256_andnot_si256(mask, v);
#else
  // hexid_to_uv_ccw(hexid, u, v);
  // u += v;
  // v = -v;
  avx2_hexid_to_uv_ccw(hexid, u, v);
  u = _mm256_add_epi32(u, v);
  // Negate v: cmpeq(v,v) gives all-ones (-1 per lane), and sign_epi32 by a
  // negative value flips the sign.
  v = _mm256_sign_epi32(v, _mm256_cmpeq_epi32(v, v));
#endif
}
static FORCE_INLINE __m256i lookup_double_AVX2(const int16_t *VXFull, const int16_t *VYFull, const PixelType *pref, int w,
                                               const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {
    __m256i vx = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i *)&VXFull[w]));
    vx = _mm256_srai_epi32(vx, 1);

    // vy is zero-extended and then shifted as 16-bit lanes: the upper halves
    // stay zero, so _mm256_madd_epi16 below computes the signed 16x16-bit
    // product vy * ref_pitch per dword (valid while ref_pitch fits in a
    // signed 16-bit value).
    __m256i vy = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)&VYFull[w]));
    vy = _mm256_srai_epi16(vy, 1);

    __m256i addr = _mm256_madd_epi16(vy, dwords_ref_pitch);
    addr = _mm256_add_epi32(addr, vx);
    addr = _mm256_add_epi32(addr, dwords_hoffsets);

    // It's okay to read two or three bytes more than needed. pref is always padded,
    // unless the user chooses a horizontal padding of 0, which would be stupid.
    __m256i gathered = _mm256_i32gather_epi32((const int *)pref, addr, sizeof(PixelType));
    // Keep only the low sizeof(PixelType) bytes of each gathered dword.
    gathered = _mm256_and_si256(gathered, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1));
    return gathered;
}