/*!
 * \brief Multiply the two given vectors of byte
 *
 * AVX2 has no 8-bit multiply instruction, so the work is done in 16-bit
 * lanes: even-indexed bytes are multiplied in place (low byte of each
 * 16-bit product is the wanted result), odd-indexed bytes are shifted
 * down, multiplied, and shifted back up. The two partial results are
 * then recombined with a byte-wise blend.
 */
ETL_STATIC_INLINE(avx_simd_byte) mul(avx_simd_byte lhs, avx_simd_byte rhs) {
    // Bring the odd bytes of each operand into the low half of their 16-bit lane
    auto lhs_odd = _mm256_srli_epi16(lhs.value, 8);
    auto rhs_odd = _mm256_srli_epi16(rhs.value, 8);

    // Products of the even bytes: the low byte of each 16-bit lane is valid,
    // the high byte is garbage and gets masked away by the blend below
    auto even_products = _mm256_mullo_epi16(lhs.value, rhs.value);

    // Products of the odd bytes, shifted back up into the odd byte positions
    auto odd_products = _mm256_slli_epi16(_mm256_mullo_epi16(lhs_odd, rhs_odd), 8);

    // blendv picks per byte on the mask's top bit: 0xFF bytes (even positions,
    // little-endian) take even_products, 0x00 bytes take odd_products
    return _mm256_blendv_epi8(odd_products, even_products, _mm256_set1_epi32(0x00FF00FF));
}
// Convert 8 Cartesian (x,y) points to hexagonal-grid (u,v) integer cell
// coordinates. In/out: x,y are clobbered; u,v receive the cell indices.
// NOTE(review): relies on project constants _c_m256_vy_inv / _c_m256_vx /
// _c_m256_one_half declared elsewhere — presumably the hex basis-vector
// components; verify against their definitions.
inline void avx2_xy_to_uv_f(__m256 x, __m256 y, __m256i& u, __m256i& v)
{
  // Convert X,Y first into U,V space then round to nearest
  // integer. That gets us close to correct answer, mapping XY to a
  // lozenge-shaped space rather than hexagonal. We then correct the
  // four regions that lie outside the hexagonal cell assigning them
  // to their correct neighboring cell.
  // Writer's note: see ~/Google Drive/Work/calin

  // Scalar equivalent of the skewed-basis transform and rounding:
  // double dv = y*c_vy_inv;
  // double du = x-dv*c_vx;
  // u = std::lround(du);
  // v = std::lround(dv);
  // du -= u;
  // dv -= v;
  y = _mm256_mul_ps(y, calin::math::simd::c_m256(_c_m256_vy_inv));
  // fnmadd: x = x - y*vx (one fused op instead of mul+sub)
  x = _mm256_fnmadd_ps(y, calin::math::simd::c_m256(_c_m256_vx), x);
  // cvtps_epi32 rounds to nearest (default MXCSR mode) — the vector lround
  u = _mm256_cvtps_epi32(x);
  v = _mm256_cvtps_epi32(y);
  // x,y now hold the fractional residuals du,dv in [-0.5, 0.5]
  x = _mm256_sub_ps(x, _mm256_cvtepi32_ps(u));
  y = _mm256_sub_ps(y, _mm256_cvtepi32_ps(v));

  // double c3 = dv-du;
  // Only the sign bit of c3 is ever consumed (via XOR + arithmetic shift),
  // so the float difference is reinterpreted as integer bits, not converted.
  const __m256i c3 = _mm256_castps_si256(_mm256_sub_ps(y, x));

  __m256i uvshift;
  __m256i mask;

  // Scalar correction being vectorized:
  // double c1 = du+0.5*dv;
  // double c2 = dv+0.5*du;
  // if(c3<0) {
  //   if(c1>=1) u++;
  //   else if(c2<-1) v--;
  // } else {
  //   if(c2>=1) v++;
  //   else if(c1<-1) u--;
  // }

  // uvshift = lround(c1): -1, 0 or +1 per lane. The XOR/srai builds an
  // all-ones mask in lanes where uvshift and c3 have opposite signs —
  // exactly the (c3<0 && c1>=1) / (c3>=0 && c1<-1) cases above.
  uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(y, calin::math::simd::c_m256(_c_m256_one_half), x));
  mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31);
  u = _mm256_blendv_epi8(u, _mm256_add_epi32(u, uvshift), mask);

  // Same scheme for c2, but note the blend arms are swapped relative to the
  // u update: v is adjusted when uvshift and c3 have the SAME sign.
  uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(x, calin::math::simd::c_m256(_c_m256_one_half), y));
  mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31);
  v = _mm256_blendv_epi8(_mm256_add_epi32(v, uvshift), v, mask);
}
// Compute the ring index of each of 8 hexagonal-grid site IDs by iterative
// search. Ring r holds 6*r sites, so the cumulative site count through ring
// r is 1 + 6*(1 + 2 + ... + r); each lane's ringid stops advancing once that
// running total exceeds its hexid. Slower per lane than the scalar version,
// but still a net win since 8 ring IDs are computed in one pass.
inline __m256i avx2_positive_hexid_to_ringid_loop(const __m256i hexid)
{
  const __m256i six = _mm256_set1_epi32(6);
  const __m256i one = _mm256_set1_epi32(1);
  __m256i ringid = _mm256_setzero_si256();
  __m256i sites_in_ring = _mm256_setzero_si256();  // 6*r for the current ring
  __m256i cumulative_sites = one;                  // sites through current ring
  // Lanes whose hexid is already below the running total are "done"
  __m256i done_mask = _mm256_cmpgt_epi32(cumulative_sites, hexid);
  // movemask gathers one bit per byte; ~mask is zero only when every lane
  // is done (all 32 bits set), which terminates the loop
  while(~_mm256_movemask_epi8(done_mask)) {
    // Advance ringid only in lanes that are not yet done
    ringid = _mm256_blendv_epi8(_mm256_add_epi32(ringid, one), ringid, done_mask);
    sites_in_ring = _mm256_add_epi32(sites_in_ring, six);
    cumulative_sites = _mm256_add_epi32(cumulative_sites, sites_in_ring);
    done_mask = _mm256_cmpgt_epi32(cumulative_sites, hexid);
  }
  return ringid;
}
// Branch-free lower-bound search over a sorted int array, 8 independent
// targets at a time. Returns, per 32-bit lane of `target`, the index of the
// first element of source[0..n) that is >= that lane's target (n if none).
// Every lane walks the same halving schedule, so there are no data-dependent
// branches; lookups go through vector gathers.
__m256i branchfree_search8_avx(int* source, size_t n, __m256i target)
{
  __m256i indices = _mm256_setzero_si256();
  if(n == 0) return indices;

  // Per-lane step size, starting at n/2 and (ceil-)halving each round
  __m256i step = _mm256_set1_epi32(n >> 1);
  while(n > 1) {
    n -= n >> 1;  // remaining search width shrinks like ceil(n/2)
    __m256i probe = _mm256_add_epi32(indices, step);
    step = _mm256_sub_epi32(step, _mm256_srli_epi32(step, 1));  // step = ceil(step/2)
    // Gather source[probe] for all 8 lanes (scale 4 = sizeof(int))
    __m256i keys = _mm256_i32gather_epi32(source, probe, 4);
    // Where target > key, move the lane's lower bound up to the probe
    __m256i advance = _mm256_cmpgt_epi32(target, keys);
    indices = _mm256_blendv_epi8(indices, probe, advance);
  }

  // Final fix-up: if the element at the resting index is still < target,
  // the answer is one past it. cmpgt yields 0/-1; shifting the sign bit
  // down turns that into a 0/1 increment without a branch.
  __m256i final_keys = _mm256_i32gather_epi32(source, indices, 4);
  __m256i still_less = _mm256_cmpgt_epi32(target, final_keys);
  __m256i increment = _mm256_srli_epi32(still_less, 31);
  return _mm256_add_epi32(indices, increment);
}
// Compiler codegen test: the CHECK line below is a FileCheck pattern
// asserting that the intrinsic lowers to the llvm.x86.avx2.pblendvb
// intrinsic. The CHECK comment is load-bearing test data — do not edit.
__m256i test_mm256_blendv_epi8(__m256i a, __m256i b, __m256i m) {
  // CHECK: @llvm.x86.avx2.pblendvb
  return _mm256_blendv_epi8(a, b, m);
}
// Compiler codegen test (stricter variant): CHECK-LABEL anchors FileCheck
// to this function, and the CHECK line pins the full typed call to the
// llvm.x86.avx2.pblendvb intrinsic. The CHECK comments are load-bearing
// test data — do not edit.
__m256i test_mm256_blendv_epi8(__m256i a, __m256i b, __m256i m) {
  // CHECK-LABEL: test_mm256_blendv_epi8
  // CHECK: call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
  return _mm256_blendv_epi8(a, b, m);
}
/* NOTE(review): fragment of a base64 encoding loop — s5mask, blockmask,
 * res (6-bit indices 0..63), and the o/c/outl/srclen bookkeeping are
 * declared before this excerpt; the closing brace ends the enclosing
 * per-24-byte loop. Each set-mask isolates one range of the base64
 * alphabet; andnot with the accumulated blockmask makes the cmpgt range
 * checks mutually exclusive (each byte claimed by exactly one set). */

/* set 4: 62, "+" */
s4mask = _mm256_cmpeq_epi8(res, _mm256_set1_epi8(62));
blockmask = _mm256_or_si256(blockmask, s4mask);

/* set 3: 52..61, "0123456789" */
s3mask = _mm256_andnot_si256(blockmask, _mm256_cmpgt_epi8(res, _mm256_set1_epi8(51)));
blockmask = _mm256_or_si256(blockmask, s3mask);

/* set 2: 26..51, "abcdefghijklmnopqrstuvwxyz" */
s2mask = _mm256_andnot_si256(blockmask, _mm256_cmpgt_epi8(res, _mm256_set1_epi8(25)));
blockmask = _mm256_or_si256(blockmask, s2mask);

/* set 1: 0..25, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 * Everything that is not blockmasked */

/* Create the masked character sets: */
/* Each set maps its 6-bit index to ASCII with a per-set offset,
 * then blendv layers the sets together byte by byte. */
str = _mm256_and_si256(_mm256_set1_epi8('/'), s5mask);
str = _mm256_blendv_epi8(str, _mm256_set1_epi8('+'), s4mask);
str = _mm256_blendv_epi8(str, _mm256_add_epi8(res, _mm256_set1_epi8('0' - 52)), s3mask);
str = _mm256_blendv_epi8(str, _mm256_add_epi8(res, _mm256_set1_epi8('a' - 26)), s2mask);
/* Set 1 is the fallback: used wherever no other set claimed the byte */
str = _mm256_blendv_epi8(_mm256_add_epi8(res, _mm256_set1_epi8('A')), str, blockmask);

/* Blend all the sets together and store: */
_mm256_storeu_si256((__m256i *)o, str);

c += 24;    /* 6 * 4 bytes of input  */
o += 32;    /* 8 * 4 bytes of output */
outl += 32;
srclen -= 24;
}
// Convert 8 hex-grid (u,v) coordinate pairs to counter-clockwise hex IDs.
// Vectorization of the scalar segment-walk below: each of the six ring
// segments is tested in order, and not_found_mask tracks which lanes are
// still unresolved, emulating the scalar else-if chain without branches.
// NOTE(review): depends on avx2_uv_to_ringid and
// avx2_ringid_to_nsites_contained declared elsewhere in this file.
inline __m256i avx2_uv_to_hexid_ccw(const __m256i u, const __m256i v)
{
  // Scalar reference implementation:
  // if(u==0 and v==0)return 0;
  // int ringid = uv_to_ringid(u,v);
  // unsigned segid;
  // int runid;
  // int upv = u+v;
  // if(upv==ringid and v!=ringid)           { segid=0; runid=v; }
  // else if(v==ringid and u!=-ringid)       { segid=1; runid=-u; }
  // else if(u==-ringid and upv!=-ringid)    { segid=2; runid=ringid-v; }
  // else if(u+v==-ringid and v!=-ringid)    { segid=3; runid=-v; }
  // else if(v==-ringid and u!=ringid)       { segid=4; runid=u; }
  // else /*if(u==ringid and upv!=ringid)*/  { segid=5; runid=ringid+v; }
  // return positive_ringid_segid_runid_to_hexid(ringid, segid, runid);

  const __m256i one = _mm256_set1_epi32(1);
  const __m256i minus_one = _mm256_set1_epi32(-1);
  const __m256i ringid = avx2_uv_to_ringid(u,v);
  // sign_epi32 by -1 negates each lane: minus_ringid = -ringid
  const __m256i minus_ringid = _mm256_sign_epi32(ringid, minus_one);
  const __m256i upv = _mm256_add_epi32(u, v);

  // All lanes start unresolved (all-ones mask)
  __m256i not_found_mask = minus_one;

  // Base ID: number of sites contained inside the ring (rings 0..ringid-1)
  __m256i hexid = avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid, one));

  // Each segment step below adds runid in lanes resolved at this step, and
  // adds a full segment length (ringid) in lanes that are still unresolved,
  // advancing them past this segment — equivalent to the commented-out
  // or_si256 form kept alongside each step.

  // Seg ID = 0
  // if(upv==ringid and v!=ringid) { segid=0; runid=v; }
  __m256i here_mask = _mm256_cmpeq_epi32(upv, ringid);
  hexid = _mm256_add_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(ringid, v, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_add_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, v),
  //   _mm256_and_si256(not_found_mask, ringid)));

  // Seg ID = 1
  // else if(v==ringid and u!=-ringid) { segid=1; runid=-u; }
  // (sub instead of add: runid=-u, and skipping adds -(-ringid)=ringid)
  here_mask = _mm256_cmpeq_epi32(v, ringid);
  hexid = _mm256_sub_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(minus_ringid, u, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_sub_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, u),
  //   _mm256_and_si256(not_found_mask, minus_ringid)));

  // Seg ID = 2
  // else if(u==-ringid and upv!=-ringid) { segid=2; runid=ringid-v; }
  // (runid = ringid-v = -(v-ringid); here upv = -ringid+v ... subtracted,
  //  matching the commented scalar form via the sub of upv)
  here_mask = _mm256_cmpeq_epi32(u, minus_ringid);
  hexid = _mm256_sub_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(minus_ringid, upv, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_sub_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, upv),
  //   _mm256_and_si256(not_found_mask, minus_ringid)));

  // Seg ID = 3
  // else if(u+v==-ringid and v!=-ringid) { segid=3; runid=-v; }
  here_mask = _mm256_cmpeq_epi32(upv, minus_ringid);
  hexid = _mm256_sub_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(minus_ringid, v, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_sub_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, v),
  //   _mm256_and_si256(not_found_mask, minus_ringid)));

  // Seg ID = 4
  // else if(v==-ringid and u!=ringid) { segid=4; runid=u; }
  here_mask = _mm256_cmpeq_epi32(v, minus_ringid);
  hexid = _mm256_add_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(ringid, u, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_add_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, u),
  //   _mm256_and_si256(not_found_mask, ringid)));

  // Seg ID = 5 (last segment: every still-unresolved lane lands here)
  // else /*if(u==ringid and upv!=ringid)*/{ segid=5; runid=ringid+v; }
  // (runid = ringid+v = upv, since u==ringid in these lanes)
  hexid = _mm256_add_epi32(hexid, _mm256_and_si256(not_found_mask, upv));

  // Special case u==v==0: ringid==0, force hexid to 0
  const __m256i mask = _mm256_cmpeq_epi32(ringid, _mm256_setzero_si256());
  hexid = _mm256_andnot_si256(mask, hexid);
  return hexid;
}