Example no. 1
 /*!
  * \brief Multiply the two given vectors of bytes
  */
 ETL_STATIC_INLINE(avx_simd_byte) mul(avx_simd_byte lhs, avx_simd_byte rhs) {
     auto aodd    = _mm256_srli_epi16(lhs.value, 8);
     auto bodd    = _mm256_srli_epi16(rhs.value, 8);
     auto muleven = _mm256_mullo_epi16(lhs.value, rhs.value);
     auto mulodd  = _mm256_slli_epi16(_mm256_mullo_epi16(aodd, bodd), 8);
     return _mm256_blendv_epi8(mulodd, muleven, _mm256_set1_epi32(0x00FF00FF));
 }
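Since AVX2 has no 8-bit multiply, the function above computes the even- and odd-indexed byte products in 16-bit lanes and merges them with a blend: the mask 0x00FF00FF has its sign bit set in the even-indexed bytes, so _mm256_blendv_epi8 takes those bytes from muleven and the rest from mulodd. A minimal standalone check against scalar byte multiplication (a hypothetical harness, not part of ETL; the avx_simd_byte wrapper is replaced by a raw __m256i):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Standalone copy of the blend-based byte multiply shown above. */
static __m256i mul_u8(__m256i lhs, __m256i rhs) {
    __m256i aodd    = _mm256_srli_epi16(lhs, 8);
    __m256i bodd    = _mm256_srli_epi16(rhs, 8);
    __m256i muleven = _mm256_mullo_epi16(lhs, rhs);
    __m256i mulodd  = _mm256_slli_epi16(_mm256_mullo_epi16(aodd, bodd), 8);
    return _mm256_blendv_epi8(mulodd, muleven, _mm256_set1_epi32(0x00FF00FF));
}

int main(void) {
    uint8_t a[32], b[32], c[32];
    for (int i = 0; i < 32; i++) { a[i] = (uint8_t)(7 * i + 3); b[i] = (uint8_t)(250 - 5 * i); }
    _mm256_storeu_si256((__m256i *)c,
        mul_u8(_mm256_loadu_si256((const __m256i *)a),
               _mm256_loadu_si256((const __m256i *)b)));
    for (int i = 0; i < 32; i++)
        if (c[i] != (uint8_t)(a[i] * b[i])) { printf("mismatch at byte %d\n", i); return 1; }
    printf("all 32 byte products match\n");
    return 0;
}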
Example no. 2
inline void avx2_xy_to_uv_f(__m256 x, __m256 y, __m256i& u, __m256i& v)
{
  // Convert X,Y first into U,V space, then round to the nearest
  // integer. That gets us close to the correct answer, mapping XY onto a
  // lozenge-shaped space rather than a hexagonal one. We then correct the
  // four regions that lie outside the hexagonal cell, assigning them
  // to their correct neighboring cell.
  // Writer's note: see ~/Google Drive/Work/calin

  // double dv = y*c_vy_inv;
  // double du = x-dv*c_vx;
  // u = std::lround(du);
  // v = std::lround(dv);
  // du -= u;
  // dv -= v;

  y = _mm256_mul_ps(y, calin::math::simd::c_m256(_c_m256_vy_inv));
  x = _mm256_fnmadd_ps(y, calin::math::simd::c_m256(_c_m256_vx), x);
  u = _mm256_cvtps_epi32(x);
  v = _mm256_cvtps_epi32(y);
  x = _mm256_sub_ps(x, _mm256_cvtepi32_ps(u));
  y = _mm256_sub_ps(y, _mm256_cvtepi32_ps(v));

  // double c3 = dv-du; (only the sign bit of the float result is used below)
  const __m256i c3 = _mm256_castps_si256(_mm256_sub_ps(y, x));

  __m256i uvshift;
  __m256i mask;

  // double c1 = du+0.5*dv;
  // double c2 = dv+0.5*du;
  // if(c3<0) {
  //   if(c1>=1) u++;
  //   else if(c2<-1) v--;
  // } else {
  //   if(c2>=1) v++;
  //   else if(c1<-1) u--;
  // }

  // c1 = du+0.5*dv, rounded to nearest; shift u only where c1 and c3 have
  // opposite signs (the sign bit of their XOR selects the lanes to correct)
  uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(y, calin::math::simd::c_m256(_c_m256_one_half), x));
  mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31);
  u = _mm256_blendv_epi8(u, _mm256_add_epi32(u, uvshift), mask);

  // c2 = dv+0.5*du, rounded to nearest; shift v only where c2 and c3 have
  // the same sign (note the swapped blendv operands relative to the u case)
  uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(x, calin::math::simd::c_m256(_c_m256_one_half), y));
  mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31);
  v = _mm256_blendv_epi8(_mm256_add_epi32(v, uvshift), v, mask);
}
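The branchless correction uses the same trick twice: XORing the rounded shift with c3 (the raw bit pattern of dv-du) sets the sign bit exactly when the two values disagree in sign, and an arithmetic shift right by 31 smears that bit across the lane to form a blend mask. A scalar sketch of the idea, assuming the common arithmetic-shift behavior of >> on signed integers:

#include <stdint.h>

/* -1 (all ones) when a and b have opposite sign bits, 0 otherwise; the
 * per-lane analogue of _mm256_srai_epi32(_mm256_xor_si256(a, b), 31). */
static inline int32_t opposite_sign_mask(int32_t a, int32_t b) {
    return (int32_t)((a ^ b) >> 31);
}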
Example no. 3
inline __m256i avx2_positive_hexid_to_ringid_loop(const __m256i hexid)
{
  // This algorithm is relatively slow in comparison to the scalar version,
  // but still faster overall considering we compute 8 ringids in one go
  const __m256i six = _mm256_set1_epi32(6);
  const __m256i one = _mm256_set1_epi32(1);
  __m256i ringid = _mm256_setzero_si256();
  __m256i nsites = one;
  __m256i nring = _mm256_setzero_si256();
  __m256i mask = _mm256_cmpgt_epi32(nsites, hexid);
  // Loop until every lane satisfies nsites > hexid: movemask is all ones
  // (so its complement is zero) only once every byte of the mask is set
  while(~_mm256_movemask_epi8(mask)) {
    ringid = _mm256_blendv_epi8(_mm256_add_epi32(ringid, one), ringid, mask);
    nring = _mm256_add_epi32(nring, six);
    nsites = _mm256_add_epi32(nsites, nring);
    mask = _mm256_cmpgt_epi32(nsites, hexid);
  }
  return ringid;
}
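For reference, the loop above has a closed form: ring n of the hexagonal grid holds 6n sites, so rings 0..n hold 3n(n+1)+1 sites in total, and the ring of a positive hexid follows from solving the quadratic. A scalar sketch (an assumed equivalent; double precision bounds the usable hexid range, a concern the integer loop above avoids):

#include <math.h>
#include <stdint.h>

/* Smallest n such that 3n(n+1)+1 > hexid, valid for hexid > 0. */
static inline uint32_t positive_hexid_to_ringid(uint32_t hexid) {
    return (uint32_t)floor((sqrt(12.0 * (double)hexid - 3.0) - 3.0) / 6.0) + 1u;
}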
Example no. 4

/* Branch-free binary search over a sorted array: each lane of the result
 * is the index of the first element of source that is not less than the
 * lane's target (a vectorized lower_bound). */
__m256i branchfree_search8_avx(int* source, size_t n, __m256i target) {
    __m256i offsets = _mm256_setzero_si256();
    if(n == 0) return offsets;

    __m256i ha = _mm256_set1_epi32(n>>1);
    while(n>1) {
        n -=  n>>1;
        __m256i offsetsplushalf = _mm256_add_epi32(offsets,ha);
        ha = _mm256_sub_epi32(ha,_mm256_srli_epi32(ha,1));
        __m256i keys = _mm256_i32gather_epi32(source,offsetsplushalf,4);
        __m256i lt = _mm256_cmpgt_epi32(target,keys);
        offsets = _mm256_blendv_epi8(offsets,offsetsplushalf,lt);
    }
    __m256i lastkeys = _mm256_i32gather_epi32(source,offsets,4);
    __m256i lastlt = _mm256_cmpgt_epi32(target,lastkeys);
    __m256i oneswhereneeded = _mm256_srli_epi32(lastlt,31);
    __m256i  answer = _mm256_add_epi32(offsets,oneswhereneeded);
    return answer;
}
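A minimal usage sketch with the function above in scope (hypothetical data, sorted ascending as the search requires):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    int data[16] = {1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31};
    __m256i targets = _mm256_setr_epi32(0, 1, 8, 16, 23, 31, 32, 100);
    int idx[8];
    _mm256_storeu_si256((__m256i *)idx, branchfree_search8_avx(data, 16, targets));
    for (int i = 0; i < 8; i++)
        printf("lane %d -> index %d\n", i, idx[i]);  /* each index is in [0, 16] */
    return 0;
}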
Example no. 5
__m256i test_mm256_blendv_epi8(__m256i a, __m256i b, __m256i m) {
  // CHECK: @llvm.x86.avx2.pblendvb
  return _mm256_blendv_epi8(a, b, m);
}
Example no. 6
__m256i test_mm256_blendv_epi8(__m256i a, __m256i b, __m256i m) {
  // CHECK-LABEL: test_mm256_blendv_epi8
  // CHECK: call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
  return _mm256_blendv_epi8(a, b, m);
}
Example no. 7
	/* set 4: 62, "+" */
	s4mask = _mm256_cmpeq_epi8(res, _mm256_set1_epi8(62));
	blockmask = _mm256_or_si256(blockmask, s4mask);

	/* set 3: 52..61, "0123456789" */
	s3mask = _mm256_andnot_si256(blockmask, _mm256_cmpgt_epi8(res, _mm256_set1_epi8(51)));
	blockmask = _mm256_or_si256(blockmask, s3mask);

	/* set 2: 26..51, "abcdefghijklmnopqrstuvwxyz" */
	s2mask = _mm256_andnot_si256(blockmask, _mm256_cmpgt_epi8(res, _mm256_set1_epi8(25)));
	blockmask = _mm256_or_si256(blockmask, s2mask);

	/* set 1: 0..25, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	 * Everything that is not blockmasked */

	/* Create the masked character sets (s5mask, for set 5: 63, "/",
	 * is computed before this fragment begins): */
	str = _mm256_and_si256(_mm256_set1_epi8('/'), s5mask);
	str = _mm256_blendv_epi8(str, _mm256_set1_epi8('+'), s4mask);
	str = _mm256_blendv_epi8(str, _mm256_add_epi8(res, _mm256_set1_epi8('0' - 52)), s3mask);
	str = _mm256_blendv_epi8(str, _mm256_add_epi8(res, _mm256_set1_epi8('a' - 26)), s2mask);
	str = _mm256_blendv_epi8(_mm256_add_epi8(res, _mm256_set1_epi8('A')), str, blockmask);

	/* Blend all the sets together and store: */
	_mm256_storeu_si256((__m256i *)o, str);

	c += 24;	/* 6 * 4 bytes of input  */
	o += 32;	/* 8 * 4 bytes of output */
	outl += 32;
	srclen -= 24;
}
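The fragment peels ranges off the 6-bit values in res from the top down, accumulating blockmask so each value is claimed by exactly one set, then blends in the matching ASCII offsets. The scalar mapping it implements, as a sketch:

/* Scalar equivalent of the range classification above. */
static inline char base64_char(unsigned res) {
    if (res < 26) return (char)('A' + res);        /* set 1: 0..25  */
    if (res < 52) return (char)('a' + res - 26);   /* set 2: 26..51 */
    if (res < 62) return (char)('0' + res - 52);   /* set 3: 52..61 */
    return res == 62 ? '+' : '/';                  /* sets 4 and 5  */
}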
Example no. 8
inline __m256i avx2_uv_to_hexid_ccw(const __m256i u, const __m256i v)
{
  // if(u==0 and v==0)return 0;
  // int ringid = uv_to_ringid(u,v);
  // unsigned segid;
  // int runid;
  // int upv = u+v;
  // if(upv==ringid and v!=ringid)         { segid=0; runid=v; }
  // else if(v==ringid and u!=-ringid)     { segid=1; runid=-u; }
  // else if(u==-ringid and upv!=-ringid)  { segid=2; runid=ringid-v; }
  // else if(u+v==-ringid and v!=-ringid)  { segid=3; runid=-v; }
  // else if(v==-ringid and u!=ringid)     { segid=4; runid=u; }
  // else /*if(u==ringid and upv!=ringid)*/{ segid=5; runid=ringid+v; }
  // return positive_ringid_segid_runid_to_hexid(ringid, segid, runid);
  const __m256i one = _mm256_set1_epi32(1);
  const __m256i minus_one = _mm256_set1_epi32(-1);
  const __m256i ringid = avx2_uv_to_ringid(u,v);
  const __m256i minus_ringid = _mm256_sign_epi32(ringid, minus_one);
  const __m256i upv = _mm256_add_epi32(u, v);
  __m256i not_found_mask = minus_one;
  __m256i hexid = avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid, one));

  // Seg ID = 0
  // if(upv==ringid and v!=ringid)         { segid=0; runid=v; }
  __m256i here_mask = _mm256_cmpeq_epi32(upv, ringid);
  hexid = _mm256_add_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(ringid, v, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_add_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, v),
  //   _mm256_and_si256(not_found_mask, ringid)));

  // Seg ID = 1
  // else if(v==ringid and u!=-ringid)     { segid=1; runid=-u; }
  here_mask = _mm256_cmpeq_epi32(v, ringid);
  hexid = _mm256_sub_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(minus_ringid, u, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_sub_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, u),
  //   _mm256_and_si256(not_found_mask, minus_ringid)));

  // Seg ID = 2
  // else if(u==-ringid and upv!=-ringid)  { segid=2; runid=ringid-v; }
  here_mask = _mm256_cmpeq_epi32(u, minus_ringid);
  hexid = _mm256_sub_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(minus_ringid, upv, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_sub_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, upv),
  //   _mm256_and_si256(not_found_mask, minus_ringid)));

  // Seg ID = 3
  // else if(u+v==-ringid and v!=-ringid)  { segid=3; runid=-v; }
  here_mask = _mm256_cmpeq_epi32(upv, minus_ringid);
  hexid = _mm256_sub_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(minus_ringid, v, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_sub_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, v),
  //   _mm256_and_si256(not_found_mask, minus_ringid)));

  // Seg ID = 4
  // else if(v==-ringid and u!=ringid)     { segid=4; runid=u; }
  here_mask = _mm256_cmpeq_epi32(v, minus_ringid);
  hexid = _mm256_add_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(ringid, u, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_add_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, u),
  //   _mm256_and_si256(not_found_mask, ringid)));

  // Seg ID = 5
  // else /*if(u==ringid and upv!=ringid)*/{ segid=5; runid=ringid+v; }
  hexid = _mm256_add_epi32(hexid, _mm256_and_si256(not_found_mask, upv));

  const __m256i mask = _mm256_cmpeq_epi32(ringid, _mm256_setzero_si256());
  hexid = _mm256_andnot_si256(mask, hexid);
  return hexid;
}
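avx2_uv_to_ringid and avx2_ringid_to_nsites_contained are helpers from the same library, not shown in this example. The latter presumably follows the ring arithmetic seen in Example no. 3; a scalar sketch of what it would compute:

/* Sites contained in rings 0..n of the grid: 1 + 6*(1+2+...+n) = 3n(n+1)+1. */
static inline int32_t ringid_to_nsites_contained(int32_t ringid) {
    return 3 * ringid * (ringid + 1) + 1;
}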