Пример #1
0
// Philox RNG for Xeon Phi cards.
//
// Runs 10 rounds of a Philox-style multiply/xor mix on 16 lanes at once.
//
//   counter : 64-bit counter; its low 32 bits seed the first state word and
//             its high 32 bits the second, each broadcast to all 16 lanes.
//   key     : 32-bit key; lane k starts from key + k, so the lanes differ
//             only in their key.
//   rnd1    : receives the first state word after the last round.
//   rnd2    : receives the second state word after the last round.
//
// The body is only compiled when __MIC__ is defined; otherwise the function
// does nothing and rnd1 / rnd2 are left untouched.
__forceinline
void philox2x32_mic(uint64_t counter, uint32_t key, __m512i& rnd1, __m512i& rnd2)
{
#ifdef __MIC__
  const __m512i multiplier = _mm512_set1_epi32(0xD256D193);
  const __m512i keyStep = _mm512_set1_epi32(0x9E3779B9);
  const __m512i laneIndex = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

  __m512i word0 = _mm512_set1_epi32(counter & 0xFFFFFFFF);
  __m512i word1 = _mm512_set1_epi32(counter >> 32);
  __m512i roundKey = _mm512_add_epi32(_mm512_set1_epi32(key), laneIndex);

  #pragma unroll(10)
  for(int round = 0; round < 10; ++round)
  {
    // Both halves of the 32x32 product are taken from the pre-round word0.
    const __m512i productHigh = _mm512_mulhi_epu32(word0, multiplier);
    const __m512i productLow = _mm512_mullo_epi32(word0, multiplier);

    word0 = _mm512_xor_epi32(_mm512_xor_epi32(productHigh, roundKey), word1);
    word1 = productLow;

    // Advance the key schedule for the next round.
    roundKey = _mm512_add_epi32(roundKey, keyStep);
  }

  rnd1 = word0;
  rnd2 = word1;
#endif
}
Пример #2
0
/* Exercises the 512-bit broadcast intrinsics for 32-bit integers, each in its
   plain, merge-masked (_mask_) and zero-masked (_maskz_) form.

   The operands x, y, m and z are not declared in this block; they have to be
   provided by the surrounding file.  Every statement overwrites x, and the
   _mask_ forms also read x as their pass-through source, so the statement
   order matters.  */
void extern
avx512f_test (void)
{
  /* Broadcast taken from the vector operand y.  */
  x = _mm512_broadcastd_epi32 (y);
  x = _mm512_mask_broadcastd_epi32 (x, m, y);
  x = _mm512_maskz_broadcastd_epi32 (m, y);

  /* Broadcast taken from the scalar operand z.  */
  x = _mm512_set1_epi32 (z);
  x = _mm512_mask_set1_epi32 (x, m, z);
  x = _mm512_maskz_set1_epi32 (m, z);
}
Пример #3
0
 /// Lane-wise absolute value of a single-precision batch.
 ///
 /// Reinterprets the 16 floats as 32-bit integers, clears bit 31 (the sign
 /// bit) of every lane by AND-ing with 0x7fffffff, and reinterprets the
 /// result as floats again.
 ///
 /// The reinterpretation goes through the cast intrinsics instead of C-style
 /// casts between __m512 and __m512i: those casts are a GNU vector extension
 /// and are rejected by compilers that do not implement it, whereas the
 /// intrinsics are available everywhere AVX-512F is and generate no code.
 static batch_type abs(const batch_type& rhs)
 {
     const __m512i raw_bits = _mm512_castps_si512(static_cast<__m512>(rhs));
     const __m512i sign_cleared = _mm512_and_epi32(raw_bits, _mm512_set1_epi32(0x7fffffff));
     return _mm512_castsi512_ps(sign_cleared);
 }
Пример #4
0
 /// Bitwise complement of a single-precision batch.
 ///
 /// XORs rhs with a vector whose lanes have every bit set, which inverts
 /// each bit of the input.
 static batch_type bitwise_not(const batch_type& rhs)
 {
     const __m512i all_ones_int = _mm512_set1_epi32(-1);
     const __m512 all_ones = _mm512_castsi512_ps(all_ones_int);
     return _mm512_xor_ps(rhs, all_ones);
 }
Пример #5
0
// Main spin update routine.
//
// Updates the 16 consecutive 32-bit words of `updated` that start at index
// z*N*N + y*N + x. For every bit it counts how many of the six neighbouring
// words (read from `neighbours`) differ from the current word, combines that
// count with the `field` word and with a per-lane comparison of rndInt against
// the expBeta table, and toggles the selected bits.
//
//   updated    : array that is read and then overwritten at the site index
//   neighbours : array the six neighbour words are read from
//   field      : array XOR-ed with the current word to form `f`
//   x, y, z    : coordinates of the first word; neighbours wrap modulo N
//   rndInt     : one unsigned 32-bit random value per lane
//
// N, expBeta, andn and Iu32vec16 are defined outside this block.
// NOTE(review): all operations are bitwise, so each of the 32 bit positions
// presumably holds an independent system (multi-spin coding) - confirm.
// The body is compiled only when __MIC__ is defined; otherwise this is a no-op.
__forceinline
void spinFlipCore(uint32_t* updated, const uint32_t* neighbours, const uint32_t* field,
                  unsigned int x, unsigned int y, unsigned int z, __m512i rndInt)
{
#ifdef __MIC__
  // `one` has every bit set in every lane, `zero` has none.
  const __m512i one = _mm512_set1_epi32(0xFFFFFFFF);
  const __m512i zero = _mm512_setzero_epi32();

  // calculate indices: *0 is the previous coordinate, *1 the next one, both
  // wrapped modulo N
  unsigned int x0 = (x+N-1)%N;
  unsigned int y0 = (y+N-1)%N;
  unsigned int z0 = (z+N-1)%N;
  // x1 is x+1+16 wrapped; it is only used as the address of the second half
  // of the split load for n[1] below.
  // NOTE(review): this presumably relies on x and N being multiples of 16 so
  // that the two halves of each split load line up - confirm against callers.
  unsigned int x1 = (x+17)%N;
  unsigned int y1 = (y+1)%N;
  unsigned int z1 = (z+1)%N;

  // neighbour spins
  // n[0] (x-1) and n[1] (x+1) are not 64-byte aligned, so each is assembled
  // from a loadunpacklo/loadunpackhi pair; n[2]..n[5] (y-1, y+1, z-1, z+1) use
  // plain loads at the same x offset.
  Iu32vec16 n[6];
  n[0] = _mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), neighbours + z*N*N + y*N + x0);
  n[0] = _mm512_loadunpackhi_epi32(n[0], neighbours + z*N*N + y*N + x + 16 - 1);
  n[1] = _mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), neighbours + z*N*N + y*N + x + 1);
  n[1] = _mm512_loadunpackhi_epi32(n[1], neighbours + z*N*N + y*N + x1);
  n[2] = _mm512_load_epi32(neighbours + z*N*N + y0*N + x);
  n[3] = _mm512_load_epi32(neighbours + z*N*N + y1*N + x);
  n[4] = _mm512_load_epi32(neighbours + z0*N*N + y*N + x);
  n[5] = _mm512_load_epi32(neighbours + z1*N*N + y*N + x);
  
  // bits are set if spins are antiparallel
  // (after the XOR, n[j] has a 1 wherever neighbour j differs from `current`)
  unsigned int i = z*N*N + y*N + x;
  Iu32vec16 current = _mm512_load_epi32(updated + i);
  #pragma unroll(6)
  for(int j = 0; j < 6; ++j)
    n[j] = current ^ n[j];

  // count wrong spins using vertical counters
  // c0, c1, c2 are the three bit planes (weights 1, 2, 4) of a per-bit count
  // of set bits across n[0]..n[5]; `carry` is the overflow out of c0.
  // NOTE(review): the carry terms and the w1..w6 masks below read as intended
  // only if andn(a, b) computes (~a) & b - confirm against its definition.
  Iu32vec16 c0, c1, c2, carry;

  c0 = n[0] ^ n[1];
  c1 = n[0] & n[1];
  
  c0 ^= n[2];
  c1 |= andn(c0, n[2]);
  
  c0 ^= n[3];
  carry = andn(c0, n[3]);
  c1 ^= carry;
  c2 = andn(c1, carry);
  
  c0 ^= n[4];
  carry = andn(c0, n[4]);
  c1 ^= carry;
  c2 |= andn(c1, carry);

  c0 ^= n[5];
  carry = andn(c0, n[5]);
  c1 ^= carry;
  c2 |= andn(c1, carry);
  
  // Decode the counter: under the andn assumption above, wk is set exactly
  // where the count (c2 c1 c0) equals k.
  Iu32vec16 w1 = andn(c2, andn(c1, c0));
  Iu32vec16 w2 = andn(c2, andn(c0, c1));
  Iu32vec16 w3 = andn(c2, c0 & c1);
  Iu32vec16 w4 = andn(c0, andn(c1, c2));
  Iu32vec16 w5 = andn(c1, c0 & c2);
  Iu32vec16 w6 = andn(c0, c1 & c2);
  
  // relation to field
  // f has a 1 wherever the current word differs from the field word.
  // For each j, a lane of e[6-j] is all ones where rndInt <= expBeta[2*j],
  // else f where rndInt <= expBeta[2*j+1], else zero (unsigned compares).
  Iu32vec16 e[7];
  Iu32vec16 f = current ^ _mm512_load_epi32(field + i);
  #pragma unroll(7)
  for(int j = 0; j < 7; j++)
  {
    __mmask16 ep = _mm512_cmple_epu32_mask(rndInt, _mm512_set1_epi32(expBeta[2*j]));
    __mmask16 em = _mm512_cmple_epu32_mask(rndInt, _mm512_set1_epi32(expBeta[2*j+1]));
    e[6-j] = _mm512_mask_mov_epi32(_mm512_mask_mov_epi32(zero, em, f), ep, one);
  }

  // check for spin flip
  // & binds tighter than |, so this is e[0] | (e[1] & w1) | ... | (e[6] & w6):
  // e[0] applies to every bit, e[k] only where the count mask wk is set.
  Iu32vec16 flip = e[0] | e[1] & w1 | e[2] & w2 | e[3] & w3 | e[4] & w4 | e[5] & w5 | e[6] & w6;
  // Toggle the selected bits and write the word back in place.
  _mm512_store_epi32(updated + i, flip ^ current);
#endif
}
Пример #6
0
 *              Abhinav Sarje <*****@*****.**>
 *              Elaine Chan <*****@*****.**>
 *              Alexander Hexemer <*****@*****.**>
 *              Xiaoye Li <*****@*****.**>
 *
 *  Licensing: The HipGISAXS software is only available to be downloaded and
 *  used by employees of academic research institutions, not-for-profit
 *  research laboratories, or governmental research facilities. Please read the
 *  accompanying LICENSE file before downloading the software. By downloading
 *  the software, you are agreeing to be bound by the terms of this
 *  NON-COMMERCIAL END USER LICENSE AGREEMENT.
 */

// Mark the declarations that follow for the MIC offload target as well.
// The matching pop is not visible in this excerpt.
#pragma offload_attribute(push, target(mic))

// 32-bit integer constants, each broadcast to all 16 lanes of a 512-bit vector.
static const __m512i _pi32_sign_mask = _mm512_set1_epi32(0x80000000);      // only bit 31 set
static const __m512i _pi32_inv_sign_mask = _mm512_set1_epi32(~0x80000000); // every bit except bit 31
static const __m512i _pi32_0 = _mm512_set1_epi32(0);
static const __m512i _pi32_1 = _mm512_set1_epi32(1);
static const __m512i _pi32_2 = _mm512_set1_epi32(2);
static const __m512i _pi32_4 = _mm512_set1_epi32(4);
static const __m512i _pi32_inv1 = _mm512_set1_epi32(~1);                   // every bit except bit 0
static const __m512i _pi32_0x7f = _mm512_set1_epi32(0x7f);
static const __m512i _pi32_ffff = _mm512_set1_epi32(0xffffffff);           // all 32 bits set

// Single-precision constants, each broadcast to all 16 lanes.
// mic_m512_t is declared outside this excerpt.
static const mic_m512_t _ps_1 = _mm512_set1_ps(1.0f);
static const mic_m512_t _ps_0point5 = _mm512_set1_ps(0.5f);
static const mic_m512_t _ps_0 = _mm512_set1_ps(0.0f);
// NOTE(review): going by the names, the next two are presumably the clamp
// bounds for the argument of a vectorised expf - their use is not visible here.
static const mic_m512_t _ps_exp_hi = _mm512_set1_ps(88.3762626647949f);
static const mic_m512_t _ps_exp_lo = _mm512_set1_ps(-88.3762626647949f);
// log2(e)
static const mic_m512_t _ps_cephes_LOG2EF = _mm512_set1_ps(1.44269504088896341f);
/*
 * Mandelbrot benchmark driver.
 *
 * Evaluates the Mandelbrot iteration on a size x size grid, where size is
 * argv[1] parsed with atoi() (default 32000). It prints a "P4" header with the
 * dimensions, but the packed pixel bytes are only summed, never written; the
 * sum is returned as the process exit status (truncated to int).
 *
 * When USEAVX512 is defined, one 512-bit integer operation is executed per
 * row (an add, or a multiply with USEHEAVYAVX512) and one lane of the result
 * is printed at the end so the work cannot be optimised away.
 */
int main(int argc, char **argv) {
  const int max_iter = 50;
  const double limit = 2.0;
  const int size = argc > 1 ? atoi(argv[1]) : 32000;
  const int w = size;
  const int h = size;

  int bits_pending = 0; /* bits collected in `packed` so far */
  char packed = 0;      /* pixel bits of the current output byte */
  long checksum = 0;    /* running sum of all completed bytes */

  printf("P4\n%d %d\n", w, h);
#ifdef USEAVX512
  __m512i older = _mm512_set1_epi32(0);
  __m512i newer = _mm512_set1_epi32(1);
#endif
  for (double y = 0; y < h; ++y) {
#ifdef USEAVX512
    /* Advance the two-term recurrence by one step per row. */
    const __m512i previous = older;
    older = newer;
#ifdef USEHEAVYAVX512
    newer = _mm512_mul_epi32(newer, previous);
#else
    newer = _mm512_add_epi32(newer, previous);
#endif
#endif
    for (double x = 0; x < w; ++x) {
      const double Cr = (2.0 * x / w - 1.5);
      const double Ci = (2.0 * y / h - 1.0);
      double Zr = 0.0;
      double Zi = 0.0;
      double Tr = 0.0;
      double Ti = 0.0;

      int i = 0;
      while (i < max_iter && (Tr + Ti <= limit * limit)) {
        Zi = 2.0 * Zr * Zi + Ci;
        Zr = Tr - Ti + Cr;
        Tr = Zr * Zr;
        Ti = Zi * Zi;
        ++i;
      }

      /* Shift in one bit per pixel: 1 if the point never escaped. */
      packed <<= 1;
      if (Tr + Ti <= limit * limit) {
        packed |= 0x01;
      }
      ++bits_pending;

      /* Flush on a full byte, or at the end of a row with a partial byte. */
      const int byte_full = (bits_pending == 8);
      if (byte_full || x == w - 1) {
        if (!byte_full) {
          /* Left-align the leftover bits of the row. */
          packed <<= (8 - w % 8);
        }
        checksum += packed;
        packed = 0;
        bits_pending = 0;
      }
    }
  }
#ifdef USEAVX512
  printf("we used avx512 %d \n", _mm256_extract_epi32(_mm512_extracti64x4_epi64(newer, 1), 7));
#else
  printf("we did not use avx512\n");
#endif
  return checksum;
}