// Philox RNG for Xeon Phi cards __forceinline void philox2x32_mic(uint64_t counter, uint32_t key, __m512i& rnd1, __m512i& rnd2) { #ifdef __MIC__ const __m512i m = _mm512_set1_epi32(0xD256D193); const __m512i w = _mm512_set1_epi32(0x9E3779B9); const __m512i incr = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __m512i r = _mm512_set1_epi32(counter & 0xFFFFFFFF); __m512i l = _mm512_set1_epi32(counter >> 32); __m512i keyV = _mm512_set1_epi32(key); keyV = _mm512_add_epi32(keyV, incr); #pragma unroll(10) for(int i = 0; i < 10; ++i) { __m512i l_old = l; l = _mm512_mullo_epi32(r, m); r = _mm512_xor_epi32(_mm512_xor_epi32(_mm512_mulhi_epu32(r, m), keyV), l_old); keyV = _mm512_add_epi32(keyV, w); } rnd1 = r; rnd2 = l; #endif }
/*
 * Calls the 32-bit broadcast intrinsics in their plain, merge-masked (mask_)
 * and zero-masked (maskz_) forms, first for _mm512_broadcastd_epi32 and then
 * for _mm512_set1_epi32.
 *
 * The operands x, y, m and z are not declared in this function; they are
 * expected to be file-scope objects defined elsewhere in the file.
 * Every result is written back to x.
 */
extern void
avx512f_test (void)
{
  /* Broadcast taken from y. */
  x = _mm512_broadcastd_epi32 (y);
  x = _mm512_mask_broadcastd_epi32 (x, m, y);
  x = _mm512_maskz_broadcastd_epi32 (m, y);

  /* Broadcast taken from z. */
  x = _mm512_set1_epi32 (z);
  x = _mm512_mask_set1_epi32 (x, m, z);
  x = _mm512_maskz_set1_epi32 (m, z);
}
/**
 * Lane-wise absolute value of a batch of 16 packed single-precision floats.
 *
 * The sign bit (bit 31) of every 32-bit lane is cleared by AND-ing the raw
 * bits with 0x7fffffff; all other bits pass through unchanged.
 *
 * The float <-> integer reinterpretation uses the dedicated cast intrinsics
 * rather than C-style casts between __m512 and __m512i: those casts are a
 * compiler extension for vector types, whereas the intrinsics are the
 * documented, portable way to reinterpret a register without changing bits.
 *
 * @param rhs  batch to take the absolute value of (converted to __m512).
 * @return     batch whose lanes have the sign bit cleared.
 */
static batch_type abs(const batch_type& rhs)
{
    const __m512i sign_cleared_mask = _mm512_set1_epi32(0x7fffffff);
    const __m512i raw_bits = _mm512_castps_si512(static_cast<__m512>(rhs));
    return _mm512_castsi512_ps(_mm512_and_epi32(raw_bits, sign_cleared_mask));
}
/**
 * Lane-wise bitwise complement of a batch.
 *
 * Implemented as XOR with a register in which every bit is set: the integer
 * value -1 is broadcast to all 32-bit lanes and reinterpreted as packed
 * floats so it can be fed to the float XOR intrinsic.
 *
 * @param rhs  batch to invert.
 * @return     batch with every bit of rhs flipped.
 */
static batch_type bitwise_not(const batch_type& rhs)
{
    const __m512 all_bits_set = _mm512_castsi512_ps(_mm512_set1_epi32(-1));
    return _mm512_xor_ps(rhs, all_bits_set);
}
// Main spin update routine
//
// Updates 16 consecutive 32-bit words of `updated`, starting at lattice
// position (x, y, z), from the six neighbouring words in `neighbours`, the
// local `field` words and one vector of random numbers.
//
//   updated     lattice half that is read and overwritten at index
//               z*N*N + y*N + x (16 words).
//   neighbours  lattice the six neighbour vectors are loaded from.
//   field       per-site field words, same indexing as `updated`.
//   x, y, z     lattice coordinates of the first of the 16 words.
//   rndInt      16 unsigned 32-bit random numbers, one per word/lane.
//
// N, expBeta and andn() are defined outside this block. All spin logic is
// bitwise, so every bit of a word is handled as a separate spin
// (NOTE(review): presumably 32 bit-packed replicas per site - confirm).
// The aligned loads/stores below presumably require x to be a multiple of 16
// and 64-byte aligned arrays - confirm with the caller.
// When __MIC__ is not defined the body is compiled out and nothing happens.
__forceinline void spinFlipCore(uint32_t* updated, const uint32_t* neighbours, const uint32_t* field, unsigned int x, unsigned int y, unsigned int z, __m512i rndInt) {
#ifdef __MIC__
const __m512i one = _mm512_set1_epi32(0xFFFFFFFF); // all bits set in every lane (not the value 1)
const __m512i zero = _mm512_setzero_epi32();
// calculate indices
// (periodic wrap-around in each direction; +N keeps the unsigned value from underflowing)
unsigned int x0 = (x+N-1)%N;
unsigned int y0 = (y+N-1)%N;
unsigned int z0 = (z+N-1)%N;
// x1 is not "x+1": it is the address handed to loadunpackhi for the right
// neighbour, i.e. the unaligned start (x+1) plus 16 elements, wrapped.
unsigned int x1 = (x+17)%N;
unsigned int y1 = (y+1)%N;
unsigned int z1 = (z+1)%N;
// neighbour spins
Iu32vec16 n[6];
// n[0]: left neighbours x-1 .. x+14. Unaligned load built from a
// loadunpacklo at the start address and a loadunpackhi at start + 16 elements.
n[0] = _mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), neighbours + z*N*N + y*N + x0);
n[0] = _mm512_loadunpackhi_epi32(n[0], neighbours + z*N*N + y*N + x + 16 - 1);
// n[1]: right neighbours x+1 .. x+16, same lo/hi pair; only the hi address wraps.
n[1] = _mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), neighbours + z*N*N + y*N + x + 1);
n[1] = _mm512_loadunpackhi_epi32(n[1], neighbours + z*N*N + y*N + x1);
// n[2..5]: y-1, y+1, z-1, z+1 neighbours, same x, aligned loads.
n[2] = _mm512_load_epi32(neighbours + z*N*N + y0*N + x);
n[3] = _mm512_load_epi32(neighbours + z*N*N + y1*N + x);
n[4] = _mm512_load_epi32(neighbours + z0*N*N + y*N + x);
n[5] = _mm512_load_epi32(neighbours + z1*N*N + y*N + x);
// bits are set if spins are antiparallel
unsigned int i = z*N*N + y*N + x;
Iu32vec16 current = _mm512_load_epi32(updated + i);
#pragma unroll(6)
for(int j = 0; j < 6; ++j) n[j] = current ^ n[j];
// count wrong spins using vertical counters
// (c2 c1 c0) form a 3-bit per-bit-position count of the set bits in n[0..5].
// The carry expressions assume andn(a, b) computes (~a) & b - helper not
// visible here, confirm.
Iu32vec16 c0, c1, c2, carry;
c0 = n[0] ^ n[1];
c1 = n[0] & n[1];
c0 ^= n[2];
c1 |= andn(c0, n[2]);
c0 ^= n[3];
carry = andn(c0, n[3]);
c1 ^= carry;
c2 = andn(c1, carry);
c0 ^= n[4];
carry = andn(c0, n[4]);
c1 ^= carry;
c2 |= andn(c1, carry);
c0 ^= n[5];
carry = andn(c0, n[5]);
c1 ^= carry;
c2 |= andn(c1, carry);
// Decode the counter: under the andn assumption above, wk has a bit set
// exactly where the count equals k (k = 1..6).
Iu32vec16 w1 = andn(c2, andn(c1, c0));
Iu32vec16 w2 = andn(c2, andn(c0, c1));
Iu32vec16 w3 = andn(c2, c0 & c1);
Iu32vec16 w4 = andn(c0, andn(c1, c2));
Iu32vec16 w5 = andn(c1, c0 & c2);
Iu32vec16 w6 = andn(c0, c1 & c2);
// relation to field
Iu32vec16 e[7];
Iu32vec16 f = current ^ _mm512_load_epi32(field + i);
// For each threshold pair (expBeta[2*j], expBeta[2*j+1]) build e[6-j] per lane:
//   rndInt <= expBeta[2*j]    -> all bits set
//   rndInt <= expBeta[2*j+1]  -> f
//   otherwise                 -> 0
// Comparisons are unsigned; the first case takes precedence (outer mask_mov).
#pragma unroll(7)
for(int j = 0; j < 7; j++) {
__mmask16 ep = _mm512_cmple_epu32_mask(rndInt, _mm512_set1_epi32(expBeta[2*j]));
__mmask16 em = _mm512_cmple_epu32_mask(rndInt, _mm512_set1_epi32(expBeta[2*j+1]));
e[6-j] = _mm512_mask_mov_epi32(_mm512_mask_mov_epi32(zero, em, f), ep, one);
}
// check for spin flip
// '&' binds tighter than '|': e[0] is OR-ed in unmasked, e[k] only where wk is set.
Iu32vec16 flip = e[0] | e[1] & w1 | e[2] & w2 | e[3] & w3 | e[4] & w4 | e[5] & w5 | e[6] & w6;
// Toggle the selected bits and write the 16 words back in place.
_mm512_store_epi32(updated + i, flip ^ current);
#endif
}
* Abhinav Sarje <*****@*****.**>
 * Elaine Chan <*****@*****.**>
 * Alexander Hexemer <*****@*****.**>
 * Xiaoye Li <*****@*****.**>
 *
 * Licensing: The HipGISAXS software is only available to be downloaded and
 * used by employees of academic research institutions, not-for-profit
 * research laboratories, or governmental research facilities. Please read the
 * accompanying LICENSE file before downloading the software. By downloading
 * the software, you are agreeing to be bound by the terms of this
 * NON-COMMERCIAL END USER LICENSE AGREEMENT.
 */

/* Everything declared after this pragma carries the target(mic) offload
 * attribute. The matching pop is not visible in this part of the file. */
#pragma offload_attribute(push, target(mic))

/* 512-bit integer constants; each value is broadcast to all sixteen 32-bit lanes. */
static const __m512i _pi32_sign_mask = _mm512_set1_epi32(0x80000000);      /* only bit 31 set */
static const __m512i _pi32_inv_sign_mask = _mm512_set1_epi32(~0x80000000); /* 0x7fffffff: every bit except bit 31 */
static const __m512i _pi32_0 = _mm512_set1_epi32(0);
static const __m512i _pi32_1 = _mm512_set1_epi32(1);
static const __m512i _pi32_2 = _mm512_set1_epi32(2);
static const __m512i _pi32_4 = _mm512_set1_epi32(4);
static const __m512i _pi32_inv1 = _mm512_set1_epi32(~1);                   /* every bit except bit 0 */
static const __m512i _pi32_0x7f = _mm512_set1_epi32(0x7f);
static const __m512i _pi32_ffff = _mm512_set1_epi32(0xffffffff);           /* all 32 bits set, despite the short name */

/* Single-precision constants, broadcast to all sixteen lanes.
 * mic_m512_t is declared elsewhere in the project. */
static const mic_m512_t _ps_1 = _mm512_set1_ps(1.0f);
static const mic_m512_t _ps_0point5 = _mm512_set1_ps(0.5f);
static const mic_m512_t _ps_0 = _mm512_set1_ps(0.0f);
/* NOTE(review): by name these look like upper/lower clamp bounds for an exp()
 * argument; their use is not visible here - confirm. */
static const mic_m512_t _ps_exp_hi = _mm512_set1_ps(88.3762626647949f);
static const mic_m512_t _ps_exp_lo = _mm512_set1_ps(-88.3762626647949f);
/* 1.44269504088896341 is log2(e). */
static const mic_m512_t _ps_cephes_LOG2EF = _mm512_set1_ps(1.44269504088896341f);
/*
 * Mandelbrot-style escape-time benchmark with an optional AVX-512 side load.
 *
 * Usage: prog [size]   - size is the width and height of the grid
 *                        (default 32000, parsed with atoi).
 *
 * Prints a "P4" header line with the dimensions. For every grid point it
 * runs at most 50 iterations of z = z*z + c and packs one bit per point
 * (1 = still within radius 2) into bytes, most significant bit first.
 * The packed bytes are not written out; they are only summed, and the sum is
 * returned as the exit status so the computation has an observable result.
 *
 * With USEAVX512 defined, one 512-bit integer add (or _mm512_mul_epi32 when
 * USEHEAVYAVX512 is also defined) is executed per row, and 32-bit element 7
 * of the upper 256-bit half of the result is printed at the end.
 */
int main(int argc, char **argv)
{
    const int max_iter = 50;
    const double limit = 2.0;
    const double limit_sq = limit * limit;

    int pending_bits = 0;   /* bits currently collected in packed */
    char packed = 0;        /* byte being assembled */
    long checksum = 0;      /* running sum of all completed bytes */

    const int size = argc > 1 ? atoi(argv[1]) : 32000;
    const int w = size;
    const int h = size;

    printf("P4\n%d %d\n", w, h);

#ifdef USEAVX512
    __m512i prev = _mm512_set1_epi32(0);
    __m512i curr = _mm512_set1_epi32(1);
    __m512i saved;
#endif

    for (double y = 0; y < h; ++y) {
#ifdef USEAVX512
        /* Two-term recurrence: (prev, curr) <- (curr, curr OP prev). */
        saved = prev;
        prev = curr;
#ifdef USEHEAVYAVX512
        curr = _mm512_mul_epi32(curr, saved);
#else
        curr = _mm512_add_epi32(curr, saved);
#endif
#endif
        for (double x = 0; x < w; ++x) {
            double Zr = 0.0, Zi = 0.0, Tr = 0.0, Ti = 0.0;
            const double Cr = (2.0 * x / w - 1.5);
            const double Ci = (2.0 * y / h - 1.0);

            /* Tr and Ti cache Zr^2 and Zi^2 from the previous step. */
            int step = 0;
            while (step < max_iter && (Tr + Ti <= limit_sq)) {
                Zi = 2.0 * Zr * Zi + Ci;
                Zr = Tr - Ti + Cr;
                Tr = Zr * Zr;
                Ti = Zi * Zi;
                ++step;
            }

            packed <<= 1;
            if (Tr + Ti <= limit_sq) {
                packed |= 0x01;
            }
            ++pending_bits;

            /* Flush on a full byte, or at the end of a row whose width is
             * not a multiple of 8 (pad the partial byte on the right). */
            const int byte_full = (pending_bits == 8);
            const int row_done = (x == w - 1);
            if (byte_full || row_done) {
                if (!byte_full) {
                    packed <<= (8 - w % 8);
                }
                checksum += packed;
                packed = 0;
                pending_bits = 0;
            }
        }
    }

#ifdef USEAVX512
    printf("we used avx512 %d \n",
           _mm256_extract_epi32(_mm512_extracti64x4_epi64(curr, 1), 7));
#else
    printf("we did not use avx512\n");
#endif

    return checksum;
}