/*
 * zmatmul - accumulate a 16-term-wide product of two 16-float operands
 * into a 16-float result: cout[k] += sum of four ain*bin products.
 *
 * For output element k (0..15) the operands are picked as
 *
 *     abase = (k & 8) + (k & 2)        bbase = (k & 4) + (k & 1)
 *     cout[k] += ain[abase + 0] * bin[bbase +  0]
 *              + ain[abase + 1] * bin[bbase +  2]
 *              + ain[abase + 4] * bin[bbase +  8]
 *              + ain[abase + 5] * bin[bbase + 10];
 *
 * which is exactly the lane selection encoded in the pa0..pa3 / pb0..pb3
 * permutation tables of the __MIC__ path.
 *
 * ain, bin : 16 floats each, read only.
 * cout     : 16 floats, updated in place.
 *
 * On the __MIC__ path all three pointers are accessed with
 * _mm512_load_ps/_mm512_store_ps, so they must be 64-byte aligned there.
 * The __MIC__ path accumulates with four chained FMAs, while the scalar
 * path sums the four products first and then adds them to cout[k]; the
 * two paths can therefore differ in the last bits.
 *
 * Declared `static inline`: a plain `inline` definition in C99/C11 does not
 * emit an external definition, so any call the compiler chooses not to
 * inline would fail at link time.
 */
static inline void zmatmul(float *ain, float *bin, float *cout)
{
#ifdef __MIC__
    /* Lane selectors for ain: pa<n>[k] = (k & 8) + (k & 2) + {0, 1, 4, 5}. */
    __m512i pa0 = { 0, 0, 2, 2, 0, 0, 2, 2, 8, 8,10,10, 8, 8,10,10};
    __m512i pa1 = { 1, 1, 3, 3, 1, 1, 3, 3, 9, 9,11,11, 9, 9,11,11};
    __m512i pa2 = { 4, 4, 6, 6, 4, 4, 6, 6,12,12,14,14,12,12,14,14};
    __m512i pa3 = { 5, 5, 7, 7, 5, 5, 7, 7,13,13,15,15,13,13,15,15};
    /* Lane selectors for bin: pb<n>[k] = (k & 4) + (k & 1) + {0, 2, 8, 10}. */
    __m512i pb0 = { 0, 1, 0, 1, 4, 5, 4, 5, 0, 1, 0, 1, 4, 5, 4, 5};
    __m512i pb1 = { 2, 3, 2, 3, 6, 7, 6, 7, 2, 3, 2, 3, 6, 7, 6, 7};
    __m512i pb2 = { 8, 9, 8, 9,12,13,12,13, 8, 9, 8, 9,12,13,12,13};
    __m512i pb3 = {10,11,10,11,14,15,14,15,10,11,10,11,14,15,14,15};

    __m512 a = _mm512_load_ps(ain);
    __m512 b = _mm512_load_ps(bin);
    __m512 c = _mm512_load_ps(cout);

    /* Spread the selected elements of a and b across all 16 lanes. */
    __m512 a0 = _mm512_castsi512_ps(_mm512_permutevar_epi32(pa0, _mm512_castps_si512(a)));
    __m512 a1 = _mm512_castsi512_ps(_mm512_permutevar_epi32(pa1, _mm512_castps_si512(a)));
    __m512 a2 = _mm512_castsi512_ps(_mm512_permutevar_epi32(pa2, _mm512_castps_si512(a)));
    __m512 a3 = _mm512_castsi512_ps(_mm512_permutevar_epi32(pa3, _mm512_castps_si512(a)));
    __m512 b0 = _mm512_castsi512_ps(_mm512_permutevar_epi32(pb0, _mm512_castps_si512(b)));
    __m512 b1 = _mm512_castsi512_ps(_mm512_permutevar_epi32(pb1, _mm512_castps_si512(b)));
    __m512 b2 = _mm512_castsi512_ps(_mm512_permutevar_epi32(pb2, _mm512_castps_si512(b)));
    __m512 b3 = _mm512_castsi512_ps(_mm512_permutevar_epi32(pb3, _mm512_castps_si512(b)));

    /* c += a0*b0 + a1*b1 + a2*b2 + a3*b3, one FMA per term. */
    c = _mm512_fmadd_ps(a0, b0, c);
    c = _mm512_fmadd_ps(a1, b1, c);
    c = _mm512_fmadd_ps(a2, b2, c);
    c = _mm512_fmadd_ps(a3, b3, c);

    _mm512_store_ps(cout, c);
#else
    /*
     * Scalar fallback: same element selection as the permutation tables
     * above, evaluated in ascending k with the four products summed left
     * to right before being added to cout[k].
     */
    for (int k = 0; k < 16; k++) {
        const float *pa = ain + (k & 8) + (k & 2);
        const float *pb = bin + (k & 4) + (k & 1);

        cout[k] += pa[0] * pb[0] + pa[1] * pb[2] + pa[4] * pb[8] + pa[5] * pb[10];
    }
#endif
}
// exp() inline mic_m512_t mic_exp_ps(mic_m512_t x) { x = _mm512_min_ps(x, _ps_exp_hi); x = _mm512_max_ps(x, _ps_exp_lo); mic_m512_t temp_2 = _mm512_fmadd_ps(x, _ps_cephes_LOG2EF, _ps_0point5); mic_m512_t temp_1 = _mm512_round_ps(temp_2, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE); mic_m512_t temp_3 = _mm512_sub_ps(temp_1, temp_2); __mmask16 mask = _mm512_cmp_ps_mask(temp_3, _ps_0, _MM_CMPINT_GT); temp_2 = _mm512_mask_sub_ps(temp_1, mask, temp_1, _ps_1); __m512i emm0 = _mm512_cvtfxpnt_round_adjustps_epi32(temp_2, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE); x = _mm512_fnmadd_ps(temp_2, _ps_cephes_exp_C12, x); mic_m512_t x2 = _mm512_mul_ps(x, x); mic_m512_t x3 = _mm512_mul_ps(x2, x); mic_m512_t x4 = _mm512_mul_ps(x2, x2); temp_1 = _mm512_add_ps(x, _ps_1); temp_1 = _mm512_fmadd_ps(x2, _ps_cephes_exp_p5, temp_1); temp_1 = _mm512_fmadd_ps(x3, _ps_cephes_exp_p4, temp_1); temp_2 = _mm512_mul_ps(x3, _ps_cephes_exp_p0); temp_3 = _mm512_mul_ps(x2, _ps_cephes_exp_p1); mic_m512_t temp_4 = _mm512_mul_ps(x, _ps_cephes_exp_p2); emm0 = _mm512_add_epi32(emm0, _pi32_0x7f); temp_2 = _mm512_add_ps(temp_2, temp_3); temp_3 = _mm512_add_ps(temp_3, temp_4); temp_2 = _mm512_add_ps(temp_2, temp_3); emm0 = _mm512_slli_epi32(emm0, 23); mic_m512_t pow2n = _mm512_castsi512_ps(emm0); temp_2 = _mm512_mul_ps(temp_2, x4); mic_m512_t y = _mm512_add_ps(temp_1, temp_2); y = _mm512_mul_ps(y, pow2n); return y; } // newexp_ps()
__attribute__((noinline)) float dot512fma2(float *x1, float *x2, size_t len) { assert(len % 32 == 0); __m512 sum = _mm512_setzero_ps(); if (len > 31) { size_t limit = len - 31; for (size_t i = 0; i < limit; i += 32) { __m512 v11 = _mm512_loadu_ps(x1 + i); __m512 v21 = _mm512_loadu_ps(x2 + i); __m512 v12 = _mm512_loadu_ps(x1 + i + 16); __m512 v22 = _mm512_loadu_ps(x2 + i + 16); sum = _mm512_fmadd_ps(v11, v21, sum); sum = _mm512_fmadd_ps(v12, v22, sum); } } float buffer[16]; _mm512_storeu_ps(buffer, sum); return buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7] + buffer[8] + buffer[9] + buffer[10] + buffer[11] + buffer[12] + buffer[13] + buffer[14] + buffer[15]; }
/*
 * mic_sincos_ps - lane-wise sine and cosine of a 512-bit float vector.
 *
 * x : input vector.
 * s : receives the sine result for every lane.
 * c : receives the cosine result for every lane.
 *
 * Outline:
 *   - strip the sign of x and remember it for the sine result;
 *   - j = (int(|x| * _ps_cephes_FOPI) + _pi32_1) & _pi32_inv1;
 *   - derive sign-flip bits and polynomial selectors for sine from j and
 *     for cosine from j - _pi32_2;
 *   - reduce |x| by j * _ps_minus_cephes_DP123 (added via FMA);
 *   - evaluate one polynomial built from the _ps_coscof_* coefficients and
 *     one built from the _ps_sincof_* coefficients on the reduced argument;
 *   - per lane, pick one of the two polynomials for each output and apply
 *     the corresponding sign bit.
 *
 * NOTE(review): the float->int conversion uses _MM_FROUND_TO_NEAREST_INT.
 * Cephes-style range reduction normally truncates at this step; with
 * round-to-nearest the reduced argument may fall outside +/-pi/4 - confirm
 * that this rounding mode is intended.
 */
inline void mic_sincos_ps(mic_m512_t x, mic_m512_t *s, mic_m512_t *c)
{
    /* Separate the sign bit from the magnitude. */
    const __m512i x_bits = _mm512_castps_si512(x);
    __m512i sin_sign = _mm512_and_epi32(x_bits, _pi32_sign_mask);
    x = _mm512_castsi512_ps(_mm512_and_epi32(x_bits, _pi32_inv_sign_mask));

    /* j = (convert(|x| * FOPI) + 1) & inv1, kept both as int and as float. */
    mic_m512_t y = _mm512_mul_ps(x, _ps_cephes_FOPI);
    __m512i j = _mm512_cvtfxpnt_round_adjustps_epi32(y, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);
    j = _mm512_add_epi32(j, _pi32_1);
    j = _mm512_and_epi32(j, _pi32_inv1);
    y = _mm512_cvtfxpnt_round_adjustepu32_ps(j, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);

    /* The cosine path works from j - 2. */
    __m512i j_cos = _mm512_sub_epi32(j, _pi32_2);

    /* Sign flips: the bit selected by _pi32_4 is shifted up by 29 places. */
    __m512i sin_flip = _mm512_slli_epi32(_mm512_and_epi32(j, _pi32_4), 29);
    __m512i cos_sign = _mm512_slli_epi32(_mm512_andnot_epi32(j_cos, _pi32_4), 29);
    sin_sign = _mm512_xor_epi32(sin_sign, sin_flip);

    /*
     * Polynomial selectors: lanes where (value & _pi32_2) == 0 receive
     * _pi32_ffff + _pi32_0, every other lane receives _pi32_0.
     */
    __mmask16 sin_k = _mm512_cmp_epi32_mask(_mm512_and_epi32(j, _pi32_2), _pi32_0, _MM_CMPINT_EQ);
    __m512i sin_sel = _mm512_mask_add_epi32(_pi32_0, sin_k, _pi32_ffff, _pi32_0);
    __mmask16 cos_k = _mm512_cmp_epi32_mask(_mm512_and_epi32(j_cos, _pi32_2), _pi32_0, _MM_CMPINT_EQ);
    __m512i cos_sel = _mm512_mask_add_epi32(_pi32_0, cos_k, _pi32_ffff, _pi32_0);

    /* Range reduction: x += j * (-DP123). */
    x = _mm512_fmadd_ps(y, _ps_minus_cephes_DP123, x);

    mic_m512_t x2 = _mm512_mul_ps(x, x);
    mic_m512_t x3 = _mm512_mul_ps(x2, x);
    mic_m512_t x4 = _mm512_mul_ps(x2, x2);

    /* poly_c = ((c0*x2 + c1)*x2 + c2)*x4 - (x2*0.5 - 1) */
    mic_m512_t poly_c = _mm512_fmadd_ps(_ps_coscof_p0, x2, _ps_coscof_p1);
    poly_c = _mm512_fmadd_ps(poly_c, x2, _ps_coscof_p2);
    mic_m512_t half_x2_minus_1 = _mm512_fmsub_ps(x2, _ps_0point5, _ps_1);
    poly_c = _mm512_fmsub_ps(poly_c, x4, half_x2_minus_1);

    /* poly_s = ((s0*x2 + s1)*x2 + s2)*x3 + x */
    mic_m512_t poly_s = _mm512_fmadd_ps(_ps_sincof_p0, x2, _ps_sincof_p1);
    poly_s = _mm512_fmadd_ps(poly_s, x2, _ps_sincof_p2);
    poly_s = _mm512_fmadd_ps(poly_s, x3, x);

    /* Blend: each output takes poly_s where its selector is set, poly_c elsewhere. */
    mic_m512_t sin_from_c = _mm512_castsi512_ps(_mm512_andnot_epi32(sin_sel, _mm512_castps_si512(poly_c)));
    mic_m512_t cos_from_c = _mm512_castsi512_ps(_mm512_andnot_epi32(cos_sel, _mm512_castps_si512(poly_c)));
    mic_m512_t sin_from_s = _mm512_castsi512_ps(_mm512_and_epi32(sin_sel, _mm512_castps_si512(poly_s)));
    mic_m512_t cos_from_s = _mm512_castsi512_ps(_mm512_and_epi32(cos_sel, _mm512_castps_si512(poly_s)));

    mic_m512_t sin_mag = _mm512_add_ps(sin_from_c, sin_from_s);
    mic_m512_t cos_mag = _mm512_add_ps(cos_from_c, cos_from_s);

    /* Apply the sign bits computed above. */
    *s = _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(sin_mag), sin_sign));
    *c = _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(cos_mag), cos_sign));
} // sincos_ps()
int main() { // Initialize int N = 1 << 16; int NALIGN = 64; int i, j; float OPS = 20. * N * N * 1e-9; float EPS2 = 1e-6; double tic, toc; float * x = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * y = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * z = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * m = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * p = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * ax = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * ay = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * az = (float*) _mm_malloc(N * sizeof(float), NALIGN); #pragma omp parallel for for (i=0; i<N; i++) { x[i] = drand48(); y[i] = drand48(); z[i] = drand48(); m[i] = drand48() / N; p[i] = ax[i] = ay[i] = az[i] = 0; } printf("N : %d\n",N); #pragma omp parallel private(j) { #pragma omp single tic = get_time(); // Vectorize target with intrinsics #pragma omp for for (i=0; i<N; i+=16) { __m512 pi = _mm512_setzero_ps(); __m512 axi = _mm512_setzero_ps(); __m512 ayi = _mm512_setzero_ps(); __m512 azi = _mm512_setzero_ps(); __m512 xi = _mm512_load_ps(x+i); __m512 yi = _mm512_load_ps(y+i); __m512 zi = _mm512_load_ps(z+i); for (j=0; j<N; j++) { __m512 xj = _mm512_set1_ps(x[j]); xj = _mm512_sub_ps(xj, xi); __m512 yj = _mm512_set1_ps(y[j]); yj = _mm512_sub_ps(yj, yi); __m512 zj = _mm512_set1_ps(z[j]); zj = _mm512_sub_ps(zj, zi); __m512 R2 = _mm512_set1_ps(EPS2); R2 = _mm512_fmadd_ps(xj, xj, R2); R2 = _mm512_fmadd_ps(yj, yj, R2); R2 = _mm512_fmadd_ps(zj, zj, R2); __m512 mj = _mm512_set1_ps(m[j]); __m512 invR = _mm512_rsqrt23_ps(R2); mj = _mm512_mul_ps(mj, invR); pi = _mm512_add_ps(pi, mj); invR = _mm512_mul_ps(invR, invR); invR = _mm512_mul_ps(invR, mj); axi = _mm512_fmadd_ps(xj, invR, axi); ayi = _mm512_fmadd_ps(yj, invR, ayi); azi = _mm512_fmadd_ps(zj, invR, azi); } _mm512_store_ps(p+i, pi); _mm512_store_ps(ax+i, axi); _mm512_store_ps(ay+i, ayi); _mm512_store_ps(az+i, azi); } #pragma omp single { toc = get_time(); 
printf("Vectorize target with intrinsics : %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic)); // Vectorize source with intrinsics tic = get_time(); } #pragma omp for for (i=0; i<N; i++) { __m512 pi = _mm512_setzero_ps(); __m512 axi = _mm512_setzero_ps(); __m512 ayi = _mm512_setzero_ps(); __m512 azi = _mm512_setzero_ps(); __m512 xi = _mm512_set1_ps(x[i]); __m512 yi = _mm512_set1_ps(y[i]); __m512 zi = _mm512_set1_ps(z[i]); for (j=0; j<N; j+=16) { __m512 xj = _mm512_load_ps(x+j); xj = _mm512_sub_ps(xj, xi); __m512 yj = _mm512_load_ps(y+j); yj = _mm512_sub_ps(yj, yi); __m512 zj = _mm512_load_ps(z+j); zj = _mm512_sub_ps(zj, zi); __m512 R2 = _mm512_set1_ps(EPS2); R2 = _mm512_fmadd_ps(xj, xj, R2); R2 = _mm512_fmadd_ps(yj, yj, R2); R2 = _mm512_fmadd_ps(zj, zj, R2); __m512 mj = _mm512_load_ps(m+j); __m512 invR = _mm512_rsqrt23_ps(R2); mj = _mm512_mul_ps(mj, invR); pi = _mm512_add_ps(pi, mj); invR = _mm512_mul_ps(invR, invR); invR = _mm512_mul_ps(invR, mj); axi = _mm512_fmadd_ps(xj, invR, axi); ayi = _mm512_fmadd_ps(yj, invR, ayi); azi = _mm512_fmadd_ps(zj, invR, azi); } p[i] = _mm512_reduce_add_ps(pi); ax[i] = _mm512_reduce_add_ps(axi); ay[i] = _mm512_reduce_add_ps(ayi); az[i] = _mm512_reduce_add_ps(azi); } #pragma omp single { toc = get_time(); printf("Vectorize source with intrinsics : %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic)); // Vectorize target with pragma simd tic = get_time(); } #pragma simd #pragma omp for for (i=0; i<N; i++) { float pi = 0; float axi = 0; float ayi = 0; float azi = 0; float xi = x[i]; float yi = y[i]; float zi = z[i]; for (j=0; j<N; j++) { float dx = x[j] - xi; float dy = y[j] - yi; float dz = z[j] - zi; float R2 = dx * dx + dy * dy + dz * dz + EPS2; float invR = 1.0f / sqrtf(R2); float invR3 = m[j] * invR * invR * invR; pi += m[j] * invR; axi += dx * invR3; ayi += dy * invR3; azi += dz * invR3; } p[i] = pi; ax[i] = axi; ay[i] = ayi; az[i] = azi; } #pragma omp single { toc = get_time(); printf("Vectorize target with pragma simd: %e s : %lf 
GFlops\n",toc-tic, OPS/(toc-tic)); // Vectorize source with pragma simd tic = get_time(); } #pragma omp for for (i=0; i<N; i++) { float pi = 0; float axi = 0; float ayi = 0; float azi = 0; float xi = x[i]; float yi = y[i]; float zi = z[i]; #pragma simd for (j=0; j<N; j++) { float dx = x[j] - xi; float dy = y[j] - yi; float dz = z[j] - zi; float R2 = dx * dx + dy * dy + dz * dz + EPS2; float invR = 1.0f / sqrtf(R2); float invR3 = m[j] * invR * invR * invR; pi += m[j] * invR; axi += dx * invR3; ayi += dy * invR3; azi += dz * invR3; } p[i] = pi; ax[i] = axi; ay[i] = ayi; az[i] = azi; } #pragma omp single { toc = get_time(); printf("Vectorize source with pragma simd: %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic)); } } _mm_free(x); _mm_free(y); _mm_free(z); _mm_free(m); _mm_free(p); _mm_free(ax); _mm_free(ay); _mm_free(az); return 0; }
void * avx512_fma(void *args_in) { /* Thread input */ struct thread_args *args; const int n_avx512 = VFMAPS_LATENCY; const __m512 add0 = _mm512_set1_ps((float) 1e-6); const __m512 mul0 = _mm512_set1_ps((float) (1. + 1e-6)); __m512 r[n_avx512]; // Declare as volatile to prevent removal during optimisation volatile float result; long r_max, i; int j; double runtime, flops; Stopwatch *t; /* Read inputs */ args = (struct thread_args *) args_in; t = stopwatch_create(args->timer_type); for (j = 0; j < n_avx512; j++) { r[j] = _mm512_set1_ps((float) j); } /* Add over registers r0-r4, multiply over r5-r9, and rely on pipelining, * OOO execution, and latency difference (3 vs 5 cycles) for 2x FLOPs */ runtime_flag = 0; r_max = 1; do { pthread_barrier_wait(&timer_barrier); t->start(t); for (i = 0; i < r_max; i++) { #pragma unroll(n_avx512) for (j = 0; j < n_avx512; j++) r[j] = _mm512_fmadd_ps(r[j], mul0, add0); } t->stop(t); runtime = t->runtime(t); /* Set runtime flag if any thread exceeds runtime limit */ if (runtime > args->min_runtime) { pthread_mutex_lock(&runtime_mutex); runtime_flag = 1; pthread_mutex_unlock(&runtime_mutex); } pthread_barrier_wait(&timer_barrier); if (!runtime_flag) r_max *= 2; } while (!runtime_flag); /* In order to prevent removal of the prior loop by optimisers, * sum the register values and save the result as volatile. */ for (j = 0; j < n_avx512; j++) r[0] = _mm512_add_ps(r[0], r[j]); result = reduce_AVX512(r[0]); /* (iter) * (16 instr / reg) * (2 flops / instr) * (n_avx512 reg / iter) */ flops = r_max * 16 * 2 * n_avx512 / runtime; /* Cleanup */ t->destroy(t); /* Thread output */ args->runtime = runtime; args->flops = flops; args->bw_load = 0.; args->bw_store = 0.; pthread_exit(NULL); }
/// Fused multiply-add: forwards x, y and z to _mm512_fmadd_ps and returns
/// the result (x * y + z in every lane) as a batch_type.
static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z)
{
    batch_type fused = _mm512_fmadd_ps(x, y, z);
    return fused;
}