// Codegen test for _mm512_reduce_add_ps: the function simply forwards its
// 16-lane float vector argument to the intrinsic and returns the scalar result.
// The CHECK directives inside the body describe the IR expected for the
// inlined reduction (presumably matched by LLVM FileCheck - confirm against
// the RUN line of this test file). Their text and order must not be altered.
float test_mm512_reduce_add_ps(__m512 __W){
  // Step 1: split the 16 lanes into low (0-7) and high (8-15) halves and add them.
  // CHECK: %shuffle.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  // CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  // CHECK: %add.i = fadd <8 x float> %shuffle.i, %shuffle1.i
  // Step 2: split the 8-lane partial sum into lanes 0-3 and 4-7 and add them.
  // CHECK: %shuffle2.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  // CHECK: %shuffle3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  // CHECK: %add4.i = fadd <4 x float> %shuffle2.i, %shuffle3.i
  // Step 3: fold lanes 2-3 onto lanes 0-1.
  // CHECK: %shuffle6.i = shufflevector <4 x float> %add4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  // CHECK: %add7.i = fadd <4 x float> %add4.i, %shuffle6.i
  // Step 4: fold lane 1 onto lane 0.
  // CHECK: %shuffle9.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  // CHECK: %add10.i = fadd <4 x float> %add7.i, %shuffle9.i
  // Step 5: lane 0 now holds the full sum; extract and return it.
  // CHECK: %vecext.i = extractelement <4 x float> %add10.i, i32 0
  // CHECK: ret float %vecext.i
  return _mm512_reduce_add_ps(__W);
}
int main() { // Initialize int N = 1 << 16; int NALIGN = 64; int i, j; float OPS = 20. * N * N * 1e-9; float EPS2 = 1e-6; double tic, toc; float * x = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * y = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * z = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * m = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * p = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * ax = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * ay = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * az = (float*) _mm_malloc(N * sizeof(float), NALIGN); #pragma omp parallel for for (i=0; i<N; i++) { x[i] = drand48(); y[i] = drand48(); z[i] = drand48(); m[i] = drand48() / N; p[i] = ax[i] = ay[i] = az[i] = 0; } printf("N : %d\n",N); #pragma omp parallel private(j) { #pragma omp single tic = get_time(); // Vectorize target with intrinsics #pragma omp for for (i=0; i<N; i+=16) { __m512 pi = _mm512_setzero_ps(); __m512 axi = _mm512_setzero_ps(); __m512 ayi = _mm512_setzero_ps(); __m512 azi = _mm512_setzero_ps(); __m512 xi = _mm512_load_ps(x+i); __m512 yi = _mm512_load_ps(y+i); __m512 zi = _mm512_load_ps(z+i); for (j=0; j<N; j++) { __m512 xj = _mm512_set1_ps(x[j]); xj = _mm512_sub_ps(xj, xi); __m512 yj = _mm512_set1_ps(y[j]); yj = _mm512_sub_ps(yj, yi); __m512 zj = _mm512_set1_ps(z[j]); zj = _mm512_sub_ps(zj, zi); __m512 R2 = _mm512_set1_ps(EPS2); R2 = _mm512_fmadd_ps(xj, xj, R2); R2 = _mm512_fmadd_ps(yj, yj, R2); R2 = _mm512_fmadd_ps(zj, zj, R2); __m512 mj = _mm512_set1_ps(m[j]); __m512 invR = _mm512_rsqrt23_ps(R2); mj = _mm512_mul_ps(mj, invR); pi = _mm512_add_ps(pi, mj); invR = _mm512_mul_ps(invR, invR); invR = _mm512_mul_ps(invR, mj); axi = _mm512_fmadd_ps(xj, invR, axi); ayi = _mm512_fmadd_ps(yj, invR, ayi); azi = _mm512_fmadd_ps(zj, invR, azi); } _mm512_store_ps(p+i, pi); _mm512_store_ps(ax+i, axi); _mm512_store_ps(ay+i, ayi); _mm512_store_ps(az+i, azi); } #pragma omp single { toc = get_time(); 
printf("Vectorize target with intrinsics : %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic)); // Vectorize source with intrinsics tic = get_time(); } #pragma omp for for (i=0; i<N; i++) { __m512 pi = _mm512_setzero_ps(); __m512 axi = _mm512_setzero_ps(); __m512 ayi = _mm512_setzero_ps(); __m512 azi = _mm512_setzero_ps(); __m512 xi = _mm512_set1_ps(x[i]); __m512 yi = _mm512_set1_ps(y[i]); __m512 zi = _mm512_set1_ps(z[i]); for (j=0; j<N; j+=16) { __m512 xj = _mm512_load_ps(x+j); xj = _mm512_sub_ps(xj, xi); __m512 yj = _mm512_load_ps(y+j); yj = _mm512_sub_ps(yj, yi); __m512 zj = _mm512_load_ps(z+j); zj = _mm512_sub_ps(zj, zi); __m512 R2 = _mm512_set1_ps(EPS2); R2 = _mm512_fmadd_ps(xj, xj, R2); R2 = _mm512_fmadd_ps(yj, yj, R2); R2 = _mm512_fmadd_ps(zj, zj, R2); __m512 mj = _mm512_load_ps(m+j); __m512 invR = _mm512_rsqrt23_ps(R2); mj = _mm512_mul_ps(mj, invR); pi = _mm512_add_ps(pi, mj); invR = _mm512_mul_ps(invR, invR); invR = _mm512_mul_ps(invR, mj); axi = _mm512_fmadd_ps(xj, invR, axi); ayi = _mm512_fmadd_ps(yj, invR, ayi); azi = _mm512_fmadd_ps(zj, invR, azi); } p[i] = _mm512_reduce_add_ps(pi); ax[i] = _mm512_reduce_add_ps(axi); ay[i] = _mm512_reduce_add_ps(ayi); az[i] = _mm512_reduce_add_ps(azi); } #pragma omp single { toc = get_time(); printf("Vectorize source with intrinsics : %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic)); // Vectorize target with pragma simd tic = get_time(); } #pragma simd #pragma omp for for (i=0; i<N; i++) { float pi = 0; float axi = 0; float ayi = 0; float azi = 0; float xi = x[i]; float yi = y[i]; float zi = z[i]; for (j=0; j<N; j++) { float dx = x[j] - xi; float dy = y[j] - yi; float dz = z[j] - zi; float R2 = dx * dx + dy * dy + dz * dz + EPS2; float invR = 1.0f / sqrtf(R2); float invR3 = m[j] * invR * invR * invR; pi += m[j] * invR; axi += dx * invR3; ayi += dy * invR3; azi += dz * invR3; } p[i] = pi; ax[i] = axi; ay[i] = ayi; az[i] = azi; } #pragma omp single { toc = get_time(); printf("Vectorize target with pragma simd: %e s : %lf 
GFlops\n",toc-tic, OPS/(toc-tic)); // Vectorize source with pragma simd tic = get_time(); } #pragma omp for for (i=0; i<N; i++) { float pi = 0; float axi = 0; float ayi = 0; float azi = 0; float xi = x[i]; float yi = y[i]; float zi = z[i]; #pragma simd for (j=0; j<N; j++) { float dx = x[j] - xi; float dy = y[j] - yi; float dz = z[j] - zi; float R2 = dx * dx + dy * dy + dz * dz + EPS2; float invR = 1.0f / sqrtf(R2); float invR3 = m[j] * invR * invR * invR; pi += m[j] * invR; axi += dx * invR3; ayi += dy * invR3; azi += dz * invR3; } p[i] = pi; ax[i] = axi; ay[i] = ayi; az[i] = azi; } #pragma omp single { toc = get_time(); printf("Vectorize source with pragma simd: %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic)); } } _mm_free(x); _mm_free(y); _mm_free(z); _mm_free(m); _mm_free(p); _mm_free(ax); _mm_free(ay); _mm_free(az); return 0; }
/*
 * Horizontally sums a packed complex vector.
 *
 * The lanes of v.xvec are added into the .x field of the result and the lanes
 * of v.yvec into the .y field (presumably the real and imaginary parts -
 * confirm against the scomplex_t / mic_m512c_t declarations).
 */
static inline scomplex_t mic_reduce_add_cps(mic_m512c_t v) {
    scomplex_t sum;

    sum.x = _mm512_reduce_add_ps(v.xvec);   /* reduce the x lanes first ... */
    sum.y = _mm512_reduce_add_ps(v.yvec);   /* ... then the y lanes         */

    return sum;
} // mic_reduce_add_cps()