// Element-wise difference of two 32-float vectors.
// Each operand stores its data in two __m512 members (val1, val2); the
// matching members are subtracted and a new short_vec is built from the results.
inline short_vec<float, 32> operator-(const short_vec<float, 32>& other) const
{
    const __m512 diff1 = _mm512_sub_ps(val1, other.val1);
    const __m512 diff2 = _mm512_sub_ps(val2, other.val2);
    return short_vec<float, 32>(diff1, diff2);
}
// sin() static inline mic_m512_t mic_sin_ps(mic_m512_t x) { __m512i sign_bit; sign_bit = _mm512_and_epi32(_mm512_castps_si512(x), _pi32_sign_mask); x = _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(x), _pi32_inv_sign_mask)); mic_m512_t y = _mm512_mul_ps(x, _ps_cephes_FOPI); __m512i emm2 = _mm512_cvtfxpnt_round_adjustps_epi32(y, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE); emm2 = _mm512_add_epi32(emm2, _pi32_1); emm2 = _mm512_and_epi32(emm2, _pi32_inv1); y = _mm512_cvtfxpnt_round_adjustepu32_ps(emm2, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE); __m512i emm0 = _mm512_and_epi32(emm2, _pi32_4); emm0 = _mm512_slli_epi32(emm0, 29); emm2 = _mm512_and_epi32(emm2, _pi32_2); __mmask16 mask = _mm512_cmp_epi32_mask(emm2, _pi32_0, _MM_CMPINT_EQ); emm2 = _mm512_mask_add_epi32(_pi32_0, mask, _pi32_ffff, _pi32_0); sign_bit = _mm512_xor_epi32(sign_bit, emm0); mic_m512_t temp = _ps_minus_cephes_DP123; temp = _mm512_mul_ps(y, temp); x = _mm512_add_ps(x, temp); mic_m512_t x2 = _mm512_mul_ps(x, x); mic_m512_t x3 = _mm512_mul_ps(x2, x); mic_m512_t x4 = _mm512_mul_ps(x2, x2); y = _mm512_mul_ps(_ps_coscof_p0, x2); mic_m512_t y2 = _mm512_mul_ps(_ps_sincof_p0, x2); y = _mm512_add_ps(y, _ps_coscof_p1); y2 = _mm512_add_ps(y2, _ps_sincof_p1); y = _mm512_mul_ps(y, x2); y2 = _mm512_mul_ps(y2, x2); y = _mm512_add_ps(y, _ps_coscof_p2); y2 = _mm512_add_ps(y2, _ps_sincof_p2); y = _mm512_mul_ps(y, x4); y2 = _mm512_mul_ps(y2, x3); temp = _mm512_mul_ps(x2, _ps_0point5); temp = _mm512_sub_ps(temp, _ps_1); y = _mm512_sub_ps(y, temp); y2 = _mm512_add_ps(y2, x); y = _mm512_castsi512_ps(_mm512_andnot_epi32(emm2, _mm512_castps_si512(y))); y2 = _mm512_castsi512_ps(_mm512_and_epi32(emm2, _mm512_castps_si512(y2))); y = _mm512_add_ps(y, y2); y = _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(y), sign_bit)); return y; } // sin_ps()
// Reciprocal of packed complex numbers.
// With a = x + i*y this evaluates 1/a = (x - i*y) / (x^2 + y^2), where the
// division is carried out as a multiplication by _mm512_rcp23_ps(x^2 + y^2).
// xvec carries the real parts and yvec the imaginary parts.
static inline mic_m512c_t mic_rcp_cps(mic_m512c_t a) {
    // Squared magnitude x^2 + y^2 of every element.
    const mic_m512_t norm_sq = _mm512_add_ps(_mm512_mul_ps(a.xvec, a.xvec),
                                             _mm512_mul_ps(a.yvec, a.yvec));
    const mic_m512_t inv_norm_sq = _mm512_rcp23_ps(norm_sq);

    mic_m512c_t result;
    result.xvec = _mm512_mul_ps(a.xvec, inv_norm_sq);

    // The imaginary part is negated by subtracting it from zero (the original
    // author left an XOR-with-sign-mask variant commented out in favour of this).
    const mic_m512_t scaled_imag = _mm512_mul_ps(a.yvec, inv_norm_sq);
    const mic_m512_t zero = _mm512_setzero_ps();
    result.yvec = _mm512_sub_ps(zero, scaled_imag);

    return result;
} // mic_rcp_cps()
// exp() inline mic_m512_t mic_exp_ps(mic_m512_t x) { x = _mm512_min_ps(x, _ps_exp_hi); x = _mm512_max_ps(x, _ps_exp_lo); mic_m512_t temp_2 = _mm512_fmadd_ps(x, _ps_cephes_LOG2EF, _ps_0point5); mic_m512_t temp_1 = _mm512_round_ps(temp_2, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE); mic_m512_t temp_3 = _mm512_sub_ps(temp_1, temp_2); __mmask16 mask = _mm512_cmp_ps_mask(temp_3, _ps_0, _MM_CMPINT_GT); temp_2 = _mm512_mask_sub_ps(temp_1, mask, temp_1, _ps_1); __m512i emm0 = _mm512_cvtfxpnt_round_adjustps_epi32(temp_2, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE); x = _mm512_fnmadd_ps(temp_2, _ps_cephes_exp_C12, x); mic_m512_t x2 = _mm512_mul_ps(x, x); mic_m512_t x3 = _mm512_mul_ps(x2, x); mic_m512_t x4 = _mm512_mul_ps(x2, x2); temp_1 = _mm512_add_ps(x, _ps_1); temp_1 = _mm512_fmadd_ps(x2, _ps_cephes_exp_p5, temp_1); temp_1 = _mm512_fmadd_ps(x3, _ps_cephes_exp_p4, temp_1); temp_2 = _mm512_mul_ps(x3, _ps_cephes_exp_p0); temp_3 = _mm512_mul_ps(x2, _ps_cephes_exp_p1); mic_m512_t temp_4 = _mm512_mul_ps(x, _ps_cephes_exp_p2); emm0 = _mm512_add_epi32(emm0, _pi32_0x7f); temp_2 = _mm512_add_ps(temp_2, temp_3); temp_3 = _mm512_add_ps(temp_3, temp_4); temp_2 = _mm512_add_ps(temp_2, temp_3); emm0 = _mm512_slli_epi32(emm0, 23); mic_m512_t pow2n = _mm512_castsi512_ps(emm0); temp_2 = _mm512_mul_ps(temp_2, x4); mic_m512_t y = _mm512_add_ps(temp_1, temp_2); y = _mm512_mul_ps(y, pow2n); return y; } // newexp_ps()
int main() { // Initialize int N = 1 << 16; int NALIGN = 64; int i, j; float OPS = 20. * N * N * 1e-9; float EPS2 = 1e-6; double tic, toc; float * x = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * y = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * z = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * m = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * p = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * ax = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * ay = (float*) _mm_malloc(N * sizeof(float), NALIGN); float * az = (float*) _mm_malloc(N * sizeof(float), NALIGN); #pragma omp parallel for for (i=0; i<N; i++) { x[i] = drand48(); y[i] = drand48(); z[i] = drand48(); m[i] = drand48() / N; p[i] = ax[i] = ay[i] = az[i] = 0; } printf("N : %d\n",N); #pragma omp parallel private(j) { #pragma omp single tic = get_time(); // Vectorize target with intrinsics #pragma omp for for (i=0; i<N; i+=16) { __m512 pi = _mm512_setzero_ps(); __m512 axi = _mm512_setzero_ps(); __m512 ayi = _mm512_setzero_ps(); __m512 azi = _mm512_setzero_ps(); __m512 xi = _mm512_load_ps(x+i); __m512 yi = _mm512_load_ps(y+i); __m512 zi = _mm512_load_ps(z+i); for (j=0; j<N; j++) { __m512 xj = _mm512_set1_ps(x[j]); xj = _mm512_sub_ps(xj, xi); __m512 yj = _mm512_set1_ps(y[j]); yj = _mm512_sub_ps(yj, yi); __m512 zj = _mm512_set1_ps(z[j]); zj = _mm512_sub_ps(zj, zi); __m512 R2 = _mm512_set1_ps(EPS2); R2 = _mm512_fmadd_ps(xj, xj, R2); R2 = _mm512_fmadd_ps(yj, yj, R2); R2 = _mm512_fmadd_ps(zj, zj, R2); __m512 mj = _mm512_set1_ps(m[j]); __m512 invR = _mm512_rsqrt23_ps(R2); mj = _mm512_mul_ps(mj, invR); pi = _mm512_add_ps(pi, mj); invR = _mm512_mul_ps(invR, invR); invR = _mm512_mul_ps(invR, mj); axi = _mm512_fmadd_ps(xj, invR, axi); ayi = _mm512_fmadd_ps(yj, invR, ayi); azi = _mm512_fmadd_ps(zj, invR, azi); } _mm512_store_ps(p+i, pi); _mm512_store_ps(ax+i, axi); _mm512_store_ps(ay+i, ayi); _mm512_store_ps(az+i, azi); } #pragma omp single { toc = get_time(); 
printf("Vectorize target with intrinsics : %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic)); // Vectorize source with intrinsics tic = get_time(); } #pragma omp for for (i=0; i<N; i++) { __m512 pi = _mm512_setzero_ps(); __m512 axi = _mm512_setzero_ps(); __m512 ayi = _mm512_setzero_ps(); __m512 azi = _mm512_setzero_ps(); __m512 xi = _mm512_set1_ps(x[i]); __m512 yi = _mm512_set1_ps(y[i]); __m512 zi = _mm512_set1_ps(z[i]); for (j=0; j<N; j+=16) { __m512 xj = _mm512_load_ps(x+j); xj = _mm512_sub_ps(xj, xi); __m512 yj = _mm512_load_ps(y+j); yj = _mm512_sub_ps(yj, yi); __m512 zj = _mm512_load_ps(z+j); zj = _mm512_sub_ps(zj, zi); __m512 R2 = _mm512_set1_ps(EPS2); R2 = _mm512_fmadd_ps(xj, xj, R2); R2 = _mm512_fmadd_ps(yj, yj, R2); R2 = _mm512_fmadd_ps(zj, zj, R2); __m512 mj = _mm512_load_ps(m+j); __m512 invR = _mm512_rsqrt23_ps(R2); mj = _mm512_mul_ps(mj, invR); pi = _mm512_add_ps(pi, mj); invR = _mm512_mul_ps(invR, invR); invR = _mm512_mul_ps(invR, mj); axi = _mm512_fmadd_ps(xj, invR, axi); ayi = _mm512_fmadd_ps(yj, invR, ayi); azi = _mm512_fmadd_ps(zj, invR, azi); } p[i] = _mm512_reduce_add_ps(pi); ax[i] = _mm512_reduce_add_ps(axi); ay[i] = _mm512_reduce_add_ps(ayi); az[i] = _mm512_reduce_add_ps(azi); } #pragma omp single { toc = get_time(); printf("Vectorize source with intrinsics : %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic)); // Vectorize target with pragma simd tic = get_time(); } #pragma simd #pragma omp for for (i=0; i<N; i++) { float pi = 0; float axi = 0; float ayi = 0; float azi = 0; float xi = x[i]; float yi = y[i]; float zi = z[i]; for (j=0; j<N; j++) { float dx = x[j] - xi; float dy = y[j] - yi; float dz = z[j] - zi; float R2 = dx * dx + dy * dy + dz * dz + EPS2; float invR = 1.0f / sqrtf(R2); float invR3 = m[j] * invR * invR * invR; pi += m[j] * invR; axi += dx * invR3; ayi += dy * invR3; azi += dz * invR3; } p[i] = pi; ax[i] = axi; ay[i] = ayi; az[i] = azi; } #pragma omp single { toc = get_time(); printf("Vectorize target with pragma simd: %e s : %lf 
GFlops\n",toc-tic, OPS/(toc-tic)); // Vectorize source with pragma simd tic = get_time(); } #pragma omp for for (i=0; i<N; i++) { float pi = 0; float axi = 0; float ayi = 0; float azi = 0; float xi = x[i]; float yi = y[i]; float zi = z[i]; #pragma simd for (j=0; j<N; j++) { float dx = x[j] - xi; float dy = y[j] - yi; float dz = z[j] - zi; float R2 = dx * dx + dy * dy + dz * dz + EPS2; float invR = 1.0f / sqrtf(R2); float invR3 = m[j] * invR * invR * invR; pi += m[j] * invR; axi += dx * invR3; ayi += dy * invR3; azi += dz * invR3; } p[i] = pi; ax[i] = axi; ay[i] = ayi; az[i] = azi; } #pragma omp single { toc = get_time(); printf("Vectorize source with pragma simd: %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic)); } } _mm_free(x); _mm_free(y); _mm_free(z); _mm_free(m); _mm_free(p); _mm_free(ax); _mm_free(ay); _mm_free(az); return 0; }
// Product of two packed complex vectors:
//   (ax + i*ay) * (bx + i*by) = (ax*bx - ay*by) + i*(ax*by + ay*bx)
// xvec carries the real parts, yvec the imaginary parts.
static inline mic_m512c_t mic_mul_ccps(mic_m512c_t a, mic_m512c_t b) {
    // The four partial products.
    const mic_m512_t re_re = _mm512_mul_ps(a.xvec, b.xvec);
    const mic_m512_t im_im = _mm512_mul_ps(a.yvec, b.yvec);
    const mic_m512_t re_im = _mm512_mul_ps(a.xvec, b.yvec);
    const mic_m512_t im_re = _mm512_mul_ps(a.yvec, b.xvec);

    mic_m512c_t product;
    product.xvec = _mm512_sub_ps(re_re, im_im);
    product.yvec = _mm512_add_ps(re_im, im_re);
    return product;
} // mic_mul_ccps()
// Element-wise difference lhs - rhs, computed with _mm512_sub_ps.
static batch_type sub(const batch_type& lhs, const batch_type& rhs)
{
    batch_type difference = _mm512_sub_ps(lhs, rhs);
    return difference;
}
// Element-wise negation of rhs.
//
// Implemented by flipping the sign bit of every lane rather than computing
// 0 - rhs: the subtraction yields +0.0 for an input of +0.0, whereas a true
// negation must return -0.0. All other finite values give the same result
// either way. The XOR is done in the integer domain so only AVX-512F
// intrinsics are needed.
static batch_type neg(const batch_type& rhs)
{
    // -0.0f has only the sign bit set, so its bit pattern is the XOR mask.
    const __m512i sign_mask = _mm512_castps_si512(_mm512_set1_ps(-0.0f));
    const __m512i flipped = _mm512_xor_si512(_mm512_castps_si512(rhs), sign_mask);
    return _mm512_castsi512_ps(flipped);
}
// Renders an escape-time image of the Mandelbrot set, 16 pixels at a time.
//
// The rectangle [Re_min, Re_max) x [Im_min, Im_max) is sampled on a
// width x height grid. For every pixel the iteration X <- X^2 + C is run for
// at most `maxiters` steps, and the output byte counts the steps after which
// |X|^2 was still <= threshold.
//
// @param Re_min, Re_max  real-axis range of the image
// @param Im_min, Im_max  imaginary-axis range of the image
// @param threshold       bound applied to the SQUARED magnitude
// @param maxiters        iteration limit; counters are 8-bit and wrap around
//                        when a pixel stays inside for more than 255 steps
// @param width, height   image size in pixels
// @param data            output buffer of width*height bytes, row-major;
//                        no alignment requirement
//
// Rows are written back to back with a stride of exactly `width` bytes. When
// `width` is not a multiple of 16 the last vector of each row is computed and
// stored under a lane mask, so nothing is written past the end of the row.
void AVX512BW_mandelbrot(
    float Re_min, float Re_max, float Im_min, float Im_max,
    float threshold, int maxiters, int width, int height, uint8_t *data)
{
    float dRe, dIm;
    int x, y;

    uint8_t* out = data;

    // step on Re and Im axis
    dRe = (Re_max - Re_min)/width;
    dIm = (Im_max - Im_min)/height;

    // prepare vectors
    // 1. threshold
    const __m512 vec_threshold = _mm512_set1_ps(threshold);

    // 2. Cim
    __m512 Cim = _mm512_set1_ps(Im_min);

    // 3. Re advance every x iteration
    const __m512 vec_dRe = _mm512_set1_ps(16*dRe);

    // 4. Im advance every y iteration
    const __m512 vec_dIm = _mm512_set1_ps(dIm);

    // calculations
    for (y=0; y < height; y++) {

        __m512 Cre = _mm512_setr_ps(
            Re_min +  0*dRe, Re_min +  1*dRe, Re_min +  2*dRe, Re_min +  3*dRe,
            Re_min +  4*dRe, Re_min +  5*dRe, Re_min +  6*dRe, Re_min +  7*dRe,
            Re_min +  8*dRe, Re_min +  9*dRe, Re_min + 10*dRe, Re_min + 11*dRe,
            Re_min + 12*dRe, Re_min + 13*dRe, Re_min + 14*dRe, Re_min + 15*dRe
        );

        for (x=0; x < width; x+=16) {

            // Number of pixels of this row covered by the current vector and
            // the matching lane mask (all 16 lanes except in a partial tail).
            const int remaining = width - x;
            const int count = (remaining < 16) ? remaining : 16;
            const __mmask16 lanes = (__mmask16)(0xFFFFu >> (16 - count));

            __m512 Xre = _mm512_setzero_ps();
            __m512 Xim = _mm512_setzero_ps();

            __m128i itercount = _mm_setzero_si128();

            int i;
            for (i=0; i < maxiters; i++) {

                // Tre = Xre^2 - Xim^2 + Cre
                const __m512 Xre2 = _mm512_mul_ps(Xre, Xre);
                const __m512 Xim2 = _mm512_mul_ps(Xim, Xim);
                const __m512 Tre  = _mm512_add_ps(Cre, _mm512_sub_ps(Xre2, Xim2));

                // Tim = 2*Xre*Xim + Cim
                const __m512 t1   = _mm512_mul_ps(Xre, Xim);
                const __m512 Tim  = _mm512_add_ps(Cim, _mm512_add_ps(t1, t1));

                // sqr_dist = Tre^2 + Tim^2
                __m512 Tre2 = _mm512_mul_ps(Tre, Tre);
                __m512 Tim2 = _mm512_mul_ps(Tim, Tim);
                __m512 sqr_dist = _mm512_add_ps(Tre2, Tim2);

                // sqr_dist <= threshold => 16-bit mask; lanes beyond the end
                // of the row are forced to zero so they never keep the loop
                // running.
                __mmask16 mask = _mm512_mask_cmp_ps_mask(lanes, sqr_dist, vec_threshold, _CMP_LE_OS);
                if (mask == 0) {
                    break;
                }

                // Note: unlike SSE/AVX2 versions itercount is a packed byte vector,
                //       thus conversion packed dword -> byte is not needed.
                // _mm_movm_epi8 expands each mask bit to a 0xFF byte (-1), so
                // subtracting it adds one to every lane that is still inside.
                itercount = _mm_sub_epi8(itercount, _mm_movm_epi8(mask));

                Xre = Tre;
                Xim = Tim;
            } // for

            // Unaligned, lane-masked store: writes exactly `count` bytes.
            _mm_mask_storeu_epi8(out, lanes, itercount);
            out += count;

            // advance Cre vector
            Cre = _mm512_add_ps(Cre, vec_dRe);
        }

        // advance Cim vector
        Cim = _mm512_add_ps(Cim, vec_dIm);
    }
}
// In-place element-wise subtraction: each of the two __m512 members
// (val1, val2) is replaced by its difference with the matching member of `other`.
inline void operator-=(const short_vec<float, 32>& other)
{
    const __m512 diff1 = _mm512_sub_ps(val1, other.val1);
    val1 = diff1;

    const __m512 diff2 = _mm512_sub_ps(val2, other.val2);
    val2 = diff2;
}