/*
 * N-body micro-benchmark: for N particles, accumulate the softened
 * potential p[i] and acceleration (ax, ay, az)[i] due to all N particles,
 * using four differently vectorized kernels, and print the wall time and
 * GFlop/s of each.
 *
 * The four kernels are, in order:
 *   1. 16 targets (i) per vector, one broadcast source (j) per step   -- intrinsics
 *   2. one broadcast target (i), 16 sources (j) per vector + reduction -- intrinsics
 *   3. scalar code with "#pragma simd" on the outer (target) loop
 *   4. scalar code with "#pragma simd" on the inner (source) loop
 *
 * Each kernel overwrites p/ax/ay/az; the results are not compared.
 * get_time() is defined outside this block.
 *
 * Returns 0 on success, 1 if an allocation fails.
 */
int main() {
    // Initialize
    int N = 1 << 16;             // particle count; a multiple of 16 so the vector loops need no tail
    int NALIGN = 64;             // 64-byte alignment = one 512-bit vector, needed by the aligned load/store
    int i, j;
    float OPS = 20. * N * N * 1e-9;  // 20 operations counted per (i,j) pair, in units of 1e9
    float EPS2 = 1e-6;           // softening term added to the squared distance (avoids 1/0 at i == j)
    double tic, toc;
    float * x = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * y = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * z = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * m = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * p = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * ax = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * ay = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * az = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    if (!x || !y || !z || !m || !p || !ax || !ay || !az) {
        // Nothing useful can run without all eight arrays; the process
        // exits right away, so the partial allocations are left to the OS.
        fprintf(stderr, "Failed to allocate particle arrays (N = %d)\n", N);
        return 1;
    }

    // drand48() updates hidden global state and is not required to be
    // thread-safe, so the random inputs are generated serially. This also
    // makes the generated data independent of the thread count.
    for (i=0; i<N; i++) {
        x[i] = drand48();
        y[i] = drand48();
        z[i] = drand48();
        m[i] = drand48() / N;
    }
    // Zeroing the outputs has no shared state and stays parallel.
    #pragma omp parallel for
    for (i=0; i<N; i++) {
        p[i] = ax[i] = ay[i] = az[i] = 0;
    }
    printf("N : %d\n",N);

    // One parallel region for all four kernels. The timer reads and the
    // prints are done by a single thread; each "omp for" / "omp single"
    // ends with an implicit barrier, so timings bracket whole kernels.
    #pragma omp parallel private(j)
    {
        #pragma omp single
        tic = get_time();
        // Vectorize target with intrinsics
        #pragma omp for
        for (i=0; i<N; i+=16) {
            // Accumulators and coordinates for 16 consecutive targets.
            __m512 pi = _mm512_setzero_ps();
            __m512 axi = _mm512_setzero_ps();
            __m512 ayi = _mm512_setzero_ps();
            __m512 azi = _mm512_setzero_ps();
            __m512 xi = _mm512_load_ps(x+i);
            __m512 yi = _mm512_load_ps(y+i);
            __m512 zi = _mm512_load_ps(z+i);
            for (j=0; j<N; j++) {
                // Broadcast source j to all lanes and form the separation vector.
                __m512 xj = _mm512_set1_ps(x[j]);
                xj = _mm512_sub_ps(xj, xi);
                __m512 yj = _mm512_set1_ps(y[j]);
                yj = _mm512_sub_ps(yj, yi);
                __m512 zj = _mm512_set1_ps(z[j]);
                zj = _mm512_sub_ps(zj, zi);
                // R2 = dx^2 + dy^2 + dz^2 + EPS2
                __m512 R2 = _mm512_set1_ps(EPS2);
                R2 = _mm512_fmadd_ps(xj, xj, R2);
                R2 = _mm512_fmadd_ps(yj, yj, R2);
                R2 = _mm512_fmadd_ps(zj, zj, R2);
                __m512 mj = _mm512_set1_ps(m[j]);
                __m512 invR = _mm512_rsqrt23_ps(R2);
                mj = _mm512_mul_ps(mj, invR);        // m / R
                pi = _mm512_add_ps(pi, mj);
                invR = _mm512_mul_ps(invR, invR);    // 1 / R^2
                invR = _mm512_mul_ps(invR, mj);      // m / R^3
                axi = _mm512_fmadd_ps(xj, invR, axi);
                ayi = _mm512_fmadd_ps(yj, invR, ayi);
                azi = _mm512_fmadd_ps(zj, invR, azi);
            }
            _mm512_store_ps(p+i, pi);
            _mm512_store_ps(ax+i, axi);
            _mm512_store_ps(ay+i, ayi);
            _mm512_store_ps(az+i, azi);
        }
        #pragma omp single
        {
            toc = get_time();
            printf("Vectorize target with intrinsics : %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic));

            // Vectorize source with intrinsics
            tic = get_time();
        }
        #pragma omp for
        for (i=0; i<N; i++) {
            // One target broadcast to all lanes; each lane accumulates a
            // partial sum over every 16th source, reduced after the loop.
            __m512 pi = _mm512_setzero_ps();
            __m512 axi = _mm512_setzero_ps();
            __m512 ayi = _mm512_setzero_ps();
            __m512 azi = _mm512_setzero_ps();
            __m512 xi = _mm512_set1_ps(x[i]);
            __m512 yi = _mm512_set1_ps(y[i]);
            __m512 zi = _mm512_set1_ps(z[i]);
            for (j=0; j<N; j+=16) {
                __m512 xj = _mm512_load_ps(x+j);
                xj = _mm512_sub_ps(xj, xi);
                __m512 yj = _mm512_load_ps(y+j);
                yj = _mm512_sub_ps(yj, yi);
                __m512 zj = _mm512_load_ps(z+j);
                zj = _mm512_sub_ps(zj, zi);
                __m512 R2 = _mm512_set1_ps(EPS2);
                R2 = _mm512_fmadd_ps(xj, xj, R2);
                R2 = _mm512_fmadd_ps(yj, yj, R2);
                R2 = _mm512_fmadd_ps(zj, zj, R2);
                __m512 mj = _mm512_load_ps(m+j);
                __m512 invR = _mm512_rsqrt23_ps(R2);
                mj = _mm512_mul_ps(mj, invR);        // m / R
                pi = _mm512_add_ps(pi, mj);
                invR = _mm512_mul_ps(invR, invR);    // 1 / R^2
                invR = _mm512_mul_ps(invR, mj);      // m / R^3
                axi = _mm512_fmadd_ps(xj, invR, axi);
                ayi = _mm512_fmadd_ps(yj, invR, ayi);
                azi = _mm512_fmadd_ps(zj, invR, azi);
            }
            // Horizontal sums collapse the 16 partial results per target.
            p[i] = _mm512_reduce_add_ps(pi);
            ax[i] = _mm512_reduce_add_ps(axi);
            ay[i] = _mm512_reduce_add_ps(ayi);
            az[i] = _mm512_reduce_add_ps(azi);
        }
        #pragma omp single
        {
            toc = get_time();
            printf("Vectorize source with intrinsics : %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic));

            // Vectorize target with pragma simd
            tic = get_time();
        }
        // NOTE(review): "#pragma simd" sits before "#pragma omp for" rather
        // than directly before the loop statement -- confirm the compiler in
        // use actually applies it to the outer loop.
#pragma simd
        #pragma omp for
        for (i=0; i<N; i++) {
            float pi = 0;
            float axi = 0;
            float ayi = 0;
            float azi = 0;
            float xi = x[i];
            float yi = y[i];
            float zi = z[i];
            for (j=0; j<N; j++) {
                float dx = x[j] - xi;
                float dy = y[j] - yi;
                float dz = z[j] - zi;
                float R2 = dx * dx + dy * dy + dz * dz + EPS2;
                float invR = 1.0f / sqrtf(R2);
                float invR3 = m[j] * invR * invR * invR;
                pi += m[j] * invR;
                axi += dx * invR3;
                ayi += dy * invR3;
                azi += dz * invR3;
            }
            p[i] = pi;
            ax[i] = axi;
            ay[i] = ayi;
            az[i] = azi;
        }
        #pragma omp single
        {
            toc = get_time();
            printf("Vectorize target with pragma simd: %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic));

            // Vectorize source with pragma simd
            tic = get_time();
        }
        #pragma omp for
        for (i=0; i<N; i++) {
            float pi = 0;
            float axi = 0;
            float ayi = 0;
            float azi = 0;
            float xi = x[i];
            float yi = y[i];
            float zi = z[i];
#pragma simd
            for (j=0; j<N; j++) {
                float dx = x[j] - xi;
                float dy = y[j] - yi;
                float dz = z[j] - zi;
                float R2 = dx * dx + dy * dy + dz * dz + EPS2;
                float invR = 1.0f / sqrtf(R2);
                float invR3 = m[j] * invR * invR * invR;
                pi += m[j] * invR;
                axi += dx * invR3;
                ayi += dy * invR3;
                azi += dz * invR3;
            }
            p[i] = pi;
            ax[i] = axi;
            ay[i] = ayi;
            az[i] = azi;
        }
        #pragma omp single
        {
            toc = get_time();
            printf("Vectorize source with pragma simd: %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic));
        }
    }

    _mm_free(x);
    _mm_free(y);
    _mm_free(z);
    _mm_free(m);
    _mm_free(p);
    _mm_free(ax);
    _mm_free(ay);
    _mm_free(az);
    return 0;
}
	// Build a complex vector in which every lane holds the scalar complex
	// value `a`: a.x is broadcast into the xvec member and a.y into the
	// yvec member.
	static inline mic_m512c_t mic_set1_cps(scomplex_t a) {
		mic_m512c_t broadcast;
		broadcast.yvec = _mm512_set1_ps(a.y);
		broadcast.xvec = _mm512_set1_ps(a.x);
		return broadcast;
	} // mic_set1_cps()
Example #3
0
/*
 * Thread entry point measuring single-precision AVX-512 FMA throughput.
 *
 * args_in points to a struct thread_args. The function reads
 * args->timer_type and args->min_runtime, and writes args->runtime,
 * args->flops, args->bw_load and args->bw_store before exiting the thread.
 *
 * The kernel applies one fused multiply-add to each of n_avx512 independent
 * vector accumulators per iteration. The iteration count r_max is doubled
 * until at least one participating thread reports a runtime above
 * args->min_runtime. runtime_flag, runtime_mutex and timer_barrier are
 * shared objects defined outside this block.
 *
 * Does not return: terminates the calling thread with pthread_exit(NULL).
 */
void * avx512_fma(void *args_in)
{
    /* Thread input */
    struct thread_args *args;

    /* Number of independent accumulators (VFMAPS_LATENCY is defined
     * elsewhere; presumably chosen to cover the FMA latency). */
    const int n_avx512 = VFMAPS_LATENCY;
    const __m512 add0 = _mm512_set1_ps((float) 1e-6);
    const __m512 mul0 = _mm512_set1_ps((float) (1. + 1e-6));
    __m512 r[n_avx512];

    // Declare as volatile to prevent removal during optimisation
    volatile float result;

    long r_max, i;
    int j;
    double runtime, flops;
    Stopwatch *t;

    /* Read inputs */
    args = (struct thread_args *) args_in;

    t = stopwatch_create(args->timer_type);

    for (j = 0; j < n_avx512; j++) {
        r[j] = _mm512_set1_ps((float) j);
    }

    /* Every accumulator is updated as r[j] = r[j] * mul0 + add0 with a
     * single FMA. The n_avx512 updates within one iteration are mutually
     * independent, so they can overlap in the pipeline.
     */

    runtime_flag = 0;
    r_max = 1;
    do {
        /* All threads start the timed section together. */
        pthread_barrier_wait(&timer_barrier);
        t->start(t);
        for (i = 0; i < r_max; i++) {
            #pragma unroll(n_avx512)
            for (j = 0; j < n_avx512; j++)
                r[j] = _mm512_fmadd_ps(r[j], mul0, add0);
        }
        t->stop(t);
        runtime = t->runtime(t);

        /* Set runtime flag if any thread exceeds runtime limit */
        if (runtime > args->min_runtime) {
            pthread_mutex_lock(&runtime_mutex);
            runtime_flag = 1;
            pthread_mutex_unlock(&runtime_mutex);
        }

        /* Wait until every thread has had the chance to set the flag, so
         * that all threads take the same continue/stop decision. */
        pthread_barrier_wait(&timer_barrier);
        if (!runtime_flag) r_max *= 2;

    } while (!runtime_flag);

    /* In order to prevent removal of the prior loop by optimisers,
     * sum the register values and save the result as volatile.
     * Start at 1: r[0] is already the running total. */

    for (j = 1; j < n_avx512; j++)
        r[0] = _mm512_add_ps(r[0], r[j]);
    result = reduce_AVX512(r[0]);

    /* (iter) * (16 floats / reg) * (2 flops / FMA) * (n_avx512 FMAs / iter).
     * Evaluated in double so the product cannot overflow a 32-bit long. */
    flops = (double) r_max * 16 * 2 * n_avx512 / runtime;

    /* Cleanup */
    t->destroy(t);

    /* Thread output */
    args->runtime = runtime;
    args->flops = flops;
    args->bw_load = 0.;
    args->bw_store = 0.;

    pthread_exit(NULL);
}
	// Return a vector whose every lane holds the real scalar `a`.
	static inline mic_m512_t mic_set1_rps(real_t a) {
		const mic_m512_t splat = _mm512_set1_ps(a);
		return splat;
	} // mic_set1_rps()
 // Broadcast constructor: initializes `val` with two 512-bit vectors, each
 // having `data` replicated into all of its float lanes. With no argument
 // every lane is 0.
 // NOTE(review): `val` is declared outside this fragment; presumably a
 // two-element array/aggregate of __m512 -- confirm against the class.
 inline
 short_vec(const float data = 0) :
     val{_mm512_set1_ps(data),
         _mm512_set1_ps(data)}
 {}
 *  NON-COMMERCIAL END USER LICENSE AGREEMENT.
 */

// Everything declared after this pragma carries the target(mic) offload
// attribute. NOTE(review): the matching "pop" is not visible in this part
// of the file -- confirm it exists further down.
#pragma offload_attribute(push, target(mic))

// Packed 32-bit integer constants, each value replicated into every lane.
// _pi32_sign_mask has only bit 31 set per lane; _pi32_inv_sign_mask is its
// bitwise complement; _pi32_inv1 is ~1; _pi32_ffff has all bits set.
static const __m512i _pi32_sign_mask = _mm512_set1_epi32(0x80000000);
static const __m512i _pi32_inv_sign_mask = _mm512_set1_epi32(~0x80000000);
static const __m512i _pi32_0 = _mm512_set1_epi32(0);
static const __m512i _pi32_1 = _mm512_set1_epi32(1);
static const __m512i _pi32_2 = _mm512_set1_epi32(2);
static const __m512i _pi32_4 = _mm512_set1_epi32(4);
static const __m512i _pi32_inv1 = _mm512_set1_epi32(~1);
static const __m512i _pi32_0x7f = _mm512_set1_epi32(0x7f);
static const __m512i _pi32_ffff = _mm512_set1_epi32(0xffffffff);

// Packed single-precision constants, each value replicated into every lane.
// mic_m512_t is declared elsewhere (presumably an alias of __m512).
static const mic_m512_t _ps_1 = _mm512_set1_ps(1.0f);
static const mic_m512_t _ps_0point5 = _mm512_set1_ps(0.5f);
static const mic_m512_t _ps_0 = _mm512_set1_ps(0.0f);
// Going by the names, these are upper/lower bounds and polynomial
// coefficients for a Cephes-style expf(); the consumer is outside this
// block. LOG2EF is numerically log2(e) and exp_C12 is numerically ln(2).
// NOTE(review): the coefficient list goes p0, p1, p2, p4, p5 -- no exp_p3
// is defined here; confirm whether it lives elsewhere or is missing.
static const mic_m512_t _ps_exp_hi = _mm512_set1_ps(88.3762626647949f);
static const mic_m512_t _ps_exp_lo = _mm512_set1_ps(-88.3762626647949f);
static const mic_m512_t _ps_cephes_LOG2EF = _mm512_set1_ps(1.44269504088896341f);
static const mic_m512_t _ps_cephes_exp_C12 = _mm512_set1_ps(0.69314718056f);
static const mic_m512_t _ps_cephes_exp_p0 = _mm512_set1_ps(1.9875691500E-4f);
static const mic_m512_t _ps_cephes_exp_p1 = _mm512_set1_ps(1.3981999507E-3f);
static const mic_m512_t _ps_cephes_exp_p2 = _mm512_set1_ps(8.3334519073E-3f);
static const mic_m512_t _ps_cephes_exp_p4 = _mm512_set1_ps(1.6666665459E-1f);
static const mic_m512_t _ps_cephes_exp_p5 = _mm512_set1_ps(5.0000001201E-1f);
// -pi/4 split into three parts of decreasing magnitude (DP1 + DP2 + DP3),
// plus the same value as a single constant (DP123).
static const mic_m512_t _ps_minus_cephes_DP1 = _mm512_set1_ps(-0.78515625f);
static const mic_m512_t _ps_minus_cephes_DP2 = _mm512_set1_ps(-2.4187564849853515625e-4f);
static const mic_m512_t _ps_minus_cephes_DP3 = _mm512_set1_ps(-3.77489497744594108e-8f);
static const mic_m512_t _ps_minus_cephes_DP123 = _mm512_set1_ps(-0.7853981633974483096156608f);
/*
 * Compute Mandelbrot iteration counts for a width x height grid, 16 pixels
 * per step, using AVX-512 float arithmetic and 8-bit per-pixel counters.
 *
 * Re_min, Re_max  real-axis range mapped onto the columns
 * Im_min, Im_max  imaginary-axis range mapped onto the rows
 * threshold       bound compared against the SQUARED magnitude |z|^2
 * maxiters        maximum iterations per pixel; counters are 8-bit and
 *                 wrap around, so values above 255 are not representable
 * width, height   grid size in pixels
 * data            output, one byte per pixel: the number of iterations for
 *                 which |z|^2 <= threshold held
 *
 * Output is written in groups of 16 bytes, one group per 16 columns,
 * back to back. If width is not a multiple of 16 each row is rounded up to
 * the next multiple of 16, so `data` must hold
 * height * 16 * ceil(width / 16) bytes. No alignment of `data` is required.
 */
void AVX512BW_mandelbrot(
	float Re_min, float Re_max,
	float Im_min, float Im_max,
	float threshold,
	int maxiters,
	int width, int height,
	uint8_t *data)

{
	float dRe, dIm;
	int x, y;

	__m128i* ptr = (__m128i*)data;

	// step on Re and Im axis
	dRe = (Re_max - Re_min)/width;
	dIm = (Im_max - Im_min)/height;

	// prepare vectors
	// 1. threshold
	const __m512 vec_threshold = _mm512_set1_ps(threshold);

	// 2. Cim
	__m512 Cim = _mm512_set1_ps(Im_min);

	// 3. Re advance every x iteration (16 pixels per step)
	const __m512 vec_dRe = _mm512_set1_ps(16*dRe);

	// 4. Im advance every y iteration
	const __m512 vec_dIm = _mm512_set1_ps(dIm);

	// calculations
	for (y=0; y < height; y++) {

		// Real parts of the first 16 pixels of the row
		__m512 Cre = _mm512_setr_ps(
			Re_min +  0*dRe, Re_min +  1*dRe, Re_min +  2*dRe, Re_min +  3*dRe,
			Re_min +  4*dRe, Re_min +  5*dRe, Re_min +  6*dRe, Re_min +  7*dRe,
			Re_min +  8*dRe, Re_min +  9*dRe, Re_min + 10*dRe, Re_min + 11*dRe,
			Re_min + 12*dRe, Re_min + 13*dRe, Re_min + 14*dRe, Re_min + 15*dRe
		);

		for (x=0; x < width; x+=16) {

			__m512 Xre = _mm512_setzero_ps();
			__m512 Xim = _mm512_setzero_ps();

			__m128i itercount = _mm_setzero_si128();

			int i;
			for (i=0; i < maxiters; i++) {

				// Tre = Xre^2 - Xim^2 + Cre
				const __m512 Xre2 = _mm512_mul_ps(Xre, Xre);
				const __m512 Xim2 = _mm512_mul_ps(Xim, Xim);
				const __m512 Tre  = _mm512_add_ps(Cre, _mm512_sub_ps(Xre2, Xim2));

				// Tim = 2*Xre*Xim + Cim
				const __m512 t1  = _mm512_mul_ps(Xre, Xim);
				const __m512 Tim = _mm512_add_ps(Cim, _mm512_add_ps(t1, t1));

				// sqr_dist = Tre^2 + Tim^2
				__m512 Tre2 = _mm512_mul_ps(Tre, Tre);
				__m512 Tim2 = _mm512_mul_ps(Tim, Tim);
				__m512 sqr_dist = _mm512_add_ps(Tre2, Tim2);

				// sqr_dist <= threshold => 16-bit mask, one bit per pixel
				__mmask16 mask = _mm512_cmp_ps_mask(sqr_dist, vec_threshold, _CMP_LE_OS);
				if (mask == 0) {
					// no pixel of this group is still within the threshold
					break;
				}

				// Note: unlike SSE/AVX2 versions itercount is a packed byte vector,
				//       thus conversion packed dword -> byte is not needed.
				// _mm_movm_epi8 yields 0xFF (-1) in each selected byte, so
				// the subtraction increments exactly those counters.
				itercount = _mm_sub_epi8(itercount, _mm_movm_epi8(mask));

				Xre = Tre;
				Xim = Tim;

			} // for

			// Unaligned store: `data` is a plain byte buffer and is not
			// guaranteed to be 16-byte aligned.
			_mm_storeu_si128(ptr++, itercount);

			// advance Cre vector
			Cre = _mm512_add_ps(Cre, vec_dRe);
		}

		// advance Cim vector
		Cim = _mm512_add_ps(Cim, vec_dIm);
	}
}
 // Broadcast constructor: initializes the members val1 and val2 with a
 // 512-bit vector each, with `data` replicated into all float lanes.
 // With no argument every lane is 0.
 inline
 short_vec(const float data = 0) :
     val1(_mm512_set1_ps(data)),
     val2(_mm512_set1_ps(data))
 {}