/**
 * Fill both vector halves from one contiguous float buffer.
 *
 * The first _mm512_load_ps reads starting at `data` and goes into val1,
 * the second starts 16 floats further on and goes into val2.
 *
 * NOTE(review): SHORTVEC_ASSERT_ALIGNED(data, 64) presumably checks that
 * `data` sits on a 64-byte boundary -- confirm against the macro definition.
 */
inline
 void load_aligned(const float *data)
 {
     SHORTVEC_ASSERT_ALIGNED(data, 64);

     const float *const lower_half = data;
     const float *const upper_half = data + 16;

     val1 = _mm512_load_ps(lower_half);
     val2 = _mm512_load_ps(upper_half);
 }
/*
 * zmatmul - accumulate a 4x4 single-precision matrix product: cout += ain * bin.
 *
 * Each operand is 16 floats stored in Z-order (Morton order): element
 * (row, col) lives at index
 *
 *     8*(row >> 1) + 4*(col >> 1) + 2*(row & 1) + (col & 1)
 *
 * i.e. index bits 3 and 1 carry the row, bits 2 and 0 carry the column.
 *
 * ain, bin : input matrices (read only).
 * cout     : accumulator; the product is added to the values already there.
 *
 * With __MIC__ defined the pointers go straight to _mm512_load_ps /
 * _mm512_store_ps, which need 64-byte aligned addresses. cout should not
 * overlap ain or bin: the vector path reads all inputs before storing, while
 * the scalar path stores element by element, so the results would differ.
 *
 * Declared `static inline` rather than plain `inline`: in C99/C11 a plain
 * inline definition provides no external definition, so any call the
 * compiler chooses not to inline would fail at link time.
 */
static inline void zmatmul(float *ain, float *bin, float *cout) {
#ifdef __MIC__
  __m512 a,b,c;
  __m512 a0,a1,a2,a3;
  __m512 b0,b1,b2,b3;

  /* paK[i] / pbK[i] select the ain / bin element that forms the K-th
   * product term of output lane i (K = 0..3 walks the contraction index). */
  __m512i pa0={ 0, 0, 2, 2, 0, 0, 2, 2, 8, 8,10,10, 8, 8,10,10};
  __m512i pa1={ 1, 1, 3, 3, 1, 1, 3, 3, 9, 9,11,11, 9, 9,11,11};
  __m512i pa2={ 4, 4, 6, 6, 4, 4, 6, 6,12,12,14,14,12,12,14,14};
  __m512i pa3={ 5, 5, 7, 7, 5, 5, 7, 7,13,13,15,15,13,13,15,15};

  __m512i pb0={ 0, 1, 0, 1, 4, 5, 4, 5, 0, 1, 0, 1, 4, 5, 4, 5};
  __m512i pb1={ 2, 3, 2, 3, 6, 7, 6, 7, 2, 3, 2, 3, 6, 7, 6, 7};
  __m512i pb2={ 8, 9, 8, 9,12,13,12,13, 8, 9, 8, 9,12,13,12,13};
  __m512i pb3={10,11,10,11,14,15,14,15,10,11,10,11,14,15,14,15};

  a=_mm512_load_ps(ain);
  b=_mm512_load_ps(bin);
  c=_mm512_load_ps(cout);

  /* Gather the four A operands; the permute is an integer instruction,
   * hence the bit-preserving casts around it. */
  a0=_mm512_castsi512_ps(_mm512_permutevar_epi32(pa0,_mm512_castps_si512(a)));
  a1=_mm512_castsi512_ps(_mm512_permutevar_epi32(pa1,_mm512_castps_si512(a)));
  a2=_mm512_castsi512_ps(_mm512_permutevar_epi32(pa2,_mm512_castps_si512(a)));
  a3=_mm512_castsi512_ps(_mm512_permutevar_epi32(pa3,_mm512_castps_si512(a)));

  /* Gather the matching B operands. */
  b0=_mm512_castsi512_ps(_mm512_permutevar_epi32(pb0,_mm512_castps_si512(b)));
  b1=_mm512_castsi512_ps(_mm512_permutevar_epi32(pb1,_mm512_castps_si512(b)));
  b2=_mm512_castsi512_ps(_mm512_permutevar_epi32(pb2,_mm512_castps_si512(b)));
  b3=_mm512_castsi512_ps(_mm512_permutevar_epi32(pb3,_mm512_castps_si512(b)));

  /* c += a0*b0 + a1*b1 + a2*b2 + a3*b3, one fused multiply-add per term. */
  c=_mm512_fmadd_ps(a0,b0,c);
  c=_mm512_fmadd_ps(a1,b1,c);
  c=_mm512_fmadd_ps(a2,b2,c);
  c=_mm512_fmadd_ps(a3,b3,c);

  _mm512_store_ps(cout,c);
#else
  /*
   * Scalar fallback, one output element per iteration.
   *   i & 10 keeps the row bits    -> first element of that row in ain
   *   i &  5 keeps the column bits -> first element of that column in bin
   * Stepping the contraction index 0..3 moves by {0,1,4,5} inside ain
   * and by {0,2,8,10} inside bin.
   */
  for (int i = 0; i < 16; i++) {
    const int ar = i & 10;
    const int bc = i & 5;

    cout[i] += ain[ar]     * bin[bc]     + ain[ar + 1] * bin[bc + 2]
             + ain[ar + 4] * bin[bc + 8] + ain[ar + 5] * bin[bc + 10];
  }
#endif

}
Example #3
0
/*
 * Benchmark driver for an all-pairs (N x N) softened potential/acceleration
 * kernel, timed in four variants:
 *   1. targets vectorized with 512-bit intrinsics (16 i per vector),
 *   2. sources vectorized with 512-bit intrinsics (16 j per vector),
 *   3. `#pragma simd` placed on the target (outer) loop,
 *   4. `#pragma simd` placed on the source (inner) loop.
 * Each variant writes p/ax/ay/az and prints its wall time and a GFlops
 * figure that assumes 20 flops per pair interaction.
 *
 * Returns 0 on success, 1 if a work array could not be allocated.
 */
int main() {
    // Initialize
    int N = 1 << 16;                 // bodies; the intrinsic loops step by 16, so keep it a multiple of 16
    int NALIGN = 64;                 // alignment in bytes requested from _mm_malloc for the aligned vector loads/stores
    int i, j;
    float OPS = 20. * N * N * 1e-9;  // 20 flops per pair, scaled to 1e9 flops
    float EPS2 = 1e-6;               // softening term added to r^2 (keeps the i == j term finite)
    double tic, toc;
    float * x = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * y = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * z = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * m = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * p = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * ax = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * ay = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * az = (float*) _mm_malloc(N * sizeof(float), NALIGN);

    // Every kernel below dereferences all eight arrays, so give up early
    // (releasing whatever was obtained) if any allocation failed.
    if (!x || !y || !z || !m || !p || !ax || !ay || !az) {
        float * bufs[8] = { x, y, z, m, p, ax, ay, az };
        fprintf(stderr, "Error: failed to allocate work arrays for N = %d\n", N);
        for (i=0; i<8; i++) {
            if (bufs[i]) _mm_free(bufs[i]);
        }
        return 1;
    }

    // drand48() updates hidden global state and is not required to be
    // thread-safe, so the arrays are filled serially rather than inside an
    // OpenMP parallel loop.
    for (i=0; i<N; i++) {
        x[i] = drand48();
        y[i] = drand48();
        z[i] = drand48();
        m[i] = drand48() / N;
        p[i] = ax[i] = ay[i] = az[i] = 0;
    }
    printf("N : %d\n",N);

    // One parallel region for all four kernels. `omp single` blocks take the
    // timestamps; the implicit barriers at the end of each `omp for` and
    // `omp single` keep the timed sections separated.
    #pragma omp parallel private(j)
    {
        #pragma omp single
        tic = get_time();
        // Vectorize target with intrinsics
        // Each iteration handles targets i..i+15 in the vector lanes and
        // broadcasts one source j at a time.
        #pragma omp for
        for (i=0; i<N; i+=16) {
            __m512 pi = _mm512_setzero_ps();
            __m512 axi = _mm512_setzero_ps();
            __m512 ayi = _mm512_setzero_ps();
            __m512 azi = _mm512_setzero_ps();
            __m512 xi = _mm512_load_ps(x+i);
            __m512 yi = _mm512_load_ps(y+i);
            __m512 zi = _mm512_load_ps(z+i);
            for (j=0; j<N; j++) {
                // Separation vector (source minus target), reusing xj/yj/zj.
                __m512 xj = _mm512_set1_ps(x[j]);
                xj = _mm512_sub_ps(xj, xi);
                __m512 yj = _mm512_set1_ps(y[j]);
                yj = _mm512_sub_ps(yj, yi);
                __m512 zj = _mm512_set1_ps(z[j]);
                zj = _mm512_sub_ps(zj, zi);
                // R2 = dx^2 + dy^2 + dz^2 + EPS2
                __m512 R2 = _mm512_set1_ps(EPS2);
                R2 = _mm512_fmadd_ps(xj, xj, R2);
                R2 = _mm512_fmadd_ps(yj, yj, R2);
                R2 = _mm512_fmadd_ps(zj, zj, R2);
                __m512 mj = _mm512_set1_ps(m[j]);
                __m512 invR = _mm512_rsqrt23_ps(R2);
                mj = _mm512_mul_ps(mj, invR);        // mj   = m / R   (potential term)
                pi = _mm512_add_ps(pi, mj);
                invR = _mm512_mul_ps(invR, invR);    // invR = 1 / R^2
                invR = _mm512_mul_ps(invR, mj);      // invR = m / R^3
                axi = _mm512_fmadd_ps(xj, invR, axi);
                ayi = _mm512_fmadd_ps(yj, invR, ayi);
                azi = _mm512_fmadd_ps(zj, invR, azi);
            }
            _mm512_store_ps(p+i, pi);
            _mm512_store_ps(ax+i, axi);
            _mm512_store_ps(ay+i, ayi);
            _mm512_store_ps(az+i, azi);
        }
        #pragma omp single
        {
            toc = get_time();
            printf("Vectorize target with intrinsics : %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic));

            // Vectorize source with intrinsics
            tic = get_time();
        }
        // One target i per iteration, broadcast to all lanes; sources
        // j..j+15 fill the lanes and the partial sums are reduced at the end.
        #pragma omp for
        for (i=0; i<N; i++) {
            __m512 pi = _mm512_setzero_ps();
            __m512 axi = _mm512_setzero_ps();
            __m512 ayi = _mm512_setzero_ps();
            __m512 azi = _mm512_setzero_ps();
            __m512 xi = _mm512_set1_ps(x[i]);
            __m512 yi = _mm512_set1_ps(y[i]);
            __m512 zi = _mm512_set1_ps(z[i]);
            for (j=0; j<N; j+=16) {
                __m512 xj = _mm512_load_ps(x+j);
                xj = _mm512_sub_ps(xj, xi);
                __m512 yj = _mm512_load_ps(y+j);
                yj = _mm512_sub_ps(yj, yi);
                __m512 zj = _mm512_load_ps(z+j);
                zj = _mm512_sub_ps(zj, zi);
                __m512 R2 = _mm512_set1_ps(EPS2);
                R2 = _mm512_fmadd_ps(xj, xj, R2);
                R2 = _mm512_fmadd_ps(yj, yj, R2);
                R2 = _mm512_fmadd_ps(zj, zj, R2);
                __m512 mj = _mm512_load_ps(m+j);
                __m512 invR = _mm512_rsqrt23_ps(R2);
                mj = _mm512_mul_ps(mj, invR);        // mj   = m / R
                pi = _mm512_add_ps(pi, mj);
                invR = _mm512_mul_ps(invR, invR);    // invR = 1 / R^2
                invR = _mm512_mul_ps(invR, mj);      // invR = m / R^3
                axi = _mm512_fmadd_ps(xj, invR, axi);
                ayi = _mm512_fmadd_ps(yj, invR, ayi);
                azi = _mm512_fmadd_ps(zj, invR, azi);
            }
            // Horizontal sums collapse the 16 per-lane partials.
            p[i] = _mm512_reduce_add_ps(pi);
            ax[i] = _mm512_reduce_add_ps(axi);
            ay[i] = _mm512_reduce_add_ps(ayi);
            az[i] = _mm512_reduce_add_ps(azi);
        }
        #pragma omp single
        {
            toc = get_time();
            printf("Vectorize source with intrinsics : %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic));

            // Vectorize target with pragma simd
            tic = get_time();
        }
        // Plain scalar code; the simd pragma is attached to the outer
        // (target) loop.
#pragma simd
        #pragma omp for
        for (i=0; i<N; i++) {
            float pi = 0;
            float axi = 0;
            float ayi = 0;
            float azi = 0;
            float xi = x[i];
            float yi = y[i];
            float zi = z[i];
            for (j=0; j<N; j++) {
                float dx = x[j] - xi;
                float dy = y[j] - yi;
                float dz = z[j] - zi;
                float R2 = dx * dx + dy * dy + dz * dz + EPS2;
                float invR = 1.0f / sqrtf(R2);
                float invR3 = m[j] * invR * invR * invR;
                pi += m[j] * invR;
                axi += dx * invR3;
                ayi += dy * invR3;
                azi += dz * invR3;
            }
            p[i] = pi;
            ax[i] = axi;
            ay[i] = ayi;
            az[i] = azi;
        }
        #pragma omp single
        {
            toc = get_time();
            printf("Vectorize target with pragma simd: %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic));

            // Vectorize source with pragma simd
            tic = get_time();
        }
        // Same scalar code; here the simd pragma is attached to the inner
        // (source) loop.
        #pragma omp for
        for (i=0; i<N; i++) {
            float pi = 0;
            float axi = 0;
            float ayi = 0;
            float azi = 0;
            float xi = x[i];
            float yi = y[i];
            float zi = z[i];
#pragma simd
            for (j=0; j<N; j++) {
                float dx = x[j] - xi;
                float dy = y[j] - yi;
                float dz = z[j] - zi;
                float R2 = dx * dx + dy * dy + dz * dz + EPS2;
                float invR = 1.0f / sqrtf(R2);
                float invR3 = m[j] * invR * invR * invR;
                pi += m[j] * invR;
                axi += dx * invR3;
                ayi += dy * invR3;
                azi += dz * invR3;
            }
            p[i] = pi;
            ax[i] = axi;
            ay[i] = ayi;
            az[i] = azi;
        }
        #pragma omp single
        {
            toc = get_time();
            printf("Vectorize source with pragma simd: %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic));
        }
    }

    _mm_free(x);
    _mm_free(y);
    _mm_free(z);
    _mm_free(m);
    _mm_free(p);
    _mm_free(ax);
    _mm_free(ay);
    _mm_free(az);
    return 0;
}
Example #4
0
	/**
	 * Vector load wrapper: passes p to _mm512_load_ps and returns the
	 * loaded value.
	 * NOTE(review): mic_m512_t / real_t are declared elsewhere; presumably
	 * they alias __m512 / float -- confirm. _mm512_load_ps expects a
	 * 64-byte aligned address.
	 */
	static inline mic_m512_t mic_load_rps(real_t* p) {
		mic_m512_t loaded = _mm512_load_ps(p);
		return loaded;
	} // mic_load_rps()