Example #1
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE
void stream_vector_copy( const double* i_a,
                         double*       io_c,
                         const int     i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;
  
  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue */
  for ( ; l_n < l_trip_prolog;  l_n++ ) {
    io_c[l_n] = i_a[l_n];
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    /* we need manual unrolling as the compiler otherwise generates 
       too many dependencies */
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]),   _mm256_loadu_pd(&(i_a[l_n]))   );
      _mm256_store_pd(  &(io_c[l_n+4]), _mm256_loadu_pd(&(i_a[l_n+4])) );
#else
      _mm256_stream_pd( &(io_c[l_n]),   _mm256_loadu_pd(&(i_a[l_n]))   );
      _mm256_stream_pd( &(io_c[l_n+4]), _mm256_loadu_pd(&(i_a[l_n+4])) );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]),   _mm512_loadu_pd(&(i_a[l_n]))   );
#else
      _mm512_stream_pd( &(io_c[l_n]),   _mm512_loadu_pd(&(i_a[l_n]))   );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream;  l_n++ ) {
    io_c[l_n] = i_a[l_n];
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length;  l_n++ ) {
    io_c[l_n] = i_a[l_n];
  }
}
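
The function above relies on stream_init (defined elsewhere in the sample) to split the range so that the bulk loop covers a multiple of 8 doubles and starts at an address where the streaming stores are aligned. A minimal sketch of that assumed contract follows; the name stream_init_sketch, the 64-byte alignment target, and the clamping are assumptions, not the sample's actual code.

#include <stddef.h>

/* hypothetical sketch: peel io_c up to the next 64-byte boundary, then make
   the bulk trip count a multiple of 8 doubles so that _mm256_stream_pd /
   _mm512_stream_pd in the example store to aligned addresses */
static void stream_init_sketch( int i_length, size_t i_addr_c,
                                int* o_trip_prolog, int* o_trip_stream ) {
  /* doubles to peel until the store address is 64-byte aligned
     (assumes io_c itself is at least 8-byte aligned) */
  int l_prolog = (int)(((64 - (i_addr_c & 63)) & 63) / sizeof(double));
  int l_stream;
  if ( l_prolog > i_length ) l_prolog = i_length;
  /* the bulk ends after the last full group of 8 doubles */
  l_stream = l_prolog + ( ( i_length - l_prolog ) / 8 ) * 8;
  *o_trip_prolog = l_prolog;
  *o_trip_stream = l_stream;
}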
Example #2
/* exercises one intrinsic from each AVX-512 subset; typically built as a
   compile-time probe for the corresponding __AVX512*__ macros */
#include <immintrin.h>

int main(int, char**argv)
{
    /* AVX512 Foundation */
    __m512i i;
    __m512d d;
    __m512 f;
    __mmask16 m = ~1;
    i = _mm512_maskz_loadu_epi32(0, argv);
    d = _mm512_loadu_pd((double *)argv + 64);
    f = _mm512_loadu_ps((float *)argv + 128);

#ifdef __AVX512ER__
    /* AVX512 Exponential and Reciprocal */
    f =  _mm512_exp2a23_round_ps(f, 8);
#endif
#ifdef __AVX512CD__
    /* AVX512 Conflict Detection */
    i = _mm512_maskz_conflict_epi32(m, i);
#endif
#ifdef __AVX512PF__
    /* AVX512 Prefetch */
    _mm512_mask_prefetch_i64scatter_pd(argv, 0xf, i, 2, 2);
#endif
#ifdef __AVX512DQ__
    /* AVX512 Doubleword and Quadword support */
    m = _mm512_movepi32_mask(i);
#endif
#ifdef __AVX512BW__
    /* AVX512 Byte and Word support */
    i =  _mm512_mask_loadu_epi8(i, m, argv - 8);
#endif
#ifdef __AVX512VL__
    /* AVX512 Vector Length */
    __m256i i2 = _mm256_maskz_loadu_epi32(0, argv);
    _mm256_mask_storeu_epi32(argv + 1, m, i2);
#endif
#ifdef __AVX512IFMA__
    /* AVX512 Integer Fused Multiply-Add */
    i = _mm512_madd52lo_epu64(i, i, i);
#endif
#ifdef __AVX512VBMI__
    /* AVX512 Vector Byte Manipulation Instructions */
    i = _mm512_permutexvar_epi8(i, i);
#endif

    _mm512_mask_storeu_epi64(argv, m, i);
    _mm512_mask_storeu_ps(argv + 64, m, f);
    _mm512_mask_storeu_pd(argv + 128, m, d);
    return 0;
}
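
The #ifdef branches above only reflect compile-time instruction-set flags. As a complement (not part of the test above), here is a short sketch of a run-time check using the GCC/Clang builtin __builtin_cpu_supports; the builtin and the "avx512f" feature name exist in those compilers, everything else in the sketch is illustrative.

#include <cstdio>

int main()
{
    /* CPUID-based run-time check, independent of the __AVX512*__ macros */
    if (__builtin_cpu_supports("avx512f"))
        std::puts("AVX-512 Foundation available at run time");
    else
        std::puts("AVX-512 Foundation not available");
    return 0;
}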
Example #3
KFR_INTRINSIC f64avx512 read(cunaligned_t, csize_t<8>, const f64* ptr) { return _mm512_loadu_pd(ptr); }
Example #4
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE
void stream_update_helmholtz_no_h2( const double* i_g1,
                                    const double* i_g2,
                                    const double* i_g3, 
                                    const double* i_tm1,
                                    const double* i_tm2,
                                    const double* i_tm3,
                                    double*       io_c,
                                    const double  i_h1,
                                    const int     i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;
  
  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );
  
  /* run the prologue */
  for ( ; l_n < l_trip_prolog;  l_n++ ) {
    io_c[l_n] =   i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    const __m256d vec_h1 = _mm256_broadcast_sd(&i_h1);
    /* we need manual unrolling as the compiler otherwise generates 
       too many dependencies */
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
      __m256d vec_g1_1, vec_g2_1, vec_g3_1, vec_tm1_1, vec_tm2_1, vec_tm3_1;
      __m256d vec_g1_2, vec_g2_2, vec_g3_2, vec_tm1_2, vec_tm2_2, vec_tm3_2;

      vec_g1_1 = _mm256_loadu_pd(&(i_g1[l_n]));
      vec_tm1_1 = _mm256_loadu_pd(&(i_tm1[l_n]));
      vec_g1_2 = _mm256_loadu_pd(&(i_g1[l_n+4]));
      vec_tm1_2 = _mm256_loadu_pd(&(i_tm1[l_n+4]));

      vec_g1_1 = _mm256_mul_pd(vec_g1_1, vec_tm1_1);
      vec_g2_1 = _mm256_loadu_pd(&(i_g2[l_n]));
      vec_g1_2 = _mm256_mul_pd(vec_g1_2, vec_tm1_2);
      vec_g2_2 = _mm256_loadu_pd(&(i_g2[l_n+4]));

      vec_tm2_1 = _mm256_loadu_pd(&(i_tm2[l_n]));
      vec_g2_1 = _mm256_mul_pd(vec_g2_1, vec_tm2_1);
      vec_tm2_2 = _mm256_loadu_pd(&(i_tm2[l_n+4]));
      vec_g2_2 = _mm256_mul_pd(vec_g2_2, vec_tm2_2);

      vec_g3_1 = _mm256_loadu_pd(&(i_g3[l_n]));
      vec_tm3_1 = _mm256_loadu_pd(&(i_tm3[l_n]));
      vec_g3_2 = _mm256_loadu_pd(&(i_g3[l_n+4]));
      vec_tm3_2 = _mm256_loadu_pd(&(i_tm3[l_n+4]));

      vec_g3_1 = _mm256_mul_pd(vec_g3_1, vec_tm3_1);
      vec_g3_2 = _mm256_mul_pd(vec_g3_2, vec_tm3_2);
      vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g2_1);
      vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g2_2);

      vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g3_1);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) );
#endif
      vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g3_2);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    const __m512d vec_h1 = _mm512_broadcastsd_pd(_mm_load_sd(&i_h1));
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
      __m512d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3;
      vec_g1 = _mm512_loadu_pd(&(i_g1[l_n]));
      vec_tm1 = _mm512_loadu_pd(&(i_tm1[l_n]));
      vec_g1 = _mm512_mul_pd(vec_g1, vec_tm1);
      vec_g2 = _mm512_loadu_pd(&(i_g2[l_n]));
      vec_tm2 = _mm512_loadu_pd(&(i_tm2[l_n]));
      vec_g2 = _mm512_mul_pd(vec_g2, vec_tm2);
      vec_g3 = _mm512_loadu_pd(&(i_g3[l_n]));
      vec_tm3 = _mm512_loadu_pd(&(i_tm3[l_n]));
      vec_g3 = _mm512_mul_pd(vec_g3, vec_tm3);
      vec_g1 = _mm512_add_pd(vec_g1, vec_g2);
      vec_g1 = _mm512_add_pd(vec_g1, vec_g3);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#else
      _mm512_stream_pd( &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream;  l_n++ ) {
    io_c[l_n] =   i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length;  l_n++ ) {
    io_c[l_n] =   i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
}
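
In the AVX-512 branch above the multiplies and adds are kept as separate instructions. A variant of the bulk loop using fused multiply-add is sketched below; it assumes FMA contraction (which rounds slightly differently) is acceptable, and the function name helmholtz_bulk_fma is an illustration, not code from the sample.

#include <immintrin.h>

/* hypothetical FMA variant of the aligned bulk portion; assumes, as in the
   example, that l_trip_stream - l_n is a multiple of 8 and io_c is 64-byte
   aligned at l_n */
static void helmholtz_bulk_fma( const double* i_g1, const double* i_g2,
                                const double* i_g3, const double* i_tm1,
                                const double* i_tm2, const double* i_tm3,
                                double* io_c, const double i_h1,
                                int l_n, const int l_trip_stream ) {
  const __m512d vec_h1 = _mm512_set1_pd(i_h1);
  for ( ; l_n < l_trip_stream; l_n += 8 ) {
    __m512d vec_sum = _mm512_mul_pd( _mm512_loadu_pd(&i_g1[l_n]),
                                     _mm512_loadu_pd(&i_tm1[l_n]) );
    vec_sum = _mm512_fmadd_pd( _mm512_loadu_pd(&i_g2[l_n]),
                               _mm512_loadu_pd(&i_tm2[l_n]), vec_sum );
    vec_sum = _mm512_fmadd_pd( _mm512_loadu_pd(&i_g3[l_n]),
                               _mm512_loadu_pd(&i_tm3[l_n]), vec_sum );
    _mm512_stream_pd( &io_c[l_n], _mm512_mul_pd(vec_sum, vec_h1) );
  }
}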
Example #5

/* member function of a SIMD vector wrapper (the enclosing class is not shown
   here); loads eight doubles from an unaligned address into the __m512d
   member val */
inline
void load(const double *data)
{
    val = _mm512_loadu_pd(data);
}
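
The snippet above is a member function without its enclosing class. Below is a self-contained sketch of the kind of wrapper it presumably belongs to; the type name vec8d and the store() counterpart are assumptions, not code from the original source.

#include <immintrin.h>

/* hypothetical wrapper: one AVX-512 register holding eight doubles */
struct vec8d
{
    __m512d val;

    inline void load(const double *data)  { val = _mm512_loadu_pd(data); }
    inline void store(double *data) const { _mm512_storeu_pd(data, val); }
};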