LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void stream_vector_copy( const double* i_a,
                                                                double* io_c,
                                                                const int i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;

  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue */
  for ( ; l_n < l_trip_prolog; l_n++ ) {
    io_c[l_n] = i_a[l_n];
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    /* we need manual unrolling as the compiler otherwise generates too many dependencies */
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]),   _mm256_loadu_pd(&(i_a[l_n])) );
      _mm256_store_pd(  &(io_c[l_n+4]), _mm256_loadu_pd(&(i_a[l_n+4])) );
#else
      _mm256_stream_pd( &(io_c[l_n]),   _mm256_loadu_pd(&(i_a[l_n])) );
      _mm256_stream_pd( &(io_c[l_n+4]), _mm256_loadu_pd(&(i_a[l_n+4])) );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), _mm512_loadu_pd(&(i_a[l_n])) );
#else
      _mm512_stream_pd( &(io_c[l_n]), _mm512_loadu_pd(&(i_a[l_n])) );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream; l_n++ ) {
    io_c[l_n] = i_a[l_n];
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length; l_n++ ) {
    io_c[l_n] = i_a[l_n];
  }
}
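Both LIBXSMM stream kernels in this section (stream_vector_copy above and stream_update_helmholtz_no_h2 further below) call a stream_init helper that is not part of the excerpt. The following is only a minimal sketch of what such a helper has to compute, assuming the intent is a scalar prologue that runs until io_c reaches a 64-byte boundary and a bulk trip count that is a multiple of 8 doubles; the signature matches the call sites above, but the body is an illustration, not LIBXSMM's actual implementation.

/* Hypothetical sketch of stream_init (assumption, not the LIBXSMM code):
 * split [0, i_length) into a scalar prologue up to a 64-byte boundary of the
 * output and a bulk region whose trip count is a multiple of 8 doubles, so the
 * unrolled streaming-store loops above operate on aligned, in-bounds addresses. */
static void stream_init( const int i_length, const size_t i_addr_c,
                         int* o_trip_prolog, int* o_trip_stream ) {
  /* scalar iterations until the output pointer hits a 64-byte boundary */
  int l_prolog = (int)(((64 - (i_addr_c % 64)) % 64) / sizeof(double));
  int l_stream;
  if ( l_prolog > i_length ) l_prolog = i_length;
  /* bulk region: largest multiple of 8 iterations that fits after the prologue */
  l_stream = l_prolog + (((i_length - l_prolog) / 8) * 8);
  *o_trip_prolog = l_prolog;
  *o_trip_stream = l_stream;
}

With this split, the prologue and epilogue loops in the kernels handle the unaligned head and the remainder tail, while the vectorized bulk loop can safely use _mm256_stream_pd/_mm512_stream_pd, which require aligned destinations.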
#include <immintrin.h>

/* Compile-time probe: unconditionally uses AVX512 Foundation intrinsics and,
 * behind the corresponding feature macros, one intrinsic per optional AVX-512 subset. */
int main(int, char**argv) {
  /* AVX512 Foundation */
  __m512i i; __m512d d; __m512 f;
  __mmask16 m = ~1;
  i = _mm512_maskz_loadu_epi32(0, argv);
  d = _mm512_loadu_pd((double *)argv + 64);
  f = _mm512_loadu_ps((float *)argv + 128);
#ifdef __AVX512ER__ /* AVX512 Exponential and Reciprocal */
  f = _mm512_exp2a23_round_ps(f, 8);
#endif
#ifdef __AVX512CD__ /* AVX512 Conflict Detection */
  i = _mm512_maskz_conflict_epi32(m, i);
#endif
#ifdef __AVX512PF__ /* AVX512 Prefetch */
  _mm512_mask_prefetch_i64scatter_pd(argv, 0xf, i, 2, 2);
#endif
#ifdef __AVX512DQ__ /* AVX512 Doubleword and Quadword support */
  m = _mm512_movepi32_mask(i);
#endif
#ifdef __AVX512BW__ /* AVX512 Byte and Word support */
  i = _mm512_mask_loadu_epi8(i, m, argv - 8);
#endif
#ifdef __AVX512VL__ /* AVX512 Vector Length */
  __m256i i2 = _mm256_maskz_loadu_epi32(0, argv);
  _mm256_mask_storeu_epi32(argv + 1, m, i2);
#endif
#ifdef __AVX512IFMA__ /* AVX512 Integer Fused Multiply-Add */
  i = _mm512_madd52lo_epu64(i, i, i);
#endif
#ifdef __AVX512VBMI__ /* AVX512 Vector Byte Manipulation Instructions */
  i = _mm512_permutexvar_epi8(i, i);
#endif
  _mm512_mask_storeu_epi64(argv, m, i);
  _mm512_mask_storeu_ps(argv + 64, m, f);
  _mm512_mask_storeu_pd(argv + 128, m, d);
  return 0;
}
KFR_INTRINSIC f64avx512 read(cunaligned_t, csize_t<8>, const f64* ptr) { return _mm512_loadu_pd(ptr); }
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void stream_update_helmholtz_no_h2( const double* i_g1,
                                                                           const double* i_g2,
                                                                           const double* i_g3,
                                                                           const double* i_tm1,
                                                                           const double* i_tm2,
                                                                           const double* i_tm3,
                                                                           double* io_c,
                                                                           const double i_h1,
                                                                           const int i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;

  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue */
  for ( ; l_n < l_trip_prolog; l_n++ ) {
    io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    const __m256d vec_h1 = _mm256_broadcast_sd(&i_h1);
    /* we need manual unrolling as the compiler otherwise generates too many dependencies */
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
      __m256d vec_g1_1, vec_g2_1, vec_g3_1, vec_tm1_1, vec_tm2_1, vec_tm3_1;
      __m256d vec_g1_2, vec_g2_2, vec_g3_2, vec_tm1_2, vec_tm2_2, vec_tm3_2;

      vec_g1_1  = _mm256_loadu_pd(&(i_g1[l_n]));
      vec_tm1_1 = _mm256_loadu_pd(&(i_tm1[l_n]));
      vec_g1_2  = _mm256_loadu_pd(&(i_g1[l_n+4]));
      vec_tm1_2 = _mm256_loadu_pd(&(i_tm1[l_n+4]));
      vec_g1_1  = _mm256_mul_pd(vec_g1_1, vec_tm1_1);
      vec_g2_1  = _mm256_loadu_pd(&(i_g2[l_n]));
      vec_g1_2  = _mm256_mul_pd(vec_g1_2, vec_tm1_2);
      vec_g2_2  = _mm256_loadu_pd(&(i_g2[l_n+4]));
      vec_tm2_1 = _mm256_loadu_pd(&(i_tm2[l_n]));
      vec_g2_1  = _mm256_mul_pd(vec_g2_1, vec_tm2_1);
      vec_tm2_2 = _mm256_loadu_pd(&(i_tm2[l_n+4]));
      vec_g2_2  = _mm256_mul_pd(vec_g2_2, vec_tm2_2);
      vec_g3_1  = _mm256_loadu_pd(&(i_g3[l_n]));
      vec_tm3_1 = _mm256_loadu_pd(&(i_tm3[l_n]));
      vec_g3_2  = _mm256_loadu_pd(&(i_g3[l_n+4]));
      vec_tm3_2 = _mm256_loadu_pd(&(i_tm3[l_n+4]));
      vec_g3_1  = _mm256_mul_pd(vec_g3_1, vec_tm3_1);
      vec_g3_2  = _mm256_mul_pd(vec_g3_2, vec_tm3_2);
      vec_g1_1  = _mm256_add_pd(vec_g1_1, vec_g2_1);
      vec_g1_2  = _mm256_add_pd(vec_g1_2, vec_g2_2);
      vec_g1_1  = _mm256_add_pd(vec_g1_1, vec_g3_1);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) );
#endif
      vec_g1_2  = _mm256_add_pd(vec_g1_2, vec_g3_2);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    const __m512d vec_h1 = _mm512_broadcastsd_pd(_mm_load_sd(&i_h1));
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
      __m512d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3;

      vec_g1  = _mm512_loadu_pd(&(i_g1[l_n]));
      vec_tm1 = _mm512_loadu_pd(&(i_tm1[l_n]));
      vec_g1  = _mm512_mul_pd(vec_g1, vec_tm1);
      vec_g2  = _mm512_loadu_pd(&(i_g2[l_n]));
      vec_tm2 = _mm512_loadu_pd(&(i_tm2[l_n]));
      vec_g2  = _mm512_mul_pd(vec_g2, vec_tm2);
      vec_g3  = _mm512_loadu_pd(&(i_g3[l_n]));
      vec_tm3 = _mm512_loadu_pd(&(i_tm3[l_n]));
      vec_g3  = _mm512_mul_pd(vec_g3, vec_tm3);
      vec_g1  = _mm512_add_pd(vec_g1, vec_g2);
      vec_g1  = _mm512_add_pd(vec_g1, vec_g3);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#else
      _mm512_stream_pd( &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream; l_n++ ) {
    io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length; l_n++ ) {
    io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
}
/* unaligned load of 8 doubles into the surrounding wrapper's 512-bit member `val` */
inline void load(const double *data) { val = _mm512_loadu_pd(data); }