LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void stream_vector_set( const double i_scalar,
                                                               double*      io_c,
                                                               const int    i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;

  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue */
  for ( ; l_n < l_trip_prolog; l_n++ ) {
    io_c[l_n] = i_scalar;
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    /* we need manual unrolling as the compiler otherwise generates too many dependencies */
    const __m256d vec_scalar = _mm256_broadcast_sd(&i_scalar);
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]),   vec_scalar );
      _mm256_store_pd(  &(io_c[l_n+4]), vec_scalar );
#else
      _mm256_stream_pd( &(io_c[l_n]),   vec_scalar );
      _mm256_stream_pd( &(io_c[l_n+4]), vec_scalar );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    const __m512d vec_scalar = _mm512_broadcastsd_pd(_mm_load_sd(&i_scalar));
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), vec_scalar );
#else
      _mm512_stream_pd( &(io_c[l_n]), vec_scalar );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream; l_n++ ) {
    io_c[l_n] = i_scalar;
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length; l_n++ ) {
    io_c[l_n] = i_scalar;
  }
}
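The stream_* kernels in this section all call a stream_init helper that is not shown. A minimal sketch of what it is assumed to compute follows; the 64-byte alignment target and the unroll factor of 8 are assumptions inferred from the loops above, not the original implementation.

#include <stddef.h>

/* Sketch of the assumed stream_init contract (not the original code):
 * - the prologue covers scalar iterations until the output pointer reaches
 *   64-byte (cache line) alignment,
 * - the bulk bound is the prologue plus a multiple of the 8-element unroll,
 * - everything past the bulk bound falls into the epilogue loop.
 * Assumes the output pointer is at least sizeof(double)-aligned. */
static void stream_init( int i_length, size_t i_start_address,
                         int* o_trip_prolog, int* o_trip_stream ) {
  /* elements until the destination address is a multiple of 64 bytes */
  *o_trip_prolog = (int)(((64u - (i_start_address % 64u)) % 64u) / sizeof(double));
  if ( *o_trip_prolog > i_length ) {
    *o_trip_prolog = i_length;
  }
  /* bulk bound: prologue plus the largest multiple of 8 that still fits */
  *o_trip_stream = *o_trip_prolog + (((i_length - *o_trip_prolog) / 8) * 8);
}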
inline void kernel(adouble* v1, adouble* v2, int m) {
  __m256d alpha = _mm256_set1_pd(0.25);
  /* gather the four stencil neighbours (east, west, north, south) */
  __m256d phi_e = _mm256_loadu_pd(v1 + 1);
  __m256d phi_w = _mm256_loadu_pd(v1 - 1);
  __m256d phi_n = _mm256_loadu_pd(v1 + m);
  __m256d phi_s = _mm256_loadu_pd(v1 - m);
  /* Jacobi update: 0.25 * (east + south + north + west) */
  phi_e = _mm256_add_pd(phi_e, phi_s);
  phi_e = _mm256_add_pd(phi_e, phi_n);
  //phi_e = _mm_fmadd_pd(alpha, phi_e, phi_w);
  phi_e = _mm256_add_pd(phi_e, phi_w);
  phi_e = _mm256_mul_pd(alpha, phi_e);
  /* non-temporal (streaming) store of the result; v2 must be 32-byte aligned */
  //printf("-> p = %p\n", &v2[0]);
  _mm256_stream_pd(v2, phi_e);
}
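The kernel above updates four grid points per call. A hypothetical driver for a full sweep might look like the sketch below; it assumes adouble is a plain typedef for double, that the leading dimension m is a multiple of 4, and that the output grid is 32-byte aligned so the streaming store inside kernel() stays aligned. The fence is needed because non-temporal stores are weakly ordered.

/* Hypothetical sweep over an m x n grid, four points per kernel() call.
 * Assumes: adouble == double, m % 4 == 0, v_out is 32-byte aligned. */
static void sweep(adouble* v_in, adouble* v_out, int m, int n) {
  for (int j = 1; j < n - 1; ++j) {            /* skip boundary rows */
    for (int i = 4; i + 4 <= m - 1; i += 4) {  /* skip boundary columns, keep 32B alignment */
      const int idx = j * m + i;
      kernel(v_in + idx, v_out + idx, m);
    }
  }
  _mm_sfence(); /* streaming stores are weakly ordered; fence before the data is read */
}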
/*!
 * \brief Non-temporal, aligned, store of the given packed vector at the
 * given memory position
 */
ETL_STATIC_INLINE(void) stream(etl::complex<double>* memory, avx_simd_complex_double<etl::complex<double>> value) {
    _mm256_stream_pd(reinterpret_cast<double*>(memory), value.value);
}
/*!
 * \brief Non-temporal, aligned, store of the given packed vector at the
 * given memory position
 */
ETL_STATIC_INLINE(void) stream(double* memory, avx_simd_double value) {
    _mm256_stream_pd(memory, value.value);
}
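A hypothetical caller of the wrapper above, as a C++ sketch. It assumes avx_simd_double is a trivial struct whose value member holds the raw __m256d (consistent with value.value in the snippet), that stream() is callable as shown, and that memory is 32-byte aligned; fill_streamed is a made-up name.

// Hypothetical usage sketch: fill an aligned buffer through the
// non-temporal store wrapper, then handle the scalar tail.
#include <immintrin.h>
#include <cstddef>

inline void fill_streamed(double* memory, double s, std::size_t n) {
    avx_simd_double v{_mm256_set1_pd(s)};   // assumption: constructible from a raw __m256d
    std::size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        stream(memory + i, v);              // store bypasses the cache hierarchy
    }
    _mm_sfence();                           // make the streamed stores globally visible
    for (; i < n; ++i) {
        memory[i] = s;                      // scalar tail for the remainder
    }
}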
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void stream_update_helmholtz_no_h2( const double* i_g1,
                                                                           const double* i_g2,
                                                                           const double* i_g3,
                                                                           const double* i_tm1,
                                                                           const double* i_tm2,
                                                                           const double* i_tm3,
                                                                           double*       io_c,
                                                                           const double  i_h1,
                                                                           const int     i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;

  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue */
  for ( ; l_n < l_trip_prolog; l_n++ ) {
    io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    const __m256d vec_h1 = _mm256_broadcast_sd(&i_h1);
    /* we need manual unrolling as the compiler otherwise generates too many dependencies */
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
      __m256d vec_g1_1, vec_g2_1, vec_g3_1, vec_tm1_1, vec_tm2_1, vec_tm3_1;
      __m256d vec_g1_2, vec_g2_2, vec_g3_2, vec_tm1_2, vec_tm2_2, vec_tm3_2;

      vec_g1_1  = _mm256_loadu_pd(&(i_g1[l_n]));
      vec_tm1_1 = _mm256_loadu_pd(&(i_tm1[l_n]));
      vec_g1_2  = _mm256_loadu_pd(&(i_g1[l_n+4]));
      vec_tm1_2 = _mm256_loadu_pd(&(i_tm1[l_n+4]));
      vec_g1_1  = _mm256_mul_pd(vec_g1_1, vec_tm1_1);
      vec_g2_1  = _mm256_loadu_pd(&(i_g2[l_n]));
      vec_g1_2  = _mm256_mul_pd(vec_g1_2, vec_tm1_2);
      vec_g2_2  = _mm256_loadu_pd(&(i_g2[l_n+4]));
      vec_tm2_1 = _mm256_loadu_pd(&(i_tm2[l_n]));
      vec_g2_1  = _mm256_mul_pd(vec_g2_1, vec_tm2_1);
      vec_tm2_2 = _mm256_loadu_pd(&(i_tm2[l_n+4]));
      vec_g2_2  = _mm256_mul_pd(vec_g2_2, vec_tm2_2);
      vec_g3_1  = _mm256_loadu_pd(&(i_g3[l_n]));
      vec_tm3_1 = _mm256_loadu_pd(&(i_tm3[l_n]));
      vec_g3_2  = _mm256_loadu_pd(&(i_g3[l_n+4]));
      vec_tm3_2 = _mm256_loadu_pd(&(i_tm3[l_n+4]));
      vec_g3_1  = _mm256_mul_pd(vec_g3_1, vec_tm3_1);
      vec_g3_2  = _mm256_mul_pd(vec_g3_2, vec_tm3_2);
      vec_g1_1  = _mm256_add_pd(vec_g1_1, vec_g2_1);
      vec_g1_2  = _mm256_add_pd(vec_g1_2, vec_g2_2);
      vec_g1_1  = _mm256_add_pd(vec_g1_1, vec_g3_1);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]),   _mm256_mul_pd(vec_g1_1, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n]),   _mm256_mul_pd(vec_g1_1, vec_h1) );
#endif
      vec_g1_2  = _mm256_add_pd(vec_g1_2, vec_g3_2);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    const __m512d vec_h1 = _mm512_broadcastsd_pd(_mm_load_sd(&i_h1));
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
      __m512d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3;

      vec_g1  = _mm512_loadu_pd(&(i_g1[l_n]));
      vec_tm1 = _mm512_loadu_pd(&(i_tm1[l_n]));
      vec_g1  = _mm512_mul_pd(vec_g1, vec_tm1);
      vec_g2  = _mm512_loadu_pd(&(i_g2[l_n]));
      vec_tm2 = _mm512_loadu_pd(&(i_tm2[l_n]));
      vec_g2  = _mm512_mul_pd(vec_g2, vec_tm2);
      vec_g3  = _mm512_loadu_pd(&(i_g3[l_n]));
      vec_tm3 = _mm512_loadu_pd(&(i_tm3[l_n]));
      vec_g3  = _mm512_mul_pd(vec_g3, vec_tm3);
      vec_g1  = _mm512_add_pd(vec_g1, vec_g2);
      vec_g1  = _mm512_add_pd(vec_g1, vec_g3);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#else
      _mm512_stream_pd( &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream; l_n++ ) {
    io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length; l_n++ ) {
    io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
}
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void stream_vector_compscale( const double* i_a,
                                                                    const double* i_b,
                                                                    double*       io_c,
                                                                    const int     i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;

  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue */
  for ( ; l_n < l_trip_prolog; l_n++ ) {
    io_c[l_n] = i_a[l_n]*i_b[l_n];
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    /* we need manual unrolling as the compiler otherwise generates too many dependencies */
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
      __m256d vec_a_1, vec_b_1;
      __m256d vec_a_2, vec_b_2;

      vec_a_1 = _mm256_loadu_pd(&(i_a[l_n]));
      vec_a_2 = _mm256_loadu_pd(&(i_a[l_n+4]));
      vec_b_1 = _mm256_loadu_pd(&(i_b[l_n]));
      vec_b_2 = _mm256_loadu_pd(&(i_b[l_n+4]));
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]),   _mm256_mul_pd( vec_a_1, vec_b_1 ) );
      _mm256_store_pd(  &(io_c[l_n+4]), _mm256_mul_pd( vec_a_2, vec_b_2 ) );
#else
      _mm256_stream_pd( &(io_c[l_n]),   _mm256_mul_pd( vec_a_1, vec_b_1 ) );
      _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd( vec_a_2, vec_b_2 ) );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
      __m512d vec_a, vec_b;

      vec_a = _mm512_loadu_pd(&(i_a[l_n]));
      vec_b = _mm512_loadu_pd(&(i_b[l_n]));
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), _mm512_mul_pd( vec_a, vec_b ) );
#else
      _mm512_stream_pd( &(io_c[l_n]), _mm512_mul_pd( vec_a, vec_b ) );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream; l_n++ ) {
    io_c[l_n] = i_a[l_n]*i_b[l_n];
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length; l_n++ ) {
    io_c[l_n] = i_a[l_n]*i_b[l_n];
  }
}
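A hypothetical driver tying the LIBXSMM-style kernels together; buffer names and sizes are made up. With 64-byte aligned allocations the prologue loops become empty and almost the whole length runs through the streaming-store path; the _mm_sfence() only matters once another thread or device consumes the results, because non-temporal stores are weakly ordered.

#include <immintrin.h>
#include <stdlib.h>

int main(void) {
  const int n = 1 << 20;
  /* 64-byte aligned allocations so the prologue trip count is zero */
  double* a = (double*)_mm_malloc((size_t)n * sizeof(double), 64);
  double* b = (double*)_mm_malloc((size_t)n * sizeof(double), 64);
  double* c = (double*)_mm_malloc((size_t)n * sizeof(double), 64);

  stream_vector_set(1.5, a, n);          /* a[i] = 1.5 */
  stream_vector_set(2.0, b, n);          /* b[i] = 2.0 */
  stream_vector_compscale(a, b, c, n);   /* c[i] = a[i] * b[i] */

  _mm_sfence(); /* order the streaming stores before any other consumer reads c */

  _mm_free(a);
  _mm_free(b);
  _mm_free(c);
  return 0;
}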