/*
 * Expand 16 doubles from inv into 64 doubles in outv.
 *
 * Output slot k (0..15) takes its value from inv[(k % 4) * 4 + k / 4],
 * i.e. inv is read in transposed order when viewed as a 4x4 layout, and
 * that value is replicated into the four consecutive doubles
 * outv[4k .. 4k+3].
 *
 * Two slots are assembled per 512-bit vector: the even slot fills the
 * lanes selected by mask 0x0F, the odd slot the lanes selected by 0xF0.
 *
 * inv  - source, at least 16 doubles are read.
 * outv - destination, 64 doubles are written using _mm512_store_pd
 *        (the aligned store), so it is expected to be 64-byte aligned.
 */
__inline void mic_broadcast16x64(const double* inv, double* outv)
{
    const __mmask8 low_lanes  = _mm512_int2mask(0x0F);
    const __mmask8 high_lanes = _mm512_int2mask(0xF0);

    /* One iteration per output vector: 8 vectors of 8 doubles each. */
    for (int pair = 0; pair < 8; ++pair) {
        const int first  = 2 * pair;
        const int second = first + 1;

        /* Transposed source positions for the two slots of this vector. */
        const double *src_first  = &inv[(first % 4) * 4 + first / 4];
        const double *src_second = &inv[(second % 4) * 4 + second / 4];

        __m512d vec = _mm512_setzero_pd();

        /* Broadcast one scalar into each half of the vector. */
        vec = _mm512_mask_extload_pd(vec, low_lanes, src_first,
                                     _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8,
                                     _MM_HINT_NONE);
        vec = _mm512_mask_extload_pd(vec, high_lanes, src_second,
                                     _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8,
                                     _MM_HINT_NONE);

        _mm512_store_pd(&outv[pair * 8], vec);
    }
}
/*
 * Run foo and foo_r (both defined elsewhere) on the same eight-element
 * input and abort if check_union512d reports a nonzero result when the
 * returned vector is compared against that input.
 */
static void
avx512f_test (void)
{
  double input[8] = { -3.3, 2.6, 1.48, 9.104, -23.9, -173.37, -13.48, 69.78 };
  union512d got;

  /* First variant.  */
  got.x = foo (input);
  if (check_union512d (got, input) != 0)
    abort ();

  /* Clear the result so the second check cannot pass on leftover data.  */
  got.x = _mm512_setzero_pd ();

  /* Second variant.  */
  got.x = foo_r (input);
  if (check_union512d (got, input) != 0)
    abort ();
}
/*
 * Run foo and foo_r (both defined elsewhere) on the scalar 34.5 and abort
 * if check_union512d reports a nonzero result when the returned vector is
 * compared against an eight-element array filled with that scalar.
 */
static void
avx512f_test (void)
{
  double scalar = 34.5;
  double expected[8];
  union512d got;
  int lane = 8;

  /* Reference data: every element equals the scalar.  */
  while (lane-- > 0)
    expected[lane] = scalar;

  /* First variant.  */
  got.x = foo (scalar);
  if (check_union512d (got, expected) != 0)
    abort ();

  /* Clear the result so the second check cannot pass on leftover data.  */
  got.x = _mm512_setzero_pd ();

  /* Second variant.  */
  got.x = foo_r (scalar);
  if (check_union512d (got, expected) != 0)
    abort ();
}
#include <stdlib.h> #include <string.h> #include "AVX512_DJACV.h" #define VSIZE_B 64u #define DBLE_SZ 8u #define NDBLE_V (VSIZE_B / DBLE_SZ) #define NPAIR_V NDBLE_V static inline __m256d avx512_ddots(const unsigned m, const double *const restrict Gp, const double *const restrict Gq) { register const double *const Gp_i = (const double*)__builtin_assume_aligned(Gp, VSIZE_B); register const double *const Gq_i = (const double*)__builtin_assume_aligned(Gq, VSIZE_B); register __m512d Gpp = _mm512_setzero_pd(); register __m512d Gqq = _mm512_setzero_pd(); register __m512d Gpq = _mm512_setzero_pd(); for (register unsigned i = 0u; i < m; i += NDBLE_V) { register const __m512d Gpi = _mm512_load_pd(Gp_i + i); register const __m512d Gqi = _mm512_load_pd(Gq_i + i); Gpp = _mm512_fmadd_pd(Gpi, Gpi, Gpp); Gqq = _mm512_fmadd_pd(Gqi, Gqi, Gqq); Gpq = _mm512_fmadd_pd(Gpi, Gqi, Gpq); } register const double pq = _mm512_reduce_add_pd(Gpq); /* out[0] = Gpp; out[1] = Gqq; out[2] = Gpq; out[3] = |Gpq|; */ return _mm256_set_pd(fabs(pq), pq, _mm512_reduce_add_pd(Gqq), _mm512_reduce_add_pd(Gpp));