/*
 * Calls each unaligned packed-single load/store intrinsic once:
 * plain load, merge-masked load, zero-masked load, plain store and
 * masked store.
 *
 * x, m and p are not declared in this block; they must be provided at
 * file scope.  NOTE(review): this looks like a compile/codegen test
 * rather than a functional routine -- confirm against the test harness.
 */
extern void
avx512f_test (void)
{
  /* Loads: unmasked, merge-masked (x is the pass-through source), zero-masked.  */
  x = _mm512_loadu_ps (p);
  x = _mm512_mask_loadu_ps (x, m, p);
  x = _mm512_maskz_loadu_ps (m, p);

  /* Stores: unmasked, then masked by m.  */
  _mm512_storeu_ps (p, x);
  _mm512_mask_storeu_ps (p, m, x);
}
__attribute__((noinline)) float dot512(float *x1, float *x2, size_t len) { assert(len % 16 == 0); __m512 sum = _mm512_setzero_ps(); if (len > 15) { size_t limit = len - 15; for (size_t i = 0; i < limit; i += 16) { __m512 v1 = _mm512_loadu_ps(x1 + i); __m512 v2 = _mm512_loadu_ps(x2 + i); sum = _mm512_add_ps(sum, _mm512_mul_ps(v1, v2)); } } float buffer[16]; _mm512_storeu_ps(buffer, sum); return buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7] + buffer[8] + buffer[9] + buffer[10] + buffer[11] + buffer[12] + buffer[13] + buffer[14] + buffer[15]; }
__attribute__((noinline)) float dot512fma2(float *x1, float *x2, size_t len) { assert(len % 32 == 0); __m512 sum = _mm512_setzero_ps(); if (len > 31) { size_t limit = len - 31; for (size_t i = 0; i < limit; i += 32) { __m512 v11 = _mm512_loadu_ps(x1 + i); __m512 v21 = _mm512_loadu_ps(x2 + i); __m512 v12 = _mm512_loadu_ps(x1 + i + 16); __m512 v22 = _mm512_loadu_ps(x2 + i + 16); sum = _mm512_fmadd_ps(v11, v21, sum); sum = _mm512_fmadd_ps(v12, v22, sum); } } float buffer[16]; _mm512_storeu_ps(buffer, sum); return buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7] + buffer[8] + buffer[9] + buffer[10] + buffer[11] + buffer[12] + buffer[13] + buffer[14] + buffer[15]; }
/// Stores the vector held in `x.v` to `ptr` using the unaligned 512-bit
/// packed-single store. The first (unnamed) parameter is a tag used only
/// for overload selection.
/// NOTE(review): assumes `x.v` is a `__m512`, i.e. 16 floats are written
/// starting at `ptr` -- confirm against the f32avx512 definition.
KFR_INTRINSIC void write(cunaligned_t, f32* ptr, const f32avx512& x)
{
    _mm512_storeu_ps(ptr, x.v);
}
/// Writes both halves to `data` with unaligned stores: `val1` at float
/// offset 0 and `val2` at float offset 16.
/// NOTE(review): assumes val1/val2 are `__m512` members, so `data` must
/// have room for 32 floats -- confirm against the enclosing class.
inline void store(float *data) const
{
    float *const lower = data;
    float *const upper = data + 16;

    _mm512_storeu_ps(lower, val1);
    _mm512_storeu_ps(upper, val2);
}