/* Dot product of two float arrays with AVX-512.
 * x1, x2: input vectors (unaligned OK); len: element count, must be a
 * multiple of 16 (asserted). Returns the scalar dot product. */
__attribute__((noinline)) float dot512(float *x1, float *x2, size_t len) {
    assert(len % 16 == 0);
    __m512 acc = _mm512_setzero_ps();
    /* Guard keeps `len - 15` from underflowing size_t when len < 16. */
    if (len > 15) {
        for (size_t i = 0, last = len - 15; i < last; i += 16) {
            __m512 a = _mm512_loadu_ps(x1 + i);
            __m512 b = _mm512_loadu_ps(x2 + i);
            acc = _mm512_add_ps(acc, _mm512_mul_ps(a, b));
        }
    }
    /* Horizontal reduction: spill the 16 lanes, then add left to right
     * (same summation order as a lane-by-lane sum expression). */
    float lanes[16];
    _mm512_storeu_ps(lanes, acc);
    float total = lanes[0];
    for (int k = 1; k < 16; ++k) {
        total += lanes[k];
    }
    return total;
}
/* Smoke test exercising AVX-512F unaligned load/store intrinsics in their
 * plain, merge-masked and zero-masked forms.
 * NOTE(review): x, m and p are not declared here -- presumably file-scope
 * globals (__m512 x; __mmask16 m; float *p;) defined elsewhere in this
 * translation unit, GCC-testsuite style. Confirm before reusing. */
void extern avx512f_test (void)
{
  x = _mm512_loadu_ps (p);            /* plain unaligned load            */
  x = _mm512_mask_loadu_ps (x, m, p); /* merge-masked load (keeps x lanes) */
  x = _mm512_maskz_loadu_ps (m, p);   /* zero-masked load                */
  _mm512_storeu_ps (p, x);            /* plain unaligned store           */
  _mm512_mask_storeu_ps (p, m, x);    /* masked store                    */
}
/* Dot product of two float arrays with AVX-512 FMA, consuming 32 floats
 * (two 512-bit vectors) per iteration.
 * x1, x2: input vectors (unaligned OK); len: element count, must be a
 * multiple of 32 (asserted). Returns the scalar dot product. */
__attribute__((noinline)) float dot512fma2(float *x1, float *x2, size_t len) {
    assert(len % 32 == 0);
    __m512 acc = _mm512_setzero_ps();
    /* Guard keeps `len - 31` from underflowing size_t when len < 32. */
    if (len > 31) {
        for (size_t i = 0, last = len - 31; i < last; i += 32) {
            __m512 a0 = _mm512_loadu_ps(x1 + i);
            __m512 b0 = _mm512_loadu_ps(x2 + i);
            __m512 a1 = _mm512_loadu_ps(x1 + i + 16);
            __m512 b1 = _mm512_loadu_ps(x2 + i + 16);
            /* Both FMAs chain through the same accumulator, preserving the
             * original summation order (and thus the exact FP result). */
            acc = _mm512_fmadd_ps(a0, b0, acc);
            acc = _mm512_fmadd_ps(a1, b1, acc);
        }
    }
    /* Horizontal reduction: spill the 16 lanes, then add left to right. */
    float lanes[16];
    _mm512_storeu_ps(lanes, acc);
    float total = lanes[0];
    for (int k = 1; k < 16; ++k) {
        total += lanes[k];
    }
    return total;
}
/* Converts `len` float32 values in `in` to bfloat16 in `out`, 16 elements
 * per iteration, optionally parallelized with OpenMP.
 * NOTE(review): assumes len is a multiple of 16 -- a ragged tail would read
 * and write past the ends of `in`/`out`. Confirm at call sites.
 * NOTE(review): judging by their names, the gxm_* helpers apply a
 * round-to-nearest-even adjustment and then truncate fp32 lanes to bf16;
 * they are defined elsewhere -- verify semantics there. */
void convert_f32_bf16(float* in, libxsmm_bfloat16* out, int len) {
  int i;
#ifdef _OPENMP
#pragma omp parallel for private(i)
#endif
  for ( i = 0; i < len; i+=16 ) {
    /* Load 16 fp32, adjust for RNE, truncate to 16 bf16 (256 bits). */
    __m512 vfp32 = gxm_fp32_to_bfp16_rne_adjustment_avx512f( _mm512_loadu_ps( in+i ) );
    __m256i vbfp16 = gxm_fp32_to_bfp16_truncate_avx512f( vfp32 );
    _mm256_storeu_si256( (__m256i*)(out+i), vbfp16 );
  }
}
/* AVX-512 feature smoke test: executes one representative intrinsic per
 * extension so that compiling/linking/running it proves toolchain and CPU
 * support. Each optional extension is gated on its predefined macro.
 * Note: the unnamed first parameter makes this C++, not C.
 * NOTE(review): the loads/stores through argv (+64, +128 element offsets,
 * reinterpreted as float/double pointers) are presumably just there to keep
 * the vector values live and observable -- this is a capability probe, not
 * meaningful data processing; the masked stores into argv memory look
 * intentional for that purpose but confirm against the harness. */
int main(int, char**argv) {
  /* AVX512 Foundation */
  __m512i i;
  __m512d d;
  __m512 f;
  __mmask16 m = ~1;
  i = _mm512_maskz_loadu_epi32(0, argv);
  d = _mm512_loadu_pd((double *)argv + 64);
  f = _mm512_loadu_ps((float *)argv + 128);
#ifdef __AVX512ER__
  /* AVX512 Exponential and Reciprocal */
  f = _mm512_exp2a23_round_ps(f, 8);
#endif
#ifdef __AVX512CD__
  /* AVX512 Conflict Detection */
  i = _mm512_maskz_conflict_epi32(m, i);
#endif
#ifdef __AVX512PF__
  /* AVX512 Prefetch */
  _mm512_mask_prefetch_i64scatter_pd(argv, 0xf, i, 2, 2);
#endif
#ifdef __AVX512DQ__
  /* AVX512 Doubleword and Quadword support */
  m = _mm512_movepi32_mask(i);
#endif
#ifdef __AVX512BW__
  /* AVX512 Byte and Word support */
  i = _mm512_mask_loadu_epi8(i, m, argv - 8);
#endif
#ifdef __AVX512VL__
  /* AVX512 Vector Length */
  __m256i i2 = _mm256_maskz_loadu_epi32(0, argv);
  _mm256_mask_storeu_epi32(argv + 1, m, i2);
#endif
#ifdef __AVX512IFMA__
  /* AVX512 Integer Fused Multiply-Add */
  i = _mm512_madd52lo_epu64(i, i, i);
#endif
#ifdef __AVX512VBMI__
  /* AVX512 Vector Byte Manipulation Instructions */
  i = _mm512_permutexvar_epi8(i, i);
#endif
  /* Masked stores keep the computed vectors from being optimized away. */
  _mm512_mask_storeu_epi64(argv, m, i);
  _mm512_mask_storeu_ps(argv + 64, m, f);
  _mm512_mask_storeu_pd(argv + 128, m, d);
  return 0;
}
/* Reads 16 single-precision floats from an unaligned pointer into an
 * AVX-512 register (KFR unaligned-read overload for f32, width 16). */
KFR_INTRINSIC f32avx512 read(cunaligned_t, csize_t<16>, const f32* ptr)
{
    const f32avx512 value = _mm512_loadu_ps(ptr);
    return value;
}
/* Loads 32 consecutive floats from `data` into the two member registers.
 * NOTE(review): val1/val2 are presumably __m512 members of the enclosing
 * class -- their declarations are outside this chunk. */
inline void load(const float *data)
{
    /* First 16 lanes, then the next 16; unaligned loads. */
    val1 = _mm512_loadu_ps(data);
    val2 = _mm512_loadu_ps(data + 16);
}