__attribute__((noinline)) float dot512(float *x1, float *x2, size_t len) {
  assert(len % 16 == 0);
  __m512 sum = _mm512_setzero_ps();
  if (len > 15) {
    size_t limit = len - 15;
    for (size_t i = 0; i < limit; i += 16) {
      __m512 v1 = _mm512_loadu_ps(x1 + i);
      __m512 v2 = _mm512_loadu_ps(x2 + i);
      sum = _mm512_add_ps(sum, _mm512_mul_ps(v1, v2));
    }
  }
  float buffer[16];
  _mm512_storeu_ps(buffer, sum);
  return buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] +
         buffer[6] + buffer[7] + buffer[8] + buffer[9] + buffer[10] +
         buffer[11] + buffer[12] + buffer[13] + buffer[14] + buffer[15];
}
Esempio n. 2
0
void extern
avx512f_test (void)
{
  x = _mm512_loadu_ps (p);
  x = _mm512_mask_loadu_ps (x, m, p);
  x = _mm512_maskz_loadu_ps (m, p);

  _mm512_storeu_ps (p, x);
  _mm512_mask_storeu_ps (p, m, x);
}
__attribute__((noinline)) float dot512fma2(float *x1, float *x2, size_t len) {
  assert(len % 32 == 0);
  __m512 sum = _mm512_setzero_ps();
  if (len > 31) {
    size_t limit = len - 31;
    for (size_t i = 0; i < limit; i += 32) {

      __m512 v11 = _mm512_loadu_ps(x1 + i);
      __m512 v21 = _mm512_loadu_ps(x2 + i);
      __m512 v12 = _mm512_loadu_ps(x1 + i + 16);
      __m512 v22 = _mm512_loadu_ps(x2 + i + 16);
      sum = _mm512_fmadd_ps(v11, v21, sum);
      sum = _mm512_fmadd_ps(v12, v22, sum);
    }
  }
  float buffer[16];
  _mm512_storeu_ps(buffer, sum);
  return buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] +
         buffer[6] + buffer[7] + buffer[8] + buffer[9] + buffer[10] +
         buffer[11] + buffer[12] + buffer[13] + buffer[14] + buffer[15];
}
Esempio n. 4
0
void convert_f32_bf16(float* in, libxsmm_bfloat16* out, int len)
{
  int i;

#ifdef _OPENMP
#pragma omp parallel for private(i)
#endif
  for ( i = 0; i < len; i+=16 ) {
    __m512  vfp32  = gxm_fp32_to_bfp16_rne_adjustment_avx512f( _mm512_loadu_ps( in+i ) );
    __m256i vbfp16 = gxm_fp32_to_bfp16_truncate_avx512f( vfp32 );
    _mm256_storeu_si256( (__m256i*)(out+i), vbfp16 );
  }
}
Esempio n. 5
0
File: avx512.cpp Progetto: RSATom/Qt
int main(int, char**argv)
{
    /* AVX512 Foundation */
    __m512i i;
    __m512d d;
    __m512 f;
    __mmask16 m = ~1;
    i = _mm512_maskz_loadu_epi32(0, argv);
    d = _mm512_loadu_pd((double *)argv + 64);
    f = _mm512_loadu_ps((float *)argv + 128);

#ifdef __AVX512ER__
    /* AVX512 Exponential and Reciprocal */
    f =  _mm512_exp2a23_round_ps(f, 8);
#endif
#ifdef __AVX512CD__
    /* AVX512 Conflict Detection */
    i = _mm512_maskz_conflict_epi32(m, i);
#endif
#ifdef __AVX512PF__
    /* AVX512 Prefetch */
    _mm512_mask_prefetch_i64scatter_pd(argv, 0xf, i, 2, 2);
#endif
#ifdef __AVX512DQ__
    /* AVX512 Doubleword and Quadword support */
    m = _mm512_movepi32_mask(i);
#endif
#ifdef __AVX512BW__
    /* AVX512 Byte and Word support */
    i =  _mm512_mask_loadu_epi8(i, m, argv - 8);
#endif
#ifdef __AVX512VL__
    /* AVX512 Vector Length */
    __m256i i2 = _mm256_maskz_loadu_epi32(0, argv);
    _mm256_mask_storeu_epi32(argv + 1, m, i2);
#endif
#ifdef __AVX512IFMA__
    /* AVX512 Integer Fused Multiply-Add */
    i = _mm512_madd52lo_epu64(i, i, i);
#endif
#ifdef __AVX512VBMI__
    /* AVX512 Vector Byte Manipulation Instructions */
    i = _mm512_permutexvar_epi8(i, i);
#endif

    _mm512_mask_storeu_epi64(argv, m, i);
    _mm512_mask_storeu_ps(argv + 64, m, f);
    _mm512_mask_storeu_pd(argv + 128, m, d);
    return 0;
}
Esempio n. 6
0
KFR_INTRINSIC f32avx512 read(cunaligned_t, csize_t<16>, const f32* ptr) { return _mm512_loadu_ps(ptr); }
 inline
 void load(const float *data)
 {
     val1 = _mm512_loadu_ps(data +  0);
     val2 = _mm512_loadu_ps(data + 16);
 }