Exemple #1
0
/**
 * Weakly-defined double-precision GEMM carrying the Fortran symbol name; all
 * arguments are passed by pointer. The routine forwards to LIBXSMM_XGEMM and
 * hands over the "dgemm" symbol found further down the symbol lookup order
 * (RTLD_NEXT), which is resolved once on the first call.
 *
 * Optional arguments (may be NULL): transa/transb (flags fall back to
 * LIBXSMM_FLAGS), alpha/beta (fall back to LIBXSMM_ALPHA/LIBXSMM_BETA), and
 * lda/ldb/ldc (fall back to the dimension selected by LIBXSMM_LD).
 * Mandatory arguments (asserted): m, n, k, a, b, and c.
 */
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE LIBXSMM_ATTRIBUTE(weak) void LIBXSMM_FSYMBOL(dgemm)(
  const char* transa, const char* transb,
  const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
  const double* alpha, const double* a, const libxsmm_blasint* lda,
  const double* b, const libxsmm_blasint* ldb,
  const double* beta, double* c, const libxsmm_blasint* ldc)
{
  /* signature of the routine which is looked up dynamically */
  typedef void (*function_type)(
    const char*, const char*,
    const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*,
    const double*, const double*, const libxsmm_blasint*,
    const double*, const libxsmm_blasint*,
    const double*, double*, const libxsmm_blasint*);
  /* the union converts the object pointer returned by dlsym into a function pointer */
  static LIBXSMM_RETARGETABLE union {
    function_type fn;
    void* pv;
  } original = { 0 };
  int flags = LIBXSMM_FLAGS;

  /* only 'N'/'n' clears the transpose flag of A; any other character sets it */
  if (0 != transa) {
    if ('N' == *transa || 'n' == *transa) {
      flags &= ~LIBXSMM_GEMM_FLAG_TRANS_A;
    }
    else {
      flags |= LIBXSMM_GEMM_FLAG_TRANS_A;
    }
  }

  /* same rule for the transpose flag of B */
  if (0 != transb) {
    if ('N' == *transb || 'n' == *transb) {
      flags &= ~LIBXSMM_GEMM_FLAG_TRANS_B;
    }
    else {
      flags |= LIBXSMM_GEMM_FLAG_TRANS_B;
    }
  }

  /* lazily resolve the next "dgemm" in the lookup order (retried as long as it is not found) */
  if (0 == original.pv) {
    original.pv = dlsym(RTLD_NEXT, LIBXSMM_STRINGIFY(LIBXSMM_FSYMBOL(dgemm)));
  }

  assert(m && n && k && a && b && c);
  LIBXSMM_XGEMM(double, libxsmm_blasint, original.fn, flags, *m, *n, *k,
    0 != alpha ? *alpha : ((double)LIBXSMM_ALPHA),
    a, *(lda ? lda : LIBXSMM_LD(m, k)), b, *(ldb ? ldb : LIBXSMM_LD(k, n)),
    0 != beta ? *beta : ((double)LIBXSMM_BETA),
    c, *(ldc ? ldc : LIBXSMM_LD(m, n)));
}
Exemple #2
0
/**
 * Sets up the GEMM-related state: prefetch strategy, internal_gemm_nt, the
 * behavior selector internal_gemm, optionally the task setting, the tile
 * sizes, and the pointers to the original SGEMM/DGEMM routines.
 *
 * archid:   code path identifier; LIBXSMM_X86_AVX512_MIC selects the second
 *           tile configuration (always selected when compiled with __MIC__).
 * prefetch: currently unused.
 *
 * Environment: LIBXSMM_GEMM, LIBXSMM_TASKS (only with LIBXSMM_EXT_GEMM_TASKS),
 * LIBXSMM_M, LIBXSMM_N, and LIBXSMM_K.
 */
LIBXSMM_API_DEFINITION void libxsmm_gemm_configure(int archid, int prefetch)
{
  /* predefined tile sizes used when the environment does not supply a positive value */
  const int tile_configs[/*configs*/][2/*DP/SP*/][3/*TILE_M,TILE_N,TILE_K*/] = {
    { { 72, 32, 16 }, { 72, 32, 16 } }, /*generic*/
    { { 72, 32, 16 }, { 72, 32, 16 } }  /*knl*/
  };
  const char* env_gemm;
  const char* env_tile[3];
  int config = 0, precision, dim;

  LIBXSMM_UNUSED(prefetch);
  internal_gemm_prefetch = LIBXSMM_PREFETCH_AL2_AHEAD;
  internal_gemm_nt = 2;
  internal_gemm = 2;

  /* behavior of the libxsmm_omp_?gemm routines or the LD_PRELOAD ?GEMM routines:
   * 0: sequential below-threshold routine (no OpenMP); may fall back to BLAS,
   * 1: OpenMP-parallelized but without internal parallel region,
   * 2: OpenMP-parallelized with internal parallel region
   */
  env_gemm = getenv("LIBXSMM_GEMM");
  if (0 != env_gemm && 0 != *env_gemm) {
    internal_gemm = atoi(env_gemm);
  }

#if defined(LIBXSMM_EXT_GEMM_TASKS)
  { /* user input about using (OpenMP-)tasks is handled here since maybe
     * only this translation unit is compiled with OpenMP support
     */
    const char *const env_tasks = getenv("LIBXSMM_TASKS");
    if (0 != env_tasks && 0 != *env_tasks) {
      internal_gemm_tasks = atoi(env_tasks);
    }
  }
#endif

#if defined(__MIC__)
  /* the target is known at compile-time: apply the settings unconditionally */
  LIBXSMM_UNUSED(archid);
  internal_gemm_nt = 4;
  config = 1;
#else
  if (LIBXSMM_X86_AVX512_MIC == archid) {
    internal_gemm_nt = 4;
    config = 1;
  }
#endif

  /* attempt to setup the tile sizes from the environment */
  env_tile[0] = getenv("LIBXSMM_M");
  env_tile[1] = getenv("LIBXSMM_N");
  env_tile[2] = getenv("LIBXSMM_K");
  for (dim = 0; dim < 3; ++dim) {
    internal_gemm_tile[0/*DP*/][dim] = (env_tile[dim] ? atoi(env_tile[dim]) : 0);
  }
  /* environment-defined tile sizes apply to DP as well as SP */
  for (dim = 0; dim < 3; ++dim) {
    internal_gemm_tile[1/*SP*/][dim] = internal_gemm_tile[0/*DP*/][dim];
  }
  /* fall back to the predefined configuration for every non-positive entry */
  for (precision = 0; precision < 2; ++precision) {
    for (dim = 0; dim < 3; ++dim) {
      if (0 >= internal_gemm_tile[precision][dim]) {
        internal_gemm_tile[precision][dim] = tile_configs[config][precision][dim];
      }
    }
  }

  /* original SGEMM: each stage below only applies if the pointer is still unset */
#if defined(__STATIC) && defined(LIBXSMM_BUILD) && !defined(__CYGWIN__) && \
  !(defined(__APPLE__) && defined(__MACH__) /*&& defined(__clang__)*/)
  if (NULL == libxsmm_original_sgemm) {
    libxsmm_original_sgemm = LIBXSMM_FSYMBOL(__real_sgemm);
  }
#endif
#if !defined(__BLAS) || (0 != __BLAS)
  if (NULL == libxsmm_original_sgemm) {
    libxsmm_original_sgemm = LIBXSMM_FSYMBOL(sgemm);
  }
#endif
#if defined(LIBXSMM_RTLD_NEXT)
  if (NULL == libxsmm_original_sgemm) {
    /* the union converts the object pointer returned by dlsym into a function pointer */
    union { const void* pv; libxsmm_sgemm_function pf; } symbol = { NULL };
    symbol.pv = dlsym(RTLD_NEXT, LIBXSMM_STRINGIFY(LIBXSMM_FSYMBOL(sgemm)));
    libxsmm_original_sgemm = symbol.pf;
  }
#endif

  /* original DGEMM: same sequence of stages as for SGEMM */
#if defined(__STATIC) && defined(LIBXSMM_BUILD) && !defined(__CYGWIN__) && \
  !(defined(__APPLE__) && defined(__MACH__) /*&& defined(__clang__)*/)
  if (NULL == libxsmm_original_dgemm) {
    libxsmm_original_dgemm = LIBXSMM_FSYMBOL(__real_dgemm);
  }
#endif
#if !defined(__BLAS) || (0 != __BLAS)
  if (NULL == libxsmm_original_dgemm) {
    libxsmm_original_dgemm = LIBXSMM_FSYMBOL(dgemm);
  }
#endif
#if defined(LIBXSMM_RTLD_NEXT)
  if (NULL == libxsmm_original_dgemm) {
    union { const void* pv; libxsmm_dgemm_function pf; } symbol = { NULL };
    symbol.pv = dlsym(RTLD_NEXT, LIBXSMM_STRINGIFY(LIBXSMM_FSYMBOL(dgemm)));
    libxsmm_original_dgemm = symbol.pf;
  }
#endif
}
Exemple #3
0
/**
 * Benchmark and validation driver for matrix transposes: out-of-place (OTRANS)
 * or in-place (ITRANS).
 *
 * Command line (positional, all optional):
 *   argv[1]: transpose kind; 'o'/'O' (out-of-place, default) or 'i'/'I' (in-place),
 *   argv[2]: m (default: 4096),
 *   argv[3]: n (default: m; only honored for the out-of-place kind),
 *   argv[4]: leading dimension of the input (raised to at least m),
 *   argv[5]: leading dimension of the output (raised to at least n),
 *   argv[6]: r; zero performs a single run with the given shape, a positive value
 *            performs r runs with randomized shapes, and a negative value performs
 *            |r| runs with the given shape,
 *   argv[7]: lower bound handed to randstart when drawing randomized shapes.
 * Environment:
 *   TASKS (default: 0) selects library-internal versus external parallelization,
 *   CHECK (default: 1) enables the validation of the result (and a fixed random seed).
 *
 * Returns EXIT_SUCCESS unless an allocation fails, a transpose call yields a
 * result other than EXIT_SUCCESS, or the validation detects a mismatch. An
 * invalid transpose kind prints the usage and exits with EXIT_FAILURE.
 */
int main(int argc, char* argv[])
{
  const char t = (char)(1 < argc ? *argv[1] : 'o');
  const libxsmm_blasint m = (2 < argc ? atoi(argv[2]) : 4096);
#if 0 /* TODO: enable when in-place transpose is fully supported */
  const libxsmm_blasint n = (3 < argc ? atoi(argv[3]) : m);
#else
  const libxsmm_blasint n = (3 < argc ? (('o' == t || 'O' == t) ? atoi(argv[3]) : m) : m);
#endif
  const libxsmm_blasint ldi = LIBXSMM_MAX/*sanitize ld*/(4 < argc ? atoi(argv[4]) : 0, m);
  const libxsmm_blasint ldo = LIBXSMM_MAX/*sanitize ld*/(5 < argc ? atoi(argv[5]) : 0, n);
  const int r = (6 < argc ? atoi(argv[6]) : 0), s = LIBXSMM_ABS(r);
  const libxsmm_blasint lower = (7 < argc ? atoi(argv[7]) : 0);
  /* shape of the current run; only modified when randomized runs are requested */
  libxsmm_blasint km = m, kn = n, kldi = ldi, kldo = (('o' == t || 'O' == t) ? ldo : ldi);
  int result = EXIT_SUCCESS, k;

  /* strchr also matches the terminating null character, hence an empty
   * argument (t equals '\0') must be rejected explicitly
   */
  if ('\0' == t || NULL == strchr("oOiI", t)) {
    fprintf(stderr, "%s [<transpose-kind:o|i>] [<m>] [<n>] [<ld-in>] [<ld-out>] [random:0|nruns] [lbound]\n", argv[0]);
    exit(EXIT_FAILURE);
  }

#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload target(LIBXSMM_OFFLOAD_TARGET)
#endif
  {
    const char *const env_tasks = getenv("TASKS"), *const env_check = getenv("CHECK");
    const int tasks = (0 == env_tasks || 0 == *env_tasks) ? 0/*default*/ : atoi(env_tasks);
    const int check = (0 == env_check || 0 == *env_check) ? 1/*default*/ : atoi(env_check);
    /* both buffers are sized such that either kind of transpose fits */
    ELEM_TYPE *const a = (ELEM_TYPE*)libxsmm_malloc((size_t)(ldi * (('o' == t || 'O' == t) ? n : ldo) * sizeof(ELEM_TYPE)));
    ELEM_TYPE *const b = (ELEM_TYPE*)libxsmm_malloc((size_t)(ldo * (('o' == t || 'O' == t) ? m : ldi) * sizeof(ELEM_TYPE)));
    libxsmm_timer_tickint start, duration = 0;
#if defined(USE_REFERENCE) /* benchmark against a reference */
    libxsmm_timer_tickint duration2 = 0;
#endif
    libxsmm_blasint i;
    size_t size = 0;
#if defined(MKL_ENABLE_AVX512)
    mkl_enable_instructions(MKL_ENABLE_AVX512);
#endif
    /* do not touch the buffers if any of the allocations failed */
    if (NULL == a || NULL == b) {
      fprintf(stderr, "Error: failed to allocate the matrices!\n");
      result = EXIT_FAILURE;
      goto cleanup;
    }

    fprintf(stdout, "m=%lli n=%lli ldi=%lli ldo=%lli size=%.fMB (%s, %s)\n",
      (long long)m, (long long)n, (long long)ldi, (long long)ldo,
      1.0 * (m * n * sizeof(ELEM_TYPE)) / (1 << 20), LIBXSMM_STRINGIFY(ELEM_TYPE),
      ('o' == t || 'O' == t) ? "out-of-place" : "in-place");

    /* initialize the input matrix (n columns of m elements, ldi apart) */
#if defined(_OPENMP)
#   pragma omp parallel for private(i)
#endif
    for (i = 0; i < n; ++i) {
      libxsmm_blasint j;
      for (j = 0; j < m; ++j) {
        a[i*ldi+j] = initial_value(i, j, m);
      }
    }

    if (0 != check) { /* repeatable (reference) */
      srand(RAND_SEED);
    }
    else { /* randomized selection */
      srand(libxsmm_timer_tick() % ((unsigned int)-1));
    }
    /* r equal to zero yields exactly one iteration (k starts at -1 and s is zero) */
    for (k = (0 == r ? -1 : 0); k < s && EXIT_SUCCESS == result; ++k) {
      if (0 < r) { /* draw a randomized shape and perform an untimed call */
        const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0;
        km = randstart(LIBXSMM_ABS(lower), m);
        kldi = LIBXSMM_MAX(rldi, km);
        if (('o' == t || 'O' == t)) {
          const libxsmm_blasint rldo = 0 <= lower ? randstart(lower, ldo) : 0;
          kn = randstart(LIBXSMM_ABS(lower), n);
          kldo = LIBXSMM_MAX(rldo, kn);
          /* trigger JIT-generated code */
          OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo);
        }
        else {
#if 0 /* TODO: enable when in-place transpose is fully supported */
          kn = randstart(LIBXSMM_ABS(lower), n);
#else
          kn = km;
#endif
          kldo = kldi;
          /* trigger JIT-generated code */
          ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi);
        }
      }
      /* accumulate the payload (in Byte) for the bandwidth calculation */
      size += (size_t)(km * kn * sizeof(ELEM_TYPE));

      if (('o' == t || 'O' == t)) {
        if (0 == tasks) { /* library-internal parallelization */
          start = libxsmm_timer_tick();
#if defined(OTRANS_THREAD)
#         pragma omp parallel
          OTRANS_THREAD(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo, omp_get_thread_num(), omp_get_num_threads());
#else
          result = OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo);
#endif
          duration += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
        else { /* external parallelization */
          start = libxsmm_timer_tick();
#if defined(_OPENMP)
#         pragma omp parallel
#         pragma omp single nowait
#endif
          result = OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo);
          duration += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
      }
      else {
        assert(('i' == t || 'I' == t) && kldo == kldi);
        /* the in-place transpose operates on a copy such that the input remains intact */
        memcpy(b, a, (size_t)(kldi * kn * sizeof(ELEM_TYPE)));

        if (2 > tasks) { /* library-internal parallelization */
          start = libxsmm_timer_tick();
          result = ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi);
          duration += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
        else { /* external parallelization */
          start = libxsmm_timer_tick();
#if defined(_OPENMP)
#         pragma omp parallel
#         pragma omp single
#endif
          result = ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi);
          duration += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
      }
      if (0 != check) { /* check: element (i,j) of the output must equal element (j,i) of the input */
        for (i = 0; i < km; ++i) {
          libxsmm_blasint j;
          for (j = 0; j < kn; ++j) {
            const ELEM_TYPE u = b[i*kldo+j];
            const ELEM_TYPE v = a[j*kldi+i];
            if (LIBXSMM_NEQ(u, v)) {
              i += km; /* leave outer loop as well */
              result = EXIT_FAILURE;
              break;
            }
          }
        }
      }
    }

#if defined(USE_REFERENCE)
    if (0 < check) { /* check shall imply reference (performance-)test */
      srand(RAND_SEED); /* reproduce the same sequence as above */
      for (k = (0 == r ? -1 : 0); k < s && EXIT_SUCCESS == result; ++k) {
        if (0 < r) {
          const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0;
          km = randstart(LIBXSMM_ABS(lower), m);
          kldi = LIBXSMM_MAX(rldi, km);
          if (('o' == t || 'O' == t)) {
            const libxsmm_blasint rldo = 0 <= lower ? randstart(lower, ldo) : 0;
            kn = randstart(LIBXSMM_ABS(lower), n);
            kldo = LIBXSMM_MAX(rldo, kn);
          }
          else {
#if 0 /* TODO: enable when in-place transpose is fully supported */
            kn = randstart(LIBXSMM_ABS(lower), n);
#else
            kn = km;
#endif
            kldo = kldi;
          }
        }

        if (('o' == t || 'O' == t)) {
          start = libxsmm_timer_tick();
          OTRANS_GOLD(&km, &kn, a, &kldi, b, &kldo);
          duration2 += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
        else {
          assert(('i' == t || 'I' == t) && kldo == kldi);
          memcpy(b, a, (size_t)(kldi * kn * sizeof(ELEM_TYPE)));
          start = libxsmm_timer_tick();
          ITRANS_GOLD(&km, &kn, b, &kldi, &kldo);
          duration2 += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
        if (1 < check || 0 > check) { /* check: the reference itself is only validated on request */
          for (i = 0; i < km; ++i) {
            libxsmm_blasint j;
            for (j = 0; j < kn; ++j) {
              const ELEM_TYPE u = b[i*kldo+j];
              const ELEM_TYPE v = a[j*kldi+i];
              if (LIBXSMM_NEQ(u, v)) {
                i += km; /* leave outer loop as well */
                result = EXIT_FAILURE;
                break;
              }
            }
          }
        }
      }
    }
#endif

    if (EXIT_SUCCESS == result) {
      const double d = libxsmm_timer_duration(0, duration);
      if (0 < duration) {
        /* out-of-place transpose bandwidth assumes RFO */
        fprintf(stdout, "\tbandwidth: %.1f GB/s\n", size
          * ((('o' == t || 'O' == t)) ? 3 : 2) / (d * (1 << 30)));
      }
      if (0 == lower) { /* average duration per run */
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * (d / (0 == r ? (s + 1) : s)));
      }
      else { /* total duration */
        fprintf(stdout, "\tduration: %f ms\n", 1000.0 * d);
      }
#if defined(USE_REFERENCE)
      if (0 < duration2) {
        fprintf(stdout, "\treference: %.1fx\n", (1.0 * duration) / duration2);
      }
#endif
    }
    else if (0 != check) { /* check */
      fprintf(stderr, "Error: validation failed for m=%lli, n=%lli, ldi=%lli, and ldo=%lli!\n",
        (long long)km, (long long)kn, (long long)kldi, (long long)kldo);
    }

cleanup:
    /* also reached after a failed allocation, hence only release valid buffers */
    if (NULL != a) {
      libxsmm_free(a);
    }
    if (NULL != b) {
      libxsmm_free(b);
    }
  }
  return result;
}