Example #1
0
#include <stdlib.h>
#include <stdio.h>
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(pop)
#endif

#if !defined(REAL_TYPE)
# define REAL_TYPE double
#endif


LIBXSMM_RETARGETABLE void init(int seed, REAL_TYPE *LIBXSMM_RESTRICT dst, double scale, libxsmm_blasint nrows, libxsmm_blasint ncols, libxsmm_blasint ld);
LIBXSMM_RETARGETABLE void init(int seed, REAL_TYPE *LIBXSMM_RESTRICT dst, double scale, libxsmm_blasint nrows, libxsmm_blasint ncols, libxsmm_blasint ld)
{
  const libxsmm_blasint minval = seed, addval = (nrows - 1) * ld + (ncols - 1);
  const libxsmm_blasint maxval = LIBXSMM_MAX(LIBXSMM_ABS(minval), addval);
  const double norm = 0 != maxval ? (scale / maxval) : scale;
  libxsmm_blasint i, j;
#if defined(_OPENMP)
# pragma omp parallel for
#endif
  for (i = 0; i < ncols; ++i) {
    for (j = 0; j < nrows; ++j) {
      const libxsmm_blasint k = i * ld + j;
      const double value = (double)(k + minval);
      dst[k] = (REAL_TYPE)(norm * (value - 0.5 * addval));
    }
  }
}

Example #2
0
File: diff.c Project: hfp/libxsmm
int main(int argc, char* argv[])
{
  const int insize = (1 < argc ? atoi(argv[1]) : 0);
  const int incrmt = (2 < argc ? atoi(argv[2]) : 0);
  const int nelems = (3 < argc ? atoi(argv[3]) : 0);
  const int niters = (4 < argc ? atoi(argv[4]) : 1);
  const int elsize = (0 >= insize ? LIBXSMM_DESCRIPTOR_SIGSIZE : insize);
  const int stride = (0 >= incrmt ? LIBXSMM_MAX(LIBXSMM_DESCRIPTOR_MAXSIZE, elsize) : LIBXSMM_MAX(incrmt, elsize));
  const size_t n = (0 >= nelems ? (((size_t)2 << 30/*2 GB*/) / stride) : ((size_t)nelems));
  unsigned char *input, *icopy = NULL, *ilast = NULL;
  int result = EXIT_SUCCESS;
  size_t nbytes, size, nrpt;

  if (0 < niters) {
    size = n;
    nrpt = niters;
  }
  else {
    size = LIBXSMM_MAX(LIBXSMM_ABS(niters), 1);
    nrpt = n;
  }
  nbytes = size * stride;
  input = (unsigned char*)(0 != nbytes ? malloc(nbytes) : NULL);

  if (NULL != input) {
    unsigned char *const ref = input + (size - 1) * stride; /* last item */
    libxsmm_timer_tickint start;
    size_t i, j = 0;

    /* initialize the input data */
    for (i = 0; i < nbytes; ++i) input[i] = LIBXSMM_MOD2(i, 128);
    for (i = 0; i < (size_t)elsize; ++i) ref[i] = 255;

    { /* benchmark libxsmm_diff_n */
#if defined(USE_HASH)
      const unsigned int hashref = libxsmm_hash(ref, elsize, 0/*seed*/);
#endif
      start = libxsmm_timer_tick();
      for (i = 0; i < nrpt; ++i) {
#if !defined(USE_HASH)
        j = libxsmm_diff_n(ref, input, (unsigned char)elsize, (unsigned char)stride,
          (unsigned int)LIBXSMM_MIN(i, size)/*hint*/, (unsigned int)size);
#else
        const unsigned char* tst = input;
        for (j = 0; j < size; ++j) {
          const unsigned int hashtst = libxsmm_hash(tst, elsize, 0/*seed*/);
          if (hashref == hashtst && 0 == libxsmm_diff(ref, tst, (unsigned char)elsize)) {
            break;
          }
          tst += stride;
        }
#endif
      }
      printf("libxsmm_diff_n:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick()));
    }

    if (size == (j + 1) && 0 == memcmp(ref, input + j * stride, elsize)) { /* benchmark libxsmm_memcmp */
      icopy = (unsigned char*)(elsize == stride ? malloc(nbytes) : NULL);
      if (NULL != icopy) {
        ilast = icopy + (size - 1) * stride; /* last item */
        memcpy(icopy, input, nbytes);
        start = libxsmm_timer_tick();
        for (i = 0; i < nrpt; ++i) {
          j += libxsmm_memcmp(input, icopy, nbytes); /* take result of every execution */
          /* memcmp may be pure and without touching input it is not repeated (nrpt) */
          ilast[i%elsize] = 255;
        }
        printf("libxsmm_memcmp:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick()));
        result += (int)j * ((int)stride / ((int)stride + 1)); /* ignore result */
      }
    }
    else {
      result = EXIT_FAILURE;
    }

    if (NULL != icopy) { /* benchmark stdlib's memcmp */
      LIBXSMM_ASSERT(NULL != ilast);
      start = libxsmm_timer_tick();
      for (i = 0; i < nrpt; ++i) {
        j += memcmp(input, icopy, nbytes); /* take result of every execution */
        /* memcmp is likely pure and without touching input it is not repeated (nrpt) */
        ilast[i%elsize] = 255;
      }
      printf("stdlib memcmp:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick()));
      result += (int)j * ((int)stride / ((int)stride + 1)); /* ignore result */
      free(icopy);
    }

    free(input);
  }
  else {
    result = EXIT_FAILURE;
  }

  return result;
}
Example #3
0
int main(int argc, char* argv[])
{
  LIBXSMM_GEMM_CONST libxsmm_blasint m = (1 < argc ? atoi(argv[1]) : 1024);
  LIBXSMM_GEMM_CONST libxsmm_blasint k = (3 < argc ? atoi(argv[3]) : m);
  LIBXSMM_GEMM_CONST libxsmm_blasint n = (2 < argc ? atoi(argv[2]) : k);
  const libxsmm_blasint bm = (4 < argc ? atoi(argv[4]) : 32);
  const libxsmm_blasint bk = (6 < argc ? atoi(argv[6]) : bm);
  const libxsmm_blasint bn = (5 < argc ? atoi(argv[5]) : bk);
  const libxsmm_bgemm_order order = (libxsmm_bgemm_order)(7 < argc ? atoi(argv[7]) : 0);
  const int nrepeat = (8 < argc ? atoi(argv[8]) : 100);
  const libxsmm_blasint b_m1 = (9 < argc ? atoi(argv[9]) : 1);
  const libxsmm_blasint b_n1  = (10 < argc ? atoi(argv[10]) : 1);
  const libxsmm_blasint b_k1 = (11 < argc ? atoi(argv[11]) : 1);
  const libxsmm_blasint b_k2 = (12 < argc ? atoi(argv[12]) : 1);
  const int ab = (13 < argc ? atoi(argv[13]) : 0);
  LIBXSMM_GEMM_CONST libxsmm_blasint lda = (14 < argc ? atoi(argv[13]) : m);
  LIBXSMM_GEMM_CONST libxsmm_blasint ldb = (15 < argc ? atoi(argv[14]) : k);
  LIBXSMM_GEMM_CONST libxsmm_blasint ldc = (16 < argc ? atoi(argv[15]) : m);
  LIBXSMM_GEMM_CONST char transa = 'N', transb = 'N'; /* no transposes */
  LIBXSMM_GEMM_CONST ITYPE alpha = 1, beta = 1;
  const int gemm_flags = LIBXSMM_GEMM_FLAGS(transa, transb);
  const double gflops = 2.0 * m * n * k * 1E-9;
  int result = EXIT_SUCCESS;
#if defined(CHECK)
  const char *const env_check = getenv("CHECK");
  const double check = LIBXSMM_ABS(0 == env_check ? 0 : atof(env_check));
#endif
  if (argc > 1 && !strncmp(argv[1], "-h", 3)) { /* check command line */
    printf("\nUsage: ./bgemm [M] [N] [K] [bm] [bn] [bk] [order] [reps] [b_m1] [b_n1] [b_k1] [b_k2] [verbose]\n\n");
    return result;
  }

  MYASSERT(m % b_m1 == 0);
  MYASSERT(n % b_n1 == 0);
  MYASSERT(k % b_k1 == 0);
  MYASSERT(m/b_m1 % bm == 0);
  MYASSERT(n/b_n1 % bn == 0);
  MYASSERT(k/b_k1/b_k2 % bk == 0);

#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload target(LIBXSMM_OFFLOAD_TARGET)
#endif
  {
    ITYPE* agold = (ITYPE*)libxsmm_malloc((size_t)(lda * k * sizeof(ITYPE)));
    ITYPE* bgold = (ITYPE*)libxsmm_malloc((size_t)(ldb * n * sizeof(ITYPE)));
    ITYPE* cgold = (ITYPE*)libxsmm_malloc((size_t)(ldc * n * sizeof(ITYPE)));
    ITYPE* a = (ITYPE*)libxsmm_malloc((size_t)(m * k * sizeof(ITYPE)));
    ITYPE* b = (ITYPE*)libxsmm_malloc((size_t)(k * n * sizeof(ITYPE)));
    ITYPE* c = (ITYPE*)libxsmm_malloc((size_t)(m * n * sizeof(ITYPE)));
    libxsmm_bgemm_handle* handle = 0;
    unsigned long long start;
    double duration;
    handle = libxsmm_bgemm_handle_create(
      LIBXSMM_GEMM_PRECISION(ITYPE), LIBXSMM_GEMM_PRECISION(ITYPE),
      m, n, k, &bm, &bn, &bk, &b_m1, &b_n1, &b_k1, &b_k2,
      &alpha, &beta, &gemm_flags, NULL/*auto-prefetch*/, &order);

    if (0 != handle) {
      LIBXSMM_MATINIT(ITYPE, 42, agold, m, k, lda, 1.0);
      LIBXSMM_MATINIT(ITYPE, 24, bgold, k, n, ldb, 1.0);
      LIBXSMM_MATINIT(ITYPE,  0, cgold, m, n, ldc, 1.0);
      libxsmm_bgemm_copyin_a(handle, agold, &lda, a);
      libxsmm_bgemm_copyin_b(handle, bgold, &ldb, b);
      libxsmm_bgemm_copyin_c(handle, cgold, &ldc, c);
#if defined(MKL_ENABLE_AVX512)
      mkl_enable_instructions(MKL_ENABLE_AVX512);
#endif
      /* warm-up OpenMP (populate thread pool) */
      libxsmm_bgemm_omp(handle, a, b, c, 1);
#if defined(CHECK)
      if (!LIBXSMM_FEQ(0, check)) {
        LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, agold, &lda, bgold, &ldb, &beta, cgold, &ldc);
      }
#endif
      if (!ab) {
      libxsmm_gemm_print(stdout, LIBXSMM_GEMM_PRECISION(ITYPE),
        &transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
      fprintf(stdout, "\n\n");
      }
      start = libxsmm_timer_tick();
      libxsmm_bgemm_omp(handle, a, b, c, nrepeat);
      duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
      if (0 < duration) {
        if (ab) {
          fprintf(stdout, "\tLIBXSMM: %.1f GFLOPS/s | %lli,%lli,%lli,%lli,%lli,%lli,%i,%lli,%lli,%lli,%lli\n",
            gflops * nrepeat / duration, (long long)m, (long long)n, (long long)k, (long long)bm, (long long)bn, (long long)bk,
            (int)order, (long long)b_m1, (long long)b_n1, (long long)b_k1, (long long)b_k2);
        } else {
          fprintf(stdout, "\tLIBXSMM: %.1f GFLOPS/s\n", gflops * nrepeat / duration);
        }
      }
#if defined(CHECK)
      if (!LIBXSMM_FEQ(0, check)) { /* validate result against LAPACK/BLAS xGEMM */
        ITYPE* ctest = 0;
        int i;
        start = libxsmm_timer_tick();
        for (i = 0; i < nrepeat; ++i) {
          LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, agold, &lda, bgold, &ldb, &beta, cgold, &ldc);
        }
        duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
        if (0 < duration) {
          fprintf(stdout, "\tBLAS: %.1f GFLOPS/s\n", gflops * nrepeat / duration);
        }
        /* free memory not needed further; avoid double-free later on */
        libxsmm_free(agold); agold = 0;
        libxsmm_free(bgold); bgold = 0;
        libxsmm_free(a); a = 0;
        libxsmm_free(b); b = 0;
        /* allocate C-matrix in regular format, and perform copy-out */
        ctest = (ITYPE*)libxsmm_malloc((size_t)(ldc * n * sizeof(ITYPE)));
        if (0 != ctest) {
          libxsmm_matdiff_info diff;
          libxsmm_bgemm_copyout_c(handle, c, &ldc, ctest);
          if (EXIT_SUCCESS == libxsmm_matdiff(LIBXSMM_DATATYPE(ITYPE), m, n, cgold, ctest, &ldc, &ldc, &diff)) {
            fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs);
            if (check < 100.0 * diff.normf_rel) {
              fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel);
              result = EXIT_FAILURE;
            }
          }
          libxsmm_free(ctest);
        }
      }
#endif
      libxsmm_bgemm_handle_destroy(handle);
    }
    else {
      fprintf(stderr, "FAILED to create BGEMM-handle! For details retry with LIBXSMM_VERBOSE=1.\n");
      result = EXIT_FAILURE;
    }
    libxsmm_free(agold);
    libxsmm_free(bgold);
    libxsmm_free(cgold);
    libxsmm_free(a);
    libxsmm_free(b);
    libxsmm_free(c);
  }
  if(!ab) {
    fprintf(stdout, "Finished\n");
  }
  return result;
}
Example #4
0
int main(int argc, char* argv[])
{
  const char t = (char)(1 < argc ? *argv[1] : 'o');
  const libxsmm_blasint m = (2 < argc ? atoi(argv[2]) : 4096);
#if 0 /* TODO: enable when in-place transpose is fully supported */
  const libxsmm_blasint n = (3 < argc ? atoi(argv[3]) : m);
#else
  const libxsmm_blasint n = (3 < argc ? (('o' == t || 'O' == t) ? atoi(argv[3]) : m) : m);
#endif
  const libxsmm_blasint ldi = LIBXSMM_MAX/*sanitize ld*/(4 < argc ? atoi(argv[4]) : 0, m);
  const libxsmm_blasint ldo = LIBXSMM_MAX/*sanitize ld*/(5 < argc ? atoi(argv[5]) : 0, n);
  const int r = (6 < argc ? atoi(argv[6]) : 0), s = LIBXSMM_ABS(r);
  const libxsmm_blasint lower = (7 < argc ? atoi(argv[7]) : 0);
  libxsmm_blasint km = m, kn = n, kldi = ldi, kldo = (('o' == t || 'O' == t) ? ldo : ldi);
  int result = EXIT_SUCCESS, k;

  if (0 == strchr("oOiI", t)) {
    fprintf(stderr, "%s [<transpose-kind:o|i>] [<m>] [<n>] [<ld-in>] [<ld-out>] [random:0|nruns] [lbound]\n", argv[0]);
    exit(EXIT_FAILURE);
  }

#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload target(LIBXSMM_OFFLOAD_TARGET)
#endif
  {
    const char *const env_tasks = getenv("TASKS"), *const env_check = getenv("CHECK");
    const int tasks = (0 == env_tasks || 0 == *env_tasks) ? 0/*default*/ : atoi(env_tasks);
    const int check = (0 == env_check || 0 == *env_check) ? 1/*default*/ : atoi(env_check);
    ELEM_TYPE *const a = (ELEM_TYPE*)libxsmm_malloc((size_t)(ldi * (('o' == t || 'O' == t) ? n : ldo) * sizeof(ELEM_TYPE)));
    ELEM_TYPE *const b = (ELEM_TYPE*)libxsmm_malloc((size_t)(ldo * (('o' == t || 'O' == t) ? m : ldi) * sizeof(ELEM_TYPE)));
    libxsmm_timer_tickint start, duration = 0;
#if defined(USE_REFERENCE) /* benchmark against a reference */
    libxsmm_timer_tickint duration2 = 0;
#endif
    libxsmm_blasint i;
    size_t size = 0;
#if defined(MKL_ENABLE_AVX512)
    mkl_enable_instructions(MKL_ENABLE_AVX512);
#endif
    fprintf(stdout, "m=%lli n=%lli ldi=%lli ldo=%lli size=%.fMB (%s, %s)\n",
      (long long)m, (long long)n, (long long)ldi, (long long)ldo,
      1.0 * (m * n * sizeof(ELEM_TYPE)) / (1 << 20), LIBXSMM_STRINGIFY(ELEM_TYPE),
      ('o' == t || 'O' == t) ? "out-of-place" : "in-place");

#if defined(_OPENMP)
#   pragma omp parallel for private(i)
#endif
    for (i = 0; i < n; ++i) {
      libxsmm_blasint j;
      for (j = 0; j < m; ++j) {
        a[i*ldi+j] = initial_value(i, j, m);
      }
    }

    if (0 != check) { /* repeatable (reference) */
      srand(RAND_SEED);
    }
    else { /* randomized selection */
      srand(libxsmm_timer_tick() % ((unsigned int)-1));
    }
    for (k = (0 == r ? -1 : 0); k < s && EXIT_SUCCESS == result; ++k) {
      if (0 < r) {
        const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0;
        km = randstart(LIBXSMM_ABS(lower), m);
        kldi = LIBXSMM_MAX(rldi, km);
        if (('o' == t || 'O' == t)) {
          const libxsmm_blasint rldo = 0 <= lower ? randstart(lower, ldo) : 0;
          kn = randstart(LIBXSMM_ABS(lower), n);
          kldo = LIBXSMM_MAX(rldo, kn);
          /* trigger JIT-generated code */
          OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo);
        }
        else {
#if 0 /* TODO: enable when in-place transpose is fully supported */
          kn = randstart(LIBXSMM_ABS(lower), n);
#else
          kn = km;
#endif
          kldo = kldi;
          /* trigger JIT-generated code */
          ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi);
        }
      }
      size += (size_t)(km * kn * sizeof(ELEM_TYPE));

      if (('o' == t || 'O' == t)) {
        if (0 == tasks) { /* library-internal parallelization */
          start = libxsmm_timer_tick();
#if defined(OTRANS_THREAD)
#         pragma omp parallel
          OTRANS_THREAD(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo, omp_get_thread_num(), omp_get_num_threads());
#else
          result = OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo);
#endif
          duration += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
        else { /* external parallelization */
          start = libxsmm_timer_tick();
#if defined(_OPENMP)
#         pragma omp parallel
#         pragma omp single nowait
#endif
          result = OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo);
          duration += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
      }
      else {
        assert(('i' == t || 'I' == t) && kldo == kldi);
        memcpy(b, a, (size_t)(kldi * kn * sizeof(ELEM_TYPE)));

        if (2 > tasks) { /* library-internal parallelization */
          start = libxsmm_timer_tick();
          result = ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi);
          duration += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
        else { /* external parallelization */
          start = libxsmm_timer_tick();
#if defined(_OPENMP)
#         pragma omp parallel
#         pragma omp single
#endif
          result = ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi);
          duration += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
      }
      if (0 != check) { /* check */
        for (i = 0; i < km; ++i) {
          libxsmm_blasint j;
          for (j = 0; j < kn; ++j) {
            const ELEM_TYPE u = b[i*kldo+j];
            const ELEM_TYPE v = a[j*kldi+i];
            if (LIBXSMM_NEQ(u, v)) {
              i += km; /* leave outer loop as well */
              result = EXIT_FAILURE;
              break;
            }
          }
        }
      }
    }

#if defined(USE_REFERENCE)
    if (0 < check) { /* check shall imply reference (performance-)test */
      srand(RAND_SEED); /* reproduce the same sequence as above */
      for (k = (0 == r ? -1 : 0); k < s && EXIT_SUCCESS == result; ++k) {
        if (0 < r) {
          const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0;
          km = randstart(LIBXSMM_ABS(lower), m);
          kldi = LIBXSMM_MAX(rldi, km);
          if (('o' == t || 'O' == t)) {
            const libxsmm_blasint rldo = 0 <= lower ? randstart(lower, ldo) : 0;
            kn = randstart(LIBXSMM_ABS(lower), n);
            kldo = LIBXSMM_MAX(rldo, kn);
          }
          else {
#if 0 /* TODO: enable when in-place transpose is fully supported */
            kn = randstart(LIBXSMM_ABS(lower), n);
#else
            kn = km;
#endif
            kldo = kldi;
          }
        }

        if (('o' == t || 'O' == t)) {
          start = libxsmm_timer_tick();
          OTRANS_GOLD(&km, &kn, a, &kldi, b, &kldo);
          duration2 += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
        else {
          assert(('i' == t || 'I' == t) && kldo == kldi);
          memcpy(b, a, (size_t)(kldi * kn * sizeof(ELEM_TYPE)));
          start = libxsmm_timer_tick();
          ITRANS_GOLD(&km, &kn, b, &kldi, &kldo);
          duration2 += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
        if (1 < check || 0 > check) { /* check */
          for (i = 0; i < km; ++i) {
            libxsmm_blasint j;
            for (j = 0; j < kn; ++j) {
              const ELEM_TYPE u = b[i*kldo+j];
              const ELEM_TYPE v = a[j*kldi+i];
              if (LIBXSMM_NEQ(u, v)) {
                i += km; /* leave outer loop as well */
                result = EXIT_FAILURE;
                break;
              }
            }
          }
        }
      }
    }
#endif

    if (EXIT_SUCCESS == result) {
      const double d = libxsmm_timer_duration(0, duration);
      if (0 < duration) {
        /* out-of-place transpose bandwidth assumes RFO */
        fprintf(stdout, "\tbandwidth: %.1f GB/s\n", size
          * ((('o' == t || 'O' == t)) ? 3 : 2) / (d * (1 << 30)));
      }
      if (0 == lower) {
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * (d / (0 == r ? (s + 1) : s)));
      }
      else {
        fprintf(stdout, "\tduration: %f ms\n", 1000.0 * d);
      }
#if defined(USE_REFERENCE)
      if (0 < duration2) {
        fprintf(stdout, "\treference: %.1fx\n", (1.0 * duration) / duration2);
      }
#endif
    }
    else if (0 != check) { /* check */
      fprintf(stderr, "Error: validation failed for m=%lli, n=%lli, ldi=%lli, and ldo=%lli!\n",
        (long long)km, (long long)kn, (long long)kldi, (long long)kldo);
    }

    libxsmm_free(a);
    libxsmm_free(b);
  }
  return result;
}