/**
 * Single-precision GEMM entry point which, when compiled with OpenMP, hands the
 * operation to the tiled implementation (LIBXSMM_TILED_XGEMM).
 *
 * All arguments are passed by pointer. As the calls below show, NULL is
 * accepted for alpha and beta (LIBXSMM_ALPHA and LIBXSMM_BETA are substituted)
 * and for lda, ldb, and ldc (LIBXSMM_LD(m, k), LIBXSMM_LD(k, n), and
 * LIBXSMM_LD(m, n) are used instead). The pointers m, n, and k are
 * dereferenced unconditionally. transa and transb are consumed by
 * LIBXSMM_GEMM_DECLARE_FLAGS to form the flags of the operation.
 */
LIBXSMM_API_DEFINITION void libxsmm_sgemm_omp(const char* transa, const char* transb,
  const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
  const float* alpha, const float* a, const libxsmm_blasint* lda,
  const float* b, const libxsmm_blasint* ldb,
  const float* beta, float* c, const libxsmm_blasint* ldc)
{
  LIBXSMM_GEMM_DESCRIPTOR_DIM_TYPE tm, tn, tk;
  LIBXSMM_GEMM_DECLARE_FLAGS(flags, transa, transb);
  LIBXSMM_INIT
  /* tile extents are read from the single-precision entry of libxsmm_gemm_tile */
  tm = libxsmm_gemm_tile[1/*SP*/][0/*M*/];
  tn = libxsmm_gemm_tile[1/*SP*/][1/*N*/];
  tk = libxsmm_gemm_tile[1/*SP*/][2/*K*/];
  /* the tiled code below relies on positive tile extents and a positive libxsmm_nt */
  assert(0 < tm && 0 < tn && 0 < tk && 0 < libxsmm_nt);
#if defined(_OPENMP)
  if (0 != libxsmm_mt) { /* enable OpenMP support */
    if (0 == LIBXSMM_MOD2(libxsmm_mt, 2)) { /* even: enable internal parallelization */
# if defined(LIBXSMM_EXT_TASKS)
      if (0 == libxsmm_tasks)
# endif
      {
        /* libxsmm_tasks is zero, or LIBXSMM_EXT_TASKS is not defined:
         * instantiate the tiled GEMM from the LIBXSMM_EXT_FOR_* building blocks
         */
        LIBXSMM_TILED_XGEMM(LIBXSMM_EXT_FOR_PARALLEL, LIBXSMM_NOOP, LIBXSMM_EXT_FOR_SINGLE,
          LIBXSMM_GEMM_COLLAPSE, LIBXSMM_EXT_FOR_LOOP, LIBXSMM_EXT_FOR_KERNEL, LIBXSMM_EXT_FOR_SYNC,
          LIBXSMM_EXT_MIN_NTASKS, LIBXSMM_EXT_OVERHEAD, libxsmm_nt,
          float, flags | LIBXSMM_GEMM_FLAG_F32PREC, tm, tn, tk, *m, *n, *k,
          0 != alpha ? *alpha : ((float)LIBXSMM_ALPHA),
          a, *(lda ? lda : LIBXSMM_LD(m, k)), b, *(ldb ? ldb : LIBXSMM_LD(k, n)),
          0 != beta ? *beta : ((float)LIBXSMM_BETA),
          c, *(ldc ? ldc : LIBXSMM_LD(m, n)));
      }
# if defined(LIBXSMM_EXT_TASKS)
      else {
        /* libxsmm_tasks is non-zero: same operands as above, but instantiated
         * from the LIBXSMM_EXT_TSK_* building blocks
         */
        LIBXSMM_TILED_XGEMM(LIBXSMM_EXT_TSK_PARALLEL, LIBXSMM_EXT_SINGLE, LIBXSMM_NOOP,
          LIBXSMM_GEMM_COLLAPSE, LIBXSMM_EXT_TSK_LOOP, LIBXSMM_EXT_TSK_KERNEL_VARS, LIBXSMM_NOOP,
          LIBXSMM_EXT_MIN_NTASKS, LIBXSMM_EXT_OVERHEAD, libxsmm_nt,
          float, flags | LIBXSMM_GEMM_FLAG_F32PREC, tm, tn, tk, *m, *n, *k,
          0 != alpha ? *alpha : ((float)LIBXSMM_ALPHA),
          a, *(lda ? lda : LIBXSMM_LD(m, k)), b, *(ldb ? ldb : LIBXSMM_LD(k, n)),
          0 != beta ? *beta : ((float)LIBXSMM_BETA),
          c, *(ldc ? ldc : LIBXSMM_LD(m, n)));
      }
# endif
    }
Exemple #2
0
/**
 * Benchmark driver: times the search for a reference item within a strided
 * buffer (libxsmm_diff_n, or libxsmm_hash plus libxsmm_diff if USE_HASH is
 * defined), and, if the items are densely packed (element size equals stride),
 * also times libxsmm_memcmp and the C library's memcmp on a copy of the buffer.
 *
 * Positional command line arguments (all optional):
 *   argv[1]  element size in bytes; values <= 0 select LIBXSMM_DESCRIPTOR_SIGSIZE.
 *   argv[2]  stride in bytes between items; values <= 0 select
 *            LIBXSMM_DESCRIPTOR_MAXSIZE. The stride is never smaller than the
 *            element size.
 *   argv[3]  number of items; values <= 0 select as many items as fit into 2 GB.
 *   argv[4]  number of repetitions (default: 1). If <= 0, the roles are swapped:
 *            the buffer consists of max(|argv[4]|, 1) items and the number of
 *            repetitions is taken from the number of items.
 *
 * Returns EXIT_SUCCESS if the reference item was found at the expected (last)
 * position. Returns EXIT_FAILURE if the arguments cannot be passed to
 * libxsmm_diff_n without truncation, if the buffer cannot be allocated, or if
 * the search did not deliver the expected position.
 */
int main(int argc, char* argv[])
{
  const int insize = (1 < argc ? atoi(argv[1]) : 0);
  const int incrmt = (2 < argc ? atoi(argv[2]) : 0);
  const int nelems = (3 < argc ? atoi(argv[3]) : 0);
  const int niters = (4 < argc ? atoi(argv[4]) : 1);
  const int elsize = (0 >= insize ? LIBXSMM_DESCRIPTOR_SIGSIZE : insize);
  const int stride = (0 >= incrmt ? LIBXSMM_MAX(LIBXSMM_DESCRIPTOR_MAXSIZE, elsize) : LIBXSMM_MAX(incrmt, elsize));
  const size_t n = (0 >= nelems ? (((size_t)2 << 30/*2 GB*/) / stride) : ((size_t)nelems));
  unsigned char *input, *icopy = NULL, *ilast = NULL;
  int result = EXIT_SUCCESS;
  size_t nbytes, size, nrpt;

#if !defined(USE_HASH)
  /* libxsmm_diff_n is called with element size and stride narrowed to unsigned char;
   * a value that does not survive the round-trip would silently benchmark
   * a different size or scan the wrong addresses, hence it is rejected upfront.
   */
  if (elsize != (int)((unsigned char)elsize) || stride != (int)((unsigned char)stride)) {
    fprintf(stderr, "ERROR: element size (%i) or stride (%i) exceeds the supported range!\n", elsize, stride);
    return EXIT_FAILURE;
  }
#endif

  if (0 < niters) { /* niters repetitions over n items */
    size = n;
    nrpt = niters;
  }
  else { /* swapped roles: |niters| items (at least one) and n repetitions */
    size = LIBXSMM_MAX(LIBXSMM_ABS(niters), 1);
    nrpt = n;
  }
  /* a product that wraps around is treated like an empty (failed) allocation */
  nbytes = (size <= (((size_t)-1) / (size_t)stride) ? (size * stride) : 0);
  input = (unsigned char*)(0 != nbytes ? malloc(nbytes) : NULL);

  if (NULL != input) {
    unsigned char *const ref = input + (size - 1) * stride; /* last item */
    libxsmm_timer_tickint start;
    size_t i, j = 0;

    /* initialize the input data: all bytes are below 128 except for the reference (last) item */
    for (i = 0; i < nbytes; ++i) input[i] = LIBXSMM_MOD2(i, 128);
    for (i = 0; i < (size_t)elsize; ++i) ref[i] = 255;

    { /* benchmark libxsmm_diff_n */
#if defined(USE_HASH)
      const unsigned int hashref = libxsmm_hash(ref, elsize, 0/*seed*/);
#endif
      start = libxsmm_timer_tick();
      for (i = 0; i < nrpt; ++i) {
#if !defined(USE_HASH)
        /* the casts are value-preserving (checked at the top of this function) */
        j = libxsmm_diff_n(ref, input, (unsigned char)elsize, (unsigned char)stride,
          (unsigned int)LIBXSMM_MIN(i, size)/*hint*/, (unsigned int)size);
#else
        const unsigned char* tst = input;
        for (j = 0; j < size; ++j) {
          const unsigned int hashtst = libxsmm_hash(tst, elsize, 0/*seed*/);
          /* NOTE(review): elsize is narrowed to unsigned char for libxsmm_diff (not range-checked in this path) */
          if (hashref == hashtst && 0 == libxsmm_diff(ref, tst, (unsigned char)elsize)) {
            break;
          }
          tst += stride;
        }
#endif
      }
      printf("libxsmm_diff_n:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick()));
    }

    /* the search is expected to deliver the index of the last item (which is the reference) */
    if (size == (j + 1) && 0 == memcmp(ref, input + j * stride, elsize)) { /* benchmark libxsmm_memcmp */
      /* the comparison of entire buffers is only performed for densely packed items */
      icopy = (unsigned char*)(elsize == stride ? malloc(nbytes) : NULL);
      if (NULL != icopy) {
        ilast = icopy + (size - 1) * stride; /* last item */
        memcpy(icopy, input, nbytes);
        start = libxsmm_timer_tick();
        for (i = 0; i < nrpt; ++i) {
          j += libxsmm_memcmp(input, icopy, nbytes); /* take result of every execution */
          /* memcmp may be pure and without touching input it is not repeated (nrpt) */
          ilast[i%elsize] = 255;
        }
        printf("libxsmm_memcmp:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick()));
        /* integer division yields zero: j is consumed without affecting the exit code */
        result += (int)j * ((int)stride / ((int)stride + 1)); /* ignore result */
      }
    }
    else {
      result = EXIT_FAILURE;
    }

    if (NULL != icopy) { /* benchmark stdlib's memcmp */
      LIBXSMM_ASSERT(NULL != ilast);
      start = libxsmm_timer_tick();
      for (i = 0; i < nrpt; ++i) {
        j += memcmp(input, icopy, nbytes); /* take result of every execution */
        /* memcmp is likely pure and without touching input it is not repeated (nrpt) */
        ilast[i%elsize] = 255;
      }
      printf("stdlib memcmp:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick()));
      /* integer division yields zero: j is consumed without affecting the exit code */
      result += (int)j * ((int)stride / ((int)stride + 1)); /* ignore result */
      free(icopy);
    }

    free(input);
  }
  else {
    result = EXIT_FAILURE;
  }

  return result;
}