LIBXSMM_API_DEFINITION void libxsmm_sgemm_omp(const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, const float* alpha, const float* a, const libxsmm_blasint* lda, const float* b, const libxsmm_blasint* ldb, const float* beta, float* c, const libxsmm_blasint* ldc) { LIBXSMM_GEMM_DESCRIPTOR_DIM_TYPE tm, tn, tk; LIBXSMM_GEMM_DECLARE_FLAGS(flags, transa, transb); LIBXSMM_INIT tm = libxsmm_gemm_tile[1/*SP*/][0/*M*/]; tn = libxsmm_gemm_tile[1/*SP*/][1/*N*/]; tk = libxsmm_gemm_tile[1/*SP*/][2/*K*/]; assert(0 < tm && 0 < tn && 0 < tk && 0 < libxsmm_nt); #if defined(_OPENMP) if (0 != libxsmm_mt) { /* enable OpenMP support */ if (0 == LIBXSMM_MOD2(libxsmm_mt, 2)) { /* even: enable internal parallelization */ # if defined(LIBXSMM_EXT_TASKS) if (0 == libxsmm_tasks) # endif { LIBXSMM_TILED_XGEMM(LIBXSMM_EXT_FOR_PARALLEL, LIBXSMM_NOOP, LIBXSMM_EXT_FOR_SINGLE, LIBXSMM_GEMM_COLLAPSE, LIBXSMM_EXT_FOR_LOOP, LIBXSMM_EXT_FOR_KERNEL, LIBXSMM_EXT_FOR_SYNC, LIBXSMM_EXT_MIN_NTASKS, LIBXSMM_EXT_OVERHEAD, libxsmm_nt, float, flags | LIBXSMM_GEMM_FLAG_F32PREC, tm, tn, tk, *m, *n, *k, 0 != alpha ? *alpha : ((float)LIBXSMM_ALPHA), a, *(lda ? lda : LIBXSMM_LD(m, k)), b, *(ldb ? ldb : LIBXSMM_LD(k, n)), 0 != beta ? *beta : ((float)LIBXSMM_BETA), c, *(ldc ? ldc : LIBXSMM_LD(m, n))); } # if defined(LIBXSMM_EXT_TASKS) else { LIBXSMM_TILED_XGEMM(LIBXSMM_EXT_TSK_PARALLEL, LIBXSMM_EXT_SINGLE, LIBXSMM_NOOP, LIBXSMM_GEMM_COLLAPSE, LIBXSMM_EXT_TSK_LOOP, LIBXSMM_EXT_TSK_KERNEL_VARS, LIBXSMM_NOOP, LIBXSMM_EXT_MIN_NTASKS, LIBXSMM_EXT_OVERHEAD, libxsmm_nt, float, flags | LIBXSMM_GEMM_FLAG_F32PREC, tm, tn, tk, *m, *n, *k, 0 != alpha ? *alpha : ((float)LIBXSMM_ALPHA), a, *(lda ? lda : LIBXSMM_LD(m, k)), b, *(ldb ? ldb : LIBXSMM_LD(k, n)), 0 != beta ? *beta : ((float)LIBXSMM_BETA), c, *(ldc ? ldc : LIBXSMM_LD(m, n))); } # endif }
int main(int argc, char* argv[]) { const int insize = (1 < argc ? atoi(argv[1]) : 0); const int incrmt = (2 < argc ? atoi(argv[2]) : 0); const int nelems = (3 < argc ? atoi(argv[3]) : 0); const int niters = (4 < argc ? atoi(argv[4]) : 1); const int elsize = (0 >= insize ? LIBXSMM_DESCRIPTOR_SIGSIZE : insize); const int stride = (0 >= incrmt ? LIBXSMM_MAX(LIBXSMM_DESCRIPTOR_MAXSIZE, elsize) : LIBXSMM_MAX(incrmt, elsize)); const size_t n = (0 >= nelems ? (((size_t)2 << 30/*2 GB*/) / stride) : ((size_t)nelems)); unsigned char *input, *icopy = NULL, *ilast = NULL; int result = EXIT_SUCCESS; size_t nbytes, size, nrpt; if (0 < niters) { size = n; nrpt = niters; } else { size = LIBXSMM_MAX(LIBXSMM_ABS(niters), 1); nrpt = n; } nbytes = size * stride; input = (unsigned char*)(0 != nbytes ? malloc(nbytes) : NULL); if (NULL != input) { unsigned char *const ref = input + (size - 1) * stride; /* last item */ libxsmm_timer_tickint start; size_t i, j = 0; /* initialize the input data */ for (i = 0; i < nbytes; ++i) input[i] = LIBXSMM_MOD2(i, 128); for (i = 0; i < (size_t)elsize; ++i) ref[i] = 255; { /* benchmark libxsmm_diff_n */ #if defined(USE_HASH) const unsigned int hashref = libxsmm_hash(ref, elsize, 0/*seed*/); #endif start = libxsmm_timer_tick(); for (i = 0; i < nrpt; ++i) { #if !defined(USE_HASH) j = libxsmm_diff_n(ref, input, (unsigned char)elsize, (unsigned char)stride, (unsigned int)LIBXSMM_MIN(i, size)/*hint*/, (unsigned int)size); #else const unsigned char* tst = input; for (j = 0; j < size; ++j) { const unsigned int hashtst = libxsmm_hash(tst, elsize, 0/*seed*/); if (hashref == hashtst && 0 == libxsmm_diff(ref, tst, (unsigned char)elsize)) { break; } tst += stride; } #endif } printf("libxsmm_diff_n:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick())); } if (size == (j + 1) && 0 == memcmp(ref, input + j * stride, elsize)) { /* benchmark libxsmm_memcmp */ icopy = (unsigned char*)(elsize == stride ? malloc(nbytes) : NULL); if (NULL != icopy) { ilast = icopy + (size - 1) * stride; /* last item */ memcpy(icopy, input, nbytes); start = libxsmm_timer_tick(); for (i = 0; i < nrpt; ++i) { j += libxsmm_memcmp(input, icopy, nbytes); /* take result of every execution */ /* memcmp may be pure and without touching input it is not repeated (nrpt) */ ilast[i%elsize] = 255; } printf("libxsmm_memcmp:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick())); result += (int)j * ((int)stride / ((int)stride + 1)); /* ignore result */ } } else { result = EXIT_FAILURE; } if (NULL != icopy) { /* benchmark stdlib's memcmp */ LIBXSMM_ASSERT(NULL != ilast); start = libxsmm_timer_tick(); for (i = 0; i < nrpt; ++i) { j += memcmp(input, icopy, nbytes); /* take result of every execution */ /* memcmp is likely pure and without touching input it is not repeated (nrpt) */ ilast[i%elsize] = 255; } printf("stdlib memcmp:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick())); result += (int)j * ((int)stride / ((int)stride + 1)); /* ignore result */ free(icopy); } free(input); } else { result = EXIT_FAILURE; } return result; }