/**
 * Weak definition of the Fortran-style DGEMM symbol.
 *
 * The transpose flags are derived from transa/transb on top of LIBXSMM_FLAGS:
 * 'N'/'n' clears the respective LIBXSMM_GEMM_FLAG_TRANS_* bit, any other
 * character sets it, and a NULL pointer leaves the default untouched.
 * The next definition of the same symbol is looked up once via
 * dlsym(RTLD_NEXT, ...), cached in a function-local static, and handed to
 * LIBXSMM_XGEMM together with the arguments.
 *
 * NULL is accepted for alpha/beta (LIBXSMM_ALPHA/LIBXSMM_BETA are used) and for
 * lda/ldb/ldc (the dimension selected by LIBXSMM_LD is used instead).
 * m, n, k, a, b, and c must not be NULL (asserted).
 */
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE LIBXSMM_ATTRIBUTE(weak)
void LIBXSMM_FSYMBOL(dgemm)(
  const char* transa, const char* transb,
  const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
  const double* alpha, const double* a, const libxsmm_blasint* lda,
  const double* b, const libxsmm_blasint* ldb,
  const double* beta, double* c, const libxsmm_blasint* ldc)
{
  typedef void (*function_type)(
    const char*, const char*,
    const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*,
    const double*, const double*, const libxsmm_blasint*,
    const double*, const libxsmm_blasint*,
    const double*, double*, const libxsmm_blasint*);
  /* the union converts the object pointer returned by dlsym into a function pointer */
  static LIBXSMM_RETARGETABLE union { function_type fn; void* pv; } next_dgemm = { 0 };
  int flags = LIBXSMM_FLAGS;

  if (0 != transa) {
    if ('N' == *transa || 'n' == *transa) {
      flags = (flags & ~LIBXSMM_GEMM_FLAG_TRANS_A);
    }
    else {
      flags = (flags | LIBXSMM_GEMM_FLAG_TRANS_A);
    }
  }
  if (0 != transb) {
    if ('N' == *transb || 'n' == *transb) {
      flags = (flags & ~LIBXSMM_GEMM_FLAG_TRANS_B);
    }
    else {
      flags = (flags | LIBXSMM_GEMM_FLAG_TRANS_B);
    }
  }

  /* resolve the next dgemm symbol only once */
  if (0 == next_dgemm.pv) {
    next_dgemm.pv = dlsym(RTLD_NEXT, LIBXSMM_STRINGIFY(LIBXSMM_FSYMBOL(dgemm)));
  }

  assert(m && n && k && a && b && c);
  {
    /* scalars fall back to the library defaults when not supplied */
    const double alpha_value = (0 != alpha ? *alpha : ((double)LIBXSMM_ALPHA));
    const double beta_value = (0 != beta ? *beta : ((double)LIBXSMM_BETA));

    LIBXSMM_XGEMM(double, libxsmm_blasint, next_dgemm.fn, flags, *m, *n, *k,
      alpha_value, a, *(lda ? lda : LIBXSMM_LD(m, k)),
      b, *(ldb ? ldb : LIBXSMM_LD(k, n)),
      beta_value, c, *(ldc ? ldc : LIBXSMM_LD(m, n)));
  }
}
LIBXSMM_API_DEFINITION void libxsmm_gemm_configure(int archid, int prefetch) { int config = 0; LIBXSMM_UNUSED(prefetch); internal_gemm_prefetch = LIBXSMM_PREFETCH_AL2_AHEAD; internal_gemm_nt = 2; internal_gemm = 2; { /* behaviour of libxsmm_omp_?gemm routines or LD_PRELOAD ?GEMM routines * 0: sequential below-threshold routine (no OpenMP); may fall-back to BLAS, * 1: OpenMP-parallelized but without internal parallel region, * 2: OpenMP-parallelized with internal parallel region" ) */ const char *const env = getenv("LIBXSMM_GEMM"); if (0 != env && 0 != *env) { internal_gemm = atoi(env); } } #if defined(LIBXSMM_EXT_GEMM_TASKS) { /* consider user input about using (OpenMP-)tasks; this code must be here * because maybe only this translation unit is compiled with OpenMP support */ const char *const env_tasks = getenv("LIBXSMM_TASKS"); if (0 != env_tasks && 0 != *env_tasks) { internal_gemm_tasks = atoi(env_tasks); } } #endif #if defined(__MIC__) LIBXSMM_UNUSED(archid); #else if (LIBXSMM_X86_AVX512_MIC == archid) #endif { internal_gemm_nt = 4; config = 1; } { /* attempt to setup tile sizes from the environment (LIBXSMM_M, LIBXSMM_N, and LIBXSMM_K) */ const int tile_configs[/*configs*/][2/*DP/SP*/][3/*TILE_M,TILE_N,TILE_K*/] = { { { 72, 32, 16 }, { 72, 32, 16 } }, /*generic*/ { { 72, 32, 16 }, { 72, 32, 16 } } /*knl*/ }; const char* env[3]; env[0] = getenv("LIBXSMM_M"); env[1] = getenv("LIBXSMM_N"); env[2] = getenv("LIBXSMM_K"); internal_gemm_tile[0/*DP*/][0/*M*/] = (env[0] ? atoi(env[0]) : 0); internal_gemm_tile[0/*DP*/][1/*N*/] = (env[1] ? atoi(env[1]) : 0); internal_gemm_tile[0/*DP*/][2/*K*/] = (env[2] ? 
atoi(env[2]) : 0); /* environment-defined tile sizes applies for DP and SP */ internal_gemm_tile[1/*SP*/][0/*M*/] = internal_gemm_tile[0/*DP*/][0]; internal_gemm_tile[1/*SP*/][1/*N*/] = internal_gemm_tile[0/*DP*/][1]; internal_gemm_tile[1/*SP*/][2/*K*/] = internal_gemm_tile[0/*DP*/][2]; /* load predefined configuration if tile size is not setup by the environment */ if (0 >= internal_gemm_tile[0/*DP*/][0/*M*/]) internal_gemm_tile[0][0] = tile_configs[config][0][0]; if (0 >= internal_gemm_tile[0/*DP*/][1/*N*/]) internal_gemm_tile[0][1] = tile_configs[config][0][1]; if (0 >= internal_gemm_tile[0/*DP*/][2/*K*/]) internal_gemm_tile[0][2] = tile_configs[config][0][2]; if (0 >= internal_gemm_tile[1/*SP*/][0/*M*/]) internal_gemm_tile[1][0] = tile_configs[config][1][0]; if (0 >= internal_gemm_tile[1/*SP*/][1/*N*/]) internal_gemm_tile[1][1] = tile_configs[config][1][1]; if (0 >= internal_gemm_tile[1/*SP*/][2/*K*/]) internal_gemm_tile[1][2] = tile_configs[config][1][2]; } #if defined(__STATIC) && defined(LIBXSMM_BUILD) && !defined(__CYGWIN__) && \ !(defined(__APPLE__) && defined(__MACH__) /*&& defined(__clang__)*/) if (0 == libxsmm_original_sgemm) { libxsmm_original_sgemm = LIBXSMM_FSYMBOL(__real_sgemm); } #endif #if !defined(__BLAS) || (0 != __BLAS) if (0 == libxsmm_original_sgemm) { libxsmm_original_sgemm = LIBXSMM_FSYMBOL(sgemm); } #endif #if defined(LIBXSMM_RTLD_NEXT) if (0 == libxsmm_original_sgemm) { union { const void* pv; libxsmm_sgemm_function pf; } gemm = { NULL }; gemm.pv = dlsym(RTLD_NEXT, LIBXSMM_STRINGIFY(LIBXSMM_FSYMBOL(sgemm))); libxsmm_original_sgemm = gemm.pf; } #endif #if defined(__STATIC) && defined(LIBXSMM_BUILD) && !defined(__CYGWIN__) && \ !(defined(__APPLE__) && defined(__MACH__) /*&& defined(__clang__)*/) if (0 == libxsmm_original_dgemm) { libxsmm_original_dgemm = LIBXSMM_FSYMBOL(__real_dgemm); } #endif #if !defined(__BLAS) || (0 != __BLAS) if (0 == libxsmm_original_dgemm) { libxsmm_original_dgemm = LIBXSMM_FSYMBOL(dgemm); } #endif #if 
defined(LIBXSMM_RTLD_NEXT) if (0 == libxsmm_original_dgemm) { union { const void* pv; libxsmm_dgemm_function pf; } gemm = { NULL }; gemm.pv = dlsym(RTLD_NEXT, LIBXSMM_STRINGIFY(LIBXSMM_FSYMBOL(dgemm))); libxsmm_original_dgemm = gemm.pf; } #endif }
int main(int argc, char* argv[]) { const char t = (char)(1 < argc ? *argv[1] : 'o'); const libxsmm_blasint m = (2 < argc ? atoi(argv[2]) : 4096); #if 0 /* TODO: enable when in-place transpose is fully supported */ const libxsmm_blasint n = (3 < argc ? atoi(argv[3]) : m); #else const libxsmm_blasint n = (3 < argc ? (('o' == t || 'O' == t) ? atoi(argv[3]) : m) : m); #endif const libxsmm_blasint ldi = LIBXSMM_MAX/*sanitize ld*/(4 < argc ? atoi(argv[4]) : 0, m); const libxsmm_blasint ldo = LIBXSMM_MAX/*sanitize ld*/(5 < argc ? atoi(argv[5]) : 0, n); const int r = (6 < argc ? atoi(argv[6]) : 0), s = LIBXSMM_ABS(r); const libxsmm_blasint lower = (7 < argc ? atoi(argv[7]) : 0); libxsmm_blasint km = m, kn = n, kldi = ldi, kldo = (('o' == t || 'O' == t) ? ldo : ldi); int result = EXIT_SUCCESS, k; if (0 == strchr("oOiI", t)) { fprintf(stderr, "%s [<transpose-kind:o|i>] [<m>] [<n>] [<ld-in>] [<ld-out>] [random:0|nruns] [lbound]\n", argv[0]); exit(EXIT_FAILURE); } #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { const char *const env_tasks = getenv("TASKS"), *const env_check = getenv("CHECK"); const int tasks = (0 == env_tasks || 0 == *env_tasks) ? 0/*default*/ : atoi(env_tasks); const int check = (0 == env_check || 0 == *env_check) ? 1/*default*/ : atoi(env_check); ELEM_TYPE *const a = (ELEM_TYPE*)libxsmm_malloc((size_t)(ldi * (('o' == t || 'O' == t) ? n : ldo) * sizeof(ELEM_TYPE))); ELEM_TYPE *const b = (ELEM_TYPE*)libxsmm_malloc((size_t)(ldo * (('o' == t || 'O' == t) ? 
m : ldi) * sizeof(ELEM_TYPE))); libxsmm_timer_tickint start, duration = 0; #if defined(USE_REFERENCE) /* benchmark against a reference */ libxsmm_timer_tickint duration2 = 0; #endif libxsmm_blasint i; size_t size = 0; #if defined(MKL_ENABLE_AVX512) mkl_enable_instructions(MKL_ENABLE_AVX512); #endif fprintf(stdout, "m=%lli n=%lli ldi=%lli ldo=%lli size=%.fMB (%s, %s)\n", (long long)m, (long long)n, (long long)ldi, (long long)ldo, 1.0 * (m * n * sizeof(ELEM_TYPE)) / (1 << 20), LIBXSMM_STRINGIFY(ELEM_TYPE), ('o' == t || 'O' == t) ? "out-of-place" : "in-place"); #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < n; ++i) { libxsmm_blasint j; for (j = 0; j < m; ++j) { a[i*ldi+j] = initial_value(i, j, m); } } if (0 != check) { /* repeatable (reference) */ srand(RAND_SEED); } else { /* randomized selection */ srand(libxsmm_timer_tick() % ((unsigned int)-1)); } for (k = (0 == r ? -1 : 0); k < s && EXIT_SUCCESS == result; ++k) { if (0 < r) { const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0; km = randstart(LIBXSMM_ABS(lower), m); kldi = LIBXSMM_MAX(rldi, km); if (('o' == t || 'O' == t)) { const libxsmm_blasint rldo = 0 <= lower ? 
randstart(lower, ldo) : 0; kn = randstart(LIBXSMM_ABS(lower), n); kldo = LIBXSMM_MAX(rldo, kn); /* trigger JIT-generated code */ OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo); } else { #if 0 /* TODO: enable when in-place transpose is fully supported */ kn = randstart(LIBXSMM_ABS(lower), n); #else kn = km; #endif kldo = kldi; /* trigger JIT-generated code */ ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi); } } size += (size_t)(km * kn * sizeof(ELEM_TYPE)); if (('o' == t || 'O' == t)) { if (0 == tasks) { /* library-internal parallelization */ start = libxsmm_timer_tick(); #if defined(OTRANS_THREAD) # pragma omp parallel OTRANS_THREAD(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo, omp_get_thread_num(), omp_get_num_threads()); #else result = OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo); #endif duration += libxsmm_timer_diff(start, libxsmm_timer_tick()); } else { /* external parallelization */ start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel # pragma omp single nowait #endif result = OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo); duration += libxsmm_timer_diff(start, libxsmm_timer_tick()); } } else { assert(('i' == t || 'I' == t) && kldo == kldi); memcpy(b, a, (size_t)(kldi * kn * sizeof(ELEM_TYPE))); if (2 > tasks) { /* library-internal parallelization */ start = libxsmm_timer_tick(); result = ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi); duration += libxsmm_timer_diff(start, libxsmm_timer_tick()); } else { /* external parallelization */ start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel # pragma omp single #endif result = ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi); duration += libxsmm_timer_diff(start, libxsmm_timer_tick()); } } if (0 != check) { /* check */ for (i = 0; i < km; ++i) { libxsmm_blasint j; for (j = 0; j < kn; ++j) { const ELEM_TYPE u = b[i*kldo+j]; const ELEM_TYPE v = a[j*kldi+i]; if (LIBXSMM_NEQ(u, v)) { i += km; /* leave outer loop as well */ result = EXIT_FAILURE; break; } } } } } #if 
defined(USE_REFERENCE) if (0 < check) { /* check shall imply reference (performance-)test */ srand(RAND_SEED); /* reproduce the same sequence as above */ for (k = (0 == r ? -1 : 0); k < s && EXIT_SUCCESS == result; ++k) { if (0 < r) { const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0; km = randstart(LIBXSMM_ABS(lower), m); kldi = LIBXSMM_MAX(rldi, km); if (('o' == t || 'O' == t)) { const libxsmm_blasint rldo = 0 <= lower ? randstart(lower, ldo) : 0; kn = randstart(LIBXSMM_ABS(lower), n); kldo = LIBXSMM_MAX(rldo, kn); } else { #if 0 /* TODO: enable when in-place transpose is fully supported */ kn = randstart(LIBXSMM_ABS(lower), n); #else kn = km; #endif kldo = kldi; } } if (('o' == t || 'O' == t)) { start = libxsmm_timer_tick(); OTRANS_GOLD(&km, &kn, a, &kldi, b, &kldo); duration2 += libxsmm_timer_diff(start, libxsmm_timer_tick()); } else { assert(('i' == t || 'I' == t) && kldo == kldi); memcpy(b, a, (size_t)(kldi * kn * sizeof(ELEM_TYPE))); start = libxsmm_timer_tick(); ITRANS_GOLD(&km, &kn, b, &kldi, &kldo); duration2 += libxsmm_timer_diff(start, libxsmm_timer_tick()); } if (1 < check || 0 > check) { /* check */ for (i = 0; i < km; ++i) { libxsmm_blasint j; for (j = 0; j < kn; ++j) { const ELEM_TYPE u = b[i*kldo+j]; const ELEM_TYPE v = a[j*kldi+i]; if (LIBXSMM_NEQ(u, v)) { i += km; /* leave outer loop as well */ result = EXIT_FAILURE; break; } } } } } } #endif if (EXIT_SUCCESS == result) { const double d = libxsmm_timer_duration(0, duration); if (0 < duration) { /* out-of-place transpose bandwidth assumes RFO */ fprintf(stdout, "\tbandwidth: %.1f GB/s\n", size * ((('o' == t || 'O' == t)) ? 3 : 2) / (d * (1 << 30))); } if (0 == lower) { fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * (d / (0 == r ? 
(s + 1) : s))); } else { fprintf(stdout, "\tduration: %f ms\n", 1000.0 * d); } #if defined(USE_REFERENCE) if (0 < duration2) { fprintf(stdout, "\treference: %.1fx\n", (1.0 * duration) / duration2); } #endif } else if (0 != check) { /* check */ fprintf(stderr, "Error: validation failed for m=%lli, n=%lli, ldi=%lli, and ldo=%lli!\n", (long long)km, (long long)kn, (long long)kldi, (long long)kldo); } libxsmm_free(a); libxsmm_free(b); } return result; }