/**
 * Performs the one-time initialization of the library and returns the code registry.
 *
 * The body runs under mutual exclusion: either all internal registry locks are
 * acquired (default), or an OpenMP critical section is entered (LIBXSMM_OPENMP).
 * If the registry is not published yet, the function
 *  - sets the target architecture (environment variable LIBXSMM_TARGET),
 *  - selects the prefetch strategy (environment variable LIBXSMM_PREFETCH),
 *  - initializes the hash, GEMM-diff, and GEMM sub-components (and the tracer if __TRACE is defined),
 *  - allocates the registry and the registry keys (LIBXSMM_REGSIZE entries each),
 *  - reads the verbosity (environment variable LIBXSMM_VERBOSE),
 *  - optionally registers the statically generated kernels,
 *  - registers libxsmm_finalize with atexit, and publishes the registry.
 *
 * @return The registry, or 0 (NULL) if a sub-component failed to initialize or if
 *         the allocation failed (debug builds assert in this case).
 */
LIBXSMM_INLINE LIBXSMM_RETARGETABLE internal_code_type* internal_init(void)
{
  /*const*/internal_code_type* result;
  int i;
#if !defined(LIBXSMM_OPENMP)
# if !defined(LIBXSMM_NOSYNC)
  /* 1: locks are not initialized, 2: initialization in progress, 0: locks are ready */
  static int internal_reglock_check = 1; /* setup the locks in a thread-safe fashion */
  assert(sizeof(internal_reglock) == (INTERNAL_REGLOCK_COUNT * sizeof(*internal_reglock)));
  if (1 == LIBXSMM_ATOMIC_LOAD(&internal_reglock_check, LIBXSMM_ATOMIC_SEQ_CST)) {
    LIBXSMM_ATOMIC_ADD_FETCH(&internal_reglock_check, 1, LIBXSMM_ATOMIC_SEQ_CST);
    /* NOTE(review): the counter is re-read non-atomically; presumably only the thread which
     * incremented from 1 to 2 is meant to initialize the locks - confirm for concurrent callers. */
    if (2 == internal_reglock_check) {
      for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_INIT(internal_reglock + i);
      LIBXSMM_ATOMIC_STORE_ZERO(&internal_reglock_check, LIBXSMM_ATOMIC_SEQ_CST);
    }
  }
  while (0 != internal_reglock_check); /* wait until locks are initialized */
  for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_ACQUIRE(internal_reglock + i);
# endif
#else
# pragma omp critical(internal_reglock)
#endif
  {
    result = LIBXSMM_ATOMIC_LOAD(&internal_registry, LIBXSMM_ATOMIC_SEQ_CST);
    if (0 == result) { /* registry is not published yet: perform the initialization */
      int init_code;
      /* set internal_target_archid */
      libxsmm_set_target_arch(getenv("LIBXSMM_TARGET"));
      { /* select prefetch strategy for JIT */
        const char *const env_prefetch = getenv("LIBXSMM_PREFETCH");
        if (0 == env_prefetch || 0 == *env_prefetch) {
#if (0 > LIBXSMM_PREFETCH) /* permitted by LIBXSMM_PREFETCH_AUTO */
          internal_prefetch = (LIBXSMM_X86_AVX512_MIC != internal_target_archid
            ? LIBXSMM_PREFETCH_NONE : LIBXSMM_PREFETCH_AL2BL2_VIA_C);
#else
          internal_prefetch = LIBXSMM_MAX(INTERNAL_PREFETCH, 0);
#endif
        }
        else { /* user input considered even if LIBXSMM_PREFETCH_AUTO is disabled */
          switch (atoi(env_prefetch)) {
            case 2: internal_prefetch = LIBXSMM_PREFETCH_SIGONLY; break;
            case 3: internal_prefetch = LIBXSMM_PREFETCH_BL2_VIA_C; break;
            case 4: internal_prefetch = LIBXSMM_PREFETCH_AL2; break;
            case 5: internal_prefetch = LIBXSMM_PREFETCH_AL2_AHEAD; break;
            case 6: internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C; break;
            case 7: internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_AHEAD; break;
            case 8: internal_prefetch = LIBXSMM_PREFETCH_AL2_JPST; break;
            case 9: internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_JPST; break;
            default: internal_prefetch = LIBXSMM_PREFETCH_NONE;
          }
        }
      }
      libxsmm_hash_init(internal_target_archid);
      libxsmm_gemm_diff_init(internal_target_archid);
      init_code = libxsmm_gemm_init(internal_target_archid, internal_prefetch);
#if defined(__TRACE)
      /* a failure of the GEMM initialization must not be overwritten by the tracer's result */
      if (EXIT_SUCCESS == init_code) {
        /* LIBXSMM_TRACE is parsed as "<threadid>,<mindepth>,<maxnsyms>" */
        int filter_threadid = 0, filter_mindepth = 1, filter_maxnsyms = 0;
        const char *const env_trace_init = getenv("LIBXSMM_TRACE");
        if (0 != env_trace_init && 0 != *env_trace_init) {
          char buffer[32];
          /* the field width excludes the terminator: at most sizeof(buffer) - 1 characters */
          if (1 == sscanf(env_trace_init, "%31[^,],", buffer)) {
            sscanf(buffer, "%i", &filter_threadid);
          }
          if (1 == sscanf(env_trace_init, "%*[^,],%31[^,],", buffer)) {
            sscanf(buffer, "%i", &filter_mindepth);
          }
          if (1 == sscanf(env_trace_init, "%*[^,],%*[^,],%31s", buffer)) {
            sscanf(buffer, "%i", &filter_maxnsyms);
          }
          else {
            filter_maxnsyms = -1; /* all */
          }
        }
        init_code = libxsmm_trace_init(filter_threadid - 1, filter_mindepth, filter_maxnsyms);
      }
#endif
      if (EXIT_SUCCESS == init_code) {
        assert(0 == internal_registry_keys && 0 == internal_registry); /* should never happen */
        result = (internal_code_type*)libxsmm_malloc(LIBXSMM_REGSIZE * sizeof(internal_code_type));
        internal_registry_keys = (internal_regkey_type*)libxsmm_malloc(LIBXSMM_REGSIZE * sizeof(internal_regkey_type));
        if (0 != result && 0 != internal_registry_keys) {
          const char *const env_verbose = getenv("LIBXSMM_VERBOSE");
          /* rounded cube root of LIBXSMM_MAX_MNK */
          internal_statistic_mnk = (unsigned int)(pow((double)(LIBXSMM_MAX_MNK), 0.3333333333333333) + 0.5);
          internal_statistic_sml = 13;
          internal_statistic_med = 23;
          if (0 != env_verbose && 0 != *env_verbose) {
            internal_verbose_mode = atoi(env_verbose);
          }
#if !defined(NDEBUG)
          else {
            internal_verbose_mode = 1; /* quiet -> verbose */
          }
#endif
          for (i = 0; i < LIBXSMM_REGSIZE; ++i) result[i].pmm = 0;
          /* omit registering code if JIT is enabled and if an ISA extension is found
           * which is beyond the static code path used to compile the library
           */
#if defined(LIBXSMM_BUILD)
# if (0 != LIBXSMM_JIT) && !defined(__MIC__)
          /* check if target arch. permits execution (arch. may be overridden) */
          if (LIBXSMM_STATIC_TARGET_ARCH <= internal_target_archid &&
             (LIBXSMM_X86_AVX > internal_target_archid /* jit is not available */
              /* condition allows to avoid JIT (if static code is good enough) */
              || LIBXSMM_STATIC_TARGET_ARCH == internal_target_archid))
# endif
          { /* opening a scope for eventually declaring variables */
            /* setup the dispatch table for the statically generated code */
#           include <libxsmm_dispatch.h>
          }
#endif
          atexit(libxsmm_finalize);
          /* publish the registry only after it is completely set up */
          LIBXSMM_ATOMIC_STORE(&internal_registry, result, LIBXSMM_ATOMIC_SEQ_CST);
        }
        else {
#if !defined(NDEBUG) && defined(__TRACE) /* library code is expected to be mute */
          fprintf(stderr, "LIBXSMM: failed to allocate code registry!\n");
#endif
          libxsmm_free(internal_registry_keys);
          libxsmm_free(result);
          /* do not keep or return pointers to released memory */
          internal_registry_keys = 0;
          result = 0;
        }
      }
#if !defined(NDEBUG) && defined(__TRACE) /* library code is expected to be mute */
      else {
        fprintf(stderr, "LIBXSMM: failed to initialize sub-component (error #%i)!\n", init_code);
      }
#endif
    }
  }
#if !defined(LIBXSMM_OPENMP) && !defined(LIBXSMM_NOSYNC) /* release locks */
  for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_RELEASE(internal_reglock + i);
#endif
  assert(result);
  return result;
}
int main(void) { const libxsmm_blasint m[] = { 1, 1, 1, 1, 2, 3, 5, 5, 5, 16, 63, 16, 75, 2507 }; const libxsmm_blasint n[] = { 1, 7, 7, 7, 2, 3, 1, 1, 1, 16, 31, 500, 130, 1975 }; const libxsmm_blasint ldi[] = { 1, 1, 1, 9, 2, 3, 5, 8, 8, 16, 64, 16, 87, 3000 }; const libxsmm_blasint ldo[] = { 1, 7, 8, 8, 2, 3, 1, 1, 4, 16, 32, 512, 136, 3072 }; const int start = 0, ntests = sizeof(m) / sizeof(*m); libxsmm_blasint max_size_a = 0, max_size_b = 0; unsigned int nerrors = 0; ELEM_TYPE *a = 0, *b = 0; int test; for (test = start; test < ntests; ++test) { const libxsmm_blasint size_a = ldi[test] * n[test], size_b = ldo[test] * m[test]; assert(m[test] <= ldi[test] && n[test] <= ldo[test]); max_size_a = LIBXSMM_MAX(max_size_a, size_a); max_size_b = LIBXSMM_MAX(max_size_b, size_b); } a = (ELEM_TYPE*)libxsmm_malloc((size_t)(max_size_a * sizeof(ELEM_TYPE))); b = (ELEM_TYPE*)libxsmm_malloc((size_t)(max_size_b * sizeof(ELEM_TYPE))); assert(0 != a && 0 != b); LIBXSMM_MATINIT(ELEM_TYPE, 42, a, max_size_a, 1, max_size_a, 1.0); LIBXSMM_MATINIT(ELEM_TYPE, 0, b, max_size_b, 1, max_size_b, 1.0); for (test = start; test < ntests; ++test) { unsigned int testerrors = (EXIT_SUCCESS == libxsmm_otrans( b, a, sizeof(ELEM_TYPE), m[test], n[test], ldi[test], ldo[test]) ? 0 : 1); if (0 == testerrors) { libxsmm_blasint i, j; for (i = 0; i < n[test]; ++i) { for (j = 0; j < m[test]; ++j) { const libxsmm_blasint u = i * ldi[test] + j; const libxsmm_blasint v = j * ldo[test] + i; testerrors += (LIBXSMM_FEQ(a[u], b[v]) ? 
0u : 1u); } } } if (nerrors < testerrors) { nerrors = testerrors; } } if (0 == nerrors) { /* previous results are correct and may be used to validate other tests */ for (test = start; test < ntests; ++test) { /* prepare expected results in b (correct according to the previous test block) */ libxsmm_otrans(b, a, sizeof(ELEM_TYPE), m[test], n[test], ldi[test], ldo[test]); if (m[test] == n[test] && ldi[test] == ldo[test]) { unsigned int testerrors = (EXIT_SUCCESS == libxsmm_otrans( a, a, sizeof(ELEM_TYPE), m[test], n[test], ldi[test], ldo[test]) ? 0 : 1); if (0 == testerrors) { libxsmm_blasint i, j; for (i = 0; i < n[test]; ++i) { for (j = 0; j < m[test]; ++j) { /* address serves both a and b since ldi and ldo are equal */ const libxsmm_blasint uv = i * ldi[test] + j; testerrors += (LIBXSMM_FEQ(a[uv], b[uv]) ? 0u : 1u); } } } if (nerrors < testerrors) { nerrors = testerrors; } } else { /* negative tests */ nerrors = LIBXSMM_MAX(EXIT_SUCCESS != libxsmm_otrans( a, a, sizeof(ELEM_TYPE), m[test], n[test], ldi[test], ldo[test]) ? 0u : 1u, nerrors); } } } libxsmm_free(a); libxsmm_free(b); if (0 == nerrors) { return EXIT_SUCCESS; } else { # if defined(_DEBUG) fprintf(stderr, "errors=%u\n", nerrors); # endif return EXIT_FAILURE; } }
int main(int argc, char* argv[]) { LIBXSMM_GEMM_CONST libxsmm_blasint m = (1 < argc ? atoi(argv[1]) : 1024); LIBXSMM_GEMM_CONST libxsmm_blasint k = (3 < argc ? atoi(argv[3]) : m); LIBXSMM_GEMM_CONST libxsmm_blasint n = (2 < argc ? atoi(argv[2]) : k); const libxsmm_blasint bm = (4 < argc ? atoi(argv[4]) : 32); const libxsmm_blasint bk = (6 < argc ? atoi(argv[6]) : bm); const libxsmm_blasint bn = (5 < argc ? atoi(argv[5]) : bk); const libxsmm_bgemm_order order = (libxsmm_bgemm_order)(7 < argc ? atoi(argv[7]) : 0); const int nrepeat = (8 < argc ? atoi(argv[8]) : 100); const libxsmm_blasint b_m1 = (9 < argc ? atoi(argv[9]) : 1); const libxsmm_blasint b_n1 = (10 < argc ? atoi(argv[10]) : 1); const libxsmm_blasint b_k1 = (11 < argc ? atoi(argv[11]) : 1); const libxsmm_blasint b_k2 = (12 < argc ? atoi(argv[12]) : 1); const int ab = (13 < argc ? atoi(argv[13]) : 0); LIBXSMM_GEMM_CONST libxsmm_blasint lda = (14 < argc ? atoi(argv[13]) : m); LIBXSMM_GEMM_CONST libxsmm_blasint ldb = (15 < argc ? atoi(argv[14]) : k); LIBXSMM_GEMM_CONST libxsmm_blasint ldc = (16 < argc ? atoi(argv[15]) : m); LIBXSMM_GEMM_CONST char transa = 'N', transb = 'N'; /* no transposes */ LIBXSMM_GEMM_CONST ITYPE alpha = 1, beta = 1; const int gemm_flags = LIBXSMM_GEMM_FLAGS(transa, transb); const double gflops = 2.0 * m * n * k * 1E-9; int result = EXIT_SUCCESS; #if defined(CHECK) const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 
0 : atof(env_check)); #endif if (argc > 1 && !strncmp(argv[1], "-h", 3)) { /* check command line */ printf("\nUsage: ./bgemm [M] [N] [K] [bm] [bn] [bk] [order] [reps] [b_m1] [b_n1] [b_k1] [b_k2] [verbose]\n\n"); return result; } MYASSERT(m % b_m1 == 0); MYASSERT(n % b_n1 == 0); MYASSERT(k % b_k1 == 0); MYASSERT(m/b_m1 % bm == 0); MYASSERT(n/b_n1 % bn == 0); MYASSERT(k/b_k1/b_k2 % bk == 0); #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { ITYPE* agold = (ITYPE*)libxsmm_malloc((size_t)(lda * k * sizeof(ITYPE))); ITYPE* bgold = (ITYPE*)libxsmm_malloc((size_t)(ldb * n * sizeof(ITYPE))); ITYPE* cgold = (ITYPE*)libxsmm_malloc((size_t)(ldc * n * sizeof(ITYPE))); ITYPE* a = (ITYPE*)libxsmm_malloc((size_t)(m * k * sizeof(ITYPE))); ITYPE* b = (ITYPE*)libxsmm_malloc((size_t)(k * n * sizeof(ITYPE))); ITYPE* c = (ITYPE*)libxsmm_malloc((size_t)(m * n * sizeof(ITYPE))); libxsmm_bgemm_handle* handle = 0; unsigned long long start; double duration; handle = libxsmm_bgemm_handle_create( LIBXSMM_GEMM_PRECISION(ITYPE), LIBXSMM_GEMM_PRECISION(ITYPE), m, n, k, &bm, &bn, &bk, &b_m1, &b_n1, &b_k1, &b_k2, &alpha, &beta, &gemm_flags, NULL/*auto-prefetch*/, &order); if (0 != handle) { LIBXSMM_MATINIT(ITYPE, 42, agold, m, k, lda, 1.0); LIBXSMM_MATINIT(ITYPE, 24, bgold, k, n, ldb, 1.0); LIBXSMM_MATINIT(ITYPE, 0, cgold, m, n, ldc, 1.0); libxsmm_bgemm_copyin_a(handle, agold, &lda, a); libxsmm_bgemm_copyin_b(handle, bgold, &ldb, b); libxsmm_bgemm_copyin_c(handle, cgold, &ldc, c); #if defined(MKL_ENABLE_AVX512) mkl_enable_instructions(MKL_ENABLE_AVX512); #endif /* warm-up OpenMP (populate thread pool) */ libxsmm_bgemm_omp(handle, a, b, c, 1); #if defined(CHECK) if (!LIBXSMM_FEQ(0, check)) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, agold, &lda, bgold, &ldb, &beta, cgold, &ldc); } #endif if (!ab) { libxsmm_gemm_print(stdout, LIBXSMM_GEMM_PRECISION(ITYPE), &transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); 
fprintf(stdout, "\n\n"); } start = libxsmm_timer_tick(); libxsmm_bgemm_omp(handle, a, b, c, nrepeat); duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < duration) { if (ab) { fprintf(stdout, "\tLIBXSMM: %.1f GFLOPS/s | %lli,%lli,%lli,%lli,%lli,%lli,%i,%lli,%lli,%lli,%lli\n", gflops * nrepeat / duration, (long long)m, (long long)n, (long long)k, (long long)bm, (long long)bn, (long long)bk, (int)order, (long long)b_m1, (long long)b_n1, (long long)b_k1, (long long)b_k2); } else { fprintf(stdout, "\tLIBXSMM: %.1f GFLOPS/s\n", gflops * nrepeat / duration); } } #if defined(CHECK) if (!LIBXSMM_FEQ(0, check)) { /* validate result against LAPACK/BLAS xGEMM */ ITYPE* ctest = 0; int i; start = libxsmm_timer_tick(); for (i = 0; i < nrepeat; ++i) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, agold, &lda, bgold, &ldb, &beta, cgold, &ldc); } duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < duration) { fprintf(stdout, "\tBLAS: %.1f GFLOPS/s\n", gflops * nrepeat / duration); } /* free memory not needed further; avoid double-free later on */ libxsmm_free(agold); agold = 0; libxsmm_free(bgold); bgold = 0; libxsmm_free(a); a = 0; libxsmm_free(b); b = 0; /* allocate C-matrix in regular format, and perform copy-out */ ctest = (ITYPE*)libxsmm_malloc((size_t)(ldc * n * sizeof(ITYPE))); if (0 != ctest) { libxsmm_matdiff_info diff; libxsmm_bgemm_copyout_c(handle, c, &ldc, ctest); if (EXIT_SUCCESS == libxsmm_matdiff(LIBXSMM_DATATYPE(ITYPE), m, n, cgold, ctest, &ldc, &ldc, &diff)) { fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); if (check < 100.0 * diff.normf_rel) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); result = EXIT_FAILURE; } } libxsmm_free(ctest); } } #endif libxsmm_bgemm_handle_destroy(handle); } else { fprintf(stderr, "FAILED to create BGEMM-handle! 
For details retry with LIBXSMM_VERBOSE=1.\n"); result = EXIT_FAILURE; } libxsmm_free(agold); libxsmm_free(bgold); libxsmm_free(cgold); libxsmm_free(a); libxsmm_free(b); libxsmm_free(c); } if(!ab) { fprintf(stdout, "Finished\n"); } return result; }
int main(int argc, char* argv[]) { const char t = (char)(1 < argc ? *argv[1] : 'o'); const libxsmm_blasint m = (2 < argc ? atoi(argv[2]) : 4096); #if 0 /* TODO: enable when in-place transpose is fully supported */ const libxsmm_blasint n = (3 < argc ? atoi(argv[3]) : m); #else const libxsmm_blasint n = (3 < argc ? (('o' == t || 'O' == t) ? atoi(argv[3]) : m) : m); #endif const libxsmm_blasint ldi = LIBXSMM_MAX/*sanitize ld*/(4 < argc ? atoi(argv[4]) : 0, m); const libxsmm_blasint ldo = LIBXSMM_MAX/*sanitize ld*/(5 < argc ? atoi(argv[5]) : 0, n); const int r = (6 < argc ? atoi(argv[6]) : 0), s = LIBXSMM_ABS(r); const libxsmm_blasint lower = (7 < argc ? atoi(argv[7]) : 0); libxsmm_blasint km = m, kn = n, kldi = ldi, kldo = (('o' == t || 'O' == t) ? ldo : ldi); int result = EXIT_SUCCESS, k; if (0 == strchr("oOiI", t)) { fprintf(stderr, "%s [<transpose-kind:o|i>] [<m>] [<n>] [<ld-in>] [<ld-out>] [random:0|nruns] [lbound]\n", argv[0]); exit(EXIT_FAILURE); } #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { const char *const env_tasks = getenv("TASKS"), *const env_check = getenv("CHECK"); const int tasks = (0 == env_tasks || 0 == *env_tasks) ? 0/*default*/ : atoi(env_tasks); const int check = (0 == env_check || 0 == *env_check) ? 1/*default*/ : atoi(env_check); ELEM_TYPE *const a = (ELEM_TYPE*)libxsmm_malloc((size_t)(ldi * (('o' == t || 'O' == t) ? n : ldo) * sizeof(ELEM_TYPE))); ELEM_TYPE *const b = (ELEM_TYPE*)libxsmm_malloc((size_t)(ldo * (('o' == t || 'O' == t) ? 
m : ldi) * sizeof(ELEM_TYPE))); libxsmm_timer_tickint start, duration = 0; #if defined(USE_REFERENCE) /* benchmark against a reference */ libxsmm_timer_tickint duration2 = 0; #endif libxsmm_blasint i; size_t size = 0; #if defined(MKL_ENABLE_AVX512) mkl_enable_instructions(MKL_ENABLE_AVX512); #endif fprintf(stdout, "m=%lli n=%lli ldi=%lli ldo=%lli size=%.fMB (%s, %s)\n", (long long)m, (long long)n, (long long)ldi, (long long)ldo, 1.0 * (m * n * sizeof(ELEM_TYPE)) / (1 << 20), LIBXSMM_STRINGIFY(ELEM_TYPE), ('o' == t || 'O' == t) ? "out-of-place" : "in-place"); #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < n; ++i) { libxsmm_blasint j; for (j = 0; j < m; ++j) { a[i*ldi+j] = initial_value(i, j, m); } } if (0 != check) { /* repeatable (reference) */ srand(RAND_SEED); } else { /* randomized selection */ srand(libxsmm_timer_tick() % ((unsigned int)-1)); } for (k = (0 == r ? -1 : 0); k < s && EXIT_SUCCESS == result; ++k) { if (0 < r) { const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0; km = randstart(LIBXSMM_ABS(lower), m); kldi = LIBXSMM_MAX(rldi, km); if (('o' == t || 'O' == t)) { const libxsmm_blasint rldo = 0 <= lower ? 
randstart(lower, ldo) : 0; kn = randstart(LIBXSMM_ABS(lower), n); kldo = LIBXSMM_MAX(rldo, kn); /* trigger JIT-generated code */ OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo); } else { #if 0 /* TODO: enable when in-place transpose is fully supported */ kn = randstart(LIBXSMM_ABS(lower), n); #else kn = km; #endif kldo = kldi; /* trigger JIT-generated code */ ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi); } } size += (size_t)(km * kn * sizeof(ELEM_TYPE)); if (('o' == t || 'O' == t)) { if (0 == tasks) { /* library-internal parallelization */ start = libxsmm_timer_tick(); #if defined(OTRANS_THREAD) # pragma omp parallel OTRANS_THREAD(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo, omp_get_thread_num(), omp_get_num_threads()); #else result = OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo); #endif duration += libxsmm_timer_diff(start, libxsmm_timer_tick()); } else { /* external parallelization */ start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel # pragma omp single nowait #endif result = OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo); duration += libxsmm_timer_diff(start, libxsmm_timer_tick()); } } else { assert(('i' == t || 'I' == t) && kldo == kldi); memcpy(b, a, (size_t)(kldi * kn * sizeof(ELEM_TYPE))); if (2 > tasks) { /* library-internal parallelization */ start = libxsmm_timer_tick(); result = ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi); duration += libxsmm_timer_diff(start, libxsmm_timer_tick()); } else { /* external parallelization */ start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel # pragma omp single #endif result = ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi); duration += libxsmm_timer_diff(start, libxsmm_timer_tick()); } } if (0 != check) { /* check */ for (i = 0; i < km; ++i) { libxsmm_blasint j; for (j = 0; j < kn; ++j) { const ELEM_TYPE u = b[i*kldo+j]; const ELEM_TYPE v = a[j*kldi+i]; if (LIBXSMM_NEQ(u, v)) { i += km; /* leave outer loop as well */ result = EXIT_FAILURE; break; } } } } } #if 
defined(USE_REFERENCE) if (0 < check) { /* check shall imply reference (performance-)test */ srand(RAND_SEED); /* reproduce the same sequence as above */ for (k = (0 == r ? -1 : 0); k < s && EXIT_SUCCESS == result; ++k) { if (0 < r) { const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0; km = randstart(LIBXSMM_ABS(lower), m); kldi = LIBXSMM_MAX(rldi, km); if (('o' == t || 'O' == t)) { const libxsmm_blasint rldo = 0 <= lower ? randstart(lower, ldo) : 0; kn = randstart(LIBXSMM_ABS(lower), n); kldo = LIBXSMM_MAX(rldo, kn); } else { #if 0 /* TODO: enable when in-place transpose is fully supported */ kn = randstart(LIBXSMM_ABS(lower), n); #else kn = km; #endif kldo = kldi; } } if (('o' == t || 'O' == t)) { start = libxsmm_timer_tick(); OTRANS_GOLD(&km, &kn, a, &kldi, b, &kldo); duration2 += libxsmm_timer_diff(start, libxsmm_timer_tick()); } else { assert(('i' == t || 'I' == t) && kldo == kldi); memcpy(b, a, (size_t)(kldi * kn * sizeof(ELEM_TYPE))); start = libxsmm_timer_tick(); ITRANS_GOLD(&km, &kn, b, &kldi, &kldo); duration2 += libxsmm_timer_diff(start, libxsmm_timer_tick()); } if (1 < check || 0 > check) { /* check */ for (i = 0; i < km; ++i) { libxsmm_blasint j; for (j = 0; j < kn; ++j) { const ELEM_TYPE u = b[i*kldo+j]; const ELEM_TYPE v = a[j*kldi+i]; if (LIBXSMM_NEQ(u, v)) { i += km; /* leave outer loop as well */ result = EXIT_FAILURE; break; } } } } } } #endif if (EXIT_SUCCESS == result) { const double d = libxsmm_timer_duration(0, duration); if (0 < duration) { /* out-of-place transpose bandwidth assumes RFO */ fprintf(stdout, "\tbandwidth: %.1f GB/s\n", size * ((('o' == t || 'O' == t)) ? 3 : 2) / (d * (1 << 30))); } if (0 == lower) { fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * (d / (0 == r ? 
(s + 1) : s))); } else { fprintf(stdout, "\tduration: %f ms\n", 1000.0 * d); } #if defined(USE_REFERENCE) if (0 < duration2) { fprintf(stdout, "\treference: %.1fx\n", (1.0 * duration) / duration2); } #endif } else if (0 != check) { /* check */ fprintf(stderr, "Error: validation failed for m=%lli, n=%lli, ldi=%lli, and ldo=%lli!\n", (long long)km, (long long)kn, (long long)kldi, (long long)kldo); } libxsmm_free(a); libxsmm_free(b); } return result; }