int main(int argc, char* argv[]) { int result = EXIT_SUCCESS; try { const libxsmm_blasint benchmark = 1 < argc ? std::atoi(argv[1]) : 0; LIBXSMM_GEMM_CONST libxsmm_blasint m = (2 < argc ? std::atoi(argv[2]) : 23); LIBXSMM_GEMM_CONST libxsmm_blasint k = (4 < argc ? std::atoi(argv[4]) : m); LIBXSMM_GEMM_CONST libxsmm_blasint n = (3 < argc ? std::atoi(argv[3]) : k); const libxsmm_blasint q = (5 < argc ? std::atoi(argv[5]) : 0/*auto*/); const libxsmm_blasint nrepeat = (6 < argc ? std::atoi(argv[6]) : (0 >= q ? 13 : 1)); LIBXSMM_GEMM_CONST libxsmm_blasint lda = m, ldb = k, ldc = m; LIBXSMM_GEMM_CONST char transa = 'N', transb = 'N'; LIBXSMM_GEMM_CONST OTYPE alpha = 1, beta = 1; const libxsmm_blasint asize = PAD(ITYPE, lda * k), bsize = PAD(ITYPE, ldb * n), csize = PAD(OTYPE, ldc * n); const libxsmm_blasint max_size = ((2ULL << 30/*2 GB*/) / ((asize + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE))); const libxsmm_blasint s = LIBXSMM_MIN(0 < q ? q : max_size, max_size); const libxsmm_blasint aspace = LIBXSMM_ALIGNMENT / sizeof(ITYPE); const size_t bwsize = static_cast<size_t>((asize/*load*/ + bsize/*load*/) * sizeof(ITYPE) + 2/*RFO*/ * csize * sizeof(OTYPE)); const double gflops = 2E-9 * s * m * n * k; #if LIBXSMM_TYPEINFO(ITYPE, FP) const char *const ops = "FLOPS"; const double scale = 1.0 / s; #else const char *const ops = "OPS"; const double scale = 1; #endif #if !defined(_DEBUG) const char *const env_check = getenv("CHECK"); const int check = (0 == env_check ? 0 : atoi(env_check)); #else /*const*/ int check = 1; #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { #if defined(_OPENMP) const libxsmm_blasint chunksize = s / omp_get_max_threads(); #endif struct raii { // avoid std::vector (first-touch init. causes NUMA issue) ITYPE *a, *b; OTYPE *c, *d; libxsmm_blasint *m_shuffle; raii(libxsmm_blasint asize_, libxsmm_blasint bsize_, libxsmm_blasint csize_, libxsmm_blasint size_) : a(new ITYPE[static_cast<size_t>(asize_)]), b(new ITYPE[static_cast<size_t>(bsize_)]) , c(new OTYPE[static_cast<size_t>(csize_)]), d(new OTYPE[static_cast<size_t>(csize_)]) , m_shuffle(new libxsmm_blasint[size_]) { # if defined(_OPENMP) # pragma omp parallel for schedule(static) # endif for (libxsmm_blasint i = 0; i < size_; ++i) m_shuffle[i] = libxsmm_rand_u32(size_); } ~raii() { delete[] a; delete[] b; delete[] c; delete[] d; delete[] m_shuffle; } #if defined(RANDOMIZED) libxsmm_blasint shuffle(libxsmm_blasint i) const { return m_shuffle[i]; } #else libxsmm_blasint shuffle(libxsmm_blasint i) const { return i; } #endif } helper(s * asize + aspace - 1, s * bsize + aspace - 1, s * csize + aspace - 1, s); ITYPE *const a = LIBXSMM_ALIGN(helper.a, LIBXSMM_ALIGNMENT); ITYPE *const b = LIBXSMM_ALIGN(helper.b, LIBXSMM_ALIGNMENT); OTYPE *const c = LIBXSMM_ALIGN(helper.c, LIBXSMM_ALIGNMENT); OTYPE *const d = LIBXSMM_ALIGN(helper.d, LIBXSMM_ALIGNMENT); #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_MATINIT(ITYPE, 42 + helper.shuffle(i), a + helper.shuffle(i) * asize, m, k, lda, scale); LIBXSMM_MATINIT(ITYPE, 24 + helper.shuffle(i), b + helper.shuffle(i) * bsize, k, n, ldb, scale); LIBXSMM_MATINIT(OTYPE, 22 + i, c + i * csize, m, n, ldc, scale); LIBXSMM_MATINIT(OTYPE, 22 + i, d + i * csize, m, n, ldc, scale); } #if defined(MKL_ENABLE_AVX512) mkl_enable_instructions(MKL_ENABLE_AVX512); #endif // initialize LIBXSMM libxsmm_init(); fprintf(stdout, "m=%lli n=%lli k=%lli size=%lli memory=%.1f MB (input=%s output=%s)\n\n", static_cast<long long>(m), static_cast<long long>(n), static_cast<long long>(k), static_cast<long long>(s), 1.0 * (s * ((asize + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE))) / (1 << 20), LIBXSMM_TYPENAME(ITYPE), LIBXSMM_TYPENAME(OTYPE)); // LAPACK/BLAS3 (warm-up BLAS Library) #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, a + helper.shuffle(i) * asize, &lda, b + helper.shuffle(i) * bsize, &ldb, &beta, c + i * csize, &ldc); } #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION) std::vector<const ITYPE*> va_array(static_cast<size_t>(s)), vb_array(static_cast<size_t>(s)); std::vector<OTYPE*> vc_array(static_cast<size_t>(s)); const ITYPE* *const a_array = &va_array[0]; const ITYPE* *const b_array = &vb_array[0]; OTYPE* *const c_array = &vc_array[0]; const libxsmm_blasint group_count = 1; for (libxsmm_blasint i = 0; i < s; ++i) { // setup batched (A,B,C) a_array[i] = a + helper.shuffle(i) * asize; b_array[i] = b + helper.shuffle(i) * bsize; c_array[i] = d + i * csize; } // additional warm-up (also to eventually match the Gold result) LIBXSMM_TPREFIX(ITYPE,gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); #endif switch (benchmark) { case 0: { // batched fprintf(stdout, "Batched (A,B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, a + helper.shuffle(i) * asize, &lda, b + helper.shuffle(i) * bsize, &ldb, &beta, c + i * csize, &ldc); } } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * bwsize / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION) case 1: { // batched indirect fprintf(stdout, "Indirect (A,B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { LIBXSMM_TPREFIX(ITYPE,gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * bwsize / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); if (0 == benchmark) { /* Gold result is available */ libxsmm_matdiff_info diff; memset(&diff, 0, sizeof(diff)); for (libxsmm_blasint h = 0; h < s; ++h) { const OTYPE *const u = c + h * csize, *const v = c_array[h]; libxsmm_matdiff_info dv; if (EXIT_SUCCESS == libxsmm_matdiff(LIBXSMM_DATATYPE(OTYPE), m, n, u, v, &ldc, &ldc, &dv)) { libxsmm_matdiff_reduce(&diff, &dv); } } if (0 < diff.normf_rel) fprintf(stdout, "\tdiff: %.0f%%\n", 100.0 * diff.normf_rel); } } #endif break; case 2: { // streaming A and C fprintf(stdout, "Streamed (A,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, a + helper.shuffle(i) * asize, &lda, b, &ldb, &beta, c + i * csize, &ldc); } } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - bsize * sizeof(ITYPE)) / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION) case 3: { // indirect A and C fprintf(stdout, "Indirect (A,C)...\n"); for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a + helper.shuffle(i) * asize; b_array[i] = b; c_array[i] = d + i * csize; } const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - bsize * sizeof(ITYPE)) / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } #endif break; case 4: { // streaming B and C fprintf(stdout, "Streamed (B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b + helper.shuffle(i) * bsize, &ldb, &beta, c + i * csize, &ldc); } } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - asize * sizeof(ITYPE)) / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION) case 5: { // indirect B and C fprintf(stdout, "Indirect (B,C)...\n"); for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a; b_array[i] = b + helper.shuffle(i) * bsize; c_array[i] = d + i * csize; } const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - asize * sizeof(ITYPE)) / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } #endif break; case 6: { // streaming A and B fprintf(stdout, "Streamed (A,B)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ const libxsmm_blasint j = omp_get_thread_num() * chunksize * csize; #else const libxsmm_blasint j = 0; #endif LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, a + helper.shuffle(i) * asize, &lda, b + helper.shuffle(i) * bsize, &ldb, &beta, c + j, &ldc); } } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - 2 * csize * sizeof(OTYPE)) / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION) case 7: { // indirect A and B fprintf(stdout, "Indirect (A,B)...\n"); #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a + helper.shuffle(i) * asize; b_array[i] = b + helper.shuffle(i) * bsize; #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ c_array[i] = d + omp_get_thread_num() * chunksize * csize; #else c_array[i] = d; #endif } const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - 2 * csize * sizeof(OTYPE)) / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } #endif break; case 8: { // cached fprintf(stdout, "Cached...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ const libxsmm_blasint j = omp_get_thread_num() * chunksize * csize; #else const libxsmm_blasint j = 0; #endif LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c + j, &ldc); } } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION) case 9: { // indirect cached fprintf(stdout, "Indirect cached...\n"); #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a; b_array[i] = b; #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ c_array[i] = d + omp_get_thread_num() * chunksize * csize; #else c_array[i] = d; #endif } const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } #endif break; default: throw "invalid case selected!"; } /*switch*/ if (0 != check) { libxsmm_matdiff_info diff; if (EXIT_SUCCESS == libxsmm_matdiff(LIBXSMM_DATATYPE(OTYPE), m, n, 0 == (benchmark & 1) ? c : d, NULL, &ldc, &ldc, &diff)) { fprintf(stdout, "\tcheck: %f\n", diff.l1_ref); } } // finalize LIBXSMM libxsmm_finalize(); fprintf(stdout, "Finished\n"); } } catch(const std::exception& e) { fprintf(stderr, "Error: %s\n", e.what()); result = EXIT_FAILURE; } catch(const char* message) { fprintf(stderr, "Error: %s\n", message); result = EXIT_FAILURE; } catch(...) { fprintf(stderr, "Error: unknown exception caught!\n"); result = EXIT_FAILURE; } return result; }
int main(int argc, char* argv[]) { const libxsmm_blasint m = 1 < argc ? atoi(argv[1]) : 4096; const libxsmm_blasint n = 2 < argc ? atoi(argv[2]) : m; const libxsmm_blasint lda = LIBXSMM_MAX(3 < argc ? atoi(argv[3]) : 0, m); const libxsmm_blasint ldb = LIBXSMM_MAX(4 < argc ? atoi(argv[4]) : 0, n); REAL_TYPE *const a = (REAL_TYPE*)malloc(lda * n * sizeof(REAL_TYPE)); REAL_TYPE *const b = (REAL_TYPE*)malloc(ldb * m * sizeof(REAL_TYPE)); const unsigned int size = m * n * sizeof(REAL_TYPE); unsigned long long start; libxsmm_blasint i, j; double duration; fprintf(stdout, "m=%i n=%i lda=%i ldb=%i size=%.fMB (%s)\n", m, n, lda, ldb, 1.0 * size / (1 << 20), 8 == sizeof(REAL_TYPE) ? "DP" : "SP"); for (i = 0; i < n; ++i) { for (j = 0; j < m; ++j) { a[i*lda+j] = initial_value(i, j, lda); } } start = libxsmm_timer_tick(); libxsmm_transpose_oop(b, a, sizeof(REAL_TYPE), m, n, lda, ldb); libxsmm_transpose_oop(a, b, sizeof(REAL_TYPE), n, m, ldb, lda); duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); for (i = 0; i < n; ++i) { for (j = 0; j < m; ++j) { if (0 < fabs(a[i*lda+j] - initial_value(i, j, lda))) { i = n + 1; break; } } } if (i <= n) { if (0 < duration) { fprintf(stdout, "\tbandwidth: %.1f GB/s\n", size / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } else { fprintf(stderr, "Validation failed!\n"); } #if defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL) { double mkl_duration; start = libxsmm_timer_tick(); LIBXSMM_CONCATENATE(mkl_, LIBXSMM_TPREFIX(REAL_TYPE, omatcopy))('C', 'T', m, n, 1, a, lda, b, ldb); LIBXSMM_CONCATENATE(mkl_, LIBXSMM_TPREFIX(REAL_TYPE, omatcopy))('C', 'T', n, m, 1, b, ldb, a, lda); mkl_duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < mkl_duration) { fprintf(stdout, "\tMKL: %.1fx\n", duration / mkl_duration); } } #endif free(a); free(b); return EXIT_SUCCESS; }