void run_jit_float( const float* i_a, const float* i_b, float* o_c, const int i_M, const int i_N, const int i_K, const libxsmm_prefetch_type i_prefetch, const char* i_arch ) { /* define function pointer */ libxsmm_smmfunction l_test_jit; double l_jittime = 0.0, l_runtime = 0.0; float l_alpha = 1.0f; float l_beta = 1.0f; unsigned long long l_start; unsigned int l_t; if ( l_beta != 0.0f && l_beta != 1.0f ) { fprintf(stderr, "JIT float: beta needs to be 0.0 or 1.0!\n"); exit(-1); } if ( l_alpha != 1.0f ) { fprintf(stderr, "JIT float: alpha needs to be 1.0!\n"); exit(-1); } l_start = libxsmm_timer_tick(); l_test_jit = libxsmm_smmdispatch(i_M, i_N, i_K, &i_M, &i_K, &i_M, &l_alpha, &l_beta, NULL, &i_prefetch ); l_jittime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); printf("function pointer address: %llx\n", (size_t)l_test_jit); l_start = libxsmm_timer_tick(); if ( i_prefetch == LIBXSMM_PREFETCH_NONE ) { for ( l_t = 0; l_t < g_jit_code_reps; l_t++ ) { l_test_jit(i_a, i_b, o_c); } } else { for ( l_t = 0; l_t < g_jit_code_reps; l_t++ ) { l_test_jit(i_a, i_b, o_c, i_a, i_b, o_c); } } l_runtime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); printf("%fs for creating jit\n", l_jittime); printf("%fs for executing jit\n", l_runtime); printf("%f GFLOPS for jit\n", ((double)((double)g_jit_code_reps * (double)i_M * (double)i_N * (double)i_K) * 2.0) / (l_runtime * 1.0e9)); }
void run_gold_float( const float* i_a, const float* i_b, float* o_c, const libxsmm_gemm_descriptor* i_xgemm_desc ) { unsigned int l_m, l_n, l_k, l_t; double l_runtime = 0.0; const unsigned long long l_start = libxsmm_timer_tick(); for ( l_t = 0; l_t < g_jit_code_reps; l_t++ ) { for ( l_n = 0; l_n < i_xgemm_desc->n; l_n++ ) { for ( l_k = 0; l_k < i_xgemm_desc->k; l_k++ ) { for ( l_m = 0; l_m < i_xgemm_desc->m; l_m++ ) { o_c[(l_n * i_xgemm_desc->ldc) + l_m] += i_a[(l_k * i_xgemm_desc->lda) + l_m] * i_b[(l_n * i_xgemm_desc->ldb) + l_k]; } } } } l_runtime = libxsmm_timer_duration(l_start, libxsmm_timer_tick()); printf("%fs for C\n", l_runtime); printf("%f GFLOPS for C\n", ((double)((double)g_jit_code_reps * (double)i_xgemm_desc->m * (double)i_xgemm_desc->n * (double)i_xgemm_desc->k) * 2.0) / (l_runtime * 1.0e9)); }
int main(int argc, char* argv[]) { const int m = (1 < argc ? atoi(argv[1]) : 16); const int n = (2 < argc ? atoi(argv[2]) : m); const int unsigned ldi = LIBXSMM_MAX(3 < argc ? atoi(argv[3]) : 0, m); const int unsigned ldo = LIBXSMM_MAX(4 < argc ? atoi(argv[4]) : 0, m); const int unroll = (5 < argc ? atoi(argv[5]) : 1); const int prefetch = (6 < argc ? atoi(argv[6]) : 0); const int flags = ((7 < argc && 0 != atoi(argv[7])) ? LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE : 0); const int iters = (8 < argc ? atoi(argv[8]) : 1); /* we should modify to test all data-types */ const libxsmm_mcopy_descriptor* desc; libxsmm_xmcopyfunction kernel; libxsmm_descriptor_blob blob; libxsmm_timer_tickint l_start; libxsmm_timer_tickint l_end; int error = 0, i, j; ELEM_TYPE *a, *b; double copy_time; printf("This is a tester for JIT matcopy kernels!\n"); desc = libxsmm_mcopy_descriptor_init(&blob, sizeof(ELEM_TYPE), m, n, ldo, ldi, flags, prefetch, &unroll); a = (ELEM_TYPE*)malloc(n * ldi * sizeof(ELEM_TYPE)); b = (ELEM_TYPE*)malloc(n * ldo * sizeof(ELEM_TYPE)); for (i = 0; i < n; i++) { for (j = 0; j < m; j++) { a[j+ldi*i] = (ELEM_TYPE)rand(); if (0 != (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & flags)) { b[j+ldo*i] = (ELEM_TYPE)rand(); } } } /* test dispatch call */ kernel = libxsmm_dispatch_mcopy(desc); if (kernel == 0) { printf("JIT error -> exit!!!!\n"); exit(EXIT_FAILURE); } /* let's call */ kernel(a, &ldi, b, &ldo, &a[128]); l_start = libxsmm_timer_tick(); for (i = 0; i < iters; ++i) { kernel(a, &ldi, b, &ldo, &a[128]); } l_end = libxsmm_timer_tick(); copy_time = libxsmm_timer_duration(l_start, l_end); for (i = 0; i < n; ++i) { for (j = 0; j < m; ++j) { if (0 != (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & flags)) { if (LIBXSMM_NEQ(b[j+ldo*i], 0)) { printf("ERROR!!!\n"); i = n; error = 1; break; } } else if (LIBXSMM_NEQ(a[j+ldi*i], b[j+ldo*i])) { printf("ERROR!!!\n"); i = n; error = 1; break; } } } if (error == 0) { printf("CORRECT copy!!!!\n"); printf("Time taken is\t%.5f seconds\n", copy_time); } return 
EXIT_SUCCESS; }
int main(int argc, char* argv[]) { int M = ( argc == 7 ) ? atoi(argv[1]) : 9; int N = ( argc == 7 ) ? atoi(argv[2]) : 10; int K = ( argc == 7 ) ? atoi(argv[3]) : 9; unsigned int N_CRUNS = ( argc == 7 ) ? atoi(argv[4]) : 8; unsigned int REPS = ( argc == 7 ) ? atoi(argv[5]) : 1; char* l_csr_file = ( argc == 7 ) ? argv[6] : "file.csr"; const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE; const int flags = LIBXSMM_GEMM_FLAGS('N', 'N'); const REALTYPE alpha = 1, beta = 1; REALTYPE* l_a_de = (REALTYPE*)libxsmm_aligned_malloc(K * K * sizeof(REALTYPE), 64); REALTYPE* l_a_sp = NULL; REALTYPE* l_b = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS* sizeof(REALTYPE), 64); unsigned int* l_rowptr = NULL; unsigned int* l_colidx = NULL; unsigned int l_rowcount, l_colcount, l_elements; REALTYPE* l_c = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS * sizeof(REALTYPE), 64); REALTYPE* l_c_gold = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS * sizeof(REALTYPE), 64); REALTYPE* l_c_asm = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS * sizeof(REALTYPE), 64); REALTYPE l_max_error = 0.0; unsigned int l_k, l_n; int l_i, l_j, l_jj; LIBXSMM_VLA_DECL(3, REALTYPE, l_p_b, l_b, N, N_CRUNS); LIBXSMM_VLA_DECL(3, REALTYPE, l_p_c_asm, l_c_asm, N, N_CRUNS); LIBXSMM_VLA_DECL(3, REALTYPE, l_p_c_gold, l_c_gold, N, N_CRUNS); libxsmm_descriptor_blob l_xgemm_blob; const libxsmm_gemm_descriptor* l_xgemm_desc = 0; LIBXSMM_MMFUNCTION_TYPE(REALTYPE) mykernel = NULL; unsigned long long l_start, l_end; double l_total; if (argc != 7) { fprintf( stderr, "arguments: M #iters CSR-file!\n" ); return -1; } /* touch B */ for ( l_i = 0; l_i < K; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_k = 0; l_k < N_CRUNS; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_b, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)libxsmm_rand_f64(); } } } /* touch C */ for ( l_i = 0; l_i < K; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_k = 0; l_k < N_CRUNS; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, 
N_CRUNS) = (REALTYPE)0.0; LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)0.0; } } } /* read A, CSR */ libxsmm_sparse_csr_reader( l_csr_file, &l_rowptr, &l_colidx, &l_a_sp, &l_rowcount, &l_colcount, &l_elements ); /* copy b to dense */ printf("CSR matrix data structure we just read:\n"); printf("rows: %u, columns: %u, elements: %u\n", l_rowcount, l_colcount, l_elements); for ( l_n = 0; l_n < (((unsigned int)K) * K); l_n++) { l_a_de[l_n] = 0.0; } for ( l_n = 0; l_n < (unsigned int)K; l_n++) { const unsigned int l_rowelems = l_rowptr[l_n+1] - l_rowptr[l_n]; assert(l_rowptr[l_n+1] >= l_rowptr[l_n]); for ( l_k = 0; l_k < l_rowelems; l_k++) { l_a_de[(l_n * K) + l_colidx[l_rowptr[l_n] + l_k]] = l_a_sp[l_rowptr[l_n] + l_k]; } } /* dense routine */ l_start = libxsmm_timer_tick(); #if 1 for ( l_n = 0; l_n < REPS; l_n++) { for ( l_i = 0; l_i < K; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_jj = 0; l_jj < K; l_jj++) { LIBXSMM_PRAGMA_SIMD for (l_k = 0; l_k < N_CRUNS; l_k++) { LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) += l_a_de[(l_i*K)+l_jj] * LIBXSMM_VLA_ACCESS(3, l_p_b, l_jj, l_j, l_k, N, N_CRUNS); } } } } } #endif l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("%fs for dense\n", l_total); printf("%f GFLOPS for dense\n", ((double)((double)REPS * (double)K * (double)K * (double)N * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9)); l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION(REALTYPE), K, N, K, 0, N, N, alpha, beta, flags, prefetch); /* sparse routine */ #if defined(__EDGE_EXECUTE_F32__) mykernel = libxsmm_create_xcsr_soa( l_xgemm_desc, l_rowptr, l_colidx, (const void*)l_a_sp ).smm; #else mykernel = libxsmm_create_xcsr_soa( l_xgemm_desc, l_rowptr, l_colidx, (const void*)l_a_sp ).dmm; #endif l_start = libxsmm_timer_tick(); for ( l_n = 0; l_n < REPS; l_n++) { mykernel( l_a_sp, l_b, l_c_asm ); } l_end = libxsmm_timer_tick(); l_total = 
libxsmm_timer_duration(l_start, l_end); printf("%fs for sparse (asm)\n", l_total); printf("%f GFLOPS for sparse (asm)\n", ((double)((double)REPS * (double)K * (double)l_elements * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9)); /* check for errors */ l_max_error = (REALTYPE)0.0; for ( l_i = 0; l_i < K; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_k = 0; l_k < N_CRUNS; l_k++ ) { if (fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) - LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) ) > l_max_error ) { l_max_error = (REALTYPE)fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) -LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) ); } } } } printf("max error: %f\n", l_max_error); printf("PERFDUMP,%s,%u,%i,%i,%i,%u,%u,%f,%f,%f\n", l_csr_file, REPS, M, N, K, l_elements, K * l_elements * N_CRUNS * 2, l_max_error, l_total, ((double)((double)REPS * (double)K * (double)l_elements * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9) ); /* free */ libxsmm_free( l_a_de ); libxsmm_free( l_b ); libxsmm_free( l_c ); libxsmm_free( l_c_gold ); libxsmm_free( l_c_asm ); free( l_a_sp ); free( l_rowptr ); free( l_colidx ); return 0; }
/*
 * Benchmark driver: runs many small GEMMs through the BLAS symbol selected by
 * LIBXSMM_GEMM_SYMBOL(ITYPE) and, when built against MKL 11.3 or later, also
 * through the gemm_batch interface.
 *
 * Positional arguments (all optional):
 *   argv[1] benchmark  case selector for the switch below (default 0)
 *   argv[2] m          (default 23)
 *   argv[3] n          (default k)
 *   argv[4] k          (default m)
 *   argv[5] q          requested batch size; 0 or less means "auto" (default 0)
 *   argv[6] nrepeat    timed repetitions (default 13 if q <= 0, else 1)
 *
 * The batch size `s` is q clamped to max_size, where max_size is the number
 * of (A,B,C) triples that fit into 2 GB. lda = m, ldb = k, ldc = m and
 * alpha = beta = 1.
 *
 * The local `raii` struct owns the buffers a, b, c, d and a shuffle table.
 * Each buffer is over-allocated by (aspace - 1) elements and then aligned
 * with LIBXSMM_ALIGN. shuffle(i) returns the random table entry only when
 * RANDOMIZED is defined, otherwise it returns i.
 *
 * After initialisation, one untimed warm-up pass runs over c (and, with MKL,
 * one gemm_batch pass over d). The switch then selects:
 *   0: batched (A,B,C)     1: gemm_batch, indirect (A,B,C)
 *   2: streamed (A,C)      3: gemm_batch, indirect (A,C)
 *   4: streamed (B,C)      5: gemm_batch, indirect (B,C)
 *   6: streamed (A,B)      7: gemm_batch, indirect (A,B)
 *   8: cached              9: gemm_batch, indirect cached
 * Odd cases exist only when the MKL condition holds. In that build every even
 * case intentionally falls through into the following odd case. Case 1
 * additionally compares c against d when benchmark == 0. Cases 6 to 9 make
 * every iteration of a thread write the same C location, offset per thread by
 * chunksize * csize. Any other selector throws a string.
 *
 * If `check` is non-zero, libxsmm_matdiff is evaluated on the first m x n
 * matrix of c (even selector) or d (odd selector) and diff.l1_ref is printed.
 * `check` is read from the environment variable CHECK unless _DEBUG is
 * defined, in which case it is 1.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if any exception was caught.
 */
int main(int argc, char* argv[]) { int result = EXIT_SUCCESS; try { const libxsmm_blasint benchmark = 1 < argc ? std::atoi(argv[1]) : 0; LIBXSMM_GEMM_CONST libxsmm_blasint m = (2 < argc ? std::atoi(argv[2]) : 23); LIBXSMM_GEMM_CONST libxsmm_blasint k = (4 < argc ? std::atoi(argv[4]) : m); LIBXSMM_GEMM_CONST libxsmm_blasint n = (3 < argc ? std::atoi(argv[3]) : k); const libxsmm_blasint q = (5 < argc ? std::atoi(argv[5]) : 0/*auto*/); const libxsmm_blasint nrepeat = (6 < argc ? std::atoi(argv[6]) : (0 >= q ? 13 : 1)); LIBXSMM_GEMM_CONST libxsmm_blasint lda = m, ldb = k, ldc = m; LIBXSMM_GEMM_CONST char transa = 'N', transb = 'N'; LIBXSMM_GEMM_CONST OTYPE alpha = 1, beta = 1; const libxsmm_blasint asize = PAD(ITYPE, lda * k), bsize = PAD(ITYPE, ldb * n), csize = PAD(OTYPE, ldc * n); const libxsmm_blasint max_size = ((2ULL << 30/*2 GB*/) / ((asize + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE))); const libxsmm_blasint s = LIBXSMM_MIN(0 < q ? q : max_size, max_size); const libxsmm_blasint aspace = LIBXSMM_ALIGNMENT / sizeof(ITYPE); const size_t bwsize = static_cast<size_t>((asize/*load*/ + bsize/*load*/) * sizeof(ITYPE) + 2/*RFO*/ * csize * sizeof(OTYPE)); const double gflops = 2E-9 * s * m * n * k; #if LIBXSMM_TYPEINFO(ITYPE, FP) const char *const ops = "FLOPS"; const double scale = 1.0 / s; #else const char *const ops = "OPS"; const double scale = 1; #endif #if !defined(_DEBUG) const char *const env_check = getenv("CHECK"); const int check = (0 == env_check ? 0 : atoi(env_check)); #else /*const*/ int check = 1; #endif #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { #if defined(_OPENMP) const libxsmm_blasint chunksize = s / omp_get_max_threads(); #endif struct raii { // avoid std::vector (first-touch init. 
causes NUMA issue) ITYPE *a, *b; OTYPE *c, *d; libxsmm_blasint *m_shuffle; raii(libxsmm_blasint asize_, libxsmm_blasint bsize_, libxsmm_blasint csize_, libxsmm_blasint size_) : a(new ITYPE[static_cast<size_t>(asize_)]), b(new ITYPE[static_cast<size_t>(bsize_)]) , c(new OTYPE[static_cast<size_t>(csize_)]), d(new OTYPE[static_cast<size_t>(csize_)]) , m_shuffle(new libxsmm_blasint[size_]) { # if defined(_OPENMP) # pragma omp parallel for schedule(static) # endif for (libxsmm_blasint i = 0; i < size_; ++i) m_shuffle[i] = libxsmm_rand_u32(size_); } ~raii() { delete[] a; delete[] b; delete[] c; delete[] d; delete[] m_shuffle; } #if defined(RANDOMIZED) libxsmm_blasint shuffle(libxsmm_blasint i) const { return m_shuffle[i]; } #else libxsmm_blasint shuffle(libxsmm_blasint i) const { return i; } #endif } helper(s * asize + aspace - 1, s * bsize + aspace - 1, s * csize + aspace - 1, s); ITYPE *const a = LIBXSMM_ALIGN(helper.a, LIBXSMM_ALIGNMENT); ITYPE *const b = LIBXSMM_ALIGN(helper.b, LIBXSMM_ALIGNMENT); OTYPE *const c = LIBXSMM_ALIGN(helper.c, LIBXSMM_ALIGNMENT); OTYPE *const d = LIBXSMM_ALIGN(helper.d, LIBXSMM_ALIGNMENT); #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_MATINIT(ITYPE, 42 + helper.shuffle(i), a + helper.shuffle(i) * asize, m, k, lda, scale); LIBXSMM_MATINIT(ITYPE, 24 + helper.shuffle(i), b + helper.shuffle(i) * bsize, k, n, ldb, scale); LIBXSMM_MATINIT(OTYPE, 22 + i, c + i * csize, m, n, ldc, scale); LIBXSMM_MATINIT(OTYPE, 22 + i, d + i * csize, m, n, ldc, scale); } #if defined(MKL_ENABLE_AVX512) mkl_enable_instructions(MKL_ENABLE_AVX512); #endif // initialize LIBXSMM libxsmm_init(); fprintf(stdout, "m=%lli n=%lli k=%lli size=%lli memory=%.1f MB (input=%s output=%s)\n\n", static_cast<long long>(m), static_cast<long long>(n), static_cast<long long>(k), static_cast<long long>(s), 1.0 * (s * ((asize + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE))) / (1 << 20), LIBXSMM_TYPENAME(ITYPE), 
LIBXSMM_TYPENAME(OTYPE)); // LAPACK/BLAS3 (warm-up BLAS Library) #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, a + helper.shuffle(i) * asize, &lda, b + helper.shuffle(i) * bsize, &ldb, &beta, c + i * csize, &ldc); } #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION) std::vector<const ITYPE*> va_array(static_cast<size_t>(s)), vb_array(static_cast<size_t>(s)); std::vector<OTYPE*> vc_array(static_cast<size_t>(s)); const ITYPE* *const a_array = &va_array[0]; const ITYPE* *const b_array = &vb_array[0]; OTYPE* *const c_array = &vc_array[0]; const libxsmm_blasint group_count = 1; for (libxsmm_blasint i = 0; i < s; ++i) { // setup batched (A,B,C) a_array[i] = a + helper.shuffle(i) * asize; b_array[i] = b + helper.shuffle(i) * bsize; c_array[i] = d + i * csize; } // additional warm-up (also to eventually match the Gold result) LIBXSMM_TPREFIX(ITYPE,gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); #endif switch (benchmark) { case 0: { // batched fprintf(stdout, "Batched (A,B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, a + helper.shuffle(i) * asize, &lda, b + helper.shuffle(i) * bsize, &ldb, &beta, c + i * csize, &ldc); } } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, 
"\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * bwsize / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION) case 1: { // batched indirect fprintf(stdout, "Indirect (A,B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { LIBXSMM_TPREFIX(ITYPE,gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * bwsize / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); if (0 == benchmark) { /* Gold result is available */ libxsmm_matdiff_info diff; memset(&diff, 0, sizeof(diff)); for (libxsmm_blasint h = 0; h < s; ++h) { const OTYPE *const u = c + h * csize, *const v = c_array[h]; libxsmm_matdiff_info dv; if (EXIT_SUCCESS == libxsmm_matdiff(LIBXSMM_DATATYPE(OTYPE), m, n, u, v, &ldc, &ldc, &dv)) { libxsmm_matdiff_reduce(&diff, &dv); } } if (0 < diff.normf_rel) fprintf(stdout, "\tdiff: %.0f%%\n", 100.0 * diff.normf_rel); } } #endif break; case 2: { // streaming A and C fprintf(stdout, "Streamed (A,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, 
&transb, &m, &n, &k, &alpha, a + helper.shuffle(i) * asize, &lda, b, &ldb, &beta, c + i * csize, &ldc); } } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - bsize * sizeof(ITYPE)) / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION) case 3: { // indirect A and C fprintf(stdout, "Indirect (A,C)...\n"); for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a + helper.shuffle(i) * asize; b_array[i] = b; c_array[i] = d + i * csize; } const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - bsize * sizeof(ITYPE)) / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } #endif break; case 4: { // streaming B and C fprintf(stdout, "Streamed (B,C)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if 
defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b + helper.shuffle(i) * bsize, &ldb, &beta, c + i * csize, &ldc); } } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - asize * sizeof(ITYPE)) / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION) case 5: { // indirect B and C fprintf(stdout, "Indirect (B,C)...\n"); for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a; b_array[i] = b + helper.shuffle(i) * bsize; c_array[i] = d + i * csize; } const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - asize * sizeof(ITYPE)) / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } #endif break; case 6: { // streaming A and B 
fprintf(stdout, "Streamed (A,B)...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ const libxsmm_blasint j = omp_get_thread_num() * chunksize * csize; #else const libxsmm_blasint j = 0; #endif LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, a + helper.shuffle(i) * asize, &lda, b + helper.shuffle(i) * bsize, &ldb, &beta, c + j, &ldc); } } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - 2 * csize * sizeof(OTYPE)) / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION) case 7: { // indirect A and B fprintf(stdout, "Indirect (A,B)...\n"); #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a + helper.shuffle(i) * asize; b_array[i] = b + helper.shuffle(i) * bsize; #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ c_array[i] = d + omp_get_thread_num() * chunksize * csize; #else c_array[i] = d; #endif } const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); } const unsigned long 
long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - 2 * csize * sizeof(OTYPE)) / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } #endif break; case 8: { // cached fprintf(stdout, "Cached...\n"); const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ const libxsmm_blasint j = omp_get_thread_num() * chunksize * csize; #else const libxsmm_blasint j = 0; #endif LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c + j, &ldc); } } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } /* fallthrough */ #if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION) case 9: { // indirect cached fprintf(stdout, "Indirect cached...\n"); #if defined(_OPENMP) # pragma omp parallel for schedule(static) #endif for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a; b_array[i] = b; #if defined(_OPENMP) /* attempt to write to disjunct cachelines */ c_array[i] = d + omp_get_thread_num() * 
chunksize * csize; #else c_array[i] = d; #endif } const unsigned long long start = libxsmm_timer_tick(); for (libxsmm_blasint r = 0; r < nrepeat; ++r) { LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k, &alpha, &a_array[0], &lda, &b_array[0], &ldb, &beta, &c_array[0], &ldc, &group_count, &s); } const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick()); const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat; if (0 < duration && 0 != ncycles) { fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops); fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } #endif break; default: throw "invalid case selected!"; } /*switch*/ if (0 != check) { libxsmm_matdiff_info diff; if (EXIT_SUCCESS == libxsmm_matdiff(LIBXSMM_DATATYPE(OTYPE), m, n, 0 == (benchmark & 1) ? c : d, NULL, &ldc, &ldc, &diff)) { fprintf(stdout, "\tcheck: %f\n", diff.l1_ref); } } // finalize LIBXSMM libxsmm_finalize(); fprintf(stdout, "Finished\n"); } } catch(const std::exception& e) { fprintf(stderr, "Error: %s\n", e.what()); result = EXIT_FAILURE; } catch(const char* message) { fprintf(stderr, "Error: %s\n", message); result = EXIT_FAILURE; } catch(...) { fprintf(stderr, "Error: unknown exception caught!\n"); result = EXIT_FAILURE; } return result; }
int main(int argc, char* argv[]) { LIBXSMM_GEMM_CONST libxsmm_blasint m = (1 < argc ? atoi(argv[1]) : 1024); LIBXSMM_GEMM_CONST libxsmm_blasint k = (3 < argc ? atoi(argv[3]) : m); LIBXSMM_GEMM_CONST libxsmm_blasint n = (2 < argc ? atoi(argv[2]) : k); const libxsmm_blasint bm = (4 < argc ? atoi(argv[4]) : 32); const libxsmm_blasint bk = (6 < argc ? atoi(argv[6]) : bm); const libxsmm_blasint bn = (5 < argc ? atoi(argv[5]) : bk); const libxsmm_bgemm_order order = (libxsmm_bgemm_order)(7 < argc ? atoi(argv[7]) : 0); const int nrepeat = (8 < argc ? atoi(argv[8]) : 100); const libxsmm_blasint b_m1 = (9 < argc ? atoi(argv[9]) : 1); const libxsmm_blasint b_n1 = (10 < argc ? atoi(argv[10]) : 1); const libxsmm_blasint b_k1 = (11 < argc ? atoi(argv[11]) : 1); const libxsmm_blasint b_k2 = (12 < argc ? atoi(argv[12]) : 1); const int ab = (13 < argc ? atoi(argv[13]) : 0); LIBXSMM_GEMM_CONST libxsmm_blasint lda = (14 < argc ? atoi(argv[13]) : m); LIBXSMM_GEMM_CONST libxsmm_blasint ldb = (15 < argc ? atoi(argv[14]) : k); LIBXSMM_GEMM_CONST libxsmm_blasint ldc = (16 < argc ? atoi(argv[15]) : m); LIBXSMM_GEMM_CONST char transa = 'N', transb = 'N'; /* no transposes */ LIBXSMM_GEMM_CONST ITYPE alpha = 1, beta = 1; const int gemm_flags = LIBXSMM_GEMM_FLAGS(transa, transb); const double gflops = 2.0 * m * n * k * 1E-9; int result = EXIT_SUCCESS; #if defined(CHECK) const char *const env_check = getenv("CHECK"); const double check = LIBXSMM_ABS(0 == env_check ? 
0 : atof(env_check)); #endif if (argc > 1 && !strncmp(argv[1], "-h", 3)) { /* check command line */ printf("\nUsage: ./bgemm [M] [N] [K] [bm] [bn] [bk] [order] [reps] [b_m1] [b_n1] [b_k1] [b_k2] [verbose]\n\n"); return result; } MYASSERT(m % b_m1 == 0); MYASSERT(n % b_n1 == 0); MYASSERT(k % b_k1 == 0); MYASSERT(m/b_m1 % bm == 0); MYASSERT(n/b_n1 % bn == 0); MYASSERT(k/b_k1/b_k2 % bk == 0); #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { ITYPE* agold = (ITYPE*)libxsmm_malloc((size_t)(lda * k * sizeof(ITYPE))); ITYPE* bgold = (ITYPE*)libxsmm_malloc((size_t)(ldb * n * sizeof(ITYPE))); ITYPE* cgold = (ITYPE*)libxsmm_malloc((size_t)(ldc * n * sizeof(ITYPE))); ITYPE* a = (ITYPE*)libxsmm_malloc((size_t)(m * k * sizeof(ITYPE))); ITYPE* b = (ITYPE*)libxsmm_malloc((size_t)(k * n * sizeof(ITYPE))); ITYPE* c = (ITYPE*)libxsmm_malloc((size_t)(m * n * sizeof(ITYPE))); libxsmm_bgemm_handle* handle = 0; unsigned long long start; double duration; handle = libxsmm_bgemm_handle_create( LIBXSMM_GEMM_PRECISION(ITYPE), LIBXSMM_GEMM_PRECISION(ITYPE), m, n, k, &bm, &bn, &bk, &b_m1, &b_n1, &b_k1, &b_k2, &alpha, &beta, &gemm_flags, NULL/*auto-prefetch*/, &order); if (0 != handle) { LIBXSMM_MATINIT(ITYPE, 42, agold, m, k, lda, 1.0); LIBXSMM_MATINIT(ITYPE, 24, bgold, k, n, ldb, 1.0); LIBXSMM_MATINIT(ITYPE, 0, cgold, m, n, ldc, 1.0); libxsmm_bgemm_copyin_a(handle, agold, &lda, a); libxsmm_bgemm_copyin_b(handle, bgold, &ldb, b); libxsmm_bgemm_copyin_c(handle, cgold, &ldc, c); #if defined(MKL_ENABLE_AVX512) mkl_enable_instructions(MKL_ENABLE_AVX512); #endif /* warm-up OpenMP (populate thread pool) */ libxsmm_bgemm_omp(handle, a, b, c, 1); #if defined(CHECK) if (!LIBXSMM_FEQ(0, check)) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, agold, &lda, bgold, &ldb, &beta, cgold, &ldc); } #endif if (!ab) { libxsmm_gemm_print(stdout, LIBXSMM_GEMM_PRECISION(ITYPE), &transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); 
fprintf(stdout, "\n\n"); } start = libxsmm_timer_tick(); libxsmm_bgemm_omp(handle, a, b, c, nrepeat); duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < duration) { if (ab) { fprintf(stdout, "\tLIBXSMM: %.1f GFLOPS/s | %lli,%lli,%lli,%lli,%lli,%lli,%i,%lli,%lli,%lli,%lli\n", gflops * nrepeat / duration, (long long)m, (long long)n, (long long)k, (long long)bm, (long long)bn, (long long)bk, (int)order, (long long)b_m1, (long long)b_n1, (long long)b_k1, (long long)b_k2); } else { fprintf(stdout, "\tLIBXSMM: %.1f GFLOPS/s\n", gflops * nrepeat / duration); } } #if defined(CHECK) if (!LIBXSMM_FEQ(0, check)) { /* validate result against LAPACK/BLAS xGEMM */ ITYPE* ctest = 0; int i; start = libxsmm_timer_tick(); for (i = 0; i < nrepeat; ++i) { LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, agold, &lda, bgold, &ldb, &beta, cgold, &ldc); } duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < duration) { fprintf(stdout, "\tBLAS: %.1f GFLOPS/s\n", gflops * nrepeat / duration); } /* free memory not needed further; avoid double-free later on */ libxsmm_free(agold); agold = 0; libxsmm_free(bgold); bgold = 0; libxsmm_free(a); a = 0; libxsmm_free(b); b = 0; /* allocate C-matrix in regular format, and perform copy-out */ ctest = (ITYPE*)libxsmm_malloc((size_t)(ldc * n * sizeof(ITYPE))); if (0 != ctest) { libxsmm_matdiff_info diff; libxsmm_bgemm_copyout_c(handle, c, &ldc, ctest); if (EXIT_SUCCESS == libxsmm_matdiff(LIBXSMM_DATATYPE(ITYPE), m, n, cgold, ctest, &ldc, &ldc, &diff)) { fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs); if (check < 100.0 * diff.normf_rel) { fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel); result = EXIT_FAILURE; } } libxsmm_free(ctest); } } #endif libxsmm_bgemm_handle_destroy(handle); } else { fprintf(stderr, "FAILED to create BGEMM-handle! 
For details retry with LIBXSMM_VERBOSE=1.\n"); result = EXIT_FAILURE; } libxsmm_free(agold); libxsmm_free(bgold); libxsmm_free(cgold); libxsmm_free(a); libxsmm_free(b); libxsmm_free(c); } if(!ab) { fprintf(stdout, "Finished\n"); } return result; }
int main(int argc, char* argv[]) { const libxsmm_blasint m = 1 < argc ? atoi(argv[1]) : 4096; const libxsmm_blasint n = 2 < argc ? atoi(argv[2]) : m; const libxsmm_blasint lda = LIBXSMM_MAX(3 < argc ? atoi(argv[3]) : 0, m); const libxsmm_blasint ldb = LIBXSMM_MAX(4 < argc ? atoi(argv[4]) : 0, n); REAL_TYPE *const a = (REAL_TYPE*)malloc(lda * n * sizeof(REAL_TYPE)); REAL_TYPE *const b = (REAL_TYPE*)malloc(ldb * m * sizeof(REAL_TYPE)); const unsigned int size = m * n * sizeof(REAL_TYPE); unsigned long long start; libxsmm_blasint i, j; double duration; fprintf(stdout, "m=%i n=%i lda=%i ldb=%i size=%.fMB (%s)\n", m, n, lda, ldb, 1.0 * size / (1 << 20), 8 == sizeof(REAL_TYPE) ? "DP" : "SP"); for (i = 0; i < n; ++i) { for (j = 0; j < m; ++j) { a[i*lda+j] = initial_value(i, j, lda); } } start = libxsmm_timer_tick(); libxsmm_transpose_oop(b, a, sizeof(REAL_TYPE), m, n, lda, ldb); libxsmm_transpose_oop(a, b, sizeof(REAL_TYPE), n, m, ldb, lda); duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); for (i = 0; i < n; ++i) { for (j = 0; j < m; ++j) { if (0 < fabs(a[i*lda+j] - initial_value(i, j, lda))) { i = n + 1; break; } } } if (i <= n) { if (0 < duration) { fprintf(stdout, "\tbandwidth: %.1f GB/s\n", size / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } else { fprintf(stderr, "Validation failed!\n"); } #if defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL) { double mkl_duration; start = libxsmm_timer_tick(); LIBXSMM_CONCATENATE(mkl_, LIBXSMM_TPREFIX(REAL_TYPE, omatcopy))('C', 'T', m, n, 1, a, lda, b, ldb); LIBXSMM_CONCATENATE(mkl_, LIBXSMM_TPREFIX(REAL_TYPE, omatcopy))('C', 'T', n, m, 1, b, ldb, a, lda); mkl_duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < mkl_duration) { fprintf(stdout, "\tMKL: %.1fx\n", duration / mkl_duration); } } #endif free(a); free(b); return EXIT_SUCCESS; }
int main(int argc, char *argv[]) { REAL_TYPE *A_gold, *B_gold, *A_gold2, *B_gold2; float *C_gold, *C0_gold, *C, *C2; int M, N, K; REAL_TYPE alpha, beta; int reps; libxsmm_spmdm_handle handle, handle2; libxsmm_CSR_sparseslice *A_sparse, *A_sparse2; int max_threads; /* Step 1: Read in args */ libxsmm_timer_tickint start, end; double flops, duration; char transA, transB, transC; int i, j, k; size_t l; /* Step 1: Initialize handle */ M = 0; N = 0; K = 0; alpha = (REAL_TYPE)1.0; beta = (REAL_TYPE)0.0; reps = 0; transA = 'N'; transB = 'N'; if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("\nUsage: %s [M] [N] [K] [transA] [transB] [reps]\n\n", argv[0]); return EXIT_SUCCESS; } /* defaults */ M = 2048; N = 2048; K = 2048; transA = 'N'; transB = 'N'; transC = 'N'; reps = 100; /* reading new values from cli */ i = 1; if (argc > i) M = atoi(argv[i++]); if (argc > i) N = atoi(argv[i++]); if (argc > i) K = atoi(argv[i++]); if (argc > i) { transA = argv[i][0]; i++; } if (argc > i) { transB = argv[i][0]; i++; } if (argc > i) { transC = argv[i][0]; i++; } if (argc > i) reps = atoi(argv[i++]); /* Step 2: allocate data */ A_gold = (REAL_TYPE*)libxsmm_aligned_malloc( M*K*sizeof(REAL_TYPE), 64 ); B_gold = (REAL_TYPE*)libxsmm_aligned_malloc( K*N*sizeof(REAL_TYPE), 64 ); C_gold = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 ); C0_gold = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 ); C = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 ); /* Step 3: init data */ libxsmm_rng_set_seed(1); for (l = 0; l < (size_t)M * (size_t)K; ++l) { const double r64 = libxsmm_rng_f64(); const float r32 = (float)r64; #ifdef USE_BFLOAT const int r = *(const int*)(&r32); const libxsmm_bfloat16 val = (r >> 16); #else const float val = r32; #endif if (r64 > 0.85) A_gold[l] = val; else A_gold[l] = (REAL_TYPE)0.0; } for (l = 0; l < (size_t)K * (size_t)N; ++l) { const double r64 = libxsmm_rng_f64(); const float r32 = (float)r64; #ifdef USE_BFLOAT const int r = *(const int*)(&r32); const 
libxsmm_bfloat16 val = (r >> 16); #else const float val = r32; #endif B_gold[l] = val; } for (l = 0; l < (size_t)M * (size_t)N; ++l) { C0_gold[l] = (float)libxsmm_rng_f64(); C_gold[l] = C0_gold[l]; } for (l = 0; l < (size_t)M * (size_t)N; ++l) { C[l] = (float)C0_gold[l]; } flops = (double)M * (double)N * (double)K * 2.0; /*----------------------------------------------------------------------------------------------------------------------*/ /* Step 4: Initialize LIBXSMM for these sizes - allocates handle and temporary space for the sparse data structure for A */ # if defined(_OPENMP) max_threads = omp_get_max_threads(); # else max_threads = 1; # endif start = libxsmm_timer_tick(); libxsmm_spmdm_init(M, N, K, max_threads, &handle, &A_sparse); end = libxsmm_timer_tick(); printf("Time for handle init = %f\n", libxsmm_timer_duration(start, end)); printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i -- forward pass\n", M, N, K, handle.bm, handle.bn, handle.bk, handle.mb, handle.nb, handle.kb, reps ); /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */ /* Currently ignores alpha */ /* TODO: fix alpha input */ # ifdef USE_BFLOAT spmdm_exec_bfloat16(&handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse); # else spmdm_exec_fp32(&handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse); # endif /* Checks */ /* Compute a "gold" answer sequentially */ #if defined(_OPENMP) LIBXSMM_OMP_VAR(k); # pragma omp parallel for private(i, j, k) LIBXSMM_OPENMP_COLLAPSE(2) #endif for (i = 0; i < M; ++i) { for (j = 0; j < N; ++j) { float sum = 0.0; float Cval; for (k = 0; k < K; ++k) { # ifdef USE_BFLOAT libxsmm_bfloat16 Atmp = A_gold[i*K+k]; int Atmp_int = Atmp; Atmp_int <<= 16; float Aval = *(float *)&Atmp_int; libxsmm_bfloat16 Btmp = B_gold[k*N+j]; int Btmp_int = Btmp; Btmp_int <<= 16; float Bval = *(float *)&Btmp_int; 
# else float Aval = A_gold[i*K + k]; float Bval = B_gold[k*N + j]; # endif sum += Aval * Bval; } Cval = sum; C_gold[i*N + j] = Cval + beta*C_gold[i*N + j]; } } /* LIBXSMM_FSYMBOL(sgemm)(&trans, &trans, &N, &M, &K, &alpha, B_gold, &N, A_gold, &K, &beta, C_gold, &N); */ /* Compute the max difference between gold and computed results. */ spmdm_check_c( &handle, C, C_gold ); /* Timing loop starts */ start = libxsmm_timer_tick(); for (i = 0; i < reps; ++i) { # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse); # else spmdm_exec_fp32( &handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse); # endif } end = libxsmm_timer_tick(); duration = libxsmm_timer_duration(start, end); printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps); libxsmm_spmdm_destroy(&handle); /*----------------------------------------------------------------------------------------------------------------------*/ /* Step 5: Initialize libxsmm for transpose A - allocates handle and temporary space for the sparse data structure for A */ transA = 'T'; transB = 'N'; transC = 'T'; libxsmm_spmdm_init(M, N, K, max_threads, &handle2, &A_sparse2); printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i, transA = Y, transC = Y -- weight update\n", handle2.m, handle2.n, handle2.k, handle2.bm, handle2.bn, handle2.bk, handle2.mb, handle2.nb, handle2.kb, reps ); A_gold2 = (REAL_TYPE*)libxsmm_aligned_malloc( M*K*sizeof(REAL_TYPE), 64 ); C2 = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 ); for (i = 0; i < M; ++i) { for (j = 0; j < K; ++j) { A_gold2[j*M + i] = A_gold[i*K + j]; } } for (i = 0; i < M; ++i) { for (j = 0; j < N; ++j) { C[j*M + i] = (float)C0_gold[i*N + j]; } } /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */ /* 
Currently ignores alpha */ /* TODO: fix alpha inputs */ # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2); # else spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2); # endif for (i = 0; i < M; ++i) { for (j = 0; j < N; ++j) { C2[i*N + j] = C[j*M + i]; } } /* Checks */ spmdm_check_c( &handle2, C2, C_gold); /* Timing loop starts */ start = libxsmm_timer_tick(); for (i = 0; i < reps; ++i) { # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2); # else spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2); # endif } end = libxsmm_timer_tick(); duration = libxsmm_timer_duration(start, end); printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps); /*----------------------------------------------------------------------------------------------------------------------*/ /* Step 6: Test transpose B */ transA = 'N'; transB = 'T'; transC = 'N'; printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i, transB = Y -- backprop\n", handle2.m, handle2.n, handle2.k, handle2.bm, handle2.bn, handle2.bk, handle2.mb, handle2.nb, handle2.kb, reps ); B_gold2 = (REAL_TYPE*)libxsmm_aligned_malloc( K*N*sizeof(REAL_TYPE), 64 ); for (i = 0; i < K; ++i) { for (j = 0; j < N; ++j) { B_gold2[j*K + i] = B_gold[i*N + j]; } } for (l = 0; l < (size_t)M * (size_t)N; ++l) { C[l] = (float)C0_gold[l]; } /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */ /* Currently ignores alpha */ /* TODO: fix alpha inputs */ # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2); # else spmdm_exec_fp32( &handle2, transA, transB, &alpha, 
A_gold, B_gold2, transC, &beta, C, A_sparse2); # endif /* Checks */ spmdm_check_c( &handle2, C, C_gold); /* Timing loop starts */ start = libxsmm_timer_tick(); for (i = 0; i < reps; ++i) { # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2); # else spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2); # endif } end = libxsmm_timer_tick(); duration = libxsmm_timer_duration(start, end); printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps); libxsmm_spmdm_destroy(&handle2); libxsmm_free(A_gold); libxsmm_free(B_gold); libxsmm_free(C_gold); libxsmm_free(C); libxsmm_free(C2); libxsmm_free(C0_gold); libxsmm_free(B_gold2); libxsmm_free(A_gold2); return EXIT_SUCCESS; }
int main(int argc, char* argv[]) { const int insize = (1 < argc ? atoi(argv[1]) : 0); const int incrmt = (2 < argc ? atoi(argv[2]) : 0); const int nelems = (3 < argc ? atoi(argv[3]) : 0); const int niters = (4 < argc ? atoi(argv[4]) : 1); const int elsize = (0 >= insize ? LIBXSMM_DESCRIPTOR_SIGSIZE : insize); const int stride = (0 >= incrmt ? LIBXSMM_MAX(LIBXSMM_DESCRIPTOR_MAXSIZE, elsize) : LIBXSMM_MAX(incrmt, elsize)); const size_t n = (0 >= nelems ? (((size_t)2 << 30/*2 GB*/) / stride) : ((size_t)nelems)); unsigned char *input, *icopy = NULL, *ilast = NULL; int result = EXIT_SUCCESS; size_t nbytes, size, nrpt; if (0 < niters) { size = n; nrpt = niters; } else { size = LIBXSMM_MAX(LIBXSMM_ABS(niters), 1); nrpt = n; } nbytes = size * stride; input = (unsigned char*)(0 != nbytes ? malloc(nbytes) : NULL); if (NULL != input) { unsigned char *const ref = input + (size - 1) * stride; /* last item */ libxsmm_timer_tickint start; size_t i, j = 0; /* initialize the input data */ for (i = 0; i < nbytes; ++i) input[i] = LIBXSMM_MOD2(i, 128); for (i = 0; i < (size_t)elsize; ++i) ref[i] = 255; { /* benchmark libxsmm_diff_n */ #if defined(USE_HASH) const unsigned int hashref = libxsmm_hash(ref, elsize, 0/*seed*/); #endif start = libxsmm_timer_tick(); for (i = 0; i < nrpt; ++i) { #if !defined(USE_HASH) j = libxsmm_diff_n(ref, input, (unsigned char)elsize, (unsigned char)stride, (unsigned int)LIBXSMM_MIN(i, size)/*hint*/, (unsigned int)size); #else const unsigned char* tst = input; for (j = 0; j < size; ++j) { const unsigned int hashtst = libxsmm_hash(tst, elsize, 0/*seed*/); if (hashref == hashtst && 0 == libxsmm_diff(ref, tst, (unsigned char)elsize)) { break; } tst += stride; } #endif } printf("libxsmm_diff_n:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick())); } if (size == (j + 1) && 0 == memcmp(ref, input + j * stride, elsize)) { /* benchmark libxsmm_memcmp */ icopy = (unsigned char*)(elsize == stride ? 
malloc(nbytes) : NULL); if (NULL != icopy) { ilast = icopy + (size - 1) * stride; /* last item */ memcpy(icopy, input, nbytes); start = libxsmm_timer_tick(); for (i = 0; i < nrpt; ++i) { j += libxsmm_memcmp(input, icopy, nbytes); /* take result of every execution */ /* memcmp may be pure and without touching input it is not repeated (nrpt) */ ilast[i%elsize] = 255; } printf("libxsmm_memcmp:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick())); result += (int)j * ((int)stride / ((int)stride + 1)); /* ignore result */ } } else { result = EXIT_FAILURE; } if (NULL != icopy) { /* benchmark stdlib's memcmp */ LIBXSMM_ASSERT(NULL != ilast); start = libxsmm_timer_tick(); for (i = 0; i < nrpt; ++i) { j += memcmp(input, icopy, nbytes); /* take result of every execution */ /* memcmp is likely pure and without touching input it is not repeated (nrpt) */ ilast[i%elsize] = 255; } printf("stdlib memcmp:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick())); result += (int)j * ((int)stride / ((int)stride + 1)); /* ignore result */ free(icopy); } free(input); } else { result = EXIT_FAILURE; } return result; }
int edge::reproducers::local( unsigned int const i_nSteps, unsigned int const i_nElements ) { double l_dT = 0.000001; unsigned int l_nSteps = i_nSteps; unsigned int l_nElements = i_nElements; t_elementChars * l_elChars; /* zero initialization */ t_dg l_dg; t_matStar (* l_starM)[N_DIM]; t_fluxSolver (* l_fluxSolvers)[ C_ENT[T_SDISC.ELEMENT].N_FACES ]; real_base (* l_dofs)[N_QUANTITIES][N_ELEMENT_MODES][N_CRUNS]; real_base (* l_tInt)[N_QUANTITIES][N_ELEMENT_MODES][N_CRUNS]; edge::io::Receivers l_recvs; edge::data::MmXsmmFused< real_base > l_mm; unsigned int l_dummyUInt; double l_dummyDouble; // 1. Set up structures setupDg( l_dg ); setupKernel( l_mm ); setupStarM( l_nElements, &l_starM ); setupFluxSolv( l_nElements, &l_fluxSolvers ); setupTensor( l_nElements, &l_dofs, &l_tInt ); #ifdef PP_USE_OMP #pragma omp parallel #pragma omp critical #endif setupScratchMem( &(edge::parallel::g_scratchMem) ); setupPseudoMesh( edge::reproducers::C_MODE_LOCAL, l_nElements, &l_elChars, nullptr, nullptr, nullptr, nullptr, nullptr ); // 2. 
Run solvers std::cout << "Runing solvers" << std::endl; unsigned long long l_start = libxsmm_timer_tick(); #ifdef PP_USE_OMP #pragma omp parallel firstprivate( l_nSteps, l_nElements, l_dT ) \ firstprivate( l_elChars ) \ firstprivate( l_dg, l_starM, l_fluxSolvers ) \ firstprivate( l_dofs, l_tInt ) \ firstprivate( l_mm ) \ private( l_recvs, l_dummyUInt, l_dummyDouble ) #endif { const unsigned int l_nThreads = omp_get_num_threads(); const unsigned int l_tid = omp_get_thread_num(); unsigned int l_firstEl = (unsigned int)((l_nElements + l_nThreads - 1) / l_nThreads) * l_tid; unsigned int l_lastEl = (unsigned int)((l_nElements + l_nThreads - 1) / l_nThreads) * (l_tid + 1); l_lastEl = std::min(l_lastEl, l_nElements); unsigned int l_numEl = l_lastEl - l_firstEl; for ( unsigned int l_step = 0; l_step < l_nSteps; l_step++ ) { edge::elastic::solvers::AderDg::local< unsigned int, real_base, edge::data::MmXsmmFused< real_base > > ( l_firstEl, l_numEl, l_dummyDouble, l_dT, l_dummyUInt, l_dummyUInt, nullptr, nullptr, l_elChars, l_dg, l_starM, l_fluxSolvers, l_dofs, l_tInt, nullptr, l_recvs, l_mm ); #ifdef PP_USE_OMP #pragma omp barrier #endif } } unsigned long long l_end = libxsmm_timer_tick(); // 3. 
Print statistics double l_time = libxsmm_timer_duration(l_start, l_end); unsigned int l_local_flops[] = { 792, 3564, 11412, 31500, 77184, 173538, 360522 }; unsigned long long l_flops = (unsigned long long)l_local_flops[ORDER-1] * PP_N_CRUNS * \ l_nElements * l_nSteps; double l_gflops = (double)l_flops / (l_time * 1000000000); std::cout << "Elapsed time: " << l_time << " s" << std::endl; std::cout << "Performance: " << l_gflops << " GFLOPS" << std::endl; std::cout << std::endl; #ifdef PP_REPRODUCER_DUMP std::string l_dumpFileName1 = "./local_o"+std::to_string(ORDER)+"_" "f"+std::to_string(PP_PRECISION)+"_" "el"+std::to_string(l_nElements)+"_" "stp"+std::to_string(l_nSteps)+"_dofs.log"; std::string l_dumpFileName2 = "./local_o"+std::to_string(ORDER)+"_" "f"+std::to_string(PP_PRECISION)+"_" "el"+std::to_string(l_nElements)+"_" "stp"+std::to_string(l_nSteps)+"_tInt.log"; std::ofstream l_fp1( l_dumpFileName1 ); std::ofstream l_fp2( l_dumpFileName2 ); for ( unsigned int l_el = 0; l_el < l_nElements; l_el++ ) { for ( unsigned int l_qt = 0; l_qt < N_QUANTITIES; l_qt++ ) { for ( unsigned int l_md = 0; l_md < N_ELEMENT_MODES; l_md++ ) { for ( unsigned int l_cfr = 0; l_cfr < N_CRUNS; l_cfr++ ) { l_fp1 << l_dofs[l_el][l_qt][l_md][l_cfr] << "\n"; l_fp2 << l_tInt[l_el][l_qt][l_md][l_cfr] << "\n"; } } } } #endif // 4. Clean up cleanupDg( l_dg ); cleanupStarM( l_starM ); cleanupFluxSolv( l_fluxSolvers ); cleanupTensor( l_dofs, l_tInt ); #ifdef PP_USE_OMP #pragma omp parallel #pragma omp critical #endif cleanupScratchMem( edge::parallel::g_scratchMem ); cleanupPseudoMesh( edge::reproducers::C_MODE_LOCAL, l_elChars, nullptr, nullptr, nullptr, nullptr, nullptr ); return 0; }
int main(int argc, char* argv[]) { const int ncalls = 1000000; #if defined(_OPENMP) const int max_nthreads = omp_get_max_threads(); #else const int max_nthreads = 1; #endif const int ncycles = LIBXSMM_MAX(1 < argc ? atoi(argv[1]) : 100, 1); const int max_nallocs = LIBXSMM_CLMP(2 < argc ? atoi(argv[2]) : 4, 1, MAX_MALLOC_N); const int nthreads = LIBXSMM_CLMP(3 < argc ? atoi(argv[3]) : 1, 1, max_nthreads); unsigned int nallocs = 0, nerrors = 0; int r[MAX_MALLOC_N], i; /* generate set of random number for parallel region */ for (i = 0; i < (MAX_MALLOC_N); ++i) r[i] = rand(); /* count number of calls according to randomized scheme */ for (i = 0; i < ncycles; ++i) { nallocs += r[i%(MAX_MALLOC_N)] % max_nallocs + 1; } assert(0 != nallocs); fprintf(stdout, "Running %i cycles with max. %i malloc+free (%u calls) using %i thread%s...\n", ncycles, max_nallocs, nallocs, 1 >= nthreads ? 1 : nthreads, 1 >= nthreads ? "" : "s"); #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { const char *const longlife_env = getenv("LONGLIFE"); const int enable_longlife = ((0 == longlife_env || 0 == *longlife_env) ? 0 : atoi(longlife_env)); void *const longlife = (0 == enable_longlife ? 
0 : malloc_offsite((MAX_MALLOC_MB) << 20)); unsigned long long d0, d1 = 0; libxsmm_scratch_info info; /* run non-inline function to measure call overhead of an "empty" function */ const unsigned long long t0 = libxsmm_timer_tick(); for (i = 0; i < ncalls; ++i) { libxsmm_init(); /* subsequent calls are not doing any work */ } d0 = libxsmm_timer_diff(t0, libxsmm_timer_tick()); #if defined(_OPENMP) # pragma omp parallel for num_threads(nthreads) private(i) default(none) shared(r) reduction(+:d1,nerrors) #endif for (i = 0; i < ncycles; ++i) { const int count = r[i%(MAX_MALLOC_N)] % max_nallocs + 1; void* p[MAX_MALLOC_N]; int j; assert(count <= MAX_MALLOC_N); for (j = 0; j < count; ++j) { const int k = (i * count + j) % (MAX_MALLOC_N); const size_t nbytes = (r[k] % (MAX_MALLOC_MB) + 1) << 20; const unsigned long long t1 = libxsmm_timer_tick(); p[j] = libxsmm_aligned_scratch(nbytes, 0/*auto*/); d1 += libxsmm_timer_diff(t1, libxsmm_timer_tick()); if (0 != p[j]) { memset(p[j], j, nbytes); } else { ++nerrors; } } for (j = 0; j < count; ++j) { libxsmm_free(p[j]); } } libxsmm_free(longlife); if (0 != d0 && 0 != d1 && 0 < nallocs) { const double dcalls = libxsmm_timer_duration(0, d0); const double dalloc = libxsmm_timer_duration(0, d1); const double alloc_freq = 1E-3 * nallocs / dalloc; const double empty_freq = 1E-3 * ncalls / dcalls; fprintf(stdout, "\tallocation+free calls/s: %.1f kHz\n", alloc_freq); fprintf(stdout, "\tempty calls/s: %.1f MHz\n", 1E-3 * empty_freq); fprintf(stdout, "\toverhead: %.1fx\n", empty_freq / alloc_freq); } if (EXIT_SUCCESS == libxsmm_get_scratch_info(&info) && 0 < info.size) { fprintf(stdout, "\nScratch: %.f MB (mallocs=%lu, pools=%u", 1.0 * info.size / (1 << 20), (unsigned long int)info.nmallocs, info.npools); if (1 < nthreads) fprintf(stdout, ", threads=%i)\n", nthreads); else fprintf(stdout, ")\n"); libxsmm_release_scratch(); /* suppress LIBXSMM's termination message about scratch */ } } if (0 == nerrors) { fprintf(stdout, "Finished\n"); return 
EXIT_SUCCESS; } else { fprintf(stdout, "FAILED (%u errors)\n", nerrors); return EXIT_FAILURE; } }
/**
 * Tester for JIT-generated compact TRSM kernels.
 *
 * Arguments: m n lda ldb nmat side uplo trans diag layout ntest alpha
 * (defaults: m=n=lda=ldb=nmat=8, side=uplo='L', trans=diag='N', layout=102,
 * ntest=1, alpha=1.0).
 *
 * Depending on the macros defined at compile time, the kernel under test is
 * obtained from libxsmm_dispatch_trsm (USE_XSMM_GENERATED), from a predefined
 * assembly routine (USE_PREDEFINED_ASSEMBLY), from direct code generation
 * (USE_KERNEL_GENERATION_DIRECTLY), or MKL is timed (TIME_MKL). Unless
 * NO_ACCURACY_CHECK is defined, the result is compared with a reference
 * implementation and the residual is printed.
 *
 * Returns 0 after the run, or EXIT_FAILURE if a buffer cannot be allocated.
 */
int main(int argc, char* argv[])
{
  unsigned int m=8, n=8, lda=8, ldb=8, nerrs, num, nmat, nmats, nmatd, ntest;
  unsigned int layout, asize, VLEND=4, VLENS=8, bsize;
  unsigned int ncorr;
  int i, j;
  char side, uplo, trans, diag;
  unsigned int typesize8 = 8;
  unsigned int typesize4 = 4;
  float  *sa, *sb, *sc, *sd;
  double *da, *db, *dc, *dd, *tmpbuf;
  double dalpha = 1.0;
  float  salpha;
  double dtmp;
  /* NULL unless one of the kernel sources below provides code to dump */
  const unsigned char *cptr = NULL;
  unsigned long op_count;
  const libxsmm_trsm_descriptor* desc8 = NULL;
  const libxsmm_trsm_descriptor* desc4 = NULL;
  libxsmm_descriptor_blob blob;
  /* all members share the same storage (union) */
  union {
    libxsmm_xtrsmfunction dp;
    libxsmm_xtrsmfunction sp;
    const void* pv;
  } mykernel = { 0 };
#ifdef USE_KERNEL_GENERATION_DIRECTLY
  void (*opcode_routine)();
#endif
#ifdef USE_KERNEL_GENERATION_DIRECTLY
  #include <unistd.h>
  #include <signal.h>
  #include <malloc.h>
  #include <sys/mman.h>
  #include "../../src/generator_packed_trsm_avx_avx512.h"
  unsigned char *routine_output;
  libxsmm_generated_code io_generated_code;
  int pagesize = sysconf(_SC_PAGE_SIZE);
  if (pagesize == -1) fprintf(stderr,"sysconf pagesize\n");
  /* buffer for the generated code: mapped writable first, then made executable */
  routine_output = (unsigned char *) mmap(NULL,
                      BUFSIZE2, PROT_READ|PROT_WRITE,
                      MAP_PRIVATE|MAP_ANONYMOUS, 0,0);
  if (mprotect(routine_output, BUFSIZE2,
                PROT_EXEC | PROT_READ | PROT_WRITE ) == -1)
      fprintf(stderr,"mprotect\n");
  printf("Routine ready\n");
  io_generated_code.generated_code = &routine_output[0];
  io_generated_code.buffer_size = BUFSIZE2;
  io_generated_code.code_size = 0;
  io_generated_code.code_type = 2;
  io_generated_code.last_error = 0;
#endif

  if ( argc <= 3 )
  {
     printf("\nUSAGE: %s m n lda ldb nmat side uplo trans diag layout ntest alpha\n",argv[0]);
     printf("Compact TRSM a mxn matrix of leading dimension ldb\n");
     printf("This will test the jit of 1 VLEN work of nmat at a time\n");
     printf("Defaults: m=n=lda=ldb=nmat=8, alpha=1.0, side=uplo='L',trans=diag='N',layout=102,ntest=1\n");
  }
  if ( argc > 1 ) m = atoi(argv[1]); else m = 8;
  if ( argc > 2 ) n = atoi(argv[2]); else n = 8;
  if ( argc > 3 ) lda= atoi(argv[3]); else lda = 8;
  if ( argc > 4 ) ldb = atoi(argv[4]); else ldb = 8;
  if ( argc > 5 ) nmat = atoi(argv[5]); else nmat = 8;
  if ( argc > 6 ) side = argv[6][0]; else side = 'L';
  if ( argc > 7 ) uplo = argv[7][0]; else uplo = 'L';
  if ( argc > 8 ) trans = argv[8][0]; else trans = 'N';
  if ( argc > 9 ) diag = argv[9][0]; else diag = 'N';
  if ( argc > 10 ) layout = atoi(argv[10]); else layout=102;
  if ( argc > 11 ) ntest = atoi(argv[11]); else ntest = 1;
  if ( argc > 12 ) dalpha = atof(argv[12]); else dalpha = 1.0;
  salpha = (float)dalpha;

  m = LIBXSMM_MAX(m,1);
  n = LIBXSMM_MAX(n,1);
  /* the timings are averaged over ntest: zero would divide by zero */
  ntest = LIBXSMM_MAX(ntest,1);
  /* A is either mxm or nxn depending on side */
  if ( (side == 'L') || (side=='l') ) asize = m; else asize = n;

  lda = LIBXSMM_MAX(lda,asize);
  if ( layout == 102 )
  {
      /* Column major: B is mxn, and stored in B format */
      ldb = LIBXSMM_MAX(ldb,m);
      bsize = ldb*n;
  } else {
      /* Row major: B is mxn, and stored in B^T format */
      ldb = LIBXSMM_MAX(ldb,n);
      bsize = ldb*m;
  }
  /* number of matrices rounded down to a multiple of the vector length (at least one vector) */
  nmats = LIBXSMM_MAX(VLENS,nmat - (nmat%VLENS));
  nmatd = LIBXSMM_MAX(VLEND,nmat - (nmat%VLEND));
  nmat = LIBXSMM_MAX(nmats,nmatd);

  /* widen before multiplying such that the product is not truncated to 32 bits */
  op_count = (unsigned long)n * m * asize;

  printf("This is a real*%u tester for JIT compact TRSM kernels! (%c%c%c%c m=%u n=%u lda=%u ldb=%u layout=%u nmat=%u)\n",typesize8,side,uplo,trans,diag,m,n,lda,ldb,layout,nmat);
#ifdef USE_XSMM_GENERATED
  printf("This code tests the LIBXSMM generated kernels\n");
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
  printf("This code tests some predefined assembly kenrel\n");
#endif
#ifdef USE_KERNEL_GENERATION_DIRECTLY
  printf("This code tests kernel generation directly\n");
#endif
#ifdef TIME_MKL
  printf("This code tests MKL compact batch directly\n");
#endif

  /* NOTE(review): both descriptors are created from the same blob, and the sp
   * dispatch below writes the union storage that also backs dp - confirm that
   * the double-precision run is meant to use the kernel dispatched last. */
  desc8 = libxsmm_trsm_descriptor_init(&blob, typesize8, m, n, lda, ldb, &dalpha, trans, diag, side, uplo, layout);
  desc4 = libxsmm_trsm_descriptor_init(&blob, typesize4, m, n, lda, ldb, &salpha, trans, diag, side, uplo, layout);
#ifdef USE_XSMM_GENERATED
  printf("calling libxsmm_dispatch_trsm: typesize8=%u\n",typesize8);
  mykernel.dp = libxsmm_dispatch_trsm(desc8);
  printf("done calling libxsmm_dispatch_trsm: typesize8=%u\n",typesize8);
  if ( mykernel.dp == NULL ) printf("R8 Kernel after the create call is null\n");
  mykernel.sp = libxsmm_dispatch_trsm(desc4);
  if ( mykernel.sp == NULL ) printf("R4 kernel after the create call is null\n");
#endif

#ifdef USE_KERNEL_GENERATION_DIRECTLY
  libxsmm_generator_trsm_kernel ( &io_generated_code, &desc8, "hsw" );
#endif

#ifndef NO_ACCURACY_CHECK
  printf("mallocing matrices\n");
#endif
  /* s*: single precision, d*: double precision; a: triangular matrices, b: in/out,
   * c: input of the reference, d: pristine copy of b */
  sa  = (float  *) malloc ( (size_t)lda*asize*nmats*sizeof(float) );
  da  = (double *) malloc ( (size_t)lda*asize*nmatd*sizeof(double) );
  sb  = (float  *) malloc ( (size_t)bsize*nmats*sizeof(float) );
  db  = (double *) malloc ( (size_t)bsize*nmatd*sizeof(double) );
  sc  = (float  *) malloc ( (size_t)bsize*nmats*sizeof(float) );
  dc  = (double *) malloc ( (size_t)bsize*nmatd*sizeof(double) );
  sd  = (float  *) malloc ( (size_t)bsize*nmats*sizeof(float) );
  dd  = (double *) malloc ( (size_t)bsize*nmatd*sizeof(double) );
  tmpbuf = (double *) malloc ( (size_t)asize*VLEND*sizeof(double) );
  if ( (sa == NULL) || (da == NULL) || (sb == NULL) || (db == NULL) ||
       (sc == NULL) || (dc == NULL) || (sd == NULL) || (dd == NULL) ||
       (tmpbuf == NULL) )
  {
     fprintf(stderr,"Failed to allocate matrices\n");
     free(tmpbuf);
     free(dd); free(sd);
     free(dc); free(sc);
     free(db); free(sb);
     free(da); free(sa);
     return EXIT_FAILURE;
  }

#ifndef NO_ACCURACY_CHECK
  printf("filling matrices\n");
#endif
  sfill_matrix ( sa, lda, asize, asize*nmats );
#ifdef TRIANGLE_IS_IDENTITY
  printf("Warning: setting triangular matrix to identity. Not good for accuracy testing\n");
  dfill_identity ( da, lda, asize, asize, VLEND, nmatd/VLEND );
#else
  dfill_matrix ( da, lda, asize, asize*nmatd );
#endif
  sfill_matrix ( sb, bsize, bsize, nmats );
  dfill_matrix ( db, bsize, bsize, nmatd );

#ifndef NO_ACCURACY_CHECK
  for ( i = 0 ; i < (int)(bsize*nmats) ; i++ ) sc[i]=sb[i];
  for ( i = 0 ; i < (int)(bsize*nmatd) ; i++ ) dc[i]=db[i];
  for ( i = 0 ; i < (int)(bsize*nmats) ; i++ ) sd[i]=sb[i];
  for ( i = 0 ; i < (int)(bsize*nmatd) ; i++ ) dd[i]=db[i];
  printf("Pointing at the kernel now\n");
#endif

#ifdef USE_XSMM_GENERATED
  cptr = (const unsigned char*) mykernel.pv;
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
  cptr = (const unsigned char*) trsm_xct_;
#endif
#ifdef USE_KERNEL_GENERATION_DIRECTLY
  cptr = (const unsigned char*) &routine_output[0];
  opcode_routine = (void *) &cptr[0];
#endif

#ifndef TIME_MKL
  #define DUMP_ASSEMBLY_FILE
#endif

#ifdef DUMP_ASSEMBLY_FILE
  printf("Dumping assembly file\n");
  FILE *fp = fopen("foo.s","w");
  if ( (fp != NULL) && (cptr != NULL) )
  {
     fputs("\t.text\n",fp);
     fputs("\t.align 256\n",fp);
     fputs("\t.globl trsm_xct_\n",fp);
     fputs("trsm_xct_:\n",fp);
     /* a fixed number of 4000 bytes is dumped, four bytes per line */
     for (i = 0 ; i < 4000; i+=4 )
     {
        fprintf(fp,".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",cptr[i],cptr[i+1],cptr[i+2],cptr[i+3]);
     }
     fputs("\tretq\n",fp);
     fputs("\t.type trsm_xct_,@function\n",fp);
     fputs("\t.size trsm_xct_,.-trsm_xct_\n",fp);
  } else {
     fprintf(stderr,"Skipping assembly dump (no kernel code or foo.s cannot be written)\n");
  }
  if ( fp != NULL ) fclose(fp);
#endif

#if defined(USE_MKL_FOR_REFERENCE) || defined(TIME_MKL)
  #include "mkl.h"
  MKL_LAYOUT CLAYOUT = (layout == 101) ? MKL_ROW_MAJOR : MKL_COL_MAJOR;
  MKL_SIDE SIDE = (side == 'R' || side == 'r') ? MKL_RIGHT : MKL_LEFT;
  MKL_UPLO UPLO = (uplo == 'U' || uplo == 'u') ? MKL_UPPER : MKL_LOWER;
  MKL_TRANSPOSE TRANSA = (trans == 'N' || trans == 'n') ? MKL_NOTRANS : MKL_TRANS;
  MKL_DIAG DIAG = (diag == 'N' || diag == 'n') ? MKL_NONUNIT : MKL_UNIT;
  MKL_COMPACT_PACK CMP_FORMAT = mkl_get_format_compact();
#if 0
  MKL_COMPACT_PACK CMP_FORMAT = MKL_COMPACT_AVX;
#endif
#endif

#ifndef NO_ACCURACY_CHECK
  /* second element shown in the progress messages: index 256, or the last
   * element if the buffer is smaller (reading db[256] was out of bounds then) */
  const unsigned int dprobe = (256 < bsize*nmatd ? 256 : bsize*nmatd - 1);
  printf("Before routine, initial B(1,1)=%g B[%u]=%g\n",db[0],dprobe,db[dprobe]);
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
  double one = 1.0;
#endif
  double timer;
#ifdef MKL_TIMER
  double tmptimer;
  tmptimer = dsecnd_();
#else
  unsigned long long l_start, l_end;
#endif

  timer = 0.0;
  for ( j = 0 ; j < (int)ntest ; j++ )
  {
#ifndef TRIANGLE_IS_IDENTITY
     /* restore the input such that every repetition solves the same problem */
     for ( i = 0 ; i < (int)(bsize*nmatd) ; i++ ) db[i]=dd[i];
#endif
     /* one kernel invocation per group of VLEND matrices */
     for ( i = 0 , num = 0; i < (int)nmatd ; i+= (int)VLEND, num++ )
     {
        double *Ap = &da[num*lda*asize*VLEND];
        double *Bp = &db[num*bsize*VLEND];
#ifdef MKL_TIMER
        tmptimer = dsecnd_();
#else
        l_start = libxsmm_timer_tick();
#endif

#ifdef USE_XSMM_GENERATED
        mykernel.dp ( Ap, Bp, tmpbuf );
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
        trsm_xct_ ( Ap, Bp, &one );
#endif
#ifdef USE_KERNEL_GENERATION_DIRECTLY
        (*opcode_routine)( Ap, Bp );
#endif
#ifdef TIME_MKL
        mkl_dtrsm_compact ( CLAYOUT, SIDE, UPLO, TRANSA, DIAG, m, n, dalpha, da, lda, db, ldb, CMP_FORMAT, nmatd );
        i+=nmatd; /* Because MKL will do everything */
#endif
#ifdef MKL_TIMER
        timer += dsecnd_() - tmptimer;
#else
        l_end = libxsmm_timer_tick();
        timer += libxsmm_timer_duration(l_start,l_end);
#endif
     }
  }
  timer /= ((double)ntest);

#ifndef NO_ACCURACY_CHECK
  printf("Average time to get through %u matrices: %g\n",nmatd,timer);
  printf("Gflops: %g\n",(double)(op_count*nmatd)/(timer*1.0e9));
  printf("after routine, new B(1,1)=%g B[%u]=%g\n",db[0],dprobe,db[dprobe]);
#endif

#ifdef TEST_SINGLE
  /* same clamping as for the double-precision probe */
  const unsigned int sprobe = (256 < bsize*nmats ? 256 : bsize*nmats - 1);
  printf("Before r4 routine, initial B(1,1)=%g B[%u]=%g\n",sb[0],sprobe,sb[sprobe]);
  for ( i = 0 , num = 0; i < nmats ; i+= VLENS, num++ )
  {
     float *Ap = &sa[num*lda*asize*VLENS];
     float *Bp = &sb[num*bsize*VLENS];
#ifdef USE_XSMM_GENERATED
     mykernel.sp ( Ap, Bp, NULL );
#endif
  }
  /* reports the single-precision buffer (the double-precision one was printed here before) */
  printf("after r4 routine, new B(1,1)=%g B[%u]=%g\n",sb[0],sprobe,sb[sprobe]);
#endif

#ifndef NO_ACCURACY_CHECK
  /* Call some reference code now on a copy of the B matrix (C) */
  double timer2 = 0.0;
  for ( j = 0 ; j < (int)ntest ; j++ )
  {
#ifndef TRIANGLE_IS_IDENTITY
     for ( i = 0 ; i < (int)(bsize*nmatd) ; i++ ) dc[i]=dd[i];
#endif

#ifdef MKL_TIMER
     tmptimer = dsecnd_();
#else
     l_start = libxsmm_timer_tick();
#endif

#ifdef USE_MKL_FOR_REFERENCE
     mkl_dtrsm_compact ( CLAYOUT, SIDE, UPLO, TRANSA, DIAG, m, n, dalpha, da, lda, dc, ldb, CMP_FORMAT, nmatd );
#elif !defined(LIBXSMM_NOFORTRAN)
     if ( (layout == 101) && (nmatd!=VLEND) )
     {
        /* the reference is called with layout 102 and with m/n, side, and uplo exchanged */
        unsigned int lay = 102, m1 = n, n1 = m;
        char side1='L', uplo1='L';
        if ( side == 'L' || side == 'l' ) side1 = 'R';
        if ( uplo == 'L' || uplo == 'l' ) uplo1 = 'U';
        compact_dtrsm_ ( &lay, &side1, &uplo1, &trans, &diag, &m1, &n1, &dalpha, da, &lda, dc, &ldb, &nmatd, &VLEND );
     } else {
        compact_dtrsm_ ( &layout, &side, &uplo, &trans, &diag, &m, &n, &dalpha, da, &lda, dc, &ldb, &nmatd, &VLEND );
     }
#endif

#ifdef MKL_TIMER
     timer2 += dsecnd_() - tmptimer;
#else
     l_end = libxsmm_timer_tick();
     timer2 += libxsmm_timer_duration(l_start,l_end);
#endif
  }
  timer2 /= ((double)ntest);
  printf("Reference time=%g Reference Gflops=%g\n",timer2,(op_count*nmatd)/(timer2*1.0e9));

  /* Compute the residual between B and C */
  dtmp = residual_d ( dc, bsize, bsize, nmatd, db, bsize, &nerrs, &ncorr );
  printf("R8 %c%c%c%c m=%u n=%u lda=%u ldb=%u error: %g number of errors: %u corrects: %u",side,uplo,trans,diag,m,n,lda,ldb,dtmp,nerrs,ncorr);
  if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*8 %u case",m,n,layout);
  printf("\n");

#ifdef TEST_SINGLE
  /* Call some reference code now on a copy of the B matrix (C) */
  compact_strsm_ ( &layout, &side, &uplo, &trans, &diag, &m, &n, &salpha, sa, &lda, sc, &ldb, &nmats, &VLENS );
  /* Compute the residual between B and C */
  dtmp = residual_s ( sc, bsize, bsize, nmats, sb, bsize, &nerrs, &ncorr );
  printf("R4 %c%c%c%c m=%u n=%u lda=%u ldb=%u error: %g number of errors: %u corrects: %u\n",side,uplo,trans,diag,m,n,lda,ldb,dtmp,nerrs,ncorr);
  if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*4 case",m,n);
  printf("\n");
#endif

#else
  /* without the accuracy check only non-finite results are counted (first few are printed) */
  for ( j = 0, nerrs = 0 ; j < bsize*nmatd ; j++ )
  {
     if ( isnan(db[j]) || isinf(db[j]) )
     {
        if ( ++nerrs < 10 )
        {
           printf("WARNING: db[%d]=%g\n",j,db[j]);
        }
     }
  }
  printf("%g,real*8 %c%c%c%c m=%u n=%u lda=%u ldb=%u Denormals=%u Time=%g Gflops=%g",(op_count*nmatd)/(timer*1.0e9),side,uplo,trans,diag,m,n,lda,ldb,nerrs,timer,(op_count*nmatd)/(timer*1.0e9));
  if ( nerrs > 0 ) printf(" -> FAILED at %ux%u real*8 case",m,n);
  printf("\n");
#endif

  free(tmpbuf); /* was leaked before */
  free(dd);
  free(sd);
  free(dc);
  free(sc);
  free(db);
  free(sb);
  free(da);
  free(sa);

  return 0;
}
int main(int argc, char* argv[]) { double rng_stddev = 0; float* rngs; float vrng[16]; libxsmm_matdiff_info info; libxsmm_blasint num_rngs; libxsmm_blasint i; unsigned long long start; if (2 < argc) { fprintf(stderr, "Usage:\n %s number_rngs\n", argv[0]); return EXIT_SUCCESS; } /* parse the command line and set up the test parameters */ num_rngs = (1 < argc ? atoi(argv[1]) : 1000); assert(num_rngs >= 1); rngs = (float*)malloc((size_t)(sizeof(float) * num_rngs)); if (NULL == rngs) num_rngs = 0; libxsmm_rng_set_seed( (uint32_t)(time(0))); /* fill array with random floats */ libxsmm_rng_f32_seq( rngs, num_rngs ); /* some quality measure; variance is based on discovered average rather than expected value */ if (EXIT_SUCCESS == libxsmm_matdiff(&info, LIBXSMM_DATATYPE_F32, 1/*m*/, num_rngs, NULL/*ref*/, rngs/*tst*/, NULL/*ldref*/, NULL/*ldtst*/)) { rng_stddev = sqrt(info.var_tst); } start = libxsmm_timer_tick(); for (i = 0; i < num_rngs; ++i) { libxsmm_rng_f32_seq( rngs, 1 ); } printf("\nlibxsmm_rng_float: %llu cycles per random number (scalar)\n", libxsmm_timer_cycles(start, libxsmm_timer_tick()) / num_rngs); start = libxsmm_timer_tick(); for (i = 0; i < num_rngs; ++i) { libxsmm_rng_f32_seq( vrng, 16 ); } printf("\nlibxsmm_rng_float: %llu cycles per random number (vlen=16)\n", libxsmm_timer_cycles(start, libxsmm_timer_tick()) / ((size_t)num_rngs*16)); /* let's compute some values of the random numbers */ printf("\nWe have generated %lli random numbers uniformly distributed in [0,1(\n", (long long)num_rngs); printf("We expect the following values E=0.5, Var=0.083333, Stddev=0.288675\n\n"); printf("minimum random number is: %f\n", info.min_tst); printf("maximum random number is: %f\n", info.max_tst); printf("sum of random numbers is: %f\n", info.l1_tst); printf("Expected Value of random numbers is: %f\n", info.avg_tst); printf("Variance of random numbers is: %f\n", info.var_tst); printf("StdDev of random numbers is: %f\n\n", rng_stddev); free( rngs ); return EXIT_SUCCESS; 
}
int main(int argc, char* argv[]) { const char t = (char)(1 < argc ? *argv[1] : 'o'); const libxsmm_blasint m = (2 < argc ? atoi(argv[2]) : 4096); #if 0 /* TODO: enable when in-place transpose is fully supported */ const libxsmm_blasint n = (3 < argc ? atoi(argv[3]) : m); #else const libxsmm_blasint n = (3 < argc ? (('o' == t || 'O' == t) ? atoi(argv[3]) : m) : m); #endif const libxsmm_blasint ldi = LIBXSMM_MAX/*sanitize ld*/(4 < argc ? atoi(argv[4]) : 0, m); const libxsmm_blasint ldo = LIBXSMM_MAX/*sanitize ld*/(5 < argc ? atoi(argv[5]) : 0, n); const int r = (6 < argc ? atoi(argv[6]) : 0), s = LIBXSMM_ABS(r); const libxsmm_blasint lower = (7 < argc ? atoi(argv[7]) : 0); libxsmm_blasint km = m, kn = n, kldi = ldi, kldo = (('o' == t || 'O' == t) ? ldo : ldi); int result = EXIT_SUCCESS, k; if (0 == strchr("oOiI", t)) { fprintf(stderr, "%s [<transpose-kind:o|i>] [<m>] [<n>] [<ld-in>] [<ld-out>] [random:0|nruns] [lbound]\n", argv[0]); exit(EXIT_FAILURE); } #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { const char *const env_tasks = getenv("TASKS"), *const env_check = getenv("CHECK"); const int tasks = (0 == env_tasks || 0 == *env_tasks) ? 0/*default*/ : atoi(env_tasks); const int check = (0 == env_check || 0 == *env_check) ? 1/*default*/ : atoi(env_check); ELEM_TYPE *const a = (ELEM_TYPE*)libxsmm_malloc((size_t)(ldi * (('o' == t || 'O' == t) ? n : ldo) * sizeof(ELEM_TYPE))); ELEM_TYPE *const b = (ELEM_TYPE*)libxsmm_malloc((size_t)(ldo * (('o' == t || 'O' == t) ? 
m : ldi) * sizeof(ELEM_TYPE))); libxsmm_timer_tickint start, duration = 0; #if defined(USE_REFERENCE) /* benchmark against a reference */ libxsmm_timer_tickint duration2 = 0; #endif libxsmm_blasint i; size_t size = 0; #if defined(MKL_ENABLE_AVX512) mkl_enable_instructions(MKL_ENABLE_AVX512); #endif fprintf(stdout, "m=%lli n=%lli ldi=%lli ldo=%lli size=%.fMB (%s, %s)\n", (long long)m, (long long)n, (long long)ldi, (long long)ldo, 1.0 * (m * n * sizeof(ELEM_TYPE)) / (1 << 20), LIBXSMM_STRINGIFY(ELEM_TYPE), ('o' == t || 'O' == t) ? "out-of-place" : "in-place"); #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < n; ++i) { libxsmm_blasint j; for (j = 0; j < m; ++j) { a[i*ldi+j] = initial_value(i, j, m); } } if (0 != check) { /* repeatable (reference) */ srand(RAND_SEED); } else { /* randomized selection */ srand(libxsmm_timer_tick() % ((unsigned int)-1)); } for (k = (0 == r ? -1 : 0); k < s && EXIT_SUCCESS == result; ++k) { if (0 < r) { const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0; km = randstart(LIBXSMM_ABS(lower), m); kldi = LIBXSMM_MAX(rldi, km); if (('o' == t || 'O' == t)) { const libxsmm_blasint rldo = 0 <= lower ? 
randstart(lower, ldo) : 0; kn = randstart(LIBXSMM_ABS(lower), n); kldo = LIBXSMM_MAX(rldo, kn); /* trigger JIT-generated code */ OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo); } else { #if 0 /* TODO: enable when in-place transpose is fully supported */ kn = randstart(LIBXSMM_ABS(lower), n); #else kn = km; #endif kldo = kldi; /* trigger JIT-generated code */ ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi); } } size += (size_t)(km * kn * sizeof(ELEM_TYPE)); if (('o' == t || 'O' == t)) { if (0 == tasks) { /* library-internal parallelization */ start = libxsmm_timer_tick(); #if defined(OTRANS_THREAD) # pragma omp parallel OTRANS_THREAD(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo, omp_get_thread_num(), omp_get_num_threads()); #else result = OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo); #endif duration += libxsmm_timer_diff(start, libxsmm_timer_tick()); } else { /* external parallelization */ start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel # pragma omp single nowait #endif result = OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo); duration += libxsmm_timer_diff(start, libxsmm_timer_tick()); } } else { assert(('i' == t || 'I' == t) && kldo == kldi); memcpy(b, a, (size_t)(kldi * kn * sizeof(ELEM_TYPE))); if (2 > tasks) { /* library-internal parallelization */ start = libxsmm_timer_tick(); result = ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi); duration += libxsmm_timer_diff(start, libxsmm_timer_tick()); } else { /* external parallelization */ start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel # pragma omp single #endif result = ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi); duration += libxsmm_timer_diff(start, libxsmm_timer_tick()); } } if (0 != check) { /* check */ for (i = 0; i < km; ++i) { libxsmm_blasint j; for (j = 0; j < kn; ++j) { const ELEM_TYPE u = b[i*kldo+j]; const ELEM_TYPE v = a[j*kldi+i]; if (LIBXSMM_NEQ(u, v)) { i += km; /* leave outer loop as well */ result = EXIT_FAILURE; break; } } } } } #if 
defined(USE_REFERENCE) if (0 < check) { /* check shall imply reference (performance-)test */ srand(RAND_SEED); /* reproduce the same sequence as above */ for (k = (0 == r ? -1 : 0); k < s && EXIT_SUCCESS == result; ++k) { if (0 < r) { const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0; km = randstart(LIBXSMM_ABS(lower), m); kldi = LIBXSMM_MAX(rldi, km); if (('o' == t || 'O' == t)) { const libxsmm_blasint rldo = 0 <= lower ? randstart(lower, ldo) : 0; kn = randstart(LIBXSMM_ABS(lower), n); kldo = LIBXSMM_MAX(rldo, kn); } else { #if 0 /* TODO: enable when in-place transpose is fully supported */ kn = randstart(LIBXSMM_ABS(lower), n); #else kn = km; #endif kldo = kldi; } } if (('o' == t || 'O' == t)) { start = libxsmm_timer_tick(); OTRANS_GOLD(&km, &kn, a, &kldi, b, &kldo); duration2 += libxsmm_timer_diff(start, libxsmm_timer_tick()); } else { assert(('i' == t || 'I' == t) && kldo == kldi); memcpy(b, a, (size_t)(kldi * kn * sizeof(ELEM_TYPE))); start = libxsmm_timer_tick(); ITRANS_GOLD(&km, &kn, b, &kldi, &kldo); duration2 += libxsmm_timer_diff(start, libxsmm_timer_tick()); } if (1 < check || 0 > check) { /* check */ for (i = 0; i < km; ++i) { libxsmm_blasint j; for (j = 0; j < kn; ++j) { const ELEM_TYPE u = b[i*kldo+j]; const ELEM_TYPE v = a[j*kldi+i]; if (LIBXSMM_NEQ(u, v)) { i += km; /* leave outer loop as well */ result = EXIT_FAILURE; break; } } } } } } #endif if (EXIT_SUCCESS == result) { const double d = libxsmm_timer_duration(0, duration); if (0 < duration) { /* out-of-place transpose bandwidth assumes RFO */ fprintf(stdout, "\tbandwidth: %.1f GB/s\n", size * ((('o' == t || 'O' == t)) ? 3 : 2) / (d * (1 << 30))); } if (0 == lower) { fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * (d / (0 == r ? 
(s + 1) : s))); } else { fprintf(stdout, "\tduration: %f ms\n", 1000.0 * d); } #if defined(USE_REFERENCE) if (0 < duration2) { fprintf(stdout, "\treference: %.1fx\n", (1.0 * duration) / duration2); } #endif } else if (0 != check) { /* check */ fprintf(stderr, "Error: validation failed for m=%lli, n=%lli, ldi=%lli, and ldo=%lli!\n", (long long)km, (long long)kn, (long long)kldi, (long long)kldo); } libxsmm_free(a); libxsmm_free(b); } return result; }
int main(int argc, char* argv[]) { unsigned int m=8, n=8, k=8, lda=8, ldb=8, ldc=8, nerrs, num, nmat; unsigned int layout, asize, bsize, ntest, ncorr; #ifdef AVX512_TESTING unsigned int VLEND=8, VLENS=16; int arch=LIBXSMM_X86_AVX512_CORE; #else unsigned int VLEND=4, VLENS=8; int arch=LIBXSMM_X86_AVX2; #endif unsigned int nmats, nmatd; unsigned int i, j, l, iunroll, junroll, loopi, loopj; char side='L', uplo='U', transa='N', transb='N', diag='N'; unsigned int typesize8 = 8; unsigned int typesize4 = 4; float *sa, *sb, *sc, *sd, *sc1; double *da, *db, *dc, *dd, *dc1; double dalpha = 1.0; float salpha = (float)dalpha; double dbeta = 1.0; float sbeta = (float)dbeta; double dtmp; const unsigned char *cptr = NULL; unsigned long op_count; const libxsmm_pgemm_descriptor* desc8 = NULL; const libxsmm_pgemm_descriptor* desc4 = NULL; #ifdef USE_XSMM_GENERATED libxsmm_descriptor_blob blob; libxsmm_pgemm_xfunction mykernel = NULL; #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) void (*opcode_routine)(); unsigned char *routine_output; libxsmm_generated_code io_generated_code; int pagesize = sysconf(_SC_PAGE_SIZE); if (pagesize == -1) fprintf(stderr,"sysconf pagesize\n"); routine_output = (unsigned char *) mmap(NULL, BUFSIZE2, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0,0); if (mprotect(routine_output, BUFSIZE2, PROT_EXEC | PROT_READ | PROT_WRITE ) == -1) fprintf(stderr,"mprotect\n"); printf("Routine ready\n"); io_generated_code.generated_code = &routine_output[0]; io_generated_code.buffer_size = BUFSIZE2; io_generated_code.code_size = 0; io_generated_code.code_type = 2; io_generated_code.last_error = 0; #endif printf("\nUSAGE: %s m n k lda ldb ldc nmat layout ntest transa transb iunroll junroll loopj loopi\n",argv[0]); if ( argc <= 3 ) { #ifdef TEST_SINGLE printf("Compact SGEMM a C_mxn<-C_mxn+A_mxk*B_kxn matrix of leading dims lda/b/c\n"); printf("This will test the jit of 1 VLEN=%d ",VLENS); if ( VLENS==8 ) printf("(AVX2)"); else 
printf("(AVX512)"); #else printf("Compact DGEMM a C_mxn<-C_mxn+A_mxk*B_kxn matrix of leading dims lda/b/c\n"); printf("This will test the jit of 1 VLEN=%d ",VLEND); if ( VLEND==4 ) printf("(AVX2)"); else printf("(AVX512)"); #endif printf(" work of nmat at a time\n"); printf("Configurable: M-loop controlled by iunroll & loopi. N-loop by junroll & loopj\n"); printf("Defaults: m=n=k=lda=ldb=ldc=nmat=8, layout=102 (col major), transa=/b='N', ntest=1\n"); } if ( argc > 1 ) m = atoi(argv[1]); else m = 8; if ( argc > 2 ) n = atoi(argv[2]); else n = 8; if ( argc > 3 ) k = atoi(argv[3]); else k = 8; if ( argc > 4 ) lda= atoi(argv[4]); else lda = 8; if ( argc > 5 ) ldb= atoi(argv[5]); else ldb = 8; if ( argc > 6 ) ldc= atoi(argv[6]); else ldc = 8; if ( argc > 7 ) nmat = atoi(argv[7]); else nmat = 8; if ( argc > 8 ) layout = atoi(argv[8]); else layout=102; if ( argc > 9 ) ntest = atoi(argv[9]); else ntest = 1; if ( argc > 10 ) transa = argv[10][0]; else transa = 'N'; if ( argc > 11 ) transb = argv[11][0]; else transb = 'N'; if ( argc > 12 ) iunroll=atoi(argv[12]); else iunroll=0; if ( argc > 13 ) junroll=atoi(argv[13]); else junroll=0; if ( argc > 14 ) loopj=atoi(argv[14]); else loopj=0; if ( argc > 15 ) loopi=atoi(argv[15]); else loopi=0; salpha = (float)dalpha; m = LIBXSMM_MAX(m,1); n = LIBXSMM_MAX(n,1); k = LIBXSMM_MAX(k,1); ntest = LIBXSMM_MAX(ntest,1); nmat = LIBXSMM_MAX(nmat,VLEND); layout = LIBXSMM_MAX(LIBXSMM_MIN(layout,102),101); if ( transa!='N' && transa!='n' && transa!='T' && transa!='t' ) transa='N'; if ( transb!='N' && transb!='n' && transb!='T' && transb!='t' ) transb='N'; lda = LIBXSMM_MAX(lda,m); ldb = LIBXSMM_MAX(ldb,k); ldc = LIBXSMM_MAX(ldc,m); nmats = LIBXSMM_MAX(VLENS,nmat - (nmat%VLENS)); nmatd = LIBXSMM_MAX(VLEND,nmat - (nmat%VLEND)); #ifdef TEST_SINGLE nmat = nmats; #else nmat = nmatd; #endif op_count = (unsigned long)(nmat * 2.0 * (double)m * (double)n * (double)k); #ifdef TEST_SINGLE printf("This is a real*%d tester for JIT compact SGEMM %c%c 
kernels! (m=%u n=%u k=%u lda=%u ldb=%u ldc=%u layout=%d nmat=%d alpha=%g beta=%g iun=%d jun=%d loopi=%d loopj=%d VLEN=%d)\n",typesize4,transa,transb,m,n,k,lda,ldb,ldc,layout,nmat,dalpha,dbeta,iunroll,junroll,loopi,loopj,VLENS); #else printf("This is a real*%d tester for JIT compact DGEMM %c%c kernels! (m=%u n=%u k=%u lda=%u ldb=%u ldc=%u layout=%d nmat=%d alpha=%g beta=%g iun=%d jun=%d loopi=%d loopj=%d VLEN=%d)\n",typesize8,transa,transb,m,n,k,lda,ldb,ldc,layout,nmat,dalpha,dbeta,iunroll,junroll,loopi,loopj,VLEND); #endif #ifdef USE_XSMM_GENERATED printf("This code tests the LIBXSMM generated kernels\n"); #endif #ifdef USE_PREDEFINED_ASSEMBLY printf("This code tests some predefined assembly kernel\n"); #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) printf("This code tests kernel generation directly\n"); #endif #ifdef TIME_MKL printf("This code tests MKL compact batch directly\n"); #endif #ifdef AVX512_TESTING printf("This tests AVX512 binaries\n"); #endif #ifdef AVX2_TESTING printf("This tests AVX2 binaries\n"); #endif desc8 = libxsmm_pgemm_descriptor_init(&blob, typesize8, m, n, k, lda, ldb, ldc, &dalpha, transa, transb, layout ); #ifdef TEST_SINGLE desc4 = libxsmm_pgemm_descriptor_init(&blob, typesize4, m, n, k, lda, ldb, ldc, &dalpha, transa, transb, layout ); #endif printf("Descriptor set\n"); #ifdef USE_XSMM_GENERATED printf("calling libxsmm_dispatch_pgemm: typesize8=%u\n",typesize8); mykernel = libxsmm_dispatch_pgemm(desc8); printf("done calling libxsmm_dispatch_pgemm: typesize8=%u\n",typesize8); if ( mykernel == NULL ) printf("R8 Kernel after the create call is null\n"); #ifdef TEST_SINGLE mykernel = libxsmm_dispatch_pgemm(desc4); if ( mykernel == NULL ) printf("R4 kernel after the create call is null\n"); #endif #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) libxsmm_generator_pgemm_kernel( &io_generated_code, desc8, arch, iunroll, junroll, loopi, loopj ); #endif #ifndef NO_ACCURACY_CHECK printf("mallocing 
matrices\n"); #endif sa = (float *) malloc ( lda*k*nmat*sizeof(float) ); da = (double *) malloc ( lda*k*nmat*sizeof(double) ); sb = (float *) malloc ( ldb*n*nmat*sizeof(float) ); db = (double *) malloc ( ldb*n*nmat*sizeof(double) ); sc1 = (float *) malloc ( ldc*n*nmat*sizeof(float) ); dc1 = (double *) malloc ( ldc*n*nmat*sizeof(double) ); sc = (float *) malloc ( ldc*n*nmat*sizeof(float) ); dc = (double *) malloc ( ldc*n*nmat*sizeof(double) ); sd = (float *) malloc ( ldc*n*nmat*sizeof(float) ); dd = (double *) malloc ( ldc*n*nmat*sizeof(double) ); #ifndef NO_ACCURACY_CHECK printf("filling matrices\n"); #endif sfill_matrix ( sa, lda, m, k*nmat ); sfill_matrix ( sb, ldb, k, n*nmat ); sfill_matrix ( sc, ldc, m, n*nmat ); dfill_matrix ( da, lda, m, k*nmat ); dfill_matrix ( db, ldb, k, n*nmat ); dfill_matrix ( dc, ldc, m, n*nmat ); #ifndef NO_ACCURACY_CHECK for ( i = 0 ; i < ldc*n*nmat ; i++ ) sd[i]=sc[i]; for ( i = 0 ; i < ldc*n*nmat ; i++ ) dd[i]=dc[i]; for ( i = 0 ; i < ldc*n*nmat ; i++ ) sc1[i]=sc[i]; for ( i = 0 ; i < ldc*n*nmat ; i++ ) dc1[i]=dc[i]; printf("Pointing at the kernel now\n"); #endif #ifdef USE_XSMM_GENERATED cptr = (const unsigned char*) mykernel; #endif #ifdef USE_PREDEFINED_ASSEMBLY cptr = (const unsigned char*) gemm_; #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) cptr = (const unsigned char*) &routine_output[0]; opcode_routine = (void *) &cptr[0]; #endif #ifndef TIME_MKL # define DUMP_ASSEMBLY_FILE #endif #ifdef DUMP_ASSEMBLY_FILE printf("Dumping assembly file\n"); FILE *fp = fopen("foo.s","w"); char buffer[80]; fputs("\t.text\n",fp); fputs("\t.align 256\n",fp); fputs("\t.globl gemm_\n",fp); fputs("gemm_:\n",fp); for (i = 0 ; i < 7000; i+=4 ) { sprintf(buffer,".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",cptr[i],cptr[i+1],cptr[i+2],cptr[i+3]); fputs(buffer,fp); } fputs("\tretq\n",fp); fputs("\t.type gemm_,@function\n",fp); fputs("\t.size gemm_,.-gemm_\n",fp); fclose(fp); #endif #if defined(USE_MKL_FOR_REFERENCE) || 
defined(TIME_MKL) # include <mkl.h> MKL_LAYOUT CLAYOUT = (layout == 101) ? MKL_ROW_MAJOR : MKL_COL_MAJOR; MKL_SIDE SIDE = (side == 'R' || side == 'r') ? MKL_RIGHT : MKL_LEFT; MKL_UPLO UPLO = (uplo == 'U' || uplo == 'u') ? MKL_UPPER : MKL_LOWER; MKL_TRANSPOSE TRANSA = (transa == 'N' || transa == 'n') ? MKL_NOTRANS : MKL_TRANS; MKL_TRANSPOSE TRANSB = (transb == 'N' || transb == 'n') ? MKL_NOTRANS : MKL_TRANS; MKL_DIAG DIAG = (diag == 'N' || diag == 'n') ? MKL_NONUNIT : MKL_UNIT; MKL_COMPACT_PACK CMP_FORMAT = mkl_get_format_compact(); #if 0 MKL_COMPACT_PACK CMP_FORMAT = MKL_COMPACT_AVX; #endif #endif #ifndef NO_ACCURACY_CHECK printf("Before routine, initial A(1,1)=%g A[256]=%g\n",da[0],da[256]); #endif #ifdef USE_PREDEFINED_ASSEMBLY double one = 1.0; #endif double timer, firsttime = 0; #ifdef MKL_TIMER double tmptimer; tmptimer = dsecnd_(); #else unsigned long long l_start, l_end; #endif timer = 0.0; for ( j = 0 ; j < (int)ntest ; j++ ) { for ( i = 0 ; i < ldc*n*nmat ; i++ ) dc[i]=dc1[i]; for ( i = 0 , num = 0; i < (int)nmat ; i+= (int)VLEND, num++ ) { double *Ap = &da[num*lda*k*VLEND]; double *Bp = &db[num*ldb*n*VLEND]; double *Cp = &dc[num*ldc*n*VLEND]; #ifdef MKL_TIMER tmptimer = dsecnd_(); #else l_start = libxsmm_timer_tick(); #endif #if !defined(USE_XSMM_GENERATED) && !defined(USE_PREDEFINED_ASSEMBLY) && !defined(USE_KERNEL_GENERATION_DIRECTLY) && !defined(TIME_MKL) && !defined(USE_PREDEFINED_ASSEMBLY_XCT) gen_compact_dgemm_ ( &layout, &m, &n, &k, &dalpha, Ap, &lda, Bp, &ldb, &dbeta, Cp, &ldc, &VLEND ); #endif #ifdef USE_XSMM_GENERATED mykernel ( Ap, Bp, Cp ); #endif #ifdef USE_PREDEFINED_ASSEMBLY gemm_ ( Ap, Bp, Cp ); #endif #ifdef USE_KERNEL_GENERATION_DIRECTLY (*opcode_routine)( Ap, Bp, Cp ); #endif #ifdef TIME_MKL mkl_dgemm_compact ( CLAYOUT, TRANSA, TRANSB, m, n, k, dalpha, da, lda, db, ldb, dbeta, dc, ldc, CMP_FORMAT, nmat ); i+=nmatd; /* Because MKL will do everything */ #endif #ifdef MKL_TIMER dtmp = dsecnd_() - tmptimer; #else l_end = 
libxsmm_timer_tick(); dtmp = libxsmm_timer_duration(l_start,l_end); #endif if ( j == 0 ) firsttime=dtmp; timer += dtmp; } } if ( ntest >= 100 ) { /* Skip the first timing: super necessary if using MKL */ timer = (timer-firsttime)/((double)(ntest-1)); } else { timer /= ((double)ntest); } #ifndef NO_ACCURACY_CHECK printf("Average time to get through %u matrices: %g\n",nmat,timer); printf("Gflops: %g\n",(double)op_count/(timer*1.0e9)); printf("after routine, new C(1,1)=%g C[256]=%g\n",dc[0],dc[256]); #endif #ifdef TEST_SINGLE printf("Before r4 routine, initial C(1,1)=%g C[256]=%g\n",sc[0],sc[256]); for ( i = 0 , num = 0; i < nmats ; i+= VLENS, num++ ) { float *Ap = &sa[num*lda*k*VLENS]; float *Bp = &sb[num*ldb*n*VLENS]; float *Cp = &sc[num*ldc*n*VLENS]; #ifdef USE_XSMM_GENERATED mykernel ( Ap, Bp, Cp ); #endif } printf("after r4 routine, new C(1,1)=%g C]256]=%g\n",dc[0],dc[256]); #endif #ifndef NO_ACCURACY_CHECK /* Call some reference code now on a copy of the B matrix (C) */ double timer2 = 0.0; for ( j = 0 ; j < (int)ntest ; j++ ) { for ( i = 0 ; i < ldc*n*nmat ; i++ ) dd[i]=dc1[i]; #ifdef MKL_TIMER tmptimer = dsecnd_(); #else l_start = libxsmm_timer_tick(); #endif #ifndef USE_MKL_FOR_REFERENCE compact_dgemm_ ( &layout, &transa, &transb, &m, &n, &k, &dalpha, da, &lda, db, &ldb, &dbeta, dd, &ldc, &nmat, &VLEND ); #else mkl_dgemm_compact ( CLAYOUT, TRANSA, TRANSB, m, n, k, dalpha, da, lda, db, ldb, dbeta, dd, ldc, CMP_FORMAT, nmat ); #endif #ifdef MKL_TIMER timer2 += dsecnd_() - tmptimer; #else l_end = libxsmm_timer_tick(); timer2 += libxsmm_timer_duration(l_start,l_end); #endif } timer2 /= ((double)ntest); printf("Reference time=%g Reference Gflops=%g\n",timer2,op_count/(timer2*1.0e9)); /* Compute the residual between B and C */ dtmp = residual_d ( dc, ldc, m, n*nmat, dd, ldc, &nerrs, &ncorr ); printf("R8 mnk=%u %u %u ldabc=%u %u %u error: %g number of errors: %u corrects: %u",m,n,k,lda,ldb,ldc,dtmp,nerrs,ncorr); if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*8 %u 
case",m,n,layout); printf("\n"); #ifdef TEST_SINGLE /* Call some reference code now on a copy of the B matrix (C) */ compact_dgemm_ ( &layout, &transa, &transb, &m, &n, &k, &salpha, sa, &lda, sb, &ldb, &sbeta, sd, &ldc, &nmat, &VLENS ); /* Compute the residual between C and D */ dtmp = residual_s ( sc, ldc, m, n*nmat, sd, ldc, &nerrs, &ncorr ); printf("R4 mnk=%u %u %u ldabc=%u %u %u error: %g number of errors: %u corrects: %u",m,n,k,lda,ldb,ldc,dtmp,nerrs,ncorr); if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*4 case",m,n); printf("\n"); #endif #else for ( j = 0, nerrs = 0 ; j < lda*n*nmat; j++ ) { if ( isnan(dc[j]) || isinf(dc[j]) ) { if ( ++nerrs < 10 ) { printf("WARNING: dc[%d]=%g\n",j,dc[j]); } } } printf("%g,real*8 m/n/k=%u %u %u lda-c=%u %u %u Denormals=%u Time=%g Gflops=%g",op_count/(timer*1.0e9),m,n,k,lda,ldb,ldc,nerrs,timer,op_count/(timer*1.0e9)); if ( nerrs > 0 ) printf(" -> FAILED at %ux%u real*8 case",m,n); printf("\n"); #endif free(dd); free(sd); free(dc); free(sc); free(dc1); free(sc1); free(db); free(sb); free(da); free(sa); return 0; }