int main(int argc, char* argv[]) { int M = ( argc == 7 ) ? atoi(argv[1]) : 9; int N = ( argc == 7 ) ? atoi(argv[2]) : 10; int K = ( argc == 7 ) ? atoi(argv[3]) : 9; unsigned int N_CRUNS = ( argc == 7 ) ? atoi(argv[4]) : 8; unsigned int REPS = ( argc == 7 ) ? atoi(argv[5]) : 1; char* l_csr_file = ( argc == 7 ) ? argv[6] : "file.csr"; const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE; const int flags = LIBXSMM_GEMM_FLAGS('N', 'N'); const REALTYPE alpha = 1, beta = 1; REALTYPE* l_a_de = (REALTYPE*)libxsmm_aligned_malloc(K * K * sizeof(REALTYPE), 64); REALTYPE* l_a_sp = NULL; REALTYPE* l_b = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS* sizeof(REALTYPE), 64); unsigned int* l_rowptr = NULL; unsigned int* l_colidx = NULL; unsigned int l_rowcount, l_colcount, l_elements; REALTYPE* l_c = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS * sizeof(REALTYPE), 64); REALTYPE* l_c_gold = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS * sizeof(REALTYPE), 64); REALTYPE* l_c_asm = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS * sizeof(REALTYPE), 64); REALTYPE l_max_error = 0.0; unsigned int l_k, l_n; int l_i, l_j, l_jj; LIBXSMM_VLA_DECL(3, REALTYPE, l_p_b, l_b, N, N_CRUNS); LIBXSMM_VLA_DECL(3, REALTYPE, l_p_c_asm, l_c_asm, N, N_CRUNS); LIBXSMM_VLA_DECL(3, REALTYPE, l_p_c_gold, l_c_gold, N, N_CRUNS); libxsmm_descriptor_blob l_xgemm_blob; const libxsmm_gemm_descriptor* l_xgemm_desc = 0; LIBXSMM_MMFUNCTION_TYPE(REALTYPE) mykernel = NULL; unsigned long long l_start, l_end; double l_total; if (argc != 7) { fprintf( stderr, "arguments: M #iters CSR-file!\n" ); return -1; } /* touch B */ for ( l_i = 0; l_i < K; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_k = 0; l_k < N_CRUNS; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_b, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)libxsmm_rand_f64(); } } } /* touch C */ for ( l_i = 0; l_i < K; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_k = 0; l_k < N_CRUNS; l_k++ ) { LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)0.0; LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)0.0; } } } /* read A, CSR */ libxsmm_sparse_csr_reader( l_csr_file, &l_rowptr, &l_colidx, &l_a_sp, &l_rowcount, &l_colcount, &l_elements ); /* copy b to dense */ printf("CSR matrix data structure we just read:\n"); printf("rows: %u, columns: %u, elements: %u\n", l_rowcount, l_colcount, l_elements); for ( l_n = 0; l_n < (((unsigned int)K) * K); l_n++) { l_a_de[l_n] = 0.0; } for ( l_n = 0; l_n < (unsigned int)K; l_n++) { const unsigned int l_rowelems = l_rowptr[l_n+1] - l_rowptr[l_n]; assert(l_rowptr[l_n+1] >= l_rowptr[l_n]); for ( l_k = 0; l_k < l_rowelems; l_k++) { l_a_de[(l_n * K) + l_colidx[l_rowptr[l_n] + l_k]] = l_a_sp[l_rowptr[l_n] + l_k]; } } /* dense routine */ l_start = libxsmm_timer_tick(); #if 1 for ( l_n = 0; l_n < REPS; l_n++) { for ( l_i = 0; l_i < K; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_jj = 0; l_jj < K; l_jj++) { LIBXSMM_PRAGMA_SIMD for (l_k = 0; l_k < N_CRUNS; l_k++) { LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) += l_a_de[(l_i*K)+l_jj] * LIBXSMM_VLA_ACCESS(3, l_p_b, l_jj, l_j, l_k, N, N_CRUNS); } } } } } #endif l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("%fs for dense\n", l_total); printf("%f GFLOPS for dense\n", ((double)((double)REPS * (double)K * (double)K * (double)N * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9)); l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION(REALTYPE), K, N, K, 0, N, N, alpha, beta, flags, prefetch); /* sparse routine */ #if defined(__EDGE_EXECUTE_F32__) mykernel = libxsmm_create_xcsr_soa( l_xgemm_desc, l_rowptr, l_colidx, (const void*)l_a_sp ).smm; #else mykernel = libxsmm_create_xcsr_soa( l_xgemm_desc, l_rowptr, l_colidx, (const void*)l_a_sp ).dmm; #endif l_start = libxsmm_timer_tick(); for ( l_n = 0; l_n < REPS; l_n++) { mykernel( l_a_sp, l_b, l_c_asm ); } l_end = libxsmm_timer_tick(); l_total = libxsmm_timer_duration(l_start, l_end); printf("%fs for sparse (asm)\n", l_total); printf("%f GFLOPS for sparse (asm)\n", ((double)((double)REPS * (double)K * (double)l_elements * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9)); /* check for errors */ l_max_error = (REALTYPE)0.0; for ( l_i = 0; l_i < K; l_i++) { for ( l_j = 0; l_j < N; l_j++) { for ( l_k = 0; l_k < N_CRUNS; l_k++ ) { if (fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) - LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) ) > l_max_error ) { l_max_error = (REALTYPE)fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) -LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) ); } } } } printf("max error: %f\n", l_max_error); printf("PERFDUMP,%s,%u,%i,%i,%i,%u,%u,%f,%f,%f\n", l_csr_file, REPS, M, N, K, l_elements, K * l_elements * N_CRUNS * 2, l_max_error, l_total, ((double)((double)REPS * (double)K * (double)l_elements * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9) ); /* free */ libxsmm_free( l_a_de ); libxsmm_free( l_b ); libxsmm_free( l_c ); libxsmm_free( l_c_gold ); libxsmm_free( l_c_asm ); free( l_a_sp ); free( l_rowptr ); free( l_colidx ); return 0; }
int main(int argc, char *argv[]) { REAL_TYPE *A_gold, *B_gold, *A_gold2, *B_gold2; float *C_gold, *C0_gold, *C, *C2; int M, N, K; REAL_TYPE alpha, beta; int reps; libxsmm_spmdm_handle handle, handle2; libxsmm_CSR_sparseslice *A_sparse, *A_sparse2; int max_threads; /* Step 1: Read in args */ libxsmm_timer_tickint start, end; double flops, duration; char transA, transB, transC; int i, j, k; size_t l; /* Step 1: Initialize handle */ M = 0; N = 0; K = 0; alpha = (REAL_TYPE)1.0; beta = (REAL_TYPE)0.0; reps = 0; transA = 'N'; transB = 'N'; if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("\nUsage: %s [M] [N] [K] [transA] [transB] [reps]\n\n", argv[0]); return EXIT_SUCCESS; } /* defaults */ M = 2048; N = 2048; K = 2048; transA = 'N'; transB = 'N'; transC = 'N'; reps = 100; /* reading new values from cli */ i = 1; if (argc > i) M = atoi(argv[i++]); if (argc > i) N = atoi(argv[i++]); if (argc > i) K = atoi(argv[i++]); if (argc > i) { transA = argv[i][0]; i++; } if (argc > i) { transB = argv[i][0]; i++; } if (argc > i) { transC = argv[i][0]; i++; } if (argc > i) reps = atoi(argv[i++]); /* Step 2: allocate data */ A_gold = (REAL_TYPE*)libxsmm_aligned_malloc( M*K*sizeof(REAL_TYPE), 64 ); B_gold = (REAL_TYPE*)libxsmm_aligned_malloc( K*N*sizeof(REAL_TYPE), 64 ); C_gold = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 ); C0_gold = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 ); C = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 ); /* Step 3: init data */ libxsmm_rng_set_seed(1); for (l = 0; l < (size_t)M * (size_t)K; ++l) { const double r64 = libxsmm_rng_f64(); const float r32 = (float)r64; #ifdef USE_BFLOAT const int r = *(const int*)(&r32); const libxsmm_bfloat16 val = (r >> 16); #else const float val = r32; #endif if (r64 > 0.85) A_gold[l] = val; else A_gold[l] = (REAL_TYPE)0.0; } for (l = 0; l < (size_t)K * (size_t)N; ++l) { const double r64 = libxsmm_rng_f64(); const float r32 = (float)r64; #ifdef USE_BFLOAT const int r = *(const int*)(&r32); const libxsmm_bfloat16 val = (r >> 16); #else const float val = r32; #endif B_gold[l] = val; } for (l = 0; l < (size_t)M * (size_t)N; ++l) { C0_gold[l] = (float)libxsmm_rng_f64(); C_gold[l] = C0_gold[l]; } for (l = 0; l < (size_t)M * (size_t)N; ++l) { C[l] = (float)C0_gold[l]; } flops = (double)M * (double)N * (double)K * 2.0; /*----------------------------------------------------------------------------------------------------------------------*/ /* Step 4: Initialize LIBXSMM for these sizes - allocates handle and temporary space for the sparse data structure for A */ # if defined(_OPENMP) max_threads = omp_get_max_threads(); # else max_threads = 1; # endif start = libxsmm_timer_tick(); libxsmm_spmdm_init(M, N, K, max_threads, &handle, &A_sparse); end = libxsmm_timer_tick(); printf("Time for handle init = %f\n", libxsmm_timer_duration(start, end)); printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i -- forward pass\n", M, N, K, handle.bm, handle.bn, handle.bk, handle.mb, handle.nb, handle.kb, reps ); /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */ /* Currently ignores alpha */ /* TODO: fix alpha input */ # ifdef USE_BFLOAT spmdm_exec_bfloat16(&handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse); # else spmdm_exec_fp32(&handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse); # endif /* Checks */ /* Compute a "gold" answer sequentially */ #if defined(_OPENMP) LIBXSMM_OMP_VAR(k); # pragma omp parallel for private(i, j, k) LIBXSMM_OPENMP_COLLAPSE(2) #endif for (i = 0; i < M; ++i) { for (j = 0; j < N; ++j) { float sum = 0.0; float Cval; for (k = 0; k < K; ++k) { # ifdef USE_BFLOAT libxsmm_bfloat16 Atmp = A_gold[i*K+k]; int Atmp_int = Atmp; Atmp_int <<= 16; float Aval = *(float *)&Atmp_int; libxsmm_bfloat16 Btmp = B_gold[k*N+j]; int Btmp_int = Btmp; Btmp_int <<= 16; float Bval = *(float *)&Btmp_int; # else float Aval = A_gold[i*K + k]; float Bval = B_gold[k*N + j]; # endif sum += Aval * Bval; } Cval = sum; C_gold[i*N + j] = Cval + beta*C_gold[i*N + j]; } } /* LIBXSMM_FSYMBOL(sgemm)(&trans, &trans, &N, &M, &K, &alpha, B_gold, &N, A_gold, &K, &beta, C_gold, &N); */ /* Compute the max difference between gold and computed results. */ spmdm_check_c( &handle, C, C_gold ); /* Timing loop starts */ start = libxsmm_timer_tick(); for (i = 0; i < reps; ++i) { # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse); # else spmdm_exec_fp32( &handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse); # endif } end = libxsmm_timer_tick(); duration = libxsmm_timer_duration(start, end); printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps); libxsmm_spmdm_destroy(&handle); /*----------------------------------------------------------------------------------------------------------------------*/ /* Step 5: Initialize libxsmm for transpose A - allocates handle and temporary space for the sparse data structure for A */ transA = 'T'; transB = 'N'; transC = 'T'; libxsmm_spmdm_init(M, N, K, max_threads, &handle2, &A_sparse2); printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i, transA = Y, transC = Y -- weight update\n", handle2.m, handle2.n, handle2.k, handle2.bm, handle2.bn, handle2.bk, handle2.mb, handle2.nb, handle2.kb, reps ); A_gold2 = (REAL_TYPE*)libxsmm_aligned_malloc( M*K*sizeof(REAL_TYPE), 64 ); C2 = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 ); for (i = 0; i < M; ++i) { for (j = 0; j < K; ++j) { A_gold2[j*M + i] = A_gold[i*K + j]; } } for (i = 0; i < M; ++i) { for (j = 0; j < N; ++j) { C[j*M + i] = (float)C0_gold[i*N + j]; } } /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */ /* Currently ignores alpha */ /* TODO: fix alpha inputs */ # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2); # else spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2); # endif for (i = 0; i < M; ++i) { for (j = 0; j < N; ++j) { C2[i*N + j] = C[j*M + i]; } } /* Checks */ spmdm_check_c( &handle2, C2, C_gold); /* Timing loop starts */ start = libxsmm_timer_tick(); for (i = 0; i < reps; ++i) { # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2); # else spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2); # endif } end = libxsmm_timer_tick(); duration = libxsmm_timer_duration(start, end); printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps); /*----------------------------------------------------------------------------------------------------------------------*/ /* Step 6: Test transpose B */ transA = 'N'; transB = 'T'; transC = 'N'; printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i, transB = Y -- backprop\n", handle2.m, handle2.n, handle2.k, handle2.bm, handle2.bn, handle2.bk, handle2.mb, handle2.nb, handle2.kb, reps ); B_gold2 = (REAL_TYPE*)libxsmm_aligned_malloc( K*N*sizeof(REAL_TYPE), 64 ); for (i = 0; i < K; ++i) { for (j = 0; j < N; ++j) { B_gold2[j*K + i] = B_gold[i*N + j]; } } for (l = 0; l < (size_t)M * (size_t)N; ++l) { C[l] = (float)C0_gold[l]; } /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */ /* Currently ignores alpha */ /* TODO: fix alpha inputs */ # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2); # else spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2); # endif /* Checks */ spmdm_check_c( &handle2, C, C_gold); /* Timing loop starts */ start = libxsmm_timer_tick(); for (i = 0; i < reps; ++i) { # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2); # else spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2); # endif } end = libxsmm_timer_tick(); duration = libxsmm_timer_duration(start, end); printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps); libxsmm_spmdm_destroy(&handle2); libxsmm_free(A_gold); libxsmm_free(B_gold); libxsmm_free(C_gold); libxsmm_free(C); libxsmm_free(C2); libxsmm_free(C0_gold); libxsmm_free(B_gold2); libxsmm_free(A_gold2); return EXIT_SUCCESS; }