int main(int argc, char** argv) { char* mpi_inbuf; char* mpi_outbuf; char* farc_inbuf; char* farc_outbuf; MPI_Init(&argc, &argv); test_start("unpack(2, hindexed[{(1*MPI_INT, offset=4), (3*MPI_INT, offset=16), (2*MPI_INT, offset=32)}])"); init_buffers(20*sizeof(int), &mpi_inbuf, &farc_inbuf, &mpi_outbuf, &farc_outbuf); MPI_Datatype mpitype; int blocklen[3] = {1, 3, 2}; MPI_Aint disp[3] = {4, 16, 32}; MPI_Type_create_hindexed(3, blocklen, disp, MPI_INT, &mpitype); MPI_Type_commit(&mpitype); farc::DDT_Init(); farc::Datatype* t1 = new farc::PrimitiveDatatype(farc::PrimitiveDatatype::INT); farc::Datatype* t2 = new farc::HIndexedDatatype(3, blocklen, disp, t1); farc::DDT_Commit(t2); farc::DDT_Unpack(farc_inbuf, farc_outbuf, t2, 2); int position = 0; MPI_Unpack(mpi_inbuf, 20*sizeof(int), &position, mpi_outbuf, 2, mpitype, MPI_COMM_WORLD); int res = compare_buffers(20*sizeof(int), &mpi_inbuf, &farc_inbuf, &mpi_outbuf, &farc_outbuf); free_buffers(&mpi_inbuf, &farc_inbuf, &mpi_outbuf, &farc_outbuf); test_result(res); MPI_Finalize(); return 0; }
static int test_circle(const struct testcase *t) { GP_Context *c; int err; c = GP_ContextAlloc(t->w, t->h, GP_PIXEL_G8); if (c == NULL) { tst_err("Failed to allocate context"); return TST_UNTESTED; } /* zero the pixels buffer */ memset(c->pixels, 0, c->w * c->h); GP_FillCircle(c, t->x, t->y, t->r, 1); err = compare_buffers(t->pixmap, c); if (err) return TST_FAILED; return TST_SUCCESS; }
int main(int, char **) { Halide::Buffer<uint8_t> bufA(NN, NN); Halide::Buffer<uint8_t> output_buf(NN, NN); Halide::Buffer<uint8_t> reference_buf(NN, NN); init_buffer(output_buf, (uint8_t) 177); // Dummy data for (int i = 0; i < NN; i++) { for (int j = 0; j < NN; j++) { bufA(j, i) = i * 10 + j; } } for (int i = 0; i < NN; i++) { for (int j = 0; j < NN; j++) { // Same access pattern as in generator reference_buf((i + j * (NN - 1)) % NN, (i + j * 2) % NN) = bufA((j + 3) % NN, i); } } func(bufA.raw_buffer(), output_buf.raw_buffer()); compare_buffers("test", output_buf, reference_buf); return 0; }
void test_gemm(const std::string &name, int M, int N, int K, float alpha, float beta, int rowsA, int colsA, int rowsB, int colsB, int rowsC, int colsC, int offsetA, int offsetB, int offsetC, bool transposeA, bool transposeB) { Halide::Buffer<int32_t> sizes(12); Halide::Buffer<float> params(2); Halide::Buffer<bool> transposes(2); Halide::Buffer<float> A(colsA, rowsA); Halide::Buffer<float> B(colsB, rowsB); Halide::Buffer<float> C(colsC, rowsC); Halide::Buffer<float> C_ref(colsC, rowsC); sizes(0) = M; sizes(1) = N; sizes(2) = K; sizes(3) = rowsA; sizes(4) = colsA; sizes(5) = rowsB; sizes(6) = colsB; sizes(7) = rowsC; sizes(8) = colsC; sizes(9) = offsetA; sizes(10) = offsetB; sizes(11) = offsetC; params(0) = alpha; params(1) = beta; transposes(0) = transposeA; transposes(1) = transposeB; for (int i = 0; i < rowsA; i++) for (int j = 0; j < colsA; j++) A(j, i) = std::rand() % 10 - 5; for (int i = 0; i < rowsB; i++) for (int j = 0; j < colsB; j++) B(j, i) = std::rand() % 10 - 5; for (int i = 0; i < rowsC; i++) for (int j = 0; j < colsC; j++) C(j, i) = C_ref(j, i) = std::rand() % 10 - 5; for (int i = 0; i < M; i++) { for (int j = 0; j < N; j++) { C_ref(j + offsetC % colsC, i + offsetC / colsC) *= beta; for (int k = 0; k < K; k++) { float a = transposeA ? A(i + offsetA % colsA, k + offsetA / colsA) : A(k + offsetA % colsA, i + offsetA / colsA); float b = transposeB ? B(k + offsetB % colsB, j + offsetB / colsB) : B(j + offsetB % colsB, k + offsetB / colsB); C_ref(j + offsetC % colsC, i + offsetC / colsC) += alpha * a * b; } } } test_164(sizes.raw_buffer(), params.raw_buffer(), transposes.raw_buffer(), A.raw_buffer(), B.raw_buffer(), C.raw_buffer()); compare_buffers(name, C, C_ref); }
int main(int argc, char** argv) { std::vector<std::chrono::duration<double, std::milli>> duration_vector_1, duration_vector_2; bool run_ref = false, run_tiramisu = false; const char* env_ref = std::getenv("RUN_REF"); if (env_ref != NULL && env_ref[0] == '1') run_ref = true; const char* env_tiramisu = std::getenv("RUN_TIRAMISU"); if (env_tiramisu != NULL && env_tiramisu[0] == '1') run_tiramisu = true; // --------------------------------------------------------------------- // --------------------------------------------------------------------- // --------------------------------------------------------------------- double alpha = 2.5; Halide::Buffer<int> SIZES(1); SIZES(0) = nrow; Halide::Buffer<double> b_alpha(1); b_alpha(0) = alpha; Halide::Buffer<double> b_X(nrow), b_X_ref(nrow); // --------------------------------------------------------------------- // --------------------------------------------------------------------- // --------------------------------------------------------------------- { for (int i = 0; i < NB_TESTS; ++i) { init_buffer(b_X_ref, (double)1); auto start = std::chrono::high_resolution_clock::now(); if (run_ref) scal_ref(nrow, alpha, b_X_ref.data()); auto end = std::chrono::high_resolution_clock::now(); duration_vector_1.push_back(end - start); } } { for (int i = 0; i < NB_TESTS; ++i) { init_buffer(b_X, (double)1); auto start = std::chrono::high_resolution_clock::now(); if (run_tiramisu) scal(SIZES.raw_buffer(), b_alpha.raw_buffer(), b_X.raw_buffer()); auto end = std::chrono::high_resolution_clock::now(); duration_vector_2.push_back(end - start); } } print_time("performance_cpu.csv", "scal", {"Ref", "Tiramisu"}, {median(duration_vector_1), median(duration_vector_2)}); if (CHECK_CORRECTNESS && run_ref && run_tiramisu) compare_buffers("scal", b_X_ref, b_X); if (PRINT_OUTPUT) { std::cout << "Tiramisu " << std::endl; print_buffer(b_X); std::cout << "Reference " << std::endl; print_buffer(b_X_ref); } return 0; }
int main(int argc, char **argv) { int64_t block_size, buf_size, state_offset; int64_t fsize1, fsize2, max_size, cpos, len, bad_bytes, bad_groups; int start_option, i, state; char *fname1, *fname2; char *buf1, *buf2; FILE *fd1, *fd2; block_size = 64 *1024; buf_size = 20 *1024*1024; if (argc < 3) { printf("\n"); printf("ldiff [-b block_size] [-s buffer_size] file1 file2\n"); printf(" -b block_size Block size for matching to cache page boundaries. Defaults to " I64T ". Can use units.\n", block_size); printf(" -s buffer_size Buffer size used in the comparison. This is divided in 1/2 for each buffer. Defaults to " I64T ". Cant use units.\n", buf_size); printf("\n"); return(1); } //*** Parse the args i=1; if (argc > 1) { do { start_option = i; if (strcmp(argv[i], "-b") == 0) { //** Block size i++; block_size = string_get_integer(argv[i]); i++; } else if (strcmp(argv[i], "-s") == 0) { //** Buffer size i++; buf_size = string_get_integer(argv[i]); i++; } } while ((start_option < i) && (i<argc)); } if ((argc-i) < 2) { printf("Missing filenames!\n"); return(1); } //** Open the files and get their sizes fname1 = argv[i]; fname2 = argv[i+1]; fd1 = fopen(fname1, "r"); assert(fd1 != NULL); fd2 = fopen(fname2, "r"); assert(fd2 != NULL); fseek(fd1, 0, SEEK_END); fseek(fd2, 0, SEEK_END); fsize1 = ftell(fd1); fsize2 = ftell(fd2); fseek(fd1, 0, SEEK_SET); fseek(fd2, 0, SEEK_SET); //** Print a summary of options printf("File 1: %s (" I64T " bytes)\n", fname1, fsize1); printf("File 2: %s (" I64T " bytes)\n", fname2, fsize2); if (fsize1 != fsize2) printf("WARNING: File sizes differ!!!!!\n"); printf("Block size: " I64T "\n", block_size); printf("Buffer size: " I64T "\n", buf_size); buf_size /= 2; //** It's split between the 2 buffers type_malloc(buf1, char, buf_size); type_malloc(buf2, char, buf_size); max_size = (fsize1 > fsize2) ? fsize2 : fsize1; printf("\n"); printf("Printing comparision breakdown -- Single byte matches are suppressed (max_size=" I64T ")\n", max_size); state = 0; state_offset = 0; bad_bytes = bad_groups = 0; for (cpos=0; cpos < max_size; cpos += buf_size) { len = ((cpos+buf_size) < max_size) ? buf_size : max_size - cpos; assert_result(fread(buf1, 1, len, fd1), len); assert_result(fread(buf2, 1, len, fd2), len); compare_buffers(buf1, buf2, len, cpos, &bad_bytes, &bad_groups, block_size, &state, &state_offset, max_size); } printf("\n"); printf("Bad bytes: " I64T " Bad groups: " I64T "\n", bad_bytes, bad_groups); free(buf1); free(buf2); fclose(fd1); fclose(fd2); i = ((bad_bytes > 0) || (fsize1 != fsize2)) ? 1 : 0; return(i); }
int main(int, char**) { std::vector<std::chrono::duration<double,std::milli>> duration_vector_1; std::vector<std::chrono::duration<double,std::milli>> duration_vector_2; Halide::Buffer<uint8_t> input = Halide::Tools::load_image("./utils/images/rgb.png"); Halide::Buffer<int32_t> size(2); size(0) = input.extent(0); size(1) = input.extent(1); Halide::Buffer<uint8_t> output_ref_y(input.width(), input.height()); Halide::Buffer<uint8_t> output_ref_u(input.width()/2, input.height()/2); Halide::Buffer<uint8_t> output_ref_v(input.width()/2, input.height()/2); Halide::Buffer<uint8_t> output_tiramisu_y(input.width(), input.height()); Halide::Buffer<uint8_t> output_tiramisu_u(input.width()/2, input.height()/2); Halide::Buffer<uint8_t> output_tiramisu_v(input.width()/2, input.height()/2); std::cout << "STARTING TEST\n"; std::cout << "y size (width, height): " << output_tiramisu_y.width() << ", " << output_tiramisu_y.height() << "\n"; std::cout << "u size (width, height): " << output_tiramisu_u.width() << ", " << output_tiramisu_u.height() << "\n"; std::cout << "v size (width, height): " << output_tiramisu_v.width() << ", " << output_tiramisu_v.height() << "\n"; // Warm up rgbyuv420gpu_tiramisu(size.raw_buffer(), input.raw_buffer(), output_tiramisu_y.raw_buffer(), output_tiramisu_u.raw_buffer(), output_tiramisu_v.raw_buffer()); run_halide(input, output_ref_y, output_ref_u, output_ref_v); // Tiramisu for (int i=0; i<NB_TESTS; i++) { auto start1 = std::chrono::high_resolution_clock::now(); rgbyuv420gpu_tiramisu(size.raw_buffer(), input.raw_buffer(), output_tiramisu_y.raw_buffer(), output_tiramisu_u.raw_buffer(), output_tiramisu_v.raw_buffer()); auto end1 = std::chrono::high_resolution_clock::now(); std::chrono::duration<double,std::milli> duration1 = end1 - start1; duration_vector_1.push_back(duration1); } // Reference for (int i=0; i<NB_TESTS; i++) { duration_vector_2.push_back(run_halide(input, output_ref_y, output_ref_u, output_ref_v)); } print_time("performance_CPU.csv", "rgbyuv420gpu", {"Tiramisu", "Halide"}, {median(duration_vector_1), median(duration_vector_2)}); Halide::Tools::save_image(output_tiramisu_y, "./build/rgbyuv420gpu_y_tiramisu.png"); Halide::Tools::save_image(output_tiramisu_u, "./build/rgbyuv420gpu_u_tiramisu.png"); Halide::Tools::save_image(output_tiramisu_v, "./build/rgbyuv420gpu_v_tiramisu.png"); Halide::Tools::save_image(output_ref_y, "./build/rgbyuv420gpu_y_ref.png"); Halide::Tools::save_image(output_ref_u, "./build/rgbyuv420gpu_u_ref.png"); Halide::Tools::save_image(output_ref_v, "./build/rgbyuv420gpu_v_ref.png"); if (CHECK_CORRECTNESS) { std::cout << "Compare y buffer\n"; compare_buffers("benchmark_rgbyuv420gpu", output_tiramisu_y, output_ref_y); std::cout << "Compare u buffer\n"; compare_buffers("benchmark_rgbyuv420gpu", output_tiramisu_u, output_ref_u); std::cout << "Compare v buffer\n"; compare_buffers("benchmark_rgbyuv420gpu", output_tiramisu_y, output_ref_y); } return 0; }
int main(int argc, char** argv) { MPI_Init(&argc, &argv); int rank, peer, commsize; MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &commsize); if (rank % 2) peer = rank - 1 % commsize; else peer = rank + 1 % commsize; if (commsize % 2 != 0) { fprintf(stderr, "Use even number of processes.\n"); exit(EXIT_FAILURE); } char* mpi_inbuf; char* mpi_outbuf; char* pmpi_inbuf; char* pmpi_outbuf; test_start("isend/irecv + test (2, vector[[int], count=2, blklen=3, stride=5])"); init_buffers(20*sizeof(int), &mpi_inbuf, &pmpi_inbuf, &mpi_outbuf, &pmpi_outbuf); MPI_Datatype vector_ddt; MPI_Type_vector(2, 3, 5, MPI_INT, &vector_ddt); MPI_Type_commit(&vector_ddt); MPI_Datatype pmpi_vector_ddt; PMPI_Type_vector(2, 3, 5, MPI_INT, &pmpi_vector_ddt); PMPI_Type_commit(&pmpi_vector_ddt); MPI_Request requests_mpi[2]; MPI_Request requests_pmpi[2]; MPI_Status statuses_mpi[2]; MPI_Status statuses_pmpi[2]; if (rank % 2 == 0) { MPI_Isend(mpi_inbuf, 2, vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_mpi[0])); MPI_Irecv(mpi_outbuf, 2, vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_mpi[1])); PMPI_Isend(pmpi_inbuf, 2, pmpi_vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_pmpi[0])); PMPI_Irecv(pmpi_outbuf, 2, pmpi_vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_pmpi[1])); } else { MPI_Irecv(mpi_outbuf, 2, vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_mpi[0])); MPI_Isend(mpi_inbuf, 2, vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_mpi[1])); PMPI_Irecv(pmpi_outbuf, 2, pmpi_vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_pmpi[0])); PMPI_Isend(pmpi_inbuf, 2, pmpi_vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_pmpi[1])); } int flag; flag = 0; while (flag == 0) MPI_Test(&(requests_mpi[0]), &flag, &(statuses_mpi[0])); flag = 0; while (flag == 0) MPI_Test(&(requests_mpi[1]), &flag, &(statuses_mpi[1])); flag = 0; while (flag == 0) MPI_Test(&(requests_pmpi[0]), &flag, &(statuses_pmpi[0])); flag = 0; while (flag == 0) MPI_Test(&(requests_pmpi[1]), &flag, &(statuses_pmpi[1])); int res = compare_buffers(20*sizeof(int), &mpi_inbuf, &pmpi_inbuf, &mpi_outbuf, &pmpi_outbuf); free_buffers(&mpi_inbuf, &pmpi_inbuf, &mpi_outbuf, &pmpi_outbuf); test_result(res); MPI_Type_free(&vector_ddt); PMPI_Type_free(&pmpi_vector_ddt); MPI_Finalize(); }
int main(int, char **) { int N = 1; int W = 1001; int H = 1000; int F_In = 6; int F_Out = 4; int K_W = 5; int K_H = 3; // Note that Halide indices are reversed Halide::Buffer<int> parameters(7); Halide::Buffer<float> input_padded(F_In, H + K_H - 1, W + K_W - 1, N); Halide::Buffer<float> input_col(K_H, K_W, F_In, H, W, N); Halide::Buffer<float> kernel(F_Out, K_H, K_W, F_In); Halide::Buffer<float> output(F_Out, H, W, N); Halide::Buffer<float> output_test(F_Out, H, W, N); parameters(0) = N; parameters(1) = W; parameters(2) = H; parameters(3) = F_In; parameters(4) = F_Out; parameters(5) = K_W; parameters(6) = K_H; init_buffer(input_padded, (float)0); init_buffer(kernel, (float)1); // With decimal values test might fail due to floating point arithmetic. for (int n = 0; n < N; n++) { for (int x = 0; x < W; x++) { for (int y = 0; y < H; y++) { for (int c = 0; c < F_In; c++) { input_padded(c, y + (K_H - 1) / 2, x + (K_W - 1) / 2, n) = 1; } } } } for (int f_out = 0; f_out < F_Out; f_out++) { for (int k_x = 0; k_x < K_W; k_x++) { for (int k_y = 0; k_y < K_H; k_y++) { for (int f_in = 0; f_in < F_In; f_in++) { kernel(f_out, k_y, k_x, f_in) = 1; } } } } bool test = true; if (test) { init_buffer(output_test, (float)0); for (int n = 0; n < N; n++) { for (int x = 0; x < W; x++) { for (int y = 0; y < H; y++) { for (int k_x = 0; k_x < K_W; k_x++) { for (int k_y = 0; k_y < K_H; k_y++) { for (int f_in = 0; f_in < F_In; f_in++) { for (int f_out = 0; f_out < F_Out; f_out++) { output_test(f_out, y, x, n) += input_padded(f_in, y + k_y, x + k_x, n) * kernel(f_out, k_y, k_x, f_in); } } } } } } } } std::cout << "Buffers initialized" << std::endl; gemm_conv(parameters.raw_buffer(), input_padded.raw_buffer(), input_col.raw_buffer(), input_col.raw_buffer(), kernel.raw_buffer(), output.raw_buffer()); if (test) { compare_buffers("convs", output, output_test); } bool print = false; if (print) { for (int y = 0; y < H; y++) { for (int x = 0; x < W; x++) { std::printf("%3.1f ", input_col(0, 0, 0, y, x, 0)); } std::cout << std::endl; } for (int y = 0; y < H; y++) { for (int x = 0; x < W; x++) { std::printf("%3.1f ", output(0, y, x, 0)); } std::cout << std::endl; } } return 0; }
int main(int argc, char *argv[]) { int testN = 1; bool check_correctness = false; if (argc > 1) { testN = atoi(argv[1]); } if (argc > 2) { check_correctness = atoi(argv[2]); } std::cout << std::endl << "----------" << std::endl; std::cout << "Running sequential MM benchmark: testN: " << testN << ", check correctness: " << check_correctness << ", size: (" << S0 << ", " << S1 << ", " << S2 << ", " << S3 << ")" << std::endl; auto t1 = std::chrono::high_resolution_clock::now(); auto t2 = t1; float *A = (float*) malloc(S0 * S1 * sizeof(float)); float *B = (float*) malloc(S1 * S2 * sizeof(float)); float *C = (float*) malloc(S2 * S3 * sizeof(float)); // Initialize matrices with random values: for (int i = 0; i < S0 * S1; i++) A[i] = std::rand() % 10; for (int i = 0; i < S1 * S2; i++) B[i] = std::rand() % 10; for (int i = 0; i < S2 * S3; i++) C[i] = std::rand() % 10; std::cout << "Buffers initialized" << std::endl << std::flush; // Note that indices are flipped (see tutorial 2) Halide::Buffer<DATA_TYPE> A_buf(A, {S1, S0}); Halide::Buffer<DATA_TYPE> B_buf(B, {S2, S1}); Halide::Buffer<DATA_TYPE> C_buf(C, {S3, S2}); Halide::Buffer<DATA_TYPE> O_buf(S3, S0); // Make a dummy call to set up GPU (initalization takes time) matmul(A_buf.raw_buffer(), B_buf.raw_buffer(), C_buf.raw_buffer(), O_buf.raw_buffer()); // CPU Multiplication for correctness check if (check_correctness) { // Reference matrix multiplication std::cout << "Running CPU multiplication.." << std::endl; Halide::Buffer<DATA_TYPE> O_val_buf(S3, S0); Halide::Buffer<DATA_TYPE> T_val_buf(S2, S0); t1 = std::chrono::high_resolution_clock::now(); for (int i = 0; i < S0; i++) { for (int k = 0; k < S2; k++) { // Note that indices are flipped (see tutorial 2) T_val_buf(k, i) = 0; } } for (int i = 0; i < S0; i++) { for (int l = 0; l < S3; l++) { // Note that indices are flipped (see tutorial 2) O_val_buf(l, i) = 0; } } for (int j = 0; j < S1; j++) { for (int i = 0; i < S0; i++) { for (int k = 0; k < S2; k++) { // Note that indices are flipped (see tutorial 2) T_val_buf(k, i) += A_buf(j, i) * B_buf(k, j); } } } for (int k = 0; k < S2; k++) { for (int i = 0; i < S0; i++) { for (int l = 0; l < S3; l++) { // Note that indices are flipped (see tutorial 2) O_val_buf(l, i) += T_val_buf(k, i) * C_buf(l, k); } } } t2 = std::chrono::high_resolution_clock::now(); std::cout << "CPU matmul done: " << (std::chrono::duration<double,std::milli>(t2 - t1)).count() << "ms" << std::endl << std::flush; compare_buffers("matmul", O_buf, O_val_buf); } // GPU Multiplication t1 = std::chrono::high_resolution_clock::now(); for (int i = 0; i < testN; i++) { matmul(A_buf.raw_buffer(), B_buf.raw_buffer(), C_buf.raw_buffer(), O_buf.raw_buffer()); } t2 = std::chrono::high_resolution_clock::now(); std::cout << "GPU matmul done: " << (std::chrono::duration<double,std::milli>(t2 - t1)).count() / testN << "ms" << std::endl << std::flush; // CUBLAS SGEMM // Transposed copies for cublas float *A_T = (float*) malloc(S0 * S1 * sizeof(float)); float *B_T = (float*) malloc(S1 * S2 * sizeof(float)); float *C_T = (float*) malloc(S2 * S3 * sizeof(float)); float *O_T = (float*) malloc(S0 * S3 * sizeof(float)); // Transpose for (int i = 0; i < S0; i++) for (int j = 0; j < S1; j++) A_T[i + j * S0] = A[i * S1 + j]; for (int i = 0; i < S1; i++) for (int j = 0; j < S2; j++) B_T[i + j * S1] = B[i * S2 + j]; for (int i = 0; i < S2; i++) for (int j = 0; j < S3; j++) C_T[i + j * S2] = C[i * S3 + j]; // Excluding handle creation which is time consuming cublasHandle_t handle; cublasCreate(&handle); t1 = std::chrono::high_resolution_clock::now(); for (int i = 0; i < testN; i++) { float *d_A; float *d_B; float *d_C; float *d_T; float *d_O; cudaMalloc((void**)&d_A, S0 * S1 * sizeof(*A)); cudaMalloc((void**)&d_B, S1 * S2 * sizeof(*A)); cudaMalloc((void**)&d_C, S2 * S3 * sizeof(*A)); cudaMalloc((void**)&d_T, S0 * S2 * sizeof(*A)); cudaMalloc((void**)&d_O, S0 * S3 * sizeof(*A)); cublasSetMatrix(S0, S1, sizeof(*A), A_T, S0, d_A, S0); cublasSetMatrix(S1, S2, sizeof(*B), B_T, S1, d_B, S1); cublasSetMatrix(S2, S3, sizeof(*C), C_T, S2, d_C, S2); float alpha_var = 1; float beta_var = 0; cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, S0, S2, S1, &alpha_var, d_A, S0, d_B, S1, &beta_var, d_T, S0); cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, S0, S3, S2, &alpha_var, d_T, S0, d_C, S2, &beta_var, d_O, S0); cublasGetMatrix(S0, S3, sizeof(*C), d_O, S0, O_T, S0); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaFree(d_T); cudaFree(d_O); } t2 = std::chrono::high_resolution_clock::now(); std::cout << "cublas matmul done (excluding cublasHandle creation): " << (std::chrono::duration<double,std::milli>(t2 - t1) / testN).count() << "ms" << std::endl << std::flush; cublasDestroy(handle); bool check_cublas_difference = false; if (check_cublas_difference) { bool flag = true; for (int i = 0; i < S0 && flag; i++) { for (int j = 0; j < S3; j++) { if (O_buf(j, i) != O_T[i + j * S0]) { std::cout << "cublas validation mismatch:" << std::endl; std::cout << i << " " << j << " " << O_T[i + j * S0] << " " << O_buf(j, i) << std::endl; } } } if (flag) { std::cout << "cublas and validation match" << std::endl; } } free(A); free(B); free(C); free(A_T); free(B_T); free(C_T); free(O_T); std::cout << "----------" << std::endl << std::endl; return 0; }