Пример #1
0
int main(int argc, char** argv) {

    char* mpi_inbuf;
    char* mpi_outbuf;
    char* farc_inbuf;
    char* farc_outbuf;

    MPI_Init(&argc, &argv);

    test_start("unpack(2, hindexed[{(1*MPI_INT, offset=4), (3*MPI_INT, offset=16), (2*MPI_INT, offset=32)}])");
    init_buffers(20*sizeof(int), &mpi_inbuf, &farc_inbuf, &mpi_outbuf, &farc_outbuf);

    MPI_Datatype mpitype; 
    int blocklen[3] = {1, 3, 2};
    MPI_Aint disp[3] = {4, 16, 32};

    MPI_Type_create_hindexed(3, blocklen, disp, MPI_INT, &mpitype);
    MPI_Type_commit(&mpitype);

    farc::DDT_Init();
    farc::Datatype* t1 = new farc::PrimitiveDatatype(farc::PrimitiveDatatype::INT);
    farc::Datatype* t2 = new farc::HIndexedDatatype(3, blocklen, disp, t1);
    farc::DDT_Commit(t2);
    farc::DDT_Unpack(farc_inbuf, farc_outbuf, t2, 2);

    int position = 0;
    MPI_Unpack(mpi_inbuf, 20*sizeof(int), &position, mpi_outbuf, 2, mpitype, MPI_COMM_WORLD);

    int res = compare_buffers(20*sizeof(int), &mpi_inbuf, &farc_inbuf, &mpi_outbuf, &farc_outbuf);
    free_buffers(&mpi_inbuf, &farc_inbuf, &mpi_outbuf, &farc_outbuf);
    test_result(res);

    MPI_Finalize();

    return 0;

}
Пример #2
0
static int test_circle(const struct testcase *t)
{
	GP_Context *c;
	int err;

	c = GP_ContextAlloc(t->w, t->h, GP_PIXEL_G8);

	if (c == NULL) {
		tst_err("Failed to allocate context");
		return TST_UNTESTED;
	}

	/* zero the pixels buffer */
	memset(c->pixels, 0, c->w * c->h);

	GP_FillCircle(c, t->x, t->y, t->r, 1);

	err = compare_buffers(t->pixmap, c);

	if (err)
		return TST_FAILED;

	return TST_SUCCESS;
}
Пример #3
0
int main(int, char **)
{
    Halide::Buffer<uint8_t> bufA(NN, NN);
    Halide::Buffer<uint8_t> output_buf(NN, NN);
    Halide::Buffer<uint8_t> reference_buf(NN, NN);
    init_buffer(output_buf, (uint8_t) 177);
    // Dummy data
    for (int i = 0; i < NN; i++) {
        for (int j = 0; j < NN; j++) {
            bufA(j, i) = i * 10 + j;
        }
    }
    for (int i = 0; i < NN; i++) {
        for (int j = 0; j < NN; j++) {
            // Same access pattern as in generator
            reference_buf((i + j * (NN - 1)) % NN, (i + j * 2) % NN) = bufA((j + 3) % NN, i);
        }
    }

    func(bufA.raw_buffer(), output_buf.raw_buffer());
    compare_buffers("test", output_buf, reference_buf);

    return 0;
}
Пример #4
0
void test_gemm(const std::string &name,
               int M, int N, int K,
               float alpha, float beta,
               int rowsA, int colsA,
               int rowsB, int colsB,
               int rowsC, int colsC,
               int offsetA, int offsetB, int offsetC,
               bool transposeA, bool transposeB)
{
    Halide::Buffer<int32_t> sizes(12);
    Halide::Buffer<float> params(2);
    Halide::Buffer<bool> transposes(2);
    Halide::Buffer<float> A(colsA, rowsA);
    Halide::Buffer<float> B(colsB, rowsB);
    Halide::Buffer<float> C(colsC, rowsC);
    Halide::Buffer<float> C_ref(colsC, rowsC);
    sizes(0) = M;
    sizes(1) = N;
    sizes(2) = K;
    sizes(3) = rowsA;
    sizes(4) = colsA;
    sizes(5) = rowsB;
    sizes(6) = colsB;
    sizes(7) = rowsC;
    sizes(8) = colsC;
    sizes(9) = offsetA;
    sizes(10) = offsetB;
    sizes(11) = offsetC;
    params(0) = alpha;
    params(1) = beta;
    transposes(0) = transposeA;
    transposes(1) = transposeB;

    for (int i = 0; i < rowsA; i++)
        for (int j = 0; j < colsA; j++)
            A(j, i) = std::rand() % 10 - 5;
    for (int i = 0; i < rowsB; i++)
        for (int j = 0; j < colsB; j++)
            B(j, i) = std::rand() % 10 - 5;
    for (int i = 0; i < rowsC; i++)
        for (int j = 0; j < colsC; j++)
            C(j, i) = C_ref(j, i) = std::rand() % 10 - 5;
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < N; j++) {
            C_ref(j + offsetC % colsC, i + offsetC / colsC) *= beta;
            for (int k = 0; k < K; k++) {
                float a = transposeA ?
                              A(i + offsetA % colsA, k + offsetA / colsA) :
                              A(k + offsetA % colsA, i + offsetA / colsA);
                float b = transposeB ?
                              B(k + offsetB % colsB, j + offsetB / colsB) :
                              B(j + offsetB % colsB, k + offsetB / colsB);
                C_ref(j + offsetC % colsC, i + offsetC / colsC) += alpha * a * b;
            }
        }
    }

    test_164(sizes.raw_buffer(), params.raw_buffer(), transposes.raw_buffer(),
             A.raw_buffer(), B.raw_buffer(), C.raw_buffer());
    compare_buffers(name, C, C_ref);
}
Пример #5
0
int main(int argc, char** argv)
{
    std::vector<std::chrono::duration<double, std::milli>> duration_vector_1, duration_vector_2;

    bool run_ref = false, run_tiramisu = false;

    const char* env_ref = std::getenv("RUN_REF");
    if (env_ref != NULL && env_ref[0] == '1')
        run_ref = true;

    const char* env_tiramisu = std::getenv("RUN_TIRAMISU");
    if (env_tiramisu != NULL && env_tiramisu[0] == '1')
        run_tiramisu = true;

    // ---------------------------------------------------------------------
    // ---------------------------------------------------------------------
    // ---------------------------------------------------------------------
  
    double alpha = 2.5;
  
    Halide::Buffer<int> SIZES(1);
    SIZES(0) = nrow;
  
    Halide::Buffer<double> b_alpha(1);
    b_alpha(0) = alpha;
  
    Halide::Buffer<double> b_X(nrow), b_X_ref(nrow);

    // ---------------------------------------------------------------------
    // ---------------------------------------------------------------------
    // ---------------------------------------------------------------------

    {
        for (int i = 0; i < NB_TESTS; ++i)
        {
            init_buffer(b_X_ref, (double)1);
            auto start = std::chrono::high_resolution_clock::now();
      
            if (run_ref)
	    	scal_ref(nrow, alpha, b_X_ref.data());
      
            auto end = std::chrono::high_resolution_clock::now();
            duration_vector_1.push_back(end - start);
        }
    }

    {
        for (int i = 0; i < NB_TESTS; ++i)
        {
            init_buffer(b_X, (double)1);
            auto start = std::chrono::high_resolution_clock::now();

            if (run_tiramisu)
	    	scal(SIZES.raw_buffer(), b_alpha.raw_buffer(), b_X.raw_buffer());

            auto end = std::chrono::high_resolution_clock::now();
            duration_vector_2.push_back(end - start);
        }
    }

    print_time("performance_cpu.csv", "scal",
	       {"Ref", "Tiramisu"},
	       {median(duration_vector_1), median(duration_vector_2)});

    if (CHECK_CORRECTNESS && run_ref && run_tiramisu)
        compare_buffers("scal", b_X_ref, b_X);

    if (PRINT_OUTPUT)
    {
        std::cout << "Tiramisu " << std::endl;
        print_buffer(b_X);

        std::cout << "Reference " << std::endl;
        print_buffer(b_X_ref);
    }
  
    return 0;
}
Пример #6
0
int main(int argc, char **argv)
{
    int64_t block_size, buf_size, state_offset;
    int64_t fsize1, fsize2, max_size, cpos, len, bad_bytes, bad_groups;
    int start_option, i, state;
    char *fname1, *fname2;
    char *buf1, *buf2;
    FILE *fd1, *fd2;

    block_size = 64 *1024;
    buf_size = 20 *1024*1024;

    if (argc < 3) {
        printf("\n");
        printf("ldiff [-b block_size] [-s buffer_size] file1 file2\n");
        printf("     -b block_size   Block size for matching to cache page boundaries.  Defaults to " I64T ".  Can use units.\n", block_size);
        printf("     -s buffer_size  Buffer size used in the comparison. This is divided in 1/2 for each buffer. Defaults to " I64T ". Cant use units.\n", buf_size);
        printf("\n");
        return(1);
    }

    //*** Parse the args
    i=1;
    if (argc > 1) {
        do {
            start_option = i;
            if (strcmp(argv[i], "-b") == 0) { //** Block size
                i++;
                block_size = string_get_integer(argv[i]);
                i++;
            } else if (strcmp(argv[i], "-s") == 0) { //** Buffer size
                i++;
                buf_size = string_get_integer(argv[i]);
                i++;
            }
        } while ((start_option < i) && (i<argc));
    }


    if ((argc-i) < 2) {
        printf("Missing filenames!\n");
        return(1);
    }

    //** Open the files and get their sizes
    fname1 = argv[i];
    fname2 = argv[i+1];
    fd1 = fopen(fname1, "r"); assert(fd1 != NULL);
    fd2 = fopen(fname2, "r"); assert(fd2 != NULL);

    fseek(fd1, 0, SEEK_END);
    fseek(fd2, 0, SEEK_END);
    fsize1 = ftell(fd1);
    fsize2 = ftell(fd2);
    fseek(fd1, 0, SEEK_SET);
    fseek(fd2, 0, SEEK_SET);

    //** Print a summary of options
    printf("File 1: %s  (" I64T " bytes)\n", fname1, fsize1);
    printf("File 2: %s  (" I64T " bytes)\n", fname2, fsize2);
    if (fsize1 != fsize2) printf("WARNING:  File sizes differ!!!!!\n");
    printf("Block size: " I64T "\n", block_size);
    printf("Buffer size: " I64T "\n", buf_size);

    buf_size /= 2;  //** It's split between the 2 buffers

    type_malloc(buf1, char, buf_size);
    type_malloc(buf2, char, buf_size);

    max_size = (fsize1 > fsize2) ? fsize2 : fsize1;

    printf("\n");
    printf("Printing comparision breakdown -- Single byte matches are suppressed (max_size=" I64T ")\n", max_size);

    state = 0;
    state_offset = 0;
    bad_bytes = bad_groups = 0;
    for (cpos=0; cpos < max_size; cpos += buf_size) {
        len = ((cpos+buf_size) < max_size) ? buf_size : max_size - cpos;
        assert_result(fread(buf1, 1, len, fd1), len);
        assert_result(fread(buf2, 1, len, fd2), len);

        compare_buffers(buf1, buf2, len, cpos, &bad_bytes, &bad_groups, block_size, &state, &state_offset, max_size);
    }

    printf("\n");
    printf("Bad bytes: " I64T " Bad groups: " I64T "\n", bad_bytes, bad_groups);

    free(buf1);
    free(buf2);

    fclose(fd1);
    fclose(fd2);

    i = ((bad_bytes > 0) || (fsize1 != fsize2)) ? 1 : 0;
    return(i);
}
Пример #7
0
int main(int, char**)
{
    std::vector<std::chrono::duration<double,std::milli>> duration_vector_1;
    std::vector<std::chrono::duration<double,std::milli>> duration_vector_2;
    Halide::Buffer<uint8_t> input = Halide::Tools::load_image("./utils/images/rgb.png");
    Halide::Buffer<int32_t> size(2);
    size(0) = input.extent(0);
    size(1) = input.extent(1);

    Halide::Buffer<uint8_t> output_ref_y(input.width(), input.height());
    Halide::Buffer<uint8_t> output_ref_u(input.width()/2, input.height()/2);
    Halide::Buffer<uint8_t> output_ref_v(input.width()/2, input.height()/2);

    Halide::Buffer<uint8_t> output_tiramisu_y(input.width(), input.height());
    Halide::Buffer<uint8_t> output_tiramisu_u(input.width()/2, input.height()/2);
    Halide::Buffer<uint8_t> output_tiramisu_v(input.width()/2, input.height()/2);

    std::cout << "STARTING TEST\n";
    std::cout << "y size (width, height): " << output_tiramisu_y.width() << ", " << output_tiramisu_y.height() << "\n";
    std::cout << "u size (width, height): " << output_tiramisu_u.width() << ", " << output_tiramisu_u.height() << "\n";
    std::cout << "v size (width, height): " << output_tiramisu_v.width() << ", " << output_tiramisu_v.height() << "\n";
    // Warm up
    rgbyuv420gpu_tiramisu(size.raw_buffer(), input.raw_buffer(), output_tiramisu_y.raw_buffer(), output_tiramisu_u.raw_buffer(), output_tiramisu_v.raw_buffer());
    run_halide(input, output_ref_y, output_ref_u, output_ref_v);

    // Tiramisu
    for (int i=0; i<NB_TESTS; i++)
    {
        auto start1 = std::chrono::high_resolution_clock::now();
        rgbyuv420gpu_tiramisu(size.raw_buffer(), input.raw_buffer(), output_tiramisu_y.raw_buffer(), output_tiramisu_u.raw_buffer(), output_tiramisu_v.raw_buffer());
        auto end1 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double,std::milli> duration1 = end1 - start1;
        duration_vector_1.push_back(duration1);
    }

    // Reference
    for (int i=0; i<NB_TESTS; i++)
    {
        duration_vector_2.push_back(run_halide(input, output_ref_y, output_ref_u, output_ref_v));
    }

    print_time("performance_CPU.csv", "rgbyuv420gpu",
               {"Tiramisu", "Halide"},
               {median(duration_vector_1), median(duration_vector_2)});

    Halide::Tools::save_image(output_tiramisu_y, "./build/rgbyuv420gpu_y_tiramisu.png");
    Halide::Tools::save_image(output_tiramisu_u, "./build/rgbyuv420gpu_u_tiramisu.png");
    Halide::Tools::save_image(output_tiramisu_v, "./build/rgbyuv420gpu_v_tiramisu.png");
    Halide::Tools::save_image(output_ref_y, "./build/rgbyuv420gpu_y_ref.png");
    Halide::Tools::save_image(output_ref_u, "./build/rgbyuv420gpu_u_ref.png");
    Halide::Tools::save_image(output_ref_v, "./build/rgbyuv420gpu_v_ref.png");

    if (CHECK_CORRECTNESS)
    {
        std::cout << "Compare y buffer\n";
    	compare_buffers("benchmark_rgbyuv420gpu", output_tiramisu_y, output_ref_y);
        std::cout << "Compare u buffer\n";
    	compare_buffers("benchmark_rgbyuv420gpu", output_tiramisu_u, output_ref_u);
        std::cout << "Compare v buffer\n";
    	compare_buffers("benchmark_rgbyuv420gpu", output_tiramisu_y, output_ref_y);
    }

    return 0;
}
Пример #8
0
int main(int argc, char** argv) {

    MPI_Init(&argc, &argv);

    int rank, peer, commsize;

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &commsize);
    if (rank % 2) peer = rank - 1 % commsize;
    else peer = rank + 1 % commsize;

    if (commsize % 2 != 0) {
        fprintf(stderr, "Use even number of processes.\n");
        exit(EXIT_FAILURE);
    }

    char* mpi_inbuf;
    char* mpi_outbuf;
    char* pmpi_inbuf;
    char* pmpi_outbuf;

    test_start("isend/irecv + test (2, vector[[int], count=2, blklen=3, stride=5])");
    init_buffers(20*sizeof(int), &mpi_inbuf, &pmpi_inbuf, &mpi_outbuf, &pmpi_outbuf);

    MPI_Datatype vector_ddt;
    MPI_Type_vector(2, 3, 5, MPI_INT, &vector_ddt);
    MPI_Type_commit(&vector_ddt);

    MPI_Datatype pmpi_vector_ddt;
    PMPI_Type_vector(2, 3, 5, MPI_INT, &pmpi_vector_ddt);
    PMPI_Type_commit(&pmpi_vector_ddt);

    MPI_Request requests_mpi[2];
    MPI_Request requests_pmpi[2];
    MPI_Status statuses_mpi[2]; 
    MPI_Status statuses_pmpi[2];

    if (rank % 2 == 0) {
        MPI_Isend(mpi_inbuf, 2, vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_mpi[0]));
        MPI_Irecv(mpi_outbuf, 2, vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_mpi[1]));

        PMPI_Isend(pmpi_inbuf, 2, pmpi_vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_pmpi[0]));
        PMPI_Irecv(pmpi_outbuf, 2, pmpi_vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_pmpi[1]));       
    }
    else {
        MPI_Irecv(mpi_outbuf, 2, vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_mpi[0]));       
        MPI_Isend(mpi_inbuf, 2, vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_mpi[1]));

        PMPI_Irecv(pmpi_outbuf, 2, pmpi_vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_pmpi[0]));       
        PMPI_Isend(pmpi_inbuf, 2, pmpi_vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_pmpi[1]));
    }

    int flag;
    flag = 0;
    while (flag == 0) MPI_Test(&(requests_mpi[0]), &flag, &(statuses_mpi[0]));
    flag = 0;
    while (flag == 0) MPI_Test(&(requests_mpi[1]), &flag, &(statuses_mpi[1]));
    flag = 0;
    while (flag == 0) MPI_Test(&(requests_pmpi[0]), &flag, &(statuses_pmpi[0]));
    flag = 0;
    while (flag == 0) MPI_Test(&(requests_pmpi[1]), &flag, &(statuses_pmpi[1]));

    int res = compare_buffers(20*sizeof(int), &mpi_inbuf, &pmpi_inbuf, &mpi_outbuf, &pmpi_outbuf);
    free_buffers(&mpi_inbuf, &pmpi_inbuf, &mpi_outbuf, &pmpi_outbuf);
    test_result(res);

    MPI_Type_free(&vector_ddt);
    PMPI_Type_free(&pmpi_vector_ddt);

    MPI_Finalize();

}
Пример #9
0
int main(int, char **)
{
    int N = 1;
    int W = 1001;
    int H = 1000;
    int F_In = 6;
    int F_Out = 4;
    int K_W = 5;
    int K_H = 3;
    // Note that Halide indices are reversed
    Halide::Buffer<int> parameters(7);
    Halide::Buffer<float> input_padded(F_In, H + K_H - 1, W + K_W - 1, N);
    Halide::Buffer<float> input_col(K_H, K_W, F_In, H, W, N);
    Halide::Buffer<float> kernel(F_Out, K_H, K_W, F_In);
    Halide::Buffer<float> output(F_Out, H, W, N);
    Halide::Buffer<float> output_test(F_Out, H, W, N);

    parameters(0) = N;
    parameters(1) = W;
    parameters(2) = H;
    parameters(3) = F_In;
    parameters(4) = F_Out;
    parameters(5) = K_W;
    parameters(6) = K_H;

    init_buffer(input_padded, (float)0);
    init_buffer(kernel, (float)1);

    // With decimal values test might fail due to floating point arithmetic.
    for (int n = 0; n < N; n++) {
        for (int x = 0; x < W; x++) {
            for (int y = 0; y < H; y++) {
                for (int c = 0; c < F_In; c++) {
                    input_padded(c, y + (K_H - 1) / 2, x + (K_W - 1) / 2, n) = 1;
                }
            }
        }
    }

    for (int f_out = 0; f_out < F_Out; f_out++) {
        for (int k_x = 0; k_x < K_W; k_x++) {
            for (int k_y = 0; k_y < K_H; k_y++) {
                for (int f_in = 0; f_in < F_In; f_in++) {
                    kernel(f_out, k_y, k_x, f_in) = 1;
                }
            }
        }
    }

    bool test = true;
    if (test) {
        init_buffer(output_test, (float)0);
        for (int n = 0; n < N; n++) {
            for (int x = 0; x < W; x++) {
                for (int y = 0; y < H; y++) {
                    for (int k_x = 0; k_x < K_W; k_x++) {
                        for (int k_y = 0; k_y < K_H; k_y++) {
                            for (int f_in = 0; f_in < F_In; f_in++) {
                                for (int f_out = 0; f_out < F_Out; f_out++) {
                                    output_test(f_out, y, x, n) += input_padded(f_in, y + k_y, x + k_x, n) * kernel(f_out, k_y, k_x, f_in);
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    std::cout << "Buffers initialized" << std::endl;

    gemm_conv(parameters.raw_buffer(), input_padded.raw_buffer(), input_col.raw_buffer(), input_col.raw_buffer(), kernel.raw_buffer(), output.raw_buffer());

    if (test) {
        compare_buffers("convs", output, output_test);
    }

    bool print = false;
    if (print) {
        for (int y = 0; y < H; y++) {
            for (int x = 0; x < W; x++) {
                std::printf("%3.1f ", input_col(0, 0, 0, y, x, 0));
            }
            std::cout << std::endl;
        }

        for (int y = 0; y < H; y++) {
            for (int x = 0; x < W; x++) {
                std::printf("%3.1f ", output(0, y, x, 0));
            }
            std::cout << std::endl;
        }
    }

    return 0;
}
Пример #10
0
int main(int argc, char *argv[])
{
    int testN = 1;
    bool check_correctness = false;
    if (argc > 1) {
        testN = atoi(argv[1]);
    }
    if (argc > 2) {
        check_correctness = atoi(argv[2]);
    }

    std::cout << std::endl << "----------" << std::endl;
    std::cout << "Running sequential MM benchmark: testN: " << testN
              << ", check correctness: " << check_correctness
              << ", size: (" << S0 << ", " << S1 << ", " << S2 << ", " << S3 << ")" << std::endl;

    auto t1 = std::chrono::high_resolution_clock::now();
    auto t2 = t1;
    
    float *A = (float*) malloc(S0 * S1 * sizeof(float));
    float *B = (float*) malloc(S1 * S2 * sizeof(float));
    float *C = (float*) malloc(S2 * S3 * sizeof(float));

    // Initialize matrices with random values:
    for (int i = 0; i < S0 * S1; i++) A[i] = std::rand() % 10;
    for (int i = 0; i < S1 * S2; i++) B[i] = std::rand() % 10;
    for (int i = 0; i < S2 * S3; i++) C[i] = std::rand() % 10;

    std::cout << "Buffers initialized" << std::endl << std::flush;

    // Note that indices are flipped (see tutorial 2)
    Halide::Buffer<DATA_TYPE> A_buf(A, {S1, S0});
    Halide::Buffer<DATA_TYPE> B_buf(B, {S2, S1});
    Halide::Buffer<DATA_TYPE> C_buf(C, {S3, S2});
    Halide::Buffer<DATA_TYPE> O_buf(S3, S0);

    // Make a dummy call to set up GPU (initalization takes time)
    matmul(A_buf.raw_buffer(), B_buf.raw_buffer(), C_buf.raw_buffer(), O_buf.raw_buffer());

    // CPU Multiplication for correctness check

    if (check_correctness) {
        // Reference matrix multiplication

        std::cout << "Running CPU multiplication.." << std::endl;

        Halide::Buffer<DATA_TYPE> O_val_buf(S3, S0);
        Halide::Buffer<DATA_TYPE> T_val_buf(S2, S0);
        t1 = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < S0; i++) {
            for (int k = 0; k < S2; k++) {
                // Note that indices are flipped (see tutorial 2)
                T_val_buf(k, i) = 0;
            }
        }
        for (int i = 0; i < S0; i++) {
            for (int l = 0; l < S3; l++) {
                // Note that indices are flipped (see tutorial 2)
                O_val_buf(l, i) = 0;
            }
        }
        for (int j = 0; j < S1; j++) {
            for (int i = 0; i < S0; i++) {
                for (int k = 0; k < S2; k++) {
                    // Note that indices are flipped (see tutorial 2)
                    T_val_buf(k, i) += A_buf(j, i) * B_buf(k, j);
                }
            }
        }
        for (int k = 0; k < S2; k++) {
            for (int i = 0; i < S0; i++) {
                for (int l = 0; l < S3; l++) {
                    // Note that indices are flipped (see tutorial 2)
                    O_val_buf(l, i) += T_val_buf(k, i) * C_buf(l, k);
                }
            }
        }
        t2 = std::chrono::high_resolution_clock::now();

        std::cout << "CPU matmul done: " << (std::chrono::duration<double,std::milli>(t2 - t1)).count() << "ms" << std::endl << std::flush;

        compare_buffers("matmul", O_buf, O_val_buf);
    }

    // GPU Multiplication

    t1 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < testN; i++) {
        matmul(A_buf.raw_buffer(), B_buf.raw_buffer(), C_buf.raw_buffer(), O_buf.raw_buffer());
    }
    t2 = std::chrono::high_resolution_clock::now();

    std::cout << "GPU matmul done: " << (std::chrono::duration<double,std::milli>(t2 - t1)).count() / testN << "ms" << std::endl << std::flush;

    // CUBLAS SGEMM

    // Transposed copies for cublas
    float *A_T = (float*) malloc(S0 * S1 * sizeof(float));
    float *B_T = (float*) malloc(S1 * S2 * sizeof(float));
    float *C_T = (float*) malloc(S2 * S3 * sizeof(float));
    float *O_T = (float*) malloc(S0 * S3 * sizeof(float));
    // Transpose
    for (int i = 0; i < S0; i++) for (int j = 0; j < S1; j++) A_T[i + j * S0] = A[i * S1 + j];
    for (int i = 0; i < S1; i++) for (int j = 0; j < S2; j++) B_T[i + j * S1] = B[i * S2 + j];
    for (int i = 0; i < S2; i++) for (int j = 0; j < S3; j++) C_T[i + j * S2] = C[i * S3 + j];

    // Excluding handle creation which is time consuming
    cublasHandle_t handle;
    cublasCreate(&handle);

    t1 = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < testN; i++) {
        float *d_A;
        float *d_B;
        float *d_C;
        float *d_T;
        float *d_O;
        cudaMalloc((void**)&d_A, S0 * S1 * sizeof(*A));
        cudaMalloc((void**)&d_B, S1 * S2 * sizeof(*A));
        cudaMalloc((void**)&d_C, S2 * S3 * sizeof(*A));
        cudaMalloc((void**)&d_T, S0 * S2 * sizeof(*A));
        cudaMalloc((void**)&d_O, S0 * S3 * sizeof(*A));

        cublasSetMatrix(S0, S1, sizeof(*A), A_T, S0, d_A, S0);
        cublasSetMatrix(S1, S2, sizeof(*B), B_T, S1, d_B, S1);
        cublasSetMatrix(S2, S3, sizeof(*C), C_T, S2, d_C, S2);

        float alpha_var = 1;
        float beta_var = 0;

        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, S0, S2, S1, &alpha_var, d_A, S0, d_B, S1, &beta_var, d_T, S0);
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, S0, S3, S2, &alpha_var, d_T, S0, d_C, S2, &beta_var, d_O, S0);

        cublasGetMatrix(S0, S3, sizeof(*C), d_O, S0, O_T, S0);

        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
        cudaFree(d_T);
        cudaFree(d_O);
    }

    t2 = std::chrono::high_resolution_clock::now();

    std::cout << "cublas matmul done (excluding cublasHandle creation): "
              << (std::chrono::duration<double,std::milli>(t2 - t1) / testN).count() << "ms" << std::endl << std::flush;

    cublasDestroy(handle);

    bool check_cublas_difference = false;
    if (check_cublas_difference) {
        bool flag = true;
        for (int i = 0; i < S0 && flag; i++) {
            for (int j = 0; j < S3; j++) {
                if (O_buf(j, i) != O_T[i + j * S0]) {
                    std::cout << "cublas validation mismatch:" << std::endl;
                    std::cout << i << " " << j << " " << O_T[i + j * S0] << " " << O_buf(j, i) << std::endl;
                }
            }
        }
        if (flag) {
            std::cout << "cublas and validation match" << std::endl;
        }
    }

    free(A);
    free(B);
    free(C);
    free(A_T);
    free(B_T);
    free(C_T);
    free(O_T);

    std::cout << "----------" << std::endl << std::endl;

    return 0;
}