int main(void)
{
    int Mdim, Ndim, Pdim;   // A[N][P], B[P][M], C[N][M]
    int szA, szB, szC;      // number of elements in each matrix
    double start_time;      // Starting time
    double run_time;        // timing data

    Ndim = ORDER;
    Pdim = ORDER;
    Mdim = ORDER;

    szA = Ndim * Pdim;
    szB = Pdim * Mdim;
    szC = Ndim * Mdim;

    std::vector<float> h_A(szA); // Host memory for Matrix A
    std::vector<float> h_B(szB); // Host memory for Matrix B
    std::vector<float> h_C(szC); // Host memory for Matrix C

    cl::Buffer d_a, d_b, d_c;    // Matrices in device memory

    initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

    printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER);
    for (int i = 0; i < COUNT; i++) {
        zero_mat(Ndim, Mdim, h_C);
        start_time = wtime();

        seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        run_time = wtime() - start_time;
        results(Mdim, Ndim, Pdim, h_C, run_time);
    }

    try {
        //--------------------------------------------------------------------------------
        // Create a context and queue for DEVICE
        //--------------------------------------------------------------------------------
        cl::Context context(DEVICE);
        cl::CommandQueue queue(context);

        //--------------------------------------------------------------------------------
        // Setup the buffers, initialize matrices, and write them into global memory
        //--------------------------------------------------------------------------------

        // Reset A, B and C matrices (just to play it safe)
        initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        d_a = cl::Buffer(context, begin(h_A), end(h_A), true);
        d_b = cl::Buffer(context, begin(h_B), end(h_B), true);
        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC);

        //--------------------------------------------------------------------------------
        // OpenCL matrix multiplication ... Naive
        //--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        cl::Program program(context, util::loadProgram("../C_elem.cl"), true);

        // Create the compute kernel from the program
        auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++) {
            zero_mat(Ndim, Mdim, h_C);
            start_time = wtime();

            // Execute the kernel over the entire range of C matrix elements ... computing
            // a dot product for each element of the product matrix. The local work
            // group size is set to NULL ... so I'm telling the OpenCL runtime to
            // figure out a local work group size for me.
            cl::NDRange global(Ndim, Mdim);
            naive_mmul(cl::EnqueueArgs(queue, global),
                       Mdim, Ndim, Pdim, d_a, d_b, d_c);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));
            results(Mdim, Ndim, Pdim, h_C, run_time);
        } // end for loop

        //--------------------------------------------------------------------------------
        // OpenCL matrix multiplication ... C row per work item
        //--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        program = cl::Program(context, util::loadProgram("../C_row.cl"), true);

        // Create the compute kernel from the program
        auto crow_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++) {
            zero_mat(Ndim, Mdim, h_C);
            start_time = wtime();

            cl::NDRange global(Ndim);
            crow_mmul(cl::EnqueueArgs(queue, global),
                      Mdim, Ndim, Pdim, d_a, d_b, d_c);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));
            results(Mdim, Ndim, Pdim, h_C, run_time);
        } // end for loop

        //--------------------------------------------------------------------------------
        // OpenCL matrix multiplication ... C row per work item, A row in private memory
        //--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        program = cl::Program(context, util::loadProgram("../C_row_priv.cl"), true);

        // Create the compute kernel from the program
        auto arowpriv_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++) {
            zero_mat(Ndim, Mdim, h_C);
            start_time = wtime();

            cl::NDRange global(Ndim);
            cl::NDRange local(ORDER / 16);
            arowpriv_mmul(cl::EnqueueArgs(queue, global, local),
                          Mdim, Ndim, Pdim, d_a, d_b, d_c);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));
            results(Mdim, Ndim, Pdim, h_C, run_time);
        } // end for loop

        //--------------------------------------------------------------------------------
        // OpenCL matrix multiplication ... C row per work item, A row private, B col local
        //--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        program = cl::Program(context, util::loadProgram("../C_row_priv_bloc.cl"), true);

        // Create the compute kernel from the program
        auto browloc_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg>(program, "mmul");

        printf("\n===== OpenCL, mat mult, C row, priv A, B cols loc, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++) {
            zero_mat(Ndim, Mdim, h_C);
            start_time = wtime();

            cl::NDRange global(Ndim);
            cl::NDRange local(ORDER / 16);
            cl::LocalSpaceArg localmem = cl::Local(sizeof(float) * Pdim);
            browloc_mmul(cl::EnqueueArgs(queue, global, local),
                         Mdim, Ndim, Pdim, d_a, d_b, d_c, localmem);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));
            results(Mdim, Ndim, Pdim, h_C, run_time);
        } // end for loop

        //--------------------------------------------------------------------------------
        // OpenCL matrix multiplication ... A and B in block form in local memory
        //--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        program = cl::Program(context, util::loadProgram("../C_block_form.cl"), true);

        // Create the compute kernel from the program
        auto block_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl::LocalSpaceArg>(program, "mmul");

        printf("\n===== OpenCL, A and B in block form in local memory, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++) {
            zero_mat(Ndim, Mdim, h_C);
            start_time = wtime();

            int blocksize = 16;
            cl::NDRange global(Ndim, Mdim);
            cl::NDRange local(blocksize, blocksize);
            cl::LocalSpaceArg localmem1 = cl::Local(sizeof(float) * blocksize * blocksize);
            cl::LocalSpaceArg localmem2 = cl::Local(sizeof(float) * blocksize * blocksize);
            block_mmul(cl::EnqueueArgs(queue, global, local),
                       Mdim, Ndim, Pdim, d_a, d_b, d_c, localmem1, localmem2);

            queue.finish();

            run_time = wtime() - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));
            results(Mdim, Ndim, Pdim, h_C, run_time);
        } // end for loop

    } catch (cl::Error err) {
        std::cout << "Exception\n";
        std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl;
    }

    return EXIT_SUCCESS;
}
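// NOTE: the kernel files loaded above (../C_elem.cl and friends) are not part of
// this listing. The sketch below shows one plausible form of the naive "mmul"
// kernel, written to match the host-side make_kernel signature
// <int, int, int, Buffer, Buffer, Buffer> and the 2-D global range (Ndim, Mdim).
// The row-major layout and the mapping of global id 0 to rows of C are
// assumptions, not taken from the original kernel source.
__kernel void mmul(
    const int Mdim,               // columns of C (and of B)
    const int Ndim,               // rows of C (and of A)
    const int Pdim,               // inner dimension: columns of A, rows of B
    __global const float* A,
    __global const float* B,
    __global       float* C)
{
    int i = get_global_id(0);     // row of C
    int j = get_global_id(1);     // column of C

    if (i < Ndim && j < Mdim) {
        float acc = 0.0f;
        for (int k = 0; k < Pdim; k++)
            acc += A[i*Pdim + k] * B[k*Mdim + j];
        C[i*Mdim + j] = acc;      // one work-item computes one element of C
    }
}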
int main(void)
{
    int N;      // A[N][N], B[N][N], C[N][N]
    int sz;     // number of elements in each matrix
    float tmp;

    N = ORDER;
    sz = N * N;

    std::vector<float> h_A(sz); // Matrix A on the host
    std::vector<float> h_B(sz); // Matrix B on the host
    std::vector<float> h_C(sz); // Matrix C on the host

    cl::Buffer d_A;             // matrix A on the device
    cl::Buffer d_B;             // matrix B on the device
    cl::Buffer d_C;             // matrix C on the device

    initmat(N, N, N, h_A, h_B, h_C);

    printf("\n===== Sequential, matrix mult (dot prod), order %d on CPU ======\n",ORDER);
    zero_mat(N, N, h_C);

    util::Timer timer;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            tmp = 0.0f;
            for (int k = 0; k < N; k++) {
                tmp += h_A[i*N+k] * h_B[k*N+j];
            }
            h_C[i*N+j] = tmp;
        }
    }
    double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;
    results(N, N, N, h_C, rtime);

    printf("\n===== Parallel matrix mult (dot prod), order %d on device ======\n",ORDER);
    switch (DEVICE) {
        case CL_DEVICE_TYPE_DEFAULT: printf("DEVICE=DEFAULT\n"); break;
        case CL_DEVICE_TYPE_CPU:     printf("DEVICE=CPU\n");     break;
        case CL_DEVICE_TYPE_GPU:     printf("DEVICE=GPU\n");     break;
        default:                     printf("DEVICE=%d\n", DEVICE); break;
    }

    zero_mat(N, N, h_C);

    try {
        cl::Context context(DEVICE);

        // Load in kernel source, creating a program object for the context.
        // Build program explicitly so I can catch errors and display
        // compiler error messages (should any be generated)
        cl::Program program(context, util::loadProgram("matmul_kernel.cl"));
        try {
            program.build();
        }
        catch (cl::Error error) {
            // If it was a build error then show the error
            if (error.err() == CL_BUILD_PROGRAM_FAILURE) {
                std::vector<cl::Device> devices;
                devices = context.getInfo<CL_CONTEXT_DEVICES>();
                std::string built = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
                std::cerr << built << "\n";
            }
            throw error;
        }

        // Get the command queue
        cl::CommandQueue queue(context);

        // Create the kernel functor
        auto mmul = cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer,
                                    cl::LocalSpaceArg, cl::LocalSpaceArg>(program, "mmul");

        util::Timer timer;

        d_A = cl::Buffer(context, begin(h_A), end(h_A), true);
        d_B = cl::Buffer(context, begin(h_B), end(h_B), true);
        d_C = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * sz);

        // Work-group computes a block of C. This size is also set
        // in a #define inside the kernel function. Note this blocksize
        // must evenly divide the matrix order
        int blocksize = 16;

        cl::LocalSpaceArg A_block = cl::Local(sizeof(float) * blocksize*blocksize);
        cl::LocalSpaceArg B_block = cl::Local(sizeof(float) * blocksize*blocksize);

        mmul(cl::EnqueueArgs(queue,
                             cl::NDRange(N,N),
                             cl::NDRange(blocksize,blocksize)),
             N, d_A, d_B, d_C, A_block, B_block);

        cl::copy(queue, d_C, begin(h_C), end(h_C));

        double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;
        results(N, N, N, h_C, rtime);

    } catch (cl::Error err) {
        std::cout << "Exception\n";
        std::cerr << "ERROR: " << err.what() << std::endl;
    }
}
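// NOTE: matmul_kernel.cl is not reproduced in this listing. Below is a minimal
// sketch of a blocked kernel consistent with the host code above: square matrices
// of order N, a blocksize x blocksize work-group, and two local-memory tiles
// passed in as cl::LocalSpaceArg. The original apparently fixes the block size
// with a #define; this sketch queries get_local_size() instead, and the choice of
// global id 0 as the row index is an assumption. The only hard requirement visible
// from the host side is that the block size evenly divides N.
__kernel void mmul(
    const int N,
    __global const float* A,
    __global const float* B,
    __global       float* C,
    __local        float* Awrk,   // blocksize x blocksize tile of A
    __local        float* Bwrk)   // blocksize x blocksize tile of B
{
    int i    = get_global_id(0);  // row of C
    int j    = get_global_id(1);  // column of C
    int iloc = get_local_id(0);
    int jloc = get_local_id(1);
    int blk  = get_local_size(0); // assumed square work-group

    float acc = 0.0f;
    for (int Kblk = 0; Kblk < N; Kblk += blk) {
        // Stage one tile of A and one tile of B into local memory
        Awrk[iloc*blk + jloc] = A[i*N + (Kblk + jloc)];
        Bwrk[iloc*blk + jloc] = B[(Kblk + iloc)*N + j];
        barrier(CLK_LOCAL_MEM_FENCE);

        // Accumulate the partial dot product over this tile
        for (int k = 0; k < blk; k++)
            acc += Awrk[iloc*blk + k] * Bwrk[k*blk + jloc];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    C[i*N + j] = acc;
}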
/*
    ////////////////////////////////////////////////////////////////////////////
    -- Testing sgeadd
*/
int main( int argc, char** argv)
{
    #define h_A(i_, j_) (h_A + (i_) + (j_)*lda)
    #define h_B(i_, j_) (h_B + (i_) + (j_)*lda)  // B uses lda

    TESTING_INIT();

    real_Double_t   gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;
    float           Bnorm, error, work[1];
    float *h_A, *h_B, *d_A, *d_B;
    float alpha = MAGMA_S_MAKE( 3.1415, 2.71828 );
    float beta  = MAGMA_S_MAKE( 6.0221, 6.67408 );
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t M, N, size, lda, ldda;
    magma_int_t ione = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t status = 0;

    magma_opts opts;
    opts.parse_opts( argc, argv );

    float tol = opts.tolerance * lapackf77_slamch("E");

    /* Uncomment these lines to check parameters.
     * magma_xerbla calls lapack's xerbla to print out error. */
    //magmablas_sgeadd( -1, N, alpha, d_A, ldda, d_B, ldda, opts.queue );
    //magmablas_sgeadd( M, -1, alpha, d_A, ldda, d_B, ldda, opts.queue );
    //magmablas_sgeadd( M, N, alpha, d_A, M-1, d_B, ldda, opts.queue );
    //magmablas_sgeadd( M, N, alpha, d_A, ldda, d_B, N-1, opts.queue );

    printf("%%   M     N   CPU Gflop/s (ms)    GPU Gflop/s (ms)    |Bl-Bm|/|Bl|\n");
    printf("%%========================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            lda    = M;
            ldda   = magma_roundup( M, opts.align );  // multiple of 32 by default
            size   = lda*N;
            gflops = 2.*M*N / 1e9;

            TESTING_MALLOC_CPU( h_A, float, lda *N );
            TESTING_MALLOC_CPU( h_B, float, lda *N );
            TESTING_MALLOC_DEV( d_A, float, ldda*N );
            TESTING_MALLOC_DEV( d_B, float, ldda*N );

            lapackf77_slarnv( &ione, ISEED, &size, h_A );
            lapackf77_slarnv( &ione, ISEED, &size, h_B );

            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            magma_ssetmatrix( M, N, h_A, lda, d_A, ldda, opts.queue );
            magma_ssetmatrix( M, N, h_B, lda, d_B, ldda, opts.queue );

            gpu_time = magma_sync_wtime( opts.queue );
            if ( opts.version == 1 ) {
                magmablas_sgeadd( M, N, alpha, d_A, ldda, d_B, ldda, opts.queue );
            }
            else {
                magmablas_sgeadd2( M, N, alpha, d_A, ldda, beta, d_B, ldda, opts.queue );
            }
            gpu_time = magma_sync_wtime( opts.queue ) - gpu_time;
            gpu_perf = gflops / gpu_time;

            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            cpu_time = magma_wtime();
            if ( opts.version == 1 ) {
                for( int j = 0; j < N; ++j ) {
                    blasf77_saxpy( &M, &alpha, &h_A[j*lda], &ione, &h_B[j*lda], &ione );
                }
            }
            else {
                for( int j = 0; j < N; ++j ) {
                    // daxpby
                    for( int i=0; i < M; ++i ) {
                        *h_B(i,j) = alpha * (*h_A(i,j)) + beta * (*h_B(i,j));
                    }
                }
            }
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;

            /* =====================================================================
               Check result
               =================================================================== */
            magma_sgetmatrix( M, N, d_B, ldda, h_A, lda, opts.queue );

            blasf77_saxpy( &size, &c_neg_one, h_B, &ione, h_A, &ione );
            Bnorm = lapackf77_slange( "F", &M, &N, h_B, &lda, work );
            error = lapackf77_slange( "F", &M, &N, h_A, &lda, work ) / Bnorm;

            printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e   %s\n",
                   (int) M, (int) N,
                   cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000.,
                   error, (error < tol ? "ok" : "failed"));
            status += ! (error < tol);

            TESTING_FREE_CPU( h_A );
            TESTING_FREE_CPU( h_B );
            TESTING_FREE_DEV( d_A );
            TESTING_FREE_DEV( d_B );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    opts.cleanup();
    TESTING_FINALIZE();
    return status;
}
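// NOTE: not part of the original tester. A plain-C restatement of the two
// operations exercised above, mirroring the LAPACK-path reference loops
// (column-major storage with leading dimensions lda/ldb):
//   version 1 (magmablas_sgeadd):   B(i,j) = alpha*A(i,j) +      B(i,j)   (call with beta = 1.0f)
//   version 2 (magmablas_sgeadd2):  B(i,j) = alpha*A(i,j) + beta*B(i,j)
// The error reported by the tester is the relative Frobenius norm
// ||B_lapack - B_magma||_F / ||B_lapack||_F, computed above with saxpy/slange.
static void sgeadd_reference(
    int M, int N,
    float alpha, const float *A, int lda,
    float beta,        float *B, int ldb )
{
    for (int j = 0; j < N; ++j)
        for (int i = 0; i < M; ++i)
            B[i + j*ldb] = alpha * A[i + j*lda] + beta * B[i + j*ldb];
}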
int main(int argc, char *argv[])
{
    int N;              // A[N][N], B[N][N], C[N][N]
    int size;           // Number of elements in each matrix
    double start_time;  // Starting time
    double run_time;    // Timing
    util::Timer timer;  // Timing

    N = ORDER;
    size = N * N;

    std::vector<float> h_A(size); // Host memory for Matrix A
    std::vector<float> h_B(size); // Host memory for Matrix B
    std::vector<float> h_C(size); // Host memory for Matrix C

    cl::Buffer d_a, d_b, d_c;     // Matrices in device memory

    //--------------------------------------------------------------------------------
    // Create a context and queue
    //--------------------------------------------------------------------------------
    try {

        cl_uint deviceIndex = 0;
        parseArguments(argc, argv, &deviceIndex);

        // Get list of devices
        std::vector<cl::Device> devices;
        unsigned numDevices = getDeviceList(devices);

        // Check device index in range
        if (deviceIndex >= numDevices) {
            std::cout << "Invalid device index (try '--list')\n";
            return EXIT_FAILURE;
        }

        cl::Device device = devices[deviceIndex];

        std::string name;
        getDeviceName(device, name);
        std::cout << "\nUsing OpenCL device: " << name << "\n";

        std::vector<cl::Device> chosen_device;
        chosen_device.push_back(device);
        cl::Context context(chosen_device);
        cl::CommandQueue queue(context, device);

        //--------------------------------------------------------------------------------
        // Run sequential matmul
        //--------------------------------------------------------------------------------

        initmat(N, h_A, h_B, h_C);

        timer.reset();

        printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",N);
        for (int i = 0; i < COUNT; i++) {
            zero_mat(N, h_C);
            start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

            seq_mat_mul_sdot(N, h_A, h_B, h_C);

            run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;
            results(N, h_C, run_time);
        }

        //--------------------------------------------------------------------------------
        // Setup the buffers, initialize matrices, and write them into global memory
        //--------------------------------------------------------------------------------

        // Reset A, B and C matrices (just to play it safe)
        initmat(N, h_A, h_B, h_C);

        d_a = cl::Buffer(context, h_A.begin(), h_A.end(), true);
        d_b = cl::Buffer(context, h_B.begin(), h_B.end(), true);
        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size);

        //--------------------------------------------------------------------------------
        // OpenCL matrix multiplication ... Naive
        //--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        cl::Program program(context, kernelsource, true);

        // Create the compute kernel from the program
        cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer> naive_mmul(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++) {
            zero_mat(N, h_C);
            start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

            // Execute the kernel over the entire range of C matrix elements ... computing
            // a dot product for each element of the product matrix. The local work
            // group size is set to NULL ... so I'm telling the OpenCL runtime to
            // figure out a local work group size for me.
            cl::NDRange global(N, N);
            naive_mmul(cl::EnqueueArgs(queue, global),
                       N, d_a, d_b, d_c);

            queue.finish();

            run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;

            cl::copy(queue, d_c, h_C.begin(), h_C.end());
            results(N, h_C, run_time);
        } // end for loop

    } catch (cl::Error err) {
        std::cout << "Exception\n";
        std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl;
    }

    return EXIT_SUCCESS;
}
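// NOTE: the "kernelsource" string referenced above is not shown in this listing.
// A minimal sketch of an "mmul" kernel matching the host-side signature
// <int, Buffer, Buffer, Buffer> and the (N, N) global range might look like the
// following; the row-major layout and the row = global id 0 convention are
// assumptions rather than facts about the original kernel string.
__kernel void mmul(
    const int N,
    __global const float* A,
    __global const float* B,
    __global       float* C)
{
    int i = get_global_id(0);   // row of C
    int j = get_global_id(1);   // column of C

    if (i < N && j < N) {
        float acc = 0.0f;
        for (int k = 0; k < N; k++)
            acc += A[i*N + k] * B[k*N + j];
        C[i*N + j] = acc;       // one dot product per work-item
    }
}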
int main(void)
{
    int Mdim, Ndim, Pdim;   // A[N][P], B[P][M], C[N][M]
    int szA, szB, szC;      // Number of elements in each matrix
    double start_time;      // Starting time
    double run_time;        // Timing
    util::Timer timer;      // Timing

    Ndim = ORDER;
    Pdim = ORDER;
    Mdim = ORDER;

    szA = Ndim * Pdim;
    szB = Pdim * Mdim;
    szC = Ndim * Mdim;

    std::vector<float> h_A(szA); // Host memory for Matrix A
    std::vector<float> h_B(szB); // Host memory for Matrix B
    std::vector<float> h_C(szC); // Host memory for Matrix C

    cl::Buffer d_a, d_b, d_c;    // Matrices in device memory

    initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

    timer.reset();

    printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER);
    for (int i = 0; i < COUNT; i++) {
        zero_mat(Ndim, Mdim, h_C);
        start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

        seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;
        results(Mdim, Ndim, Pdim, h_C, run_time);
    }

    try {
        //--------------------------------------------------------------------------------
        // Create a context and queue for DEVICE
        //--------------------------------------------------------------------------------
        cl::Context context(DEVICE);
        cl::CommandQueue queue(context);

        //--------------------------------------------------------------------------------
        // Setup the buffers, initialize matrices, and write them into global memory
        //--------------------------------------------------------------------------------

        // Reset A, B and C matrices (just to play it safe)
        initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

        d_a = cl::Buffer(context, begin(h_A), end(h_A), true);
        d_b = cl::Buffer(context, begin(h_B), end(h_B), true);
        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC);

        //--------------------------------------------------------------------------------
        // OpenCL matrix multiplication ... Naive
        //--------------------------------------------------------------------------------

        // Create the compute program from the source buffer
        cl::Program program(context, kernelsource, true);

        // Create the compute kernel from the program
        auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul");

        printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim);

        // Do the multiplication COUNT times
        for (int i = 0; i < COUNT; i++) {
            zero_mat(Ndim, Mdim, h_C);
            start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;

            // Execute the kernel over the entire range of C matrix elements ... computing
            // a dot product for each element of the product matrix. The local work
            // group size is set to NULL ... so I'm telling the OpenCL runtime to
            // figure out a local work group size for me.
            cl::NDRange global(Ndim, Mdim);
            naive_mmul(cl::EnqueueArgs(queue, global),
                       Mdim, Ndim, Pdim, d_a, d_b, d_c);

            queue.finish();

            run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;

            cl::copy(queue, d_c, begin(h_C), end(h_C));
            results(Mdim, Ndim, Pdim, h_C, run_time);
        } // end for loop

    } catch (cl::Error err) {
        std::cout << "Exception\n";
        std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl;
    }

    return EXIT_SUCCESS;
}
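// NOTE: the helpers used throughout these listings (initmat, zero_mat, results,
// seq_mat_mul_sdot, wtime) are defined elsewhere and not reproduced here. As a
// point of reference, the sequential baseline seq_mat_mul_sdot could be written
// as the same triple loop used inline in the blocked-kernel listing above; the
// signature below matches how it is called here, but the body is an assumed
// sketch, not the original implementation.
void seq_mat_mul_sdot(int Mdim, int Ndim, int Pdim,
                      const std::vector<float>& A,
                      const std::vector<float>& B,
                      std::vector<float>& C)
{
    // C[N][M] = A[N][P] * B[P][M], row-major, one dot product per element
    for (int i = 0; i < Ndim; i++) {
        for (int j = 0; j < Mdim; j++) {
            float tmp = 0.0f;
            for (int k = 0; k < Pdim; k++)
                tmp += A[i*Pdim + k] * B[k*Mdim + j];
            C[i*Mdim + j] = tmp;
        }
    }
}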