int main(void) { int Mdim, Ndim, Pdim; // A[N][P], B[P][M], C[N][M] int szA, szB, szC; // number of elements in each matrix double start_time; // Starting time double run_time; // timing data Ndim = ORDER; Pdim = ORDER; Mdim = ORDER; szA = Ndim * Pdim; szB = Pdim * Mdim; szC = Ndim * Mdim; std::vector<float> h_A(szA); // Host memory for Matrix A std::vector<float> h_B(szB); // Host memory for Matrix B std::vector<float> h_C(szC); // Host memory for Matrix C cl::Buffer d_a, d_b, d_c; // Matrices in device memory initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER); for(int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C); run_time = wtime() - start_time; results(Mdim, Ndim, Pdim, h_C, run_time); } try { //-------------------------------------------------------------------------------- // Create a context and queue for DEVICE //-------------------------------------------------------------------------------- cl::Context context(DEVICE); cl::CommandQueue queue(context); //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); d_a = cl::Buffer(context, begin(h_A), end(h_A), true); d_b = cl::Buffer(context, begin(h_B), end(h_B), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... Naive //-------------------------------------------------------------------------------- // Create the compute program from the source buffer cl::Program program(context, util::loadProgram("../C_elem.cl"), true); // Create the compute kernel from the program auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. cl::NDRange global(Ndim, Mdim); naive_mmul(cl::EnqueueArgs(queue, global), Mdim, Ndim, Pdim, d_a, d_b, d_c); queue.finish(); run_time = wtime() - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item //-------------------------------------------------------------------------------- // Create the compute program from the source buffer program = cl::Program(context, util::loadProgram("../C_row.cl"), true); // Create the compute kernel from the program auto crow_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul"); printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); cl::NDRange global(Ndim); crow_mmul(cl::EnqueueArgs(queue, global), Mdim, Ndim, Pdim, d_a, d_b, d_c); queue.finish(); run_time = wtime() - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item, A row in pivate memory //-------------------------------------------------------------------------------- // Create the compute program from the source buffer program = cl::Program(context, util::loadProgram("../C_row_priv.cl"), true); // Create the compute kernel from the program auto arowpriv_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul"); printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); cl::NDRange global(Ndim); cl::NDRange local(ORDER / 16); arowpriv_mmul(cl::EnqueueArgs(queue, global, local), Mdim, Ndim, Pdim, d_a, d_b, d_c); queue.finish(); run_time = wtime() - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item, A row pivate, B col local //-------------------------------------------------------------------------------- // Create the compute program from the source buffer program = cl::Program(context, util::loadProgram("../C_row_priv_bloc.cl"), true); // Create the compute kernel from the program auto browloc_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg>(program, "mmul"); printf("\n===== OpenCL, mat mult, C row, priv A, B cols loc, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); cl::NDRange global(Ndim); cl::NDRange local(ORDER / 16); cl::LocalSpaceArg localmem = cl::Local(sizeof(float) * Pdim); browloc_mmul(cl::EnqueueArgs(queue, global, local), Mdim, Ndim, Pdim, d_a, d_b, d_c, localmem); queue.finish(); run_time = wtime() - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... A and B in block form in local memory //-------------------------------------------------------------------------------- // Create the compute program from the source buffer program = cl::Program(context, util::loadProgram("../C_block_form.cl"), true); // Create the compute kernel from the program auto block_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl::LocalSpaceArg>(program, "mmul"); printf("\n===== OpenCL, A and B in block form in local memory, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); int blocksize = 16; cl::NDRange global(Ndim, Mdim); cl::NDRange local(blocksize, blocksize); cl::LocalSpaceArg localmem1 = cl::Local(sizeof(float) * blocksize * blocksize); cl::LocalSpaceArg localmem2 = cl::Local(sizeof(float) * blocksize * blocksize); block_mmul(cl::EnqueueArgs(queue, global, local), Mdim, Ndim, Pdim, d_a, d_b, d_c, localmem1, localmem2); queue.finish(); run_time = wtime() - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } return EXIT_SUCCESS; }
int main(int argc, char *argv[]) { int N; // A[N][N], B[N][N], C[N][N] int size; // Number of elements in each matrix double start_time; // Starting time double run_time; // Timing util::Timer timer; // Timing N = ORDER; size = N * N; std::vector<float> h_A(size); // Host memory for Matrix A std::vector<float> h_B(size); // Host memory for Matrix B std::vector<float> h_C(size); // Host memory for Matrix C cl::Buffer d_a, d_b, d_c; // Matrices in device memory //-------------------------------------------------------------------------------- // Create a context and queue //-------------------------------------------------------------------------------- try { cl_uint deviceIndex = 0; parseArguments(argc, argv, &deviceIndex); // Get list of devices std::vector<cl::Device> devices; unsigned numDevices = getDeviceList(devices); // Check device index in range if (deviceIndex >= numDevices) { std::cout << "Invalid device index (try '--list')\n"; return EXIT_FAILURE; } cl::Device device = devices[deviceIndex]; std::string name; getDeviceName(device, name); std::cout << "\nUsing OpenCL device: " << name << "\n"; std::vector<cl::Device> chosen_device; chosen_device.push_back(device); cl::Context context(chosen_device); cl::CommandQueue queue(context, device); //-------------------------------------------------------------------------------- // Run sequential matmul //-------------------------------------------------------------------------------- initmat(N, h_A, h_B, h_C); timer.reset(); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",N); for(int i = 0; i < COUNT; i++) { zero_mat(N, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; seq_mat_mul_sdot(N, h_A, h_B, h_C); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; results(N, h_C, run_time); } //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(N, h_A, h_B, h_C); d_a = cl::Buffer(context, h_A.begin(), h_A.end(), true); d_b = cl::Buffer(context, h_B.begin(), h_B.end(), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... Naive //-------------------------------------------------------------------------------- // Create the compute program from the source buffer cl::Program program(context, kernelsource, true); // Create the compute kernel from the program cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer> naive_mmul(program, "mmul"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. cl::NDRange global(N, N); naive_mmul(cl::EnqueueArgs(queue, global), N, d_a, d_b, d_c); queue.finish(); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; cl::copy(queue, d_c, h_C.begin(), h_C.end()); results(N, h_C, run_time); } // end for loop } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } return EXIT_SUCCESS; }
int main(void) { int N; // A[N][N], B[N][N], C[N][N] int sz; // number of elements in each matrix float tmp; N = ORDER; sz = N * N; std::vector<float> h_A(sz); // Matrix A on the host std::vector<float> h_B(sz); // Matrix B on the host std::vector<float> h_C(sz); // Matrix C on the host cl::Buffer d_A; // matrix A on the device cl::Buffer d_B; // matrix B on the device cl::Buffer d_C; // matrix C on the device initmat(N, N, N, h_A, h_B, h_C); printf("\n===== Sequential, matrix mult (dot prod), order %d on CPU ======\n",ORDER); zero_mat(N, N, h_C); util::Timer timer; for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { tmp = 0.0f; for (int k = 0; k < N; k++) { tmp += h_A[i*N+k] * h_B[k*N+j]; } h_C[i*N+j] = tmp; } } double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; results(N, N, N, h_C, rtime); printf("\n===== Parallel matrix mult (dot prod), order %d on device ======\n",ORDER); switch (DEVICE) { case CL_DEVICE_TYPE_DEFAULT: printf("DEVICE=DEFAULT\n"); break; case CL_DEVICE_TYPE_CPU: printf("DEVICE=CPU\n"); break; case CL_DEVICE_TYPE_GPU: printf("DEVICE=GPU\n"); break; default: printf("DEVICE=%d\n", DEVICE); break; } zero_mat(N, N, h_C); try { cl::Context context(DEVICE); // Load in kernel source, creating a program object for the context. // Build program explicitly so I can catch errors and display // compiler error messages (should any be generated) cl::Program program(context, util::loadProgram("matmul_kernel.cl")); try { program.build(); } catch (cl::Error error) { // If it was a build error then show the error if (error.err() == CL_BUILD_PROGRAM_FAILURE) { std::vector<cl::Device> devices; devices = context.getInfo<CL_CONTEXT_DEVICES>(); std::string built = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]); std::cerr << built << "\n"; } throw error; } // Get the command queue cl::CommandQueue queue(context); // Create the kernel functor auto mmul = cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl::LocalSpaceArg> (program, "mmul"); util::Timer timer; d_A = cl::Buffer(context, begin(h_A), end(h_A), true); d_B = cl::Buffer(context, begin(h_B), end(h_B), true); d_C = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * sz); // Work-group computes a block of C. This size is also set // in a #define inside the kernel function. Note this blocksize // must evenly divide the matrix order int blocksize = 16; cl::LocalSpaceArg A_block = cl::Local(sizeof(float) * blocksize*blocksize); cl::LocalSpaceArg B_block = cl::Local(sizeof(float) * blocksize*blocksize); mmul( cl::EnqueueArgs( queue, cl::NDRange(N,N), cl::NDRange(blocksize,blocksize)), N, d_A, d_B, d_C, A_block, B_block); cl::copy(queue, d_C, begin(h_C), end(h_C)); double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; results(N, N, N, h_C, rtime); } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << std::endl; } }
int main(void) { int Mdim, Ndim, Pdim; // A[N][P], B[P][M], C[N][M] int szA, szB, szC; // Number of elements in each matrix double start_time; // Starting time double run_time; // Timing util::Timer timer; // Timing Ndim = ORDER; Pdim = ORDER; Mdim = ORDER; szA = Ndim * Pdim; szB = Pdim * Mdim; szC = Ndim * Mdim; std::vector<float> h_A(szA); // Host memory for Matrix A std::vector<float> h_B(szB); // Host memory for Matrix B std::vector<float> h_C(szC); // Host memory for Matrix C cl::Buffer d_a, d_b, d_c; // Matrices in device memory initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); timer.reset(); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER); for(int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; results(Mdim, Ndim, Pdim, h_C, run_time); } try { //-------------------------------------------------------------------------------- // Create a context and queue for DEVICE //-------------------------------------------------------------------------------- cl::Context context(DEVICE); cl::CommandQueue queue(context); //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); d_a = cl::Buffer(context, begin(h_A), end(h_A), true); d_b = cl::Buffer(context, begin(h_B), end(h_B), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... Naive //-------------------------------------------------------------------------------- // Create the compute program from the source buffer cl::Program program(context, kernelsource, true); // Create the compute kernel from the program auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. cl::NDRange global(Ndim, Mdim); naive_mmul(cl::EnqueueArgs(queue, global), Mdim, Ndim, Pdim, d_a, d_b, d_c); queue.finish(); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } return EXIT_SUCCESS; }