int main(void) { float *h_A; // A matrix float *h_B; // B matrix float *h_C; // C = A*B matrix int Mdim, Ndim, Pdim; // A[N][P], B[P][M], C[N][M] int szA, szB, szC; // number of elements in each matrix cl_mem d_a, d_b, d_c; // Matrices in device memory double start_time; // Starting time double run_time; // timing data char * kernelsource; // kernel source string cl_int err; // error code returned from OpenCL calls cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel Ndim = ORDER; Pdim = ORDER; Mdim = ORDER; szA = Ndim * Pdim; szB = Pdim * Mdim; szC = Ndim * Mdim; h_A = (float *)malloc(szA * sizeof(float)); h_B = (float *)malloc(szB * sizeof(float)); h_C = (float *)malloc(szC * sizeof(float)); initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER); for(int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C); run_time = wtime() - start_time; results(Mdim, Ndim, Pdim, h_C, run_time); } //-------------------------------------------------------------------------------- // Create a context, queue and device. //-------------------------------------------------------------------------------- // Set up OpenCL context. queue, kernel, etc. cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to find a platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to get the platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Secure a device for (int i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) break; } if (device_id == NULL) { printf("Error: Failed to create a device group!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n%s\n", err_code(err)); return EXIT_FAILURE; } //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * szA, h_A, &err); if (err != CL_SUCCESS) { printf("Error: failed to create buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * szB, h_B, &err); if (err != CL_SUCCESS) { printf("Error: failed to create buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC, NULL, &err); if (err != CL_SUCCESS) { printf("Error: failed to create buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... Naive //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_elem.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); if (err != CL_SUCCESS) { printf("Error: could not create program\n%s\n", err_code(err)); return EXIT_FAILURE; } free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n%s\n", err_code(err)); return EXIT_FAILURE; } printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &Mdim); err |= clSetKernelArg(kernel, 1, sizeof(int), &Ndim); err |= clSetKernelArg(kernel, 2, sizeof(int), &Pdim); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_c); if (err != CL_SUCCESS) { printf("Error: Could not set kernel arguments\n"); return EXIT_FAILURE; } start_time = wtime(); // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. const size_t global[2] = {Ndim, Mdim}; err = clEnqueueNDRangeKernel( commands, kernel, 2, NULL, global, NULL, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel\n%s\n", err_code(err)); return EXIT_FAILURE; } err = clFinish(commands); if (err != CL_SUCCESS) { printf("Error: waiting for queue to finish failed\n%s\n", err_code(err)); return EXIT_FAILURE; } run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * szC, h_C, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_row.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); if (err != CL_SUCCESS) { printf("Error: could not create program\n%s\n", err_code(err)); return EXIT_FAILURE; } free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n%s\n", err_code(err)); return EXIT_FAILURE; } printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &Mdim); err |= clSetKernelArg(kernel, 1, sizeof(int), &Ndim); err |= clSetKernelArg(kernel, 2, sizeof(int), &Pdim); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_c); if (err != CL_SUCCESS) { printf("Error: Could not set kernel arguments\n"); return EXIT_FAILURE; } start_time = wtime(); // Execute the kernel over the rows of the C matrix ... computing // a dot product for each element of the product matrix. const size_t global = Ndim; err = clEnqueueNDRangeKernel( commands, kernel, 1, NULL, &global, NULL, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel\n%s\n", err_code(err)); return EXIT_FAILURE; } err = clFinish(commands); if (err != CL_SUCCESS) { printf("Error: waiting for queue to finish failed\n%s\n", err_code(err)); return EXIT_FAILURE; } run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * szC, h_C, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item, A row in pivate memory //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_row_priv.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); if (err != CL_SUCCESS) { printf("Error: could not create program\n%s\n", err_code(err)); return EXIT_FAILURE; } free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n%s\n", err_code(err)); return EXIT_FAILURE; } printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &Mdim); err |= clSetKernelArg(kernel, 1, sizeof(int), &Ndim); err |= clSetKernelArg(kernel, 2, sizeof(int), &Pdim); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_c); if (err != CL_SUCCESS) { printf("Error: Could not set kernel arguments\n"); return EXIT_FAILURE; } start_time = wtime(); // Execute the kernel over the rows of the C matrix ... computing // a dot product for each element of the product matrix. const size_t global = Ndim; const size_t local = ORDER / 16; err = clEnqueueNDRangeKernel( commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel\n%s\n", err_code(err)); return EXIT_FAILURE; } err = clFinish(commands); if (err != CL_SUCCESS) { printf("Error: waiting for queue to finish failed\n%s\n", err_code(err)); return EXIT_FAILURE; } run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * szC, h_C, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // Clean up! //-------------------------------------------------------------------------------- free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return EXIT_SUCCESS; }
int main(void) { int Mdim, Ndim, Pdim; // A[N][P], B[P][M], C[N][M] int szA, szB, szC; // number of elements in each matrix double start_time; // Starting time double run_time; // timing data Ndim = ORDER; Pdim = ORDER; Mdim = ORDER; szA = Ndim * Pdim; szB = Pdim * Mdim; szC = Ndim * Mdim; std::vector<float> h_A(szA); // Host memory for Matrix A std::vector<float> h_B(szB); // Host memory for Matrix B std::vector<float> h_C(szC); // Host memory for Matrix C cl::Buffer d_a, d_b, d_c; // Matrices in device memory initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER); for(int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C); run_time = wtime() - start_time; results(Mdim, Ndim, Pdim, h_C, run_time); } try { //-------------------------------------------------------------------------------- // Create a context and queue for DEVICE //-------------------------------------------------------------------------------- cl::Context context(DEVICE); cl::CommandQueue queue(context); //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); d_a = cl::Buffer(context, begin(h_A), end(h_A), true); d_b = cl::Buffer(context, begin(h_B), end(h_B), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... Naive //-------------------------------------------------------------------------------- // Create the compute program from the source buffer cl::Program program(context, util::loadProgram("../C_elem.cl"), true); // Create the compute kernel from the program auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. cl::NDRange global(Ndim, Mdim); naive_mmul(cl::EnqueueArgs(queue, global), Mdim, Ndim, Pdim, d_a, d_b, d_c); queue.finish(); run_time = wtime() - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item //-------------------------------------------------------------------------------- // Create the compute program from the source buffer program = cl::Program(context, util::loadProgram("../C_row.cl"), true); // Create the compute kernel from the program auto crow_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul"); printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); cl::NDRange global(Ndim); crow_mmul(cl::EnqueueArgs(queue, global), Mdim, Ndim, Pdim, d_a, d_b, d_c); queue.finish(); run_time = wtime() - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item, A row in pivate memory //-------------------------------------------------------------------------------- // Create the compute program from the source buffer program = cl::Program(context, util::loadProgram("../C_row_priv.cl"), true); // Create the compute kernel from the program auto arowpriv_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul"); printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); cl::NDRange global(Ndim); cl::NDRange local(ORDER / 16); arowpriv_mmul(cl::EnqueueArgs(queue, global, local), Mdim, Ndim, Pdim, d_a, d_b, d_c); queue.finish(); run_time = wtime() - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item, A row pivate, B col local //-------------------------------------------------------------------------------- // Create the compute program from the source buffer program = cl::Program(context, util::loadProgram("../C_row_priv_bloc.cl"), true); // Create the compute kernel from the program auto browloc_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg>(program, "mmul"); printf("\n===== OpenCL, mat mult, C row, priv A, B cols loc, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); cl::NDRange global(Ndim); cl::NDRange local(ORDER / 16); cl::LocalSpaceArg localmem = cl::Local(sizeof(float) * Pdim); browloc_mmul(cl::EnqueueArgs(queue, global, local), Mdim, Ndim, Pdim, d_a, d_b, d_c, localmem); queue.finish(); run_time = wtime() - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... A and B in block form in local memory //-------------------------------------------------------------------------------- // Create the compute program from the source buffer program = cl::Program(context, util::loadProgram("../C_block_form.cl"), true); // Create the compute kernel from the program auto block_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl::LocalSpaceArg>(program, "mmul"); printf("\n===== OpenCL, A and B in block form in local memory, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); int blocksize = 16; cl::NDRange global(Ndim, Mdim); cl::NDRange local(blocksize, blocksize); cl::LocalSpaceArg localmem1 = cl::Local(sizeof(float) * blocksize * blocksize); cl::LocalSpaceArg localmem2 = cl::Local(sizeof(float) * blocksize * blocksize); block_mmul(cl::EnqueueArgs(queue, global, local), Mdim, Ndim, Pdim, d_a, d_b, d_c, localmem1, localmem2); queue.finish(); run_time = wtime() - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } return EXIT_SUCCESS; }
int main(int argc, char *argv[]) { int N; // A[N][N], B[N][N], C[N][N] int size; // Number of elements in each matrix double start_time; // Starting time double run_time; // Timing util::Timer timer; // Timing N = ORDER; size = N * N; std::vector<float> h_A(size); // Host memory for Matrix A std::vector<float> h_B(size); // Host memory for Matrix B std::vector<float> h_C(size); // Host memory for Matrix C cl::Buffer d_a, d_b, d_c; // Matrices in device memory //-------------------------------------------------------------------------------- // Create a context and queue //-------------------------------------------------------------------------------- try { cl_uint deviceIndex = 0; parseArguments(argc, argv, &deviceIndex); // Get list of devices std::vector<cl::Device> devices; unsigned numDevices = getDeviceList(devices); // Check device index in range if (deviceIndex >= numDevices) { std::cout << "Invalid device index (try '--list')\n"; return EXIT_FAILURE; } cl::Device device = devices[deviceIndex]; std::string name; getDeviceName(device, name); std::cout << "\nUsing OpenCL device: " << name << "\n"; std::vector<cl::Device> chosen_device; chosen_device.push_back(device); cl::Context context(chosen_device); cl::CommandQueue queue(context, device); //-------------------------------------------------------------------------------- // Run sequential matmul //-------------------------------------------------------------------------------- initmat(N, h_A, h_B, h_C); timer.reset(); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",N); for(int i = 0; i < COUNT; i++) { zero_mat(N, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; seq_mat_mul_sdot(N, h_A, h_B, h_C); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; results(N, h_C, run_time); } //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(N, h_A, h_B, h_C); d_a = cl::Buffer(context, h_A.begin(), h_A.end(), true); d_b = cl::Buffer(context, h_B.begin(), h_B.end(), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... Naive //-------------------------------------------------------------------------------- // Create the compute program from the source buffer cl::Program program(context, kernelsource, true); // Create the compute kernel from the program cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer> naive_mmul(program, "mmul"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. cl::NDRange global(N, N); naive_mmul(cl::EnqueueArgs(queue, global), N, d_a, d_b, d_c); queue.finish(); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; cl::copy(queue, d_c, h_C.begin(), h_C.end()); results(N, h_C, run_time); } // end for loop } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } return EXIT_SUCCESS; }
int main(int argc, char *argv[]) { float *h_A; // A matrix float *h_B; // B matrix float *h_C; // C = A*B matrix int N; // A[N][N], B[N][N], C[N][N] int size; // number of elements in each matrix cl_mem d_a, d_b, d_c; // Matrices in device memory double start_time; // Starting time double run_time; // timing data char * kernelsource; // kernel source string cl_int err; // error code returned from OpenCL calls cl_device_id device; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel N = ORDER; size = N * N; h_A = (float *)malloc(size * sizeof(float)); h_B = (float *)malloc(size * sizeof(float)); h_C = (float *)malloc(size * sizeof(float)); //-------------------------------------------------------------------------------- // Create a context, queue and device. //-------------------------------------------------------------------------------- cl_uint deviceIndex = 0; parseArguments(argc, argv, &deviceIndex); // Get list of devices cl_device_id devices[MAX_DEVICES]; unsigned numDevices = getDeviceList(devices); // Check device index in range if (deviceIndex >= numDevices) { printf("Invalid device index (try '--list')\n"); return EXIT_FAILURE; } device = devices[deviceIndex]; char name[MAX_INFO_STRING]; getDeviceName(device, name); printf("\nUsing OpenCL device: %s\n", name); // Create a compute context context = clCreateContext(0, 1, &device, NULL, NULL, &err); checkError(err, "Creating context"); // Create a command queue commands = clCreateCommandQueue(context, device, 0, &err); checkError(err, "Creating command queue"); //-------------------------------------------------------------------------------- // Run sequential version on the host //-------------------------------------------------------------------------------- initmat(N, h_A, h_B, h_C); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER); for(int i = 0; i < COUNT; i++) { zero_mat(N, h_C); start_time = wtime(); seq_mat_mul_sdot(N, h_A, h_B, h_C); run_time = wtime() - start_time; results(N, h_C, run_time); } //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(N, h_A, h_B, h_C); d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * size, h_A, &err); checkError(err, "Creating buffer d_a"); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * size, h_B, &err); checkError(err, "Creating buffer d_b"); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size, NULL, &err); checkError(err, "Creating buffer d_c"); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... Naive //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_elem.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); checkError(err, "Creating program with C_elem.cl"); free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); checkError(err, "Creating kernel from C_elem.cl"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &N); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c); checkError(err, "Setting kernel args"); start_time = wtime(); // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. const size_t global[2] = {N, N}; err = clEnqueueNDRangeKernel( commands, kernel, 2, NULL, global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * size, h_C, 0, NULL, NULL); checkError(err, "Copying back d_c"); results(N, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_row.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); checkError(err, "Creating program with C_row.cl"); free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); checkError(err, "Creating kernel from C_row.cl"); printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &N); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c); checkError(err, "Setting kernel args"); start_time = wtime(); // Execute the kernel over the rows of the C matrix ... computing // a dot product for each element of the product matrix. const size_t global = N; err = clEnqueueNDRangeKernel( commands, kernel, 1, NULL, &global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * size, h_C, 0, NULL, NULL); checkError(err, "Reading back d_c"); results(N, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item, A row in pivate memory //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_row_priv.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); checkError(err, "Creating program from C_row_priv.cl"); free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); checkError(err, "Creating kernel from C_row_priv.cl"); printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &N); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c); checkError(err, "Setting kernel args"); start_time = wtime(); // Execute the kernel over the rows of the C matrix ... computing // a dot product for each element of the product matrix. const size_t global = N; const size_t local = ORDER / 16; err = clEnqueueNDRangeKernel( commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * size, h_C, 0, NULL, NULL); checkError(err, "Reading back d_c"); results(N, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // Clean up! //-------------------------------------------------------------------------------- free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return EXIT_SUCCESS; }
int main(void) { int Mdim, Ndim, Pdim; // A[N][P], B[P][M], C[N][M] int szA, szB, szC; // Number of elements in each matrix double start_time; // Starting time double run_time; // Timing util::Timer timer; // Timing Ndim = ORDER; Pdim = ORDER; Mdim = ORDER; szA = Ndim * Pdim; szB = Pdim * Mdim; szC = Ndim * Mdim; std::vector<float> h_A(szA); // Host memory for Matrix A std::vector<float> h_B(szB); // Host memory for Matrix B std::vector<float> h_C(szC); // Host memory for Matrix C cl::Buffer d_a, d_b, d_c; // Matrices in device memory initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); timer.reset(); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER); for(int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; results(Mdim, Ndim, Pdim, h_C, run_time); } try { //-------------------------------------------------------------------------------- // Create a context and queue for DEVICE //-------------------------------------------------------------------------------- cl::Context context(DEVICE); cl::CommandQueue queue(context); //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); d_a = cl::Buffer(context, begin(h_A), end(h_A), true); d_b = cl::Buffer(context, begin(h_B), end(h_B), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... Naive //-------------------------------------------------------------------------------- // Create the compute program from the source buffer cl::Program program(context, kernelsource, true); // Create the compute kernel from the program auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. cl::NDRange global(Ndim, Mdim); naive_mmul(cl::EnqueueArgs(queue, global), Mdim, Ndim, Pdim, d_a, d_b, d_c); queue.finish(); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } return EXIT_SUCCESS; }