/* Bring up the OpenCL runtime and compile one .cl program per kernel.
 *
 * kernels[1..num_kerns-1] are compiled (entry 0 is intentionally skipped);
 * each kernel named "foo" is expected to live in "foo.cl".  The compiled
 * cl_kernel objects are inserted into *kernel_map keyed by kernel name.
 * Returns the initialized cl_vars_t runtime state.
 *
 * BUGFIX: the original used variable-length arrays of std::string
 * (`std::string arr[num_kerns-1]`), a non-standard compiler extension that
 * is ill-formed C++ and breaks outright when num_kerns <= 1.  std::vector
 * provides the same contiguous std::string storage portably; .data() yields
 * the std::string* the helper functions take. */
cl_vars_t setupRuntime(kernel* kernels, std::map<std::string, cl_kernel>* kernel_map, int num_kerns) {
  const int num_files = (num_kerns > 1) ? (num_kerns - 1) : 0;
  std::vector<std::string> kernel_source_str(num_files);
  std::vector<std::string> arraycompact_kernel_file(num_files);
  cl_vars_t cv;
  std::list<std::string> kernel_names;

  // Derive each kernel's source filename from its name ("<name>.cl").
  for (int i = 1; i < num_kerns; i++) {
    arraycompact_kernel_file[i - 1] = kernels[i].name + ".cl";
    kernel_names.push_back(kernels[i].name);
  }

  // Read every kernel file, initialize OpenCL, and compile the lot.
  readFile(arraycompact_kernel_file.data(), kernel_source_str.data(), num_files);
  initialize_ocl(cv);
  compile_ocl_program(*kernel_map, cv, kernel_source_str.data(), num_files, kernel_names);
  return cv;
}
int main(int argc, char *argv[]) { std::string matmul_kernel_str; /* Provide names of the OpenCL kernels * and cl file that they're kept in */ std::string matmul_name_str = std::string("matmul"); std::string matmul_kernel_file = std::string("matmul.cl"); cl_vars_t cv; cl_kernel matmul; /* Read OpenCL file into STL string */ readFile(matmul_kernel_file, matmul_kernel_str); /* Initialize the OpenCL runtime * Source in clhelp.cpp */ initialize_ocl(cv); // Compile all OpenCL kernels. compile_ocl_program(matmul, cv, matmul_kernel_str.c_str(), matmul_name_str.c_str()); // Arrays on the host (CPU) float *h_A, *h_B, *h_Y, *h_YY; // Arrays on the device (GPU) cl_mem g_A, g_B, g_Y; /* Allocate arrays on the host * and fill with random data */ int n = (1<<10); h_A = new float[n*n]; assert(h_A); h_B = new float[n*n]; assert(h_B); h_Y = new float[n*n]; assert(h_Y); h_YY = new float[n*n]; assert(h_YY); bzero(h_Y, sizeof(float)*n*n); bzero(h_YY, sizeof(float)*n*n); for(int i = 0; i < (n*n); i++) { h_A[i] = (float)drand48(); h_B[i] = (float)drand48(); } // Allocate memory for arrays on the GPU cl_int err = CL_SUCCESS; /* CS194: Allocate Buffers on the GPU. 
*...We're already allocating the Y buffer * on the GPU for you */ g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE, sizeof(float)*n*n,NULL,&err); CHK_ERR(err); g_A = clCreateBuffer(cv.context,CL_MEM_READ_WRITE, sizeof(float)*n*n,NULL,&err); CHK_ERR(err); g_B = clCreateBuffer(cv.context,CL_MEM_READ_WRITE, sizeof(float)*n*n,NULL,&err); CHK_ERR(err); /* CS194: Copy data from host CPU to GPU */ err = clEnqueueWriteBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n*n, h_Y, 0, NULL, NULL); CHK_ERR(err); err = clEnqueueWriteBuffer(cv.commands, g_A, true, 0, sizeof(float)*n*n, h_A, 0, NULL, NULL); CHK_ERR(err); err = clEnqueueWriteBuffer(cv.commands, g_B, true, 0, sizeof(float)*n*n, h_B, 0, NULL, NULL); CHK_ERR(err); /* CS194: Create appropriately sized workgroups */ size_t global_work_size[2] = {n,n}; size_t local_work_size[2] = {4,4}; /* CS194: Set kernel arguments */ err = clSetKernelArg(matmul, 0, sizeof(cl_mem), &g_Y); CHK_ERR(err); err = clSetKernelArg(matmul, 1, sizeof(cl_mem), &g_A); CHK_ERR(err); err = clSetKernelArg(matmul, 2, sizeof(cl_mem), &g_B); CHK_ERR(err); err = clSetKernelArg(matmul, 3, sizeof(int), &n); CHK_ERR(err); double t0 = timestamp(); /* CS194: Launch matrix multiply kernel Here's a little code to get you started.. 
*/ err = clEnqueueNDRangeKernel(cv.commands, matmul, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); CHK_ERR(err); err = clFinish(cv.commands); CHK_ERR(err); t0 = timestamp()-t0; /* Read result of GPU on host CPU */ err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n*n, h_Y, 0, NULL, NULL); CHK_ERR(err); err = clFinish(cv.commands); CHK_ERR(err); double t1 = timestamp(); sqr_sgemm(h_YY, h_A, h_B, n); t1 = timestamp()-t1; for(int i = 0; i < (n*n); i++) { double d = h_YY[i] - h_Y[i]; d *= d; if(d > 0.0001) { printf("CPU and GPU results do not match!\n"); break; } } uninitialize_ocl(cv); delete [] h_A; delete [] h_B; delete [] h_Y; delete [] h_YY; clReleaseMemObject(g_A); clReleaseMemObject(g_B); clReleaseMemObject(g_Y); double gpu_flops_s = (2.0 * pow((double)n, 3.0)) / t0; printf("GPU: %g gflops/sec\n", gpu_flops_s / (1e9)); double cpu_flops_s = (2.0 * pow((double)n, 3.0)) / t1; printf("CPU: %g gflops/sec\n", cpu_flops_s / (1e9)); return 0; }
int main(int argc, char *argv[]) { std::string reduce_kernel_str; std::string reduce_name_str = std::string("reduce"); std::string reduce_kernel_file = std::string("reduce.cl"); cl_vars_t cv; cl_kernel reduce; readFile(reduce_kernel_file, reduce_kernel_str); initialize_ocl(cv); compile_ocl_program(reduce, cv, reduce_kernel_str.c_str(), reduce_name_str.c_str()); int *h_A, *h_Y; cl_mem g_Out, g_In; int n = (1<<24); int c; /* how long do you want your arrays? */ while((c = getopt(argc, argv, "n:"))!=-1){ switch(c){ case 'n': n = atoi(optarg); break; } } if(n==0) return 0; // pad the array is not power of 2 int padded_size = 1; while(padded_size < n){ padded_size <<= 1; } h_A = new int[padded_size]; h_Y = new int[padded_size]; for(int i = 0; i < n; i++){ h_A[i] = 1; h_Y[i] = 0; } for (int i = n; i < padded_size; ++i) { h_A[i] = 0; h_Y[i] = 0; } cl_int err = CL_SUCCESS; g_Out = clCreateBuffer(cv.context,CL_MEM_READ_WRITE, sizeof(int)*n,NULL,&err); CHK_ERR(err); g_In = clCreateBuffer(cv.context,CL_MEM_READ_WRITE, sizeof(int)*n,NULL,&err); CHK_ERR(err); //copy data from host CPU to GPU err = clEnqueueWriteBuffer(cv.commands, g_Out, true, 0, sizeof(int)*n, h_Y, 0, NULL, NULL); CHK_ERR(err); err = clEnqueueWriteBuffer(cv.commands, g_In, true, 0, sizeof(int)*n, h_A, 0, NULL, NULL); CHK_ERR(err); size_t local_work_size[1] = {512}; size_t global_work_size[1] = {padded_size}; err = clSetKernelArg(reduce, 0, sizeof(cl_mem), &g_In); CHK_ERR(err); err = clSetKernelArg(reduce, 1, sizeof(cl_mem), &g_Out); CHK_ERR(err); err = clSetKernelArg(reduce, 2, sizeof(int)*512, NULL); CHK_ERR(err); err = clSetKernelArg(reduce, 3, sizeof(int), &padded_size); CHK_ERR(err); double t0 = timestamp(); // calls the recursion function recursive_reduce(cv.commands, cv.context, reduce, g_In, g_Out, padded_size); t0 = timestamp()-t0; //read result of GPU on host CPU err = clEnqueueReadBuffer(cv.commands, g_Out, true, 0, sizeof(int)*n, h_Y, 0, NULL, NULL); CHK_ERR(err); int sum=0.0f; for(int i = 0; i < n; 
i++) { sum += h_A[i]; } if(sum!=h_Y[0]) { printf("WRONG: CPU sum = %d, GPU sum = %d\n", sum, h_Y[0]); printf("WRONG: difference = %d\n", sum-h_Y[0]); printf("Other parts = %d, %d, %d, %d\n", h_Y[1], h_Y[2], h_Y[3], h_Y[4]); int z=0; while(h_Y[z] == h_Y[z+1]){ z++; } printf("red: %d\n", z); } else { printf("CORRECT: %d,%g\n",n,t0); } uninitialize_ocl(cv); delete [] h_A; delete [] h_Y; clReleaseMemObject(g_Out); clReleaseMemObject(g_In); return 0; }
/* Bring up the OpenCL runtime: pick the first platform, select device(s)
 * according to the HM_* environment variables, create a context and one
 * profiling-enabled command queue per device, and compile the built-in
 * memset kernel.  Exits the process on any unrecoverable error. */
void initialize_ocl(cl_vars_t& cv) {
  cl_uint num_platforms;
  cv.err = clGetPlatformIDs(1, &(cv.platform), &(num_platforms));
  if (cv.err != CL_SUCCESS) {
    std::cout << "Could not get platform ID" << std::endl;
    exit(1);
  }

  if (getenv("HM_CPU0")) {
    /* Single CPU device. */
    std::cout << "Running on CPU 0" << std::endl;
    cl_uint max_devices = 1;
    cv.err = clGetDeviceIDs(cv.platform, CL_DEVICE_TYPE_CPU, max_devices,
                            cv.device_ids, &(cv.num_devices));
    cv.num_devices = 1;
  } else if (getenv("HM_CPU0_SUB1")) {
    /* Partition CPU 0 into two sub-devices by affinity domain and use
     * the second one. */
    std::cout << "Running on Subdivided1 CPU 0" << std::endl;
    cl_uint max_devices = 1;
    cl_device_id dev0;
    cv.err = clGetDeviceIDs(cv.platform, CL_DEVICE_TYPE_CPU, max_devices,
                            &dev0, &(cv.num_devices));
    cl_uint num_subdevices;
    cl_device_partition_property props[3];
    props[0] = CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN;
    props[1] = CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE;
    props[2] = 0;
    cl_device_id id4[4];
    cv.err = clCreateSubDevices(dev0, props, 2, id4, &num_subdevices);
    std::cout << "num subdevices: " << num_subdevices << std::endl;
    cv.device_ids[0] = id4[1];
    cv.num_devices = 1;
  } else if (getenv("HM_GPU0")) {
    /* Single GPU device. */
    std::cout << "Running on GPU 0" << std::endl;
    cl_uint max_devices = 1;
    cv.err = clGetDeviceIDs(cv.platform, CL_DEVICE_TYPE_GPU, max_devices,
                            cv.device_ids, &(cv.num_devices));
    cv.num_devices = 1;
  } else if (getenv("HM_GPU01")) {
    /* Two GPU devices. */
    std::cout << "Running on GPU 0 and GPU 1" << std::endl;
    cl_uint max_devices = 2;
    cv.err = clGetDeviceIDs(cv.platform, CL_DEVICE_TYPE_GPU, max_devices,
                            cv.device_ids, &(cv.num_devices));
    cv.num_devices = 2;
  } else if (getenv("HM_GPU1")) {
    /* Second GPU only: query both, then keep slot 1. */
    std::cout << "Running on GPU 1" << std::endl;
    cl_uint max_devices = 2;
    cv.err = clGetDeviceIDs(cv.platform, CL_DEVICE_TYPE_GPU, max_devices,
                            cv.device_ids, &(cv.num_devices));
    assert(cv.num_devices > 1);
    cv.device_ids[0] = cv.device_ids[1];
    cv.num_devices = 1;
  } else {
    std::cout << "Error: Specify target either HM_CPU0, HM_GPU0, HM_GPU01, or HM_GPU1"
              << std::endl;
    /* BUGFIX: previously this branch only printed and fell through.
     * cv.err is still CL_SUCCESS from clGetPlatformIDs, so the check
     * below did not catch it and execution continued with uninitialized
     * cv.device_ids / cv.num_devices.  Bail out instead. */
    exit(1);
  }

  if (cv.err != CL_SUCCESS) {
    std::cout << "Could not get GPU device ID" << std::endl;
    exit(1);
  }

  cv.context = clCreateContext(0, cv.num_devices, cv.device_ids,
                               NULL, NULL, &(cv.err));
  if (!cv.context) {
    std::cout << "Could not create context" << std::endl;
    exit(1);
  }

  //cv.commands = clCreateCommandQueue(cv.context, cv.device_id, 0, &(cv.err));
  /* One profiling-enabled in-order queue per selected device. */
  for (size_t devId = 0; devId < cv.num_devices; devId++) {
    cv.commands[devId] = clCreateCommandQueue(cv.context, cv.device_ids[devId],
                                              CL_QUEUE_PROFILING_ENABLE,
                                              &(cv.err));
    if (!cv.commands[devId]) {
      std::cout << "Could not create command queue" << std::endl;
      exit(1);
    }
  }

  /* Build the helper memset kernel used elsewhere in the runtime. */
  compile_ocl_program(cv.memset_program, cv.memset_kernel, cv,
                      memset_kernel_str, "memset_kernel");

#ifdef VERBOSE_COMPILATION
  docs.opencl_ss << "CL fill vars success" << std::endl;
  /* Dump per-device info (name, global memory, max work-item sizes). */
  for (size_t devId = 0; devId < cv.num_devices; devId++) {
    docs.opencl_ss << "Device ID: " << devId << std::endl;
    char device_name[255];
    cv.err = clGetDeviceInfo(cv.device_ids[devId], CL_DEVICE_NAME, 255,
                             device_name, NULL);
    docs.opencl_ss << "Device Name: " << device_name << std::endl;
    cl_ulong mem_size;
    cv.err = clGetDeviceInfo(cv.device_ids[devId], CL_DEVICE_GLOBAL_MEM_SIZE,
                             sizeof(cl_ulong), &mem_size, NULL);
    docs.opencl_ss << "Global mem size: " << mem_size << std::endl;
    size_t max_work_item[3];
    cv.err = clGetDeviceInfo(cv.device_ids[devId], CL_DEVICE_MAX_WORK_ITEM_SIZES,
                             sizeof(max_work_item), max_work_item, NULL);
    docs.opencl_ss << "Max work item sizes: " << max_work_item[0] << ", "
                   << max_work_item[1] << ", " << max_work_item[2] << std::endl;
  }
#endif
}
int main(int argc, char *argv[]) { std::string incr_kernel_str; /* Provide names of the OpenCL kernels * and cl file that they're kept in */ std::string incr_name_str = std::string("incr"); std::string incr_kernel_file = std::string("incr.cl"); cl_vars_t cv; cl_kernel incr; /* Read OpenCL file into STL string */ readFile(incr_kernel_file, incr_kernel_str); /* Initialize the OpenCL runtime * Source in clhelp.cpp */ initialize_ocl(cv); /* Compile all OpenCL kernels */ compile_ocl_program(incr, cv, incr_kernel_str.c_str(), incr_name_str.c_str()); /* Arrays on the host (CPU) */ float *h_Y, *h_YY; /* Arrays on the device (GPU) */ cl_mem g_Y; // Allocate arrays on the host and fill with random data. int n = (1<<20); h_Y = new float[n]; h_YY = new float[n]; for(int i = 0; i < n; i++) { h_YY[i] = h_Y[i] = (float)drand48(); } cl_int err = CL_SUCCESS; /* CS194: Allocate memory for arrays on * the GPU */ // Allocate the buffer memory objects. g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err); CHK_ERR(err); // Write data from CPU to GPU.(this is opposite of clEnqueueReadBuffer()) err = clEnqueueWriteBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n, h_Y, 0, NULL, NULL); CHK_ERR(err); // Define the global and local workgroup sizes. size_t global_work_size[1] = {n}; size_t local_work_size[1] = {128}; // Set the kernel args values. err = clSetKernelArg(incr, 0, sizeof(cl_mem), &g_Y); CHK_ERR(err); err = clSetKernelArg(incr, 1, sizeof(int), &n); CHK_ERR(err); // Call kernel on the GPU. 
err = clEnqueueNDRangeKernel(cv.commands, incr, 1,//work_dim, NULL, //global_work_offset global_work_size, //global_work_size local_work_size, //local_work_size 0, //num_events_in_wait_list NULL, //event_wait_list NULL // ); CHK_ERR(err); /* Read result of GPU on host CPU */ err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n, h_Y, 0, NULL, NULL); CHK_ERR(err); /* Check answer */ bool er = false; for(int i = 0; i < n; i++) { float d = (h_YY[i] + 1.0f); if(h_Y[i] != d) { printf("error at %d :(\n", i); er = true; break; } } if(!er) { printf("CPU and GPU results match\n"); } uninitialize_ocl(cv); delete [] h_Y; delete [] h_YY; clReleaseMemObject(g_Y); return 0; }
int main(int argc, char *argv[]) { std::string vvadd_kernel_str; /* Provide names of the OpenCL kernels * and cl file that they're kept in */ std::string vvadd_name_str = std::string("vvadd"); std::string vvadd_kernel_file = std::string("vvadd.cl"); cl_vars_t cv; cl_kernel vvadd; /* Read OpenCL file into STL string */ readFile(vvadd_kernel_file, vvadd_kernel_str); /* Initialize the OpenCL runtime * Source in clhelp.cpp */ initialize_ocl(cv); /* Compile all OpenCL kernels */ compile_ocl_program(vvadd, cv, vvadd_kernel_str.c_str(), vvadd_name_str.c_str()); /* Arrays on the host (CPU) */ float *h_A, *h_B, *h_Y; /* Arrays on the device (GPU) */ cl_mem g_A, g_B, g_Y; /* Allocate arrays on the host * and fill with random data */ int n = (1<<20); h_A = new float[n]; h_B = new float[n]; h_Y = new float[n]; bzero(h_Y, sizeof(float)*n); for(int i = 0; i < n; i++) { h_A[i] = (float)drand48(); h_B[i] = (float)drand48(); } /* CS194: Allocate memory for arrays on * the GPU */ cl_int err = CL_SUCCESS; /* CS194: Here's something to get you started */ g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err); CHK_ERR(err); g_A = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err); CHK_ERR(err); g_B = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err); CHK_ERR(err); /* CS194: Copy data from host CPU to GPU */ err = clEnqueueWriteBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n, h_Y, 0, NULL, NULL); CHK_ERR(err); err = clEnqueueWriteBuffer(cv.commands, g_A, true, 0, sizeof(float)*n, h_A, 0, NULL, NULL); CHK_ERR(err); err = clEnqueueWriteBuffer(cv.commands, g_B, true, 0, sizeof(float)*n, h_B, 0, NULL, NULL); CHK_ERR(err); /* CS194: Define the global and local workgroup sizes */ size_t global_work_size[1] = {n}; size_t local_work_size[1] = {128}; /* CS194: Set Kernel Arguments */ err = clSetKernelArg(vvadd, 0, sizeof(cl_mem), &g_Y); CHK_ERR(err); err = clSetKernelArg(vvadd, 1, sizeof(cl_mem), &g_A); CHK_ERR(err); err = 
clSetKernelArg(vvadd, 2, sizeof(cl_mem), &g_B); CHK_ERR(err); err = clSetKernelArg(vvadd, 3, sizeof(int), &n); CHK_ERR(err); /* CS194: Call kernel on the GPU */ err = clEnqueueNDRangeKernel(cv.commands, vvadd, 1,//work_dim, NULL, //global_work_offset global_work_size, //global_work_size local_work_size, //local_work_size 0, //num_events_in_wait_list NULL, //event_wait_list NULL // ); /* Read result of GPU on host CPU */ err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n, h_Y, 0, NULL, NULL); CHK_ERR(err); /* Check answer */ for(int i = 0; i < n; i++) { float d = h_A[i] + h_B[i]; if(h_Y[i] != d) { printf("error at %d :(\n", i); break; } } /* Shut down the OpenCL runtime */ uninitialize_ocl(cv); delete [] h_A; delete [] h_B; delete [] h_Y; clReleaseMemObject(g_A); clReleaseMemObject(g_B); clReleaseMemObject(g_Y); return 0; }
/* Host driver for the "incr" OpenCL kernel: fills an array with random
 * floats, increments every element by 1.0f on the GPU, and verifies the
 * result against a CPU copy of the original values. */
int main(int argc, char *argv[]) {
  std::string incr_kernel_str;

  /* Name of the OpenCL kernel and the .cl file that it is kept in. */
  std::string incr_name_str = std::string("incr");
  std::string incr_kernel_file = std::string("incr.cl");

  cl_vars_t cv;
  cl_kernel incr;

  /* Read OpenCL file into STL string. */
  readFile(incr_kernel_file, incr_kernel_str);

  /* Initialize the OpenCL runtime (source in clhelp.cpp). */
  initialize_ocl(cv);

  /* Compile all OpenCL kernels. */
  compile_ocl_program(incr, cv, incr_kernel_str.c_str(), incr_name_str.c_str());

  /* Arrays on the host (CPU); h_YY keeps the pre-increment values. */
  float *h_Y, *h_YY;
  /* Buffer on the device (GPU). */
  cl_mem g_Y;

  int n = (1<<20);
  h_Y = new float[n];
  h_YY = new float[n];
  for(int i = 0; i < n; i++) {
    h_YY[i] = h_Y[i] = (float)drand48();
  }

  cl_int err = CL_SUCCESS;

  /* Create a buffer in context cv.context with read/write access and a
   * size of sizeof(float)*n bytes. */
  g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err);
  CHK_ERR(err);

  /* Enqueue (on cv.commands) a write to buffer g_Y from host memory:
   *   true           -> blocking write; the call returns only after the
   *                     data has been copied,
   *   0              -> byte offset into the buffer to write to,
   *   sizeof(float)*n -> number of bytes being written,
   *   h_Y            -> host address the data is written from. */
  err = clEnqueueWriteBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n,
                             h_Y, 0, NULL, NULL);
  /* Check whether the write command was successful. */
  CHK_ERR(err);

  /* Global size of the 1-D index space: n work items total. */
  size_t global_work_size[1] = {n};
  /* Size of each work group: 128 work items. */
  size_t local_work_size[1] = {128};

  /* Set kernel argument 0 of incr: a sizeof(cl_mem)-byte argument whose
   * value is the pointer to buffer g_Y. */
  err = clSetKernelArg(incr, 0, sizeof(cl_mem), &g_Y);
  CHK_ERR(err);

  /* Set kernel argument 1 of incr: a sizeof(int)-byte argument whose
   * value is the pointer to n. */
  err = clSetKernelArg(incr, 1, sizeof(int), &n);
  CHK_ERR(err);

  /* Enqueue on cv.commands a command that executes the incr kernel on
   * the device, over a 1-D index space of global_work_size work items
   * split into groups of local_work_size. */
  err = clEnqueueNDRangeKernel(cv.commands,
                               incr,
                               1,                 //work_dim
                               NULL,              //global_work_offset
                               global_work_size,  //global_work_size
                               local_work_size,   //local_work_size
                               0,                 //num_events_in_wait_list
                               NULL,              //event_wait_list
                               NULL);             //event
  CHK_ERR(err);

  /* Read result of GPU on host CPU. */
  err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n,
                            h_Y, 0, NULL, NULL);
  CHK_ERR(err);

  /* Check answer: every element should equal its old value plus 1.0f. */
  bool er = false;
  for(int i = 0; i < n; i++) {
    float d = (h_YY[i] + 1.0f);
    if(h_Y[i] != d) {
      printf("error at %d :(\n", i);
      er = true;
      break;
    }
  }
  if(!er) {
    printf("CPU and GPU results match\n");
  }

  uninitialize_ocl(cv);
  delete [] h_Y;
  delete [] h_YY;
  clReleaseMemObject(g_Y);
  return 0;
}