cl_vars_t setupRuntime(kernel* kernels, std::map<std::string, cl_kernel>* kernel_map, int num_kerns) { std::string kernel_source_str[num_kerns-1]; std::string arraycompact_kernel_file[num_kerns-1]; cl_vars_t cv; std::list<std::string> kernel_names; //get the names of the kernel files for (int i = 1; i<num_kerns; i++) { arraycompact_kernel_file[i-1] = kernels[i].name + ".cl"; kernel_names.push_back(kernels[i].name); } cl_int err = CL_SUCCESS; //read the kernel files readFile(arraycompact_kernel_file, kernel_source_str, num_kerns-1); initialize_ocl(cv); compile_ocl_program(*kernel_map, cv, kernel_source_str, num_kerns-1, kernel_names); return cv; }
int main(int argc, char *argv[]) { std::string matmul_kernel_str; /* Provide names of the OpenCL kernels * and cl file that they're kept in */ std::string matmul_name_str = std::string("matmul"); std::string matmul_kernel_file = std::string("matmul.cl"); cl_vars_t cv; cl_kernel matmul; /* Read OpenCL file into STL string */ readFile(matmul_kernel_file, matmul_kernel_str); /* Initialize the OpenCL runtime * Source in clhelp.cpp */ initialize_ocl(cv); // Compile all OpenCL kernels. compile_ocl_program(matmul, cv, matmul_kernel_str.c_str(), matmul_name_str.c_str()); // Arrays on the host (CPU) float *h_A, *h_B, *h_Y, *h_YY; // Arrays on the device (GPU) cl_mem g_A, g_B, g_Y; /* Allocate arrays on the host * and fill with random data */ int n = (1<<10); h_A = new float[n*n]; assert(h_A); h_B = new float[n*n]; assert(h_B); h_Y = new float[n*n]; assert(h_Y); h_YY = new float[n*n]; assert(h_YY); bzero(h_Y, sizeof(float)*n*n); bzero(h_YY, sizeof(float)*n*n); for(int i = 0; i < (n*n); i++) { h_A[i] = (float)drand48(); h_B[i] = (float)drand48(); } // Allocate memory for arrays on the GPU cl_int err = CL_SUCCESS; /* CS194: Allocate Buffers on the GPU. *...We're already allocating the Y buffer * on the GPU for you */ g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE, sizeof(float)*n*n,NULL,&err); CHK_ERR(err); g_A = clCreateBuffer(cv.context,CL_MEM_READ_WRITE, sizeof(float)*n*n,NULL,&err); CHK_ERR(err); g_B = clCreateBuffer(cv.context,CL_MEM_READ_WRITE, sizeof(float)*n*n,NULL,&err); CHK_ERR(err); /* CS194: Copy data from host CPU to GPU */ err = clEnqueueWriteBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n*n, h_Y, 0, NULL, NULL); CHK_ERR(err); err = clEnqueueWriteBuffer(cv.commands, g_A, true, 0, sizeof(float)*n*n, h_A, 0, NULL, NULL); CHK_ERR(err); err = clEnqueueWriteBuffer(cv.commands, g_B, true, 0, sizeof(float)*n*n, h_B, 0, NULL, NULL); CHK_ERR(err); /* CS194: Create appropriately sized workgroups */ size_t global_work_size[2] = {n,n}; size_t local_work_size[2] = {4,4}; /* CS194: Set kernel arguments */ err = clSetKernelArg(matmul, 0, sizeof(cl_mem), &g_Y); CHK_ERR(err); err = clSetKernelArg(matmul, 1, sizeof(cl_mem), &g_A); CHK_ERR(err); err = clSetKernelArg(matmul, 2, sizeof(cl_mem), &g_B); CHK_ERR(err); err = clSetKernelArg(matmul, 3, sizeof(int), &n); CHK_ERR(err); double t0 = timestamp(); /* CS194: Launch matrix multiply kernel Here's a little code to get you started.. */ err = clEnqueueNDRangeKernel(cv.commands, matmul, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); CHK_ERR(err); err = clFinish(cv.commands); CHK_ERR(err); t0 = timestamp()-t0; /* Read result of GPU on host CPU */ err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n*n, h_Y, 0, NULL, NULL); CHK_ERR(err); err = clFinish(cv.commands); CHK_ERR(err); double t1 = timestamp(); sqr_sgemm(h_YY, h_A, h_B, n); t1 = timestamp()-t1; for(int i = 0; i < (n*n); i++) { double d = h_YY[i] - h_Y[i]; d *= d; if(d > 0.0001) { printf("CPU and GPU results do not match!\n"); break; } } uninitialize_ocl(cv); delete [] h_A; delete [] h_B; delete [] h_Y; delete [] h_YY; clReleaseMemObject(g_A); clReleaseMemObject(g_B); clReleaseMemObject(g_Y); double gpu_flops_s = (2.0 * pow((double)n, 3.0)) / t0; printf("GPU: %g gflops/sec\n", gpu_flops_s / (1e9)); double cpu_flops_s = (2.0 * pow((double)n, 3.0)) / t1; printf("CPU: %g gflops/sec\n", cpu_flops_s / (1e9)); return 0; }
int main(int argc, char *argv[]) { std::string reduce_kernel_str; std::string reduce_name_str = std::string("reduce"); std::string reduce_kernel_file = std::string("reduce.cl"); cl_vars_t cv; cl_kernel reduce; readFile(reduce_kernel_file, reduce_kernel_str); initialize_ocl(cv); compile_ocl_program(reduce, cv, reduce_kernel_str.c_str(), reduce_name_str.c_str()); int *h_A, *h_Y; cl_mem g_Out, g_In; int n = (1<<24); int c; /* how long do you want your arrays? */ while((c = getopt(argc, argv, "n:"))!=-1){ switch(c){ case 'n': n = atoi(optarg); break; } } if(n==0) return 0; // pad the array is not power of 2 int padded_size = 1; while(padded_size < n){ padded_size <<= 1; } h_A = new int[padded_size]; h_Y = new int[padded_size]; for(int i = 0; i < n; i++){ h_A[i] = 1; h_Y[i] = 0; } for (int i = n; i < padded_size; ++i) { h_A[i] = 0; h_Y[i] = 0; } cl_int err = CL_SUCCESS; g_Out = clCreateBuffer(cv.context,CL_MEM_READ_WRITE, sizeof(int)*n,NULL,&err); CHK_ERR(err); g_In = clCreateBuffer(cv.context,CL_MEM_READ_WRITE, sizeof(int)*n,NULL,&err); CHK_ERR(err); //copy data from host CPU to GPU err = clEnqueueWriteBuffer(cv.commands, g_Out, true, 0, sizeof(int)*n, h_Y, 0, NULL, NULL); CHK_ERR(err); err = clEnqueueWriteBuffer(cv.commands, g_In, true, 0, sizeof(int)*n, h_A, 0, NULL, NULL); CHK_ERR(err); size_t local_work_size[1] = {512}; size_t global_work_size[1] = {padded_size}; err = clSetKernelArg(reduce, 0, sizeof(cl_mem), &g_In); CHK_ERR(err); err = clSetKernelArg(reduce, 1, sizeof(cl_mem), &g_Out); CHK_ERR(err); err = clSetKernelArg(reduce, 2, sizeof(int)*512, NULL); CHK_ERR(err); err = clSetKernelArg(reduce, 3, sizeof(int), &padded_size); CHK_ERR(err); double t0 = timestamp(); // calls the recursion function recursive_reduce(cv.commands, cv.context, reduce, g_In, g_Out, padded_size); t0 = timestamp()-t0; //read result of GPU on host CPU err = clEnqueueReadBuffer(cv.commands, g_Out, true, 0, sizeof(int)*n, h_Y, 0, NULL, NULL); CHK_ERR(err); int sum=0.0f; for(int i = 0; i < n; i++) { sum += h_A[i]; } if(sum!=h_Y[0]) { printf("WRONG: CPU sum = %d, GPU sum = %d\n", sum, h_Y[0]); printf("WRONG: difference = %d\n", sum-h_Y[0]); printf("Other parts = %d, %d, %d, %d\n", h_Y[1], h_Y[2], h_Y[3], h_Y[4]); int z=0; while(h_Y[z] == h_Y[z+1]){ z++; } printf("red: %d\n", z); } else { printf("CORRECT: %d,%g\n",n,t0); } uninitialize_ocl(cv); delete [] h_A; delete [] h_Y; clReleaseMemObject(g_Out); clReleaseMemObject(g_In); return 0; }
int main(int argc, char *argv[]) { std::string incr_kernel_str; /* Provide names of the OpenCL kernels * and cl file that they're kept in */ std::string incr_name_str = std::string("incr"); std::string incr_kernel_file = std::string("incr.cl"); cl_vars_t cv; cl_kernel incr; /* Read OpenCL file into STL string */ readFile(incr_kernel_file, incr_kernel_str); /* Initialize the OpenCL runtime * Source in clhelp.cpp */ initialize_ocl(cv); /* Compile all OpenCL kernels */ compile_ocl_program(incr, cv, incr_kernel_str.c_str(), incr_name_str.c_str()); /* Arrays on the host (CPU) */ float *h_Y, *h_YY; /* Arrays on the device (GPU) */ cl_mem g_Y; // Allocate arrays on the host and fill with random data. int n = (1<<20); h_Y = new float[n]; h_YY = new float[n]; for(int i = 0; i < n; i++) { h_YY[i] = h_Y[i] = (float)drand48(); } cl_int err = CL_SUCCESS; /* CS194: Allocate memory for arrays on * the GPU */ // Allocate the buffer memory objects. g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err); CHK_ERR(err); // Write data from CPU to GPU.(this is opposite of clEnqueueReadBuffer()) err = clEnqueueWriteBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n, h_Y, 0, NULL, NULL); CHK_ERR(err); // Define the global and local workgroup sizes. size_t global_work_size[1] = {n}; size_t local_work_size[1] = {128}; // Set the kernel args values. err = clSetKernelArg(incr, 0, sizeof(cl_mem), &g_Y); CHK_ERR(err); err = clSetKernelArg(incr, 1, sizeof(int), &n); CHK_ERR(err); // Call kernel on the GPU. err = clEnqueueNDRangeKernel(cv.commands, incr, 1,//work_dim, NULL, //global_work_offset global_work_size, //global_work_size local_work_size, //local_work_size 0, //num_events_in_wait_list NULL, //event_wait_list NULL // ); CHK_ERR(err); /* Read result of GPU on host CPU */ err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n, h_Y, 0, NULL, NULL); CHK_ERR(err); /* Check answer */ bool er = false; for(int i = 0; i < n; i++) { float d = (h_YY[i] + 1.0f); if(h_Y[i] != d) { printf("error at %d :(\n", i); er = true; break; } } if(!er) { printf("CPU and GPU results match\n"); } uninitialize_ocl(cv); delete [] h_Y; delete [] h_YY; clReleaseMemObject(g_Y); return 0; }
int main(int argc, char *argv[]) { std::string vvadd_kernel_str; /* Provide names of the OpenCL kernels * and cl file that they're kept in */ std::string vvadd_name_str = std::string("vvadd"); std::string vvadd_kernel_file = std::string("vvadd.cl"); cl_vars_t cv; cl_kernel vvadd; /* Read OpenCL file into STL string */ readFile(vvadd_kernel_file, vvadd_kernel_str); /* Initialize the OpenCL runtime * Source in clhelp.cpp */ initialize_ocl(cv); /* Compile all OpenCL kernels */ compile_ocl_program(vvadd, cv, vvadd_kernel_str.c_str(), vvadd_name_str.c_str()); /* Arrays on the host (CPU) */ float *h_A, *h_B, *h_Y; /* Arrays on the device (GPU) */ cl_mem g_A, g_B, g_Y; /* Allocate arrays on the host * and fill with random data */ int n = (1<<20); h_A = new float[n]; h_B = new float[n]; h_Y = new float[n]; bzero(h_Y, sizeof(float)*n); for(int i = 0; i < n; i++) { h_A[i] = (float)drand48(); h_B[i] = (float)drand48(); } /* CS194: Allocate memory for arrays on * the GPU */ cl_int err = CL_SUCCESS; /* CS194: Here's something to get you started */ g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err); CHK_ERR(err); g_A = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err); CHK_ERR(err); g_B = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err); CHK_ERR(err); /* CS194: Copy data from host CPU to GPU */ err = clEnqueueWriteBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n, h_Y, 0, NULL, NULL); CHK_ERR(err); err = clEnqueueWriteBuffer(cv.commands, g_A, true, 0, sizeof(float)*n, h_A, 0, NULL, NULL); CHK_ERR(err); err = clEnqueueWriteBuffer(cv.commands, g_B, true, 0, sizeof(float)*n, h_B, 0, NULL, NULL); CHK_ERR(err); /* CS194: Define the global and local workgroup sizes */ size_t global_work_size[1] = {n}; size_t local_work_size[1] = {128}; /* CS194: Set Kernel Arguments */ err = clSetKernelArg(vvadd, 0, sizeof(cl_mem), &g_Y); CHK_ERR(err); err = clSetKernelArg(vvadd, 1, sizeof(cl_mem), &g_A); CHK_ERR(err); err = clSetKernelArg(vvadd, 2, sizeof(cl_mem), &g_B); CHK_ERR(err); err = clSetKernelArg(vvadd, 3, sizeof(int), &n); CHK_ERR(err); /* CS194: Call kernel on the GPU */ err = clEnqueueNDRangeKernel(cv.commands, vvadd, 1,//work_dim, NULL, //global_work_offset global_work_size, //global_work_size local_work_size, //local_work_size 0, //num_events_in_wait_list NULL, //event_wait_list NULL // ); /* Read result of GPU on host CPU */ err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n, h_Y, 0, NULL, NULL); CHK_ERR(err); /* Check answer */ for(int i = 0; i < n; i++) { float d = h_A[i] + h_B[i]; if(h_Y[i] != d) { printf("error at %d :(\n", i); break; } } /* Shut down the OpenCL runtime */ uninitialize_ocl(cv); delete [] h_A; delete [] h_B; delete [] h_Y; clReleaseMemObject(g_A); clReleaseMemObject(g_B); clReleaseMemObject(g_Y); return 0; }
int main(int argc, char *argv[]) { std::string incr_kernel_str; /* Provide names of the OpenCL kernels * and cl file that they're kept in */ std::string incr_name_str = std::string("incr"); std::string incr_kernel_file = std::string("incr.cl"); cl_vars_t cv; cl_kernel incr; /* Read OpenCL file into STL string */ readFile(incr_kernel_file, incr_kernel_str); /* Initialize the OpenCL runtime * Source in clhelp.cpp */ initialize_ocl(cv); /* Compile all OpenCL kernels */ compile_ocl_program(incr, cv, incr_kernel_str.c_str(), incr_name_str.c_str()); /* Arrays on the host (CPU) */ float *h_Y, *h_YY; /* Arrays on the device (GPU) */ cl_mem g_Y; int n = (1<<20); h_Y = new float[n]; h_YY = new float[n]; for(int i = 0; i < n; i++) { h_YY[i] = h_Y[i] = (float)drand48(); } cl_int err = CL_SUCCESS; /* CS194: Allocate memory for arrays on * the GPU */ /* Creates a buffer in the cv.context context, with read and write access * at the global host adress g_Y, of size sizeof(float)*n. */ g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err); CHK_ERR(err); /* enqueue commands to write to the buffer g_Y from hos memory. * Commands will be queued in cv.commands. * true indicates that the write is put on the commands queue. * 0 is the offset in bytes in the buffer object to write to. * sizeof(float)*n is the size in byte of data being wirtten. * h_Y is the address in host memory of the data being written from. */ err = clEnqueueWriteBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n, h_Y, 0, NULL, NULL); /* checks whether the write buffer command was succesful. */ CHK_ERR(err); /* declaring the global size of th y dimension to be n. */ size_t global_work_size[1] = {n}; /* declaring the size of work groups to be 128 work items. */ size_t local_work_size[1] = {128}; /* Sets specific arguments for the kernel incr. * 0 is the argument index, sizeof(cl_mem) is the size * of the argument, which is the pointer to g_Y.*/ err = clSetKernelArg(incr, 0, sizeof(cl_mem), &g_Y); CHK_ERR(err); /* Sets specific arguments for the kernel incr. * 1 is the argument index, sizeof(int) is the size * of the argument, which is the pointer to n.*/ err = clSetKernelArg(incr, 1, sizeof(int), &n); CHK_ERR(err); /* Enqueues a command on cv.commands to execute the * kernel incr.cl on the device. Uses linear dimension * to specify work groups and items and specifies to use * global_work_size work items for the execution and local_work_size * as the size of a work group. */ err = clEnqueueNDRangeKernel(cv.commands, incr, 1,//work_dim, NULL, //global_work_offset global_work_size, //global_work_size local_work_size, //local_work_size 0, //num_events_in_wait_list NULL, //event_wait_list NULL // ); CHK_ERR(err); /* Read result of GPU on host CPU */ err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n, h_Y, 0, NULL, NULL); CHK_ERR(err); /* Check answer */ bool er = false; for(int i = 0; i < n; i++) { float d = (h_YY[i] + 1.0f); if(h_Y[i] != d) { printf("error at %d :(\n", i); er = true; break; } } if(!er) { printf("CPU and GPU results match\n"); } uninitialize_ocl(cv); delete [] h_Y; delete [] h_YY; clReleaseMemObject(g_Y); return 0; }
int main(int argc, char **argv) { char c; char *filepath; int cut_horizontal = 0; int cut_vertical = 0; int timed = 0; int show = 0; while ((c = getopt(argc, argv, "f:h:v:ts")) != -1) { switch (c) { case 'f': filepath = optarg; break; case 'h': cut_horizontal = (int)strtol(optarg, NULL, 10); break; case 'v': cut_vertical = (int)strtol(optarg, NULL, 10); break; case 't': timed = 1; break; case 's': show = 1; break; default: exit(1); } } // OpenCL boilerplate std::string ..._kernel_str; std::string ..._name_str = std::string("..."); std::string ..._kernel_file = std::string("..."); cl_vars_t cv; cl_kernel ...; readFile(..._kernel_file, ..._kernel_str); initialize_ocl(cv); compile_ocl_program(..., cv, ..._kernel_str.c_str(), ..._name_str.c_str()); // Read image Mat_<Vec3b> image = imread(filepath); if (!image.data) { cout << "Invalid input"; image.release(); return -1; } if (show) { imshow("Original Image", image); } SeamCarver s(image); // imshow("Gradient", s.energy); // Mat tmp = s.energy/195075.0*255.0; // s.energy.convertTo(tmp,CV_8U,-1); // imwrite("bench_gradient.jpg", tmp); // vector<uint> sm = s.findVerticalSeam(); // s.showVerticalSeam(sm); // Carving happens here double start = get_time(); ...; double elapsed = get_time() - start; // -------------------- // double start = get_time(); // for (int i = 0; i < cut_horizontal; ++i) { // vector<uint> seam = s.findHorizontalSeam(); // // s.showHorizontalSeam(seam); // s.removeHorizontalSeam(seam); // } // for (int i = 0; i < cut_vertical; ++i) { // vector<uint> seam = s.findVerticalSeam(); // // s.showVerticalSeam(seam); // s.removeVerticalSeam(seam); // } // double elapsed = get_time() - start; if (timed) { printf("Elapsed time: %.3lf seconds\n", elapsed); } Mat_<Vec3b> output = s.getImage(); imwrite("scarved.jpg", output); if (show) { imshow("Carved Image", output); while (waitKey(20) != 27); } // cout << "Seam Length: " << seam.size() << endl; // s.showImage(); // s.showEnergy(); // imwrite("bench_carved.jpg", s.getImage()); // for (int i = 0; i < 5; ++i) { // for (int j = 0; j < 5; ++j) { // cout << s.energy.at<uint32_t>(i,j) << " "; // } // cout << endl; // } uninitialize_ocl(cv); ...; clReleaseMemObject(...); image.release(); return 0; }