int readAndBuildKernel(char *filename) { cl_int ciErrNum = CL_SUCCESS; size_t kernelLength; char *source; source = readFile(filename); kernelLength = strlen(source); // create the program theProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, &kernelLength, &ciErrNum); printCLError(ciErrNum,5); // build the program ciErrNum = clBuildProgram(theProgram, 0, NULL, NULL, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out the build log, then exit char cBuildLog[10240]; clGetProgramBuildInfo(theProgram, device, CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL ); printf("\nBuild Log:\n%s\n\n", (char *)&cBuildLog); return -1; } theKernel = clCreateKernel(theProgram, "filter", &ciErrNum); printCLError(ciErrNum,6); //Discard temp storage free(source); return 0; }
int init_OpenCL() { cl_int ciErrNum = CL_SUCCESS; cl_platform_id platform; unsigned int no_plat; // We assume that we only have one platform available. (This may not be true.) ciErrNum = clGetPlatformIDs(1,&platform,&no_plat); printCLError(ciErrNum,0); // Get the GPU device ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); printCLError(ciErrNum,1); // create the OpenCL context on the device cxGPUContext = clCreateContext(0, 1, &device, NULL, NULL, &ciErrNum); printCLError(ciErrNum,2); ciErrNum = clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t),&noWG,NULL); printCLError(ciErrNum,3); printf("maximum number of workgroups: %d\n", (int)noWG); // create command queue commandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum); printCLError(ciErrNum,4); }
int init_OpenCL() { cl_int ciErrNum = CL_SUCCESS; size_t kernelLength; char *source; cl_device_id device; cl_platform_id platform; unsigned int no_plat; ciErrNum = clGetPlatformIDs(1,&platform,&no_plat); printCLError(ciErrNum,0); //get the device ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); printCLError(ciErrNum,1); // create the OpenCL context on the device cxGPUContext = clCreateContext(0, 1, &device, NULL, NULL, &ciErrNum); printCLError(ciErrNum,2); ciErrNum = clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t),&noWG,NULL); printCLError(ciErrNum,3); printf("maximum number of workgroups: %d\n", (int)noWG); // create command queue commandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum); printCLError(ciErrNum,4); source = readFile("sort.cl"); kernelLength = strlen(source); // create the program cpSort = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, &kernelLength, &ciErrNum); printCLError(ciErrNum,5); // build the program ciErrNum = clBuildProgram(cpSort, 0, NULL, NULL, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out the build log, then exit char cBuildLog[10240]; clGetProgramBuildInfo(cpSort, device, CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL ); printf("\nBuild Log:\n%s\n\n", (char *)&cBuildLog); return -1; } gpgpuSort = clCreateKernel(cpSort, "sort", &ciErrNum); printCLError(ciErrNum,6); //Discard temp storage free(source); return 0; }
//////////////////////////////////////////////////////////////////////////////// // main computation function //////////////////////////////////////////////////////////////////////////////// void computeImages() { //read in full size of memory image = readppm("maskros512.ppm", &n, &m); out = (unsigned char*) malloc(n*m*3); cl_mem in_data, out_data; cl_int ciErrNum = CL_SUCCESS; // Create space for data and copy image to device (note that we could also use clEnqueueWriteBuffer to upload) in_data = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, 3*n*m * sizeof(unsigned char), image, &ciErrNum); printCLError(ciErrNum,6); out_data = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, 3*n*m * sizeof(unsigned char), NULL, &ciErrNum); printCLError(ciErrNum,7); // set the args values ciErrNum = clSetKernelArg(theKernel, 0, sizeof(cl_mem), (void *) &in_data); ciErrNum |= clSetKernelArg(theKernel, 1, sizeof(cl_mem), (void *) &out_data); ciErrNum |= clSetKernelArg(theKernel, 2, sizeof(cl_uint), (void *) &n); ciErrNum |= clSetKernelArg(theKernel, 3, sizeof(cl_uint), (void *) &m); printCLError(ciErrNum,8); // Computing arrangement //size_t localWorkSize, globalWorkSize; size_t globalWorkSize[3] = {512, 512, 1}; size_t localWorkSize[3] = {16, 16, 1}; //256 threads in each block // 32*32 (1024) blocks in total to have 512*512 threads in total printf("Startup time %lf\n", GetSeconds()); // Compute! cl_event event; ResetMilli(); ciErrNum = clEnqueueNDRangeKernel(commandQueue, theKernel, 2, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &event); printCLError(ciErrNum,9); ciErrNum = clWaitForEvents(1, &event); // Synch printCLError(ciErrNum,10); printf("time %lf\n", GetSeconds()); ciErrNum = clEnqueueReadBuffer(commandQueue, out_data, CL_TRUE, 0, 3*n*m * sizeof(unsigned char), out, 0, NULL, &event); printCLError(ciErrNum,11); clWaitForEvents(1, &event); // Synch printCLError(ciErrNum,10); clReleaseMemObject(in_data); clReleaseMemObject(out_data); return; }
int gpu_Sort(unsigned int *data, unsigned int length) { cl_int ciErrNum = CL_SUCCESS; size_t localWorkSize, globalWorkSize; cl_mem in_data; cl_mem out_data; printf("GPU sorting.\n"); in_data = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, length * sizeof(unsigned int), data, &ciErrNum); out_data = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, length * sizeof(unsigned int), data, &ciErrNum); printCLError(ciErrNum,7); if (length<512) localWorkSize = length; else localWorkSize = 512; globalWorkSize = length; // set the args values ciErrNum = clSetKernelArg(gpgpuSort, 0, sizeof(cl_mem), (void *) &in_data); ciErrNum |= clSetKernelArg(gpgpuSort, 1, sizeof(cl_uint), (void *) &length); ciErrNum |= clSetKernelArg(gpgpuSort, 2, sizeof(cl_mem), (void *) &out_data); printCLError(ciErrNum,8); gettimeofday(&t_s_gpu, NULL); cl_event event; ciErrNum = clEnqueueNDRangeKernel(commandQueue, gpgpuSort, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &event); printCLError(ciErrNum,9); clWaitForEvents(1, &event); // Synch gettimeofday(&t_e_gpu, NULL); printCLError(ciErrNum,10); ciErrNum = clEnqueueReadBuffer(commandQueue, out_data, CL_TRUE, 0, length * sizeof(unsigned int), data, 0, NULL, &event); printCLError(ciErrNum,11); clWaitForEvents(1, &event); // Synch printCLError(ciErrNum,10); clReleaseMemObject(in_data); clReleaseMemObject(out_data); return ciErrNum; }