int main(int argc, char *argv[]) { cl_int status = 0; cl_int binSize = 256; cl_int groupSize = 16; cl_int subHistgCnt; cl_device_type dType = CL_DEVICE_TYPE_GPU; cl_platform_id platform = NULL; cl_device_id device; cl_context context; cl_command_queue commandQueue; cl_mem imageBuffer; cl_mem intermediateHistR, intermediateHistG, intermediateHistB; /*Intermediate Image Histogram buffer*/ cl_uint * midDeviceBinR, *midDeviceBinG, *midDeviceBinB; cl_uint *deviceBinR,*deviceBinG,*deviceBinB; //Read a BMP Image Image *image; std::string filename = "sample_color.bmp"; ReadBMPImage(filename, &image); if(image == NULL) { printf("File %s not present...\n", filename.c_str()); return 0; } subHistgCnt = (image->width * image->height)/(binSize*groupSize); midDeviceBinR = (cl_uint*)malloc(binSize * subHistgCnt * sizeof(cl_uint)); midDeviceBinG = (cl_uint*)malloc(binSize * subHistgCnt * sizeof(cl_uint)); midDeviceBinB = (cl_uint*)malloc(binSize * subHistgCnt * sizeof(cl_uint)); deviceBinR = (cl_uint*)malloc(binSize * sizeof(cl_uint)); deviceBinG = (cl_uint*)malloc(binSize * sizeof(cl_uint)); deviceBinB = (cl_uint*)malloc(binSize * sizeof(cl_uint)); //Setup the OpenCL Platform, //Get the first available platform. Use it as the default platform status = clGetPlatformIDs(1, &platform, NULL); LOG_OCL_ERROR(status, "clGetPlatformIDs Failed." ); //Get the first available device status = clGetDeviceIDs (platform, dType, 1, &device, NULL); LOG_OCL_ERROR(status, "clGetDeviceIDs Failed." ); //Create an execution context for the selected platform and device. cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( cps, dType, NULL, NULL, &status); LOG_OCL_ERROR(status, "clCreateContextFromType Failed." ); // Create command queue commandQueue = clCreateCommandQueue(context, device, 0, &status); LOG_OCL_ERROR(status, "clCreateCommandQueue Failed." ); #if !defined(USE_HOST_MEMORY) //Create OpenCL device input buffer imageBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY, sizeof(cl_uint) * image->width * image->height, NULL, &status); LOG_OCL_ERROR(status, "clCreateBuffer Failed while creating the image buffer." ); //Set input data cl_event writeEvt; status = clEnqueueWriteBuffer(commandQueue, imageBuffer, CL_FALSE, 0, image->width * image->height * sizeof(cl_uint), image->pixels, 0, NULL, &writeEvt); LOG_OCL_ERROR(status, "clEnqueueWriteBuffer Failed while writing the image data." ); #else //Create OpenCL device input buffer imageBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, sizeof(cl_uint) * image->width * image->height, image->pixels, &status); LOG_OCL_ERROR(status, "clCreateBuffer Failed while creating the image buffer." ); #endif status = clFinish(commandQueue); LOG_OCL_ERROR(status, "clFinish Failed while writing the image data." ); //Create OpenCL device output buffer intermediateHistR = clCreateBuffer( context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * binSize * subHistgCnt, NULL, &status); LOG_OCL_ERROR(status, "clCreateBuffer Failed." ); intermediateHistG = clCreateBuffer( context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * binSize * subHistgCnt, NULL, &status); LOG_OCL_ERROR(status, "clCreateBuffer Failed." ); intermediateHistB = clCreateBuffer( context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * binSize * subHistgCnt, NULL, &status); LOG_OCL_ERROR(status, "clCreateBuffer Failed." ); // Create a program from the kernel source cl_program program = clCreateProgramWithSource(context, 1, (const char **)&histogram_kernel, NULL, &status); LOG_OCL_ERROR(status, "clCreateProgramWithSource Failed." ); // Build the program status = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if(status != CL_SUCCESS) LOG_OCL_COMPILER_ERROR(program, device); // Create the OpenCL kernel cl_kernel kernel = clCreateKernel(program, "histogram_kernel", &status); LOG_OCL_ERROR(status, "clCreateKernel Failed." ); // Set the arguments of the kernel status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&imageBuffer); status |= clSetKernelArg(kernel, 1, 3 * groupSize * binSize * sizeof(cl_uchar), NULL); status |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&intermediateHistR); status |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&intermediateHistG); status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&intermediateHistB); LOG_OCL_ERROR(status, "clSetKernelArg Failed." ); // Execute the OpenCL kernel on the list cl_event ndrEvt; size_t globalThreads = (image->width * image->height) / (binSize*groupSize) * groupSize; size_t localThreads = groupSize; status = clEnqueueNDRangeKernel( commandQueue, kernel, 1, NULL, &globalThreads, &localThreads, 0, NULL, &ndrEvt); LOG_OCL_ERROR(status, "clEnqueueNDRangeKernel Failed." ); status = clFinish(commandQueue); LOG_OCL_ERROR(status, "clFinish Failed." ); //Read the histogram back into the host memory. memset(deviceBinR, 0, binSize * sizeof(cl_uint)); memset(deviceBinG, 0, binSize * sizeof(cl_uint)); memset(deviceBinB, 0, binSize * sizeof(cl_uint)); cl_event readEvt[3]; status = clEnqueueReadBuffer( commandQueue, intermediateHistR, CL_FALSE, 0, subHistgCnt * binSize * sizeof(cl_uint), midDeviceBinR, 0, NULL, &readEvt[0]); LOG_OCL_ERROR(status, "clEnqueueReadBuffer of intermediateHistR Failed." ); status = clEnqueueReadBuffer( commandQueue, intermediateHistG, CL_FALSE, 0, subHistgCnt * binSize * sizeof(cl_uint), midDeviceBinG, 0, NULL, &readEvt[1]); LOG_OCL_ERROR(status, "clEnqueueReadBuffer of intermediateHistG Failed." ); status = clEnqueueReadBuffer( commandQueue, intermediateHistB, CL_FALSE, 0, subHistgCnt * binSize * sizeof(cl_uint), midDeviceBinB, 0, NULL, &readEvt[2]); LOG_OCL_ERROR(status, "clEnqueueReadBuffer of intermediateHistB Failed." ); status = clWaitForEvents(3, readEvt); //status = clFinish(commandQueue); LOG_OCL_ERROR(status, "clWaitForEvents for readEvt." ); // Calculate final histogram bin for(int i = 0; i < subHistgCnt; ++i) { for(int j = 0; j < binSize; ++j) { deviceBinR[j] += midDeviceBinR[i * binSize + j]; deviceBinG[j] += midDeviceBinG[i * binSize + j]; deviceBinB[j] += midDeviceBinB[i * binSize + j]; } } // Validate the histogram operation. // The idea behind this is that once a histogram is computed the sum of all the bins should be equal to the number of pixels. int totalPixelsR = 0; int totalPixelsG = 0; int totalPixelsB = 0; for(int j = 0; j < binSize; ++j) { totalPixelsR += deviceBinR[j]; totalPixelsG += deviceBinG[j]; totalPixelsB += deviceBinB[j]; } printf ("Total Number of Red Pixels = %d\n",totalPixelsR); printf ("Total Number of Green Pixels = %d\n",totalPixelsG); printf ("Total Number of Blue Pixels = %d\n",totalPixelsB); ReleaseBMPImage(&image); //free all allocated memory free(midDeviceBinR); free(midDeviceBinG); free(midDeviceBinB); free(deviceBinR); free(deviceBinG); free(deviceBinB); return 0; }
int main() { cl_platform_id platform = NULL; cl_device_id device = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_int status = 0; cl_event task_event, map_event; cl_device_type dType = CL_DEVICE_TYPE_GPU; cl_int image_width, image_height; cl_float4 *result; int i, j; cl_mem clImage, out; cl_bool support; int pixels_read = 8; //Setup the OpenCL Platform, //Get the first available platform. Use it as the default platform status = clGetPlatformIDs(1, &platform, NULL); LOG_OCL_ERROR(status, "clGetPlatformIDs Failed" ); //Get the first available device status = clGetDeviceIDs (platform, dType, 1, &device, NULL); LOG_OCL_ERROR(status, "clGetDeviceIDs Failed" ); /*Check if the device support images */ clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(support), &support, NULL); if (support != CL_TRUE) { std::cout <<"IMAGES not supported\n"; return 1; } //Create an execution context for the selected platform and device. cl_context_properties contextProperty[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( contextProperty, dType, NULL, NULL, &status); LOG_OCL_ERROR(status, "clCreateContextFromType Failed" ); /*Create command queue*/ command_queue = clCreateCommandQueue(context, device, 0, &status); LOG_OCL_ERROR(status, "clCreateCommandQueue Failed" ); /* Create Image Object */ //Create OpenCL device input image with the format and descriptor as below cl_image_format image_format; image_format.image_channel_data_type = CL_FLOAT; image_format.image_channel_order = CL_R; //We create a 5 X 5 2D image image_width = 5; image_height = 5; cl_image_desc image_desc; image_desc.image_type = CL_MEM_OBJECT_IMAGE2D; image_desc.image_width = image_width; image_desc.image_height = image_height; image_desc.image_depth = 1; image_desc.image_array_size = 1; image_desc.image_row_pitch = image_width*sizeof(float); image_desc.image_slice_pitch = 25*sizeof(float); image_desc.num_mip_levels = 0; image_desc.num_samples = 0; image_desc.buffer = NULL; /* Create output buffer */ out = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4)*pixels_read, NULL, &status); LOG_OCL_ERROR(status, "clCreateBuffer Failed" ); size_t origin[] = {0,0,0}; /* Transfer target coordinate*/ size_t region[] = {image_width,image_height,1}; /* Size of object to be transferred */ float *data = (float *)malloc(image_width*image_height*sizeof(float)); float pixels[] = { /* Transfer Data */ 10, 20, 10, 40, 50, 10, 20, 20, 40, 50, 10, 20, 30, 40, 50, 10, 20, 40, 40, 50, 10, 20, 50, 40, 50 }; memcpy(data, pixels, image_width*image_height*sizeof(float)); clImage = clCreateImage(context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, &image_format, &image_desc, pixels, &status); LOG_OCL_ERROR(status, "clCreateImage Failed" ); /* If the image was not created using CL_MEM_USE_HOST_PTR, then you can write the image data to the device using the clEnqueueWriteImage function. */ //status = clEnqueueWriteImage(command_queue, clImage, CL_TRUE, origin, region, 5*sizeof(float), 25*sizeof(float), data, 0, NULL, NULL); //LOG_OCL_ERROR(status, "clCreateBuffer Failed" ); /* Build program */ program = clCreateProgramWithSource(context, 1, (const char **)&sample_image_kernel, NULL, &status); LOG_OCL_ERROR(status, "clCreateProgramWithSource Failed" ); // Build the program status = clBuildProgram(program, 1, &device, "", NULL, NULL); LOG_OCL_ERROR(status, "clBuildProgram Failed" ); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) LOG_OCL_COMPILER_ERROR(program, device); LOG_OCL_ERROR(status, "clBuildProgram Failed" ); } printf("Printing the image pixels\n"); for (i=0; i<image_height; i++) { for (j=0; j<image_width; j++) { printf("%f ",data[i*image_width +j]); } printf("\n"); } //Create kernel and set the kernel arguments kernel = clCreateKernel(program, "image_test", &status); clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&clImage); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&out); /*********Image sampler with image repeated at every 1.0 normalized coordinate***********/ /*If host side sampler is not required the sampler objects can also be created on the kernel code. Don't pass the thirsd argument to the kernel and create a sample object as shown below in the kernel code*/ //const sampler_t sampler = CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; cl_sampler sampler = clCreateSampler (context, CL_TRUE, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST, &status); clSetKernelArg(kernel, 2, sizeof(cl_sampler), (void*)&sampler); //Enqueue the kernel status = clEnqueueTask(command_queue, kernel, 0, NULL, &task_event); LOG_OCL_ERROR(status, "clEnqueueTask Failed" ); /* Map the result back to host address */ result = (cl_float4*)clEnqueueMapBuffer(command_queue, out, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_float4)*pixels_read, 1, &task_event, &map_event, &status); printf(" SAMPLER mode set to CL_ADDRESS_REPEAT | CL_FILTER_NEAREST\n"); printf("\nPixel values retreived based on the filter and Addressing mode selected\n"); printf("(float2)(0.5f,0.5f) = %f,%f,%f,%f\n",result[0].s[0],result[0].s[1],result[0].s[2],result[0].s[3]); printf("(float2)(0.8f,0.5f) = %f,%f,%f,%f\n",result[1].s[0],result[1].s[1],result[1].s[2],result[1].s[3]); printf("(float2)(1.3f,0.5f) = %f,%f,%f,%f\n",result[2].s[0],result[2].s[1],result[2].s[2],result[2].s[3]); printf("(float2)(0.5f,0.5f) = %f,%f,%f,%f\n",result[3].s[0],result[3].s[1],result[3].s[2],result[3].s[3]); printf("(float2)(0.5f,0.8f) = %f,%f,%f,%f\n",result[4].s[0],result[4].s[1],result[4].s[2],result[4].s[3]); printf("(float2)(0.5f,1.3f) = %f,%f,%f,%f\n",result[5].s[0],result[5].s[1],result[5].s[2],result[5].s[3]); printf("(float2)(4.5f,0.5f) = %f,%f,%f,%f\n",result[5].s[0],result[5].s[1],result[5].s[2],result[5].s[3]); printf("(float2)(5.0f,0.5f) = %f,%f,%f,%f\n",result[7].s[0],result[7].s[1],result[7].s[2],result[7].s[3]); clEnqueueUnmapMemObject(command_queue, out, result, 0, NULL, NULL); clReleaseSampler(sampler); /*********Image sampler with image mirrored at every 1.0 normalized coordinate***********/ sampler = clCreateSampler (context, CL_TRUE, CL_ADDRESS_MIRRORED_REPEAT, CL_FILTER_LINEAR, &status); clSetKernelArg(kernel, 2, sizeof(cl_sampler), (void*)&sampler); //Enqueue the kernel status = clEnqueueTask(command_queue, kernel, 0, NULL, &task_event); LOG_OCL_ERROR(status, "clEnqueueTask Failed" ); /* Map the result back to host address */ result = (cl_float4*)clEnqueueMapBuffer(command_queue, out, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_float4)*pixels_read, 1, &task_event, &map_event, &status); printf(" SAMPLER mode set to CL_ADDRESS_MIRRORED_REPEAT | CL_FILTER_LINEAR\n"); printf("\nPixel values retreived based on the filter and Addressing mode selected\n"); printf("(float2)(0.5f,0.5f) = %f,%f,%f,%f\n",result[0].s[0],result[0].s[1],result[0].s[2],result[0].s[3]); printf("(float2)(0.8f,0.5f) = %f,%f,%f,%f\n",result[1].s[0],result[1].s[1],result[1].s[2],result[1].s[3]); printf("(float2)(1.3f,0.5f) = %f,%f,%f,%f\n",result[2].s[0],result[2].s[1],result[2].s[2],result[2].s[3]); printf("(float2)(0.5f,0.5f) = %f,%f,%f,%f\n",result[3].s[0],result[3].s[1],result[3].s[2],result[3].s[3]); printf("(float2)(0.5f,0.8f) = %f,%f,%f,%f\n",result[4].s[0],result[4].s[1],result[4].s[2],result[4].s[3]); printf("(float2)(0.5f,1.3f) = %f,%f,%f,%f\n",result[5].s[0],result[5].s[1],result[5].s[2],result[5].s[3]); printf("(float2)(4.5f,0.5f) = %f,%f,%f,%f\n",result[5].s[0],result[5].s[1],result[5].s[2],result[5].s[3]); printf("(float2)(5.0f,0.5f) = %f,%f,%f,%f\n",result[7].s[0],result[7].s[1],result[7].s[2],result[7].s[3]); clEnqueueUnmapMemObject(command_queue, out, result, 0, NULL, NULL); clReleaseSampler(sampler); /********************/ //Free All OpenCL objects. clReleaseMemObject(out); clReleaseMemObject(clImage); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(command_queue); clReleaseContext(context); return 0; }
int main(void) { CPUBitmap bitmap(DIM, DIM); unsigned char *ptr = bitmap.get_ptr(); cl_int clStatus; //Keeps track of the error values returned. // Get platform and device information cl_platform_id * platforms = NULL; // Set up the Platform. Take a look at the MACROs used in this file. // These are defined in common/ocl_macros.h OCL_CREATE_PLATFORMS(platforms); // Get the devices list and choose the type of device you want to run on cl_device_id *device_list = NULL; OCL_CREATE_DEVICE(platforms[1], DEVICE_TYPE, device_list); // Create OpenCL context for devices in device_list cl_context context; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[0], 0 }; // An OpenCL context can be associated to multiple devices, either CPU or GPU // based on the value of DEVICE_TYPE defined above. context = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &clStatus); LOG_OCL_ERROR(clStatus, "clCreateContext Failed..."); // Create a command queue for the first device in device_list cl_command_queue command_queue = clCreateCommandQueue(context, device_list[0], 0, &clStatus); LOG_OCL_ERROR(clStatus, "clCreateCommandQueue Failed..."); // Create memory buffer cl_mem julia_clmem = clCreateBuffer(context, CL_MEM_READ_WRITE, bitmap.image_size() * sizeof(unsigned char), NULL, &clStatus); for (int u = 0; u < DIM * DIM; u++) { ptr[u * 4] = 0; ptr[u * 4 + 1] = 0; ptr[u * 4 + 2] = 0; ptr[u * 4 + 3] = 0; } // Enqueue a write buffer clStatus = clEnqueueWriteBuffer(command_queue, julia_clmem, CL_TRUE, 0, bitmap.image_size() * sizeof(unsigned char), ptr, 0, NULL, NULL); LOG_OCL_ERROR(clStatus, "clEnqueueWriteBuffer Failed..."); // Create a program from the kernel source cl_program program = clCreateProgramWithSource(context, 1, (const char **)&julia_kernel, NULL, &clStatus); LOG_OCL_ERROR(clStatus, "clCreateProgramWithSource Failed..."); // Build the program clStatus = clBuildProgram(program, 1, device_list, NULL, NULL, NULL); if (clStatus != CL_SUCCESS) LOG_OCL_COMPILER_ERROR(program, device_list[0]); // Create the OpenCL kernel cl_kernel kernel = clCreateKernel(program, "julia_kernel", &clStatus); clStatus = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&julia_clmem); LOG_OCL_ERROR(clStatus, "clSetKernelArg Failed..."); // Execute the OpenCL kernel on the list size_t global_size[2] = { DIM, DIM }; size_t local_size[2] = { 1, 1 }; cl_event julia_event; clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_size, local_size, 0, NULL, &julia_event); LOG_OCL_ERROR(clStatus, "clEnqueueNDRangeKernel Failed..."); // Read the memory buffer C_clmem on the device to the host allocated buffer unsigned char *A = (unsigned char*)malloc(bitmap.image_size() * sizeof(unsigned char)); clStatus = clEnqueueReadBuffer(command_queue, julia_clmem, CL_TRUE, 0, bitmap.image_size() * sizeof(unsigned char), ptr, 1, &julia_event, NULL); LOG_OCL_ERROR(clStatus, "clEnqueueReadBuffer Failed..."); // Clean up and wait for all the comands to complete. clStatus = clFinish(command_queue); // Finally release all OpenCL objects and release the host buffers. clStatus = clReleaseKernel(kernel); clStatus = clReleaseProgram(program); clStatus = clReleaseMemObject(julia_clmem); clStatus = clReleaseCommandQueue(command_queue); clStatus = clReleaseContext(context); free(platforms); free(device_list); bitmap.display_and_exit(); return 0; }