// Host function that launches an OpenCL kernel to compute the MGVF matrices for the specified cells void IMGVF_OpenCL(MAT **I, MAT **IMGVF, double vx, double vy, double e, int max_iterations, double cutoff, int num_cells) { cl_int error; // Initialize the data on the GPU IMGVF_OpenCL_init(I, num_cells); // Load the kernel source from the file const char *source = load_kernel_source("track_ellipse_kernel.cl"); size_t sourceSize = strlen(source); // Compile the kernel cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize, &error); check_error(error, __FILE__, __LINE__); error = clBuildProgram(program, 1, &device, NULL, NULL, NULL); // Show compiler warnings/errors static char log[65536]; memset(log, 0, sizeof(log)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); check_error(error, __FILE__, __LINE__); // Create the IMGVF kernels cl_kernel IMGVF_kernel = clCreateKernel(program, "IMGVF_kernel", &error); check_error(error, __FILE__, __LINE__); // Setup execution parameters size_t num_work_groups = num_cells; size_t global_work_size = num_work_groups * local_work_size; // Convert double-precision parameters to single-precision float vx_float = (float) vx; float vy_float = (float) vy; float e_float = (float) e; float cutoff_float = (float) cutoff; // Set the kernel arguments clSetKernelArg(IMGVF_kernel, 0, sizeof(cl_mem), (void *) &device_IMGVF_all); clSetKernelArg(IMGVF_kernel, 1, sizeof(cl_mem), (void *) &device_I_all); clSetKernelArg(IMGVF_kernel, 2, sizeof(cl_mem), (void *) &device_I_offsets); clSetKernelArg(IMGVF_kernel, 3, sizeof(cl_mem), (void *) &device_m_array); clSetKernelArg(IMGVF_kernel, 4, sizeof(cl_mem), (void *) &device_n_array); clSetKernelArg(IMGVF_kernel, 5, sizeof(cl_float), (void *) &vx_float); clSetKernelArg(IMGVF_kernel, 6, sizeof(cl_float), (void *) &vy_float); clSetKernelArg(IMGVF_kernel, 7, sizeof(cl_float), (void *) &e_float); clSetKernelArg(IMGVF_kernel, 8, sizeof(cl_int), (void *) &max_iterations); clSetKernelArg(IMGVF_kernel, 9, sizeof(cl_float), (void *) &cutoff_float); // Compute the MGVF on the GPU error = clEnqueueNDRangeKernel(command_queue, IMGVF_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); check_error(error, __FILE__, __LINE__); // Check for kernel errors error = clFinish(command_queue); check_error(error, __FILE__, __LINE__); // Copy back the final results from the GPU IMGVF_OpenCL_cleanup(IMGVF, num_cells); }
void kernel_gpu_opencl_wrapper( fp* image, // input image int Nr, // IMAGE nbr of rows int Nc, // IMAGE nbr of cols long Ne, // IMAGE nbr of elem int niter, // nbr of iterations fp lambda, // update step size long NeROI, // ROI nbr of elements int* iN, int* iS, int* jE, int* jW, int iter, // primary loop int mem_size_i, int mem_size_j) { //======================================================================================================================================================150 // GPU SETUP //======================================================================================================================================================150 //====================================================================================================100 // COMMON VARIABLES //====================================================================================================100 // common variables cl_int error; //====================================================================================================100 // GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE //====================================================================================================100 // Get the number of available platforms cl_uint num_platforms; error = clGetPlatformIDs( 0, NULL, &num_platforms); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of available platforms cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); error = clGetPlatformIDs( num_platforms, platforms, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Select the 1st platform cl_platform_id platform = platforms[0]; // Get the name of the selected platform and print it (if there are multiple platforms, choose the first one) char pbuf[100]; error = clGetPlatformInfo( platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Platform: %s\n", pbuf); //====================================================================================================100 // CREATE CONTEXT FOR THE PLATFORM //====================================================================================================100 // Create context properties for selected platform cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0}; // Create context for selected platform being GPU cl_context context; context = clCreateContextFromType( context_properties, CL_DEVICE_TYPE_ALL, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE //====================================================================================================100 // Get the number of devices (previousely selected for the context) size_t devices_size; error = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of devices (previousely selected for the context) cl_device_id *devices = (cl_device_id *) malloc(devices_size); error = clGetContextInfo( context, CL_CONTEXT_DEVICES, devices_size, devices, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one) cl_device_id device; device = devices[0]; // Get the name of the selected device (previousely selected for the context) and print it error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Device: %s\n", pbuf); //====================================================================================================100 // CREATE COMMAND QUEUE FOR THE DEVICE //====================================================================================================100 // Create a command queue cl_command_queue command_queue; command_queue = clCreateCommandQueue( context, device, 0, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // CREATE PROGRAM, COMPILE IT //====================================================================================================100 // Load kernel source code from file const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl"); size_t sourceSize = strlen(source); // Create the program cl_program program = clCreateProgramWithSource( context, 1, &source, &sourceSize, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); char clOptions[150]; // sprintf(clOptions,"-I../../src"); sprintf(clOptions,"-I."); #ifdef RD_WG_SIZE sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE=%d", RD_WG_SIZE); #endif #ifdef RD_WG_SIZE_0 sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0=%d", RD_WG_SIZE_0); #endif #ifdef RD_WG_SIZE_0_0 sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0_0=%d", RD_WG_SIZE_0_0); #endif // Compile the program error = clBuildProgram( program, 1, &device, clOptions, NULL, NULL); // Print warnings and errors from compilation static char log[65536]; memset(log, 0, sizeof(log)); clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); printf("-----OpenCL Compiler Output-----\n"); if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); printf("--------------------------------\n"); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // CREATE Kernels //====================================================================================================100 // Extract kernel cl_kernel extract_kernel; extract_kernel = clCreateKernel(program, "extract_kernel", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Prepare kernel cl_kernel prepare_kernel; prepare_kernel = clCreateKernel(program, "prepare_kernel", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Reduce kernel cl_kernel reduce_kernel; reduce_kernel = clCreateKernel( program, "reduce_kernel", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // SRAD kernel cl_kernel srad_kernel; srad_kernel = clCreateKernel( program, "srad_kernel", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // SRAD2 kernel cl_kernel srad2_kernel; srad2_kernel = clCreateKernel( program, "srad2_kernel", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Compress kernel cl_kernel compress_kernel; compress_kernel = clCreateKernel( program, "compress_kernel", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // TRIGGERING INITIAL DRIVER OVERHEAD //====================================================================================================100 // cudaThreadSynchronize(); // the above does it //======================================================================================================================================================150 // GPU VARIABLES //======================================================================================================================================================150 // CUDA kernel execution parameters int blocks_x; //======================================================================================================================================================150 // ALLOCATE MEMORY IN GPU //======================================================================================================================================================150 //====================================================================================================100 // common memory size //====================================================================================================100 int mem_size; // matrix memory size mem_size = sizeof(fp) * Ne; // get the size of float representation of input IMAGE //====================================================================================================100 // allocate memory for entire IMAGE on DEVICE //====================================================================================================100 cl_mem d_I; d_I = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // allocate memory for coordinates on DEVICE //====================================================================================================100 cl_mem d_iN; d_iN = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size_i, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_iS; d_iS = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size_i, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_jE; d_jE = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size_j, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_jW; d_jW = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size_j, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // allocate memory for derivatives //====================================================================================================100 cl_mem d_dN; d_dN = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_dS; d_dS = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_dW; d_dW = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_dE; d_dE = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // allocate memory for coefficient on DEVICE //====================================================================================================100 cl_mem d_c; d_c = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // allocate memory for partial sums on DEVICE //====================================================================================================100 cl_mem d_sums; d_sums = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_sums2; d_sums2 = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // End //====================================================================================================100 //======================================================================================================================================================150 // COPY INPUT TO CPU //======================================================================================================================================================150 //====================================================================================================100 // Image //====================================================================================================100 error = clEnqueueWriteBuffer( command_queue, d_I, 1, 0, mem_size, image, 0, 0, 0); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // coordinates //====================================================================================================100 error = clEnqueueWriteBuffer( command_queue, d_iN, 1, 0, mem_size_i, iN, 0, 0, 0); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clEnqueueWriteBuffer( command_queue, d_iS, 1, 0, mem_size_i, iS, 0, 0, 0); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clEnqueueWriteBuffer( command_queue, d_jE, 1, 0, mem_size_j, jE, 0, 0, 0); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clEnqueueWriteBuffer( command_queue, d_jW, 1, 0, mem_size_j, jW, 0, 0, 0); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // End //====================================================================================================100 //======================================================================================================================================================150 // KERNEL EXECUTION PARAMETERS //======================================================================================================================================================150 // threads size_t local_work_size[1]; local_work_size[0] = NUMBER_THREADS; // workgroups int blocks_work_size; size_t global_work_size[1]; blocks_x = Ne/(int)local_work_size[0]; if (Ne % (int)local_work_size[0] != 0){ // compensate for division remainder above by adding one grid blocks_x = blocks_x + 1; } blocks_work_size = blocks_x; global_work_size[0] = blocks_work_size * local_work_size[0]; // define the number of blocks in the grid printf("max # of workgroups = %d, # of threads/workgroup = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]); //======================================================================================================================================================150 // Extract Kernel - SCALE IMAGE DOWN FROM 0-255 TO 0-1 AND EXTRACT //======================================================================================================================================================150 //====================================================================================================100 // set arguments //====================================================================================================100 error = clSetKernelArg( extract_kernel, 0, sizeof(long), (void *) &Ne); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( extract_kernel, 1, sizeof(cl_mem), (void *) &d_I); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // launch kernel //====================================================================================================100 error = clEnqueueNDRangeKernel( command_queue, extract_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // Synchronization - wait for all operations in the command queue so far to finish //====================================================================================================100 // error = clFinish(command_queue); // if (error != CL_SUCCESS) // fatal_CL(error, __LINE__); //====================================================================================================100 // End //====================================================================================================100 //======================================================================================================================================================150 // WHAT IS CONSTANT IN COMPUTATION LOOP //======================================================================================================================================================150 //====================================================================================================100 // Prepare Kernel //====================================================================================================100 error = clSetKernelArg( prepare_kernel, 0, sizeof(long), (void *) &Ne); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( prepare_kernel, 1, sizeof(cl_mem), (void *) &d_I); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( prepare_kernel, 2, sizeof(cl_mem), (void *) &d_sums); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( prepare_kernel, 3, sizeof(cl_mem), (void *) &d_sums2); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // Reduce Kernel //====================================================================================================100 int blocks2_x; int blocks2_work_size; size_t global_work_size2[1]; long no; int mul; int mem_size_single = sizeof(fp) * 1; fp total; fp total2; fp meanROI; fp meanROI2; fp varROI; fp q0sqr; error = clSetKernelArg( reduce_kernel, 0, sizeof(long), (void *) &Ne); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( reduce_kernel, 3, sizeof(cl_mem), (void *) &d_sums); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( reduce_kernel, 4, sizeof(cl_mem), (void *) &d_sums2); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // SRAD Kernel //====================================================================================================100 error = clSetKernelArg( srad_kernel, 0, sizeof(fp), (void *) &lambda); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 1, sizeof(int), (void *) &Nr); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 2, sizeof(int), (void *) &Nc); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 3, sizeof(long), (void *) &Ne); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 4, sizeof(cl_mem), (void *) &d_iN); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 5, sizeof(cl_mem), (void *) &d_iS); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 6, sizeof(cl_mem), (void *) &d_jE); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 7, sizeof(cl_mem), (void *) &d_jW); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 8, sizeof(cl_mem), (void *) &d_dN); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 9, sizeof(cl_mem), (void *) &d_dS); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 10, sizeof(cl_mem), (void *) &d_dW); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 11, sizeof(cl_mem), (void *) &d_dE); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 13, sizeof(cl_mem), (void *) &d_c); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 14, sizeof(cl_mem), (void *) &d_I); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // SRAD2 Kernel //====================================================================================================100 error = clSetKernelArg( srad2_kernel, 0, sizeof(fp), (void *) &lambda); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 1, sizeof(int), (void *) &Nr); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 2, sizeof(int), (void *) &Nc); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 3, sizeof(long), (void *) &Ne); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 4, sizeof(cl_mem), (void *) &d_iN); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 5, sizeof(cl_mem), (void *) &d_iS); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 6, sizeof(cl_mem), (void *) &d_jE); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 7, sizeof(cl_mem), (void *) &d_jW); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 8, sizeof(cl_mem), (void *) &d_dN); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 9, sizeof(cl_mem), (void *) &d_dS); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 10, sizeof(cl_mem), (void *) &d_dW); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 11, sizeof(cl_mem), (void *) &d_dE); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 12, sizeof(cl_mem), (void *) &d_c); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 13, sizeof(cl_mem), (void *) &d_I); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // End //====================================================================================================100 //======================================================================================================================================================150 // COMPUTATION //======================================================================================================================================================150 printf("Iterations Progress: "); // execute main loop for (iter=0; iter<niter; iter++){ // do for the number of iterations input parameter printf("%d ", iter); fflush(NULL); //====================================================================================================100 // Prepare kernel //====================================================================================================100 // launch kernel error = clEnqueueNDRangeKernel( command_queue, prepare_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // synchronize // error = clFinish(command_queue); // if (error != CL_SUCCESS) // fatal_CL(error, __LINE__); //====================================================================================================100 // Reduce Kernel - performs subsequent reductions of sums //====================================================================================================100 // initial values blocks2_work_size = blocks_work_size; // original number of blocks global_work_size2[0] = global_work_size[0]; no = Ne; // original number of sum elements mul = 1; // original multiplier // loop while(blocks2_work_size != 0){ // set arguments that were uptaded in this loop error = clSetKernelArg( reduce_kernel, 1, sizeof(long), (void *) &no); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( reduce_kernel, 2, sizeof(int), (void *) &mul); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( reduce_kernel, 5, sizeof(int), (void *) &blocks2_work_size); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // launch kernel error = clEnqueueNDRangeKernel( command_queue, reduce_kernel, 1, NULL, global_work_size2, local_work_size, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // synchronize // error = clFinish(command_queue); // if (error != CL_SUCCESS) // fatal_CL(error, __LINE__); // update execution parameters no = blocks2_work_size; // get current number of elements if(blocks2_work_size == 1){ blocks2_work_size = 0; } else{ mul = mul * NUMBER_THREADS; // update the increment blocks_x = blocks2_work_size/(int)local_work_size[0]; // number of blocks if (blocks2_work_size % (int)local_work_size[0] != 0){ // compensate for division remainder above by adding one grid blocks_x = blocks_x + 1; } blocks2_work_size = blocks_x; global_work_size2[0] = blocks2_work_size * (int)local_work_size[0]; } } // copy total sums to device error = clEnqueueReadBuffer(command_queue, d_sums, CL_TRUE, 0, mem_size_single, &total, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clEnqueueReadBuffer(command_queue, d_sums2, CL_TRUE, 0, mem_size_single, &total2, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // calculate statistics //====================================================================================================100 meanROI = total / (fp)(NeROI); // gets mean (average) value of element in ROI meanROI2 = meanROI * meanROI; // varROI = (total2 / (fp)(NeROI)) - meanROI2; // gets variance of ROI q0sqr = varROI / meanROI2; // gets standard deviation of ROI //====================================================================================================100 // execute srad kernel //====================================================================================================100 // set arguments that were uptaded in this loop error = clSetKernelArg( srad_kernel, 12, sizeof(fp), (void *) &q0sqr); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // launch kernel error = clEnqueueNDRangeKernel( command_queue, srad_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // synchronize // error = clFinish(command_queue); // if (error != CL_SUCCESS) // fatal_CL(error, __LINE__); //====================================================================================================100 // execute srad2 kernel //====================================================================================================100 // launch kernel error = clEnqueueNDRangeKernel( command_queue, srad2_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // synchronize // error = clFinish(command_queue); // if (error != CL_SUCCESS) // fatal_CL(error, __LINE__); //====================================================================================================100 // End //====================================================================================================100 } printf("\n"); //======================================================================================================================================================150 // Compress Kernel - SCALE IMAGE UP FROM 0-1 TO 0-255 AND COMPRESS //======================================================================================================================================================150 //====================================================================================================100 // set parameters //====================================================================================================100 error = clSetKernelArg( compress_kernel, 0, sizeof(long), (void *) &Ne); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( compress_kernel, 1, sizeof(cl_mem), (void *) &d_I); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // launch kernel //====================================================================================================100 error = clEnqueueNDRangeKernel( command_queue, compress_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // synchronize //====================================================================================================100 error = clFinish(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // End //====================================================================================================100 //======================================================================================================================================================150 // COPY RESULTS BACK TO CPU //======================================================================================================================================================150 error = clEnqueueReadBuffer(command_queue, d_I, CL_TRUE, 0, mem_size, image, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // int i; // for(i=0; i<100; i++){ // printf("%f ", image[i]); // } //======================================================================================================================================================150 // FREE MEMORY //======================================================================================================================================================150 // OpenCL structures error = clReleaseKernel(extract_kernel); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseKernel(prepare_kernel); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseKernel(reduce_kernel); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseKernel(srad_kernel); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseKernel(srad2_kernel); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseKernel(compress_kernel); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseProgram(program); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // common_change error = clReleaseMemObject(d_I); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_c); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_iN); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_iS); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_jE); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_jW); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_dN); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_dS); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_dE); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_dW); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_sums); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_sums2); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // OpenCL structures error = clFlush(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseCommandQueue(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseContext(context); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //======================================================================================================================================================150 // End //======================================================================================================================================================150 }
static CUmodule build_kernel_source(const char *source_file, long target_capability) { char *source; int link_dev_runtime; nvrtcProgram program; nvrtcResult rc; char arch_buf[128]; const char *options[10]; int opt_index = 0; int build_failure = 0; char *build_log; size_t build_log_len; char *ptx_image; size_t ptx_image_len; void *bin_image; size_t bin_image_len; CUmodule cuda_module; CUresult cuda_rc; source = load_kernel_source(source_file, &link_dev_runtime); rc = nvrtcCreateProgram(&program, source, NULL, 0, NULL, NULL); if (rc != NVRTC_SUCCESS) nvrtc_error(rc, "nvrtcCreateProgram"); /* * Put command line options as cuda_program.c doing */ options[opt_index++] = "-I " CUDA_INCLUDE_PATH; snprintf(arch_buf, sizeof(arch_buf), "--gpu-architecture=compute_%ld", target_capability); options[opt_index++] = arch_buf; #ifdef PGSTROM_DEBUG options[opt_index++] = "--device-debug"; options[opt_index++] = "--generate-line-info"; #endif options[opt_index++] = "--use_fast_math"; if (link_dev_runtime) options[opt_index++] = "--relocatable-device-code=true"; /* * Kick runtime compiler */ rc = nvrtcCompileProgram(program, opt_index, options); if (rc != NVRTC_SUCCESS) { if (rc == NVRTC_ERROR_COMPILATION) build_failure = 1; else nvrtc_error(rc, "nvrtcCompileProgram"); } /* * Print build log */ rc = nvrtcGetProgramLogSize(program, &build_log_len); if (rc != NVRTC_SUCCESS) nvrtc_error(rc, "nvrtcGetProgramLogSize"); build_log = malloc(build_log_len + 1); if (!build_log) { fputs("out of memory", stderr); exit(1); } rc = nvrtcGetProgramLog(program, build_log); if (rc != NVRTC_SUCCESS) nvrtc_error(rc, "nvrtcGetProgramLog"); if (build_log_len > 1) printf("build log:\n%s\n", build_log); if (build_failure) exit(1); /* * Get PTX Image */ rc = nvrtcGetPTXSize(program, &ptx_image_len); if (rc != NVRTC_SUCCESS) nvrtc_error(rc, "nvrtcGetPTXSize"); ptx_image = malloc(ptx_image_len + 1); if (!ptx_image) { fputs("out of memory", stderr); exit(1); } rc = nvrtcGetPTX(program, ptx_image); if (rc != NVRTC_SUCCESS) nvrtc_error(rc, "nvrtcGetPTX"); ptx_image[ptx_image_len] = '\0'; /* * Link device runtime if needed */ if (link_dev_runtime) { link_device_libraries(ptx_image, ptx_image_len, &bin_image, &bin_image_len, target_capability); } else { bin_image = ptx_image; bin_image_len = ptx_image_len; } cuda_rc = cuModuleLoadData(&cuda_module, bin_image); if (cuda_rc != CUDA_SUCCESS) cuda_error(rc, "cuModuleLoadData"); return cuda_module; }
int main(int argc, char** argv) { cl_int error; cl_uint num_platforms; // Get the number of platforms error = clGetPlatformIDs(0, NULL, &num_platforms); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of platforms cl_platform_id* platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms); error = clGetPlatformIDs(num_platforms, platforms, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Print the chosen platform (if there are multiple platforms, choose the first one) cl_platform_id platform = platforms[0]; char pbuf[100]; error = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Platform: %s\n", pbuf); // Create a GPU context cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0}; context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get and print the chosen device (if there are multiple devices, choose the first one) size_t devices_size; error = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_device_id *devices = (cl_device_id *) malloc(devices_size); error = clGetContextInfo(context, CL_CONTEXT_DEVICES, devices_size, devices, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); device = devices[0]; error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Device: %s\n", pbuf); // Create a command queue command_queue = clCreateCommandQueue(context, device, 0, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); int size; int grid_rows,grid_cols = 0; float *FilesavingTemp,*FilesavingPower; //,*MatrixOut; char *tfile, *pfile, *ofile; int total_iterations = 60; int pyramid_height = 1; // number of iterations if (argc < 7) usage(argc, argv); if((grid_rows = atoi(argv[1]))<=0|| (grid_cols = atoi(argv[1]))<=0|| (pyramid_height = atoi(argv[2]))<=0|| (total_iterations = atoi(argv[3]))<=0) usage(argc, argv); tfile=argv[4]; pfile=argv[5]; ofile=argv[6]; size=grid_rows*grid_cols; // --------------- pyramid parameters --------------- int borderCols = (pyramid_height)*EXPAND_RATE/2; int borderRows = (pyramid_height)*EXPAND_RATE/2; int smallBlockCol = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE; int smallBlockRow = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE; int blockCols = grid_cols/smallBlockCol+((grid_cols%smallBlockCol==0)?0:1); int blockRows = grid_rows/smallBlockRow+((grid_rows%smallBlockRow==0)?0:1); FilesavingTemp = (float *) malloc(size*sizeof(float)); FilesavingPower = (float *) malloc(size*sizeof(float)); // MatrixOut = (float *) calloc (size, sizeof(float)); if( !FilesavingPower || !FilesavingTemp) // || !MatrixOut) fatal("unable to allocate memory"); // Read input data from disk readinput(FilesavingTemp, grid_rows, grid_cols, tfile); readinput(FilesavingPower, grid_rows, grid_cols, pfile); // Load kernel source from file const char *source = load_kernel_source("hotspot_kernel.cl"); size_t sourceSize = strlen(source); // Compile the kernel cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Create an executable from the kernel error = clBuildProgram(program, 1, &device, NULL, NULL, NULL); // Show compiler warnings/errors static char log[65536]; memset(log, 0, sizeof(log)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); kernel = clCreateKernel(program, "hotspot", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); long long start_time = get_time(); // Create two temperature matrices and copy the temperature input data cl_mem MatrixTemp[2]; // Create input memory buffers on device MatrixTemp[0] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingTemp, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); MatrixTemp[1] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float) * size, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Copy the power input data cl_mem MatrixPower = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingPower, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Perform the computation int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows, total_iterations, pyramid_height, blockCols, blockRows, borderCols, borderRows, FilesavingTemp, FilesavingPower); // Copy final temperature data back cl_float *MatrixOut = (cl_float *) clEnqueueMapBuffer(command_queue, MatrixTemp[ret], CL_TRUE, CL_MAP_READ, 0, sizeof(float) * size, 0, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); long long end_time = get_time(); printf("Total time: %.3f seconds\n", ((float) (end_time - start_time)) / (1000*1000)); // Write final output to output file writeoutput(MatrixOut, grid_rows, grid_cols, ofile); error = clEnqueueUnmapMemObject(command_queue, MatrixTemp[ret], (void *) MatrixOut, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); clReleaseMemObject(MatrixTemp[0]); clReleaseMemObject(MatrixTemp[1]); clReleaseMemObject(MatrixPower); return 0; }
int encrypt_cl() { #ifdef DEBUG printf("start of encrypt_cl\n"); #endif int err; // error code returned from api calls unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id *device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_program decrypt_program; // compute program cl_kernel encrypt_kernel; // compute kernel //cl_kernel decrypt_kernel; // compute kernel cl_event event; static cl_mem buffer_state; static cl_mem buffer_roundkeys; #ifdef DEBUG printf("data, keydata, results\n"); #endif float results[DATA_SIZE]; // results returned from device unsigned char in[DATA_SIZE]; //plain text unsigned char out[DATA_SIZE]; // encryped text #ifdef DEBUG printf("initFns\n"); #endif initFns(); cl_platform_id platform = NULL;//the chosen platform err = clGetPlatformIDs(1, &platform, NULL); CHECK_CL_SUCCESS("clGetPlatformIDs", err); // Connect to a compute device #ifdef DEBUG printf("Connect to a compute device\n"); #endif // cl_uint numDevices = 0; //int gpu = 1; //err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); device_id = (cl_device_id*)malloc(2 * sizeof(cl_device_id)); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 2, device_id, &numDevices); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } #ifdef DEBUG printf("has %d devices\n", numDevices); #endif //IAH(); //err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); //cl_device_info device_info; char buffer[1024]; clGetDeviceInfo(device_id[0], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL); /* James: Check if the device is available */ cl_bool device_available = CL_FALSE; clGetDeviceInfo(device_id[0], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &device_available, NULL); if (device_available != CL_TRUE) { printf("Error: Device %i is not available\n", 0); return EXIT_FAILURE; } #ifdef DEBUG printf("Device name is %s\n", buffer); #endif clGetDeviceInfo(device_id[1], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL); #ifdef DEBUG printf("Device name is %s\n", buffer); #endif // Create a compute context #ifdef DEBUG printf("Create a compute context\n"); #endif // context = clCreateContext(0, 1, &device_id[DEVICE], NULL, NULL, &err); //context = clCreateContext(0, 1, device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } if (err != CL_SUCCESS) { printf("Error: Failed to create a compute context: errcode_ret=%i\n", err); return EXIT_FAILURE; } // Create a command commands #ifdef DEBUG printf("Create a command commands\n"); #endif // commands = clCreateCommandQueue(context, device_id[DEVICE], CL_QUEUE_PROFILING_ENABLE, &err); CHECK_CL_SUCCESS("clCreateCommandQueue", err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer #ifdef DEBUG printf("Create the compute program from the source buffer\n"); #endif const char *kernel_source = load_kernel_source(AES_KERNEL); //printf("kernel source is:\n %s", kernel_source); program = clCreateProgramWithSource(context, 1, &kernel_source, NULL, &err); if (!program || err != CL_SUCCESS) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable #ifdef DEBUG printf("Build the program executable\n"); #endif // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char b[2048]; printf("Error: Failed to build program executable!\n"); err = clGetProgramBuildInfo(program, device_id[DEVICE], CL_PROGRAM_BUILD_LOG, sizeof(b), b, &len); CHECK_CL_SUCCESS("clGetProgramBuildInfo", err); printf("%s\n", b); exit(1); } // Create the compute kernel in the program we wish to run #ifdef DEBUG printf("Create the compute kernel in the program we wish to run\n"); #endif encrypt_kernel = clCreateKernel(program, "AES_encrypt", &err); if (!encrypt_kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel! err = %d\n", err); size_t len; char b[2048]; err = clGetProgramBuildInfo(program, device_id[DEVICE], CL_PROGRAM_BUILD_LOG, sizeof(b), b, &len); CHECK_CL_SUCCESS("clGetProgramBuildInfo", err); printf("%s\n", b); exit(1); } // Create the input and output arrays in device memory for our calculation #ifdef DEBUG printf("Create the input and output arrays in device memory for our calculation\n"); #endif int max_buffer_size = MAX_BUFFER_SIZE; // dynamic buffer size please buffer_state = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, max_buffer_size, NULL, &err); CHECK_CL_SUCCESS("clCreateBuffer", err); buffer_roundkeys = clCreateBuffer(context, CL_MEM_READ_ONLY, 16 * 15, NULL, &err); CHECK_CL_SUCCESS("clCreateBuffer", err); if (!buffer_state || !buffer_roundkeys) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Get the maximum work group size for executing the kernel on the device #ifdef DEBUG printf("Get the maximum work group size for executing the kernel on the device\n"); #endif // err = clGetKernelWorkGroupInfo(encrypt_kernel, device_id[DEVICE], CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } printf("local is %d\n", local); unsigned int i = 0; unsigned int count = DATA_SIZE; clock_t tStartF = clock(); // Fill our data set with random float values i = 0; printf("encrypt_cl: count = %d\n", count); for(i = 0; i < count; i++) { in[i] = 0; //in[i] = rand(); } //tFill += (double)(clock() - tStartF)/CLOCKS_PER_SEC; clock_t tStart = clock(); unsigned int k = 0; double tFill = 0; double tMemory = 0; double tArgument = 0; double tExecute = 0; double tRead = 0; int ret; AES_KEY ks; ret = AES_set_encrypt_key(key, 128, &ks); //for (k = 0; k<LOOP; k++) { //printf("encrypt_cl: i = %d\n", i); // Write our data set into the input array in device memory //printf("Write our data set into the input array in device memory\n"); // clock_t tStartM = clock(); err = clEnqueueWriteBuffer(commands, buffer_state, CL_TRUE, 0, DATA_SIZE, in, 0, NULL, NULL); CHECK_CL_SUCCESS("clEnqueueWriteBuffer", err); err = clEnqueueWriteBuffer(commands, buffer_roundkeys, CL_TRUE, 0, 16 * 15, &ks.rd_key, 0, NULL, NULL); CHECK_CL_SUCCESS("clEnqueueWriteBuffer", err); printf("rd_key %s", ks.rd_key); //err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); //err = clEnqueueWriteBuffer(commands, key, CL_TRUE, 0, sizeof(float) * count, keyData, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } err = clFinish(commands); CHECK_CL_SUCCESS("clFinish", err); tMemory += (double)(clock() - tStartM)/CLOCKS_PER_SEC; // Set the arguments to our compute kernel //printf("Set the arguments to our compute kernel\n"); // clock_t tStartA = clock(); err = 0; err = clSetKernelArg(encrypt_kernel, 0, sizeof(cl_mem), &buffer_state); CHECK_CL_SUCCESS("clSetKernelArg", err); err |= clSetKernelArg(encrypt_kernel, 1, sizeof(cl_mem), &buffer_roundkeys); CHECK_CL_SUCCESS("clSetKernelArg", err); err |= clSetKernelArg(encrypt_kernel, 2, sizeof(ks.rounds), &ks.rounds); CHECK_CL_SUCCESS("clSetKernelArg", err); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } tArgument += (double)(clock() - tStartA)/CLOCKS_PER_SEC; // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // global = count; #ifdef DEBUG printf("global is %d\n", global); #endif clock_t tStartE = clock(); cl_float t = 0.; cl_ulong start = 0, end = 0; //for (i = 0; i<LOOP; i++) { err = clEnqueueNDRangeKernel(commands, encrypt_kernel, 1, NULL, &global, &local, 0, NULL, &event); CHECK_CL_SUCCESS("clEnqueueNDRangeKernel", err); //err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, NULL, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } err = clWaitForEvents(1, &event); CHECK_CL_SUCCESS("clWaitForEvents", err); err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); CHECK_CL_SUCCESS("clGetEventProfilingInfo", err); err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); CHECK_CL_SUCCESS("clGetEventProfilingInfo", err); //END-START gives you hints on kind of “pure HW execution time” //the resolution of the events is 1e-09 sec t += (cl_float)(end - start)*(cl_float)(1e-06); //} printf("profile time: %f ms",t); err = clFinish(commands); CHECK_CL_SUCCESS("clFinish", err); // Wait for the command commands to get serviced before reading back results // tExecute += (double)(clock() - tStartE)/CLOCKS_PER_SEC; // Read back the results from the device to verify the output // clock_t tStartR = clock(); //err = clEnqueueReadBuffer( commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL ); err = clEnqueueReadBuffer(commands, buffer_state, CL_FALSE, 0, DATA_SIZE, out, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } printf("input data is\n"); for (i=0; i<DATA_SIZE; i++) { printf("%X ", in[i]); } printf("encrypted data is\n"); for (i=0; i<DATA_SIZE; i++) { printf("%X ", out[i]); } tRead += (double)(clock() - tStartR)/CLOCKS_PER_SEC; //} printf("-----------------------------------------------"); printf("encrypt_cl Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC); printf("cl Fill data Time taken: %.2fs\n", tFill); printf("cl memory copy Time taken: %.2fs\n", tMemory); printf("cl set Argument Time taken: %.2fs\n", tArgument); printf("cl Execute kernel time taken: %.2fs\n", tExecute); printf("cl read memory taken: %.2fs\n", tRead); // Validate our results // correct = 0; for(i = 0; i < count; i++) { //if( data[i] - sqrt(keyData[i]) < 0.001) correct++; } // Print a brief summary detailing the results #ifdef DEBUG printf("Computed '%d/%d' correct values!\n", correct, count); #endif // Shutdown and cleanup // clReleaseMemObject(buffer_state); CHECK_CL_SUCCESS("clReleaseMemObject", err); clReleaseMemObject(buffer_roundkeys); CHECK_CL_SUCCESS("clReleaseMemObject", err); clReleaseProgram(program); CHECK_CL_SUCCESS("clReleaseProgram", err); clReleaseKernel(encrypt_kernel); CHECK_CL_SUCCESS("clReleaseKernel", err); clReleaseCommandQueue(commands); CHECK_CL_SUCCESS("clReleaseCommandQueue", err); clReleaseContext(context); CHECK_CL_SUCCESS("clReleaseContext", err); }
void kernel_gpu_opencl_wrapper( record *records, long records_mem, knode *knodes, long knodes_elem, long knodes_mem, int order, long maxheight, int count, long *currKnode, long *offset, int *keys, record *ans) { //======================================================================================================================================================150 // CPU VARIABLES //======================================================================================================================================================150 // timer long long time0; long long time1; long long time2; long long time3; long long time4; long long time5; long long time6; time0 = get_time(); //======================================================================================================================================================150 // GPU SETUP //======================================================================================================================================================150 //====================================================================================================100 // INITIAL DRIVER OVERHEAD //====================================================================================================100 // cudaThreadSynchronize(); //====================================================================================================100 // COMMON VARIABLES //====================================================================================================100 // common variables cl_int error; //====================================================================================================100 // GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE //====================================================================================================100 // Get the number of available platforms cl_uint num_platforms; error = clGetPlatformIDs( 0, NULL, &num_platforms); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of available platforms cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); error = clGetPlatformIDs( num_platforms, platforms, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Select the 1st platform cl_platform_id platform = platforms[0]; // Get the name of the selected platform and print it (if there are multiple platforms, choose the first one) char pbuf[100]; error = clGetPlatformInfo( platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Platform: %s\n", pbuf); //====================================================================================================100 // CREATE CONTEXT FOR THE PLATFORM //====================================================================================================100 // Create context properties for selected platform cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0}; // Create context for selected platform being GPU cl_context context; context = clCreateContextFromType( context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE //====================================================================================================100 // Get the number of devices (previousely selected for the context) size_t devices_size; error = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of devices (previousely selected for the context) cl_device_id *devices = (cl_device_id *) malloc(devices_size); error = clGetContextInfo( context, CL_CONTEXT_DEVICES, devices_size, devices, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one) cl_device_id device; device = devices[0]; // Get the name of the selected device (previousely selected for the context) and print it error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Device: %s\n", pbuf); //====================================================================================================100 // CREATE COMMAND QUEUE FOR THE DEVICE //====================================================================================================100 // Create a command queue cl_command_queue command_queue; command_queue = clCreateCommandQueue( context, device, 0, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // CREATE PROGRAM, COMPILE IT //====================================================================================================100 // Load kernel source code from file const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl"); size_t sourceSize = strlen(source); // Create the program cl_program program = clCreateProgramWithSource( context, 1, &source, &sourceSize, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); char clOptions[110]; // sprintf(clOptions,"-I../../src"); sprintf(clOptions,"-I./../"); #ifdef DEFAULT_ORDER sprintf(clOptions + strlen(clOptions), " -DDEFAULT_ORDER=%d", DEFAULT_ORDER); #endif // Compile the program error = clBuildProgram( program, 1, &device, clOptions, NULL, NULL); // Print warnings and errors from compilation static char log[65536]; memset(log, 0, sizeof(log)); clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); printf("-----OpenCL Compiler Output-----\n"); if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); printf("--------------------------------\n"); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Create kernel cl_kernel kernel; kernel = clCreateKernel(program, "findK", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time1 = get_time(); //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // GPU MEMORY (MALLOC) //======================================================================================================================================================150 //====================================================================================================100 // DEVICE IN //====================================================================================================100 //==================================================50 // recordsD //==================================================50 cl_mem recordsD; recordsD = clCreateBuffer( context, CL_MEM_READ_WRITE, records_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // knodesD //==================================================50 cl_mem knodesD; knodesD = clCreateBuffer( context, CL_MEM_READ_WRITE, knodes_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // currKnodeD //==================================================50 cl_mem currKnodeD; currKnodeD = clCreateBuffer( context, CL_MEM_READ_WRITE, count*sizeof(long), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // offsetD //==================================================50 cl_mem offsetD; offsetD = clCreateBuffer( context, CL_MEM_READ_WRITE, count*sizeof(long), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // keysD //==================================================50 cl_mem keysD; keysD = clCreateBuffer( context, CL_MEM_READ_WRITE, count*sizeof(long), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // END //==================================================50 //====================================================================================================100 // DEVICE IN/OUT //====================================================================================================100 //==================================================50 // ansD //==================================================50 cl_mem ansD; ansD = clCreateBuffer( context, CL_MEM_READ_WRITE, count*sizeof(record), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time2 = get_time(); //==================================================50 // END //==================================================50 //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // GPU MEMORY COPY //======================================================================================================================================================150 //====================================================================================================100 // GPU MEMORY (MALLOC) COPY IN //====================================================================================================100 //==================================================50 // recordsD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue recordsD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to records_mem, // size to be copied records, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // knodesD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue knodesD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to knodes_mem, // size to be copied knodes, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // currKnodeD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue currKnodeD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(long), // size to be copied currKnode, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // offsetD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue offsetD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(long), // size to be copied offset, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // keysD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue keysD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(int), // size to be copied keys, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // END //==================================================50 //====================================================================================================100 // DEVICE IN/OUT //====================================================================================================100 //==================================================50 // ansD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue ansD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(record), // size to be copied ans, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time3 = get_time(); //==================================================50 // END //==================================================50 //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // findK kernel //======================================================================================================================================================150 //====================================================================================================100 // Execution Parameters //====================================================================================================100 size_t local_work_size[1]; local_work_size[0] = order < 1024 ? order : 1024; size_t global_work_size[1]; global_work_size[0] = count * local_work_size[0]; printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]); //====================================================================================================100 // Kernel Arguments //====================================================================================================100 clSetKernelArg( kernel, 0, sizeof(long), (void *) &maxheight); clSetKernelArg( kernel, 1, sizeof(cl_mem), (void *) &knodesD); clSetKernelArg( kernel, 2, sizeof(long), (void *) &knodes_elem); clSetKernelArg( kernel, 3, sizeof(cl_mem), (void *) &recordsD); clSetKernelArg( kernel, 4, sizeof(cl_mem), (void *) &currKnodeD); clSetKernelArg( kernel, 5, sizeof(cl_mem), (void *) &offsetD); clSetKernelArg( kernel, 6, sizeof(cl_mem), (void *) &keysD); clSetKernelArg( kernel, 7, sizeof(cl_mem), (void *) &ansD); //====================================================================================================100 // Kernel //====================================================================================================100 error = clEnqueueNDRangeKernel( command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO error = clFinish(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time4 = get_time(); //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // GPU MEMORY COPY (CONTD.) //======================================================================================================================================================150 //====================================================================================================100 // DEVICE IN/OUT //====================================================================================================100 //==================================================50 // ansD //==================================================50 error = clEnqueueReadBuffer(command_queue, // The command queue. ansD, // The image on the device. CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?) 0, // Offset. None in this case. count*sizeof(record), // Size to copy. ans, // The pointer to the image on the host. 0, // Number of events in wait list. Not used. NULL, // Event wait list. Not used. NULL); // Event object for determining status. Not used. if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time5 = get_time(); //==================================================50 // END //==================================================50 //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // GPU MEMORY DEALLOCATION //======================================================================================================================================================150 // Release kernels... clReleaseKernel(kernel); // Now the program... clReleaseProgram(program); // Clean up the device memory... clReleaseMemObject(recordsD); clReleaseMemObject(knodesD); clReleaseMemObject(currKnodeD); clReleaseMemObject(offsetD); clReleaseMemObject(keysD); clReleaseMemObject(ansD); // Flush the queue error = clFlush(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // ...and finally, the queue and context. clReleaseCommandQueue(command_queue); // ??? clReleaseContext(context); time6 = get_time(); //======================================================================================================================================================150 // DISPLAY TIMING //======================================================================================================================================================150 printf("Time spent in different stages of GPU_CUDA KERNEL:\n"); printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100); printf("Total time:\n"); printf("%.12f s\n", (float) (time6-time0) / 1000000); //======================================================================================================================================================150 // END //======================================================================================================================================================150 }
int kernel_gpu_opencl_wrapper( int xmax, int workload, fp ***y, fp **x, fp **params, fp *com) { //======================================================================================================================================================150 // VARIABLES //======================================================================================================================================================150 long long time0; long long time1; long long time2; long long time3; long long time4; long long time5; long long timecopyin = 0; long long timekernel = 0; long long timecopyout = 0; long long timeother; //stage1_start time0 = get_time(); int i; //======================================================================================================================================================150 // GPU SETUP //======================================================================================================================================================150 //====================================================================================================100 // COMMON VARIABLES //====================================================================================================100 // common variables cl_int error; //====================================================================================================100 // GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE //====================================================================================================100 // Get the number of available platforms cl_uint num_platforms; error = clGetPlatformIDs( 0, NULL, &num_platforms); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of available platforms cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); error = clGetPlatformIDs( num_platforms, platforms, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Select the 1st platform cl_platform_id platform = platforms[0]; // Get the name of the selected platform and print it (if there are multiple platforms, choose the first one) char pbuf[100]; error = clGetPlatformInfo( platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Platform: %s\n", pbuf); //====================================================================================================100 // CREATE CONTEXT FOR THE PLATFORM //====================================================================================================100 // Create context properties for selected platform cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0}; // Create context for selected platform being GPU cl_context context; context = clCreateContextFromType( context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE //====================================================================================================100 // Get the number of devices (previousely selected for the context) size_t devices_size; error = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of devices (previousely selected for the context) cl_device_id *devices = (cl_device_id *) malloc(devices_size); error = clGetContextInfo( context, CL_CONTEXT_DEVICES, devices_size, devices, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one) cl_device_id device; device = devices[0]; // Get the name of the selected device (previousely selected for the context) and print it error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Device: %s\n", pbuf); //====================================================================================================100 // CREATE COMMAND QUEUE FOR THE DEVICE //====================================================================================================100 // Create a command queue cl_command_queue command_queue; command_queue = clCreateCommandQueue( context, device, 0, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // CRATE PROGRAM, COMPILE IT //====================================================================================================100 // Load kernel source code from file const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl"); size_t sourceSize = strlen(source); // Create the program cl_program program = clCreateProgramWithSource( context, 1, &source, &sourceSize, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Compile the program error = clBuildProgram( program, 1, &device, "-I./../", NULL, NULL); // Print warnings and errors from compilation static char log[65536]; memset(log, 0, sizeof(log)); clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); printf("-----OpenCL Compiler Output-----\n"); if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); printf("--------------------------------\n"); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Create kernel cl_kernel kernel; kernel = clCreateKernel(program, "kernel_gpu_opencl", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // INITIAL DRIVER OVERHEAD //====================================================================================================100 // cudaThreadSynchronize(); time1 = get_time(); // double start_timer = omp_get_wtime(); //======================================================================================================================================================150 // ALLOCATE MEMORY //======================================================================================================================================================150 //====================================================================================================100 // d_initvalu_mem //====================================================================================================100 int d_initvalu_mem; d_initvalu_mem = EQUATIONS * sizeof(fp); cl_mem d_initvalu; d_initvalu = clCreateBuffer(context, // context CL_MEM_READ_WRITE, // flags d_initvalu_mem, // size of buffer NULL, // host pointer (optional) &error ); // returned error if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // d_finavalu_mem //====================================================================================================100 int d_finavalu_mem; d_finavalu_mem = EQUATIONS * sizeof(fp); cl_mem d_finavalu; d_finavalu = clCreateBuffer(context, CL_MEM_READ_WRITE, d_finavalu_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // d_params_mem //====================================================================================================100 int d_params_mem; d_params_mem = PARAMETERS * sizeof(fp); cl_mem d_params; d_params = clCreateBuffer( context, CL_MEM_READ_WRITE, d_params_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // d_com_mem //====================================================================================================100 int d_com_mem; d_com_mem = 3 * sizeof(fp); cl_mem d_com; d_com = clCreateBuffer( context, CL_MEM_READ_WRITE, d_com_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time2 = get_time(); //======================================================================================================================================================150 // EXECUTION //======================================================================================================================================================150 int status; for(i=0; i<workload; i++){ status = solver( y[i], x[i], xmax, params[i], com, d_initvalu, d_finavalu, d_params, d_com, command_queue, kernel, &timecopyin, &timekernel, &timecopyout); if(status !=0){ printf("STATUS: %d\n", status); } } // // // print results // // int k; // // for(i=0; i<workload; i++){ // // printf("WORKLOAD %d:\n", i); // // for(j=0; j<(xmax+1); j++){ // // printf("\tTIME %d:\n", j); // // for(k=0; k<EQUATIONS; k++){ // // printf("\t\ty[%d][%d][%d]=%13.10f\n", i, j, k, y[i][j][k]); // // } // // } // // } // double end_timer = omp_get_wtime(); // printf("Time3-Time1 : %.8f\n",(end_timer - start_timer)); time3 = get_time(); //======================================================================================================================================================150 // FREE GPU MEMORY //======================================================================================================================================================150 // Release kernels... clReleaseKernel(kernel); // Now the program... clReleaseProgram(program); // Clean up the device memory... clReleaseMemObject(d_initvalu); clReleaseMemObject(d_finavalu); clReleaseMemObject(d_params); clReleaseMemObject(d_com); // Flush the queue error = clFlush(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // ...and finally, the queue and context. clReleaseCommandQueue(command_queue); // ??? clReleaseContext(context); time4= get_time(); //======================================================================================================================================================150 // DISPLAY TIMING //======================================================================================================================================================150 printf("Time spent in different stages of the application:\n"); printf("%15.12f s, %15.12f % : CPU: GPU SETUP\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time4-time0) * 100); printf("%15.12f s, %15.12f % : CPU: ALLOCATE GPU MEMORY\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time4-time0) * 100); printf("%15.12f s, %15.12f % : GPU: COMPUTATION\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time4-time0) * 100); printf("\tGPU: COMPUTATION Components:\n"); printf("\t%15.12f s, %15.12f % : GPU: COPY DATA IN\n", (float) (timecopyin) / 1000000, (float) (timecopyin) / (float) (time4-time0) * 100); printf("\t%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (timekernel) / 1000000, (float) (timekernel) / (float) (time4-time0) * 100); printf("\t%15.12f s, %15.12f % : GPU: COPY DATA OUT\n", (float) (timecopyout) / 1000000, (float) (timecopyout) / (float) (time4-time0) * 100); timeother = time3-time2-timecopyin-timekernel-timecopyout; printf("\t%15.12f s, %15.12f % : GPU: OTHER\n", (float) (timeother) / 1000000, (float) (timeother) / (float) (time4-time0) * 100); printf("%15.12f s, %15.12f % : CPU: FREE GPU MEMORY\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time4-time0) * 100); printf("Total time 1:\n"); printf("%.12f s\n", (float) (time4-time0) / 1000000); //======================================================================================================================================================150 // RETURN //======================================================================================================================================================150 return 0; //======================================================================================================================================================150 // END //======================================================================================================================================================150 }