int compute_tran_temp(cl_mem MatrixPower, cl_mem MatrixTemp[2], int col, int row, \ int total_iterations, int num_iterations, int blockCols, int blockRows, int borderCols, int borderRows, float *TempCPU, float *PowerCPU) { float grid_height = chip_height / row; float grid_width = chip_width / col; float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height; float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height); float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width); float Rz = t_chip / (K_SI * grid_height * grid_width); float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI); float step = PRECISION / max_slope; int t; int src = 0, dst = 1; cl_int error; // Determine GPU work group grid size_t global_work_size[2]; global_work_size[0] = BLOCK_SIZE * blockCols; global_work_size[1] = BLOCK_SIZE * blockRows; size_t local_work_size[2]; local_work_size[0] = BLOCK_SIZE; local_work_size[1] = BLOCK_SIZE; long long start_time = get_time(); for (t = 0; t < total_iterations; t += num_iterations) { // Specify kernel arguments int iter = MIN(num_iterations, total_iterations - t); clSetKernelArg(kernel, 0, sizeof(int), (void *) &iter); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &MatrixPower); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &MatrixTemp[src]); clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &MatrixTemp[dst]); clSetKernelArg(kernel, 4, sizeof(int), (void *) &col); clSetKernelArg(kernel, 5, sizeof(int), (void *) &row); clSetKernelArg(kernel, 6, sizeof(int), (void *) &borderCols); clSetKernelArg(kernel, 7, sizeof(int), (void *) &borderRows); clSetKernelArg(kernel, 8, sizeof(float), (void *) &Cap); clSetKernelArg(kernel, 9, sizeof(float), (void *) &Rx); clSetKernelArg(kernel, 10, sizeof(float), (void *) &Ry); clSetKernelArg(kernel, 11, sizeof(float), (void *) &Rz); clSetKernelArg(kernel, 12, sizeof(float), (void *) &step); // Launch kernel error = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Flush the queue error = clFlush(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Swap input and output GPU matrices src = 1 - src; dst = 1 - dst; } // Wait for all operations to finish error = clFinish(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); long long end_time = get_time(); long long total_time = (end_time - start_time); printf("\nKernel time: %.3f seconds\n", ((float) total_time) / (1000*1000)); return src; }
void kernel_gpu_opencl_wrapper( fp* image, // input image int Nr, // IMAGE nbr of rows int Nc, // IMAGE nbr of cols long Ne, // IMAGE nbr of elem int niter, // nbr of iterations fp lambda, // update step size long NeROI, // ROI nbr of elements int* iN, int* iS, int* jE, int* jW, int iter, // primary loop int mem_size_i, int mem_size_j) { //======================================================================================================================================================150 // GPU SETUP //======================================================================================================================================================150 //====================================================================================================100 // COMMON VARIABLES //====================================================================================================100 // common variables cl_int error; //====================================================================================================100 // GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE //====================================================================================================100 // Get the number of available platforms cl_uint num_platforms; error = clGetPlatformIDs( 0, NULL, &num_platforms); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of available platforms cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); error = clGetPlatformIDs( num_platforms, platforms, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Select the 1st platform cl_platform_id platform = platforms[0]; // Get the name of the selected platform and print it (if there are multiple platforms, choose the first one) char pbuf[100]; error = clGetPlatformInfo( platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Platform: %s\n", pbuf); //====================================================================================================100 // CREATE CONTEXT FOR THE PLATFORM //====================================================================================================100 // Create context properties for selected platform cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0}; // Create context for selected platform being GPU cl_context context; context = clCreateContextFromType( context_properties, CL_DEVICE_TYPE_ALL, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE //====================================================================================================100 // Get the number of devices (previousely selected for the context) size_t devices_size; error = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of devices (previousely selected for the context) cl_device_id *devices = (cl_device_id *) malloc(devices_size); error = clGetContextInfo( context, CL_CONTEXT_DEVICES, devices_size, devices, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one) cl_device_id device; device = devices[0]; // Get the name of the selected device (previousely selected for the context) and print it error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Device: %s\n", pbuf); //====================================================================================================100 // CREATE COMMAND QUEUE FOR THE DEVICE //====================================================================================================100 // Create a command queue cl_command_queue command_queue; command_queue = clCreateCommandQueue( context, device, 0, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // CREATE PROGRAM, COMPILE IT //====================================================================================================100 // Load kernel source code from file const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl"); size_t sourceSize = strlen(source); // Create the program cl_program program = clCreateProgramWithSource( context, 1, &source, &sourceSize, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); char clOptions[150]; // sprintf(clOptions,"-I../../src"); sprintf(clOptions,"-I."); #ifdef RD_WG_SIZE sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE=%d", RD_WG_SIZE); #endif #ifdef RD_WG_SIZE_0 sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0=%d", RD_WG_SIZE_0); #endif #ifdef RD_WG_SIZE_0_0 sprintf(clOptions + strlen(clOptions), " -DRD_WG_SIZE_0_0=%d", RD_WG_SIZE_0_0); #endif // Compile the program error = clBuildProgram( program, 1, &device, clOptions, NULL, NULL); // Print warnings and errors from compilation static char log[65536]; memset(log, 0, sizeof(log)); clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); printf("-----OpenCL Compiler Output-----\n"); if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); printf("--------------------------------\n"); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // CREATE Kernels //====================================================================================================100 // Extract kernel cl_kernel extract_kernel; extract_kernel = clCreateKernel(program, "extract_kernel", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Prepare kernel cl_kernel prepare_kernel; prepare_kernel = clCreateKernel(program, "prepare_kernel", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Reduce kernel cl_kernel reduce_kernel; reduce_kernel = clCreateKernel( program, "reduce_kernel", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // SRAD kernel cl_kernel srad_kernel; srad_kernel = clCreateKernel( program, "srad_kernel", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // SRAD2 kernel cl_kernel srad2_kernel; srad2_kernel = clCreateKernel( program, "srad2_kernel", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Compress kernel cl_kernel compress_kernel; compress_kernel = clCreateKernel( program, "compress_kernel", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // TRIGGERING INITIAL DRIVER OVERHEAD //====================================================================================================100 // cudaThreadSynchronize(); // the above does it //======================================================================================================================================================150 // GPU VARIABLES //======================================================================================================================================================150 // CUDA kernel execution parameters int blocks_x; //======================================================================================================================================================150 // ALLOCATE MEMORY IN GPU //======================================================================================================================================================150 //====================================================================================================100 // common memory size //====================================================================================================100 int mem_size; // matrix memory size mem_size = sizeof(fp) * Ne; // get the size of float representation of input IMAGE //====================================================================================================100 // allocate memory for entire IMAGE on DEVICE //====================================================================================================100 cl_mem d_I; d_I = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // allocate memory for coordinates on DEVICE //====================================================================================================100 cl_mem d_iN; d_iN = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size_i, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_iS; d_iS = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size_i, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_jE; d_jE = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size_j, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_jW; d_jW = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size_j, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // allocate memory for derivatives //====================================================================================================100 cl_mem d_dN; d_dN = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_dS; d_dS = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_dW; d_dW = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_dE; d_dE = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // allocate memory for coefficient on DEVICE //====================================================================================================100 cl_mem d_c; d_c = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // allocate memory for partial sums on DEVICE //====================================================================================================100 cl_mem d_sums; d_sums = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_mem d_sums2; d_sums2 = clCreateBuffer( context, CL_MEM_READ_WRITE, mem_size, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // End //====================================================================================================100 //======================================================================================================================================================150 // COPY INPUT TO CPU //======================================================================================================================================================150 //====================================================================================================100 // Image //====================================================================================================100 error = clEnqueueWriteBuffer( command_queue, d_I, 1, 0, mem_size, image, 0, 0, 0); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // coordinates //====================================================================================================100 error = clEnqueueWriteBuffer( command_queue, d_iN, 1, 0, mem_size_i, iN, 0, 0, 0); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clEnqueueWriteBuffer( command_queue, d_iS, 1, 0, mem_size_i, iS, 0, 0, 0); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clEnqueueWriteBuffer( command_queue, d_jE, 1, 0, mem_size_j, jE, 0, 0, 0); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clEnqueueWriteBuffer( command_queue, d_jW, 1, 0, mem_size_j, jW, 0, 0, 0); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // End //====================================================================================================100 //======================================================================================================================================================150 // KERNEL EXECUTION PARAMETERS //======================================================================================================================================================150 // threads size_t local_work_size[1]; local_work_size[0] = NUMBER_THREADS; // workgroups int blocks_work_size; size_t global_work_size[1]; blocks_x = Ne/(int)local_work_size[0]; if (Ne % (int)local_work_size[0] != 0){ // compensate for division remainder above by adding one grid blocks_x = blocks_x + 1; } blocks_work_size = blocks_x; global_work_size[0] = blocks_work_size * local_work_size[0]; // define the number of blocks in the grid printf("max # of workgroups = %d, # of threads/workgroup = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]); //======================================================================================================================================================150 // Extract Kernel - SCALE IMAGE DOWN FROM 0-255 TO 0-1 AND EXTRACT //======================================================================================================================================================150 //====================================================================================================100 // set arguments //====================================================================================================100 error = clSetKernelArg( extract_kernel, 0, sizeof(long), (void *) &Ne); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( extract_kernel, 1, sizeof(cl_mem), (void *) &d_I); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // launch kernel //====================================================================================================100 error = clEnqueueNDRangeKernel( command_queue, extract_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // Synchronization - wait for all operations in the command queue so far to finish //====================================================================================================100 // error = clFinish(command_queue); // if (error != CL_SUCCESS) // fatal_CL(error, __LINE__); //====================================================================================================100 // End //====================================================================================================100 //======================================================================================================================================================150 // WHAT IS CONSTANT IN COMPUTATION LOOP //======================================================================================================================================================150 //====================================================================================================100 // Prepare Kernel //====================================================================================================100 error = clSetKernelArg( prepare_kernel, 0, sizeof(long), (void *) &Ne); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( prepare_kernel, 1, sizeof(cl_mem), (void *) &d_I); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( prepare_kernel, 2, sizeof(cl_mem), (void *) &d_sums); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( prepare_kernel, 3, sizeof(cl_mem), (void *) &d_sums2); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // Reduce Kernel //====================================================================================================100 int blocks2_x; int blocks2_work_size; size_t global_work_size2[1]; long no; int mul; int mem_size_single = sizeof(fp) * 1; fp total; fp total2; fp meanROI; fp meanROI2; fp varROI; fp q0sqr; error = clSetKernelArg( reduce_kernel, 0, sizeof(long), (void *) &Ne); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( reduce_kernel, 3, sizeof(cl_mem), (void *) &d_sums); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( reduce_kernel, 4, sizeof(cl_mem), (void *) &d_sums2); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // SRAD Kernel //====================================================================================================100 error = clSetKernelArg( srad_kernel, 0, sizeof(fp), (void *) &lambda); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 1, sizeof(int), (void *) &Nr); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 2, sizeof(int), (void *) &Nc); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 3, sizeof(long), (void *) &Ne); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 4, sizeof(cl_mem), (void *) &d_iN); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 5, sizeof(cl_mem), (void *) &d_iS); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 6, sizeof(cl_mem), (void *) &d_jE); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 7, sizeof(cl_mem), (void *) &d_jW); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 8, sizeof(cl_mem), (void *) &d_dN); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 9, sizeof(cl_mem), (void *) &d_dS); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 10, sizeof(cl_mem), (void *) &d_dW); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 11, sizeof(cl_mem), (void *) &d_dE); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 13, sizeof(cl_mem), (void *) &d_c); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad_kernel, 14, sizeof(cl_mem), (void *) &d_I); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // SRAD2 Kernel //====================================================================================================100 error = clSetKernelArg( srad2_kernel, 0, sizeof(fp), (void *) &lambda); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 1, sizeof(int), (void *) &Nr); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 2, sizeof(int), (void *) &Nc); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 3, sizeof(long), (void *) &Ne); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 4, sizeof(cl_mem), (void *) &d_iN); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 5, sizeof(cl_mem), (void *) &d_iS); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 6, sizeof(cl_mem), (void *) &d_jE); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 7, sizeof(cl_mem), (void *) &d_jW); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 8, sizeof(cl_mem), (void *) &d_dN); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 9, sizeof(cl_mem), (void *) &d_dS); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 10, sizeof(cl_mem), (void *) &d_dW); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 11, sizeof(cl_mem), (void *) &d_dE); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 12, sizeof(cl_mem), (void *) &d_c); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( srad2_kernel, 13, sizeof(cl_mem), (void *) &d_I); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // End //====================================================================================================100 //======================================================================================================================================================150 // COMPUTATION //======================================================================================================================================================150 printf("Iterations Progress: "); // execute main loop for (iter=0; iter<niter; iter++){ // do for the number of iterations input parameter printf("%d ", iter); fflush(NULL); //====================================================================================================100 // Prepare kernel //====================================================================================================100 // launch kernel error = clEnqueueNDRangeKernel( command_queue, prepare_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // synchronize // error = clFinish(command_queue); // if (error != CL_SUCCESS) // fatal_CL(error, __LINE__); //====================================================================================================100 // Reduce Kernel - performs subsequent reductions of sums //====================================================================================================100 // initial values blocks2_work_size = blocks_work_size; // original number of blocks global_work_size2[0] = global_work_size[0]; no = Ne; // original number of sum elements mul = 1; // original multiplier // loop while(blocks2_work_size != 0){ // set arguments that were uptaded in this loop error = clSetKernelArg( reduce_kernel, 1, sizeof(long), (void *) &no); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( reduce_kernel, 2, sizeof(int), (void *) &mul); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( reduce_kernel, 5, sizeof(int), (void *) &blocks2_work_size); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // launch kernel error = clEnqueueNDRangeKernel( command_queue, reduce_kernel, 1, NULL, global_work_size2, local_work_size, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // synchronize // error = clFinish(command_queue); // if (error != CL_SUCCESS) // fatal_CL(error, __LINE__); // update execution parameters no = blocks2_work_size; // get current number of elements if(blocks2_work_size == 1){ blocks2_work_size = 0; } else{ mul = mul * NUMBER_THREADS; // update the increment blocks_x = blocks2_work_size/(int)local_work_size[0]; // number of blocks if (blocks2_work_size % (int)local_work_size[0] != 0){ // compensate for division remainder above by adding one grid blocks_x = blocks_x + 1; } blocks2_work_size = blocks_x; global_work_size2[0] = blocks2_work_size * (int)local_work_size[0]; } } // copy total sums to device error = clEnqueueReadBuffer(command_queue, d_sums, CL_TRUE, 0, mem_size_single, &total, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clEnqueueReadBuffer(command_queue, d_sums2, CL_TRUE, 0, mem_size_single, &total2, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // calculate statistics //====================================================================================================100 meanROI = total / (fp)(NeROI); // gets mean (average) value of element in ROI meanROI2 = meanROI * meanROI; // varROI = (total2 / (fp)(NeROI)) - meanROI2; // gets variance of ROI q0sqr = varROI / meanROI2; // gets standard deviation of ROI //====================================================================================================100 // execute srad kernel //====================================================================================================100 // set arguments that were uptaded in this loop error = clSetKernelArg( srad_kernel, 12, sizeof(fp), (void *) &q0sqr); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // launch kernel error = clEnqueueNDRangeKernel( command_queue, srad_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // synchronize // error = clFinish(command_queue); // if (error != CL_SUCCESS) // fatal_CL(error, __LINE__); //====================================================================================================100 // execute srad2 kernel //====================================================================================================100 // launch kernel error = clEnqueueNDRangeKernel( command_queue, srad2_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // synchronize // error = clFinish(command_queue); // if (error != CL_SUCCESS) // fatal_CL(error, __LINE__); //====================================================================================================100 // End //====================================================================================================100 } printf("\n"); //======================================================================================================================================================150 // Compress Kernel - SCALE IMAGE UP FROM 0-1 TO 0-255 AND COMPRESS //======================================================================================================================================================150 //====================================================================================================100 // set parameters //====================================================================================================100 error = clSetKernelArg( compress_kernel, 0, sizeof(long), (void *) &Ne); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clSetKernelArg( compress_kernel, 1, sizeof(cl_mem), (void *) &d_I); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // launch kernel //====================================================================================================100 error = clEnqueueNDRangeKernel( command_queue, compress_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // synchronize //====================================================================================================100 error = clFinish(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // End //====================================================================================================100 //======================================================================================================================================================150 // COPY RESULTS BACK TO CPU //======================================================================================================================================================150 error = clEnqueueReadBuffer(command_queue, d_I, CL_TRUE, 0, mem_size, image, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // int i; // for(i=0; i<100; i++){ // printf("%f ", image[i]); // } //======================================================================================================================================================150 // FREE MEMORY //======================================================================================================================================================150 // OpenCL structures error = clReleaseKernel(extract_kernel); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseKernel(prepare_kernel); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseKernel(reduce_kernel); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseKernel(srad_kernel); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseKernel(srad2_kernel); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseKernel(compress_kernel); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseProgram(program); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // common_change error = clReleaseMemObject(d_I); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_c); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_iN); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_iS); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_jE); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_jW); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_dN); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_dS); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_dE); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_dW); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_sums); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseMemObject(d_sums2); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // OpenCL structures error = clFlush(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseCommandQueue(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); error = clReleaseContext(context); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //======================================================================================================================================================150 // End //======================================================================================================================================================150 }
int main(int argc, char** argv) { cl_int error; cl_uint num_platforms; // Get the number of platforms error = clGetPlatformIDs(0, NULL, &num_platforms); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of platforms cl_platform_id* platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms); error = clGetPlatformIDs(num_platforms, platforms, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Print the chosen platform (if there are multiple platforms, choose the first one) cl_platform_id platform = platforms[0]; char pbuf[100]; error = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Platform: %s\n", pbuf); // Create a GPU context cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0}; context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get and print the chosen device (if there are multiple devices, choose the first one) size_t devices_size; error = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_device_id *devices = (cl_device_id *) malloc(devices_size); error = clGetContextInfo(context, CL_CONTEXT_DEVICES, devices_size, devices, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); device = devices[0]; error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Device: %s\n", pbuf); // Create a command queue command_queue = clCreateCommandQueue(context, device, 0, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); int size; int grid_rows,grid_cols = 0; float *FilesavingTemp,*FilesavingPower; //,*MatrixOut; char *tfile, *pfile, *ofile; int total_iterations = 60; int pyramid_height = 1; // number of iterations if (argc < 7) usage(argc, argv); if((grid_rows = atoi(argv[1]))<=0|| (grid_cols = atoi(argv[1]))<=0|| (pyramid_height = atoi(argv[2]))<=0|| (total_iterations = atoi(argv[3]))<=0) usage(argc, argv); tfile=argv[4]; pfile=argv[5]; ofile=argv[6]; size=grid_rows*grid_cols; // --------------- pyramid parameters --------------- int borderCols = (pyramid_height)*EXPAND_RATE/2; int borderRows = (pyramid_height)*EXPAND_RATE/2; int smallBlockCol = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE; int smallBlockRow = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE; int blockCols = grid_cols/smallBlockCol+((grid_cols%smallBlockCol==0)?0:1); int blockRows = grid_rows/smallBlockRow+((grid_rows%smallBlockRow==0)?0:1); FilesavingTemp = (float *) malloc(size*sizeof(float)); FilesavingPower = (float *) malloc(size*sizeof(float)); // MatrixOut = (float *) calloc (size, sizeof(float)); if( !FilesavingPower || !FilesavingTemp) // || !MatrixOut) fatal("unable to allocate memory"); // Read input data from disk readinput(FilesavingTemp, grid_rows, grid_cols, tfile); readinput(FilesavingPower, grid_rows, grid_cols, pfile); // Load kernel source from file const char *source = load_kernel_source("hotspot_kernel.cl"); size_t sourceSize = strlen(source); // Compile the kernel cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Create an executable from the kernel error = clBuildProgram(program, 1, &device, NULL, NULL, NULL); // Show compiler warnings/errors static char log[65536]; memset(log, 0, sizeof(log)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); kernel = clCreateKernel(program, "hotspot", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); long long start_time = get_time(); // Create two temperature matrices and copy the temperature input data cl_mem MatrixTemp[2]; // Create input memory buffers on device MatrixTemp[0] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingTemp, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); MatrixTemp[1] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float) * size, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Copy the power input data cl_mem MatrixPower = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingPower, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Perform the computation int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows, total_iterations, pyramid_height, blockCols, blockRows, borderCols, borderRows, FilesavingTemp, FilesavingPower); // Copy final temperature data back cl_float *MatrixOut = (cl_float *) clEnqueueMapBuffer(command_queue, MatrixTemp[ret], CL_TRUE, CL_MAP_READ, 0, sizeof(float) * size, 0, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); long long end_time = get_time(); printf("Total time: %.3f seconds\n", ((float) (end_time - start_time)) / (1000*1000)); // Write final output to output file writeoutput(MatrixOut, grid_rows, grid_cols, ofile); error = clEnqueueUnmapMemObject(command_queue, MatrixTemp[ret], (void *) MatrixOut, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); clReleaseMemObject(MatrixTemp[0]); clReleaseMemObject(MatrixTemp[1]); clReleaseMemObject(MatrixPower); return 0; }
void kernel_gpu_opencl_wrapper( record *records, long records_mem, knode *knodes, long knodes_elem, long knodes_mem, int order, long maxheight, int count, long *currKnode, long *offset, int *keys, record *ans) { //======================================================================================================================================================150 // CPU VARIABLES //======================================================================================================================================================150 // timer long long time0; long long time1; long long time2; long long time3; long long time4; long long time5; long long time6; time0 = get_time(); //======================================================================================================================================================150 // GPU SETUP //======================================================================================================================================================150 //====================================================================================================100 // INITIAL DRIVER OVERHEAD //====================================================================================================100 // cudaThreadSynchronize(); //====================================================================================================100 // COMMON VARIABLES //====================================================================================================100 // common variables cl_int error; //====================================================================================================100 // GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE //====================================================================================================100 // Get the number of available platforms cl_uint num_platforms; error = clGetPlatformIDs( 0, NULL, &num_platforms); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of available platforms cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); error = clGetPlatformIDs( num_platforms, platforms, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Select the 1st platform cl_platform_id platform = platforms[0]; // Get the name of the selected platform and print it (if there are multiple platforms, choose the first one) char pbuf[100]; error = clGetPlatformInfo( platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Platform: %s\n", pbuf); //====================================================================================================100 // CREATE CONTEXT FOR THE PLATFORM //====================================================================================================100 // Create context properties for selected platform cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0}; // Create context for selected platform being GPU cl_context context; context = clCreateContextFromType( context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE //====================================================================================================100 // Get the number of devices (previousely selected for the context) size_t devices_size; error = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of devices (previousely selected for the context) cl_device_id *devices = (cl_device_id *) malloc(devices_size); error = clGetContextInfo( context, CL_CONTEXT_DEVICES, devices_size, devices, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one) cl_device_id device; device = devices[0]; // Get the name of the selected device (previousely selected for the context) and print it error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Device: %s\n", pbuf); //====================================================================================================100 // CREATE COMMAND QUEUE FOR THE DEVICE //====================================================================================================100 // Create a command queue cl_command_queue command_queue; command_queue = clCreateCommandQueue( context, device, 0, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // CREATE PROGRAM, COMPILE IT //====================================================================================================100 // Load kernel source code from file const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl"); size_t sourceSize = strlen(source); // Create the program cl_program program = clCreateProgramWithSource( context, 1, &source, &sourceSize, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); char clOptions[110]; // sprintf(clOptions,"-I../../src"); sprintf(clOptions,"-I./../"); #ifdef DEFAULT_ORDER sprintf(clOptions + strlen(clOptions), " -DDEFAULT_ORDER=%d", DEFAULT_ORDER); #endif // Compile the program error = clBuildProgram( program, 1, &device, clOptions, NULL, NULL); // Print warnings and errors from compilation static char log[65536]; memset(log, 0, sizeof(log)); clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); printf("-----OpenCL Compiler Output-----\n"); if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); printf("--------------------------------\n"); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Create kernel cl_kernel kernel; kernel = clCreateKernel(program, "findK", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time1 = get_time(); //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // GPU MEMORY (MALLOC) //======================================================================================================================================================150 //====================================================================================================100 // DEVICE IN //====================================================================================================100 //==================================================50 // recordsD //==================================================50 cl_mem recordsD; recordsD = clCreateBuffer( context, CL_MEM_READ_WRITE, records_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // knodesD //==================================================50 cl_mem knodesD; knodesD = clCreateBuffer( context, CL_MEM_READ_WRITE, knodes_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // currKnodeD //==================================================50 cl_mem currKnodeD; currKnodeD = clCreateBuffer( context, CL_MEM_READ_WRITE, count*sizeof(long), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // offsetD //==================================================50 cl_mem offsetD; offsetD = clCreateBuffer( context, CL_MEM_READ_WRITE, count*sizeof(long), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // keysD //==================================================50 cl_mem keysD; keysD = clCreateBuffer( context, CL_MEM_READ_WRITE, count*sizeof(long), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // END //==================================================50 //====================================================================================================100 // DEVICE IN/OUT //====================================================================================================100 //==================================================50 // ansD //==================================================50 cl_mem ansD; ansD = clCreateBuffer( context, CL_MEM_READ_WRITE, count*sizeof(record), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time2 = get_time(); //==================================================50 // END //==================================================50 //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // GPU MEMORY COPY //======================================================================================================================================================150 //====================================================================================================100 // GPU MEMORY (MALLOC) COPY IN //====================================================================================================100 //==================================================50 // recordsD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue recordsD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to records_mem, // size to be copied records, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // knodesD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue knodesD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to knodes_mem, // size to be copied knodes, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // currKnodeD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue currKnodeD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(long), // size to be copied currKnode, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // offsetD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue offsetD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(long), // size to be copied offset, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // keysD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue keysD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(int), // size to be copied keys, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // END //==================================================50 //====================================================================================================100 // DEVICE IN/OUT //====================================================================================================100 //==================================================50 // ansD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue ansD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(record), // size to be copied ans, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time3 = get_time(); //==================================================50 // END //==================================================50 //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // findK kernel //======================================================================================================================================================150 //====================================================================================================100 // Execution Parameters //====================================================================================================100 size_t local_work_size[1]; local_work_size[0] = order < 1024 ? order : 1024; size_t global_work_size[1]; global_work_size[0] = count * local_work_size[0]; printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]); //====================================================================================================100 // Kernel Arguments //====================================================================================================100 clSetKernelArg( kernel, 0, sizeof(long), (void *) &maxheight); clSetKernelArg( kernel, 1, sizeof(cl_mem), (void *) &knodesD); clSetKernelArg( kernel, 2, sizeof(long), (void *) &knodes_elem); clSetKernelArg( kernel, 3, sizeof(cl_mem), (void *) &recordsD); clSetKernelArg( kernel, 4, sizeof(cl_mem), (void *) &currKnodeD); clSetKernelArg( kernel, 5, sizeof(cl_mem), (void *) &offsetD); clSetKernelArg( kernel, 6, sizeof(cl_mem), (void *) &keysD); clSetKernelArg( kernel, 7, sizeof(cl_mem), (void *) &ansD); //====================================================================================================100 // Kernel //====================================================================================================100 error = clEnqueueNDRangeKernel( command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO error = clFinish(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time4 = get_time(); //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // GPU MEMORY COPY (CONTD.) //======================================================================================================================================================150 //====================================================================================================100 // DEVICE IN/OUT //====================================================================================================100 //==================================================50 // ansD //==================================================50 error = clEnqueueReadBuffer(command_queue, // The command queue. ansD, // The image on the device. CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?) 0, // Offset. None in this case. count*sizeof(record), // Size to copy. ans, // The pointer to the image on the host. 0, // Number of events in wait list. Not used. NULL, // Event wait list. Not used. NULL); // Event object for determining status. Not used. if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time5 = get_time(); //==================================================50 // END //==================================================50 //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // GPU MEMORY DEALLOCATION //======================================================================================================================================================150 // Release kernels... clReleaseKernel(kernel); // Now the program... clReleaseProgram(program); // Clean up the device memory... clReleaseMemObject(recordsD); clReleaseMemObject(knodesD); clReleaseMemObject(currKnodeD); clReleaseMemObject(offsetD); clReleaseMemObject(keysD); clReleaseMemObject(ansD); // Flush the queue error = clFlush(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // ...and finally, the queue and context. clReleaseCommandQueue(command_queue); // ??? clReleaseContext(context); time6 = get_time(); //======================================================================================================================================================150 // DISPLAY TIMING //======================================================================================================================================================150 printf("Time spent in different stages of GPU_CUDA KERNEL:\n"); printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100); printf("Total time:\n"); printf("%.12f s\n", (float) (time6-time0) / 1000000); //======================================================================================================================================================150 // END //======================================================================================================================================================150 }
int kernel_gpu_opencl_wrapper( int xmax, int workload, fp ***y, fp **x, fp **params, fp *com) { //======================================================================================================================================================150 // VARIABLES //======================================================================================================================================================150 long long time0; long long time1; long long time2; long long time3; long long time4; long long time5; long long timecopyin = 0; long long timekernel = 0; long long timecopyout = 0; long long timeother; //stage1_start time0 = get_time(); int i; //======================================================================================================================================================150 // GPU SETUP //======================================================================================================================================================150 //====================================================================================================100 // COMMON VARIABLES //====================================================================================================100 // common variables cl_int error; //====================================================================================================100 // GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE //====================================================================================================100 // Get the number of available platforms cl_uint num_platforms; error = clGetPlatformIDs( 0, NULL, &num_platforms); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of available platforms cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); error = clGetPlatformIDs( num_platforms, platforms, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Select the 1st platform cl_platform_id platform = platforms[0]; // Get the name of the selected platform and print it (if there are multiple platforms, choose the first one) char pbuf[100]; error = clGetPlatformInfo( platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Platform: %s\n", pbuf); //====================================================================================================100 // CREATE CONTEXT FOR THE PLATFORM //====================================================================================================100 // Create context properties for selected platform cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0}; // Create context for selected platform being GPU cl_context context; context = clCreateContextFromType( context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE //====================================================================================================100 // Get the number of devices (previousely selected for the context) size_t devices_size; error = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of devices (previousely selected for the context) cl_device_id *devices = (cl_device_id *) malloc(devices_size); error = clGetContextInfo( context, CL_CONTEXT_DEVICES, devices_size, devices, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one) cl_device_id device; device = devices[0]; // Get the name of the selected device (previousely selected for the context) and print it error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Device: %s\n", pbuf); //====================================================================================================100 // CREATE COMMAND QUEUE FOR THE DEVICE //====================================================================================================100 // Create a command queue cl_command_queue command_queue; command_queue = clCreateCommandQueue( context, device, 0, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // CRATE PROGRAM, COMPILE IT //====================================================================================================100 // Load kernel source code from file const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl"); size_t sourceSize = strlen(source); // Create the program cl_program program = clCreateProgramWithSource( context, 1, &source, &sourceSize, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Compile the program error = clBuildProgram( program, 1, &device, "-I./../", NULL, NULL); // Print warnings and errors from compilation static char log[65536]; memset(log, 0, sizeof(log)); clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); printf("-----OpenCL Compiler Output-----\n"); if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); printf("--------------------------------\n"); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Create kernel cl_kernel kernel; kernel = clCreateKernel(program, "kernel_gpu_opencl", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // INITIAL DRIVER OVERHEAD //====================================================================================================100 // cudaThreadSynchronize(); time1 = get_time(); // double start_timer = omp_get_wtime(); //======================================================================================================================================================150 // ALLOCATE MEMORY //======================================================================================================================================================150 //====================================================================================================100 // d_initvalu_mem //====================================================================================================100 int d_initvalu_mem; d_initvalu_mem = EQUATIONS * sizeof(fp); cl_mem d_initvalu; d_initvalu = clCreateBuffer(context, // context CL_MEM_READ_WRITE, // flags d_initvalu_mem, // size of buffer NULL, // host pointer (optional) &error ); // returned error if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // d_finavalu_mem //====================================================================================================100 int d_finavalu_mem; d_finavalu_mem = EQUATIONS * sizeof(fp); cl_mem d_finavalu; d_finavalu = clCreateBuffer(context, CL_MEM_READ_WRITE, d_finavalu_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // d_params_mem //====================================================================================================100 int d_params_mem; d_params_mem = PARAMETERS * sizeof(fp); cl_mem d_params; d_params = clCreateBuffer( context, CL_MEM_READ_WRITE, d_params_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // d_com_mem //====================================================================================================100 int d_com_mem; d_com_mem = 3 * sizeof(fp); cl_mem d_com; d_com = clCreateBuffer( context, CL_MEM_READ_WRITE, d_com_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time2 = get_time(); //======================================================================================================================================================150 // EXECUTION //======================================================================================================================================================150 int status; for(i=0; i<workload; i++){ status = solver( y[i], x[i], xmax, params[i], com, d_initvalu, d_finavalu, d_params, d_com, command_queue, kernel, &timecopyin, &timekernel, &timecopyout); if(status !=0){ printf("STATUS: %d\n", status); } } // // // print results // // int k; // // for(i=0; i<workload; i++){ // // printf("WORKLOAD %d:\n", i); // // for(j=0; j<(xmax+1); j++){ // // printf("\tTIME %d:\n", j); // // for(k=0; k<EQUATIONS; k++){ // // printf("\t\ty[%d][%d][%d]=%13.10f\n", i, j, k, y[i][j][k]); // // } // // } // // } // double end_timer = omp_get_wtime(); // printf("Time3-Time1 : %.8f\n",(end_timer - start_timer)); time3 = get_time(); //======================================================================================================================================================150 // FREE GPU MEMORY //======================================================================================================================================================150 // Release kernels... clReleaseKernel(kernel); // Now the program... clReleaseProgram(program); // Clean up the device memory... clReleaseMemObject(d_initvalu); clReleaseMemObject(d_finavalu); clReleaseMemObject(d_params); clReleaseMemObject(d_com); // Flush the queue error = clFlush(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // ...and finally, the queue and context. clReleaseCommandQueue(command_queue); // ??? clReleaseContext(context); time4= get_time(); //======================================================================================================================================================150 // DISPLAY TIMING //======================================================================================================================================================150 printf("Time spent in different stages of the application:\n"); printf("%15.12f s, %15.12f % : CPU: GPU SETUP\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time4-time0) * 100); printf("%15.12f s, %15.12f % : CPU: ALLOCATE GPU MEMORY\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time4-time0) * 100); printf("%15.12f s, %15.12f % : GPU: COMPUTATION\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time4-time0) * 100); printf("\tGPU: COMPUTATION Components:\n"); printf("\t%15.12f s, %15.12f % : GPU: COPY DATA IN\n", (float) (timecopyin) / 1000000, (float) (timecopyin) / (float) (time4-time0) * 100); printf("\t%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (timekernel) / 1000000, (float) (timekernel) / (float) (time4-time0) * 100); printf("\t%15.12f s, %15.12f % : GPU: COPY DATA OUT\n", (float) (timecopyout) / 1000000, (float) (timecopyout) / (float) (time4-time0) * 100); timeother = time3-time2-timecopyin-timekernel-timecopyout; printf("\t%15.12f s, %15.12f % : GPU: OTHER\n", (float) (timeother) / 1000000, (float) (timeother) / (float) (time4-time0) * 100); printf("%15.12f s, %15.12f % : CPU: FREE GPU MEMORY\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time4-time0) * 100); printf("Total time 1:\n"); printf("%.12f s\n", (float) (time4-time0) / 1000000); //======================================================================================================================================================150 // RETURN //======================================================================================================================================================150 return 0; //======================================================================================================================================================150 // END //======================================================================================================================================================150 }
cl_int clEnqueueNDRangeKernel_tony(cl_command_queue * cmdqueue, cl_kernel kernel,cl_uint work_dim, const size_t *global_work_size,const size_t *local_work_size) { cl_event eventList[2]; cl_int error; int CPU_RUN=0; int GPU_RUN=0; if(cpu_offset==0){ if(tony_device!=1){ GPU_RUN=1; } } else if(cpu_offset==100){ if(tony_device!=2){ CPU_RUN=1; } } else{ if(tony_device==0){ CPU_RUN=1; GPU_RUN=1; }else if (tony_device==1){ CPU_RUN=1; }else{ GPU_RUN=1; } } size_t remain_global_work_size[2]; size_t global_offset[2]; //NOTES(tony): if dim!=0, offset means offset of first dimensional. //we only care for first dimensional.Keep rest remain int d; for(d=1;d<work_dim;d++){ remain_global_work_size[d]=global_work_size[d]; global_offset[d]=global_work_size[d]; } global_offset[0]=((double)cpu_offset/100)*(global_work_size[0]); global_offset[0]=Round(global_offset[0],(local_work_size[0])); remain_global_work_size[0]=global_work_size[0]-global_offset[0]; if(remain_global_work_size[0]==0){ GPU_RUN=0; } //const size_t *const_remain = remain_global_work_size; //printf("global_workSize[0] %d , local_work_size[0] %d, global_offset is %d\n",global_work_size[0],local_work_size [0],global_offset[0]); int numOfArgs; clGetKernelInfo(kernel,CL_KERNEL_NUM_ARGS,sizeof(int),&numOfArgs,NULL); //printf("Num of args is %d\n", numOfArgs); int groupOffset =0; if(CPU_RUN){ //printf("Launch into cpu,globaloffset is %d\n",global_offset[0]); clSetKernelArg(kernel, numOfArgs-1, sizeof(int),(void*)&groupOffset); error=clEnqueueNDRangeKernel(cmdqueue[0],kernel,work_dim,NULL,global_offset,local_work_size,0,NULL,&(eventList[0] )); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); } if(GPU_RUN){ groupOffset =global_offset[0] / (local_work_size[0]); //printf("Launch into gpu,remain_global_work_sizeis %d\n",remain_global_work_size[0]); clSetKernelArg(kernel, numOfArgs-1, sizeof(int),(void*)&groupOffset); error =clEnqueueNDRangeKernel(cmdqueue[1],kernel,work_dim,global_offset,remain_global_work_size,local_work_size,0 ,NULL,&(eventList[1])); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); } if(CPU_RUN) error|=clFlush(cmdqueue[0]); if(GPU_RUN) error|=clFlush(cmdqueue[1]); // printf("Try to flush\n"); if(CPU_RUN) error|=clWaitForEvents(1,&eventList[0]); if(GPU_RUN) error|=clWaitForEvents(1,&eventList[1]); //clWaitForEvents(2,eventList); // printf("kernel finished\n"); return error; }
int compute_tran_temp(cl_mem MatrixPower, cl_mem MatrixTemp[2], int col, int row, \ int total_iterations, int num_iterations, int blockCols, int blockRows, int borderCols, int borderRows, float *TempCPU, float *PowerCPU) { float grid_height = chip_height / row; float grid_width = chip_width / col; float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height; float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height); float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width); float Rz = t_chip / (K_SI * grid_height * grid_width); float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI); float step = PRECISION / max_slope; int t; int src = 0, dst = 1; cl_int error; // Determine GPU work group grid size_t global_work_size[2]; global_work_size[0] = BLOCK_SIZE * blockCols; global_work_size[1] = BLOCK_SIZE * blockRows; size_t local_work_size[2]; local_work_size[0] = BLOCK_SIZE; local_work_size[1] = BLOCK_SIZE; long long start_time = get_time(); for (t = 0; t < total_iterations; t += num_iterations) { // Specify kernel arguments int iter = MIN(num_iterations, total_iterations - t); clSetKernelArg(kernel, 0, sizeof(int), (void *) &iter); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &MatrixPower); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &MatrixTemp[src]); clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &MatrixTemp[dst]); clSetKernelArg(kernel, 4, sizeof(int), (void *) &col); clSetKernelArg(kernel, 5, sizeof(int), (void *) &row); clSetKernelArg(kernel, 6, sizeof(int), (void *) &borderCols); clSetKernelArg(kernel, 7, sizeof(int), (void *) &borderRows); clSetKernelArg(kernel, 8, sizeof(float), (void *) &Cap); clSetKernelArg(kernel, 9, sizeof(float), (void *) &Rx); clSetKernelArg(kernel, 10, sizeof(float), (void *) &Ry); clSetKernelArg(kernel, 11, sizeof(float), (void *) &Rz); clSetKernelArg(kernel, 12, sizeof(float), (void *) &step); fprintf(stderr, "global_work_size[0]=%lu, global_work_size[1]=%lu\n", global_work_size[0], global_work_size[1]); // Launch kernel #pragma dividend local_work_group_size local_work_size dim 2 dim1(1:32:2:32) dim2(1:32:2:32) //This lws will be used to profile the OpenCL kernel with id 1 size_t _dividend_lws_local_work_size_k1[3]; { _dividend_lws_local_work_size_k1[0] = getLWSValue("DIVIDEND_LWS1_D0",DIVIDEND_LWS1_D0_DEFAULT_VAL); _dividend_lws_local_work_size_k1[1] = getLWSValue("DIVIDEND_LWS1_D1",DIVIDEND_LWS1_D1_DEFAULT_VAL); //Dividend extension: store the kernel id as the last element _dividend_lws_local_work_size_k1[2] = 1; } error = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)(command_queue, kernel, 2, NULL, global_work_size, _dividend_lws_local_work_size_k1, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Flush the queue error = clFlush(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Swap input and output GPU matrices src = 1 - src; dst = 1 - dst; } // Wait for all operations to finish error = DIVIDEND_CL_WRAP(clFinish)(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); long long end_time = get_time(); long long total_time = (end_time - start_time); printf("\nKernel time: %.3f seconds\n", ((float) total_time) / (1000*1000)); return src; }