int main(int argc, char** argv) { cl_int error; cl_uint num_platforms; // Get the number of platforms error = clGetPlatformIDs(0, NULL, &num_platforms); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of platforms cl_platform_id* platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms); error = clGetPlatformIDs(num_platforms, platforms, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Print the chosen platform (if there are multiple platforms, choose the first one) cl_platform_id platform = platforms[0]; char pbuf[100]; error = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Platform: %s\n", pbuf); // Create a GPU context cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0}; context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get and print the chosen device (if there are multiple devices, choose the first one) size_t devices_size; error = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_device_id *devices = (cl_device_id *) malloc(devices_size); error = clGetContextInfo(context, CL_CONTEXT_DEVICES, devices_size, devices, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); device = devices[0]; error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Device: %s\n", pbuf); // Create a command queue command_queue = clCreateCommandQueue(context, device, 0, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); int size; int grid_rows,grid_cols = 0; float *FilesavingTemp,*FilesavingPower; //,*MatrixOut; char *tfile, *pfile, *ofile; int total_iterations = 60; int pyramid_height = 1; // number of iterations if (argc < 7) usage(argc, argv); if((grid_rows = atoi(argv[1]))<=0|| (grid_cols = atoi(argv[1]))<=0|| (pyramid_height = atoi(argv[2]))<=0|| (total_iterations = atoi(argv[3]))<=0) usage(argc, argv); tfile=argv[4]; pfile=argv[5]; ofile=argv[6]; size=grid_rows*grid_cols; // --------------- pyramid parameters --------------- int borderCols = (pyramid_height)*EXPAND_RATE/2; int borderRows = (pyramid_height)*EXPAND_RATE/2; int smallBlockCol = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE; int smallBlockRow = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE; int blockCols = grid_cols/smallBlockCol+((grid_cols%smallBlockCol==0)?0:1); int blockRows = grid_rows/smallBlockRow+((grid_rows%smallBlockRow==0)?0:1); FilesavingTemp = (float *) malloc(size*sizeof(float)); FilesavingPower = (float *) malloc(size*sizeof(float)); // MatrixOut = (float *) calloc (size, sizeof(float)); if( !FilesavingPower || !FilesavingTemp) // || !MatrixOut) fatal("unable to allocate memory"); // Read input data from disk readinput(FilesavingTemp, grid_rows, grid_cols, tfile); readinput(FilesavingPower, grid_rows, grid_cols, pfile); // Load kernel source from file const char *source = load_kernel_source("hotspot_kernel.cl"); size_t sourceSize = strlen(source); // Compile the kernel cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Create an executable from the kernel error = clBuildProgram(program, 1, &device, NULL, NULL, NULL); // Show compiler warnings/errors static char log[65536]; memset(log, 0, sizeof(log)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); kernel = clCreateKernel(program, "hotspot", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); long long start_time = get_time(); // Create two temperature matrices and copy the temperature input data cl_mem MatrixTemp[2]; // Create input memory buffers on device MatrixTemp[0] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingTemp, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); MatrixTemp[1] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float) * size, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Copy the power input data cl_mem MatrixPower = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingPower, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Perform the computation int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows, total_iterations, pyramid_height, blockCols, blockRows, borderCols, borderRows, FilesavingTemp, FilesavingPower); // Copy final temperature data back cl_float *MatrixOut = (cl_float *) clEnqueueMapBuffer(command_queue, MatrixTemp[ret], CL_TRUE, CL_MAP_READ, 0, sizeof(float) * size, 0, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); long long end_time = get_time(); printf("Total time: %.3f seconds\n", ((float) (end_time - start_time)) / (1000*1000)); // Write final output to output file writeoutput(MatrixOut, grid_rows, grid_cols, ofile); error = clEnqueueUnmapMemObject(command_queue, MatrixTemp[ret], (void *) MatrixOut, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); clReleaseMemObject(MatrixTemp[0]); clReleaseMemObject(MatrixTemp[1]); clReleaseMemObject(MatrixPower); return 0; }
void run(int argc, char** argv) { int size; int grid_rows,grid_cols; float *FilesavingTemp,*FilesavingPower,*MatrixOut; char *tfile, *pfile, *ofile; int total_iterations = 60; int pyramid_height = 1; // number of iterations if (argc != 7) usage(argc, argv); if((grid_rows = atoi(argv[1]))<=0|| (grid_cols = atoi(argv[1]))<=0|| (pyramid_height = atoi(argv[2]))<=0|| (total_iterations = atoi(argv[3]))<=0) usage(argc, argv); tfile=argv[4]; pfile=argv[5]; ofile=argv[6]; size=grid_rows*grid_cols; /* --------------- pyramid parameters --------------- */ # define EXPAND_RATE 2// add one iteration will extend the pyramid base by 2 per each borderline int borderCols = (pyramid_height)*EXPAND_RATE/2; int borderRows = (pyramid_height)*EXPAND_RATE/2; int smallBlockCol = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE; int smallBlockRow = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE; int blockCols = grid_cols/smallBlockCol+((grid_cols%smallBlockCol==0)?0:1); int blockRows = grid_rows/smallBlockRow+((grid_rows%smallBlockRow==0)?0:1); FilesavingTemp = (float *) malloc(size*sizeof(float)); FilesavingPower = (float *) malloc(size*sizeof(float)); MatrixOut = (float *) calloc (size, sizeof(float)); if( !FilesavingPower || !FilesavingTemp || !MatrixOut) fatal("unable to allocate memory"); printf("pyramidHeight: %d\ngridSize: [%d, %d]\nborder:[%d, %d]\nblockGrid:[%d, %d]\ntargetBlock:[%d, %d]\n", \ pyramid_height, grid_cols, grid_rows, borderCols, borderRows, blockCols, blockRows, smallBlockCol, smallBlockRow); readinput(FilesavingTemp, grid_rows, grid_cols, tfile); readinput(FilesavingPower, grid_rows, grid_cols, pfile); struct timeval tv; CUdeviceptr MatrixTemp[2], MatrixPower; CUcontext ctx; CUmodule mod; CUresult res; int ret; /* * call our common CUDA initialization utility function. */ res = cuda_driver_api_init(&ctx, &mod, "./hotspot.cubin"); if (res != CUDA_SUCCESS) { printf("cuda_driver_api_init failed: res = %u\n", res); return; } res = cuMemAlloc(&MatrixTemp[0], sizeof(float) * size); if (res != CUDA_SUCCESS) { printf("cuMemAlloc failed: res = %u\n", res); return; } res = cuMemAlloc(&MatrixTemp[1], sizeof(float) * size); if (res != CUDA_SUCCESS) { printf("cuMemAlloc failed: res = %u\n", res); return; } res = cuMemAlloc(&MatrixPower, sizeof(float) * size); if (res != CUDA_SUCCESS) { printf("cuMemAlloc failed: res = %u\n", res); return; } /* * measurement start! */ time_measure_start(&tv); res = cuMemcpyHtoD(MatrixTemp[0], FilesavingTemp, sizeof(float) * size); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD failed: res = %u\n", res); return; } res = cuMemcpyHtoD(MatrixPower, FilesavingPower, sizeof(float) * size); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD failed: res = %u\n", res); return; } ret = compute_tran_temp(mod, MatrixPower, MatrixTemp, grid_cols, grid_rows, total_iterations, pyramid_height, blockCols, blockRows, borderCols, borderRows); res = cuMemcpyDtoH(MatrixOut, MatrixTemp[ret], sizeof(float) * size); if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoH failed: res = %u\n", res); return; } /* * measurement end! will print out the time. */ time_measure_end(&tv); writeoutput(MatrixOut, grid_rows, grid_cols, ofile); cuMemFree(MatrixPower); cuMemFree(MatrixTemp[0]); cuMemFree(MatrixTemp[1]); free(MatrixOut); res = cuda_driver_api_exit(ctx, mod); if (res != CUDA_SUCCESS) { printf("cuda_driver_api_exit faild: res = %u\n", res); return; } }