int main(int argc, char** argv) { int err; // error code returned from api calls cl_platform_id platform_id; // platform id cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel size_t global[2]; // global domain size for our calculation size_t local[2]; // local domain size for our calculation char cl_platform_vendor[1001]; char cl_platform_name[1001]; cl_mem in_array; // device memory used for the input array //cl_mem synaptic_weights; // device memory used for the input array cl_mem out_array; // device memory used for the output array if (argc != 2){ printf("%s <inputfile>\n", argv[0]); return -1; } //float in_array[NO_NODES]; //float out_array[NO_NODES]; //float synaptic_weights[NO_NODES*NO_NODES]; float in_array_tb[NO_NODES]; float out_array_tb[NO_NODES]; //float synaptic_weights_tb[NO_NODES*NO_NODES]; float temp =0; int i = 0; int j = 0; int index = 0; FILE* ifp; char* mode = "r"; // // Connect to first platform // err = clGetPlatformIDs(1,&platform_id,NULL); if (err != CL_SUCCESS) { printf("Error: Failed to find an OpenCL platform!\n"); printf("Test failed\n"); return -1; } err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n"); printf("Test failed\n"); return -1; } printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor); err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n"); printf("Test failed\n"); return -1; } printf("CL_PLATFORM_NAME %s\n",cl_platform_name); // Connect to a compute device // int fpga = 0; #if defined (FPGA_DEVICE) fpga = 1; #endif err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); printf("Test failed\n"); return -1; } // // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); printf("Test failed\n"); return -1; } //relu_1(in_array,synaptic_weights,out_array); // Fill our data sets with pattern // //int i = 0; //for(i = 0; i < DATA_SIZE; i++) { // a[i] = (int)i; // b[i] = (int)i; // results[i] = 0; //} // // Create a command commands commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); printf("Error: code %i\n",err); printf("Test failed\n"); return -1; } int status; // Create Program Objects // // Load binary from disk unsigned char *kernelbinary; char *xclbin=argv[1]; printf("loading %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("failed to load kernel from xclbin: %s\n", xclbin); printf("Test failed\n"); return -1; } size_t n = n_i; // Create the compute program from offline program = clCreateProgramWithBinary(context, 1, &device_id, &n, (const unsigned char **) &kernelbinary, &status, &err); if ((!program) || (err!=CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", err); printf("Test failed\n"); printf("err : %d %s\n",err,err); } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); printf("Test failed\n"); return -1; } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "relu_1", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); printf("Test failed\n"); return -1; } // Create the input and output arrays in device memory for our calculation // in_array = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * NO_NODES, NULL, NULL); //synaptic_weights = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * NO_NODES * NO_NODES, NULL, NULL); out_array = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * NO_NODES, NULL, NULL); if (!in_array || /*!synaptic_weights ||*/ !out_array) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); return -1; } ifp = fopen("/home/agandhi92/sdaccel/relu_1/input.txt",mode); if(ifp == NULL) { printf("Input file not found \n"); return -1; } while (fscanf(ifp, "%f", &temp) != EOF && index < NO_NODES) { in_array_tb[index++] = temp; } index = 0; temp = 0; //ifp = fopen("/home/agandhi92/sdaccel/relu_1/weight.txt",mode); //if(ifp == NULL) //{ // printf("Weight file not found \n"); // return -1; //} //while (fscanf(ifp, "%f", &temp) != EOF && index < (NO_NODES*NO_NODES)) { // synaptic_weights_tb[index++] = temp; //} // // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, in_array, CL_TRUE, 0, sizeof(float) * NO_NODES, in_array_tb, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array a!\n"); printf("Test failed\n"); return -1; } // Write our data set into the input array in device memory // //err = clEnqueueWriteBuffer(commands, synaptic_weights, CL_TRUE, 0, sizeof(float) * NO_NODES * NO_NODES, synaptic_weights_tb, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array b!\n"); printf("Test failed\n"); return -1; } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &in_array); //err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &synaptic_weights); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_array); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); printf("Test failed\n"); return -1; } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // err = clEnqueueTask(commands, kernel, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel! %d\n", err); printf("Test failed\n"); return -1; } // Read back the results from the device to verify the output // cl_event readevent; err = clEnqueueReadBuffer( commands, out_array, CL_TRUE, 0, sizeof(float) * NO_NODES, out_array_tb, 0, NULL, &readevent ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); printf("Test failed\n"); return -1; } clWaitForEvents(1, &readevent); //printf("A\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",a[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} //printf("B\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",b[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} //printf("res\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",results[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} // Validate our results // //correct = 0; //for(i = 0; i < DATA_SIZE; i++) //{ // int row = i/MATRIX_RANK; // int col = i%MATRIX_RANK; // int running = 0; // int index; // for (index=0;index<MATRIX_RANK;index++) { // int aIndex = row*MATRIX_RANK + index; // int bIndex = col + index*MATRIX_RANK; // running += a[aIndex] * b[bIndex]; // } // sw_results[i] = running; //} // //for (i = 0;i < DATA_SIZE; i++) // if(results[i] == sw_results[i]) // correct++; //printf("Software\n"); //for (i=0;i<DATA_SIZE;i++) { // //printf("%0.2f ",sw_results[i]); // printf("%d ",sw_results[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} // // //// Print a brief summary detailing the results //// //printf("Computed '%d/%d' correct values!\n", correct, DATA_SIZE); // // Shutdown and cleanup int temp_ = 0; for (j = 0; j < NO_NODES; j++) { if (out_array_tb[j] >= 0) // || out_array_tb[j]== 0) { //printf("out_array[%d] = %f \n", j, out_array[j]); temp_++; } } clReleaseMemObject(in_array); //clReleaseMemObject(synaptic_weights); clReleaseMemObject(out_array); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); if (temp_ == NO_NODES) { printf("*********************************************************** \n"); printf("TEST PASSED !!!!!! The output matches the desired output. \n"); printf("*********************************************************** \n"); return EXIT_SUCCESS; } else { printf("**************************************************************** \n"); printf("TEST Failed !!!!!! The output does not match the desired output. \n"); printf("**************************************************************** \n"); return -1; } //if(correct == DATA_SIZE){ // printf("Test passed!\n"); // return EXIT_SUCCESS; //} //else{ // printf("Test failed\n"); // return -1; //} }
int main(int argc, char** argv) { /* OpenCL 1.1 data structures */ cl_platform_id* platforms; cl_program program; cl_context context; /* OpenCL 1.1 scalar data types */ cl_uint numOfPlatforms; cl_int error; /* Prepare an array of __cl_float4 via dynamic memory allocation This will map to the native vector type which is SSE / SSE2 / AVX on Intel-compatible processors. */ cl_float8* ud_in = (cl_float8*) malloc( sizeof(cl_float8) * DATA_SIZE); // input to device cl_float8* ud_out = (cl_float8*) malloc( sizeof(cl_float8) * DATA_SIZE); // output from device for( int i = 0; i < DATA_SIZE; ++i) { ud_in[i] = (cl_float8){(float)i,(float)i,(float)i,(float)i,(float)i,(float)i,(float)i,(float)i}; ud_out[i] = (cl_float8){(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f}; } /* Get the number of platforms Remember that for each vendor's SDK installed on the computer, the number of available platform also increased. */ error = clGetPlatformIDs(0, NULL, &numOfPlatforms); if(error != CL_SUCCESS ) { perror("Unable to find any OpenCL platforms"); exit(1); } platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms); printf("Number of OpenCL platforms found: %d\n", numOfPlatforms); error = clGetPlatformIDs(numOfPlatforms, platforms, NULL); if(error != CL_SUCCESS ) { perror("Unable to find any OpenCL platforms"); exit(1); } // Search for a CPU/GPU device through the installed platforms // Build a OpenCL program and do not run it. for(cl_uint i = 0; i < numOfPlatforms; i++ ) { cl_uint numOfDevices = 0; /* Determine how many devices are connected to your platform */ error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &numOfDevices); if (error != CL_SUCCESS ) { perror("Unable to obtain any OpenCL compliant device info"); exit(1); } cl_device_id* devices = (cl_device_id*) alloca(sizeof(cl_device_id) * numOfDevices); /* Load the information about your devices into the variable 'devices' */ error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numOfDevices, devices, NULL); if (error != CL_SUCCESS ) { perror("Unable to obtain any OpenCL compliant device info"); exit(1); } printf("Number of detected OpenCL devices: %d\n", numOfDevices); /* Create a context */ cl_context_properties ctx[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i], 0 }; context = clCreateContext(ctx, numOfDevices, devices, NULL, NULL, &error); if(error != CL_SUCCESS) { perror("Can't create a valid OpenCL context"); exit(1); } /* For each device, create a buffer and partition that data among the devices for compute! */ cl_mem inobj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * DATA_SIZE, ud_in, &error); if(error != CL_SUCCESS) { perror("Can't create a buffer"); exit(1); } int offset = 0; for(int i = 0; i < numOfDevices; ++i, ++offset ) { /* Load the two source files into temporary datastores */ const char *file_names[] = {"vectorization.cl"}; const int NUMBER_OF_FILES = 1; char* buffer[NUMBER_OF_FILES]; size_t sizes[NUMBER_OF_FILES]; loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes); /* Create the OpenCL program object */ program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error); if(error != CL_SUCCESS) { perror("Can't create the OpenCL program object"); exit(1); } /* Build OpenCL program object and dump the error message, if any */ char *program_log; size_t log_size; char* build_options = "-fbin-llvmir -fbin-amdil -fbin-exe"; error = clBuildProgram(program, 1, &devices[i], build_options, NULL, NULL); if(error != CL_SUCCESS) { // If there's an error whilst building the program, dump the log clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) malloc(log_size+1); program_log[log_size] = '\0'; clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("\n=== ERROR ===\n\n%s\n=============\n", program_log); free(program_log); exit(1); } /* Query the program as to how many kernels were detected */ cl_uint numOfKernels; error = clCreateKernelsInProgram(program, 0, NULL, &numOfKernels); if (error != CL_SUCCESS) { perror("Unable to retrieve kernel count from program"); exit(1); } cl_kernel* kernels = (cl_kernel*) alloca(sizeof(cl_kernel) * numOfKernels); error = clCreateKernelsInProgram(program, numOfKernels, kernels, NULL); /* Loop thru each kernel and execute on device */ for(cl_uint j = 0; j < numOfKernels; j++) { char kernelName[32]; cl_uint argCnt; clGetKernelInfo(kernels[j], CL_KERNEL_FUNCTION_NAME, sizeof(kernelName), kernelName, NULL); clGetKernelInfo(kernels[j], CL_KERNEL_NUM_ARGS, sizeof(argCnt), &argCnt, NULL); printf("Kernel name: %s with arity: %d\n", kernelName, argCnt); printf("About to create command queue and enqueue this kernel...\n"); /* Create a command queue */ cl_command_queue cQ = clCreateCommandQueue(context, devices[i], 0, &error); if (error != CL_SUCCESS) { perror("Unable to create command-queue"); exit(1); } /* Create a buffer and copy the data from the main buffer */ cl_mem outobj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float8) * DATA_SIZE, 0, &error); if (error != CL_SUCCESS) { perror("Unable to create sub-buffer object"); exit(1); } /* Let OpenCL know that the kernel is suppose to receive an argument */ error = clSetKernelArg(kernels[j], 0, sizeof(cl_mem), &inobj); error = clSetKernelArg(kernels[j], 1, sizeof(cl_mem), &outobj); if (error != CL_SUCCESS) { perror("Unable to set buffer object in kernel"); exit(1); } /* Enqueue the kernel to the command queue */ error = clEnqueueTask(cQ, kernels[j], 0, NULL, NULL); if (error != CL_SUCCESS) { perror("Unable to enqueue task to command-queue"); exit(1); } printf("Task has been enqueued successfully!\n"); /* Enqueue the read-back from device to host */ error = clEnqueueReadBuffer(cQ, outobj, CL_TRUE, // blocking read 0, // read from the start sizeof(cl_float8)*DATA_SIZE, // how much to copy ud_out, 0, NULL, NULL); /* Check the returned data */ if ( valuesOK(ud_in, ud_out, DATA_SIZE) ) { printf("Check passed!\n"); } else printf("Check failed!\n"); /* Release the command queue */ clReleaseCommandQueue(cQ); clReleaseMemObject(outobj); } /* Clean up */ for(cl_uint i = 0; i < numOfKernels; i++) { clReleaseKernel(kernels[i]); } for(int i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); } clReleaseProgram(program); }// end of device loop and execution clReleaseMemObject(inobj); clReleaseContext(context); }// end of platform loop free(ud_in); free(ud_out); }
int main( int argc, char **argv ) { struct pb_Parameters *params; params = pb_ReadParameters(&argc, argv); if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) { fprintf(stderr, "Expecting one input filename\n"); exit(-1); } int err = 0; if(argc != 3) err |= 1; else { char* numend; N = strtol(argv[1], &numend, 10); if(numend == argv[1]) err |= 2; B = strtol(argv[2], &numend, 10); if(numend == argv[2]) err |= 4; } if(err) { fprintf(stderr, "Expecting two integers for N and B\n"); exit(-1); } //8*1024*1024; int n_bytes = N * B* sizeof(float2); int nthreads = T; struct pb_TimerSet timers; pb_InitializeTimerSet(&timers); pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); float *shared_source =(float *)malloc(n_bytes); float2 *source = (float2 *)malloc( n_bytes ); float2 *result = (float2 *)calloc( N*B, sizeof(float2) ); inputData(params->inpFiles[0],(float*)source,N*B*2); // OpenCL Code cl_int clErrNum; pb_Context* pb_context; pb_context = pb_InitOpenCLContext(params); if (pb_context == NULL) { fprintf (stderr, "Error: No OpenCL platform/device can be found."); return -1; } cl_int clStatus; cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId; cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId; cl_context clContext = (cl_context) pb_context->clContext; cl_command_queue clCommandQueue; cl_program clProgram; cl_kernel fft_kernel; cl_mem d_source, d_work; //float2 *d_source, *d_work; cl_mem d_shared_source; //float *d_shared_source; clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clErrNum); OCL_ERRCK_VAR(clErrNum); pb_SetOpenCL(&clContext, &clCommandQueue); pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); const char *source_path = "src/opencl_nvidia/fft_kernel.cl"; char *sourceCode; sourceCode = readFile(source_path); if (sourceCode == NULL) { fprintf(stderr, "Could not load program source of '%s'\n", source_path); exit(1); } clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&sourceCode, NULL, &clErrNum); OCL_ERRCK_VAR(clErrNum); free(sourceCode); /* char compileOptions[1024]; // -cl-nv-verbose // Provides register info for NVIDIA devices // Set all Macros referenced by kernels sprintf(compileOptions, "\ -D PRESCAN_THREADS=%u\ -D KB=%u -D UNROLL=%u\ -D BINS_PER_BLOCK=%u -D BLOCK_X=%u", prescanThreads, lmemKB, UNROLL, bins_per_block, blockX ); */ OCL_ERRCK_RETVAL ( clBuildProgram(clProgram, 1, &clDevice, NULL /*compileOptions*/, NULL, NULL) ); char *build_log; size_t ret_val_size; OCL_ERRCK_RETVAL ( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size) ); build_log = (char *)malloc(ret_val_size+1); OCL_ERRCK_RETVAL ( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL) ); // to be careful, terminate with \0 build_log[ret_val_size] = '\0'; fprintf(stderr, "%s\n", build_log ); fft_kernel = clCreateKernel(clProgram, "GPU_FftShMem", &clErrNum); OCL_ERRCK_VAR(clErrNum); pb_SwitchToTimer(&timers, pb_TimerID_COPY); // allocate & copy device memory d_shared_source = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n_bytes, shared_source, &clErrNum); OCL_ERRCK_VAR(clErrNum); d_source = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n_bytes, source, &clErrNum); OCL_ERRCK_VAR(clErrNum); //result is initially zero'd out d_work = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n_bytes, result, &clErrNum); OCL_ERRCK_VAR(clErrNum); pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); size_t block[1] = { nthreads }; size_t grid[1] = { B*block[0] }; OCL_ERRCK_RETVAL( clSetKernelArg(fft_kernel, 0, sizeof(cl_mem), (void *)&d_source) ); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, fft_kernel, 1, 0, grid, block, 0, 0, 0) ); pb_SwitchToTimer(&timers, pb_TimerID_COPY); // copy device memory to host OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, d_source, CL_TRUE, 0, // Offset in bytes n_bytes, // Size of data to read result, // Host Source 0, NULL, NULL) ); if (params->outFile) { /* Write result to file */ pb_SwitchToTimer(&timers, pb_TimerID_IO); outputData(params->outFile, (float*)result, N*B*2); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); } OCL_ERRCK_RETVAL ( clReleaseMemObject(d_source) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(d_work) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(d_shared_source) ); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); free(shared_source); free(source); free(result); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); pb_DestroyTimerSet(&timers); return 0; }
// main() for simple buffer and sub-buffer example // int main(int argc, char** argv) { cl_int errNum; cl_uint numPlatforms; cl_uint numDevices; cl_platform_id * platformIDs; cl_device_id * deviceIDs; cl_context context; cl_program program; std::vector<cl_kernel> kernels; std::vector<cl_command_queue> queues; std::vector<cl_mem> buffers; int * inputOutput; std::cout << "Simple buffer and sub-buffer Example" << std::endl; // First, select an OpenCL platform to run on. errNum = clGetPlatformIDs(0, NULL, &numPlatforms); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); platformIDs = (cl_platform_id *)alloca(sizeof(cl_platform_id) * numPlatforms); std::cout << "Number of platforms: \t" << numPlatforms << std::endl; errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); std::ifstream srcFile("simple.cl"); checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading simple.cl"); std::string srcProg( std::istreambuf_iterator<char>(srcFile), (std::istreambuf_iterator<char>())); const char * src = srcProg.c_str(); size_t length = srcProg.length(); deviceIDs = NULL; DisplayPlatformInfo( platformIDs[PLATFORM_INDEX], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR"); errNum = clGetDeviceIDs( platformIDs[PLATFORM_INDEX], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND){ checkErr(errNum, "clGetDeviceIDs"); } deviceIDs = (cl_device_id *)alloca( sizeof(cl_device_id) * numDevices); errNum = clGetDeviceIDs( platformIDs[PLATFORM_INDEX], CL_DEVICE_TYPE_ALL, numDevices, &deviceIDs[0], NULL); checkErr(errNum, "clGetDeviceIDs"); cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformIDs[PLATFORM_INDEX], 0 }; context = clCreateContext( contextProperties, numDevices, deviceIDs, NULL, NULL, &errNum); checkErr(errNum, "clCreateContext"); // Create program from source program = clCreateProgramWithSource( context, 1, &src, &length, &errNum); checkErr(errNum, "clCreateProgramWithSource"); // Build program errNum = clBuildProgram( program, numDevices, deviceIDs, "-I.", NULL, NULL); if (errNum != CL_SUCCESS){ // Determine the reason for the error char buildLog[16384]; clGetProgramBuildInfo( program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL); std::cerr << "Error in OpenCL C source: " << std::endl; std::cerr << buildLog; checkErr(errNum, "clBuildProgram"); } // create buffers and sub-buffers inputOutput = new int[NUM_BUFFER_ELEMENTS * numDevices]; for (unsigned int i = 0; i < NUM_BUFFER_ELEMENTS * numDevices; i++) { inputOutput[i] = i; } // create a single buffer to cover all the input data cl_mem buffer = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices, NULL, &errNum); checkErr(errNum, "clCreateBuffer"); buffers.push_back(buffer); // now for all devices other than the first create a sub-buffer for (unsigned int i = 1; i < numDevices; i++) { cl_buffer_region region = { NUM_BUFFER_ELEMENTS * i * sizeof(int), NUM_BUFFER_ELEMENTS * sizeof(int) }; buffer = clCreateSubBuffer( buffers[0], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &errNum); checkErr(errNum, "clCreateSubBuffer"); buffers.push_back(buffer); } // Create command queues for (int i = 0; i < numDevices; i++) { InfoDevice<cl_device_type>::display(deviceIDs[i], CL_DEVICE_TYPE, "CL_DEVICE_TYPE"); cl_command_queue queue = clCreateCommandQueue( context, deviceIDs[i], 0, &errNum); checkErr(errNum, "clCreateCommandQueue"); queues.push_back(queue); cl_kernel kernel = clCreateKernel( program, "square", &errNum); checkErr(errNum, "clCreateKernel(square)"); errNum = clSetKernelArg( kernel, 0, sizeof(cl_mem), (void *)&buffers[i]); checkErr(errNum, "clSetKernelArg(square)"); kernels.push_back(kernel); // Write input data clEnqueueWriteBuffer( queues[0], buffers[0], CL_TRUE, 0, sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices, (void*)inputOutput, 0, NULL, NULL); std::vector<cl_event> events; // call kernel for each device for (int i = 0; i < queues.size(); i++) { cl_event event; size_t gWI = NUM_BUFFER_ELEMENTS; errNum = clEnqueueNDRangeKernel( queues[i], kernels[i], 1, NULL, (const size_t*)&gWI, (const size_t*)NULL, 0, 0, &event); events.push_back(event); } // Technically don't need this as we are doing a blocking read // with in-order queue. clWaitForEvents(events.size(), events.data()); // Read back computed data clEnqueueReadBuffer( queues[0], buffers[0], CL_TRUE, 0, sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices, (void*)inputOutput, 0, NULL, NULL); // Display output in rows for (unsigned i = 0; i < numDevices; i++) { for (unsigned elems = i * NUM_BUFFER_ELEMENTS; elems < ((i+1) * NUM_BUFFER_ELEMENTS); elems++) { std::cout << " " << inputOutput[elems]; } std::cout << std::endl; } std::cout << "Program completed successfully" << std::endl; return 0; } }
int main() { cl_int num_rand = 4096*256; /* The number of random numbers generated using one generator */ int count_all, i, num_generator = sizeof(mts)/sizeof(mts[0]); /* The number of generators */ double pi; cl_platform_id platform_id = NULL; cl_uint ret_num_platforms; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_program program = NULL; cl_kernel kernel_mt = NULL, kernel_pi = NULL; size_t kernel_code_size; char *kernel_src_str; cl_uint *result; cl_int ret; FILE *fp; cl_mem rand, count; size_t global_item_size[3], local_item_size[3]; cl_mem dev_mts; cl_event ev_mt_end, ev_pi_end, ev_copy_end; cl_ulong prof_start, prof_mt_end, prof_pi_end, prof_copy_end; clGetPlatformIDs(1, &platform_id, &ret_num_platforms); clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); result = (cl_uint*)malloc(sizeof(cl_uint)*num_generator); command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); fp = fopen("mt.cl", "r"); kernel_src_str = (char*)malloc(MAX_SOURCE_SIZE); kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); /* Create output buffer */ rand = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint)*num_rand*num_generator, NULL, &ret); count = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint)*num_generator, NULL, &ret); /* Build Program*/ program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str, (const size_t *)&kernel_code_size, &ret); clBuildProgram(program, 1, &device_id, "", NULL, NULL); kernel_mt = clCreateKernel(program, "genrand", &ret); kernel_pi = clCreateKernel(program, "calc_pi", &ret); /* Create input parameter */ dev_mts = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(mts), NULL, &ret); clEnqueueWriteBuffer(command_queue, dev_mts, CL_TRUE, 0, sizeof(mts), mts, 0, NULL, NULL); /* Set Kernel Arguments */ clSetKernelArg(kernel_mt, 0, sizeof(cl_mem), (void*)&rand); /* Random numbers (output of genrand) */ clSetKernelArg(kernel_mt, 1, sizeof(cl_mem), (void*)&dev_mts); /* MT parameter (input to genrand) */ clSetKernelArg(kernel_mt, 2, sizeof(num_rand), &num_rand); /* Number of random numbers to generate */ clSetKernelArg(kernel_pi, 0, sizeof(cl_mem), (void*)&count); /* Counter for points within circle (output of calc_pi) */ clSetKernelArg(kernel_pi, 1, sizeof(cl_mem), (void*)&rand); /* Random numbers (input to calc_pi) */ clSetKernelArg(kernel_pi, 2, sizeof(num_rand), &num_rand); /* Number of random numbers used */ global_item_size[0] = num_generator; global_item_size[1] = 1; global_item_size[2] = 1; local_item_size[0] = num_generator; local_item_size[1] = 1; local_item_size[2] = 1; /* Create a random number array */ clEnqueueNDRangeKernel(command_queue, kernel_mt, 1, NULL, global_item_size, local_item_size, 0, NULL, &ev_mt_end); /* Compute PI */ clEnqueueNDRangeKernel(command_queue, kernel_pi, 1, NULL, global_item_size, local_item_size, 0, NULL, &ev_pi_end); /* Get result */ clEnqueueReadBuffer(command_queue, count, CL_TRUE, 0, sizeof(cl_uint)*num_generator, result, 0, NULL, &ev_copy_end); /* Average the values of PI */ count_all = 0; for (i=0; i < num_generator; i++) { count_all += result[i]; } pi = ((double)count_all)/(num_rand * num_generator) * 4; printf("pi = %f\n", pi); /* Get execution time info */ clGetEventProfilingInfo(ev_mt_end, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &prof_start, NULL); clGetEventProfilingInfo(ev_mt_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_mt_end, NULL); clGetEventProfilingInfo(ev_pi_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_pi_end, NULL); clGetEventProfilingInfo(ev_copy_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_copy_end, NULL); printf(" mt: %f[ms]\n" " pi: %f[ms]\n" " copy: %f[ms]\n", (prof_mt_end - prof_start)/(1000000.0), (prof_pi_end - prof_mt_end)/(1000000.0), (prof_copy_end - prof_pi_end)/(1000000.0)); clReleaseEvent(ev_mt_end); clReleaseEvent(ev_pi_end); clReleaseEvent(ev_copy_end); clReleaseMemObject(rand); clReleaseMemObject(count); clReleaseKernel(kernel_mt); clReleaseKernel(kernel_pi); clReleaseProgram(program); clReleaseCommandQueue(command_queue); clReleaseContext(context); free(kernel_src_str); free(result); return 0; }
void InitOpenCL() { // 1. Get a platform. cl_platform_id platform; clGetPlatformIDs( 1, &platform, NULL ); // 2. Find a gpu device. cl_device_id device; clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); // 3. Create a context and command queue on that device. cl_context context = clCreateContext( NULL, 1, &device, NULL, NULL, NULL); queue = clCreateCommandQueue( context, device, 0, NULL ); // 4. Perform runtime source compilation, and obtain kernel entry point. std::ifstream file("scene.cl"); std::string source; if (file){ while(!file.eof()){ char line[256]; file.getline(line,255); source += std::string(line) + "\n"; } } if (source.length()==0) { std::string err = "fail to load shader"; } cl_ulong maxSize; clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), &maxSize, 0); const char* str = source.c_str(); cl_program program = clCreateProgramWithSource( context, 1, &str, NULL, NULL ); cl_int result = clBuildProgram( program, 1, &device, NULL, NULL, NULL ); if ( result ){ char* build_log; size_t log_size; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); build_log = new char[log_size+1]; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); build_log[log_size] = '\0'; if( log_size > 2 ) { std::cout << "build log: " << build_log << std::endl; } delete[] build_log; std::cout << "Error during compilation! (" << result << ")" << std::endl; } kernel = clCreateKernel( program, "tracekernel", NULL ); // 5. Create a data buffer. buffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY, kWidth * kHeight *sizeof(cl_float4), NULL, 0 ); viewTransform = clCreateBuffer( context, CL_MEM_READ_WRITE, 16 *sizeof(cl_float), NULL, 0 ); worldTransforms = clCreateBuffer( context, CL_MEM_READ_WRITE, 16 *sizeof(cl_float)*2, NULL, 0 ); clSetKernelArg(kernel, 0, sizeof(buffer), (void*) &buffer); clSetKernelArg(kernel, 1, sizeof(cl_uint), (void*) &kWidth); clSetKernelArg(kernel, 2, sizeof(cl_uint), (void*) &kWidth); clSetKernelArg(kernel, 3, sizeof(viewTransform), (void*) &viewTransform); clSetKernelArg(kernel, 4, sizeof(worldTransforms), (void*) &worldTransforms); }
int main(void) { const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float); // Generate the input array on the host. float h_a[ARRAY_SIZE]; float h_b[ARRAY_SIZE]; for (int i = 0; i < ARRAY_SIZE; i++) { h_a[i] = (float)i; h_b[i] = (float)(2 * i); } float h_c[ARRAY_SIZE]; FILE *fp; char *source_str; size_t source_size; fp = fopen("vectors_cl.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); // Get platform and device information cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); // Create an OpenCL context cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); // Create a command queue cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); // Create memory buffers on the device for each vector cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, ARRAY_BYTES, NULL, &ret); cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, ARRAY_BYTES, NULL, &ret); cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, ARRAY_BYTES, NULL, &ret); // Copy h_a and h_b to memory buffer ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_a, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_b, 0, NULL, NULL); // Create a program from the kernel source cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); if (ret != 0) { printf("clCreateProgramWithSource returned non-zero status %d\n\n", ret); exit(1); } // Build the program ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (ret != 0) { printf("clBuildProgram returned non-zero status %d: ", ret); if (ret == CL_INVALID_PROGRAM) { printf("invalid program\n"); } else if (ret == CL_INVALID_VALUE) { printf("invalid value\n"); } else if (ret == CL_INVALID_DEVICE) { printf("invalid device\n"); } else if (ret == CL_INVALID_BINARY) { printf("invalid binary\n"); } else if (ret == CL_INVALID_BUILD_OPTIONS) { printf("invalid build options\n"); } else if (ret == CL_INVALID_OPERATION) { printf("invalid operation\n"); } else if (ret == CL_COMPILER_NOT_AVAILABLE) { printf("compiler not available\n"); } else if (ret == CL_BUILD_PROGRAM_FAILURE) { printf("build program failure\n"); // Determine the size of the log size_t log_size; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); // Allocate memory for the log char *log = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); // Print the log printf("%s\n", log); } else if (ret == CL_OUT_OF_HOST_MEMORY) { printf("out of host memory\n"); } exit(1); } // Create the OpenCL kernel cl_kernel kernel = clCreateKernel(program, "add", &ret); // Set the arguments of the kernel ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj); size_t array_size = ARRAY_SIZE; ret = clSetKernelArg(kernel, 3, sizeof(const size_t), (void *)&array_size); // Execute the OpenCL kernel on the list size_t global_item_size = ARRAY_SIZE; // Process the entire lists size_t local_item_size = 1; // Divide work items into groups of 64 ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); // Read the memory buffer C on the device to the local variable C ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_c, 0, NULL, NULL); // Print out the resulting array. for (int i = 0; i < 8; i++) { printf("%d + %d = %d", (int)h_a[i], (int)h_b[i], (int)h_c[i]); printf(((i % 4) != 3) ? "\t" : "\n"); } printf("...\n"); for (int i = ARRAY_SIZE - 8; i < ARRAY_SIZE; i++) { printf("%d + %d = %d", (int)h_a[i], (int)h_b[i], (int)h_c[i]); printf(((i % 4) != 3) ? "\t" : "\n"); } // Clean up ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(a_mem_obj); ret = clReleaseMemObject(b_mem_obj); ret = clReleaseMemObject(c_mem_obj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); return 0; }
void buildOpenCLKernels_flux_calc_kernely(int xdim0, int ydim0, int xdim1, int ydim1, int xdim2, int ydim2, int xdim3, int ydim3) { // int ocl_fma = OCL_FMA; if (!isbuilt_flux_calc_kernely) { buildOpenCLKernels(); // clSafeCall( clUnloadCompiler() ); cl_int ret; char *source_filename[1] = {(char *)"./OpenCL/flux_calc_kernely.cl"}; // Load the kernel source code into the array source_str FILE *fid; char *source_str[1]; size_t source_size[1]; for (int i = 0; i < 1; i++) { fid = fopen(source_filename[i], "r"); if (!fid) { fprintf(stderr, "Can't open the kernel source file!\n"); exit(1); } source_str[i] = (char *)malloc(4 * 0x1000000); source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); if (source_size[i] != 4 * 0x1000000) { if (ferror(fid)) { printf("Error while reading kernel source file %s\n", source_filename[i]); exit(-1); } if (feof(fid)) printf("Kernel source file %s succesfuly read.\n", source_filename[i]); // printf("%s\n",source_str[i]); } fclose(fid); } printf("Compiling flux_calc_kernely %d source -- start \n", OCL_FMA); // Create a program from the source OPS_opencl_core.program = clCreateProgramWithSource( OPS_opencl_core.context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); clSafeCall(ret); // Build the program char buildOpts[255 * 4]; char *pPath = NULL; pPath = getenv("OPS_INSTALL_PATH"); if (pPath != NULL) if (OCL_FMA) sprintf(buildOpts, "-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d " "-Dxdim0_flux_calc_kernely=%d -Dydim0_flux_calc_kernely=%d " "-Dxdim1_flux_calc_kernely=%d -Dydim1_flux_calc_kernely=%d " "-Dxdim2_flux_calc_kernely=%d -Dydim2_flux_calc_kernely=%d " "-Dxdim3_flux_calc_kernely=%d -Dydim3_flux_calc_kernely=%d ", pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); else sprintf(buildOpts, "-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d " "-Dxdim0_flux_calc_kernely=%d -Dydim0_flux_calc_kernely=%d " "-Dxdim1_flux_calc_kernely=%d -Dydim1_flux_calc_kernely=%d " "-Dxdim2_flux_calc_kernely=%d -Dydim2_flux_calc_kernely=%d " "-Dxdim3_flux_calc_kernely=%d -Dydim3_flux_calc_kernely=%d ", pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); else { sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); exit(EXIT_FAILURE); } ret = clBuildProgram(OPS_opencl_core.program, 1, &OPS_opencl_core.device_id, buildOpts, NULL, NULL); if (ret != CL_SUCCESS) { char *build_log; size_t log_size; clSafeCall(clGetProgramBuildInfo( OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); build_log = (char *)malloc(log_size + 1); clSafeCall(clGetProgramBuildInfo( OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); build_log[log_size] = '\0'; fprintf( stderr, "=============== OpenCL Program Build Info ================\n\n%s", build_log); fprintf(stderr, "\n========================================================= \n"); free(build_log); exit(EXIT_FAILURE); } printf("compiling flux_calc_kernely -- done\n"); // Create the OpenCL kernel OPS_opencl_core.kernel[107] = clCreateKernel(OPS_opencl_core.program, "ops_flux_calc_kernely", &ret); clSafeCall(ret); isbuilt_flux_calc_kernely = true; } }
int main(int argc, char** argv) { int err; // error code returned from api calls float data[DATA_SIZE]; // original data set given to device float results[DATA_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array // Fill our data set with random float values // int i = 0; unsigned int count = DATA_SIZE; for(i = 0; i < count; i++) data[i] = rand() / (float)RAND_MAX; // Connect to a compute device // int gpu = 1; err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "square", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation // input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } // Validate our results // correct = 0; for(i = 0; i < count; i++) { if(results[i] == data[i] * data[i]) correct++; } // Print a brief summary detailing the results // printf("Computed '%d/%d' correct values!\n", correct, count); // Shutdown and cleanup // clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
/** * @brief Creates an array of objects containing the OpenCL variables of each device * @param trDataBase The training database which will contain the instances and the features * @param selInstances The instances choosen as initial centroids * @param transposedTrDataBase The training database already transposed * @param conf The structure with all configuration parameters * @return A pointer containing the objects */ CLDevice *createDevices(const float *const trDataBase, const int *const selInstances, const float *const transposedTrDataBase, Config *const conf) { /********** Find the OpenCL devices specified in configuration ***********/ // OpenCL variables cl_uint numPlatformsDevices; cl_device_type deviceType; cl_program program; cl_kernel kernel; cl_int status; // Others variables auto allDevices = getAllDevices(); CLDevice *devices = new CLDevice[conf -> nDevices + (conf -> ompThreads > 0)]; for (int dev = 0; dev < conf -> nDevices; ++dev) { bool found = false; for (int allDev = 0; allDev < allDevices.size() && !found; ++allDev) { // Get the specified OpenCL device char dbuff[120]; check(clGetDeviceInfo(allDevices[allDev], CL_DEVICE_NAME, sizeof(dbuff), dbuff, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_NAME); // If the device exists... if (conf -> devices[dev] == dbuff) { devices[dev].device = allDevices[allDev]; devices[dev].deviceName = dbuff; check(clGetDeviceInfo(devices[dev].device, CL_DEVICE_TYPE, sizeof(cl_device_type), &(devices[dev].deviceType), NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_TYPE); /********** Device local memory usage ***********/ long int usedMemory = conf -> nFeatures * sizeof(cl_uchar); // Chromosome of the individual usedMemory += conf -> trNInstances * sizeof(cl_uchar); // Mapping buffer usedMemory += conf -> K * conf -> nFeatures * sizeof(cl_float); // Centroids buffer usedMemory += conf -> trNInstances * sizeof(cl_float); // DistCentroids buffer usedMemory += conf -> K * sizeof(cl_int); // Samples_in_k buffer // Get the maximum local memory size long int maxMemory; check(clGetDeviceInfo(devices[dev].device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(long int), &maxMemory, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_MAXMEM); // Avoid exceeding the maximum local memory available. 1024 bytes of margin check(usedMemory > maxMemory - 1024, "%s:\n\tMax memory: %ld bytes\n\tAllow memory: %ld bytes\n\tUsed memory: %ld bytes\n", CL_ERROR_DEVICE_LOCALMEM, maxMemory, maxMemory - 1024, usedMemory); /********** Create context ***********/ devices[dev].context = clCreateContext(NULL, 1, &(devices[dev].device), 0, 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_CONTEXT); /********** Create Command queue ***********/ devices[dev].commandQueue = clCreateCommandQueue(devices[dev].context, devices[dev].device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_QUEUE); /********** Create kernel ***********/ // Open the file containing the kernels std::fstream kernels(conf -> kernelsFileName.c_str(), std::fstream::in); check(!kernels.is_open(), "%s\n", CL_ERROR_FILE_OPEN); // Obtain the size kernels.seekg(0, kernels.end); size_t fSize = kernels.tellg(); kernels.seekg(0, kernels.beg); char *kernelSource = new char[fSize]; kernels.read(kernelSource, fSize); kernels.close(); // Create program program = clCreateProgramWithSource(devices[dev].context, 1, (const char **) &kernelSource, &fSize, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_PROGRAM_BUILD); // Build program for the device in the context char buildOptions[196]; sprintf(buildOptions, "-I include -D N_INSTANCES=%d -D N_FEATURES=%d -D N_OBJECTIVES=%d -D K=%d -D MAX_ITER_KMEANS=%d", conf -> trNInstances, conf -> nFeatures, conf -> nObjectives, conf -> K, conf -> maxIterKmeans); if (clBuildProgram(program, 1, &(devices[dev].device), buildOptions, 0, 0) != CL_SUCCESS) { char buffer[4096]; fprintf(stderr, "Error: Could not build the program\n"); check(clGetProgramBuildInfo(program, devices[dev].device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_PROGRAM_ERRORS); check(true, "%s\n", buffer); } // Create kernel const char *kernelName = (devices[dev].deviceType == CL_DEVICE_TYPE_GPU) ? "kmeansGPU" : ""; devices[dev].kernel = clCreateKernel(program, kernelName, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_BUILD); /******* Work-items *******/ devices[dev].computeUnits = atoi(conf -> computeUnits[dev].c_str()); devices[dev].wiLocal = atoi(conf -> wiLocal[dev].c_str()); devices[dev].wiGlobal = devices[dev].computeUnits * devices[dev].wiLocal; /******* Create and write the databases and centroids buffers. Create the subpopulations buffer. Set kernel arguments *******/ // Create buffers devices[dev].objSubpopulations = clCreateBuffer(devices[dev].context, CL_MEM_READ_WRITE, conf -> familySize * sizeof(Individual), 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_SUBPOPS); devices[dev].objTrDataBase = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_TRDB); devices[dev].objTransposedTrDataBase = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_TTRDB); devices[dev].objSelInstances = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> K * sizeof(cl_int), 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_CENTROIDS); // Sets kernel arguments check(clSetKernelArg(devices[dev].kernel, 0, sizeof(cl_mem), (void *)&(devices[dev].objSubpopulations)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT1); check(clSetKernelArg(devices[dev].kernel, 1, sizeof(cl_mem), (void *)&(devices[dev].objSelInstances)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT2); check(clSetKernelArg(devices[dev].kernel, 2, sizeof(cl_mem), (void *)&(devices[dev].objTrDataBase)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT3); check(clSetKernelArg(devices[dev].kernel, 5, sizeof(cl_mem), (void *)&(devices[dev].objTransposedTrDataBase)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT6); // Write buffers check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objTrDataBase, CL_FALSE, 0, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), trDataBase, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_TRDB); check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objSelInstances, CL_FALSE, 0, conf -> K * sizeof(cl_int), selInstances, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_CENTROIDS); check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objTransposedTrDataBase, CL_FALSE, 0, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), transposedTrDataBase, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_TTRDB); // Resources used are released delete[] kernelSource; clReleaseProgram(program); found = true; allDevices.erase(allDevices.begin() + allDev); } } check(!found, "%s\n", CL_ERROR_DEVICE_FOUND); } /********** Add the CPU if has been enabled in configuration ***********/ if (conf -> ompThreads > 0) { devices[conf -> nDevices].deviceType = CL_DEVICE_TYPE_CPU; devices[conf -> nDevices].computeUnits = conf -> ompThreads; ++(conf -> nDevices); } return devices; }
int main() { char buf[]="Hello, World!"; size_t srcsize, worksize=strlen(buf); cl_int error; cl_platform_id platform; cl_device_id device; cl_uint platforms, devices; // Fetch the Platform and Device IDs; we only want one. error=clGetPlatformIDs(1, &platform, &platforms); error=clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, &devices); cl_context_properties properties[]={ CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; // Note that nVidia's OpenCL requires the platform property cl_context context=clCreateContext(properties, 1, &device, NULL, NULL, &error); cl_command_queue cq = clCreateCommandQueue(context, device, 0, &error); rot13(buf); // scramble using the CPU puts(buf); // Just to demonstrate the plaintext is destroyed //char src[8192]; //FILE *fil=fopen("rot13.cl","r"); //srcsize=fread(src, sizeof src, 1, fil); //fclose(fil); const char *src=rot13_cl; srcsize=strlen(rot13_cl); const char *srcptr[]={src}; // Submit the source code of the rot13 kernel to OpenCL cl_program prog=clCreateProgramWithSource(context, 1, srcptr, &srcsize, &error); // and compile it (after this we could extract the compiled version) error=clBuildProgram(prog, 0, NULL, "", NULL, NULL); // Allocate memory for the kernel to work with cl_mem mem1, mem2; mem1=clCreateBuffer(context, CL_MEM_READ_ONLY, worksize, NULL, &error); mem2=clCreateBuffer(context, CL_MEM_WRITE_ONLY, worksize, NULL, &error); // get a handle and map parameters for the kernel cl_kernel k_rot13=clCreateKernel(prog, "rot13", &error); clSetKernelArg(k_rot13, 0, sizeof(mem1), &mem1); clSetKernelArg(k_rot13, 1, sizeof(mem2), &mem2); // Target buffer just so we show we got the data from OpenCL char buf2[sizeof buf]; buf2[0]='?'; buf2[worksize]=0; // Send input data to OpenCL (async, don't alter the buffer!) error=clEnqueueWriteBuffer(cq, mem1, CL_FALSE, 0, worksize, buf, 0, NULL, NULL); // Perform the operation error=clEnqueueNDRangeKernel(cq, k_rot13, 1, NULL, &worksize, &worksize, 0, NULL, NULL); // Read the result back into buf2 error=clEnqueueReadBuffer(cq, mem2, CL_FALSE, 0, worksize, buf2, 0, NULL, NULL); // Await completion of all the above error=clFinish(cq); // Finally, output out happy message. puts(buf2); }
void btParticlesDynamicsWorld::initCLKernels(int argc, char** argv) { cl_int ciErrNum; if (!m_cxMainContext) { cl_device_type deviceType = CL_DEVICE_TYPE_ALL; m_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0, 0); int numDev = btOpenCLUtils::getNumDevices(m_cxMainContext); if (!numDev) { btAssert(0); exit(0);//this is just a demo, exit now } m_cdDevice = btOpenCLUtils::getDevice(m_cxMainContext,0); oclCHECKERROR(ciErrNum, CL_SUCCESS); btOpenCLDeviceInfo clInfo; btOpenCLUtils::getDeviceInfo(m_cdDevice,clInfo); btOpenCLUtils::printDeviceInfo(m_cdDevice); // create a command-queue m_cqCommandQue = clCreateCommandQueue(m_cxMainContext, m_cdDevice, 0, &ciErrNum); oclCHECKERROR(ciErrNum, CL_SUCCESS); } // Program Setup size_t program_length; #ifdef LOAD_FROM_MEMORY program_length = strlen(source); printf("OpenCL compiles ParticlesOCL.cl ... "); #else const char* fileName = "ParticlesOCL.cl"; FILE * fp = fopen(fileName, "rb"); char newFileName[512]; if (fp == NULL) { sprintf(newFileName,"..//%s",fileName); fp = fopen(newFileName, "rb"); if (fp) fileName = newFileName; } if (fp == NULL) { sprintf(newFileName,"Demos//ParticlesOpenCL//%s",fileName); fp = fopen(newFileName, "rb"); if (fp) fileName = newFileName; } if (fp == NULL) { sprintf(newFileName,"..//..//..//..//..//Demos//ParticlesOpenCL//%s",fileName); fp = fopen(newFileName, "rb"); if (fp) fileName = newFileName; else { printf("cannot find %s\n",newFileName); exit(0); } } // char *source = oclLoadProgSource(".//Demos//SpheresGrid//SpheresGrid.cl", "", &program_length); //char *source = btOclLoadProgSource(".//Demos//SpheresOpenCL//Shared//SpheresGrid.cl", "", &program_length); char *source = btOclLoadProgSource(fileName, "", &program_length); if(source == NULL) { printf("ERROR : OpenCL can't load file %s\n", fileName); } // oclCHECKERROR (source == NULL, oclFALSE); btAssert(source != NULL); // create the program printf("OpenCL compiles %s ...", fileName); #endif //LOAD_FROM_MEMORY //printf("%s\n", source); m_cpProgram = clCreateProgramWithSource(m_cxMainContext, 1, (const char**)&source, &program_length, &ciErrNum); oclCHECKERROR(ciErrNum, CL_SUCCESS); #ifndef LOAD_FROM_MEMORY free(source); #endif //LOAD_FROM_MEMORY //#define LOCAL_SIZE_LIMIT 1024U #define LOCAL_SIZE_MAX 1024U // Build the program with 'mad' Optimization option #ifdef MAC const char* flags = "-I. -DLOCAL_SIZE_MAX=1024U -cl-mad-enable -DMAC -DGUID_ARG"; #else const char* flags = "-I. -DLOCAL_SIZE_MAX=1024U -DGUID_ARG= "; #endif // build the program ciErrNum = clBuildProgram(m_cpProgram, 0, NULL, flags, NULL, NULL); if(ciErrNum != CL_SUCCESS) { // write out standard error // oclLog(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR); // write out the build log and ptx, then exit char cBuildLog[10240]; // char* cPtx; // size_t szPtxLength; clGetProgramBuildInfo(m_cpProgram, m_cdDevice, CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL ); // oclGetProgBinary(m_cpProgram, oclGetFirstDev(m_cxMainContext), &cPtx, &szPtxLength); // oclLog(LOGBOTH | CLOSELOG, 0.0, "\n\nLog:\n%s\n\n\n\n\nPtx:\n%s\n\n\n", cBuildLog, cPtx); printf("\n\n%s\n\n\n", cBuildLog); printf("Press ENTER key to terminate the program\n"); getchar(); exit(-1); } printf("OK\n"); // create the kernels postInitDeviceData(); initKernel(PARTICLES_KERNEL_COMPUTE_CELL_ID, "kComputeCellId"); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 1, sizeof(cl_mem), (void*) &m_dPos); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 2, sizeof(cl_mem), (void*) &m_dPosHash); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 3, sizeof(cl_mem), (void*) &m_dSimParams); oclCHECKERROR(ciErrNum, CL_SUCCESS); initKernel(PARTICLES_KERNEL_INTEGRATE_MOTION, "kIntegrateMotion"); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 1, sizeof(cl_mem), (void *) &m_dPos); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 2, sizeof(cl_mem), (void *) &m_dVel); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 3, sizeof(cl_mem), (void *) &m_dSimParams); oclCHECKERROR(ciErrNum, CL_SUCCESS); initKernel(PARTICLES_KERNEL_CLEAR_CELL_START, "kClearCellStart"); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 0, sizeof(int), (void *) &m_numGridCells); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 1, sizeof(cl_mem), (void*) &m_dCellStart); initKernel(PARTICLES_KERNEL_FIND_CELL_START, "kFindCellStart"); // ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 0, sizeof(int), (void*) &m_numParticles); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 1, sizeof(cl_mem), (void*) &m_dPosHash); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 2, sizeof(cl_mem), (void*) &m_dCellStart); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 3, sizeof(cl_mem), (void*) &m_dPos); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 4, sizeof(cl_mem), (void*) &m_dVel); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 5, sizeof(cl_mem), (void*) &m_dSortedPos); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 6, sizeof(cl_mem), (void*) &m_dSortedVel); oclCHECKERROR(ciErrNum, CL_SUCCESS); initKernel(PARTICLES_KERNEL_COLLIDE_PARTICLES, "kCollideParticles"); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 1, sizeof(cl_mem), (void*) &m_dVel); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 2, sizeof(cl_mem), (void*) &m_dSortedPos); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 3, sizeof(cl_mem), (void*) &m_dSortedVel); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 4, sizeof(cl_mem), (void*) &m_dPosHash); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 5, sizeof(cl_mem), (void*) &m_dCellStart); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 6, sizeof(cl_mem), (void*) &m_dSimParams); initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL, "kBitonicSortCellIdLocal"); initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL_1, "kBitonicSortCellIdLocal1"); initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL, "kBitonicSortCellIdMergeGlobal"); initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL, "kBitonicSortCellIdMergeLocal"); }
int main() { srand(unsigned(time(nullptr))); int err; // error code returned from api calls cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel // OpenCL device memory for matrices cl_mem d_A; cl_mem d_B; cl_mem d_C; // set seed for rand() srand(2014); //Allocate host memory for matrices A and B unsigned int size_A = WA * HA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A = (float*)malloc(mem_size_A); unsigned int size_B = WB * HB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B = (float*)malloc(mem_size_B); //Initialize host memory randomMemInit(h_A, size_A); randomMemInit(h_B, size_B); //Allocate host memory for the result C unsigned int size_C = WC * HC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*)malloc(mem_size_C); printf("Initializing OpenCL device...\n"); cl_uint dev_cnt = 0; clGetPlatformIDs(0, 0, &dev_cnt); cl_platform_id platform_ids[100]; clGetPlatformIDs(dev_cnt, platform_ids, NULL); // Connect to a compute device int gpu = 1; err = clGetDeviceIDs(platform_ids[0], gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS){ printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context){ printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands){ printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source file char *KernelSource; long lFileSize = LoadOpenCLKernel("matrixmul_kernel.cl", &KernelSource); if (lFileSize < 0L){ perror("File read failed"); return 1; } //const char* KernelSource = loadKernelCPP(".\\matrixmul_kernel.cl"); program = clCreateProgramWithSource(context, 1, (const char **)&KernelSource, NULL, &err); if (!program){ printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS){ size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run kernel = clCreateKernel(program, "matrixMul", &err); if (!kernel || err != CL_SUCCESS){ printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation d_C = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_A, NULL, &err); d_A = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &err); d_B = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &err); if (!d_A || !d_B || !d_C){ printf("Error: Failed to allocate device memory!\n"); exit(1); } printf("Running matrix multiplication for matrices A (%dx%d) and B (%dx%d) ...\n", WA, HA, WB, HB); //Launch OpenCL kernel size_t localWorkSize[2], globalWorkSize[2]; int wA = WA; int wC = WC; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_C); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_A); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_B); err |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&wA); err |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&wC); if (err != CL_SUCCESS){ printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } localWorkSize[0] = 16; localWorkSize[1] = 16; globalWorkSize[0] = 1024; globalWorkSize[1] = 1024; err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); if (err != CL_SUCCESS){ printf("Error: Failed to execute kernel! %d\n", err); exit(1); } //Retrieve result from device err = clEnqueueReadBuffer(commands, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL); if (err != CL_SUCCESS){ printf("Error: Failed to read output array! %d\n", err); exit(1); } //print table A printf("\nMatrix A\n"); for (int i = 0; i < size_A; i++){ printf("%f\t", h_A[i]); if (((i + 1) % WA) == 0) printf("\n"); } //print table B printf("\nMatrix B\n"); for (int i = 0; i < size_B; i++){ printf("%f\t", h_B[i]); if (((i + 1) % WB) == 0) printf("\n"); } //print out the results printf("\nMatrix C (Results)\n"); for (int i = 0; i < size_C; i++){ printf("%f\t", h_C[i]); if (((i + 1) % WC) == 0) printf("\n"); } printf("\n"); printf("Matrix multiplication completed...\n"); //Shutdown and cleanup free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_A); clReleaseMemObject(d_C); clReleaseMemObject(d_B); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); std::cin.clear(); std::cin.sync(); std::cin.get(); }
void initopencl(void) { int i; // Get Platform and Device Info CL_CHECK(clGetPlatformIDs(1, &platform_id, &num_platforms)); // Currently this program only runs on a SINGLE GPU. CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_devices)); printf("=== %d OpenCL platform(s) found: ===\n", num_platforms); printf("=== %d OpenCL device(s) found on platform:\n", num_devices); char buffer[10240]; cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- %d --\n", i); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf(" DEVICE_NAME = %s\n", buffer); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VENDOR = %s\n", buffer); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL)); printf(" DRIVER_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL)); printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); if (num_devices == 0) { fprintf(stderr, "No Devices found that can run OpenCL."); exit(0); } // Create OpenCL context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating context: Function returned %d \n\n", ret); exit(1); } // Create Command Queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating command Queue: Function returned %d \n\n", ret); exit(1); } // Load the kernel source code into the array source_str FILE *fp; char *source_str; size_t source_size; fp = fopen("integrate.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); // Create a program from the kernel source program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a program for integration3D. %d \n\n", (int)ret); exit(1); } // Build the program ret = clBuildProgram(program, 1, &device_id, "-DUSE_DOUBLE=1", NULL, NULL); if (ret != CL_SUCCESS) { size_t length; char buffer[10240]; clGetProgramBuildInfo(program, 1, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length); fprintf(stderr, "Error returned %d. \n\n", (int)ret); printf("Error Log: \n\n %s \n\n", buffer); exit(0); } /* // Create the OpenCL kernel (compute_points_Unstructure3D_1) kernel1 = clCreateKernel(program, "compute_points_Unstructure3D_1", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for compute_points_Unstructure3D_1. \n\n"); exit(1); } */ // Create the OpenCL kernel (check_int) kernel2 = clCreateKernel(program, "check_int", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for check_int. %d \n\n", (int)ret); exit(1); } // Create the OpenCL kernel (compute_points_Unstructure3D_1) kernel1 = clCreateKernel(program, "compute_points_Unstructure3D_1", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for compute_points_Unstructure3D_1. \n\n"); exit(1); } // Create the OpenCL kernel (initialize_timestep3D) kernel3 = clCreateKernel(program, "initialize_timestep3D", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for initialize_timestep3D. \n\n"); exit(1); } // Create the OpenCL kernel (initialize_timestep3D) kernel4 = clCreateKernel(program, "LocalSearch3D", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for LocalSearch3D. \n\n"); exit(1); } // Create the OpenCL kernel (initialize_timestep3D) kernel5 = clCreateKernel(program, "compute_points_Unstructure3D_2", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for LocalSearch3D. \n\n"); exit(1); } printf("\n\n"); }
int vadd(void) { // Create the two input vectors int i; const int LIST_SIZE = 1024; int *A = (int*)malloc(sizeof(int)*LIST_SIZE); int *B = (int*)malloc(sizeof(int)*LIST_SIZE); for(i = 0; i < LIST_SIZE; i++) { A[i] = i; B[i] = LIST_SIZE - i; } // Load the kernel source code into the array source_str FILE *fp; char *source_str; size_t source_size; const char *fname = "/home/ckit/program/workspace_java/OpenCLHookSample/jni/vector_add_kernel.cl"; // const char *fname = "vadd.ir"; fp = fopen(fname, "rb"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); // Get platform and device information cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); printf("clGetPlatformIDs err=%d,num_platforms=%d, platform_id=%x\n", ret, ret_num_platforms, (unsigned int)platform_id ); //#define XXX CL_DEVICE_TYPE_DEFAULT // #define XXX CL_DEVICE_TYPE_ALL // #define XXX CL_DEVICE_TYPE_GPU #define XXX CL_DEVICE_TYPE_CPU cl_uint num_platforms = 2; cl_platform_id* platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)* num_platforms); if(NULL == platforms){ printf("malloc err!\n"); } ret = clGetPlatformIDs(2, platforms, &ret_num_platforms); printf("clGetPlatformIDs err=%d,num_platforms=%d, platform_id=%x\n", ret, ret_num_platforms, (unsigned int)platforms[1] ); ret = clGetDeviceIDs( platforms[0], XXX, 1, &device_id, &ret_num_devices); printf("clGetDeviceIDs err=%d,num_platforms=%d, device_id=%x\n", ret, ret_num_platforms, (unsigned int)device_id ); char name[64]; ret = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(char)*64, name, NULL); printf("device_name : %s\n", name); // Create an OpenCL context // cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); cl_context_properties cps[3] = { (cl_context_properties)CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[0], (cl_context_properties)0 }; cl_context context = clCreateContextFromType( cps, XXX, NULL, NULL, &ret); printf("clCreateContextFromType err=%d,device_type=%x\n", ret, (unsigned int)XXX); // Create a command queue cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); // Create memory buffers on the device for each vector cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); // Copy the lists A and B to their respective memory buffers ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), A, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), B, 0, NULL, NULL); // Create a program from the kernel source cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); // cl_int status; // cl_int err; // cl_program program = clCreateProgramWithBinary( // context, 1, &device_id, &source_size, (const unsigned char **)&source_str, &status, &err); // Build the program ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); // Create the OpenCL kernel cl_kernel kernel = clCreateKernel(program, "vector_add", &ret); // Set the arguments of the kernel ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj); // Execute the OpenCL kernel on the list size_t global_item_size = LIST_SIZE; // Process the entire lists size_t local_item_size = 64; // Process in groups of 64 ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); // Read the memory buffer C on the device to the local variable C int *C = (int*)malloc(sizeof(int)*LIST_SIZE); ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), C, 0, NULL, NULL); // Display the result to the screen for(i = 0; i < /*LIST_SIZE*/10; i++) printf("%d + %d = %d\n", A[i], B[i], C[i]); // Clean up ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(a_mem_obj); ret = clReleaseMemObject(b_mem_obj); ret = clReleaseMemObject(c_mem_obj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); free(A); free(B); free(C); return 0; }
int main( int argc, char* argv[] ) { // Length of vectors unsigned int n = 100000; // Host input vectors double *h_a; double *h_b; // Host output vector double *h_c; // Device input buffers cl_mem d_a; cl_mem d_b; // Device output buffer cl_mem d_c; cl_platform_id cpPlatform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel kernel; // kernel // Size, in bytes, of each vector size_t bytes = n*sizeof(double); // Allocate memory for each vector on host h_a = (double*)malloc(bytes); h_b = (double*)malloc(bytes); h_c = (double*)malloc(bytes); // Initialize vectors on host int i; for( i = 0; i < n; i++ ) { h_a[i] = sinf(i)*sinf(i); h_b[i] = cosf(i)*cosf(i); } size_t globalSize, localSize; cl_int err; // Number of work items in each local work group localSize = 64; // Number of total work items - localSize must be devisor globalSize = ceil(n/(float)localSize)*localSize; // Bind to platform err = clGetPlatformIDs(1, &cpPlatform, NULL); // Get ID for the device err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); // Create a command queue queue = clCreateCommandQueue(context, device_id, 0, &err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelSource, NULL, &err); // Build the program executable clBuildProgram(program, 0, NULL, NULL, NULL, NULL); // Create the compute kernel in the program we wish to run kernel = clCreateKernel(program, "vecAdd", &err); // Create the input and output arrays in device memory for our calculation d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL); // Write our data set into the input array in device memory err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, bytes, h_a, 0, NULL, NULL); err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0, bytes, h_b, 0, NULL, NULL); // Set the arguments to our compute kernel err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c); err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n); // Execute the kernel over the entire range of the data set err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); // Wait for the command queue to get serviced before reading back results clFinish(queue); // Read the results from the device clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0, bytes, h_c, 0, NULL, NULL ); //Sum up vector c and print result divided by n, this should equal 1 within error double sum = 0; for(i=0; i<n; i++) sum += h_c[i]; printf("final result: %f\n", sum/n); // release OpenCL resources clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseContext(context); //release host memory free(h_a); free(h_b); free(h_c); return 0; }
/** * @brief Create an OpenCL program given a set of source kernel files. * * @param zone OpenCL zone where to create program. * @param kernelFiles Array of strings identifying filenames containing kernels. * @param numKernelFiles Number of strings identifying filenames containing kernels. * @param compilerOpts OpenCL compiler options. * @param err Error structure, to be populated if an error occurs. * @return @link clu_error_codes::CLU_SUCCESS @endlink operation * successfully completed or another value of #clu_error_codes if an * error occurs. */ int clu_program_create(CLUZone* zone, char** kernelFiles, cl_uint numKernelFiles, char* compilerOpts, GError **err) { /* Helper variables */ cl_int ocl_status, ocl_build_status; int ret_status; char * build_log = NULL; size_t logsize; char** source = NULL; /* Import kernels */ source = (char**) malloc(numKernelFiles * sizeof(char*)); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, NULL == source, ret_status = CLU_ERROR_NOALLOC, error_handler, "Unable to allocate memory for kernels source file."); for (unsigned int i = 0; i < numKernelFiles; i++) { source[i] = NULL; } for (unsigned int i = 0; i < numKernelFiles; i++) { source[i] = clu_source_load(kernelFiles[i], err); gef_if_error_goto(*err, GEF_USE_GERROR, ret_status, error_handler); } /* Load kernels sources and create program */ cl_program program = clCreateProgramWithSource( zone->context, numKernelFiles, (const char**) source, NULL, &ocl_status); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != ocl_status, ret_status = CLU_OCL_ERROR, error_handler, "Create program with source (OpenCL error %d :%s).", ocl_status, clerror_get(ocl_status)); /* Perform runtime source compilation of program */ ocl_build_status = clBuildProgram( program, 1, &zone->device_info.device_id, compilerOpts, NULL, NULL); /* Check for errors. */ if (ocl_build_status != CL_SUCCESS) { /* If where here it's because program failed to build. However, error will only be thrown after getting build information. */ /* Get build log size. */ ocl_status = clGetProgramBuildInfo( program, zone->device_info.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logsize); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != ocl_status, ret_status = CLU_OCL_ERROR, error_handler, "Error getting program build info (log size, OpenCL error %d: %s) after program failed to build (OpenCL error %d: %s).", ocl_status, clerror_get(ocl_status), ocl_build_status, clerror_get(ocl_build_status)); /* Alocate memory for build log. */ build_log = (char*) malloc(logsize); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, NULL == build_log, ret_status = CLU_ERROR_NOALLOC, error_handler, "Unable to allocate memory for build log after program failed to build with OpenCL error %d (%s).", ocl_build_status, clerror_get(ocl_build_status)); /* Get build log. */ ocl_status = clGetProgramBuildInfo( program, zone->device_info.device_id, CL_PROGRAM_BUILD_LOG, logsize, build_log, NULL); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != ocl_status, ret_status = CLU_OCL_ERROR, error_handler, "Error getting program build info (build log, OpenCL error %d: %s) after program failed to build (OpenCL error %d: %s).", ocl_status, clerror_get(ocl_status), ocl_build_status, clerror_get(ocl_build_status)); /* Throw error. */ gef_if_error_create_goto( *err, CLU_UTILS_ERROR, 1, ret_status = CLU_OCL_ERROR, error_handler, "Failed to build program (OpenCL error %d: %s). \n\n **** Start of build log **** \n\n%s\n **** End of build log **** \n", ocl_build_status, clerror_get(ocl_build_status), build_log); } zone->program = program; /* If we got here, everything is OK. */ g_assert (err == NULL || *err == NULL); ret_status = CLU_SUCCESS; goto finish; error_handler: /* If we got here there was an error, verify that it is so. */ g_assert (err == NULL || *err != NULL); finish: /* Free stuff. */ if (source != NULL) { for (unsigned int i = 0; i < numKernelFiles; i++) { if (source[i] != NULL) { clu_source_free(source[i]); } } free(source); } if (build_log != NULL) { free(build_log); } /* Return. */ return ret_status; }
int main() { // Set the image rotation (in degrees) float theta = 3.14159/6; float cos_theta = cosf(theta); float sin_theta = sinf(theta); printf("theta = %f (cos theta = %f, sin theta = %f)\n", theta, cos_theta, sin_theta); // Rows and columns in the input image int imageHeight; int imageWidth; const char* inputFile = "input.bmp"; const char* outputFile = "output.bmp"; // Homegrown function to read a BMP from file float* inputImage = readImage(inputFile, &imageWidth, &imageHeight); // Size of the input and output images on the host int dataSize = imageHeight*imageWidth*sizeof(float); // Output image on the host float* outputImage = NULL; outputImage = (float*)malloc(dataSize); // Set up the OpenCL environment cl_int status; // Discovery platform cl_platform_id platforms[2]; cl_platform_id platform; status = clGetPlatformIDs(2, platforms, NULL); chk(status, "clGetPlatformIDs"); platform = platforms[PLATFORM_TO_USE]; // Discover device cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); chk(status, "clGetDeviceIDs"); // Create context cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platform), 0}; cl_context context; context = clCreateContext(props, 1, &device, NULL, NULL, &status); chk(status, "clCreateContext"); // Create command queue cl_command_queue queue; queue = clCreateCommandQueue(context, device, 0, &status); chk(status, "clCreateCommandQueue"); // Create the input and output buffers cl_mem d_input; d_input = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, &status); chk(status, "clCreateBuffer"); cl_mem d_output; d_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL, &status); chk(status, "clCreateBuffer"); // Copy the input image to the device status = clEnqueueWriteBuffer(queue, d_input, CL_TRUE, 0, dataSize, inputImage, 0, NULL, NULL); chk(status, "clEnqueueWriteBuffer"); const char* source = readSource("rotation.cl"); // Create a program object with source and build it cl_program program; program = clCreateProgramWithSource(context, 1, &source, NULL, NULL); chk(status, "clCreateProgramWithSource"); status = clBuildProgram(program, 1, &device, NULL, NULL, NULL); chk(status, "clBuildProgram"); // Create the kernel object cl_kernel kernel; kernel = clCreateKernel(program, "img_rotate", &status); chk(status, "clCreateKernel"); // Set the kernel arguments status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_output); status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_input); status |= clSetKernelArg(kernel, 2, sizeof(int), &imageWidth); status |= clSetKernelArg(kernel, 3, sizeof(int), &imageHeight); status |= clSetKernelArg(kernel, 4, sizeof(float), &sin_theta); status |= clSetKernelArg(kernel, 5, sizeof(float), &cos_theta); chk(status, "clSetKernelArg"); // Set the work item dimensions size_t globalSize[2] = {imageWidth, imageHeight}; status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0, NULL, NULL); chk(status, "clEnqueueNDRange"); // Read the image back to the host status = clEnqueueReadBuffer(queue, d_output, CL_TRUE, 0, dataSize, outputImage, 0, NULL, NULL); chk(status, "clEnqueueReadBuffer"); // Write the output image to file storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile); return 0; }
int main() { typedef float ScalarType; ///////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////// Part 1: Set up a custom context and perform a sample operation. //////////////// //////////////////////// This is rather lengthy due to the OpenCL framework. //////////////// //////////////////////// The following does essentially the same as the //////////////// //////////////////////// 'custom_kernels'-tutorial! //////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////// //manually set up a custom OpenCL context: std::vector<cl_device_id> device_id_array; //get all available devices viennacl::ocl::platform pf; std::cout << "Platform info: " << pf.info() << std::endl; std::vector<viennacl::ocl::device> devices = pf.devices(CL_DEVICE_TYPE_DEFAULT); std::cout << devices[0].name() << std::endl; std::cout << "Number of devices for custom context: " << devices.size() << std::endl; //set up context using all found devices: for (size_t i=0; i<devices.size(); ++i) { device_id_array.push_back(devices[i].id()); } std::cout << "Creating context..." << std::endl; cl_int err; cl_context my_context = clCreateContext(0, device_id_array.size(), &(device_id_array[0]), NULL, NULL, &err); VIENNACL_ERR_CHECK(err); //create two Vectors: unsigned int vector_size = 10; std::vector<ScalarType> vec1(vector_size); std::vector<ScalarType> vec2(vector_size); std::vector<ScalarType> result(vector_size); // // fill the operands vec1 and vec2: // for (unsigned int i=0; i<vector_size; ++i) { vec1[i] = static_cast<ScalarType>(i); vec2[i] = static_cast<ScalarType>(vector_size-i); } // // create memory in OpenCL context: // cl_mem mem_vec1 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec1[0]), &err); VIENNACL_ERR_CHECK(err); cl_mem mem_vec2 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec2[0]), &err); VIENNACL_ERR_CHECK(err); cl_mem mem_result = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(result[0]), &err); VIENNACL_ERR_CHECK(err); // // create a command queue for each device: // std::vector<cl_command_queue> queues(devices.size()); for (size_t i=0; i<devices.size(); ++i) { queues[i] = clCreateCommandQueue(my_context, devices[i].id(), 0, &err); VIENNACL_ERR_CHECK(err); } // // create and build a program in the context: // size_t source_len = std::string(my_compute_program).length(); cl_program my_prog = clCreateProgramWithSource(my_context, 1, &my_compute_program, &source_len, &err); err = clBuildProgram(my_prog, 0, NULL, NULL, NULL, NULL); /* char buffer[1024]; cl_build_status status; clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL); clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_LOG, sizeof(char)*1024, &buffer, NULL); std::cout << "Build Scalar: Err = " << err << " Status = " << status << std::endl; std::cout << "Log: " << buffer << std::endl;*/ VIENNACL_ERR_CHECK(err); // // create a kernel from the program: // const char * kernel_name = "elementwise_prod"; cl_kernel my_kernel = clCreateKernel(my_prog, kernel_name, &err); VIENNACL_ERR_CHECK(err); // // Execute elementwise_prod kernel on first queue: result = vec1 .* vec2; // err = clSetKernelArg(my_kernel, 0, sizeof(cl_mem), (void*)&mem_vec1); VIENNACL_ERR_CHECK(err); err = clSetKernelArg(my_kernel, 1, sizeof(cl_mem), (void*)&mem_vec2); VIENNACL_ERR_CHECK(err); err = clSetKernelArg(my_kernel, 2, sizeof(cl_mem), (void*)&mem_result); VIENNACL_ERR_CHECK(err); err = clSetKernelArg(my_kernel, 3, sizeof(unsigned int), (void*)&vector_size); VIENNACL_ERR_CHECK(err); size_t global_size = vector_size; size_t local_size = vector_size; err = clEnqueueNDRangeKernel(queues[0], my_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL); VIENNACL_ERR_CHECK(err); // // Read and output result: // err = clEnqueueReadBuffer(queues[0], mem_vec1, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(vec1[0]), 0, NULL, NULL); VIENNACL_ERR_CHECK(err); err = clEnqueueReadBuffer(queues[0], mem_result, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(result[0]), 0, NULL, NULL); VIENNACL_ERR_CHECK(err); std::cout << "vec1 : "; for (size_t i=0; i<vec1.size(); ++i) std::cout << vec1[i] << " "; std::cout << std::endl; std::cout << "vec2 : "; for (size_t i=0; i<vec2.size(); ++i) std::cout << vec2[i] << " "; std::cout << std::endl; std::cout << "result: "; for (size_t i=0; i<result.size(); ++i) std::cout << result[i] << " "; std::cout << std::endl; //////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////// Part 2: Let ViennaCL use the already created context: ////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////// //Tell ViennaCL to use the previously created context. //This context is assigned an id '0' when using viennacl::ocl::switch_context(). viennacl::ocl::setup_context(0, my_context, device_id_array, queues); viennacl::ocl::switch_context(0); //activate the new context (only mandatory with context-id not equal to zero) // // Proof that ViennaCL really uses the new context: // std::cout << "Existing context: " << my_context << std::endl; std::cout << "ViennaCL uses context: " << viennacl::ocl::current_context().handle().get() << std::endl; // // Wrap existing OpenCL objects into ViennaCL: // viennacl::vector<ScalarType> vcl_vec1(mem_vec1, vector_size); viennacl::vector<ScalarType> vcl_vec2(mem_vec2, vector_size); viennacl::vector<ScalarType> vcl_result(mem_result, vector_size); viennacl::scalar<ScalarType> vcl_s = 2.0; std::cout << "Standard vector operations within ViennaCL:" << std::endl; vcl_result = vcl_s * vcl_vec1 + vcl_vec2; std::cout << "vec1 : "; std::cout << vcl_vec1 << std::endl; std::cout << "vec2 : "; std::cout << vcl_vec2 << std::endl; std::cout << "result: "; std::cout << vcl_result << std::endl; // // We can also reuse the existing elementwise_prod kernel. // Therefore, we first have to make the existing program known to ViennaCL // For more details on the three lines, see tutorial 'custom-kernels' // std::cout << "Using existing kernel within the OpenCL backend of ViennaCL:" << std::endl; viennacl::ocl::program & my_vcl_prog = viennacl::ocl::current_context().add_program(my_prog, "my_compute_program"); viennacl::ocl::kernel & my_vcl_kernel = my_vcl_prog.add_kernel("elementwise_prod"); viennacl::ocl::enqueue(my_vcl_kernel(vcl_vec1, vcl_vec2, vcl_result, static_cast<cl_uint>(vcl_vec1.size()))); //Note that size_t might differ between host and device. Thus, a cast to cl_uint is necessary here. std::cout << "vec1 : "; std::cout << vcl_vec1 << std::endl; std::cout << "vec2 : "; std::cout << vcl_vec2 << std::endl; std::cout << "result: "; std::cout << vcl_result << std::endl; // // Since a linear piece of memory can be interpreted in several ways, // we will now create a 3x3 row-major matrix out of the linear memory in mem_vec1/ // The first three entries in vcl_vec2 and vcl_result are used to carry out matrix-vector products: // viennacl::matrix<ScalarType> vcl_matrix(mem_vec1, 3, 3); vcl_vec2.resize(3); //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied) vcl_result.resize(3); //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied) vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_vec2); std::cout << "result of matrix-vector product: "; std::cout << vcl_result << std::endl; // // That's it. // std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl; return 0; }
int NBody::setupCL() { cl_int status = CL_SUCCESS; cl_uint numPlatforms; cl_platform_id platform = NULL; assert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS); assert(numPlatforms > 0); cl_platform_id* platforms = new cl_platform_id[numPlatforms]; assert(clGetPlatformIDs(numPlatforms, platforms, NULL) == CL_SUCCESS); platform = platforms[0]; delete[] platforms; cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); assert(status == CL_SUCCESS); size_t deviceListSize; /* First, get the size of device list data */ assert(clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize) == 0); int deviceCount = (int)(deviceListSize / sizeof(cl_device_id)); assert(deviceCount > 0); devices = (cl_device_id*)malloc(deviceListSize); assert(clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL) == 0); commandQueue = clCreateCommandQueue(context, devices[0], 0, &status); assert(status == CL_SUCCESS); /* * Create and initialize memory objects */ /* Create memory objects for position */ currPos = clCreateBuffer(context, CL_MEM_READ_WRITE, numBodies * sizeof(cl_float4), 0, &status); assert(status == CL_SUCCESS); /* Initialize position buffer */ status = clEnqueueWriteBuffer(commandQueue, currPos, 1, 0, numBodies * sizeof(cl_float4), pos, 0, 0, 0); assert(status == CL_SUCCESS); /* Create memory objects for position */ newPos = clCreateBuffer(context, CL_MEM_READ_WRITE, numBodies * sizeof(cl_float4), 0, &status); assert(status == CL_SUCCESS); /* Create memory objects for velocity */ currVel = clCreateBuffer(context, CL_MEM_READ_WRITE, numBodies * sizeof(cl_float4), 0, &status); assert(status == CL_SUCCESS); /* Initialize velocity buffer */ status = clEnqueueWriteBuffer(commandQueue, currVel, 1, 0, numBodies * sizeof(cl_float4), vel, 0, 0, 0); assert(status == CL_SUCCESS); /* Create memory objects for velocity */ newVel = clCreateBuffer(context, CL_MEM_READ_ONLY, numBodies * sizeof(cl_float4), 0, &status); assert(status == CL_SUCCESS); std::ifstream in("NBody_Kernels.cl", std::ios_base::in); if (!in.good()) { std::cout << "Failed to load kernel file : " << "NBody_Kernels.cl" << std::endl; abort(); } in.seekg(0, std::ios::end); size_t length = size_t(in.tellg()); in.seekg(0, std::ios::beg); // Allocate memory for the code char *source = new char[length+1]; // Read data as a block in.read(source,length); source[length] = '\0'; in.close(); const char *constSrc = source; program = clCreateProgramWithSource(context, 1, &constSrc, &length, &status); assert(status == CL_SUCCESS); // Get additional options /* create a cl program executable for all the devices specified */ assert(clBuildProgram(program, 1, devices, NULL, NULL, NULL) == CL_SUCCESS); /* get a kernel object handle for a kernel with the given name */ kernel = clCreateKernel(program, "nbody_sim", &status); assert(status == CL_SUCCESS); delete [] source; return 0; }
int main(int argc, char **argv) { if (find_option(argc, argv, "-h") >= 0) { printf("Options:\n"); printf("-h to see this help\n"); printf("-n <int> to set the number of particles\n"); printf("-o <filename> to specify the output file name\n"); printf("-s <filename> to specify the summary output file name\n"); return 0; } int n = read_int(argc, argv, "-n", 1000); char *savename = read_string(argc, argv, "-o", NULL); char *sumname = read_string(argc, argv, "-s", NULL); // For return values. cl_int ret; // OpenCL stuff. // Loading kernel files. FILE *kernelFile; char *kernelSource; size_t kernelSize; kernelFile = fopen("simulationKernel.cl", "r"); if (!kernelFile) { fprintf(stderr, "No file named simulationKernel.cl was found\n"); exit(-1); } kernelSource = (char*)malloc(MAX_SOURCE_SIZE); kernelSize = fread(kernelSource, 1, MAX_SOURCE_SIZE, kernelFile); fclose(kernelFile); // Getting platform and device information cl_platform_id platformId = NULL; cl_device_id deviceID = NULL; cl_uint retNumDevices; cl_uint retNumPlatforms; ret = clGetPlatformIDs(1, &platformId, &retNumPlatforms); // Different types of devices to pick from. At the moment picks the default opencl device. //CL_DEVICE_TYPE_GPU //CL_DEVICE_TYPE_ACCELERATOR //CL_DEVICE_TYPE_DEFAULT //CL_DEVICE_TYPE_CPU ret = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_ACCELERATOR, 1, &deviceID, &retNumDevices); // Max workgroup size size_t max_available_local_wg_size; ret = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_available_local_wg_size, NULL); // Creating context. cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &ret); // Creating command queue cl_command_queue commandQueue = clCreateCommandQueueWithProperties (context, deviceID, 0, &ret); // Build program cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, (const size_t *)&kernelSize, &ret); // printf("program = ret %i \n", ret); ret = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL); // printf("clBuildProgram: ret %i \n", ret); // Create kernels cl_kernel forceKernel = clCreateKernel(program, "compute_forces_gpu", &ret); cl_kernel moveKernel = clCreateKernel(program, "move_gpu", &ret); cl_kernel binInitKernel = clCreateKernel(program, "bin_init_gpu", &ret); cl_kernel binKernel = clCreateKernel(program, "bin_gpu", &ret); FILE *fsave = savename ? fopen(savename, "w") : NULL; FILE *fsum = sumname ? fopen(sumname, "a") : NULL; particle_t *particles = (particle_t*)malloc(n * sizeof(particle_t)); // GPU particle data structure cl_mem d_particles = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(particle_t), NULL, &ret); // Set size set_size(n); init_particles(n, particles); double copy_time = read_timer(); // Copy particles to device. ret = clEnqueueWriteBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, NULL); copy_time = read_timer() - copy_time; // Calculating thread and thread block counts. // sizes size_t globalItemSize; size_t localItemSize; // Global item size if (n <= NUM_THREADS) { globalItemSize = NUM_THREADS; localItemSize = 16; } else if (n % NUM_THREADS != 0) { globalItemSize = (n / NUM_THREADS + 1) * NUM_THREADS; } else { globalItemSize = n; } // Local item size localItemSize = globalItemSize / NUM_THREADS; // Bins and bin sizes. // Because of uniform distribution we will know that bins size is amortized. Therefore I picked the value of 10. // There will never be 10 particles in one bin. int maxParticles = 10; // Calculating the number of bins. int numberOfBins = (int)ceil(size/(2*cutoff)) + 2; // Bins will only exist on the device. particle_t* bins; // How many particles are there in each bin - also only exists on the device. volatile int* binSizes; // Number of bins to be initialized. size_t clearAmt = numberOfBins*numberOfBins; // Allocate memory for bins on the device. cl_mem d_binSizes = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * sizeof(volatile int), NULL, &ret); cl_mem d_bins = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * maxParticles * sizeof(particle_t), NULL, &ret); // SETTING ARGUMENTS FOR THE KERNELS // Set arguments for the init / clear kernel ret = clSetKernelArg(binInitKernel, 0, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(binInitKernel, 1, sizeof(int), &numberOfBins); // Set arguments for the binning kernel ret = clSetKernelArg(binKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(binKernel, 1, sizeof(int), &n); ret = clSetKernelArg(binKernel, 2, sizeof(cl_mem), (void *)&d_bins); ret = clSetKernelArg(binKernel, 3, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(binKernel, 4, sizeof(int), &numberOfBins); // Set arguments for force kernel. ret = clSetKernelArg(forceKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(forceKernel, 1, sizeof(int), &n); ret = clSetKernelArg(forceKernel, 2, sizeof(cl_mem), (void *)&d_bins); ret = clSetKernelArg(forceKernel, 3, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(forceKernel, 4, sizeof(int), &numberOfBins); // Set arguments for move kernel ret = clSetKernelArg(moveKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(moveKernel, 1, sizeof(int), &n); ret = clSetKernelArg(moveKernel, 2, sizeof(double), &size); // Variable to check if kernel execution is done. cl_event kernelDone; double simulation_time = read_timer(); int step = 0; for (step = 0; step < NSTEPS; step++) { // Execute bin initialization (clearing after first iteration) ret = clEnqueueNDRangeKernel(commandQueue, binInitKernel, 1, NULL, &clearAmt, NULL, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute binning kernel ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); // ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute force kernel ret = clEnqueueNDRangeKernel(commandQueue, forceKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute move kernel ret = clEnqueueNDRangeKernel(commandQueue, moveKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); if (fsave && (step%SAVEFREQ) == 0) { // Copy the particles back to the CPU ret = clEnqueueReadBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); save(fsave, n, particles); } } simulation_time = read_timer() - simulation_time; printf("CPU-GPU copy time = %g seconds\n", copy_time); printf("n = %d, simulation time = %g seconds\n", n, simulation_time); if (fsum) fprintf(fsum, "%d %lf \n", n, simulation_time); if (fsum) fclose(fsum); free(particles); if (fsave) fclose(fsave); ret = clFlush(commandQueue); ret = clFinish(commandQueue); ret = clReleaseCommandQueue(commandQueue); ret = clReleaseKernel(forceKernel); ret = clReleaseKernel(moveKernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(d_particles); ret = clReleaseContext(context); return 0; }
int main(int argc, char** argv) { ocd_init(&argc, &argv, NULL); ocd_initCL(); cl_int err; size_t global_size; size_t local_size; cl_program program; cl_kernel kernel_compute_flux; cl_kernel kernel_compute_flux_contributions; cl_kernel kernel_compute_step_factor; cl_kernel kernel_time_step; cl_kernel kernel_initialize_variables; cl_mem ff_variable; cl_mem ff_fc_momentum_x; cl_mem ff_fc_momentum_y; cl_mem ff_fc_momentum_z; cl_mem ff_fc_density_energy; if (argc < 2) { printf("Usage ./cfd <data input file>\n"); return 0; } const char* data_file_name = argv[1]; // set far field conditions and load them into constant memory on the gpu { float h_ff_variable[NVAR]; const float angle_of_attack = (float)(3.1415926535897931 / 180.0) * (float)(deg_angle_of_attack); h_ff_variable[VAR_DENSITY] = (float)(1.4); float ff_pressure = (float)(1.0); float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]); float ff_speed = (float)(ff_mach)*ff_speed_of_sound; float3 ff_velocity; ff_velocity.x = ff_speed*(float)(cos((float)angle_of_attack)); ff_velocity.y = ff_speed*(float)(sin((float)angle_of_attack)); ff_velocity.z = 0.0; h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x; h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y; h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z; h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*((float)(0.5)*(ff_speed*ff_speed)) + (ff_pressure / (float)(GAMMA-1.0)); float3 h_ff_momentum; h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0); h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1); h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2); float3 h_ff_fc_momentum_x; float3 h_ff_fc_momentum_y; float3 h_ff_fc_momentum_z; float3 h_ff_fc_density_energy; compute_flux_contribution(&h_ff_variable[VAR_DENSITY], &h_ff_momentum, &h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, &ff_velocity, &h_ff_fc_momentum_x, &h_ff_fc_momentum_y, &h_ff_fc_momentum_z, &h_ff_fc_density_energy); // copy far field conditions to the gpu ff_variable = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * NVAR, h_ff_variable, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_x = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_x, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_y = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_y, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_z, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_density_energy = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_density_energy, &err); CHKERR(err, "Unable to allocate ff data"); } int nel; int nelr; // read in domain geometry cl_mem areas; cl_mem elements_surrounding_elements; cl_mem normals; { std::ifstream file(data_file_name); file >> nel; nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length)); float* h_areas = new float[nelr]; int* h_elements_surrounding_elements = new int[nelr*NNB]; float* h_normals = new float[nelr*NDIM*NNB]; // read in data for(int i = 0; i < nel; i++) { file >> h_areas[i]; for(int j = 0; j < NNB; j++) { file >> h_elements_surrounding_elements[i + j*nelr]; if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1; h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering for(int k = 0; k < NDIM; k++) { file >> h_normals[i + (j + k*NNB)*nelr]; h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr]; } } } // fill in remaining data int last = nel-1; for(int i = nel; i < nelr; i++) { h_areas[i] = h_areas[last]; for(int j = 0; j < NNB; j++) { // duplicate the last element h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr]; for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr]; } } areas = alloc<float>(context, nelr); upload<float>(commands, areas, h_areas, nelr); elements_surrounding_elements = alloc<int>(context, nelr*NNB); upload<int>(commands, elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB); normals = alloc<float>(context, nelr*NDIM*NNB); upload<float>(commands, normals, h_normals, nelr*NDIM*NNB); delete[] h_areas; delete[] h_elements_surrounding_elements; delete[] h_normals; } // Get program source. long kernelSize = getKernelSize(); char* kernelSource = new char[kernelSize]; getKernelSource(kernelSource, kernelSize); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) &kernelSource, NULL, &err); CHKERR(err, "Failed to create a compute program!"); // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err == CL_BUILD_PROGRAM_FAILURE) { char *log; size_t logLen; err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLen); log = (char *) malloc(sizeof(char)*logLen); err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logLen, (void *) log, NULL); fprintf(stderr, "CL Error %d: Failed to build program! Log:\n%s", err, log); free(log); exit(1); } CHKERR(err, "Failed to build program!"); delete[] kernelSource; // Create the compute kernel in the program we wish to run kernel_compute_flux = clCreateKernel(program, "compute_flux", &err); CHKERR(err, "Failed to create a compute kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_flux_contributions = clCreateKernel(program, "compute_flux_contributions", &err); CHKERR(err, "Failed to create a compute_flux_contributions kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_step_factor = clCreateKernel(program, "compute_step_factor", &err); CHKERR(err, "Failed to create a compute_step_factor kernel!"); // Create the reduce kernel in the program we wish to run kernel_time_step = clCreateKernel(program, "time_step", &err); CHKERR(err, "Failed to create a time_step kernel!"); // Create the reduce kernel in the program we wish to run kernel_initialize_variables = clCreateKernel(program, "initialize_variables", &err); CHKERR(err, "Failed to create a initialize_variables kernel!"); // Create arrays and set initial conditions cl_mem variables = alloc<cl_float>(context, nelr*NVAR); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device //err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); local_size = 1;//std::min(local_size, (size_t)nelr); global_size = nelr; err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); err = clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 0"); cl_mem old_variables = alloc<float>(context, nelr*NVAR); cl_mem fluxes = alloc<float>(context, nelr*NVAR); cl_mem step_factors = alloc<float>(context, nelr); clFinish(commands); cl_mem fc_momentum_x = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_y = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_z = alloc<float>(context, nelr*NDIM); cl_mem fc_density_energy = alloc<float>(context, nelr*NDIM); clFinish(commands); // make sure all memory is floatly allocated before we start timing err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&old_variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 1"); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&fluxes); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 2"); std::cout << "About to memcopy" << std::endl; err = clReleaseMemObject(step_factors); float temp[nelr]; for(int i = 0; i < nelr; i++) temp[i] = 0; step_factors = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * nelr, temp, &err); CHKERR(err, "Unable to memset step_factors"); // make sure CUDA isn't still doing something before we start timing clFinish(commands); // these need to be computed the first time in order to compute time step std::cout << "Starting..." << std::endl; // Begin iterations for(int i = 0; i < iterations; i++) { copy<float>(commands, old_variables, variables, nelr*NVAR); // for the first iteration we compute the time step err = 0; err = clSetKernelArg(kernel_compute_step_factor, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_step_factor, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_step_factor, 2, sizeof(cl_mem), &areas); err |= clSetKernelArg(kernel_compute_step_factor, 3, sizeof(cl_mem), &step_factors); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_step_factor, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Step Factor Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel[kernel_compute_step_factor]!"); for(int j = 0; j < RK; j++) { err = 0; err = clSetKernelArg(kernel_compute_flux_contributions, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux_contributions, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_flux_contributions, 2, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux_contributions, 3, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux_contributions, 4, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux_contributions, 5, sizeof(cl_mem), &fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux_contributions, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux_contributions work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux_contributions, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Contribution Kernel", ocdTempTimer) //compute_flux_contributions(nelr, variables, fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy); END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux_contributions]!"); err = 0; err = clSetKernelArg(kernel_compute_flux, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux, 1, sizeof(cl_mem), &elements_surrounding_elements); err |= clSetKernelArg(kernel_compute_flux, 2, sizeof(cl_mem), &normals); err |= clSetKernelArg(kernel_compute_flux, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_compute_flux, 4, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 5, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 6, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 7, sizeof(cl_mem), &fc_density_energy); err |= clSetKernelArg(kernel_compute_flux, 8, sizeof(cl_mem), &fluxes); err |= clSetKernelArg(kernel_compute_flux, 9, sizeof(cl_mem), &ff_variable); err |= clSetKernelArg(kernel_compute_flux, 10, sizeof(cl_mem), &ff_fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 11, sizeof(cl_mem), &ff_fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 12, sizeof(cl_mem), &ff_fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 13, sizeof(cl_mem), &ff_fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux]!"); err = 0; err = clSetKernelArg(kernel_time_step, 0, sizeof(int), &j); err |= clSetKernelArg(kernel_time_step, 1, sizeof(int), &nelr); err |= clSetKernelArg(kernel_time_step, 2, sizeof(cl_mem), &old_variables); err |= clSetKernelArg(kernel_time_step, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_time_step, 4, sizeof(cl_mem), &step_factors); err |= clSetKernelArg(kernel_time_step, 5, sizeof(cl_mem), &fluxes); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_time_step, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_time_step work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_time_step, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Time Step Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_time_step]!"); } } clFinish(commands); std::cout << "Finished" << std::endl; std::cout << "Saving solution..." << std::endl; dump(commands, variables, nel, nelr); std::cout << "Saved solution..." << std::endl; std::cout << "Cleaning up..." << std::endl; clReleaseProgram(program); clReleaseKernel(kernel_compute_flux); clReleaseKernel(kernel_compute_flux_contributions); clReleaseKernel(kernel_compute_step_factor); clReleaseKernel(kernel_time_step); clReleaseKernel(kernel_initialize_variables); clReleaseCommandQueue(commands); clReleaseContext(context); dealloc<float>(areas); dealloc<int>(elements_surrounding_elements); dealloc<float>(normals); dealloc<float>(variables); dealloc<float>(old_variables); dealloc<float>(fluxes); dealloc<float>(step_factors); dealloc<float>(fc_momentum_x); dealloc<float>(fc_momentum_y); dealloc<float>(fc_momentum_z); dealloc<float>(fc_density_energy); std::cout << "Done..." << std::endl; ocd_finalize(); return 0; }
void runProgram(int N, char *fileName) { printf("GPU Symmetrize()..." "\nSquareMatrix[%d][%d]\n", N, N); int i,j; // initialize input array float *A; A = (float*)malloc(sizeof(float)*N*N); for( i = 0; i < N ; ++i ) { for( j = 0; j < N ; ++j ) { A[i*N + j] = j; } } // result float *Aout; Aout = (float*)malloc(sizeof(float)*N*N); #ifdef DEBUG puts("A"); check_2d_f(A,N,N); #endif int NumK = 1; int NumE = 2; double gpuTime; cl_ulong gstart, gend; //------------------------------------------------ // OpenCL //------------------------------------------------ cl_int err; cl_platform_id platform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK); cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE); // read kernel file //char *fileName = "transpose_kernel.cl"; char *kernelSource; size_t size; FILE *fh = fopen(fileName, "rb"); if(!fh) { printf("Error: Failed to open kernel file!\n"); exit(1); } fseek(fh,0,SEEK_END); size=ftell(fh); fseek(fh,0,SEEK_SET); kernelSource = malloc(size+1); size_t result; result = fread(kernelSource,1,size,fh); if(result != size){ fputs("Reading error", stderr);exit(1);} kernelSource[size] = '\0'; // Bind to platform err = clGetPlatformIDs(1, &platform, NULL); OCL_CHECK(err); // Get ID for the device err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); OCL_CHECK(err); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); OCL_CHECK(err); // Create a command queue queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); OCL_CHECK(err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err); OCL_CHECK(err); // turn on optimization for kernel char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only"; err = clBuildProgram(program, 1, &device_id, options, NULL, NULL); if(err != CL_SUCCESS) printCompilerOutput(program, device_id); OCL_CHECK(err); #ifdef SAVEBIN // Calculate size of binaries size_t binary_size; err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL); OCL_CHECK(err); //printf("binary size = %ld\n", binary_size); unsigned char* bin; bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size); err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*) , &bin, NULL); OCL_CHECK(err); //puts("save binaries"); // Print the binary out to the output file fh = fopen("kernel.bin", "wb"); fwrite(bin, 1, binary_size, fh); fclose(fh); puts("done save binaries"); #endif kernel[0] = clCreateKernel(program, "kernel_a", &err); OCL_CHECK(err); // memory on device cl_mem A_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); cl_mem Aout_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); // copy data to device err = clEnqueueWriteBuffer(queue, A_d, CL_TRUE, 0, sizeof(float)*N*N, A, 0, NULL , &event[0]); OCL_CHECK(err); size_t localsize[2]; size_t globalsize[2]; localsize[0] = 16; localsize[1] = 16; globalsize[0] = N; globalsize[1] = N; err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &Aout_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clEnqueueNDRangeKernel(queue, kernel[0], 2, NULL, globalsize, localsize, 0, NULL, NULL); OCL_CHECK(err); clFinish(queue); // read device data back to host clEnqueueReadBuffer(queue, Aout_d, CL_TRUE, 0, sizeof(float)*N*N, Aout, 0, NULL , &event[1]); err = clWaitForEvents(1,&event[1]); OCL_CHECK(err); err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL); OCL_CHECK(err); err = clGetEventProfilingInfo (event[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL); OCL_CHECK(err); gpuTime = (double)(gend -gstart)/1000000000.0; //check_1d_f(sum, blks+1); #ifdef DEBUG puts("Output"); check_2d_f(Aout,N,N); #endif printf("oclTime = %lf (s)\n", gpuTime ); // free clReleaseMemObject(A_d); clReleaseMemObject(Aout_d); // // check // int flag = 1; // for(i=0;i<N;++i){ // for(j=0;j<N;++j){ // if(A[i*N+j] != At[j*N+i]) // { // flag = 0; // break; // } // } // } // if( flag == 0 ) // { // puts("Bugs! Check program."); // }else{ // puts("Succeed!"); // } clReleaseProgram(program); clReleaseContext(context); clReleaseCommandQueue(queue); for(i=0;i<NumK;++i){ clReleaseKernel(kernel[i]); } for(i=0;i<NumE;++i){ clReleaseEvent(event[i]); } free(kernelSource); #ifdef SAVEBIN free(bin); #endif free(A); free(Aout); return; }
void WorkScheduler::initialize(bool use_opencl, int num_cpu_threads) { /* initialize highlighting */ if (!g_highlightInitialized) { if (g_highlightedNodesRead) MEM_freeN(g_highlightedNodesRead); if (g_highlightedNodes) MEM_freeN(g_highlightedNodes); g_highlightedNodesRead = NULL; g_highlightedNodes = NULL; COM_startReadHighlights(); g_highlightInitialized = true; } #if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE /* deinitialize if number of threads doesn't match */ if (g_cpudevices.size() != num_cpu_threads) { Device *device; while (g_cpudevices.size() > 0) { device = g_cpudevices.back(); g_cpudevices.pop_back(); device->deinitialize(); delete device; } g_cpuInitialized = false; } /* initialize CPU threads */ if (!g_cpuInitialized) { for (int index = 0; index < num_cpu_threads; index++) { CPUDevice *device = new CPUDevice(); device->initialize(); g_cpudevices.push_back(device); } g_cpuInitialized = true; } #ifdef COM_OPENCL_ENABLED /* deinitialize OpenCL GPU's */ if (use_opencl && !g_openclInitialized) { g_context = NULL; g_program = NULL; if (clewInit() != CLEW_SUCCESS) /* this will check for errors and skip if already initialized */ return; if (clCreateContextFromType) { cl_uint numberOfPlatforms = 0; cl_int error; error = clGetPlatformIDs(0, 0, &numberOfPlatforms); if (error == -1001) { } /* GPU not supported */ else if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } if (G.f & G_DEBUG) printf("%u number of platforms\n", numberOfPlatforms); cl_platform_id *platforms = (cl_platform_id *)MEM_mallocN(sizeof(cl_platform_id) * numberOfPlatforms, __func__); error = clGetPlatformIDs(numberOfPlatforms, platforms, 0); unsigned int indexPlatform; for (indexPlatform = 0; indexPlatform < numberOfPlatforms; indexPlatform++) { cl_platform_id platform = platforms[indexPlatform]; cl_uint numberOfDevices = 0; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, 0, &numberOfDevices); if (numberOfDevices <= 0) continue; cl_device_id *cldevices = (cl_device_id *)MEM_mallocN(sizeof(cl_device_id) * numberOfDevices, __func__); clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numberOfDevices, cldevices, 0); g_context = clCreateContext(NULL, numberOfDevices, cldevices, clContextError, NULL, &error); if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } const char *cl_str[2] = {datatoc_COM_OpenCLKernels_cl, NULL}; g_program = clCreateProgramWithSource(g_context, 1, cl_str, 0, &error); error = clBuildProgram(g_program, numberOfDevices, cldevices, 0, 0, 0); if (error != CL_SUCCESS) { cl_int error2; size_t ret_val_size = 0; printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } char *build_log = (char *)MEM_mallocN(sizeof(char) * ret_val_size + 1, __func__); error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } build_log[ret_val_size] = '\0'; printf("%s", build_log); MEM_freeN(build_log); } else { unsigned int indexDevices; for (indexDevices = 0; indexDevices < numberOfDevices; indexDevices++) { cl_device_id device = cldevices[indexDevices]; cl_int vendorID = 0; cl_int error2 = clGetDeviceInfo(device, CL_DEVICE_VENDOR_ID, sizeof(cl_int), &vendorID, NULL); if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error2, clewErrorString(error2)); } OpenCLDevice *clDevice = new OpenCLDevice(g_context, device, g_program, vendorID); clDevice->initialize(); g_gpudevices.push_back(clDevice); } } MEM_freeN(cldevices); } MEM_freeN(platforms); } g_openclInitialized = true; } #endif #endif }
void sum_gpu(long long *in, long long *out, unsigned int n) { size_t global_size; size_t local_size; char *kernel_src; cl_int err; cl_platform_id platform_id; cl_device_id device_id; cl_uint max_compute_units; size_t max_workgroup_size; cl_context context; cl_command_queue commands; cl_program program; cl_kernel kernel; cl_mem d_array; cl_event event; cl_ulong start, end; /* start OpenCL */ err = clGetPlatformIDs(1, &platform_id,NULL); clErrorHandling("clGetPlatformIDs"); err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); clErrorHandling("clGetDeviceIDs"); context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); clErrorHandling("clCreateContext"); commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); clErrorHandling("clCreateCommandQueue"); /* create kernel */ kernel_src = file_to_string(KERNEL_SRC); program = clCreateProgramWithSource(context, 1, (const char**) &kernel_src, NULL, &err); free(kernel_src); clErrorHandling("clCreateProgramWithSource"); err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); clErrorHandling("clBuildProgram"); kernel = clCreateKernel(program, "matrix_mult", &err); clErrorHandling("clCreateKernel"); /* allocate memory and send to gpu */ d_array = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long long) * n, NULL, &err); clErrorHandling("clCreateBuffer"); err = clEnqueueWriteBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long) * n, in, 0, NULL, NULL); clErrorHandling("clEnqueueWriteBuffer"); err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL); err |= clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, NULL); clErrorHandling("clGetDeviceInfo"); /* prepare kernel args */ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_array); err |= clSetKernelArg(kernel, 1, sizeof(unsigned int), &n); /* execute */ local_size = n / max_compute_units / 8; if (local_size > max_workgroup_size) local_size = max_workgroup_size; /* * Usually it would be * global_size = local_size * max_compute_units; * but that would only be valid if local_size = n / max_compute_units; * local_size is n / max_compute_units / 8 because it obtains its hightest performance. */ for (global_size = local_size; global_size < n; global_size += local_size); err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &event); clErrorHandling("clEnqueueNDRangeKernel"); clWaitForEvents(1, &event); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); fprintf(stderr, "Time for event (ms): %10.5f \n", (end - start) / 1000000.0); err = clFinish(commands); clErrorHandling("clFinish"); /* transfer back */ err = clEnqueueReadBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long), out, 0, NULL, NULL); // a single long long clErrorHandling("clEnqueueReadBuffer"); /* cleanup*/ clReleaseMemObject(d_array); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); clReleaseEvent(event); }
/** * The implementation of the particle filter using OpenMP for many frames * @see http://openmp.org/wp/ * @note This function is designed to work with a video of several frames. In addition, it references a provided MATLAB function which takes the video, the objxy matrix and the x and y arrays as arguments and returns the likelihoods * @param I The video to be run * @param IszX The x dimension of the video * @param IszY The y dimension of the video * @param Nfr The number of frames * @param seed The seed array used for random number generation * @param Nparticles The number of particles to be used */ int particleFilter(unsigned char * I, int IszX, int IszY, int Nfr, int * seed, int Nparticles) { int max_size = IszX * IszY*Nfr; //original particle centroid double xe = roundDouble(IszY / 2.0); double ye = roundDouble(IszX / 2.0); //expected object locations, compared to center int radius = 5; int diameter = radius * 2 - 1; int * disk = (int*) calloc(diameter * diameter, sizeof (int)); strelDisk(disk, radius); int countOnes = 0; int x, y; for (x = 0; x < diameter; x++) { for (y = 0; y < diameter; y++) { if (disk[x * diameter + y] == 1) countOnes++; } } int * objxy = (int *) calloc(countOnes * 2, sizeof(int)); getneighbors(disk, countOnes, objxy, radius); //initial weights are all equal (1/Nparticles) double * weights = (double *) calloc(Nparticles, sizeof(double)); for (x = 0; x < Nparticles; x++) { weights[x] = 1 / ((double) (Nparticles)); } /**************************************************************** ************** B E G I N A L L O C A T E ******************* ****************************************************************/ /***** kernel variables ******/ cl_kernel kernel_likelihood; cl_kernel kernel_sum; cl_kernel kernel_normalize_weights; cl_kernel kernel_find_index; int sourcesize = 2048 * 2048; char * source = (char *) calloc(sourcesize, sizeof (char)); if (!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; } // read the kernel core source char * tempchar = "./particle_double.cl"; FILE * fp = fopen(tempchar, "rb"); if (!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; } fread(source + strlen(source), sourcesize, 1, fp); fclose(fp); // OpenCL initialization int use_gpu = 1; if (initialize(use_gpu)) return -1; // compile kernel cl_int err = 0; const char * slist[2] = {source, 0}; cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; } err = clBuildProgram(prog, 1, device_list, "-cl-fast-relaxed-math", NULL, NULL); if (err != CL_SUCCESS) { if (err == CL_INVALID_PROGRAM) printf("CL_INVALID_PROGRAM\n"); else if (err == CL_INVALID_VALUE) printf("CL_INVALID_VALUE\n"); else if (err == CL_INVALID_DEVICE) printf("CL_INVALID_DEVICE\n"); else if (err == CL_INVALID_BINARY) printf("CL_INVALID_BINARY\n"); else if (err == CL_INVALID_BUILD_OPTIONS) printf("CL_INVALID_BUILD_OPTIONS\n"); else if (err == CL_INVALID_OPERATION) printf("CL_INVALID_OPERATION\n"); else if (err == CL_COMPILER_NOT_AVAILABLE) printf("CL_COMPILER_NOT_AVAILABLE\n"); else if (err == CL_BUILD_PROGRAM_FAILURE) printf("CL_BUILD_PROGRAM_FAILURE\n"); else if (err == CL_INVALID_OPERATION) printf("CL_INVALID_OPERATION\n"); else if (err == CL_OUT_OF_RESOURCES) printf("CL_OUT_OF_RESOURCES\n"); else if (err == CL_OUT_OF_HOST_MEMORY) printf("CL_OUT_OF_HOST_MEMORY\n"); printf("ERROR: clBuildProgram() => %d\n", err); static char log[65536]; memset(log, 0, sizeof (log)); err = clGetProgramBuildInfo(prog, device_list[0], CL_PROGRAM_BUILD_LOG, sizeof (log) - 1, log, NULL); if (err != CL_SUCCESS) { printf("ERROR: clGetProgramBuildInfo() => %d\n", err); } if (strstr(log, "warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); } // { // show warnings/errors // static char log[65536]; // memset(log, 0, sizeof (log)); // cl_device_id device_id[2] = {0}; // err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof (device_id), device_id, NULL); // if (err != CL_SUCCESS) { // if (err == CL_INVALID_CONTEXT) // printf("ERROR: clGetContextInfo() => CL_INVALID_CONTEXT\n"); // if (err == CL_INVALID_VALUE) // printf("ERROR: clGetContextInfo() => CL_INVALID_VALUE\n"); // } // }//*/ char * s_likelihood_kernel = "likelihood_kernel"; char * s_sum_kernel = "sum_kernel"; char * s_normalize_weights_kernel = "normalize_weights_kernel"; char * s_find_index_kernel = "find_index_kernel"; kernel_likelihood = clCreateKernel(prog, s_likelihood_kernel, &err); if (err != CL_SUCCESS) { if (err == CL_INVALID_PROGRAM) printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID PROGRAM %d\n", err); if (err == CL_INVALID_PROGRAM_EXECUTABLE) printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID PROGRAM EXECUTABLE %d\n", err); if (err == CL_INVALID_KERNEL_NAME) printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID KERNEL NAME %d\n", err); if (err == CL_INVALID_KERNEL_DEFINITION) printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID KERNEL DEFINITION %d\n", err); if (err == CL_INVALID_VALUE) printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID CL_INVALID_VALUE %d\n", err); printf("ERROR: clCreateKernel(likelihood_kernel) failed.\n"); return -1; } kernel_sum = clCreateKernel(prog, s_sum_kernel, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateKernel(sum_kernel) 0 => %d\n", err); return -1; } kernel_normalize_weights = clCreateKernel(prog, s_normalize_weights_kernel, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateKernel(normalize_weights_kernel) 0 => %d\n", err); return -1; } kernel_find_index = clCreateKernel(prog, s_find_index_kernel, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateKernel(find_index_kernel) 0 => %d\n", err); return -1; } //initial likelihood to 0.0 double * likelihood = (double *) calloc(Nparticles + 1, sizeof (double)); double * arrayX = (double *) calloc(Nparticles, sizeof (double)); double * arrayY = (double *) calloc(Nparticles, sizeof (double)); double * xj = (double *) calloc(Nparticles, sizeof (double)); double * yj = (double *) calloc(Nparticles, sizeof (double)); double * CDF = (double *) calloc(Nparticles, sizeof(double)); //GPU copies of arrays cl_mem arrayX_GPU; cl_mem arrayY_GPU; cl_mem xj_GPU; cl_mem yj_GPU; cl_mem CDF_GPU; cl_mem likelihood_GPU; cl_mem I_GPU; cl_mem weights_GPU; cl_mem objxy_GPU; int * ind = (int*) calloc(countOnes, sizeof(int)); cl_mem ind_GPU; double * u = (double *) calloc(Nparticles, sizeof(double)); cl_mem u_GPU; cl_mem seed_GPU; cl_mem partial_sums; //OpenCL memory allocation arrayX_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer arrayX_GPU (size:%d) => %d\n", Nparticles, err); return -1; } arrayY_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer arrayY_GPU (size:%d) => %d\n", Nparticles, err); return -1; } xj_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer xj_GPU (size:%d) => %d\n", Nparticles, err); return -1; } yj_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer yj_GPU (size:%d) => %d\n", Nparticles, err); return -1; } CDF_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) * Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer CDF_GPU (size:%d) => %d\n", Nparticles, err); return -1; } u_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer u_GPU (size:%d) => %d\n", Nparticles, err); return -1; } likelihood_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer likelihood_GPU (size:%d) => %d\n", Nparticles, err); return -1; } weights_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer weights_GPU (size:%d) => %d\n", Nparticles, err); return -1; } I_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (unsigned char) *IszX * IszY * Nfr, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer I_GPU (size:%d) => %d\n", IszX * IszY * Nfr, err); return -1; } objxy_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, 2*sizeof (int) *countOnes, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer objxy_GPU (size:%d) => %d\n", countOnes, err); return -1; } ind_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (int) *countOnes * Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer ind_GPU (size:%d) => %d\n", countOnes * Nparticles, err); return -1; } seed_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (int) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer seed_GPU (size:%d) => %d\n", Nparticles, err); return -1; } partial_sums = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof (double) * Nparticles + 1, likelihood, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer partial_sums (size:%d) => %d\n", Nparticles, err); return -1; } //Donnie - this loop is different because in this kernel, arrayX and arrayY // are set equal to xj before every iteration, so effectively, arrayX and // arrayY will be set to xe and ye before the first iteration. for (x = 0; x < Nparticles; x++) { xj[x] = xe; yj[x] = ye; } int k; //double * Ik = (double *)calloc(IszX*IszY, sizeof(double)); int indX, indY; //start send long long send_start = get_time(); //OpenCL memory copy err = clEnqueueWriteBuffer(cmd_queue, I_GPU, 1, 0, sizeof (unsigned char) *IszX * IszY*Nfr, I, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer I_GPU (size:%d) => %d\n", IszX * IszY*Nfr, err); return -1; } err = clEnqueueWriteBuffer(cmd_queue, objxy_GPU, 1, 0, 2*sizeof (int) *countOnes, objxy, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer objxy_GPU (size:%d) => %d\n", countOnes, err); return -1; } err = clEnqueueWriteBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer weights_GPU (size:%d) => %d\n", Nparticles, err); return -1; } err = clEnqueueWriteBuffer(cmd_queue, xj_GPU, 1, 0, sizeof (double) *Nparticles, xj, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer arrayX_GPU (size:%d) => %d\n", Nparticles, err); return -1; } err = clEnqueueWriteBuffer(cmd_queue, yj_GPU, 1, 0, sizeof (double) *Nparticles, yj, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer arrayY_GPU (size:%d) => %d\n", Nparticles, err); return -1; } err = clEnqueueWriteBuffer(cmd_queue, seed_GPU, 1, 0, sizeof (int) *Nparticles, seed, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer seed_GPU (size:%d) => %d\n", Nparticles, err); return -1; } /********************************************************************** *********** E N D A L L O C A T E ******************************** *********************************************************************/ long long send_end = get_time(); printf("TIME TO SEND TO GPU: %f\n", elapsed_time(send_start, send_end)); int num_blocks = ceil((double) Nparticles / (double) threads_per_block); printf("threads_per_block=%d \n",threads_per_block); size_t local_work[3] = {threads_per_block, 1, 1}; size_t global_work[3] = {num_blocks*threads_per_block, 1, 1}; for (k = 1; k < Nfr; k++) { /****************** L I K E L I H O O D ************************************/ clSetKernelArg(kernel_likelihood, 0, sizeof (void *), (void*) &arrayX_GPU); clSetKernelArg(kernel_likelihood, 1, sizeof (void *), (void*) &arrayY_GPU); clSetKernelArg(kernel_likelihood, 2, sizeof (void *), (void*) &xj_GPU); clSetKernelArg(kernel_likelihood, 3, sizeof (void *), (void*) &yj_GPU); clSetKernelArg(kernel_likelihood, 4, sizeof (void *), (void*) &CDF_GPU); clSetKernelArg(kernel_likelihood, 5, sizeof (void *), (void*) &ind_GPU); clSetKernelArg(kernel_likelihood, 6, sizeof (void *), (void*) &objxy_GPU); clSetKernelArg(kernel_likelihood, 7, sizeof (void *), (void*) &likelihood_GPU); clSetKernelArg(kernel_likelihood, 8, sizeof (void *), (void*) &I_GPU); clSetKernelArg(kernel_likelihood, 9, sizeof (void *), (void*) &u_GPU); clSetKernelArg(kernel_likelihood, 10, sizeof (void *), (void*) &weights_GPU); clSetKernelArg(kernel_likelihood, 11, sizeof (cl_int), (void*) &Nparticles); clSetKernelArg(kernel_likelihood, 12, sizeof (cl_int), (void*) &countOnes); clSetKernelArg(kernel_likelihood, 13, sizeof (cl_int), (void*) &max_size); clSetKernelArg(kernel_likelihood, 14, sizeof (cl_int), (void*) &k); clSetKernelArg(kernel_likelihood, 15, sizeof (cl_int), (void*) &IszY); clSetKernelArg(kernel_likelihood, 16, sizeof (cl_int), (void*) &Nfr); clSetKernelArg(kernel_likelihood, 17, sizeof (void *), (void*) &seed_GPU); clSetKernelArg(kernel_likelihood, 18, sizeof (void *), (void*) &partial_sums); clSetKernelArg(kernel_likelihood, 19, threads_per_block * sizeof (double), NULL); //KERNEL FUNCTION CALL err = clEnqueueNDRangeKernel(cmd_queue, kernel_likelihood, 1, NULL, global_work, local_work, 0, 0, 0); clFinish(cmd_queue); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueNDRangeKernel(kernel_likelihood)=>%d failed\n", err); //check_error(err, __FILE__, __LINE__); return -1; } /****************** E N D L I K E L I H O O D **********************/ /*************************** S U M ************************************/ clSetKernelArg(kernel_sum, 0, sizeof (void *), (void*) &partial_sums); clSetKernelArg(kernel_sum, 1, sizeof (cl_int), (void*) &Nparticles); //KERNEL FUNCTION CALL err = clEnqueueNDRangeKernel(cmd_queue, kernel_sum, 1, NULL, global_work, local_work, 0, 0, 0); clFinish(cmd_queue); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueNDRangeKernel(kernel_sum)=>%d failed\n", err); //check_error(err, __FILE__, __LINE__); return -1; }/*************************** E N D S U M ****************************/ /**************** N O R M A L I Z E W E I G H T S *****************/ clSetKernelArg(kernel_normalize_weights, 0, sizeof (void *), (void*) &weights_GPU); clSetKernelArg(kernel_normalize_weights, 1, sizeof (cl_int), (void*) &Nparticles); clSetKernelArg(kernel_normalize_weights, 2, sizeof (void *), (void*) &partial_sums); //*/ clSetKernelArg(kernel_normalize_weights, 3, sizeof (void *), (void*) &CDF_GPU); clSetKernelArg(kernel_normalize_weights, 4, sizeof (void *), (void*) &u_GPU); clSetKernelArg(kernel_normalize_weights, 5, sizeof (void *), (void*) &seed_GPU); //KERNEL FUNCTION CALL err = clEnqueueNDRangeKernel(cmd_queue, kernel_normalize_weights, 1, NULL, global_work, local_work, 0, 0, 0); clFinish(cmd_queue); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueNDRangeKernel(normalize_weights)=>%d failed\n", err); //check_error(err, __FILE__, __LINE__); return -1; } /************* E N D N O R M A L I Z E W E I G H T S ***********/ // ocl_print_double_array(cmd_queue, partial_sums, 40); // /********* I N T E R M E D I A T E R E S U L T S ***************/ // //OpenCL memory copying back from GPU to CPU memory err = clEnqueueReadBuffer(cmd_queue, arrayX_GPU, 1, 0, sizeof (double) *Nparticles, arrayX, 0, 0, 0); err = clEnqueueReadBuffer(cmd_queue, arrayY_GPU, 1, 0, sizeof (double) *Nparticles, arrayY, 0, 0, 0); err = clEnqueueReadBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0); xe = 0; ye = 0; double total=0.0; // estimate the object location by expected values for (x = 0; x < Nparticles; x++) { // if( 0.0000000 < arrayX[x]*weights[x]) printf("arrayX[%d]:%f, arrayY[%d]:%f, weights[%d]:%0.10f\n",x,arrayX[x], x, arrayY[x], x, weights[x]); // printf("arrayX[%d]:%f | arrayY[%d]:%f | weights[%d]:%f\n", // x, arrayX[x], x, arrayY[x], x, weights[x]); xe += arrayX[x] * weights[x]; ye += arrayY[x] * weights[x]; total+= weights[x]; } printf("total weight: %lf\n", total); printf("XE: %lf\n", xe); printf("YE: %lf\n", ye); double distance = sqrt(pow((double) (xe - (int) roundDouble(IszY / 2.0)), 2) + pow((double) (ye - (int) roundDouble(IszX / 2.0)), 2)); printf("%lf\n", distance); // /********* E N D I N T E R M E D I A T E R E S U L T S ***************/ /******************** F I N D I N D E X ****************************/ //Set number of threads clSetKernelArg(kernel_find_index, 0, sizeof (void *), (void*) &arrayX_GPU); clSetKernelArg(kernel_find_index, 1, sizeof (void *), (void*) &arrayY_GPU); clSetKernelArg(kernel_find_index, 2, sizeof (void *), (void*) &CDF_GPU); clSetKernelArg(kernel_find_index, 3, sizeof (void *), (void*) &u_GPU); clSetKernelArg(kernel_find_index, 4, sizeof (void *), (void*) &xj_GPU); clSetKernelArg(kernel_find_index, 5, sizeof (void *), (void*) &yj_GPU); clSetKernelArg(kernel_find_index, 6, sizeof (void *), (void*) &weights_GPU); clSetKernelArg(kernel_find_index, 7, sizeof (cl_int), (void*) &Nparticles); //KERNEL FUNCTION CALL err = clEnqueueNDRangeKernel(cmd_queue, kernel_find_index, 1, NULL, global_work, local_work, 0, 0, 0); clFinish(cmd_queue); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueNDRangeKernel(find_index)=>%d failed\n", err); //check_error(err, __FILE__, __LINE__); return -1; } /******************* E N D F I N D I N D E X ********************/ }//end loop //block till kernels are finished //clFinish(cmd_queue); long long back_time = get_time(); //OpenCL freeing of memory clReleaseProgram(prog); clReleaseMemObject(u_GPU); clReleaseMemObject(CDF_GPU); clReleaseMemObject(yj_GPU); clReleaseMemObject(xj_GPU); clReleaseMemObject(likelihood_GPU); clReleaseMemObject(I_GPU); clReleaseMemObject(objxy_GPU); clReleaseMemObject(ind_GPU); clReleaseMemObject(seed_GPU); clReleaseMemObject(partial_sums); long long free_time = get_time(); //OpenCL memory copying back from GPU to CPU memory err = clEnqueueReadBuffer(cmd_queue, arrayX_GPU, 1, 0, sizeof (double) *Nparticles, arrayX, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: Memcopy Out\n"); return -1; } long long arrayX_time = get_time(); err = clEnqueueReadBuffer(cmd_queue, arrayY_GPU, 1, 0, sizeof (double) *Nparticles, arrayY, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: Memcopy Out\n"); return -1; } long long arrayY_time = get_time(); err = clEnqueueReadBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: Memcopy Out\n"); return -1; } long long back_end_time = get_time(); printf("GPU Execution: %lf\n", elapsed_time(send_end, back_time)); printf("FREE TIME: %lf\n", elapsed_time(back_time, free_time)); printf("SEND TO SEND BACK: %lf\n", elapsed_time(back_time, back_end_time)); printf("SEND ARRAY X BACK: %lf\n", elapsed_time(free_time, arrayX_time)); printf("SEND ARRAY Y BACK: %lf\n", elapsed_time(arrayX_time, arrayY_time)); printf("SEND WEIGHTS BACK: %lf\n", elapsed_time(arrayY_time, back_end_time)); xe = 0; ye = 0; // estimate the object location by expected values for (x = 0; x < Nparticles; x++) { xe += arrayX[x] * weights[x]; ye += arrayY[x] * weights[x]; } double distance = sqrt(pow((double) (xe - (int) roundDouble(IszY / 2.0)), 2) + pow((double) (ye - (int) roundDouble(IszX / 2.0)), 2)); //Output results FILE *fid; fid=fopen("output.txt", "w+"); if( fid == NULL ){ printf( "The file was not opened for writing\n" ); return -1; } fprintf(fid, "XE: %lf\n", xe); fprintf(fid, "YE: %lf\n", ye); fprintf(fid, "distance: %lf\n", distance); fclose(fid); //OpenCL freeing of memory clReleaseMemObject(weights_GPU); clReleaseMemObject(arrayY_GPU); clReleaseMemObject(arrayX_GPU); //free regular memory free(likelihood); free(arrayX); free(arrayY); free(xj); free(yj); free(CDF); free(ind); free(u); }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program source */ unsigned char *source_code; size_t source_length; /* Read program from 'sub_sat_short16short16.cl' */ source_code = read_buffer("sub_sat_short16short16.cl", &source_length); /* Create a program */ cl_program program; program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } printf("program=%p\n", program); /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object */ cl_kernel kernel; kernel = clCreateKernel(program, "sub_sat_short16short16", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_short16 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_short16)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_short16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_short16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_short16), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create and init host side src buffer 1 */ cl_short16 *src_1_host_buffer; src_1_host_buffer = malloc(num_elem * sizeof(cl_short16)); for (int i = 0; i < num_elem; i++) src_1_host_buffer[i] = (cl_short16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 1 */ cl_mem src_1_device_buffer; src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_short16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_short16), src_1_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_short16 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_short16)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_short16)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_short16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_short16), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_short16)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 1 */ free(src_1_host_buffer); /* Free device side src buffer 1 */ ret = clReleaseMemObject(src_1_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
int exec_trig_kernel(const char *program_source, int n, void *srcA, void *dst) { cl_context context; cl_command_queue cmd_queue; cl_device_id *devices; cl_program program; cl_kernel kernel; cl_mem memobjs[2]; size_t global_work_size[1]; size_t local_work_size[1]; size_t cb; cl_int err; float c = 7.3f; // a scalar number to test non-pointer args // create the OpenCL context on a GPU device context = poclu_create_any_context(); if (context == (cl_context)0) return -1; // get the list of GPU devices associated with context clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); devices = (cl_device_id *) malloc(cb); clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); // create a command-queue cmd_queue = clCreateCommandQueue(context, devices[0], 0, NULL); if (cmd_queue == (cl_command_queue)0) { clReleaseContext(context); free(devices); return -1; } free(devices); // allocate the buffer memory objects memobjs[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float4) * n, srcA, NULL); if (memobjs[0] == (cl_mem)0) { clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } memobjs[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4) * n, NULL, NULL); if (memobjs[1] == (cl_mem)0) { delete_memobjs(memobjs, 1); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // create the program program = clCreateProgramWithSource(context, 1, (const char**)&program_source, NULL, NULL); if (program == (cl_program)0) { delete_memobjs(memobjs, 2); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // create the kernel kernel = clCreateKernel(program, "trig", NULL); if (kernel == (cl_kernel)0) { delete_memobjs(memobjs, 2); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // set the args values err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &memobjs[0]); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &memobjs[1]); err |= clSetKernelArg(kernel, 2, sizeof(float), (void *) &c); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // set work-item dimensions global_work_size[0] = n; local_work_size[0]= 2; // execute kernel err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // read output image err = clEnqueueReadBuffer(cmd_queue, memobjs[1], CL_TRUE, 0, n * sizeof(cl_float4), dst, 0, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // release kernel, program, and memory objects delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return 0; // success... }
b2CLNarrowPhase::b2CLNarrowPhase() { #if defined(NARROWPHASE_OPENCL) if (b2clGlobal_OpenCLSupported) { printf("Initializing b2CLNarrowPhase...\n"); int err; //load opencl programs from files char* narrowPhaseKernelSource = 0; size_t narrowPhaseKernelSourceLen = 0; shrLog("...loading b2CLNarrowPhase.cl\n"); #if defined(SCAN_OPENCL) #ifdef linux narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("/opt/usr/apps/com.samsung.browser/include/Box2D/Common/OpenCL/b2CLNarrowPhase.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen); #elif defined (_WIN32) narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("../../Box2D/Common/OpenCL/b2CLNarrowPhase.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen); #else narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("/usr/local/include/Box2D/Common/OpenCL/b2CLNarrowPhase.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen); #endif #else #ifdef linux narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("/opt/usr/apps/com.samsung.browser/include/Box2D/Common/OpenCL/b2CLNarrowPhase_Alone.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen); #elif defined (_WIN32) narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("../../Box2D/Common/OpenCL/b2CLNarrowPhase_Alone.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen); #else narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("/usr/local/include/Box2D/Common/OpenCL/b2CLNarrowPhase_Alone.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen); #endif #endif if(narrowPhaseKernelSource == NULL) { b2Log("Could not load program source, is path 'b2CLNarrowPhase.cl' correct?"); } //create the compute program from source kernel code narrowPhaseProgram = clCreateProgramWithSource(b2CLDevice::instance().GetContext(), 1, (const char**)&narrowPhaseKernelSource, NULL, &err); if (!narrowPhaseProgram) { printf("Error: Failed to create compute program!\n"); exit(1); } //build the program err = clBuildProgram(narrowPhaseProgram, 0, NULL, OPENCL_BUILD_PATH, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[204800]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(narrowPhaseProgram, b2CLDevice::instance().GetCurrentDevice(), CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } //create the compute kernel collidePolygonsKernel = clCreateKernel(narrowPhaseProgram, "b2clCollidePolygons", &err); if (!collidePolygonsKernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } collideCirclesKernel = clCreateKernel(narrowPhaseProgram, "b2clCollideCircles", &err); if (!collideCirclesKernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } collidePolygonAndCircleKernel = clCreateKernel(narrowPhaseProgram, "b2clCollidePolygonAndCircle", &err); if (!collidePolygonAndCircleKernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } collideEdgeAndCircleKernel = clCreateKernel(narrowPhaseProgram, "b2clCollideEdgeAndCircle", &err); if (!collideEdgeAndCircleKernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } collideEdgeAndPolygonKernel = clCreateKernel(narrowPhaseProgram, "b2clCollideEdgeAndPolygon", &err); if (!collideEdgeAndPolygonKernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } b2CLDevice::instance().getMaximumKernelWorkGroupSize(collidePolygonsKernel, kernel_work_group_size); //clGetKernelWorkGroupInfo(collidePolygonsKernel, b2CLDevice::instance().GetCurrentDevice(), CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &kernel_preferred_work_group_size_multiple, NULL); //create the compute kernel compactForOneContactKernel = clCreateKernel(narrowPhaseProgram, "b2clCompactForOneContact", &err); if (!compactForOneContactKernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } /*old_xf_num = */old_contact_num = 0; xfListData = NULL; b2CLCommonData::instance().manifoldListData = NULL; globalIndicesData = pairIndicesData = NULL; #if !defined(BROADPHASE_OPENCL) b2CLCommonData::instance().manifoldListBuffers[0] = b2CLCommonData::instance().manifoldListBuffers[1] = b2CLCommonData::instance().pairIndicesBuffer = #endif #if defined(SCAN_OPENCL) b2CLCommonData::instance().manifoldBinaryBitListBuffer = b2CLCommonData::instance().validContactIndicesBuffer = #endif NULL; b2CLScan::instance(); // initializing b2CLScan } #endif b2CLCommonData::instance().currentManifoldBuffer = 0; manifold_nums[0] = 0; manifold_nums[1] = 0; }
int main(int argc, char** argv) { int rank, size; // MPI rank & size int err; // error code returned from OpenCL calls float h_a[LENGTH]; // a vector float h_b[LENGTH]; // b vector float h_c[LENGTH]; // c vector (a+b) returned from the compute device (local per task) float _h_c[LENGTH]; // c vector (a+b) returned from the compute device (global for master) unsigned int correct; // number of correct results size_t global; // global domain size size_t local; // local domain size cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel ko_vadd; // compute kernel cl_mem d_a; // device memory used for the input a vector cl_mem d_b; // device memory used for the input b vector cl_mem d_c; // device memory used for the output c vector int mycount, i; err = MPI_Init (&argc, &argv); if (err != MPI_SUCCESS) { printf ("MPI_Init failed!\n"); exit (-1); } err = MPI_Comm_rank (MPI_COMM_WORLD, &rank); if (err != MPI_SUCCESS) { printf ("MPI_Comm_rank failed!\n"); exit (-1); } err = MPI_Comm_size (MPI_COMM_WORLD, &size); if (err != MPI_SUCCESS) { printf ("MPI_Comm_size failed\n"); exit (-1); } if (LENGTH % size != 0) { printf ("Number of MPI processes must divide LENGTH (%d)\n", LENGTH); exit (-1); } mycount = LENGTH / size; if (rank == 0) { for (i = 0; i < LENGTH; i++) { h_a[i] = rand() / (float)RAND_MAX; h_b[i] = rand() / (float)RAND_MAX; h_a[i] = i; h_b[i] = i*2; } err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed transferring h_a\n"); exit (-1); } err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed transferring h_b\n"); exit (-1); } } else { err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed receiving h_a\n"); exit (-1); } err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed receiving h_b\n"); exit (-1); } } // Set up platform cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to find a platform!\n"); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to get the platform!\n"); return EXIT_FAILURE; } // Secure a GPU for (i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) break; } if (device_id == NULL) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } else { if (output_device_info (rank, device_id) != CL_SUCCESS) return EXIT_FAILURE; } // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel from the program ko_vadd = clCreateKernel(program, "vadd", &err); if (!ko_vadd || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input (a, b) and output (c) arrays in device memory d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * mycount, NULL, NULL); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * mycount, NULL, NULL); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * mycount, NULL, NULL); if (!d_a || !d_b || !d_c) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write a and b vectors into compute device memory err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0, sizeof(float) * mycount, &h_a[rank*mycount], 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write h_a to source array!\n"); exit(1); } err = clEnqueueWriteBuffer(commands, d_b, CL_TRUE, 0, sizeof(float) * mycount, &h_b[rank*mycount], 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write h_b to source array!\n"); exit(1); } // Set the arguments to our compute kernel err = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_a); err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_b); err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_c); err |= clSetKernelArg(ko_vadd, 3, sizeof(unsigned int), &mycount); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(ko_vadd, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device global = LENGTH; err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the commands to complete before reading back results clFinish(commands); // Read back the results from the compute device err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * mycount, &h_c, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } err = MPI_Gather (h_c, mycount, MPI_FLOAT, _h_c, mycount, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Gather failed receiving h_c\n"); exit (-1); } if (rank == 0) { // Test the results correct = 0; float tmp; for(i = 0; i < LENGTH; i++) { tmp = h_a[i] + h_b[i]; // assign element i of a+b to tmp tmp -= _h_c[i]; // compute deviation of expected and output result if(tmp*tmp < TOL*TOL) // correct if square deviation is less than tolerance squared correct++; else printf(" tmp %f h_a %f h_b %f h_c %f \n",tmp, h_a[i], h_b[i], _h_c[i]); } // summarize results printf("C = A+B: %d out of %d results were correct.\n", correct, LENGTH); } // cleanup then shutdown clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(ko_vadd); clReleaseCommandQueue(commands); clReleaseContext(context); err = MPI_Finalize (); if (err != MPI_SUCCESS) { printf ("MPI_Finalize failed!\n"); exit (-1); } return 0; }