int main(int argc, char** argv) { int err; // error code returned from api calls cl_platform_id platform_id; // platform id cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel size_t global[2]; // global domain size for our calculation size_t local[2]; // local domain size for our calculation char cl_platform_vendor[1001]; char cl_platform_name[1001]; cl_mem in_array; // device memory used for the input array //cl_mem synaptic_weights; // device memory used for the input array cl_mem out_array; // device memory used for the output array if (argc != 2){ printf("%s <inputfile>\n", argv[0]); return -1; } //float in_array[NO_NODES]; //float out_array[NO_NODES]; //float synaptic_weights[NO_NODES*NO_NODES]; float in_array_tb[NO_NODES]; float out_array_tb[NO_NODES]; //float synaptic_weights_tb[NO_NODES*NO_NODES]; float temp =0; int i = 0; int j = 0; int index = 0; FILE* ifp; char* mode = "r"; // // Connect to first platform // err = clGetPlatformIDs(1,&platform_id,NULL); if (err != CL_SUCCESS) { printf("Error: Failed to find an OpenCL platform!\n"); printf("Test failed\n"); return -1; } err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n"); printf("Test failed\n"); return -1; } printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor); err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n"); printf("Test failed\n"); return -1; } printf("CL_PLATFORM_NAME %s\n",cl_platform_name); // Connect to a compute device // int fpga = 0; #if defined (FPGA_DEVICE) fpga = 1; #endif err = clGetDeviceIDs(platform_id, fpga ? 
CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); printf("Test failed\n"); return -1; } // // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); printf("Test failed\n"); return -1; } //relu_1(in_array,synaptic_weights,out_array); // Fill our data sets with pattern // //int i = 0; //for(i = 0; i < DATA_SIZE; i++) { // a[i] = (int)i; // b[i] = (int)i; // results[i] = 0; //} // // Create a command commands commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); printf("Error: code %i\n",err); printf("Test failed\n"); return -1; } int status; // Create Program Objects // // Load binary from disk unsigned char *kernelbinary; char *xclbin=argv[1]; printf("loading %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("failed to load kernel from xclbin: %s\n", xclbin); printf("Test failed\n"); return -1; } size_t n = n_i; // Create the compute program from offline program = clCreateProgramWithBinary(context, 1, &device_id, &n, (const unsigned char **) &kernelbinary, &status, &err); if ((!program) || (err!=CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", err); printf("Test failed\n"); printf("err : %d %s\n",err,err); } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); printf("Test failed\n"); return -1; } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "relu_1", &err); if (!kernel || err != 
CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); printf("Test failed\n"); return -1; } // Create the input and output arrays in device memory for our calculation // in_array = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * NO_NODES, NULL, NULL); //synaptic_weights = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * NO_NODES * NO_NODES, NULL, NULL); out_array = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * NO_NODES, NULL, NULL); if (!in_array || /*!synaptic_weights ||*/ !out_array) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); return -1; } ifp = fopen("/home/agandhi92/sdaccel/relu_1/input.txt",mode); if(ifp == NULL) { printf("Input file not found \n"); return -1; } while (fscanf(ifp, "%f", &temp) != EOF && index < NO_NODES) { in_array_tb[index++] = temp; } index = 0; temp = 0; //ifp = fopen("/home/agandhi92/sdaccel/relu_1/weight.txt",mode); //if(ifp == NULL) //{ // printf("Weight file not found \n"); // return -1; //} //while (fscanf(ifp, "%f", &temp) != EOF && index < (NO_NODES*NO_NODES)) { // synaptic_weights_tb[index++] = temp; //} // // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, in_array, CL_TRUE, 0, sizeof(float) * NO_NODES, in_array_tb, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array a!\n"); printf("Test failed\n"); return -1; } // Write our data set into the input array in device memory // //err = clEnqueueWriteBuffer(commands, synaptic_weights, CL_TRUE, 0, sizeof(float) * NO_NODES * NO_NODES, synaptic_weights_tb, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array b!\n"); printf("Test failed\n"); return -1; } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &in_array); //err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &synaptic_weights); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), 
&out_array); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); printf("Test failed\n"); return -1; } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // err = clEnqueueTask(commands, kernel, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel! %d\n", err); printf("Test failed\n"); return -1; } // Read back the results from the device to verify the output // cl_event readevent; err = clEnqueueReadBuffer( commands, out_array, CL_TRUE, 0, sizeof(float) * NO_NODES, out_array_tb, 0, NULL, &readevent ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); printf("Test failed\n"); return -1; } clWaitForEvents(1, &readevent); //printf("A\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",a[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} //printf("B\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",b[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} //printf("res\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",results[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} // Validate our results // //correct = 0; //for(i = 0; i < DATA_SIZE; i++) //{ // int row = i/MATRIX_RANK; // int col = i%MATRIX_RANK; // int running = 0; // int index; // for (index=0;index<MATRIX_RANK;index++) { // int aIndex = row*MATRIX_RANK + index; // int bIndex = col + index*MATRIX_RANK; // running += a[aIndex] * b[bIndex]; // } // sw_results[i] = running; //} // //for (i = 0;i < DATA_SIZE; i++) // if(results[i] == sw_results[i]) // correct++; //printf("Software\n"); //for (i=0;i<DATA_SIZE;i++) { // //printf("%0.2f ",sw_results[i]); // printf("%d ",sw_results[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} // // //// Print a brief summary detailing the results //// //printf("Computed '%d/%d' correct values!\n", correct, DATA_SIZE); // // Shutdown and cleanup int temp_ = 0; for (j = 0; j < NO_NODES; j++) { if (out_array_tb[j] >= 
0) // || out_array_tb[j]== 0) { //printf("out_array[%d] = %f \n", j, out_array[j]); temp_++; } } clReleaseMemObject(in_array); //clReleaseMemObject(synaptic_weights); clReleaseMemObject(out_array); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); if (temp_ == NO_NODES) { printf("*********************************************************** \n"); printf("TEST PASSED !!!!!! The output matches the desired output. \n"); printf("*********************************************************** \n"); return EXIT_SUCCESS; } else { printf("**************************************************************** \n"); printf("TEST Failed !!!!!! The output does not match the desired output. \n"); printf("**************************************************************** \n"); return -1; } //if(correct == DATA_SIZE){ // printf("Test passed!\n"); // return EXIT_SUCCESS; //} //else{ // printf("Test failed\n"); // return -1; //} }
int main() { char buf[]="Hello, World!"; size_t srcsize, worksize=strlen(buf); cl_int error; cl_platform_id platform; cl_device_id device; cl_uint platforms, devices; // Fetch the Platform and Device IDs; we only want one. error=clGetPlatformIDs(1, &platform, &platforms); error=clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, &devices); cl_context_properties properties[]={ CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; // Note that nVidia's OpenCL requires the platform property cl_context context=clCreateContext(properties, 1, &device, NULL, NULL, &error); cl_command_queue cq = clCreateCommandQueue(context, device, 0, &error); rot13(buf); // scramble using the CPU puts(buf); // Just to demonstrate the plaintext is destroyed //char src[8192]; //FILE *fil=fopen("rot13.cl","r"); //srcsize=fread(src, sizeof src, 1, fil); //fclose(fil); const char *src=rot13_cl; srcsize=strlen(rot13_cl); const char *srcptr[]={src}; // Submit the source code of the rot13 kernel to OpenCL cl_program prog=clCreateProgramWithSource(context, 1, srcptr, &srcsize, &error); // and compile it (after this we could extract the compiled version) error=clBuildProgram(prog, 0, NULL, "", NULL, NULL); // Allocate memory for the kernel to work with cl_mem mem1, mem2; mem1=clCreateBuffer(context, CL_MEM_READ_ONLY, worksize, NULL, &error); mem2=clCreateBuffer(context, CL_MEM_WRITE_ONLY, worksize, NULL, &error); // get a handle and map parameters for the kernel cl_kernel k_rot13=clCreateKernel(prog, "rot13", &error); clSetKernelArg(k_rot13, 0, sizeof(mem1), &mem1); clSetKernelArg(k_rot13, 1, sizeof(mem2), &mem2); // Target buffer just so we show we got the data from OpenCL char buf2[sizeof buf]; buf2[0]='?'; buf2[worksize]=0; // Send input data to OpenCL (async, don't alter the buffer!) 
error=clEnqueueWriteBuffer(cq, mem1, CL_FALSE, 0, worksize, buf, 0, NULL, NULL); // Perform the operation error=clEnqueueNDRangeKernel(cq, k_rot13, 1, NULL, &worksize, &worksize, 0, NULL, NULL); // Read the result back into buf2 error=clEnqueueReadBuffer(cq, mem2, CL_FALSE, 0, worksize, buf2, 0, NULL, NULL); // Await completion of all the above error=clFinish(cq); // Finally, output out happy message. puts(buf2); }
void sum_gpu(long long *in, long long *out, unsigned int n) { size_t global_size; size_t local_size; char *kernel_src; cl_int err; cl_platform_id platform_id; cl_device_id device_id; cl_uint max_compute_units; size_t max_workgroup_size; cl_context context; cl_command_queue commands; cl_program program; cl_kernel kernel; cl_mem d_array; cl_event event; cl_ulong start, end; /* start OpenCL */ err = clGetPlatformIDs(1, &platform_id,NULL); clErrorHandling("clGetPlatformIDs"); err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); clErrorHandling("clGetDeviceIDs"); context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); clErrorHandling("clCreateContext"); commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); clErrorHandling("clCreateCommandQueue"); /* create kernel */ kernel_src = file_to_string(KERNEL_SRC); program = clCreateProgramWithSource(context, 1, (const char**) &kernel_src, NULL, &err); free(kernel_src); clErrorHandling("clCreateProgramWithSource"); err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); clErrorHandling("clBuildProgram"); kernel = clCreateKernel(program, "matrix_mult", &err); clErrorHandling("clCreateKernel"); /* allocate memory and send to gpu */ d_array = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long long) * n, NULL, &err); clErrorHandling("clCreateBuffer"); err = clEnqueueWriteBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long) * n, in, 0, NULL, NULL); clErrorHandling("clEnqueueWriteBuffer"); err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL); err |= clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, NULL); clErrorHandling("clGetDeviceInfo"); /* prepare kernel args */ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_array); err |= clSetKernelArg(kernel, 1, sizeof(unsigned int), &n); /* execute */ local_size = n / max_compute_units / 8; if (local_size > 
max_workgroup_size) local_size = max_workgroup_size; /* * Usually it would be * global_size = local_size * max_compute_units; * but that would only be valid if local_size = n / max_compute_units; * local_size is n / max_compute_units / 8 because it obtains its hightest performance. */ for (global_size = local_size; global_size < n; global_size += local_size); err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &event); clErrorHandling("clEnqueueNDRangeKernel"); clWaitForEvents(1, &event); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); fprintf(stderr, "Time for event (ms): %10.5f \n", (end - start) / 1000000.0); err = clFinish(commands); clErrorHandling("clFinish"); /* transfer back */ err = clEnqueueReadBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long), out, 0, NULL, NULL); // a single long long clErrorHandling("clEnqueueReadBuffer"); /* cleanup*/ clReleaseMemObject(d_array); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); clReleaseEvent(event); }
int main() { // Get platform information err = clGetPlatformIDs(0, NULL, &numOfPlatforms); if (err) Error("Fail to get the number of platforms"); printf("The machine has %d platform(s) for OpenCL.\n", numOfPlatforms); platformIDs = new cl_platform_id [numOfPlatforms]; err = clGetPlatformIDs(numOfPlatforms, platformIDs, NULL); if (err) Error("Fail to get the platform list"); int cudaPlatformID = -1; for (int i = 0; i < numOfPlatforms; i++) { char platformName[50]; err = clGetPlatformInfo(platformIDs[i], CL_PLATFORM_NAME, 50, platformName, NULL); if (err) Error("Fail to get the platform name"); printf("Platform %d is %s\n", i + 1, platformName); if (!strcmp(platformName, "NVIDIA CUDA")) cudaPlatformID = i; } printf("\n"); if (cudaPlatformID == -1) Error("Fail to find an NVIDIA CUDA platform"); printf("Platform %d (NVIDIA CUDA) is chosen for use.\n", cudaPlatformID + 1); printf("\n"); // Get device information err = clGetDeviceIDs(platformIDs[cudaPlatformID], CL_DEVICE_TYPE_GPU, 0, NULL, &numOfDevices); if (err) Error("Fail to get the number of devices"); printf("CUDA platform has %d device(s).\n", numOfDevices); deviceIDs = new cl_device_id [numOfDevices]; err = clGetDeviceIDs(platformIDs[cudaPlatformID], CL_DEVICE_TYPE_GPU, numOfDevices, deviceIDs, NULL); if (err) Error("Fail to get the device list"); for (int i = 0; i < numOfDevices; i++) { char deviceName[50]; err = clGetDeviceInfo(deviceIDs[i], CL_DEVICE_NAME, 50, deviceName, NULL); if (err) Error("Fail to get the device name"); printf("Device %d is %s\n", i + 1, deviceName); } printf("\n"); // Create a context context = clCreateContext(NULL, numOfDevices, deviceIDs, NULL, NULL, &err); if (err) Error("Fail to create a context"); printf("Device 1 is chosen for use.\n"); printf("\n"); // Create a command queue for the first device commandQueue = clCreateCommandQueue(context, deviceIDs[0], CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, &err); if (err) Error("Fail to create a command queue"); // 
create the program cl_program program = CreateProgram(exclusiveScanKernels, "exclusive scan"); // create two kernels cl_kernel scanKernel = clCreateKernel(program, "Scan", &err); if (err) Error("Fail to create the kernel for scan"); cl_kernel reverseUpdateKernel = clCreateKernel(program, "ReverseUpdate", &err); if (err) Error("Fail to create the kernel for reverse update"); // Get the work group size size_t maxWorkGroupSize; err = clGetKernelWorkGroupInfo(scanKernel, deviceIDs[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkGroupSize, NULL); printf("maxWorkGroupSize = %d\n", maxWorkGroupSize); err = clGetKernelWorkGroupInfo(reverseUpdateKernel, deviceIDs[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkGroupSize, NULL); printf("maxWorkGroupSize = %d\n", maxWorkGroupSize); // Set work group size to 64 int workGroupSize = 512; int length = 2048000; int *arr = new int [length]; for (int i = 0; i < length; i++) arr[i] = rand() % 100; int *prefixSum = new int [length]; prefixSum[0] = 0; int t0 = clock(); for (int i = 1; i < length; i++) prefixSum[i] = prefixSum[i - 1] + arr[i - 1]; int t1 = clock(); printf("time1: %lf\n", (t1 - t0) * 1.0 / CLOCKS_PER_SEC); cl_mem d_arr = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int) * length, NULL, &err); if (err) Error("Fail to create d_arr"); err = clEnqueueWriteBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int) * length, arr, 0, NULL, NULL); if (err) Error("Fail to write d_arr"); clSetKernelArg(scanKernel, 0, sizeof(cl_mem), &d_arr); cl_int d_length = length; clSetKernelArg(scanKernel, 1, sizeof(cl_int), &d_length); cl_int d_step = 1; clSetKernelArg(scanKernel, 2, sizeof(cl_int), &d_step); clSetKernelArg(scanKernel, 3, sizeof(cl_int) * (workGroupSize * 2 + workGroupSize * 2 / 16 + 1), NULL); int problemSize = length; int records[10]; int num = 0; int t2 = clock(); for (; problemSize > 1; problemSize = (problemSize - 1) / (workGroupSize * 2) + 1) { if (num) d_step *= workGroupSize * 2; printf("d_step = %d\n", 
d_step); records[num++] = problemSize; printf("problemSize = %d\n", problemSize); clSetKernelArg(scanKernel, 2, sizeof(cl_int), &d_step); size_t globalWorkSize = ((problemSize - 1) / (workGroupSize * 2) + 1) * workGroupSize; size_t localWorkSize = workGroupSize; err = clEnqueueNDRangeKernel(commandQueue, scanKernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); if (err) Error("Fail to enqueue scan"); clFinish(commandQueue); } //CheckValues(length, d_arr); int zero = 0; clEnqueueWriteBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int), &zero, 0, NULL, NULL); printf("d_step = %d\n", d_step); //scanf("%*c"); clSetKernelArg(reverseUpdateKernel, 0, sizeof(cl_mem), &d_arr); clSetKernelArg(reverseUpdateKernel, 1, sizeof(cl_int), &d_length); for (int i = num - 1; i >= 0; i--, d_step /= workGroupSize * 2) { printf("d_step = %d\n", d_step); clSetKernelArg(reverseUpdateKernel, 2, sizeof(cl_int), &d_step); size_t globalWorkSize = ((records[i] - 1) / (workGroupSize * 2) + 1) * workGroupSize; size_t localWorkSize = workGroupSize; printf("globalWorkSize = %d, localWorkSize = %d\n", globalWorkSize, localWorkSize); err = clEnqueueNDRangeKernel(commandQueue, reverseUpdateKernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); if (err) Error("Fail to enqueue scan"); clFinish(commandQueue); } int t3 = clock(); printf("time: %lf\n", (t3 - t2) * 1.0 / CLOCKS_PER_SEC); int *GPUResult = new int [length]; memset(GPUResult, 0, sizeof(int) * length); err = clEnqueueReadBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int) * length, GPUResult, 0, NULL, NULL); printf("err = %d\n", err); if (err) Error("Fail to read d_arr"); for (int i = 0; i < length; i++) if (GPUResult[i] != prefixSum[i]) printf("at i = %d, GPUResult[%d] = %d, prefixSum[%d] = %d\n", i, i, GPUResult[i], i, prefixSum[i]); system("pause"); return 0; }
int device_check() { cl_int err; cl_int i,j,cnt; cl_platform_id *platforms; cl_uint num_platforms; cl_platform_id platform; char* ext_data; size_t ext_size; cl_device_id *devs; size_t num_devs; cl_device_id device; /* Program data structures */ cl_program program; FILE *program_handle; char *program_buffer[NUM_FILES]; char *program_log; const char *file_name[] = {PROGRAM_FILE_1, PROGRAM_FILE_2}; const char options[] = "-cl-finite-math-only -cl-no-signed-zeros"; size_t program_size[NUM_FILES]; size_t log_size; /*kernel data*/ cl_kernel *kernels; cl_uint num_kernels; /*枚举所有的平台,最多10个*/ err = clGetPlatformIDs(10, NULL, &num_platforms); /*参数1:要枚举的数量,参数2:返回结果的存放空间,参数3:返回结果的条数*/ if(err < 0) { perror("Couldn't find any platforms"); exit(1); } platforms=(cl_platform_id *)malloc( sizeof(cl_platform_id) * num_platforms ); clGetPlatformIDs(num_platforms, platforms, NULL); /*现在num_platforms和platforms是平台的数量和数据指针*/ /* Find infor of all platforms */ for (i=0; i<num_platforms; i++) { /* Find size of extension data */ /*clGetPlatformInfo*/ /* 参数1:平台 参数2:所需信息的枚举 参数3:返回值需要保存的长度 参数4:返回值的存储空间 参数5:所需数据的真实长度 */ platform = platforms[i] /*NAME*/ err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, NULL, &ext_size); if(err < 0) { perror("Couldn't read CL_PLATFORM_NAME data."); } ext_data = (char *)malloc(ext_size); clGetPlatformInfo(platform, CL_PLATFORM_NAME, ext_size, ext_data, NULL); printf("Platform %d name: %s\n", i, ext_data); free(ext_data); /*VRNDOR*/ err = clGetPlatformInfo(platform, CL_PLATFORM_VENDER, 0, NULL, &ext_size); if(err < 0) { perror("Couldn't read CL_PLATFORM_VENDER data."); } ext_data = (char *)malloc(ext_size); clGetPlatformInfo(platform, CL_PLATFORM_VENDER, ext_size, ext_data, NULL); printf("Platform %d vender: %s\n", i, ext_data); free(ext_data) /*VERSION*/ err = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &ext_size); if(err < 0) { perror("Couldn't read CL_PLATFORM_VERSION data."); } ext_data = (char *)malloc(ext_size); clGetPlatformInfo(platform, 
CL_PLATFORM_VERSION, ext_size, ext_data, NULL); printf("Platform %d support OpenCL version: %s\n", i, ext_data); free(ext_data) /*PROFILE*/ err = clGetPlatformInfo(platform, CL_PLATFORM_PROFILE, 0, NULL, &ext_size); if(err < 0) { perror("Couldn't read CL_PLATFORM_PROFILE data."); } ext_data = (char *)malloc(ext_size); clGetPlatformInfo(platform, CL_PLATFORM_PROFILE, ext_size, ext_data, NULL); printf("Platform %d support OpenCL profile: %s\n", i, ext_data); free(ext_data) /*EXTENSIONS*/ err = clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, 0, NULL, &ext_size); if(err < 0) { perror("Couldn't read CL_PLATFORM_EXTENSIONS data."); } ext_data = (char *)malloc(ext_size); clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, ext_size, ext_data, NULL); printf("Platform %d support OpenCL extensions: %s\n", i, ext_data); free(ext_data) /*现在对这个platform进一步的提取信息*/ /*获取Device信息*/ /*clGetDeviceIDs*/ /* 参数1:平台句柄 参数2:要获取设备的类型 参数3:要获取的数量 参数4:返回信息的数据指针 参数5:返回信息的实际条数 */ for (cnt=0; cnt<2; cnt++) { if (0==cnt) err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devs); if (1==cnt) err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, NULL, &num_devs); if(err == CL_DEVICE_NOT_FOUND) { if (0==cnt) printf("No GPU support OpenCL found.\n"); if (1==cnt) printf("No CPU support OpenCL found.\n"); } else if(err < 0) { if (0==cnt) printf("Couldn't access any GPU devices.\n"); if (1==cnt) printf("Couldn't access any CPU devices.\n"); } else { devs = (cl_device_id *)malloc( sizeof(cl_device_id) * num_devs ); clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devs, devs, NULL); for (j=0; j<num_devs; j++) { cl_device_id dev; char dev_name_data[48]; cl_uint addr_data; cl_ulong global_mem_size; /*name*/ err = clGetDeviceInfo(dev, CL_DEVICE_NAME, 48 * sizeof(char), dev_name_data, NULL); if(err < 0) { perror("Couldn't read dev name data"); exit(1); } printf("Dev %d: NAME: %s\n", j, name_data); /*address size*/ err = clGetDeviceInfo(dev, CL_DEVICE_ADDRESS_BITS, sizeof(addr_data), &addr_data, 
NULL); printf("Dev %d: ADDRESS_WIDTH: %u\n", j, addr_data); /*device extensions*/ ext_data=malloc(4096) clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, 4096 * sizeof(char), ext_data, NULL); printf("Dev %d: EXTENSIONS: %s\n", j, ext_data); free(ext_data) err = clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(global_mem_size), &global_mem_size, NULL); printf("Dev %d: GLOBAL_MEM_SIZE: %u\n", j, global_mem_size); } } } } return 0; }
void InitOpenCL() { // 1. Get a platform. cl_platform_id platform; clGetPlatformIDs( 1, &platform, NULL ); // 2. Find a gpu device. cl_device_id device; clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); // 3. Create a context and command queue on that device. cl_context context = clCreateContext( NULL, 1, &device, NULL, NULL, NULL); queue = clCreateCommandQueue( context, device, 0, NULL ); // 4. Perform runtime source compilation, and obtain kernel entry point. std::ifstream file("scene.cl"); std::string source; if (file){ while(!file.eof()){ char line[256]; file.getline(line,255); source += std::string(line) + "\n"; } } if (source.length()==0) { std::string err = "fail to load shader"; } cl_ulong maxSize; clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), &maxSize, 0); const char* str = source.c_str(); cl_program program = clCreateProgramWithSource( context, 1, &str, NULL, NULL ); cl_int result = clBuildProgram( program, 1, &device, NULL, NULL, NULL ); if ( result ){ char* build_log; size_t log_size; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); build_log = new char[log_size+1]; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); build_log[log_size] = '\0'; if( log_size > 2 ) { std::cout << "build log: " << build_log << std::endl; } delete[] build_log; std::cout << "Error during compilation! (" << result << ")" << std::endl; } kernel = clCreateKernel( program, "tracekernel", NULL ); // 5. Create a data buffer. 
buffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY, kWidth * kHeight *sizeof(cl_float4), NULL, 0 ); viewTransform = clCreateBuffer( context, CL_MEM_READ_WRITE, 16 *sizeof(cl_float), NULL, 0 ); worldTransforms = clCreateBuffer( context, CL_MEM_READ_WRITE, 16 *sizeof(cl_float)*2, NULL, 0 ); clSetKernelArg(kernel, 0, sizeof(buffer), (void*) &buffer); clSetKernelArg(kernel, 1, sizeof(cl_uint), (void*) &kWidth); clSetKernelArg(kernel, 2, sizeof(cl_uint), (void*) &kWidth); clSetKernelArg(kernel, 3, sizeof(viewTransform), (void*) &viewTransform); clSetKernelArg(kernel, 4, sizeof(worldTransforms), (void*) &worldTransforms); }
int main() { cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem memobj = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; float mem[MEM_SIZE]; FILE *fp; char fileName[] = "./kernel.clbin"; size_t binary_size; char *binary_buf; cl_int binary_status; cl_int i; /* カーネルを含むオブジェクトファイルをロード */ fp = fopen(fileName, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } binary_buf = (char *)malloc(MAX_BINARY_SIZE); binary_size = fread( binary_buf, 1, MAX_BINARY_SIZE, fp ); fclose( fp ); /* データを初期化 */ for( i = 0; i < MEM_SIZE; i++ ) { mem[i] = i; } /* プラットフォーム・デバイスの情報の取得 */ ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); /* OpenCLコンテキストの作成 */ context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); /* コマンドキューの作成 */ command_queue = clCreateCommandQueue(context, device_id, 0, &ret); /* メモリバッファの作成 */ memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(float), NULL, &ret); /* メモリバッファにデータを転送 */ ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL); /* 読み込んだバイナリからカーネルプログラムを作成 */ program = clCreateProgramWithBinary(context, 1, &device_id, (const size_t *)&binary_size, (const unsigned char **)&binary_buf, &binary_status, &ret); /* OpenCLカーネルの作成 */ kernel = clCreateKernel(program, "vecAdd", &ret); printf("err:%d\n", ret); /* OpenCLカーネル引数の設定 */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj); size_t global_work_size[3] = {MEM_SIZE, 0, 0}; size_t local_work_size[3] = {MEM_SIZE, 0, 0}; /* OpenCLカーネルを実行 */ ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); /* メモリバッファから結果を取得 */ ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * 
sizeof(float), mem, 0, NULL, NULL); /* 結果の表示 */ for(i=0; i<MEM_SIZE; i++) { printf("mem[%d] : %f\n", i, mem[i]); } /* 終了処理 */ ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(memobj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); free(binary_buf); return 0; }
int main(void) { const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float); // Generate the input array on the host. float h_a[ARRAY_SIZE]; float h_b[ARRAY_SIZE]; for (int i = 0; i < ARRAY_SIZE; i++) { h_a[i] = (float)i; h_b[i] = (float)(2 * i); } float h_c[ARRAY_SIZE]; FILE *fp; char *source_str; size_t source_size; fp = fopen("vectors_cl.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); // Get platform and device information cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); // Create an OpenCL context cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); // Create a command queue cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); // Create memory buffers on the device for each vector cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, ARRAY_BYTES, NULL, &ret); cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, ARRAY_BYTES, NULL, &ret); cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, ARRAY_BYTES, NULL, &ret); // Copy h_a and h_b to memory buffer ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_a, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_b, 0, NULL, NULL); // Create a program from the kernel source cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); if (ret != 0) { printf("clCreateProgramWithSource returned non-zero status %d\n\n", ret); exit(1); } // Build the program ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (ret != 0) { 
printf("clBuildProgram returned non-zero status %d: ", ret); if (ret == CL_INVALID_PROGRAM) { printf("invalid program\n"); } else if (ret == CL_INVALID_VALUE) { printf("invalid value\n"); } else if (ret == CL_INVALID_DEVICE) { printf("invalid device\n"); } else if (ret == CL_INVALID_BINARY) { printf("invalid binary\n"); } else if (ret == CL_INVALID_BUILD_OPTIONS) { printf("invalid build options\n"); } else if (ret == CL_INVALID_OPERATION) { printf("invalid operation\n"); } else if (ret == CL_COMPILER_NOT_AVAILABLE) { printf("compiler not available\n"); } else if (ret == CL_BUILD_PROGRAM_FAILURE) { printf("build program failure\n"); // Determine the size of the log size_t log_size; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); // Allocate memory for the log char *log = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); // Print the log printf("%s\n", log); } else if (ret == CL_OUT_OF_HOST_MEMORY) { printf("out of host memory\n"); } exit(1); } // Create the OpenCL kernel cl_kernel kernel = clCreateKernel(program, "add", &ret); // Set the arguments of the kernel ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj); size_t array_size = ARRAY_SIZE; ret = clSetKernelArg(kernel, 3, sizeof(const size_t), (void *)&array_size); // Execute the OpenCL kernel on the list size_t global_item_size = ARRAY_SIZE; // Process the entire lists size_t local_item_size = 1; // Divide work items into groups of 64 ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); // Read the memory buffer C on the device to the local variable C ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_c, 0, NULL, NULL); // Print out the resulting array. 
for (int i = 0; i < 8; i++) { printf("%d + %d = %d", (int)h_a[i], (int)h_b[i], (int)h_c[i]); printf(((i % 4) != 3) ? "\t" : "\n"); } printf("...\n"); for (int i = ARRAY_SIZE - 8; i < ARRAY_SIZE; i++) { printf("%d + %d = %d", (int)h_a[i], (int)h_b[i], (int)h_c[i]); printf(((i % 4) != 3) ? "\t" : "\n"); } // Clean up ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(a_mem_obj); ret = clReleaseMemObject(b_mem_obj); ret = clReleaseMemObject(c_mem_obj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); return 0; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); printResult(); /* Call clblas function. */ err = clblasSrot(N, bufX, 0, incx, bufY, 0, incy, C, S, 1, &queue, 0, NULL, &event); // printf("here\n"); if (err != CL_SUCCESS) { printf("clblasSrot() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. 
*/ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); /* At this point you will get the result of SROT placed in vector Y. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
int main(void) { hwloc_topology_t topology; cl_int clret; cl_platform_id *platform_ids; unsigned nrp, nrd, count, i, j; int err; hwloc_topology_init(&topology); hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_IO_DEVICES); hwloc_topology_load(topology); clret = clGetPlatformIDs(0, NULL, &nrp); if (CL_SUCCESS != clret || !nrp) return 0; platform_ids = malloc(nrp * sizeof(*platform_ids)); if (!platform_ids) return 0; clret = clGetPlatformIDs(nrp, platform_ids, &nrp); if (CL_SUCCESS != clret || !nrp) return 0; count = 0; for(i=0; i<nrp; i++) { cl_device_id *device_ids; clret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, 0, NULL, &nrd); if (CL_SUCCESS != clret || !nrd) continue; device_ids = malloc(nrd * sizeof(*device_ids)); if (!device_ids) continue; clret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, nrd, device_ids, &nrd); if (CL_SUCCESS != clret || !nrd) continue; for(j=0; j<nrd; j++) { hwloc_bitmap_t set; hwloc_obj_t osdev, osdev2, ancestor; const char *value; osdev = hwloc_opencl_get_device_osdev(topology, device_ids[j]); osdev2 = hwloc_opencl_get_device_osdev_by_index(topology, i, j); assert(osdev == osdev2); if (!osdev) { printf("no osdev for platform %d device %d\n", i, j); continue; } ancestor = hwloc_get_non_io_ancestor_obj(topology, osdev); set = hwloc_bitmap_alloc(); err = hwloc_opencl_get_device_cpuset(topology, device_ids[j], set); if (err < 0) { printf("no cpuset for platform %d device %d\n", i, j); } else { char *cpuset_string = NULL; hwloc_bitmap_asprintf(&cpuset_string, set); printf("got cpuset %s for platform %d device %d\n", cpuset_string, i, j); free(cpuset_string); assert(hwloc_bitmap_isequal(set, ancestor->cpuset)); } hwloc_bitmap_free(set); printf("found OSDev %s\n", osdev->name); err = strncmp(osdev->name, "opencl", 6); assert(!err); assert(atoi(osdev->name+6) == (int) count); value = hwloc_obj_get_info_by_name(osdev, "Backend"); err = strcmp(value, "OpenCL"); assert(!err); value = hwloc_obj_get_info_by_name(osdev, 
"Name"); printf("found OSDev name %s\n", value); count++; } } hwloc_topology_destroy(topology); return 0; }
int main() { // Set the image rotation (in degrees) float theta = 3.14159/6; float cos_theta = cosf(theta); float sin_theta = sinf(theta); printf("theta = %f (cos theta = %f, sin theta = %f)\n", theta, cos_theta, sin_theta); // Rows and columns in the input image int imageHeight; int imageWidth; const char* inputFile = "input.bmp"; const char* outputFile = "output.bmp"; // Homegrown function to read a BMP from file float* inputImage = readImage(inputFile, &imageWidth, &imageHeight); // Size of the input and output images on the host int dataSize = imageHeight*imageWidth*sizeof(float); // Output image on the host float* outputImage = NULL; outputImage = (float*)malloc(dataSize); // Set up the OpenCL environment cl_int status; // Discovery platform cl_platform_id platforms[2]; cl_platform_id platform; status = clGetPlatformIDs(2, platforms, NULL); chk(status, "clGetPlatformIDs"); platform = platforms[PLATFORM_TO_USE]; // Discover device cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); chk(status, "clGetDeviceIDs"); // Create context cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platform), 0}; cl_context context; context = clCreateContext(props, 1, &device, NULL, NULL, &status); chk(status, "clCreateContext"); // Create command queue cl_command_queue queue; queue = clCreateCommandQueue(context, device, 0, &status); chk(status, "clCreateCommandQueue"); // Create the input and output buffers cl_mem d_input; d_input = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, &status); chk(status, "clCreateBuffer"); cl_mem d_output; d_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL, &status); chk(status, "clCreateBuffer"); // Copy the input image to the device status = clEnqueueWriteBuffer(queue, d_input, CL_TRUE, 0, dataSize, inputImage, 0, NULL, NULL); chk(status, "clEnqueueWriteBuffer"); const char* source = readSource("rotation.cl"); // Create a program object with source and build 
it cl_program program; program = clCreateProgramWithSource(context, 1, &source, NULL, NULL); chk(status, "clCreateProgramWithSource"); status = clBuildProgram(program, 1, &device, NULL, NULL, NULL); chk(status, "clBuildProgram"); // Create the kernel object cl_kernel kernel; kernel = clCreateKernel(program, "img_rotate", &status); chk(status, "clCreateKernel"); // Set the kernel arguments status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_output); status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_input); status |= clSetKernelArg(kernel, 2, sizeof(int), &imageWidth); status |= clSetKernelArg(kernel, 3, sizeof(int), &imageHeight); status |= clSetKernelArg(kernel, 4, sizeof(float), &sin_theta); status |= clSetKernelArg(kernel, 5, sizeof(float), &cos_theta); chk(status, "clSetKernelArg"); // Set the work item dimensions size_t globalSize[2] = {imageWidth, imageHeight}; status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0, NULL, NULL); chk(status, "clEnqueueNDRange"); // Read the image back to the host status = clEnqueueReadBuffer(queue, d_output, CL_TRUE, 0, dataSize, outputImage, 0, NULL, NULL); chk(status, "clEnqueueReadBuffer"); // Write the output image to file storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile); return 0; }
int main(int argc, char** argv) { int err; // error code returned from api calls float data[DATA_SIZE]; // original data set given to device float results[DATA_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array // Fill our data set with random float values // int i = 0; unsigned int count = DATA_SIZE; for(i = 0; i < count; i++) data[i] = rand() / (float)RAND_MAX; // Connect to a compute device // int gpu = 1; err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 
sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "square", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation // input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! 
%d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } // Validate our results // correct = 0; for(i = 0; i < count; i++) { if(results[i] == data[i] * data[i]) correct++; } // Print a brief summary detailing the results // printf("Computed '%d/%d' correct values!\n", correct, count); // Shutdown and cleanup // clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
// Creates an OpenCL context on the first available platform.
//
// A GPU context is attempted first; if that fails a CPU context is tried.
// Returns the context, or NULL when no platform is found or neither context
// type can be created.
//
// checkErr() is defined elsewhere in this file; it is presumably a
// "report and abort on error" helper -- verify against its definition.
cl_context CreateContext()
{
    cl_int errNum;
    cl_uint numPlatforms = 0;
    cl_platform_id * platformIDs;
    cl_uint numDevices = 0;
    cl_device_id * deviceIDs;
    cl_context context = NULL;

    // First, select an OpenCL platform to run on. The count is validated
    // before it is used to size the stack allocation below.
    errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (errNum != CL_SUCCESS || numPlatforms == 0)
    {
        std::cerr << "Failed to find any OpenCL platforms." << std::endl;
        return NULL;
    }

    platformIDs = (cl_platform_id *)alloca(
            sizeof(cl_platform_id) * numPlatforms);

    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Failed to find any OpenCL platforms." << std::endl;
        return NULL;
    }
    std::cout<<"number of platforms:"<<numPlatforms<<std::endl;

    // Query the number of GPU devices on the first platform.
    deviceIDs = NULL;
    errNum = clGetDeviceIDs(
        platformIDs[0],
        CL_DEVICE_TYPE_GPU,
        0,
        NULL,
        &numDevices);
    if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
    {
        checkErr(errNum, "clGetDeviceIDs");
    }
    else
    {
        if (errNum == CL_DEVICE_NOT_FOUND)
        {
            numDevices = 0;
        }
        std::cout<<"number of devices:"<<numDevices<<std::endl;
    }

    // Only fetch the ids when there is at least one GPU: a zero-entry query
    // is itself an error and would defeat the CPU fallback below.
    if (numDevices > 0)
    {
        deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices);
        errNum = clGetDeviceIDs(
            platformIDs[0],
            CL_DEVICE_TYPE_GPU,
            numDevices,
            &deviceIDs[0],
            NULL);
        checkErr(errNum, "clGetDeviceIDs");
    }

    // Next, create an OpenCL context on the platform.  Attempt to
    // create a GPU-based context, and if that fails, try to create
    // a CPU-based context.
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platformIDs[0],
        0
    };

    // Given a platform and a device type, clCreateContextFromType() builds
    // the context directly, without an explicit device list.
    context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
                                      NULL, NULL, &errNum);
    if (errNum != CL_SUCCESS)
    {
        std::cout << "Could not create GPU context, trying CPU..." << std::endl;
        context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_CPU,
                                          NULL, NULL, &errNum);
        if (errNum != CL_SUCCESS)
        {
            std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;
            return NULL;
        }
    }

    return context;
}
/* Set up an OpenCL device wrapper.
 *
 * Takes the first OpenCL platform, selects the device at index info.num
 * among the devices of type opencl_device_type(), then creates a context,
 * a command queue and a 1-byte read-only buffer stored in null_mem.
 *
 * On any failure the constructor returns early and device_initialized
 * stays false; opencl_error() (defined elsewhere in the class) is used to
 * test and report errors. */
OpenCLDevice(DeviceInfo& info, Stats &stats, bool background_)
: Device(stats)
{
	background = background_;
	cpPlatform = NULL;
	cxContext = NULL;
	cqCommandQueue = NULL;
	cpProgram = NULL;
	ckPathTraceKernel = NULL;
	ckFilmConvertKernel = NULL;
	null_mem = 0;
	device_initialized = false;

	/* setup platform */
	cl_uint num_platforms;

	ciErr = clGetPlatformIDs(0, NULL, &num_platforms);
	if(opencl_error(ciErr))
		return;

	if(num_platforms == 0) {
		opencl_error("OpenCL: no platforms found.");
		return;
	}

	ciErr = clGetPlatformIDs(1, &cpPlatform, NULL);
	if(opencl_error(ciErr))
		return;

	/* start from an empty string so a failed query cannot leave garbage
	 * in platform_name */
	char name[256] = "";
	clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), name, NULL);
	platform_name = name;

	/* get devices */
	vector<cl_device_id> device_ids;
	cl_uint num_devices;

	if(opencl_error(clGetDeviceIDs(cpPlatform, opencl_device_type(), 0, NULL, &num_devices)))
		return;

	/* valid indices are 0 .. num_devices - 1, so an index equal to
	 * num_devices is already out of range */
	if(info.num >= num_devices) {
		if(num_devices == 0)
			opencl_error("OpenCL: no devices found.");
		else
			opencl_error("OpenCL: specified device not found.");
		return;
	}

	device_ids.resize(num_devices);

	if(opencl_error(clGetDeviceIDs(cpPlatform, opencl_device_type(), num_devices, &device_ids[0], NULL)))
		return;

	cdDevice = device_ids[info.num];

	/* create context */
	cxContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr);
	if(opencl_error(ciErr))
		return;

	cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
	if(opencl_error(ciErr))
		return;

	null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
	if(opencl_error(ciErr))
		return;

	device_initialized = true;
}
int main(int argc, const char** argv) { // start logs printf("clDeviceQuery Starting...\n\n"); bool bPassed = true; std::string sProfileString = "clDeviceQuery, Platform Name = "; // Get OpenCL platform ID for NVIDIA if avaiable, otherwise default char cBuffer[1024]; cl_platform_id clSelectedPlatformID = NULL; cl_platform_id* clPlatformIDs; cl_uint num_platforms; cl_int ciErrNum = clGetPlatformIDs(0, NULL, &num_platforms); if (ciErrNum != CL_SUCCESS) { printf(" Error %i in clGetPlatformIDs Call!\n\n", ciErrNum); bPassed = false; } else { if (num_platforms == 0) { printf("No OpenCL platform found!\n\n"); bPassed = false; } else { // if there's one platform or more, make space for ID's if ((clPlatformIDs = (cl_platform_id*)malloc(num_platforms * sizeof(cl_platform_id))) == NULL) { printf("Failed to allocate memory for cl_platform ID's!\n\n"); bPassed = false; } printf("%d OpenCL Platforms found\n\n", num_platforms); // get platform info for each platform ciErrNum = clGetPlatformIDs (num_platforms, clPlatformIDs, NULL); for(cl_uint i = 0; i < num_platforms; ++i) { ciErrNum = clGetPlatformInfo (clPlatformIDs[i], CL_PLATFORM_NAME, 1024, &cBuffer, NULL); if(ciErrNum == CL_SUCCESS) { clSelectedPlatformID = clPlatformIDs[i]; // Get OpenCL platform name and version ciErrNum = clGetPlatformInfo (clSelectedPlatformID, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL); if (ciErrNum == CL_SUCCESS) { printf(" CL_PLATFORM_NAME: \t%s\n", cBuffer); sProfileString += cBuffer; } else { printf(" Error %i in clGetPlatformInfo Call !!!\n\n", ciErrNum); bPassed = false; } sProfileString += ", Platform Version = "; ciErrNum = clGetPlatformInfo (clSelectedPlatformID, CL_PLATFORM_VERSION, sizeof(cBuffer), cBuffer, NULL); if (ciErrNum == CL_SUCCESS) { printf(" CL_PLATFORM_VERSION: \t%s\n", cBuffer); sProfileString += cBuffer; } else { printf(" Error %i in clGetPlatformInfo Call !!!\n\n", ciErrNum); bPassed = false; } // Log OpenCL SDK Version # (for convenience: not specific to OpenCL) 
sProfileString += ", NumDevs = "; // Get and log OpenCL device info cl_uint ciDeviceCount; cl_device_id *devices; printf("OpenCL Device Info:\n\n"); ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_ALL, 0, NULL, &ciDeviceCount); // check for 0 devices found or errors... if (ciDeviceCount == 0) { printf(" No devices found supporting OpenCL (return code %i)\n\n", ciErrNum); bPassed = false; sProfileString += "0"; } else if (ciErrNum != CL_SUCCESS) { printf(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum); bPassed = false; } else { // Get and log the OpenCL device ID's ciErrNum = clGetPlatformInfo (clSelectedPlatformID, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL); printf(" %u devices found supporting OpenCL on: %s\n\n", ciDeviceCount, cBuffer); char cTemp[2]; sprintf(cTemp, "%u", ciDeviceCount); sProfileString += cTemp; if ((devices = (cl_device_id*)malloc(sizeof(cl_device_id) * ciDeviceCount)) == NULL) { printf(" Failed to allocate memory for devices !!!\n\n"); bPassed = false; } ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_ALL, ciDeviceCount, devices, &ciDeviceCount); if (ciErrNum == CL_SUCCESS) { for(unsigned int i = 0; i < ciDeviceCount; ++i ) { printf(" ----------------------------------\n"); clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL); printf(" Device %s\n", cBuffer); printf(" ---------------------------------\n"); clPrintDevInfo(devices[i]); sProfileString += ", Device = "; sProfileString += cBuffer; } } else { printf(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum); bPassed = false; } } // masterlog info sProfileString += "\n"; printf("%s", sProfileString.c_str()); } free(clPlatformIDs); } } } // Log system info(for convenience: not specific to OpenCL) printf( "\nSystem Info: \n\n"); char timestr[255]; time_t now = time(NULL); struct tm *ts; ts = localtime(&now); strftime(timestr, 255, " %H:%M:%S, %m/%d/%Y",ts); // write time and date to logs printf(" Local Time/Date = %s\n", 
timestr); // write proc and OS info to logs // parse /proc/cpuinfo std::ifstream cpuinfo( "/proc/cpuinfo" ); // open the file in /proc std::string tmp; int cpu_num = 0; std::string cpu_name = "none"; do { cpuinfo >> tmp; if( tmp == "processor" ) cpu_num++; if( tmp == "name" ) { cpuinfo >> tmp; // skip : std::stringstream tmp_stream(""); do { cpuinfo >> tmp; if (tmp != std::string("stepping")) { tmp_stream << tmp.c_str() << " "; } } while (tmp != std::string("stepping")); cpu_name = tmp_stream.str(); } } while ( (! cpuinfo.eof()) ); // Linux version std::ifstream version( "/proc/version" ); char versionstr[255]; version.getline(versionstr, 255); printf(" CPU Name: %s\n # of CPU processors: %u\n %s\n\n\n", cpu_name.c_str(),cpu_num,versionstr); // finish printf("TEST %s\n\n", bPassed ? "PASSED" : "FAILED !!!"); }
int main( int argc, char* argv[] ) { // Length of vectors unsigned int n = 100000; // Host input vectors double *h_a; double *h_b; // Host output vector double *h_c; // Device input buffers cl_mem d_a; cl_mem d_b; // Device output buffer cl_mem d_c; cl_platform_id cpPlatform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel kernel; // kernel // Size, in bytes, of each vector size_t bytes = n*sizeof(double); // Allocate memory for each vector on host h_a = (double*)malloc(bytes); h_b = (double*)malloc(bytes); h_c = (double*)malloc(bytes); // Initialize vectors on host int i; for( i = 0; i < n; i++ ) { h_a[i] = sinf(i)*sinf(i); h_b[i] = cosf(i)*cosf(i); } size_t globalSize, localSize; cl_int err; // Number of work items in each local work group localSize = 64; // Number of total work items - localSize must be devisor globalSize = ceil(n/(float)localSize)*localSize; // Bind to platform err = clGetPlatformIDs(1, &cpPlatform, NULL); // Get ID for the device err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); // Create a command queue queue = clCreateCommandQueue(context, device_id, 0, &err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelSource, NULL, &err); // Build the program executable clBuildProgram(program, 0, NULL, NULL, NULL, NULL); // Create the compute kernel in the program we wish to run kernel = clCreateKernel(program, "vecAdd", &err); // Create the input and output arrays in device memory for our calculation d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL); // Write our data set into the input array in 
device memory err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, bytes, h_a, 0, NULL, NULL); err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0, bytes, h_b, 0, NULL, NULL); // Set the arguments to our compute kernel err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c); err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n); // Execute the kernel over the entire range of the data set err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); // Wait for the command queue to get serviced before reading back results clFinish(queue); // Read the results from the device clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0, bytes, h_c, 0, NULL, NULL ); //Sum up vector c and print result divided by n, this should equal 1 within error double sum = 0; for(i=0; i<n; i++) sum += h_c[i]; printf("final result: %f\n", sum/n); // release OpenCL resources clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseContext(context); //release host memory free(h_a); free(h_b); free(h_c); return 0; }
/** * @brief Create a new OpenCL zone, which will contain complete information for an OpenCL execution session on a specific device. * * @param deviceType Device type (OpenCL bitfield). * @param numQueues Number of command queues. * @param queueProperties Properties for the command queues. * @param devSel Pointer to function which will select device, if more than one is available. * @param dsExtraArg Extra argument for (*deviceSelector) function. * @param err Error structure, to be populated if an error occurs. * @return OpenCL zone or NULL if device wasn't properly initialized. */ CLUZone* clu_zone_new(cl_uint deviceType, cl_uint numQueues, cl_int queueProperties, clu_device_selector devSel, void* dsExtraArg, GError **err) { /* OpenCL status variable. */ cl_int status; /* OpenCL zone to initialize and return */ CLUZone* zone; /* Information about devices */ CLUDeviceInfo devInfos[CLU_MAX_DEVICES_TOTAL]; /* Number of devices. */ cl_uint numDevices; /* Index of device information */ cl_int deviceInfoIndex; /* Context properties, */ cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, 0, 0}; /* List of platform Ids. */ cl_platform_id platfIds[CLU_MAX_PLATFORMS]; /* Number of platforms. */ cl_uint numPlatforms; /* Total number of devices. */ unsigned int totalNumDevices; /* Device IDs for a given platform. 
*/ cl_device_id devIds[CLU_MAX_DEVICES_PER_PLATFORM]; /* Initialize zone */ zone = (CLUZone*) malloc(sizeof(CLUZone)); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, NULL == zone, CLU_ERROR_NOALLOC, error_handler, "Unable to allocate memory for OpenCL zone" ); zone->context = NULL; zone->queues = NULL; zone->program = NULL; zone->device_info.device_id = NULL; zone->device_info.platform_id = NULL; zone->device_info.device_name[0] = '\0'; zone->device_info.device_vendor[0] = '\0'; zone->device_info.platform_name[0] = '\0'; /* Get number of platforms */ status = clGetPlatformIDs(0, NULL, &numPlatforms); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: get number of platforms (OpenCL error %d: %s).", status, clerror_get(status)); /* Get existing platforms */ status = clGetPlatformIDs(numPlatforms, platfIds, NULL); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: get platform Ids (OpenCL error %d: %s).", status, clerror_get(status)); /* Cycle through platforms, get specified devices in existing platforms */ totalNumDevices = 0; for(unsigned int i = 0; i < numPlatforms; i++) { /* Get specified devices for current platform */ status = clGetDeviceIDs( platfIds[i], deviceType, CLU_MAX_DEVICES_PER_PLATFORM, devIds, &numDevices); if (status != CL_DEVICE_NOT_FOUND) { /* At least one device found, lets take note */ gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: get device Ids (OpenCL error %d: %s).", status, clerror_get(status)); for (unsigned int j = 0; j < numDevices; j++) { /* Keep device and platform IDs. */ devInfos[totalNumDevices].device_id = devIds[j]; devInfos[totalNumDevices].platform_id = platfIds[i]; /* Get device name. 
*/ status = clGetDeviceInfo( devIds[j], CL_DEVICE_NAME, sizeof(devInfos[totalNumDevices].device_name), devInfos[totalNumDevices].device_name, NULL); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: get device name info (OpenCL error %d: %s).", status, clerror_get(status)); /* Get device vendor. */ status = clGetDeviceInfo( devIds[j], CL_DEVICE_VENDOR, sizeof(devInfos[totalNumDevices].device_vendor), devInfos[totalNumDevices].device_vendor, NULL); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: get device vendor info (OpenCL error %d: %s).", status, clerror_get(status)); /* Get platform name. */ status = clGetPlatformInfo( platfIds[i], CL_PLATFORM_VENDOR, sizeof(devInfos[totalNumDevices].platform_name), devInfos[totalNumDevices].platform_name, NULL); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: get platform info (OpenCL error %d: %s).", status, clerror_get(status)); /* Increment total number of found devices. */ totalNumDevices++; } } } /* Check whether any devices of the specified type were found */ if (totalNumDevices == 0) { /* No devices of the specified type where found, return with error. */ gef_if_error_create_goto( *err, CLU_UTILS_ERROR, 1, CLU_ERROR_DEVICE_NOT_FOUND, error_handler, "clu_zone_new: device not found."); } else { /* Several compatible devices found, choose one with given selector function. */ deviceInfoIndex = devSel(devInfos, totalNumDevices, dsExtraArg); /* Test return value of selector function (if it is out of range, * there is a programming error). */ g_assert_cmpint(deviceInfoIndex, >=, -1); g_assert_cmpint(deviceInfoIndex, <, totalNumDevices); /* If selector function returned -1, then no device is selectable. 
*/ if (deviceInfoIndex == -1) { gef_if_error_create_goto( *err, CLU_UTILS_ERROR, 1, CLU_ERROR_DEVICE_NOT_FOUND, error_handler, "clu_zone_new: specified device not found."); } } /* Store info about the selected device and platform. */ zone->device_type = deviceType; zone->device_info = devInfos[deviceInfoIndex]; /* Determine number of compute units for that device */ status = clGetDeviceInfo( zone->device_info.device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &zone->cu, NULL); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: get target device info (OpenCL error %d: %s).", status, clerror_get(status)); /* Create a context on that device. */ cps[1] = (cl_context_properties) devInfos[deviceInfoIndex].platform_id; zone->context = clCreateContext(cps, 1, &zone->device_info.device_id, NULL, NULL, &status); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: creating context (OpenCL error %d: %s).", status, clerror_get(status)); /* Create the specified command queues on that device */ zone->numQueues = numQueues; zone->queues = (cl_command_queue*) malloc(numQueues * sizeof(cl_command_queue)); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, NULL == zone->queues, CLU_ERROR_NOALLOC, error_handler, "Unable to allocate memory to keep OpenCL command queues in Zone." ); for (unsigned int i = 0; i < numQueues; i++) { zone->queues[i] = clCreateCommandQueue( zone->context, zone->device_info.device_id, queueProperties, &status); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: creating command queue (OpenCL error %d: %s).", status, clerror_get(status)); } /* If we got here, everything is OK. */ g_assert (err == NULL || *err == NULL); goto finish; error_handler: /* If we got here there was an error, verify that it is so. 
*/ g_assert (err == NULL || *err != NULL); /* Free OpenCL zone. */ if (zone != NULL) { clu_zone_free(zone); zone = NULL; } finish: /* Return. */ return zone; }
void runProgram(int N, char *fileName) { printf("GPU Symmetrize()..." "\nSquareMatrix[%d][%d]\n", N, N); int i,j; // initialize input array float *A; A = (float*)malloc(sizeof(float)*N*N); for( i = 0; i < N ; ++i ) { for( j = 0; j < N ; ++j ) { A[i*N + j] = j; } } // result float *Aout; Aout = (float*)malloc(sizeof(float)*N*N); #ifdef DEBUG puts("A"); check_2d_f(A,N,N); #endif int NumK = 1; int NumE = 2; double gpuTime; cl_ulong gstart, gend; //------------------------------------------------ // OpenCL //------------------------------------------------ cl_int err; cl_platform_id platform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK); cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE); // read kernel file //char *fileName = "transpose_kernel.cl"; char *kernelSource; size_t size; FILE *fh = fopen(fileName, "rb"); if(!fh) { printf("Error: Failed to open kernel file!\n"); exit(1); } fseek(fh,0,SEEK_END); size=ftell(fh); fseek(fh,0,SEEK_SET); kernelSource = malloc(size+1); size_t result; result = fread(kernelSource,1,size,fh); if(result != size){ fputs("Reading error", stderr);exit(1);} kernelSource[size] = '\0'; // Bind to platform err = clGetPlatformIDs(1, &platform, NULL); OCL_CHECK(err); // Get ID for the device err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); OCL_CHECK(err); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); OCL_CHECK(err); // Create a command queue queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); OCL_CHECK(err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err); OCL_CHECK(err); // turn on optimization for kernel char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros 
-cl-unsafe-math-optimizations -cl-finite-math-only"; err = clBuildProgram(program, 1, &device_id, options, NULL, NULL); if(err != CL_SUCCESS) printCompilerOutput(program, device_id); OCL_CHECK(err); #ifdef SAVEBIN // Calculate size of binaries size_t binary_size; err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL); OCL_CHECK(err); //printf("binary size = %ld\n", binary_size); unsigned char* bin; bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size); err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*) , &bin, NULL); OCL_CHECK(err); //puts("save binaries"); // Print the binary out to the output file fh = fopen("kernel.bin", "wb"); fwrite(bin, 1, binary_size, fh); fclose(fh); puts("done save binaries"); #endif kernel[0] = clCreateKernel(program, "kernel_a", &err); OCL_CHECK(err); // memory on device cl_mem A_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); cl_mem Aout_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); // copy data to device err = clEnqueueWriteBuffer(queue, A_d, CL_TRUE, 0, sizeof(float)*N*N, A, 0, NULL , &event[0]); OCL_CHECK(err); size_t localsize[2]; size_t globalsize[2]; localsize[0] = 16; localsize[1] = 16; globalsize[0] = N; globalsize[1] = N; err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &Aout_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clEnqueueNDRangeKernel(queue, kernel[0], 2, NULL, globalsize, localsize, 0, NULL, NULL); OCL_CHECK(err); clFinish(queue); // read device data back to host clEnqueueReadBuffer(queue, Aout_d, CL_TRUE, 0, sizeof(float)*N*N, Aout, 0, NULL , &event[1]); err = clWaitForEvents(1,&event[1]); OCL_CHECK(err); err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL); OCL_CHECK(err); err = 
clGetEventProfilingInfo (event[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL); OCL_CHECK(err); gpuTime = (double)(gend -gstart)/1000000000.0; //check_1d_f(sum, blks+1); #ifdef DEBUG puts("Output"); check_2d_f(Aout,N,N); #endif printf("oclTime = %lf (s)\n", gpuTime ); // free clReleaseMemObject(A_d); clReleaseMemObject(Aout_d); // // check // int flag = 1; // for(i=0;i<N;++i){ // for(j=0;j<N;++j){ // if(A[i*N+j] != At[j*N+i]) // { // flag = 0; // break; // } // } // } // if( flag == 0 ) // { // puts("Bugs! Check program."); // }else{ // puts("Succeed!"); // } clReleaseProgram(program); clReleaseContext(context); clReleaseCommandQueue(queue); for(i=0;i<NumK;++i){ clReleaseKernel(kernel[i]); } for(i=0;i<NumE;++i){ clReleaseEvent(event[i]); } free(kernelSource); #ifdef SAVEBIN free(bin); #endif free(A); free(Aout); return; }
int main() { int i, j; char* value; size_t valueSize; cl_uint platformCount; cl_platform_id* platforms; cl_uint deviceCount; cl_device_id* devices; cl_uint maxComputeUnits; // get all platforms clGetPlatformIDs(0, NULL, &platformCount); platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount); clGetPlatformIDs(platformCount, platforms, NULL); for (i = 0; i < platformCount; i++) { // get all devices clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount); clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL); // for each device print critical attributes for (j = 0; j < deviceCount; j++) { // print device name clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize); value = (char*) malloc(valueSize); clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL); printf("%d. Device: %s\n", j+1, value); free(value); // print hardware device version clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize); value = (char*) malloc(valueSize); clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL); printf(" %d.%d Hardware version: %s\n", j+1, 1, value); free(value); // print software driver version clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize); value = (char*) malloc(valueSize); clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL); printf(" %d.%d Software version: %s\n", j+1, 2, value); free(value); // print c version supported by compiler for device clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize); value = (char*) malloc(valueSize); clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL); printf(" %d.%d OpenCL C version: %s\n", j+1, 3, value); free(value); // print parallel compute units clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(maxComputeUnits), &maxComputeUnits, NULL); printf(" %d.%d 
Parallel compute units: %d\n", j+1, 4, maxComputeUnits); } free(devices); } free(platforms); exit(0); }
int main(int argc, char *argv[]){ if (MODE == 5){ printf("---OpenCL Test Code---\n\n"); cl_int errNum; cl_uint numPlatforms; cl_platform_id *platforms = NULL; cl_uint numDevices; cl_device_id *devices = NULL; //platform info fields char vendor[1024], name[1024], version[1024]; //device info fields size_t MAX_WORK_GROUP_SIZE; cl_ulong GLOBAL_MEM_CACHE_SIZE, GLOBAL_MEM_SIZE, LOCAL_MEM_SIZE, GLOBAL_MEM_CACHELINE_SIZE; cl_uint MAX_COMPUTE_UNITS, MAX_WORK_ITEM_DIMENSIONS; size_t MAX_WORK_ITEM_SIZES[3]; char DEVICE_NAME[1024], DEVICE_VENDOR[1024], DEVICE_VERSION[1024], DRIVER_VERSION[1024], EXTENSIONS[2048]; cl_device_mem_cache_type GLOBAL_MEM_CACHE_TYPE; //printf("Getting number of OpenCL Platforms...\n"); errNum = clGetPlatformIDs(0, NULL, &numPlatforms); if (errNum != CL_SUCCESS) { printf("Failed to get number of OpenCL platforms.\n"); return 0; } else { //printf("found %d.\n", numPlatforms); } //printf("Allocating space for the platform info...\n"); platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); printf("---Platform Info---\n"); errNum = clGetPlatformIDs(numPlatforms, platforms, NULL); if (errNum != CL_SUCCESS) { printf("Failed to get platform info.\n"); return 0; } else { clGetPlatformInfo (platforms[0], CL_PLATFORM_VENDOR, sizeof(vendor), vendor, NULL); clGetPlatformInfo (platforms[0], CL_PLATFORM_NAME, sizeof(name), name, NULL); clGetPlatformInfo (platforms[0], CL_PLATFORM_VERSION, sizeof(version), version, NULL); //printf("Got platform info.\n"); printf("Vendor: \t%s\n", vendor); printf("Name: \t%s\n", name); printf("Version:\t%s\n", version); } //printf("Getting number of devices...\n"); errNum = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); if (errNum != CL_SUCCESS) { printf("Failed to get number of devices.\n"); return 0; } else { //printf("Found %d.\n", numDevices); } //printf("Allocating space for device info...\n"); devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); printf("\n---Device 
Info---"); errNum = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); if (errNum != CL_SUCCESS) { printf("Failed to get device info.\n"); return 0; } else { int i, j = 0; for (i = 0; i < numDevices; i++ ) { printf("\nDevice ID: %d\n", i+1); clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(DEVICE_NAME), DEVICE_NAME, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(DEVICE_VENDOR), DEVICE_VENDOR, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(DEVICE_VERSION), DEVICE_VERSION, NULL); clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(DRIVER_VERSION), DRIVER_VERSION, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_EXTENSIONS, sizeof(EXTENSIONS), EXTENSIONS, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MAX_COMPUTE_UNITS), &MAX_COMPUTE_UNITS, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(GLOBAL_MEM_SIZE), &GLOBAL_MEM_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(LOCAL_MEM_SIZE), &LOCAL_MEM_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(MAX_WORK_ITEM_DIMENSIONS), &MAX_WORK_ITEM_DIMENSIONS, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(MAX_WORK_ITEM_SIZES), MAX_WORK_ITEM_SIZES, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(MAX_WORK_GROUP_SIZE), &MAX_WORK_GROUP_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(GLOBAL_MEM_CACHE_SIZE), &GLOBAL_MEM_CACHE_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(GLOBAL_MEM_CACHELINE_SIZE), &GLOBAL_MEM_CACHELINE_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(GLOBAL_MEM_CACHE_TYPE), &GLOBAL_MEM_CACHE_TYPE, NULL); printf("Device Name:\t%s\n", DEVICE_NAME); printf("Device Vendor:\t%s\n", DEVICE_VENDOR); printf("Device Version:\t%s\n", DEVICE_VERSION); printf("Driver Version:\t%s\n", DRIVER_VERSION); 
printf("EXTENSIONS:\t%s\n", EXTENSIONS); printf("Number of CUs:\t%d\n", MAX_COMPUTE_UNITS); printf("GMem:\t\t%lld (Bytes)\n", (long long) GLOBAL_MEM_SIZE); printf("GMem $ Size:\t%lld (Bytes)\n", (long long) GLOBAL_MEM_CACHE_SIZE); printf("GMem $ Line:\t%lld (Bytes)\n", (long long) GLOBAL_MEM_CACHELINE_SIZE); if(GLOBAL_MEM_CACHE_TYPE == CL_NONE) { printf("GMem $ Type:\tCL_NONE\n"); } else if(GLOBAL_MEM_CACHE_TYPE == CL_READ_ONLY_CACHE) { printf("GMem $ Type:\tCL_READ_ONLY_CACHE\n"); } else if(GLOBAL_MEM_CACHE_TYPE == CL_READ_WRITE_CACHE) { printf("GMem $ Type:\tCL_READ_WRITE_CACHE\n"); } printf("LMem:\t\t%lld (Bytes)\n", (long long) LOCAL_MEM_SIZE); printf("Work Group Size:%d (Max)\n", (int) MAX_WORK_GROUP_SIZE); printf("Work Item Dim:\t%d (Max)\n", MAX_WORK_ITEM_DIMENSIONS); printf("Work Item Size:\t"); for(j = 0; j < MAX_WORK_ITEM_DIMENSIONS; j ++) { if (j != (MAX_WORK_ITEM_DIMENSIONS -1)) printf("%d, ", (int) MAX_WORK_ITEM_SIZES[j]); if (j == (MAX_WORK_ITEM_DIMENSIONS -1)) printf("%d ", (int) MAX_WORK_ITEM_SIZES[j]); } printf("(Max)\n"); } //printf("Got device info.\n"); } } else if (MODE == 4){ cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; //Create an OpenCL context on first available platform context = CreateContext(); if (context == NULL) { printf("Failed to create OpenCL context.\n"); return 1; } //Create a command-queue on the first device available on the created context commandQueue = CreateCommandQueue(context, &device); if (commandQueue == NULL) { printf("Failed to create commandQueue.\n"); Cleanup(context, commandQueue, program, NULL); return 1; } // Create OpenCL program and store the binary for future use. 
printf("Attempting to create kernel binary from source.\n"); program = CreateProgram(context, device, KERNELPATHIN); if (program == NULL) { printf("Failed to create Program"); Cleanup(context, commandQueue, program, NULL); return 1; } printf("Kernel is saved.\n"); if (SaveProgramBinary(program, device, KERNELPATHOUT) == false) { printf("Failed to write program binary.\n"); Cleanup(context, commandQueue, program, NULL); return 1; } //printf("---Done---"); //return 1; } else if (MODE == 3){ //todo free remaining objects not passed to cleanup //profiling int write_bytes = 0; int read_bytes = 0; /*unsigned long long start_cycles, stop_cycles; unsigned long long start_setup, stop_setup; unsigned long long start_write, stop_write; unsigned long long start_read, stop_read; unsigned long long start_finalize, stop_finalize; struct timespec start_time_t, stop_time_t;*/ printf("Stream Mode\n\n"); //clock_gettime(CLOCK_MONOTONIC, &start_time_t); //start_cycles = rdtsc(); int i; time_t t; srand((unsigned) time(&t)); // Create the two input vectors printf("\nHostside malloc(s)\n"); fflush(stdout); int *A = (int*)malloc(sizeof(int)*(SIZE*SIZE)); int *B = (int*)malloc(sizeof(int)*(SIZE*SIZE)); int *C = (int*)malloc(sizeof(int)*(SIZE*SIZE)); //profile //bytes += 3 * sizeof(int)*(SIZE*SIZE); printf("\nHostside mat init\n"); fflush(stdout); for(i = 0; i < (SIZE*SIZE); i++) { A[i] = B[i] = rand() % 10 + 1;; } //print matrix printf("Matrix A[%d][%d]:\n", SIZE, SIZE); for(i = 0; i < (SIZE*SIZE); i++) { printf("%3d ", A[i]); if(((i + 1) % SIZE) == 0) printf("\n"); } //print matrix printf("\nMatrix B[%d][%d]:\n", SIZE, SIZE); for(i = 0; i < (SIZE*SIZE); i++) { printf("%3d ", B[i]); if(((i + 1) % SIZE) == 0) printf("\n"); } //syscall(STATS_RESET); //Get platform and device information cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; cl_kernel kernel = 0; cl_uint err = 0; //char *filepath = NULL; //Create the context 
printf("\nCreateContext\n"); fflush(stdout); context = CreateContext(); if (context == NULL) { printf("Failed to create OpenCL context.\n"); return 1; } /* printf("\nEnd CreateContext\n"); fflush(stdout);*/ //Create a command-queue on the first device available on the created context printf("\nCreateCommandQueue\n"); fflush(stdout); commandQueue = CreateCommandQueue(context, &device); if (commandQueue == NULL) { printf("Failed to create command queue.\n"); Cleanup(context, commandQueue, program, NULL); return 1; } //create the program from the binary //program = CreateProgramFromBinary(context, device, "/home/stardica/Desktop/Kernels/vector.cl.bin.GPU"); //strcat(KERNELPATHOUT, ".GPU") printf("\nCreateProgramFromBinary\n"); fflush(stdout); program = CreateProgramFromBinary(context, device, KERNEL); if (program == NULL) { printf("Failed to load kernel binary,\n"); Cleanup(context, commandQueue, program, NULL); return 1; } // Create OpenCL kernel printf("\nclCreateKernel\n"); fflush(stdout); kernel = clCreateKernel(program, "Matrix", NULL); if (kernel == NULL) { printf("Failed to create kernel.\n"); Cleanup(context, commandQueue, program, NULL); return 1; } cl_mem a_mem_obj = 0; cl_mem b_mem_obj = 0; cl_mem c_mem_obj = 0; //Create memory buffers on the device for each vector printf("\nclCreateBuffer(s)\n"); fflush(stdout); if(LOCALMEM == 1 && CACHEDMEM == 0) { //this creates uncached buffers in the GPU's local memory #if M2S_CGM_OCL_SIM { a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); } #else { a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); c_mem_obj = 
clCreateBuffer(context, CL_MEM_WRITE_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); } #endif } if(SYSMEM == 1 && CACHEDMEM == 0) { //this creates uncached buffers in the system memory #if M2S_CGM_OCL_SIM { a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); b_mem_obj = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); } #else { a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); b_mem_obj = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); } #endif } if(SYSMEM == 1 && CACHEDMEM == 1) { //this creates cached buffers in the system memory. 
#if M2S_CGM_OCL_SIM { a_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); b_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); c_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); } #else { a_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); b_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); c_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); } #endif } if (a_mem_obj == NULL || b_mem_obj == NULL || c_mem_obj == NULL) { printf("Failed to create memory objects.\n"); Cleanup(context, commandQueue, program, kernel); return 1; } //Copy the lists A and B to their respective memory buffers printf("\nclEnqueueWriteBuffer(s)\n"); fflush(stdout); write_bytes += 2 * sizeof(int)*(SIZE*SIZE); // start_write = rdtsc(); clEnqueueWriteBuffer(commandQueue, a_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), A, 0, NULL, NULL); clEnqueueWriteBuffer(commandQueue, b_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), B, 0, NULL, NULL); // stop_write = rdtsc(); // Set the arguments of the kernel int *size = (int *)SIZE; printf("\nclSetKernelArg(s)\n"); fflush(stdout); err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&c_mem_obj); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&a_mem_obj); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&b_mem_obj); err = clSetKernelArg(kernel, 3, sizeof(int), (void *)&size); if (err != CL_SUCCESS) { printf("Kernel args not set.\n"); return 1; } // Execute the OpenCL kernel on the list size_t GlobalWorkSize[2], LocalWorkSize[2]; //Rember that in OpenCL we need to express the globalWorkSize in //terms of the total number of threads. 
The underlying OpenCL API //will look at the globalWorkSize and divide by the localWorkSize //to arrive at a 64 by 64 NDRange of 16 by 16 work groups. GlobalWorkSize[0] = GWS_0;//SIZE*SIZE*SIZE; // Process the entire lists GlobalWorkSize[1] = GWS_1;//SIZE*SIZE*SIZE; // Process the entire lists LocalWorkSize[0] = LWS_0; //SIZE Divide work items into groups of 64 LocalWorkSize[1] = LWS_1; //SIZE Divide work items into groups of 64 //used null for local, lets OpenCL determine the best local size. //err = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL); printf("\nclEnqueueNDRangeKernel\n"); fflush(stdout); err = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("ND range not enqueued. Code: %d\n", err); return 1; } //Read the memory buffer C on the device to the local variable C printf("\nclEnqueueReadBuffer\n"); fflush(stdout); read_bytes += sizeof(int)*(SIZE*SIZE); //start_read = rdtsc(); err = clEnqueueReadBuffer(commandQueue, c_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), C, 0, NULL, NULL); // stop_read = rdtsc(); if (err != CL_SUCCESS) { printf("Buffer not returned.\n"); return 1; } //syscall(STATS_STOP); //print matrix printf("\nMatrix C[%d][%d] = A[%d][%d]*B[%d][%d]:\n", SIZE, SIZE, SIZE, SIZE, SIZE, SIZE); for(i = 0; i < (SIZE*SIZE); i++) { printf("%3d ", C[i]); if(((i + 1) % SIZE) == 0) printf("\n"); } printf("\nHostside clean up\n"); fflush(stdout); err = clFlush(commandQueue); err = clFinish(commandQueue); Cleanup(context, commandQueue, program, kernel); err = clReleaseMemObject(a_mem_obj); err = clReleaseMemObject(b_mem_obj); err = clReleaseMemObject(c_mem_obj); free(A); free(B); free(C); //printf("---Done---"); /*stop_cycles = rdtsc(); clock_gettime(CLOCK_MONOTONIC, &stop_time_t); printf("Total cycles = %llu\n", (stop_cycles - start_cycles)); long int time_s = stop_time_t.tv_nsec - start_time_t.tv_nsec; 
printf("Approximate runtime (check) = %ld ms\n", (time_s/1000000)); printf("Bytes written %d\n", write_bytes); printf("transfer cycles = %llu\n", (stop_write - start_write)); printf("start at = %llu\n", (start_write - start_cycles)); printf("Bytes read %d\n", read_bytes); printf("transfer cycles = %llu\n", (stop_read - start_read)); printf("start at = %llu\n", (start_read - start_cycles));*/ } else if (MODE == 2){ printf("Multi Thread Mode\n"); //cal this: //assignToThisCore(0);//assign to core 0,1,2,... unsigned long long a, b; int i = 0; int j = 0; int k = 0; LoadMatrices(); pthread_t tid[SIZE*SIZE]; //printf("waiting\n"); //start our threads a = rdtsc(); syscall(BEGIN_PARALLEL_SECTION); for(i=0;i<SIZE;i++){ for(j=0;j<SIZE;j++){ struct RowColumnData *RCData = (struct RowColumnData *) malloc(sizeof(struct RowColumnData)); RCData->RowNum = i; RCData->ColumnNum = j; //printf("Thread create %d Row %d Col %d\n", k, RCData->RowNum, RCData->ColumnNum); pthread_create(&tid[k], NULL, RowColumnMultiply, RCData); k++; } } //Join threads//////////////////////////// for (i=0;i<NUM_THREADS;i++) { pthread_join(tid[i], NULL); } syscall(END_PARALLEL_SECTION); b = rdtsc(); PrintMatrices(); //printf("\nend clock Cycles: %llu\n", b); printf("\nDone. Number of clock Cycles: %llu\n", b-a); } else if (MODE == 1) { printf("Single Thread Mode\n\n"); //unsigned long long a, b; //a = rdtsc(); //time_t t; int i,j,k; //srand((unsigned) time(&t)); LoadMatrices(); //multiply mats///////////////////////// for (i=0;i<SIZE;i++){ for(j=0;j<SIZE;j++){ for(k=0;k<SIZE;k++){ matC[i][j] = matC[i][j] + (matA[i][k] * matB[k][j]); } } } PrintMatrices(); //b = rdtsc(); //printf("\nDone. 
Number of clock Cycles: %llu\n", b-a); } else if (MODE == 0) { printf("---Misc Tests---\n\n"); printf("size of long long is %d\n", (int) sizeof(long long)); printf("size of long is %d\n", (int) sizeof(long)); printf("size of int is %d\n", (int) sizeof(int)); printf("size of short is %d\n", (int) sizeof(short)); printf("size of char * %d\n", (int) sizeof(char *)); printf("size of unsigned int (word) %d\n", (int) sizeof(unsigned int)); char *string = "test string"; printf("Here is the string 1: \"%s\"\n", string); //Using the struct //set string variable and point to print_me. object.string = strdup(string); object.print_me = (void (*)(void *)) print_me; //use of print_me object.print_me(object.string); //pointer fun struct Object *ptr = &object; printf("this is the value of the pointer to struct object: %p\n", ptr); object.next=&object; printf("this is the value of the pointer to struct object: %p\n", object.next); object_ptr = &object; object_ptr->next = &object; printf("this is the value of the pointer to struct object: %p\n", object_ptr->next); //Macro fun PRINT(ptr, ptr); PRINT(object.next, object.next); PRINT(object_ptr->next, object_ptr->next); int mmu_page_size = 1 << 12; printf("mmu_papge_size = %d\n", mmu_page_size); //setjmp and longjmp fun /*jmp_buf environment; int i; i = setjmp(environment); printf("\n\nsetjmp returned = %d\n", i); printf("Env 1:\n"); int x = 0; for(x = 0; x < 6; x++) { printf(" %x\n", environment[x]); } if (i < 3) { longjmp(environment, 3); } printf("longjmp finished with i = %d\n", i);*/ } else { printf("---Invalid Mode Set---\n\n"); } printf("\n---Done---\n"); return 1; }
int main(int argc, char **argv) { if (find_option(argc, argv, "-h") >= 0) { printf("Options:\n"); printf("-h to see this help\n"); printf("-n <int> to set the number of particles\n"); printf("-o <filename> to specify the output file name\n"); printf("-s <filename> to specify the summary output file name\n"); return 0; } int n = read_int(argc, argv, "-n", 1000); char *savename = read_string(argc, argv, "-o", NULL); char *sumname = read_string(argc, argv, "-s", NULL); // For return values. cl_int ret; // OpenCL stuff. // Loading kernel files. FILE *kernelFile; char *kernelSource; size_t kernelSize; kernelFile = fopen("simulationKernel.cl", "r"); if (!kernelFile) { fprintf(stderr, "No file named simulationKernel.cl was found\n"); exit(-1); } kernelSource = (char*)malloc(MAX_SOURCE_SIZE); kernelSize = fread(kernelSource, 1, MAX_SOURCE_SIZE, kernelFile); fclose(kernelFile); // Getting platform and device information cl_platform_id platformId = NULL; cl_device_id deviceID = NULL; cl_uint retNumDevices; cl_uint retNumPlatforms; ret = clGetPlatformIDs(1, &platformId, &retNumPlatforms); // Different types of devices to pick from. At the moment picks the default opencl device. //CL_DEVICE_TYPE_GPU //CL_DEVICE_TYPE_ACCELERATOR //CL_DEVICE_TYPE_DEFAULT //CL_DEVICE_TYPE_CPU ret = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_ACCELERATOR, 1, &deviceID, &retNumDevices); // Max workgroup size size_t max_available_local_wg_size; ret = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_available_local_wg_size, NULL); // Creating context. 
cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &ret); // Creating command queue cl_command_queue commandQueue = clCreateCommandQueueWithProperties (context, deviceID, 0, &ret); // Build program cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, (const size_t *)&kernelSize, &ret); // printf("program = ret %i \n", ret); ret = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL); // printf("clBuildProgram: ret %i \n", ret); // Create kernels cl_kernel forceKernel = clCreateKernel(program, "compute_forces_gpu", &ret); cl_kernel moveKernel = clCreateKernel(program, "move_gpu", &ret); cl_kernel binInitKernel = clCreateKernel(program, "bin_init_gpu", &ret); cl_kernel binKernel = clCreateKernel(program, "bin_gpu", &ret); FILE *fsave = savename ? fopen(savename, "w") : NULL; FILE *fsum = sumname ? fopen(sumname, "a") : NULL; particle_t *particles = (particle_t*)malloc(n * sizeof(particle_t)); // GPU particle data structure cl_mem d_particles = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(particle_t), NULL, &ret); // Set size set_size(n); init_particles(n, particles); double copy_time = read_timer(); // Copy particles to device. ret = clEnqueueWriteBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, NULL); copy_time = read_timer() - copy_time; // Calculating thread and thread block counts. // sizes size_t globalItemSize; size_t localItemSize; // Global item size if (n <= NUM_THREADS) { globalItemSize = NUM_THREADS; localItemSize = 16; } else if (n % NUM_THREADS != 0) { globalItemSize = (n / NUM_THREADS + 1) * NUM_THREADS; } else { globalItemSize = n; } // Local item size localItemSize = globalItemSize / NUM_THREADS; // Bins and bin sizes. // Because of uniform distribution we will know that bins size is amortized. Therefore I picked the value of 10. // There will never be 10 particles in one bin. int maxParticles = 10; // Calculating the number of bins. 
int numberOfBins = (int)ceil(size/(2*cutoff)) + 2; // Bins will only exist on the device. particle_t* bins; // How many particles are there in each bin - also only exists on the device. volatile int* binSizes; // Number of bins to be initialized. size_t clearAmt = numberOfBins*numberOfBins; // Allocate memory for bins on the device. cl_mem d_binSizes = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * sizeof(volatile int), NULL, &ret); cl_mem d_bins = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * maxParticles * sizeof(particle_t), NULL, &ret); // SETTING ARGUMENTS FOR THE KERNELS // Set arguments for the init / clear kernel ret = clSetKernelArg(binInitKernel, 0, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(binInitKernel, 1, sizeof(int), &numberOfBins); // Set arguments for the binning kernel ret = clSetKernelArg(binKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(binKernel, 1, sizeof(int), &n); ret = clSetKernelArg(binKernel, 2, sizeof(cl_mem), (void *)&d_bins); ret = clSetKernelArg(binKernel, 3, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(binKernel, 4, sizeof(int), &numberOfBins); // Set arguments for force kernel. ret = clSetKernelArg(forceKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(forceKernel, 1, sizeof(int), &n); ret = clSetKernelArg(forceKernel, 2, sizeof(cl_mem), (void *)&d_bins); ret = clSetKernelArg(forceKernel, 3, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(forceKernel, 4, sizeof(int), &numberOfBins); // Set arguments for move kernel ret = clSetKernelArg(moveKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(moveKernel, 1, sizeof(int), &n); ret = clSetKernelArg(moveKernel, 2, sizeof(double), &size); // Variable to check if kernel execution is done. 
cl_event kernelDone; double simulation_time = read_timer(); int step = 0; for (step = 0; step < NSTEPS; step++) { // Execute bin initialization (clearing after first iteration) ret = clEnqueueNDRangeKernel(commandQueue, binInitKernel, 1, NULL, &clearAmt, NULL, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute binning kernel ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); // ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute force kernel ret = clEnqueueNDRangeKernel(commandQueue, forceKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute move kernel ret = clEnqueueNDRangeKernel(commandQueue, moveKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); if (fsave && (step%SAVEFREQ) == 0) { // Copy the particles back to the CPU ret = clEnqueueReadBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); save(fsave, n, particles); } } simulation_time = read_timer() - simulation_time; printf("CPU-GPU copy time = %g seconds\n", copy_time); printf("n = %d, simulation time = %g seconds\n", n, simulation_time); if (fsum) fprintf(fsum, "%d %lf \n", n, simulation_time); if (fsum) fclose(fsum); free(particles); if (fsave) fclose(fsave); ret = clFlush(commandQueue); ret = clFinish(commandQueue); ret = clReleaseCommandQueue(commandQueue); ret = clReleaseKernel(forceKernel); ret = clReleaseKernel(moveKernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(d_particles); ret = clReleaseContext(context); return 0; }
void WorkScheduler::initialize(bool use_opencl, int num_cpu_threads) { /* initialize highlighting */ if (!g_highlightInitialized) { if (g_highlightedNodesRead) MEM_freeN(g_highlightedNodesRead); if (g_highlightedNodes) MEM_freeN(g_highlightedNodes); g_highlightedNodesRead = NULL; g_highlightedNodes = NULL; COM_startReadHighlights(); g_highlightInitialized = true; } #if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE /* deinitialize if number of threads doesn't match */ if (g_cpudevices.size() != num_cpu_threads) { Device *device; while (g_cpudevices.size() > 0) { device = g_cpudevices.back(); g_cpudevices.pop_back(); device->deinitialize(); delete device; } g_cpuInitialized = false; } /* initialize CPU threads */ if (!g_cpuInitialized) { for (int index = 0; index < num_cpu_threads; index++) { CPUDevice *device = new CPUDevice(); device->initialize(); g_cpudevices.push_back(device); } g_cpuInitialized = true; } #ifdef COM_OPENCL_ENABLED /* deinitialize OpenCL GPU's */ if (use_opencl && !g_openclInitialized) { g_context = NULL; g_program = NULL; if (clewInit() != CLEW_SUCCESS) /* this will check for errors and skip if already initialized */ return; if (clCreateContextFromType) { cl_uint numberOfPlatforms = 0; cl_int error; error = clGetPlatformIDs(0, 0, &numberOfPlatforms); if (error == -1001) { } /* GPU not supported */ else if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } if (G.f & G_DEBUG) printf("%u number of platforms\n", numberOfPlatforms); cl_platform_id *platforms = (cl_platform_id *)MEM_mallocN(sizeof(cl_platform_id) * numberOfPlatforms, __func__); error = clGetPlatformIDs(numberOfPlatforms, platforms, 0); unsigned int indexPlatform; for (indexPlatform = 0; indexPlatform < numberOfPlatforms; indexPlatform++) { cl_platform_id platform = platforms[indexPlatform]; cl_uint numberOfDevices = 0; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, 0, &numberOfDevices); if (numberOfDevices <= 0) continue; cl_device_id *cldevices = 
(cl_device_id *)MEM_mallocN(sizeof(cl_device_id) * numberOfDevices, __func__); clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numberOfDevices, cldevices, 0); g_context = clCreateContext(NULL, numberOfDevices, cldevices, clContextError, NULL, &error); if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } const char *cl_str[2] = {datatoc_COM_OpenCLKernels_cl, NULL}; g_program = clCreateProgramWithSource(g_context, 1, cl_str, 0, &error); error = clBuildProgram(g_program, numberOfDevices, cldevices, 0, 0, 0); if (error != CL_SUCCESS) { cl_int error2; size_t ret_val_size = 0; printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } char *build_log = (char *)MEM_mallocN(sizeof(char) * ret_val_size + 1, __func__); error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } build_log[ret_val_size] = '\0'; printf("%s", build_log); MEM_freeN(build_log); } else { unsigned int indexDevices; for (indexDevices = 0; indexDevices < numberOfDevices; indexDevices++) { cl_device_id device = cldevices[indexDevices]; cl_int vendorID = 0; cl_int error2 = clGetDeviceInfo(device, CL_DEVICE_VENDOR_ID, sizeof(cl_int), &vendorID, NULL); if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error2, clewErrorString(error2)); } OpenCLDevice *clDevice = new OpenCLDevice(g_context, device, g_program, vendorID); clDevice->initialize(); g_gpudevices.push_back(clDevice); } } MEM_freeN(cldevices); } MEM_freeN(platforms); } g_openclInitialized = true; } #endif #endif }
/* Report a failed OpenCL / clFFT call on stderr.
 * Returns non-zero when err is not CL_SUCCESS, zero otherwise. */
static int fft3d_failed(cl_int err, const char *what)
{
    if (err == CL_SUCCESS)
        return 0;
    fprintf(stderr, "Error: %s failed (code %d)\n", what, (int)err);
    return 1;
}

/*
 * Runs a single-precision, in-place, forward complex 3D FFT of size
 * N0 x N1 x N2 with clFFT on the default device of the first platform.
 * The input is zero everywhere except element (0,0,0) = (0.5, 0.5); input and
 * output are printed. Returns 0 on success and 1 when any step fails; every
 * resource acquired up to the failure is released before returning.
 */
int main( void )
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX = NULL;
    float *X = NULL;
    int ret = 1;            /* stays 1 until the whole run succeeded */
    int fft_ready = 0;      /* clfftSetup succeeded, teardown required */
    int plan_ready = 0;     /* planHandle is valid, destroy required */
    const size_t N0 = 4, N1 = 4, N2 = 4;
    /* interleaved complex: two floats per element */
    const size_t buffer_size = N0 * N1 * N2 * 2 * sizeof(float);
    size_t ret_param_size = 0;
    size_t i, j, k;

    char platform_name[128];
    char device_name[128];

    /* FFT library related declarations */
    clfftPlanHandle planHandle = 0;
    clfftDim dim = CLFFT_3D;
    /* The data below is stored with k (extent N2) contiguous, then j, then i.
     * clFFT's default strides treat lengths[0] as the contiguous dimension,
     * so the lengths are listed fastest-first. */
    size_t clLengths[3] = {N2, N1, N0};
    clfftSetupData fftSetup;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs( 1, &platform, NULL );
    if (fft3d_failed(err, "clGetPlatformIDs"))
        goto cleanup;

    /* The names are informational only, a failed query is not fatal. */
    err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, &ret_param_size);
    printf("Platform found: %s\n", (err == CL_SUCCESS) ? platform_name : "<unknown>");

    err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL );
    if (fft3d_failed(err, "clGetDeviceIDs"))
        goto cleanup;

    err = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, &ret_param_size);
    printf("Device found on the above platform: %s\n", (err == CL_SUCCESS) ? device_name : "<unknown>");

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
    if (fft3d_failed(err, "clCreateContext"))
        goto cleanup;

    queue = clCreateCommandQueue( ctx, device, 0, &err );
    if (fft3d_failed(err, "clCreateCommandQueue"))
        goto cleanup;

    /* Setup clFFT. */
    err = clfftInitSetupData(&fftSetup);
    if (fft3d_failed(err, "clfftInitSetupData"))
        goto cleanup;
    err = clfftSetup(&fftSetup);
    if (fft3d_failed(err, "clfftSetup"))
        goto cleanup;
    fft_ready = 1;

    /* Allocate host & initialize data. */
    X = (float *)malloc(buffer_size);
    if (X == NULL) {
        fprintf(stderr, "Error: out of host memory\n");
        goto cleanup;
    }

    /* print input array just using the
     * indices to fill the array with data */
    printf("\nPerforming fft on an three dimensional array of size N0 x N1 x N2 : %zu x %zu x %zu\n", N0, N1, N2);
    for (i = 0; i < N0; ++i) {
        for (j = 0; j < N1; ++j) {
            for (k = 0; k < N2; ++k) {
                /* row-major element (i, j, k), two floats per element */
                const size_t idx = 2 * (k + N2 * (j + N1 * i));
                float x = 0.0f;
                float y = 0.0f;
                if (i == 0 && j == 0 && k == 0) {
                    x = y = 0.5f;
                }
                X[idx] = x;
                X[idx + 1] = y;
                printf("(%f, %f) ", X[idx], X[idx + 1]);
            }
            printf("\n");
        }
        printf("\n");
    }

    /* Prepare OpenCL memory objects and place data inside them. */
    bufX = clCreateBuffer( ctx, CL_MEM_READ_WRITE, buffer_size, NULL, &err );
    if (fft3d_failed(err, "clCreateBuffer"))
        goto cleanup;

    err = clEnqueueWriteBuffer( queue, bufX, CL_TRUE, 0, buffer_size, X, 0, NULL, NULL );
    if (fft3d_failed(err, "clEnqueueWriteBuffer"))
        goto cleanup;

    /* Create a default plan for a complex FFT. */
    err = clfftCreateDefaultPlan(&planHandle, ctx, dim, clLengths);
    if (fft3d_failed(err, "clfftCreateDefaultPlan"))
        goto cleanup;
    plan_ready = 1;

    /* Set plan parameters. */
    err = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE);
    if (fft3d_failed(err, "clfftSetPlanPrecision"))
        goto cleanup;
    err = clfftSetLayout(planHandle, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED);
    if (fft3d_failed(err, "clfftSetLayout"))
        goto cleanup;
    err = clfftSetResultLocation(planHandle, CLFFT_INPLACE);
    if (fft3d_failed(err, "clfftSetResultLocation"))
        goto cleanup;

    /* Bake the plan. */
    err = clfftBakePlan(planHandle, 1, &queue, NULL, NULL);
    if (fft3d_failed(err, "clfftBakePlan"))
        goto cleanup;

    /* Execute the plan. */
    err = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &queue, 0, NULL, NULL, &bufX, NULL, NULL);
    if (fft3d_failed(err, "clfftEnqueueTransform"))
        goto cleanup;

    /* Wait for calculations to be finished. */
    err = clFinish(queue);
    if (fft3d_failed(err, "clFinish"))
        goto cleanup;

    /* Fetch results of calculations. */
    err = clEnqueueReadBuffer( queue, bufX, CL_TRUE, 0, buffer_size, X, 0, NULL, NULL );
    if (fft3d_failed(err, "clEnqueueReadBuffer"))
        goto cleanup;

    /* print output array */
    printf("\n\nfft result: \n");
    for (i = 0; i < N0; ++i) {
        for (j = 0; j < N1; ++j) {
            for (k = 0; k < N2; ++k) {
                const size_t idx = 2 * (k + N2 * (j + N1 * i));
                printf("(%f, %f) ", X[idx], X[idx + 1]);
            }
            printf("\n");
        }
        printf("\n");
    }
    printf("\n");

    ret = 0;

cleanup:
    /* Release OpenCL memory objects. */
    if (bufX != NULL)
        clReleaseMemObject( bufX );
    free(X);

    /* Release the plan. */
    if (plan_ready)
        clfftDestroyPlan( &planHandle );

    /* Release clFFT library. */
    if (fft_ready)
        clfftTeardown( );

    /* Release OpenCL working objects. */
    if (queue != 0)
        clReleaseCommandQueue( queue );
    if (ctx != 0)
        clReleaseContext( ctx );

    return ret;
}
/*
 * construct_opencl_device_info
 *
 * It collects properties of all the OpenCL devices. It shall be called once
 * by the OpenCL management worker process, prior to any other backends.
 *
 * Every platform is enumerated and logged together with its CPU / GPU /
 * accelerator devices. One platform is then selected: the one whose index
 * equals platform_index, or, if platform_index is negative, the one with the
 * highest rough computing-power score (compute units x clock, GPUs weighted
 * x32). For the selected platform the globals opencl_platform_id,
 * opencl_num_devices and opencl_devices[] are filled, and an OpenCL context
 * plus one out-of-order, profiling-enabled command queue per device are
 * created.
 *
 * Returns the list of pgstrom_device_info of the selected platform, or NIL
 * if no platform was selected (or it has no device).
 */
static List *
construct_opencl_device_info(int platform_index)
{
	cl_platform_id	platforms[32];
	cl_device_id	devices[MAX_NUM_DEVICES];
	cl_uint			n_platform;
	cl_uint			n_devices;
	cl_int			i, j, rc;
	long			score_max = -1;
	List		   *result = NIL;

	rc = clGetPlatformIDs(lengthof(platforms),
						  platforms,
						  &n_platform);
	if (rc != CL_SUCCESS)
		elog(ERROR, "clGetPlatformIDs failed (%s)", opencl_strerror(rc));

	/*
	 * n_platform is the number of platforms *available*, which may be larger
	 * than the number of entries actually stored in platforms[].
	 */
	if (n_platform > lengthof(platforms))
		n_platform = lengthof(platforms);

	for (i=0; i < n_platform; i++)
	{
		pgstrom_platform_info  *pl_info;
		pgstrom_device_info	   *dev_info;
		long		score = 0;
		List	   *temp = NIL;

		pl_info = collect_opencl_platform_info(platforms[i]);
		pl_info->pl_index = i;

		rc = clGetDeviceIDs(platforms[i],
							CL_DEVICE_TYPE_CPU |
							CL_DEVICE_TYPE_GPU |
							CL_DEVICE_TYPE_ACCELERATOR,
							lengthof(devices),
							devices,
							&n_devices);
		if (rc != CL_SUCCESS)
			elog(ERROR, "clGetDeviceIDs failed (%s)", opencl_strerror(rc));

		/* same story as above; never walk past the end of devices[] */
		if (n_devices > lengthof(devices))
			n_devices = lengthof(devices);

		elog(LOG, "PG-Strom: [%d] OpenCL Platform: %s", i, pl_info->pl_name);

		for (j=0; j < n_devices; j++)
		{
			dev_info = collect_opencl_device_info(devices[j]);
			dev_info->pl_info = pl_info;
			dev_info->dev_index = j;

			elog(LOG, "PG-Strom:  + device %s (%uMHz x %uunits, %luMB)",
				 dev_info->dev_name,
				 dev_info->dev_max_clock_frequency,
				 dev_info->dev_max_compute_units,
				 (unsigned long)(dev_info->dev_global_mem_size >> 20));

			/* rough estimation about computing power */
			if ((dev_info->dev_type & CL_DEVICE_TYPE_GPU) != 0)
				score += 32 * (dev_info->dev_max_compute_units *
							   dev_info->dev_max_clock_frequency);
			else
				score += (dev_info->dev_max_compute_units *
						  dev_info->dev_max_clock_frequency);

			temp = lappend(temp, dev_info);
		}

		if (platform_index == i || (platform_index < 0 && score > score_max))
		{
			opencl_platform_id = platforms[i];
			opencl_num_devices = n_devices;
			for (j=0; j < n_devices; j++)
				opencl_devices[j] = devices[j];

			score_max = score;
			result = temp;
		}
	}

	/* show platform name if auto-selection */
	if (platform_index < 0 && result != NIL)
	{
		pgstrom_platform_info *pl_info
			= ((pgstrom_device_info *) linitial(result))->pl_info;

		elog(LOG, "PG-Strom: auto platform selection: %s", pl_info->pl_name);
	}

	if (result != NIL)
	{
		/*
		 * Create an OpenCL context
		 */
		opencl_context = clCreateContext(NULL,
										 opencl_num_devices,
										 opencl_devices,
										 NULL,
										 NULL,
										 &rc);
		if (rc != CL_SUCCESS)
			elog(ERROR, "clCreateContext failed: %s", opencl_strerror(rc));

		/*
		 * Create an OpenCL command queue for each device
		 */
		for (j=0; j < opencl_num_devices; j++)
		{
			opencl_cmdq[j] =
				clCreateCommandQueue(opencl_context,
									 opencl_devices[j],
									 CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
									 CL_QUEUE_PROFILING_ENABLE,
									 &rc);
			if (rc != CL_SUCCESS)
				elog(ERROR, "clCreateCommandQueue failed: %s",
					 opencl_strerror(rc));
		}
	}
	return result;
}
/*
 * Time-steps two coupled fields u and v on an Nx x Ny grid on an OpenCL
 * device. Each step applies the kernels nonlinearpart_a, nonlinearpart_b,
 * then linearpart in Fourier space (between a forward and a backward 2D FFT),
 * then nonlinearpart_b and nonlinearpart_a again. Every plotgap steps the
 * fields are written out and the values returned by writeimage are recorded
 * in umax / vmax.
 *
 * Run parameters come from parainit(). Platform 0 / device 0 is used.
 * Returns 0 on completion; exits with status 1 on setup failures.
 */
int main(void)
{
    //time measuring
    struct timeval tvs;

    //variables
    int Nx=1024;
    int Ny=1024;
    int plotnum=0;
    int Tmax=2;
    int plottime=0;
    int plotgap=1;
    double Lx=1.0;
    double Ly=1.0;
    double dt=0.0;
    double A=0.0;
    double B=0.0;
    double Du=0.0;
    double Dv=0.0;
    //splitting coefficients
    double a=0.5;
    double b=0.5;
    double c=1.0;
    //loop counters
    int i=0;
    int j=0;
    int n=0;
    double*umax=NULL;
    double*vmax=NULL;

    parainit(&Nx,&Ny,&Tmax,&plotgap,&Lx,&Ly,&dt,&Du,&Dv,&A,&B);
    plottime=plotgap;
    vmax=(double*)malloc((Tmax/plotgap+1)*sizeof(double));
    umax=(double*)malloc((Tmax/plotgap+1)*sizeof(double));
    if(umax==NULL || vmax==NULL){printf("malloc of umax/vmax failed\n"); exit(1); }

    //openCL variables
    cl_platform_id *platform_id = NULL;
    cl_kernel frequencies = NULL, initialdata = NULL, linearpart=NULL;
    cl_kernel nonlinearpart_a=NULL, nonlinearpart_b=NULL;
    cl_int ret;
    cl_uint num_platforms = 0;

    // Detect how many platforms there are.
    ret = clGetPlatformIDs(0, NULL, &num_platforms);
    if(ret!=CL_SUCCESS || num_platforms==0){printf("clGetPlatformIDs ret:%d, platforms:%u\n",ret,num_platforms); exit(1); }
    // Allocate enough space for the number of platforms.
    platform_id = (cl_platform_id*) malloc(num_platforms*sizeof(cl_platform_id));
    if(platform_id==NULL){printf("malloc of platform list failed\n"); exit(1); }
    // Store the platforms
    ret = clGetPlatformIDs(num_platforms, platform_id, NULL);
    if(ret!=CL_SUCCESS){printf("clGetPlatformIDs ret:%d\n",ret); exit(1); }
    printf("Found %u platform(s)!\n",num_platforms);

    cl_uint *num_devices;
    num_devices=(cl_uint*) malloc(num_platforms*sizeof(cl_uint));
    cl_device_id **device_id = NULL;
    device_id =(cl_device_id**) malloc(num_platforms*sizeof(cl_device_id*));
    if(num_devices==NULL || device_id==NULL){printf("malloc of device tables failed\n"); exit(1); }

    // Detect number of devices in the platforms
    for(i=0;i<(int)num_platforms;i++){
        char buf[65536];
        size_t size;
        ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_VERSION,sizeof(buf),buf,&size);
        printf("%s\n",buf);
        // the count belongs to platform i, not to platform 0
        num_devices[i]=0;
        ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,0,NULL,&num_devices[i]);
        printf("Found %u device(s) on platform %d!\n", num_devices[i],i);
        ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_NAME,sizeof(buf),buf,&size);
        printf("%s ",buf);
        if(num_devices[i]==0){
            // nothing to store; keeps the free() loop at the end valid
            device_id[i]=NULL;
            continue;
        }
        // Store numDevices from platform
        device_id[i]=(cl_device_id*) malloc(num_devices[i]*sizeof(cl_device_id));
        if(device_id[i]==NULL){printf("malloc of device list failed\n"); exit(1); }
        ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,num_devices[i],device_id[i],NULL);
        for(j=0;j<(int)num_devices[i];j++){
            ret = clGetDeviceInfo(device_id[i][j],CL_DEVICE_NAME,sizeof(buf),buf,&size);
            printf("%s (%d,%d)\n",buf,i,j);
        }
    }

    //create context and command_queue
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    //Which platform and device do i choose?
    int chooseplatform=0;
    int choosedevice=0;
    printf("Choose platform %d and device %d!\n",chooseplatform,choosedevice);
    if(num_devices[chooseplatform]<=(cl_uint)choosedevice){
        printf("platform %d has no device %d\n",chooseplatform,choosedevice);
        exit(1);
    }
    context = clCreateContext( NULL, num_devices[chooseplatform], device_id[chooseplatform], NULL, NULL, &ret);
    if(ret!=CL_SUCCESS){printf("createContext ret:%d\n",ret); exit(1); }
    command_queue = clCreateCommandQueue(context, device_id[chooseplatform][choosedevice], 0, &ret);
    if(ret!=CL_SUCCESS){printf("createCommandQueue ret:%d\n",ret); exit(1); }

    //OpenCL arrays
    cl_mem cl_u = NULL,cl_v = NULL;
    cl_mem cl_uhat = NULL, cl_vhat = NULL;
    cl_mem cl_kx = NULL, cl_ky = NULL;

    //FFT
    clfftPlanHandle planHandle;
    cl_mem tmpBuffer = NULL;
    fftinit(&planHandle,&context, &command_queue, &tmpBuffer, Nx, Ny);

    //allocate gpu memory
    cl_u=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret);
    cl_v=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret);
    cl_uhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret);
    cl_vhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret);
    cl_kx = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(double), NULL, &ret);
    cl_ky = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(double), NULL, &ret);

    printf("allocated space\n");
    //load the kernels
    loadKernel(&frequencies,&context,&device_id[chooseplatform][choosedevice],"frequencies");
    loadKernel(&initialdata,&context,&device_id[chooseplatform][choosedevice],"initialdata");
    loadKernel(&linearpart,&context,&device_id[chooseplatform][choosedevice],"linearpart");
    loadKernel(&nonlinearpart_a,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_a");
    loadKernel(&nonlinearpart_b,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_b");

    // explicit size_t arithmetic: no int overflow, no narrowing in the initializer
    size_t global_work_size[1] = {(size_t)Nx*(size_t)Ny};
    size_t global_work_size_X[1] = {(size_t)Nx};
    size_t global_work_size_Y[1] = {(size_t)Ny};

    //frequencies: the same kernel is run once per axis
    ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_kx);
    ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Lx);
    ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Nx);
    ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_X, NULL, 0, NULL, NULL);
    ret = clFinish(command_queue);
    ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_ky);
    ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Ly);
    ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Ny);
    ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_Y, NULL, 0, NULL, NULL);
    ret = clFinish(command_queue);
    //printCL(&cl_kx,&command_queue,Nx,1);
    //printCL(&cl_ky,&command_queue,1,Ny);

    //initial data
    ret = clSetKernelArg(initialdata, 0, sizeof(cl_mem),(void *)&cl_u);
    ret = clSetKernelArg(initialdata, 1, sizeof(cl_mem),(void* )&cl_v);
    ret = clSetKernelArg(initialdata, 2, sizeof(int),(void* )&Nx);
    ret = clSetKernelArg(initialdata, 3, sizeof(int),(void* )&Ny);
    ret = clSetKernelArg(initialdata, 4, sizeof(double),(void* )&Lx);
    ret = clSetKernelArg(initialdata, 5, sizeof(double),(void* )&Ly);
    ret = clEnqueueNDRangeKernel(command_queue, initialdata, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
    ret = clFinish(command_queue);

    //make output
    writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
    writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
    umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
    vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v");

    // None of the time-stepping kernel arguments change between steps and
    // argument values stay bound to a kernel until they are set again, so
    // they are set once here instead of on every iteration.
    //nonlinearpart_a
    ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u);
    ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v);
    ret = clSetKernelArg(nonlinearpart_a, 2, sizeof(double),(void* )&A);
    ret = clSetKernelArg(nonlinearpart_a, 3, sizeof(double),(void* )&dt);
    ret = clSetKernelArg(nonlinearpart_a, 4, sizeof(double),(void* )&a);
    //nonlinearpart_b
    ret = clSetKernelArg(nonlinearpart_b, 0, sizeof(cl_mem),(void *)&cl_u);
    ret = clSetKernelArg(nonlinearpart_b, 1, sizeof(cl_mem),(void* )&cl_v);
    ret = clSetKernelArg(nonlinearpart_b, 2, sizeof(double),(void* )&A);
    ret = clSetKernelArg(nonlinearpart_b, 3, sizeof(double),(void* )&dt);
    ret = clSetKernelArg(nonlinearpart_b, 4, sizeof(double),(void* )&b);
    //linear
    ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat);
    ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_vhat);
    ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void* )&cl_kx);
    ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void* )&cl_ky);
    ret = clSetKernelArg(linearpart, 4, sizeof(double),(void* )&Du);
    ret = clSetKernelArg(linearpart, 5, sizeof(double),(void* )&Dv);
    ret = clSetKernelArg(linearpart, 6, sizeof(double),(void* )&A);
    ret = clSetKernelArg(linearpart, 7, sizeof(double),(void* )&B);
    ret = clSetKernelArg(linearpart, 8, sizeof(double),(void* )&dt);
    ret = clSetKernelArg(linearpart, 9, sizeof(double),(void* )&c);
    ret = clSetKernelArg(linearpart, 10, sizeof(int),(void* )&Nx);
    ret = clSetKernelArg(linearpart, 11, sizeof(int),(void* )&Ny);

    printf("Got initial data, starting timestepping\n");
    mtime_s(&tvs);

    for(n=0;n<=Tmax;n++){
        //nonlinearpart_a
        ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
        ret = clFinish(command_queue);

        //nonlinearpart_b
        ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
        ret = clFinish(command_queue);

        //linear
        fft2dfor(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer);
        fft2dfor(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer);
        //printf("A%f,B%f\n",A,B);
        ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
        ret = clFinish(command_queue);
        fft2dback(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer);
        fft2dback(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer);

        //nonlinearpart_b
        ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
        ret = clFinish(command_queue);

        //nonlinearpart_a
        ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
        ret = clFinish(command_queue);

        // done
        if(n==plottime){
            printf("time:%f, step:%d,%d,umax:%f,vmax:%f\n",n*dt,n,plotnum,umax[plotnum],vmax[plotnum]);
            plottime=plottime+plotgap;
            plotnum=plotnum+1;
            writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
            writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
            umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u");
            vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v");
        }
    }//end timestepping

    printf("Finished time stepping\n");
    mtime_e(&tvs,"Programm took:");
    writearray(umax,(Tmax/plotgap)+1,"u");
    writearray(vmax,(Tmax/plotgap)+1,"v");
    free(umax);
    free(vmax);

    clReleaseMemObject(cl_u);
    clReleaseMemObject(cl_v);
    clReleaseMemObject(cl_uhat);
    clReleaseMemObject(cl_vhat);
    clReleaseMemObject(cl_kx);
    clReleaseMemObject(cl_ky);

    ret = clReleaseKernel(initialdata);
    ret = clReleaseKernel(frequencies);
    ret = clReleaseKernel(linearpart);
    ret = clReleaseKernel(nonlinearpart_a);
    ret = clReleaseKernel(nonlinearpart_b);

    fftdestroy(&planHandle, &tmpBuffer);

    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

    for(i=0;i<(int)num_platforms;i++){free(device_id[i]);}
    free(device_id);
    free(platform_id);
    free(num_devices);
    printf("Program execution complete\n");

    return 0;
}
/*
 * Build an xcl_world (platform, device, context, command queue) for a single
 * OpenCL device.
 *
 * device_type    device type used when the device is picked by type.
 * target_vendor  if not NULL, the platform whose CL_PLATFORM_VENDOR string
 *                equals it is selected; otherwise the first platform that
 *                offers a device of device_type is selected.
 * target_device  if not NULL, the accelerator device of the selected platform
 *                whose CL_DEVICE_NAME equals it is selected; otherwise the
 *                first device of device_type on the selected platform is used.
 *
 * The command queue is created with profiling enabled. Any failure prints a
 * message followed by "Test failed" and terminates the process with
 * EXIT_FAILURE, so the function only returns a fully populated world.
 */
xcl_world xcl_world_single(cl_device_type device_type, char *target_vendor, char *target_device)
{
    int err;
    xcl_world world;
    cl_uint num_platforms = 0;
    cl_uint i;
    char cl_platform_vendor[1001];

    err = clGetPlatformIDs(0, NULL, &num_platforms);
    if (err != CL_SUCCESS || num_platforms == 0) {
        printf("Error: no platforms available or OpenCL install broken\n");
        printf("Test failed\n");
        exit(EXIT_FAILURE);
    }

    cl_platform_id *platform_ids = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms);
    if (platform_ids == NULL) {
        printf("Error: Out of Memory\n");
        printf("Test failed\n");
        exit(EXIT_FAILURE);
    }

    err = clGetPlatformIDs(num_platforms, platform_ids, NULL);
    if (err != CL_SUCCESS) {
        printf("Error: Failed to find an OpenCL platform!\n");
        printf("Test failed\n");
        exit(EXIT_FAILURE);
    }

    if (target_vendor != NULL) {
        /* find target vendor if target_vendor is specified */
        for (i = 0; i < num_platforms; i++) {
            err = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_VENDOR, 1000, (void *)cl_platform_vendor, NULL);
            if (err != CL_SUCCESS) {
                printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
                printf("Test failed\n");
                exit(EXIT_FAILURE);
            }
            if (strcmp(cl_platform_vendor, target_vendor) == 0) {
                printf("INFO: Selected platform %u from %s\n", i, cl_platform_vendor);
                world.platform_id = platform_ids[i];
                break;
            }
        }
    } else {
        /* no vendor requested: take the first platform with a matching device */
        for (i = 0; i < num_platforms; i++) {
            err = clGetDeviceIDs(platform_ids[i], device_type, 1, &world.device_id, NULL);
            if (err == CL_SUCCESS) {
                world.platform_id = platform_ids[i];
                break;
            }
        }
    }
    free(platform_ids);

    /* the loops above only run to completion when nothing matched */
    if (i == num_platforms) {
        printf("Error: Failed to find a platform\n");
        printf("Test failed\n");
        exit(EXIT_FAILURE);
    }

    if (target_device != NULL) {
        /* find target device */
        cl_device_id devices[16];  // compute device id
        cl_uint num_devices = 0;
        char cl_device_name[100];

        err = clGetDeviceIDs(world.platform_id, CL_DEVICE_TYPE_ACCELERATOR, 16, devices, &num_devices);
        if (err != CL_SUCCESS) {
            printf("Error: Failed to create a device group!\n");
            printf("Test failed\n");
            exit(EXIT_FAILURE);
        }
        /* num_devices counts the devices available, which may be more than
         * the 16 entries that were actually written to devices[] */
        if (num_devices > 16) {
            num_devices = 16;
        }

        /* iterate all devices to select the target device. */
        for (i = 0; i < num_devices; i++) {
            err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 100, cl_device_name, 0);
            if (err != CL_SUCCESS) {
                printf("Error: Failed to get device name for device %u!\n", i);
                printf("Test failed\n");
                exit(EXIT_FAILURE);
            }
            //printf("CL_DEVICE_NAME %s\n", cl_device_name);
            if (strcmp(cl_device_name, target_device) == 0) {
                world.device_id = devices[i];
                printf("INFO: Selected %s as the target device\n", cl_device_name);
                break;
            }
        }

        if (i == num_devices) {
            printf("Error: Failed to find target device %s\n", target_device);
            printf("Test failed\n");
            exit(EXIT_FAILURE);
        }
    } else if (target_vendor != NULL) {
        /* The platform was chosen by vendor name only, so no device has been
         * stored in world.device_id yet: take the first one of device_type. */
        err = clGetDeviceIDs(world.platform_id, device_type, 1, &world.device_id, NULL);
        if (err != CL_SUCCESS) {
            printf("Error: Failed to create a device group!\n");
            printf("Test failed\n");
            exit(EXIT_FAILURE);
        }
    }

    world.context = clCreateContext(0, 1, &world.device_id, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("Error: Failed to create a compute context!\n");
        printf("Test failed\n");
        exit(EXIT_FAILURE);
    }

    world.command_queue = clCreateCommandQueue(world.context, world.device_id, CL_QUEUE_PROFILING_ENABLE, &err);
    if (err != CL_SUCCESS) {
        printf("Error: Failed to create a command queue!\n");
        printf("Test failed\n");
        exit(EXIT_FAILURE);
    }

    return world;
}
/* Report a failed OpenCL call on stderr.
 * Returns non-zero when err is not CL_SUCCESS, zero otherwise. */
static int pi_cl_failed(cl_int err, const char *what)
{
    if (err == CL_SUCCESS)
        return 0;
    fprintf(stderr, "Error: %s failed (code %d)\n", what, (int)err);
    return 1;
}

/*
 * Estimates pi on an OpenCL device: the "genrand" kernel from mt.cl fills a
 * buffer with num_rand random numbers per generator (one generator per entry
 * of mts), the "calc_pi" kernel produces one counter per generator, and the
 * host sums the counters. The profiling timestamps of the two kernels and of
 * the read-back are printed in milliseconds.
 *
 * Returns 0 on success and 1 when any setup, build or enqueue step fails;
 * everything acquired up to the failure is released.
 */
int main()
{
    cl_int num_rand = 4096*256; /* The number of random numbers generated using one generator */
    int i, num_generator = sizeof(mts)/sizeof(mts[0]); /* The number of generators */
    cl_ulong count_all;
    double pi;
    int exit_code = 1;          /* stays 1 until the whole run succeeded */
    cl_platform_id platform_id = NULL;
    cl_uint ret_num_platforms = 0;
    cl_device_id device_id = NULL;
    cl_uint ret_num_devices = 0;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_program program = NULL;
    cl_kernel kernel_mt = NULL, kernel_pi = NULL;
    size_t kernel_code_size = 0;
    char *kernel_src_str = NULL;
    cl_uint *result = NULL;
    cl_int ret;
    FILE *fp = NULL;
    cl_mem rand = NULL, count = NULL;
    size_t global_item_size[3], local_item_size[3];
    cl_mem dev_mts = NULL;
    cl_event ev_mt_end = NULL, ev_pi_end = NULL, ev_copy_end = NULL;
    cl_ulong prof_start = 0, prof_mt_end = 0, prof_pi_end = 0, prof_copy_end = 0;

    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    if (pi_cl_failed(ret, "clGetPlatformIDs"))
        goto cleanup;
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
    if (pi_cl_failed(ret, "clGetDeviceIDs"))
        goto cleanup;
    context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
    if (pi_cl_failed(ret, "clCreateContext"))
        goto cleanup;

    result = (cl_uint*)malloc(sizeof(cl_uint)*num_generator);
    kernel_src_str = (char*)malloc(MAX_SOURCE_SIZE);
    if (result == NULL || kernel_src_str == NULL) {
        fprintf(stderr, "Error: out of host memory\n");
        goto cleanup;
    }

    command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);
    if (pi_cl_failed(ret, "clCreateCommandQueue"))
        goto cleanup;

    fp = fopen("mt.cl", "r");
    if (fp == NULL) {
        fprintf(stderr, "Error: cannot open kernel source mt.cl\n");
        goto cleanup;
    }
    kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);

    /* Create output buffer */
    rand = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint)*num_rand*num_generator, NULL, &ret);
    if (pi_cl_failed(ret, "clCreateBuffer(rand)"))
        goto cleanup;
    count = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint)*num_generator, NULL, &ret);
    if (pi_cl_failed(ret, "clCreateBuffer(count)"))
        goto cleanup;

    /* Build Program */
    program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str, (const size_t *)&kernel_code_size, &ret);
    if (pi_cl_failed(ret, "clCreateProgramWithSource"))
        goto cleanup;
    ret = clBuildProgram(program, 1, &device_id, "", NULL, NULL);
    if (ret != CL_SUCCESS) {
        size_t log_size = 0;
        char *build_log;

        pi_cl_failed(ret, "clBuildProgram");
        /* Show the compiler output, it is the only useful diagnostic here. */
        if (clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) == CL_SUCCESS) {
            build_log = (char*)malloc(log_size + 1);
            if (build_log != NULL) {
                if (clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) == CL_SUCCESS) {
                    build_log[log_size] = '\0';
                    fprintf(stderr, "%s\n", build_log);
                }
                free(build_log);
            }
        }
        goto cleanup;
    }
    kernel_mt = clCreateKernel(program, "genrand", &ret);
    if (pi_cl_failed(ret, "clCreateKernel(genrand)"))
        goto cleanup;
    kernel_pi = clCreateKernel(program, "calc_pi", &ret);
    if (pi_cl_failed(ret, "clCreateKernel(calc_pi)"))
        goto cleanup;

    /* Create input parameter */
    dev_mts = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(mts), NULL, &ret);
    if (pi_cl_failed(ret, "clCreateBuffer(dev_mts)"))
        goto cleanup;
    ret = clEnqueueWriteBuffer(command_queue, dev_mts, CL_TRUE, 0, sizeof(mts), mts, 0, NULL, NULL);
    if (pi_cl_failed(ret, "clEnqueueWriteBuffer"))
        goto cleanup;

    /* Set Kernel Arguments */
    clSetKernelArg(kernel_mt, 0, sizeof(cl_mem), (void*)&rand); /* Random numbers (output of genrand) */
    clSetKernelArg(kernel_mt, 1, sizeof(cl_mem), (void*)&dev_mts); /* MT parameter (input to genrand) */
    clSetKernelArg(kernel_mt, 2, sizeof(num_rand), &num_rand); /* Number of random numbers to generate */

    clSetKernelArg(kernel_pi, 0, sizeof(cl_mem), (void*)&count); /* Counter for points within circle (output of calc_pi) */
    clSetKernelArg(kernel_pi, 1, sizeof(cl_mem), (void*)&rand); /* Random numbers (input to calc_pi) */
    clSetKernelArg(kernel_pi, 2, sizeof(num_rand), &num_rand); /* Number of random numbers used */

    global_item_size[0] = num_generator; global_item_size[1] = 1; global_item_size[2] = 1;
    local_item_size[0] = num_generator; local_item_size[1] = 1; local_item_size[2] = 1;

    /* Create a random number array */
    ret = clEnqueueNDRangeKernel(command_queue, kernel_mt, 1, NULL, global_item_size, local_item_size, 0, NULL, &ev_mt_end);
    if (pi_cl_failed(ret, "clEnqueueNDRangeKernel(genrand)"))
        goto cleanup;

    /* Compute PI */
    ret = clEnqueueNDRangeKernel(command_queue, kernel_pi, 1, NULL, global_item_size, local_item_size, 0, NULL, &ev_pi_end);
    if (pi_cl_failed(ret, "clEnqueueNDRangeKernel(calc_pi)"))
        goto cleanup;

    /* Get result */
    ret = clEnqueueReadBuffer(command_queue, count, CL_TRUE, 0, sizeof(cl_uint)*num_generator, result, 0, NULL, &ev_copy_end);
    if (pi_cl_failed(ret, "clEnqueueReadBuffer"))
        goto cleanup;

    /* Average the values of PI */
    count_all = 0;
    for (i=0; i < num_generator; i++) {
        count_all += result[i];
    }

    /* denominator is formed in double so it cannot overflow int */
    pi = ((double)count_all)/((double)num_rand * (double)num_generator) * 4;
    printf("pi = %f\n", pi);

    /* Get execution time info */
    clGetEventProfilingInfo(ev_mt_end, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &prof_start, NULL);
    clGetEventProfilingInfo(ev_mt_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_mt_end, NULL);
    clGetEventProfilingInfo(ev_pi_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_pi_end, NULL);
    clGetEventProfilingInfo(ev_copy_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_copy_end, NULL);

    printf(" mt: %f[ms]\n"
           " pi: %f[ms]\n"
           " copy: %f[ms]\n",
           (prof_mt_end - prof_start)/(1000000.0),
           (prof_pi_end - prof_mt_end)/(1000000.0),
           (prof_copy_end - prof_pi_end)/(1000000.0));

    exit_code = 0;

cleanup:
    if (ev_mt_end != NULL)
        clReleaseEvent(ev_mt_end);
    if (ev_pi_end != NULL)
        clReleaseEvent(ev_pi_end);
    if (ev_copy_end != NULL)
        clReleaseEvent(ev_copy_end);
    if (rand != NULL)
        clReleaseMemObject(rand);
    if (count != NULL)
        clReleaseMemObject(count);
    if (dev_mts != NULL)
        clReleaseMemObject(dev_mts);
    if (kernel_mt != NULL)
        clReleaseKernel(kernel_mt);
    if (kernel_pi != NULL)
        clReleaseKernel(kernel_pi);
    if (program != NULL)
        clReleaseProgram(program);
    if (command_queue != NULL)
        clReleaseCommandQueue(command_queue);
    if (context != NULL)
        clReleaseContext(context);
    free(kernel_src_str);
    free(result);

    return exit_code;
}
void print_clinfo () { char *s = NULL; size_t len; unsigned i, j; cl_uint platform_count; cl_platform_id *platforms; /* Determine number of OpenCL Platforms available. */ clGetPlatformIDs (0, NULL, &platform_count); printf ("number of OpenCL Platforms available:\t%d\n", platform_count); /* Get platforms. */ platforms = (cl_platform_id*) malloc (sizeof (cl_platform_id) * platform_count); if (platforms == NULL) { fprintf (stderr, "malloc failed\n"); exit (EXIT_FAILURE); } clGetPlatformIDs (platform_count, platforms, NULL); /* Querying platforms. */ for (i = 0; i < platform_count; i++) { cl_device_id *devices; cl_uint device_count; cl_device_id default_dev; printf (" OpenCL Platform: %d\n", i); #define PRINT_PF_INFO(PARM)\ clGetPlatformInfo (platforms[i], PARM, 0, NULL, &len); \ s = realloc (s, len); \ clGetPlatformInfo (platforms[i], PARM, len, s, NULL); \ printf (" %-36s%s\n", #PARM ":", s); PRINT_PF_INFO (CL_PLATFORM_PROFILE) PRINT_PF_INFO (CL_PLATFORM_VERSION) PRINT_PF_INFO (CL_PLATFORM_NAME) PRINT_PF_INFO (CL_PLATFORM_VENDOR) PRINT_PF_INFO (CL_PLATFORM_EXTENSIONS) #undef PRINT_PF_INFO clGetDeviceIDs (platforms[i], CL_DEVICE_TYPE_DEFAULT, 1, &default_dev, NULL); clGetDeviceInfo (default_dev, CL_DEVICE_NAME, 0, NULL, &len); s = realloc (s, len); clGetDeviceInfo (default_dev, CL_DEVICE_NAME, len, s, NULL); printf (" CL_DEVICE_TYPE_DEFAULT: %s\n", s); /* Determine number of devices. */ clGetDeviceIDs (platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &device_count); printf ("\n number of OpenCL Devices available: %d\n", device_count); /* Get devices. */ devices = (cl_device_id*) malloc (sizeof (cl_device_id) * device_count); if (devices == NULL) { fprintf (stderr, "malloc failed\n"); exit (EXIT_FAILURE); } clGetDeviceIDs (platforms[i], CL_DEVICE_TYPE_ALL, device_count, devices, NULL); /* Querying devices. 
*/ for (j = 0; j < device_count; j++) { cl_device_type dtype; cl_device_mem_cache_type mctype; cl_device_local_mem_type mtype; cl_device_fp_config fpcfg; cl_device_exec_capabilities xcap; cl_command_queue_properties qprops; cl_bool clbool; cl_uint cluint; cl_ulong clulong; size_t sizet; size_t workitem_size[3]; printf (" OpenCL Device: %d\n", j); #define PRINT_DEV_INFO(PARM)\ clGetDeviceInfo (devices[j], PARM, 0, NULL, &len); \ s = realloc (s, len); \ clGetDeviceInfo (devices[j], PARM, len, s, NULL); \ printf (" %-41s%s\n", #PARM ":", s); PRINT_DEV_INFO (CL_DEVICE_NAME) PRINT_DEV_INFO (CL_DRIVER_VERSION) PRINT_DEV_INFO (CL_DEVICE_VENDOR) clGetDeviceInfo (devices[j], CL_DEVICE_VENDOR_ID, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_VENDOR_ID: %d\n", cluint); clGetDeviceInfo (devices[j], CL_DEVICE_TYPE, sizeof (dtype), &dtype, NULL); if (dtype & CL_DEVICE_TYPE_CPU) printf (" CL_DEVICE_TYPE: CL_DEVICE_TYPE_CPU\n"); if (dtype & CL_DEVICE_TYPE_GPU) printf (" CL_DEVICE_TYPE: CL_DEVICE_TYPE_GPU\n"); if (dtype & CL_DEVICE_TYPE_ACCELERATOR) printf (" CL_DEVICE_TYPE: CL_DEVICE_TYPE_ACCELERATOR\n"); if (dtype & CL_DEVICE_TYPE_DEFAULT) printf (" CL_DEVICE_TYPE: CL_DEVICE_TYPE_DEFAULT\n"); clGetDeviceInfo (devices[j], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_MAX_CLOCK_FREQUENCY: %d\n", cluint); PRINT_DEV_INFO (CL_DEVICE_PROFILE) PRINT_DEV_INFO (CL_DEVICE_EXTENSIONS) clGetDeviceInfo (devices[j], CL_DEVICE_AVAILABLE, sizeof (clbool), &clbool, NULL); if (clbool == CL_TRUE) printf (" CL_DEVICE_AVAILABLE: CL_TRUE\n"); else printf (" CL_DEVICE_AVAILABLE: CL_FALSE\n"); clGetDeviceInfo (devices[j], CL_DEVICE_ENDIAN_LITTLE, sizeof (clbool), &clbool, NULL); if (clbool == CL_TRUE) printf (" CL_DEVICE_ENDIAN_LITTLE: CL_TRUE\n"); else printf (" CL_DEVICE_ENDIAN_LITTLE: CL_FALSE\n"); clGetDeviceInfo (devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_MAX_COMPUTE_UNITS: %d\n", cluint); 
clGetDeviceInfo (devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof (sizet), &sizet, NULL); printf (" CL_DEVICE_MAX_WORK_GROUP_SIZE: %d\n", sizet); clGetDeviceInfo (devices[j], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: %d\n", cluint); clGetDeviceInfo (devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof (workitem_size), &workitem_size, NULL); printf (" CL_DEVICE_MAX_WORK_ITEM_SIZES: %d / %d / %d\n", workitem_size[0], workitem_size[1], workitem_size[2]); clGetDeviceInfo (devices[j], CL_DEVICE_ADDRESS_BITS, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_ADDRESS_BITS: %d\n", cluint); clGetDeviceInfo (devices[j], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof (clulong), &clulong, NULL); printf (" CL_DEVICE_MAX_MEM_ALLOC_SIZE: %llu\n", clulong); clGetDeviceInfo (devices[j], CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_MEM_BASE_ADDR_ALIGN: %d\n", cluint); clGetDeviceInfo(devices[j], CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE: %d\n", cluint); clGetDeviceInfo(devices[j], CL_DEVICE_MAX_PARAMETER_SIZE, sizeof (sizet), &sizet, NULL); printf (" CL_DEVICE_MAX_PARAMETER_SIZE: %d\n", sizet); clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof (clulong), &clulong, NULL); printf (" CL_DEVICE_GLOBAL_MEM_SIZE: %llu\n", clulong); clGetDeviceInfo (devices[j], CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof (mctype), &mctype, NULL); if (mctype & CL_NONE) printf (" CL_DEVICE_GLOBAL_MEM_CACHE_TYPE: CL_NONE\n"); if (mctype & CL_READ_ONLY_CACHE) printf (" CL_DEVICE_GLOBAL_MEM_CACHE_TYPE: CL_READ_ONLY_CACHE\n"); if (mctype & CL_READ_WRITE_CACHE) printf (" CL_DEVICE_GLOBAL_MEM_CACHE_TYPE: CL_READ_WRITE_CACHE\n"); clGetDeviceInfo (devices[j], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof (clulong), &clulong, NULL); printf (" CL_DEVICE_GLOBAL_MEM_CACHE_SIZE: %llu\n", clulong); clGetDeviceInfo (devices[j], 
CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE: %d\n", cluint); clGetDeviceInfo (devices[j], CL_DEVICE_LOCAL_MEM_TYPE, sizeof (mtype), &mtype, NULL); if (mtype & CL_LOCAL) printf (" CL_DEVICE_LOCAL_MEM_TYPE: CL_LOCAL\n"); if (mtype & CL_GLOBAL) printf (" CL_DEVICE_LOCAL_MEM_TYPE: CL_GLOBAL\n"); clGetDeviceInfo (devices[j], CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE: %d\n", cluint); clGetDeviceInfo (devices[j], CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_MEM_BASE_ADDR_ALIGN: %d\n", cluint); clGetDeviceInfo (devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR: %d\n", cluint); clGetDeviceInfo (devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT: %d\n", cluint); clGetDeviceInfo (devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT: %d\n", cluint); clGetDeviceInfo (devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG: %d\n", cluint); clGetDeviceInfo (devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT: %d\n", cluint); clGetDeviceInfo (devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE: %d\n", cluint); clGetDeviceInfo (devices[j], CL_DEVICE_SINGLE_FP_CONFIG, sizeof (fpcfg), &fpcfg, NULL); if (fpcfg & CL_FP_DENORM) printf (" CL_DEVICE_SINGLE_FP_CONFIG: CL_FP_DENORM\n"); if (fpcfg & CL_FP_INF_NAN) printf (" CL_DEVICE_SINGLE_FP_CONFIG: CL_FP_INF_NAN\n"); if (fpcfg & CL_FP_ROUND_TO_NEAREST) printf (" 
CL_DEVICE_SINGLE_FP_CONFIG: CL_FP_ROUND_TO_NEAREST\n"); if (fpcfg & CL_FP_ROUND_TO_ZERO) printf (" CL_DEVICE_SINGLE_FP_CONFIG: CL_FP_ROUND_TO_ZERO\n"); clGetDeviceInfo (devices[j], CL_DEVICE_EXECUTION_CAPABILITIES, sizeof (xcap), &xcap, NULL); if (xcap & CL_EXEC_KERNEL ) printf (" CL_DEVICE_EXECUTION_CAPABILITIES: CL_EXEC_KERNEL\n"); if (xcap & CL_EXEC_NATIVE_KERNEL) printf (" CL_DEVICE_EXECUTION_CAPABILITIES: CL_EXEC_NATIVE_KERNEL\n"); clGetDeviceInfo (devices[j], CL_DEVICE_QUEUE_PROPERTIES, sizeof (qprops), &qprops, NULL); if (qprops & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) printf (" CL_DEVICE_QUEUE_PROPERTIES: CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE\n"); if (qprops & CL_QUEUE_PROFILING_ENABLE) printf (" CL_DEVICE_QUEUE_PROPERTIES: CL_QUEUE_PROFILING_ENABLE\n"); clGetDeviceInfo (devices[j], CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof (sizet), &sizet, NULL); printf (" CL_DEVICE_PROFILING_TIMER_RESOLUTION: %d\n", sizet); clGetDeviceInfo (devices[j], CL_DEVICE_COMPILER_AVAILABLE, sizeof (clbool), &clbool, NULL); if (clbool == CL_TRUE) printf (" CL_DEVICE_COMPILER_AVAILABLE: CL_TRUE\n"); else printf (" CL_DEVICE_COMPILER_AVAILABLE: CL_FALSE\n"); clGetDeviceInfo (devices[j], CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof (clbool), &clbool, NULL); if (clbool == CL_TRUE) printf (" CL_DEVICE_ERROR_CORRECTION_SUPPORT: CL_TRUE\n"); else printf (" CL_DEVICE_ERROR_CORRECTION_SUPPORT: CL_FALSE\n"); clGetDeviceInfo (devices[j], CL_DEVICE_IMAGE_SUPPORT, sizeof (clbool), &clbool, NULL); if (clbool == CL_FALSE) { printf (" CL_DEVICE_IMAGE_SUPPORT: CL_FALSE\n"); } else { printf (" CL_DEVICE_IMAGE_SUPPORT: CL_TRUE\n"); clGetDeviceInfo (devices[j], CL_DEVICE_MAX_SAMPLERS, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_MAX_SAMPLERS: %d\n", cluint); clGetDeviceInfo (devices[j], CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof (cluint), &cluint, NULL); printf (" CL_DEVICE_MAX_READ_IMAGE_ARGS: %d\n", cluint); clGetDeviceInfo (devices[j], CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof (cluint), &cluint, 
NULL); printf (" CL_DEVICE_MAX_WRITE_IMAGE_ARGS: %d\n", cluint); clGetDeviceInfo (devices[j], CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof (sizet), &sizet, NULL); printf (" CL_DEVICE_IMAGE2D_MAX_WIDTH: %d\n", sizet); clGetDeviceInfo (devices[j], CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof (sizet), &sizet, NULL); printf (" CL_DEVICE_IMAGE2D_MAX_HEIGHT: %d\n", sizet); clGetDeviceInfo (devices[j], CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof (sizet), &sizet, NULL); printf (" CL_DEVICE_IMAGE3D_MAX_WIDTH: %d\n", sizet); clGetDeviceInfo (devices[j], CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof (sizet), &sizet, NULL); printf (" CL_DEVICE_IMAGE3D_MAX_HEIGHT: %d\n", sizet); clGetDeviceInfo (devices[j], CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof (sizet), &sizet, NULL); printf (" CL_DEVICE_IMAGE3D_MAX_DEPTH: %d\n", sizet); } #undef PRINT_DEV_INFO } /* devices */ free (devices); } /* platforms */ free (s); free (platforms); }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program binary */ unsigned char *bin; size_t bin_len; cl_int bin_ret; /* Read program binary */ if (argc == 2) bin = read_buffer((char *)argv[1], &bin_len); else { printf("error: No binary specified\n"); exit(1); } /* Create a program */ cl_program program; 
program = clCreateProgramWithBinary(context, 1, &device, &bin_len, (const unsigned char **)&bin, &bin_ret, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithBinary' failed\n"); exit(1); } if (bin_ret != CL_SUCCESS) { printf("error: Invalid binary for device\n"); exit(1); } printf("program=%p\n", program); /* Free binary */ free(bin); printf("program binary loaded\n"); printf("\n"); ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object*/ cl_kernel kernel; kernel = clCreateKernel(program, "hypot_float4float4", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_float4 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_float4)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_float4){{2.0, 2.0, 2.0, 2.0}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_float4), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_float4), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create and init host side src buffer 1 */ cl_float4 *src_1_host_buffer; src_1_host_buffer = malloc(num_elem * 
sizeof(cl_float4)); for (int i = 0; i < num_elem; i++) src_1_host_buffer[i] = (cl_float4){{2.0, 2.0, 2.0, 2.0}}; /* Create and init device side src buffer 1 */ cl_mem src_1_device_buffer; src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_float4), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_float4), src_1_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_float4 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_float4)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_float4)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_float4), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_float4), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); 
} /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_float4)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 1 */ free(src_1_host_buffer); /* Free device side src buffer 1 */ ret = clReleaseMemObject(src_1_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
int main() { srand(unsigned(time(nullptr))); int err; // error code returned from api calls cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel // OpenCL device memory for matrices cl_mem d_A; cl_mem d_B; cl_mem d_C; // set seed for rand() srand(2014); //Allocate host memory for matrices A and B unsigned int size_A = WA * HA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A = (float*)malloc(mem_size_A); unsigned int size_B = WB * HB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B = (float*)malloc(mem_size_B); //Initialize host memory randomMemInit(h_A, size_A); randomMemInit(h_B, size_B); //Allocate host memory for the result C unsigned int size_C = WC * HC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*)malloc(mem_size_C); printf("Initializing OpenCL device...\n"); cl_uint dev_cnt = 0; clGetPlatformIDs(0, 0, &dev_cnt); cl_platform_id platform_ids[100]; clGetPlatformIDs(dev_cnt, platform_ids, NULL); // Connect to a compute device int gpu = 1; err = clGetDeviceIDs(platform_ids[0], gpu ? 
CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS){ printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context){ printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands){ printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source file char *KernelSource; long lFileSize = LoadOpenCLKernel("matrixmul_kernel.cl", &KernelSource); if (lFileSize < 0L){ perror("File read failed"); return 1; } //const char* KernelSource = loadKernelCPP(".\\matrixmul_kernel.cl"); program = clCreateProgramWithSource(context, 1, (const char **)&KernelSource, NULL, &err); if (!program){ printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS){ size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run kernel = clCreateKernel(program, "matrixMul", &err); if (!kernel || err != CL_SUCCESS){ printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation d_C = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_A, NULL, &err); d_A = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &err); d_B = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &err); if (!d_A || !d_B || !d_C){ printf("Error: Failed to allocate device memory!\n"); exit(1); } printf("Running matrix 
multiplication for matrices A (%dx%d) and B (%dx%d) ...\n", WA, HA, WB, HB); //Launch OpenCL kernel size_t localWorkSize[2], globalWorkSize[2]; int wA = WA; int wC = WC; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_C); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_A); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_B); err |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&wA); err |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&wC); if (err != CL_SUCCESS){ printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } localWorkSize[0] = 16; localWorkSize[1] = 16; globalWorkSize[0] = 1024; globalWorkSize[1] = 1024; err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); if (err != CL_SUCCESS){ printf("Error: Failed to execute kernel! %d\n", err); exit(1); } //Retrieve result from device err = clEnqueueReadBuffer(commands, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL); if (err != CL_SUCCESS){ printf("Error: Failed to read output array! %d\n", err); exit(1); } //print table A printf("\nMatrix A\n"); for (int i = 0; i < size_A; i++){ printf("%f\t", h_A[i]); if (((i + 1) % WA) == 0) printf("\n"); } //print table B printf("\nMatrix B\n"); for (int i = 0; i < size_B; i++){ printf("%f\t", h_B[i]); if (((i + 1) % WB) == 0) printf("\n"); } //print out the results printf("\nMatrix C (Results)\n"); for (int i = 0; i < size_C; i++){ printf("%f\t", h_C[i]); if (((i + 1) % WC) == 0) printf("\n"); } printf("\n"); printf("Matrix multiplication completed...\n"); //Shutdown and cleanup free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_A); clReleaseMemObject(d_C); clReleaseMemObject(d_B); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); std::cin.clear(); std::cin.sync(); std::cin.get(); }