int main(int argc, char *argv[]){ cl_uint numPlatforms; cl_platform_id* clSelectedPlatformID = NULL; int err; // error code returned from api calls int data[DATA_SIZE]; // original data set given to device int results[DATA_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id; cl_context context; cl_command_queue commands; cl_program program; cl_kernel kernel; cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array if(parseArgs(argc, argv)){ return 0; } // Fill our data set with random int values unsigned int count = DATA_SIZE; //////////////////////////////////////////////////////////////////////////////// // Simple compute kernel which computes the collatz of an input array // const char *KernelSource = fileToString("gpuFunctions.c"); //get Platform clGetPlatformIDs(0, NULL, &numPlatforms); clSelectedPlatformID = (cl_platform_id*)malloc(sizeof(cl_platform_id)*numPlatforms); err = clGetPlatformIDs(numPlatforms, clSelectedPlatformID, NULL); //get Device err = clGetDeviceIDs(clSelectedPlatformID[0], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } //create context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "allToOne", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation // input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } timer t = createTimer(); for(int i =0;i<rep;i++){ initData(data); // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } } double timeEnd = getTime(t); // Validate our results // correct = 0; for(int i = 0; i < arraySize; i++) { if(results[i] >= 0){ correct++; if(i==0){ printf("%d",results[i]); }else{ printf(",%d",results[i]); } } } printf("\n"); // Print a brief summary detailing the results printf("Computed '%d/%d' values to 1!\n", correct, arraySize); printf("TIME- %f\n",timeEnd); // Shutdown and cleanup clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
ErrorStatus gemm_clblas(cl_device_id device, const void *inMatrixA, int nrowA, int ncolA, bool transposeA, const void *inMatrixB, int nrowB, int ncolB, bool transposeB, double alpha, double beta, void *outMatrix, bool use_float) { std::stringstream result; float *input_matrixA_f = (float *)inMatrixA; float *input_matrixB_f = (float *)inMatrixB; float *output_matrix_f = (float *)outMatrix; double *input_matrixA_d = (double *)inMatrixA; double *input_matrixB_d = (double *)inMatrixB; double *output_matrix_d = (double *)outMatrix; if (debug) { result << "gemm_clblas( " << (use_float ? "FLOAT" : "DOUBLE") << ")" << std::endl << std::endl; } cl_int err = CL_SUCCESS; clblasStatus status = clblasSetup(); if (status != CL_SUCCESS) { if (debug) { result << "clblasSetup: " << clblasErrorToString(status) << std::endl; } err = CL_INVALID_OPERATION; } // get first platform cl_platform_id platform = NULL; if (err == CL_SUCCESS) { err = clGetPlatformIDs(1, &platform, NULL); } if (debug && err == CL_SUCCESS) { result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl; result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl; } // context cl_context context = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateContext:" << std::endl; } context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); } // queue cl_command_queue queue = NULL; if (err == CL_SUCCESS) { #ifdef CL_VERSION_2_0 if (debug) { result << "clCreateCommandQueueWithProperties:" << std::endl; } queue = clCreateCommandQueueWithProperties(context, device, NULL, &err); #else if (debug) { result << "clCreateCommandQueue:" << std::endl; } queue = clCreateCommandQueue(context, device, 0, &err); #endif } // buffers cl_mem cl_input_matrixA = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateBuffer cl_input_matrixA:" << std::endl; } if (use_float) { cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrowA * ncolA * sizeof(float), input_matrixA_f, &err); } else { cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrowA * ncolA * sizeof(double), input_matrixA_d, &err); } } cl_mem cl_input_matrixB = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateBuffer cl_input_matrixB:" << std::endl; } if (use_float) { cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrowB * ncolB * sizeof(float), input_matrixB_f, &err); } else { cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrowB * ncolB * sizeof(double), input_matrixB_d, &err); } } int nrowC = transposeA ? ncolA : nrowA; int ncolC = transposeB ? nrowB : ncolB; cl_mem cl_output_matrix = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateBuffer cl_output_vector:" << std::endl; } if (use_float) { cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, nrowC * ncolC * sizeof(float), output_matrix_f, &err); } else { cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, nrowC * ncolC * sizeof(double), output_matrix_d, &err); } } // ++++++++++++ const int lda = nrowA; // first dimension of A (rows), before any transpose const int ldb = nrowB; // first dimension of B (rows), before any transpose const int ldc = nrowC; // first dimension of C (rows) const int M = transposeA ? ncolA : nrowA; // rows in A (after transpose, if any) and C const int N = transposeB ? nrowB : ncolB; // cols in B (after transpose, if any) and C const int K = transposeA ? nrowA : ncolA; // cols in A and rows in B (after transposes, if any) const clblasOrder order = clblasColumnMajor; const clblasTranspose transA = transposeA ? clblasTrans : clblasNoTrans; const clblasTranspose transB = transposeB ? clblasTrans : clblasNoTrans; cl_event event = NULL; if (err == CL_SUCCESS) { if (use_float) { if (debug) { result << "clblasSgemm:" << std::endl; } status = clblasSgemm(order, transA, transB, M, N, K, alpha, cl_input_matrixA, 0, lda, cl_input_matrixB, 0, ldb, beta, cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS && debug) { result << "clblasSgemm error:" << clblasErrorToString(status) << std::endl; } } else { if (debug) { result << "clblasDgemm:" << std::endl; } status = clblasDgemm(order, transA, transB, M, N, K, alpha, cl_input_matrixA, 0, lda, cl_input_matrixB, 0, ldb, beta, cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { if (debug) { result << "clblasDgemm error:" << clblasErrorToString(status) << std::endl; } err = status; } } } if (err == CL_SUCCESS) { /* Wait for calculations to be finished. */ if (debug) { result << "clWaitForEvents:" << std::endl; } err = clWaitForEvents(1, &event); } // retrieve result if (err == CL_SUCCESS) { if (debug) { result << "Retrieve result:" << std::endl; } if (use_float) { clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(float), output_matrix_f, 0, NULL, NULL); } else { clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(double), output_matrix_d, 0, NULL, NULL); } } std::string err_str = clErrorToString(err); result << std::endl << err_str << std::endl; // cleanup clReleaseMemObject(cl_output_matrix); cl_output_matrix = NULL; clReleaseMemObject(cl_input_matrixA); cl_input_matrixA = NULL; clReleaseMemObject(cl_input_matrixB); cl_input_matrixB = NULL; clReleaseCommandQueue(queue); queue = NULL; clReleaseContext(context); context = NULL; if (debug) { CERR << result.str(); } ErrorStatus errorStatus = { err, status }; return errorStatus; }
int main() { typedef float ScalarType; ///////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////// Part 1: Set up a custom context and perform a sample operation. //////////////// //////////////////////// This is rather lengthy due to the OpenCL framework. //////////////// //////////////////////// The following does essentially the same as the //////////////// //////////////////////// 'custom_kernels'-tutorial! //////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////// //manually set up a custom OpenCL context: std::vector<cl_device_id> device_id_array; //get all available devices viennacl::ocl::platform pf; std::cout << "Platform info: " << pf.info() << std::endl; std::vector<viennacl::ocl::device> devices = pf.devices(CL_DEVICE_TYPE_DEFAULT); std::cout << devices[0].name() << std::endl; std::cout << "Number of devices for custom context: " << devices.size() << std::endl; //set up context using all found devices: for (size_t i=0; i<devices.size(); ++i) { device_id_array.push_back(devices[i].id()); } std::cout << "Creating context..." << std::endl; cl_int err; cl_context my_context = clCreateContext(0, device_id_array.size(), &(device_id_array[0]), NULL, NULL, &err); VIENNACL_ERR_CHECK(err); //create two Vectors: unsigned int vector_size = 10; std::vector<ScalarType> vec1(vector_size); std::vector<ScalarType> vec2(vector_size); std::vector<ScalarType> result(vector_size); // // fill the operands vec1 and vec2: // for (unsigned int i=0; i<vector_size; ++i) { vec1[i] = static_cast<ScalarType>(i); vec2[i] = static_cast<ScalarType>(vector_size-i); } // // create memory in OpenCL context: // cl_mem mem_vec1 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec1[0]), &err); VIENNACL_ERR_CHECK(err); cl_mem mem_vec2 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec2[0]), &err); VIENNACL_ERR_CHECK(err); cl_mem mem_result = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(result[0]), &err); VIENNACL_ERR_CHECK(err); // // create a command queue for each device: // std::vector<cl_command_queue> queues(devices.size()); for (size_t i=0; i<devices.size(); ++i) { queues[i] = clCreateCommandQueue(my_context, devices[i].id(), 0, &err); VIENNACL_ERR_CHECK(err); } // // create and build a program in the context: // size_t source_len = std::string(my_compute_program).length(); cl_program my_prog = clCreateProgramWithSource(my_context, 1, &my_compute_program, &source_len, &err); err = clBuildProgram(my_prog, 0, NULL, NULL, NULL, NULL); /* char buffer[1024]; cl_build_status status; clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL); clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_LOG, sizeof(char)*1024, &buffer, NULL); std::cout << "Build Scalar: Err = " << err << " Status = " << status << std::endl; std::cout << "Log: " << buffer << std::endl;*/ VIENNACL_ERR_CHECK(err); // // create a kernel from the program: // const char * kernel_name = "elementwise_prod"; cl_kernel my_kernel = clCreateKernel(my_prog, kernel_name, &err); VIENNACL_ERR_CHECK(err); // // Execute elementwise_prod kernel on first queue: result = vec1 .* vec2; // err = clSetKernelArg(my_kernel, 0, sizeof(cl_mem), (void*)&mem_vec1); VIENNACL_ERR_CHECK(err); err = clSetKernelArg(my_kernel, 1, sizeof(cl_mem), (void*)&mem_vec2); VIENNACL_ERR_CHECK(err); err = clSetKernelArg(my_kernel, 2, sizeof(cl_mem), (void*)&mem_result); VIENNACL_ERR_CHECK(err); err = clSetKernelArg(my_kernel, 3, sizeof(unsigned int), (void*)&vector_size); VIENNACL_ERR_CHECK(err); size_t global_size = vector_size; size_t local_size = vector_size; err = clEnqueueNDRangeKernel(queues[0], my_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL); VIENNACL_ERR_CHECK(err); // // Read and output result: // err = clEnqueueReadBuffer(queues[0], mem_vec1, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(vec1[0]), 0, NULL, NULL); VIENNACL_ERR_CHECK(err); err = clEnqueueReadBuffer(queues[0], mem_result, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(result[0]), 0, NULL, NULL); VIENNACL_ERR_CHECK(err); std::cout << "vec1 : "; for (size_t i=0; i<vec1.size(); ++i) std::cout << vec1[i] << " "; std::cout << std::endl; std::cout << "vec2 : "; for (size_t i=0; i<vec2.size(); ++i) std::cout << vec2[i] << " "; std::cout << std::endl; std::cout << "result: "; for (size_t i=0; i<result.size(); ++i) std::cout << result[i] << " "; std::cout << std::endl; //////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////// Part 2: Let ViennaCL use the already created context: ////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////// //Tell ViennaCL to use the previously created context. //This context is assigned an id '0' when using viennacl::ocl::switch_context(). viennacl::ocl::setup_context(0, my_context, device_id_array, queues); viennacl::ocl::switch_context(0); //activate the new context (only mandatory with context-id not equal to zero) // // Proof that ViennaCL really uses the new context: // std::cout << "Existing context: " << my_context << std::endl; std::cout << "ViennaCL uses context: " << viennacl::ocl::current_context().handle().get() << std::endl; // // Wrap existing OpenCL objects into ViennaCL: // viennacl::vector<ScalarType> vcl_vec1(mem_vec1, vector_size); viennacl::vector<ScalarType> vcl_vec2(mem_vec2, vector_size); viennacl::vector<ScalarType> vcl_result(mem_result, vector_size); viennacl::scalar<ScalarType> vcl_s = 2.0; std::cout << "Standard vector operations within ViennaCL:" << std::endl; vcl_result = vcl_s * vcl_vec1 + vcl_vec2; std::cout << "vec1 : "; std::cout << vcl_vec1 << std::endl; std::cout << "vec2 : "; std::cout << vcl_vec2 << std::endl; std::cout << "result: "; std::cout << vcl_result << std::endl; // // We can also reuse the existing elementwise_prod kernel. // Therefore, we first have to make the existing program known to ViennaCL // For more details on the three lines, see tutorial 'custom-kernels' // std::cout << "Using existing kernel within the OpenCL backend of ViennaCL:" << std::endl; viennacl::ocl::program & my_vcl_prog = viennacl::ocl::current_context().add_program(my_prog, "my_compute_program"); viennacl::ocl::kernel & my_vcl_kernel = my_vcl_prog.add_kernel("elementwise_prod"); viennacl::ocl::enqueue(my_vcl_kernel(vcl_vec1, vcl_vec2, vcl_result, static_cast<cl_uint>(vcl_vec1.size()))); //Note that size_t might differ between host and device. Thus, a cast to cl_uint is necessary here. std::cout << "vec1 : "; std::cout << vcl_vec1 << std::endl; std::cout << "vec2 : "; std::cout << vcl_vec2 << std::endl; std::cout << "result: "; std::cout << vcl_result << std::endl; // // Since a linear piece of memory can be interpreted in several ways, // we will now create a 3x3 row-major matrix out of the linear memory in mem_vec1/ // The first three entries in vcl_vec2 and vcl_result are used to carry out matrix-vector products: // viennacl::matrix<ScalarType> vcl_matrix(mem_vec1, 3, 3); vcl_vec2.resize(3); //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied) vcl_result.resize(3); //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied) vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_vec2); std::cout << "result of matrix-vector product: "; std::cout << vcl_result << std::endl; // // That's it. // std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl; return 0; }
/* * pgstrom_collect_device_info * * It collects properties of all the OpenCL devices. It shall be called once * by the OpenCL management worker process, prior to any other backends. */ static List * construct_opencl_device_info(int platform_index) { cl_platform_id platforms[32]; cl_device_id devices[MAX_NUM_DEVICES]; cl_uint n_platform; cl_uint n_devices; cl_int i, j, rc; long score_max = -1; List *result = NIL; rc = clGetPlatformIDs(lengthof(platforms), platforms, &n_platform); if (rc != CL_SUCCESS) elog(ERROR, "clGetPlatformIDs failed (%s)", opencl_strerror(rc)); for (i=0; i < n_platform; i++) { pgstrom_platform_info *pl_info; pgstrom_device_info *dev_info; long score = 0; List *temp = NIL; pl_info = collect_opencl_platform_info(platforms[i]); pl_info->pl_index = i; rc = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, lengthof(devices), devices, &n_devices); if (rc != CL_SUCCESS) elog(ERROR, "clGetDeviceIDs failed (%s)", opencl_strerror(rc)); elog(LOG, "PG-Strom: [%d] OpenCL Platform: %s", i, pl_info->pl_name); for (j=0; j < n_devices; j++) { dev_info = collect_opencl_device_info(devices[j]); dev_info->pl_info = pl_info; dev_info->dev_index = j; elog(LOG, "PG-Strom: + device %s (%uMHz x %uunits, %luMB)", dev_info->dev_name, dev_info->dev_max_clock_frequency, dev_info->dev_max_compute_units, dev_info->dev_global_mem_size >> 20); /* rough estimation about computing power */ if ((dev_info->dev_type & CL_DEVICE_TYPE_GPU) != 0) score += 32 * (dev_info->dev_max_compute_units * dev_info->dev_max_clock_frequency); else score += (dev_info->dev_max_compute_units * dev_info->dev_max_clock_frequency); temp = lappend(temp, dev_info); } if (platform_index == i || (platform_index < 0 && score > score_max)) { opencl_platform_id = platforms[i]; opencl_num_devices = n_devices; for (j=0; j < n_devices; j++) opencl_devices[j] = devices[j]; score_max = score; result = temp; } } /* show platform name if auto-selection */ if (platform_index < 0 && result != NIL) { pgstrom_platform_info *pl_info = ((pgstrom_device_info *) linitial(result))->pl_info; elog(LOG, "PG-Strom: auto platform selection: %s", pl_info->pl_name); } if (result != NIL) { /* * Create an OpenCL context */ opencl_context = clCreateContext(NULL, opencl_num_devices, opencl_devices, NULL, NULL, &rc); if (rc != CL_SUCCESS) elog(ERROR, "clCreateContext failed: %s", opencl_strerror(rc)); /* * Create an OpenCL command queue for each device */ for (j=0; j < opencl_num_devices; j++) { opencl_cmdq[j] = clCreateCommandQueue(opencl_context, opencl_devices[j], CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, &rc); if (rc != CL_SUCCESS) elog(ERROR, "clCreateCommandQueue failed: %s", opencl_strerror(rc)); } } return result; }
xcl_world xcl_world_single(cl_device_type device_type, char *target_vendor, char *target_device) { int err; xcl_world world; cl_uint num_platforms; err = clGetPlatformIDs(0, NULL, &num_platforms); if (err != CL_SUCCESS) { printf("Error: no platforms available or OpenCL install broken"); printf("Test failed\n"); exit(EXIT_FAILURE); } cl_platform_id *platform_ids = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms); if (platform_ids == NULL) { printf("Error: Out of Memory\n"); printf("Test failed\n"); exit(EXIT_FAILURE); } err = clGetPlatformIDs(num_platforms, platform_ids, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to find an OpenCL platform!\n"); printf("Test failed\n"); exit(EXIT_FAILURE); } int i; char cl_platform_vendor[1001]; //find target vendor if target_vendor is specified if (target_vendor != NULL) { for(i = 0; i < num_platforms; i++) { err = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_VENDOR, 1000, (void *)cl_platform_vendor,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n"); printf("Test failed\n"); exit(EXIT_FAILURE); } if ((target_vendor != NULL) && (strcmp(cl_platform_vendor, target_vendor) == 0)) { printf("INFO: Selected platform %d from %s\n", i, cl_platform_vendor); world.platform_id = platform_ids[i]; break; } } } else { for(i = 0; i < num_platforms; i++) { err = clGetDeviceIDs(platform_ids[i], device_type, 1, &world.device_id, NULL); if (err == CL_SUCCESS) { world.platform_id = platform_ids[i]; break; } } } free(platform_ids); if (i == num_platforms) { printf("Error: Failed to find a platform\n"); printf("Test failed\n"); exit(EXIT_FAILURE); } if (target_device != NULL) { //find target device cl_device_id devices[16]; // compute device id cl_uint num_devices; char cl_device_name[100]; err = clGetDeviceIDs(world.platform_id, CL_DEVICE_TYPE_ACCELERATOR, 16, devices, &num_devices); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); printf("Test failed\n"); exit(EXIT_FAILURE); } //iterate all devices to select the target device. for (i=0; i<num_devices; i++) { err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 100, cl_device_name, 0); if (err != CL_SUCCESS) { printf("Error: Failed to get device name for device %d!\n", i); printf("Test failed\n"); exit(EXIT_FAILURE); } //printf("CL_DEVICE_NAME %s\n", cl_device_name); if (strcmp(cl_device_name, target_device) == 0) { world.device_id = devices[i]; printf("INFO: Selected %s as the target device\n", cl_device_name); break; } } if (i == num_devices) { printf("Error: Failed to find target device %s\n", target_device); printf("Test failed\n"); exit(EXIT_FAILURE); } } world.context = clCreateContext(0, 1, &world.device_id, NULL, NULL, &err); if (err != CL_SUCCESS) { printf("Error: Failed to create a compute context!\n"); printf("Test failed\n"); exit(EXIT_FAILURE); } world.command_queue = clCreateCommandQueue(world.context, world.device_id, CL_QUEUE_PROFILING_ENABLE, &err); if (err != CL_SUCCESS) { printf("Error: Failed to create a command queue!\n"); printf("Test failed\n"); exit(EXIT_FAILURE); } return world; }
int main() { // Set the image rotation (in degrees) float theta = 3.14159/6; float cos_theta = cosf(theta); float sin_theta = sinf(theta); printf("theta = %f (cos theta = %f, sin theta = %f)\n", theta, cos_theta, sin_theta); // Rows and columns in the input image int imageHeight; int imageWidth; const char* inputFile = "input.bmp"; const char* outputFile = "output.bmp"; // Homegrown function to read a BMP from file float* inputImage = readImage(inputFile, &imageWidth, &imageHeight); // Size of the input and output images on the host int dataSize = imageHeight*imageWidth*sizeof(float); // Output image on the host float* outputImage = NULL; outputImage = (float*)malloc(dataSize); // Set up the OpenCL environment cl_int status; // Discovery platform cl_platform_id platforms[2]; cl_platform_id platform; status = clGetPlatformIDs(2, platforms, NULL); chk(status, "clGetPlatformIDs"); platform = platforms[PLATFORM_TO_USE]; // Discover device cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); chk(status, "clGetDeviceIDs"); // Create context cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platform), 0}; cl_context context; context = clCreateContext(props, 1, &device, NULL, NULL, &status); chk(status, "clCreateContext"); // Create command queue cl_command_queue queue; queue = clCreateCommandQueue(context, device, 0, &status); chk(status, "clCreateCommandQueue"); // Create the input and output buffers cl_mem d_input; d_input = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, &status); chk(status, "clCreateBuffer"); cl_mem d_output; d_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL, &status); chk(status, "clCreateBuffer"); // Copy the input image to the device status = clEnqueueWriteBuffer(queue, d_input, CL_TRUE, 0, dataSize, inputImage, 0, NULL, NULL); chk(status, "clEnqueueWriteBuffer"); const char* source = readSource("rotation.cl"); // Create a program object with source and build it cl_program program; program = clCreateProgramWithSource(context, 1, &source, NULL, NULL); chk(status, "clCreateProgramWithSource"); status = clBuildProgram(program, 1, &device, NULL, NULL, NULL); chk(status, "clBuildProgram"); // Create the kernel object cl_kernel kernel; kernel = clCreateKernel(program, "img_rotate", &status); chk(status, "clCreateKernel"); // Set the kernel arguments status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_output); status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_input); status |= clSetKernelArg(kernel, 2, sizeof(int), &imageWidth); status |= clSetKernelArg(kernel, 3, sizeof(int), &imageHeight); status |= clSetKernelArg(kernel, 4, sizeof(float), &sin_theta); status |= clSetKernelArg(kernel, 5, sizeof(float), &cos_theta); chk(status, "clSetKernelArg"); // Set the work item dimensions size_t globalSize[2] = {imageWidth, imageHeight}; status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0, NULL, NULL); chk(status, "clEnqueueNDRange"); // Read the image back to the host status = clEnqueueReadBuffer(queue, d_output, CL_TRUE, 0, dataSize, outputImage, 0, NULL, NULL); chk(status, "clEnqueueReadBuffer"); // Write the output image to file storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile); return 0; }
void runProgram(int N, char *fileName) { printf("GPU Symmetrize()..." "\nSquareMatrix[%d][%d]\n", N, N); int i,j; // initialize input array float *A; A = (float*)malloc(sizeof(float)*N*N); for( i = 0; i < N ; ++i ) { for( j = 0; j < N ; ++j ) { A[i*N + j] = j; } } // result float *Aout; Aout = (float*)malloc(sizeof(float)*N*N); #ifdef DEBUG puts("A"); check_2d_f(A,N,N); #endif int NumK = 1; int NumE = 2; double gpuTime; cl_ulong gstart, gend; //------------------------------------------------ // OpenCL //------------------------------------------------ cl_int err; cl_platform_id platform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK); cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE); // read kernel file //char *fileName = "transpose_kernel.cl"; char *kernelSource; size_t size; FILE *fh = fopen(fileName, "rb"); if(!fh) { printf("Error: Failed to open kernel file!\n"); exit(1); } fseek(fh,0,SEEK_END); size=ftell(fh); fseek(fh,0,SEEK_SET); kernelSource = malloc(size+1); size_t result; result = fread(kernelSource,1,size,fh); if(result != size){ fputs("Reading error", stderr);exit(1);} kernelSource[size] = '\0'; // Bind to platform err = clGetPlatformIDs(1, &platform, NULL); OCL_CHECK(err); // Get ID for the device err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); OCL_CHECK(err); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); OCL_CHECK(err); // Create a command queue queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); OCL_CHECK(err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err); OCL_CHECK(err); // turn on optimization for kernel char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only"; err = clBuildProgram(program, 1, &device_id, options, NULL, NULL); if(err != CL_SUCCESS) printCompilerOutput(program, device_id); OCL_CHECK(err); #ifdef SAVEBIN // Calculate size of binaries size_t binary_size; err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL); OCL_CHECK(err); //printf("binary size = %ld\n", binary_size); unsigned char* bin; bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size); err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*) , &bin, NULL); OCL_CHECK(err); //puts("save binaries"); // Print the binary out to the output file fh = fopen("kernel.bin", "wb"); fwrite(bin, 1, binary_size, fh); fclose(fh); puts("done save binaries"); #endif kernel[0] = clCreateKernel(program, "kernel_a", &err); OCL_CHECK(err); // memory on device cl_mem A_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); cl_mem Aout_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); // copy data to device err = clEnqueueWriteBuffer(queue, A_d, CL_TRUE, 0, sizeof(float)*N*N, A, 0, NULL , &event[0]); OCL_CHECK(err); size_t localsize[2]; size_t globalsize[2]; localsize[0] = 16; localsize[1] = 16; globalsize[0] = N; globalsize[1] = N; err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &Aout_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clEnqueueNDRangeKernel(queue, kernel[0], 2, NULL, globalsize, localsize, 0, NULL, NULL); OCL_CHECK(err); clFinish(queue); // read device data back to host clEnqueueReadBuffer(queue, Aout_d, CL_TRUE, 0, sizeof(float)*N*N, Aout, 0, NULL , &event[1]); err = clWaitForEvents(1,&event[1]); OCL_CHECK(err); err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL); OCL_CHECK(err); err = clGetEventProfilingInfo (event[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL); OCL_CHECK(err); gpuTime = (double)(gend -gstart)/1000000000.0; //check_1d_f(sum, blks+1); #ifdef DEBUG puts("Output"); check_2d_f(Aout,N,N); #endif printf("oclTime = %lf (s)\n", gpuTime ); // free clReleaseMemObject(A_d); clReleaseMemObject(Aout_d); // // check // int flag = 1; // for(i=0;i<N;++i){ // for(j=0;j<N;++j){ // if(A[i*N+j] != At[j*N+i]) // { // flag = 0; // break; // } // } // } // if( flag == 0 ) // { // puts("Bugs! Check program."); // }else{ // puts("Succeed!"); // } clReleaseProgram(program); clReleaseContext(context); clReleaseCommandQueue(queue); for(i=0;i<NumK;++i){ clReleaseKernel(kernel[i]); } for(i=0;i<NumE;++i){ clReleaseEvent(event[i]); } free(kernelSource); #ifdef SAVEBIN free(bin); #endif free(A); free(Aout); return; }
int main(int argc, char** argv) { int rank, size; // MPI rank & size int err; // error code returned from OpenCL calls float h_a[LENGTH]; // a vector float h_b[LENGTH]; // b vector float h_c[LENGTH]; // c vector (a+b) returned from the compute device (local per task) float _h_c[LENGTH]; // c vector (a+b) returned from the compute device (global for master) unsigned int correct; // number of correct results size_t global; // global domain size size_t local; // local domain size cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel ko_vadd; // compute kernel cl_mem d_a; // device memory used for the input a vector cl_mem d_b; // device memory used for the input b vector cl_mem d_c; // device memory used for the output c vector int mycount, i; err = MPI_Init (&argc, &argv); if (err != MPI_SUCCESS) { printf ("MPI_Init failed!\n"); exit (-1); } err = MPI_Comm_rank (MPI_COMM_WORLD, &rank); if (err != MPI_SUCCESS) { printf ("MPI_Comm_rank failed!\n"); exit (-1); } err = MPI_Comm_size (MPI_COMM_WORLD, &size); if (err != MPI_SUCCESS) { printf ("MPI_Comm_size failed\n"); exit (-1); } if (LENGTH % size != 0) { printf ("Number of MPI processes must divide LENGTH (%d)\n", LENGTH); exit (-1); } mycount = LENGTH / size; if (rank == 0) { for (i = 0; i < LENGTH; i++) { h_a[i] = rand() / (float)RAND_MAX; h_b[i] = rand() / (float)RAND_MAX; h_a[i] = i; h_b[i] = i*2; } err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed transferring h_a\n"); exit (-1); } err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed transferring h_b\n"); exit (-1); } } else { err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed receiving h_a\n"); exit (-1); } err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed receiving h_b\n"); exit (-1); } } // Set up platform cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to find a platform!\n"); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to get the platform!\n"); return EXIT_FAILURE; } // Secure a GPU for (i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) break; } if (device_id == NULL) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } else { if (output_device_info (rank, device_id) != CL_SUCCESS) return EXIT_FAILURE; } // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel from the program ko_vadd = clCreateKernel(program, "vadd", &err); if (!ko_vadd || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input (a, b) and output (c) arrays in device memory d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * mycount, NULL, NULL); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * mycount, NULL, NULL); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * mycount, NULL, NULL); if (!d_a || !d_b || !d_c) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write a and b vectors into compute device memory err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0, sizeof(float) * mycount, &h_a[rank*mycount], 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write h_a to source array!\n"); exit(1); } err = clEnqueueWriteBuffer(commands, d_b, CL_TRUE, 0, sizeof(float) * mycount, &h_b[rank*mycount], 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write h_b to source array!\n"); exit(1); } // Set the arguments to our compute kernel err = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_a); err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_b); err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_c); err |= clSetKernelArg(ko_vadd, 3, sizeof(unsigned int), &mycount); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(ko_vadd, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device global = LENGTH; err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the commands to complete before reading back results clFinish(commands); // Read back the results from the compute device err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * mycount, &h_c, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } err = MPI_Gather (h_c, mycount, MPI_FLOAT, _h_c, mycount, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Gather failed receiving h_c\n"); exit (-1); } if (rank == 0) { // Test the results correct = 0; float tmp; for(i = 0; i < LENGTH; i++) { tmp = h_a[i] + h_b[i]; // assign element i of a+b to tmp tmp -= _h_c[i]; // compute deviation of expected and output result if(tmp*tmp < TOL*TOL) // correct if square deviation is less than tolerance squared correct++; else printf(" tmp %f h_a %f h_b %f h_c %f \n",tmp, h_a[i], h_b[i], _h_c[i]); } // summarize results printf("C = A+B: %d out of %d results were correct.\n", correct, LENGTH); } // cleanup then shutdown clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(ko_vadd); clReleaseCommandQueue(commands); clReleaseContext(context); err = MPI_Finalize (); if (err != MPI_SUCCESS) { printf ("MPI_Finalize failed!\n"); exit (-1); } return 0; }
// Main program //***************************************************************************** int main(int argc, char** argv) { // Locals used with command line args int p = 256; // workgroup X dimension int q = 1; // workgroup Y dimension pArgc = &argc; pArgv = argv; shrQAStart(argc, argv); // latch the executable path for other funcs to use cExecutablePath = argv[0]; // start logs and show command line help shrSetLogFileName ("oclNbody.txt"); shrLog("%s Starting...\n\n", cExecutablePath); shrLog("Command line switches:\n"); shrLog(" --qatest\t\tCheck correctness of GPU execution and measure performance)\n"); shrLog(" --noprompt\t\tQuit simulation automatically after a brief period\n"); shrLog(" --n=<numbodies>\tSpecify # of bodies to simulate (default = %d)\n", numBodies); shrLog(" --double\t\tUse double precision floating point values for simulation\n"); shrLog(" --p=<workgroup X dim>\tSpecify X dimension of workgroup (default = %d)\n", p); shrLog(" --q=<workgroup Y dim>\tSpecify Y dimension of workgroup (default = %d)\n\n", q); // Get command line arguments if there are any and set vars accordingly if (argc > 0) { shrGetCmdLineArgumenti(argc, (const char**)argv, "p", &p); shrGetCmdLineArgumenti(argc, (const char**)argv, "q", &q); shrGetCmdLineArgumenti(argc, (const char**)argv, "n", &numBodies); bDouble = (shrTRUE == shrCheckCmdLineFlag(argc, (const char**)argv, "double")); bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); bQATest = shrCheckCmdLineFlag(argc, (const char**)argv, "qatest"); } //Get the NVIDIA platform cl_int ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clGetPlatformID...\n\n"); if (bDouble) { shrLog("Double precision execution...\n\n"); } else { shrLog("Single precision execution...\n\n"); } flopsPerInteraction = bDouble ? 30 : 20; //Get all the devices shrLog("Get the Device info and select Device...\n"); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Set target device and Query number of compute units on uiTargetDevice shrLog(" # of Devices Available = %u\n", uiNumDevices); if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE) { uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1)); } shrLog(" Using Device %u, ", uiTargetDevice); oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]); cl_uint uiNumComputeUnits; clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL); shrLog(" # of Compute Units = %u\n", uiNumComputeUnits); //Create the context shrLog("clCreateContext...\n"); cxContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create a command-queue shrLog("clCreateCommandQueue...\n\n"); cqCommandQueue = clCreateCommandQueue(cxContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Log and config for number of bodies shrLog("Number of Bodies = %d\n", numBodies); switch (numBodies) { case 1024: activeParams.m_clusterScale = 1.52f; activeParams.m_velocityScale = 2.f; break; case 2048: activeParams.m_clusterScale = 1.56f; activeParams.m_velocityScale = 2.64f; break; case 4096: activeParams.m_clusterScale = 1.68f; activeParams.m_velocityScale = 2.98f; break; case 7680: case 8192: activeParams.m_clusterScale = 1.98f; activeParams.m_velocityScale = 2.9f; break; default: case 15360: case 16384: activeParams.m_clusterScale = 1.54f; activeParams.m_velocityScale = 8.f; break; case 30720: case 32768: activeParams.m_clusterScale = 1.44f; activeParams.m_velocityScale = 11.f; break; } if ((q * p) > 256) { p = 256 / q; shrLog("Setting p=%d to maintain %d threads per block\n", p, 256); } if ((q == 1) && (numBodies < p)) { p = numBodies; shrLog("Setting p=%d because # of bodies < p\n", p); } shrLog("Workgroup Dims = (%d x %d)\n\n", p, q); // Initialize OpenGL items if using GL if (bQATest == shrFALSE) { shrLog("Calling InitGL...\n"); InitGL(&argc, argv); } else { shrLog("Skipping InitGL...\n"); } // CL/GL interop disabled bUsePBO = (false && (bQATest == shrFALSE)); InitNbody(cdDevices[uiTargetDevice], cxContext, cqCommandQueue, numBodies, p, q, bUsePBO, bDouble); ResetSim(nbody, numBodies, NBODY_CONFIG_SHELL, bUsePBO); // init timers shrDeltaT(DEMOTIME); // timer 0 is for timing demo periods shrDeltaT(FUNCTIME); // timer 1 is for logging function delta t's shrDeltaT(FPSTIME); // timer 2 is for fps measurement // Standard simulation if (bQATest == shrFALSE) { shrLog("Running standard oclNbody simulation...\n\n"); glutDisplayFunc(DisplayGL); glutReshapeFunc(ReshapeGL); glutMouseFunc(MouseGL); glutMotionFunc(MotionGL); glutKeyboardFunc(KeyboardGL); glutSpecialFunc(SpecialGL); glutIdleFunc(IdleGL); glutMainLoop(); } // Compare to host, profile and write out file for regression analysis if (bQATest == shrTRUE) { bool bTestResults = false; shrLog("Running oclNbody Results Comparison...\n\n"); bTestResults = CompareResults(numBodies); shrLog("Profiling oclNbody...\n\n"); RunProfiling(100, (unsigned int)(p * q)); // 100 iterations shrQAFinish(argc, (const char **)argv, bTestResults ? QA_PASSED : QA_FAILED); } else { // Cleanup/exit bNoPrompt = shrTRUE; shrQAFinish2(false, *pArgc, (const char **)pArgv, QA_PASSED); } Cleanup(EXIT_SUCCESS); }
int main() { cl_device_id device = new_device(); cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_int i, j, err; float matrix_1[80], matrix_2[80], matrix_3[80]; const size_t buffer_origin[3] = { 5 * sizeof(float), 3, 0 }; const size_t host_origin[3] = { 1 * sizeof(float), 1, 0 }; const size_t region[3] = { 4 * sizeof(float), 4, 1 }; cl_mem matrix_buffer_1, matrix_buffer_2, matrix_buffer_3; for (i = 0; i < 80; i++) { matrix_1[i] = i * 1.0f; matrix_2[i] = 3.0; matrix_3[i] = 0; } context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if (err < 0) { perror("Couldn't create a context\n"); exit(1); } program = build_program(context, device, FILE_NAME); kernel = clCreateKernel(program, "add", &err); if (err < 0) { perror("Couldn't create a kernel\n"); exit(1); } matrix_buffer_1 = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(matrix_1), matrix_1, &err); if (err < 0) { perror("Couldn't create a buffer\n"); exit(1); } matrix_buffer_2 = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(matrix_2), matrix_2, &err); if (err < 0) { perror("Couldn't create a buffer\n"); exit(1); } matrix_buffer_3 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(matrix_3), matrix_3, &err); if (err < 0) { perror("Couldn't create a buffer\n"); exit(1); } int row = 8; int col = 10; err = clSetKernelArg(kernel, 0, sizeof(int), &row); err = clSetKernelArg(kernel, 1, sizeof(int), &col); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &matrix_buffer_1); err = clSetKernelArg(kernel, 3, sizeof(cl_mem), &matrix_buffer_2); err = clSetKernelArg(kernel, 4, sizeof(cl_mem), &matrix_buffer_3); queue = clCreateCommandQueue(context, device, 0, &err); if (err < 0) { perror("Couldn't create a command queue\n"); exit(1); } err = clEnqueueTask(queue, kernel, 0, NULL, NULL); if (err < 0) { perror("Couldn't enque task\n"); exit(1); } err = clEnqueueReadBuffer(queue, matrix_buffer_3, CL_TRUE, 0, sizeof(matrix_3), &matrix_3, 0, NULL, NULL); for (i = 0; i < 8; i++) { for (j = 0; j < 10; j++) { printf("%6.1f ", matrix_3[j + i * 10]); } printf("\n"); } clReleaseMemObject(matrix_buffer_1); clReleaseMemObject(matrix_buffer_2); clReleaseMemObject(matrix_buffer_3); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseContext(context); return 0; }
void call_kernel(float *data,unsigned int count,char * cl_name,float *results) { FILE* programHandle; size_t programSize, KernelSourceSize; char *programBuffer, *KernelSource; size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array int err; int gpu = 1; err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); commands = clCreateCommandQueue(context, device_id, 0, &err); //---------------------------------------------------------------------------- // get size of kernel source programHandle = fopen(cl_name, "r"); fseek(programHandle, 0, SEEK_END); programSize = ftell(programHandle); rewind(programHandle); programBuffer = (char*) malloc(programSize + 1); programBuffer[programSize] = '\0'; fread(programBuffer, sizeof(char), programSize, programHandle); fclose(programHandle); // create program from buffer program = clCreateProgramWithSource(context,1,(const char**) &programBuffer,&programSize, NULL); free(programBuffer); // read kernel source back in from program to check clGetProgramInfo(program, CL_PROGRAM_SOURCE, 0, NULL, &KernelSourceSize); KernelSource = (char*) malloc(KernelSourceSize); clGetProgramInfo(program, CL_PROGRAM_SOURCE, KernelSourceSize, KernelSource, NULL); program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); kernel = clCreateKernel(program, "square", &err); //---------------------------------------------------------------------------- input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); clFinish(commands); err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); printf("nKernel source:\n\n %s \n", KernelSource); free(KernelSource); }
int main() { /* Host/device data structures */ cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; cl_int err; /* Program/kernel data structures */ cl_program program; FILE *program_handle; char *program_buffer, *program_log; size_t program_size, log_size; cl_kernel kernel; size_t offset = 0; size_t global_size, local_size; /* Data and buffers */ char pattern[16] = "thatwithhavefrom"; FILE *text_handle; char *text; size_t text_size; int chars_per_item; int result[4] = {0, 0, 0, 0}; cl_mem text_buffer, result_buffer; /* Identify a platform */ err = clGetPlatformIDs(1, &platform, NULL); if(err < 0) { perror("Couldn't identify a platform"); exit(1); } /* Access a device */ err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if(err < 0) { perror("Couldn't access any devices"); exit(1); } /* Determine global size and local size */ clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(global_size), &global_size, NULL); clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL); global_size *= local_size; /* Create a context */ context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Read program file and place content into buffer */ program_handle = fopen(PROGRAM_FILE, "r"); if(program_handle == NULL) { perror("Couldn't find the program file"); exit(1); } fseek(program_handle, 0, SEEK_END); program_size = ftell(program_handle); rewind(program_handle); program_buffer = (char*)calloc(program_size+1, sizeof(char)); fread(program_buffer, sizeof(char), program_size, program_handle); fclose(program_handle); /* Read text file and place content into buffer */ text_handle = fopen(TEXT_FILE, "r"); if(text_handle == NULL) { perror("Couldn't find the text file"); exit(1); } fseek(text_handle, 0, SEEK_END); text_size = ftell(text_handle)-1; rewind(text_handle); text = (char*)calloc(text_size, sizeof(char)); fread(text, sizeof(char), text_size, text_handle); fclose(text_handle); chars_per_item = text_size / global_size + 1; /* Create program from file */ program = clCreateProgramWithSource(context, 1, (const char**)&program_buffer, &program_size, &err); if(err < 0) { perror("Couldn't create the program"); exit(1); } free(program_buffer); /* Build program */ err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if(err < 0) { /* Find size of log and print to std output */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) calloc(log_size+1, sizeof(char)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("%s\n", program_log); free(program_log); exit(1); } /* Create a kernel */ kernel = clCreateKernel(program, KERNEL_FUNC, &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; /* Create buffers to hold the text characters and count */ text_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, text_size, text, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; result_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(result), result, NULL); /* Create kernel argument */ err = clSetKernelArg(kernel, 0, sizeof(pattern), pattern); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &text_buffer); err |= clSetKernelArg(kernel, 2, sizeof(chars_per_item), &chars_per_item); err |= clSetKernelArg(kernel, 3, 4 * sizeof(int), NULL); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &result_buffer); if(err < 0) { printf("Couldn't set a kernel argument"); exit(1); }; /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Enqueue kernel */ err = clEnqueueNDRangeKernel(queue, kernel, 1, &offset, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the kernel"); printf("Error code: %d\n", err); exit(1); } /* Read and print the result */ err = clEnqueueReadBuffer(queue, result_buffer, CL_TRUE, 0, sizeof(result), &result, 0, NULL, NULL); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } printf("\nResults: \n"); printf("Number of occurrences of 'that': %d\n", result[0]); printf("Number of occurrences of 'with': %d\n", result[1]); printf("Number of occurrences of 'have': %d\n", result[2]); printf("Number of occurrences of 'from': %d\n", result[3]); /* Deallocate resources */ clReleaseMemObject(result_buffer); clReleaseMemObject(text_buffer); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
int main() { //Control Variables bool showStartInput=false;// Setting it to true shows the original Input bool showFftOutput=false;// Shows the output after the FFT but before the Reshuffle bool showReshuffleOutput=false;// Shows the output after the reshuffle bool showFinalResult=false; // Shows final result after cross-correlation bool showGemmInput=false; // Shows output after the reshuffle but before the matrix multiplication bool showReformatOutputAfterReshuffle=false; // Shows output after it has been reformatted after the reshuffling //openCL State cl_platform_id platform_id=NULL; cl_device_id device_id=NULL; cl_context context=NULL; cl_command_queue queue=NULL; cl_program program=NULL; cl_kernel kernel=NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret=0; // Stores the error values retuned by many functions cl_event event = NULL; cl_event events[10]; cl_kernel clKernel; //FFT state clAmdFftPlanHandle plHandle; clAmdFftResultLocation place = CLFFT_OUTOFPLACE; //Alternative CLFFT_INPLACE clAmdFftLayout inLayout = CLFFT_COMPLEX_INTERLEAVED; clAmdFftLayout outLayout = CLFFT_COMPLEX_INTERLEAVED; clAmdFftDim dim = CLFFT_1D; size_t clStrides[3]={0,0,0}; size_t clLengths[3]; clLengths[0]=(MEM_SIZE/2);//Length of first dimension of fft clLengths[1]=1;//length of second dimension of fft clLengths[2]=1; clStrides[ 0 ] = 1; clStrides[ 1 ] = clStrides[ 0 ] * clLengths[ 0 ]; clStrides[ 2 ] = clStrides[ 1 ] * clLengths[ 1 ]; clStrides[ 3 ] = clStrides[ 2 ] * clLengths[ 2 ]; size_t batchSize=CHANSIZE;//number of discreet fft's to be calculated simultaneously //Initialise openCL OPENCL_V_THROW(clGetPlatformIDs(1, &platform_id, &ret_num_platforms),"clGetPlatformIDs Failed"); OPENCL_V_THROW(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,&ret_num_devices),"clGetDeviceIDs Failed"); context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); OPENCL_V_THROW(ret, "Creating Context failed" ); queue = clCreateCommandQueue(context, device_id, 0, &ret); OPENCL_V_THROW(ret, "Creating command queue failed" ); //===========Initialise the host buffers====================================== /* * The functions sgenerate2darray(), screate2darray() and sgenerate2darrayout() are defined and declared in definition.h */ float** src_a_h=sgenerate2darray(NO_INPUTS,MEM_SIZE);//To be used to store the original input float** answer=screate2darray(NO_INPUTS,MEM_SIZE);//To be used to store the answer after the reshuffling float** corr_h=sgenerate2darrayout(NO_INPUTS,CHANSIZE << 1,CHANNELNO);// To be used to store the final answer if(showStartInput){ cout << "Initial Input Buffer" << "\n"; for(int j=0;j<NO_INPUTS;j++){ for(int i=0;i<MEM_SIZE;i++){ cout << src_a_h[j][i] << " "; }cout << "\n"; }printf("\n"); } //=================================================================== //Calculation of facs for reshuffling complex <float>* facs_h=(complex <float>*) malloc(sizeof(complex <float>)*(MEM_SIZE/2)); complex<float> I=1.0i; complex <float> xx=2.0*PI; for(int i=0;i<MEM_SIZE/2;i++){ facs_h[i]=(1.0*i)/(1.0*MEM_SIZE); facs_h[i]=exp(xx*(-I*facs_h[i])); } //=================================================================== //Initialise GPU memory buffers size_t sizeofgpumem=NO_INPUTS*MEM_SIZE*sizeof(float); size_t sizeoffacsmem=MEM_SIZE*sizeof(float); cl_mem clMemBuffersIn = clCreateBuffer(context,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,sizeofgpumem,src_a_h[0],&ret); OPENCL_V_THROW( ret, "Creating clMemBuffersIn Buffer failed" ); cl_mem clMemBuffersOut = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeofgpumem,NULL,&ret); OPENCL_V_THROW (ret, "Creating fft output Buffer failed"); cl_mem facs = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,sizeoffacsmem,facs_h,&ret); OPENCL_V_THROW (ret, "Creating facs Buffer failed"); //===========================Starting the fft=============================// clAmdFftSetupData setupData; OPENCL_V_THROW( clAmdFftInitSetupData( &setupData ),"clAmdFftInitSetupData failed" ); OPENCL_V_THROW( clAmdFftSetup( &setupData ), "clAmdFftSetup failed" ); OPENCL_V_THROW( clAmdFftCreateDefaultPlan( &plHandle, context, dim, clLengths ), "clAmdFftCreateDefaultPlan failed" ); OPENCL_V_THROW (clAmdFftSetPlanBatchSize (plHandle, batchSize),"Setting BatchSize Failed"); OPENCL_V_THROW (clAmdFftSetResultLocation( plHandle, place ), "clAmdFftSetResultLocation failed" ); OPENCL_V_THROW (clAmdFftSetPlanInStride ( plHandle, dim, clStrides ), "clAmdFftSetPlanInStride failed" ); OPENCL_V_THROW (clAmdFftSetPlanOutStride ( plHandle, dim, clStrides ), "clAmdFftSetPlanOutStride failed" ); OPENCL_V_THROW (clAmdFftSetPlanDistance ( plHandle, clStrides[ dim ], clStrides[ dim ]), "clAmdFftSetPlanDistance failed" ); OPENCL_V_THROW( clAmdFftBakePlan( plHandle, 1, &queue, NULL, NULL ), "clAmdFftBakePlan failed" ); size_t tempbuffersize=0; OPENCL_V_THROW( clAmdFftGetTmpBufSize(plHandle, &tempbuffersize ), "clAmdFftGetTmpBufSize failed" ); //allocate the intermediate buffer cl_mem clMedBuffer=NULL; if (tempbuffersize) { cl_int medstatus; clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,tempbuffersize, 0, &medstatus); OPENCL_V_THROW( medstatus, "Creating fft intermediate Buffer failed" ); } if (( place == CLFFT_INPLACE )&& ( inLayout != outLayout )) { switch( inLayout ) { case CLFFT_COMPLEX_INTERLEAVED: { assert (CLFFT_COMPLEX_PLANAR == outLayout); throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" ); break; } case CLFFT_COMPLEX_PLANAR: { assert (CLFFT_COMPLEX_INTERLEAVED == outLayout); throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" ); break; } } } cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL : &clMemBuffersOut; //========Timimg fft============// double time_fft_start=omp_get_wtime(); for(int i=0;i<ITER_FFT;i++){ OPENCL_V_THROW( clAmdFftEnqueueTransform( plHandle, CLFFT_FORWARD, 1,&queue,0,NULL,&event,&clMemBuffersIn,BuffersOut,clMedBuffer ),"clAmdFftEnqueueTransform failed" ); } ret=clWaitForEvents(1,&event); double time_fft_end=omp_get_wtime(); //Cleaning up fft OPENCL_V_THROW( clAmdFftDestroyPlan( &plHandle ), "clAmdFftDestroyPlan failed" ); OPENCL_V_THROW( clAmdFftTeardown( ), "clAmdFftTeardown failed" ); //displaying results if(showFftOutput){ OPENCL_V_THROW( clEnqueueReadBuffer( queue, clMemBuffersOut, CL_TRUE, 0, sizeofgpumem,answer [0], 0, NULL, NULL ),"Reading the result buffer failed" ); cout << "**FFT Output**" << endl; for(int j=0;j<NO_INPUTS;j++){ for(int i=0;i<MEM_SIZE;i++){ cout << answer[j][i] << " "; } printf("\n"); }printf("\n"); } //==================End of FFT=============================================// //==================Start the Reshuffling==================================// FILE *fp; char fileName[]="./reshuffle.cl"; char* source_str=NULL; size_t source_size; //Load the source code containing the kernel/ fp = fopen(fileName, "r"); if (!fp) { fprintf(stderr, "Failed to load reshuffle kernel.Â¥n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); //Preparation for building the Kernel program = clCreateProgramWithSource(context, 1, (const char **)&source_str,(const size_t *)&source_size, &ret); OPENCL_V_THROW( ret, "Creating program with source failed for Reshuffle" ); OPENCL_V_THROW( clBuildProgram(program, 1, &device_id, NULL, NULL, NULL),"Build Program Failed for Reshuffle"); kernel = clCreateKernel(program, "reshuffle", &ret); OPENCL_V_THROW( ret, "Creating kernel failed for Reshuffle" ); //Set kernel parameters const int num=NO_INPUTS*MEM_SIZE; const int block=MEM_SIZE; OPENCL_V_THROW(clSetKernelArg(kernel, 0, sizeof(cl_mem), (float *)&clMemBuffersIn),"Passing argument 0 of reshuffle failed"); OPENCL_V_THROW(clSetKernelArg(kernel, 1, sizeof(cl_mem), (float *)&facs),"Passing arg 1 of reshuffle failed"); OPENCL_V_THROW(clSetKernelArg(kernel, 2, sizeof(cl_mem), (float *)&clMemBuffersOut),"Passing arg2 of reshuffle failed"); OPENCL_V_THROW(clSetKernelArg(kernel, 3, sizeof(int), (int *)&num),"Passing arg3 of reshuffle failed"); OPENCL_V_THROW(clSetKernelArg(kernel, 4, sizeof(int), (int *)&block),"Passing arg4 of reshuffle failed"); // Execute OpenCL Kernel // const size_t local_ws=NO_INPUTS*MEM_SIZE; const size_t global_ws=min(NO_THREAD_PER_BLOCK,MEM_SIZE);//ceil(MEM_SIZE/local_ws); //===========timing the reshuffle===============// double time_reshuffle_start=omp_get_wtime(); for(int i=0;i<ITER_FFT;i++){ OPENCL_V_THROW(clEnqueueNDRangeKernel(queue,kernel, 1, NULL,&local_ws,&global_ws, 0, NULL, NULL),"Reshuffle Kernel execution failed"); } double time_reshuffle_end=omp_get_wtime(); //Read back data OPENCL_V_THROW(clEnqueueReadBuffer(queue, clMemBuffersOut, CL_TRUE, 0, sizeofgpumem,answer[0], 0, NULL, NULL),"Reading back reshuffled data failed"); //====================Finish the reshuffling================================// if(showReshuffleOutput){ cout << "Output after reshuffling" << endl; for(int j=0;j<NO_INPUTS;j++){ for(int i=0;i<MEM_SIZE;i++){ cout << answer[j][i] << " "; } printf("\n"); }printf("\n"); } //=================Reformatting the input given to the matrix multiply===================================// float** answer_final=screate2darray(NO_INPUTS*2,MEM_SIZE/2); for(int i=0;i<NO_INPUTS;i++){ for(int j=0;j<MEM_SIZE;j++){ if(j&1) answer_final[(i<<1)+1][j >> 1]=answer[i][j]; else answer_final[(i<<1)][j >> 1]=answer[i][j]; } }
MainContext::MainContext(const cl_device_type_t dev_type) { if (clGetPlatformIDs(0, NULL, &plat_count) == CL_SUCCESS) { plat = new cl_platform_id_t[plat_count]; if (clGetPlatformIDs(plat_count, plat, NULL) == CL_SUCCESS) { dev_count = new cl_uint_t[plat_count]; dev = new cl_device_id_t*[plat_count]; context = new cl_context_t*[plat_count]; cmd = new cl_command_queue_t*[plat_count]; for (cl_uint_t p = 0; p < plat_count; p++) { if (clGetDeviceIDs(plat[p], dev_type, 0, NULL, &dev_count[p]) == CL_SUCCESS) { total_dev_count += dev_count[p]; dev[p] = new cl_device_id_t[dev_count[p]]; context[p] = new cl_context_t[dev_count[p]]; cmd[p] = new cl_command_queue_t[dev_count[p]]; if (clGetDeviceIDs(plat[p], dev_type, dev_count[p], dev[p], NULL) == CL_SUCCESS) { cl_context_properties_t properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties_t) plat[p], 0 }; for (cl_uint_t d = 0; d < dev_count[p]; d++) { cl_int_t err; context[p][d] = clCreateContext(properties, 1, &dev[p][d], NULL, NULL, &err); if (err == CL_SUCCESS) { cmd[p][d] = clCreateCommandQueue(context[p][d], dev[p][d], CL_QUEUE_PROFILING_ENABLE, &err); if (err != CL_SUCCESS) { throw 1; } } else { throw 2; } } } else { throw 3; } } else { throw 4; } } } else { throw 5; } } else { throw 6; } }
void create_context_on(const char *plat_name, const char*dev_name, cl_uint idx, cl_context *ctx, cl_command_queue *queue, int enable_profiling) { cl_uint plat_count; CALL_CL_GUARDED(clGetPlatformIDs, (0, NULL, &plat_count)); cl_platform_id *platforms = (cl_platform_id *) malloc(plat_count*sizeof(cl_platform_id)); CHECK_SYS_ERROR(!platforms, "allocating platform array"); CALL_CL_GUARDED(clGetPlatformIDs, (plat_count, platforms, NULL)); for (cl_uint i = 0; i < plat_count; ++i) { char buf[100]; CALL_CL_GUARDED(clGetPlatformInfo, (platforms[i], CL_PLATFORM_VENDOR, sizeof(buf), buf, NULL)); if (!plat_name || strstr(buf, plat_name)) { cl_uint dev_count; CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &dev_count)); cl_device_id *devices = (cl_device_id *) malloc(dev_count*sizeof(cl_device_id)); CHECK_SYS_ERROR(!devices, "allocating device array"); CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL, dev_count, devices, NULL)); for (cl_uint j = 0; j < dev_count; ++j) { char buf[100]; CALL_CL_GUARDED(clGetDeviceInfo, (devices[j], CL_DEVICE_NAME, sizeof(buf), buf, NULL)); if (!dev_name || strstr(buf, dev_name)) { if (idx == 0) { cl_platform_id plat = platforms[i]; cl_device_id dev = devices[j]; free(devices); free(platforms); cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) plat, 0 }; cl_int status; *ctx = clCreateContext( cps, 1, &dev, NULL, NULL, &status); CHECK_CL_ERROR(status, "clCreateContext"); cl_command_queue_properties qprops = 0; if (enable_profiling) qprops |= CL_QUEUE_PROFILING_ENABLE; *queue = clCreateCommandQueue(*ctx, dev, qprops, &status); CHECK_CL_ERROR(status, "clCreateCommandQueue"); return; } else --idx; } } free(devices); } } free(platforms); fputs("create_context_on: specified device not found.\n", stderr); abort(); }
static sc_status SetupOpenCLSessionForDeviceType( sc_session handle, sc_device_type device_type, sc_uint device_count) { sc_status status = 0; cl_int system_status = 0; cl_device_type system_device_type = 0; cl_uint system_device_count = 0; cl_uint system_platform_count = 0; cl_device_id *system_device_list = 0; cl_platform_id system_platform = 0; size_t return_size = 0; sc_session_t *session = (sc_session_t*)handle; system_status = clGetPlatformIDs(1, &system_platform, &system_platform_count); if (system_status != CL_SUCCESS || system_platform_count < 1) { scError(NULL, SC_INVALID_PLATFORM, "Failed to locate platform!\n"); return SC_INVALID_PLATFORM; } system_device_type = GetOpenCLDeviceType(device_type, &status); if (status != CL_SUCCESS) { scError(NULL, SC_INVALID_DEVICE_TYPE, "Invalid compute device type specified!\n"); return SC_INVALID_DEVICE_TYPE; } system_status = clGetDeviceIDs(system_platform, system_device_type, 0, NULL, &system_device_count); if (system_status != CL_SUCCESS || system_device_count < 1) { scError(NULL, SC_DEVICE_NOT_AVAILABLE, "Failed to locate compute device!\n"); return SC_DEVICE_NOT_AVAILABLE; } if(device_count) system_device_count = system_device_count > device_count ? device_count : system_device_count; system_device_list = scAllocate(NULL, sizeof(cl_device_id) * device_count); memset(system_device_list, 0, sizeof(cl_device_id) * device_count); system_status = clGetDeviceIDs(system_platform, system_device_type, system_device_count, system_device_list, &system_device_count); if (system_status != CL_SUCCESS) { scError(NULL, SC_DEVICE_NOT_AVAILABLE, "Failed to locate compute device!\n"); return SC_DEVICE_NOT_AVAILABLE; } session->context = clCreateContext(NULL, system_device_count, system_device_list, NotifyCallback, session, &system_status); if (!session->context) { scError(handle, SC_INVALID_CONTEXT, "Failed to create compute context!\n"); return SC_INVALID_CONTEXT; } session->platform = system_platform; system_status = clGetContextInfo(session->context, CL_CONTEXT_DEVICES, sizeof(cl_device_id) * system_device_count, system_device_list, &return_size); if(system_status != CL_SUCCESS || return_size < 1) { scError(handle, SC_INVALID_DEVICE, "Failed to retrieve compute devices for context!\n"); return SC_INVALID_DEVICE; } session->devices = system_device_list; session->units = system_device_count; session->queues = scAllocate(handle, sizeof(cl_command_queue) * session->units); if(!session->queues) { scError(handle, SC_OUT_OF_HOST_MEMORY, "Failed to allocate command queues!\n"); return SC_OUT_OF_HOST_MEMORY; } for(sc_uint i = 0; i < session->units; i++) { cl_char vendor_name[256] = {0}; cl_char device_name[256] = {0}; system_status = clGetDeviceInfo(session->devices[i], CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &return_size); system_status|= clGetDeviceInfo(session->devices[i], CL_DEVICE_NAME, sizeof(device_name), device_name, &return_size); if (system_status != CL_SUCCESS) { scError(NULL, SC_INVALID_DEVICE_INFO, "Failed to retrieve device info!\n"); return SC_INVALID_DEVICE_INFO; } scInfo(handle, "Adding device '%s' '%s' to compute session.\n", vendor_name, device_name); session->queues[i] = clCreateCommandQueue(session->context, session->devices[i], 0, &system_status); if (!session->queues[i]) { scError(handle, SC_INVALID_COMMAND_QUEUE, "Failed to create a command queue!\n"); return SC_INVALID_COMMAND_QUEUE; } } session->programs = scCreateMap(handle, SC_DEFAULT_MAP_SIZE); session->kernels = scCreateMap(handle, SC_DEFAULT_MAP_SIZE); session->mem = scCreateMap(handle, SC_DEFAULT_MAP_SIZE); session->valid = SC_TRUE; return SC_SUCCESS; }
int main(int argc, char** argv) { int err; // error code returned from api calls float data[DATA_SIZE]; // original data set given to device float results[DATA_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array // Fill our data set with random float values // int i = 0; unsigned int count = DATA_SIZE; for(i = 0; i < count; i++) data[i] = rand() / (float)RAND_MAX; // Connect to a compute device // int gpu = 1; err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "square", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation // input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } // Validate our results // correct = 0; for(i = 0; i < count; i++) { if(results[i] == data[i] * data[i]) correct++; } // Print a brief summary detailing the results // printf("Computed '%d/%d' correct values!\n", correct, count); // Shutdown and cleanup // clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
int main(int argc, char** argv) { /* OpenCL 1.1 data structures */ cl_platform_id* platforms; cl_program program; cl_context context; /* OpenCL 1.1 scalar data types */ cl_uint numOfPlatforms; cl_int error; /* Prepare an array of __cl_float4 via dynamic memory allocation This will map to the native vector type which is SSE / SSE2 / AVX on Intel-compatible processors. */ cl_float8* ud_in = (cl_float8*) malloc( sizeof(cl_float8) * DATA_SIZE); // input to device cl_float8* ud_out = (cl_float8*) malloc( sizeof(cl_float8) * DATA_SIZE); // output from device for( int i = 0; i < DATA_SIZE; ++i) { ud_in[i] = (cl_float8){(float)i,(float)i,(float)i,(float)i,(float)i,(float)i,(float)i,(float)i}; ud_out[i] = (cl_float8){(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f}; } /* Get the number of platforms Remember that for each vendor's SDK installed on the computer, the number of available platform also increased. */ error = clGetPlatformIDs(0, NULL, &numOfPlatforms); if(error != CL_SUCCESS ) { perror("Unable to find any OpenCL platforms"); exit(1); } platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms); printf("Number of OpenCL platforms found: %d\n", numOfPlatforms); error = clGetPlatformIDs(numOfPlatforms, platforms, NULL); if(error != CL_SUCCESS ) { perror("Unable to find any OpenCL platforms"); exit(1); } // Search for a CPU/GPU device through the installed platforms // Build a OpenCL program and do not run it. for(cl_uint i = 0; i < numOfPlatforms; i++ ) { cl_uint numOfDevices = 0; /* Determine how many devices are connected to your platform */ error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &numOfDevices); if (error != CL_SUCCESS ) { perror("Unable to obtain any OpenCL compliant device info"); exit(1); } cl_device_id* devices = (cl_device_id*) alloca(sizeof(cl_device_id) * numOfDevices); /* Load the information about your devices into the variable 'devices' */ error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numOfDevices, devices, NULL); if (error != CL_SUCCESS ) { perror("Unable to obtain any OpenCL compliant device info"); exit(1); } printf("Number of detected OpenCL devices: %d\n", numOfDevices); /* Create a context */ cl_context_properties ctx[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i], 0 }; context = clCreateContext(ctx, numOfDevices, devices, NULL, NULL, &error); if(error != CL_SUCCESS) { perror("Can't create a valid OpenCL context"); exit(1); } /* For each device, create a buffer and partition that data among the devices for compute! */ cl_mem inobj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * DATA_SIZE, ud_in, &error); if(error != CL_SUCCESS) { perror("Can't create a buffer"); exit(1); } int offset = 0; for(int i = 0; i < numOfDevices; ++i, ++offset ) { /* Load the two source files into temporary datastores */ const char *file_names[] = {"vectorization.cl"}; const int NUMBER_OF_FILES = 1; char* buffer[NUMBER_OF_FILES]; size_t sizes[NUMBER_OF_FILES]; loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes); /* Create the OpenCL program object */ program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error); if(error != CL_SUCCESS) { perror("Can't create the OpenCL program object"); exit(1); } /* Build OpenCL program object and dump the error message, if any */ char *program_log; size_t log_size; char* build_options = "-fbin-llvmir -fbin-amdil -fbin-exe"; error = clBuildProgram(program, 1, &devices[i], build_options, NULL, NULL); if(error != CL_SUCCESS) { // If there's an error whilst building the program, dump the log clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) malloc(log_size+1); program_log[log_size] = '\0'; clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("\n=== ERROR ===\n\n%s\n=============\n", program_log); free(program_log); exit(1); } /* Query the program as to how many kernels were detected */ cl_uint numOfKernels; error = clCreateKernelsInProgram(program, 0, NULL, &numOfKernels); if (error != CL_SUCCESS) { perror("Unable to retrieve kernel count from program"); exit(1); } cl_kernel* kernels = (cl_kernel*) alloca(sizeof(cl_kernel) * numOfKernels); error = clCreateKernelsInProgram(program, numOfKernels, kernels, NULL); /* Loop thru each kernel and execute on device */ for(cl_uint j = 0; j < numOfKernels; j++) { char kernelName[32]; cl_uint argCnt; clGetKernelInfo(kernels[j], CL_KERNEL_FUNCTION_NAME, sizeof(kernelName), kernelName, NULL); clGetKernelInfo(kernels[j], CL_KERNEL_NUM_ARGS, sizeof(argCnt), &argCnt, NULL); printf("Kernel name: %s with arity: %d\n", kernelName, argCnt); printf("About to create command queue and enqueue this kernel...\n"); /* Create a command queue */ cl_command_queue cQ = clCreateCommandQueue(context, devices[i], 0, &error); if (error != CL_SUCCESS) { perror("Unable to create command-queue"); exit(1); } /* Create a buffer and copy the data from the main buffer */ cl_mem outobj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float8) * DATA_SIZE, 0, &error); if (error != CL_SUCCESS) { perror("Unable to create sub-buffer object"); exit(1); } /* Let OpenCL know that the kernel is suppose to receive an argument */ error = clSetKernelArg(kernels[j], 0, sizeof(cl_mem), &inobj); error = clSetKernelArg(kernels[j], 1, sizeof(cl_mem), &outobj); if (error != CL_SUCCESS) { perror("Unable to set buffer object in kernel"); exit(1); } /* Enqueue the kernel to the command queue */ error = clEnqueueTask(cQ, kernels[j], 0, NULL, NULL); if (error != CL_SUCCESS) { perror("Unable to enqueue task to command-queue"); exit(1); } printf("Task has been enqueued successfully!\n"); /* Enqueue the read-back from device to host */ error = clEnqueueReadBuffer(cQ, outobj, CL_TRUE, // blocking read 0, // read from the start sizeof(cl_float8)*DATA_SIZE, // how much to copy ud_out, 0, NULL, NULL); /* Check the returned data */ if ( valuesOK(ud_in, ud_out, DATA_SIZE) ) { printf("Check passed!\n"); } else printf("Check failed!\n"); /* Release the command queue */ clReleaseCommandQueue(cQ); clReleaseMemObject(outobj); } /* Clean up */ for(cl_uint i = 0; i < numOfKernels; i++) { clReleaseKernel(kernels[i]); } for(int i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); } clReleaseProgram(program); }// end of device loop and execution clReleaseMemObject(inobj); clReleaseContext(context); }// end of platform loop free(ud_in); free(ud_out); }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); printResult(); /* Call clblas function. */ err = clblasSrot(N, bufX, 0, incx, bufY, 0, incy, C, S, 1, &queue, 0, NULL, &event); // printf("here\n"); if (err != CL_SUCCESS) { printf("clblasSrot() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); /* At this point you will get the result of SROT placed in vector Y. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
void initopencl(void) { int i; // Get Platform and Device Info CL_CHECK(clGetPlatformIDs(1, &platform_id, &num_platforms)); // Currently this program only runs on a SINGLE GPU. CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_devices)); printf("=== %d OpenCL platform(s) found: ===\n", num_platforms); printf("=== %d OpenCL device(s) found on platform:\n", num_devices); char buffer[10240]; cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- %d --\n", i); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf(" DEVICE_NAME = %s\n", buffer); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VENDOR = %s\n", buffer); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL)); printf(" DRIVER_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL)); printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); if (num_devices == 0) { fprintf(stderr, "No Devices found that can run OpenCL."); exit(0); } // Create OpenCL context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating context: Function returned %d \n\n", ret); exit(1); } // Create Command Queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating command Queue: Function returned %d \n\n", ret); exit(1); } // Load the kernel source code into the array source_str FILE *fp; char *source_str; size_t source_size; fp = fopen("integrate.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); // Create a program from the kernel source program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a program for integration3D. %d \n\n", (int)ret); exit(1); } // Build the program ret = clBuildProgram(program, 1, &device_id, "-DUSE_DOUBLE=1", NULL, NULL); if (ret != CL_SUCCESS) { size_t length; char buffer[10240]; clGetProgramBuildInfo(program, 1, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length); fprintf(stderr, "Error returned %d. \n\n", (int)ret); printf("Error Log: \n\n %s \n\n", buffer); exit(0); } /* // Create the OpenCL kernel (compute_points_Unstructure3D_1) kernel1 = clCreateKernel(program, "compute_points_Unstructure3D_1", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for compute_points_Unstructure3D_1. \n\n"); exit(1); } */ // Create the OpenCL kernel (check_int) kernel2 = clCreateKernel(program, "check_int", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for check_int. %d \n\n", (int)ret); exit(1); } // Create the OpenCL kernel (compute_points_Unstructure3D_1) kernel1 = clCreateKernel(program, "compute_points_Unstructure3D_1", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for compute_points_Unstructure3D_1. \n\n"); exit(1); } // Create the OpenCL kernel (initialize_timestep3D) kernel3 = clCreateKernel(program, "initialize_timestep3D", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for initialize_timestep3D. \n\n"); exit(1); } // Create the OpenCL kernel (initialize_timestep3D) kernel4 = clCreateKernel(program, "LocalSearch3D", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for LocalSearch3D. \n\n"); exit(1); } // Create the OpenCL kernel (initialize_timestep3D) kernel5 = clCreateKernel(program, "compute_points_Unstructure3D_2", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for LocalSearch3D. \n\n"); exit(1); } printf("\n\n"); }
void WorkScheduler::initialize(bool use_opencl, int num_cpu_threads) { /* initialize highlighting */ if (!g_highlightInitialized) { if (g_highlightedNodesRead) MEM_freeN(g_highlightedNodesRead); if (g_highlightedNodes) MEM_freeN(g_highlightedNodes); g_highlightedNodesRead = NULL; g_highlightedNodes = NULL; COM_startReadHighlights(); g_highlightInitialized = true; } #if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE /* deinitialize if number of threads doesn't match */ if (g_cpudevices.size() != num_cpu_threads) { Device *device; while (g_cpudevices.size() > 0) { device = g_cpudevices.back(); g_cpudevices.pop_back(); device->deinitialize(); delete device; } g_cpuInitialized = false; } /* initialize CPU threads */ if (!g_cpuInitialized) { for (int index = 0; index < num_cpu_threads; index++) { CPUDevice *device = new CPUDevice(); device->initialize(); g_cpudevices.push_back(device); } g_cpuInitialized = true; } #ifdef COM_OPENCL_ENABLED /* deinitialize OpenCL GPU's */ if (use_opencl && !g_openclInitialized) { g_context = NULL; g_program = NULL; if (clewInit() != CLEW_SUCCESS) /* this will check for errors and skip if already initialized */ return; if (clCreateContextFromType) { cl_uint numberOfPlatforms = 0; cl_int error; error = clGetPlatformIDs(0, 0, &numberOfPlatforms); if (error == -1001) { } /* GPU not supported */ else if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } if (G.f & G_DEBUG) printf("%u number of platforms\n", numberOfPlatforms); cl_platform_id *platforms = (cl_platform_id *)MEM_mallocN(sizeof(cl_platform_id) * numberOfPlatforms, __func__); error = clGetPlatformIDs(numberOfPlatforms, platforms, 0); unsigned int indexPlatform; for (indexPlatform = 0; indexPlatform < numberOfPlatforms; indexPlatform++) { cl_platform_id platform = platforms[indexPlatform]; cl_uint numberOfDevices = 0; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, 0, &numberOfDevices); if (numberOfDevices <= 0) continue; cl_device_id *cldevices = (cl_device_id *)MEM_mallocN(sizeof(cl_device_id) * numberOfDevices, __func__); clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numberOfDevices, cldevices, 0); g_context = clCreateContext(NULL, numberOfDevices, cldevices, clContextError, NULL, &error); if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } const char *cl_str[2] = {datatoc_COM_OpenCLKernels_cl, NULL}; g_program = clCreateProgramWithSource(g_context, 1, cl_str, 0, &error); error = clBuildProgram(g_program, numberOfDevices, cldevices, 0, 0, 0); if (error != CL_SUCCESS) { cl_int error2; size_t ret_val_size = 0; printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } char *build_log = (char *)MEM_mallocN(sizeof(char) * ret_val_size + 1, __func__); error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } build_log[ret_val_size] = '\0'; printf("%s", build_log); MEM_freeN(build_log); } else { unsigned int indexDevices; for (indexDevices = 0; indexDevices < numberOfDevices; indexDevices++) { cl_device_id device = cldevices[indexDevices]; cl_int vendorID = 0; cl_int error2 = clGetDeviceInfo(device, CL_DEVICE_VENDOR_ID, sizeof(cl_int), &vendorID, NULL); if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error2, clewErrorString(error2)); } OpenCLDevice *clDevice = new OpenCLDevice(g_context, device, g_program, vendorID); clDevice->initialize(); g_gpudevices.push_back(clDevice); } } MEM_freeN(cldevices); } MEM_freeN(platforms); } g_openclInitialized = true; } #endif #endif }
int main() { srand(unsigned(time(nullptr))); int err; // error code returned from api calls cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel // OpenCL device memory for matrices cl_mem d_A; cl_mem d_B; cl_mem d_C; // set seed for rand() srand(2014); //Allocate host memory for matrices A and B unsigned int size_A = WA * HA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A = (float*)malloc(mem_size_A); unsigned int size_B = WB * HB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B = (float*)malloc(mem_size_B); //Initialize host memory randomMemInit(h_A, size_A); randomMemInit(h_B, size_B); //Allocate host memory for the result C unsigned int size_C = WC * HC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*)malloc(mem_size_C); printf("Initializing OpenCL device...\n"); cl_uint dev_cnt = 0; clGetPlatformIDs(0, 0, &dev_cnt); cl_platform_id platform_ids[100]; clGetPlatformIDs(dev_cnt, platform_ids, NULL); // Connect to a compute device int gpu = 1; err = clGetDeviceIDs(platform_ids[0], gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS){ printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context){ printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands){ printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source file char *KernelSource; long lFileSize = LoadOpenCLKernel("matrixmul_kernel.cl", &KernelSource); if (lFileSize < 0L){ perror("File read failed"); return 1; } //const char* KernelSource = loadKernelCPP(".\\matrixmul_kernel.cl"); program = clCreateProgramWithSource(context, 1, (const char **)&KernelSource, NULL, &err); if (!program){ printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS){ size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run kernel = clCreateKernel(program, "matrixMul", &err); if (!kernel || err != CL_SUCCESS){ printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation d_C = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_A, NULL, &err); d_A = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &err); d_B = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &err); if (!d_A || !d_B || !d_C){ printf("Error: Failed to allocate device memory!\n"); exit(1); } printf("Running matrix multiplication for matrices A (%dx%d) and B (%dx%d) ...\n", WA, HA, WB, HB); //Launch OpenCL kernel size_t localWorkSize[2], globalWorkSize[2]; int wA = WA; int wC = WC; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_C); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_A); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_B); err |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&wA); err |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&wC); if (err != CL_SUCCESS){ printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } localWorkSize[0] = 16; localWorkSize[1] = 16; globalWorkSize[0] = 1024; globalWorkSize[1] = 1024; err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); if (err != CL_SUCCESS){ printf("Error: Failed to execute kernel! %d\n", err); exit(1); } //Retrieve result from device err = clEnqueueReadBuffer(commands, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL); if (err != CL_SUCCESS){ printf("Error: Failed to read output array! %d\n", err); exit(1); } //print table A printf("\nMatrix A\n"); for (int i = 0; i < size_A; i++){ printf("%f\t", h_A[i]); if (((i + 1) % WA) == 0) printf("\n"); } //print table B printf("\nMatrix B\n"); for (int i = 0; i < size_B; i++){ printf("%f\t", h_B[i]); if (((i + 1) % WB) == 0) printf("\n"); } //print out the results printf("\nMatrix C (Results)\n"); for (int i = 0; i < size_C; i++){ printf("%f\t", h_C[i]); if (((i + 1) % WC) == 0) printf("\n"); } printf("\n"); printf("Matrix multiplication completed...\n"); //Shutdown and cleanup free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_A); clReleaseMemObject(d_C); clReleaseMemObject(d_B); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); std::cin.clear(); std::cin.sync(); std::cin.get(); }
int main(void) { //time meassuring struct timeval tvs; //variables int Nx=1024; int Ny=1024; int plotnum=0; int Tmax=2; int plottime=0; int plotgap=1; double Lx=1.0; double Ly=1.0; double dt=0.0; double A=0.0; double B=0.0; double Du=0.0; double Dv=0.0; //splitting coefficients double a=0.5; double b=0.5; double c=1.0; //loop counters int i=0; int j=0; int n=0; double*umax=NULL; double*vmax=NULL; parainit(&Nx,&Ny,&Tmax,&plotgap,&Lx,&Ly,&dt,&Du,&Dv,&A,&B); plottime=plotgap; vmax=(double*)malloc((Tmax/plotgap+1)*sizeof(double)); umax=(double*)malloc((Tmax/plotgap+1)*sizeof(double)); //openCL variables cl_platform_id *platform_id = NULL; cl_kernel frequencies = NULL, initialdata = NULL, linearpart=NULL; cl_kernel nonlinearpart_a=NULL, nonlinearpart_b=NULL; cl_int ret; cl_uint num_platforms; // Detect how many platforms there are. ret = clGetPlatformIDs(0, NULL, &num_platforms); // Allocate enough space for the number of platforms. platform_id = (cl_platform_id*) malloc(num_platforms*sizeof(cl_platform_id)); // Store the platforms ret = clGetPlatformIDs(num_platforms, platform_id, NULL); printf("Found %d platform(s)!\n",num_platforms); cl_uint *num_devices; num_devices=(cl_uint*) malloc(num_platforms*sizeof(cl_uint)); cl_device_id **device_id = NULL; device_id =(cl_device_id**) malloc(num_platforms*sizeof(cl_device_id*)); // Detect number of devices in the platforms for(i=0;i<num_platforms;i++){ char buf[65536]; size_t size; ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_VERSION,sizeof(buf),buf,&size); printf("%s\n",buf); ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,0,NULL,num_devices); printf("Found %d device(s) on platform %d!\n", num_devices[i],i); ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_NAME,sizeof(buf),buf,&size); printf("%s ",buf); // Store numDevices from platform device_id[i]=(cl_device_id*) malloc(num_devices[i]*sizeof(device_id)); ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,num_devices[i],device_id[i],NULL); for(j=0;j<num_devices[i];j++){ ret = clGetDeviceInfo(device_id[i][j],CL_DEVICE_NAME,sizeof(buf),buf,&size); printf("%s (%d,%d)\n",buf,i,j); } } //create context and command_queue cl_context context = NULL; cl_command_queue command_queue = NULL; //Which platform and device do i choose? int chooseplatform=0; int choosedevice=0; printf("Choose platform %d and device %d!\n",chooseplatform,choosedevice); context = clCreateContext( NULL, num_devices[chooseplatform], device_id[chooseplatform], NULL, NULL, &ret); if(ret!=CL_SUCCESS){printf("createContext ret:%d\n",ret); exit(1); } command_queue = clCreateCommandQueue(context, device_id[chooseplatform][choosedevice], 0, &ret); if(ret!=CL_SUCCESS){printf("createCommandQueue ret:%d\n",ret); exit(1); } //OpenCL arrays cl_mem cl_u = NULL,cl_v = NULL; cl_mem cl_uhat = NULL, cl_vhat = NULL; cl_mem cl_kx = NULL, cl_ky = NULL; //FFT clfftPlanHandle planHandle; cl_mem tmpBuffer = NULL; fftinit(&planHandle,&context, &command_queue, &tmpBuffer, Nx, Ny); //allocate gpu memory/ cl_u=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret); cl_v=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret); cl_uhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret); cl_vhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret); cl_kx = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(double), NULL, &ret); cl_ky = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(double), NULL, &ret); printf("allocated space\n"); //load the kernels loadKernel(&frequencies,&context,&device_id[chooseplatform][choosedevice],"frequencies"); loadKernel(&initialdata,&context,&device_id[chooseplatform][choosedevice],"initialdata"); loadKernel(&linearpart,&context,&device_id[chooseplatform][choosedevice],"linearpart"); loadKernel(&nonlinearpart_a,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_a"); loadKernel(&nonlinearpart_b,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_b"); size_t global_work_size[1] = {Nx*Ny}; size_t global_work_size_X[1] = {Nx}; size_t global_work_size_Y[1] = {Ny}; //frequencies ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_kx); ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Lx); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Nx); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_X, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_ky); ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Ly); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Ny); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_Y, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //printCL(&cl_kx,&command_queue,Nx,1); //printCL(&cl_ky,&command_queue,1,Ny); //inintial data ret = clSetKernelArg(initialdata, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(initialdata, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(initialdata, 2, sizeof(int),(void* )&Nx); ret = clSetKernelArg(initialdata, 3, sizeof(int),(void* )&Ny); ret = clSetKernelArg(initialdata, 4, sizeof(double),(void* )&Lx); ret = clSetKernelArg(initialdata, 5, sizeof(double),(void* )&Ly); ret = clEnqueueNDRangeKernel(command_queue, initialdata, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //make output writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u"); writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v"); umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u"); vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v"); printf("Got initial data, starting timestepping\n"); mtime_s(&tvs); for(n=0;n<=Tmax;n++){ //nonlinearpart_a ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(nonlinearpart_a, 2, sizeof(double),(void* )&A); ret = clSetKernelArg(nonlinearpart_a, 3, sizeof(double),(void* )&dt); ret = clSetKernelArg(nonlinearpart_a, 4, sizeof(double),(void* )&a); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //nonlinearpart_b ret = clSetKernelArg(nonlinearpart_b, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(nonlinearpart_b, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(nonlinearpart_b, 2, sizeof(double),(void* )&A); ret = clSetKernelArg(nonlinearpart_b, 3, sizeof(double),(void* )&dt); ret = clSetKernelArg(nonlinearpart_b, 4, sizeof(double),(void* )&b); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //linear fft2dfor(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer); fft2dfor(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer); //printf("A%f,B%f\n",A,B); ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat); ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_vhat); ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void* )&cl_kx); ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void* )&cl_ky); ret = clSetKernelArg(linearpart, 4, sizeof(double),(void* )&Du); ret = clSetKernelArg(linearpart, 5, sizeof(double),(void* )&Dv); ret = clSetKernelArg(linearpart, 6, sizeof(double),(void* )&A); ret = clSetKernelArg(linearpart, 7, sizeof(double),(void* )&B); ret = clSetKernelArg(linearpart, 8, sizeof(double),(void* )&dt); ret = clSetKernelArg(linearpart, 9, sizeof(double),(void* )&c); ret = clSetKernelArg(linearpart, 10, sizeof(int),(void* )&Nx); ret = clSetKernelArg(linearpart, 11, sizeof(int),(void* )&Ny); ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); fft2dback(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer); fft2dback(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer); //nonlinearpart_b ret = clSetKernelArg(nonlinearpart_b, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(nonlinearpart_b, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(nonlinearpart_b, 2, sizeof(double),(void* )&A); ret = clSetKernelArg(nonlinearpart_b, 3, sizeof(double),(void* )&dt); ret = clSetKernelArg(nonlinearpart_b, 4, sizeof(double),(void* )&b); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //nonlinearpart_a ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(nonlinearpart_a, 2, sizeof(double),(void* )&A); ret = clSetKernelArg(nonlinearpart_a, 3, sizeof(double),(void* )&dt); ret = clSetKernelArg(nonlinearpart_a, 4, sizeof(double),(void* )&a); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); // done if(n==plottime){ printf("time:%f, step:%d,%d,umax:%f,vmax:%f\n",n*dt,n,plotnum,umax[plotnum],vmax[plotnum]); plottime=plottime+plotgap; plotnum=plotnum+1; writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u"); writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v"); umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u"); vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v"); } }//end timestepping printf("Finished time stepping\n"); mtime_e(&tvs,"Programm took:"); writearray(umax,(Tmax/plotgap)+1,"u"); writearray(vmax,(Tmax/plotgap)+1,"v"); free(umax); free(vmax); clReleaseMemObject(cl_u); clReleaseMemObject(cl_v); clReleaseMemObject(cl_uhat); clReleaseMemObject(cl_vhat); clReleaseMemObject(cl_kx); clReleaseMemObject(cl_ky); ret = clReleaseKernel(initialdata); ret = clReleaseKernel(frequencies); ret = clReleaseKernel(linearpart); ret = clReleaseKernel(nonlinearpart_a); ret = clReleaseKernel(nonlinearpart_b); fftdestroy(&planHandle, &tmpBuffer); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); for(i=0;i<num_platforms;i++){free(device_id[i]);} free(device_id); free(platform_id); free(num_devices); printf("Program execution complete\n"); return 0; }
int main() { char buf[]="Hello, World!"; size_t srcsize, worksize=strlen(buf); cl_int error; cl_platform_id platform; cl_device_id device; cl_uint platforms, devices; // Fetch the Platform and Device IDs; we only want one. error=clGetPlatformIDs(1, &platform, &platforms); error=clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, &devices); cl_context_properties properties[]={ CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; // Note that nVidia's OpenCL requires the platform property cl_context context=clCreateContext(properties, 1, &device, NULL, NULL, &error); cl_command_queue cq = clCreateCommandQueue(context, device, 0, &error); rot13(buf); // scramble using the CPU puts(buf); // Just to demonstrate the plaintext is destroyed //char src[8192]; //FILE *fil=fopen("rot13.cl","r"); //srcsize=fread(src, sizeof src, 1, fil); //fclose(fil); const char *src=rot13_cl; srcsize=strlen(rot13_cl); const char *srcptr[]={src}; // Submit the source code of the rot13 kernel to OpenCL cl_program prog=clCreateProgramWithSource(context, 1, srcptr, &srcsize, &error); // and compile it (after this we could extract the compiled version) error=clBuildProgram(prog, 0, NULL, "", NULL, NULL); // Allocate memory for the kernel to work with cl_mem mem1, mem2; mem1=clCreateBuffer(context, CL_MEM_READ_ONLY, worksize, NULL, &error); mem2=clCreateBuffer(context, CL_MEM_WRITE_ONLY, worksize, NULL, &error); // get a handle and map parameters for the kernel cl_kernel k_rot13=clCreateKernel(prog, "rot13", &error); clSetKernelArg(k_rot13, 0, sizeof(mem1), &mem1); clSetKernelArg(k_rot13, 1, sizeof(mem2), &mem2); // Target buffer just so we show we got the data from OpenCL char buf2[sizeof buf]; buf2[0]='?'; buf2[worksize]=0; // Send input data to OpenCL (async, don't alter the buffer!) error=clEnqueueWriteBuffer(cq, mem1, CL_FALSE, 0, worksize, buf, 0, NULL, NULL); // Perform the operation error=clEnqueueNDRangeKernel(cq, k_rot13, 1, NULL, &worksize, &worksize, 0, NULL, NULL); // Read the result back into buf2 error=clEnqueueReadBuffer(cq, mem2, CL_FALSE, 0, worksize, buf2, 0, NULL, NULL); // Await completion of all the above error=clFinish(cq); // Finally, output out happy message. puts(buf2); }
int main( int argc, char* argv[] ) { // Length of vectors unsigned int n = 100000; // Host input vectors double *h_a; double *h_b; // Host output vector double *h_c; // Device input buffers cl_mem d_a; cl_mem d_b; // Device output buffer cl_mem d_c; cl_platform_id cpPlatform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel kernel; // kernel // Size, in bytes, of each vector size_t bytes = n*sizeof(double); // Allocate memory for each vector on host h_a = (double*)malloc(bytes); h_b = (double*)malloc(bytes); h_c = (double*)malloc(bytes); // Initialize vectors on host int i; for( i = 0; i < n; i++ ) { h_a[i] = sinf(i)*sinf(i); h_b[i] = cosf(i)*cosf(i); } size_t globalSize, localSize; cl_int err; // Number of work items in each local work group localSize = 64; // Number of total work items - localSize must be devisor globalSize = ceil(n/(float)localSize)*localSize; // Bind to platform err = clGetPlatformIDs(1, &cpPlatform, NULL); // Get ID for the device err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); // Create a command queue queue = clCreateCommandQueue(context, device_id, 0, &err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelSource, NULL, &err); // Build the program executable clBuildProgram(program, 0, NULL, NULL, NULL, NULL); // Create the compute kernel in the program we wish to run kernel = clCreateKernel(program, "vecAdd", &err); // Create the input and output arrays in device memory for our calculation d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL); // Write our data set into the input array in device memory err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, bytes, h_a, 0, NULL, NULL); err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0, bytes, h_b, 0, NULL, NULL); // Set the arguments to our compute kernel err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c); err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n); // Execute the kernel over the entire range of the data set err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); // Wait for the command queue to get serviced before reading back results clFinish(queue); // Read the results from the device clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0, bytes, h_c, 0, NULL, NULL ); //Sum up vector c and print result divided by n, this should equal 1 within error double sum = 0; for(i=0; i<n; i++) sum += h_c[i]; printf("final result: %f\n", sum/n); // release OpenCL resources clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseContext(context); //release host memory free(h_a); free(h_b); free(h_c); return 0; }
/** * @brief Creates an array of objects containing the OpenCL variables of each device * @param trDataBase The training database which will contain the instances and the features * @param selInstances The instances choosen as initial centroids * @param transposedTrDataBase The training database already transposed * @param conf The structure with all configuration parameters * @return A pointer containing the objects */ CLDevice *createDevices(const float *const trDataBase, const int *const selInstances, const float *const transposedTrDataBase, Config *const conf) { /********** Find the OpenCL devices specified in configuration ***********/ // OpenCL variables cl_uint numPlatformsDevices; cl_device_type deviceType; cl_program program; cl_kernel kernel; cl_int status; // Others variables auto allDevices = getAllDevices(); CLDevice *devices = new CLDevice[conf -> nDevices + (conf -> ompThreads > 0)]; for (int dev = 0; dev < conf -> nDevices; ++dev) { bool found = false; for (int allDev = 0; allDev < allDevices.size() && !found; ++allDev) { // Get the specified OpenCL device char dbuff[120]; check(clGetDeviceInfo(allDevices[allDev], CL_DEVICE_NAME, sizeof(dbuff), dbuff, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_NAME); // If the device exists... if (conf -> devices[dev] == dbuff) { devices[dev].device = allDevices[allDev]; devices[dev].deviceName = dbuff; check(clGetDeviceInfo(devices[dev].device, CL_DEVICE_TYPE, sizeof(cl_device_type), &(devices[dev].deviceType), NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_TYPE); /********** Device local memory usage ***********/ long int usedMemory = conf -> nFeatures * sizeof(cl_uchar); // Chromosome of the individual usedMemory += conf -> trNInstances * sizeof(cl_uchar); // Mapping buffer usedMemory += conf -> K * conf -> nFeatures * sizeof(cl_float); // Centroids buffer usedMemory += conf -> trNInstances * sizeof(cl_float); // DistCentroids buffer usedMemory += conf -> K * sizeof(cl_int); // Samples_in_k buffer // Get the maximum local memory size long int maxMemory; check(clGetDeviceInfo(devices[dev].device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(long int), &maxMemory, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_MAXMEM); // Avoid exceeding the maximum local memory available. 1024 bytes of margin check(usedMemory > maxMemory - 1024, "%s:\n\tMax memory: %ld bytes\n\tAllow memory: %ld bytes\n\tUsed memory: %ld bytes\n", CL_ERROR_DEVICE_LOCALMEM, maxMemory, maxMemory - 1024, usedMemory); /********** Create context ***********/ devices[dev].context = clCreateContext(NULL, 1, &(devices[dev].device), 0, 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_CONTEXT); /********** Create Command queue ***********/ devices[dev].commandQueue = clCreateCommandQueue(devices[dev].context, devices[dev].device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_QUEUE); /********** Create kernel ***********/ // Open the file containing the kernels std::fstream kernels(conf -> kernelsFileName.c_str(), std::fstream::in); check(!kernels.is_open(), "%s\n", CL_ERROR_FILE_OPEN); // Obtain the size kernels.seekg(0, kernels.end); size_t fSize = kernels.tellg(); kernels.seekg(0, kernels.beg); char *kernelSource = new char[fSize]; kernels.read(kernelSource, fSize); kernels.close(); // Create program program = clCreateProgramWithSource(devices[dev].context, 1, (const char **) &kernelSource, &fSize, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_PROGRAM_BUILD); // Build program for the device in the context char buildOptions[196]; sprintf(buildOptions, "-I include -D N_INSTANCES=%d -D N_FEATURES=%d -D N_OBJECTIVES=%d -D K=%d -D MAX_ITER_KMEANS=%d", conf -> trNInstances, conf -> nFeatures, conf -> nObjectives, conf -> K, conf -> maxIterKmeans); if (clBuildProgram(program, 1, &(devices[dev].device), buildOptions, 0, 0) != CL_SUCCESS) { char buffer[4096]; fprintf(stderr, "Error: Could not build the program\n"); check(clGetProgramBuildInfo(program, devices[dev].device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_PROGRAM_ERRORS); check(true, "%s\n", buffer); } // Create kernel const char *kernelName = (devices[dev].deviceType == CL_DEVICE_TYPE_GPU) ? "kmeansGPU" : ""; devices[dev].kernel = clCreateKernel(program, kernelName, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_BUILD); /******* Work-items *******/ devices[dev].computeUnits = atoi(conf -> computeUnits[dev].c_str()); devices[dev].wiLocal = atoi(conf -> wiLocal[dev].c_str()); devices[dev].wiGlobal = devices[dev].computeUnits * devices[dev].wiLocal; /******* Create and write the databases and centroids buffers. Create the subpopulations buffer. Set kernel arguments *******/ // Create buffers devices[dev].objSubpopulations = clCreateBuffer(devices[dev].context, CL_MEM_READ_WRITE, conf -> familySize * sizeof(Individual), 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_SUBPOPS); devices[dev].objTrDataBase = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_TRDB); devices[dev].objTransposedTrDataBase = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_TTRDB); devices[dev].objSelInstances = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> K * sizeof(cl_int), 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_CENTROIDS); // Sets kernel arguments check(clSetKernelArg(devices[dev].kernel, 0, sizeof(cl_mem), (void *)&(devices[dev].objSubpopulations)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT1); check(clSetKernelArg(devices[dev].kernel, 1, sizeof(cl_mem), (void *)&(devices[dev].objSelInstances)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT2); check(clSetKernelArg(devices[dev].kernel, 2, sizeof(cl_mem), (void *)&(devices[dev].objTrDataBase)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT3); check(clSetKernelArg(devices[dev].kernel, 5, sizeof(cl_mem), (void *)&(devices[dev].objTransposedTrDataBase)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT6); // Write buffers check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objTrDataBase, CL_FALSE, 0, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), trDataBase, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_TRDB); check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objSelInstances, CL_FALSE, 0, conf -> K * sizeof(cl_int), selInstances, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_CENTROIDS); check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objTransposedTrDataBase, CL_FALSE, 0, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), transposedTrDataBase, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_TTRDB); // Resources used are released delete[] kernelSource; clReleaseProgram(program); found = true; allDevices.erase(allDevices.begin() + allDev); } } check(!found, "%s\n", CL_ERROR_DEVICE_FOUND); } /********** Add the CPU if has been enabled in configuration ***********/ if (conf -> ompThreads > 0) { devices[conf -> nDevices].deviceType = CL_DEVICE_TYPE_CPU; devices[conf -> nDevices].computeUnits = conf -> ompThreads; ++(conf -> nDevices); } return devices; }
/** * @brief Create a new OpenCL zone, which will contain complete information for an OpenCL execution session on a specific device. * * @param deviceType Device type (OpenCL bitfield). * @param numQueues Number of command queues. * @param queueProperties Properties for the command queues. * @param devSel Pointer to function which will select device, if more than one is available. * @param dsExtraArg Extra argument for (*deviceSelector) function. * @param err Error structure, to be populated if an error occurs. * @return OpenCL zone or NULL if device wasn't properly initialized. */ CLUZone* clu_zone_new(cl_uint deviceType, cl_uint numQueues, cl_int queueProperties, clu_device_selector devSel, void* dsExtraArg, GError **err) { /* OpenCL status variable. */ cl_int status; /* OpenCL zone to initialize and return */ CLUZone* zone; /* Information about devices */ CLUDeviceInfo devInfos[CLU_MAX_DEVICES_TOTAL]; /* Number of devices. */ cl_uint numDevices; /* Index of device information */ cl_int deviceInfoIndex; /* Context properties, */ cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, 0, 0}; /* List of platform Ids. */ cl_platform_id platfIds[CLU_MAX_PLATFORMS]; /* Number of platforms. */ cl_uint numPlatforms; /* Total number of devices. */ unsigned int totalNumDevices; /* Device IDs for a given platform. */ cl_device_id devIds[CLU_MAX_DEVICES_PER_PLATFORM]; /* Initialize zone */ zone = (CLUZone*) malloc(sizeof(CLUZone)); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, NULL == zone, CLU_ERROR_NOALLOC, error_handler, "Unable to allocate memory for OpenCL zone" ); zone->context = NULL; zone->queues = NULL; zone->program = NULL; zone->device_info.device_id = NULL; zone->device_info.platform_id = NULL; zone->device_info.device_name[0] = '\0'; zone->device_info.device_vendor[0] = '\0'; zone->device_info.platform_name[0] = '\0'; /* Get number of platforms */ status = clGetPlatformIDs(0, NULL, &numPlatforms); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: get number of platforms (OpenCL error %d: %s).", status, clerror_get(status)); /* Get existing platforms */ status = clGetPlatformIDs(numPlatforms, platfIds, NULL); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: get platform Ids (OpenCL error %d: %s).", status, clerror_get(status)); /* Cycle through platforms, get specified devices in existing platforms */ totalNumDevices = 0; for(unsigned int i = 0; i < numPlatforms; i++) { /* Get specified devices for current platform */ status = clGetDeviceIDs( platfIds[i], deviceType, CLU_MAX_DEVICES_PER_PLATFORM, devIds, &numDevices); if (status != CL_DEVICE_NOT_FOUND) { /* At least one device found, lets take note */ gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: get device Ids (OpenCL error %d: %s).", status, clerror_get(status)); for (unsigned int j = 0; j < numDevices; j++) { /* Keep device and platform IDs. */ devInfos[totalNumDevices].device_id = devIds[j]; devInfos[totalNumDevices].platform_id = platfIds[i]; /* Get device name. */ status = clGetDeviceInfo( devIds[j], CL_DEVICE_NAME, sizeof(devInfos[totalNumDevices].device_name), devInfos[totalNumDevices].device_name, NULL); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: get device name info (OpenCL error %d: %s).", status, clerror_get(status)); /* Get device vendor. */ status = clGetDeviceInfo( devIds[j], CL_DEVICE_VENDOR, sizeof(devInfos[totalNumDevices].device_vendor), devInfos[totalNumDevices].device_vendor, NULL); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: get device vendor info (OpenCL error %d: %s).", status, clerror_get(status)); /* Get platform name. */ status = clGetPlatformInfo( platfIds[i], CL_PLATFORM_VENDOR, sizeof(devInfos[totalNumDevices].platform_name), devInfos[totalNumDevices].platform_name, NULL); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: get platform info (OpenCL error %d: %s).", status, clerror_get(status)); /* Increment total number of found devices. */ totalNumDevices++; } } } /* Check whether any devices of the specified type were found */ if (totalNumDevices == 0) { /* No devices of the specified type where found, return with error. */ gef_if_error_create_goto( *err, CLU_UTILS_ERROR, 1, CLU_ERROR_DEVICE_NOT_FOUND, error_handler, "clu_zone_new: device not found."); } else { /* Several compatible devices found, choose one with given selector function. */ deviceInfoIndex = devSel(devInfos, totalNumDevices, dsExtraArg); /* Test return value of selector function (if it is out of range, * there is a programming error). */ g_assert_cmpint(deviceInfoIndex, >=, -1); g_assert_cmpint(deviceInfoIndex, <, totalNumDevices); /* If selector function returned -1, then no device is selectable. */ if (deviceInfoIndex == -1) { gef_if_error_create_goto( *err, CLU_UTILS_ERROR, 1, CLU_ERROR_DEVICE_NOT_FOUND, error_handler, "clu_zone_new: specified device not found."); } } /* Store info about the selected device and platform. */ zone->device_type = deviceType; zone->device_info = devInfos[deviceInfoIndex]; /* Determine number of compute units for that device */ status = clGetDeviceInfo( zone->device_info.device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &zone->cu, NULL); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: get target device info (OpenCL error %d: %s).", status, clerror_get(status)); /* Create a context on that device. */ cps[1] = (cl_context_properties) devInfos[deviceInfoIndex].platform_id; zone->context = clCreateContext(cps, 1, &zone->device_info.device_id, NULL, NULL, &status); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: creating context (OpenCL error %d: %s).", status, clerror_get(status)); /* Create the specified command queues on that device */ zone->numQueues = numQueues; zone->queues = (cl_command_queue*) malloc(numQueues * sizeof(cl_command_queue)); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, NULL == zone->queues, CLU_ERROR_NOALLOC, error_handler, "Unable to allocate memory to keep OpenCL command queues in Zone." ); for (unsigned int i = 0; i < numQueues; i++) { zone->queues[i] = clCreateCommandQueue( zone->context, zone->device_info.device_id, queueProperties, &status); gef_if_error_create_goto( *err, CLU_UTILS_ERROR, CL_SUCCESS != status, CLU_OCL_ERROR, error_handler, "clu_zone_new: creating command queue (OpenCL error %d: %s).", status, clerror_get(status)); } /* If we got here, everything is OK. */ g_assert (err == NULL || *err == NULL); goto finish; error_handler: /* If we got here there was an error, verify that it is so. */ g_assert (err == NULL || *err != NULL); /* Free OpenCL zone. */ if (zone != NULL) { clu_zone_free(zone); zone = NULL; } finish: /* Return. */ return zone; }
OpenCLDevice(DeviceInfo& info, Stats &stats, bool background_) : Device(stats) { background = background_; cpPlatform = NULL; cxContext = NULL; cqCommandQueue = NULL; cpProgram = NULL; ckPathTraceKernel = NULL; ckFilmConvertKernel = NULL; null_mem = 0; device_initialized = false; /* setup platform */ cl_uint num_platforms; ciErr = clGetPlatformIDs(0, NULL, &num_platforms); if(opencl_error(ciErr)) return; if(num_platforms == 0) { opencl_error("OpenCL: no platforms found."); return; } ciErr = clGetPlatformIDs(1, &cpPlatform, NULL); if(opencl_error(ciErr)) return; char name[256]; clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), &name, NULL); platform_name = name; /* get devices */ vector<cl_device_id> device_ids; cl_uint num_devices; if(opencl_error(clGetDeviceIDs(cpPlatform, opencl_device_type(), 0, NULL, &num_devices))) return; if(info.num > num_devices) { if(num_devices == 0) opencl_error("OpenCL: no devices found."); else opencl_error("OpenCL: specified device not found."); return; } device_ids.resize(num_devices); if(opencl_error(clGetDeviceIDs(cpPlatform, opencl_device_type(), num_devices, &device_ids[0], NULL))) return; cdDevice = device_ids[info.num]; /* create context */ cxContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr); if(opencl_error(ciErr)) return; cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); if(opencl_error(ciErr)) return; null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr); device_initialized = true; }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program source */ unsigned char *source_code; size_t source_length; /* Read program from 'clz_uint16.cl' */ source_code = read_buffer("clz_uint16.cl", &source_length); /* Create a program */ cl_program program; program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } printf("program=%p\n", program); /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object */ cl_kernel kernel; kernel = clCreateKernel(program, "clz_uint16", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_uint16 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_uint16)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_uint16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_uint16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_uint16), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_uint16 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_uint16)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_uint16)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_uint16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_uint16), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_uint16)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
int32_t init_kernel_platform() { cl_uint plat_num; cl_platform_id plat_id = NULL; cl_uint dev_num = 0; cl_device_id *devices; ret = clGetPlatformIDs(0, NULL, &plat_num); if (ret < 0) { LOGD("MU1 Error: Getting plat_ids!\n"); return -1; } if(plat_num > 0) { cl_platform_id* plat_ids = (cl_platform_id* )malloc(plat_num* sizeof(cl_platform_id)); ret = clGetPlatformIDs(plat_num, plat_ids, NULL); plat_id = plat_ids[0]; free(plat_ids); } ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_GPU, 0, NULL, &dev_num); if (dev_num == 0) { LOGD("MU1: No GPU device available.\n"); LOGD("MU1: Choose CPU as default device.\n"); ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_CPU, 0, NULL, &dev_num); devices = (cl_device_id*)malloc(dev_num * sizeof(cl_device_id)); ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_CPU, dev_num, devices, NULL); } else { LOGD("MU1: Choose GPU as default device. dev_num %d\n", dev_num); devices = (cl_device_id*)malloc(dev_num * sizeof(cl_device_id)); ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_GPU, dev_num, devices, NULL); } context = clCreateContext(NULL,1, devices,NULL,NULL,NULL); commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL); char filename[] = "/data/mu1_kernel.cl"; char file_context[10*1024]={0}; const char *source = &file_context[0]; ret = read_cl(filename, &file_context[0]); size_t sourceSize[10] = {strlen(source)}; cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize[0], NULL); ret = clBuildProgram(program, 1, devices, NULL, NULL, NULL); if(ret < 0) { LOGD("MU1 Error: clBuildProgram error\n"); return 0; } kernel = clCreateKernel(program, "process_iq", NULL); inputBuffer_i = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 512*1024*4, (void *)(&table_i[0][0]), NULL); inputBuffer_q = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 512*1024*4, (void *)(&table_q[0][0]), NULL); inputBuffer_o = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, 512*1024*4, (void *)(&table_o[0][0]), NULL); ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer_i); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputBuffer_q); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&inputBuffer_o); if(devices != NULL) { free(devices);} LOGD("MU1: init cl plat success"); return 0; }