/* Enqueues a given task to operate on a given memory buffer dataset. * * @env: Struct containing device/context/queue variables. * @mem_struct: Struct containing cl_mem buffer / target dataset. * @task: Struct containing the kernel to execute and the task name. */ void runTaskOnDataset( const RubiCLEnvironment* env, const RubiCLMemoryBuffer* mem_struct, const RubiCLTask* task ) { if (DEBUG) printf("runTaskOnDataset\n"); size_t g_work_size[1] = {mem_struct->buffer_entries}; /* Kernel's global data_array set to be the given device memory buffer */ cl_int ret = clSetKernelArg( task->kernel, // Kernel concerned 0, // Index of argument to specify sizeof(cl_mem), // Size of argument value &mem_struct->buffer // Argument value ); if (ret != CL_SUCCESS) printf("clSetKernelArg %s\n", oclErrorString(ret)); /* Kernel enqueued to be executed on the environment's command queue */ ret = clEnqueueNDRangeKernel( env->queue, // Device's command queue task->kernel, // Kernel to enqueue 1, // Dimensionality of work 0, // Global offset of work index g_work_size, // Array of work sizes by dimension NULL, // Local work size, omitted so will be automatically deduced 0, // Number of preceding events NULL, // Preceding events list NULL // Event object destination ); if (ret != CL_SUCCESS) printf("clEnqueueNDRangeKernel %s\n", oclErrorString(ret)); }
void CL::popCorn() { printf("in popCorn\n"); //initialize our kernel from the program //kernel = clCreateKernel(program, "part1", &err); //printf("clCreateKernel: %s\n", oclErrorString(err)); try{ kernel = cl::Kernel(program, "part2", &err); } catch (cl::Error er) { printf("ERROR: %s(%s)\n", er.what(), oclErrorString(er.err())); } //set the arguements of our kernel try { err = kernel.setArg(0, cl_vbos[0]); //position vbo err = kernel.setArg(1, cl_vbos[1]); //color vbo err = kernel.setArg(2, cl_velocities); err = kernel.setArg(3, cl_pos_gen); err = kernel.setArg(4, cl_vel_gen); } catch (cl::Error er) { printf("ERROR: %s(%s)\n", er.what(), oclErrorString(er.err())); } //Wait for the command queue to finish these commands before proceeding queue.finish(); }
CL::CL() { printf("Initialize OpenCL object and context\n"); //setup devices and context //this function is defined in util.cpp //it comes from the NVIDIA SDK example code err = oclGetPlatformID(&platform); //oclErrorString is also defined in util.cpp and comes from the NVIDIA SDK printf("oclGetPlatformID: %s\n", oclErrorString(err)); // Get the number of GPU devices available to the platform // we should probably expose the device type to the user // the other common option is CL_DEVICE_TYPE_CPU err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); printf("clGetDeviceIDs (get number of devices): %s\n", oclErrorString(err)); // Create the device list devices = new cl_device_id [numDevices]; err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); printf("clGetDeviceIDs (create device list): %s\n", oclErrorString(err)); //for right now we just use the first available device //later you may have criteria (such as support for different extensions) //that you want to use to select the device deviceUsed = 0; //create the context context = clCreateContext(0, 1, &devices[deviceUsed], NULL, NULL, &err); //create the command queue we will use to execute OpenCL commands command_queue = clCreateCommandQueue(context, devices[deviceUsed], 0, &err); }
void CL::loadProgram(std::string kernel_source) { //Program Setup int pl; //size_t program_length; printf("load the program\n"); pl = kernel_source.size(); printf("kernel size: %d\n", pl); //printf("kernel: \n %s\n", kernel_source.c_str()); try { cl::Program::Sources source(1, std::make_pair(kernel_source.c_str(), pl)); program = cl::Program(context, source); } catch (cl::Error er) { printf("ERROR: %s(%s)\n", er.what(), oclErrorString(er.err())); } printf("build program\n"); try { err = program.build(devices); } catch (cl::Error er) { printf("program.build: %s\n", oclErrorString(er.err())); //if(err != CL_SUCCESS){ } printf("done building program\n"); std::cout << "Build Status: " << program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(devices[0]) << std::endl; std::cout << "Build Options:\t" << program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(devices[0]) << std::endl; std::cout << "Build Log:\t " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]) << std::endl; }
void MD::clInit(float bound, float dt, std::string force_kernel_name) { printf("Initializing CL Kernels.\n"); // Initialize our kernel from the program. try { forceKernel = cl::Kernel(program, force_kernel_name.c_str(), &err); updateKernel = cl::Kernel(program, "update", &err); } catch (cl::Error er) { printf("ERROR: %s(%s)\n", er.what(), oclErrorString(er.err())); exit(EXIT_FAILURE); } // Set the arguements of our kernel. try { err = forceKernel.setArg(0, cl_vbos[0]); // Position vbo. err = forceKernel.setArg(1, cl_vbos[1]); // Color vbo. err = forceKernel.setArg(2, cl_forces); err = forceKernel.setArg(3, num); // Pass in the size. err = updateKernel.setArg(0, cl_vbos[0]); // Position vbo. err = updateKernel.setArg(1, cl_vbos[1]); // Color vbo. err = updateKernel.setArg(2, cl_forces); err = updateKernel.setArg(3, cl_vel); err = updateKernel.setArg(4, bound); updateKernel.setArg(5, dt); // Pass in the timestep. } catch (cl::Error er) { printf("ERROR: %s(%s)\n", er.what(), oclErrorString(er.err())); } // Wait for the command queue to finish these commands before proceeding. queue.finish(); printf("Done initializing.\n"); }
void computePresenceArrayForTupDataset( const RubiCLEnvironment* env, const RubiCLMemoryBuffer* fst_mem_struct, const RubiCLMemoryBuffer* snd_mem_struct, const RubiCLTask* task, RubiCLMemoryBuffer *presence ) { if (DEBUG) printf("computePresenceArrayForDataset\n"); size_t g_work_size[1] = {fst_mem_struct->buffer_entries}; /* Kernel's fst data_array set to be the given fst memory buffer */ cl_int ret = clSetKernelArg( task->kernel, // Kernel concerned 0, // Index of argument to specify sizeof(cl_mem), // Size of argument value &fst_mem_struct->buffer // Argument value ); if (ret != CL_SUCCESS) printf("clSetKernelArg %s\n", oclErrorString(ret)); /* Kernel's snd data_array set to be the given snd memory buffer */ ret = clSetKernelArg( task->kernel, // Kernel concerned 1, // Index of argument to specify sizeof(cl_mem), // Size of argument value &snd_mem_struct->buffer // Argument value ); if (ret != CL_SUCCESS) printf("clSetKernelArg %s\n", oclErrorString(ret)); /* Output buffer created to be an int flag for each element in input dataset. */ presence->buffer_entries = fst_mem_struct->buffer_entries; presence->buffer = createMemoryBuffer( env, // Environment struct (presence->buffer_entries * sizeof(int)), // Size of buffer to create CL_MEM_HOST_READ_ONLY // Buffer flags set ); presence->type = INTEGER_BUFFER; /* Kernel's global presence_array set to be the newly created presence buffer */ ret = clSetKernelArg( task->kernel, // Kernel concerned 2, // Index of argument to specify sizeof(cl_mem), // Size of argument value &presence->buffer // Argument value ); if (ret != CL_SUCCESS) printf("clSetKernelArg PA %s\n", oclErrorString(ret)); /* Kernel enqueued to be executed on the environment's command queue */ ret = clEnqueueNDRangeKernel( env->queue, // Device's command queue task->kernel, // Kernel to enqueue 1, // Dimensionality of work 0, // Global offset of work index g_work_size, // Array of work size in each dimension NULL, // Local work size, omitted so will be deduced by OpenCL platform 0, // Number of preceding events NULL, // Preceding events list NULL // Event object destination ); if (ret != CL_SUCCESS) printf("clEnqueueNDRangeKernel %s\n", oclErrorString(ret)); }
int setup_data_buffers(struct sapi_o *a, size_t data_in_len) { cl_int err; if (a == NULL) return -1; a->olen = data_in_len / sizeof(float2) * sizeof(float); a->host = realloc(a->host, a->olen); if (a->host == NULL){ #ifdef DEBUG fprintf(stderr, "e: logic cannot malloc output buffer\n"); #endif return -1; } #ifdef DEBUG fprintf(stderr, "%s: created ouput buffer %ld bytes\n", __func__, a->olen); #endif a->clin = clCreateBuffer(a->ctx, CL_MEM_READ_ONLY, data_in_len, NULL, &err); if (err != CL_SUCCESS){ #ifdef DEBUG fprintf(stderr, "clCreateBuffer return %s\n", oclErrorString(err)); #endif return -1; } #ifdef DEBUG fprintf(stderr, "%s: created device input buffer %ld bytes\n", __func__, data_in_len); #endif a->clout = clCreateBuffer(a->ctx, CL_MEM_WRITE_ONLY, data_in_len, NULL, &err); if (err != CL_SUCCESS){ #ifdef DEBUG fprintf(stderr, "clCreateBuffer return %s\n", oclErrorString(err)); #endif return -1; } #ifdef DEBUG fprintf(stderr, "%s: created device ouput buffer %ld bytes\n", __func__, data_in_len); #endif a->clpow = clCreateBuffer(a->ctx, CL_MEM_WRITE_ONLY, a->olen, NULL, &err); if (err != CL_SUCCESS){ #ifdef DEBUG fprintf(stderr, "clCreateBuffer return %s\n", oclErrorString(err)); #endif return -1; } #ifdef DEBUG fprintf(stderr, "%s: created device ouput buffer %ld bytes\n", __func__, a->olen); #endif return 0; }
float Kernel::execute(int ndrange) { if (ndrange <= 0) return -1.f; cl_ulong start, end; float timing = -1.0f; try { cl::Event event; cli->err = cli->queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(ndrange), cl::NullRange, NULL, &event); cli->queue.finish(); event.getProfilingInfo(CL_PROFILING_COMMAND_END, &end); event.getProfilingInfo(CL_PROFILING_COMMAND_START, &start); timing = (end - start) * 1.0e-6f; } catch (cl::Error er) { printf("err: work group size: %d\n", ndrange); printf("ERROR: %s(%s)\n", er.what(), oclErrorString(er.err())); } return timing; }
void CL::loadProgram(const char* relative_path) { // Program Setup int pl; size_t program_length; printf("load the program\n"); //CL_SOURCE_DIR is set in the CMakeLists.txt std::string path(CL_SOURCE_DIR); path += "/" + std::string(relative_path); printf("path: %s\n", path.c_str()); //file_contents is defined in util.cpp //it loads the contents of the file at the given path char* cSourceCL = file_contents(path.c_str(), &pl); //printf("file: %s\n", cSourceCL); program_length = (size_t)pl; // create the program program = clCreateProgramWithSource(context, 1, (const char **) &cSourceCL, &program_length, &err); printf("clCreateProgramWithSource: %s\n", oclErrorString(err)); buildExecutable(); }
cl_device_id selectDefaultClDeviceOfType(cl_device_type device_type, cl_platform_id* platform) { if (DEBUG) printf("selectDefaultClDeviceOfType\n"); cl_int ret; cl_device_id device_id; cl_uint num_devices; ret = clGetDeviceIDs( *platform, // Selected platform device_type, // Type of device (CPU/GPU) 0, // Limit NULL, // Devices destination &num_devices // Count destination ); cl_device_id* devices = calloc(sizeof(cl_device_id), num_devices); ret = clGetDeviceIDs( *platform, // Selected platform device_type, // Type of device (CPU/GPU) num_devices, // Limit devices, // Devices destination NULL // Count destination ); if (ret != CL_SUCCESS) printf("clGetDeviceIDs %s\n", oclErrorString(ret)); if (DEBUG) { displayDeviceInfo(num_devices, devices); printf("Selecting Device 1.\n"); } device_id = devices[0]; free(devices); return device_id; }
void Density::execute(int num, Buffer<float4>& pos_s, Buffer<float>& dens_s, //output Buffer<unsigned int>& ci_start, Buffer<unsigned int>& ci_end, //params Buffer<SPHParams>& sphp, Buffer<GridParams>& gp, //debug params Buffer<float4>& clf_debug, Buffer<int4>& cli_debug) { int iarg = 0; k_density.setArg(iarg++, pos_s.getDevicePtr()); k_density.setArg(iarg++, dens_s.getDevicePtr()); k_density.setArg(iarg++, ci_start.getDevicePtr()); k_density.setArg(iarg++, ci_end.getDevicePtr()); k_density.setArg(iarg++, gp.getDevicePtr()); k_density.setArg(iarg++, sphp.getDevicePtr()); k_density.setArg(iarg++, clf_debug.getDevicePtr()); k_density.setArg(iarg++, cli_debug.getDevicePtr()); int local = 64; try { float gputime = k_density.execute(num, local); } catch (cl::Error er) { printf("ERROR(density): %s(%s)\n", er.what(), oclErrorString(er.err())); } }
/* Writes the contents of a given dataset into a given cl_mem device memory buffer * * @env: Struct containing device/context/queue variables. * @mem_struct: Struct containing cl_mem buffer and the number of entries it can hold. * @dataset: Pointer to an integer array of data to be read, same length as buffer. */ void loadIntArrayIntoDevice( const RubiCLEnvironment env, const RubiCLMemoryBuffer mem_struct, int* dataset ) { if (DEBUG) printf("loadIntArrayIntoDevice\n"); cl_event write_event; cl_int ret = clEnqueueWriteBuffer( env.queue, // Command queue mem_struct.buffer, // Memory buffer CL_FALSE, // Blocking write? (set to nonblocking) 0, // Offset in buffer to write to mem_struct.buffer_entries * sizeof(int), // Input data size dataset, // Input data 0, // Number of preceding actions NULL, // List of preceding actions &write_event // Event object destination ); if (ret != CL_SUCCESS) printf("clEnqueueWriteBuffer %s\n", oclErrorString(ret)); clSetEventCallback( write_event, // Event to monitor CL_COMPLETE, // Status to fire on &releaseMemoryCallback, // Callback to trigger dataset // Data to pass to callback ); }
bool OCL::BuildExecutable() { cl_int error; // Build program printf("Building OpenCL program...\n"); error = clBuildProgram(program, 1, &deviceId, NULL, NULL, NULL); if(error != CL_SUCCESS) { printf("Failed to build executable with error code %d (%s)", error, oclErrorString(error)); return false; } // Get and print build status messages. cl_build_status build_status; error = clGetProgramBuildInfo(program, deviceId, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &build_status, NULL); char *build_log; size_t ret_val_size; error = clGetProgramBuildInfo(program, deviceId, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); build_log = new char[ret_val_size+1]; error = clGetProgramBuildInfo(program, deviceId, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); build_log[ret_val_size] = '\0'; printf("BUILD LOG: \n %s", build_log); delete build_log; return true; }
cl_platform_id selectDefaultClPlatform() { if (DEBUG) printf("selectDefaultClPlatform\n"); cl_platform_id platform; cl_int ret; cl_uint num_platforms, i; ret = clGetPlatformIDs( 0, // Limit NULL, // Value destination &num_platforms // Count destination ); cl_platform_id* platforms = calloc(sizeof(cl_platform_id), num_platforms); ret = clGetPlatformIDs( num_platforms, // Limit platforms, // Value destination NULL // Count destination ); if (ret != CL_SUCCESS) printf("clGetPlatformIDs %s\n", oclErrorString(ret)); if (DEBUG) { char buf [128]; for (i = 0; i < num_platforms; i++) { ret = clGetPlatformInfo( platforms[i], // Platform CL_PLATFORM_VERSION, // OpenCL version sizeof(buf), // Buffer size buf, // Destination buffer NULL // Size destination ); printf("* Platform %d: %s", (i + 1), buf); ret = clGetPlatformInfo( platforms[i], // Platform CL_PLATFORM_NAME, // Platform name sizeof(buf), // Buffer size buf, // Destination buffer NULL // Size destination ); printf(" %s", buf); ret = clGetPlatformInfo( platforms[i], // Platform CL_PLATFORM_VENDOR, // Platform vendor sizeof(buf), // Buffer size buf, // Destination buffer NULL // Size destination ); printf(" %s\n", buf); } } if (DEBUG) printf("Selecting Platform 1.\n"); platform = platforms[0]; free(platforms); return platform; }
void MD::loadProgram(std::string kernel_source, int group_size_val) { // Program Setup. int pl; group_size = group_size_val; printf("Load the program.\n"); bool failed = false; pl = kernel_source.size(); printf("Kernel size: %d.\n", pl); try { cl::Program::Sources source(1, std::make_pair(kernel_source.c_str(), pl)); program = cl::Program(context, source); } catch (cl::Error er) { printf("ERROR: %s(%s)\n", er.what(), oclErrorString(er.err())); } printf("Building program...\n"); try { //err = program.build(devices, "-cl-nv-verbose -cl-nv-maxrregcount=100"); std::stringstream build_options; // Define the group size to allow for __local arrays. build_options << "-D SIZE=" << group_size << std::ends; err = program.build(devices, build_options.str().c_str()); } catch (cl::Error er) { printf("program.build: %s\n", oclErrorString(er.err())); failed = true; } printf("Done building program.\n"); std::cout << "Build Status: " << program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(devices[0]) << std::endl; std::cout << "Build Options:\t" << program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(devices[0]) << std::endl; std::cout << "Build Log:\t " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]) << std::endl; if (failed) exit(EXIT_FAILURE); }
CLWrapper::CLWrapper() { // Initialize OpenCL object and context // Get all platforms that are OpenCL compatible err_ = cl::Platform::get(&allPlatforms_); Zilch::Console::Write("Collect all OpenCL platforms. Status : "); Zilch::Console::WriteLine(oclErrorString(err_)); if (allPlatforms_.size() == 0) { Zilch::Console::WriteLine("No OpenCL compatible platforms found, check installation!"); exit(1); } // Use default platform platform_ = allPlatforms_[0]; Zilch::Console::Write("Using platform: "); Zilch::Console::WriteLine(platform_.getInfo<CL_PLATFORM_NAME>().c_str()); // Get all devices registered to platform err_ = platform_.getDevices(CL_DEVICE_TYPE_ALL, &allDevices_); Zilch::Console::Write("Collect all OpenCL devices registered to platform. Status : "); Zilch::Console::WriteLine(oclErrorString(err_)); if (allDevices_.size() == 0) { Zilch::Console::WriteLine(" No OpenCL compatible devices found, check installation!"); exit(1); } // Use default devices device_ = allDevices_[0]; // Create context context_ = cl::Context({ device_ }); // Create command queue queue_ = cl::CommandQueue(context_, device_, 0, &err_); Zilch::Console::Write("Creating command queue. -> Status : "); Zilch::Console::WriteLine(oclErrorString(err_)); }
CL::CL() { printf("Initialize OpenCL object and context\n"); //setup devices and context //this function is defined in util.cpp //it comes from the NVIDIA SDK example code ///err = oclGetPlatformID(&platform); //oclErrorString is also defined in util.cpp and comes from the NVIDIA SDK ///printf("oclGetPlatformID: %s\n", oclErrorString(err)); std::vector<cl::Platform> platforms; err = cl::Platform::get(&platforms); printf("cl::Platform::get(): %s\n", oclErrorString(err)); printf("number of platforms: %d\n", platforms.size()); if (platforms.size() == 0) { printf("Platform size 0\n"); } // Get the number of GPU devices available to the platform // we should probably expose the device type to the user // the other common option is CL_DEVICE_TYPE_CPU ///err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); ///printf("clGetDeviceIDs (get number of devices): %s\n", oclErrorString(err)); // Create the device list ///devices = new cl_device_id [numDevices]; ///err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); ///printf("clGetDeviceIDs (create device list): %s\n", oclErrorString(err)); //for right now we just use the first available device //later you may have criteria (such as support for different extensions) //that you want to use to select the device deviceUsed = 0; //create the context ///context = clCreateContext(0, 1, &devices[deviceUsed], NULL, NULL, &err); //context properties will be important later, for now we go with defualts cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0}; context = cl::Context(CL_DEVICE_TYPE_GPU, properties); devices = context.getInfo<CL_CONTEXT_DEVICES>(); printf("number of devices %d\n", devices.size()); //create the command queue we will use to execute OpenCL commands ///command_queue = clCreateCommandQueue(context, devices[deviceUsed], 0, &err); try{ queue = cl::CommandQueue(context, devices[deviceUsed], 0, &err); } catch (cl::Error er) { printf("ERROR: %s(%d)\n", er.what(), er.err()); } }
cl_kernel __cl_create_kernel__(cl_program program, const char *kernelname, int line, const char *file){ cl_int error; cl_kernel result = clCreateKernel(program, kernelname, &error); if(error){ handle_error(error, line, file); unregister_gpu();\ pthread_exit((void *)oclErrorString(error)); } return result; }
void CLWrapper::readAllBuffers() { for each (CLBuffer buf in allBuffers_) { err_ = queue_.enqueueReadBuffer(buf.bufferValue, CL_TRUE, 0, buf.bufferSize, buf.dataPointer, NULL, NULL); if (err_ != CL_SUCCESS) { Zilch::Console::Write("Reading buffers. -> Status : "); Zilch::Console::WriteLine(oclErrorString(err_)); } }
void CL::runKernel() { printf("in runKernel\n"); //execute the kernel err = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, workGroupSize, NULL, 0, NULL, &event); clReleaseEvent(event); printf("clEnqueueNDRangeKernel: %s\n", oclErrorString(err)); clFinish(command_queue); //lets check our calculations by reading from the device memory and printing out the results float c_done[num]; err = clEnqueueReadBuffer(command_queue, cl_c, CL_TRUE, 0, sizeof(float) * num, &c_done, 0, NULL, &event); printf("clEnqueueReadBuffer: %s\n", oclErrorString(err)); clReleaseEvent(event); for(int i=0; i < num; i++) { printf("c_done[%d] = %g\n", i, c_done[i]); } }
void cl_context::init() { // OpenCL try { // Get available platforms vector<cl::Platform> platforms; cl::Platform::get(&platforms); LOG_INFO<<platforms.front().getInfo<CL_PLATFORM_VERSION>(); // context sharing is OS specific #if defined (__APPLE__) || defined(MACOSX) CGLContextObj curCGLContext = CGLGetCurrentContext(); CGLShareGroupObj curCGLShareGroup = CGLGetShareGroup(curCGLContext); cl_context_properties properties[] = { CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)curCGLShareGroup, 0 }; #elif defined WIN32 cl_context_properties properties[] = { CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(), CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0 }; #else cl_context_properties properties[] = { CL_GL_CONTEXT_KHR, (cl_context_properties)glXGetCurrentContext(), CL_GLX_DISPLAY_KHR, (cl_context_properties)glXGetCurrentDisplay(), CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0 }; #endif m_context = cl::Context(CL_DEVICE_TYPE_GPU, properties); // Get a list of devices on this platform vector<cl::Device> devices = m_context.getInfo<CL_CONTEXT_DEVICES>(); m_device = devices[0]; // Create a command queue and use the first device m_queue = cl::CommandQueue(m_context, m_device); } catch(cl::Error &error) { LOG_ERROR << error.what() << "(" << oclErrorString(error.err()) << ")"; } }
void createRubiCLHybridEnvironment(RubiCLHybridEnvironment* env) { if (DEBUG) printf("createRubiCLHybridEnvironment\n"); cl_int ret; cl_platform_id platform = selectDefaultClPlatform(); env->cpu_device_id = selectDefaultClDeviceOfType(CL_DEVICE_TYPE_CPU, &platform); env->gpu_device_id = selectDefaultClDeviceOfType(CL_DEVICE_TYPE_GPU, &platform); cl_device_id devices[2] = { env->cpu_device_id, env->gpu_device_id }; env->context = clCreateContext( NULL, // Properties 2, // Number of devices specified devices, // Devices specified NULL, // Error callback Fn NULL, // User data for Fn &ret // Status destination ); if (ret != CL_SUCCESS) printf("clCreateContext %s\n", oclErrorString(ret)); env->cpu_queue = clCreateCommandQueue( env->context, env->cpu_device_id, 0, &ret ); if (ret != CL_SUCCESS) printf("clCreateCommandQueue (CPU) %s\n", oclErrorString(ret)); env->gpu_queue = clCreateCommandQueue( env->context, env->gpu_device_id, 0, &ret ); if (ret != CL_SUCCESS) printf("clCreateCommandQueue (GPU) %s\n", oclErrorString(ret)); //FIXME: Set GroupSize or something. }
void Kernel::setArgShared(int arg, int nb_bytes) { try { kernel.setArg(arg, nb_bytes, 0); cli->queue.finish(); } catch (cl::Error er) { printf("ERROR: %s(%s)\n", er.what(), oclErrorString(er.err())); } }
void CL::popCorn() { printf("in popCorn\n"); //initialize our kernel from the program printf("%s\n",kernel_name); kernel = clCreateKernel(program, kernel_name, &err); printf("clCreateKernel: %s\n", oclErrorString(err)); //initialize our CPU memory arrays, send them to the device and set the kernel arguements num = 10; float *a = new float[num]; float *b = new float[num]; for(int i=0; i < num; i++) { a[i] = 1.0f * i; b[i] = 1.0f * i; } printf("Creating OpenCL arrays\n"); //our input arrays //create our OpenCL buffer for a, copying the data from CPU to the GPU at the same time cl_a = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(float) * num, a, &err); //cl_b = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(float) * num, b, &err); //we could do b similar, but you may want to create your buffer and fill it at a different time cl_b = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * num, NULL, &err); //our output array cl_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * num, NULL, &err); printf("Pushing data to the GPU\n"); //push our CPU arrays to the GPU // err = clEnqueueWriteBuffer(command_queue, cl_a, CL_TRUE, 0, sizeof(float) * num, a, 0, NULL, &event); // clReleaseEvent(event); //we need to release events in order to be completely clean (has to do with openclprof) // //push b's data to the GPU err = clEnqueueWriteBuffer(command_queue, cl_b, CL_TRUE, 0, sizeof(float) * num, b, 0, NULL, &event); clReleaseEvent(event); //set the arguements of our kernel err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &cl_a); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &cl_b); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &cl_c); //Wait for the command queue to finish these commands before proceeding clFinish(command_queue); //clean up allocated space. delete[] a; delete[] b; //for now we make the workgroup size the same as the number of elements in our arrays workGroupSize[0] = num; }
size_t __cl_get_info__(cl_device_id device_id, cl_device_info info, int line, const char *file){ size_t ret; cl_int error = clGetDeviceInfo(device_id, info, sizeof(ret), &ret, NULL); if(error){ handle_error(error, line, file); unregister_gpu();\ pthread_exit((void *)oclErrorString(error)); } return ret; }
cl_mem __cl_malloc__(cl_context context, size_t size, int line, const char *file){ cl_mem memory_object; #if DEBUG >= 1 fprintf(stderr, "Allocating %u on a GPU... ", size); #endif if(size<1) { size=16; } cl_int error; memory_object = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &error); if(error){ #if DEBUG >= 1 fprintf(stderr, "fail\n"); #else handle_error(error, line, file); fprintf(stderr, "MTY_CL was unable to allocate enough memory on one of your GPU devices.\n" "Possible reasons for this are:\n" "\t- Running other programs hogging a lot of GPU memory\n" "\t- Limited maximum memory allocation on GPU devices\n" "\t- Thread concurrency + batch size too large for the device\n" "\n" "On windows the most common issue is limited maximum memory allocation.\n" "To remove this limit, run the following command on any windows command line:\n" "\tsetx GPU_MAX_ALLOC_PERCENT 100\n" "\n" "Thread concurrency and batch size is not currently user configurable.\n" "If removing allocation limit does not work, and you are uses the latest driver\n" "or a recommended driver version, please open an issue at\n" "\thttps://github.com/madsbuvi/MTY_CL\n" "Including the following information:\n" "\t-File name and line number of error\n" "\t-GPU name & available memory on the GPU\n" "\t-OS\n" "\t-Driver version\n" "\n" "Amount of memory that could not be allocated: %u\n", (unsigned int)size ); #endif #ifndef IGNORE_MEMORY_FAILURE pthread_exit((void *)oclErrorString(error)); #endif return 0; } #if DEBUG >= 1 fprintf(stderr, "done\n"); #endif return memory_object; }
/* Takes the source code for a kernel and the name of the task to build and creates a * RubiCLTask Struct containing the components needed to dispatch this task later. * * @env: Struct containing device/context/queue variables. * @kernel_source: String containing the .cl Kernel source. * @source_size: The size of the source. * @name: The name of the task within the source to build. */ void buildTaskFromSource( const RubiCLEnvironment* env, const char* kernel_source, const char* name, RubiCLTask* result ) { if (DEBUG) printf("buildTaskFromSource\n"); /* Create cl_program from given task/name and store inside RubiCLTask struct. */ cl_int ret; result->name = (char *) name; result->program = clCreateProgramWithSource( env->context, // Context 1, // Number of parts that the source is in (const char **) &kernel_source, // Array of program source code NULL, // Total size of source &ret // Status destination ); if (ret != CL_SUCCESS) printf("clCreateProgramWithSource %s\n", oclErrorString(ret)); /* Create kernel from cl_program to execute later on target-device */ ret = clBuildProgram( result->program, // Program to build 1, // Number of devices involved &env->device_id, // List of involved devices "-cl-fast-relaxed-math", // Compilation options NULL, // Build complete callback, building is synchronous if omitted NULL // Callback user data ); if (ret != CL_SUCCESS) printf("clBuildProgram %s\n", oclErrorString(ret)); if (ret == CL_BUILD_PROGRAM_FAILURE) displayBuildFailureInfo(result->program, env->device_id); result->kernel = clCreateKernel( result->program, // Built program result->name, // Entry point to kernel &ret // Status destination ); if (ret != CL_SUCCESS) printf("clCreateKernel %s\n", oclErrorString(ret)); }
MD::MD() { printf("Initialize OpenCL object and context\n"); // Setup devices and context. std::vector<cl::Platform> platforms; err = cl::Platform::get(&platforms); printf("cl::Platform::get(): %s\n", oclErrorString(err)); printf("platforms.size(): %lu\n", platforms.size()); deviceUsed = 0; err = platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices); printf("getDevices: %s\n", oclErrorString(err)); printf("devices.size(): %lu\n", devices.size()); int t = devices.front().getInfo<CL_DEVICE_TYPE>(); printf("type: device: %d CL_DEVICE_TYPE_GPU: %d \n", t, CL_DEVICE_TYPE_GPU); // This part of the setup may be Linux specific. cl_context_properties props[] = { CL_GL_CONTEXT_KHR, (cl_context_properties)glXGetCurrentContext(), CL_GLX_DISPLAY_KHR, (cl_context_properties)glXGetCurrentDisplay(), CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0 }; try { context = cl::Context(CL_DEVICE_TYPE_GPU, props); } catch (cl::Error er) { printf("ERROR: %s(%s)\n", er.what(), oclErrorString(er.err())); exit(EXIT_FAILURE); } // Create the command queue we will use to execute OpenCL commands. try { queue = cl::CommandQueue(context, devices[deviceUsed], 0, &err); } catch (cl::Error er) { printf("ERROR: %s(%d)\n", er.what(), er.err()); exit(EXIT_FAILURE); } }
/* Returns the number of elements that would be kept after a presence array calculation * has been completed. * * @env: RubiCLEnvironment struct * @presence: presence array post filter calculation. * @index_scan: result of exclusive prefix sum on presence array. */ int filteredBufferLength( const RubiCLEnvironment* env, RubiCLMemoryBuffer* presence, RubiCLMemoryBuffer* index_scan ) { if (DEBUG) printf("filteredBufferLength\n"); int index_reduce, last_element_presence; cl_int ret = clEnqueueReadBuffer( env->queue, // Device's command queue index_scan->buffer, // Buffer to output data from CL_FALSE, // Block? Async to hide latency (index_scan->buffer_entries - 1) * sizeof(int), // Offset to read from sizeof(int), // Size of output data &index_reduce, // Output destination 0, // Number of preceding actions NULL, // List of preceding actions NULL // Event object destination ); if (ret != CL_SUCCESS) printf("clEnqueueReadBuffer %s\n", oclErrorString(ret)); ret = clEnqueueReadBuffer( env->queue, // Device's command queue presence->buffer, // Buffer to output data from CL_FALSE, // Block? Async to hide latency (presence->buffer_entries - 1) * sizeof(int), // Offset to read from sizeof(int), // Size of output data &last_element_presence, // Output destination 0, // Number of preceding actions NULL, // List of preceding actions NULL // Event object destination ); if (ret != CL_SUCCESS) printf("clEnqueueReadBuffer %s\n", oclErrorString(ret)); clFinish(env->queue); return index_reduce + last_element_presence; }
void __cl_copy_from__(cl_mem from, void *to, size_t size, size_t offset, cl_command_queue command_queue, int line, const char *file) { if(size<1) { size=16; } cl_int error = clEnqueueReadBuffer(command_queue, from, CL_TRUE, offset, size, to, 0, NULL, NULL); if(error){ handle_error(error, line, file); unregister_gpu();\ pthread_exit((void *)oclErrorString(error)); } }