void setupGPU() { // Retrieve an OpenCL platform cl_uint num_platforms = 0; int err = 0; err = clGetPlatformIDs(0, NULL, &num_platforms); cl_platform_id* platform_ids = (cl_platform_id*)(malloc(sizeof(cl_platform_id) * num_platforms)); err = clGetPlatformIDs(num_platforms, platform_ids, NULL); CHKERR(err, "Failed to get a platform!"); // Connect to a compute device int i = 0; for(i = 0; i < num_platforms; i++) { cl_device_id device_id; err = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if(err != CL_DEVICE_NOT_FOUND) { CHKERR(err, "Failed to create a device group!"); device_id_cpu = device_id; } err = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); if(err != CL_DEVICE_NOT_FOUND) { CHKERR(err, "Failed to create a device group!"); device_id_gpu = device_id; } } free(platform_ids); if(scheme != GPU_ONLY) { context_cpu = clCreateContext(NULL, 1, &device_id_cpu, NULL, NULL, &err); CHKERR(err, "Failed to create a compute context!"); commands_cpu = clCreateCommandQueue(context_cpu, device_id_cpu, 0, &err); CHKERR(err, "Failed to create a command queue!"); kernel_compute_cpu = create_kernel(KernelSourceFile_cpu, "compute", context_cpu, device_id_cpu); } if(scheme != CPU_ONLY) { context_gpu = clCreateContext(NULL, 1, &device_id_gpu, NULL, NULL, &err); CHKERR(err, "Failed to create a compute context!"); commands_gpu = clCreateCommandQueue(context_gpu, device_id_gpu, 0, &err); CHKERR(err, "Failed to create a command queue!"); kernel_compute_gpu = create_kernel(KernelSourceFile_gpu, "compute", context_gpu, device_id_gpu); } }
int main() { // create a compute context and command queue auto ctx = boost::compute::system::default_context(); auto queue = boost::compute::system::default_queue(); // create program and kernels std::ostringstream source; get_file_contents(source, "svd3.cl"); auto program = boost::compute::program::build_with_source(source.str(), ctx); auto svdArrayTestKernel = program.create_kernel("svdArrayTest"); EigenMatN A_host = EigenMatN::Identity(); boost::compute::vector<cl_float> A_dev(A_host.data(), A_host.data() + N_*N_, queue); boost::compute::vector<cl_float> U_dev(N_*N_), S_dev(N_*N_), V_dev(N_*N_); svdArrayTestKernel.set_args(A_dev, U_dev, S_dev, V_dev); queue.enqueue_1d_range_kernel(svdArrayTestKernel, 0, 1, 0); EigenMatN U_host,S_host,V_host; boost::compute::copy(U_dev.begin(), U_dev.end(), U_host.data()); boost::compute::copy(S_dev.begin(), S_dev.end(), S_host.data()); boost::compute::copy(V_dev.begin(), V_dev.end(), V_host.data()); std::cout << A_host << std::endl; std::cout << U_host << std::endl; std::cout << S_host << std::endl; std::cout << V_host << std::endl; return 0; }
// Called at the beginning of a code block generated by Halide. This function // is responsible for setting up the OpenGL environment and compiling the GLSL // code into a fragment shader. EXPORT void *halide_opengl_init_kernels(void *user_context, void *state_ptr, const char *src, int size) { // TODO: handle error if (int error = halide_opengl_init(user_context)) { return NULL; } // Use '/// KERNEL' comments to split 'src' into discrete blocks, one for // each kernel contained in it. char *begin = strstr(src, kernel_marker); char *end = NULL; for (; begin && begin[0]; begin = end) { end = strstr(begin + sizeof(kernel_marker) - 1, kernel_marker); if (!end) { end = begin + strlen(begin); } HalideOpenGLKernel *kernel = create_kernel(user_context, begin, end - begin); if (!kernel) { // Simply skip invalid kernels continue; } #ifdef DEBUG halide_printf(user_context, "Defining kernel '%s'\n", kernel->name); #endif // Compile shader kernel->shader_id = halide_opengl_make_shader(user_context, GL_FRAGMENT_SHADER, kernel->source, NULL); // Link GLSL program GLuint program = ST.CreateProgram(); ST.AttachShader(program, ST.vertex_shader_id); ST.AttachShader(program, kernel->shader_id); ST.LinkProgram(program); GLint status; ST.GetProgramiv(program, GL_LINK_STATUS, &status); if (!status) { halide_printf(user_context, "Could not link GLSL program:\n"); GLint log_len; ST.GetProgramiv(program, GL_INFO_LOG_LENGTH, &log_len); char *log = (char*) malloc(log_len); ST.GetProgramInfoLog(program, log_len, NULL, log); halide_printf(user_context, "%s", log); free(log); ST.DeleteProgram(program); program = 0; } kernel->program_id = program; if (halide_opengl_find_kernel(kernel->name)) { halide_printf(user_context, "Duplicate kernel name '%s'\n", kernel->name); halide_opengl_delete_kernel(user_context, kernel); } else { kernel->next = ST.kernels; ST.kernels = kernel; } } return NULL; }
cl_kernel cl_manager::get_kernel(const std::string& filename, std::string entry_proc) { if (FAILED(create_kernel(filename, entry_proc))) { std::cout << "Shutting down ..." << std::endl; exit(-1); } return program_to_kernels[filename][entry_proc]; }
void BModel::wiener_filter(const int r, const double sigma, const double S) { IplImage *reKernel = cvCreateImage(cvGetSize(_tempImageSrc), IPL_DEPTH_64F, 1); IplImage *image = cvCreateImage(cvGetSize(_tempImageSrc), IPL_DEPTH_64F, 4); IplImage *reRImage = cvCreateImage(cvGetSize(_tempImageSrc), IPL_DEPTH_64F, 1); IplImage *reGImage = cvCreateImage(cvGetSize(_tempImageSrc), IPL_DEPTH_64F, 1); IplImage *reBImage = cvCreateImage(cvGetSize(_tempImageSrc), IPL_DEPTH_64F, 1); IplImage *kernel = cvCreateImage(cvGetSize(_tempImageSrc), IPL_DEPTH_64F, 2); IplImage *rImage = cvCreateImage(cvGetSize(_tempImageSrc), IPL_DEPTH_64F, 2); IplImage *gImage = cvCreateImage(cvGetSize(_tempImageSrc), IPL_DEPTH_64F, 2); IplImage *bImage = cvCreateImage(cvGetSize(_tempImageSrc), IPL_DEPTH_64F, 2); IplImage *imaginary = cvCreateImage(cvGetSize(_tempImageSrc), IPL_DEPTH_64F, 1); cvZero(imaginary); cvZero(reKernel); create_kernel(r, sigma, reKernel); cvConvertScale(_tempImageSrc, image, 1/255.); cvSplit(image, reRImage, reGImage, reBImage, 0); cvMerge(reKernel, imaginary, 0, 0, kernel); cvMerge(reRImage, imaginary, 0, 0, rImage); cvMerge(reGImage, imaginary, 0, 0, gImage); cvMerge(reBImage, imaginary, 0, 0, bImage); wiener_filter_chanel(rImage, kernel, S); cvSplit(rImage, reRImage, imaginary, 0, 0); wiener_filter_chanel(gImage, kernel, S); cvSplit(gImage, reGImage, imaginary, 0, 0); wiener_filter_chanel(bImage, kernel, S); cvSplit(bImage, reBImage, imaginary, 0, 0); cvMerge(reRImage, reGImage, reBImage, 0, image); cvConvertScale(image, _tempImageDst, 255); remap_image(_tempImageDst, -r); change_filt_image(); create_temp_image(_srcImage); cvReleaseImage(&reKernel); cvReleaseImage(&image); cvReleaseImage(&reRImage); cvReleaseImage(&reGImage); cvReleaseImage(&reBImage); cvReleaseImage(&kernel); cvReleaseImage(&rImage); cvReleaseImage(&gImage); cvReleaseImage(&bImage); cvReleaseImage(&imaginary); }
void AttentionMap::init(int fps, int width, int height) { m_heatmap = Mat::zeros(height, width, CV_32FC1); m_ones = Mat::ones(height, width, CV_32F); m_zeros = Mat::zeros(height, width, CV_32F); m_fade_mat = Mat::ones(height, width, CV_32F); // Determine how much to fade the heatmap values by each frame m_fade_mat.setTo((1.0 / fps) / g_fade_time); // Create heatmap kernel create_kernel(); m_last_update_time = 0; }
int main (int argc, char **argv) { int i, j; double x, y; printf ("/* gimpbrushcore-kernels.h\n" " *\n" " * This file was generated using kernelgen as found in the tools dir.\n"); printf (" * (threshold = %g)\n", THRESHOLD); printf (" */\n\n"); printf ("#ifndef __GIMP_BRUSH_CORE_KERNELS_H__\n"); printf ("#define __GIMP_BRUSH_CORE_KERNELS_H__\n\n"); printf ("#define KERNEL_WIDTH %d\n", KERNEL_WIDTH); printf ("#define KERNEL_HEIGHT %d\n", KERNEL_HEIGHT); printf ("#define KERNEL_SUBSAMPLE %d\n", SUBSAMPLE); printf ("#define KERNEL_SUM %d\n", KERNEL_SUM); printf ("\n\n"); printf ("/* Brush pixel subsampling kernels */\n"); printf ("static const int subsample[%d][%d][%d] =\n{\n", SUBSAMPLE + 1, SUBSAMPLE + 1, KERNEL_WIDTH * KERNEL_HEIGHT); for (j = 0; j <= SUBSAMPLE; j++) { y = (double) j / (double) SUBSAMPLE; printf (" {\n"); for (i = 0; i <= SUBSAMPLE; i++) { x = (double) i / (double) SUBSAMPLE; printf (" {"); create_kernel (x, y); printf (" }%s", i < SUBSAMPLE ? ",\n" : "\n"); } printf (" }%s", j < SUBSAMPLE ? ",\n" : "\n"); } printf ("};\n\n"); printf ("#endif /* __GIMP_BRUSH_CORE_KERNELS_H__ */\n"); return 0; }
/** * ufo_resources_get_kernel_from_source: * @resources: A #UfoResources * @source: OpenCL source string * @kernel: Name of a kernel or %NULL * @error: Return location for a GError from #UfoResourcesError, or NULL * * Loads and builds a kernel from a string. If @kernel is %NULL, the first * kernel defined in @source is used. * * Returns: (transfer none): a cl_kernel object that is load from @filename */ gpointer ufo_resources_get_kernel_from_source (UfoResources *resources, const gchar *source, const gchar *kernel, GError **error) { UfoResourcesPrivate *priv; cl_program program; g_return_val_if_fail (UFO_IS_RESOURCES (resources) && (source != NULL), NULL); priv = UFO_RESOURCES_GET_PRIVATE (resources); program = add_program_from_source (priv, source, NULL, error); g_debug ("Added program %p from source", (gpointer) program); return create_kernel (priv, program, kernel, error); }
/** * ufo_resources_get_kernel: * @resources: A #UfoResources object * @filename: Name of the .cl kernel file * @kernel: Name of a kernel, or %NULL * @error: Return location for a GError from #UfoResourcesError, or %NULL * * Loads a and builds a kernel from a file. The file is searched in the current * working directory and all paths added through ufo_resources_add_paths (). If * @kernel is %NULL, the first encountered kernel is returned. * * Returns: (transfer none): a cl_kernel object that is load from @filename or %NULL on error */ gpointer ufo_resources_get_kernel (UfoResources *resources, const gchar *filename, const gchar *kernelname, GError **error) { UfoResourcesPrivate *priv; gchar *path; gchar *buffer; cl_program program; g_return_val_if_fail (UFO_IS_RESOURCES (resources) && (filename != NULL), NULL); priv = resources->priv; path = lookup_kernel_path (priv, filename); if (path == NULL) { g_set_error (error, UFO_RESOURCES_ERROR, UFO_RESOURCES_ERROR_LOAD_PROGRAM, "Could not find `%s'. Maybe you forgot to pass a configuration?", filename); return NULL; } buffer = read_file (path); g_free (path); if (buffer == NULL) { g_set_error (error, UFO_RESOURCES_ERROR, UFO_RESOURCES_ERROR_LOAD_PROGRAM, "Could not open `%s'", filename); return NULL; } program = add_program_from_source (priv, buffer, "", error); g_debug ("Added program %p from `%s`", (gpointer) program, filename); g_free (buffer); return create_kernel (priv, program, kernelname, error); }
int main (void) { int *a; cl_mem a_in; cl_event event; cl_kernel kernel; cl_context context; cl_program program; cl_uint devices_num; char *program_source; cl_device_id device_id; cl_platform_id platform_id; cl_command_queue command_queue; program_source = (char *) calloc (1000, sizeof (char)); program_source = readKernel (); /* number of platforms on the system */ platforms_number (); /* id of the first platform proposed by the system */ platform_id = get_platform (); /* number of devices on the platform specified by platform_id */ devices_num = devices_number (platform_id); /* id of the first device proposed by the system on the platform specified by platform_id */ device_id = create_device (platform_id); /* create a context to stablish a communication channel between the host process and the device */ context = create_context (device_id); /* create a program providing the source code */ program = create_program (context, program_source); /* compile the program for the specific device architecture */ build_program (program, device_id); /* create a kernel given the program */ kernel = create_kernel (program); /* create a memory object, in this case this will be an array of integers of length specified by the LENGTH macro */ a = create_memory_object (LENGTH, "a"); /* create a buffer, this will be allocated on the global memory of the device */ a_in = create_buffer (LENGTH, context, "a_in"); /* assign this buffer as the only kernel argument */ set_kernel_argument (kernel, a_in, 0, "a_in"); /* create a command queue, here we can enqueue tasks for the device specified by device_id */ command_queue = create_command_queue (context, device_id); /* copy the memory object allocated on the host memory into the buffer created on the global memory of the device */ enqueue_write_buffer_task (command_queue, a_in, LENGTH, a, "a_in"); /* enqueue a task to execute the kernel on the device */ event = enqueue_kernel_execution (command_queue, kernel, LENGTH, 0, NULL); enqueue_kernel_execution (command_queue, kernel, LENGTH, 1, &event); /* copy the content of the buffer from the global memory of the device to the host memory */ enqueue_read_buffer_task (command_queue, a_in, LENGTH, a, "a_in"); /* print the memory object with the result of the execution */ print_memory_object (a, LENGTH, "a"); return 0; }
void go_cl(config c) { cl_int error; cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue commands; cl_program program; cl_kernel kernel; cl_event ev; cl_mem mem; float *result = NULL; char buf[256]; int work_size[2] = { 128, 128 }; int offset[2]; int x, y; cl_float4 camera_pos = { c.camera_pos.x, c.camera_pos.y, c.camera_pos.z, 0 }; cl_float4 camera_dir = { c.camera_target.x - c.camera_pos.x, c.camera_target.y - c.camera_pos.y, c.camera_target.z - c.camera_pos.z, 0 }; cl_float4 light_pos = { c.light_pos.x, c.light_pos.y, c.light_pos.z, 0 }; cl_int2 image_size = { c.width, c.height }; sprintf(buf, "-DBAILOUT=%d -DSCALE=%f -DFOV=%f", c.bailout, c.scale, c.fov); printf("Starting\n"); init_cl(&platform, &device, &context, &commands); dump_info(platform, device); printf("Creating kernel\n"); program = load_program_from_file("kernel.cl", context, device, buf); kernel = create_kernel(program, "test"); printf("Setting memory\n"); mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * c.width * c.height, NULL, &error); check_error(error, "Could not allocate buffer"); error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem); error = clSetKernelArg(kernel, 1, sizeof(cl_float4), &camera_pos); error = clSetKernelArg(kernel, 2, sizeof(cl_float4), &camera_dir); error = clSetKernelArg(kernel, 3, sizeof(cl_float4), &light_pos); error = clSetKernelArg(kernel, 4, sizeof(cl_int2), &image_size); clFinish(commands); printf("Running\n"); for (x = 0; x < c.width; x += work_size[0]) { for (y = 0; y < c.height; y += work_size[1]) { offset[0] = x; offset[1] = y; error = clEnqueueNDRangeKernel(commands, kernel, 2, offset, work_size, NULL, 0, NULL, &ev); printf("."); } } clFinish(commands); printf("\nWriting image\n"); result = malloc(sizeof(float) * c.width * c.height); error = clEnqueueReadBuffer(commands, mem, CL_TRUE, 0, sizeof(float) * c.width * c.height, result, 0, NULL, &ev); clFinish(commands); save(result, c, 0, 0); free(result); clReleaseMemObject(mem); release_cl(context, program, commands, kernel); }
int main (void) { float *sum; cl_kernel kernel; cl_mem sum_buffer; cl_context context; cl_program program; cl_uint devices_num; char *program_source; cl_device_id device_id; cl_platform_id platform_id; cl_command_queue command_queue; sum = (float *) calloc (NUM_STEPS, sizeof (float)); program_source = (char *) calloc (1000, sizeof (char)); program_source = readKernel (); /* number of platforms on the system */ platforms_number (); /* id of the first platform proposed by the system */ platform_id = get_platform (); /* number of devices on the platform specified by platform_id */ devices_num = devices_number (platform_id); /* id of the first device proposed by the system on the platform specified by platform_id */ device_id = create_device (platform_id); /* create a context to stablish a communication channel between the host process and the device */ context = create_context (device_id); /* create a program providing the source code */ program = create_program (context, program_source); /* compile the program for the specific device architecture */ build_program (program, device_id);\ /* create a kernel given the program */ kernel = create_kernel (program); /* create a memory object, in this case this will be float number that will contain the values of the partial sums */ sum_buffer = create_buffer (context, "sum_buffer", NUM_STEPS); /* assign this buffer as the only kernel argument */ set_kernel_argument (kernel, sum_buffer, 0, "sum_buffer"); /* create a command queue, here we can enqueue tasks for the device specified by device_id */ command_queue = create_command_queue (context, device_id); /* enqueue a task to execute the kernel on the device */ enqueue_kernel_execution (command_queue, kernel, NUM_STEPS); /* copy the content of the buffer from the global memory of the device to the host memory */ enqueue_read_buffer_task (command_queue, sum_buffer, NUM_STEPS, sum, "sum"); printf (ANSI_COLOR_CYAN "\nAproximación de PI: %.10lf\n\n" ANSI_COLOR_RESET, sum[0] / NUM_STEPS); return 0; }
// This program implements a simple vector addition routine to demonstrate kernel execution // It is mostly a straightforward port of the vector addition example from Heterogeneous Computing with OpenCL int main() { // In this example, we will be summing two integer vectors of a hard-coded size const size_t vector_length = 64 * 1024 * 1024; const size_t vector_size = vector_length * sizeof(cl_int); // Minimal platform and device parameters are specified here const CLplusplus::Version target_version = CLplusplus::version_1p2; const cl_ulong min_mem_alloc_size = vector_size; const cl_ulong min_global_mem_size = 3 * vector_size; // Have the user select a suitable device, according to some criteria (see shared.hpp for more details) const auto selected_platform_and_device = Shared::select_device( [&](const CLplusplus::Platform & platform) -> bool { return (platform.version() >= target_version); // Platform OpenCL version is recent enough }, [&](const CLplusplus::Device & device) -> bool { if(device.version() < target_version) return false; // OpenCL platforms may support older-generation devices, which we need to eliminate const bool device_supports_ooe_execution = device.queue_properties() & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; return device.available() && // Device is available for compute purposes device.endian_little() && // Device is little-endian (device.execution_capabilities() & CL_EXEC_KERNEL) && // Device can execute OpenCL kernels device_supports_ooe_execution && // Device can execute OpenCL commands out of order device.compiler_available() && device.linker_available() && // Implementation has an OpenCL C compiler and linker for this device (device.max_mem_alloc_size() >= min_mem_alloc_size) && // Device accepts large enough global memory allocations (device.global_mem_size() >= min_global_mem_size); // Device has enough global memory } ); // Create an OpenCL context on the device with some default parameters (see shared.hpp for more details) const auto context = Shared::build_default_context(selected_platform_and_device); // Allocate our input and output buffers std::cout << "Creating buffers..." << std::endl; const auto input_A_buffer = context.create_buffer(CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, vector_size); const auto input_B_buffer = context.create_buffer(CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, vector_size); const auto output_C_buffer = context.create_buffer(CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, vector_size); // Create a program object from the basic vector addition example std::cout << "Loading program..." << std::endl; auto program = context.create_program_with_source_file("kernels/vector_add.cl"); // Start an asynchronous program build std::cout << "Starting to build program..." << std::endl; const auto build_event = program.build_with_event("-cl-mad-enable -cl-no-signed-zeros -cl-std=CL1.2 -cl-kernel-arg-info"); // Create an out-of-order command queue for the device const auto command_queue = context.create_command_queue(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE); // Generate our input data and send it to the device std::cout << "Generating and sending data..." << std::endl; std::vector<cl_int> input_A(vector_length); for(size_t i = 0; i < vector_length; ++i) input_A[i] = i + 1; const auto write_A_event = command_queue.enqueued_write_buffer(static_cast<const void *>(&(input_A[0])), false, input_A_buffer, 0, vector_size, {}); std::vector<cl_int> input_B(vector_length); for(size_t i = 0; i < vector_length; ++i) input_B[i] = vector_length - i; const auto write_B_event = command_queue.enqueued_write_buffer(static_cast<const void *>(&(input_B[0])), false, input_B_buffer, 0, vector_size, {}); const auto all_write_events = command_queue.enqueued_marker_with_wait_list({write_A_event, write_B_event}); // Once the program is built, create a kernel object associated to our vector addition routine std::cout << std::endl; std::cout << "Creating a kernel for vector addition..." << std::endl; const auto kernel = program.create_kernel("vector_add", build_event); // Set its arguments as appropriate kernel.set_buffer_argument(0, &input_A_buffer); kernel.set_buffer_argument(1, &input_B_buffer); kernel.set_buffer_argument(2, &output_C_buffer); // Execute the kernel std::cout << "Starting the kernel..." << std::endl; const auto exec_event = command_queue.enqueued_1d_range_kernel(kernel, vector_length, {all_write_events}); // Once the kernel is done, synchronously read device output back into host memory std::cout << "Waiting for output..." << std::endl; std::vector<cl_int> output_C(vector_length); command_queue.read_buffer(output_C_buffer, 0, static_cast<void *>(&(output_C[0])), vector_size, {exec_event}); // Verify the output std::cout << std::endl; for(const auto & output : output_C) { if(output != vector_length + 1) { std::cout << "Incorrect output !" << std::endl; std::abort(); } } std::cout << "Vector addition was performed successfully !" << std::endl; return 0; }
static void create_kernel_sync(hpx::naming::id_type const &gid, std::string module_name, std::string kernel_name) { create_kernel(gid, module_name, kernel_name).get(); }