/// Fills `out` with `elements` random values using the Philox or Threefry
/// engine kernels.
///
/// @param out       device buffer receiving the generated values
/// @param elements  number of values of type T to generate
/// @param type      random engine selector; only
///                  AF_RANDOM_ENGINE_PHILOX_4X32_10 and
///                  AF_RANDOM_ENGINE_THREEFRY_2X32_16 launch a kernel here
/// @param seed      64-bit seed, handed to the kernel as two 32-bit halves
/// @param counter   64-bit counter, handed to the kernel as two 32-bit halves
///                  and advanced by `elements` before returning
/// @param kerIdx    forwarded to get_random_engine_kernel to pick the kernel
static void randomDistribution(cl::Buffer out, const size_t elements,
                               const af_random_engine_type type,
                               const uintl &seed, uintl &counter, int kerIdx)
{
    // Each work-group produces this many values of type T.
    const uint perGroup  = THREADS * 4 * sizeof(uint) / sizeof(T);
    const uint numGroups = divup(elements, perGroup);

    // Split the 64-bit seed and counter into their 32-bit halves. The counter
    // is sampled here, before it is advanced at the end of the function.
    const uint seedHigh    = seed >> 32;
    const uint seedLow     = seed;
    const uint counterHigh = counter >> 32;
    const uint counterLow  = counter;

    NDRange local(THREADS, 1);
    NDRange global(THREADS * numGroups, 1);

    const bool usesCounterKernel =
        (type == AF_RANDOM_ENGINE_PHILOX_4X32_10) ||
        (type == AF_RANDOM_ENGINE_THREEFRY_2X32_16);

    if (usesCounterKernel) {
        Kernel engineKernel =
            get_random_engine_kernel<T>(type, kerIdx, perGroup);
        auto launch =
            KernelFunctor<cl::Buffer, uint, uint, uint, uint, uint>(
                engineKernel);
        launch(EnqueueArgs(getQueue(), global, local),
               out, elements, counterHigh, counterLow, seedHigh, seedLow);
    }

    // The counter moves forward even when no kernel was launched.
    counter += elements;

    CL_DEBUG_FINISH(getQueue());
}
/// Launches the Mersenne state initialisation kernel.
///
/// @param state  device buffer passed to the kernel as its first argument
/// @param table  device buffer passed to the kernel as its second argument
/// @param seed   64-bit seed passed to the kernel
void initMersenneState(cl::Buffer state, cl::Buffer table, const uintl &seed)
{
    // MAX_BLOCKS work-groups of THREADS_PER_GROUP work-items each.
    NDRange local(THREADS_PER_GROUP, 1);
    NDRange global(local[0] * MAX_BLOCKS, 1);

    Kernel initKernel = get_mersenne_init_kernel();
    auto launch = KernelFunctor<cl::Buffer, cl::Buffer, uintl>(initKernel);

    launch(EnqueueArgs(getQueue(), global, local), state, table, seed);

    CL_DEBUG_FINISH(getQueue());
}
/// Runs one in-place update of the "diffUpdate" anisotropic diffusion kernel.
///
/// The program is compiled on first use for the active device and stored in
/// the kernel cache under a key built from the element type name and isMCDE.
///
/// @param inout       buffer and layout info that is read and updated in place
/// @param dt          forwarded unchanged to the kernel
/// @param mct         forwarded unchanged to the kernel
/// @param fluxFnCode  forwarded unchanged to the kernel
void anisotropicDiffusion(Param inout, const float dt, const float mct,
                          const int fluxFnCode)
{
    using cl::Buffer;
    using cl::EnqueueArgs;
    using cl::Kernel;
    using cl::KernelFunctor;
    using cl::NDRange;
    using cl::Program;

    // Cache key: "anisotropic_diffusion_<type name>_<isMCDE>"
    std::string cacheKey("anisotropic_diffusion_");
    cacheKey += dtype_traits<T>::getName();
    cacheKey += "_";
    cacheKey += std::to_string(isMCDE);

    int deviceId       = getActiveDeviceId();
    kc_entry_t cached  = kernelCache(deviceId, cacheKey);

    const bool needsBuild = (cached.prog == 0 && cached.ker == 0);
    if (needsBuild) {
        std::ostringstream buildOpts;
        buildOpts << " -D T=" << dtype_traits<T>::getName();
        buildOpts << " -D SHRD_MEM_HEIGHT=" << (THREADS_X + 2);
        buildOpts << " -D SHRD_MEM_WIDTH=" << (THREADS_Y + 2);
        buildOpts << " -D IS_MCDE=" << isMCDE;
        if (std::is_same<T, double>::value) {
            buildOpts << " -D USE_DOUBLE";
        }

        const char *sources[]   = {anisotropic_diffusion_cl};
        const int sourceLens[]  = {anisotropic_diffusion_cl_len};

        Program built;
        buildProgram(built, 1, sources, sourceLens, buildOpts.str());

        cached.prog = new Program(built);
        cached.ker  = new Kernel(*cached.prog, "diffUpdate");

        addKernelToCache(deviceId, cacheKey, cached);
    }

    auto launch =
        KernelFunctor<Buffer, KParam, float, float, int, unsigned, unsigned>(
            *cached.ker);

    NDRange threads(THREADS_X, THREADS_Y, 1);

    // Work-groups needed to cover dims 0 and 1 of a single slice.
    int blkX = divup(inout.info.dims[0], threads[0]);
    int blkY = divup(inout.info.dims[1], threads[1]);

    // Dims 2 and 3 are folded into the x and y extents of the launch grid.
    NDRange global(threads[0] * blkX * inout.info.dims[2],
                   threads[1] * blkY * inout.info.dims[3],
                   1);

    launch(EnqueueArgs(getQueue(), global, threads),
           *inout.data, inout.info, dt, mct, fluxFnCode, blkX, blkY);

    CL_DEBUG_FINISH(getQueue());
}
void randomDistribution(cl::Buffer out, const size_t elements, cl::Buffer state, cl::Buffer pos, cl::Buffer sh1, cl::Buffer sh2, const uint mask, cl::Buffer recursion_table, cl::Buffer temper_table, int kerIdx) { int threads = THREADS; int min_elements_per_block = 32*THREADS*4*sizeof(uint)/sizeof(T); int blocks = divup(elements, min_elements_per_block); blocks = (blocks > MAX_BLOCKS)? MAX_BLOCKS : blocks; int elementsPerBlock = divup(elements, blocks); NDRange local(threads, 1); NDRange global(threads * blocks, 1); Kernel ker = get_random_engine_kernel<T>(AF_RANDOM_ENGINE_MERSENNE_GP11213, kerIdx, elementsPerBlock); auto randomEngineOp = KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer, uint, cl::Buffer, cl::Buffer, uint, uint>(ker); randomEngineOp(EnqueueArgs(getQueue(), global, local), out, state, pos, sh1, sh2, mask, recursion_table, temper_table, elementsPerBlock, elements); CL_DEBUG_FINISH(getQueue()); }
/// Launches the separable "convolve" kernel along dimension `conv_dim`.
///
/// The program is compiled once per device for each combination of
/// conv_dim, T, accType, expand and filter length, and stored in
/// kernelCaches. The filter is copied into a temporary device buffer that is
/// released before returning, including when the enqueue calls throw.
///
/// @param out     output buffer and layout info
/// @param signal  input signal buffer and layout info
/// @param filter  filter buffer; dims[0] * dims[1] elements are copied from
///                byte offset 0
void convSep(Param out, const Param signal, const Param filter)
{
    try {
        const int fLen = filter.info.dims[0] * filter.info.dims[1];

        std::string ref_name =
            std::string("convsep_") +
            std::to_string(conv_dim) +
            std::string("_") +
            std::string(dtype_traits<T>::getName()) +
            std::string("_") +
            std::string(dtype_traits<accType>::getName()) +
            std::string("_") +
            std::to_string(expand) +
            std::string("_") +
            std::to_string(fLen);

        int device = getActiveDeviceId();

        kc_t::iterator idx = kernelCaches[device].find(ref_name);

        kc_entry_t entry;
        if (idx == kernelCaches[device].end()) {
            // Local-memory tile: the work-group tile padded by (fLen - 1) on
            // both sides along the convolved dimension.
            const size_t C0_SIZE = (THREADS_X + 2 * (fLen - 1)) * THREADS_Y;
            const size_t C1_SIZE = (THREADS_Y + 2 * (fLen - 1)) * THREADS_X;

            size_t locSize = (conv_dim == 0 ? C0_SIZE : C1_SIZE);

            std::ostringstream options;
            options << " -D T=" << dtype_traits<T>::getName()
                    << " -D accType="<< dtype_traits<accType>::getName()
                    << " -D CONV_DIM="<< conv_dim
                    << " -D EXPAND="<< expand
                    << " -D FLEN="<< fLen
                    << " -D LOCAL_MEM_SIZE="<<locSize;
            if (std::is_same<T, double>::value ||
                std::is_same<T, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }

            Program prog;
            buildProgram(prog, convolve_separable_cl,
                         convolve_separable_cl_len, options.str());
            entry.prog = new Program(prog);
            entry.ker  = new Kernel(*entry.prog, "convolve");

            kernelCaches[device][ref_name] = entry;
        } else {
            entry = idx->second;
        }

        auto convOp = KernelFunctor<Buffer, KParam, Buffer, KParam, Buffer,
                                    int, int>(*entry.ker);

        NDRange local(THREADS_X, THREADS_Y);

        int blk_x = divup(out.info.dims[0], THREADS_X);
        int blk_y = divup(out.info.dims[1], THREADS_Y);

        // Dims 2 and 3 of the signal are folded into the launch grid.
        NDRange global(blk_x * signal.info.dims[2] * THREADS_X,
                       blk_y * signal.info.dims[3] * THREADS_Y);

        cl::Buffer *mBuff = bufferAlloc(fLen * sizeof(accType));
        try {
            // FIX ME: if the filter array is strided, direct might cause issues
            getQueue().enqueueCopyBuffer(*filter.data, *mBuff, 0, 0,
                                         fLen * sizeof(accType));

            convOp(EnqueueArgs(getQueue(), global, local),
                   *out.data, out.info, *signal.data, signal.info,
                   *mBuff, blk_x, blk_y);
        } catch (...) {
            // Do not leak the temporary filter buffer when an enqueue fails.
            bufferFree(mBuff);
            throw;
        }
        bufferFree(mBuff);
    } catch (const cl::Error &err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
void harris(unsigned* corners_out, Param &x_out, Param &y_out, Param &resp_out, Param in, const unsigned max_corners, const float min_response, const float sigma, const unsigned filter_len, const float k_thr) { auto kernels = getHarrisKernels<T>(); using cl::Buffer; using cl::EnqueueArgs; using cl::NDRange; // Window filter convAccT* h_filter = new convAccT[filter_len]; // Decide between rectangular or circular filter if (sigma < 0.5f) { for (unsigned i = 0; i < filter_len; i++) h_filter[i] = (T)1.f / (filter_len); } else { gaussian1D<convAccT>(h_filter, (int)filter_len, sigma); } const unsigned border_len = filter_len / 2 + 1; // Copy filter to device object Array<convAccT> filter = createHostDataArray<convAccT>(filter_len, h_filter); Array<T> ix = createEmptyArray<T>(dim4(4, in.info.dims)); Array<T> iy = createEmptyArray<T>(dim4(4, in.info.dims)); // Compute first-order derivatives as gradients gradient<T>(iy, ix, in); Array<T> ixx = createEmptyArray<T>(dim4(4, in.info.dims)); Array<T> ixy = createEmptyArray<T>(dim4(4, in.info.dims)); Array<T> iyy = createEmptyArray<T>(dim4(4, in.info.dims)); // Second order-derivatives kernel sizes const unsigned blk_x_so = divup(in.info.dims[3] * in.info.strides[3], HARRIS_THREADS_PER_GROUP); const NDRange local_so(HARRIS_THREADS_PER_GROUP, 1); const NDRange global_so(blk_x_so * HARRIS_THREADS_PER_GROUP, 1); auto soOp = KernelFunctor< Buffer, Buffer, Buffer, unsigned, Buffer, Buffer > (*std::get<0>(kernels)); // Compute second-order derivatives soOp(EnqueueArgs(getQueue(), global_so, local_so), *ixx.get(), *ixy.get(), *iyy.get(), in.info.dims[3] * in.info.strides[3], *ix.get(), *iy.get()); CL_DEBUG_FINISH(getQueue()); // Convolve second order derivatives with proper window filter conv_helper<T, convAccT>(ixx, ixy, iyy, filter); cl::Buffer *d_responses = bufferAlloc(in.info.dims[3] * in.info.strides[3] * sizeof(T)); // Harris responses kernel sizes unsigned blk_x_hr = divup(in.info.dims[0] - border_len*2, HARRIS_THREADS_X); 
unsigned blk_y_hr = divup(in.info.dims[1] - border_len*2, HARRIS_THREADS_Y); const NDRange local_hr(HARRIS_THREADS_X, HARRIS_THREADS_Y); const NDRange global_hr(blk_x_hr * HARRIS_THREADS_X, blk_y_hr * HARRIS_THREADS_Y); auto hrOp = KernelFunctor< Buffer, unsigned, unsigned, Buffer, Buffer, Buffer, float, unsigned> (*std::get<2>(kernels)); // Calculate Harris responses for all pixels hrOp(EnqueueArgs(getQueue(), global_hr, local_hr), *d_responses, in.info.dims[0], in.info.dims[1], *ixx.get(), *ixy.get(), *iyy.get(), k_thr, border_len); CL_DEBUG_FINISH(getQueue()); // Number of corners is not known a priori, limit maximum number of corners // according to image dimensions unsigned corner_lim = in.info.dims[3] * in.info.strides[3] * 0.2f; unsigned corners_found = 0; cl::Buffer *d_corners_found = bufferAlloc(sizeof(unsigned)); getQueue().enqueueWriteBuffer(*d_corners_found, CL_TRUE, 0, sizeof(unsigned), &corners_found); cl::Buffer *d_x_corners = bufferAlloc(corner_lim * sizeof(float)); cl::Buffer *d_y_corners = bufferAlloc(corner_lim * sizeof(float)); cl::Buffer *d_resp_corners = bufferAlloc(corner_lim * sizeof(float)); const float min_r = (max_corners > 0) ? 0.f : min_response; auto nmOp = KernelFunctor< Buffer, Buffer, Buffer, Buffer, Buffer, unsigned, unsigned, float, unsigned, unsigned> (*std::get<3>(kernels)); // Perform non-maximal suppression nmOp(EnqueueArgs(getQueue(), global_hr, local_hr), *d_x_corners, *d_y_corners, *d_resp_corners, *d_corners_found, *d_responses, in.info.dims[0], in.info.dims[1], min_r, border_len, corner_lim); CL_DEBUG_FINISH(getQueue()); getQueue().enqueueReadBuffer(*d_corners_found, CL_TRUE, 0, sizeof(unsigned), &corners_found); bufferFree(d_responses); bufferFree(d_corners_found); *corners_out = min(corners_found, (max_corners > 0) ? 
max_corners : corner_lim); if (*corners_out == 0) return; // Set output Param info x_out.info.dims[0] = y_out.info.dims[0] = resp_out.info.dims[0] = *corners_out; x_out.info.strides[0] = y_out.info.strides[0] = resp_out.info.strides[0] = 1; x_out.info.offset = y_out.info.offset = resp_out.info.offset = 0; for (int k = 1; k < 4; k++) { x_out.info.dims[k] = y_out.info.dims[k] = resp_out.info.dims[k] = 1; x_out.info.strides[k] = x_out.info.dims[k - 1] * x_out.info.strides[k - 1]; y_out.info.strides[k] = y_out.info.dims[k - 1] * y_out.info.strides[k - 1]; resp_out.info.strides[k] = resp_out.info.dims[k - 1] * resp_out.info.strides[k - 1]; } if (max_corners > 0 && corners_found > *corners_out) { Param harris_resp; Param harris_idx; harris_resp.info.dims[0] = harris_idx.info.dims[0] = corners_found; harris_resp.info.strides[0] = harris_idx.info.strides[0] = 1; for (int k = 1; k < 4; k++) { harris_resp.info.dims[k] = 1; harris_resp.info.strides[k] = harris_resp.info.dims[k - 1] * harris_resp.info.strides[k - 1]; harris_idx.info.dims[k] = 1; harris_idx.info.strides[k] = harris_idx.info.dims[k - 1] * harris_idx.info.strides[k - 1]; } int sort_elem = harris_resp.info.strides[3] * harris_resp.info.dims[3]; harris_resp.data = d_resp_corners; // Create indices using range harris_idx.data = bufferAlloc(sort_elem * sizeof(unsigned)); kernel::range<uint>(harris_idx, 0); // Sort Harris responses kernel::sort0ByKey<float, uint>(harris_resp, harris_idx, false); x_out.data = bufferAlloc(*corners_out * sizeof(float)); y_out.data = bufferAlloc(*corners_out * sizeof(float)); resp_out.data = bufferAlloc(*corners_out * sizeof(float)); // Keep corners kernel sizes const unsigned blk_x_kc = divup(*corners_out, HARRIS_THREADS_PER_GROUP); const NDRange local_kc(HARRIS_THREADS_PER_GROUP, 1); const NDRange global_kc(blk_x_kc * HARRIS_THREADS_PER_GROUP, 1); auto kcOp = KernelFunctor< Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, unsigned> (*std::get<1>(kernels)); // Keep only the first 
corners_to_keep corners with higher Harris // responses kcOp(EnqueueArgs(getQueue(), global_kc, local_kc), *x_out.data, *y_out.data, *resp_out.data, *d_x_corners, *d_y_corners, *harris_resp.data, *harris_idx.data, *corners_out); CL_DEBUG_FINISH(getQueue()); bufferFree(d_x_corners); bufferFree(d_y_corners); bufferFree(harris_resp.data); bufferFree(harris_idx.data); } else if (max_corners == 0 && corners_found < corner_lim) { x_out.data = bufferAlloc(*corners_out * sizeof(float)); y_out.data = bufferAlloc(*corners_out * sizeof(float)); resp_out.data = bufferAlloc(*corners_out * sizeof(float)); getQueue().enqueueCopyBuffer(*d_x_corners, *x_out.data, 0, 0, *corners_out * sizeof(float)); getQueue().enqueueCopyBuffer(*d_y_corners, *y_out.data, 0, 0, *corners_out * sizeof(float)); getQueue().enqueueCopyBuffer(*d_resp_corners, *resp_out.data, 0, 0, *corners_out * sizeof(float)); bufferFree(d_x_corners); bufferFree(d_y_corners); bufferFree(d_resp_corners); } else { x_out.data = d_x_corners; y_out.data = d_y_corners; resp_out.data = d_resp_corners; } }
void regions(Param out, Param in) { try { static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; static std::map<int, Program*> regionsProgs; static std::map<int, Kernel *> ilKernel; static std::map<int, Kernel *> frKernel; static std::map<int, Kernel *> ueKernel; int device = getActiveDeviceId(); static const int block_dim = 16; static const int num_warps = 8; std::call_once( compileFlags[device], [device] () { std::ostringstream options; if (full_conn) { options << " -D T=" << dtype_traits<T>::getName() << " -D BLOCK_DIM=" << block_dim << " -D NUM_WARPS=" << num_warps << " -D N_PER_THREAD=" << n_per_thread << " -D LIMIT_MAX=" << limit_max<T>() << " -D FULL_CONN"; } else { options << " -D T=" << dtype_traits<T>::getName() << " -D BLOCK_DIM=" << block_dim << " -D NUM_WARPS=" << num_warps << " -D N_PER_THREAD=" << n_per_thread << " -D LIMIT_MAX=" << limit_max<T>(); } if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) { options << " -D USE_DOUBLE"; } Program prog; buildProgram(prog, regions_cl, regions_cl_len, options.str()); regionsProgs[device] = new Program(prog); ilKernel[device] = new Kernel(*regionsProgs[device], "initial_label"); frKernel[device] = new Kernel(*regionsProgs[device], "final_relabel"); ueKernel[device] = new Kernel(*regionsProgs[device], "update_equiv"); }); const NDRange local(THREADS_X, THREADS_Y); const int blk_x = divup(in.info.dims[0], THREADS_X*2); const int blk_y = divup(in.info.dims[1], THREADS_Y*2); const NDRange global(blk_x * THREADS_X, blk_y * THREADS_Y); auto ilOp = make_kernel<Buffer, KParam, Buffer, KParam> (*ilKernel[device]); ilOp(EnqueueArgs(getQueue(), global, local), *out.data, out.info, *in.data, in.info); CL_DEBUG_FINISH(getQueue()); int h_continue = 1; cl::Buffer *d_continue = bufferAlloc(sizeof(int)); while (h_continue) { h_continue = 0; getQueue().enqueueWriteBuffer(*d_continue, CL_TRUE, 0, sizeof(int), &h_continue); auto ueOp = make_kernel<Buffer, KParam, Buffer> (*ueKernel[device]); 
ueOp(EnqueueArgs(getQueue(), global, local), *out.data, out.info, *d_continue); CL_DEBUG_FINISH(getQueue()); getQueue().enqueueReadBuffer(*d_continue, CL_TRUE, 0, sizeof(int), &h_continue); } bufferFree(d_continue); // Now, perform the final relabeling. This converts the equivalency // map from having unique labels based on the lowest pixel in the // component to being sequentially numbered components starting at // 1. int size = in.info.dims[0] * in.info.dims[1]; compute::command_queue c_queue(getQueue()()); // Wrap raw device ptr compute::context context(getContext()()); compute::vector<T> tmp(size, context); clEnqueueCopyBuffer(getQueue()(), (*out.data)(), tmp.get_buffer().get(), 0, 0, size * sizeof(T), 0, NULL, NULL); // Sort the copy compute::sort(tmp.begin(), tmp.end(), c_queue); // Take the max element, this is the number of label assignments to // compute. //int num_bins = tmp[size - 1] + 1; T last_label; clEnqueueReadBuffer(getQueue()(), tmp.get_buffer().get(), CL_TRUE, (size - 1) * sizeof(T), sizeof(T), &last_label, 0, NULL, NULL); int num_bins = (int)last_label + 1; Buffer labels(getContext(), CL_MEM_READ_WRITE, num_bins * sizeof(T)); compute::buffer c_labels(labels()); compute::buffer_iterator<T> labels_begin = compute::make_buffer_iterator<T>(c_labels, 0); compute::buffer_iterator<T> labels_end = compute::make_buffer_iterator<T>(c_labels, num_bins); // Find the end of each section of values compute::counting_iterator<T> search_begin(0); int tmp_size = size; BOOST_COMPUTE_CLOSURE(int, upper_bound_closure, (int v), (tmp, tmp_size), { int start = 0, n = tmp_size, i; while(start < n) { i = (start + n) / 2; if(v < tmp[i]) { n = i; } else { start = i + 1; } } return start; }); BOOST_COMPUTE_FUNCTION(int, clamp_to_one, (int i), { return (i >= 1) ? 1 : i; });
// Perform the scan -- this can computes the correct labels for each // component compute::transform(labels_begin, labels_end, labels_begin, clamp_to_one, c_queue); compute::exclusive_scan(labels_begin, labels_end, labels_begin, c_queue); // Apply the correct labels to the equivalency map auto frOp = make_kernel<Buffer, KParam, Buffer, KParam, Buffer> (*frKernel[device]); //Buffer labels_buf(tmp.get_buffer().get()); frOp(EnqueueArgs(getQueue(), global, local), *out.data, out.info, *in.data, in.info, labels); CL_DEBUG_FINISH(getQueue()); } catch (cl::Error err) { CL_TO_AF_ERROR(err); throw; } } } //namespace kernel } //namespace opencl