/**
 * Stores the node list and, when it is non-null, creates the staging and
 * device-local uniform buffers and fills in the two descriptors that point
 * into the device-local buffer.
 *
 * Both buffers are sized for 100 VkeNodeUniform entries followed by a
 * 64 * 64 byte transform region.
 *
 * @param inData node list to use; when NULL only the pointer is stored and
 *               no buffers are created.
 */
void vkeGameRendererDynamic::setNodeData(VkeNodeData::List *inData){
    m_node_data = inData;
    if (m_node_data == NULL){
        return;
    }

    // The node count is queried but the sizes below use a fixed 100 entries.
    uint32_t nodeCount = m_node_data->count();
    (void)nodeCount;

    const size_t nodeBytes = sizeof(VkeNodeUniform) * 100;
    const uint32_t transformBytes = 64 * 64;

    uint32_t totalBytes = static_cast<uint32_t>(nodeBytes);
    totalBytes += transformBytes;

    // Host-visible staging copy of the uniform data.
    const VkBufferUsageFlags stagingUsage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
    bufferCreate(&m_uniforms_buffer_staging,
                 totalBytes,
                 static_cast<VkBufferUsageFlagBits>(stagingUsage));
    bufferAlloc(&m_uniforms_buffer_staging,
                &m_uniforms_staging,
                VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);

    // Device-local buffer that the descriptors reference.
    bufferCreate(&m_uniforms_buffer, totalBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
    bufferAlloc(&m_uniforms_buffer,
                &m_uniforms_memory,
                VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

    // Node uniforms occupy the front of the buffer ...
    m_uniforms_descriptor.buffer = m_uniforms_buffer;
    m_uniforms_descriptor.offset = 0;
    m_uniforms_descriptor.range = nodeBytes;

    // ... and the transform region follows immediately after them.
    m_transforms_descriptor.buffer = m_uniforms_buffer;
    m_transforms_descriptor.offset = nodeBytes;
    m_transforms_descriptor.range = transformBytes;
}
/**
 * Host-side driver for the mean reduction along the first dimension.
 *
 * When one work-group per column is enough, a single launch writes straight
 * into `out`. Otherwise the first launch writes per-group partial results
 * (and partial weights) into temporary buffers, and a second launch with a
 * single group along x reduces those partials into `out`.
 *
 * Ti, Tw and To are expected to come from a template declaration that is
 * not visible in this excerpt.
 *
 * @param out      destination of the reduced values.
 * @param in       input values.
 * @param inWeight weights handed to the first launch.
 */
void mean_first(Param out, Param in, Param inWeight)
{
    // Launch geometry.
    uint threads_x = nextpow2(std::max(32u, (uint)in.info.dims[0]));
    threads_x = std::min(threads_x, THREADS_PER_GROUP);
    uint threads_y = THREADS_PER_GROUP / threads_x;

    uint groups_x = divup(in.info.dims[0], threads_x * REPEAT);
    uint groups_y = divup(in.info.dims[1], threads_y);

    const bool twoPasses = (groups_x > 1);

    // Placeholder weight parameter with all-zero geometry. The buffer only
    // has to be a valid handle, so the caller's weight buffer is borrowed.
    Param emptyWeight;
    emptyWeight.data = inWeight.data;
    emptyWeight.info.offset = 0;
    for (int d = 0; d < 4; ++d) {
        emptyWeight.info.dims[d] = 0;
        emptyWeight.info.strides[d] = 0;
    }

    Param partialOut = out;
    Param partialWeight = emptyWeight;

    if (twoPasses) {
        const auto partialCount =
            groups_x * in.info.dims[1] * in.info.dims[2] * in.info.dims[3];

        partialOut.data = bufferAlloc(partialCount * sizeof(To));
        partialWeight.data = bufferAlloc(partialCount * sizeof(Tw));

        // One partial result per group along dimension 0.
        partialOut.info.dims[0] = groups_x;
        for (int d = 1; d < 4; d++) {
            partialOut.info.strides[d] *= groups_x;
        }
        partialWeight.info = partialOut.info;
    }

    mean_first_launcher<Ti, Tw, To>(partialOut, partialWeight, in, inWeight,
                                    threads_x, groups_x, groups_y);

    if (!twoPasses) {
        return;
    }

    // Second pass: combine the partials; no weight output is needed here.
    mean_first_launcher<Ti, Tw, To>(out, emptyWeight, partialOut, partialWeight,
                                    threads_x, 1, groups_y);

    bufferFree(partialOut.data);
    bufferFree(partialWeight.data);
}
/**
 * Stores the node list and, when it is non-null, allocates a host-side
 * scratch block plus a device-local uniform buffer, then fills in the two
 * descriptors that point into that buffer.
 *
 * The buffer holds one VkeNodeUniform per node followed by
 * 64 * m_instance_count bytes of transform data.
 *
 * @param inData node list to use; when NULL only the pointer is stored and
 *               nothing is allocated.
 */
void vkeGameRendererDynamic::setNodeData(VkeNodeData::List *inData){
    m_node_data = inData;
    if (m_node_data == NULL){
        return;
    }

    const uint32_t nodeCount = m_node_data->count();
    const size_t nodeBytes = sizeof(VkeNodeUniform) * nodeCount;
    const uint32_t transformBytes = 64 * m_instance_count;

    uint32_t totalBytes = static_cast<uint32_t>(nodeBytes);
    totalBytes += transformBytes;

    // Host-side copy of the uniform data (result of malloc is not checked).
    m_uniforms_local = static_cast<float*>(malloc(totalBytes));

    bufferCreate(&m_uniforms_buffer, totalBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
    bufferAlloc(&m_uniforms_buffer,
                &m_uniforms_memory,
                VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

    // Node uniforms sit at the start of the buffer ...
    m_uniforms_descriptor.buffer = m_uniforms_buffer;
    m_uniforms_descriptor.offset = 0;
    m_uniforms_descriptor.range = nodeBytes;

    // ... followed directly by the per-instance transform region.
    m_transforms_descriptor.buffer = m_uniforms_buffer;
    m_transforms_descriptor.offset = nodeBytes;
    m_transforms_descriptor.range = transformBytes;
}
/**
 * Stores the material list and, when it is non-null, creates a host-visible
 * staging buffer sized for one VkeMaterialUniform per material, maps it and
 * lets every material initialise and write its uniform data.
 *
 * @param inData material list to use; when NULL only the pointer is stored.
 */
void vkeGameRendererDynamic::setMaterialData(VkeMaterial::List *inData){
    m_materials = inData;
    if (m_materials == NULL){
        return;
    }

    const uint32_t materialCount = m_materials->count();
    const uint32_t stagingBytes = sizeof(VkeMaterialUniform) * materialCount;

    const VkBufferUsageFlags stagingUsage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
    bufferCreate(&m_material_buffer_staging,
                 stagingBytes,
                 static_cast<VkBufferUsageFlagBits>(stagingUsage));
    bufferAlloc(&m_material_buffer_staging,
                &m_material_staging,
                VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT);

    // Map the whole staging allocation so the materials can write into it.
    VkeMaterialUniform *mapped = NULL;
    VKA_CHECK_ERROR(vkMapMemory(getDefaultDevice(), m_material_staging, 0, stagingBytes, 0, (void **)&mapped), "Could not map buffer memory.\n");

    uint32_t index = 0;
    while (index < materialCount){
        VkeMaterial *material = m_materials->getMaterial(index);
        material->initVKBufferData(m_material_buffer_staging);
        material->updateVKBufferData(mapped);
        ++index;
    }

    vkUnmapMemory(getDefaultDevice(), m_material_staging);
}
/**
 * Host-side driver for a scan along dimension `dim`.
 *
 * If a single group covers the scanned dimension, one launch produces the
 * final result. Otherwise three steps are used:
 *   1. scan each group independently and store the per-group totals in a
 *      temporary buffer,
 *   2. scan the temporary buffer itself (always inclusive; af_notzero_t is
 *      replaced by af_add_t for this step),
 *   3. broadcast the scanned totals back onto `out`.
 *
 * Ti, To, op and inclusive_scan are expected to come from a template
 * declaration that is not visible in this excerpt.
 *
 * @param out destination of the scan.
 * @param in  input values.
 * @param dim dimension to scan along; used to index the 4-entry dims,
 *            strides and group arrays.
 */
static void scan_dim(Param &out, const Param &in, int dim)
{
    uint threads_y = std::min(THREADS_Y, nextpow2(out.info.dims[dim]));
    uint threads_x = THREADS_X;

    uint groups_all[] = {divup((uint)out.info.dims[0], threads_x),
                         (uint)out.info.dims[1],
                         (uint)out.info.dims[2],
                         (uint)out.info.dims[3]};
    groups_all[dim] = divup(out.info.dims[dim], threads_y * REPEAT);

    // Fast path: the whole scanned dimension fits in one group.
    if (groups_all[dim] == 1) {
        scan_dim_launcher<Ti, To, op, inclusive_scan>(out, out, in, dim, true,
                                                      threads_y, groups_all);
        return;
    }

    // Temporary array with one entry per group along `dim`, densely packed.
    Param partial = out;
    partial.info.dims[dim] = groups_all[dim];
    partial.info.strides[0] = 1;
    for (int d = 1; d < 4; d++) {
        partial.info.strides[d] = partial.info.strides[d - 1] * partial.info.dims[d - 1];
    }

    int partialElements = partial.info.strides[3] * partial.info.dims[3];
    // Released at the end of this function.
    partial.data = bufferAlloc(partialElements * sizeof(To));

    // Step 1: per-group scans, totals go to `partial`.
    scan_dim_launcher<Ti, To, op, inclusive_scan>(out, partial, in, dim, false,
                                                  threads_y, groups_all);

    // Step 2: scan the totals with a single group along `dim`.
    int savedGroups = groups_all[dim];
    groups_all[dim] = 1;

    if (op == af_notzero_t) {
        scan_dim_launcher<To, To, af_add_t, true>(partial, partial, partial, dim, true,
                                                  threads_y, groups_all);
    } else {
        scan_dim_launcher<To, To, op, true>(partial, partial, partial, dim, true,
                                            threads_y, groups_all);
    }

    // Step 3: restore the group count and fold the totals into `out`.
    groups_all[dim] = savedGroups;
    bcast_dim_launcher<To, To, op, inclusive_scan>(out, partial, dim, true,
                                                   threads_y, groups_all);

    bufferFree(partial.data);
}
/**
 * Runs a batched 2D convolution: for every (dim 2, dim 3) slice of the
 * filter, copies that slice into `p.impulse` and invokes conv2Helper.
 *
 * `p.impulse` is allocated here and is not released by this function.
 *
 * T, aT and expand are expected to come from a template declaration that is
 * not visible in this excerpt.
 *
 * @param p    kernel parameters; `impulse`, `o[1..2]` and `s[1..2]` are
 *             written, `outHasNoOffset` / `inHasNoOffset` are read.
 * @param out  convolution output.
 * @param sig  input signal.
 * @param filt filter; its offset and its strides along dims 2 and 3 locate
 *             each slice.
 */
void conv2(conv_kparam_t& p, Param& out, const Param& sig, const Param& filt)
{
    const size_t sliceBytes = filt.info.dims[0] * filt.info.dims[1] * sizeof(aT);
    p.impulse = bufferAlloc(sliceBytes);

    const int baseOffset = filt.info.offset;

    for (int batch3 = 0; batch3 < filt.info.dims[3]; ++batch3) {
        const int offset3 = batch3 * filt.info.strides[3];

        for (int batch2 = 0; batch2 < filt.info.dims[2]; ++batch2) {
            const int offset2 = batch2 * filt.info.strides[2];
            const int sliceOffset = offset2 + offset3 + baseOffset;

            // FIXME: if the filter array is strided, direct copy of symbols
            // might cause issues
            getQueue().enqueueCopyBuffer(*filt.data, *p.impulse,
                                         sliceOffset * sizeof(aT), 0, sliceBytes);

            // Batch indices seen by the kernel; zero when the corresponding
            // array carries no batch offset.
            if (p.outHasNoOffset) {
                p.o[1] = 0;
                p.o[2] = 0;
            } else {
                p.o[1] = batch2;
                p.o[2] = batch3;
            }

            if (p.inHasNoOffset) {
                p.s[1] = 0;
                p.s[2] = 0;
            } else {
                p.s[1] = batch2;
                p.s[2] = batch3;
            }

            conv2Helper<T, aT, expand>(p, out, sig, filt);
        }
    }
}
/**
 * Launches one pass of a separable 2D convolution on the active device.
 *
 * The OpenCL program is built once per device (guarded by std::call_once)
 * and cached in function-local static maps. The filter taps are copied into
 * a temporary buffer that is released after the kernel has been enqueued.
 *
 * T, accType, conv_dim, expand and fLen are expected to come from a
 * template declaration that is not visible in this excerpt.
 *
 * @param out    convolution output.
 * @param signal input signal; dims 2 and 3 scale the global work size.
 * @param filter filter taps; fLen elements are read starting at the
 *               filter's own offset.
 */
void convolve2(Param out, const Param signal, const Param filter)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> convProgs;
        static std::map<int, Kernel*>  convKernels;

        int device = getActiveDeviceId();

        std::call_once( compileFlags[device], [device] () {
                    const size_t C0_SIZE = (THREADS_X+2*(fLen-1))* THREADS_Y;
                    const size_t C1_SIZE = (THREADS_Y+2*(fLen-1))* THREADS_X;

                    // Local memory footprint depends on the convolved dimension.
                    size_t locSize = (conv_dim==0 ? C0_SIZE : C1_SIZE);

                    std::ostringstream options;
                    options << " -D T=" << dtype_traits<T>::getName()
                            << " -D accType="<< dtype_traits<accType>::getName()
                            << " -D CONV_DIM="<< conv_dim
                            << " -D EXPAND="<< expand
                            << " -D FLEN="<< fLen
                            << " -D LOCAL_MEM_SIZE="<<locSize;
                    if (std::is_same<T, double>::value ||
                        std::is_same<T, cdouble>::value) {
                        options << " -D USE_DOUBLE";
                    }

                    Program prog;
                    buildProgram(prog, convolve_separable_cl, convolve_separable_cl_len, options.str());
                    convProgs[device]   = new Program(prog);
                    convKernels[device] = new Kernel(*convProgs[device], "convolve");
                });

        auto convOp = make_kernel<Buffer, KParam, Buffer, KParam, Buffer,
                                  int, int>(*convKernels[device]);

        NDRange local(THREADS_X, THREADS_Y);

        int blk_x = divup(out.info.dims[0], THREADS_X);
        int blk_y = divup(out.info.dims[1], THREADS_Y);

        NDRange global(blk_x*signal.info.dims[2]*THREADS_X,
                       blk_y*signal.info.dims[3]*THREADS_Y);

        cl::Buffer *mBuff = bufferAlloc(fLen*sizeof(accType));

        // Start reading at the filter's own offset (in elements) so that a
        // filter that is a view into a larger buffer is copied correctly.
        // FIX ME: if the filter array is strided, direct might cause issues
        getQueue().enqueueCopyBuffer(*filter.data, *mBuff,
                                     filter.info.offset*sizeof(accType), 0,
                                     fLen*sizeof(accType));

        convOp(EnqueueArgs(getQueue(), global, local),
               *out.data, out.info, *signal.data, signal.info, *mBuff, blk_x, blk_y);

        bufferFree(mBuff);
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
/**
 * Builds the array selected from `in` by four per-dimension indexers.
 *
 * Each indexer is either a sequence (isSeq) or an index array. Sequence
 * dimensions contribute offsets and strides; array dimensions are cast to
 * uint and their element count becomes the output extent along that
 * dimension.
 *
 * T is expected to come from a template declaration that is not visible in
 * this excerpt.
 *
 * @param in    source array.
 * @param idxrs four indexers, one per dimension.
 * @return the indexed array; an empty array when the resulting dimensions
 *         contain no elements.
 */
Array<T> index(const Array<T>& in, const af_index_t idxrs[])
{
    kernel::IndexKernelParam_t p;
    std::vector<af_seq> seqs(4, af_span);

    // Collect the sequences; non-sequence dimensions stay at af_span.
    for (dim_t x=0; x<4; ++x) {
        if (idxrs[x].isSeq) {
            seqs[x] = idxrs[x].idx.seq;
        }
    }

    // retrieve dimensions, strides and offsets
    dim4 iDims = in.dims();
    dim4 dDims = in.getDataDims();
    dim4 oDims = toDims  (seqs, iDims);
    dim4 iOffs = toOffset(seqs, dDims);
    dim4 iStrds= toStride(seqs, dDims);

    for (dim_t i=0; i<4; ++i) {
        p.isSeq[i] = idxrs[i].isSeq;
        p.offs[i]  = iOffs[i];
        p.strds[i] = iStrds[i];
    }

    Buffer* bPtrs[4];

    std::vector< Array<uint> > idxArrs(4, createEmptyArray<uint>(dim4()));
    // Gather a buffer pointer for every dimension.
    for (dim_t x=0; x<4; ++x) {
        if (!p.isSeq[x]) {
            // Index array: keep it alive in idxArrs and borrow its buffer.
            idxArrs[x] = castArray<uint>(idxrs[x].idx.arr);
            bPtrs[x] = idxArrs[x].get();
            // The output extent along this dimension is the index count.
            oDims[x] = idxArrs[x].elements();
        } else {
            // alloc an 1-element buffer to avoid OpenCL from failing
            bPtrs[x] = bufferAlloc(sizeof(uint));
        }
    }

    Array<T> out = createEmptyArray<T>(oDims);

    // Skip the kernel for an empty result, but always fall through to the
    // cleanup below so the placeholder buffers are never leaked.
    if (oDims.elements() != 0) {
        kernel::index<T>(out, in, p, bPtrs);
    }

    // Only the placeholder buffers are owned here; index-array buffers
    // belong to idxArrs.
    for (dim_t x=0; x<4; ++x) {
        if (p.isSeq[x]) bufferFree(bPtrs[x]);
    }

    return out;
}
/**
 * Launches the 3D morphology kernel on the active device.
 *
 * The kernel is looked up in the per-device kernel cache under a name built
 * from the element type, isDilation and SeLength, and is compiled and
 * cached on first use. The structuring element is copied into a temporary
 * buffer that is released after the kernel has been enqueued.
 *
 * T, isDilation and SeLength are expected to come from a template
 * declaration that is not visible in this excerpt.
 *
 * @param out  output volume.
 * @param in   input volume; dim 3 is folded into the x extent of the launch.
 * @param mask structuring element; SeLength^3 elements are read starting at
 *             the mask's own offset.
 */
void morph3d(Param out, const Param in, const Param mask)
{
    std::string refName = std::string("morph3d_") +
        std::string(dtype_traits<T>::getName()) +
        std::to_string(isDilation) +
        std::to_string(SeLength);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog==0 && entry.ker==0) {
        std::string options = generateOptionsString<T, isDilation, SeLength>();
        const char* ker_strs[] = {morph_cl};
        const int   ker_lens[] = {morph_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options);
        entry.prog = new Program(prog);
        entry.ker = new Kernel(*entry.prog, "morph3d");

        addKernelToCache(device, refName, entry);
    }

    auto morphOp = KernelFunctor< Buffer, KParam, Buffer, KParam,
                                  Buffer, cl::LocalSpaceArg, int
                                >(*entry.ker);

    NDRange local(CUBE_X, CUBE_Y, CUBE_Z);

    int blk_x = divup(in.info.dims[0], CUBE_X);
    int blk_y = divup(in.info.dims[1], CUBE_Y);
    int blk_z = divup(in.info.dims[2], CUBE_Z);
    // launch batch * blk_x blocks along x dimension
    NDRange global(blk_x * CUBE_X * in.info.dims[3],
                   blk_y * CUBE_Y,
                   blk_z * CUBE_Z);

    // Copy the structuring element into its own buffer, starting at the
    // mask's offset (in elements) so that a mask that is a view into a
    // larger buffer is read from the right place.
    cl_int se_size   = sizeof(T)*SeLength*SeLength*SeLength;
    cl::Buffer *mBuff = bufferAlloc(se_size);
    getQueue().enqueueCopyBuffer(*mask.data, *mBuff,
                                 mask.info.offset*sizeof(T), 0, se_size);

    // calculate shared memory size
    const int padding = (SeLength%2==0 ? (SeLength-1) : (2*(SeLength/2)));
    const int locLen  = CUBE_X+padding+1;
    const int locArea = locLen *(CUBE_Y+padding);
    const int locSize = locArea*(CUBE_Z+padding);

    morphOp(EnqueueArgs(getQueue(), global, local),
            *out.data, out.info, *in.data, in.info,
            *mBuff, cl::Local(locSize*sizeof(T)), blk_x);

    bufferFree(mBuff);

    CL_DEBUG_FINISH(getQueue());
}
unsigned nonMaximal(cl::Buffer* x_out, cl::Buffer* y_out, cl::Buffer* resp_out, const unsigned idim0, const unsigned idim1, const cl::Buffer* resp_in, const unsigned edge, const unsigned max_corners) { unsigned corners_found = 0; std::string refName = std::string("non_maximal_") + std::string(dtype_traits<T>::getName()); int device = getActiveDeviceId(); kc_entry_t entry = kernelCache(device, refName); if (entry.prog == 0 && entry.ker == 0) { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D NONMAX"; if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) options << " -D USE_DOUBLE"; const char* ker_strs[] = {susan_cl}; const int ker_lens[] = {susan_cl_len}; Program prog; buildProgram(prog, 1, ker_strs, ker_lens, options.str()); entry.prog = new Program(prog); entry.ker = new Kernel(*entry.prog, "non_maximal"); addKernelToCache(device, refName, entry); } cl::Buffer* d_corners_found = bufferAlloc(sizeof(unsigned)); getQueue().enqueueWriteBuffer(*d_corners_found, CL_TRUE, 0, sizeof(unsigned), &corners_found); auto nonMaximalOp = KernelFunctor<Buffer, Buffer, Buffer, Buffer, unsigned, unsigned, Buffer, unsigned, unsigned>(*entry.ker); NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y); NDRange global(divup(idim0 - 2 * edge, local[0]) * local[0], divup(idim1 - 2 * edge, local[1]) * local[1]); nonMaximalOp(EnqueueArgs(getQueue(), global, local), *x_out, *y_out, *resp_out, *d_corners_found, idim0, idim1, *resp_in, edge, max_corners); getQueue().enqueueReadBuffer(*d_corners_found, CL_TRUE, 0, sizeof(unsigned), &corners_found); bufferFree(d_corners_found); return corners_found; }
/**
 * Launches the CSR sparse matrix / dense vector product kernel on the
 * active device.
 *
 * The program is specialised on the element type, on whether alpha differs
 * from 1 and beta differs from 0, on the greedy flag and on the thread
 * count; it is compiled once per such combination and stored in the
 * per-device kernel cache together with two kernels ("csrmv_thread" and
 * "csrmv_block"). The block kernel is the one currently selected.
 *
 * T is expected to come from a template declaration that is not visible in
 * this excerpt.
 *
 * @param out    result vector.
 * @param values CSR values.
 * @param rowIdx CSR row index array; its first dimension minus one is
 *               passed to the kernel as the row count.
 * @param colIdx CSR column indices.
 * @param rhs    dense right-hand side.
 * @param alpha  scalar passed to the kernel.
 * @param beta   scalar passed to the kernel.
 */
void csrmv(Param out,
           const Param &values, const Param &rowIdx, const Param &colIdx,
           const Param &rhs, const T alpha, const T beta)
{
    const bool use_alpha = (alpha != scalar<T>(1.0));
    const bool use_beta  = (beta  != scalar<T>(0.0));

    // Using greedy indexing is causing performance issues on many platforms
    // FIXME: Figure out why
    const bool use_greedy = false;

    // FIXME: Find a better number based on average non zeros per row
    const int threads = 64;

    // Cache key: one entry per distinct set of compile-time options.
    std::string ref_name("csrmv_");
    ref_name += std::string(dtype_traits<T>::getName());
    ref_name += std::string("_");
    ref_name += std::to_string(use_alpha);
    ref_name += std::string("_");
    ref_name += std::to_string(use_beta);
    ref_name += std::string("_");
    ref_name += std::to_string(use_greedy);
    ref_name += std::string("_");
    ref_name += std::to_string(threads);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, ref_name);

    const bool needsBuild = (entry.prog==0 && entry.ker==0);
    if (needsBuild) {
        const bool isDouble  = std::is_same<T, double>::value ||
                               std::is_same<T, cdouble>::value;
        const bool isComplex = std::is_same<T, cfloat>::value ||
                               std::is_same<T, cdouble>::value;

        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName()
                << " -D USE_ALPHA=" << use_alpha
                << " -D USE_BETA=" << use_beta
                << " -D USE_GREEDY=" << use_greedy
                << " -D THREADS=" << threads;

        if (isDouble) {
            options << " -D USE_DOUBLE";
        }
        options << (isComplex ? " -D IS_CPLX=1" : " -D IS_CPLX=0");

        const char *ker_strs[] = {csrmv_cl};
        const int   ker_lens[] = {csrmv_cl_len};

        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog   = new Program(prog);
        entry.ker    = new Kernel[2];
        entry.ker[0] = Kernel(*entry.prog, "csrmv_thread");
        entry.ker[1] = Kernel(*entry.prog, "csrmv_block");

        addKernelToCache(device, ref_name, entry);
    }

    // Device-side counter, zero-initialised with a blocking write.
    int zero = 0;
    cl::Buffer *counter = bufferAlloc(sizeof(int));
    getQueue().enqueueWriteBuffer(*counter, CL_TRUE, 0, sizeof(int), (void *)&zero);

    // TODO: Figure out the proper way to choose either csrmv_thread or csrmv_block
    const bool is_csrmv_block = true;
    auto selectedKernel = is_csrmv_block ? entry.ker[1] : entry.ker[0];
    auto launch = KernelFunctor<Buffer,
                                Buffer, Buffer, Buffer,
                                int,
                                Buffer, KParam, T, T, Buffer>(selectedKernel);

    NDRange local(is_csrmv_block ? threads : THREADS_PER_GROUP, 1);

    int M = rowIdx.info.dims[0] - 1;

    int groups_x = is_csrmv_block ? divup(M, REPEAT) : divup(M, REPEAT * local[0]);
    groups_x = std::min(groups_x, MAX_CSRMV_GROUPS);

    NDRange global(local[0] * groups_x, 1);

    launch(EnqueueArgs(getQueue(), global, local),
           *out.data, *values.data, *rowIdx.data, *colIdx.data,
           M, *rhs.data, rhs.info, alpha, beta, *counter);

    CL_DEBUG_FINISH(getQueue());
    bufferFree(counter);
}
void orb(unsigned* out_feat, Param& x_out, Param& y_out, Param& score_out, Param& ori_out, Param& size_out, Param& desc_out, Param image, const float fast_thr, const unsigned max_feat, const float scl_fctr, const unsigned levels, const bool blur_img) { try { static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; static Program orbProgs[DeviceManager::MAX_DEVICES]; static Kernel hrKernel[DeviceManager::MAX_DEVICES]; static Kernel kfKernel[DeviceManager::MAX_DEVICES]; static Kernel caKernel[DeviceManager::MAX_DEVICES]; static Kernel eoKernel[DeviceManager::MAX_DEVICES]; int device = getActiveDeviceId(); std::call_once( compileFlags[device], [device] () { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D BLOCK_SIZE=" << ORB_THREADS_X; if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) { options << " -D USE_DOUBLE"; } buildProgram(orbProgs[device], orb_cl, orb_cl_len, options.str()); hrKernel[device] = Kernel(orbProgs[device], "harris_response"); kfKernel[device] = Kernel(orbProgs[device], "keep_features"); caKernel[device] = Kernel(orbProgs[device], "centroid_angle"); eoKernel[device] = Kernel(orbProgs[device], "extract_orb"); }); unsigned patch_size = REF_PAT_SIZE; unsigned min_side = std::min(image.info.dims[0], image.info.dims[1]); unsigned max_levels = 0; float scl_sum = 0.f; for (unsigned i = 0; i < levels; i++) { min_side /= scl_fctr; // Minimum image side for a descriptor to be computed if (min_side < patch_size || max_levels == levels) break; max_levels++; scl_sum += 1.f / (float)pow(scl_fctr,(float)i); } std::vector<cl::Buffer*> d_x_pyr(max_levels); std::vector<cl::Buffer*> d_y_pyr(max_levels); std::vector<cl::Buffer*> d_score_pyr(max_levels); std::vector<cl::Buffer*> d_ori_pyr(max_levels); std::vector<cl::Buffer*> d_size_pyr(max_levels); std::vector<cl::Buffer*> d_desc_pyr(max_levels); std::vector<unsigned> feat_pyr(max_levels); unsigned total_feat = 0; // Compute number of features to keep for 
each level std::vector<unsigned> lvl_best(max_levels); unsigned feat_sum = 0; for (unsigned i = 0; i < max_levels-1; i++) { float lvl_scl = (float)pow(scl_fctr,(float)i); lvl_best[i] = ceil((max_feat / scl_sum) / lvl_scl); feat_sum += lvl_best[i]; } lvl_best[max_levels-1] = max_feat - feat_sum; // Maintain a reference to previous level image Param prev_img; Param lvl_img; const unsigned gauss_len = 9; T* h_gauss = nullptr; Param gauss_filter; gauss_filter.data = nullptr; for (unsigned i = 0; i < max_levels; i++) { const float lvl_scl = (float)pow(scl_fctr,(float)i); if (i == 0) { // First level is used in its original size lvl_img = image; prev_img = image; } else if (i > 0) { // Resize previous level image to current level dimensions lvl_img.info.dims[0] = round(image.info.dims[0] / lvl_scl); lvl_img.info.dims[1] = round(image.info.dims[1] / lvl_scl); lvl_img.info.strides[0] = 1; lvl_img.info.strides[1] = lvl_img.info.dims[0]; for (int k = 2; k < 4; k++) { lvl_img.info.dims[k] = 1; lvl_img.info.strides[k] = lvl_img.info.dims[k - 1] * lvl_img.info.strides[k - 1]; } lvl_img.info.offset = 0; lvl_img.data = bufferAlloc(lvl_img.info.dims[3] * lvl_img.info.strides[3] * sizeof(T)); resize<T, AF_INTERP_BILINEAR>(lvl_img, prev_img); if (i > 1) bufferFree(prev_img.data); prev_img = lvl_img; } unsigned lvl_feat = 0; Param d_x_feat, d_y_feat, d_score_feat; // Round feature size to nearest odd integer float size = 2.f * floor(patch_size / 2.f) + 1.f; // Avoid keeping features that might be too wide and might not fit on // the image, sqrt(2.f) is the radius when angle is 45 degrees and // represents widest case possible unsigned edge = ceil(size * sqrt(2.f) / 2.f); // Detect FAST features fast<T, 9, true>(&lvl_feat, d_x_feat, d_y_feat, d_score_feat, lvl_img, fast_thr, 0.15f, edge); if (lvl_feat == 0) { feat_pyr[i] = 0; if (i > 0 && i == max_levels-1) bufferFree(lvl_img.data); continue; } bufferFree(d_score_feat.data); unsigned usable_feat = 0; cl::Buffer* d_usable_feat = 
bufferAlloc(sizeof(unsigned)); getQueue().enqueueWriteBuffer(*d_usable_feat, CL_TRUE, 0, sizeof(unsigned), &usable_feat); cl::Buffer* d_x_harris = bufferAlloc(lvl_feat * sizeof(float)); cl::Buffer* d_y_harris = bufferAlloc(lvl_feat * sizeof(float)); cl::Buffer* d_score_harris = bufferAlloc(lvl_feat * sizeof(float)); // Calculate Harris responses // Good block_size >= 7 (must be an odd number) const dim_type blk_x = divup(lvl_feat, ORB_THREADS_X); const NDRange local(ORB_THREADS_X, ORB_THREADS_Y); const NDRange global(blk_x * ORB_THREADS_X, ORB_THREADS_Y); unsigned block_size = 7; float k_thr = 0.04f; auto hrOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer, const unsigned, Buffer, Buffer, KParam, const unsigned, const float, const unsigned> (hrKernel[device]); hrOp(EnqueueArgs(getQueue(), global, local), *d_x_harris, *d_y_harris, *d_score_harris, *d_x_feat.data, *d_y_feat.data, lvl_feat, *d_usable_feat, *lvl_img.data, lvl_img.info, block_size, k_thr, patch_size); CL_DEBUG_FINISH(getQueue()); getQueue().enqueueReadBuffer(*d_usable_feat, CL_TRUE, 0, sizeof(unsigned), &usable_feat); bufferFree(d_x_feat.data); bufferFree(d_y_feat.data); bufferFree(d_usable_feat); if (usable_feat == 0) { feat_pyr[i] = 0; bufferFree(d_x_harris); bufferFree(d_y_harris); bufferFree(d_score_harris); if (i > 0 && i == max_levels-1) bufferFree(lvl_img.data); continue; } // Sort features according to Harris responses Param d_harris_sorted; Param d_harris_idx; d_harris_sorted.info.dims[0] = usable_feat; d_harris_idx.info.dims[0] = usable_feat; d_harris_sorted.info.strides[0] = 1; d_harris_idx.info.strides[0] = 1; for (int k = 1; k < 4; k++) { d_harris_sorted.info.dims[k] = 1; d_harris_idx.info.dims[k] = 1; d_harris_sorted.info.strides[k] = d_harris_sorted.info.dims[k - 1] * d_harris_sorted.info.strides[k - 1]; d_harris_idx.info.strides[k] = d_harris_idx.info.dims[k - 1] * d_harris_idx.info.strides[k - 1]; } d_harris_sorted.info.offset = 0; d_harris_idx.info.offset = 0; 
d_harris_sorted.data = d_score_harris; d_harris_idx.data = bufferAlloc((d_harris_idx.info.dims[0]) * sizeof(unsigned)); sort0_index<float, false>(d_harris_sorted, d_harris_idx); cl::Buffer* d_x_lvl = bufferAlloc(usable_feat * sizeof(float)); cl::Buffer* d_y_lvl = bufferAlloc(usable_feat * sizeof(float)); cl::Buffer* d_score_lvl = bufferAlloc(usable_feat * sizeof(float)); usable_feat = min(usable_feat, lvl_best[i]); // Keep only features with higher Harris responses const dim_type keep_blk = divup(usable_feat, ORB_THREADS); const NDRange local_keep(ORB_THREADS, 1); const NDRange global_keep(keep_blk * ORB_THREADS, 1); auto kfOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, const unsigned> (kfKernel[device]); kfOp(EnqueueArgs(getQueue(), global_keep, local_keep), *d_x_lvl, *d_y_lvl, *d_score_lvl, *d_x_harris, *d_y_harris, *d_harris_sorted.data, *d_harris_idx.data, usable_feat); CL_DEBUG_FINISH(getQueue()); bufferFree(d_x_harris); bufferFree(d_y_harris); bufferFree(d_harris_sorted.data); bufferFree(d_harris_idx.data); cl::Buffer* d_ori_lvl = bufferAlloc(usable_feat * sizeof(float)); cl::Buffer* d_size_lvl = bufferAlloc(usable_feat * sizeof(float)); // Compute orientation of features const dim_type centroid_blk_x = divup(usable_feat, ORB_THREADS_X); const NDRange local_centroid(ORB_THREADS_X, ORB_THREADS_Y); const NDRange global_centroid(centroid_blk_x * ORB_THREADS_X, ORB_THREADS_Y); auto caOp = make_kernel<Buffer, Buffer, Buffer, const unsigned, Buffer, KParam, const unsigned> (caKernel[device]); caOp(EnqueueArgs(getQueue(), global_centroid, local_centroid), *d_x_lvl, *d_y_lvl, *d_ori_lvl, usable_feat, *lvl_img.data, lvl_img.info, patch_size); CL_DEBUG_FINISH(getQueue()); Param lvl_filt; Param lvl_tmp; if (blur_img) { lvl_filt = lvl_img; lvl_tmp = lvl_img; lvl_filt.data = bufferAlloc(lvl_filt.info.dims[0] * lvl_filt.info.dims[1] * sizeof(T)); lvl_tmp.data = bufferAlloc(lvl_tmp.info.dims[0] * lvl_tmp.info.dims[1] * sizeof(T)); // Calculate a 
separable Gaussian kernel if (h_gauss == nullptr) { h_gauss = new T[gauss_len]; gaussian1D(h_gauss, gauss_len, 2.f); gauss_filter.info.dims[0] = gauss_len; gauss_filter.info.strides[0] = 1; for (int k = 1; k < 4; k++) { gauss_filter.info.dims[k] = 1; gauss_filter.info.strides[k] = gauss_filter.info.dims[k - 1] * gauss_filter.info.strides[k - 1]; } dim_type gauss_elem = gauss_filter.info.strides[3] * gauss_filter.info.dims[3]; gauss_filter.data = bufferAlloc(gauss_elem * sizeof(T)); getQueue().enqueueWriteBuffer(*gauss_filter.data, CL_TRUE, 0, gauss_elem * sizeof(T), h_gauss); } // Filter level image with Gaussian kernel to reduce noise sensitivity convolve2<T, convAccT, 0, false, gauss_len>(lvl_tmp, lvl_img, gauss_filter); convolve2<T, convAccT, 1, false, gauss_len>(lvl_filt, lvl_tmp, gauss_filter); bufferFree(lvl_tmp.data); } // Compute ORB descriptors cl::Buffer* d_desc_lvl = bufferAlloc(usable_feat * 8 * sizeof(unsigned)); unsigned* h_desc_lvl = new unsigned[usable_feat * 8]; for (int j = 0; j < (int)usable_feat * 8; j++) h_desc_lvl[j] = 0; getQueue().enqueueWriteBuffer(*d_desc_lvl, CL_TRUE, 0, usable_feat * 8 * sizeof(unsigned), h_desc_lvl); delete[] h_desc_lvl; auto eoOp = make_kernel<Buffer, const unsigned, Buffer, Buffer, Buffer, Buffer, Buffer, KParam, const float, const unsigned> (eoKernel[device]); if (blur_img) { eoOp(EnqueueArgs(getQueue(), global_centroid, local_centroid), *d_desc_lvl, usable_feat, *d_x_lvl, *d_y_lvl, *d_ori_lvl, *d_size_lvl, *lvl_filt.data, lvl_filt.info, lvl_scl, patch_size); CL_DEBUG_FINISH(getQueue()); bufferFree(lvl_filt.data); } else { eoOp(EnqueueArgs(getQueue(), global_centroid, local_centroid), *d_desc_lvl, usable_feat, *d_x_lvl, *d_y_lvl, *d_ori_lvl, *d_size_lvl, *lvl_img.data, lvl_img.info, lvl_scl, patch_size); CL_DEBUG_FINISH(getQueue()); } // Store results to pyramids total_feat += usable_feat; feat_pyr[i] = usable_feat; d_x_pyr[i] = d_x_lvl; d_y_pyr[i] = d_y_lvl; d_score_pyr[i] = d_score_lvl; d_ori_pyr[i] = d_ori_lvl; 
d_size_pyr[i] = d_size_lvl; d_desc_pyr[i] = d_desc_lvl; if (i > 0 && i == max_levels-1) bufferFree(lvl_img.data); } if (gauss_filter.data != nullptr) bufferFree(gauss_filter.data); if (h_gauss != nullptr) delete[] h_gauss; // If no features are found, set found features to 0 and return if (total_feat == 0) { *out_feat = 0; return; } // Allocate output memory x_out.info.dims[0] = total_feat; x_out.info.strides[0] = 1; y_out.info.dims[0] = total_feat; y_out.info.strides[0] = 1; score_out.info.dims[0] = total_feat; score_out.info.strides[0] = 1; ori_out.info.dims[0] = total_feat; ori_out.info.strides[0] = 1; size_out.info.dims[0] = total_feat; size_out.info.strides[0] = 1; desc_out.info.dims[0] = 8; desc_out.info.strides[0] = 1; desc_out.info.dims[1] = total_feat; desc_out.info.strides[1] = desc_out.info.dims[0]; for (int k = 1; k < 4; k++) { x_out.info.dims[k] = 1; x_out.info.strides[k] = x_out.info.dims[k - 1] * x_out.info.strides[k - 1]; y_out.info.dims[k] = 1; y_out.info.strides[k] = y_out.info.dims[k - 1] * y_out.info.strides[k - 1]; score_out.info.dims[k] = 1; score_out.info.strides[k] = score_out.info.dims[k - 1] * score_out.info.strides[k - 1]; ori_out.info.dims[k] = 1; ori_out.info.strides[k] = ori_out.info.dims[k - 1] * ori_out.info.strides[k - 1]; size_out.info.dims[k] = 1; size_out.info.strides[k] = size_out.info.dims[k - 1] * size_out.info.strides[k - 1]; if (k > 1) { desc_out.info.dims[k] = 1; desc_out.info.strides[k] = desc_out.info.dims[k - 1] * desc_out.info.strides[k - 1]; } } if (total_feat > 0) { size_t out_sz = total_feat * sizeof(float); x_out.data = bufferAlloc(out_sz); y_out.data = bufferAlloc(out_sz); score_out.data = bufferAlloc(out_sz); ori_out.data = bufferAlloc(out_sz); size_out.data = bufferAlloc(out_sz); size_t desc_sz = total_feat * 8 * sizeof(unsigned); desc_out.data = bufferAlloc(desc_sz); } unsigned offset = 0; for (unsigned i = 0; i < max_levels; i++) { if (feat_pyr[i] == 0) continue; if (i > 0) offset += feat_pyr[i-1]; 
getQueue().enqueueCopyBuffer(*d_x_pyr[i], *x_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float)); getQueue().enqueueCopyBuffer(*d_y_pyr[i], *y_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float)); getQueue().enqueueCopyBuffer(*d_score_pyr[i], *score_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float)); getQueue().enqueueCopyBuffer(*d_ori_pyr[i], *ori_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float)); getQueue().enqueueCopyBuffer(*d_size_pyr[i], *size_out.data, 0, offset*sizeof(float), feat_pyr[i] * sizeof(float)); getQueue().enqueueCopyBuffer(*d_desc_pyr[i], *desc_out.data, 0, offset*8*sizeof(unsigned), feat_pyr[i] * 8 * sizeof(unsigned)); bufferFree(d_x_pyr[i]); bufferFree(d_y_pyr[i]); bufferFree(d_score_pyr[i]); bufferFree(d_ori_pyr[i]); bufferFree(d_size_pyr[i]); bufferFree(d_desc_pyr[i]); } // Sets number of output features *out_feat = total_feat; } catch (cl::Error err) { CL_TO_AF_ERROR(err); throw; } }
void VkeCubeTexture::loadTextureFiles(const char **inPath){ bool imagesOK = true; VKA_INFO_MSG("Loading Cube Texture.\n"); for (uint32_t i = 0; i < 6; ++i){ if (!loadTexture(inPath[i], NULL, NULL, &m_width, &m_height)){ VKA_ERROR_MSG("Error loading texture image.\n"); printf("Texture : %d not available (%s).\n", i, inPath[i]); return; } } VulkanDC::Device::Queue::Name queueName = "DEFAULT_GRAPHICS_QUEUE"; VulkanDC::Device::Queue::CommandBufferID cmdID = INIT_COMMAND_ID; VulkanDC *dc = VulkanDC::Get(); VulkanDC::Device *device = dc->getDefaultDevice(); VulkanDC::Device::Queue *queue = device->getQueue(queueName); VkCommandBuffer cmd = VK_NULL_HANDLE; queue->beginCommandBuffer(cmdID, &cmd, VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT); imageCreateAndBind( &m_data.image, &m_data.memory, m_format, VK_IMAGE_TYPE_2D, m_width, m_height, 1, 6, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, (VkImageUsageFlagBits)( VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT ), VK_IMAGE_TILING_OPTIMAL); VkBuffer cubeMapBuffer; VkDeviceMemory cubeMapMem; bufferCreate(&cubeMapBuffer, m_width*m_height * 4 * 6, VK_BUFFER_USAGE_TRANSFER_SRC_BIT); bufferAlloc(&cubeMapBuffer, &cubeMapMem, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); VkDeviceSize dSize = m_width * m_height * 4; uint32_t rowPitch = m_width * 4; if (m_memory_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT){ imageSetLayoutBarrier(cmdID, queueName, m_data.image, VK_IMAGE_ASPECT_COLOR_BIT, VK_IMAGE_LAYOUT_PREINITIALIZED, VK_IMAGE_LAYOUT_GENERAL); for (uint32_t i = 0; i < 6; ++i){ void *data = NULL; VkDeviceSize ofst = dSize*i; VKA_CHECK_ERROR(vkMapMemory(getDefaultDevice(),cubeMapMem, ofst, dSize, 0, &data), "Could not map memory for image.\n"); if (!loadTexture(inPath[i], (uint8_t**)&data, rowPitch, &m_width, &m_height)){ VKA_ERROR_MSG("Could not load final image.\n"); } vkUnmapMemory(getDefaultDevice(), cubeMapMem); } VkBufferImageCopy biCpyRgn[6]; for (uint32_t k = 0; k < 6; ++k){ VkDeviceSize ofst = dSize*k; biCpyRgn[k].bufferOffset = 
ofst; biCpyRgn[k].bufferImageHeight = 0; biCpyRgn[k].bufferRowLength = 0; biCpyRgn[k].imageExtent.width = m_width; biCpyRgn[k].imageExtent.height = m_height; biCpyRgn[k].imageExtent.depth = 1; biCpyRgn[k].imageOffset.x = 0; biCpyRgn[k].imageOffset.y = 0; biCpyRgn[k].imageOffset.z = 0; biCpyRgn[k].imageSubresource.baseArrayLayer = k; biCpyRgn[k].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; biCpyRgn[k].imageSubresource.layerCount = 1; biCpyRgn[k].imageSubresource.mipLevel = 0; } VkFence copyFence; VkFenceCreateInfo fenceInfo; memset(&fenceInfo, 0, sizeof(fenceInfo)); fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; vkCreateFence(device->getVKDevice(), &fenceInfo,NULL , ©Fence); vkCmdCopyBufferToImage(cmd, cubeMapBuffer, m_data.image, m_data.imageLayout, 6, biCpyRgn); queue->flushCommandBuffer(cmdID , ©Fence); vkWaitForFences(device->getVKDevice(), 1, ©Fence, VK_TRUE, 100000000000); vkDestroyBuffer(device->getVKDevice(), cubeMapBuffer, NULL); vkFreeMemory(device->getVKDevice(), cubeMapMem, NULL); } VkSamplerCreateInfo sampler; sampler.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; sampler.pNext = NULL; sampler.magFilter = VK_FILTER_NEAREST; sampler.minFilter = VK_FILTER_NEAREST; sampler.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; sampler.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; sampler.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; sampler.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; sampler.mipLodBias = 0.0f; sampler.maxAnisotropy = 1; sampler.compareOp = VK_COMPARE_OP_NEVER; sampler.minLod = 0.0f; sampler.maxLod = 0.0f; sampler.borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE; VkImageViewCreateInfo view; view.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; view.pNext = NULL; view.viewType = VK_IMAGE_VIEW_TYPE_CUBE; view.format = m_format; view.components = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A }; view.subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 0 
}; view.subresourceRange.baseArrayLayer = 0; view.subresourceRange.levelCount = 1; view.subresourceRange.baseMipLevel = 0; view.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; view.subresourceRange.layerCount = 1; VKA_CHECK_ERROR(vkCreateSampler(getDefaultDevice(), &sampler,NULL, &m_data.sampler), "Could not create sampler for image texture.\n"); view.image = m_data.image; VKA_CHECK_ERROR(vkCreateImageView(getDefaultDevice(), &view,NULL, &m_data.view), "Could not create image view for texture.\n"); VKA_INFO_MSG("Created CUBE Image Texture.\n"); }
void VkeCubeTexture::loadCubeDDS(const char *inFile){ std::string searchPaths[] = { std::string(PROJECT_NAME), NVPWindow::sysExePath() + std::string(PROJECT_RELDIRECTORY), std::string(PROJECT_ABSDIRECTORY) }; nv_dds::CDDSImage ddsImage; for (uint32_t i = 0; i < 3; ++i){ std::string separator = ""; uint32_t strSize = searchPaths[i].size(); if(searchPaths[i].substr(strSize-1,strSize) != "/") separator = "/"; std::string filePath = searchPaths[i] + separator + std::string("images/") + std::string(inFile); ddsImage.load(filePath, true); if (ddsImage.is_valid()) break; } if (!ddsImage.is_valid()){ perror("Could not cube load texture image.\n"); exit(1); } uint32_t imgW = ddsImage.get_width(); uint32_t imgH = ddsImage.get_height(); uint32_t comCount = ddsImage.get_components(); uint32_t fmt = ddsImage.get_format(); bool isCube = ddsImage.is_cubemap(); bool isComp = ddsImage.is_compressed(); VkFormat vkFmt = VK_FORMAT_R8G8B8A8_UNORM; switch (fmt){ case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT: vkFmt = VK_FORMAT_BC1_RGB_SRGB_BLOCK; break; case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT: vkFmt = VK_FORMAT_BC2_UNORM_BLOCK; break; case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT: vkFmt = VK_FORMAT_BC3_UNORM_BLOCK; break; default: break; } m_width = imgW; m_height = imgH; m_format = vkFmt; VulkanDC::Device::Queue::Name queueName = "DEFAULT_GRAPHICS_QUEUE"; VulkanDC::Device::Queue::CommandBufferID cmdID = INIT_COMMAND_ID; VulkanDC *dc = VulkanDC::Get(); VulkanDC::Device *device = dc->getDefaultDevice(); VulkanDC::Device::Queue *queue = device->getQueue(queueName); VkCommandBuffer cmd = VK_NULL_HANDLE; queue->beginCommandBuffer(cmdID, &cmd, VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT); imageCreateAndBind( &m_data.image, &m_data.memory, m_format, VK_IMAGE_TYPE_2D, m_width, m_height, 1, 6, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, (VkImageUsageFlagBits)(VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT), VK_IMAGE_TILING_OPTIMAL); VkBuffer cubeMapBuffer; VkDeviceMemory cubeMapMem; 
bufferCreate(&cubeMapBuffer, m_width*m_height * 3 * 6, VK_BUFFER_USAGE_TRANSFER_SRC_BIT); bufferAlloc(&cubeMapBuffer, &cubeMapMem, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); if (m_memory_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT){ imageSetLayoutBarrier(cmdID, queueName, m_data.image, VK_IMAGE_ASPECT_COLOR_BIT, VK_IMAGE_LAYOUT_PREINITIALIZED, VK_IMAGE_LAYOUT_GENERAL); for (uint32_t i = 0; i < 6; ++i){ void *data = NULL; VkSubresourceLayout layout; VkImageSubresource subres; subres.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; subres.mipLevel = m_mip_level; subres.arrayLayer = i; vkGetImageSubresourceLayout(getDefaultDevice(), m_data.image, &subres, &layout); VKA_CHECK_ERROR(vkMapMemory(getDefaultDevice(), cubeMapMem, layout.offset, layout.size, 0, &data), "Could not map memory for image.\n"); const nv_dds::CTexture &mipmap = ddsImage.get_cubemap_face(i); memcpy(data, (void *)mipmap, layout.size); vkUnmapMemory(getDefaultDevice(), cubeMapMem); } VkBufferImageCopy biCpyRgn[6]; for (uint32_t k = 0; k < 6; ++k){ VkSubresourceLayout layout; VkImageSubresource subres; subres.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; subres.mipLevel = m_mip_level; subres.arrayLayer = k; vkGetImageSubresourceLayout(getDefaultDevice(), m_data.image, &subres, &layout); biCpyRgn[k].bufferOffset = layout.offset; biCpyRgn[k].bufferImageHeight = 0; biCpyRgn[k].bufferRowLength = 0; biCpyRgn[k].imageExtent.width = m_width; biCpyRgn[k].imageExtent.height = m_height; biCpyRgn[k].imageExtent.depth = 1; biCpyRgn[k].imageOffset.x = 0; biCpyRgn[k].imageOffset.y = 0; biCpyRgn[k].imageOffset.z = 0; biCpyRgn[k].imageSubresource.baseArrayLayer = k; biCpyRgn[k].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; biCpyRgn[k].imageSubresource.layerCount = 1; biCpyRgn[k].imageSubresource.mipLevel = 0; } VkFence copyFence; VkFenceCreateInfo fenceInfo; memset(&fenceInfo, 0, sizeof(fenceInfo)); fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; vkCreateFence(device->getVKDevice(), &fenceInfo, NULL, ©Fence); 
vkCmdCopyBufferToImage(cmd, cubeMapBuffer, m_data.image, m_data.imageLayout, 6, biCpyRgn); queue->flushCommandBuffer(cmdID, ©Fence); vkWaitForFences(device->getVKDevice(), 1, ©Fence, VK_TRUE, 100000000000); vkDestroyBuffer(device->getVKDevice(), cubeMapBuffer, NULL); vkFreeMemory(device->getVKDevice(), cubeMapMem, NULL); } VkSamplerCreateInfo sampler; sampler.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; sampler.pNext = NULL; sampler.magFilter = VK_FILTER_NEAREST; sampler.minFilter = VK_FILTER_NEAREST; sampler.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; sampler.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; sampler.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; sampler.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; sampler.mipLodBias = 0.0f; sampler.maxAnisotropy = 1; sampler.compareOp = VK_COMPARE_OP_NEVER; sampler.minLod = 0.0f; sampler.maxLod = 0.0f; sampler.borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE; VkImageViewCreateInfo view; view.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; view.pNext = NULL; view.viewType = VK_IMAGE_VIEW_TYPE_CUBE; view.format = m_format; view.components.r = VK_COMPONENT_SWIZZLE_R; view.components.g = VK_COMPONENT_SWIZZLE_G; view.components.b = VK_COMPONENT_SWIZZLE_B; view.components.a = VK_COMPONENT_SWIZZLE_A; view.subresourceRange.baseArrayLayer = 0; view.subresourceRange.levelCount = 1; view.subresourceRange.baseMipLevel = 0; view.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; view.subresourceRange.layerCount = 1; VKA_CHECK_ERROR(vkCreateSampler(getDefaultDevice(), &sampler, NULL, &m_data.sampler), "Could not create sampler for image texture.\n"); view.image = m_data.image; VKA_CHECK_ERROR(vkCreateImageView(getDefaultDevice(), &view, NULL, &m_data.view), "Could not create image view for texture.\n"); }
// Launches the "csrmm_nt" OpenCL kernel for a CSR sparse matrix (values,
// rowIdx, colIdx) and a dense right-hand side, writing into `out`.
//
// The kernel is compiled once per (type, alpha-used, beta-used, greedy)
// combination and stored in the per-device kernel cache under a name built
// from those four properties. `T` is declared outside this view (presumably
// a template parameter).
//
//   out     - destination buffer
//   values  - CSR non-zero values
//   rowIdx  - CSR row pointers; the row count is taken as dims[0] - 1
//   colIdx  - CSR column indices
//   rhs     - dense operand; dims[0] gives N
//   alpha   - scale factor; USE_ALPHA is enabled only when alpha != 1
//   beta    - scale factor; USE_BETA is enabled only when beta != 0
void csrmm_nt(Param out, const Param &values, const Param &rowIdx,
              const Param &colIdx, const Param &rhs, const T alpha,
              const T beta) {
    const bool use_alpha = (alpha != scalar<T>(1.0));
    const bool use_beta  = (beta != scalar<T>(0.0));

    // Using greedy indexing is causing performance issues on many platforms
    // FIXME: Figure out why
    const bool use_greedy = false;

    // Cache key: csrmm_nt_<type>_<alpha>_<beta>_<greedy>
    std::string ref_name("csrmm_nt_");
    ref_name += std::string(dtype_traits<T>::getName());
    ref_name += std::string("_");
    ref_name += std::to_string(use_alpha);
    ref_name += std::string("_");
    ref_name += std::to_string(use_beta);
    ref_name += std::string("_");
    ref_name += std::to_string(use_greedy);

    int device       = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, ref_name);

    const bool needs_build = (entry.prog == 0) && (entry.ker == 0);
    if (needs_build) {
        const bool is_double =
            std::is_same<T, double>::value || std::is_same<T, cdouble>::value;
        const bool is_complex =
            std::is_same<T, cfloat>::value || std::is_same<T, cdouble>::value;

        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D USE_ALPHA=" << use_alpha;
        options << " -D USE_BETA=" << use_beta;
        options << " -D USE_GREEDY=" << use_greedy;
        options << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP;
        if (is_double) { options << " -D USE_DOUBLE"; }
        options << " -D IS_CPLX=" << (is_complex ? 1 : 0);

        const char *ker_strs[] = {csrmm_cl};
        const int ker_lens[]   = {csrmm_cl_len};

        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());

        entry.prog   = new Program(prog);
        entry.ker    = new Kernel[2];
        entry.ker[0] = Kernel(*entry.prog, "csrmm_nt");
        // FIXME: Change this after adding another kernel
        entry.ker[1] = Kernel(*entry.prog, "csrmm_nt");

        addKernelToCache(device, ref_name, entry);
    }

    auto kernel = entry.ker[0];
    auto launch = KernelFunctor<Buffer, Buffer, Buffer, Buffer, int, int,
                                Buffer, KParam, T, T, Buffer>(kernel);

    NDRange local(THREADS_PER_GROUP, 1);

    int M = rowIdx.info.dims[0] - 1;
    int N = rhs.info.dims[0];

    int groups_x = divup(N, local[0]);
    int groups_y = divup(M, REPEAT);
    groups_y     = std::min(groups_y, MAX_CSRMM_GROUPS);

    NDRange global(local[0] * groups_x, local[1] * groups_y);

    // One zero-initialised int per group along x, uploaded with a blocking
    // write before the kernel is enqueued.
    std::vector<int> zeros(groups_x);
    const size_t counter_bytes = zeros.size() * sizeof(int);
    cl::Buffer *counter        = bufferAlloc(counter_bytes);
    getQueue().enqueueWriteBuffer(*counter, CL_TRUE, 0, counter_bytes,
                                  (void *)zeros.data());

    launch(EnqueueArgs(getQueue(), global, local), *out.data, *values.data,
           *rowIdx.data, *colIdx.data, M, N, *rhs.data, rhs.info, alpha, beta,
           *counter);

    bufferFree(counter);
}
// Builds in `out` a 1-D buffer of `uint` sized by the count obtained from
// scanning `in` with af_notzero_t, then fills it via get_out_idx when that
// count is non-zero.
//
// Steps, as visible in the code:
//   1. scan_first_launcher<T, uint, af_notzero_t> writes per-element results
//      to `otmp` and per-group results to `rtmp`.
//   2. `rtmp` is viewed as a single linear array and scanned in place with
//      af_add_t; its last element is read back as the output length.
//   3. `out` is allocated with that length and, if non-empty, filled by
//      get_out_idx.
//
//   out - receives a freshly allocated buffer and 1-D dims/strides
//   in  - input array
static void where(Param &out, Param &in) {
    uint threads_x = nextpow2(std::max(32u, (uint)in.info.dims[0]));
    threads_x      = std::min(threads_x, THREADS_PER_GROUP);
    uint threads_y = THREADS_PER_GROUP / threads_x;

    uint groups_x = divup(in.info.dims[0], threads_x * REPEAT);
    uint groups_y = divup(in.info.dims[1], threads_y);

    // rtmp: one entry per group along dim 0; same extents as `in` elsewhere.
    Param rtmp;
    rtmp.info.offset     = 0;
    rtmp.info.dims[0]    = groups_x;
    rtmp.info.strides[0] = 1;
    for (int d = 1; d < 4; d++) {
        rtmp.info.dims[d]    = in.info.dims[d];
        rtmp.info.strides[d] = rtmp.info.strides[d - 1] * rtmp.info.dims[d - 1];
    }

    // otmp: same extents as `in`, densely packed.
    Param otmp;
    otmp.info.offset     = 0;
    otmp.info.dims[0]    = in.info.dims[0];
    otmp.info.strides[0] = 1;
    for (int d = 1; d < 4; d++) {
        otmp.info.dims[d]    = in.info.dims[d];
        otmp.info.strides[d] = otmp.info.strides[d - 1] * otmp.info.dims[d - 1];
    }

    int rtmp_elements = rtmp.info.strides[3] * rtmp.info.dims[3];
    rtmp.data         = bufferAlloc(rtmp_elements * sizeof(uint));

    int otmp_elements = otmp.info.strides[3] * otmp.info.dims[3];
    otmp.data         = bufferAlloc(otmp_elements * sizeof(uint));

    scan_first_launcher<T, uint, af_notzero_t>(otmp, rtmp, in, false, groups_x,
                                               groups_y, threads_x);

    // Linearize the dimensions and perform scan
    Param ltmp        = rtmp;
    ltmp.info.offset  = 0;
    ltmp.info.dims[0] = rtmp_elements;
    for (int d = 1; d < 4; d++) {
        ltmp.info.dims[d]    = 1;
        ltmp.info.strides[d] = rtmp_elements;
    }

    scan_first<uint, uint, af_add_t>(ltmp, ltmp);

    // Get output size and allocate output
    uint total;
    const size_t last_offset = sizeof(uint) * (rtmp_elements - 1);
    getQueue().enqueueReadBuffer(*rtmp.data, CL_TRUE, last_offset,
                                 sizeof(uint), &total);

    out.data             = bufferAlloc(total * sizeof(uint));
    out.info.dims[0]     = total;
    out.info.strides[0]  = 1;
    for (int d = 1; d < 4; d++) {
        out.info.dims[d]    = 1;
        out.info.strides[d] = total;
    }

    if (total > 0) {
        get_out_idx<T>(out.data, otmp, rtmp, in, threads_x, groups_x,
                       groups_y);
    }

    bufferFree(rtmp.data);
    bufferFree(otmp.data);
}
void vkeGameRendererDynamic::initIndirectCommands(){ if (!m_node_data) return; VulkanDC *dc = VulkanDC::Get(); VulkanDC::Device *device = dc->getDefaultDevice(); VulkanDC::Device::Queue *queue = dc->getDefaultQueue(); uint32_t cnt = m_node_data->count(); uint32_t sz = sizeof(VkDrawIndexedIndirectCommand)*cnt; VkBuffer sceneIndirectStaging; VkDeviceMemory sceneIndirectMemStaging; VkBufferUsageFlags usageFlags = VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; bufferCreate(&m_scene_indirect_buffer, sz, (VkBufferUsageFlagBits)usageFlags); bufferAlloc(&m_scene_indirect_buffer, &m_scene_indirect_memory, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); usageFlags = VK_BUFFER_USAGE_TRANSFER_SRC_BIT; bufferCreate(&sceneIndirectStaging, sz, (VkBufferUsageFlagBits)usageFlags); bufferAlloc(&sceneIndirectStaging, &sceneIndirectMemStaging, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); VkDrawIndexedIndirectCommand *commands = NULL; VKA_CHECK_ERROR(vkMapMemory(device->getVKDevice(), sceneIndirectMemStaging, 0, sz, 0, (void **)&commands), "Could not map indirect buffer memory.\n"); for (uint32_t i = 0; i < cnt; ++i){ VkeMesh *mesh = m_node_data->getData(i)->getMesh(); commands[i].firstIndex = mesh->getFirstIndex(); commands[i].firstInstance = i*m_instance_count; commands[i].vertexOffset = mesh->getFirstVertex(); commands[i].indexCount = mesh->getIndexCount(); commands[i].instanceCount = m_instance_count; } vkUnmapMemory(device->getVKDevice(), sceneIndirectMemStaging); VkBufferCopy bufCpy; bufCpy.dstOffset = 0; bufCpy.srcOffset = 0; bufCpy.size = sz; VkCommandBuffer copyCmd; VkCommandBufferAllocateInfo cmdBufInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO }; cmdBufInfo.commandBufferCount = 1; cmdBufInfo.commandPool = queue->getCommandPool(); cmdBufInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; VKA_CHECK_ERROR(vkAllocateCommandBuffers(device->getVKDevice(), &cmdBufInfo, ©Cmd), "Could not allocate command buffers.\n"); VkCommandBufferBeginInfo cmdBeginInfo = { 
VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; cmdBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; VKA_CHECK_ERROR(vkBeginCommandBuffer(copyCmd, &cmdBeginInfo), "Could not begin commmand buffer.\n"); vkCmdCopyBuffer(copyCmd, sceneIndirectStaging, m_scene_indirect_buffer, 1, &bufCpy); VKA_CHECK_ERROR(vkEndCommandBuffer(copyCmd), "Could not end command buffer.\n"); VkSubmitInfo subInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; subInfo.commandBufferCount = 1; subInfo.pCommandBuffers = ©Cmd; VkFence theFence; VkFenceCreateInfo fenceInfo = { VK_STRUCTURE_TYPE_FENCE_CREATE_INFO }; VKA_CHECK_ERROR(vkCreateFence(device->getVKDevice(), &fenceInfo, NULL, &theFence), "Could not create fence.\n"); VKA_CHECK_ERROR(vkQueueSubmit(queue->getVKQueue(), 1, &subInfo, theFence), "Could not submit queue for indirect buffer copy.\n"); VKA_CHECK_ERROR(vkWaitForFences(device->getVKDevice(), 1, &theFence, VK_TRUE, UINT_MAX), "Could not wait for fence.\n"); vkFreeCommandBuffers(device->getVKDevice(), queue->getCommandPool(), 1, ©Cmd); vkDestroyFence(device->getVKDevice(), theFence, NULL); }
void fast(unsigned* out_feat, Param &x_out, Param &y_out, Param &score_out, Param in, const float thr, const float feature_ratio, const unsigned edge) { try { static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; static std::map<int, Program*> fastProgs; static std::map<int, Kernel*> lfKernel; static std::map<int, Kernel*> nmKernel; static std::map<int, Kernel*> gfKernel; int device = getActiveDeviceId(); std::call_once( compileFlags[device], [device] () { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D ARC_LENGTH=" << arc_length << " -D NONMAX=" << static_cast<unsigned>(nonmax); if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) { options << " -D USE_DOUBLE"; } cl::Program prog; buildProgram(prog, fast_cl, fast_cl_len, options.str()); fastProgs[device] = new Program(prog); lfKernel[device] = new Kernel(*fastProgs[device], "locate_features"); nmKernel[device] = new Kernel(*fastProgs[device], "non_max_counts"); gfKernel[device] = new Kernel(*fastProgs[device], "get_features"); }); const unsigned max_feat = ceil(in.info.dims[0] * in.info.dims[1] * feature_ratio); // Matrix containing scores for detected features, scores are stored in the // same coordinates as features, dimensions should be equal to in. 
cl::Buffer *d_score = bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(float)); std::vector<float> score_init(in.info.dims[0] * in.info.dims[1], (float)0); getQueue().enqueueWriteBuffer(*d_score, CL_TRUE, 0, in.info.dims[0] * in.info.dims[1] * sizeof(float), &score_init[0]); cl::Buffer *d_flags = d_score; if (nonmax) { d_flags = bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(T)); } const int blk_x = divup(in.info.dims[0]-edge*2, FAST_THREADS_X); const int blk_y = divup(in.info.dims[1]-edge*2, FAST_THREADS_Y); // Locate features kernel sizes const NDRange local(FAST_THREADS_X, FAST_THREADS_Y); const NDRange global(blk_x * FAST_THREADS_X, blk_y * FAST_THREADS_Y); auto lfOp = make_kernel<Buffer, KParam, Buffer, const float, const unsigned, LocalSpaceArg> (*lfKernel[device]); lfOp(EnqueueArgs(getQueue(), global, local), *in.data, in.info, *d_score, thr, edge, cl::Local((FAST_THREADS_X + 6) * (FAST_THREADS_Y + 6) * sizeof(T))); CL_DEBUG_FINISH(getQueue()); const int blk_nonmax_x = divup(in.info.dims[0], 64); const int blk_nonmax_y = divup(in.info.dims[1], 64); // Nonmax kernel sizes const NDRange local_nonmax(FAST_THREADS_NONMAX_X, FAST_THREADS_NONMAX_Y); const NDRange global_nonmax(blk_nonmax_x * FAST_THREADS_NONMAX_X, blk_nonmax_y * FAST_THREADS_NONMAX_Y); unsigned count_init = 0; cl::Buffer *d_total = bufferAlloc(sizeof(unsigned)); getQueue().enqueueWriteBuffer(*d_total, CL_TRUE, 0, sizeof(unsigned), &count_init); //size_t *global_nonmax_dims = global_nonmax(); size_t blocks_sz = blk_nonmax_x * FAST_THREADS_NONMAX_X * blk_nonmax_y * FAST_THREADS_NONMAX_Y * sizeof(unsigned); cl::Buffer *d_counts = bufferAlloc(blocks_sz); cl::Buffer *d_offsets = bufferAlloc(blocks_sz); auto nmOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer, KParam, const unsigned> (*nmKernel[device]); nmOp(EnqueueArgs(getQueue(), global_nonmax, local_nonmax), *d_counts, *d_offsets, *d_total, *d_flags, *d_score, in.info, edge); CL_DEBUG_FINISH(getQueue()); unsigned total; 
getQueue().enqueueReadBuffer(*d_total, CL_TRUE, 0, sizeof(unsigned), &total); total = total < max_feat ? total : max_feat; if (total > 0) { size_t out_sz = total * sizeof(float); x_out.data = bufferAlloc(out_sz); y_out.data = bufferAlloc(out_sz); score_out.data = bufferAlloc(out_sz); auto gfOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, KParam, const unsigned, const unsigned> (*gfKernel[device]); gfOp(EnqueueArgs(getQueue(), global_nonmax, local_nonmax), *x_out.data, *y_out.data, *score_out.data, *d_flags, *d_counts, *d_offsets, in.info, total, edge); CL_DEBUG_FINISH(getQueue()); } *out_feat = total; x_out.info.dims[0] = total; x_out.info.strides[0] = 1; y_out.info.dims[0] = total; y_out.info.strides[0] = 1; score_out.info.dims[0] = total; score_out.info.strides[0] = 1; for (int k = 1; k < 4; k++) { x_out.info.dims[k] = 1; x_out.info.strides[k] = total; y_out.info.dims[k] = 1; y_out.info.strides[k] = total; score_out.info.dims[k] = 1; score_out.info.strides[k] = total; } bufferFree(d_score); if (nonmax) bufferFree(d_flags); bufferFree(d_total); bufferFree(d_counts); bufferFree(d_offsets); } catch (cl::Error err) { CL_TO_AF_ERROR(err); throw; } }
// Enqueues the "morph" OpenCL kernel on `in` with structuring element `mask`,
// writing to `out`.
//
// `T`, `isDilation` and `windLen` are declared outside this view (presumably
// template parameters). The program is compiled once per device, with the
// initial value taken from the max-binary-op when isDilation is set and from
// the min-binary-op otherwise.
//
//   out  - destination array
//   in   - source array; dims[2] and dims[3] scale the launch grid
//   mask - structuring element; windLen * windLen elements are copied from it
void morph(Param out, const Param in, const Param mask) {
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> morProgs;
        static std::map<int, Kernel*> morKernels;

        int device = getActiveDeviceId();

        std::call_once(compileFlags[device], [device]() {
            ToNumStr<T> toNumStr;
            T init = isDilation ? Binary<T, af_max_t>().init()
                                : Binary<T, af_min_t>().init();

            std::ostringstream options;
            options << " -D T=" << dtype_traits<T>::getName()
                    << " -D isDilation="<< isDilation
                    << " -D init=" << toNumStr(init)
                    << " -D windLen=" << windLen;
            if (std::is_same<T, double>::value ||
                std::is_same<T, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }

            Program prog;
            buildProgram(prog, morph_cl, morph_cl_len, options.str());
            morProgs[device]   = new Program(prog);
            morKernels[device] = new Kernel(*morProgs[device], "morph");
        });

        auto morphOp = KernelFunctor<Buffer, KParam, Buffer, KParam, Buffer,
                                     cl::LocalSpaceArg, int, int
                                    >(*morKernels[device]);

        // Blocks needed to cover the first two dimensions of the input.
        int blk_x = divup(in.info.dims[0], THREADS_X);
        int blk_y = divup(in.info.dims[1], THREADS_Y);

        // Local memory tile: a work-group plus a halo of windLen/2 on each
        // side, with one extra column added to the row length.
        const int halo    = windLen/2;
        const int padding = 2*halo;
        const int locLen  = THREADS_X + padding + 1;
        const int locSize = locLen * (THREADS_Y+padding);

        NDRange local(THREADS_X, THREADS_Y);
        // launch batch * blk_x blocks along x dimension
        NDRange global(blk_x * THREADS_X * in.info.dims[2],
                       blk_y * THREADS_Y * in.info.dims[3]);

        // Copy the mask/filter into a scratch buffer handed to the kernel.
        cl_int se_size    = sizeof(T)*windLen*windLen;
        cl::Buffer *mBuff = bufferAlloc(se_size);
        getQueue().enqueueCopyBuffer(*mask.data, *mBuff, 0, 0, se_size);

        morphOp(EnqueueArgs(getQueue(), global, local),
                *out.data, out.info, *in.data, in.info, *mBuff,
                cl::Local(locSize*sizeof(T)), blk_x, blk_y);

        bufferFree(mBuff);

        CL_DEBUG_FINISH(getQueue());
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
void nearest_neighbour(Param idx, Param dist, Param query, Param train, const dim_t dist_dim, const unsigned n_dist) { try { const unsigned feat_len = query.info.dims[dist_dim]; const To max_dist = maxval<To>(); // Determine maximum feat_len capable of using shared memory (faster) cl_ulong avail_lmem = getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>(); size_t lmem_predef = 2 * THREADS * sizeof(unsigned) + feat_len * sizeof(T); size_t ltrain_sz = THREADS * feat_len * sizeof(T); bool use_lmem = (avail_lmem >= (lmem_predef + ltrain_sz)) ? true : false; size_t lmem_sz = (use_lmem) ? lmem_predef + ltrain_sz : lmem_predef; unsigned unroll_len = nextpow2(feat_len); if (unroll_len != feat_len) unroll_len = 0; std::string ref_name = std::string("knn_") + std::to_string(dist_type) + std::string("_") + std::to_string(use_lmem) + std::string("_") + std::string(dtype_traits<T>::getName()) + std::string("_") + std::to_string(unroll_len); int device = getActiveDeviceId(); kc_t::iterator cache_idx = kernelCaches[device].find(ref_name); kc_entry_t entry; if (cache_idx == kernelCaches[device].end()) { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D To=" << dtype_traits<To>::getName() << " -D THREADS=" << THREADS << " -D FEAT_LEN=" << unroll_len; switch(dist_type) { case AF_SAD: options <<" -D DISTOP=_sad_"; break; case AF_SSD: options <<" -D DISTOP=_ssd_"; break; case AF_SHD: options <<" -D DISTOP=_shd_ -D __SHD__"; break; default: break; } if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) { options << " -D USE_DOUBLE"; } if (use_lmem) options << " -D USE_LOCAL_MEM"; cl::Program prog; buildProgram(prog, nearest_neighbour_cl, nearest_neighbour_cl_len, options.str()); entry.prog = new Program(prog); entry.ker = new Kernel[3]; entry.ker[0] = Kernel(*entry.prog, "nearest_neighbour_unroll"); entry.ker[1] = Kernel(*entry.prog, "nearest_neighbour"); entry.ker[2] = Kernel(*entry.prog, "select_matches"); kernelCaches[device][ref_name] 
= entry; } else { entry = cache_idx->second; } const dim_t sample_dim = (dist_dim == 0) ? 1 : 0; const unsigned nquery = query.info.dims[sample_dim]; const unsigned ntrain = train.info.dims[sample_dim]; unsigned nblk = divup(ntrain, THREADS); const NDRange local(THREADS, 1); const NDRange global(nblk * THREADS, 1); cl::Buffer *d_blk_idx = bufferAlloc(nblk * nquery * sizeof(unsigned)); cl::Buffer *d_blk_dist = bufferAlloc(nblk * nquery * sizeof(To)); // For each query vector, find training vector with smallest Hamming // distance per CUDA block if (unroll_len > 0) { auto huOp = KernelFunctor<Buffer, Buffer, Buffer, KParam, Buffer, KParam, const To, LocalSpaceArg> (entry.ker[0]); huOp(EnqueueArgs(getQueue(), global, local), *d_blk_idx, *d_blk_dist, *query.data, query.info, *train.data, train.info, max_dist, cl::Local(lmem_sz)); } else { auto hmOp = KernelFunctor<Buffer, Buffer, Buffer, KParam, Buffer, KParam, const To, const unsigned, LocalSpaceArg> (entry.ker[1]); hmOp(EnqueueArgs(getQueue(), global, local), *d_blk_idx, *d_blk_dist, *query.data, query.info, *train.data, train.info, max_dist, feat_len, cl::Local(lmem_sz)); } CL_DEBUG_FINISH(getQueue()); const NDRange local_sm(32, 8); const NDRange global_sm(divup(nquery, 32) * 32, 8); // Reduce all smallest Hamming distances from each block and store final // best match auto smOp = KernelFunctor<Buffer, Buffer, Buffer, Buffer, const unsigned, const unsigned, const To> (entry.ker[2]); smOp(EnqueueArgs(getQueue(), global_sm, local_sm), *idx.data, *dist.data, *d_blk_idx, *d_blk_dist, nquery, nblk, max_dist); CL_DEBUG_FINISH(getQueue()); bufferFree(d_blk_idx); bufferFree(d_blk_dist); } catch (cl::Error err) { CL_TO_AF_ERROR(err); throw; } }
// Allocates room for `elements` objects of type T through bufferAlloc and
// returns the result cast to T*. managerInit() is called first on every
// invocation. `T` is declared outside this view (presumably a template
// parameter).
T *memAlloc(const size_t &elements) {
    managerInit();

    // Requested size in bytes.
    const size_t bytes = elements * sizeof(T);
    return (T *)bufferAlloc(bytes);
}