/// Resamples `in` at the positions described by `pos0` and `pos1`.
///
/// The output takes its first two dimensions from `pos0` and the remaining
/// ones from `in`. Only AF_INTERP_NEAREST and AF_INTERP_LINEAR are dispatched;
/// for any other `method` the output is returned exactly as created, without
/// a kernel having written to it.
///
/// @param in      Array that is sampled.
/// @param pos0    First set of sample positions forwarded to the kernel.
/// @param pos1    Second set of sample positions forwarded to the kernel.
/// @param method  Interpolation method selector.
/// @param offGrid Value forwarded to the kernel (presumably the fill value
///                for off-grid samples - TODO confirm).
/// @return Pointer obtained from createEmptyArray<Ty>.
Array<Ty> *approx2(const Array<Ty> &in, const Array<Tp> &pos0, const Array<Tp> &pos1,
                   const af_interp_type method, const float offGrid)
{
    // Start from the input shape and override the two sampled dimensions.
    af::dim4 outDims = in.dims();
    outDims[0] = pos0.dims()[0];
    outDims[1] = pos0.dims()[1];

    // Create output placeholder
    Array<Ty> *result = createEmptyArray<Ty>(outDims);

    if (method == AF_INTERP_NEAREST) {
        approx2_<Ty, Tp, AF_INTERP_NEAREST>
                (result->get(), result->dims(), result->elements(),
                 in.get(), in.dims(), in.elements(),
                 pos0.get(), pos0.dims(), pos1.get(), pos1.dims(),
                 result->strides(), in.strides(), pos0.strides(), pos1.strides(),
                 offGrid);
    } else if (method == AF_INTERP_LINEAR) {
        approx2_<Ty, Tp, AF_INTERP_LINEAR>
                (result->get(), result->dims(), result->elements(),
                 in.get(), in.dims(), in.elements(),
                 pos0.get(), pos0.dims(), pos1.get(), pos1.dims(),
                 result->strides(), in.strides(), pos0.strides(), pos1.strides(),
                 offGrid);
    }
    // Any other method: nothing is computed.

    return result;
}
/// Runs an LU factorization on `in` via getrf and writes the factors back
/// into `in`.
///
/// The matrix is staged through pinned host memory, factorized in column-major
/// layout, and copied back. The pivot buffer is returned as an array.
///
/// @param in            Matrix to factorize; overwritten with the getrf output.
/// @param convert_pivot When true, the pivot buffer is passed through
///                      convertPivot before being returned.
/// @return Array of `M` ints built from the pivot buffer (M = in.dims()[0]).
Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
{
    const dim4 inDims = in.dims();
    const int rows = inDims[0];
    const int cols = inDims[1];
    const int pivotLen = min(rows, cols);

    // Host-side staging buffers for the pivots and the matrix contents.
    int *pivotBuf = pinnedAlloc<int>(pivotLen);
    T *matrixBuf = pinnedAlloc<T>(in.elements());
    copyData(matrixBuf, in);

    getrf_func<T>()(AF_LAPACK_COL_MAJOR, rows, cols,
                    matrixBuf, in.strides()[1],
                    pivotBuf);

    if (convert_pivot) {
        // Takes the pointer by address, so it may replace the buffer.
        convertPivot(&pivotBuf, rows, pivotLen);
    }

    writeHostDataArray<T>(in, matrixBuf, in.elements() * sizeof(T));

    // NOTE(review): the array is sized `rows` while the buffer was allocated
    // with min(rows, cols) entries; presumably convertPivot grows it - confirm
    // what happens when convert_pivot is false and rows > cols.
    Array<int> pivotArr = createHostDataArray<int>(af::dim4(rows), pivotBuf);
    pivotArr.eval();

    pinnedFree(matrixBuf);
    pinnedFree(pivotBuf);

    return pivotArr;
}
/// Separable 2D convolution: applies `c_filter` along dimension 0 and then
/// `r_filter` along dimension 1.
///
/// Reports "not supported" through OPENCL_NOT_SUPPORTED when either filter is
/// longer than kernel::MAX_SCONV_FILTER_LEN. When `expand` is set, the
/// intermediate and output shapes grow by (filter length - 1) along the
/// convolved dimensions.
///
/// @param signal   Input array.
/// @param c_filter Filter used in the first (dimension 0) pass.
/// @param r_filter Filter used in the second (dimension 1) pass.
/// @return Result of the second pass.
Array<T> convolve2(Array<T> const& signal, Array<accT> const& c_filter, Array<accT> const& r_filter)
{
    const dim_t cflen = (dim_t)c_filter.elements();
    const dim_t rflen = (dim_t)r_filter.elements();

    const bool colTooLong = cflen > kernel::MAX_SCONV_FILTER_LEN;
    const bool rowTooLong = rflen > kernel::MAX_SCONV_FILTER_LEN;
    if (colTooLong || rowTooLong) {
        // TODO call upon fft
        char errMessage[256];
        snprintf(errMessage, sizeof(errMessage),
                 "\nOpenCL Separable convolution doesn't support %lld(coloumn) "
                 "%lld(row) filters\n", cflen, rflen);
        OPENCL_NOT_SUPPORTED(errMessage);
    }

    const dim4 sDims = signal.dims();
    dim4 passOneDims = sDims;   // shape after the dimension-0 pass
    dim4 resultDims = sDims;    // shape after both passes

    if (expand) {
        passOneDims[0] += cflen - 1;
        resultDims[0] += cflen - 1;
        resultDims[1] += rflen - 1;
    }

    Array<T> passOne = createEmptyArray<T>(passOneDims);
    Array<T> result = createEmptyArray<T>(resultDims);

    kernel::convSep<T, accT, 0, expand>(passOne, signal, c_filter);
    kernel::convSep<T, accT, 1, expand>(result, passOne, r_filter);

    return result;
}
/// Copies `in_` into the region of `out` selected by the first `ndims`
/// sequences of `index`.
///
/// Two paths exist:
///  - vector path: every selected dimension but the last is 1 and `in_` is a
///    vector or scalar. A scalar is tiled to the selection shape, otherwise
///    `in_` is reshaped to it (element counts must match).
///  - general path: the selection shape must equal the shape of `in_` in all
///    four dimensions.
/// A shape mismatch raises AF_ERR_SIZE.
///
/// @param out   Destination array; evaluated before the sub-array is taken.
/// @param ndims Number of entries of `index` that are used.
/// @param index Sequences describing the destination region.
/// @param in_   Source values.
static void assign(Array<Tout> &out, const unsigned &ndims, const af_seq *index, const Array<Tin> &in_)
{
    dim4 const outDs = out.dims();
    dim4 const iDims = in_.dims();

    DIM_ASSERT(0, (outDs.ndims()>=iDims.ndims()));
    DIM_ASSERT(0, (outDs.ndims()>=(dim_t)ndims));

    out.eval();

    vector<af_seq> seqs(index, index + ndims);
    dim4 selDims = toDims(seqs, outDs);

    // The selection counts as a vector when all dims before the last are 1.
    bool vectorPath = true;
    for (int d = 0; vectorPath && d < (int)selDims.ndims() - 1; d++) {
        vectorPath &= selDims[d] == 1;
    }
    vectorPath &= in_.isVector() || in_.isScalar();

    // Dimensions not covered by an index entry collapse to 1.
    for (dim_t d = ndims; d < (int)in_.ndims(); d++) {
        selDims[d] = 1;
    }

    if (!vectorPath) {
        for (int d = 0; d < 4; d++) {
            if (selDims[d] != iDims[d]) {
                AF_ERROR("Size mismatch between input and output", AF_ERR_SIZE);
            }
        }
        Array<Tout> target = createSubArray<Tout>(out, seqs, false);
        copyArray<Tin , Tout>(target, in_);
        return;
    }

    if (selDims.elements() != (dim_t)in_.elements() && in_.elements() != 1) {
        AF_ERROR("Size mismatch between input and output", AF_ERR_SIZE);
    }

    // If both out and in are vectors of equal elements, reshape in to out dims
    Array<Tin> shaped = in_.elements() == 1 ? tile(in_, selDims)
                                            : modDims(in_, selDims);
    Array<Tout> target = createSubArray<Tout>(out, seqs, false);
    copyArray<Tin , Tout>(target, shaped);
}
void evalMultiple(std::vector<Array<T>*> arrays) { std::vector<Param<T> > outputs; std::vector<JIT::Node *> nodes; for (int i = 0; i < (int)arrays.size(); i++) { Array<T> *array = arrays[i]; if (array->isReady()) { continue; } array->ready = true; array->setId(getActiveDeviceId()); array->data = shared_ptr<T>(memAlloc<T>(array->elements()).release(), memFree<T>); outputs.push_back(*array); nodes.push_back(array->node.get()); } evalNodes(outputs, nodes); for (int i = 0; i < (int)arrays.size(); i++) { Array<T> *array = arrays[i]; if (array->isReady()) continue; // FIXME: Replace the current node in any JIT possible trees with the new BufferNode array->node = bufferNodePtr<T>(); } return; }
/// Variance over all elements of `in`.
///
/// Computes sum((x - mean)^2) and divides it by N when `isbiased` is true, or
/// by N - 1 otherwise, where N is the element count.
///
/// @param in       Handle of the input array (element type inType).
/// @param isbiased Selects the divisor N (true) or N - 1 (false).
/// @return The variance as outType.
static outType varAll(const af_array& in, const bool isbiased)
{
    typedef typename baseOutType<outType>::type weightType;

    Array<inType> inArr = getArray<inType>(in);
    Array<outType> input = cast<outType>(inArr);

    // Broadcast the mean to the input shape so it can be subtracted element-wise.
    Array<outType> meanCnst = createValueArray<outType>(
            input.dims(), mean<inType, weightType, outType>(inArr));

    Array<outType> centered = arithOp<outType, af_sub_t>(input, meanCnst, input.dims());
    Array<outType> squared = arithOp<outType, af_mul_t>(centered, centered, centered.dims());

    return division(reduce_all<af_add_t, outType, outType>(squared),
                    isbiased ? input.elements() : input.elements() - 1);
}
/// Sorts `val` along dimension 0, picking the implementation by batch count.
///
/// With more than 10 batches (elements / dims[0]) the batched sort is called
/// directly; otherwise the iterative kernel is enqueued on the queue.
///
/// @param val         Array sorted in place.
/// @param isAscending Sort direction.
void sort0(Array<T>& val, bool isAscending)
{
    const int batchCount = val.elements() / val.dims()[0];

    // TODO Make a better heurisitic
    if (batchCount <= 10) {
        getQueue().enqueue(kernel::sort0Iterative<T>, val, isAscending);
        return;
    }

    sortBatched<T, 0>(val, isAscending);
}
unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out, const Array<T> &in, const unsigned radius, const float diff_thr, const float geom_thr, const float feature_ratio, const unsigned edge) { dim4 idims = in.dims(); const unsigned corner_lim = in.elements() * feature_ratio; cl::Buffer* x_corners = bufferAlloc(corner_lim * sizeof(float)); cl::Buffer* y_corners = bufferAlloc(corner_lim * sizeof(float)); cl::Buffer* resp_corners = bufferAlloc(corner_lim * sizeof(float)); cl::Buffer* resp = bufferAlloc(in.elements()*sizeof(float)); switch(radius) { case 1: kernel::susan<T, 1>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break; case 2: kernel::susan<T, 2>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break; case 3: kernel::susan<T, 3>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break; case 4: kernel::susan<T, 4>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break; case 5: kernel::susan<T, 5>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break; case 6: kernel::susan<T, 6>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break; case 7: kernel::susan<T, 7>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break; case 8: kernel::susan<T, 8>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break; case 9: kernel::susan<T, 9>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break; } unsigned corners_found = kernel::nonMaximal<T>(x_corners, y_corners, resp_corners, idims[0], idims[1], resp, edge, corner_lim); bufferFree(resp); const unsigned corners_out = std::min(corners_found, corner_lim); if (corners_out == 0) { bufferFree(x_corners); bufferFree(y_corners); bufferFree(resp_corners); x_out = createEmptyArray<float>(dim4()); y_out = createEmptyArray<float>(dim4()); 
resp_out = createEmptyArray<float>(dim4()); return 0; } else { x_out = createDeviceDataArray<float>(dim4(corners_out), (void*)((*x_corners)())); y_out = createDeviceDataArray<float>(dim4(corners_out), (void*)((*y_corners)())); resp_out = createDeviceDataArray<float>(dim4(corners_out), (void*)((*resp_corners)())); return corners_out; } }
/// Copies the contents of `from` into the buffer `to`.
///
/// An array that owns its data is copied with one memcpy of
/// `elements() * sizeof(T)` bytes; any other array goes through stridedCopy
/// using its dims and strides.
///
/// @param to   Destination buffer; must hold at least from.elements() values.
/// @param from Source array.
void copyData(T *to, const Array<T> &from)
{
    if (!from.isOwner()) {
        stridedCopy<T>(to, from.get(), from.dims(), from.strides(), from.ndims() - 1);
        return;
    }

    // FIXME: Check for errors / exceptions
    memcpy(to, from.get(), from.elements()*sizeof(T));
}
/// Copies `in` into the region of `out` selected by `seqs`.
///
/// Does nothing when `in` has no elements. Otherwise two paths exist:
///  - vector path: every selected dimension but the last is 1 and `in` is a
///    vector or scalar. A scalar is tiled to the selection shape, otherwise
///    `in` is reshaped to it (element counts must match).
///  - general path: the selection shape must equal the shape of `in` in all
///    AF_MAX_DIMS dimensions.
/// A shape mismatch raises AF_ERR_SIZE.
///
/// @param out  Destination array; evaluated before the sub-array is taken.
/// @param seqs Sequences describing the destination region.
/// @param in   Source values.
static void assign(Array<Tout>& out, const vector<af_seq> seqs, const Array<Tin>& in)
{
    size_t seqCount = seqs.size();
    const dim4& outDs = out.dims();
    const dim4& iDims = in.dims();

    if (iDims.elements() == 0) return;

    out.eval();

    dim4 selDims = toDims(seqs, outDs);

    // The selection counts as a vector when all dims before the last are 1.
    bool vectorPath = true;
    for (int d = 0; vectorPath && d < (int)selDims.ndims() - 1; d++) {
        vectorPath &= selDims[d] == 1;
    }
    vectorPath &= in.isVector() || in.isScalar();

    // Dimensions not covered by a sequence collapse to 1.
    for (dim_t d = seqCount; d < (int)in.ndims(); d++) {
        selDims[d] = 1;
    }

    if (!vectorPath) {
        for (int d = 0; d < AF_MAX_DIMS; d++) {
            if (selDims[d] != iDims[d])
                AF_ERROR("Size mismatch between input and output", AF_ERR_SIZE);
        }
        Array<Tout> target = createSubArray<Tout>(out, seqs, false);
        copyArray<Tin, Tout>(target, in);
        return;
    }

    if (selDims.elements() != (dim_t)in.elements() && in.elements() != 1) {
        AF_ERROR("Size mismatch between input and output", AF_ERR_SIZE);
    }

    // If both out and in are vectors of equal elements,
    // reshape in to out dims
    Array<Tin> shaped = in.elements() == 1 ? tile(in, selDims)
                                           : modDims(in, selDims);
    auto target = createSubArray<Tout>(out, seqs, false);
    copyArray<Tin, Tout>(target, shaped);
}
/// Converts a dense array into a COO sparse array.
///
/// The linear indices of the non-zero elements are obtained with where(),
/// split into row and column indices, and used to gather the values from a
/// flattened copy of the input.
///
/// The split uses the number of rows, `in.dims()[0]`: a linear index `i`
/// yields row `i % rows` and column `i / rows`. The earlier code used the
/// non-zero count as the divisor, which only gives correct indices when that
/// count happens to equal the row count.
///
/// @param in Dense input; evaluated before use.
/// @return Sparse array with the dims of `in` and AF_STORAGE_COO storage.
SparseArray<T> sparseConvertDenseToCOO(const Array<T> &in)
{
    in.eval();

    Array<uint> nonZeroIdx_ = where<T>(in);
    Array<int> nonZeroIdx = cast<int, uint>(nonZeroIdx_);

    const dim_t nNZ = nonZeroIdx.elements();
    const dim_t nRows = in.dims()[0];

    // One copy of the row count per non-zero, for element-wise mod / div.
    Array<int> constDim = createValueArray<int>(dim4(nNZ), nRows);
    constDim.eval();

    Array<int> rowIdx = arithOp<int, af_mod_t>(nonZeroIdx, constDim, nonZeroIdx.dims());
    Array<int> colIdx = arithOp<int, af_div_t>(nonZeroIdx, constDim, nonZeroIdx.dims());

    // Flatten a copy of the input and pick the non-zero values by linear index.
    Array<T> values = copyArray<T>(in);
    values.modDims(dim4(values.elements()));
    values = lookup<T, int>(values, nonZeroIdx, 0);

    return createArrayDataSparseArray<T>(in.dims(), values, rowIdx, colIdx, AF_STORAGE_COO);
}
unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out, const Array<T> &in, const unsigned radius, const float diff_thr, const float geom_thr, const float feature_ratio, const unsigned edge) { in.eval(); dim4 idims = in.dims(); const unsigned corner_lim = in.elements() * feature_ratio; auto x_corners = createEmptyArray<float>(dim4(corner_lim)); auto y_corners = createEmptyArray<float>(dim4(corner_lim)); auto resp_corners = createEmptyArray<float>(dim4(corner_lim)); auto response = createEmptyArray<T>(dim4(in.elements())); auto corners_found= std::shared_ptr<unsigned>(memAlloc<unsigned>(1).release(), memFree<unsigned>); corners_found.get()[0] = 0; getQueue().enqueue(kernel::susan_responses<T>, response, in, idims[0], idims[1], radius, diff_thr, geom_thr, edge); getQueue().enqueue(kernel::non_maximal<T>, x_corners, y_corners, resp_corners, corners_found, idims[0], idims[1], response, edge, corner_lim); getQueue().sync(); const unsigned corners_out = min((corners_found.get())[0], corner_lim); if (corners_out == 0) { x_out = createEmptyArray<float>(dim4()); y_out = createEmptyArray<float>(dim4()); resp_out = createEmptyArray<float>(dim4()); return 0; } else { x_out = x_corners; y_out = y_corners; resp_out = resp_corners; x_out.resetDims(dim4(corners_out)); y_out.resetDims(dim4(corners_out)); resp_out.resetDims(dim4(corners_out)); return corners_out; } }
/* Prepares a Forge histogram for the frequency array `in`.
 *
 * The histogram has one bin per input element, is coloured orange, and gets
 * axis limits from minval/maxval (x) and from 0 to the largest frequency (y).
 * The input data is then copied into it.
 *
 * in     - handle of the histogram (frequency) array
 * minval - smallest data value, used for the x axis
 * maxval - largest data value, used for the x axis
 * returns the histogram object obtained from the ForgeManager
 */
fg::Histogram* setup_histogram(const af_array in, const double minval, const double maxval)
{
    Array<T> freqs = getArray<T>(in);
    dim_t binCount = freqs.elements();

    T largestFreq = detail::reduce_all<af_max_t, T, T>(freqs);

    /* retrieve Forge Histogram with binCount bins and the array's GL type */
    ForgeManager& manager = ForgeManager::getInstance();
    fg::Histogram* histogram = manager.getHistogram(binCount, getGLType<T>());

    /* set histogram bar colors to orange */
    histogram->setBarColor(0.929f, 0.486f, 0.2745f);

    /* limits are passed as (x max, x min, y max, y min); the y axis runs from
     * 0 to the largest frequency */
    histogram->setAxesLimits(maxval, minval, double(largestFreq), 0.0f);
    histogram->setAxesTitles("Bins", "Frequency");

    copy_histogram<T>(freqs, histogram);

    return histogram;
}
/// Gathers entries of `input` along dimension `dim` using `indices`.
///
/// The output has the shape of `input`, except that dimension `dim` has
/// `indices.elements()` entries. Kernels exist for dim 0..3; for any other
/// value no kernel is launched and the output is returned as created.
///
/// @param input   Array to gather from.
/// @param indices Index array.
/// @param dim     Dimension along which to gather.
/// @return The gathered array.
Array<in_t> lookup(const Array<in_t> &input, const Array<idx_t> &indices, const unsigned dim)
{
    const dim4 inDims = input.dims();

    dim4 outDims(1);
    for (int d = 0; d < 4; ++d) {
        if (d == int(dim)) {
            outDims[d] = indices.elements();
        } else {
            outDims[d] = inDims[d];
        }
    }

    Array<in_t> result = createEmptyArray<in_t>(outDims);

    dim_t rank = inDims.ndims();

    // `dim` is a template argument of the kernel, so dispatch it explicitly.
    switch(dim) {
        case 0: kernel::lookup<in_t, idx_t, 0>(result, input, indices, rank); break;
        case 1: kernel::lookup<in_t, idx_t, 1>(result, input, indices, rank); break;
        case 2: kernel::lookup<in_t, idx_t, 2>(result, input, indices, rank); break;
        case 3: kernel::lookup<in_t, idx_t, 3>(result, input, indices, rank); break;
    }

    return result;
}
/// Correlation coefficient of X and Y, computed from running sums:
///
///   (n*Sxy - Sx*Sy) / (sqrt(n*Sxx - Sx^2) * sqrt(n*Syy - Sy^2))
///
/// where n is the element count of X.
///
/// @param X Handle of the first array (element type Ti).
/// @param Y Handle of the second array (element type Ti).
/// @return The coefficient as To.
static To corrcoef(const af_array& X, const af_array& Y)
{
    Array<To> xVals = cast<To>(getArray<Ti>(X));
    Array<To> yVals = cast<To>(getArray<Ti>(Y));

    dim4 shape = xVals.dims();
    dim_t n = xVals.elements();

    To sumX = detail::reduce_all<af_add_t, To, To>(xVals);
    To sumY = detail::reduce_all<af_add_t, To, To>(yVals);

    // Element-wise products feeding the second-order sums.
    Array<To> xx = detail::arithOp<To, af_mul_t>(xVals, xVals, shape);
    Array<To> yy = detail::arithOp<To, af_mul_t>(yVals, yVals, shape);
    Array<To> xy = detail::arithOp<To, af_mul_t>(xVals, yVals, shape);

    To sumXX = detail::reduce_all<af_add_t, To, To>(xx);
    To sumYY = detail::reduce_all<af_add_t, To, To>(yy);
    To sumXY = detail::reduce_all<af_add_t, To, To>(xy);

    return (n*sumXY - sumX*sumY)/(sqrt(n*sumXX-sumX*sumX)*sqrt(n*sumYY-sumY*sumY));
}
/// Returns the linear indices of all non-zero elements of `in`.
///
/// Elements are visited with dimension 0 varying fastest. The reported index
/// is the visiting position, not the strided memory offset.
///
/// @param in Input array.
/// @return 1D array holding one index per non-zero element.
Array<uint> where(const Array<T> &in)
{
    const dim_t *shape = in.dims().get();
    const dim_t *stride = in.strides().get();
    static const T zero = scalar<T>(0);

    const T *data = in.get();

    // Worst case: every element is non-zero.
    uint *found = memAlloc<uint>(in.elements());

    dim_t nFound = 0;
    dim_t linear = 0;
    for (dim_t w = 0; w < shape[3]; w++) {
        uint base3 = w * stride[3];

        for (dim_t z = 0; z < shape[2]; z++) {
            uint base2 = base3 + z * stride[2];

            for (dim_t y = 0; y < shape[1]; y++) {
                uint base1 = y * stride[1] + base2;

                for (dim_t x = 0; x < shape[0]; x++) {
                    T current = data[base1 + x];
                    if (current != zero) {
                        found[nFound] = linear;
                        nFound++;
                    }
                    linear++;
                }
            }
        }
    }

    Array<uint> result = createHostDataArray(dim4(nFound), found);
    memFree<uint>(found);
    return result;
}
/// Histogram equalization of `in` using the histogram `hist`.
///
/// Steps: take the cumulative sum of the histogram, rescale it so that
/// (cdf - min) * (grayLevels - 1) / (max - min), then use the flattened input
/// values as indices into that rescaled cdf. The result is cast to T and
/// reshaped to the input dims.
///
/// @param in   Handle of the input array (element type T).
/// @param hist Handle of the histogram (element type hType).
/// @return Handle of the equalized array.
static af_array hist_equal(const af_array& in, const af_array& hist)
{
    const Array<T> input = getArray<T>(in);

    // Flattened view of the input, used as the lookup index array.
    af_array flatInput = 0;
    AF_CHECK(af_flat(&flatInput, in));

    Array<float> histF = cast<float>(getArray<hType>(hist));

    dim4 histDims = histF.dims();
    dim_t grayLevels = histF.elements();

    Array<float> cdf = scan<af_add_t, float, float>(histF, 0);

    float cdfMin = reduce_all<af_min_t, float, float>(cdf);
    float cdfMax = reduce_all<af_max_t, float, float>(cdf);
    float scale = (float)(grayLevels-1)/(cdfMax - cdfMin);

    // constant arrays holding the cdf minimum and the scale factor
    Array<float> minArr = createValueArray<float>(histDims, cdfMin);
    Array<float> scaleArr = createValueArray<float>(histDims, scale);

    // (cdf(i) - min) * scale for all elements
    Array<float> shifted = arithOp<float, af_sub_t>(cdf, minArr, histDims);
    Array<float> scaledCdf = arithOp<float, af_mul_t>(shifted, scaleArr, histDims);

    // index the rescaled cdf with the input values
    Array<float> mapped = lookup<float, T>(scaledCdf, getArray<T>(flatInput), 0);

    Array<T> result = cast<T>(mapped);
    result.modDims(input.dims());

    AF_CHECK(af_release_array(flatInput));

    return getHandle<T>(result);
}
/// Mean over all elements described by `in`.
///
/// Inputs of up to 4096 elements are read back to the host and accumulated
/// there with MeanOp, each element weighted 1. Larger inputs are first reduced
/// by mean_first_launcher into per-group partial means and counts, which are
/// then read back and merged on the host.
///
/// @param in Parameter block (buffer, dims, strides, offset) of the input.
/// @return The running mean held by the final MeanOp.
To mean_all(Param in)
{
    int in_elements = in.info.dims[0] * in.info.dims[1] * in.info.dims[2] * in.info.dims[3];

    // FIXME: Use better heuristics to get to the optimum number
    if (in_elements <= 4096) {
        vector<Ti> hostIn(in_elements);

        getQueue().enqueueReadBuffer(*in.data, CL_TRUE, sizeof(Ti) * in.info.offset,
                                     sizeof(Ti) * in_elements, hostIn.data());

        //TODO : MeanOp with (Tw)1
        Transform<Ti, To, af_add_t> transform;
        Transform<uint, Tw, af_add_t> transform_weight;
        MeanOp<To, Tw> Op(transform(hostIn[0]), transform_weight(1));
        for (int i = 1; i < (int)in_elements; i++) {
            Op(transform(hostIn[i]), transform_weight(1));
        }

        return Op.runningMean;
    }

    // Check whether the strides describe densely packed data.
    bool is_linear = (in.info.strides[0] == 1);
    for (int k = 1; k < 4; k++) {
        is_linear &= (in.info.strides[k] == (in.info.strides[k - 1] * in.info.dims[k - 1]));
    }

    if (is_linear) {
        // Treat packed data as one long vector.
        in.info.dims[0] = in_elements;
        for (int k = 1; k < 4; k++) {
            in.info.dims[k] = 1;
            in.info.strides[k] = in_elements;
        }
    }

    uint threads_x = nextpow2(std::max(32u, (uint)in.info.dims[0]));
    threads_x = std::min(threads_x, THREADS_PER_GROUP);
    uint threads_y = THREADS_PER_GROUP / threads_x;

    uint groups_x = divup(in.info.dims[0], threads_x * REPEAT);
    uint groups_y = divup(in.info.dims[1], threads_y);

    Array<To> partialMeans = createEmptyArray<To>(groups_x);
    Array<Tw> partialCounts = createEmptyArray<Tw>(groups_x);

    Param iWt;  // no input weights
    mean_first_launcher<Ti, Tw, To>(partialMeans, partialCounts, in, iWt,
                                    threads_x, groups_x, groups_y);

    vector<To> hostMeans(partialMeans.elements());
    vector<Tw> hostCounts(partialMeans.elements());

    getQueue().enqueueReadBuffer(*partialMeans.get(), CL_TRUE, 0,
                                 sizeof(To) * partialMeans.elements(), hostMeans.data());
    getQueue().enqueueReadBuffer(*partialCounts.get(), CL_TRUE, 0,
                                 sizeof(Tw) * partialCounts.elements(), hostCounts.data());

    // Merge the partial results on the host.
    MeanOp<To, Tw> Op(hostMeans[0], hostCounts[0]);
    for (int i = 1; i < (int)hostMeans.size(); i++) {
        Op(hostMeans[i], hostCounts[i]);
    }

    return Op.runningMean;
}
/// Computes a template-matching disparity map for every 2D slice of `sImg`.
///
/// For each pixel of each slice, the window starting at that pixel is compared
/// against `tImg` with the metric selected by the template parameter `mType`.
/// Window samples outside the slice count as 0. AF_NCC, AF_ZNCC and AF_SHD are
/// not implemented and leave the disparity at 0.
///
/// Each slice is addressed from the base pointers by its (b2, b3) index.
/// The earlier running-pointer scheme added stride[2] per inner iteration and
/// stride[3] per outer iteration on top of that, so for sDims[3] > 1 every
/// batch after the first started past its real offset.
///
/// @param sImg Search image(s); dims 2 and 3 are treated as batches.
/// @param tImg Template; only its first two dimensions are traversed.
/// @return Disparity array with the dims of `sImg`.
Array<outType> match_template(const Array<inType> &sImg, const Array<inType> &tImg)
{
    const dim4 sDims = sImg.dims();
    const dim4 tDims = tImg.dims();
    const dim4 sStrides = sImg.strides();
    const dim4 tStrides = tImg.strides();

    const dim_t tDim0 = tDims[0];
    const dim_t tDim1 = tDims[1];
    const dim_t sDim0 = sDims[0];
    const dim_t sDim1 = sDims[1];

    Array<outType> out = createEmptyArray<outType>(sDims);
    const dim4 oStrides = out.strides();

    outType tImgMean = outType(0);
    dim_t winNumElements = tImg.elements();
    bool needMean = mType==AF_ZSAD || mType==AF_LSAD ||
                    mType==AF_ZSSD || mType==AF_LSSD ||
                    mType==AF_ZNCC;
    const inType * tpl = tImg.get();

    // Mean of the template, needed by the zero-mean / locally scaled metrics.
    if (needMean) {
        for(dim_t tj=0; tj<tDim1; tj++) {
            dim_t tjStride = tj*tStrides[1];

            for(dim_t ti=0; ti<tDim0; ti++) {
                tImgMean += (outType)tpl[tjStride+ti*tStrides[0]];
            }
        }
        tImgMean /= winNumElements;
    }

    outType * const dstBase = out.get();
    const inType * const srcBase = sImg.get();

    for(dim_t b3=0; b3<sDims[3]; ++b3) {
        for(dim_t b2=0; b2<sDims[2]; ++b2) {

            // Start of the current slice in the input and the output.
            const inType * src = srcBase + b3*sStrides[3] + b2*sStrides[2];
            outType * dst = dstBase + b3*oStrides[3] + b2*oStrides[2];

            // slide through image window after window
            for(dim_t sj=0; sj<sDim1; sj++) {

                dim_t ojStride = sj*oStrides[1];

                for(dim_t si=0; si<sDim0; si++) {
                    outType disparity = outType(0);

                    // mean for window
                    // this variable will be used based on mType value
                    outType wImgMean = outType(0);
                    if (needMean) {
                        for(dim_t tj=0,j=sj; tj<tDim1; tj++, j++) {
                            dim_t jStride = j*sStrides[1];

                            for(dim_t ti=0, i=si; ti<tDim0; ti++, i++) {
                                inType sVal = ((j<sDim1 && i<sDim0) ?
                                        src[jStride + i*sStrides[0]] : inType(0));
                                wImgMean += (outType)sVal;
                            }
                        }
                        wImgMean /= winNumElements;
                    }

                    // run the window match metric
                    for(dim_t tj=0,j=sj; tj<tDim1; tj++, j++) {
                        dim_t jStride = j*sStrides[1];
                        dim_t tjStride = tj*tStrides[1];

                        for(dim_t ti=0, i=si; ti<tDim0; ti++, i++) {
                            inType sVal = ((j<sDim1 && i<sDim0) ?
                                    src[jStride + i*sStrides[0]] : inType(0));
                            inType tVal = tpl[tjStride+ti*tStrides[0]];
                            outType temp;
                            switch(mType) {
                                case AF_SAD:
                                    disparity += fabs((outType)sVal-(outType)tVal);
                                    break;
                                case AF_ZSAD:
                                    disparity += fabs((outType)sVal - wImgMean -
                                                      (outType)tVal + tImgMean);
                                    break;
                                case AF_LSAD:
                                    disparity += fabs((outType)sVal-(wImgMean/tImgMean)*tVal);
                                    break;
                                case AF_SSD:
                                    disparity += ((outType)sVal-(outType)tVal)*((outType)sVal-(outType)tVal);
                                    break;
                                case AF_ZSSD:
                                    temp = ((outType)sVal - wImgMean - (outType)tVal + tImgMean);
                                    disparity += temp*temp;
                                    break;
                                case AF_LSSD:
                                    temp = ((outType)sVal-(wImgMean/tImgMean)*tVal);
                                    disparity += temp*temp;
                                    break;
                                case AF_NCC:
                                    //TODO: future implementation
                                    break;
                                case AF_ZNCC:
                                    //TODO: future implementation
                                    break;
                                case AF_SHD:
                                    //TODO: future implementation
                                    break;
                            }
                        }
                    }
                    // output is just created, hence not doing the
                    // extra multiplication for 0th dim stride
                    dst[ojStride + si] = disparity;
                }
            }
        }
    }

    return out;
}
/// Builds (or reuses) a 3D chart with a surface for the given x/y/z data.
///
/// When `xVals` is a vector, x and y are expanded by tiling so they can be
/// paired with z. All three arrays are then flattened to row vectors and
/// joined along dimension 0 before being copied into the Forge surface.
/// Unless the chart has a manual axes override, its limits are widened
/// (via step_round) to include the data range.
///
/// @param window Window that owns the chart.
/// @param xVals  Handle of the x coordinates.
/// @param yVals  Handle of the y coordinates.
/// @param zVals  Handle of the z values; its first two dims size the surface.
/// @param props  Grid cell; row/col are used when both are greater than -1,
///               otherwise cell (0, 0) is used.
/// @return The chart holding the surface.
forge::Chart* setup_surface(const forge::Window* const window,
                            const af_array xVals, const af_array yVals, const af_array zVals,
                            const af_cell* const props)
{
    Array<T> xIn = getArray<T>(xVals);
    Array<T> yIn = getArray<T>(yVals);
    Array<T> zIn = getArray<T>(zVals);

    const ArrayInfo& Xinfo = getInfo(xVals);
    const ArrayInfo& Yinfo = getInfo(yVals);
    const ArrayInfo& Zinfo = getInfo(zVals);

    af::dim4 X_dims = Xinfo.dims();
    af::dim4 Y_dims = Yinfo.dims();
    af::dim4 Z_dims = Zinfo.dims();

    if(Xinfo.isVector()){
        // x: make it a column vector, then repeat it along dimension 1
        xIn = modDims(xIn, xIn.elements());
        dim4 xRepeat(1, Y_dims[0], 1, 1);
        xIn = tile(xIn, xRepeat);

        // y: make it a row vector, then repeat it along dimension 0
        yIn = modDims(yIn, af::dim4(1, yIn.elements()));
        dim4 yRepeat(X_dims[0], 1, 1, 1);
        yIn = tile(yIn, yRepeat);
    }

    // Flatten xIn, yIn and zIn into row vectors
    dim4 flatDims = dim4(1, zIn.elements());
    xIn = modDims(xIn, flatDims);
    yIn = modDims(yIn, flatDims);
    zIn = modDims(zIn, flatDims);

    // Now join along first dimension, skip reorder
    std::vector<Array<T> > parts{xIn, yIn, zIn};
    Array<T> points = join(0, parts);

    ForgeManager& manager = ForgeManager::getInstance();

    // Get the chart for the current grid position (if any)
    forge::Chart* chart = NULL;
    const bool hasCell = props->col>-1 && props->row>-1;
    if (hasCell) {
        chart = manager.getChart(window, props->row, props->col, FG_CHART_3D);
    } else {
        chart = manager.getChart(window, 0, 0, FG_CHART_3D);
    }

    forge::Surface* surface = manager.getSurface(chart, Z_dims[0], Z_dims[1], getGLType<T>());
    surface->setColor(0.0, 1.0, 0.0, 1.0);

    // If chart axes limits do not have a manual override
    // then compute and set axes limits
    if(!manager.getChartAxesOverride(chart)) {
        float cmin[3], cmax[3];
        T dmin[3], dmax[3];
        chart->getAxesLimits(&cmin[0], &cmax[0], &cmin[1], &cmax[1], &cmin[2], &cmax[2]);

        dmin[0] = reduce_all<af_min_t, T, T>(xIn);
        dmax[0] = reduce_all<af_max_t, T, T>(xIn);
        dmin[1] = reduce_all<af_min_t, T, T>(yIn);
        dmax[1] = reduce_all<af_max_t, T, T>(yIn);
        dmin[2] = reduce_all<af_min_t, T, T>(zIn);
        dmax[2] = reduce_all<af_max_t, T, T>(zIn);

        // All-zero limits mean nothing was set before: take the data range
        // unconditionally. Otherwise only ever widen the existing range.
        const bool noPrevious = cmin[0] == 0 && cmax[0] == 0
                             && cmin[1] == 0 && cmax[1] == 0
                             && cmin[2] == 0 && cmax[2] == 0;

        for (int axis = 0; axis < 3; axis++) {
            if (noPrevious || cmin[axis] > dmin[axis]) {
                cmin[axis] = step_round(dmin[axis], false);
            }
            if (noPrevious || cmax[axis] < dmax[axis]) {
                cmax[axis] = step_round(dmax[axis], true);
            }
        }

        chart->setAxesLimits(cmin[0], cmax[0], cmin[1], cmax[1], cmin[2], cmax[2]);
    }

    copy_surface<T>(points, surface);

    return chart;
}
void sortByKeyBatched(Array<Tk> okey, Array<Tv> oval, const int dim, bool isAscending) { af::dim4 inDims = okey.dims(); af::dim4 tileDims(1); af::dim4 seqDims = inDims; tileDims[dim] = inDims[dim]; seqDims[dim] = 1; uint* key = memAlloc<uint>(inDims.elements()); // IOTA { af::dim4 dims = inDims; uint* out = key; af::dim4 strides(1); for(int i = 1; i < 4; i++) strides[i] = strides[i-1] * dims[i-1]; for(dim_t w = 0; w < dims[3]; w++) { dim_t offW = w * strides[3]; uint okeyW = (w % seqDims[3]) * seqDims[0] * seqDims[1] * seqDims[2]; for(dim_t z = 0; z < dims[2]; z++) { dim_t offWZ = offW + z * strides[2]; uint okeyZ = okeyW + (z % seqDims[2]) * seqDims[0] * seqDims[1]; for(dim_t y = 0; y < dims[1]; y++) { dim_t offWZY = offWZ + y * strides[1]; uint okeyY = okeyZ + (y % seqDims[1]) * seqDims[0]; for(dim_t x = 0; x < dims[0]; x++) { dim_t id = offWZY + x; out[id] = okeyY + (x % seqDims[0]); } } } } } // initialize original index locations Tk *okey_ptr = okey.get(); Tv *oval_ptr = oval.get(); typedef KeyIndexPair<Tk, Tv> CurrentTuple; size_t size = okey.elements(); size_t bytes = okey.elements() * sizeof(CurrentTuple); CurrentTuple *tupleKeyValIdx = (CurrentTuple *)memAlloc<char>(bytes); for(unsigned i = 0; i < size; i++) { tupleKeyValIdx[i] = std::make_tuple(okey_ptr[i], oval_ptr[i], key[i]); } memFree(key); // key is no longer required if(isAscending) { std::stable_sort(tupleKeyValIdx, tupleKeyValIdx + size, KIPCompareV<Tk, Tv, true>()); } else { std::stable_sort(tupleKeyValIdx, tupleKeyValIdx + size, KIPCompareV<Tk, Tv, false>()); } std::stable_sort(tupleKeyValIdx, tupleKeyValIdx + size, KIPCompareK<Tk, Tv, true>()); for(unsigned x = 0; x < okey.elements(); x++) { okey_ptr[x] = std::get<0>(tupleKeyValIdx[x]); oval_ptr[x] = std::get<1>(tupleKeyValIdx[x]); } memFree((char *)tupleKeyValIdx); return; }
unsigned harris(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out, const Array<T> &in, const unsigned max_corners, const float min_response, const float sigma, const unsigned filter_len, const float k_thr) { dim4 idims = in.dims(); // Window filter convAccT* h_filter = memAlloc<convAccT>(filter_len); // Decide between rectangular or circular filter if (sigma < 0.5f) { for (unsigned i = 0; i < filter_len; i++) h_filter[i] = (T)1.f / (filter_len); } else { gaussian1D<convAccT>(h_filter, (int)filter_len, sigma); } Array<convAccT> filter = createDeviceDataArray<convAccT>(dim4(filter_len), (const void*)h_filter); unsigned border_len = filter_len / 2 + 1; Array<T> ix = createEmptyArray<T>(idims); Array<T> iy = createEmptyArray<T>(idims); // Compute first order derivatives gradient<T>(iy, ix, in); Array<T> ixx = createEmptyArray<T>(idims); Array<T> ixy = createEmptyArray<T>(idims); Array<T> iyy = createEmptyArray<T>(idims); // Compute second-order derivatives second_order_deriv<T>(ixx.get(), ixy.get(), iyy.get(), in.elements(), ix.get(), iy.get()); // Convolve second-order derivatives with proper window filter ixx = convolve2<T, convAccT, false>(ixx, filter, filter); ixy = convolve2<T, convAccT, false>(ixy, filter, filter); iyy = convolve2<T, convAccT, false>(iyy, filter, filter); const unsigned corner_lim = in.elements() * 0.2f; float* x_corners = memAlloc<float>(corner_lim); float* y_corners = memAlloc<float>(corner_lim); float* resp_corners = memAlloc<float>(corner_lim); T* resp = memAlloc<T>(in.elements()); // Calculate Harris responses for all pixels harris_responses<T>(resp, idims[0], idims[1], ixx.get(), ixy.get(), iyy.get(), k_thr, border_len); const unsigned min_r = (max_corners > 0) ? 
0.f : min_response; unsigned corners_found = 0; // Performs non-maximal suppression non_maximal<T>(x_corners, y_corners, resp_corners, &corners_found, idims[0], idims[1], resp, min_r, border_len, corner_lim); memFree(resp); const unsigned corners_out = (max_corners > 0) ? min(corners_found, max_corners) : min(corners_found, corner_lim); if (corners_out == 0) return 0; if (max_corners > 0 && corners_found > corners_out) { Array<float> harris_responses = createDeviceDataArray<float>(dim4(corners_found), (void*)resp_corners); Array<float> harris_sorted = createEmptyArray<float>(dim4(corners_found)); Array<unsigned> harris_idx = createEmptyArray<unsigned>(dim4(corners_found)); // Sort Harris responses sort_index<float, false>(harris_sorted, harris_idx, harris_responses, 0); x_out = createEmptyArray<float>(dim4(corners_out)); y_out = createEmptyArray<float>(dim4(corners_out)); resp_out = createEmptyArray<float>(dim4(corners_out)); // Keep only the corners with higher Harris responses keep_corners(x_out.get(), y_out.get(), resp_out.get(), x_corners, y_corners, harris_sorted.get(), harris_idx.get(), corners_out); memFree(x_corners); memFree(y_corners); } else if (max_corners == 0 && corners_found < corner_lim) { x_out = createEmptyArray<float>(dim4(corners_out)); y_out = createEmptyArray<float>(dim4(corners_out)); resp_out = createEmptyArray<float>(dim4(corners_out)); memcpy(x_out.get(), x_corners, corners_out * sizeof(float)); memcpy(y_out.get(), y_corners, corners_out * sizeof(float)); memcpy(resp_out.get(), resp_corners, corners_out * sizeof(float)); memFree(x_corners); memFree(y_corners); memFree(resp_corners); } else { x_out = createDeviceDataArray<float>(dim4(corners_out), (void*)x_corners); y_out = createDeviceDataArray<float>(dim4(corners_out), (void*)y_corners); resp_out = createDeviceDataArray<float>(dim4(corners_out), (void*)resp_corners); } return corners_out; }
/// Weighted mean over all elements of `in`, with weights from `inWeight`.
///
/// Inputs of up to 4096 elements are read back to the host together with
/// their weights and accumulated there with MeanOp. Larger inputs are first
/// reduced by mean_first_launcher into per-group partial means and weights,
/// which are then read back and merged on the host.
///
/// Changes against the earlier version:
///  - the linearity check for the weights now starts from the weight strides
///    (it used to test `in.info.strides[0]` twice);
///  - when both buffers are flattened, only dims and strides of the weights
///    are rewritten, so the weight buffer keeps its own offset instead of
///    inheriting the one of `in`.
///
/// @param in       Parameter block of the values.
/// @param inWeight Parameter block of the weights; expected to cover the same
///                 number of elements as `in`.
/// @return The running mean held by the final MeanOp.
T mean_all_weighted(Param in, Param inWeight)
{
    int in_elements = in.info.dims[0] * in.info.dims[1] * in.info.dims[2] * in.info.dims[3];

    // FIXME: Use better heuristics to get to the optimum number
    if (in_elements > 4096) {

        bool in_is_linear = (in.info.strides[0] == 1);
        bool wt_is_linear = (inWeight.info.strides[0] == 1);
        for (int k = 1; k < 4; k++) {
            in_is_linear &= (      in.info.strides[k] == (      in.info.strides[k - 1] *       in.info.dims[k - 1]));
            wt_is_linear &= (inWeight.info.strides[k] == (inWeight.info.strides[k - 1] * inWeight.info.dims[k - 1]));
        }

        if (in_is_linear && wt_is_linear) {
            // Both buffers are densely packed: treat them as long vectors.
            in.info.dims[0] = in_elements;
            inWeight.info.dims[0] = in_elements;
            inWeight.info.strides[0] = in.info.strides[0];
            for (int k = 1; k < 4; k++) {
                in.info.dims[k] = 1;
                in.info.strides[k] = in_elements;
                inWeight.info.dims[k] = 1;
                inWeight.info.strides[k] = in_elements;
            }
        }

        uint threads_x = nextpow2(std::max(32u, (uint)in.info.dims[0]));
        threads_x = std::min(threads_x, THREADS_PER_GROUP);
        uint threads_y = THREADS_PER_GROUP / threads_x;

        uint groups_x = divup(in.info.dims[0], threads_x * REPEAT);
        uint groups_y = divup(in.info.dims[1], threads_y);

        Array<T> tmpOut = createEmptyArray<T>(groups_x);
        Array<Tw> tmpWeight = createEmptyArray<Tw>(groups_x);

        mean_first_launcher<T, Tw, T>(tmpOut, tmpWeight, in, inWeight, threads_x, groups_x, groups_y);

        vector<T> h_ptr(tmpOut.elements());
        vector<Tw> h_wptr(tmpWeight.elements());

        getQueue().enqueueReadBuffer(*tmpOut.get(), CL_TRUE, 0, sizeof(T) * tmpOut.elements(), h_ptr.data());
        getQueue().enqueueReadBuffer(*tmpWeight.get(), CL_TRUE, 0, sizeof(Tw) * tmpWeight.elements(), h_wptr.data());

        // Merge the partial results on the host.
        MeanOp<T, Tw> Op(h_ptr[0], h_wptr[0]);
        for (int i = 1; i < (int)tmpOut.elements(); i++) {
            Op(h_ptr[i], h_wptr[i]);
        }

        return Op.runningMean;

    } else {

        vector<T> h_ptr(in_elements);
        vector<Tw> h_wptr(in_elements);

        getQueue().enqueueReadBuffer(*in.data, CL_TRUE, sizeof(T) * in.info.offset,
                                     sizeof(T) * in_elements, h_ptr.data());
        getQueue().enqueueReadBuffer(*inWeight.data, CL_TRUE, sizeof(Tw) * inWeight.info.offset,
                                     sizeof(Tw) * in_elements, h_wptr.data());

        MeanOp<T, Tw> Op(h_ptr[0], h_wptr[0]);
        for (int i = 1; i < (int)in_elements; i++) {
            Op(h_ptr[i], h_wptr[i]);
        }

        return Op.runningMean;
    }
}
/// Standard deviation over all elements of `in`.
///
/// Computes sqrt(sum((x - mean)^2) / N), where N is the element count.
///
/// @param in Handle of the input array (element type inType).
/// @return The standard deviation as outType.
static outType stdev(const af_array& in)
{
    Array<inType> source = getArray<inType>(in);
    Array<outType> input = cast<outType>(source);

    // Broadcast the mean to the input shape so it can be subtracted element-wise.
    Array<outType> meanCnst = createValueArray<outType>(input.dims(), mean<inType, outType>(source));

    Array<outType> centered = detail::arithOp<outType, af_sub_t>(input, meanCnst, input.dims());
    Array<outType> squared = detail::arithOp<outType, af_mul_t>(centered, centered, centered.dims());

    outType variance = division(reduce_all<af_add_t, outType, outType>(squared), input.elements());

    return sqrt(variance);
}