CBLAS_TRANSPOSE toCblasTranspose(af_mat_prop opt)
{
    CBLAS_TRANSPOSE out = CblasNoTrans;
    switch(opt) {
        case AF_MAT_NONE   : out = CblasNoTrans;   break;
        case AF_MAT_TRANS  : out = CblasTrans;     break;
        case AF_MAT_CTRANS : out = CblasConjTrans; break;
        default            : AF_ERROR("INVALID af_mat_prop", AF_ERR_ARG);
    }
    return out;
}
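// A minimal usage sketch (illustrative, not part of this file; assumes <cblas.h>):
// toCblasTranspose lets af_mat_prop flags be forwarded directly into a CBLAS call.
// gemm_example and its dimensions are hypothetical, and the leading dimensions
// shown assume AF_MAT_NONE for both operands.
void gemm_example(const float *A, const float *B, float *C,
                  int M, int N, int K,
                  af_mat_prop optLhs, af_mat_prop optRhs)
{
    cblas_sgemm(CblasColMajor,
                toCblasTranspose(optLhs), toCblasTranspose(optRhs),
                M, N, K,
                1.0f, A, M,   // lda = M when A is not transposed
                B, K,         // ldb = K when B is not transposed
                0.0f, C, M);  // ldc = M
}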
template<typename T, af_storage dest, af_storage src>
SparseArray<T> sparseConvertStorageToStorage(const SparseArray<T> &in)
{
    // Dummy function
    // TODO finish this function when support is required
    AF_ERROR("CPU Backend only supports Dense to CSR or COO", AF_ERR_NOT_SUPPORTED);

    in.eval();

    SparseArray<T> converted = createEmptySparseArray<T>(in.dims(), (int)in.getNNZ(), dest);

    return converted;
}
template<typename T, bool isDilation>
Array<T> morph3d(const Array<T> &in, const Array<T> &mask)
{
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1] || mdims[0] != mdims[2])
        AF_ERROR("Only cube masks are supported in opencl morph currently", AF_ERR_SIZE);

    if (mdims[0] > 7)
        AF_ERROR("Only kernels up to 7x7x7 are supported in opencl currently", AF_ERR_SIZE);

    const dim4 dims = in.dims();
    Array<T> out = createEmptyArray<T>(dims);

    switch(mdims[0]) {
        case 3:  kernel::morph3d<T, isDilation, 3>(out, in, mask); break;
        case 5:  kernel::morph3d<T, isDilation, 5>(out, in, mask); break;
        case 7:  kernel::morph3d<T, isDilation, 7>(out, in, mask); break;
        default: kernel::morph3d<T, isDilation, 3>(out, in, mask); break;
    }

    return out;
}
template<typename T, af_interp_type method>
void rotate_(T *out, const T *in, const float theta,
             const af::dim4 &odims, const af::dim4 &idims,
             const af::dim4 &ostrides, const af::dim4 &istrides)
{
    dim_t nimages = idims[2];

    void (*t_fn)(T *, const T *, const float *, const af::dim4 &,
                 const af::dim4 &, const af::dim4 &,
                 const dim_t, const dim_t, const dim_t, const dim_t);

    const float c = cos(-theta), s = sin(-theta);
    float tx, ty;
    {
        const float nx = 0.5 * (idims[0] - 1);
        const float ny = 0.5 * (idims[1] - 1);
        const float mx = 0.5 * (odims[0] - 1);
        const float my = 0.5 * (odims[1] - 1);
        const float sx = (mx * c + my * -s);
        const float sy = (mx * s + my *  c);
        tx = -(sx - nx);
        ty = -(sy - ny);
    }

    const float tmat[6] = {std::round( c * 1000) / 1000.0f,
                           std::round(-s * 1000) / 1000.0f,
                           std::round(tx * 1000) / 1000.0f,
                           std::round( s * 1000) / 1000.0f,
                           std::round( c * 1000) / 1000.0f,
                           std::round(ty * 1000) / 1000.0f};

    switch(method) {
        case AF_INTERP_NEAREST:
            t_fn = &transform_n;
            break;
        case AF_INTERP_BILINEAR:
            t_fn = &transform_b;
            break;
        default:
            AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
            break;
    }

    // Do transform for image
    for (int yy = 0; yy < (int)odims[1]; yy++) {
        for (int xx = 0; xx < (int)odims[0]; xx++) {
            t_fn(out, in, tmat, idims, ostrides, istrides, nimages, 0, xx, yy);
        }
    }
}
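// A minimal sketch (illustrative, not taken from this file) of how a transform_*
// routine would typically apply the row-major 2x3 matrix tmat built above: each
// output pixel (xo, yo) is mapped back into input coordinates before sampling.
// apply_affine and its parameters are hypothetical names.
static inline void apply_affine(const float tmat[6], float xo, float yo,
                                float *xi, float *yi)
{
    *xi = tmat[0] * xo + tmat[1] * yo + tmat[2];  // first row:  [ c, -s, tx]
    *yi = tmat[3] * xo + tmat[4] * yo + tmat[5];  // second row: [ s,  c, ty]
}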
template<typename T, bool isDilation>
Array<T> morph(const Array<T> &in, const Array<T> &mask)
{
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1])
        AF_ERROR("Only square masks are supported in cuda morph currently", AF_ERR_SIZE);

    if (mdims[0] > 19)
        AF_ERROR("Only square kernels up to 19x19 are supported in cuda currently", AF_ERR_SIZE);

    Array<T> out = createEmptyArray<T>(in.dims());

    CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter, mask.get(),
                                       mdims[0] * mdims[1] * sizeof(T), 0,
                                       cudaMemcpyDeviceToDevice,
                                       cuda::getStream(cuda::getActiveDeviceId())));

    if (isDilation)
        kernel::morph<T, true >(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}
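// A minimal sketch of how the isDilation template parameter would typically be
// selected one level up (the wrapper names are illustrative, not taken from this
// file): dilation and erosion share the same kernel launch and differ only in
// the compile-time flag.
template<typename T>
Array<T> dilate(const Array<T> &in, const Array<T> &mask)
{
    return morph<T, true>(in, mask);   // dilation: neighborhood maximum
}

template<typename T>
Array<T> erode(const Array<T> &in, const Array<T> &mask)
{
    return morph<T, false>(in, mask);  // erosion: neighborhood minimum
}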
template<typename T>
T* memAlloc(const size_t &elements)
{
    managerInit();
    T* ptr = NULL;
    size_t alloc_bytes = divup(sizeof(T) * elements, memory_resolution) * memory_resolution;

    if (elements > 0) {
        std::lock_guard<std::mutex> lock(memory_map_mutex);

        // FIXME: Add better checks for garbage collection
        // Perhaps look at total memory available as a metric
        if (memory_map.size() > MAX_BUFFERS || used_bytes >= MAX_BYTES) {
            garbageCollect();
        }

        // Reuse a free buffer of exactly the requested (rounded) size if one exists
        for (mem_iter iter = memory_map.begin(); iter != memory_map.end(); ++iter) {
            mem_info info = iter->second;
            if (info.is_free && !info.is_unlinked && info.bytes == alloc_bytes) {
                iter->second.is_free = false;
                used_bytes += alloc_bytes;
                used_buffers++;
                return (T *)iter->first;
            }
        }

        // No free buffer of matching size was found; allocate a new one
        ptr = (T *)malloc(alloc_bytes);
        if (ptr == NULL) {
            AF_ERROR("Cannot allocate memory", AF_ERR_NO_MEM);
        }

        mem_info info = {false, false, alloc_bytes};
        memory_map[ptr] = info;

        used_bytes += alloc_bytes;
        used_buffers++;
        total_bytes += alloc_bytes;
    }
    return ptr;
}
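// A minimal sketch, assuming divup is the usual ceiling-division helper used
// above to round requests up to a multiple of memory_resolution (the definition
// here is illustrative, not copied from this file):
static inline size_t divup(size_t a, size_t b)
{
    return (a + b - 1) / b;   // smallest integer k such that k * b >= a
}
// e.g. with memory_resolution = 1024, a 10-byte request becomes
// divup(10, 1024) * 1024 = 1024 allocated bytes.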
template<typename T>
void Array<T>::eval()
{
    if (isReady()) return;

    if (getQueue().is_worker())
        AF_ERROR("Array not evaluated", AF_ERR_INTERNAL);

    this->setId(getActiveDeviceId());
    data = shared_ptr<T>(memAlloc<T>(elements()).release(), memFree<T>);

    getQueue().enqueue(kernel::evalArray<T>, *this, this->node);

    // Reset shared_ptr
    this->node = bufferNodePtr<T>();
    ready = true;
}
template<typename T, af_storage stype>
SparseArray<T> sparseConvertDenseToStorage(const Array<T> &in_)
{
    in_.eval();

    // MKL only has dns->csr.
    // CSR <-> CSC is only supported if input is square

    uint nNZ = reduce_all<af_notzero_t, T, uint>(in_);

    SparseArray<T> sparse_ = createEmptySparseArray<T>(in_.dims(), nNZ, AF_STORAGE_CSR);
    sparse_.eval();

    auto func = [=] (SparseArray<T> sparse, const Array<T> in) {
        // Read: https://software.intel.com/en-us/node/520848
        // But job description is incorrect with regards to job[1]
        // 0 implies row major and 1 implies column major
        int j1 = 1, j2 = 0;
        const int job[] = {0, j1, j2, 2, (int)sparse.elements(), 1};

        const int M = in.dims()[0];
        const int N = in.dims()[1];

        int ldd = in.strides()[1];

        int info = 0;

        // Have to mess up all const correctness because MKL dnscsr function
        // is bidirectional and has input/output on all pointers
        Array<T>   &values = sparse.getValues();
        Array<int> &rowIdx = sparse.getRowIdx();
        Array<int> &colIdx = sparse.getColIdx();

        dnscsr_func<T>()(
            job, &M, &N,
            reinterpret_cast<ptr_type<T>>(const_cast<T*>(in.get())), &ldd,
            reinterpret_cast<ptr_type<T>>(values.get()),
            colIdx.get(), rowIdx.get(),
            &info);
    };
    getQueue().enqueue(func, sparse_, in_);

    if (stype == AF_STORAGE_CSR)
        return sparse_;
    else
        AF_ERROR("CPU Backend only supports Dense to CSR or COO", AF_ERR_NOT_SUPPORTED);

    return sparse_;
}
template<typename T>
void sort_index(Array<T> &okey, Array<uint> &oval, const Array<T> &in,
                const uint dim, bool isAscending)
{
    try {
        // okey contains values, oval contains indices
        okey = copyArray<T>(in);
        oval = range<uint>(in.dims(), dim);
        oval.eval();

        switch(dim) {
            case 0:  kernel::sort0ByKey<T, uint>(okey, oval, isAscending); break;
            case 1:
            case 2:
            case 3:  kernel::sortByKeyBatched<T, uint>(okey, oval, dim, isAscending); break;
            default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
        }

        if (dim != 0) {
            af::dim4 preorderDims = okey.dims();
            af::dim4 reorderDims(0, 1, 2, 3);
            reorderDims[dim] = 0;
            preorderDims[0] = okey.dims()[dim];
            for (int i = 1; i <= (int)dim; i++) {
                reorderDims[i - 1] = i;
                preorderDims[i] = okey.dims()[i - 1];
            }

            okey.setDataDims(preorderDims);
            oval.setDataDims(preorderDims);

            okey = reorder<T>(okey, reorderDims);
            oval = reorder<uint>(oval, reorderDims);
        }
    } catch (std::exception &ex) {
        AF_ERROR(ex.what(), AF_ERR_INTERNAL);
    }
}
template<typename Tk, typename Tv, bool isAscending>
void sort_by_key(Array<Tk> &okey, Array<Tv> &oval,
                 const Array<Tk> &ikey, const Array<Tv> &ival, const unsigned dim)
{
    if ((std::is_same<Tk, double>::value || std::is_same<Tk, cdouble>::value) &&
        !isDoubleSupported(getActiveDeviceId())) {
        OPENCL_NOT_SUPPORTED();
    }
    if ((std::is_same<Tv, double>::value || std::is_same<Tv, cdouble>::value) &&
        !isDoubleSupported(getActiveDeviceId())) {
        OPENCL_NOT_SUPPORTED();
    }

    try {
        okey = copyArray<Tk>(ikey);
        oval = copyArray<Tv>(ival);

        switch(dim) {
            case 0:  kernel::sort0_by_key<Tk, Tv, isAscending>(okey, oval); break;
            default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
        }
    } catch (std::exception &ex) {
        AF_ERROR(ex.what(), AF_ERR_INTERNAL);
    }
}
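// A minimal sketch of how a runtime ascending/descending flag would typically
// be mapped onto the compile-time isAscending parameter (the wrapper name and
// signature are illustrative, not taken from this file):
template<typename Tk, typename Tv>
void sort_by_key_dispatch(Array<Tk> &okey, Array<Tv> &oval,
                          const Array<Tk> &ikey, const Array<Tv> &ival,
                          const unsigned dim, bool isAscending)
{
    if (isAscending) sort_by_key<Tk, Tv, true >(okey, oval, ikey, ival, dim);
    else             sort_by_key<Tk, Tv, false>(okey, oval, ikey, ival, dim);
}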
template<typename T>
Array<T> rotate(const Array<T> &in, const float theta, const af::dim4 &odims,
                const af_interp_type method)
{
    Array<T> out = createEmptyArray<T>(odims);

    switch(method) {
        case AF_INTERP_NEAREST:
            kernel::rotate<T, AF_INTERP_NEAREST>(out, in, theta);
            break;
        case AF_INTERP_BILINEAR:
            kernel::rotate<T, AF_INTERP_BILINEAR>(out, in, theta);
            break;
        default:
            AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
            break;
    }

    return out;
}
template<typename T>
Array<T> rotate(const Array<T> &in, const float theta, const af::dim4 &odims,
                const af_interp_type method)
{
    Array<T> out = createEmptyArray<T>(odims);
    const af::dim4 idims = in.dims();

    switch(method) {
        case AF_INTERP_NEAREST:
            rotate_<T, AF_INTERP_NEAREST>(out.get(), in.get(), theta, odims, idims,
                                          out.strides(), in.strides());
            break;
        case AF_INTERP_BILINEAR:
            rotate_<T, AF_INTERP_BILINEAR>(out.get(), in.get(), theta, odims, idims,
                                           out.strides(), in.strides());
            break;
        default:
            AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
            break;
    }

    return out;
}
template<typename T>
Array<T> rotate(const Array<T> &in, const float theta, const af::dim4 &odims,
                const af_interp_type method)
{
    if ((std::is_same<T, double>::value || std::is_same<T, cdouble>::value) &&
        !isDoubleSupported(getActiveDeviceId())) {
        OPENCL_NOT_SUPPORTED();
    }

    Array<T> out = createEmptyArray<T>(odims);

    switch(method) {
        case AF_INTERP_NEAREST:
            kernel::rotate<T, AF_INTERP_NEAREST>(out, in, theta);
            break;
        case AF_INTERP_BILINEAR:
            kernel::rotate<T, AF_INTERP_BILINEAR>(out, in, theta);
            break;
        default:
            AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
            break;
    }

    return out;
}
template<typename T, af_storage stype>
Array<T> sparseConvertStorageToDense(const SparseArray<T> &in_)
{
    in_.eval();

    Array<T> dense_ = createValueArray<T>(in_.dims(), scalar<T>(0));
    dense_.eval();

    auto func = [=] (Array<T> dense, const SparseArray<T> in) {
        Array<T>   values = in.getValues();
        Array<int> rowIdx = in.getRowIdx();
        Array<int> colIdx = in.getColIdx();

        kernel::csr_dense<T>()(dense, values, rowIdx, colIdx);
    };
    getQueue().enqueue(func, dense_, in_);

    if (stype == AF_STORAGE_CSR)
        return dense_;
    else
        AF_ERROR("CPU Backend only supports CSR or COO to Dense", AF_ERR_NOT_SUPPORTED);

    return dense_;
}
template<typename T>
void swapdblk(int n, int nb,
              cl_mem dA, size_t dA_offset, int ldda, int inca,
              cl_mem dB, size_t dB_offset, int lddb, int incb,
              cl_command_queue queue)
{
    std::string refName = std::string("swapdblk_") + std::string(dtype_traits<T>::getName());

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char* ker_strs[] = {swapdblk_cl};
        const int   ker_lens[] = {swapdblk_cl_len};

        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());

        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "swapdblk");

        addKernelToCache(device, refName, entry);
    }

    int nblocks = n / nb;
    if (nblocks == 0) return;

    int info = 0;
    if (n < 0) {
        info = -1;
    } else if (nb < 1 || nb > 1024) {
        info = -2;
    } else if (ldda < (nblocks - 1) * nb * inca + nb) {
        info = -4;
    } else if (inca < 0) {
        info = -5;
    } else if (lddb < (nblocks - 1) * nb * incb + nb) {
        info = -7;
    } else if (incb < 0) {
        info = -8;
    }

    if (info != 0) {
        AF_ERROR("Invalid configuration", AF_ERR_INTERNAL);
        return;
    }

    NDRange local(nb);
    NDRange global(nblocks * nb);

    cl::Buffer dAObj(dA, true);
    cl::Buffer dBObj(dB, true);

    auto swapdOp = KernelFunctor<int,
                                 Buffer, unsigned long long, int, int,
                                 Buffer, unsigned long long, int, int>(*entry.ker);

    cl::CommandQueue q(queue);
    swapdOp(EnqueueArgs(q, global, local),
            nb, dAObj, dA_offset, ldda, inca, dBObj, dB_offset, lddb, incb);
}
template<typename T>
void swapdblk(int n, int nb,
              cl_mem dA, size_t dA_offset, int ldda, int inca,
              cl_mem dB, size_t dB_offset, int lddb, int incb)
{
    static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
    static std::map<int, Program*> swpProgs;
    static std::map<int, Kernel*>  swpKernels;

    int device = getActiveDeviceId();

    std::call_once(compileFlags[device], [device] () {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }

        cl::Program prog;
        buildProgram(prog, swapdblk_cl, swapdblk_cl_len, options.str());
        swpProgs[device] = new Program(prog);

        swpKernels[device] = new Kernel(*swpProgs[device], "swapdblk");
    });

    int nblocks = n / nb;
    if (nblocks == 0) return;

    int info = 0;
    if (n < 0) {
        info = -1;
    } else if (nb < 1 || nb > 1024) {
        info = -2;
    } else if (ldda < (nblocks - 1) * nb * inca + nb) {
        info = -4;
    } else if (inca < 0) {
        info = -5;
    } else if (lddb < (nblocks - 1) * nb * incb + nb) {
        info = -7;
    } else if (incb < 0) {
        info = -8;
    }

    if (info != 0) {
        AF_ERROR("Invalid configuration", AF_ERR_INTERNAL);
        return;
    }

    NDRange local(nb);
    NDRange global(nblocks * nb);

    auto swapdOp = make_kernel<int,
                               cl_mem, unsigned long long, int, int,
                               cl_mem, unsigned long long, int, int>(*swpKernels[device]);

    swapdOp(EnqueueArgs(getQueue(), global, local),
            nb, dA, dA_offset, ldda, inca, dB, dB_offset, lddb, incb);
}
template<typename T>
Array<T> cholesky(int *info, const Array<T> &in, const bool is_upper)
{
    AF_ERROR("Linear Algebra is disabled on OpenCL", AF_ERR_NOT_CONFIGURED);
}
template<typename T>
Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options)
{
    AF_ERROR("Linear Algebra is disabled on OpenCL", AF_ERR_NOT_CONFIGURED);
}
template<typename T>
int cholesky_inplace(Array<T> &in, const bool is_upper)
{
    AF_ERROR("Linear Algebra is disabled on OpenCL", AF_ERR_NOT_CONFIGURED);
}
template<typename T>
Array<T> transform(const Array<T> &in, const Array<float> &transform, const af::dim4 &odims,
                   const af_interp_type method, const bool inverse, const bool perspective)
{
    Array<T> out = createEmptyArray<T>(odims);

    if (inverse) {
        if (perspective) {
            switch(method) {
                case AF_INTERP_NEAREST:
                    kernel::transform<T, true, true, AF_INTERP_NEAREST>(out, in, transform);
                    break;
                case AF_INTERP_BILINEAR:
                    kernel::transform<T, true, true, AF_INTERP_BILINEAR>(out, in, transform);
                    break;
                case AF_INTERP_LOWER:
                    kernel::transform<T, true, true, AF_INTERP_LOWER>(out, in, transform);
                    break;
                default:
                    AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
                    break;
            }
        } else {
            switch(method) {
                case AF_INTERP_NEAREST:
                    kernel::transform<T, true, false, AF_INTERP_NEAREST>(out, in, transform);
                    break;
                case AF_INTERP_BILINEAR:
                    kernel::transform<T, true, false, AF_INTERP_BILINEAR>(out, in, transform);
                    break;
                case AF_INTERP_LOWER:
                    kernel::transform<T, true, false, AF_INTERP_LOWER>(out, in, transform);
                    break;
                default:
                    AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
                    break;
            }
        }
    } else {
        if (perspective) {
            switch(method) {
                case AF_INTERP_NEAREST:
                    kernel::transform<T, false, true, AF_INTERP_NEAREST>(out, in, transform);
                    break;
                case AF_INTERP_BILINEAR:
                    kernel::transform<T, false, true, AF_INTERP_BILINEAR>(out, in, transform);
                    break;
                case AF_INTERP_LOWER:
                    kernel::transform<T, false, true, AF_INTERP_LOWER>(out, in, transform);
                    break;
                default:
                    AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
                    break;
            }
        } else {
            switch(method) {
                case AF_INTERP_NEAREST:
                    kernel::transform<T, false, false, AF_INTERP_NEAREST>(out, in, transform);
                    break;
                case AF_INTERP_BILINEAR:
                    kernel::transform<T, false, false, AF_INTERP_BILINEAR>(out, in, transform);
                    break;
                case AF_INTERP_LOWER:
                    kernel::transform<T, false, false, AF_INTERP_LOWER>(out, in, transform);
                    break;
                default:
                    AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
                    break;
            }
        }
    }

    return out;
}