// Build a JIT node applying the unary operation `op` element-wise to `in`.
// If `outDim` is left at its (-1,-1,-1,-1) sentinel, the input's own shape
// is used for the resulting node array.
Array<T> unaryOp(const Array<T> &in, dim4 outDim = dim4(-1, -1, -1, -1))
{
    jit::Node_ptr child = in.getNode();
    jit::Node_ptr out_node(new jit::UnaryNode<T, T, op>(child));

    const dim4 sentinel(-1, -1, -1, -1);
    if (outDim == sentinel) {
        outDim = in.dims();
    }

    return createNodeArray<T>(outDim, out_node);
}
// In-place batched sort of `val` along the template dimension `dim`.
// Strategy: pair every element with a generated key that identifies its
// position along `dim`, sort by value, then stably re-sort by key so each
// batch's sorted run lands back in its own slot.
void sortBatched(Array<T>& val, bool isAscending)
{
    af::dim4 inDims = val.dims();

    // Sort dimension
    // iota keys: each element's index along `dim`, tiled across other dims.
    af::dim4 tileDims(1);
    af::dim4 seqDims = inDims;
    tileDims[dim] = inDims[dim];
    seqDims[dim] = 1;
    Array<uint> key = iota<uint>(seqDims, tileDims);

    Array<uint> resKey = createEmptyArray<uint>(dim4());
    Array<T> resVal = createEmptyArray<T>(dim4());

    // Flatten to 1D so the underlying sort sees one contiguous sequence.
    val.setDataDims(inDims.elements());
    key.setDataDims(inDims.elements());

    sort_by_key<T, uint>(resVal, resKey, val, key, 0, isAscending);

    // Needs to be ascending (true) in order to maintain the indices properly
    sort_by_key<uint, T>(key, val, resKey, resVal, 0, true);

    val.eval();
    val.setDataDims(inDims); // This is correct only for dim0
}
// Assign `rhs` into the region of `out` selected by the four indexers.
// Sequence indexers are recorded as af_seq values; array indexers are cast
// to uint arrays and evaluated before the kernel is queued.
void assign(Array<T>& out, const af_index_t idxrs[], const Array<T>& rhs)
{
    out.eval();
    rhs.eval();

    vector<bool>          isSeq(4);
    vector<af_seq>        seqs(4, af_span);
    vector< Array<uint> > idxArrs(4, createEmptyArray<uint>(dim4()));

    // Classify each indexer and materialize array indexers up front.
    for (dim_t x = 0; x < 4; ++x) {
        const bool seqIdx = idxrs[x].isSeq;
        isSeq[x] = seqIdx;
        if (seqIdx) {
            seqs[x] = idxrs[x].idx.seq;
        } else {
            idxArrs[x] = castArray<uint>(idxrs[x].idx.arr);
            idxArrs[x].eval();
        }
    }

    getQueue().enqueue(kernel::assign<T>, out, rhs,
                       std::move(isSeq), std::move(seqs), std::move(idxArrs));
}
// Return the unique elements of the (1D) input as a newly allocated array.
// Runs on the device via Boost.Compute; sorts first unless the caller
// guarantees `is_sorted`. Caller owns the returned pointer.
Array<T>* setUnique(const Array<T> &in, const bool is_sorted)
{
    // double/cdouble require fp64 support on the active device.
    if ((std::is_same<T, double>::value || std::is_same<T, cdouble>::value) &&
        !isDoubleSupported(getActiveDeviceId())) {
        OPENCL_NOT_SUPPORTED();
    }

    Array<T> *out = copyArray<T>(in);

    compute::command_queue queue(getQueue()());

    // Wrap the output cl_mem so Boost.Compute algorithms can iterate it.
    compute::buffer out_data((*out->get())());
    compute::buffer_iterator<T> begin(out_data, 0);
    compute::buffer_iterator<T> end(out_data, out->dims()[0]);

    if (!is_sorted) {
        compute::sort(begin, end, queue);
    }

    // unique() compacts in place and returns the new logical end.
    end = compute::unique(begin, end, queue);

    // Shrink the logical dims to the deduplicated length.
    out->resetDims(dim4(std::distance(begin, end), 1, 1, 1));

    return out;
}
// CPU kernel: build a batch of (size x size) matrices whose `num`-th
// diagonal holds the columns of `in`; every off-diagonal entry is zero.
Array<T> diagCreate(const Array<T> &in, const int num)
{
    int size  = in.dims()[0] + std::abs(num);
    int batch = in.dims()[1];
    Array<T> out = createEmptyArray<T>(dim4(size, size, batch));

    const T *iptr = in.get();
    T       *optr = out.get();

    for (int k = 0; k < batch; k++) {
        for (int j = 0; j < size; j++) {
            for (int i = 0; i < size; i++) {
                // (i, j) lies on the requested diagonal iff i == j - num.
                // For a super-diagonal (num > 0) the row index addresses
                // the input vector; for a sub-diagonal, the column index.
                const bool onDiag = (i == j - num);
                optr[i + j * out.strides()[1]] =
                    onDiag ? ((num > 0) ? iptr[i] : iptr[j]) : scalar<T>(0);
            }
        }
        optr += out.strides()[2];
        iptr += in.strides()[1];
    }
    return out;
}
// CPU kernel: extract the `num`-th diagonal of each matrix in `in` as a
// column vector (batched over dims 2 and 3).
Array<T> diagExtract(const Array<T> &in, const int num)
{
    const dim_t *idims = in.dims().get();
    // Length of the num-th diagonal. Use min (not max) of the two leading
    // dimensions so the output size agrees with the sibling backend
    // implementations of diagExtract; with max the output was over-sized
    // and padded with zeros past the end of the actual diagonal.
    dim_t size = std::min(idims[0], idims[1]) - std::abs(num);
    Array<T> out = createEmptyArray<T>(dim4(size, 1, idims[2], idims[3]));
    const dim_t *odims = out.dims().get();

    // Starting offset of the diagonal: a column offset for super-diagonals
    // (num > 0), a row offset for sub-diagonals.
    const int i_off = (num > 0) ? (num * in.strides()[1]) : (-num);

    for (int l = 0; l < (int)odims[3]; l++) {
        for (int k = 0; k < (int)odims[2]; k++) {
            const T *iptr = in.get() + l * in.strides()[3] + k * in.strides()[2] + i_off;
            T *optr = out.get() + l * out.strides()[3] + k * out.strides()[2];

            for (int i = 0; i < (int)odims[0]; i++) {
                // Guard against stepping past either matrix dimension.
                T val = scalar<T>(0);
                if (i < idims[0] && i < idims[1]) val = iptr[i * in.strides()[1] + i];
                optr[i] = val;
            }
        }
    }
    return out;
}
// Construct a device array of shape `dims`, copying `dims.elements()`
// values from the host pointer `in_data`. The new array owns freshly
// allocated memory (released via memFree) and is immediately ready
// (no pending JIT node).
Array<T>::Array(dim4 dims, const T * const in_data) :
    ArrayInfo(getActiveDeviceId(), dims, dim4(0,0,0,0),
              calcStrides(dims), (af_dtype)dtype_traits<T>::af_type),
    data(memAlloc<T>(dims.elements()), memFree<T>),
    data_dims(dims),
    node(), ready(true), offset(0), owner(true)
{
    std::copy(in_data, in_data + dims.elements(), data.get());
}
// Create a batch of square matrices with `in`'s columns placed on the
// `num`-th diagonal; the work is delegated to the device kernel.
Array<T> diagCreate(const Array<T> &in, const int num)
{
    const int  size  = in.dims()[0] + std::abs(num);
    const int  batch = in.dims()[1];
    const dim4 oDims(size, size, batch);

    Array<T> out = createEmptyArray<T>(oDims);
    kernel::diagCreate<T>(out, in, num);

    return out;
}
// Extract the `num`-th diagonal of each matrix in `in` as a column vector,
// delegating to the device kernel.
Array<T> diagExtract(const Array<T> &in, const int num)
{
    const dim_t *idims = in.dims().get();
    const dim_t  size  = std::min(idims[0], idims[1]) - std::abs(num);

    Array<T> out = createEmptyArray<T>(dim4(size, 1, idims[2], idims[3]));
    kernel::diagExtract<T>(out, in, num);

    return out;
}
// Index `in` with up to four sequence or array indexers and return the
// selected sub-array. Array indexers override the corresponding output
// dimension; sequence indexers get a 1-element dummy buffer so the OpenCL
// kernel always receives a valid cl_mem.
Array<T> index(const Array<T>& in, const af_index_t idxrs[])
{
    kernel::IndexKernelParam_t p;

    // Gather sequence indexers to derive output dims, offsets and strides.
    std::vector<af_seq> seqs(4, af_span);
    for (dim_t x = 0; x < 4; ++x) {
        if (idxrs[x].isSeq) { seqs[x] = idxrs[x].idx.seq; }
    }

    // retrieve dimensions, strides and offsets
    dim4 iDims = in.dims();
    dim4 dDims = in.getDataDims();
    dim4 oDims = toDims (seqs, iDims);
    dim4 iOffs = toOffset(seqs, dDims);
    dim4 iStrds= toStride(seqs, dDims);

    for (dim_t i = 0; i < 4; ++i) {
        p.isSeq[i] = idxrs[i].isSeq;
        p.offs[i]  = iOffs[i];
        p.strds[i] = iStrds[i];
    }

    // Evaluate the array indexers first; they also fix the output dims.
    std::vector< Array<uint> > idxArrs(4, createEmptyArray<uint>(dim4()));
    for (dim_t x = 0; x < 4; ++x) {
        if (!p.isSeq[x]) {
            idxArrs[x] = castArray<uint>(idxrs[x].idx.arr);
            oDims[x] = idxArrs[x].elements();
        }
    }

    Array<T> out = createEmptyArray<T>(oDims);
    // Bail out BEFORE any scratch buffers are allocated: previously this
    // early return came after bufferAlloc() and leaked the 1-element
    // buffers created for sequence indexers.
    if (oDims.elements() == 0) { return out; }

    Buffer* bPtrs[4];
    for (dim_t x = 0; x < 4; ++x) {
        if (!p.isSeq[x]) {
            bPtrs[x] = idxArrs[x].get();
        } else {
            // alloc an 1-element buffer to avoid OpenCL from failing
            bPtrs[x] = bufferAlloc(sizeof(uint));
        }
    }

    kernel::index<T>(out, in, p, bPtrs);

    // Release only the dummy buffers we allocated for sequence indexers.
    for (dim_t x = 0; x < 4; ++x) {
        if (p.isSeq[x]) bufferFree(bPtrs[x]);
    }

    return out;
}
// Queue the CPU diagonal-extraction kernel; the output is one
// (size x 1) column per matrix in the 3rd/4th dimensions.
Array<T> diagExtract(const Array<T> &in, const int num)
{
    in.eval();

    const dim4  idims = in.dims();
    const dim_t size  = std::min(idims[0], idims[1]) - std::abs(num);

    Array<T> out = createEmptyArray<T>(dim4(size, 1, idims[2], idims[3]));
    getQueue().enqueue(kernel::diagExtract<T>, out, in, num);

    return out;
}
// Convert a dense array to COO sparse storage: locate the nonzeros, split
// each linear index into (row, col), and gather the nonzero values.
SparseArray<T> sparseConvertDenseToCOO(const Array<T> &in)
{
    in.eval();

    Array<uint> nonZeroIdx_ = where<T>(in);
    Array<int>  nonZeroIdx  = cast<int, uint>(nonZeroIdx_);

    dim_t nNZ = nonZeroIdx.elements();

    // Decompose each column-major linear index using the leading dimension
    // of the dense input: row = idx % nRows, col = idx / nRows.
    // (Previously the divisor array was filled with nNZ itself, which
    // produced bogus row/column indices for any index >= the nonzero
    // count.)
    Array<int> constNNZ = createValueArray<int>(dim4(nNZ), in.dims()[0]);
    constNNZ.eval();

    Array<int> rowIdx = arithOp<int, af_mod_t>(nonZeroIdx, constNNZ, nonZeroIdx.dims());
    Array<int> colIdx = arithOp<int, af_div_t>(nonZeroIdx, constNNZ, nonZeroIdx.dims());

    // Flatten and gather the nonzero values in index order.
    Array<T> values = copyArray<T>(in);
    values.modDims(dim4(values.elements()));
    values = lookup<T, int>(values, nonZeroIdx, 0);

    return createArrayDataSparseArray<T>(in.dims(), values, rowIdx, colIdx,
                                         AF_STORAGE_COO);
}
// Solve A*X = B on the CPU queue. Triangular systems go through
// triangleSolve; square systems use LAPACK gesv (LU with pivoting);
// rectangular systems use gels (least squares).
Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options)
{
    a.eval();
    b.eval();

    if (options & AF_MAT_UPPER || options & AF_MAT_LOWER) {
        return triangleSolve<T>(a, b, options);
    }

    int M = a.dims()[0];
    int N = a.dims()[1];
    int K = b.dims()[1];

    // gesv/gels overwrite their inputs, so work on copies; B is padded to
    // max(M, N) rows so gels can return the solution in place.
    Array<T> A = copyArray<T>(a);
    Array<T> B = padArray<T, T>(b, dim4(max(M, N), K));

    if(M == N) {
        Array<int> pivot = createEmptyArray<int>(dim4(N, 1, 1));
        auto func = [=] (Array<T> A, Array<T> B, Array<int> pivot, int N, int K) {
            gesv_func<T>()(AF_LAPACK_COL_MAJOR, N, K,
                           A.get(), A.strides()[1],
                           pivot.get(),
                           B.get(), B.strides()[1]);
        };
        getQueue().enqueue(func, A, B, pivot, N, K);
    } else {
        auto func = [=] (Array<T> A, Array<T> B, int M, int N, int K) {
            int sM = A.strides()[1];
            int sN = A.strides()[2] / sM;
            gels_func<T>()(AF_LAPACK_COL_MAJOR, 'N',
                           M, N, K,
                           A.get(), A.strides()[1],
                           B.get(), max(sM, sN));
        };
        // NOTE(review): B's logical dims are reset to the solution shape
        // before the queued gels runs; the underlying buffer keeps its
        // padded allocation, so the lambda still has room to work in.
        B.resetDims(dim4(N, K));
        getQueue().enqueue(func, A, B, M, N, K);
    }

    return B;
}
// Intersection of two (1D, sorted-set) inputs on the device via
// Boost.Compute. Inputs are deduplicated first unless the caller promises
// `is_unique`. Caller owns the returned pointer.
Array<T>* setIntersect(const Array<T> &first, const Array<T> &second, const bool is_unique)
{
    // double/cdouble require fp64 support on the active device.
    if ((std::is_same<T, double>::value || std::is_same<T, cdouble>::value) &&
        !isDoubleSupported(getActiveDeviceId())) {
        OPENCL_NOT_SUPPORTED();
    }

    Array<T> unique_first = first;
    Array<T> unique_second = second;

    if (!is_unique) {
        // NOTE(review): setUnique returns a heap pointer that is
        // dereferenced and then dropped here — the Array<T> objects it
        // allocated are never deleted; confirm whether this leak is known.
        unique_first = *setUnique(first, false);
        unique_second = *setUnique(second, false);
    }

    // Allocate for the worst case; dims are trimmed after the merge.
    size_t out_size = std::max(unique_first.dims()[0], unique_second.dims()[0]);
    Array<T> *out = createEmptyArray<T>(dim4(out_size, 1, 1, 1));

    compute::command_queue queue(getQueue()());

    // Wrap the cl_mem handles so Boost.Compute algorithms can iterate them.
    compute::buffer first_data((*unique_first.get())());
    compute::buffer second_data((*unique_second.get())());
    compute::buffer out_data((*out->get())());

    compute::buffer_iterator<T> first_begin(first_data, 0);
    compute::buffer_iterator<T> first_end(first_data, unique_first.dims()[0]);
    compute::buffer_iterator<T> second_begin(second_data, 0);
    compute::buffer_iterator<T> second_end(second_data, unique_second.dims()[0]);
    compute::buffer_iterator<T> out_begin(out_data, 0);

    compute::buffer_iterator<T> out_end = compute::set_intersection(
        first_begin, first_end, second_begin, second_end, out_begin, queue
    );

    // Shrink the logical dims to the actual intersection length.
    out->resetDims(dim4(std::distance(out_begin, out_end), 1, 1, 1));

    return out;
}
// Solve A*X = B synchronously through mapped host pointers. Triangular
// options dispatch to triangleSolve; square systems use LAPACK gesv,
// rectangular ones use gels (least squares).
Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options)
{
    if (options & AF_MAT_UPPER || options & AF_MAT_LOWER) {
        return triangleSolve<T>(a, b, options);
    }

    int M = a.dims()[0];
    int N = a.dims()[1];
    int K = b.dims()[1];

    // LAPACK overwrites its inputs: copy A, and pad B to max(M, N) rows so
    // the least-squares solution fits in place.
    Array<T> A = copyArray<T>(a);
    Array<T> B = padArray<T, T>(b, dim4(max(M, N), K), scalar<T>(0));

    std::shared_ptr<T> aPtr = A.getMappedPtr();
    std::shared_ptr<T> bPtr = B.getMappedPtr();

    if(M == N) {
        std::vector<int> pivot(N);
        gesv_func<T>()(AF_LAPACK_COL_MAJOR, N, K,
                       aPtr.get(), A.strides()[1],
                       &pivot.front(),
                       bPtr.get(), B.strides()[1]);
    } else {
        int sM = a.strides()[1];
        int sN = a.strides()[2] / sM;

        gels_func<T>()(AF_LAPACK_COL_MAJOR, 'N',
                       M, N, K,
                       aPtr.get(), A.strides()[1],
                       bPtr.get(), max(sM, sN));

        // Trim B's logical dims down to the N x K solution.
        B.resetDims(dim4(N, K));
    }

    return B;
}
// In-place LU factorization (LAPACK getrf) of `in`, queued on the CPU
// queue. Returns either the raw LAPACK pivot vector, or — when
// `convert_pivot` is set — an explicit row permutation of length iDims[0].
Array<int> lu_inplace(Array<T> &in, const bool convert_pivot)
{
    in.eval();

    dim4 iDims = in.dims();
    Array<int> pivot = createEmptyArray<int>(af::dim4(min(iDims[0], iDims[1]), 1, 1, 1));

    auto func = [=](Param<T> in, Param<int> pivot) {
        dim4 iDims = in.dims();
        getrf_func<T>()(AF_LAPACK_COL_MAJOR, iDims[0], iDims[1],
                        in.get(), in.strides(1),
                        pivot.get());
    };
    getQueue().enqueue(func, in, pivot);

    if (convert_pivot) {
        // Expand LAPACK's pivot exchanges into an explicit permutation
        // starting from the identity [0, iDims[0]).
        Array<int> p = range<int>(dim4(iDims[0]), 0);
        getQueue().enqueue(kernel::convertPivot, p, pivot);
        return p;
    } else {
        return pivot;
    }
}
// Fetch the four dimensions of `arr` via the C API and package them
// as a dim4.
static dim4 getDims(const af_array arr)
{
    dim_t dims[4];
    AF_THROW(af_get_dims(&dims[0], &dims[1], &dims[2], &dims[3], arr));
    return dim4(dims[0], dims[1], dims[2], dims[3]);
}
// Helper functions dim4 array::dims() const { dim_type d0, d1, d2, d3; AF_THROW(af_get_dims(&d0, &d1, &d2, &d3, arr)); return dim4(d0, d1, d2, d3); }
namespace cuda
{
    // Declaration: generate an array whose values count along each slot of
    // `dim`, optionally tiled `tile_dims` times per axis (definition lives
    // elsewhere in the CUDA backend).
    template<typename T>
    Array<T> iota(const dim4 &dim, const dim4 &tile_dims = dim4(1));
}
// 1D normally-distributed random array (legacy dim_type overload).
array randn(const dim_type d0, af_dtype ty)
{
    const dim4 shape(d0);
    return randn(shape, ty);
}
// Least-squares solve of an over/under-determined system using QR via
// MAGMA + clBLAS on the OpenCL device.
//   M < N: minimum-norm solution through QR of transpose(A).
//   M > N: classic least squares through QR of A.
Array<T> leastSquares(const Array<T> &a, const Array<T> &b)
{
    int M = a.dims()[0];
    int N = a.dims()[1];
    int K = b.dims()[1];
    int MN = std::min(M, N);

    Array<T> B = createEmptyArray<T>(dim4());
    gpu_blas_trsm_func<T> gpu_blas_trsm;

    cl_event event;
    cl_command_queue queue = getQueue()();

    if (M < N) {
#define UNMQR 0 // FIXME: UNMQR == 1 should be faster but does not work

        // Least squares for this case is solved using the following
        // solve(A, B) == matmul(Q, Xpad);
        // Where:
        // Xpad == pad(Xt, N - M, 1);
        // Xt == tri_solve(R1, B);
        // R1 == R(seq(M), seq(M));
        // transpose(A) == matmul(Q, R);

        // QR is performed on the transpose of A
        Array<T> A = transpose<T>(a, true);

#if UNMQR
        B = padArray<T, T>(b, dim4(N, K), scalar<T>(0));
        B.resetDims(dim4(M, K));
#else
        B = copyArray<T>(b);
#endif

        // Workspace sizing follows MAGMA's geqrf blocking factor.
        int NB = magma_get_geqrf_nb<T>(A.dims()[1]);
        int NUM = (2*MN + ((M+31)/32)*32)*NB;
        Array<T> tmp = createEmptyArray<T>(dim4(NUM));

        std::vector<T> h_tau(MN);

        int info = 0;
        cl::Buffer *dA = A.get();
        cl::Buffer *dT = tmp.get();
        cl::Buffer *dB = B.get();

        magma_geqrf3_gpu<T>(A.dims()[0], A.dims()[1],
                            (*dA)(), A.getOffset(), A.strides()[1],
                            &h_tau[0], (*dT)(), tmp.getOffset(),
                            getQueue()(), &info);

        A.resetDims(dim4(M, M));

        // Swap the diagonal blocks out of dT so trsm sees R's diagonal,
        // then swap them back afterwards.
        magmablas_swapdblk<T>(MN-1, NB,
                              (*dA)(), A.getOffset(), A.strides()[1], 1,
                              (*dT)(), tmp.getOffset() + MN * NB, NB, 0, queue);

        CLBLAS_CHECK(gpu_blas_trsm(
                         clblasLeft, clblasUpper, clblasConjTrans, clblasNonUnit,
                         B.dims()[0], B.dims()[1],
                         scalar<T>(1),
                         (*dA)(), A.getOffset(), A.strides()[1],
                         (*dB)(), B.getOffset(), B.strides()[1],
                         1, &queue, 0, nullptr, &event));

        magmablas_swapdblk<T>(MN - 1, NB,
                              (*dT)(), tmp.getOffset() + MN * NB, NB, 0,
                              (*dA)(), A.getOffset(), A.strides()[1], 1, queue);

#if UNMQR
        int lwork = (B.dims()[0]-A.dims()[0]+NB)*(B.dims()[1]+2*NB);
        std::vector<T> h_work(lwork);
        B.resetDims(dim4(N, K));
        magma_unmqr_gpu<T>(MagmaLeft, MagmaNoTrans,
                           B.dims()[0], B.dims()[1], A.dims()[0],
                           (*dA)(), A.getOffset(), A.strides()[1],
                           &h_tau[0],
                           (*dB)(), B.getOffset(), B.strides()[1],
                           &h_work[0], lwork,
                           (*dT)(), tmp.getOffset(), NB,
                           queue, &info);
#else
        // Materialize Q explicitly and multiply (slower fallback path).
        A.resetDims(dim4(N, M));
        magma_ungqr_gpu<T>(A.dims()[0], A.dims()[1], std::min(M, N),
                           (*dA)(), A.getOffset(), A.strides()[1],
                           &h_tau[0],
                           (*dT)(), tmp.getOffset(), NB,
                           queue, &info);
        B = matmul(A, B, AF_MAT_NONE, AF_MAT_NONE);
#endif
    } else if (M > N) {
        // Least squares for this case is solved using the following
        // solve(A, B) == tri_solve(R1, Bt);
        // Where:
        // R1 == R(seq(N), seq(N));
        // Bt == matmul(transpose(Q1), B);
        // Q1 == Q(span, seq(N));
        // A == matmul(Q, R);

        Array<T> A = copyArray<T>(a);
        B = copyArray(b);

        int MN = std::min(M, N);
        int NB = magma_get_geqrf_nb<T>(M);

        int NUM = (2*MN + ((N+31)/32)*32)*NB;
        Array<T> tmp = createEmptyArray<T>(dim4(NUM));

        std::vector<T> h_tau(NUM);

        int info = 0;
        cl::Buffer *A_buf = A.get();
        cl::Buffer *B_buf = B.get();
        cl::Buffer *dT = tmp.get();

        magma_geqrf3_gpu<T>(M, N,
                            (*A_buf)(), A.getOffset(), A.strides()[1],
                            &h_tau[0], (*dT)(), tmp.getOffset(),
                            getQueue()(), &info);

        int NRHS = B.dims()[1];
        int lhwork = (M - N + NB) * (NRHS + NB) + NRHS * NB;

        std::vector<T> h_work(lhwork);
        h_work[0] = scalar<T>(lhwork);

        // Bt = Q^H * B
        magma_unmqr_gpu<T>(MagmaLeft, MagmaConjTrans,
                           M, NRHS, N,
                           (*A_buf)(), A.getOffset(), A.strides()[1],
                           &h_tau[0],
                           (*B_buf)(), B.getOffset(), B.strides()[1],
                           &h_work[0], lhwork,
                           (*dT)(), tmp.getOffset(), NB,
                           queue, &info);

        // Restore R's diagonal blocks before the triangular solve.
        magmablas_swapdblk<T>(MN - 1, NB,
                              (*A_buf)(), A.getOffset(), A.strides()[1], 1,
                              (*dT)(), tmp.getOffset() + NB * MN, NB, 0, queue);

        if(getActivePlatform() == AFCL_PLATFORM_NVIDIA) {
            // NVIDIA workaround: solve with the conjugate transpose of A
            // as a lower-triangular system instead.
            Array<T> AT = transpose<T>(A, true);
            cl::Buffer* AT_buf = AT.get();
            CLBLAS_CHECK(gpu_blas_trsm(
                             clblasLeft, clblasLower, clblasConjTrans, clblasNonUnit,
                             N, NRHS, scalar<T>(1),
                             (*AT_buf)(), AT.getOffset(), AT.strides()[1],
                             (*B_buf)(), B.getOffset(), B.strides()[1],
                             1, &queue, 0, nullptr, &event));
        } else {
            CLBLAS_CHECK(gpu_blas_trsm(
                             clblasLeft, clblasUpper, clblasNoTrans, clblasNonUnit,
                             N, NRHS, scalar<T>(1),
                             (*A_buf)(), A.getOffset(), A.strides()[1],
                             (*B_buf)(), B.getOffset(), B.strides()[1],
                             1, &queue, 0, nullptr, &event));
        }

        B.resetDims(dim4(N, K));
    }

    return B;
}
// 4D normally-distributed random array.
array randn(const dim_t d0, const dim_t d1, const dim_t d2, const dim_t d3, const af::dtype ty)
{
    const dim4 shape(d0, d1, d2, d3);
    return randn(shape, ty);
}
// Allocate a fresh, empty Array<T> on the heap; the caller takes ownership.
Array<T> *initArray()
{
    const dim4 empty;
    return new Array<T>(empty);
}
// Constant-valued 4D array of the requested type.
array constant(T val, const dim_t d0, const dim_t d1, const dim_t d2, const dim_t d3, const af::dtype ty)
{
    const dim4 shape(d0, d1, d2, d3);
    return constant(val, shape, ty);
}
// Constant-valued 1D array of the requested type.
array constant(T val, const dim_t d0, const af::dtype ty)
{
    const dim4 shape(d0);
    return constant(val, shape, ty);
}
// 3D uniformly-distributed random array.
array randu(const dim_t d0, const dim_t d1, const dim_t d2, const af::dtype ty)
{
    const dim4 shape(d0, d1, d2);
    return randu(shape, ty);
}
// 1D identity array (d0 x 1).
array identity(const dim_t d0, const af::dtype ty)
{
    const dim4 shape(d0);
    return identity(shape, ty);
}
// Range array over a 4D shape, counting along `seq_dim`.
array range(const dim_t d0, const dim_t d1, const dim_t d2, const dim_t d3, const int seq_dim, const af::dtype ty)
{
    const dim4 shape(d0, d1, d2, d3);
    return range(shape, seq_dim, ty);
}
// 4D identity array.
array identity(const dim_t d0, const dim_t d1, const dim_t d2, const dim_t d3, const af::dtype ty)
{
    const dim4 shape(d0, d1, d2, d3);
    return identity(shape, ty);
}
// 1D normally-distributed random array.
array randn(const dim_t d0, const af::dtype ty)
{
    const dim4 shape(d0);
    return randn(shape, ty);
}