Example #1
0
// Wraps the JIT node of `in` in a jit::UnaryNode<T, T, op> and returns the
// array that createNodeArray builds from it.
//
// in:     operand whose node becomes the child of the new unary node.
// outDim: dimensions handed to createNodeArray; when left at the
//         dim4(-1, -1, -1, -1) sentinel the dimensions of `in` are used.
Array<T> unaryOp(const Array<T> &in, dim4 outDim = dim4(-1, -1, -1, -1)) {
    jit::Node_ptr operandNode = in.getNode();
    jit::Node_ptr unaryNode(new jit::UnaryNode<T, T, op>(operandNode));

    // The all -1 value means "caller did not pick an output shape"
    const dim4 unspecified(-1, -1, -1, -1);
    if (outDim == unspecified) { outDim = in.dims(); }

    return createNodeArray<T>(outDim, unaryNode);
}
Example #2
0
// Batched sort of `val` built from two keyed sorts over a flattened view of
// the data. `dim` is not declared in this block (presumably a template
// parameter selecting the sort dimension - confirm).
//
// val:         array whose data dims are flattened, passed through both
//              sort_by_key calls, evaluated, and then restored.
// isAscending: direction forwarded to the first sort_by_key call.
void sortBatched(Array<T>& val, bool isAscending) {
    af::dim4 origDims = val.dims();

    // iota inputs: seqDims is the input shape with extent 1 along `dim`,
    // tileDims is all ones except for the input extent along `dim`
    af::dim4 seqDims  = origDims;
    seqDims[dim]      = 1;
    af::dim4 tileDims(1);
    tileDims[dim]     = origDims[dim];

    Array<uint> batchKey = iota<uint>(seqDims, tileDims);

    // Empty arrays passed as the first two arguments of the first sort
    Array<uint> sortedKey = createEmptyArray<uint>(dim4());
    Array<T> sortedVal    = createEmptyArray<T>(dim4());

    // Present values and keys as flat vectors of the same length
    val.setDataDims(origDims.elements());
    batchKey.setDataDims(origDims.elements());

    // Pass 1: values with keys, in the requested direction
    sort_by_key<T, uint>(sortedVal, sortedKey, val, batchKey, 0, isAscending);

    // Pass 2: keys with values. Must be ascending (true) so the indices stay
    // in their proper order
    sort_by_key<uint, T>(batchKey, val, sortedKey, sortedVal, 0, true);
    val.eval();

    // Restore the original shape. This is correct only for dim0
    val.setDataDims(origDims);
}
Example #3
0
// Enqueues kernel::assign<T> to write `rhs` into `out` at the positions
// described by four per-dimension indexers.
//
// out:   destination array; evaluated before the kernel is enqueued.
// idxrs: four indexers, each either a sequence (isSeq set) or an index array.
// rhs:   source array; evaluated before the kernel is enqueued.
void assign(Array<T>& out, const af_index_t idxrs[], const Array<T>& rhs)
{
    out.eval();
    rhs.eval();

    vector<bool> seqFlags(4);
    // Dimensions that are not indexed by a sequence keep af_span
    vector<af_seq> seqList(4, af_span);
    // Dimensions that are indexed by a sequence keep an empty index array
    vector< Array<uint> > indexArrays(4, createEmptyArray<uint>(dim4()));

    for (dim_t d = 0; d < 4; ++d) {
        const af_index_t &entry = idxrs[d];
        seqFlags[d] = entry.isSeq;

        if (entry.isSeq) {
            seqList[d] = entry.idx.seq;
        } else {
            // Array indexer: cast it to uint and evaluate it before enqueueing
            indexArrays[d] = castArray<uint>(entry.idx.arr);
            indexArrays[d].eval();
        }
    }

    getQueue().enqueue(kernel::assign<T>, out, rhs, std::move(seqFlags),
            std::move(seqList), std::move(indexArrays));
}
Example #4
0
    // Returns a heap-allocated array containing the values of `in` with
    // consecutive duplicates removed by compute::unique.
    //
    // in:        input array; the work is done on the result of copyArray.
    // is_sorted: when false the copy is sorted with compute::sort first.
    //
    // Calls OPENCL_NOT_SUPPORTED() when T is double/cdouble and the active
    // device reports no double support. The returned pointer is handed to the
    // caller.
    Array<T>* setUnique(const Array<T> &in,
                        const bool is_sorted)
    {
        const bool needsDouble = std::is_same<T, double>::value ||
                                 std::is_same<T, cdouble>::value;
        if (needsDouble && !isDoubleSupported(getActiveDeviceId())) {
            OPENCL_NOT_SUPPORTED();
        }

        Array<T> *result = copyArray<T>(in);

        compute::command_queue queue(getQueue()());
        compute::buffer storage((*result->get())());

        // Iterator range over the first dims()[0] elements of the copy
        compute::buffer_iterator<T> first(storage, 0);
        compute::buffer_iterator<T> last(storage, result->dims()[0]);

        if (!is_sorted) { compute::sort(first, last, queue); }

        // unique returns the new logical end of the range
        last = compute::unique(first, last, queue);

        // Shrink the logical size to the number of values that were kept
        const dim4 uniqueDims(std::distance(first, last), 1, 1, 1);
        result->resetDims(uniqueDims);

        return result;
    }
Example #5
0
    // Builds, for every column of `in`, a square matrix whose num-th diagonal
    // holds that column and whose other entries are zero.
    //
    // in:  input; dims()[0] is the diagonal length, dims()[1] the batch count.
    // num: diagonal index; positive values select column = row + num,
    //      negative values select row = column - num.
    //
    // Returns an array of dims (n, n, batch) with n = in.dims()[0] + |num|.
    Array<T> diagCreate(const Array<T> &in, const int num)
    {
        const int edge   = in.dims()[0] + std::abs(num);
        const int nBatch = in.dims()[1];
        Array<T> out = createEmptyArray<T>(dim4(edge, edge, nBatch));

        const T *src = in.get();
        T *dst = out.get();

        for (int b = 0; b < nBatch; ++b) {
            for (int col = 0; col < edge; ++col) {
                T *column = dst + col * out.strides()[1];

                for (int row = 0; row < edge; ++row) {
                    if (row == col - num) {
                        // On the requested diagonal: the source index is the
                        // row when num > 0 and the column otherwise
                        column[row] = (num > 0) ? src[row] : src[col];
                    } else {
                        column[row] = scalar<T>(0);
                    }
                }
            }

            // Step to the next output matrix and the next input column
            dst += out.strides()[2];
            src += in.strides()[1];
        }

        return out;
    }
Example #6
0
    // Extracts the num-th diagonal of every 2D slice of `in`.
    //
    // in:  input array; dims 2 and 3 are treated as batch dimensions.
    // num: diagonal index; num > 0 starts at element (0, num), otherwise at
    //      element (-num, 0).
    //
    // Returns an array of dims (size, 1, idims[2], idims[3]) where
    // size = max(idims[0], idims[1]) - |num|. Output positions that fall
    // outside the input slice are filled with zero.
    //
    // NOTE(review): the kernel-based diagExtract variants size the output with
    // std::min instead of std::max; the shape is left unchanged here so callers
    // are not affected - confirm which one is intended.
    Array<T> diagExtract(const Array<T> &in, const int num)
    {
        // Copy the dims rather than holding raw pointers into them
        const dim4 idims = in.dims();
        dim_t size = std::max(idims[0], idims[1]) - std::abs(num);
        Array<T> out = createEmptyArray<T>(dim4(size, 1, idims[2], idims[3]));

        const dim4 odims = out.dims();

        // Row/column of the first element on the requested diagonal
        const dim_t row_off = (num > 0) ? 0 : -num;
        const dim_t col_off = (num > 0) ? num : 0;
        const dim_t i_off   = col_off * in.strides()[1] + row_off;

        for (dim_t l = 0; l < odims[3]; l++) {

            for (dim_t k = 0; k < odims[2]; k++) {
                const T *iptr = in.get() + l * in.strides()[3] + k * in.strides()[2] + i_off;
                T *optr = out.get() + l * out.strides()[3] + k * out.strides()[2];

                for (dim_t i = 0; i < odims[0]; i++) {
                    // The element read is (i + row_off, i + col_off); both
                    // coordinates must be inside the slice. Testing only `i`
                    // against the dims reads past the end of a non-square
                    // input whenever num != 0.
                    const bool inside = (i + row_off < idims[0]) &&
                                        (i + col_off < idims[1]);
                    T val = scalar<T>(0);
                    if (inside) val = iptr[i * in.strides()[1] + i];
                    optr[i] = val;
                }
            }
        }

        return out;
    }
Example #7
0
// Constructs an array of shape `dims` and copies dims.elements() values from
// `in_data` into a buffer obtained from memAlloc<T> (memFree<T> is passed
// alongside it when `data` is initialised).
//
// dims:    shape of the new array; also recorded as data_dims.
// in_data: source of the initial contents; dims.elements() values are read.
Array<T>::Array(dim4 dims, const T * const in_data):
    ArrayInfo(getActiveDeviceId(), dims, dim4(0,0,0,0), calcStrides(dims),
              static_cast<af_dtype>(dtype_traits<T>::af_type)),
    data(memAlloc<T>(dims.elements()), memFree<T>),
    data_dims(dims),
    node(),
    ready(true),
    offset(0),
    owner(true)
{
    std::copy_n(in_data, dims.elements(), data.get());
}
    // Allocates an (n, n, batch) output with n = in.dims()[0] + |num| and
    // batch = in.dims()[1], then lets kernel::diagCreate<T> fill it.
    //
    // in:  input array.
    // num: diagonal index forwarded to the kernel.
    Array<T> diagCreate(const Array<T> &in, const int num)
    {
        const int edge   = in.dims()[0] + std::abs(num);
        const int nBatch = in.dims()[1];

        const dim4 outDims(edge, edge, nBatch);
        Array<T> out = createEmptyArray<T>(outDims);

        kernel::diagCreate<T>(out, in, num);

        return out;
    }
    // Allocates a (size, 1, d2, d3) output with
    // size = min(in.dims()[0], in.dims()[1]) - |num| and lets
    // kernel::diagExtract<T> fill it.
    //
    // in:  input array; dims 2 and 3 are carried over to the output.
    // num: diagonal index forwarded to the kernel.
    Array<T> diagExtract(const Array<T> &in, const int num)
    {
        const dim4 &inDims = in.dims();
        const dim_t diagLen = std::min(inDims[0], inDims[1]) - std::abs(num);

        const dim4 outDims(diagLen, 1, inDims[2], inDims[3]);
        Array<T> out = createEmptyArray<T>(outDims);

        kernel::diagExtract<T>(out, in, num);

        return out;
    }
Example #10
0
// Gathers elements of `in` selected by four per-dimension indexers and returns
// them as a new array.
//
// in:    array being indexed.
// idxrs: four indexers; each is either a sequence (isSeq set) or an index
//        array that is cast to uint before use.
//
// Returns an array whose extent along each dimension comes from toDims() for
// sequence indexers and from the element count of the index array otherwise.
// When the output has no elements the kernel launch is skipped.
Array<T> index(const Array<T>& in, const af_index_t idxrs[])
{
    kernel::IndexKernelParam_t p;
    std::vector<af_seq> seqs(4, af_span);
    // create seq vector to retrieve output
    // dimensions, strides & offsets
    for (dim_t x=0; x<4; ++x) {
        if (idxrs[x].isSeq) {
            seqs[x] = idxrs[x].idx.seq;
        }
    }

    // retrieve dimensions, strides and offsets
    dim4 iDims = in.dims();
    dim4 dDims = in.getDataDims();
    dim4 oDims = toDims  (seqs, iDims);
    dim4 iOffs = toOffset(seqs, dDims);
    dim4 iStrds= toStride(seqs, dDims);

    for (dim_t i=0; i<4; ++i) {
        p.isSeq[i] = idxrs[i].isSeq;
        p.offs[i]  = iOffs[i];
        p.strds[i] = iStrds[i];
    }

    Buffer* bPtrs[4];

    std::vector< Array<uint> > idxArrs(4, createEmptyArray<uint>(dim4()));
    // look through the indexers to read the af_array indices
    for (dim_t x=0; x<4; ++x) {
        // set index pointers where applicable
        if (!p.isSeq[x]) {
            idxArrs[x] = castArray<uint>(idxrs[x].idx.arr);
            bPtrs[x] = idxArrs[x].get();
            // set output array ith dimension value
            oDims[x] = idxArrs[x].elements();
        }
        else {
            // alloc a 1-element buffer to keep OpenCL from failing
            bPtrs[x] = bufferAlloc(sizeof(uint));
        }
    }

    Array<T> out = createEmptyArray<T>(oDims);

    // Nothing to gather for an empty output, so skip the launch - but do not
    // return yet: the placeholder buffers still have to be released.
    if (oDims.elements() != 0) {
        kernel::index<T>(out, in, p, bPtrs);
    }

    // Placeholder buffers are owned by this function; free them on every
    // path, including the empty-output case, so they never leak.
    for (dim_t x=0; x<4; ++x) {
        if (p.isSeq[x]) bufferFree(bPtrs[x]);
    }

    return out;
}
Example #11
0
// Evaluates `in`, allocates a (size, 1, d2, d3) output with
// size = min(in.dims()[0], in.dims()[1]) - |num|, and enqueues
// kernel::diagExtract<T> on the queue to fill it.
//
// in:  input array; dims 2 and 3 are carried over to the output.
// num: diagonal index forwarded to the kernel.
Array<T> diagExtract(const Array<T> &in, const int num)
{
    in.eval();

    const dim4 inDims   = in.dims();
    const dim_t diagLen = std::min(inDims[0], inDims[1]) - std::abs(num);

    const dim4 outDims(diagLen, 1, inDims[2], inDims[3]);
    Array<T> out = createEmptyArray<T>(outDims);

    getQueue().enqueue(kernel::diagExtract<T>, out, in, num);

    return out;
}
Example #12
0
// Converts a dense array into a COO sparse array.
//
// in: dense input; evaluated before use.
//
// The indices returned by where() are split into row and column indices with
// mod/div by the number of rows of `in` (assumes where() yields column-major
// linear indices, as documented for af::where - confirm for this backend).
// The matching values are gathered from a flattened copy of `in`.
SparseArray<T> sparseConvertDenseToCOO(const Array<T> &in)
{
    in.eval();

    Array<uint> nonZeroIdx_ = where<T>(in);
    Array<int> nonZeroIdx = cast<int, uint>(nonZeroIdx_);

    dim_t nNZ = nonZeroIdx.elements();

    // One copy of the row count per non-zero. The divisor must be the number
    // of rows, not the number of non-zeros: for a column-major linear index,
    // row = idx % rows and col = idx / rows.
    const int nRows = static_cast<int>(in.dims()[0]);
    Array<int> constDim = createValueArray<int>(dim4(nNZ), nRows);
    constDim.eval();

    Array<int> rowIdx = arithOp<int, af_mod_t>(nonZeroIdx, constDim, nonZeroIdx.dims());
    Array<int> colIdx = arithOp<int, af_div_t>(nonZeroIdx, constDim, nonZeroIdx.dims());

    // Flatten a copy of the input so the linear indices can be looked up
    Array<T> values = copyArray<T>(in);
    values.modDims(dim4(values.elements()));
    values = lookup<T, int>(values, nonZeroIdx, 0);

    return createArrayDataSparseArray<T>(in.dims(), values, rowIdx, colIdx, AF_STORAGE_COO);
}
Example #13
0
// Solves a linear system with LAPACK routines enqueued on the queue.
//
// a:       coefficient matrix (M x N); evaluated, then copied before use.
// b:       right-hand sides (K columns); evaluated, then padded to
//          max(M, N) rows.
// options: when AF_MAT_UPPER or AF_MAT_LOWER is set the call is forwarded to
//          triangleSolve.
//
// Square systems go through gesv_func, non-square ones through gels_func.
// The returned array is the padded copy of `b` that the routine is given;
// for non-square systems its dims are reset to (N, K).
Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options)
{
    a.eval();
    b.eval();

    const bool triangular = (options & AF_MAT_UPPER) || (options & AF_MAT_LOWER);
    if (triangular) { return triangleSolve<T>(a, b, options); }

    int M = a.dims()[0];
    int N = a.dims()[1];
    int K = b.dims()[1];

    Array<T> A = copyArray<T>(a);
    Array<T> B = padArray<T, T>(b, dim4(max(M, N), K));

    if (M != N) {
        // Non-square system: gels
        auto lstsq = [=] (Array<T> lhs, Array<T> rhs, int m, int n, int nrhs) {
            int sM = lhs.strides()[1];
            int sN = lhs.strides()[2] / sM;

            gels_func<T>()(AF_LAPACK_COL_MAJOR, 'N',
                    m, n, nrhs,
                    lhs.get(), lhs.strides()[1],
                    rhs.get(), max(sM, sN));
        };
        // The dims are reset before the task is enqueued
        B.resetDims(dim4(N, K));
        getQueue().enqueue(lstsq, A, B, M, N, K);
        return B;
    }

    // Square system: gesv, with a pivot array of N entries
    Array<int> pivot = createEmptyArray<int>(dim4(N, 1, 1));

    auto luSolve = [=] (Array<T> lhs, Array<T> rhs, Array<int> piv, int n, int nrhs) {
        gesv_func<T>()(AF_LAPACK_COL_MAJOR, n, nrhs, lhs.get(), lhs.strides()[1],
                       piv.get(), rhs.get(), rhs.strides()[1]);
    };
    getQueue().enqueue(luSolve, A, B, pivot, N, K);

    return B;
}
Example #14
0
    // Returns a heap-allocated array holding the output of
    // compute::set_intersection over `first` and `second`.
    //
    // first, second: input arrays.
    // is_unique:     when false both inputs are first passed through
    //                setUnique(..., false).
    //
    // Calls OPENCL_NOT_SUPPORTED() when T is double/cdouble and the active
    // device reports no double support. The returned pointer is handed to the
    // caller.
    Array<T>* setIntersect(const Array<T> &first,
                           const Array<T> &second,
                           const bool is_unique)
    {
        if ((std::is_same<T, double>::value || std::is_same<T, cdouble>::value) &&
            !isDoubleSupported(getActiveDeviceId())) {
            OPENCL_NOT_SUPPORTED();
        }
        Array<T> unique_first = first;
        Array<T> unique_second = second;

        if (!is_unique) {
            // setUnique hands back a heap-allocated Array; copy it into the
            // local and release the heap object so it is not leaked.
            // NOTE(review): assumes these arrays come from plain `new`
            // (as in initArray) - confirm that `delete` is the right release.
            Array<T> *tmp = setUnique(first, false);
            unique_first = *tmp;
            delete tmp;

            tmp = setUnique(second, false);
            unique_second = *tmp;
            delete tmp;
        }

        // Upper bound on the result size; trimmed to the real size below
        size_t out_size = std::max(unique_first.dims()[0], unique_second.dims()[0]);
        Array<T> *out = createEmptyArray<T>(dim4(out_size, 1, 1, 1));

        compute::command_queue queue(getQueue()());

        compute::buffer first_data((*unique_first.get())());
        compute::buffer second_data((*unique_second.get())());
        compute::buffer out_data((*out->get())());

        compute::buffer_iterator<T> first_begin(first_data, 0);
        compute::buffer_iterator<T> first_end(first_data, unique_first.dims()[0]);
        compute::buffer_iterator<T> second_begin(second_data, 0);
        compute::buffer_iterator<T> second_end(second_data, unique_second.dims()[0]);
        compute::buffer_iterator<T> out_begin(out_data, 0);

        compute::buffer_iterator<T> out_end = compute::set_intersection(
            first_begin, first_end, second_begin, second_end, out_begin, queue
        );

        // Shrink the logical size to the number of elements actually written
        out->resetDims(dim4(std::distance(out_begin, out_end), 1, 1, 1));

        return out;
    }
Example #15
0
// Solves a linear system by calling LAPACK routines directly on pointers
// obtained from getMappedPtr().
//
// a:       coefficient matrix (M x N); copied before use.
// b:       right-hand sides (K columns); padded with zeros to max(M, N) rows.
// options: when AF_MAT_UPPER or AF_MAT_LOWER is set the call is forwarded to
//          triangleSolve.
//
// Square systems go through gesv_func, non-square ones through gels_func.
// The returned array is the padded copy of `b`; for non-square systems its
// dims are reset to (N, K) after the call.
Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options)
{
    const bool triangular = (options & AF_MAT_UPPER) || (options & AF_MAT_LOWER);
    if (triangular) { return triangleSolve<T>(a, b, options); }

    int M = a.dims()[0];
    int N = a.dims()[1];
    int K = b.dims()[1];

    Array<T> A = copyArray<T>(a);
    Array<T> B = padArray<T, T>(b, dim4(max(M, N), K), scalar<T>(0));

    std::shared_ptr<T> aPtr = A.getMappedPtr();
    std::shared_ptr<T> bPtr = B.getMappedPtr();

    if (M != N) {
        // Non-square system: gels. The last argument (B's leading dimension)
        // is derived from the strides of the input `a`.
        int sM = a.strides()[1];
        int sN = a.strides()[2] / sM;

        gels_func<T>()(AF_LAPACK_COL_MAJOR, 'N',
                       M, N, K,
                       aPtr.get(), A.strides()[1],
                       bPtr.get(), max(sM, sN));
        B.resetDims(dim4(N, K));
        return B;
    }

    // Square system: gesv, with host-side pivot storage of N entries
    std::vector<int> pivot(N);
    gesv_func<T>()(AF_LAPACK_COL_MAJOR, N, K,
                   aPtr.get(), A.strides()[1],
                   pivot.data(),
                   bPtr.get(), B.strides()[1]);

    return B;
}
Example #16
0
// Enqueues an in-place getrf call on `in` and returns pivot information.
//
// in:            matrix overwritten by getrf_func; evaluated first.
// convert_pivot: when true, a range array of in.dims()[0] entries is passed
//                with the raw pivots to kernel::convertPivot and returned;
//                when false the raw pivot array (min(rows, cols) entries) is
//                returned.
Array<int> lu_inplace(Array<T> &in, const bool convert_pivot) {
    in.eval();

    const dim4 inDims = in.dims();
    const af::dim4 pivotDims(min(inDims[0], inDims[1]), 1, 1, 1);
    Array<int> pivot = createEmptyArray<int>(pivotDims);

    auto factorize = [=](Param<T> mat, Param<int> piv) {
        dim4 matDims = mat.dims();
        getrf_func<T>()(AF_LAPACK_COL_MAJOR, matDims[0], matDims[1], mat.get(),
                        mat.strides(1), piv.get());
    };
    getQueue().enqueue(factorize, in, pivot);

    if (!convert_pivot) { return pivot; }

    // Start from 0..rows-1 and let convertPivot combine it with the raw pivots
    Array<int> p = range<int>(dim4(inDims[0]), 0);
    getQueue().enqueue(kernel::convertPivot, p, pivot);
    return p;
}
Example #17
0
 // Queries the four dimensions of `arr` through af_get_dims and packs them
 // into a dim4. The af_get_dims result is passed through AF_THROW.
 static dim4 getDims(const af_array arr)
 {
     dim_t extent[4];
     AF_THROW(af_get_dims(&extent[0], &extent[1], &extent[2], &extent[3], arr));
     return dim4(extent[0], extent[1], extent[2], extent[3]);
 }
Example #18
0
 // Helper functions
 // Returns the four dimensions of the wrapped handle `arr` as a dim4. The
 // af_get_dims result is passed through AF_THROW.
 dim4 array::dims() const
 {
     dim_type extent[4];
     AF_THROW(af_get_dims(&extent[0], &extent[1], &extent[2], &extent[3], arr));
     return dim4(extent[0], extent[1], extent[2], extent[3]);
 }
Example #19
0
namespace cuda
{
// Declaration of iota in the cuda namespace; the definition is not part of
// this snippet.
//
// dim:       first dim4 argument.
// tile_dims: second dim4 argument, defaulting to dim4(1).
// NOTE(review): presumably `dim` is the shape of the generated sequence and
// `tile_dims` the per-dimension repeat count - confirm against the definition.
template<typename T>
Array<T> iota(const dim4 &dim, const dim4 &tile_dims = dim4(1));
}
Example #20
0
 // Convenience overload: wraps d0 in a dim4 and forwards to randn(dims, ty).
 array randn(const dim_type d0, af_dtype ty)
 {
     const dim4 dims(d0);
     return randn(dims, ty);
 }
Example #21
0
// Solves a non-square system with QR-based routines (MAGMA geqrf/unmqr/ungqr
// calls plus a clBLAS triangular solve issued on the OpenCL queue).
//
// a: coefficient matrix, M x N.
// b: right-hand sides, K columns.
//
// Returns B, the array holding the result of the branch that ran. When
// M == N neither branch runs and the empty array created up front is
// returned unchanged.
Array<T> leastSquares(const Array<T> &a, const Array<T> &b)
{
    int M = a.dims()[0];
    int N = a.dims()[1];
    int K = b.dims()[1];
    int MN = std::min(M, N);

    Array<T> B = createEmptyArray<T>(dim4());
    gpu_blas_trsm_func<T> gpu_blas_trsm;

    // NOTE(review): `event` receives the trsm completion event but is neither
    // waited on nor released inside this function - confirm that is intended.
    cl_event event;
    cl_command_queue queue = getQueue()();

    if (M < N) {

#define UNMQR 0 // FIXME: UNMQR == 1 should be faster but does not work

        // Least squares for this case is solved using the following
        // solve(A, B) == matmul(Q, Xpad);
        // Where:
        // Xpad == pad(Xt, N - M, 1);
        // Xt   == tri_solve(R1, B);
        // R1   == R(seq(M), seq(M));
        // transpose(A) == matmul(Q, R);

        // QR is performed on the transpose of A
        Array<T> A = transpose<T>(a, true);

#if UNMQR
        B = padArray<T, T>(b, dim4(N, K), scalar<T>(0));
        B.resetDims(dim4(M, K));
#else
        B = copyArray<T>(b);
#endif

        // Workspace size for the geqrf call, derived from the block size NB
        int NB = magma_get_geqrf_nb<T>(A.dims()[1]);
        int NUM = (2*MN + ((M+31)/32)*32)*NB;
        Array<T> tmp = createEmptyArray<T>(dim4(NUM));

        std::vector<T> h_tau(MN);

        int info = 0;
        cl::Buffer *dA = A.get();
        cl::Buffer *dT = tmp.get();
        cl::Buffer *dB = B.get();

        magma_geqrf3_gpu<T>(A.dims()[0], A.dims()[1],
                           (*dA)(), A.getOffset(), A.strides()[1],
                           &h_tau[0], (*dT)(), tmp.getOffset(), getQueue()(), &info);

        A.resetDims(dim4(M, M));

        // Swap blocks between A and the workspace around the triangular
        // solve; the second swapdblk call below swaps them back
        magmablas_swapdblk<T>(MN-1, NB,
                              (*dA)(), A.getOffset(), A.strides()[1], 1,
                              (*dT)(), tmp.getOffset() + MN * NB, NB, 0, queue);

        CLBLAS_CHECK(gpu_blas_trsm(
                         clblasLeft, clblasUpper,
                         clblasConjTrans, clblasNonUnit,
                         B.dims()[0], B.dims()[1],
                         scalar<T>(1),
                         (*dA)(), A.getOffset(), A.strides()[1],
                         (*dB)(), B.getOffset(), B.strides()[1],
                         1, &queue, 0, nullptr, &event));

        magmablas_swapdblk<T>(MN - 1, NB,
                              (*dT)(), tmp.getOffset() + MN * NB, NB, 0,
                              (*dA)(), A.getOffset(), A.strides()[1], 1, queue);

#if UNMQR
        int lwork = (B.dims()[0]-A.dims()[0]+NB)*(B.dims()[1]+2*NB);
        std::vector<T> h_work(lwork);
        B.resetDims(dim4(N, K));
        magma_unmqr_gpu<T>(MagmaLeft, MagmaNoTrans,
                           B.dims()[0], B.dims()[1], A.dims()[0],
                           (*dA)(), A.getOffset(), A.strides()[1],
                           &h_tau[0],
                           (*dB)(), B.getOffset(), B.strides()[1],
                           &h_work[0], lwork,
                           (*dT)(), tmp.getOffset(), NB, queue, &info);
#else
        // Form Q explicitly in A, then multiply it with the solved B
        A.resetDims(dim4(N, M));
        magma_ungqr_gpu<T>(A.dims()[0], A.dims()[1], std::min(M, N),
                           (*dA)(), A.getOffset(), A.strides()[1],
                           &h_tau[0],
                           (*dT)(), tmp.getOffset(), NB, queue, &info);

        B = matmul(A, B, AF_MAT_NONE, AF_MAT_NONE);
#endif
    } else if (M > N) {
        // Least squares for this case is solved using the following
        // solve(A, B) == tri_solve(R1, Bt);
        // Where:
        // R1 == R(seq(N), seq(N));
        // Bt == matmul(transpose(Q1), B);
        // Q1 == Q(span, seq(N));
        // A  == matmul(Q, R);

        Array<T> A = copyArray<T>(a);
        B = copyArray(b);

        // Shadows the function-level MN; both hold std::min(M, N)
        int MN = std::min(M, N);
        int NB = magma_get_geqrf_nb<T>(M);

        int NUM = (2*MN + ((N+31)/32)*32)*NB;
        Array<T> tmp = createEmptyArray<T>(dim4(NUM));

        // NOTE(review): sized with NUM here while the M < N branch uses MN -
        // confirm which size the geqrf call actually needs.
        std::vector<T> h_tau(NUM);

        int info = 0;
        cl::Buffer *A_buf = A.get();
        cl::Buffer *B_buf = B.get();
        cl::Buffer *dT = tmp.get();

        magma_geqrf3_gpu<T>(M, N,
                           (*A_buf)(), A.getOffset(), A.strides()[1],
                           &h_tau[0], (*dT)(), tmp.getOffset(), getQueue()(), &info);

        int NRHS = B.dims()[1];
        int lhwork = (M - N + NB) * (NRHS + NB) + NRHS * NB;

        std::vector<T> h_work(lhwork);
        h_work[0] = scalar<T>(lhwork);

        magma_unmqr_gpu<T>(MagmaLeft, MagmaConjTrans,
                           M, NRHS, N,
                           (*A_buf)(), A.getOffset(), A.strides()[1],
                           &h_tau[0],
                           (*B_buf)(), B.getOffset(), B.strides()[1],
                           &h_work[0], lhwork,
                           (*dT)(), tmp.getOffset(), NB,
                           queue, &info);

        magmablas_swapdblk<T>(MN - 1, NB,
                              (*A_buf)(), A.getOffset(), A.strides()[1], 1,
                              (*dT)(), tmp.getOffset() + NB * MN,
                              NB, 0, queue);

        // On the NVIDIA platform the solve runs on an explicit conjugate
        // transpose of A (lower / ConjTrans); other platforms use A directly
        // (upper / NoTrans)
        if(getActivePlatform() == AFCL_PLATFORM_NVIDIA)
        {
            Array<T> AT = transpose<T>(A, true);
            cl::Buffer* AT_buf = AT.get();
            CLBLAS_CHECK(gpu_blas_trsm(
                             clblasLeft, clblasLower, clblasConjTrans, clblasNonUnit,
                             N, NRHS, scalar<T>(1),
                             (*AT_buf)(), AT.getOffset(), AT.strides()[1],
                             (*B_buf)(), B.getOffset(), B.strides()[1],
                             1, &queue, 0, nullptr, &event));
        } else {
            CLBLAS_CHECK(gpu_blas_trsm(
                             clblasLeft, clblasUpper, clblasNoTrans, clblasNonUnit,
                             N, NRHS, scalar<T>(1),
                             (*A_buf)(), A.getOffset(), A.strides()[1],
                             (*B_buf)(), B.getOffset(), B.strides()[1],
                             1, &queue, 0, nullptr, &event));
        }
        // Reset B's dims to N x K for the returned result
        B.resetDims(dim4(N, K));
    }

    return B;
}
Example #22
0
 // Convenience overload: packs d0..d3 into a dim4 and forwards to
 // randn(dims, ty).
 array randn(const dim_t d0,
             const dim_t d1, const dim_t d2,
             const dim_t d3, const af::dtype ty)
 {
     const dim4 dims(d0, d1, d2, d3);
     return randn(dims, ty);
 }
Example #23
0
// Allocates a new Array<T> constructed from a default dim4 and returns the
// raw pointer to the caller.
Array<T> *initArray() {
    const dim4 emptyDims = dim4();
    return new Array<T>(emptyDims);
}
Example #24
0
 // Convenience overload: packs d0..d3 into a dim4 and forwards to
 // constant(val, dims, ty).
 array constant(T val, const dim_t d0, const dim_t d1, const dim_t d2, const dim_t d3, const af::dtype ty)
 {
     const dim4 dims(d0, d1, d2, d3);
     return constant(val, dims, ty);
 }
Example #25
0
 // Convenience overload: wraps d0 in a dim4 and forwards to
 // constant(val, dims, ty).
 array constant(T val, const dim_t d0, const af::dtype ty)
 {
     const dim4 dims(d0);
     return constant(val, dims, ty);
 }
Example #26
0
 // Convenience overload: packs d0..d2 into a dim4 and forwards to
 // randu(dims, ty).
 array randu(const dim_t d0,
             const dim_t d1, const dim_t d2, const af::dtype ty)
 {
     const dim4 dims(d0, d1, d2);
     return randu(dims, ty);
 }
Example #27
0
 // Convenience overload: wraps d0 in a dim4 and forwards to
 // identity(dims, ty).
 array identity(const dim_t d0, const af::dtype ty)
 {
     const dim4 dims(d0);
     return identity(dims, ty);
 }
Example #28
0
 // Convenience overload: packs d0..d3 into a dim4 and forwards to
 // range(dims, seq_dim, ty).
 array range(const dim_t d0, const dim_t d1, const dim_t d2,
            const dim_t d3, const int seq_dim, const af::dtype ty)
 {
     const dim4 dims(d0, d1, d2, d3);
     return range(dims, seq_dim, ty);
 }
Example #29
0
 // Convenience overload: packs d0..d3 into a dim4 and forwards to
 // identity(dims, ty).
 array identity(const dim_t d0,
             const dim_t d1, const dim_t d2,
             const dim_t d3, const af::dtype ty)
 {
     const dim4 dims(d0, d1, d2, d3);
     return identity(dims, ty);
 }
Example #30
0
 // Convenience overload: wraps d0 in a dim4 and forwards to randn(dims, ty).
 array randn(const dim_t d0, const af::dtype ty)
 {
     const dim4 dims(d0);
     return randn(dims, ty);
 }