// Solves op(A) X = B given an LU-factored A and its pivots, via MAGMA's getrs.
template<typename T>
Array<T> solveLU(const Array<T> &A, const Array<int> &pivot,
                 const Array<T> &b, const af_mat_prop options)
{
    if (OpenCLCPUOffload()) {
        return cpu::solveLU(A, pivot, b, options);
    }

    int N    = A.dims()[0];
    int NRHS = b.dims()[1];

    // magma_getrs_gpu expects the pivot indices on the host.
    std::vector<int> ipiv(N);
    copyData(&ipiv[0], pivot);

    // getrs overwrites the right-hand sides with the solution,
    // so solve into a copy of b.
    Array<T> B = copyArray<T>(b);

    const cl::Buffer *A_buf = A.get();
    cl::Buffer *B_buf = B.get();

    int info = 0;
    magma_getrs_gpu<T>(MagmaNoTrans, N, NRHS,
                       (*A_buf)(), A.getOffset(), A.strides()[1],
                       &ipiv[0],
                       (*B_buf)(), B.getOffset(), B.strides()[1],
                       getQueue()(), &info);
    return B;
}
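// A minimal usage sketch for the factor-then-solve path. It assumes an
// lu_inplace<T>() helper that runs getrf on the device and returns the
// pivot array; that helper is not defined in this section, so treat the
// name and signature as illustrative only.
//
//     Array<T> A = copyArray<T>(a);                // getrf overwrites its input
//     Array<int> pivot = lu_inplace<T>(A, false);  // factor once...
//     Array<T> x1 = solveLU<T>(A, pivot, b1, AF_MAT_NONE);
//     Array<T> x2 = solveLU<T>(A, pivot, b2, AF_MAT_NONE);  // ...solve many times
//
// Reusing the factorization amortizes the O(N^3) getrf cost across
// multiple right-hand sides; each getrs solve is only O(N^2 * NRHS).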
// Out-of-place Cholesky: factors a copy of the input, then zeroes the
// half of the matrix that potrf leaves untouched so the result is a
// clean triangular factor.
template<typename T>
Array<T> cholesky(int *info, const Array<T> &in, const bool is_upper)
{
    if (OpenCLCPUOffload()) {
        return cpu::cholesky(info, in, is_upper);
    }

    Array<T> out = copyArray<T>(in);
    *info = cholesky_inplace(out, is_upper);

    if (is_upper) triangle<T, true , false>(out, out);
    else          triangle<T, false, false>(out, out);

    return out;
}
// In-place Cholesky factorization via MAGMA's potrf. Returns the
// LAPACK-style info code: 0 on success, i > 0 if the leading minor of
// order i is not positive definite.
template<typename T>
int cholesky_inplace(Array<T> &in, const bool is_upper)
{
    if (OpenCLCPUOffload()) {
        return cpu::cholesky_inplace(in, is_upper);
    }

    dim4 iDims = in.dims();
    int N = iDims[0];

    magma_uplo_t uplo = is_upper ? MagmaUpper : MagmaLower;

    int info = 0;
    cl::Buffer *in_buf = in.get();
    magma_potrf_gpu<T>(uplo, N,
                       (*in_buf)(), in.getOffset(), in.strides()[1],
                       getQueue()(), &info);
    return info;
}
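// A minimal caller-side sketch: factor a symmetric positive-definite
// matrix and check the info code before trusting the result. Only names
// defined above are used.
//
//     int info = 0;
//     Array<T> R = cholesky<T>(&info, in, /*is_upper=*/true);
//     if (info > 0) {
//         // Input is not positive definite; the factorization stopped
//         // at the leading minor of order `info`.
//     }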
// Dispatches a solve to the right routine: triangular substitution when
// the caller marks the matrix as upper/lower, an LU-based solve for
// square systems, and a least-squares solve otherwise.
template<typename T>
Array<T> solve(const Array<T> &a, const Array<T> &b, const af_mat_prop options)
{
    try {
        if (OpenCLCPUOffload()) {
            return cpu::solve(a, b, options);
        }

        initBlas();

        if (options & AF_MAT_UPPER || options & AF_MAT_LOWER) {
            return triangleSolve<T>(a, b, options);
        }

        if (a.dims()[0] == a.dims()[1]) {
            return generalSolve<T>(a, b);
        } else {
            return leastSquares<T>(a, b);
        }
    } catch (cl::Error &err) {
        CL_TO_AF_ERROR(err);
    }
}
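// triangleSolve<T>, generalSolve<T>, and leastSquares<T> are defined
// elsewhere. As a sketch only, the square-system path could pair getrf
// with the getrs call shown in solveLU above. This assumes a
// magma_getrf_gpu<T> wrapper with the argument order below, which is
// not confirmed by this section; check the local MAGMA headers.
//
//     template<typename T>
//     Array<T> generalSolve(const Array<T> &a, const Array<T> &b)
//     {
//         Array<T> A = copyArray<T>(a);   // getrf overwrites its input
//         Array<T> B = copyArray<T>(b);   // getrs overwrites the RHS
//         int N = A.dims()[0];
//
//         std::vector<int> ipiv(N);
//         int info = 0;
//         cl::Buffer *A_buf = A.get();
//         magma_getrf_gpu<T>(N, N, (*A_buf)(), A.getOffset(), A.strides()[1],
//                            &ipiv[0], getQueue()(), &info);
//
//         cl::Buffer *B_buf = B.get();
//         magma_getrs_gpu<T>(MagmaNoTrans, N, B.dims()[1],
//                            (*A_buf)(), A.getOffset(), A.strides()[1],
//                            &ipiv[0],
//                            (*B_buf)(), B.getOffset(), B.strides()[1],
//                            getQueue()(), &info);
//         return B;
//     }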
// Batched matrix multiply: C = op(A) * op(B). Size-1 batch dimensions
// on either operand are broadcast against the other.
template<typename T>
Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
                af_mat_prop optLhs, af_mat_prop optRhs)
{
#if defined(WITH_LINEAR_ALGEBRA)
    if (OpenCLCPUOffload(false)) {  // Do not force offload gemm on OSX Intel devices
        return cpu::matmul(lhs, rhs, optLhs, optRhs);
    }
#endif
    const auto lOpts = toBlasTranspose(optLhs);
    const auto rOpts = toBlasTranspose(optRhs);

    const auto aRowDim = (lOpts == OPENCL_BLAS_NO_TRANS) ? 0 : 1;
    const auto aColDim = (lOpts == OPENCL_BLAS_NO_TRANS) ? 1 : 0;
    const auto bColDim = (rOpts == OPENCL_BLAS_NO_TRANS) ? 1 : 0;

    const dim4 lDims = lhs.dims();
    const dim4 rDims = rhs.dims();
    const int M = lDims[aRowDim];
    const int N = rDims[bColDim];
    const int K = lDims[aColDim];

    dim_t d2 = std::max(lDims[2], rDims[2]);
    dim_t d3 = std::max(lDims[3], rDims[3]);
    dim4 oDims = af::dim4(M, N, d2, d3);
    Array<T> out = createEmptyArray<T>(oDims);

    const auto alpha = scalar<T>(1);
    const auto beta  = scalar<T>(0);

    const dim4 lStrides = lhs.strides();
    const dim4 rStrides = rhs.strides();
    const dim4 oStrides = out.strides();

    int batchSize = oDims[2] * oDims[3];

    // An operand participates in batching along a dimension only if its
    // extent matches the output; otherwise it is broadcast (zero offset).
    bool is_l_d2_batched = oDims[2] == lDims[2];
    bool is_l_d3_batched = oDims[3] == lDims[3];
    bool is_r_d2_batched = oDims[2] == rDims[2];
    bool is_r_d3_batched = oDims[3] == rDims[3];

    for (int n = 0; n < batchSize; n++) {
        // Decompose the flat batch index against the *output* batch
        // dims, since batchSize = oDims[2] * oDims[3].
        int w = n / oDims[2];
        int z = n - w * oDims[2];

        int loff = z * (is_l_d2_batched * lStrides[2]) +
                   w * (is_l_d3_batched * lStrides[3]);
        int roff = z * (is_r_d2_batched * rStrides[2]) +
                   w * (is_r_d3_batched * rStrides[3]);

        dim_t lOffset = lhs.getOffset() + loff;
        dim_t rOffset = rhs.getOffset() + roff;
        dim_t oOffset = out.getOffset() + z * oStrides[2] + w * oStrides[3];

        cl::Event event;
        if (rDims[bColDim] == 1) {
            // The right-hand side is a vector; use gemv. Its element
            // stride depends on whether it is stored as a column or row.
            dim_t incr = (optRhs == AF_MAT_NONE) ? rStrides[0] : rStrides[1];
            gpu_blas_gemv_func<T> gemv;
            OPENCL_BLAS_CHECK(
                gemv(lOpts, lDims[0], lDims[1],
                     alpha,
                     (*lhs.get())(), lOffset, lStrides[1],
                     (*rhs.get())(), rOffset, incr,
                     beta,
                     (*out.get())(), oOffset, 1,
                     1, &getQueue()(), 0, nullptr, &event())
            );
        } else {
            gpu_blas_gemm_func<T> gemm;
            OPENCL_BLAS_CHECK(
                gemm(lOpts, rOpts, M, N, K,
                     alpha,
                     (*lhs.get())(), lOffset, lStrides[1],
                     (*rhs.get())(), rOffset, rStrides[1],
                     beta,
                     (*out.get())(), oOffset, out.dims()[0],
                     1, &getQueue()(), 0, nullptr, &event())
            );
        }
    }
    return out;
}
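// A worked example of the broadcast bookkeeping above (shapes are
// illustrative, not taken from this file): with lhs of shape [M K 5 1]
// and rhs of shape [K N 1 1], oDims is [M N 5 1] and batchSize = 5.
// Only is_l_d2_batched is true, so as n runs 0..4 the lhs offset
// advances by lStrides[2] each iteration while the rhs offset stays
// fixed: the single rhs matrix is multiplied against every lhs slice.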