template<typename T>
Array<T> generalSolve(const Array<T> &a, const Array<T> &b)
{
    dim4 iDims = a.dims();
    int M  = iDims[0];
    int N  = iDims[1];
    int MN = std::min(M, N);
    std::vector<int> ipiv(MN);

    // Work on copies so the inputs are left untouched
    Array<T> A = copyArray<T>(a);
    Array<T> B = copyArray<T>(b);

    cl::Buffer *A_buf = A.get();
    int info = 0;

    // LU-factorize A in place (getrf), then solve A*X = B with the factors (getrs)
    magma_getrf_gpu<T>(M, N, (*A_buf)(), A.getOffset(), A.strides()[1],
                       &ipiv[0], getQueue()(), &info);

    cl::Buffer *B_buf = B.get();
    int K = B.dims()[1];

    magma_getrs_gpu<T>(MagmaNoTrans, M, K,
                       (*A_buf)(), A.getOffset(), A.strides()[1], &ipiv[0],
                       (*B_buf)(), B.getOffset(), B.strides()[1],
                       getQueue()(), &info);

    return B;
}
template<typename T>
Array<T> solveLU(const Array<T> &A, const Array<int> &pivot,
                 const Array<T> &b, const af_mat_prop options)
{
    if (OpenCLCPUOffload()) {
        return cpu::solveLU(A, pivot, b, options);
    }

    int N    = A.dims()[0];
    int NRHS = b.dims()[1];

    std::vector<int> ipiv(N);
    copyData(&ipiv[0], pivot);

    Array<T> B = copyArray<T>(b);

    const cl::Buffer *A_buf = A.get();
    cl::Buffer *B_buf       = B.get();

    int info = 0;
    magma_getrs_gpu<T>(MagmaNoTrans, N, NRHS,
                       (*A_buf)(), A.getOffset(), A.strides()[1], &ipiv[0],
                       (*B_buf)(), B.getOffset(), B.strides()[1],
                       getQueue()(), &info);
    return B;
}
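// A minimal usage sketch through the public ArrayFire API, on the assumption
// that af::lu / af::solveLU are the entry points that reach the backend
// solveLU above (illustrative only, not part of this translation unit):
#include <arrayfire.h>

void solveLU_usage_example()
{
    af::array A = af::randu(4, 4);
    af::array b = af::randu(4, 1);
    af::array lower_upper, pivot;
    af::lu(lower_upper, pivot, A);                    // factorize A once
    af::array x = af::solveLU(lower_upper, pivot, b); // reuse the factors per RHS
}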
template<typename T>
Array<T>::Array(const Array<T> &parent, const dim4 &dims,
                const dim4 &offsets, const dim4 &strides)
    : ArrayInfo(parent.getDevId(), dims, offsets, strides,
                (af_dtype)dtype_traits<T>::af_type)
    , data(parent.getData())
    , data_dims(parent.getDataDims())
    , node()
    , ready(true)
    , offset(parent.getOffset() + calcOffset(parent.strides(), offsets))
    , owner(false)
{
}
template<typename T>
Array<T> createSubArray(const Array<T> &parent,
                        const std::vector<af_seq> &index, bool copy)
{
    parent.eval();

    dim4 dDims = parent.getDataDims();
    dim4 pDims = parent.dims();

    dim4 dims    = toDims  (index, pDims);
    dim4 strides = toStride(index, dDims);

    // Find the per-dimension offsets after indexing; the sub-array constructor
    // above takes the dim4 directly and folds it into an absolute offset via
    // calcOffset(), so no manual accumulation is needed here
    dim4 offsets = toOffset(index, pDims);

    Array<T> out = Array<T>(parent, dims, offsets, strides);

    if (!copy) return out;

    // Views with non-unit or negative strides cannot be handed to routines
    // expecting contiguous columns; materialize a contiguous copy instead
    if (strides[0] != 1 || strides[1] < 0 || strides[2] < 0 || strides[3] < 0) {
        out = copyArray(out);
    }

    return out;
}
template<typename T>
unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
               const Array<T> &in, const unsigned radius, const float diff_thr,
               const float geom_thr, const float feature_ratio, const unsigned edge)
{
    dim4 idims = in.dims();
    const unsigned corner_lim = in.elements() * feature_ratio;

    cl::Buffer *x_corners    = bufferAlloc(corner_lim * sizeof(float));
    cl::Buffer *y_corners    = bufferAlloc(corner_lim * sizeof(float));
    cl::Buffer *resp_corners = bufferAlloc(corner_lim * sizeof(float));
    cl::Buffer *resp         = bufferAlloc(in.elements() * sizeof(float));

    // The kernel is templated on the radius, so each supported value (1-9)
    // dispatches to its own instantiation
    switch (radius) {
        case 1: kernel::susan<T, 1>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
        case 2: kernel::susan<T, 2>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
        case 3: kernel::susan<T, 3>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
        case 4: kernel::susan<T, 4>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
        case 5: kernel::susan<T, 5>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
        case 6: kernel::susan<T, 6>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
        case 7: kernel::susan<T, 7>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
        case 8: kernel::susan<T, 8>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
        case 9: kernel::susan<T, 9>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
    }

    unsigned corners_found = kernel::nonMaximal<T>(x_corners, y_corners, resp_corners,
                                                   idims[0], idims[1], resp,
                                                   edge, corner_lim);
    bufferFree(resp);

    const unsigned corners_out = std::min(corners_found, corner_lim);
    if (corners_out == 0) {
        bufferFree(x_corners);
        bufferFree(y_corners);
        bufferFree(resp_corners);
        x_out    = createEmptyArray<float>(dim4());
        y_out    = createEmptyArray<float>(dim4());
        resp_out = createEmptyArray<float>(dim4());
        return 0;
    } else {
        // Hand the corner buffers off to the output arrays without copying
        x_out    = createDeviceDataArray<float>(dim4(corners_out), (void *)((*x_corners)()));
        y_out    = createDeviceDataArray<float>(dim4(corners_out), (void *)((*y_corners)()));
        resp_out = createDeviceDataArray<float>(dim4(corners_out), (void *)((*resp_corners)()));
        return corners_out;
    }
}
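// A hedged sketch of reaching susan from user code via the public af::susan
// wrapper, which returns the corner buffers created above as an af::features
// object (assumed dispatch; the image path is hypothetical):
#include <arrayfire.h>
#include <cstdio>

void susan_usage_example()
{
    af::array img = af::loadImage("corners.png", false); // grayscale input
    af::features feat = af::susan(img, 3, 32.0f, 10.0f, 0.05f, 3);
    af::array xs     = feat.getX();     // x_out
    af::array ys     = feat.getY();     // y_out
    af::array scores = feat.getScore(); // resp_out
    std::printf("found %u corners\n", (unsigned)feat.getNumFeatures());
}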
// CPU backend: "device" memory is host memory, so a plain memcpy suffices
template<typename T>
void writeDeviceDataArray(Array<T> &arr, const void *const data, const size_t bytes)
{
    if (!arr.isOwner()) {
        arr = createEmptyArray<T>(arr.dims());
    }
    memcpy(arr.get() + arr.getOffset(), (const T *const)data, bytes);
}
template<typename T>
Array<T> triangleSolve(const Array<T> &A, const Array<T> &b, const af_mat_prop options)
{
    trsm_func<T> gpu_trsm;

    Array<T> B = copyArray<T>(b);

    int N    = B.dims()[0];
    int NRHS = B.dims()[1];

    const cl::Buffer *A_buf = A.get();
    cl::Buffer *B_buf       = B.get();

    cl_event event         = 0;
    cl_command_queue queue = getQueue()();

    std::string pName = getPlatformName(getDevice());
    if (pName.find("NVIDIA") != std::string::npos && (options & AF_MAT_UPPER)) {
        // On the NVIDIA platform, solve the upper-triangular case through the
        // conjugate transpose of the lower triangle instead of clblasUpper
        Array<T> AT = transpose<T>(A, true);
        cl::Buffer *AT_buf = AT.get();
        gpu_trsm(clblasColumnMajor, clblasLeft, clblasLower, clblasConjTrans,
                 options & AF_MAT_DIAG_UNIT ? clblasUnit : clblasNonUnit,
                 N, NRHS, scalar<T>(1),
                 (*AT_buf)(), AT.getOffset(), AT.strides()[1],
                 (*B_buf)(), B.getOffset(), B.strides()[1],
                 1, &queue, 0, nullptr, &event);
    } else {
        gpu_trsm(clblasColumnMajor, clblasLeft,
                 options & AF_MAT_LOWER ? clblasLower : clblasUpper,
                 clblasNoTrans,
                 options & AF_MAT_DIAG_UNIT ? clblasUnit : clblasNonUnit,
                 N, NRHS, scalar<T>(1),
                 (*A_buf)(), A.getOffset(), A.strides()[1],
                 (*B_buf)(), B.getOffset(), B.strides()[1],
                 1, &queue, 0, nullptr, &event);
    }
    return B;
}
template<typename T>
Array<T> triangleSolve(const Array<T> &A, const Array<T> &b, const af_mat_prop options)
{
    gpu_blas_trsm_func<T> gpu_blas_trsm;

    Array<T> B = copyArray<T>(b);

    int N    = B.dims()[0];
    int NRHS = B.dims()[1];

    const cl::Buffer *A_buf = A.get();
    cl::Buffer *B_buf       = B.get();

    cl_event event         = 0;
    cl_command_queue queue = getQueue()();

    if (getActivePlatform() == AFCL_PLATFORM_NVIDIA && (options & AF_MAT_UPPER)) {
        Array<T> AT = transpose<T>(A, true);
        cl::Buffer *AT_buf = AT.get();
        CLBLAS_CHECK(gpu_blas_trsm(clblasLeft, clblasLower, clblasConjTrans,
                                   options & AF_MAT_DIAG_UNIT ? clblasUnit : clblasNonUnit,
                                   N, NRHS, scalar<T>(1),
                                   (*AT_buf)(), AT.getOffset(), AT.strides()[1],
                                   (*B_buf)(), B.getOffset(), B.strides()[1],
                                   1, &queue, 0, nullptr, &event));
    } else {
        CLBLAS_CHECK(gpu_blas_trsm(clblasLeft,
                                   options & AF_MAT_LOWER ? clblasLower : clblasUpper,
                                   clblasNoTrans,
                                   options & AF_MAT_DIAG_UNIT ? clblasUnit : clblasNonUnit,
                                   N, NRHS, scalar<T>(1),
                                   (*A_buf)(), A.getOffset(), A.strides()[1],
                                   (*B_buf)(), B.getOffset(), B.strides()[1],
                                   1, &queue, 0, nullptr, &event));
    }
    return B;
}
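// Sketch of the public route into triangleSolve: passing AF_MAT_UPPER or
// AF_MAT_LOWER to af::solve requests a triangular solve (assumed to dispatch
// to the backend function above):
#include <arrayfire.h>

void triangle_solve_usage_example()
{
    af::array A = af::upper(af::randu(4, 4)); // upper-triangular system
    af::array b = af::randu(4, 1);
    af::array x = af::solve(A, b, AF_MAT_UPPER);
}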
template<typename T>
Array<T> dot(const Array<T> &lhs, const Array<T> &rhs,
             af_blas_transpose optLhs, af_blas_transpose optRhs)
{
    initBlas();

    int N = lhs.dims()[0];

    dot_func<T> dot;
    cl::Event event;
    auto out = createEmptyArray<T>(af::dim4(1));
    // clBLAS dot requires a scratch buffer of at least N elements
    cl::Buffer scratch(getContext(), CL_MEM_READ_WRITE, sizeof(T) * N);

    clblasStatus err;
    err = dot(N, (*out.get())(), out.getOffset(),
              (*lhs.get())(), lhs.getOffset(), lhs.strides()[0],
              (*rhs.get())(), rhs.getOffset(), rhs.strides()[0],
              scratch(), 1, &getQueue()(), 0, nullptr, &event());

    if (err) {
        throw runtime_error(std::string("CLBLAS error: ") + std::to_string(err));
    }

    return out;
}
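// Usage sketch via the public af::dot, which is assumed to call the backend
// dot above and returns the inner product as a 1x1 array:
#include <arrayfire.h>

void dot_usage_example()
{
    af::array u = af::randu(1000);
    af::array v = af::randu(1000);
    af::array d = af::dot(u, v); // 1x1 array holding <u, v>
}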
template<typename T>
void writeHostDataArray(Array<T> &arr, const T *const data, const size_t bytes)
{
    if (!arr.isOwner()) {
        arr = createEmptyArray<T>(arr.dims());
    }
    // enqueueWriteBuffer takes a byte offset, while getOffset() is in elements
    getQueue().enqueueWriteBuffer(*arr.get(), CL_TRUE,
                                  arr.getOffset() * sizeof(T), bytes, data);
}
template<typename T>
Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
                af_blas_transpose optLhs, af_blas_transpose optRhs)
{
    initBlas();

    clblasTranspose lOpts = toClblasTranspose(optLhs);
    clblasTranspose rOpts = toClblasTranspose(optRhs);

    int aRowDim = (lOpts == clblasNoTrans) ? 0 : 1;
    int aColDim = (lOpts == clblasNoTrans) ? 1 : 0;
    int bColDim = (rOpts == clblasNoTrans) ? 1 : 0;

    dim4 lDims = lhs.dims();
    dim4 rDims = rhs.dims();
    int M = lDims[aRowDim];
    int N = rDims[bColDim];
    int K = lDims[aColDim];

    // FIXME: Leaks on errors.
    Array<T> out = createEmptyArray<T>(af::dim4(M, N, 1, 1));
    auto alpha = scalar<T>(1);
    auto beta  = scalar<T>(0);

    dim4 lStrides = lhs.strides();
    dim4 rStrides = rhs.strides();
    clblasStatus err;
    cl::Event event;

    if (rDims[bColDim] == 1) {
        // Matrix-vector product
        N = lDims[aColDim];
        gemv_func<T> gemv;
        err = gemv(clblasColumnMajor, lOpts, lDims[0], lDims[1],
                   alpha,
                   (*lhs.get())(), lhs.getOffset(), lStrides[1],
                   (*rhs.get())(), rhs.getOffset(), rStrides[0],
                   beta,
                   (*out.get())(), out.getOffset(), 1,
                   1, &getQueue()(), 0, nullptr, &event());
    } else {
        gemm_func<T> gemm;
        err = gemm(clblasColumnMajor, lOpts, rOpts, M, N, K,
                   alpha,
                   (*lhs.get())(), lhs.getOffset(), lStrides[1],
                   (*rhs.get())(), rhs.getOffset(), rStrides[1],
                   beta,
                   (*out.get())(), out.getOffset(), out.dims()[0],
                   1, &getQueue()(), 0, nullptr, &event());
    }

    if (err) {
        throw runtime_error(std::string("CLBLAS error: ") + std::to_string(err));
    }

    return out;
}
template<typename T>
void writeDeviceDataArray(Array<T> &arr, const void *const data, const size_t bytes)
{
    if (!arr.isOwner()) {
        arr = createEmptyArray<T>(arr.dims());
    }
    T *ptr = arr.get();
    CUDA_CHECK(cudaMemcpyAsync(ptr + arr.getOffset(), data, bytes,
                               cudaMemcpyDeviceToDevice,
                               cuda::getStream(cuda::getActiveDeviceId())));
}
template<typename T>
void writeHostDataArray(Array<T> &arr, const T *const data, const size_t bytes)
{
    if (!arr.isOwner()) {
        arr = createEmptyArray<T>(arr.dims());
    }
    T *ptr = arr.get();
    CUDA_CHECK(cudaMemcpy(ptr + arr.getOffset(), data, bytes,
                          cudaMemcpyHostToDevice));
}
template<typename T>
int cholesky_inplace(Array<T> &in, const bool is_upper)
{
    if (OpenCLCPUOffload()) {
        return cpu::cholesky_inplace(in, is_upper);
    }

    dim4 iDims = in.dims();
    int N = iDims[0];

    magma_uplo_t uplo = is_upper ? MagmaUpper : MagmaLower;

    int info = 0;
    cl::Buffer *in_buf = in.get();
    magma_potrf_gpu<T>(uplo, N, (*in_buf)(), in.getOffset(), in.strides()[1],
                       getQueue()(), &info);
    return info;
}
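// Usage sketch, assuming af::choleskyInPlace is the public entry point for
// cholesky_inplace above; info follows the LAPACK potrf convention (0 on
// success, otherwise the index of the first non-positive-definite pivot):
#include <arrayfire.h>

void cholesky_usage_example()
{
    af::array A = af::randu(4, 4);
    A = af::matmulNT(A, A) + 4 * af::identity(4, 4); // make A positive definite
    int info = af::choleskyInPlace(A, true);         // upper factor overwrites A
}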
template<typename T>
void writeDeviceDataArray(Array<T> &arr, const void *const data, const size_t bytes)
{
    if (!arr.isOwner()) {
        arr = createEmptyArray<T>(arr.dims());
    }
    cl::Buffer &buf = *arr.get();

    // Retain the incoming cl_mem: the cl::Buffer wrapper assumes ownership of
    // one reference and releases it when data_buf goes out of scope
    clRetainMemObject((cl_mem)(data));
    cl::Buffer data_buf = cl::Buffer((cl_mem)(data));

    // enqueueCopyBuffer offsets are in bytes; getOffset() is in elements
    getQueue().enqueueCopyBuffer(data_buf, buf, 0,
                                 arr.getOffset() * sizeof(T), bytes);
}
template<typename T>
int cholesky_inplace(Array<T> &in, const bool is_upper)
{
    try {
        initBlas();

        dim4 iDims = in.dims();
        int N = iDims[0];

        magma_uplo_t uplo = is_upper ? MagmaUpper : MagmaLower;

        int info = 0;
        cl::Buffer *in_buf = in.get();
        magma_potrf_gpu<T>(uplo, N, (*in_buf)(), in.getOffset(), in.strides()[1],
                           getQueue()(), &info);
        return info;
    } catch (cl::Error &err) {
        CL_TO_AF_ERROR(err);
    }
}
template<typename T>
Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
                af_mat_prop optLhs, af_mat_prop optRhs)
{
#if defined(WITH_LINEAR_ALGEBRA)
    if (OpenCLCPUOffload(false)) {  // Do not force offload gemm on OSX Intel devices
        return cpu::matmul(lhs, rhs, optLhs, optRhs);
    }
#endif
    const auto lOpts = toBlasTranspose(optLhs);
    const auto rOpts = toBlasTranspose(optRhs);

    const auto aRowDim = (lOpts == OPENCL_BLAS_NO_TRANS) ? 0 : 1;
    const auto aColDim = (lOpts == OPENCL_BLAS_NO_TRANS) ? 1 : 0;
    const auto bColDim = (rOpts == OPENCL_BLAS_NO_TRANS) ? 1 : 0;

    const dim4 lDims = lhs.dims();
    const dim4 rDims = rhs.dims();
    const int M = lDims[aRowDim];
    const int N = rDims[bColDim];
    const int K = lDims[aColDim];

    // Broadcast batching over dims 2 and 3: the output batch dims are the
    // maxima of the input batch dims
    dim_t d2 = std::max(lDims[2], rDims[2]);
    dim_t d3 = std::max(lDims[3], rDims[3]);
    dim4 oDims = af::dim4(M, N, d2, d3);
    Array<T> out = createEmptyArray<T>(oDims);

    const auto alpha = scalar<T>(1);
    const auto beta  = scalar<T>(0);

    const dim4 lStrides = lhs.strides();
    const dim4 rStrides = rhs.strides();
    const dim4 oStrides = out.strides();
    int batchSize = oDims[2] * oDims[3];

    // A side with a singleton batch dim reuses its single slice: its stride
    // contribution is masked to zero below
    bool is_l_d2_batched = oDims[2] == lDims[2];
    bool is_l_d3_batched = oDims[3] == lDims[3];
    bool is_r_d2_batched = oDims[2] == rDims[2];
    bool is_r_d3_batched = oDims[3] == rDims[3];

    for (int n = 0; n < batchSize; n++) {
        // Decompose the flat batch index against the *output* batch dims;
        // decomposing against rDims[2] would misaddress lhs and out whenever
        // rhs carries fewer batches than the output
        int w = n / oDims[2];
        int z = n - w * oDims[2];

        int loff = z * (is_l_d2_batched * lStrides[2]) + w * (is_l_d3_batched * lStrides[3]);
        int roff = z * (is_r_d2_batched * rStrides[2]) + w * (is_r_d3_batched * rStrides[3]);

        dim_t lOffset = lhs.getOffset() + loff;
        dim_t rOffset = rhs.getOffset() + roff;
        dim_t oOffset = out.getOffset() + z * oStrides[2] + w * oStrides[3];

        cl::Event event;
        if (rDims[bColDim] == 1) {
            // Matrix-vector product
            dim_t incr = (optRhs == AF_MAT_NONE) ? rStrides[0] : rStrides[1];
            gpu_blas_gemv_func<T> gemv;
            OPENCL_BLAS_CHECK(gemv(lOpts, lDims[0], lDims[1],
                                   alpha,
                                   (*lhs.get())(), lOffset, lStrides[1],
                                   (*rhs.get())(), rOffset, incr,
                                   beta,
                                   (*out.get())(), oOffset, 1,
                                   1, &getQueue()(), 0, nullptr, &event()));
        } else {
            gpu_blas_gemm_func<T> gemm;
            OPENCL_BLAS_CHECK(gemm(lOpts, rOpts, M, N, K,
                                   alpha,
                                   (*lhs.get())(), lOffset, lStrides[1],
                                   (*rhs.get())(), rOffset, rStrides[1],
                                   beta,
                                   (*out.get())(), oOffset, out.dims()[0],
                                   1, &getQueue()(), 0, nullptr, &event()));
        }
    }
    return out;
}
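// A sketch of the broadcast batching implemented by the masked strides above,
// through the public af::matmul (assumed behavior; dims are illustrative):
#include <arrayfire.h>

void batched_matmul_example()
{
    af::array A = af::randu(32, 16, 8); // eight independent 32x16 matrices
    af::array B = af::randu(16, 4);     // a single 16x4 matrix, reused per batch
    af::array C = af::matmul(A, B);     // result is 32x4x8
}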
template<typename T>
Array<T> leastSquares(const Array<T> &a, const Array<T> &b)
{
    int M  = a.dims()[0];
    int N  = a.dims()[1];
    int K  = b.dims()[1];
    int MN = std::min(M, N);

    Array<T> B = createEmptyArray<T>(dim4());
    gpu_blas_trsm_func<T> gpu_blas_trsm;

    cl_event event;
    cl_command_queue queue = getQueue()();

    if (M < N) {
#define UNMQR 0 // FIXME: UNMQR == 1 should be faster but does not work

        // Least squares for this case is solved using the following
        // solve(A, B) == matmul(Q, Xpad);
        // Where:
        // Xpad == pad(Xt, N - M, 1);
        // Xt   == tri_solve(R1, B);
        // R1   == R(seq(M), seq(M));
        // transpose(A) == matmul(Q, R);
        // QR is performed on the transpose of A
        Array<T> A = transpose<T>(a, true);

#if UNMQR
        B = padArray<T, T>(b, dim4(N, K), scalar<T>(0));
        B.resetDims(dim4(M, K));
#else
        B = copyArray<T>(b);
#endif

        int NB  = magma_get_geqrf_nb<T>(A.dims()[1]);
        int NUM = (2 * MN + ((M + 31) / 32) * 32) * NB;
        Array<T> tmp = createEmptyArray<T>(dim4(NUM));

        std::vector<T> h_tau(MN);

        int info = 0;
        cl::Buffer *dA = A.get();
        cl::Buffer *dT = tmp.get();
        cl::Buffer *dB = B.get();

        magma_geqrf3_gpu<T>(A.dims()[0], A.dims()[1],
                            (*dA)(), A.getOffset(), A.strides()[1],
                            &h_tau[0], (*dT)(), tmp.getOffset(),
                            getQueue()(), &info);

        A.resetDims(dim4(M, M));

        magmablas_swapdblk<T>(MN - 1, NB,
                              (*dA)(), A.getOffset(), A.strides()[1], 1,
                              (*dT)(), tmp.getOffset() + MN * NB, NB, 0, queue);

        CLBLAS_CHECK(gpu_blas_trsm(clblasLeft, clblasUpper, clblasConjTrans, clblasNonUnit,
                                   B.dims()[0], B.dims()[1], scalar<T>(1),
                                   (*dA)(), A.getOffset(), A.strides()[1],
                                   (*dB)(), B.getOffset(), B.strides()[1],
                                   1, &queue, 0, nullptr, &event));

        magmablas_swapdblk<T>(MN - 1, NB,
                              (*dT)(), tmp.getOffset() + MN * NB, NB, 0,
                              (*dA)(), A.getOffset(), A.strides()[1], 1, queue);

#if UNMQR
        int lwork = (B.dims()[0] - A.dims()[0] + NB) * (B.dims()[1] + 2 * NB);
        std::vector<T> h_work(lwork);
        B.resetDims(dim4(N, K));
        magma_unmqr_gpu<T>(MagmaLeft, MagmaNoTrans,
                           B.dims()[0], B.dims()[1], A.dims()[0],
                           (*dA)(), A.getOffset(), A.strides()[1],
                           &h_tau[0],
                           (*dB)(), B.getOffset(), B.strides()[1],
                           &h_work[0], lwork,
                           (*dT)(), tmp.getOffset(), NB, queue, &info);
#else
        A.resetDims(dim4(N, M));
        magma_ungqr_gpu<T>(A.dims()[0], A.dims()[1], std::min(M, N),
                           (*dA)(), A.getOffset(), A.strides()[1],
                           &h_tau[0],
                           (*dT)(), tmp.getOffset(), NB, queue, &info);

        B = matmul(A, B, AF_MAT_NONE, AF_MAT_NONE);
#endif
    } else if (M > N) {
        // Least squares for this case is solved using the following
        // solve(A, B) == tri_solve(R1, Bt);
        // Where:
        // R1 == R(seq(N), seq(N));
        // Bt == matmul(transpose(Q1), B);
        // Q1 == Q(span, seq(N));
        // A  == matmul(Q, R);
        Array<T> A = copyArray<T>(a);
        B = copyArray(b);

        int MN  = std::min(M, N);
        int NB  = magma_get_geqrf_nb<T>(M);
        int NUM = (2 * MN + ((N + 31) / 32) * 32) * NB;
        Array<T> tmp = createEmptyArray<T>(dim4(NUM));

        std::vector<T> h_tau(NUM);

        int info = 0;
        cl::Buffer *A_buf = A.get();
        cl::Buffer *B_buf = B.get();
        cl::Buffer *dT    = tmp.get();

        magma_geqrf3_gpu<T>(M, N,
                            (*A_buf)(), A.getOffset(), A.strides()[1],
                            &h_tau[0], (*dT)(), tmp.getOffset(),
                            getQueue()(), &info);

        int NRHS   = B.dims()[1];
        int lhwork = (M - N + NB) * (NRHS + NB) + NRHS * NB;

        std::vector<T> h_work(lhwork);
        h_work[0] = scalar<T>(lhwork);

        magma_unmqr_gpu<T>(MagmaLeft, MagmaConjTrans, M, NRHS, N,
                           (*A_buf)(), A.getOffset(), A.strides()[1],
                           &h_tau[0],
                           (*B_buf)(), B.getOffset(), B.strides()[1],
                           &h_work[0], lhwork,
                           (*dT)(), tmp.getOffset(), NB, queue, &info);

        magmablas_swapdblk<T>(MN - 1, NB,
                              (*A_buf)(), A.getOffset(), A.strides()[1], 1,
                              (*dT)(), tmp.getOffset() + NB * MN, NB, 0, queue);

        if (getActivePlatform() == AFCL_PLATFORM_NVIDIA) {
            // Same NVIDIA upper-triangular workaround as in triangleSolve
            Array<T> AT = transpose<T>(A, true);
            cl::Buffer *AT_buf = AT.get();
            CLBLAS_CHECK(gpu_blas_trsm(clblasLeft, clblasLower, clblasConjTrans, clblasNonUnit,
                                       N, NRHS, scalar<T>(1),
                                       (*AT_buf)(), AT.getOffset(), AT.strides()[1],
                                       (*B_buf)(), B.getOffset(), B.strides()[1],
                                       1, &queue, 0, nullptr, &event));
        } else {
            CLBLAS_CHECK(gpu_blas_trsm(clblasLeft, clblasUpper, clblasNoTrans, clblasNonUnit,
                                       N, NRHS, scalar<T>(1),
                                       (*A_buf)(), A.getOffset(), A.strides()[1],
                                       (*B_buf)(), B.getOffset(), B.strides()[1],
                                       1, &queue, 0, nullptr, &event));
        }
        B.resetDims(dim4(N, K));
    }

    return B;
}
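// Usage sketch: a non-square coefficient matrix passed to af::solve is
// expected to be routed to leastSquares above, yielding the QR-based
// least-squares (M > N) or minimum-norm (M < N) solution:
#include <arrayfire.h>

void least_squares_usage_example()
{
    af::array A = af::randu(8, 3); // overdetermined: M > N
    af::array b = af::randu(8, 1);
    af::array x = af::solve(A, b); // 3x1 least-squares solution
}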