Example #1
0
// Solves a*x = b for a general (square) matrix using LU factorization:
// getrf factors in place, then getrs back-substitutes into a copy of b.
// Returns the solution; the inputs are left untouched.
Array<T> generalSolve(const Array<T> &a, const Array<T> &b)
{
    const dim4 aDims = a.dims();
    const int rows = aDims[0];
    const int cols = aDims[1];

    // One pivot entry per factored row/column.
    std::vector<int> pivots(std::min(rows, cols));

    // Work on copies: both the factorization and the solve are in-place.
    Array<T> lu  = copyArray<T>(a);
    Array<T> sol = copyArray<T>(b);

    int info = 0;
    cl::Buffer *lu_buf = lu.get();
    magma_getrf_gpu<T>(rows, cols, (*lu_buf)(), lu.getOffset(), lu.strides()[1],
                       pivots.data(), getQueue()(), &info);

    cl::Buffer *sol_buf = sol.get();
    const int nrhs = sol.dims()[1];
    magma_getrs_gpu<T>(MagmaNoTrans, rows, nrhs,
                       (*lu_buf)(), lu.getOffset(), lu.strides()[1],
                       pivots.data(),
                       (*sol_buf)(), sol.getOffset(), sol.strides()[1],
                       getQueue()(), &info);
    return sol;
}
Example #2
0
// Back-substitution using a previously computed LU factorization (A, pivot).
// Returns the solution of A*x = b; `options` is forwarded to the CPU path.
Array<T> solveLU(const Array<T> &A, const Array<int> &pivot,
                 const Array<T> &b, const af_mat_prop options)
{
    // Offload to the CPU implementation when running on an OpenCL CPU device.
    if (OpenCLCPUOffload()) return cpu::solveLU(A, pivot, b, options);

    const int n    = A.dims()[0];
    const int nrhs = b.dims()[1];

    // magma expects the pivot indices on the host.
    std::vector<int> hPivot(n);
    copyData(hPivot.data(), pivot);

    // getrs overwrites the right-hand side with the solution, so copy b.
    Array<T> X = copyArray<T>(b);

    const cl::Buffer *aBuf = A.get();
    cl::Buffer *xBuf = X.get();

    int info = 0;
    magma_getrs_gpu<T>(MagmaNoTrans, n, nrhs,
                       (*aBuf)(), A.getOffset(), A.strides()[1],
                       hPivot.data(),
                       (*xBuf)(), X.getOffset(), X.strides()[1],
                       getQueue()(), &info);
    return X;
}
Example #3
0
// Sub-array view constructor: creates a non-owning Array that aliases
// `parent`'s buffer with new dims/offsets/strides. No data is copied.
Array<T>::Array(const Array<T>& parent, const dim4 &dims, const dim4 &offsets, const dim4 &strides) :
    ArrayInfo(parent.getDevId(), dims, offsets, strides, (af_dtype)dtype_traits<T>::af_type),
    data(parent.getData()), data_dims(parent.getDataDims()),  // share the parent's buffer
    node(), ready(true),
    // Linear offset of this view = parent's offset plus the offset of the
    // requested corner, measured in the parent's strides (element units).
    offset(parent.getOffset() + calcOffset(parent.strides(), offsets)),
    owner(false)  // parent (or the original owner) retains ownership
{ }
Example #4
0
    // Builds a view of `parent` selected by the af_seq indices. When `copy`
    // is requested and the view's layout cannot be consumed in place (non-unit
    // dim0 stride or any negative stride), the view is materialized instead.
    Array<T> createSubArray(const Array<T>& parent,
                            const std::vector<af_seq> &index,
                            bool copy)
    {
        parent.eval();

        const dim4 parentDataDims = parent.getDataDims();
        const dim4 parentDims     = parent.dims();

        const dim4 outDims    = toDims(index, parentDims);
        // Strides come from the allocated data dims, not the logical dims.
        const dim4 outStrides = toStride(index, parentDataDims);

        // Accumulate the element offset of the indexed corner on top of the
        // parent's own offset.
        const dim4 seqOffsets = toOffset(index, parentDims);
        const dim4 pStrides   = parent.strides();
        dim_t offset = parent.getOffset();
        offset += seqOffsets[0] * pStrides[0]
                + seqOffsets[1] * pStrides[1]
                + seqOffsets[2] * pStrides[2]
                + seqOffsets[3] * pStrides[3];

        Array<T> out = Array<T>(parent, outDims, offset, outStrides);

        if (!copy) return out;

        const bool needsCopy = outStrides[0] != 1 ||
                               outStrides[1] <  0 ||
                               outStrides[2] <  0 ||
                               outStrides[3] <  0;
        if (needsCopy) out = copyArray(out);

        return out;
    }
Example #5
0
// SUSAN corner detector: computes a per-pixel response image, runs
// non-maximal suppression, and returns up to `corner_lim` corners via the
// x/y/response output arrays. Returns the number of corners reported.
unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
               const Array<T> &in,
               const unsigned radius, const float diff_thr, const float geom_thr,
               const float feature_ratio, const unsigned edge)
{
    dim4 idims = in.dims();

    // Cap on reported features, proportional to the image size.
    const unsigned corner_lim = in.elements() * feature_ratio;
    cl::Buffer* x_corners     = bufferAlloc(corner_lim * sizeof(float));
    cl::Buffer* y_corners     = bufferAlloc(corner_lim * sizeof(float));
    cl::Buffer* resp_corners  = bufferAlloc(corner_lim * sizeof(float));

    // Response image consumed by the non-maximal suppression pass below.
    cl::Buffer* resp = bufferAlloc(in.elements()*sizeof(float));

    // The mask radius is a compile-time template parameter of the kernel, so
    // dispatch over the supported range 1..9.
    // NOTE(review): a radius outside 1..9 leaves `resp` unwritten — confirm
    // callers validate the radius before reaching this point.
    switch(radius) {
    case 1: kernel::susan<T, 1>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
    case 2: kernel::susan<T, 2>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
    case 3: kernel::susan<T, 3>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
    case 4: kernel::susan<T, 4>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
    case 5: kernel::susan<T, 5>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
    case 6: kernel::susan<T, 6>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
    case 7: kernel::susan<T, 7>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
    case 8: kernel::susan<T, 8>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
    case 9: kernel::susan<T, 9>(resp, in.get(), in.getOffset(), idims[0], idims[1], diff_thr, geom_thr, edge); break;
    }

    unsigned corners_found = kernel::nonMaximal<T>(x_corners, y_corners, resp_corners,
                                                   idims[0], idims[1], resp, edge, corner_lim);
    bufferFree(resp);  // response image no longer needed

    // The kernel may report more candidates than the buffers can hold.
    const unsigned corners_out = std::min(corners_found, corner_lim);
    if (corners_out == 0) {
        // No corners: release scratch buffers and hand back empty arrays.
        bufferFree(x_corners);
        bufferFree(y_corners);
        bufferFree(resp_corners);
        x_out    = createEmptyArray<float>(dim4());
        y_out    = createEmptyArray<float>(dim4());
        resp_out = createEmptyArray<float>(dim4());
        return 0;
    } else {
        // The underlying cl_mem handles are adopted by the output arrays.
        x_out    = createDeviceDataArray<float>(dim4(corners_out), (void*)((*x_corners)()));
        y_out    = createDeviceDataArray<float>(dim4(corners_out), (void*)((*y_corners)()));
        resp_out = createDeviceDataArray<float>(dim4(corners_out), (void*)((*resp_corners)()));
        return corners_out;
    }
}
Example #6
0
// Copies `bytes` of raw data into the buffer backing `arr`, honoring the
// array's element offset.
void
writeDeviceDataArray(Array<T> &arr, const void * const data, const size_t bytes)
{
    // A non-owning view must not be overwritten in place; replace it with a
    // freshly allocated array of the same shape first.
    if (!arr.isOwner()) arr = createEmptyArray<T>(arr.dims());

    T *dst = arr.get() + arr.getOffset();
    memcpy(dst, static_cast<const T *>(data), bytes);
}
Example #7
0
// Solves the triangular system A*X = B with clBLAS trsm. B is copied first
// because trsm overwrites the right-hand side with the solution.
Array<T> triangleSolve(const Array<T> &A, const Array<T> &b, const af_mat_prop options)
{
    trsm_func<T> gpu_trsm;

    Array<T> B = copyArray<T>(b);

    int N = B.dims()[0];
    int NRHS = B.dims()[1];

    const cl::Buffer* A_buf = A.get();
    cl::Buffer* B_buf = B.get();

    cl_event event = 0;
    cl_command_queue queue = getQueue()();

    // Workaround for the NVIDIA platform: the upper-triangular case is solved
    // as the conjugate-transposed lower-triangular system, which is
    // mathematically equivalent for this trsm configuration.
    std::string pName = getPlatformName(getDevice());
    if(pName.find("NVIDIA") != std::string::npos && (options & AF_MAT_UPPER))
    {
        // Conjugate transpose of A; upper-triangular A becomes lower-triangular.
        Array<T> AT = transpose<T>(A, true);

        cl::Buffer* AT_buf = AT.get();
        gpu_trsm(clblasColumnMajor,
                 clblasLeft,
                 clblasLower,
                 clblasConjTrans,
                 options & AF_MAT_DIAG_UNIT ? clblasUnit : clblasNonUnit,
                 N, NRHS, scalar<T>(1),
                 (*AT_buf)(), AT.getOffset(), AT.strides()[1],
                 (*B_buf)(), B.getOffset(), B.strides()[1],
                 1, &queue, 0, nullptr, &event);
    } else {
        // Direct solve; side/uplo/diag flags come straight from `options`.
        gpu_trsm(clblasColumnMajor,
                 clblasLeft,
                 options & AF_MAT_LOWER ? clblasLower : clblasUpper,
                 clblasNoTrans,
                 options & AF_MAT_DIAG_UNIT ? clblasUnit : clblasNonUnit,
                 N, NRHS, scalar<T>(1),
                 (*A_buf)(), A.getOffset(), A.strides()[1],
                 (*B_buf)(), B.getOffset(), B.strides()[1],
                 1, &queue, 0, nullptr, &event);
    }

    return B;
}
Example #8
0
// Solves the triangular system A*X = B using clBLAS trsm; B is copied since
// trsm overwrites the right-hand side with the solution.
Array<T> triangleSolve(const Array<T> &A, const Array<T> &b, const af_mat_prop options)
{
    gpu_blas_trsm_func<T> gpu_blas_trsm;

    Array<T> X = copyArray<T>(b);

    const int n    = X.dims()[0];
    const int nrhs = X.dims()[1];

    const cl::Buffer *aBuf = A.get();
    cl::Buffer *xBuf = X.get();

    cl_event event = 0;
    cl_command_queue queue = getQueue()();

    const clblasDiag diag =
        (options & AF_MAT_DIAG_UNIT) ? clblasUnit : clblasNonUnit;

    // Workaround for the NVIDIA platform: the upper-triangular case is solved
    // as the conjugate-transposed lower-triangular system, which is
    // equivalent for this trsm configuration.
    const bool nvidiaUpper =
        getActivePlatform() == AFCL_PLATFORM_NVIDIA && (options & AF_MAT_UPPER);

    if (!nvidiaUpper) {
        const clblasUplo uplo =
            (options & AF_MAT_LOWER) ? clblasLower : clblasUpper;
        CLBLAS_CHECK(gpu_blas_trsm(
                         clblasLeft,
                         uplo,
                         clblasNoTrans,
                         diag,
                         n, nrhs, scalar<T>(1),
                         (*aBuf)(), A.getOffset(), A.strides()[1],
                         (*xBuf)(), X.getOffset(), X.strides()[1],
                         1, &queue, 0, nullptr, &event));
    } else {
        Array<T> AT = transpose<T>(A, true);  // conjugate transpose

        cl::Buffer *atBuf = AT.get();
        CLBLAS_CHECK(gpu_blas_trsm(
                         clblasLeft,
                         clblasLower,
                         clblasConjTrans,
                         diag,
                         n, nrhs, scalar<T>(1),
                         (*atBuf)(), AT.getOffset(), AT.strides()[1],
                         (*xBuf)(), X.getOffset(), X.strides()[1],
                         1, &queue, 0, nullptr, &event));
    }

    return X;
}
Example #9
0
// Dot product of two vectors through clBLAS; returns a 1-element array.
// NOTE: the transpose options are part of the signature but not consumed
// here (preserved behavior).
Array<T> dot(const Array<T> &lhs, const Array<T> &rhs,
             af_blas_transpose optLhs, af_blas_transpose optRhs)
{
    initBlas();

    const int nElems = lhs.dims()[0];

    // clBLAS dot requires a scratch buffer of at least N elements.
    cl::Buffer scratch(getContext(), CL_MEM_READ_WRITE, sizeof(T) * nElems);

    auto out = createEmptyArray<T>(af::dim4(1));

    dot_func<T> dot;
    cl::Event event;
    const clblasStatus status =
        dot(nElems,
            (*out.get())(), out.getOffset(),
            (*lhs.get())(), lhs.getOffset(), lhs.strides()[0],
            (*rhs.get())(), rhs.getOffset(), rhs.strides()[0],
            scratch(),
            1, &getQueue()(), 0, nullptr, &event());

    // clblasSuccess is 0; anything else is an error.
    if (status) {
        throw runtime_error(std::string("CLBLAS error: ") + std::to_string(status));
    }
    return out;
}
Example #10
0
    // Writes `bytes` of host memory into the device buffer backing `arr`
    // (blocking). A non-owning view is first replaced by a fresh allocation
    // so the parent array it aliased is not clobbered.
    void
    writeHostDataArray(Array<T> &arr, const T * const data, const size_t bytes)
    {
        if (!arr.isOwner()) {
            arr = createEmptyArray<T>(arr.dims());
        }

        // FIX: getOffset() counts elements, but clEnqueueWriteBuffer takes a
        // byte offset — scale by sizeof(T) (previously the raw element count
        // was passed, writing to the wrong location for offset views).
        getQueue().enqueueWriteBuffer(*arr.get(), CL_TRUE,
                                      arr.getOffset() * sizeof(T),
                                      bytes,
                                      data);

        return;
    }
Example #11
0
// Matrix multiply out = op(lhs) * op(rhs) via clBLAS. Falls back to gemv
// when the (possibly transposed) rhs is a single column.
Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
                af_blas_transpose optLhs, af_blas_transpose optRhs)
{
    initBlas();
    clblasTranspose lOpts = toClblasTranspose(optLhs);
    clblasTranspose rOpts = toClblasTranspose(optRhs);

    // Which logical dims supply M/N/K depends on the transpose flags.
    int aRowDim = (lOpts == clblasNoTrans) ? 0 : 1;
    int aColDim = (lOpts == clblasNoTrans) ? 1 : 0;
    int bColDim = (rOpts == clblasNoTrans) ? 1 : 0;

    dim4 lDims = lhs.dims();
    dim4 rDims = rhs.dims();
    int M = lDims[aRowDim];
    int N = rDims[bColDim];
    int K = lDims[aColDim];

    //FIXME: Leaks on errors.
    Array<T> out = createEmptyArray<T>(af::dim4(M, N, 1, 1));
    auto alpha = scalar<T>(1);
    auto beta  = scalar<T>(0);

    dim4 lStrides = lhs.strides();
    dim4 rStrides = rhs.strides();
    clblasStatus err;
    cl::Event event;
    if(rDims[bColDim] == 1) {
        // rhs is (effectively) a vector: use gemv. N is re-derived here but
        // not used again after the call.
        N = lDims[aColDim];
        gemv_func<T> gemv;
        err = gemv(
            clblasColumnMajor, lOpts,
            lDims[0], lDims[1],
            alpha,
            (*lhs.get())(),    lhs.getOffset(),   lStrides[1],
            (*rhs.get())(),    rhs.getOffset(),   rStrides[0],
            beta ,
            (*out.get())(),   out.getOffset(),             1,
            1, &getQueue()(), 0, nullptr, &event());
    } else {
        // General matrix-matrix product; leading dimensions are the
        // column strides, ldc is the output's row count.
        gemm_func<T> gemm;
        err = gemm(
                clblasColumnMajor, lOpts, rOpts,
                M, N, K,
                alpha,
                (*lhs.get())(),    lhs.getOffset(),   lStrides[1],
                (*rhs.get())(),    rhs.getOffset(),   rStrides[1],
                beta,
                (*out.get())(),   out.getOffset(),  out.dims()[0],
                1, &getQueue()(), 0, nullptr, &event());

    }
    if(err) {
        throw runtime_error(std::string("CLBLAS error: ") + std::to_string(err));
    }

    return out;
}
Example #12
0
    // Device-to-device copy of `bytes` into the buffer backing `arr`,
    // asynchronously on the active device's stream. Non-owning views are
    // replaced by an owning allocation before being written.
    void
    writeDeviceDataArray(Array<T> &arr, const void * const data, const size_t bytes)
    {
        if (!arr.isOwner()) arr = createEmptyArray<T>(arr.dims());

        T *dst = arr.get() + arr.getOffset();
        CUDA_CHECK(cudaMemcpyAsync(dst, data, bytes,
                                   cudaMemcpyDeviceToDevice,
                                   cuda::getStream(cuda::getActiveDeviceId())));
    }
Example #13
0
    // Synchronous host-to-device copy of `bytes` into the buffer backing
    // `arr`. Non-owning views are replaced by an owning allocation first.
    void
    writeHostDataArray(Array<T> &arr, const T * const data, const size_t bytes)
    {
        if (!arr.isOwner()) arr = createEmptyArray<T>(arr.dims());

        T *dst = arr.get() + arr.getOffset();
        CUDA_CHECK(cudaMemcpy(dst, data, bytes, cudaMemcpyHostToDevice));
    }
// In-place Cholesky factorization through magma potrf. Returns the magma
// info code (0 on success, nonzero when the matrix is not positive definite
// or an argument was invalid).
int cholesky_inplace(Array<T> &in, const bool is_upper)
{
    // Delegate to the CPU implementation on OpenCL CPU devices.
    if (OpenCLCPUOffload()) return cpu::cholesky_inplace(in, is_upper);

    const int order = in.dims()[0];
    const magma_uplo_t uplo = is_upper ? MagmaUpper : MagmaLower;

    cl::Buffer *buf = in.get();
    int info = 0;
    magma_potrf_gpu<T>(uplo, order,
                       (*buf)(), in.getOffset(), in.strides()[1],
                       getQueue()(), &info);
    return info;
}
Example #15
0
    // Copies `bytes` from another device buffer (`data` is a cl_mem handle)
    // into the buffer backing `arr`. A non-owning view is first replaced by
    // a fresh allocation so the aliased parent is untouched.
    void
    writeDeviceDataArray(Array<T> &arr, const void * const data, const size_t bytes)
    {
        if (!arr.isOwner()) {
            arr = createEmptyArray<T>(arr.dims());
        }

        cl::Buffer& buf = *arr.get();

        // Retain the incoming cl_mem: the cl::Buffer wrapper takes over one
        // reference and releases it when it goes out of scope.
        clRetainMemObject((cl_mem)(data));
        cl::Buffer data_buf = cl::Buffer((cl_mem)(data));

        // FIX: getOffset() counts elements, but clEnqueueCopyBuffer's dst
        // offset is in bytes — scale by sizeof(T) (previously the raw
        // element count was passed, copying to the wrong location for
        // offset views).
        getQueue().enqueueCopyBuffer(data_buf, buf,
                                     0, (size_t)(arr.getOffset() * sizeof(T)),
                                     bytes);

        return;
    }
Example #16
0
// In-place Cholesky factorization through magma potrf. Returns the magma
// info code (0 on success); OpenCL errors are converted to ArrayFire errors.
int cholesky_inplace(Array<T> &in, const bool is_upper)
{
    try {
        initBlas();

        const int order = in.dims()[0];
        const magma_uplo_t uplo = is_upper ? MagmaUpper : MagmaLower;

        cl::Buffer *buf = in.get();
        int info = 0;
        magma_potrf_gpu<T>(uplo, order,
                           (*buf)(), in.getOffset(), in.strides()[1],
                           getQueue()(), &info);
        return info;
    } catch (cl::Error &err) {
        // Translates and rethrows as an ArrayFire error.
        CL_TO_AF_ERROR(err);
    }
}
Example #17
0
// Batched matrix multiply out = op(lhs) * op(rhs) via the OpenCL BLAS
// wrappers. Dims 2 and 3 are batch dimensions; an input whose batch dim is 1
// is broadcast against the other input. Falls back to gemv when the
// (possibly transposed) rhs is a single column.
Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
                af_mat_prop optLhs, af_mat_prop optRhs)
{
#if defined(WITH_LINEAR_ALGEBRA)
    if(OpenCLCPUOffload(false)) {   // Do not force offload gemm on OSX Intel devices
        return cpu::matmul(lhs, rhs, optLhs, optRhs);
    }
#endif
    const auto lOpts = toBlasTranspose(optLhs);
    const auto rOpts = toBlasTranspose(optRhs);

    // Which logical dims supply M/N/K depends on the transpose flags.
    const auto aRowDim = (lOpts == OPENCL_BLAS_NO_TRANS) ? 0 : 1;
    const auto aColDim = (lOpts == OPENCL_BLAS_NO_TRANS) ? 1 : 0;
    const auto bColDim = (rOpts == OPENCL_BLAS_NO_TRANS) ? 1 : 0;

    const dim4 lDims = lhs.dims();
    const dim4 rDims = rhs.dims();
    const int M = lDims[aRowDim];
    const int N = rDims[bColDim];
    const int K = lDims[aColDim];

    // Output batch dims are the max of the inputs' (broadcast semantics).
    dim_t d2 = std::max(lDims[2], rDims[2]);
    dim_t d3 = std::max(lDims[3], rDims[3]);
    dim4 oDims = af::dim4(M, N, d2, d3);
    Array<T> out = createEmptyArray<T>(oDims);

    const auto alpha = scalar<T>(1);
    const auto beta  = scalar<T>(0);

    const dim4 lStrides = lhs.strides();
    const dim4 rStrides = rhs.strides();
    const dim4 oStrides = out.strides();

    int batchSize = oDims[2] * oDims[3];

    // An input only advances along a batch dim it actually spans; a
    // broadcast input (batch dim == 1) reuses its single slice.
    bool is_l_d2_batched = oDims[2] == lDims[2];
    bool is_l_d3_batched = oDims[3] == lDims[3];
    bool is_r_d2_batched = oDims[2] == rDims[2];
    bool is_r_d3_batched = oDims[3] == rDims[3];

    for (int n = 0; n < batchSize; n++) {
        // FIX: decompose the flat batch index over the OUTPUT's dim 2.
        // Previously this divided by rDims[2], which mis-addresses every
        // slice (and overruns dim 3) whenever rhs is broadcast along dim 2
        // (rDims[2] < oDims[2]).
        int w = n / oDims[2];
        int z = n - w * oDims[2];

        int loff = z * (is_l_d2_batched * lStrides[2]) + w * (is_l_d3_batched * lStrides[3]);
        int roff = z * (is_r_d2_batched * rStrides[2]) + w * (is_r_d3_batched * rStrides[3]);

        dim_t lOffset = lhs.getOffset() + loff;
        dim_t rOffset = rhs.getOffset() + roff;
        dim_t oOffset = out.getOffset() + z * oStrides[2] + w * oStrides[3];

        cl::Event event;
        if(rDims[bColDim] == 1) {
            // rhs slice is a vector: gemv. The increment depends on whether
            // the vector is a row or a column of the stored rhs.
            dim_t incr = (optRhs == AF_MAT_NONE) ? rStrides[0] : rStrides[1];
            gpu_blas_gemv_func<T> gemv;
            OPENCL_BLAS_CHECK(
                gemv(lOpts, lDims[0], lDims[1],
                     alpha,
                     (*lhs.get())(), lOffset, lStrides[1],
                     (*rhs.get())(), rOffset, incr,
                     beta,
                     (*out.get())(), oOffset, 1,
                     1, &getQueue()(), 0, nullptr, &event())
                );
        } else {
            gpu_blas_gemm_func<T> gemm;
            OPENCL_BLAS_CHECK(
                gemm(lOpts, rOpts, M, N, K,
                     alpha,
                     (*lhs.get())(), lOffset, lStrides[1],
                     (*rhs.get())(), rOffset, rStrides[1],
                     beta,
                     (*out.get())(), oOffset, out.dims()[0],
                     1, &getQueue()(), 0, nullptr, &event())
                );
        }
    }

    return out;
}
Example #18
0
// Least-squares solve of a*x = b for a non-square `a` using QR (geqrf) on
// the GPU. The underdetermined case (M < N) works on transpose(a); the
// overdetermined case (M > N) triangular-solves against R after applying
// Q^H to b. Returns the solution array.
Array<T> leastSquares(const Array<T> &a, const Array<T> &b)
{
    int M = a.dims()[0];
    int N = a.dims()[1];
    int K = b.dims()[1];
    int MN = std::min(M, N);

    Array<T> B = createEmptyArray<T>(dim4());
    gpu_blas_trsm_func<T> gpu_blas_trsm;

    // NOTE(review): `event` is not initialized here (other functions in this
    // file use `cl_event event = 0;`) — confirm clBLAS always writes it.
    cl_event event;
    cl_command_queue queue = getQueue()();

    if (M < N) {

#define UNMQR 0 // FIXME: UNMQR == 1 should be faster but does not work

        // Least squres for this case is solved using the following
        // solve(A, B) == matmul(Q, Xpad);
        // Where:
        // Xpad == pad(Xt, N - M, 1);
        // Xt   == tri_solve(R1, B);
        // R1   == R(seq(M), seq(M));
        // transpose(A) == matmul(Q, R);

        // QR is performed on the transpose of A
        Array<T> A = transpose<T>(a, true);

#if UNMQR
        B = padArray<T, T>(b, dim4(N, K), scalar<T>(0));
        B.resetDims(dim4(M, K));
#else
        B = copyArray<T>(b);
#endif

        // Workspace sizing follows the magma geqrf blocking factor.
        int NB = magma_get_geqrf_nb<T>(A.dims()[1]);
        int NUM = (2*MN + ((M+31)/32)*32)*NB;
        Array<T> tmp = createEmptyArray<T>(dim4(NUM));

        std::vector<T> h_tau(MN);

        int info = 0;
        cl::Buffer *dA = A.get();
        cl::Buffer *dT = tmp.get();
        cl::Buffer *dB = B.get();

        // QR factorization of A (the transpose of the input).
        magma_geqrf3_gpu<T>(A.dims()[0], A.dims()[1],
                           (*dA)(), A.getOffset(), A.strides()[1],
                           &h_tau[0], (*dT)(), tmp.getOffset(), getQueue()(), &info);

        // View only the leading M x M block (R1).
        A.resetDims(dim4(M, M));

        // Swap the diagonal blocks out so trsm sees the upper-triangular R1;
        // swapped back after the solve.
        magmablas_swapdblk<T>(MN-1, NB,
                              (*dA)(), A.getOffset(), A.strides()[1], 1,
                              (*dT)(), tmp.getOffset() + MN * NB, NB, 0, queue);

        // Xt = tri_solve(R1^H, B).
        CLBLAS_CHECK(gpu_blas_trsm(
                         clblasLeft, clblasUpper,
                         clblasConjTrans, clblasNonUnit,
                         B.dims()[0], B.dims()[1],
                         scalar<T>(1),
                         (*dA)(), A.getOffset(), A.strides()[1],
                         (*dB)(), B.getOffset(), B.strides()[1],
                         1, &queue, 0, nullptr, &event));

        // Restore the diagonal blocks swapped above.
        magmablas_swapdblk<T>(MN - 1, NB,
                              (*dT)(), tmp.getOffset() + MN * NB, NB, 0,
                              (*dA)(), A.getOffset(), A.strides()[1], 1, queue);

#if UNMQR
        int lwork = (B.dims()[0]-A.dims()[0]+NB)*(B.dims()[1]+2*NB);
        std::vector<T> h_work(lwork);
        B.resetDims(dim4(N, K));
        magma_unmqr_gpu<T>(MagmaLeft, MagmaNoTrans,
                           B.dims()[0], B.dims()[1], A.dims()[0],
                           (*dA)(), A.getOffset(), A.strides()[1],
                           &h_tau[0],
                           (*dB)(), B.getOffset(), B.strides()[1],
                           &h_work[0], lwork,
                           (*dT)(), tmp.getOffset(), NB, queue, &info);
#else
        // Materialize Q explicitly (N x M) and multiply: solution = Q * Xt.
        A.resetDims(dim4(N, M));
        magma_ungqr_gpu<T>(A.dims()[0], A.dims()[1], std::min(M, N),
                           (*dA)(), A.getOffset(), A.strides()[1],
                           &h_tau[0],
                           (*dT)(), tmp.getOffset(), NB, queue, &info);

        B = matmul(A, B, AF_MAT_NONE, AF_MAT_NONE);
#endif
    } else if (M > N) {
        // Least squres for this case is solved using the following
        // solve(A, B) == tri_solve(R1, Bt);
        // Where:
        // R1 == R(seq(N), seq(N));
        // Bt == matmul(transpose(Q1), B);
        // Q1 == Q(span, seq(N));
        // A  == matmul(Q, R);

        Array<T> A = copyArray<T>(a);
        B = copyArray(b);

        int MN = std::min(M, N);
        int NB = magma_get_geqrf_nb<T>(M);

        int NUM = (2*MN + ((N+31)/32)*32)*NB;
        Array<T> tmp = createEmptyArray<T>(dim4(NUM));

        // NOTE(review): sized NUM here but MN in the branch above; MN
        // entries appear sufficient — verify against magma's contract.
        std::vector<T> h_tau(NUM);

        int info = 0;
        cl::Buffer *A_buf = A.get();
        cl::Buffer *B_buf = B.get();
        cl::Buffer *dT = tmp.get();

        // QR factorization of A.
        magma_geqrf3_gpu<T>(M, N,
                           (*A_buf)(), A.getOffset(), A.strides()[1],
                           &h_tau[0], (*dT)(), tmp.getOffset(), getQueue()(), &info);

        int NRHS = B.dims()[1];
        int lhwork = (M - N + NB) * (NRHS + NB) + NRHS * NB;

        std::vector<T> h_work(lhwork);
        h_work[0] = scalar<T>(lhwork);

        // Bt = Q^H * B, applied implicitly from the factored form.
        magma_unmqr_gpu<T>(MagmaLeft, MagmaConjTrans,
                           M, NRHS, N,
                           (*A_buf)(), A.getOffset(), A.strides()[1],
                           &h_tau[0],
                           (*B_buf)(), B.getOffset(), B.strides()[1],
                           &h_work[0], lhwork,
                           (*dT)(), tmp.getOffset(), NB,
                           queue, &info);

        // Restore R's diagonal blocks before the triangular solve.
        magmablas_swapdblk<T>(MN - 1, NB,
                              (*A_buf)(), A.getOffset(), A.strides()[1], 1,
                              (*dT)(), tmp.getOffset() + NB * MN,
                              NB, 0, queue);

        // NVIDIA workaround (see triangleSolve): solve the conjugate-
        // transposed lower-triangular system instead of upper/no-trans.
        if(getActivePlatform() == AFCL_PLATFORM_NVIDIA)
        {
            Array<T> AT = transpose<T>(A, true);
            cl::Buffer* AT_buf = AT.get();
            CLBLAS_CHECK(gpu_blas_trsm(
                             clblasLeft, clblasLower, clblasConjTrans, clblasNonUnit,
                             N, NRHS, scalar<T>(1),
                             (*AT_buf)(), AT.getOffset(), AT.strides()[1],
                             (*B_buf)(), B.getOffset(), B.strides()[1],
                             1, &queue, 0, nullptr, &event));
        } else {
            CLBLAS_CHECK(gpu_blas_trsm(
                             clblasLeft, clblasUpper, clblasNoTrans, clblasNonUnit,
                             N, NRHS, scalar<T>(1),
                             (*A_buf)(), A.getOffset(), A.strides()[1],
                             (*B_buf)(), B.getOffset(), B.strides()[1],
                             1, &queue, 0, nullptr, &event));
        }
        // Only the first N rows hold the solution.
        B.resetDims(dim4(N, K));
    }

    return B;
}