void laset(int m, int  n,
           T offdiag, T diag,
           cl_mem dA, size_t dA_offset, magma_int_t ldda)
{
    std::string refName = laset_name<uplo>() + std::string("_") +
        std::string(dtype_traits<T>::getName()) +
        std::to_string(uplo);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog==0 && entry.ker==0) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName()
                << " -D BLK_X=" << BLK_X
                << " -D BLK_Y=" << BLK_Y
                << " -D IS_CPLX=" << af::iscplx<T>();

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char* ker_strs[] = {laset_cl};
        const int   ker_lens[] = {laset_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, laset_name<uplo>());

        addKernelToCache(device, refName, entry);
    }

    int groups_x = (m - 1) / BLK_X + 1;
    int groups_y = (n - 1) / BLK_Y + 1;

    NDRange local(BLK_X, 1);
    NDRange global(groups_x * local[0], groups_y * local[1]);

    // retain the cl_mem object during cl::Buffer creation
    cl::Buffer dAObj(dA, true);

    auto lasetOp = KernelFunctor<int, int, T, T, Buffer, unsigned long long, int>(*entry.ker);

    lasetOp(EnqueueArgs(getQueue(), global, local), m, n, offdiag, diag, dAObj, dA_offset, ldda);
}
Exemple #2
0
void swapdblk(int n, int nb, cl_mem dA, size_t dA_offset, int ldda, int inca,
              cl_mem dB, size_t dB_offset, int lddb, int incb,
              cl_command_queue queue) {
    std::string refName =
        std::string("swapdblk_") + std::string(dtype_traits<T>::getName());

    int device       = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream options;

        options << " -D T=" << dtype_traits<T>::getName();
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char* ker_strs[] = {swapdblk_cl};
        const int ker_lens[]   = {swapdblk_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "swapdblk");

        addKernelToCache(device, refName, entry);
    }

    int nblocks = n / nb;

    if (nblocks == 0) return;

    int info = 0;
    if (n < 0) {
        info = -1;
    } else if (nb < 1 || nb > 1024) {
        info = -2;
    } else if (ldda < (nblocks - 1) * nb * inca + nb) {
        info = -4;
    } else if (inca < 0) {
        info = -5;
    } else if (lddb < (nblocks - 1) * nb * incb + nb) {
        info = -7;
    } else if (incb < 0) {
        info = -8;
    }

    if (info != 0) {
        AF_ERROR("Invalid configuration", AF_ERR_INTERNAL);
        return;
    }

    NDRange local(nb);
    NDRange global(nblocks * nb);

    cl::Buffer dAObj(dA, true);
    cl::Buffer dBObj(dB, true);

    auto swapdOp =
        KernelFunctor<int, Buffer, unsigned long long, int, int, Buffer,
                      unsigned long long, int, int>(*entry.ker);

    cl::CommandQueue q(queue);
    swapdOp(EnqueueArgs(q, global, local), nb, dAObj, dA_offset, ldda, inca,
            dBObj, dB_offset, lddb, incb);
}