void laset(int m, int n, T offdiag, T diag, cl_mem dA, size_t dA_offset, magma_int_t ldda) { std::string refName = laset_name<uplo>() + std::string("_") + std::string(dtype_traits<T>::getName()) + std::to_string(uplo); int device = getActiveDeviceId(); kc_entry_t entry = kernelCache(device, refName); if (entry.prog==0 && entry.ker==0) { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D BLK_X=" << BLK_X << " -D BLK_Y=" << BLK_Y << " -D IS_CPLX=" << af::iscplx<T>(); if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) options << " -D USE_DOUBLE"; const char* ker_strs[] = {laset_cl}; const int ker_lens[] = {laset_cl_len}; Program prog; buildProgram(prog, 1, ker_strs, ker_lens, options.str()); entry.prog = new Program(prog); entry.ker = new Kernel(*entry.prog, laset_name<uplo>()); addKernelToCache(device, refName, entry); } int groups_x = (m - 1) / BLK_X + 1; int groups_y = (n - 1) / BLK_Y + 1; NDRange local(BLK_X, 1); NDRange global(groups_x * local[0], groups_y * local[1]); // retain the cl_mem object during cl::Buffer creation cl::Buffer dAObj(dA, true); auto lasetOp = KernelFunctor<int, int, T, T, Buffer, unsigned long long, int>(*entry.ker); lasetOp(EnqueueArgs(getQueue(), global, local), m, n, offdiag, diag, dAObj, dA_offset, ldda); }
void swapdblk(int n, int nb, cl_mem dA, size_t dA_offset, int ldda, int inca, cl_mem dB, size_t dB_offset, int lddb, int incb, cl_command_queue queue) { std::string refName = std::string("swapdblk_") + std::string(dtype_traits<T>::getName()); int device = getActiveDeviceId(); kc_entry_t entry = kernelCache(device, refName); if (entry.prog == 0 && entry.ker == 0) { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName(); if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) options << " -D USE_DOUBLE"; const char* ker_strs[] = {swapdblk_cl}; const int ker_lens[] = {swapdblk_cl_len}; Program prog; buildProgram(prog, 1, ker_strs, ker_lens, options.str()); entry.prog = new Program(prog); entry.ker = new Kernel(*entry.prog, "swapdblk"); addKernelToCache(device, refName, entry); } int nblocks = n / nb; if (nblocks == 0) return; int info = 0; if (n < 0) { info = -1; } else if (nb < 1 || nb > 1024) { info = -2; } else if (ldda < (nblocks - 1) * nb * inca + nb) { info = -4; } else if (inca < 0) { info = -5; } else if (lddb < (nblocks - 1) * nb * incb + nb) { info = -7; } else if (incb < 0) { info = -8; } if (info != 0) { AF_ERROR("Invalid configuration", AF_ERR_INTERNAL); return; } NDRange local(nb); NDRange global(nblocks * nb); cl::Buffer dAObj(dA, true); cl::Buffer dBObj(dB, true); auto swapdOp = KernelFunctor<int, Buffer, unsigned long long, int, int, Buffer, unsigned long long, int, int>(*entry.ker); cl::CommandQueue q(queue); swapdOp(EnqueueArgs(q, global, local), nb, dAObj, dA_offset, ldda, inca, dBObj, dB_offset, lddb, incb); }