SparseArray<T> sparseConvertDenseToStorage(const Array<T> &in_)
{
    in_.eval();

    // MKL only has dns->csr.
    // CSR <-> CSC is only supported if input is square
    uint nNZ = reduce_all<af_notzero_t, T, uint>(in_);

    SparseArray<T> sparse_ = createEmptySparseArray<T>(in_.dims(), nNZ, AF_STORAGE_CSR);
    sparse_.eval();

    auto func = [=] (SparseArray<T> sparse, const Array<T> in) {
        // Read: https://software.intel.com/en-us/node/520848
        // But job description is incorrect with regards to job[1]
        // 0 implies row major and 1 implies column major
        int j1 = 1, j2 = 0;
        const int job[] = {0, j1, j2, 2, (int)sparse.elements(), 1};

        const int M = in.dims()[0];
        const int N = in.dims()[1];

        int ldd = in.strides()[1];

        int info = 0;

        // Have to mess up all const correctness because MKL dnscsr function
        // is bidirectional and has input/output on all pointers
        Array<T  > &values = sparse.getValues();
        Array<int> &rowIdx = sparse.getRowIdx();
        Array<int> &colIdx = sparse.getColIdx();

        dnscsr_func<T>()(
            job, &M, &N,
            reinterpret_cast<ptr_type<T>>(const_cast<T*>(in.get())), &ldd,
            reinterpret_cast<ptr_type<T>>(values.get()),
            colIdx.get(),
            rowIdx.get(),
            &info);
    };
    getQueue().enqueue(func, sparse_, in_);

    if (stype == AF_STORAGE_CSR)
        return sparse_;
    else
        AF_ERROR("CPU Backend only supports Dense to CSR or COO", AF_ERR_NOT_SUPPORTED);

    return sparse_;
}
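// For reference, a minimal standalone sketch of the raw MKL call that
// dnscsr_func wraps above. Illustrative only: it follows the job[] reading
// from the comment above (job[1]==0 taken as row major, so lda is the row
// length), which the MKL documentation describes differently, and the matrix
// contents are made up.
#include <cstdio>
#include <mkl.h>

int main()
{
    MKL_INT m = 2, n = 3, lda = 3, info = 0;
    double dense[] = {1.0, 0.0, 2.0,   // row 0
                      0.0, 3.0, 0.0};  // row 1

    MKL_INT nnz = 3;
    // job[0]=0: dense -> CSR; job[2]=0: zero-based CSR indices;
    // job[3]=2: use the whole matrix; job[4]: nnz cap; job[5]=1: fill all arrays
    MKL_INT job[6] = {0, 0, 0, 2, nnz, 1};

    double  values[3];
    MKL_INT colIdx[3];
    MKL_INT rowPtr[3];  // m + 1 entries

    mkl_ddnscsr(job, &m, &n, dense, &lda, values, colIdx, rowPtr, &info);

    std::printf("info=%d nnz=%d\n", (int)info, (int)rowPtr[m]);
    return 0;
}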
Array<T> sparseConvertCOOToDense(const SparseArray<T> &in)
{
    in.eval();

    Array<T> dense = createValueArray<T>(in.dims(), scalar<T>(0));
    dense.eval();

    const Array<T>   values = in.getValues();
    const Array<int> rowIdx = in.getRowIdx();
    const Array<int> colIdx = in.getColIdx();

    getQueue().enqueue(kernel::coo2dense<T>, dense, values, rowIdx, colIdx);

    return dense;
}
static void bcast_dim_launcher(Param &out, Param &tmp, const uint groups_all[4])
{
    Kernel ker = get_scan_dim_kernels<Ti, To, op, dim, isFinalPass, threads_y>(1);

    NDRange local(THREADS_X, threads_y);
    NDRange global(groups_all[0] * groups_all[2] * local[0],
                   groups_all[1] * groups_all[3] * local[1]);

    uint lim = divup(out.info.dims[dim], (threads_y * groups_all[dim]));

    auto bcastOp = make_kernel<Buffer, KParam, Buffer, KParam,
                               uint, uint, uint, uint>(ker);

    bcastOp(EnqueueArgs(getQueue(), global, local),
            out.data, out.info, tmp.data, tmp.info,
            groups_all[0], groups_all[1], groups_all[dim], lim);

    CL_DEBUG_FINISH(getQueue());
}
Array<T>::Array(af::dim4 dims, af::dim4 strides, dim_t offset_,
                const T * const in_data, bool is_device)
    : info(getActiveDeviceId(), dims, offset_, strides,
           (af_dtype)dtype_traits<T>::af_type)
    , data(is_device ? (T*)in_data : memAlloc<T>(info.total()).release(),
           memFree<T>)
    , data_dims(dims)
    , node(bufferNodePtr<T>())
    , ready(true)
    , owner(true)
{
    if (!is_device) {
        // Ensure the memory being written to isn't used anywhere else.
        getQueue().sync();
        copy(in_data, in_data + info.total(), data.get());
    }
}
Array<T> rotate(const Array<T> &in, const float theta, const af::dim4 &odims,
                const af_interp_type method)
{
    in.eval();

    Array<T> out = createEmptyArray<T>(odims);

    switch(method) {
        case AF_INTERP_NEAREST:
            getQueue().enqueue(kernel::rotate<T, AF_INTERP_NEAREST>, out, in, theta);
            break;
        case AF_INTERP_BILINEAR:
            getQueue().enqueue(kernel::rotate<T, AF_INTERP_BILINEAR>, out, in, theta);
            break;
        case AF_INTERP_LOWER:
            getQueue().enqueue(kernel::rotate<T, AF_INTERP_LOWER>, out, in, theta);
            break;
        default:
            AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
            break;
    }

    return out;
}
void sort0(Param val)
{
    try {
        compute::command_queue c_queue(getQueue()());
        compute::buffer val_buf((*val.data)());

        for(int w = 0; w < val.info.dims[3]; w++) {
            int valW = w * val.info.strides[3];
            for(int z = 0; z < val.info.dims[2]; z++) {
                int valWZ = valW + z * val.info.strides[2];
                for(int y = 0; y < val.info.dims[1]; y++) {
                    int valOffset = valWZ + y * val.info.strides[1];

                    if(isAscending) {
                        compute::stable_sort(
                            compute::make_buffer_iterator<T>(val_buf, valOffset),
                            compute::make_buffer_iterator<T>(val_buf, valOffset + val.info.dims[0]),
                            compute::less<T>(), c_queue);
                    } else {
                        compute::stable_sort(
                            compute::make_buffer_iterator<T>(val_buf, valOffset),
                            compute::make_buffer_iterator<T>(val_buf, valOffset + val.info.dims[0]),
                            compute::greater<T>(), c_queue);
                    }
                }
            }
        }

        CL_DEBUG_FINISH(getQueue());
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
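// A minimal, self-contained Boost.Compute sketch of the pattern sort0 relies
// on: stable_sort over a device container with a less/greater comparator.
// Device selection and data are illustrative only.
#include <iostream>
#include <vector>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/algorithm/stable_sort.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/core.hpp>
#include <boost/compute/functional.hpp>

namespace compute = boost::compute;

int main()
{
    compute::device dev = compute::system::default_device();
    compute::context ctx(dev);
    compute::command_queue queue(ctx, dev);

    std::vector<int> host = {5, 1, 4, 2, 3};
    compute::vector<int> dev_vec(host.size(), ctx);
    compute::copy(host.begin(), host.end(), dev_vec.begin(), queue);

    // Descending sort, mirroring the isAscending == false branch above
    compute::stable_sort(dev_vec.begin(), dev_vec.end(),
                         compute::greater<int>(), queue);

    compute::copy(dev_vec.begin(), dev_vec.end(), host.begin(), queue);
    for (int v : host) std::cout << v << ' ';   // 5 4 3 2 1
    std::cout << '\n';
    return 0;
}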
void convNHelper(const conv_kparam_t& param, Param& out, const Param& signal, const Param& filter)
{
    std::string ref_name =
        std::string("convolveND_") +
        std::string(dtype_traits<T>::getName()) +
        std::string(dtype_traits<aT>::getName()) +
        std::to_string(bDim) +
        std::to_string(expand);

    int device = getActiveDeviceId();

    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog==0 && entry.ker==0) {
        std::ostringstream options;
        options << " -D T="       << dtype_traits<T>::getName()
                << " -D Ti="      << dtype_traits<T>::getName()
                << " -D To="      << dtype_traits<aT>::getName()
                << " -D accType=" << dtype_traits<aT>::getName()
                << " -D BASE_DIM=" << bDim
                << " -D EXPAND="   << expand
                << " -D " << binOpName<af_mul_t>();

        if((af_dtype) dtype_traits<T>::af_type == c32 ||
           (af_dtype) dtype_traits<T>::af_type == c64) {
            options << " -D CPLX=1";
        } else {
            options << " -D CPLX=0";
        }

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char *ker_strs[] = {ops_cl, convolve_cl};
        const int   ker_lens[] = {ops_cl_len, convolve_cl_len};

        Program prog;
        buildProgram(prog, 2, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "convolve");

        addKernelToCache(device, ref_name, entry);
    }

    auto convOp = cl::KernelFunctor<Buffer, KParam, Buffer, KParam,
                                    cl::LocalSpaceArg, Buffer, KParam,
                                    int, int, int, int, int, int, int, int>(*entry.ker);

    convOp(EnqueueArgs(getQueue(), param.global, param.local),
           *out.data, out.info, *signal.data, signal.info,
           cl::Local(param.loc_size), *param.impulse, filter.info,
           param.nBBS0, param.nBBS1,
           param.o[0], param.o[1], param.o[2],
           param.s[0], param.s[1], param.s[2]);
}
/*
 * Print Entire queue: Debugging
 */
void printQueue()
{
    BoardNode boardToPrint = getQueue(NULL)->start;
    int row, col, queueN = 0;

    while(boardToPrint != NULL) {
        queueN++;
        printf("Queue Item %d\n", queueN);
        for(row = 0; row < MAXROW; row++) {
            for(col = 0; col < MAXCOL; col++) {
                printf("%d ", boardToPrint->board[row][col]);
            }
            pNL();
        }
        boardToPrint = boardToPrint->next;
    }
}
void random(cl::Buffer out, dim_type elements)
{
    try {
        static unsigned counter;

        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static Program ranProgs[DeviceManager::MAX_DEVICES];
        static Kernel  ranKernels[DeviceManager::MAX_DEVICES];

        int device = getActiveDeviceId();

        std::call_once(compileFlags[device], [device] () {
            Program::Sources setSrc;
            setSrc.emplace_back(random_cl, random_cl_len);

            std::ostringstream options;
            options << " -D T=" << dtype_traits<T>::getName()
                    << " -D repeat=" << REPEAT
                    << " -D " << random_name<T, isRandu>().name();

            if (std::is_same<T, double>::value) {
                options << " -D USE_DOUBLE";
                options << " -D IS_64";
            }

            if (std::is_same<T, char>::value) {
                options << " -D IS_BOOL";
            }

            buildProgram(ranProgs[device], random_cl, random_cl_len, options.str());
            ranKernels[device] = Kernel(ranProgs[device], "random");
        });

        auto randomOp = make_kernel<cl::Buffer, uint, uint, uint, uint>(ranKernels[device]);

        uint groups = divup(elements, THREADS * REPEAT);
        counter += divup(elements, THREADS * groups);

        NDRange local(THREADS, 1);
        NDRange global(THREADS * groups, 1);

        randomOp(EnqueueArgs(getQueue(), global, local),
                 out, elements, counter, random_seed[0], random_seed[1]);
    } catch(cl::Error ex) {
        CL_TO_AF_ERROR(ex);
    }
}
/*
 * Checks the start of the action queue for a command, and executes it if
 * all criteria are met
 */
int popToTower()
{
    ActionQueueStructure queue = getQueue(NULL);
    GameProperties Game = getGame(NULL);
    int needed;

    if(queue->start != NULL) {
        needed = calculateCosts(queue->start->command, queue->start->option,
                                queue->start->target);
        switch(queue->start->command) {
            case cmd_upgrade:
                if (checkQueue(queue, Game, needed)) {
                    upgradeTowerStat(queue->start->option, queue->start->target);
                    useMemory(Game, needed);
                    removeQueueItem();
                }
                break;
            case cmd_mktwr:
                if (checkQueue(queue, Game, needed)) {
                    switch(queue->start->option) {
                        case mktwr_int:
                            createTowerTypeFromPositions(queue->start->target, INT_TYPE);
                            break;
                        case mktwr_char:
                            createTowerTypeFromPositions(queue->start->target, CHAR_TYPE);
                            break;
                        default:
                            fprintf(stderr, "Unrecognised tower type\n");
                            break;
                    }
                    //createTowerFromPositions(queue->start->target);
                    useMemory(Game, needed);
                    removeQueueItem();
                }
                break;
            case cmd_aptget:
                if(checkQueue(queue, Game, needed)) {
                    unlock_ability(KILL);
                    useMemory(Game, needed);
                    removeQueueItem();
                }
                break;
            default:
                break;
        }
    } else {
        return 0;
    }
    return 1;
}
//=============================================================================
// METHOD: SPELLipcMessageMailbox::place
//=============================================================================
bool SPELLipcMessageMailbox::place( std::string id, const SPELLipcMessage& msg )
{
    DEBUG(NAME + "Place message on queue with id " + id + " (" + msg.getSequenceStr() + ")");
    SPELLipcMessageQueue* queue = getQueue(id);
    if(queue) {
        DEBUG(NAME + "Place message IN");
        queue->push(msg);
        DEBUG(NAME + "Place message OUT");
        return true;
    } else {
        LOG_ERROR("###### No queue to place response " + msg.dataStr());
        return false;
    }
}
int cholesky_inplace(Array<T> &in, const bool is_upper)
{
    if(OpenCLCPUOffload()) {
        return cpu::cholesky_inplace(in, is_upper);
    }

    dim4 iDims = in.dims();
    int N = iDims[0];

    magma_uplo_t uplo = is_upper ? MagmaUpper : MagmaLower;

    int info = 0;
    cl::Buffer *in_buf = in.get();

    magma_potrf_gpu<T>(uplo, N, (*in_buf)(), in.getOffset(), in.strides()[1],
                       getQueue()(), &info);

    return info;
}
void laset(int m, int n, T offdiag, T diag,
           cl_mem dA, size_t dA_offset, magma_int_t ldda)
{
    std::string refName = laset_name<uplo>() +
        std::string("_") +
        std::string(dtype_traits<T>::getName()) +
        std::to_string(uplo);

    int device = getActiveDeviceId();

    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog==0 && entry.ker==0) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName()
                << " -D BLK_X=" << BLK_X
                << " -D BLK_Y=" << BLK_Y
                << " -D IS_CPLX=" << af::iscplx<T>();

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char* ker_strs[] = {laset_cl};
        const int   ker_lens[] = {laset_cl_len};

        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, laset_name<uplo>());

        addKernelToCache(device, refName, entry);
    }

    int groups_x = (m - 1) / BLK_X + 1;
    int groups_y = (n - 1) / BLK_Y + 1;

    NDRange local(BLK_X, 1);
    NDRange global(groups_x * local[0], groups_y * local[1]);

    // retain the cl_mem object during cl::Buffer creation
    cl::Buffer dAObj(dA, true);

    auto lasetOp = KernelFunctor<int, int, T, T, Buffer, unsigned long long, int>(*entry.ker);

    lasetOp(EnqueueArgs(getQueue(), global, local),
            m, n, offdiag, diag, dAObj, dA_offset, ldda);
}
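// Aside: the "(m - 1) / BLK_X + 1" work-group math above is ordinary ceiling
// division, the same quantity the divup() helper computes in the other
// launchers in this collection. A tiny self-contained check:
#include <cassert>

static int divup(int a, int b) { return (a + b - 1) / b; }

int main()
{
    assert(divup(100, 32) == 4);        // 4 blocks of 32 cover 100 rows
    assert((100 - 1) / 32 + 1 == 4);    // identical to the expression above
    return 0;
}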
Array<T> triangleSolve(const Array<T> &A, const Array<T> &b, const af_mat_prop options)
{
    trsm_func<T> gpu_trsm;

    Array<T> B = copyArray<T>(b);

    int N    = B.dims()[0];
    int NRHS = B.dims()[1];

    const cl::Buffer* A_buf = A.get();
    cl::Buffer* B_buf = B.get();

    cl_event event = 0;
    cl_command_queue queue = getQueue()();

    std::string pName = getPlatformName(getDevice());
    if(pName.find("NVIDIA") != std::string::npos && (options & AF_MAT_UPPER)) {
        // NVIDIA platform: route the upper-triangular solve through the
        // conjugate transpose (L = A^H is lower triangular, and trsm with
        // clblasConjTrans then solves (A^H)^H x = A x = b), apparently to
        // sidestep a platform-specific clBLAS issue.
        Array<T> AT = transpose<T>(A, true);
        cl::Buffer* AT_buf = AT.get();
        gpu_trsm(clblasColumnMajor, clblasLeft, clblasLower, clblasConjTrans,
                 options & AF_MAT_DIAG_UNIT ? clblasUnit : clblasNonUnit,
                 N, NRHS, scalar<T>(1),
                 (*AT_buf)(), AT.getOffset(), AT.strides()[1],
                 (*B_buf)(), B.getOffset(), B.strides()[1],
                 1, &queue, 0, nullptr, &event);
    } else {
        gpu_trsm(clblasColumnMajor, clblasLeft,
                 options & AF_MAT_LOWER ? clblasLower : clblasUpper,
                 clblasNoTrans,
                 options & AF_MAT_DIAG_UNIT ? clblasUnit : clblasNonUnit,
                 N, NRHS, scalar<T>(1),
                 (*A_buf)(), A.getOffset(), A.strides()[1],
                 (*B_buf)(), B.getOffset(), B.strides()[1],
                 1, &queue, 0, nullptr, &event);
    }

    return B;
}
int32_t InnerUdtServer::sendMessage(idgs::actor::ActorMessagePtr& msg)
{
    int32_t memberId = msg->getDestMemberId();
    if(memberId < 0) {
        LOG(ERROR) << "Invalid member ID: " << memberId;
        return RC_ERROR;
    }

    auto q = getQueue(memberId);
    msg->freePbMemory();
    q->push(msg);

    std::shared_ptr<InnerUdtConnection> conn = getConnection(memberId);
    if(conn) {
        conn->sendMessage(msg);
    }

    return 0;
}
Array<T> triangleSolve(const Array<T> &A, const Array<T> &b, const af_mat_prop options)
{
    gpu_blas_trsm_func<T> gpu_blas_trsm;

    Array<T> B = copyArray<T>(b);

    int N    = B.dims()[0];
    int NRHS = B.dims()[1];

    const cl::Buffer* A_buf = A.get();
    cl::Buffer* B_buf = B.get();

    cl_event event = 0;
    cl_command_queue queue = getQueue()();

    if(getActivePlatform() == AFCL_PLATFORM_NVIDIA && (options & AF_MAT_UPPER)) {
        // Same NVIDIA workaround as the trsm variant above: solve through the
        // conjugate transpose, since trsm with clblasConjTrans on L = A^H
        // solves (A^H)^H x = A x = b.
        Array<T> AT = transpose<T>(A, true);
        cl::Buffer* AT_buf = AT.get();
        CLBLAS_CHECK(gpu_blas_trsm(
                         clblasLeft, clblasLower, clblasConjTrans,
                         options & AF_MAT_DIAG_UNIT ? clblasUnit : clblasNonUnit,
                         N, NRHS, scalar<T>(1),
                         (*AT_buf)(), AT.getOffset(), AT.strides()[1],
                         (*B_buf)(), B.getOffset(), B.strides()[1],
                         1, &queue, 0, nullptr, &event));
    } else {
        CLBLAS_CHECK(gpu_blas_trsm(
                         clblasLeft,
                         options & AF_MAT_LOWER ? clblasLower : clblasUpper,
                         clblasNoTrans,
                         options & AF_MAT_DIAG_UNIT ? clblasUnit : clblasNonUnit,
                         N, NRHS, scalar<T>(1),
                         (*A_buf)(), A.getOffset(), A.strides()[1],
                         (*B_buf)(), B.getOffset(), B.strides()[1],
                         1, &queue, 0, nullptr, &event));
    }

    return B;
}
void convNHelper(const conv_kparam_t& param, Param& out, const Param& signal, const Param& filter)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> convProgs;
        static std::map<int, Kernel*>  convKernels;

        int device = getActiveDeviceId();

        std::call_once(compileFlags[device], [device] () {
            std::ostringstream options;
            options << " -D T=" << dtype_traits<T>::getName()
                    << " -D accType=" << dtype_traits<aT>::getName()
                    << " -D BASE_DIM=" << bDim
                    << " -D EXPAND=" << expand;

            if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }

            Program prog;
            buildProgram(prog, convolve_cl, convolve_cl_len, options.str());
            convProgs[device]   = new Program(prog);
            convKernels[device] = new Kernel(*convProgs[device], "convolve");
        });

        auto convOp = cl::KernelFunctor<Buffer, KParam, Buffer, KParam,
                                        cl::LocalSpaceArg, Buffer, KParam,
                                        int, int, int, int, int, int, int, int>(*convKernels[device]);

        convOp(EnqueueArgs(getQueue(), param.global, param.local),
               *out.data, out.info, *signal.data, signal.info,
               cl::Local(param.loc_size), *param.impulse, filter.info,
               param.nBBS0, param.nBBS1,
               param.o[0], param.o[1], param.o[2],
               param.s[0], param.s[1], param.s[2]);
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
void susan(cl::Buffer* out, const cl::Buffer* in, const unsigned in_off,
           const unsigned idim0, const unsigned idim1,
           const float t, const float g, const unsigned edge)
{
    std::string refName = std::string("susan_responses_") +
        std::string(dtype_traits<T>::getName()) +
        std::to_string(radius);

    int device = getActiveDeviceId();

    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        const size_t LOCAL_MEM_SIZE =
            (SUSAN_THREADS_X + 2 * radius) * (SUSAN_THREADS_Y + 2 * radius);

        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName()
                << " -D LOCAL_MEM_SIZE=" << LOCAL_MEM_SIZE
                << " -D BLOCK_X=" << SUSAN_THREADS_X
                << " -D BLOCK_Y=" << SUSAN_THREADS_Y
                << " -D RADIUS=" << radius
                << " -D RESPONSE";

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char* ker_strs[] = {susan_cl};
        const int   ker_lens[] = {susan_cl_len};

        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "susan_responses");

        addKernelToCache(device, refName, entry);
    }

    auto susanOp = KernelFunctor<Buffer, Buffer, unsigned, unsigned, unsigned,
                                 float, float, unsigned>(*entry.ker);

    NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y);
    NDRange global(divup(idim0 - 2 * edge, local[0]) * local[0],
                   divup(idim1 - 2 * edge, local[1]) * local[1]);

    susanOp(EnqueueArgs(getQueue(), global, local),
            *out, *in, in_off, idim0, idim1, t, g, edge);
}
int cholesky_inplace(Array<T> &in, const bool is_upper)
{
    try {
        initBlas();

        dim4 iDims = in.dims();
        int N = iDims[0];

        magma_uplo_t uplo = is_upper ? MagmaUpper : MagmaLower;

        int info = 0;
        cl::Buffer *in_buf = in.get();

        magma_potrf_gpu<T>(uplo, N, (*in_buf)(), in.getOffset(), in.strides()[1],
                           getQueue()(), &info);

        return info;
    } catch (cl::Error &err) {
        CL_TO_AF_ERROR(err);
    }
}
Array<T> solveLU(const Array<T> &A, const Array<int> &pivot,
                 const Array<T> &b, const af_mat_prop options)
{
    A.eval();
    pivot.eval();
    b.eval();

    int N    = A.dims()[0];
    int NRHS = b.dims()[1];

    Array<T> B = copyArray<T>(b);

    auto func = [=] (Array<T> A, Array<T> B, Array<int> pivot, int N, int NRHS) {
        getrs_func<T>()(AF_LAPACK_COL_MAJOR, 'N',
                        N, NRHS,
                        A.get(), A.strides()[1],
                        pivot.get(),
                        B.get(), B.strides()[1]);
    };
    getQueue().enqueue(func, A, B, pivot, N, NRHS);

    return B;
}
/* Input:   An empty queue, an initialized sudoku s, and a pointer to a
 *          variable in which to store the number of guesses
 * Returns: NULL if no possible solutions exist or the queue is not
 *          empty, and otherwise the solution to the input puzzle.
 * -----------------------------------------------------------
 * Solves the puzzle using backtracking. The solver is initialized by
 * putting the puzzle into the empty queue. Then, each iteration will
 * pull a board out of the queue, perform a simple reduction on that
 * board, and then make a guess on the cell which has the least number
 * of possibilities. If verbose is set, it prints each board before
 * making a guess, giving a sense of the whole solution process.
 *
 * If there is nothing to pull out of the queue, there are no possible
 * solutions to the puzzle, and the function returns an error value of
 * NULL. If the queue is not empty at initialization, the function
 * prints an error message regardless of the state of the flags, and
 * returns an error.
 */
sudoku solve(queue q, sudoku s, int * guesses)
{
    if (!isEmptyQueue(q)) {
        printf("Error: call to solve with a non-empty queue");
        return NULL;
    }
    putQueue(q, (void *) s);
    *guesses = 0;
    int slvd = 0;

    while (1) {
        if (getQueue(q, (void **) &s)) {
            return NULL;
        }
        reduce(s);
        if (verbose) {
            system("clear");
            printSudoku(s, pretty);
            printf("\n");
        }
        slvd = checkSudoku(s);
        if (slvd == -1)
            deleteSudoku(s);
        else if (slvd == 1)
            return s;
        else {
            if (guess(q, s)) {
                printf("Error: Full queue");
                return NULL;
            }
            deleteSudoku(s);
            (*guesses)++;
        }
    }
}
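/* Hypothetical driver for solve(), just to show the calling convention the
 * comment above describes. createQueue() and readSudoku() stand in for
 * whatever constructors this project actually provides; only solve(),
 * printSudoku(), and pretty appear in the source. */
#include <stdio.h>

int main(void)
{
    queue q = createQueue();        /* must start empty, or solve() errors out */
    sudoku s = readSudoku(stdin);   /* hypothetical loader */
    int guesses = 0;

    sudoku solved = solve(q, s, &guesses);
    if (solved == NULL) {
        printf("No solution found\n");
        return 1;
    }
    printf("Solved after %d guesses:\n", guesses);
    printSudoku(solved, pretty);
    return 0;
}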
void EmitterSystem::update(Camera & camera, float frametime)
{
    Vector3 near = camera.unProject(Mouse::getPosition(), 0.f);
    Vector3 far  = camera.unProject(Mouse::getPosition(), 1.f);
    Vector3 dir  = (far - near);
    dir.normalize();
    dir.x *= lerp(1.f, 4.f, std::abs(Mouse::getPosition().x - 0.5f) / 0.5f);
    dir.y *= lerp(1.f, 4.f, std::abs(Mouse::getPosition().y - 0.5f) / 0.5f);

    cl::Event event;
    cl_int err = 0;

    glFinish();

    if (Keyboard::isKeyPressed(GLFW_KEY_F))
        init();
    if (Keyboard::isKeyPressed(GLFW_KEY_H))
        m_disableVelocity = !m_disableVelocity;
    if (m_disableVelocity)
        frametime = 0.f;

    std::vector<cl::Memory> buffers;
    buffers.push_back(m_glBuffer[Index::Particles]);

    err = getQueue().enqueueAcquireGLObjects(&buffers, NULL, NULL);
    if (err != CL_SUCCESS)
        std::cout << "Failed acquiring GL object : " << getError(err) << std::endl;
    err = getQueue().finish();
    if (err != CL_SUCCESS)
        std::cout << "ERROR" << std::endl;

    err = getKernel().setArg(3, frametime);
    if (err != CL_SUCCESS)
        std::cout << "ERROR kernel args" << std::endl;

    err = getQueue().enqueueNDRangeKernel(getKernel(), cl::NullRange,
                                          cl::NDRange(m_particleCount),
                                          cl::NullRange, NULL, &event);
    if (err != CL_SUCCESS)
        std::cout << "Failed enqueueing kernel : " << getError(err) << std::endl;
    err = event.wait();
    if (err != CL_SUCCESS)
        std::cout << "ERROR" << std::endl;

    getQueue().enqueueCopyBuffer(m_clBuffer[Index::Particles],
                                 m_glBuffer[Index::Particles],
                                 0, 0, m_particleCount * sizeof(Particle),
                                 NULL, NULL);

    err = getQueue().enqueueReleaseGLObjects(&buffers, NULL, NULL);
    if (err != CL_SUCCESS)
        std::cout << "Failed releasing GL object : " << getError(err) << std::endl;
    getQueue().finish();
}
Array<T> solveLU(const Array<T> &A, const Array<int> &pivot,
                 const Array<T> &b, const af_mat_prop options)
{
    int N    = A.dims()[0];
    int NRHS = b.dims()[1];

    std::vector<int> ipiv(N);
    copyData(&ipiv[0], pivot);

    Array<T> B = copyArray<T>(b);

    const cl::Buffer *A_buf = A.get();
    cl::Buffer *B_buf = B.get();

    int info = 0;
    magma_getrs_gpu<T>(MagmaNoTrans, N, NRHS,
                       (*A_buf)(), A.getOffset(), A.strides()[1],
                       &ipiv[0],
                       (*B_buf)(), B.getOffset(), B.strides()[1],
                       getQueue()(), &info);

    return B;
}
Array<T> triangleSolve(const Array<T> &A, const Array<T> &b, const af_mat_prop options)
{
    A.eval();
    b.eval();

    Array<T> B = copyArray<T>(b);

    int N    = B.dims()[0];
    int NRHS = B.dims()[1];

    auto func = [=] (Array<T> A, Array<T> B, int N, int NRHS, const af_mat_prop options) {
        trtrs_func<T>()(AF_LAPACK_COL_MAJOR,
                        options & AF_MAT_UPPER ? 'U' : 'L',
                        'N', // transpose flag
                        options & AF_MAT_DIAG_UNIT ? 'U' : 'N',
                        N, NRHS,
                        A.get(), A.strides()[1],
                        B.get(), B.strides()[1]);
    };
    getQueue().enqueue(func, A, B, N, NRHS, options);

    return B;
}
Array<T>::Array(const dim4 &dims, T *const in_data, bool is_device, bool copy_device)
    : info(getActiveDeviceId(), dims, 0, calcStrides(dims),
           (af_dtype)dtype_traits<T>::af_type)
    , data((is_device & !copy_device) ? (T *)in_data
                                      : memAlloc<T>(dims.elements()).release(),
           memFree<T>)
    , data_dims(dims)
    , node(bufferNodePtr<T>())
    , ready(true)
    , owner(true)
{
    static_assert(is_standard_layout<Array<T>>::value,
                  "Array<T> must be a standard layout type");
    static_assert(
        offsetof(Array<T>, info) == 0,
        "Array<T>::info must be the first member variable of Array<T>");
    if (!is_device || copy_device) {
        // Ensure the memory being written to isn't used anywhere else.
        getQueue().sync();
        copy(in_data, in_data + dims.elements(), data.get());
    }
}
void lu(Array<T> &lower, Array<T> &upper, Array<int> &pivot, const Array<T> &in)
{
    lower.eval();
    upper.eval();
    pivot.eval();
    in.eval();

    dim4 iDims = in.dims();
    int M = iDims[0];
    int N = iDims[1];

    Array<T> in_copy = copyArray<T>(in);
    pivot = lu_inplace(in_copy);

    // SPLIT into lower and upper
    dim4 ldims(M, min(M, N));
    dim4 udims(min(M, N), N);

    lower = createEmptyArray<T>(ldims);
    upper = createEmptyArray<T>(udims);

    getQueue().enqueue(kernel::lu_split<T>, lower, upper, in_copy);
}
Array<T>* setIntersect(const Array<T> &first, const Array<T> &second, const bool is_unique)
{
    if ((std::is_same<T, double>::value || std::is_same<T, cdouble>::value) &&
        !isDoubleSupported(getActiveDeviceId())) {
        OPENCL_NOT_SUPPORTED();
    }

    Array<T> unique_first  = first;
    Array<T> unique_second = second;

    if (!is_unique) {
        unique_first  = *setUnique(first, false);
        unique_second = *setUnique(second, false);
    }

    size_t out_size = std::max(unique_first.dims()[0], unique_second.dims()[0]);
    Array<T> *out = createEmptyArray<T>(dim4(out_size, 1, 1, 1));

    compute::command_queue queue(getQueue()());

    compute::buffer first_data((*unique_first.get())());
    compute::buffer second_data((*unique_second.get())());
    compute::buffer out_data((*out->get())());

    compute::buffer_iterator<T> first_begin(first_data, 0);
    compute::buffer_iterator<T> first_end(first_data, unique_first.dims()[0]);
    compute::buffer_iterator<T> second_begin(second_data, 0);
    compute::buffer_iterator<T> second_end(second_data, unique_second.dims()[0]);
    compute::buffer_iterator<T> out_begin(out_data, 0);

    compute::buffer_iterator<T> out_end = compute::set_intersection(
        first_begin, first_end, second_begin, second_end, out_begin, queue);

    out->resetDims(dim4(std::distance(out_begin, out_end), 1, 1, 1));
    return out;
}
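// A freestanding Boost.Compute sketch of the set_intersection call used
// above, on plain int data. Inputs must already be sorted and deduplicated,
// which is exactly why setIntersect() runs setUnique() first.
#include <algorithm>
#include <iostream>
#include <vector>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/algorithm/set_intersection.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/core.hpp>

namespace compute = boost::compute;

int main()
{
    compute::device dev = compute::system::default_device();
    compute::context ctx(dev);
    compute::command_queue queue(ctx, dev);

    std::vector<int> a = {1, 2, 4, 8}, b = {2, 3, 4, 5};
    compute::vector<int> da(a.begin(), a.end(), queue);
    compute::vector<int> db(b.begin(), b.end(), queue);
    compute::vector<int> out(std::max(da.size(), db.size()), ctx);

    // Returns an iterator past the last written element, like std::set_intersection
    auto out_end = compute::set_intersection(da.begin(), da.end(),
                                             db.begin(), db.end(),
                                             out.begin(), queue);

    std::vector<int> result(std::distance(out.begin(), out_end));
    compute::copy(out.begin(), out_end, result.begin(), queue);
    for (int v : result) std::cout << v << ' ';   // 2 4
    std::cout << '\n';
    return 0;
}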
void testPushToQueue()
{
    cmdType nCommand_1 = cmd_upgrade;
    cmdOption nStat_1  = upgrade_power;
    int tar_1 = 1;
    cmdType nCommand_2 = cmd_upgrade;
    cmdOption nStat_2  = upgrade_range;
    int tar_2 = 2;

    ActionQueueStructure newQueue = getQueue(NULL);

    sput_fail_unless(pushToQueue(newQueue, nCommand_1, nStat_1, tar_1) == 1,
                     "Valid: 1 Queue Item");
    sput_fail_unless(pushToQueue(newQueue, nCommand_2, nStat_2, tar_2) == 2,
                     "Valid: 2 Queue Items");
    sput_fail_unless(getFirstCommand(newQueue) == cmd_upgrade,
                     "Valid: Top of Queue Upgrade Command");
    sput_fail_unless(getFirstOption(newQueue) == upgrade_power,
                     "Valid: Top of Queue Power Option");
    sput_fail_unless(getLastCommand(newQueue) == cmd_upgrade,
                     "Valid: Last in Queue upgrade Command");
    sput_fail_unless(getLastOption(newQueue) == upgrade_range,
                     "Valid: Last of Queue range Option");

    pushToQueue(newQueue, cmd_mktwr, mktwr_int, 2);
    sput_fail_unless(getLastCommand(newQueue) == cmd_mktwr,
                     "Valid: Last in Queue make tower command");
    sput_fail_unless(getLastOption(newQueue) == mktwr_int,
                     "Valid: Last option in Queue is int tower");

    clearQueue();
}
Array<T> dot(const Array<T> &lhs, const Array<T> &rhs,
             af_blas_transpose optLhs, af_blas_transpose optRhs)
{
    initBlas();

    int N = lhs.dims()[0];

    dot_func<T> dot;
    cl::Event event;
    auto out = createEmptyArray<T>(af::dim4(1));

    // clBLAS dot needs a scratch buffer (at least N elements)
    cl::Buffer scratch(getContext(), CL_MEM_READ_WRITE, sizeof(T) * N);

    clblasStatus err;
    err = dot(N, (*out.get())(), out.getOffset(),
              (*lhs.get())(), lhs.getOffset(), lhs.strides()[0],
              (*rhs.get())(), rhs.getOffset(), rhs.strides()[0],
              scratch(), 1, &getQueue()(), 0, nullptr, &event());

    if(err) {
        throw runtime_error(std::string("CLBLAS error: ") + std::to_string(err));
    }

    return out;
}
Array<T> sparseConvertStorageToDense(const SparseArray<T> &in_)
{
    in_.eval();

    Array<T> dense_ = createValueArray<T>(in_.dims(), scalar<T>(0));
    dense_.eval();

    auto func = [=] (Array<T> dense, const SparseArray<T> in) {
        Array<T  > values = in.getValues();
        Array<int> rowIdx = in.getRowIdx();
        Array<int> colIdx = in.getColIdx();

        kernel::csr_dense<T>()(dense, values, rowIdx, colIdx);
    };
    getQueue().enqueue(func, dense_, in_);

    if (stype == AF_STORAGE_CSR)
        return dense_;
    else
        AF_ERROR("CPU Backend only supports CSR or COO to Dense", AF_ERR_NOT_SUPPORTED);

    return dense_;
}