/*
 * Allocate and initialize a new THCStream on the caller's current device.
 *
 * flags    - flags forwarded to cudaStreamCreateWithPriority
 *            (e.g. cudaStreamDefault / cudaStreamNonBlocking).
 * priority - stream priority forwarded to the CUDA runtime.
 *
 * The returned object has refcount 1 and records the current device.
 * Returns NULL if the host allocation fails (the original code dereferenced
 * the malloc result unchecked, which is UB on OOM).
 */
THCStream* THCStream_newWithPriority(int flags, int priority)
{
  THCStream* self = (THCStream*) malloc(sizeof(THCStream));
  if (!self) {
    return NULL;  // out of host memory: let the caller decide how to fail
  }
  self->refcount = 1;
  THCudaCheck(cudaGetDevice(&self->device));
  THCudaCheck(cudaStreamCreateWithPriority(&self->stream, flags, priority));
  return self;
}
// Construct a CUDA stream, optionally at the device's greatest available
// priority. With high_priority == false a plain default-priority stream is
// created instead. Logs the new stream, the current device, and the calling
// thread at DLOG(INFO) level.
CudaStream::CudaStream(bool high_priority = false) {
  if (high_priority) {
    int least_priority = 0;
    int greatest_priority = 0;
    // Ask the device for its supported priority bounds, then create the
    // stream at the "greatest" (most urgent) end of that range.
    CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
    CUDA_CHECK(cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, greatest_priority));
  } else {
    CUDA_CHECK(cudaStreamCreate(&stream_));
  }
  DLOG(INFO) << "New " << (high_priority ? "high priority " : "") << "stream "
             << stream_ << " on device " << current_device()
             << ", thread " << std::this_thread::get_id();
}
// Execute one pass of the task graph: repeatedly launch every node whose
// dependencies are satisfied (Kahn's topological order), each group on its
// own CUDA stream, polling running streams to discover completions and
// release dependents. Blocks at the end with cudaDeviceSynchronize().
void TaskScheduler::run() {
    // Kahn's algorithm
    // https://en.wikipedia.org/wiki/Topological_sorting
    auto compareNodes = [] (Node* a, Node* b) {
        // lower number means higher priority
        // NOTE(review): std::priority_queue with a '<' comparator pops the
        // LARGEST element, so S.top() is the node with the numerically
        // largest priority value. Given the comment above, that is the
        // LOWEST-urgency node first — confirm this ordering is intended.
        return a->priority < b->priority;
    };
    // S: ready set — nodes whose remaining dependency list 'from' is empty.
    std::priority_queue<Node*, std::vector<Node*>, decltype(compareNodes)> S(compareNodes);
    // workMap: groups currently in flight, paired with the stream running them.
    std::vector<std::pair<cudaStream_t, Node*>> workMap;
    // Reset per-run dependency state from the pristine backup; seed S with
    // all source nodes (no incoming edges).
    for (auto& n : nodes) {
        n->from = n->from_backup;
        if (n->from.empty()) S.push(n.get());
    }
    int completed = 0;
    const int total = nodes.size();
    while (true) {
        // Check the status of all running kernels
        // Spin here only while nothing is ready to launch and work remains:
        // poll every in-flight stream and harvest finished groups.
        while (completed < total && S.empty()) {
            for (auto streamNode_it = workMap.begin(); streamNode_it != workMap.end(); ) {
                // cudaStreamQuery is non-blocking: cudaSuccess means all work
                // on the stream has finished; cudaErrorNotReady means still
                // running; anything else is a real error.
                auto result = cudaStreamQuery(streamNode_it->first);
                if ( result == cudaSuccess ) {
                    auto node = streamNode_it->second;
                    debug("Completed group %s ", tasks[node->id].label.c_str());
                    // Return freed stream back to the corresponding queue
                    node->streams->push(streamNode_it->first);
                    // Remove resolved dependencies
                    // Any dependent whose last dependency this was becomes ready.
                    for (auto dep : node->to) {
                        if (!dep->from.empty()) {
                            dep->from.remove(node);
                            if (dep->from.empty()) S.push(dep);
                        }
                    }
                    // Remove task from the list of currently in progress
                    // (erase returns the next iterator — do not also increment).
                    completed++;
                    streamNode_it = workMap.erase(streamNode_it);
                } else if (result == cudaErrorNotReady) {
                    streamNode_it++;
                } else {
                    error("Group '%s' raised an error", tasks[streamNode_it->second->id].label.c_str());
                    CUDA_Check( result );
                }
            }
        }
        if (completed == total) break;
        // Launch the next ready group.
        Node* node = S.top();
        S.pop();
        cudaStream_t stream;
        // Reuse a pooled stream for this node's priority class if one is
        // available; otherwise create a fresh non-blocking stream.
        if (node->streams->empty())
            CUDA_Check( cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, node->priority) );
        else {
            stream = node->streams->front();
            node->streams->pop();
        }
        debug("Executing group %s on stream %lld with priority %d", tasks[node->id].label.c_str(), (int64_t)stream, node->priority);
        workMap.push_back({stream, node});
        // Each registered function runs only every 'func_every.second'
        // scheduler executions (period gating on nExecutions).
        for (auto& func_every : tasks[node->id].funcs)
            if (nExecutions % func_every.second == 0)
                func_every.first(stream);
    }
    nExecutions++;
    // Full-device barrier: everything launched above is complete on return.
    CUDA_Check( cudaDeviceSynchronize() );
}
// Create a CUDA stream with the requested priority. When 'blocking' is true
// the stream uses cudaStreamDefault flags; otherwise it is created
// non-blocking. The priority actually granted by the runtime is then read
// back into m_priority (the runtime clamps requests to the supported range).
Stream::Stream( bool blocking, int priority )
    : m_blocking( blocking )
{
    const unsigned int creation_flags = m_blocking ? cudaStreamDefault : cudaStreamNonBlocking;
    CUDA_VERIFY( cudaStreamCreateWithPriority( &m_stream, creation_flags, priority ) );
    // Record what the driver actually assigned rather than trusting the request.
    CUDA_VERIFY( cudaStreamGetPriority( m_stream, &m_priority ) );
}