Ejemplo n.º 1
0
// Death-test: query a stream after it has been destroyed.
// Drivers before CUDA 5.0 segfaulted on this; 5.0+ returns an error code.
// NOTE(review): the TEST body's final closing brace is not visible in this
// excerpt — the definition appears truncated here.
TEST(StreamQuery, InvalidStream) {
    ::testing::FLAGS_gtest_death_test_style = "threadsafe";

    cudaError_t ret;
    cudaStream_t stream;

    /* The CUDA 5.0 driver no longer segfaults. */
    int driver;
    ret = cudaDriverGetVersion(&driver);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    // Destroy the stream, then query the now-invalid handle below.
    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);

    if (driver >= 5000) {
        // Modern drivers report an error instead of crashing.
        ret = cudaStreamQuery(stream);
        EXPECT_EQ(cudaErrorUnknown, ret);
    } else {
        // Older drivers crash; assert the process dies with SIGSEGV.
        EXPECT_EXIT({
            cudaStreamQuery(stream); },
            ::testing::KilledBySignal(SIGSEGV), "");
    }
Ejemplo n.º 2
0
/**
 * Synchronize with `stream`, sleeping first for most of the predicted wait
 * so that the blocking cudaStreamSynchronize only covers the remaining tail.
 *
 * @param stream     the CUDA stream to wait on
 * @param situation  key selecting which timing estimate to use; negative
 *                   means "just synchronize, no sleep heuristics"
 * @param thr_id     index of the calling worker thread into the estimate array
 * @return           result of cudaStreamSynchronize (cudaSuccess if the
 *                   stream was already idle or abort_flag was set)
 *
 * NOTE(review): tsumarray appears to hold one double per thread
 * (value[thr_id]) — confirm against its declaration elsewhere.
 */
cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
{
	cudaError_t result = cudaSuccess;
	// Bail out immediately when shutdown has been requested.
	if (abort_flag)
		return result;
	if (situation >= 0)
	{
		// Per-situation exponential moving average of observed wait times,
		// shared across calls (static) and indexed per worker thread.
		static std::map<int, tsumarray> tsum;

		// EWMA weights: keep 95% of the old estimate, blend in 5% of the
		// new sample.
		double a = 0.95, b = 0.05;
		if (tsum.find(situation) == tsum.end()) { a = 0.5; b = 0.5; } // faster initial convergence

		double tsync = 0.0;
		// Sleep for 95% of the predicted wait; operator[] default-constructs
		// the entry (estimate 0) on first use for this situation.
		double tsleep = 0.95 * tsum[situation].value[thr_id];
		if (cudaStreamQuery(stream) == cudaErrorNotReady)
		{
			usleep((useconds_t)(1e6*tsleep));
			// Measure how long the remaining blocking sync actually took.
			struct timeval tv_start, tv_end;
			gettimeofday(&tv_start, NULL);
			result = cudaStreamSynchronize(stream);
			gettimeofday(&tv_end, NULL);
			tsync = 1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec);
		}
		// Fold the new sleep+sync sample into the running estimate.
		if (tsync >= 0) tsum[situation].value[thr_id] = a * tsum[situation].value[thr_id] + b * (tsleep+tsync);
	}
	else
		result = cudaStreamSynchronize(stream);
	return result;
}
Ejemplo n.º 3
0
// Reset the CUDA device owned by thread `thr_id`. When two worker threads
// share one GPU, the optional `init` array lets us mark every thread bound
// to that device as uninitialized so their scan loops restart cleanly.
void cuda_reset_device(int thr_id, bool *init)
{
	int dev_id = device_map[thr_id % MAX_GPUS];
	cudaSetDevice(dev_id);

	if (init != NULL) {
		// Clear the init flag of every thread mapped to this device so the
		// algo scan code re-initializes after the reset.
		for (int n = 0; n < MAX_GPUS; n++)
			if (device_map[n] == dev_id)
				init[n] = false;

		// Kick all threads out of their scan loops, then wait for the
		// device's pending work to drain before resetting it.
		restart_threads();
		cudaDeviceSynchronize();
		while (cudaStreamQuery(NULL) == cudaErrorNotReady)
			usleep(1000);
	}

	cudaDeviceReset();

	// Restore the scheduling flags: either the user-requested schedule or
	// the blocking-sync default.
	unsigned flags = (opt_cudaschedule >= 0)
		? (unsigned)(opt_cudaschedule & cudaDeviceScheduleMask)
		: cudaDeviceScheduleBlockingSync;
	cudaSetDeviceFlags(flags);

	cudaDeviceSynchronize();
}
  /**
     Query if a stream has finished.

     @return true when all work in the stream has completed, false when
     work is still pending. Any other status is forwarded to CUDA_CHECK.
  */
  inline bool query() const
  {
    const cudaError_t status = cudaStreamQuery(stream);

    switch (status)
    {
    case cudaSuccess:
      return true;
    case cudaErrorNotReady:
      return false;
    default:
      CUDA_CHECK(status);
    }
    return false;  // suppress compiler warning
  }
Ejemplo n.º 5
0
    // Block until all work queued on `stream` has completed.
    void synchronize_stream(cudaStream_t stream)
    {
#if !defined CUDA_VERSION
#error CUDA_VERSION not defined
#elif CUDA_VERSION >= 9020 && CUDA_VERSION <= 10010
        // Busy-poll the stream instead of calling cudaStreamSynchronize,
        // which for some reason makes training freeze in some cases on
        // these CUDA versions.
        // (see https://github.com/davisking/dlib/issues/1513)
        for (;;)
        {
            const cudaError_t err = cudaStreamQuery(stream);
            if (err == cudaSuccess)
                return;               // now we are synchronized
            if (err != cudaErrorNotReady)
                CHECK_CUDA(err);      // unexpected error: throw
            // cudaErrorNotReady: keep polling
        }
#else // CUDA_VERSION
        CHECK_CUDA(cudaStreamSynchronize(stream));
#endif // CUDA_VERSION
    }
Ejemplo n.º 6
0
      /*
       * Checks whether all commands submitted to the native handle have
       * executed, via cudaStreamQuery.
       *
       * @return True iff cudaStreamQuery(handle) == cudaSuccess, i.e. every
       * queued command in the stream has finished successfully.
       */
	  bool Stream::query()
	  {
		const cudaError_t status = cudaStreamQuery(handle);
		return status == cudaSuccess;
	  }//bool
Ejemplo n.º 7
0
// Wine shim: trace the call, then forward straight to the native
// cudaStreamQuery and hand its status back to the Windows caller.
cudaError_t WINAPI wine_cudaStreamQuery(cudaStream_t stream) {
    WINE_TRACE("\n");
    cudaError_t status = cudaStreamQuery( stream );
    return status;
}
Ejemplo n.º 8
0
// Execute the task graph once using Kahn's topological-sort scheme:
// dependency-free nodes are launched on CUDA streams, and each completed
// stream releases its dependents. Blocks until every task has finished.
void TaskScheduler::run()
{
    // Kahn's algorithm
    // https://en.wikipedia.org/wiki/Topological_sorting

    auto compareNodes = [] (Node* a, Node* b) {
        // lower number means higher priority
        return a->priority < b->priority;
    };
    // S: ready-to-run nodes, ordered by priority.
    std::priority_queue<Node*, std::vector<Node*>, decltype(compareNodes)> S(compareNodes);
    // workMap: (stream, node) pairs currently executing on the GPU.
    std::vector<std::pair<cudaStream_t, Node*>> workMap;

    // Restore each node's dependency list and seed S with the nodes that
    // have no dependencies.
    for (auto& n : nodes)
    {
        n->from = n->from_backup;

        if (n->from.empty())
            S.push(n.get());
    }

    int completed = 0;
    const int total = nodes.size();

    while (true)
    {
        // Check the status of all running kernels
        // NOTE(review): this polls cudaStreamQuery in a tight loop with no
        // sleep — presumably intentional to minimize launch latency; confirm.
        while (completed < total && S.empty())
        {
            for (auto streamNode_it = workMap.begin(); streamNode_it != workMap.end(); )
            {
                auto result = cudaStreamQuery(streamNode_it->first);
                if ( result == cudaSuccess )
                {
                    auto node = streamNode_it->second;

                    debug("Completed group %s ", tasks[node->id].label.c_str());

                    // Return freed stream back to the corresponding queue
                    node->streams->push(streamNode_it->first);

                    // Remove resolved dependencies
                    for (auto dep : node->to)
                    {
                        if (!dep->from.empty())
                        {
                            dep->from.remove(node);
                            if (dep->from.empty())
                                S.push(dep);
                        }
                    }

                    // Remove task from the list of currently in progress
                    completed++;
                    streamNode_it = workMap.erase(streamNode_it);
                }
                else if (result == cudaErrorNotReady)
                {
                    // Still running: keep it in workMap, move on.
                    streamNode_it++;
                }
                else
                {
                    // Any status other than success/not-ready is a real error.
                    error("Group '%s' raised an error",  tasks[streamNode_it->second->id].label.c_str());
                    CUDA_Check( result );
                }
            }
        }

        if (completed == total)
            break;

        // Launch the highest-priority ready node.
        Node* node = S.top();
        S.pop();

        // Reuse a stream from the node's pool, or create a non-blocking one
        // at the node's priority if the pool is empty.
        cudaStream_t stream;
        if (node->streams->empty())
            CUDA_Check( cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, node->priority) );
        else
        {
            stream = node->streams->front();
            node->streams->pop();
        }

        debug("Executing group %s on stream %lld with priority %d", tasks[node->id].label.c_str(), (int64_t)stream, node->priority);
        workMap.push_back({stream, node});

        // Run each of the task's functions whose period divides the current
        // execution count.
        for (auto& func_every : tasks[node->id].funcs)
            if (nExecutions % func_every.second == 0)
                func_every.first(stream);
    }

    nExecutions++;
    CUDA_Check( cudaDeviceSynchronize() );
}
Ejemplo n.º 9
0
 // Returns true when every command queued on m_stream has finished.
 // Any status other than "done" or "still running" trips the assertion.
 bool Stream::isCompleted() const
 {
   const cudaError_t status = cudaStreamQuery( m_stream );
   DP_ASSERT( ( status == cudaSuccess ) || ( status == cudaErrorNotReady ) );
   return ( status == cudaSuccess );
 }