TEST(StreamQuery, InvalidStream) {
    ::testing::FLAGS_gtest_death_test_style = "threadsafe";

    cudaError_t ret;
    cudaStream_t stream;

    /* The CUDA 5.0 driver no longer segfaults. */
    int driver;
    ret = cudaDriverGetVersion(&driver);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);

    if (driver >= 5000) {
        ret = cudaStreamQuery(stream);
        EXPECT_EQ(cudaErrorUnknown, ret);
    } else {
        EXPECT_EXIT({
            cudaStreamQuery(stream); },
            ::testing::KilledBySignal(SIGSEGV), "");
    }
}
TEST_P(MemcpyAsync, H2DTransfers) {
    const size_t param = GetParam();
    const size_t alloc = 1 << param;

    cudaError_t ret;
    void *d1, *h1;
    ret = cudaMalloc(&d1, alloc);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaHostAlloc(&h1, alloc, cudaHostAllocMapped);
    ASSERT_EQ(cudaSuccess, ret);

    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(d1, h1, alloc, cudaMemcpyHostToDevice, stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(d1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFreeHost(h1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);
}
Example #3
void cuda_initialize() {
    CUDA_CHECK(cudaStreamCreate(&g_context.stream));
    CUBLAS_CHECK(cublasCreate_v2(&g_context.cublas_handle));
    CUBLAS_CHECK(cublasSetStream(g_context.cublas_handle, g_context.stream));
    // CUDNN_CHECK(cudnnCreate(&g_context.cudnn_handle));
    // CUDNN_CHECK(cudnnSetStream(g_context.cudnn_handle, g_context.stream));
}
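/**
 * A minimal teardown sketch to pair with cuda_initialize() above, assuming the
 * same g_context, CUDA_CHECK, and CUBLAS_CHECK helpers; the name
 * cuda_shutdown() is illustrative and not part of the original code.
 */
void cuda_shutdown() {
    // CUDNN_CHECK(cudnnDestroy(g_context.cudnn_handle));
    CUBLAS_CHECK(cublasDestroy_v2(g_context.cublas_handle));
    CUDA_CHECK(cudaStreamDestroy(g_context.stream));
}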
TEST_P(MemcpyAsync, D2DTransfers) {
    const size_t param = GetParam();
    const size_t alloc = 1 << param;

    cudaError_t ret;
    void *d1, *d2;
    ret = cudaMalloc(&d1, alloc);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMalloc(&d2, alloc);
    ASSERT_EQ(cudaSuccess, ret);

    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(d2, d1, alloc, cudaMemcpyDeviceToDevice, stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(d1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(d2);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);
}
/**
 * CUDA 4.0 introduced the cudaMemcpyDefault direction for cudaMemcpy and
 * cudaMemcpyAsync.
 */
TEST(MemcpyAsync, CheckDefaultDirection) {
    cudaError_t ret;
    cudaStream_t stream;

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    int a1 = 0;
    int a2 = 0;
    int * b;
    ret = cudaMalloc((void**) &b, sizeof(*b));
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(&a1, &a2, sizeof(a1), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(&a1, b, sizeof(a1), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(b, &a1, sizeof(a1), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(b, b, sizeof(a1), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaFree(b);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}
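/**
 * cudaMemcpyDefault is only guaranteed to infer the copy direction on systems
 * with unified virtual addressing.  A minimal sketch of guarding for that,
 * written in the same gtest style as the tests above; the test name and the
 * skip-on-no-UVA policy are assumptions, not part of the original suite.
 */
TEST(MemcpyAsync, DefaultDirectionRequiresUVA) {
    cudaError_t ret;

    int device;
    ret = cudaGetDevice(&device);
    ASSERT_EQ(cudaSuccess, ret);

    cudaDeviceProp prop;
    ret = cudaGetDeviceProperties(&prop, device);
    ASSERT_EQ(cudaSuccess, ret);

    if (!prop.unifiedAddressing) {
        /* Without UVA, cudaMemcpyDefault cannot resolve the pointers. */
        return;
    }

    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    int a = 0, b = 0;
    ret = cudaMemcpyAsync(&a, &b, sizeof(a), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}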
TEST(MemcpyAsync, CheckReturnValues) {
    /**
     * The API documentation states that cudaErrorInvalidDevicePointer is a
     * valid return value for cudaMemcpyAsync.
     *
     * TODO: This needs a test; a sketch of one follows this test.
     */
    cudaError_t ret;
    cudaStream_t stream;

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    /**
     * Test woefully out of range directions.
     */
    int a = 0;
    ret = cudaMemcpyAsync(&a,   &a,   sizeof(a), (cudaMemcpyKind) -1, stream);
    EXPECT_EQ(cudaErrorInvalidMemcpyDirection, ret);

    ret = cudaMemcpyAsync(NULL, NULL, sizeof(a), (cudaMemcpyKind) -1, stream);
    EXPECT_EQ(cudaErrorInvalidMemcpyDirection, ret);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}
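/**
 * A rough sketch of the test the TODO above asks for, checking that an
 * obviously invalid destination pointer is rejected.  The exact error code
 * (cudaErrorInvalidValue vs. cudaErrorInvalidDevicePointer) varies across CUDA
 * versions, so only the failure itself is asserted; this test is an
 * illustrative addition, not part of the original suite.
 */
TEST(MemcpyAsync, NullDevicePointer) {
    cudaError_t ret;
    cudaStream_t stream;

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    int a = 0;
    ret = cudaMemcpyAsync(NULL, &a, sizeof(a), cudaMemcpyHostToDevice, stream);
    EXPECT_NE(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}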
Example #7
void PathPlanner::initialize()
{
    qDebug() << __PRETTY_FUNCTION__;

    Q_ASSERT(!mIsInitialized);

    Q_ASSERT(mPointCloudColliders != nullptr);

    const quint32 numberOfCells = mParametersPathPlanner.grid.getCellCount();

    // The occupancy grid will be built and dilated in this memory
    mVboGridOccupancy = OpenGlUtilities::createVbo(numberOfCells * sizeof(quint8));
    cudaSafeCall(cudaGraphicsGLRegisterBuffer(&mCudaVboResourceGridOccupancyTemplate, mVboGridOccupancy, cudaGraphicsMapFlagsNone));

    checkAndMapGridOccupancy(mCudaVboResourceGridOccupancyTemplate);
    cudaMemset(mGridOccupancyTemplate, 0, numberOfCells);
    checkAndUnmapGridOccupancy(mCudaVboResourceGridOccupancyTemplate);

    // The (dilated) occupancy grid will be copied in here, then the pathplanner fills it. Separate memories
    // enable re-use of the pre-built occupancy grid.
    mVboGridPathPlanner = OpenGlUtilities::createVbo(numberOfCells * sizeof(quint8));
    cudaSafeCall(cudaGraphicsGLRegisterBuffer(&mCudaVboResourceGridPathPlanner, mVboGridPathPlanner, cudaGraphicsMapFlagsNone));

    cudaSafeCall(cudaMalloc((void**)&mDeviceWaypoints, mMaxWaypoints * 4 * sizeof(float)));

    cudaSafeCall(cudaStreamCreate(&mCudaStream));

    alignPathPlannerGridToColliderCloud();

    mIsInitialized = true;
}
TEST(EventRecord, RecordAfterDestroy) {
    ::testing::FLAGS_gtest_death_test_style = "threadsafe";

    cudaError_t ret;
    cudaEvent_t event;
    cudaStream_t stream;

    ret = cudaEventCreate(&event);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaEventDestroy(event);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    #if CUDART_VERSION >= 5000
    ret = cudaEventRecord(event, stream);
    EXPECT_EQ(cudaErrorUnknown, ret);
    #else
    EXPECT_EXIT(
        cudaEventRecord(event, stream),
        ::testing::KilledBySignal(SIGSEGV), "");
    #endif

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}
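/**
 * For contrast, a minimal sketch of the valid ordering: create the event,
 * record it into a stream, wait for it, and only then destroy it.  This is an
 * illustrative addition, not part of the original suite.
 */
TEST(EventRecord, RecordBeforeDestroy) {
    cudaError_t ret;
    cudaEvent_t event;
    cudaStream_t stream;

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaEventCreate(&event);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaEventRecord(event, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaEventSynchronize(event);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaEventDestroy(event);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}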
Example #9
void blasx_resource_init(int GPUs, cublasHandle_t* handles, cudaStream_t* streams, cudaEvent_t* events, void** C_dev, int floatType_id) {
    /* C_dev is used uniformly as an array of device buffer pointers;
       floatType_id only selects the element size of each allocation below. */
    int GPU_id = 0;
    for (GPU_id = 0; GPU_id < GPUs; GPU_id++) {
        assert( cudaSetDevice(GPU_id) == cudaSuccess );
        //create handles
        assert( cublasCreate(&handles[GPU_id]) == CUBLAS_STATUS_SUCCESS);
        //create streams and event
        int i = 0;
        for (i = 0 ; i < STREAMNUM; i++) {
            assert( cudaStreamCreate(&streams[i+GPU_id*STREAMNUM]) == cudaSuccess );
            assert( cudaEventCreateWithFlags(&events[i+GPU_id*STREAMNUM], cudaEventDisableTiming) == cudaSuccess );
        }
        //create C_dev
        for (i = 0; i < STREAMNUM*2; i++) {
            if (floatType_id == 0) {
                assert( cudaMalloc((void**)&C_dev[i+GPU_id*STREAMNUM*2], sizeof(float)*BLOCKSIZE_SGEMM*BLOCKSIZE_SGEMM) == cudaSuccess );
            }else if (floatType_id == 1) {
                 assert( cudaMalloc((void**)&C_dev[i+GPU_id*STREAMNUM*2], sizeof(double)*BLOCKSIZE_DGEMM*BLOCKSIZE_DGEMM) == cudaSuccess );
            } else {
                 assert( cudaMalloc((void**)&C_dev[i+GPU_id*STREAMNUM*2], sizeof(cuDoubleComplex)*BLOCKSIZE_ZGEMM*BLOCKSIZE_ZGEMM) == cudaSuccess );
            }
        }
    }
}
Example #10
ContextPtr CudaDevice::CreateStream(bool stream, CudaAlloc* alloc) {
	ContextPtr context(new CudaContext);
	// Use the caller-supplied allocator if one was given; otherwise fall back
	// to the default allocator.
	context->SetAllocator(alloc ? alloc : CreateDefaultAlloc().get());

	// Create a stream.
	if(stream) cudaStreamCreate(&context->_stream);
	return context;
}
void BilateralFilterLayer<Dtype>::cudastream_init() {
#ifndef CPU_ONLY
  if(stream_ == NULL) {
    stream_ = new cudaStream_t;
    CUDA_CHECK(cudaStreamCreate(stream_));
  }
#endif
}
TEST(MemcpyAsync, Pinned) {
    /**
     * Host memory must be pinned in order to be used as an argument to
     * cudaMemcpyAsync.  Panoptes only prints a warning about this error
     * rather than actually return an error via the CUDA API.  This test is
     * written as to check for the absence of an error once the CUDA
     * implementation starts returning one for nonpinned host memory.
     */
    const long page_size_ = sysconf(_SC_PAGESIZE);
    ASSERT_LT(0, page_size_);
    const size_t page_size = page_size_;

    const size_t pages = 3;
    assert(pages > 0);

    cudaError_t ret;
    cudaStream_t stream;

    uint8_t *device_ptr, *host_ptr;
    ret = cudaMalloc((void **) &device_ptr, pages * page_size);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMallocHost((void **) &host_ptr, pages * page_size);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    /* Page aligned transfers */
    for (size_t i = 0; i < pages; i++) {
        for (size_t j = i; j < pages; j++) {
            ret = cudaMemcpyAsync(device_ptr, host_ptr + i * page_size,
                (pages - j) * page_size, cudaMemcpyHostToDevice, stream);
            EXPECT_EQ(cudaSuccess, ret);

            ret = cudaMemcpyAsync(host_ptr + i * page_size, device_ptr,
                (pages - j) * page_size, cudaMemcpyDeviceToHost, stream);
            EXPECT_EQ(cudaSuccess, ret);
        }
    }

    /* Try a nonaligned transfer. */
    ret = cudaMemcpyAsync(device_ptr, host_ptr + (page_size / 2),
        page_size / 2, cudaMemcpyHostToDevice, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFreeHost(host_ptr);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(device_ptr);
    ASSERT_EQ(cudaSuccess, ret);
}
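/**
 * Existing pageable allocations can also be pinned after the fact with
 * cudaHostRegister before being handed to cudaMemcpyAsync.  A minimal sketch,
 * assuming a POSIX host (posix_memalign) and the same page-size handling as
 * the test above; it is illustrative rather than part of the original suite.
 */
TEST(MemcpyAsync, RegisteredHostMemory) {
    const long page_size_ = sysconf(_SC_PAGESIZE);
    ASSERT_LT(0, page_size_);
    const size_t page_size = page_size_;

    void *host_ptr;
    ASSERT_EQ(0, posix_memalign(&host_ptr, page_size, page_size));

    cudaError_t ret;
    ret = cudaHostRegister(host_ptr, page_size, cudaHostRegisterDefault);
    ASSERT_EQ(cudaSuccess, ret);

    void *device_ptr;
    ret = cudaMalloc(&device_ptr, page_size);
    ASSERT_EQ(cudaSuccess, ret);

    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(device_ptr, host_ptr, page_size,
        cudaMemcpyHostToDevice, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(device_ptr);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaHostUnregister(host_ptr);
    ASSERT_EQ(cudaSuccess, ret);

    free(host_ptr);
}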
JNIEXPORT jdouble JNICALL Java_org_apache_spark_mllib_classification_LogisticRegressionNative_predictPoint
    (JNIEnv *env, jobject obj, jdoubleArray data, jdoubleArray weights, jdouble intercept) {

	// the kernel is written to take multiple data sets and produce a set of results, but we're going
	// to run it as multiple parallel kernels, each producing a single result instead
	double *d_dataBuffer, *d_weightsBuffer, *d_score;
	int dataCount, dataLen, whichGPU;
	jdouble h_score, *h_dataBuffer, *h_weightsBuffer;
	cudaStream_t stream;

	// select a GPU for *this* specific dataset
	whichGPU = get_gpu();
	checkCudaErrors(cudaSetDevice(whichGPU));
	checkCudaErrors(cudaStreamCreate(&stream));

	// get a pointer to the raw input data, pinning them in memory
	dataCount = env->GetArrayLength(data);
	dataLen = dataCount*sizeof(double);
	assert(dataCount == env->GetArrayLength(weights));
	h_dataBuffer = (jdouble*) env->GetPrimitiveArrayCritical(data, 0);
	h_weightsBuffer = (jdouble*) env->GetPrimitiveArrayCritical(weights, 0);

	// copy input data to the GPU memory
	// TODO: It may be better to access host memory directly, skipping the copy.  Investigate.
	checkCudaErrors(mallocBest((void**)&d_dataBuffer, dataLen));
	checkCudaErrors(mallocBest((void**)&d_weightsBuffer, dataLen));
	checkCudaErrors(cudaMemcpyAsync(d_dataBuffer, h_dataBuffer, dataLen, cudaMemcpyHostToDevice, stream));
	checkCudaErrors(cudaMemcpyAsync(d_weightsBuffer, h_weightsBuffer, dataLen, cudaMemcpyHostToDevice, stream));
	// synchronize before unpinning, and also because there is a device-device transfer in predictKernelDevice
	checkCudaErrors(cudaStreamSynchronize(stream));
	// un-pin the host arrays, as we're done with them
	env->ReleasePrimitiveArrayCritical(data, h_dataBuffer, 0);
	env->ReleasePrimitiveArrayCritical(weights, h_weightsBuffer, 0);

	// allocate storage for the result
	checkCudaErrors(mallocBest((void**)&d_score, sizeof(double)));

	// run the kernel, to produce a result
	predictKernelDevice(d_dataBuffer, d_weightsBuffer, intercept, d_score, 1, dataCount, stream);

	checkCudaErrors(cudaStreamSynchronize(stream));

	// copy result back to host
	checkCudaErrors(cudaMemcpyAsync(&h_score, d_score, sizeof(double), cudaMemcpyDeviceToHost, stream));

	checkCudaErrors(cudaStreamSynchronize(stream));

	// Free the GPU buffers
	checkCudaErrors(freeBest(d_dataBuffer));
	checkCudaErrors(freeBest(d_weightsBuffer));
	checkCudaErrors(freeBest(d_score));

	checkCudaErrors(cudaStreamDestroy(stream));

	return h_score;
}
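/**
 * One way to act on the TODO above (skipping the explicit copy) is to map the
 * pinned host buffer into the device address space and let the kernel read it
 * directly.  A minimal sketch of that idea, assuming a device that supports
 * canMapHostMemory; checkCudaErrors is the helper used above, the function
 * name is illustrative, and the surrounding kernel launch is omitted.
 */
static double* map_host_buffer(double *h_buffer, size_t bytes) {
	double *d_view = NULL;
	checkCudaErrors(cudaHostRegister(h_buffer, bytes, cudaHostRegisterMapped));
	checkCudaErrors(cudaHostGetDevicePointer((void**)&d_view, h_buffer, 0));
	/* d_view can now be passed to a kernel in place of a cudaMalloc'd copy;
	   call cudaHostUnregister(h_buffer) once the kernel has completed. */
	return d_view;
}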
Example #14
GpuDevice::Impl::Impl(int d) : device(d) {
  ActivateDevice();
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDA_CALL(cudaStreamCreate(&stream[i]));
    CUBLAS_CALL(cublasCreate(&cublas_handle[i]));
    CUBLAS_CALL(cublasSetStream(cublas_handle[i], stream[i]));
    CUDNN_CALL(cudnnCreate(&cudnn_handle[i]));
    CUDNN_CALL(cudnnSetStream(cudnn_handle[i], stream[i]));
  }
}
Example #15
void SingleParticle2dx::Methods::CUDAProjectionMethod::prepareForProjections(SingleParticle2dx::DataStructures::ParticleContainer& cont)
{
	cudaSetDevice(getMyGPU());
	cudaStreamCreate(&m_stream);
	
	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
	cudaExtent VS = make_cudaExtent(m_size, m_size, m_size);
	
	if( m_alloc_done == false )
	{
		cudaMalloc3DArray(&m_cuArray, &channelDesc, VS);
	}
		
	SingleParticle2dx::real_array3d_type real_data( boost::extents[m_size][m_size][m_size] );
	m_context->getRealSpaceData(real_data);
	unsigned int size = m_size*m_size*m_size*sizeof(float);
	
	if( m_alloc_done == false )
	{
		res_data_h = (float*)malloc(m_size*m_size*sizeof(float));
		cudaMalloc((void**)&res_data_d, m_size*m_size*sizeof(float));
		m_alloc_done = true;
	}
	
	cudaMemcpy3DParms copyParams = {0};
	copyParams.srcPtr = make_cudaPitchedPtr((void*)real_data.origin(), VS.width*sizeof(float), VS.width, VS.height);
	copyParams.dstArray = m_cuArray;
	copyParams.extent = VS;
	copyParams.kind = cudaMemcpyHostToDevice;
	
//	cudaMemcpy3D(&copyParams);
	cudaMemcpy3DAsync(&copyParams, m_stream);
		
	struct cudaResourceDesc resDesc;
	memset(&resDesc, 0, sizeof(resDesc));
	resDesc.resType = cudaResourceTypeArray;
	resDesc.res.array.array = m_cuArray;
	
	struct cudaTextureDesc texDesc;
	memset(&texDesc, 0, sizeof(texDesc));
	texDesc.addressMode[0]   = cudaAddressModeClamp;
	texDesc.addressMode[1]   = cudaAddressModeClamp;
	texDesc.addressMode[2]   = cudaAddressModeClamp;
	texDesc.filterMode       = cudaFilterModeLinear;
	texDesc.readMode         = cudaReadModeElementType;
	texDesc.normalizedCoords = 0;

	if(m_alloc_done == true)
	{
		cudaDestroyTextureObject(m_texObj);
	}

	m_texObj = 0;
	cudaCreateTextureObject(&m_texObj, &resDesc, &texDesc, NULL);
}
Example #16
    BenchmarkContext(int device)
        : device_(device)
    {
        cudaError_t st = cudaSetDevice(device_);
        if (st != cudaSuccess)
            throw std::invalid_argument("could not set CUDA device");

        st = cudaStreamCreate(&stream_);
        if (st != cudaSuccess)
            throw std::invalid_argument("could not create CUDA stream");
    }
CudaStream::CudaStream(bool high_priority = false) {
  if (high_priority) {
    int leastPriority, greatestPriority;
    CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
    CUDA_CHECK(cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, greatestPriority));
  } else {
    CUDA_CHECK(cudaStreamCreate(&stream_));
  }
  DLOG(INFO) << "New " << (high_priority ? "high priority " : "") << "stream "
      << stream_ << " on device " << current_device() << ", thread " << std::this_thread::get_id();
}
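/*
 * Note on the priority values used above: cudaDeviceGetStreamPriorityRange
 * returns the numerically largest (least urgent) and numerically smallest
 * (most urgent) values, and lower numbers mean higher priority, so passing
 * greatestPriority requests the highest priority the device supports.  A
 * minimal standalone sketch of the same pattern, assuming the CUDA_CHECK
 * macro from the snippet above; the function name is illustrative.
 */
cudaStream_t make_high_priority_stream() {
  int least = 0, greatest = 0;
  CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&least, &greatest));
  cudaStream_t stream;
  // cudaStreamNonBlocking avoids implicit synchronization with the default stream.
  CUDA_CHECK(cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, greatest));
  return stream;
}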
Example #18
/**
 * @brief This allocates and initializes all the relevant data buffers before the Jacobi run
 *
 * @param[in]	topSizeY		The size of the topology in the Y direction
 * @param[in]	topIdxY			The Y index of the calling MPI process in the topology
 * @param[in]	domSize			The size of the local domain (for which only the current MPI process is responsible)
 * @param[in]	neighbors		The neighbor ranks, according to the topology
 * @param[in]	copyStream		The stream used to overlap top & bottom halo exchange with side halo copy to host memory
 * @param[out]	devBlocks		The 2 device blocks that will be updated during the Jacobi run
 * @param[out]	devSideEdges	The 2 side edges (parallel to the Y direction) that will hold the packed halo values before sending them
 * @param[out]	devHaloLines	The 2 halo lines (parallel to the Y direction) that will hold the packed halo values after receiving them
 * @param[out] 	hostSendLines	The 2 host send buffers that will be used during the halo exchange by the normal CUDA & MPI version
 * @param[out]	hostRecvLines	The 2 host receive buffers that will be used during the halo exchange by the normal CUDA & MPI version
 * @param[out]	devResidue		The global device residue, which will be updated after every Jacobi iteration
 */
void InitializeDataChunk(int topSizeY, int topIdxY, const int2 * domSize, const int * neighbors, cudaStream_t * copyStream, 
		real * devBlocks[2], real * devSideEdges[2], real * devHaloLines[2], real * hostSendLines[2], real * hostRecvLines[2], real ** devResidue)
{
	const real PI = (real)3.1415926535897932384626;
	const real E_M_PI = (real)exp(-PI);
	
	size_t blockBytes = (domSize->x + 2) * (domSize->y + 2) * sizeof(real);
	size_t sideLineBytes = domSize->y * sizeof(real);
	int2 borderBounds = make_int2(topIdxY * domSize->y, (topIdxY + 1) * domSize->y);
	int borderSpan = domSize->y * topSizeY - 1;
	real * hostBlock = SafeHostAlloc(blockBytes);

	// Clearing the block also sets the boundary conditions for top and bottom edges to 0
	memset(hostBlock, 0, blockBytes);

	InitExchangeBuffers(hostSendLines, hostRecvLines, 0, domSize->x * sizeof(real));
	InitExchangeBuffers(hostSendLines, hostRecvLines, 1, sideLineBytes);

	// Set the boundary conditions for the left edge
	if (!HasNeighbor(neighbors, DIR_LEFT))
	{
		for (int j = borderBounds.x, idx = domSize->x + 3; j < borderBounds.y; ++j, idx += domSize->x + 2)
		{
			hostBlock[idx] = (real)sin(PI * j / borderSpan);
		}
	}

	// Set the boundary conditions for the right edge
	if (!HasNeighbor(neighbors, DIR_RIGHT))
	{
		for (int j = borderBounds.x, idx = ((domSize->x + 2) << 1) - 2; j < borderBounds.y; ++j, idx += domSize->x + 2)
		{
			hostBlock[idx] = (real)sin(PI * j / borderSpan) * E_M_PI;
		}
	}

	// Perform device memory allocation and initialization
	for (int i = 0; i < 2; ++i)
	{
		SafeCudaCall(cudaMalloc((void **)&devBlocks[i], blockBytes));
		SafeCudaCall(cudaMalloc((void **)&devSideEdges[i], sideLineBytes));	
		SafeCudaCall(cudaMalloc((void **)&devHaloLines[i], sideLineBytes));

		SafeCudaCall(cudaMemset(devSideEdges[i], 0, sideLineBytes));
	}

	SafeCudaCall(cudaMalloc((void **)devResidue, sizeof(real)));
	SafeCudaCall(cudaMemcpy(devBlocks[0], hostBlock, blockBytes, cudaMemcpyHostToDevice));
	SafeCudaCall(cudaMemcpy(devBlocks[1], devBlocks[0], blockBytes, cudaMemcpyDeviceToDevice));
	SafeCudaCall(cudaStreamCreate(copyStream));			

	SafeHostFree(hostBlock);
}
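/**
 * A matching cleanup sketch for the device buffers allocated above, assuming
 * the same SafeCudaCall helper; the host exchange buffers are assumed to be
 * released by the counterpart of InitExchangeBuffers, and the function name
 * FinalizeDataChunk is illustrative rather than part of the original code.
 */
void FinalizeDataChunk(cudaStream_t * copyStream, real * devBlocks[2], real * devSideEdges[2],
		real * devHaloLines[2], real * devResidue)
{
	for (int i = 0; i < 2; ++i)
	{
		SafeCudaCall(cudaFree(devBlocks[i]));
		SafeCudaCall(cudaFree(devSideEdges[i]));
		SafeCudaCall(cudaFree(devHaloLines[i]));
	}

	SafeCudaCall(cudaFree(devResidue));
	SafeCudaCall(cudaStreamDestroy(*copyStream));
}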
Example #19
void THCState_reserveStreams(THCState* state, int numStreams)
{
  if (numStreams <= state->numUserStreams)
  {
    return;
  }

  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  /* Otherwise, we have to allocate a new set of streams and stream data */
  for (int dev = 0; dev < state->numDevices; ++dev) {
    THCudaCheck(cudaSetDevice(dev));

    /* +1 for the default stream as well */
    cudaStream_t* newStreams =
      (cudaStream_t*) malloc((numStreams + 1) * sizeof(cudaStream_t));

    void** newScratchSpace =
      (void**) malloc((numStreams + 1) * sizeof(void*));

    /* Copy over old stream data
       (0 is default stream, 1 ... numUserStreams are rest) */
    for (int stream = 0; stream <= state->numUserStreams; ++stream) {
      newStreams[stream] =
        THCState_getDeviceStream(state, dev, stream);
      newScratchSpace[stream] =
        THCState_getDeviceScratchSpace(state, dev, stream);
    }

    /* Allocate new stream resources */
    size_t scratchSpaceSize = THCState_getDeviceScratchSpaceSize(state, dev);

    for (int stream = state->numUserStreams + 1; stream <= numStreams; ++stream) {
      newStreams[stream] = NULL;
      THCudaCheck(cudaStreamCreate(newStreams + stream));
      newScratchSpace[stream] = NULL;
      THCudaCheck(THCudaMalloc(state, &newScratchSpace[stream], scratchSpaceSize));
    }

    THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);
    free(res->streams);
    res->streams = newStreams;
    free(res->devScratchSpacePerStream);
    res->devScratchSpacePerStream = newScratchSpace;
  }

  state->numUserStreams = numStreams;

  THCudaCheck(cudaSetDevice(prevDev));
}
/**
 * This test only performs copies in valid directions as to avoid upsetting
 * Valgrind.  The error-causing tests are in test_memcpy.cu.
 */
TEST(MemcpyAsync, AllDirections) {
    cudaError_t ret;
    cudaStream_t stream;

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    int a1 = 0;
    int a2 = 0;
    int * b;
    ret = cudaMalloc((void**) &b, sizeof(*b) * 2);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(&a1, &a2, sizeof(a1),
        cudaMemcpyHostToHost, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(&a1, b + 0, sizeof(a1),
        cudaMemcpyDeviceToHost, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(&a1, b + 1, sizeof(a1),
        cudaMemcpyDeviceToHost, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(b + 0, &a1, sizeof(a1),
        cudaMemcpyHostToDevice, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(b + 1, &a1, sizeof(a1),
        cudaMemcpyHostToDevice, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(b + 0, b + 0, sizeof(a1),
        cudaMemcpyDeviceToDevice, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(b + 1, b + 1, sizeof(a1),
        cudaMemcpyDeviceToDevice, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaFree(b);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}
TEST(StreamSynchronize, InvalidStream) {
    ::testing::FLAGS_gtest_death_test_style = "threadsafe";

    cudaStream_t stream;
    cudaError_t ret;

    /* The CUDA 5.0 driver no longer segfaults. */
    int driver;
    ret = cudaDriverGetVersion(&driver);
    ASSERT_EQ(cudaSuccess, ret);

    if (driver >= 5000) {
        ret = cudaStreamCreate(&stream);
        ASSERT_EQ(cudaSuccess, ret);

        ret = cudaStreamDestroy(stream);
        ASSERT_EQ(cudaSuccess, ret);

        ret = cudaStreamSynchronize(stream);
        EXPECT_EQ(cudaErrorUnknown, ret);
    } else {
        /**
         * Without this compound statement, EXPECT_EXIT fails.
         * Without the EXPECT_EXIT wrapper, SIGSEGV happens.
         *
         * From appearances, it seems that gtest does not properly execute both
         * cudaStreamCreate and cudaStreamDestroy on stream when entering the
         * death test for this lone statement as to properly "initialize"
         * stream.
         */
        EXPECT_EXIT(
            {
                cudaStreamCreate(&stream);
                cudaStreamDestroy(stream);
                cudaStreamSynchronize(stream);
            },
            ::testing::KilledBySignal(SIGSEGV), "");
    }
}
Example #22
void
init_arrays(double target_time)
{

    if (DEBUG) fprintf(stderr, "called init_arrays with target_time = %f \n",
            (target_time * 1e6));

#ifdef _ENABLE_CUDA_KERNEL_
    if (options.target == gpu || options.target == both) {
        /* Setting size of arrays for Dummy Compute */
        int N = options.device_array_size;

        /* Device Arrays for Dummy Compute */
        allocate_device_arrays(N);

        double t1 = 0.0, t2 = 0.0;

        while (1) {
            t1 = MPI_Wtime();

            if (options.target == gpu || options.target == both) {
                cudaStreamCreate(&stream);
                call_kernel(A, d_x, d_y, N, &stream);

                cudaDeviceSynchronize();
                cudaStreamDestroy(stream);
            }

            t2 = MPI_Wtime();
            if ((t2 - t1) < target_time) {
                N += 32;

                /* Now allocate arrays of size N */
                allocate_device_arrays(N);
            } else {
                break;
            }
        }

        /* We reach here with the desired N, so save it and pass it to options */
        options.device_array_size = N;
        if (DEBUG) fprintf(stderr, "correct N = %d\n", N);
    }
#endif

}
void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
  // Initialize CUDA streams and cuDNN.
  stream_         = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_         = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  workspaceSizeInBytes = 0;
  workspace = NULL;

  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
  }

  // Set the indexing parameters.
  weight_offset_ = (this->num_output_ / this->group_)
      * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_;
  bias_offset_ = (this->num_output_ / this->group_);

  // Create filter descriptor.
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      this->num_output_ / this->group_, this->channels_ / this->group_,
      this->kernel_h_, this->kernel_w_);

  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int i = 0; i < bottom.size(); i++) {
    cudnnTensorDescriptor_t bottom_desc;
    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);
    cudnnTensorDescriptor_t top_desc;
    cudnn::createTensor4dDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);
    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }

  handles_setup_ = true;
}
TEST(MemcpyAsync, Validity) {
    cudaError_t ret;
    cudaStream_t stream;

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    int * device_ptr, src = 0, vsrc, dst, vdst;

    ret = cudaMalloc((void **) &device_ptr, sizeof(*device_ptr));
    ASSERT_EQ(cudaSuccess, ret);

    /* Only src is valid; *device_ptr and dst are invalid. */

    /* Do transfer */
    ret = cudaMemcpyAsync(device_ptr, &src, sizeof(src),
        cudaMemcpyHostToDevice, stream);
    ASSERT_EQ(cudaSuccess, ret);

    /* Both src and *device_ptr are valid; dst is invalid */
    ret = cudaMemcpyAsync(&dst, device_ptr, sizeof(dst),
        cudaMemcpyDeviceToHost, stream);
    ASSERT_EQ(cudaSuccess, ret);

    /* The copy into dst is asynchronous, so synchronize before reading it. */
    ret = cudaStreamSynchronize(stream);
    ASSERT_EQ(cudaSuccess, ret);

    EXPECT_EQ(src, dst);

    int valgrind = VALGRIND_GET_VBITS(&src, &vsrc, sizeof(src));
    assert(valgrind == 0 || valgrind == 1);

    if (valgrind == 1) {
        valgrind = VALGRIND_GET_VBITS(&dst, &vdst, sizeof(dst));
        assert(valgrind == 1);

        EXPECT_EQ(vsrc, vdst);
    }

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaFree(device_ptr);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);
}
Example #25
void THCState_resetStreams(THCState* state, int device)
{
  if (state->currentStream !=
      state->streamsPerDevice[device][state->currentPerDeviceStream]) {
    THError("Unexpected stream state");
  }

  /* Reallocate all streams for the current device; the 0 stream
     doesn't need updating */
  for (int dev = 0; dev < state->numDevices; ++dev) {
    for (int stream = 1; stream <= state->numUserStreams; ++stream) {
      THCudaCheck(cudaStreamCreate(&state->streamsPerDevice[dev][stream]));
    }
  }

  state->currentStream =
    state->streamsPerDevice[device][state->currentPerDeviceStream];
}
Example #26
void THCState_reserveStreams(THCState* state, int numStreams)
{
  if (numStreams <= state->numUserStreams)
  {
    return;
  }

  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  /* Otherwise, we have to allocate a new set of streams */
  cudaStream_t** newStreams =
    (cudaStream_t**) malloc(state->numDevices * sizeof(cudaStream_t*));
  for (int dev = 0; dev < state->numDevices; ++dev) {
    THCudaCheck(cudaSetDevice(dev));

    /* +1 for the default stream as well */
    newStreams[dev] =
      (cudaStream_t*) malloc((numStreams + 1) * sizeof(cudaStream_t));

    /* Copy over old streams
       (0 is default stream, 1 ... numUserStreams are rest) */
    for (int stream = 0; stream <= state->numUserStreams; ++stream) {
      newStreams[dev][stream] = state->streamsPerDevice[dev][stream];
    }

    /* Allocate new streams */
    for (int stream = state->numUserStreams + 1; stream <= numStreams; ++stream) {
      newStreams[dev][stream] = NULL;
      THCudaCheck(cudaStreamCreate(&newStreams[dev][stream]));
    }
  }

  cudaStream_t** oldStreams = state->streamsPerDevice;
  state->streamsPerDevice = newStreams;
  state->numUserStreams = numStreams;

  for (int dev = 0; dev < state->numDevices; ++dev) {
    free(oldStreams[dev]);
  }
  free(oldStreams);

  THCudaCheck(cudaSetDevice(prevDev));
}
 ff_stencilReduceCUDA(const taskT &task, size_t maxIter_ = 1, Tout identityValue_ = Tout()) :
     oneShot(&task), identityValue(identityValue_), iter(0), maxIter(maxIter_) {
     maxThreads = maxBlocks = 0;
     oldSize_in = oldSize_out = oldSize_env1 = oldSize_env2 = oldSize_env3 = oldSize_env4 = oldSize_env5 = oldSize_env6 = 0;
     deviceID=-1;
     stream = NULL;
     kernelMap = new TkernelMap();
     kernelReduce = new TkernelReduce();
     hostReduce = new ThostReduce();
     assert(kernelMap != NULL && kernelReduce != NULL && hostReduce != NULL);
     in_buffer   = NULL;   out_buffer  = NULL; 
     env1_buffer = NULL;   env2_buffer = NULL;
     env3_buffer = NULL;   env4_buffer = NULL;
     env5_buffer = NULL;   env6_buffer = NULL;
     Task.setTask((void *)&task);
     
     if (cudaStreamCreate(&stream) != cudaSuccess)
         error("mapCUDA, error creating stream\n");
 }
void
do_compute_gpu(double seconds)
{
    double time_elapsed = 0.0, t1 = 0.0, t2 = 0.0;

    {
        t1 = MPI_Wtime();

        /* Execute Dummy Kernel on GPU if set by user */
        if (options.target == both || options.target == gpu) {
            {
                cudaStreamCreate(&stream);
                call_kernel(A, d_x, d_y, options.device_array_size, &stream);
            }
        }

        t2 = MPI_Wtime();
        time_elapsed += (t2-t1);
    }
}
Example #29
GpuDevice::GpuDevice(uint64_t device_id, DeviceListener* l, int gpu_id) : ThreadedDevice(device_id, l, kParallelism), device_(gpu_id) {
  CUDA_CALL(cudaSetDevice(device_));
  cudaFree(0);  // Initialize
  auto allocator = [this](size_t len) -> void* {
    void* ret;
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaMalloc(&ret, len));
    return ret;
  };
  auto deallocator = [this](void* ptr) {
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaFree(ptr));
  };
  data_store_ = new PooledDataStore(DEFAULT_POOL_SIZE, allocator, deallocator);
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDA_CALL(cudaStreamCreate(&stream_[i]));
    CUBLAS_CALL(cublasCreate(&cublas_handle_[i]));
    CUBLAS_CALL(cublasSetStream(cublas_handle_[i], stream_[i]));
    CUDNN_CALL(cudnnCreate(&cudnn_handle_[i]));
    CUDNN_CALL(cudnnSetStream(cudnn_handle_[i], stream_[i]));
  }
}
Example #30
void Caffe::SetSlaveDevice(const int slave_device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == slave_device_id) {
    return;
  }
  if (Get().slave_cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().slave_cublas_handle_));
  if (Get().slave_curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().slave_curand_generator_));
  }
  CUDA_CHECK(cudaSetDevice(slave_device_id));
  CUDA_CHECK(cudaStreamCreate (&Get().slave_cu_stream_));
  CUBLAS_CHECK(cublasCreate(&Get().slave_cublas_handle_));
  CUBLAS_CHECK(cublasSetStream(Get().slave_cublas_handle_, Get().slave_cu_stream_));
  CURAND_CHECK(curandCreateGenerator(&Get().slave_curand_generator_,
      CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().slave_curand_generator_,
      cluster_seedgen()));
  Get().slave_device_id_ = slave_device_id;
  CUDA_CHECK(cudaSetDevice(current_device));
  Caffe::set_gpu_mode(Caffe::MASTER_SLAVE);
}