TEST(StreamQuery, InvalidStream) {
    ::testing::FLAGS_gtest_death_test_style = "threadsafe";

    cudaError_t ret;
    cudaStream_t stream;

    /* The CUDA 5.0 driver no longer segfaults. */
    int driver;
    ret = cudaDriverGetVersion(&driver);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);

    if (driver >= 5000) {
        ret = cudaStreamQuery(stream);
        EXPECT_EQ(cudaErrorUnknown, ret);
    } else {
        EXPECT_EXIT({
            cudaStreamQuery(stream); },
            ::testing::KilledBySignal(SIGSEGV), "");
    }
}
TEST_P(MemcpyAsync, H2DTransfers) {
    const size_t param = GetParam();
    const size_t alloc = 1 << param;

    cudaError_t ret;
    void *d1, *h1;
    ret = cudaMalloc(&d1, alloc);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaHostAlloc(&h1, alloc, cudaHostAllocMapped);
    ASSERT_EQ(cudaSuccess, ret);

    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(d1, h1, alloc, cudaMemcpyHostToDevice, stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(d1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFreeHost(h1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);
}
void cuda_initialize() {
    CUDA_CHECK(cudaStreamCreate(&g_context.stream));
    CUBLAS_CHECK(cublasCreate_v2(&g_context.cublas_handle));
    CUBLAS_CHECK(cublasSetStream(g_context.cublas_handle, g_context.stream));
    // CUDNN_CHECK(cudnnCreate(&g_context.cudnn_handle));
    // CUDNN_CHECK(cudnnSetStream(g_context.cudnn_handle, g_context.stream));
}
TEST_P(MemcpyAsync, D2DTransfers) {
    const size_t param = GetParam();
    const size_t alloc = 1 << param;

    cudaError_t ret;
    void *d1, *d2;
    ret = cudaMalloc(&d1, alloc);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMalloc(&d2, alloc);
    ASSERT_EQ(cudaSuccess, ret);

    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(d2, d1, alloc, cudaMemcpyDeviceToDevice, stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(d1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(d2);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);
}
/**
 * CUDA 4.0 introduced the cudaMemcpyDefault direction for cudaMemcpy, which
 * infers the transfer direction from the pointer values.
 */
TEST(MemcpyAsync, CheckDefaultDirection) {
    cudaError_t ret;
    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    int a1 = 0;
    int a2 = 0;
    int * b;
    ret = cudaMalloc((void**) &b, sizeof(*b));
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(&a1, &a2, sizeof(a1), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(&a1, b, sizeof(a1), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(b, &a1, sizeof(a1), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(b, b, sizeof(a1), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaFree(b);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}
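/*
 * A minimal illustrative sketch, not part of the original test suite:
 * cudaMemcpyDefault requires unified virtual addressing (UVA), so a fixture
 * could guard on cudaDeviceProp::unifiedAddressing before exercising the test
 * above. The helper name is hypothetical.
 */
static bool default_direction_supported(int device) {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, device) != cudaSuccess) {
        return false;
    }
    return prop.unifiedAddressing != 0; /* nonzero when UVA is available */
}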
TEST(MemcpyAsync, CheckReturnValues) {
    /**
     * The API documentation states that cudaErrorInvalidDevicePointer is a
     * valid return value for cudaMemcpyAsync.
     *
     * TODO: This needs a test.
     */
    cudaError_t ret;
    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    /* Test woefully out-of-range directions. */
    int a = 0;
    ret = cudaMemcpyAsync(&a, &a, sizeof(a), (cudaMemcpyKind) -1, stream);
    EXPECT_EQ(cudaErrorInvalidMemcpyDirection, ret);

    ret = cudaMemcpyAsync(NULL, NULL, sizeof(a), (cudaMemcpyKind) -1, stream);
    EXPECT_EQ(cudaErrorInvalidMemcpyDirection, ret);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}
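/*
 * A hypothetical sketch of the test the TODO above asks for: free a device
 * allocation, then use the stale pointer as a copy source. The documentation
 * lists cudaErrorInvalidDevicePointer as a possible return value, but whether
 * a given driver actually reports it is uncertain, hence the DISABLED_ prefix.
 */
TEST(MemcpyAsync, DISABLED_InvalidDevicePointer) {
    cudaError_t ret;
    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    int * device_ptr;
    ret = cudaMalloc((void **) &device_ptr, sizeof(*device_ptr));
    ASSERT_EQ(cudaSuccess, ret);
    ret = cudaFree(device_ptr);
    ASSERT_EQ(cudaSuccess, ret);

    int host = 0;
    ret = cudaMemcpyAsync(&host, device_ptr, sizeof(host),
        cudaMemcpyDeviceToHost, stream);
    EXPECT_EQ(cudaErrorInvalidDevicePointer, ret);

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}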
void PathPlanner::initialize()
{
    qDebug() << __PRETTY_FUNCTION__;

    Q_ASSERT(!mIsInitialized);
    Q_ASSERT(mPointCloudColliders != nullptr);

    const quint32 numberOfCells = mParametersPathPlanner.grid.getCellCount();

    // The occupancy grid will be built and dilated in this memory
    mVboGridOccupancy = OpenGlUtilities::createVbo(numberOfCells * sizeof(quint8));
    cudaSafeCall(cudaGraphicsGLRegisterBuffer(&mCudaVboResourceGridOccupancyTemplate, mVboGridOccupancy, cudaGraphicsMapFlagsNone));
    checkAndMapGridOccupancy(mCudaVboResourceGridOccupancyTemplate);
    cudaMemset(mGridOccupancyTemplate, 0, numberOfCells);
    checkAndUnmapGridOccupancy(mCudaVboResourceGridOccupancyTemplate);

    // The (dilated) occupancy grid will be copied in here, then the pathplanner
    // fills it. Separate memories enable re-use of the pre-built occupancy grid.
    mVboGridPathPlanner = OpenGlUtilities::createVbo(numberOfCells * sizeof(quint8));
    cudaSafeCall(cudaGraphicsGLRegisterBuffer(&mCudaVboResourceGridPathPlanner, mVboGridPathPlanner, cudaGraphicsMapFlagsNone));

    cudaSafeCall(cudaMalloc((void**)&mDeviceWaypoints, mMaxWaypoints * 4 * sizeof(float)));

    cudaSafeCall(cudaStreamCreate(&mCudaStream));

    alignPathPlannerGridToColliderCloud();

    mIsInitialized = true;
}
TEST(EventRecord, RecordAfterDestroy) {
    ::testing::FLAGS_gtest_death_test_style = "threadsafe";

    cudaError_t ret;
    cudaEvent_t event;
    cudaStream_t stream;

    ret = cudaEventCreate(&event);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaEventDestroy(event);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

#if CUDART_VERSION >= 5000
    ret = cudaEventRecord(event, stream);
    EXPECT_EQ(cudaErrorUnknown, ret);
#else
    EXPECT_EXIT(
        cudaEventRecord(event, stream),
        ::testing::KilledBySignal(SIGSEGV), "");
#endif

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}
void blasx_resource_init(int GPUs, cublasHandle_t* handles, cudaStream_t* streams,
                         cudaEvent_t* events, void** C_dev, int floatType_id)
{
    /* C_dev holds float*, double*, or cuDoubleComplex* buffers; the element
     * type is selected by floatType_id when sizing the cudaMalloc calls below. */
    int GPU_id = 0;
    for (GPU_id = 0; GPU_id < GPUs; GPU_id++) {
        assert( cudaSetDevice(GPU_id) == cudaSuccess );

        /* Create handles. */
        assert( cublasCreate(&handles[GPU_id]) == CUBLAS_STATUS_SUCCESS );

        /* Create streams and events. */
        int i = 0;
        for (i = 0; i < STREAMNUM; i++) {
            assert( cudaStreamCreate(&streams[i+GPU_id*STREAMNUM]) == cudaSuccess );
            assert( cudaEventCreateWithFlags(&events[i+GPU_id*STREAMNUM], cudaEventDisableTiming) == cudaSuccess );
        }

        /* Create C_dev scratch buffers, two per stream. */
        for (i = 0; i < STREAMNUM*2; i++) {
            if (floatType_id == 0) {
                assert( cudaMalloc((void**)&C_dev[i+GPU_id*STREAMNUM*2], sizeof(float)*BLOCKSIZE_SGEMM*BLOCKSIZE_SGEMM) == cudaSuccess );
            } else if (floatType_id == 1) {
                assert( cudaMalloc((void**)&C_dev[i+GPU_id*STREAMNUM*2], sizeof(double)*BLOCKSIZE_DGEMM*BLOCKSIZE_DGEMM) == cudaSuccess );
            } else {
                assert( cudaMalloc((void**)&C_dev[i+GPU_id*STREAMNUM*2], sizeof(cuDoubleComplex)*BLOCKSIZE_ZGEMM*BLOCKSIZE_ZGEMM) == cudaSuccess );
            }
        }
    }
}
ContextPtr CudaDevice::CreateStream(bool stream, CudaAlloc* alloc) {
    ContextPtr context(new CudaContext);

    // Use the caller-provided allocator if one was given; otherwise fall back
    // to the default allocator.
    context->SetAllocator(alloc ? alloc : CreateDefaultAlloc().get());

    // Create a stream.
    if (stream) cudaStreamCreate(&context->_stream);

    return context;
}
void BilateralFilterLayer<Dtype>::cudastream_init() {
#ifndef CPU_ONLY
    if (stream_ == NULL) {
        stream_ = new cudaStream_t;
        CUDA_CHECK(cudaStreamCreate(stream_));
    }
#endif
}
TEST(MemcpyAsync, Pinned) {
    /**
     * Host memory must be pinned in order to be used as an argument to
     * cudaMemcpyAsync. Panoptes only prints a warning about this error rather
     * than actually returning one via the CUDA API. This test checks for the
     * absence of an error, so it will flag the day the CUDA implementation
     * starts returning one for nonpinned host memory.
     */
    const long page_size_ = sysconf(_SC_PAGESIZE);
    ASSERT_LT(0, page_size_);
    const size_t page_size = page_size_;

    const size_t pages = 3;
    assert(pages > 0);

    cudaError_t ret;
    cudaStream_t stream;
    uint8_t *device_ptr, *host_ptr;

    ret = cudaMalloc((void **) &device_ptr, pages * page_size);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMallocHost((void **) &host_ptr, pages * page_size);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    /* Page-aligned transfers. */
    for (size_t i = 0; i < pages; i++) {
        for (size_t j = i; j < pages; j++) {
            ret = cudaMemcpyAsync(device_ptr, host_ptr + i * page_size,
                (pages - j) * page_size, cudaMemcpyHostToDevice, stream);
            EXPECT_EQ(cudaSuccess, ret);

            ret = cudaMemcpyAsync(host_ptr + i * page_size, device_ptr,
                (pages - j) * page_size, cudaMemcpyDeviceToHost, stream);
            EXPECT_EQ(cudaSuccess, ret);
        }
    }

    /* Try a nonaligned transfer. */
    ret = cudaMemcpyAsync(device_ptr, host_ptr + (page_size / 2),
        page_size / 2, cudaMemcpyHostToDevice, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFreeHost(host_ptr);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(device_ptr);
    ASSERT_EQ(cudaSuccess, ret);
}
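/*
 * A minimal sketch, not from the original tests: pageable host memory can be
 * pinned after allocation with cudaHostRegister, making it a legal argument
 * for truly asynchronous cudaMemcpyAsync transfers. The helper name is an
 * assumption for illustration.
 */
static cudaError_t pin_and_copy(void * device_ptr, void * host_ptr,
        size_t bytes, cudaStream_t stream) {
    cudaError_t ret = cudaHostRegister(host_ptr, bytes, cudaHostRegisterDefault);
    if (ret != cudaSuccess) {
        return ret;
    }

    ret = cudaMemcpyAsync(device_ptr, host_ptr, bytes, cudaMemcpyHostToDevice,
        stream);

    /* The pages stay pinned until cudaHostUnregister, so synchronize first. */
    cudaError_t sync = cudaStreamSynchronize(stream);
    cudaHostUnregister(host_ptr);
    return ret != cudaSuccess ? ret : sync;
}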
JNIEXPORT jdouble JNICALL Java_org_apache_spark_mllib_classification_LogisticRegressionNative_predictPoint
  (JNIEnv *env, jobject obj, jdoubleArray data, jdoubleArray weights, jdouble intercept)
{
    // The kernel is written to take multiple data sets and produce a set of
    // results, but we're going to run it as multiple parallel kernels, each
    // producing a single result instead.
    double *d_dataBuffer, *d_weightsBuffer, *d_score;
    int dataCount, dataLen, whichGPU;
    jdouble h_score, *h_dataBuffer, *h_weightsBuffer;
    cudaStream_t stream;

    // Select a GPU for *this* specific dataset.
    whichGPU = get_gpu();
    checkCudaErrors(cudaSetDevice(whichGPU));
    checkCudaErrors(cudaStreamCreate(&stream));

    // Get pointers to the raw input data, pinning the arrays in memory.
    dataCount = env->GetArrayLength(data);
    dataLen = dataCount * sizeof(double);
    assert(dataCount == env->GetArrayLength(weights));
    h_dataBuffer = (jdouble*) env->GetPrimitiveArrayCritical(data, 0);
    h_weightsBuffer = (jdouble*) env->GetPrimitiveArrayCritical(weights, 0);

    // Copy input data to GPU memory.
    // TODO: It may be better to access host memory directly, skipping the copy. Investigate.
    checkCudaErrors(mallocBest((void**)&d_dataBuffer, dataLen));
    checkCudaErrors(mallocBest((void**)&d_weightsBuffer, dataLen));
    checkCudaErrors(cudaMemcpyAsync(d_dataBuffer, h_dataBuffer, dataLen, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_weightsBuffer, h_weightsBuffer, dataLen, cudaMemcpyHostToDevice, stream));

    // Synchronize before unpinning, and also because there is a
    // device-to-device transfer in predictKernelDevice.
    checkCudaErrors(cudaStreamSynchronize(stream));

    // Un-pin the host arrays, as we're done with them.
    env->ReleasePrimitiveArrayCritical(data, h_dataBuffer, 0);
    env->ReleasePrimitiveArrayCritical(weights, h_weightsBuffer, 0);

    // Allocate storage for the result.
    checkCudaErrors(mallocBest((void**)&d_score, sizeof(double)));

    // Run the kernel to produce a result.
    predictKernelDevice(d_dataBuffer, d_weightsBuffer, intercept, d_score, 1, dataCount, stream);
    checkCudaErrors(cudaStreamSynchronize(stream));

    // Copy the result back to the host.
    checkCudaErrors(cudaMemcpyAsync(&h_score, d_score, sizeof(double), cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    // Free the GPU buffers.
    checkCudaErrors(freeBest(d_dataBuffer));
    checkCudaErrors(freeBest(d_weightsBuffer));
    checkCudaErrors(freeBest(d_score));
    checkCudaErrors(cudaStreamDestroy(stream));

    return h_score;
}
GpuDevice::Impl::Impl(int d) : device(d) {
    ActivateDevice();
    for (size_t i = 0; i < kParallelism; ++i) {
        CUDA_CALL(cudaStreamCreate(&stream[i]));
        CUBLAS_CALL(cublasCreate(&cublas_handle[i]));
        CUBLAS_CALL(cublasSetStream(cublas_handle[i], stream[i]));
        CUDNN_CALL(cudnnCreate(&cudnn_handle[i]));
        CUDNN_CALL(cudnnSetStream(cudnn_handle[i], stream[i]));
    }
}
void SingleParticle2dx::Methods::CUDAProjectionMethod::prepareForProjections(SingleParticle2dx::DataStructures::ParticleContainer& cont)
{
    cudaSetDevice(getMyGPU());
    cudaStreamCreate(&m_stream);

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaExtent VS = make_cudaExtent(m_size, m_size, m_size);

    if ( m_alloc_done == false )
    {
        cudaMalloc3DArray(&m_cuArray, &channelDesc, VS);
    }

    SingleParticle2dx::real_array3d_type real_data( boost::extents[m_size][m_size][m_size] );
    m_context->getRealSpaceData(real_data);

    if ( m_alloc_done == false )
    {
        res_data_h = (float*)malloc(m_size*m_size*sizeof(float));
        cudaMalloc((void**)&res_data_d, m_size*m_size*sizeof(float));
        m_alloc_done = true;
    }

    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr = make_cudaPitchedPtr((void*)real_data.origin(), VS.width*sizeof(float), VS.width, VS.height);
    copyParams.dstArray = m_cuArray;
    copyParams.extent = VS;
    copyParams.kind = cudaMemcpyHostToDevice;

    // cudaMemcpy3D(&copyParams);
    cudaMemcpy3DAsync(&copyParams, m_stream);

    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = m_cuArray;

    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.addressMode[1] = cudaAddressModeClamp;
    texDesc.addressMode[2] = cudaAddressModeClamp;
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.readMode = cudaReadModeElementType;
    texDesc.normalizedCoords = 0;

    if ( m_alloc_done == true )
    {
        cudaDestroyTextureObject(m_texObj);
    }

    m_texObj = 0;
    cudaCreateTextureObject(&m_texObj, &resDesc, &texDesc, NULL);
}
BenchmarkContext(int device) : device_(device) {
    cudaError_t st = cudaSetDevice(device_);
    if (st != cudaSuccess)
        throw std::invalid_argument("could not set CUDA device");

    st = cudaStreamCreate(&stream_);
    if (st != cudaSuccess)
        throw std::invalid_argument("could not create CUDA stream");
}
CudaStream::CudaStream(bool high_priority) {
    if (high_priority) {
        // CUDA priorities are numerically inverted: the "greatest" priority is
        // the lowest value returned by cudaDeviceGetStreamPriorityRange.
        int leastPriority, greatestPriority;
        CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
        CUDA_CHECK(cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, greatestPriority));
    } else {
        CUDA_CHECK(cudaStreamCreate(&stream_));
    }
    DLOG(INFO) << "New " << (high_priority ? "high priority " : "") << "stream "
               << stream_ << " on device " << current_device()
               << ", thread " << std::this_thread::get_id();
}
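/*
 * Usage sketch, assuming the CudaStream declaration carries the default
 * argument (the `= false` belongs in the header, not the definition above):
 */
static void example_streams() {
    CudaStream compute_stream;       // default-priority stream
    CudaStream copy_stream(true);    // highest-priority stream the device offers
}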
/**
 * @brief Allocates and initializes all the relevant data buffers before the Jacobi run.
 *
 * @param[in]  topSizeY      The size of the topology in the Y direction
 * @param[in]  topIdxY       The Y index of the calling MPI process in the topology
 * @param[in]  domSize       The size of the local domain (for which only the current MPI process is responsible)
 * @param[in]  neighbors     The neighbor ranks, according to the topology
 * @param[in]  copyStream    The stream used to overlap top & bottom halo exchange with side halo copy to host memory
 * @param[out] devBlocks     The 2 device blocks that will be updated during the Jacobi run
 * @param[out] devSideEdges  The 2 side edges (parallel to the Y direction) that will hold the packed halo values before sending them
 * @param[out] devHaloLines  The 2 halo lines (parallel to the Y direction) that will hold the packed halo values after receiving them
 * @param[out] hostSendLines The 2 host send buffers that will be used during the halo exchange by the normal CUDA & MPI version
 * @param[out] hostRecvLines The 2 host receive buffers that will be used during the halo exchange by the normal CUDA & MPI version
 * @param[out] devResidue    The global device residue, which will be updated after every Jacobi iteration
 */
void InitializeDataChunk(int topSizeY, int topIdxY, const int2 * domSize, const int * neighbors,
    cudaStream_t * copyStream, real * devBlocks[2], real * devSideEdges[2], real * devHaloLines[2],
    real * hostSendLines[2], real * hostRecvLines[2], real ** devResidue)
{
    const real PI = (real)3.1415926535897932384626;
    const real E_M_PI = (real)exp(-PI);

    size_t blockBytes = (domSize->x + 2) * (domSize->y + 2) * sizeof(real);
    size_t sideLineBytes = domSize->y * sizeof(real);
    int2 borderBounds = make_int2(topIdxY * domSize->y, (topIdxY + 1) * domSize->y);
    int borderSpan = domSize->y * topSizeY - 1;
    real * hostBlock = SafeHostAlloc(blockBytes);

    // Clearing the block also sets the boundary conditions for the top and bottom edges to 0.
    memset(hostBlock, 0, blockBytes);
    InitExchangeBuffers(hostSendLines, hostRecvLines, 0, domSize->x * sizeof(real));
    InitExchangeBuffers(hostSendLines, hostRecvLines, 1, sideLineBytes);

    // Set the boundary conditions for the left edge.
    if (!HasNeighbor(neighbors, DIR_LEFT))
    {
        for (int j = borderBounds.x, idx = domSize->x + 3; j < borderBounds.y; ++j, idx += domSize->x + 2)
        {
            hostBlock[idx] = (real)sin(PI * j / borderSpan);
        }
    }

    // Set the boundary conditions for the right edge.
    if (!HasNeighbor(neighbors, DIR_RIGHT))
    {
        for (int j = borderBounds.x, idx = ((domSize->x + 2) << 1) - 2; j < borderBounds.y; ++j, idx += domSize->x + 2)
        {
            hostBlock[idx] = (real)sin(PI * j / borderSpan) * E_M_PI;
        }
    }

    // Perform device memory allocation and initialization.
    for (int i = 0; i < 2; ++i)
    {
        SafeCudaCall(cudaMalloc((void **)&devBlocks[i], blockBytes));
        SafeCudaCall(cudaMalloc((void **)&devSideEdges[i], sideLineBytes));
        SafeCudaCall(cudaMalloc((void **)&devHaloLines[i], sideLineBytes));
        SafeCudaCall(cudaMemset(devSideEdges[i], 0, sideLineBytes));
    }

    SafeCudaCall(cudaMalloc((void **)devResidue, sizeof(real)));
    SafeCudaCall(cudaMemcpy(devBlocks[0], hostBlock, blockBytes, cudaMemcpyHostToDevice));
    SafeCudaCall(cudaMemcpy(devBlocks[1], devBlocks[0], blockBytes, cudaMemcpyDeviceToDevice));
    SafeCudaCall(cudaStreamCreate(copyStream));
    SafeHostFree(hostBlock);
}
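/*
 * A hypothetical sketch of how copyStream is intended to be used: the packed
 * side halo is copied to the host on copyStream while the interior update runs
 * on another stream, overlapping communication with computation. The helper
 * name and arguments are illustrative, not from the original code.
 */
void CopySideHaloToHost(real * devSideEdge, real * hostSendLine,
    size_t sideLineBytes, cudaStream_t copyStream)
{
    /* Device-to-host copy of the packed halo; overlaps with work issued on
       other streams. */
    SafeCudaCall(cudaMemcpyAsync(hostSendLine, devSideEdge, sideLineBytes,
        cudaMemcpyDeviceToHost, copyStream));

    /* Block only the copy stream before handing the buffer to MPI. */
    SafeCudaCall(cudaStreamSynchronize(copyStream));
}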
void THCState_reserveStreams(THCState* state, int numStreams)
{
    if (numStreams <= state->numUserStreams)
    {
        return;
    }

    int prevDev = -1;
    THCudaCheck(cudaGetDevice(&prevDev));

    /* Otherwise, we have to allocate a new set of streams and stream data */
    for (int dev = 0; dev < state->numDevices; ++dev)
    {
        THCudaCheck(cudaSetDevice(dev));

        /* +1 for the default stream as well */
        cudaStream_t* newStreams = (cudaStream_t*) malloc((numStreams + 1) * sizeof(cudaStream_t));
        void** newScratchSpace = (void**) malloc((numStreams + 1) * sizeof(void*));

        /* Copy over old stream data (0 is the default stream, 1 ... numUserStreams are the rest) */
        for (int stream = 0; stream <= state->numUserStreams; ++stream)
        {
            newStreams[stream] = THCState_getDeviceStream(state, dev, stream);
            newScratchSpace[stream] = THCState_getDeviceScratchSpace(state, dev, stream);
        }

        /* Allocate new stream resources */
        size_t scratchSpaceSize = THCState_getDeviceScratchSpaceSize(state, dev);
        for (int stream = state->numUserStreams + 1; stream <= numStreams; ++stream)
        {
            newStreams[stream] = NULL;
            THCudaCheck(cudaStreamCreate(newStreams + stream));
            newScratchSpace[stream] = NULL;
            THCudaCheck(THCudaMalloc(state, &newScratchSpace[stream], scratchSpaceSize));
        }

        THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);
        free(res->streams);
        res->streams = newStreams;
        free(res->devScratchSpacePerStream);
        res->devScratchSpacePerStream = newScratchSpace;
    }

    state->numUserStreams = numStreams;
    THCudaCheck(cudaSetDevice(prevDev));
}
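/*
 * Usage sketch, relying only on the accessors shown above: index 0 always
 * refers to the default (NULL) stream, and user streams occupy indices
 * 1 ... numStreams.
 */
static void example_reserve(THCState* state)
{
    THCState_reserveStreams(state, 4);
    cudaStream_t third = THCState_getDeviceStream(state, 0 /* device */, 3 /* stream */);
    (void) third;
}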
/**
 * This test only performs copies in valid directions so as to avoid upsetting
 * Valgrind. The error-causing tests are in test_memcpy.cu.
 */
TEST(MemcpyAsync, AllDirections) {
    cudaError_t ret;
    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    int a1 = 0;
    int a2 = 0;
    int * b;
    ret = cudaMalloc((void**) &b, sizeof(*b) * 2);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(&a1, &a2, sizeof(a1), cudaMemcpyHostToHost, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(&a1, b + 0, sizeof(a1), cudaMemcpyDeviceToHost, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(&a1, b + 1, sizeof(a1), cudaMemcpyDeviceToHost, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(b + 0, &a1, sizeof(a1), cudaMemcpyHostToDevice, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(b + 1, &a1, sizeof(a1), cudaMemcpyHostToDevice, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(b + 0, b + 0, sizeof(a1), cudaMemcpyDeviceToDevice, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(b + 1, b + 1, sizeof(a1), cudaMemcpyDeviceToDevice, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaFree(b);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}
TEST(StreamSynchronize, InvalidStream) {
    ::testing::FLAGS_gtest_death_test_style = "threadsafe";

    cudaStream_t stream;
    cudaError_t ret;

    /* The CUDA 5.0 driver no longer segfaults. */
    int driver;
    ret = cudaDriverGetVersion(&driver);
    ASSERT_EQ(cudaSuccess, ret);

    if (driver >= 5000) {
        ret = cudaStreamCreate(&stream);
        ASSERT_EQ(cudaSuccess, ret);

        ret = cudaStreamDestroy(stream);
        ASSERT_EQ(cudaSuccess, ret);

        ret = cudaStreamSynchronize(stream);
        EXPECT_EQ(cudaErrorUnknown, ret);
    } else {
        /**
         * Without this compound statement, EXPECT_EXIT fails.
         * Without the EXPECT_EXIT wrapper, SIGSEGV happens.
         *
         * From appearances, it seems that gtest does not properly execute
         * both cudaStreamCreate and cudaStreamDestroy on stream when entering
         * the death test for this lone statement so as to properly
         * "initialize" stream.
         */
        EXPECT_EXIT({
            cudaStreamCreate(&stream);
            cudaStreamDestroy(stream);
            cudaStreamSynchronize(stream); },
            ::testing::KilledBySignal(SIGSEGV), "");
    }
}
void init_arrays(double target_time)
{
    if (DEBUG) {
        fprintf(stderr, "called init_arrays with target_time = %f \n",
                (target_time * 1e6));
    }

#ifdef _ENABLE_CUDA_KERNEL_
    if (options.target == gpu || options.target == both) {
        /* Setting size of arrays for Dummy Compute */
        int N = options.device_array_size;

        /* Device arrays for Dummy Compute */
        allocate_device_arrays(N);

        double t1 = 0.0, t2 = 0.0;

        /* Grow N until a single kernel launch takes at least target_time. */
        while (1) {
            t1 = MPI_Wtime();
            if (options.target == gpu || options.target == both) {
                cudaStreamCreate(&stream);
                call_kernel(A, d_x, d_y, N, &stream);
                cudaDeviceSynchronize();
                cudaStreamDestroy(stream);
            }
            t2 = MPI_Wtime();

            if ((t2 - t1) < target_time) {
                N += 32;
                /* Now allocate arrays of size N */
                allocate_device_arrays(N);
            } else {
                break;
            }
        }

        /* We reach here with the desired N, so save it and pass it to options. */
        options.device_array_size = N;
        if (DEBUG) {
            fprintf(stderr, "correct N = %d\n", N);
        }
    }
#endif
}
void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
  // Initialize CUDA streams and cuDNN.
  stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  workspaceSizeInBytes = 0;
  workspace = NULL;

  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
  }

  // Set the indexing parameters.
  weight_offset_ = (this->num_output_ / this->group_)
      * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_;
  bias_offset_ = (this->num_output_ / this->group_);

  // Create filter descriptor.
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      this->num_output_ / this->group_, this->channels_ / this->group_,
      this->kernel_h_, this->kernel_w_);

  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int i = 0; i < bottom.size(); i++) {
    cudnnTensorDescriptor_t bottom_desc;
    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);

    cudnnTensorDescriptor_t top_desc;
    cudnn::createTensor4dDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);

    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }

  handles_setup_ = true;
}
TEST(MemcpyAsync, Validity) {
    cudaError_t ret;
    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    int * device_ptr, src = 0, vsrc, dst, vdst;
    ret = cudaMalloc((void **) &device_ptr, sizeof(*device_ptr));
    ASSERT_EQ(cudaSuccess, ret);

    /* Only src is valid; *device_ptr and dst are invalid. */

    /* Do transfer. */
    ret = cudaMemcpyAsync(device_ptr, &src, sizeof(src),
        cudaMemcpyHostToDevice, stream);
    ASSERT_EQ(cudaSuccess, ret);

    /* Both src and *device_ptr are valid; dst is invalid. */
    ret = cudaMemcpyAsync(&dst, device_ptr, sizeof(dst),
        cudaMemcpyDeviceToHost, stream);
    ASSERT_EQ(cudaSuccess, ret);

    EXPECT_EQ(src, dst);

    int valgrind = VALGRIND_GET_VBITS(&src, &vsrc, sizeof(src));
    assert(valgrind == 0 || valgrind == 1);
    if (valgrind == 1) {
        valgrind = VALGRIND_GET_VBITS(&dst, &vdst, sizeof(dst));
        assert(valgrind == 1);
        EXPECT_EQ(vsrc, vdst);
    }

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaFree(device_ptr);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);
}
void THCState_resetStreams(THCState* state, int device)
{
    if (state->currentStream !=
        state->streamsPerDevice[device][state->currentPerDeviceStream])
    {
        THError("Unexpected stream state");
    }

    int prevDev = -1;
    THCudaCheck(cudaGetDevice(&prevDev));

    /* Reallocate the user streams on every device; stream 0 is the default
       stream and doesn't need updating. Each device must be current while its
       streams are created. */
    for (int dev = 0; dev < state->numDevices; ++dev)
    {
        THCudaCheck(cudaSetDevice(dev));
        for (int stream = 1; stream <= state->numUserStreams; ++stream)
        {
            THCudaCheck(cudaStreamCreate(&state->streamsPerDevice[dev][stream]));
        }
    }

    THCudaCheck(cudaSetDevice(prevDev));
    state->currentStream =
        state->streamsPerDevice[device][state->currentPerDeviceStream];
}
void THCState_reserveStreams(THCState* state, int numStreams)
{
    if (numStreams <= state->numUserStreams)
    {
        return;
    }

    int prevDev = -1;
    THCudaCheck(cudaGetDevice(&prevDev));

    /* Otherwise, we have to allocate a new set of streams */
    cudaStream_t** newStreams =
        (cudaStream_t**) malloc(state->numDevices * sizeof(cudaStream_t*));

    for (int dev = 0; dev < state->numDevices; ++dev)
    {
        THCudaCheck(cudaSetDevice(dev));

        /* +1 for the default stream as well */
        newStreams[dev] = (cudaStream_t*) malloc((numStreams + 1) * sizeof(cudaStream_t));

        /* Copy over old streams (0 is the default stream, 1 ... numUserStreams are the rest) */
        for (int stream = 0; stream <= state->numUserStreams; ++stream)
        {
            newStreams[dev][stream] = state->streamsPerDevice[dev][stream];
        }

        /* Allocate new streams */
        for (int stream = state->numUserStreams + 1; stream <= numStreams; ++stream)
        {
            newStreams[dev][stream] = NULL;
            THCudaCheck(cudaStreamCreate(&newStreams[dev][stream]));
        }
    }

    cudaStream_t** oldStreams = state->streamsPerDevice;
    state->streamsPerDevice = newStreams;
    state->numUserStreams = numStreams;

    for (int dev = 0; dev < state->numDevices; ++dev)
    {
        free(oldStreams[dev]);
    }
    free(oldStreams);

    THCudaCheck(cudaSetDevice(prevDev));
}
ff_stencilReduceCUDA(const taskT &task, size_t maxIter_ = 1, Tout identityValue_ = Tout()) :
    oneShot(&task), identityValue(identityValue_), iter(0), maxIter(maxIter_)
{
    maxThreads = maxBlocks = 0;
    oldSize_in = oldSize_out = oldSize_env1 = oldSize_env2 = oldSize_env3 =
        oldSize_env4 = oldSize_env5 = oldSize_env6 = 0;
    deviceID = -1;
    stream = NULL;

    kernelMap = new TkernelMap();
    kernelReduce = new TkernelReduce();
    hostReduce = new ThostReduce();
    assert(kernelMap != NULL && kernelReduce != NULL && hostReduce != NULL);

    in_buffer = NULL;
    out_buffer = NULL;
    env1_buffer = NULL;
    env2_buffer = NULL;
    env3_buffer = NULL;
    env4_buffer = NULL;
    env5_buffer = NULL;
    env6_buffer = NULL;

    Task.setTask((void *)&task);

    if (cudaStreamCreate(&stream) != cudaSuccess)
        error("mapCUDA, error creating stream\n");
}
void do_compute_gpu(double seconds)
{
    double time_elapsed = 0.0, t1 = 0.0, t2 = 0.0;

    t1 = MPI_Wtime();

    /* Execute the dummy kernel on the GPU if set by the user; the launch is
       asynchronous and is not synchronized here. */
    if (options.target == both || options.target == gpu) {
        cudaStreamCreate(&stream);
        call_kernel(A, d_x, d_y, options.device_array_size, &stream);
    }

    t2 = MPI_Wtime();
    time_elapsed += (t2 - t1);
}
GpuDevice::GpuDevice(uint64_t device_id, DeviceListener* l, int gpu_id)
    : ThreadedDevice(device_id, l, kParallelism), device_(gpu_id) {
  CUDA_CALL(cudaSetDevice(device_));
  cudaFree(0);  // Initialize the device's primary context.
  auto allocator = [this](size_t len) -> void* {
    void* ret;
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaMalloc(&ret, len));
    return ret;
  };
  auto deallocator = [this](void* ptr) {
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaFree(ptr));
  };
  data_store_ = new PooledDataStore(DEFAULT_POOL_SIZE, allocator, deallocator);
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDA_CALL(cudaStreamCreate(&stream_[i]));
    CUBLAS_CALL(cublasCreate(&cublas_handle_[i]));
    CUBLAS_CALL(cublasSetStream(cublas_handle_[i], stream_[i]));
    CUDNN_CALL(cudnnCreate(&cudnn_handle_[i]));
    CUDNN_CALL(cudnnSetStream(cudnn_handle_[i], stream_[i]));
  }
}
void Caffe::SetSlaveDevice(const int slave_device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == slave_device_id) {
    return;
  }
  if (Get().slave_cublas_handle_)
    CUBLAS_CHECK(cublasDestroy(Get().slave_cublas_handle_));
  if (Get().slave_curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().slave_curand_generator_));
  }
  CUDA_CHECK(cudaSetDevice(slave_device_id));
  CUDA_CHECK(cudaStreamCreate(&Get().slave_cu_stream_));
  CUBLAS_CHECK(cublasCreate(&Get().slave_cublas_handle_));
  CUBLAS_CHECK(cublasSetStream(Get().slave_cublas_handle_, Get().slave_cu_stream_));
  CURAND_CHECK(curandCreateGenerator(&Get().slave_curand_generator_,
      CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().slave_curand_generator_,
      cluster_seedgen()));
  Get().slave_device_id_ = slave_device_id;
  CUDA_CHECK(cudaSetDevice(current_device));
  Caffe::set_gpu_mode(Caffe::MASTER_SLAVE);
}