/* GPUrand */
void GPUrand(const GPUtype &OUT) {
  // Initialized so that an unsupported type is reported as a generation error
  // instead of reading an indeterminate status.
  curandStatus_t status = CURAND_STATUS_TYPE_ERROR;

  gpuTYPE_t type = gm->gputype.getType(OUT);
  gm->gmat->control.cacheClean();

  const void *gpuptr = gm->gputype.getGPUptr(OUT); // pointer to GPU memory
  int numel = gm->gputype.getNumel(OUT);           // number of elements
  int datasize = gm->gputype.getDataSize(OUT);     // bytes per element

  gen = 0;

  // Init cuRAND. A recovery procedure (retry on error) is still to be
  // implemented.
  if (curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS) {
    mexErrMsgTxt(ERROR_CURAND_INIT);
  }
  //if (curandCreateGenerator(&gen, CURAND_RNG_QUASI_DEFAULT) != CURAND_STATUS_SUCCESS) {
  //  mexErrMsgTxt(ERROR_CURAND_INIT);
  //}

  // Seed: combine the wall clock with an incrementing counter so that calls
  // within the same second still get distinct seeds.
  seed++;
  if (curandSetPseudoRandomGeneratorSeed(gen, time(NULL) + seed) != CURAND_STATUS_SUCCESS) {
    mexErrMsgTxt(ERROR_CURAND_SEED);
  }

  // Complex types hold two real values per element, hence numel*2.
  if (type == gpuFLOAT) {
    status = curandGenerateUniform(gen, (float *) gpuptr, numel);
  } else if (type == gpuCFLOAT) {
    status = curandGenerateUniform(gen, (float *) gpuptr, numel * 2);
  } else if (type == gpuDOUBLE) {
    status = curandGenerateUniformDouble(gen, (double *) gpuptr, numel);
  } else if (type == gpuCDOUBLE) {
    status = curandGenerateUniformDouble(gen, (double *) gpuptr, numel * 2);
  }

  if (status != CURAND_STATUS_SUCCESS) {
    curandDestroyGenerator(gen);
    mexErrMsgTxt(ERROR_CURAND_GEN);
  }

  // Destroy the generator.
  if (curandDestroyGenerator(gen) != CURAND_STATUS_SUCCESS) {
    mexErrMsgTxt(ERROR_CURAND_DESTROY);
  }
}
void Caffe::SetDevice(const int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
  if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
  if (Get().cusparse_descr_) CUSPARSE_CHECK(cusparseDestroyMatDescr(Get().cusparse_descr_));
  if (Get().cusparse_handle_) CUSPARSE_CHECK(cusparseDestroy(Get().cusparse_handle_));
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  }
  CUSPARSE_CHECK(cusparseCreate(&Get().cusparse_handle_));
  CUSPARSE_CHECK(cusparseCreateMatDescr(&Get().cusparse_descr_));
  // cusparseSetMatType(cusparse_descr_, CUSPARSE_MATRIX_TYPE_GENERAL);
  // cusparseSetMatIndexBase(cusparse_descr_, CUSPARSE_INDEX_BASE_ZERO);
  LOG(INFO) << "set descr";
  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
                                     CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
                                                  cluster_seedgen()));
}
curand_generator::~curand_generator() {
  if (handle) {
    curandDestroyGenerator((curandGenerator_t) handle);
  }
}
Caffe::~Caffe() {
  if (cusparse_descr_) CUSPARSE_CHECK(cusparseDestroyMatDescr(cusparse_descr_));
  if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_));
  if (cusparse_handle_) CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_));
  if (curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(curand_generator_));
  }
}
void Caffe::set_random_seed(unsigned int seed) {
  CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
                                     CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, seed));
  VSL_CHECK(vslDeleteStream(&Get().vsl_stream_));
  VSL_CHECK(vslNewStream(&Get().vsl_stream_, VSL_BRNG_MT19937, seed));
}
Caffe::~Caffe() {
  if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_));
  if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_));
  if (vsl_stream_) VSL_CHECK(vslDeleteStream(&vsl_stream_));
}
WIE::Random::~Random() {
  assert(generator);
  curandDestroyGenerator(generator);
  generator = NULL;
  assert(samples);
  cudaFree(samples);
  samples = NULL;
}
Caffe::~Caffe() {
  // Make sure all device contexts and dependent memory blocks are freed properly.
  device_contexts_.clear();
#ifdef USE_CUDA
  if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_));
  if (curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(curand_generator_));
  }
#endif  // USE_CUDA
}
Caffe::~Caffe() {
  for (vector<cublasHandle_t>& group_cublas_handles : cublas_handles_) {
    for (cublasHandle_t h : group_cublas_handles) {
      if (h) {
        CUBLAS_CHECK(cublasDestroy(h));
      }
    }
  }
  for_each(curand_generators_.begin(), curand_generators_.end(),
           [](curandGenerator_t h) {
             if (h) {
               CURAND_CHECK(curandDestroyGenerator(h));
             }
           });
}
void Dragon::set_device(const int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) return;
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU. Switching devices invalidates
  // the cuBLAS handle and cuRAND generator, so both must be recreated.
  CUDA_CHECK(cudaSetDevice(device_id));
  if (Get().cublas_handle) cublasDestroy_v2(Get().cublas_handle);
  if (Get().curand_generator) curandDestroyGenerator(Get().curand_generator);
  cublasCreate_v2(&Get().cublas_handle);
  curandCreateGenerator(&Get().curand_generator, CURAND_RNG_PSEUDO_DEFAULT);
  curandSetPseudoRandomGeneratorSeed(Get().curand_generator, cluster_seedgen());
}
void Caffe::set_random_seed(const unsigned int seed) {
  // Curand seed
  // Yangqing's note: simply setting the generator seed does not seem to
  // work on the Tesla K20s, so I wrote the ugly reset thing below.
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(curand_generator()));
    CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
                                       CURAND_RNG_PSEUDO_DEFAULT));
    CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator(), seed));
  } else {
    LOG(ERROR) << "Curand not available. Skipping setting the curand seed.";
  }
  // RNG seed
  Get().random_generator_.reset(new RNG(seed));
}
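The destroy-and-recreate reseeding shown above can be factored into a standalone helper. The sketch below assumes the caller owns the generator handle; the reset_generator name is hypothetical, not part of Caffe:

#include <curand.h>

// Reseed by destroying and recreating the generator, mirroring the
// workaround above (per Yangqing's note, reseeding an existing generator
// did not take effect reliably on the Tesla K20).
curandStatus_t reset_generator(curandGenerator_t *gen, unsigned long long seed) {
  curandStatus_t status;
  if (*gen) {
    status = curandDestroyGenerator(*gen);
    if (status != CURAND_STATUS_SUCCESS) return status;
    *gen = 0;
  }
  status = curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT);
  if (status != CURAND_STATUS_SUCCESS) return status;
  return curandSetPseudoRandomGeneratorSeed(*gen, seed);
}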
void Caffe::SetDevice(const int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  }
  CUDA_CHECK(cudaSetDevice(device_id));
  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
                                     CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
                                                  cluster_seedgen()));
}
void Engine::SetDevice(const int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
  if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  }
  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
                                     CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
                                                  cluster_seedgen()));
}
void Caffe::SetSlaveDevice(const int slave_device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == slave_device_id) {
    return;
  }
  if (Get().slave_cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().slave_cublas_handle_));
  if (Get().slave_curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().slave_curand_generator_));
  }
  CUDA_CHECK(cudaSetDevice(slave_device_id));
  CUDA_CHECK(cudaStreamCreate(&Get().slave_cu_stream_));
  CUBLAS_CHECK(cublasCreate(&Get().slave_cublas_handle_));
  CUBLAS_CHECK(cublasSetStream(Get().slave_cublas_handle_, Get().slave_cu_stream_));
  CURAND_CHECK(curandCreateGenerator(&Get().slave_curand_generator_,
                                     CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().slave_curand_generator_,
                                                  cluster_seedgen()));
  Get().slave_device_id_ = slave_device_id;
  CUDA_CHECK(cudaSetDevice(current_device));
  Caffe::set_gpu_mode(Caffe::MASTER_SLAVE);
}
void Caffe::SetDevice(const int device_id) {
  std::vector<int> devices;
  devices.push_back(device_id);
  Caffe::SetDevices(devices);
  Get().default_device_context_ = GetDeviceContext(device_id);
  if (Get().default_device_context_->backend() == Backend::BACKEND_CUDA) {
#ifdef USE_CUDA
    int current_device;
    CUDA_CHECK(cudaGetDevice(&current_device));
    if (current_device == device_id) {
      return;
    }
    // The call to cudaSetDevice must come before any calls to Get, which
    // may perform initialization using the GPU.
    CUDA_CHECK(cudaSetDevice(device_id));
    if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
    if (Get().curand_generator_) {
      CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
    }
    CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
    CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
                                       CURAND_RNG_PSEUDO_DEFAULT));
    CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
                                                    cluster_seedgen()));
#endif  // USE_CUDA
  } else {
#ifdef USE_GREENTEA
#ifdef USE_CLBLAS
    clblasSetup();
#endif  // USE_CLBLAS
#endif  // USE_GREENTEA
  }
}
///////////////////////////////////////////////////////////////////////////////
// Main program
///////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  // Start logs
  shrQAStart(argc, argv);

  // Initialize the GPU, either identified by --device
  // or by picking the device with the highest flop rate.
  int devID = findCudaDevice(argc, (const char **)argv);

  // Parse the number of random numbers to generate.
  int rand_n = DEFAULT_RAND_N;
  if (checkCmdLineFlag(argc, (const char **)argv, "count")) {
    rand_n = getCmdLineArgumentInt(argc, (const char **)argv, "count");
  }
  printf("Allocating data for %i samples...\n", rand_n);

  // Parse the seed.
  int seed = DEFAULT_SEED;
  if (checkCmdLineFlag(argc, (const char **)argv, "seed")) {
    seed = getCmdLineArgumentInt(argc, (const char **)argv, "seed");
  }
  printf("Seeding with %i ...\n", seed);

  float *d_Rand;
  checkCudaErrors(cudaMalloc((void **)&d_Rand, rand_n * sizeof(float)));

  curandGenerator_t prngGPU;
  checkCurandErrors(curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32));
  checkCurandErrors(curandSetPseudoRandomGeneratorSeed(prngGPU, seed));

  curandGenerator_t prngCPU;
  checkCurandErrors(curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32));
  checkCurandErrors(curandSetPseudoRandomGeneratorSeed(prngCPU, seed));

  //
  // Example 1: Compare random numbers generated on GPU and CPU
  float *h_RandGPU = (float *)malloc(rand_n * sizeof(float));

  printf("Generating random numbers on GPU...\n\n");
  checkCurandErrors(curandGenerateUniform(prngGPU, (float *)d_Rand, rand_n));

  printf("\nReading back the results...\n");
  checkCudaErrors(cudaMemcpy(h_RandGPU, d_Rand, rand_n * sizeof(float),
                             cudaMemcpyDeviceToHost));

  float *h_RandCPU = (float *)malloc(rand_n * sizeof(float));

  printf("Generating random numbers on CPU...\n\n");
  checkCurandErrors(curandGenerateUniform(prngCPU, (float *)h_RandCPU, rand_n));

  printf("Comparing CPU/GPU random numbers...\n\n");
  float L1norm = compareResults(rand_n, h_RandGPU, h_RandCPU);

  //
  // Example 2: Timing of random number generation on GPU
  const int numIterations = 10;
  int i;
  StopWatchInterface *hTimer;

  checkCudaErrors(cudaDeviceSynchronize());
  sdkCreateTimer(&hTimer);
  sdkResetTimer(&hTimer);
  sdkStartTimer(&hTimer);

  for (i = 0; i < numIterations; i++) {
    checkCurandErrors(curandGenerateUniform(prngGPU, (float *)d_Rand, rand_n));
  }

  checkCudaErrors(cudaDeviceSynchronize());
  sdkStopTimer(&hTimer);

  double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer) / (double)numIterations;

  printf("MersenneTwister, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers\n",
         1.0e-9 * rand_n / gpuTime, gpuTime, rand_n);

  printf("Shutting down...\n");
  checkCurandErrors(curandDestroyGenerator(prngGPU));
  checkCurandErrors(curandDestroyGenerator(prngCPU));
  checkCudaErrors(cudaFree(d_Rand));
  sdkDeleteTimer(&hTimer);
  free(h_RandGPU);
  free(h_RandCPU);

  cudaDeviceReset();
  shrQAFinishExit(argc, (const char **)argv,
                  (L1norm < 1e-6) ? QA_PASSED : QA_FAILED);
}
Engine::~Engine() {
  if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_));
  if (curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(curand_generator_));
  }
}
/* GPUrandn */
void GPUrandn(const GPUtype &OUT) {
  // Initialized so that an unsupported type is reported as a generation error
  // instead of reading an indeterminate status.
  curandStatus_t status = CURAND_STATUS_TYPE_ERROR;

  gpuTYPE_t type = gm->gputype.getType(OUT);
  gm->gmat->control.cacheClean();

  const void *gpuptr = gm->gputype.getGPUptr(OUT); // pointer to GPU memory
  int numel = gm->gputype.getNumel(OUT);           // number of elements
  int datasize = gm->gputype.getDataSize(OUT);     // bytes per element

  gen = 0;

  // Init cuRAND. A recovery procedure (retry on error) is still to be
  // implemented.
  if (curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS) {
    mexErrMsgTxt(ERROR_CURAND_INIT);
  }
  //if (curandCreateGenerator(&gen, CURAND_RNG_QUASI_DEFAULT) != CURAND_STATUS_SUCCESS) {
  //  mexErrMsgTxt(ERROR_CURAND_INIT);
  //}

  // randn generation requires an even count. For odd counts the execution is
  // split in two parts: n-1 values are generated directly into the output,
  // and the last element is patched with one value from a scratch buffer.

  // Seed: combine the wall clock with an incrementing counter so that calls
  // within the same second still get distinct seeds.
  seed++;
  if (curandSetPseudoRandomGeneratorSeed(gen, time(NULL) + seed) != CURAND_STATUS_SUCCESS) {
    mexErrMsgTxt(ERROR_CURAND_SEED);
  }

  // Complex types hold two real values per element, hence numel*2 (always even).
  unsigned int n = 0;
  if (type == gpuFLOAT) {
    n = numel;
  } else if (type == gpuCFLOAT) {
    n = numel * 2;
  } else if (type == gpuDOUBLE) {
    n = numel;
  } else if (type == gpuCDOUBLE) {
    n = numel * 2;
  }

  unsigned int even = (n % 2) == 0;
  if (!even) {
    n = n - 1; // generate an even count first, then patch the last element
  }

  if (type == gpuFLOAT) {
    float mean = 0.0;
    float std = 1.0;
    status = curandGenerateNormal(gen, (float *) gpuptr, n, mean, std);
    if (!even) {
      // Generate 4 extra values into a scratch buffer and copy one of them
      // into the last (unfilled) element of the output array.
      float *devData;
      if ((cudaMalloc((void **)&devData, 4 * sizeof(float))) != cudaSuccess) {
        status = CURAND_STATUS_LAUNCH_FAILURE;
      } else {
        status = curandGenerateNormal(gen, devData, 4, mean, std);
        if (status == CURAND_STATUS_SUCCESS) {
          void *dst = (void *) ((UINTPTR gpuptr) + n * datasize);
          if (cudaMemcpy(dst, (void *) devData, datasize, cudaMemcpyDeviceToDevice) != cudaSuccess) {
            status = CURAND_STATUS_LAUNCH_FAILURE;
          }
        }
        if (cudaFree(devData) != cudaSuccess) {
          status = CURAND_STATUS_LAUNCH_FAILURE;
        }
      }
    }
  } else if (type == gpuCFLOAT) {
    float mean = 0.0;
    float std = 1.0;
    status = curandGenerateNormal(gen, (float *) gpuptr, n, mean, std);
  } else if (type == gpuDOUBLE) {
    double mean = 0.0;
    double std = 1.0;
    status = curandGenerateNormalDouble(gen, (double *) gpuptr, n, mean, std);
    if (!even) {
      // Same last-element patch as in the float case.
      double *devData;
      if ((cudaMalloc((void **)&devData, 4 * sizeof(double))) != cudaSuccess) {
        status = CURAND_STATUS_LAUNCH_FAILURE;
      } else {
        status = curandGenerateNormalDouble(gen, devData, 4, mean, std);
        if (status == CURAND_STATUS_SUCCESS) {
          void *dst = (void *) ((UINTPTR gpuptr) + n * datasize);
          if (cudaMemcpy(dst, (void *) devData, datasize, cudaMemcpyDeviceToDevice) != cudaSuccess) {
            status = CURAND_STATUS_LAUNCH_FAILURE;
          }
        }
        if (cudaFree(devData) != cudaSuccess) {
          status = CURAND_STATUS_LAUNCH_FAILURE;
        }
      }
    }
  } else if (type == gpuCDOUBLE) {
    double mean = 0.0;
    double std = 1.0;
    status = curandGenerateNormalDouble(gen, (double *) gpuptr, n, mean, std);
  }

  if (status != CURAND_STATUS_SUCCESS) {
    curandDestroyGenerator(gen);
    mexErrMsgTxt(ERROR_CURAND_GEN);
  }

  // Destroy the generator.
  if (curandDestroyGenerator(gen) != CURAND_STATUS_SUCCESS) {
    mexErrMsgTxt(ERROR_CURAND_DESTROY);
  }
}
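With pseudo-random generators, curandGenerateNormal and curandGenerateNormalDouble require an even count, which is why GPUrandn above generates n-1 values and patches the last element separately. An alternative approach is sketched below, under the assumption that the output buffer was allocated with one extra float of padding; generate_normals_padded is a hypothetical name:

#include <curand.h>

// Fill d_out with n standard-normal floats by rounding the request up to an
// even count; the caller must have allocated room for n + (n % 2) floats and
// simply ignores the padding value.
curandStatus_t generate_normals_padded(curandGenerator_t gen, float *d_out, size_t n) {
  size_t n_even = n + (n % 2); // curandGenerateNormal requires an even count
  return curandGenerateNormal(gen, d_out, n_even, 0.0f, 1.0f);
}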
Dragon::~Dragon() {
  if (cublas_handle) cublasDestroy_v2(cublas_handle);
  if (curand_generator) curandDestroyGenerator(curand_generator);
}
template <typename TFloat>
prngenerator_cuda<TFloat>::~prngenerator_cuda() {
  CurandSafeCall(curandDestroyGenerator(_dev_bulk_prng_engine));
  CudaSafeCall(cudaFree(_dev_prng_engines));
}
ModelWPAMGPU::~ModelWPAMGPU() {
  curandDestroyGenerator(gen);
}
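Every snippet above follows the same cuRAND lifecycle: create a generator, seed it, generate into device memory, and destroy the generator. Below is a minimal self-contained sketch of that pattern; the CHECK_CURAND macro is an assumption here, not taken from any of the libraries above. Build with nvcc file.cu -lcurand.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <curand.h>

#define CHECK_CURAND(call)                                  \
  do {                                                      \
    curandStatus_t s = (call);                              \
    if (s != CURAND_STATUS_SUCCESS) {                       \
      fprintf(stderr, "cuRAND error %d at %s:%d\n",         \
              (int)s, __FILE__, __LINE__);                  \
      exit(EXIT_FAILURE);                                   \
    }                                                       \
  } while (0)

int main() {
  const size_t n = 1024;
  float *d_data = NULL;
  cudaMalloc((void **)&d_data, n * sizeof(float));

  // Create, seed, generate, destroy: the lifecycle shared by the snippets above.
  curandGenerator_t gen;
  CHECK_CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
  CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
  CHECK_CURAND(curandGenerateUniform(gen, d_data, n));
  CHECK_CURAND(curandDestroyGenerator(gen));

  // Copy a few values back to confirm generation ran.
  float h_data[4];
  cudaMemcpy(h_data, d_data, sizeof(h_data), cudaMemcpyDeviceToHost);
  printf("%f %f %f %f\n", h_data[0], h_data[1], h_data[2], h_data[3]);

  cudaFree(d_data);
  return 0;
}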