void Caffe::SetDevice(const int device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == device_id) { return; } // The call to cudaSetDevice must come before any calls to Get, which // may perform initialization using the GPU. CUDA_CHECK(cudaSetDevice(device_id)); if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); if (Get().curand_generator_) { CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); } CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); }
void Caffe::set_random_seed(const unsigned int seed) { // Curand seed // Yangqing's note: simply setting the generator seed does not seem to // work on the tesla K20s, so I wrote the ugly reset thing below. static bool g_curand_availability_logged = false; if (Get().curand_generator_) { CURAND_CHECK(curandDestroyGenerator(curand_generator())); CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator(), seed)); } else { if (!g_curand_availability_logged) { LOG(ERROR) << "Curand not available. Skipping setting the curand seed."; g_curand_availability_logged = true; } } // RNG seed Get().random_generator_.reset(new RNG(seed)); }
void Caffe::SetSlaveDevice(const int slave_device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == slave_device_id) { return; } if (Get().slave_cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().slave_cublas_handle_)); if (Get().slave_curand_generator_) { CURAND_CHECK(curandDestroyGenerator(Get().slave_curand_generator_)); } CUDA_CHECK(cudaSetDevice(slave_device_id)); CUDA_CHECK(cudaStreamCreate (&Get().slave_cu_stream_)); CUBLAS_CHECK(cublasCreate(&Get().slave_cublas_handle_)); CUBLAS_CHECK(cublasSetStream(Get().slave_cublas_handle_, Get().slave_cu_stream_)); CURAND_CHECK(curandCreateGenerator(&Get().slave_curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().slave_curand_generator_, cluster_seedgen())); Get().slave_device_id_ = slave_device_id; CUDA_CHECK(cudaSetDevice(current_device)); Caffe::set_gpu_mode(Caffe::MASTER_SLAVE); }
void Caffe::SetDevice(const int device_id) { std::vector<int> devices; devices.push_back(device_id); Caffe::SetDevices(devices); Get().default_device_context_ = GetDeviceContext(device_id); if (Get().default_device_context_->backend() == Backend::BACKEND_CUDA) { #ifdef USE_CUDA int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == device_id) { return; } // The call to cudaSetDevice must come before any calls to Get, which // may perform initialization using the GPU. CUDA_CHECK(cudaSetDevice(device_id)); if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); if (Get().curand_generator_) { CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); } CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); CURAND_CHECK( curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK( curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); #endif // USE_CUDA } else { #ifdef USE_GREENTEA #ifdef USE_CLBLAS clblasSetup(); #endif // USE_CLBLAS #endif // USE_GREENTEA } }
// Default constructor: CPU mode, TRAIN phase, all CUDA handles null.
// Best-effort creation of the cuBLAS handle and CURAND generator — either
// may fail (e.g. no GPU present) and the program keeps running CPU-only.
Caffe::Caffe()
    : mode_(Caffe::CPU),
      phase_(Caffe::TRAIN),
      cublas_handle_(NULL),
      curand_generator_(NULL),
      random_generator_(),
      slave_cublas_handle_(NULL),
      slave_curand_generator_(NULL),
      master_device_id_(0),
      slave_device_id_(-1),
      cu_stream_(NULL),
      slave_cu_stream_(NULL),
      current_cu_stream_(NULL) {
  // Try to create a cublas handler, and report an error if failed (but we
  // will keep the program running as one might just want to run CPU code).
  if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create Cublas handle. Cublas won't be available.";
  }
  // Try to create a curand handler; seeding only happens when creation
  // succeeded (same short-circuit as the original || chain).
  curandStatus_t rng_status =
      curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT);
  if (rng_status == CURAND_STATUS_SUCCESS) {
    rng_status = curandSetPseudoRandomGeneratorSeed(curand_generator_,
                                                    cluster_seedgen());
  }
  if (rng_status != CURAND_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
  }
}
// Default constructor for the multi-backend build: CPU mode, no default
// device context. When compiled with CUDA support, makes a best-effort
// attempt to create the cuBLAS handle and CURAND generator; failures are
// logged but non-fatal so CPU-only runs still work.
Caffe::Caffe()
    :
#ifdef USE_CUDA
      cublas_handle_(NULL),
      curand_generator_(NULL),
#endif  // USE_CUDA
      random_generator_(),
      mode_(Caffe::CPU),
      default_device_context_(nullptr) {
#ifdef USE_CUDA
  // Try to create a cublas handler, and report an error if failed (but we
  // will keep the program running as one might just want to run CPU code).
  if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
    LOG(ERROR)<< "Cannot create Cublas handle. Cublas won't be available.";
  }
  // Try to create a curand handler; seed it only if creation succeeded
  // (same short-circuit as the original || chain).
  curandStatus_t rng_status =
      curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT);
  if (rng_status == CURAND_STATUS_SUCCESS) {
    rng_status = curandSetPseudoRandomGeneratorSeed(curand_generator_,
                                                    cluster_seedgen());
  }
  if (rng_status != CURAND_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
  }
#endif  // USE_CUDA
}
// Reseeds the CURAND generator held by this device's context
// (dnnctx[did_]->curand_); cuda_check wraps the returned status.
// NOTE(review): `seed` is an int but CURAND takes an unsigned long long
// seed — negative values will wrap; confirm that is intended.
void Random<GPU>::set_seed (int seed) { cuda_check (curandSetPseudoRandomGeneratorSeed (dnnctx[did_]->curand_, seed)); }
/////////////////////////////////////////////////////////////////////////////// // Main program /////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { // Start logs shrQAStart(argc, argv); // initialize the GPU, either identified by --device // or by picking the device with highest flop rate. int devID = findCudaDevice(argc, (const char **)argv); // parsing the number of random numbers to generate int rand_n = DEFAULT_RAND_N; if( checkCmdLineFlag(argc, (const char**) argv, "count") ) { rand_n = getCmdLineArgumentInt(argc, (const char**) argv, "count"); } printf("Allocating data for %i samples...\n", rand_n); // parsing the seed int seed = DEFAULT_SEED; if( checkCmdLineFlag(argc, (const char**) argv, "seed") ) { seed = getCmdLineArgumentInt(argc, (const char**) argv, "seed"); } printf("Seeding with %i ...\n", seed); float *d_Rand; checkCudaErrors( cudaMalloc((void **)&d_Rand, rand_n * sizeof(float)) ); curandGenerator_t prngGPU; checkCurandErrors( curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32) ); checkCurandErrors( curandSetPseudoRandomGeneratorSeed(prngGPU, seed) ); curandGenerator_t prngCPU; checkCurandErrors( curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32) ); checkCurandErrors( curandSetPseudoRandomGeneratorSeed(prngCPU, seed) ); // // Example 1: Compare random numbers generated on GPU and CPU float *h_RandGPU = (float *)malloc(rand_n * sizeof(float)); printf("Generating random numbers on GPU...\n\n"); checkCurandErrors( curandGenerateUniform(prngGPU, (float*) d_Rand, rand_n) ); printf("\nReading back the results...\n"); checkCudaErrors( cudaMemcpy(h_RandGPU, d_Rand, rand_n * sizeof(float), cudaMemcpyDeviceToHost) ); float *h_RandCPU = (float *)malloc(rand_n * sizeof(float)); printf("Generating random numbers on CPU...\n\n"); checkCurandErrors( curandGenerateUniform(prngCPU, (float*) h_RandCPU, rand_n) ); printf("Comparing CPU/GPU random numbers...\n\n"); float L1norm = 
compareResults(rand_n, h_RandGPU, h_RandCPU); // // Example 2: Timing of random number generation on GPU const int numIterations = 10; int i; StopWatchInterface *hTimer; checkCudaErrors( cudaDeviceSynchronize() ); sdkCreateTimer(&hTimer); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); for (i = 0; i < numIterations; i++) { checkCurandErrors( curandGenerateUniform(prngGPU, (float*) d_Rand, rand_n) ); } checkCudaErrors( cudaDeviceSynchronize() ); sdkStopTimer(&hTimer); double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer)/(double)numIterations; printf("MersenneTwister, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers\n", 1.0e-9 * rand_n / gpuTime, gpuTime, rand_n); printf("Shutting down...\n"); checkCurandErrors( curandDestroyGenerator(prngGPU) ); checkCurandErrors( curandDestroyGenerator(prngCPU) ); checkCudaErrors( cudaFree(d_Rand) ); sdkDeleteTimer( &hTimer); free(h_RandGPU); free(h_RandCPU); cudaDeviceReset(); shrQAFinishExit(argc, (const char**)argv, (L1norm < 1e-6) ? QA_PASSED : QA_FAILED); }
// GPUrandn: fills the GPU array OUT with standard-normal (mean 0.0, std 1.0)
// pseudo-random values using a throwaway CURAND generator.
//  - A fresh CURAND_RNG_PSEUDO_DEFAULT generator is created on every call and
//    destroyed before returning; it is seeded with time(NULL) plus a
//    file-scope `seed` counter incremented on each call.
//  - curandGenerateNormal{,Double} need an even count. For an odd element
//    count the code generates n-1 values in place, then generates 4 spare
//    values into a temporary device buffer and copies one element's worth of
//    bytes (datasize) into the final slot. Complex types (gpuCFLOAT,
//    gpuCDOUBLE) use n = numel*2, which is always even, so they never take
//    the recovery path.
//  - Any CURAND/CUDA failure destroys the generator and raises a MATLAB
//    error via mexErrMsgTxt; the temporary buffer is freed before erroring.
// NOTE(review): `gen` is assigned without a local declaration — presumably a
// file-scope curandGenerator_t; confirm it is not shared across threads.
/* GPUrandn */ void GPUrandn(const GPUtype &OUT) { curandStatus_t status; gpuTYPE_t type = gm->gputype.getType(OUT); gm->gmat->control.cacheClean(); const void *gpuptr = gm->gputype.getGPUptr(OUT); // pointer to GPU memory int numel = gm->gputype.getNumel(OUT); // number of elements int datasize = gm->gputype.getDataSize(OUT); // bytes for each element gen = 0; // implement recovery procedure // try and if error try again // init curand if (curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT)!=CURAND_STATUS_SUCCESS) { mexErrMsgTxt(ERROR_CURAND_INIT); } //if (curandCreateGenerator(&gen,CURAND_RNG_QUASI_DEFAULT)!=CURAND_STATUS_SUCCESS) { // mexErrMsgTxt(ERROR_CURAND_INIT); //} // randn requires even numbers // we split the execution in 2 parts (overlap if not even) // seed seed++; if (curandSetPseudoRandomGeneratorSeed(gen, time(NULL)+seed)!=CURAND_STATUS_SUCCESS) { mexErrMsgTxt(ERROR_CURAND_SEED); } unsigned int n = 0; if (type == gpuFLOAT) { n = numel; } else if (type == gpuCFLOAT) { n = numel*2; } else if (type == gpuDOUBLE) { n = numel; } else if (type == gpuCDOUBLE) { n = numel*2; } unsigned int even = (n%2) == 0; unsigned int offset = 0; unsigned int mysize = 0; unsigned int iter = 1; if (!even) { n = n-1; iter = 2; } if (type == gpuFLOAT) { float mean = 0.0; float std = 1.0; status = curandGenerateNormal(gen, (float *) gpuptr, n, mean, std); if (!even) { float *devData; if((cudaMalloc((void **)&devData, 4 * sizeof(float))) != cudaSuccess) { status = CURAND_STATUS_LAUNCH_FAILURE; } else { status = curandGenerateNormal(gen, devData, 4, mean, std); if (status==CURAND_STATUS_SUCCESS) { void *dst = (void *) ((UINTPTR gpuptr)+n*datasize); if (cudaMemcpy(dst, (void *) devData, datasize, cudaMemcpyDeviceToDevice)!=cudaSuccess) { status = CURAND_STATUS_LAUNCH_FAILURE; } } if(cudaFree(devData) != cudaSuccess) { status = CURAND_STATUS_LAUNCH_FAILURE; } } } } else if (type == gpuCFLOAT) { float mean = 0.0; float std = 1.0; status = curandGenerateNormal(gen, (float *) 
gpuptr, n, mean, std); } else if (type == gpuDOUBLE) { double mean = 0.0; double std = 1.0; status = curandGenerateNormalDouble(gen, (double *) gpuptr, n, mean, std); if (!even) { double *devData; if((cudaMalloc((void **)&devData, 4 * sizeof(double))) != cudaSuccess) { status = CURAND_STATUS_LAUNCH_FAILURE; } else { status = curandGenerateNormalDouble(gen, devData, 4, mean, std); if (status==CURAND_STATUS_SUCCESS) { void *dst = (void *) ((UINTPTR gpuptr)+n*datasize); if (cudaMemcpy(dst, (void *) devData, datasize, cudaMemcpyDeviceToDevice)!=cudaSuccess) { status = CURAND_STATUS_LAUNCH_FAILURE; } } if(cudaFree(devData) != cudaSuccess) { status = CURAND_STATUS_LAUNCH_FAILURE; } } } } else if (type == gpuCDOUBLE) { double mean = 0.0; double std = 1.0; status = curandGenerateNormalDouble(gen, (double *) gpuptr, n, mean, std); } if (status!=CURAND_STATUS_SUCCESS) { curandDestroyGenerator(gen); mexErrMsgTxt(ERROR_CURAND_GEN); } // destroy if (curandDestroyGenerator(gen)!=CURAND_STATUS_SUCCESS) { mexErrMsgTxt(ERROR_CURAND_DESTROY); } }