/* GPUrand */ void GPUrand(const GPUtype &OUT) { curandStatus_t status; gpuTYPE_t type = gm->gputype.getType(OUT); gm->gmat->control.cacheClean(); const void *gpuptr = gm->gputype.getGPUptr(OUT); // pointer to GPU memory int numel = gm->gputype.getNumel(OUT); // number of elements int datasize = gm->gputype.getDataSize(OUT); // bytes for each element gen = 0; // implement recovery procedure // try and if error try again // init curand if (curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT)!=CURAND_STATUS_SUCCESS) { mexErrMsgTxt(ERROR_CURAND_INIT); } //if (curandCreateGenerator(&gen,CURAND_RNG_QUASI_DEFAULT)!=CURAND_STATUS_SUCCESS) { // mexErrMsgTxt(ERROR_CURAND_INIT); //} // seed seed++; if (curandSetPseudoRandomGeneratorSeed(gen, time(NULL)+seed)!=CURAND_STATUS_SUCCESS) { mexErrMsgTxt(ERROR_CURAND_SEED); } if (type == gpuFLOAT) { status = curandGenerateUniform(gen, (float *) gpuptr, numel); } else if (type == gpuCFLOAT) { status = curandGenerateUniform(gen, (float *) gpuptr, numel*2); } else if (type == gpuDOUBLE) { status = curandGenerateUniformDouble(gen, (double *) gpuptr, numel); } else if (type == gpuCDOUBLE) { status = curandGenerateUniformDouble(gen, (double *) gpuptr, numel*2); } if (status!=CURAND_STATUS_SUCCESS) { curandDestroyGenerator(gen); mexErrMsgTxt(ERROR_CURAND_GEN); } // destroy if (curandDestroyGenerator(gen)!=CURAND_STATUS_SUCCESS) { mexErrMsgTxt(ERROR_CURAND_DESTROY); } }
/**
 * Fill `data` with `size` uniform random floats from the cuRAND generator of
 * the current device context, then post-process them with the tensor_scale
 * kernel unless the requested bounds are exactly a == 0 and b == 1.
 *
 * @param data  destination buffer handed to cuRAND and to tensor_scale
 * @param size  number of floats to generate
 * @param a, b  bounds forwarded to tensor_scale
 *              (presumably the target range; confirm against tensor_scale)
 */
void Random<GPU>::uniform (float *data, int size, const float a, const float b) const
{
  const int N = size;

  cuda_check (curandGenerateUniform (dnnctx[did_]->curand_, data, N));

  // The rescale kernel is skipped only for the bounds a == 0, b == 1.
  const bool needs_rescale = (a != 0.f) || (b != 1.f);
  if (needs_rescale)
    XPU_KERNEL_LAUNCH (tensor_scale, cuda_get_blocks(N), CUDA_NUM_THREADS, 0, dnnctx[did_]->stream_, N, data, a, b);
}
/////////////////////////////////////////////////////////////////////////////// // Main class logic. /////////////////////////////////////////////////////////////////////////////// void WIE::Random::generate() { device.activate(); if(!samples) { device.assertResult(cudaMalloc((void **)&samples, sampleCount * sizeof(float)), "Could not allocate device memory"); } curandStatus_t result = curandGenerateUniform(generator, samples, sampleCount); assertResult(result, "Could not generate random numbers"); }
// Fills `output` with `n` uniform random values produced by the bulk cuRAND
// generator _dev_bulk_prng_engine. The cuRAND status is handed to
// CurandSafeCall.
//
//   n       number of values to generate
//   output  destination buffer passed straight to curandGenerateUniform
//
// NOTE(review): curandGenerateUniform takes a float buffer, so this
// presumably only compiles for TFloat = float. Confirm how other
// instantiations are handled.
void prngenerator_cuda<TFloat>::_generate(const uint32_t n, TFloat * output) {
    // Debug-only diagnostics, currently disabled:
    // #ifdef _DEBUG
    // __cudaCheckMemory();
    // std::cout << "Generating " << n << " numbers." << std::endl;
    // #endif
    CurandSafeCall(curandGenerateUniform (_dev_bulk_prng_engine, output, n));
}
// Fills every element of `data` with uniform random floats generated by the
// cuRAND generator stored in `handle`. An empty tensor is left untouched and
// cuRAND is not called at all.
void curand_generator::
fill_uniform (
    tensor& data
)
{
    if (data.size() != 0)
    {
        CHECK_CURAND(curandGenerateUniform((curandGenerator_t)handle, data.device(), data.size()));
    }
}
void cuda_random(float *x_gpu, size_t n) { static curandGenerator_t gen; static int init = 0; if(!init){ curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT); curandSetPseudoRandomGeneratorSeed(gen, time(0)); init = 1; } curandGenerateUniform(gen, x_gpu, n); check_error(cudaPeekAtLastError()); }
void cuda_random(float *x_gpu, size_t n) { static curandGenerator_t gen[16]; static int init[16] = { 0 }; int i = cuda_get_device(); if (!init[i]) { curandCreateGenerator(&gen[i], CURAND_RNG_PSEUDO_DEFAULT); curandSetPseudoRandomGeneratorSeed(gen[i], time(0)); init[i] = 1; } curandGenerateUniform(gen[i], x_gpu, n); check_error(cudaPeekAtLastError()); }
void cuda_rand(void *ptr, int numel, bool dbl) { static bool is_init = false; static curandGenerator_t stream; if (!is_init) { curandCreateGenerator(&stream, CURAND_RNG_PSEUDO_DEFAULT); is_init = true; } if (!dbl) { curandGenerateUniform(stream, (float *)ptr, numel); } else { curandGenerateUniformDouble(stream, (double *)ptr, numel); } return; }
/////////////////////////////////////////////////////////////////////////////// // Main program /////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { // Start logs shrQAStart(argc, argv); // initialize the GPU, either identified by --device // or by picking the device with highest flop rate. int devID = findCudaDevice(argc, (const char **)argv); // parsing the number of random numbers to generate int rand_n = DEFAULT_RAND_N; if( checkCmdLineFlag(argc, (const char**) argv, "count") ) { rand_n = getCmdLineArgumentInt(argc, (const char**) argv, "count"); } printf("Allocating data for %i samples...\n", rand_n); // parsing the seed int seed = DEFAULT_SEED; if( checkCmdLineFlag(argc, (const char**) argv, "seed") ) { seed = getCmdLineArgumentInt(argc, (const char**) argv, "seed"); } printf("Seeding with %i ...\n", seed); float *d_Rand; checkCudaErrors( cudaMalloc((void **)&d_Rand, rand_n * sizeof(float)) ); curandGenerator_t prngGPU; checkCurandErrors( curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32) ); checkCurandErrors( curandSetPseudoRandomGeneratorSeed(prngGPU, seed) ); curandGenerator_t prngCPU; checkCurandErrors( curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32) ); checkCurandErrors( curandSetPseudoRandomGeneratorSeed(prngCPU, seed) ); // // Example 1: Compare random numbers generated on GPU and CPU float *h_RandGPU = (float *)malloc(rand_n * sizeof(float)); printf("Generating random numbers on GPU...\n\n"); checkCurandErrors( curandGenerateUniform(prngGPU, (float*) d_Rand, rand_n) ); printf("\nReading back the results...\n"); checkCudaErrors( cudaMemcpy(h_RandGPU, d_Rand, rand_n * sizeof(float), cudaMemcpyDeviceToHost) ); float *h_RandCPU = (float *)malloc(rand_n * sizeof(float)); printf("Generating random numbers on CPU...\n\n"); checkCurandErrors( curandGenerateUniform(prngCPU, (float*) h_RandCPU, rand_n) ); printf("Comparing CPU/GPU random numbers...\n\n"); float L1norm = 
compareResults(rand_n, h_RandGPU, h_RandCPU); // // Example 2: Timing of random number generation on GPU const int numIterations = 10; int i; StopWatchInterface *hTimer; checkCudaErrors( cudaDeviceSynchronize() ); sdkCreateTimer(&hTimer); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); for (i = 0; i < numIterations; i++) { checkCurandErrors( curandGenerateUniform(prngGPU, (float*) d_Rand, rand_n) ); } checkCudaErrors( cudaDeviceSynchronize() ); sdkStopTimer(&hTimer); double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer)/(double)numIterations; printf("MersenneTwister, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers\n", 1.0e-9 * rand_n / gpuTime, gpuTime, rand_n); printf("Shutting down...\n"); checkCurandErrors( curandDestroyGenerator(prngGPU) ); checkCurandErrors( curandDestroyGenerator(prngCPU) ); checkCudaErrors( cudaFree(d_Rand) ); sdkDeleteTimer( &hTimer); free(h_RandGPU); free(h_RandCPU); cudaDeviceReset(); shrQAFinishExit(argc, (const char**)argv, (L1norm < 1e-6) ? QA_PASSED : QA_FAILED); }