/*
 * cuda_randn
 * Fill a device buffer with standard-normal (mean 0, std 1) pseudo-random
 * numbers using a lazily created, process-wide cuRAND generator.
 *
 *   ptr   - device pointer to the destination buffer
 *   numel - number of elements to generate (may be odd; see below)
 *   dbl   - false: buffer holds float; true: buffer holds double
 *
 * cuRAND's normal generators only accept EVEN counts for pseudo-random
 * generators, so the largest even prefix is generated directly into ptr and,
 * for odd numel, the final element is drawn via a tiny scratch device buffer
 * and copied into place.  Errors cannot be reported through the void return;
 * on failure the destination is simply left (partially) unfilled.
 */
void cuda_randn(void *ptr, int numel, bool dbl) {
  static bool is_init = false;
  static curandGenerator_t generator;

  if (!is_init) {
    /* Leave is_init false on failure so a later call can retry creation. */
    if (curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT) !=
        CURAND_STATUS_SUCCESS) {
      return;
    }
    is_init = true;
  }

  if (numel <= 0) {
    return;
  }

  size_t elsize = dbl ? sizeof(double) : sizeof(float);

  /* Largest even count that fits the caller's buffer. */
  int n = numel - (numel % 2);

  curandStatus_t status = CURAND_STATUS_SUCCESS;
  if (n > 0) {
    if (dbl) {
      status = curandGenerateNormalDouble(generator, (double *)ptr, n, 0.0, 1.0);
    } else {
      status = curandGenerateNormal(generator, (float *)ptr, n, 0.0f, 1.0f);
    }
  }

  /* Odd numel: draw an even-sized scratch batch and copy one value. */
  if (status == CURAND_STATUS_SUCCESS && n < numel) {
    void *scratch = NULL;
    if (cudaMalloc(&scratch, 2 * elsize) == cudaSuccess) {
      if (dbl) {
        status = curandGenerateNormalDouble(generator, (double *)scratch, 2, 0.0, 1.0);
      } else {
        status = curandGenerateNormal(generator, (float *)scratch, 2, 0.0f, 1.0f);
      }
      if (status == CURAND_STATUS_SUCCESS) {
        cudaMemcpy((void *)((char *)ptr + (size_t)n * elsize), scratch,
                   elsize, cudaMemcpyDeviceToDevice);
      }
      cudaFree(scratch);
    }
  }
}
/* GPUrandn
 * Fill the GPUmat array OUT with standard-normal (mean 0, std 1)
 * pseudo-random numbers via the cuRAND host API.
 *
 * A fresh generator is created, seeded, used, and destroyed on every call.
 * The module-level `seed` counter is incremented and added to time(NULL) so
 * consecutive calls within the same second still get distinct seeds.
 *
 * cuRAND's normal generators only accept EVEN scalar counts for
 * pseudo-random generators.  Complex arrays need numel*2 scalars, which is
 * always even; for real arrays with odd numel the largest even prefix is
 * generated in one shot, then the final scalar is drawn into a small scratch
 * device buffer and copied into place.
 *
 * Raises (via mexErrMsgTxt, which does not return): ERROR_CURAND_INIT,
 * ERROR_CURAND_SEED, ERROR_CURAND_GEN, ERROR_CURAND_DESTROY.
 */
void GPUrandn(const GPUtype &OUT) {
  /* Initialized so an unmatched type cannot leave `status` indeterminate. */
  curandStatus_t status = CURAND_STATUS_SUCCESS;
  gpuTYPE_t type = gm->gputype.getType(OUT);

  gm->gmat->control.cacheClean();

  const void *gpuptr = gm->gputype.getGPUptr(OUT);   // pointer to GPU memory
  int numel = gm->gputype.getNumel(OUT);             // number of elements
  int datasize = gm->gputype.getDataSize(OUT);       // bytes for each element

  gen = 0;
  if (curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS) {
    mexErrMsgTxt(ERROR_CURAND_INIT);
  }

  /* Distinct seed per call, even within the same clock second. */
  seed++;
  if (curandSetPseudoRandomGeneratorSeed(gen, time(NULL) + seed) != CURAND_STATUS_SUCCESS) {
    mexErrMsgTxt(ERROR_CURAND_SEED);
  }

  /* Scalar count: complex types consume two scalars per element. */
  unsigned int n = 0;
  if (type == gpuFLOAT) {
    n = numel;
  } else if (type == gpuCFLOAT) {
    n = numel * 2;
  } else if (type == gpuDOUBLE) {
    n = numel;
  } else if (type == gpuCDOUBLE) {
    n = numel * 2;
  } else {
    /* Unsupported element type: fail loudly instead of reading an
     * indeterminate status below. */
    curandDestroyGenerator(gen);
    mexErrMsgTxt(ERROR_CURAND_GEN);
  }

  unsigned int even = (n % 2) == 0;
  if (!even) {
    n = n - 1;  /* generate the largest even prefix directly */
  }

  if (type == gpuFLOAT) {
    float mean = 0.0f;
    float std = 1.0f;
    status = curandGenerateNormal(gen, (float *)gpuptr, n, mean, std);
    if (!even && status == CURAND_STATUS_SUCCESS) {
      /* Odd tail: draw an even scratch batch, copy one scalar into place. */
      float *devData;
      if ((cudaMalloc((void **)&devData, 4 * sizeof(float))) != cudaSuccess) {
        status = CURAND_STATUS_LAUNCH_FAILURE;
      } else {
        status = curandGenerateNormal(gen, devData, 4, mean, std);
        if (status == CURAND_STATUS_SUCCESS) {
          void *dst = (void *)((UINTPTR gpuptr) + n * datasize);
          if (cudaMemcpy(dst, (void *)devData, datasize, cudaMemcpyDeviceToDevice) != cudaSuccess) {
            status = CURAND_STATUS_LAUNCH_FAILURE;
          }
        }
        if (cudaFree(devData) != cudaSuccess) {
          status = CURAND_STATUS_LAUNCH_FAILURE;
        }
      }
    }
  } else if (type == gpuCFLOAT) {
    /* numel*2 is always even; no tail handling needed. */
    float mean = 0.0f;
    float std = 1.0f;
    status = curandGenerateNormal(gen, (float *)gpuptr, n, mean, std);
  } else if (type == gpuDOUBLE) {
    double mean = 0.0;
    double std = 1.0;
    status = curandGenerateNormalDouble(gen, (double *)gpuptr, n, mean, std);
    if (!even && status == CURAND_STATUS_SUCCESS) {
      /* Odd tail, double-precision variant of the float path above. */
      double *devData;
      if ((cudaMalloc((void **)&devData, 4 * sizeof(double))) != cudaSuccess) {
        status = CURAND_STATUS_LAUNCH_FAILURE;
      } else {
        status = curandGenerateNormalDouble(gen, devData, 4, mean, std);
        if (status == CURAND_STATUS_SUCCESS) {
          void *dst = (void *)((UINTPTR gpuptr) + n * datasize);
          if (cudaMemcpy(dst, (void *)devData, datasize, cudaMemcpyDeviceToDevice) != cudaSuccess) {
            status = CURAND_STATUS_LAUNCH_FAILURE;
          }
        }
        if (cudaFree(devData) != cudaSuccess) {
          status = CURAND_STATUS_LAUNCH_FAILURE;
        }
      }
    }
  } else if (type == gpuCDOUBLE) {
    /* numel*2 is always even; no tail handling needed. */
    double mean = 0.0;
    double std = 1.0;
    status = curandGenerateNormalDouble(gen, (double *)gpuptr, n, mean, std);
  }

  if (status != CURAND_STATUS_SUCCESS) {
    curandDestroyGenerator(gen);
    mexErrMsgTxt(ERROR_CURAND_GEN);
  }

  /* Generator is per-call; always tear it down. */
  if (curandDestroyGenerator(gen) != CURAND_STATUS_SUCCESS) {
    mexErrMsgTxt(ERROR_CURAND_DESTROY);
  }
}