示例#1
0
void Caffe::SetDevice(const int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
  if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
  if (Get().cusparse_descr_)CUSPARSE_CHECK(cusparseDestroyMatDescr(Get().cusparse_descr_));
  if (Get().cusparse_handle_)CUSPARSE_CHECK(cusparseDestroy(Get().cusparse_handle_));
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  }
  CUSPARSE_CHECK(cusparseCreate(&Get().cusparse_handle_));
  CUSPARSE_CHECK(cusparseCreateMatDescr(&Get().cusparse_descr_));
//  cusparseSetMatType(cusparse_descr_,CUSPARSE_MATRIX_TYPE_GENERAL);
//  cusparseSetMatIndexBase(cusparse_descr_,CUSPARSE_INDEX_BASE_ZERO);
  LOG(INFO)<<"set descr";
  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
      CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
      cluster_seedgen()));
}
示例#2
0
// Builds the singleton state in CPU mode and attempts to create the cuBLAS,
// cuSPARSE and cuRAND objects. Every failure is only logged so that CPU-only
// use keeps working.
Caffe::Caffe()
    : cublas_handle_(NULL),
      cusparse_handle_(NULL),
      cusparse_descr_(NULL),
      curand_generator_(NULL),
      random_generator_(),
      mode_(Caffe::CPU),
      solver_count_(1),
      root_solver_(true) {
  LOG(INFO)<<"caffe init.";

  // cuBLAS handle.
  const cublasStatus_t cublas_status = cublasCreate(&cublas_handle_);
  if (cublas_status != CUBLAS_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create Cublas handle. Cublas won't be available.";
  }

  // cuSPARSE handle.
  const cusparseStatus_t cusparse_status = cusparseCreate(&cusparse_handle_);
  if (cusparse_status != CUSPARSE_STATUS_SUCCESS) {
    LOG(ERROR) << "cannot create Cusparse handle,Cusparse won't be available.";
  }

  // cuSPARSE matrix descriptor: general matrix, zero-based indexing.
  if (cusparseCreateMatDescr(&cusparse_descr_) == CUSPARSE_STATUS_SUCCESS) {
    cusparseSetMatType(cusparse_descr_, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(cusparse_descr_, CUSPARSE_INDEX_BASE_ZERO);
    LOG(INFO)<<"init descr";
  } else {
    LOG(ERROR) << "cannot create Cusparse descr,descr won't be available.";
  }

  // cuRAND generator; seeding is attempted only when creation succeeded.
  const bool curand_ready =
      curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)
          == CURAND_STATUS_SUCCESS &&
      curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())
          == CURAND_STATUS_SUCCESS;
  if (!curand_ready) {
    LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
  }
  LOG(INFO)<<"caffe finish";
}
示例#3
0
void WIE::Random::Init()
{
  device.activate();
  assertResult(curandCreateGenerator(&generator, rngMethod), "Could not create random number generator");
  assertResult(curandSetPseudoRandomGeneratorSeed(generator, seed), "Could not set seed value");
  samples = NULL;
}
示例#4
0
void Caffe::set_random_seed(unsigned int seed)
{
  CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, seed));
  VSL_CHECK(vslDeleteStream(&Get().vsl_stream_));
  VSL_CHECK(vslNewStream(&Get().vsl_stream_, VSL_BRNG_MT19937, seed));
}
  // Creates the bulk cuRAND generator and allocates device storage for
  // _NUM_ENGINES curandState objects.
  prngenerator_cuda<TFloat>::prngenerator_cuda(uint32_t num_engines) : prngenerator<TFloat>::prngenerator(num_engines) {

    // Bulk generator of the default pseudo-random type.
    CurandSafeCall(
        curandCreateGenerator(&_dev_bulk_prng_engine, CURAND_RNG_PSEUDO_DEFAULT));

    // One curandState per engine.
    const size_t engine_bytes = _NUM_ENGINES * sizeof(curandState);
    CudaSafeCall(
        cudaMalloc(reinterpret_cast<void **>(&_dev_prng_engines), engine_bytes));
  }
示例#6
0
// Starts in CPU mode and tries to create the cuBLAS handle and a seeded
// cuRAND generator; failures are logged and construction continues.
Dragon::Dragon() :
	mode(Dragon::CPU), solver_count(1), root_solver(true),
	cublas_handle(NULL), curand_generator(NULL){
	const cublasStatus_t cublas_status = cublasCreate_v2(&cublas_handle);
	if (cublas_status != CUBLAS_STATUS_SUCCESS) {
		LOG(ERROR) << "Couldn't create cublas handle.";
	}

	// Seeding is attempted only when the generator was created.
	const bool curand_ready =
		curandCreateGenerator(&curand_generator, CURAND_RNG_PSEUDO_DEFAULT) == CURAND_STATUS_SUCCESS
		&& curandSetPseudoRandomGeneratorSeed(curand_generator, cluster_seedgen()) == CURAND_STATUS_SUCCESS;
	if (!curand_ready) {
		LOG(ERROR) << "Couldn't create curand generator.";
	}
}
示例#7
0
// Initializes the singleton in CPU/TRAIN state and creates the cuBLAS handle,
// a cuRAND generator and a VSL MT19937 stream, both seeded with 1701.
Caffe::Caffe()
  : mode_(Caffe::CPU),
    phase_(Caffe::TRAIN),
    cublas_handle_(NULL),
    curand_generator_(NULL),
    vsl_stream_(NULL)
{
  // Fixed seeds used for the GPU and CPU generators.
  const unsigned long long curand_seed = 1701ULL;
  const int vsl_seed = 1701;

  CUBLAS_CHECK(cublasCreate(&cublas_handle_));

  //TODO: original caffe code has bug here!
  CURAND_CHECK(curandCreateGenerator(&curand_generator_,
                                     CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_,
                                                  curand_seed));

  VSL_CHECK(vslNewStream(&vsl_stream_, VSL_BRNG_MT19937, vsl_seed));
}
/* GPUrand */
/*
 * Fills the GPU buffer behind OUT with uniformly distributed random numbers.
 *
 * A cuRAND generator is created, seeded with time(NULL) plus the incremented
 * file-level `seed` counter, used once and destroyed again. Complex types
 * are filled as 2*numel real values.
 *
 * Errors are reported through mexErrMsgTxt (ERROR_CURAND_INIT, _SEED, _GEN,
 * _DESTROY). An element type other than gpuFLOAT, gpuCFLOAT, gpuDOUBLE or
 * gpuCDOUBLE is reported as ERROR_CURAND_GEN.
 */
void GPUrand(const GPUtype &OUT) {

  // Start from an error value: if no branch below matches the element type,
  // the failure is reported instead of reading an uninitialized status.
  curandStatus_t status = CURAND_STATUS_TYPE_ERROR;

  gpuTYPE_t type = gm->gputype.getType(OUT);

  gm->gmat->control.cacheClean();


  const void *gpuptr = gm->gputype.getGPUptr(OUT); // pointer to GPU memory
  int numel = gm->gputype.getNumel(OUT);           // number of elements


  gen = 0;
  // implement recovery procedure
  // try and if error try again

  // init curand
  if (curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT)!=CURAND_STATUS_SUCCESS) {
    mexErrMsgTxt(ERROR_CURAND_INIT);
  }
  //if (curandCreateGenerator(&gen,CURAND_RNG_QUASI_DEFAULT)!=CURAND_STATUS_SUCCESS) {
  //  mexErrMsgTxt(ERROR_CURAND_INIT);
  //}

  // seed
  seed++;
  if (curandSetPseudoRandomGeneratorSeed(gen, time(NULL)+seed)!=CURAND_STATUS_SUCCESS) {
    // Release the generator before reporting, mirroring the generation
    // failure path below, so the handle is not leaked.
    curandDestroyGenerator(gen);
    mexErrMsgTxt(ERROR_CURAND_SEED);
  }

  if (type == gpuFLOAT) {
    status = curandGenerateUniform(gen, (float *) gpuptr, numel);
  } else if (type == gpuCFLOAT) {
    // complex float: real and imaginary parts are both filled
    status = curandGenerateUniform(gen, (float *) gpuptr, numel*2);
  } else if (type == gpuDOUBLE) {
    status = curandGenerateUniformDouble(gen, (double *) gpuptr, numel);
  } else if (type == gpuCDOUBLE) {
    // complex double: real and imaginary parts are both filled
    status = curandGenerateUniformDouble(gen, (double *) gpuptr, numel*2);
  }

  if (status!=CURAND_STATUS_SUCCESS) {
    curandDestroyGenerator(gen);
    mexErrMsgTxt(ERROR_CURAND_GEN);
  }


  // destroy
  if (curandDestroyGenerator(gen)!=CURAND_STATUS_SUCCESS) {
    mexErrMsgTxt(ERROR_CURAND_DESTROY);
  }


}
示例#9
0
        // Creates a default pseudo-random cuRAND generator, stores it in
        // `handle` and seeds it with `seed`. Both cuRAND calls go through
        // CHECK_CURAND.
        curand_generator::
        curand_generator(
            unsigned long long seed
        ) : handle(nullptr)
        {
            // The generator is created into a local and then published
            // through `handle` before the seed is applied.
            curandGenerator_t new_generator;
            CHECK_CURAND(curandCreateGenerator(&new_generator, CURAND_RNG_PSEUDO_DEFAULT));
            handle = new_generator;

            CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(new_generator, seed));
        }
示例#10
0
文件: cuda.c 项目: isuker/darknet
/*
 * Fills x_gpu with n uniformly distributed floats using a function-local
 * cuRAND generator. The generator is created and seeded from time(0) on the
 * first call and reused afterwards. The cuRAND return codes are not
 * inspected; only the last CUDA runtime error is passed to check_error.
 */
void cuda_random(float *x_gpu, size_t n)
{
    static curandGenerator_t generator;
    static int ready = 0;

    if(ready == 0){
        /* first call: build and seed the shared generator */
        curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT);
        curandSetPseudoRandomGeneratorSeed(generator, time(0));
        ready = 1;
    }

    curandGenerateUniform(generator, x_gpu, n);
    check_error(cudaPeekAtLastError());
}
示例#11
0
void cuda_random(float *x_gpu, size_t n) {
	static curandGenerator_t gen[16];
	static int init[16] = { 0 };
	int i = cuda_get_device();
	if (!init[i]) {
		curandCreateGenerator(&gen[i], CURAND_RNG_PSEUDO_DEFAULT);
		curandSetPseudoRandomGeneratorSeed(gen[i], time(0));
		init[i] = 1;
	}
	curandGenerateUniform(gen[i], x_gpu, n);
	check_error(cudaPeekAtLastError());
}
// Sets the model constants, runs the addF() and setUNoise() setup steps and
// creates the cuRAND generator `gen` with a fixed seed. The cuRAND return
// codes are not inspected.
ModelWPAMGPU::ModelWPAMGPU()
{
    // Model constants; they are assigned before the setup calls below.
    variance = 100e-6f;
    dimmeas = 3;
    T = 1.0f/30.0f;

    addF();
    setUNoise();

    // Fixed seed for the generator.
    const unsigned long long generatorSeed = 1234ULL;
    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(gen, generatorSeed);
}
示例#13
0
// Returns the cuRAND generator stored for the current device, creating it on
// first use.
//
// On first use the generator is created, seeded with cluster_seedgen() and
// bound to the stream returned by device_pstream(). Failures are logged and
// not raised. If creation itself fails, the stored entry is returned
// unchanged and neither seeding nor stream binding is attempted on the
// invalid handle.
curandGenerator_t Caffe::device_curand_generator() {
  curandGenerator_t& curand_generator = curand_generators_[current_device()];
  if (!curand_generator) {
    // Try to create a curand handler.
    if (curandCreateGenerator(&curand_generator, CURAND_RNG_PSEUDO_DEFAULT) !=
        CURAND_STATUS_SUCCESS) {
      LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
      // There is no valid handle to seed or to bind to a stream.
      return curand_generator;
    }
    if (curandSetPseudoRandomGeneratorSeed(curand_generator, cluster_seedgen()) !=
        CURAND_STATUS_SUCCESS) {
      LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
    }
    // As before, the stream is bound even if seeding failed; the status is
    // now checked rather than dropped.
    if (curandSetStream(curand_generator, device_pstream()->get()) !=
        CURAND_STATUS_SUCCESS) {
      LOG(ERROR) << "Cannot bind Curand generator to the device stream.";
    }
  }
  return curand_generator;
}
示例#14
0
void Dragon::set_device(const int device_id) {
	int current_device;
	CUDA_CHECK(cudaGetDevice(&current_device));
	if (current_device == device_id) return;
	// The call to cudaSetDevice must come before any calls to Get, which
	// may perform initialization using the GPU.

	//	reset Device must reset handle and generator???
	CUDA_CHECK(cudaSetDevice(device_id));
	if (Get().cublas_handle) cublasDestroy_v2(Get().cublas_handle);
	if (Get().curand_generator) curandDestroyGenerator(Get().curand_generator);
	cublasCreate_v2(&Get().cublas_handle);
	curandCreateGenerator(&Get().curand_generator, CURAND_RNG_PSEUDO_DEFAULT);
	curandSetPseudoRandomGeneratorSeed(Get().curand_generator, cluster_seedgen());
}
示例#15
0
// Re-seeds the cuRAND generator (when one exists) and replaces the CPU RNG
// with a new RNG built from `seed`.
void Caffe::set_random_seed(const unsigned int seed) {
  // Curand seed
  if (!Get().curand_generator_) {
    LOG(ERROR) << "Curand not available. Skipping setting the curand seed.";
  } else {
    // Yangqing's note: simply setting the generator seed does not seem to
    // work on the tesla K20s, so the generator is destroyed and recreated
    // before the seed is applied.
    CURAND_CHECK(curandDestroyGenerator(curand_generator()));
    CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
                                       CURAND_RNG_PSEUDO_DEFAULT));
    CURAND_CHECK(
        curandSetPseudoRandomGeneratorSeed(curand_generator(), seed));
  }

  // RNG seed
  Get().random_generator_.reset(new RNG(seed));
}
示例#16
0
// Starts in CPU mode and attempts to create the cuBLAS handle and a seeded
// cuRAND generator. Failures are only logged, so the program keeps running
// for CPU-only use.
Engine::Engine()
    : cublas_handle_(NULL),
      curand_generator_(NULL),
      random_generator_(),
      mode_(Engine::CPU) {
  // cuBLAS handle.
  const cublasStatus_t cublas_status = cublasCreate(&cublas_handle_);
  if (cublas_status != CUBLAS_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create Cublas handle. Cublas won't be available.";
  }

  // cuRAND generator; seeding is attempted only when creation succeeded.
  const bool curand_ready =
      curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)
          == CURAND_STATUS_SUCCESS &&
      curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())
          == CURAND_STATUS_SUCCESS;
  if (!curand_ready) {
    LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
  }
}
示例#17
0
void Caffe::SetDevice(const int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  }
  CUDA_CHECK(cudaSetDevice(device_id));
  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
      CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
      cluster_seedgen()));
}
示例#18
0
void cuda_rand(void *ptr, int numel, bool dbl)
{
    static bool is_init = false;
    static curandGenerator_t stream;
    if (!is_init) {
        curandCreateGenerator(&stream, CURAND_RNG_PSEUDO_DEFAULT);
        is_init = true;
    }

    if (!dbl) {
        curandGenerateUniform(stream, (float *)ptr, numel);
    } else {
        curandGenerateUniformDouble(stream, (double *)ptr, numel);
    }

    return;
}
示例#19
0
void Engine::SetDevice(const int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
  if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  }
  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
      CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
      cluster_seedgen()));
}
示例#20
0
void Caffe::SetSlaveDevice(const int slave_device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == slave_device_id) {
    return;
  }
  if (Get().slave_cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().slave_cublas_handle_));
  if (Get().slave_curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().slave_curand_generator_));
  }
  CUDA_CHECK(cudaSetDevice(slave_device_id));
  CUDA_CHECK(cudaStreamCreate (&Get().slave_cu_stream_));
  CUBLAS_CHECK(cublasCreate(&Get().slave_cublas_handle_));
  CUBLAS_CHECK(cublasSetStream(Get().slave_cublas_handle_, Get().slave_cu_stream_));
  CURAND_CHECK(curandCreateGenerator(&Get().slave_curand_generator_,
      CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().slave_curand_generator_,
      cluster_seedgen()));
  Get().slave_device_id_ = slave_device_id;
  CUDA_CHECK(cudaSetDevice(current_device));
  Caffe::set_gpu_mode(Caffe::MASTER_SLAVE);
}
示例#21
0
void Caffe::SetDevice(const int device_id) {
  std::vector<int> devices;
  devices.push_back(device_id);
  Caffe::SetDevices(devices);

  Get().default_device_context_ = GetDeviceContext(device_id);

  if (Get().default_device_context_->backend() == Backend::BACKEND_CUDA) {
#ifdef USE_CUDA
    int current_device;
    CUDA_CHECK(cudaGetDevice(&current_device));
    if (current_device == device_id) {
      return;
    }
// The call to cudaSetDevice must come before any calls to Get, which
// may perform initialization using the GPU.
    CUDA_CHECK(cudaSetDevice(device_id));
    if (Get().cublas_handle_)
      CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
    if (Get().curand_generator_) {
      CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
    }
    CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
    CURAND_CHECK(
        curandCreateGenerator(&Get().curand_generator_,
                              CURAND_RNG_PSEUDO_DEFAULT));
    CURAND_CHECK(
        curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
                                           cluster_seedgen()));
#endif  // USE_CUDA
  } else {
#ifdef USE_GREENTEA
#ifdef USE_CLBLAS
    clblasSetup();
#endif  // USE_CLBLAS
#endif  // USE_GREENTEA
  }
}
示例#22
0
// Initializes the singleton for CPU/TRAIN with master device 0 and no slave
// device, then tries to create the master cuBLAS handle and a seeded cuRAND
// generator. Failures are only logged so CPU-only use keeps working.
Caffe::Caffe()
    : mode_(Caffe::CPU),
      phase_(Caffe::TRAIN),
      cublas_handle_(NULL),
      curand_generator_(NULL),
      random_generator_(),
      slave_cublas_handle_(NULL),
      slave_curand_generator_(NULL),
      master_device_id_(0),
      slave_device_id_(-1),
      cu_stream_(NULL),
      slave_cu_stream_(NULL),
      current_cu_stream_(NULL) {
  // cuBLAS handle.
  const cublasStatus_t cublas_status = cublasCreate(&cublas_handle_);
  if (cublas_status != CUBLAS_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create Cublas handle. Cublas won't be available.";
  }

  // cuRAND generator; seeding is attempted only when creation succeeded.
  const bool curand_ready =
      curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)
          == CURAND_STATUS_SUCCESS &&
      curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())
          == CURAND_STATUS_SUCCESS;
  if (!curand_ready) {
    LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
  }
}
示例#23
0
// Initializes the singleton in CPU mode with no default device context.
// When built with USE_CUDA it also tries to create the cuBLAS handle and a
// seeded cuRAND generator; failures are only logged so CPU-only use keeps
// working.
Caffe::Caffe()
    :
#ifdef USE_CUDA
      cublas_handle_(NULL),
      curand_generator_(NULL),
#endif  // USE_CUDA
      random_generator_(),
      mode_(Caffe::CPU),
      default_device_context_(nullptr) {
#ifdef USE_CUDA
  // cuBLAS handle.
  const cublasStatus_t cublas_status = cublasCreate(&cublas_handle_);
  if (cublas_status != CUBLAS_STATUS_SUCCESS) {
    LOG(ERROR)<< "Cannot create Cublas handle. Cublas won't be available.";
  }

  // cuRAND generator; seeding is attempted only when creation succeeded.
  const bool curand_ready =
      curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)
          == CURAND_STATUS_SUCCESS &&
      curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())
          == CURAND_STATUS_SUCCESS;
  if (!curand_ready) {
    LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
  }
#endif  // USE_CUDA
}
示例#24
0
///////////////////////////////////////////////////////////////////////////////
// Main program
//
// Generates `count` uniform random floats with an MTGP32 generator on the GPU
// and with a host generator using the same seed, compares the two sequences
// and then times repeated GPU generation.
//
// Command line flags: "count" (number of samples, must be positive) and
// "seed". The QA result is QA_PASSED when the value returned by
// compareResults is below 1e-6. A non-positive count or a failed host
// allocation is reported on stderr and finishes with QA_FAILED.
///////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    // Start logs
    shrQAStart(argc, argv);

    // initialize the GPU, either identified by --device
    // or by picking the device with highest flop rate.
    int devID = findCudaDevice(argc, (const char **)argv);

    // parsing the number of random numbers to generate
    int rand_n = DEFAULT_RAND_N;
    if( checkCmdLineFlag(argc, (const char**) argv, "count") )
    {
        rand_n = getCmdLineArgumentInt(argc, (const char**) argv, "count");
    }
    // A non-positive count would turn into a zero or wrapped-around byte
    // size in the allocations below, so reject it up front.
    if (rand_n <= 0)
    {
        fprintf(stderr, "Invalid sample count %i: it must be positive.\n", rand_n);
        shrQAFinishExit(argc, (const char**)argv, QA_FAILED);
        return EXIT_FAILURE; // only reached if shrQAFinishExit returns
    }
    printf("Allocating data for %i samples...\n", rand_n);

    // parsing the seed
    int seed = DEFAULT_SEED;
    if( checkCmdLineFlag(argc, (const char**) argv, "seed") )
    {
        seed = getCmdLineArgumentInt(argc, (const char**) argv, "seed");
    }
    printf("Seeding with %i ...\n", seed);


    float *d_Rand;
    checkCudaErrors( cudaMalloc((void **)&d_Rand, rand_n * sizeof(float)) );

    curandGenerator_t prngGPU;
    checkCurandErrors( curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32) );
    checkCurandErrors( curandSetPseudoRandomGeneratorSeed(prngGPU, seed) );

    curandGenerator_t prngCPU;
    checkCurandErrors( curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32) );
    checkCurandErrors( curandSetPseudoRandomGeneratorSeed(prngCPU, seed) );

    //
    // Example 1: Compare random numbers generated on GPU and CPU
    float *h_RandGPU  = (float *)malloc(rand_n * sizeof(float));
    float *h_RandCPU  = (float *)malloc(rand_n * sizeof(float));
    if (h_RandGPU == NULL || h_RandCPU == NULL)
    {
        // Without both host buffers neither the copy nor the host generation
        // can run; release what exists and finish as failed.
        fprintf(stderr, "Failed to allocate host memory for %i samples.\n", rand_n);
        free(h_RandGPU);
        free(h_RandCPU);
        curandDestroyGenerator(prngGPU);
        curandDestroyGenerator(prngCPU);
        cudaFree(d_Rand);
        cudaDeviceReset();
        shrQAFinishExit(argc, (const char**)argv, QA_FAILED);
        return EXIT_FAILURE; // only reached if shrQAFinishExit returns
    }

    printf("Generating random numbers on GPU...\n\n");
    checkCurandErrors( curandGenerateUniform(prngGPU, (float*) d_Rand, rand_n) );

    printf("\nReading back the results...\n");
    checkCudaErrors( cudaMemcpy(h_RandGPU, d_Rand, rand_n * sizeof(float), cudaMemcpyDeviceToHost) );

    printf("Generating random numbers on CPU...\n\n");
    checkCurandErrors( curandGenerateUniform(prngCPU, (float*) h_RandCPU, rand_n) );

    printf("Comparing CPU/GPU random numbers...\n\n");
    float L1norm = compareResults(rand_n, h_RandGPU, h_RandCPU);

    //
    // Example 2: Timing of random number generation on GPU
    const int numIterations = 10;
    int i;
    StopWatchInterface *hTimer;

    // Synchronize before and after the timed loop so the timer covers only
    // the generation calls.
    checkCudaErrors( cudaDeviceSynchronize() );
    sdkCreateTimer(&hTimer);
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    for (i = 0; i < numIterations; i++)
    {
        checkCurandErrors( curandGenerateUniform(prngGPU, (float*) d_Rand, rand_n) );
    }

    checkCudaErrors( cudaDeviceSynchronize() );
    sdkStopTimer(&hTimer);

    // The timer value is scaled by 1.0e-3 and averaged over the iterations.
    double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer)/(double)numIterations;

    // rand_n is an int, so it is printed with %i (the original used %u).
    printf("MersenneTwister, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %i Numbers\n",
               1.0e-9 * rand_n / gpuTime, gpuTime, rand_n);

    printf("Shutting down...\n");

    checkCurandErrors( curandDestroyGenerator(prngGPU) );
    checkCurandErrors( curandDestroyGenerator(prngCPU) );
    checkCudaErrors( cudaFree(d_Rand) );
    sdkDeleteTimer( &hTimer);
    free(h_RandGPU);
    free(h_RandCPU);

    cudaDeviceReset();
    shrQAFinishExit(argc, (const char**)argv, (L1norm < 1e-6) ? QA_PASSED : QA_FAILED);
}
示例#25
0
/* GPUrandn */
void GPUrandn(const GPUtype &OUT) {

  curandStatus_t status;

  gpuTYPE_t type = gm->gputype.getType(OUT);


  gm->gmat->control.cacheClean();



  const void *gpuptr = gm->gputype.getGPUptr(OUT); // pointer to GPU memory
  int numel = gm->gputype.getNumel(OUT);           // number of elements
  int datasize = gm->gputype.getDataSize(OUT);     // bytes for each element


  gen = 0;
  // implement recovery procedure
  // try and if error try again

  // init curand
  if (curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT)!=CURAND_STATUS_SUCCESS) {
    mexErrMsgTxt(ERROR_CURAND_INIT);
  }
  //if (curandCreateGenerator(&gen,CURAND_RNG_QUASI_DEFAULT)!=CURAND_STATUS_SUCCESS) {
  //  mexErrMsgTxt(ERROR_CURAND_INIT);
  //}

  // randn requires even numbers
  // we split the execution in 2 parts (overlap if not even)

  // seed
  seed++;
  if (curandSetPseudoRandomGeneratorSeed(gen, time(NULL)+seed)!=CURAND_STATUS_SUCCESS) {
    mexErrMsgTxt(ERROR_CURAND_SEED);
  }

  unsigned int n = 0;

  if (type == gpuFLOAT) {
    n = numel;
  } else if (type == gpuCFLOAT) {
    n = numel*2;
  } else if (type == gpuDOUBLE) {
    n = numel;
  } else if (type == gpuCDOUBLE) {
    n = numel*2;
  }

  unsigned int even = (n%2) == 0;

  unsigned int offset = 0;
  unsigned int mysize = 0;


  unsigned int iter = 1;
  if (!even) {
    n = n-1;
    iter = 2;
  }

  if (type == gpuFLOAT) {
    float mean = 0.0;
    float std = 1.0;
    status = curandGenerateNormal(gen, (float *) gpuptr, n, mean, std);

    if (!even) {
      float *devData;
      if((cudaMalloc((void **)&devData, 4 * sizeof(float))) != cudaSuccess) {
        status = CURAND_STATUS_LAUNCH_FAILURE;
      } else {
        status = curandGenerateNormal(gen, devData, 4, mean, std);
        if (status==CURAND_STATUS_SUCCESS) {
          void *dst = (void *) ((UINTPTR gpuptr)+n*datasize);
          if (cudaMemcpy(dst, (void *) devData, datasize, cudaMemcpyDeviceToDevice)!=cudaSuccess) {
            status = CURAND_STATUS_LAUNCH_FAILURE;
          }
        }
        if(cudaFree(devData) != cudaSuccess) {
          status = CURAND_STATUS_LAUNCH_FAILURE;
        }
      }
    }

  } else if (type == gpuCFLOAT) {
    float mean = 0.0;
    float std = 1.0;
    status = curandGenerateNormal(gen, (float *) gpuptr, n, mean, std);
  } else if (type == gpuDOUBLE) {
    double mean = 0.0;
    double std = 1.0;
    status = curandGenerateNormalDouble(gen, (double *) gpuptr, n, mean, std);
    if (!even) {
      double *devData;
      if((cudaMalloc((void **)&devData, 4 * sizeof(double))) != cudaSuccess) {
        status = CURAND_STATUS_LAUNCH_FAILURE;
      } else {
        status = curandGenerateNormalDouble(gen, devData, 4, mean, std);
        if (status==CURAND_STATUS_SUCCESS) {
          void *dst = (void *) ((UINTPTR gpuptr)+n*datasize);
          if (cudaMemcpy(dst, (void *) devData, datasize, cudaMemcpyDeviceToDevice)!=cudaSuccess) {
            status = CURAND_STATUS_LAUNCH_FAILURE;
          }
        }
        if(cudaFree(devData) != cudaSuccess) {
          status = CURAND_STATUS_LAUNCH_FAILURE;
        }
      }
    }


  } else if (type == gpuCDOUBLE) {
    double mean = 0.0;
    double std = 1.0;
    status = curandGenerateNormalDouble(gen, (double *) gpuptr, n, mean, std);

  }

  if (status!=CURAND_STATUS_SUCCESS) {
    curandDestroyGenerator(gen);
    mexErrMsgTxt(ERROR_CURAND_GEN);
  }


  // destroy
  if (curandDestroyGenerator(gen)!=CURAND_STATUS_SUCCESS) {
    mexErrMsgTxt(ERROR_CURAND_DESTROY);
  }


}