fmat ModelWPAMGPU::hfun(fmat *state)
{
    float* state_dev;
    float* oNoise_dev;
    float* meas_dev;
    int stateDimension = state->n_rows;
    int numberOfSamples = state->n_cols;
    fmat measurement(state->n_rows, state->n_cols);
    //fmat oNoiseSample = oNoise.sample(state->n_cols);
    //measurement = state + oNoiseSample;

    // allocate memory on the GPU
    cudaMalloc(&state_dev, (size_t) state->n_elem * sizeof(float));
    cudaMalloc(&oNoise_dev, (size_t) numberOfSamples * stateDimension * sizeof(float));
    cudaMalloc(&meas_dev, (size_t) numberOfSamples * stateDimension * sizeof(float));

    // copy the particles to the GPU
    cudaMemcpy(state_dev, state->memptr(), (size_t) state->n_elem * sizeof(float), cudaMemcpyHostToDevice);

    // generate the observation noise directly on the GPU, one contiguous block
    // of numberOfSamples values per state dimension (the nine calls assume
    // stateDimension == 9)
    //cudaMemcpy(oNoise_dev,oNoiseSample.memptr(),(size_t) oNoiseSample.n_elem * sizeof(float), cudaMemcpyHostToDevice);
    curandGenerateNormal(gen, oNoise_dev,                     numberOfSamples, 0.0f, 50.0e-3f);
    curandGenerateNormal(gen, oNoise_dev +   numberOfSamples, numberOfSamples, 0.0f, 50.0e-3f);
    curandGenerateNormal(gen, oNoise_dev + 2*numberOfSamples, numberOfSamples, 0.0f, 50.0e-3f);
    curandGenerateNormal(gen, oNoise_dev + 3*numberOfSamples, numberOfSamples, 0.0f, 50.0e-3f);
    curandGenerateNormal(gen, oNoise_dev + 4*numberOfSamples, numberOfSamples, 0.0f, 50.0e-3f);
    curandGenerateNormal(gen, oNoise_dev + 5*numberOfSamples, numberOfSamples, 0.0f, 50.0e-3f);
    curandGenerateNormal(gen, oNoise_dev + 6*numberOfSamples, numberOfSamples, 0.0f, 50.0e-3f);
    curandGenerateNormal(gen, oNoise_dev + 7*numberOfSamples, numberOfSamples, 0.0f, 50.0e-3f);
    curandGenerateNormal(gen, oNoise_dev + 8*numberOfSamples, numberOfSamples, 0.0f, 50.0e-3f);

    // measurement = state + oNoise, evaluated in the kernel
    callHfunKernel(state_dev, oNoise_dev, stateDimension, numberOfSamples, meas_dev);
    //printf("%s\n",cudaGetErrorString(cudaGetLastError()));

    // copy the measurement back from the GPU
    cudaMemcpy(measurement.memptr(), meas_dev, measurement.n_elem * sizeof(float), cudaMemcpyDeviceToHost);

    // clean up the graphics card
    cudaFree(state_dev);
    cudaFree(oNoise_dev);
    cudaFree(meas_dev);

    return measurement;
}
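// Added note (not part of the original file): curandGenerateNormal on a
// pseudorandom generator produces values in Box-Muller pairs, so the requested
// count must be even or the call fails with CURAND_STATUS_LENGTH_NOT_MULTIPLE.
// The nine calls above pass numberOfSamples directly, and the per-row blocks
// are contiguous, so in practice the particle count must be even. A hedged
// helper that makes the requirement explicit (generateNormalEven is a
// hypothetical name):
#include <curand.h>

static curandStatus_t generateNormalEven(curandGenerator_t g, float *dst,
                                         size_t n, float mean, float stddev)
{
    // reject odd requests up front instead of letting cuRAND fail later;
    // the caller can then pad its buffers to an even length deliberately
    if (n % 2 != 0)
        return CURAND_STATUS_LENGTH_NOT_MULTIPLE;
    return curandGenerateNormal(g, dst, n, mean, stddev);
}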
void curand_generator::fill_gaussian(tensor& data, float mean, float stddev)
{
    if (data.size() == 0)
        return;

    CHECK_CURAND(curandGenerateNormal((curandGenerator_t)handle,
                                      data.device(),
                                      data.size(),
                                      mean,
                                      stddev));
}
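// Added note (not part of the original snippet): CHECK_CURAND is assumed to be
// an error-checking macro defined elsewhere in this library; its definition is
// not shown here. A minimal stand-in with the usual shape could look like:
#include <cstdio>
#include <cstdlib>
#include <curand.h>

#define CHECK_CURAND(call)                                          \
    do {                                                            \
        curandStatus_t st_ = (call);                                \
        if (st_ != CURAND_STATUS_SUCCESS) {                         \
            std::fprintf(stderr, "cuRAND error %d at %s:%d\n",      \
                         (int)st_, __FILE__, __LINE__);             \
            std::abort();                                           \
        }                                                           \
    } while (0)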
void cuda_randn(void *ptr, int numel, bool dbl)
{
    // lazily create one process-wide generator; note that it is never seeded,
    // so cuRAND's default seed (and hence the same sequence) is used every run
    static bool is_init = false;
    static curandGenerator_t stream;
    if (!is_init) {
        curandCreateGenerator(&stream, CURAND_RNG_PSEUDO_DEFAULT);
        is_init = true;
    }
    // numel must be even: the normal generators produce values in pairs
    if (!dbl) {
        curandGenerateNormal(stream, (float *)ptr, numel, 0, 1);
    } else {
        curandGenerateNormalDouble(stream, (double *)ptr, numel, 0, 1);
    }
}
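// Added sketch (cuda_randn_seeded is a hypothetical name, not part of the
// original): because the variant above never seeds its generator, every
// process draws the same numbers. Seeding once at initialization fixes that:
#include <ctime>
#include <curand.h>

void cuda_randn_seeded(void *ptr, int numel, bool dbl)
{
    static bool is_init = false;
    static curandGenerator_t stream;
    if (!is_init) {
        curandCreateGenerator(&stream, CURAND_RNG_PSEUDO_DEFAULT);
        // seed once so separate runs produce different sequences
        curandSetPseudoRandomGeneratorSeed(stream, (unsigned long long)time(NULL));
        is_init = true;
    }
    if (!dbl)
        curandGenerateNormal(stream, (float *)ptr, numel, 0.f, 1.f);
    else
        curandGenerateNormalDouble(stream, (double *)ptr, numel, 0.0, 1.0);
}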
void Random<GPU>::gaussian(float *data, int size, const float mu, const float sigma) const
{
    CHECK(sigma > 0.f);
    cuda_check(curandGenerateNormal(dnnctx[did_]->curand_, data, size, mu, sigma));
}
fmat ModelWPAMGPU::ffun(fmat *current)
{
    fmat prediction(current->n_rows, current->n_cols);
    fmat pNoiseSample = pNoise.sample(current->n_cols);
    fmat u = U.sample(current->n_cols);
    float* lastState_dev;
    float* F_dev;
    float* U_dev;
    float* pNoise_dev;
    float* newState_dev;
    int stateDimension = current->n_rows;
    int numberOfSamples = current->n_cols;

    // allocate memory on the GPU
    cudaMalloc(&lastState_dev, (size_t) current->n_elem * sizeof(float));
    cudaMalloc(&F_dev, (size_t) F.n_elem * sizeof(float));
    cudaMalloc(&U_dev, (size_t) u.n_elem * sizeof(float));
    cudaMalloc(&pNoise_dev, (size_t) pNoiseSample.n_elem * sizeof(float));
    cudaMalloc(&newState_dev, (size_t) prediction.n_elem * sizeof(float));

    // copy the particles and the transition matrix F to the GPU
    cudaMemcpy(lastState_dev, current->memptr(), (size_t) current->n_elem * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(F_dev, F.memptr(), (size_t) F.n_elem * sizeof(float), cudaMemcpyHostToDevice);
    //cudaMemcpy(U_dev,u.memptr(),(size_t) u.n_elem * sizeof(float), cudaMemcpyHostToDevice);
    //cudaMemcpy(pNoise_dev,pNoiseSample.memptr(),(size_t) pNoiseSample.n_elem * sizeof(float), cudaMemcpyHostToDevice);

    // process noise, generated directly on the GPU with a per-row standard
    // deviation (the nine calls assume stateDimension == 9)
    curandGenerateNormal(gen, pNoise_dev,                     numberOfSamples, 0.0f,  50.0e-6f);
    curandGenerateNormal(gen, pNoise_dev +   numberOfSamples, numberOfSamples, 0.0f,  50.0e-6f);
    curandGenerateNormal(gen, pNoise_dev + 2*numberOfSamples, numberOfSamples, 0.0f,  50.0e-6f);
    curandGenerateNormal(gen, pNoise_dev + 3*numberOfSamples, numberOfSamples, 0.0f,  10.0e-6f);
    curandGenerateNormal(gen, pNoise_dev + 4*numberOfSamples, numberOfSamples, 0.0f,  10.0e-6f);
    curandGenerateNormal(gen, pNoise_dev + 5*numberOfSamples, numberOfSamples, 0.0f,  10.0e-6f);
    curandGenerateNormal(gen, pNoise_dev + 6*numberOfSamples, numberOfSamples, 0.0f, 100.0e-6f);
    curandGenerateNormal(gen, pNoise_dev + 7*numberOfSamples, numberOfSamples, 0.0f, 100.0e-6f);
    curandGenerateNormal(gen, pNoise_dev + 8*numberOfSamples, numberOfSamples, 0.0f, 100.0e-6f);

    // control input U, one row per state dimension with its own mean/stddev
    for (unsigned int i = 0; i < 9; ++i) {
        curandGenerateNormal(gen, U_dev + i*numberOfSamples, numberOfSamples,
                             U.batch.at(i)->a, U.batch.at(i)->b);
    }

    // prediction = F * current + pNoiseSample + u, evaluated in the kernel
    callFfunKernel(lastState_dev, F_dev, U_dev, pNoise_dev, stateDimension, numberOfSamples, newState_dev);
    //printf("%s\n",cudaGetErrorString(cudaGetLastError()));

    // copy the prediction back from the GPU
    cudaMemcpy(prediction.memptr(), newState_dev, current->n_elem * sizeof(float), cudaMemcpyDeviceToHost);

    // clean up the graphics card
    cudaFree(lastState_dev);
    cudaFree(newState_dev);
    cudaFree(F_dev);
    cudaFree(U_dev);
    cudaFree(pNoise_dev);

    return prediction;
}
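// Added sketch (not part of the original file): the nine hand-unrolled pNoise
// calls in ffun could be driven by a per-row sigma table instead. The values
// below are copied verbatim from the calls above, which assume
// stateDimension == 9; PNOISE_SIGMA is a hypothetical name:
static const float PNOISE_SIGMA[9] = {
     50.0e-6f,  50.0e-6f,  50.0e-6f,   // rows 0-2
     10.0e-6f,  10.0e-6f,  10.0e-6f,   // rows 3-5
    100.0e-6f, 100.0e-6f, 100.0e-6f    // rows 6-8
};

// inside ffun, this loop would replace the unrolled block:
// for (int i = 0; i < 9; ++i)
//     curandGenerateNormal(gen, pNoise_dev + i*numberOfSamples,
//                          numberOfSamples, 0.0f, PNOISE_SIGMA[i]);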
/* GPUrandn */
void GPUrandn(const GPUtype &OUT)
{
    curandStatus_t status = CURAND_STATUS_SUCCESS;
    gpuTYPE_t type = gm->gputype.getType(OUT);
    gm->gmat->control.cacheClean();

    const void *gpuptr = gm->gputype.getGPUptr(OUT); // pointer to GPU memory
    int numel = gm->gputype.getNumel(OUT);           // number of elements
    int datasize = gm->gputype.getDataSize(OUT);     // bytes per element

    gen = 0;

    // init curand
    // TODO: implement a recovery procedure (try, and on error try again)
    if (curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS) {
        mexErrMsgTxt(ERROR_CURAND_INIT);
    }
    //if (curandCreateGenerator(&gen,CURAND_RNG_QUASI_DEFAULT)!=CURAND_STATUS_SUCCESS) {
    //  mexErrMsgTxt(ERROR_CURAND_INIT);
    //}

    // seed the generator; the incrementing counter avoids reusing the same
    // time-based seed on back-to-back calls within one second
    seed++;
    if (curandSetPseudoRandomGeneratorSeed(gen, time(NULL) + seed) != CURAND_STATUS_SUCCESS) {
        mexErrMsgTxt(ERROR_CURAND_SEED);
    }

    // randn requires an even count: for odd lengths, generate the largest
    // even prefix first, then patch the final element separately
    unsigned int n = 0;
    if (type == gpuFLOAT) {
        n = numel;
    } else if (type == gpuCFLOAT) {
        n = numel * 2;
    } else if (type == gpuDOUBLE) {
        n = numel;
    } else if (type == gpuCDOUBLE) {
        n = numel * 2;
    }

    unsigned int even = (n % 2) == 0;
    if (!even) {
        n = n - 1;
    }

    if (type == gpuFLOAT) {
        float mean = 0.0;
        float std = 1.0;
        status = curandGenerateNormal(gen, (float *) gpuptr, n, mean, std);
        if (!even) {
            // generate a small even-sized scratch batch and copy a single
            // value into the last (odd) element
            float *devData;
            if (cudaMalloc((void **)&devData, 4 * sizeof(float)) != cudaSuccess) {
                status = CURAND_STATUS_LAUNCH_FAILURE;
            } else {
                status = curandGenerateNormal(gen, devData, 4, mean, std);
                if (status == CURAND_STATUS_SUCCESS) {
                    void *dst = (void *) ((UINTPTR gpuptr) + n * datasize);
                    if (cudaMemcpy(dst, (void *) devData, datasize, cudaMemcpyDeviceToDevice) != cudaSuccess) {
                        status = CURAND_STATUS_LAUNCH_FAILURE;
                    }
                }
                if (cudaFree(devData) != cudaSuccess) {
                    status = CURAND_STATUS_LAUNCH_FAILURE;
                }
            }
        }
    } else if (type == gpuCFLOAT) {
        // complex counts are numel*2 and therefore always even
        float mean = 0.0;
        float std = 1.0;
        status = curandGenerateNormal(gen, (float *) gpuptr, n, mean, std);
    } else if (type == gpuDOUBLE) {
        double mean = 0.0;
        double std = 1.0;
        status = curandGenerateNormalDouble(gen, (double *) gpuptr, n, mean, std);
        if (!even) {
            double *devData;
            if (cudaMalloc((void **)&devData, 4 * sizeof(double)) != cudaSuccess) {
                status = CURAND_STATUS_LAUNCH_FAILURE;
            } else {
                status = curandGenerateNormalDouble(gen, devData, 4, mean, std);
                if (status == CURAND_STATUS_SUCCESS) {
                    void *dst = (void *) ((UINTPTR gpuptr) + n * datasize);
                    if (cudaMemcpy(dst, (void *) devData, datasize, cudaMemcpyDeviceToDevice) != cudaSuccess) {
                        status = CURAND_STATUS_LAUNCH_FAILURE;
                    }
                }
                if (cudaFree(devData) != cudaSuccess) {
                    status = CURAND_STATUS_LAUNCH_FAILURE;
                }
            }
        }
    } else if (type == gpuCDOUBLE) {
        double mean = 0.0;
        double std = 1.0;
        status = curandGenerateNormalDouble(gen, (double *) gpuptr, n, mean, std);
    }

    if (status != CURAND_STATUS_SUCCESS) {
        curandDestroyGenerator(gen);
        mexErrMsgTxt(ERROR_CURAND_GEN);
    }

    // destroy
    if (curandDestroyGenerator(gen) != CURAND_STATUS_SUCCESS) {
        mexErrMsgTxt(ERROR_CURAND_DESTROY);
    }
}
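// Added sketch (generateNormalAnyLength is a hypothetical name, not part of
// the original): GPUrandn's odd-length workaround distilled into a
// self-contained float-only helper. Generate the even prefix in place, then
// generate a small even-sized scratch batch and copy one value to the tail:
#include <cuda_runtime.h>
#include <curand.h>

curandStatus_t generateNormalAnyLength(curandGenerator_t g, float *dst,
                                       size_t n, float mean, float stddev)
{
    size_t even = n - (n % 2); // largest even prefix
    curandStatus_t st = CURAND_STATUS_SUCCESS;
    if (even > 0)
        st = curandGenerateNormal(g, dst, even, mean, stddev);
    if (st == CURAND_STATUS_SUCCESS && even != n) {
        float *scratch; // 2 floats keep the scratch request even
        if (cudaMalloc((void **)&scratch, 2 * sizeof(float)) != cudaSuccess)
            return CURAND_STATUS_ALLOCATION_FAILED;
        st = curandGenerateNormal(g, scratch, 2, mean, stddev);
        if (st == CURAND_STATUS_SUCCESS &&
            cudaMemcpy(dst + even, scratch, sizeof(float),
                       cudaMemcpyDeviceToDevice) != cudaSuccess)
            st = CURAND_STATUS_LAUNCH_FAILURE;
        cudaFree(scratch);
    }
    return st;
}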