fmat ModelWPAMGPU::hfun(fmat *state)
{
	// Measurement function h(x): evaluates the measurement model on the GPU for
	// every particle (one column of *state) and adds Gaussian observation noise
	// generated directly in device memory.
	//
	// Parameters:
	//   state - stateDimension x numberOfSamples particle matrix (column-major).
	// Returns:
	//   Matrix of the same shape holding the noisy predicted measurements.
	float* state_dev;
    float* oNoise_dev;
    int stateDimension = state->n_rows;
    int numberOfSamples = state->n_cols;
    float* meas_dev;

    fmat measurement(state->n_rows,state->n_cols);

	// Allocate device buffers: particles, observation noise, measurement output.
    cudaMalloc( &state_dev, (size_t) state->n_elem * sizeof(float)) ;
    cudaMalloc( &oNoise_dev, (size_t) numberOfSamples * stateDimension * sizeof(float)) ;
    cudaMalloc( &meas_dev, (size_t) numberOfSamples * stateDimension * sizeof(float)) ;

	// Copy particles to the GPU (armadillo fmat memory is contiguous).
    cudaMemcpy(state_dev,state->memptr(),(size_t) state->n_elem * sizeof(float), cudaMemcpyHostToDevice);

    // Generate observation noise on the device: one row of numberOfSamples
    // N(0, (50e-3)^2) samples per state dimension.
    // Previously nine rows were generated unconditionally, which wrote past
    // the end of oNoise_dev whenever stateDimension < 9; iterating over the
    // actual dimension keeps the old behavior for the 9-state model and is
    // safe for any other dimension.
    // NOTE(review): curandGenerateNormal requires an even sample count per
    // call -- numberOfSamples is assumed even; confirm at call sites.
    for (int row = 0; row < stateDimension; ++row)
    {
        curandGenerateNormal(gen, oNoise_dev + (size_t) row * numberOfSamples,
                             numberOfSamples, 0.0f, 50.0e-3f);
    }

    // measurement = h(state) + observation noise, computed by the kernel.
	callHfunKernel(state_dev,oNoise_dev,stateDimension,numberOfSamples,meas_dev);
    //printf("%s\n",cudaGetErrorString(cudaGetLastError()));

	// Blocking copy back to the host; this also synchronizes with the kernel.
	cudaMemcpy(measurement.memptr(),meas_dev,measurement.n_elem * sizeof(float), cudaMemcpyDeviceToHost);

	// Release all device buffers.
    cudaFree(state_dev);
	cudaFree(oNoise_dev);
	cudaFree(meas_dev);

    return measurement;
}
Ejemplo n.º 2
0
        // Fills `data` in place with i.i.d. samples drawn from N(mean, stddev^2),
        // generated directly into device memory via cuRAND.
        //
        // NOTE(review): curandGenerateNormal requires an even sample count for
        // pseudo-random generators, so an odd data.size() would make the
        // CHECK_CURAND below fail -- confirm callers only pass even-sized
        // tensors, or that the tensor type pads its allocation.
        void curand_generator::
        fill_gaussian (
            tensor& data,
            float mean,
            float stddev
        )
        {
            // Nothing to do for an empty tensor; avoids handing n == 0 to cuRAND.
            if (data.size() == 0)
                return;

            CHECK_CURAND(curandGenerateNormal((curandGenerator_t)handle, 
                                        data.device(),
                                        data.size(),
                                        mean,
                                        stddev));
        }
Ejemplo n.º 3
0
void cuda_randn(void *ptr, int numel, bool dbl)
{
    // Fills `ptr` (a DEVICE pointer) with `numel` standard-normal N(0,1)
    // samples: single precision when !dbl, double precision otherwise.
    //
    // The generator is created lazily on first call and reused afterwards.
    // It keeps cuRAND's default seed, so the sample stream is identical on
    // every program run (deterministic by construction).
    // NOTE(review): the lazy init below is not thread-safe -- confirm this
    // is only called from one host thread.
    static bool is_init = false;
    static curandGenerator_t stream;
    if (!is_init) {
        curandCreateGenerator(&stream, CURAND_RNG_PSEUDO_DEFAULT);
        is_init = true;
    }

    // Nothing to generate; also avoids passing n == 0 to cuRAND.
    if (numel <= 0)
        return;

    // curandGenerateNormal / curandGenerateNormalDouble require an EVEN
    // sample count. The original code passed numel straight through and
    // discarded the return status, so an odd numel silently produced no
    // samples at all. For odd counts, generate numel+1 samples into a
    // scratch buffer and copy the first numel back.
    if ((numel & 1) == 0) {
        if (!dbl) {
            curandGenerateNormal(stream, (float *)ptr, numel, 0.0f, 1.0f);
        } else {
            curandGenerateNormalDouble(stream, (double *)ptr, numel, 0.0, 1.0);
        }
    } else {
        size_t elem = dbl ? sizeof(double) : sizeof(float);
        void *tmp = 0;
        if (cudaMalloc(&tmp, (size_t)(numel + 1) * elem) != cudaSuccess)
            return;  // best-effort, matching the original silent contract

        curandStatus_t status;
        if (!dbl) {
            status = curandGenerateNormal(stream, (float *)tmp, numel + 1, 0.0f, 1.0f);
        } else {
            status = curandGenerateNormalDouble(stream, (double *)tmp, numel + 1, 0.0, 1.0);
        }
        if (status == CURAND_STATUS_SUCCESS) {
            cudaMemcpy(ptr, tmp, (size_t)numel * elem, cudaMemcpyDeviceToDevice);
        }
        cudaFree(tmp);
    }

    return;
}
Ejemplo n.º 4
0
// Fills `data` (a device pointer) with `size` samples drawn from
// N(mu, sigma^2) using this context's cuRAND generator.
// NOTE(review): curandGenerateNormal requires an even `size` for
// pseudo-random generators -- confirm callers respect this, since the
// cuda_check wrapper would trip on the resulting error status.
void Random<GPU>::gaussian (float *data, int size, const float mu, const float sigma) const
{ CHECK (sigma > 0.f);  // reject a degenerate (non-positive) spread up front
  cuda_check (curandGenerateNormal (dnnctx[did_]->curand_, data, size, mu, sigma));
}
fmat ModelWPAMGPU::ffun(fmat *current)
{
    // State-transition function: prediction = F * current + processNoise + u,
    // evaluated on the GPU for every particle (column) of *current.
    //
    // Parameters:
    //   current - stateDimension x numberOfSamples particle matrix.
    // Returns:
    //   Matrix of the same shape holding the predicted particles.
    //
    // NOTE(review): pNoiseSample and u are still drawn on the host although
    // the noise is regenerated on the device below (the host->device copies
    // are disabled); they are kept only because their n_elem sizes the device
    // allocations -- confirm whether the host sampling can be dropped.
    fmat prediction(current->n_rows,current->n_cols);
    fmat pNoiseSample = pNoise.sample(current->n_cols);
    fmat u = U.sample(current->n_cols);
    float* lastState_dev;
    float* F_dev;
	float* U_dev;
	float* pNoise_dev;
    int stateDimension = current->n_rows;
    int numberOfSamples = current->n_cols;
    float* newState_dev;

	// Allocate device buffers for the state, transition matrix, control
	// input, process noise and predicted state.
    cudaMalloc( &lastState_dev, (size_t) current->n_elem * sizeof(float)) ;
	cudaMalloc( &F_dev, (size_t) F.n_elem * sizeof(float)) ;
	cudaMalloc( &U_dev, (size_t) u.n_elem * sizeof(float)) ;
	cudaMalloc( &pNoise_dev, (size_t) pNoiseSample.n_elem * sizeof(float)) ;
	cudaMalloc( &newState_dev, (size_t) prediction.n_elem * sizeof(float)) ;

	// Copy particles and the transition matrix to the GPU.
    cudaMemcpy(lastState_dev,current->memptr(),(size_t) current->n_elem * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(F_dev,F.memptr(),(size_t) F.n_elem * sizeof(float), cudaMemcpyHostToDevice);
    //cudaMemcpy(U_dev,u.memptr(),(size_t) u.n_elem * sizeof(float), cudaMemcpyHostToDevice);
    //cudaMemcpy(pNoise_dev,pNoiseSample.memptr(),(size_t) pNoiseSample.n_elem * sizeof(float), cudaMemcpyHostToDevice);

    // Process noise: one row of numberOfSamples zero-mean samples per state
    // component, with the per-component standard deviations of the 9-state
    // model kept exactly as before (3x 50e-6, 3x 10e-6, 3x 100e-6).
    // NOTE(review): curandGenerateNormal requires an even numberOfSamples,
    // and this assumes pNoise_dev holds at least 9 rows -- confirm the model
    // dimension is 9.
    static const float pNoiseStd[9] = {
        50.0e-6f,  50.0e-6f,  50.0e-6f,
        10.0e-6f,  10.0e-6f,  10.0e-6f,
        100.0e-6f, 100.0e-6f, 100.0e-6f
    };
    for (unsigned int row = 0; row < 9; ++row)
    {
        curandGenerateNormal(gen, pNoise_dev + row*numberOfSamples, numberOfSamples, 0.0f, pNoiseStd[row]);
    }

    // Control input u: per-component mean (a) and standard deviation (b)
    // taken from the U distribution's batch entries.
    for (unsigned int i=0; i< 9 ;++i)
    {
        curandGenerateNormal(gen, U_dev+ i*numberOfSamples, numberOfSamples, U.batch.at(i)->a, U.batch.at(i)->b);
    }

    // prediction = F * current + processNoise + u, computed by the kernel.
    callFfunKernel(lastState_dev, F_dev, U_dev, pNoise_dev, stateDimension ,numberOfSamples,newState_dev);
    //printf("%s\n",cudaGetErrorString(cudaGetLastError()));

	// Blocking copy back to the host; this also synchronizes with the kernel.
    cudaMemcpy(prediction.memptr(),newState_dev,current->n_elem * sizeof(float), cudaMemcpyDeviceToHost);

	// Release all device buffers.
    cudaFree(lastState_dev);
	cudaFree(newState_dev);
	cudaFree(F_dev);
	cudaFree(U_dev);
	cudaFree(pNoise_dev);

    return prediction;
}
Ejemplo n.º 6
0
/* GPUrandn
 *
 * Fills the GPU array OUT with standard-normal N(0,1) samples, dispatching
 * on its element type (float / complex float / double / complex double).
 *
 * cuRAND's normal generators require an EVEN sample count; when n is odd the
 * code generates n-1 samples in place, then draws 4 spares into a scratch
 * buffer and copies one of them into the last slot.
 *
 * A fresh generator is created and destroyed on every call and re-seeded
 * from time(NULL) plus a monotonically increasing counter, so consecutive
 * calls within the same second still get different seeds.
 * Errors are reported through mexErrMsgTxt (MATLAB MEX context).
 */
void GPUrandn(const GPUtype &OUT) {

  // Track the outcome of whichever generation branch runs below.
  // Initialized to SUCCESS so an unrecognized element type (none of the
  // four branches taken) no longer reads an uninitialized value at the
  // final status check.
  curandStatus_t status = CURAND_STATUS_SUCCESS;

  gpuTYPE_t type = gm->gputype.getType(OUT);


  gm->gmat->control.cacheClean();



  const void *gpuptr = gm->gputype.getGPUptr(OUT); // pointer to GPU memory
  int numel = gm->gputype.getNumel(OUT);           // number of elements
  int datasize = gm->gputype.getDataSize(OUT);     // bytes for each element


  gen = 0;
  // implement recovery procedure
  // try and if error try again

  // init curand
  if (curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT)!=CURAND_STATUS_SUCCESS) {
    mexErrMsgTxt(ERROR_CURAND_INIT);
  }
  //if (curandCreateGenerator(&gen,CURAND_RNG_QUASI_DEFAULT)!=CURAND_STATUS_SUCCESS) {
  //  mexErrMsgTxt(ERROR_CURAND_INIT);
  //}

  // seed: time(NULL) alone would repeat within one second, hence the counter
  seed++;
  if (curandSetPseudoRandomGeneratorSeed(gen, time(NULL)+seed)!=CURAND_STATUS_SUCCESS) {
    mexErrMsgTxt(ERROR_CURAND_SEED);
  }

  // n = number of scalar samples to draw (complex types need 2 per element)
  unsigned int n = 0;

  if (type == gpuFLOAT) {
    n = numel;
  } else if (type == gpuCFLOAT) {
    n = numel*2;
  } else if (type == gpuDOUBLE) {
    n = numel;
  } else if (type == gpuCDOUBLE) {
    n = numel*2;
  }

  unsigned int even = (n%2) == 0;

  // curand normal generation needs an even count; generate n-1 in place for
  // odd n and patch the final element afterwards. (Complex types always have
  // even n, so only the real float/double branches carry the odd-n fixup.)
  if (!even) {
    n = n-1;
  }

  if (type == gpuFLOAT) {
    float mean = 0.0;
    float std = 1.0;
    status = curandGenerateNormal(gen, (float *) gpuptr, n, mean, std);

    if (!even) {
      // Draw 4 spare samples (smallest convenient even count) into scratch
      // device memory and copy one into the last element of OUT.
      float *devData;
      if((cudaMalloc((void **)&devData, 4 * sizeof(float))) != cudaSuccess) {
        status = CURAND_STATUS_LAUNCH_FAILURE;
      } else {
        status = curandGenerateNormal(gen, devData, 4, mean, std);
        if (status==CURAND_STATUS_SUCCESS) {
          void *dst = (void *) ((UINTPTR gpuptr)+n*datasize);
          if (cudaMemcpy(dst, (void *) devData, datasize, cudaMemcpyDeviceToDevice)!=cudaSuccess) {
            status = CURAND_STATUS_LAUNCH_FAILURE;
          }
        }
        if(cudaFree(devData) != cudaSuccess) {
          status = CURAND_STATUS_LAUNCH_FAILURE;
        }
      }
    }

  } else if (type == gpuCFLOAT) {
    float mean = 0.0;
    float std = 1.0;
    status = curandGenerateNormal(gen, (float *) gpuptr, n, mean, std);
  } else if (type == gpuDOUBLE) {
    double mean = 0.0;
    double std = 1.0;
    status = curandGenerateNormalDouble(gen, (double *) gpuptr, n, mean, std);
    if (!even) {
      // Same odd-n fixup as the float branch, in double precision.
      double *devData;
      if((cudaMalloc((void **)&devData, 4 * sizeof(double))) != cudaSuccess) {
        status = CURAND_STATUS_LAUNCH_FAILURE;
      } else {
        status = curandGenerateNormalDouble(gen, devData, 4, mean, std);
        if (status==CURAND_STATUS_SUCCESS) {
          void *dst = (void *) ((UINTPTR gpuptr)+n*datasize);
          if (cudaMemcpy(dst, (void *) devData, datasize, cudaMemcpyDeviceToDevice)!=cudaSuccess) {
            status = CURAND_STATUS_LAUNCH_FAILURE;
          }
        }
        if(cudaFree(devData) != cudaSuccess) {
          status = CURAND_STATUS_LAUNCH_FAILURE;
        }
      }
    }


  } else if (type == gpuCDOUBLE) {
    double mean = 0.0;
    double std = 1.0;
    status = curandGenerateNormalDouble(gen, (double *) gpuptr, n, mean, std);

  }

  if (status!=CURAND_STATUS_SUCCESS) {
    curandDestroyGenerator(gen);
    mexErrMsgTxt(ERROR_CURAND_GEN);
  }


  // destroy
  if (curandDestroyGenerator(gen)!=CURAND_STATUS_SUCCESS) {
    mexErrMsgTxt(ERROR_CURAND_DESTROY);
  }


}