Ejemplos de TransferCUDA en C++ (Cpp)

Lenguaje de programación: C++ (Cpp)

Clase / Tipo: TransferCUDA

Ejemplos en hotexamples.com: 2

C++ (Cpp) TransferCUDA - 2 ejemplos encontrados. Estos son los ejemplos en C++ (Cpp) del mundo real mejor valorados de TransferCUDA extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Métodos usados con frecuencia

Mostrar Ocultar

set_input(2)

set_kind(2)

set_output(2)

Métodos usados con frecuencia

set_input (2)

set_kind (2)

set_output (2)

Ejemplo n.º 1

Mostrar archivo

Archivo: SingleThread.C Proyecto: gitj/dspsr

void dsp::SingleThread::initialize () try
{
  TimeSeries::auto_delete = false;

  operations.resize (0);

  // each timeseries created will be counted in new_time_series
  config->buffers = 0;

  if (thread_id < config->affinity.size())
    set_affinity (config->affinity[thread_id]);

  // only the first thread should prepare the input
  if (thread_id == 0)
    config->prepare( manager->get_input() );

  if (!unpacked)
    unpacked = new_time_series();

  manager->set_output (unpacked);

  operations.push_back (manager.get());

#if HAVE_CUDA

  bool run_on_gpu = thread_id < config->get_cuda_ndevice();

  cudaStream_t stream = 0;

  if (run_on_gpu)
  {
    // disable input buffering when data must be copied between devices
    if (config->get_total_nthread() > 1)
      config->input_buffering = false;

    int device = config->cuda_device[thread_id];
    cerr << "dspsr: thread " << thread_id 
         << " using CUDA device " << device << endl;

    int ndevice = 0;
    cudaGetDeviceCount(&ndevice);

    if (device >= ndevice)
      throw Error (InvalidParam, "dsp::SingleThread::initialize",
                   "device=%d >= ndevice=%d", device, ndevice);

    cudaError err = cudaSetDevice (device);
    if (err != cudaSuccess)
      throw Error (InvalidState, "dsp::SingleThread::initialize",
                   "cudaMalloc failed: %s", cudaGetErrorString(err));

    unsigned nstream = count (config->cuda_device, (unsigned)device);

    if (nstream > 1)
    {
      cudaStreamCreate( &stream );
      cerr << "dspsr: thread " << thread_id << " on stream " << stream << endl;
    }

    gpu_stream = stream;

    device_memory = new CUDA::DeviceMemory (stream);

    Unpacker* unpacker = manager->get_unpacker ();
    if (unpacker->get_device_supported( device_memory ))
    {
      if (Operation::verbose)
        cerr << "SingleThread: unpack on GraphicsPU" << endl;

      unpacker->set_device( device_memory );
      unpacked->set_memory( device_memory );
        
      BitSeries* bits = new BitSeries;
      bits->set_memory (new CUDA::PinnedMemory);
      manager->set_output (bits);
    }
    else
    {
      if (Operation::verbose)
        cerr << "SingleThread: unpack on CPU" << endl;

      TransferCUDA* transfer = new TransferCUDA;
      transfer->set_kind( cudaMemcpyHostToDevice );
      transfer->set_input( unpacked );
        
      unpacked = new_time_series ();
      unpacked->set_memory (device_memory);
      transfer->set_output( unpacked );
      operations.push_back (transfer);
    }    
  }

#endif // HAVE_CUFFT

}
catch (Error& error)
{
  throw error += "dsp::SingleThread::initialize";
}

Ejemplo n.º 2

Mostrar archivo

Archivo: LoadToFITS.C Proyecto: demorest/dspsr

void dsp::LoadToFITS::construct () try
{
  // sets operations to zero length then adds IOManger/unpack
  SingleThread::construct ();

  bool run_on_gpu = false;
#if HAVE_CUDA
  run_on_gpu = thread_id < config->get_cuda_ndevice();
  cudaStream_t stream = reinterpret_cast<cudaStream_t>( gpu_stream );
#endif

  /*
    The following lines "wire up" the signal path, using containers
    to communicate the data between operations.
  */

  // set up for optimal memory usage pattern

  Unpacker* unpacker = manager->get_unpacker();
  
  if (!config->dedisperse && unpacker->get_order_supported (config->order))
    unpacker->set_output_order (config->order);

  // get basic information about the observation

  Observation* obs = manager->get_info();
  const unsigned nchan = obs->get_nchan ();
  const unsigned npol = obs->get_npol ();
  const unsigned ndim = obs->get_ndim ();
  const double rate = obs->get_rate () ;

  if (verbose)
  {
    cerr << "Source = " << obs->get_source() << endl;
    cerr << "Frequency = " << obs->get_centre_frequency() << endl;
    cerr << "Bandwidth = " << obs->get_bandwidth() << endl;
    cerr << "Channels = " << nchan << endl;
    cerr << "Sampling rate = " << rate << endl;
    cerr << "State = " << tostring(obs->get_state()) <<endl;
  }

  obs->set_dispersion_measure( config->dispersion_measure );

  unsigned fb_nchan = config->filterbank.get_nchan();
  unsigned nsample;
  double tsamp, samp_per_fb;
  unsigned tres_factor;
  double factor = obs->get_state() == Signal::Nyquist? 0.5 : 1.0;

  if (fb_nchan > 0)
  {
    // Strategy will be to tscrunch from Nyquist resolution to desired reso.
    // voltage samples per filterbank sample
    samp_per_fb = config->tsamp * rate;
    if (verbose)
      cerr << "voltage samples per filterbank sample="<<samp_per_fb << endl;
    // correction for number of samples per filterbank channel
    tres_factor = round(factor*samp_per_fb/fb_nchan);
    tsamp = tres_factor/factor*fb_nchan/rate;

    // voltage samples per output block
    nsample = round(samp_per_fb * config->nsblk);
  }
  else
  {
    samp_per_fb = 1.0;
    tres_factor = round(rate * config->tsamp);
    tsamp = tres_factor/factor * 1/rate;
    nsample = config->nsblk * tres_factor;
  }

  cerr << "digifits: requested tsamp=" << config->tsamp << " rate=" << rate << endl 
       << "             actual tsamp=" << tsamp << " (tscrunch=" << tres_factor << ")" << endl;
  if (verbose)
    cerr << "digifits: nsblk=" << config->nsblk << endl;

  // the unpacked input will occupy nbytes_per_sample
  double nbytes_per_sample = sizeof(float) * nchan * npol * ndim;
  double MB = 1024.0 * 1024.0;

  // ideally, block size would be a full output block, but this is too large
  // pick a nice fraction that will divide evently into maximum RAM
  // NB this doesn't account for copies (yet)

  if (verbose)
    cerr << "digifits: nsample * nbytes_per_sample=" << nsample * nbytes_per_sample 
         << " config->maximum_RAM=" << config->maximum_RAM << endl;
  while (nsample * nbytes_per_sample > config->maximum_RAM) nsample /= 2;

  if (verbose)
    cerr << "digifits: block_size=" << (nbytes_per_sample*nsample)/MB 
         << " MB " << "(" << nsample << " samp)" << endl;

  manager->set_block_size ( nsample );

  // if running on multiple GPUs, make nsblk such that no buffering is
  // required
  if ((run_on_gpu) and (config->get_total_nthread() > 1))
  {
    config->nsblk = nsample / samp_per_fb;
    if (verbose)
      cerr << "digifits: due to GPU multi-threading, setting nsblk="<<config->nsblk << endl;
  }

  TimeSeries* timeseries = unpacked;

#if HAVE_CUDA
  if (run_on_gpu)
  {
    timeseries->set_memory (device_memory);
    timeseries->set_engine (new CUDA::TimeSeriesEngine (device_memory));
  }
#endif

  if (!obs->get_detected())
  {
    cerr << "digifits: input data not detected" << endl;

    // if no filterbank specified
    if ((fb_nchan == 0) && (nchan == 1))
    {
      throw Error(InvalidParam,"dsp::LoadToFITS::construct",
          "must specify filterbank scheme if single channel data");
    }

    if ((config->coherent_dedisp) && (config->dispersion_measure != 0.0))
    {
      cerr << "digifits: performing coherent dedispersion" << endl;
      kernel = new Dedispersion;
      kernel->set_dispersion_measure( config->dispersion_measure );

      unsigned frequency_resolution = config->filterbank.get_freq_res ();
      cerr << "digifits: config->filterbank.get_freq_res= " << frequency_resolution << endl;
      if (frequency_resolution)
      {
        cerr << "digifits: setting filter length to " << frequency_resolution << endl;
        //kernel->set_frequency_resolution (frequency_resolution);
        kernel -> set_times_minimum_nfft (frequency_resolution);
      }
    }
    else
    {
      if (config->dispersion_measure != 0.0)
        cerr << "digifits: performing incoherent dedispersion" << endl;
      config->coherent_dedisp = false;
    }

    // filterbank is performing channelisation
    if (config->filterbank.get_nchan() > 1)
    {
      // If user specifies -FN:D, enable coherent dedispersion
      if (config->filterbank.get_convolve_when() == Filterbank::Config::During)
      {
        // during is the only option for filterbank
        config->filterbank.set_convolve_when( Filterbank::Config::During );
      }
      else
      {
        config->coherent_dedisp = false;
      }

#if HAVE_CUDA
      if (run_on_gpu)
      {
        config->filterbank.set_device ( device_memory.ptr() );
        config->filterbank.set_stream ( gpu_stream );
      }
#endif

      filterbank = config->filterbank.create ();

      filterbank->set_nchan( config->filterbank.get_nchan() );
      filterbank->set_input( timeseries );
      filterbank->set_output( timeseries = new_TimeSeries() );

#if HAVE_CUDA
      if (run_on_gpu)
        timeseries->set_memory (device_memory);
#endif

      if (config->coherent_dedisp && kernel)
        filterbank->set_response( kernel );

      if ( !config->coherent_dedisp )
      {
        unsigned freq_res = config->filterbank.get_freq_res();
        if (freq_res > 1)
          filterbank->set_frequency_resolution ( freq_res );
      }

      operations.push_back( filterbank.get() );
    }

    // if convolution does not happen during filterbanking
    if (config->coherent_dedisp && config->filterbank.get_convolve_when() != Filterbank::Config::During)
    {
      cerr << "digifits: creating convolution operation" << endl;

      if (!convolution)
        convolution = new Convolution;
      
      if (!config->input_buffering)
        convolution->set_buffering_policy (NULL);
      
      convolution->set_response (kernel);
      //if (!config->integration_turns)
      //  convolution->set_passband (passband);
      
      convolution->set_input  (timeseries);  
      convolution->set_output (timeseries = new_TimeSeries() );    // out of place

#if HAVE_CUDA
      if (run_on_gpu)
      { 
        timeseries->set_memory (device_memory);
        convolution->set_device (device_memory.ptr());
        unsigned nchan = manager->get_info()->get_nchan();
        if (fb_nchan)
          nchan *= fb_nchan;
        if (nchan >= 16)
          convolution->set_engine (new CUDA::ConvolutionEngineSpectral (stream));
        else
          convolution->set_engine (new CUDA::ConvolutionEngine (stream));
      }
#endif
    
      operations.push_back (convolution.get());
    }

    if (verbose)
	    cerr << "digifits: creating detection operation" << endl;
      
    Detection* detection = new Detection;
    detection->set_input ( timeseries );

    // always use coherence for GPU, pscrunch later if needed
    if (run_on_gpu)
    {
#ifdef HAVE_CUDA
      if (npol == 2)
      {
        detection->set_output_state (Signal::Coherence);
        detection->set_engine (new CUDA::DetectionEngine(stream) );
        detection->set_output_ndim (2);
        detection->set_output (timeseries);
      }
      else
      {
        detection->set_output_state (Signal::Intensity);
        detection->set_engine (new CUDA::DetectionEngine(stream) );
        detection->set_output (timeseries = new_TimeSeries());
        cerr << "detection->set_output(timeseries = newTimeSeries())" << endl;
        detection->set_output_ndim (1);
        timeseries->set_memory (device_memory);
      }
#endif
    }
    else
    {
      switch (config->npol) 
      {
        case 1:
          detection->set_output_state (Signal::Intensity);
          //detected = new_TimeSeries();
          break;
        case 2:
          detection->set_output_state (Signal::PPQQ);
          //detected = new_TimeSeries();
          break;
        case 4:
          detection->set_output_state (Signal::Coherence);
          // use this to avoid copies -- seem to segfault in multi-threaded
          //detection->set_output_ndim (2);
          break;
        default:
          throw Error(InvalidParam,"dsp::LoadToFITS::construct",
              "invalid polarization specified");
      }
      detection->set_output (timeseries);
    }

    operations.push_back ( detection );
  }

#if HAVE_CUDA
  if (run_on_gpu)
  {
    // to support input buffering
    timeseries->set_engine (new CUDA::TimeSeriesEngine (device_memory));
  }
#endif

  TScrunch* tscrunch = new TScrunch;
  tscrunch->set_factor ( tres_factor );
  tscrunch->set_input ( timeseries );
  tscrunch->set_output ( timeseries = new_TimeSeries() );

#if HAVE_CUDA
  if ( run_on_gpu )
  {
    tscrunch->set_engine ( new CUDA::TScrunchEngine(stream) );
    timeseries->set_memory (device_memory);
  }
#endif
  operations.push_back( tscrunch );

#if HAVE_CUDA
  if (run_on_gpu)
  {
    TransferCUDA* transfer = new TransferCUDA (stream);
    transfer->set_kind (cudaMemcpyDeviceToHost);
    transfer->set_input( timeseries );
    transfer->set_output( timeseries = new_TimeSeries() );
    operations.push_back (transfer);
  }
#endif

  // need to do PolnReshape if have done on GPU (because uses the
  // hybrid npol=2, ndim=2 for the Stokes parameters)
  if (run_on_gpu)
  {
    PolnReshape* reshape = new PolnReshape;
    switch (config->npol)
    {
      case 4:
        reshape->set_state ( Signal::Coherence );
        break;
      case 2:
        reshape->set_state ( Signal::PPQQ );
        break;
      case 1:
        reshape->set_state ( Signal::Intensity );
        break;
      default: 
        throw Error(InvalidParam,"dsp::LoadToFITS::construct",
            "invalid polarization specified");
    }
    reshape->set_input (timeseries );
    reshape->set_output ( timeseries = new_TimeSeries() );
    operations.push_back(reshape);
  }
  //else if (config->npol == 4)
  else if (false)
  {
    PolnReshape* reshape = new PolnReshape;
    reshape->set_state ( Signal::Coherence );
    reshape->set_input (timeseries );
    reshape->set_output ( timeseries = new_TimeSeries() );
    operations.push_back (reshape);
  }

  if ( config->dedisperse )
  {
    //if (verbose)
      cerr << "digifits: removing dispersion delays" << endl;

    SampleDelay* delay = new SampleDelay;

    delay->set_input (timeseries);
    delay->set_output (timeseries);
    delay->set_function (new Dedispersion::SampleDelay);

    operations.push_back( delay );
  }


  // only do pscrunch for detected data -- NB always goes to Intensity
  bool do_pscrunch = (obs->get_npol() > 1) && (config->npol==1) 
    && (obs->get_detected());
  if (do_pscrunch)
  {
    //if (verbose)
      cerr << "digifits: creating pscrunch transformation" << endl;

    PScrunch* pscrunch = new PScrunch;
    pscrunch->set_input (timeseries);
    pscrunch->set_output (timeseries);

    operations.push_back( pscrunch );
  }

  if (verbose)
    cerr << "digifits: creating output bitseries container" << endl;

  BitSeries* bitseries = new BitSeries;

  if (verbose)
    cerr << "digifits: creating PSRFITS digitizer with nbit="
         << config->nbits << endl;

  FITSDigitizer* digitizer = new FITSDigitizer (config->nbits);
  digitizer->set_input (timeseries);
  digitizer->set_output (bitseries);

  // PSRFITS allows us to save the reference spectrum in each output block
  // "subint", so we can take advantage of this to store the exect
  // reference spectrum for later use.  By default, we will rescale the 
  // spectrum using values for exactly one block (nsblk samples).  This
  // potentially improves the dynamic range, but makes the observaiton more
  // subject to transiennts.  By calling set_rescale_nblock(N), the path
  // will keep a running mean/scale for N sample blocks.  This is presented
  // to the user through rescale_seconds, which will choose the appropriate
  // block length to approximate the requested time interval.
  digitizer->set_rescale_samples (config->nsblk);
  if (config->rescale_constant)
  {
    cerr << "digifits: holding scales and offsets constant" << endl;
    digitizer->set_rescale_constant (true);
  }
  else if (config->rescale_seconds > 0)
  {
    double tblock = config->tsamp * config->nsblk;
    unsigned nblock = unsigned ( config->rescale_seconds/tblock + 0.5 );
    if (nblock < 1) nblock = 1;
    digitizer->set_rescale_nblock (nblock);
    cerr << "digifits: using "<<nblock<<" blocks running mean for scales and constant ("<<tblock*nblock<<") seconds"<<endl;
  }

  operations.push_back( digitizer );

  if (verbose)
    cerr << "digifits: creating PSRFITS output file" << endl;

  const char* output_filename = 0;
  if (!config->output_filename.empty())
    output_filename = config->output_filename.c_str();

  FITSOutputFile* outputfile = new FITSOutputFile (output_filename);
  outputfile->set_nsblk (config->nsblk);
  outputfile->set_nbit (config->nbits);
  outputfile->set_max_length (config->integration_length);
  outputFile = outputfile;
  outputFile->set_input (bitseries);

  operations.push_back( outputFile.get() );

  // add a callback for the PSRFITS reference spectrum
  digitizer->update.connect (
      dynamic_cast<FITSOutputFile*> (outputFile.get()), 
      &FITSOutputFile::set_reference_spectrum);
}
catch (Error& error)
{
  throw error += "dsp::LoadToFITS::construct";
}