void dsp::SingleThread::initialize () try { TimeSeries::auto_delete = false; operations.resize (0); // each timeseries created will be counted in new_time_series config->buffers = 0; if (thread_id < config->affinity.size()) set_affinity (config->affinity[thread_id]); // only the first thread should prepare the input if (thread_id == 0) config->prepare( manager->get_input() ); if (!unpacked) unpacked = new_time_series(); manager->set_output (unpacked); operations.push_back (manager.get()); #if HAVE_CUDA bool run_on_gpu = thread_id < config->get_cuda_ndevice(); cudaStream_t stream = 0; if (run_on_gpu) { // disable input buffering when data must be copied between devices if (config->get_total_nthread() > 1) config->input_buffering = false; int device = config->cuda_device[thread_id]; cerr << "dspsr: thread " << thread_id << " using CUDA device " << device << endl; int ndevice = 0; cudaGetDeviceCount(&ndevice); if (device >= ndevice) throw Error (InvalidParam, "dsp::SingleThread::initialize", "device=%d >= ndevice=%d", device, ndevice); cudaError err = cudaSetDevice (device); if (err != cudaSuccess) throw Error (InvalidState, "dsp::SingleThread::initialize", "cudaMalloc failed: %s", cudaGetErrorString(err)); unsigned nstream = count (config->cuda_device, (unsigned)device); if (nstream > 1) { cudaStreamCreate( &stream ); cerr << "dspsr: thread " << thread_id << " on stream " << stream << endl; } gpu_stream = stream; device_memory = new CUDA::DeviceMemory (stream); Unpacker* unpacker = manager->get_unpacker (); if (unpacker->get_device_supported( device_memory )) { if (Operation::verbose) cerr << "SingleThread: unpack on GraphicsPU" << endl; unpacker->set_device( device_memory ); unpacked->set_memory( device_memory ); BitSeries* bits = new BitSeries; bits->set_memory (new CUDA::PinnedMemory); manager->set_output (bits); } else { if (Operation::verbose) cerr << "SingleThread: unpack on CPU" << endl; TransferCUDA* transfer = new TransferCUDA; transfer->set_kind( cudaMemcpyHostToDevice ); transfer->set_input( unpacked ); unpacked = new_time_series (); unpacked->set_memory (device_memory); transfer->set_output( unpacked ); operations.push_back (transfer); } } #endif // HAVE_CUFFT } catch (Error& error) { throw error += "dsp::SingleThread::initialize"; }
void dsp::LoadToFITS::construct () try { // sets operations to zero length then adds IOManger/unpack SingleThread::construct (); bool run_on_gpu = false; #if HAVE_CUDA run_on_gpu = thread_id < config->get_cuda_ndevice(); cudaStream_t stream = reinterpret_cast<cudaStream_t>( gpu_stream ); #endif /* The following lines "wire up" the signal path, using containers to communicate the data between operations. */ // set up for optimal memory usage pattern Unpacker* unpacker = manager->get_unpacker(); if (!config->dedisperse && unpacker->get_order_supported (config->order)) unpacker->set_output_order (config->order); // get basic information about the observation Observation* obs = manager->get_info(); const unsigned nchan = obs->get_nchan (); const unsigned npol = obs->get_npol (); const unsigned ndim = obs->get_ndim (); const double rate = obs->get_rate () ; if (verbose) { cerr << "Source = " << obs->get_source() << endl; cerr << "Frequency = " << obs->get_centre_frequency() << endl; cerr << "Bandwidth = " << obs->get_bandwidth() << endl; cerr << "Channels = " << nchan << endl; cerr << "Sampling rate = " << rate << endl; cerr << "State = " << tostring(obs->get_state()) <<endl; } obs->set_dispersion_measure( config->dispersion_measure ); unsigned fb_nchan = config->filterbank.get_nchan(); unsigned nsample; double tsamp, samp_per_fb; unsigned tres_factor; double factor = obs->get_state() == Signal::Nyquist? 0.5 : 1.0; if (fb_nchan > 0) { // Strategy will be to tscrunch from Nyquist resolution to desired reso. // voltage samples per filterbank sample samp_per_fb = config->tsamp * rate; if (verbose) cerr << "voltage samples per filterbank sample="<<samp_per_fb << endl; // correction for number of samples per filterbank channel tres_factor = round(factor*samp_per_fb/fb_nchan); tsamp = tres_factor/factor*fb_nchan/rate; // voltage samples per output block nsample = round(samp_per_fb * config->nsblk); } else { samp_per_fb = 1.0; tres_factor = round(rate * config->tsamp); tsamp = tres_factor/factor * 1/rate; nsample = config->nsblk * tres_factor; } cerr << "digifits: requested tsamp=" << config->tsamp << " rate=" << rate << endl << " actual tsamp=" << tsamp << " (tscrunch=" << tres_factor << ")" << endl; if (verbose) cerr << "digifits: nsblk=" << config->nsblk << endl; // the unpacked input will occupy nbytes_per_sample double nbytes_per_sample = sizeof(float) * nchan * npol * ndim; double MB = 1024.0 * 1024.0; // ideally, block size would be a full output block, but this is too large // pick a nice fraction that will divide evently into maximum RAM // NB this doesn't account for copies (yet) if (verbose) cerr << "digifits: nsample * nbytes_per_sample=" << nsample * nbytes_per_sample << " config->maximum_RAM=" << config->maximum_RAM << endl; while (nsample * nbytes_per_sample > config->maximum_RAM) nsample /= 2; if (verbose) cerr << "digifits: block_size=" << (nbytes_per_sample*nsample)/MB << " MB " << "(" << nsample << " samp)" << endl; manager->set_block_size ( nsample ); // if running on multiple GPUs, make nsblk such that no buffering is // required if ((run_on_gpu) and (config->get_total_nthread() > 1)) { config->nsblk = nsample / samp_per_fb; if (verbose) cerr << "digifits: due to GPU multi-threading, setting nsblk="<<config->nsblk << endl; } TimeSeries* timeseries = unpacked; #if HAVE_CUDA if (run_on_gpu) { timeseries->set_memory (device_memory); timeseries->set_engine (new CUDA::TimeSeriesEngine (device_memory)); } #endif if (!obs->get_detected()) { cerr << "digifits: input data not detected" << endl; // if no filterbank specified if ((fb_nchan == 0) && (nchan == 1)) { throw Error(InvalidParam,"dsp::LoadToFITS::construct", "must specify filterbank scheme if single channel data"); } if ((config->coherent_dedisp) && (config->dispersion_measure != 0.0)) { cerr << "digifits: performing coherent dedispersion" << endl; kernel = new Dedispersion; kernel->set_dispersion_measure( config->dispersion_measure ); unsigned frequency_resolution = config->filterbank.get_freq_res (); cerr << "digifits: config->filterbank.get_freq_res= " << frequency_resolution << endl; if (frequency_resolution) { cerr << "digifits: setting filter length to " << frequency_resolution << endl; //kernel->set_frequency_resolution (frequency_resolution); kernel -> set_times_minimum_nfft (frequency_resolution); } } else { if (config->dispersion_measure != 0.0) cerr << "digifits: performing incoherent dedispersion" << endl; config->coherent_dedisp = false; } // filterbank is performing channelisation if (config->filterbank.get_nchan() > 1) { // If user specifies -FN:D, enable coherent dedispersion if (config->filterbank.get_convolve_when() == Filterbank::Config::During) { // during is the only option for filterbank config->filterbank.set_convolve_when( Filterbank::Config::During ); } else { config->coherent_dedisp = false; } #if HAVE_CUDA if (run_on_gpu) { config->filterbank.set_device ( device_memory.ptr() ); config->filterbank.set_stream ( gpu_stream ); } #endif filterbank = config->filterbank.create (); filterbank->set_nchan( config->filterbank.get_nchan() ); filterbank->set_input( timeseries ); filterbank->set_output( timeseries = new_TimeSeries() ); #if HAVE_CUDA if (run_on_gpu) timeseries->set_memory (device_memory); #endif if (config->coherent_dedisp && kernel) filterbank->set_response( kernel ); if ( !config->coherent_dedisp ) { unsigned freq_res = config->filterbank.get_freq_res(); if (freq_res > 1) filterbank->set_frequency_resolution ( freq_res ); } operations.push_back( filterbank.get() ); } // if convolution does not happen during filterbanking if (config->coherent_dedisp && config->filterbank.get_convolve_when() != Filterbank::Config::During) { cerr << "digifits: creating convolution operation" << endl; if (!convolution) convolution = new Convolution; if (!config->input_buffering) convolution->set_buffering_policy (NULL); convolution->set_response (kernel); //if (!config->integration_turns) // convolution->set_passband (passband); convolution->set_input (timeseries); convolution->set_output (timeseries = new_TimeSeries() ); // out of place #if HAVE_CUDA if (run_on_gpu) { timeseries->set_memory (device_memory); convolution->set_device (device_memory.ptr()); unsigned nchan = manager->get_info()->get_nchan(); if (fb_nchan) nchan *= fb_nchan; if (nchan >= 16) convolution->set_engine (new CUDA::ConvolutionEngineSpectral (stream)); else convolution->set_engine (new CUDA::ConvolutionEngine (stream)); } #endif operations.push_back (convolution.get()); } if (verbose) cerr << "digifits: creating detection operation" << endl; Detection* detection = new Detection; detection->set_input ( timeseries ); // always use coherence for GPU, pscrunch later if needed if (run_on_gpu) { #ifdef HAVE_CUDA if (npol == 2) { detection->set_output_state (Signal::Coherence); detection->set_engine (new CUDA::DetectionEngine(stream) ); detection->set_output_ndim (2); detection->set_output (timeseries); } else { detection->set_output_state (Signal::Intensity); detection->set_engine (new CUDA::DetectionEngine(stream) ); detection->set_output (timeseries = new_TimeSeries()); cerr << "detection->set_output(timeseries = newTimeSeries())" << endl; detection->set_output_ndim (1); timeseries->set_memory (device_memory); } #endif } else { switch (config->npol) { case 1: detection->set_output_state (Signal::Intensity); //detected = new_TimeSeries(); break; case 2: detection->set_output_state (Signal::PPQQ); //detected = new_TimeSeries(); break; case 4: detection->set_output_state (Signal::Coherence); // use this to avoid copies -- seem to segfault in multi-threaded //detection->set_output_ndim (2); break; default: throw Error(InvalidParam,"dsp::LoadToFITS::construct", "invalid polarization specified"); } detection->set_output (timeseries); } operations.push_back ( detection ); } #if HAVE_CUDA if (run_on_gpu) { // to support input buffering timeseries->set_engine (new CUDA::TimeSeriesEngine (device_memory)); } #endif TScrunch* tscrunch = new TScrunch; tscrunch->set_factor ( tres_factor ); tscrunch->set_input ( timeseries ); tscrunch->set_output ( timeseries = new_TimeSeries() ); #if HAVE_CUDA if ( run_on_gpu ) { tscrunch->set_engine ( new CUDA::TScrunchEngine(stream) ); timeseries->set_memory (device_memory); } #endif operations.push_back( tscrunch ); #if HAVE_CUDA if (run_on_gpu) { TransferCUDA* transfer = new TransferCUDA (stream); transfer->set_kind (cudaMemcpyDeviceToHost); transfer->set_input( timeseries ); transfer->set_output( timeseries = new_TimeSeries() ); operations.push_back (transfer); } #endif // need to do PolnReshape if have done on GPU (because uses the // hybrid npol=2, ndim=2 for the Stokes parameters) if (run_on_gpu) { PolnReshape* reshape = new PolnReshape; switch (config->npol) { case 4: reshape->set_state ( Signal::Coherence ); break; case 2: reshape->set_state ( Signal::PPQQ ); break; case 1: reshape->set_state ( Signal::Intensity ); break; default: throw Error(InvalidParam,"dsp::LoadToFITS::construct", "invalid polarization specified"); } reshape->set_input (timeseries ); reshape->set_output ( timeseries = new_TimeSeries() ); operations.push_back(reshape); } //else if (config->npol == 4) else if (false) { PolnReshape* reshape = new PolnReshape; reshape->set_state ( Signal::Coherence ); reshape->set_input (timeseries ); reshape->set_output ( timeseries = new_TimeSeries() ); operations.push_back (reshape); } if ( config->dedisperse ) { //if (verbose) cerr << "digifits: removing dispersion delays" << endl; SampleDelay* delay = new SampleDelay; delay->set_input (timeseries); delay->set_output (timeseries); delay->set_function (new Dedispersion::SampleDelay); operations.push_back( delay ); } // only do pscrunch for detected data -- NB always goes to Intensity bool do_pscrunch = (obs->get_npol() > 1) && (config->npol==1) && (obs->get_detected()); if (do_pscrunch) { //if (verbose) cerr << "digifits: creating pscrunch transformation" << endl; PScrunch* pscrunch = new PScrunch; pscrunch->set_input (timeseries); pscrunch->set_output (timeseries); operations.push_back( pscrunch ); } if (verbose) cerr << "digifits: creating output bitseries container" << endl; BitSeries* bitseries = new BitSeries; if (verbose) cerr << "digifits: creating PSRFITS digitizer with nbit=" << config->nbits << endl; FITSDigitizer* digitizer = new FITSDigitizer (config->nbits); digitizer->set_input (timeseries); digitizer->set_output (bitseries); // PSRFITS allows us to save the reference spectrum in each output block // "subint", so we can take advantage of this to store the exect // reference spectrum for later use. By default, we will rescale the // spectrum using values for exactly one block (nsblk samples). This // potentially improves the dynamic range, but makes the observaiton more // subject to transiennts. By calling set_rescale_nblock(N), the path // will keep a running mean/scale for N sample blocks. This is presented // to the user through rescale_seconds, which will choose the appropriate // block length to approximate the requested time interval. digitizer->set_rescale_samples (config->nsblk); if (config->rescale_constant) { cerr << "digifits: holding scales and offsets constant" << endl; digitizer->set_rescale_constant (true); } else if (config->rescale_seconds > 0) { double tblock = config->tsamp * config->nsblk; unsigned nblock = unsigned ( config->rescale_seconds/tblock + 0.5 ); if (nblock < 1) nblock = 1; digitizer->set_rescale_nblock (nblock); cerr << "digifits: using "<<nblock<<" blocks running mean for scales and constant ("<<tblock*nblock<<") seconds"<<endl; } operations.push_back( digitizer ); if (verbose) cerr << "digifits: creating PSRFITS output file" << endl; const char* output_filename = 0; if (!config->output_filename.empty()) output_filename = config->output_filename.c_str(); FITSOutputFile* outputfile = new FITSOutputFile (output_filename); outputfile->set_nsblk (config->nsblk); outputfile->set_nbit (config->nbits); outputfile->set_max_length (config->integration_length); outputFile = outputfile; outputFile->set_input (bitseries); operations.push_back( outputFile.get() ); // add a callback for the PSRFITS reference spectrum digitizer->update.connect ( dynamic_cast<FITSOutputFile*> (outputFile.get()), &FITSOutputFile::set_reference_spectrum); } catch (Error& error) { throw error += "dsp::LoadToFITS::construct"; }