/**
 * Runs the conjugate-gradient Spirit reconstruction for one incoming job.
 *
 * Messages whose set/slice do not match this gadget are forwarded downstream
 * untouched. Otherwise the job's host-side buffers are uploaded to the device,
 * the encoding operator (E_) and the Spirit operator (S_) are configured, the
 * solver is invoked, the result is optionally cropped to the sequence matrix
 * size, the coil images are combined by root-sum-of-squares, and the frames
 * are handed to put_frames_on_que().
 *
 * @param m1 Image header message; released here on success.
 * @param m2 Reconstruction job carrying the host-side csm/data/trajectory/dcw/reg arrays.
 * @return GADGET_OK on success, GADGET_FAIL on any validation or solver failure,
 *         or the result of putq() when the message is forwarded downstream.
 */
int gpuCgSpiritGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<GenericReconJob> *m2)
  {
    // Is this data for this gadget's set/slice?
    //

    if( m1->getObjectPtr()->set != set_number_ || m1->getObjectPtr()->slice != slice_number_ ) {
      // No, pass it downstream...
      return this->next()->putq(m1);
    }

    // The timer object lives until it is reset at the end of this function
    // (or until any early return destroys it).
    boost::shared_ptr<GPUTimer> process_timer;
    if( output_timing_ )
      process_timer = boost::shared_ptr<GPUTimer>( new GPUTimer("gpuCgSpiritGadget::process()") );

    if (!is_configured_) {
      GDEBUG("Data received before configuration was completed\n");
      return GADGET_FAIL;
    }

    GenericReconJob* j = m2->getObjectPtr();

    // Some basic validation of the incoming Spirit job
    if (!j->csm_host_.get() || !j->dat_host_.get() || !j->tra_host_.get() || !j->dcw_host_.get() || !j->reg_host_.get()) {
      GDEBUG("Received an incomplete Spirit job\n");
      return GADGET_FAIL;
    }

    // Guard the divisions below: an empty trajectory would otherwise be an
    // integer division by zero.
    const size_t num_coords = j->tra_host_->get_number_of_elements();
    if( num_coords == 0 ) {
      GDEBUG("Received a Spirit job with an empty trajectory\n");
      return GADGET_FAIL;
    }

    const unsigned int samples = j->dat_host_->get_size(0);
    const unsigned int channels = j->dat_host_->get_size(1);

    if( samples % num_coords ) {
      GDEBUG("Mismatch between number of samples (%u) and number of k-space coordinates (%llu).\nThe first should be a multiplum of the latter.\n",
                    samples, static_cast<unsigned long long>(num_coords));
      return GADGET_FAIL;
    }

    const unsigned int rotations = static_cast<unsigned int>(samples / num_coords);
    const unsigned int frames = static_cast<unsigned int>(j->tra_host_->get_size(1)*rotations);

    // Upload the job's host buffers to the device.
    boost::shared_ptr< cuNDArray<floatd2> > traj(new cuNDArray<floatd2> (j->tra_host_.get()));
    boost::shared_ptr< cuNDArray<float> > dcw(new cuNDArray<float> (j->dcw_host_.get()));
    sqrt_inplace(dcw.get()); //Take square root to use for weighting
    boost::shared_ptr< cuNDArray<float_complext> > csm(new cuNDArray<float_complext> (j->csm_host_.get()));
    boost::shared_ptr< cuNDArray<float_complext> > device_samples(new cuNDArray<float_complext> (j->dat_host_.get()));

    cudaDeviceProp deviceProp;
    if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
      GDEBUG( "Error: unable to query device properties.\n" );
      return GADGET_FAIL;
    }

    const unsigned int warp_size = deviceProp.warpSize;

    matrix_size_ = uint64d2( j->reg_host_->get_size(0), j->reg_host_->get_size(1) );

    // Oversampled matrix size, rounded up to the next multiple of the warp size.
    matrix_size_os_ =
      uint64d2(((static_cast<unsigned int>(std::ceil(matrix_size_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
               ((static_cast<unsigned int>(std::ceil(matrix_size_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);

    if( !matrix_size_reported_ ) {
      // Explicit casts keep the printf-style arguments in sync with the format
      // specifiers regardless of the underlying element type.
      GDEBUG("Matrix size    : [%llu,%llu] \n",
             static_cast<unsigned long long>(matrix_size_[0]), static_cast<unsigned long long>(matrix_size_[1]));
      GDEBUG("Matrix size OS : [%llu,%llu] \n",
             static_cast<unsigned long long>(matrix_size_os_[0]), static_cast<unsigned long long>(matrix_size_os_[1]));
      matrix_size_reported_ = true;
    }

    // Image dimensions: [x, y, frames, channels]
    std::vector<size_t> image_dims = to_std_vector(matrix_size_);

    image_dims.push_back(frames);
    image_dims.push_back(channels);
    GDEBUG("Number of coils: %u %llu \n", channels, static_cast<unsigned long long>(image_dims.size()));

    E_->set_domain_dimensions(&image_dims);
    E_->set_codomain_dimensions(device_samples->get_dimensions().get());
    E_->set_dcw(dcw);
    E_->setup( matrix_size_, matrix_size_os_, static_cast<float>(kernel_width_) );
    E_->preprocess(traj.get());

    // The device copy made above is not used anywhere else, so hand it to the
    // Spirit operator directly instead of making a second device-side copy.
    S_->set_calibration_kernels(csm);
    S_->set_domain_dimensions(&image_dims);
    S_->set_codomain_dimensions(&image_dims);

    /*
    boost::shared_ptr< cuNDArray<float_complext> > reg_image(new cuNDArray<float_complext> (j->reg_host_.get()));
    R_->compute(reg_image.get());

    // Define preconditioning weights
    boost::shared_ptr< cuNDArray<float> > _precon_weights = sum(abs_square(csm.get()).get(), 2);
    boost::shared_ptr<cuNDArray<float> > R_diag = R_->get();
    *R_diag *= float(kappa_);
    *_precon_weights += *R_diag;
    R_diag.reset();
    reciprocal_sqrt_inplace(_precon_weights.get());	
    boost::shared_ptr< cuNDArray<float_complext> > precon_weights = real_to_complex<float_complext>( _precon_weights.get() );
    _precon_weights.reset();
    D_->set_weights( precon_weights );
    */

    /*{
      static int counter = 0;
      char filename[256];
      sprintf((char*)filename, "_traj_%d.real", counter);
      write_nd_array<floatd2>( traj->to_host().get(), filename );
      sprintf((char*)filename, "_dcw_%d.real", counter);
      write_nd_array<float>( dcw->to_host().get(), filename );
      sprintf((char*)filename, "_csm_%d.cplx", counter);
      write_nd_array<float_complext>( csm->to_host().get(), filename );
      sprintf((char*)filename, "_samples_%d.cplx", counter);
      write_nd_array<float_complext>( device_samples->to_host().get(), filename );
      sprintf((char*)filename, "_reg_%d.cplx", counter);
      write_nd_array<float_complext>( reg_image->to_host().get(), filename );
      counter++; 
      }*/

    // Invoke solver
    //

    boost::shared_ptr< cuNDArray<float_complext> > cgresult;

    {
      // Scoped so the solve timer covers only the solver call.
      boost::shared_ptr<GPUTimer> solve_timer;
      if( output_timing_ )
        solve_timer = boost::shared_ptr<GPUTimer>( new GPUTimer("gpuCgSpiritGadget::solve()") );

      cgresult = cg_.solve(device_samples.get());

      if( output_timing_ )
        solve_timer.reset();
    }

    if (!cgresult.get()) {
      GDEBUG("Iterative_spirit_compute failed\n");
      return GADGET_FAIL;
    }

    /*
      static int counter = 0;
      char filename[256];
      sprintf((char*)filename, "recon_%d.real", counter);
      write_nd_array<float>( abs(cgresult.get())->to_host().get(), filename );
      counter++; 
    */

    // If the recon matrix size exceeds the sequence matrix size then crop
    if( matrix_size_seq_ != matrix_size_ )
      cgresult = crop<float_complext,2>( (matrix_size_-matrix_size_seq_)>>1, matrix_size_seq_, cgresult.get() );

    // Combine coil images
    //

    cgresult = real_to_complex<float_complext>(sqrt(sum(abs_square(cgresult.get()).get(), 3).get()).get()); // RSS
    //cgresult = sum(cgresult.get(), 2);

    // Pass on the reconstructed images
    //
    // NOTE(review): any status returned by put_frames_on_que() is ignored here,
    // as in the original code; its declaration is not visible from this block.

    put_frames_on_que(frames,rotations,j,cgresult.get());
    frame_counter_ += frames;

    if( output_timing_ )
      process_timer.reset();

    m1->release();
    return GADGET_OK;
  }
// Example #2
// 0
  /**
   * Buffers incoming readouts and trajectories and, once the readout flagged
   * as last-in-repetition arrives, grids the buffered data into an image with
   * a 2D NFFT and passes that image down the gadget chain.
   *
   * Noise measurements are dropped. For every other readout, copies of the
   * data (m2) and trajectory (m3) messages are appended to the frame queues.
   *
   * @param m1 Acquisition header; released here on success.
   * @param m2 Readout samples.
   * @param m3 Trajectory (and optionally density compensation) for the readout.
   * @return GADGET_OK on success, GADGET_FAIL if the reconstruction cannot be
   *         performed or the image cannot be queued downstream.
   */
  int NFFT2DGadget::process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader > *m1,        // header
                            GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2,  // data
                            GadgetContainerMessage< hoNDArray<float> > *m3 )                 // traj/dcw
  {    
    // Throw away any noise samples if they have been allowed to pass this far down the chain...
    //
    
    const bool is_noise = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT);
    if (is_noise) { 
      m1->release();
      return GADGET_OK;
    }
    
    // First pass initialization
    //
    // NOTE(review): this branch runs whenever the readout queue is empty. If
    // extract_samples_from_queue() drains the queue, it runs once per
    // repetition and dimensions_ keeps growing by two entries each time; only
    // entries [0] and [1] are read below. Confirm this is intended.
    
    if (frame_readout_queue_->message_count() == 0 ) {      
      samples_per_readout_ = m1->getObjectPtr()->number_of_samples;
      num_coils_ = m1->getObjectPtr()->active_channels;      
      dimensions_.push_back(m1->getObjectPtr()->active_channels);
      dimensions_.push_back(repetitions_);
      num_trajectory_dims_ = m3->getObjectPtr()->get_size(0); // 2 for trajectories only, 3 for both trajectories + dcw
    }

    // Enqueue incoming readouts and trajectories
    //

    frame_readout_queue_->enqueue_tail(duplicate_array(m2));
    frame_traj_queue_->enqueue_tail(duplicate_array(m3));
    
    // If the last readout for a slice has arrived then perform a reconstruction
    //

    const bool is_last_scan_in_repetition = 
      m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_LAST_IN_REPETITION);

    if (is_last_scan_in_repetition) {

      // samples_per_readout_ is used as a divisor below; refuse to reconstruct
      // rather than divide by zero. Checked before any allocation so nothing
      // needs to be cleaned up on this path.
      if (samples_per_readout_ == 0) {
        GDEBUG("Cannot reconstruct: number of samples per readout is zero\n");
        return GADGET_FAIL;
      }

      // Define the image header
      //

      GadgetContainerMessage<ISMRMRD::ImageHeader> *cm1 = 
        new GadgetContainerMessage<ISMRMRD::ImageHeader>();      
      
      GadgetContainerMessage< hoNDArray< std::complex<float> > > *cm2 = 
        new GadgetContainerMessage<hoNDArray< std::complex<float> > >();
      
      cm1->getObjectPtr()->flags = 0;
      cm1->cont(cm2); // cm2 is chained to cm1
    
      cm1->getObjectPtr()->matrix_size[0]     = dimensions_[0];
      cm1->getObjectPtr()->matrix_size[1]     = dimensions_[1];
      cm1->getObjectPtr()->matrix_size[2]     = 1;
      cm1->getObjectPtr()->field_of_view[0]   = field_of_view_[0];
      cm1->getObjectPtr()->field_of_view[1]   = field_of_view_[1];
      cm1->getObjectPtr()->channels           = num_coils_;
      cm1->getObjectPtr()->repetition         = m1->getObjectPtr()->idx.repetition;

      // Copy the geometry of the last acquisition into the image header.
      memcpy(cm1->getObjectPtr()->position,
             m1->getObjectPtr()->position,
             sizeof(float)*3);

      memcpy(cm1->getObjectPtr()->read_dir,
             m1->getObjectPtr()->read_dir,
             sizeof(float)*3);

      memcpy(cm1->getObjectPtr()->phase_dir,
             m1->getObjectPtr()->phase_dir,
             sizeof(float)*3);

      memcpy(cm1->getObjectPtr()->slice_dir,
             m1->getObjectPtr()->slice_dir,
             sizeof(float)*3);

      memcpy(cm1->getObjectPtr()->patient_table_position,
             m1->getObjectPtr()->patient_table_position, sizeof(float)*3);

      cm1->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_CXFLOAT;
      cm1->getObjectPtr()->image_index = 0;
      cm1->getObjectPtr()->image_series_index = 0;

      //
      // Perform reconstruction of repetition
      //
      
      // Get samples for frame
      //

      cuNDArray<float_complext> samples( extract_samples_from_queue( frame_readout_queue_.get()).get() );

      // Get trajectories/dcw for frame
      //
      
      boost::shared_ptr< cuNDArray<floatd2> > traj(new cuNDArray<floatd2>);
      boost::shared_ptr<cuNDArray<float> > dcw(new cuNDArray<float>);

      extract_trajectory_and_dcw_from_queue( frame_traj_queue_.get(), traj.get(), dcw.get() );
      //traj = compute_radial_trajectory_golden_ratio_2d<float>(samples_per_readout_,dimensions_[1],1,0,GR_ORIGINAL);

      // The density compensation weights extracted from the queue are replaced
      // by analytically computed golden-ratio radial weights.
      unsigned int num_profiles = samples.get_number_of_elements()/samples_per_readout_;
      dcw = compute_radial_dcw_golden_ratio_2d<float>(samples_per_readout_,num_profiles,1.0,1.0f/samples_per_readout_/num_profiles,0,GR_ORIGINAL);

      // Create output array
      //

      std::vector<size_t> img_dims(2);
      img_dims[0] = dimensions_[0];
      img_dims[1] = dimensions_[1];
      cm2->getObjectPtr()->create(&img_dims);
      cuNDArray<float_complext> image(&img_dims);
      
      // Initialize plan
      //
      
      const float kernel_width = 5.5f;
      // The second size argument is the image size shifted left by one (i.e. doubled).
      cuNFFT_plan<float,2> plan( from_std_vector<size_t,2>(img_dims), from_std_vector<size_t,2>(img_dims)<<1, kernel_width );
      plan.preprocess( traj.get(), cuNFFT_plan<float,2>::NFFT_PREP_NC2C );
/*
      if( dcw->get_number_of_elements() == 0 ){
        std::vector<size_t> dcw_dims; dcw_dims.push_back(samples_per_readout_);
        hoNDArray<float> host_dcw( dcw_dims );
        for( int i=0; i<(int)dcw_dims[0]; i++ )
          host_dcw.get_data_ptr()[i]=abs(i-(int)dcw_dims[0]/2);
        host_dcw.get_data_ptr()[dcw_dims[0]/2] = 0.25f; // ad hoc value (we do not want a DC component of 0)        
        dcw = expand(&host_dcw, traj->get_number_of_elements()/samples_per_readout_);
      }
*/
      // Gridder
      //
      
      plan.compute( &samples, &image,  
                    (dcw->get_number_of_elements()>0) ? dcw.get() : nullptr,
                    cuNFFT_plan<float,2>::NFFT_BACKWARDS_NC2C );

      // Download to host
      //
      // The output message stores std::complex<float> while the device array
      // uses float_complext; the pointer is reinterpreted exactly as the
      // original C-style cast did.

      image.to_host( reinterpret_cast<hoNDArray<float_complext>*>(cm2->getObjectPtr()) );

      // Pass on data down the gadget chain
      //

      if (this->next()->putq(cm1) < 0) {
        // The message was not accepted downstream, so it is still ours:
        // release the header (and the chained image) to avoid leaking it.
        cm1->release();
        return GADGET_FAIL;
      }
    }

    m1->release();
    return GADGET_OK;
  }
// Example #3
// 0
/**
 * Runs the solver (solver_) for one incoming Sense job and passes the
 * reconstructed frames downstream.
 *
 * Messages whose set/slice do not match this gadget are forwarded untouched.
 * Otherwise the job's host-side buffers are uploaded to the device, the
 * encoding operator (E_) is configured, the regularization image is expanded
 * to the number of frames and installed as prior on PICS_, the solver is run
 * (optionally under a per-device mutex), and the result is cropped to the
 * sequence matrix size when the two differ.
 *
 * @param m1 Image header message; released here on success.
 * @param m2 Reconstruction job carrying the host-side csm/data/trajectory/dcw/reg arrays.
 * @return GADGET_OK on success, GADGET_FAIL on any validation or solver failure,
 *         or the result of putq() when the message is forwarded downstream.
 */
int gpuOsSenseGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<GenericReconJob> *m2)
{
	// Is this data for this gadget's set/slice?
	//
	GDEBUG("Starting gpuOsSenseGadget\n");

	if( m1->getObjectPtr()->set != set_number_ || m1->getObjectPtr()->slice != slice_number_ ) {
		// No, pass it downstream...
		return this->next()->putq(m1);
	}

	//GDEBUG("gpuOsSenseGadget::process\n");
	//GPUTimer timer("gpuOsSenseGadget::process");

	if (!is_configured_) {
		GDEBUG("\nData received before configuration complete\n");
		return GADGET_FAIL;
	}

	GenericReconJob* j = m2->getObjectPtr();

	// Let's first check that this job has the required data...
	// reg_host_ is dereferenced unconditionally below (matrix size and prior),
	// so it is required as well.
	if (!j->csm_host_.get() || !j->dat_host_.get() || !j->tra_host_.get() || !j->dcw_host_.get() || !j->reg_host_.get()) {
		GDEBUG("Received an incomplete Sense job\n");
		return GADGET_FAIL;
	}

	// Guard the divisions below: an empty trajectory would otherwise be an
	// integer division by zero.
	const size_t num_coords = j->tra_host_->get_number_of_elements();
	if( num_coords == 0 ) {
		GDEBUG("Received a Sense job with an empty trajectory\n");
		return GADGET_FAIL;
	}

	const unsigned int samples = j->dat_host_->get_size(0);
	const unsigned int channels = j->dat_host_->get_size(1);

	if( samples % num_coords ) {
		GDEBUG("Mismatch between number of samples (%u) and number of k-space coordinates (%llu).\nThe first should be a multiplum of the latter.\n",
				samples, static_cast<unsigned long long>(num_coords));
		return GADGET_FAIL;
	}

	const unsigned int rotations = static_cast<unsigned int>(samples / num_coords);
	const unsigned int frames = static_cast<unsigned int>(j->tra_host_->get_size(1)*rotations);

	// Upload the job's host buffers to the device.
	boost::shared_ptr< cuNDArray<floatd2> > traj(new cuNDArray<floatd2> (j->tra_host_.get()));
	boost::shared_ptr< cuNDArray<float> > dcw(new cuNDArray<float> (j->dcw_host_.get()));
	sqrt_inplace(dcw.get());
	boost::shared_ptr< cuNDArray<float_complext> > csm(new cuNDArray<float_complext> (j->csm_host_.get()));
	boost::shared_ptr< cuNDArray<float_complext> > device_samples(new cuNDArray<float_complext> (j->dat_host_.get()));


	// Take the reconstruction matrix size from the regularization image.
	// It could be oversampled from the sequence specified size...

	matrix_size_ = uint64d2( j->reg_host_->get_size(0), j->reg_host_->get_size(1) );

	cudaDeviceProp deviceProp;
	if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) {
		GDEBUG( "\nError: unable to query device properties.\n" );
		return GADGET_FAIL;
	}

	const unsigned int warp_size = deviceProp.warpSize;

	// Oversampled matrix size, rounded up to the next multiple of the warp size.
	matrix_size_os_ =
			uint64d2(((static_cast<unsigned int>(std::ceil(matrix_size_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size,
					((static_cast<unsigned int>(std::ceil(matrix_size_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size);

	// Explicit casts keep the printf-style arguments in sync with the format
	// specifiers regardless of the underlying element type.
	GDEBUG("Matrix size    : [%llu,%llu] \n",
			static_cast<unsigned long long>(matrix_size_[0]), static_cast<unsigned long long>(matrix_size_[1]));
	GDEBUG("Matrix size OS : [%llu,%llu] \n",
			static_cast<unsigned long long>(matrix_size_os_[0]), static_cast<unsigned long long>(matrix_size_os_[1]));

	// Image dimensions: [x, y, frames]
	std::vector<size_t> image_dims = to_std_vector(matrix_size_);
	image_dims.push_back(frames);

	E_->set_domain_dimensions(&image_dims);
	E_->set_codomain_dimensions(device_samples->get_dimensions().get());
	E_->set_csm(csm);
	E_->setup( matrix_size_, matrix_size_os_, kernel_width_ );
	E_->preprocess(traj.get());

	// Preconditioning is currently disabled; the unit preconditioning image
	// that used to be allocated and filled here was never handed to the solver.
	//solver_.set_preconditioning_image(precon);

	reg_image_ = boost::shared_ptr< cuNDArray<float_complext> >(new cuNDArray<float_complext>(&image_dims));

	// These operators need their domain/codomain set before being added to the solver
	//

	//E_->set_dcw(dcw);
	GDEBUG("Prepared\n");

	// Expand the average image to the number of frames
	//

	{
		cuNDArray<float_complext> tmp(*j->reg_host_);
		*reg_image_ = expand( tmp, frames );
	}
	PICS_->set_prior(reg_image_);

	// Define preconditioning weights
	//

	//Apply weights
	//*device_samples *= *dcw;

	// Invoke solver
	//

	boost::shared_ptr< cuNDArray<float_complext> > result;
	{
		GDEBUG("Running NLCG solver\n");
		GPUTimer timer("Running NLCG solver");

		// Optionally, allow exclusive (per device) access to the solver
		// This may not matter much in terms of speed, but it can in terms of memory consumption
		//

		if( exclusive_access_ )
			_mutex[device_number_].lock();

		try {
			result = solver_.solve(device_samples.get());
		}
		catch (...) {
			// Never leave the per-device mutex locked if the solver throws.
			if( exclusive_access_ )
				_mutex[device_number_].unlock();
			throw;
		}

		if( exclusive_access_ )
			_mutex[device_number_].unlock();
	}

	// Check the solver output before anything dereferences it.
	if (!result.get()) {
		GDEBUG("\nNon-linear conjugate gradient solver failed\n");
		return GADGET_FAIL;
	}

	// Provide some info about the scaling between the regularization and reconstruction.
	// If it is not close to one, PICCS does not work optimally...
	//

	if( alpha_ > 0.0 ){
		cuNDArray<float_complext> gpureg(j->reg_host_.get());
		// Average of the reconstruction along dimension 2
		boost::shared_ptr< cuNDArray<float_complext> > gpurec = sum(result.get(),2);
		*gpurec /= float(result->get_size(2));
		float scale = abs(dot(gpurec.get(), gpurec.get())/dot(gpurec.get(),&gpureg));
		GDEBUG("Scaling factor between regularization and reconstruction is %f.\n", scale);
	}

	/*
      static int counter = 0;
      char filename[256];
      sprintf((char*)filename, "recon_sb_%d.cplx", counter);
      write_nd_array<float_complext>( sbresult->to_host().get(), filename );
      counter++; */

	// If the recon matrix size exceeds the sequence matrix size then crop
	if( matrix_size_seq_ != matrix_size_ )
		*result = crop<float_complext,2>( (matrix_size_-matrix_size_seq_)>>1, matrix_size_seq_, *result );


	// Now pass on the reconstructed images
	//
	// NOTE(review): any status returned by put_frames_on_que() is ignored here,
	// as in the original code; its declaration is not visible from this block.
	this->put_frames_on_que(frames,rotations,j,result.get(),channels);

	frame_counter_ += frames;
	m1->release();
	return GADGET_OK;
}