int gpuCgSpiritGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<GenericReconJob> *m2) { // Is this data for this gadget's set/slice? // if( m1->getObjectPtr()->set != set_number_ || m1->getObjectPtr()->slice != slice_number_ ) { // No, pass it downstream... return this->next()->putq(m1); } //GDEBUG("gpuCgSpiritGadget::process\n"); boost::shared_ptr<GPUTimer> process_timer; if( output_timing_ ) process_timer = boost::shared_ptr<GPUTimer>( new GPUTimer("gpuCgSpiritGadget::process()") ); if (!is_configured_) { GDEBUG("Data received before configuration was completed\n"); return GADGET_FAIL; } GenericReconJob* j = m2->getObjectPtr(); // Some basic validation of the incoming Spirit job if (!j->csm_host_.get() || !j->dat_host_.get() || !j->tra_host_.get() || !j->dcw_host_.get() || !j->reg_host_.get()) { GDEBUG("Received an incomplete Spirit job\n"); return GADGET_FAIL; } unsigned int samples = j->dat_host_->get_size(0); unsigned int channels = j->dat_host_->get_size(1); unsigned int rotations = samples / j->tra_host_->get_number_of_elements(); unsigned int frames = j->tra_host_->get_size(1)*rotations; if( samples%j->tra_host_->get_number_of_elements() ) { GDEBUG("Mismatch between number of samples (%d) and number of k-space coordinates (%d).\nThe first should be a multiplum of the latter.\n", samples, j->tra_host_->get_number_of_elements()); return GADGET_FAIL; } boost::shared_ptr< cuNDArray<floatd2> > traj(new cuNDArray<floatd2> (j->tra_host_.get())); boost::shared_ptr< cuNDArray<float> > dcw(new cuNDArray<float> (j->dcw_host_.get())); sqrt_inplace(dcw.get()); //Take square root to use for weighting boost::shared_ptr< cuNDArray<float_complext> > csm(new cuNDArray<float_complext> (j->csm_host_.get())); boost::shared_ptr< cuNDArray<float_complext> > device_samples(new cuNDArray<float_complext> (j->dat_host_.get())); cudaDeviceProp deviceProp; if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) { GDEBUG( "Error: unable to query device properties.\n" ); return GADGET_FAIL; } unsigned int warp_size = deviceProp.warpSize; matrix_size_ = uint64d2( j->reg_host_->get_size(0), j->reg_host_->get_size(1) ); matrix_size_os_ = uint64d2(((static_cast<unsigned int>(std::ceil(matrix_size_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size, ((static_cast<unsigned int>(std::ceil(matrix_size_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size); if( !matrix_size_reported_ ) { GDEBUG("Matrix size : [%d,%d] \n", matrix_size_[0], matrix_size_[1]); GDEBUG("Matrix size OS : [%d,%d] \n", matrix_size_os_[0], matrix_size_os_[1]); matrix_size_reported_ = true; } std::vector<size_t> image_dims = to_std_vector(matrix_size_); image_dims.push_back(frames); image_dims.push_back(channels); GDEBUG("Number of coils: %d %d \n",channels,image_dims.size()); E_->set_domain_dimensions(&image_dims); E_->set_codomain_dimensions(device_samples->get_dimensions().get()); E_->set_dcw(dcw); E_->setup( matrix_size_, matrix_size_os_, static_cast<float>(kernel_width_) ); E_->preprocess(traj.get()); boost::shared_ptr< cuNDArray<float_complext> > csm_device( new cuNDArray<float_complext>( csm.get() )); S_->set_calibration_kernels(csm_device); S_->set_domain_dimensions(&image_dims); S_->set_codomain_dimensions(&image_dims); /* boost::shared_ptr< cuNDArray<float_complext> > reg_image(new cuNDArray<float_complext> (j->reg_host_.get())); R_->compute(reg_image.get()); // Define preconditioning weights boost::shared_ptr< cuNDArray<float> > _precon_weights = sum(abs_square(csm.get()).get(), 2); boost::shared_ptr<cuNDArray<float> > R_diag = R_->get(); *R_diag *= float(kappa_); *_precon_weights += *R_diag; R_diag.reset(); reciprocal_sqrt_inplace(_precon_weights.get()); boost::shared_ptr< cuNDArray<float_complext> > precon_weights = real_to_complex<float_complext>( _precon_weights.get() ); _precon_weights.reset(); D_->set_weights( precon_weights ); */ /*{ static int counter = 0; char filename[256]; sprintf((char*)filename, "_traj_%d.real", counter); write_nd_array<floatd2>( traj->to_host().get(), filename ); sprintf((char*)filename, "_dcw_%d.real", counter); write_nd_array<float>( dcw->to_host().get(), filename ); sprintf((char*)filename, "_csm_%d.cplx", counter); write_nd_array<float_complext>( csm->to_host().get(), filename ); sprintf((char*)filename, "_samples_%d.cplx", counter); write_nd_array<float_complext>( device_samples->to_host().get(), filename ); sprintf((char*)filename, "_reg_%d.cplx", counter); write_nd_array<float_complext>( reg_image->to_host().get(), filename ); counter++; }*/ // Invoke solver // boost::shared_ptr< cuNDArray<float_complext> > cgresult; { boost::shared_ptr<GPUTimer> solve_timer; if( output_timing_ ) solve_timer = boost::shared_ptr<GPUTimer>( new GPUTimer("gpuCgSpiritGadget::solve()") ); cgresult = cg_.solve(device_samples.get()); if( output_timing_ ) solve_timer.reset(); } if (!cgresult.get()) { GDEBUG("Iterative_spirit_compute failed\n"); return GADGET_FAIL; } /* static int counter = 0; char filename[256]; sprintf((char*)filename, "recon_%d.real", counter); write_nd_array<float>( abs(cgresult.get())->to_host().get(), filename ); counter++; */ // If the recon matrix size exceeds the sequence matrix size then crop if( matrix_size_seq_ != matrix_size_ ) cgresult = crop<float_complext,2>( (matrix_size_-matrix_size_seq_)>>1, matrix_size_seq_, cgresult.get() ); // Combine coil images // cgresult = real_to_complex<float_complext>(sqrt(sum(abs_square(cgresult.get()).get(), 3).get()).get()); // RSS //cgresult = sum(cgresult.get(), 2); // Pass on the reconstructed images // put_frames_on_que(frames,rotations,j,cgresult.get()); frame_counter_ += frames; if( output_timing_ ) process_timer.reset(); m1->release(); return GADGET_OK; }
int NFFT2DGadget::process(GadgetContainerMessage< ISMRMRD::AcquisitionHeader > *m1, // header GadgetContainerMessage< hoNDArray< std::complex<float> > > *m2, // data GadgetContainerMessage< hoNDArray<float> > *m3 ) // traj/dcw { // Throw away any noise samples if they have been allowed to pass this far down the chain... // bool is_noise = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_IS_NOISE_MEASUREMENT); if (is_noise) { m1->release(); return GADGET_OK; } // First pass initialization // if (frame_readout_queue_->message_count() == 0 ) { samples_per_readout_ = m1->getObjectPtr()->number_of_samples; num_coils_ = m1->getObjectPtr()->active_channels; dimensions_.push_back(m1->getObjectPtr()->active_channels); dimensions_.push_back(repetitions_); num_trajectory_dims_ = m3->getObjectPtr()->get_size(0); // 2 for trajectories only, 3 for both trajectories + dcw } int samples = m1->getObjectPtr()->number_of_samples; int readout = m1->getObjectPtr()->idx.kspace_encode_step_1; int repetition = m1->getObjectPtr()->idx.kspace_encode_step_2; // Enqueue incoming readouts and trajectories // frame_readout_queue_->enqueue_tail(duplicate_array(m2)); frame_traj_queue_->enqueue_tail(duplicate_array(m3)); // If the last readout for a slice has arrived then perform a reconstruction // bool is_last_scan_in_repetition = m1->getObjectPtr()->isFlagSet(ISMRMRD::ISMRMRD_ACQ_LAST_IN_REPETITION); if (is_last_scan_in_repetition) { // Define the image header // GadgetContainerMessage<ISMRMRD::ImageHeader> *cm1 = new GadgetContainerMessage<ISMRMRD::ImageHeader>(); GadgetContainerMessage< hoNDArray< std::complex<float> > > *cm2 = new GadgetContainerMessage<hoNDArray< std::complex<float> > >(); cm1->getObjectPtr()->flags = 0; cm1->cont(cm2); cm1->getObjectPtr()->matrix_size[0] = dimensions_[0]; cm1->getObjectPtr()->matrix_size[1] = dimensions_[1]; cm1->getObjectPtr()->matrix_size[2] = 1; cm1->getObjectPtr()->field_of_view[0] = field_of_view_[0]; cm1->getObjectPtr()->field_of_view[1] = field_of_view_[1]; cm1->getObjectPtr()->channels = num_coils_; cm1->getObjectPtr()->repetition = m1->getObjectPtr()->idx.repetition; memcpy(cm1->getObjectPtr()->position, m1->getObjectPtr()->position, sizeof(float)*3); memcpy(cm1->getObjectPtr()->read_dir, m1->getObjectPtr()->read_dir, sizeof(float)*3); memcpy(cm1->getObjectPtr()->phase_dir, m1->getObjectPtr()->phase_dir, sizeof(float)*3); memcpy(cm1->getObjectPtr()->slice_dir, m1->getObjectPtr()->slice_dir, sizeof(float)*3); memcpy(cm1->getObjectPtr()->patient_table_position, m1->getObjectPtr()->patient_table_position, sizeof(float)*3); cm1->getObjectPtr()->data_type = ISMRMRD::ISMRMRD_CXFLOAT; cm1->getObjectPtr()->image_index = 0; cm1->getObjectPtr()->image_series_index = 0; // // Perform reconstruction of repetition // // Get samples for frame // cuNDArray<float_complext> samples( extract_samples_from_queue( frame_readout_queue_.get()).get() ); // Get trajectories/dcw for frame // boost::shared_ptr< cuNDArray<floatd2> > traj(new cuNDArray<floatd2>); boost::shared_ptr<cuNDArray<float> > dcw(new cuNDArray<float>); extract_trajectory_and_dcw_from_queue( frame_traj_queue_.get(), traj.get(), dcw.get() ); //traj = compute_radial_trajectory_golden_ratio_2d<float>(samples_per_readout_,dimensions_[1],1,0,GR_ORIGINAL); unsigned int num_profiles = samples.get_number_of_elements()/samples_per_readout_; dcw = compute_radial_dcw_golden_ratio_2d<float>(samples_per_readout_,num_profiles,1.0,1.0f/samples_per_readout_/num_profiles,0,GR_ORIGINAL); // Create output array // std::vector<size_t> img_dims(2); img_dims[0] = dimensions_[0]; img_dims[1] = dimensions_[1]; cm2->getObjectPtr()->create(&img_dims); cuNDArray<float_complext> image(&img_dims); // Initialize plan // const float kernel_width = 5.5f; cuNFFT_plan<float,2> plan( from_std_vector<size_t,2>(img_dims), from_std_vector<size_t,2>(img_dims)<<1, kernel_width ); plan.preprocess( traj.get(), cuNFFT_plan<float,2>::NFFT_PREP_NC2C ); /* if( dcw->get_number_of_elements() == 0 ){ std::vector<size_t> dcw_dims; dcw_dims.push_back(samples_per_readout_); hoNDArray<float> host_dcw( dcw_dims ); for( int i=0; i<(int)dcw_dims[0]; i++ ) host_dcw.get_data_ptr()[i]=abs(i-(int)dcw_dims[0]/2); host_dcw.get_data_ptr()[dcw_dims[0]/2] = 0.25f; // ad hoc value (we do not want a DC component of 0) dcw = expand(&host_dcw, traj->get_number_of_elements()/samples_per_readout_); } */ // Gridder // plan.compute( &samples, &image, (dcw->get_number_of_elements()>0) ? dcw.get() : 0x0, cuNFFT_plan<float,2>::NFFT_BACKWARDS_NC2C ); // Download to host // image.to_host( (hoNDArray<float_complext>*)cm2->getObjectPtr() ); // Pass on data down the gadget chain // if (this->next()->putq(cm1) < 0) { return GADGET_FAIL; } } m1->release(); return GADGET_OK; }
int gpuOsSenseGadget::process(GadgetContainerMessage<ISMRMRD::ImageHeader> *m1, GadgetContainerMessage<GenericReconJob> *m2) { // Is this data for this gadget's set/slice? // GDEBUG("Starting gpuOsSenseGadget\n"); if( m1->getObjectPtr()->set != set_number_ || m1->getObjectPtr()->slice != slice_number_ ) { // No, pass it downstream... return this->next()->putq(m1); } //GDEBUG("gpuOsSenseGadget::process\n"); //GPUTimer timer("gpuOsSenseGadget::process"); if (!is_configured_) { GDEBUG("\nData received before configuration complete\n"); return GADGET_FAIL; } GenericReconJob* j = m2->getObjectPtr(); // Let's first check that this job has the required data... if (!j->csm_host_.get() || !j->dat_host_.get() || !j->tra_host_.get() || !j->dcw_host_.get()) { GDEBUG("Received an incomplete Sense job\n"); return GADGET_FAIL; } unsigned int samples = j->dat_host_->get_size(0); unsigned int channels = j->dat_host_->get_size(1); unsigned int rotations = samples / j->tra_host_->get_number_of_elements(); unsigned int frames = j->tra_host_->get_size(1)*rotations; if( samples%j->tra_host_->get_number_of_elements() ) { GDEBUG("Mismatch between number of samples (%d) and number of k-space coordinates (%d).\nThe first should be a multiplum of the latter.\n", samples, j->tra_host_->get_number_of_elements()); return GADGET_FAIL; } boost::shared_ptr< cuNDArray<floatd2> > traj(new cuNDArray<floatd2> (j->tra_host_.get())); boost::shared_ptr< cuNDArray<float> > dcw(new cuNDArray<float> (j->dcw_host_.get())); sqrt_inplace(dcw.get()); boost::shared_ptr< cuNDArray<float_complext> > csm(new cuNDArray<float_complext> (j->csm_host_.get())); boost::shared_ptr< cuNDArray<float_complext> > device_samples(new cuNDArray<float_complext> (j->dat_host_.get())); // Take the reconstruction matrix size from the regulariaztion image. // It could be oversampled from the sequence specified size... matrix_size_ = uint64d2( j->reg_host_->get_size(0), j->reg_host_->get_size(1) ); cudaDeviceProp deviceProp; if( cudaGetDeviceProperties( &deviceProp, device_number_ ) != cudaSuccess) { GDEBUG( "\nError: unable to query device properties.\n" ); return GADGET_FAIL; } unsigned int warp_size = deviceProp.warpSize; matrix_size_os_ = uint64d2(((static_cast<unsigned int>(std::ceil(matrix_size_[0]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size, ((static_cast<unsigned int>(std::ceil(matrix_size_[1]*oversampling_factor_))+warp_size-1)/warp_size)*warp_size); GDEBUG("Matrix size : [%d,%d] \n", matrix_size_[0], matrix_size_[1]); GDEBUG("Matrix size OS : [%d,%d] \n", matrix_size_os_[0], matrix_size_os_[1]); std::vector<size_t> image_dims = to_std_vector(matrix_size_); image_dims.push_back(frames); E_->set_domain_dimensions(&image_dims); E_->set_codomain_dimensions(device_samples->get_dimensions().get()); E_->set_csm(csm); E_->setup( matrix_size_, matrix_size_os_, kernel_width_ ); E_->preprocess(traj.get()); { auto precon = boost::make_shared<cuNDArray<float_complext>>(image_dims); fill(precon.get(),float_complext(1.0f)); //solver_.set_preconditioning_image(precon); } reg_image_ = boost::shared_ptr< cuNDArray<float_complext> >(new cuNDArray<float_complext>(&image_dims)); // These operators need their domain/codomain set before being added to the solver // //E_->set_dcw(dcw); GDEBUG("Prepared\n"); // Expand the average image to the number of frames // { cuNDArray<float_complext> tmp(*j->reg_host_); *reg_image_ = expand( tmp, frames ); } PICS_->set_prior(reg_image_); // Define preconditioning weights // //Apply weights //*device_samples *= *dcw; // Invoke solver // boost::shared_ptr< cuNDArray<float_complext> > result; { GDEBUG("Running NLCG solver\n"); GPUTimer timer("Running NLCG solver"); // Optionally, allow exclusive (per device) access to the solver // This may not matter much in terms of speed, but it can in terms of memory consumption // if( exclusive_access_ ) _mutex[device_number_].lock(); result = solver_.solve(device_samples.get()); if( exclusive_access_ ) _mutex[device_number_].unlock(); } // Provide some info about the scaling between the regularization and reconstruction. // If it is not close to one, PICCS does not work optimally... // if( alpha_ > 0.0 ){ cuNDArray<float_complext> gpureg(j->reg_host_.get()); boost::shared_ptr< cuNDArray<float_complext> > gpurec = sum(result.get(),2); *gpurec /= float(result->get_size(2)); float scale = abs(dot(gpurec.get(), gpurec.get())/dot(gpurec.get(),&gpureg)); GDEBUG("Scaling factor between regularization and reconstruction is %f.\n", scale); } if (!result.get()) { GDEBUG("\nNon-linear conjugate gradient solver failed\n"); return GADGET_FAIL; } /* static int counter = 0; char filename[256]; sprintf((char*)filename, "recon_sb_%d.cplx", counter); write_nd_array<float_complext>( sbresult->to_host().get(), filename ); counter++; */ // If the recon matrix size exceeds the sequence matrix size then crop if( matrix_size_seq_ != matrix_size_ ) *result = crop<float_complext,2>( (matrix_size_-matrix_size_seq_)>>1, matrix_size_seq_, *result ); // Now pass on the reconstructed images // this->put_frames_on_que(frames,rotations,j,result.get(),channels); frame_counter_ += frames; m1->release(); return GADGET_OK; }