void FullModelComposition<Vec,Mat>::compute_values( const std::vector<double>& param_values,
                                                    std::vector<double>& model_output ) const
{
  // Each data set will compute the two outputs
  std::vector<double> local_values(2);

  _model_evaluator->compute_values( param_values, local_values );

  // Now gather the local_values from all evaluated datasets
  // to be able to insert into the model_output vector.
  // First, we gather from processor 0 of each set of workers.
  queso_require_equal_to( model_output.size(), _observations->sizeGlobal() );

  // We can only call this if we're a member of inter_chain_0.
  // By convention, inter0_rank is negative if this processor
  // is not in that communicator.
  if( _comm_handler.get_inter0_rank() >= 0 )
    MPI_Gather( &local_values[0], 2, MPI_DOUBLE,
                &model_output[0], 2, MPI_DOUBLE,
                0, _comm_handler.get_inter_chain_0_comm() );

  // Now broadcast to the rest of the workers on the whole chain.
  // We did the split, so processor 0 on inter_chain_0 is also
  // processor 0 on inter_chain.
  MPI_Bcast( &model_output[0], _observations->sizeGlobal(), MPI_DOUBLE,
             0, _comm_handler.get_inter_chain_comm() );
}
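
// A minimal, self-contained sketch (plain MPI, hypothetical names and group
// count) of the split-communicator pattern assumed above: each worker group
// computes a piece, the group roots form an "inter" communicator (everyone
// else gets MPI_COMM_NULL, the analogue of a negative inter0_rank), results
// are gathered on the inter-communicator root and then broadcast to all.
#include <mpi.h>
#include <vector>

int main( int argc, char** argv )
{
  MPI_Init( &argc, &argv );

  int world_rank, world_size;
  MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
  MPI_Comm_size( MPI_COMM_WORLD, &world_size );

  const int n_groups = 2; // hypothetical: two datasets, two worker groups
  int group = world_rank % n_groups;

  // Split the world into per-dataset worker groups
  MPI_Comm split_comm;
  MPI_Comm_split( MPI_COMM_WORLD, group, world_rank, &split_comm );

  int split_rank;
  MPI_Comm_rank( split_comm, &split_rank );

  // Group roots (split_rank == 0) form the gather communicator; everyone
  // else passes MPI_UNDEFINED and receives MPI_COMM_NULL.
  MPI_Comm inter0_comm;
  MPI_Comm_split( MPI_COMM_WORLD, (split_rank == 0) ? 0 : MPI_UNDEFINED,
                  world_rank, &inter0_comm );

  std::vector<double> local_values(2, static_cast<double>(group));
  std::vector<double> all_values(2*n_groups);

  if( inter0_comm != MPI_COMM_NULL )
    MPI_Gather( &local_values[0], 2, MPI_DOUBLE,
                &all_values[0], 2, MPI_DOUBLE,
                0, inter0_comm );

  // Because of how we split, rank 0 of the first group is also rank 0 of
  // MPI_COMM_WORLD, so it can act as the broadcast root for everyone.
  MPI_Bcast( &all_values[0], 2*n_groups, MPI_DOUBLE, 0, MPI_COMM_WORLD );

  MPI_Finalize();
  return 0;
}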
void weights::write_proto(lbann_data::WeightsData* proto) const {

  // Set proto properties
  proto->Clear();
  proto->set_name(m_name);
  for (const auto& d : get_dims()) {
    proto->mutable_shape()->add_dim(d);
  }
  proto->set_height(get_matrix_height());
  proto->set_width(get_matrix_width());

  // Write weight values to prototext on world master process
  CircMat<El::Device::CPU> values = *m_values; /// @todo What if weights are on GPU?
  values.SetRoot(0); /// @todo What if world master is not process 0?
  if (m_comm->am_world_master()) {
    const auto& local_values = values.LockedMatrix();
    const El::Int height = local_values.Height();
    const El::Int width = local_values.Width();
    /// @todo OpenMP parallelization
    /** @todo Our matrices are column-major while NumPy expects
     *  row-major matrices. This row-wise iteration is fine for
     *  matrices and column vectors, but it can mess up the order of
     *  the weights if a high-dimensional tensor is represented as a
     *  matrix. This is what we need for quantization on convolution
     *  kernel weights.
     */
    for (El::Int i = 0; i < height; ++i) {
      for (El::Int j = 0; j < width; ++j) {
        proto->add_data(local_values(i,j));
      }
    }
  }

}
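
// A minimal sketch (plain C++, hypothetical data) of the layout issue the
// @todo above flags: Elemental stores matrices column-major, so emitting
// entries row by row produces the row-major order NumPy expects for a 2-D
// matrix, but a higher-rank tensor packed into a matrix will not round-trip.
#include <cstdio>
#include <vector>

int main()
{
  const int height = 2, width = 3;

  // Column-major storage: element (i,j) lives at index j*height + i.
  // This buffer holds the matrix [[1,2,3],[4,5,6]].
  std::vector<double> colmajor = { 1, 4,  2, 5,  3, 6 };

  // Row-wise iteration emits 1 2 3 4 5 6, i.e. row-major order
  for (int i = 0; i < height; ++i)
    for (int j = 0; j < width; ++j)
      std::printf("%g ", colmajor[j*height + i]);
  std::printf("\n");
  return 0;
}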
void PetscVector<Real>::localize_to_one (std::vector<Real>& v_local,
                                         const processor_id_type pid) const
{
  this->_restore_array();

  PetscErrorCode ierr=0;
  const PetscInt n  = size();
  const PetscInt nl = local_size();
  PetscScalar *values;

  v_local.resize(n);

  // only one processor
  if (n == nl)
    {
      ierr = VecGetArray (_vec, &values);
      CHKERRABORT(libMesh::COMM_WORLD,ierr);

      for (PetscInt i=0; i<n; i++)
        v_local[i] = static_cast<Real>(values[i]);

      ierr = VecRestoreArray (_vec, &values);
      CHKERRABORT(libMesh::COMM_WORLD,ierr);
    }

  // otherwise multiple processors
  else
    {
      numeric_index_type ioff = this->first_local_index();

      std::vector<Real> local_values (n, 0.);

      {
        ierr = VecGetArray (_vec, &values);
        CHKERRABORT(libMesh::COMM_WORLD,ierr);

        for (PetscInt i=0; i<nl; i++)
          local_values[i+ioff] = static_cast<Real>(values[i]);

        ierr = VecRestoreArray (_vec, &values);
        CHKERRABORT(libMesh::COMM_WORLD,ierr);
      }

      MPI_Reduce (&local_values[0], &v_local[0], n, MPI_REAL, MPI_SUM,
                  pid, libMesh::COMM_WORLD);
    }
}
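
// A minimal sketch (plain MPI, hypothetical sizes) of the zero-pad-and-reduce
// idiom used in the multi-processor branch above: each rank writes its slice
// into a zero-initialized buffer of global length, and a summing MPI_Reduce
// assembles the full vector on the target rank with no index bookkeeping.
#include <mpi.h>
#include <vector>

int main( int argc, char** argv )
{
  MPI_Init( &argc, &argv );

  int rank, size;
  MPI_Comm_rank( MPI_COMM_WORLD, &rank );
  MPI_Comm_size( MPI_COMM_WORLD, &size );

  const int nl   = 2;         // local entries per rank (hypothetical)
  const int n    = nl * size; // global vector length
  const int ioff = rank * nl; // this rank's first global index

  // Zeros everywhere except our own slice; sums of disjoint slices
  // reconstruct the global vector exactly.
  std::vector<double> local_values( n, 0. );
  for (int i = 0; i < nl; i++)
    local_values[i+ioff] = 1. + rank; // stand-in for VecGetArray values

  std::vector<double> v_local( n );
  MPI_Reduce( &local_values[0], &v_local[0], n, MPI_DOUBLE, MPI_SUM,
              0, MPI_COMM_WORLD );

  MPI_Finalize();
  return 0;
}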
void InterpolationSurrogateBuilder<V,M>::build_values()
{
  unsigned int n_begin, n_end;
  this->set_work_bounds( n_begin, n_end );

  // Cache each processor's work, then we only need to do 1 Allgather
  std::vector<unsigned int> local_n(n_end-n_begin);

  // We need to cache (n_end-n_begin) values for each dataset
  std::vector<std::vector<double> > local_values(this->m_data.size());
  for( std::vector<std::vector<double> >::iterator it = local_values.begin();
       it != local_values.end(); ++it )
    it->resize(n_end-n_begin);

  unsigned int count = 0;

  // vector to store current domain value
  V domain_vector(this->get_default_data().get_paramDomain().vectorSpace().zeroVector());

  // vector to store values evaluated at the current domain_vector
  std::vector<double> values(this->m_data.size());

  for( unsigned int n = n_begin; n < n_end; n++ )
    {
      this->set_domain_vector( n, domain_vector );

      this->evaluate_model( domain_vector, values );

      local_n[count] = n;

      for( unsigned int s = 0; s < this->m_data.size(); s++ )
        local_values[s][count] = values[s];

      count += 1;
    }

  /* Sync all the locally computed values between the subenvironments
     so all processes have all the computed values. We need to sync
     values for every data set. */
  for( unsigned int s = 0; s < this->m_data.size(); s++ )
    this->sync_data( local_n, local_values[s], this->m_data.get_dataset(s) );
}
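
// A minimal sketch (plain MPI, hypothetical names and sizes) of what a sync
// step like sync_data() above could look like: each rank contributes the
// values it computed for its contiguous block of indices, and an
// MPI_Allgatherv leaves every rank holding the complete value array.
#include <mpi.h>
#include <vector>

int main( int argc, char** argv )
{
  MPI_Init( &argc, &argv );

  int rank, size;
  MPI_Comm_rank( MPI_COMM_WORLD, &rank );
  MPI_Comm_size( MPI_COMM_WORLD, &size );

  const int n_total = 10; // hypothetical number of grid points

  // Contiguous work bounds, remainder spread over the first ranks
  int base = n_total / size, rem = n_total % size;
  int n_begin = rank * base + (rank < rem ? rank : rem);
  int n_local = base + (rank < rem ? 1 : 0);

  std::vector<double> local_values( n_local );
  for (int i = 0; i < n_local; i++)
    local_values[i] = 2.0 * (n_begin + i); // stand-in for evaluate_model()

  // Every rank needs everyone's counts and displacements
  std::vector<int> counts( size ), displs( size );
  MPI_Allgather( &n_local, 1, MPI_INT, &counts[0], 1, MPI_INT, MPI_COMM_WORLD );
  displs[0] = 0;
  for (int p = 1; p < size; p++)
    displs[p] = displs[p-1] + counts[p-1];

  std::vector<double> all_values( n_total );
  MPI_Allgatherv( &local_values[0], n_local, MPI_DOUBLE,
                  &all_values[0], &counts[0], &displs[0], MPI_DOUBLE,
                  MPI_COMM_WORLD );

  MPI_Finalize();
  return 0;
}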
FullModelComposition<Vec,Mat>::FullModelComposition( int argc, char** argv,
                                                     const QUESO::BaseEnvironment& queso_env,
                                                     const GetPot& model_input )
  : _model(ModelBuilder<Vec,Mat>::build_model(queso_env,model_input)),
    _comm_handler(queso_env.subComm().Comm(),
                  model_input.vector_variable_size("Likelihood/datasets") )
{
  // Grab the datasets we'll be working with
  unsigned int n_datasets = model_input.vector_variable_size("Likelihood/datasets");

  std::vector<std::string> datasets(n_datasets);
  for( unsigned int d = 0; d < n_datasets; d++ )
    {
      datasets[d] = model_input( "Likelihood/datasets", "DIE!", d );
    }

  // This is the dataset the current set of processors is going to work on
  int dataset_index = this->_comm_handler.get_dataset_index();

  // Input for this dataset
  _forward_run_input.reset( new GetPot(datasets[dataset_index]) );

  // Setup data space, 2 datapoints per dataset
  unsigned int n_datapoints = 2*n_datasets;
  QUESO::VectorSpace<Vec,Mat> data_space( queso_env, "data_", n_datapoints, NULL);

  _observations.reset( data_space.newVector() );
  _covariance.reset( data_space.newVector() );

  // Now parse data values and the corresponding covariances.
  // Each processor parses its own dataset,
  // then we'll gather/broadcast to everyone.
  std::vector<double> local_values(2);
  std::vector<double> all_values(n_datapoints);

  // By convention, mass_loss is first, then avg_N
  local_values[0] = (*_forward_run_input)("MassLossLikelihood/data_value", 0.0);
  local_values[1] = (*_forward_run_input)("AverageNLikelihood/data_value", 0.0);

  if( _comm_handler.get_inter0_rank() >= 0 )
    MPI_Gather( &local_values[0], 2, MPI_DOUBLE,
                &all_values[0], 2, MPI_DOUBLE,
                0, _comm_handler.get_inter_chain_0_comm() );

  MPI_Bcast( &all_values[0], n_datapoints, MPI_DOUBLE,
             0, _comm_handler.get_inter_chain_comm() );

  for( unsigned int i = 0; i < n_datapoints; i++ )
    (*_observations)[i] = all_values[i];

  local_values[0] = (*_forward_run_input)("MassLossLikelihood/sigma", -1.0);
  local_values[1] = (*_forward_run_input)("AverageNLikelihood/sigma", -1.0);

  if( _comm_handler.get_inter0_rank() >= 0 )
    MPI_Gather( &local_values[0], 2, MPI_DOUBLE,
                &all_values[0], 2, MPI_DOUBLE,
                0, _comm_handler.get_inter_chain_0_comm() );

  MPI_Bcast( &all_values[0], n_datapoints, MPI_DOUBLE,
             0, _comm_handler.get_inter_chain_comm() );

  for( unsigned int i = 0; i < n_datapoints; i++ )
    (*_covariance)[i] = all_values[i];

  // Now setup the model to be evaluated on this set of processors.
  // We do this last because of the UFO check in GRINS.
  _model_evaluator.reset( new FullModelEvaluator<Vec,Mat>(argc, argv, queso_env,
                                                          *(_forward_run_input.get()),
                                                          _comm_handler.get_split_chain_comm(),
                                                          *(_model.get())) );
}
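
// A minimal sketch of the GetPot sentinel-default idiom used above (the
// filename and header path are hypothetical): defaults that can never be
// valid ("DIE!", -1.0) make a missing input key detectable after parsing
// instead of silently slipping through as a plausible value.
#include "getpot.h"

#include <iostream>
#include <string>

int main()
{
  GetPot input( "forward_run.in" ); // hypothetical input file

  // Scalar with an impossible default: a real sigma must be positive
  double sigma = input( "MassLossLikelihood/sigma", -1.0 );
  if( sigma < 0.0 )
    std::cerr << "Missing MassLossLikelihood/sigma!" << std::endl;

  // Indexed vector entries with a loud string sentinel
  unsigned int n = input.vector_variable_size( "Likelihood/datasets" );
  for( unsigned int d = 0; d < n; d++ )
    {
      std::string dataset = input( "Likelihood/datasets", "DIE!", d );
      std::cout << "dataset " << d << ": " << dataset << std::endl;
    }

  return 0;
}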