void sort(Master& master, //!< master object const Assigner& assigner, //!< assigner object std::vector<T> Block::* values, //!< all values to sort std::vector<T> Block::* samples, //!< (output) boundaries of blocks size_t num_samples, //!< desired number of samples const Cmp& cmp, //!< comparison function int k = 2, //!< k-ary reduction will be used bool samples_only = false) //!< false: results will be all_to_all exchanged; true: only sort but don't exchange results { bool immediate = master.immediate(); master.set_immediate(false); // NB: although sorter will go out of scope, its member functions sample() // and exchange() will return functors whose copies get saved inside reduce detail::SampleSort<Block,T,Cmp> sorter(values, samples, cmp, num_samples); // swap-reduce to all-gather samples RegularDecomposer<DiscreteBounds> decomposer(1, interval(0,assigner.nblocks()), assigner.nblocks()); RegularSwapPartners partners(decomposer, k); reduce(master, assigner, partners, sorter.sample(), detail::SkipIntermediate(partners.rounds())); // all_to_all to exchange the values if (!samples_only) all_to_all(master, assigner, sorter.exchange(), k); master.set_immediate(immediate); }
void CR_Matrix::multiply( const double * const x , double * const y ) const { if ( 1 < m_comm_size ) { std::vector<double> x_work( m_work_disp[ m_comm_size ] ); { // Remote gathered portion: std::vector<double> x_send( m_send_disp[ m_comm_size ] ); for ( unsigned i = 0 ; i < m_send_map.size() ; ++i ) { x_send[i] = x[ m_send_map[i] ]; } // Skips local-to-local all_to_all( m_comm , PARALLEL_DATATYPE_DOUBLE , m_sparse , & x_send[0] , & m_send_disp[0] , & x_work[0] , & m_work_disp[0] ); } { // Local portion: double * d = & x_work[0] + m_work_disp[ m_comm_rank ]; const double * s = x ; const double * const e = x + m_row_size ; while ( e != s ) { *d++ = *s++ ; } } txblas_cr_mxv( m_row_size , & m_prefix[0] , & m_coli[0] , & m_coef[0] , & x_work[0] , y ); } else { txblas_cr_mxv( m_row_size , & m_prefix[0] , & m_coli[0] , & m_coef[0] , x , y ); } }
// Construct a compressed-row matrix from this process' rows.
//
//   arg_comm      : parallel machine the matrix is distributed over.
//   arg_partition : row partition; must have (comm_size + 1) entries when
//                   running on more than one process.  Process 'p' owns
//                   global rows [ arg_partition[p] , arg_partition[p+1] ).
//   arg_prefix    : row prefix array; arg_prefix.back() must equal the
//                   number of column indices and of coefficients.
//   arg_coli      : global column index of each coefficient.
//   arg_coef      : coefficients.
//
// The contents of arg_prefix, arg_coli and arg_coef are swapped into the
// matrix (the arguments receive the matrix' previous, empty, contents).
// An empty arg_prefix leaves the matrix empty.
//
// On more than one process the column indices are rewritten from global
// identifiers to offsets into the work vector used by multiply(), which
// is laid out as [ lower_row_recv , local_row , upper_row_recv ].
//
// Throws std::invalid_argument for inconsistent argument sizes,
// std::logic_error if a locally owned column is counted as off-processor,
// and std::runtime_error if another process requests a row that this
// process does not own.
CR_Matrix::CR_Matrix(
  ParallelMachine arg_comm ,
  const std::vector<unsigned> & arg_partition ,
        std::vector<unsigned> & arg_prefix ,
        std::vector<unsigned> & arg_coli ,
        std::vector<double>   & arg_coef )
  : m_comm( arg_comm ),
    m_comm_size( parallel_machine_size( arg_comm ) ),
    m_comm_rank( parallel_machine_rank( arg_comm ) ),
    m_sparse( false ),
    m_work_disp(),
    m_send_disp(),
    m_send_map(),
    m_row_size( 0 ),
    m_prefix(),
    m_coli(),
    m_coef()
{
  static const char method[] = "phdmesh::CR_Matrix::CR_Matrix" ;

  if ( arg_prefix.empty() ) { return ; }

  //------------------------------------
  // The last prefix entry is the total number of stored entries.

  if ( arg_coli.size() != arg_prefix.back() ||
       arg_coef.size() != arg_prefix.back() ) {
    std::ostringstream msg ;
    msg << method << " ERROR" ;
    msg << " arg_coli.size() = " << arg_coli.size() ;
    msg << " arg_coef.size() = " << arg_coef.size() ;
    msg << " != arg_prefix.back() = " << arg_prefix.back() ;
    throw std::invalid_argument( msg.str() );
  }

  swap( m_prefix , arg_prefix );
  swap( m_coli , arg_coli );
  swap( m_coef , arg_coef );

  m_row_size = m_prefix.size() - 1 ;

  // Single process: global column identifiers are already local offsets.
  if ( 1 == m_comm_size ) { return ; }

  //------------------------------------

  if ( arg_partition.size() != 1 + m_comm_size ) {
    std::ostringstream msg ;
    msg << method << " ERROR" ;
    msg << " comm_size = " << m_comm_size ;
    msg << " + 1 != arg_partition.size() = " << arg_partition.size() ;
    throw std::invalid_argument( msg.str() );
  }

  const unsigned row_first = arg_partition[ m_comm_rank ];
  const unsigned row_end   = arg_partition[ m_comm_rank + 1 ] ;

  if ( m_row_size != ( row_end - row_first ) ) {
    std::ostringstream msg ;
    msg << method << " ERROR" ;
    msg << " arg_prefix'row_size = " << m_row_size ;
    msg << " != arg_partition'row_size = " << ( row_end - row_first );
    throw std::invalid_argument( msg.str() );
  }

  //------------------------------------

  m_send_disp.resize( m_comm_size + 1 );
  m_work_disp.resize( m_comm_size + 1 );

  // Generate a vector of off-processor column identifiers

  std::vector<unsigned> work_col_ident ;

  {
    const std::vector<unsigned>::iterator j = m_coli.end();
          std::vector<unsigned>::iterator i ;

    for ( i = m_coli.begin() ; j != i ; ++i ) {
      const unsigned global_col = *i ;
      if ( global_col < row_first || row_end <= global_col ) {
        ordered_insert( work_col_ident , global_col );
      }
    }
  }

  //------------------------------------
  // Map column global identifiers to local work offsets.
  //
  // The work vector is [ lower_row_recv , local_row , upper_row_recv ]:
  //   lower off-processor column -> its position in 'work_col_ident'
  //   locally owned column       -> lower_count + ( column - row_first )
  //   upper off-processor column -> its position in 'work_col_ident'
  //                                 + m_row_size
  //
  // 'work_col_ident' holds no locally owned identifiers, so a search for
  // a local column always lands on 'lower_count'; the offset within the
  // local rows has to be added explicitly, and the upper columns have to
  // be shifted past the whole local block, not past the lower columns.

  {
    const std::vector<unsigned>::iterator b = work_col_ident.begin();
    const std::vector<unsigned>::iterator e = work_col_ident.end();

    // Number of off-processor columns that precede the local rows.
    const unsigned lower_count = std::lower_bound( b , e , row_first ) - b ;

    for ( std::vector<unsigned>::iterator
          i = m_coli.begin() ; i != m_coli.end() ; ++i ) {
      const unsigned global_col = *i ;

      unsigned local_col = 0 ;

      if ( row_first <= global_col && global_col < row_end ) {
        local_col = lower_count + ( global_col - row_first );
      }
      else {
        local_col = std::lower_bound( b , e , global_col ) - b ;

        if ( row_end <= global_col ) { local_col += m_row_size ; }
      }

      *i = local_col ;
    }
  }

  //------------------------------------
  // Displacement prefix for work vector:
  // count the off-processor columns owned by each process.

  {
    std::vector<unsigned>::const_iterator i = work_col_ident.begin() ;

    m_work_disp[0] = 0 ;

    for ( unsigned p = 0 ; p < m_comm_size ; ++p ) {
      const unsigned p_row_end = arg_partition[p+1] ;
      unsigned count = 0 ;
      for ( ; i != work_col_ident.end() && *i < p_row_end ; ++i ) {
        ++count ;
      }
      m_work_disp[p+1] = m_work_disp[p] + count ;
    }
  }

  //------------------------------------
  // Set up communications to gather work subvector

  {
    std::vector<unsigned> send_col_size( m_comm_size );
    std::vector<unsigned> recv_col_size( m_comm_size );

    for ( unsigned p = 0 ; p < m_comm_size ; ++p ) {
      send_col_size[p] = m_work_disp[p+1] - m_work_disp[p] ;
    }

    // Locally owned columns were excluded from 'work_col_ident',
    // so nothing may be requested from this process itself.
    if ( send_col_size[ m_comm_rank ] ) {
      std::ostringstream msg ;
      msg << method << " ERROR with communication sizing logic" ;
      throw std::logic_error( msg.str() );
    }

    unsigned num_msg_maximum = 0 ;

    comm_sizes( m_comm , m_comm_size / 4 , num_msg_maximum ,
                & send_col_size[0] , & recv_col_size[0] );

    m_sparse = num_msg_maximum < ( m_comm_size / 4 );

    m_send_disp[0] = 0 ;
    for ( unsigned p = 0 ; p < m_comm_size ; ++p ) {
      m_send_disp[p+1] = m_send_disp[p] + recv_col_size[p] ;
    }
  }

  // Exchange the requested global column identifiers: afterwards
  // 'm_send_map' lists the rows other processes need from this one.

  const unsigned send_map_size = m_send_disp[ m_comm_size ];

  m_send_map.resize( send_map_size );

  all_to_all( m_comm , PARALLEL_DATATYPE_UNSIGNED , m_sparse ,
              & work_col_ident[0] , & m_work_disp[0],
              & m_send_map[0] ,     & m_send_disp[0] );

  //------------------------------------
  // Remap the 'm_work_disp' for receiving coefficients into the
  // work vector: [ lower_row_recv , local_row , upper_row_recv ]

  for ( unsigned p = m_comm_rank ; p < m_comm_size ; ++p ) {
    m_work_disp[p+1] += m_row_size ;
  }

  //------------------------------------
  // Map the send_map from global to local indices,
  // also sanity check it.

  for ( unsigned i = 0 ; i < send_map_size ; ++i ) {

    if ( m_send_map[i] < static_cast<int>( row_first ) ||
         static_cast<int>( row_end ) <= m_send_map[i] ) {
      std::ostringstream msg ;
      msg << method << " ERROR Received index " ;
      msg << m_send_map[i] ;
      msg << " out of range [ " ;
      msg << row_first ;
      msg << " : " ;
      msg << row_end ;
      msg << " )" ;
      throw std::runtime_error( msg.str() );
    }

    m_send_map[i] -= row_first ;
  }
}
void all_to_all_impl(const communicator& comm, const T* in_values, int n, T* out_values, mpl::false_) { int size = comm.size(); int rank = comm.rank(); // The amount of data to be sent to each process std::vector<int> send_sizes(size); // The displacements for each outgoing value. std::vector<int> send_disps(size); // The buffer that will store all of the outgoing values std::vector<char, allocator<char> > outgoing; // Pack the buffer with all of the outgoing values. for (int dest = 0; dest < size; ++dest) { // Keep track of the displacements send_disps[dest] = outgoing.size(); // Our own value will never be transmitted, so don't pack it. if (dest != rank) { packed_oarchive oa(comm, outgoing); for (int i = 0; i < n; ++i) oa << in_values[dest * n + i]; } // Keep track of the sizes send_sizes[dest] = outgoing.size() - send_disps[dest]; } // Determine how much data each process will receive. std::vector<int> recv_sizes(size); all_to_all(comm, send_sizes, recv_sizes); // Prepare a buffer to receive the incoming data. std::vector<int> recv_disps(size); int sum = 0; for (int src = 0; src < size; ++src) { recv_disps[src] = sum; sum += recv_sizes[src]; } std::vector<char, allocator<char> > incoming(sum > 0? sum : 1); // Make sure we don't try to reference an empty vector if (outgoing.empty()) outgoing.push_back(0); // Transmit the actual data BOOST_MPI_CHECK_RESULT(MPI_Alltoallv, (&outgoing[0], &send_sizes[0], &send_disps[0], MPI_PACKED, &incoming[0], &recv_sizes[0], &recv_disps[0], MPI_PACKED, comm)); // Deserialize data from the iarchive for (int src = 0; src < size; ++src) { if (src == rank) std::copy(in_values + src * n, in_values + (src + 1) * n, out_values + src * n); else { packed_iarchive ia(comm, incoming, boost::archive::no_header, recv_disps[src]); for (int i = 0; i < n; ++i) ia >> out_values[src * n + i]; } } }