template <class T>
inline void MyMPI_ExchangeTable (TABLE<T> & send_data,
                                 TABLE<T> & recv_data, int tag,
                                 MPI_Comm comm = MPI_COMM_WORLD)
{
  int ntasks, rank;
  MPI_Comm_size(comm, &ntasks);
  MPI_Comm_rank(comm, &rank);

  Array<int> send_sizes(ntasks);
  Array<int> recv_sizes(ntasks);
  for (int i = 0; i < ntasks; i++)
    send_sizes[i] = send_data[i].Size();

  MPI_Alltoall (&send_sizes[0], 1, MPI_INT,
                &recv_sizes[0], 1, MPI_INT, comm);
  // in-place is buggy !
  // MPI_Alltoall (MPI_IN_PLACE, 1, MPI_INT,
  //               &recv_sizes[0], 1, MPI_INT, comm);

  for (int i = 0; i < ntasks; i++)
    recv_data.SetEntrySize (i, recv_sizes[i], sizeof(T));

  Array<MPI_Request> requests;
  for (int dest = 0; dest < ntasks; dest++)
    if (dest != rank && send_data[dest].Size())
      requests.Append (MyMPI_ISend (send_data[dest], dest, tag, comm));

  for (int dest = 0; dest < ntasks; dest++)
    if (dest != rank && recv_data[dest].Size())
      requests.Append (MyMPI_IRecv (recv_data[dest], dest, tag, comm));

  // MPI_Barrier (comm);
  MPI_Waitall (requests.Size(), &requests[0], MPI_STATUSES_IGNORE);
}
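The same size-exchange idiom in a minimal, self-contained sketch using plain MPI and std::vector rather than Netgen's TABLE/Array types (function and parameter names here are illustrative, not part of any library): exchange per-destination counts with MPI_Alltoall, then post nonblocking sends and receives for the payloads.

#include <mpi.h>
#include <vector>

// Sketch only: exchange variable-sized int payloads between all ranks.
// Assumes MPI has already been initialized by the caller.
void exchange_vectors(std::vector<std::vector<int> >& send_data,
                      std::vector<std::vector<int> >& recv_data,
                      int tag, MPI_Comm comm)
{
  int ntasks, rank;
  MPI_Comm_size(comm, &ntasks);
  MPI_Comm_rank(comm, &rank);

  // 1) Exchange the per-destination message sizes.
  std::vector<int> send_sizes(ntasks), recv_sizes(ntasks);
  for (int i = 0; i < ntasks; i++)
    send_sizes[i] = static_cast<int>(send_data[i].size());
  MPI_Alltoall(send_sizes.data(), 1, MPI_INT,
               recv_sizes.data(), 1, MPI_INT, comm);

  // 2) Size the receive buffers and post nonblocking transfers.
  recv_data.assign(ntasks, std::vector<int>());
  std::vector<MPI_Request> requests;
  for (int p = 0; p < ntasks; p++) {
    if (p == rank) continue;
    if (recv_sizes[p] > 0) {
      recv_data[p].resize(recv_sizes[p]);
      requests.emplace_back();
      MPI_Irecv(recv_data[p].data(), recv_sizes[p], MPI_INT, p, tag, comm, &requests.back());
    }
    if (!send_data[p].empty()) {
      requests.emplace_back();
      MPI_Isend(send_data[p].data(), (int)send_data[p].size(), MPI_INT, p, tag, comm, &requests.back());
    }
  }
  if (!requests.empty())
    MPI_Waitall((int)requests.size(), requests.data(), MPI_STATUSES_IGNORE);
}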
template<typename T>
void gather(const T& elem, std::vector<T>& results) {
#ifdef HAS_MPI
  // Get the mpi rank and size
  size_t mpi_size(size());
  int mpi_rank(rank());
  if(results.size() != mpi_size) results.resize(mpi_size);

  // Serialize the local map
  graphlab::charstream cstrm(128);
  graphlab::oarchive oarc(cstrm);
  oarc << elem;
  cstrm.flush();
  char* send_buffer = cstrm->c_str();
  int send_buffer_size = cstrm->size();
  assert(send_buffer_size >= 0);

  // Compute the sizes
  std::vector<int> recv_sizes(mpi_size, -1);
  int error = MPI_Gather(&send_buffer_size,  // send buffer
                         1,                  // send count
                         MPI_INT,            // send type
                         &(recv_sizes[0]),   // recv buffer
                         1,                  // recv count
                         MPI_INT,            // recv type
                         mpi_rank,           // root rank
                         MPI_COMM_WORLD);
  assert(error == MPI_SUCCESS);
  for(size_t i = 0; i < recv_sizes.size(); ++i)
    assert(recv_sizes[i] >= 0);

  // Construct offsets
  std::vector<int> recv_offsets(recv_sizes);
  int sum = 0, tmp = 0;
  for(size_t i = 0; i < recv_offsets.size(); ++i) {
    tmp = recv_offsets[i];
    recv_offsets[i] = sum;
    sum += tmp;
  }

  // if necessary realloc recv_buffer
  std::vector<char> recv_buffer(sum);

  // recv all the maps
  error = MPI_Gatherv(send_buffer,        // send buffer
                      send_buffer_size,   // how much to send
                      MPI_BYTE,           // send type
                      &(recv_buffer[0]),  // recv buffer
                      &(recv_sizes[0]),   // amount to recv from each process
                      &(recv_offsets[0]), // where to place data
                      MPI_BYTE,
                      mpi_rank,           // root rank
                      MPI_COMM_WORLD);
  assert(error == MPI_SUCCESS);

  // Update the local map
  namespace bio = boost::iostreams;
  typedef bio::stream<bio::array_source> icharstream;
  icharstream strm(&(recv_buffer[0]), recv_buffer.size());
  graphlab::iarchive iarc(strm);
  for(size_t i = 0; i < results.size(); ++i) {
    iarc >> results[i];
  }
#else
  logstream(LOG_FATAL) << "MPI not installed!" << std::endl;
#endif
} // end of gather
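The two-phase pattern above (gather the per-rank byte counts, then MPI_Gatherv the serialized payloads) in a minimal stand-alone form, with rank 0 fixed as the root and raw strings instead of GraphLab's archive types. This is an illustrative sketch under those assumptions, not GraphLab's implementation.

#include <mpi.h>
#include <string>
#include <vector>

// Gather one variable-length string from every rank onto rank 0.
// Non-root ranks get an empty result vector.
std::vector<std::string> gather_strings(const std::string& local, MPI_Comm comm) {
  int rank, size;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);

  // Phase 1: gather the per-rank byte counts at the root.
  int my_size = static_cast<int>(local.size());
  std::vector<int> recv_sizes(size, 0);
  MPI_Gather(&my_size, 1, MPI_INT, recv_sizes.data(), 1, MPI_INT, 0, comm);

  // Phase 2: compute displacements and gather the bytes.
  std::vector<int> recv_offsets(size, 0);
  int total = 0;
  for (int i = 0; i < size; ++i) { recv_offsets[i] = total; total += recv_sizes[i]; }
  std::vector<char> recv_buffer(total > 0 ? total : 1);
  MPI_Gatherv(local.data(), my_size, MPI_BYTE,
              recv_buffer.data(), recv_sizes.data(), recv_offsets.data(), MPI_BYTE,
              0, comm);

  std::vector<std::string> results;
  if (rank == 0)
    for (int i = 0; i < size; ++i)
      results.push_back(std::string(recv_buffer.data() + recv_offsets[i], recv_sizes[i]));
  return results;
}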
//! @name Constructor/Destructor
//@{
AMGXOperator(const Teuchos::RCP<Tpetra::CrsMatrix<SC,LO,GO,NO> > &inA, Teuchos::ParameterList &paramListIn) {
  RCP<const Teuchos::Comm<int> > comm = inA->getRowMap()->getComm();
  int numProcs = comm->getSize();
  int myRank   = comm->getRank();

  RCP<Teuchos::Time> amgxTimer = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: initialize");
  amgxTimer->start();
  // Initialize
  AMGX_SAFE_CALL(AMGX_initialize());
  AMGX_SAFE_CALL(AMGX_initialize_plugins());

  /*system*/
  //AMGX_SAFE_CALL(AMGX_register_print_callback(&print_callback));
  AMGX_SAFE_CALL(AMGX_install_signal_handler());

  Teuchos::ParameterList configs = paramListIn.sublist("amgx:params", true);
  if (configs.isParameter("json file")) {
    AMGX_SAFE_CALL(AMGX_config_create_from_file(&Config_, (const char *) &configs.get<std::string>("json file")[0]));
  } else {
    std::ostringstream oss;
    oss << "";
    ParameterList::ConstIterator itr;
    for (itr = configs.begin(); itr != configs.end(); ++itr) {
      const std::string&    name  = configs.name(itr);
      const ParameterEntry& entry = configs.entry(itr);
      oss << name << "=" << filterValueToString(entry) << ", ";
    }
    oss << "\0";
    std::string configString = oss.str();
    if (configString == "") {
      //print msg that using defaults
      //GetOStream(Warnings0) << "Warning: No configuration parameters specified, using default AMGX configuration parameters. \n";
    }
    AMGX_SAFE_CALL(AMGX_config_create(&Config_, configString.c_str()));
  }

  // TODO: we probably need to add "exception_handling=1" to the parameter list
  // to switch on internal error handling (with no need for AMGX_SAFE_CALL)

#define NEW_COMM
#ifdef NEW_COMM
  // NOTE: MPI communicator used in AMGX_resources_create must exist in the scope of AMGX_matrix_comm_from_maps_one_ring
  // FIXME: fix for serial comm
  RCP<const Teuchos::MpiComm<int> > tmpic = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm->duplicate());
  TEUCHOS_TEST_FOR_EXCEPTION(tmpic.is_null(), Exceptions::RuntimeError, "Communicator is not MpiComm");

  RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm();
  MPI_Comm mpiComm = *rawMpiComm;
#endif

  // Construct AMGX resources
  if (numProcs == 1) {
    AMGX_resources_create_simple(&Resources_, Config_);

  } else {
    int numGPUDevices;
    cudaGetDeviceCount(&numGPUDevices);
    int device[] = {(comm->getRank() % numGPUDevices)};

    AMGX_config_add_parameters(&Config_, "communicator=MPI");
#ifdef NEW_COMM
    AMGX_resources_create(&Resources_, Config_, &mpiComm, 1/* number of GPU devices utilized by this rank */, device);
#else
    AMGX_resources_create(&Resources_, Config_, MPI_COMM_WORLD, 1/* number of GPU devices utilized by this rank */, device);
#endif
  }

  AMGX_Mode mode = AMGX_mode_dDDI;
  AMGX_solver_create(&Solver_, Resources_, mode, Config_);
  AMGX_matrix_create(&A_, Resources_, mode);
  AMGX_vector_create(&X_, Resources_, mode);
  AMGX_vector_create(&Y_, Resources_, mode);

  amgxTimer->stop();
  amgxTimer->incrementNumCalls();

  std::vector<int> amgx2muelu;

  // Construct AMGX communication pattern
  if (numProcs > 1) {
    RCP<const Tpetra::Import<LO,GO> > importer = inA->getCrsGraph()->getImporter();

    TEUCHOS_TEST_FOR_EXCEPTION(importer.is_null(), MueLu::Exceptions::RuntimeError, "The matrix A has no Import object.");

    Tpetra::Distributor distributor = importer->getDistributor();

    Array<int> sendRanks = distributor.getImagesTo();
    Array<int> recvRanks = distributor.getImagesFrom();

    std::sort(sendRanks.begin(), sendRanks.end());
    std::sort(recvRanks.begin(), recvRanks.end());

    bool match = true;
    if (sendRanks.size() != recvRanks.size()) {
      match = false;
    } else {
      for (int i = 0; i < sendRanks.size(); i++)
        if (recvRanks[i] != sendRanks[i]) {
          match = false;
          break;
        }
    }
    TEUCHOS_TEST_FOR_EXCEPTION(!match, MueLu::Exceptions::RuntimeError,
                               "AMGX requires that the processors that we send to and receive from are the same. "
                               "This is not the case: we send to {" << sendRanks << "} and receive from {" << recvRanks << "}");

    int num_neighbors = sendRanks.size();  // does not include the calling process
    const int* neighbors = &sendRanks[0];

    // Later on, we'll have to organize the send and recv data by PIDs,
    // i.e., a vector V of vectors, where V[i] is PID i's vector of data.
    // Hence we need to be able to quickly look up an array index
    // associated with each PID.
    Tpetra::Details::HashTable<int,int> hashTable(3*num_neighbors);
    for (int i = 0; i < num_neighbors; i++)
      hashTable.add(neighbors[i], i);

    // Get some information out
    ArrayView<const int> exportLIDs = importer->getExportLIDs();
    ArrayView<const int> exportPIDs = importer->getExportPIDs();
    Array<int> importPIDs;
    Tpetra::Import_Util::getPids(*importer, importPIDs, true/* make local -1 */);

    // Construct the reordering for AMGX as in AMGX_matrix_upload_all documentation
    RCP<const Map> rowMap = inA->getRowMap();
    RCP<const Map> colMap = inA->getColMap();

    int N = rowMap->getNodeNumElements(), Nc = colMap->getNodeNumElements();
    muelu2amgx_.resize(Nc, -1);

    int numUniqExports = 0;
    for (int i = 0; i < exportLIDs.size(); i++)
      if (muelu2amgx_[exportLIDs[i]] == -1) {
        numUniqExports++;
        muelu2amgx_[exportLIDs[i]] = -2;
      }

    int localOffset = 0, exportOffset = N - numUniqExports;
    // Go through exported LIDs and put them at the end of LIDs
    for (int i = 0; i < exportLIDs.size(); i++)
      if (muelu2amgx_[exportLIDs[i]] < 0) // exportLIDs are not unique
        muelu2amgx_[exportLIDs[i]] = exportOffset++;
    // Go through all non-export LIDs, and put them at the beginning of LIDs
    for (int i = 0; i < N; i++)
      if (muelu2amgx_[i] == -1)
        muelu2amgx_[i] = localOffset++;
    // Go through the tail (imported LIDs), and order those by neighbors
    int importOffset = N;
    for (int k = 0; k < num_neighbors; k++)
      for (int i = 0; i < importPIDs.size(); i++)
        if (importPIDs[i] != -1 && hashTable.get(importPIDs[i]) == k)
          muelu2amgx_[i] = importOffset++;

    amgx2muelu.resize(muelu2amgx_.size());
    for (int i = 0; i < muelu2amgx_.size(); i++)
      amgx2muelu[muelu2amgx_[i]] = i;

    // Construct send arrays
    std::vector<std::vector<int> > sendDatas (num_neighbors);
    std::vector<int>               send_sizes(num_neighbors, 0);
    for (int i = 0; i < exportPIDs.size(); i++) {
      int index = hashTable.get(exportPIDs[i]);
      sendDatas [index].push_back(muelu2amgx_[exportLIDs[i]]);
      send_sizes[index]++;
    }
    // FIXME: sendDatas must be sorted (based on GIDs)

    std::vector<const int*> send_maps(num_neighbors);
    for (int i = 0; i < num_neighbors; i++)
      send_maps[i] = &(sendDatas[i][0]);

    // Debugging
    printMaps(comm, sendDatas, amgx2muelu, neighbors, *importer->getTargetMap(), "send_map_vector");

    // Construct recv arrays
    std::vector<std::vector<int> > recvDatas (num_neighbors);
    std::vector<int>               recv_sizes(num_neighbors, 0);
    for (int i = 0; i < importPIDs.size(); i++)
      if (importPIDs[i] != -1) {
        int index = hashTable.get(importPIDs[i]);
        recvDatas [index].push_back(muelu2amgx_[i]);
        recv_sizes[index]++;
      }
    // FIXME: recvDatas must be sorted (based on GIDs)

    std::vector<const int*> recv_maps(num_neighbors);
    for (int i = 0; i < num_neighbors; i++)
      recv_maps[i] = &(recvDatas[i][0]);

    // Debugging
    printMaps(comm, recvDatas, amgx2muelu, neighbors, *importer->getTargetMap(), "recv_map_vector");

    AMGX_SAFE_CALL(AMGX_matrix_comm_from_maps_one_ring(A_, 1, num_neighbors, neighbors,
                                                       &send_sizes[0], &send_maps[0],
                                                       &recv_sizes[0], &recv_maps[0]));

    AMGX_vector_bind(X_, A_);
    AMGX_vector_bind(Y_, A_);
  }

  RCP<Teuchos::Time> matrixTransformTimer = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: transform matrix");
  matrixTransformTimer->start();

  ArrayRCP<const size_t> ia_s;
  ArrayRCP<const int>    ja;
  ArrayRCP<const double> a;
  inA->getAllValues(ia_s, ja, a);

  ArrayRCP<int> ia(ia_s.size());
  for (int i = 0; i < ia.size(); i++)
    ia[i] = Teuchos::as<int>(ia_s[i]);

  N_      = inA->getNodeNumRows();
  int nnz = inA->getNodeNumEntries();

  matrixTransformTimer->stop();
  matrixTransformTimer->incrementNumCalls();

  // Upload matrix
  // TODO Do we need to pin memory here through AMGX_pin_memory?
  RCP<Teuchos::Time> matrixTimer = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: transfer matrix CPU->GPU");
  matrixTimer->start();
  if (numProcs == 1) {
    AMGX_matrix_upload_all(A_, N_, nnz, 1, 1, &ia[0], &ja[0], &a[0], NULL);

  } else {
    // Transform the matrix
    std::vector<int>    ia_new(ia.size());
    std::vector<int>    ja_new(ja.size());
    std::vector<double> a_new (a.size());

    ia_new[0] = 0;
    for (int i = 0; i < N_; i++) {
      int oldRow = amgx2muelu[i];

      ia_new[i+1] = ia_new[i] + (ia[oldRow+1] - ia[oldRow]);

      for (int j = ia[oldRow]; j < ia[oldRow+1]; j++) {
        int offset = j - ia[oldRow];
        ja_new[ia_new[i] + offset] = muelu2amgx_[ja[j]];
        a_new [ia_new[i] + offset] = a[j];
      }
      // Do bubble sort on two arrays
      // NOTE: There are multiple possible optimizations here (even of bubble sort)
      bool swapped;
      do {
        swapped = false;

        for (int j = ia_new[i]; j < ia_new[i+1]-1; j++)
          if (ja_new[j] > ja_new[j+1]) {
            std::swap(ja_new[j], ja_new[j+1]);
            std::swap(a_new [j], a_new [j+1]);
            swapped = true;
          }
      } while (swapped == true);
    }

    AMGX_matrix_upload_all(A_, N_, nnz, 1, 1, &ia_new[0], &ja_new[0], &a_new[0], NULL);
  }
  matrixTimer->stop();
  matrixTimer->incrementNumCalls();

  domainMap_ = inA->getDomainMap();
  rangeMap_  = inA->getRangeMap();

  RCP<Teuchos::Time> realSetupTimer = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: real setup");
  realSetupTimer->start();
  AMGX_solver_setup(Solver_, A_);
  realSetupTimer->stop();
  realSetupTimer->incrementNumCalls();

  vectorTimer1_ = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: transfer vectors CPU->GPU");
  vectorTimer2_ = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: transfer vector GPU->CPU");
}
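The row renumbering plus per-row bubble sort above can also be written as a plain CSR permutation with std::sort. The sketch below is a stand-alone illustration under generic names (new2old/old2new, permute_csr are hypothetical, not MueLu members): copy each old row into its new position, remap the column indices, and sort each output row by column.

#include <algorithm>
#include <utility>
#include <vector>

// Sketch: permute a CSR matrix given a new-row -> old-row map and an
// old-column -> new-column map, sorting each output row by column index.
void permute_csr(const std::vector<int>& ia, const std::vector<int>& ja,
                 const std::vector<double>& a,
                 const std::vector<int>& new2old,   // new row -> old row
                 const std::vector<int>& old2new,   // old col -> new col
                 std::vector<int>& ia_new, std::vector<int>& ja_new,
                 std::vector<double>& a_new)
{
  const int n = static_cast<int>(new2old.size());
  ia_new.assign(n + 1, 0);
  ja_new.resize(ja.size());
  a_new.resize(a.size());

  for (int i = 0; i < n; ++i) {
    const int oldRow = new2old[i];
    const int len = ia[oldRow + 1] - ia[oldRow];
    ia_new[i + 1] = ia_new[i] + len;

    // Copy the row with remapped column indices, then sort it by column.
    std::vector<std::pair<int, double> > row(len);
    for (int k = 0; k < len; ++k)
      row[k] = std::make_pair(old2new[ja[ia[oldRow] + k]], a[ia[oldRow] + k]);
    std::sort(row.begin(), row.end());  // pairs compare by column index first

    for (int k = 0; k < len; ++k) {
      ja_new[ia_new[i] + k] = row[k].first;
      a_new [ia_new[i] + k] = row[k].second;
    }
  }
}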
void comm_recv_msg_sizes(ParallelMachine comm ,
                         const std::vector<int>& send_procs,
                         const std::vector<int>& recv_procs,
                         const std::vector<CommBuffer>& send_bufs,
                         std::vector<CommBuffer>& recv_bufs)
{
  static const char method[] = "stk::comm_recv_msg_sizes" ;

  int result = MPI_SUCCESS ;

  MPI_Datatype uint_type = MPI_LONG_LONG;
  if (sizeof(int) == sizeof(unsigned))
    uint_type = MPI_INT;
  else if (sizeof(long) == sizeof(unsigned))
    uint_type = MPI_LONG;
  else if (sizeof(long long) == sizeof(unsigned))
    uint_type = MPI_LONG_LONG;
  else {
    std::ostringstream msg ;
    msg << method << " ERROR: No matching MPI type found for unsigned";
    throw std::runtime_error(msg.str());
  }

  // do point-to-point send/recvs

  const int mpi_tag = STK_COMMSPARSE_MPI_TAG_MSG_SIZING ;

  MPI_Request request_null = MPI_REQUEST_NULL ;
  const unsigned num_recv = recv_procs.size();
  const unsigned num_send = send_procs.size();
  std::vector<MPI_Request> request( num_recv , request_null );
  std::vector<MPI_Status>  status( num_recv );

  std::vector<unsigned> recv_sizes(num_recv);
  // Post receives for point-to-point message sizes
  for ( unsigned i = 0 ; i < num_recv ; ++i ) {
    unsigned    * const p_buf     = & recv_sizes[i] ;
    MPI_Request * const p_request = & request[i] ;
    result = MPI_Irecv( p_buf , 1 , uint_type,
                        recv_procs[i] , mpi_tag , comm , p_request );
    if ( MPI_SUCCESS != result ) {
      // LOCAL ERROR
      std::ostringstream msg ;
      msg << method << " ERROR: " << result << " == MPI_Irecv" ;
      throw std::runtime_error( msg.str() );
    }
  }

  // barrier to make sure recvs have been posted before sends are launched:
  MPI_Barrier( comm );

  // Send the point-to-point message sizes
  for ( unsigned i = 0 ; i < num_send ; ++i ) {
    int dst = send_procs[i];
    unsigned value = send_bufs[dst].size() ;
    result = MPI_Send( & value , 1 , uint_type , dst , mpi_tag , comm );
    if ( MPI_SUCCESS != result ) {
      // LOCAL ERROR
      std::ostringstream msg ;
      msg << method << " ERROR: " << result << " == MPI_Send" ;
      throw std::runtime_error( msg.str() );
    }
  }

  // Wait for all receives
  {
    MPI_Request * const p_request = (request.empty() ? NULL : & request[0]) ;
    MPI_Status  * const p_status  = (status.empty() ? NULL : & status[0]) ;
    result = MPI_Waitall( num_recv , p_request , p_status );
  }
  if ( MPI_SUCCESS != result ) {
    // LOCAL ERROR ?
    std::ostringstream msg ;
    msg << method << " ERROR: " << result << " == MPI_Waitall" ;
    throw std::runtime_error( msg.str() );
  }

  for(unsigned i=0; i<num_recv; ++i) {
    recv_bufs[recv_procs[i]].set_size(recv_sizes[i]);
  }
}
int fei::Vector_core::gatherFromOverlap(bool accumulate)
{
  if (fei::numProcs(comm_) == 1 || haveFEVector()) {
    return(0);
  }

#ifndef FEI_SER
  //first create the list of procs we'll be sending to.
  std::vector<int> sendProcs;
  for(unsigned i=0; i<remotelyOwned_.size(); ++i) {
    if ((int)i == fei::localProc(comm_)) continue;
    if (remotelyOwned_[i]->size() == 0) continue;

    sendProcs.push_back(i);
  }

  std::vector<int> recvProcs;
  fei::mirrorProcs(comm_, sendProcs, recvProcs);

  //declare arrays to hold the indices and coefs we'll be receiving.
  std::vector<std::vector<int> > recv_ints(recvProcs.size());
  std::vector<std::vector<double> > recv_doubles(recvProcs.size());

  std::vector<int> recv_sizes(recvProcs.size());
  std::vector<MPI_Request> mpiReqs(recvProcs.size()*2);
  std::vector<MPI_Status> mpiStatuses(recvProcs.size()*2);
  int tag1 = 11111;
  int tag2 = 11112;

  //post the recvs for the sizes.
  for(size_t i=0; i<recvProcs.size(); ++i) {
    int proc = recvProcs[i];
    MPI_Irecv(&recv_sizes[i], 1, MPI_INT, proc,
              tag1, comm_, &mpiReqs[i]);
  }

  //send the sizes of data we'll be sending.
  for(unsigned i=0; i<sendProcs.size(); ++i) {
    int proc = sendProcs[i];
    int size = remotelyOwned_[proc]->size();
    MPI_Send(&size, 1, MPI_INT, proc, tag1, comm_);
  }

  if (recvProcs.size() > 0) {
    MPI_Waitall(recvProcs.size(), &mpiReqs[0], &mpiStatuses[0]);
  }

  //now post the recvs for the data.
  unsigned offset = 0;
  for(size_t i=0; i<recvProcs.size(); ++i) {
    int proc = recvProcs[i];
    int size = recv_sizes[i];
    std::vector<int>& recv_ints_i = recv_ints[i];
    std::vector<double>& recv_doubles_i = recv_doubles[i];
    recv_ints_i.resize(size);
    recv_doubles_i.resize(size);
    MPI_Irecv(&(recv_ints_i[0]), size, MPI_INT, proc,
              tag1, comm_, &mpiReqs[offset++]);
    MPI_Irecv(&(recv_doubles_i[0]), size, MPI_DOUBLE, proc,
              tag2, comm_, &mpiReqs[offset++]);
  }

  //now send the outgoing data.
  for(size_t i=0; i<sendProcs.size(); ++i) {
    int proc = sendProcs[i];
    int size = remotelyOwned_[proc]->size();
    int* indices = &(remotelyOwned_[proc]->indices())[0];
    MPI_Send(indices, size, MPI_INT, proc, tag1, comm_);
    double* coefs = &(remotelyOwned_[proc]->coefs())[0];
    MPI_Send(coefs, size, MPI_DOUBLE, proc, tag2, comm_);

    fei::set_values(*remotelyOwned_[proc], 0.0);
  }

  if (recvProcs.size() > 0) {
    MPI_Waitall(recvProcs.size()*2, &mpiReqs[0], &mpiStatuses[0]);
  }

  //now store the data we've received.
  for(size_t i=0; i<recvProcs.size(); ++i) {
    int num = recv_sizes[i];
    std::vector<int>& recv_ints_i = recv_ints[i];
    std::vector<double>& recv_doubles_i = recv_doubles[i];
    int err = giveToUnderlyingVector(num, &(recv_ints_i[0]),
                                     &(recv_doubles_i[0]), accumulate, 0);
    if (err != 0) {
      FEI_COUT << "fei::Vector_core::gatherFromOverlap ERROR storing recvd data" << FEI_ENDL;
      return(err);
    }
  }
#endif  //#ifndef FEI_SER

  return(0);
}
template<typename T>
void
all_to_all_impl(const communicator& comm, const T* in_values, int n,
                T* out_values, mpl::false_)
{
  int size = comm.size();
  int rank = comm.rank();

  // The amount of data to be sent to each process
  std::vector<int> send_sizes(size);

  // The displacements for each outgoing value.
  std::vector<int> send_disps(size);

  // The buffer that will store all of the outgoing values
  std::vector<char, allocator<char> > outgoing;

  // Pack the buffer with all of the outgoing values.
  for (int dest = 0; dest < size; ++dest) {
    // Keep track of the displacements
    send_disps[dest] = outgoing.size();

    // Our own value will never be transmitted, so don't pack it.
    if (dest != rank) {
      packed_oarchive oa(comm, outgoing);
      for (int i = 0; i < n; ++i)
        oa << in_values[dest * n + i];
    }

    // Keep track of the sizes
    send_sizes[dest] = outgoing.size() - send_disps[dest];
  }

  // Determine how much data each process will receive.
  std::vector<int> recv_sizes(size);
  all_to_all(comm, send_sizes, recv_sizes);

  // Prepare a buffer to receive the incoming data.
  std::vector<int> recv_disps(size);
  int sum = 0;
  for (int src = 0; src < size; ++src) {
    recv_disps[src] = sum;
    sum += recv_sizes[src];
  }
  std::vector<char, allocator<char> > incoming(sum > 0? sum : 1);

  // Make sure we don't try to reference an empty vector
  if (outgoing.empty())
    outgoing.push_back(0);

  // Transmit the actual data
  BOOST_MPI_CHECK_RESULT(MPI_Alltoallv,
                         (&outgoing[0], &send_sizes[0],
                          &send_disps[0], MPI_PACKED,
                          &incoming[0], &recv_sizes[0],
                          &recv_disps[0], MPI_PACKED,
                          comm));

  // Deserialize data from the iarchive
  for (int src = 0; src < size; ++src) {
    if (src == rank)
      std::copy(in_values + src * n, in_values + (src + 1) * n,
                out_values + src * n);
    else {
      packed_iarchive ia(comm, incoming, boost::archive::no_header,
                         recv_disps[src]);
      for (int i = 0; i < n; ++i)
        ia >> out_values[src * n + i];
    }
  }
}
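Underneath Boost.MPI's archives this is the usual count/displacement dance around MPI_Alltoallv. A bare-MPI sketch with int payloads, illustrative only (all_to_all_varsize and its parameters are hypothetical names, not Boost's implementation):

#include <mpi.h>
#include <vector>

// Sketch: send a variable number of ints to every rank with MPI_Alltoallv.
// send_data[p] holds the values destined for rank p; recv_data[p] receives
// what rank p sent to us. Assumes MPI_Init has already been called.
void all_to_all_varsize(const std::vector<std::vector<int> >& send_data,
                        std::vector<std::vector<int> >& recv_data, MPI_Comm comm)
{
  int size;
  MPI_Comm_size(comm, &size);

  // Flatten the outgoing data and record per-destination counts/displacements.
  std::vector<int> send_sizes(size), send_disps(size), outgoing;
  for (int p = 0; p < size; ++p) {
    send_disps[p] = static_cast<int>(outgoing.size());
    outgoing.insert(outgoing.end(), send_data[p].begin(), send_data[p].end());
    send_sizes[p] = static_cast<int>(outgoing.size()) - send_disps[p];
  }

  // Exchange the counts first, then build the receive displacements.
  std::vector<int> recv_sizes(size), recv_disps(size);
  MPI_Alltoall(send_sizes.data(), 1, MPI_INT, recv_sizes.data(), 1, MPI_INT, comm);
  int total = 0;
  for (int p = 0; p < size; ++p) { recv_disps[p] = total; total += recv_sizes[p]; }

  // Avoid passing null buffers when nothing is sent or received.
  std::vector<int> incoming(total > 0 ? total : 1);
  if (outgoing.empty()) outgoing.push_back(0);

  MPI_Alltoallv(outgoing.data(), send_sizes.data(), send_disps.data(), MPI_INT,
                incoming.data(), recv_sizes.data(), recv_disps.data(), MPI_INT,
                comm);

  // Split the flat receive buffer back into per-source vectors.
  recv_data.assign(size, std::vector<int>());
  for (int p = 0; p < size; ++p)
    recv_data[p].assign(incoming.begin() + recv_disps[p],
                        incoming.begin() + recv_disps[p] + recv_sizes[p]);
}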
inline void copy_from_owned( const BulkData & mesh ,
                             const std::vector< const FieldBase *> & fields )
{
  if ( fields.empty() ) { return; }

  const int parallel_size = mesh.parallel_size();
  const int parallel_rank = mesh.parallel_rank();

  const std::vector<const FieldBase *>::const_iterator fe = fields.end();
  const std::vector<const FieldBase *>::const_iterator fb = fields.begin();
        std::vector<const FieldBase *>::const_iterator fi ;

  std::vector<std::vector<unsigned char> > send_data(parallel_size);
  std::vector<std::vector<unsigned char> > recv_data(parallel_size);

  const EntityCommListInfoVector &comm_info_vec = mesh.internal_comm_list();
  size_t comm_info_vec_size = comm_info_vec.size();

  std::vector<unsigned> send_sizes(parallel_size, 0);
  std::vector<unsigned> recv_sizes(parallel_size, 0);

  //this first loop calculates send_sizes and recv_sizes.
  for(fi = fb; fi != fe; ++fi) {
    const FieldBase & f = **fi;

    for(size_t i = 0; i<comm_info_vec_size; ++i) {
      const Bucket* bucket = comm_info_vec[i].bucket;
      int owner = comm_info_vec[i].owner;
      const bool owned = (owner == parallel_rank);

      unsigned e_size = 0;
      if(is_matching_rank(f, *bucket)) {
        const unsigned bucketId = bucket->bucket_id();
        unsigned size = field_bytes_per_entity(f, bucketId);
        e_size += size;
      }

      if(e_size == 0) {
        continue;
      }

      if(owned) {
        const EntityCommInfoVector& infovec = comm_info_vec[i].entity_comm->comm_map;
        size_t infovec_size = infovec.size();
        for(size_t j=0; j<infovec_size; ++j) {
          int proc = infovec[j].proc;
          send_sizes[proc] += e_size;
        }
      }
      else {
        recv_sizes[owner] += e_size;
      }
    }
  }

  //now size the send_data buffers
  size_t max_len = 0;
  for(int p=0; p<parallel_size; ++p) {
    if (send_sizes[p] > 0) {
      if (send_sizes[p] > max_len) {
        max_len = send_sizes[p];
      }
      send_data[p].resize(send_sizes[p]);
      send_sizes[p] = 0;
    }
  }

  //now pack the send buffers
  std::vector<unsigned char> field_data(max_len);
  unsigned char* field_data_ptr = field_data.data();

  for(fi = fb; fi != fe; ++fi) {
    const FieldBase & f = **fi;

    for(size_t i = 0; i<comm_info_vec_size; ++i) {
      const Bucket* bucket = comm_info_vec[i].bucket;
      int owner = comm_info_vec[i].owner;
      const bool owned = (owner == parallel_rank);

      unsigned e_size = 0;
      if(is_matching_rank(f, *bucket)) {
        const unsigned bucketId = bucket->bucket_id();
        unsigned size = field_bytes_per_entity(f, bucketId);
        if (owned && size > 0) {
          unsigned char * ptr = reinterpret_cast<unsigned char*>(stk::mesh::field_data(f, bucketId, comm_info_vec[i].bucket_ordinal, size));
          std::memcpy(field_data_ptr+e_size, ptr, size);
          // field_data.insert(field_data.end(), ptr, ptr+size);
        }
        e_size += size;
      }

      if(e_size == 0) {
        continue;
      }

      if(owned) {
        const EntityCommInfoVector& infovec = comm_info_vec[i].entity_comm->comm_map;
        size_t infovec_size = infovec.size();
        for(size_t j=0; j<infovec_size; ++j) {
          int proc = infovec[j].proc;
          unsigned char* dest_ptr = send_data[proc].data()+send_sizes[proc];
          unsigned char* src_ptr = field_data_ptr;
          std::memcpy(dest_ptr, src_ptr, e_size);
          send_sizes[proc] += e_size;
          // send_data[proc].insert(send_data[proc].end(), field_data.begin(), field_data.end());
        }
      }
      //recv_sizes were already accumulated in the sizing loop above, so the
      //non-owned case needs no work here.
    }
  }

  for(int p=0; p<parallel_size; ++p) {
    if (recv_sizes[p] > 0) {
      recv_data[p].resize(recv_sizes[p]);
      recv_sizes[p] = 0;
    }
  }

  parallel_data_exchange_nonsym_known_sizes_t(send_data, recv_data, mesh.parallel());

  //now unpack and store the recvd data, using recv_sizes as running offsets
  for(fi = fb; fi != fe; ++fi) {
    const FieldBase & f = **fi;

    for(size_t i=0; i<comm_info_vec_size; ++i) {
      int owner = comm_info_vec[i].owner;
      const bool owned = (owner == parallel_rank);
      if(owned || recv_data[owner].size() == 0) {
        continue;
      }

      const Bucket* bucket = comm_info_vec[i].bucket;
      if(is_matching_rank(f, *bucket)) {
        const unsigned bucketId = bucket->bucket_id();
        unsigned size = field_bytes_per_entity(f, bucketId);
        if (size > 0) {
          unsigned char * ptr = reinterpret_cast<unsigned char*>(stk::mesh::field_data(f, bucketId, comm_info_vec[i].bucket_ordinal, size));
          std::memcpy(ptr, &(recv_data[owner][recv_sizes[owner]]), size);
          recv_sizes[owner] += size;
        }
      }
    }
  }
}