// Exchange variable-length rows between all ranks of `comm`.
//
// send_data : row i holds the elements handed to MyMPI_ISend for peer rank i.
// recv_data : row i is resized to the element count announced by rank i and
//             is handed to MyMPI_IRecv for peer rank i.
// tag       : MPI tag used for every point-to-point message of this exchange.
// comm      : communicator to operate on (defaults to MPI_COMM_WORLD).
//
// The row addressed to the calling rank itself is never transmitted:
// recv_data[rank] is sized like send_data[rank], but its contents are not
// written by this function.
inline void MyMPI_ExchangeTable (TABLE<T> & send_data,
                                 TABLE<T> & recv_data, int tag,
                                 MPI_Comm comm = MPI_COMM_WORLD)
{
    int ntasks, rank;
    MPI_Comm_size(comm, &ntasks);
    MPI_Comm_rank(comm, &rank);

    // Announce to every rank how many elements it will receive from us.
    Array<int> send_sizes(ntasks);
    Array<int> recv_sizes(ntasks);
    for (int i = 0; i < ntasks; i++)
      send_sizes[i] = send_data[i].Size();

    // Separate send and receive buffers are used on purpose: the
    // MPI_IN_PLACE variant of this call was found to be buggy.
    MPI_Alltoall (&send_sizes[0], 1, MPI_INT,
                  &recv_sizes[0], 1, MPI_INT, comm);

    for (int i = 0; i < ntasks; i++)
      recv_data.SetEntrySize (i, recv_sizes[i], sizeof(T));

    // Post all non-empty sends first, then all non-empty receives.
    Array<MPI_Request> requests;
    for (int dest = 0; dest < ntasks; dest++)
      if (dest != rank && send_data[dest].Size())
        requests.Append (MyMPI_ISend (send_data[dest], dest, tag, comm));

    for (int dest = 0; dest < ntasks; dest++)
      if (dest != rank && recv_data[dest].Size())
        requests.Append (MyMPI_IRecv (recv_data[dest], dest, tag, comm));

    // When every remote row is empty no request was posted; skip the wait
    // instead of taking the address of element 0 of an empty array.
    // MPI_Waitall takes an array of statuses, so the matching "ignore"
    // constant is MPI_STATUSES_IGNORE (plural), not MPI_STATUS_IGNORE.
    if (requests.Size() > 0)
      MPI_Waitall (requests.Size(), &requests[0], MPI_STATUSES_IGNORE);
}
Ejemplo n.º 2
    // Collect one value of type T from every process into `results`.
    //
    // `elem` is serialized through a graphlab::oarchive, the per-process byte
    // counts are collected with MPI_Gather, converted into exclusive
    // prefix-sum offsets, and the serialized bytes are collected with
    // MPI_Gatherv.  The received buffer is then deserialized sequentially
    // into results[0], results[1], ...  `results` is resized to size() first.
    //
    // NOTE(review): this listing looks truncated -- both MPI calls end after
    // the root argument (no communicator, no closing parenthesis), several
    // closing braces are absent, and no #else/#endif is visible for the
    // HAS_MPI guard.  Restore them from the upstream source before building.
    void gather(const T& elem, std::vector<T>& results) {
#ifdef HAS_MPI
      // Get the mpi rank and size
      size_t mpi_size(size());
      int mpi_rank(rank());
      if(results.size() != mpi_size) results.resize(mpi_size);

      // Serialize the local element into an in-memory character stream
      graphlab::charstream cstrm(128);
      graphlab::oarchive oarc(cstrm);
      oarc << elem;
      char* send_buffer = cstrm->c_str();
      int send_buffer_size = cstrm->size();
      assert(send_buffer_size >= 0);

      // Per-process byte counts; initialised to -1 so the assert below can
      // catch entries that were never filled in.
      std::vector<int> recv_sizes(mpi_size, -1);
      // Gather the serialized sizes.
      // NOTE(review): the root passed here is the caller's own rank
      // (mpi_rank), not a fixed root -- confirm this is intended.
      int error = MPI_Gather(&send_buffer_size,  // Send buffer
                             1,                  // send count
                             MPI_INT,            // send type
                             &(recv_sizes[0]),  // recvbuffer
                             1,                  // recvcount
                             MPI_INT,           // recvtype
                             mpi_rank,          // root rank
      assert(error == MPI_SUCCESS);
      for(size_t i = 0; i < recv_sizes.size(); ++i)
        assert(recv_sizes[i] >= 0);

      // Construct offsets: exclusive prefix sum of the sizes; `sum` ends up
      // as the total number of bytes to receive.
      std::vector<int> recv_offsets(recv_sizes);
      int sum = 0, tmp = 0;
      for(size_t i = 0; i < recv_offsets.size(); ++i) {
        tmp = recv_offsets[i]; recv_offsets[i] = sum; sum += tmp;

      // Allocate a receive buffer large enough for all serialized elements
      std::vector<char> recv_buffer(sum);

      // Receive the serialized elements of all processes
      error = MPI_Gatherv(send_buffer,         // send buffer
                          send_buffer_size,    // how much to send
                          MPI_BYTE,            // send type
                          &(recv_buffer[0]),   // recv buffer
                          &(recv_sizes[0]),    // amount to recv
                                               // for each process
                          &(recv_offsets[0]),  // where to place data
                          mpi_rank,            // root rank
      assert(error == MPI_SUCCESS);
      // Deserialize the received bytes, one element per process, by reading
      // the buffer through a boost::iostreams array_source stream.
      namespace bio = boost::iostreams;
      typedef bio::stream<bio::array_source> icharstream;
      icharstream strm(&(recv_buffer[0]), recv_buffer.size());
      graphlab::iarchive iarc(strm);
      for(size_t i = 0; i < results.size(); ++i) {
        iarc >> results[i];
      // NOTE(review): presumably the non-MPI (#else) branch; the
      // preprocessor lines around it are not visible in this listing.
      logstream(LOG_FATAL) << "MPI not installed!" << std::endl;
    } // end of gather
    //! @name Constructor/Destructor
    // Builds the AMGX-side state for the matrix `inA`.
    //
    // Steps performed by the visible code, in order:
    //   1. create the AMGX configuration from the "amgx:params" sublist of
    //      `paramListIn` (either a JSON file or a "name=value, " string),
    //   2. create AMGX resources, solver, matrix and two vectors,
    //   3. for more than one rank: derive the neighbour list and the
    //      send/receive index maps from the matrix importer and register
    //      them with AMGX_matrix_comm_from_maps_one_ring,
    //   4. upload the CSR arrays (renumbered and row-sorted when running on
    //      more than one rank),
    //   5. store domain/range maps, call AMGX_solver_setup and create the
    //      vector-transfer timers.
    //
    // Throws (via TEUCHOS_TEST_FOR_EXCEPTION) when the communicator is not a
    // Teuchos::MpiComm, when the matrix has no importer, or when the sets of
    // send and receive ranks differ.
    //
    // NOTE(review): this listing looks truncated -- closing braces,
    // #else/#endif lines and a few statements are missing (see the notes
    // inline).  Restore them from the upstream source before building.
    AMGXOperator(const Teuchos::RCP<Tpetra::CrsMatrix<SC,LO,GO,NO> > &inA, Teuchos::ParameterList &paramListIn) {
      RCP<const Teuchos::Comm<int> > comm = inA->getRowMap()->getComm();
      int numProcs = comm->getSize();
      int myRank   = comm->getRank();

      RCP<Teuchos::Time> amgxTimer = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: initialize");
      // Initialize

      // AMGX settings live in the "amgx:params" sublist.
      Teuchos::ParameterList configs = paramListIn.sublist("amgx:params", true);
      if (configs.isParameter("json file")) {
        // A JSON file name was given: let AMGX read the configuration itself.
        AMGX_SAFE_CALL(AMGX_config_create_from_file(&Config_, (const char *) &configs.get<std::string>("json file")[0]));
      } else {
        // Otherwise flatten every entry of the sublist into one
        // "name=value, name=value, " configuration string.
        std::ostringstream oss;
        oss << "";
        ParameterList::ConstIterator itr;
        for (itr = configs.begin(); itr != configs.end(); ++itr) {
          // NOTE(review): the initialiser of `name` is missing in this listing.
          const std::string&    name  =;
          const ParameterEntry& entry = configs.entry(itr);
          oss << name << "=" << filterValueToString(entry) << ", ";
        oss << "\0";
        std::string configString = oss.str();
        if (configString == "") {
          //print msg that using defaults
          //GetOStream(Warnings0) << "Warning: No configuration parameters specified, using default AMGX configuration parameters. \n";
        AMGX_SAFE_CALL(AMGX_config_create(&Config_, configString.c_str()));

      // TODO: we probably need to add "exception_handling=1" to the parameter list
      // to switch on internal error handling (with no need for AMGX_SAFE_CALL)

#define NEW_COMM
#ifdef NEW_COMM
      // NOTE: MPI communicator used in AMGX_resources_create must exist in the scope of AMGX_matrix_comm_from_maps_one_ring
      // FIXME: fix for serial comm
      // Work on a duplicate of the row map's communicator and extract the
      // raw MPI_Comm from it.
      RCP<const Teuchos::MpiComm<int> > tmpic = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm->duplicate());
      TEUCHOS_TEST_FOR_EXCEPTION(tmpic.is_null(), Exceptions::RuntimeError, "Communicator is not MpiComm");

      RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm();
      MPI_Comm mpiComm = *rawMpiComm;

      // Construct AMGX resources
      if (numProcs == 1) {
        AMGX_resources_create_simple(&Resources_, Config_);

      } else {
        // NOTE(review): no assignment to numGPUDevices is visible before it
        // is used as a modulus below -- presumably a device-count query was
        // lost from this listing; confirm against upstream.
        int numGPUDevices;
        // Each rank uses a single device, chosen round-robin by rank.
        int device[] = {(comm->getRank() % numGPUDevices)};

        AMGX_config_add_parameters(&Config_, "communicator=MPI");
#ifdef NEW_COMM
        // NOTE(review): the two calls below are presumably the two arms of
        // an #ifdef/#else/#endif whose #else and #endif are not visible.
        AMGX_resources_create(&Resources_, Config_, &mpiComm, 1/* number of GPU devices utilized by this rank */, device);
        AMGX_resources_create(&Resources_, Config_, MPI_COMM_WORLD, 1/* number of GPU devices utilized by this rank */, device);

      // Create the solver, the matrix and the two work vectors in mode dDDI.
      AMGX_Mode mode = AMGX_mode_dDDI;
      AMGX_solver_create(&Solver_, Resources_, mode,  Config_);
      AMGX_matrix_create(&A_,      Resources_, mode);
      AMGX_vector_create(&X_,      Resources_, mode);
      AMGX_vector_create(&Y_,      Resources_, mode);


      // Inverse of muelu2amgx_: AMGX local index -> MueLu local index.
      std::vector<int> amgx2muelu;

      // Construct AMGX communication pattern
      if (numProcs > 1) {
        RCP<const Tpetra::Import<LO,GO> > importer = inA->getCrsGraph()->getImporter();

        TEUCHOS_TEST_FOR_EXCEPTION(importer.is_null(), MueLu::Exceptions::RuntimeError, "The matrix A has no Import object.");

        Tpetra::Distributor distributor = importer->getDistributor();

        Array<int> sendRanks = distributor.getImagesTo();
        Array<int> recvRanks = distributor.getImagesFrom();

        // Sort both rank lists so they can be compared element by element.
        std::sort(sendRanks.begin(), sendRanks.end());
        std::sort(recvRanks.begin(), recvRanks.end());

        bool match = true;
        if (sendRanks.size() != recvRanks.size()) {
          match = false;
        } else {
          for (int i = 0; i < sendRanks.size(); i++) {
            if (recvRanks[i] != sendRanks[i])
              match = false;
        TEUCHOS_TEST_FOR_EXCEPTION(!match, MueLu::Exceptions::RuntimeError, "AMGX requires that the processors that we send to and receive from are the same. "
                                   "This is not the case: we send to {" << sendRanks << "} and receive from {" << recvRanks << "}");

        int        num_neighbors = sendRanks.size();  // does not include the calling process
        const int* neighbors     = &sendRanks[0];

        // Later on, we'll have to organize the send and recv data by PIDs,
        // i.e, a vector V of vectors, where V[i] is PID i's vector of data.
        // Hence we need to be able to quickly look up  an array index
        // associated with each PID.
        Tpetra::Details::HashTable<int,int> hashTable(3*num_neighbors);
        for (int i = 0; i < num_neighbors; i++)
          hashTable.add(neighbors[i], i);

        // Get some information out
        ArrayView<const int> exportLIDs = importer->getExportLIDs();
        ArrayView<const int> exportPIDs = importer->getExportPIDs();
        Array<int> importPIDs;
        Tpetra::Import_Util::getPids(*importer, importPIDs, true/* make local -1 */);

        // Construct the reordering for AMGX as in AMGX_matrix_upload_all documentation
        RCP<const Map> rowMap = inA->getRowMap();
        RCP<const Map> colMap = inA->getColMap();

        // N: number of local rows; Nc: number of local columns.
        // muelu2amgx_ starts as all -1 ("not yet numbered").
        int N = rowMap->getNodeNumElements(), Nc = colMap->getNodeNumElements();
        muelu2amgx_.resize(Nc, -1);

        // Mark every exported LID with -2 (export LIDs may repeat).
        // NOTE(review): no increment of numUniqExports is visible in this
        // listing -- presumably lost together with the closing brace.
        int numUniqExports = 0;
        for (int i = 0; i < exportLIDs.size(); i++)
          if (muelu2amgx_[exportLIDs[i]] == -1) {
            muelu2amgx_[exportLIDs[i]] = -2;

        int localOffset = 0, exportOffset = N - numUniqExports;
        // Go through exported LIDs and put them at the end of LIDs
        for (int i = 0; i < exportLIDs.size(); i++)
          if (muelu2amgx_[exportLIDs[i]] < 0) // exportLIDs are not unique
            muelu2amgx_[exportLIDs[i]] = exportOffset++;
        // Go through all non-export LIDs, and put them at the beginning of LIDs
        for (int i = 0; i < N; i++)
          if (muelu2amgx_[i] == -1)
            muelu2amgx_[i] = localOffset++;
        // Go through the tail (imported LIDs), and order those by neighbors
        int importOffset = N;
        for (int k = 0; k < num_neighbors; k++)
          for (int i = 0; i < importPIDs.size(); i++)
            if (importPIDs[i] != -1 && hashTable.get(importPIDs[i]) == k)
              muelu2amgx_[i] = importOffset++;

        // Build the inverse permutation.
        // NOTE(review): amgx2muelu is declared empty above and no resize is
        // visible before it is indexed here -- confirm against upstream.
        for (int i = 0; i < muelu2amgx_.size(); i++)
          amgx2muelu[muelu2amgx_[i]] = i;

        // Construct send arrays
        // sendDatas[k] collects the AMGX indices exported to neighbour k.
        // NOTE(review): send_sizes is zero-initialised and no update of it
        // is visible in this listing before it is passed to AMGX below.
        std::vector<std::vector<int> > sendDatas (num_neighbors);
        std::vector<int>               send_sizes(num_neighbors, 0);
        for (int i = 0; i < exportPIDs.size(); i++) {
          int index = hashTable.get(exportPIDs[i]);
          sendDatas [index].push_back(muelu2amgx_[exportLIDs[i]]);
        // FIXME: sendDatas must be sorted (based on GIDs)

        std::vector<const int*> send_maps(num_neighbors);
        for (int i = 0; i < num_neighbors; i++)
          send_maps[i] = &(sendDatas[i][0]);

        // Debugging
        printMaps(comm, sendDatas, amgx2muelu, neighbors, *importer->getTargetMap(), "send_map_vector");

        // Construct recv arrays
        // recvDatas[k] collects the AMGX indices imported from neighbour k.
        // NOTE(review): as with send_sizes, no update of recv_sizes is
        // visible in this listing.
        std::vector<std::vector<int> > recvDatas (num_neighbors);
        std::vector<int>               recv_sizes(num_neighbors, 0);
        for (int i = 0; i < importPIDs.size(); i++)
          if (importPIDs[i] != -1) {
            int index = hashTable.get(importPIDs[i]);
            recvDatas [index].push_back(muelu2amgx_[i]);
        // FIXME: recvDatas must be sorted (based on GIDs)

        std::vector<const int*> recv_maps(num_neighbors);
        for (int i = 0; i < num_neighbors; i++)
          recv_maps[i] = &(recvDatas[i][0]);

        // Debugging
        printMaps(comm, recvDatas, amgx2muelu, neighbors, *importer->getTargetMap(), "recv_map_vector");

        // Register the neighbour list and the send/receive maps with AMGX.
        AMGX_SAFE_CALL(AMGX_matrix_comm_from_maps_one_ring(A_, 1, num_neighbors, neighbors, &send_sizes[0], &send_maps[0], &recv_sizes[0], &recv_maps[0]));

        AMGX_vector_bind(X_, A_);
        AMGX_vector_bind(Y_, A_);

      RCP<Teuchos::Time> matrixTransformTimer = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: transform matrix");

      // Fetch the CSR arrays of the matrix: row offsets (size_t), column
      // indices and values.
      ArrayRCP<const size_t> ia_s;
      ArrayRCP<const int>    ja;
      ArrayRCP<const double> a;
      inA->getAllValues(ia_s, ja, a);

      // Convert the size_t row offsets to the int offsets passed to AMGX.
      ArrayRCP<int> ia(ia_s.size());
      for (int i = 0; i < ia.size(); i++)
        ia[i] = Teuchos::as<int>(ia_s[i]);

      N_      = inA->getNodeNumRows();
      int nnz = inA->getNodeNumEntries();


      // Upload matrix
      // TODO Do we need to pin memory here through AMGX_pin_memory?
      RCP<Teuchos::Time> matrixTimer = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: transfer matrix  CPU->GPU");
      if (numProcs == 1) {
        // Single rank: the arrays are uploaded unchanged.
        AMGX_matrix_upload_all(A_, N_, nnz, 1, 1, &ia[0], &ja[0], &a[0], NULL);

      } else {
        // Transform the matrix
        // Rows are permuted with amgx2muelu and column indices are
        // renumbered with muelu2amgx_.
        std::vector<int>    ia_new(ia.size());
        std::vector<int>    ja_new(ja.size());
        std::vector<double> a_new (a.size());

        ia_new[0] = 0;
        for (int i = 0; i < N_; i++) {
          int oldRow = amgx2muelu[i];

          // New row i has as many entries as old row oldRow.
          ia_new[i+1] = ia_new[i] + (ia[oldRow+1] - ia[oldRow]);

          for (int j = ia[oldRow]; j < ia[oldRow+1]; j++) {
            int offset = j - ia[oldRow];
            ja_new[ia_new[i] + offset] = muelu2amgx_[ja[j]];
            a_new [ia_new[i] + offset] = a[j];
          // Do bubble sort on two arrays
          // (sorts the row's column indices ascending, moving the values
          // along with them)
          // NOTE: There are multiple possible optimizations here (even of bubble sort)
          bool swapped;
          do {
            swapped = false;

            for (int j = ia_new[i]; j < ia_new[i+1]-1; j++)
              if (ja_new[j] > ja_new[j+1]) {
                std::swap(ja_new[j], ja_new[j+1]);
                std::swap(a_new [j], a_new [j+1]);
                swapped = true;
          } while (swapped == true);

        AMGX_matrix_upload_all(A_, N_, nnz, 1, 1, &ia_new[0], &ja_new[0], &a_new[0], NULL);

      domainMap_ = inA->getDomainMap();
      rangeMap_  = inA->getRangeMap();

      RCP<Teuchos::Time> realSetupTimer = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: real setup");
      AMGX_solver_setup(Solver_, A_);

      // Timers stored as members, named for the CPU->GPU and GPU->CPU
      // vector transfers.
      vectorTimer1_ = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: transfer vectors CPU->GPU");
      vectorTimer2_ = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: transfer vector  GPU->CPU");
Ejemplo n.º 4
// Exchanges point-to-point message sizes between sparse sets of processes.
//
// comm       : communicator used for all calls.
// send_procs : ranks this process sends a size to.
// recv_procs : ranks this process expects a size from.
// send_bufs  : indexed by destination rank; its size() is the value sent.
// recv_bufs  : not referenced in the visible part of this listing.
//
// Throws std::runtime_error when no MPI datatype of the same width as
// `unsigned` is found, or when MPI_Irecv / MPI_Send / MPI_Waitall reports
// a result other than MPI_SUCCESS.
//
// NOTE(review): this listing looks truncated -- brace-only lines are
// absent and the function breaks off at the final `for` loop, so what is
// done with the received sizes cannot be seen here.
void comm_recv_msg_sizes(ParallelMachine comm ,
                     const std::vector<int>& send_procs,
                     const std::vector<int>& recv_procs,
                     const std::vector<CommBuffer>& send_bufs,
                     std::vector<CommBuffer>& recv_bufs)
  static const char method[] = "stk::comm_recv_msg_sizes" ;

  int result = MPI_SUCCESS ;

  // Pick an MPI integer datatype that has the same byte width as `unsigned`;
  // the unsigned values below are transported using that type.
  MPI_Datatype uint_type = MPI_LONG_LONG;
  if (sizeof(int) == sizeof(unsigned))
    uint_type = MPI_INT;
  else if (sizeof(long) == sizeof(unsigned))
    uint_type = MPI_LONG;
  else if (sizeof(long long) == sizeof(unsigned))
    uint_type = MPI_LONG_LONG;
  else {
    std::ostringstream msg ;
    msg << method << " ERROR: No matching MPI type found for unsigned";
    throw std::runtime_error(msg.str());

  // do point-to-point send/recvs

  const int mpi_tag = STK_COMMSPARSE_MPI_TAG_MSG_SIZING ;

  // One request/status slot per expected incoming size.
  MPI_Request request_null = MPI_REQUEST_NULL ;
  const unsigned num_recv = recv_procs.size();
  const unsigned num_send = send_procs.size();
  std::vector<MPI_Request> request( num_recv , request_null );
  std::vector<MPI_Status>  status(  num_recv );

  std::vector<unsigned> recv_sizes(num_recv);

  // Post receives for point-to-point message sizes

  for ( unsigned i = 0 ; i < num_recv ; ++i ) {
    unsigned    * const p_buf     = & recv_sizes[i] ;
    MPI_Request * const p_request = & request[i] ;
    result = MPI_Irecv( p_buf , 1 , uint_type,
                        recv_procs[i] , mpi_tag , comm , p_request );
    if ( MPI_SUCCESS != result ) {
      // LOCAL ERROR
      std::ostringstream msg ;
      msg << method << " ERROR: " << result << " == MPI_Irecv" ;
      throw std::runtime_error( msg.str() );

  //barrier to make sure recvs have been posted before sends are launched:
  MPI_Barrier( comm );

  // Send the point-to-point message sizes,
  // (blocking sends; note that send_bufs is indexed by the destination
  // rank, not by the loop counter)

  for ( unsigned i = 0 ; i < num_send ; ++i ) {
    int      dst = send_procs[i];
    unsigned value = send_bufs[dst].size() ;
    result = MPI_Send( & value , 1 , uint_type, dst , mpi_tag , comm );
    if ( MPI_SUCCESS != result ) {
      // LOCAL ERROR
      std::ostringstream msg ;
      msg << method << " ERROR: " << result << " == MPI_Send" ;
      throw std::runtime_error( msg.str() );

  // Wait for all receives
  // (NULL is passed instead of &vec[0] when the vectors are empty)

    MPI_Request * const p_request = (request.empty() ? NULL : & request[0]) ;
    MPI_Status  * const p_status  = (status.empty() ? NULL : & status[0]) ;
    result = MPI_Waitall( num_recv , p_request , p_status );
  if ( MPI_SUCCESS != result ) {
    // LOCAL ERROR ?
    std::ostringstream msg ;
    msg << method << " ERROR: " << result << " == MPI_Waitall" ;
    throw std::runtime_error( msg.str() );

  // NOTE(review): the body of this loop is not part of the listing.
  for(unsigned i=0; i<num_recv; ++i) {
Ejemplo n.º 5
// Sends the locally held entries for remotely owned indices to the
// corresponding processes and stores what is received from other processes
// into the underlying vector.
//
// accumulate : forwarded unchanged to giveToUnderlyingVector for every
//              received batch.
//
// Communication happens in two rounds: first the entry counts (tag1), then
// the integer indices (tag1 again) and the double coefficients (tag2).
// After a process's data has been sent, its remotelyOwned_ values are reset
// to 0.0.
//
// NOTE(review): this listing looks truncated -- brace-only lines, the
// return statements, the statement that fills sendProcs and any resizing of
// the receive buffers are not visible.  Restore them from upstream.
int fei::Vector_core::gatherFromOverlap(bool accumulate)
  // Presumably an early exit: nothing to gather on a single process or
  // when an FE vector is in use (the branch body is not visible).
  if (fei::numProcs(comm_) == 1 || haveFEVector()) {

#ifndef FEI_SER
  //first create the list of procs we'll be sending to.
  // (skips the local process and processes with no remotely owned entries)
  std::vector<int> sendProcs;
  for(unsigned i=0; i<remotelyOwned_.size(); ++i) {
    if ((int)i == fei::localProc(comm_)) continue;
    if (remotelyOwned_[i]->size() == 0) continue;


  // Derive the set of processes that will be sending to us.
  std::vector<int> recvProcs;
  fei::mirrorProcs(comm_, sendProcs, recvProcs);

  //declare arrays to hold the indices and coefs we'll be receiving.
  std::vector<std::vector<int> > recv_ints(recvProcs.size());
  std::vector<std::vector<double> > recv_doubles(recvProcs.size());
  std::vector<int> recv_sizes(recvProcs.size());

  // Two request/status slots per sending process: one for the indices and
  // one for the coefficients (only the first half is used for the sizes).
  std::vector<MPI_Request> mpiReqs(recvProcs.size()*2);
  std::vector<MPI_Status> mpiStatuses(recvProcs.size()*2);
  int tag1 = 11111;
  int tag2 = 11112;

  //post the recvs for the sizes.
  for(size_t i=0; i<recvProcs.size(); ++i) {
    int proc = recvProcs[i];
    MPI_Irecv(&recv_sizes[i], 1, MPI_INT, proc,
              tag1, comm_, &mpiReqs[i]);

  //send the sizes of data we'll be sending.
  for(unsigned i=0; i<sendProcs.size(); ++i) {
    int proc = sendProcs[i];
    int size = remotelyOwned_[proc]->size();
    MPI_Send(&size, 1, MPI_INT, proc, tag1, comm_);

  // Wait until every size has arrived before posting the data receives.
  if (recvProcs.size() > 0) {
    MPI_Waitall(recvProcs.size(), &mpiReqs[0], &mpiStatuses[0]);

  //now post the recvs for the data.
  // NOTE(review): no resize of recv_ints_i / recv_doubles_i to `size` is
  // visible before their first element is used as a receive buffer.
  unsigned offset = 0;
  for(size_t i=0; i<recvProcs.size(); ++i) {
    int proc = recvProcs[i];
    int size = recv_sizes[i];
    std::vector<int>& recv_ints_i = recv_ints[i];
    std::vector<double>& recv_doubles_i = recv_doubles[i];
    MPI_Irecv(&(recv_ints_i[0]), size, MPI_INT, proc,
              tag1, comm_, &mpiReqs[offset++]);
    MPI_Irecv(&(recv_doubles_i[0]), size, MPI_DOUBLE, proc,
              tag2, comm_, &mpiReqs[offset++]);

  //now send the outgoing data.
  for(size_t i=0; i<sendProcs.size(); ++i) {
    int proc = sendProcs[i];
    int size = remotelyOwned_[proc]->size();
    int* indices = &(remotelyOwned_[proc]->indices())[0];
    MPI_Send(indices, size, MPI_INT, proc, tag1, comm_);
    double* coefs = &(remotelyOwned_[proc]->coefs())[0];
    MPI_Send(coefs, size, MPI_DOUBLE, proc, tag2, comm_);

    // The contribution has been handed off; zero the local copy.
    fei::set_values(*remotelyOwned_[proc], 0.0);

  if (recvProcs.size() > 0) {
    MPI_Waitall(recvProcs.size()*2, &mpiReqs[0], &mpiStatuses[0]);

  //now store the data we've received.
  for(size_t i=0; i<recvProcs.size(); ++i) {
    int num = recv_sizes[i];
    std::vector<int>& recv_ints_i = recv_ints[i];
    std::vector<double>& recv_doubles_i = recv_doubles[i];
    int err = giveToUnderlyingVector(num, &(recv_ints_i[0]),
                                     &(recv_doubles_i[0]), accumulate, 0);
    if (err != 0) {
      FEI_COUT << "fei::Vector_core::gatherFromOverlap ERROR storing recvd data" << FEI_ENDL;

#endif  //#ifndef FEI_SER

Ejemplo n.º 6
  // All-to-all exchange of `n` values of type T per process, implemented by
  // serializing the values (this is the overload selected by mpl::false_).
  //
  // comm       : communicator to exchange over.
  // in_values  : comm.size() * n values; the n values starting at
  //              in_values[dest * n] go to process `dest`.
  // n          : number of values per destination.
  // out_values : comm.size() * n slots; the n values from process `src` are
  //              written starting at out_values[src * n].
  //
  // The values addressed to the calling process are never serialized; they
  // are copied straight from in_values to out_values.
  //
  // NOTE(review): this listing looks truncated -- the return type / template
  // header, brace-only lines, the statement guarded by
  // `if (outgoing.empty())`, the head of the data-transfer call and the tail
  // of the packed_iarchive constructor are not visible.
  all_to_all_impl(const communicator& comm, const T* in_values, int n,
                  T* out_values, mpl::false_)
    int size = comm.size();
    int rank = comm.rank();

    // The amount of data to be sent to each process
    std::vector<int> send_sizes(size);

    // The displacements for each outgoing value.
    std::vector<int> send_disps(size);

    // The buffer that will store all of the outgoing values
    std::vector<char, allocator<char> > outgoing;

    // Pack the buffer with all of the outgoing values.
    for (int dest = 0; dest < size; ++dest) {
      // Keep track of the displacements
      send_disps[dest] = outgoing.size();

      // Our own value will never be transmitted, so don't pack it.
      if (dest != rank) {
        packed_oarchive oa(comm, outgoing);
        for (int i = 0; i < n; ++i)
          oa << in_values[dest * n + i];

      // Keep track of the sizes
      // (bytes appended to `outgoing` for this destination; 0 for ourselves)
      send_sizes[dest] = outgoing.size() - send_disps[dest];

    // Determine how much data each process will receive.
    std::vector<int> recv_sizes(size);
    all_to_all(comm, send_sizes, recv_sizes);

    // Prepare a buffer to receive the incoming data.
    // recv_disps is the exclusive prefix sum of recv_sizes; the buffer is
    // given at least one byte so that &incoming[0] is valid.
    std::vector<int> recv_disps(size);
    int sum = 0;
    for (int src = 0; src < size; ++src) {
      recv_disps[src] = sum;
      sum += recv_sizes[src];
    std::vector<char, allocator<char> > incoming(sum > 0? sum : 1);

    // Make sure we don't try to reference an empty vector
    if (outgoing.empty())

    // Transmit the actual data
                           (&outgoing[0], &send_sizes[0],
                            &send_disps[0], MPI_PACKED,
                            &incoming[0], &recv_sizes[0],
                            &recv_disps[0], MPI_PACKED,

    // Deserialize data from the iarchive
    for (int src = 0; src < size; ++src) {
      if (src == rank) 
        // Our own values were never packed: copy them directly.
        std::copy(in_values + src * n, in_values + (src + 1) * n, 
                  out_values + src * n);
      else {
        packed_iarchive ia(comm, incoming, boost::archive::no_header,
        for (int i = 0; i < n; ++i)
          ia >> out_values[src * n + i];
// Transfers field data for the entities in the mesh's communication list
// between processes, packing bytes into per-process send buffers and
// unpacking received bytes into field storage on non-owning processes.
//
// mesh   : supplies the parallel size/rank, the communication list and the
//          communicator handed to the exchange routine.
// fields : fields to transfer; the function returns immediately when the
//          list is empty.
//
// Three passes over (field, comm-list entry) are visible: one that sizes
// the buffers, one that packs the send buffers, and -- after
// parallel_data_exchange_nonsym_known_sizes_t -- one that unpacks.
// send_sizes / recv_sizes are first used as byte totals and, after being
// reset to 0, as running offsets into the corresponding buffers.
//
// NOTE(review): this listing looks truncated -- brace-only lines, the
// owned / not-owned branching, the `continue` statements, the resizing of
// send_data / recv_data and the initialiser of field_data_ptr are not
// visible.  Restore them from the upstream source before building.
inline void copy_from_owned(
  const BulkData                        & mesh ,
  const std::vector< const FieldBase *> & fields )
  if ( fields.empty() ) { return; }

  const int parallel_size = mesh.parallel_size();
  const int parallel_rank = mesh.parallel_rank();

  const std::vector<const FieldBase *>::const_iterator fe = fields.end();
  const std::vector<const FieldBase *>::const_iterator fb = fields.begin();
        std::vector<const FieldBase *>::const_iterator fi ;

  // One byte buffer per process, for outgoing and incoming data.
  std::vector<std::vector<unsigned char> > send_data(parallel_size);
  std::vector<std::vector<unsigned char> > recv_data(parallel_size);

  const EntityCommListInfoVector &comm_info_vec = mesh.internal_comm_list();
  size_t comm_info_vec_size = comm_info_vec.size();

  std::vector<unsigned> send_sizes(parallel_size, 0);
  std::vector<unsigned> recv_sizes(parallel_size, 0);

  //this first loop calculates send_sizes and recv_sizes.
  for(fi = fb; fi != fe; ++fi)
      const FieldBase & f = **fi;
      for(size_t i = 0; i<comm_info_vec_size; ++i)
          const Bucket* bucket = comm_info_vec[i].bucket;

          int owner = comm_info_vec[i].owner;
          const bool owned = (owner == parallel_rank);

          // Bytes this field contributes for this entity (0 if the field
          // does not match the bucket's rank).
          unsigned e_size = 0;

          if(is_matching_rank(f, *bucket))
              const unsigned bucketId = bucket->bucket_id();
              unsigned size = field_bytes_per_entity(f, bucketId);
              e_size += size;

          // NOTE(review): the statement controlled by this test and the
          // branch on `owned` that presumably separates the send and
          // receive accounting below are not visible.
          if(e_size == 0)

              const EntityCommInfoVector& infovec = comm_info_vec[i].entity_comm->comm_map;
              size_t infovec_size = infovec.size();
              for(size_t j=0; j<infovec_size; ++j)
                  int proc = infovec[j].proc;

                  send_sizes[proc] += e_size;
              recv_sizes[owner] += e_size;

  //now size the send_data buffers
  // max_len tracks the largest per-process send size; send_sizes is reset
  // to 0 so it can serve as a write offset while packing.
  size_t max_len = 0;
  for(int p=0; p<parallel_size; ++p)
      if (send_sizes[p] > 0)
          if (send_sizes[p] > max_len)
              max_len = send_sizes[p];
          send_sizes[p] = 0;

  //now pack the send buffers
  // field_data is a scratch buffer of max_len bytes.
  // NOTE(review): the initialiser of field_data_ptr is missing here.
  std::vector<unsigned char> field_data(max_len);
  unsigned char* field_data_ptr =;

  for(fi = fb; fi != fe; ++fi)
      const FieldBase & f = **fi;
      for(size_t i = 0; i<comm_info_vec_size; ++i)
          const Bucket* bucket = comm_info_vec[i].bucket;

          int owner = comm_info_vec[i].owner;
          const bool owned = (owner == parallel_rank);

          unsigned e_size = 0;

          if(is_matching_rank(f, *bucket))
              const unsigned bucketId = bucket->bucket_id();
              unsigned size = field_bytes_per_entity(f, bucketId);
              // Only the owner stages the entity's field bytes in the
              // scratch buffer.
              if (owned && size > 0)
                  unsigned char * ptr = reinterpret_cast<unsigned char*>(stk::mesh::field_data(f, bucketId, comm_info_vec[i].bucket_ordinal, size));
                  std::memcpy(field_data_ptr+e_size, ptr, size);
 //                 field_data.insert(field_data.end(), ptr, ptr+size);
              e_size += size;

          if(e_size == 0)

              const EntityCommInfoVector& infovec = comm_info_vec[i].entity_comm->comm_map;
              size_t infovec_size = infovec.size();
              for(size_t j=0; j<infovec_size; ++j)
                  int proc = infovec[j].proc;

                  // Append the staged bytes to the buffer for `proc` at its
                  // current write offset, then advance the offset.
                  unsigned char* dest_ptr = send_data[proc].data()+send_sizes[proc];
                  unsigned char* src_ptr = field_data_ptr;
                  std::memcpy(dest_ptr, src_ptr, e_size);
                  send_sizes[proc] += e_size;
     //             send_data[proc].insert(send_data[proc].end(), field_data.begin(), field_data.end());
              recv_sizes[owner] += e_size;

  // recv_sizes is reset to 0 so it can serve as a read offset while
  // unpacking.
  for(int p=0; p<parallel_size; ++p)
      if (recv_sizes[p] > 0)
          recv_sizes[p] = 0;

  parallel_data_exchange_nonsym_known_sizes_t(send_data, recv_data, mesh.parallel());

  //now unpack and store the recvd data
  for(fi = fb; fi != fe; ++fi)
      const FieldBase & f = **fi;

      for(size_t i=0; i<comm_info_vec_size; ++i)
          int owner = comm_info_vec[i].owner;
          const bool owned = (owner == parallel_rank);

          // NOTE(review): presumably skips entities that are owned locally
          // or whose owner sent nothing; the controlled statement is not
          // visible.
          if(owned || recv_data[owner].size() == 0)

          const Bucket* bucket = comm_info_vec[i].bucket;

          if(is_matching_rank(f, *bucket))
              const unsigned bucketId = bucket->bucket_id();
              unsigned size = field_bytes_per_entity(f, bucketId);
              if (size > 0)
                  unsigned char * ptr = reinterpret_cast<unsigned char*>(stk::mesh::field_data(f, bucketId, comm_info_vec[i].bucket_ordinal, size));

                  // Copy the next `size` bytes received from the owner into
                  // the entity's field storage and advance the read offset.
                  std::memcpy(ptr, &(recv_data[owner][recv_sizes[owner]]), size);
//                for(unsigned j = 0; j < size; ++j)
//                {
//                    ptr[j] = recv_data[owner][recv_sizes[owner]+j];
//                }
                  recv_sizes[owner] += size;