/// Exchange entries of @p v with other processes over MPI.
///
/// Steps, in order:
///   1. Post one non-blocking receive per row of recv_msg, targeting either
///      the subview of @p v covering rows
///      [count_owned, count_owned + count_receive) (when ReceiveInPlace is
///      true) or host_recv_buffer.
///   2. Barrier on the communicator.
///   3. Run Pack on (send_nodeid, v, send_buffer), copy send_buffer into
///      host_send_buffer and issue one synchronous send per row of send_msg.
///   4. Wait for every receive and check its source rank and byte count
///      against recv_msg.
///   5. If not receiving in place, copy host_recv_buffer into the receive
///      subview.
///
/// Column 0 of recv_msg / send_msg is used as the peer rank, column 1 as a
/// count that is multiplied by v.dimension_1() to get the number of scalars.
///
/// @param v  Vector whose entries are sent and whose receive range is filled.
/// @throws std::runtime_error  if a completed receive does not match the
///         expected source rank or byte count.
/// @throws std::bad_cast       if `comm` is not a Teuchos::MpiComm<int>
///         (reference dynamic_cast).
inline void operator()( const VectorType & v ) const
{
  typedef typename VectorType::value_type value_t ;

  // Unwrap the raw MPI communicator from the Teuchos communicator.
  const Teuchos::MpiComm<int> & wrapped_comm =
    dynamic_cast< const Teuchos::MpiComm<int> & >( *comm );

  MPI_Comm raw_comm = * wrapped_comm.getRawMpiComm();

  const int      message_tag = 42 ;
  const unsigned chunk       = v.dimension_1();
  const size_t   num_recv    = recv_msg.dimension_0();
  const size_t   num_send    = send_msg.dimension_0();

  // View of the rows of 'v' that are filled by incoming messages.
  const std::pair<unsigned,unsigned>
    incoming_rows( count_owned , count_owned + count_receive );

  const VectorType incoming =
    Kokkos::subview< VectorType >( v , incoming_rows );

  std::vector< MPI_Request > pending( num_recv , MPI_REQUEST_NULL );

  //--------------------------------------------------------------------
  // Post all receives; messages are laid out back-to-back in the
  // destination buffer in recv_msg row order.
  {
    value_t * dst = ReceiveInPlace ? incoming.ptr_on_device()
                                   : host_recv_buffer.ptr_on_device();

    for ( size_t k = 0 ; k < num_recv ; ++k ) {
      const int source  = recv_msg(k,0);
      const int nscalar = recv_msg(k,1) * chunk ;
      const int nbytes  = nscalar * sizeof(value_t) ;

      MPI_Irecv( dst , nbytes , MPI_BYTE ,
                 source , message_tag , raw_comm , & pending[k] );

      dst += nscalar ;
    }
  }

  // NOTE(review): presumably this guarantees every rank has posted its
  // receives before any synchronous send below starts -- confirm.
  MPI_Barrier( raw_comm );

  //--------------------------------------------------------------------
  // Pack, stage into the host send buffer, and send.
  {
    // Constructed only for its side effect on 'send_buffer'
    // (Pack is defined elsewhere).
    const Pack packer( send_nodeid , v , send_buffer );

    Kokkos::deep_copy( host_send_buffer , send_buffer );

    value_t * src = host_send_buffer.ptr_on_device();

    for ( size_t k = 0 ; k < num_send ; ++k ) {
      const int target  = send_msg(k,0);
      const int nscalar = send_msg(k,1) * chunk ;
      const int nbytes  = nscalar * sizeof(value_t) ;

      // MPI_Ssend blocks until
      // (1) a receive is matched for the message and
      // (2) the send buffer can be re-used.
      //
      // It is suggested that MPI_Ssend will have the best performance:
      // http://www.mcs.anl.gov/research/projects/mpi/sendmode.html .
      MPI_Ssend( src , nbytes , MPI_BYTE ,
                 target , message_tag , raw_comm );

      src += nscalar ;
    }
  }

  //--------------------------------------------------------------------
  // Complete the receives in whatever order they finish, verifying each.
  for ( size_t remaining = num_recv ; remaining != 0 ; --remaining ) {

    MPI_Status status ;
    int which_done  = 0 ;
    int actual_size = 0 ;

    MPI_Waitany( static_cast<int>( num_recv ) , & pending[0] ,
                 & which_done , & status );

    const int actual_proc = status.MPI_SOURCE ;

    MPI_Get_count( & status , MPI_BYTE , & actual_size );

    // What recv_msg says this request should have delivered.
    const int expected_proc = recv_msg(which_done,0);
    const int expected_size =
      recv_msg(which_done,1) * chunk * sizeof(value_t);

    const bool matches = ( expected_proc == actual_proc ) &&
                         ( expected_size == actual_size );

    if ( ! matches ) {
      int my_rank = 0 ;
      MPI_Comm_rank( raw_comm , & my_rank );

      std::ostringstream report ;
      report << "VectorImport error:"
             << " P" << my_rank
             << " received from P" << actual_proc
             << " size "     << actual_size
             << " expected " << expected_size
             << " from P"    << expected_proc ;

      throw std::runtime_error( report.str() );
    }
  }

  //--------------------------------------------------------------------
  // Data was staged in the host receive buffer: copy it into the
  // receive subview of 'v'.
  if ( ! ReceiveInPlace ) {
    Kokkos::deep_copy( incoming , host_recv_buffer );
  }
}