/// \brief Return an optimized reordering of the given column Map.
    ///   Optionally, recompute an Import from the input domain Map to
    ///   the new column Map.
    /// \tparam MapType A specialization of Map.
    ///
    /// See the documentation of the free function
    /// makeOptimizedColMapAndImport().
    ///
    /// \param errStream [out] Output stream for human-readable error
    ///   reporting.  This is local to the calling process and may
    ///   differ on different processes.
    /// \param lclErr [out] On output: true if anything went wrong on
    ///   the calling process.  This value is local to the calling
    ///   process and may differ on different processes.
    /// \param domMap [in] Domain Map of a CrsGraph or CrsMatrix.
    /// \param colMap [in] <i>Original</i> column Map of the same
    ///   CrsGraph or CrsMatrix as \c domMap.
    /// \param oldImport [in] Optional pointer to the "original"
    ///   Import: an Import from \c domMap to \c colMap.  This is not
    ///   required, but if you supply this, this function may use it
    ///   to avoid some communication and/or work when setting up the
    ///   new Import object.  This function will <i>only</i> look at
    ///   this pointer if \c makeImport is true.
    /// \param makeImport [in] Whether to make and return an Import from
    ///   the input domain Map to the new column Map.
    ///
    /// \return The possibly reordered column Map \c newColMap, and the
    ///   corresponding Import from \c domMap to \c newColMap.  The
    ///   latter is nonnull if and only if \c makeImport is true.
    ///
    /// \pre \c domMap and \c colMap must have the same or congruent
    ///   communicators.
    /// \pre On all calling processes, the indices in \c colMap must be
    ///   a subset of the indices in \c domMap.
    // Build a reordered copy of colMap in which the GIDs that domMap
    // owns on the calling process come first, followed by the remote
    // GIDs grouped by owning process rank.  Optionally also build an
    // Import from domMap to the new column Map.
    //
    // Errors detected on the calling process are reported by setting
    // lclErr to true and writing a message to errStream; they do not
    // stop execution here (see the comments below for why).
    static std::pair<map_type, Teuchos::RCP<import_type> >
    make (std::ostream& errStream,
          bool& lclErr,
          const map_type& domMap,
          const map_type& colMap,
          const import_type* oldImport,
          const bool makeImport)
    {
      using Teuchos::Array;
      using Teuchos::ArrayView;
      using Teuchos::RCP;
      using Teuchos::rcp;
      using std::endl;
      typedef local_ordinal_type LO;
      typedef global_ordinal_type GO;
      const char prefix[] = "Tpetra::makeOptimizedColMapAndImport: ";
      std::ostream& err = errStream;

      (void) oldImport; // We don't currently use this argument.

      RCP<const Teuchos::Comm<int> > comm = colMap.getComm ();
      const LO colMapMinLid = colMap.getMinLocalIndex ();
      const LO colMapMaxLid = colMap.getMaxLocalIndex ();

      // Count the numbers of GIDs in colMap that are in and not in
      // domMap on the calling process.  Check for zero indices on the
      // calling process first, because if it's true, then we shouldn't
      // trust [getMinLocalIndex(), getMaxLocalIndex()] to return a
      // correct range.
      LO numOwnedGids = 0;
      LO numRemoteGids = 0;
      if (colMap.getNodeNumElements () != 0) {
        for (LO colMapLid = colMapMinLid; colMapLid <= colMapMaxLid; ++colMapLid) {
          const GO colMapGid = colMap.getGlobalElement (colMapLid);
          if (domMap.isNodeLocalElement (colMapGid)) {
            ++numOwnedGids;
          } else {
            ++numRemoteGids;
          }
        }
      }

      // Put all colMap GIDs on the calling process in a single array.
      // Owned GIDs go in front, and remote GIDs at the end.  The two
      // views alias disjoint pieces of allGids, so filling (and later
      // sorting) the views updates allGids in place.
      Array<GO> allGids (numOwnedGids + numRemoteGids);
      ArrayView<GO> ownedGids = allGids.view (0, numOwnedGids);
      ArrayView<GO> remoteGids = allGids.view (numOwnedGids, numRemoteGids);

      // Fill ownedGids and remoteGids (and therefore allGids).  We use
      // two loops, one to count (above) and one to fill (here), in
      // order to avoid dynamic memory allocation during the loop (in
      // this case, lots of calls to push_back()).  That will simplify
      // use of Kokkos to parallelize these loops later.
      LO ownedPos = 0;
      LO remotePos = 0;
      if (colMap.getNodeNumElements () != 0) {
        for (LO colMapLid = colMapMinLid; colMapLid <= colMapMaxLid; ++colMapLid) {
          const GO colMapGid = colMap.getGlobalElement (colMapLid);
          if (domMap.isNodeLocalElement (colMapGid)) {
            ownedGids[ownedPos++] = colMapGid;
          } else {
            remoteGids[remotePos++] = colMapGid;
          }
        }
      }

      // If, for some reason, the running count doesn't match the
      // original count, fill in any remaining GID spots with an
      // obviously invalid value.  We don't want to stop yet, because
      // other processes might not have noticed this error; Map
      // construction is a collective, so we can't stop now.
      if (ownedPos != numOwnedGids) {
        lclErr = true;
        err << prefix << "On Process " << comm->getRank () << ", ownedPos = "
            << ownedPos << " != numOwnedGids = " << numOwnedGids << endl;
        for (LO colMapLid = ownedPos; colMapLid < numOwnedGids; ++colMapLid) {
          ownedGids[colMapLid] = Teuchos::OrdinalTraits<GO>::invalid ();
        }
      }
      if (remotePos != numRemoteGids) {
        lclErr = true;
        err << prefix << "On Process " << comm->getRank () << ", remotePos = "
            << remotePos << " != numRemoteGids = " << numRemoteGids << endl;
        for (LO colMapLid = remotePos; colMapLid < numRemoteGids; ++colMapLid) {
          remoteGids[colMapLid] = Teuchos::OrdinalTraits<GO>::invalid ();
        }
      }

      // Figure out what processes own what GIDs in the domain Map.
      // Initialize the output array of remote PIDs with the "invalid
      // process rank" -1, to help us test whether getRemoteIndexList
      // did its job.  remoteLids is only sized (and only requested
      // from getRemoteIndexList) when an Import was asked for; it
      // stays empty otherwise.
      Array<int> remotePids (numRemoteGids, -1);
      Array<LO> remoteLids;
      if (makeImport) {
        remoteLids.resize (numRemoteGids);
        std::fill (remoteLids.begin (), remoteLids.end (),
                   Teuchos::OrdinalTraits<LO>::invalid ());
      }
      LookupStatus lookupStatus;
      if (makeImport) {
        lookupStatus = domMap.getRemoteIndexList (remoteGids, remotePids (),
                                                  remoteLids ());
      } else {
        lookupStatus = domMap.getRemoteIndexList (remoteGids, remotePids ());
      }

      // If any process returns IDNotPresent, then at least one of the
      // remote indices was not present in the domain Map.  This means
      // that the Import object cannot be constructed, because of
      // incongruity between the column Map and domain Map.  This means
      // that either the column Map or domain Map, or both, is
      // incorrect.
      const bool getRemoteIndexListFailed = (lookupStatus == IDNotPresent);
      if (getRemoteIndexListFailed) {
        lclErr = true;
        err << prefix << "On Process " << comm->getRank () << ", some indices "
          "in the input colMap (the original column Map) are not in domMap (the "
          "domain Map).  Either these indices or the domain Map is invalid.  "
          "Likely cause: For a nonsquare matrix, you must give the domain and "
          "range Maps as input to fillComplete." << endl;
      }

      // Check that getRemoteIndexList actually worked, by making sure
      // that none of the remote PIDs are -1.  The flag must live
      // outside the loop and be tested after it; otherwise the break
      // skips the error report and the problem goes unnoticed.
      bool foundInvalidPid = false;
      for (LO k = 0; k < numRemoteGids; ++k) {
        if (remotePids[k] == -1) {
          foundInvalidPid = true;
          break;
        }
      }
      if (foundInvalidPid) {
        lclErr = true;
        err << prefix << "On Process " << comm->getRank () << ", "
          "getRemoteIndexList returned -1 for the process ranks of "
          "one or more GIDs on this process." << endl;
      }

      // Sort incoming remote column Map indices so that all columns
      // coming from a given remote process are contiguous.  This means
      // the Import's Distributor doesn't need to reorder data.
      //
      // remoteLids only has numRemoteGids entries when makeImport is
      // true, so that is the only case in which it may (and should) be
      // permuted along with the PIDs and GIDs.  When makeImport is
      // false, remoteLids is empty and must not be touched.
      if (makeImport) {
        sort3 (remotePids.begin (), remotePids.end (),
               remoteGids.begin (),
               remoteLids.begin ());
      }
      else {
        sort2 (remotePids.begin (), remotePids.end (), remoteGids.begin ());
      }
      // Make the new column Map.
      MapType newColMap (colMap.getGlobalNumElements (), allGids (),
                         colMap.getIndexBase (), comm, colMap.getNode ());
      // Optionally, make the new Import object.
      RCP<import_type> imp;
      if (makeImport) {
        imp = rcp (new import_type (rcp (new map_type (domMap)),
                                    rcp (new map_type (newColMap))));
        // FIXME (mfh 06 Jul 2014) This constructor throws a runtime
        // error, so I'm not using it for now.
        //
        // imp = rcp (new import_type (domMap, newColMap, remoteGids,
        //                             remotePids (), remoteLids (),
        //                             Teuchos::null, Teuchos::null));
      }
      return std::make_pair (newColMap, imp);
    }