  dot (const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node,true>& a) const
    using Teuchos::outArg;
    using Teuchos::REDUCE_SUM;
    using Teuchos::reduceAll;

      this->getGlobalLength () != a.getGlobalLength (), std::runtime_error,
      "Tpetra::Vector::dots: Vectors do not have the same global length.  "
      "this->getGlobalLength() = " << this->getGlobalLength () << " != "
      "a.getGlobalLength() = " << a.getGlobalLength () << ".");

      ! this->getMap ()->isCompatible (*a.getMap ()), std::runtime_error,
      "Tpetra::Vector::dots: Vectors do not have compatible Maps:" << std::endl
      << "this->getMap(): " << std::endl << * (this->getMap ())
      << "a.getMap(): " << std::endl << * (a.getMap ()) << std::endl);
      this->getLocalLength () != a.getLocalLength (), std::runtime_error,
      "Tpetra::Vector::dots: Vectors do not have the same local length.");
    Scalar gbldot;
    gbldot = MVT::Dot (this->lclMV_, a.lclMV_);
    if (this->isDistributed ()) {
      Scalar lcldot = gbldot;
      reduceAll (*this->getMap ()->getComm (), REDUCE_SUM,
                 lcldot, outArg (gbldot));
    return gbldot;
// Test for Tpetra::CrsMatrix::sumIntoGlobalValues(), with nonowned
// rows.  The test creates the CrsMatrix with a static graph, so that
// globalAssemble() uses sumIntoGlobalValues() instead of
// insertGlobalValues() to merge in the incoming matrix entries.  All
// calls to sumIntoGlobalValues() in this test are for nonowned rows,
// and all the calls are correct (that is, the processes that own
// those rows have entries in the corresponding columns, so that
// nonowned fill does not require creating new entries).
//
// Outline of the test:
//   1. Build a contiguous row Map with a fixed number of rows per process.
//   2. Build a graph holding the diagonal plus up to two off-diagonal
//      entries per process, and verify its structure collectively.
//   3. Build a matrix on that graph, set every owned entry to one, and
//      sum this process' rank into entries owned by neighboring processes.
//   4. Call fillComplete() and verify structure and values collectively.
//
// mfh 16 Dec 2012: The one-template-argument version breaks explicit
// instantiation.  Ah well.
//TEUCHOS_UNIT_TEST_TEMPLATE_1_DECL( CrsMatrix, NonlocalSumInto, CrsMatrixType )
TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( CrsMatrix, NonlocalSumInto, LocalOrdinalType, GlobalOrdinalType, ScalarType, NodeType )
{
  using Tpetra::createContigMapWithNode;
  using Tpetra::createNonContigMapWithNode;
  using Tpetra::global_size_t;
  using Tpetra::Map;
  using Teuchos::Array;
  using Teuchos::ArrayView;
  using Teuchos::as;
  using Teuchos::av_const_cast;
  using Teuchos::Comm;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_const_cast;
  using Teuchos::OrdinalTraits;
  using Teuchos::outArg;
  using Teuchos::ParameterList;
  using Teuchos::parameterList;
  using Teuchos::reduceAll;
  using Teuchos::ScalarTraits;
  using Teuchos::tuple;
  using Teuchos::TypeNameTraits;
  using std::endl;

#if 0
  // Extract typedefs from the CrsMatrix specialization.  (Only
  // applicable to the disabled one-template-argument version above.)
  typedef typename CrsMatrixType::scalar_type scalar_type;
  typedef typename CrsMatrixType::local_ordinal_type local_ordinal_type;
  typedef typename CrsMatrixType::global_ordinal_type global_ordinal_type;
  typedef typename CrsMatrixType::node_type node_type;
#endif // 0

  // Canonical typedefs, taken from the test's template parameters.
  typedef ScalarType scalar_type;
  typedef LocalOrdinalType local_ordinal_type;
  typedef GlobalOrdinalType global_ordinal_type;
  typedef NodeType node_type;

  // Typedefs derived from the above canonical typedefs.
  typedef ScalarTraits<scalar_type> STS;
  typedef Map<local_ordinal_type, global_ordinal_type, node_type> map_type;

  // Abbreviation typedefs.
  typedef scalar_type ST;
  typedef local_ordinal_type LO;
  typedef global_ordinal_type GO;
  typedef node_type NT;

  typedef Tpetra::CrsMatrix<ST, LO, GO, NT> CrsMatrixType;

  // CrsGraph specialization corresponding to CrsMatrixType (the
  // CrsMatrix specialization).
  typedef Tpetra::CrsGraph<LO, GO, NT, typename CrsMatrixType::mat_solve_type> crs_graph_type;

  // Passed as the global element count when creating the row Map below.
  const global_size_t INVALID = OrdinalTraits<global_size_t>::invalid();

  // Get the default communicator.
  RCP<const Comm<int> > comm = Tpetra::DefaultPlatform::getDefaultPlatform ().getComm ();
  const int numProcs = comm->getSize ();
  const int myRank = comm->getRank ();

  if (myRank == 0) {
    out << "Test with " << numProcs << " process" << (numProcs != 1 ? "es" : "") << endl;
  }

  // This test doesn't make much sense if there is only one MPI
  // process.  We let it pass trivially in that case.
  if (numProcs == 1) {
    out << "Number of processes in world is one; test passes trivially." << endl;
    return;
  }

  // Get a Kokkos Node instance.  It would be nice if we could pass in
  // parameters here, but threads don't matter for this test; it's a
  // test for distributed-memory capabilities.

  if (myRank == 0) {
    out << "Creating Kokkos Node of type " << TypeNameTraits<node_type>::name () << endl;
  }
  RCP<node_type> node;
  {
    ParameterList pl; // Kokkos Node types require a PL inout.
    node = rcp (new node_type (pl));
  }

  // Number of rows in the matrix owned by each process.
  const LO numLocalRows = 10;

  // Number of (global) rows and columns in the matrix.
  const GO numGlobalRows = numLocalRows * numProcs;
  const GO numGlobalCols = numGlobalRows;
  // Prevent compile warning for unused variable.
  // (It's not really "variable" if it's const, but oh well.)
  (void) numGlobalCols;

  if (myRank == 0) {
    out << "Creating contiguous row Map" << endl;
  }

  // Create a contiguous row Map, with numLocalRows rows per process.
  RCP<const map_type> rowMap = createContigMapWithNode<LO, GO, NT> (INVALID, numLocalRows, comm, node);

  // For now, reuse the row Map for the domain and range Maps.  Later,
  // we might want to test using different domain or range Maps.
  RCP<const map_type> domainMap = rowMap;
  RCP<const map_type> rangeMap = rowMap;

  // Min and max row index owned by this process, and min and max
  // column index over all processes (taken from the domain Map).
  const GO globalMinRow = rowMap->getMinGlobalIndex ();
  const GO globalMaxRow = rowMap->getMaxGlobalIndex ();
  const GO globalMinCol = domainMap->getMinAllGlobalIndex ();
  const GO globalMaxCol = domainMap->getMaxAllGlobalIndex ();

  if (myRank == 0) {
    out << "Creating graph" << endl;
  }

  // Create a numGlobalRows by numGlobalCols graph and set its
  // structure.  Every process sets its diagonal entries (which it
  // owns).  In addition, every process but the first inserts
  // (globalMinRow, globalMinCol), and every process but the last
  // inserts (globalMaxRow, globalMaxCol).  We will use these
  // off-diagonal entries to test modification of nonlocal entries.
  RCP<const crs_graph_type> graph;
  {
    // We have a good upper bound for the number of entries per row, so use static profile.
    RCP<crs_graph_type> nonconstGraph (new crs_graph_type (rowMap, 2, Tpetra::StaticProfile));

    TEUCHOS_TEST_FOR_EXCEPTION(globalMinRow >= globalMaxRow, std::logic_error,
      "This test only works if globalMinRow < globalMaxRow.");

    // Insert all the diagonal entries.
    for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) {
      nonconstGraph->insertGlobalIndices (globalRow, tuple (globalRow));
    }

    // Insert the (first owned row, first global column) entry, if not on the diagonal.
    if (globalMinRow > rowMap->getMinAllGlobalIndex ()) {
      nonconstGraph->insertGlobalIndices (globalMinRow, tuple (globalMinCol));
    }

    // Insert the (last owned row, last global column) entry, if not on the diagonal.
    if (globalMaxRow < rowMap->getMaxAllGlobalIndex ()) {
      nonconstGraph->insertGlobalIndices (globalMaxRow, tuple (globalMaxCol));
    }

    nonconstGraph->fillComplete (domainMap, rangeMap);
    graph = rcp_const_cast<const crs_graph_type> (nonconstGraph);
  }

  // Test whether the graph has the correct structure.  Failures are
  // recorded locally and reduced over all processes afterwards.
  bool localGraphSuccess = true;
  std::ostringstream graphFailMsg;
  {
    Array<GO> ind (2); // upper bound

    for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) {
      size_t numEntries = 0; // output argument of below line.
      graph->getGlobalRowCopy (globalRow, ind (), numEntries);

      // Revise view based on numEntries.
      ArrayView<GO> indView = ind.view (0, numEntries);

      // Sort the view.
      std::sort (indView.begin (), indView.end ());

      if (globalRow == globalMinRow && globalRow > rowMap->getMinAllGlobalIndex ()) {
        // First owned row (not on the first process): expect
        // columns {globalMinCol, globalRow}.
        if (numEntries != as<size_t> (2)) {
          localGraphSuccess = false;
          graphFailMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": numEntries = " << numEntries << " != 2" << endl;
        }
        if (numEntries > 0 && indView[0] != globalMinCol) {
          localGraphSuccess = false;
          graphFailMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": indView[0] = " << indView[0] << " != globalMinCol = " << globalMinCol << endl;
        }
        if (numEntries > 1 && indView[1] != globalRow) {
          localGraphSuccess = false;
          graphFailMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": indView[1] = " << indView[1] << " != globalRow = " << globalRow << endl;
        }
      }
      else if (globalRow == globalMaxRow && globalRow < rowMap->getMaxAllGlobalIndex ()) {
        // Last owned row (not on the last process): expect
        // columns {globalRow, globalMaxCol}.
        if (numEntries != as<size_t> (2)) {
          localGraphSuccess = false;
          graphFailMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": numEntries = " << numEntries << " != 2" << endl;
        }
        if (numEntries > 0 && indView[0] != globalRow) {
          localGraphSuccess = false;
          graphFailMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": indView[0] = " << indView[0] << " != globalRow = " << globalRow << endl;
        }
        if (numEntries > 1 && indView[1] != globalMaxCol) {
          localGraphSuccess = false;
          graphFailMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": indView[1] = " << indView[1] << " != globalMaxCol = " << globalMaxCol << endl;
        }
      }
      else {
        // Every other row: expect only the diagonal entry.
        if (numEntries != as<size_t> (1)) {
          localGraphSuccess = false;
          graphFailMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": numEntries = " << numEntries << " != 1" << endl;
        }
        if (numEntries > 0 && indView[0] != globalRow) {
          localGraphSuccess = false;
          graphFailMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": indView[0] = " << indView[0] << " != globalRow = " << globalRow << endl;
        }
      }
    }
  }

  // Make sure that all processes successfully created the graph.
  bool globalGraphSuccess = true;
  {
    int globalGraphSuccess_int = 1;
    reduceAll (*comm, Teuchos::REDUCE_MIN, localGraphSuccess ? 1 : 0, outArg (globalGraphSuccess_int));
    globalGraphSuccess = (globalGraphSuccess_int != 0);
  }
  if (! globalGraphSuccess) {
    if (myRank == 0) {
      out << "Graph structure not all correct:" << endl << endl;
    }
    // Print out the failure messages on all processes.
    for (int p = 0; p < numProcs; ++p) {
      if (p == myRank) {
        out << graphFailMsg.str () << endl;
        std::flush (out);
      }
      // Do some barriers to allow output to finish.
      comm->barrier ();
      comm->barrier ();
      comm->barrier ();
    }
  }
  TEUCHOS_TEST_FOR_EXCEPTION(! globalGraphSuccess, std::logic_error, "Graph structure test failed.");

  if (myRank == 0) {
    out << "Creating matrix" << endl;
  }

  // Create the matrix, using the above graph.
  RCP<CrsMatrixType> matrix (new CrsMatrixType (graph));

  if (myRank == 0) {
    out << "Setting all matrix entries to 1" << endl;
  }

  // Set all the owned entries to one.  Below we sum into nonlocal
  // entries.
  matrix->setAllToScalar (STS::one ());

  // Sum into nonowned entries (which nevertheless exist in the
  // matrix, just not on this process) using this process' rank.
  // After global assembly, this should result in those entries having
  // value equal to one plus the rank of the process that wrote to
  // them.  That value happens to be myRank for the
  // (globalMinRow, globalMinCol) entry (which exists on every process
  // but the first), and myRank+2 for the (globalMaxRow, globalMaxCol)
  // entry (which exists on every process but the last).
  if (globalMinRow > rowMap->getMinAllGlobalIndex ()) {
    // Write to the (last row, globalMaxCol) entry of the previous process.
    matrix->sumIntoGlobalValues (globalMinRow-1, tuple (globalMaxCol), tuple (as<ST> (myRank)));
  }
  if (globalMaxRow < rowMap->getMaxAllGlobalIndex ()) {
    // Write to the (first row, globalMinCol) entry of the next process.
    matrix->sumIntoGlobalValues (globalMaxRow+1, tuple (globalMinCol), tuple (as<ST> (myRank)));
  }

  if (myRank == 0) {
    out << "Calling fillComplete on the matrix" << endl;
  }
  matrix->fillComplete (domainMap, rangeMap);

  if (myRank == 0) {
    out << "Testing the matrix values" << endl;
  }

  // Test whether the entries have their correct values.
  bool localSuccess = true;
  std::ostringstream failMsg;
  {
    Array<GO> ind (2); // upper bound
    Array<ST> val (2); // upper bound

    for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) {
      size_t numEntries = 0; // output argument of below line.
      matrix->getGlobalRowCopy (globalRow, ind (), val (), numEntries);

      // Revise views based on numEntries.
      ArrayView<GO> indView = ind.view (0, numEntries);
      ArrayView<ST> valView = val.view (0, numEntries);

      // Sort the views jointly by column index.
      Tpetra::sort2 (indView.begin (), indView.end (), valView.begin ());

      if (globalRow == globalMinRow && globalRow > rowMap->getMinAllGlobalIndex ()) {
        // First owned row (not on the first process): columns
        // {globalMinCol, globalRow} with values {myRank, 1}.
        if (numEntries != as<size_t> (2)) {
          localSuccess = false;
          failMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": numEntries = " << numEntries << " != 2" << endl;
        }
        if (numEntries > 0 && indView[0] != globalMinCol) {
          localSuccess = false;
          failMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": indView[0] = " << indView[0] << " != globalMinCol = " << globalMinCol << endl;
        }
        if (numEntries > 1 && indView[1] != globalRow) {
          localSuccess = false;
          failMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": indView[1] = " << indView[1] << " != globalRow = " << globalRow << endl;
        }
        if (numEntries > 0 && valView[0] != as<ST> (myRank)) {
          localSuccess = false;
          failMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": valView[0] = " << valView[0] << " != myRank = " << myRank << endl;
        }
        if (numEntries > 1 && valView[1] != STS::one ()) {
          localSuccess = false;
          // Report the calling process' actual rank (was hard-coded to 1).
          failMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": valView[1] = " << valView[1] << " != 1" << endl;
        }
      }
      else if (globalRow == globalMaxRow && globalRow < rowMap->getMaxAllGlobalIndex ()) {
        // Last owned row (not on the last process): columns
        // {globalRow, globalMaxCol} with values {1, myRank+2}.
        if (numEntries != as<size_t> (2)) {
          localSuccess = false;
          failMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": numEntries = " << numEntries << " != 2" << endl;
        }
        if (numEntries > 0 && indView[0] != globalRow) {
          localSuccess = false;
          failMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": indView[0] = " << indView[0] << " != globalRow = " << globalRow << endl;
        }
        if (numEntries > 1 && indView[1] != globalMaxCol) {
          localSuccess = false;
          failMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": indView[1] = " << indView[1] << " != globalMaxCol = " << globalMaxCol << endl;
        }
        if (numEntries > 0 && valView[0] != STS::one ()) {
          localSuccess = false;
          failMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": valView[0] = " << valView[0] << " != 1" << endl;
        }
        if (numEntries > 1 && valView[1] != as<ST> (myRank+2)) {
          localSuccess = false;
          // Report the calling process' actual rank (was hard-coded to 1).
          failMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": valView[1] = " << valView[1] << " != myRank+2 = " << (myRank+2) << endl;
        }
      }
      else {
        // Every other row: only the diagonal entry, with value 1.
        if (numEntries != as<size_t> (1)) {
          localSuccess = false;
          failMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": numEntries = " << numEntries << " != 1" << endl;
        }
        if (numEntries > 0 && indView[0] != globalRow) {
          localSuccess = false;
          failMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": indView[0] = " << indView[0] << " != globalRow = " << globalRow << endl;
        }
        if (numEntries > 0 && valView[0] != STS::one ()) {
          localSuccess = false;
          failMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": valView[0] = " << valView[0] << " != 1" << endl;
        }
      }
    }
  }

  // Reduce the per-process results so that every process agrees on
  // the outcome.
  bool globalSuccess = true;
  {
    int globalSuccess_int = 1;
    reduceAll (*comm, Teuchos::REDUCE_MIN, localSuccess ? 1 : 0, outArg (globalSuccess_int));
    globalSuccess = (globalSuccess_int != 0);
  }

  if (! globalSuccess) {
    // Print out the failure messages on all processes.
    for (int p = 0; p < numProcs; ++p) {
      if (p == myRank) {
        out << failMsg.str () << endl;
        out << "Proc " << myRank << ": localSuccess = " << localSuccess << ", globalSuccess = " << globalSuccess << endl;
        //      std::flush (out);
      }
      // Do some barriers to allow output to finish.
      comm->barrier ();
      comm->barrier ();
      comm->barrier ();
    }
  }

  TEST_EQUALITY_CONST(globalSuccess, true);
}
// Test for Tpetra::CrsMatrix::sumIntoGlobalValues(), with nonowned
// rows.  This test is like CrsMatrix_NonlocalSumInto.cpp, except that
// it attempts to sum into remote entries that don't exist on the
// process that owns them.  Currently, CrsMatrix silently ignores
// these entries.  (This is how CrsMatrix implements Import and Export
// when the target matrix has a fixed column Map.  Data are
// redistributed between the two row Maps, and "filtered" by the
// target matrix's column Map.)  This unit test verifies that behavior
// by ensuring the following:
//
// 1. fillComplete() (actually globalAssemble()) does not throw an
//    exception when the incoming entries don't exist on the process
//    that owns their rows.
// 2. The ignored entries are actually ignored.  They must change
//    neither the structure nor the values of the matrix.
//
// mfh 16 Dec 2012: The one-template-argument version breaks explicit
// instantiation.  Ah well.
//TEUCHOS_UNIT_TEST_TEMPLATE_1_DECL( CrsMatrix, NonlocalSumInto_Ignore, CrsMatrixType )
TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( CrsMatrix, NonlocalSumInto_Ignore, LocalOrdinalType, GlobalOrdinalType, ScalarType, NodeType )
{
  using Tpetra::createContigMapWithNode;
  using Tpetra::createNonContigMapWithNode;
  using Tpetra::global_size_t;
  using Tpetra::Map;
  using Teuchos::Array;
  using Teuchos::ArrayView;
  using Teuchos::as;
  using Teuchos::av_const_cast;
  using Teuchos::Comm;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_const_cast;
  using Teuchos::OrdinalTraits;
  using Teuchos::outArg;
  using Teuchos::ParameterList;
  using Teuchos::parameterList;
  using Teuchos::reduceAll;
  using Teuchos::ScalarTraits;
  using Teuchos::tuple;
  using Teuchos::TypeNameTraits;
  using std::endl;

#if 0
  // Extract typedefs from the CrsMatrix specialization.  (Only
  // applicable to the disabled one-template-argument version above.)
  typedef typename CrsMatrixType::scalar_type scalar_type;
  typedef typename CrsMatrixType::local_ordinal_type local_ordinal_type;
  typedef typename CrsMatrixType::global_ordinal_type global_ordinal_type;
  typedef typename CrsMatrixType::node_type node_type;
#endif // 0

  // Canonical typedefs, taken from the test's template parameters.
  typedef ScalarType scalar_type;
  typedef LocalOrdinalType local_ordinal_type;
  typedef GlobalOrdinalType global_ordinal_type;
  typedef NodeType node_type;

  // Typedefs derived from the above canonical typedefs.
  typedef ScalarTraits<scalar_type> STS;
  typedef Map<local_ordinal_type, global_ordinal_type, node_type> map_type;

  // Abbreviation typedefs.
  typedef scalar_type ST;
  typedef local_ordinal_type LO;
  typedef global_ordinal_type GO;
  typedef node_type NT;

  typedef Tpetra::CrsMatrix<ST, LO, GO, NT> CrsMatrixType;

  // CrsGraph specialization corresponding to CrsMatrixType (the
  // CrsMatrix specialization).
  typedef Tpetra::CrsGraph<LO, GO, NT, typename CrsMatrixType::mat_solve_type> crs_graph_type;

  // Passed as the global element count when creating the row Map below.
  const global_size_t INVALID = OrdinalTraits<global_size_t>::invalid();

  // Get the default communicator.
  RCP<const Comm<int> > comm = Tpetra::DefaultPlatform::getDefaultPlatform ().getComm ();
  const int numProcs = comm->getSize ();
  const int myRank = comm->getRank ();

  if (myRank == 0) {
    out << "Test with " << numProcs << " process" << (numProcs != 1 ? "es" : "") << endl;
  }

  // This test doesn't make much sense if there is only one MPI
  // process.  We let it pass trivially in that case.
  if (numProcs == 1) {
    out << "Number of processes in world is one; test passes trivially." << endl;
    return;
  }

  // Get a Kokkos Node instance.  It would be nice if we could pass in
  // parameters here, but threads don't matter for this test; it's a
  // test for distributed-memory capabilities.

  if (myRank == 0) {
    out << "Creating Kokkos Node of type " << TypeNameTraits<node_type>::name () << endl;
  }
  RCP<node_type> node;
  {
    ParameterList pl; // Kokkos Node types require a PL inout.
    node = rcp (new node_type (pl));
  }

  // Number of rows in the matrix owned by each process.
  const LO numLocalRows = 10;

  //CrT: 4Feb14: the void trick does not seem to work, I get warnings
  // Number of (global) rows and columns in the matrix.
  //const GO numGlobalRows = numLocalRows * numProcs;
  //const GO numGlobalCols = numGlobalRows;
  // Prevent compile warning for unused variable.
  // (It's not really "variable" if it's const, but oh well.)
  //(void) numGlobalCols;

  if (myRank == 0) {
    out << "Creating contiguous row Map" << endl;
  }

  // Create a contiguous row Map, with numLocalRows rows per process.
  RCP<const map_type> rowMap = createContigMapWithNode<LO, GO, NT> (INVALID, numLocalRows, comm, node);

  // For now, reuse the row Map for the domain and range Maps.  Later,
  // we might want to test using different domain or range Maps.
  RCP<const map_type> domainMap = rowMap;
  RCP<const map_type> rangeMap = rowMap;

  // Min and max row index owned by this process, and min and max
  // column index over all processes (taken from the domain Map).
  const GO globalMinRow = rowMap->getMinGlobalIndex ();
  const GO globalMaxRow = rowMap->getMaxGlobalIndex ();
  const GO globalMinCol = domainMap->getMinAllGlobalIndex ();
  const GO globalMaxCol = domainMap->getMaxAllGlobalIndex ();

  if (myRank == 0) {
    out << "Creating graph" << endl;
  }

  // Create the graph and set its structure.  Every process sets its
  // diagonal entries (which it owns).  Unlike in the
  // CrsMatrix_NonlocalSumInto.cpp test, we don't set any other
  // entries.  As a result, the later calls to sumIntoGlobalValues()
  // for nonowned rows target entries that do not exist in the graph,
  // and this test expects them to be silently ignored.
  RCP<const crs_graph_type> graph;
  {
    // We have a good upper bound for the number of entries per row,
    // so use static profile.  Leave the upper bound as 2 (just as it
    // is in the CrsMatrix_NonlocalSumInto.cpp test) so that there
    // would actually be room for the incoming entries from remote
    // calls to sumIntoGlobalValues().
    RCP<crs_graph_type> nonconstGraph (new crs_graph_type (rowMap, 2, Tpetra::StaticProfile));

    TEUCHOS_TEST_FOR_EXCEPTION(globalMinRow >= globalMaxRow, std::logic_error,
      "This test only works if globalMinRow < globalMaxRow.");

    // Insert all the diagonal entries, and only the diagonal entries
    // (unlike in the other test).
    for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) {
      nonconstGraph->insertGlobalIndices (globalRow, tuple (globalRow));
    }

    nonconstGraph->fillComplete (domainMap, rangeMap);
    graph = rcp_const_cast<const crs_graph_type> (nonconstGraph);
  }

  // Test whether the graph has the correct structure: every owned
  // row must hold exactly its diagonal entry.  Failures are recorded
  // locally and reduced over all processes afterwards.
  bool localGraphSuccess = true;
  std::ostringstream graphFailMsg;
  {
    Array<GO> ind (2); // upper bound

    for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) {
      size_t numEntries = 0; // output argument of below line.
      graph->getGlobalRowCopy (globalRow, ind (), numEntries);

      // Revise view based on numEntries.
      ArrayView<GO> indView = ind.view (0, numEntries);

      // Sort the view.
      std::sort (indView.begin (), indView.end ());

      if (numEntries != as<size_t> (1)) {
        localGraphSuccess = false;
        graphFailMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": numEntries = " << numEntries << " != 1" << endl;
      }
      if (numEntries > 0 && indView[0] != globalRow) {
        localGraphSuccess = false;
        graphFailMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": indView[0] = " << indView[0] << " != globalRow = " << globalRow << endl;
      }
    }
  }

  // Make sure that all processes successfully created the graph.
  bool globalGraphSuccess = true;
  {
    int globalGraphSuccess_int = 1;
    reduceAll (*comm, Teuchos::REDUCE_MIN, localGraphSuccess ? 1 : 0, outArg (globalGraphSuccess_int));
    globalGraphSuccess = (globalGraphSuccess_int != 0);
  }
  if (! globalGraphSuccess) {
    if (myRank == 0) {
      out << "Graph structure not all correct:" << endl << endl;
    }
    // Print out the failure messages on all processes.
    for (int p = 0; p < numProcs; ++p) {
      if (p == myRank) {
        out << graphFailMsg.str () << endl;
        std::flush (out);
      }
      // Do some barriers to allow output to finish.
      comm->barrier ();
      comm->barrier ();
      comm->barrier ();
    }
  }
  TEUCHOS_TEST_FOR_EXCEPTION(! globalGraphSuccess, std::logic_error, "Graph structure test failed.");

  if (myRank == 0) {
    out << "Creating matrix" << endl;
  }

  // Create the matrix, using the above graph.
  RCP<CrsMatrixType> matrix (new CrsMatrixType (graph));

  if (myRank == 0) {
    out << "Setting all matrix entries to 1" << endl;
  }

  // Set all the owned entries to one.  Below we attempt to sum into
  // nonlocal entries.
  matrix->setAllToScalar (STS::one ());

  // Attempt to sum into nonowned entries, using this process' rank as
  // the value.  Unlike in the CrsMatrix_NonlocalSumInto.cpp test,
  // these entries do NOT exist in the graph on the processes that own
  // their rows.  The sumIntoGlobalValues() calls will record the
  // data, but the globalAssemble() method (called by fillComplete())
  // will silently ignore entries whose columns are not in the column
  // Map.  The comment at the top of this test explains why this
  // behavior is reasonable.
  //
  // mfh 15,16 Dec 2012: Silently ignoring columns not in the column
  // Map has implications for the implementation of
  // sumIntoGlobalValues() for nonowned rows.  In particular, a
  // version of Map's getRemoteIDList() that uses one-sided
  // communication could invoke MPI_Get to figure out what the remote
  // process owns, without asking it or otherwise requiring
  // synchronization.  Thus, sumIntoGlobalValues() could throw
  // immediately on the calling process, rather than deferring the
  // exception to the remote process in globalAssemble().  If we
  // switch to that implementation, this unit test must be changed
  // accordingly.
  if (globalMinRow > rowMap->getMinAllGlobalIndex ()) {
    // Attempt to write to the (last row, globalMaxCol) entry of the previous process.
    matrix->sumIntoGlobalValues (globalMinRow-1, tuple (globalMaxCol), tuple (as<ST> (myRank)));
  }
  if (globalMaxRow < rowMap->getMaxAllGlobalIndex ()) {
    // Attempt to write to the (first row, globalMinCol) entry of the next process.
    matrix->sumIntoGlobalValues (globalMaxRow+1, tuple (globalMinCol), tuple (as<ST> (myRank)));
  }

  if (myRank == 0) {
    out << "Calling fillComplete on the matrix" << endl;
  }
  TEST_NOTHROW(matrix->fillComplete (domainMap, rangeMap)); // Tpetra::Details::InvalidGlobalIndex<GO>

  // mfh 15 Dec 2012: We currently don't make promises about the state
  // of the matrix if fillComplete() throws.  Later, we might like to
  // improve the exception guarantees of fillComplete().  The checks
  // below are written for the case where fillComplete() did not throw.

  if (myRank == 0) {
    out << "Testing the matrix values" << endl;
  }

  // Test whether the entries have their correct values: every owned
  // row must still hold exactly its diagonal entry, with value one.
  bool localSuccess = true;
  std::ostringstream failMsg;
  {
    Array<GO> ind (2); // upper bound
    Array<ST> val (2); // upper bound

    for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) {
      size_t numEntries = 0; // output argument of below line.
      matrix->getGlobalRowCopy (globalRow, ind (), val (), numEntries);

      // Revise views based on numEntries.
      ArrayView<GO> indView = ind.view (0, numEntries);
      ArrayView<ST> valView = val.view (0, numEntries);

      // Sort the views jointly by column index.
      Tpetra::sort2 (indView.begin (), indView.end (), valView.begin ());

      if (numEntries != as<size_t> (1)) {
        localSuccess = false;
        failMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": numEntries = " << numEntries << " != 1" << endl;
      }
      if (numEntries > 0 && indView[0] != globalRow) {
        localSuccess = false;
        failMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": indView[0] = " << indView[0] << " != globalRow = " << globalRow << endl;
      }
      if (numEntries > 0 && valView[0] != STS::one ()) {
        localSuccess = false;
        failMsg << "Proc " << myRank << ": globalRow = " << globalRow << ": valView[0] = " << valView[0] << " != 1" << endl;
      }
    }
  }

  // Reduce the per-process results so that every process agrees on
  // the outcome.
  bool globalSuccess = true;
  {
    int globalSuccess_int = 1;
    reduceAll (*comm, Teuchos::REDUCE_MIN, localSuccess ? 1 : 0, outArg (globalSuccess_int));
    globalSuccess = (globalSuccess_int != 0);
  }

  if (! globalSuccess) {
    // Print out the failure messages on all processes.
    for (int p = 0; p < numProcs; ++p) {
      if (p == myRank) {
        out << failMsg.str () << endl;
        out << "Proc " << myRank << ": localSuccess = " << localSuccess << ", globalSuccess = " << globalSuccess << endl;
        //      std::flush (out);
      }
      // Do some barriers to allow output to finish.
      comm->barrier ();
      comm->barrier ();
      comm->barrier ();
    }
  }

  TEST_EQUALITY_CONST(globalSuccess, true);
}
// Unit test for Tpetra::Map::replaceCommWithSubset().
//
// Each process of MPI_COMM_WORLD starts out owning three consecutive
// global indices (rank r owns 3r, 3r+1, 3r+2).  The test then splits
// off a communicator that leaves out Proc 0, calls
// replaceCommWithSubset() with it, and checks on every process that
// the result is what we expect: null on Proc 0, and elsewhere a Map
// with one fewer process, three fewer global elements, the same local
// elements, and an index base equal to the smallest remaining GID.
//
// This test is only meaningful in an MPI build.
TEUCHOS_UNIT_TEST( Map, replaceCommWithSubset )
  typedef int local_ordinal_type;
  typedef long global_ordinal_type;
  typedef Tpetra::Map<local_ordinal_type, global_ordinal_type> map_type;
  typedef Array<global_ordinal_type>::size_type size_type;

  RCP<const Comm<int> > origComm = rcp (new MpiComm<int> (MPI_COMM_WORLD));
  const int numProcs = origComm->getSize ();
  const int myRank = origComm->getRank ();

  // Create a Map in which all processes have a nonzero number of elements.
  const size_type numGidsPerProc = 3;
  const size_type myNumGids = numGidsPerProc;
  Array<global_ordinal_type> myGids (myNumGids);
  for (size_type k = 0; k < myNumGids; ++k) {
    myGids[k] = as<global_ordinal_type> (myRank) *
      as<global_ordinal_type> (numGidsPerProc) +
      as<global_ordinal_type> (k);
  const global_size_t globalNumElts = as<global_size_t> (numGidsPerProc) *
    as<global_size_t> (numProcs);
  const global_ordinal_type indexBase = 0;
  RCP<const map_type> origMap (new map_type (globalNumElts, myGids (),
                                             indexBase, origComm));

  // Create a new communicator that excludes Proc 0.
  // This will exercise recomputing the index base.
  const int color = (myRank == 0) ? 0 : 1;
  const int key = 0;
  RCP<const Comm<int> > newComm = origComm->split (color, key);
  // Proc 0 is the excluded process, so it passes in a null communicator.
  if (myRank == 0) {
    newComm = null;
  // Create the new Map distributed over the subset communicator.
  RCP<const map_type> newMap = origMap->replaceCommWithSubset (newComm);

  // Test collectively for success, so the test doesn't hang on failure.
  // Each process records its own verdict and messages; the verdicts
  // are combined with a min-reduction over the original communicator.
  int localSuccess = 1;
  std::ostringstream err;
  if (myRank == 0) {
    // The excluded process must get a null Map back.
    if (! newMap.is_null ()) {
      localSuccess = 0;
      err << "replaceCommWithSubset() should have returned null, but did not."
          << endl;
  } else {
    if (newMap.is_null ()) {
      localSuccess = 0;
      err << "replaceCommWithSubset() should not have returned null, but did."
          << endl;
    } else {
      // Exactly one process (Proc 0) was dropped.
      RCP<const Comm<int> > theNewComm = newMap->getComm ();
      if (theNewComm->getSize () != numProcs - 1) {
        localSuccess = 0;
        err << "New communicator should have " << (numProcs - 1)
            << " processes, but has " << theNewComm->getSize ()
            << " processes instead." << endl;

      // Proc 0's GIDs are gone from the global count.
      if (newMap->getGlobalNumElements () != origMap->getGlobalNumElements () - numGidsPerProc) {
        localSuccess = 0;
        err << "New Map has " << newMap->getGlobalNumElements () << " global "
            << "elements, but should have "
            << (origMap->getGlobalNumElements () - numGidsPerProc) << "." << endl;

      // The remaining processes keep all of their local elements.
      if (newMap->getNodeNumElements () != origMap->getNodeNumElements ()) {
        localSuccess = 0;
        err << "New Map has " << newMap->getNodeNumElements () << " local "
            << "elements, but should have " << origMap->getNodeNumElements ()
            << "." << endl;

      // With Proc 0's GIDs 0 .. numGidsPerProc-1 removed, the smallest
      // remaining GID is numGidsPerProc.
      if (newMap->getIndexBase () != as<global_ordinal_type> (numGidsPerProc)) {
        localSuccess = 0;
        err << "New Map has index base " << newMap->getIndexBase ()
            << ", but should have index base " << numGidsPerProc << "." << endl;

      // The local GID list must be unchanged, element for element.
      ArrayView<const global_ordinal_type> myNewGids =
        newMap->getNodeElementList ();
      if (myNewGids.size () != myGids.size () ||
          ! std::equal (myNewGids.begin (), myNewGids.end (), myGids.begin ())) {
        localSuccess = 0;
        err << "New Map has local GID list " << toString (myNewGids) << ", but "
            << "should have local GID list " << toString (myGids ()) << "."
            << endl;

  // The test passes globally only if it passed on every process.
  int globalSuccess = 0;
  reduceAll (*origComm, REDUCE_MIN, localSuccess, outArg (globalSuccess));
  if (globalSuccess == 0) {
    if (myRank == 0) {
      cerr << "TEST FAILED" << endl
           << "Error messages from each process:" << endl << endl;
    // Print the per-process messages one rank at a time.
    for (int p = 0; p < numProcs; ++p) {
      if (myRank == p) {
        cerr << "Process " << myRank << ": " << err.str () << endl;
      origComm->barrier (); // Give time for output to finish.
      origComm->barrier ();
      origComm->barrier ();
  TEST_EQUALITY(globalSuccess, 1);
// Test driver for Matrix Market dense-file input (and optionally
// output) of a Tpetra::MultiVector.
//
// Command-line options select the input file, the temporary and
// output files used when --testWrite is set, which families of input
// Maps to exercise, and the verbosity.  Test 1 (null input Map) always
// runs, because the multivector X it reads is needed by every later
// test; the later tests re-read the file with explicitly constructed
// contiguous or "noncontiguous" Maps.
//
// Prints "End Result: TEST PASSED" or "End Result: TEST FAILED" on
// Proc 0.  Returns EXIT_SUCCESS if no test failed (including the cases
// where only the help text was requested or no input file name was
// given), and EXIT_FAILURE otherwise.  A failure in Test 1 is rethrown
// rather than reported through the return value.
main (int argc, char *argv[]) 
  using Teuchos::Array;
  using Teuchos::as;
  using Teuchos::Comm;
  using Teuchos::CommandLineProcessor;
  using Teuchos::ParameterList;
  using Teuchos::ptr;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::REDUCE_MAX;
  using Teuchos::REDUCE_MIN;
  using Teuchos::reduceAll;
  using std::cerr;
  using std::cout;
  using std::endl;

  typedef double scalar_type;
  typedef int local_ordinal_type;
  typedef int global_ordinal_type;
  typedef Kokkos::SerialNode node_type;

  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &cout);
  RCP<const Comm<int> > comm = 
  const int myRank = comm->getRank();
  const int numProcs = comm->getSize();

  std::string inputFilename;  // Matrix Market file to read
  std::string temporaryFilename; // Matrix Market file to write (if applicable)
  std::string outputFilename; // Matrix Market file to write (if applicable)

  // Number of a specific test to run.  If nonzero, only run that
  // test.  We always run Test #1 since its result is needed for
  // subsequent tests.
  int testToRun = 0;

  // FIXME (mfh 07 Feb 2012) Currently, all tests with a different
  // index base FAIL.  Reading in the multivector appears to be
  // correct, but writing it results in a multivector of all zeros (on
  // _all_ processes).
  bool testDifferentIndexBase = false;
  bool testContiguousInputMap = true;
  bool testNoncontiguousInputMap = false; 

  bool testWrite = true; // Test Matrix Market output?
  bool tolerant = false; // Parse the file tolerantly?
  bool echo = false;     // Echo the read-in matrix back?
  bool verbose = false;  // Verbosity of output
  bool debug = false;    // Print debugging info?
  // If true, stop after a single test failure.  Intended for
  // interactive use, so that you can examine a test's output file.
  // Not intended for batch or ctest use.
  bool stopAfterFailure = false; 

  CommandLineProcessor cmdp (false, true);
  cmdp.setOption ("inputFilename", &inputFilename,
		  "Name of the Matrix Market dense matrix file to read.");
  cmdp.setOption ("temporaryFilename", &temporaryFilename,
		  "If --testWrite is true, then use this file as temporary "
		  "storage on (MPI) Proc 0.  Otherwise, this argument is "
  cmdp.setOption ("outputFilename", &outputFilename,
		  "If --testWrite is true, then write the read-in matrix to "
		  "this file in Matrix Market format on (MPI) Proc 0.  "
		  "Otherwise, this argument is ignored.  Note that the output "
		  "file may not be identical to the input file.");
  cmdp.setOption ("testToRun", &testToRun, "Number of a specific test to run.  "
		  "If nonzero, only run that test.  We always run Test #1 since"
		  " its result is needed for subsequent tests.");
  cmdp.setOption ("testDifferentIndexBase", "dontTestDifferentIndexBase",
		  &testDifferentIndexBase, "Whether to test input and output "
		  "for Maps with different index bases.");
  cmdp.setOption ("testContiguousInputMap", "dontTestContiguousInputMap",
		  "Whether to test input and output for nonnull contiguous "
		  "input Maps.");
  cmdp.setOption ("testNoncontiguousInputMap", "dontTestNoncontiguousInputMap",
		  "Whether to test input and output for nonnull noncontiguous "
		  "input Maps.");
  cmdp.setOption ("testWrite", "noTestWrite", &testWrite,
		  "Whether to test Matrix Market file output.  Ignored if no "
		  "--outputFilename value is given.");
  cmdp.setOption ("tolerant", "strict", &tolerant, 
		  "Whether to parse the input Matrix Market file tolerantly.");
  cmdp.setOption ("echo", "noecho", &echo,
		  "Whether to echo the read-in matrix back to stdout on Rank 0 "
		  "in Matrix Market format.  Note that the echoed matrix may "
		  "not be identical to the input file.");
  cmdp.setOption ("verbose", "quiet", &verbose, "Print messages and results.");
  cmdp.setOption ("debug", "nodebug", &debug, "Print debugging information.");
  cmdp.setOption ("stopAfterFailure", "dontStopAfterFailure", &stopAfterFailure, 
		  "Whether to stop after a single test failure.");

  // Parse the command-line arguments.
    const CommandLineProcessor::EParseCommandLineReturn parseResult = 
      cmdp.parse (argc,argv);
    // If the caller asks us to print the documentation, or does not
    // explicitly say to run the benchmark, we let this "test" pass
    // trivially.
    if (parseResult == CommandLineProcessor::PARSE_HELP_PRINTED) {
      if (myRank == 0) {
	cout << "End Result: TEST PASSED" << endl;
      return EXIT_SUCCESS;
    TEUCHOS_TEST_FOR_EXCEPTION(parseResult != CommandLineProcessor::PARSE_SUCCESSFUL, 
      std::invalid_argument, "Failed to parse command-line arguments.");

  // Get a Kokkos Node instance for the particular Node type.
  RCP<node_type> node = getNode<node_type>();

  // List of numbers of failed tests.
  std::vector<int> failedTests;
  // Current test number.  Increment before starting each test.  If a
  // test is only run conditionally, increment before evaluating the
  // condition.  This ensures that each test has the same number each
  // time, whether or not a particular test is run.
  int testNum = 0;

  // Run all the tests.  If no input filename was specified, we don't
  // invoke the test and we report a "TEST PASSED" message.
  if (inputFilename != "") {
    // Convenient abbreviations
    typedef scalar_type ST;
    typedef local_ordinal_type LO;
    typedef global_ordinal_type GO;
    typedef node_type NT;
    typedef Tpetra::MultiVector<ST, LO, GO, NT> MV;
    typedef Tpetra::Map<LO, GO, NT> MT;

    // If not testing writes, don't do the sanity check that tests
    // input by comparing against output.
    std::string outFilename = testWrite ? outputFilename : "";
    std::string tmpFilename = testWrite ? temporaryFilename : "";

    // Test 1: null input Map.
    if (verbose && myRank == 0) {
      cout << "Test " << testNum << ": Null Map on input to readDenseFile()" << endl;
    RCP<MV> X;
    try {
      X = testReadDenseFile<MV> (inputFilename, tmpFilename, comm,
				 node, tolerant, verbose, debug);
      if (outFilename != "") {
	testWriteDenseFile<MV> (outFilename, X, echo, verbose, debug);
    } catch (std::exception& e) {
      failedTests.push_back (testNum);
      // If Test 1 fails, the other tests shouldn't even run, since
      // they depend on the result of Test 1 (the multivector X).
      //
      // Rethrow the exception currently being handled, rather than a
      // copy of 'e'.  Throwing 'e' by value would slice it down to
      // std::exception and lose the original type and message.
      throw;

    // Test 2: nonnull contiguous input Map with the same index base
    // as X's Map.  This Map may or may not necessarily be the same as
    // (in the sense of isSameAs()) or even compatible with X's Map.
    if ((testToRun == 0 && testContiguousInputMap) || 
	(testToRun != 0 && testToRun == testNum)) {
      if (verbose && myRank == 0) {
	cout << "Test " << testNum << ": Nonnull contiguous Map (same index "
	  "base) on input to readDenseFile()" << endl;
      const Tpetra::global_size_t globalNumRows = X->getMap()->getGlobalNumElements();
      const GO indexBase = X->getMap()->getIndexBase();
      // Create the Map.
      RCP<const MT> map = 
	rcp (new Tpetra::Map<LO, GO, NT> (globalNumRows, indexBase, comm, 
					  Tpetra::GloballyDistributed, node));
      try {
	RCP<MV> X2 = 
	  testReadDenseFileWithInputMap<MV> (inputFilename, tmpFilename,
					     map, tolerant, verbose, debug);
	if (outFilename != "") {
	  testWriteDenseFile<MV> (outFilename, X2, echo, verbose, debug);
      } catch (std::exception& e) {
	failedTests.push_back (testNum);
	if (myRank == 0) {
	  cerr << "Test " << testNum << " failed: " << e.what() << endl;

	if (stopAfterFailure) {
	  if (failedTests.size() > 0) {
	    if (myRank == 0) {
	      cout << "End Result: TEST FAILED" << endl;
	    return EXIT_FAILURE;
	  else {
	    if (myRank == 0) {
	      cout << "End Result: TEST PASSED" << endl;
	    return EXIT_SUCCESS;
	} // if stop after failure

    // Test 3: nonnull contiguous input Map, with a different index
    // base than X's Map.  In this case, the index base is X's Map's
    // index base plus a small number (3).  For sufficiently long
    // vectors, this tests the case where the GID sets overlap.
    if ((testToRun == 0 && testContiguousInputMap && testDifferentIndexBase) || 
	(testToRun != 0 && testToRun == testNum)) {
      if (verbose && myRank == 0) {
	cout << "Test " << testNum << ": Nonnull contiguous Map (different "
	  "index base) on input to readDenseFile()" << endl;
      const Tpetra::global_size_t globalNumRows = X->getMap()->getGlobalNumElements();
      const GO indexBase = X->getMap()->getIndexBase() + as<GO> (3);

      // Make sure that the index base is the same on all processes.
      // It definitely should be, since the Map's getIndexBase()
      // method should return the same value on all processes.
      GO minIndexBase = indexBase;
      reduceAll (*comm, REDUCE_MIN, indexBase, ptr (&minIndexBase));
      GO maxIndexBase = indexBase;
      reduceAll (*comm, REDUCE_MAX, indexBase, ptr (&maxIndexBase));
      TEUCHOS_TEST_FOR_EXCEPTION(minIndexBase != maxIndexBase || minIndexBase != indexBase,
        std::logic_error, "Index base values do not match on all processes.  "
        "Min value is " << minIndexBase << " and max value is " << maxIndexBase 
        << ".");

      // Create the Map.
      RCP<const MT> map = 
	rcp (new Tpetra::Map<LO, GO, NT> (globalNumRows, indexBase, comm, 
					  Tpetra::GloballyDistributed, node));
      try {
	RCP<MV> X3 = 
	  testReadDenseFileWithInputMap<MV> (inputFilename, tmpFilename,
					     map, tolerant, verbose, debug);
	if (outFilename != "") {
	  testWriteDenseFile<MV> (outFilename, X3, echo, verbose, debug);
      } catch (std::exception& e) {
	failedTests.push_back (testNum);
	if (myRank == 0) {
	  cerr << "Test " << testNum << " failed: " << e.what() << endl;

	if (stopAfterFailure) {
	  if (failedTests.size() > 0) {
	    if (myRank == 0) {
	      cout << "End Result: TEST FAILED" << endl;
	    return EXIT_FAILURE;
	  else {
	    if (myRank == 0) {
	      cout << "End Result: TEST PASSED" << endl;
	    return EXIT_SUCCESS;
	} // if stop after failure

    // Test 4: nonnull contiguous input Map, with a different index
    // base than X's Map.  In this case, the new index base is chosen
    // so that the new GID set does not overlap with X's Map's GID
    // set.
    if ((testToRun == 0 && testContiguousInputMap && testDifferentIndexBase) || 
	(testToRun != 0 && testToRun == testNum)) {
      if (verbose && myRank == 0) {
	cout << "Test " << testNum << ": Nonnull contiguous Map (different "
	  "index base) on input to readDenseFile()" << endl;
      const Tpetra::global_size_t globalNumRows = X->getMap()->getGlobalNumElements();
      // Choose the Map's index base so that the global ordinal sets
      // of X->getMap() and map don't overlap.  This will ensure that
      // we test something nontrivial.
      const GO indexBase = X->getMap()->getMaxAllGlobalIndex() + 1;

      // Make sure that the index base is the same on all processes.
      // It definitely should be, since the Map's getMaxAllGlobalIndex()
      // method should return the same value on all processes.
      GO minIndexBase = indexBase;
      reduceAll (*comm, REDUCE_MIN, indexBase, ptr (&minIndexBase));
      GO maxIndexBase = indexBase;
      reduceAll (*comm, REDUCE_MAX, indexBase, ptr (&maxIndexBase));
      TEUCHOS_TEST_FOR_EXCEPTION(minIndexBase != maxIndexBase || minIndexBase != indexBase,
        std::logic_error, "Index base values do not match on all processes.  "
        "Min value is " << minIndexBase << " and max value is " << maxIndexBase 
        << ".");

      // Create the Map.
      RCP<const MT> map = 
	rcp (new Tpetra::Map<LO, GO, NT> (globalNumRows, indexBase, comm, 
					  Tpetra::GloballyDistributed, node));
      try {
	RCP<MV> X3 = 
	  testReadDenseFileWithInputMap<MV> (inputFilename, tmpFilename,
					     map, tolerant, verbose, debug);
	if (outFilename != "") {
	  testWriteDenseFile<MV> (outFilename, X3, echo, verbose, debug);
      } catch (std::exception& e) {
	failedTests.push_back (testNum);
	if (myRank == 0) {
	  cerr << "Test " << testNum << " failed: " << e.what() << endl;

	if (stopAfterFailure) {
	  if (failedTests.size() > 0) {
	    if (myRank == 0) {
	      cout << "End Result: TEST FAILED" << endl;
	    return EXIT_FAILURE;
	  else {
	    if (myRank == 0) {
	      cout << "End Result: TEST PASSED" << endl;
	    return EXIT_SUCCESS;
	} // if stop after failure

    if ((testToRun == 0 && testNoncontiguousInputMap) || 
	(testToRun != 0 && testToRun == testNum)) {
      // Test 5: nonnull input Map with the same index base as X's
      // Map, and a "noncontiguous" distribution (in the sense that
      // the Map is constructed using the constructor that takes an
      // arbitrary list of GIDs; that doesn't necessarily mean that
      // the GIDs themselves are noncontiguous).
      if (verbose && myRank == 0) {
	cout << "Test " << testNum << ": Nonnull noncontiguous Map (same index "
	  "base) on input to readDenseFile()" << endl;
      const GO indexBase = X->getMap()->getIndexBase();
      const Tpetra::global_size_t globalNumRows = X->getMap()->getGlobalNumElements();

      // Compute number of GIDs owned by each process.  We're
      // replicating Tpetra functionality here because we want to
      // trick Tpetra into thinking we have a noncontiguous
      // distribution.  This is the most general case and the most
      // likely to uncover bugs.
      const size_t quotient = globalNumRows / numProcs;
      const size_t remainder = globalNumRows - quotient * numProcs;
      const size_t localNumRows = (as<size_t> (myRank) < remainder) ? 
	(quotient + 1) : quotient;

      // Build the list of GIDs owned by this process.
      Array<GO> elementList (localNumRows);
      GO myStartGID;
      if (as<size_t> (myRank) < remainder) {
	myStartGID = indexBase + as<GO> (myRank) * as<GO> (quotient + 1);
      else {
	// This branch does _not_ assume that GO is a signed type.
	myStartGID = indexBase + as<GO> (remainder) * as<GO> (quotient + 1) +
	  (as<GO> (myRank) - as<GO> (remainder)) * as<GO> (quotient);
      for (GO i = 0; i < as<GO> (localNumRows); ++i) {
	elementList[i] = myStartGID + i;

      if (debug) {
	for (int p = 0; p < numProcs; ++p) {
	  if (p == myRank) {
	    if (elementList.size() > 0) {
	      const GO minGID = *std::min_element (elementList.begin(), elementList.end());
	      const GO maxGID = *std::max_element (elementList.begin(), elementList.end());
	      cerr << "On Proc " << p << ": min,max GID = " << minGID << "," << maxGID << endl;
	    else {
	      cerr << "On Proc " << p << ": elementList is empty" << endl;
	    cerr << std::flush;
	  comm->barrier ();
	  comm->barrier ();
	  comm->barrier ();

      // Create the Map.
      using Tpetra::createNonContigMapWithNode;
      RCP<const MT> map = 
	createNonContigMapWithNode<LO, GO, NT> (elementList(), comm, node);
      try {
	RCP<MV> X4 = testReadDenseFileWithInputMap<MV> (inputFilename, tmpFilename,
							map, tolerant, verbose, debug);
	if (outFilename != "") {
	  testWriteDenseFile<MV> (outFilename, X4, echo, verbose, debug);
      } catch (std::exception& e) {
	failedTests.push_back (testNum);
	if (myRank == 0) {
	  cerr << "Test " << testNum << " failed: " << e.what() << endl;

	if (stopAfterFailure) {
	  if (failedTests.size() > 0) {
	    if (myRank == 0) {
	      cout << "End Result: TEST FAILED" << endl;
	    return EXIT_FAILURE;
	  else {
	    if (myRank == 0) {
	      cout << "End Result: TEST PASSED" << endl;
	    return EXIT_SUCCESS;
	} // if stop after failure
    } // if test noncontiguous input Map

    if ((testToRun == 0 && testNoncontiguousInputMap && testDifferentIndexBase) ||
	(testToRun != 0 && testToRun == testNum)) {
      // Test 6: nonnull input Map with a different index base than
      // X's Map, and a "noncontiguous" distribution (in the sense
      // that the Map is constructed using the constructor that takes
      // an arbitrary list of GIDs; that doesn't necessarily mean that
      // the GIDs themselves are noncontiguous).
      if (verbose && myRank == 0) {
	cout << "Test " << testNum << ": Nonnull noncontiguous Map (different "
	  "index base) on input to readDenseFile()" << endl;
      // Make sure that the global ordinal sets of X->getMap() and
      // map don't overlap.
      GO indexBase = X->getMap()->getMaxAllGlobalIndex() + 1;
      const Tpetra::global_size_t globalNumRows = X->getMap()->getGlobalNumElements();

      // Compute number of GIDs owned by each process.  We're
      // replicating Tpetra functionality here because we want to
      // trick Tpetra into thinking we have a noncontiguous
      // distribution.  This is the most general case and the most
      // likely to uncover bugs.
      const size_t quotient = globalNumRows / numProcs;
      const size_t remainder = globalNumRows - quotient * numProcs;
      const size_t localNumRows = (as<size_t> (myRank) < remainder) ? 
	(quotient + 1) : quotient;

      // Build the list of GIDs owned by this process.
      Array<GO> elementList (localNumRows);
      GO myStartGID;
      if (as<size_t> (myRank) < remainder) {
	myStartGID = indexBase + as<GO> (myRank) * as<GO> (quotient + 1);
      else {
	// This branch does _not_ assume that GO is a signed type.
	// Here myRank >= remainder, so subtract remainder from myRank
	// (not the other way around): the first 'remainder' processes
	// own quotient+1 rows each, and each of the (myRank -
	// remainder) processes between them and this one owns
	// 'quotient' rows.
	myStartGID = indexBase + as<GO> (remainder) * as<GO> (quotient + 1) +
	  (as<GO> (myRank) - as<GO> (remainder)) * as<GO> (quotient);
      for (GO i = 0; i < as<GO> (localNumRows); ++i) {
	elementList[i] = myStartGID + i;

      // Create the Map.
      using Tpetra::createNonContigMapWithNode;
      RCP<const MT> map = 
	createNonContigMapWithNode<LO, GO, NT> (elementList(), comm, node);
      try {
	RCP<MV> X5 = testReadDenseFileWithInputMap<MV> (inputFilename, tmpFilename,
							map, tolerant, verbose, debug);
	if (outFilename != "") {
	  testWriteDenseFile<MV> (outFilename, X5, echo, verbose, debug);
      } catch (std::exception& e) {
	failedTests.push_back (testNum);
	if (myRank == 0) {
	  cerr << "Test " << testNum << " failed: " << e.what() << endl;

	if (stopAfterFailure) {
	  if (failedTests.size() > 0) {
	    if (myRank == 0) {
	      cout << "End Result: TEST FAILED" << endl;
	    return EXIT_FAILURE;
	  else {
	    if (myRank == 0) {
	      cout << "End Result: TEST PASSED" << endl;
	    return EXIT_SUCCESS;
	} // if stop after failure
    } // if test noncontiguous input Map

    if ((testToRun == 0 && testNoncontiguousInputMap) || 
	(testToRun != 0 && testToRun == testNum)) {
      // Test 7: nonnull input Map with the same index base as X's
      // Map, and a "noncontiguous" distribution with GIDs that start
      // at 3.  This lets us easily observe any missing entries after
      // writing X and reading it back in again.
      if (verbose && myRank == 0) {
	cout << "Test " << testNum << ": Nonnull noncontiguous Map (same index "
	  "base, GIDs not in 0 .. N-1) on input to readDenseFile()" << endl;
      const Tpetra::global_size_t globalNumRows = X->getMap()->getGlobalNumElements();
      const GO globalStartGID = as<GO> (3);

      // Compute number of GIDs owned by each process.  We're
      // replicating Tpetra functionality here because we want to
      // trick Tpetra into thinking we have a noncontiguous
      // distribution.  This is the most general case and the most
      // likely to uncover bugs.
      const size_t quotient = globalNumRows / numProcs;
      const size_t remainder = globalNumRows - quotient * numProcs;
      const size_t localNumRows = (as<size_t> (myRank) < remainder) ? 
	(quotient + 1) : quotient;

      // Build the list of GIDs owned by this process.
      Array<GO> elementList (localNumRows);
      GO myStartGID;
      if (as<size_t> (myRank) < remainder) {
	myStartGID = globalStartGID + as<GO> (myRank) * as<GO> (quotient + 1);
      else {
	// This branch does _not_ assume that GO is a signed type.
	myStartGID = globalStartGID + as<GO> (remainder) * as<GO> (quotient + 1) +
	  (as<GO> (myRank) - as<GO> (remainder)) * as<GO> (quotient);
      for (GO i = 0; i < as<GO> (localNumRows); ++i) {
	elementList[i] = myStartGID + i;

      if (debug) {
	for (int p = 0; p < numProcs; ++p) {
	  if (p == myRank) {
	    if (elementList.size() > 0) {
	      const GO minGID = *std::min_element (elementList.begin(), elementList.end());
	      const GO maxGID = *std::max_element (elementList.begin(), elementList.end());
	      cerr << "On Proc " << p << ": min,max GID = " << minGID << "," << maxGID << endl;
	    else {
	      cerr << "On Proc " << p << ": elementList is empty" << endl;
	    cerr << std::flush;
	  comm->barrier ();
	  comm->barrier ();
	  comm->barrier ();

      // Create the Map.
      using Tpetra::createNonContigMapWithNode;
      RCP<const MT> map = 
	createNonContigMapWithNode<LO, GO, NT> (elementList(), comm, node);
      try {
	RCP<MV> X7 = testReadDenseFileWithInputMap<MV> (inputFilename, tmpFilename,
							map, tolerant, verbose, debug);
	if (outFilename != "") {
	  testWriteDenseFile<MV> (outFilename, X7, echo, verbose, debug);
      } catch (std::exception& e) {
	failedTests.push_back (testNum);
	if (myRank == 0) {
	  cerr << "Test " << testNum << " failed: " << e.what() << endl;

	if (stopAfterFailure) {
	  if (failedTests.size() > 0) {
	    if (myRank == 0) {
	      cout << "End Result: TEST FAILED" << endl;
	    return EXIT_FAILURE;
	  else {
	    if (myRank == 0) {
	      cout << "End Result: TEST PASSED" << endl;
	    return EXIT_SUCCESS;
	} // if stop after failure
    } // if test noncontiguous input Map

  // Report the overall result: any recorded failure fails the run.
  if (failedTests.size() > 0) {
    if (myRank == 0) {
      cout << "End Result: TEST FAILED" << endl;
    return EXIT_FAILURE;
  else {
    if (myRank == 0) {
      cout << "End Result: TEST PASSED" << endl;
    return EXIT_SUCCESS;