SubVectorView<Scalar>
newStridedSubVectorView(const int m, const int stride, const Scalar &val)
{
  ArrayRCP<Scalar> vals = Teuchos::arcp<Scalar>(m*stride);
  std::fill(vals.begin(), vals.end(), Teuchos::ScalarTraits<Scalar>::nan());
  for (
    typename ArrayRCP<Scalar>::iterator itr = vals.begin();
    itr != vals.end();
    itr += stride
    )
  {
    *itr = val;
  }
  return SubVectorView<Scalar>(
    0, m, vals, stride);
}
SubVectorView<Scalar>
newStridedRandomSubVectorView(const int m, const int stride)
{
  typedef Teuchos::ScalarTraits<Scalar> ST;
  ArrayRCP<Scalar> vals = Teuchos::arcp<Scalar>(m*stride);
  std::fill(vals.begin(), vals.end(), ST::nan());
  for (
    typename ArrayRCP<Scalar>::iterator itr = vals.begin();
    itr != vals.end();
    itr += stride
    )
  {
    *itr = ST::random();
  }
  return SubVectorView<Scalar>(
    0, m, vals, stride);
}
void
gathervPrint (std::ostream& out,
              const std::string& s,
              const Teuchos::Comm<int>& comm)
{
  using Teuchos::ArrayRCP;
  using Teuchos::CommRequest;
  using Teuchos::ireceive;
  using Teuchos::isend;
  using Teuchos::outArg;
  using Teuchos::RCP;
  using Teuchos::wait;

  const int myRank = comm.getRank ();
  const int rootRank = 0;
  if (myRank == rootRank) {
    out << s; // Proc 0 prints its buffer first
  }

  const int numProcs = comm.getSize ();
  const int sizeTag = 42;
  const int msgTag = 43;

  ArrayRCP<size_t> sizeBuf (1);
  ArrayRCP<char> msgBuf; // to be resized later
  RCP<CommRequest<int> > req;

  for (int p = 1; p < numProcs; ++p) {
    if (myRank == p) {
      sizeBuf[0] = s.size ();
      req = isend<int, size_t> (sizeBuf, rootRank, sizeTag, comm);
      (void) wait<int> (comm, outArg (req));

      const size_t msgSize = s.size ();
      msgBuf.resize (msgSize + 1); // for the '\0'
      std::copy (s.begin (), s.end (), msgBuf.begin ());
      msgBuf[msgSize] = '\0';

      req = isend<int, char> (msgBuf, rootRank, msgTag, comm);
      (void) wait<int> (comm, outArg (req));
    }
    else if (myRank == rootRank) {
      sizeBuf[0] = 0; // just a precaution
      req = ireceive<int, size_t> (sizeBuf, p, sizeTag, comm);
      (void) wait<int> (comm, outArg (req));

      const size_t msgSize = sizeBuf[0];
      msgBuf.resize (msgSize + 1); // for the '\0'
      req = ireceive<int, char> (msgBuf, p, msgTag, comm);
      (void) wait<int> (comm, outArg (req));

      std::string msg (msgBuf.getRawPtr ());
      out << msg;
    }
  }
}
    inline
    void convert(const DMV& X, RMV& Y)
    {
      // Convert X from DomainScalar precision to RangeScalar precision
      for (size_t j=0; j<X.getNumVectors(); ++j) {
	ArrayRCP<const DomainScalar> xvecVals = X.getVector( j )->get1dView();
	if( xvecVals.size() ) {
	  std::transform( xvecVals.begin(), 
			  xvecVals.end(), 
			  Y.getVectorNonConst( j )->get1dViewNonConst().begin(), 
			  Teuchos::asFunc<RangeScalar>() );
	}
      }
      return;
    }
 EpetraCrsMatrixT<EpetraGlobalOrdinal>::EpetraCrsMatrixT(const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap, const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap, const ArrayRCP< const size_t > &NumEntriesPerRowToAlloc, ProfileType pftype, const Teuchos::RCP< Teuchos::ParameterList > &plist)
   : isFillResumed_(false)
 {
   Teuchos::Array<int> numEntriesPerRowToAlloc(NumEntriesPerRowToAlloc.begin(), NumEntriesPerRowToAlloc.end()); // convert array of "size_t" to array of "int"
   mtx_ = Teuchos::rcp(new Epetra_CrsMatrix(Copy, toEpetra(rowMap), toEpetra(colMap), numEntriesPerRowToAlloc.getRawPtr(), toEpetra(pftype)));
 }
  void ZoltanInterface<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level& level) const {
    FactoryMonitor m(*this, "Build", level);

    RCP<Matrix>      A        = Get< RCP<Matrix> >     (level, "A");
    RCP<const Map>   rowMap   = A->getRowMap();

    RCP<MultiVector> Coords   = Get< RCP<MultiVector> >(level, "Coordinates");
    size_t           dim      = Coords->getNumVectors();

    GO               numParts = level.Get<GO>("number of partitions");

    if (numParts == 1) {
      // Running on one processor, so decomposition is the trivial one, all zeros.
      RCP<Xpetra::Vector<GO, LO, GO, NO> > decomposition = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(rowMap, true);
      Set(level, "Partition", decomposition);
      return;
    }

    float zoltanVersion_;
    Zoltan_Initialize(0, NULL, &zoltanVersion_);

    RCP<const Teuchos::MpiComm<int> >            dupMpiComm = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(rowMap->getComm()->duplicate());
    RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > zoltanComm = dupMpiComm->getRawMpiComm();

    RCP<Zoltan> zoltanObj_ = rcp(new Zoltan((*zoltanComm)()));  //extract the underlying MPI_Comm handle and create a Zoltan object
    if (zoltanObj_ == Teuchos::null)
      throw Exceptions::RuntimeError("MueLu::Zoltan : Unable to create Zoltan data structure");

    // Tell Zoltan what kind of local/global IDs we will use.
    // In our case, each GID is two ints and there are no local ids.
    // One can skip this step if the IDs are just single ints.
    int rv;
    if ((rv = zoltanObj_->Set_Param("num_gid_entries", "1")) != ZOLTAN_OK)
      throw Exceptions::RuntimeError("MueLu::Zoltan::Setup : setting parameter 'num_gid_entries' returned error code " + Teuchos::toString(rv));
    if ((rv = zoltanObj_->Set_Param("num_lid_entries", "0") ) != ZOLTAN_OK)
      throw Exceptions::RuntimeError("MueLu::Zoltan::Setup : setting parameter 'num_lid_entries' returned error code " + Teuchos::toString(rv));
    if ((rv = zoltanObj_->Set_Param("obj_weight_dim", "1") ) != ZOLTAN_OK)
      throw Exceptions::RuntimeError("MueLu::Zoltan::Setup : setting parameter 'obj_weight_dim' returned error code "  + Teuchos::toString(rv));

    if (GetVerbLevel() & Statistics1) zoltanObj_->Set_Param("debug_level", "1");
    else                              zoltanObj_->Set_Param("debug_level", "0");

    zoltanObj_->Set_Param("num_global_partitions", toString(numParts));

    zoltanObj_->Set_Num_Obj_Fn(GetLocalNumberOfRows,      (void *) &*A);
    zoltanObj_->Set_Obj_List_Fn(GetLocalNumberOfNonzeros, (void *) &*A);
    zoltanObj_->Set_Num_Geom_Fn(GetProblemDimension,      (void *) &dim);
    zoltanObj_->Set_Geom_Multi_Fn(GetProblemGeometry,     (void *) Coords.get());

    // Data pointers that Zoltan requires.
    ZOLTAN_ID_PTR import_gids = NULL;  // Global nums of objs to be imported
    ZOLTAN_ID_PTR import_lids = NULL;  // Local indices to objs to be imported
    int   *import_procs       = NULL;  // Proc IDs of procs owning objs to be imported.
    int   *import_to_part     = NULL;  // Partition #s to which imported objs should be assigned.
    ZOLTAN_ID_PTR export_gids = NULL;  // Global nums of objs to be exported
    ZOLTAN_ID_PTR export_lids = NULL;  // local indices to objs to be exported
    int   *export_procs       = NULL;  // Proc IDs of destination procs for objs to be exported.
    int   *export_to_part     = NULL;  // Partition #s for objs to be exported.
    int   num_imported;                // Number of objs to be imported.
    int   num_exported;                // Number of objs to be exported.
    int   newDecomp;                   // Flag indicating whether the decomposition has changed
    int   num_gid_entries;             // Number of array entries in a global ID.
    int   num_lid_entries;

    {
      SubFactoryMonitor m1(*this, "Zoltan RCB", level);
      rv = zoltanObj_->LB_Partition(newDecomp, num_gid_entries, num_lid_entries,
                                    num_imported, import_gids, import_lids, import_procs, import_to_part,
                                    num_exported, export_gids, export_lids, export_procs, export_to_part);
      if (rv == ZOLTAN_FATAL)
        throw Exceptions::RuntimeError("Zoltan::LB_Partition() returned error code");
    }

    // TODO check that A's row map is 1-1.  Zoltan requires this.

    RCP<Xpetra::Vector<GO, LO, GO, NO> > decomposition;
    if (newDecomp) {
      decomposition = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(rowMap, false); // Don't initialize, will be overwritten
      ArrayRCP<GO> decompEntries = decomposition->getDataNonConst(0);

      int mypid = rowMap->getComm()->getRank();
      for (typename ArrayRCP<GO>::iterator i = decompEntries.begin(); i != decompEntries.end(); ++i)
        *i = mypid;

      LO blockSize = A->GetFixedBlockSize();
      for (int i = 0; i < num_exported; ++i) {
        // We have assigned Zoltan gids to first row GID in the block
        // NOTE: Zoltan GIDs are different from GIDs in the Coordinates vector
        LO  localEl = rowMap->getLocalElement(export_gids[i]);
        int partNum = export_to_part[i];
        for (LO j = 0; j < blockSize; ++j)
          decompEntries[localEl + j] = partNum;
      }
    }

    Set(level, "Partition", decomposition);

    zoltanObj_->LB_Free_Part(&import_gids, &import_lids, &import_procs, &import_to_part);
    zoltanObj_->LB_Free_Part(&export_gids, &export_lids, &export_procs, &export_to_part);

  } //Build()