/*
 * Construct a factory for a 2x2 block-partitioned matrix.
 *
 * The domain and range spaces are each expected to expose two blocks:
 * block 0 is stored as the "internal" space and block 1 as the "BC" space.
 * Sub-factories are created for blocks (0,0), (0,1) and (1,1); block (1,0)
 * is never created and its slots are left default-constructed.
 *
 * Every created sub-factory must be an IncrementallyConfigurableMatrixFactory;
 * otherwise a std::runtime_error is raised through TEST_FOR_EXCEPTION.
 *
 * Note that all sub-factories are created through rangeVecType_;
 * domainVecType_ is only stored.
 */
PartitionedMatrixFactory::PartitionedMatrixFactory(
  const VectorSpace<double>& domain,
  int lowestLocalCol,
  const RCP<Array<int> >& isBCCol,
  const RCP<std::set<int> >& remoteBCCols,
  const VectorType<double>& domainVecType,
  const VectorSpace<double>& range,
  int lowestLocalRow,
  const RCP<Array<int> >& isBCRow,
  const VectorType<double>& rangeVecType
  )
  : 
  domain_(domain),
  internalDomain_(domain.getBlock(0)),
  bcDomain_(domain.getBlock(1)),
  isBCCol_(isBCCol),
  remoteBCCols_(remoteBCCols),
  domainVecType_(domainVecType),
  lowestLocalCol_(lowestLocalCol),
  highestLocalCol_(-1),
  range_(range),
  internalRange_(range.getBlock(0)),
  bcRange_(range.getBlock(1)),
  isBCRow_(isBCRow),
  rangeVecType_(rangeVecType),
  lowestLocalRow_(lowestLocalRow),
  highestLocalRow_(-1),
  blockFactory_(2),
  blockICMF_(2)
{
  /* Upper bounds: lowest local index plus the number of local elements. */
  highestLocalCol_ = lowestLocalCol_ + domain.numLocalElements();
  highestLocalRow_ = lowestLocalRow_ + range.numLocalElements();

  /* Give each block row of the factory table two columns. */
  for (int row=0; row<2; row++)
  {
    blockFactory_[row].resize(2);
  }

  /* Create the three populated blocks; (1,0) is intentionally left empty. */
  blockFactory_[0][0] 
    = rangeVecType_.createMatrixFactory(internalDomain_, internalRange_);
  blockFactory_[0][1] 
    = rangeVecType_.createMatrixFactory(bcDomain_, internalRange_);
  blockFactory_[1][1] 
    = rangeVecType_.createMatrixFactory(bcDomain_, bcRange_);

  /* Give each block row of the ICMF table two columns. */
  for (int row=0; row<2; row++)
  {
    blockICMF_[row].resize(2);
  }

  /* Cache a downcast pointer for every populated block. Starting the
   * column index at the row index visits (0,0), (0,1), (1,1) in that
   * order and never touches (1,0). */
  for (int row=0; row<2; row++)
  {
    for (int col=row; col<2; col++)
    {
      IncrementallyConfigurableMatrixFactory* icmf 
        = dynamic_cast<IncrementallyConfigurableMatrixFactory*>(
          blockFactory_[row][col].get());
      TEST_FOR_EXCEPTION(icmf==0, std::runtime_error,
        "block(" << row << ", " << col << ") is not an ICMF");
      blockICMF_[row][col] = icmf;
    }
  }
}
int main(int argc, char *argv[]) 
{
  int stat = 0;
  try
    {
      GlobalMPISession session(&argc, &argv);
 
      /* create a distributed vector space for the multivector's vectors */
      VectorType<double> rowType = new EpetraVectorType();
      int nLocalRows = 2;
      VectorSpace<double> space 
        = rowType.createEvenlyPartitionedSpace(MPIComm::world(), nLocalRows);
      
      /* create a replicated vector space for the small space of columns */
      int nVecs = 3;
      VectorType<double> colType = new SerialVectorType();
      VectorSpace<double> replSpace 
        = colType.createEvenlyPartitionedSpace(MPIComm::world(), nVecs);

      /* create some random vectors */
      Teuchos::Array<Vector<double> > vecs(nVecs);
      for (int i=0; i<nVecs; i++)
      {
        vecs[i] = space.createMember();
        vecs[i].randomize();
      }

      /* Test multiplication by a multivector operator. We will compute
       * y1 by directly summing columns, and y2 by applying the operator */
      LinearOperator<double> A = multiVectorOperator<double>(vecs, replSpace);

      
      Vector<double> y1 = space.createMember();
      Vector<double> y2 = space.createMember();
      y1.zero(); 
      y2.zero(); 
      

      /* Sum columns, putting the weights into x */
      Vector<double> x = replSpace.createMember();

      Out::os() << "A=" << A << std::endl;
      
      for (int j=0; j<replSpace.numLocalElements(); j++)
        {
          
          x[j] = 2.0*(drand48()-0.5);
          y1 = y1 + x[j] * vecs[j];
          Out::os() << "x[" << j << "]=" << x[j] << std::endl;
          Out::os() << "vecs[j]=" << vecs[j] << std::endl;
        }  
      Out::os() << "y1=" << std::endl << y1 << std::endl;
      
      /* Apply the operator to the vector of weights */
      y2 = A * x;
      Out::os() << "y2=A*x=" << std::endl << y2 << std::endl;

      Vector<double> y12 = y1-y2;
      Vector<double> y21 = y2-y1;
      Out::os() << "y1-y2=" << std::endl << y12 << std::endl;
      Out::os() << "y2-y1=" << std::endl << y21 << std::endl;
      
      double errA = (y1-y2).norm2();

      Out::root() << "error in A*x = " << errA << std::endl;


      /* Now test z = A^T * y */
      LinearOperator<double> At = A.transpose();
      
      Vector<double> z1 = replSpace.createMember();
      z1.zero();
      Vector<double> z2 = replSpace.createMember();
      z2.zero();

      Vector<double> y = y1.copy();

      /* compute by vectorwise multiplication */
      for (int j=0; j<replSpace.numLocalElements(); j++)
      {
        z1[j] = vecs[j].dot(y);
      }
      /* compute with operator */
      z2 = At * y;
      

      double errAt = (z1-z2).normInf();
      Out::root() << "error in At*y = " << errA << std::endl;

      double tol = 1.0e-13;
      bool pass = errA + errAt < tol;
      pass = globalAnd(pass);
      if (pass)
        {
          Out::root() << "multivector op test PASSED" << std::endl;
        }
      else
        {
          stat = -1;
          Out::root() << "multivector op test FAILED" << std::endl;
        }
    }
  catch(std::exception& e)
    {
      stat = -1;
      std::cerr << "Caught exception: " << e.what() << std::endl;
    }
  return stat;
}
/*
 * Compare the gradient returned by evalGradient() against a central
 * finite-difference approximation with step size h.
 *
 * For every global index of the variable vector, the owning process
 * perturbs that entry by +h and -h, the functional is re-evaluated after
 * each perturbation, and the entry is then restored. The FD derivative
 * (fPlus - fMinus)/(2h) is compared entrywise with the computed gradient
 * using the relative measure |FD - grad| / (|FD| + |grad| + 1e-14).
 *
 * Diagnostic output is written to Out::os().
 *
 * \param h finite-difference step; the result is divided by h, so it
 *          should be nonzero
 * \return the maximum relative error, reduced with a max over the
 *         communicator of the discrete function's mesh
 * \throws std::runtime_error if either ghost view pointer is null
 */
double FunctionalEvaluator::fdGradientCheck(double h) const
{
  /* Controls the verbose per-entry dump in the FD loop; it is switched
   * on before the comparison loop below. */
  bool showAll = false;
  Tabs tabs;
  /* f0 is passed to evalGradient() uninitialized; presumably it is
   * filled in by reference with the unperturbed functional value --
   * TODO confirm against evalGradient's signature. */
  double f0, fPlus, fMinus;
  Expr gradF0 = evalGradient(f0);

  FancyOStream& os = Out::os();


  /* Discrete functions underlying the variable values and the gradient */
  DiscreteFunction* df = DiscreteFunction::discFunc(varValues_);
  DiscreteFunction* dg = DiscreteFunction::discFunc(gradF0);
  Vector<double> x = df->getVector();
  /* NOTE(review): x0 is never read in this function. */
  Vector<double> x0 = x.copy();
  Vector<double> gf = dg->getVector();

  RCP<GhostView<double> > xView = df->ghostView();
  RCP<GhostView<double> > gradF0View = dg->ghostView();


  TEUCHOS_TEST_FOR_EXCEPTION(xView.get() == 0, std::runtime_error, 
    "bad pointer in FunctionalEvaluator::fdGradientCheck");
  TEUCHOS_TEST_FOR_EXCEPTION(gradF0View.get() == 0, std::runtime_error, 
    "bad pointer in FunctionalEvaluator::fdGradientCheck");

  /* nTot: global dimension; n: locally owned count; lowestIndex: first
   * locally owned global index */
  int nTot = x.space().dim();
  int n = x.space().numLocalElements();
  int lowestIndex = x.space().baseGlobalNaturalIndex();

  os << tabs << "doing fd check:  h=" << h << std::endl;
  /* FD derivative for each locally owned entry */
  Array<double> df_dx(n);

  /* Every process walks ALL global indices and calls evaluate() twice per
   * index, even when it does not own the entry. Presumably evaluate() is
   * a collective operation, so all processes must take part -- TODO
   * confirm. Only the owner perturbs and records the result. */
  int localIndex = 0;
  for (int globalIndex=0; globalIndex<nTot; globalIndex++)
  {
    double tmp=0.0;
    bool isLocal = globalIndex >= lowestIndex 
      && globalIndex < (lowestIndex+n);
    if (isLocal)
    {
      /* Save the original entry, then step forward by h */
      tmp = xView->getElement(globalIndex);
      loadable(x)->setElement(globalIndex, tmp + h);
    }

    df->setVector(x);
    fPlus = evaluate();
    if (isLocal)
    {
      /* Step backward by h from the saved original value */
      loadable(x)->setElement(globalIndex, tmp - h);
    }

    df->setVector(x);
    fMinus = evaluate();
      
    if (isLocal)
    {
      /* Central difference: (f(x+h) - f(x-h)) / (2h) */
      df_dx[localIndex] = (fPlus - fMinus)/2.0/h;
      os << "g=" << setw(5) << globalIndex << ", l=" << setw(5) << localIndex << " f0="
         << setw(12) << f0 
         << " fPlus=" << setw(12) << fPlus << " fMinus=" << setw(12) << fMinus << " df_dx="
         << setw(12) << df_dx[localIndex] << std::endl;
      if (showAll)
      {
        os << "i " << globalIndex << " x_i=" << tmp 
           << " f(x)=" << f0 
           << " f(x+h)=" << fPlus 
           << " f(x-h)=" << fMinus << std::endl;
      }
      /* Restore the entry to its original value */
      loadable(x)->setElement(globalIndex, tmp);
      localIndex++;
    }
    /* Push the (restored) vector back before moving to the next index */
    df->setVector(x);
  }
  
  double localMaxErr = 0.0;

  /* From here on, always print the per-entry comparison */
  showAll = true;
  VectorSpace<double> space = x.space();
  for (int i=0; i<space.numLocalElements(); i++)
  {
    /* Relative error; the 1.0e-14 term keeps the denominator away from
     * zero when both values vanish. */
    double num =  fabs(df_dx[i]-gf[i]);
    double den = fabs(df_dx[i]) + fabs(gf[i]) + 1.0e-14;
    double r = 0.0;
    /* Because den includes +1.0e-14, this test can only fail when den is
     * NaN; in that case the error is reported as 1.0. */
    if (fabs(den) > 1.0e-16) r = num/den;
    else r = 1.0;
    if (showAll)
    {
      os << "i " << i;
      os << " FD=" << df_dx[i] 
         << " grad=" << gf[i]
         << " r=" << r << std::endl;
    }
    if (localMaxErr < r) localMaxErr = r;
  }
  os << "local max error = " << localMaxErr << std::endl;
  
  /* Max-reduce the local errors over the mesh communicator */
  double maxErr = localMaxErr;
  df->mesh().comm().allReduce((void*) &localMaxErr, (void*) &maxErr, 1, 
    MPIDataType::doubleType(), MPIOp::maxOp());
  os << tabs << "fd check: max error = " << maxErr << std::endl;

  return maxErr;
}