/*
 * Fill jBatch with Jacobian data for the requested cells.
 *
 * cellDim  dimension of the cells being queried; must lie in
 *          [0, spatialDim()], otherwise std::logic_error is thrown.
 * cellLID  local IDs of the cells; exactly one entry is accepted.
 * jBatch   output batch, resized to (cellLID.size(), spatialDim(), cellDim).
 *
 * For cellDim==0 the determinant entry of each cell is set to 1.0; for any
 * other dimension the first Jacobian entry is set to |xMin_ - xMax_|.
 */
void PeriodicSingleCellMesh1D::getJacobians(int cellDim, const Array<int>& cellLID,
    CellJacobianBatch& jBatch) const
{
  TEUCHOS_TEST_FOR_EXCEPTION(cellDim < 0 || cellDim > spatialDim(), std::logic_error,
    "cellDim=" << cellDim 
    << " is not in expected range [0, " << spatialDim()
    << "]");

  int nCells = cellLID.size();
  jBatch.resize(nCells, spatialDim(), cellDim);

  /* the batch is resized first, then the single-cell restriction is enforced */
  TEUCHOS_TEST_FOR_EXCEPT(nCells != 1);

  if (cellDim != 0)
  {
    /* line cell: the only Jacobian entry is the extent of the interval */
    for (int cell=0; cell<nCells; ++cell)
    {
      jBatch.jVals(cell)[0] = fabs(xMin_-xMax_);
    }
    return;
  }

  /* point cell: unit determinant */
  for (int cell=0; cell<nCells; ++cell)
  {
    *(jBatch.detJ(cell)) = 1.0;
  }
}
/*
 * Build the one-form transformation coefficients for direction alpha().
 *
 * For every cell c of JTrans and every gamma in [0, JTrans.cellDim()) the
 * entry G(alpha())[c*cellDim + gamma] is set to
 *     |JVol.detJ()[c]| * invJ[alpha() + cellDim*gamma]
 * where invJ is whatever JTrans.getInvJ(c, .) returns for that cell.
 *
 * The work is skipped when transformationMatrixIsValid(alpha()) is already
 * set; otherwise the flag is set before the coefficients are computed.
 */
void ElementIntegral
::createOneFormTransformationMatrix(const CellJacobianBatch& JTrans,
                                    const CellJacobianBatch& JVol) const
{
    TimeMonitor timer(transCreationTimer());
    Tabs tab;
    SUNDANCE_MSG2(transformVerb(),
                  tab << "ElementIntegral creating linear form trans matrices");

    /* already built for this direction: nothing to do */
    if (transformationMatrixIsValid(alpha())) return;
    transformationMatrixIsValid(alpha()) = true;

    const int nCells = JTrans.numCells();
    const int cellDim = JTrans.cellDim();
    const int dir = alpha();

    /* flop count reported once the coefficients have been filled in */
    const int flops = nCells * cellDim + nCells;

    G(alpha()).resize(nCells * cellDim);

    /* entries are written consecutively: cell-major, gamma-minor */
    double* out = &(G(alpha())[0]);

    for (int cell=0; cell<nCells; cell++)
    {
        Array<double> invJ;
        JTrans.getInvJ(cell, invJ);
        const double absDetJ = fabs(JVol.detJ()[cell]);
        for (int gamma=0; gamma<cellDim; gamma++)
        {
            *out++ = absDetJ*invJ[dir + cellDim * gamma];
        }
    }

    addFlops(flops);
}
void CubicHermite::postApplyTransformationTriangle( const Mesh &mesh,
						    const Array<int> &cellLIDs , 
						    const CellJacobianBatch& JVol,
						    RCP<Array<double> >& A ) const
{
  Array<double> &Anoptr = *A;
  
  Array<double> cellVertH;
  getVertexH( mesh , cellLIDs , cellVertH );
  
  
  const int numRows = Anoptr.size() / 10 / JVol.numCells();
  
  for (int i=0;i<JVol.numCells();i++) 
    {
      const double *invJ = JVol.jVals(i);
      
      const int cell_start = i * numRows * 10;
      // handle columns 1 and 2
      for (int j=0;j<numRows;j++) 
	{
	  const double a = Anoptr[cell_start + numRows + j];
	  const double b = Anoptr[cell_start + 2*numRows + j];
	  Anoptr[cell_start + numRows + j] = (invJ[0] * a + invJ[1] * b)/cellVertH[3*i];
	  Anoptr[cell_start + 2*numRows + j] = (invJ[2] * a + invJ[3] * b)/cellVertH[3*i];
	}
      
      // handle columns 4 and 5
      for (int j=0;j<numRows;j++) 
	{
	  const double a = Anoptr[cell_start + 4*numRows + j];
	  const double b = Anoptr[cell_start + 5*numRows + j];
	  Anoptr[cell_start + 4*numRows + j] = (invJ[0] * a + invJ[1] * b)/cellVertH[3*i+1];
	  Anoptr[cell_start + 5*numRows + j] = (invJ[2] * a + invJ[3] * b)/cellVertH[3*i+1];
	}
      
      // handle columns 7 and 8
      for (int j=0;j<numRows;j++) 
	{
	  const double a = Anoptr[cell_start + 7*numRows + j];
	  const double b = Anoptr[cell_start + 8*numRows + j];
	  Anoptr[cell_start + 7*numRows + j] = (invJ[0] * a + invJ[1] * b)/cellVertH[3*i+2];
	  Anoptr[cell_start + 8*numRows + j] = (invJ[2] * a + invJ[3] * b)/cellVertH[3*i+2];
	}
      
    }
  
  return;
}
/*
 * Evaluate the quadrature sum
 *     sum_q QW_(q) * coeff[c*nqp + q] * DivV_(bf, q)
 * for every cell c in JTrans and every test basis function bf.
 *
 * JTrans  only its cell count is used; the values themselves do not enter
 *         the sum.
 * coeff   coefficient values, laid out cell-major with nqp entries per cell.
 * A       array handed to the field container that receives the results.
 */
void IQI_HdivLF_DivV_Cell::evaluate( CellJacobianBatch& JTrans,
				     const double* const coeff,
				     RefCountPtr<Array<double> >& A) const
{
  const int nqp = quad().getNumPoints( maxCellType() );
  const int ncell = JTrans.numCells();
  const int nbf = testBasis().nReferenceDOFs(maxCellType(),maxCellType());

  // Wrap A into a rank 2 field container indexed as (cell, basis function).
  // The leading extent must be the number of cells: the loop below addresses
  // AFC(c,bf) with c in [0,ncell), so sizing it by the number of quadrature
  // points gives the wrong shape and goes out of range when ncell > nqp.
  Teuchos::Array<int> Aindices(2);
  Aindices[0] = ncell;
  Aindices[1] = nbf;
  // NOTE(review): depending on which FieldContainer constructor this call
  // resolves to, AFC may hold a copy of *A rather than a view of it --
  // confirm that the values written below actually reach A.
  Intrepid::FieldContainer<double> AFC(Aindices,*A);

  /* this surprisingly doesn't depend on the Jacobian ! */

  for (int c=0;c<ncell;c++) {
    for (int bf=0;bf<nbf;bf++) {
      AFC(c,bf) = 0.0;
      for (int q=0;q<nqp;q++) {
	AFC(c,bf) += QW_(q) * coeff[c*nqp+q] * DivV_(bf,q );
      }
    }
  }

  return;
}
void CubicHermite::preApplyTransformationTriangle( const Mesh &mesh, 
						   const Array<int> &cellLIDs,
						   const CellJacobianBatch& JVol,
						   RCP<Array<double> >& A) const
{
  Array<double> &Anoptr = *A;
  
  Array<double> cellVertH;
  getVertexH( mesh , cellLIDs , cellVertH );
  
  
  // this applies M from the left on each cell
  // A for each cell has 10 rows because it is Hermite
  // so, this gives the number of columns per cell
  const int numCols = Anoptr.size() / JVol.numCells() / 10;
  
  for (int i=0;i<JVol.numCells();i++) 
    {
      const int cell_start = i * numCols * 10;
      const double *invJ = JVol.jVals(i);
      
      for (int j=0;j<numCols;j++) 
	{
	  const int col_start = cell_start + 10 * j;
	  const double a1 = Anoptr[col_start + 1];
	  const double a2 = Anoptr[col_start + 2];
	  const double a4 = Anoptr[col_start + 4];
	  const double a5 = Anoptr[col_start + 5];
	  const double a7 = Anoptr[col_start + 7];
	  const double a8 = Anoptr[col_start + 8];
	  Anoptr[col_start+1] = (invJ[0]*a1 + invJ[1]*a2)/cellVertH[3*i];
	  Anoptr[col_start+2] = (invJ[2]*a1 + invJ[3]*a2)/cellVertH[3*i];
	  Anoptr[col_start+4] = (invJ[0]*a4 + invJ[1]*a5)/cellVertH[3*i+1];
	  Anoptr[col_start+5] = (invJ[2]*a4 + invJ[3]*a5)/cellVertH[3*i+1];
	  Anoptr[col_start+7] = (invJ[0]*a7 + invJ[1]*a8)/cellVertH[3*i+2];
	  Anoptr[col_start+8] = (invJ[2]*a7 + invJ[3]*a8)/cellVertH[3*i+2];
	}
    }
  
}
/*
 * Test driver: for Lagrange bases of order 1..pMax on a single triangle,
 * compute one two-form with a RefIntegral and two two-forms (direction
 * pairs (0,0) and (1,1)) with QuadratureIntegral objects using unit
 * coefficients, then print the three resulting local matrices to std::cerr.
 *
 * Exceptions derived from std::exception are caught and their message is
 * printed; the function does not set an explicit return value.
 *
 * NOTE(review): isInternalBdry is not declared inside this function; it is
 * expected to come from an enclosing scope that is not visible here.
 */
int main(int argc, char** argv)
{
  
  try
    {
      GlobalMPISession session(&argc, &argv);

      TimeMonitor t(totalTimer());

      int pMax = 2;
      int dim=2;

      verbosity<RefIntegral>() = 0;

      CellType cellType = TriangleCell;

      /* vertices of the triangle used for the single-cell Jacobian */
      Point a = Point(0.0, 0.0);
      Point b = Point(1.0, 0.0);
      Point c = Point(0.0, 1.0);

//       Point a = Point(1.0, 1.0);
//       Point b = Point(1.2, 1.6);
//       Point c = Point(0.8, 1.3);

      /* 2x2 Jacobian built from the edge vectors b-a and c-a */
      CellJacobianBatch JBatch;
      JBatch.resize(1, dim, dim);
      double* J = JBatch.jVals(0);
      J[0] = b[0] - a[0];
      J[1] = c[0] - a[0];
      J[2] = b[1] - a[1];
      J[3] = c[1] - a[1];

      QuadratureFamily q4 = new GaussianQuadrature(4);
          
      for (int p=1; p<=pMax; p++)
        {
          std::cerr << std::endl << "---------- p = " << p << " --------------" << std::endl;
          Tabs tab;
          BasisFamily P = new Lagrange(p);
      
          /* output arrays: A for the reference integral, Bxx/Byy for quadrature */
          RCP<Array<double> > A = rcp(new Array<double>());
          RCP<Array<double> > Bxx = rcp(new Array<double>());
          RCP<Array<double> > Byy = rcp(new Array<double>());

          Array<double> constCoeff = tuple(1.0, 1.0);

          Array<int> alpha = tuple(0,1);
          Array<int> beta = tuple(0,1);
          ParametrizedCurve curve = new DummyParametrizedCurve();
          MeshType meshType = new BasicSimplicialMeshType();
          MeshSource mesher = new PartitionedLineMesher(0.0, 1.0, 10, meshType);
          Mesh mesh = mesher.getMesh();
          QuadratureFamily quad_1 = new GaussianQuadrature(2);
          /* left unset; passed through to the transform calls as-is */
          RCP<Array<int> > cellLIDs;

          RefIntegral ref(dim, dim, cellType, P, alpha, 1, P, beta,quad_1, 1, isInternalBdry , curve , mesh);
          QuadratureIntegral qxx(dim, dim, cellType, P, tuple(0), 1, 
                                 P, tuple(0), 1, q4, curve , mesh,isInternalBdry);
          QuadratureIntegral qyy(dim, dim, cellType, P, tuple(1), 1, 
                                 P, tuple(1), 1, q4, curve , mesh,isInternalBdry);

          /* unit coefficient at every quadrature point */
          int nq = qxx.nQuad();
          Array<double> varCoeff(nq, 1.0);

          std::cerr << "============================= Doing reference integration =========== " << std::endl;

          ref.transformTwoForm(JBatch, constCoeff, cellLIDs, A);
          std::cerr << "============================= Doing quad integration xx =========== " << std::endl;
          qxx.transformTwoForm(JBatch, &(varCoeff[0]), cellLIDs , Bxx);
          std::cerr << "============================= Doing quad integration yy =========== " << std::endl;
          qyy.transformTwoForm(JBatch, &(varCoeff[0]), cellLIDs , Byy);

          std::cerr << "============================= Done integration =========== " << std::endl;

          /* each matrix is printed row by row; entry (r,c) is read at r + nNodesTest*c */
          std::cerr << tab << "transformed reference element" << std::endl;
          std::cerr << tab << "{";
          for (int r=0; r<ref.nNodesTest(); r++)
            {
              if (r!=0) std::cerr << ", ";
              std::cerr << "{";
              for (int c=0; c<ref.nNodesUnk(); c++)
                {
                  if (c!=0) std::cerr << ", ";
                  std::cerr << (*A)[r + ref.nNodesTest()*c];
                }
              std::cerr << "}";
            }
          std::cerr << "}" << std::endl;

          std::cerr << tab << "transformed Q_xx" << std::endl;
          std::cerr << tab << "{";
          for (int r=0; r<qxx.nNodesTest(); r++)
            {
              if (r!=0) std::cerr << ", ";
              std::cerr << "{";
              for (int c=0; c<qxx.nNodesUnk(); c++)
                {
                  if (c!=0) std::cerr << ", ";
                  int i = r + qxx.nNodesTest()*c;
                  double lapl = (*Bxx)[i];
                  std::cerr << lapl;
                }
              std::cerr << "}";
            }
          std::cerr << "}" << std::endl;

          /* Byy was produced by qyy, so its own node counts give the
             loop bounds and the column stride */
          std::cerr << tab << "transformed Q_yy" << std::endl;
          std::cerr << tab << "{";
          for (int r=0; r<qyy.nNodesTest(); r++)
            {
              if (r!=0) std::cerr << ", ";
              std::cerr << "{";
              for (int c=0; c<qyy.nNodesUnk(); c++)
                {
                  if (c!=0) std::cerr << ", ";
                  int i = r + qyy.nNodesTest()*c;
                  double lapl = (*Byy)[i];
                  std::cerr << lapl;
                }
              std::cerr << "}";
            }
          std::cerr << "}" << std::endl;
        }

      TimeMonitor::summarize();
    }
  catch(std::exception& e)
    {
      std::cerr << e.what() << std::endl;
    }
}
void PeanoMesh3D::getJacobians(int cellDim, const Array<int>& cellLID,
                          CellJacobianBatch& jBatch) const
{
	  //printf("cellDim:%d  _uniqueResolution:%f ",cellDim, _uniqueResolution);
	  SUNDANCE_VERB_HIGH("getJacobians()");
	  TEUCHOS_TEST_FOR_EXCEPTION(cellDim < 0 || cellDim > spatialDim(), std::logic_error,
	    "cellDim=" << cellDim << " is not in expected range [0, " << spatialDim() << "]");
	  int nCells = cellLID.size();
 	  int tmp_index , tmp;
 	  int tmp_index1 , tmp_index2;
 	  Point pnt(0.0,0.0,0.0);
 	  Point pnt1(0.0,0.0,0.0);
 	  Point pnt2(0.0,0.0,0.0);
	  jBatch.resize(cellLID.size(), spatialDim(), cellDim);
	  if (cellDim < spatialDim()) // they need the Jacobian of a lower dinemsional element
	  {
		  //printf("PeanoMesh3D::getJacobians() cellDim < spatialDim() \n");
		   for (int i=0; i<nCells; i++)
		    {
		      //printf("PeanoMesh3D::getJacobian() cellDim < spatialDim() cellDim:%d , ret:%f \n",cellDim , _uniqueResolution);
		      double* detJ = jBatch.detJ(i);
		      switch(cellDim)
		      {
		        case 0: *detJ = 1.0;
		          break;
		        case 1:
			    	 tmp_index = this->facetLID(cellDim,  cellLID[i] , 0 , 0 , tmp );
			    	 tmp_index1= this->facetLID(cellDim,  cellLID[i] , 0 , 1 , tmp );
			    	 pnt = nodePosition(tmp_index);
			    	 pnt1 = nodePosition(tmp_index1);
			    	 pnt = pnt1 - pnt;
		             *detJ = sqrt(pnt * pnt); // the length of the edge
		        break;
		        case 2:{
			    	 tmp_index = this->facetLID(cellDim,  cellLID[i] , 0 , 0 , tmp );
			    	 tmp_index1= this->facetLID(cellDim,  cellLID[i] , 0 , 1 , tmp );
			    	 tmp_index2= this->facetLID(cellDim,  cellLID[i] , 0 , 2 , tmp );
			    	 pnt = nodePosition(tmp_index);
			    	 pnt1 = nodePosition(tmp_index1);
			    	 pnt2 = nodePosition(tmp_index2);
			    	 Point directedArea = cross( pnt1 - pnt , pnt2 - pnt );
		             *detJ = sqrt(directedArea * directedArea); // the are of the face
		        break;}
		        default:
		          TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "impossible switch value "
		            "cellDim=" << cellDim << " in PeanoMesh3D::getJacobians()");
		      }
		    }
	  }else{ // they request the complete Jacoby matrix for this bunch of elements
		    //Array<double> J(cellDim*cellDim);
		    SUNDANCE_VERB_HIGH("cellDim == spatialDim()");
		    for (unsigned int i=0; i<(unsigned int)cellLID.size(); i++)
		    {
			  //printf("PeanoMesh3D::getJacobian() cellDim == spatialDim() cellDim:%d , ret:%f \n",cellDim , _uniqueResolution);
		      double* J = jBatch.jVals(i);
		      switch(cellDim)
		      {
		        case 3:
		          J[0] = _peanoMesh->returnResolution(0);
		          J[1] = 0.0; J[2] = 0.0; J[3] = 0.0;
		          J[4] = _peanoMesh->returnResolution(1);
		          J[5] = 0.0; J[6] = 0.0; J[7] = 0.0;
		          J[8] = _peanoMesh->returnResolution(2); // the Jacobi of the tet
		        break;
		        default:
		          TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "impossible switch value "
		            "cellDim=" << cellDim
		            << " in PeanoMesh3D::getJacobians()");
		      }
		    }
	  }
}
/*
 * Test driver for quadrature-based element integrals on a single triangle.
 *
 * 1. Evaluates two Lagrange(1) basis objects and their x/y derivatives at
 *    the three triangle vertices and prints the results.
 * 2. Builds a coefficient f(x,y) = x*y sampled at the points of a
 *    GaussianQuadrature(4) rule.
 * 3. For every admissible combination of basis order and derivative
 *    order/direction, computes the one-form and the two-form with a
 *    QuadratureIntegral and prints the local vector / matrix to std::cerr.
 *
 * Exceptions derived from std::exception are caught and their message is
 * printed; the function does not set an explicit return value.
 */
int main(int argc, char** argv)
{
  
  try
  {
    GlobalMPISession session(&argc, &argv);

    TimeMonitor t(totalTimer());

    int pMax = 1;
    int dim=2;

    CellType cellType = TriangleCell;

    /* triangle vertices and the 2x2 Jacobian built from b-a and c-a */
    Point a = Point(0.0, 0.0);
    Point b = Point(1.0, 0.0);
    Point c = Point(0.0, 1.0);
    CellJacobianBatch JBatch;
    JBatch.resize(1, 2, 2);
    double* J = JBatch.jVals(0);
    J[0] = b[0] - a[0];
    J[1] = c[0] - a[0];
    J[2] = b[1] - a[1];
    J[3] = c[1] - a[1];


    bool isInternalBdry=false;

    /* ------ evaluate Lagrange and FIAT-Lagrange at the vertices */
    /* NOTE(review): both objects are constructed as Lagrange(1) here, so the
     * two sets of printed values come from the same basis class. */
    Array<Point> verts = tuple(a,b,c);
    BasisFamily lagrange = new Lagrange(1);
    BasisFamily fiatLagrange = new Lagrange(1);
      
    /* multi-indices selecting the value, the x-derivative and the y-derivative */
    MultiIndex d0(0,0,0);
    MultiIndex dx(1,0,0);
    MultiIndex dy(0,1,0);

    Array<Array<Array<double> > > result;

    Array<int> dummy;

    std::cerr << "------ Evaluating bases at vertices ----------" << std::endl
         << std::endl;

    std::cerr << "Evaluating phi(vert) with FIAT-Lagrange" << std::endl;
    fiatLagrange.ptr()->refEval(cellType, verts, d0, result);
    std::cerr << "results = " << result << std::endl << std::endl;

    std::cerr << "Evaluating phi(vert) with Lagrange" << std::endl;
    lagrange.ptr()->refEval(cellType, verts, d0, result);
    std::cerr << "results = " << result << std::endl << std::endl;

    std::cerr << std::endl ;

    std::cerr << "Evaluating Dx*phi(vert) with FIAT-Lagrange" << std::endl;
    fiatLagrange.ptr()->refEval(cellType, verts, dx, result);
    std::cerr << "results = " << result << std::endl << std::endl;

    std::cerr << "Evaluating Dx*phi(vert) with Lagrange" << std::endl;
    lagrange.ptr()->refEval(cellType, verts, dx, result);
    std::cerr << "results = " << result << std::endl << std::endl;

    std::cerr << std::endl ;
      
    std::cerr << "Evaluating Dy*phi(vert) with FIAT-Lagrange" << std::endl;
    fiatLagrange.ptr()->refEval(cellType, verts, dy, result);
    std::cerr << "results = " << result << std::endl << std::endl;

    std::cerr << "Evaluating Dy*phi(vert) with Lagrange" << std::endl;
    lagrange.ptr()->refEval(cellType, verts, dy, result);
    std::cerr << "results = " << result << std::endl << std::endl;

      

    /* --------- evaluate integrals over elements ----------- */
      
    RCP<Array<double> > A = rcp(new Array<double>());
          
    QuadratureFamily quad = new GaussianQuadrature(4);
    Array<double> quadWeights;
    Array<Point> quadPts;
    quad.getPoints(cellType, quadPts, quadWeights);
    int nQuad = quadPts.size();

    /* sample f(x,y) = x*y at the quadrature points, mapping each point
     * (s,t) to (x,y) through a + J*(s,t) */
    Array<double> coeff(nQuad);
    for (int i=0; i<nQuad; i++) 
    {
      double s = quadPts[i][0];
      double t = quadPts[i][1];
      double x = a[0] + J[0]*s + J[1]*t;
      double y = a[1] + J[2]*s + J[3]*t;
      coeff[i] = x*y;
    }
    const double* const f = &(coeff[0]);

    std::cerr << std::endl << std::endl 
         << "---------------- One-forms --------------------" 
         << std::endl << std::endl;
    for (int p=1; p<=pMax; p++)
    {
      BasisFamily P = new Lagrange(p);
      for (int dp=0; dp<=1; dp++)
      {
        /* derivative order may not exceed the basis order */
        if (dp > p) continue;
        Tabs tab0;
        std::cerr << tab0 << "test function deriv order = " << dp << std::endl;
        /* a derivative has one direction per spatial dimension */
        int numTestDir = 1;
        if (dp==1) numTestDir = dim;
        for (int t=0; t<numTestDir; t++)
        {
          int alpha = t;
          Tabs tab;
          QuadratureIntegral ref(dim, cellType, dim, cellType, P, alpha, dp, quad, isInternalBdry);
          A->resize(ref.nNodesTest());
          ref.transformOneForm(JBatch, JBatch, dummy, f, A);
          std::cerr << tab << "test deriv direction =" << t << std::endl;
          std::cerr << tab << "transformed local vector: " << std::endl;
          std::cerr << tab << "{";
          for (int r=0; r<ref.nNodesTest(); r++)
          {
            if (r!=0) std::cerr << ", ";
            std::cerr << (*A)[r];
          }
          std::cerr << "}" << std::endl << std::endl;
        }
      }
    }

    std::cerr << std::endl << std::endl 
         << "---------------- Two-forms --------------------" 
         << std::endl << std::endl;
    for (int p=1; p<=pMax; p++)
    {
      BasisFamily P = new Lagrange(p);
      for (int q=1; q<=pMax; q++)
      {
        BasisFamily Q = new Lagrange(q);
        for (int dp=0; dp<=1; dp++)
        {
          if (dp > p) continue;
          Tabs tab0;
          std::cerr << tab0 << "test function deriv order = " << dp << std::endl;
          for (int dq=0; dq<=1; dq++)
          {
            if (dq > q) continue;
            Tabs tab1;
            std::cerr << tab1 
                 << "unk function deriv order = " << dq << std::endl;
            int numTestDir = 1;
            if (dp==1) numTestDir = dim;
            for (int t=0; t<numTestDir; t++)
            {
              int alpha = t;
              int numUnkDir = 1;
              if (dq==1) numUnkDir = dim;
              for (int u=0; u<numUnkDir; u++)
              {
                Tabs tab;
                int beta = u;
                /* same quadrature family as the one-form section; the
                 * identifier previously written here was not declared */
                QuadratureIntegral ref(dim, cellType, dim, cellType, P, alpha, 
                  dp, Q, beta, dq, quad, isInternalBdry);
                A->resize(ref.nNodesTest()*ref.nNodesUnk());
                ref.transformTwoForm(JBatch, JBatch, dummy, f, A);

                std::cerr << tab << "test deriv direction =" << 
                  t << ", unk deriv direction =" << u << std::endl;
                std::cerr << tab << "transformed local stiffness matrix" << std::endl;
                std::cerr << tab << "{";

                /* entry (r,c) is read at r + nNodesTest*c */
                for (int r=0; r<ref.nNodesTest(); r++)
                {
                  if (r!=0) std::cerr << ", ";
                  std::cerr << "{";
                  for (int c=0; c<ref.nNodesUnk(); c++)
                  {
                    if (c!=0) std::cerr << ", ";
                    std::cerr << chop((*A)[r + ref.nNodesTest()*c]);
                  }
                  std::cerr << "}";
                }
                std::cerr << "}" << std::endl << std::endl;
              }
            }
          }
        }
      }
    }
    TimeMonitor::summarize();

  }
  catch(std::exception& e)
  {
    std::cerr << e.what() << std::endl;
  }
}
/*
 * Test driver comparing RefIntegral against QuadratureIntegral on a batch
 * of two triangles.
 *
 * For every admissible combination of Lagrange basis order (0..pMax) and
 * derivative order/direction, the one-form and two-form are computed once
 * with a RefIntegral (constant coefficient 1.0) and once with a
 * QuadratureIntegral (coefficient 1.0 at every quadrature point). Both
 * results and their absolute difference are printed to std::cerr; any
 * chopped difference above 1.0e-14 counts the combination as an error.
 *
 * Returns 0 when no mismatches were found, -1 when at least one mismatch
 * was found or a std::exception was caught.
 */
int main(int argc, char** argv)
{
  int stat = 0;
  int verb=1;
  try
  {
    GlobalMPISession session(&argc, &argv);

    TimeMonitor t(totalTimer());

    int pMax = 2;
    int dim=2;

    bool isInternalBdry = false;

    Utils::setChopVal(1.0e-14);

    CellType cellType = TriangleCell;

    //       Point a = Point(1.0, 1.0);
    //       Point b = Point(1.2, 1.6);
    //       Point c = Point(0.8, 1.3);

    /* vertices of the first triangle */
    Point a = Point(0.0, 0.0);
    Point b = Point(1.0, 0.0);
    Point c = Point(0.0, 1.0);

    /* vertices of the second triangle */
    Point d = Point(0.1, 0.1);
    Point e = Point(1.0, 0.0);
    Point f = Point(0.0, 1.0);

    int nCells = 2;

    /* Jacobians of both cells are written through the pointer obtained for
     * cell 0: entries 0-3 for the first triangle, 4-7 for the second.
     * NOTE(review): this assumes the batch stores cells contiguously --
     * confirm against CellJacobianBatch. */
    CellJacobianBatch JBatch;
    JBatch.resize(nCells, 2, 2);
    double* J = JBatch.jVals(0);
    J[0] = b[0] - a[0];
    J[1] = c[0] - a[0];
    J[2] = b[1] - a[1];
    J[3] = c[1] - a[1];

    J[4] = e[0] - d[0];
    J[5] = f[0] - d[0];
    J[6] = e[1] - d[1];
    J[7] = f[1] - d[1];


      
    Array<int> dummy;
    double coeff = 1.0;
    /* A receives the reference-integral results, B the quadrature results */
    RCP<Array<double> > A = rcp(new Array<double>());
    RCP<Array<double> > B = rcp(new Array<double>());

    QuadratureFamily q4 = new GaussianQuadrature(4);

    /* number of (basis, derivative) combinations where quad and ref disagree */
    int nErrors = 0;

    std::cerr << std::endl << std::endl 
         << "---------------- One-forms --------------------" 
         << std::endl << std::endl;
    for (int p=0; p<=pMax; p++)
    {
      BasisFamily P = new Lagrange(p);
      for (int dp=0; dp<=1; dp++)
      {
        /* derivative order may not exceed the basis order */
        if (dp > p) continue;

        /* a derivative has one direction per spatial dimension */
        int numTestDir = 1;
        if (dp==1) numTestDir = dim;
        for (int t=0; t<numTestDir; t++)
        {
          int alpha = t;
          Tabs tab;

          /* curve, mesh and cellLIDs are rebuilt for every combination;
           * cellLIDs is left unset and passed through as-is */
          ParametrizedCurve curve = new DummyParametrizedCurve();
          MeshType meshType = new BasicSimplicialMeshType();
          MeshSource mesher = new PartitionedLineMesher(0.0, 1.0, 10, meshType);
          Mesh mesh = mesher.getMesh();
          RCP<Array<int> > cellLIDs;

          /* reference-integral result, zeroed before the transform call */
          RefIntegral ref(dim, cellType, dim, cellType, P, alpha, dp, q4 , isInternalBdry, curve, mesh ,verb);
          A->resize(JBatch.numCells() * ref.nNodes());
          for (int ai=0; ai<A->size(); ai++) (*A)[ai]=0.0;
          ref.transformOneForm(JBatch, JBatch, dummy, cellLIDs , coeff, A);
          std::cerr << tab << "transformed reference element" << std::endl;
          if (dp>0) std::cerr << tab << "test diff direction=" << t << std::endl;
          for (int cell=0; cell<nCells; cell++)
          {
            std::cerr << tab << "{";
            for (int r=0; r<ref.nNodesTest(); r++)
            {
              if (r!=0) std::cerr << ", ";
              std::cerr << Utils::chop((*A)[cell*ref.nNodesTest()+r]);
            }
            std::cerr << "}" << std::endl;
          }
          /* quadrature result with unit coefficients; the coefficient array
           * holds 2*nQuad entries (2 matches nCells above) */
          QuadratureIntegral quad(dim, cellType, dim, cellType, P, alpha, dp, q4, isInternalBdry, curve, mesh, verb);
          Array<double> quadCoeff(2*quad.nQuad(), 1.0);
          B->resize(JBatch.numCells() * quad.nNodes());
          for (int ai=0; ai<B->size(); ai++) (*B)[ai]=0.0;
          quad.transformOneForm(JBatch, JBatch, dummy, cellLIDs , &(quadCoeff[0]), B);
          std::cerr << tab << "transformed quad element" << std::endl;
          if (dp>0) std::cerr << tab << "test diff direction =" << t << std::endl;
          /* NOTE(review): the loop bound comes from quad but the index
           * stride from ref; presumably equal because both use basis P --
           * confirm. */
          for (int cell=0; cell<nCells; cell++)
          {
            std::cerr << tab << "{";
            for (int r=0; r<quad.nNodesTest(); r++)
            {
              if (r!=0) std::cerr << ", ";
              std::cerr << Utils::chop((*B)[cell*ref.nNodesTest()+r]);
            }
            std::cerr << "}" << std::endl;
          }

          std::cerr << tab << "MISFIT quad-ref" << std::endl;
          std::cerr << tab << "test diff order =" << dp << std::endl;
          if (dp>0) std::cerr << tab << "test diff direction =" << t << std::endl;
          /* entry-by-entry comparison of the two results */
          bool OK = true;
          for (int cell=0; cell<nCells; cell++)
          {
            std::cerr << tab << "{";
            for (int r=0; r<quad.nNodesTest(); r++)
            {
              if (r!=0) std::cerr << ", ";
              int i = cell*ref.nNodesTest()+r;
              double err = fabs(Utils::chop((*B)[i] - (*A)[i]));
              if (err > 1.0e-14) 
              {
                OK = false;
              }
              std::cerr << err;
            }
            std::cerr << "}" << std::endl;
          }
                  
          if (!OK) 
          {
            nErrors ++;
            std::cerr << "ERROR DETECTED!!! p=" << p
                 << "  t=" << t  << std::endl;
          }
        }
      }
    }
         




    std::cerr << std::endl << std::endl 
         << "---------------- Two-forms --------------------" 
         << std::endl << std::endl;

    /* loop nest: test basis order p, test derivative order dp, unknown
     * basis order q, unknown derivative order dq, then the derivative
     * directions t (test) and u (unknown) */
    for (int p=0; p<=pMax; p++)
    {
      BasisFamily P = new Lagrange(p);
      for (int dp=0; dp<=1; dp++)
      {
        if (dp > p) continue;
        int numTestDir = 1;
        if (dp==1) numTestDir = dim;
        for (int q=0; q<=pMax; q++)
        {
          BasisFamily Q = new Lagrange(q);
          for (int dq=0; dq<=1; dq++)
          {
            if (dq > q) continue;
            for (int t=0; t<numTestDir; t++)
            {
              int alpha = t;
              int numUnkDir = 1;
              if (dq==1) numUnkDir = dim;
              for (int u=0; u<numUnkDir; u++)
              {
                   ParametrizedCurve curve = new DummyParametrizedCurve();
                   MeshType meshType = new BasicSimplicialMeshType();
                   MeshSource mesher = new PartitionedLineMesher(0.0, 1.0, 10, meshType);
                   Mesh mesh = mesher.getMesh();
                   QuadratureFamily quad_1 = new GaussianQuadrature(2);
                   RCP<Array<int> > cellLIDs;

                Tabs tab;
                //                              if (p==0 || q==0 || dp==0 || dq==0 || u==1
                //  || t==1) continue;
                int beta = u;
                /* reference-integral result, zeroed before the transform call */
                RefIntegral ref(dim, cellType, dim, cellType, P, alpha,
                  dp, Q, beta, dq, quad_1 , isInternalBdry, curve , mesh , verb);
                A->resize(JBatch.numCells() * ref.nNodes());
                for (int ai=0; ai<A->size(); ai++) (*A)[ai]=0.0;
                ref.transformTwoForm(JBatch, JBatch, dummy, cellLIDs , coeff, A);
                std::cerr << tab << "transformed ref element" << std::endl;
                std::cerr << tab << "test diff order = " << dp << std::endl;
                if (dp>0) std::cerr << tab << "t=dx(" << t << ")" << std::endl;
                std::cerr << tab << "unk diff order = " << dq << std::endl;
                if (dq>0) std::cerr << tab << "u=dx(" << u << ")" << std::endl;

                /* entry (r,c) of a cell is read at
                 * r + nNodesTest*(c + cell*nNodesUnk) */
                for (int cell=0; cell<nCells; cell++)
                {
                  std::cerr << tab << "cell=" << cell << " {";
                  for (int r=0; r<ref.nNodesTest(); r++)
                  {
                    if (r!=0) std::cerr << ", ";
                    std::cerr << "{";
                    for (int c=0; c<ref.nNodesUnk(); c++)
                    {
                      if (c!=0) std::cerr << ", ";
                      std::cerr << Utils::chop((*A)[r + ref.nNodesTest()*(c + cell*ref.nNodesUnk())]);
                    }
                    std::cerr << "}";
                  }
                  std::cerr << "}" << std::endl;
                }


                /* quadrature result with unit coefficients */
                QuadratureIntegral quad(dim, cellType, dim, cellType, P, alpha,
                  dp, Q, beta, dq, q4, isInternalBdry,curve , mesh , verb);
                Array<double> quadCoeff(2*quad.nQuad(), 1.0);
                B->resize(JBatch.numCells() * quad.nNodes());
                for (int ai=0; ai<B->size(); ai++) (*B)[ai]=0.0;
                quad.transformTwoForm(JBatch, JBatch, dummy, cellLIDs , &(quadCoeff[0]), B);

                std::cerr << tab << "transformed quad element" << std::endl;
                std::cerr << tab << "test diff order = " << dp << std::endl;
                if (dp>0) std::cerr << tab << "t=dx(" << t << ")" << std::endl;
                std::cerr << tab << "unk diff order = " << dq << std::endl;
                if (dq>0) std::cerr << tab << "u=dx(" << u << ")" << std::endl;

                /* B is printed using ref's node counts for bounds and stride */
                for (int cell=0; cell<nCells; cell++)
                {
                  std::cerr << tab << "cell=" << cell << " {";
                  for (int r=0; r<ref.nNodesTest(); r++)
                  {
                    if (r!=0) std::cerr << ", ";
                    std::cerr << "{";
                    for (int c=0; c<ref.nNodesUnk(); c++)
                    {
                      if (c!=0) std::cerr << ", ";
                      std::cerr << Utils::chop((*B)[r + ref.nNodesTest()*(c + cell*ref.nNodesUnk())]);
                    }
                    std::cerr << "}";
                  }
                  std::cerr << "}" << std::endl;   
                }

                /* entry-by-entry comparison of the two results */
                bool OK = true;
                std::cerr << tab << "MISMATCH quad - ref" << std::endl;
                std::cerr << tab << "test diff order = " << dp << std::endl;
                if (dp>0) std::cerr << tab << "t=dx(" << t << ")" << std::endl;
                std::cerr << tab << "unk diff order = " << dq << std::endl;
                if (dq>0) std::cerr << tab << "u=dx(" << u << ")" << std::endl;

                for (int cell=0; cell<nCells; cell++)
                {
                  std::cerr << tab << "cell #" << cell << " {";
                              
                  for (int r=0; r<ref.nNodesTest(); r++)
                  {
                    if (r!=0) std::cerr << ", ";
                    std::cerr << "{";
                    for (int c=0; c<ref.nNodesUnk(); c++)
                    {
                      if (c!=0) std::cerr << ", ";
                      int i = r + ref.nNodesTest()*(c + cell*ref.nNodesUnk());
                      double err = fabs(Utils::chop((*B)[i] - (*A)[i]));
                      if (err > 1.0e-14) OK = false;
                      std::cerr << err;
                    }
                    std::cerr << "}";
                  }
                  std::cerr << "}" << std::endl;
                }
                if (!OK) 
                {
                  nErrors ++;
                  std::cerr << "ERROR DETECTED!!! p=" << p
                       << " dp=" << dp << "  t=" << t  
                       << " q=" << q << "  dq=" << dq
                       << "  u=" << u << std::endl;
                }

                std::cerr << std::endl << std::endl << std::endl << std::endl;
              }
            }
          }
        }
      }
    }

    std::cerr << "total quadrature flops: " << QuadratureIntegral::totalFlops() 
         << std::endl;
    std::cerr << "total ref integration flops: " << RefIntegral::totalFlops() 
         << std::endl;

    /* overall verdict: any mismatch fails the test */
    if (nErrors == 0)
    {
      std::cerr << "Transformed integral test PASSED" << std::endl;
    }
    else
    {
      stat = -1;
      std::cerr << "Transformed integral test FAILED" << std::endl;
    }
    TimeMonitor::summarize();
  }
	catch(std::exception& e)
  {
    /* an exception anywhere above also fails the test */
    stat = -1;
    std::cerr << "Transformed integral test FAILED" << std::endl;
    std::cerr << e.what() << std::endl;
  }

  return stat;
  
}
bool IntegralGroup
::evaluate(const CellJacobianBatch& JTrans,
  const CellJacobianBatch& JVol,
  const Array<int>& isLocalFlag, 
  const Array<int>& facetIndex, 
  const RCP<Array<int> >& cellLIDs,
  const Array<RCP<EvalVector> >& vectorCoeffs,
  const Array<double>& constantCoeffs,
  RCP<Array<double> >& A) const
{
  TimeMonitor timer(integrationTimer());
  Tabs tab0(0);


  SUNDANCE_MSG1(integrationVerb(), tab0 << "evaluating integral group with "
    << integrals_.size() << " integrals");

  SUNDANCE_MSG3(integrationVerb(), 
    tab0 << "num integration cells = " << JVol.numCells());
  SUNDANCE_MSG3(integrationVerb(), 
    tab0 << "num nodes in output = " << integrals_[0]->nNodes());

  /* initialize the return vector */
  if (integrals_[0]->nNodes() == -1) A->resize(1);
  else A->resize(JVol.numCells() * integrals_[0]->nNodes());
  double* aPtr = &((*A)[0]);
  int n = A->size();
  for (int i=0; i<n; i++) aPtr[i] = 0.0;

  SUNDANCE_MSG5(integrationVerb(), tab0 << "begin A=");
  if (integrationVerb() >=5) writeTable(Out::os(), tab0, *A, 6);

  /* do the integrals */
  for (int i=0; i<integrals_.size(); i++)
  {
    Tabs tab1;
    SUNDANCE_MSG1(integrationVerb(), tab1 << "group member i=" << i 
      << " of " << integrals_.size());
    Tabs tab2;

    const RefIntegral* ref 
      = dynamic_cast<const RefIntegral*>(integrals_[i].get());
    const QuadratureIntegral* quad 
      = dynamic_cast<const QuadratureIntegral*>(integrals_[i].get());
    const MaximalQuadratureIntegral* maxQuad 
      = dynamic_cast<const MaximalQuadratureIntegral*>(integrals_[i].get());
    const CurveQuadratureIntegral* curveQuad
      = dynamic_cast<const CurveQuadratureIntegral*>(integrals_[i].get());

    if (ref!=0)
    {
      SUNDANCE_MSG2(integrationVerb(),
        tab2 << "Integrating term group " << i 
        << " by transformed reference integral");
      double f = constantCoeffs[resultIndices_[i]];
      SUNDANCE_MSG2(integrationVerb(),
        tab2 << "Coefficient is " << f);
      ref->transform(JTrans, JVol, isLocalFlag, facetIndex, cellLIDs , f, A);
    }
    else if (quad != 0)
    {
      SUNDANCE_MSG2(integrationVerb(),
        tab2 << "Integrating term group " << i 
        << " by quadrature");
          
      TEST_FOR_EXCEPTION(vectorCoeffs[resultIndices_[i]]->length()==0,
        InternalError,
        "zero-length coeff vector detected in "
        "quadrature integration branch of "
        "IntegralGroup::evaluate(). std::string value is ["
        << vectorCoeffs[resultIndices_[i]]->str()
        << "]");

      Tabs tab3;
      SUNDANCE_MSG3(integrationVerb(),
        tab3 << "coefficients are " <<  vectorCoeffs[resultIndices_[i]]->str());

      const double* const f = vectorCoeffs[resultIndices_[i]]->start();
      quad->transform(JTrans, JVol, isLocalFlag, facetIndex, cellLIDs , f, A);
    }
    else if (maxQuad != 0)
    {
      SUNDANCE_MSG2(integrationVerb(),
        tab2 << "Integrating term group " << i 
        << " by quadrature");
          
      TEST_FOR_EXCEPTION(vectorCoeffs[resultIndices_[i]]->length()==0,
        InternalError,
        "zero-length coeff vector detected in "
        "quadrature integration branch of "
        "IntegralGroup::evaluate(). std::string value is ["
        << vectorCoeffs[resultIndices_[i]]->str()
        << "]");

      Tabs tab3;
      SUNDANCE_MSG3(integrationVerb(),
        tab3 << "coefficients are " <<  vectorCoeffs[resultIndices_[i]]->str());

      const double* const f = vectorCoeffs[resultIndices_[i]]->start();
      maxQuad->transform(JTrans, JVol, isLocalFlag, facetIndex, cellLIDs , f, A);
    }
    else if (curveQuad != 0)
    {
        SUNDANCE_MSG2(integrationVerb(),
          tab2 << "Integrating term group " << i
          << " by curve integral (quadrature by default) , result index: " << resultIndices_[i]);

        double f_const = 0.0;
        if (constantCoeffs.size() > resultIndices_[i]){
        	f_const = constantCoeffs[resultIndices_[i]];
        }

        SUNDANCE_MSG2(integrationVerb(),
          tab2 << "Coefficient is " << f_const);

        // set this
        if (vectorCoeffs.size() > resultIndices_[i]){
            Tabs tab3;
            double* const f = vectorCoeffs[resultIndices_[i]]->start();
        	SUNDANCE_MSG3(integrationVerb(),
        			tab3 << "coefficients are " <<  vectorCoeffs[resultIndices_[i]]->str());
            curveQuad->transform(JTrans, JVol, isLocalFlag, facetIndex, cellLIDs , f_const , f , A);
        } else{
            const double* f_null = 0;
            curveQuad->transform(JTrans, JVol, isLocalFlag, facetIndex, cellLIDs , f_const , f_null , A);
        }

    }
    else
    {
      TEST_FOR_EXCEPT(1);
    }

    SUNDANCE_MSG4(integrationVerb(),
      tab1 << "i=" << i << " integral values=");
    if (integrationVerb() >=4) writeTable(Out::os(), tab1, *A, 6);
  }
  SUNDANCE_MSG1(integrationVerb(), tab0 << "done integral group");

  return true;
}
/*
 * Accumulate a two-form integrated by quadrature on maximal cells into A.
 *
 * If neither the test nor the unknown function is differentiated, each
 * cell's nNodes() outputs are the quadrature sum of
 * coeff * |det J| * weight, where the weights are W_ or, for cells that
 * getAdaptedWeights() reports as cut by the global curve, weights rebuilt
 * from the adapted quadrature weights and W_ACI_F2_. Otherwise the
 * transformation matrix is built from the Jacobians and the work is
 * delegated to transformSummingFirst().
 *
 * JTrans     Jacobian batch used to build the transformation matrix
 * JVol       Jacobian batch supplying the determinants
 * facetIndex forwarded to transformSummingFirst()
 * cellLIDs   cell identifiers; indexed by cell for getAdaptedWeights()
 * coeff      coefficient values; in the no-derivative path they are read
 *            sequentially, nQuad values per cell
 * A          output array, accumulated into (not cleared here)
 *
 * Raises std::logic_error if order() != 2.
 */
void MaximalQuadratureIntegral::transformTwoForm(const CellJacobianBatch& JTrans,
  const CellJacobianBatch& JVol,
  const Array<int>& facetIndex,
  const RCP<Array<int> >& cellLIDs,
  const double* const coeff,
  RCP<Array<double> >& A) const
{
  TimeMonitor timer(maxCellQuadrature2Timer());
  Tabs tabs;
  TEUCHOS_TEST_FOR_EXCEPTION(order() != 2, std::logic_error,
    "MaximalQuadratureIntegral::transformTwoForm() called for form "
    "of order " << order());
  SUNDANCE_MSG2(integrationVerb(), tabs << "doing two form by quadrature");

  int nQuad = quadWeights_.size();
  const Array<int>* cellLID = cellLIDs.get();


  /* If the derivative orders are zero, the only thing to be done 
   * is to multiply by the cell's Jacobian determinant and sum over the
   * quad points */
  if (testDerivOrder() == 0 && unkDerivOrder() == 0)
  {
    double* aPtr = &((*A)[0]);
    /* The coefficients are only read, so walk them through a 
     * pointer-to-const rather than casting the constness away. */
    const double* coeffPtr = coeff;
    int offset = 0 ;
    const Array<double>& w = W_;
    if (globalCurve().isCurveValid())
    {
      int fc = 0;
      Array<double> quadWeightsTmp = quadWeights_;
      Array<Point> quadPointsTmp = quadPts_;
      bool isCutByCurve;

      for (int c=0; c<JVol.numCells(); c++, offset+=nNodes())
      {
        double detJ = fabs(JVol.detJ()[c]);
        /* start every cell from the unmodified reference rule */
        quadWeightsTmp = quadWeights_;
        quadPointsTmp = quadPts_;
        /* call the special integration routine */
        quad_.getAdaptedWeights(cellType(), dim(), (*cellLID)[c] , fc ,
          mesh() , globalCurve() , quadPointsTmp , quadWeightsTmp , isCutByCurve );
        if (isCutByCurve)
        {
          /* recalculate the special weights for this cut cell */
          Array<double> wi;
          wi.resize(nQuad * nNodesTest() *nNodesUnk() ); 
          for (int ii = 0 ; ii < wi.size() ; ii++ ) wi[ii] = 0.0;
          /* Good coding practice: always use braces { } when nesting loops even if
           * there's only one line. Otherwise, if someone inserts a new line 
           * (e.g., a print statement) it can totally change the code logic */
          for (int nt = 0 ; nt < nNodesTest() ; nt++)
          {
            for (int nu=0; nu<nNodesUnk(); nu++)
            { 
              for (int q=0 ; q < quadWeightsTmp.size() ; q++)
              {
                /* wi is laid out as unkNode + nNodesUnk()*(testNode + nNodesTest()*q) */
                wi[nu + nNodesUnk()*(nt + nNodesTest()*q)] +=
                  chop(quadWeightsTmp[q] * W_ACI_F2_[q][0][nt][0][nu]);
              }
            }
          }
          for (int q=0; q<nQuad; q++, coeffPtr++)
          {
            double f = (*coeffPtr)*detJ;
            for (int n=0; n<nNodes(); n++)
            {
              aPtr[offset+n] += f*wi[n + nNodes()*q];
            }
          }
        }// end isCutByCurve
        else
        {
          for (int q=0; q<nQuad; q++, coeffPtr++)
          {
            double f = (*coeffPtr)*detJ;
            for (int n=0; n<nNodes(); n++)
            {
              aPtr[offset+n] += f*w[n + nNodes()*q];
            }
          }
        }
      }
    }
    else  // No ACI
    {
      for (int c=0; c<JVol.numCells(); c++, offset+=nNodes())
      {
        double detJ = fabs(JVol.detJ()[c]);
        for (int q=0; q<nQuad; q++, coeffPtr++)
        {
          double f = (*coeffPtr)*detJ;
          for (int n=0; n<nNodes(); n++)
          {
            aPtr[offset+n] += f*w[n + nNodes()*q];
          }
        }
      }
    }

    addFlops( JVol.numCells() * (1 + nQuad * (1 + 2*nNodes())) );
  }
  else
  {
    /* At least one derivative is present: build the transformation 
     * matrix and pick the one matching the differentiated function(s). */
    createTwoFormTransformationMatrix(JTrans, JVol);
    double* GPtr;

    if (testDerivOrder() == 0)
    {
      GPtr = &(G(beta())[0]);
      SUNDANCE_MSG2(transformVerb(),
        Tabs() << "transformation matrix=" << G(beta()));
    }
    else if (unkDerivOrder() == 0)
    {
      GPtr = &(G(alpha())[0]);
      SUNDANCE_MSG2(transformVerb(),
        Tabs() << "transformation matrix=" << G(alpha()));
    }
    else
    {
      GPtr = &(G(alpha(), beta())[0]);
      SUNDANCE_MSG2(transformVerb(),
        Tabs() << "transformation matrix=" 
        << G(alpha(),beta()));
    }
        
      
    transformSummingFirst(JTrans.numCells(), facetIndex, cellLIDs, GPtr, coeff, A);
  }
}
void MaximalQuadratureIntegral::transformOneForm(const CellJacobianBatch& JTrans,  
  const CellJacobianBatch& JVol,
  const Array<int>& facetIndex,
  const RCP<Array<int> >& cellLIDs,
  const double* const coeff,
  RCP<Array<double> >& A) const
{
  TimeMonitor timer(maxCellQuadrature1Timer());
  Tabs tabs;
  TEUCHOS_TEST_FOR_EXCEPTION(order() != 1, std::logic_error,
    "MaximalQuadratureIntegral::transformOneForm() called for form "
    "of order " << order());
  SUNDANCE_MSG2(integrationVerb(), tabs << "doing one form by quadrature");
  int flops = 0;
  const Array<int>* cellLID = cellLIDs.get();

  int nQuad = quadWeights_.size();

  /* If the derivative order is zero, the only thing to be done 
   * is to multiply by the cell's Jacobian determinant and sum over the
   * quad points */
  if (testDerivOrder() == 0)
  {
    double* aPtr = &((*A)[0]);
    SUNDANCE_MSG5(integrationVerb(), tabs << "input A = ");
    if (integrationVerb() >= 5) writeTable(Out::os(), tabs, *A, 6);
  
    double* coeffPtr = (double*) coeff;
    int offset = 0 ;
    const Array<double>& w = W_;

    if (globalCurve().isCurveValid()) /* ----- ACI logic ---- */
    {
      Array<double> quadWeightsTmp = quadWeights_;
      Array<Point> quadPointsTmp = quadPts_; 
      bool isCutByCurve;

      for (int c=0; c<JVol.numCells(); c++, offset+=nNodes())
      {
        Tabs tab2;
        double detJ = fabs(JVol.detJ()[c]);
        int fc = 0;

        SUNDANCE_MSG4(integrationVerb(), tab2 << "c=" << c << " detJ=" << detJ);
        
        /* call the special integration routine */
        quad_.getAdaptedWeights(cellType(), dim(), (*cellLID)[c] , fc ,
          mesh() , globalCurve() , quadPointsTmp , quadWeightsTmp , isCutByCurve );
        if (isCutByCurve)
        {
          Array<double> wi;
          wi.resize(nQuad * nNodes()); //recalculate the special weights
          for (int ii = 0 ; ii < wi.size() ; ii++ ) wi[ii] = 0.0;
          for (int n = 0 ; n < nNodes() ; n++)
          {
            for (int q=0 ; q < quadWeightsTmp.size() ; q++)
            {
              //Indexing: testNode + nNodesTest()*(testDerivDir + nRefDerivTest()*q)
              wi[n + nNodes()*q] +=
                chop(quadWeightsTmp[q] * W_ACI_F1_[q][0][n]);
            }
          }
          // if it is cut by curve then use this vector
          for (int q=0; q<nQuad; q++, coeffPtr++)
          {
            double f = (*coeffPtr)*detJ;
            for (int n=0; n<nNodes(); n++)
            {
              aPtr[offset+n] += f*wi[n + nNodes()*q];
            }
          }
        } // end isCutByCurve
        else 
        {
          for (int q=0; q<nQuad; q++, coeffPtr++)
          {
            double f = (*coeffPtr)*detJ;
            for (int n=0; n<nNodes(); n++)
            {
              aPtr[offset+n] += f*w[n + nNodes()*q];
            }
          }
        }

        if (integrationVerb() >= 4)
        {
          Out::os() << tab2 << "integration results on cell:" << std::endl;
          Out::os() << tab2 << setw(10) << "n" << setw(12) << "I_n" << std::endl;
          for (int n=0; n<nNodes(); n++) 
          {
            Out::os() << tab2 << setw(10) << n 
                      << setw(12) << setprecision(6) << aPtr[offset+n] << std::endl;
          }
        }
      }
    } 
    else /* -------- No ACI -------- */ 
    {
      for (int c=0; c<JVol.numCells(); c++, offset+=nNodes())
      {
        Tabs tab2;
        double detJ = fabs(JVol.detJ()[c]);
        SUNDANCE_MSG4(integrationVerb(), tab2 << "c=" << c << " detJ=" << detJ);

        for (int q=0; q<nQuad; q++, coeffPtr++)
        {
          Tabs tab3;
          double f = (*coeffPtr)*detJ;
          SUNDANCE_MSG4(integrationVerb(), tab3 << "q=" << q << " coeff=" <<
            *coeffPtr << " coeff*detJ=" << f);
          for (int n=0; n<nNodes(); n++)
          {
            Tabs tab4;
            SUNDANCE_MSG4(integrationVerb(), tab4 << "n=" << n << " w=" <<
              w[n + nNodes()*q]);
            aPtr[offset+n] += f*w[n + nNodes()*q];
          }
        }

        if (integrationVerb() >= 4)
        {
          Out::os() << tab2 << "integration results on cell:" << std::endl;
          Out::os() << tab2 << setw(10) << "n" << setw(12) << "I_n" << std::endl;
          for (int n=0; n<nNodes(); n++) 
          {
            Out::os() << tab2 << setw(10) << n 
                      << setw(12) << setprecision(6) << aPtr[offset+n] << std::endl;
          }
        }
        
      }
    }

    SUNDANCE_MSG5(integrationVerb(), tabs << "output A = ");
    if (integrationVerb() >= 5) writeTable(Out::os(), tabs, *A, 6);
  }
  else
  {
    /* If the derivative order is nonzero, then we have to do a transformation. */
    
    createOneFormTransformationMatrix(JTrans, JVol);
    
    SUNDANCE_MSG4(transformVerb(), 
      Tabs() << "transformation matrix=" << G(alpha()));
    
    double* GPtr = &(G(alpha())[0]);      
    
    transformSummingFirst(JVol.numCells(), facetIndex, cellLIDs, GPtr, coeff, A);
  }
  addFlops(flops);
}
void MaximalQuadratureIntegral
::transformZeroForm(const CellJacobianBatch& JTrans,  
  const CellJacobianBatch& JVol,
  const Array<int>& isLocalFlag,
  const Array<int>& facetIndex,
  const RCP<Array<int> >& cellLIDs,
  const double* const coeff,
  RCP<Array<double> >& A) const
{
  TimeMonitor timer(maxCellQuadrature0Timer());
  Tabs tabs;
  SUNDANCE_MSG1(integrationVerb(), tabs << "doing zero form by quadrature");

  TEUCHOS_TEST_FOR_EXCEPTION(order() != 0, std::logic_error,
    "MaximalQuadratureIntegral::transformZeroForm() called "
    "for form of order " << order());

  TEUCHOS_TEST_FOR_EXCEPTION( (int) isLocalFlag.size() != 0 
    && (int) isLocalFlag.size() != JVol.numCells(),
    std::runtime_error,
    "mismatch between isLocalFlag.size()=" << isLocalFlag.size()
    << " and JVol.numCells()=" << JVol.numCells());

  bool checkLocalFlag = (int) isLocalFlag.size() != 0;

  const Array<int>* cellLID = cellLIDs.get();
  int nQuad = quadWeights_.size();


  double& a = (*A)[0];
  SUNDANCE_MSG5(integrationVerb(), tabs << "input A=");
  if (integrationVerb() >= 5) writeTable(Out::os(), tabs, *A, 6);
  double* coeffPtr = (double*) coeff;
  const Array<double>& w = W_;

  if (globalCurve().isCurveValid()) /* ---------- ACI ------------- */
  {
    Array<double> quadWeightsTmp = quadWeights_;
    Array<Point> quadPointsTmp = quadPts_;
    int fc = 0;
    bool isCutByCurve;

    for (int c=0; c<JVol.numCells(); c++)
    {
      if (checkLocalFlag && !isLocalFlag[c]) 
      {
        coeffPtr += nQuad;
        continue;
      }
      double detJ = fabs(JVol.detJ()[c]);
      
      quad_.getAdaptedWeights(cellType(), dim(), (*cellLID)[c], fc ,mesh(),
        globalCurve(), quadPointsTmp, quadWeightsTmp, isCutByCurve);
      /* if we have special weights then do the same as before */
      if (isCutByCurve)
      {
        for (int q=0; q<nQuad; q++, coeffPtr++)
        {
          a += quadWeightsTmp[q]*(*coeffPtr)*detJ;
        }
      } // end cut by curve
      else
      {
        for (int q=0; q<nQuad; q++, coeffPtr++)
        {
          a += w[q]*(*coeffPtr)*detJ;
        }
      }
    }
  }
  else /* --------- No ACI ------------- */
  {
    for (int c=0; c<JVol.numCells(); c++)
    {
      if (checkLocalFlag && !isLocalFlag[c]) 
      {
        coeffPtr += nQuad;
        continue;
      }
      double detJ = fabs(JVol.detJ()[c]);
      
      for (int q=0; q<nQuad; q++, coeffPtr++)
      {
        a += w[q]*(*coeffPtr)*detJ;
      }
    }
  }
  SUNDANCE_MSG5(integrationVerb(), tabs << "output A = ");
  if (integrationVerb() >= 5) writeTable(Out::os(), tabs, *A, 6);

  SUNDANCE_MSG1(integrationVerb(), tabs << "done zero form");
}
void ElementIntegral
::createTwoFormTransformationMatrix(const CellJacobianBatch& JTrans,
                                    const CellJacobianBatch& JVol) const
{
    TimeMonitor timer(transCreationTimer());
    Tabs tab;

    int flops = 0;

    int maxDim = JTrans.cellDim();
    int cellDim = JVol.cellDim();

    if (testDerivOrder() == 1 && unkDerivOrder() == 1)
    {
        Tabs tab2;
        if (transformationMatrixIsValid(alpha(), beta())) return;
        transformationMatrixIsValid(alpha(), beta()) = true;

        G(alpha(), beta()).resize(JTrans.numCells() * JTrans.cellDim() * JTrans.cellDim());

        double* GPtr = &(G(alpha(),beta())[0]);
        int k = 0;

        for (int c=0; c<JTrans.numCells(); c++)
        {
            static Array<double> invJ;
            JTrans.getInvJ(c, invJ);
            double detJ = fabs(JVol.detJ()[c]);
            for (int gamma=0; gamma<maxDim; gamma++)
            {
                for (int delta=0; delta<maxDim; delta++, k++)
                {
                    GPtr[k] =  detJ*invJ[alpha() + gamma*maxDim]
                               * invJ[beta() + maxDim*delta];
                }
            }
        }
        flops = 2 * JTrans.numCells() * maxDim * maxDim + JTrans.numCells();
    }

    else if (testDerivOrder() == 1 && unkDerivOrder() == 0)
    {
        if (transformationMatrixIsValid(alpha())) return;
        transformationMatrixIsValid(alpha()) = true;

        G(alpha()).resize(JTrans.numCells() * JTrans.cellDim());

        int k = 0;
        double* GPtr = &(G(alpha())[0]);

        for (int c=0; c<JTrans.numCells(); c++)
        {
            static Array<double> invJ;
            JTrans.getInvJ(c, invJ);
            double detJ = fabs(JVol.detJ()[c]);
            for (int gamma=0; gamma<maxDim; gamma++,k++)
            {
                GPtr[k] = detJ*invJ[alpha() + maxDim * gamma];
            }
        }
        flops = JTrans.numCells() * maxDim + JTrans.numCells();
    }

    else
    {
        if (transformationMatrixIsValid(beta())) return;
        transformationMatrixIsValid(beta()) = true;

        G(beta()).resize(JTrans.numCells() * JTrans.cellDim());

        int k = 0;
        double* GPtr = &(G(beta())[0]);

        for (int c=0; c<JTrans.numCells(); c++)
        {
            static Array<double> invJ;
            JTrans.getInvJ(c, invJ);
            double detJ = fabs(JVol.detJ()[c]);
            for (int gamma=0; gamma<maxDim; gamma++,k++)
            {
                GPtr[k] = detJ*invJ[beta() + maxDim * gamma];
            }
        }
        flops = JTrans.numCells() * maxDim + JTrans.numCells();
    }

    addFlops(flops);
}
void RefIntegral::transformTwoForm(const CellJacobianBatch& JTrans,
  const CellJacobianBatch& JVol,
  const Array<int>& facetIndex, 
  const RCP<Array<int> >& cellLIDs,
  const double& coeff,
  RCP<Array<double> >& A) const
{
  TimeMonitor timer(ref2IntegrationTimer());
  TEUCHOS_TEST_FOR_EXCEPTION(order() != 2, std::logic_error,
    "RefIntegral::transformTwoForm() called for form "
    "of order " << order());
  
  Tabs tabs;  
  SUNDANCE_MSG1(transformVerb(), tabs << "doing two form by reference");

  int nfc = nFacetCases();

	  SUNDANCE_MSG1(transformVerb(), tabs << "doing two form by reference ... ");
  /* If the derivative orders are zero, the only transformation to be done 
   * is to multiply by the cell's Jacobian determinant */
  if (testDerivOrder() == 0 && unkDerivOrder() == 0)
  {
      if (globalCurve().isCurveValid())
      {     /* ----------- ACI logic ------------ */

    	   Array<double> quadWeightsTmp = quadWeights_;
    	   Array<Point> quadPointsTmp = quadPts_;
    	   bool isCutByCurve = false;

    	   double* aPtr = &((*A)[0]);
    	   int count = 0;
    	   for (int c=0; c<JVol.numCells(); c++)
    	   {
    	     int fc = 0;
    	     if (nFacetCases() != 1) fc = facetIndex[c];

    	     /* ---------- ACI ----------- */
    	     /* call the special integration routine */
    	     quadWeightsTmp = quadWeights_;
    	     quadPointsTmp = quadPts_;
    	     quad_.getAdaptedWeights(cellType(), dim(), (*cellLIDs)[c] , fc ,
    	    		 mesh(), globalCurve(), quadPointsTmp, quadWeightsTmp, isCutByCurve);
    	     if (isCutByCurve){
    	    	 Array<double> w;
    	    	 int ci = 0;
    	    	 w.resize(nNodesTest()*nNodesUnk()); //recalculate the special weights
    	    	 for (int nt = 0 ; nt < nNodesTest() ; nt++)
    	    		 for(int nu=0 ; nu < nNodesUnk() ; nu++ , ci++){
    	    			 w[ci] = 0.0;
    	    			 for (int q=0 ; q < quadWeightsTmp.size() ; q++)
    	    				 w[ci] += chop( quadWeightsTmp[q] * W_ACI_F2_[fc][q][0][nt][0][nu] );
    	    		 }
    	    	 // do the integration here
    	    	 double detJ = coeff * fabs(JVol.detJ()[c]);
    	    	 for (int n=0; n<nNodes(); n++, count++)
    	    	 {
    	    		 aPtr[count] += detJ*w[n];
    	    	 }
    	     }
    	     else
    	     {
    	    	  const Array<double>& w = W_[fc];
    	    	  double detJ = coeff * fabs(JVol.detJ()[c]);
    	    	  for (int n=0; n<nNodes(); n++, count++)
    	    	  {
    	    		  aPtr[count] += detJ*w[n];
    	    	  }
    	     }
    	   }
  	  }
      else        /* ---------- NO ACI logic----------- */
      {
    	  double* aPtr = &((*A)[0]);
    	  int count = 0;
    	  for (int c=0; c<JVol.numCells(); c++)
    	  {
    		  int fc = 0;
    		  if (nFacetCases() != 1) fc = facetIndex[c];

    		  const Array<double>& w = W_[fc];
    		  double detJ = coeff * fabs(JVol.detJ()[c]);
    		  for (int n=0; n<nNodes(); n++, count++)
    		  {
    			  aPtr[count] += detJ*w[n];
    		  }
    	  }
      }
    addFlops(JVol.numCells() * (nNodes() + 1));
  }
  else
  {
    /* If the derivative order is nonzero, then we have to do a transformation. 
     * If we're also on a cell of dimension lower than maximal, we need to refer
     * to the facet index of the facet being integrated. */
    int nCells = JVol.numCells();
    double one = 1.0;
    int nTransRows = nRefDerivUnk()*nRefDerivTest();

    createTwoFormTransformationMatrix(JTrans, JVol);
      
    double* GPtr;
    if (testDerivOrder() == 0)
    {
      GPtr = &(G(beta())[0]);
      SUNDANCE_MSG2(transformVerb(),
        Tabs() << "transformation matrix=" << G(beta()));
    }
    else if (unkDerivOrder() == 0)
    {
      GPtr = &(G(alpha())[0]);
      SUNDANCE_MSG2(transformVerb(),
        Tabs() << "transformation matrix=" << G(alpha()));
    }
    else
    {
      GPtr = &(G(alpha(), beta())[0]);
      SUNDANCE_MSG2(transformVerb(),
        Tabs() << "transformation matrix=" 
        << G(alpha(),beta()));
    }
      
    int nNodes0 = nNodes();

    if (nFacetCases()==1)
    {
      /* if we're on a maximal cell, we can do transformations 
       * for all cells in a single batch. 
       */
      if (globalCurve().isCurveValid())
      {          /* ---------- ACI logic ----------- */

    	 Array<double> quadWeightsTmp = quadWeights_;
    	 Array<Point> quadPointsTmp = quadPts_;
    	 bool isCutByCurve = false;

       	 for (int c=0; c<JVol.numCells(); c++)
       	 {
             int fc = 0;
             if (nfc != 1) fc = facetIndex[c];

             double* aPtr = &((*A)[c*nNodes0]);
             double* gPtr = &(GPtr[c*nTransRows]);
             int oneI = 1;
       		 /* call the special integration routine */
         	//SUNDANCE_MSG1(transformVerb(), tabs << "before quad_.getAdaptedWeights");
         	 quadWeightsTmp = quadWeights_;
             quadPointsTmp = quadPts_;
       		 quad_.getAdaptedWeights(cellType(), dim(), (*cellLIDs)[c], fc ,
             mesh(),globalCurve(), quadPointsTmp, quadWeightsTmp, isCutByCurve);
         	//SUNDANCE_MSG1(transformVerb(), tabs << "after quad_.getAdaptedWeights");
       		 if (isCutByCurve){
       			 Array<double> w;
       			 w.resize(nNodesUnk()*nNodesTest()*nRefDerivUnk()*nRefDerivTest());
       			 for ( int i = 0 ; i < w.size() ; i++) w[i] = 0.0;
       			 //recalculate the special weights
       		     for (int t=0; t<nRefDerivTest(); t++){
       		        for (int nt=0; nt<nNodesTest(); nt++)
       		          for (int u=0; u<nRefDerivUnk(); u++)
       		            for (int nu=0; nu<nNodesUnk(); nu++)
       		            	for (int q=0 ; q < quadWeightsTmp.size() ; q++)
       		                // unkNode + nNodesUnk()*testNode  + nNodes()*(unkDerivDir + nRefDerivUnk()*testDerivDir)
       		                    w[nu + nNodesUnk()*nt  + nNodes()*(u + nRefDerivUnk()*t)] +=
       		                    		chop(quadWeightsTmp[q]*W_ACI_F2_[0][q][t][nt][u][nu]);
       		     }
      		      ::dgemm_("N", "N", &nNodes0, &oneI , &nTransRows, &coeff, &(w[0]),
      		        &nNodes0, &(gPtr[0]), &nTransRows, &one,
      		        &(aPtr[0]), &nNodes0);
       		  }else{
       		     ::dgemm_("N", "N", &nNodes0, &oneI , &nTransRows, &coeff, &(W_[0][0]),
       		        &nNodes0, &(gPtr[0]), &nTransRows, &one,
       		        &(aPtr[0]), &nNodes0);
       		  }
       	 } // end from the for loop over the cells
      }
      else /* ---------- NO ACI ----------- */
      {
        	 ::dgemm_("N", "N", &nNodes0, &nCells, &nTransRows, &coeff, &(W_[0][0]),
        		 &nNodes0, GPtr, &nTransRows, &one,
        		 &((*A)[0]), &nNodes0);
      }
    }
    else
    {
      /* If we're on a lower-dimensional cell and have to transform, 
       * we've got to do each transformation using a different facet case */
        if (globalCurve().isCurveValid())
        {   /* ---------- ACI logic ----------- */
            int oneI = 1;
            Array<double> quadWeightsTmp = quadWeights_;
            Array<Point> quadPointsTmp = quadPts_;
            bool isCutByCurve = false;

            for (int c=0; c<JVol.numCells(); c++)
            {
              int fc = 0;
              if (nfc != 1) fc = facetIndex[c];
              double* aPtr = &((*A)[c*nNodes0]);
              double* gPtr = &(GPtr[c*nTransRows]);
              SUNDANCE_MSG2(integrationVerb(),
                tabs << "c=" << c << ", facet case=" << fc
                << " W=" << W_[fc]);

              /* call the special integration routine */
              quadWeightsTmp = quadWeights_;
              quadPointsTmp = quadPts_;
              quad_.getAdaptedWeights(cellType(), dim(), (*cellLIDs)[c], fc ,
            		  mesh(), globalCurve(), quadPointsTmp, quadWeightsTmp, isCutByCurve);
              if (isCutByCurve){
            	  Array<double> w;
            	  w.resize(nNodesUnk()*nNodesTest()*nRefDerivUnk()*nRefDerivTest());
            	  for ( int i = 0 ; i < w.size() ; i++) w[i] = 0.0;
            	  //recalculate the special weights
            	  for (int t=0; t<nRefDerivTest(); t++){
            		  for (int nt=0; nt<nNodesTest(); nt++)
            			  for (int u=0; u<nRefDerivUnk(); u++)
            				  for (int nu=0; nu<nNodesUnk(); nu++)
            					  for (int q=0 ; q < quadWeightsTmp.size() ; q++)
            						  // unkNode + nNodesUnk()*testNode  + nNodes()*(unkDerivDir + nRefDerivUnk()*testDerivDir)
            						  w[nu + nNodesUnk()*nt  + nNodes()*(u + nRefDerivUnk()*t)] +=
            								  chop( quadWeightsTmp[q]*W_ACI_F2_[fc][q][t][nt][u][nu] );
            	  }
            	  ::dgemm_("N", "N", &nNodes0, &oneI , &nTransRows, &coeff, &(w[0]),
            			  &nNodes0, &(gPtr[0]), &nTransRows, &one,
            			  &(aPtr[0]), &nNodes0);
				  }else{
					  ::dgemm_("N", "N", &nNodes0, &oneI , &nTransRows, &coeff, &(W_[fc][0]),
							  &nNodes0, &(gPtr[0]), &nTransRows, &one,
							  &(aPtr[0]), &nNodes0);
				  }
            }
        }
        else         /* ---------- NO ACI ----------- */
        {
            int N = 1;
            for (int c=0; c<JVol.numCells(); c++)
            {
              int fc = 0;
              if (nfc != 1) fc = facetIndex[c];
              double* aPtr = &((*A)[c*nNodes0]);
              double* gPtr = &(GPtr[c*nTransRows]);
              SUNDANCE_MSG2(integrationVerb(),
                tabs << "c=" << c << ", facet case=" << fc
                << " W=" << W_[fc]);

              ::dgemm_("N", "N", &nNodes0, &N, &nTransRows, &coeff, &(W_[fc][0]),
            		  &nNodes0, gPtr, &nTransRows, &one,
            		  aPtr, &nNodes0);
            }
        }
    }// from else of (nFacetCases()==1)
      
    addFlops(2 * nNodes0 * nCells * nTransRows);
  }
}
void RefIntegral::transformZeroForm(const CellJacobianBatch& JVol,
  const Array<int>& isLocalFlag,  
  const RCP<Array<int> >& cellLIDs,
  const double& coeff,
  RCP<Array<double> >& A) const
{
  TimeMonitor timer(ref0IntegrationTimer());

  TEUCHOS_TEST_FOR_EXCEPTION(order() != 0, std::logic_error,
    "RefIntegral::transformZeroForm() called "
    "for form of order " << order());

  Tabs tabs;  
  SUNDANCE_MSG1(integrationVerb(), tabs << "doing zero form by reference");

  double& a = (*A)[0];
  int flops = 0;
  const Array<int>* cellLID = cellLIDs.get();

  /* if we don't need to check whether elements are local, we
   * can streamline the loop. This will be the case when
   * we are evaluating a functional but not its gradient */
  double w = coeff * W_[0][0];
  if ((int) isLocalFlag.size()==0)
  {
     	if (globalCurve().isCurveValid())
     	{     /* ---------- ACI logic ----------- */

     		Array<double> quadWeightsTmp = quadWeights_;
     		Array<Point> quadPointsTmp = quadPts_;
     		bool isCutByCurve;

     		for (int c=0; c<JVol.numCells(); c++)
     		{
     			int fc = 0;
   				/* call the special integration routine */
   				quadWeightsTmp = quadWeights_;
   				quadPointsTmp = quadPts_;
   				quad_.getAdaptedWeights(cellType(), dim(), (*cellLID)[c], fc ,mesh(),
   						globalCurve(), quadPointsTmp, quadWeightsTmp, isCutByCurve);
   				/* if we have special weights then do the same as before */
   				if (isCutByCurve){
   					double sumweights = 0;
   					for (int j=0; j < quadWeightsTmp.size(); j++) sumweights += chop(quadWeightsTmp[j]);
   					flops+=3+quadWeightsTmp.size();  //Todo: the curve stuff not counted
   					a += coeff * sumweights * fabs(JVol.detJ()[c]);
   				} else {
   					flops+=2;  //Todo: the curve stuff not counted
   					a += w * fabs(JVol.detJ()[c]);
   				}
     		}
     	}
     	else /* -------- NO ACI logic ------- */
     	{
     		for (int c=0; c<JVol.numCells(); c++)
     		{
 				flops+=2;
 				a += w * fabs(JVol.detJ()[c]);
     		}
     	}

  }
  else
  {
    TEUCHOS_TEST_FOR_EXCEPTION( (int) isLocalFlag.size() != JVol.numCells(),
      std::runtime_error,
      "mismatch between isLocalFlag.size()=" 
      << isLocalFlag.size()
      << " and JVol.numCells()=" << JVol.numCells());

      int fc = 0;
      if (globalCurve().isCurveValid())
      {   /* ---------- ACI logic ----------- */
    		Array<double> quadWeightsTmp = quadWeights_;
     		Array<Point> quadPointsTmp = quadPts_;
     		bool isCutByCurve;

    		for (int c=0; c<JVol.numCells(); c++)
    		{
    		  if (isLocalFlag[c])
    		  {

    			/* call the special integration routine */
    			quadWeightsTmp = quadWeights_;
    			quadPointsTmp = quadPts_;
    			quad_.getAdaptedWeights(cellType(), dim(), (*cellLID)[c], fc , mesh(),
    					globalCurve(), quadPointsTmp, quadWeightsTmp, isCutByCurve);
    			/* if we do not have special weights then do the same as before */
    			if (isCutByCurve){
    				double sumweights = 0;
    				for (int j=0; j < quadWeightsTmp.size(); j++) sumweights += chop(quadWeightsTmp[j]);
    				flops+=3+quadWeightsTmp.size();  //Todo: the curve stuff not counted
    				a += coeff * sumweights * fabs(JVol.detJ()[c]);
    			} else {
    				flops+=2;  //Todo: the curve stuff not counted
    				a += w * fabs(JVol.detJ()[c]);
    			}
    		  }
    		}
    	}
        else         /* ---------- NO ACI logic ----------- */
    	{
    		for (int c=0; c<JVol.numCells(); c++)
    		{
      		  if (isLocalFlag[c])
      		  {
    			flops+=2;
    			a += w * fabs(JVol.detJ()[c]);
      		  }
    		}
    	}
  }
  addFlops(flops);
}