void ElementIntegral
::createOneFormTransformationMatrix(const CellJacobianBatch& JTrans,
                                    const CellJacobianBatch& JVol) const
{
    TimeMonitor timer(transCreationTimer());
    Tabs tab;
    SUNDANCE_MSG2(transformVerb(),
                  tab << "ElementIntegral creating linear form trans matrices");

    int maxDim = JTrans.cellDim();

    if (transformationMatrixIsValid(alpha())) return;
    transformationMatrixIsValid(alpha()) = true;

    int flops = JTrans.numCells() * maxDim + JTrans.numCells();

    G(alpha()).resize(JTrans.numCells() * JTrans.cellDim());

    int k = 0;
    double* GPtr = &(G(alpha())[0]);

    for (int c=0; c<JTrans.numCells(); c++)
    {
        Array<double> invJ;
        JTrans.getInvJ(c, invJ);
        double detJ = fabs(JVol.detJ()[c]);
        for (int gamma=0; gamma<maxDim; gamma++, k++)
        {
            GPtr[k] = detJ*invJ[alpha() + maxDim * gamma];
        }
    }

    addFlops(flops);
}
void PeriodicSingleCellMesh1D::getJacobians(int cellDim, const Array<int>& cellLID,
    CellJacobianBatch& jBatch) const
{
  TEUCHOS_TEST_FOR_EXCEPTION(cellDim < 0 || cellDim > spatialDim(), std::logic_error,
    "cellDim=" << cellDim 
    << " is not in expected range [0, " << spatialDim()
    << "]");

  jBatch.resize(cellLID.size(), spatialDim(), cellDim);

  int nCells = cellLID.size();

  TEUCHOS_TEST_FOR_EXCEPT(nCells != 1);

  if (cellDim==0)
  {
    for (int i=0; i<nCells; i++)
    {
      double* detJ = jBatch.detJ(i);
      *detJ = 1.0;
    }
  }
  else
  {
    for (int i=0; i<nCells; i++)
    {
      double* J = jBatch.jVals(i);
      J[0] = fabs(xMin_-xMax_);
    }
  }
}
void PeanoMesh3D::getJacobians(int cellDim, const Array<int>& cellLID,
                          CellJacobianBatch& jBatch) const
{
	  //printf("cellDim:%d  _uniqueResolution:%f ",cellDim, _uniqueResolution);
	  SUNDANCE_VERB_HIGH("getJacobians()");
	  TEUCHOS_TEST_FOR_EXCEPTION(cellDim < 0 || cellDim > spatialDim(), std::logic_error,
	    "cellDim=" << cellDim << " is not in expected range [0, " << spatialDim() << "]");
	  int nCells = cellLID.size();
 	  int tmp_index , tmp;
 	  int tmp_index1 , tmp_index2;
 	  Point pnt(0.0,0.0,0.0);
 	  Point pnt1(0.0,0.0,0.0);
 	  Point pnt2(0.0,0.0,0.0);
	  jBatch.resize(cellLID.size(), spatialDim(), cellDim);
	  if (cellDim < spatialDim()) // they need the Jacobian of a lower dinemsional element
	  {
		  //printf("PeanoMesh3D::getJacobians() cellDim < spatialDim() \n");
		   for (int i=0; i<nCells; i++)
		    {
		      //printf("PeanoMesh3D::getJacobian() cellDim < spatialDim() cellDim:%d , ret:%f \n",cellDim , _uniqueResolution);
		      double* detJ = jBatch.detJ(i);
		      switch(cellDim)
		      {
		        case 0: *detJ = 1.0;
		          break;
		        case 1:
			    	 tmp_index = this->facetLID(cellDim,  cellLID[i] , 0 , 0 , tmp );
			    	 tmp_index1= this->facetLID(cellDim,  cellLID[i] , 0 , 1 , tmp );
			    	 pnt = nodePosition(tmp_index);
			    	 pnt1 = nodePosition(tmp_index1);
			    	 pnt = pnt1 - pnt;
		             *detJ = sqrt(pnt * pnt); // the length of the edge
		        break;
		        case 2:{
			    	 tmp_index = this->facetLID(cellDim,  cellLID[i] , 0 , 0 , tmp );
			    	 tmp_index1= this->facetLID(cellDim,  cellLID[i] , 0 , 1 , tmp );
			    	 tmp_index2= this->facetLID(cellDim,  cellLID[i] , 0 , 2 , tmp );
			    	 pnt = nodePosition(tmp_index);
			    	 pnt1 = nodePosition(tmp_index1);
			    	 pnt2 = nodePosition(tmp_index2);
			    	 Point directedArea = cross( pnt1 - pnt , pnt2 - pnt );
		             *detJ = sqrt(directedArea * directedArea); // the are of the face
		        break;}
		        default:
		          TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "impossible switch value "
		            "cellDim=" << cellDim << " in PeanoMesh3D::getJacobians()");
		      }
		    }
	  }else{ // they request the complete Jacoby matrix for this bunch of elements
		    //Array<double> J(cellDim*cellDim);
		    SUNDANCE_VERB_HIGH("cellDim == spatialDim()");
		    for (unsigned int i=0; i<(unsigned int)cellLID.size(); i++)
		    {
			  //printf("PeanoMesh3D::getJacobian() cellDim == spatialDim() cellDim:%d , ret:%f \n",cellDim , _uniqueResolution);
		      double* J = jBatch.jVals(i);
		      switch(cellDim)
		      {
		        case 3:
		          J[0] = _peanoMesh->returnResolution(0);
		          J[1] = 0.0; J[2] = 0.0; J[3] = 0.0;
		          J[4] = _peanoMesh->returnResolution(1);
		          J[5] = 0.0; J[6] = 0.0; J[7] = 0.0;
		          J[8] = _peanoMesh->returnResolution(2); // the Jacobi of the tet
		        break;
		        default:
		          TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "impossible switch value "
		            "cellDim=" << cellDim
		            << " in PeanoMesh3D::getJacobians()");
		      }
		    }
	  }
}
void MaximalQuadratureIntegral::transformTwoForm(const CellJacobianBatch& JTrans,
  const CellJacobianBatch& JVol,
  const Array<int>& facetIndex,
  const RCP<Array<int> >& cellLIDs,
  const double* const coeff,
  RCP<Array<double> >& A) const
{
  TimeMonitor timer(maxCellQuadrature2Timer());
  Tabs tabs;
  TEUCHOS_TEST_FOR_EXCEPTION(order() != 2, std::logic_error,
    "MaximalQuadratureIntegral::transformTwoForm() called for form "
    "of order " << order());
  SUNDANCE_MSG2(integrationVerb(), tabs << "doing one form by quadrature");

  int nQuad = quadWeights_.size();
  const Array<int>* cellLID = cellLIDs.get();


  /* If the derivative orders are zero, the only thing to be done 
   * is to multiply by the cell's Jacobian determinant and sum over the
   * quad points */
  if (testDerivOrder() == 0 && unkDerivOrder() == 0)
  {
    double* aPtr = &((*A)[0]);
    double* coeffPtr = (double*) coeff;
    int offset = 0 ;
    const Array<double>& w = W_;
    if (globalCurve().isCurveValid())
    {
      int fc = 0;
      Array<double> quadWeightsTmp = quadWeights_;
      Array<Point> quadPointsTmp = quadPts_;
      bool isCutByCurve;

      for (int c=0; c<JVol.numCells(); c++, offset+=nNodes())
      {
        double detJ = fabs(JVol.detJ()[c]);
        quadWeightsTmp = quadWeights_;
        quadPointsTmp = quadPts_;
        /* call the special integration routine */
        quad_.getAdaptedWeights(cellType(), dim(), (*cellLID)[c] , fc ,
          mesh() , globalCurve() , quadPointsTmp , quadWeightsTmp , isCutByCurve );
        if (isCutByCurve)
        {
          Array<double> wi;
          wi.resize(nQuad * nNodesTest() *nNodesUnk() ); //recalculate the special weights
          for (int ii = 0 ; ii < wi.size() ; ii++ ) wi[ii] = 0.0;
          /* Good coding practice: always use braces { } when nesting loops even if
           * there's only one line. Otherwise, if someone inserts a new line 
           * (e.g., a print statement) it can totally change the code logic */
          for (int nt = 0 ; nt < nNodesTest() ; nt++)
          {
            for (int nu=0; nu<nNodesUnk(); nu++)
            { 
              for (int q=0 ; q < quadWeightsTmp.size() ; q++)
              {
                //Indexing: unkNode + nNodesUnk()*(testNode + nNodesTest()*(unkDerivDir + nRefDerivUnk()*(testDerivDir + nRefDerivTest()*q)))
                wi[nu + nNodesUnk()*(nt + nNodesTest()*q)] +=
                  chop(quadWeightsTmp[q] * W_ACI_F2_[q][0][nt][0][nu]);
              }
            }
          }
          for (int q=0; q<nQuad; q++, coeffPtr++)
          {
            double f = (*coeffPtr)*detJ;
            for (int n=0; n<nNodes(); n++)
            {
              aPtr[offset+n] += f*wi[n + nNodes()*q];
            }
          }
        }// end isCutByCurve
        else
        {
          for (int q=0; q<nQuad; q++, coeffPtr++)
          {
            double f = (*coeffPtr)*detJ;
            for (int n=0; n<nNodes(); n++)
            {
              aPtr[offset+n] += f*w[n + nNodes()*q];
            }
          }
        }
      }
    }
    else  // No ACI
    {
      for (int c=0; c<JVol.numCells(); c++, offset+=nNodes())
      {
        double detJ = fabs(JVol.detJ()[c]);
        for (int q=0; q<nQuad; q++, coeffPtr++)
        {
          double f = (*coeffPtr)*detJ;
          for (int n=0; n<nNodes(); n++)
          {
            aPtr[offset+n] += f*w[n + nNodes()*q];
          }
        }
      }
    }

    addFlops( JVol.numCells() * (1 + nQuad * (1 + 2*nNodes())) );
  }
  else
  {
    createTwoFormTransformationMatrix(JTrans, JVol);
    double* GPtr;

    if (testDerivOrder() == 0)
    {
      GPtr = &(G(beta())[0]);
      SUNDANCE_MSG2(transformVerb(),
        Tabs() << "transformation matrix=" << G(beta()));
    }
    else if (unkDerivOrder() == 0)
    {
      GPtr = &(G(alpha())[0]);
      SUNDANCE_MSG2(transformVerb(),
        Tabs() << "transformation matrix=" << G(alpha()));
    }
    else
    {
      GPtr = &(G(alpha(), beta())[0]);
      SUNDANCE_MSG2(transformVerb(),
        Tabs() << "transformation matrix=" 
        << G(alpha(),beta()));
    }
        
      
    transformSummingFirst(JTrans.numCells(), facetIndex, cellLIDs, GPtr, coeff, A);
  }
}
void MaximalQuadratureIntegral::transformOneForm(const CellJacobianBatch& JTrans,  
  const CellJacobianBatch& JVol,
  const Array<int>& facetIndex,
  const RCP<Array<int> >& cellLIDs,
  const double* const coeff,
  RCP<Array<double> >& A) const
{
  TimeMonitor timer(maxCellQuadrature1Timer());
  Tabs tabs;
  TEUCHOS_TEST_FOR_EXCEPTION(order() != 1, std::logic_error,
    "MaximalQuadratureIntegral::transformOneForm() called for form "
    "of order " << order());
  SUNDANCE_MSG2(integrationVerb(), tabs << "doing one form by quadrature");
  int flops = 0;
  const Array<int>* cellLID = cellLIDs.get();

  int nQuad = quadWeights_.size();

  /* If the derivative order is zero, the only thing to be done 
   * is to multiply by the cell's Jacobian determinant and sum over the
   * quad points */
  if (testDerivOrder() == 0)
  {
    double* aPtr = &((*A)[0]);
    SUNDANCE_MSG5(integrationVerb(), tabs << "input A = ");
    if (integrationVerb() >= 5) writeTable(Out::os(), tabs, *A, 6);
  
    double* coeffPtr = (double*) coeff;
    int offset = 0 ;
    const Array<double>& w = W_;

    if (globalCurve().isCurveValid()) /* ----- ACI logic ---- */
    {
      Array<double> quadWeightsTmp = quadWeights_;
      Array<Point> quadPointsTmp = quadPts_; 
      bool isCutByCurve;

      for (int c=0; c<JVol.numCells(); c++, offset+=nNodes())
      {
        Tabs tab2;
        double detJ = fabs(JVol.detJ()[c]);
        int fc = 0;

        SUNDANCE_MSG4(integrationVerb(), tab2 << "c=" << c << " detJ=" << detJ);
        
        /* call the special integration routine */
        quad_.getAdaptedWeights(cellType(), dim(), (*cellLID)[c] , fc ,
          mesh() , globalCurve() , quadPointsTmp , quadWeightsTmp , isCutByCurve );
        if (isCutByCurve)
        {
          Array<double> wi;
          wi.resize(nQuad * nNodes()); //recalculate the special weights
          for (int ii = 0 ; ii < wi.size() ; ii++ ) wi[ii] = 0.0;
          for (int n = 0 ; n < nNodes() ; n++)
          {
            for (int q=0 ; q < quadWeightsTmp.size() ; q++)
            {
              //Indexing: testNode + nNodesTest()*(testDerivDir + nRefDerivTest()*q)
              wi[n + nNodes()*q] +=
                chop(quadWeightsTmp[q] * W_ACI_F1_[q][0][n]);
            }
          }
          // if it is cut by curve then use this vector
          for (int q=0; q<nQuad; q++, coeffPtr++)
          {
            double f = (*coeffPtr)*detJ;
            for (int n=0; n<nNodes(); n++)
            {
              aPtr[offset+n] += f*wi[n + nNodes()*q];
            }
          }
        } // end isCutByCurve
        else 
        {
          for (int q=0; q<nQuad; q++, coeffPtr++)
          {
            double f = (*coeffPtr)*detJ;
            for (int n=0; n<nNodes(); n++)
            {
              aPtr[offset+n] += f*w[n + nNodes()*q];
            }
          }
        }

        if (integrationVerb() >= 4)
        {
          Out::os() << tab2 << "integration results on cell:" << std::endl;
          Out::os() << tab2 << setw(10) << "n" << setw(12) << "I_n" << std::endl;
          for (int n=0; n<nNodes(); n++) 
          {
            Out::os() << tab2 << setw(10) << n 
                      << setw(12) << setprecision(6) << aPtr[offset+n] << std::endl;
          }
        }
      }
    } 
    else /* -------- No ACI -------- */ 
    {
      for (int c=0; c<JVol.numCells(); c++, offset+=nNodes())
      {
        Tabs tab2;
        double detJ = fabs(JVol.detJ()[c]);
        SUNDANCE_MSG4(integrationVerb(), tab2 << "c=" << c << " detJ=" << detJ);

        for (int q=0; q<nQuad; q++, coeffPtr++)
        {
          Tabs tab3;
          double f = (*coeffPtr)*detJ;
          SUNDANCE_MSG4(integrationVerb(), tab3 << "q=" << q << " coeff=" <<
            *coeffPtr << " coeff*detJ=" << f);
          for (int n=0; n<nNodes(); n++)
          {
            Tabs tab4;
            SUNDANCE_MSG4(integrationVerb(), tab4 << "n=" << n << " w=" <<
              w[n + nNodes()*q]);
            aPtr[offset+n] += f*w[n + nNodes()*q];
          }
        }

        if (integrationVerb() >= 4)
        {
          Out::os() << tab2 << "integration results on cell:" << std::endl;
          Out::os() << tab2 << setw(10) << "n" << setw(12) << "I_n" << std::endl;
          for (int n=0; n<nNodes(); n++) 
          {
            Out::os() << tab2 << setw(10) << n 
                      << setw(12) << setprecision(6) << aPtr[offset+n] << std::endl;
          }
        }
        
      }
    }

    SUNDANCE_MSG5(integrationVerb(), tabs << "output A = ");
    if (integrationVerb() >= 5) writeTable(Out::os(), tabs, *A, 6);
  }
  else
  {
    /* If the derivative order is nonzero, then we have to do a transformation. */
    
    createOneFormTransformationMatrix(JTrans, JVol);
    
    SUNDANCE_MSG4(transformVerb(), 
      Tabs() << "transformation matrix=" << G(alpha()));
    
    double* GPtr = &(G(alpha())[0]);      
    
    transformSummingFirst(JVol.numCells(), facetIndex, cellLIDs, GPtr, coeff, A);
  }
  addFlops(flops);
}
void MaximalQuadratureIntegral
::transformZeroForm(const CellJacobianBatch& JTrans,  
  const CellJacobianBatch& JVol,
  const Array<int>& isLocalFlag,
  const Array<int>& facetIndex,
  const RCP<Array<int> >& cellLIDs,
  const double* const coeff,
  RCP<Array<double> >& A) const
{
  TimeMonitor timer(maxCellQuadrature0Timer());
  Tabs tabs;
  SUNDANCE_MSG1(integrationVerb(), tabs << "doing zero form by quadrature");

  TEUCHOS_TEST_FOR_EXCEPTION(order() != 0, std::logic_error,
    "MaximalQuadratureIntegral::transformZeroForm() called "
    "for form of order " << order());

  TEUCHOS_TEST_FOR_EXCEPTION( (int) isLocalFlag.size() != 0 
    && (int) isLocalFlag.size() != JVol.numCells(),
    std::runtime_error,
    "mismatch between isLocalFlag.size()=" << isLocalFlag.size()
    << " and JVol.numCells()=" << JVol.numCells());

  bool checkLocalFlag = (int) isLocalFlag.size() != 0;

  const Array<int>* cellLID = cellLIDs.get();
  int nQuad = quadWeights_.size();


  double& a = (*A)[0];
  SUNDANCE_MSG5(integrationVerb(), tabs << "input A=");
  if (integrationVerb() >= 5) writeTable(Out::os(), tabs, *A, 6);
  double* coeffPtr = (double*) coeff;
  const Array<double>& w = W_;

  if (globalCurve().isCurveValid()) /* ---------- ACI ------------- */
  {
    Array<double> quadWeightsTmp = quadWeights_;
    Array<Point> quadPointsTmp = quadPts_;
    int fc = 0;
    bool isCutByCurve;

    for (int c=0; c<JVol.numCells(); c++)
    {
      if (checkLocalFlag && !isLocalFlag[c]) 
      {
        coeffPtr += nQuad;
        continue;
      }
      double detJ = fabs(JVol.detJ()[c]);
      
      quad_.getAdaptedWeights(cellType(), dim(), (*cellLID)[c], fc ,mesh(),
        globalCurve(), quadPointsTmp, quadWeightsTmp, isCutByCurve);
      /* if we have special weights then do the same as before */
      if (isCutByCurve)
      {
        for (int q=0; q<nQuad; q++, coeffPtr++)
        {
          a += quadWeightsTmp[q]*(*coeffPtr)*detJ;
        }
      } // end cut by curve
      else
      {
        for (int q=0; q<nQuad; q++, coeffPtr++)
        {
          a += w[q]*(*coeffPtr)*detJ;
        }
      }
    }
  }
  else /* --------- No ACI ------------- */
  {
    for (int c=0; c<JVol.numCells(); c++)
    {
      if (checkLocalFlag && !isLocalFlag[c]) 
      {
        coeffPtr += nQuad;
        continue;
      }
      double detJ = fabs(JVol.detJ()[c]);
      
      for (int q=0; q<nQuad; q++, coeffPtr++)
      {
        a += w[q]*(*coeffPtr)*detJ;
      }
    }
  }
  SUNDANCE_MSG5(integrationVerb(), tabs << "output A = ");
  if (integrationVerb() >= 5) writeTable(Out::os(), tabs, *A, 6);

  SUNDANCE_MSG1(integrationVerb(), tabs << "done zero form");
}
void ElementIntegral
::createTwoFormTransformationMatrix(const CellJacobianBatch& JTrans,
                                    const CellJacobianBatch& JVol) const
{
    TimeMonitor timer(transCreationTimer());
    Tabs tab;

    int flops = 0;

    int maxDim = JTrans.cellDim();
    int cellDim = JVol.cellDim();

    if (testDerivOrder() == 1 && unkDerivOrder() == 1)
    {
        Tabs tab2;
        if (transformationMatrixIsValid(alpha(), beta())) return;
        transformationMatrixIsValid(alpha(), beta()) = true;

        G(alpha(), beta()).resize(JTrans.numCells() * JTrans.cellDim() * JTrans.cellDim());

        double* GPtr = &(G(alpha(),beta())[0]);
        int k = 0;

        for (int c=0; c<JTrans.numCells(); c++)
        {
            static Array<double> invJ;
            JTrans.getInvJ(c, invJ);
            double detJ = fabs(JVol.detJ()[c]);
            for (int gamma=0; gamma<maxDim; gamma++)
            {
                for (int delta=0; delta<maxDim; delta++, k++)
                {
                    GPtr[k] =  detJ*invJ[alpha() + gamma*maxDim]
                               * invJ[beta() + maxDim*delta];
                }
            }
        }
        flops = 2 * JTrans.numCells() * maxDim * maxDim + JTrans.numCells();
    }

    else if (testDerivOrder() == 1 && unkDerivOrder() == 0)
    {
        if (transformationMatrixIsValid(alpha())) return;
        transformationMatrixIsValid(alpha()) = true;

        G(alpha()).resize(JTrans.numCells() * JTrans.cellDim());

        int k = 0;
        double* GPtr = &(G(alpha())[0]);

        for (int c=0; c<JTrans.numCells(); c++)
        {
            static Array<double> invJ;
            JTrans.getInvJ(c, invJ);
            double detJ = fabs(JVol.detJ()[c]);
            for (int gamma=0; gamma<maxDim; gamma++,k++)
            {
                GPtr[k] = detJ*invJ[alpha() + maxDim * gamma];
            }
        }
        flops = JTrans.numCells() * maxDim + JTrans.numCells();
    }

    else
    {
        if (transformationMatrixIsValid(beta())) return;
        transformationMatrixIsValid(beta()) = true;

        G(beta()).resize(JTrans.numCells() * JTrans.cellDim());

        int k = 0;
        double* GPtr = &(G(beta())[0]);

        for (int c=0; c<JTrans.numCells(); c++)
        {
            static Array<double> invJ;
            JTrans.getInvJ(c, invJ);
            double detJ = fabs(JVol.detJ()[c]);
            for (int gamma=0; gamma<maxDim; gamma++,k++)
            {
                GPtr[k] = detJ*invJ[beta() + maxDim * gamma];
            }
        }
        flops = JTrans.numCells() * maxDim + JTrans.numCells();
    }

    addFlops(flops);
}
void RefIntegral::transformTwoForm(const CellJacobianBatch& JTrans,
  const CellJacobianBatch& JVol,
  const Array<int>& facetIndex, 
  const RCP<Array<int> >& cellLIDs,
  const double& coeff,
  RCP<Array<double> >& A) const
{
  TimeMonitor timer(ref2IntegrationTimer());
  TEUCHOS_TEST_FOR_EXCEPTION(order() != 2, std::logic_error,
    "RefIntegral::transformTwoForm() called for form "
    "of order " << order());
  
  Tabs tabs;  
  SUNDANCE_MSG1(transformVerb(), tabs << "doing two form by reference");

  int nfc = nFacetCases();

	  SUNDANCE_MSG1(transformVerb(), tabs << "doing two form by reference ... ");
  /* If the derivative orders are zero, the only transformation to be done 
   * is to multiply by the cell's Jacobian determinant */
  if (testDerivOrder() == 0 && unkDerivOrder() == 0)
  {
      if (globalCurve().isCurveValid())
      {     /* ----------- ACI logic ------------ */

    	   Array<double> quadWeightsTmp = quadWeights_;
    	   Array<Point> quadPointsTmp = quadPts_;
    	   bool isCutByCurve = false;

    	   double* aPtr = &((*A)[0]);
    	   int count = 0;
    	   for (int c=0; c<JVol.numCells(); c++)
    	   {
    	     int fc = 0;
    	     if (nFacetCases() != 1) fc = facetIndex[c];

    	     /* ---------- ACI ----------- */
    	     /* call the special integration routine */
    	     quadWeightsTmp = quadWeights_;
    	     quadPointsTmp = quadPts_;
    	     quad_.getAdaptedWeights(cellType(), dim(), (*cellLIDs)[c] , fc ,
    	    		 mesh(), globalCurve(), quadPointsTmp, quadWeightsTmp, isCutByCurve);
    	     if (isCutByCurve){
    	    	 Array<double> w;
    	    	 int ci = 0;
    	    	 w.resize(nNodesTest()*nNodesUnk()); //recalculate the special weights
    	    	 for (int nt = 0 ; nt < nNodesTest() ; nt++)
    	    		 for(int nu=0 ; nu < nNodesUnk() ; nu++ , ci++){
    	    			 w[ci] = 0.0;
    	    			 for (int q=0 ; q < quadWeightsTmp.size() ; q++)
    	    				 w[ci] += chop( quadWeightsTmp[q] * W_ACI_F2_[fc][q][0][nt][0][nu] );
    	    		 }
    	    	 // do the integration here
    	    	 double detJ = coeff * fabs(JVol.detJ()[c]);
    	    	 for (int n=0; n<nNodes(); n++, count++)
    	    	 {
    	    		 aPtr[count] += detJ*w[n];
    	    	 }
    	     }
    	     else
    	     {
    	    	  const Array<double>& w = W_[fc];
    	    	  double detJ = coeff * fabs(JVol.detJ()[c]);
    	    	  for (int n=0; n<nNodes(); n++, count++)
    	    	  {
    	    		  aPtr[count] += detJ*w[n];
    	    	  }
    	     }
    	   }
  	  }
      else        /* ---------- NO ACI logic----------- */
      {
    	  double* aPtr = &((*A)[0]);
    	  int count = 0;
    	  for (int c=0; c<JVol.numCells(); c++)
    	  {
    		  int fc = 0;
    		  if (nFacetCases() != 1) fc = facetIndex[c];

    		  const Array<double>& w = W_[fc];
    		  double detJ = coeff * fabs(JVol.detJ()[c]);
    		  for (int n=0; n<nNodes(); n++, count++)
    		  {
    			  aPtr[count] += detJ*w[n];
    		  }
    	  }
      }
    addFlops(JVol.numCells() * (nNodes() + 1));
  }
  else
  {
    /* If the derivative order is nonzero, then we have to do a transformation. 
     * If we're also on a cell of dimension lower than maximal, we need to refer
     * to the facet index of the facet being integrated. */
    int nCells = JVol.numCells();
    double one = 1.0;
    int nTransRows = nRefDerivUnk()*nRefDerivTest();

    createTwoFormTransformationMatrix(JTrans, JVol);
      
    double* GPtr;
    if (testDerivOrder() == 0)
    {
      GPtr = &(G(beta())[0]);
      SUNDANCE_MSG2(transformVerb(),
        Tabs() << "transformation matrix=" << G(beta()));
    }
    else if (unkDerivOrder() == 0)
    {
      GPtr = &(G(alpha())[0]);
      SUNDANCE_MSG2(transformVerb(),
        Tabs() << "transformation matrix=" << G(alpha()));
    }
    else
    {
      GPtr = &(G(alpha(), beta())[0]);
      SUNDANCE_MSG2(transformVerb(),
        Tabs() << "transformation matrix=" 
        << G(alpha(),beta()));
    }
      
    int nNodes0 = nNodes();

    if (nFacetCases()==1)
    {
      /* if we're on a maximal cell, we can do transformations 
       * for all cells in a single batch. 
       */
      if (globalCurve().isCurveValid())
      {          /* ---------- ACI logic ----------- */

    	 Array<double> quadWeightsTmp = quadWeights_;
    	 Array<Point> quadPointsTmp = quadPts_;
    	 bool isCutByCurve = false;

       	 for (int c=0; c<JVol.numCells(); c++)
       	 {
             int fc = 0;
             if (nfc != 1) fc = facetIndex[c];

             double* aPtr = &((*A)[c*nNodes0]);
             double* gPtr = &(GPtr[c*nTransRows]);
             int oneI = 1;
       		 /* call the special integration routine */
         	//SUNDANCE_MSG1(transformVerb(), tabs << "before quad_.getAdaptedWeights");
         	 quadWeightsTmp = quadWeights_;
             quadPointsTmp = quadPts_;
       		 quad_.getAdaptedWeights(cellType(), dim(), (*cellLIDs)[c], fc ,
             mesh(),globalCurve(), quadPointsTmp, quadWeightsTmp, isCutByCurve);
         	//SUNDANCE_MSG1(transformVerb(), tabs << "after quad_.getAdaptedWeights");
       		 if (isCutByCurve){
       			 Array<double> w;
       			 w.resize(nNodesUnk()*nNodesTest()*nRefDerivUnk()*nRefDerivTest());
       			 for ( int i = 0 ; i < w.size() ; i++) w[i] = 0.0;
       			 //recalculate the special weights
       		     for (int t=0; t<nRefDerivTest(); t++){
       		        for (int nt=0; nt<nNodesTest(); nt++)
       		          for (int u=0; u<nRefDerivUnk(); u++)
       		            for (int nu=0; nu<nNodesUnk(); nu++)
       		            	for (int q=0 ; q < quadWeightsTmp.size() ; q++)
       		                // unkNode + nNodesUnk()*testNode  + nNodes()*(unkDerivDir + nRefDerivUnk()*testDerivDir)
       		                    w[nu + nNodesUnk()*nt  + nNodes()*(u + nRefDerivUnk()*t)] +=
       		                    		chop(quadWeightsTmp[q]*W_ACI_F2_[0][q][t][nt][u][nu]);
       		     }
      		      ::dgemm_("N", "N", &nNodes0, &oneI , &nTransRows, &coeff, &(w[0]),
      		        &nNodes0, &(gPtr[0]), &nTransRows, &one,
      		        &(aPtr[0]), &nNodes0);
       		  }else{
       		     ::dgemm_("N", "N", &nNodes0, &oneI , &nTransRows, &coeff, &(W_[0][0]),
       		        &nNodes0, &(gPtr[0]), &nTransRows, &one,
       		        &(aPtr[0]), &nNodes0);
       		  }
       	 } // end from the for loop over the cells
      }
      else /* ---------- NO ACI ----------- */
      {
        	 ::dgemm_("N", "N", &nNodes0, &nCells, &nTransRows, &coeff, &(W_[0][0]),
        		 &nNodes0, GPtr, &nTransRows, &one,
        		 &((*A)[0]), &nNodes0);
      }
    }
    else
    {
      /* If we're on a lower-dimensional cell and have to transform, 
       * we've got to do each transformation using a different facet case */
        if (globalCurve().isCurveValid())
        {   /* ---------- ACI logic ----------- */
            int oneI = 1;
            Array<double> quadWeightsTmp = quadWeights_;
            Array<Point> quadPointsTmp = quadPts_;
            bool isCutByCurve = false;

            for (int c=0; c<JVol.numCells(); c++)
            {
              int fc = 0;
              if (nfc != 1) fc = facetIndex[c];
              double* aPtr = &((*A)[c*nNodes0]);
              double* gPtr = &(GPtr[c*nTransRows]);
              SUNDANCE_MSG2(integrationVerb(),
                tabs << "c=" << c << ", facet case=" << fc
                << " W=" << W_[fc]);

              /* call the special integration routine */
              quadWeightsTmp = quadWeights_;
              quadPointsTmp = quadPts_;
              quad_.getAdaptedWeights(cellType(), dim(), (*cellLIDs)[c], fc ,
            		  mesh(), globalCurve(), quadPointsTmp, quadWeightsTmp, isCutByCurve);
              if (isCutByCurve){
            	  Array<double> w;
            	  w.resize(nNodesUnk()*nNodesTest()*nRefDerivUnk()*nRefDerivTest());
            	  for ( int i = 0 ; i < w.size() ; i++) w[i] = 0.0;
            	  //recalculate the special weights
            	  for (int t=0; t<nRefDerivTest(); t++){
            		  for (int nt=0; nt<nNodesTest(); nt++)
            			  for (int u=0; u<nRefDerivUnk(); u++)
            				  for (int nu=0; nu<nNodesUnk(); nu++)
            					  for (int q=0 ; q < quadWeightsTmp.size() ; q++)
            						  // unkNode + nNodesUnk()*testNode  + nNodes()*(unkDerivDir + nRefDerivUnk()*testDerivDir)
            						  w[nu + nNodesUnk()*nt  + nNodes()*(u + nRefDerivUnk()*t)] +=
            								  chop( quadWeightsTmp[q]*W_ACI_F2_[fc][q][t][nt][u][nu] );
            	  }
            	  ::dgemm_("N", "N", &nNodes0, &oneI , &nTransRows, &coeff, &(w[0]),
            			  &nNodes0, &(gPtr[0]), &nTransRows, &one,
            			  &(aPtr[0]), &nNodes0);
				  }else{
					  ::dgemm_("N", "N", &nNodes0, &oneI , &nTransRows, &coeff, &(W_[fc][0]),
							  &nNodes0, &(gPtr[0]), &nTransRows, &one,
							  &(aPtr[0]), &nNodes0);
				  }
            }
        }
        else         /* ---------- NO ACI ----------- */
        {
            int N = 1;
            for (int c=0; c<JVol.numCells(); c++)
            {
              int fc = 0;
              if (nfc != 1) fc = facetIndex[c];
              double* aPtr = &((*A)[c*nNodes0]);
              double* gPtr = &(GPtr[c*nTransRows]);
              SUNDANCE_MSG2(integrationVerb(),
                tabs << "c=" << c << ", facet case=" << fc
                << " W=" << W_[fc]);

              ::dgemm_("N", "N", &nNodes0, &N, &nTransRows, &coeff, &(W_[fc][0]),
            		  &nNodes0, gPtr, &nTransRows, &one,
            		  aPtr, &nNodes0);
            }
        }
    }// from else of (nFacetCases()==1)
      
    addFlops(2 * nNodes0 * nCells * nTransRows);
  }
}
void RefIntegral::transformZeroForm(const CellJacobianBatch& JVol,
  const Array<int>& isLocalFlag,  
  const RCP<Array<int> >& cellLIDs,
  const double& coeff,
  RCP<Array<double> >& A) const
{
  TimeMonitor timer(ref0IntegrationTimer());

  TEUCHOS_TEST_FOR_EXCEPTION(order() != 0, std::logic_error,
    "RefIntegral::transformZeroForm() called "
    "for form of order " << order());

  Tabs tabs;  
  SUNDANCE_MSG1(integrationVerb(), tabs << "doing zero form by reference");

  double& a = (*A)[0];
  int flops = 0;
  const Array<int>* cellLID = cellLIDs.get();

  /* if we don't need to check whether elements are local, we
   * can streamline the loop. This will be the case when
   * we are evaluating a functional but not its gradient */
  double w = coeff * W_[0][0];
  if ((int) isLocalFlag.size()==0)
  {
     	if (globalCurve().isCurveValid())
     	{     /* ---------- ACI logic ----------- */

     		Array<double> quadWeightsTmp = quadWeights_;
     		Array<Point> quadPointsTmp = quadPts_;
     		bool isCutByCurve;

     		for (int c=0; c<JVol.numCells(); c++)
     		{
     			int fc = 0;
   				/* call the special integration routine */
   				quadWeightsTmp = quadWeights_;
   				quadPointsTmp = quadPts_;
   				quad_.getAdaptedWeights(cellType(), dim(), (*cellLID)[c], fc ,mesh(),
   						globalCurve(), quadPointsTmp, quadWeightsTmp, isCutByCurve);
   				/* if we have special weights then do the same as before */
   				if (isCutByCurve){
   					double sumweights = 0;
   					for (int j=0; j < quadWeightsTmp.size(); j++) sumweights += chop(quadWeightsTmp[j]);
   					flops+=3+quadWeightsTmp.size();  //Todo: the curve stuff not counted
   					a += coeff * sumweights * fabs(JVol.detJ()[c]);
   				} else {
   					flops+=2;  //Todo: the curve stuff not counted
   					a += w * fabs(JVol.detJ()[c]);
   				}
     		}
     	}
     	else /* -------- NO ACI logic ------- */
     	{
     		for (int c=0; c<JVol.numCells(); c++)
     		{
 				flops+=2;
 				a += w * fabs(JVol.detJ()[c]);
     		}
     	}

  }
  else
  {
    TEUCHOS_TEST_FOR_EXCEPTION( (int) isLocalFlag.size() != JVol.numCells(),
      std::runtime_error,
      "mismatch between isLocalFlag.size()=" 
      << isLocalFlag.size()
      << " and JVol.numCells()=" << JVol.numCells());

      int fc = 0;
      if (globalCurve().isCurveValid())
      {   /* ---------- ACI logic ----------- */
    		Array<double> quadWeightsTmp = quadWeights_;
     		Array<Point> quadPointsTmp = quadPts_;
     		bool isCutByCurve;

    		for (int c=0; c<JVol.numCells(); c++)
    		{
    		  if (isLocalFlag[c])
    		  {

    			/* call the special integration routine */
    			quadWeightsTmp = quadWeights_;
    			quadPointsTmp = quadPts_;
    			quad_.getAdaptedWeights(cellType(), dim(), (*cellLID)[c], fc , mesh(),
    					globalCurve(), quadPointsTmp, quadWeightsTmp, isCutByCurve);
    			/* if we do not have special weights then do the same as before */
    			if (isCutByCurve){
    				double sumweights = 0;
    				for (int j=0; j < quadWeightsTmp.size(); j++) sumweights += chop(quadWeightsTmp[j]);
    				flops+=3+quadWeightsTmp.size();  //Todo: the curve stuff not counted
    				a += coeff * sumweights * fabs(JVol.detJ()[c]);
    			} else {
    				flops+=2;  //Todo: the curve stuff not counted
    				a += w * fabs(JVol.detJ()[c]);
    			}
    		  }
    		}
    	}
        else         /* ---------- NO ACI logic ----------- */
    	{
    		for (int c=0; c<JVol.numCells(); c++)
    		{
      		  if (isLocalFlag[c])
      		  {
    			flops+=2;
    			a += w * fabs(JVol.detJ()[c]);
      		  }
    		}
    	}
  }
  addFlops(flops);
}