//==============================================================================
int Ifpack_Chebyshev::
ApplyInverse(const Epetra_MultiVector& X, Epetra_MultiVector& Y) const
{
  
  if (!IsComputed())
    IFPACK_CHK_ERR(-3);

  if (PolyDegree_ == 0)
    return 0;

  int nVec = X.NumVectors();
  int len = X.MyLength();
  if (nVec != Y.NumVectors())
    IFPACK_CHK_ERR(-2);

  Time_->ResetStartTime();

  // AztecOO gives X and Y pointing to the same memory location,
  // need to create an auxiliary vector, Xcopy
  Teuchos::RefCountPtr<const Epetra_MultiVector> Xcopy;
  if (X.Pointers()[0] == Y.Pointers()[0])
    Xcopy = Teuchos::rcp( new Epetra_MultiVector(X) );
  else
    Xcopy = Teuchos::rcp( &X, false );

  double **xPtr = 0, **yPtr = 0;
  Xcopy->ExtractView(&xPtr);
  Y.ExtractView(&yPtr);

#ifdef HAVE_IFPACK_EPETRAEXT
  EpetraExt_PointToBlockDiagPermute* IBD=0;
  if (UseBlockMode_) IBD=&*InvBlockDiagonal_;
#endif
  

  //--- Do a quick solve when the matrix is identity
  double *invDiag=0;
  if(!UseBlockMode_) invDiag=InvDiagonal_->Values();
  if ((LambdaMin_ == 1.0) && (LambdaMax_ == LambdaMin_)) {
#ifdef HAVE_IFPACK_EPETRAEXT
    if(UseBlockMode_) IBD->ApplyInverse(*Xcopy,Y);
    else
#endif
    if (nVec == 1) {
      double *yPointer = yPtr[0], *xPointer = xPtr[0];
      for (int i = 0; i < len; ++i)
        yPointer[i] = xPointer[i]*invDiag[i];
    }
    else {
      int i, k;
      for (i = 0; i < len; ++i) {
        double coeff = invDiag[i];
        for (k = 0; k < nVec; ++k)
          yPtr[k][i] = xPtr[k][i] * coeff;
      }
    } // if (nVec == 1)
    return 0;
  } // if ((LambdaMin_ == 1.0) && (LambdaMax_ == LambdaMin_))

  //--- Initialize coefficients
  // Note that delta stores the inverse of ML_Cheby::delta
  double alpha = LambdaMax_ / EigRatio_;
  double beta = 1.1 * LambdaMax_;
  double delta = 2.0 / (beta - alpha);
  double theta = 0.5 * (beta + alpha);
  double s1 = theta * delta;

  //--- Define vectors
  // In ML_Cheby, V corresponds to pAux and W to dk
  Epetra_MultiVector V(X);
  Epetra_MultiVector W(X);
#ifdef HAVE_IFPACK_EPETRAEXT
  Epetra_MultiVector Temp(X);
#endif
  
  double *vPointer = V.Values(), *wPointer = W.Values();

  double oneOverTheta = 1.0/theta;
  int i, j, k;


  //--- If solving normal equations, multiply RHS by A^T
  if(SolveNormalEquations_){
    Apply_Transpose(Operator_,Y,V);
    Y=V;
  }

  // Do the smoothing when block scaling is turned OFF
  // --- Treat the initial guess
  if (ZeroStartingSolution_ == false) {
    Operator_->Apply(Y, V);
    // Compute W = invDiag * ( X - V )/ Theta
#ifdef HAVE_IFPACK_EPETRAEXT    
    if(UseBlockMode_) {
      Temp.Update(oneOverTheta,X,-oneOverTheta,V,0.0);
      IBD->ApplyInverse(Temp,W);

      // Perform additional matvecs for normal equations
      // CMS: Testing this only in block mode FOR NOW
      if(SolveNormalEquations_){
	IBD->ApplyInverse(W,Temp);
	Apply_Transpose(Operator_,Temp,W);
      }
    }
    else
#endif
    if (nVec == 1) {
      double *xPointer = xPtr[0];
      for (i = 0; i < len; ++i)
        wPointer[i] = invDiag[i] * (xPointer[i] - vPointer[i]) * oneOverTheta;
    }
    else {
      for (i = 0; i < len; ++i) {
        double coeff = invDiag[i]*oneOverTheta;
        double *wi = wPointer + i, *vi = vPointer + i;
        for (k = 0; k < nVec; ++k) {
          *wi = (xPtr[k][i] - (*vi)) * coeff;
          wi = wi + len; vi = vi + len;
        }
      }
    } // if (nVec == 1)
    // Update the vector Y
    Y.Update(1.0, W, 1.0);
  }
  else {
    // Compute W = invDiag * X / Theta
#ifdef HAVE_IFPACK_EPETRAEXT    
    if(UseBlockMode_) {
      IBD->ApplyInverse(X,W);

      // Perform additional matvecs for normal equations
      // CMS: Testing this only in block mode FOR NOW
      if(SolveNormalEquations_){
	IBD->ApplyInverse(W,Temp);
	Apply_Transpose(Operator_,Temp,W);
      }

      W.Scale(oneOverTheta);
      Y.Update(1.0, W, 0.0);      
    }
    else
#endif
    if (nVec == 1) {
      double *xPointer = xPtr[0];
      for (i = 0; i < len; ++i){
        wPointer[i] = invDiag[i] * xPointer[i] * oneOverTheta;
      }
      memcpy(yPtr[0], wPointer, len*sizeof(double));
    }
    else {
      for (i = 0; i < len; ++i) {
        double coeff = invDiag[i]*oneOverTheta;
        double *wi = wPointer + i;
        for (k = 0; k < nVec; ++k) {
          *wi = xPtr[k][i] * coeff;
          wi = wi + len;
        }
      }
      for (k = 0; k < nVec; ++k)
        memcpy(yPtr[k], wPointer + k*len, len*sizeof(double));
    } // if (nVec == 1)
  } // if (ZeroStartingSolution_ == false)
  
  //--- Apply the polynomial
  double rhok = 1.0/s1, rhokp1;
  double dtemp1, dtemp2;
  int degreeMinusOne = PolyDegree_ - 1;
  if (nVec == 1) {
    double *xPointer = xPtr[0];
    for (k = 0; k < degreeMinusOne; ++k) {
      Operator_->Apply(Y, V);
      rhokp1 = 1.0 / (2.0*s1 - rhok);
      dtemp1 = rhokp1 * rhok;
      dtemp2 = 2.0 * rhokp1 * delta;
      rhok = rhokp1;
      // Compute W = dtemp1 * W
      W.Scale(dtemp1);
      // Compute W = W + dtemp2 * invDiag * ( X - V )
#ifdef HAVE_IFPACK_EPETRAEXT    
    if(UseBlockMode_) {
      //NTS: We can clobber V since it will be reset in the Apply
      V.Update(dtemp2,X,-dtemp2);
      IBD->ApplyInverse(V,Temp);

      // Perform additional matvecs for normal equations
      // CMS: Testing this only in block mode FOR NOW
      if(SolveNormalEquations_){
	IBD->ApplyInverse(V,Temp);
	Apply_Transpose(Operator_,Temp,V);
      }

      W.Update(1.0,Temp,1.0);
    }
    else{
#endif
      for (i = 0; i < len; ++i)
        wPointer[i] += dtemp2* invDiag[i] * (xPointer[i] - vPointer[i]);
#ifdef HAVE_IFPACK_EPETRAEXT
    }
#endif

      // Update the vector Y
      Y.Update(1.0, W, 1.0);
    } // for (k = 0; k < degreeMinusOne; ++k)
  }
  else {
    for (k = 0; k < degreeMinusOne; ++k) {
      Operator_->Apply(Y, V);
      rhokp1 = 1.0 / (2.0*s1 - rhok);
      dtemp1 = rhokp1 * rhok;
      dtemp2 = 2.0 * rhokp1 * delta;
      rhok = rhokp1;
      // Compute W = dtemp1 * W
      W.Scale(dtemp1);
      // Compute W = W + dtemp2 * invDiag * ( X - V )
#ifdef HAVE_IFPACK_EPETRAEXT    
    if(UseBlockMode_) {
      //We can clobber V since it will be reset in the Apply
      V.Update(dtemp2,X,-dtemp2);
      IBD->ApplyInverse(V,Temp);

      // Perform additional matvecs for normal equations
      // CMS: Testing this only in block mode FOR NOW
      if(SolveNormalEquations_){
	IBD->ApplyInverse(V,Temp);
	Apply_Transpose(Operator_,Temp,V);
      }


      W.Update(1.0,Temp,1.0);
    }
    else{
#endif
      for (i = 0; i < len; ++i) {
        double coeff = invDiag[i]*dtemp2;
        double *wi = wPointer + i, *vi = vPointer + i;
        for (j = 0; j < nVec; ++j) {
          *wi += (xPtr[j][i] - (*vi)) * coeff;
          wi = wi + len; vi = vi + len;
        }
      }
#ifdef HAVE_IFPACK_EPETRAEXT
    }
#endif      
      // Update the vector Y
      Y.Update(1.0, W, 1.0);
    } // for (k = 0; k < degreeMinusOne; ++k)
  } // if (nVec == 1)

  
  // Flops are updated in each of the following. 
  ++NumApplyInverse_;
  ApplyInverseTime_ += Time_->ElapsedTime();
  return(0);
}
int Ifpack_SORa::ApplyInverse(const Epetra_MultiVector& X, Epetra_MultiVector& Y) const{  
  if(!IsComputed_) return -1;
  Time_.ResetStartTime();
  bool initial_guess_is_zero=false;  
  int NumMyRows=W_->NumMyRows();
  int NumVectors = X.NumVectors();
  Epetra_MultiVector Temp(A_->RowMatrixRowMap(),NumVectors);

  double omega=GetOmega();

  // need to create an auxiliary vector, Xcopy
  Teuchos::RefCountPtr<const Epetra_MultiVector> Xcopy;
  if (X.Pointers()[0] == Y.Pointers()[0]){
    Xcopy = Teuchos::rcp( new Epetra_MultiVector(X) );
    // Since the user didn't give us anything better, our initial guess is zero.
    Y.Scale(0.0);
    initial_guess_is_zero=true;
  }
  else
    Xcopy = Teuchos::rcp( &X, false ); 

  Teuchos::RefCountPtr< Epetra_MultiVector > T2;
  // Note: Assuming that the matrix has an importer.  Ifpack_PointRelaxation doesn't do this, but given that 
  // I have a CrsMatrix, I'm probably OK.
  // Note: This is the lazy man's version sacrificing a few extra flops for avoiding if statements to determine 
  // if things are on or off processor.
  // Note: T2 must be zero'd out
  if (IsParallel_ && W_->Importer())  T2 = Teuchos::rcp( new Epetra_MultiVector(W_->Importer()->TargetMap(),NumVectors,true));
  else T2 = Teuchos::rcp( new Epetra_MultiVector(A_->RowMatrixRowMap(),NumVectors,true));

  // Pointer grabs
  int* rowptr,*colind;
  double *values;
  double **t_ptr,** y_ptr, ** t2_ptr, **x_ptr,*d_ptr;
  T2->ExtractView(&t2_ptr);
  Y.ExtractView(&y_ptr);
  Temp.ExtractView(&t_ptr);
  Xcopy->ExtractView(&x_ptr);
  Wdiag_->ExtractView(&d_ptr);
  IFPACK_CHK_ERR(W_->ExtractCrsDataPointers(rowptr,colind,values));


  for(int i=0; i<NumSweeps_; i++){
    // Calculate b-Ax 
    if(!initial_guess_is_zero  || i > 0) {      
      A_->Apply(Y,Temp);
      Temp.Update(1.0,*Xcopy,-1.0);
    }
    else 
      Temp.Update(1.0,*Xcopy,0.0);

    // Note: The off-processor entries of T2 never get touched (they're always zero) and the other entries are updated 
    // in this sweep before they are used, so we don't need to reset T2 to zero here.

    // Do backsolve & update
    // x = x  + W^{-1} (b - A x)
    for(int j=0; j<NumMyRows; j++){
      double diag=d_ptr[j];
      for (int m=0 ; m<NumVectors; m++) {
	double dtmp=0.0;
	// Note: Since the diagonal is in the matrix, we need to zero that entry of T2 here to make sure it doesn't contribute.
	t2_ptr[m][j]=0.0;
	for(int k=rowptr[j];k<rowptr[j+1];k++){
	  dtmp+= values[k]*t2_ptr[m][colind[k]];
	}
	// Yes, we need to update both of these.
	t2_ptr[m][j] = (t_ptr[m][j]- dtmp)/diag;     
	y_ptr[m][j] += omega*t2_ptr[m][j];
      }
    }
  }

  // Counter update
  NumApplyInverse_++;
  ApplyInverseTime_ += Time_.ElapsedTime();
  return 0;
}