Пример #1
0
/*
   PetscSplitReductionApply - Actually do the communication required for a split phase reduction
*/
static PetscErrorCode PetscSplitReductionApply(PetscSplitReduction *sr)
{
  PetscErrorCode ierr;
  PetscInt       i,numops = sr->numopsbegin,*reducetype = sr->reducetype;
  PetscScalar    *lvalues = sr->lvalues,*gvalues = sr->gvalues;
  PetscInt       sum_flg  = 0,max_flg = 0, min_flg = 0;
  MPI_Comm       comm     = sr->comm;
  PetscMPIInt    size,cmul = sizeof(PetscScalar)/sizeof(PetscReal);

  PetscFunctionBegin;
  if (sr->numopsend > 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Cannot call this after VecxxxEnd() has been called");
  ierr = PetscLogEventBarrierBegin(VEC_ReduceBarrier,0,0,0,0,comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(sr->comm,&size);CHKERRQ(ierr);
  if (size == 1) {
    ierr = PetscMemcpy(gvalues,lvalues,numops*sizeof(PetscScalar));CHKERRQ(ierr);
  } else {
    /* determine if all reductions are sum, max, or min */
    for (i=0; i<numops; i++) {
      if      (reducetype[i] == REDUCE_MAX) max_flg = 1;
      else if (reducetype[i] == REDUCE_SUM) sum_flg = 1;
      else if (reducetype[i] == REDUCE_MIN) min_flg = 1;
      else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Error in PetscSplitReduction() data structure, probably memory corruption");
    }
    if (sum_flg + max_flg + min_flg > 1) {
      /*
         after all the entires in lvalues we store the reducetype flags to indicate
         to the reduction operations what are sums and what are max
      */
      for (i=0; i<numops; i++) lvalues[numops+i] = reducetype[i];
      ierr = MPIU_Allreduce(lvalues,gvalues,2*numops,MPIU_SCALAR,PetscSplitReduction_Op,comm);CHKERRQ(ierr);
    } else if (max_flg) {     /* Compute max of real and imag parts separately, presumably only the real part is used */
      ierr = MPIU_Allreduce((PetscReal*)lvalues,(PetscReal*)gvalues,cmul*numops,MPIU_REAL,MPIU_MAX,comm);CHKERRQ(ierr);
    } else if (min_flg) {
      ierr = MPIU_Allreduce((PetscReal*)lvalues,(PetscReal*)gvalues,cmul*numops,MPIU_REAL,MPIU_MIN,comm);CHKERRQ(ierr);
    } else {
      ierr = MPIU_Allreduce(lvalues,gvalues,numops,MPIU_SCALAR,MPIU_SUM,comm);CHKERRQ(ierr);
    }
  }
  sr->state     = STATE_END;
  sr->numopsend = 0;
  ierr = PetscLogEventBarrierEnd(VEC_ReduceBarrier,0,0,0,0,comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
Пример #2
0
/*
   PetscSplitReductionApply - Actually do the communication required for a split phase reduction
*/
static PetscErrorCode PetscSplitReductionApply(PetscSplitReduction *sr)
{
  PetscErrorCode ierr;
  PetscInt       i,numops = sr->numopsbegin,*reducetype = sr->reducetype;
  PetscScalar    *lvalues = sr->lvalues,*gvalues = sr->gvalues;
  PetscInt       sum_flg = 0,max_flg = 0, min_flg = 0;
  MPI_Comm       comm = sr->comm;
  PetscMPIInt    size;

  PetscFunctionBegin;
  if (sr->numopsend > 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Cannot call this after VecxxxEnd() has been called");
  ierr = PetscLogEventBarrierBegin(VEC_ReduceBarrier,0,0,0,0,comm);CHKERRQ(ierr);
  ierr  = MPI_Comm_size(sr->comm,&size);CHKERRQ(ierr); 
  if (size == 1) {
    ierr = PetscMemcpy(gvalues,lvalues,numops*sizeof(PetscScalar));CHKERRQ(ierr);
  } else {
    /* determine if all reductions are sum, max, or min */
    for (i=0; i<numops; i++) {
      if (reducetype[i] == REDUCE_MAX) {
        max_flg = 1;
      } else if (reducetype[i] == REDUCE_SUM) {
        sum_flg = 1;
      } else if (reducetype[i] == REDUCE_MIN) {
        min_flg = 1;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Error in PetscSplitReduction() data structure, probably memory corruption");
    }
    if (sum_flg + max_flg + min_flg > 1) {
      /* 
         after all the entires in lvalues we store the reducetype flags to indicate
         to the reduction operations what are sums and what are max
      */
      for (i=0; i<numops; i++) {
        lvalues[numops+i] = reducetype[i];
      }
#if defined(PETSC_USE_COMPLEX)
      ierr = MPI_Allreduce(lvalues,gvalues,2*2*numops,MPIU_REAL,PetscSplitReduction_Op,comm);CHKERRQ(ierr);
#else
      ierr = MPI_Allreduce(lvalues,gvalues,2*numops,MPIU_REAL,PetscSplitReduction_Op,comm);CHKERRQ(ierr);
#endif
    } else if (max_flg) {
#if defined(PETSC_USE_COMPLEX)
      /* 
        complex case we max both the real and imaginary parts, the imaginary part
        is just ignored later
      */
      ierr = MPI_Allreduce(lvalues,gvalues,2*numops,MPIU_REAL,MPIU_MAX,comm);CHKERRQ(ierr);
#else
      ierr = MPI_Allreduce(lvalues,gvalues,numops,MPIU_REAL,MPIU_MAX,comm);CHKERRQ(ierr);
#endif
    } else if (min_flg) {
#if defined(PETSC_USE_COMPLEX)
      /* 
        complex case we min both the real and imaginary parts, the imaginary part
        is just ignored later
      */
      ierr = MPI_Allreduce(lvalues,gvalues,2*numops,MPIU_REAL,MPIU_MIN,comm);CHKERRQ(ierr);
#else
      ierr = MPI_Allreduce(lvalues,gvalues,numops,MPIU_REAL,MPIU_MIN,comm);CHKERRQ(ierr);
#endif
    } else {
      ierr = MPI_Allreduce(lvalues,gvalues,numops,MPIU_SCALAR,MPIU_SUM,comm);CHKERRQ(ierr);
    }
  }
  sr->state     = STATE_END;
  sr->numopsend = 0;
  ierr = PetscLogEventBarrierEnd(VEC_ReduceBarrier,0,0,0,0,comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
Пример #3
0
PetscErrorCode  KSPSolve_FBCGSR(KSP ksp)
{
    PetscErrorCode    ierr;
    PetscInt          i,j,N;
    PetscScalar       tau,sigma,alpha,omega,beta;
    PetscReal         rho;
    PetscScalar       xi1,xi2,xi3,xi4;
    Vec               X,B,P,P2,RP,R,V,S,T,S2;
    PetscScalar       *PETSC_RESTRICT rp, *PETSC_RESTRICT r, *PETSC_RESTRICT p;
    PetscScalar       *PETSC_RESTRICT v, *PETSC_RESTRICT s, *PETSC_RESTRICT t, *PETSC_RESTRICT s2;
    PetscScalar       insums[4],outsums[4];
    KSP_BCGS          *bcgs = (KSP_BCGS*)ksp->data;
    PC                pc;

    PetscFunctionBegin;
    if (!ksp->vec_rhs->petscnative) SETERRQ(PetscObjectComm((PetscObject)ksp),PETSC_ERR_SUP,"Only coded for PETSc vectors");
    ierr = VecGetLocalSize(ksp->vec_sol,&N);
    CHKERRQ(ierr);

    X  = ksp->vec_sol;
    B  = ksp->vec_rhs;
    P2 = ksp->work[0];

    /* The followings are involved in modified inner product calculations and vector updates */
    RP = ksp->work[1];
    ierr = VecGetArray(RP,(PetscScalar**)&rp);
    CHKERRQ(ierr);
    ierr = VecRestoreArray(RP,NULL);
    CHKERRQ(ierr);
    R  = ksp->work[2];
    ierr = VecGetArray(R,(PetscScalar**)&r);
    CHKERRQ(ierr);
    ierr = VecRestoreArray(R,NULL);
    CHKERRQ(ierr);
    P  = ksp->work[3];
    ierr = VecGetArray(P,(PetscScalar**)&p);
    CHKERRQ(ierr);
    ierr = VecRestoreArray(P,NULL);
    CHKERRQ(ierr);
    V  = ksp->work[4];
    ierr = VecGetArray(V,(PetscScalar**)&v);
    CHKERRQ(ierr);
    ierr = VecRestoreArray(V,NULL);
    CHKERRQ(ierr);
    S  = ksp->work[5];
    ierr = VecGetArray(S,(PetscScalar**)&s);
    CHKERRQ(ierr);
    ierr = VecRestoreArray(S,NULL);
    CHKERRQ(ierr);
    T  = ksp->work[6];
    ierr = VecGetArray(T,(PetscScalar**)&t);
    CHKERRQ(ierr);
    ierr = VecRestoreArray(T,NULL);
    CHKERRQ(ierr);
    S2 = ksp->work[7];
    ierr = VecGetArray(S2,(PetscScalar**)&s2);
    CHKERRQ(ierr);
    ierr = VecRestoreArray(S2,NULL);
    CHKERRQ(ierr);

    /* Only supports right preconditioning */
    if (ksp->pc_side != PC_RIGHT) SETERRQ1(PetscObjectComm((PetscObject)ksp),PETSC_ERR_SUP,"KSP fbcgsr does not support %s",PCSides[ksp->pc_side]);
    if (!ksp->guess_zero) {
        if (!bcgs->guess) {
            ierr = VecDuplicate(X,&bcgs->guess);
            CHKERRQ(ierr);
        }
        ierr = VecCopy(X,bcgs->guess);
        CHKERRQ(ierr);
    } else {
        ierr = VecSet(X,0.0);
        CHKERRQ(ierr);
    }

    /* Compute initial residual */
    ierr = KSPGetPC(ksp,&pc);
    CHKERRQ(ierr);
    ierr = PCSetUp(pc);
    CHKERRQ(ierr);
    if (!ksp->guess_zero) {
        ierr = MatMult(pc->mat,X,P2);
        CHKERRQ(ierr); /* P2 is used as temporary storage */
        ierr = VecCopy(B,R);
        CHKERRQ(ierr);
        ierr = VecAXPY(R,-1.0,P2);
        CHKERRQ(ierr);
    } else {
        ierr = VecCopy(B,R);
        CHKERRQ(ierr);
    }

    /* Test for nothing to do */
    if (ksp->normtype != KSP_NORM_NONE) {
        ierr = VecNorm(R,NORM_2,&rho);
        CHKERRQ(ierr);
    }
    ierr       = PetscObjectSAWsTakeAccess((PetscObject)ksp);
    CHKERRQ(ierr);
    ksp->its   = 0;
    ksp->rnorm = rho;
    ierr = PetscObjectSAWsGrantAccess((PetscObject)ksp);
    CHKERRQ(ierr);
    ierr = KSPLogResidualHistory(ksp,rho);
    CHKERRQ(ierr);
    ierr = KSPMonitor(ksp,0,rho);
    CHKERRQ(ierr);
    ierr = (*ksp->converged)(ksp,0,rho,&ksp->reason,ksp->cnvP);
    CHKERRQ(ierr);
    if (ksp->reason) PetscFunctionReturn(0);

    /* Initialize iterates */
    ierr = VecCopy(R,RP);
    CHKERRQ(ierr); /* rp <- r */
    ierr = VecCopy(R,P);
    CHKERRQ(ierr); /* p <- r */

    /* Big loop */
    for (i=0; i<ksp->max_it; i++) {

        /* matmult and pc */
        ierr = PCApply(pc,P,P2);
        CHKERRQ(ierr); /* p2 <- K p */
        ierr = MatMult(pc->mat,P2,V);
        CHKERRQ(ierr); /* v <- A p2 */

        /* inner prodcuts */
        if (i==0) {
            tau  = rho*rho;
            ierr = VecDot(V,RP,&sigma);
            CHKERRQ(ierr); /* sigma <- (v,rp) */
        } else {
            ierr = PetscLogEventBegin(VEC_ReduceArithmetic,0,0,0,0);
            CHKERRQ(ierr);
            tau  = sigma = 0.0;
            for (j=0; j<N; j++) {
                tau   += r[j]*rp[j]; /* tau <- (r,rp) */
                sigma += v[j]*rp[j]; /* sigma <- (v,rp) */
            }
            PetscLogFlops(4.0*N);
            ierr      = PetscLogEventEnd(VEC_ReduceArithmetic,0,0,0,0);
            CHKERRQ(ierr);
            insums[0] = tau;
            insums[1] = sigma;
            ierr      = PetscLogEventBarrierBegin(VEC_ReduceBarrier,0,0,0,0,PetscObjectComm((PetscObject)ksp));
            CHKERRQ(ierr);
            ierr      = MPI_Allreduce(insums,outsums,2,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)ksp));
            CHKERRQ(ierr);
            ierr      = PetscLogEventBarrierEnd(VEC_ReduceBarrier,0,0,0,0,PetscObjectComm((PetscObject)ksp));
            CHKERRQ(ierr);
            tau       = outsums[0];
            sigma     = outsums[1];
        }

        /* scalar update */
        alpha = tau / sigma;

        /* vector update */
        ierr = VecWAXPY(S,-alpha,V,R);
        CHKERRQ(ierr);  /* s <- r - alpha v */

        /* matmult and pc */
        ierr = PCApply(pc,S,S2);
        CHKERRQ(ierr); /* s2 <- K s */
        ierr = MatMult(pc->mat,S2,T);
        CHKERRQ(ierr); /* t <- A s2 */

        /* inner prodcuts */
        ierr = PetscLogEventBegin(VEC_ReduceArithmetic,0,0,0,0);
        CHKERRQ(ierr);
        xi1  = xi2 = xi3 = xi4 = 0.0;
        for (j=0; j<N; j++) {
            xi1 += s[j]*s[j]; /* xi1 <- (s,s) */
            xi2 += t[j]*s[j]; /* xi2 <- (t,s) */
            xi3 += t[j]*t[j]; /* xi3 <- (t,t) */
            xi4 += t[j]*rp[j]; /* xi4 <- (t,rp) */
        }
        PetscLogFlops(8.0*N);
        ierr = PetscLogEventEnd(VEC_ReduceArithmetic,0,0,0,0);
        CHKERRQ(ierr);

        insums[0] = xi1;
        insums[1] = xi2;
        insums[2] = xi3;
        insums[3] = xi4;

        ierr = PetscLogEventBarrierBegin(VEC_ReduceBarrier,0,0,0,0,PetscObjectComm((PetscObject)ksp));
        CHKERRQ(ierr);
        ierr = MPI_Allreduce(insums,outsums,4,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)ksp));
        CHKERRQ(ierr);
        ierr = PetscLogEventBarrierEnd(VEC_ReduceBarrier,0,0,0,0,PetscObjectComm((PetscObject)ksp));
        CHKERRQ(ierr);
        xi1  = outsums[0];
        xi2  = outsums[1];
        xi3  = outsums[2];
        xi4  = outsums[3];

        /* test denominator */
        if (xi3 == 0.0) SETERRQ(PetscObjectComm((PetscObject)ksp),PETSC_ERR_PLIB,"Divide by zero");
        if (sigma == 0.0) SETERRQ(PetscObjectComm((PetscObject)ksp),PETSC_ERR_PLIB,"Divide by zero");

        /* scalar updates */
        omega = xi2 / xi3;
        beta  = -xi4 / sigma;
        rho   = PetscSqrtReal(PetscAbsScalar(xi1 - omega * xi2)); /* residual norm */

        /* vector updates */
        ierr = VecAXPBYPCZ(X,alpha,omega,1.0,P2,S2);
        CHKERRQ(ierr); /* x <- alpha * p2 + omega * s2 + x */

        /* convergence test */
        ierr = PetscObjectSAWsTakeAccess((PetscObject)ksp);
        CHKERRQ(ierr);
        ksp->its++;
        ksp->rnorm = rho;
        ierr = PetscObjectSAWsGrantAccess((PetscObject)ksp);
        CHKERRQ(ierr);
        ierr = KSPLogResidualHistory(ksp,rho);
        CHKERRQ(ierr);
        ierr = KSPMonitor(ksp,i+1,rho);
        CHKERRQ(ierr);
        ierr = (*ksp->converged)(ksp,i+1,rho,&ksp->reason,ksp->cnvP);
        CHKERRQ(ierr);
        if (ksp->reason) break;

        /* vector updates */
        ierr = PetscLogEventBegin(VEC_Ops,0,0,0,0);
        CHKERRQ(ierr);
        for (j=0; j<N; j++) {
            r[j] = s[j] - omega * t[j]; /* r <- s - omega t */
            p[j] = r[j] + beta * (p[j] - omega * v[j]); /* p <- r + beta * (p - omega v) */
        }
        PetscLogFlops(6.0*N);
        ierr = PetscLogEventEnd(VEC_Ops,0,0,0,0);
        CHKERRQ(ierr);

    }

    if (i >= ksp->max_it) ksp->reason = KSP_DIVERGED_ITS;
    PetscFunctionReturn(0);
}
Пример #4
0
static PetscErrorCode  KSPSolve_IBCGS(KSP ksp)
{
  PetscErrorCode ierr;
  PetscInt       i,N;
  PetscReal      rnorm,rnormin = 0.0;
#if defined(PETSC_HAVE_MPI_LONG_DOUBLE) && !defined(PETSC_USE_COMPLEX) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
  /* Because of possible instabilities in the algorithm (as indicated by different residual histories for the same problem 
     on the same number of processes  with different runs) we support computing the inner products using Intel's 80 bit arithematic
     rather than just 64 bit. Thus we copy our double precision values into long doubles (hoping this keeps the 16 extra bits)
     and tell MPI to do its ALlreduces with MPI_LONG_DOUBLE.

     Note for developers that does not effect the code. Intel's long double is implemented by storing the 80 bits of extended double
     precision into a 16 byte space (the rest of the space is ignored)  */
  long double    insums[7],outsums[7];
#else
  PetscScalar    insums[7],outsums[7];
#endif
  PetscScalar    sigman_2, sigman_1, sigman, pin_1, pin, phin_1, phin,tmp1,tmp2;
  PetscScalar    taun_1, taun, rhon, alphan_1, alphan, omegan_1, omegan;
  const PetscScalar *PETSC_RESTRICT r0, *PETSC_RESTRICT f0, *PETSC_RESTRICT qn, *PETSC_RESTRICT b, *PETSC_RESTRICT un;
  PetscScalar    *PETSC_RESTRICT rn, *PETSC_RESTRICT xn, *PETSC_RESTRICT vn, *PETSC_RESTRICT zn;
  /* the rest do not have to keep n_1 values */
  PetscScalar    kappan, thetan, etan, gamman, betan, deltan;
  const PetscScalar *PETSC_RESTRICT tn;
  PetscScalar    *PETSC_RESTRICT sn;
  Vec            R0,Rn,Xn,F0,Vn,Zn,Qn,Tn,Sn,B,Un;
  Mat            A;

  PetscFunctionBegin;
  if (!ksp->vec_rhs->petscnative) SETERRQ(((PetscObject)ksp)->comm,PETSC_ERR_SUP,"Only coded for PETSc vectors");

  ierr = PCGetOperators(ksp->pc,&A,PETSC_NULL,PETSC_NULL);CHKERRQ(ierr);
  ierr = VecGetLocalSize(ksp->vec_sol,&N);CHKERRQ(ierr);
  Xn = ksp->vec_sol;ierr = VecGetArray(Xn_1,(PetscScalar**)&xn_1);CHKERRQ(ierr);ierr = VecRestoreArray(Xn_1,PETSC_NULL);CHKERRQ(ierr);
  B  = ksp->vec_rhs;ierr = VecGetArrayRead(B,(const PetscScalar**)&b);ierr = VecRestoreArrayRead(B,PETSC_NULL);CHKERRQ(ierr);
  R0 = ksp->work[0];ierr = VecGetArrayRead(R0,(const PetscScalar**)&r0);CHKERRQ(ierr);ierr = VecRestoreArrayRead(R0,PETSC_NULL);CHKERRQ(ierr);
  Rn = ksp->work[1];ierr = VecGetArray(Rn_1,(PetscScalar**)&rn_1);CHKERRQ(ierr);ierr = VecRestoreArray(Rn_1,PETSC_NULL);CHKERRQ(ierr);
  Un = ksp->work[2];ierr = VecGetArrayRead(Un_1,(const PetscScalar**)&un_1);CHKERRQ(ierr);ierr = VecRestoreArrayRead(Un_1,PETSC_NULL);CHKERRQ(ierr);
  F0 = ksp->work[3];ierr = VecGetArrayRead(F0,(const PetscScalar**)&f0);CHKERRQ(ierr);ierr = VecRestoreArrayRead(F0,PETSC_NULL);CHKERRQ(ierr);
  Vn = ksp->work[4];ierr = VecGetArray(Vn_1,(PetscScalar**)&vn_1);CHKERRQ(ierr);ierr = VecRestoreArray(Vn_1,PETSC_NULL);CHKERRQ(ierr);
  Zn = ksp->work[5];ierr = VecGetArray(Zn_1,(PetscScalar**)&zn_1);CHKERRQ(ierr);ierr = VecRestoreArray(Zn_1,PETSC_NULL);CHKERRQ(ierr);
  Qn = ksp->work[6];ierr = VecGetArrayRead(Qn_1,(const PetscScalar**)&qn_1);CHKERRQ(ierr);ierr = VecRestoreArrayRead(Qn_1,PETSC_NULL);CHKERRQ(ierr);
  Tn = ksp->work[7];ierr = VecGetArrayRead(Tn,(const PetscScalar**)&tn);CHKERRQ(ierr);ierr = VecRestoreArrayRead(Tn,PETSC_NULL);CHKERRQ(ierr);
  Sn = ksp->work[8];ierr = VecGetArrayRead(Sn,(const PetscScalar**)&sn);CHKERRQ(ierr);ierr = VecRestoreArrayRead(Sn,PETSC_NULL);CHKERRQ(ierr);

  /* r0 = rn_1 = b - A*xn_1; */
  /* ierr = KSP_PCApplyBAorAB(ksp,Xn_1,Rn_1,Tn);CHKERRQ(ierr);
     ierr = VecAYPX(Rn_1,-1.0,B);CHKERRQ(ierr); */
  ierr = KSPInitialResidual(ksp,Xn_1,Tn,Sn,Rn_1,B);CHKERRQ(ierr);

  ierr = VecNorm(Rn_1,NORM_2,&rnorm);CHKERRQ(ierr);
  ierr = KSPMonitor(ksp,0,rnorm);CHKERRQ(ierr);
  ierr = (*ksp->converged)(ksp,0,rnorm,&ksp->reason,ksp->cnvP);CHKERRQ(ierr);   
  if (ksp->reason) PetscFunctionReturn(0);

  ierr = VecCopy(Rn_1,R0);CHKERRQ(ierr);

  /* un_1 = A*rn_1; */
  ierr = KSP_PCApplyBAorAB(ksp,Rn_1,Un_1,Tn);CHKERRQ(ierr);
  
  /* f0   = A'*rn_1; */
  if (ksp->pc_side == PC_RIGHT) { /* B' A' */
    ierr = MatMultTranspose(A,R0,Tn);CHKERRQ(ierr);
    ierr = PCApplyTranspose(ksp->pc,Tn,F0);CHKERRQ(ierr);
  } else if (ksp->pc_side == PC_LEFT) { /* A' B' */
    ierr = PCApplyTranspose(ksp->pc,R0,Tn);CHKERRQ(ierr);
    ierr = MatMultTranspose(A,Tn,F0);CHKERRQ(ierr);
  }

  /*qn_1 = vn_1 = zn_1 = 0.0; */
  ierr = VecSet(Qn_1,0.0);CHKERRQ(ierr);
  ierr = VecSet(Vn_1,0.0);CHKERRQ(ierr);
  ierr = VecSet(Zn_1,0.0);CHKERRQ(ierr);

  sigman_2 = pin_1 = taun_1 = 0.0;

  /* the paper says phin_1 should be initialized to zero, it is actually R0'R0 */
  ierr = VecDot(R0,R0,&phin_1);CHKERRQ(ierr); 

  /* sigman_1 = rn_1'un_1  */
  ierr = VecDot(R0,Un_1,&sigman_1);CHKERRQ(ierr); 

  alphan_1 = omegan_1 = 1.0;

  for (ksp->its = 1; ksp->its<ksp->max_it+1; ksp->its++) {
    rhon   = phin_1 - omegan_1*sigman_2 + omegan_1*alphan_1*pin_1;
    /*    if (rhon == 0.0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_CONV_FAILED,"rhon is zero, iteration %D",n); */
    if (ksp->its == 1) deltan = rhon;
    else deltan = rhon/taun_1;
    betan  = deltan/omegan_1;
    taun   = sigman_1 + betan*taun_1  - deltan*pin_1;
    if (taun == 0.0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_CONV_FAILED,"taun is zero, iteration %D",ksp->its);
    alphan = rhon/taun;
    ierr = PetscLogFlops(15.0);

    /*  
        zn = alphan*rn_1 + (alphan/alphan_1)betan*zn_1 - alphan*deltan*vn_1
        vn = un_1 + betan*vn_1 - deltan*qn_1
        sn = rn_1 - alphan*vn

       The algorithm in the paper is missing the alphan/alphan_1 term in the zn update
    */
    ierr = PetscLogEventBegin(VEC_Ops,0,0,0,0);CHKERRQ(ierr);
    tmp1 = (alphan/alphan_1)*betan;
    tmp2 = alphan*deltan;
    for (i=0; i<N; i++) {
      zn[i] = alphan*rn_1[i] + tmp1*zn_1[i] - tmp2*vn_1[i];
      vn[i] = un_1[i] + betan*vn_1[i] - deltan*qn_1[i];
      sn[i] = rn_1[i] - alphan*vn[i];
    }
    ierr = PetscLogFlops(3.0+11.0*N);
    ierr = PetscLogEventEnd(VEC_Ops,0,0,0,0);CHKERRQ(ierr);

    /*
        qn = A*vn
    */
    ierr = KSP_PCApplyBAorAB(ksp,Vn,Qn,Tn);CHKERRQ(ierr);

    /*
        tn = un_1 - alphan*qn
    */
    ierr = VecWAXPY(Tn,-alphan,Qn,Un_1);CHKERRQ(ierr);
      

    /*
        phin = r0'sn
        pin  = r0'qn
        gamman = f0'sn
        etan   = f0'tn
        thetan = sn'tn
        kappan = tn'tn
    */
    ierr = PetscLogEventBegin(VEC_ReduceArithmetic,0,0,0,0);CHKERRQ(ierr);
    phin = pin = gamman = etan = thetan = kappan = 0.0;
    for (i=0; i<N; i++) {
      phin += r0[i]*sn[i];
      pin  += r0[i]*qn[i];
      gamman += f0[i]*sn[i];
      etan   += f0[i]*tn[i];
      thetan += sn[i]*tn[i];
      kappan += tn[i]*tn[i];
    }
    ierr = PetscLogFlops(12.0*N);
    ierr = PetscLogEventEnd(VEC_ReduceArithmetic,0,0,0,0);CHKERRQ(ierr);
    insums[0] = phin;
    insums[1] = pin;
    insums[2] = gamman;
    insums[3] = etan;
    insums[4] = thetan;
    insums[5] = kappan;
    insums[6] = rnormin;
    ierr = PetscLogEventBarrierBegin(VEC_ReduceBarrier,0,0,0,0,((PetscObject)ksp)->comm);CHKERRQ(ierr);
#if defined(PETSC_HAVE_MPI_LONG_DOUBLE) && !defined(PETSC_USE_COMPLEX) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
    if (ksp->lagnorm && ksp->its > 1) {
      ierr = MPI_Allreduce(insums,outsums,7,MPI_LONG_DOUBLE,MPI_SUM,((PetscObject)ksp)->comm);CHKERRQ(ierr);
    } else {
      ierr = MPI_Allreduce(insums,outsums,6,MPI_LONG_DOUBLE,MPI_SUM,((PetscObject)ksp)->comm);CHKERRQ(ierr);
    }
#else
    if (ksp->lagnorm && ksp->its > 1) {
      ierr = MPI_Allreduce(insums,outsums,7,MPIU_SCALAR,MPIU_SUM,((PetscObject)ksp)->comm);CHKERRQ(ierr);
    } else {
      ierr = MPI_Allreduce(insums,outsums,6,MPIU_SCALAR,MPIU_SUM,((PetscObject)ksp)->comm);CHKERRQ(ierr);
    }
#endif
    ierr = PetscLogEventBarrierEnd(VEC_ReduceBarrier,0,0,0,0,((PetscObject)ksp)->comm);CHKERRQ(ierr);
    phin     = outsums[0];
    pin      = outsums[1];
    gamman   = outsums[2];
    etan     = outsums[3];
    thetan   = outsums[4];
    kappan   = outsums[5];
    if (ksp->lagnorm && ksp->its > 1) rnorm = PetscSqrtReal(PetscRealPart(outsums[6]));

    if (kappan == 0.0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_CONV_FAILED,"kappan is zero, iteration %D",ksp->its);
    if (thetan == 0.0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_CONV_FAILED,"thetan is zero, iteration %D",ksp->its);
    omegan = thetan/kappan;
    sigman = gamman - omegan*etan;

    /*
        rn = sn - omegan*tn
        xn = xn_1 + zn + omegan*sn
    */
    ierr = PetscLogEventBegin(VEC_Ops,0,0,0,0);CHKERRQ(ierr);
    rnormin = 0.0;
    for (i=0; i<N; i++) {
      rn[i]    = sn[i] - omegan*tn[i];
      rnormin += PetscRealPart(PetscConj(rn[i])*rn[i]);
      xn[i]   += zn[i] + omegan*sn[i];
    }
    ierr = PetscObjectStateIncrease((PetscObject)Xn);CHKERRQ(ierr);
    ierr = PetscLogFlops(7.0*N);
    ierr = PetscLogEventEnd(VEC_Ops,0,0,0,0);CHKERRQ(ierr);

    if (!ksp->lagnorm && ksp->chknorm < ksp->its) {
      ierr = PetscLogEventBarrierBegin(VEC_ReduceBarrier,0,0,0,0,((PetscObject)ksp)->comm);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&rnormin,&rnorm,1,MPIU_REAL,MPIU_SUM,((PetscObject)ksp)->comm);CHKERRQ(ierr);
      ierr = PetscLogEventBarrierEnd(VEC_ReduceBarrier,0,0,0,0,((PetscObject)ksp)->comm);CHKERRQ(ierr);
      rnorm = PetscSqrtReal(rnorm);
    } 

    /* Test for convergence */
    ierr = KSPMonitor(ksp,ksp->its,rnorm);CHKERRQ(ierr);
    ierr = (*ksp->converged)(ksp,ksp->its,rnorm,&ksp->reason,ksp->cnvP);CHKERRQ(ierr);   
    if (ksp->reason) break;
 
    /* un = A*rn */
    ierr = KSP_PCApplyBAorAB(ksp,Rn,Un,Tn);CHKERRQ(ierr);   

    /* Update n-1 locations with n locations */
    sigman_2 = sigman_1;
    sigman_1 = sigman;
    pin_1    = pin;
    phin_1   = phin;
    alphan_1 = alphan;
    taun_1   = taun;
    omegan_1 = omegan;
  }
  if (ksp->its >= ksp->max_it) {
    ksp->reason = KSP_DIVERGED_ITS;
  }
  ierr = KSPUnwindPreconditioner(ksp,Xn,Tn);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}