Beispiel #1
0
  /**
   * Dispatch a norm computation on the dynamic type of the field.
   *
   * The exact dynamic type of @p a is compared against the two known
   * concrete field classes; a cudaColorSpinorField is forwarded to normCuda
   * and a cpuColorSpinorField to normCpu.  Any other type is reported through
   * errorQuda, and 0.0 is returned if that call comes back.
   *
   * @param a field whose norm is requested
   * @return value produced by normCuda / normCpu, or 0.0 for an unknown type
   */
  double norm2(const ColorSpinorField &a) {

    // GPU-resident field
    if (typeid(a) == typeid(cudaColorSpinorField))
      return normCuda(dynamic_cast<const cudaColorSpinorField&>(a));

    // host-resident field
    if (typeid(a) == typeid(cpuColorSpinorField))
      return normCpu(dynamic_cast<const cpuColorSpinorField&>(a));

    // neither of the known concrete types
    errorQuda("Unknown input ColorSpinorField %s", typeid(a).name());
    return 0.0;
  }
Beispiel #2
0
/**
 * Multi-shift conjugate-gradient solve.
 *
 * A single CG iteration is run on the first system (index 0, driven by
 * matSloppy and p[0]); the remaining invParam.num_offset-1 systems are
 * advanced alongside it using the per-shift coefficients zeta / beta / alpha
 * computed from the base system's coefficients and the offsets.
 *
 * @param x array of invParam.num_offset solution fields.  When the sloppy
 *          precision equals the precision of x[0] the fields are zeroed and
 *          accumulated into directly; otherwise sloppy accumulators are used
 *          and copied into x at the end.
 * @param b source field.
 *
 * Side effects visible in this function: invParam.secs, invParam.gflops and
 * invParam.iter are overwritten, and the final iterated r2 is stored through
 * invParam.tol_offset (element 0).
 */
void MultiShiftCG::operator()(cudaColorSpinorField **x, cudaColorSpinorField &b)
{

  int num_offset = invParam.num_offset;
  double *offset = invParam.offset;
  // NOTE(review): this aliases the tolerance array, so the store near the end
  // of the function overwrites tol_offset[0] with the final r2 - confirm that
  // callers expect this.
  double *residue_sq = invParam.tol_offset;

  // nothing to solve
  if (num_offset == 0) return;

  // per-shift recurrence coefficients (previous / current / next iteration)
  int *finished = new int [num_offset];   // set when a shift's recurrence degenerates; not read in this function
  double *zeta_i = new double[num_offset];
  double *zeta_im1 = new double[num_offset];
  double *zeta_ip1 = new double[num_offset];
  double *beta_i = new double[num_offset];
  double *beta_im1 = new double[num_offset];
  double *alpha = new double[num_offset];
  int i, j;

  int j_low = 0;                    // index of the base system
  int num_offset_now = num_offset;  // number of shifts still being updated
  for (i=0; i<num_offset; i++) {
    finished[i] = 0;
    zeta_im1[i] = zeta_i[i] = 1.0;
    beta_im1[i] = -1.0;
    alpha[i] = 0.0;
  }

  // residual starts as a copy of the source
  cudaColorSpinorField *r = new cudaColorSpinorField(b);

  cudaColorSpinorField **x_sloppy = new cudaColorSpinorField*[num_offset], *r_sloppy;

  ColorSpinorParam param;
  param.create = QUDA_ZERO_FIELD_CREATE;
  param.precision = invParam.cuda_prec_sloppy;

  if (invParam.cuda_prec_sloppy == x[0]->Precision()) {
    // same precision: accumulate straight into the caller's fields
    for (i=0; i<num_offset; i++){
      x_sloppy[i] = x[i];
      zeroCuda(*x_sloppy[i]);
    }
    r_sloppy = r;
  } else {
    // mixed precision: separate zero-initialised sloppy accumulators and a
    // sloppy copy of the residual
    for (i=0; i<num_offset; i++) {
      x_sloppy[i] = new cudaColorSpinorField(*x[i], param);
    }
    param.create = QUDA_COPY_FIELD_CREATE;
    r_sloppy = new cudaColorSpinorField(*r, param);
  }

  // one search direction per shift, each starting as a copy of the residual
  cudaColorSpinorField **p = new cudaColorSpinorField*[num_offset];
  for(i=0;i < num_offset;i++){
    p[i]= new cudaColorSpinorField(*r_sloppy);
  }

  param.create = QUDA_ZERO_FIELD_CREATE;
  param.precision = invParam.cuda_prec_sloppy;
  cudaColorSpinorField* Ap = new cudaColorSpinorField(*r_sloppy, param);

  double b2 = 0.0;
  b2 = normCuda(b);

  double r2 = b2;
  double r2_old;
  double stop = r2*invParam.tol*invParam.tol; // stopping condition of solver

  double pAp;

  int k = 0;

  stopwatchStart();
  while (r2 > stop &&  k < invParam.maxiter) {
    matSloppy(*Ap, *p[0]);
    // for every operator except asqtad the base shift is added explicitly here
    if (invParam.dslash_type != QUDA_ASQTAD_DSLASH){
      axpyCuda(offset[0], *p[0], *Ap);
    }
    pAp = reDotProductCuda(*p[0], *Ap);
    beta_i[0] = r2 / pAp;

    zeta_ip1[0] = 1.0;
    for (j=1; j<num_offset_now; j++) {
      zeta_ip1[j] = zeta_i[j] * zeta_im1[j] * beta_im1[j_low];
      double c1 = beta_i[j_low] * alpha[j_low] * (zeta_im1[j]-zeta_i[j]);
      double c2 = zeta_im1[j] * beta_im1[j_low] * (1.0+(offset[j]-offset[0])*beta_i[j_low]);
      // Guarded divisions: the unguarded recurrence blows up once the
      // denominators reach zero, so a degenerate shift is frozen at zero.
      if( (c1+c2) != 0.0 )
        zeta_ip1[j] /= (c1 + c2);
      else {
        zeta_ip1[j] = 0.0;
        finished[j] = 1;
      }
      if( zeta_i[j] != 0.0) {
        beta_i[j] = beta_i[j_low] * zeta_ip1[j] / zeta_i[j];
      } else  {
        zeta_ip1[j] = 0.0;
        beta_i[j] = 0.0;
        finished[j] = 1;
        if (invParam.verbosity >= QUDA_VERBOSE)
          printfQuda("SETTING A ZERO, j=%d, num_offset_now=%d\n",j,num_offset_now);
        if(j==num_offset_now-1) num_offset_now--;
        // don't work any more on finished solutions
        // this only works if largest offsets are last, otherwise
        // just wastes time multiplying by zero
      }
    }

    r2_old = r2;
    r2 = axpyNormCuda(-beta_i[j_low], *Ap, *r_sloppy);

    alpha[0] = r2 / r2_old;

    for (j=1; j<num_offset_now; j++) {
      // guarded for the same reason as the zeta update above
      if( zeta_i[j] * beta_i[j_low] != 0.0)
        alpha[j] = alpha[j_low] * zeta_ip1[j] * beta_i[j] /
          (zeta_i[j] * beta_i[j_low]);
      else {
        alpha[j] = 0.0;
        finished[j] = 1;
      }
    }

    // update solution and search direction of the base system, then of every
    // shift that is still active
    axpyZpbxCuda(beta_i[0], *p[0], *x_sloppy[0], *r_sloppy, alpha[0]);
    for (j=1; j<num_offset_now; j++) {
      axpyBzpcxCuda(beta_i[j], *p[j], *x_sloppy[j], zeta_ip1[j], *r_sloppy, alpha[j]);
    }

    // rotate the recurrence history
    for (j=0; j<num_offset_now; j++) {
      beta_im1[j] = beta_i[j];
      zeta_im1[j] = zeta_i[j];
      zeta_i[j] = zeta_ip1[j];
    }

    k++;
    if (invParam.verbosity >= QUDA_VERBOSE){
      printfQuda("Multimass CG: %d iterations, r2 = %e\n", k, r2);
    }
  }

  // mixed precision: move the sloppy accumulators into the caller's fields
  if (x[0]->Precision() != x_sloppy[0]->Precision()) {
    for(i=0;i < num_offset; i++){
      copyCuda(*x[i], *x_sloppy[i]);
    }
  }

  *residue_sq = r2;

  invParam.secs = stopwatchReadSeconds();

  if (k==invParam.maxiter) {
    warningQuda("Exceeded maximum iterations %d\n", invParam.maxiter);
  }

  double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
  reduceDouble(gflops);

  invParam.gflops = gflops;
  invParam.iter = k;

  // Calculate the true residual of the system with the smallest shift
  mat(*r, *x[0]);
  axpyCuda(offset[0],*x[0], *r); // Offset it.

  double true_res = xmyNormCuda(b, *r);
  if (invParam.verbosity >= QUDA_SUMMARIZE){
    printfQuda("MultiShift CG: Converged after %d iterations, r2 = %e, relative true_r2 = %e\n",
               k,r2, (true_res / b2));
  }
  if (invParam.verbosity >= QUDA_VERBOSE){
    printfQuda("MultiShift CG: Converged after %d iterations\n", k);
    printfQuda(" shift=0 resid_rel=%e\n", sqrt(true_res/b2));
    for(int i=1; i < num_offset; i++) {
      mat(*r, *x[i]);
      axpyCuda(offset[i],*x[i], *r); // Offset it.
      true_res = xmyNormCuda(b, *r);
      printfQuda(" shift=%d resid_rel=%e\n",i, sqrt(true_res/b2));
    }
  }

  delete r;
  for(i=0;i < num_offset; i++){
    delete p[i];
  }
  delete [] p;   // allocated with new[]
  delete Ap;

  // the sloppy fields are only owned here in the mixed-precision case
  if (invParam.cuda_prec_sloppy != x[0]->Precision()) {
    for(i=0;i < num_offset;i++){
      delete x_sloppy[i];
    }
    delete r_sloppy;
  }
  delete [] x_sloppy;   // allocated with new[]

  delete []finished;
  delete []zeta_i;
  delete []zeta_im1;
  delete []zeta_ip1;
  delete []beta_i;
  delete []beta_im1;
  delete []alpha;

}
Beispiel #3
0
  /**
   * Multi-shift conjugate-gradient solve with optional reliable updates.
   *
   * CG is iterated on the base system (index 0) using matSloppy; the other
   * param.num_offset-1 shifted systems are advanced from the same residual
   * through the per-shift coefficients alpha / beta / zeta.  If any shift's
   * tolerance is below param.delta, reliable updates are enabled: partial
   * solutions are periodically accumulated into high-precision fields y and
   * the residual is recomputed with the full-precision operator mat.
   *
   * @param x array of param.num_offset solution fields (overwritten)
   * @param b source field
   *
   * Results written here: param.true_res_offset[], param.true_res_hq_offset[],
   * param.secs, param.gflops; param.iter is incremented by the iteration
   * count.  A zero source returns immediately with every x[i] set to b.
   *
   * NOTE(review): the fixed-size work arrays below hold QUDA_MAX_MULTI_SHIFT
   * entries; num_offset is not checked against that bound in this function.
   */
  void MultiShiftCG::operator()(cudaColorSpinorField **x, cudaColorSpinorField &b)
  {
    profile.Start(QUDA_PROFILE_INIT);

    int num_offset = param.num_offset;
    double *offset = param.offset;

    // nothing to solve: close the profile interval opened above before leaving
    if (num_offset == 0) {
      profile.Stop(QUDA_PROFILE_INIT);
      return;
    }

    const double b2 = normCuda(b);
    // Check to see that we're not trying to invert on a zero-field source
    if(b2 == 0){
      profile.Stop(QUDA_PROFILE_INIT);
      printfQuda("Warning: inverting on zero-field source\n");
      for(int i=0; i<num_offset; ++i){
        *(x[i]) = b;
        param.true_res_offset[i] = 0.0;
        param.true_res_hq_offset[i] = 0.0;
      }
      return;
    }


    // per-shift recurrence coefficients
    double *zeta = new double[num_offset];
    double *zeta_old = new double[num_offset];
    double *alpha = new double[num_offset];
    double *beta = new double[num_offset];

    int j_low = 0;                    // index of the base system
    int num_offset_now = num_offset;  // shifts [0, num_offset_now) are still iterated
    for (int i=0; i<num_offset; i++) {
      zeta[i] = zeta_old[i] = 1.0;
      beta[i] = 0.0;
      alpha[i] = 1.0;
    }

    // flag whether we will be using reliable updates or not
    bool reliable = false;
    for (int j=0; j<num_offset; j++)
      if (param.tol_offset[j] < param.delta) reliable = true;


    cudaColorSpinorField *r = new cudaColorSpinorField(b);
    // y[i] accumulates the solution across reliable updates (only if enabled)
    cudaColorSpinorField **y = reliable ? new cudaColorSpinorField*[num_offset] : NULL;

    ColorSpinorParam csParam(b);
    csParam.create = QUDA_ZERO_FIELD_CREATE;

    if (reliable)
      for (int i=0; i<num_offset; i++) y[i] = new cudaColorSpinorField(*r, csParam);

    csParam.setPrecision(param.precision_sloppy);

    // sloppy residual: alias r when no precision change is needed
    cudaColorSpinorField *r_sloppy;
    if (param.precision_sloppy == x[0]->Precision()) {
      r_sloppy = r;
    } else {
      csParam.create = QUDA_COPY_FIELD_CREATE;
      r_sloppy = new cudaColorSpinorField(*r, csParam);
    }

    // sloppy solution accumulators: alias x unless a separate sloppy
    // accumulator is both needed and requested
    cudaColorSpinorField **x_sloppy = new cudaColorSpinorField*[num_offset];
    if (param.precision_sloppy == x[0]->Precision() ||
        !param.use_sloppy_partial_accumulator) {
      for (int i=0; i<num_offset; i++) x_sloppy[i] = x[i];
    } else {
      csParam.create = QUDA_ZERO_FIELD_CREATE;
      for (int i=0; i<num_offset; i++)
        x_sloppy[i] = new cudaColorSpinorField(*x[i], csParam);
    }

    // one search direction per shift, each starting as a copy of the residual
    cudaColorSpinorField **p = new cudaColorSpinorField*[num_offset];
    for (int i=0; i<num_offset; i++) p[i]= new cudaColorSpinorField(*r_sloppy);

    csParam.create = QUDA_ZERO_FIELD_CREATE;
    cudaColorSpinorField* Ap = new cudaColorSpinorField(*r_sloppy, csParam);

    cudaColorSpinorField tmp1(*Ap, csParam);

    // tmp2 only needed for multi-gpu Wilson-like kernels
    cudaColorSpinorField *tmp2_p = !mat.isStaggered() ?
      new cudaColorSpinorField(*Ap, csParam) : &tmp1;
    cudaColorSpinorField &tmp2 = *tmp2_p;

    // additional high-precision temporary if Wilson and mixed-precision
    csParam.setPrecision(param.precision);
    cudaColorSpinorField *tmp3_p =
      (param.precision != param.precision_sloppy && !mat.isStaggered()) ?
      new cudaColorSpinorField(*r, csParam) : &tmp1;
    cudaColorSpinorField &tmp3 = *tmp3_p;

    profile.Stop(QUDA_PROFILE_INIT);
    profile.Start(QUDA_PROFILE_PREAMBLE);

    // stopping condition of each shift
    double stop[QUDA_MAX_MULTI_SHIFT];
    double r2[QUDA_MAX_MULTI_SHIFT];
    for (int i=0; i<num_offset; i++) {
      r2[i] = b2;
      stop[i] = Solver::stopping(param.tol_offset[i], b2, param.residual_type);
    }

    double r2_old;
    double pAp;

    // bookkeeping for the reliable-update triggers, one entry per shift
    double rNorm[QUDA_MAX_MULTI_SHIFT];
    double r0Norm[QUDA_MAX_MULTI_SHIFT];
    double maxrx[QUDA_MAX_MULTI_SHIFT];
    double maxrr[QUDA_MAX_MULTI_SHIFT];
    for (int i=0; i<num_offset; i++) {
      rNorm[i] = sqrt(r2[i]);
      r0Norm[i] = rNorm[i];
      maxrx[i] = rNorm[i];
      maxrr[i] = rNorm[i];
    }
    double delta = param.delta;

    // these parameters determine how many consecutive reliable-update
    // residual increases we tolerate before terminating the solver,
    // i.e., how long do we want to keep trying to converge
    const int maxResIncrease =  param.max_res_increase;
    const int maxResIncreaseTotal = param.max_res_increase_total;

    int resIncrease = 0;
    int resIncreaseTotal[QUDA_MAX_MULTI_SHIFT];
    for (int i=0; i<num_offset; i++) {
      resIncreaseTotal[i]=0;
    }

    int k = 0;
    int rUpdate = 0;
    quda::blas_flops = 0;

    profile.Stop(QUDA_PROFILE_PREAMBLE);
    profile.Start(QUDA_PROFILE_COMPUTE);

    if (getVerbosity() >= QUDA_VERBOSE)
      printfQuda("MultiShift CG: %d iterations, <r,r> = %e, |r|/|b| = %e\n", k, r2[0], sqrt(r2[0]/b2));

    while (r2[0] > stop[0] &&  k < param.maxiter) {
      matSloppy(*Ap, *p[0], tmp1, tmp2);
      // FIXME - this should be curried into the Dirac operator
      if (r->Nspin()==4) axpyCuda(offset[0], *p[0], *Ap);

      pAp = reDotProductCuda(*p[0], *Ap);

      // compute zeta and alpha
      updateAlphaZeta(alpha, zeta, zeta_old, r2, beta, pAp, offset, num_offset_now, j_low);

      r2_old = r2[0];
      // real part is used as the new <r,r>; imaginary part feeds beta[0] below
      Complex cg_norm = axpyCGNormCuda(-alpha[j_low], *Ap, *r_sloppy);
      r2[0] = real(cg_norm);
      double zn = imag(cg_norm);

      // reliable update conditions
      rNorm[0] = sqrt(r2[0]);
      for (int j=1; j<num_offset_now; j++) rNorm[j] = rNorm[0] * zeta[j];

      int updateX=0, updateR=0;
      int reliable_shift = -1; // this is the shift that sets the reliable_shift
      for (int j=num_offset_now-1; j>=0; j--) {
        if (rNorm[j] > maxrx[j]) maxrx[j] = rNorm[j];
        if (rNorm[j] > maxrr[j]) maxrr[j] = rNorm[j];
        updateX = (rNorm[j] < delta*r0Norm[j] && r0Norm[j] <= maxrx[j]) ? 1 : updateX;
        updateR = ((rNorm[j] < delta*maxrr[j] && r0Norm[j] <= maxrr[j]) || updateX) ? 1 : updateR;
        if ((updateX || updateR) && reliable_shift == -1) reliable_shift = j;
      }

      if ( !(updateR || updateX) || !reliable) {
        // ordinary CG step
        beta[0] = zn / r2_old;
        // update p[0] and x[0]
        axpyZpbxCuda(alpha[0], *p[0], *x_sloppy[0], *r_sloppy, beta[0]);

        for (int j=1; j<num_offset_now; j++) {
          beta[j] = beta[j_low] * zeta[j] * alpha[j] / (zeta_old[j] * alpha[j_low]);
          // update p[j] and x[j]
          axpyBzpcxCuda(alpha[j], *p[j], *x_sloppy[j], zeta[j], *r_sloppy, beta[j]);
        }
      } else {
        // reliable update: fold the partial solutions into y and recompute
        // the residual with the full-precision operator
        for (int j=0; j<num_offset_now; j++) {
          axpyCuda(alpha[j], *p[j], *x_sloppy[j]);
          copyCuda(*x[j], *x_sloppy[j]);
          xpyCuda(*x[j], *y[j]);
        }

        mat(*r, *y[0], *x[0], tmp3); // here we can use x as tmp
        if (r->Nspin()==4) axpyCuda(offset[0], *y[0], *r);

        r2[0] = xmyNormCuda(b, *r);
        for (int j=1; j<num_offset_now; j++) r2[j] = zeta[j] * zeta[j] * r2[0];
        for (int j=0; j<num_offset_now; j++) zeroCuda(*x_sloppy[j]);

        copyCuda(*r_sloppy, *r);

        // break-out check if we have reached the limit of the precision

        if (sqrt(r2[reliable_shift]) > r0Norm[reliable_shift]) { // reuse r0Norm for this
          resIncrease++;
          resIncreaseTotal[reliable_shift]++;
          warningQuda("MultiShiftCG: Shift %d, updated residual %e is greater than previous residual %e (total #inc %i)",
                      reliable_shift, sqrt(r2[reliable_shift]), r0Norm[reliable_shift], resIncreaseTotal[reliable_shift]);

          // give up once the residual has grown too often
          if (resIncrease > maxResIncrease || resIncreaseTotal[reliable_shift] > maxResIncreaseTotal) break;
        } else {
          resIncrease = 0;
        }

        // explicitly restore the orthogonality of the gradient vector
        for (int j=0; j<num_offset_now; j++) {
          double rp = reDotProductCuda(*r_sloppy, *p[j]) / (r2[0]);
          axpyCuda(-rp, *r_sloppy, *p[j]);
        }

        // update beta and p
        beta[0] = r2[0] / r2_old;
        xpayCuda(*r_sloppy, beta[0], *p[0]);
        for (int j=1; j<num_offset_now; j++) {
          beta[j] = beta[j_low] * zeta[j] * alpha[j] / (zeta_old[j] * alpha[j_low]);
          axpbyCuda(zeta[j], *r_sloppy, beta[j], *p[j]);
        }

        // update reliable update parameters for the system that triggered the update
        int m = reliable_shift;
        rNorm[m] = sqrt(r2[0]) * zeta[m];
        maxrr[m] = rNorm[m];
        maxrx[m] = rNorm[m];
        r0Norm[m] = rNorm[m];
        rUpdate++;
      }

      // Refresh the iterated residual of every active shifted system.  A shift
      // whose zeta has collapsed to exactly zero keeps its previous r2.
      for (int j=1; j<num_offset_now; j++) {
        if (zeta[j] != 0.0) r2[j] = zeta[j] * zeta[j] * r2[0];
      }

      // Retire converged shifts.  The active set is the index range
      // [0, num_offset_now), so only the highest active shift can be removed;
      // shrinking the range because a lower shift converged would drop the
      // last shift even though it may not have converged yet.
      while (num_offset_now > 1) {
        const int last = num_offset_now - 1;
        const bool done = (zeta[last] == 0.0) || (r2[last] < stop[last]);
        if (!done) break;
        num_offset_now--;
        if (getVerbosity() >= QUDA_VERBOSE)
          printfQuda("MultiShift CG: Shift %d converged after %d iterations\n", last, k + 1);
      }

      k++;

      if (getVerbosity() >= QUDA_VERBOSE)
        printfQuda("MultiShift CG: %d iterations, <r,r> = %e, |r|/|b| = %e\n", k, r2[0], sqrt(r2[0]/b2));
    }


    // assemble the final solutions (plus the reliable-update accumulators)
    for (int i=0; i<num_offset; i++) {
      copyCuda(*x[i], *x_sloppy[i]);
      if (reliable) xpyCuda(*y[i], *x[i]);
    }

    profile.Stop(QUDA_PROFILE_COMPUTE);
    profile.Start(QUDA_PROFILE_EPILOGUE);

    if (getVerbosity() >= QUDA_VERBOSE)
      printfQuda("MultiShift CG: Reliable updates = %d\n", rUpdate);

    if (k==param.maxiter) warningQuda("Exceeded maximum iterations %d\n", param.maxiter);

    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
    double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
    reduceDouble(gflops);
    param.gflops = gflops;
    param.iter += k;

    // true residual of every shifted system
    for(int i=0; i < num_offset; i++) {
      mat(*r, *x[i]);
      if (r->Nspin()==4) {
        axpyCuda(offset[i], *x[i], *r); // Offset it.
      } else if (i!=0) {
        // presumably mat already contains offset[0] in this case, so only the
        // difference is added - TODO confirm against the operator definition
        axpyCuda(offset[i]-offset[0], *x[i], *r); // Offset it.
      }
      double true_res = xmyNormCuda(b, *r);
      param.true_res_offset[i] = sqrt(true_res/b2);
#if (__COMPUTE_CAPABILITY__ >= 200)
      param.true_res_hq_offset[i] = sqrt(HeavyQuarkResidualNormCuda(*x[i], *r).z);
#else
      param.true_res_hq_offset[i] = 0.0;
#endif
    }

    if (getVerbosity() >= QUDA_SUMMARIZE){
      printfQuda("MultiShift CG: Converged after %d iterations\n", k);
      for(int i=0; i < num_offset; i++) {
        printfQuda(" shift=%d, relative residual: iterated = %e, true = %e\n",
                   i, sqrt(r2[i]/b2), param.true_res_offset[i]);
      }
    }

    // reset the flops counters
    quda::blas_flops = 0;
    mat.flops();
    matSloppy.flops();

    profile.Stop(QUDA_PROFILE_EPILOGUE);
    profile.Start(QUDA_PROFILE_FREE);

    // the temporaries alias tmp1 when they were not separately allocated
    if (&tmp3 != &tmp1) delete tmp3_p;
    if (&tmp2 != &tmp1) delete tmp2_p;

    // sloppy fields are only owned here when their precision differs
    if (r_sloppy->Precision() != r->Precision()) delete r_sloppy;
    for (int i=0; i<num_offset; i++)
       if (x_sloppy[i]->Precision() != x[i]->Precision()) delete x_sloppy[i];
    delete []x_sloppy;

    delete r;
    for (int i=0; i<num_offset; i++) delete p[i];
    delete []p;

    if (reliable) {
      for (int i=0; i<num_offset; i++) delete y[i];
      delete []y;
    }

    delete Ap;

    delete []zeta_old;
    delete []zeta;
    delete []alpha;
    delete []beta;

    profile.Stop(QUDA_PROFILE_FREE);

    return;
  }
Beispiel #4
0
  /**
   * Minimal-residual (MR) iteration with a fixed number of steps.
   *
   * The initial guess is zero.  The residual is normalised by |b| before
   * iterating and the solution is rescaled by |b| afterwards.  The loop runs
   * param.maxiter steps; r2 is not updated inside the loop, so it only serves
   * to skip all work for a zero source.
   *
   * @param x solution field; zeroed on entry and overwritten
   * @param b source field; used directly as the working residual (and hence
   *          modified) unless param.preserve_source is
   *          QUDA_PRESERVE_SOURCE_YES
   *
   * Work fields (rp, Arp, tmpp) are allocated on the first call and reused.
   * globalReduce is switched off for the duration of the call and set back
   * to true on exit.  Timing, flop and residual reporting is only done when
   * param.inv_type_precondition is not QUDA_GCR_INVERTER.
   */
  void MR::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b)
  {

    globalReduce = false; // use local reductions for DD solver

    if (!init) {
      ColorSpinorParam csParam(x);
      csParam.create = QUDA_ZERO_FIELD_CREATE;
      if (param.preserve_source == QUDA_PRESERVE_SOURCE_YES) {
        rp = new cudaColorSpinorField(x, csParam);
        allocate_r = true;
      }
      Arp = new cudaColorSpinorField(x);
      tmpp = new cudaColorSpinorField(x, csParam); //temporary for mat-vec

      init = true;
    }
    // working residual: a private field when the source must be preserved,
    // otherwise the source itself
    cudaColorSpinorField &r =
      (param.preserve_source == QUDA_PRESERVE_SOURCE_YES) ? *rp : b;
    cudaColorSpinorField &Ar = *Arp;
    cudaColorSpinorField &tmp = *tmpp;

    // set initial guess to zero and thus the residual is just the source
    zeroCuda(x);  // can get rid of this for a special first update kernel
    double b2 = normCuda(b);
    if (&r != &b) copyCuda(r, b);

    // domain-wise normalization of the initial residual to prevent underflow
    double r2=0.0; // if zero source then we will exit immediately doing no work
    if (b2 > 0.0) {
      axCuda(1/sqrt(b2), r); // can merge this with the prior copy
      r2 = 1.0; // by definition this is now true
    }

    if (param.inv_type_precondition != QUDA_GCR_INVERTER) {
      quda::blas_flops = 0;
      profile.TPSTART(QUDA_PROFILE_COMPUTE);
    }

    double omega = 1.0;

    int k = 0;
    if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
      // NOTE(review): Ar has not been computed from r yet at this point, so
      // the values printed here reflect whatever Ar currently holds.
      double x2 = norm2(x);
      double3 Ar3 = cDotProductNormBCuda(Ar, r);
      printfQuda("MR: %d iterations, r2 = %e, <r|A|r> = (%e, %e), x2 = %e\n",
                 k, Ar3.z, Ar3.x, Ar3.y, x2);
    }

    while (k < param.maxiter && r2 > 0.0) {

      mat(Ar, r, tmp);

      double3 Ar3 = cDotProductNormACuda(Ar, r);
      Complex alpha = Complex(Ar3.x, Ar3.y) / Ar3.z;

      // x += omega*alpha*r, r -= omega*alpha*Ar
      caxpyXmazCuda(omega*alpha, r, x, Ar);

      if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
        double x2 = norm2(x);
        // diagnostic only; deliberately not fed back into the loop condition
        double r2_now = norm2(r);
        printfQuda("MR: %d iterations, r2 = %e, <r|A|r> = (%e,%e) x2 = %e\n",
                   k+1, r2_now, Ar3.x, Ar3.y, x2);
      } else if (getVerbosity() >= QUDA_VERBOSE) {
        printfQuda("MR: %d iterations, <r|A|r> = (%e, %e)\n", k, Ar3.x, Ar3.y);
      }

      k++;
    }

    if (getVerbosity() >= QUDA_VERBOSE) {
      mat(Ar, r, tmp);
      Complex Ar2 = cDotProductCuda(Ar, r);
      printfQuda("MR: %d iterations, <r|A|r> = (%e, %e)\n", k, real(Ar2), imag(Ar2));
    }

    // Obtain global solution by rescaling
    if (b2 > 0.0) axCuda(sqrt(b2), x);

    if (param.inv_type_precondition != QUDA_GCR_INVERTER) {
        profile.TPSTOP(QUDA_PROFILE_COMPUTE);
        profile.TPSTART(QUDA_PROFILE_EPILOGUE);
        param.secs += profile.Last(QUDA_PROFILE_COMPUTE);

        double gflops = (quda::blas_flops + mat.flops())*1e-9;
        reduceDouble(gflops);

        param.gflops += gflops;
        param.iter += k;

        // this is the relative residual since it has been scaled by b2
        r2 = norm2(r);

        if (param.preserve_source == QUDA_PRESERVE_SOURCE_YES) {
          // Calculate the true residual
          mat(r, x);
          double true_res = xmyNormCuda(b, r);
          // a zero source would give 0/0 here; report a zero residual instead
          param.true_res = (b2 > 0.0) ? sqrt(true_res / b2) : 0.0;
          if (getVerbosity() >= QUDA_SUMMARIZE) {
            printfQuda("MR: Converged after %d iterations, relative residua: iterated = %e, true = %e\n",
                       k, sqrt(r2), param.true_res);
          }
        } else {
          if (getVerbosity() >= QUDA_SUMMARIZE) {
            printfQuda("MR: Converged after %d iterations, relative residua: iterated = %e\n", k, sqrt(r2));
          }
        }

        // reset the flops counters
        quda::blas_flops = 0;
        mat.flops();
        profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
    }

    globalReduce = true; // re-enable global reductions for outer solver

    return;
  }