C++ (Cpp) xpayの例 - HotExamples

コード例 #1

0

ファイルを表示

ファイル: wilson_dslash_reference.cpp プロジェクト: kpetrov/quda

// Apply the even-odd preconditioned Dirac operator
void tm_matpc(void *outEven, void **gauge, void *inEven, double kappa, double mu, QudaTwistFlavorType flavor,
	      QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param) {

  void *tmp = malloc(Vh*spinorSiteSize*precision);
    
  if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
    wil_dslash(tmp, gauge, inEven, 1, daggerBit, precision, gauge_param);
    twist_gamma5(tmp, tmp, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
    wil_dslash(outEven, gauge, tmp, 0, daggerBit, precision, gauge_param);
    twist_gamma5(tmp, inEven, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision);
  } else if (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
    wil_dslash(tmp, gauge, inEven, 0, daggerBit, precision, gauge_param);
    twist_gamma5(tmp, tmp, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
    wil_dslash(outEven, gauge, tmp, 1, daggerBit, precision, gauge_param);
    twist_gamma5(tmp, inEven, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision);
  } else if (!daggerBit) {
    if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
      wil_dslash(tmp, gauge, inEven, 1, daggerBit, precision, gauge_param);
      twist_gamma5(tmp, tmp, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
      wil_dslash(outEven, gauge, tmp, 0, daggerBit, precision, gauge_param);
      twist_gamma5(outEven, outEven, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
    } else if (matpc_type == QUDA_MATPC_ODD_ODD) {
      wil_dslash(tmp, gauge, inEven, 0, daggerBit, precision, gauge_param);
      twist_gamma5(tmp, tmp, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
      wil_dslash(outEven, gauge, tmp, 1, daggerBit, precision, gauge_param);
      twist_gamma5(outEven, outEven, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
    }
  } else {
    if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
      twist_gamma5(inEven, inEven, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
      wil_dslash(tmp, gauge, inEven, 1, daggerBit, precision, gauge_param);
      twist_gamma5(tmp, tmp, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
      wil_dslash(outEven, gauge, tmp, 0, daggerBit, precision, gauge_param);
      twist_gamma5(inEven, inEven, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision);
    } else if (matpc_type == QUDA_MATPC_ODD_ODD) {
      twist_gamma5(inEven, inEven, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
      wil_dslash(tmp, gauge, inEven, 0, daggerBit, precision, gauge_param);
      twist_gamma5(tmp, tmp, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
      wil_dslash(outEven, gauge, tmp, 1, daggerBit, precision, gauge_param);
      twist_gamma5(inEven, inEven, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision); // undo
    }
  }
  // lastly apply the kappa term
  double kappa2 = -kappa*kappa;
  if (matpc_type == QUDA_MATPC_EVEN_EVEN || matpc_type == QUDA_MATPC_ODD_ODD) {
    if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)inEven, kappa2, (double*)outEven, Vh*spinorSiteSize);
    else xpay((float*)inEven, (float)kappa2, (float*)outEven, Vh*spinorSiteSize);
  } else {
    if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp, kappa2, (double*)outEven, Vh*spinorSiteSize);
    else xpay((float*)tmp, (float)kappa2, (float*)outEven, Vh*spinorSiteSize);
  }

  free(tmp);
}

コード例 #2

0

ファイルを表示

ファイル: wilson_dslash_reference.cpp プロジェクト: kpetrov/quda

void wil_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision,
	     QudaGaugeParam &gauge_param) {

  void *inEven = in;
  void *inOdd  = (char*)in + Vh*spinorSiteSize*precision;
  void *outEven = out;
  void *outOdd = (char*)out + Vh*spinorSiteSize*precision;

  wil_dslash(outOdd, gauge, inEven, 1, dagger_bit, precision, gauge_param);
  wil_dslash(outEven, gauge, inOdd, 0, dagger_bit, precision, gauge_param);

  // lastly apply the kappa term
  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)in, -kappa, (double*)out, V*spinorSiteSize);
  else xpay((float*)in, -(float)kappa, (float*)out, V*spinorSiteSize);
}

コード例 #3

0

ファイルを表示

ファイル: laplb.c プロジェクト: g-koutsou/CoS-2

/*
 * Solves lapl(u) x = b, for x, given b, using Conjugate Gradient
 */
void
cg(latparams lp, field **x, field **b, link **g)
{
  size_t L = lp.L;
  int max_iter = 100;
  float tol = 1e-9;

  /* Temporary fields needed for CG */
  field **r = new_field(lp);
  field **p = new_field(lp);
  field **Ap = new_field(lp);

  /* Initial residual and p-vector */
  lapl(lp, r, x, g);
  xmy(lp, b, r);
  xeqy(lp, p, r);

  /* Initial r-norm and b-norm */
  float rr = xdotx(lp, r);  
  float bb = xdotx(lp, b);
  double t_lapl = 0;
  int iter = 0;
  for(iter=0; iter<max_iter; iter++) {
    printf(" %6d, res = %+e\n", iter, rr/bb);
    if(sqrt(rr/bb) < tol)
      break;
    double t = stop_watch(0);
    lapl(lp, Ap, p, g);
    t_lapl += stop_watch(t);
    float pAp = xdoty(lp, p, Ap);
    float alpha = rr/pAp;
    axpy(lp, alpha, p, x);
    axpy(lp, -alpha, Ap, r);
    float r1r1 = xdotx(lp, r);
    float beta = r1r1/rr;
    xpay(lp, r, beta, p);
    rr = r1r1;
  }

  /* Recompute residual after convergence */
  lapl(lp, r, x, g);
  xmy(lp, b, r);
  rr = xdotx(lp, r);

  double beta_fp = 50*((double)L*L*L)/(t_lapl/(double)iter)*1e-9;
  double beta_io = 40*((double)L*L*L)/(t_lapl/(double)iter)*1e-9;
  printf(" Converged after %6d iterations, res = %+e\n", iter, rr/bb);  
  printf(" Time in lapl(): %+6.3e sec/call, %4.2e Gflop/s, %4.2e GB/s\n",
	 t_lapl/(double)iter, beta_fp, beta_io);  

  del_field(r);
  del_field(p);
  del_field(Ap);
  return;
}

コード例 #4

0

ファイルを表示

ファイル: dslash_reference.cpp プロジェクト: rjchacko/klein-gould

// Apply the even-odd preconditioned Dirac operator
void MatPCDag(float *outEven, float **gauge, float *inEven, float kappa) {
    float *tmpOdd = (float*)malloc(Nh*spinorSiteSize*sizeof(float));
    
    // full dslash operator
    dslashReference(tmpOdd, gauge, inEven, 1, 1);
    dslashReference(outEven, gauge, tmpOdd, 0, 1);
    
    float kappa2 = -kappa*kappa;
    xpay(inEven, kappa2, outEven, Nh*spinorSiteSize);
    free(tmpOdd);
}

コード例 #5

0

ファイルを表示

ファイル: lapl.c プロジェクト: g-koutsou/LAP2015

/*
 * Solves lapl(u) x = b, for x, given b, using Conjugate Gradient
 */
void
cg(size_t L, _Complex float *x, _Complex float *b, _Complex float *u)
{
  int max_iter = 100;
  float tol = 1e-6;

  /* Temporary fields needed for CG */
  _Complex float *r = new_field(L);
  _Complex float *p = new_field(L);
  _Complex float *Ap = new_field(L);

  /* Initial residual and p-vector */
  lapl(L, r, x, u);
  xmy(L, b, r);
  xeqy(L, p, r);

  /* Initial r-norm and b-norm */
  float rr = xdotx(L, r);  
  float bb = xdotx(L, b);
  double t_lapl = 0;
  int iter = 0;
  for(iter=0; iter<max_iter; iter++) {
    printf(" %6d, res = %+e\n", iter, rr/bb);
    if(sqrt(rr/bb) < tol)
      break;
    double t = stop_watch(0);
    lapl(L, Ap, p, u);
    t_lapl += stop_watch(t);
    float pAp = xdoty(L, p, Ap);
    float alpha = rr/pAp;
    axpy(L, alpha, p, x);
    axpy(L, -alpha, Ap, r);
    float r1r1 = xdotx(L, r);
    float beta = r1r1/rr;
    xpay(L, r, beta, p);
    rr = r1r1;
  }

  /* Recompute residual after convergence */
  lapl(L, r, x, u);
  xmy(L, b, r);
  rr = xdotx(L, r);

  double beta_fp = 34*L*L/(t_lapl/(double)iter)*1e-9;
  double beta_io = 32*L*L/(t_lapl/(double)iter)*1e-9;
  printf(" Converged after %6d iterations, res = %+e\n", iter, rr/bb);  
  printf(" Time in lapl(): %+6.3e sec/call, %4.2e Gflop/s, %4.2e GB/s\n",
	 t_lapl/(double)iter, beta_fp, beta_io);  

  free(r);
  free(p);
  free(Ap);
  return;
}

コード例 #6

0

ファイルを表示

ファイル: dslash_reference.cpp プロジェクト: rjchacko/klein-gould

void MatDag(float *out, float **gauge, float *in, float kappa) {
    float *inEven = in;
    float *inOdd  = in + Nh*spinorSiteSize;
    float *outEven = out;
    float *outOdd = out + Nh*spinorSiteSize;
    
    // full dslash operator
    dslashReference(outOdd, gauge, inEven, 1, 1);
    dslashReference(outEven, gauge, inOdd, 0, 1);
    
    // lastly apply the kappa term
    xpay(in, -kappa, out, N*spinorSiteSize);
}

コード例 #7

0

ファイルを表示

ファイル: wilson_dslash_reference.cpp プロジェクト: kpetrov/quda

void tm_mat(void *out, void **gauge, void *in, double kappa, double mu, 
	    QudaTwistFlavorType flavor, int dagger_bit, QudaPrecision precision,
	    QudaGaugeParam &gauge_param) {

  void *inEven = in;
  void *inOdd  = (char*)in + Vh*spinorSiteSize*precision;
  void *outEven = out;
  void *outOdd = (char*)out + Vh*spinorSiteSize*precision;
  void *tmp = malloc(V*spinorSiteSize*precision);

  wil_dslash(outOdd, gauge, inEven, 1, dagger_bit, precision, gauge_param);
  wil_dslash(outEven, gauge, inOdd, 0, dagger_bit, precision, gauge_param);

  // apply the twist term to the full lattice
  twist_gamma5(tmp, in, dagger_bit, kappa, mu, flavor, V, QUDA_TWIST_GAMMA5_DIRECT, precision);

  // combine
  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp, -kappa, (double*)out, V*spinorSiteSize);
  else xpay((float*)tmp, -(float)kappa, (float*)out, V*spinorSiteSize);

  free(tmp);
}

コード例 #8

0

ファイルを表示

ファイル: wilson_dslash_reference.cpp プロジェクト: kpetrov/quda

// Apply the even-odd preconditioned Dirac operator
void wil_matpc(void *outEven, void **gauge, void *inEven, double kappa, 
	       QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision,
	       QudaGaugeParam &gauge_param) {

  void *tmp = malloc(Vh*spinorSiteSize*precision);
    
  // FIXME: remove once reference clover is finished
  // full dslash operator
  if (matpc_type == QUDA_MATPC_EVEN_EVEN || matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
    wil_dslash(tmp, gauge, inEven, 1, daggerBit, precision, gauge_param);
    wil_dslash(outEven, gauge, tmp, 0, daggerBit, precision, gauge_param);
  } else {
    wil_dslash(tmp, gauge, inEven, 0, daggerBit, precision, gauge_param);
    wil_dslash(outEven, gauge, tmp, 1, daggerBit, precision, gauge_param);
  }    
  
  // lastly apply the kappa term
  double kappa2 = -kappa*kappa;
  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)inEven, kappa2, (double*)outEven, Vh*spinorSiteSize);
  else xpay((float*)inEven, (float)kappa2, (float*)outEven, Vh*spinorSiteSize);

  free(tmp);
}

コード例 #9

0

ファイルを表示

ファイル: solverschnellundfalsch.c プロジェクト: luisaho/cuda-hpc

/***************************************
 *         Conjugate Gradient          *
 *   This function will do the CG      *
 *  algorithm without preconditioning. *
 *    For optimiziation you must not   *
 *        change the algorithm.        *
 ***************************************
 r(0)    = b - Ax(0)
 p(0)    = r(0)
 rho(0)    =  <r(0),r(0)>                
 ***************************************
 for k=0,1,2,...,n-1
   q(k)      = A * p(k)                 
   dot_pq    = <p(k),q(k)>             
   alpha     = rho(k) / dot_pq
   x(k+1)    = x(k) + alpha*p(k)      
   r(k+1)    = r(k) - alpha*q(k)     
   check convergence ||r(k+1)||_2 < eps  
	 rho(k+1)  = <r(k+1), r(k+1)>         
   beta      = rho(k+1) / rho(k)
   p(k+1)    = r(k+1) + beta*p(k)      
***************************************/
void cg(const int n, const int nnz, const int maxNNZ, const floatType* data, const int* indices, const int* length, const floatType* b, floatType* x, struct SolverConfig* sc){

	floatType* r, *p, *q;
	floatType alpha, beta, rho, rho_old, dot_pq, bnrm2;
	int iter;
 	double timeMatvec_s;
 	double timeMatvec=0;
	int i;
	floatType temp;
	
	/* allocate memory */
	r = (floatType*)malloc(n * sizeof(floatType));
	p = (floatType*)malloc(n * sizeof(floatType));
	q = (floatType*)malloc(n * sizeof(floatType));
	
#pragma acc data copyin(data[0:n*maxNNZ], indices[0:n*maxNNZ], length[0:n], n, nnz, maxNNZ, b[0:n]) copy(x[0:n]) create(alpha, beta, r[0:n], p[0:n], q[0:n], i, temp) //eigentlich auch copy(x[0:n]) aber error: not found on device???
{
	DBGMAT("Start matrix A = ", n, nnz, maxNNZ, data, indices, length)
	DBGVEC("b = ", b, n);
	DBGVEC("x = ", x, n);

	/* r(0)    = b - Ax(0) */
	timeMatvec_s = getWTime();
	matvec(n, nnz, maxNNZ, data, indices, length, x, r);
//hier inline ausprobieren
/*int i, j, k;
#pragma acc parallel loop present(data, indices, length, x)
	for (i = 0; i < n; i++) {
		r[i] = 0;
		for (j = 0; j < length[i]; j++) {
			k = j * n + i;
			r[i] += data[k] * x[indices[k]];
		}
	}*/
	timeMatvec += getWTime() - timeMatvec_s;
	xpay(b, -1.0, n, r);
	DBGVEC("r = b - Ax = ", r, n);
	

	/* Calculate initial residuum */
	nrm2(r, n, &bnrm2);
	bnrm2 = 1.0 /bnrm2;

	/* p(0)    = r(0) */
	memcpy(p, r, n*sizeof(floatType));
	DBGVEC("p = r = ", p, n);

	/* rho(0)    =  <r(0),r(0)> */
	vectorDot(r, r, n, &rho);
	printf("rho_0=%e\n", rho);

	for(iter = 0; iter < sc->maxIter; iter++){
		DBGMSG("=============== Iteration %d ======================\n", iter);
		/* q(k)      = A * p(k) */
		timeMatvec_s = getWTime();
		matvec(n, nnz, maxNNZ, data, indices, length, p, q);
		timeMatvec += getWTime() - timeMatvec_s;
		DBGVEC("q = A * p= ", q, n);

		/* dot_pq    = <p(k),q(k)> */
		vectorDot(p, q, n, &dot_pq);
		DBGSCA("dot_pq = <p, q> = ", dot_pq);

		/* alpha     = rho(k) / dot_pq */
		alpha = rho / dot_pq;
		DBGSCA("alpha = rho / dot_pq = ", alpha);

		/* x(k+1)    = x(k) + alpha*p(k) */
		axpy(alpha, p, n, x);
#pragma acc update host(x[0:n])
		DBGVEC("x = x + alpha * p= ", x, n);

		/* r(k+1)    = r(k) - alpha*q(k) */
		axpy(-alpha, q, n, r);
		DBGVEC("r = r - alpha * q= ", r, n);


		rho_old = rho;
		DBGSCA("rho_old = rho = ", rho_old);


		/* rho(k+1)  = <r(k+1), r(k+1)> */
		vectorDot(r, r, n, &rho);
		DBGSCA("rho = <r, r> = ", rho);

		/* Normalize the residual with initial one */
		sc->residual= sqrt(rho) * bnrm2;


   	
		/* Check convergence ||r(k+1)||_2 < eps
		 * If the residual is smaller than the CG
		 * tolerance specified in the CG_TOLERANCE
		 * environment variable our solution vector
		 * is good enough and we can stop the 
		 * algorithm. */
		printf("res_%d=%e\n", iter+1, sc->residual);
		if(sc->residual <= sc->tolerance)
			break;


		/* beta      = rho(k+1) / rho(k) */
		beta = rho / rho_old;
		DBGSCA("beta = rho / rho_old= ", beta);

		/* p(k+1)    = r(k+1) + beta*p(k) */
		xpay(r, beta, n, p);
		DBGVEC("p = r + beta * p> = ", p, n);

	}

	/* Store the number of iterations and the 
	 * time for the sparse matrix vector
	 * product which is the most expensive 
	 * function in the whole CG algorithm. */
	sc->iter = iter;
	sc->timeMatvec = timeMatvec;

	/* Clean up */
	free(r);
	free(p);
	free(q);
}//ende data region
}

コード例 #10

0

ファイルを表示

ファイル: cg_test.cpp プロジェクト: rjchacko/klein-gould

void cgTest() {
  float mass = 0.01;
  float kappa = 1.0 / (2.0*(4 + mass));

  float *gauge[4];
  for (int dir = 0; dir < 4; dir++) {
    gauge[dir] = (float*)malloc(N*gaugeSiteSize*sizeof(float));
  }
  //constructGaugeField(gauge);
  constructUnitGaugeField(gauge);
    
  float *spinorIn = (float*)malloc(N*spinorSiteSize*sizeof(float));
  float *spinorOut = (float*)malloc(N*spinorSiteSize*sizeof(float));

#ifdef EVEN_ODD
  float *source = (float *)malloc(Nh*spinorSiteSize*sizeof(float));
  float *tmp = (float *)malloc(Nh*spinorSiteSize*sizeof(float));
#else
  float *source = (float *)malloc(N*spinorSiteSize*sizeof(float));
#endif

  int i0 = 0;
  int s0 = 0;
  int c0 = 0;
  constructPointSpinorField(spinorIn, i0, s0, c0);
  //constructSpinorField(spinorIn);

  // Prepare the source term
  ax(2*kappa, spinorIn, N*spinorSiteSize);

  // see output element
  // Mat(source, gauge, spinorIn, kappa);
  // printSpinorElement(source, 0);

#ifdef EVEN_ODD
  float *spinorInOdd = spinorIn + Nh*spinorSiteSize;
  dslashReference(tmp, gauge, spinorInOdd, 0, 0);
  xpay(spinorIn, kappa, tmp, Nh*spinorSiteSize);
  MatPCDag(source, gauge, tmp, kappa);
#else
  MatDag(source, gauge, spinorIn, kappa);
#endif

  cgCuda(spinorOut, gauge, source, kappa, 1e-7);
//  cg_reference(spinorOut, gauge, source, kappa, 1e-7);

  // Reconstruct the full inverse
#ifdef EVEN_ODD
  float *spinorOutOdd = spinorOut + Nh*spinorSiteSize;
  dslashReference(spinorOutOdd, gauge, spinorOut, 1, 0);
  xpay(spinorInOdd, kappa, spinorOutOdd, Nh*spinorSiteSize);
#endif

  printf("Result norm = %e\n", norm(spinorOut, N*spinorSiteSize));

  // release memory
  for (int dir = 0; dir < 4; dir++) free(gauge[dir]);
  free(source);
#ifdef EVEN_ODD
  free(tmp);
#endif
  free(spinorIn);
  free(spinorOut);
}

コード例 #11

0

ファイルを表示

ファイル: bicgstab.cpp プロジェクト: stevengj/meep

/* BiCGSTAB(L) algorithm for the n-by-n problem Ax = b */
ptrdiff_t bicgstabL(const int L, const size_t n, realnum *x, bicgstab_op A, void *Adata,
                    const realnum *b, const double tol, int *iters, realnum *work,
                    const bool quiet) {
  if (!work) return (2 * L + 3) * n; // required workspace

  prealnum *r = new prealnum[L + 1];
  prealnum *u = new prealnum[L + 1];
  for (int i = 0; i <= L; ++i) {
    r[i] = work + i * n;
    u[i] = work + (L + 1 + i) * n;
  }

  double bnrm = norm2(n, b);
  if (bnrm == 0.0) bnrm = 1.0;

  int iter = 0;
  double last_output_wall_time = wall_time();

  double *gamma = new double[L + 1];
  double *gamma_p = new double[L + 1];
  double *gamma_pp = new double[L + 1];

  double *tau = new double[L * L];
  double *sigma = new double[L + 1];

  int ierr = 0; // error code to return, if any
  const double breaktol = 1e-30;

  /**** FIXME: check for breakdown conditions(?) during iteration  ****/

  // rtilde = r[0] = b - Ax
  realnum *rtilde = work + (2 * L + 2) * n;
  A(x, r[0], Adata);
  for (size_t m = 0; m < n; ++m)
    rtilde[m] = r[0][m] = b[m] - r[0][m];

  { /* Sleipjen normalizes rtilde in his code; it seems to help slightly */
    double s = 1.0 / norm2(n, rtilde);
    for (size_t m = 0; m < n; ++m)
      rtilde[m] *= s;
  }

  memset(u[0], 0, sizeof(realnum) * n); // u[0] = 0

  double rho = 1.0, alpha = 0, omega = 1;

  double resid;
  while ((resid = norm2(n, r[0])) > tol * bnrm) {
    ++iter;
    if (!quiet && wall_time() > last_output_wall_time + MEEP_MIN_OUTPUT_TIME) {
      master_printf("residual[%d] = %g\n", iter, resid / bnrm);
      last_output_wall_time = wall_time();
    }

    rho = -omega * rho;
    for (int j = 0; j < L; ++j) {
      if (fabs(rho) < breaktol) {
        ierr = -1;
        goto finish;
      }
      double rho1 = dot(n, r[j], rtilde);
      double beta = alpha * rho1 / rho;
      rho = rho1;
      for (int i = 0; i <= j; ++i)
        for (size_t m = 0; m < n; ++m)
          u[i][m] = r[i][m] - beta * u[i][m];
      A(u[j], u[j + 1], Adata);
      alpha = rho / dot(n, u[j + 1], rtilde);
      for (int i = 0; i <= j; ++i)
        xpay(n, r[i], -alpha, u[i + 1]);
      A(r[j], r[j + 1], Adata);
      xpay(n, x, alpha, u[0]);
    }

    for (int j = 1; j <= L; ++j) {
      for (int i = 1; i < j; ++i) {
        int ij = (j - 1) * L + (i - 1);
        tau[ij] = dot(n, r[j], r[i]) / sigma[i];
        xpay(n, r[j], -tau[ij], r[i]);
      }
      sigma[j] = dot(n, r[j], r[j]);
      gamma_p[j] = dot(n, r[0], r[j]) / sigma[j];
    }

    omega = gamma[L] = gamma_p[L];
    for (int j = L - 1; j >= 1; --j) {
      gamma[j] = gamma_p[j];
      for (int i = j + 1; i <= L; ++i)
        gamma[j] -= tau[(i - 1) * L + (j - 1)] * gamma[i];
    }
    for (int j = 1; j < L; ++j) {
      gamma_pp[j] = gamma[j + 1];
      for (int i = j + 1; i < L; ++i)
        gamma_pp[j] += tau[(i - 1) * L + (j - 1)] * gamma[i + 1];
    }

    xpay(n, x, gamma[1], r[0]);
    xpay(n, r[0], -gamma_p[L], r[L]);
    xpay(n, u[0], -gamma[L], u[L]);
    for (int j = 1; j < L; ++j) { /* TODO: use blas DGEMV (for L > 2) */
      xpay(n, x, gamma_pp[j], r[j]);
      xpay(n, r[0], -gamma_p[j], r[j]);
      xpay(n, u[0], -gamma[j], u[j]);
    }

    if (iter == *iters) {
      ierr = 1;
      break;
    }
  }

  if (!quiet) master_printf("final residual = %g\n", norm2(n, r[0]) / bnrm);

finish:
  delete[] sigma;
  delete[] tau;
  delete[] gamma_pp;
  delete[] gamma_p;
  delete[] gamma;
  delete[] u;
  delete[] r;

  *iters = iter;
  return ierr;
}