/* Ref: Weiss, Algorithm 11 CGS * INPUT * n : dimension of the problem * b [n] : r-h-s vector * atimes (int n, static double *x, double *b, void *param) : * calc matrix-vector product A.x = b. * atimes_param : parameters for atimes(). * it : struct iter. following entries are used * it->max = kend : max of iteration * it->eps = eps : criteria for |r^2|/|b^2| * OUTPUT * returned value : 0 == success, otherwise (-1) == failed * x [n] : solution * it->niter : # of iteration * it->res2 : |r^2| / |b^2| */ int cgs (int n, const double *b, double *x, void (*atimes) (int, const double *, double *, void *), void *atimes_param, struct iter *it) { #ifndef HAVE_CBLAS_H # ifdef HAVE_BLAS_H /* use Fortran BLAS routines */ int i_1 = 1; double d_m1 = -1.0; double d_2 = 2.0; # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H int ret = -1; double eps2 = it->eps * it->eps; int itmax = it->max; double *r = (double *)malloc (sizeof (double) * n); double *r0 = (double *)malloc (sizeof (double) * n); double *p = (double *)malloc (sizeof (double) * n); double *u = (double *)malloc (sizeof (double) * n); double *ap = (double *)malloc (sizeof (double) * n); double *q = (double *)malloc (sizeof (double) * n); double *t = (double *)malloc (sizeof (double) * n); CHECK_MALLOC (r, "cgs"); CHECK_MALLOC (r0, "cgs"); CHECK_MALLOC (p, "cgs"); CHECK_MALLOC (u, "cgs"); CHECK_MALLOC (ap, "cgs"); CHECK_MALLOC (q, "cgs"); CHECK_MALLOC (t, "cgs"); double r0ap; double rho, rho1; double delta; double beta; double res2 = 0.0; #ifdef HAVE_CBLAS_H /** * ATLAS version */ double b2 = cblas_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; // initial residue atimes (n, x, r, atimes_param); // r = A.x cblas_daxpy (n, -1.0, b, 1, r, 1); // r = A.x - b cblas_dcopy (n, r, 1, r0, 1); // r0* = r cblas_dcopy (n, r, 1, p, 1); // p = r cblas_dcopy (n, r, 1, u, 1); // u = r rho = cblas_ddot (n, r0, 1, r, 1); // rho = (r0*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p r0ap = cblas_ddot (n, r0, 1, ap, 1); // r0ap = (r0*, A.p) delta = - rho / r0ap; cblas_dcopy (n, u, 1, q, 1); // q = u cblas_dscal (n, 2.0, q, 1); // q = 2 u cblas_daxpy (n, delta, ap, 1, q, 1); // q = 2 u + delta A.p atimes (n, q, t, atimes_param); // t = A.q cblas_daxpy (n, delta, t, 1, r, 1); // r = r + delta t cblas_daxpy (n, delta, q, 1, x, 1); // x = x + delta q res2 = cblas_ddot (n, r, 1, r, 1); if (it->debug == 2) { fprintf (it->out, "libiter-cgs %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = cblas_ddot (n, r0, 1, r, 1); // rho = (r0*, r) beta = rho1 / rho; rho = rho1; // here t is not used so that this is used for working area. double *qu = t; cblas_dcopy (n, q, 1, qu, 1); // qu = q cblas_daxpy (n, -1.0, u, 1, qu, 1); // qu = q - u cblas_dcopy (n, r, 1, u, 1); // u = r cblas_daxpy (n, beta, qu, 1, u, 1); // u = r + beta (q - u) cblas_daxpy (n, beta, p, 1, qu, 1); // qu = q - u + beta * p cblas_dcopy (n, u, 1, p, 1); // p = u cblas_daxpy (n, beta, qu, 1, p, 1); // p = u + beta (q - u + b * p) } #else // !HAVE_CBLAS_H # ifdef HAVE_BLAS_H /** * BLAS version */ double b2 = ddot_ (&n, b, &i_1, b, &i_1); // (b,b) eps2 *= b2; // initial residue atimes (n, x, r, atimes_param); // r = A.x daxpy_ (&n, &d_m1, b, &i_1, r, &i_1); // r = A.x - b dcopy_ (&n, r, &i_1, r0, &i_1); // r0* = r dcopy_ (&n, r, &i_1, p, &i_1); // p = r dcopy_ (&n, r, &i_1, u, &i_1); // u = r rho = ddot_ (&n, r0, &i_1, r, &i_1); // rho = (r0*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p r0ap = ddot_ (&n, r0, &i_1, ap, &i_1); // r0ap = (r0*, A.p) delta = - rho / r0ap; dcopy_ (&n, u, &i_1, q, &i_1); // q = u dscal_ (&n, &d_2, q, &i_1); // q = 2 u daxpy_ (&n, &delta, ap, &i_1, q, &i_1); // q = 2 u + delta A.p atimes (n, q, t, atimes_param); // t = A.q daxpy_ (&n, &delta, t, &i_1, r, &i_1); // r = r + delta t daxpy_ (&n, &delta, q, &i_1, x, &i_1); // x = x + delta q res2 = ddot_ (&n, r, &i_1, r, &i_1); if (it->debug == 2) { fprintf (it->out, "libiter-cgs %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = ddot_ (&n, r0, &i_1, r, &i_1); // rho = (r0*, r) beta = rho1 / rho; rho = rho1; // here t is not used so that this is used for working area. double *qu = t; dcopy_ (&n, q, &i_1, qu, &i_1); // qu = q daxpy_ (&n, &d_m1, u, &i_1, qu, &i_1); // qu = q - u dcopy_ (&n, r, &i_1, u, &i_1); // u = r daxpy_ (&n, &beta, qu, &i_1, u, &i_1); // u = r + beta (q - u) daxpy_ (&n, &beta, p, &i_1, qu, &i_1); // qu = q - u + beta * p dcopy_ (&n, u, &i_1, p, &i_1); // p = u daxpy_ (&n, &beta, qu, &i_1, p, &i_1); // p = u + beta (q - u + b * p) } # else // !HAVE_BLAS_H /** * local BLAS version */ double b2 = my_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; // initial residue atimes (n, x, r, atimes_param); // r = A.x my_daxpy (n, -1.0, b, 1, r, 1); // r = A.x - b my_dcopy (n, r, 1, r0, 1); // r0* = r my_dcopy (n, r, 1, p, 1); // p = r my_dcopy (n, r, 1, u, 1); // u = r rho = my_ddot (n, r0, 1, r, 1); // rho = (r0*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p r0ap = my_ddot (n, r0, 1, ap, 1); // r0ap = (r0*, A.p) delta = - rho / r0ap; my_dcopy (n, u, 1, q, 1); // q = u my_dscal (n, 2.0, q, 1); // q = 2 u my_daxpy (n, delta, ap, 1, q, 1); // q = 2 u + delta A.p atimes (n, q, t, atimes_param); // t = A.q my_daxpy (n, delta, t, 1, r, 1); // r = r + delta t my_daxpy (n, delta, q, 1, x, 1); // x = x + delta q res2 = my_ddot (n, r, 1, r, 1); if (it->debug == 2) { fprintf (it->out, "libiter-cgs %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = my_ddot (n, r0, 1, r, 1); // rho = (r0*, r) beta = rho1 / rho; rho = rho1; // here t is not used so that this is used for working area. double *qu = t; my_dcopy (n, q, 1, qu, 1); // qu = q my_daxpy (n, -1.0, u, 1, qu, 1); // qu = q - u my_dcopy (n, r, 1, u, 1); // u = r my_daxpy (n, beta, qu, 1, u, 1); // u = r + beta (q - u) my_daxpy (n, beta, p, 1, qu, 1); // qu = q - u + beta * p my_dcopy (n, u, 1, p, 1); // p = u my_daxpy (n, beta, qu, 1, p, 1); // p = u + beta (q - u + b * p) } # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H free (r); free (r0); free (p); free (u); free (ap); free (q); free (t); if (it->debug == 1) { fprintf (it->out, "libiter-cgs it= %d res^2= %e\n", i, res2); } it->niter = i; it->res2 = res2 / b2; return (ret); }
/* Ref: Weiss, Algorithm 12 BiCGSTAB * INPUT * n : dimension of the problem * b [n] : r-h-s vector * atimes (int n, static double *x, double *b, void *param) : * calc matrix-vector product A.x = b. * atimes_param : parameters for atimes(). * it : struct iter. following entries are used * it->max = kend : max of iteration * it->eps = eps : criteria for |r^2|/|b^2| * OUTPUT * returned value : 0 == success, otherwise (-1) == failed * x [n] : solution * it->niter : # of iteration * it->res2 : |r^2| / |b^2| */ int bicgstab (int n, const double *b, double *x, void (*atimes) (int, const double *, double *, void *), void *atimes_param, struct iter *it) { #ifndef HAVE_CBLAS_H # ifdef HAVE_BLAS_H /* use Fortran BLAS routines */ int i_1 = 1; double d_1 = 1.0; double d_m1 = -1.0; # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H int ret = -1; double eps2 = it->eps * it->eps; int itmax = it->max; double *r = (double *)malloc (sizeof (double) * n); double *rs = (double *)malloc (sizeof (double) * n); double *p = (double *)malloc (sizeof (double) * n); double *ap = (double *)malloc (sizeof (double) * n); double *s = (double *)malloc (sizeof (double) * n); double *t = (double *)malloc (sizeof (double) * n); CHECK_MALLOC (r, "bicgstab"); CHECK_MALLOC (rs, "bicgstab"); CHECK_MALLOC (p, "bicgstab"); CHECK_MALLOC (ap, "bicgstab"); CHECK_MALLOC (s, "bicgstab"); CHECK_MALLOC (t, "bicgstab"); double rsap; // (r*, A.p) double st; double t2; double rho, rho1; double delta; double gamma; double beta; double res2 = 0.0; #ifdef HAVE_CBLAS_H /** * ATLAS version */ double b2 = cblas_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x ... cblas_daxpy (n, -1.0, b, 1, r, 1); // - b cblas_dcopy (n, r, 1, rs, 1); // r* = r cblas_dcopy (n, r, 1, p, 1); // p = r rho = cblas_ddot (n, rs, 1, r, 1); // rho = (r*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p rsap = cblas_ddot (n, rs, 1, ap, 1); // rsap = (r*, A.p) delta = - rho / rsap; cblas_dcopy (n, r, 1, s, 1); // s = r ... cblas_daxpy (n, delta, ap, 1, s, 1); // + delta A.p atimes (n, s, t, atimes_param); // t = A.s st = cblas_ddot (n, s, 1, t, 1); // st = (s, t) t2 = cblas_ddot (n, t, 1, t, 1); // t2 = (t, t) gamma = - st / t2; cblas_dcopy (n, s, 1, r, 1); // r = s ... cblas_daxpy (n, gamma, t, 1, r, 1); // + gamma t cblas_daxpy (n, delta, p, 1, x, 1); // x = x + delta p... cblas_daxpy (n, gamma, s, 1, x, 1); // + gamma s res2 = cblas_ddot (n, r, 1, r, 1); if (it->debug == 2) { fprintf (it->out, "libiter-bicgstab(cblas) %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = cblas_ddot (n, rs, 1, r, 1); // rho = (r*, r) beta = rho1 / rho * delta / gamma; rho = rho1; cblas_daxpy (n, gamma, ap, 1, p, 1); // p = p + gamma A.p cblas_dscal (n, beta, p, 1); // p = beta (p + gamma A.p) cblas_daxpy (n, 1.0, r, 1, p, 1); // p = r + beta(p + gamma A.p) } #else // !HAVE_CBLAS_H # ifdef HAVE_BLAS_H /** * BLAS version */ double b2 = ddot_ (&n, b, &i_1, b, &i_1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x ... daxpy_ (&n, &d_m1, b, &i_1, r, &i_1); // - b dcopy_ (&n, r, &i_1, rs, &i_1); // r* = r dcopy_ (&n, r, &i_1, p, &i_1); // p = r rho = ddot_ (&n, rs, &i_1, r, &i_1); // rho = (r*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p rsap = ddot_ (&n, rs, &i_1, ap, &i_1); // rsap = (r*, A.p) delta = - rho / rsap; dcopy_ (&n, r, &i_1, s, &i_1); // s = r ... daxpy_ (&n, &delta, ap, &i_1, s, &i_1); // + delta A.p atimes (n, s, t, atimes_param); // t = A.s st = ddot_ (&n, s, &i_1, t, &i_1); // st = (s, t) t2 = ddot_ (&n, t, &i_1, t, &i_1); // t2 = (t, t) gamma = - st / t2; dcopy_ (&n, s, &i_1, r, &i_1); // r = s ... daxpy_ (&n, &gamma, t, &i_1, r, &i_1); // + gamma t daxpy_ (&n, &delta, p, &i_1, x, &i_1); // x = x + delta p... daxpy_ (&n, &gamma, s, &i_1, x, &i_1); // + gamma s res2 = ddot_ (&n, r, &i_1, r, &i_1); if (it->debug == 2) { fprintf (it->out, "libiter-bicgstab(blas) %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } if (res2 > 1.0e20) { // already too big residual break; } rho1 = ddot_ (&n, rs, &i_1, r, &i_1); // rho = (r*, r) beta = rho1 / rho * delta / gamma; rho = rho1; daxpy_ (&n, &gamma, ap, &i_1, p, &i_1); // p = p + gamma A.p dscal_ (&n, &beta, p, &i_1); // p = beta (p + gamma A.p) daxpy_ (&n, &d_1, r, &i_1, p, &i_1); // p = r + beta(p + gamma A.p) } # else // !HAVE_BLAS_H /** * local BLAS version */ double b2 = my_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x ... my_daxpy (n, -1.0, b, 1, r, 1); // - b my_dcopy (n, r, 1, rs, 1); // r* = r my_dcopy (n, r, 1, p, 1); // p = r rho = my_ddot (n, rs, 1, r, 1); // rho = (r*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p rsap = my_ddot (n, rs, 1, ap, 1); // rsap = (r*, A.p) delta = - rho / rsap; my_dcopy (n, r, 1, s, 1); // s = r ... my_daxpy (n, delta, ap, 1, s, 1); // + delta A.p atimes (n, s, t, atimes_param); // t = A.s st = my_ddot (n, s, 1, t, 1); // st = (s, t) t2 = my_ddot (n, t, 1, t, 1); // t2 = (t, t) gamma = - st / t2; my_dcopy (n, s, 1, r, 1); // r = s ... my_daxpy (n, gamma, t, 1, r, 1); // + gamma t my_daxpy (n, delta, p, 1, x, 1); // x = x + delta p... my_daxpy (n, gamma, s, 1, x, 1); // + gamma s res2 = my_ddot (n, r, 1, r, 1); if (it->debug == 2) { fprintf (it->out, "libiter-bicgstab(myblas) %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = my_ddot (n, rs, 1, r, 1); // rho = (r*, r) beta = rho1 / rho * delta / gamma; rho = rho1; my_daxpy (n, gamma, ap, 1, p, 1); // p = p + gamma A.p my_dscal (n, beta, p, 1); // p = beta (p + gamma A.p) my_daxpy (n, 1.0, r, 1, p, 1); // p = r + beta(p + gamma A.p) } # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H free (r); free (rs); free (p); free (ap); free (s); free (t); if (it->debug == 1) { fprintf (it->out, "libiter-bicgstab %d %e\n", i, res2 / b2); } it->niter = i; it->res2 = res2 / b2; return (ret); }
/* Classical CG method -- Weiss' Algorithm 2 * INPUT * n : dimension of the problem * b [n] : r-h-s vector * atimes (int n, static double *x, double *b, void *param) : * calc matrix-vector product A.x = b. * atimes_param : parameters for atimes(). * it : struct iter. following entries are used * it->max = kend : max of iteration * it->eps = eps : criteria for |r^2|/|b^2| * OUTPUT * returned value : 0 == success, otherwise (-1) == failed * x [n] : solution * it->niter : # of iteration * it->res2 : |r^2| / |b^2| */ int cg (int n, const double *b, double *x, void (*atimes) (int, const double *, double *, void *), void *atimes_param, struct iter *it) { #ifndef HAVE_CBLAS_H # ifdef HAVE_BLAS_H /* use Fortran BLAS routines */ int i_1 = 1; double d_1 = 1.0; double d_m1 = -1.0; # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H int ret = -1; double eps2 = it->eps * it->eps; int itmax = it->max; double *p = (double *)malloc (sizeof (double) * n); double *r = (double *)malloc (sizeof (double) * n); double *ap = (double *)malloc (sizeof (double) * n); CHECK_MALLOC (p, "cg"); CHECK_MALLOC (r, "cg"); CHECK_MALLOC (ap, "cg"); double r2; double res2 = 0.0; double pap; double gamma; double beta; int i; #ifdef HAVE_CBLAS_H /** * ATLAS version */ double b2 = cblas_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x cblas_daxpy (n, -1.0, b, 1, r, 1); // r = A.x - b cblas_dcopy (n, r, 1, p, 1); // p = r for (i = 0; i < itmax; i ++) { r2 = cblas_ddot (n, r, 1, r, 1); // r2 = (r, r) atimes (n, p, ap, atimes_param); // ap = A.p pap = cblas_ddot (n, p, 1, ap, 1); // pap = (p, A.p) gamma = - r2 / pap; // gamma = - (r, r) / (p, A.p) cblas_daxpy (n, gamma, p, 1, x, 1); // x += gamma p cblas_daxpy (n, gamma, ap, 1, r, 1); // r += gamma Ap // new norm of r res2 = cblas_ddot (n, r, 1, r, 1); // (r, r) if (it->debug == 2) { fprintf (it->out, "libiter-cg %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } beta = res2 / r2; // beta = (r, r) / (r0, r0) cblas_dscal (n, beta, p, 1); // p *= beta cblas_daxpy (n, 1.0, r, 1, p, 1); // p = r + beta p r2 = res2; } #else // !HAVE_CBLAS_H # ifdef HAVE_BLAS_H /** * BLAS version */ double b2 = ddot_ (&n, b, &i_1, b, &i_1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x daxpy_ (&n, &d_m1, b, &i_1, r, &i_1); // r = A.x - b dcopy_ (&n, r, &i_1, p, &i_1); // p = r for (i = 0; i < itmax; i ++) { r2 = ddot_ (&n, r, &i_1, r, &i_1); // r2 = (r, r) atimes (n, p, ap, atimes_param); // ap = A.p pap = ddot_ (&n, p, &i_1, ap, &i_1); // pap = (p, A.p) gamma = - r2 / pap; // gamma = - (r, r) / (p, A.p) daxpy_ (&n, &gamma, p, &i_1, x, &i_1); // x += gamma p daxpy_ (&n, &gamma, ap, &i_1, r, &i_1); // r += gamma Ap // new norm of r res2 = ddot_ (&n, r, &i_1, r, &i_1); // (r, r) if (it->debug == 2) { fprintf (it->out, "libiter-cg %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } beta = res2 / r2; // beta = (r, r) / (r0, r0) dscal_ (&n, &beta, p, &i_1); // p *= beta daxpy_ (&n, &d_1, r, &i_1, p, &i_1); // p = r + beta p r2 = res2; } # else // !HAVE_BLAS_H /** * local BLAS version */ double b2 = my_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x my_daxpy (n, -1.0, b, 1, r, 1); // r = A.x - b my_dcopy (n, r, 1, p, 1); // p = r for (i = 0; i < itmax; i ++) { r2 = my_ddot (n, r, 1, r, 1); // r2 = (r, r) atimes (n, p, ap, atimes_param); // ap = A.p pap = my_ddot (n, p, 1, ap, 1); // pap = (p, A.p) gamma = - r2 / pap; // gamma = - (r, r) / (p, A.p) my_daxpy (n, gamma, p, 1, x, 1); // x += gamma p my_daxpy (n, gamma, ap, 1, r, 1); // r += gamma Ap // new norm of r res2 = my_ddot (n, r, 1, r, 1); // (r, r) if (it->debug == 2) { fprintf (it->out, "libiter-cg %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } beta = res2 / r2; // beta = (r, r) / (r0, r0) my_dscal (n, beta, p, 1); // p *= beta my_daxpy (n, 1.0, r, 1, p, 1); // p = r + beta p r2 = res2; } # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H free (p); free (r); free (ap); if (it->debug == 1) { fprintf (it->out, "libiter-cg it= %d res^2= %e\n", i, res2 / b2); } it->niter = i; it->res2 = res2 / b2; return (ret); }