/* Ref: Weiss, Algorithm 12 BiCGSTAB * INPUT * n : dimension of the problem * b [n] : r-h-s vector * atimes (int n, static double *x, double *b, void *param) : * calc matrix-vector product A.x = b. * atimes_param : parameters for atimes(). * it : struct iter. following entries are used * it->max = kend : max of iteration * it->eps = eps : criteria for |r^2|/|b^2| * OUTPUT * returned value : 0 == success, otherwise (-1) == failed * x [n] : solution * it->niter : # of iteration * it->res2 : |r^2| / |b^2| */ int bicgstab (int n, const double *b, double *x, void (*atimes) (int, const double *, double *, void *), void *atimes_param, struct iter *it) { #ifndef HAVE_CBLAS_H # ifdef HAVE_BLAS_H /* use Fortran BLAS routines */ int i_1 = 1; double d_1 = 1.0; double d_m1 = -1.0; # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H int ret = -1; double eps2 = it->eps * it->eps; int itmax = it->max; double *r = (double *)malloc (sizeof (double) * n); double *rs = (double *)malloc (sizeof (double) * n); double *p = (double *)malloc (sizeof (double) * n); double *ap = (double *)malloc (sizeof (double) * n); double *s = (double *)malloc (sizeof (double) * n); double *t = (double *)malloc (sizeof (double) * n); CHECK_MALLOC (r, "bicgstab"); CHECK_MALLOC (rs, "bicgstab"); CHECK_MALLOC (p, "bicgstab"); CHECK_MALLOC (ap, "bicgstab"); CHECK_MALLOC (s, "bicgstab"); CHECK_MALLOC (t, "bicgstab"); double rsap; // (r*, A.p) double st; double t2; double rho, rho1; double delta; double gamma; double beta; double res2 = 0.0; #ifdef HAVE_CBLAS_H /** * ATLAS version */ double b2 = cblas_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x ... cblas_daxpy (n, -1.0, b, 1, r, 1); // - b cblas_dcopy (n, r, 1, rs, 1); // r* = r cblas_dcopy (n, r, 1, p, 1); // p = r rho = cblas_ddot (n, rs, 1, r, 1); // rho = (r*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p rsap = cblas_ddot (n, rs, 1, ap, 1); // rsap = (r*, A.p) delta = - rho / rsap; cblas_dcopy (n, r, 1, s, 1); // s = r ... cblas_daxpy (n, delta, ap, 1, s, 1); // + delta A.p atimes (n, s, t, atimes_param); // t = A.s st = cblas_ddot (n, s, 1, t, 1); // st = (s, t) t2 = cblas_ddot (n, t, 1, t, 1); // t2 = (t, t) gamma = - st / t2; cblas_dcopy (n, s, 1, r, 1); // r = s ... cblas_daxpy (n, gamma, t, 1, r, 1); // + gamma t cblas_daxpy (n, delta, p, 1, x, 1); // x = x + delta p... cblas_daxpy (n, gamma, s, 1, x, 1); // + gamma s res2 = cblas_ddot (n, r, 1, r, 1); if (it->debug == 2) { fprintf (it->out, "libiter-bicgstab(cblas) %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = cblas_ddot (n, rs, 1, r, 1); // rho = (r*, r) beta = rho1 / rho * delta / gamma; rho = rho1; cblas_daxpy (n, gamma, ap, 1, p, 1); // p = p + gamma A.p cblas_dscal (n, beta, p, 1); // p = beta (p + gamma A.p) cblas_daxpy (n, 1.0, r, 1, p, 1); // p = r + beta(p + gamma A.p) } #else // !HAVE_CBLAS_H # ifdef HAVE_BLAS_H /** * BLAS version */ double b2 = ddot_ (&n, b, &i_1, b, &i_1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x ... daxpy_ (&n, &d_m1, b, &i_1, r, &i_1); // - b dcopy_ (&n, r, &i_1, rs, &i_1); // r* = r dcopy_ (&n, r, &i_1, p, &i_1); // p = r rho = ddot_ (&n, rs, &i_1, r, &i_1); // rho = (r*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p rsap = ddot_ (&n, rs, &i_1, ap, &i_1); // rsap = (r*, A.p) delta = - rho / rsap; dcopy_ (&n, r, &i_1, s, &i_1); // s = r ... daxpy_ (&n, &delta, ap, &i_1, s, &i_1); // + delta A.p atimes (n, s, t, atimes_param); // t = A.s st = ddot_ (&n, s, &i_1, t, &i_1); // st = (s, t) t2 = ddot_ (&n, t, &i_1, t, &i_1); // t2 = (t, t) gamma = - st / t2; dcopy_ (&n, s, &i_1, r, &i_1); // r = s ... daxpy_ (&n, &gamma, t, &i_1, r, &i_1); // + gamma t daxpy_ (&n, &delta, p, &i_1, x, &i_1); // x = x + delta p... daxpy_ (&n, &gamma, s, &i_1, x, &i_1); // + gamma s res2 = ddot_ (&n, r, &i_1, r, &i_1); if (it->debug == 2) { fprintf (it->out, "libiter-bicgstab(blas) %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } if (res2 > 1.0e20) { // already too big residual break; } rho1 = ddot_ (&n, rs, &i_1, r, &i_1); // rho = (r*, r) beta = rho1 / rho * delta / gamma; rho = rho1; daxpy_ (&n, &gamma, ap, &i_1, p, &i_1); // p = p + gamma A.p dscal_ (&n, &beta, p, &i_1); // p = beta (p + gamma A.p) daxpy_ (&n, &d_1, r, &i_1, p, &i_1); // p = r + beta(p + gamma A.p) } # else // !HAVE_BLAS_H /** * local BLAS version */ double b2 = my_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x ... my_daxpy (n, -1.0, b, 1, r, 1); // - b my_dcopy (n, r, 1, rs, 1); // r* = r my_dcopy (n, r, 1, p, 1); // p = r rho = my_ddot (n, rs, 1, r, 1); // rho = (r*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p rsap = my_ddot (n, rs, 1, ap, 1); // rsap = (r*, A.p) delta = - rho / rsap; my_dcopy (n, r, 1, s, 1); // s = r ... my_daxpy (n, delta, ap, 1, s, 1); // + delta A.p atimes (n, s, t, atimes_param); // t = A.s st = my_ddot (n, s, 1, t, 1); // st = (s, t) t2 = my_ddot (n, t, 1, t, 1); // t2 = (t, t) gamma = - st / t2; my_dcopy (n, s, 1, r, 1); // r = s ... my_daxpy (n, gamma, t, 1, r, 1); // + gamma t my_daxpy (n, delta, p, 1, x, 1); // x = x + delta p... my_daxpy (n, gamma, s, 1, x, 1); // + gamma s res2 = my_ddot (n, r, 1, r, 1); if (it->debug == 2) { fprintf (it->out, "libiter-bicgstab(myblas) %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = my_ddot (n, rs, 1, r, 1); // rho = (r*, r) beta = rho1 / rho * delta / gamma; rho = rho1; my_daxpy (n, gamma, ap, 1, p, 1); // p = p + gamma A.p my_dscal (n, beta, p, 1); // p = beta (p + gamma A.p) my_daxpy (n, 1.0, r, 1, p, 1); // p = r + beta(p + gamma A.p) } # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H free (r); free (rs); free (p); free (ap); free (s); free (t); if (it->debug == 1) { fprintf (it->out, "libiter-bicgstab %d %e\n", i, res2 / b2); } it->niter = i; it->res2 = res2 / b2; return (ret); }
/* N>1 */ double fit_N_point_em_f(hpixelf *parr, int npix, int Nf, double *freqs, double *bmaj, double *bmin, double *bpa, double ref_freq, int maxiter, int max_em_iter, double *ll, double *mm, double *sI, double *sP, int N, int Nh, hpoint *hull){ int ci,cj,ck; double *p,*p1,*p2, // params m x 1 *x; // observed data n x 1, the image pixel fluxes double *xdummy, *xsub; //extra arrays int m,n; double *b; /* affine combination */ double opts[CLM_OPTS_SZ], info[CLM_INFO_SZ]; double l_min,l_max,m_min,m_max,sumI; double fraction; double penalty; /* penalty for solutions with components out of pixel range */ fit_double_point_dataf lmdata; /* for initial average pixel fit */ hpixel *parrav; double mean_bmaj,mean_bmin,mean_bpa,mean_err; /***** first do a fit for average pixesl, using average PSF ******/ if ((parrav=(hpixel*)calloc((size_t)(npix),sizeof(hpixel)))==0) { fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); exit(1); } for (ci=0; ci<npix; ++ci) { parrav[ci].x=parr[ci].x; parrav[ci].y=parr[ci].y; parrav[ci].l=parr[ci].l; parrav[ci].m=parr[ci].m; parrav[ci].ra=parr[ci].ra; parrav[ci].dec=parr[ci].dec; parrav[ci].sI=0.0; for (cj=0; cj<Nf; ++cj) { parrav[ci].sI+=parr[ci].sI[cj]; } parrav[ci].sI/=(double)Nf; } mean_bmaj=mean_bmin=mean_bpa=0.0; for (cj=0; cj<Nf; ++cj) { mean_bmaj+=bmaj[cj]; mean_bmin+=bmin[cj]; mean_bpa+=bpa[cj]; } mean_bmaj/=(double)Nf; mean_bmin/=(double)Nf; mean_bpa/=(double)Nf; /* we get mean_err=2*3*N+npix*log(error) */ mean_err=fit_N_point_em(parrav, npix, mean_bmaj, mean_bmin, mean_bpa, maxiter, max_em_iter, ll, mm, sI, N, Nh, hull); free(parrav); opts[0]=CLM_INIT_MU; opts[1]=1E-15; opts[2]=1E-15; opts[3]=1E-15; opts[4]=-CLM_DIFF_DELTA; m=4; /* 1x1 flux, 3x1 spec index for each component */ n=Nf*npix; /* no of pixels */ if ((p=(double*)calloc((size_t)(m),sizeof(double)))==0) { fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); exit(1); } if ((p1=(double*)calloc((size_t)(m),sizeof(double)))==0) { fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); exit(1); } if ((p2=(double*)calloc((size_t)(m),sizeof(double)))==0) { fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); exit(1); } if ((x=(double*)calloc((size_t)(n),sizeof(double)))==0) { fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); exit(1); } if ((xsub=(double*)calloc((size_t)(n),sizeof(double)))==0) { fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); exit(1); } if ((xdummy=(double*)calloc((size_t)(n),sizeof(double)))==0) { fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); exit(1); } if ((b=(double*)calloc((size_t)(N),sizeof(double)))==0) { fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); exit(1); } l_min=m_min=INFINITY_L; l_max=m_max=-INFINITY_L; sumI=0.0; /* only use valid pixels for initial conditions */ for (ci=0; ci<npix; ci++) { if (parr[ci].ra!=-1 && parr[ci].m!=-1) { if (l_min>parr[ci].l) { l_min=parr[ci].l; } if (l_max<parr[ci].l) { l_max=parr[ci].l; } if (m_min>parr[ci].m) { m_min=parr[ci].m; } if (m_max<parr[ci].m) { m_max=parr[ci].m; } } } ck=0; for (cj=0; cj<Nf; ++cj) { for (ci=0; ci<npix; ci++) { x[ck++]=parr[ci].sI[cj]; sumI+=parr[ci].sI[cj]; } } sumI/=(double)n; fraction=1.0;///(double)N; /**********************************/ for (ci=0; ci<N; ci++) { sP[ci]=0.0; sP[ci+N]=0.0; sP[ci+2*N]=0.0; b[ci]=fraction; } lmdata.Nf=Nf; lmdata.parr=parr; lmdata.freqs=freqs; lmdata.bmaj=bmaj; lmdata.bmin=bmin; lmdata.bpa=bpa; lmdata.ref_freq=ref_freq; double aic1,aic2,aic3; for (ci=0; ci<max_em_iter; ci++) { for (cj=0; cj<N; cj++) { /* calculate contribution from hidden data, subtract from x */ memcpy(xdummy,x,(size_t)(n)*sizeof(double)); for (ck=0; ck<N; ck++) { if (ck!=cj) { lmdata.ll=&ll[ck]; /* pointer to positions */ lmdata.mm=&mm[ck]; p[0]=sI[ck]; p[1]=sP[ck]; p[2]=sP[ck+N]; p[3]=sP[ck+2*N]; mylm_fit_single_pf(p, xsub, m, n, (void*)&lmdata); /* xdummy=xdummy-b*xsub */ my_daxpy(n, xsub, -b[ck], xdummy); } } lmdata.ll=&ll[cj]; /* pointer to positions */ lmdata.mm=&mm[cj]; p[0]=p1[0]=p2[0]=sI[cj]; p[1]=p1[1]=p2[1]=sP[cj]; p[2]=p2[2]=sP[cj+N]; p1[2]=0.0; p[3]=sP[cj+2*N]; p1[3]=p2[3]=0.0; //ret=dlevmar_dif(mylm_fit_single_pf, p, xdummy, m, n, maxiter, opts, info, NULL, NULL, (void*)&lmdata); // no Jacobian clevmar_der_single_nocuda(mylm_fit_single_pf, NULL, p, xdummy, m, n, maxiter, opts, info, 2, (void*)&lmdata); // no Jacobian /* penalize only 1/10 of parameters */ aic3=0.3+log(info[1]); clevmar_der_single_nocuda(mylm_fit_single_pf_2d, NULL, p2, xdummy, m-1, n, maxiter, opts, info, 2, (void*)&lmdata); // no Jacobian aic2=0.2+log(info[1]); clevmar_der_single_nocuda(mylm_fit_single_pf_1d, NULL, p1, xdummy, m-2, n, maxiter, opts, info, 2, (void*)&lmdata); // no Jacobian aic1=0.1+log(info[1]); /* choose one with minimum error */ if (aic3<aic2) { if (aic3<aic1) { /* 3d */ sI[cj]=p[0]; sP[cj]=p[1]; sP[cj+N]=p[2]; sP[cj+2*N]=p[3]; } else { /* 1d */ sI[cj]=p1[0]; sP[cj]=p1[1]; sP[cj+N]=p1[2]; sP[cj+2*N]=p1[3]; } } else { if (aic2<aic1) { /* 2d */ sI[cj]=p2[0]; sP[cj]=p2[1]; sP[cj+N]=p2[2]; sP[cj+2*N]=p2[3]; } else { /* 1d */ sI[cj]=p1[0]; sP[cj]=p1[1]; sP[cj+N]=p1[2]; sP[cj+2*N]=p1[3]; } } } } /**********************************/ #ifdef DEBUG print_levmar_info(info[0],info[1],(int)info[5], (int)info[6], (int)info[7], (int)info[8], (int)info[9]); printf("Levenberg-Marquardt returned %d in %g iter, reason %g\nSolution: ", ret, info[5], info[6]); #endif /* check for solutions such that l_min <= ll <= l_max and m_min <= mm <= m_max */ penalty=0.0; for (ci=0; ci<N; ci++) { /* position out of range */ if (ll[ci]<l_min || ll[ci]>l_max || mm[ci]<m_min || mm[ci]>m_max) { penalty+=INFINITY_L; } /* spec index too high to be true */ if (fabs(sP[ci])>20.0) { penalty+=INFINITY_L; } } /* calculate error */ memcpy(xdummy,x,(size_t)(n)*sizeof(double)); for (cj=0; cj<N; cj++) { for (ck=0; ck<N; ck++) { lmdata.ll=&ll[ck]; /* pointer to positions */ lmdata.mm=&mm[ck]; p[0]=sI[ck]; p[1]=sP[ck]; p[2]=sP[ck+N]; p[3]=sP[ck+2*N]; mylm_fit_single_pf(p, xsub, m, n, (void*)&lmdata); /* xdummy=xdummy-b*xsub */ my_daxpy(n, xsub, -1.0, xdummy); } } /*sumI=0.0; for (ci=0; ci<n; ++ci ){ sumI+=xdummy[ci]*xdummy[ci]; } */ sumI=my_dnrm2(n,xdummy); sumI=sumI*sumI; free(p); free(p1); free(p2); free(x); free(xdummy); free(xsub); free(b); /* AIC, 4*N parms */ //return 2*4*N+npix*Nf*log(sumI)+penalty; return 2*4*N+Nf*(mean_err-2*3*N)+log(sumI)*npix*Nf+penalty; }
/* Ref: Weiss, Algorithm 11 CGS * INPUT * n : dimension of the problem * b [n] : r-h-s vector * atimes (int n, static double *x, double *b, void *param) : * calc matrix-vector product A.x = b. * atimes_param : parameters for atimes(). * it : struct iter. following entries are used * it->max = kend : max of iteration * it->eps = eps : criteria for |r^2|/|b^2| * OUTPUT * returned value : 0 == success, otherwise (-1) == failed * x [n] : solution * it->niter : # of iteration * it->res2 : |r^2| / |b^2| */ int cgs (int n, const double *b, double *x, void (*atimes) (int, const double *, double *, void *), void *atimes_param, struct iter *it) { #ifndef HAVE_CBLAS_H # ifdef HAVE_BLAS_H /* use Fortran BLAS routines */ int i_1 = 1; double d_m1 = -1.0; double d_2 = 2.0; # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H int ret = -1; double eps2 = it->eps * it->eps; int itmax = it->max; double *r = (double *)malloc (sizeof (double) * n); double *r0 = (double *)malloc (sizeof (double) * n); double *p = (double *)malloc (sizeof (double) * n); double *u = (double *)malloc (sizeof (double) * n); double *ap = (double *)malloc (sizeof (double) * n); double *q = (double *)malloc (sizeof (double) * n); double *t = (double *)malloc (sizeof (double) * n); CHECK_MALLOC (r, "cgs"); CHECK_MALLOC (r0, "cgs"); CHECK_MALLOC (p, "cgs"); CHECK_MALLOC (u, "cgs"); CHECK_MALLOC (ap, "cgs"); CHECK_MALLOC (q, "cgs"); CHECK_MALLOC (t, "cgs"); double r0ap; double rho, rho1; double delta; double beta; double res2 = 0.0; #ifdef HAVE_CBLAS_H /** * ATLAS version */ double b2 = cblas_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; // initial residue atimes (n, x, r, atimes_param); // r = A.x cblas_daxpy (n, -1.0, b, 1, r, 1); // r = A.x - b cblas_dcopy (n, r, 1, r0, 1); // r0* = r cblas_dcopy (n, r, 1, p, 1); // p = r cblas_dcopy (n, r, 1, u, 1); // u = r rho = cblas_ddot (n, r0, 1, r, 1); // rho = (r0*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p r0ap = cblas_ddot (n, r0, 1, ap, 1); // r0ap = (r0*, A.p) delta = - rho / r0ap; cblas_dcopy (n, u, 1, q, 1); // q = u cblas_dscal (n, 2.0, q, 1); // q = 2 u cblas_daxpy (n, delta, ap, 1, q, 1); // q = 2 u + delta A.p atimes (n, q, t, atimes_param); // t = A.q cblas_daxpy (n, delta, t, 1, r, 1); // r = r + delta t cblas_daxpy (n, delta, q, 1, x, 1); // x = x + delta q res2 = cblas_ddot (n, r, 1, r, 1); if (it->debug == 2) { fprintf (it->out, "libiter-cgs %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = cblas_ddot (n, r0, 1, r, 1); // rho = (r0*, r) beta = rho1 / rho; rho = rho1; // here t is not used so that this is used for working area. double *qu = t; cblas_dcopy (n, q, 1, qu, 1); // qu = q cblas_daxpy (n, -1.0, u, 1, qu, 1); // qu = q - u cblas_dcopy (n, r, 1, u, 1); // u = r cblas_daxpy (n, beta, qu, 1, u, 1); // u = r + beta (q - u) cblas_daxpy (n, beta, p, 1, qu, 1); // qu = q - u + beta * p cblas_dcopy (n, u, 1, p, 1); // p = u cblas_daxpy (n, beta, qu, 1, p, 1); // p = u + beta (q - u + b * p) } #else // !HAVE_CBLAS_H # ifdef HAVE_BLAS_H /** * BLAS version */ double b2 = ddot_ (&n, b, &i_1, b, &i_1); // (b,b) eps2 *= b2; // initial residue atimes (n, x, r, atimes_param); // r = A.x daxpy_ (&n, &d_m1, b, &i_1, r, &i_1); // r = A.x - b dcopy_ (&n, r, &i_1, r0, &i_1); // r0* = r dcopy_ (&n, r, &i_1, p, &i_1); // p = r dcopy_ (&n, r, &i_1, u, &i_1); // u = r rho = ddot_ (&n, r0, &i_1, r, &i_1); // rho = (r0*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p r0ap = ddot_ (&n, r0, &i_1, ap, &i_1); // r0ap = (r0*, A.p) delta = - rho / r0ap; dcopy_ (&n, u, &i_1, q, &i_1); // q = u dscal_ (&n, &d_2, q, &i_1); // q = 2 u daxpy_ (&n, &delta, ap, &i_1, q, &i_1); // q = 2 u + delta A.p atimes (n, q, t, atimes_param); // t = A.q daxpy_ (&n, &delta, t, &i_1, r, &i_1); // r = r + delta t daxpy_ (&n, &delta, q, &i_1, x, &i_1); // x = x + delta q res2 = ddot_ (&n, r, &i_1, r, &i_1); if (it->debug == 2) { fprintf (it->out, "libiter-cgs %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = ddot_ (&n, r0, &i_1, r, &i_1); // rho = (r0*, r) beta = rho1 / rho; rho = rho1; // here t is not used so that this is used for working area. double *qu = t; dcopy_ (&n, q, &i_1, qu, &i_1); // qu = q daxpy_ (&n, &d_m1, u, &i_1, qu, &i_1); // qu = q - u dcopy_ (&n, r, &i_1, u, &i_1); // u = r daxpy_ (&n, &beta, qu, &i_1, u, &i_1); // u = r + beta (q - u) daxpy_ (&n, &beta, p, &i_1, qu, &i_1); // qu = q - u + beta * p dcopy_ (&n, u, &i_1, p, &i_1); // p = u daxpy_ (&n, &beta, qu, &i_1, p, &i_1); // p = u + beta (q - u + b * p) } # else // !HAVE_BLAS_H /** * local BLAS version */ double b2 = my_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; // initial residue atimes (n, x, r, atimes_param); // r = A.x my_daxpy (n, -1.0, b, 1, r, 1); // r = A.x - b my_dcopy (n, r, 1, r0, 1); // r0* = r my_dcopy (n, r, 1, p, 1); // p = r my_dcopy (n, r, 1, u, 1); // u = r rho = my_ddot (n, r0, 1, r, 1); // rho = (r0*, r) int i; for (i = 0; i < itmax; i ++) { atimes (n, p, ap, atimes_param); // ap = A.p r0ap = my_ddot (n, r0, 1, ap, 1); // r0ap = (r0*, A.p) delta = - rho / r0ap; my_dcopy (n, u, 1, q, 1); // q = u my_dscal (n, 2.0, q, 1); // q = 2 u my_daxpy (n, delta, ap, 1, q, 1); // q = 2 u + delta A.p atimes (n, q, t, atimes_param); // t = A.q my_daxpy (n, delta, t, 1, r, 1); // r = r + delta t my_daxpy (n, delta, q, 1, x, 1); // x = x + delta q res2 = my_ddot (n, r, 1, r, 1); if (it->debug == 2) { fprintf (it->out, "libiter-cgs %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } rho1 = my_ddot (n, r0, 1, r, 1); // rho = (r0*, r) beta = rho1 / rho; rho = rho1; // here t is not used so that this is used for working area. double *qu = t; my_dcopy (n, q, 1, qu, 1); // qu = q my_daxpy (n, -1.0, u, 1, qu, 1); // qu = q - u my_dcopy (n, r, 1, u, 1); // u = r my_daxpy (n, beta, qu, 1, u, 1); // u = r + beta (q - u) my_daxpy (n, beta, p, 1, qu, 1); // qu = q - u + beta * p my_dcopy (n, u, 1, p, 1); // p = u my_daxpy (n, beta, qu, 1, p, 1); // p = u + beta (q - u + b * p) } # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H free (r); free (r0); free (p); free (u); free (ap); free (q); free (t); if (it->debug == 1) { fprintf (it->out, "libiter-cgs it= %d res^2= %e\n", i, res2); } it->niter = i; it->res2 = res2 / b2; return (ret); }
/* Classical CG method -- Weiss' Algorithm 2 * INPUT * n : dimension of the problem * b [n] : r-h-s vector * atimes (int n, static double *x, double *b, void *param) : * calc matrix-vector product A.x = b. * atimes_param : parameters for atimes(). * it : struct iter. following entries are used * it->max = kend : max of iteration * it->eps = eps : criteria for |r^2|/|b^2| * OUTPUT * returned value : 0 == success, otherwise (-1) == failed * x [n] : solution * it->niter : # of iteration * it->res2 : |r^2| / |b^2| */ int cg (int n, const double *b, double *x, void (*atimes) (int, const double *, double *, void *), void *atimes_param, struct iter *it) { #ifndef HAVE_CBLAS_H # ifdef HAVE_BLAS_H /* use Fortran BLAS routines */ int i_1 = 1; double d_1 = 1.0; double d_m1 = -1.0; # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H int ret = -1; double eps2 = it->eps * it->eps; int itmax = it->max; double *p = (double *)malloc (sizeof (double) * n); double *r = (double *)malloc (sizeof (double) * n); double *ap = (double *)malloc (sizeof (double) * n); CHECK_MALLOC (p, "cg"); CHECK_MALLOC (r, "cg"); CHECK_MALLOC (ap, "cg"); double r2; double res2 = 0.0; double pap; double gamma; double beta; int i; #ifdef HAVE_CBLAS_H /** * ATLAS version */ double b2 = cblas_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x cblas_daxpy (n, -1.0, b, 1, r, 1); // r = A.x - b cblas_dcopy (n, r, 1, p, 1); // p = r for (i = 0; i < itmax; i ++) { r2 = cblas_ddot (n, r, 1, r, 1); // r2 = (r, r) atimes (n, p, ap, atimes_param); // ap = A.p pap = cblas_ddot (n, p, 1, ap, 1); // pap = (p, A.p) gamma = - r2 / pap; // gamma = - (r, r) / (p, A.p) cblas_daxpy (n, gamma, p, 1, x, 1); // x += gamma p cblas_daxpy (n, gamma, ap, 1, r, 1); // r += gamma Ap // new norm of r res2 = cblas_ddot (n, r, 1, r, 1); // (r, r) if (it->debug == 2) { fprintf (it->out, "libiter-cg %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } beta = res2 / r2; // beta = (r, r) / (r0, r0) cblas_dscal (n, beta, p, 1); // p *= beta cblas_daxpy (n, 1.0, r, 1, p, 1); // p = r + beta p r2 = res2; } #else // !HAVE_CBLAS_H # ifdef HAVE_BLAS_H /** * BLAS version */ double b2 = ddot_ (&n, b, &i_1, b, &i_1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x daxpy_ (&n, &d_m1, b, &i_1, r, &i_1); // r = A.x - b dcopy_ (&n, r, &i_1, p, &i_1); // p = r for (i = 0; i < itmax; i ++) { r2 = ddot_ (&n, r, &i_1, r, &i_1); // r2 = (r, r) atimes (n, p, ap, atimes_param); // ap = A.p pap = ddot_ (&n, p, &i_1, ap, &i_1); // pap = (p, A.p) gamma = - r2 / pap; // gamma = - (r, r) / (p, A.p) daxpy_ (&n, &gamma, p, &i_1, x, &i_1); // x += gamma p daxpy_ (&n, &gamma, ap, &i_1, r, &i_1); // r += gamma Ap // new norm of r res2 = ddot_ (&n, r, &i_1, r, &i_1); // (r, r) if (it->debug == 2) { fprintf (it->out, "libiter-cg %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } beta = res2 / r2; // beta = (r, r) / (r0, r0) dscal_ (&n, &beta, p, &i_1); // p *= beta daxpy_ (&n, &d_1, r, &i_1, p, &i_1); // p = r + beta p r2 = res2; } # else // !HAVE_BLAS_H /** * local BLAS version */ double b2 = my_ddot (n, b, 1, b, 1); // (b,b) eps2 *= b2; atimes (n, x, r, atimes_param); // r = A.x my_daxpy (n, -1.0, b, 1, r, 1); // r = A.x - b my_dcopy (n, r, 1, p, 1); // p = r for (i = 0; i < itmax; i ++) { r2 = my_ddot (n, r, 1, r, 1); // r2 = (r, r) atimes (n, p, ap, atimes_param); // ap = A.p pap = my_ddot (n, p, 1, ap, 1); // pap = (p, A.p) gamma = - r2 / pap; // gamma = - (r, r) / (p, A.p) my_daxpy (n, gamma, p, 1, x, 1); // x += gamma p my_daxpy (n, gamma, ap, 1, r, 1); // r += gamma Ap // new norm of r res2 = my_ddot (n, r, 1, r, 1); // (r, r) if (it->debug == 2) { fprintf (it->out, "libiter-cg %d %e\n", i, res2 / b2); } if (res2 <= eps2) { ret = 0; // success break; } beta = res2 / r2; // beta = (r, r) / (r0, r0) my_dscal (n, beta, p, 1); // p *= beta my_daxpy (n, 1.0, r, 1, p, 1); // p = r + beta p r2 = res2; } # endif // !HAVE_BLAS_H #endif // !HAVE_CBLAS_H free (p); free (r); free (ap); if (it->debug == 1) { fprintf (it->out, "libiter-cg it= %d res^2= %e\n", i, res2 / b2); } it->niter = i; it->res2 = res2 / b2; return (ret); }