static inline cmplx
cpolar (R r, R t)
{
  if (r == 0.0 && !isnan(t)) return 0.0;
  else return CNUM(r * pnfft_cos(t), r * pnfft_sin(t));
}
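/* Usage sketch (illustrative, not part of the original source): cpolar maps
   polar coordinates (r, t) to the Cartesian value r*e^{i*t}.  The guard above
   makes cpolar(0, t) return exactly 0 for any non-NaN angle, including
   t = INFINITY where pnfft_cos(t) would yield NaN, while a NaN angle still
   propagates NaN. */
static inline cmplx cpolar_example (void) {
  return cpolar(2.0, 0.52359877559829887); // 2*e^{i*pi/6} ~= 1.7320508 + 1i
}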
int
mad_cmat_invc_r (const cnum_t y[], num_t x_re, num_t x_im, cnum_t r[],
                 ssz_t m, ssz_t n, num_t rcond)
{ CNUM(x); return mad_cmat_invc(y, x, r, m, n, rcond); }
int // without complex-by-value version
mad_mat_invc_r (const num_t y[], num_t x_re, num_t x_im, cnum_t r[],
                ssz_t m, ssz_t n, num_t rcond)
{ CNUM(x); return mad_mat_invc(y, x, r, m, n, rcond); }
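/* Usage sketch (illustrative, not part of the original source): per the
   comment above, the _r wrappers serve callers that cannot pass a complex
   scalar by value; CNUM(x) is assumed to rebuild the complex `x` from the
   split scalars x_re/x_im.  Assuming cnum_t is a C99 _Complex type (so that
   1+2*I from <complex.h> is valid), both calls below should be equivalent
   for a caller-built n x n matrix y: */
static inline int invc_equiv_example (const cnum_t y[], cnum_t r[],
                                      ssz_t n, num_t rcond) {
  int s1 = mad_cmat_invc  (y, 1+2*I, r, n, n, rcond); // complex-by-value
  int s2 = mad_cmat_invc_r(y, 1, 2 , r, n, n, rcond); // split re/im wrapper
  return s1 == s2;
}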
void
mad_cmat_fill_r (num_t x_re, num_t x_im, cnum_t r[], ssz_t m, ssz_t n, ssz_t ldr)
{ CHKR; CNUM(x); SET(); }
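/* Usage sketch (illustrative, not part of the original source): filling a
   dense 3 x 4 complex matrix with the constant 1 - 1i through the split
   re/im entry point.  Here ldr is taken to be the leading dimension (row
   stride) of r; that reading of the parameter is an assumption based on the
   CHKR/SET() macros above. */
static void fill_example (void) {
  enum { M = 3, N = 4 };
  cnum_t r[M*N];
  mad_cmat_fill_r(1.0, -1.0, r, M, N, N); // ldr = N for a contiguous matrix
}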
cmplx w_of_z (cmplx z) // Steven G. Johnson, October 2012
{
  faddeeva_nofterms = 0;

  if (pnfft_creal(z) == 0.0) {
    // Purely imaginary input, purely real output.
    // However, use creal(z) to give correct sign of 0 in cimag(w).
    return CNUM(erfcx(pnfft_cimag(z)), pnfft_creal(z));
  }
  if (pnfft_cimag(z) == 0) {
    // Purely real input, complex output.
    return CNUM(pnfft_exp(-sqr(pnfft_creal(z))), im_w_of_x(pnfft_creal(z)));
  }

  const R relerr = DBL_EPSILON;
  const R a  = 0.518321480430085929872;  // pi / sqrt(-log(eps*0.5))
  const R c  = 0.329973702884629072537;  // (2/pi) * a
  const R a2 = 0.268657157075235951582;  // a^2

  const R x  = pnfft_fabs(pnfft_creal(z));
  const R y  = pnfft_cimag(z);
  const R ya = pnfft_fabs(y);

  cmplx ret = 0.; // return value

  R sum1 = 0, sum2 = 0, sum3 = 0, sum4 = 0, sum5 = 0;

  if (ya > 7 || (x > 6 // continued fraction is faster
                 /* As pointed out by M. Zaghloul, the continued fraction seems
                    to give a large relative error in Re w(z) for |x| ~ 6 and
                    small |y|, so use algorithm 816 in this region: */
                 && (ya > 0.1 || (x > 8 && ya > 1e-10) || x > 28))) {
    faddeeva_algorithm = 100;

    /* Poppe & Wijers suggest using a number of terms
           nu = 3 + 1442 / (26*rho + 77)
       where rho = sqrt((x/x0)^2 + (y/y0)^2) where x0=6.3, y0=4.4.
       (They only use this expansion for rho >= 1, but rho a little less
       than 1 seems okay too.)
       Instead, I did my own fit to a slightly different function
       that avoids the hypotenuse calculation, using NLopt to minimize
       the sum of the squares of the errors in nu with the constraint
       that the estimated nu be >= minimum nu to attain machine precision.
       I also separate the regions where nu == 2 and nu == 1. */
    const R ispi = 0.56418958354775628694807945156; // 1 / sqrt(pi)
    R xs = y < 0 ? -pnfft_creal(z) : pnfft_creal(z); // compute for -z if y < 0
    if (x + ya > 4000) {   // nu <= 2
      if (x + ya > 1e7) {  // nu == 1, w(z) = i/sqrt(pi) / z
        // scale to avoid overflow
        if (x > ya) {
          faddeeva_algorithm += 1;
          R yax = ya / xs;
          R denom = ispi / (xs + yax*ya);
          ret = CNUM(denom*yax, denom);
        }
        else if (isinf(ya)) {
          faddeeva_algorithm += 2;
          return ((isnan(x) || y < 0) ? CNUM(NaN,NaN) : CNUM(0,0));
        }
        else {
          faddeeva_algorithm += 3;
          R xya = xs / ya;
          R denom = ispi / (xya*xs + ya);
          ret = CNUM(denom, denom*xya);
        }
      }
      else { // nu == 2, w(z) = i/sqrt(pi) * z / (z*z - 0.5)
        faddeeva_algorithm += 4;
        R dr = xs*xs - ya*ya - 0.5, di = 2*xs*ya;
        R denom = ispi / (dr*dr + di*di);
        ret = CNUM(denom * (xs*di-ya*dr), denom * (xs*dr+ya*di));
      }
    }
    else { // compute nu(z) estimate and do general continued fraction
      faddeeva_algorithm += 5;
      const R c0=3.9, c1=11.398, c2=0.08254, c3=0.1421, c4=0.2023; // fit
      R nu = pnfft_floor(c0 + c1 / (c2*x + c3*ya + c4));
      R wr = xs, wi = ya;
      for (nu = 0.5 * (nu - 1); nu > 0.4; nu -= 0.5) {
        // w <- z - nu/w:
        R denom = nu / (wr*wr + wi*wi);
        wr = xs - wr * denom;
        wi = ya + wi * denom;
      }
      { // w(z) = i/sqrt(pi) / w:
        R denom = ispi / (wr*wr + wi*wi);
        ret = CNUM(denom*wi, denom*wr);
      }
    }
    if (y < 0) {
      faddeeva_algorithm += 10;
      // use w(z) = 2.0*exp(-z*z) - w(-z),
      // but be careful of overflow in exp(-z*z)
      //                                = exp(-(xs*xs-ya*ya) -2*i*xs*ya)
      return 2.0*pnfft_cexp(CNUM((ya-xs)*(xs+ya), 2*xs*y)) - ret;
    }
    else
      return ret;
  }

  /* Note: The test that seems to be suggested in the paper is x <
     sqrt(-log(DBL_MIN)), about 26.6, since otherwise exp(-x^2)
     underflows to zero and sum1,sum2,sum4 are zero.  However, long
     before this occurs, the sum1,sum2,sum4 contributions are
     negligible in R precision; I find that this happens for x >
     about 6, for all y.  On the other hand, I find that the case
     where we compute all of the sums is faster (at least with the
     precomputed expa2n2 table) until about x=10.  Furthermore, if we
     try to compute all of the sums for x > 20, I find that we
     sometimes run into numerical problems because underflow/overflow
     problems start to appear in the various coefficients of the sums,
     below.  Therefore, we use x < 10 here. */
  else if (x < 10) {
    faddeeva_algorithm = 200;

    R prod2ax = 1, prodm2ax = 1;
    R expx2;

    if (isnan(y)) {
      faddeeva_algorithm += 99;
      return CNUM(y,y);
    }

    if (x < 5e-4) { // compute sum4 and sum5 together as sum5-sum4
      // This special case is needed for accuracy.
      faddeeva_algorithm += 1;
      const R x2 = x*x;
      expx2 = 1 - x2 * (1 - 0.5*x2); // exp(-x*x) via Taylor
      // compute exp(2*a*x) and exp(-2*a*x) via Taylor, to R precision
      const R ax2 = 1.036642960860171859744*x; // 2*a*x
      const R exp2ax =
        1 + ax2 * (1 + ax2 * (0.5 + 0.166666666666666666667*ax2));
      const R expm2ax =
        1 - ax2 * (1 - ax2 * (0.5 - 0.166666666666666666667*ax2));
      for (int n = 1; 1; ++n) {
        ++faddeeva_nofterms;
        const R coef = expa2n2[n-1] * expx2 / (a2*(n*n) + y*y);
        prod2ax  *= exp2ax;
        prodm2ax *= expm2ax;
        sum1 += coef;
        sum2 += coef * prodm2ax;
        sum3 += coef * prod2ax;
        // really = sum5 - sum4
        sum5 += coef * (2*a) * n * sinh_taylor((2*a)*n*x);
        // test convergence via sum3
        if (coef * prod2ax < relerr * sum3) break;
      }
    }
    else { // x > 5e-4, compute sum4 and sum5 separately
      faddeeva_algorithm += 2;
      expx2 = pnfft_exp(-x*x);
      const R exp2ax = pnfft_exp((2*a)*x), expm2ax = 1 / exp2ax;
      for (int n = 1; 1; ++n) {
        ++faddeeva_nofterms;
        const R coef = expa2n2[n-1] * expx2 / (a2*(n*n) + y*y);
        prod2ax  *= exp2ax;
        prodm2ax *= expm2ax;
        sum1 += coef;
        sum2 += coef * prodm2ax;
        sum4 += (coef * prodm2ax) * (a*n);
        sum3 += coef * prod2ax;
        sum5 += (coef * prod2ax) * (a*n);
        // test convergence via sum5, since this sum has the slowest decay
        if ((coef * prod2ax) * (a*n) < relerr * sum5) break;
      }
    }
    const R expx2erfcxy = // avoid spurious overflow for large negative y
      y > -6 // for y < -6, erfcx(y) = 2*exp(y*y) to R precision
      ? expx2*erfcx(y) : 2*pnfft_exp(y*y-x*x);
    if (y > 5) { // imaginary terms cancel
      faddeeva_algorithm += 10;
      const R sinxy = pnfft_sin(x*y);
      ret = (expx2erfcxy - c*y*sum1) * pnfft_cos(2*x*y)
          + (c*x*expx2) * sinxy * sinc(x*y, sinxy);
    }
    else {
      faddeeva_algorithm += 20;
      R xs = pnfft_creal(z);
      const R sinxy  = pnfft_sin(xs*y);
      const R sin2xy = pnfft_sin(2*xs*y), cos2xy = pnfft_cos(2*xs*y);
      const R coef1 = expx2erfcxy - c*y*sum1;
      const R coef2 = c*xs*expx2;
      ret = CNUM(coef1 * cos2xy + coef2 * sinxy * sinc(xs*y, sinxy),
                 coef2 * sinc(2*xs*y, sin2xy) - coef1 * sin2xy);
    }
  }
  else { // x large: only sum3 & sum5 contribute (see above note)
    faddeeva_algorithm = 300;

    if (isnan(x)) return CNUM(x,x);
    if (isnan(y)) return CNUM(y,y);

    ret = pnfft_exp(-x*x); // |y| < 1e-10, so we only need exp(-x*x) term
    // (round instead of ceil as in original paper; note that x/a > 1 here)
    R n0 = pnfft_floor(x/a + 0.5); // sum in both directions, starting at n0
    R dx = a*n0 - x;
    sum3 = pnfft_exp(-dx*dx) / (a2*(n0*n0) + y*y);
    sum5 = a*n0 * sum3;
    R exp1 = pnfft_exp(4*a*dx), exp1dn = 1;
    int dn;
    for (dn = 1; n0 - dn > 0; ++dn) { // loop over n0-dn and n0+dn terms
      R np = n0 + dn, nm = n0 - dn;
      R tp = pnfft_exp(-sqr(a*dn+dx));
      R tm = tp * (exp1dn *= exp1); // trick to get tm from tp
      tp /= (a2*(np*np) + y*y);
      tm /= (a2*(nm*nm) + y*y);
      sum3 += tp + tm;
      sum5 += a * (np * tp + nm * tm);
      if (a * (np * tp + nm * tm) < relerr * sum5)
        goto finish;
    }
    while (1) { // loop over n0+dn terms only (since n0-dn <= 0)
      R np = n0 + dn++;
      R tp = pnfft_exp(-sqr(a*dn+dx)) / (a2*(np*np) + y*y);
      sum3 += tp;
      sum5 += a * np * tp;
      if (a * np * tp < relerr * sum5)
        goto finish;
    }
  }

finish:
  return ret + CNUM((0.5*c)*y*(sum2+sum3),
                    (0.5*c)*copysign(sum5-sum4, pnfft_creal(z)));
} // w_of_z
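/* Sanity-check sketch (illustrative, not part of the original source), using
   only identities already built into w_of_z above: w(0) = erfcx(0) = 1, and
   for purely imaginary z = i*y the result is the real value erfcx(y). */
static int w_of_z_example (void) {
  cmplx w0 = w_of_z(CNUM(0.0, 0.0)); // expect exactly 1 + 0i
  cmplx wi = w_of_z(CNUM(0.0, 1.0)); // expect erfcx(1) ~= 0.42758 + 0i
  return pnfft_creal(w0) == 1.0 && pnfft_cimag(wi) == 0.0;
}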