Exemplo n.º 1
0
static inline cmplx cpolar(R r, R t)
{
    if (r == 0.0 && !isnan(t))
        return 0.0;
    else
        return CNUM(r * pnfft_cos(t), r * pnfft_sin(t));
}
Exemplo n.º 2
0
int
mad_cmat_invc_r (const cnum_t y[], num_t x_re, num_t x_im, cnum_t r[], ssz_t m, ssz_t n, num_t rcond)
{ CNUM(x); return mad_cmat_invc(y, x, r, m, n, rcond); }
Exemplo n.º 3
0
int // without complex-by-value version
mad_mat_invc_r (const num_t y[], num_t x_re, num_t x_im, cnum_t r[], ssz_t m, ssz_t n, num_t rcond)
{ CNUM(x); return mad_mat_invc(y, x, r, m, n, rcond); }
Exemplo n.º 4
0
void mad_cmat_fill_r(num_t x_re, num_t x_im, cnum_t r[], ssz_t m, ssz_t n, ssz_t ldr)
{ CHKR; CNUM(x); SET(); }
Exemplo n.º 5
0
cmplx w_of_z(cmplx z)
{
    faddeeva_nofterms = 0;

    // Steven G. Johnson, October 2012.

    if (pnfft_creal(z) == 0.0) {
        // Purely imaginary input, purely real output.
        // However, use creal(z) to give correct sign of 0 in cimag(w).
        return CNUM(erfcx(pnfft_cimag(z)), pnfft_creal(z));
    }
    if (pnfft_cimag(z) == 0) {
        // Purely real input, complex output.
        return CNUM(pnfft_exp(-sqr(pnfft_creal(z))),  im_w_of_x(pnfft_creal(z)));
    }

    const R relerr = DBL_EPSILON;
    const R a = 0.518321480430085929872; // pi / sqrt(-log(eps*0.5))
    const R c = 0.329973702884629072537; // (2/pi) * a;
    const R a2 = 0.268657157075235951582; // a^2

    const R x = pnfft_fabs(pnfft_creal(z));
    const R y = pnfft_cimag(z);
    const R ya = pnfft_fabs(y);

    cmplx ret = 0.; // return value

    R sum1 = 0, sum2 = 0, sum3 = 0, sum4 = 0, sum5 = 0;

    if (ya > 7 || (x > 6  // continued fraction is faster
                   /* As pointed out by M. Zaghloul, the continued
                      fraction seems to give a large relative error in
                      Re w(z) for |x| ~ 6 and small |y|, so use
                      algorithm 816 in this region: */
                   && (ya > 0.1 || (x > 8 && ya > 1e-10) || x > 28))) {
    
        faddeeva_algorithm = 100;

        /* Poppe & Wijers suggest using a number of terms
           nu = 3 + 1442 / (26*rho + 77)
           where rho = sqrt((x/x0)^2 + (y/y0)^2) where x0=6.3, y0=4.4.
           (They only use this expansion for rho >= 1, but rho a little less
           than 1 seems okay too.)
           Instead, I did my own fit to a slightly different function
           that avoids the hypotenuse calculation, using NLopt to minimize
           the sum of the squares of the errors in nu with the constraint
           that the estimated nu be >= minimum nu to attain machine precision.
           I also separate the regions where nu == 2 and nu == 1. */
        const R ispi = 0.56418958354775628694807945156; // 1 / sqrt(pi)
        R xs = y < 0 ? -pnfft_creal(z) : pnfft_creal(z); // compute for -z if y < 0
        if (x + ya > 4000) { // nu <= 2
            if (x + ya > 1e7) { // nu == 1, w(z) = i/sqrt(pi) / z
                // scale to avoid overflow
                if (x > ya) {
                    faddeeva_algorithm += 1;
                    R yax = ya / xs; 
                    faddeeva_algorithm = 100;
                    R denom = ispi / (xs + yax*ya);
                    ret = CNUM(denom*yax, denom);
                }
                else if (isinf(ya)) {
                    faddeeva_algorithm += 2;
                    return ((isnan(x) || y < 0) 
                            ? CNUM(NaN,NaN) : CNUM(0,0));
                }
                else {
                    faddeeva_algorithm += 3;
                    R xya = xs / ya;
                    R denom = ispi / (xya*xs + ya);
                    ret = CNUM(denom, denom*xya);
                }
            }
            else { // nu == 2, w(z) = i/sqrt(pi) * z / (z*z - 0.5)
                faddeeva_algorithm += 4;
                R dr = xs*xs - ya*ya - 0.5, di = 2*xs*ya;
                R denom = ispi / (dr*dr + di*di);
                ret = CNUM(denom * (xs*di-ya*dr), denom * (xs*dr+ya*di));
            }
        }
        else { // compute nu(z) estimate and do general continued fraction
            faddeeva_algorithm += 5;
            const R c0=3.9, c1=11.398, c2=0.08254, c3=0.1421, c4=0.2023; // fit
            R nu = pnfft_floor(c0 + c1 / (c2*x + c3*ya + c4));
            R wr = xs, wi = ya;
            for (nu = 0.5 * (nu - 1); nu > 0.4; nu -= 0.5) {
                // w <- z - nu/w:
                R denom = nu / (wr*wr + wi*wi);
                wr = xs - wr * denom;
                wi = ya + wi * denom;
            }
            { // w(z) = i/sqrt(pi) / w:
                R denom = ispi / (wr*wr + wi*wi);
                ret = CNUM(denom*wi, denom*wr);
            }
        }
        if (y < 0) {
            faddeeva_algorithm += 10;
            // use w(z) = 2.0*exp(-z*z) - w(-z), 
            // but be careful of overflow in exp(-z*z) 
            //                                = exp(-(xs*xs-ya*ya) -2*i*xs*ya) 
            return 2.0*pnfft_cexp(CNUM((ya-xs)*(xs+ya), 2*xs*y)) - ret;
        }
        else
            return ret;
    }

    /* Note: The test that seems to be suggested in the paper is x <
       sqrt(-log(DBL_MIN)), about 26.6, since otherwise exp(-x^2)
       underflows to zero and sum1,sum2,sum4 are zero.  However, long
       before this occurs, the sum1,sum2,sum4 contributions are
       negligible in R precision; I find that this happens for x >
       about 6, for all y.  On the other hand, I find that the case
       where we compute all of the sums is faster (at least with the
       precomputed expa2n2 table) until about x=10.  Furthermore, if we
       try to compute all of the sums for x > 20, I find that we
       sometimes run into numerical problems because underflow/overflow
       problems start to appear in the various coefficients of the sums,
       below.  Therefore, we use x < 10 here. */
    else if (x < 10) {

        faddeeva_algorithm = 200;

        R prod2ax = 1, prodm2ax = 1;
        R expx2;

        if (isnan(y)) {
            faddeeva_algorithm += 99;
            return CNUM(y,y);
        }

        if (x < 5e-4) { // compute sum4 and sum5 together as sum5-sum4
                        // This special case is needed for accuracy.
            faddeeva_algorithm += 1;
            const R x2 = x*x;
            expx2 = 1 - x2 * (1 - 0.5*x2); // exp(-x*x) via Taylor
            // compute exp(2*a*x) and exp(-2*a*x) via Taylor, to R precision
            const R ax2 = 1.036642960860171859744*x; // 2*a*x
            const R exp2ax =
                1 + ax2 * (1 + ax2 * (0.5 + 0.166666666666666666667*ax2));
            const R expm2ax =
                1 - ax2 * (1 - ax2 * (0.5 - 0.166666666666666666667*ax2));
            for (int n = 1; 1; ++n) {
                ++faddeeva_nofterms;
                const R coef = expa2n2[n-1] * expx2 / (a2*(n*n) + y*y);
                prod2ax *= exp2ax;
                prodm2ax *= expm2ax;
                sum1 += coef;
                sum2 += coef * prodm2ax;
                sum3 += coef * prod2ax;
          
                // really = sum5 - sum4
                sum5 += coef * (2*a) * n * sinh_taylor((2*a)*n*x);
          
                // test convergence via sum3
                if (coef * prod2ax < relerr * sum3) break;
            }
        }
        else { // x > 5e-4, compute sum4 and sum5 separately
            faddeeva_algorithm += 2;
            expx2 = pnfft_exp(-x*x);
            const R exp2ax = pnfft_exp((2*a)*x), expm2ax = 1 / exp2ax;
            for (int n = 1; 1; ++n) {
                ++faddeeva_nofterms;
                const R coef = expa2n2[n-1] * expx2 / (a2*(n*n) + y*y);
                prod2ax *= exp2ax;
                prodm2ax *= expm2ax;
                sum1 += coef;
                sum2 += coef * prodm2ax;
                sum4 += (coef * prodm2ax) * (a*n);
                sum3 += coef * prod2ax;
                sum5 += (coef * prod2ax) * (a*n);
                // test convergence via sum5, since this sum has the slowest decay
                if ((coef * prod2ax) * (a*n) < relerr * sum5) break;
            }
        }
        const R expx2erfcxy = // avoid spurious overflow for large negative y
            y > -6 // for y < -6, erfcx(y) = 2*exp(y*y) to R precision
            ? expx2*erfcx(y) : 2*pnfft_exp(y*y-x*x);
        if (y > 5) { // imaginary terms cancel
            faddeeva_algorithm += 10;
            const R sinxy = pnfft_sin(x*y);
            ret = (expx2erfcxy - c*y*sum1) * pnfft_cos(2*x*y)
                + (c*x*expx2) * sinxy * sinc(x*y, sinxy);
        }
        else {
            faddeeva_algorithm += 20;
            R xs = pnfft_creal(z);
            const R sinxy = pnfft_sin(xs*y);
            const R sin2xy = pnfft_sin(2*xs*y), cos2xy = pnfft_cos(2*xs*y);
            const R coef1 = expx2erfcxy - c*y*sum1;
            const R coef2 = c*xs*expx2;
            ret = CNUM(coef1 * cos2xy + coef2 * sinxy * sinc(xs*y, sinxy),
                    coef2 * sinc(2*xs*y, sin2xy) - coef1 * sin2xy);
        }
    }
    else { // x large: only sum3 & sum5 contribute (see above note)    

        faddeeva_algorithm = 300;

        if (isnan(x))
            return CNUM(x,x);
        if (isnan(y))
            return CNUM(y,y);

        ret = pnfft_exp(-x*x); // |y| < 1e-10, so we only need exp(-x*x) term
        // (round instead of ceil as in original paper; note that x/a > 1 here)
        R n0 = pnfft_floor(x/a + 0.5); // sum in both directions, starting at n0
        R dx = a*n0 - x;
        sum3 = pnfft_exp(-dx*dx) / (a2*(n0*n0) + y*y);
        sum5 = a*n0 * sum3;
        R exp1 = pnfft_exp(4*a*dx), exp1dn = 1;
        int dn;
        for (dn = 1; n0 - dn > 0; ++dn) { // loop over n0-dn and n0+dn terms
            R np = n0 + dn, nm = n0 - dn;
            R tp = pnfft_exp(-sqr(a*dn+dx));
            R tm = tp * (exp1dn *= exp1); // trick to get tm from tp
            tp /= (a2*(np*np) + y*y);
            tm /= (a2*(nm*nm) + y*y);
            sum3 += tp + tm;
            sum5 += a * (np * tp + nm * tm);
            if (a * (np * tp + nm * tm) < relerr * sum5) goto finish;
        }
        while (1) { // loop over n0+dn terms only (since n0-dn <= 0)
            R np = n0 + dn++;
            R tp = pnfft_exp(-sqr(a*dn+dx)) / (a2*(np*np) + y*y);
            sum3 += tp;
            sum5 += a * np * tp;
            if (a * np * tp < relerr * sum5) goto finish;
        }
    }
finish:
    return ret + CNUM((0.5*c)*y*(sum2+sum3), 
                   (0.5*c)*copysign(sum5-sum4, pnfft_creal(z)));
} // w_of_z