void
ecc_modp_sqr (const struct ecc_curve *ecc, mp_limb_t *rp,
              const mp_limb_t *ap)
{
  mpn_sqr (rp, ap, ecc->size);
  ecc->reduce (ecc, rp);
}

#include <gmp.h>

/* Feature test: recent GMP/MPIR define mpn_sqr as a macro in gmp.h, so use it
   when available and fall back to mpn_mul_n otherwise. */
int
main (void)
{
  mp_limb_t *x = NULL, *y = NULL;
  long nx = 0;
#ifdef mpn_sqr
  mpn_sqr (y, x, nx);
#else
  mpn_mul_n (y, x, x, nx);
#endif
  return 0;
}

void Sqr(modp& ans, const modp& x, const Zp_Data& ZpD)
{
    if (ZpD.montgomery)
    {
        ZpD.Mont_Mult(ans.x, x.x, x.x);
    }
    else
    {
        // ans.x = (x.x * x.x) % ZpD.pr;
        mp_limb_t aa[2 * MAX_MOD_SZ], q[2 * MAX_MOD_SZ];
        mpn_sqr(aa, x.x, ZpD.t);
        mpn_tdiv_qr(q, ans.x, 0, aa, 2 * ZpD.t, ZpD.prA, ZpD.t);
    }
}

/* Input is {ap,rn}; output is {rp,rn}, computation is
   mod B^rn - 1, and values are semi-normalised; zero is represented
   as either 0 or B^n - 1.
   Needs a scratch of 2rn limbs at tp.
   tp==rp is allowed. */
static void
mpn_bc_sqrmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp)
{
  mp_limb_t cy;

  ASSERT (0 < rn);

  mpn_sqr (tp, ap, rn);
  cy = mpn_add_n (rp, tp, tp + rn, rn);
  /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
   * be no overflow when adding in the carry. */
  MPN_INCR_U (rp, rn, cy);
}

/* Input is {ap,rn+1}; output is {rp,rn+1}, in semi-normalised
   representation, computation is mod B^rn + 1. Needs a scratch area of
   2rn + 2 limbs at tp; tp == rp is allowed.
   Output is normalised. */
static void
mpn_bc_sqrmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp)
{
  mp_limb_t cy;

  ASSERT (0 < rn);

  mpn_sqr (tp, ap, rn + 1);
  ASSERT (tp[2*rn+1] == 0);
  ASSERT (tp[2*rn] < GMP_NUMB_MAX);
  cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn);
  rp[rn] = 0;
  MPN_INCR_U (rp, rn+1, cy);
}

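/* Both helpers rest on simple congruences: B^rn == 1 (mod B^rn - 1), so the
   high half of a double-length square folds onto the low half with an
   addition, while B^rn == -1 (mod B^rn + 1), so it is subtracted instead.
   Below is a standalone illustration of the first fold using only public GMP
   mpn calls; the function name fold_mod_bnm1 and the fixed two-limb size are
   just for this example. */
#include <stdio.h>
#include <gmp.h>

/* Reduce a 2n-limb value {tp,2n} modulo B^n - 1 (B = 2^GMP_NUMB_BITS), in the
   same semi-normalised sense as mpn_bc_sqrmod_bnm1 above: since B^n == 1, the
   result is t_lo + t_hi, with a carry of B^n wrapping back around as +1.
   The final mpn_add_1 cannot carry out again, because t_lo + t_hi is at most
   2*B^n - 2. */
static void
fold_mod_bnm1 (mp_limb_t *rp, const mp_limb_t *tp, mp_size_t n)
{
  mp_limb_t cy = mpn_add_n (rp, tp, tp + n, n);
  mpn_add_1 (rp, rp, n, cy);
}

int
main (void)
{
  mp_limb_t a[2] = { 987654321, 7 };
  mp_limb_t t[4], r[2];

  mpn_sqr (t, a, 2);            /* full 4-limb square */
  fold_mod_bnm1 (r, t, 2);      /* a^2 mod (B^2 - 1) */

  gmp_printf ("%Mx %Mx\n", r[1], r[0]);
  return 0;
}
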
void
_fmpz_poly_sqrlow_KS(fmpz * res, const fmpz * poly, long len, long n)
{
    int neg;
    long bits, limbs, loglen, sign = 0;
    mp_limb_t *arr_in, *arr_out;

    FMPZ_VEC_NORM(poly, len);

    if (len == 0)
    {
        _fmpz_vec_zero(res, n);
        return;
    }

    neg = (fmpz_sgn(poly + len - 1) > 0) ? 0 : -1;

    if (n > 2 * len - 1)
    {
        _fmpz_vec_zero(res + 2 * len - 1, n - (2 * len - 1));
        n = 2 * len - 1;
    }

    bits = _fmpz_vec_max_bits(poly, len);
    if (bits < 0)
    {
        sign = 1;
        bits = - bits;
    }

    loglen = FLINT_BIT_COUNT(len);
    bits = 2 * bits + loglen + sign;
    limbs = (bits * len - 1) / FLINT_BITS + 1;

    arr_in = flint_calloc(limbs, sizeof(mp_limb_t));
    arr_out = flint_malloc((2 * limbs) * sizeof(mp_limb_t));

    _fmpz_poly_bit_pack(arr_in, poly, len, bits, neg);

    mpn_sqr(arr_out, arr_in, limbs);

    if (sign)
        _fmpz_poly_bit_unpack(res, n, arr_out, bits, 0);
    else
        _fmpz_poly_bit_unpack_unsigned(res, n, arr_out, bits);

    flint_free(arr_in);
    flint_free(arr_out);
}

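/* The routine above squares a polynomial by Kronecker substitution: the
   coefficients are packed into one large integer at a power-of-two spacing
   wide enough that the product's coefficients cannot collide, the integer is
   squared with a single mpn_sqr, and the coefficients are unpacked again.
   Here is a toy version of the same idea with machine integers; the fixed
   8-bit packing width is only valid because the example coefficients are
   tiny. */
#include <stdio.h>
#include <stdint.h>

int
main (void)
{
    uint64_t packed = 2 + (3u << 8);      /* p(x) = 2 + 3x evaluated at x = 2^8 */
    uint64_t square = packed * packed;    /* p(2^8)^2 */
    int i;

    /* The base-2^8 digits of the square are the coefficients of p(x)^2,
       i.e. 4 + 12x + 9x^2. */
    for (i = 0; i <= 2; i++)
        printf("coeff of x^%d: %llu\n", i,
               (unsigned long long) ((square >> (8 * i)) & 0xff));
    return 0;
}
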
/* Computes {rp,MIN(rn,2an)} <- {ap,an}^2 Mod(B^rn-1)
 *
 * The result is expected to be ZERO if and only if the operand
 * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
 * B^rn-1.
 * It should not be a problem if sqrmod_bnm1 is used to
 * compute the full square with 2*an <= rn, because this condition
 * implies (B^an-1)^2 < (B^rn-1) .
 *
 * Requires rn/4 < an <= rn
 * Scratch need: rn/2 + (need for recursive call OR rn + 3). This gives
 *
 * S(n) <= rn/2 + MAX (rn + 4, S(n/2)) <= 3/2 rn + 4
 */
void
mpn_sqrmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_ptr tp)
{
  ASSERT (0 < an);
  ASSERT (an <= rn);

  if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, SQRMOD_BNM1_THRESHOLD))
    {
      if (UNLIKELY (an < rn))
        {
          if (UNLIKELY (2*an <= rn))
            {
              mpn_sqr (rp, ap, an);
            }
          else
            {
              mp_limb_t cy;

              mpn_sqr (tp, ap, an);
              cy = mpn_add (rp, tp, rn, tp + rn, 2*an - rn);
              MPN_INCR_U (rp, rn, cy);
            }
        }
      else
        mpn_bc_sqrmod_bnm1 (rp, ap, rn, tp);
    }
  else
    {
      mp_size_t n;
      mp_limb_t cy;
      mp_limb_t hi;

      n = rn >> 1;

      ASSERT (2*an > n);

      /* Compute xm = a^2 mod (B^n - 1), xp = a^2 mod (B^n + 1)
         and crt together as

         x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
      */

#define a0 ap
#define a1 (ap + n)

#define xp  tp  /* 2n + 2 */
      /* am1  maybe in {xp, n} */
#define sp1 (tp + 2*n + 2)
      /* ap1  maybe in {sp1, n + 1} */

      {
        mp_srcptr am1;
        mp_size_t anm;
        mp_ptr so;

        if (LIKELY (an > n))
          {
            so = xp + n;
            am1 = xp;
            cy = mpn_add (xp, a0, n, a1, an - n);
            MPN_INCR_U (xp, n, cy);
            anm = n;
          }
        else
          {
            so = xp;
            am1 = a0;
            anm = an;
          }

        mpn_sqrmod_bnm1 (rp, n, am1, anm, so);
      }

      {
        int k;
        mp_srcptr ap1;
        mp_size_t anp;

        if (LIKELY (an > n))
          {
            ap1 = sp1;
            cy = mpn_sub (sp1, a0, n, a1, an - n);
            sp1[n] = 0;
            MPN_INCR_U (sp1, n + 1, cy);
            anp = n + ap1[n];
          }
        else
          {
            ap1 = a0;
            anp = an;
          }

        if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
          k = 0;
        else
          {
            int mask;
            k = mpn_fft_best_k (n, 1);
            mask = (1<<k) - 1;
            while (n & mask) {k--; mask >>= 1;};
          }
        if (k >= FFT_FIRST_K)
          xp[n] = mpn_mul_fft (xp, n, ap1, anp, ap1, anp, k);
        else if (UNLIKELY (ap1 == a0))
          {
            ASSERT (anp <= n);
            ASSERT (2*anp > n);
            mpn_sqr (xp, a0, an);
            anp = 2*an - n;
            cy = mpn_sub (xp, xp, n, xp + n, anp);
            xp[n] = 0;
            MPN_INCR_U (xp, n+1, cy);
          }
        else
          mpn_bc_sqrmod_bnp1 (xp, ap1, n, xp);
      }

      /* Here the CRT recomposition begins.

         xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
         Division by 2 is a bitwise rotation.

         Assumes xp normalised mod (B^n+1).

         The residue class [0] is represented by [B^n-1]; except when
         both input are ZERO.
      */

#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
#if HAVE_NATIVE_mpn_rsh1add_nc
      cy = mpn_rsh1add_nc (rp, rp, xp, n, xp[n]); /* B^n = 1 */
      hi = cy << (GMP_NUMB_BITS - 1);
      cy = 0;
      /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
         overflows, i.e. a further increment will not overflow again. */
#else /* ! _nc */
      cy = xp[n] + mpn_rsh1add_n (rp, rp, xp, n); /* B^n = 1 */
      hi = (cy << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
         the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
#endif
#if GMP_NAIL_BITS == 0
      add_ssaaaa (cy, rp[n-1], cy, rp[n-1], CNST_LIMB(0), hi);
#else
      cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS - 1);
      rp[n-1] ^= hi;
#endif
#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
#if HAVE_NATIVE_mpn_add_nc
      cy = mpn_add_nc (rp, rp, xp, n, xp[n]);
#else /* ! _nc */
      cy = xp[n] + mpn_add_n (rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
#endif
      cy += (rp[0] & 1);
      mpn_rshift (rp, rp, n, 1);
      ASSERT (cy <= 2);
      hi = (cy << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* We can have cy != 0 only if hi = 0... */
      ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
      rp[n-1] |= hi;
      /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
#endif
      ASSERT (cy <= 1);
      /* Next increment can not overflow, read the previous comments about cy. */
      ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
      MPN_INCR_U (rp, n, cy);

      /* Compute the highest half:
         ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
       */
      if (UNLIKELY (2*an < rn))
        {
          /* Note that in this case, the only way the result can equal
             zero mod B^{rn} - 1 is if the input is zero, and
             then the output of both the recursive calls and this CRT
             reconstruction is zero, not B^{rn} - 1. */
          cy = mpn_sub_n (rp + n, rp, xp, 2*an - n);

          /* FIXME: This subtraction of the high parts is not really
             necessary, we do it to get the carry out, and for sanity
             checking. */
          cy = xp[n] + mpn_sub_nc (xp + 2*an - n, rp + 2*an - n,
                                   xp + 2*an - n, rn - 2*an, cy);
          ASSERT (mpn_zero_p (xp + 2*an - n+1, rn - 1 - 2*an));
          cy = mpn_sub_1 (rp, rp, 2*an, cy);
          ASSERT (cy == (xp + 2*an - n)[0]);
        }
      else
        {
          cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
          /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
             DECR will affect _at most_ the lowest n limbs. */
          MPN_DECR_U (rp, 2*n, cy);
        }

#undef a0
#undef a1
#undef xp
#undef sp1
    }
}

void
mpz_mul (mpz_ptr w, mpz_srcptr u, mpz_srcptr v)
{
  mp_size_t usize;
  mp_size_t vsize;
  mp_size_t wsize;
  mp_size_t sign_product;
  mp_ptr up, vp;
  mp_ptr wp;
  mp_ptr free_me;
  size_t free_me_size;
  mp_limb_t cy_limb;
  TMP_DECL;

  usize = SIZ (u);
  vsize = SIZ (v);
  sign_product = usize ^ vsize;
  usize = ABS (usize);
  vsize = ABS (vsize);

  if (usize < vsize)
    {
      MPZ_SRCPTR_SWAP (u, v);
      MP_SIZE_T_SWAP (usize, vsize);
    }

  if (vsize == 0)
    {
      SIZ (w) = 0;
      return;
    }

#if HAVE_NATIVE_mpn_mul_2
  if (vsize <= 2)
    {
      wp = MPZ_REALLOC (w, usize+vsize);
      if (vsize == 1)
        cy_limb = mpn_mul_1 (wp, PTR (u), usize, PTR (v)[0]);
      else
        {
          cy_limb = mpn_mul_2 (wp, PTR (u), usize, PTR (v));
          usize++;
        }
      wp[usize] = cy_limb;
      usize += (cy_limb != 0);
      SIZ (w) = (sign_product >= 0 ? usize : -usize);
      return;
    }
#else
  if (vsize == 1)
    {
      wp = MPZ_REALLOC (w, usize+1);
      cy_limb = mpn_mul_1 (wp, PTR (u), usize, PTR (v)[0]);
      wp[usize] = cy_limb;
      usize += (cy_limb != 0);
      SIZ (w) = (sign_product >= 0 ? usize : -usize);
      return;
    }
#endif

  TMP_MARK;
  free_me = NULL;
  up = PTR (u);
  vp = PTR (v);
  wp = PTR (w);

  /* Ensure W has space enough to store the result.  */
  wsize = usize + vsize;
  if (ALLOC (w) < wsize)
    {
      if (wp == up || wp == vp)
        {
          free_me = wp;
          free_me_size = ALLOC (w);
        }
      else
        (*__gmp_free_func) (wp, (size_t) ALLOC (w) * GMP_LIMB_BYTES);

      ALLOC (w) = wsize;
      wp = __GMP_ALLOCATE_FUNC_LIMBS (wsize);
      PTR (w) = wp;
    }
  else
    {
      /* Make U and V not overlap with W.  */
      if (wp == up)
        {
          /* W and U are identical.  Allocate temporary space for U.  */
          up = TMP_ALLOC_LIMBS (usize);
          /* Is V identical too?  Keep it identical with U.  */
          if (wp == vp)
            vp = up;
          /* Copy to the temporary space.  */
          MPN_COPY (up, wp, usize);
        }
      else if (wp == vp)
        {
          /* W and V are identical.  Allocate temporary space for V.  */
          vp = TMP_ALLOC_LIMBS (vsize);
          /* Copy to the temporary space.  */
          MPN_COPY (vp, wp, vsize);
        }
    }

  if (up == vp)
    {
      mpn_sqr (wp, up, usize);
      cy_limb = wp[wsize - 1];
    }
  else
    {
      cy_limb = mpn_mul (wp, up, usize, vp, vsize);
    }

  wsize -= cy_limb == 0;

  SIZ (w) = sign_product < 0 ? -wsize : wsize;

  if (free_me != NULL)
    (*__gmp_free_func) (free_me, free_me_size * GMP_LIMB_BYTES);
  TMP_FREE;
}

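/* At the mpz level, squaring is just mpz_mul called with the same operand
   twice: the up == vp check above then routes the work to mpn_sqr.  A minimal
   example using only the documented public API (values are arbitrary): */
#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  mpz_t u, w;

  mpz_init_set_str (u, "123456789012345678901234567890", 10);
  mpz_init (w);

  mpz_mul (w, u, u);            /* same operand twice -> mpn_sqr path */

  gmp_printf ("u^2 = %Zd\n", w);

  mpz_clear (u);
  mpz_clear (w);
  return 0;
}
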
void
mpz_powm_ui (mpz_ptr r, mpz_srcptr b, unsigned long int el, mpz_srcptr m)
{
  mp_ptr xp, tp, qp, mp, bp;
  mp_size_t xn, tn, mn, bn;
  int m_zero_cnt;
  int c;
  mp_limb_t e;
  TMP_DECL;

  mp = PTR(m);
  mn = ABSIZ(m);
  if (mn == 0)
    DIVIDE_BY_ZERO;

  if (el == 0)
    {
      /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0 depending on if
         MOD equals 1.  */
      SIZ(r) = (mn == 1 && mp[0] == 1) ? 0 : 1;
      PTR(r)[0] = 1;
      return;
    }

  TMP_MARK;

  /* Normalize m (i.e. make its most significant bit set) as required by
     division functions below.  */
  count_leading_zeros (m_zero_cnt, mp[mn - 1]);
  m_zero_cnt -= GMP_NAIL_BITS;
  if (m_zero_cnt != 0)
    {
      mp_ptr new_mp = TMP_ALLOC_LIMBS (mn);
      mpn_lshift (new_mp, mp, mn, m_zero_cnt);
      mp = new_mp;
    }

  bn = ABSIZ(b);
  bp = PTR(b);
  if (bn > mn)
    {
      /* Reduce possibly huge base.  Use a function call to reduce, since we
         don't want the quotient allocation to live until function return.  */
      mp_ptr new_bp = TMP_ALLOC_LIMBS (mn);
      reduce (new_bp, bp, bn, mp, mn);
      bp = new_bp;
      bn = mn;
      /* Canonicalize the base, since we are potentially going to multiply with
         it quite a few times.  */
      MPN_NORMALIZE (bp, bn);
    }

  if (bn == 0)
    {
      SIZ(r) = 0;
      TMP_FREE;
      return;
    }

  tp = TMP_ALLOC_LIMBS (2 * mn + 1);
  xp = TMP_ALLOC_LIMBS (mn);
  qp = TMP_ALLOC_LIMBS (mn + 1);

  MPN_COPY (xp, bp, bn);
  xn = bn;

  e = el;
  count_leading_zeros (c, e);
  e = (e << c) << 1;            /* shift the exp bits to the left, lose msb */
  c = BITS_PER_MP_LIMB - 1 - c;

  /* Main loop.

     If m is already normalized (high bit of high limb set), and b is the
     same size, but a bigger value, and e==1, then there's no modular
     reductions done and we can end up with a result out of range at the
     end. */
  if (c == 0)
    {
      if (xn == mn && mpn_cmp (xp, mp, mn) >= 0)
        mpn_sub_n (xp, xp, mp, mn);
      goto finishup;
    }

  while (c != 0)
    {
      mpn_sqr (tp, xp, xn);
      tn = 2 * xn;
      tn -= tp[tn - 1] == 0;
      if (tn < mn)
        {
          MPN_COPY (xp, tp, tn);
          xn = tn;
        }
      else
        {
          mpn_tdiv_qr (qp, xp, 0L, tp, tn, mp, mn);
          xn = mn;
        }

      if ((mp_limb_signed_t) e < 0)
        {
          mpn_mul (tp, xp, xn, bp, bn);
          tn = xn + bn;
          tn -= tp[tn - 1] == 0;
          if (tn < mn)
            {
              MPN_COPY (xp, tp, tn);
              xn = tn;
            }
          else
            {
              mpn_tdiv_qr (qp, xp, 0L, tp, tn, mp, mn);
              xn = mn;
            }
        }
      e <<= 1;
      c--;
    }

 finishup:
  /* We shifted m left m_zero_cnt steps.  Adjust the result by reducing it
     with the original MOD.  */
  if (m_zero_cnt != 0)
    {
      mp_limb_t cy;
      cy = mpn_lshift (tp, xp, xn, m_zero_cnt);
      tp[xn] = cy;
      xn += cy != 0;

      if (xn < mn)
        {
          MPN_COPY (xp, tp, xn);
        }
      else
        {
          mpn_tdiv_qr (qp, xp, 0L, tp, xn, mp, mn);
          xn = mn;
        }
      mpn_rshift (xp, xp, xn, m_zero_cnt);
    }
  MPN_NORMALIZE (xp, xn);

  if ((el & 1) != 0 && SIZ(b) < 0 && xn != 0)
    {
      mp = PTR(m);                      /* want original, unnormalized m */
      mpn_sub (xp, mp, mn, xp, xn);
      xn = mn;
      MPN_NORMALIZE (xp, xn);
    }
  MPZ_REALLOC (r, xn);
  SIZ (r) = xn;
  MPN_COPY (PTR(r), xp, xn);

  TMP_FREE;
}

void
_arb_sin_cos_taylor_rs(mp_ptr ysin, mp_ptr ycos, mp_limb_t * error,
    mp_srcptr x, mp_size_t xn, ulong N, int sinonly, int alternating)
{
    mp_ptr s, t, xpow;
    mp_limb_t new_denom, old_denom, c;
    slong power, k, m;
    int cosorsin;

    TMP_INIT;
    TMP_START;

    if (2 * N >= FACTORIAL_TAB_SIZE - 1)
    {
        flint_printf("_arb_sin_cos_taylor_rs: N too large!\n");
        abort();
    }

    if (N <= 1)
    {
        if (N == 0)
        {
            flint_mpn_zero(ysin, xn);
            if (!sinonly)
                flint_mpn_zero(ycos, xn);
            error[0] = 0;
        }
        else if (N == 1)
        {
            flint_mpn_copyi(ysin, x, xn);
            if (!sinonly)
                flint_mpn_store(ycos, xn, LIMB_ONES);
            error[0] = 1;
        }
    }
    else
    {
        /* Choose m ~= sqrt(num_terms) (m must be even, >= 2) */
        m = 2;
        while (m * m < N)
            m += 2;

        /* todo: merge allocations */
        xpow = TMP_ALLOC_LIMBS((m + 1) * xn);
        s = TMP_ALLOC_LIMBS(xn + 2);
        t = TMP_ALLOC_LIMBS(2 * xn + 2);     /* todo: 1 limb too much? */

        /* higher index ---> */
        /*        | ---xn--- | */
        /* xpow = |  <temp>  | x^m | x^(m-1) | ... | x^2 | x | */

#define XPOW_WRITE(__k) (xpow + (m - (__k)) * xn)
#define XPOW_READ(__k) (xpow + (m - (__k) + 1) * xn)

        mpn_sqr(XPOW_WRITE(1), x, xn);
        mpn_sqr(XPOW_WRITE(2), XPOW_READ(1), xn);

        for (k = 4; k <= m; k += 2)
        {
            mpn_mul_n(XPOW_WRITE(k - 1), XPOW_READ(k / 2), XPOW_READ(k / 2 - 1), xn);
            mpn_sqr(XPOW_WRITE(k), XPOW_READ(k / 2), xn);
        }

        for (cosorsin = sinonly; cosorsin < 2; cosorsin++)
        {
            flint_mpn_zero(s, xn + 1);

            /* todo: skip one nonscalar multiplication (use x^m)
               when starting on x^0 */
            power = (N - 1) % m;

            for (k = N - 1; k >= 0; k--)
            {
                c = factorial_tab_numer[2 * k + cosorsin];
                new_denom = factorial_tab_denom[2 * k + cosorsin];
                old_denom = factorial_tab_denom[2 * k + cosorsin + 2];

                /* change denominators */
                if (new_denom != old_denom && k < N - 1)
                {
                    if (alternating && (k % 2 == 0))
                        s[xn] += old_denom;

                    mpn_divrem_1(s, 0, s, xn + 1, old_denom);

                    if (alternating && (k % 2 == 0))
                        s[xn] -= 1;
                }

                if (power == 0)
                {
                    /* add c * x^0 -- only top limb is affected */
                    if (alternating & k)
                        s[xn] -= c;
                    else
                        s[xn] += c;

                    /* Outer polynomial evaluation: multiply by x^m */
                    if (k != 0)
                    {
                        mpn_mul(t, s, xn + 1, XPOW_READ(m), xn);
                        flint_mpn_copyi(s, t + xn, xn + 1);
                    }

                    power = m - 1;
                }
                else
                {
                    if (alternating & k)
                        s[xn] -= mpn_submul_1(s, XPOW_READ(power), xn, c);
                    else
                        s[xn] += mpn_addmul_1(s, XPOW_READ(power), xn, c);

                    power--;
                }
            }

            /* finally divide by denominator */
            if (cosorsin == 0)
            {
                mpn_divrem_1(t, 0, s, xn + 1, factorial_tab_denom[0]);

                /* perturb down to a number < 1 if necessary. note that this
                   does not invalidate the error bound: 1 - ulp is either
                   1 ulp too small or must be closer to the exact value */
                if (t[xn] == 0)
                    flint_mpn_copyi(ycos, t, xn);
                else
                    flint_mpn_store(ycos, xn, LIMB_ONES);
            }
            else
            {
                mpn_divrem_1(s, 0, s, xn + 1, factorial_tab_denom[0]);
                mpn_mul(t, s, xn + 1, x, xn);
                flint_mpn_copyi(ysin, t + xn, xn);
            }
        }

        /* error bound (ulp) */
        error[0] = 2;
    }

    TMP_END;
}

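/* This routine, like _arb_exp_taylor_rs further down, evaluates the truncated
   series by rectangular splitting: the powers x, x^2, ..., x^m are computed
   once, and the coefficients are then consumed from the top in blocks of m
   terms, with a single full-width multiplication by x^m between blocks.  The
   sketch below shows only that control flow, using plain doubles and a
   hypothetical helper eval_rect; the real code works on mpn limb arrays with
   explicit fixed-point error tracking. */
#include <stdio.h>

static double
eval_rect (const double *c, int N, double x, int m)
{
    double xpow[64], s = 0.0;
    int k, power = (N - 1) % m;      /* where the top term sits in its block */

    xpow[0] = 1.0;                   /* precompute x^0 .. x^m (m < 64 assumed) */
    for (k = 1; k <= m; k++)
        xpow[k] = xpow[k - 1] * x;

    for (k = N - 1; k >= 0; k--)
    {
        s += c[k] * xpow[power];
        if (power == 0 && k != 0)
        {
            s *= xpow[m];            /* step down to the next block of m terms */
            power = m - 1;
        }
        else if (power != 0)
            power--;
    }
    return s;
}

int
main (void)
{
    /* exp(x) truncated to 8 terms, evaluated at x = 0.5 with block size m = 3 */
    double c[8] = { 1, 1, 1.0 / 2, 1.0 / 6, 1.0 / 24, 1.0 / 120, 1.0 / 720, 1.0 / 5040 };
    printf("%.12f\n", eval_rect(c, 8, 0.5, 3));
    return 0;
}
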
mp_size_t
mpn_pow_1 (mp_ptr rp, mp_srcptr bp, mp_size_t bn, mp_limb_t exp, mp_ptr tp)
{
  mp_limb_t x;
  int cnt, i;
  mp_size_t rn;
  int par;

  ASSERT (bn >= 1);
  /* FIXME: Add operand overlap criteria */

  if (exp <= 1)
    {
      if (exp == 0)
        {
          rp[0] = 1;
          return 1;
        }
      else
        {
          MPN_COPY (rp, bp, bn);
          return bn;
        }
    }

  /* Count number of bits in exp, and compute where to put initial square in
     order to magically get results in the entry rp.  Use simple code,
     optimized for small exp.  For large exp, the bignum operations will take
     so much time that the slowness of this code will be negligible.  */
  par = 0;
  cnt = GMP_LIMB_BITS;
  for (x = exp; x != 0; x >>= 1)
    {
      par ^= x & 1;
      cnt--;
    }
  exp <<= cnt;

  if (bn == 1)
    {
      mp_limb_t bl = bp[0];

      if ((cnt & 1) != 0)
        MP_PTR_SWAP (rp, tp);

      mpn_sqr (rp, bp, bn);
      rn = 2 * bn; rn -= rp[rn - 1] == 0;

      for (i = GMP_LIMB_BITS - cnt - 1;;)
        {
          exp <<= 1;
          if ((exp & GMP_LIMB_HIGHBIT) != 0)
            {
              rp[rn] = mpn_mul_1 (rp, rp, rn, bl);
              rn += rp[rn] != 0;
            }

          if (--i == 0)
            break;

          mpn_sqr (tp, rp, rn);
          rn = 2 * rn; rn -= tp[rn - 1] == 0;
          MP_PTR_SWAP (rp, tp);
        }
    }
  else
    {
      if (((par ^ cnt) & 1) == 0)
        MP_PTR_SWAP (rp, tp);

      mpn_sqr (rp, bp, bn);
      rn = 2 * bn; rn -= rp[rn - 1] == 0;

      for (i = GMP_LIMB_BITS - cnt - 1;;)
        {
          exp <<= 1;
          if ((exp & GMP_LIMB_HIGHBIT) != 0)
            {
              rn = rn + bn - (mpn_mul (tp, rp, rn, bp, bn) == 0);
              MP_PTR_SWAP (rp, tp);
            }

          if (--i == 0)
            break;

          mpn_sqr (tp, rp, rn);
          rn = 2 * rn; rn -= tp[rn - 1] == 0;
          MP_PTR_SWAP (rp, tp);
        }
    }

  return rn;
}

mp_limb_t
mpn_mul (mp_ptr prodp,
         mp_srcptr up, mp_size_t un,
         mp_srcptr vp, mp_size_t vn)
{
  mp_size_t l, k;
  mp_limb_t c;

  ASSERT (un >= vn);
  ASSERT (vn >= 1);
  ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
  ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));

  if (un == vn)
    {
      if (up == vp)
        {
          mpn_sqr (prodp, up, un);
          return prodp[2 * un - 1];
        }
      else
        {
          mpn_mul_n (prodp, up, vp, un);
          return prodp[2 * un - 1];
        }
    }

  if (vn < MUL_KARATSUBA_THRESHOLD)
    { /* plain schoolbook multiplication */
      if (un <= MUL_BASECASE_MAX_UN)
        mpn_mul_basecase (prodp, up, un, vp, vn);
      else
        {
          /* We have un >> MUL_BASECASE_MAX_UN > vn.  For better memory
             locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
             these pieces with the vp[] operand.  After each such partial
             multiplication (but the last) we copy the most significant vn
             limbs into a temporary buffer since that part would otherwise be
             overwritten by the next multiplication.  After the next
             multiplication, we add it back.  This illustrates the situation:

                   -->vn<--
                     |  |<------- un ------->|
                        _____________________|
                       X                    /|
                     /XX__________________/  |
                   _____________________     |
                  X                    /     |
                /XX__________________/       |
              _____________________          |
             /                    /          |
            /____________________/           |
            ==================================================================

             The parts marked with X are the parts whose sums are copied into
             the temporary buffer.  */

          mp_limb_t tp[MUL_KARATSUBA_THRESHOLD_LIMIT];
          mp_limb_t cy;
          ASSERT (MUL_KARATSUBA_THRESHOLD <= MUL_KARATSUBA_THRESHOLD_LIMIT);

          mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
          prodp += MUL_BASECASE_MAX_UN;
          MPN_COPY (tp, prodp, vn);             /* preserve high triangle */
          up += MUL_BASECASE_MAX_UN;
          un -= MUL_BASECASE_MAX_UN;
          while (un > MUL_BASECASE_MAX_UN)
            {
              mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
              cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
              mpn_incr_u (prodp + vn, cy);          /* safe? */
              prodp += MUL_BASECASE_MAX_UN;
              MPN_COPY (tp, prodp, vn);             /* preserve high triangle */
              up += MUL_BASECASE_MAX_UN;
              un -= MUL_BASECASE_MAX_UN;
            }
          if (un > vn)
            {
              mpn_mul_basecase (prodp, up, un, vp, vn);
            }
          else
            {
              ASSERT_ALWAYS (un > 0);
              mpn_mul_basecase (prodp, vp, vn, up, un);
            }
          cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
          mpn_incr_u (prodp + vn, cy);          /* safe? */
        }
      return prodp[un + vn - 1];
    }

  if (ABOVE_THRESHOLD (un + vn, 2*MUL_FFT_FULL_THRESHOLD)
      && ABOVE_THRESHOLD (3*vn, MUL_FFT_FULL_THRESHOLD))
    {
      mpn_mul_fft_main (prodp, up, un, vp, vn);
      return prodp[un + vn - 1];
    }

  k = (un + 3)/4; // ceil(un/4)

#if GMP_NUMB_BITS == 32
  if ((ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM8H_THRESHOLD)) && (vn>=86) && (5*un <= 11*vn))
#else
  if ((ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM8H_THRESHOLD)) && (vn>=86) && (4*un <= 13*vn))
#endif
    {
      mpn_toom8h_mul(prodp, up, un, vp, vn);
      return prodp[un + vn - 1];
    }

  if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM4_THRESHOLD))
    {
      if (vn > 3*k)
        {
          mpn_toom4_mul(prodp, up, un, vp, vn);
          return prodp[un + vn - 1];
        }
      else
        {
          l = (un + 4)/5; // ceil(un/5)
          if ((((vn > 9*k/4) && (un+vn <= 6*MUL_TOOM4_THRESHOLD))
               || ((vn > 2*l) && (un+vn > 6*MUL_TOOM4_THRESHOLD)))
              && (vn <= 3*l))
            {
              mpn_toom53_mul(prodp, up, un, vp, vn);
              return prodp[un + vn - 1];
            }
        }
    }

  if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM3_THRESHOLD) && (vn > k))
    {
      mp_ptr ws;
      TMP_DECL;
      TMP_MARK;

      if (vn < 2*k) // un/2 >= vn > un/4
        {
          ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE(un));
          mpn_toom42_mul(prodp, up, un, vp, vn, ws);
          TMP_FREE;
          return prodp[un + vn - 1];
        }

      l = (un+2)/3; //ceil(u/3)
      if (vn > 2*l) // un >= vn > 2un/3
        {
          ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE(un));
          mpn_toom3_mul(prodp, up, un, vp, vn, ws);
          TMP_FREE;
          return prodp[un + vn - 1];
        }
      else // 2un/3 >= vn > un/3
        {
          ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE(un));
          mpn_toom32_mul(prodp, up, un, vp, vn, ws);
          TMP_FREE;
          return prodp[un + vn - 1];
        }
    }

  mpn_mul_n (prodp, up, vp, vn);

  if (un != vn)
    {
      mp_limb_t t;
      mp_ptr ws;
      TMP_DECL;
      TMP_MARK;

      prodp += vn;
      l = vn;
      up += vn;
      un -= vn;

      if (un < vn)
        {
          /* Swap u's and v's. */
          MPN_SRCPTR_SWAP (up,un, vp,vn);
        }

      ws = TMP_ALLOC_LIMBS ((vn >= MUL_KARATSUBA_THRESHOLD ? vn : un) + vn);

      t = 0;
      while (vn >= MUL_KARATSUBA_THRESHOLD)
        {
          mpn_mul_n (ws, up, vp, vn);
          if (l <= 2*vn)
            {
              t += mpn_add_n (prodp, prodp, ws, l);
              if (l != 2*vn)
                {
                  t = mpn_add_1 (prodp + l, ws + l, 2*vn - l, t);
                  l = 2*vn;
                }
            }
          else
            {
              c = mpn_add_n (prodp, prodp, ws, 2*vn);
              t += mpn_add_1 (prodp + 2*vn, prodp + 2*vn, l - 2*vn, c);
            }
          prodp += vn;
          l -= vn;
          up += vn;
          un -= vn;
          if (un < vn)
            {
              /* Swap u's and v's. */
              MPN_SRCPTR_SWAP (up,un, vp,vn);
            }
        }

      if (vn != 0)
        {
          mpn_mul_basecase (ws, up, un, vp, vn);
          if (l <= un + vn)
            {
              t += mpn_add_n (prodp, prodp, ws, l);
              if (l != un + vn)
                t = mpn_add_1 (prodp + l, ws + l, un + vn - l, t);
            }
          else
            {
              c = mpn_add_n (prodp, prodp, ws, un + vn);
              t += mpn_add_1 (prodp + un + vn, prodp + un + vn, l - un - vn, c);
            }
        }

      TMP_FREE;
    }

  return prodp[un + vn - 1];
}

mp_size_t
mpn_remove_power_ascending(mp_ptr x, mp_size_t xsize,
    mp_ptr p, mp_size_t psize, ulong *exp)
{
    int i, maxi;
    mp_ptr div;
    mp_ptr rem;
    mp_ptr square[FLINT_BITS];
    mp_size_t square_size[FLINT_BITS];
    mp_size_t sqsize;

    *exp = 0;

    if (psize > xsize)
        return xsize;

    maxi = 0;
    square[0] = p;
    square_size[0] = psize;

    /* Most likely less memory will be needed, but this way we
       avoid reallocations */
    div = flint_malloc(sizeof(mp_limb_t) * xsize);
    rem = flint_malloc(sizeof(mp_limb_t) * xsize);

    /* Remove ascending powers */
    for (i = 0; i < FLINT_BITS && xsize >= square_size[i]; i++)
    {
        mpn_tdiv_qr(div, rem, 0, x, xsize, square[i], square_size[i]);
        if (!mpn_zero_p(rem, square_size[i]))
        {
            i -= 1;
            break;
        }

        *exp += (1 << i);
        xsize = xsize - square_size[i] + 1;
        if (div[xsize-1] == 0)
            xsize--;
        mpn_copyi(x, div, xsize);

        /* Form next square if needed */
        sqsize = square_size[i] * 2;
        if (sqsize - 1 > xsize)
            break;
        maxi = i + 1;
        square[i + 1] = flint_malloc(sizeof(mp_limb_t) * sqsize);
        mpn_sqr(square[i + 1], square[i], square_size[i]);
        if (square[i + 1][sqsize - 1] == 0)
            sqsize -= 1;
        square_size[i + 1] = sqsize;
    }

    /* Remove descending powers */
    for ( ; i >= 0; i--)
    {
        if (xsize >= square_size[i])
        {
            mpn_tdiv_qr(div, rem, 0, x, xsize, square[i], square_size[i]);
            if (mpn_zero_p(rem, square_size[i]))
            {
                *exp += (1 << i);
                xsize = xsize - square_size[i] + 1;
                if (div[xsize-1] == 0)
                    xsize--;
                mpn_copyi(x, div, xsize);
            }
        }
    }

    for (i = 1; i <= maxi; i++)
        flint_free(square[i]);

    flint_free(div);
    flint_free(rem);

    return xsize;
}

mp_bitcnt_t
mpn_remove (mp_ptr wp, mp_size_t *wn,
            mp_ptr up, mp_size_t un, mp_ptr vp, mp_size_t vn,
            mp_bitcnt_t cap)
{
  mp_ptr    pwpsp[LOG];
  mp_size_t pwpsn[LOG];
  mp_size_t npowers;
  mp_ptr tp, qp, np, pp, qp2;
  mp_size_t pn, nn, qn, i;
  mp_bitcnt_t pwr;
  TMP_DECL;

  ASSERT (un > 0);
  ASSERT (vn > 0);
  ASSERT (vp[0] % 2 != 0);      /* 2-adic division wants odd numbers */
  ASSERT (vn > 1 || vp[0] > 1); /* else we would loop indefinitely */

  TMP_MARK;

  tp = TMP_ALLOC_LIMBS ((un + 1 + vn) / 2); /* remainder */
  qp = TMP_ALLOC_LIMBS (un + 1);        /* quotient, alternating */
  qp2 = TMP_ALLOC_LIMBS (un + 1);       /* quotient, alternating */
  np = TMP_ALLOC_LIMBS (un + LOG);      /* powers of V */
  pp = vp;
  pn = vn;

  MPN_COPY (qp, up, un);
  qn = un;

  npowers = 0;
  while (qn >= pn)
    {
      qp[qn] = 0;
      mpn_bdiv_qr_wrap (qp2, tp, qp, qn + 1, pp, pn);
      if (!mpn_zero_p (tp, pn))
        break;                  /* could not divide by V^npowers */

      MP_PTR_SWAP (qp, qp2);
      qn = qn - pn;
      qn += qp[qn] != 0;

      pwpsp[npowers] = pp;
      pwpsn[npowers] = pn;
      npowers++;

      if (((mp_bitcnt_t) 2 << npowers) - 1 > cap)
        break;

      nn = 2 * pn - 1;          /* next power will be at least this large */
      if (nn > qn)
        break;                  /* next power would be overlarge */

      mpn_sqr (np, pp, pn);
      nn += np[nn] != 0;
      pp = np;
      pn = nn;
      np += nn;
    }

  pwr = ((mp_bitcnt_t) 1 << npowers) - 1;

  for (i = npowers - 1; i >= 0; i--)
    {
      pp = pwpsp[i];
      pn = pwpsn[i];
      if (qn < pn)
        continue;

      if (pwr + ((mp_bitcnt_t) 1 << i) > cap)
        continue;               /* V^i would bring us past cap */

      qp[qn] = 0;
      mpn_bdiv_qr_wrap (qp2, tp, qp, qn + 1, pp, pn);
      if (!mpn_zero_p (tp, pn))
        continue;               /* could not divide by V^i */

      MP_PTR_SWAP (qp, qp2);
      qn = qn - pn;
      qn += qp[qn] != 0;

      pwr += (mp_bitcnt_t) 1 << i;
    }

  MPN_COPY (wp, qp, qn);
  *wn = qn;

  TMP_FREE;

  return pwr;
}

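/* mpn_remove strips factors at the limb level; the documented mpz-level
   counterpart is mpz_remove, which removes every occurrence of a factor from
   an operand and returns the multiplicity.  A minimal public-API usage
   example: */
#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  mpz_t n, reduced, f;
  mp_bitcnt_t e;

  mpz_init_set_ui (n, 2016);        /* 2016 = 2^5 * 63 */
  mpz_init_set_ui (f, 2);
  mpz_init (reduced);

  e = mpz_remove (reduced, n, f);   /* strip all factors of 2, count them */

  gmp_printf ("%Zd = 2^%lu * %Zd\n", n, (unsigned long) e, reduced);

  mpz_clear (n);
  mpz_clear (reduced);
  mpz_clear (f);
  return 0;
}
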
void
mpz_powm (mpz_ptr r, mpz_srcptr b, mpz_srcptr e, mpz_srcptr m)
{
  mp_ptr xp, tp, qp, gp, this_gp;
  mp_srcptr bp, ep, mp;
  mp_size_t bn, es, en, mn, xn;
  mp_limb_t invm, c;
  unsigned long int enb;
  mp_size_t i, K, j, l, k;
  int m_zero_cnt, e_zero_cnt;
  int sh;
  int use_redc;
#if HANDLE_NEGATIVE_EXPONENT
  mpz_t new_b;
#endif
#if REDUCE_EXPONENT
  mpz_t new_e;
#endif
  TMP_DECL;

  mp = PTR(m);
  mn = ABSIZ (m);
  if (mn == 0)
    DIVIDE_BY_ZERO;

  TMP_MARK;

  es = SIZ (e);
  if (es <= 0)
    {
      if (es == 0)
        {
          /* Exponent is zero, result is 1 mod m, i.e., 1 or 0 depending on if
             m equals 1.  */
          SIZ(r) = (mn == 1 && mp[0] == 1) ? 0 : 1;
          PTR(r)[0] = 1;
          TMP_FREE;     /* we haven't really allocated anything here */
          return;
        }
#if HANDLE_NEGATIVE_EXPONENT
      MPZ_TMP_INIT (new_b, mn + 1);
      if (! mpz_invert (new_b, b, m))
        DIVIDE_BY_ZERO;
      b = new_b;
      es = -es;
#else
      DIVIDE_BY_ZERO;
#endif
    }
  en = es;

#if REDUCE_EXPONENT
  /* Reduce exponent by dividing it by phi(m) when m small.  */
  if (mn == 1 && mp[0] < 0x7fffffffL && en * GMP_NUMB_BITS > 150)
    {
      MPZ_TMP_INIT (new_e, 2);
      mpz_mod_ui (new_e, e, phi (mp[0]));
      e = new_e;
    }
#endif

  use_redc = mn < POWM_THRESHOLD && mp[0] % 2 != 0;
  if (use_redc)
    {
      /* invm = -1/m mod 2^BITS_PER_MP_LIMB, must have m odd */
      modlimb_invert (invm, mp[0]);
      invm = -invm;
    }
  else
    {
      /* Normalize m (i.e. make its most significant bit set) as required by
         division functions below.  */
      count_leading_zeros (m_zero_cnt, mp[mn - 1]);
      m_zero_cnt -= GMP_NAIL_BITS;
      if (m_zero_cnt != 0)
        {
          mp_ptr new_mp;
          new_mp = TMP_ALLOC_LIMBS (mn);
          mpn_lshift (new_mp, mp, mn, m_zero_cnt);
          mp = new_mp;
        }
    }

  /* Determine optimal value of k, the number of exponent bits we look at
     at a time.  */
  count_leading_zeros (e_zero_cnt, PTR(e)[en - 1]);
  e_zero_cnt -= GMP_NAIL_BITS;
  enb = en * GMP_NUMB_BITS - e_zero_cnt; /* number of bits of exponent */
  k = 1;
  K = 2;
  while (2 * enb > K * (2 + k * (3 + k)))
    {
      k++;
      K *= 2;
      if (k == 10)                      /* cap allocation */
        break;
    }

  tp = TMP_ALLOC_LIMBS (2 * mn);
  qp = TMP_ALLOC_LIMBS (mn + 1);

  gp = __GMP_ALLOCATE_FUNC_LIMBS (K / 2 * mn);

  /* Compute x*R^n where R=2^BITS_PER_MP_LIMB.  */
  bn = ABSIZ (b);
  bp = PTR(b);
  /* Handle |b| >= m by computing b mod m.  FIXME: It is not strictly necessary
     for speed or correctness to do this when b and m have the same number of
     limbs, perhaps remove mpn_cmp call.  */
  if (bn > mn || (bn == mn && mpn_cmp (bp, mp, mn) >= 0))
    {
      /* Reduce possibly huge base while moving it to gp[0].  Use a function
         call to reduce, since we don't want the quotient allocation to
         live until function return.  */
      if (use_redc)
        {
          reduce (tp + mn, bp, bn, mp, mn);     /* b mod m */
          MPN_ZERO (tp, mn);
          mpn_tdiv_qr (qp, gp, 0L, tp, 2 * mn, mp, mn); /* unnormalized! */
        }
      else
        {
          reduce (gp, bp, bn, mp, mn);
        }
    }
  else
    {
      /* |b| < m.  We pad out operands to become mn limbs, which simplifies
         the rest of the function, but slows things down when |b| << m.  */
      if (use_redc)
        {
          MPN_ZERO (tp, mn);
          MPN_COPY (tp + mn, bp, bn);
          MPN_ZERO (tp + mn + bn, mn - bn);
          mpn_tdiv_qr (qp, gp, 0L, tp, 2 * mn, mp, mn);
        }
      else
        {
          MPN_COPY (gp, bp, bn);
          MPN_ZERO (gp + bn, mn - bn);
        }
    }

  /* Compute xx^i for odd g < 2^i.  */

  xp = TMP_ALLOC_LIMBS (mn);
  mpn_sqr (tp, gp, mn);
  if (use_redc)
    mpn_redc_1 (xp, tp, mp, mn, invm);          /* xx = x^2*R^n */
  else
    mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn);
  this_gp = gp;
  for (i = 1; i < K / 2; i++)
    {
      mpn_mul_n (tp, this_gp, xp, mn);
      this_gp += mn;
      if (use_redc)
        mpn_redc_1 (this_gp, tp, mp, mn, invm); /* g[i] = x^(2i+1)*R^n */
      else
        mpn_tdiv_qr (qp, this_gp, 0L, tp, 2 * mn, mp, mn);
    }

  /* Start the real stuff.  */
  ep = PTR (e);
  i = en - 1;                           /* current index */
  c = ep[i];                            /* current limb */
  sh = GMP_NUMB_BITS - e_zero_cnt;      /* significant bits in ep[i] */
  sh -= k;                              /* index of lower bit of ep[i] to take into account */
  if (sh < 0)
    {                                   /* k-sh extra bits are needed */
      if (i > 0)
        {
          i--;
          c <<= (-sh);
          sh += GMP_NUMB_BITS;
          c |= ep[i] >> sh;
        }
    }

/* Computes a^{1/k - 1} (mod B^n). Both a and k must be odd.

   Iterates

     r' <-- r - r * (a^{k-1} r^k - 1) / k

   If a^{k-1} r^k = 1 (mod 2^m), then a^{k-1} r'^k = 1 (mod 2^{2m}).

   Compute the update term as

     r' = r - (a^{k-1} r^{k+1} - r) / k

   where we still have cancellation of low limbs.
*/
void
mpn_broot_invm1 (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t k)
{
  mp_size_t sizes[GMP_LIMB_BITS * 2];
  mp_ptr akm1, tp, rnp, ep;
  mp_limb_t a0, r0, km1, kp1h, kinv;
  mp_size_t rn;
  unsigned i;

  TMP_DECL;

  ASSERT (n > 0);
  ASSERT (ap[0] & 1);
  ASSERT (k & 1);
  ASSERT (k >= 3);

  TMP_MARK;

  akm1 = TMP_ALLOC_LIMBS (4*n);
  tp = akm1 + n;

  km1 = k-1;
  /* FIXME: Could arrange the iteration so we don't need to compute
     this up front, computing a^{k-1} * r^k as (a r)^{k-1} * r. Note
     that we can use wraparound also for a*r, since the low half is
     unchanged from the previous iteration. Or possibly mulmid. Also,
     a r = a^{1/k}, so we get that value too, for free? */
  mpn_powlo (akm1, ap, &km1, 1, n, tp); /* 3 n scratch space */

  a0 = ap[0];
  binvert_limb (kinv, k);

  /* 4 bits: a^{1/k - 1} (mod 16):

        a % 8
        1 3 5 7
   k%4 +-------
     1 |1 1 1 1
     3 |1 9 9 1
  */
  r0 = 1 + (((k << 2) & ((a0 << 1) ^ (a0 << 2))) & 8);
  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7f));   /* 8 bits */
  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7fff)); /* 16 bits */
  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k));          /* 32 bits */
#if GMP_NUMB_BITS > 32
  {
    unsigned prec = 32;
    do
      {
        r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k));
        prec *= 2;
      }
    while (prec < GMP_NUMB_BITS);
  }
#endif

  rp[0] = r0;
  if (n == 1)
    {
      TMP_FREE;
      return;
    }

  /* For odd k, (k+1)/2 = k/2+1, and the latter avoids overflow. */
  kp1h = k/2 + 1;

  /* FIXME: Special case for two limb iteration. */
  rnp = TMP_ALLOC_LIMBS (2*n + 1);
  ep = rnp + n;

  /* FIXME: Possible to do this on the fly with some bit fiddling. */
  for (i = 0; n > 1; n = (n + 1)/2)
    sizes[i++] = n;

  rn = 1;

  while (i-- > 0)
    {
      /* Compute x^{k+1}. */
      mpn_sqr (ep, rp, rn);     /* For odd n, writes n+1 limbs in the
                                   final iteration. */
      mpn_powlo (rnp, ep, &kp1h, 1, sizes[i], tp);

      /* Multiply by a^{k-1}. Can use wraparound; low part equals r. */

      mpn_mullo_n (ep, rnp, akm1, sizes[i]);
      ASSERT (mpn_cmp (ep, rp, rn) == 0);

      ASSERT (sizes[i] <= 2*rn);
      mpn_pi1_bdiv_q_1 (rp + rn, ep + rn, sizes[i] - rn, k, kinv, 0);
      mpn_neg (rp + rn, rp + rn, sizes[i] - rn);
      rn = sizes[i];
    }

  TMP_FREE;
}

/* ret + (xp, n) = (yp, n)*(zp, n) % 2^b + 1

   needs (tp, 2n) temp space, everything reduced mod 2^b
   inputs, outputs are fully reduced

   N.B: 2n is not the same as 2b rounded up to nearest limb!
*/
inline static int
mpn_mulmod_2expp1_internal (mp_ptr xp, mp_srcptr yp, mp_srcptr zp,
                            mpir_ui b, mp_ptr tp)
{
  mp_size_t n, k;
  mp_limb_t c;

  TMP_DECL;

  n = BITS_TO_LIMBS (b);
  k = GMP_NUMB_BITS * n - b;

  ASSERT(b > 0);
  ASSERT(n > 0);
  ASSERT_MPN(yp, n);
  ASSERT_MPN(zp, n);
  ASSERT(!MPN_OVERLAP_P (tp, 2 * n, yp, n));
  ASSERT(!MPN_OVERLAP_P (tp, 2 * n, zp, n));
  ASSERT(MPN_SAME_OR_SEPARATE_P (xp, tp, n));
  ASSERT(MPN_SAME_OR_SEPARATE_P (xp, tp + n, n));
  ASSERT(k == 0 || yp[n - 1] >> (GMP_NUMB_BITS - k) == 0);
  ASSERT(k == 0 || zp[n - 1] >> (GMP_NUMB_BITS - k) == 0);

#ifndef TUNE_PROGRAM_BUILD
  if (k == 0 && n > FFT_MULMOD_2EXPP1_CUTOFF && n == mpir_fft_adjust_limbs(n))
    {
      mp_bitcnt_t depth1, depth = 1;
      mp_size_t w1, off;
      mp_ptr tx, ty, tz;
      mp_limb_t ret;

      TMP_MARK;

      tx = TMP_BALLOC_LIMBS(3*n + 3);
      ty = tx + n + 1;
      tz = ty + n + 1;

      MPN_COPY(ty, yp, n);
      MPN_COPY(tz, zp, n);
      ty[n] = 0;
      tz[n] = 0;

      while ((((mp_limb_t)1)<<depth) < b) depth++;

      if (depth < 12) off = mulmod_2expp1_table_n[0];
      else off = mulmod_2expp1_table_n[MIN(depth, FFT_N_NUM + 11) - 12];
      depth1 = depth/2 - off;

      w1 = b/(((mp_limb_t)1)<<(2*depth1));

      mpir_fft_mulmod_2expp1(tx, ty, tz, n, depth1, w1);

      MPN_COPY(xp, tx, n);
      ret = tx[n];

      TMP_FREE;

      return ret;
    }
#endif

  if (yp == zp)
    mpn_sqr(tp, yp, n);
  else
    mpn_mul_n (tp, yp, zp, n);

  if (k == 0)
    {
      c = mpn_sub_n (xp, tp, tp + n, n);

      return mpn_add_1 (xp, xp, n, c);
    }

  c = tp[n - 1];
  tp[n - 1] &= GMP_NUMB_MASK >> k;

#if HAVE_NATIVE_mpn_sublsh_nc
  c = mpn_sublsh_nc (xp, tp, tp + n, n, k, c);
#else
  {
    mp_limb_t c1;
    c1 = mpn_lshift (tp + n, tp + n, n, k);
    tp[n] |= c >> (GMP_NUMB_BITS - k);
    c = mpn_sub_n (xp, tp, tp + n, n) + c1;
  }
#endif

  c = mpn_add_1 (xp, xp, n, c);
  xp[n - 1] &= GMP_NUMB_MASK >> k;

  return c;
}

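/* The k == 0 branch above is the classic fold modulo B^n + 1: since
   B^n == -1, the high half of the double-length product is subtracted from
   the low half and the borrow is wrapped back in.  Below is a standalone
   sketch of just that step with public GMP mpn calls; the function name
   fold_mod_bnp1 and the two-limb sizes are illustrative only. */
#include <stdio.h>
#include <gmp.h>

/* Reduce {tp,2n} modulo B^n + 1: the result is t_lo - t_hi.  On borrow, rp
   holds t_lo - t_hi + B^n, while the residue needs t_lo - t_hi + (B^n + 1),
   so add 1; if that addition itself carries, the returned limb is the
   (n+1)-st limb of the semi-normalised result, exactly as in the k == 0
   branch above. */
static mp_limb_t
fold_mod_bnp1 (mp_limb_t *rp, const mp_limb_t *tp, mp_size_t n)
{
  mp_limb_t bw = mpn_sub_n (rp, tp, tp + n, n);
  return mpn_add_1 (rp, rp, n, bw);
}

int
main (void)
{
  mp_limb_t a[2] = { 123456789, 42 };
  mp_limb_t t[4], r[2], top;

  mpn_sqr (t, a, 2);                 /* 4-limb square */
  top = fold_mod_bnp1 (r, t, 2);     /* a^2 mod (B^2 + 1) */

  gmp_printf ("top %Mx, low %Mx %Mx\n", top, r[1], r[0]);
  return 0;
}
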
void
_arb_exp_taylor_rs(mp_ptr y, mp_limb_t * error,
    mp_srcptr x, mp_size_t xn, ulong N)
{
    mp_ptr s, t, xpow;
    mp_limb_t new_denom, old_denom, c;
    slong power, k, m;

    TMP_INIT;
    TMP_START;

    if (N >= FACTORIAL_TAB_SIZE - 1)
    {
        flint_printf("_arb_exp_taylor_rs: N too large!\n");
        abort();
    }

    if (N <= 3)
    {
        if (N <= 1)
        {
            flint_mpn_zero(y, xn);
            y[xn] = N;
            error[0] = 0;
        }
        else if (N == 2)
        {
            flint_mpn_copyi(y, x, xn);
            y[xn] = 1;
            error[0] = 0;
        }
        else
        {
            /* 1 + x + x^2 / 2 */
            t = TMP_ALLOC_LIMBS(2 * xn);
            mpn_sqr(t, x, xn);
            mpn_rshift(t + xn, t + xn, xn, 1);
            y[xn] = mpn_add_n(y, x, t + xn, xn) + 1;
            error[0] = 2;
        }
    }
    else
    {
        /* Choose m ~= sqrt(num_terms) (m must be even, >= 2) */
        /* TODO: drop evenness assumption since we don't have sign issues here? */
        /* TODO: then just need to fix power construction below... */
        m = 2;
        while (m * m < N)
            m += 2;

        /* todo: merge allocations */
        xpow = TMP_ALLOC_LIMBS((m + 1) * xn);
        s = TMP_ALLOC_LIMBS(xn + 2);
        t = TMP_ALLOC_LIMBS(2 * xn + 2);     /* todo: 1 limb too much? */

        /* higher index ---> */
        /*        | ---xn--- | */
        /* xpow = |  <temp>  | x^m | x^(m-1) | ... | x^2 | x | */

#define XPOW_WRITE(__k) (xpow + (m - (__k)) * xn)
#define XPOW_READ(__k) (xpow + (m - (__k) + 1) * xn)

        flint_mpn_copyi(XPOW_READ(1), x, xn);
        mpn_sqr(XPOW_WRITE(2), XPOW_READ(1), xn);

        for (k = 4; k <= m; k += 2)
        {
            mpn_mul_n(XPOW_WRITE(k - 1), XPOW_READ(k / 2), XPOW_READ(k / 2 - 1), xn);
            mpn_sqr(XPOW_WRITE(k), XPOW_READ(k / 2), xn);
        }

        flint_mpn_zero(s, xn + 1);

        /* todo: skip one nonscalar multiplication (use x^m)
           when starting on x^0 */
        power = (N - 1) % m;

        for (k = N - 1; k >= 0; k--)
        {
            c = factorial_tab_numer[k];
            new_denom = factorial_tab_denom[k];
            old_denom = factorial_tab_denom[k+1];

            /* change denominators */
            if (new_denom != old_denom && k < N - 1)
            {
                mpn_divrem_1(s, 0, s, xn + 1, old_denom);
            }

            if (power == 0)
            {
                /* add c * x^0 -- only top limb is affected */
                s[xn] += c;

                /* Outer polynomial evaluation: multiply by x^m */
                if (k != 0)
                {
                    mpn_mul(t, s, xn + 1, XPOW_READ(m), xn);
                    flint_mpn_copyi(s, t + xn, xn + 1);
                }

                power = m - 1;
            }
            else
            {
                s[xn] += mpn_addmul_1(s, XPOW_READ(power), xn, c);
                power--;
            }
        }

        /* finally divide by denominator */
        mpn_divrem_1(y, 0, s, xn + 1, factorial_tab_denom[0]);

        /* error bound (ulp) */
        error[0] = 2;
    }

    TMP_END;
}