/* Karatsuba combination step: tp holds the middle product; add it, together
   with the low product {rp, 2*n2} and the high product {rp + 2*n2, 2*n3},
   into rp at offset n2, propagating the carries into the top limbs.  */
static void
mpn_karaadd (mp_ptr rp, mp_ptr tp, mp_size_t n)
{
  mp_size_t n2, n3;
  mp_limb_t c1, c2, c3;

  n2 = n >> 1;
  n3 = n - n2;
  c1 = mpn_add_n (tp, rp + 2 * n2, tp, 2 * n3);
  c2 = mpn_add_n (tp, tp, rp, 2 * n2);
  c3 = mpn_add_n (rp + n2, rp + n2, tp, 2 * n3);
  mpn_incr_u (rp + n2 + 2 * n3, c3 + c1);
  mpn_incr_u (rp + n2 + 2 * n2, c2);
  /* mpn_incr_u (rp + n2 + 2 * n3, c1); */
  return;
}
/* As mpn_karaadd but with the middle product subtracted; when n is odd
   (n2 != n3) the last two limbs of tp are handled separately using the
   saved top limbs of rp in top[].  */
static void
mpn_karasub (mp_ptr rp, mp_ptr tp, mp_size_t n)
{
  mp_size_t n2, n3;
  mp_limb_t c1, c2, c3, top[2];

  n2 = n >> 1;
  n3 = n - n2;
  c1 = mpn_sub_n (tp, rp + 2 * n2, tp, 2 * n2);   /* c1 = mpn_sub_n (tp, rp + 2*n2, tp, 2*n3); */
  c2 = mpn_add_n (tp, tp, rp, 2 * n2);
  c3 = mpn_add_n (rp + n2, rp + n2, tp, 2 * n2);  /* c3 = mpn_add_n (rp + n2, rp + n2, tp, 2*n3); */
  top[1] = rp[2 * n2 + 2 * n3 - 1];
  top[0] = rp[2 * n2 + 2 * n3 - 2];
  mpn_incr_u (rp + 3 * n2, c3);                   /* mpn_incr_u (rp + n2 + 2*n3, c3); */
  mpn_incr_u (rp + 3 * n2, c2);
  mpn_decr_u (rp + 3 * n2, c1);                   /* mpn_decr_u (rp + n2 + 2*n3, c1); */
  if (n2 == n3)
    return;
  c1 = mpn_sub_n (rp + 3 * n2, rp + 3 * n2, tp + 2 * n2, 2);
  c2 = mpn_add_n (rp + 3 * n2, rp + 3 * n2, top, 2);
  if (c2 == 1 && c1 == 0)
    mpn_incr_u (rp + 3 * n2 + 2, 1);
  if (c2 == 0 && c1 == 1)
    mpn_decr_u (rp + 3 * n2 + 2, 1);
  return;
}
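/* Illustrative sketch (not MPIR code): the identity behind mpn_karaadd and
   mpn_karasub is plain Karatsuba recombination.  Single-word uint32_t halves
   stand in for the n2/n3-limb blocks, and a compiler with unsigned __int128
   is assumed.  With lo = x0*y0, hi = x1*y1 and mid = |x0-x1| * |y0-y1|,
   x*y = hi*B^2 + (lo + hi -/+ mid)*B + lo: mid is subtracted when the two
   half-differences have the same sign (the karasub case) and added when
   they differ (the karaadd case).  */
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint32_t x0 = 0x89abcdefu, x1 = 0x01234567u;  /* x = x1*2^32 + x0 */
  uint32_t y0 = 0x0f0f0f0fu, y1 = 0xfedcba98u;  /* y = y1*2^32 + y0 */
  unsigned __int128 lo = (uint64_t) x0 * y0;
  unsigned __int128 hi = (uint64_t) x1 * y1;
  /* x0 > x1 but y0 < y1: differences of opposite sign, so the middle
     product is added (the karaadd case).  */
  unsigned __int128 mid = (uint64_t) (x0 - x1) * (y1 - y0);
  unsigned __int128 x = ((unsigned __int128) x1 << 32) | x0;
  unsigned __int128 y = ((unsigned __int128) y1 << 32) | y0;

  assert ((hi << 64) + ((lo + hi + mid) << 32) + lo == x * y);
  return 0;
}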
unsigned long int
mpz_cdiv_qr_ui (mpz_ptr quot, mpz_ptr rem,
                mpz_srcptr dividend, unsigned long int divisor)
{
  mp_size_t ns, nn, qn;
  mp_ptr np, qp;
  mp_limb_t rl;

  if (divisor == 0)
    DIVIDE_BY_ZERO;

  ns = SIZ(dividend);
  if (ns == 0)
    {
      SIZ(quot) = 0;
      SIZ(rem) = 0;
      return 0;
    }

  nn = ABS(ns);
  MPZ_REALLOC (quot, nn);
  qp = PTR(quot);
  np = PTR(dividend);

#if GMP_NAIL_BITS != 0
  if (divisor > GMP_NUMB_MAX)
    {
      mp_limb_t dp[2];
      mp_ptr rp;
      mp_size_t rn;

      MPZ_REALLOC (rem, 2);
      rp = PTR(rem);

      if (nn == 1)              /* tdiv_qr requirements; tested above for 0 */
        {
          qp[0] = 0;
          qn = 1;               /* a white lie, fixed below */
          rl = np[0];
          rp[0] = rl;
        }
      else
        {
          dp[0] = divisor & GMP_NUMB_MASK;
          dp[1] = divisor >> GMP_NUMB_BITS;
          mpn_tdiv_qr (qp, rp, (mp_size_t) 0, np, nn, dp, (mp_size_t) 2);
          rl = rp[0] + (rp[1] << GMP_NUMB_BITS);
          qn = nn - 2 + 1;
        }

      if (rl != 0 && ns >= 0)
        {
          mpn_incr_u (qp, (mp_limb_t) 1);
          rl = divisor - rl;
          rp[0] = rl & GMP_NUMB_MASK;
          rp[1] = rl >> GMP_NUMB_BITS;
        }
static void
ref_mpn_mul (mp_ptr wp, mp_srcptr up, mp_size_t un, mp_srcptr vp, mp_size_t vn)
{
  mp_ptr tp;
  mp_size_t tn;
  mp_limb_t cy;

  if (vn < TOOM3_THRESHOLD)
    {
      /* In the mpn_mul_basecase and mpn_kara_mul_n range, use our own
         mul_basecase.  */
      if (vn != 0)
        mul_basecase (wp, up, un, vp, vn);
      else
        MPN_ZERO (wp, un);
      return;
    }

  if (vn < FFT_THRESHOLD)
    {
      /* In the mpn_toom3_mul_n and mpn_toom4_mul_n range, use
         mpn_kara_mul_n.  */
      tn = 2 * vn + MPN_KARA_MUL_N_TSIZE (vn);
      tp = __GMP_ALLOCATE_FUNC_LIMBS (tn);
      mpn_kara_mul_n (tp, up, vp, vn, tp + 2 * vn);
    }
  else
    {
      /* Finally, for the largest operands, use mpn_toom3_mul_n.  */
      /* The "- 63 + 255" tweaks the allocation to allow for huge operands.
         See the definition of this macro in gmp-impl.h to understand this.  */
      tn = 2 * vn + MPN_TOOM3_MUL_N_TSIZE (vn) - 63 + 255;
      tp = __GMP_ALLOCATE_FUNC_LIMBS (tn);
      mpn_toom3_mul_n (tp, up, vp, vn, tp + 2 * vn);
    }

  if (un != vn)
    {
      if (un - vn < vn)
        ref_mpn_mul (wp + vn, vp, vn, up + vn, un - vn);
      else
        ref_mpn_mul (wp + vn, up + vn, un - vn, vp, vn);

      MPN_COPY (wp, tp, vn);
      cy = mpn_add_n (wp + vn, wp + vn, tp + vn, vn);
      mpn_incr_u (wp + 2 * vn, cy);
    }
  else
    {
      MPN_COPY (wp, tp, 2 * vn);
    }

  __GMP_FREE_FUNC_LIMBS (tp, tn);
}
mpir_ui
mpz_fdiv_q_ui (mpz_ptr quot, mpz_srcptr dividend, mpir_ui divisor)
{
  mp_size_t ns, nn, qn;
  mp_ptr np, qp;
  mp_limb_t rl;

  if (divisor == 0)
    DIVIDE_BY_ZERO;

  ns = SIZ(dividend);
  if (ns == 0)
    {
      SIZ(quot) = 0;
      return 0;
    }

  nn = ABS(ns);
  MPZ_REALLOC (quot, nn);
  qp = PTR(quot);
  np = PTR(dividend);

#if BITS_PER_UI > GMP_NUMB_BITS  /* avoid warnings about shift amount */
  if (divisor > GMP_NUMB_MAX)
    {
      mp_limb_t dp[2], rp[2];

      if (nn == 1)              /* tdiv_qr requirements; tested above for 0 */
        {
          qp[0] = 0;
          rl = np[0];
          qn = 1;               /* a white lie, fixed below */
        }
      else
        {
          dp[0] = divisor & GMP_NUMB_MASK;
          dp[1] = divisor >> GMP_NUMB_BITS;
          mpn_tdiv_qr (qp, rp, (mp_size_t) 0, np, nn, dp, (mp_size_t) 2);
          rl = rp[0] + (rp[1] << GMP_NUMB_BITS);
          qn = nn - 2 + 1;
        }

      if (rl != 0 && ns < 0)
        {
          mpn_incr_u (qp, (mp_limb_t) 1);
          rl = divisor - rl;
        }

      qn -= qp[qn - 1] == 0;
      qn -= qn != 0 && qp[qn - 1] == 0;
    }
static void
gmp_rrandomb (mp_ptr rp, gmp_randstate_t rstate, mpir_ui nbits)
{
  mpir_ui bi;
  mp_limb_t ranm;               /* buffer for random bits */
  unsigned cap_chunksize, chunksize;
  mp_size_t i;

  /* Set entire result to 111..1  */
  i = (nbits + GMP_NUMB_BITS - 1) / GMP_NUMB_BITS - 1;
  rp[i] = GMP_NUMB_MAX >> (GMP_NUMB_BITS - (nbits % GMP_NUMB_BITS)) % GMP_NUMB_BITS;
  for (i = i - 1; i >= 0; i--)
    rp[i] = GMP_NUMB_MAX;

  _gmp_rand (&ranm, rstate, BITS_PER_RANDCALL);
  cap_chunksize = nbits / (ranm % 4 + 1);
  cap_chunksize += cap_chunksize == 0;  /* make it at least 1 */

  bi = nbits;

  for (;;)
    {
      _gmp_rand (&ranm, rstate, BITS_PER_RANDCALL);
      chunksize = 1 + ranm % cap_chunksize;
      bi = (bi < chunksize) ? 0 : bi - chunksize;

      if (bi == 0)
        break;                  /* low chunk is ...1 */

      rp[bi / GMP_NUMB_BITS] ^= CNST_LIMB (1) << bi % GMP_NUMB_BITS;

      _gmp_rand (&ranm, rstate, BITS_PER_RANDCALL);
      chunksize = 1 + ranm % cap_chunksize;
      bi = (bi < chunksize) ? 0 : bi - chunksize;

      mpn_incr_u (rp + bi / GMP_NUMB_BITS, CNST_LIMB (1) << bi % GMP_NUMB_BITS);

      if (bi == 0)
        break;                  /* low chunk is ...0 */
    }
}
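/* Illustrative sketch (not the library's algorithm): gmp_rrandomb produces
   numbers whose binary expansion is alternating runs of 1s and 0s with
   random run lengths, which is good at provoking carry-propagation corner
   cases in tests.  A single-word analogue using rand():  */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint64_t
rrandom64 (void)
{
  uint64_t r = 0;
  int bit = 63;
  int ones = 1;                 /* start with a run of 1s at the top */

  while (bit >= 0)
    {
      int run = 1 + rand () % 16;       /* random run length */
      for (; run > 0 && bit >= 0; run--, bit--)
        if (ones)
          r |= UINT64_C (1) << bit;
      ones ^= 1;                /* alternate 1-runs and 0-runs */
    }
  return r;
}

int
main (void)
{
  int i;
  for (i = 0; i < 4; i++)
    printf ("%016llx\n", (unsigned long long) rrandom64 ());
  return 0;
}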
static
#endif
void
redc (mp_ptr cp, mp_srcptr mp, mp_size_t n, mp_limb_t Nprim, mp_ptr tp)
{
  mp_limb_t cy;
  mp_limb_t q;
  mp_size_t j;

  tp[2 * n] = 0;                /* carry guard */

  for (j = 0; j < n; j++)
    {
      q = tp[0] * Nprim;        /* q*mp[0] == -tp[0] mod B, so the low limb cancels */
      cy = mpn_addmul_1 (tp, mp, n, q);
      mpn_incr_u (tp + n, cy);
      tp++;
    }

  if (tp[n] != 0)
    mpn_sub_n (cp, tp, mp, n);
  else
    MPN_COPY (cp, tp, n);
}
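/* Self-contained single-limb Montgomery reduction in the same shape as redc
   above (a sketch, not MPIR code; 64-bit limbs via unsigned __int128, all
   identifiers illustrative).  nprim is the negated 2-adic inverse of the
   modulus, so adding q*m clears the low limb at each step.  */
#include <assert.h>
#include <stdint.h>

/* -1/m mod 2^64 for odd m, by Hensel/Newton lifting.  */
static uint64_t
neg_inv64 (uint64_t m)
{
  uint64_t x = m;               /* m*m == 1 mod 8: 3 correct bits */
  int i;
  for (i = 0; i < 5; i++)
    x *= 2 - m * x;             /* each step doubles the correct bits */
  return 0 - x;                 /* x = 1/m mod 2^64; return -x */
}

/* Given T = t1*2^64 + t0 < m*2^64, return T * 2^-64 mod m.  */
static uint64_t
redc64 (uint64_t t1, uint64_t t0, uint64_t m, uint64_t nprim)
{
  uint64_t q = t0 * nprim;      /* makes t0 + q*m == 0 mod 2^64 */
  unsigned __int128 s = (unsigned __int128) q * m + t0; /* low limb is 0 */
  uint64_t r = t1 + (uint64_t) (s >> 64);
  if (r < t1 || r >= m)         /* at most one conditional subtract */
    r -= m;
  return r;
}

int
main (void)
{
  uint64_t m = 0xfedcba9876543211ULL;   /* odd modulus */
  uint64_t nprim = neg_inv64 (m);
  uint64_t t1 = 0x0123456789abcdefULL % m;      /* ensure T < m * 2^64 */
  uint64_t t0 = 0xfedcba9876543210ULL;
  uint64_t r = redc64 (t1, t0, m, nprim);
  unsigned __int128 T = ((unsigned __int128) t1 << 64) | t0;

  assert ((uint64_t) (nprim * m) == (uint64_t) -1);     /* nprim*m == -1 mod 2^64 */
  assert (((unsigned __int128) r << 64) % m == T % m);  /* r == T/B mod m */
  return 0;
}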
void
mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
{
  mp_limb_t w, w0, w1;
  mp_size_t n2;
  mp_srcptr x, y;
  mp_size_t i;
  int sign;

  n2 = n >> 1;
  ASSERT (n2 > 0);

  if ((n & 1) != 0)
    {
      /* Odd length. */
      mp_size_t n1, n3, nm1;

      n3 = n - n2;

      sign = 0;
      w = a[n2];
      if (w != 0)
        w -= mpn_sub_n (p, a, a + n3, n2);
      else
        {
          i = n2;
          do
            {
              --i;
              w0 = a[i];
              w1 = a[n3 + i];
            }
          while (w0 == w1 && i != 0);
          if (w0 < w1)
            {
              x = a + n3;
              y = a;
              sign = ~0;
            }
          else
            {
              x = a;
              y = a + n3;
            }
          mpn_sub_n (p, x, y, n2);
        }
      p[n2] = w;

      w = b[n2];
      if (w != 0)
        w -= mpn_sub_n (p + n3, b, b + n3, n2);
      else
        {
          i = n2;
          do
            {
              --i;
              w0 = b[i];
              w1 = b[n3 + i];
            }
          while (w0 == w1 && i != 0);
          if (w0 < w1)
            {
              x = b + n3;
              y = b;
              sign = ~sign;
            }
          else
            {
              x = b;
              y = b + n3;
            }
          mpn_sub_n (p + n3, x, y, n2);
        }
      p[n] = w;

      n1 = n + 1;
      if (n2 < MUL_KARATSUBA_THRESHOLD)
        {
          if (n3 < MUL_KARATSUBA_THRESHOLD)
            {
              mpn_mul_basecase (ws, p, n3, p + n3, n3);
              mpn_mul_basecase (p, a, n3, b, n3);
            }
          else
            {
              mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
              mpn_kara_mul_n (p, a, b, n3, ws + n1);
            }
          mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
        }
      else
        {
          mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
          mpn_kara_mul_n (p, a, b, n3, ws + n1);
          mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
        }

      if (sign)
        mpn_add_n (ws, p, ws, n1);
      else
        mpn_sub_n (ws, p, ws, n1);

      nm1 = n - 1;
      if (mpn_add_n (ws, p + n1, ws, nm1))
        {
          mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
          ws[nm1] = x;
          if (x == 0)
            ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
        }
      if (mpn_add_n (p + n3, p + n3, ws, n1))
        {
          mpn_incr_u (p + n1 + n3, 1);
        }
    }
  else
    {
      /* Even length. */
      i = n2;
      do
        {
          --i;
          w0 = a[i];
          w1 = a[n2 + i];
        }
      while (w0 == w1 && i != 0);
      sign = 0;
      if (w0 < w1)
        {
          x = a + n2;
          y = a;
          sign = ~0;
        }
      else
        {
          x = a;
          y = a + n2;
        }
      mpn_sub_n (p, x, y, n2);

      i = n2;
      do
        {
          --i;
          w0 = b[i];
          w1 = b[n2 + i];
        }
      while (w0 == w1 && i != 0);
      if (w0 < w1)
        {
          x = b + n2;
          y = b;
          sign = ~sign;
        }
      else
        {
          x = b;
          y = b + n2;
        }
      mpn_sub_n (p + n2, x, y, n2);

      /* Pointwise products. */
      if (n2 < MUL_KARATSUBA_THRESHOLD)
        {
          mpn_mul_basecase (ws, p, n2, p + n2, n2);
          mpn_mul_basecase (p, a, n2, b, n2);
          mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
        }
      else
        {
          mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
          mpn_kara_mul_n (p, a, b, n2, ws + n);
          mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
        }

      /* Interpolate. */
      if (sign)
        w = mpn_add_n (ws, p, ws, n);
      else
        w = -mpn_sub_n (ws, p, ws, n);
      w += mpn_add_n (ws, p + n, ws, n);
      w += mpn_add_n (p + n2, p + n2, ws, n);
      MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
    }
}
void
mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
{
  mp_limb_t w, w0, w1;
  mp_size_t n2;
  mp_srcptr x, y;
  mp_size_t i;

  n2 = n >> 1;
  ASSERT (n2 > 0);

  if ((n & 1) != 0)
    {
      /* Odd length. */
      mp_size_t n1, n3, nm1;

      n3 = n - n2;

      w = a[n2];
      if (w != 0)
        w -= mpn_sub_n (p, a, a + n3, n2);
      else
        {
          i = n2;
          do
            {
              --i;
              w0 = a[i];
              w1 = a[n3 + i];
            }
          while (w0 == w1 && i != 0);
          if (w0 < w1)
            {
              x = a + n3;
              y = a;
            }
          else
            {
              x = a;
              y = a + n3;
            }
          mpn_sub_n (p, x, y, n2);
        }
      p[n2] = w;

      n1 = n + 1;

      /* n2 is always either n3 or n3-1 so maybe the two sets of tests here
         could be combined.  But that's not important, since the tests will
         take a minuscule amount of time compared to the function calls.  */
      if (BELOW_THRESHOLD (n3, SQR_BASECASE_THRESHOLD))
        {
          mpn_mul_basecase (ws, p, n3, p, n3);
          mpn_mul_basecase (p, a, n3, a, n3);
        }
      else if (BELOW_THRESHOLD (n3, SQR_KARATSUBA_THRESHOLD))
        {
          mpn_sqr_basecase (ws, p, n3);
          mpn_sqr_basecase (p, a, n3);
        }
      else
        {
          mpn_kara_sqr_n (ws, p, n3, ws + n1);  /* (x-y)^2 */
          mpn_kara_sqr_n (p, a, n3, ws + n1);   /* x^2 */
        }
      if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
        mpn_mul_basecase (p + n1, a + n3, n2, a + n3, n2);
      else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
        mpn_sqr_basecase (p + n1, a + n3, n2);
      else
        mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1);   /* y^2 */

      /* Since x^2+y^2-(x-y)^2 = 2xy >= 0 there's no need to track the
         borrow from mpn_sub_n.  If it occurs then it'll be cancelled by a
         carry from ws[n].  Further, since 2xy fits in n1 limbs there won't
         be any carry out of ws[n] other than cancelling that borrow.  */
      mpn_sub_n (ws, p, ws, n1);        /* x^2-(x-y)^2 */

      nm1 = n - 1;
      if (mpn_add_n (ws, p + n1, ws, nm1))      /* x^2+y^2-(x-y)^2 = 2xy */
        {
          mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
          ws[nm1] = x;
          if (x == 0)
            ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
        }
      if (mpn_add_n (p + n3, p + n3, ws, n1))
        {
          mpn_incr_u (p + n1 + n3, 1);
        }
    }
  else
    {
      /* Even length. */
      i = n2;
      do
        {
          --i;
          w0 = a[i];
          w1 = a[n2 + i];
        }
      while (w0 == w1 && i != 0);
      if (w0 < w1)
        {
          x = a + n2;
          y = a;
        }
      else
        {
          x = a;
          y = a + n2;
        }
      mpn_sub_n (p, x, y, n2);

      /* Pointwise products. */
      if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
        {
          mpn_mul_basecase (ws, p, n2, p, n2);
          mpn_mul_basecase (p, a, n2, a, n2);
          mpn_mul_basecase (p + n, a + n2, n2, a + n2, n2);
        }
      else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
        {
          mpn_sqr_basecase (ws, p, n2);
          mpn_sqr_basecase (p, a, n2);
          mpn_sqr_basecase (p + n, a + n2, n2);
        }
      else
        {
          mpn_kara_sqr_n (ws, p, n2, ws + n);
          mpn_kara_sqr_n (p, a, n2, ws + n);
          mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
        }

      /* Interpolate. */
      w = -mpn_sub_n (ws, p, ws, n);
      w += mpn_add_n (ws, p + n, ws, n);
      w += mpn_add_n (p + n2, p + n2, ws, n);
      MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
    }
}
void
mpn_toom22_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg;
  mp_limb_t cy, cy2;
  mp_ptr asm1;
  mp_ptr bsm1;

#define a0  ap
#define a1  (ap + n)
#define b0  bp
#define b1  (bp + n)

  s = an >> 1;
  n = an - s;
  t = bn - n;

  ASSERT (an >= bn);
  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= s);

  asm1 = pp;
  bsm1 = pp + n;

  vm1_neg = 0;

  /* Compute asm1. */
  if (s == n)
    {
      if (mpn_cmp (a0, a1, n) < 0)
        {
          mpn_sub_n (asm1, a1, a0, n);
          vm1_neg = 1;
        }
      else
        {
          mpn_sub_n (asm1, a0, a1, n);
        }
    }
  else
    {
      if (mpn_zero_p (a0 + s, n - s) && mpn_cmp (a0, a1, s) < 0)
        {
          mpn_sub_n (asm1, a1, a0, s);
          MPN_ZERO (asm1 + s, n - s);
          vm1_neg = 1;
        }
      else
        {
          mpn_sub (asm1, a0, n, a1, s);
        }
    }

  /* Compute bsm1. */
  if (t == n)
    {
      if (mpn_cmp (b0, b1, n) < 0)
        {
          mpn_sub_n (bsm1, b1, b0, n);
          vm1_neg ^= 1;
        }
      else
        {
          mpn_sub_n (bsm1, b0, b1, n);
        }
    }
  else
    {
      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
        {
          mpn_sub_n (bsm1, b1, b0, t);
          MPN_ZERO (bsm1 + t, n - t);
          vm1_neg ^= 1;
        }
      else
        {
          mpn_sub (bsm1, b0, n, b1, t);
        }
    }

#define v0    pp                /* 2n */
#define vinf  (pp + 2 * n)      /* s+t */
#define vm1   scratch           /* 2n */
#define scratch_out  scratch + 2 * n

  /* vm1, 2n limbs */
  TOOM22_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);

  if (s > t)
    TOOM22_MUL_REC (vinf, a1, s, b1, t, scratch_out);
  else
    TOOM22_MUL_N_REC (vinf, a1, b1, s, scratch_out);

  /* v0, 2n limbs */
  TOOM22_MUL_N_REC (v0, ap, bp, n, scratch_out);

  /* H(v0) + L(vinf) */
  cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n);

  /* L(v0) + H(v0) */
  cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n);

  /* L(vinf) + H(vinf) */
  cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + t - n);

  if (vm1_neg)
    cy += mpn_add_n (pp + n, pp + n, vm1, 2 * n);
  else
    cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n);

  ASSERT (cy + 1 <= 3);
  ASSERT (cy2 <= 2);

  mpn_incr_u (pp + 2 * n, cy2);
  if (LIKELY (cy <= 2))
    mpn_incr_u (pp + 3 * n, cy);
  else
    mpn_decr_u (pp + 3 * n, 1);
}
mp_limb_t
mpn_mul (mp_ptr prodp,
         mp_srcptr up, mp_size_t un,
         mp_srcptr vp, mp_size_t vn)
{
  mp_size_t l, k;
  mp_limb_t c;

  ASSERT (un >= vn);
  ASSERT (vn >= 1);
  ASSERT (! MPN_OVERLAP_P (prodp, un + vn, up, un));
  ASSERT (! MPN_OVERLAP_P (prodp, un + vn, vp, vn));

  if (un == vn)
    {
      if (up == vp)
        {
          mpn_sqr (prodp, up, un);
          return prodp[2 * un - 1];
        }
      else
        {
          mpn_mul_n (prodp, up, vp, un);
          return prodp[2 * un - 1];
        }
    }

  if (vn < MUL_KARATSUBA_THRESHOLD)
    { /* plain schoolbook multiplication */
      if (un <= MUL_BASECASE_MAX_UN)
        mpn_mul_basecase (prodp, up, un, vp, vn);
      else
        {
          /* We have un >> MUL_BASECASE_MAX_UN > vn.  For better memory
             locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
             these pieces with the vp[] operand.  After each such partial
             multiplication (but the last) we copy the most significant vn
             limbs into a temporary buffer since that part would otherwise be
             overwritten by the next multiplication.  After the next
             multiplication, we add it back.  This illustrates the situation:

                                                    -->vn<--
                                                      |  |<------- un ------->|
                                                         _____________________|
                                                        X                    /|
                                                      /XX__________________/  |
                                    _____________________                     |
                                   X                    /                     |
                                 /XX__________________/                       |
               _____________________                                          |
              X                    /                                          |
            /XX__________________/                                            |
          ==================================================================

             The parts marked with X are the parts whose sums are copied into
             the temporary buffer.  */

          mp_limb_t tp[MUL_KARATSUBA_THRESHOLD_LIMIT];
          mp_limb_t cy;
          ASSERT (MUL_KARATSUBA_THRESHOLD <= MUL_KARATSUBA_THRESHOLD_LIMIT);

          mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
          prodp += MUL_BASECASE_MAX_UN;
          MPN_COPY (tp, prodp, vn);             /* preserve high triangle */
          up += MUL_BASECASE_MAX_UN;
          un -= MUL_BASECASE_MAX_UN;
          while (un > MUL_BASECASE_MAX_UN)
            {
              mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
              cy = mpn_add_n (prodp, prodp, tp, vn);    /* add back preserved triangle */
              mpn_incr_u (prodp + vn, cy);              /* safe? */
              prodp += MUL_BASECASE_MAX_UN;
              MPN_COPY (tp, prodp, vn);         /* preserve high triangle */
              up += MUL_BASECASE_MAX_UN;
              un -= MUL_BASECASE_MAX_UN;
            }
          if (un > vn)
            {
              mpn_mul_basecase (prodp, up, un, vp, vn);
            }
          else
            {
              ASSERT_ALWAYS (un > 0);
              mpn_mul_basecase (prodp, vp, vn, up, un);
            }
          cy = mpn_add_n (prodp, prodp, tp, vn);        /* add back preserved triangle */
          mpn_incr_u (prodp + vn, cy);                  /* safe? */
        }
      return prodp[un + vn - 1];
    }

  if (ABOVE_THRESHOLD (un + vn, 2 * MUL_FFT_FULL_THRESHOLD)
      && ABOVE_THRESHOLD (3 * vn, MUL_FFT_FULL_THRESHOLD))
    {
      mpn_mul_fft_main (prodp, up, un, vp, vn);
      return prodp[un + vn - 1];
    }

  k = (un + 3) / 4;             /* ceil(un/4) */

#if GMP_NUMB_BITS == 32
  if ((ABOVE_THRESHOLD (un + vn, 2 * MUL_TOOM8H_THRESHOLD))
      && (vn >= 86) && (5 * un <= 11 * vn))
#else
  if ((ABOVE_THRESHOLD (un + vn, 2 * MUL_TOOM8H_THRESHOLD))
      && (vn >= 86) && (4 * un <= 13 * vn))
#endif
    {
      mpn_toom8h_mul (prodp, up, un, vp, vn);
      return prodp[un + vn - 1];
    }

  if (ABOVE_THRESHOLD (un + vn, 2 * MUL_TOOM4_THRESHOLD))
    {
      if (vn > 3 * k)
        {
          mpn_toom4_mul (prodp, up, un, vp, vn);
          return prodp[un + vn - 1];
        }
      else
        {
          l = (un + 4) / 5;     /* ceil(un/5) */
          if ((((vn > 9 * k / 4) && (un + vn <= 6 * MUL_TOOM4_THRESHOLD))
               || ((vn > 2 * l) && (un + vn > 6 * MUL_TOOM4_THRESHOLD)))
              && (vn <= 3 * l))
            {
              mpn_toom53_mul (prodp, up, un, vp, vn);
              return prodp[un + vn - 1];
            }
        }
    }

  if (ABOVE_THRESHOLD (un + vn, 2 * MUL_TOOM3_THRESHOLD) && (vn > k))
    {
      mp_ptr ws;
      TMP_DECL;
      TMP_MARK;

      if (vn < 2 * k)           /* un/2 >= vn > un/4 */
        {
          ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE (un));
          mpn_toom42_mul (prodp, up, un, vp, vn, ws);
          TMP_FREE;
          return prodp[un + vn - 1];
        }

      l = (un + 2) / 3;         /* ceil(un/3) */
      if (vn > 2 * l)           /* un >= vn > 2un/3 */
        {
          ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE (un));
          mpn_toom3_mul (prodp, up, un, vp, vn, ws);
          TMP_FREE;
          return prodp[un + vn - 1];
        }
      else                      /* 2un/3 >= vn > un/3 */
        {
          ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE (un));
          mpn_toom32_mul (prodp, up, un, vp, vn, ws);
          TMP_FREE;
          return prodp[un + vn - 1];
        }
    }

  mpn_mul_n (prodp, up, vp, vn);

  if (un != vn)
    {
      mp_limb_t t;
      mp_ptr ws;
      TMP_DECL;
      TMP_MARK;

      prodp += vn;
      l = vn;
      up += vn;
      un -= vn;

      if (un < vn)
        {
          /* Swap u's and v's. */
          MPN_SRCPTR_SWAP (up, un, vp, vn);
        }

      ws = TMP_ALLOC_LIMBS ((vn >= MUL_KARATSUBA_THRESHOLD ? vn : un) + vn);

      t = 0;
      while (vn >= MUL_KARATSUBA_THRESHOLD)
        {
          mpn_mul_n (ws, up, vp, vn);
          if (l <= 2 * vn)
            {
              t += mpn_add_n (prodp, prodp, ws, l);
              if (l != 2 * vn)
                {
                  t = mpn_add_1 (prodp + l, ws + l, 2 * vn - l, t);
                  l = 2 * vn;
                }
            }
          else
            {
              c = mpn_add_n (prodp, prodp, ws, 2 * vn);
              t += mpn_add_1 (prodp + 2 * vn, prodp + 2 * vn, l - 2 * vn, c);
            }

          prodp += vn;
          l -= vn;
          up += vn;
          un -= vn;
          if (un < vn)
            {
              /* Swap u's and v's. */
              MPN_SRCPTR_SWAP (up, un, vp, vn);
            }
        }

      if (vn != 0)
        {
          mpn_mul_basecase (ws, up, un, vp, vn);
          if (l <= un + vn)
            {
              t += mpn_add_n (prodp, prodp, ws, l);
              if (l != un + vn)
                t = mpn_add_1 (prodp + l, ws + l, un + vn - l, t);
            }
          else
            {
              c = mpn_add_n (prodp, prodp, ws, un + vn);
              t += mpn_add_1 (prodp + un + vn, prodp + un + vn, l - un - vn, c);
            }
        }
      TMP_FREE;
    }

  return prodp[un + vn - 1];
}
void
mpn_toom2_sqr (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_ptr scratch)
{
  mp_size_t n, s;
  mp_limb_t cy, cy2;
  mp_ptr asm1;

#define a0  ap
#define a1  (ap + n)

  s = an >> 1;
  n = an - s;

  ASSERT (0 < s && s <= n);

  asm1 = pp;

  /* Compute asm1. */
  if (s == n)
    {
      if (mpn_cmp (a0, a1, n) < 0)
        {
          mpn_sub_n (asm1, a1, a0, n);
        }
      else
        {
          mpn_sub_n (asm1, a0, a1, n);
        }
    }
  else
    {
      if (mpn_zero_p (a0 + s, n - s) && mpn_cmp (a0, a1, s) < 0)
        {
          mpn_sub_n (asm1, a1, a0, s);
          MPN_ZERO (asm1 + s, n - s);
        }
      else
        {
          mpn_sub (asm1, a0, n, a1, s);
        }
    }

#define v0    pp                /* 2n */
#define vinf  (pp + 2 * n)      /* s+s */
#define vm1   scratch           /* 2n */
#define scratch_out  scratch + 2 * n

  /* vm1, 2n limbs */
  TOOM2_SQR_REC (vm1, asm1, n, scratch_out);

  /* vinf, s+s limbs */
  TOOM2_SQR_REC (vinf, a1, s, scratch_out);

  /* v0, 2n limbs */
  TOOM2_SQR_REC (v0, ap, n, scratch_out);

  /* H(v0) + L(vinf) */
  cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n);

  /* L(v0) + H(v0) */
  cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n);

  /* L(vinf) + H(vinf) */
  cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + s - n);

  cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n);

  ASSERT (cy + 1 <= 3);
  ASSERT (cy2 <= 2);

  mpn_incr_u (pp + 2 * n, cy2);
  if (LIKELY (cy <= 2))
    mpn_incr_u (pp + 3 * n, cy);
  else
    mpn_decr_u (pp + 3 * n, 1);
}
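/* Worked instance of the interpolation used by mpn_toom2_sqr (a sketch, not
   MPIR code; uint32_t halves, unsigned __int128 assumed).  With v0 = a0^2,
   vinf = a1^2 and vm1 = (a0-a1)^2, the recombination is
   a^2 = vinf*B^2 + (v0 + vinf - vm1)*B + v0, and v0 + vinf - vm1 = 2*a0*a1
   is never negative, which is why toom2_sqr needs no sign flag where
   mpn_toom22_mul tracks vm1_neg.  */
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint32_t a0 = 0xdeadbeefu, a1 = 0x12345678u;  /* a = a1*2^32 + a0 */
  unsigned __int128 v0 = (uint64_t) a0 * a0;
  unsigned __int128 vinf = (uint64_t) a1 * a1;
  uint64_t d = a0 > a1 ? a0 - a1 : a1 - a0;     /* |a0 - a1|, as asm1 */
  unsigned __int128 vm1 = d * d;
  unsigned __int128 a = ((unsigned __int128) a1 << 32) | a0;

  assert ((vinf << 64) + ((v0 + vinf - vm1) << 32) + v0 == a * a);
  return 0;
}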
void
mpn_dcpi1_bdiv_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
                  mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
{
  mp_size_t qn;
  mp_limb_t cy;
  mp_ptr tp;
  TMP_DECL;

  TMP_MARK;

  ASSERT (dn >= 2);
  ASSERT (nn - dn >= 0);
  ASSERT (dp[0] & 1);

  tp = TMP_SALLOC_LIMBS (dn);

  qn = nn;

  if (qn > dn)
    {
      /* Reduce qn mod dn in a super-efficient manner. */
      do
        qn -= dn;
      while (qn > dn);

      /* Perform the typically smaller block first. */
      if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD))
        cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv);
      else
        cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, qn, dinv, tp);

      if (qn != dn)
        {
          if (qn > dn - qn)
            mpn_mul (tp, qp, qn, dp + qn, dn - qn);
          else
            mpn_mul (tp, dp + qn, dn - qn, qp, qn);
          mpn_incr_u (tp + qn, cy);

          mpn_sub (np + qn, np + qn, nn - qn, tp, dn);
          cy = 0;
        }

      np += qn;
      qp += qn;

      qn = nn - qn;
      while (qn > dn)
        {
          mpn_sub_1 (np + dn, np + dn, qn - dn, cy);
          cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, dn, dinv, tp);
          qp += dn;
          np += dn;
          qn -= dn;
        }
      mpn_dcpi1_bdiv_q_n (qp, np, dp, dn, dinv, tp);
    }
  else
    {
      if (BELOW_THRESHOLD (qn, DC_BDIV_Q_THRESHOLD))
        mpn_sbpi1_bdiv_q (qp, np, qn, dp, qn, dinv);
      else
        mpn_dcpi1_bdiv_q_n (qp, np, dp, qn, dinv, tp);
    }

  TMP_FREE;
}
/* Input:  A = {ap, n} with most significant bit set.
   Output: X = B^n + {xp, n} where B = 2^GMP_NUMB_BITS.

   X is a lower approximation of B^(2n)/A with implicit msb.
   More precisely, one has:

              A*X < B^(2n) <= A*(X+1),

   or X = ceil(B^(2n)/A) - 1.  */
void
mpn_invert (mp_ptr xp, mp_srcptr ap, mp_size_t n)
{
  if (n == 1)
    {
      /* invert_limb returns min(B-1, floor(B^2/ap[0]) - B), which is B-1
         when ap[0] = B/2, and 1 when ap[0] = B-1.  For X = B + xp[0], we
         have A*X < B^2 <= A*(X+1), where the equality holds only when
         A = B/2.  */
      invert_limb (xp[0], ap[0]);
    }
  else if (n == 2)
    {
      mp_limb_t tp[4], up[2], sp[2], cy;

      tp[0] = ZERO;
      invert_limb (xp[1], ap[1]);
      tp[3] = mpn_mul_1 (tp + 1, ap, 2, xp[1]);
      cy = mpn_add_n (tp + 2, tp + 2, ap, 2);
      while (cy)                /* Xh is too large */
        {
          xp[1]--;
          cy -= mpn_sub (tp + 1, tp + 1, 3, ap, 2);
        }
      /* tp[3] should be 111...111 */
      mpn_com_n (sp, tp + 1, 2);
      cy = mpn_add_1 (sp, sp, 2, ONE);          /* cy should be 0 */
      up[1] = mpn_mul_1 (up, sp + 1, 1, xp[1]);
      cy = mpn_add_1 (up + 1, up + 1, 1, sp[1]);        /* cy should be 0 */
      xp[0] = up[1];
      /* update tp */
      cy = mpn_addmul_1 (tp, ap, 2, xp[0]);
      cy = mpn_add_1 (tp + 2, tp + 2, 2, cy);
      do
        {
          cy = mpn_add (tp, tp, 4, ap, 2);
          if (cy == ZERO)
            mpn_add_1 (xp, xp, 2, ONE);
        }
      while (cy == ZERO);
      /* now A*X < B^4 <= A*(X+1) */
    }
  else
    {
      mp_size_t l, h;
      mp_ptr tp, up;
      mp_limb_t cy, th;
      int special = 0;
      TMP_DECL;

      l = (n - 1) / 2;
      h = n - l;

      mpn_invert (xp + l, ap + l, h);

      TMP_MARK;
      tp = TMP_ALLOC_LIMBS (n + h);
      up = TMP_ALLOC_LIMBS (2 * h);

      if (n <= WRAP_AROUND_BOUND)
        {
          mpn_mul (tp, ap, n, xp + l, h);
          cy = mpn_add_n (tp + h, tp + h, ap, n);
        }
      else
        {
          mp_size_t m = n + 1;
          mpir_ui k;
          int cc;

          if (m >= FFT_MULMOD_2EXPP1_CUTOFF)
            m = mpir_fft_adjust_limbs (m);
          /* we have m >= n + 1 by construction, thus m > h */
          ASSERT (m < n + h);
          cy = mpn_mulmod_Bexpp1_fft (tp, m, ap, n, xp + l, h);
          /* cy, {tp, m} = A * {xp + l, h} mod (B^m + 1) */
          cy += mpn_add_n (tp + h, tp + h, ap, m - h);
          cc = mpn_sub_n (tp, tp, ap + m - h, n + h - m);
          cc = mpn_sub_1 (tp + n + h - m, tp + n + h - m, 2 * m - n - h, cc);
          if (cc > cy)          /* can only occur if cc=1 and cy=0 */
            cy = mpn_add_1 (tp, tp, m, ONE);
          else
            cy -= cc;
          /* cy, {tp, m} = A * Xh */

          /* add B^(n+h) + B^(n+h-m) */
          MPN_ZERO (tp + m, n + h - m);
          tp[m] = cy;
          /* note: since tp[n+h-1] is either 0, or cy <= 1 if m = n+h-1,
             the mpn_incr_u() below cannot produce a carry */
          mpn_incr_u (tp + n + h - m, ONE);
          cy = 1;

          do    /* check if T >= B^(n+h) + 2*B^n */
            {
              mp_size_t i;

              if (cy == ZERO)
                break;          /* surely T < B^(n+h) */
              if (cy == ONE)
                {
                  for (i = n + h - 1; tp[i] == ZERO && i > n; i--)
                    ;
                  if (i == n && tp[i] < (mp_limb_t) 2)
                    break;
                }
              /* subtract B^m + 1 */
              cy -= mpn_sub_1 (tp, tp, n + h, ONE);
              cy -= mpn_sub_1 (tp + m, tp + m, n + h - m, ONE);
            }
          while (1);
        }

      while (cy)
        {
          mpn_sub_1 (xp + l, xp + l, h, ONE);
          cy -= mpn_sub (tp, tp, n + h, ap, n);
        }

      mpn_not (tp, n);
      th = ~tp[n] + mpn_add_1 (tp, tp, n, ONE);
      mpn_mul_n (up, tp + l, xp + l, h);
      cy = mpn_add_n (up + h, up + h, tp + l, h);
      if (th != ZERO)
        {
          cy += ONE + mpn_add_n (up + h, up + h, xp + l, h);
        }
      if (up[2 * h - l - 1] + 4 <= CNST_LIMB (3))
        special = 1;
      MPN_COPY (xp, up + 2 * h - l, l);
      mpn_add_1 (xp + l, xp + l, h, cy);

      TMP_FREE;

      if ((special) && !mpn_is_invert (xp, ap, n))
        mpn_add_1 (xp, xp, n, 1);
    }
}
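/* The defining property of mpn_invert is easiest to see at one limb.  A
   reference version of the n == 1 case (a sketch, not MPIR's invert_limb;
   unsigned __int128 assumed): for a normalized a (top bit set) it returns v
   with X = B + v and A*X < B^2 <= A*(X+1), i.e. X = ceil(B^2/A) - 1.  */
#include <assert.h>
#include <stdint.h>

static uint64_t
invert_limb_ref (uint64_t a)
{
  /* floor((B^2 - 1) / a) - B; the "-1" only matters when a divides B^2.  */
  return (uint64_t) (~(unsigned __int128) 0 / a - ((unsigned __int128) 1 << 64));
}

int
main (void)
{
  uint64_t a = 0x9e3779b97f4a7c15ULL;   /* normalized: top bit set */
  uint64_t v = invert_limb_ref (a);
  unsigned __int128 X = ((unsigned __int128) 1 << 64) + v;
  /* B^2 - a*X computed mod 2^128; the property says it lies in (0, a].  */
  unsigned __int128 d = (unsigned __int128) 0 - a * X;

  assert (d != 0 && d <= a);    /* a*X < B^2 <= a*(X+1) */
  return 0;
}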
mp_limb_t
mpn_dc_bdiv_qr (mp_ptr qp, mp_ptr np, mp_size_t nn,
                mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
{
  mp_size_t qn;
  mp_limb_t rr, cy;
  mp_ptr tp;
  TMP_DECL;

  TMP_MARK;

  ASSERT (dn >= 2);             /* to adhere to mpn_sbpi1_div_qr's limits */
  ASSERT (nn - dn >= 1);        /* to adhere to mpn_sbpi1_div_qr's limits */
  ASSERT (dp[0] & 1);

  tp = TMP_ALLOC_LIMBS (dn);

  qn = nn - dn;

  if (qn > dn)
    {
      /* Reduce qn mod dn without division, optimizing small operations. */
      do
        qn -= dn;
      while (qn > dn);

      /* Perform the typically smaller block first. */
      if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD))
        cy = mpn_sb_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv);
      else
        cy = mpn_dc_bdiv_qr_n (qp, np, dp, qn, dinv, tp);

      rr = 0;
      if (qn != dn)
        {
          if (qn > dn - qn)
            mpn_mul (tp, qp, qn, dp + qn, dn - qn);
          else
            mpn_mul (tp, dp + qn, dn - qn, qp, qn);
          mpn_incr_u (tp + qn, cy);

          rr = mpn_sub (np + qn, np + qn, nn - qn, tp, dn);
          cy = 0;
        }

      np += qn;
      qp += qn;

      qn = nn - dn - qn;
      do
        {
          rr += mpn_sub_1 (np + dn, np + dn, qn, cy);
          cy = mpn_dc_bdiv_qr_n (qp, np, dp, dn, dinv, tp);
          qp += dn;
          np += dn;
          qn -= dn;
        }
      while (qn > 0);
      TMP_FREE;
      return rr + cy;
    }

  if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD))
    cy = mpn_sb_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv);
  else
    cy = mpn_dc_bdiv_qr_n (qp, np, dp, qn, dinv, tp);

  rr = 0;
  if (qn != dn)
    {
      if (qn > dn - qn)
        mpn_mul (tp, qp, qn, dp + qn, dn - qn);
      else
        mpn_mul (tp, dp + qn, dn - qn, qp, qn);
      mpn_incr_u (tp + qn, cy);

      rr = mpn_sub (np + qn, np + qn, nn - qn, tp, dn);
      cy = 0;
    }

  TMP_FREE;
  return rr + cy;
}
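/* Both bdiv routines above take dinv, a 2-adic inverse of the (odd) low
   divisor limb, and build each quotient limb from the bottom instead of the
   top.  A sketch of the idea on one limb (not MPIR code; whether the
   library stores the inverse or its negation is a convention detail, this
   sketch uses the plain inverse): exact bottom-up division of {np, n} by an
   odd d, assuming d divides the whole number.  */
#include <assert.h>
#include <stdint.h>

/* 1/d mod 2^64 for odd d, by Hensel/Newton lifting.  */
static uint64_t
binv64 (uint64_t d)
{
  uint64_t x = d;               /* d*d == 1 mod 8: 3 correct bits */
  int i;
  for (i = 0; i < 5; i++)
    x *= 2 - d * x;             /* each step doubles the correct bits */
  return x;
}

/* Each quotient limb is q = (np[i] - borrow) * dinv mod B; subtracting q*d
   clears the current low limb, the same bottom-up shape as bdiv above.  */
static void
bdiv_exact_1 (uint64_t *qp, const uint64_t *np, int n, uint64_t d, uint64_t dinv)
{
  uint64_t borrow = 0;
  int i;
  for (i = 0; i < n; i++)
    {
      uint64_t limb = np[i] - borrow;
      uint64_t under = np[i] < borrow;          /* borrow out */
      uint64_t q = limb * dinv;                 /* q*d == limb mod 2^64 */
      unsigned __int128 prod = (unsigned __int128) q * d;
      qp[i] = q;
      borrow = (uint64_t) (prod >> 64) + under; /* high limb of q*d */
    }
}

int
main (void)
{
  uint64_t d = 0xfedcba9876543211ULL;   /* odd divisor */
  uint64_t dinv = binv64 (d);
  uint64_t q0 = 0x123456789abcdef1ULL, q1 = 0x0fedcba987654321ULL;
  uint64_t np[3], qp[3];
  unsigned __int128 t = (unsigned __int128) q0 * d;

  np[0] = (uint64_t) t;
  t = (t >> 64) + (unsigned __int128) q1 * d;
  np[1] = (uint64_t) t;
  np[2] = (uint64_t) (t >> 64);         /* np = (q1*B + q0) * d */

  assert (dinv * d == 1);               /* inverse mod 2^64 */
  bdiv_exact_1 (qp, np, 3, d, dinv);
  assert (qp[0] == q0 && qp[1] == q1 && qp[2] == 0);
  return 0;
}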