/* Three-way comparison of two floats.
   Returns 1 if U > V, 0 if U == V and -1 if U < V.  */
int
mpf_cmp (mpf_srcptr u, mpf_srcptr v)
{
  mp_size_t un = u->_mp_size;
  mp_size_t vn = v->_mp_size;
  mp_exp_t ue = u->_mp_exp;
  mp_exp_t ve = v->_mp_exp;
  mp_srcptr ul, vl;
  int sign;      /* common sign of U and V: +1 or -1 */
  int limb_cmp;  /* result of comparing the overlapping high limbs */
  int on_tie;    /* answer when the overlapping high limbs are equal */

  /* Step 1: the sign bits of the two size fields differ, so exactly one
     operand is negative and the other one is the larger.  */
  if ((un ^ vn) < 0)
    return un >= 0 ? 1 : -1;

  /* Same sign bit.  A zero size can only be paired with a size >= 0 here.  */
  if (un == 0)
    return -(vn != 0);
  if (vn == 0)
    return un != 0;

  /* Both operands are non-zero and share this sign.  */
  sign = un >= 0 ? 1 : -1;

  /* Step 2: with equal signs, a larger exponent means a larger magnitude.  */
  if (ue != ve)
    return ue > ve ? sign : -sign;

  un = ABS (un);
  vn = ABS (vn);
  ul = u->_mp_d;
  vl = v->_mp_d;

#define STRICT_MPF_NORMALIZATION 0
#if ! STRICT_MPF_NORMALIZATION
  /* Skip zero limbs at the least significant end of each operand.  */
  while (*ul == 0)
    ul++, un--;
  while (*vl == 0)
    vl++, vn--;
#endif

  /* Step 3: compare the limbs both operands have in common, aligned at the
     most significant end.  If those agree, the operand with extra (non-zero)
     low limbs has the larger magnitude.  */
  if (un > vn)
    {
      limb_cmp = mpn_cmp (ul + (un - vn), vl, vn);
      on_tie = sign;
    }
  else if (vn > un)
    {
      limb_cmp = mpn_cmp (ul, vl + (vn - un), un);
      on_tie = -sign;
    }
  else
    {
      limb_cmp = mpn_cmp (ul, vl, un);
      on_tie = 0;
    }

  if (limb_cmp == 0)
    return on_tie;
  return limb_cmp > 0 ? sign : -sign;
}

void mpn_toom53_mul (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr scratch) { mp_size_t n, s, t; int vm1_neg, vmh_neg; mp_limb_t cy; mp_ptr gp, hp; mp_ptr as1, asm1, as2, ash, asmh; mp_ptr bs1, bsm1, bs2, bsh, bsmh; enum toom4_flags flags; TMP_DECL; #define a0 ap #define a1 (ap + n) #define a2 (ap + 2*n) #define a3 (ap + 3*n) #define a4 (ap + 4*n) #define b0 bp #define b1 (bp + n) #define b2 (bp + 2*n) n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3); s = an - 4 * n; t = bn - 2 * n; ASSERT (0 < s && s <= n); ASSERT (0 < t && t <= n); TMP_MARK; as1 = TMP_SALLOC_LIMBS (n + 1); asm1 = TMP_SALLOC_LIMBS (n + 1); as2 = TMP_SALLOC_LIMBS (n + 1); ash = TMP_SALLOC_LIMBS (n + 1); asmh = TMP_SALLOC_LIMBS (n + 1); bs1 = TMP_SALLOC_LIMBS (n + 1); bsm1 = TMP_SALLOC_LIMBS (n + 1); bs2 = TMP_SALLOC_LIMBS (n + 1); bsh = TMP_SALLOC_LIMBS (n + 1); bsmh = TMP_SALLOC_LIMBS (n + 1); gp = pp; hp = pp + n + 1; /* Compute as1 and asm1. */ gp[n] = mpn_add_n (gp, a0, a2, n); gp[n] += mpn_add (gp, gp, n, a4, s); hp[n] = mpn_add_n (hp, a1, a3, n); #if HAVE_NATIVE_mpn_addsub_n if (mpn_cmp (gp, hp, n + 1) < 0) { mpn_addsub_n (as1, asm1, hp, gp, n + 1); vm1_neg = 1; } else { mpn_addsub_n (as1, asm1, gp, hp, n + 1); vm1_neg = 0; } #else mpn_add_n (as1, gp, hp, n + 1); if (mpn_cmp (gp, hp, n + 1) < 0) { mpn_sub_n (asm1, hp, gp, n + 1); vm1_neg = 1; } else { mpn_sub_n (asm1, gp, hp, n + 1); vm1_neg = 0; } #endif /* Compute as2. 
*/ #if !HAVE_NATIVE_mpn_addlsh_n ash[n] = mpn_lshift (ash, a2, n, 2); /* 4a2 */ #endif #if HAVE_NATIVE_mpn_addlsh1_n cy = mpn_addlsh1_n (as2, a3, a4, s); if (s != n) cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy); cy = 2 * cy + mpn_addlsh1_n (as2, a2, as2, n); cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n); as2[n] = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n); #else cy = mpn_lshift (as2, a4, s, 1); cy += mpn_add_n (as2, a3, as2, s); if (s != n) cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy); cy = 4 * cy + mpn_lshift (as2, as2, n, 2); cy += mpn_add_n (as2, a1, as2, n); cy = 2 * cy + mpn_lshift (as2, as2, n, 1); as2[n] = cy + mpn_add_n (as2, a0, as2, n); mpn_add_n (as2, ash, as2, n + 1); #endif /* Compute ash and asmh. */ #if HAVE_NATIVE_mpn_addlsh_n cy = mpn_addlsh_n (gp, a2, a0, n, 2); /* 4a0 + a2 */ cy = 4 * cy + mpn_addlsh_n (gp, a4, gp, n, 2); /* 16a0 + 4a2 + a4 */ /* FIXME s */ gp[n] = cy; cy = mpn_addlsh_n (hp, a3, a1, n, 2); /* 4a1 + a3 */ cy = 2 * cy + mpn_lshift (hp, hp, n, 1); /* 8a1 + 2a3 */ hp[n] = cy; #else gp[n] = mpn_lshift (gp, a0, n, 4); /* 16a0 */ mpn_add (gp, gp, n + 1, a4, s); /* 16a0 + a4 */ mpn_add_n (gp, ash, gp, n+1); /* 16a0 + 4a2 + a4 */ cy = mpn_lshift (hp, a1, n, 3); /* 8a1 */ cy += mpn_lshift (ash, a3, n, 1); /* 2a3 */ cy += mpn_add_n (hp, ash, hp, n); /* 8a1 + 2a3 */ hp[n] = cy; #endif #if HAVE_NATIVE_mpn_addsub_n if (mpn_cmp (gp, hp, n + 1) < 0) { mpn_addsub_n (ash, asmh, hp, gp, n + 1); vmh_neg = 1; } else { mpn_addsub_n (ash, asmh, gp, hp, n + 1); vmh_neg = 0; } #else mpn_add_n (ash, gp, hp, n + 1); if (mpn_cmp (gp, hp, n + 1) < 0) { mpn_sub_n (asmh, hp, gp, n + 1); vmh_neg = 1; } else { mpn_sub_n (asmh, gp, hp, n + 1); vmh_neg = 0; } #endif /* Compute bs1 and bsm1. */ bs1[n] = mpn_add (bs1, b0, n, b2, t); /* b0 + b2 */ #if HAVE_NATIVE_mpn_addsub_n if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0) { bs1[n] = mpn_addsub_n (bs1, bsm1, b1, bs1, n) >> 1; bsm1[n] = 0; vm1_neg ^= 1; }

void check_n (void) { { int n = -1; check_one ("blah", "%nblah", &n); ASSERT_ALWAYS (n == 0); } { int n = -1; check_one ("hello ", "hello %n", &n); ASSERT_ALWAYS (n == 6); } { int n = -1; check_one ("hello world", "hello %n world", &n); ASSERT_ALWAYS (n == 6); } #define CHECK_N(type, string) \ do { \ type x[2]; \ char fmt[128]; \ \ x[0] = ~ (type) 0; \ x[1] = ~ (type) 0; \ sprintf (fmt, "%%d%%%sn%%d", string); \ check_one ("123456", fmt, 123, &x[0], 456); \ \ /* should write whole of x[0] and none of x[1] */ \ ASSERT_ALWAYS (x[0] == 3); \ ASSERT_ALWAYS (x[1] == (type) ~ (type) 0); \ \ } while (0) CHECK_N (mp_limb_t, "M"); CHECK_N (char, "hh"); CHECK_N (long, "l"); #if HAVE_LONG_LONG CHECK_N (long long, "L"); #endif #if HAVE_INTMAX_T CHECK_N (intmax_t, "j"); #endif #if HAVE_PTRDIFF_T CHECK_N (ptrdiff_t, "t"); #endif CHECK_N (short, "h"); CHECK_N (size_t, "z"); { mpz_t x[2]; mpz_init_set_si (x[0], -987L); mpz_init_set_si (x[1], 654L); check_one ("123456", "%d%Zn%d", 123, x[0], 456); MPZ_CHECK_FORMAT (x[0]); MPZ_CHECK_FORMAT (x[1]); ASSERT_ALWAYS (mpz_cmp_ui (x[0], 3L) == 0); ASSERT_ALWAYS (mpz_cmp_ui (x[1], 654L) == 0); mpz_clear (x[0]); mpz_clear (x[1]); } { mpq_t x[2]; mpq_init (x[0]); mpq_init (x[1]); mpq_set_ui (x[0], -987L, 654L); mpq_set_ui (x[1], 4115L, 226L); check_one ("123456", "%d%Qn%d", 123, x[0], 456); MPQ_CHECK_FORMAT (x[0]); MPQ_CHECK_FORMAT (x[1]); ASSERT_ALWAYS (mpq_cmp_ui (x[0], 3L, 1L) == 0); ASSERT_ALWAYS (mpq_cmp_ui (x[1], 4115L, 226L) == 0); mpq_clear (x[0]); mpq_clear (x[1]); } { mpf_t x[2]; mpf_init (x[0]); mpf_init (x[1]); mpf_set_ui (x[0], -987L); mpf_set_ui (x[1], 654L); check_one ("123456", "%d%Fn%d", 123, x[0], 456); MPF_CHECK_FORMAT (x[0]); MPF_CHECK_FORMAT (x[1]); ASSERT_ALWAYS (mpf_cmp_ui (x[0], 3L) == 0); ASSERT_ALWAYS (mpf_cmp_ui (x[1], 654L) == 0); mpf_clear (x[0]); mpf_clear (x[1]); } { mp_limb_t a[5]; mp_limb_t a_want[numberof(a)]; mp_size_t i; a[0] = 123; check_one ("blah", "bl%Nnah", a, (mp_size_t) 0); ASSERT_ALWAYS (a[0] == 
123); MPN_ZERO (a_want, numberof (a_want)); for (i = 1; i < numberof (a); i++) { check_one ("blah", "bl%Nnah", a, i); a_want[0] = 2; ASSERT_ALWAYS (mpn_cmp (a, a_want, i) == 0); } } }

/* Store in Q={qp, n} an approximation of N={np, 2*n} divided by D={dp, n}
   and return the most significant limb of the quotient (0 or 1).
   The most significant bit of D must be set.  N is clobbered.
   The approximate quotient Q satisfies - 2(n-1) < N/D - Q <= 4.  */
static mp_limb_t
mpfr_divhigh_n_basecase (mpfr_limb_ptr qp, mpfr_limb_ptr np,
                         mpfr_limb_srcptr dp, mp_size_t n)
{
  mp_limb_t q_high, d1, d0, d1_inv, q2, q1, q0, borrow;
  mpfr_pi1_t dinv2;

  np += n;

  /* Top quotient limb: 1 exactly when the high half of N is >= D, in which
     case D is subtracted once from that high half.  */
  q_high = (mpn_cmp (np, dp, n) >= 0);
  if (q_high)
    mpn_sub_n (np, np, dp, n);

  /* now {np, n} is less than D={dp, n}, which implies np[n-1] <= dp[n-1] */

  d1 = dp[n - 1];

  if (n == 1)
    {
      invert_limb (d1_inv, d1);
      umul_ppmm (q1, q0, np[0], d1_inv);
      qp[0] = np[0] + q1;
      return q_high;
    }

  /* now n >= 2 */
  d0 = dp[n - 2];
  invert_pi1 (dinv2, d1, d0);
  /* dinv2.inv32 = floor ((B^3 - 1) / (d0 + d1 B)) - B */

  /* Each pass produces one quotient limb, then drops the lowest limb of the
     divisor (dp advances while n shrinks).  */
  for (; n > 1; dp++)
    {
      /* Invariant: it remains to reduce n limbs from N (in addition to the
         initial low n limbs).  Since n >= 2 here, necessarily we had n >= 2
         initially, which means that in addition to the limb np[n-1] to
         reduce, we have at least 2 extra limbs, thus accessing np[n-3] is
         valid.  */

      /* Warning: we can have np[n-1]>d1 or (np[n-1]=d1 and np[n-2]>=d0)
         here, since we truncate the divisor at each step, but since
         {np,n} < D originally, the largest possible partial quotient is
         B-1.  */
      if (MPFR_UNLIKELY(np[n-1] > d1 || (np[n-1] == d1 && np[n-2] >= d0)))
        q2 = MPFR_LIMB_MAX;
      else
        udiv_qr_3by2 (q2, q1, q0, np[n - 1], np[n - 2], np[n - 3],
                      d1, d0, dinv2.inv32);

      /* since q2 = floor((np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0)),
         we have q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0),
         thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0)
         and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1)
         thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0
         which proves that at most one correction is needed */
      borrow = mpn_submul_1 (np - 1, dp, n, q2);
      if (MPFR_UNLIKELY(borrow > np[n - 1]))
        {
          /* q2 was one too large: add the divisor back once.  */
          mpn_add_n (np - 1, np - 1, dp, n);
          q2 --;
        }

      n--;
      qp[n] = q2;
    }

  /* we have B+dinv2 = floor((B^3-1)/(d1*B+d0)) < B^2/d1
     q1 = floor(np[0]*(B+dinv2)/B) <= floor(np[0]*B/d1)
        <= floor((np[0]*B+np[1])/d1)
     thus q1 is not larger than the true quotient.
     q1 > np[0]*(B+dinv2)/B - 1 > np[0]*(B^3-1)/(d1*B+d0)/B - 2
     For d1*B+d0 <> B^2/2, we have B+dinv2 = floor(B^3/(d1*B+d0))
     thus q1 > np[0]*B^2/(d1*B+d0) - 2, i.e.,
     (d1*B+d0)*q1 > np[0]*B^2 - 2*(d1*B+d0)
     d1*B*q1 > np[0]*B^2 - 2*d1*B - 2*d0 - d0*q1
             >= np[0]*B^2 - 2*d1*B - B^2
     thus q1 > np[0]*B/d1 - 2 - B/d1 > np[0]*B/d1 - 4.

     For d1*B+d0 = B^2/2, dinv2 = B-1 thus
     q1 > np[0]*(2B-1)/B - 1 > np[0]*B/d1 - 2.

     In all cases, if q = floor((np[0]*B+np[1])/d1), we have:
     q - 4 <= q1 <= q  */
  umul_ppmm (q1, q0, np[0], dinv2.inv32);
  qp[0] = np[0] + q1;

  return q_high;
}

/* Divide-and-conquer approximate quotient of {np, 2*n} by {dp, n}.

   The n low quotient limbs are written to {qp, n}; the return value is the
   high limb of the quotient (0 or 1).  Requires n >= 6.  {np, 2*n} is
   modified.  dip and d1ip are passed unchanged to mpn_sb_divappr_q and to
   the recursive calls (presumably precomputed inverses of the leading
   divisor limbs -- confirm against mpn_sb_divappr_q).  tp is scratch space:
   the first 2*n limbs hold a copy of N and the first recursive call is
   given tp + 2*n as its own scratch, so more than 2*n limbs are needed
   (the exact requirement is not visible in this function).

   Method: compute the high m = ceil(n/2) quotient limbs from a truncated
   2m-by-m division, deliberately under-estimate them by 3, subtract
   (an estimate of) q_hi * D from N using a middle product, then recurse on
   what is left to get the low limbs with one extra guard limb, which is
   finally used for rounding.  */
mp_limb_t
mpn_dc_divappr_q_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
                    mp_limb_t dip, mp_limb_t d1ip, mp_ptr tp)
{
  mp_limb_t qh, cy;
  mp_ptr q_hi;
  mp_size_t m;
  mp_limb_t ret = 0;

  ASSERT (n >= 6);

  /* if the top n limbs of np are >= dp, high limb of quotient is 1 */
  if (mpn_cmp(np + n, dp, n) >= 0)
    {
      ret = 1;
      mpn_sub_n(np + n, np + n, dp, n);
    }

  /* top n limbs of np are now < dp */

  m = (n + 1) / 2;
  q_hi = qp + n - m;

  /* FIXME: we could probably avoid this copy if we could guarantee
     that sb_div_appr_q/dc_divappr_q_n did not destroy the "bottom
     half" of N */
  MPN_COPY (tp, np, 2*n);

  /* estimate high m+1 limbs of quotient, using a 2*m by m division
     the quotient may be computed 1 too large as it is approximate,
     moreover, even computed precisely it may be two too large due
     to the truncation we've done to a 2*m by m division... */
  if (m < DC_DIVAPPR_Q_N_THRESHOLD)
    qh = mpn_sb_divappr_q (q_hi, tp + 2*n - 2*m, 2*m, dp + n - m, m, dip, d1ip);
  else
    qh = mpn_dc_divappr_q_n (q_hi, tp + 2*n - 2*m, dp + n - m, m, dip, d1ip, tp + 2*n);

  /* we therefore decrease the estimate by 3... */
  qh -= mpn_sub_1 (q_hi, q_hi, m, (mp_limb_t) 3);

  /* ensuring it doesn't become negative */
  if (qh & GMP_NUMB_HIGHBIT)
    {
      MPN_ZERO (q_hi, m);
      qh = 0;
    }

  /* note qh is now always zero as the quotient we have is definitely
     correct or up to two too small, and we already normalised np */
  ASSERT (qh == 0);

  /* we know that {np+n-m, n+m} = q_hi * D + e0, where 0 <= e0 < C*B^n,
     where C is a small positive constant. Estimate q_hi * D using
     middle product, developing one additional limb, i.e. develop
     n - m + 3 limbs. The bottom limb is meaningless and the next limb
     may be too small by up to some small multiple of n, but recall
     n << B. */
  mpn_mulmid (tp, dp, n, q_hi + 1, m - 2);

  /* do some parts of the middle product "manually": the contributions of
     the lowest (q_hi[0]) and highest (q_hi[m-1]) limbs of q_hi, which the
     mpn_mulmid call above (on q_hi + 1, m - 2 limbs) did not include */
  tp[n - m + 2] += mpn_addmul_1 (tp, dp + m - 2, n - m + 2, q_hi[0]);
  mpn_addmul_1 (tp + 1, dp, n - m + 2, q_hi[m-1]);

  /* subtract that estimate from N. We note the limb at np + n - 2
     is then meaningless, and the next limb might be too large by a
     small amount, i.e. the bottom n limbs of np are now possibly
     too large by a quantity much less than dp */
  mpn_sub_n (np + n - 2, np + n - 2, tp, n - m + 3);

  /* recursively divide to obtain low half of quotient, developing
     one more limb than we would need if everything had been exact.
     As this extra limb is out by only a small amount, rounding the
     remaining limbs based on its value and discarding the extra limb
     results in a quotient which is at most 1 too large */
  if (n - m + 2 < DC_DIVAPPR_Q_N_THRESHOLD)
    cy = mpn_sb_divappr_q (tp, np + m - 3, 2*n - 2*m + 4, dp + m - 2, n - m + 2, dip, d1ip);
  else
    cy = mpn_dc_divappr_q_n (tp, np + m - 3, dp + m - 2, n - m + 2, dip, d1ip, tp + n - m + 2);

  /* FIXME: The only reason this copy happens is that we elected to
     develop one extra quotient limb in the second recursive quotient. */
  MPN_COPY (qp, tp + 1, n - m);

  /* Construct final quotient from low and hi parts... : tp[n-m+1] is the
     top limb of the low quotient and overlaps the bottom limb of q_hi;
     cy is the high limb returned by the second division and lands one
     limb higher */
  ret += mpn_add_1 (qp + n - m, qp + n - m, m, tp[n-m+1]);
  ret += mpn_add_1 (qp + n - m + 1, qp + n - m + 1, m - 1, cy);
  if (tp[0] >= GMP_NUMB_HIGHBIT)
    ret += mpn_add_1 (qp, qp, n, 1); /* ...rounding quotient up */

  /* As the final quotient may be 1 too large, we may have ret == 2
     (it is very unlikely, but can be relatively easily triggered at
     random when dp = 0x80000...0000), then Q must be 2000.... and
     we should return instead 1ffff.... */
  if (ret == 2)
    {
      ret -= mpn_sub_1 (qp, qp, n, 1);
      ASSERT (ret == 1);
    }

  return ret;
}

static int do_test (void) { mp1 ex, x, xt, e2, e3; int i; int errors = 0; int failures = 0; mp1 maxerror; int maxerror_s = 0; const double sf = pow (2, mpbpl); /* assert(mpbpl == mp_bits_per_limb); */ assert(FRAC / mpbpl * mpbpl == FRAC); memset (maxerror, 0, sizeof (mp1)); memset (xt, 0, sizeof (mp1)); xt[(FRAC - N2) / mpbpl] = (mp_limb_t)1 << (FRAC - N2) % mpbpl; for (i = 0; i < (1 << N2); ++i) { int e2s, e3s, j; double de2; mpn_mul_1 (x, xt, SZ, i); exp2_mpn (ex, x); de2 = exp2 (i / (double) (1 << N2)); for (j = SZ - 1; j >= 0; --j) { e2[j] = (mp_limb_t) de2; de2 = (de2 - e2[j]) * sf; } if (mpn_cmp (ex, e2, SZ) >= 0) mpn_sub_n (e3, ex, e2, SZ); else mpn_sub_n (e3, e2, ex, SZ); e2s = mpn_bitsize (e2, SZ); e3s = mpn_bitsize (e3, SZ); if (e3s >= 0 && e2s - e3s < 54) { #if PRINT_ERRORS printf ("%06x ", i * (0x100000 / (1 << N2))); print_mpn_fp (ex, (FRAC / 4) + 1, 16); putchar ('\n'); fputs (" ",stdout); print_mpn_fp (e2, (FRAC / 4) + 1, 16); putchar ('\n'); printf (" %c ", e2s - e3s < 54 ? e2s - e3s == 53 ? 'e' : 'F' : 'P'); print_mpn_fp (e3, (FRAC / 4) + 1, 16); putchar ('\n'); #endif errors += (e2s - e3s == 53); failures += (e2s - e3s < 53); } if (e3s >= maxerror_s && mpn_cmp (e3, maxerror, SZ) > 0) { memcpy (maxerror, e3, sizeof (mp1)); maxerror_s = e3s; } } /* Check exp_mpn against precomputed value of exp(1). */ memset (x, 0, sizeof (mp1)); x[FRAC / mpbpl] = (mp_limb_t)1 << FRAC % mpbpl; exp_mpn (ex, x); if (mpn_cmp (ex, mp_exp1, SZ) >= 0) mpn_sub_n (e3, ex, mp_exp1, SZ); else mpn_sub_n (e3, mp_exp1, ex, SZ); printf ("%d failures; %d errors; error rate %0.2f%%\n", failures, errors, errors * 100.0 / (double) (1 << N2)); fputs ("maximum error: ", stdout); print_mpn_fp (maxerror, (FRAC / 4) + 1, 16); putchar ('\n'); fputs ("error in exp(1): ", stdout); print_mpn_fp (e3, (FRAC / 4) + 1, 16); putchar ('\n'); return failures == 0 ? 0 : 1; }

/* Extended gcd of {ap, an} and {bp, n}, with an >= n.

   The gcd is written to gp and its size in limbs is the return value.  A
   cofactor of the first operand is written to s0p; *s0size receives its
   size in limbs, negated when the cofactor is negative and 0 when it is
   zero.  Both {ap, an} and {bp, n} are overwritten.

   Structure:
     - single limb operands are handled directly with mpn_gcdinv_1;
     - if an > n, a is first reduced modulo b;
     - below GCDEXT_THRESHOLD everything is handed to mpn_ngcdext_lehmer;
     - otherwise repeated half-gcd steps (mpn_nhgcd) shrink a and b while
       the pair (u0, u1) tracks the cofactors of the original a, and a
       final Lehmer call on the small remainder is combined with u0, u1.  */
mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
            mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t init_scratch, orig_n = n;
  mp_size_t scratch, un, u0n, u1n;
  mp_limb_t t;
  mp_ptr tp, u0, u1;
  int swapped = 0;
  struct ngcd_matrix M;
  mp_size_t p;
  mp_size_t nn;
  mp_limb_signed_t a;
  int c;
  TMP_DECL;

  ASSERT (an >= n);

  /* Single limb case (an >= n, so n is 0 or 1 here).  */
  if (an == 1)
    {
      if (!n)
        {
          /* shouldn't ever occur, but we include for completeness */
          gp[0] = ap[0];
          s0p[0] = 1;
          *s0size = 1;

          return 1;
        }

      gp[0] = mpn_gcdinv_1(&a, ap[0], bp[0]);
      if (a < (mp_limb_signed_t) 0)
        {
          s0p[0] = -a;
          (*s0size) = -1;
        }
      else
        {
          s0p[0] = a;
          (*s0size) = 1 - (s0p[0] == 0);
        }

      return 1;
    }

  /* Work out how much scratch space the steps below need.  */
  init_scratch = MPN_NGCD_MATRIX_INIT_ITCH (n-P_SIZE(n));
  scratch = mpn_nhgcd_itch ((n+1)/2);

  /* Space needed for mpn_ngcd_matrix_adjust */
  if (scratch < 2*n)
    scratch = 2*n;
  if (scratch < an - n + 1) /* the first division can sometimes be selfish!! */
    scratch = an - n + 1;

  /* Space needed for cofactor adjust */
  scratch = MAX(scratch, 2*(n+1) + P_SIZE(n) + 1);

  TMP_MARK;

  /* 2n+2 for u0, u1, 5*n+2 + MPN_GCD_LEHMER_N_ITCH(n) for Lehmer
     and copies of ap and bp and s (and finally 3*n+1 for t and get_t) */
  if (5*n + 2 + MPN_GCD_LEHMER_N_ITCH(n) > init_scratch + scratch)
    tp = TMP_ALLOC_LIMBS (7*n+4+MPN_GCD_LEHMER_N_ITCH(n));
  else
    tp = TMP_ALLOC_LIMBS (2*(n+1) + init_scratch + scratch);

  /* Reduce a modulo b so that both operands have at most n limbs.  The
     quotient goes to scratch and is discarded.  */
  if (an > n)
    {
      mp_ptr qp = tp;

      mpn_tdiv_qr (qp, ap, 0, ap, an, bp, n);

      an = n;
      MPN_NORMALIZE (ap, an);
      if (an == 0)
        {
          /* b divides a: the gcd is b and the cofactor of a is zero.  */
          MPN_COPY (gp, bp, n);
          TMP_FREE;
          (*s0size) = 0;

          return n;
        }
    }

  if (BELOW_THRESHOLD (n, GCDEXT_THRESHOLD))
    {
      n = mpn_ngcdext_lehmer (gp, s0p, s0size, ap, bp, n, tp);
      TMP_FREE;

      return n;
    }

  u0 = tp; /* Cofactor space */
  u1 = tp + n + 1;

  MPN_ZERO(tp, 2*(n+1));

  tp += 2*(n+1);

  /* First iteration, setup u0 and u1 */

  p = P_SIZE(n);

  mpn_ngcd_matrix_init (&M, n - p, tp);
  ASSERT(tp + init_scratch > M.p[1][1] + M.n);
  nn = mpn_nhgcd (ap + p, bp + p, n - p, &M, tp + init_scratch);
  if (nn > 0)
    {
      n = mpn_ngcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + init_scratch);

      /* (ap'', bp'')^T = M^-1(ap', bp')^T
         and (ap', bp') = (1*ap + ?*bp, 0*ap + ?*bp)
         We let u0 be minus the factor of ap appearing
         in the expression for bp'' and u1 be the
         factor of ap appearing in the expression for ap'' */

      MPN_COPY(u0, M.p[1][0], M.n);
      MPN_COPY(u1, M.p[1][1], M.n);

      un = M.n;
      while ((u0[un-1] == 0) && (u1[un-1] == 0)) un--; /* normalise u0, u1, both cannot be zero as det = 1*/
    }
  else
    {
      mp_size_t gn;

      un = 1;
      u0[0] = 0; /* bp = 0*ap + ?*bp, thus u0 = -0 */
      u1[0] = 1; /* ap = 1*ap + ?*bp, thus u1 = 1 */

      n = mpn_ngcdext_subdiv_step (gp, &gn, s0p, u0, u1, &un, ap, bp, n, tp);
      if (n == 0)
        {
          /* never observed to occur */
          (*s0size) = un;
          ASSERT(s0p[*s0size - 1] != 0);
          TMP_FREE;

          return gn;
        }
    }

  /* Main loop: keep applying half-gcd steps while the operands are large.
     Note that M, p and nn declared here shadow the outer variables.  */
  while (ABOVE_THRESHOLD (n, GCDEXT_THRESHOLD))
    {
      struct ngcd_matrix M;
      mp_size_t p = P_SIZE(n);
      mp_size_t nn;

      mpn_ngcd_matrix_init (&M, n - p, tp);
      nn = mpn_nhgcd (ap + p, bp + p, n - p, &M, tp + init_scratch);
      if (nn > 0)
        {
          n = mpn_ngcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + init_scratch);

          ngcdext_cofactor_adjust(u0, u1, &un, &M, tp + init_scratch);

          /* (ap'', bp'')^T = M^-1(ap', bp')^T
             and (ap', bp') = (u1*ap + ?*bp, -u0*ap + ?*bp)
             So we need u0' = -(-c*u1 + a*-u0) = a*u0 + c*u1
             and we need u1' = (d*u1 -b*-u0) = b*u0 + d*u1 */

          ASSERT(un <= orig_n + 1);
        }
      else
        {
          mp_size_t gn;

          n = mpn_ngcdext_subdiv_step (gp, &gn, s0p, u0, u1, &un, ap, bp, n, tp);
          ASSERT(un <= orig_n + 1);
          if (n == 0)
            {
              (*s0size) = un;
              ASSERT(((*s0size) == 0) || (s0p[ABS(*s0size) - 1] != 0));
              TMP_FREE;

              return gn;
            }
        }
    }

  ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
  ASSERT (u0[un-1] > 0 || u1[un-1] > 0);

  /* Arrange for a to have the larger top limb; swapping a and b also swaps
     the roles of u0 and u1, which is undone through the sign of the final
     cofactor (see the uses of `swapped' below).  */
  if (ap[n-1] < bp[n-1])
    {
      MP_PTR_SWAP (ap, bp);
      MP_PTR_SWAP (u0, u1);
      swapped = 1;
    }

  an = n; /* {ap, an} and {bp, bn} are normalised, {ap, an} >= {bp, bn} */
  MPN_NORMALIZE (bp, n);

  if (n == 0)
    {
      /* If bp == 0 then gp = ap
         with cofactor u1
         If we swapped then cofactor is -u1
         This case never seems to happen */
      MPN_COPY (gp, ap, an);
      MPN_NORMALIZE(u1, un);
      MPN_COPY(s0p, u1, un);
      (*s0size) = un;
      if (swapped) (*s0size) = -(*s0size);
      TMP_FREE;

      return an;
    }

  /* If at this point we have s*ap' + t*bp' = gp where gp is the gcd
     and (ap', bp') = (u1*ap + ?*bp, -u0*ap + ?*bp)
     then gp = s*u1*ap - t*u0*ap + ?*bp
     and the cofactor we want is (s*u1-t*u0).

     First there is the special case u0 = 0, u1 = 1 in which case we do not need
     to compute t... */

  ASSERT(u1 + un <= tp);
  u0n = un;
  MPN_NORMALIZE(u0, u0n); /* {u0, u0n} is now normalised */

  if (u0n == 0) /* u1 = 1 case is rare*/
    {
      mp_size_t gn;

      gn = mpn_ngcdext_lehmer (gp, s0p, s0size, ap, bp, n, tp);
      if (swapped) (*s0size) = -(*s0size);
      TMP_FREE;

      return gn;
    }
  else
    {
      /* Compute final gcd. */

      /* Note: the local `t' (mp_ptr) shadows the unused outer mp_limb_t t. */
      mp_size_t gn, sn, tn;
      mp_ptr s, t;
      mp_limb_t cy;
      int negate = 0;

      /* Save an, bn first as gcdext destroys inputs */
      s = tp;
      tp += an;

      MPN_COPY(tp, ap, an);
      MPN_COPY(tp + an, bp, an);

      if (mpn_cmp(tp, tp + an, an) == 0)
        {
          /* gcd is tp or tp + an
             return smallest cofactor, either -u0 or u1 */
          gn = an;
          MPN_NORMALIZE(tp, gn);
          MPN_COPY(gp, tp, gn);

          MPN_CMP(c, u0, u1, un);
          if (c < (mp_limb_signed_t) 0)
            {
              MPN_COPY(s0p, u0, u0n);
              (*s0size) = -u0n;
            }
          else
            {
              MPN_NORMALIZE(u1, un);
              MPN_COPY(s0p, u1, un);
              (*s0size) = un;
            }
          TMP_FREE;

          return gn;
        }

      /* Lehmer on the saved copies; ap and bp stay intact for gcdext_get_t. */
      gn = mpn_ngcdext_lehmer (gp, s, &sn, tp, tp + an, an, tp + 2*an);

      /* Special case, s == 0, t == 1, cofactor = -u0 case is rare*/
      if (sn == 0)
        {
          MPN_COPY(s0p, u0, u0n);
          (*s0size) = -u0n;
          if (swapped) (*s0size) = -(*s0size);
          TMP_FREE;

          return gn;
        }

      /* We'll need the other cofactor t = (gp - s*ap)/bp */
      t = tp;
      tp += (an + 1);

      gcdext_get_t(t, &tn, gp, gn, ap, an, bp, n, s, sn, tp);

      ASSERT((tn == 0) || (t[tn - 1] > 0)); /* {t, tn} is normalised */
      ASSERT(tn <= an + 1);

      /* We want to compute s*u1 - t*u0, so if s is negative
         t will be positive, so we'd be dealing with negative
         numbers. We fix that here. */
      if (sn < 0)
        {
          sn = -sn;
          negate = 1;
        }

      /* Now we can deal with the special case u1 = 0 */
      u1n = un;
      MPN_NORMALIZE(u1, u1n); /* {u1, u1n} is now normalised */

      if (u1n == 0) /* case is rare */
        {
          MPN_COPY(s0p, t, tn);
          (*s0size) = -tn;
          if (swapped ^ negate) (*s0size) = -(*s0size);
          TMP_FREE;

          return gn;
        }

      /* t may be zero, but we need to compute s*u1 anyway; mpn_mul is
         called with the longer operand first */
      if (sn >= u1n)
        mpn_mul(s0p, s, sn, u1, u1n);
      else
        mpn_mul(s0p, u1, u1n, s, sn);

      (*s0size) = sn + u1n;
      (*s0size) -= (s0p[sn + u1n - 1] == 0);
      ASSERT(s0p[*s0size - 1] > 0); /* {s0p, *s0size} is normalised now */

      if (tn == 0) /* case is rare */
        {
          if (swapped ^ negate) (*s0size) = -(*s0size);
          TMP_FREE;

          return gn;
        }

      /* Now compute the rest of the cofactor, t*u0
         and subtract it
         We're done with u1 and s which happen to be
         consecutive, so use that space */
      ASSERT(u1 + tn + u0n <= t);

      if (tn > u0n)
        mpn_mul(u1, t, tn, u0, u0n);
      else
        mpn_mul(u1, u0, u0n, t, tn);

      u1n = tn + u0n;
      u1n -= (u1[tn + u0n - 1] == 0);
      ASSERT(u1[u1n - 1] > 0);

      /* Recall t is now negated so s*u1 - t*u0
         involves an *addition*; mpn_add wants the longer operand first */
      if ((*s0size) >= u1n)
        {
          cy = mpn_add(s0p, s0p, *s0size, u1, u1n);
          if (cy) s0p[(*s0size)++] = cy;
        }
      else
        {
          cy = mpn_add(s0p, u1, u1n, s0p, *s0size);
          (*s0size) = u1n;
          if (cy) s0p[(*s0size)++] = cy;
        }

      if (swapped ^ negate) (*s0size) = -(*s0size);
      TMP_FREE;

      return gn;
    }
}

/* Toom-3 squaring of {a, n}, result written at c, scratch space at t.

   The operand is split as a = a0 + a1*B^k + a2*B^(2k) with k = ceil(n/3);
   a0 and a1 have k limbs and a2 has r = n - 2k limbs.  The square is
   evaluated at five points
       v0   = a0^2
       v1   = (a0+a1+a2)^2
       vm1  = (a0-a1+a2)^2
       v2   = (a0+2a1+4a2)^2
       vinf = a2^2
   and mpn_toom3_interpolate combines them.  The evaluations are done in
   place in c and t, so the order of the statements below matters: each
   operand is built in a region that a later product will overwrite.
   Requires n >= 17 and GMP_NUMB_BITS >= 6.  */
void
mpn_toom3_sqr_n (mp_ptr c, mp_srcptr a, mp_size_t n, mp_ptr t)
{
  mp_size_t k, k1, kk1, r, twok, rr2;
  mp_limb_t cy, cc, saved, vinf0;
  mp_ptr trec;
  int sa;
  mp_ptr c1, c2, c3, c4, c5, t1, t2, t3, t4;

  ASSERT(GMP_NUMB_BITS >= 6);

  k = (n + 2) / 3; /* ceil(n/3) */
  ASSERT(GMP_NUMB_BITS >= 6);
  ASSERT(n >= 17); /* so that r <> 0 and 5k+3 <= 2n */

  twok = 2 * k;
  k1 = k + 1;
  kk1 = k + k1;
  r = n - twok;   /* last chunk */
  rr2 = 2*r;

  /* ci = c + i*k and ti = t + i*k */
  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  c5 = c4 + k;
  t1 = t + k;
  t2 = t1 + k;
  t3 = t2 + k;
  t4 = t3 + k;

  /* scratch handed to the recursive squarings */
  trec = t + 4 * k + 3;

  /* put a0+a2 in {c, k+1}
     put a0+a1+a2 in {t2 + 1, k+1} */
  cy = mpn_add_n (c, a, a + twok, r);
  if (r < k)
    {
      /* a2 is shorter than a0: propagate the carry through the rest of a0 */
      __GMPN_ADD_1 (cy, c + r, a + r, k - r, cy);
    }
  t3[1] = (c1[0] = cy) + mpn_add_n (t2 + 1, c, a + k, k);

  /* compute v1 := (a0+a1+a2)^2 in {c2, 2k+1};
     since v1 < 9*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 4 */
  TOOM3_SQR_REC (c2, t2 + 1, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
                 v1
  */

  /* put |a0-a1+a2| in {c,k+1}; sa >= 0 iff a0+a2 >= a1 */
  sa = (c[k] != 0) ? 1 : mpn_cmp (c, a + k, k);
  c[k] = (sa >= 0) ? c[k] - mpn_sub_n (c, c, a + k, k)
    : mpn_sub_n (c, a + k, c, k);

  /* compute vm1 := (a0-a1+a2)^2 in {t, 2k+1};
     since |vm1| < 4*B^(2k), vm1 uses only 2k+1 limbs */
  TOOM3_SQR_REC (t, c, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
                 v1

     {t, 2k+1} {t+2k+1, 2k + 1}
        vm1
  */

  /* compute a0+2a1+4a2 in {c, k+1}, as ((2*a2 + a1)*2 + a0) */
#if HAVE_NATIVE_mpn_addlsh1_n
  c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
  if (r < k)
    {
      __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
    }
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
#else
  c[r] = mpn_lshift1 (c, a + twok, r);
  if (r < k)
    {
      MPN_ZERO(c + r + 1, k - r);
    }
  c1[0] += mpn_add_n (c, c, a + k, k);
  mpn_double (c, k1);
  c1[0] += mpn_add_n (c, c, a, k);
#endif

#define v2 (t+2*k+1)

  /* compute v2 := (a0+2a1+4a2)^2 in {t+2k+1, 2k+1}
     v2 < 49*B^k so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */
  TOOM3_SQR_REC (v2, c, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
                 v1

     {t, 2k+1} {t+2k+1, 2k + 1}
        vm1         v2
  */

  /* compute v0 := a0^2 in {c, 2k} */
  TOOM3_SQR_REC (c, a, k, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
       v0        v1

     {t, 2k+1} {t+2k+1, 2k + 1}
        vm1         v2
  */

#define vinf (c+4*k)

  /* compute vinf := a2^2 in {c4, r + r2}.  The low limb of vinf lands on
     c4[0], which still holds the top limb of v1, so that limb is saved
     around the call and the low limb of vinf is handed to the
     interpolation separately as vinf0.  */
  saved = c4[0];
  TOOM3_SQR_REC (c4, a + twok, r, trec);
  vinf0 = c4[0];
  c4[0] = saved;

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
       v0        v1         {-}vinf

     {t, 2k+1} {t+2k+1, 2k + 1}
        vm1         v2

     vinf0 = {-}
  */

  /* The sign argument is the constant 1: vm1 is a square.  */
  mpn_toom3_interpolate (c, c2, v2, t, vinf, k, rr2, 1, vinf0, t4+2);

#undef v2
#undef vinf
}

/* Toom-3 multiplication of {a, n} by {b, n}, result written at c, scratch
   space at t.

   Both operands are split into three pieces, x = x0 + x1*B^k + x2*B^(2k)
   with k = ceil(n/3); x0 and x1 have k limbs and x2 has r = n - 2k limbs.
   The product is evaluated at five points
       v0   = a0*b0
       v1   = (a0+a1+a2)*(b0+b1+b2)
       vm1  = (a0-a1+a2)*(b0-b1+b2)      (sign kept separately in sa)
       v2   = (a0+2a1+4a2)*(b0+2b1+4b2)
       vinf = a2*b2
   and mpn_toom3_interpolate combines them.  The evaluations are done in
   place in c and t, so the order of the statements below matters.
   Requires n >= 17 and GMP_NUMB_BITS >= 6.

   The necessary temporary space T(n) satisfies T(n)=0 for n < THRESHOLD,
   and T(n) <= max(2n+2, 6k+3, 4k+3+T(k+1)) otherwise, where k = ceil(n/3).

   Assuming T(n) >= 2n, 6k+3 <= 4k+3+T(k+1).
   Similarly, 2n+2 <= 6k+2 <= 4k+3+T(k+1).

   With T(n) = 2n+S(n), this simplifies to S(n) <= 9 + S(k+1).
   Since THRESHOLD >= 17, we have n/(k+1) >= 19/8
   thus S(n) <= S(n/(19/8)) + 9 thus S(n) <= 9*log(n)/log(19/8) <= 8*log2(n).

   We need in addition 2*r for mpn_sublsh1_n, so the total is at most
   8/3*n+8*log2(n).
*/
void
mpn_toom3_mul_n (mp_ptr c, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr t)
{
  mp_size_t k, k1, kk1, r, twok, rr2;
  mp_limb_t cy, cc, saved, vinf0;
  mp_ptr trec;
  int sa, sb;
  mp_ptr c1, c2, c3, c4, c5, t1, t2, t3, t4;

  ASSERT(GMP_NUMB_BITS >= 6);

  k = (n + 2) / 3; /* ceil(n/3) */
  ASSERT(GMP_NUMB_BITS >= 6);
  ASSERT(n >= 17); /* so that r <> 0 and 5k+3 <= 2n */

  twok = 2 * k;
  k1 = k + 1;
  kk1 = k + k1;
  r = n - twok;   /* last chunk */
  rr2 = 2*r;

  /* ci = c + i*k and ti = t + i*k */
  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  c5 = c4 + k;
  t1 = t + k;
  t2 = t1 + k;
  t3 = t2 + k;
  t4 = t3 + k;

  /* scratch handed to the recursive multiplications */
  trec = t + 4 * k + 4;

  /* put a0+a2 in {c, k+1}, and b0+b2 in {c4 + 2, k+1};
     put a0+a1+a2 in {t2 + 1, k+1} and b0+b1+b2 in {t3 + 2,k+1} */
  c1[0] = mpn_add_n (c, a, a + twok, r);
  c5[2] = mpn_add_n (c4 + 2, b, b + twok, r);
  if (r < k)
    {
      /* x2 is shorter than x0: propagate the carry through the rest of x0 */
      c1[0] = mpn_add_1 (c + r, a + r, k - r, c1[0]);
      c5[2] = mpn_add_1 (c4 + 2 + r, b + r, k - r, c5[2]);
    }
  t3[1] = c1[0] + mpn_add_n (t2 + 1, c, a + k, k);
  t4[2] = c5[2] + mpn_add_n (t3 + 2, c4 + 2, b + k, k);

  ASSERT(c1[0] < 2);
  ASSERT(c5[2] < 2);
  ASSERT(t3[1] < 3);
  ASSERT(t4[2] < 3);

  /* compute v1 := (a0+a1+a2)*(b0+b1+b2) in {c2, 2k+1};
     since v1 < 9*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 4 */
  TOOM3_MUL_REC (c2, t2 + 1, t3 + 2, k1, trec);

  ASSERT(c2[k+k] < 9);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
                 v1
  */

  /* put |a0-a1+a2| in {c,k+1} and |b0-b1+b2| in {c4 + 2,k+1} */
  /* sa = sign(a0-a1+a2) */
  /* sb = sign(b0-b1+b2) */
  sa = (c[k] != 0) ? 1 : mpn_cmp (c, a + k, k);
  c[k] = (sa >= 0) ? c[k] - mpn_sub_n (c, c, a + k, k)
    : mpn_sub_n (c, a + k, c, k);

  /* b0+b2 is in {c4+2, k+1} now */
  sb = (c5[2] != 0) ? 1 : mpn_cmp (c4 + 2, b + k, k);
  c5[2] = (sb >= 0) ? c5[2] - mpn_sub_n (c4 + 2, c4 + 2, b + k, k)
    : mpn_sub_n (c4 + 2, b + k, c4 + 2, k);

  ASSERT(c[k] < 2);
  ASSERT(c5[2] < 2);

  sa *= sb; /* sign of vm1 */

  /* compute vm1 := (a0-a1+a2)*(b0-b1+b2) in {t, 2k+1};
     since |vm1| < 4*B^(2k), vm1 uses only 2k+1 limbs */
  TOOM3_MUL_REC (t, c, c4 + 2, k1, trec);

  ASSERT(t[k+k] < 4);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
                 v1

     {t, 2k+1} {t+2k+1, 2k + 1}
        vm1
  */

  /* compute a0+2a1+4a2 in {c, k+1} and b0+2b1+4b2 in {c4 + 2, k+1},
     each as ((2*x2 + x1)*2 + x0) */
#if HAVE_NATIVE_mpn_addlsh1_n
  c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
  c5[2] = mpn_addlsh1_n (c4 + 2, b + k, b + twok, r);
  if (r < k)
    {
      c1[0] = mpn_add_1(c + r, a + k + r, k - r, c1[0]);
      c5[2] = mpn_add_1(c4 + 2 + r, b + k + r, k - r, c5[2]);
    }
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
  c5[2] = 2 * c5[2] + mpn_addlsh1_n (c4 + 2, b, c4 + 2, k);
#else
  c[r] = mpn_lshift1 (c, a + twok, r);
  c4[r + 2] = mpn_lshift1 (c4 + 2, b + twok, r);
  if (r < k)
    {
      MPN_ZERO(c + r + 1, k - r);
      MPN_ZERO(c4 + r + 3, k - r);
    }
  c1[0] += mpn_add_n (c, c, a + k, k);
  c5[2] += mpn_add_n (c4 + 2, c4 + 2, b + k, k);
  mpn_double (c, k1);
  mpn_double (c4 + 2, k1);
  c1[0] += mpn_add_n (c, c, a, k);
  c5[2] += mpn_add_n (c4 + 2, c4 + 2, b, k);
#endif

  ASSERT(c[k] < 7);
  ASSERT(c5[2] < 7);

#define v2 (t+2*k+1)

  /* compute v2 := (a0+2a1+4a2)*(b0+2b1+4b2) in {t+2k+1, 2k+1}
     v2 < 49*B^k so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */
  TOOM3_MUL_REC (v2, c, c4 + 2, k1, trec);

  ASSERT(v2[k+k] < 49);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
                 v1

     {t, 2k+1} {t+2k+1, 2k + 1}
        vm1         v2
  */

  /* compute v0 := a0*b0 in {c, 2k} */
  TOOM3_MUL_REC (c, a, b, k, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
       v0        v1

     {t, 2k+1} {t+2k+1, 2k + 1}
        vm1         v2
  */

#define vinf (c+4*k)

  /* compute vinf := a2*b2 in {c4, r + r2}.  The low limb of vinf lands on
     c4[0], which still holds the top limb of v1, so that limb is saved
     around the call and the low limb of vinf is handed to the
     interpolation separately as vinf0.  */
  saved = c4[0];
  TOOM3_MUL_REC (c4, a + twok, b + twok, r, trec);
  vinf0 = c4[0];
  c4[0] = saved;

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
       v0        v1         {-}vinf

     {t, 2k+1} {t+2k+1, 2k + 1}
        vm1         v2

     vinf0 = {-}
  */

  mpn_toom3_interpolate (c, c2, v2, t, vinf, k, rr2, sa, vinf0, t4+2);

#undef v2
#undef vinf
}

/* Set R = U - V, where U is an unsigned integer and V is an mpf value.

   r  destination; its _mp_prec field bounds how many limbs are kept.
   u  unsigned integer minuend, handled below as a one-limb operand
      with exponent 1.
   v  subtrahend.

   u == 0, v == 0 and v < 0 are dispatched to mpf_neg, mpf_set_ui and
   mpf_add_ui respectively.  In the remaining case (u > 0, v > 0) the
   operand with the larger exponent is called "U" internally, `negate'
   records whether the roles were swapped, and the difference of the
   magnitudes is built in temporary space and copied to r.  */
void
mpf_ui_sub (mpf_ptr r, mpir_ui u, mpf_srcptr v)
{
  mp_srcptr up, vp;
  mp_ptr rp, tp;
  mp_size_t usize, vsize, rsize;
  mp_size_t prec;
  mp_exp_t uexp;
  mp_size_t ediff;
  int negate;
  mp_limb_t ulimb;
  TMP_DECL;

  vsize = v->_mp_size;

  /* Handle special cases that don't work in generic code below.  */
  if (u == 0)
    {
      mpf_neg (r, v);
      return;
    }
  if (vsize == 0)
    {
      mpf_set_ui (r, u);
      return;
    }

  /* If signs of U and V are different, perform addition.  */
  if (vsize < 0)
    {
      /* Shallow copy of V with the sign flipped; it shares V's limbs.  */
      __mpf_struct v_negated;
      v_negated._mp_size = -vsize;
      v_negated._mp_exp = v->_mp_exp;
      v_negated._mp_d = v->_mp_d;
      mpf_add_ui (r, &v_negated, u);
      return;
    }

  TMP_MARK;

  /* Signs are now known to be the same.  */

  ulimb = u;
  /* Make U be the operand with the largest exponent.  */
  if (1 < v->_mp_exp)
    {
      /* V has the larger exponent: swap roles, so the code below
         computes v - u and `negate' flips the sign at the end.  */
      negate = 1;
      usize = ABS (vsize);
      vsize = 1;
      up = v->_mp_d;
      vp = &ulimb;
      rp = r->_mp_d;
      /* NOTE(review): this branch uses _mp_prec + 1 while the other
         uses _mp_prec; the reason is not visible here -- confirm.  */
      prec = r->_mp_prec + 1;
      uexp = v->_mp_exp;
      ediff = uexp - 1;
    }
  else
    {
      negate = 0;
      usize = 1;
      vsize = ABS (vsize);
      up = &ulimb;
      vp = v->_mp_d;
      rp = r->_mp_d;
      prec = r->_mp_prec;
      uexp = 1;
      ediff = 1 - v->_mp_exp;
    }

  /* Ignore leading limbs in U and V that are equal.  Doing
     this helps increase the precision of the result.  */
  if (ediff == 0)
    {
      /* This loop normally exits immediately.  Optimize for that.  */
      for (;;)
        {
          usize--;
          vsize--;
          if (up[usize] != vp[vsize])
            break;
          uexp--;
          if (usize == 0)
            goto Lu0;
          if (vsize == 0)
            goto Lv0;
        }
      usize++;
      vsize++;
      /* Note that either operand (but not both operands) might now have
         leading zero limbs.  It matters only that U is unnormalized if
         vsize is now zero, and vice versa.  And it is only in that case
         that we have to adjust uexp.  */
      /* The labels sit inside the `if' bodies so that the gotos above
         enter the stripping loops directly.  */
      if (vsize == 0)
      Lv0:
        while (usize != 0 && up[usize - 1] == 0)
          usize--, uexp--;
      if (usize == 0)
      Lu0:
        while (vsize != 0 && vp[vsize - 1] == 0)
          vsize--, uexp--;
    }

  /* If U extends beyond PREC, ignore the part that does.  */
  if (usize > prec)
    {
      up += usize - prec;
      usize = prec;
    }

  /* If V extends beyond PREC, ignore the part that does.
     Note that this may make vsize negative.  */
  if (vsize + ediff > prec)
    {
      vp += vsize + ediff - prec;
      vsize = prec - ediff;
    }

  /* Allocate temp space for the result.  Allocate
     just vsize + ediff later???  */
  tp = (mp_ptr) TMP_ALLOC (prec * BYTES_PER_MP_LIMB);

  if (ediff >= prec)
    {
      /* V completely cancelled.  */
      /* NOTE(review): tp was just allocated, so this test looks always
         true; possibly `rp != up' was intended -- confirm.  */
      if (tp != up)
        MPN_COPY (rp, up, usize);
      rsize = usize;
    }
  else
    {
      /* Locate the least significant non-zero limb in (the needed
         parts of) U and V, to simplify the code below.  */
      for (;;)
        {
          if (vsize == 0)
            {
              /* Nothing left of V: the result is U.  */
              MPN_COPY (rp, up, usize);
              rsize = usize;
              goto done;
            }
          if (vp[0] != 0)
            break;
          vp++, vsize--;
        }
      for (;;)
        {
          if (usize == 0)
            {
              /* Nothing left of U: the result is V with opposite sign.  */
              MPN_COPY (rp, vp, vsize);
              rsize = vsize;
              negate ^= 1;
              goto done;
            }
          if (up[0] != 0)
            break;
          up++, usize--;
        }

      /* The possible alignments of the two operands (most significant
         limb on the left):  */
      /* uuuu     |  uuuu     |  uuuu     |  uuuu     |  uuuu    */
      /* vvvvvvv  |  vv       |    vvvvv  |    v      |       vv */

      if (usize > ediff)
        {
          /* U and V partially overlaps.  */
          if (ediff == 0)
            {
              /* Have to compare the leading limbs of u and v
                 to determine whether to compute u - v or v - u.  */
              if (usize > vsize)
                {
                  /* uuuu */
                  /* vv   */
                  int cmp;
                  cmp = mpn_cmp (up + usize - vsize, vp, vsize);
                  if (cmp >= 0)
                    {
                      mp_size_t size;
                      size = usize - vsize;
                      /* Low limbs of U have no counterpart in V.  */
                      MPN_COPY (tp, up, size);
                      mpn_sub_n (tp + size, up + size, vp, vsize);
                      rsize = usize;
                    }
                  else
                    {
                      /* vv   */  /* Swap U and V. */
                      /* uuuu */
                      mp_size_t size, i;
                      size = usize - vsize;
                      /* Negate the low limbs of U (lowest limb negated,
                         the rest complemented), then subtract one from
                         the high part to account for the borrow.  */
                      tp[0] = -up[0] & GMP_NUMB_MASK;
                      for (i = 1; i < size; i++)
                        tp[i] = ~up[i] & GMP_NUMB_MASK;
                      mpn_sub_n (tp + size, vp, up + size, vsize);
                      mpn_sub_1 (tp + size, tp + size, vsize, (mp_limb_t) 1);
                      negate ^= 1;
                      rsize = usize;
                    }
                }
              else if (usize < vsize)
                {
                  /* uuuu    */
                  /* vvvvvvv */
                  int cmp;
                  cmp = mpn_cmp (up, vp + vsize - usize, usize);
                  if (cmp > 0)
                    {
                      mp_size_t size, i;
                      size = vsize - usize;
                      /* Same negate-and-borrow scheme, applied to the
                         low limbs of V.  */
                      tp[0] = -vp[0] & GMP_NUMB_MASK;
                      for (i = 1; i < size; i++)
                        tp[i] = ~vp[i] & GMP_NUMB_MASK;
                      mpn_sub_n (tp + size, up, vp + size, usize);
                      mpn_sub_1 (tp + size, tp + size, usize, (mp_limb_t) 1);
                      rsize = vsize;
                    }
                  else
                    {
                      /* vvvvvvv */  /* Swap U and V. */
                      /* uuuu    */
                      /* This is the only place we can get 0.0.  */
                      mp_size_t size;
                      size = vsize - usize;
                      MPN_COPY (tp, vp, size);
                      mpn_sub_n (tp + size, vp + size, up, usize);
                      negate ^= 1;
                      rsize = vsize;
                    }
                }
              else
                {
                  /* uuuu */
                  /* vvvv */
                  int cmp;
                  /* usize == vsize here, so vp + vsize - usize is vp.  */
                  cmp = mpn_cmp (up, vp + vsize - usize, usize);
                  if (cmp > 0)
                    {
                      mpn_sub_n (tp, up, vp, usize);
                      rsize = usize;
                    }
                  else
                    {
                      mpn_sub_n (tp, vp, up, usize);
                      negate ^= 1;
                      rsize = usize;
                      /* can give zero */
                    }
                }
            }
          else
            {
              if (vsize + ediff <= usize)
                {
                  /* uuuu */
                  /*   v  */
                  mp_size_t size;
                  size = usize - ediff - vsize;
                  MPN_COPY (tp, up, size);
                  mpn_sub (tp + size, up + size, usize - size, vp, vsize);
                  rsize = usize;
                }
              else
                {
                  /* uuuu     */
                  /*   vvvvv  */
                  mp_size_t size, i;
                  size = vsize + ediff - usize;
                  tp[0] = -vp[0] & GMP_NUMB_MASK;
                  for (i = 1; i < size; i++)
                    tp[i] = ~vp[i] & GMP_NUMB_MASK;
                  mpn_sub (tp + size, up, usize, vp + size, usize - ediff);
                  mpn_sub_1 (tp + size, tp + size, usize, (mp_limb_t) 1);
                  rsize = vsize + ediff;
                }
            }
        }
      else
        {
          /* uuuu      */
          /*      vv   */
          mp_size_t size, i;
          size = vsize + ediff - usize;
          /* Negated V in the low limbs, all-ones filler across the gap
             between the operands, and U minus one on top.  */
          tp[0] = -vp[0] & GMP_NUMB_MASK;
          for (i = 1; i < vsize; i++)
            tp[i] = ~vp[i] & GMP_NUMB_MASK;
          for (i = vsize; i < size; i++)
            tp[i] = GMP_NUMB_MAX;
          mpn_sub_1 (tp + size, up, usize, (mp_limb_t) 1);
          rsize = size + usize;
        }

      /* Full normalize.  Optimize later.  */
      while (rsize != 0 && tp[rsize - 1] == 0)
        {
          rsize--;
          uexp--;
        }
      MPN_COPY (rp, tp, rsize);
    }

 done:
  /* Apply the sign recorded in `negate' and store the exponent.  */
  r->_mp_size = negate ? -rsize : rsize;
  r->_mp_exp = uexp;
  TMP_FREE;
}

/* Divide-and-conquer approximate quotient of {np, nn} by {dp, dn}.

   qp    receives the low nn - dn quotient limbs.
   np    numerator; it is overwritten while the quotient is computed.
   dp    divisor; the asserts require dn >= 6, nn >= dn + 3 and the
         high bit of dp[dn-1] to be set.
   dinv  precomputed inverse, passed through to the mpn_sb_* / mpn_dc_*
         routines called below.

   The return value qh is the top quotient limb: 1 when the high dn
   limbs of the numerator are >= the (possibly truncated) divisor,
   adjusted by any borrow out of the quotient in the correction loop.

   When the quotient is shorter than the divisor, only the top qn + 1
   divisor limbs are used.  The quotient is then split into a high part
   of sh limbs and a low part of sl limbs which are computed in turn,
   with a middle-product correction of the remainder in between.  */
mp_limb_t
mpn_dc_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
                  mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
{
  mp_size_t q_orig, qn, sh, sl, i;
  mp_limb_t qh, cy, cy2;
  mp_ptr tp;
  TMP_DECL;

  ASSERT (dn >= 6);
  ASSERT (nn >= dn + 3);
  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);

  qn = nn - dn;

  /* Truncate the divisor to its top qn + 1 limbs when the quotient is
     shorter than that.  */
  if (qn + 1 < dn)
    {
      dp += dn - (qn + 1);
      dn = qn + 1;
    }

  q_orig = qn;

  /* Top quotient limb: subtract the divisor once from the high part of
     the numerator if that part is not smaller.  */
  qh = mpn_cmp(np + nn - dn, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n(np + nn - dn, np + nn - dn, dp, dn);

  np += nn - dn - qn;
  nn = dn + qn;

  /* Reduce until dn - 1 >= qn */
  while (dn - 1 < qn)
    {
      sh = MIN(dn, qn - dn + 1);
      /* cy2 is assigned here but never read afterwards.  */
      if (sh <= DC_DIV_QR_THRESHOLD)
        cy2 = mpn_sb_div_qr(qp + qn - sh, np + nn - dn - sh, dn + sh, dp, dn, dinv);
      else
        cy2 = mpn_dc_div_qr(qp + qn - sh, np + nn - dn - sh, dn + sh, dp, dn, dinv);
      qn -= sh;
      nn -= sh;
    }

  cy = np[nn - 1];

  /* split into two parts */
  sh = qn/2;
  sl = qn - sh;

  /* Rare case where truncation ruins normalisation */
  if (cy > dp[dn - 1] || (cy == dp[dn - 1]
      && mpn_cmp(np + nn - qn, dp + dn - qn, qn - 1) >= 0))
    {
      __divappr_helper(qp, np + nn - qn - 2, dp + dn - qn - 1, qn);
      return qh;
    }

  /* High sh quotient limbs: the helper is used when the relevant part
     of the numerator is not below the divisor's, otherwise the
     schoolbook or recursive routine according to the cutoff.  */
  if (mpn_cmp(np + sl + dn - 1, dp + dn - sh - 1, sh + 1) >= 0)
    __divappr_helper(qp + sl, np + dn + sl - 2, dp + dn - sh - 1, sh);
  else
    {
      if (sh < SB_DIVAPPR_Q_CUTOFF)
        mpn_sb_divappr_q(qp + sl, np + sl, dn + sh, dp, dn, dinv);
      else
        mpn_dc_divappr_q(qp + sl, np + sl, dn + sh, dp, dn, dinv);
    }

  cy = np[nn - sh];

  TMP_MARK;
  tp = TMP_ALLOC_LIMBS(sl + 2);

  /* Subtract the middle product of the divisor and the high quotient
     part from the remainder; cy tracks the limb above it.  */
  mpn_mulmid(tp, dp + dn - qn - 1, qn - 1, qp + sl, sh);
  cy -= mpn_sub_n(np + nn - qn - 2, np + nn - qn - 2, tp, sl + 2);
  TMP_FREE;

  /* While the remainder is negative, decrement the high quotient part
     and add the divisor back.  */
  while ((mp_limb_signed_t) cy < 0)
    {
      qh -= mpn_sub_1(qp + sl, qp + sl, q_orig - sl, 1); /* ensure quotient is not too big */

      /* correct remainder, noting that "digits" of quotient aren't base B
         but in base varying with truncation, thus correction needs fixup */
      cy += mpn_add_n(np + nn - qn - 2, np + nn - qn - 2, dp + dn - sl - 2, sl + 2);

      for (i = 0; i < sh - 1 && qp[sl + i] == ~CNST_LIMB(0); i++)
        cy += mpn_add_1(np + nn - qn - 2, np + nn - qn - 2, sl + 2, dp[dn - sl - 3 - i]);
    }

  /* Low sl quotient limbs, chosen the same way as the high part.  */
  if (cy != 0) /* special case: unable to canonicalise */
    __divappr_helper(qp, np + nn - qn - 2, dp + dn - sl - 1, sl);
  else
    {
      if (mpn_cmp(np + dn - 1, dp + dn - sl - 1, sl + 1) >= 0)
        __divappr_helper(qp, np + dn - 2, dp + dn - sl - 1, sl);
      else
        {
          if (sl < SB_DIVAPPR_Q_CUTOFF)
            mpn_sb_divappr_q(qp, np, dn + sl, dp, dn, dinv);
          else
            mpn_dc_divappr_q(qp, np, dn + sl, dp, dn, dinv);
        }
    }

  return qh;
}

/* Computes an approximate quotient of { np, 2*dn } by { dp, dn } which is
   either correct or one too large. We require dp to be normalised and inv
   to be a precomputed inverse given by mpn_invert.

   qp   receives the dn low limbs of the quotient.
   np   numerator; its high dn limbs are reduced in place when they are
        not smaller than the divisor.
   dp   divisor, dn limbs, high bit of dp[dn-1] set (asserted).
   inv  dn-limb inverse (checked with mpn_is_invert under ASSERT).

   The return value is the limb above { qp, dn } (ret + ret2).  */
mp_limb_t
mpn_inv_divappr_q_n(mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t dn, mp_srcptr inv)
{
   mp_limb_t cy, lo, ret = 0, ret2 = 0;
   mp_ptr tp;
   TMP_DECL;

   TMP_MARK;

   ASSERT(dp[dn-1] & GMP_LIMB_HIGHBIT);
   ASSERT(mpn_is_invert(inv, dp, dn));

   /* If the high half of the numerator is >= D, subtract D once and
      account for it in ret2.  */
   if (mpn_cmp(np + dn, dp, dn) >= 0)
   {
      ret2 = 1;
      mpn_sub_n(np + dn, np + dn, dp, dn);
   }

   /* Multiply the top dn + 1 numerator limbs by inv; the quotient
      estimate is the high part of that product plus the high half of
      the numerator, plus cy + 1.  */
   tp = TMP_ALLOC_LIMBS(2*dn + 1);
   mpn_mul(tp, np + dn - 1, dn + 1, inv, dn);
   add_ssaaaa(cy, lo, 0, np[dn - 1], 0, tp[dn]);
   ret += mpn_add_n(qp, tp + dn + 1, np + dn, dn);
   ret += mpn_add_1(qp, qp, dn, cy + 1);

   /* Let X = B^dn + inv, D = { dp, dn }, N = { np, 2*dn }, then
      DX < B^{2*dn} <= D(X+1), thus
      Let N' = { np + dn - 1, dn + 1 }
      N'X/B^{dn+1} < B^{dn-1}N'/D <= N'X/B^{dn+1} + N'/B^{dn+1} < N'X/B^{dn+1} + 1
      N'X/B^{dn+1} < N/D <= N'X/B^{dn+1} + 1 + 2/B
      There is either one integer in this range, or two. However, in the latter case
      the left hand bound is either an integer or < 2/B below one. */

   /* The estimate overflowed dn limbs: step it back by one.  */
   if (UNLIKELY(ret == 1))
   {
      ret -= mpn_sub_1(qp, qp, dn, 1);
      ASSERT(ret == 0);
   }

   /* lo is all ones or all ones minus one: the borderline case of the
      bound above, so the quotient is verified by multiplication.  */
   if (UNLIKELY((lo == ~CNST_LIMB(0)) || (lo == ~CNST_LIMB(1))))
   {
      /* Special case, multiply out to get accurate quotient */
      ret -= mpn_sub_1(qp, qp, dn, 1);
      if (UNLIKELY(ret == ~CNST_LIMB(0)))
         ret += mpn_add_1(qp, qp, dn, 1);
      /* ret is now guaranteed to be 0 */
      ASSERT(ret == 0);

      /* tp = low dn + 1 limbs of N - Q*D; increment Q while that
         remainder is still >= D.  */
      mpn_mul_n(tp, qp, dp, dn);
      mpn_sub_n(tp, np, tp, dn+1);
      while (tp[dn] || mpn_cmp(tp, dp, dn) >= 0)
      {
         ret += mpn_add_1(qp, qp, dn, 1);
         tp[dn] -= mpn_sub_n(tp, tp, dp, dn);
      }

      /* Not possible for ret == 2 as we have qp*dp <= np */
      ASSERT(ret + ret2 < 2);
   }

   TMP_FREE;

   return ret + ret2;
}

/* Set z to a ball containing log(x), computed to about prec bits.

   Dispatch, in order:
     - special x: +inf gives +inf, anything else is indeterminate;
     - x with the sign bit set: indeterminate;
     - x an exact power of two: (exponent - 1) * log(2), or exactly zero
       when the exponent is one;
     - exponent too large for a machine word: arb_log_arf_huge;
     - x very close to 1: one or two terms of the Taylor series of
       log(1+t), with an explicit truncation bound as radius;
     - working precision above the table precision: MPFR fallback;
     - otherwise: fixed-point evaluation using two table-based argument
       reductions, an atanh Taylor series, and a multiple of log(2).

   In the fixed-point path `error' counts the accumulated error in units
   of 2^(-wn * FLINT_BITS); it becomes the radius of z.  */
void
arb_log_arf(arb_t z, const arf_t x, slong prec)
{
    if (arf_is_special(x))
    {
        if (arf_is_pos_inf(x))
            arb_pos_inf(z);
        else
            arb_indeterminate(z);
    }
    else if (ARF_SGNBIT(x))
    {
        arb_indeterminate(z);
    }
    else if (ARF_IS_POW2(x))
    {
        if (fmpz_is_one(ARF_EXPREF(x)))
        {
            arb_zero(z);
        }
        else
        {
            fmpz_t exp;
            fmpz_init(exp);
            _fmpz_add_fast(exp, ARF_EXPREF(x), -1);
            arb_const_log2(z, prec + 2);
            arb_mul_fmpz(z, z, exp, prec);
            fmpz_clear(exp);
        }
    }
    else if (COEFF_IS_MPZ(*ARF_EXPREF(x)))
    {
        arb_log_arf_huge(z, x, prec);
    }
    else
    {
        slong exp, wp, wn, N, r, closeness_to_one;
        mp_srcptr xp;
        mp_size_t xn, tn;
        mp_ptr tmp, w, t, u;
        mp_limb_t p1, q1bits, p2, q2bits, error, error2, cy;
        int negative, inexact, used_taylor_series;
        TMP_INIT;

        exp = ARF_EXP(x);
        negative = 0;

        ARF_GET_MPN_READONLY(xp, xn, x);

        /* compute a c >= 0 such that |x-1| <= 2^(-c) if c > 0 */
        closeness_to_one = 0;

        if (exp == 0)
        {
            slong i;

            /* x is just below 1: count the leading one bits.  */
            closeness_to_one = FLINT_BITS - FLINT_BIT_COUNT(~xp[xn - 1]);

            if (closeness_to_one == FLINT_BITS)
            {
                for (i = xn - 2; i > 0 && xp[i] == LIMB_ONES; i--)
                    closeness_to_one += FLINT_BITS;

                /* With a single all-ones limb (xn == 1) the loop leaves
                   i == -1; there is no further limb to inspect and
                   xp[-1] must not be read.  */
                if (i >= 0)
                    closeness_to_one += (FLINT_BITS - FLINT_BIT_COUNT(~xp[i]));
            }
        }
        else if (exp == 1)
        {
            /* x is just above 1: count the zero bits after the top bit.  */
            closeness_to_one = FLINT_BITS - FLINT_BIT_COUNT(xp[xn - 1] & (~LIMB_TOP));

            if (closeness_to_one == FLINT_BITS)
            {
                slong i;

                /* x is not a power of two here, so a lower limb is
                   nonzero and the scan stops on it.  */
                for (i = xn - 2; xp[i] == 0; i--)
                    closeness_to_one += FLINT_BITS;

                closeness_to_one += (FLINT_BITS - FLINT_BIT_COUNT(xp[i]));
            }

            closeness_to_one--;
        }

        /* if |t-1| <= 0.5               */
        /* |log(1+t) - t| <= t^2         */
        /* |log(1+t) - (t-t^2/2)| <= t^3 */
        if (closeness_to_one > prec + 1)
        {
            /* log(x) ~ x - 1, radius 2^(-2c).  */
            inexact = arf_sub_ui(arb_midref(z), x, 1, prec, ARB_RND);
            mag_set_ui_2exp_si(arb_radref(z), 1, -2 * closeness_to_one);
            if (inexact)
                arf_mag_add_ulp(arb_radref(z), arb_radref(z), arb_midref(z), prec);
            return;
        }
        else if (2 * closeness_to_one > prec + 1)
        {
            /* log(x) ~ t - t^2/2 with t = x - 1, radius 2^(-3c).  */
            arf_t t, u;
            arf_init(t);
            arf_init(u);
            arf_sub_ui(t, x, 1, ARF_PREC_EXACT, ARF_RND_DOWN);
            arf_mul(u, t, t, ARF_PREC_EXACT, ARF_RND_DOWN);
            arf_mul_2exp_si(u, u, -1);
            inexact = arf_sub(arb_midref(z), t, u, prec, ARB_RND);
            mag_set_ui_2exp_si(arb_radref(z), 1, -3 * closeness_to_one);
            if (inexact)
                arf_mag_add_ulp(arb_radref(z), arb_radref(z), arb_midref(z), prec);
            arf_clear(t);
            arf_clear(u);
            return;
        }

        /* Absolute working precision (NOT rounded to a limb multiple) */
        wp = prec + closeness_to_one + 5;

        /* Too high precision to use table */
        if (wp > ARB_LOG_TAB2_PREC)
        {
            arf_log_via_mpfr(arb_midref(z), x, prec, ARB_RND);
            arf_mag_set_ulp(arb_radref(z), arb_midref(z), prec);
            return;
        }

        /* Working precision in limbs */
        wn = (wp + FLINT_BITS - 1) / FLINT_BITS;

        TMP_START;

        tmp = TMP_ALLOC_LIMBS(4 * wn + 3);
        w = tmp;        /* requires wn+1 limbs */
        t = w + wn + 1; /* requires wn+1 limbs */
        u = t + wn + 1; /* requires 2wn+1 limbs */

        /* read x-1 */
        if (xn <= wn)
        {
            flint_mpn_zero(w, wn - xn);
            mpn_lshift(w + wn - xn, xp, xn, 1);
            error = 0;
        }
        else
        {
            /* Low limbs of x are dropped: one unit of error.  */
            mpn_lshift(w, xp + xn - wn, wn, 1);
            error = 1;
        }

        /* First table-based argument reduction */
        if (wp <= ARB_LOG_TAB1_PREC)
            q1bits = ARB_LOG_TAB11_BITS;
        else
            q1bits = ARB_LOG_TAB21_BITS;

        p1 = w[wn-1] >> (FLINT_BITS - q1bits);

        /* Special case: covers logarithms of small integers */
        if (xn == 1 && (w[wn-1] == (p1 << (FLINT_BITS - q1bits))))
        {
            p2 = 0;
            flint_mpn_zero(t, wn);
            used_taylor_series = 0;
            N = r = 0; /* silence compiler warning */
        }
        else
        {
            /* log(1+w) = log(1+p/q) + log(1 + (qw-p)/(p+q)) */
            w[wn] = mpn_mul_1(w, w, wn, UWORD(1) << q1bits) - p1;
            mpn_divrem_1(w, 0, w, wn + 1, p1 + (UWORD(1) << q1bits));
            error += 1;

            /* Second table-based argument reduction
               (fused with log->atanh conversion) */
            if (wp <= ARB_LOG_TAB1_PREC)
                q2bits = ARB_LOG_TAB11_BITS + ARB_LOG_TAB12_BITS;
            else
                q2bits = ARB_LOG_TAB21_BITS + ARB_LOG_TAB22_BITS;

            p2 = w[wn-1] >> (FLINT_BITS - q2bits);

            u[2 * wn] = mpn_lshift(u + wn, w, wn, q2bits);
            flint_mpn_zero(u, wn);
            flint_mpn_copyi(t, u + wn, wn + 1);
            t[wn] += p2 + (UWORD(1) << (q2bits + 1));
            u[2 * wn] -= p2;
            mpn_tdiv_q(w, u, 2 * wn + 1, t, wn + 1);

            /* propagated error from 1 ulp error: 2 atanh'(1/3) = 2.25 */
            error += 3;

            /* |w| <= 2^-r */
            r = _arb_mpn_leading_zeros(w, wn);

            /* N >= (wp-r)/(2r) */
            N = (wp - r + (2*r-1)) / (2*r);
            N = FLINT_MAX(N, 0);

            /* Evaluate Taylor series */
            _arb_atan_taylor_rs(t, &error2, w, wn, N, 0);
            /* Multiply by 2 */
            mpn_lshift(t, t, wn, 1);
            /* Taylor series evaluation error (multiply by 2) */
            error += error2 * 2;

            used_taylor_series = 1;
        }

        /* Size of output number */
        tn = wn;

        /* First table lookup */
        if (p1 != 0)
        {
            if (wp <= ARB_LOG_TAB1_PREC)
                mpn_add_n(t, t, arb_log_tab11[p1] + ARB_LOG_TAB1_LIMBS - tn, tn);
            else
                mpn_add_n(t, t, arb_log_tab21[p1] + ARB_LOG_TAB2_LIMBS - tn, tn);
            error++;
        }

        /* Second table lookup */
        if (p2 != 0)
        {
            if (wp <= ARB_LOG_TAB1_PREC)
                mpn_add_n(t, t, arb_log_tab12[p2] + ARB_LOG_TAB1_LIMBS - tn, tn);
            else
                mpn_add_n(t, t, arb_log_tab22[p2] + ARB_LOG_TAB2_LIMBS - tn, tn);
            error++;
        }

        /* add exp * log(2) */
        exp--;

        if (exp > 0)
        {
            cy = mpn_addmul_1(t, arb_log_log2_tab + ARB_LOG_TAB2_LIMBS - tn, tn, exp);
            t[tn] = cy;
            tn += (cy != 0);
            error += exp;
        }
        else if (exp < 0)
        {
            /* Form |exp| * log(2) in u and subtract the smaller value
               from the larger, recording the sign.  */
            t[tn] = 0;
            u[tn] = mpn_mul_1(u, arb_log_log2_tab + ARB_LOG_TAB2_LIMBS - tn, tn, -exp);

            if (mpn_cmp(t, u, tn + 1) >= 0)
            {
                mpn_sub_n(t, t, u, tn + 1);
            }
            else
            {
                mpn_sub_n(t, u, t, tn + 1);
                negative = 1;
            }

            error += (-exp);

            tn += (t[tn] != 0);
        }

        /* The accumulated arithmetic error */
        mag_set_ui_2exp_si(arb_radref(z), error, -wn * FLINT_BITS);

        /* Truncation error from the Taylor series */
        if (used_taylor_series)
            mag_add_ui_2exp_si(arb_radref(z), arb_radref(z), 1, -r*(2*N+1) + 1);

        /* Set the midpoint */
        inexact = _arf_set_mpn_fixed(arb_midref(z), t, tn, wn, negative, prec);
        if (inexact)
            arf_mag_add_ulp(arb_radref(z), arb_radref(z), arb_midref(z), prec);

        TMP_END;
    }
}

/* Store U + V in SUM.

   The `size' field of each operand carries the sign: a negative size
   marks a negative value and its absolute value is the limb count.
   SUM may be the same object as U or V.

   The operands are first ordered so that U has at least as many limbs
   as V.  Equal signs add the magnitudes; different signs subtract the
   smaller magnitude from the larger.  The stored size is abs_usize plus
   whatever mpn_add / mpn_sub return.
   NOTE(review): that relies on the return convention of the mpn_sub
   this file links against -- confirm.  */
void
_gst_mpz_add (gst_mpz *sum, const gst_mpz *u, const gst_mpz *v)
{
  mp_size_t usize = u->size;
  mp_size_t vsize = v->size;
  mp_size_t abs_usize = ABS (usize);
  mp_size_t abs_vsize = ABS (vsize);
  mp_size_t sumsize;
  mp_srcptr up, vp;
  mp_ptr sump;
  int result_negative;

  /* Make U the operand with the larger limb count.  */
  if (abs_usize < abs_vsize)
    {
      const gst_mpz *other = u;
      mp_size_t other_size = usize;
      mp_size_t other_abs = abs_usize;

      u = v;
      v = other;
      usize = vsize;
      vsize = other_size;
      abs_usize = abs_vsize;
      abs_vsize = other_abs;
    }

  /* Make room for the longer operand plus a possible carry limb.  */
  sumsize = abs_usize + 1;
  if (sum->alloc < sumsize)
    gst_mpz_realloc (sum, sumsize);

  /* Fetch the limb pointers only after the reallocation, since U or V
     may be the same object as SUM.  */
  up = u->d;
  vp = v->d;
  sump = sum->d;

  if ((usize >= 0) == (vsize >= 0))
    {
      /* Same sign: add the magnitudes and keep the common sign.  */
      sumsize = mpn_add (sump, up, abs_usize, vp, abs_vsize);
      if (sumsize != 0)
        sump[abs_usize] = 1;
      sumsize = sumsize + abs_usize;
      result_negative = usize < 0;
    }
  else if (abs_usize == abs_vsize && mpn_cmp (up, vp, abs_usize) < 0)
    {
      /* Different signs and |U| < |V|: compute |V| - |U|; the result
         takes V's sign.  */
      sumsize = abs_usize + mpn_sub (sump, vp, abs_usize, up, abs_usize);
      result_negative = usize >= 0;
    }
  else
    {
      /* Different signs and |U| >= |V|: compute |U| - |V|; the result
         takes U's sign.  */
      sumsize = abs_usize + mpn_sub (sump, up, abs_usize, vp, abs_vsize);
      result_negative = usize < 0;
    }

  if (result_negative)
    sumsize = -sumsize;

  sum->size = sumsize;
}

int main(void) { mp_bitcnt_t depth, w; gmp_randstate_t state; tests_start(); fflush(stdout); gmp_randinit_default(state); for (depth = 6; depth <= 12; depth++) { for (w = 1; w <= 5; w++) { mp_size_t n = (((mp_limb_t)1)<<depth); mp_limb_t trunc; mp_size_t limbs = (n*w)/GMP_LIMB_BITS; mp_size_t size = limbs + 1; mp_size_t i; mp_limb_t * ptr; mp_limb_t ** ii, ** jj, * t1, * t2, * s1; mpn_rrandom(&trunc, state, 1); trunc = 2*n + trunc % (2 * n) + 1; trunc = 2*((trunc + 1)/2); ii = malloc((4*(n + n*size) + 3*size)*sizeof(mp_limb_t)); for (i = 0, ptr = (mp_limb_t *) ii + 4*n; i < 4*n; i++, ptr += size) { ii[i] = ptr; mpir_random_fermat(ii[i], state, limbs); } t1 = ptr; t2 = t1 + size; s1 = t2 + size; for (i = 0; i < 4*n; i++) mpn_normmod_2expp1(ii[i], limbs); jj = malloc(4*(n + n*size)*sizeof(mp_limb_t)); for (i = 0, ptr = (mp_limb_t *) jj + 4*n; i < 4*n; i++, ptr += size) { jj[i] = ptr; mpn_copyi(jj[i], ii[i], size); } mpir_fft_trunc_sqrt2(ii, n, w, &t1, &t2, &s1, trunc); mpir_ifft_trunc_sqrt2(ii, n, w, &t1, &t2, &s1, trunc); for (i = 0; i < trunc; i++) { mpn_div_2expmod_2expp1(ii[i], ii[i], limbs, depth + 2); mpn_normmod_2expp1(ii[i], limbs); } for (i = 0; i < trunc; i++) { if (mpn_cmp(ii[i], jj[i], size) != 0) { printf("FAIL:\n"); printf("n = %ld, trunc = %ld\n", n, trunc); printf("Error in entry %ld\n", i); abort(); } } free(ii); free(jj); } } gmp_randclear(state); tests_end(); return 0; }

/* Multiply {up, un} by {vp, vn} and write the result to {rp, un + vn}
   assuming vn > 3*ceil(un/4).

   Note that rp gets un + vn limbs stored, even if the actual
   result only needs un + vn - 1.

   Both operands are split into four pieces of sn = ceil(un/4) limbs
   (the top pieces have h1 and h2 limbs).  Seven pointwise products are
   formed, tagged below with their evaluation points (1, -1, 1/2, -1/2,
   2, oo, 0), and mpn_toom4_interpolate combines them into rp.

   The r1..r7 areas and the MUL_TC4, MUL_TC4_UNSIGNED and TC4_DENORM
   macros are defined outside this function.  */
void
mpn_toom4_mul (mp_ptr rp, mp_srcptr up, mp_size_t un,
               mp_srcptr vp, mp_size_t vn)
{
   mp_size_t ind;
   mp_limb_t cy, cy2, r30, r31;
   mp_ptr tp;
   mp_size_t sn, n1, n2, n3, n4, n5, n6, n7, n8, rpn, t4, h1, h2;
   TMP_DECL;

   sn = (un + 3) / 4;

   /* Lengths of the top pieces of U and V.  */
   h1 = un - 3*sn;
   h2 = vn - 3*sn;

   ASSERT (vn > 3*sn);

#define a0 (up)
#define a1 (up + sn)
#define a2 (up + 2*sn)
#define a3 (up + 3*sn)
#define b0 (vp)
#define b1 (vp + sn)
#define b2 (vp + 2*sn)
#define b3 (vp + 3*sn)

   t4 = 2*sn+2; /* allows mult of 2 integers of sn + 1 limbs */

   TMP_MARK;

   tp = TMP_ALLOC_LIMBS(4*t4 + 5*(sn + 1));

   /* Five evaluation buffers of sn + 1 limbs each, placed after the
      first 4*t4 limbs of tp.  */
#define u2 (tp + 4*t4)
#define u3 (tp + 4*t4 + (sn+1))
#define u4 (tp + 4*t4 + 2*(sn+1))
#define u5 (tp + 4*t4 + 3*(sn+1))
#define u6 (tp + 4*t4 + 4*(sn+1))

   /* u3 = (a0 + a2) + (a1 + a3), u4 = |(a0 + a2) - (a1 + a3)|;
      the sign of n4 records the sign of the difference.  */
   u6[sn] = mpn_add(u6, a1, sn, a3, h1);
   u5[sn] = mpn_add_n(u5, a2, a0, sn);
   mpn_add_n(u3, u5, u6, sn + 1);
   n4 = sn + 1;
   if (mpn_cmp(u5, u6, sn + 1) >= 0)
      mpn_sub_n(u4, u5, u6, sn + 1);
   else
   {
      mpn_sub_n(u4, u6, u5, sn + 1);
      n4 = -n4;
   }

   /* The same for V: the sum goes to r2, the difference to u5 with its
      sign in n5.  */
   u6[sn] = mpn_add(u6, b1, sn, b3, h2);
   u5[sn] = mpn_add_n(u5, b2, b0, sn);
   mpn_add_n(r2, u5, u6, sn + 1);
   n5 = sn + 1;
   if (mpn_cmp(u5, u6, sn + 1) >= 0)
      mpn_sub_n(u5, u5, u6, sn + 1);
   else
   {
      mpn_sub_n(u5, u6, u5, sn + 1);
      n5 = -n5;
   }

   MUL_TC4_UNSIGNED(r3, n3, u3, sn + 1, r2, sn + 1); /* 1 */
   MUL_TC4(r4, n4, u4, n4, u5, n5); /* -1 */

   /* r1 = 8*a0 + 2*a2 and r2 = 4*a1 + a3; their sum goes to u5 and the
      absolute difference to u6, with its sign in n6.  */
#if HAVE_NATIVE_mpn_addlsh_n
   r1[sn] = mpn_addlsh_n(r1, a2, a0, sn, 2);
   mpn_lshift(r1, r1, sn + 1, 1);
   cy = mpn_addlsh_n(r2, a3, a1, h1, 2);
#else
   r1[sn] = mpn_lshift(r1, a2, sn, 1);
   MPN_COPY(r2, a3, h1);
   r1[sn] += mpn_addmul_1(r1, a0, sn, 8);
   cy = mpn_addmul_1(r2, a1, h1, 4);
#endif
   if (sn > h1)
   {
      cy2 = mpn_lshift(r2 + h1, a1 + h1, sn - h1, 2);
      cy = cy2 + mpn_add_1(r2 + h1, r2 + h1, sn - h1, cy);
   }
   r2[sn] = cy;
   mpn_add_n(u5, r1, r2, sn + 1);
   n6 = sn + 1;
   if (mpn_cmp(r1, r2, sn + 1) >= 0)
      mpn_sub_n(u6, r1, r2, sn + 1);
   else
   {
      mpn_sub_n(u6, r2, r1, sn + 1);
      n6 = -n6;
   }

   /* The same combination for V: the sum goes to u2, the absolute
      difference to r2 with its sign in n8.  */
#if HAVE_NATIVE_mpn_addlsh_n
   r1[sn] = mpn_addlsh_n(r1, b2, b0, sn, 2);
   mpn_lshift(r1, r1, sn + 1, 1);
   cy = mpn_addlsh_n(r2, b3, b1, h2, 2);
#else
   r1[sn] = mpn_lshift(r1, b2, sn, 1);
   MPN_COPY(r2, b3, h2);
   r1[sn] += mpn_addmul_1(r1, b0, sn, 8);
   cy = mpn_addmul_1(r2, b1, h2, 4);
#endif
   if (sn > h2)
   {
      cy2 = mpn_lshift(r2 + h2, b1 + h2, sn - h2, 2);
      cy = cy2 + mpn_add_1(r2 + h2, r2 + h2, sn - h2, cy);
   }
   r2[sn] = cy;
   mpn_add_n(u2, r1, r2, sn + 1);
   n8 = sn + 1;
   if (mpn_cmp(r1, r2, sn + 1) >= 0)
      mpn_sub_n(r2, r1, r2, sn + 1);
   else
   {
      mpn_sub_n(r2, r2, r1, sn + 1);
      n8 = -n8;
   }

   /* The two low limbs of r3 are saved around the next products:
      r3[1] is restored right after them and r30 is handed to the
      interpolation.  */
   r30 = r3[0];
   r31 = r3[1];
   MUL_TC4_UNSIGNED(r5, n5, u5, sn + 1, u2, sn + 1); /* 1/2 */
   MUL_TC4(r6, n6, u6, n6, r2, n8); /* -1/2 */
   r3[1] = r31;

   /* u2 = a0 + 2*a1 + 4*a2 + 8*a3 */
#if HAVE_NATIVE_mpn_addlsh1_n
   cy = mpn_addlsh1_n(u2, a2, a3, h1);
   if (sn > h1)
      cy = mpn_add_1(u2 + h1, a2 + h1, sn - h1, cy);
   u2[sn] = cy;
   u2[sn] = 2*u2[sn] + mpn_addlsh1_n(u2, a1, u2, sn);
   u2[sn] = 2*u2[sn] + mpn_addlsh1_n(u2, a0, u2, sn);
#else
   MPN_COPY(u2, a0, sn);
   u2[sn] = mpn_addmul_1(u2, a1, sn, 2);
   u2[sn] += mpn_addmul_1(u2, a2, sn, 4);
   cy = mpn_addmul_1(u2, a3, h1, 8);
   if (sn > h1)
      cy = mpn_add_1(u2 + h1, u2 + h1, sn - h1, cy);
   u2[sn] += cy;
#endif

   /* r1 = b0 + 2*b1 + 4*b2 + 8*b3 */
#if HAVE_NATIVE_mpn_addlsh1_n
   cy = mpn_addlsh1_n(r1, b2, b3, h2);
   if (sn > h2)
      cy = mpn_add_1(r1 + h2, b2 + h2, sn - h2, cy);
   r1[sn] = cy;
   r1[sn] = 2*r1[sn] + mpn_addlsh1_n(r1, b1, r1, sn);
   r1[sn] = 2*r1[sn] + mpn_addlsh1_n(r1, b0, r1, sn);
#else
   MPN_COPY(r1, b0, sn);
   r1[sn] = mpn_addmul_1(r1, b1, sn, 2);
   r1[sn] += mpn_addmul_1(r1, b2, sn, 4);
   cy = mpn_addmul_1(r1, b3, h2, 8);
   if (sn > h2)
      cy = mpn_add_1(r1 + h2, r1 + h2, sn - h2, cy);
   r1[sn] += cy;
#endif

   MUL_TC4_UNSIGNED(r2, n2, u2, sn + 1, r1, sn + 1); /* 2 */

   MUL_TC4_UNSIGNED(r1, n1, a3, h1, b3, h2); /* oo */
   MUL_TC4_UNSIGNED(r7, n7, a0, sn, b0, sn); /* 0 */

   TC4_DENORM(r1, n1, t4 - 1);

   /* Placement of the products at this point (schematic only; the
      horizontal alignment of the original diagram is not preserved):

      rp        rp1       rp2       rp3       rp4       rp5       rp6       rp7
      <----------- r7-----------><------------r5-------------->
      <-------------r3------------->
      <-------------r6-------------> < -----------r2------------>{ }
      <-------------r4--------------> <--------------r1---->
   */

   mpn_toom4_interpolate(rp, &rpn, sn, tp, t4 - 1, n4, n6, r30);

   /* Zero-pad so that exactly un + vn limbs are written.  */
   if (rpn != un + vn)
   {
      MPN_ZERO((rp + rpn), un + vn - rpn);
   }

   TMP_FREE;
}

/* if approx is non-zero, does not compute the final remainder */ static mp_size_t mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un, mp_limb_t k, int approx) { mp_ptr qp, rp, sp, wp, scratch; mp_size_t qn, rn, sn, wn, nl, bn; mp_limb_t save, save2, cy; unsigned long int unb; /* number of significant bits of {up,un} */ unsigned long int xnb; /* number of significant bits of the result */ unsigned long b, kk; unsigned long sizes[GMP_NUMB_BITS + 1]; int ni, i; int c; int logk; TMP_DECL; TMP_MARK; if (remp == NULL) { rp = TMP_ALLOC_LIMBS (un + 1); /* will contain the remainder */ scratch = rp; /* used by mpn_div_q */ } else { scratch = TMP_ALLOC_LIMBS (un + 1); /* used by mpn_div_q */ rp = remp; } sp = rootp; MPN_SIZEINBASE_2EXP(unb, up, un, 1); /* unb is the number of bits of the input U */ xnb = (unb - 1) / k + 1; /* ceil (unb / k) */ /* xnb is the number of bits of the root R */ if (xnb == 1) /* root is 1 */ { if (remp == NULL) remp = rp; mpn_sub_1 (remp, up, un, (mp_limb_t) 1); MPN_NORMALIZE (remp, un); /* There should be at most one zero limb, if we demand u to be normalized */ rootp[0] = 1; TMP_FREE; return un; } /* We initialize the algorithm with a 1-bit approximation to zero: since we know the root has exactly xnb bits, we write r0 = 2^(xnb-1), so that r0^k = 2^(k*(xnb-1)), that we subtract to the input. 
*/ kk = k * (xnb - 1); /* number of truncated bits in the input */ rn = un - kk / GMP_NUMB_BITS; /* number of limbs of the non-truncated part */ MPN_RSHIFT (cy, rp, up + kk / GMP_NUMB_BITS, rn, kk % GMP_NUMB_BITS); mpn_sub_1 (rp, rp, rn, 1); /* subtract the initial approximation: since the non-truncated part is less than 2^k, it is <= k bits: rn <= ceil(k/GMP_NUMB_BITS) */ sp[0] = 1; /* initial approximation */ sn = 1; /* it has one limb */ for (logk = 1; ((k - 1) >> logk) != 0; logk++) ; /* logk = ceil(log(k)/log(2)) */ b = xnb - 1; /* number of remaining bits to determine in the kth root */ ni = 0; while (b != 0) { /* invariant: here we want b+1 total bits for the kth root */ sizes[ni] = b; /* if c is the new value of b, this means that we'll go from a root of c+1 bits (say s') to a root of b+1 bits. It is proved in the book "Modern Computer Arithmetic" from Brent and Zimmermann, Chapter 1, that if s' >= k*beta, then at most one correction is necessary. Here beta = 2^(b-c), and s' >= 2^c, thus it suffices that c >= ceil((b + log2(k))/2). */ b = (b + logk + 1) / 2; if (b >= sizes[ni]) b = sizes[ni] - 1; /* add just one bit at a time */ ni++; } sizes[ni] = 0; ASSERT_ALWAYS (ni < GMP_NUMB_BITS + 1); /* We have sizes[0] = b > sizes[1] > ... > sizes[ni] = 0 with sizes[i] <= 2 * sizes[i+1]. Newton iteration will first compute sizes[ni-1] extra bits, then sizes[ni-2], ..., then sizes[0] = b. */ /* qp and wp need enough space to store S'^k where S' is an approximate root. Since S' can be as large as S+2, the worst case is when S=2 and S'=4. But then since we know the number of bits of S in advance, S' can only be 3 at most. Similarly for S=4, then S' can be 6 at most. So the worst case is S'/S=3/2, thus S'^k <= (3/2)^k * S^k. Since S^k fits in un limbs, the number of extra limbs needed is bounded by ceil(k*log2(3/2)/GMP_NUMB_BITS). 
*/ #define EXTRA 2 + (mp_size_t) (0.585 * (double) k / (double) GMP_NUMB_BITS) qp = TMP_ALLOC_LIMBS (un + EXTRA); /* will contain quotient and remainder of R/(k*S^(k-1)), and S^k */ wp = TMP_ALLOC_LIMBS (un + EXTRA); /* will contain S^(k-1), k*S^(k-1), and temporary for mpn_pow_1 */ wp[0] = 1; /* {sp,sn}^(k-1) = 1 */ wn = 1; for (i = ni; i != 0; i--) { /* 1: loop invariant: {sp, sn} is the current approximation of the root, which has exactly 1 + sizes[ni] bits. {rp, rn} is the current remainder {wp, wn} = {sp, sn}^(k-1) kk = number of truncated bits of the input */ b = sizes[i - 1] - sizes[i]; /* number of bits to compute in that iteration */ /* Reinsert a low zero limb if we normalized away the entire remainder */ if (rn == 0) { rp[0] = 0; rn = 1; } /* first multiply the remainder by 2^b */ MPN_LSHIFT (cy, rp + b / GMP_NUMB_BITS, rp, rn, b % GMP_NUMB_BITS); rn = rn + b / GMP_NUMB_BITS; if (cy != 0) { rp[rn] = cy; rn++; } kk = kk - b; /* 2: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */ /* Now insert bits [kk,kk+b-1] from the input U */ bn = b / GMP_NUMB_BITS; /* lowest limb from high part of rp[] */ save = rp[bn]; /* nl is the number of limbs in U which contain bits [kk,kk+b-1] */ nl = 1 + (kk + b - 1) / GMP_NUMB_BITS - (kk / GMP_NUMB_BITS); /* nl = 1 + floor((kk + b - 1) / GMP_NUMB_BITS) - floor(kk / GMP_NUMB_BITS) <= 1 + (kk + b - 1) / GMP_NUMB_BITS - (kk - GMP_NUMB_BITS + 1) / GMP_NUMB_BITS = 2 + (b - 2) / GMP_NUMB_BITS thus since nl is an integer: nl <= 2 + floor(b/GMP_NUMB_BITS) <= 2 + bn. */ /* we have to save rp[bn] up to rp[nl-1], i.e. 
1 or 2 limbs */ if (nl - 1 > bn) save2 = rp[bn + 1]; MPN_RSHIFT (cy, rp, up + kk / GMP_NUMB_BITS, nl, kk % GMP_NUMB_BITS); /* set to zero high bits of rp[bn] */ rp[bn] &= ((mp_limb_t) 1 << (b % GMP_NUMB_BITS)) - 1; /* restore corresponding bits */ rp[bn] |= save; if (nl - 1 > bn) rp[bn + 1] = save2; /* the low b bits go in rp[0..bn] only, since they start by bit 0 in rp[0], so they use at most ceil(b/GMP_NUMB_BITS) limbs */ /* 3: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */ /* compute {wp, wn} = k * {sp, sn}^(k-1) */ cy = mpn_mul_1 (wp, wp, wn, k); wp[wn] = cy; wn += cy != 0; /* 4: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */ /* now divide {rp, rn} by {wp, wn} to get the low part of the root */ if (rn < wn) { qn = 0; } else { qn = rn - wn; /* expected quotient size */ mpn_div_q (qp, rp, rn, wp, wn, scratch); qn += qp[qn] != 0; } /* 5: current buffers: {sp,sn}, {qp,qn}. Note: {rp,rn} is not needed any more since we'll compute it from scratch at the end of the loop. */ /* Number of limbs used by b bits, when least significant bit is aligned to least limb */ bn = (b - 1) / GMP_NUMB_BITS + 1; /* the quotient should be smaller than 2^b, since the previous approximation was correctly rounded toward zero */ if (qn > bn || (qn == bn && (b % GMP_NUMB_BITS != 0) && qp[qn - 1] >= ((mp_limb_t) 1 << (b % GMP_NUMB_BITS)))) { qn = b / GMP_NUMB_BITS + 1; /* b+1 bits */ MPN_ZERO (qp, qn); qp[qn - 1] = (mp_limb_t) 1 << (b % GMP_NUMB_BITS); MPN_DECR_U (qp, qn, 1); qn -= qp[qn - 1] == 0; } /* 6: current buffers: {sp,sn}, {qp,qn} */ /* multiply the root approximation by 2^b */ MPN_LSHIFT (cy, sp + b / GMP_NUMB_BITS, sp, sn, b % GMP_NUMB_BITS); sn = sn + b / GMP_NUMB_BITS; if (cy != 0) { sp[sn] = cy; sn++; } /* 7: current buffers: {sp,sn}, {qp,qn} */ ASSERT_ALWAYS (bn >= qn); /* this is ok since in the case qn > bn above, q is set to 2^b-1, which has exactly bn limbs */ /* Combine sB and q to form sB + q. 
*/ save = sp[b / GMP_NUMB_BITS]; MPN_COPY (sp, qp, qn); MPN_ZERO (sp + qn, bn - qn); sp[b / GMP_NUMB_BITS] |= save; /* 8: current buffer: {sp,sn} */ /* Since each iteration treats b bits from the root and thus k*b bits from the input, and we already considered b bits from the input, we now have to take another (k-1)*b bits from the input. */ kk -= (k - 1) * b; /* remaining input bits */ /* {rp, rn} = floor({up, un} / 2^kk) */ MPN_RSHIFT (cy, rp, up + kk / GMP_NUMB_BITS, un - kk / GMP_NUMB_BITS, kk % GMP_NUMB_BITS); rn = un - kk / GMP_NUMB_BITS; rn -= rp[rn - 1] == 0; /* 9: current buffers: {sp,sn}, {rp,rn} */ for (c = 0;; c++) { /* Compute S^k in {qp,qn}. */ if (i == 1) { /* Last iteration: we don't need W anymore. */ /* mpn_pow_1 requires that both qp and wp have enough space to store the result {sp,sn}^k + 1 limb */ approx = approx && (sp[0] > 1); qn = (approx == 0) ? mpn_pow_1 (qp, sp, sn, k, wp) : 0; } else { /* W <- S^(k-1) for the next iteration, and S^k = W * S. */ wn = mpn_pow_1 (wp, sp, sn, k - 1, qp); mpn_mul (qp, wp, wn, sp, sn); qn = wn + sn; qn -= qp[qn - 1] == 0; } /* if S^k > floor(U/2^kk), the root approximation was too large */ if (qn > rn || (qn == rn && mpn_cmp (qp, rp, rn) > 0)) MPN_DECR_U (sp, sn, 1); else break; } /* 10: current buffers: {sp,sn}, {rp,rn}, {qp,qn}, {wp,wn} */ ASSERT_ALWAYS (c <= 1); ASSERT_ALWAYS (rn >= qn); /* R = R - Q = floor(U/2^kk) - S^k */ if (i > 1 || approx == 0) { mpn_sub (rp, rp, rn, qp, qn); MPN_NORMALIZE (rp, rn); } /* otherwise we have rn > 0, thus the return value is ok */ /* 11: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */ } TMP_FREE; return rn; }

/* Computes a^{1/k - 1} (mod B^n). Both a and k must be odd.

   Iterates

     r' <-- r - r * (a^{k-1} r^k - 1) / k

   If

     a^{k-1} r^k = 1 (mod 2^m),

   then

     a^{k-1} r'^k = 1 (mod 2^{2m}),

   Compute the update term as

     r' = r - (a^{k-1} r^{k+1} - r) / k

   where we still have cancelation of low limbs.

   rp  receives the n-limb result.
   ap  the n-limb operand a (ap[0] odd, asserted).
   k   the root index (odd and >= 3, asserted).  */
void
mpn_broot_invm1 (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t k)
{
  mp_size_t sizes[GMP_LIMB_BITS * 2];
  mp_ptr akm1, tp, rnp, ep;
  mp_limb_t a0, r0, km1, kp1h, kinv;
  mp_size_t rn;
  unsigned i;

  TMP_DECL;

  ASSERT (n > 0);
  ASSERT (ap[0] & 1);
  ASSERT (k & 1);
  ASSERT (k >= 3);

  TMP_MARK;

  /* n limbs for a^{k-1} followed by 3n limbs of scratch.  */
  akm1 = TMP_ALLOC_LIMBS (4*n);
  tp = akm1 + n;

  km1 = k-1;
  /* FIXME: Could arrange the iteration so we don't need to compute
     this up front, computing a^{k-1} * r^k as (a r)^{k-1} * r. Note
     that we can use wraparound also for a*r, since the low half is
     unchanged from the previous iteration. Or possibly mulmid. Also,
     a r = a^{1/k}, so we get that value too, for free? */
  mpn_powlo (akm1, ap, &km1, 1, n, tp); /* 3 n scratch space */

  a0 = ap[0];
  binvert_limb (kinv, k);

  /* 4 bits: a^{1/k - 1} (mod 16):

        a % 8
        1 3 5 7
   k%4 +-------
     1 |1 1 1 1
     3 |1 9 9 1
  */
  r0 = 1 + (((k << 2) & ((a0 << 1) ^ (a0 << 2))) & 8);
  /* Each step below doubles the number of correct low bits of r0.  */
  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7f)); /* 8 bits */
  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7fff)); /* 16 bits */
  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k)); /* 32 bits */
#if GMP_NUMB_BITS > 32
  {
    unsigned prec = 32;
    do
      {
        r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k));
        prec *= 2;
      }
    while (prec < GMP_NUMB_BITS);
  }
#endif

  rp[0] = r0;
  if (n == 1)
    {
      TMP_FREE;
      return;
    }

  /* For odd k, (k+1)/2 = k/2+1, and the latter avoids overflow. */
  kp1h = k/2 + 1;

  /* FIXME: Special case for two limb iteration. */
  rnp = TMP_ALLOC_LIMBS (2*n + 1);
  ep = rnp + n;

  /* FIXME: Possible to do this on the fly with some bit fiddling. */
  /* Record the halving sequence n, ceil(n/2), ... down to 2; it is
     replayed in reverse below.  */
  for (i = 0; n > 1; n = (n + 1)/2)
    sizes[i++] = n;

  rn = 1;

  while (i-- > 0)
    {
      /* Compute x^{k+1}. */
      mpn_sqr (ep, rp, rn); /* For odd n, writes n+1 limbs in the
                               final iteration.*/
      mpn_powlo (rnp, ep, &kp1h, 1, sizes[i], tp);

      /* Multiply by a^{k-1}. Can use wraparound; low part equals
         r. */

      mpn_mullo_n (ep, rnp, akm1, sizes[i]);
      ASSERT (mpn_cmp (ep, rp, rn) == 0);

      ASSERT (sizes[i] <= 2*rn);
      /* New high limbs: -(high part of a^{k-1} r^{k+1}) / k.  */
      mpn_pi1_bdiv_q_1 (rp + rn, ep + rn, sizes[i] - rn, k, kinv, 0);
      mpn_neg (rp + rn, rp + rn, sizes[i] - rn);
      rn = sizes[i];
    }
  TMP_FREE;
}

/* Check divide and conquer division routine. */ void check_dc_divappr_q_n (void) { mp_limb_t tp[DC_DIVAPPR_Q_N_ITCH(MAX_LIMBS)]; mp_limb_t np[2*MAX_LIMBS]; mp_limb_t np2[2*MAX_LIMBS]; mp_limb_t rp[2*MAX_LIMBS]; mp_limb_t dp[MAX_LIMBS]; mp_limb_t qp[MAX_LIMBS]; mp_limb_t dip, d1ip; mp_size_t nn, rn, dn, qn; gmp_randstate_t rands; int i, j, s; gmp_randinit_default(rands); for (i = 0; i < ITERS; i++) { dn = (random() % (MAX_LIMBS - 6)) + 6; nn = 2*dn; mpn_rrandom (np, rands, nn); mpn_rrandom (dp, rands, dn); dp[dn-1] |= GMP_LIMB_HIGHBIT; MPN_COPY(np2, np, nn); mpir_invert_pi2(dip, d1ip, dp[dn - 1], dp[dn - 2]); qn = nn - dn + 1; qp[qn - 1] = mpn_dc_divappr_q_n(qp, np, dp, dn, dip, d1ip, tp); MPN_NORMALIZE(qp, qn); if (qn) { if (qn >= dn) mpn_mul(rp, qp, qn, dp, dn); else mpn_mul(rp, dp, dn, qp, qn); rn = dn + qn; MPN_NORMALIZE(rp, rn); s = (rn < nn) ? -1 : (rn > nn) ? 1 : mpn_cmp(rp, np2, nn); if (s <= 0) { mpn_sub(rp, np2, nn, rp, rn); rn = nn; MPN_NORMALIZE(rp, rn); } else { mpn_sub(rp, rp, rn, np2, nn); MPN_NORMALIZE(rp, rn); } } else { rn = nn; MPN_COPY(rp, np, nn); } s = (rn < dn) ? -1 : (rn > dn) ? 1 : mpn_cmp(rp, dp, dn); if (s >= 0) { printf ("failed:\n"); printf ("nn = %lu, dn = %lu, qn = %lu, rn = %lu\n\n", nn, dn, qn, rn); gmp_printf (" np: %Nx\n\n", np2, nn); gmp_printf (" dp: %Nx\n\n", dp, dn); gmp_printf (" qp: %Nx\n\n", qp, qn); gmp_printf (" rp: %Nx\n\n", rp, rn); abort (); } } gmp_randclear(rands); }

/* Test driver for mpn_redc_1: compares it against ref_redc_1 on random
   odd moduli of 1 .. 99 limbs.  Two passes are made, the first drawing
   operands with mpn_randomb and the second with mpn_rrandom.  Aborts on
   the first mismatch; otherwise exits with status 0.  */
int
main (void)
{
  gmp_randstate_t rands;
  mp_limb_t cp1[1000], cp2[1000], mp[1000], tp1[1000], tp2[1000], inv;
  int pass, limbs, rep;

  tests_start ();
  gmp_randinit_default (rands);

  for (pass = 0; pass < 2; pass++)
    {
      for (limbs = 1; limbs < 100; limbs++)
        {
          for (rep = 1; rep < 100; rep++)
            {
              /* Random modulus, forced odd so it is invertible mod the
                 limb base.  */
              if (pass == 0)
                mpn_randomb (mp, rands, limbs);
              else
                mpn_rrandom (mp, rands, limbs);
              mp[0] |= 1;

              modlimb_invert (inv, mp[0]);
              inv = -inv;

              /* Random 2*limbs input, duplicated so that each routine
                 gets its own copy to work on.  */
              if (pass == 0)
                mpn_randomb (tp1, rands, 2 * limbs);
              else
                mpn_rrandom (tp1, rands, 2 * limbs);
              MPN_COPY (tp2, tp1, 2 * limbs);

              ref_redc_1 (cp1, tp1, mp, limbs, inv);
              mpn_redc_1 (cp2, tp2, mp, limbs, inv);

              if (mpn_cmp (cp1, cp2, limbs) != 0)
                {
                  printf ("mpn_redc_1 error %d\n", limbs);
                  abort ();
                }

              /* we dont require the work areas to be the same but it
                 could be a useful test */
              if (limbs != 1 && mpn_cmp (tp1, tp2, 2 * limbs) != 0)
                {
                  printf ("mpn_redc_1 possible error\n");
                  abort ();
                }
            }
        }
    }

  gmp_randclear (rands);
  tests_end ();
  exit (0);
}

/* Schoolbook division producing only the quotient of {np,nn} by {dp,dn}.

   Parameters:
     qp    receives the low qn = nn - dn quotient limbs.
     np    dividend, nn limbs; overwritten during the computation.
     nn    dividend size, nn >= dn (asserted).
     dp    divisor, dn limbs, with the high bit of dp[dn-1] set (asserted).
     dn    divisor size, dn > 2 (asserted).
     dinv  precomputed inverse passed on to udiv_qr_3by2.

   Returns the most significant quotient limb (0 or 1).

   When the quotient is shorter than the divisor, only the high qn + 1
   divisor limbs are used at first, and the operands are further truncated
   as the quotient limbs are developed.  If the final partial remainder is
   too small to prove the quotient correct, the ignored low parts are
   subtracted afterwards and the quotient is decremented if the remainder
   goes negative.  Every such decrement propagates its borrow into the
   returned high limb.  */
mp_limb_t
mpn_sbpi1_div_q (mp_ptr qp,
                 mp_ptr np, mp_size_t nn,
                 mp_srcptr dp, mp_size_t dn,
                 mp_limb_t dinv)
{
  mp_limb_t qh;
  mp_size_t qn, i;
  mp_limb_t n1, n0;
  mp_limb_t d1, d0;
  mp_limb_t cy, cy1;
  mp_limb_t q;
  mp_limb_t flag;

  /* Saved so that the correction step can get back to the full operands.  */
  mp_size_t dn_orig = dn;
  mp_srcptr dp_orig = dp;
  mp_ptr np_orig = np;

  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

  np += nn;

  qn = nn - dn;
  if (qn + 1 < dn)
    {
      /* Short quotient: keep only the qn + 1 high divisor limbs.  */
      dp += dn - (qn + 1);
      dn = qn + 1;
    }

  qh = mpn_cmp (np - dn, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (np - dn, np - dn, dp, dn);

  qp += qn;

  dn -= 2;                      /* offset dn by 2 for main division loops,
                                   saving two iterations in mpn_submul_1.  */
  d1 = dp[dn + 1];
  d0 = dp[dn + 0];

  np -= 2;

  n1 = np[1];

  /* Quotient limbs developed against the whole (possibly pre-truncated)
     divisor.  */
  for (i = qn - (dn + 2); i >= 0; i--)
    {
      np--;
      if (UNLIKELY (n1 == d1) && np[1] == d0)
        {
          q = GMP_NUMB_MASK;
          mpn_submul_1 (np - dn, dp, dn + 2, q);
          n1 = np[1];           /* update n1, last loop's value will now be invalid */
        }
      else
        {
          udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);

          cy = mpn_submul_1 (np - dn, dp, dn, q);

          cy1 = n0 < cy;
          n0 = (n0 - cy) & GMP_NUMB_MASK;
          cy = n1 < cy1;
          n1 -= cy1;
          np[0] = n0;

          if (UNLIKELY (cy != 0))
            {
              n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1);
              q--;
            }
        }

      *--qp = q;
    }

  flag = ~CNST_LIMB(0);

  if (dn >= 0)
    {
      /* Remaining quotient limbs; the divisor loses one low limb per
         iteration.  */
      for (i = dn; i > 0; i--)
        {
          np--;
          if (UNLIKELY (n1 >= (d1 & flag)))
            {
              q = GMP_NUMB_MASK;
              cy = mpn_submul_1 (np - dn, dp, dn + 2, q);

              if (UNLIKELY (n1 != cy))
                {
                  if (n1 < (cy & flag))
                    {
                      q--;
                      mpn_add_n (np - dn, np - dn, dp, dn + 2);
                    }
                  else
                    flag = 0;
                }
              n1 = np[1];
            }
          else
            {
              udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);

              cy = mpn_submul_1 (np - dn, dp, dn, q);

              cy1 = n0 < cy;
              n0 = (n0 - cy) & GMP_NUMB_MASK;
              cy = n1 < cy1;
              n1 -= cy1;
              np[0] = n0;

              if (UNLIKELY (cy != 0))
                {
                  n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1);
                  q--;
                }
            }

          *--qp = q;

          /* Truncate operands.  */
          dn--;
          dp++;
        }

      /* Last quotient limb: only two divisor limbs are left.  */
      np--;
      if (UNLIKELY (n1 >= (d1 & flag)))
        {
          q = GMP_NUMB_MASK;
          cy = mpn_submul_1 (np, dp, 2, q);

          if (UNLIKELY (n1 != cy))
            {
              if (n1 < (cy & flag))
                {
                  q--;
                  add_ssaaaa (np[1], np[0], np[1], np[0], dp[1], dp[0]);
                }
              else
                flag = 0;
            }
          n1 = np[1];
        }
      else
        {
          udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);

          np[0] = n0;
          np[1] = n1;
        }

      *--qp = q;
    }
  ASSERT_ALWAYS (np[1] == n1);
  np += 2;

  dn = dn_orig;
  if (UNLIKELY (n1 < (dn & flag)))
    {
      mp_limb_t x;

      /* The quotient may be too large if the remainder is small.  Recompute
         for above ignored operand parts, until the remainder spills.

         FIXME: The quality of this code isn't the same as the code above.
         1. We don't compute things in an optimal order, high-to-low, in order
            to terminate as quickly as possible.
         2. We mess with pointers and sizes, adding and subtracting and
            adjusting to get things right.  It surely could be streamlined.
         3. The only termination criteria are that we determine that the
            quotient needs to be adjusted, or that we have recomputed
            everything.  We should stop when the remainder is so large
            that no additional subtracting could make it spill.
         4. If nothing else, we should not do two loops of submul_1 over the
            data, instead handle both the triangularization and chopping at
            once.  */

      /* x counts how many more borrows the high remainder limb can absorb
         before the remainder goes negative.  */
      x = n1;

      if (dn > 2)
        {
          /* Compensate for triangularization.  */
          mp_limb_t y;

          dp = dp_orig;
          if (qn + 1 < dn)
            {
              dp += dn - (qn + 1);
              dn = qn + 1;
            }

          y = np[-2];

          for (i = dn - 3; i >= 0; i--)
            {
              q = qp[i];
              cy = mpn_submul_1 (np - (dn - i), dp, dn - i - 2, q);

              if (y < cy)
                {
                  if (x == 0)
                    {
                      cy = mpn_sub_1 (qp, qp, qn, 1);
                      ASSERT_ALWAYS (cy == 0);
                      return qh - cy;
                    }
                  x--;
                }
              y -= cy;
            }
          np[-2] = y;
        }

      dn = dn_orig;
      if (qn + 1 < dn)
        {
          /* Compensate for ignored dividend and divisor tails.  */

          dp = dp_orig;
          np = np_orig;

          if (qh != 0)
            {
              cy = mpn_sub_n (np + qn, np + qn, dp, dn - (qn + 1));
              if (cy != 0)
                {
                  if (x == 0)
                    {
                      if (qn != 0)
                        cy = mpn_sub_1 (qp, qp, qn, 1);
                      return qh - cy;
                    }
                  x--;
                }
            }

          if (qn == 0)
            return qh;

          for (i = dn - qn - 2; i >= 0; i--)
            {
              cy = mpn_submul_1 (np + i, qp, qn, dp[i]);
              cy = mpn_sub_1 (np + qn + i, np + qn + i, dn - qn - i - 1, cy);
              if (cy != 0)
                {
                  if (x == 0)
                    {
                      /* Decrement the quotient.  A borrow out of the low
                         qn limbs must come off the high limb as well, as
                         on the other early exits above.  */
                      cy = mpn_sub_1 (qp, qp, qn, 1);
                      return qh - cy;
                    }
                  x--;
                }
            }
        }
    }

  return qh;
}

int main (int argc, char **argv) { mp_ptr ap, bp, refp, pp, scratch; int count = COUNT; int test; gmp_randstate_ptr rands; TMP_DECL; TMP_MARK; if (argc > 1) { char *end; count = strtol (argv[1], &end, 0); if (*end || count <= 0) { fprintf (stderr, "Invalid test count: %s.\n", argv[1]); return 1; } } tests_start (); rands = RANDS; #define mpn_mullo_itch(n) (0) ap = TMP_ALLOC_LIMBS (MAX_N); bp = TMP_ALLOC_LIMBS (MAX_N); refp = TMP_ALLOC_LIMBS (MAX_N * 2); pp = 1+TMP_ALLOC_LIMBS (MAX_N + 2); scratch = 1+TMP_ALLOC_LIMBS (mpn_mullo_itch (MAX_N) + 2); for (test = 0; test < count; test++) { unsigned size_min; unsigned size_range; mp_size_t n; mp_size_t itch; mp_limb_t p_before, p_after, s_before, s_after; for (size_min = 1; (1L << size_min) < MIN_N; size_min++) ; /* We generate an in the MIN_N <= n <= (1 << size_range). */ size_range = size_min + gmp_urandomm_ui (rands, SIZE_LOG + 1 - size_min); n = MIN_N + gmp_urandomm_ui (rands, (1L << size_range) + 1 - MIN_N); mpn_random2 (ap, n); mpn_random2 (bp, n); mpn_random2 (pp-1, n + 2); p_before = pp[-1]; p_after = pp[n]; itch = mpn_mullo_itch (n); ASSERT_ALWAYS (itch <= mpn_mullo_itch (MAX_N)); mpn_random2 (scratch-1, itch+2); s_before = scratch[-1]; s_after = scratch[itch]; mpn_mullo_n (pp, ap, bp, n); mpn_mul_n (refp, ap, bp, n); if (pp[-1] != p_before || pp[n] != p_after || scratch[-1] != s_before || scratch[itch] != s_after || mpn_cmp (refp, pp, n) != 0) { printf ("ERROR in test %d, n = %d", test, (int) n); if (pp[-1] != p_before) { printf ("before pp:"); mpn_dump (pp -1, 1); printf ("keep: "); mpn_dump (&p_before, 1); } if (pp[n] != p_after) { printf ("after pp:"); mpn_dump (pp + n, 1); printf ("keep: "); mpn_dump (&p_after, 1); } if (scratch[-1] != s_before) { printf ("before scratch:"); mpn_dump (scratch-1, 1); printf ("keep: "); mpn_dump (&s_before, 1); } if (scratch[itch] != s_after) { printf ("after scratch:"); mpn_dump (scratch + itch, 1); printf ("keep: "); mpn_dump (&s_after, 1); } mpn_dump (ap, n); mpn_dump (bp, 
n); mpn_dump (pp, n); mpn_dump (refp, n); abort(); } } TMP_FREE; tests_end (); return 0; }

mp_limb_t mpn_preinv_dc_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_srcptr dip) { mp_size_t qn; mp_limb_t qh, cy, qsave; mp_ptr tp; TMP_DECL; TMP_MARK; tp = TMP_SALLOC_LIMBS (dn+1); qn = nn - dn; qp += qn; np += nn; dp += dn; if (qn > dn) { qn++; /* pretend we'll need an extra limb */ /* Reduce qn mod dn without division, optimizing small operations. */ do qn -= dn; while (qn > dn); qp -= qn; /* point at low limb of next quotient block */ np -= qn; /* point in the middle of partial remainder */ /* Perform the typically smaller block first. */ if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD)) qh = mpn_sb_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dip); else qh = mpn_dc_div_qr_n (qp, np - qn, dp - qn, qn, dip, tp); if (qn != dn) { if (qn > dn - qn) mpn_mul (tp, qp, qn, dp - dn, dn - qn); else mpn_mul (tp, dp - dn, dn - qn, qp, qn); cy = mpn_sub_n (np - dn, np - dn, tp, dn); if (qh != 0) cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn); while (cy != 0) { qh -= mpn_sub_1 (qp, qp, qn, 1); cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn); } } qn = nn - dn - qn + 1; while (qn > dn) { qp -= dn; np -= dn; mpn_dc_div_qr_n (qp, np - dn, dp - dn, dn, dip, tp); qn -= dn; } /* Since we pretended we'd need an extra quotient limb before, we now have made sure the code above left just dn-1=qn quotient limbs to develop. Develop that plus a guard limb. */ qn--; qp -= qn; np -= dn; qsave = qp[qn]; mpn_dc_divappr_q_n (qp, np - dn, dp - dn, dn, dip, tp); MPN_COPY_INCR (qp, qp + 1, qn); qp[qn] = qsave; } else { if (qn == 0) { qh = mpn_cmp (np - dn, dp - dn, dn) >= 0; if (qh) mpn_sub_n (np - dn, np - dn, dp - dn, dn); TMP_FREE; return qh; } qp -= qn; /* point at low limb of next quotient block */ np -= qn; /* point in the middle of partial remainder */ if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD)) /* Full precision. Optimal? 
*/ qh = mpn_sb_divappr_q (qp, np - dn, nn, dp - dn, dn, dip); else { /* Put quotient in tp, use qp as temporary, since qp lacks a limb. */ qh = mpn_dc_divappr_q_n (tp, np - qn - 2, dp - (qn + 1), qn + 1, dip, qp); MPN_COPY (qp, tp + 1, qn); } } TMP_FREE; return qh; }

void testmain (int argc, char **argv) { unsigned i; char *ap; char *bp; char *rp; size_t bn, rn, arn; mpz_t a, b; FILE *tmp; test_small (); mpz_init (a); mpz_init (b); tmp = tmpfile (); if (!tmp) fprintf (stderr, "Failed to create temporary file. Skipping mpz_out_str tests.\n"); for (i = 0; i < COUNT; i++) { int base; for (base = 0; base <= 36; base += 1 + (base == 0)) { hex_random_str_op (MAXBITS, i&1 ? base: -base, &ap, &rp); if (mpz_set_str (a, ap, 16) != 0) { fprintf (stderr, "mpz_set_str failed on input %s\n", ap); abort (); } rn = strlen (rp); arn = rn - (rp[0] == '-'); bn = mpz_sizeinbase (a, base ? base : 10); if (bn < arn || bn > (arn + 1)) { fprintf (stderr, "mpz_sizeinbase failed:\n"); dump ("a", a); fprintf (stderr, "r = %s\n", rp); fprintf (stderr, " base %d, correct size %u, got %u\n", base, (unsigned) arn, (unsigned)bn); abort (); } bp = mpz_get_str (NULL, i&1 ? base: -base, a); if (strcmp (bp, rp)) { fprintf (stderr, "mpz_get_str failed:\n"); dump ("a", a); fprintf (stderr, "b = %s\n", bp); fprintf (stderr, " base = %d\n", base); fprintf (stderr, "r = %s\n", rp); abort (); } /* Just a few tests with file i/o. */ if (tmp && i < 20) { size_t tn; rewind (tmp); tn = mpz_out_str (tmp, i&1 ? 
base: -base, a); if (tn != rn) { fprintf (stderr, "mpz_out_str, bad return value:\n"); dump ("a", a); fprintf (stderr, "r = %s\n", rp); fprintf (stderr, " base %d, correct size %u, got %u\n", base, (unsigned) rn, (unsigned)tn); abort (); } rewind (tmp); memset (bp, 0, rn); tn = fread (bp, 1, rn, tmp); if (tn != rn) { fprintf (stderr, "fread failed, expected %lu bytes, got only %lu.\n", (unsigned long) rn, (unsigned long) tn); abort (); } if (memcmp (bp, rp, rn) != 0) { fprintf (stderr, "mpz_out_str failed:\n"); dump ("a", a); fprintf (stderr, "b = %s\n", bp); fprintf (stderr, " base = %d\n", base); fprintf (stderr, "r = %s\n", rp); abort (); } } mpz_set_str (b, rp, base); if (mpz_cmp (a, b)) { fprintf (stderr, "mpz_set_str failed:\n"); fprintf (stderr, "r = %s\n", rp); fprintf (stderr, " base = %d\n", base); fprintf (stderr, "r = %s\n", ap); fprintf (stderr, " base = 16\n"); dump ("b", b); dump ("r", a); abort (); } /* Test mpn interface */ if (base && mpz_sgn (a)) { size_t i; const char *absr; mp_limb_t t[MAXLIMBS]; mp_size_t tn = mpz_size (a); assert (tn <= MAXLIMBS); mpn_copyi (t, a->_mp_d, tn); bn = mpn_get_str ((unsigned char *) bp, base, t, tn); if (bn != arn) { fprintf (stderr, "mpn_get_str failed:\n"); fprintf (stderr, "returned length: %lu (bad)\n", (unsigned long) bn); fprintf (stderr, "expected: %lu\n", (unsigned long) arn); fprintf (stderr, " base = %d\n", base); fprintf (stderr, "r = %s\n", ap); fprintf (stderr, " base = 16\n"); dump ("b", b); dump ("r", a); abort (); } absr = rp + (rp[0] == '-'); for (i = 0; i < bn; i++) { unsigned char digit = absr[i]; unsigned value; if (digit >= '0' && digit <= '9') value = digit - '0'; else if (digit >= 'a' && digit <= 'z') value = digit - 'a' + 10; else if (digit >= 'A' && digit <= 'Z') value = digit - 'A' + 10; else { fprintf (stderr, "Internal error in test.\n"); abort(); } if (bp[i] != value) { fprintf (stderr, "mpn_get_str failed:\n"); fprintf (stderr, "digit %lu: %d (bad)\n", (unsigned long) i, bp[i]); 
fprintf (stderr, "expected: %d\n", value); fprintf (stderr, " base = %d\n", base); fprintf (stderr, "r = %s\n", ap); fprintf (stderr, " base = 16\n"); dump ("b", b); dump ("r", a); abort (); } } tn = mpn_set_str (t, (unsigned char *) bp, bn, base); if (tn != mpz_size (a) || mpn_cmp (t, a->_mp_d, tn)) { fprintf (stderr, "mpn_set_str failed:\n"); fprintf (stderr, "r = %s\n", rp); fprintf (stderr, " base = %d\n", base); fprintf (stderr, "r = %s\n", ap); fprintf (stderr, " base = 16\n"); dump ("r", a); abort (); } } free (ap); testfree (bp); } } mpz_clear (a); mpz_clear (b); }

/* Extended gcd of {ap,an} and {bp,n}.

   Parameters:
     gp      receives the gcd.
     up      receives one cofactor.
     usizep  receives the cofactor's size in limbs; it is stored negated
             on the paths where the cofactor taken is -u0, and is 0 when
             the cofactor is zero.
     ap, an  first operand, an >= n (asserted).  Overwritten: it is
             reduced modulo b in place when an > n, and by the later
             reduction steps.
     bp, n   second operand, n > 0 and bp[n-1] > 0 (asserted).  Also used
             as a work area by the reduction steps.

   Returns the size of the gcd in limbs.

   Below GCDEXT_DC_THRESHOLD the work is handed to mpn_gcdext_lehmer_n.
   Above it, mpn_hgcd is called repeatedly on the high parts of a and b
   while the cofactors u0, u1 are kept up to date; once the operands are
   small enough, a final mpn_gcdext_lehmer_n call is combined with u0, u1
   to give the cofactor of the original a.  */
mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
            mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  /* Only assigned in the ABOVE_THRESHOLD branch below.  Its later uses sit
     past the BELOW_THRESHOLD early return -- assumes the two threshold
     macros are exact complements; TODO confirm.  */
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  struct gcdext_ctx ctx;
  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);
  ASSERT (bp[n-1] > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  /* talloc becomes the largest scratch need over all the phases below.  */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
        talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
         copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
        + 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
        talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      /* Reduce a modulo b in place; the quotient lands in tp and is
         discarded.  */
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
        {
          /* b divides a: the gcd is b and the cofactor is zero.  */
          MPN_COPY (gp, bp, n);
          *usizep = 0;
          TMP_FREE;
          return n;
        }
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  /* Carve the two cofactor areas off the front of the scratch space.  */
  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  ctx.gp = gp;
  ctx.up = up;
  ctx.usize = usizep;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
        ASSERT (M.n <= (n - p - 1)/2);
        ASSERT (M.n + p <= (p + n - 1) / 2);

        /* Temporary storage 2 (p + M->n) <= p + n - 1 */
        n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

        /* The second matrix row gives the initial cofactors.  */
        MPN_COPY (u0, M.p[1][0], M.n);
        MPN_COPY (u1, M.p[1][1], M.n);
        un = M.n;
        /* Trim un until at least one of the cofactors has a nonzero top
           limb.  */
        while ( (u0[un-1] | u1[un-1] ) == 0)
          un--;
      }
    else
      {
        /* mpn_hgcd has failed. Then either one of a or b is very
           small, or the difference is very small. Perform one
           subtraction followed by one division. */
        u1[0] = 1;

        ctx.u0 = u0;
        ctx.u1 = u1;
        ctx.tp = tp + n; /* ualloc */
        ctx.un = 1;

        /* Temporary storage n */
        n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
        if (n == 0)
          {
            /* The gcd size is taken from ctx.gn, which is not assigned in
               this function (presumably set via mpn_gcdext_hook).  */
            TMP_FREE;
            return ctx.gn;
          }

        un = ctx.un;
        ASSERT (un < ualloc);
      }
  }

  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
        {
          mp_ptr t0;

          t0 = tp + matrix_scratch;
          ASSERT (M.n <= (n - p - 1)/2);
          ASSERT (M.n + p <= (p + n - 1) / 2);

          /* Temporary storage 2 (p + M->n) <= p + n - 1 */
          n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

          /* By the same analysis as for mpn_hgcd_matrix_mul */
          ASSERT (M.n + un <= ualloc);

          /* FIXME: This copying could be avoided by some swapping of
           * pointers. May need more temporary storage, though. */
          MPN_COPY (t0, u0, un);

          /* Temporary storage ualloc */
          un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

          ASSERT (un < ualloc);
          ASSERT ( (u0[un-1] | u1[un-1]) > 0);
        }
      else
        {
          /* mpn_hgcd has failed. Then either one of a or b is very
             small, or the difference is very small. Perform one
             subtraction followed by one division. */
          ctx.u0 = u0;
          ctx.u1 = u1;
          ctx.tp = tp + n; /* ualloc */
          ctx.un = un;

          /* Temporary storage n */
          n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
          if (n == 0)
            {
              TMP_FREE;
              return ctx.gn;
            }

          un = ctx.un;
          ASSERT (un < ualloc);
        }
    }
  /* We have A = ... a + ... b
             B =  u0 a +  u1 b

             a = u1  A + ... B
             b = -u0 A + ... B

     with bounds

       |u0|, |u1| <= B / min(a, b)

     We always have u1 > 0, and u0 == 0 is possible only if u1 == 1,
     in which case the only reduction done so far is a = A - k B for
     some k.

     Compute g = u a + v b = (u u1 - v u0) A + (...) B
     Here, u, v are bounded by

       |u| <= b,
       |v| <= a
  */

  ASSERT ( (ap[n-1] | bp[n-1]) > 0);

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* Must return the smallest cofactor, +u1 or -u0 */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
         this case we choose the cofactor + 1, corresponding to G = A
         - k B, rather than -1, corresponding to G = - A + (k+1) B. */
      ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
      if (c < 0)
        {
          MPN_NORMALIZE (u0, un);
          MPN_COPY (up, u0, un);
          *usizep = -un;
        }
      else
        {
          MPN_NORMALIZE_NOT_ZERO (u1, un);
          MPN_COPY (up, u1, un);
          *usizep = un;
        }

      TMP_FREE;
      return n;
    }
  else if (UNLIKELY (u0[0] == 0) && un == 1)
    {
      mp_size_t gn;
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      ASSERT (u0n > 0);

      if (lehmer_un == 0)
        {
          /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
          MPN_COPY (up, u0, u0n);
          *usizep = -u0n;

          TMP_FREE;
          return gn;
        }

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
                             ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      /* From here on lehmer_un is a magnitude; its sign is kept in
         negate.  */
      if (lehmer_un > 0)
        negate = 0;
      else
        {
          lehmer_un = -lehmer_un;
          negate = 1;
        }

      u1n = un;
      MPN_NORMALIZE (u1, u1n);
      ASSERT (u1n > 0);

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* We may still have v == 0 */

      /* Compute u u0 */
      /* NOTE(review): the product formed here is u * u1 (operands u1 and
         lehmer_up), matching the formula above; the "u u0" wording in the
         original comment looks like a slip.  mpn_mul takes the longer
         operand first, hence the two variants.  */
      if (lehmer_un <= u1n)
        /* Should be the common case */
        mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
        mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
        {
          mp_limb_t cy;

          /* Overwrites old u1 value */
          if (lehmer_vn <= u0n)
            /* Should be the common case */
            mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
          else
            mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

          u1n = u0n + lehmer_vn;
          u1n -= (u1[u1n - 1] == 0);

          /* Add the two products; mpn_add needs the longer one first.  */
          if (u1n <= un)
            {
              cy = mpn_add (up, up, un, u1, u1n);
            }
          else
            {
              cy = mpn_add (up, u1, u1n, up, un);
              un = u1n;
            }
          up[un] = cy;
          un += (cy != 0);

          ASSERT (un < ualloc);
        }
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}

/* Toom-style multiplication set-up that views {ap,an} as five pieces and
   {bp,bn} as three pieces of n limbs each; the top pieces have s and t
   limbs respectively.

   Only the evaluation of the a-side points (as1/asm1, as2/asm2, ash) and
   the start of the b-side evaluation are visible here.

   NOTE(review): this definition is cut off in this part of the file -- it
   ends inside an open #if with the function body still unclosed.  The
   scratch parameter is not used in the visible part.  */
void
mpn_toom53_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  mp_limb_t cy;
  mp_ptr gp;
  mp_ptr as1, asm1, as2, asm2, ash;
  mp_ptr bs1, bsm1, bs2, bsm2, bsh;
  enum toom7_flags flags;
  TMP_DECL;

/* Shorthands for the pieces of a and b.  */
#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  /* Piece size: chosen from whichever operand dominates, so that a fits in
     five pieces and b in three.  */
  n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3);

  /* Sizes of the top pieces a4 and b2.  */
  s = an - 4 * n;
  t = bn - 2 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  /* Each evaluated value gets n + 1 limbs.  */
  as1  = TMP_SALLOC_LIMBS (n + 1);
  asm1 = TMP_SALLOC_LIMBS (n + 1);
  as2  = TMP_SALLOC_LIMBS (n + 1);
  asm2 = TMP_SALLOC_LIMBS (n + 1);
  ash  = TMP_SALLOC_LIMBS (n + 1);

  bs1  = TMP_SALLOC_LIMBS (n + 1);
  bsm1 = TMP_SALLOC_LIMBS (n + 1);
  bs2  = TMP_SALLOC_LIMBS (n + 1);
  bsm2 = TMP_SALLOC_LIMBS (n + 1);
  bsh  = TMP_SALLOC_LIMBS (n + 1);

  /* The product area doubles as scratch for the evaluation helpers.  */
  gp = pp;

  /* Compute as1 and asm1.  */
  /* The helper's return value is masked into the sign flag.  */
  flags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, gp));

  /* Compute as2 and asm2. */
  flags = (enum toom7_flags) (flags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, gp)));

  /* Compute ash = 16 a0 + 8 a1 + 4 a2 + 2 a3 + a4
     = 2*(2*(2*(2*a0 + a1) + a2) + a3) + a4  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (ash, a1, a0, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
  if (s < n)
    {
      /* a4 is short: combine the low s limbs with a4, shift the rest.  */
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (ash, a4, ash, s);
      ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
      MPN_INCR_U (ash + s, n+1-s, cy2);
    }
  else
    ash[n] = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
#else
  cy = mpn_lshift (ash, a0, n, 1);
  cy += mpn_add_n (ash, ash, a1, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a2, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a3, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  ash[n] = cy + mpn_add (ash, ash, n, a4, s);
#endif

  /* Compute bs1 and bsm1.  */
  bs1[n] = mpn_add (bs1, b0, n, b2, t);         /* b0 + b2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      /* b0 + b2 < b1: form b1 - (b0 + b2) instead and flip the sign flag.  */
      bs1[n] = mpn_add_n_sub_n (bs1, bsm1, b1, bs1, n) >> 1;
      bsm1[n] = 0;
      flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
    }

/* Evaluates a polynomial of degree k > 2, in the points +2^shift and
   -2^shift.

   Parameters:
     xp2    receives the value at +2^shift, n + 1 limbs.
     xm2    receives the absolute value at -2^shift, n + 1 limbs; also
            used as scratch when mpn_addlsh_n is not native.
     k      degree, k >= 3 and shift*k < GMP_NUMB_BITS (asserted).
     xp     coefficients: k pieces of n limbs followed by a top piece of
            hn limbs at xp + k*n.
     n, hn  piece sizes, 0 < hn <= n (asserted).
     shift  exponent of the evaluation point.
     tp     scratch, n + 1 limbs.

   Returns ~0 if the value at -2^shift is negative, and 0 otherwise.  */
int
mpn_toom_eval_pm2exp (mp_ptr xp2, mp_ptr xm2, unsigned k,
                      mp_srcptr xp, mp_size_t n, mp_size_t hn, unsigned shift,
                      mp_ptr tp)
{
  unsigned idx;
  int negative;
#if HAVE_NATIVE_mpn_addlsh_n
  mp_limb_t carry;
#endif

  ASSERT (k >= 3);
  ASSERT (shift*k < GMP_NUMB_BITS);

  ASSERT (hn > 0);
  ASSERT (hn <= n);

  /* There are k full-size coefficients, so the short top coefficient
     (hn limbs) sits at xp + k*n.  The even-index terms are summed in xp2
     and the odd-index terms in tp.  */

#if HAVE_NATIVE_mpn_addlsh_n
  /* Even part: x0 + (x2 << 2*shift) + (x4 << 4*shift) + ...  */
  xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2*shift);
  idx = 4;
  while (idx < k)
    {
      xp2[n] += mpn_addlsh_n (xp2, xp2, xp + idx*n, n, idx*shift);
      idx += 2;
    }

  /* Odd part: (x1 << shift) + (x3 << 3*shift) + ...  */
  tp[n] = mpn_lshift (tp, xp+n, n, shift);
  idx = 3;
  while (idx < k)
    {
      tp[n] += mpn_addlsh_n (tp, tp, xp+idx*n, n, idx*shift);
      idx += 2;
    }

  /* The top coefficient goes to the sum matching the parity of k.  */
  {
    mp_ptr acc = (k & 1) ? tp : xp2;

    carry = mpn_addlsh_n (acc, acc, xp+k*n, hn, k*shift);
    MPN_INCR_U (acc + hn, n+1 - hn, carry);
  }

#else /* !HAVE_NATIVE_mpn_addlsh_n */
  /* Even part, shifting each term into tp before adding it.  */
  xp2[n] = mpn_lshift (tp, xp+2*n, n, 2*shift);
  xp2[n] += mpn_add_n (xp2, xp, tp, n);
  idx = 4;
  while (idx < k)
    {
      xp2[n] += mpn_lshift (tp, xp + idx*n, n, idx*shift);
      xp2[n] += mpn_add_n (xp2, xp2, tp, n);
      idx += 2;
    }

  /* Odd part, using xm2 as the shift buffer.  */
  tp[n] = mpn_lshift (tp, xp+n, n, shift);
  idx = 3;
  while (idx < k)
    {
      tp[n] += mpn_lshift (xm2, xp + idx*n, n, idx*shift);
      tp[n] += mpn_add_n (tp, tp, xm2, n);
      idx += 2;
    }

  /* Shifted top coefficient, hn + 1 limbs, added to the matching sum.  */
  xm2[hn] = mpn_lshift (xm2, xp + k*n, hn, k*shift);
  {
    mp_ptr acc = (k & 1) ? tp : xp2;

    mpn_add (acc, acc, n+1, xm2, hn+1);
  }
#endif /* !HAVE_NATIVE_mpn_addlsh_n */

  /* The value at the negative point is even part - odd part; note its
     sign and store the magnitude.  */
  negative = 0;
  if (mpn_cmp (xp2, tp, n + 1) < 0)
    negative = ~0;

#if HAVE_NATIVE_mpn_sumdiff_n
  {
    mp_ptr larger = negative ? tp : xp2;
    mp_ptr smaller = negative ? xp2 : tp;

    mpn_sumdiff_n (xp2, xm2, larger, smaller, n + 1);
  }
#else
  {
    mp_ptr larger = negative ? tp : xp2;
    mp_ptr smaller = negative ? xp2 : tp;

    mpn_sub_n (xm2, larger, smaller, n + 1);
  }

  mpn_add_n (xp2, xp2, tp, n + 1);
#endif

  /* FIXME: the following asserts are useless if (k+1)*shift >= GMP_LIMB_BITS */
  ASSERT ((k+1)*shift >= GMP_LIMB_BITS ||
          xp2[n] < ((CNST_LIMB(1)<<((k+1)*shift))-1)/((CNST_LIMB(1)<<shift)-1));
  ASSERT ((k+2)*shift >= GMP_LIMB_BITS ||
          xm2[n] < ((CNST_LIMB(1)<<((k+2)*shift))-((k&1)?(CNST_LIMB(1)<<shift):1))/((CNST_LIMB(1)<<(2*shift))-1));

  return negative;
}

int mpq_cmp (const MP_RAT *op1, const MP_RAT *op2) { mp_size_t num1_size = op1->_mp_num._mp_size; mp_size_t den1_size = op1->_mp_den._mp_size; mp_size_t num2_size = op2->_mp_num._mp_size; mp_size_t den2_size = op2->_mp_den._mp_size; mp_size_t tmp1_size, tmp2_size; mp_ptr tmp1_ptr, tmp2_ptr; mp_size_t num1_sign; int cc; TMP_DECL; /* need canonical signs to get right result */ ASSERT (den1_size > 0); ASSERT (den2_size > 0); if (num1_size == 0) return -num2_size; if (num2_size == 0) return num1_size; if ((num1_size ^ num2_size) < 0) /* I.e. are the signs different? */ return num1_size; num1_sign = num1_size; num1_size = ABS (num1_size); num2_size = ABS (num2_size); tmp1_size = num1_size + den2_size; tmp2_size = num2_size + den1_size; /* 1. Check to see if we can tell which operand is larger by just looking at the number of limbs. */ /* NUM1 x DEN2 is either TMP1_SIZE limbs or TMP1_SIZE-1 limbs. Same for NUM1 x DEN1 with respect to TMP2_SIZE. */ if (tmp1_size > tmp2_size + 1) /* NUM1 x DEN2 is surely larger in magnitude than NUM2 x DEN1. */ return num1_sign; if (tmp2_size > tmp1_size + 1) /* NUM1 x DEN2 is surely smaller in magnitude than NUM2 x DEN1. */ return -num1_sign; /* 2. Same, but compare the number of significant bits. */ { int cnt1, cnt2; mp_bitcnt_t bits1, bits2; count_leading_zeros (cnt1, op1->_mp_num._mp_d[num1_size - 1]); count_leading_zeros (cnt2, op2->_mp_den._mp_d[den2_size - 1]); bits1 = tmp1_size * GMP_NUMB_BITS - cnt1 - cnt2 + 2 * GMP_NAIL_BITS; count_leading_zeros (cnt1, op2->_mp_num._mp_d[num2_size - 1]); count_leading_zeros (cnt2, op1->_mp_den._mp_d[den1_size - 1]); bits2 = tmp2_size * GMP_NUMB_BITS - cnt1 - cnt2 + 2 * GMP_NAIL_BITS; if (bits1 > bits2 + 1) return num1_sign; if (bits2 > bits1 + 1) return -num1_sign; } /* 3. Finally, cross multiply and compare. 
*/ TMP_MARK; TMP_ALLOC_LIMBS_2 (tmp1_ptr,tmp1_size, tmp2_ptr,tmp2_size); if (num1_size >= den2_size) tmp1_size -= 0 == mpn_mul (tmp1_ptr, op1->_mp_num._mp_d, num1_size, op2->_mp_den._mp_d, den2_size); else tmp1_size -= 0 == mpn_mul (tmp1_ptr, op2->_mp_den._mp_d, den2_size, op1->_mp_num._mp_d, num1_size); if (num2_size >= den1_size) tmp2_size -= 0 == mpn_mul (tmp2_ptr, op2->_mp_num._mp_d, num2_size, op1->_mp_den._mp_d, den1_size); else tmp2_size -= 0 == mpn_mul (tmp2_ptr, op1->_mp_den._mp_d, den1_size, op2->_mp_num._mp_d, num2_size); cc = tmp1_size - tmp2_size != 0 ? tmp1_size - tmp2_size : mpn_cmp (tmp1_ptr, tmp2_ptr, tmp1_size); TMP_FREE; return num1_sign < 0 ? -cc : cc; }

/* Modular exponentiation: the visible part sets up r = b^e mod m using a
   sliding window over the exponent bits, with either mpn_redc_1 (odd
   modulus below POWM_THRESHOLD limbs) or plain division for the modular
   reductions.

   A zero modulus raises DIVIDE_BY_ZERO.  A zero exponent gives 1, or 0
   when m is 1.  A negative exponent is handled by inverting b modulo m
   when HANDLE_NEGATIVE_EXPONENT is set, and raises DIVIDE_BY_ZERO
   otherwise.

   NOTE(review): this definition is cut off in this part of the file -- it
   stops right after the first exponent window is positioned, with the
   function body still unclosed.  The release of gp, which is taken from
   __GMP_ALLOCATE_FUNC_LIMBS rather than the TMP allocator, is therefore
   not visible here.  */
void
mpz_powm (mpz_ptr r, mpz_srcptr b, mpz_srcptr e, mpz_srcptr m)
{
  mp_ptr xp, tp, qp, gp, this_gp;
  mp_srcptr bp, ep, mp;
  mp_size_t bn, es, en, mn, xn;
  mp_limb_t invm, c;
  unsigned long int enb;
  mp_size_t i, K, j, l, k;
  int m_zero_cnt, e_zero_cnt;
  int sh;
  int use_redc;
#if HANDLE_NEGATIVE_EXPONENT
  mpz_t new_b;
#endif
#if REDUCE_EXPONENT
  mpz_t new_e;
#endif
  TMP_DECL;

  mp = PTR(m);
  mn = ABSIZ (m);
  if (mn == 0)
    DIVIDE_BY_ZERO;

  TMP_MARK;

  es = SIZ (e);
  if (es <= 0)
    {
      if (es == 0)
        {
          /* Exponent is zero, result is 1 mod m, i.e., 1 or 0 depending on if
             m equals 1.  */
          SIZ(r) = (mn == 1 && mp[0] == 1) ? 0 : 1;
          PTR(r)[0] = 1;
          TMP_FREE;     /* we haven't really allocated anything here */
          return;
        }
#if HANDLE_NEGATIVE_EXPONENT
      /* Negative exponent: continue with the inverse of b and |e|.  */
      MPZ_TMP_INIT (new_b, mn + 1);

      if (! mpz_invert (new_b, b, m))
        DIVIDE_BY_ZERO;
      b = new_b;
      es = -es;
#else
      DIVIDE_BY_ZERO;
#endif
    }
  en = es;

#if REDUCE_EXPONENT
  /* Reduce exponent by dividing it by phi(m) when m small.  */
  /* NOTE(review): e is replaced by the 2-limb new_e here, but en keeps
     the old exponent size and is used below as PTR(e)[en - 1] -- confirm
     en is meant to stay unchanged.  */
  if (mn == 1 && mp[0] < 0x7fffffffL && en * GMP_NUMB_BITS > 150)
    {
      MPZ_TMP_INIT (new_e, 2);
      mpz_mod_ui (new_e, e, phi (mp[0]));
      e = new_e;
    }
#endif

  /* REDC is only chosen for an odd modulus below the threshold.  */
  use_redc = mn < POWM_THRESHOLD && mp[0] % 2 != 0;
  if (use_redc)
    {
      /* invm = -1/m mod 2^BITS_PER_MP_LIMB, must have m odd */
      modlimb_invert (invm, mp[0]);
      invm = -invm;
    }
  else
    {
      /* Normalize m (i.e. make its most significant bit set) as required by
         division functions below.  */
      count_leading_zeros (m_zero_cnt, mp[mn - 1]);
      m_zero_cnt -= GMP_NAIL_BITS;
      if (m_zero_cnt != 0)
        {
          mp_ptr new_mp;
          new_mp = TMP_ALLOC_LIMBS (mn);
          mpn_lshift (new_mp, mp, mn, m_zero_cnt);
          mp = new_mp;
        }
    }

  /* Determine optimal value of k, the number of exponent bits we look at
     at a time.  */
  count_leading_zeros (e_zero_cnt, PTR(e)[en - 1]);
  e_zero_cnt -= GMP_NAIL_BITS;
  enb = en * GMP_NUMB_BITS - e_zero_cnt; /* number of bits of exponent */
  /* K tracks 2^k; K / 2 table entries of mn limbs each are allocated
     below.  */
  k = 1;
  K = 2;
  while (2 * enb > K * (2 + k * (3 + k)))
    {
      k++;
      K *= 2;
      if (k == 10)                      /* cap allocation */
        break;
    }

  tp = TMP_ALLOC_LIMBS (2 * mn);
  qp = TMP_ALLOC_LIMBS (mn + 1);

  /* Table of odd powers of the base.  */
  gp = __GMP_ALLOCATE_FUNC_LIMBS (K / 2 * mn);

  /* Compute x*R^n where R=2^BITS_PER_MP_LIMB.  */
  bn = ABSIZ (b);
  bp = PTR(b);
  /* Handle |b| >= m by computing b mod m.  FIXME: It is not strictly necessary
     for speed or correctness to do this when b and m have the same number of
     limbs, perhaps remove mpn_cmp call.  */
  if (bn > mn || (bn == mn && mpn_cmp (bp, mp, mn) >= 0))
    {
      /* Reduce possibly huge base while moving it to gp[0].  Use a function
         call to reduce, since we don't want the quotient allocation to
         live until function return.  */
      if (use_redc)
        {
          reduce (tp + mn, bp, bn, mp, mn);     /* b mod m */
          MPN_ZERO (tp, mn);
          mpn_tdiv_qr (qp, gp, 0L, tp, 2 * mn, mp, mn); /* unnormnalized! */
        }
      else
        {
          reduce (gp, bp, bn, mp, mn);
        }
    }
  else
    {
      /* |b| < m.  We pad out operands to become mn limbs,  which simplifies
         the rest of the function, but slows things down when the |b| << m.  */
      if (use_redc)
        {
          /* tp holds b shifted up by mn limbs; reducing it mod m gives
             the base in the REDC representation.  */
          MPN_ZERO (tp, mn);
          MPN_COPY (tp + mn, bp, bn);
          MPN_ZERO (tp + mn + bn, mn - bn);
          mpn_tdiv_qr (qp, gp, 0L, tp, 2 * mn, mp, mn);
        }
      else
        {
          MPN_COPY (gp, bp, bn);
          MPN_ZERO (gp + bn, mn - bn);
        }
    }

  /* Compute xx^i for odd g < 2^i.  */

  xp = TMP_ALLOC_LIMBS (mn);
  mpn_sqr (tp, gp, mn);
  if (use_redc)
    mpn_redc_1 (xp, tp, mp, mn, invm);          /* xx = x^2*R^n */
  else
    mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn);
  /* Each table entry is the previous entry times the squared base.  */
  this_gp = gp;
  for (i = 1; i < K / 2; i++)
    {
      mpn_mul_n (tp, this_gp, xp, mn);
      this_gp += mn;
      if (use_redc)
        mpn_redc_1 (this_gp,tp, mp, mn, invm);  /* g[i] = x^(2i+1)*R^n */
      else
        mpn_tdiv_qr (qp, this_gp, 0L, tp, 2 * mn, mp, mn);
    }

  /* Start the real stuff.  */
  ep = PTR (e);
  i = en - 1;                           /* current index */
  c = ep[i];                            /* current limb */
  sh = GMP_NUMB_BITS - e_zero_cnt;      /* significant bits in ep[i] */
  sh -= k;                              /* index of lower bit of ep[i] to take into account */
  if (sh < 0)
    {                                   /* k-sh extra bits are needed */
      if (i > 0)
        {
          /* Pull the missing low bits of the window from the next lower
             limb.  */
          i--;
          c <<= (-sh);
          sh += GMP_NUMB_BITS;
          c |= ep[i] >> sh;
        }
    }

/* Check divide and conquer division routine. */ void check_dc_div_qr (void) { mp_limb_t np[2*MAX_LIMBS]; mp_limb_t np2[2*MAX_LIMBS]; mp_limb_t rp[2*MAX_LIMBS+1]; mp_limb_t dp[MAX_LIMBS]; mp_limb_t qp[2*MAX_LIMBS]; mp_limb_t dip, d1ip, cy; mp_size_t nn, rn, dn, qn; gmp_randstate_t rands; int i, j, s; gmp_randinit_default(rands); for (i = 0; i < ITERS; i++) { dn = (random() % (MAX_LIMBS - 5)) + 6; nn = (random() % (MAX_LIMBS - 3)) + dn + 3; mpn_rrandom (np, rands, nn); mpn_rrandom (dp, rands, dn); dp[dn-1] |= GMP_LIMB_HIGHBIT; MPN_COPY(np2, np, nn); mpir_invert_pi2(dip, d1ip, dp[dn - 1], dp[dn - 2]); qn = nn - dn + 1; qp[qn - 1] = mpn_dc_div_qr(qp, np, nn, dp, dn, dip, d1ip); MPN_NORMALIZE(qp, qn); if (qn) { if (qn >= dn) mpn_mul(rp, qp, qn, dp, dn); else mpn_mul(rp, dp, dn, qp, qn); rn = dn + qn; MPN_NORMALIZE(rp, rn); if (rn > nn) { printf("failed: q*d has too many limbs\n"); abort(); } if (mpn_cmp(rp, np2, nn) > 0) { printf("failed: remainder negative\n"); abort(); } mpn_sub(rp, np2, nn, rp, rn); rn = nn; MPN_NORMALIZE(rp, rn); } else { rn = nn; MPN_COPY(rp, np, nn); } s = (rn < dn) ? -1 : (rn > dn) ? 1 : mpn_cmp(rp, dp, dn); if (s >= 0) { printf ("failed:\n"); printf ("nn = %lu, dn = %lu, qn = %lu, rn = %lu\n\n", nn, dn, qn, rn); gmp_printf (" np: %Nx\n\n", np2, nn); gmp_printf (" dp: %Nx\n\n", dp, dn); gmp_printf (" qp: %Nx\n\n", qp, qn); gmp_printf (" rp: %Nx\n\n", rp, rn); abort (); } if (mpn_cmp(rp, np, rn) != 0) { printf("failed: remainder does not match\n"); gmp_printf (" np: %Nx\n\n", np2, nn); gmp_printf (" dp: %Nx\n\n", dp, dn); gmp_printf (" qp: %Nx\n\n", qp, qn); gmp_printf (" rp: %Nx\n\n", rp, rn); gmp_printf (" rp2: %Nx\n\n", np, rn); } } gmp_randclear(rands); }