/* Return src[size-1..0] mod d, for odd d, with an initial "high" value h
   folded in.  Works limb by limb using the 2-adic (binary) inverse of d,
   tracking a 0/1 borrow c across iterations; the final borrow is folded
   into the returned high product.  d must be odd and size >= 1.  */
mp_limb_t
mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h)
{
  mp_limb_t limb, diff, q, inverse, dummy, dmul, borrow1, borrow2;
  mp_limb_t c = 0;          /* running borrow, always 0 or 1 */
  mp_size_t i = 0;

  ASSERT (size >= 1);
  ASSERT (d & 1);

  /* inverse = 1/d mod 2^GMP_NUMB_BITS; exists because d is odd.  */
  binvert_limb (inverse, d);
  dmul = d << GMP_NAIL_BITS;

  while (i < size)
    {
      ASSERT (c == 0 || c == 1);

      limb = src[i];
      /* diff = limb - c - h, accumulating the two borrows separately.  */
      SUBC_LIMB (borrow1, diff, limb, c);
      SUBC_LIMB (borrow2, q, diff, h);
      c = borrow1 + borrow2;

      /* q = diff / d mod B; then h = high(q * d) is the value carried
	 into the next limb.  */
      q = (q * inverse) & GMP_NUMB_MASK;
      umul_ppmm (h, dummy, q, dmul);

      i++;
    }

  /* Fold the final borrow into the returned remainder-related value.  */
  return h + c;
}
/* Compute rp[n-1..0] = 1 / up[n-1..0] mod B^n (the 2-adic inverse).
   Requires up[0] odd (binvert_limb would be undefined otherwise).
   Uses Newton iteration, doubling the precision each step, starting from
   a base-case inverse computed by a quadratic bdiv.  'scratch' must hold
   enough limbs for the intermediate products (itch function elsewhere).  */
void
mpn_binvert (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr scratch)
{
  mp_ptr xp;
  mp_size_t rn, newrn;
  mp_size_t sizes[NPOWS], *sizp;
  mp_limb_t di;

  /* Compute the computation precisions from highest to lowest, leaving the
     base case size in 'rn'.  */
  sizp = sizes;
  for (rn = n; ABOVE_THRESHOLD (rn, BINV_NEWTON_THRESHOLD); rn = (rn + 1) >> 1)
    *sizp++ = rn;

  xp = scratch;

  /* Compute a base value using a low-overhead O(n^2) algorithm.  FIXME: We
     should call some divide-and-conquer lsb division function here for an
     operand subrange.  */
  MPN_ZERO (xp, rn);
  xp[0] = 1;
  binvert_limb (di, up[0]);
  if (BELOW_THRESHOLD (rn, DC_BDIV_Q_THRESHOLD))
    mpn_sb_bdiv_q (rp, xp, rn, up, rn, -di);
  else
    mpn_dc_bdiv_q (rp, xp, rn, up, rn, -di);

  /* Use Newton iterations to get the desired precision.  */
  for (; rn < n; rn = newrn)
    {
      newrn = *--sizp;

#if WANT_FFT
      if (ABOVE_THRESHOLD (newrn, 2 * MUL_FFT_MODF_THRESHOLD))
	{
	  int k;
	  mp_size_t m, i;

	  /* Wraparound FFT product u * r mod (B^m + 1); the scan below
	     corrects the high part when the true low half is not the
	     expected 1,0,...,0 pattern (wraparound borrow adjustment).  */
	  k = mpn_fft_best_k (newrn, 0);
	  m = mpn_fft_next_size (newrn, k);
	  mpn_mul_fft (xp, m, up, newrn, rp, rn, k);
	  for (i = rn - 1; i >= 0; i--)
	    if (xp[i] > (i == 0))
	      {
		mpn_add_1 (xp + rn, xp + rn, newrn - rn, 1);
		break;
	      }
	}
      else
#endif
	mpn_mul (xp, up, newrn, rp, rn);

      /* Newton step: extend the inverse from rn to newrn limbs using only
	 the high part of the product.  */
      mpn_mullow_n (rp + rn, rp, xp + rn, newrn - rn);
      mpn_neg_n (rp + rn, rp + rn, newrn - rn);
    }
}
/* Random test driver for the mpn bdiv (Hensel / 2-adic division) family:
   mpn_sbpi1_bdiv_qr/q, mpn_dcpi1_bdiv_qr/q, mpn_bdiv_qr, mpn_mu_bdiv_qr/q.
   Generates random numerators/odd divisors, occasionally forcing corner
   cases, and verifies each result via check_one plus write-guard limbs
   around the q/r areas and past the scratch area.
   NOTE(review): the loop counter 'test' is not declared among the locals
   here -- presumably a file-scope variable; confirm in the full file.  */
int
main (int argc, char **argv)
{
  gmp_randstate_ptr rands;
  unsigned long maxnbits, maxdbits, nbits, dbits;
  mpz_t n, d, tz;
  mp_size_t maxnn, maxdn, nn, dn, clearn, i;
  mp_ptr np, dp, qp, rp;
  mp_limb_t rh;
  mp_limb_t t;
  mp_limb_t dinv;
  int count = COUNT;
  mp_ptr scratch;
  mp_limb_t ran;
  mp_size_t alloc, itch;
  mp_limb_t rran0, rran1, qran0, qran1;
  TMP_DECL;

  /* Optional argv[1] overrides the default iteration count.  */
  if (argc > 1)
    {
      char *end;
      count = strtol (argv[1], &end, 0);
      if (*end || count <= 0)
	{
	  fprintf (stderr, "Invalid test count: %s.\n", argv[1]);
	  return 1;
	}
    }

  maxdbits = MAX_DN;
  maxnbits = MAX_NN;

  tests_start ();
  rands = RANDS;

  mpz_init (n);
  mpz_init (d);
  mpz_init (tz);

  maxnn = maxnbits / GMP_NUMB_BITS + 1;
  maxdn = maxdbits / GMP_NUMB_BITS + 1;

  TMP_MARK;
  /* One extra limb on each side of qp/rp; the sentinel limbs qp[-1],
     qp[nn-dn+1], rp[-1], rp[dn] are checked after each call to detect
     out-of-bounds writes.  */
  qp = TMP_ALLOC_LIMBS (maxnn + 2) + 1;
  rp = TMP_ALLOC_LIMBS (maxnn + 2) + 1;

  alloc = 1;
  scratch = __GMP_ALLOCATE_FUNC_LIMBS (alloc);

  for (test = 0; test < count;)
    {
      /* Pick sizes: numerator at least two limbs, divisor no wider.  */
      nbits = random_word (rands) % (maxnbits - GMP_NUMB_BITS) + 2 * GMP_NUMB_BITS;
      if (maxdbits > nbits)
	dbits = random_word (rands) % nbits + 1;
      else
	dbits = random_word (rands) % maxdbits + 1;

#if RAND_UNIFORM
#define RANDFUNC mpz_urandomb
#else
#define RANDFUNC mpz_rrandomb
#endif

      do
	{
	  RANDFUNC (n, rands, nbits);
	  do
	    {
	      RANDFUNC (d, rands, dbits);
	    }
	  while (mpz_sgn (d) == 0);

	  np = PTR (n);
	  dp = PTR (d);
	  nn = SIZ (n);
	  dn = SIZ (d);
	}
      while (nn < dn);

      /* bdiv requires an odd divisor.  */
      dp[0] |= 1;

      mpz_urandomb (tz, rands, 32);
      t = mpz_get_ui (tz);

      /* Occasionally force the extreme low divisor limb.  */
      if (t % 17 == 0)
	dp[0] = GMP_NUMB_MAX;

      /* Occasionally build corner-case numerators: low zero limbs, or a
	 numerator whose high part is close to the divisor.  */
      switch ((int) t % 16)
	{
	case 0:
	  clearn = random_word (rands) % nn;
	  for (i = 0; i <= clearn; i++)
	    np[i] = 0;
	  break;
	case 1:
	  mpn_sub_1 (np + nn - dn, dp, dn, random_word (rands));
	  break;
	case 2:
	  mpn_add_1 (np + nn - dn, dp, dn, random_word (rands));
	  break;
	}

      test++;

      binvert_limb (dinv, dp[0]);

      rran0 = random_word (rands);
      rran1 = random_word (rands);
      qran0 = random_word (rands);
      qran1 = random_word (rands);

      qp[-1] = qran0;
      qp[nn - dn + 1] = qran1;
      rp[-1] = rran0;

      ran = random_word (rands);

      /* Only run the quadratic schoolbook functions on small problems.  */
      if ((double) (nn - dn) * dn < 1e5)
	{
	  if (nn > dn)
	    {
	      /* Test mpn_sbpi1_bdiv_qr */
	      MPN_ZERO (qp, nn - dn);
	      MPN_ZERO (rp, dn);
	      MPN_COPY (rp, np, nn);
	      rh = mpn_sbpi1_bdiv_qr (qp, rp, nn, dp, dn, -dinv);
	      ASSERT_ALWAYS (qp[-1] == qran0);
	      ASSERT_ALWAYS (qp[nn - dn + 1] == qran1);
	      ASSERT_ALWAYS (rp[-1] == rran0);
	      check_one (qp, rp + nn - dn, rh, np, nn, dp, dn, "mpn_sbpi1_bdiv_qr");
	    }

	  if (nn > dn)
	    {
	      /* Test mpn_sbpi1_bdiv_q */
	      MPN_COPY (rp, np, nn);
	      MPN_ZERO (qp, nn - dn);
	      mpn_sbpi1_bdiv_q (qp, rp, nn - dn, dp, MIN(dn,nn-dn), -dinv);
	      ASSERT_ALWAYS (qp[-1] == qran0);
	      ASSERT_ALWAYS (qp[nn - dn + 1] == qran1);
	      ASSERT_ALWAYS (rp[-1] == rran0);
	      check_one (qp, NULL, 0, np, nn, dp, dn, "mpn_sbpi1_bdiv_q");
	    }
	}

      if (dn >= 4 && nn - dn >= 2)
	{
	  /* Test mpn_dcpi1_bdiv_qr */
	  MPN_COPY (rp, np, nn);
	  MPN_ZERO (qp, nn - dn);
	  rh = mpn_dcpi1_bdiv_qr (qp, rp, nn, dp, dn, -dinv);
	  ASSERT_ALWAYS (qp[-1] == qran0);
	  ASSERT_ALWAYS (qp[nn - dn + 1] == qran1);
	  ASSERT_ALWAYS (rp[-1] == rran0);
	  check_one (qp, rp + nn - dn, rh, np, nn, dp, dn, "mpn_dcpi1_bdiv_qr");
	}

      if (dn >= 4 && nn - dn >= 2)
	{
	  /* Test mpn_dcpi1_bdiv_q */
	  MPN_COPY (rp, np, nn);
	  MPN_ZERO (qp, nn - dn);
	  mpn_dcpi1_bdiv_q (qp, rp, nn - dn, dp, MIN(dn,nn-dn), -dinv);
	  ASSERT_ALWAYS (qp[-1] == qran0);
	  ASSERT_ALWAYS (qp[nn - dn + 1] == qran1);
	  ASSERT_ALWAYS (rp[-1] == rran0);
	  check_one (qp, NULL, 0, np, nn, dp, dn, "mpn_dcpi1_bdiv_q");
	}

      if (nn > dn)
	{
	  /* Test mpn_bdiv_qr */
	  itch = mpn_bdiv_qr_itch (nn, dn);
	  if (itch + 1 > alloc)
	    {
	      scratch = __GMP_REALLOCATE_FUNC_LIMBS (scratch, alloc, itch + 1);
	      alloc = itch + 1;
	    }
	  /* Sentinel just past the declared scratch need, checked below.  */
	  scratch[itch] = ran;
	  MPN_ZERO (qp, nn - dn);
	  MPN_ZERO (rp, dn);
	  rp[dn] = rran1;
	  rh = mpn_bdiv_qr (qp, rp, np, nn, dp, dn, scratch);
	  ASSERT_ALWAYS (ran == scratch[itch]);
	  ASSERT_ALWAYS (qp[-1] == qran0);
	  ASSERT_ALWAYS (qp[nn - dn + 1] == qran1);
	  ASSERT_ALWAYS (rp[-1] == rran0);
	  ASSERT_ALWAYS (rp[dn] == rran1);
	  check_one (qp, rp, rh, np, nn, dp, dn, "mpn_bdiv_qr");
	}

      /* The mu functions below need nn - dn >= 2 and dn >= 2.  */
      if (nn - dn < 2 || dn < 2)
	continue;

      /* Test mpn_mu_bdiv_qr */
      itch = mpn_mu_bdiv_qr_itch (nn, dn);
      if (itch + 1 > alloc)
	{
	  scratch = __GMP_REALLOCATE_FUNC_LIMBS (scratch, alloc, itch + 1);
	  alloc = itch + 1;
	}
      scratch[itch] = ran;
      MPN_ZERO (qp, nn - dn);
      MPN_ZERO (rp, dn);
      rp[dn] = rran1;
      rh = mpn_mu_bdiv_qr (qp, rp, np, nn, dp, dn, scratch);
      ASSERT_ALWAYS (ran == scratch[itch]);
      ASSERT_ALWAYS (qp[-1] == qran0);
      ASSERT_ALWAYS (qp[nn - dn + 1] == qran1);
      ASSERT_ALWAYS (rp[-1] == rran0);
      ASSERT_ALWAYS (rp[dn] == rran1);
      check_one (qp, rp, rh, np, nn, dp, dn, "mpn_mu_bdiv_qr");

      /* Test mpn_mu_bdiv_q */
      itch = mpn_mu_bdiv_q_itch (nn, dn);
      if (itch + 1 > alloc)
	{
	  scratch = __GMP_REALLOCATE_FUNC_LIMBS (scratch, alloc, itch + 1);
	  alloc = itch + 1;
	}
      scratch[itch] = ran;
      MPN_ZERO (qp, nn - dn + 1);
      mpn_mu_bdiv_q (qp, np, nn - dn, dp, dn, scratch);
      ASSERT_ALWAYS (ran == scratch[itch]);
      ASSERT_ALWAYS (qp[-1] == qran0);
      ASSERT_ALWAYS (qp[nn - dn + 1] == qran1);
      check_one (qp, NULL, 0, np, nn, dp, dn, "mpn_mu_bdiv_q");
    }

  __GMP_FREE_FUNC_LIMBS (scratch, alloc);

  TMP_FREE;

  mpz_clear (n);
  mpz_clear (d);
  mpz_clear (tz);
  tests_end ();
  return 0;
}
/* Computes a^{1/k - 1} (mod B^n). Both a and k must be odd.

   Iterates

     r' <-- r - r * (a^{k-1} r^k - 1) / k

   If a^{k-1} r^k = 1 (mod 2^m), then a^{k-1} r'^k = 1 (mod 2^{2m}),

   Compute the update term as

     r' = r - (a^{k-1} r^{k+1} - r) / k

   where we still have cancellation of low limbs.
 */
void
mpn_broot_invm1 (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t k)
{
  mp_size_t sizes[GMP_LIMB_BITS * 2];
  mp_ptr akm1, tp, rnp, ep;
  mp_limb_t a0, r0, km1, kp1h, kinv;
  mp_size_t rn;
  unsigned i;

  TMP_DECL;

  ASSERT (n > 0);
  ASSERT (ap[0] & 1);
  ASSERT (k & 1);
  ASSERT (k >= 3);

  TMP_MARK;

  akm1 = TMP_ALLOC_LIMBS (4*n);
  tp = akm1 + n;

  km1 = k-1;
  /* FIXME: Could arrange the iteration so we don't need to compute this
     up front, computing a^{k-1} * r^k as (a r)^{k-1} * r. Note that we
     can use wraparound also for a*r, since the low half is unchanged from
     the previous iteration. Or possibly mulmid. Also, a r = a^{1/k}, so
     we get that value too, for free? */
  mpn_powlo (akm1, ap, &km1, 1, n, tp); /* 3 n scratch space */

  a0 = ap[0];
  binvert_limb (kinv, k);   /* kinv = 1/k mod B; valid since k is odd */

  /* 4 bits: a^{1/k - 1} (mod 16):

	a % 8
	1 3 5 7
   k%4 +-------
     1 |1 1 1 1
     3 |1 9 9 1
  */
  r0 = 1 + (((k << 2) & ((a0 << 1) ^ (a0 << 2))) & 8);
  /* Each line below is one limb-sized Newton step, doubling the number
     of correct low bits of r0.  */
  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7f)); /* 8 bits */
  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7fff)); /* 16 bits */
  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k)); /* 32 bits */
#if GMP_NUMB_BITS > 32
  {
    unsigned prec = 32;
    do
      {
	r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k));
	prec *= 2;
      }
    while (prec < GMP_NUMB_BITS);
  }
#endif

  rp[0] = r0;
  if (n == 1)
    {
      TMP_FREE;
      return;
    }

  /* For odd k, (k+1)/2 = k/2+1, and the latter avoids overflow. */
  kp1h = k/2 + 1;

  /* FIXME: Special case for two limb iteration. */
  rnp = TMP_ALLOC_LIMBS (2*n + 1);
  ep = rnp + n;

  /* Precision schedule: halve n repeatedly, then replay from smallest.
     FIXME: Possible to this on the fly with some bit fiddling. */
  for (i = 0; n > 1; n = (n + 1)/2)
    sizes[i++] = n;

  rn = 1;

  while (i-- > 0)
    {
      /* Compute x^{k+1} as (x^2)^{(k+1)/2}. */
      mpn_sqr (ep, rp, rn); /* For odd n, writes n+1 limbs in the final iteration. */
      mpn_powlo (rnp, ep, &kp1h, 1, sizes[i], tp);

      /* Multiply by a^{k-1}. Can use wraparound; low part equals r. */
      mpn_mullo_n (ep, rnp, akm1, sizes[i]);
      ASSERT (mpn_cmp (ep, rp, rn) == 0);

      /* Update only the new high limbs: divide the (cancelled) high part
	 by k 2-adically and negate, extending r from rn to sizes[i].  */
      ASSERT (sizes[i] <= 2*rn);
      mpn_pi1_bdiv_q_1 (rp + rn, ep + rn, sizes[i] - rn, k, kinv, 0);
      mpn_neg (rp + rn, rp + rn, sizes[i] - rn);
      rn = sizes[i];
    }

  TMP_FREE;
}
/* Return src[size-1..0] mod d, with carry-in orig_c, for odd d.
   Generic C variant: runs the inverse-based loop over the first size-1
   limbs, then handles the final limb specially -- if it is <= d the last
   multiply can be replaced by a subtract-with-addback.  */
mp_limb_t
mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t orig_c)
{
  mp_limb_t  s, h, l, inverse, dummy, dmul, ret;
  mp_limb_t  c = orig_c;
  mp_size_t  i;

  ASSERT (size >= 1);
  ASSERT (d & 1);
  ASSERT_MPN (src, size);
  ASSERT_LIMB (d);
  ASSERT_LIMB (c);

  /* One limb: a plain hardware remainder beats the inverse machinery.  */
  if (size == 1)
    {
      s = src[0];
      if (s > c)
	{
	  l = s-c;
	  h = l % d;
	  /* Negate the remainder, mod d, to account for s-c > 0.  */
	  if (h != 0)
	    h = d - h;
	}
      else
	{
	  l = c-s;
	  h = l % d;
	}
      return h;
    }

  binvert_limb (inverse, d);   /* 1/d mod B, d odd */
  dmul = d << GMP_NAIL_BITS;

  /* Main loop over limbs 0 .. size-2; c accumulates the carry, h is the
     high product carried to the next limb.  */
  i = 0;
  do
    {
      s = src[i];
      SUBC_LIMB (c, l, s, c);
      l = (l * inverse) & GMP_NUMB_MASK;
      umul_ppmm (h, dummy, l, dmul);
      c += h;
    }
  while (++i < size-1);

  s = src[i];
  if (s <= d)
    {
      /* With high<=d the final step can be a subtract and addback.  If
	 c==0 then the addback will restore to l>=0.  If c==d then will
	 get l==d if s==0, but that's ok per the function definition.  */
      l = c - s;
      if (c < s)
	l += d;
      ret = l;
    }
  else
    {
      /* Can't skip a divide, just do the loop code once more. */
      SUBC_LIMB (c, l, s, c);
      l = (l * inverse) & GMP_NUMB_MASK;
      umul_ppmm (h, dummy, l, dmul);
      c += h;
      ret = c;
    }

  ASSERT (orig_c < d ? ret < d : ret <= d);
  return ret;
}
/* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0]
   Requires that mp[n-1..0] is odd.  FIXME: is this true?
   Requires that ep[en-1..0] is > 1.
   Uses scratch space at tp of 3n+1 limbs.
   NOTE(review): the code below indexes tp + 6*n and places the power
   table at tp + 4*n, so the "3n+1" figure looks stale -- confirm the
   actual scratch requirement against the caller.

   Side-channel-hardened variant: fixed window, redc_1, and (when
   WANT_CACHE_SECURITY) table selection via mpn_tabselect plus a
   conditional-subtract finish, avoiding secret-dependent branches.  */
void
mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn, mp_srcptr ep, mp_size_t en, mp_srcptr mp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t minv;
  int cnt;
  mp_bitcnt_t ebi;
  int windowsize, this_windowsize;
  mp_limb_t expbits;
  mp_ptr pp, this_pp;
  long i;
  int cnd;

  ASSERT (en > 1 || (en == 1 && ep[0] > 0));
  ASSERT (n >= 1 && ((mp[0] & 1) != 0));

  /* ebi = exact bit length of the exponent.  */
  count_leading_zeros (cnt, ep[en - 1]);
  ebi = (mp_bitcnt_t) en * GMP_LIMB_BITS - cnt;

  windowsize = win_size (ebi);

  /* minv = -1/m mod B, for Montgomery reduction.  */
  binvert_limb (minv, mp[0]);
  minv = -minv;

  pp = tp + 4 * n;

  /* pp[0..n-1] = R mod m (Montgomery 1), pp[n..2n-1] = b in Montgomery
     form.  */
  this_pp = pp;
  this_pp[n] = 1;
  redcify (this_pp, this_pp + n, 1, mp, n, tp + 6 * n);
  this_pp += n;
  redcify (this_pp, bp, bn, mp, n, tp + 6 * n);

  /* Precompute powers of b and put them in the temporary area at pp.  */
  for (i = (1 << windowsize) - 2; i > 0; i--)
    {
      mpn_mul_basecase (tp, this_pp, n, pp + n, n);
      this_pp += n;
      mpn_redc_1_sec (this_pp, tp, mp, n, minv);
    }

  /* Peel off the top (possibly partial) window.  */
  expbits = getbits (ep, ebi, windowsize);
  if (ebi < windowsize)
    ebi = 0;
  else
    ebi -= windowsize;

#if WANT_CACHE_SECURITY
  mpn_tabselect (rp, pp, n, 1 << windowsize, expbits);
#else
  MPN_COPY (rp, pp + n * expbits, n);
#endif

  /* Main square-and-multiply loop, one window per iteration.  */
  while (ebi != 0)
    {
      expbits = getbits (ep, ebi, windowsize);
      this_windowsize = windowsize;
      if (ebi < windowsize)
	{
	  /* Last window is partial: square only ebi times.  */
	  this_windowsize -= windowsize - ebi;
	  ebi = 0;
	}
      else
	ebi -= windowsize;

      do
	{
	  mpn_local_sqr (tp, rp, n, tp + 2 * n);
	  mpn_redc_1_sec (rp, tp, mp, n, minv);
	  this_windowsize--;
	}
      while (this_windowsize != 0);

#if WANT_CACHE_SECURITY
      mpn_tabselect (tp + 2*n, pp, n, 1 << windowsize, expbits);
      mpn_mul_basecase (tp, rp, n, tp + 2*n, n);
#else
      mpn_mul_basecase (tp, rp, n, pp + n * expbits, n);
#endif
      mpn_redc_1_sec (rp, tp, mp, n, minv);
    }

  /* Convert out of Montgomery form, then reduce to [0, m) with a
     branch-free conditional subtract.  */
  MPN_COPY (tp, rp, n);
  MPN_ZERO (tp + n, n);
  mpn_redc_1_sec (rp, tp, mp, n, minv);
  cnd = mpn_sub_n (tp, rp, mp, n); /* we need just retval */
  mpn_subcnd_n (rp, rp, mp, n, !cnd);
}
/* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0]
   Requires that mp[n-1..0] is odd.
   Requires that ep[en-1..0] is > 1.
   Uses scratch space tp[3n..0], i.e., 3n+1 words.

   Fixed-window Montgomery variant dispatching between redc_1 and redc_2
   via MPN_REDC_X / redc_x.
   NOTE(review): if HAVE_NATIVE_mpn_addmul_2 is not defined and
   n >= REDC_2_THRESHOLD, redc_x is left uninitialized -- confirm the
   build configuration makes that combination impossible.
   NOTE(review): the final compare-and-subtract branches on the result
   value, so this variant does not look constant-time -- verify intended
   use.  */
void
mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn, mp_srcptr ep, mp_size_t en, mp_srcptr mp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t mip[2];
  int cnt;
  long ebi;
  int windowsize, this_windowsize;
  mp_limb_t expbits;
  mp_ptr pp, this_pp, last_pp;
  long i;
  int redc_x;
  TMP_DECL;

  ASSERT (en > 1 || (en == 1 && ep[0] > 1));
  ASSERT (n >= 1 && ((mp[0] & 1) != 0));

  TMP_MARK;

  /* ebi = exact bit length of the exponent.  */
  count_leading_zeros (cnt, ep[en - 1]);
  ebi = en * GMP_LIMB_BITS - cnt;

  windowsize = win_size (ebi);

  /* Choose the REDC flavour and compute the matching inverse(s) of m.  */
  if (BELOW_THRESHOLD (n, REDC_2_THRESHOLD))
    {
      binvert_limb (mip[0], mp[0]);
      mip[0] = -mip[0];
      redc_x = 1;
    }
#if defined (HAVE_NATIVE_mpn_addmul_2)
  else
    {
      /* Two-limb inverse for redc_2.  */
      mpn_binvert (mip, mp, 2, tp);
      mip[0] = -mip[0];
      mip[1] = ~mip[1];
      redc_x = 2;
    }
#endif
#if 0
  mpn_binvert (mip, mp, n, tp);
  redc_x = 0;
#endif

  /* Power table: 2^windowsize entries of n limbs each.  */
  pp = TMP_ALLOC_LIMBS (n << windowsize);

  this_pp = pp;
  this_pp[n] = 1;
  redcify (this_pp, this_pp + n, 1, mp, n);   /* pp[0..n-1] = Montgomery 1 */
  this_pp += n;
  redcify (this_pp, bp, bn, mp, n);           /* Montgomery form of b */

  /* Precompute powers of b and put them in the temporary area at pp.  */
  for (i = (1 << windowsize) - 2; i > 0; i--)
    {
      last_pp = this_pp;
      this_pp += n;
      mpn_mul_n (tp, last_pp, pp + n, n);
      MPN_REDC_X (this_pp, tp, mp, n, mip);
    }

  /* Peel off the top (possibly partial) window.  */
  expbits = getbits (ep, ebi, windowsize);
  ebi -= windowsize;
  if (ebi < 0)
    ebi = 0;

  MPN_COPY (rp, pp + n * expbits, n);

  /* Main square-and-multiply loop, one window per iteration.  */
  while (ebi != 0)
    {
      expbits = getbits (ep, ebi, windowsize);
      ebi -= windowsize;
      this_windowsize = windowsize;
      if (ebi < 0)
	{
	  /* Last window is partial; ebi is negative here, so this adds
	     the shortfall.  */
	  this_windowsize += ebi;
	  ebi = 0;
	}

      do
	{
	  mpn_sqr_n (tp, rp, n);
	  MPN_REDC_X (rp, tp, mp, n, mip);
	  this_windowsize--;
	}
      while (this_windowsize != 0);

#if WANT_CACHE_SECURITY
      mpn_tabselect (tp + 2*n, pp, n, 1 << windowsize, expbits);
      mpn_mul_n (tp, rp, tp + 2*n, n);
#else
      mpn_mul_n (tp, rp, pp + n * expbits, n);
#endif
      MPN_REDC_X (rp, tp, mp, n, mip);
    }

  /* Convert out of Montgomery form and fully reduce.  */
  MPN_COPY (tp, rp, n);
  MPN_ZERO (tp + n, n);
  MPN_REDC_X (rp, tp, mp, n, mip);
  if (mpn_cmp (rp, mp, n) >= 0)
    mpn_sub_n (rp, rp, mp, n);
  TMP_FREE;
}
/* 2-adic (Hensel) division: divide out d low bits of {up,usize} by
   {vp,vsize}, writing d/GMP_NUMB_BITS quotient limbs (plus a partial
   final limb, returned) to qp, and leaving the residue in up.  Requires
   vp[0] odd (binvert_limb below).  up is clobbered.  */
mp_limb_t
mpn_bdivmod (mp_ptr qp, mp_ptr up, mp_size_t usize, mp_srcptr vp, mp_size_t vsize, unsigned long int d)
{
  mp_limb_t v_inv;

  ASSERT (usize >= 1);
  ASSERT (vsize >= 1);
  ASSERT (usize * GMP_NUMB_BITS >= d);
  ASSERT (! MPN_OVERLAP_P (up, usize, vp, vsize));
  ASSERT (! MPN_OVERLAP_P (qp, d/GMP_NUMB_BITS, vp, vsize));
  ASSERT (MPN_SAME_OR_INCR2_P (qp, d/GMP_NUMB_BITS, up, usize));
  ASSERT_MPN (up, usize);
  ASSERT_MPN (vp, vsize);

  /* 1/V mod 2^GMP_NUMB_BITS. */
  binvert_limb (v_inv, vp[0]);

  /* Fast code for two cases previously used by the accel part of mpn_gcd.
     (Could probably remove this now it's inlined there.) */
  if (usize == 2 && vsize == 2 && (d == GMP_NUMB_BITS || d == 2*GMP_NUMB_BITS))
    {
      mp_limb_t hi, lo;
      mp_limb_t q = (up[0] * v_inv) & GMP_NUMB_MASK;
      umul_ppmm (hi, lo, q, vp[0] << GMP_NAIL_BITS);
      up[0] = 0;
      up[1] -= hi + q*vp[1];
      qp[0] = q;
      if (d == 2*GMP_NUMB_BITS)
	{
	  q = (up[1] * v_inv) & GMP_NUMB_MASK;
	  up[1] = 0;
	  qp[1] = q;
	}
      return 0;
    }

  /* Main loop: one full quotient limb per iteration.  q makes the low
     limb of up vanish; propagate the borrow through the rest of up.  */
  while (d >= GMP_NUMB_BITS)
    {
      mp_limb_t q = (up[0] * v_inv) & GMP_NUMB_MASK;
      mp_limb_t b = mpn_submul_1 (up, vp, MIN (usize, vsize), q);
      if (usize > vsize)
	mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
      d -= GMP_NUMB_BITS;
      up += 1, usize -= 1;
      *qp++ = q;
    }

  /* Partial final limb of fewer than GMP_NUMB_BITS bits, returned rather
     than stored.  q==0 and q==1 avoid a full submul.  */
  if (d)
    {
      mp_limb_t b;
      mp_limb_t q = (up[0] * v_inv) & (((mp_limb_t)1<<d) - 1);
      if (q <= 1)
	{
	  if (q == 0)
	    return 0;
	  else
	    b = mpn_sub_n (up, up, vp, MIN (usize, vsize));
	}
      else
	b = mpn_submul_1 (up, vp, MIN (usize, vsize), q);

      if (usize > vsize)
	mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
      return q;
    }

  return 0;
}
/* Return src[size-1..0] mod d, with carry-in orig_c, for odd d.
   Variant that splits on the magnitude of d: when d fits in 32 bits a
   cheaper half-width multiply (umul_ppmm_half_lowequal) suffices for the
   high product; otherwise the full split-limb form is used.  */
mp_limb_t
mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t orig_c)
{
  mp_limb_t  c = orig_c;
  mp_limb_t  s, l, q, h, inverse;

  ASSERT (size >= 1);
  ASSERT (d & 1);
  ASSERT_MPN (src, size);
  ASSERT_LIMB (d);
  ASSERT_LIMB (c);

  /* udivx is faster than 10 or 12 mulx's for one limb via an inverse */
  if (size == 1)
    {
      s = src[0];
      if (s > c)
	{
	  l = s-c;
	  h = l % d;
	  if (h != 0)
	    h = d - h;
	}
      else
	{
	  l = c-s;
	  h = l % d;
	}
      return h;
    }

  binvert_limb (inverse, d);   /* 1/d mod B, d odd */

  if (d <= 0xFFFFFFFF)
    {
      /* d fits in 32 bits: half-width high-product path.  The loop runs
	 over the first size-1 limbs, reading one limb ahead into s.  */
      s = *src++;
      size--;
      do
	{
	  SUBC_LIMB (c, l, s, c);
	  s = *src++;
	  q = l * inverse;
	  umul_ppmm_half_lowequal (h, q, d, l);
	  c += h;
	  size--;
	}
      while (size != 0);

      if (s <= d)
	{
	  /* With high s <= d the final step can be a subtract and
	     addback.  If c==0 then the addback will restore to l>=0.
	     If c==d then will get l==d if s==0, but that's ok per the
	     function definition.  */
	  l = c - s;
	  l += (l > c ? d : 0);
	  ASSERT_RETVAL (l);
	  return l;
	}
      else
	{
	  /* Can't skip a divide, just do the loop code once more. */
	  SUBC_LIMB (c, l, s, c);
	  q = l * inverse;
	  umul_ppmm_half_lowequal (h, q, d, l);
	  c += h;
	  ASSERT_RETVAL (c);
	  return c;
	}
    }
  else
    {
      /* Full-width d: split into 32-bit halves for the high product.  */
      mp_limb_t  dl = LOW32 (d);
      mp_limb_t  dh = HIGH32 (d);
      long  i;   /* NOTE(review): appears unused in this branch */

      s = *src++;
      size--;
      do
	{
	  SUBC_LIMB (c, l, s, c);
	  s = *src++;
	  q = l * inverse;
	  umul_ppmm_lowequal (h, q, d, dh, dl, l);
	  c += h;
	  size--;
	}
      while (size != 0);

      if (s <= d)
	{
	  /* With high s <= d the final step can be a subtract and
	     addback.  If c==0 then the addback will restore to l>=0.
	     If c==d then will get l==d if s==0, but that's ok per the
	     function definition.  */
	  l = c - s;
	  l += (l > c ? d : 0);
	  ASSERT_RETVAL (l);
	  return l;
	}
      else
	{
	  /* Can't skip a divide, just do the loop code once more. */
	  SUBC_LIMB (c, l, s, c);
	  q = l * inverse;
	  umul_ppmm_lowequal (h, q, d, dh, dl, l);
	  c += h;
	  ASSERT_RETVAL (c);
	  return c;
	}
    }
}
/* Compute rp[bn-1..0] = yp[bn-1..0] ^ (-1/k) mod B^bn for odd k, by
   Newton iteration on a 4-bit seed.  tp must provide 3*bn scratch limbs
   (tp, tp2, tp3 below).  Requires bn > 0 and k odd; yp[0] must be odd
   for the 2-adic inverse to exist -- presumably guaranteed by callers,
   confirm (not asserted here).  */
void
mpn_brootinv (mp_ptr rp, mp_srcptr yp, mp_size_t bn, mp_limb_t k, mp_ptr tp)
{
  mp_ptr tp2, tp3;
  mp_limb_t kinv, k2, r0, y0;
  mp_size_t order[GMP_LIMB_BITS + 1];
  int i, d;

  ASSERT (bn > 0);
  ASSERT ((k & 1) != 0);

  tp2 = tp + bn;
  tp3 = tp + 2 * bn;
  k2 = k + 1;

  binvert_limb (kinv, k);   /* kinv = 1/k mod B */

  /* 4-bit initial approximation:

   y%16 | 1  3  5  7  9 11 13 15,
    k%4 +-------------------------+k2%4
     1  | 1 11 13  7  9  3  5 15  |  2
     3  | 1  3  5  7  9 11 13 15  |  0

  */
  y0 = yp[0];

  r0 = y0 ^ (((y0 << 1) ^ (y0 << 2)) & (k2 << 2) & 8); /* 4 bits */
  /* Single-limb Newton steps: each doubles the correct low bits.  */
  r0 = kinv * (k2 * r0 - y0 * powlimb(r0, k2 & 0x7f)); /* 8 bits */
  r0 = kinv * (k2 * r0 - y0 * powlimb(r0, k2 & 0x7fff)); /* 16 bits */
#if GMP_NUMB_BITS > 16
  {
    unsigned prec = 16;
    do
      {
	r0 = kinv * (k2 * r0 - y0 * powlimb(r0, k2));
	prec *= 2;
      }
    while (prec < GMP_NUMB_BITS);
  }
#endif

  rp[0] = r0;
  if (bn == 1)
    return;

  /* This initialization doesn't matter for the result (any garbage is
     cancelled in the iteration), but proper initialization makes valgrind
     happier. */
  MPN_ZERO (rp+1, bn-1);

  /* Precision schedule: sizes from bn down to 2, replayed smallest
     first.  */
  d = 0;
  for (; bn > 1; bn = (bn + 1) >> 1)
    order[d++] = bn;

  for (i = d - 1; i >= 0; i--)
    {
      /* One multi-limb Newton step at bn = order[i] limbs:
	 r <- ( (k+1)*r - y*r^{k+1} ) / k, computed mod B^bn with the
	 division done 2-adically via pi1_bdiv_q_1.  */
      bn = order[i];

      mpn_mul_1 (tp, rp, bn, k2);
      mpn_powlo (tp2, rp, &k2, 1, bn, tp3);
      mpn_mullo_n (rp, yp, tp2, bn);

      mpn_sub_n (tp2, tp, rp, bn);
      mpn_pi1_bdiv_q_1 (rp, tp2, bn, k, kinv, 0);
    }
}