static int ref_modinv (mp_limb_t *rp, const mp_limb_t *ap, const mp_limb_t *mp, mp_size_t mn) { mp_limb_t tp[4*(mn+1)]; mp_limb_t *up = tp; mp_limb_t *vp = tp + mn+1; mp_limb_t *gp = tp + 2*(mn+1); mp_limb_t *sp = tp + 3*(mn+1); mp_size_t gn, sn; mpn_copyi (up, ap, mn); mpn_copyi (vp, mp, mn); gn = mpn_gcdext (gp, sp, &sn, up, mn, vp, mn); if (gn != 1 || gp[0] != 1) return 0; if (sn < 0) mpn_sub (sp, mp, mn, sp, -sn); else if (sn < mn) /* Zero-pad. */ mpn_zero (sp + sn, mn - sn); mpn_copyi (rp, sp, mn); return 1; }
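/* Illustrative only, not from the original test suite: a minimal sanity
   check for ref_modinv() above, assuming the function is in scope.
   7^-1 mod 30 is 13, since 7*13 = 91 = 3*30 + 1. */
#include <assert.h>
#include <gmp.h>

int check_ref_modinv_once (void)
{
  mp_limb_t a[1] = { 7 }, m[1] = { 30 }, r[1];
  if (!ref_modinv (r, a, m, 1))
    return 0;                       /* gcd(a,m) != 1 would land here */
  assert ((7 * r[0]) % 30 == 1);    /* r[0] == 13 */
  return 1;
}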
/* Computes R -= A * B. Result must be non-negative. Normalized down to size an, and resulting size is returned. */ static mp_size_t submul (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn) { mp_ptr tp; TMP_DECL; ASSERT (bn > 0); ASSERT (an >= bn); ASSERT (rn >= an); ASSERT (an + bn <= rn + 1); TMP_MARK; tp = TMP_ALLOC_LIMBS (an + bn); mpn_mul (tp, ap, an, bp, bn); if (an + bn > rn) { ASSERT (tp[rn] == 0); bn--; } ASSERT_NOCARRY (mpn_sub (rp, rp, rn, tp, an + bn)); TMP_FREE; while (rn > an && (rp[rn-1] == 0)) rn--; return rn; }
static int modinv_gcd (const struct ecc_curve *ecc, mp_limb_t *rp, mp_limb_t *ap, mp_limb_t *tp) { mp_size_t size = ecc->p.size; mp_limb_t *up = tp; mp_limb_t *vp = tp + size+1; mp_limb_t *gp = tp + 2*(size+1); mp_limb_t *sp = tp + 3*(size+1); mp_size_t gn, sn; mpn_copyi (up, ap, size); mpn_copyi (vp, ecc->p.m, size); gn = mpn_gcdext (gp, sp, &sn, up, size, vp, size); if (gn != 1 || gp[0] != 1) return 0; if (sn < 0) mpn_sub (sp, ecc->p.m, size, sp, -sn); else if (sn < size) /* Zero-pad. */ mpn_zero (sp + sn, size - sn); mpn_copyi (rp, sp, size); return 1; }
void fmpz_add(fmpz_t coeffs_out, const fmpz_t in1, const fmpz_t in2) { fmpz_t coeffs1 = in1; fmpz_t coeffs2 = in2; long carry; unsigned long size1 = ABS(coeffs1[0]); unsigned long size2 = ABS(coeffs2[0]); if (size1 < size2) { SWAP_PTRS(coeffs1, coeffs2); size1 = ABS(coeffs1[0]); size2 = ABS(coeffs2[0]); } if (!size1) { if (!size2) coeffs_out[0] = 0L; else { if (coeffs_out != coeffs2) F_mpn_copy(coeffs_out, coeffs2, size2+1); } } else if (!size2) { if (coeffs_out != coeffs1) F_mpn_copy(coeffs_out, coeffs1, size1+1); } else if ((long) (coeffs1[0] ^ coeffs2[0]) >= 0L) { coeffs_out[0] = coeffs1[0]; carry = mpn_add(coeffs_out+1, coeffs1+1, size1, coeffs2+1, size2); if (carry) { coeffs_out[size1+1] = carry; if ((long) coeffs_out[0] < 0L) coeffs_out[0]--; else coeffs_out[0]++; } } else { carry = 0; if (size1 != size2) carry = 1; else carry = mpn_cmp(coeffs1+1, coeffs2+1, size1); if (carry == 0) coeffs_out[0] = 0L; else if (carry > 0) { mpn_sub(coeffs_out+1, coeffs1+1, size1, coeffs2+1, size2); coeffs_out[0] = coeffs1[0]; NORM(coeffs_out); } else { mpn_sub_n(coeffs_out+1, coeffs2+1, coeffs1+1, size1); coeffs_out[0] = -coeffs1[0]; NORM(coeffs_out); } } }
void Inv(modp& ans,const modp& x,const Zp_Data& ZpD) { mp_limb_t g[MAX_MOD_SZ],xx[MAX_MOD_SZ],yy[MAX_MOD_SZ]; mp_size_t sz; mpn_copyi(xx,x.x,ZpD.t); mpn_copyi(yy,ZpD.prA,ZpD.t); mpn_gcdext(g,ans.x,&sz,xx,ZpD.t,yy,ZpD.t); if (sz<0) { mpn_sub(ans.x,ZpD.prA,ZpD.t,ans.x,-sz); sz=-sz; } else { for (int i=sz; i<ZpD.t; i++) { ans.x[i]=0; } } if (ZpD.montgomery) { ZpD.Mont_Mult(ans.x,ans.x,ZpD.R3); } }
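/* Why the final Mont_Mult by R3 is needed (sketch of the algebra behind the
   code above, not part of the original source; it assumes ZpD.R3 holds
   R^3 mod p, with R = B^t the Montgomery radix and Mont_Mult(a,b) returning
   a*b*R^-1 mod p): the stored value is x*R, so mpn_gcdext returns
       y = (x*R)^-1 = x^-1 * R^-1,
   and then
       Mont_Mult(y, R^3) = (x^-1 * R^-1) * R^3 * R^-1 = x^-1 * R,
   which is exactly the inverse in Montgomery form. */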
void _tc4_add(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n) { mp_limb_t cy; mp_size_t s1 = ABS(r1n); mp_size_t s2 = ABS(r2n); if (!s1) { if (!s2) *rn = 0; else { if (rp != r2) MPN_COPY(rp, r2, s2); *rn = r2n; } } else if (!s2) { if (rp != r1) MPN_COPY(rp, r1, s1); *rn = r1n; } else if ((r1n ^ r2n) >= 0) { *rn = r1n; cy = mpn_add(rp, r1, s1, r2, s2); if (cy) { rp[s1] = cy; if ((*rn) < 0) (*rn)--; else (*rn)++; } } else { mp_size_t ct; if (s1 != s2) ct = 1; else MPN_CMP(ct, r1, r2, s1); if (!ct) *rn = 0; else if (ct > 0) { mpn_sub(rp, r1, s1, r2, s2); *rn = s1; MPN_NORMALIZE(rp, (*rn)); if (r1n < 0) *rn = -(*rn); } else { mpn_sub_n(rp, r2, r1, s1); *rn = s1; MPN_NORMALIZE(rp, (*rn)); if (r1n > 0) *rn = -(*rn); } } }
void gcdext_get_t(mp_ptr t, mp_size_t * tn, mp_ptr gp, mp_size_t gn, mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n, mp_ptr s, mp_size_t sn, mp_ptr tp) { mp_size_t ss = ABS(sn); mp_limb_t cy; if (ss >= an) mpn_mul(tp, s, ss, ap, an); else mpn_mul(tp, ap, an, s, ss); (*tn) = ss + an; (*tn) -= (tp[(*tn) - 1] == 0); /* We must have s*ap >= gp and we really want to compute -t */ if (sn > 0) { mpn_sub(tp, tp, *tn, gp, gn); MPN_NORMALIZE(tp, (*tn)); } else { cy = mpn_add(tp, tp, *tn, gp, gn); if (cy) tp[(*tn)++] = cy; } if ((*tn) == 0) { return; } mpn_tdiv_qr(t, tp, 0, tp, (*tn), bp, n); ASSERT_MPN_ZERO_P(tp, n); (*tn) -= (n - 1); (*tn) -= (t[(*tn) - 1] == 0); }
/* Computes |v| = |(g - u a)| / b, where u may be positive or negative, and v is of the opposite sign. a, b are of size n, u and v at most size n, and v must have space for n+1 limbs. */ static mp_size_t compute_v (mp_ptr vp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, mp_srcptr gp, mp_size_t gn, mp_srcptr up, mp_size_t usize, mp_ptr tp) { mp_size_t size; mp_size_t an; mp_size_t bn; mp_size_t vn; ASSERT (n > 0); ASSERT (gn > 0); ASSERT (usize != 0); size = ABS (usize); ASSERT (size <= n); an = n; MPN_NORMALIZE (ap, an); if (an >= size) mpn_mul (tp, ap, an, up, size); else mpn_mul (tp, up, size, ap, an); size += an; size -= tp[size - 1] == 0; ASSERT (gn <= size); if (usize > 0) { /* |v| = -v = (u a - g) / b */ ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn)); MPN_NORMALIZE (tp, size); if (size == 0) return 0; } else { /* usize < 0 */ /* |v| = v = (c - u a) / b = (c + |u| a) / b */ mp_limb_t cy = mpn_add (tp, tp, size, gp, gn); if (cy) tp[size++] = cy; } /* Now divide t / b. There must be no remainder */ bn = n; MPN_NORMALIZE (bp, bn); ASSERT (size >= bn); vn = size + 1 - bn; ASSERT (vn <= n + 1); mpn_divexact (vp, tp, size, bp, bn); vn -= (vp[vn-1] == 0); return vn; }
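/* The identity compute_v() relies on: mpn_gcdext returns g and u with
   g = u*a + v*b for some integer v of the opposite sign, hence
   v = (g - u*a)/b is an exact division.  Hypothetical single-word
   illustration: */
#include <assert.h>

int check_cofactor_identity (void)
{
  long a = 12, b = 42, g = 6, u = 4;   /* 6 == 4*12 - 1*42 */
  long v = (g - u * a) / b;            /* exact: remainder is 0 */
  assert ((g - u * a) % b == 0);
  assert (u * a + v * b == g);         /* v == -1, opposite sign of u */
  return 1;
}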
void mpf_sub (mpf_ptr r, mpf_srcptr u, mpf_srcptr v) { mp_srcptr up, vp; mp_ptr rp, tp; mp_size_t usize, vsize, rsize; mp_size_t prec; mp_exp_t exp; mp_size_t ediff; int negate; TMP_DECL; usize = u->_mp_size; vsize = v->_mp_size; /* Handle special cases that don't work in generic code below. */ if (usize == 0) { mpf_neg (r, v); return; } if (vsize == 0) { if (r != u) mpf_set (r, u); return; } /* If signs of U and V are different, perform addition. */ if ((usize ^ vsize) < 0) { __mpf_struct v_negated; v_negated._mp_size = -vsize; v_negated._mp_exp = v->_mp_exp; v_negated._mp_d = v->_mp_d; mpf_add (r, u, &v_negated); return; } TMP_MARK; /* Signs are now known to be the same. */ negate = usize < 0; /* Make U be the operand with the largest exponent. */ if (u->_mp_exp < v->_mp_exp) { mpf_srcptr t; t = u; u = v; v = t; negate ^= 1; usize = u->_mp_size; vsize = v->_mp_size; } usize = ABS (usize); vsize = ABS (vsize); up = u->_mp_d; vp = v->_mp_d; rp = r->_mp_d; prec = r->_mp_prec + 1; exp = u->_mp_exp; ediff = u->_mp_exp - v->_mp_exp; /* If ediff is 0 or 1, we might have a situation where the operands are extremely close. We need to scan the operands from the most significant end ignore the initial parts that are equal. */ if (ediff <= 1) { if (ediff == 0) { /* Skip leading limbs in U and V that are equal. */ if (up[usize - 1] == vp[vsize - 1]) { /* This loop normally exits immediately. Optimize for that. */ do { usize--; vsize--; exp--; if (usize == 0) { /* u cancels high limbs of v, result is rest of v */ negate ^= 1; cancellation: /* strip high zeros before truncating to prec */ while (vsize != 0 && vp[vsize - 1] == 0) { vsize--; exp--; } if (vsize > prec) { vp += vsize - prec; vsize = prec; } MPN_COPY_INCR (rp, vp, vsize); rsize = vsize; goto done; } if (vsize == 0) { vp = up; vsize = usize; goto cancellation; } } while (up[usize - 1] == vp[vsize - 1]); } if (up[usize - 1] < vp[vsize - 1]) { /* For simplicity, swap U and V. Note that since the loop above wouldn't have exited unless up[usize - 1] and vp[vsize - 1] were non-equal, this if-statement catches all cases where U is smaller than V. */ MPN_SRCPTR_SWAP (up,usize, vp,vsize); negate ^= 1; /* negating ediff not necessary since it is 0. */ } /* Check for x+1 00000000 ... x ffffffff ... */ if (up[usize - 1] != vp[vsize - 1] + 1) goto general_case; usize--; vsize--; exp--; } else /* ediff == 1 */ { /* Check for 1 00000000 ... 0 ffffffff ... 
*/ if (up[usize - 1] != 1 || vp[vsize - 1] != GMP_NUMB_MAX || (usize >= 2 && up[usize - 2] != 0)) goto general_case; usize--; exp--; } /* Skip sequences of 00000000/ffffffff */ while (vsize != 0 && usize != 0 && up[usize - 1] == 0 && vp[vsize - 1] == GMP_NUMB_MAX) { usize--; vsize--; exp--; } if (usize == 0) { while (vsize != 0 && vp[vsize - 1] == GMP_NUMB_MAX) { vsize--; exp--; } } if (usize > prec - 1) { up += usize - (prec - 1); usize = prec - 1; } if (vsize > prec - 1) { vp += vsize - (prec - 1); vsize = prec - 1; } tp = (mp_ptr) TMP_ALLOC (prec * BYTES_PER_MP_LIMB); { mp_limb_t cy_limb; if (vsize == 0) { mp_size_t size, i; size = usize; for (i = 0; i < size; i++) tp[i] = up[i]; tp[size] = 1; rsize = size + 1; exp++; goto normalize; } if (usize == 0) { mp_size_t size, i; size = vsize; for (i = 0; i < size; i++) tp[i] = ~vp[i] & GMP_NUMB_MASK; cy_limb = 1 - mpn_add_1 (tp, tp, vsize, (mp_limb_t) 1); rsize = vsize; if (cy_limb == 0) { tp[rsize] = 1; rsize++; exp++; } goto normalize; } if (usize >= vsize) { /* uuuu */ /* vv */ mp_size_t size; size = usize - vsize; MPN_COPY (tp, up, size); cy_limb = mpn_sub_n (tp + size, up + size, vp, vsize); rsize = usize; } else /* (usize < vsize) */ { /* uuuu */ /* vvvvvvv */ mp_size_t size, i; size = vsize - usize; for (i = 0; i < size; i++) tp[i] = ~vp[i] & GMP_NUMB_MASK; cy_limb = mpn_sub_n (tp + size, up, vp + size, usize); cy_limb+= mpn_sub_1 (tp + size, tp + size, usize, (mp_limb_t) 1); cy_limb-= mpn_add_1 (tp, tp, vsize, (mp_limb_t) 1); rsize = vsize; } if (cy_limb == 0) { tp[rsize] = 1; rsize++; exp++; } goto normalize; } } general_case: /* If U extends beyond PREC, ignore the part that does. */ if (usize > prec) { up += usize - prec; usize = prec; } /* If V extends beyond PREC, ignore the part that does. Note that this may make vsize negative. */ if (vsize + ediff > prec) { vp += vsize + ediff - prec; vsize = prec - ediff; } /* Allocate temp space for the result. Allocate just vsize + ediff later??? */ tp = (mp_ptr) TMP_ALLOC (prec * BYTES_PER_MP_LIMB); if (ediff >= prec) { /* V completely cancelled. */ if (rp != up) MPN_COPY (rp, up, usize); rsize = usize; } else { /* Locate the least significant non-zero limb in (the needed parts of) U and V, to simplify the code below. */ for (;;) { if (vsize == 0) { MPN_COPY (rp, up, usize); rsize = usize; goto done; } if (vp[0] != 0) break; vp++, vsize--; } for (;;) { if (usize == 0) { MPN_COPY (rp, vp, vsize); rsize = vsize; negate ^= 1; goto done; } if (up[0] != 0) break; up++, usize--; } /* uuuu | uuuu | uuuu | uuuu | uuuu */ /* vvvvvvv | vv | vvvvv | v | vv */ if (usize > ediff) { /* U and V partially overlaps. */ if (ediff == 0) { /* Have to compare the leading limbs of u and v to determine whether to compute u - v or v - u. 
*/ if (usize >= vsize) { /* uuuu */ /* vv */ mp_size_t size; size = usize - vsize; MPN_COPY (tp, up, size); mpn_sub_n (tp + size, up + size, vp, vsize); rsize = usize; } else /* (usize < vsize) */ { /* uuuu */ /* vvvvvvv */ mp_size_t size, i; size = vsize - usize; tp[0] = -vp[0] & GMP_NUMB_MASK; for (i = 1; i < size; i++) tp[i] = ~vp[i] & GMP_NUMB_MASK; mpn_sub_n (tp + size, up, vp + size, usize); mpn_sub_1 (tp + size, tp + size, usize, (mp_limb_t) 1); rsize = vsize; } } else { if (vsize + ediff <= usize) { /* uuuu */ /* v */ mp_size_t size; size = usize - ediff - vsize; MPN_COPY (tp, up, size); mpn_sub (tp + size, up + size, usize - size, vp, vsize); rsize = usize; } else { /* uuuu */ /* vvvvv */ mp_size_t size, i; size = vsize + ediff - usize; tp[0] = -vp[0] & GMP_NUMB_MASK; for (i = 1; i < size; i++) tp[i] = ~vp[i] & GMP_NUMB_MASK; mpn_sub (tp + size, up, usize, vp + size, usize - ediff); mpn_sub_1 (tp + size, tp + size, usize, (mp_limb_t) 1); rsize = vsize + ediff; } } } else { /* uuuu */ /* vv */ mp_size_t size, i; size = vsize + ediff - usize; tp[0] = -vp[0] & GMP_NUMB_MASK; for (i = 1; i < vsize; i++) tp[i] = ~vp[i] & GMP_NUMB_MASK; for (i = vsize; i < size; i++) tp[i] = GMP_NUMB_MAX; mpn_sub_1 (tp + size, up, usize, (mp_limb_t) 1); rsize = size + usize; } normalize: /* Full normalize. Optimize later. */ while (rsize != 0 && tp[rsize - 1] == 0) { rsize--; exp--; } MPN_COPY (rp, tp, rsize); } done: r->_mp_size = negate ? -rsize : rsize; if (rsize == 0) exp = 0; r->_mp_exp = exp; TMP_FREE; }
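/* Usage sketch (not from the GMP sources): the ediff <= 1 path above is what
   keeps catastrophic cancellation exact; subtracting two 128-bit values that
   agree in all but the last decimal place still yields the tiny difference. */
#include <gmp.h>

void demo_mpf_cancellation (void)
{
  mpf_t a, b, d;
  mpf_init2 (a, 128); mpf_init2 (b, 128); mpf_init2 (d, 128);
  mpf_set_str (a, "1.00000000000000000000000001", 10);
  mpf_set_str (b, "1.00000000000000000000000000", 10);
  mpf_sub (d, a, b);                    /* ~1e-26, fully significant */
  gmp_printf ("%.5Fe\n", d);
  mpf_clear (a); mpf_clear (b); mpf_clear (d);
}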
/* Check divide and conquer division routine. */ void check_dc_divappr_q (void) { mp_limb_t np[2*MAX_LIMBS]; mp_limb_t np2[2*MAX_LIMBS]; mp_limb_t rp[2*MAX_LIMBS]; mp_limb_t dp[MAX_LIMBS]; mp_limb_t qp[2*MAX_LIMBS]; mp_limb_t dip; mp_size_t nn, rn, dn, qn; gmp_randstate_t rands; int i, j, s; gmp_randinit_default(rands); for (i = 0; i < ITERS; i++) { dn = (random() % (MAX_LIMBS - 5)) + 6; nn = (random() % (MAX_LIMBS - 3)) + dn + 3; mpn_rrandom (np, rands, nn); mpn_rrandom (dp, rands, dn); dp[dn-1] |= GMP_LIMB_HIGHBIT; MPN_COPY(np2, np, nn); mpir_invert_pi1(dip, dp[dn - 1], dp[dn - 2]); qn = nn - dn + 1; qp[qn - 1] = mpn_dc_divappr_q(qp, np, nn, dp, dn, dip); MPN_NORMALIZE(qp, qn); if (qn) { if (qn >= dn) mpn_mul(rp, qp, qn, dp, dn); else mpn_mul(rp, dp, dn, qp, qn); rn = dn + qn; MPN_NORMALIZE(rp, rn); s = (rn < nn) ? -1 : (rn > nn) ? 1 : mpn_cmp(rp, np2, nn); if (s <= 0) { mpn_sub(rp, np2, nn, rp, rn); rn = nn; MPN_NORMALIZE(rp, rn); } else { mpn_sub(rp, rp, rn, np2, nn); MPN_NORMALIZE(rp, rn); } } else { rn = nn; MPN_COPY(rp, np, nn); } s = (rn < dn) ? -1 : (rn > dn) ? 1 : mpn_cmp(rp, dp, dn); if (s >= 0) { printf ("failed:\n"); printf ("nn = %lu, dn = %lu, qn = %lu, rn = %lu\n\n", nn, dn, qn, rn); gmp_printf (" np: %Nx\n\n", np2, nn); gmp_printf (" dp: %Nx\n\n", dp, dn); gmp_printf (" qp: %Nx\n\n", qp, qn); gmp_printf (" rp: %Nx\n\n", rp, rn); abort (); } } gmp_randclear(rands); }
/* Multiplies the least significant p limbs of (a;b) by M^-1. Temporary space needed: 2 * (p + M->n)*/ mp_size_t mpn_hgcd_matrix_adjust (struct hgcd_matrix *M, mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t p, mp_ptr tp) { /* M^-1 (a;b) = (r11, -r01; -r10, r00) (a ; b) = (r11 a - r01 b; - r10 a + r00 b */ mp_ptr t0 = tp; mp_ptr t1 = tp + p + M->n; mp_limb_t ah, bh; mp_limb_t cy; ASSERT (p + M->n < n); /* First compute the two values depending on a, before overwriting a */ if (M->n >= p) { mpn_mul (t0, M->p[1][1], M->n, ap, p); mpn_mul (t1, M->p[1][0], M->n, ap, p); } else { mpn_mul (t0, ap, p, M->p[1][1], M->n); mpn_mul (t1, ap, p, M->p[1][0], M->n); } /* Update a */ MPN_COPY (ap, t0, p); ah = mpn_add (ap + p, ap + p, n - p, t0 + p, M->n); if (M->n >= p) mpn_mul (t0, M->p[0][1], M->n, bp, p); else mpn_mul (t0, bp, p, M->p[0][1], M->n); cy = mpn_sub (ap, ap, n, t0, p + M->n); ASSERT (cy <= ah); ah -= cy; /* Update b */ if (M->n >= p) mpn_mul (t0, M->p[0][0], M->n, bp, p); else mpn_mul (t0, bp, p, M->p[0][0], M->n); MPN_COPY (bp, t0, p); bh = mpn_add (bp + p, bp + p, n - p, t0 + p, M->n); cy = mpn_sub (bp, bp, n, t1, p + M->n); ASSERT (cy <= bh); bh -= cy; if (ah > 0 || bh > 0) { ap[n] = ah; bp[n] = bh; n++; } else { /* The subtraction can reduce the size by at most one limb. */ if (ap[n-1] == 0 && bp[n-1] == 0) n--; } ASSERT (ap[n-1] > 0 || bp[n-1] > 0); return n; }
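/* The formula used above: an hgcd matrix M = (r00, r01; r10, r11) has
   determinant 1, so M^-1 = (r11, -r01; -r10, r00) exactly, with no division.
   Tiny single-word check with made-up values (det = 3*3 - 2*4 = 1): */
#include <assert.h>

int check_hgcd_matrix_inverse (void)
{
  long r00 = 3, r01 = 2, r10 = 4, r11 = 3;
  long a = 55, b = 34;
  long A = r00 * a + r01 * b, B = r10 * a + r11 * b;   /* apply M */
  assert (r11 * A - r01 * B == a);                     /* apply M^-1 */
  assert (-r10 * A + r00 * B == b);
  return 1;
}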
/* Input: A = {ap, n} with most significant bit set. Output: X = B^n + {xp, n} where B = 2^GMP_NUMB_BITS. X is a lower approximation of B^(2n)/A with implicit msb. More precisely, one has: A*X < B^(2n) <= A*(X+1) or X = ceil(B^(2n)/A) - 1. */ void mpn_invert (mp_ptr xp, mp_srcptr ap, mp_size_t n) { if (n == 1) { /* invert_limb returns min(B-1, floor(B^2/ap[0])-B), which is B-1 when ap[0]=B/2, and 1 when ap[0]=B-1. For X=B+xp[0], we have A*X < B^2 <= A*(X+1) where the equality holds only when A=B/2. We thus have A*X < B^2 <= A*(X+1). */ invert_limb (xp[0], ap[0]); } else if (n == 2) { mp_limb_t tp[4], up[2], sp[2], cy; tp[0] = ZERO; invert_limb (xp[1], ap[1]); tp[3] = mpn_mul_1 (tp + 1, ap, 2, xp[1]); cy = mpn_add_n (tp + 2, tp + 2, ap, 2); while (cy) /* Xh is too large */ { xp[1] --; cy -= mpn_sub (tp + 1, tp + 1, 3, ap, 2); } /* tp[3] should be 111...111 */ mpn_com_n (sp, tp + 1, 2); cy = mpn_add_1 (sp, sp, 2, ONE); /* cy should be 0 */ up[1] = mpn_mul_1 (up, sp + 1, 1, xp[1]); cy = mpn_add_1 (up + 1, up + 1, 1, sp[1]); /* cy should be 0 */ xp[0] = up[1]; /* update tp */ cy = mpn_addmul_1 (tp, ap, 2, xp[0]); cy = mpn_add_1 (tp + 2, tp + 2, 2, cy); do { cy = mpn_add (tp, tp, 4, ap, 2); if (cy == ZERO) mpn_add_1 (xp, xp, 2, ONE); } while (cy == ZERO); /* now A*X < B^4 <= A*(X+1) */ } else { mp_size_t l, h; mp_ptr tp, up; mp_limb_t cy, th; int special = 0; TMP_DECL; l = (n - 1) / 2; h = n - l; mpn_invert (xp + l, ap + l, h); TMP_MARK; tp = TMP_ALLOC_LIMBS (n + h); up = TMP_ALLOC_LIMBS (2 * h); if (n <= WRAP_AROUND_BOUND) { mpn_mul (tp, ap, n, xp + l, h); cy = mpn_add_n (tp + h, tp + h, ap, n); } else { mp_size_t m = n + 1; mpir_ui k; int cc; if (m >= FFT_MULMOD_2EXPP1_CUTOFF) m = mpir_fft_adjust_limbs (m); /* we have m >= n + 1 by construction, thus m > h */ ASSERT(m < n + h); cy = mpn_mulmod_Bexpp1_fft (tp, m, ap, n, xp + l, h); /* cy, {tp, m} = A * {xp + l, h} mod (B^m+1) */ cy += mpn_add_n (tp + h, tp + h, ap, m - h); cc = mpn_sub_n (tp, tp, ap + m - h, n + h - m); cc = mpn_sub_1 (tp + n + h - m, tp + n + h - m, 2 * m - n - h, cc); if (cc > cy) /* can only occur if cc=1 and cy=0 */ cy = mpn_add_1 (tp, tp, m, ONE); else cy -= cc; /* cy, {tp, m} = A * Xh */ /* add B^(n+h) + B^(n+h-m) */ MPN_ZERO (tp + m, n + h - m); tp[m] = cy; /* note: since tp[n+h-1] is either 0, or cy<=1 if m=n+h-1, the mpn_incr_u() below cannot produce a carry */ mpn_incr_u (tp + n + h - m, ONE); cy = 1; do /* check if T >= B^(n+h) + 2*B^n */ { mp_size_t i; if (cy == ZERO) break; /* surely T < B^(n+h) */ if (cy == ONE) { for (i = n + h - 1; tp[i] == ZERO && i > n; i--); if (i == n && tp[i] < (mp_limb_t) 2) break; } /* subtract B^m+1 */ cy -= mpn_sub_1 (tp, tp, n + h, ONE); cy -= mpn_sub_1 (tp + m, tp + m, n + h - m, ONE); } while (1); } while (cy) { mpn_sub_1 (xp + l, xp + l, h, ONE); cy -= mpn_sub (tp, tp, n + h, ap, n); } mpn_not (tp, n); th = ~tp[n] + mpn_add_1 (tp, tp, n, ONE); mpn_mul_n (up, tp + l, xp + l, h); cy = mpn_add_n (up + h, up + h, tp + l, h); if (th != ZERO) { cy += ONE + mpn_add_n (up + h, up + h, xp + l, h); } if (up[2*h-l-1] + 4 <= CNST_LIMB(3)) special = 1; MPN_COPY (xp, up + 2 * h - l, l); mpn_add_1 (xp + l, xp + l, h, cy); TMP_FREE; if ((special) && !mpn_is_invert(xp, ap, n)) mpn_add_1 (xp, xp, n, 1); } }
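/* The contract of mpn_invert() above, scaled down to one 16-bit "limb" so it
   fits in plain integers (illustrative, not part of the library): with
   B = 2^16 and A having its top bit set, X = ceil(B^2/A) - 1 satisfies
   A*X < B^2 <= A*(X+1), and X carries an implicit most significant limb. */
#include <assert.h>
#include <stdint.h>

int check_invert_contract (void)
{
  uint64_t A = 0x8001;                   /* top bit of the 16-bit limb set */
  uint64_t B2 = (uint64_t) 1 << 32;      /* B^(2n) with B = 2^16, n = 1 */
  uint64_t X = (B2 - 1) / A;             /* = ceil(B2/A) - 1; A never divides B2 */
  assert (A * X < B2 && B2 <= A * (X + 1));
  assert ((X >> 16) == 1);               /* the implicit high limb */
  return 1;
}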
REGPARM_ATTR (1) static void mpz_aorsmul (mpz_ptr w, mpz_srcptr x, mpz_srcptr y, mp_size_t sub) { mp_size_t xsize, ysize, tsize, wsize, wsize_signed; mp_ptr wp, tp; mp_limb_t c, high; TMP_DECL; /* w unaffected if x==0 or y==0 */ xsize = SIZ(x); ysize = SIZ(y); if (xsize == 0 || ysize == 0) return; /* make x the bigger of the two */ if (ABS(ysize) > ABS(xsize)) { MPZ_SRCPTR_SWAP (x, y); MP_SIZE_T_SWAP (xsize, ysize); } sub ^= ysize; ysize = ABS(ysize); /* use mpn_addmul_1/mpn_submul_1 if possible */ if (ysize == 1) { mpz_aorsmul_1 (w, x, PTR(y)[0], sub); return; } sub ^= xsize; xsize = ABS(xsize); wsize_signed = SIZ(w); sub ^= wsize_signed; wsize = ABS(wsize_signed); tsize = xsize + ysize; wp = MPZ_REALLOC (w, MAX (wsize, tsize) + 1); if (wsize_signed == 0) { /* Nothing to add to, just set w=x*y. No w==x or w==y overlap here, since we know x,y!=0 but w==0. */ high = mpn_mul (wp, PTR(x),xsize, PTR(y),ysize); tsize -= (high == 0); SIZ(w) = (sub >= 0 ? tsize : -tsize); return; } TMP_MARK; tp = TMP_ALLOC_LIMBS (tsize); high = mpn_mul (tp, PTR(x),xsize, PTR(y),ysize); tsize -= (high == 0); ASSERT (tp[tsize-1] != 0); if (sub >= 0) { mp_srcptr up = wp; mp_size_t usize = wsize; if (usize < tsize) { up = tp; usize = tsize; tp = wp; tsize = wsize; wsize = usize; } c = mpn_add (wp, up,usize, tp,tsize); wp[wsize] = c; wsize += (c != 0); } else { mp_srcptr up = wp; mp_size_t usize = wsize; if (mpn_cmp_twosizes_lt (up,usize, tp,tsize)) { up = tp; usize = tsize; tp = wp; tsize = wsize; wsize = usize; wsize_signed = -wsize_signed; } ASSERT_NOCARRY (mpn_sub (wp, up,usize, tp,tsize)); wsize = usize; MPN_NORMALIZE (wp, wsize); } SIZ(w) = (wsize_signed >= 0 ? wsize : -wsize); TMP_FREE; }
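/* mpz_aorsmul() is the common backend of the public mpz_addmul/mpz_submul
   entry points; minimal usage check (illustrative): */
#include <assert.h>
#include <gmp.h>

void demo_aorsmul (void)
{
  mpz_t w, x, y;
  mpz_init_set_ui (w, 100);
  mpz_init_set_si (x, -7);
  mpz_init_set_ui (y, 3);
  mpz_addmul (w, x, y);              /* w = 100 + (-7)*3 = 79 */
  assert (mpz_cmp_ui (w, 79) == 0);
  mpz_submul (w, x, y);              /* w = 79 - (-21) = 100 */
  assert (mpz_cmp_ui (w, 100) == 0);
  mpz_clear (w); mpz_clear (x); mpz_clear (y);
}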
/* Check schoolboy division routine. */ void check_sb_div_q (void) { mp_limb_t np[2*MAX_LIMBS]; mp_limb_t np2[2*MAX_LIMBS]; mp_limb_t rp[2*MAX_LIMBS+1]; mp_limb_t dp[MAX_LIMBS]; mp_limb_t qp[2*MAX_LIMBS]; mp_limb_t dip, cy; mp_size_t nn, rn, dn, qn; gmp_randstate_t rands; int i, j, s; gmp_randinit_default(rands); for (i = 0; i < ITERS; i++) { dn = (random() % (MAX_LIMBS - 2)) + 3; nn = (random() % MAX_LIMBS) + dn; mpn_rrandom (np, rands, nn); mpn_rrandom (dp, rands, dn); dp[dn-1] |= GMP_LIMB_HIGHBIT; MPN_COPY(np2, np, nn); mpir_invert_pi1(dip, dp[dn - 1], dp[dn - 2]); qn = nn - dn + 1; qp[qn - 1] = mpn_sb_div_q(qp, np, nn, dp, dn, dip); MPN_NORMALIZE(qp, qn); if (qn) { if (qn >= dn) mpn_mul(rp, qp, qn, dp, dn); else mpn_mul(rp, dp, dn, qp, qn); rn = dn + qn; MPN_NORMALIZE(rp, rn); if (rn > nn) { printf("failed: q*d has too many limbs\n"); abort(); } if (mpn_cmp(rp, np2, nn) > 0) { printf("failed: remainder negative\n"); abort(); } mpn_sub(rp, np2, nn, rp, rn); rn = nn; MPN_NORMALIZE(rp, rn); } else { rn = nn; MPN_COPY(rp, np, nn); } s = (rn < dn) ? -1 : (rn > dn) ? 1 : mpn_cmp(rp, dp, dn); if (s >= 0) { printf ("failed:\n"); printf ("nn = %lu, dn = %lu, qn = %lu, rn = %lu\n\n", nn, dn, qn, rn); gmp_printf (" np: %Nx\n\n", np2, nn); gmp_printf (" dp: %Nx\n\n", dp, dn); gmp_printf (" qp: %Nx\n\n", qp, qn); gmp_printf (" rp: %Nx\n\n", rp, rn); abort (); } } gmp_randclear(rands); }
void mpf_ui_sub (mpf_ptr r, unsigned long int u, mpf_srcptr v) { mp_srcptr up, vp; mp_ptr rp, tp; mp_size_t usize, vsize, rsize; mp_size_t prec; mp_exp_t uexp; mp_size_t ediff; int negate; mp_limb_t ulimb; TMP_DECL; vsize = v->_mp_size; /* Handle special cases that don't work in generic code below. */ if (u == 0) { mpf_neg (r, v); return; } if (vsize == 0) { mpf_set_ui (r, u); return; } /* If signs of U and V are different, perform addition. */ if (vsize < 0) { __mpf_struct v_negated; v_negated._mp_size = -vsize; v_negated._mp_exp = v->_mp_exp; v_negated._mp_d = v->_mp_d; mpf_add_ui (r, &v_negated, u); return; } TMP_MARK; /* Signs are now known to be the same. */ ulimb = u; /* Make U be the operand with the largest exponent. */ if (1 < v->_mp_exp) { negate = 1; usize = ABS (vsize); vsize = 1; up = v->_mp_d; vp = &ulimb; rp = r->_mp_d; prec = r->_mp_prec + 1; uexp = v->_mp_exp; ediff = uexp - 1; } else { negate = 0; usize = 1; vsize = ABS (vsize); up = &ulimb; vp = v->_mp_d; rp = r->_mp_d; prec = r->_mp_prec + 1; uexp = 1; ediff = 1 - v->_mp_exp; } /* Ignore leading limbs in U and V that are equal. Doing this helps increase the precision of the result. */ if (ediff == 0) { /* This loop normally exits immediately. Optimize for that. */ for (;;) { usize--; vsize--; if (up[usize] != vp[vsize]) break; uexp--; if (usize == 0) goto Lu0; if (vsize == 0) goto Lv0; } usize++; vsize++; /* Note that either operand (but not both operands) might now have leading zero limbs. It matters only that U is unnormalized if vsize is now zero, and vice versa. And it is only in that case that we have to adjust uexp. */ if (vsize == 0) Lv0: while (usize != 0 && up[usize - 1] == 0) usize--, uexp--; if (usize == 0) Lu0: while (vsize != 0 && vp[vsize - 1] == 0) vsize--, uexp--; } /* If U extends beyond PREC, ignore the part that does. */ if (usize > prec) { up += usize - prec; usize = prec; } /* If V extends beyond PREC, ignore the part that does. Note that this may make vsize negative. */ if (vsize + ediff > prec) { vp += vsize + ediff - prec; vsize = prec - ediff; } /* Allocate temp space for the result. Allocate just vsize + ediff later??? */ tp = (mp_ptr) TMP_ALLOC (prec * BYTES_PER_MP_LIMB); if (ediff >= prec) { /* V completely cancelled. */ if (rp != up) MPN_COPY (rp, up, usize); rsize = usize; } else { /* Locate the least significant non-zero limb in (the needed parts of) U and V, to simplify the code below. */ for (;;) { if (vsize == 0) { MPN_COPY (rp, up, usize); rsize = usize; goto done; } if (vp[0] != 0) break; vp++, vsize--; } for (;;) { if (usize == 0) { MPN_COPY (rp, vp, vsize); rsize = vsize; negate ^= 1; goto done; } if (up[0] != 0) break; up++, usize--; } /* uuuu | uuuu | uuuu | uuuu | uuuu */ /* vvvvvvv | vv | vvvvv | v | vv */ if (usize > ediff) { /* U and V partially overlaps. */ if (ediff == 0) { /* Have to compare the leading limbs of u and v to determine whether to compute u - v or v - u. */ if (usize > vsize) { /* uuuu */ /* vv */ int cmp; cmp = mpn_cmp (up + usize - vsize, vp, vsize); if (cmp >= 0) { mp_size_t size; size = usize - vsize; MPN_COPY (tp, up, size); mpn_sub_n (tp + size, up + size, vp, vsize); rsize = usize; } else { /* vv */ /* Swap U and V. 
*/ /* uuuu */ mp_size_t size, i; size = usize - vsize; tp[0] = -up[0] & GMP_NUMB_MASK; for (i = 1; i < size; i++) tp[i] = ~up[i] & GMP_NUMB_MASK; mpn_sub_n (tp + size, vp, up + size, vsize); mpn_sub_1 (tp + size, tp + size, vsize, (mp_limb_t) 1); negate ^= 1; rsize = usize; } } else if (usize < vsize) { /* uuuu */ /* vvvvvvv */ int cmp; cmp = mpn_cmp (up, vp + vsize - usize, usize); if (cmp > 0) { mp_size_t size, i; size = vsize - usize; tp[0] = -vp[0] & GMP_NUMB_MASK; for (i = 1; i < size; i++) tp[i] = ~vp[i] & GMP_NUMB_MASK; mpn_sub_n (tp + size, up, vp + size, usize); mpn_sub_1 (tp + size, tp + size, usize, (mp_limb_t) 1); rsize = vsize; } else { /* vvvvvvv */ /* Swap U and V. */ /* uuuu */ /* This is the only place we can get 0.0. */ mp_size_t size; size = vsize - usize; MPN_COPY (tp, vp, size); mpn_sub_n (tp + size, vp + size, up, usize); negate ^= 1; rsize = vsize; } } else { /* uuuu */ /* vvvv */ int cmp; cmp = mpn_cmp (up, vp + vsize - usize, usize); if (cmp > 0) { mpn_sub_n (tp, up, vp, usize); rsize = usize; } else { mpn_sub_n (tp, vp, up, usize); negate ^= 1; rsize = usize; /* can give zero */ } } } else { if (vsize + ediff <= usize) { /* uuuu */ /* v */ mp_size_t size; size = usize - ediff - vsize; MPN_COPY (tp, up, size); mpn_sub (tp + size, up + size, usize - size, vp, vsize); rsize = usize; } else { /* uuuu */ /* vvvvv */ mp_size_t size, i; size = vsize + ediff - usize; tp[0] = -vp[0] & GMP_NUMB_MASK; for (i = 1; i < size; i++) tp[i] = ~vp[i] & GMP_NUMB_MASK; mpn_sub (tp + size, up, usize, vp + size, usize - ediff); mpn_sub_1 (tp + size, tp + size, usize, (mp_limb_t) 1); rsize = vsize + ediff; } } } else { /* uuuu */ /* vv */ mp_size_t size, i; size = vsize + ediff - usize; tp[0] = -vp[0] & GMP_NUMB_MASK; for (i = 1; i < vsize; i++) tp[i] = ~vp[i] & GMP_NUMB_MASK; for (i = vsize; i < size; i++) tp[i] = GMP_NUMB_MAX; mpn_sub_1 (tp + size, up, usize, (mp_limb_t) 1); rsize = size + usize; } /* Full normalize. Optimize later. */ while (rsize != 0 && tp[rsize - 1] == 0) { rsize--; uexp--; } MPN_COPY (rp, tp, rsize); } done: r->_mp_size = negate ? -rsize : rsize; r->_mp_exp = uexp; TMP_FREE; }
/* Temporary storage: Needs n limbs for the quotient, at qp. tp must point to an area large enough for the resulting cofactor, plus one limb extra. All in all, 2N + 1 if N is a bound for both inputs and outputs. */ mp_size_t mpn_gcdext_subdiv_step (mp_ptr gp, mp_size_t *gn, mp_ptr up, mp_size_t *usizep, mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr u0, mp_ptr u1, mp_size_t *unp, mp_ptr qp, mp_ptr tp) { mp_size_t an, bn, un; mp_size_t qn; mp_size_t u0n; int swapped; an = bn = n; ASSERT (an > 0); ASSERT (ap[an-1] > 0 || bp[an-1] > 0); MPN_NORMALIZE (ap, an); MPN_NORMALIZE (bp, bn); un = *unp; swapped = 0; if (UNLIKELY (an == 0)) { return_b: MPN_COPY (gp, bp, bn); *gn = bn; MPN_NORMALIZE (u0, un); MPN_COPY (up, u0, un); *usizep = swapped ? un : -un; return 0; } else if (UNLIKELY (bn == 0)) { MPN_COPY (gp, ap, an); *gn = an; MPN_NORMALIZE (u1, un); MPN_COPY (up, u1, un); *usizep = swapped ? -un : un; return 0; } /* Arrange so that a > b, subtract an -= bn, and maintain normalization. */ if (an < bn) { MPN_PTR_SWAP (ap, an, bp, bn); MP_PTR_SWAP (u0, u1); swapped ^= 1; } else if (an == bn) { int c; MPN_CMP (c, ap, bp, an); if (UNLIKELY (c == 0)) { MPN_COPY (gp, ap, an); *gn = an; /* Must return the smallest cofactor, +u1 or -u0 */ MPN_CMP (c, u0, u1, un); ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1)); if (c < 0) { MPN_NORMALIZE (u0, un); MPN_COPY (up, u0, un); swapped ^= 1; } else { MPN_NORMALIZE_NOT_ZERO (u1, un); MPN_COPY (up, u1, un); } *usizep = swapped ? -un : un; return 0; } else if (c < 0) { MP_PTR_SWAP (ap, bp); MP_PTR_SWAP (u0, u1); swapped ^= 1; } } /* Reduce a -= b, u1 += u0 */ ASSERT_NOCARRY (mpn_sub (ap, ap, an, bp, bn)); MPN_NORMALIZE (ap, an); ASSERT (an > 0); u1[un] = mpn_add_n (u1, u1, u0, un); un += (u1[un] > 0); /* Arrange so that a > b, and divide a = q b + r */ if (an < bn) { MPN_PTR_SWAP (ap, an, bp, bn); MP_PTR_SWAP (u0, u1); swapped ^= 1; } else if (an == bn) { int c; MPN_CMP (c, ap, bp, an); if (UNLIKELY (c == 0)) goto return_b; else if (c < 0) { MP_PTR_SWAP (ap, bp); MP_PTR_SWAP (u0, u1); swapped ^= 1; } } /* Reduce a -= q b, u1 += q u0 */ qn = an - bn + 1; mpn_tdiv_qr (qp, ap, 0, ap, an, bp, bn); if (mpn_zero_p (ap, bn)) goto return_b; n = bn; /* Update u1 += q u0 */ u0n = un; MPN_NORMALIZE (u0, u0n); if (u0n > 0) { qn -= (qp[qn - 1] == 0); if (qn > u0n) mpn_mul (tp, qp, qn, u0, u0n); else mpn_mul (tp, u0, u0n, qp, qn); if (qn + u0n > un) { mp_size_t u1n = un; un = qn + u0n; un -= (tp[un-1] == 0); u1[un] = mpn_add (u1, tp, un, u1, u1n); } else { u1[un] = mpn_add (u1, u1, un, tp, qn + u0n); } un += (u1[un] > 0); } *unp = un; return n; }
void mpn_toom22_mul (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr scratch) { mp_size_t n, s, t; int vm1_neg; mp_limb_t cy, cy2; mp_ptr asm1; mp_ptr bsm1; #define a0 ap #define a1 (ap + n) #define b0 bp #define b1 (bp + n) s = an >> 1; n = an - s; t = bn - n; ASSERT (an >= bn); ASSERT (0 < s && s <= n); ASSERT (0 < t && t <= s); asm1 = pp; bsm1 = pp + n; vm1_neg = 0; /* Compute asm1. */ if (s == n) { if (mpn_cmp (a0, a1, n) < 0) { mpn_sub_n (asm1, a1, a0, n); vm1_neg = 1; } else { mpn_sub_n (asm1, a0, a1, n); } } else { if (mpn_zero_p (a0 + s, n - s) && mpn_cmp (a0, a1, s) < 0) { mpn_sub_n (asm1, a1, a0, s); MPN_ZERO (asm1 + s, n - s); vm1_neg = 1; } else { mpn_sub (asm1, a0, n, a1, s); } } /* Compute bsm1. */ if (t == n) { if (mpn_cmp (b0, b1, n) < 0) { mpn_sub_n (bsm1, b1, b0, n); vm1_neg ^= 1; } else { mpn_sub_n (bsm1, b0, b1, n); } } else { if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0) { mpn_sub_n (bsm1, b1, b0, t); MPN_ZERO (bsm1 + t, n - t); vm1_neg ^= 1; } else { mpn_sub (bsm1, b0, n, b1, t); } } #define v0 pp /* 2n */ #define vinf (pp + 2 * n) /* s+t */ #define vm1 scratch /* 2n */ #define scratch_out scratch + 2 * n /* vm1, 2n limbs */ TOOM22_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out); if (s > t) TOOM22_MUL_REC (vinf, a1, s, b1, t, scratch_out); else TOOM22_MUL_N_REC (vinf, a1, b1, s, scratch_out); /* v0, 2n limbs */ TOOM22_MUL_N_REC (v0, ap, bp, n, scratch_out); /* H(v0) + L(vinf) */ cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n); /* L(v0) + H(v0) */ cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n); /* L(vinf) + H(vinf) */ cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + t - n); if (vm1_neg) cy += mpn_add_n (pp + n, pp + n, vm1, 2 * n); else cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n); ASSERT (cy + 1 <= 3); ASSERT (cy2 <= 2); mpn_incr_u (pp + 2 * n, cy2); if (LIKELY (cy <= 2)) mpn_incr_u (pp + 3 * n, cy); else mpn_decr_u (pp + 3 * n, 1); }
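/* The evaluation scheme above is Toom-2 (Karatsuba): with x = a1*W + a0 and
   y = b1*W + b0, using vm1 = (a0 - a1)*(b0 - b1),
       x*y = vinf*W^2 + (v0 + vinf - vm1)*W + v0 .
   The mpn code stores |a0 - a1| and tracks the sign in vm1_neg, since limbs
   are unsigned.  Signed single-word illustration: */
#include <assert.h>
#include <stdint.h>

int check_toom2_identity (void)
{
  const int64_t W = 1000;                 /* stand-in for B^n */
  int64_t a0 = 123, a1 = 456, b0 = 789, b1 = 321;
  int64_t v0 = a0 * b0, vinf = a1 * b1;
  int64_t vm1 = (a0 - a1) * (b0 - b1);    /* may be negative */
  int64_t x = a1 * W + a0, y = b1 * W + b0;
  assert (x * y == vinf * W * W + (v0 + vinf - vm1) * W + v0);
  return 1;
}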
/* Check divide and conquer division routine. */ void check_dc_div_qr_n (void) { mp_limb_t np[2*MAX_LIMBS]; mp_limb_t np2[2*MAX_LIMBS]; mp_limb_t rp[2*MAX_LIMBS+1]; mp_limb_t dp[MAX_LIMBS]; mp_limb_t qp[2*MAX_LIMBS]; mp_limb_t tp[DC_DIVAPPR_Q_N_ITCH(MAX_LIMBS)]; mp_limb_t dip, cy; mp_size_t rn, dn, qn; gmp_randstate_t rands; int i, j, s; gmp_randinit_default(rands); for (i = 0; i < ITERS; i++) { dn = (random() % (MAX_LIMBS - 5)) + 6; mpn_rrandom (np, rands, 2*dn); mpn_rrandom (dp, rands, dn); dp[dn-1] |= GMP_LIMB_HIGHBIT; MPN_COPY(np2, np, 2*dn); invert_1(dip, dp[dn - 1], dp[dn - 2]); qn = dn + 1; qp[qn - 1] = mpn_dc_div_qr_n(qp, np, dp, dn, dip, tp); MPN_NORMALIZE(qp, qn); if (qn) { if (qn >= dn) mpn_mul(rp, qp, qn, dp, dn); else mpn_mul(rp, dp, dn, qp, qn); rn = dn + qn; MPN_NORMALIZE(rp, rn); if (rn > 2*dn) { printf("failed: q*d has too many limbs\n"); abort(); } if (mpn_cmp(rp, np2, 2*dn) > 0) { printf("failed: remainder negative\n"); abort(); } mpn_sub(rp, np2, 2*dn, rp, rn); rn = 2*dn; MPN_NORMALIZE(rp, rn); } else { rn = 2*dn; MPN_COPY(rp, np, 2*dn); } s = (rn < dn) ? -1 : (rn > dn) ? 1 : mpn_cmp(rp, dp, dn); if (s >= 0) { printf ("failed:\n"); printf ("dn = %lu, qn = %lu, rn = %lu\n\n", dn, qn, rn); gmp_printf (" np: %Nx\n\n", np2, 2*dn); gmp_printf (" dp: %Nx\n\n", dp, dn); gmp_printf (" qp: %Nx\n\n", qp, qn); gmp_printf (" rp: %Nx\n\n", rp, rn); abort (); } if (mpn_cmp(rp, np, rn) != 0) { printf("failed: remainder does not match\n"); gmp_printf (" np: %Nx\n\n", np2, 2*dn); gmp_printf (" dp: %Nx\n\n", dp, dn); gmp_printf (" qp: %Nx\n\n", qp, qn); gmp_printf (" rp: %Nx\n\n", rp, rn); gmp_printf (" rp2: %Nx\n\n", np, rn); abort (); } } gmp_randclear(rands); }
void mpn_dcpi1_bdiv_q (mp_ptr qp, mp_ptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_limb_t dinv) { mp_size_t qn; mp_limb_t cy; mp_ptr tp; TMP_DECL; TMP_MARK; ASSERT (dn >= 2); ASSERT (nn - dn >= 0); ASSERT (dp[0] & 1); tp = TMP_SALLOC_LIMBS (dn); qn = nn; if (qn > dn) { /* Reduce qn mod dn in a super-efficient manner. */ do qn -= dn; while (qn > dn); /* Perform the typically smaller block first. */ if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD)) cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv); else cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, qn, dinv, tp); if (qn != dn) { if (qn > dn - qn) mpn_mul (tp, qp, qn, dp + qn, dn - qn); else mpn_mul (tp, dp + qn, dn - qn, qp, qn); mpn_incr_u (tp + qn, cy); mpn_sub (np + qn, np + qn, nn - qn, tp, dn); cy = 0; } np += qn; qp += qn; qn = nn - qn; while (qn > dn) { mpn_sub_1 (np + dn, np + dn, qn - dn, cy); cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, dn, dinv, tp); qp += dn; np += dn; qn -= dn; } mpn_dcpi1_bdiv_q_n (qp, np, dp, dn, dinv, tp); } else { if (BELOW_THRESHOLD (qn, DC_BDIV_Q_THRESHOLD)) mpn_sbpi1_bdiv_q (qp, np, qn, dp, qn, dinv); else mpn_dcpi1_bdiv_q_n (qp, np, dp, qn, dinv, tp); } TMP_FREE; }
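/* "bdiv" is Hensel (right-to-left) division: for odd d, the quotient is
   n * d^-1 mod B^k, computed low limbs first; dinv above is d^-1 mod B.
   Single-word sketch of both the Newton inverse and the exact quotient
   (illustrative, not from the library): */
#include <assert.h>
#include <stdint.h>

int check_hensel_division (void)
{
  uint64_t d = 0x123456789abcdef1ULL;     /* divisor must be odd */
  uint64_t dinv = d;                      /* d*d == 1 mod 8 seeds 3 bits */
  for (int i = 0; i < 5; i++)
    dinv *= 2 - d * dinv;                 /* Newton: correct bits double */
  assert (d * dinv == 1);                 /* inverse mod 2^64 */
  uint64_t q = 0xfeedfacecafef00dULL;
  uint64_t n = q * d;                     /* an exact multiple of d */
  assert (n * dinv == q);                 /* Hensel quotient, no remainder */
  return 1;
}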
/* Computes |v| = |(g - u a)| / b, where u may be positive or negative, and v is of the opposite sign. a, b are of size n, u and v at most size n, and v must have space for n+1 limbs. */ static mp_size_t compute_v (mp_ptr vp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, mp_srcptr gp, mp_size_t gn, mp_srcptr up, mp_size_t usize, mp_ptr tp) { mp_size_t size; mp_size_t an; mp_size_t bn; mp_size_t vn; ASSERT (n > 0); ASSERT (gn > 0); ASSERT (usize != 0); size = ABS (usize); ASSERT (size <= n); an = n; MPN_NORMALIZE (ap, an); if (an >= size) mpn_mul (tp, ap, an, up, size); else mpn_mul (tp, up, size, ap, an); size += an; size -= tp[size - 1] == 0; ASSERT (gn <= size); if (usize > 0) { /* |v| = -v = (u a - g) / b */ ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn)); MPN_NORMALIZE (tp, size); if (size == 0) return 0; } else { /* usize < 0 */ /* |v| = v = (c - u a) / b = (c + |u| a) / b */ mp_limb_t cy = mpn_add (tp, tp, size, gp, gn); if (cy) tp[size++] = cy; } /* Now divide t / b. There must be no remainder */ bn = n; MPN_NORMALIZE (bp, bn); ASSERT (size >= bn); vn = size + 1 - bn; ASSERT (vn <= n + 1); /* FIXME: Use divexact. Or do the entire calculation mod 2^{n * GMP_NUMB_BITS}. */ mpn_tdiv_qr (vp, tp, 0, tp, size, bp, bn); vn -= (vp[vn-1] == 0); /* Remainder must be zero */ #if WANT_ASSERT { mp_size_t i; for (i = 0; i < bn; i++) { ASSERT (tp[i] == 0); } } #endif return vn; }
void mpn_toom2_sqr (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_ptr scratch) { mp_size_t n, s; mp_limb_t cy, cy2; mp_ptr asm1; #define a0 ap #define a1 (ap + n) s = an >> 1; n = an - s; ASSERT (0 < s && s <= n); asm1 = pp; /* Compute asm1. */ if (s == n) { if (mpn_cmp (a0, a1, n) < 0) { mpn_sub_n (asm1, a1, a0, n); } else { mpn_sub_n (asm1, a0, a1, n); } } else { if (mpn_zero_p (a0 + s, n - s) && mpn_cmp (a0, a1, s) < 0) { mpn_sub_n (asm1, a1, a0, s); MPN_ZERO (asm1 + s, n - s); } else { mpn_sub (asm1, a0, n, a1, s); } } #define v0 pp /* 2n */ #define vinf (pp + 2 * n) /* s+s */ #define vm1 scratch /* 2n */ #define scratch_out scratch + 2 * n /* vm1, 2n limbs */ TOOM2_SQR_REC (vm1, asm1, n, scratch_out); /* vinf, s+s limbs */ TOOM2_SQR_REC (vinf, a1, s, scratch_out); /* v0, 2n limbs */ TOOM2_SQR_REC (v0, ap, n, scratch_out); /* H(v0) + L(vinf) */ cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n); /* L(v0) + H(v0) */ cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n); /* L(vinf) + H(vinf) */ cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + s - n); cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n); ASSERT (cy + 1 <= 3); ASSERT (cy2 <= 2); mpn_incr_u (pp + 2 * n, cy2); if (LIKELY (cy <= 2)) mpn_incr_u (pp + 3 * n, cy); else mpn_decr_u (pp + 3 * n, 1); }
void mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags, mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5, mp_size_t w6n, mp_ptr tp) { mp_size_t m; mp_limb_t cy; m = 2*n + 1; #define w0 rp #define w2 (rp + 2*n) #define w6 (rp + 6*n) ASSERT (w6n > 0); ASSERT (w6n <= 2*n); /* Using formulas similar to Marco Bodrato's W5 = W5 + W4 W1 =(W4 - W1)/2 W4 = W4 - W0 W4 =(W4 - W1)/4 - W6*16 W3 =(W2 - W3)/2 W2 = W2 - W3 W5 = W5 - W2*65 May be negative. W2 = W2 - W6 - W0 W5 =(W5 + W2*45)/2 Now >= 0 again. W4 =(W4 - W2)/3 W2 = W2 - W4 W1 = W5 - W1 May be negative. W5 =(W5 - W3*8)/9 W3 = W3 - W5 W1 =(W1/15 + W5)/2 Now >= 0 again. W5 = W5 - W1 where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1), W4 = f(2), W5 = f(1/2), W6 = f(oo), Note that most intermediate results are positive; the ones that may be negative are represented in two's complement. We must never shift right a value that may be negative, since that would invalidate the sign bit. On the other hand, divexact by odd numbers work fine with two's complement. */ mpn_add_n (w5, w5, w4, m); if (flags & toom7_w1_neg) { #ifdef HAVE_NATIVE_mpn_rsh1add_n mpn_rsh1add_n (w1, w1, w4, m); #else mpn_add_n (w1, w1, w4, m); ASSERT (!(w1[0] & 1)); mpn_rshift (w1, w1, m, 1); #endif } else { #ifdef HAVE_NATIVE_mpn_rsh1sub_n mpn_rsh1sub_n (w1, w4, w1, m); #else mpn_sub_n (w1, w4, w1, m); ASSERT (!(w1[0] & 1)); mpn_rshift (w1, w1, m, 1); #endif } mpn_sub (w4, w4, m, w0, 2*n); mpn_sub_n (w4, w4, w1, m); ASSERT (!(w4[0] & 3)); mpn_rshift (w4, w4, m, 2); /* w4>=0 */ tp[w6n] = mpn_lshift (tp, w6, w6n, 4); mpn_sub (w4, w4, m, tp, w6n+1); if (flags & toom7_w3_neg) { #ifdef HAVE_NATIVE_mpn_rsh1add_n mpn_rsh1add_n (w3, w3, w2, m); #else mpn_add_n (w3, w3, w2, m); ASSERT (!(w3[0] & 1)); mpn_rshift (w3, w3, m, 1); #endif } else { #ifdef HAVE_NATIVE_mpn_rsh1sub_n mpn_rsh1sub_n (w3, w2, w3, m); #else mpn_sub_n (w3, w2, w3, m); ASSERT (!(w3[0] & 1)); mpn_rshift (w3, w3, m, 1); #endif } mpn_sub_n (w2, w2, w3, m); mpn_submul_1 (w5, w2, m, 65); mpn_sub (w2, w2, m, w6, w6n); mpn_sub (w2, w2, m, w0, 2*n); mpn_addmul_1 (w5, w2, m, 45); ASSERT (!(w5[0] & 1)); mpn_rshift (w5, w5, m, 1); mpn_sub_n (w4, w4, w2, m); mpn_divexact_by3 (w4, w4, m); mpn_sub_n (w2, w2, w4, m); mpn_sub_n (w1, w5, w1, m); mpn_lshift (tp, w3, m, 3); mpn_sub_n (w5, w5, tp, m); mpn_divexact_by9 (w5, w5, m); mpn_sub_n (w3, w3, w5, m); mpn_divexact_by15 (w1, w1, m); mpn_add_n (w1, w1, w5, m); ASSERT (!(w1[0] & 1)); mpn_rshift (w1, w1, m, 1); /* w1>=0 now */ mpn_sub_n (w5, w5, w1, m); /* These bounds are valid for the 4x4 polynomial product of toom44, * and they are conservative for toom53 and toom62. */ ASSERT (w1[2*n] < 2); ASSERT (w2[2*n] < 3); ASSERT (w3[2*n] < 4); ASSERT (w4[2*n] < 3); ASSERT (w5[2*n] < 2); /* Addition chain. Note carries and the 2n'th limbs that need to be * added in. * * Special care is needed for w2[2n] and the corresponding carry, * since the "simple" way of adding it all together would overwrite * the limb at wp[2*n] and rp[4*n] (same location) with the sum of * the high half of w3 and the low half of w4. 
* * 7 6 5 4 3 2 1 0 * | | | | | | | | | * ||w3 (2n+1)| * ||w4 (2n+1)| * ||w5 (2n+1)| ||w1 (2n+1)| * + | w6 (w6n)| ||w2 (2n+1)| w0 (2n) | (share storage with r) * ----------------------------------------------- * r | | | | | | | | | * c7 c6 c5 c4 c3 Carries to propagate */ cy = mpn_add_n (rp + n, rp + n, w1, m); MPN_INCR_U (w2 + n + 1, n , cy); cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n); MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy); cy = mpn_add_n (rp + 4*n, w3 + n, w4, n); MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy); cy = mpn_add_n (rp + 5*n, w4 + n, w5, n); MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy); if (w6n > n + 1) ASSERT_NOCARRY (mpn_add (rp + 6*n, rp + 6*n, w6n, w5 + n, n + 1)); else { ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n)); #if WANT_ASSERT { mp_size_t i; for (i = w6n; i <= n; i++) ASSERT (w5[n + i] == 0); } #endif } }
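/* Numeric shadow of the interpolation sequence above on signed 64-bit words
   (illustrative, not from the library): recover the coefficients of a known
   degree-6 polynomial from its values at 0, -2, 1, -1, 2, 1/2 (scaled by
   2^6) and infinity.  All divisions are exact, as the comments claim. */
#include <assert.h>
#include <stdint.h>

static int64_t eval7 (const int64_t c[7], int64_t x)
{
  int64_t r = 0;
  for (int i = 6; i >= 0; i--)
    r = r * x + c[i];                  /* Horner */
  return r;
}

int check_toom7_interpolation (void)
{
  const int64_t c[7] = { 3, 1, 4, 1, 5, 9, 2 };
  int64_t W0 = eval7 (c, 0), W1 = eval7 (c, -2), W2 = eval7 (c, 1);
  int64_t W3 = eval7 (c, -1), W4 = eval7 (c, 2), W6 = c[6];
  int64_t W5 = 0;
  for (int i = 0; i < 7; i++)
    W5 += c[i] << (6 - i);             /* 2^6 * f(1/2) */

  W5 += W4;
  W1 = (W4 - W1) / 2;
  W4 -= W0;
  W4 = (W4 - W1) / 4 - 16 * W6;
  W3 = (W2 - W3) / 2;
  W2 -= W3;
  W5 -= 65 * W2;
  W2 -= W6 + W0;
  W5 = (W5 + 45 * W2) / 2;
  W4 = (W4 - W2) / 3;
  W2 -= W4;
  W1 = W5 - W1;
  W5 = (W5 - 8 * W3) / 9;
  W3 -= W5;
  W1 = (W1 / 15 + W5) / 2;
  W5 -= W1;

  assert (W0 == c[0] && W1 == c[1] && W2 == c[2]);
  assert (W3 == c[3] && W4 == c[4] && W5 == c[5] && W6 == c[6]);
  return 1;
}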
void compute_A(QS_t * qs_inf, poly_t * poly_inf) { unsigned long min = poly_inf->min; unsigned long span = poly_inf->span; unsigned long s = poly_inf->s; unsigned long * A_ind = poly_inf->A_ind; unsigned long * A = poly_inf->A; unsigned long * target_A = poly_inf->target_A; unsigned long * current_A = (unsigned long *) flint_stack_alloc(qs_inf->prec+1); unsigned long * diff = (unsigned long *) flint_stack_alloc(qs_inf->prec+1); unsigned long * best_diff = (unsigned long *) flint_stack_alloc(qs_inf->prec+1); prime_t * factor_base = qs_inf->factor_base; unsigned long factor, p; unsigned long best1, best2, best3; unsigned long odds = s - 3; mp_limb_t msl; int taken; long i, j, k; A[0] = 1; A[1] = 1; for (i = 0; i < odds; i++) // Randomly choose the first s-3 prime factors of A with odd indices { do { taken = 0; A_ind[i] = ((z_randint(span) + min) | 1); if (A_ind[i] == min + span) A_ind[i] -= 2; for (j = 0; j < i; j++) { if (A_ind[i] == A_ind[j]) taken = 1; } } while (taken); msl = mpn_mul_1(A+1, A+1, A[0], factor_base[A_ind[i]].p); if (msl) // Compute the product of these s-3 primes { A[A[0]+1] = msl; A[0]++; } } for (k = 0; k < 30; k++) // Now try 8 different sets of even index primes as the remaining factors { F_mpn_copy(current_A, A, A[0] + 1); for (i = 0; i < 3; i++) // Randomly choose the last 3 prime factors of A with even indices { do { taken = 0; A_ind[s-3+i] = ((z_randint(span) + min) & -2L); if (A_ind[s-3+i] < min) A_ind[s-3+i] += 2; for (j = 0; j < i; j++) { if (A_ind[s-3+i] == A_ind[s-3+j]) taken = 1; } } while (taken); msl = mpn_mul_1(current_A+1, current_A+1, current_A[0], factor_base[A_ind[s-3+i]].p); if (msl) // Compute the product of these s-3 primes and the odd indexed primes { current_A[current_A[0]+1] = msl; current_A[0]++; } } if (k == 0) // Just store the first difference as the best one { if (target_A[0] >= current_A[0]) // Compute the difference with the target A { msl = mpn_sub(best_diff+1, target_A+1, target_A[0], current_A+1, current_A[0]); best_diff[0] = target_A[0]; } else { msl = mpn_sub(best_diff+1, current_A+1, current_A[0], target_A+1, target_A[0]); best_diff[0] = current_A[0]; } if (msl) F_mpn_negate(best_diff+1, best_diff+1, best_diff[0]); while ((!best_diff[best_diff[0]]) && (best_diff[0])) best_diff[0]--; // Normalise best_diff best1 = A_ind[s-3]; best2 = A_ind[s-2]; best3 = A_ind[s-1]; continue; } if (target_A[0] >= current_A[0]) // Compute the difference with the target A { msl = mpn_sub(diff+1, target_A+1, target_A[0], current_A+1, current_A[0]); diff[0] = target_A[0]; } else { msl = mpn_sub(diff+1, current_A+1, current_A[0], target_A+1, target_A[0]); diff[0] = current_A[0]; } if (msl) F_mpn_negate(diff+1, diff+1, diff[0]); while ((!diff[diff[0]]) && (diff[0])) diff[0]--; // Normalise diff if ((diff[0] < best_diff[0]) || ((diff[0] == best_diff[0]) && (mpn_cmp(diff+1, best_diff+1, diff[0]) < 0))) // The new diff is better { F_mpn_copy(best_diff, diff, diff[0]+1); best1 = A_ind[s-3]; best2 = A_ind[s-2]; best3 = A_ind[s-1]; } } A_ind[s-3] = best1; // Multiply A by the product of these 3 primes and store their indices A_ind[s-2] = best2; A_ind[s-1] = best3; for (i = 0; i < 3; i++) { msl = mpn_mul_1(A+1, A+1, A[0], factor_base[A_ind[s+i-3]].p); if (msl) { A[A[0]+1] = msl; A[0]++; } } #if POLY_A mpz_t A_disp, targ_A; mpz_init(A_disp); mpz_init(targ_A); fmpz_to_mpz(A_disp, A); fmpz_to_mpz(targ_A, target_A); gmp_printf("A = %Zd, target A = %Zd\n", A_disp, targ_A); mpz_clear(A_disp); mpz_clear(targ_A); #endif /*for (i = 0; i < s; i++) { p = 
factor_base[A_ind[i]].p; poly_inf->inv_p2[i] = z_precompute_inverse(p*p); } */ fmpz_to_mpz(poly_inf->A_mpz, A); flint_stack_release(); // release current_A flint_stack_release(); // release diff flint_stack_release(); // release best_diff }
void mpn_toom22_mul (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr scratch) { mp_size_t n, s, t; int vm1_neg; mp_limb_t cy, cy2; mp_ptr asm1; mp_ptr bsm1; #define a0 ap #define a1 (ap + n) #define b0 bp #define b1 (bp + n) s = an >> 1; n = an - s; t = bn - n; ASSERT (an >= bn); ASSERT (0 < s && s <= n && s >= n - 1); ASSERT (0 < t && t <= s); asm1 = pp; bsm1 = pp + n; vm1_neg = 0; /* Compute asm1. */ if (s == n) { if (mpn_cmp (a0, a1, n) < 0) { mpn_sub_n (asm1, a1, a0, n); vm1_neg = 1; } else { mpn_sub_n (asm1, a0, a1, n); } } else /* n - s == 1 */ { if (a0[s] == 0 && mpn_cmp (a0, a1, s) < 0) { mpn_sub_n (asm1, a1, a0, s); asm1[s] = 0; vm1_neg = 1; } else { asm1[s] = a0[s] - mpn_sub_n (asm1, a0, a1, s); } } /* Compute bsm1. */ if (t == n) { if (mpn_cmp (b0, b1, n) < 0) { mpn_sub_n (bsm1, b1, b0, n); vm1_neg ^= 1; } else { mpn_sub_n (bsm1, b0, b1, n); } } else { if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0) { mpn_sub_n (bsm1, b1, b0, t); MPN_ZERO (bsm1 + t, n - t); vm1_neg ^= 1; } else { mpn_sub (bsm1, b0, n, b1, t); } } #define v0 pp /* 2n */ #define vinf (pp + 2 * n) /* s+t */ #define vm1 scratch /* 2n */ #define scratch_out scratch + 2 * n /* vm1, 2n limbs */ TOOM22_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out); if (s > t) TOOM22_MUL_REC (vinf, a1, s, b1, t, scratch_out); else TOOM22_MUL_N_REC (vinf, a1, b1, s, scratch_out); /* v0, 2n limbs */ TOOM22_MUL_N_REC (v0, ap, bp, n, scratch_out); /* H(v0) + L(vinf) */ cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n); /* L(v0) + H(v0) */ cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n); /* L(vinf) + H(vinf) */ cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + t - n); if (vm1_neg) cy += mpn_add_n (pp + n, pp + n, vm1, 2 * n); else cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n); ASSERT (cy + 1 <= 3); ASSERT (cy2 <= 2); MPN_INCR_U (pp + 2 * n, s + t, cy2); if (LIKELY (cy <= 2)) /* if s+t==n, cy is zero, but we should not access pp[3*n] at all. */ MPN_INCR_U (pp + 3 * n, s + t - n, cy); else MPN_DECR_U (pp + 3 * n, s + t - n, 1); }
/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1) * * The result is expected to be ZERO if and only if one of the operand * already is. Otherwise the class [0] Mod(B^rn-1) is represented by * B^rn-1. This should not be a problem if mulmod_bnm1 is used to * combine results and obtain a natural number when one knows in * advance that the final value is less than (B^rn-1). * Moreover it should not be a problem if mulmod_bnm1 is used to * compute the full product with an+bn <= rn, because this condition * implies (B^an-1)(B^bn-1) < (B^rn-1) . * * Requires 0 < bn <= an <= rn and an + bn > rn/2 * Scratch need: rn + (need for recursive call OR rn + 4). This gives * * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4 */ void mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp) { ASSERT (0 < bn); ASSERT (bn <= an); ASSERT (an <= rn); if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD)) { if (UNLIKELY (bn < rn)) { if (UNLIKELY (an + bn <= rn)) { mpn_mul (rp, ap, an, bp, bn); } else { mp_limb_t cy; mpn_mul (tp, ap, an, bp, bn); cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn); MPN_INCR_U (rp, rn, cy); } } else mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp); } else { mp_size_t n; mp_limb_t cy; mp_limb_t hi; n = rn >> 1; /* We need at least an + bn >= n, to be able to fit one of the recursive products at rp. Requiring strict inequality makes the coded slightly simpler. If desired, we could avoid this restriction by initially halving rn as long as rn is even and an + bn <= rn/2. */ ASSERT (an + bn > n); /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1) and crt together as x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)] */ #define a0 ap #define a1 (ap + n) #define b0 bp #define b1 (bp + n) #define xp tp /* 2n + 2 */ /* am1 maybe in {xp, n} */ /* bm1 maybe in {xp + n, n} */ #define sp1 (tp + 2*n + 2) /* ap1 maybe in {sp1, n + 1} */ /* bp1 maybe in {sp1 + n + 1, n + 1} */ { mp_srcptr am1, bm1; mp_size_t anm, bnm; mp_ptr so; bm1 = b0; bnm = bn; if (LIKELY (an > n)) { am1 = xp; cy = mpn_add (xp, a0, n, a1, an - n); MPN_INCR_U (xp, n, cy); anm = n; so = xp + n; if (LIKELY (bn > n)) { bm1 = so; cy = mpn_add (so, b0, n, b1, bn - n); MPN_INCR_U (so, n, cy); bnm = n; so += n; } } else { so = xp; am1 = a0; anm = an; } mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so); } { int k; mp_srcptr ap1, bp1; mp_size_t anp, bnp; bp1 = b0; bnp = bn; if (LIKELY (an > n)) { ap1 = sp1; cy = mpn_sub (sp1, a0, n, a1, an - n); sp1[n] = 0; MPN_INCR_U (sp1, n + 1, cy); anp = n + ap1[n]; if (LIKELY (bn > n)) { bp1 = sp1 + n + 1; cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n); sp1[2*n+1] = 0; MPN_INCR_U (sp1 + n + 1, n + 1, cy); bnp = n + bp1[n]; } } else { ap1 = a0; anp = an; } if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD)) k=0; else { int mask; k = mpn_fft_best_k (n, 0); mask = (1<<k) - 1; while (n & mask) {k--; mask >>=1;}; } if (k >= FFT_FIRST_K) xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k); else if (UNLIKELY (bp1 == b0)) { ASSERT (anp + bnp <= 2*n+1); ASSERT (anp + bnp > n); ASSERT (anp >= bnp); mpn_mul (xp, ap1, anp, bp1, bnp); anp = anp + bnp - n; ASSERT (anp <= n || xp[2*n]==0); anp-= anp > n; cy = mpn_sub (xp, xp, n, xp + n, anp); xp[n] = 0; MPN_INCR_U (xp, n+1, cy); } else mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp); } /* Here the CRT recomposition begins. xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1) Division by 2 is a bitwise rotation. Assumes xp normalised mod (B^n+1). 
The residue class [0] is represented by [B^n-1]; except when both input are ZERO. */ #if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc #if HAVE_NATIVE_mpn_rsh1add_nc cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */ hi = cy << (GMP_NUMB_BITS - 1); cy = 0; /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi overflows, i.e. a further increment will not overflow again. */ #else /* ! _nc */ cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */ hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */ cy >>= 1; /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */ #endif #if GMP_NAIL_BITS == 0 add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi); #else cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1); rp[n-1] ^= hi; #endif #else /* ! HAVE_NATIVE_mpn_rsh1add_n */ #if HAVE_NATIVE_mpn_add_nc cy = mpn_add_nc(rp, rp, xp, n, xp[n]); #else /* ! _nc */ cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */ #endif cy += (rp[0]&1); mpn_rshift(rp, rp, n, 1); ASSERT (cy <= 2); hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */ cy >>= 1; /* We can have cy != 0 only if hi = 0... */ ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0); rp[n-1] |= hi; /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */ #endif ASSERT (cy <= 1); /* Next increment can not overflow, read the previous comments about cy. */ ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0)); MPN_INCR_U(rp, n, cy); /* Compute the highest half: ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n */ if (UNLIKELY (an + bn < rn)) { /* Note that in this case, the only way the result can equal zero mod B^{rn} - 1 is if one of the inputs is zero, and then the output of both the recursive calls and this CRT reconstruction is zero, not B^{rn} - 1. Which is good, since the latter representation doesn't fit in the output area.*/ cy = mpn_sub_n (rp + n, rp, xp, an + bn - n); /* FIXME: This subtraction of the high parts is not really necessary, we do it to get the carry out, and for sanity checking. */ cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n, xp + an + bn - n, rn - (an + bn), cy); ASSERT (an + bn == rn - 1 || mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn))); cy = mpn_sub_1 (rp, rp, an + bn, cy); ASSERT (cy == (xp + an + bn - n)[0]); } else { cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n); /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO. DECR will affect _at most_ the lowest n limbs. */ MPN_DECR_U (rp, 2*n, cy); } #undef a0 #undef a1 #undef b0 #undef b1 #undef xp #undef sp1 } }
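/* The CRT step above, shrunk to one 8-bit word (illustrative, not from the
   library): from xm = x mod (B^n - 1) and xp = x mod (B^n + 1), recover
       x = (B^n + 1) * ((xm + xp)/2 mod (B^n - 1)) - B^n * xp   (mod B^2n - 1)
   where the division by 2 uses ((B^n - 1) + 1)/2 as the inverse of 2. */
#include <assert.h>
#include <stdint.h>

int check_bnm1_crt (void)
{
  const int64_t Bn = 256, M = Bn - 1, P = Bn + 1;   /* B^n, B^n-1, B^n+1 */
  for (int64_t x = 0; x < M * P; x++)               /* M*P = B^2n - 1 */
    {
      int64_t xm = x % M, xp = x % P;
      int64_t half = ((xm + xp) * ((M + 1) / 2)) % M;   /* (xm+xp)/2 mod M */
      int64_t r = (P * half - Bn * xp) % (M * P);
      if (r < 0)
        r += M * P;
      assert (r == x);
    }
  return 1;
}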
mp_size_t mpn_rootrem (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un, mp_limb_t nth) { mp_ptr pp, qp, xp; mp_size_t pn, xn, qn; unsigned long int unb, xnb, bit; unsigned int cnt; mp_size_t i; unsigned long int n_valid_bits, adj; TMP_DECL; TMP_MARK; /* The extra factor 1.585 = log(3)/log(2) here is for the worst case overestimate of the root, i.e., when the code rounds a root that is 2+epsilon to 3, and then powers this to a potentially huge power. We could generalize the code for detecting root=1 a few lines below to deal with xnb <= k, for some small k. For example, when xnb <= 2, meaning the root should be 1, 2, or 3, we could replace this factor by the much smaller log(5)/log(4). */ #define PP_ALLOC (2 + (mp_size_t) (un*1.585)) pp = TMP_ALLOC_LIMBS (PP_ALLOC); count_leading_zeros (cnt, up[un - 1]); unb = un * GMP_NUMB_BITS - cnt + GMP_NAIL_BITS; xnb = (unb - 1) / nth + 1; if (xnb == 1) { if (remp == NULL) remp = pp; mpn_sub_1 (remp, up, un, (mp_limb_t) 1); MPN_NORMALIZE (remp, un); rootp[0] = 1; TMP_FREE; return un; } xn = (xnb + GMP_NUMB_BITS - 1) / GMP_NUMB_BITS; qp = TMP_ALLOC_LIMBS (PP_ALLOC); xp = TMP_ALLOC_LIMBS (xn + 1); /* Set initial root to only ones. This is an overestimate of the actual root by less than a factor of 2. */ for (i = 0; i < xn; i++) xp[i] = GMP_NUMB_MAX; xp[xnb / GMP_NUMB_BITS] = ((mp_limb_t) 1 << (xnb % GMP_NUMB_BITS)) - 1; /* Improve the initial approximation, one bit at a time. Keep the approximations >= root(U,nth). */ bit = xnb - 2; n_valid_bits = 0; for (i = 0; (nth >> i) != 0; i++) { mp_limb_t xl = xp[bit / GMP_NUMB_BITS]; xp[bit / GMP_NUMB_BITS] = xl ^ (mp_limb_t) 1 << bit % GMP_NUMB_BITS; pn = mpn_pow_1 (pp, xp, xn, nth, qp); ASSERT_ALWAYS (pn < PP_ALLOC); /* If the new root approximation is too small, restore old value. */ if (! (un < pn || (un == pn && mpn_cmp (up, pp, pn) < 0))) xp[bit / GMP_NUMB_BITS] = xl; /* restore old value */ n_valid_bits += 1; if (bit == 0) goto done; bit--; } adj = n_valid_bits - 1; /* Newton loop. Converges downwards towards root(U,nth). Currently we use full precision from iteration 1. Clearly, we should use just n_valid_bits of precision in each step, and thus save most of the computations. */ while (n_valid_bits <= xnb) { mp_limb_t cy; pn = mpn_pow_1 (pp, xp, xn, nth - 1, qp); ASSERT_ALWAYS (pn < PP_ALLOC); qp[xn - 1] = 0; /* pad quotient to make it always xn limbs */ mpn_tdiv_qr (qp, pp, (mp_size_t) 0, up, un, pp, pn); /* junk remainder */ cy = mpn_addmul_1 (qp, xp, xn, nth - 1); if (un - pn == xn) { cy += qp[xn]; if (cy == nth) { for (i = xn - 1; i >= 0; i--) qp[i] = GMP_NUMB_MAX; cy = nth - 1; } } qp[xn] = cy; qn = xn + (cy != 0); mpn_divrem_1 (xp, (mp_size_t) 0, qp, qn, nth); n_valid_bits = n_valid_bits * 2 - adj; } /* The computed result might be one unit too large. Adjust as necessary. */ done: pn = mpn_pow_1 (pp, xp, xn, nth, qp); ASSERT_ALWAYS (pn < PP_ALLOC); if (un < pn || (un == pn && mpn_cmp (up, pp, pn) < 0)) { mpn_decr_u (xp, 1); pn = mpn_pow_1 (pp, xp, xn, nth, qp); ASSERT_ALWAYS (pn < PP_ALLOC); ASSERT_ALWAYS (! (un < pn || (un == pn && mpn_cmp (up, pp, pn) < 0))); } if (remp == NULL) remp = pp; mpn_sub (remp, up, un, pp, pn); MPN_NORMALIZE (remp, un); MPN_COPY (rootp, xp, xn); TMP_FREE; return un; }
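/* The same Newton recurrence as the loop above, in one machine word
   (illustrative): x' = ((k-1)*x + u/x^(k-1)) / k converges downwards from an
   overestimate, and the result may still be off by one, hence the final
   adjustment -- the "one unit too large" fixup in the code above. */
#include <assert.h>
#include <stdint.h>

static uint64_t ipow (uint64_t b, unsigned e)
{
  uint64_t r = 1;
  while (e--)
    r *= b;
  return r;
}

int check_newton_root (void)
{
  const uint64_t u = 12345678901ULL;
  const unsigned k = 3;
  unsigned bits = 0;
  for (uint64_t t = u; t != 0; t >>= 1)
    bits++;
  uint64_t x = (uint64_t) 1 << ((bits + k - 1) / k);  /* >= floor root */
  for (;;)
    {
      uint64_t y = ((k - 1) * x + u / ipow (x, k - 1)) / k;
      if (y >= x)
        break;                        /* stopped decreasing */
      x = y;
    }
  while (ipow (x, k) > u)             /* final adjustment, as above */
    x--;
  assert (ipow (x, k) <= u && ipow (x + 1, k) > u);   /* x == 2311 */
  return 1;
}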
void mpz_powm_ui (mpz_ptr r, mpz_srcptr b, unsigned long int el, mpz_srcptr m) { mp_ptr xp, tp, qp, mp, bp; mp_size_t xn, tn, mn, bn; int m_zero_cnt; int c; mp_limb_t e; TMP_DECL; mp = PTR(m); mn = ABSIZ(m); if (mn == 0) DIVIDE_BY_ZERO; if (el == 0) { /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0 depending on if MOD equals 1. */ SIZ(r) = (mn == 1 && mp[0] == 1) ? 0 : 1; PTR(r)[0] = 1; return; } TMP_MARK; /* Normalize m (i.e. make its most significant bit set) as required by division functions below. */ count_leading_zeros (m_zero_cnt, mp[mn - 1]); m_zero_cnt -= GMP_NAIL_BITS; if (m_zero_cnt != 0) { mp_ptr new_mp = TMP_ALLOC_LIMBS (mn); mpn_lshift (new_mp, mp, mn, m_zero_cnt); mp = new_mp; } bn = ABSIZ(b); bp = PTR(b); if (bn > mn) { /* Reduce possibly huge base. Use a function call to reduce, since we don't want the quotient allocation to live until function return. */ mp_ptr new_bp = TMP_ALLOC_LIMBS (mn); reduce (new_bp, bp, bn, mp, mn); bp = new_bp; bn = mn; /* Canonicalize the base, since we are potentially going to multiply with it quite a few times. */ MPN_NORMALIZE (bp, bn); } if (bn == 0) { SIZ(r) = 0; TMP_FREE; return; } tp = TMP_ALLOC_LIMBS (2 * mn + 1); xp = TMP_ALLOC_LIMBS (mn); qp = TMP_ALLOC_LIMBS (mn + 1); MPN_COPY (xp, bp, bn); xn = bn; e = el; count_leading_zeros (c, e); e = (e << c) << 1; /* shift the exp bits to the left, lose msb */ c = BITS_PER_MP_LIMB - 1 - c; /* Main loop. */ /* If m is already normalized (high bit of high limb set), and b is the same size, but a bigger value, and e==1, then there's no modular reductions done and we can end up with a result out of range at the end. */ if (c == 0) { if (xn == mn && mpn_cmp (xp, mp, mn) >= 0) mpn_sub_n (xp, xp, mp, mn); goto finishup; } while (c != 0) { mpn_sqr_n (tp, xp, xn); tn = 2 * xn; tn -= tp[tn - 1] == 0; if (tn < mn) { MPN_COPY (xp, tp, tn); xn = tn; } else { mpn_tdiv_qr (qp, xp, 0L, tp, tn, mp, mn); xn = mn; } if ((mp_limb_signed_t) e < 0) { mpn_mul (tp, xp, xn, bp, bn); tn = xn + bn; tn -= tp[tn - 1] == 0; if (tn < mn) { MPN_COPY (xp, tp, tn); xn = tn; } else { mpn_tdiv_qr (qp, xp, 0L, tp, tn, mp, mn); xn = mn; } } e <<= 1; c--; } finishup: /* We shifted m left m_zero_cnt steps. Adjust the result by reducing it with the original MOD. */ if (m_zero_cnt != 0) { mp_limb_t cy; cy = mpn_lshift (tp, xp, xn, m_zero_cnt); tp[xn] = cy; xn += cy != 0; if (xn < mn) { MPN_COPY (xp, tp, xn); } else { mpn_tdiv_qr (qp, xp, 0L, tp, xn, mp, mn); xn = mn; } mpn_rshift (xp, xp, xn, m_zero_cnt); } MPN_NORMALIZE (xp, xn); if ((el & 1) != 0 && SIZ(b) < 0 && xn != 0) { mp = PTR(m); /* want original, unnormalized m */ mpn_sub (xp, mp, mn, xp, xn); xn = mn; MPN_NORMALIZE (xp, xn); } MPZ_REALLOC (r, xn); SIZ (r) = xn; MPN_COPY (PTR(r), xp, xn); TMP_FREE; }
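/* The main loop above is left-to-right binary exponentiation: square at
   every bit of e, multiply by the base when the bit is set.  One-word
   modular sketch (illustrative): */
#include <assert.h>
#include <stdint.h>

int check_binary_powm (void)
{
  const uint64_t b = 7, m = 1000000007;  /* m < 2^32 so products fit */
  const uint64_t e = 560;
  uint64_t x = 1;
  for (int i = 63; i >= 0; i--)
    {
      x = (x * x) % m;                   /* square for every bit */
      if ((e >> i) & 1)
        x = (x * b) % m;                 /* multiply when the bit is set */
    }
  /* cross-check against right-to-left exponentiation */
  uint64_t r = 1, s = b, t = e;
  while (t != 0)
    {
      if (t & 1)
        r = r * s % m;
      s = s * s % m;
      t >>= 1;
    }
  assert (x == r);
  return 1;
}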
void
mpz_powm_sec (mpz_ptr r, mpz_srcptr b, mpz_srcptr e, mpz_srcptr m)
{
  mp_size_t n;
  mp_ptr rp, tp;
  mp_srcptr bp, ep, mp;
  mp_size_t rn, bn, es, en;
  TMP_DECL;

  n = ABSIZ(m);
  mp = PTR(m);

  if (UNLIKELY ((n == 0) || (mp[0] % 2 == 0)))
    DIVIDE_BY_ZERO;

  es = SIZ(e);
  if (UNLIKELY (es <= 0))
    {
      if (es == 0)
        {
          /* b^0 mod m,  b is anything and m is non-zero.
             Result is 1 mod m, i.e., 1 or 0 depending on if m = 1.  */
          SIZ(r) = n != 1 || mp[0] != 1;
          PTR(r)[0] = 1;
          return;
        }
      DIVIDE_BY_ZERO;
    }
  en = es;

  bn = ABSIZ(b);

  if (UNLIKELY (bn == 0))
    {
      SIZ(r) = 0;
      return;
    }

  TMP_MARK;
  tp = TMP_ALLOC_LIMBS (n + mpn_sec_powm_itch (bn, en * GMP_NUMB_BITS, n));
  rp = tp;  tp += n;

  bp = PTR(b);
  ep = PTR(e);

  mpn_sec_powm (rp, bp, bn, ep, en * GMP_NUMB_BITS, mp, n, tp);

  rn = n;

  MPN_NORMALIZE (rp, rn);

  if ((ep[0] & 1) && SIZ(b) < 0 && rn != 0)
    {
      mpn_sub (rp, PTR(m), n, rp, rn);
      rn = n;
      MPN_NORMALIZE (rp, rn);
    }

  MPZ_REALLOC (r, rn);
  SIZ(r) = rn;
  MPN_COPY (PTR(r), rp, rn);
  TMP_FREE;
}
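Note the preconditions enforced above: the exponent must be positive and the modulus odd, which is what allows the underlying mpn_sec_powm to run with a uniform, input-independent execution pattern. A usage sketch against the public interface:

#include <gmp.h>

int
main (void)
{
  mpz_t r, b, e, m;
  mpz_init (r);
  mpz_init_set_ui (b, 5);
  mpz_init_set_ui (e, 3);       /* exponent must be > 0 */
  mpz_init_set_ui (m, 9);       /* modulus must be odd */
  mpz_powm_sec (r, b, e, m);    /* r = 5^3 mod 9 = 8 */
  gmp_printf ("%Zd\n", r);
  mpz_clears (r, b, e, m, NULL);
  return 0;
}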
void
_gst_mpz_add (gst_mpz *sum, const gst_mpz *u, const gst_mpz *v)
{
  mp_srcptr up, vp;
  mp_ptr sump;
  mp_size_t usize, vsize, sumsize;
  mp_size_t abs_usize;
  mp_size_t abs_vsize;

  usize = u->size;
  vsize = v->size;
  abs_usize = ABS (usize);
  abs_vsize = ABS (vsize);

  if (abs_usize < abs_vsize)
    {
      /* Swap U and V.  */
      { const gst_mpz *t = u; u = v; v = t; }
      { mp_size_t t = usize; usize = vsize; vsize = t; }
      { mp_size_t t = abs_usize; abs_usize = abs_vsize; abs_vsize = t; }
    }

  /* True: abs(USIZE) >= abs(VSIZE) */

  /* If there is not enough space for the sum (and a possible carry),
     increase the space.  */
  sumsize = abs_usize + 1;
  if (sum->alloc < sumsize)
    gst_mpz_realloc (sum, sumsize);

  /* These must be set after the realloc (u or v may be the same as
     sum).  */
  up = u->d;
  vp = v->d;
  sump = sum->d;

  if (usize >= 0)
    {
      if (vsize >= 0)
        {
          sumsize = mpn_add (sump, up, abs_usize, vp, abs_vsize);
          if (sumsize != 0)
            sump[abs_usize] = 1;
          sumsize = sumsize + abs_usize;
        }
      else
        {
          /* The signs are different.  Need an exact comparison to
             determine which operand to subtract from which.  */
          if (abs_usize == abs_vsize && mpn_cmp (up, vp, abs_usize) < 0)
            sumsize = -(abs_usize
                        + mpn_sub (sump, vp, abs_usize, up, abs_usize));
          else
            sumsize = (abs_usize
                       + mpn_sub (sump, up, abs_usize, vp, abs_vsize));
        }
    }
  else
    {
      if (vsize >= 0)
        {
          /* The signs are different.  Need an exact comparison to
             determine which operand to subtract from which.  */
          if (abs_usize == abs_vsize && mpn_cmp (up, vp, abs_usize) < 0)
            sumsize = (abs_usize
                       + mpn_sub (sump, vp, abs_usize, up, abs_usize));
          else
            sumsize = -(abs_usize
                        + mpn_sub (sump, up, abs_usize, vp, abs_vsize));
        }
      else
        {
          sumsize = mpn_add (sump, up, abs_usize, vp, abs_vsize);
          if (sumsize != 0)
            sump[abs_usize] = 1;
          sumsize = -(sumsize + abs_usize);
        }
    }

  sum->size = sumsize;
}
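The branch structure here is the classic signed-magnitude case analysis: equal signs add the magnitudes, opposite signs subtract the smaller magnitude from the larger and take the larger operand's sign. A self-contained single-word sketch of the same logic (sm_add is a hypothetical helper, not part of the library):

#include <stdio.h>

/* Mirror of the case analysis in _gst_mpz_add, with one-word magnitudes:
   sign is +1 or -1, mag is the absolute value.  */
static void
sm_add (int us, unsigned long um, int vs, unsigned long vm,
        int *rs, unsigned long *rm)
{
  if (us == vs)                 /* same signs: add magnitudes */
    {
      *rm = um + vm;
      *rs = us;
    }
  else if (um >= vm)            /* different signs: subtract smaller */
    {
      *rm = um - vm;
      *rs = us;
    }
  else
    {
      *rm = vm - um;
      *rs = vs;
    }
  if (*rm == 0)
    *rs = 1;                    /* canonical zero, like sumsize == 0 above */
}

int
main (void)
{
  int s;
  unsigned long m;
  sm_add (+1, 100, -1, 250, &s, &m);
  printf ("%c%lu\n", s < 0 ? '-' : '+', m);     /* prints -150 */
  return 0;
}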
/* If approx is non-zero, does not compute the final remainder.  */
static mp_size_t
mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un,
                      mp_limb_t k, int approx)
{
  mp_ptr qp, rp, sp, wp, scratch;
  mp_size_t qn, rn, sn, wn, nl, bn;
  mp_limb_t save, save2, cy;
  unsigned long int unb;        /* number of significant bits of {up,un} */
  unsigned long int xnb;        /* number of significant bits of the result */
  unsigned int cnt;
  unsigned long b, kk;
  unsigned long sizes[GMP_NUMB_BITS + 1];
  int ni, i;
  int c;
  int logk;
  TMP_DECL;

  TMP_MARK;

  /* qp and wp need enough space to store S'^k where S' is an approximate
     root.  Since S' can be as large as S+2, the worst case is when S=2 and
     S'=4.  But then since we know the number of bits of S in advance, S'
     can only be 3 at most.  Similarly for S=4, then S' can be 6 at most.
     So the worst case is S'/S=3/2, thus S'^k <= (3/2)^k * S^k.  Since S^k
     fits in un limbs, the number of extra limbs needed is bounded by
     ceil(k*log2(3/2)/GMP_NUMB_BITS).  */
#define EXTRA 2 + (mp_size_t) (0.585 * (double) k / (double) GMP_NUMB_BITS)
  qp = TMP_ALLOC_LIMBS (un + EXTRA); /* will contain quotient and remainder
                                        of R/(k*S^(k-1)), and S^k */
  if (remp == NULL)
    {
      rp = TMP_ALLOC_LIMBS (un + 1);    /* will contain the remainder */
      scratch = rp;                     /* used by mpn_div_q */
    }
  else
    {
      scratch = TMP_ALLOC_LIMBS (un + 1); /* used by mpn_div_q */
      rp = remp;
    }
  sp = rootp;

  wp = TMP_ALLOC_LIMBS (un + EXTRA); /* will contain S^(k-1), k*S^(k-1),
                                        and a temporary for mpn_pow_1 */

  count_leading_zeros (cnt, up[un - 1]);
  unb = un * GMP_NUMB_BITS - cnt + GMP_NAIL_BITS;
  /* unb is the number of bits of the input U */

  xnb = (unb - 1) / k + 1;      /* ceil (unb / k) */
  /* xnb is the number of bits of the root R */

  if (xnb == 1) /* root is 1 */
    {
      if (remp == NULL)
        remp = rp;
      mpn_sub_1 (remp, up, un, (mp_limb_t) 1);
      MPN_NORMALIZE (remp, un); /* There should be at most one zero limb,
                                   if we demand u to be normalized  */
      rootp[0] = 1;
      TMP_FREE;
      return un;
    }

  /* We initialize the algorithm with a 1-bit approximation to zero: since
     we know the root has exactly xnb bits, we write r0 = 2^(xnb-1), so
     that r0^k = 2^(k*(xnb-1)), which we subtract from the input.  */
  kk = k * (xnb - 1);           /* number of truncated bits in the input */
  rn = un - kk / GMP_NUMB_BITS; /* number of limbs of the non-truncated part */
  MPN_RSHIFT (cy, rp, up + kk / GMP_NUMB_BITS, rn, kk % GMP_NUMB_BITS);
  mpn_sub_1 (rp, rp, rn, 1);    /* subtract the initial approximation: since
                                   the non-truncated part is less than 2^k,
                                   it is <= k bits:
                                   rn <= ceil(k/GMP_NUMB_BITS) */
  sp[0] = 1;                    /* initial approximation */
  sn = 1;                       /* it has one limb */

  for (logk = 1; ((k - 1) >> logk) != 0; logk++)
    ;
  /* logk = ceil(log(k)/log(2)) */

  b = xnb - 1; /* number of remaining bits to determine in the kth root */
  ni = 0;
  while (b != 0)
    {
      /* invariant: here we want b+1 total bits for the kth root */
      sizes[ni] = b;
      /* if c is the new value of b, this means that we'll go from a root
         of c+1 bits (say s') to a root of b+1 bits.  It is proved in the
         book "Modern Computer Arithmetic" by Brent and Zimmermann,
         Chapter 1, that if s' >= k*beta, then at most one correction is
         necessary.  Here beta = 2^(b-c), and s' >= 2^c, thus it suffices
         that c >= ceil((b + log2(k))/2).  */
      b = (b + logk + 1) / 2;
      if (b >= sizes[ni])
        b = sizes[ni] - 1;      /* add just one bit at a time */
      ni++;
    }
  sizes[ni] = 0;
  ASSERT_ALWAYS (ni < GMP_NUMB_BITS + 1);
  /* We have sizes[0] = b > sizes[1] > ... > sizes[ni] = 0 with
     sizes[i] <= 2 * sizes[i+1].
     Newton iteration will first compute sizes[ni-1] extra bits,
     then sizes[ni-2], ..., then sizes[0] = b.  */

  wp[0] = 1;                    /* {sp,sn}^(k-1) = 1 */
  wn = 1;
  for (i = ni; i != 0; i--)
    {
      /* 1: loop invariant:
         {sp, sn} is the current approximation of the root, which has
                  exactly 1 + sizes[ni] bits.
         {rp, rn} is the current remainder
         {wp, wn} = {sp, sn}^(k-1)
         kk = number of truncated bits of the input
      */
      b = sizes[i - 1] - sizes[i]; /* number of bits to compute in that
                                      iteration */

      /* Reinsert a low zero limb if we normalized away the entire
         remainder */
      if (rn == 0)
        {
          rp[0] = 0;
          rn = 1;
        }

      /* first multiply the remainder by 2^b */
      MPN_LSHIFT (cy, rp + b / GMP_NUMB_BITS, rp, rn, b % GMP_NUMB_BITS);
      rn = rn + b / GMP_NUMB_BITS;
      if (cy != 0)
        {
          rp[rn] = cy;
          rn++;
        }

      kk = kk - b;

      /* 2: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */

      /* Now insert bits [kk,kk+b-1] from the input U */
      bn = b / GMP_NUMB_BITS; /* lowest limb from high part of rp[] */
      save = rp[bn];
      /* nl is the number of limbs in U which contain bits [kk,kk+b-1] */
      nl = 1 + (kk + b - 1) / GMP_NUMB_BITS - (kk / GMP_NUMB_BITS);
      /* nl  = 1 + floor((kk + b - 1) / GMP_NUMB_BITS)
                 - floor(kk / GMP_NUMB_BITS)
             <= 1 + (kk + b - 1) / GMP_NUMB_BITS
                  - (kk - GMP_NUMB_BITS + 1) / GMP_NUMB_BITS
             = 2 + (b - 2) / GMP_NUMB_BITS
         thus since nl is an integer:
         nl <= 2 + floor(b/GMP_NUMB_BITS) <= 2 + bn.  */
      /* we have to save rp[bn] up to rp[nl-1], i.e. 1 or 2 limbs */
      if (nl - 1 > bn)
        save2 = rp[bn + 1];
      MPN_RSHIFT (cy, rp, up + kk / GMP_NUMB_BITS, nl, kk % GMP_NUMB_BITS);
      /* set to zero high bits of rp[bn] */
      rp[bn] &= ((mp_limb_t) 1 << (b % GMP_NUMB_BITS)) - 1;
      /* restore corresponding bits */
      rp[bn] |= save;
      if (nl - 1 > bn)
        rp[bn + 1] = save2; /* the low b bits go in rp[0..bn] only, since
                               they start by bit 0 in rp[0], so they use
                               at most ceil(b/GMP_NUMB_BITS) limbs */

      /* 3: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */

      /* compute {wp, wn} = k * {sp, sn}^(k-1) */
      cy = mpn_mul_1 (wp, wp, wn, k);
      wp[wn] = cy;
      wn += cy != 0;

      /* 4: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */

      /* now divide {rp, rn} by {wp, wn} to get the low part of the root */
      if (rn < wn)
        {
          qn = 0;
        }
      else
        {
          qn = rn - wn; /* expected quotient size */
          mpn_div_q (qp, rp, rn, wp, wn, scratch);
          qn += qp[qn] != 0;
        }

      /* 5: current buffers: {sp,sn}, {qp,qn}.
         Note: {rp,rn} is not needed any more since we'll compute it from
         scratch at the end of the loop.  */

      /* Number of limbs used by b bits, when the least significant bit is
         aligned to the least limb */
      bn = (b - 1) / GMP_NUMB_BITS + 1;

      /* the quotient should be smaller than 2^b, since the previous
         approximation was correctly rounded toward zero */
      if (qn > bn || (qn == bn && (b % GMP_NUMB_BITS != 0) &&
                      qp[qn - 1] >= ((mp_limb_t) 1 << (b % GMP_NUMB_BITS))))
        {
          qn = b / GMP_NUMB_BITS + 1; /* b+1 bits */
          MPN_ZERO (qp, qn);
          qp[qn - 1] = (mp_limb_t) 1 << (b % GMP_NUMB_BITS);
          MPN_DECR_U (qp, qn, 1);
          qn -= qp[qn - 1] == 0;
        }

      /* 6: current buffers: {sp,sn}, {qp,qn} */

      /* multiply the root approximation by 2^b */
      MPN_LSHIFT (cy, sp + b / GMP_NUMB_BITS, sp, sn, b % GMP_NUMB_BITS);
      sn = sn + b / GMP_NUMB_BITS;
      if (cy != 0)
        {
          sp[sn] = cy;
          sn++;
        }

      /* 7: current buffers: {sp,sn}, {qp,qn} */

      ASSERT_ALWAYS (bn >= qn); /* this is ok since in the case qn > bn
                                   above, q is set to 2^b-1, which has
                                   exactly bn limbs */

      /* Combine sB and q to form sB + q.  */
      save = sp[b / GMP_NUMB_BITS];
      MPN_COPY (sp, qp, qn);
      MPN_ZERO (sp + qn, bn - qn);
      sp[b / GMP_NUMB_BITS] |= save;

      /* 8: current buffer: {sp,sn} */

      /* Since each iteration treats b bits from the root and thus k*b bits
         from the input, and we already considered b bits from the input,
         we now have to take another (k-1)*b bits from the input.  */
      kk -= (k - 1) * b; /* remaining input bits */
      /* {rp, rn} = floor({up, un} / 2^kk) */
      MPN_RSHIFT (cy, rp, up + kk / GMP_NUMB_BITS, un - kk / GMP_NUMB_BITS,
                  kk % GMP_NUMB_BITS);
      rn = un - kk / GMP_NUMB_BITS;
      rn -= rp[rn - 1] == 0;

      /* 9: current buffers: {sp,sn}, {rp,rn} */

      for (c = 0;; c++)
        {
          /* Compute S^k in {qp,qn}.  */
          if (i == 1)
            {
              /* Last iteration: we don't need W anymore.  */
              /* mpn_pow_1 requires that both qp and wp have enough space
                 to store the result {sp,sn}^k + 1 limb */
              approx = approx && (sp[0] > 1);
              qn = (approx == 0) ? mpn_pow_1 (qp, sp, sn, k, wp) : 0;
            }
          else
            {
              /* W <- S^(k-1) for the next iteration,
                 and S^k = W * S.  */
              wn = mpn_pow_1 (wp, sp, sn, k - 1, qp);
              mpn_mul (qp, wp, wn, sp, sn);
              qn = wn + sn;
              qn -= qp[qn - 1] == 0;
            }

          /* if S^k > floor(U/2^kk), the root approximation was too
             large */
          if (qn > rn || (qn == rn && mpn_cmp (qp, rp, rn) > 0))
            MPN_DECR_U (sp, sn, 1);
          else
            break;
        }

      /* 10: current buffers: {sp,sn}, {rp,rn}, {qp,qn}, {wp,wn} */

      ASSERT_ALWAYS (c <= 1);
      ASSERT_ALWAYS (rn >= qn);

      /* R = R - Q = floor(U/2^kk) - S^k */
      if ((i > 1) || (approx == 0))
        {
          mpn_sub (rp, rp, rn, qp, qn);
          MPN_NORMALIZE (rp, rn);
        }
      /* otherwise we have rn > 0, thus the return value is ok */

      /* 11: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
    }

  TMP_FREE;
  return rn;
}
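The sizes[] schedule is the heart of the divide-and-conquer structure: each Newton step roughly doubles the number of correct bits (minus a log2(k) guard term), so the loop works through sizes[ni-1], ..., sizes[0]. A standalone sketch of just that schedule (hypothetical driver, not part of GMP; k and xnb are made-up inputs):

#include <stdio.h>

int
main (void)
{
  unsigned long k = 5, xnb = 100;       /* e.g. a 100-bit 5th root */
  unsigned long sizes[64], b = xnb - 1;
  int logk, ni = 0, i;

  for (logk = 1; ((k - 1) >> logk) != 0; logk++)
    ;                                   /* logk = ceil(log2(k)) */
  while (b != 0)                        /* same recurrence as above */
    {
      sizes[ni] = b;
      b = (b + logk + 1) / 2;           /* c >= ceil((b + log2(k))/2) */
      if (b >= sizes[ni])
        b = sizes[ni] - 1;              /* force at least one bit of progress */
      ni++;
    }
  for (i = ni - 1; i >= 0; i--)         /* the order the Newton loop uses */
    printf ("after this step the root has %lu bits\n", sizes[i] + 1);
  return 0;
}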