/* Divides {up, n} by d.  Writes the n-1 low quotient limbs at {qp, n-1},
   and the high quotient limb at *qh.  Returns remainder. */
mp_limb_t
mpn_div_qr_1 (mp_ptr qp, mp_limb_t *qh, mp_srcptr up, mp_size_t n,
              mp_limb_t d)
{
  unsigned cnt;
  mp_limb_t uh;

  ASSERT (n > 0);
  ASSERT (d > 0);

  if (d & GMP_NUMB_HIGHBIT)
    {
      /* Normalized case */
      mp_limb_t dinv, q;

      uh = up[--n];

      /* Branch-free high quotient limb: q = 1 and uh -= d exactly
         when uh >= d.  */
      q = (uh >= d);
      *qh = q;
      uh -= (-q) & d;

      if (BELOW_THRESHOLD (n, DIV_QR_1_NORM_THRESHOLD))
        {
          cnt = 0;
        plain:	/* also the target of a goto in the unnormalized path */
          while (n > 0)
            {
              mp_limb_t ul = up[--n];
              udiv_qrnnd (qp[n], uh, uh, ul, d);
            }
          return uh >> cnt;
        }
      invert_limb (dinv, d);
      return mpn_div_qr_1n_pi1 (qp, up, n, uh, d, dinv);
    }
/* Computes a one-limb precomputed inverse of the two-limb divisor {d2, d1}
   (d1 the most significant limb, assumed normalized), i.e. an approximation
   of floor((B^3 - 1)/(d1*B + d2)) - B, where B is the limb base. */
mp_limb_t
div_preinv1(mp_limb_t d1, mp_limb_t d2)
{
    mp_limb_t q, r[2], p[2], cy;

    /* The divisor is B^2 - 1, the largest possible: its inverse is 0. */
    if (d2 + 1 == 0 && d1 + 1 == 0)
        return 0;

    /* Divide {~d1, ~d2} by d1 + 1; when d1 + 1 wraps to 0 the quotient
       and remainder are immediate. */
    if (d1 + 1 == 0)
        q = ~d1, r[1] = ~d2;
    else
        udiv_qrnnd(q, r[1], ~d1, ~d2, d1 + 1);

    r[0] = 0;

    if (d2 + 1 == 0)
        add_ssaaaa(cy, r[1], 0, r[1], 0, q);
    else
    {
        umul_ppmm(p[1], p[0], q, ~d2 - 1);
        cy = mpn_add_n(r, r, p, 2);
    }

    /* Final adjustment: increment q if the remainder reaches the divisor. */
    p[0] = d2 + 1, p[1] = d1 + (d2 + 1 == 0);
    if (cy || mpn_cmp(r, p, 2) >= 0)
        q++;

    return q;
}
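/* A minimal sketch of how a precomputed inverse of this kind is consumed,
   shown for the simpler one-limb (2/1) case on 64-bit limbs rather than
   the two-limb inverse above.  It follows the well-known Moller-Granlund
   scheme and assumes a compiler providing unsigned __int128; limb_invert
   and div2by1_preinv are illustrative names, not part of the original. */
#include <stdint.h>

/* v = floor((B^2 - 1)/d) - B for normalized d (top bit set), B = 2^64;
   equivalently the quotient of {~d, ~0} by d. */
static uint64_t
limb_invert(uint64_t d)
{
    return (uint64_t) ((((unsigned __int128) ~d << 64) | ~(uint64_t) 0) / d);
}

/* Divide {u1, u0} by d using the inverse v; requires u1 < d and d
   normalized.  Returns the quotient and stores the remainder in *r. */
static uint64_t
div2by1_preinv(uint64_t *r, uint64_t u1, uint64_t u0, uint64_t d, uint64_t v)
{
    unsigned __int128 q = (unsigned __int128) v * u1
                        + (((unsigned __int128) u1 << 64) | u0);
    uint64_t q1 = (uint64_t) (q >> 64) + 1;   /* quotient estimate */
    uint64_t q0 = (uint64_t) q;
    uint64_t rr = u0 - q1 * d;                /* remainder estimate, mod 2^64 */

    if (rr > q0)          /* estimate was one too large */
    {
        q1--;
        rr += d;
    }
    if (rr >= d)          /* rare case: one too small */
    {
        q1++;
        rr -= d;
    }
    *r = rr;
    return q1;
}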
/*
 * Multiply x and y, reducing the result modulo n.
 */
uint64_t
mul_mod_n(uint64_t x, uint64_t y, uint64_t n)
{
#if 0
	uint64_t q, r, p1, p2;

	/* Full 128-bit product, then a 128-by-64-bit reduction.  Requires
	   x, y < n so that the quotient fits in a single limb. */
	umul_ppmm(p1, p2, x, y);
	udiv_qrnnd(q, r, p1, p2, n);
	return r;
#endif
	/* NOTE: correct only while the product x*y fits in 64 bits;
	   otherwise it wraps before the reduction. */
	return (x * y) % n;
}
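/* A hedged alternative to the disabled branch above: the same full-width
   product and reduction, written with unsigned __int128 (a GCC/Clang
   extension) so the result stays correct even when x*y overflows 64 bits.
   mul_mod_n_wide is an illustrative name, not part of the original. */
#include <stdint.h>

static uint64_t
mul_mod_n_wide(uint64_t x, uint64_t y, uint64_t n)
{
    /* The full product never exceeds 128 bits, so the reduction is exact. */
    return (uint64_t) (((unsigned __int128) x * y) % n);
}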
mp_limb_t
mpn_mod_1 (mp_srcptr up, mp_size_t un, mp_limb_t d)
{
  mp_size_t  i;
  mp_limb_t  n1, n0, r;
  mp_limb_t  dummy;

  ASSERT (un >= 0);
  ASSERT (d != 0);

  /* Botch: Should this be handled at all?  Rely on callers?  But note
     un==0 is currently required by mpz/fdiv_r_ui.c and possibly other
     places.  */
  if (un == 0)
    return 0;

#if HAVE_NATIVE_mpn_divrem_euclidean_r_1
  return mpn_divrem_euclidean_r_1 (up, un, d);
#endif

  d <<= GMP_NAIL_BITS;

  if ((d & GMP_LIMB_HIGHBIT) != 0)
    {
      /* High limb is initial remainder, possibly with one subtract of
         d to get r<d.  */
      r = up[un - 1] << GMP_NAIL_BITS;
      if (r >= d)
        r -= d;
      r >>= GMP_NAIL_BITS;
      un--;
      if (un == 0)
        return r;

      if (BELOW_THRESHOLD (un, MOD_1_NORM_THRESHOLD))
        {
        plain:
          for (i = un - 1; i >= 0; i--)
            {
              n0 = up[i] << GMP_NAIL_BITS;
              udiv_qrnnd (dummy, r, r, n0, d);
              r >>= GMP_NAIL_BITS;
            }
          return r;
        }
      else
        {
void
sample(void * arg, ulong count)
{
    mp_limb_t d;
    mp_ptr array = (mp_ptr) flint_malloc(200 * sizeof(mp_limb_t));
    flint_rand_t state;
    ulong i;
    int j;

    flint_randinit(state);

    d = n_randtest_not_zero(state);

    for (i = 0; i < count; i++)
    {
        /* Fill pairs of limbs; rejection-sample the high limb so that it
           is below d, which udiv_qrnnd requires for the quotient to fit
           in a single limb. */
        for (j = 0; j < 200; j += 2)
        {
            do
            {
                array[j] = n_randtest(state);
            } while (array[j] >= d);
            array[j + 1] = n_randtest(state);
        }

        prof_start();
        for (j = 0; j < 200; j += 2)
            udiv_qrnnd(array[j], array[j + 1], array[j], array[j + 1], d);
        prof_stop();

        /* Touch the results so the timed divisions cannot be optimized
           away as dead code. */
        for (j = 0; j < 200; j++)
            if (array[j] == 0)
                printf("\r");
    }

    flint_randclear(state);
    flint_free(array);
}
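/* A hypothetical driver for the sampler above, assuming FLINT's profiler
   interface from "profiler.h" (prof_repeat with a (void *, ulong) target);
   the interface and output wording here are assumptions, not part of the
   original. */
#include "profiler.h"

int
main(void)
{
    double min, max;

    /* Runs sample() repeatedly and records the fastest and slowest
       timings observed for the prof_start()/prof_stop() window. */
    prof_repeat(&min, &max, sample, NULL);
    flint_printf("udiv_qrnnd: min %.3lf, max %.3lf (per 100 divisions)\n",
                 min, max);
    return 0;
}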
mp_limb_t
mpn_sb_divrem_mn (mp_ptr qp,
                  mp_ptr np, mp_size_t nn,
                  mp_srcptr dp, mp_size_t dn)
{
  mp_limb_t most_significant_q_limb = 0;
  mp_size_t qn = nn - dn;
  mp_size_t i;
  mp_limb_t dx, d1, n0;
  mp_limb_t dxinv;
  int use_preinv;

  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (np, nn, dp, dn));
  ASSERT (! MPN_OVERLAP_P (qp, nn-dn, dp, dn));
  ASSERT (! MPN_OVERLAP_P (qp, nn-dn, np, nn) || qp+dn >= np);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, dn);

  np += qn;
  dx = dp[dn - 1];
  d1 = dp[dn - 2];
  n0 = np[dn - 1];

  if (n0 >= dx)
    {
      if (n0 > dx || mpn_cmp (np, dp, dn - 1) >= 0)
        {
          mpn_sub_n (np, np, dp, dn);
          most_significant_q_limb = 1;
        }
    }

  /* use_preinv is possibly a constant, but it's left to the compiler to
     optimize away the unused code in that case.  */
  use_preinv = ABOVE_THRESHOLD (qn, DIV_SB_PREINV_THRESHOLD);
  if (use_preinv)
    invert_limb (dxinv, dx);

  for (i = qn - 1; i >= 0; i--)
    {
      mp_limb_t q;
      mp_limb_t nx;
      mp_limb_t cy_limb;

      nx = np[dn - 1];		/* FIXME: could get value from r1 */
      np--;

      if (nx == dx)
        {
          /* This might over-estimate q, but it's probably not worth
             the extra code here to find out.  */
          q = GMP_NUMB_MASK;

#if 1
          cy_limb = mpn_submul_1 (np, dp, dn, q);
#else
          /* This should be faster on many machines */
          cy_limb = mpn_sub_n (np + 1, np + 1, dp, dn);
          cy = mpn_add_n (np, np, dp, dn);
          np[dn] += cy;
#endif

          if (nx != cy_limb)
            {
              mpn_add_n (np, np, dp, dn);
              q--;
            }

          qp[i] = q;
        }
      else
        {
          mp_limb_t rx, r1, r0, p1, p0;

          /* "workaround" avoids a problem with gcc 2.7.2.3 i386 register
             usage when np[dn-1] is used in an asm statement like
             umul_ppmm in udiv_qrnnd_preinv.  The symptom is seg faults
             due to registers being clobbered.  gcc 2.95 i386 doesn't
             have the problem. */
          {
            mp_limb_t  workaround = np[dn - 1];
            if (use_preinv)
              udiv_qrnnd_preinv (q, r1, nx, workaround, dx, dxinv);
            else
              {
                udiv_qrnnd (q, r1, nx, workaround << GMP_NAIL_BITS,
                            dx << GMP_NAIL_BITS);
                r1 >>= GMP_NAIL_BITS;
              }
          }
          umul_ppmm (p1, p0, d1, q << GMP_NAIL_BITS);
          p0 >>= GMP_NAIL_BITS;

          r0 = np[dn - 2];
          rx = 0;
          if (r1 < p1 || (r1 == p1 && r0 < p0))
            {
              p1 -= p0 < d1;
              p0 = (p0 - d1) & GMP_NUMB_MASK;
              q--;
              r1 = (r1 + dx) & GMP_NUMB_MASK;
              rx = r1 < dx;
            }

          p1 += r0 < p0;	/* cannot carry! */
          rx -= r1 < p1;	/* may become 11..1 if q is still too large */
          r1 = (r1 - p1) & GMP_NUMB_MASK;
          r0 = (r0 - p0) & GMP_NUMB_MASK;

          cy_limb = mpn_submul_1 (np, dp, dn - 2, q);

          /* Check if we've over-estimated q, and adjust as needed.  */
          {
            mp_limb_t  cy1, cy2;
            cy1 = r0 < cy_limb;
            r0 = (r0 - cy_limb) & GMP_NUMB_MASK;
            cy2 = r1 < cy1;
            r1 -= cy1;
            np[dn - 1] = r1;
            np[dn - 2] = r0;
            if (cy2 != rx)
              {
                mpn_add_n (np, np, dp, dn);
                q--;
              }
          }
          qp[i] = q;
        }
    }

  /*
       ______ ______ ______
      |__rx__|__r1__|__r0__|		partial remainder
           ______ ______
    -     |__p1__|__p0__|		partial product to subtract
           ______ ______
    -     |______|cylimb|

     rx is -1, 0 or 1.  If rx=1, then q is correct (it should match
     carry out).  If rx=-1 then q is too large.  If rx=0, then q
     might be too large, but it is most likely correct.
  */

  return most_significant_q_limb;
}
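/* The heart of the loop above is quotient estimation from the two top
   divisor limbs followed by correction.  Below is a minimal, hypothetical
   sketch of that single step on base-2^32 digits, so every intermediate
   fits in uint64_t: textbook Knuth Algorithm D, step D3, not GMP's exact
   code; quotient_digit is an illustrative name.  It assumes d1 is
   normalized (top bit set) and that the running remainder's top digit
   does not exceed d1, which the initial subtraction above guarantees.
   The rare remaining over-estimate by one is what the add-back after
   mpn_submul_1 catches. */
#include <stdint.h>

static uint32_t
quotient_digit(uint32_t n2, uint32_t n1, uint32_t n0,
               uint32_t d1, uint32_t d0)
{
    const uint64_t b = (uint64_t) 1 << 32;
    uint64_t num  = ((uint64_t) n2 << 32) | n1;
    uint64_t qhat = num / d1;              /* estimate from the top digit */
    uint64_t rhat = num % d1;

    if (qhat >= b)                         /* clamp to a single digit */
    {
        qhat = b - 1;
        rhat = num - qhat * d1;
    }

    /* Correct the estimate against the second divisor digit; with d1
       normalized this loop runs at most twice. */
    while (rhat < b && qhat * d0 > ((rhat << 32) | n0))
    {
        qhat--;
        rhat += d1;
    }
    return (uint32_t) qhat;
}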
mp_limb_t
_ll_factor_SQUFOF(mp_limb_t n_hi, mp_limb_t n_lo, ulong max_iters)
{
    mp_limb_t n[2];
    mp_limb_t sqrt[2];
    mp_limb_t rem[2];
    mp_size_t num, sqroot, p, q;
    mp_limb_t l, l2, iq, pnext;
    mp_limb_t qarr[50];
    mp_limb_t qupto, qlast, t, r = 0;
    ulong i, j;

    n[0] = n_lo;
    n[1] = n_hi;

    if (n_hi)
        num = mpn_sqrtrem(sqrt, rem, n, 2);
    else
        num = ((sqrt[0] = n_sqrtrem(rem, n_lo)) != 0UL);

    sqroot = sqrt[0];
    p = sqroot;
    q = rem[0];

    /* Zero remainder: n is a perfect square, so its square root is a
       factor. */
    if ((q == 0) || (num == 0))
        return sqroot;

    l = 1 + 2*n_sqrt(2*p);
    l2 = l/2;
    qupto = 0;
    qlast = 1;

    /* Forward cycle on the continued fraction expansion of sqrt(n),
       recording small q values in qarr and stopping at a square form. */
    for (i = 0; i < max_iters; i++)
    {
        iq = (sqroot + p)/q;
        pnext = iq*q - p;

        if (q <= l)
        {
            if ((q & 1UL) == 0UL)
            {
                qarr[qupto] = q/2;
                qupto++;
                if (qupto >= 50UL)
                    return 0UL;
            }
            else if (q <= l2)
            {
                qarr[qupto] = q;
                qupto++;
                if (qupto >= 50UL)
                    return 0UL;
            }
        }

        t = qlast + iq*(p - pnext);
        qlast = q;
        q = t;
        p = pnext;

        /* Square forms can only occur on even iterations. */
        if ((i & 1) == 1)
            continue;
        if (!n_is_square(q))
            continue;

        r = n_sqrt(q);

        if (qupto == 0UL)
            break;

        /* Reject square forms whose root was queued earlier; they lead
           to trivial factorizations. */
        for (j = 0; j < qupto; j++)
            if (r == qarr[j])
                goto cont;

        break;

cont:   ;
        if (r == 1UL)
            return 0UL;
    }

    if (i == max_iters)
        return 0UL; /* taken too long, give up */

    /* A square form was found; set up the reverse cycle, which reveals
       a factor. */
    qlast = r;
    p = p + r*((sqroot - p)/r);
    umul_ppmm(rem[1], rem[0], p, p);
    sub_ddmmss(sqrt[1], sqrt[0], n[1], n[0], rem[1], rem[0]);

    if (sqrt[1])
    {
        int norm;
        count_leading_zeros(norm, qlast);
        udiv_qrnnd(q, rem[0],
                   (sqrt[1] << norm) + r_shift(sqrt[0], FLINT_BITS - norm),
                   sqrt[0] << norm, qlast << norm);
        rem[0] >>= norm;
    }
    else
    {
mp_limb_t
mpn_divrem (mp_ptr qp, mp_size_t qextra_limbs,
            mp_ptr np, mp_size_t nsize,
            mp_srcptr dp, mp_size_t dsize)
{
  mp_limb_t most_significant_q_limb = 0;

  switch (dsize)
    {
    case 0:
      /* We are asked to divide by zero, so go ahead and do it!  (To make
         the compiler not remove this statement, return the value.)  */
      return 1 / dsize;

    case 1:
      {
        mp_size_t i;
        mp_limb_t n1;
        mp_limb_t d;

        d = dp[0];
        n1 = np[nsize - 1];

        if (n1 >= d)
          {
            n1 -= d;
            most_significant_q_limb = 1;
          }

        qp += qextra_limbs;
        for (i = nsize - 2; i >= 0; i--)
          udiv_qrnnd (qp[i], n1, n1, np[i], d);
        qp -= qextra_limbs;

        for (i = qextra_limbs - 1; i >= 0; i--)
          udiv_qrnnd (qp[i], n1, n1, 0, d);

        np[0] = n1;
      }
      break;

    case 2:
      {
        mp_size_t i;
        mp_limb_t n1, n0, n2;
        mp_limb_t d1, d0;

        np += nsize - 2;
        d1 = dp[1];
        d0 = dp[0];
        n1 = np[1];
        n0 = np[0];

        if (n1 >= d1 && (n1 > d1 || n0 >= d0))
          {
            sub_ddmmss (n1, n0, n1, n0, d1, d0);
            most_significant_q_limb = 1;
          }

        for (i = qextra_limbs + nsize - 2 - 1; i >= 0; i--)
          {
            mp_limb_t q;
            mp_limb_t r;

            if (i >= qextra_limbs)
              np--;
            else
              np[0] = 0;

            if (n1 == d1)
              {
                /* Q should be either 111..111 or 111..110.  Need special
                   treatment of this rare case as normal division would
                   give overflow.  */
                q = ~(mp_limb_t) 0;

                r = n0 + d1;
                if (r < d1)	/* Carry in the addition? */
                  {
                    add_ssaaaa (n1, n0, r - d0, np[0], 0, d0);
                    qp[i] = q;
                    continue;
                  }
                n1 = d0 - (d0 != 0);
                n0 = -d0;
              }
            else
              {
                udiv_qrnnd (q, r, n1, n0, d1);
                umul_ppmm (n1, n0, d0, q);
              }

            n2 = np[0];

          q_test:
            if (n1 > r || (n1 == r && n0 > n2))
              {
                /* The estimated Q was too large.  */
                q--;

                sub_ddmmss (n1, n0, n1, n0, 0, d0);
                r += d1;
                if (r >= d1)	/* If not carry, test Q again.  */
                  goto q_test;
              }

            qp[i] = q;
            sub_ddmmss (n1, n0, r, n2, n1, n0);
          }
        np[1] = n1;
        np[0] = n0;
      }
      break;

    default:
      {
        mp_size_t i;
        mp_limb_t dX, d1, n0;

        np += nsize - dsize;
        dX = dp[dsize - 1];
        d1 = dp[dsize - 2];
        n0 = np[dsize - 1];

        if (n0 >= dX)
          {
            if (n0 > dX || mpn_cmp (np, dp, dsize - 1) >= 0)
              {
                mpn_sub_n (np, np, dp, dsize);
                n0 = np[dsize - 1];
                most_significant_q_limb = 1;
              }
          }

        for (i = qextra_limbs + nsize - dsize - 1; i >= 0; i--)
          {
            mp_limb_t q;
            mp_limb_t n1, n2;
            mp_limb_t cy_limb;

            if (i >= qextra_limbs)
              {
                np--;
                n2 = np[dsize];
              }
            else
              {
                n2 = np[dsize - 1];
                MPN_COPY_DECR (np + 1, np, dsize);
                np[0] = 0;
              }

            if (n0 == dX)
              /* This might over-estimate q, but it's probably not worth
                 the extra code here to find out.  */
              q = ~(mp_limb_t) 0;
            else
              {
                mp_limb_t r;

                udiv_qrnnd (q, r, n0, np[dsize - 1], dX);
                umul_ppmm (n1, n0, d1, q);

                while (n1 > r || (n1 == r && n0 > np[dsize - 2]))
                  {
                    q--;
                    r += dX;
                    if (r < dX)	/* I.e. "carry in previous addition?" */
                      break;
                    n1 -= n0 < d1;
                    n0 -= d1;
                  }
              }

            /* Possible optimization: We already have (q * n0) and (1 * n1)
               after the calculation of q.  Taking advantage of that, we
               could make this loop make two iterations less.  */

            cy_limb = mpn_submul_1 (np, dp, dsize, q);

            if (n2 != cy_limb)
              {
                mpn_add_n (np, np, dp, dsize);
                q--;
              }

            qp[i] = q;
            n0 = np[dsize - 1];
          }
      }
    }

  return most_significant_q_limb;
}
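/* The dsize == 1 case above is plain long division with the remainder
   carried between limbs.  Below is a self-contained 64-bit rendering for
   cross-checking, assuming unsigned __int128; divrem_1 is an illustrative
   name, and unlike the code above it handles the most significant limb
   with an ordinary division step instead of the initial subtraction. */
#include <stdint.h>
#include <stddef.h>

static uint64_t
divrem_1(uint64_t *qp, const uint64_t *np, size_t n, uint64_t d)
{
    uint64_t r = 0;

    for (size_t i = n; i-- > 0; )      /* most significant limb first */
    {
        unsigned __int128 t = ((unsigned __int128) r << 64) | np[i];
        qp[i] = (uint64_t) (t / d);    /* one 128/64 division per limb */
        r     = (uint64_t) (t % d);    /* remainder feeds the next step */
    }
    return r;                          /* final remainder */
}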