/* Compute the remainder of {up, un} modulo d, using the precomputed
   inverse dinv of d.  d must be normalized (high bit set); un >= 1.
   Returns the remainder, which is < d. */
mp_limb_t
mpn_preinv_mod_1 (mp_srcptr up, mp_size_t un, mp_limb_t d, mp_limb_t dinv)
{
  mp_limb_t rem, limb, discard_q;
  mp_size_t idx;

  ASSERT (un >= 1);
  ASSERT (d & GMP_LIMB_HIGHBIT);

  /* Seed the remainder with the most significant limb, reduced mod d.
     Since d is normalized, a single conditional subtraction suffices. */
  rem = up[un - 1];
  if (rem >= d)
    rem -= d;

  /* Fold in the remaining limbs from high to low; the quotient limb
     produced by each division step is discarded. */
  idx = un - 1;
  while (idx-- > 0)
    {
      limb = up[idx];
      udiv_qrnnd_preinv (discard_q, rem, rem, limb, d, dinv);
    }

  return rem;
}
/* Divides (uh B^n + {up, n}) by d, storing the n quotient limbs at
   {qp, n} and returning the remainder.  Requires uh < d and d
   normalized (high bit set).  qp may equal up. */
mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t uh,
		   mp_limb_t d, mp_limb_t dinv)
{
  mp_size_t i;

  ASSERT (n > 0);
  ASSERT (uh < d);
  ASSERT (d & GMP_NUMB_HIGHBIT);
  ASSERT (MPN_SAME_OR_SEPARATE_P (qp, up, n));

  /* High-to-low schoolbook division, one limb per step.  Each dividend
     limb is read before the corresponding quotient limb is stored, so
     the qp == up aliasing permitted above is safe. */
  for (i = n - 1; i >= 0; i--)
    {
      mp_limb_t qlimb, ulimb;
      ulimb = up[i];
      udiv_qrnnd_preinv (qlimb, uh, uh, ulimb, d, dinv);
      qp[i] = qlimb;
    }

  return uh;
}
/* Profiling sample: times udiv_qrnnd_preinv on batches of 100 random
   two-limb dividends against a fixed, normalized random divisor.
   Only the region between prof_start/prof_stop is timed. */
void sample(void * arg, ulong count)
{
    mp_limb_t d, q, r, dinv, norm;
    mp_ptr buf = (mp_ptr) flint_malloc(200 * sizeof(mp_limb_t));
    FLINT_TEST_INIT(state);
    ulong iter;
    int k;

    /* Pick a nonzero divisor and normalize it (shift its high bit up),
       as required by udiv_qrnnd_preinv. */
    d = n_randtest_not_zero(state);
    count_leading_zeros(norm, d);
    d <<= norm;

    for (iter = 0; iter < count; iter++)
    {
        /* Fill limb pairs: even slot holds the high limb, which must be
           < d for the division to be valid; odd slot holds the low limb. */
        for (k = 0; k < 200; k += 2)
        {
            do
                buf[k] = n_randtest(state);
            while (buf[k] >= d);
            buf[k + 1] = n_randtest(state);
        }

        invert_limb(dinv, d);

        prof_start();
        for (k = 0; k < 200; k += 2)
            udiv_qrnnd_preinv(q, r, buf[k], buf[k + 1], d, dinv);
        prof_stop();

        /* Keep q and r observably live so the timed loop cannot be
           optimized away. */
        if (q + r == 0)
            flint_printf("\r");
    }

    flint_randclear(state);
    flint_free(buf);
}
/* Schoolbook division of {np, nn} by the normalized divisor {dp, dn},
   for dn > 2.  The nn-dn quotient limbs are written to {qp, nn-dn};
   the dn-limb remainder is left in the low limbs of np.  Returns the
   "most significant quotient limb", which is 0 or 1.
   Requires dp[dn-1] to have its high bit set. */
mp_limb_t
mpn_sb_divrem_mn (mp_ptr qp, mp_ptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)
{
  mp_limb_t most_significant_q_limb = 0;
  mp_size_t qn = nn - dn;
  mp_size_t i;
  mp_limb_t dx, d1, n0;
  mp_limb_t dxinv;
  int use_preinv;

  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (np, nn, dp, dn));
  ASSERT (! MPN_OVERLAP_P (qp, nn-dn, dp, dn));
  ASSERT (! MPN_OVERLAP_P (qp, nn-dn, np, nn) || qp+dn >= np);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, dn);

  np += qn;                     /* point np at the high dn dividend limbs */
  dx = dp[dn - 1];              /* top divisor limb */
  d1 = dp[dn - 2];              /* second divisor limb */
  n0 = np[dn - 1];              /* top dividend limb */

  /* If the high dn dividend limbs are >= the divisor, subtract the
     divisor once; that subtraction is the top quotient limb (1). */
  if (n0 >= dx)
    {
      if (n0 > dx || mpn_cmp (np, dp, dn - 1) >= 0)
	{
	  mpn_sub_n (np, np, dp, dn);
	  most_significant_q_limb = 1;
	}
    }

  /* use_preinv is possibly a constant, but it's left to the compiler to
     optimize away the unused code in that case. */
  use_preinv = ABOVE_THRESHOLD (qn, DIV_SB_PREINV_THRESHOLD);
  if (use_preinv)
    invert_limb (dxinv, dx);

  /* One quotient limb per iteration, high to low. */
  for (i = qn - 1; i >= 0; i--)
    {
      mp_limb_t q;
      mp_limb_t nx;
      mp_limb_t cy_limb;

      nx = np[dn - 1];		/* FIXME: could get value from r1 */
      np--;

      if (nx == dx)
	{
	  /* Top dividend limb equals top divisor limb: the true quotient
	     limb is B-1 or B.  This might over-estimate q, but it's
	     probably not worth the extra code here to find out. */
	  q = GMP_NUMB_MASK;

#if 1
	  cy_limb = mpn_submul_1 (np, dp, dn, q);
#else
	  /* This should be faster on many machines */
	  cy_limb = mpn_sub_n (np + 1, np + 1, dp, dn);
	  cy = mpn_add_n (np, np, dp, dn);
	  np[dn] += cy;
#endif

	  /* If the borrow out of the submul doesn't match the limb we
	     consumed, q was one too large: add the divisor back. */
	  if (nx != cy_limb)
	    {
	      mpn_add_n (np, np, dp, dn);
	      q--;
	    }

	  qp[i] = q;
	}
      else
	{
	  mp_limb_t rx, r1, r0, p1, p0;

	  /* "workaround" avoids a problem with gcc 2.7.2.3 i386 register
	     usage when np[dn-1] is used in an asm statement like
	     umul_ppmm in udiv_qrnnd_preinv.  The symptom is seg faults
	     due to registers being clobbered.  gcc 2.95 i386 doesn't
	     have the problem. */
	  {
	    mp_limb_t workaround = np[dn - 1];
	    if (use_preinv)
	      udiv_qrnnd_preinv (q, r1, nx, workaround, dx, dxinv);
	    else
	      {
		udiv_qrnnd (q, r1, nx, workaround << GMP_NAIL_BITS,
			    dx << GMP_NAIL_BITS);
		r1 >>= GMP_NAIL_BITS;
	      }
	  }

	  /* Refine the candidate q using the second divisor limb d1:
	     compare the 2-limb partial remainder against q*d1 and
	     decrement q at most once here. */
	  umul_ppmm (p1, p0, d1, q << GMP_NAIL_BITS);
	  p0 >>= GMP_NAIL_BITS;

	  r0 = np[dn - 2];
	  rx = 0;
	  if (r1 < p1 || (r1 == p1 && r0 < p0))
	    {
	      p1 -= p0 < d1;
	      p0 = (p0 - d1) & GMP_NUMB_MASK;
	      q--;
	      r1 = (r1 + dx) & GMP_NUMB_MASK;
	      rx = r1 < dx;
	    }

	  p1 += r0 < p0;	/* cannot carry! */
	  rx -= r1 < p1;	/* may become 11..1 if q is still too large */
	  r1 = (r1 - p1) & GMP_NUMB_MASK;
	  r0 = (r0 - p0) & GMP_NUMB_MASK;

	  /* Subtract q times the remaining dn-2 divisor limbs. */
	  cy_limb = mpn_submul_1 (np, dp, dn - 2, q);

	  /* Check if we've over-estimated q, and adjust as needed. */
	  {
	    mp_limb_t cy1, cy2;
	    cy1 = r0 < cy_limb;
	    r0 = (r0 - cy_limb) & GMP_NUMB_MASK;
	    cy2 = r1 < cy1;
	    r1 -= cy1;
	    np[dn - 1] = r1;
	    np[dn - 2] = r0;
	    if (cy2 != rx)
	      {
		mpn_add_n (np, np, dp, dn);
		q--;
	      }
	  }
	  qp[i] = q;
	}
    }

  /* ______ ______ ______
    |__rx__|__r1__|__r0__|		partial remainder
	    ______ ______
	 -  |__p1__|__p0__|		partial product to subtract
	    ______ ______
	 -  |______|cylimb|

     rx is -1, 0 or 1.  If rx=1, then q is correct (it should match
     carry out).  If rx=-1 then q is too large.  If rx=0, then q might
     be too large, but it is most likely correct.  */

  return most_significant_q_limb;
}
/* Euclidean division of {xp, xn} by the two-limb normalized divisor
   {dp, 2} (high bit of dp[1] set).  Writes xn-1 quotient limbs to
   {qp, xn-1} via qp[qn] stores plus the returned top limb; leaves the
   2-limb remainder in xp[0..1].  Returns the top quotient limb (0 or 1). */
mp_limb_t
mpn_divrem_euclidean_qr_2(mp_ptr qp, mp_ptr xp, mp_size_t xn, mp_srcptr dp)
{
   mp_size_t qn;
   mp_limb_t qf, t[2], t1[2], q, h, l, d1, d2, i;
   int c1, c3, c4;

   ASSERT(xn >= 2);
   ASSERT_MPN(dp, 2);
   ASSERT_MPN(xp, xn);
   ASSERT(dp[1] != 0);

   qn = xn - 1;

   /* ASSERT(!MPN_OVERLAP_P(qp, qn, xp, xn)); */ /* FIXME: correct this overlap requirement */

   /* Divisor must be normalized for the preinverted division below. */
   ASSERT((dp[1]>>(GMP_NUMB_BITS - 1)) != 0);

   h = 0;
   d1 = dp[1];                  /* high divisor limb */
   d2 = dp[0];                  /* low divisor limb */
   invert_limb(i, d1);          /* i = precomputed inverse of d1 */

   l = xp[xn - 1];
   qn = xn - 2;
   t[0] = xp[qn];

   /* Establish the initial 2-limb running remainder (h, l) = t[1], t[0]
      and the top quotient limb qf. */
   if (l < d1)
   {
      /* Top limb already reduced mod the divisor: qf = 0. */
      h = t[1] = l; l = t[0] = xp[qn]; qf = 0;
   } else
   {
      /* Top limb >= d1: tentative top quotient limb 1; subtract the
         divisor and undo (qf back to 0) if that borrows. */
      qf = 1;
      t[1] = l - d1;
      t1[1] = 0; t1[0] = d2;
      if (mpn_sub_n(t, t, t1, 2))
      {
         qf--;
         mpn_add_n(t, t, dp, 2);
      }
      h = t[1]; l = t[0];
   }

   /* Main loop: bring down one dividend limb per iteration and produce
      one quotient limb. */
   for (qn = xn - 3; qn >= 0; qn--)
   {
      t[0] = xp[qn];
      if (h < d1)
      {
         /* Normal case: estimate q from (h, l) / d1 with the preinverse,
            subtract q*d2, and correct q downward (at most twice). */
         udiv_qrnnd_preinv(q, t[1], h, l, d1, i);
         umul_ppmm(t1[1], t1[0], q, d2);
         if (mpn_sub_n(t, t, t1, 2))
         {
            q--;
            if (mpn_add_n(t, t, dp, 2) == 0)
            {
               q--;
               ASSERT_CARRY(mpn_add_n(t, t, dp, 2));
            }
         }
      } else
      {
         /* Degenerate case h == d1: the quotient limb would be B, which
            doesn't fit; start from the guess q = B - 1. */
         ASSERT(h == d1);
         q = -1;
         t[1] = l;
         c3 = mpn_add_n(t, t, dp, 2);
         c1 = mpn_sub_1(t + 1, t + 1, 1, d2);
         c4 = c3 - c1;
         if (l >= d1)
         {
            ASSERT(c3 != 0); ASSERT(c4 == 0);
         } /* our guess is B + 1, so q = B - 1 is correct */
         else
         {
            ASSERT(c4 <= 0);
            /* our guess is B so q = B - 1 or B - 2 */
            if (c4 != 0)
            {
               q--;
               mpn_add_n(t, t, dp, 2);
            }
         }
      }
      h = t[1]; l = t[0];
      qp[qn] = q;
   }

   /* Store the final 2-limb remainder back into the dividend. */
   xp[1] = t[1]; xp[0] = t[0];

   return qf;
}
/* Divides (u1 B^n + {up, n}) by d, storing the quotient at {qp, n} and
   returning the remainder.  Requires u1 < d and d normalized (high bit
   set).  qp == up is allowed (quotient limbs are written after the
   dividend limbs they depend on are read).
   This variant pipelines the quotient computation: it works with B^2
   mod d (B2) so each iteration folds one dividend limb into a 3-limb
   surplus (u2, u1, u0) while accumulating quotient limbs with delayed
   carry propagation via MPN_INCR_U. */
mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
		   mp_limb_t d, mp_limb_t dinv)
{
  mp_limb_t B2;
  mp_limb_t u0, u2;
  mp_limb_t q0, q1;
  mp_limb_t p0, p1;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (n > 0);
  ASSERT (u1 < d);

  /* Single-limb case: one plain preinverted division. */
  if (n == 1)
    {
      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
      return u1;
    }

  /* B2 = B^2 mod d (for normalized d). FIXME: Could be precomputed */
  B2 = -d*dinv;

  umul_ppmm (q1, q0, dinv, u1);
  umul_ppmm (p1, p0, B2, u1);
  q1 += u1;
  ASSERT (q1 >= u1);
  u0 = up[n-1]; /* Early read, to allow qp == up. */
  qp[n-1] = q1;

  add_mssaaaa (u2, u1, u0, u0, up[n-2], p1, p0);

  /* FIXME: Keep q1 in a variable between iterations, to reduce number
     of memory accesses. */
  for (j = n-2; j-- > 0; )
    {
      mp_limb_t q2, cy;

      /* Additions for the q update:
       *	+-------+
       *        |u1 * v |
       *        +---+---+
       *        | u1|
       *    +---+---+
       *    | 1 | v |  (conditional on u2)
       *    +---+---+
       *        | 1 |  (conditional on u0 + u2 B2 carry)
       *        +---+
       * +      | q0|
       *   -+---+---+---+
       *    | q2| q1| q0|
       *    +---+---+---+
       */
      umul_ppmm (p1, t, u1, dinv);
      add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1);
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1);
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0);
      q0 = t;

      umul_ppmm (p1, p0, u1, B2);
      ADDC_LIMB (cy, u0, u0, u2 & B2);
      u0 -= (-cy) & d;      /* conditional subtraction of d on carry */

      /* Final q update */
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), cy);
      qp[j+1] = q1;
      /* Propagate the top carry limb into the already-stored quotient. */
      MPN_INCR_U (qp+j+2, n-j-2, q2);

      /* Fold in the next dividend limb. */
      add_mssaaaa (u2, u1, u0, u0, up[j], p1, p0);
    }

  /* Reduce the leftover surplus: u2 acts as a carry flag here. */
  q1 = (u2 > 0);
  u1 -= (-q1) & d;

  t = (u1 >= d);
  q1 += t;
  u1 -= (-t) & d;

  /* Final exact division of the remaining 2-limb value. */
  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);

  MPN_INCR_U (qp+1, n-1, q1);

  qp[0] = q0;
  return u0;
}
/* Divides {up, un} by the normalized single limb pd->d, producing two
   quotient limbs per loop iteration using the 2-limb precomputed
   inverse pd->dip[].  Quotient limbs go to {qp, un}; returns the
   remainder.  Requires un >= 2.
   The q3/q2/q1 macros alias specific partial-product variables so the
   accumulation below reads like a 3-limb quotient register. */
mp_limb_t
mpn_div_qr_1n_pi2 (mp_ptr qp, mp_srcptr up, mp_size_t un,
		   struct precomp_div_1_pi2 *pd)
{
  mp_limb_t most_significant_q_limb;
  mp_size_t i;
  mp_limb_t r, u2, u1, u0;
  mp_limb_t d0, di1, di0;
  mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
  mp_limb_t cnd;

  ASSERT (un >= 2);
  ASSERT ((pd->d & GMP_NUMB_HIGHBIT) != 0);
  ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
  ASSERT_MPN (up, un);

#define q3 q3a
#define q2 q2b
#define q1 q1b

  up += un - 3;
  r = up[2];
  d0 = pd->d;
  /* Reduce the top dividend limb; that reduction is the top quotient limb. */
  most_significant_q_limb = (r >= d0);
  r -= d0 & -most_significant_q_limb;

  qp += un - 3;
  qp[2] = most_significant_q_limb;

  di1 = pd->dip[1];             /* high limb of the 2-limb inverse */
  di0 = pd->dip[0];             /* low limb of the 2-limb inverse */

  /* Each pass consumes two dividend limbs and emits two quotient limbs. */
  for (i = un - 3; i >= 0; i -= 2)
    {
      u2 = r;
      u1 = up[1];
      u0 = up[0];

      /* Dividend in {r,u1,u0} */

      /* Accumulate the candidate quotient from the four partial
         products of (u2, u1) with (di1, di0). */
      umul_ppmm (q1d,q0d, u1, di0);
      umul_ppmm (q2b,q1b, u1, di1);
      q2b++;				/* cannot spill */
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);

      umul_ppmm (q2c,q1c, u2, di0);
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
      umul_ppmm (q3a,q2a, u2, di1);

      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);

      q3 += r;

      /* Remainder candidate, then up to two corrective adjustments. */
      r = u0 - q2 * d0;

      cnd = (r >= q1);
      r += d0 & -cnd;
      sub_ddmmss (q3,q2, q3,q2, 0,cnd);

      if (UNLIKELY (r >= d0))
	{
	  r -= d0;
	  add_ssaaaa (q3,q2, q3,q2, 0,1);
	}

      qp[0] = q2;
      qp[1] = q3;

      up -= 2;
      qp -= 2;
    }

  /* With un even, one dividend limb remains; finish with a plain
     preinverted division step. */
  if ((un & 1) == 0)
    {
      u2 = r;
      u1 = up[1];

      udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
      qp[1] = q3;
    }

  return r;

#undef q3
#undef q2
#undef q1
}