/* Divides {up, n} by d.  Writes the n-1 low quotient limbs at {qp, n-1},
   and the high quotient limb at *qh.  Returns remainder.  */
mp_limb_t
mpn_div_qr_1 (mp_ptr qp, mp_limb_t *qh, mp_srcptr up, mp_size_t n,
              mp_limb_t d)
{
  unsigned cnt;
  mp_limb_t uh;

  ASSERT (n > 0);
  ASSERT (d > 0);

  if (d & GMP_NUMB_HIGHBIT)
    {
      /* Normalized case */
      mp_limb_t dinv, q;

      uh = up[--n];

      q = (uh >= d);
      *qh = q;
      uh -= (-q) & d;

      if (BELOW_THRESHOLD (n, DIV_QR_1_NORM_THRESHOLD))
        {
          cnt = 0;
        plain:
          while (n > 0)
            {
              mp_limb_t ul = up[--n];
              udiv_qrnnd (qp[n], uh, uh, ul, d);
            }
          return uh >> cnt;
        }
      invert_limb (dinv, d);
      return mpn_div_qr_1n_pi1 (qp, up, n, uh, d, dinv);
    }
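/* A minimal, self-contained sketch (not GMP code) of the technique the
   invert_limb / udiv_qrnnd_preinv pair above relies on: precompute
   v = floor((B^2 - 1)/d) - B for a normalized divisor d, then divide any
   two-limb value by d with two multiplications and small corrections
   (Granlund-Moller style).  32-bit limbs are used so a plain uint64_t can
   serve as the double limb; the names limb_t, dlimb_t, reciprocal_32 and
   div_2by1_preinv are illustrative, not GMP's.  */
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t limb_t;
typedef uint64_t dlimb_t;

/* v = floor((B^2 - 1)/d) - B, for d with its top bit set (B = 2^32).
   This is what invert_limb computes for a normalized limb.  */
static limb_t
reciprocal_32 (limb_t d)
{
  assert (d & 0x80000000u);
  return (limb_t) (((((dlimb_t) (limb_t) ~d) << 32) | 0xFFFFFFFFu) / d);
}

/* Divide (u1,u0) by the normalized d, with u1 < d, using the precomputed v.
   Returns the quotient and stores the remainder in *r.  */
static limb_t
div_2by1_preinv (limb_t *r, limb_t u1, limb_t u0, limb_t d, limb_t v)
{
  /* Quotient estimate: high limb of v*u1 + (u1+1, u0), computed mod B^2.  */
  dlimb_t q = (dlimb_t) v * u1;
  q += (((dlimb_t) u1 << 32) | u0) + ((dlimb_t) 1 << 32);
  limb_t q1 = (limb_t) (q >> 32);
  limb_t q0 = (limb_t) q;

  /* Candidate remainder, kept modulo B.  */
  limb_t rem = (limb_t) (u0 - (dlimb_t) q1 * d);

  if (rem > q0)        /* estimate was one too large */
    {
      q1--;
      rem += d;
    }
  if (rem >= d)        /* at most one further adjustment is ever needed */
    {
      q1++;
      rem -= d;
    }
  *r = rem;
  return q1;
}

int
main (void)
{
  limb_t d = 0x94A7F321u;               /* any divisor with the top bit set */
  limb_t v = reciprocal_32 (d);
  limb_t u1 = 0x12345678u, u0 = 0x9ABCDEF0u, r;
  limb_t q = div_2by1_preinv (&r, u1, u0, d, v);

  dlimb_t u = ((dlimb_t) u1 << 32) | u0;
  assert (q == (limb_t) (u / d) && r == (limb_t) (u % d));
  printf ("q = %" PRIx32 ", r = %" PRIx32 "\n", q, r);
  return 0;
}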
void
mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (n == 1)
    invert_limb (*ip, *dp);
  else if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD))
    {
      /* Maximum scratch needed by this branch: 2*n */
      mp_size_t i;
      mp_ptr xp;

      xp = scratch;                             /* 2 * n limbs */
      /* n > 1 here */
      i = n;
      do
        xp[--i] = GMP_NUMB_MAX;
      while (i);
      mpn_com (xp + n, dp, n);
      if (n == 2)
        {
          mpn_divrem_2 (ip, 0, xp, 4, dp);
        }
      else
        {
          gmp_pi1_t inv;
          invert_pi1 (inv, dp[n-1], dp[n-2]);
          /* FIXME: should we use dcpi1_div_q, for big sizes? */
          mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32);
        }
    }
  else /* Use approximated inverse; correct the result if needed. */
    {
      mp_limb_t e; /* The possible error in the approximate inverse */

      ASSERT ( mpn_invert_itch (n) >= mpn_invertappr_itch (n) );
      e = mpn_ni_invertappr (ip, dp, n, scratch);

      if (UNLIKELY (e)) /* Assume the error can only be "0" (no error) or "1". */
        {
          /* Code to detect and correct the "off by one" approximation. */
          mpn_mul_n (scratch, ip, dp, n);
          e = mpn_add_n (scratch, scratch, dp, n); /* FIXME: we only need e.*/
          if (LIKELY(e)) /* The high part can not give a carry by itself. */
            e = mpn_add_nc (scratch + n, scratch + n, dp, n, e); /* FIXME:e */
          /* If the value was wrong (no carry), correct it (increment). */
          e ^= CNST_LIMB (1);
          MPN_INCR_U (ip, n, e);
        }
    }
}
void
mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (n == 1)
    invert_limb (*ip, *dp);
  else
    {
      TMP_DECL;

      TMP_MARK;
      if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD))
        {
          /* Maximum scratch needed by this branch: 2*n */
          mp_size_t i;
          mp_ptr xp;

          xp = scratch;                         /* 2 * n limbs */
          for (i = n - 1; i >= 0; i--)
            xp[i] = GMP_NUMB_MAX;
          mpn_com (xp + n, dp, n);
          if (n == 2)
            {
              mpn_divrem_2 (ip, 0, xp, 4, dp);
            }
          else
            {
              gmp_pi1_t inv;
              invert_pi1 (inv, dp[n-1], dp[n-2]);
              /* FIXME: should we use dcpi1_div_q, for big sizes? */
              mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32);
            }
        }
      else /* Use approximated inverse; correct the result if needed. */
        {
          mp_limb_t e; /* The possible error in the approximate inverse */

          ASSERT ( mpn_invert_itch (n) >= mpn_invertappr_itch (n) );
          e = mpn_ni_invertappr (ip, dp, n, scratch);

          if (UNLIKELY (e)) /* Assume the error can only be "0" (no error) or "1". */
            {
              /* Code to detect and correct the "off by one" approximation. */
              mpn_mul_n (scratch, ip, dp, n);
              ASSERT_NOCARRY (mpn_add_n (scratch + n, scratch + n, dp, n));
              if (! mpn_add (scratch, scratch, 2*n, dp, n))
                MPN_INCR_U (ip, n, 1); /* The value was wrong, correct it. */
            }
        }
      TMP_FREE;
    }
}
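/* A small mpz-level check (not part of GMP) of what the two mpn_invert
   variants above compute: with B = 2^GMP_NUMB_BITS and D = {dp,n}
   normalized, the result I = B^n + {ip,n} is floor((B^(2n) - 1)/D), i.e.
   D*I < B^(2n) <= D*(I+1).  The basecase branch obtains it by dividing
   B^(2n) - 1 - D*B^n -- the 2n-limb number built from GMP_NUMB_MAX limbs
   and the complement of D -- by D.  Only the public mpz interface is used
   here; the size n and the random divisor are arbitrary.  */
#include <assert.h>
#include <gmp.h>

int
main (void)
{
  const mp_size_t n = 4;
  const mp_bitcnt_t nbits = (mp_bitcnt_t) n * GMP_NUMB_BITS;
  gmp_randstate_t rs;
  mpz_t d, b2n, inv, t;

  gmp_randinit_default (rs);
  mpz_inits (d, b2n, inv, t, NULL);

  mpz_urandomb (d, rs, nbits);
  mpz_setbit (d, nbits - 1);        /* force the normalization bit of D */
  mpz_setbit (b2n, 2 * nbits);      /* b2n = B^(2n) */

  /* I = floor((B^(2n) - 1)/D); for a normalized D this lies in
     [B^n, 2*B^n), so its low n limbs are exactly {ip,n} above.  */
  mpz_sub_ui (t, b2n, 1);
  mpz_fdiv_q (inv, t, d);

  /* Defining property: D*I < B^(2n) <= D*(I+1).  */
  mpz_mul (t, d, inv);
  assert (mpz_cmp (t, b2n) < 0);
  mpz_add (t, t, d);
  assert (mpz_cmp (t, b2n) >= 0);

  gmp_printf ("D = %Zx\nI = %Zx\n", d, inv);

  mpz_clears (d, b2n, inv, t, NULL);
  gmp_randclear (rs);
  return 0;
}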
static __inline__ mp_limb_t
choose_prime_and_degree (p_k_pk_t* pp, nmod_t* mod, n_primes_rev_t it,
                         const mpz_t divisor)
{
    mp_limb_t r, t = 1, r_mod_p;
    for (;;)
    {
        if ((pp->p = n_primes_rev_next(it)) == 1)
        {
            flint_printf("Exception (choose_prime_and_degree): "
                         "Prime set exhausted\n");
            abort();
        }
        if (pp->p >= (UWORD(1) << (FLINT_BITS/2)))
        {
            pp->k = 1;
            r = r_mod_p = mpz_fdiv_ui(divisor, pp->p_deg_k = pp->p);
            t = 0;
        }
        else
        {
            max_degree(pp);
            r = mpz_fdiv_ui(divisor, pp->p_deg_k);
            r_mod_p = r % pp->p;
        }
        if (r_mod_p)
        {
            if (t)
                count_leading_zeros(t, pp->p_deg_k);
            mod->n = pp->p_deg_k << t;
            invert_limb(mod->ninv, mod->n);
#if SPEEDUP_NMOD_RED3
            t = - mod->n;
            mod->norm = n_mulmod_preinv_4arg(t, t, mod->n, mod->ninv);
#else
            mod->norm = 0;
#endif
            return inv_mod_pk_4arg(r, r_mod_p, pp[0], mod[0]);
        }
    }
}
void
sample(void * arg, ulong count)
{
    mp_limb_t d, q, r, dinv, norm;
    mp_ptr array = (mp_ptr) flint_malloc(200 * sizeof(mp_limb_t));
    FLINT_TEST_INIT(state);
    ulong i;
    int j;

    d = n_randtest_not_zero(state);
    count_leading_zeros(norm, d);
    d <<= norm;

    for (i = 0; i < count; i++)
    {
        for (j = 0; j < 200; j += 2)
        {
            do
            {
                array[j] = n_randtest(state);
            } while (array[j] >= d);
            array[j + 1] = n_randtest(state);
        }

        invert_limb(dinv, d);

        prof_start();
        for (j = 0; j < 200; j += 2)
        {
            udiv_qrnnd_preinv(q, r, array[j], array[j + 1], d, dinv);
        }
        prof_stop();

        /* keep q and r live so the timed loop is not optimized away */
        if (q + r == 0)
            flint_printf("\r");
    }

    flint_randclear(state);
    flint_free(array);
}
/* Put in Q={qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
   with the most significant limb of the quotient as return value (0 or 1).
   Assumes the most significant bit of D is set.  Clobbers N.

   The approximate quotient Q satisfies - 2(n-1) < N/D - Q <= 4.
*/
static mp_limb_t
mpfr_divhigh_n_basecase (mpfr_limb_ptr qp, mpfr_limb_ptr np,
                         mpfr_limb_srcptr dp, mp_size_t n)
{
  mp_limb_t qh, d1, d0, dinv, q2, q1, q0;
  mpfr_pi1_t dinv2;

  np += n;

  if ((qh = (mpn_cmp (np, dp, n) >= 0)))
    mpn_sub_n (np, np, dp, n);

  /* now {np, n} is less than D={dp, n}, which implies np[n-1] <= dp[n-1] */

  d1 = dp[n - 1];

  if (n == 1)
    {
      invert_limb (dinv, d1);
      umul_ppmm (q1, q0, np[0], dinv);
      qp[0] = np[0] + q1;
      return qh;
    }

  /* now n >= 2 */
  d0 = dp[n - 2];
  invert_pi1 (dinv2, d1, d0);
  /* dinv2.inv32 = floor ((B^3 - 1) / (d0 + d1 B)) - B */
  while (n > 1)
    {
      /* Invariant: it remains to reduce n limbs from N (in addition to the
         initial low n limbs).

         Since n >= 2 here, necessarily we had n >= 2 initially, which means
         that in addition to the limb np[n-1] to reduce, we have at least 2
         extra limbs, thus accessing np[n-3] is valid. */

      /* warning: we can have np[n-1]=d1 and np[n-2]=d0, but since {np,n} < D,
         the largest possible partial quotient is B-1 */
      if (MPFR_UNLIKELY(np[n - 1] == d1 && np[n - 2] == d0))
        q2 = ~ (mp_limb_t) 0;
      else
        udiv_qr_3by2 (q2, q1, q0, np[n - 1], np[n - 2], np[n - 3],
                      d1, d0, dinv2.inv32);
      /* since q2 = floor((np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0)), we have
         q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0),
         thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0)
         and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1)
         thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0
         which proves that at most one correction is needed */
      q0 = mpn_submul_1 (np - 1, dp, n, q2);
      if (MPFR_UNLIKELY(q0 > np[n - 1]))
        {
          mpn_add_n (np - 1, np - 1, dp, n);
          q2 --;
        }
      qp[--n] = q2;
      dp ++;
    }

  /* we have B+dinv2 = floor((B^3-1)/(d1*B+d0)) < B^2/d1
     q1 = floor(np[0]*(B+dinv2)/B) <= floor(np[0]*B/d1)
        <= floor((np[0]*B+np[1])/d1)
     thus q1 is not larger than the true quotient.
     q1 > np[0]*(B+dinv2)/B - 1 > np[0]*(B^3-1)/(d1*B+d0)/B - 2
     For d1*B+d0 <> B^2/2, we have B+dinv2 = floor(B^3/(d1*B+d0))
     thus q1 > np[0]*B^2/(d1*B+d0) - 2, i.e.,
     (d1*B+d0)*q1 > np[0]*B^2 - 2*(d1*B+d0)
     d1*B*q1 > np[0]*B^2 - 2*d1*B - 2*d0 - d0*q1 >= np[0]*B^2 - 2*d1*B - B^2
     thus q1 > np[0]*B/d1 - 2 - B/d1 > np[0]*B/d1 - 4.

     For d1*B+d0 = B^2/2, dinv2 = B-1 thus q1 > np[0]*(2B-1)/B - 1 >
     np[0]*B/d1 - 2.

     In all cases, if q = floor((np[0]*B+np[1])/d1), we have:
     q - 4 <= q1 <= q */
  umul_ppmm (q1, q0, np[0], dinv2.inv32);
  qp[0] = np[0] + q1;

  return qh;
}
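/* A self-contained sketch (not MPFR or GMP code) of the 3-limb-by-2-limb
   step that invert_pi1 / udiv_qr_3by2 provide in the loop above: with a
   normalized two-limb divisor (d1,d0) and the precomputed
   v = floor((B^3 - 1)/(d1*B + d0)) - B, one quotient limb is produced per
   step with a couple of multiplications and at most two corrections.
   32-bit limbs are used so uint64_t holds a limb pair; the reciprocal is
   obtained here with a 128-bit division (a GCC/Clang extension) purely for
   brevity, whereas invert_pi1 derives it from invert_limb(d1) plus
   corrections.  All names are illustrative.  */
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t limb_t;
typedef uint64_t dlimb_t;               /* holds two limbs */

/* Divide (u2,u1,u0) by (d1,d0), given the reciprocal v.  Requires
   (u2,u1) < (d1,d0) and the top bit of d1 set.  Returns the one-limb
   quotient and stores the two-limb remainder in *rem.  */
static limb_t
div_3by2_preinv (dlimb_t *rem, limb_t u2, limb_t u1, limb_t u0,
                 limb_t d1, limb_t d0, limb_t v)
{
  dlimb_t d = ((dlimb_t) d1 << 32) | d0;

  /* Quotient estimate: (q1,q0) = v*u2 + (u2,u1), computed mod B^2.  */
  dlimb_t q = (dlimb_t) v * u2 + ((((dlimb_t) u2) << 32) | u1);
  limb_t q1 = (limb_t) (q >> 32);
  limb_t q0 = (limb_t) q;

  /* Candidate remainder (u2,u1,u0) - (q1+1)*(d1,d0), kept mod B^2.  */
  limb_t r1 = (limb_t) ((dlimb_t) u1 - (dlimb_t) q1 * d1);
  dlimb_t r = ((((dlimb_t) r1) << 32) | u0) - (dlimb_t) q1 * d0 - d;
  q1++;

  if ((limb_t) (r >> 32) >= q0)         /* estimate was one too large */
    {
      q1--;
      r += d;
    }
  if (r >= d)                           /* at most one further adjustment */
    {
      q1++;
      r -= d;
    }
  *rem = r;
  return q1;
}

int
main (void)
{
  limb_t d1 = 0xC90FDAA2u, d0 = 0x2168C234u;   /* any d1 with top bit set */
  dlimb_t d = ((dlimb_t) d1 << 32) | d0;

  /* v = floor((B^3 - 1)/(d1*B + d0)) - B, via 128-bit arithmetic.  */
  unsigned __int128 b3m1 = (((unsigned __int128) 1) << 96) - 1;
  limb_t v = (limb_t) (b3m1 / d - (((dlimb_t) 1) << 32));

  limb_t u2 = 0x90FDAA22u, u1 = 0x168C2345u, u0 = 0xDEADBEEFu; /* (u2,u1) < d */
  dlimb_t rem;
  limb_t q = div_3by2_preinv (&rem, u2, u1, u0, d1, d0, v);

  /* Cross-check against a direct 96-bit by 64-bit division.  */
  unsigned __int128 u = (((unsigned __int128) u2) << 64)
                        | (((unsigned __int128) u1) << 32) | u0;
  assert (q == (limb_t) (u / d) && rem == (dlimb_t) (u % d));
  printf ("q = %" PRIx32 ", rem = %" PRIx64 "\n", q, rem);
  return 0;
}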
mp_limb_t
mpn_sb_divrem_mn (mp_ptr qp,
                  mp_ptr np, mp_size_t nn,
                  mp_srcptr dp, mp_size_t dn)
{
  mp_limb_t most_significant_q_limb = 0;
  mp_size_t qn = nn - dn;
  mp_size_t i;
  mp_limb_t dx, d1, n0;
  mp_limb_t dxinv;
  int use_preinv;

  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (np, nn, dp, dn));
  ASSERT (! MPN_OVERLAP_P (qp, nn-dn, dp, dn));
  ASSERT (! MPN_OVERLAP_P (qp, nn-dn, np, nn) || qp+dn >= np);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, dn);

  np += qn;
  dx = dp[dn - 1];
  d1 = dp[dn - 2];
  n0 = np[dn - 1];

  if (n0 >= dx)
    {
      if (n0 > dx || mpn_cmp (np, dp, dn - 1) >= 0)
        {
          mpn_sub_n (np, np, dp, dn);
          most_significant_q_limb = 1;
        }
    }

  /* use_preinv is possibly a constant, but it's left to the compiler to
     optimize away the unused code in that case.  */
  use_preinv = ABOVE_THRESHOLD (qn, DIV_SB_PREINV_THRESHOLD);
  if (use_preinv)
    invert_limb (dxinv, dx);

  for (i = qn - 1; i >= 0; i--)
    {
      mp_limb_t q;
      mp_limb_t nx;
      mp_limb_t cy_limb;

      nx = np[dn - 1];          /* FIXME: could get value from r1 */
      np--;

      if (nx == dx)
        {
          /* This might over-estimate q, but it's probably not worth
             the extra code here to find out.  */
          q = GMP_NUMB_MASK;

#if 1
          cy_limb = mpn_submul_1 (np, dp, dn, q);
#else
          /* This should be faster on many machines */
          cy_limb = mpn_sub_n (np + 1, np + 1, dp, dn);
          cy = mpn_add_n (np, np, dp, dn);
          np[dn] += cy;
#endif

          if (nx != cy_limb)
            {
              mpn_add_n (np, np, dp, dn);
              q--;
            }

          qp[i] = q;
        }
      else
        {
          mp_limb_t rx, r1, r0, p1, p0;

          /* "workaround" avoids a problem with gcc 2.7.2.3 i386 register
             usage when np[dn-1] is used in an asm statement like umul_ppmm
             in udiv_qrnnd_preinv.  The symptom is seg faults due to
             registers being clobbered.  gcc 2.95 i386 doesn't have the
             problem. */
          {
            mp_limb_t workaround = np[dn - 1];
            if (use_preinv)
              udiv_qrnnd_preinv (q, r1, nx, workaround, dx, dxinv);
            else
              {
                udiv_qrnnd (q, r1, nx, workaround << GMP_NAIL_BITS,
                            dx << GMP_NAIL_BITS);
                r1 >>= GMP_NAIL_BITS;
              }
          }
          umul_ppmm (p1, p0, d1, q << GMP_NAIL_BITS);
          p0 >>= GMP_NAIL_BITS;

          r0 = np[dn - 2];
          rx = 0;
          if (r1 < p1 || (r1 == p1 && r0 < p0))
            {
              p1 -= p0 < d1;
              p0 = (p0 - d1) & GMP_NUMB_MASK;
              q--;
              r1 = (r1 + dx) & GMP_NUMB_MASK;
              rx = r1 < dx;
            }

          p1 += r0 < p0;        /* cannot carry! */
          rx -= r1 < p1;        /* may become 11..1 if q is still too large */
          r1 = (r1 - p1) & GMP_NUMB_MASK;
          r0 = (r0 - p0) & GMP_NUMB_MASK;

          cy_limb = mpn_submul_1 (np, dp, dn - 2, q);

          /* Check if we've over-estimated q, and adjust as needed.  */
          {
            mp_limb_t cy1, cy2;
            cy1 = r0 < cy_limb;
            r0 = (r0 - cy_limb) & GMP_NUMB_MASK;
            cy2 = r1 < cy1;
            r1 -= cy1;
            np[dn - 1] = r1;
            np[dn - 2] = r0;
            if (cy2 != rx)
              {
                mpn_add_n (np, np, dp, dn);
                q--;
              }
          }
          qp[i] = q;
        }
    }

  /*
     ______ ______ ______
    |__rx__|__r1__|__r0__|   partial remainder
            ______ ______
         - |__p1__|__p0__|   partial product to subtract
            ______ ______
         - |______|cylimb|

     rx is -1, 0 or 1.  If rx=1, then q is correct (it should match
     carry out).  If rx=-1 then q is too large.  If rx=0, then q might
     be too large, but it is most likely correct.
  */

  return most_significant_q_limb;
}
mp_limb_t
mpn_divrem_euclidean_qr_2(mp_ptr qp, mp_ptr xp, mp_size_t xn, mp_srcptr dp)
{
    mp_size_t qn;
    mp_limb_t qf, t[2], t1[2], q, h, l, d1, d2, i;
    int c1, c3, c4;

    ASSERT(xn >= 2);
    ASSERT_MPN(dp, 2);
    ASSERT_MPN(xp, xn);
    ASSERT(dp[1] != 0);

    qn = xn - 1;
    /* ASSERT(!MPN_OVERLAP_P(qp, qn, xp, xn)); */
    /* FIXME: correct this overlap requirement */
    ASSERT((dp[1] >> (GMP_NUMB_BITS - 1)) != 0);

    h = 0;
    d1 = dp[1];
    d2 = dp[0];
    invert_limb(i, d1);

    l = xp[xn - 1];
    qn = xn - 2;
    t[0] = xp[qn];

    if (l < d1)
    {
        h = t[1] = l;
        l = t[0] = xp[qn];
        qf = 0;
    }
    else
    {
        qf = 1;
        t[1] = l - d1;
        t1[1] = 0;
        t1[0] = d2;
        if (mpn_sub_n(t, t, t1, 2))
        {
            qf--;
            mpn_add_n(t, t, dp, 2);
        }
        h = t[1];
        l = t[0];
    }

    for (qn = xn - 3; qn >= 0; qn--)
    {
        t[0] = xp[qn];

        if (h < d1)
        {
            udiv_qrnnd_preinv(q, t[1], h, l, d1, i);
            umul_ppmm(t1[1], t1[0], q, d2);

            if (mpn_sub_n(t, t, t1, 2))
            {
                q--;
                if (mpn_add_n(t, t, dp, 2) == 0)
                {
                    q--;
                    ASSERT_CARRY(mpn_add_n(t, t, dp, 2));
                }
            }
        }
        else
        {
            ASSERT(h == d1);
            q = -1;
            t[1] = l;
            c3 = mpn_add_n(t, t, dp, 2);
            c1 = mpn_sub_1(t + 1, t + 1, 1, d2);
            c4 = c3 - c1;

            if (l >= d1)
            {
                ASSERT(c3 != 0);
                ASSERT(c4 == 0);
            } /* our guess is B + 1, so q = B - 1 is correct */
            else
            {
                ASSERT(c4 <= 0);
                /* our guess is B so q = B - 1 or B - 2 */
                if (c4 != 0)
                {
                    q--;
                    mpn_add_n(t, t, dp, 2);
                }
            }
        }

        h = t[1];
        l = t[0];
        qp[qn] = q;
    }

    xp[1] = t[1];
    xp[0] = t[0];

    return qf;
}
/* Input: A = {ap, n} with most significant bit set.
   Output: X = B^n + {xp, n} where B = 2^GMP_NUMB_BITS.

   X is a lower approximation of B^(2n)/A with implicit msb.
   More precisely, one has:

              A*X < B^(2n) <= A*(X+1)

   or X = ceil(B^(2n)/A) - 1.
*/
void
mpn_invert (mp_ptr xp, mp_srcptr ap, mp_size_t n)
{
    if (n == 1)
    {
        /* invert_limb returns min(B-1, floor(B^2/ap[0])-B), which is B-1
           when ap[0]=B/2, and 1 when ap[0]=B-1.  For X=B+xp[0], we have
           A*X < B^2 <= A*(X+1) where the equality holds only when A=B/2.

           We thus have A*X < B^2 <= A*(X+1). */
        invert_limb (xp[0], ap[0]);
    }
    else if (n == 2)
    {
        mp_limb_t tp[4], up[2], sp[2], cy;

        tp[0] = ZERO;
        invert_limb (xp[1], ap[1]);
        tp[3] = mpn_mul_1 (tp + 1, ap, 2, xp[1]);
        cy = mpn_add_n (tp + 2, tp + 2, ap, 2);
        while (cy) /* Xh is too large */
        {
            xp[1] --;
            cy -= mpn_sub (tp + 1, tp + 1, 3, ap, 2);
        }
        /* tp[3] should be 111...111 */
        mpn_com_n (sp, tp + 1, 2);
        cy = mpn_add_1 (sp, sp, 2, ONE); /* cy should be 0 */
        up[1] = mpn_mul_1 (up, sp + 1, 1, xp[1]);
        cy = mpn_add_1 (up + 1, up + 1, 1, sp[1]); /* cy should be 0 */
        xp[0] = up[1];
        /* update tp */
        cy = mpn_addmul_1 (tp, ap, 2, xp[0]);
        cy = mpn_add_1 (tp + 2, tp + 2, 2, cy);
        do
        {
            cy = mpn_add (tp, tp, 4, ap, 2);
            if (cy == ZERO)
                mpn_add_1 (xp, xp, 2, ONE);
        } while (cy == ZERO);
        /* now A*X < B^4 <= A*(X+1) */
    }
    else
    {
        mp_size_t l, h;
        mp_ptr tp, up;
        mp_limb_t cy, th;
        int special = 0;
        TMP_DECL;

        l = (n - 1) / 2;
        h = n - l;

        mpn_invert (xp + l, ap + l, h);

        TMP_MARK;

        tp = TMP_ALLOC_LIMBS (n + h);
        up = TMP_ALLOC_LIMBS (2 * h);

        if (n <= WRAP_AROUND_BOUND)
        {
            mpn_mul (tp, ap, n, xp + l, h);
            cy = mpn_add_n (tp + h, tp + h, ap, n);
        }
        else
        {
            mp_size_t m = n + 1;
            mpir_ui k;
            int cc;

            if (m >= FFT_MULMOD_2EXPP1_CUTOFF)
                m = mpir_fft_adjust_limbs (m);

            /* we have m >= n + 1 by construction, thus m > h */
            ASSERT(m < n + h);
            cy = mpn_mulmod_Bexpp1_fft (tp, m, ap, n, xp + l, h);
            /* cy, {tp, m} = A * {xp + l, h} mod (B^m+1) */
            cy += mpn_add_n (tp + h, tp + h, ap, m - h);
            cc = mpn_sub_n (tp, tp, ap + m - h, n + h - m);
            cc = mpn_sub_1 (tp + n + h - m, tp + n + h - m, 2 * m - n - h, cc);
            if (cc > cy) /* can only occur if cc=1 and cy=0 */
                cy = mpn_add_1 (tp, tp, m, ONE);
            else
                cy -= cc;
            /* cy, {tp, m} = A * Xh */

            /* add B^(n+h) + B^(n+h-m) */
            MPN_ZERO (tp + m, n + h - m);
            tp[m] = cy;
            /* note: since tp[n+h-1] is either 0, or cy<=1 if m=n+h-1, the
               mpn_incr_u() below cannot produce a carry */
            mpn_incr_u (tp + n + h - m, ONE);
            cy = 1;

            do /* check if T >= B^(n+h) + 2*B^n */
            {
                mp_size_t i;

                if (cy == ZERO)
                    break; /* surely T < B^(n+h) */
                if (cy == ONE)
                {
                    for (i = n + h - 1; tp[i] == ZERO && i > n; i--)
                        ;
                    if (i == n && tp[i] < (mp_limb_t) 2)
                        break;
                }
                /* subtract B^m+1 */
                cy -= mpn_sub_1 (tp, tp, n + h, ONE);
                cy -= mpn_sub_1 (tp + m, tp + m, n + h - m, ONE);
            } while (1);
        }

        while (cy)
        {
            mpn_sub_1 (xp + l, xp + l, h, ONE);
            cy -= mpn_sub (tp, tp, n + h, ap, n);
        }

        mpn_not (tp, n);
        th = ~tp[n] + mpn_add_1 (tp, tp, n, ONE);

        mpn_mul_n (up, tp + l, xp + l, h);
        cy = mpn_add_n (up + h, up + h, tp + l, h);

        if (th != ZERO)
        {
            cy += ONE + mpn_add_n (up + h, up + h, xp + l, h);
        }

        if (up[2*h-l-1] + 4 <= CNST_LIMB(3))
            special = 1;

        MPN_COPY (xp, up + 2 * h - l, l);
        mpn_add_1 (xp + l, xp + l, h, cy);

        TMP_FREE;

        if ((special) && !mpn_is_invert(xp, ap, n))
            mpn_add_1 (xp, xp, n, 1);
    }
}
int
main (int argc, char *argv[])
{
  mp_limb_t bb, h, l, bb_inv;
  int i, j;

  for (i = 2; i < numberof (mp_bases); i++)
    {
      if (POW2_P (i))
        {
          count_trailing_zeros (j, i);
          if (mp_bases[i].big_base != (mp_limb_t) j)
            {
              printf ("mp_bases[%d].big_base (trailing zeros) wrong\n", i);
              abort ();
            }
        }
      else
        {
          bb = 1;
          for (j = 0; j < mp_bases[i].chars_per_limb; j++)
            {
              umul_ppmm (h, bb, bb, i);
              if (h != 0 || (bb & GMP_NAIL_MASK) != 0)
                {
                  printf ("mp_bases[%d].chars_per_limb overflow\n", i);
                  abort ();
                }
            }
          umul_ppmm (h, l, bb, i);
          if (h == 0 && (l & GMP_NAIL_MASK) == 0)
            {
              printf ("mp_bases[%d].chars_per_limb too small\n", i);
              abort ();
            }

          if (mp_bases[i].big_base != bb)
            {
              printf ("mp_bases[%d].big_base wrong\n", i);
              abort ();
            }

          invert_limb (bb_inv, bb << refmpn_count_leading_zeros (bb));
          if (mp_bases[i].big_base_inverted != bb_inv)
            {
              printf ("mp_bases[%d].big_base_inverted wrong\n", i);
              abort ();
            }
        }
    }

  if (MP_BASES_CHARS_PER_LIMB_10 != mp_bases[10].chars_per_limb)
    {
      printf ("MP_BASES_CHARS_PER_LIMB_10 not the same as mp_bases[10].chars_per_limb\n");
      abort ();
    }

  if (MP_BASES_BIG_BASE_10 != mp_bases[10].big_base)
    {
      printf ("MP_BASES_BIG_BASE_10 not the same as mp_bases[10].big_base\n");
      abort ();
    }

  if (MP_BASES_BIG_BASE_INVERTED_10 != mp_bases[10].big_base_inverted)
    {
      printf ("MP_BASES_BIG_BASE_INVERTED_10 not the same as mp_bases[10].big_base_inverted\n");
      abort ();
    }

  if (MP_BASES_NORMALIZATION_STEPS_10
      != refmpn_count_leading_zeros (MP_BASES_BIG_BASE_10))
    {
      printf ("MP_BASES_NORMALIZATION_STEPS_10 wrong\n");
      abort ();
    }

  exit (0);
}
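/* A hedged sketch of how the table entries checked above can be recomputed
   for one base on a 64-bit limb without nails: big_base is the largest power
   of the base fitting in a limb, chars_per_limb its exponent, and
   big_base_inverted the invert_limb-style reciprocal of big_base shifted
   into normalized position.  The 128-bit division and __builtin_clzll are
   GCC/Clang conveniences standing in for invert_limb and
   count_leading_zeros; names are illustrative, not GMP's.  */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  const uint64_t base = 10;
  uint64_t big_base = 1;
  int chars_per_limb = 0;

  /* Multiply by the base while the product still fits in 64 bits.  */
  while (big_base <= UINT64_MAX / base)
    {
      big_base *= base;
      chars_per_limb++;
    }

  /* Normalize, then compute floor((B^2 - 1)/nb) - B with B = 2^64, which is
     the value invert_limb produces for the normalized limb.  */
  int norm = __builtin_clzll (big_base);
  uint64_t nb = big_base << norm;
  unsigned __int128 num = (((unsigned __int128) ~nb) << 64) | UINT64_MAX;
  uint64_t big_base_inverted = (uint64_t) (num / nb);

  /* For base 10 on a 64-bit limb this prints 19 and 10^19.  */
  printf ("base %" PRIu64 ": chars_per_limb = %d, big_base = %" PRIu64
          ", big_base_inverted = %#" PRIx64 "\n",
          base, chars_per_limb, big_base, big_base_inverted);
  return 0;
}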