mp_limb_t divexact_submul(mp_ptr qp, mp_ptr xp, mp_size_t n)
{
   int j;
   mp_limb_t c, m, t1, t2, t3, acc, ax, dx, t;

   ASSERT(n > 0);
   ASSERT_MPN(xp, n);
   ASSERT(MPN_SAME_OR_SEPARATE_P(qp, xp, n));

   m = 0; m = ~m; m = m/3;   /* m = (B - 1)/3 */
   c = 0; t1 = t2 = t3 = acc = 0;

   umul_ppmm(dx, ax, xp[0], m);
   SUB(c, acc, 0, t1);
   ADC(c, t2, 0, ax, c);
   ADC(c, t3, 0, dx, c);
   ASSERT(c == 0);
   t1 = t2; t2 = t3;

   for (j = 1; j <= n - 1; j++)
   {
      t3 = 0;
      umul_ppmm(dx, ax, xp[j], m);
      SUB(c, acc, acc, t1);
      qp[j - 1] = acc;
      ADC(c, t2, t2, ax, c);
      ADC(c, t3, t3, dx, c);
      ASSERT(c == 0);
      t1 = t2; t2 = t3;
   }

   SUB(c, acc, acc, t1);
   qp[n - 1] = acc;
   ADC(c, t2, t2, 0, c);
   t = (t2 - acc)*3;

   /* return next quotient*(-3), so (xp, n) = (qp, n)*3 - ret*B^n and 0 <= ret < 3 */
   return t;
}
/* (rp, 2n) = (xp, n)*(yp, n) / B^n */ inline static void mpn_mulshort_n_basecase(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) { mp_size_t i, k; #if GMP_NAIL_BITS==0 mp_limb_t t1, t2, t3; #endif ASSERT(n >= 3); /* this restriction doesn't make a lot of sense in general */ ASSERT_MPN(xp, n); ASSERT_MPN(yp, n); ASSERT(!MPN_OVERLAP_P (rp, 2 * n, xp, n)); ASSERT(!MPN_OVERLAP_P (rp, 2 * n, yp, n)); k = n - 2; /* so want short product sum_(i + j >= k) x[i]y[j]B^(i + j) */ #if GMP_NAIL_BITS!=0 rp[n] = mpn_mul_1(rp + k, xp + k, 2, yp[0]); #else umul_ppmm(t1, rp[k], xp[k], yp[0]); umul_ppmm(t3, t2, xp[k + 1], yp[0]); add_ssaaaa(rp[n], rp[k + 1], t3, t2, 0, t1); #endif for (i = 1; i <= n - 2; i++) rp[n + i] = mpn_addmul_1 (rp + k, xp + k - i, 2 + i, yp[i]); rp[n + n - 1] = mpn_addmul_1 (rp + n - 1, xp, n, yp[n - 1]); return; }
static void mul_basecase (mp_ptr wp, mp_srcptr up, mp_size_t un, mp_srcptr vp, mp_size_t vn) { mp_size_t i, j; mp_limb_t prod_low, prod_high; mp_limb_t cy_dig; mp_limb_t v_limb; /* Multiply by the first limb in V separately, as the result can be stored (not added) to PROD. We also avoid a loop for zeroing. */ v_limb = vp[0]; cy_dig = 0; for (j = un; j > 0; j--) { mp_limb_t u_limb, w_limb; u_limb = *up++; umul_ppmm (prod_high, prod_low, u_limb, v_limb << GMP_NAIL_BITS); add_ssaaaa (cy_dig, w_limb, prod_high, prod_low, 0, cy_dig << GMP_NAIL_BITS); *wp++ = w_limb >> GMP_NAIL_BITS; } *wp++ = cy_dig; wp -= un; up -= un; /* For each iteration in the outer loop, multiply one limb from U with one limb from V, and add it to PROD. */ for (i = 1; i < vn; i++) { v_limb = vp[i]; cy_dig = 0; for (j = un; j > 0; j--) { mp_limb_t u_limb, w_limb; u_limb = *up++; umul_ppmm (prod_high, prod_low, u_limb, v_limb << GMP_NAIL_BITS); w_limb = *wp; add_ssaaaa (prod_high, prod_low, prod_high, prod_low, 0, w_limb << GMP_NAIL_BITS); prod_low >>= GMP_NAIL_BITS; prod_low += cy_dig; #if GMP_NAIL_BITS == 0 cy_dig = prod_high + (prod_low < cy_dig); #else cy_dig = prod_high; cy_dig += prod_low >> GMP_NUMB_BITS; #endif *wp++ = prod_low & GMP_NUMB_MASK; } *wp++ = cy_dig; wp -= un; up -= un; } }
int _nmod_vec_dot_bound_limbs(slong len, nmod_t mod) { mp_limb_t t2, t1, t0, u1, u0; umul_ppmm(t1, t0, mod.n - 1, mod.n - 1); umul_ppmm(t2, t1, t1, len); umul_ppmm(u1, u0, t0, len); add_sssaaaaaa(t2, t1, t0, t2, t1, UWORD(0), UWORD(0), u1, u0); if (t2 != 0) return 3; if (t1 != 0) return 2; return (t0 != 0); }
void _nmod_mat_mul_transpose_3(nmod_mat_t C, const nmod_mat_t A, const nmod_mat_t B) { long i, j, k; register mp_limb_t s0, s1, s2; register mp_limb_t t0, t1; register mp_limb_t c1, c2; for (i = 0; i < A->r; i++) { for (j = 0; j < B->r; j++) { s0 = s1 = s2 = 0UL; for (k = 0; k < A->c; k++) { umul_ppmm(t1, t0, A->rows[i][k], B->rows[j][k]); add_ssaaaa(c1, s0, (mp_limb_t) 0, s0, (mp_limb_t) 0, t0); add_ssaaaa(c2, s1, (mp_limb_t) 0, s1, (mp_limb_t) 0, t1); add_ssaaaa(s2, s1, s2, s1, c2, c1); } NMOD_RED(s2, s2, C->mod); NMOD_RED3(s0, s2, s1, s0, C->mod); C->rows[i][j] = s0; } } }
/* (xp, n) = (qp, n)*f - ret*B^n and 0 <= ret < f Note the divexact_by3 code is just a special case of this */ mp_limb_t mpn_divexact_byfobm1(mp_ptr qp, mp_srcptr xp, mp_size_t n, mp_limb_t f, mp_limb_t Bm1of) { mp_size_t j; mp_limb_t c, acc, ax, dx; ASSERT(n > 0); ASSERT_MPN(xp, n); ASSERT(MPN_SAME_OR_SEPARATE_P(qp, xp, n)); ASSERT(Bm1of*f + 1 == 0); acc = 0*Bm1of; /* carry in is 0 */ for (j = 0; j <= n - 1; j++) { umul_ppmm(dx, ax, xp[j], Bm1of); SUBC_LIMB(c, acc, acc, ax); qp[j] = acc; acc -= dx + c; } /* return next quotient*(-f) */ return acc*(-f); }
/* (xp, n) = (qp, n)*3 - ret*B^n and 0 <= ret < 3 */ mp_limb_t mpn_divexact_by3c(mp_ptr qp, mp_srcptr xp, mp_size_t n, mp_limb_t ci) { mp_size_t j; mp_limb_t c, m, acc, ax, dx; ASSERT(n > 0); ASSERT_MPN(xp, n); ASSERT(MPN_SAME_OR_SEPARATE_P(qp, xp, n)); m = 0; m = ~m; m = m/3; /* m = (B - 1)/3 */ acc = ci*m; for (j = 0; j <= n - 1; j++) { umul_ppmm(dx, ax, xp[j], m); SUBC_LIMB(c, acc, acc, ax); qp[j] = acc; acc -= dx + c; } /* return next quotient*(-3) */ return acc*(-3); }
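/* Hedged aside: the divexact-by-3 routines above implement the same contract as
   GMP's documented mpn_divexact_by3c. A minimal check of the exactly-divisible
   case of the identity (xp, n) = (qp, n)*3 - ret*B^n, written against plain GMP
   (an assumption: GMP is installed; link with -lgmp); this harness is an
   illustration only, not part of any of the libraries quoted here. */
#include <stdio.h>
#include <stdlib.h>
#include <gmp.h>

int main(void)
{
    enum { N = 8 };
    mp_limb_t y[N], x[N], q[N], cy, ret;
    gmp_randstate_t st;
    mp_size_t i;
    int t;

    gmp_randinit_default(st);
    for (t = 0; t < 1000; t++)
    {
        mpz_t tmp;
        mpz_init(tmp);
        /* random y with two spare bits so that 3*y still fits in N limbs */
        mpz_urandomb(tmp, st, N * GMP_NUMB_BITS - 2);
        for (i = 0; i < N; i++)
            y[i] = mpz_getlimbn(tmp, i);
        mpz_clear(tmp);

        cy = mpn_mul_1(x, y, N, 3);            /* x = 3*y, no carry out */
        if (cy != 0)
            abort();

        ret = mpn_divexact_by3c(q, x, N, 0);   /* exact division, carry in 0 */
        if (ret != 0 || mpn_cmp(q, y, N) != 0)
            abort();
    }
    gmp_randclear(st);
    printf("divexact_by3c check PASS\n");
    return 0;
}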
void _nmod_mat_mul_transpose_2(nmod_mat_t C, const nmod_mat_t A, const nmod_mat_t B) { long i, j, k; register mp_limb_t s0, s1; register mp_limb_t t0, t1; for (i = 0; i < A->r; i++) { for (j = 0; j < B->r; j++) { s0 = s1 = 0UL; for (k = 0; k < A->c; k++) { umul_ppmm(t1, t0, A->rows[i][k], B->rows[j][k]); add_ssaaaa(s1, s0, s1, s0, t1, t0); } NMOD2_RED2(s0, s1, s0, C->mod); C->rows[i][j] = s0; } } }
mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h) { mp_limb_t s, x, y, inverse, dummy, dmul, c1, c2; mp_limb_t c = 0; mp_size_t i; ASSERT (size >= 1); ASSERT (d & 1); binvert_limb (inverse, d); dmul = d << GMP_NAIL_BITS; for (i = 0; i < size; i++) { ASSERT (c==0 || c==1); s = src[i]; SUBC_LIMB (c1, x, s, c); SUBC_LIMB (c2, y, x, h); c = c1 + c2; y = (y * inverse) & GMP_NUMB_MASK; umul_ppmm (h, dummy, y, dmul); } h += c; return h; }
mp_limb_t div_preinv1(mp_limb_t d1, mp_limb_t d2) { mp_limb_t q, r[2], p[2], cy; if (d2 + 1 == 0 && d1 + 1 == 0) return 0; if (d1 + 1 == 0) q = ~d1, r[1] = ~d2; else udiv_qrnnd(q, r[1], ~d1, ~d2, d1 + 1); r[0] = 0; if (d2 + 1 == 0) add_ssaaaa(cy, r[1], 0, r[1], 0, q); else { umul_ppmm(p[1], p[0], q, ~d2 - 1); cy = mpn_add_n(r, r, p, 2); } p[0] = d2 + 1, p[1] = d1 + (d2 + 1 == 0); if (cy || mpn_cmp(r, p, 2) >= 0) q++; return q; }
mpi_limb_t mpihelp_addmul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_limb_t s2_limb) { mpi_limb_t cy_limb; mpi_size_t j; mpi_limb_t prod_high, prod_low; mpi_limb_t x; /* The loop counter and index J goes from -SIZE to -1. This way * the loop becomes faster. */ j = -s1_size; res_ptr -= j; s1_ptr -= j; cy_limb = 0; do { umul_ppmm( prod_high, prod_low, s1_ptr[j], s2_limb ); prod_low += cy_limb; cy_limb = (prod_low < cy_limb?1:0) + prod_high; x = res_ptr[j]; prod_low = x + prod_low; cy_limb += prod_low < x?1:0; res_ptr[j] = prod_low; } while ( ++j ); return cy_limb; }
void nmod_mat_mul_check(nmod_mat_t C, const nmod_mat_t A, const nmod_mat_t B) { long i, j, k; mp_limb_t s0, s1, s2; mp_limb_t t0, t1; for (i = 0; i < A->r; i++) { for (j = 0; j < B->c; j++) { s0 = s1 = s2 = 0UL; for (k = 0; k < A->c; k++) { umul_ppmm(t1, t0, A->rows[i][k], B->rows[k][j]); add_sssaaaaaa(s2, s1, s0, s2, s1, s0, 0, t1, t0); } NMOD_RED(s2, s2, C->mod); NMOD_RED3(s0, s2, s1, s0, C->mod); C->rows[i][j] = s0; } } }
void mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n) { mp_size_t i; mp_limb_t tarr[2 * SQR_KARATSUBA_THRESHOLD]; mp_ptr tp = tarr; mp_limb_t cy; /* must fit 2*n limbs in tarr */ ASSERT (n <= SQR_KARATSUBA_THRESHOLD); if ((n & 1) != 0) { if (n == 1) { mp_limb_t ul, lpl; ul = up[0]; umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS); rp[0] = lpl >> GMP_NAIL_BITS; return; } MPN_ZERO (tp, n); for (i = 0; i <= n - 2; i += 2) { cy = mpn_addmul_2s (tp + 2 * i, up + i + 1, n - (i + 1), up + i); tp[n + i] = cy; } }
int main(void) { int i, result; flint_rand_t state; printf("xgcd...."); fflush(stdout); flint_randinit(state); for (i = 0; i < 100000; i++) { mp_limb_t a, b, c, g, bits1, bits2, bits3, ph, pl, qh, ql; mp_limb_t s, t; bits1 = n_randint(state, FLINT_BITS-1) + 1; bits2 = n_randint(state, bits1) + 1; bits3 = n_randint(state, FLINT_BITS - bits1) + 1; do { a = n_randbits(state, bits1); b = n_randbits(state, bits2); } while ((n_gcd(a, b) != 1UL) || (b > a)); c = n_randbits(state, bits3); g = n_xgcd(&s, &t, a*c, b*c); umul_ppmm(ph, pl, a*c, s); umul_ppmm(qh, ql, b*c, t); sub_ddmmss(ph, pl, ph, pl, qh, ql); result = ((g == c) && (ph == 0UL) && (pl == c)); if (!result) { printf("FAIL:\n"); printf("a = %lu, b = %lu, c = %lu, g = %lu, s = %lu, t = %lu\n", a, b, c, g, s, t); abort(); } } flint_randclear(state); printf("PASS\n"); return 0; }
/* Return {xp, xn} mod p. Assume 2p < B where B = 2^GMP_NUMB_BITS. We first compute {xp, xn} / B^n mod p using Montgomery reduction, where the number N to factor has n limbs. Then we multiply by B^(n+1) mod p (precomputed) and divide by B mod p. Assume invm = -1/p mod B and Bpow = B^n mod p */ static mp_limb_t ecm_mod_1 (mp_ptr xp, mp_size_t xn, mp_limb_t p, mp_size_t n, mp_limb_t invm, mp_limb_t Bpow) { mp_limb_t q, cy, hi, lo, x0, x1; if (xn == 0) return 0; /* the code below assumes xn <= n+1, thus we call mpn_mod_1 otherwise, but this should never (or rarely) happen */ if (xn > n + 1) return mpn_mod_1 (xp, xn, p); x0 = xp[0]; cy = (mp_limb_t) 0; while (n-- > 0) { /* Invariant: cy is the input carry on xp[1], x0 is xp[0] */ x1 = (xn > 1) ? xp[1] : 0; q = x0 * invm; /* q = -x0/p mod B */ umul_ppmm (hi, lo, q, p); /* hi*B + lo = -x0 mod B */ /* Add hi*B + lo to x1*B + x0. Since p <= B-2 we have hi*B + lo <= (B-1)(B-2) = B^2-3B+2, thus hi <= B-3 */ hi += cy + (lo != 0); /* cannot overflow */ x0 = x1 + hi; cy = x0 < hi; xn --; xp ++; } if (cy != 0) x0 -= p; /* now x0 = {xp, xn} / B^n mod p */ umul_ppmm (x1, x0, x0, Bpow); /* since Bpow < p, x1 <= p-1 */ q = x0 * invm; umul_ppmm (hi, lo, q, p); /* hi <= p-1 thus hi+x1+1 < 2p-1 < B */ hi = hi + x1 + (lo != 0); while (hi >= p) hi -= p; return hi; }
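/* Hedged aside: the loop above is a word-by-word Montgomery (REDC) reduction.
   A self-contained sketch of a single REDC step, using GCC/Clang's
   unsigned __int128 in place of umul_ppmm; the helper names neg_inv_mod_b and
   redc1 are illustrative and not taken from any of the libraries quoted here. */
#include <stdint.h>
#include <stdio.h>

/* invm = -1/p mod 2^64 for odd p, via Newton iteration on the 2-adic inverse */
static uint64_t neg_inv_mod_b(uint64_t p)
{
    uint64_t inv = p;              /* p is its own inverse mod 8 (3 correct bits) */
    int i;
    for (i = 0; i < 5; i++)
        inv *= 2 - p * inv;        /* each step doubles the number of correct bits */
    return (uint64_t) 0 - inv;     /* negate to get -1/p mod 2^64 */
}

/* one REDC step: for x < p*2^64, returns (x / 2^64) mod p */
static uint64_t redc1(unsigned __int128 x, uint64_t p, uint64_t invm)
{
    uint64_t q = (uint64_t) x * invm;                       /* q = -x/p mod 2^64 */
    unsigned __int128 t = x + (unsigned __int128) q * p;    /* low word is now 0 */
    uint64_t r = (uint64_t) (t >> 64);
    return r >= p ? r - p : r;
}

int main(void)
{
    uint64_t p = 1000000007ULL;                        /* odd, 2p < 2^64 */
    uint64_t invm = neg_inv_mod_b(p);
    uint64_t a = 123456789ULL;                         /* a < p */
    unsigned __int128 x = (unsigned __int128) a << 64; /* x = a * 2^64 < p * 2^64 */
    /* dividing x by 2^64 mod p must give back a */
    printf("%d\n", redc1(x, p, invm) == a);            /* prints 1 */
    return 0;
}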
/* * Multiply x and y, reducing the result modulo n. */ uint64_t mul_mod_n(uint64_t x, uint64_t y, uint64_t n) { #if 0 uint64_t q, r, p1, p2; umul_ppmm(p1, p2, x, y); udiv_qrnnd(q, r, p1, p2, n); return r; #endif return (x * y) % n; }
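/* Hedged aside: the live (x * y) % n branch above truncates once x*y exceeds
   64 bits, and the #if 0'd udiv_qrnnd path is only safe when the high product
   limb is below n (which does hold for reduced inputs x, y < n). A sketch of an
   overflow-safe variant using the compiler's 128-bit type (an assumption:
   GCC/Clang unsigned __int128; the name mul_mod_n_128 is made up here). */
#include <stdint.h>

uint64_t mul_mod_n_128(uint64_t x, uint64_t y, uint64_t n)
{
    /* the full 128-bit product never overflows, so the remainder is exact */
    return (uint64_t) (((unsigned __int128) x * y) % n);
}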
/* in each round we remove one limb from the body, i.e. k = 1 */ void mpn_mod_1_3(mp_ptr rem, mp_srcptr xp, mp_size_t xn, mp_srcptr db) { mp_limb_t h, l, sh, sl, th, tl; mp_size_t j, jj; ASSERT(xn >= 5); ASSERT_MPN(xp, xn); ASSERT_LIMB(db[0]); ASSERT_LIMB(db[1]); ASSERT_LIMB(db[2]); ASSERT_LIMB(db[3]); tl = xp[xn - 2]; th = xp[xn - 1]; for (j = xn - 5; j >= 0; j -= 3) { umul_ppmm(sh, sl, xp[j + 1], db[0]); add_ssaaaa(sh, sl, sh, sl, 0, xp[j]); umul_ppmm(h, l, xp[j + 2], db[1]); add_ssaaaa(sh, sl, sh, sl, h, l); umul_ppmm(h, l, tl, db[2]); add_ssaaaa(sh, sl, sh, sl, h, l); umul_ppmm(th, tl, th, db[3]); add_ssaaaa(th, tl, th, tl, sh, sl); } if (j > -3) /* we have at least three limbs to do, i.e. xp[0], ..., tl, th */ { sh = 0; sl = xp[0]; jj = 1; if (j == -1) { umul_ppmm(sh, sl, xp[1], db[0]); add_ssaaaa(sh, sl, sh, sl, 0, xp[0]); jj = 2; } umul_ppmm(h, l, tl, db[jj - 1]); add_ssaaaa(sh, sl, sh, sl, h, l); umul_ppmm(th, tl, th, db[jj]); add_ssaaaa(th, tl, th, tl, sh, sl); } umul_ppmm(h, l, th, db[0]); add_ssaaaa(h, l, h, l, 0, tl); rem[0] = l; rem[1] = h; }
/* Put in rp[n..2n-1] an approximation of the n high limbs of {up, n} * {vp, n}. The error is less than n ulps of rp[n] (and the approximation is always less or equal to the truncated full product). Assume 2n limbs are allocated at rp. Implements Algorithm ShortMulNaive from [1]. */ static void mpfr_mulhigh_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up, mpfr_limb_srcptr vp, mp_size_t n) { mp_size_t i; rp += n - 1; umul_ppmm (rp[1], rp[0], up[n-1], vp[0]); /* we neglect up[0..n-2]*vp[0], which is less than B^n */ for (i = 1 ; i < n ; i++) /* here, we neglect up[0..n-i-2] * vp[i], which is less than B^n too */ rp[i + 1] = mpn_addmul_1 (rp, up + (n - i - 1), i + 1, vp[i]); /* in total, we neglect less than n*B^n, i.e., n ulps of rp[n]. */ }
/* basic divexact: (xp, n) = (qp, n)*d - ret*B^n and 0 <= ret < d */
mp_limb_t divexact_basic(mp_ptr qp, mp_ptr xp, mp_size_t n, mp_limb_t d)
{
   int j;
   mp_limb_t c, h, q, dummy, h1, t, m;

   ASSERT(n > 0);
   ASSERT(d != 0);
   ASSERT_MPN(xp, n);
   ASSERT(MPN_SAME_OR_SEPARATE_P(qp, xp, n));
   ASSERT(d%2 == 1);

   modlimb_invert(m, d);
   c = 0; h = 0; t = 0;

   for (j = 0; j <= n - 1; j++)
   {
      h1 = xp[j];
      /* set borrow to c; sbb t, h1; set c to borrow */
      t = h + c;
      if (t > h1) { h1 = h1 - t; c = 1; }
      else        { h1 = h1 - t; c = 0; }
      q = h1*m;
      qp[j] = q;
      umul_ppmm(h, dummy, q, d);
      ASSERT(dummy == h1);
   }

   /* i.e. returns next quotient*(-d) */
   return h + c;
}
mp_limb_t divexact3_direct(mp_ptr qp, mp_ptr xp, mp_size_t n)
{
   int j;
   mp_limb_t c, m, acc, ax, dx;

   ASSERT(n > 0);
   ASSERT_MPN(xp, n);
   ASSERT(MPN_SAME_OR_SEPARATE_P(qp, xp, n));

   m = 0; m = ~m; m = m/3;   /* m = (B - 1)/3 */
   c = 0; acc = 0;

   for (j = 0; j <= n - 1; j++)
   {
      umul_ppmm(dx, ax, xp[j], m);
      SBB(c, acc, acc, ax, c);
      qp[j] = acc;
      SBB(c, acc, acc, dx, c);
   }
   SBB(c, acc, acc, 0, c);

   /* return next quotient*(-3), so (xp, n) = (qp, n)*3 - ret*B^n and 0 <= ret < 3 */
   return acc*-3;
}
mp_limb_t divexact3_byluck(mp_ptr qp, mp_ptr xp, mp_size_t n)
{
   int j;
   mp_limb_t c, m, acc, ax, dx;

   ASSERT(n > 0);
   ASSERT_MPN(xp, n);
   ASSERT(MPN_SAME_OR_SEPARATE_P(qp, xp, n));

   m = 0; m = ~m; m = m/3;   /* m = (B - 1)/3 */
   c = 0; acc = 0;

   for (j = 0; j <= n - 1; j++)
   {
      umul_ppmm(dx, ax, xp[j], m);   /* line 1 */
      SUB(c, acc, acc, ax);          /* line 2 */
      qp[j] = acc;                   /* line 3 */
      SBB(c, acc, acc, dx, c);       /* line 4 */
      if (c != 0) { printf("c not zero\n"); abort(); }
   }

   /* return next quotient*(-3), so (xp, n) = (qp, n)*3 - ret*B^n and 0 <= ret < 3 */
   return acc*-3;
}
/* Define our own squaring function, which uses mpn_sqr_basecase for its allowed sizes, but its own code for larger sizes. */ static void mpn_local_sqr (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tp) { mp_size_t i; ASSERT (n >= 1); ASSERT (! MPN_OVERLAP_P (rp, 2*n, up, n)); if (BELOW_THRESHOLD (n, SQR_BASECASE_LIM)) { mpn_sqr_basecase (rp, up, n); return; } { mp_limb_t ul, lpl; ul = up[0]; umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS); rp[0] = lpl >> GMP_NAIL_BITS; } if (n > 1) { mp_limb_t cy; cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]); tp[n - 1] = cy; for (i = 2; i < n; i++) { mp_limb_t cy; cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]); tp[n + i - 2] = cy; } MPN_SQR_DIAGONAL (rp + 2, up + 1, n - 1); { mp_limb_t cy; #if HAVE_NATIVE_mpn_addlsh1_n cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2); #else cy = mpn_lshift (tp, tp, 2 * n - 2, 1); cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2); #endif rp[2 * n - 1] += cy; } } }
void _nmod_vec_scalar_mul_nmod(mp_ptr res, mp_srcptr vec, slong len, mp_limb_t c, nmod_t mod) { if (mod.norm >= FLINT_BITS/2) /* products will fit in a limb */ { mpn_mul_1(res, vec, len, c); _nmod_vec_reduce(res, res, len, mod); } else /* products may take two limbs */ { slong i; for (i = 0; i < len; i++) { mp_limb_t hi, lo; umul_ppmm(hi, lo, vec[i], c); NMOD_RED2(res[i], hi, lo, mod); /* hi already reduced mod n */ } } }
mp_limb_t mpn_bdiv_dbm1c (mp_ptr qp, mp_srcptr ap, mp_size_t n, mp_limb_t bd, mp_limb_t h) { mp_limb_t a, p0, p1, cy; mp_size_t i; for (i = 0; i < n; i++) { a = ap[i]; umul_ppmm (p1, p0, a, bd << GMP_NAIL_BITS); p0 >>= GMP_NAIL_BITS; cy = h < p0; h = (h - p0) & GMP_NUMB_MASK; qp[i] = h; h = h - p1 - cy; } return h; }
int main(void) { int i, result; FLINT_TEST_INIT(state); flint_printf("mulmod_precomp...."); fflush(stdout); for (i = 0; i < 100000 * flint_test_multiplier(); i++) { mp_limb_t a, b, d, r1, r2, p1, p2, dinv; double dpre; mp_limb_t bits = n_randint(state, FLINT_D_BITS) + 1; d = n_randtest_bits(state, bits); a = n_randtest(state) % d; b = n_randtest(state) % d; dpre = n_precompute_inverse(d); r1 = n_mulmod_precomp(a, b, d, dpre); umul_ppmm(p1, p2, a, b); dinv = n_preinvert_limb(d); r2 = n_ll_mod_preinv(p1, p2, d, dinv); result = (r1 == r2); if (!result) { flint_printf("FAIL:\n"); flint_printf("a = %wu, b = %wu, d = %wu, dinv = %f\n", a, b, d, dpre); flint_printf("r1 = %wu, r2 = %wu\n", r1, r2); abort(); } } FLINT_TEST_CLEANUP(state); flint_printf("PASS\n"); return 0; }
mp_limb_t n_clog(mp_limb_t n, mp_limb_t b) { mp_limb_t r, p, t, phi; r = 0; p = 1; while (1) { umul_ppmm(phi, t, p, b); if (t <= n && !phi) { r++; p = t; } else return r + (p != n); } }
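/* Hedged usage sketch for n_clog: the final r + (p != n) is the only subtle
   step, returning r at exact powers of b and r + 1 for everything strictly
   between b^r and b^(r+1). Assumes FLINT's ulong_extras.h and linking against
   FLINT; illustration only. */
#include "flint.h"
#include "ulong_extras.h"

int main(void)
{
    flint_printf("%wu\n", n_clog(8, 2));            /* exact power: prints 3 */
    flint_printf("%wu\n", n_clog(9, 2));            /* just above 2^3: prints 4 */
    flint_printf("%wu %wu\n", n_clog(1000000, 10),
                              n_clog(1000001, 10)); /* prints 6 7 */
    return 0;
}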
/* (xp, n) = (qp, n)*d - ret*B^n and 0 <= ret < d */ mp_limb_t mpn_divrem_hensel_qr_1_1(mp_ptr qp, mp_srcptr xp, mp_size_t n, mp_limb_t d) { mp_size_t j; mp_limb_t c, h, q, dummy, h1, t, m; ASSERT(n > 0); ASSERT_MPN(xp, n); ASSERT(MPN_SAME_OR_SEPARATE_P(qp, xp, n)); ASSERT(d%2 == 1); modlimb_invert(m, d); c = 0; h = 0; t = 0; for (j = 0; j <= n - 1; j++) { h1 = xp[j]; t = h + c; if (t > h1) { h1 = h1 - t; c = 1; } else { h1 = h1 - t; c = 0; } q = h1*m; qp[j] = q; umul_ppmm(h, dummy, q, d); ASSERT(dummy == h1); } return h + c; }
void fmpz_mul_si(fmpz_t f, const fmpz_t g, long x) { fmpz c2 = *g; if (x == 0) { fmpz_zero(f); return; } else if (!COEFF_IS_MPZ(c2)) /* c2 is small */ { mp_limb_t prod[2]; mp_limb_t uc2 = FLINT_ABS(c2); mp_limb_t ux = FLINT_ABS(x); /* unsigned limb by limb multiply (assembly for most CPU's) */ umul_ppmm(prod[1], prod[0], uc2, ux); if (!prod[1]) /* result fits in one limb */ { fmpz_set_ui(f, prod[0]); if ((c2 ^ x) < 0L) fmpz_neg(f, f); } else /* result takes two limbs */ { __mpz_struct *mpz_ptr = _fmpz_promote(f); /* two limbs, least significant first, native endian, no nails, stored in prod */ mpz_import(mpz_ptr, 2, -1, sizeof(mp_limb_t), 0, 0, prod); if ((c2 ^ x) < 0L) mpz_neg(mpz_ptr, mpz_ptr); } } else /* c2 is large */ { __mpz_struct *mpz_ptr = _fmpz_promote(f); /* ok without val as if aliased both are large */ mpz_mul_si(mpz_ptr, COEFF_TO_PTR(c2), x); } }
/* in each round we remove one limb from the body, i.e. k = 1 */ void mpn_mod_1_2(mp_ptr rem, mp_srcptr xp, mp_size_t xn, mp_srcptr db) { mp_limb_t h, l, sh, sl, th, tl; mp_size_t j; ASSERT(xn >= 4); ASSERT_MPN(xp, xn); ASSERT_LIMB(db[0]); ASSERT_LIMB(db[1]); ASSERT_LIMB(db[2]); tl = xp[xn - 2]; th = xp[xn - 1]; for (j = xn - 4; j >= 0; j -= 2) { umul_ppmm(sh, sl, xp[j + 1], db[0]); add_ssaaaa(sh, sl, sh, sl, 0, xp[j]); umul_ppmm(h, l, tl, db[1]); add_ssaaaa(sh, sl, sh, sl, h, l); umul_ppmm(th, tl, th, db[2]); add_ssaaaa(th, tl, th, tl, sh, sl); } if (j > -2) /* we have at least three limbs to do i.e. xp[0], ..., tl, th */ { umul_ppmm(sh, sl, tl, db[0]); add_ssaaaa(sh, sl, sh, sl, 0, xp[0]); umul_ppmm(th, tl, th, db[1]); add_ssaaaa(th, tl, th, tl, sh, sl); } umul_ppmm(h, l, th, db[0]); add_ssaaaa(h, l, h, l, 0, tl); rem[0] = l; rem[1] = h; }