/* (rp, 2n) = (xp, n)*(yp, n) */
void
mpn_mulhigh_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
{
  mp_limb_t t;

  ASSERT (n > 0);
  ASSERT_MPN (xp, n);
  ASSERT_MPN (yp, n);
  ASSERT (!MPN_OVERLAP_P (rp, 2 * n, xp, n));
  ASSERT (!MPN_OVERLAP_P (rp, 2 * n, yp, n));

  if (BELOW_THRESHOLD (n, MULHIGH_BASECASE_THRESHOLD))
    {
      mpn_mul_basecase (rp, xp, n, yp, n);
      return;
    }

  if (ABOVE_THRESHOLD (n, MULHIGH_MUL_THRESHOLD))
    {
      mpn_mul_n (rp, xp, yp, n);
      return;
    }

  mpn_mulshort_n (rp, xp, yp, n);

  /* the short product can undershoot the true product by at most n - 2 at
     limb n - 1; if adding that margin to rp[n - 1] wraps, the high half may
     be affected, so recompute with a full multiply */
  t = rp[n - 1] + n - 2;
  if (UNLIKELY (t < n - 2))
    mpn_mul_n (rp, xp, yp, n);

  return;
}
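/* Editor's sketch (not library code): the exact quantity that mpn_mulhigh_n
   approximates, computed with the public mpn interface -- the high n limbs
   of the full 2n-limb product.  The operand values and n = 2 are
   illustrative only. */
#include <gmp.h>
#include <stdio.h>

int
main (void)
{
  mp_limb_t x[2] = { 123, 456 };
  mp_limb_t y[2] = { 789, 101112 };
  mp_limb_t r[4];

  mpn_mul_n (r, x, y, 2);	/* exact 4-limb product */
  printf ("high half: %lu %lu\n",
          (unsigned long) r[2], (unsigned long) r[3]);
  return 0;
}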
/* k degree poly so have k+1 coeffs and first k are size n
   k > 3 so we can do the first add unconditionally */
int
mpn_toom_eval_pm1 (mp_ptr pp, mp_ptr mp, unsigned int k,
                   mp_srcptr xp, mp_size_t n, mp_size_t m, mp_ptr tp)
{
  int isneg = 0;
  unsigned int i;

  ASSERT (k > 3);
  ASSERT (n >= m);
  ASSERT (m > 0);
  ASSERT_MPN (xp, n * k + m);
  /* ASSERT_SPACE (pp, n + 1); ASSERT_SPACE (mp, n + 1); ASSERT_SPACE (tp, n + 1); */
  ASSERT (!MPN_OVERLAP_P (pp, n + 1, mp, n + 1));
  ASSERT (!MPN_OVERLAP_P (pp, n + 1, xp, n * k + m));
  ASSERT (!MPN_OVERLAP_P (pp, n + 1, tp, n + 1));
  ASSERT (!MPN_OVERLAP_P (mp, n + 1, xp, n * k + m));
  ASSERT (!MPN_OVERLAP_P (xp, n * k + m, tp, n + 1));
#if ! HAVE_NATIVE_mpn_sumdiff_n
  ASSERT (!MPN_OVERLAP_P (mp, n + 1, tp, n + 1));
#endif

#if HAVE_NATIVE_mpn_addadd_n
  if (k == 4)
    {
      pp[n] = mpn_add_n (pp, xp, xp + 2 * n, n);
      tp[n] = mpn_add_n (tp, xp + n, xp + 3 * n, n);
    }
  else if (k == 5)
    {
      pp[n] = mpn_addadd_n (pp, xp, xp + 2 * n, xp + 4 * n, n);
      tp[n] = mpn_add_n (tp, xp + n, xp + 3 * n, n);
    }
  else
    {
      pp[n] = mpn_addadd_n (pp, xp, xp + 2 * n, xp + 4 * n, n);
      tp[n] = mpn_addadd_n (tp, xp + n, xp + 3 * n, xp + 5 * n, n);
      for (i = 7; i < k - 2; i += 4)
        {
          pp[n] += mpn_addadd_n (pp, pp, xp + (i - 1) * n, xp + (i + 1) * n, n);
          tp[n] += mpn_addadd_n (tp, tp, xp + i * n, xp + (i + 2) * n, n);
        }
      if (k % 4 == 3)
        {
          pp[n] += mpn_add_n (pp, pp, xp + (k - 1) * n, n);
        }
      if (k % 4 == 0)
        {
          pp[n] += mpn_add_n (pp, pp, xp + (k - 2) * n, n);
          tp[n] += mpn_add_n (tp, tp, xp + (k - 1) * n, n);
        }
      if (k % 4 == 1)
        {
          pp[n] += mpn_addadd_n (pp, pp, xp + (k - 3) * n, xp + (k - 1) * n, n);
          tp[n] += mpn_add_n (tp, tp, xp + (k - 2) * n, n);
        }
    }
  if (k % 2 == 0)
    {
      pp[n] += mpn_add (pp, pp, n, xp + k * n, m);
    }
  else
    {
      tp[n] += mpn_add (tp, tp, n, xp + k * n, m);
    }
#else
  /* pp sums xp+0  xp+2n xp+4n xp+6n ... xp+jn where j <= k-1
     tp sums xp+n  xp+3n xp+5n xp+7n ... xp+jn where j <= k-1 */
  pp[n] = mpn_add_n (pp, xp, xp + 2 * n, n);
  tp[n] = mpn_add_n (tp, xp + n, xp + 3 * n, n);
  for (i = 5; i < k; i += 2)
    {
      pp[n] += mpn_add_n (pp, pp, xp + (i - 1) * n, n);
      tp[n] += mpn_add_n (tp, tp, xp + i * n, n);
    }
  if (k % 2 == 1)
    {
      pp[n] += mpn_add_n (pp, pp, xp + (k - 1) * n, n);
      tp[n] += mpn_add (tp, tp, n, xp + k * n, m);
    }
  else
    {
      pp[n] += mpn_add (pp, pp, n, xp + k * n, m);
    }
#endif

  if (mpn_cmp (tp, pp, n + 1) > 0)
    isneg = -1;

#if HAVE_NATIVE_mpn_sumdiff_n
  if (isneg == 0)
    {
      mpn_sumdiff_n (pp, mp, pp, tp, n + 1);
    }
  else
    {
      mpn_sumdiff_n (pp, mp, tp, pp, n + 1);
    }
#else
  if (isneg == 0)
    {
      mpn_sub_n (mp, pp, tp, n + 1);
    }
  else
    {
      mpn_sub_n (mp, tp, pp, n + 1);
    }
  mpn_add_n (pp, pp, tp, n + 1);
#endif

  return isneg;
}
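/* Illustration (not library code): for x(t) = x_0 + x_1 t + ... + x_k t^k,
   x(1) is (even-index sum) + (odd-index sum) and x(-1) is
   (even-index sum) - (odd-index sum) -- exactly the pp/tp accumulations
   above, followed by the final add/sub (or sumdiff) step.  Small-integer
   sketch: */
#include <stdio.h>

int
main (void)
{
  long c[5] = { 3, 1, 4, 1, 5 };	/* degree-4 polynomial */
  long even = c[0] + c[2] + c[4];	/* the "pp" accumulator */
  long odd = c[1] + c[3];		/* the "tp" accumulator */

  printf ("x(1)  = %ld\n", even + odd);	/* 14 */
  printf ("x(-1) = %ld\n", even - odd);	/* 10 */
  return 0;
}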
/* (rp, 2n) = (xp, n)*(yp, n) / B^n */
inline static void
mpn_mulshort_n_basecase (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
{
  mp_size_t i, k;
#if GMP_NAIL_BITS == 0
  mp_limb_t t1, t2, t3;
#endif

  ASSERT (n >= 3);  /* this restriction doesn't make a lot of sense in general */
  ASSERT_MPN (xp, n);
  ASSERT_MPN (yp, n);
  ASSERT (!MPN_OVERLAP_P (rp, 2 * n, xp, n));
  ASSERT (!MPN_OVERLAP_P (rp, 2 * n, yp, n));

  k = n - 2;  /* so want short product sum_(i + j >= k) x[i]y[j]B^(i + j) */

#if GMP_NAIL_BITS != 0
  rp[n] = mpn_mul_1 (rp + k, xp + k, 2, yp[0]);
#else
  umul_ppmm (t1, rp[k], xp[k], yp[0]);
  umul_ppmm (t3, t2, xp[k + 1], yp[0]);
  add_ssaaaa (rp[n], rp[k + 1], t3, t2, 0, t1);
#endif

  for (i = 1; i <= n - 2; i++)
    rp[n + i] = mpn_addmul_1 (rp + k, xp + k - i, 2 + i, yp[i]);

  rp[n + n - 1] = mpn_addmul_1 (rp + n - 1, xp, n, yp[n - 1]);

  return;
}
void
mpn_mulmid_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1));
  ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n));

  if (n < MULMID_TOOM42_THRESHOLD)
    {
      mpn_mulmid_basecase (rp, ap, 2*n - 1, bp, n);
    }
  else
    {
      mp_size_t k;
      mp_ptr scratch;
      TMP_DECL;

      k = mpn_toom42_mulmid_itch (n);
      if (k <= 1000)
        k = 1000;
      TMP_MARK;
      scratch = TMP_ALLOC_LIMBS (k);
      mpn_toom42_mulmid (rp, ap, bp, n, scratch);
      TMP_FREE;
    }
}
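/* Illustration (an assumption about the window, using only the public
   mpn_mul): the middle product of {ap, 2n-1} and {bp, n} targets limbs
   n-1 .. 2n of the full (3n-1)-limb product; mpn_mulmid_n returns those
   n + 2 limbs, with the two boundary limbs possibly off because carries
   entering the window from outside are not computed. */
#include <gmp.h>
#include <stdio.h>

int
main (void)
{
  enum { n = 3 };
  mp_limb_t a[2*n - 1] = { 1, 2, 3, 4, 5 };
  mp_limb_t b[n] = { 6, 7, 8 };
  mp_limb_t full[3*n - 1];
  int i;

  mpn_mul (full, a, 2*n - 1, b, n);	/* full product for reference */
  for (i = n - 1; i <= 2*n; i++)	/* the n + 2 limbs mulmid targets */
    printf ("full[%d] = %lu\n", i, (unsigned long) full[i]);
  return 0;
}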
void
my__gmpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
                  mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)
{
  ASSERT_ALWAYS (qxn == 0);
  ASSERT (nn >= 0);
  ASSERT (dn >= 0);
  ASSERT (dn == 0 || dp[dn - 1] != 0);
  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1 + qxn, np, nn));
  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1 + qxn, dp, dn));

  int adjust;
  gmp_pi1_t dinv;
  TMP_DECL;
  TMP_MARK;

  /* conservative tests for quotient size */
  adjust = np[nn - 1] >= dp[dn - 1];

  mp_ptr n2p, d2p;
  mp_limb_t cy;
  int cnt;

  qp[nn - dn] = 0;			/* zero high quotient limb */
  count_leading_zeros (cnt, dp[dn - 1]);
  cnt -= GMP_NAIL_BITS;
  d2p = TMP_ALLOC_LIMBS (dn);
  mpn_lshift (d2p, dp, dn, cnt);	/* note: mpn_lshift requires cnt >= 1 */
  for (int i = 0; i < dn; i++)
    printf ("d2p %08lx\n", (unsigned long) d2p[i]);

  n2p = TMP_ALLOC_LIMBS (nn + 1);
  cy = mpn_lshift (n2p, np, nn, cnt);
  for (int i = 0; i < nn; i++)
    printf ("n2p %08lx\n", (unsigned long) n2p[i]);
  n2p[nn] = cy;
  nn += adjust;

  printf ("d2p[dn-1] = %08lx\nd2p[dn-2] = %08lx\n", d2p[dn - 1], d2p[dn - 2]);
  invert_pi1 (dinv, d2p[dn - 1], d2p[dn - 2]);
  printf ("dinv %08lx\n", dinv.inv32);

  my_mpn_sbpi1_div_qr (qp, n2p, nn, d2p, dn, dinv.inv32);
  for (int i = 0; i < nn; i++)
    printf ("inside qp %08lx\n", (unsigned long) qp[i]);

  n2p[nn] = cy;
  mpn_rshift (rp, n2p, dn, cnt);

  TMP_FREE;
  return;
}
/* c is the top bits of the inputs, (fully reduced)
   c & 2 is the top bit of y
   c & 1 is the top bit of z */
int
mpn_mulmod_2expp1_basecase (mp_ptr xp, mp_srcptr yp, mp_srcptr zp,
                            int c, mpir_ui b, mp_ptr tp)
{
  int cy, cz;
  mp_size_t n, k;

  cy = c & 2;
  cz = c & 1;
  n = BITS_TO_LIMBS (b);
  k = GMP_NUMB_BITS * n - b;

  ASSERT (b > 0);
  ASSERT (n > 0);
  ASSERT_MPN (yp, n);
  ASSERT_MPN (zp, n);
  ASSERT (!MPN_OVERLAP_P (tp, 2 * n, yp, n));
  ASSERT (!MPN_OVERLAP_P (tp, 2 * n, zp, n));
  ASSERT (MPN_SAME_OR_SEPARATE_P (xp, tp, n));
  ASSERT (MPN_SAME_OR_SEPARATE_P (xp, tp + n, n));
  ASSERT (k == 0 || yp[n - 1] >> (GMP_NUMB_BITS - k) == 0);
  ASSERT (k == 0 || zp[n - 1] >> (GMP_NUMB_BITS - k) == 0);

#if WANT_ASSERT
  {
    mp_size_t t = n;
    MPN_NORMALIZE (yp, t);
    ASSERT (cy == 0 || t == 0);
    t = n;
    MPN_NORMALIZE (zp, t);
    ASSERT (cz == 0 || t == 0);
  }
#endif

  if (LIKELY (cy == 0))
    {
      if (LIKELY (cz == 0))
        {
          c = mpn_mulmod_2expp1_internal (xp, yp, zp, b, tp);
        }
      else
        {
          /* z == -1 mod 2^b + 1, so the product is -y */
          c = mpn_neg_n (xp, yp, n);
          c = mpn_add_1 (xp, xp, n, c);
          xp[n - 1] &= GMP_NUMB_MASK >> k;
        }
    }
  else
    {
      if (LIKELY (cz == 0))
        {
          /* y == -1 mod 2^b + 1, so the product is -z
             (this and the next case are reconstructed by symmetry with the
             cz != 0 case above; the original text broke off here) */
          c = mpn_neg_n (xp, zp, n);
          c = mpn_add_1 (xp, xp, n, c);
          xp[n - 1] &= GMP_NUMB_MASK >> k;
        }
      else
        {
          /* y == z == -1, so the product is (-1)*(-1) = 1 */
          c = 0;
          xp[0] = 1;
          MPN_ZERO (xp + 1, n - 1);
        }
    }

  return c;
}
void
mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (n == 1)
    invert_limb (*ip, *dp);
  else
    {
      TMP_DECL;

      TMP_MARK;
      if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD))
        {
          /* Maximum scratch needed by this branch: 2*n */
          mp_size_t i;
          mp_ptr xp;

          xp = scratch;				/* 2 * n limbs */
          for (i = n - 1; i >= 0; i--)
            xp[i] = GMP_NUMB_MAX;
          mpn_com (xp + n, dp, n);
          if (n == 2)
            {
              mpn_divrem_2 (ip, 0, xp, 4, dp);
            }
          else
            {
              gmp_pi1_t inv;

              invert_pi1 (inv, dp[n-1], dp[n-2]);
              /* FIXME: should we use dcpi1_div_q, for big sizes? */
              mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32);
            }
        }
      else
        {
          /* Use approximated inverse; correct the result if needed. */
          mp_limb_t e;	/* The possible error in the approximate inverse */

          ASSERT (mpn_invert_itch (n) >= mpn_invertappr_itch (n));
          e = mpn_ni_invertappr (ip, dp, n, scratch);

          if (UNLIKELY (e))
            {
              /* Assume the error can only be "0" (no error) or "1". */
              /* Code to detect and correct the "off by one" approximation. */
              mpn_mul_n (scratch, ip, dp, n);
              ASSERT_NOCARRY (mpn_add_n (scratch + n, scratch + n, dp, n));
              if (! mpn_add (scratch, scratch, 2*n, dp, n))
                MPN_INCR_U (ip, n, 1);	/* The value was wrong, correct it. */
            }
        }
      TMP_FREE;
    }
}
void
mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n));

  if (BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD))
    {
      mpn_mul_basecase (p, a, n, b, n);
    }
  else if (BELOW_THRESHOLD (n, MUL_TOOM3_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast! */
      mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD_LIMIT-1)];
      ASSERT (MUL_TOOM3_THRESHOLD <= MUL_TOOM3_THRESHOLD_LIMIT);
      mpn_kara_mul_n (p, a, b, n, ws);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (BELOW_THRESHOLD (n, MUL_FFT_THRESHOLD))
#else
  else if (BELOW_THRESHOLD (n, MPN_TOOM3_MAX_N))
#endif
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (MPN_TOOM3_MUL_N_TSIZE (n));
      mpn_toom3_mul_n (p, a, b, n, ws);
      TMP_SFREE;
    }
  else
#if WANT_FFT || TUNE_PROGRAM_BUILD
    {
      /* The current FFT code allocates its own space.  That should probably
         change.  */
      mpn_mul_fft_full (p, a, n, b, n);
    }
#else
    {
      /* Toom3 for large operands.  Use workspace from the heap, as stack
         space may be limited.  Since n is at least MUL_TOOM3_THRESHOLD,
         multiplication will take much longer than malloc()/free().  */
      mp_ptr ws;
      mp_size_t ws_size;

      ws_size = MPN_TOOM3_MUL_N_TSIZE (n);
      ws = __GMP_ALLOCATE_FUNC_LIMBS (ws_size);
      mpn_toom3_mul_n (p, a, b, n, ws);
      __GMP_FREE_FUNC_LIMBS (ws, ws_size);
    }
#endif
}
void
mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (n == 1)
    invert_limb (*ip, *dp);
  else if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD))
    {
      /* Maximum scratch needed by this branch: 2*n */
      mp_size_t i;
      mp_ptr xp;

      xp = scratch;				/* 2 * n limbs */
      /* n > 1 here */
      i = n;
      do
        xp[--i] = GMP_NUMB_MAX;
      while (i);
      mpn_com (xp + n, dp, n);
      if (n == 2)
        {
          mpn_divrem_2 (ip, 0, xp, 4, dp);
        }
      else
        {
          gmp_pi1_t inv;

          invert_pi1 (inv, dp[n-1], dp[n-2]);
          /* FIXME: should we use dcpi1_div_q, for big sizes? */
          mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32);
        }
    }
  else
    { /* Use approximated inverse; correct the result if needed. */
      mp_limb_t e;	/* The possible error in the approximate inverse */

      ASSERT (mpn_invert_itch (n) >= mpn_invertappr_itch (n));
      e = mpn_ni_invertappr (ip, dp, n, scratch);

      if (UNLIKELY (e))
        { /* Assume the error can only be "0" (no error) or "1". */
          /* Code to detect and correct the "off by one" approximation. */
          mpn_mul_n (scratch, ip, dp, n);
          e = mpn_add_n (scratch, scratch, dp, n); /* FIXME: we only need e.*/
          if (LIKELY (e)) /* The high part can not give a carry by itself. */
            e = mpn_add_nc (scratch + n, scratch + n, dp, n, e); /* FIXME:e */
          /* If the value was wrong (no carry), correct it (increment). */
          e ^= CNST_LIMB (1);
          MPN_INCR_U (ip, n, e);
        }
    }
}
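/* Toy model (base B = 10, n = 1; not library code) of the correction step
   above: the wanted inverse is ip = floor((B^2 - 1)/d) - B, and ip is large
   enough exactly when (ip + B + 1) * d >= B^2, which is the carry the code
   tests for.  When the approximation is one too small, the test fails and
   ip gets incremented. */
#include <stdio.h>

int
main (void)
{
  int d;
  for (d = 5; d <= 9; d++)		/* "normalized": top digit >= B/2 */
    {
      int exact = 99 / d - 10;
      int approx = exact - (d == 7);	/* pretend d = 7 came out one low */
      if ((approx + 10 + 1) * d < 100)	/* no "carry": approximation low */
        approx++;
      printf ("d=%d exact=%d corrected=%d\n", d, exact, approx);
    }
  return 0;
}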
void
mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n));

  if (BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD))
    {
      mpn_mul_basecase (p, a, n, b, n);
    }
  else if (BELOW_THRESHOLD (n, MUL_TOOM3_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast! */
      mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD_LIMIT-1)];
      ASSERT (MUL_TOOM3_THRESHOLD <= MUL_TOOM3_THRESHOLD_LIMIT);
      mpn_kara_mul_n (p, a, b, n, ws);
    }
  else if (BELOW_THRESHOLD (n, MUL_TOOM4_THRESHOLD))
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (MPN_TOOM3_MUL_N_TSIZE (n));
      mpn_toom3_mul_n (p, a, b, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD))
    {
      mpn_toom4_mul_n (p, a, b, n);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (BELOW_THRESHOLD (n, MUL_FFT_FULL_THRESHOLD))
    {
      mpn_toom8h_mul (p, a, n, b, n);
    }
#endif
  else
#if WANT_FFT || TUNE_PROGRAM_BUILD
    {
      mpn_mul_fft_main (p, a, n, b, n);
    }
#else
    {
      /* Toom8 for large operands. */
      mpn_toom8h_mul (p, a, n, b, n);
    }
#endif
}
mp_limb_t
mpn_sumdiff_n (mp_ptr s, mp_ptr d, mp_srcptr x, mp_srcptr y, mp_size_t n)
{
  mp_limb_t ret;
  mp_ptr t;

  ASSERT (n > 0);
  ASSERT_MPN (x, n);
  ASSERT_MPN (y, n);
  /* ASSERT_SPACE (s, n); ASSERT_SPACE (d, n); */
  ASSERT (MPN_SAME_OR_SEPARATE_P (s, x, n));
  ASSERT (MPN_SAME_OR_SEPARATE_P (s, y, n));
  ASSERT (MPN_SAME_OR_SEPARATE_P (d, x, n));
  ASSERT (MPN_SAME_OR_SEPARATE_P (d, y, n));
  ASSERT (!MPN_OVERLAP_P (s, n, d, n));

  if ((s == x && d == y) || (s == y && d == x))
    {
      /* the sum and the difference would each clobber the other's input;
         compute the difference into a temporary first */
      t = __GMP_ALLOCATE_FUNC_LIMBS (n);
      ret = mpn_sub_n (t, x, y, n);
      ret += 2 * mpn_add_n (s, x, y, n);
      MPN_COPY (d, t, n);
      __GMP_FREE_FUNC_LIMBS (t, n);
      return ret;
    }

  if (s == x || s == y)
    {
      /* compute the difference first, since the sum overwrites an input */
      ret = mpn_sub_n (d, x, y, n);
      ret += 2 * mpn_add_n (s, x, y, n);
      return ret;
    }

  ret = 2 * mpn_add_n (s, x, y, n);
  ret += mpn_sub_n (d, x, y, n);
  return ret;
}
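/* The return value packs two bits: 2 * carry(x + y) + borrow(x - y).  A
   sketch of the same contract built from the public mpn_add_n/mpn_sub_n,
   with x = B^2 - 1 and y = 1 so the sum carries and the difference does
   not borrow: */
#include <gmp.h>
#include <stdio.h>

int
main (void)
{
  mp_limb_t x[2] = { ~(mp_limb_t) 0, ~(mp_limb_t) 0 };
  mp_limb_t y[2] = { 1, 0 };
  mp_limb_t s[2], d[2], ret;

  ret = 2 * mpn_add_n (s, x, y, 2) + mpn_sub_n (d, x, y, 2);
  printf ("ret = %lu\n", (unsigned long) ret);	/* 2 */
  return 0;
}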
/* mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp.
   It accepts tp == rp. */
static void
mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp)
{
  mp_size_t n2, n1;

  ASSERT (n >= 2);
  ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
  ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
  ASSERT (MPN_SAME_OR_SEPARATE2_P (rp, n, tp, 2*n));

  /* Divide-and-conquer */

  /* We need fractional approximation of the value 0 < a <= 1/2
     giving the minimum in the function k=(1-a)^e/(1-2*a^e). */
  if (MAYBE_range_basecase
      && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11)))
    n1 = n >> 1;
  else if (MAYBE_range_toom22
           && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11)))
void
mpn_sqr (mp_ptr p, mp_srcptr a, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));

#if 0
  /* FIXME: Can this be removed? */
  if (n == 0)
    return;
#endif

  if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
    { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */
      mpn_mul_basecase (p, a, n, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD))
    {
      mpn_sqr_basecase (p, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast! */
      mp_limb_t ws[MPN_KARA_SQR_N_TSIZE (SQR_TOOM3_THRESHOLD_LIMIT-1)];
      ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT);
      mpn_kara_sqr_n (p, a, n, ws);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (MPN_TOOM3_SQR_N_TSIZE (n));
      mpn_toom3_sqr_n (p, a, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD))
    {
      mpn_toom4_sqr_n (p, a, n);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (BELOW_THRESHOLD (n, SQR_FFT_FULL_THRESHOLD))
#else
  else
#endif
    {
      mpn_toom8_sqr_n (p, a, n);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else
    {
      /* body reconstructed by analogy with mpn_mul_n above (an assumption):
         the full-product FFT path for the largest sizes */
      mpn_mul_fft_main (p, a, n, a, n);
    }
#endif
}
void
mpn_sqr_n (mp_ptr prodp, mp_srcptr up, mp_size_t un)
{
  ASSERT (un >= 1);
  ASSERT (! MPN_OVERLAP_P (prodp, 2*un, up, un));

  /* FIXME: Can this be removed? */
  if (un == 0)
    return;

  if (BELOW_THRESHOLD (un, SQR_BASECASE_THRESHOLD))
    { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */
      mpn_mul_basecase (prodp, up, un, up, un);
    }
  else if (BELOW_THRESHOLD (un, SQR_KARATSUBA_THRESHOLD))
    { /* plain schoolbook multiplication */
      mpn_sqr_basecase (prodp, up, un);
    }
  else if (BELOW_THRESHOLD (un, SQR_TOOM3_THRESHOLD))
    { /* karatsuba multiplication */
      mp_ptr tspace;
      TMP_DECL (marker);
      TMP_MARK (marker);
      tspace = TMP_ALLOC_LIMBS (MPN_KARA_SQR_N_TSIZE (un));
      mpn_kara_sqr_n (prodp, up, un, tspace);
      TMP_FREE (marker);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (BELOW_THRESHOLD (un, SQR_FFT_THRESHOLD))
#else
  else
#endif
    { /* Toom3 multiplication.  Use workspace from the heap, as stack may be
         limited.  Since n is at least MUL_TOOM3_THRESHOLD, the multiplication
         will take much longer than malloc()/free(). */
      mp_ptr tspace;
      mp_size_t tsize;

      tsize = MPN_TOOM3_SQR_N_TSIZE (un);
      tspace = __GMP_ALLOCATE_FUNC_LIMBS (tsize);
      mpn_toom3_sqr_n (prodp, up, un, tspace);
      __GMP_FREE_FUNC_LIMBS (tspace, tsize);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else
    {
      /* body reconstructed by analogy with mpn_mul_n above (an assumption):
         FFT squaring for the largest sizes */
      mpn_mul_fft_full (prodp, up, un, up, un);
    }
#endif
}
/* Define our own squaring function, which uses mpn_sqr_basecase for its
   allowed sizes, but its own code for larger sizes. */
static void
mpn_local_sqr (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tp)
{
  mp_size_t i;

  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (rp, 2*n, up, n));

  if (BELOW_THRESHOLD (n, SQR_BASECASE_LIM))
    {
      mpn_sqr_basecase (rp, up, n);
      return;
    }

  {
    mp_limb_t ul, lpl;
    ul = up[0];
    umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
    rp[0] = lpl >> GMP_NAIL_BITS;
  }
  if (n > 1)
    {
      mp_limb_t cy;

      cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
      tp[n - 1] = cy;
      for (i = 2; i < n; i++)
        {
          mp_limb_t cy;
          cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
          tp[n + i - 2] = cy;
        }
      MPN_SQR_DIAGONAL (rp + 2, up + 1, n - 1);

      {
        mp_limb_t cy;
#if HAVE_NATIVE_mpn_addlsh1_n
        cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);
#else
        cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
        cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);
#endif
        rp[2 * n - 1] += cy;
      }
    }
}
mp_limb_t
mpn_sb_divrem_mn (mp_ptr qp, mp_ptr np, mp_size_t nn,
                  mp_srcptr dp, mp_size_t dn)
{
  mp_limb_t most_significant_q_limb = 0;
  mp_size_t qn = nn - dn;
  mp_size_t i;
  mp_limb_t dx, d1, n0;
  mp_limb_t dxinv;
  int use_preinv;

  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (np, nn, dp, dn));
  ASSERT (! MPN_OVERLAP_P (qp, nn-dn, dp, dn));
  ASSERT (! MPN_OVERLAP_P (qp, nn-dn, np, nn) || qp+dn >= np);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, dn);

  np += qn;
  dx = dp[dn - 1];
  d1 = dp[dn - 2];
  n0 = np[dn - 1];

  if (n0 >= dx)
    {
      if (n0 > dx || mpn_cmp (np, dp, dn - 1) >= 0)
        {
          mpn_sub_n (np, np, dp, dn);
          most_significant_q_limb = 1;
        }
    }

  /* use_preinv is possibly a constant, but it's left to the compiler to
     optimize away the unused code in that case.  */
  use_preinv = ABOVE_THRESHOLD (qn, DIV_SB_PREINV_THRESHOLD);
  if (use_preinv)
    invert_limb (dxinv, dx);

  for (i = qn - 1; i >= 0; i--)
    {
      mp_limb_t q;
      mp_limb_t nx;
      mp_limb_t cy_limb;

      nx = np[dn - 1];		/* FIXME: could get value from r1 */
      np--;

      if (nx == dx)
        {
          /* This might over-estimate q, but it's probably not worth
             the extra code here to find out.  */
          q = GMP_NUMB_MASK;

#if 1
          cy_limb = mpn_submul_1 (np, dp, dn, q);
#else
          /* This should be faster on many machines */
          cy_limb = mpn_sub_n (np + 1, np + 1, dp, dn);
          cy = mpn_add_n (np, np, dp, dn);
          np[dn] += cy;
#endif

          if (nx != cy_limb)
            {
              mpn_add_n (np, np, dp, dn);
              q--;
            }

          qp[i] = q;
        }
      else
        {
          mp_limb_t rx, r1, r0, p1, p0;

          /* "workaround" avoids a problem with gcc 2.7.2.3 i386 register
             usage when np[dn-1] is used in an asm statement like umul_ppmm
             in udiv_qrnnd_preinv.  The symptom is seg faults due to
             registers being clobbered.  gcc 2.95 i386 doesn't have the
             problem. */
          {
            mp_limb_t workaround = np[dn - 1];
            if (use_preinv)
              udiv_qrnnd_preinv (q, r1, nx, workaround, dx, dxinv);
            else
              {
                udiv_qrnnd (q, r1, nx, workaround << GMP_NAIL_BITS,
                            dx << GMP_NAIL_BITS);
                r1 >>= GMP_NAIL_BITS;
              }
          }
          umul_ppmm (p1, p0, d1, q << GMP_NAIL_BITS);
          p0 >>= GMP_NAIL_BITS;

          r0 = np[dn - 2];
          rx = 0;
          if (r1 < p1 || (r1 == p1 && r0 < p0))
            {
              p1 -= p0 < d1;
              p0 = (p0 - d1) & GMP_NUMB_MASK;
              q--;
              r1 = (r1 + dx) & GMP_NUMB_MASK;
              rx = r1 < dx;
            }

          p1 += r0 < p0;	/* cannot carry! */
          rx -= r1 < p1;	/* may become 11..1 if q is still too large */
          r1 = (r1 - p1) & GMP_NUMB_MASK;
          r0 = (r0 - p0) & GMP_NUMB_MASK;

          cy_limb = mpn_submul_1 (np, dp, dn - 2, q);

          /* Check if we've over-estimated q, and adjust as needed.  */
          {
            mp_limb_t cy1, cy2;
            cy1 = r0 < cy_limb;
            r0 = (r0 - cy_limb) & GMP_NUMB_MASK;
            cy2 = r1 < cy1;
            r1 -= cy1;
            np[dn - 1] = r1;
            np[dn - 2] = r0;
            if (cy2 != rx)
              {
                mpn_add_n (np, np, dp, dn);
                q--;
              }
          }
          qp[i] = q;
        }
    }

  /*     ______ ______ ______
        |__rx__|__r1__|__r0__|		partial remainder
               ______ ______
            - |__p1__|__p0__|		partial product to subtract
               ______ ______
            - |______|cylimb|

     rx is -1, 0 or 1.  If rx=1, then q is correct (it should match
     carry out).  If rx=-1 then q is too large.  If rx=0, then q might
     be too large, but it is most likely correct.  */

  return most_significant_q_limb;
}
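/* Toy base-10 version of the quotient-digit step above: estimate q from
   the leading digits, multiply back, and decrement q when the estimate
   overshoots (the mpn_add_n / q-- adjustment path).  For a normalized
   divisor the estimate is at most 2 too large. */
#include <stdio.h>

int
main (void)
{
  int N = 878, d = 97;		/* one schoolbook division step */
  int q = 87 / 9;		/* estimate from leading digits */
  while (q * d > N)		/* adjust downward */
    q--;
  printf ("q = %d, r = %d\n", q, N - q * d);	/* q = 9, r = 5 */
  return 0;
}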
void
mpn_tdiv_q (mp_ptr qp, mp_srcptr np, mp_size_t nn,
            mp_srcptr dp, mp_size_t dn)
{
  mp_ptr new_dp, new_np, tp, rp, scratch;
  mp_limb_t cy, dh, qh;
  mp_size_t new_nn, qn;
  mp_limb_t dinv;
  int cnt;
  TMP_DECL;
  TMP_MARK;

  ASSERT (nn >= dn);
  ASSERT (dn > 0);
  ASSERT (dp[dn - 1] != 0);
  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, np, nn));
  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, dp, dn));
  ASSERT_ALWAYS (FUDGE >= 2);

  if (dn == 1)
    {
      mpn_divrem_1 (qp, 0L, np, nn, dp[dn - 1]);
      return;
    }

  scratch = TMP_ALLOC_LIMBS (nn + 1);

  qn = nn - dn + 1;		/* Quotient size, high limb might be zero */

  if (qn + FUDGE >= dn)
    {
      /* |________________________|
                          |_______|  */
      new_np = scratch;

      dh = dp[dn - 1];
      if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0))
        {
          count_leading_zeros (cnt, dh);

          cy = mpn_lshift (new_np, np, nn, cnt);
          new_np[nn] = cy;
          new_nn = nn + (cy != 0);

          new_dp = TMP_ALLOC_LIMBS (dn);
          mpn_lshift (new_dp, dp, dn, cnt);

          if (dn == 2)
            {
              qh = mpn_divrem_2 (qp, 0L, new_np, new_nn, new_dp);
            }
          else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD)
                   || BELOW_THRESHOLD (new_nn - dn, DC_DIV_Q_THRESHOLD))
            {
              invert_1 (dinv, new_dp[dn - 1], new_dp[dn - 2]);
              qh = mpn_sb_div_q (qp, new_np, new_nn, new_dp, dn, dinv);
            }
          else if (BELOW_THRESHOLD (dn, INV_DIV_Q_THRESHOLD)
                   || BELOW_THRESHOLD (nn, 2 * INV_DIV_Q_THRESHOLD))
            {
              invert_1 (dinv, new_dp[dn - 1], new_dp[dn - 2]);
              qh = mpn_dc_div_q (qp, new_np, new_nn, new_dp, dn, dinv);
            }
          else
            {
              mp_ptr inv = TMP_ALLOC_LIMBS (dn);
              mpn_invert (inv, new_dp, dn);
              qh = mpn_inv_div_q (qp, new_np, new_nn, new_dp, dn, inv);
            }
          if (cy == 0)
            qp[qn - 1] = qh;
          else if (UNLIKELY (qh != 0))
            {
              /* This happens only when the quotient is close to B^n and
                 mpn_*_divappr_q returned B^n.  */
              mp_size_t i, n;

              n = new_nn - dn;
              for (i = 0; i < n; i++)
                qp[i] = GMP_NUMB_MAX;
              qh = 0;		/* currently ignored */
            }
        }
      else  /* divisor is already normalised */
        {
          if (new_np != np)
            MPN_COPY (new_np, np, nn);

          if (dn == 2)
            {
              qh = mpn_divrem_2 (qp, 0L, new_np, nn, dp);
            }
          else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD)
                   || BELOW_THRESHOLD (nn - dn, DC_DIV_Q_THRESHOLD))
            {
              invert_1 (dinv, dh, dp[dn - 2]);
              qh = mpn_sb_div_q (qp, new_np, nn, dp, dn, dinv);
            }
          else if (BELOW_THRESHOLD (dn, INV_DIV_Q_THRESHOLD)
                   || BELOW_THRESHOLD (nn, 2 * INV_DIV_Q_THRESHOLD))
            {
              invert_1 (dinv, dh, dp[dn - 2]);
              qh = mpn_dc_div_q (qp, new_np, nn, dp, dn, dinv);
            }
          else
            {
              mp_ptr inv = TMP_ALLOC_LIMBS (dn);
              mpn_invert (inv, dp, dn);
              qh = mpn_inv_div_q (qp, new_np, nn, dp, dn, inv);
            }
          qp[nn - dn] = qh;
        }
    }
  else
    {
      /* |________________________|
              |_________________|  */
      tp = TMP_ALLOC_LIMBS (qn + 1);

      new_np = scratch;
      new_nn = 2 * qn + 1;
      if (new_np == np)
        /* We need {np,nn} to remain untouched until the final adjustment, so
           we need to allocate separate space for new_np.  */
        new_np = TMP_ALLOC_LIMBS (new_nn + 1);

      dh = dp[dn - 1];
      if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0))
        {
          count_leading_zeros (cnt, dh);

          cy = mpn_lshift (new_np, np + nn - new_nn, new_nn, cnt);
          new_np[new_nn] = cy;
          new_nn += (cy != 0);

          new_dp = TMP_ALLOC_LIMBS (qn + 1);
          mpn_lshift (new_dp, dp + dn - (qn + 1), qn + 1, cnt);
          new_dp[0] |= dp[dn - (qn + 1) - 1] >> (GMP_NUMB_BITS - cnt);

          if (qn + 1 == 2)
            {
              qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp);
            }
          else if (BELOW_THRESHOLD (qn - 1, DC_DIVAPPR_Q_THRESHOLD))
            {
              invert_1 (dinv, new_dp[qn], new_dp[qn - 1]);
              qh = mpn_sb_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv);
            }
          else if (BELOW_THRESHOLD (qn - 1, INV_DIVAPPR_Q_THRESHOLD))
            {
              invert_1 (dinv, new_dp[qn], new_dp[qn - 1]);
              qh = mpn_dc_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv);
            }
          else
            {
              mp_ptr inv = TMP_ALLOC_LIMBS (qn + 1);
              mpn_invert (inv, new_dp, qn + 1);
              qh = mpn_inv_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, inv);
            }
          if (cy == 0)
            tp[qn] = qh;
          else if (UNLIKELY (qh != 0))
            {
              /* This happens only when the quotient is close to B^n and
                 mpn_*_divappr_q returned B^n.  */
              mp_size_t i, n;

              n = new_nn - (qn + 1);
              for (i = 0; i < n; i++)
                tp[i] = GMP_NUMB_MAX;
              qh = 0;		/* currently ignored */
            }
        }
      else  /* divisor is already normalised */
        {
mp_limb_t
mpn_div_qr_1n_pi2 (mp_ptr qp, mp_srcptr up, mp_size_t un,
                   struct precomp_div_1_pi2 *pd)
{
  mp_limb_t most_significant_q_limb;
  mp_size_t i;
  mp_limb_t r, u2, u1, u0;
  mp_limb_t d0, di1, di0;
  mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
  mp_limb_t cnd;

  ASSERT (un >= 2);
  ASSERT ((pd->d & GMP_NUMB_HIGHBIT) != 0);
  ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
  ASSERT_MPN (up, un);

#define q3 q3a
#define q2 q2b
#define q1 q1b

  up += un - 3;
  r = up[2];
  d0 = pd->d;
  most_significant_q_limb = (r >= d0);
  r -= d0 & -most_significant_q_limb;

  qp += un - 3;
  qp[2] = most_significant_q_limb;

  di1 = pd->dip[1];
  di0 = pd->dip[0];

  for (i = un - 3; i >= 0; i -= 2)
    {
      u2 = r;
      u1 = up[1];
      u0 = up[0];

      /* Dividend in {r,u1,u0} */

      umul_ppmm (q1d,q0d, u1, di0);
      umul_ppmm (q2b,q1b, u1, di1);
      q2b++;				/* cannot spill */
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);

      umul_ppmm (q2c,q1c, u2, di0);
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
      umul_ppmm (q3a,q2a, u2, di1);

      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);
      q3 += r;

      r = u0 - q2 * d0;

      cnd = (r >= q1);
      r += d0 & -cnd;
      sub_ddmmss (q3,q2, q3,q2, 0,cnd);

      if (UNLIKELY (r >= d0))
        {
          r -= d0;
          add_ssaaaa (q3,q2, q3,q2, 0,1);
        }

      qp[0] = q2;
      qp[1] = q3;

      up -= 2;
      qp -= 2;
    }

  if ((un & 1) == 0)
    {
      u2 = r;
      u1 = up[1];

      udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
      qp[1] = q3;
    }

  return r;

#undef q3
#undef q2
#undef q1
}
/* (rp, 2n) = (xp, n)*(yp, n) */
static void
mpn_mulshort_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
{
  mp_size_t m;
  mp_limb_t t;
  mp_ptr rpn2;

  ASSERT (n >= 1);
  ASSERT_MPN (xp, n);
  ASSERT_MPN (yp, n);
  ASSERT (!MPN_OVERLAP_P (rp, 2 * n, xp, n));
  ASSERT (!MPN_OVERLAP_P (rp, 2 * n, yp, n));

  if (BELOW_THRESHOLD (n, MULHIGH_BASECASE_THRESHOLD))
    {
      mpn_mul_basecase (rp, xp, n, yp, n);
      return;
    }

  if (BELOW_THRESHOLD (n, MULHIGH_DC_THRESHOLD))
    {
      mpn_mulshort_n_basecase (rp, xp, yp, n);
      return;
    }

  /* choose optimal m s.t. n + 2 <= 2m, m < n */
  ASSERT (n >= 4);
  m = 87 * n / 128;
  if (2 * m < n + 2)
    m = (n + 1) / 2 + 1;
  if (m >= n)
    m = n - 1;
  ASSERT (n + 2 <= 2 * m);
  ASSERT (m < n);

  rpn2 = rp + n - 2;
  mpn_mul_n (rp + n - m + n - m, xp + n - m, yp + n - m, m);
  mpn_mulshort_n (rp, xp, yp + m, n - m);
  ASSERT_NOCARRY (mpn_add (rpn2, rpn2, n + 2, rpn2 - m, n - m + 2));
  mpn_mulshort_n (rp, xp + m, yp, n - m);
  ASSERT_NOCARRY (mpn_add (rpn2, rpn2, n + 2, rpn2 - m, n - m + 2));
  umul_ppmm (rp[1], t, xp[m - 1], yp[n - m - 1] << GMP_NAIL_BITS);
  rp[0] = t >> GMP_NAIL_BITS;
  ASSERT_NOCARRY (mpn_add (rpn2, rpn2, n + 2, rp, 2));
  umul_ppmm (rp[1], t, xp[n - m - 1], yp[m - 1] << GMP_NAIL_BITS);
  rp[0] = t >> GMP_NAIL_BITS;
  ASSERT_NOCARRY (mpn_add (rpn2, rpn2, n + 2, rp, 2));

  return;
}
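/* Sketch of the split rule above: m starts at 87n/128 (about 0.68 n) and
   is then clamped so that both n + 2 <= 2m and m < n hold, which the two
   ASSERTs re-check.  Standalone verification over a few sizes: */
#include <stdio.h>

int
main (void)
{
  long n;
  for (n = 4; n <= 64; n *= 2)
    {
      long m = 87 * n / 128;
      if (2 * m < n + 2)
        m = (n + 1) / 2 + 1;
      if (m >= n)
        m = n - 1;
      printf ("n=%ld m=%ld ok=%d\n", n, m, 2 * m >= n + 2 && m < n);
    }
  return 0;
}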
void
mpn_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un,
                  mp_srcptr vp, mp_size_t vn)
{
  ASSERT (un >= vn);
  ASSERT (vn >= 1);
  ASSERT (! MPN_OVERLAP_P (rp, un+vn, up, un));
  ASSERT (! MPN_OVERLAP_P (rp, un+vn, vp, vn));

  /* We first multiply by the low order limb (or depending on optional
     function availability, limbs).  This result can be stored, not added,
     to rp.  We also avoid a loop for zeroing this way.  */

#if HAVE_NATIVE_mpn_mul_2
  if (vn >= 2)
    {
      rp[un + 1] = mpn_mul_2 (rp, up, un, vp);
      rp += 2, vp += 2, vn -= 2;
    }
  else
    {
      rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
      return;
    }
#else
  rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
  rp += 1, vp += 1, vn -= 1;
#endif

  /* Now accumulate the product of up[] and the next low-order limb (or
     depending on optional function availability, limbs) from vp[0].  */

#define MAX_LEFT MP_SIZE_T_MAX

#if HAVE_NATIVE_mpn_addmul_4
  while (vn >= 4)
    {
      rp[un + 4 - 1] = mpn_addmul_4 (rp, up, un, vp);
      rp += 4, vp += 4, vn -= 4;
    }
#undef MAX_LEFT
#define MAX_LEFT 3
#endif

#if HAVE_NATIVE_mpn_addmul_3
  while (vn >= 3)
    {
      rp[un + 3 - 1] = mpn_addmul_3 (rp, up, un, vp);
      rp += 3, vp += 3, vn -= 3;
      if (MAX_LEFT - 3 <= 3)
        break;
    }
#undef MAX_LEFT
#define MAX_LEFT 2
#endif

#if HAVE_NATIVE_mpn_addmul_2
  while (vn >= 2)
    {
      rp[un + 2 - 1] = mpn_addmul_2 (rp, up, un, vp);
      rp += 2, vp += 2, vn -= 2;
      if (MAX_LEFT - 2 <= 2)
        break;
    }
#undef MAX_LEFT
#define MAX_LEFT 1
#endif

  while (vn >= 1)
    {
      rp[un] = mpn_addmul_1 (rp, up, un, vp[0]);
      rp += 1, vp += 1, vn -= 1;
      if (MAX_LEFT - 1 <= 1)
        break;
    }
}
mp_limb_t
mpn_mul (mp_ptr prodp, mp_srcptr up, mp_size_t un,
         mp_srcptr vp, mp_size_t vn)
{
  mp_size_t l, k;
  mp_limb_t c;

  ASSERT (un >= vn);
  ASSERT (vn >= 1);
  ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
  ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));

  if (un == vn)
    {
      if (up == vp)
        {
          mpn_sqr (prodp, up, un);
          return prodp[2 * un - 1];
        }
      else
        {
          mpn_mul_n (prodp, up, vp, un);
          return prodp[2 * un - 1];
        }
    }

  if (vn < MUL_KARATSUBA_THRESHOLD)
    { /* plain schoolbook multiplication */
      if (un <= MUL_BASECASE_MAX_UN)
        mpn_mul_basecase (prodp, up, un, vp, vn);
      else
        {
          /* We have un >> MUL_BASECASE_MAX_UN > vn.  For better memory
             locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
             these pieces with the vp[] operand.  After each such partial
             multiplication (but the last) we copy the most significant vn
             limbs into a temporary buffer since that part would otherwise be
             overwritten by the next multiplication.  After the next
             multiplication, we add it back.  This illustrates the situation:

                -->vn<--
               |  |<------- un ------->|
                  _____________________|
                 X                    /|
                /XX__________________/ |
                _____________________  |
               X                    /  |
              /XX__________________/   |
              _____________________    |
             /                    /    |
            /____________________/     |
            ==================================================================

             The parts marked with X are the parts whose sums are copied into
             the temporary buffer.  */

          mp_limb_t tp[MUL_KARATSUBA_THRESHOLD_LIMIT];
          mp_limb_t cy;
          ASSERT (MUL_KARATSUBA_THRESHOLD <= MUL_KARATSUBA_THRESHOLD_LIMIT);

          mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
          prodp += MUL_BASECASE_MAX_UN;
          MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
          up += MUL_BASECASE_MAX_UN;
          un -= MUL_BASECASE_MAX_UN;
          while (un > MUL_BASECASE_MAX_UN)
            {
              mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
              cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
              mpn_incr_u (prodp + vn, cy);	/* safe? */
              prodp += MUL_BASECASE_MAX_UN;
              MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
              up += MUL_BASECASE_MAX_UN;
              un -= MUL_BASECASE_MAX_UN;
            }
          if (un > vn)
            {
              mpn_mul_basecase (prodp, up, un, vp, vn);
            }
          else
            {
              ASSERT_ALWAYS (un > 0);
              mpn_mul_basecase (prodp, vp, vn, up, un);
            }
          cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
          mpn_incr_u (prodp + vn, cy);		/* safe? */
        }
      return prodp[un + vn - 1];
    }

  if (ABOVE_THRESHOLD (un + vn, 2*MUL_FFT_FULL_THRESHOLD)
      && ABOVE_THRESHOLD (3*vn, MUL_FFT_FULL_THRESHOLD))
    {
      mpn_mul_fft_main (prodp, up, un, vp, vn);
      return prodp[un + vn - 1];
    }

  k = (un + 3)/4;		/* ceil(un/4) */

#if GMP_NUMB_BITS == 32
  if ((ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM8H_THRESHOLD))
      && (vn >= 86) && (5*un <= 11*vn))
#else
  if ((ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM8H_THRESHOLD))
      && (vn >= 86) && (4*un <= 13*vn))
#endif
    {
      mpn_toom8h_mul (prodp, up, un, vp, vn);
      return prodp[un + vn - 1];
    }

  if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM4_THRESHOLD))
    {
      if (vn > 3*k)
        {
          mpn_toom4_mul (prodp, up, un, vp, vn);
          return prodp[un + vn - 1];
        }
      else
        {
          l = (un + 4)/5;	/* ceil(un/5) */
          if ((((vn > 9*k/4) && (un + vn <= 6*MUL_TOOM4_THRESHOLD))
               || ((vn > 2*l) && (un + vn > 6*MUL_TOOM4_THRESHOLD)))
              && (vn <= 3*l))
            {
              mpn_toom53_mul (prodp, up, un, vp, vn);
              return prodp[un + vn - 1];
            }
        }
    }

  if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM3_THRESHOLD) && (vn > k))
    {
      mp_ptr ws;
      TMP_DECL;
      TMP_MARK;

      if (vn < 2*k)		/* un/2 >= vn > un/4 */
        {
          ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE (un));
          mpn_toom42_mul (prodp, up, un, vp, vn, ws);
          TMP_FREE;
          return prodp[un + vn - 1];
        }

      l = (un + 2)/3;		/* ceil(un/3) */
      if (vn > 2*l)		/* un >= vn > 2un/3 */
        {
          ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE (un));
          mpn_toom3_mul (prodp, up, un, vp, vn, ws);
          TMP_FREE;
          return prodp[un + vn - 1];
        }
      else			/* 2un/3 >= vn > un/3 */
        {
          ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE (un));
          mpn_toom32_mul (prodp, up, un, vp, vn, ws);
          TMP_FREE;
          return prodp[un + vn - 1];
        }
    }

  mpn_mul_n (prodp, up, vp, vn);

  if (un != vn)
    {
      mp_limb_t t;
      mp_ptr ws;
      TMP_DECL;
      TMP_MARK;

      prodp += vn;
      l = vn;
      up += vn;
      un -= vn;

      if (un < vn)
        {
          /* Swap u's and v's. */
          MPN_SRCPTR_SWAP (up,un, vp,vn);
        }

      ws = TMP_ALLOC_LIMBS ((vn >= MUL_KARATSUBA_THRESHOLD ? vn : un) + vn);

      t = 0;
      while (vn >= MUL_KARATSUBA_THRESHOLD)
        {
          mpn_mul_n (ws, up, vp, vn);
          if (l <= 2*vn)
            {
              t += mpn_add_n (prodp, prodp, ws, l);
              if (l != 2*vn)
                {
                  t = mpn_add_1 (prodp + l, ws + l, 2*vn - l, t);
                  l = 2*vn;
                }
            }
          else
            {
              c = mpn_add_n (prodp, prodp, ws, 2*vn);
              t += mpn_add_1 (prodp + 2*vn, prodp + 2*vn, l - 2*vn, c);
            }
          prodp += vn;
          l -= vn;
          up += vn;
          un -= vn;
          if (un < vn)
            {
              /* Swap u's and v's. */
              MPN_SRCPTR_SWAP (up,un, vp,vn);
            }
        }

      if (vn != 0)
        {
          mpn_mul_basecase (ws, up, un, vp, vn);
          if (l <= un + vn)
            {
              t += mpn_add_n (prodp, prodp, ws, l);
              if (l != un + vn)
                t = mpn_add_1 (prodp + l, ws + l, un + vn - l, t);
            }
          else
            {
              c = mpn_add_n (prodp, prodp, ws, un + vn);
              t += mpn_add_1 (prodp + un + vn, prodp + un + vn, l - un - vn, c);
            }
        }

      TMP_FREE;
    }

  return prodp[un + vn - 1];
}
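/* Usage sketch for the entry point above, with small illustrative sizes:
   mpn_mul requires un >= vn and returns the most significant limb of the
   (un + vn)-limb product. */
#include <gmp.h>
#include <stdio.h>

int
main (void)
{
  mp_limb_t u[3] = { 5, 6, 7 };
  mp_limb_t v[2] = { 8, 9 };
  mp_limb_t p[5];

  mp_limb_t top = mpn_mul (p, u, 3, v, 2);
  printf ("top = %lu, p[0] = %lu\n",
          (unsigned long) top, (unsigned long) p[0]);	/* p[0] = 40 */
  return 0;
}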
mp_limb_t
mpn_bdivmod (mp_ptr qp, mp_ptr up, mp_size_t usize,
             mp_srcptr vp, mp_size_t vsize, unsigned long int d)
{
  mp_limb_t v_inv;

  ASSERT (usize >= 1);
  ASSERT (vsize >= 1);
  ASSERT (usize * GMP_NUMB_BITS >= d);
  ASSERT (! MPN_OVERLAP_P (up, usize, vp, vsize));
  ASSERT (! MPN_OVERLAP_P (qp, d/GMP_NUMB_BITS, vp, vsize));
  ASSERT (MPN_SAME_OR_INCR2_P (qp, d/GMP_NUMB_BITS, up, usize));
  ASSERT_MPN (up, usize);
  ASSERT_MPN (vp, vsize);

  /* 1/V mod 2^GMP_NUMB_BITS. */
  binvert_limb (v_inv, vp[0]);

  /* Fast code for two cases previously used by the accel part of mpn_gcd.
     (Could probably remove this now it's inlined there.) */
  if (usize == 2 && vsize == 2
      && (d == GMP_NUMB_BITS || d == 2*GMP_NUMB_BITS))
    {
      mp_limb_t hi, lo;
      mp_limb_t q = (up[0] * v_inv) & GMP_NUMB_MASK;

      umul_ppmm (hi, lo, q, vp[0] << GMP_NAIL_BITS);
      up[0] = 0;
      up[1] -= hi + q*vp[1];
      qp[0] = q;
      if (d == 2*GMP_NUMB_BITS)
        {
          q = (up[1] * v_inv) & GMP_NUMB_MASK;
          up[1] = 0;
          qp[1] = q;
        }
      return 0;
    }

  /* Main loop.  */
  while (d >= GMP_NUMB_BITS)
    {
      mp_limb_t q = (up[0] * v_inv) & GMP_NUMB_MASK;
      mp_limb_t b = mpn_submul_1 (up, vp, MIN (usize, vsize), q);

      if (usize > vsize)
        mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
      d -= GMP_NUMB_BITS;
      up += 1, usize -= 1;
      *qp++ = q;
    }

  if (d)
    {
      mp_limb_t b;
      mp_limb_t q = (up[0] * v_inv) & (((mp_limb_t) 1 << d) - 1);

      if (q <= 1)
        {
          if (q == 0)
            return 0;
          else
            b = mpn_sub_n (up, up, vp, MIN (usize, vsize));
        }
      else
        b = mpn_submul_1 (up, vp, MIN (usize, vsize), q);

      if (usize > vsize)
        mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
      return q;
    }

  return 0;
}
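/* binvert_limb computes v^{-1} mod B for odd v; a standalone sketch of the
   same Newton/Hensel iteration (each step doubles the number of correct
   low bits), assuming a 64-bit limb for concreteness: */
#include <stdint.h>
#include <stdio.h>

static uint64_t
binvert64 (uint64_t v)
{
  uint64_t inv = v;		/* correct to 3 bits: v*v == 1 mod 8 for odd v */
  int i;
  for (i = 0; i < 5; i++)	/* 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits */
    inv *= 2 - v * inv;
  return inv;
}

int
main (void)
{
  uint64_t v = 0x123456789ABCDEF1ULL;	/* any odd value */
  printf ("v * v^-1 mod 2^64 = %llu\n",
          (unsigned long long) (v * binvert64 (v)));	/* 1 */
  return 0;
}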
/* (rp, 2n) = (xp, n)*(yp, n) / B^n */
inline static void
mpn_mulshort_n_basecase (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
{
  mp_size_t i, k;

  ASSERT (n >= 3);  /* this restriction doesn't make a lot of sense in general */
  ASSERT_MPN (xp, n);
  ASSERT_MPN (yp, n);
  ASSERT (!MPN_OVERLAP_P (rp, 2 * n, xp, n));
  ASSERT (!MPN_OVERLAP_P (rp, 2 * n, yp, n));

  k = n - 2;  /* so want short product sum_(i + j >= k) x[i]y[j]B^(i + j) */
  i = 0;

  /* Multiply w limbs from y + i to (2 + i + w - 1) limbs from
     x + (n - 2 - i - w + 1) and put it into r + (n - 2 - w + 1), "overflow"
     (i.e. last) limb into r + (n + w - 1) for i between 0 and n - 2.
     i == n - w needs special treatment. */

  /* We first multiply by the low order limb (or depending on optional
     function availability, limbs).  This result can be stored, not added,
     to rp.  We also avoid a loop for zeroing this way.  */

#if HAVE_NATIVE_mpn_mul_2
  rp[n + 1] = mpn_mul_2 (rp + k - 1, xp + k - 1, 2 + 1, yp);
  i += 2;
#else
  rp[n] = mpn_mul_1 (rp + k, xp + k, 2, yp[0]);
  i += 1;
#endif

#if HAVE_NATIVE_mpn_addmul_6
  while (i < n - 6)
    {
      rp[n + i + 6 - 1] = mpn_addmul_6 (rp + k - 6 + 1, xp + k - i - 6 + 1,
                                        2 + i + 6 - 1, yp + i);
      i += 6;
    }
  if (i == n - 6)
    {
      rp[n + n - 1] = mpn_addmul_6 (rp + i, xp, n, yp + i);
      return;
    }
#endif

#if HAVE_NATIVE_mpn_addmul_5
  while (i < n - 5)
    {
      rp[n + i + 5 - 1] = mpn_addmul_5 (rp + k - 5 + 1, xp + k - i - 5 + 1,
                                        2 + i + 5 - 1, yp + i);
      i += 5;
    }
  if (i == n - 5)
    {
      rp[n + n - 1] = mpn_addmul_5 (rp + i, xp, n, yp + i);
      return;
    }
#endif

#if HAVE_NATIVE_mpn_addmul_4
  while (i < n - 4)
    {
      rp[n + i + 4 - 1] = mpn_addmul_4 (rp + k - 4 + 1, xp + k - i - 4 + 1,
                                        2 + i + 4 - 1, yp + i);
      i += 4;
    }
  if (i == n - 4)
    {
      rp[n + n - 1] = mpn_addmul_4 (rp + i, xp, n, yp + i);
      return;
    }
#endif

#if HAVE_NATIVE_mpn_addmul_3
  while (i < n - 3)
    {
      rp[n + i + 3 - 1] = mpn_addmul_3 (rp + k - 3 + 1, xp + k - i - 3 + 1,
                                        2 + i + 3 - 1, yp + i);
      i += 3;
    }
  if (i == n - 3)
    {
      rp[n + n - 1] = mpn_addmul_3 (rp + i, xp, n, yp + i);
      return;
    }
#endif

#if HAVE_NATIVE_mpn_addmul_2
  while (i < n - 2)
    {
      rp[n + i + 2 - 1] = mpn_addmul_2 (rp + k - 2 + 1, xp + k - i - 2 + 1,
                                        2 + i + 2 - 1, yp + i);
      i += 2;
    }
  if (i == n - 2)
    {
      rp[n + n - 1] = mpn_addmul_2 (rp + i, xp, n, yp + i);
      return;
    }
#endif

  while (i < n - 1)
    {
      rp[n + i] = mpn_addmul_1 (rp + k, xp + k - i, 2 + i, yp[i]);
      i += 1;
    }
  rp[n + n - 1] = mpn_addmul_1 (rp + i, xp, n, yp[i]);
  return;
}
/* Computes {np, n} / {dp, n} mod B^n, using divide-and-conquer algorithm,
   switching to classical for n <= DC_BDIV_Q_THRESHOLD.  Also computes a
   2 limb "overflow".  See sb_bdiv_q.c for a definition.

   scratch is workspace. */
void
mpn_dc_bdiv_q_n (mp_ptr qp, mp_ptr wp, mp_ptr np, mp_srcptr dp,
                 mp_size_t n, mp_limb_t dinv, mp_ptr scratch)
{
  mp_size_t s, t;
  mp_limb_t cy;

  ASSERT (n >= 6);
  ASSERT (! MPN_OVERLAP_P (qp, n, np, n));
  ASSERT (! MPN_OVERLAP_P (qp, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (wp, 2, np, n));
  ASSERT (! MPN_OVERLAP_P (wp, 2, dp, n));
  ASSERT (! MPN_OVERLAP_P (np, n, dp, n));

  /* Example with s = 4, t = 3, n = 7:

          C C C C C C
     qp   . A B B B
          . A A B B B      1
          A A A B B B      0
          A A A A B B B    0
                           1
     ...  dp
  */

  t = n / 2;			/* t = floor(n/2) */
  s = n - t;			/* s = ceil(n/2) */

  /* recurse into low half of quotient (region A) */
  if (s <= DC_BDIV_Q_THRESHOLD)
    mpn_sb_bdiv_q (qp, wp, np, s, dp, s, dinv);
  else
    mpn_dc_bdiv_q_n (qp, wp, np, dp, s, dinv, scratch);

  /* remove region B and overflow from A from N (if n odd, do first row of B
     separately --- we could have used mpn_mulmid, but this saves some
     logic) */
  mpn_mulmid_n (scratch, dp + 1, qp + (n & 1), t);
  if (n & 1)
    {
      cy = mpn_addmul_1 (scratch, dp + s, t, qp[0]);
      MPN_INCR_U (scratch + t, 2, cy);
    }
  ADDC_LIMB (cy, scratch[0], scratch[0], wp[0]);	/* overflow from A */
  MPN_INCR_U (scratch + 1, t + 1, wp[1] + cy);
  cy = mpn_sub_n (np + s, np + s, scratch, t);
  MPN_INCR_U (scratch + t, 2, cy);

  /* recurse into top half of quotient (region C) (this does not overwrite
     {scratch + t, 2}, because n >= 6 implies t >= 3 implies
     floor(t/2) + 2 <= t) */
  if (t <= DC_BDIV_Q_THRESHOLD)
    mpn_sb_bdiv_q (qp + s, wp, np + s, t, dp, t, dinv);
  else
    mpn_dc_bdiv_q_n (qp + s, wp, np + s, dp, t, dinv, scratch);

  /* combine overflows from B and C */
  ADDC_LIMB (cy, wp[0], wp[0], scratch[t]);
  wp[1] += scratch[t + 1] + cy;
}
mp_limb_t
mpn_divrem (mp_ptr qp, mp_size_t qxn,
            mp_ptr np, mp_size_t nn,
            mp_srcptr dp, mp_size_t dn)
{
  ASSERT (qxn >= 0);
  ASSERT (nn >= dn);
  ASSERT (dn >= 1);
  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (np, nn, dp, dn));
  ASSERT (! MPN_OVERLAP_P (qp, nn-dn+qxn, np, nn) || qp==np+dn+qxn);
  ASSERT (! MPN_OVERLAP_P (qp, nn-dn+qxn, dp, dn));
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, dn);

  if (dn == 1)
    {
      mp_limb_t ret;
      mp_ptr q2p;
      mp_size_t qn;
      TMP_DECL;

      TMP_MARK;
      q2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB);

      np[0] = mpn_divrem_1 (q2p, qxn, np, nn, dp[0]);
      qn = nn + qxn - 1;
      MPN_COPY (qp, q2p, qn);
      ret = q2p[qn];

      TMP_FREE;
      return ret;
    }
  else if (dn == 2)
    {
      return mpn_divrem_2 (qp, qxn, np, nn, dp);
    }
  else
    {
      mp_ptr rp, q2p;
      mp_limb_t qhl;
      mp_size_t qn;
      TMP_DECL;

      TMP_MARK;
      if (UNLIKELY (qxn != 0))
        {
          mp_ptr n2p;

          n2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB);
          MPN_ZERO (n2p, qxn);
          MPN_COPY (n2p + qxn, np, nn);
          q2p = (mp_ptr) TMP_ALLOC ((nn - dn + qxn + 1) * BYTES_PER_MP_LIMB);
          rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
          mpn_tdiv_qr (q2p, rp, 0L, n2p, nn + qxn, dp, dn);
          MPN_COPY (np, rp, dn);
          qn = nn - dn + qxn;
          MPN_COPY (qp, q2p, qn);
          qhl = q2p[qn];
        }
      else
        {
          q2p = (mp_ptr) TMP_ALLOC ((nn - dn + 1) * BYTES_PER_MP_LIMB);
          rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
          mpn_tdiv_qr (q2p, rp, 0L, np, nn, dp, dn);
          MPN_COPY (np, rp, dn);	/* overwrite np area with remainder */
          qn = nn - dn;
          MPN_COPY (qp, q2p, qn);
          qhl = q2p[qn];
        }
      TMP_FREE;
      return qhl;
    }
}
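/* Usage sketch of the public mpn_tdiv_qr that the function above defers to
   (qxn must be 0): divide 4 limbs by 2 limbs, yielding a 3-limb quotient
   and a 2-limb remainder. */
#include <gmp.h>
#include <stdio.h>

int
main (void)
{
  mp_limb_t np[4] = { 1, 2, 3, 4 };
  mp_limb_t dp[2] = { 5, 6 };	/* dp[1] != 0 as required */
  mp_limb_t qp[3], rp[2];

  mpn_tdiv_qr (qp, rp, 0, np, 4, dp, 2);
  printf ("q[0] = %lu, r[0] = %lu\n",
          (unsigned long) qp[0], (unsigned long) rp[0]);
  return 0;
}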
void
mpn_sqr (mp_ptr p, mp_srcptr a, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));

  if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
    { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */
      mpn_mul_basecase (p, a, n, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))
    {
      mpn_sqr_basecase (p, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast! */
      mp_limb_t ws[mpn_toom2_sqr_itch (SQR_TOOM3_THRESHOLD_LIMIT-1)];
      ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT);
      mpn_toom2_sqr (p, a, n, ws);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (mpn_toom3_sqr_itch (n));
      mpn_toom3_sqr (p, a, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD))
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (mpn_toom4_sqr_itch (n));
      mpn_toom4_sqr (p, a, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD))
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (mpn_toom6_sqr_itch (n));
      mpn_toom6_sqr (p, a, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, SQR_FFT_THRESHOLD))
    {
      mp_ptr ws;
      TMP_DECL;
      TMP_MARK;
      ws = TMP_ALLOC_LIMBS (mpn_toom8_sqr_itch (n));
      mpn_toom8_sqr (p, a, n, ws);
      TMP_FREE;
    }
  else
    {
      /* The current FFT code allocates its own space.  That should probably
         change.  */
      mpn_fft_mul (p, a, n, a, n);
    }
}
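/* Public usage sketch: mpn_sqr computes the 2n-limb square of an n-limb
   operand.  Here (10 + B)^2 = 100 + 20 B + B^2, so the low three limbs
   come out 100, 20, 1. */
#include <gmp.h>
#include <stdio.h>

int
main (void)
{
  mp_limb_t a[2] = { 10, 1 };
  mp_limb_t p[4];

  mpn_sqr (p, a, 2);
  printf ("%lu %lu %lu\n", (unsigned long) p[0],
          (unsigned long) p[1], (unsigned long) p[2]);
  return 0;
}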
/* ret + (xp, n) = (yp, n)*(zp, n) % 2^b + 1
   needs (tp, 2n) temp space, everything reduced mod 2^b
   inputs, outputs are fully reduced
   N.B: 2n is not the same as 2b rounded up to nearest limb! */
inline static int
mpn_mulmod_2expp1_internal (mp_ptr xp, mp_srcptr yp, mp_srcptr zp,
                            mpir_ui b, mp_ptr tp)
{
  mp_size_t n, k;
  mp_limb_t c;
  TMP_DECL;

  n = BITS_TO_LIMBS (b);
  k = GMP_NUMB_BITS * n - b;

  ASSERT (b > 0);
  ASSERT (n > 0);
  ASSERT_MPN (yp, n);
  ASSERT_MPN (zp, n);
  ASSERT (!MPN_OVERLAP_P (tp, 2 * n, yp, n));
  ASSERT (!MPN_OVERLAP_P (tp, 2 * n, zp, n));
  ASSERT (MPN_SAME_OR_SEPARATE_P (xp, tp, n));
  ASSERT (MPN_SAME_OR_SEPARATE_P (xp, tp + n, n));
  ASSERT (k == 0 || yp[n - 1] >> (GMP_NUMB_BITS - k) == 0);
  ASSERT (k == 0 || zp[n - 1] >> (GMP_NUMB_BITS - k) == 0);

#ifndef TUNE_PROGRAM_BUILD
  if (k == 0 && n > FFT_MULMOD_2EXPP1_CUTOFF && n == mpir_fft_adjust_limbs (n))
    {
      mp_bitcnt_t depth1, depth = 1;
      mp_size_t w1, off;
      mp_ptr tx, ty, tz;
      mp_limb_t ret;

      TMP_MARK;

      tx = TMP_BALLOC_LIMBS (3*n + 3);
      ty = tx + n + 1;
      tz = ty + n + 1;

      MPN_COPY (ty, yp, n);
      MPN_COPY (tz, zp, n);
      ty[n] = 0;
      tz[n] = 0;

      while ((((mp_limb_t) 1) << depth) < b)
        depth++;

      if (depth < 12)
        off = mulmod_2expp1_table_n[0];
      else
        off = mulmod_2expp1_table_n[MIN (depth, FFT_N_NUM + 11) - 12];
      depth1 = depth/2 - off;

      w1 = b / (((mp_limb_t) 1) << (2*depth1));

      mpir_fft_mulmod_2expp1 (tx, ty, tz, n, depth1, w1);

      MPN_COPY (xp, tx, n);
      ret = tx[n];

      TMP_FREE;
      return ret;
    }
#endif

  if (yp == zp)
    mpn_sqr (tp, yp, n);
  else
    mpn_mul_n (tp, yp, zp, n);

  if (k == 0)
    {
      /* 2^b = -1 mod 2^b + 1, so subtract the high half from the low */
      c = mpn_sub_n (xp, tp, tp + n, n);
      return mpn_add_1 (xp, xp, n, c);
    }

  c = tp[n - 1];
  tp[n - 1] &= GMP_NUMB_MASK >> k;

#if HAVE_NATIVE_mpn_sublsh_nc
  c = mpn_sublsh_nc (xp, tp, tp + n, n, k, c);
#else
  {
    mp_limb_t c1;
    c1 = mpn_lshift (tp + n, tp + n, n, k);
    tp[n] |= c >> (GMP_NUMB_BITS - k);
    c = mpn_sub_n (xp, tp, tp + n, n) + c1;
  }
#endif

  c = mpn_add_1 (xp, xp, n, c);
  xp[n - 1] &= GMP_NUMB_MASK >> k;

  return c;
}
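/* Sketch of the wraparound used above: modulo 2^b + 1 we have 2^b = -1, so
   a double-width product t = t1 * 2^b + t0 reduces to t0 - t1.  Small demo
   with b = 8 (m = 257): */
#include <stdio.h>

int
main (void)
{
  unsigned m = (1u << 8) + 1;		/* 2^b + 1 with b = 8 */
  unsigned y = 200, z = 150;
  unsigned t = y * z;
  unsigned t0 = t & 0xFF, t1 = t >> 8;	/* here t1 < m always holds */
  unsigned r = (t0 + m - t1) % m;	/* t0 - t1 mod m */

  printf ("%u vs %u\n", r, t % m);	/* both 188 */
  return 0;
}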
void
mpn_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un,
                  mp_srcptr vp, mp_size_t vn)
{
  ASSERT (un >= vn);
  ASSERT (vn >= 1);
  ASSERT (! MPN_OVERLAP_P (rp, un+vn, up, un));
  ASSERT (! MPN_OVERLAP_P (rp, un+vn, vp, vn));

  /* We first multiply by the low order limb (or depending on optional
     function availability, limbs).  This result can be stored, not added,
     to rp.  We also avoid a loop for zeroing this way.  */

#ifdef HAVE_NATIVE_mpn_mul_2
  if (vn >= 2)
    {
      rp[un + 1] = mpn_mul_2 (rp, up, un, vp);
      rp += 2, vp += 2, vn -= 2;
    }
  else
    {
      rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
      return;
    }
#else
  rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
  rp += 1, vp += 1, vn -= 1;
#endif

  /* Now accumulate the product of up[] and the next higher limb (or
     depending on optional function availability, limbs) from vp[].  */

#define MAX_LEFT MP_SIZE_T_MAX	/* Used to simplify loops into if statements */

#ifdef HAVE_NATIVE_mpn_addmul_6
  while (vn >= 6)
    {
      rp[un + 6 - 1] = mpn_addmul_6 (rp, up, un, vp);
      if (MAX_LEFT == 6)
        return;
      rp += 6, vp += 6, vn -= 6;
      if (MAX_LEFT < 2 * 6)
        break;
    }
#undef MAX_LEFT
#define MAX_LEFT (6 - 1)
#endif

#ifdef HAVE_NATIVE_mpn_addmul_5
  while (vn >= 5)
    {
      rp[un + 5 - 1] = mpn_addmul_5 (rp, up, un, vp);
      if (MAX_LEFT == 5)
        return;
      rp += 5, vp += 5, vn -= 5;
      if (MAX_LEFT < 2 * 5)
        break;
    }
#undef MAX_LEFT
#define MAX_LEFT (5 - 1)
#endif

#ifdef HAVE_NATIVE_mpn_addmul_4
  while (vn >= 4)
    {
      rp[un + 4 - 1] = mpn_addmul_4 (rp, up, un, vp);
      if (MAX_LEFT == 4)
        return;
      rp += 4, vp += 4, vn -= 4;
      if (MAX_LEFT < 2 * 4)
        break;
    }
#undef MAX_LEFT
#define MAX_LEFT (4 - 1)
#endif

#ifdef HAVE_NATIVE_mpn_addmul_3
  while (vn >= 3)
    {
      rp[un + 3 - 1] = mpn_addmul_3 (rp, up, un, vp);
      if (MAX_LEFT == 3)
        return;
      rp += 3, vp += 3, vn -= 3;
      if (MAX_LEFT < 2 * 3)
        break;
    }
#undef MAX_LEFT
#define MAX_LEFT (3 - 1)
#endif

#ifdef HAVE_NATIVE_mpn_addmul_2
  while (vn >= 2)
    {
      rp[un + 2 - 1] = mpn_addmul_2 (rp, up, un, vp);
      if (MAX_LEFT == 2)
        return;
      rp += 2, vp += 2, vn -= 2;
      if (MAX_LEFT < 2 * 2)
        break;
    }
#undef MAX_LEFT
#define MAX_LEFT (2 - 1)
#endif

  while (vn >= 1)
    {
      rp[un] = mpn_addmul_1 (rp, up, un, vp[0]);
      if (MAX_LEFT == 1)
        return;
      rp += 1, vp += 1, vn -= 1;
    }
}
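/* The basecase structure in miniature, with the public mpn_mul_1 /
   mpn_addmul_1: the first row is stored, every later row is accumulated
   one limb position higher.  For u = 3 + 4B, v = 5 + 6B the product is
   15 + 38B + 24B^2. */
#include <gmp.h>
#include <stdio.h>

int
main (void)
{
  mp_limb_t u[2] = { 3, 4 };
  mp_limb_t v[2] = { 5, 6 };
  mp_limb_t r[4];

  r[2] = mpn_mul_1 (r, u, 2, v[0]);		/* r = u * v0 */
  r[3] = mpn_addmul_1 (r + 1, u, 2, v[1]);	/* r += u * v1 * B */
  printf ("r = %lu %lu %lu %lu\n",
          (unsigned long) r[0], (unsigned long) r[1],
          (unsigned long) r[2], (unsigned long) r[3]);	/* 15 38 24 0 */
  return 0;
}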