// A degree-k polynomial has k+1 coefficients; the first k coefficients have size n
// (the last has size m). k > 3, so we can do the first add unconditionally.
int
mpn_toom_eval_pm1 (mp_ptr pp, mp_ptr mp, unsigned int k, mp_srcptr xp,
                   mp_size_t n, mp_size_t m, mp_ptr tp)
{
  int isneg = 0;
  unsigned int i;

  ASSERT (k > 3);
  ASSERT (n >= m);
  ASSERT (m > 0);
  ASSERT_MPN (xp, n*k + m);
  //ASSERT_SPACE(pp, n+1); ASSERT_SPACE(mp, n+1); ASSERT_SPACE(tp, n+1);
  ASSERT (! MPN_OVERLAP_P (pp, n+1, mp, n+1));
  ASSERT (! MPN_OVERLAP_P (pp, n+1, xp, n*k + m));
  ASSERT (! MPN_OVERLAP_P (pp, n+1, tp, n+1));
  ASSERT (! MPN_OVERLAP_P (mp, n+1, xp, n*k + m));
  ASSERT (! MPN_OVERLAP_P (xp, n*k + m, tp, n+1));
#if ! HAVE_NATIVE_mpn_sumdiff_n
  ASSERT (! MPN_OVERLAP_P (mp, n+1, tp, n+1));
#endif

#if HAVE_NATIVE_mpn_addadd_n
  if (k == 4)
    {
      pp[n] = mpn_add_n (pp, xp, xp + 2*n, n);
      tp[n] = mpn_add_n (tp, xp + n, xp + 3*n, n);
    }
  else if (k == 5)
    {
      pp[n] = mpn_addadd_n (pp, xp, xp + 2*n, xp + 4*n, n);
      tp[n] = mpn_add_n (tp, xp + n, xp + 3*n, n);
    }
  else
    {
      pp[n] = mpn_addadd_n (pp, xp, xp + 2*n, xp + 4*n, n);
      tp[n] = mpn_addadd_n (tp, xp + n, xp + 3*n, xp + 5*n, n);
      for (i = 7; i < k - 2; i += 4)
        {
          pp[n] += mpn_addadd_n (pp, pp, xp + (i-1)*n, xp + (i+1)*n, n);
          tp[n] += mpn_addadd_n (tp, tp, xp + i*n, xp + (i+2)*n, n);
        }
      if (k % 4 == 3)
        pp[n] += mpn_add_n (pp, pp, xp + (k-1)*n, n);
      if (k % 4 == 0)
        {
          pp[n] += mpn_add_n (pp, pp, xp + (k-2)*n, n);
          tp[n] += mpn_add_n (tp, tp, xp + (k-1)*n, n);
        }
      if (k % 4 == 1)
        {
          pp[n] += mpn_addadd_n (pp, pp, xp + (k-3)*n, xp + (k-1)*n, n);
          tp[n] += mpn_add_n (tp, tp, xp + (k-2)*n, n);
        }
    }
  if (k % 2 == 0)
    pp[n] += mpn_add (pp, pp, n, xp + k*n, m);
  else
    tp[n] += mpn_add (tp, tp, n, xp + k*n, m);
#else
  // pp sums the even-index coefficients: xp+0, xp+2n, xp+4n, ..., xp+jn with j <= k-1
  // tp sums the odd-index coefficients:  xp+n, xp+3n, xp+5n, ..., xp+jn with j <= k-1
  pp[n] = mpn_add_n (pp, xp, xp + 2*n, n);
  tp[n] = mpn_add_n (tp, xp + n, xp + 3*n, n);
  for (i = 5; i < k; i += 2)
    {
      pp[n] += mpn_add_n (pp, pp, xp + (i-1)*n, n);
      tp[n] += mpn_add_n (tp, tp, xp + i*n, n);
    }
  if (k % 2 == 1)
    {
      pp[n] += mpn_add_n (pp, pp, xp + (k-1)*n, n);
      tp[n] += mpn_add (tp, tp, n, xp + k*n, m);
    }
  else
    pp[n] += mpn_add (pp, pp, n, xp + k*n, m);
#endif

  if (mpn_cmp (tp, pp, n + 1) > 0)
    isneg = -1;
#if HAVE_NATIVE_mpn_sumdiff_n
  if (isneg == 0)
    mpn_sumdiff_n (pp, mp, pp, tp, n + 1);
  else
    mpn_sumdiff_n (pp, mp, tp, pp, n + 1);
#else
  if (isneg == 0)
    mpn_sub_n (mp, pp, tp, n + 1);
  else
    mpn_sub_n (mp, tp, pp, n + 1);
  mpn_add_n (pp, pp, tp, n + 1);
#endif
  return isneg;
}
/* Evaluates a polynomial of degree k > 3, in the points +1 and -1. */ int mpn_toom_eval_pm1 (mp_ptr xp1, mp_ptr xm1, unsigned k, mp_srcptr xp, mp_size_t n, mp_size_t hn, mp_ptr tp) { unsigned i; int neg; ASSERT (k >= 4); ASSERT (hn > 0); ASSERT (hn <= n); /* The degree k is also the number of full-size coefficients, so * that last coefficient, of size hn, starts at xp + k*n. */ xp1[n] = mpn_add_n (xp1, xp, xp + 2*n, n); for (i = 4; i < k; i += 2) ASSERT_NOCARRY (mpn_add (xp1, xp1, n+1, xp+i*n, n)); tp[n] = mpn_add_n (tp, xp + n, xp + 3*n, n); for (i = 5; i < k; i += 2) ASSERT_NOCARRY (mpn_add (tp, tp, n+1, xp+i*n, n)); if (k & 1) ASSERT_NOCARRY (mpn_add (tp, tp, n+1, xp+k*n, hn)); else ASSERT_NOCARRY (mpn_add (xp1, xp1, n+1, xp+k*n, hn)); neg = (mpn_cmp (xp1, tp, n + 1) < 0) ? ~0 : 0; #if HAVE_NATIVE_mpn_add_n_sub_n if (neg) mpn_add_n_sub_n (xp1, xm1, tp, xp1, n + 1); else mpn_add_n_sub_n (xp1, xm1, xp1, tp, n + 1); #else if (neg) mpn_sub_n (xm1, tp, xp1, n + 1); else mpn_sub_n (xm1, xp1, tp, n + 1); mpn_add_n (xp1, xp1, tp, n + 1); #endif ASSERT (xp1[n] <= k); ASSERT (xm1[n] <= k/2 + 1); return neg; }
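/* Illustration only, not part of the sources above: what these +1/-1 evaluation routines compute, on a toy degree-4 split with one-limb coefficients. mpn_toom_eval_pm1 itself is internal to the library, so this sketch uses only public mpn_* calls from <gmp.h>. The even-indexed coefficients are summed into ev, the odd-indexed ones into od; then x(1) = ev + od and |x(-1)| = |ev - od|, with the sign returned as a separate flag, exactly as above. */
#include <stdio.h>
#include <gmp.h>

int main (void)
{
  mp_limb_t x[5] = { 5, 7, 11, 13, 17 };   /* made-up x0 .. x4 */
  mp_limb_t ev[2], od[2], xp1[2], xm1[2];
  int neg;

  ev[0] = x[0];                            /* ev = x0 + x2 + x4 */
  ev[1] = mpn_add_1 (ev, ev, 1, x[2]);
  ev[1] += mpn_add_1 (ev, ev, 1, x[4]);
  od[0] = x[1];                            /* od = x1 + x3 */
  od[1] = mpn_add_1 (od, od, 1, x[3]);

  neg = mpn_cmp (ev, od, 2) < 0;           /* would ev - od go negative? */
  mpn_add_n (xp1, ev, od, 2);              /* x(1) */
  if (neg)
    mpn_sub_n (xm1, od, ev, 2);            /* |x(-1)| */
  else
    mpn_sub_n (xm1, ev, od, 2);

  /* the values fit in one limb here */
  printf ("x(1) = %lu, |x(-1)| = %lu, neg = %d\n",
          (unsigned long) xp1[0], (unsigned long) xm1[0], neg);
  return 0;
}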
int mpn_toom_eval_dgr3_pm1 (mp_ptr xp1, mp_ptr xm1, mp_srcptr xp, mp_size_t n, mp_size_t x3n, mp_ptr tp) { int neg; ASSERT (x3n > 0); ASSERT (x3n <= n); xp1[n] = mpn_add_n (xp1, xp, xp + 2*n, n); tp[n] = mpn_add (tp, xp + n, n, xp + 3*n, x3n); neg = (mpn_cmp (xp1, tp, n + 1) < 0) ? ~0 : 0; #if HAVE_NATIVE_mpn_add_n_sub_n if (neg) mpn_add_n_sub_n (xp1, xm1, tp, xp1, n + 1); else mpn_add_n_sub_n (xp1, xm1, xp1, tp, n + 1); #else if (neg) mpn_sub_n (xm1, tp, xp1, n + 1); else mpn_sub_n (xm1, xp1, tp, n + 1); mpn_add_n (xp1, xp1, tp, n + 1); #endif ASSERT (xp1[n] <= 3); ASSERT (xm1[n] <= 1); return neg; }
void mpn_toom3_sqr (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_ptr scratch) { mp_size_t n, s; mp_limb_t cy, vinf0; mp_ptr gp; mp_ptr as1, asm1, as2; #define a0 ap #define a1 (ap + n) #define a2 (ap + 2*n) n = (an + 2) / (size_t) 3; s = an - 2 * n; ASSERT (0 < s && s <= n); as1 = scratch + 4 * n + 4; asm1 = scratch + 2 * n + 2; as2 = pp + n + 1; gp = scratch; /* Compute as1 and asm1. */ cy = mpn_add (gp, a0, n, a2, s); #if HAVE_NATIVE_mpn_add_n_sub_n if (cy == 0 && mpn_cmp (gp, a1, n) < 0) { cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n); as1[n] = cy >> 1; asm1[n] = 0; }
void _tc4_add_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n) { mp_limb_t cy; mp_size_t s1 = r1n; mp_size_t s2 = r2n; if (!s2) { if (!s1) *rn = 0; else { if (rp != r1) MPN_COPY(rp, r1, s1); *rn = r1n; } } else { *rn = r1n; cy = mpn_add(rp, r1, s1, r2, s2); if (cy) { rp[s1] = cy; if ((*rn) < 0) (*rn)--; else (*rn)++; } } }
void fmpz_add(fmpz_t coeffs_out, const fmpz_t in1, const fmpz_t in2) { fmpz_t coeffs1 = in1; fmpz_t coeffs2 = in2; long carry; unsigned long size1 = ABS(coeffs1[0]); unsigned long size2 = ABS(coeffs2[0]); if (size1 < size2) { SWAP_PTRS(coeffs1, coeffs2); size1 = ABS(coeffs1[0]); size2 = ABS(coeffs2[0]); } if (!size1) { if (!size2) coeffs_out[0] = 0L; else { if (coeffs_out != coeffs2) F_mpn_copy(coeffs_out, coeffs2, size2+1); } } else if (!size2) { if (coeffs_out != coeffs1) F_mpn_copy(coeffs_out, coeffs1, size1+1); } else if ((long) (coeffs1[0] ^ coeffs2[0]) >= 0L) { coeffs_out[0] = coeffs1[0]; carry = mpn_add(coeffs_out+1, coeffs1+1, size1, coeffs2+1, size2); if (carry) { coeffs_out[size1+1] = carry; if ((long) coeffs_out[0] < 0L) coeffs_out[0]--; else coeffs_out[0]++; } } else { carry = 0; if (size1 != size2) carry = 1; else carry = mpn_cmp(coeffs1+1, coeffs2+1, size1); if (carry == 0) coeffs_out[0] = 0L; else if (carry > 0) { mpn_sub(coeffs_out+1, coeffs1+1, size1, coeffs2+1, size2); coeffs_out[0] = coeffs1[0]; NORM(coeffs_out); } else { mpn_sub_n(coeffs_out+1, coeffs2+1, coeffs1+1, size1); coeffs_out[0] = -coeffs1[0]; NORM(coeffs_out); } } }
/* Needs n+1 limbs of temporary storage. */ int mpn_toom_eval_dgr3_pm2 (mp_ptr xp2, mp_ptr xm2, mp_srcptr xp, mp_size_t n, mp_size_t x3n, mp_ptr tp) { mp_limb_t cy; int neg; ASSERT (x3n > 0); ASSERT (x3n <= n); /* (x0 + 4 * x2) +/- (2 x1 + 8 x_3) */ #if HAVE_NATIVE_mpn_addlsh_n || HAVE_NATIVE_mpn_addlsh2_n #if HAVE_NATIVE_mpn_addlsh2_n xp2[n] = mpn_addlsh2_n (xp2, xp, xp + 2*n, n); cy = mpn_addlsh2_n (tp, xp + n, xp + 3*n, x3n); #else /* HAVE_NATIVE_mpn_addlsh_n */ xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2); cy = mpn_addlsh_n (tp, xp + n, xp + 3*n, x3n, 2); #endif if (x3n < n) cy = mpn_add_1 (tp + x3n, xp + n + x3n, n - x3n, cy); tp[n] = cy; #else cy = mpn_lshift (tp, xp + 2*n, n, 2); xp2[n] = cy + mpn_add_n (xp2, tp, xp, n); tp[x3n] = mpn_lshift (tp, xp + 3*n, x3n, 2); if (x3n < n) tp[n] = mpn_add (tp, xp + n, n, tp, x3n + 1); else tp[n] += mpn_add_n (tp, xp + n, tp, n); #endif mpn_lshift (tp, tp, n+1, 1); neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? ~0 : 0; #if HAVE_NATIVE_mpn_sumdiff_n if (neg) mpn_sumdiff_n (xp2, xm2, tp, xp2, n + 1); else mpn_sumdiff_n (xp2, xm2, xp2, tp, n + 1); #else if (neg) mpn_sub_n (xm2, tp, xp2, n + 1); else mpn_sub_n (xm2, xp2, tp, n + 1); mpn_add_n (xp2, xp2, tp, n + 1); #endif ASSERT (xp2[n] < 15); ASSERT (xm2[n] < 10); return neg; }
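/* Illustration only: the identity behind the routine above, checked in plain 64-bit arithmetic with made-up coefficients. For x(t) = x0 + x1 t + x2 t^2 + x3 t^3, both of x(2) and x(-2) come from one even part and one doubled odd part: (x0 + 4 x2) +/- (2 x1 + 8 x3). */
#include <assert.h>
#include <stdint.h>

static void check_pm2 (void)
{
  int64_t x0 = 9, x1 = 4, x2 = 7, x3 = 2;
  int64_t even = x0 + 4 * x2;               /* x0 + 4 x2 */
  int64_t odd2 = 2 * (x1 + 4 * x3);         /* 2 x1 + 8 x3, the final lshift by 1 */
  assert (even + odd2 == x0 + 2*x1 + 4*x2 + 8*x3);   /* x(2)  */
  assert (even - odd2 == x0 - 2*x1 + 4*x2 - 8*x3);   /* x(-2) */
}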
int main () { mp_limb_t nptr[2 * SIZE]; mp_limb_t dptr[2 * SIZE]; mp_limb_t qptr[2 * SIZE]; mp_limb_t pptr[2 * SIZE + 1]; mp_limb_t rptr[2 * SIZE]; mp_size_t nsize, dsize, qsize, rsize, psize; int test; mp_limb_t qlimb; for (test = 0; ; test++) { printf ("%d\n", test); #ifdef RANDOM nsize = random () % (2 * SIZE) + 1; dsize = random () % nsize + 1; #else nsize = 2 * SIZE; dsize = SIZE; #endif mpn_random2 (nptr, nsize); mpn_random2 (dptr, dsize); dptr[dsize - 1] |= (mp_limb_t) 1 << (GMP_LIMB_BITS - 1); MPN_COPY (rptr, nptr, nsize); qlimb = mpn_divrem (qptr, (mp_size_t) 0, rptr, nsize, dptr, dsize); rsize = dsize; qsize = nsize - dsize; qptr[qsize] = qlimb; qsize += qlimb; if (qsize == 0 || qsize > 2 * SIZE) { continue; /* bogus */ } else { mp_limb_t cy; if (qsize > dsize) mpn_mul (pptr, qptr, qsize, dptr, dsize); else mpn_mul (pptr, dptr, dsize, qptr, qsize); psize = qsize + dsize; psize -= pptr[psize - 1] == 0; cy = mpn_add (pptr, pptr, psize, rptr, rsize); pptr[psize] = cy; psize += cy; } if (nsize != psize || mpn_cmp (nptr, pptr, nsize) != 0) abort (); } }
void mpn_toom33_mul (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr scratch) { const int __gmpn_cpuvec_initialized = 1; mp_size_t n, s, t; int vm1_neg; mp_limb_t cy, vinf0; mp_ptr gp; mp_ptr as1, asm1, as2; mp_ptr bs1, bsm1, bs2; #define a0 ap #define a1 (ap + n) #define a2 (ap + 2*n) #define b0 bp #define b1 (bp + n) #define b2 (bp + 2*n) n = (an + 2) / (size_t) 3; s = an - 2 * n; t = bn - 2 * n; ASSERT (an >= bn); ASSERT (0 < s && s <= n); ASSERT (0 < t && t <= n); as1 = scratch + 4 * n + 4; asm1 = scratch + 2 * n + 2; as2 = pp + n + 1; bs1 = pp; bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */ bs2 = pp + 2 * n + 2; gp = scratch; vm1_neg = 0; /* Compute as1 and asm1. */ cy = mpn_add (gp, a0, n, a2, s); #if HAVE_NATIVE_mpn_add_n_sub_n if (cy == 0 && mpn_cmp (gp, a1, n) < 0) { cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n); as1[n] = cy >> 1; asm1[n] = 0; vm1_neg = 1; }
void mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch) { ASSERT (n > 0); ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT); ASSERT (! MPN_OVERLAP_P (ip, n, dp, n)); ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n))); ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n))); if (n == 1) invert_limb (*ip, *dp); else { TMP_DECL; TMP_MARK; if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD)) { /* Maximum scratch needed by this branch: 2*n */ mp_size_t i; mp_ptr xp; xp = scratch; /* 2 * n limbs */ for (i = n - 1; i >= 0; i--) xp[i] = GMP_NUMB_MAX; mpn_com (xp + n, dp, n); if (n == 2) { mpn_divrem_2 (ip, 0, xp, 4, dp); } else { gmp_pi1_t inv; invert_pi1 (inv, dp[n-1], dp[n-2]); /* FIXME: should we use dcpi1_div_q, for big sizes? */ mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32); } } else { /* Use approximated inverse; correct the result if needed. */ mp_limb_t e; /* The possible error in the approximate inverse */ ASSERT ( mpn_invert_itch (n) >= mpn_invertappr_itch (n) ); e = mpn_ni_invertappr (ip, dp, n, scratch); if (UNLIKELY (e)) { /* Assume the error can only be "0" (no error) or "1". */ /* Code to detect and correct the "off by one" approximation. */ mpn_mul_n (scratch, ip, dp, n); ASSERT_NOCARRY (mpn_add_n (scratch + n, scratch + n, dp, n)); if (! mpn_add (scratch, scratch, 2*n, dp, n)) MPN_INCR_U (ip, n, 1); /* The value was wrong, correct it. */ } } TMP_FREE; } }
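/* Illustration only: the quantity the n == 1 branch above obtains from invert_limb, and which the small-n branch rebuilds by dividing B^{2n} - 1 - D*B^n (all-ones low limbs, complemented high limbs) by D. For a normalized divisor d the reciprocal is v = floor((B^2 - 1)/d) - B, so that (B + v)*d <= B^2 - 1 < (B + v + 1)*d. A sketch assuming a compiler with unsigned __int128. */
#include <assert.h>
#include <stdint.h>

static uint64_t reciprocal (uint64_t d)
{
  /* {~d, ~0} is B^2 - 1 - d*B, so the quotient is floor((B^2-1)/d) - B,
     which always fits in one limb when d >= B/2. */
  unsigned __int128 num = ((unsigned __int128) ~d << 64) | ~(uint64_t) 0;
  assert (d >> 63);                /* d must be normalized */
  return (uint64_t) (num / d);
}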
void vectvectmul (vector *vect1, vector *vect2, int Tid) { int i; MPN_ZERO (temp_sum[Tid], 2*LIMBS+1); for (i=VStart[Tid]; i<=VEnd[Tid]; ++i) { mpn_mul_n (temp_prod[Tid], vect1[i], vect2[i], LIMBS); mpn_add (temp_sum[Tid], temp_sum[Tid], 2*LIMBS+1, temp_prod[Tid], 2*LIMBS); } return; }
void _tc4_add(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n) { mp_limb_t cy; mp_size_t s1 = ABS(r1n); mp_size_t s2 = ABS(r2n); if (!s1) { *rn = 0; } else if (!s2) { if (rp != r1) MPN_COPY(rp, r1, s1); *rn = r1n; } else if ((r1n ^ r2n) >= 0) { *rn = r1n; cy = mpn_add(rp, r1, s1, r2, s2); if (cy) { rp[s1] = cy; if ((*rn) < 0) (*rn)--; else (*rn)++; } } else { mp_size_t ct; if (s1 != s2) ct = 1; else MPN_CMP(ct, r1, r2, s1); if (!ct) *rn = 0; else if (ct > 0) { mpn_sub(rp, r1, s1, r2, s2); *rn = s1; MPN_NORMALIZE(rp, (*rn)); if (r1n < 0) *rn = -(*rn); } else { mpn_sub_n(rp, r2, r1, s1); *rn = s1; MPN_NORMALIZE(rp, (*rn)); if (r1n > 0) *rn = -(*rn); } } }
void gcdext_get_t(mp_ptr t, mp_size_t * tn, mp_ptr gp, mp_size_t gn, mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n, mp_ptr s, mp_size_t sn, mp_ptr tp) { mp_size_t ss = ABS(sn); mp_limb_t cy; if (ss >= an) mpn_mul(tp, s, ss, ap, an); else mpn_mul(tp, ap, an, s, ss); (*tn) = ss + an; (*tn) -= (tp[(*tn) - 1] == 0); /* We must have s*ap >= gp and we really want to compute -t */ if (sn > 0) { mpn_sub(tp, tp, *tn, gp, gn); MPN_NORMALIZE(tp, (*tn)); } else { cy = mpn_add(tp, tp, *tn, gp, gn); if (cy) tp[(*tn)++] = cy; } if ((*tn) == 0) { return; } mpn_tdiv_qr(t, tp, 0, tp, (*tn), bp, n); ASSERT_MPN_ZERO_P(tp, n); (*tn) -= (n - 1); (*tn) -= (t[(*tn) - 1] == 0); }
mp_size_t mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep, mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n) { mp_size_t talloc; mp_size_t scratch; mp_size_t matrix_scratch; mp_size_t ualloc = n + 1; mp_size_t un; mp_ptr u0; mp_ptr u1; mp_ptr tp; TMP_DECL; ASSERT (an >= n); ASSERT (n > 0); TMP_MARK; /* FIXME: Check for small sizes first, before setting up temporary storage etc. */ talloc = MPN_GCDEXT_LEHMER_N_ITCH(n); /* For initial division */ scratch = an - n + 1; if (scratch > talloc) talloc = scratch; if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) { /* For hgcd loop. */ mp_size_t hgcd_scratch; mp_size_t update_scratch; mp_size_t p1 = CHOOSE_P_1 (n); mp_size_t p2 = CHOOSE_P_2 (n); mp_size_t min_p = MIN(p1, p2); mp_size_t max_p = MAX(p1, p2); matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p); hgcd_scratch = mpn_hgcd_itch (n - min_p); update_scratch = max_p + n - 1; scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch); if (scratch > talloc) talloc = scratch; /* Final mpn_gcdext_lehmer_n call. Need space for u and for copies of a and b. */ scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD) + 3*GCDEXT_DC_THRESHOLD; if (scratch > talloc) talloc = scratch; /* Cofactors u0 and u1 */ talloc += 2*(n+1); } tp = TMP_ALLOC_LIMBS(talloc); if (an > n) { mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n); if (mpn_zero_p (ap, n)) { MPN_COPY (gp, bp, n); *usizep = 0; TMP_FREE; return n; } } if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) { mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp); TMP_FREE; return gn; } MPN_ZERO (tp, 2*ualloc); u0 = tp; tp += ualloc; u1 = tp; tp += ualloc; { /* For the first hgcd call, there are no u updates, and it makes some sense to use a different choice for p. */ /* FIXME: We could trim use of temporary storage, since u0 and u1 are not used yet. For the hgcd call, we could swap in the u0 and u1 pointers for the relevant matrix elements. */ struct hgcd_matrix M; mp_size_t p = CHOOSE_P_1 (n); mp_size_t nn; mpn_hgcd_matrix_init (&M, n - p, tp); nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch); if (nn > 0) { ASSERT (M.n <= (n - p - 1)/2); ASSERT (M.n + p <= (p + n - 1) / 2); /* Temporary storage 2 (p + M->n) <= p + n - 1 */ n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch); MPN_COPY (u0, M.p[1][0], M.n); MPN_COPY (u1, M.p[1][1], M.n); un = M.n; while ( (u0[un-1] | u1[un-1] ) == 0) un--; } else { /* mpn_hgcd has failed. Then either one of a or b is very small, or the difference is very small. Perform one subtraction followed by one division. */ mp_size_t gn; mp_size_t updated_un = 1; u1[0] = 1; /* Temporary storage 2n + 1 */ n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n, u0, u1, &updated_un, tp, tp + n); if (n == 0) { TMP_FREE; return gn; } un = updated_un; ASSERT (un < ualloc); } } while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) { struct hgcd_matrix M; mp_size_t p = CHOOSE_P_2 (n); mp_size_t nn; mpn_hgcd_matrix_init (&M, n - p, tp); nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch); if (nn > 0) { mp_ptr t0; t0 = tp + matrix_scratch; ASSERT (M.n <= (n - p - 1)/2); ASSERT (M.n + p <= (p + n - 1) / 2); /* Temporary storage 2 (p + M->n) <= p + n - 1 */ n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0); /* By the same analysis as for mpn_hgcd_matrix_mul */ ASSERT (M.n + un <= ualloc); /* FIXME: This copying could be avoided by some swapping of * pointers. May need more temporary storage, though. 
*/ MPN_COPY (t0, u0, un); /* Temporary storage ualloc */ un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un); ASSERT (un < ualloc); ASSERT ( (u0[un-1] | u1[un-1]) > 0); } else { /* mpn_hgcd has failed. Then either one of a or b is very small, or the difference is very small. Perform one subtraction followed by one division. */ mp_size_t gn; mp_size_t updated_un = un; /* Temporary storage 2n + 1 */ n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n, u0, u1, &updated_un, tp, tp + n); if (n == 0) { TMP_FREE; return gn; } un = updated_un; ASSERT (un < ualloc); } } if (UNLIKELY (mpn_cmp (ap, bp, n) == 0)) { /* Must return the smallest cofactor, +u1 or -u0 */ int c; MPN_COPY (gp, ap, n); MPN_CMP (c, u0, u1, un); ASSERT (c != 0); if (c < 0) { MPN_NORMALIZE (u0, un); MPN_COPY (up, u0, un); *usizep = -un; } else { MPN_NORMALIZE_NOT_ZERO (u1, un); MPN_COPY (up, u1, un); *usizep = un; } TMP_FREE; return n; } else if (mpn_zero_p (u0, un)) { mp_size_t gn; ASSERT (un == 1); ASSERT (u1[0] == 1); /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */ gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp); TMP_FREE; return gn; } else { /* We have A = ... a + ... b B = u0 a + u1 b a = u1 A + ... B b = -u0 A + ... B with bounds |u0|, |u1| <= B / min(a, b) Compute g = u a + v b = (u u1 - v u0) A + (...) B Here, u, v are bounded by |u| <= b, |v| <= a */ mp_size_t u0n; mp_size_t u1n; mp_size_t lehmer_un; mp_size_t lehmer_vn; mp_size_t gn; mp_ptr lehmer_up; mp_ptr lehmer_vp; int negate; lehmer_up = tp; tp += n; /* Call mpn_gcdext_lehmer_n with copies of a and b. */ MPN_COPY (tp, ap, n); MPN_COPY (tp + n, bp, n); gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n); u0n = un; MPN_NORMALIZE (u0, u0n); if (lehmer_un == 0) { /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */ MPN_COPY (up, u0, u0n); *usizep = -u0n; TMP_FREE; return gn; } lehmer_vp = tp; /* Compute v = (g - u a) / b */ lehmer_vn = compute_v (lehmer_vp, ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1); if (lehmer_un > 0) negate = 0; else { lehmer_un = -lehmer_un; negate = 1; } u1n = un; MPN_NORMALIZE (u1, u1n); /* It's possible that u0 = 1, u1 = 0 */ if (u1n == 0) { ASSERT (un == 1); ASSERT (u0[0] == 1); /* u1 == 0 ==> u u1 + v u0 = v */ MPN_COPY (up, lehmer_vp, lehmer_vn); *usizep = negate ? lehmer_vn : - lehmer_vn; TMP_FREE; return gn; } ASSERT (lehmer_un + u1n <= ualloc); ASSERT (lehmer_vn + u0n <= ualloc); /* Now u0, u1, u are non-zero. We may still have v == 0 */ /* Compute u u0 */ if (lehmer_un <= u1n) /* Should be the common case */ mpn_mul (up, u1, u1n, lehmer_up, lehmer_un); else mpn_mul (up, lehmer_up, lehmer_un, u1, u1n); un = u1n + lehmer_un; un -= (up[un - 1] == 0); if (lehmer_vn > 0) { mp_limb_t cy; /* Overwrites old u1 value */ if (lehmer_vn <= u0n) /* Should be the common case */ mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn); else mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n); u1n = u0n + lehmer_vn; u1n -= (u1[u1n - 1] == 0); if (u1n <= un) { cy = mpn_add (up, up, un, u1, u1n); } else { cy = mpn_add (up, u1, u1n, up, un); un = u1n; } up[un] = cy; un += (cy != 0); ASSERT (un < ualloc); } *usizep = negate ? -un : un; TMP_FREE; return gn; } }
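/* Illustration only: the cofactor identity g = u*a + v*b that all the bookkeeping above (u0, u1, up/usizep) maintains limb by limb, shown as a word-sized extended Euclid. */
#include <stdint.h>

static int64_t gcdext64 (int64_t a, int64_t b, int64_t *u, int64_t *v)
{
  int64_t u0 = 1, v0 = 0, u1 = 0, v1 = 1;
  /* invariants: u0*A + v0*B == a and u1*A + v1*B == b */
  while (b != 0)
    {
      int64_t q = a / b, t;
      t = a - q * b;   a = b;   b = t;     /* (a, b) <- (b, a mod b) */
      t = u0 - q * u1; u0 = u1; u1 = t;    /* cofactors follow along  */
      t = v0 - q * v1; v0 = v1; v1 = t;
    }
  *u = u0; *v = v0;
  return a;                                /* g == u0*A + v0*B */
}

/* Usage: gcdext64 (240, 46, &u, &v) returns 2 with u = -9, v = 47,
   and indeed -9*240 + 47*46 == 2. */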
/* FIXME: x Take scratch parameter, and figure out scratch need. x Use some fallback for small M->n? */ static mp_size_t hgcd_matrix_apply (const struct hgcd_matrix *M, mp_ptr ap, mp_ptr bp, mp_size_t n) { mp_size_t an, bn, un, vn, nn; mp_size_t mn[2][2]; mp_size_t modn; mp_ptr tp, sp, scratch; mp_limb_t cy; unsigned i, j; TMP_DECL; ASSERT ( (ap[n-1] | bp[n-1]) > 0); an = n; MPN_NORMALIZE (ap, an); bn = n; MPN_NORMALIZE (bp, bn); for (i = 0; i < 2; i++) for (j = 0; j < 2; j++) { mp_size_t k; k = M->n; MPN_NORMALIZE (M->p[i][j], k); mn[i][j] = k; } ASSERT (mn[0][0] > 0); ASSERT (mn[1][1] > 0); ASSERT ( (mn[0][1] | mn[1][0]) > 0); TMP_MARK; if (mn[0][1] == 0) { /* A unchanged, M = (1, 0; q, 1) */ ASSERT (mn[0][0] == 1); ASSERT (M->p[0][0][0] == 1); ASSERT (mn[1][1] == 1); ASSERT (M->p[1][1][0] == 1); /* Put B <-- B - q A */ nn = submul (bp, bn, ap, an, M->p[1][0], mn[1][0]); } else if (mn[1][0] == 0) { /* B unchanged, M = (1, q; 0, 1) */ ASSERT (mn[0][0] == 1); ASSERT (M->p[0][0][0] == 1); ASSERT (mn[1][1] == 1); ASSERT (M->p[1][1][0] == 1); /* Put A <-- A - q * B */ nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]); } else { /* A = m00 a + m01 b ==> a <= A / m00, b <= A / m01. B = m10 a + m11 b ==> a <= B / m10, b <= B / m11. */ un = MIN (an - mn[0][0], bn - mn[1][0]) + 1; vn = MIN (an - mn[0][1], bn - mn[1][1]) + 1; nn = MAX (un, vn); /* In the range of interest, mulmod_bnm1 should always beat mullo. */ modn = mpn_mulmod_bnm1_next_size (nn + 1); scratch = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (modn, modn, M->n)); tp = TMP_ALLOC_LIMBS (modn); sp = TMP_ALLOC_LIMBS (modn); ASSERT (n <= 2*modn); if (n > modn) { cy = mpn_add (ap, ap, modn, ap + modn, n - modn); MPN_INCR_U (ap, modn, cy); cy = mpn_add (bp, bp, modn, bp + modn, n - modn); MPN_INCR_U (bp, modn, cy); n = modn; } mpn_mulmod_bnm1 (tp, modn, ap, n, M->p[1][1], mn[1][1], scratch); mpn_mulmod_bnm1 (sp, modn, bp, n, M->p[0][1], mn[0][1], scratch); /* FIXME: Handle the small n case in some better way. */ if (n + mn[1][1] < modn) MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]); if (n + mn[0][1] < modn) MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]); cy = mpn_sub_n (tp, tp, sp, modn); MPN_DECR_U (tp, modn, cy); ASSERT (mpn_zero_p (tp + nn, modn - nn)); mpn_mulmod_bnm1 (sp, modn, ap, n, M->p[1][0], mn[1][0], scratch); MPN_COPY (ap, tp, nn); mpn_mulmod_bnm1 (tp, modn, bp, n, M->p[0][0], mn[0][0], scratch); if (n + mn[1][0] < modn) MPN_ZERO (sp + n + mn[1][0], modn - n - mn[1][0]); if (n + mn[0][0] < modn) MPN_ZERO (tp + n + mn[0][0], modn - n - mn[0][0]); cy = mpn_sub_n (tp, tp, sp, modn); MPN_DECR_U (tp, modn, cy); ASSERT (mpn_zero_p (tp + nn, modn - nn)); MPN_COPY (bp, tp, nn); while ( (ap[nn-1] | bp[nn-1]) == 0) { nn--; ASSERT (nn > 0); } } TMP_FREE; return nn; }
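/* Illustration only: the wrap-around used above when n > modn. Working mod B^m - 1, B^m == 1, so the high part of a number may simply be added onto the low part. Scalar analogue with B^m - 1 replaced by 2^16 - 1. */
#include <stdint.h>

static uint32_t fold_mod_2_16_minus_1 (uint32_t x)
{
  x = (x & 0xffff) + (x >> 16);   /* hi*2^16 + lo == hi + lo */
  x = (x & 0xffff) + (x >> 16);   /* second fold absorbs the carry */
  return x;   /* in [0, 0xffff]; 0xffff represents the class [0] */
}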
void mpn_toom63_mul (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr scratch) { mp_size_t n, s, t; mp_limb_t cy; int sign; /***************************** decomposition *******************************/ #define a5 (ap + 5 * n) #define b0 (bp + 0 * n) #define b1 (bp + 1 * n) #define b2 (bp + 2 * n) ASSERT (an >= bn); n = 1 + (an >= 2 * bn ? (an - 1) / (size_t) 6 : (bn - 1) / (size_t) 3); s = an - 5 * n; t = bn - 2 * n; ASSERT (0 < s && s <= n); ASSERT (0 < t && t <= n); /* WARNING! it assumes s+t>=n */ ASSERT ( s + t >= n ); ASSERT ( s + t > 4); /* WARNING! it assumes n>1 */ ASSERT ( n > 2); #define r8 pp /* 2n */ #define r7 scratch /* 3n+1 */ #define r5 (pp + 3*n) /* 3n+1 */ #define v0 (pp + 3*n) /* n+1 */ #define v1 (pp + 4*n+1) /* n+1 */ #define v2 (pp + 5*n+2) /* n+1 */ #define v3 (pp + 6*n+3) /* n+1 */ #define r3 (scratch + 3 * n + 1) /* 3n+1 */ #define r1 (pp + 7*n) /* s+t <= 2*n */ #define ws (scratch + 6 * n + 2) /* ??? */ /* Alloc also 3n+1 limbs for ws... mpn_toom_interpolate_8pts may need all of them, when DO_mpn_sublsh_n uses a scratch */ /* if (scratch == NULL) scratch = TMP_SALLOC_LIMBS (9 * n + 3); */ /********************** evaluation and recursive calls *********************/ /* $\pm4$ */ sign = mpn_toom_eval_pm2exp (v2, v0, 5, ap, n, s, 2, pp); pp[n] = mpn_lshift (pp, b1, n, 2); /* 4b1 */ /* FIXME: use addlsh */ v3[t] = mpn_lshift (v3, b2, t, 4);/* 16b2 */ if ( n == t ) v3[n]+= mpn_add_n (v3, v3, b0, n); /* 16b2+b0 */ else v3[n] = mpn_add (v3, b0, n, v3, t+1); /* 16b2+b0 */ sign ^= abs_sub_add_n (v1, v3, pp, n + 1); TOOM_63_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-4)*B(-4) */ TOOM_63_MUL_N_REC(r3, v2, v3, n + 1, ws); /* A(+4)*B(+4) */ mpn_toom_couple_handling (r3, 2*n+1, pp, sign, n, 2, 4); /* $\pm1$ */ sign = mpn_toom_eval_pm1 (v2, v0, 5, ap, n, s, pp); /* Compute bs1 and bsm1. Code taken from toom33 */ cy = mpn_add (ws, b0, n, b2, t); #if HAVE_NATIVE_mpn_add_n_sub_n if (cy == 0 && mpn_cmp (ws, b1, n) < 0) { cy = mpn_add_n_sub_n (v3, v1, b1, ws, n); v3[n] = cy >> 1; v1[n] = 0; sign = ~sign; }
void mpn_toom22_mul (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr scratch) { mp_size_t n, s, t; int vm1_neg; mp_limb_t cy, cy2; mp_ptr asm1; mp_ptr bsm1; #define a0 ap #define a1 (ap + n) #define b0 bp #define b1 (bp + n) s = an >> 1; n = an - s; t = bn - n; ASSERT (an >= bn); ASSERT (0 < s && s <= n); ASSERT (0 < t && t <= s); asm1 = pp; bsm1 = pp + n; vm1_neg = 0; /* Compute asm1. */ if (s == n) { if (mpn_cmp (a0, a1, n) < 0) { mpn_sub_n (asm1, a1, a0, n); vm1_neg = 1; } else { mpn_sub_n (asm1, a0, a1, n); } } else { if (mpn_zero_p (a0 + s, n - s) && mpn_cmp (a0, a1, s) < 0) { mpn_sub_n (asm1, a1, a0, s); MPN_ZERO (asm1 + s, n - s); vm1_neg = 1; } else { mpn_sub (asm1, a0, n, a1, s); } } /* Compute bsm1. */ if (t == n) { if (mpn_cmp (b0, b1, n) < 0) { mpn_sub_n (bsm1, b1, b0, n); vm1_neg ^= 1; } else { mpn_sub_n (bsm1, b0, b1, n); } } else { if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0) { mpn_sub_n (bsm1, b1, b0, t); MPN_ZERO (bsm1 + t, n - t); vm1_neg ^= 1; } else { mpn_sub (bsm1, b0, n, b1, t); } } #define v0 pp /* 2n */ #define vinf (pp + 2 * n) /* s+t */ #define vm1 scratch /* 2n */ #define scratch_out scratch + 2 * n /* vm1, 2n limbs */ TOOM22_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out); if (s > t) TOOM22_MUL_REC (vinf, a1, s, b1, t, scratch_out); else TOOM22_MUL_N_REC (vinf, a1, b1, s, scratch_out); /* v0, 2n limbs */ TOOM22_MUL_N_REC (v0, ap, bp, n, scratch_out); /* H(v0) + L(vinf) */ cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n); /* L(v0) + H(v0) */ cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n); /* L(vinf) + H(vinf) */ cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + t - n); if (vm1_neg) cy += mpn_add_n (pp + n, pp + n, vm1, 2 * n); else cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n); ASSERT (cy + 1 <= 3); ASSERT (cy2 <= 2); mpn_incr_u (pp + 2 * n, cy2); if (LIKELY (cy <= 2)) mpn_incr_u (pp + 3 * n, cy); else mpn_decr_u (pp + 3 * n, 1); }
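/* Illustration only: the same three-product recombination on the smallest possible operands, two limbs each, using only public mpn_* calls from <gmp.h>. With a = a1*B + a0 and b = b1*B + b0, v0 = a0*b0, vinf = a1*b1, vm1 = |a0 - a1|*|b0 - b1| (sign in neg), and a*b = vinf*B^2 + (v0 + vinf -/+ vm1)*B + v0, which is exactly the addition/subtraction chain ending mpn_toom22_mul. */
#include <assert.h>
#include <gmp.h>

static void karatsuba_2limb (mp_ptr pp, mp_srcptr ap, mp_srcptr bp)
{
  mp_limb_t v0[2], vinf[2], vm1[2], asm1, bsm1, mid[3], cy;
  int neg = 0;

  if (ap[0] < ap[1]) { asm1 = ap[1] - ap[0]; neg ^= 1; }
  else asm1 = ap[0] - ap[1];
  if (bp[0] < bp[1]) { bsm1 = bp[1] - bp[0]; neg ^= 1; }
  else bsm1 = bp[0] - bp[1];

  mpn_mul_n (v0, ap, bp, 1);             /* a0*b0 */
  mpn_mul_n (vinf, ap + 1, bp + 1, 1);   /* a1*b1 */
  mpn_mul_n (vm1, &asm1, &bsm1, 1);      /* |a0-a1| * |b0-b1| */

  /* middle coefficient v0 + vinf -/+ vm1 == a0*b1 + a1*b0 >= 0 */
  mid[2] = mpn_add_n (mid, v0, vinf, 2);
  if (neg)
    mid[2] += mpn_add_n (mid, mid, vm1, 2);
  else
    mid[2] -= mpn_sub_n (mid, mid, vm1, 2);

  pp[0] = v0[0]; pp[1] = v0[1]; pp[2] = vinf[0]; pp[3] = vinf[1];
  cy = mpn_add (pp + 1, pp + 1, 3, mid, 3);   /* add the middle at weight B */
  assert (cy == 0);                           /* the product fits in 4 limbs */
}

/* For any made-up 2-limb inputs this agrees with mpn_mul_n (ref, ap, bp, 2). */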
/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1) * * The result is expected to be ZERO if and only if one of the operands * already is. Otherwise the class [0] Mod(B^rn-1) is represented by * B^rn-1. This should not be a problem if mulmod_bnm1 is used to * combine results and obtain a natural number when one knows in * advance that the final value is less than (B^rn-1). * Moreover it should not be a problem if mulmod_bnm1 is used to * compute the full product with an+bn <= rn, because this condition * implies (B^an-1)(B^bn-1) < (B^rn-1) . * * Requires 0 < bn <= an <= rn and an + bn > rn/2 * Scratch need: rn + (need for recursive call OR rn + 4). This gives * * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4 */ void mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp) { ASSERT (0 < bn); ASSERT (bn <= an); ASSERT (an <= rn); if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD)) { if (UNLIKELY (bn < rn)) { if (UNLIKELY (an + bn <= rn)) { mpn_mul (rp, ap, an, bp, bn); } else { mp_limb_t cy; mpn_mul (tp, ap, an, bp, bn); cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn); MPN_INCR_U (rp, rn, cy); } } else mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp); } else { mp_size_t n; mp_limb_t cy; mp_limb_t hi; n = rn >> 1; /* We need at least an + bn >= n, to be able to fit one of the recursive products at rp. Requiring strict inequality makes the code slightly simpler. If desired, we could avoid this restriction by initially halving rn as long as rn is even and an + bn <= rn/2. */ ASSERT (an + bn > n); /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1) and crt together as x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)] */ #define a0 ap #define a1 (ap + n) #define b0 bp #define b1 (bp + n) #define xp tp /* 2n + 2 */ /* am1 may be in {xp, n} */ /* bm1 may be in {xp + n, n} */ #define sp1 (tp + 2*n + 2) /* ap1 may be in {sp1, n + 1} */ /* bp1 may be in {sp1 + n + 1, n + 1} */ { mp_srcptr am1, bm1; mp_size_t anm, bnm; mp_ptr so; bm1 = b0; bnm = bn; if (LIKELY (an > n)) { am1 = xp; cy = mpn_add (xp, a0, n, a1, an - n); MPN_INCR_U (xp, n, cy); anm = n; so = xp + n; if (LIKELY (bn > n)) { bm1 = so; cy = mpn_add (so, b0, n, b1, bn - n); MPN_INCR_U (so, n, cy); bnm = n; so += n; } } else { so = xp; am1 = a0; anm = an; } mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so); } { int k; mp_srcptr ap1, bp1; mp_size_t anp, bnp; bp1 = b0; bnp = bn; if (LIKELY (an > n)) { ap1 = sp1; cy = mpn_sub (sp1, a0, n, a1, an - n); sp1[n] = 0; MPN_INCR_U (sp1, n + 1, cy); anp = n + ap1[n]; if (LIKELY (bn > n)) { bp1 = sp1 + n + 1; cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n); sp1[2*n+1] = 0; MPN_INCR_U (sp1 + n + 1, n + 1, cy); bnp = n + bp1[n]; } } else { ap1 = a0; anp = an; } if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD)) k=0; else { int mask; k = mpn_fft_best_k (n, 0); mask = (1<<k) - 1; while (n & mask) {k--; mask >>=1;}; } if (k >= FFT_FIRST_K) xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k); else if (UNLIKELY (bp1 == b0)) { ASSERT (anp + bnp <= 2*n+1); ASSERT (anp + bnp > n); ASSERT (anp >= bnp); mpn_mul (xp, ap1, anp, bp1, bnp); anp = anp + bnp - n; ASSERT (anp <= n || xp[2*n]==0); anp-= anp > n; cy = mpn_sub (xp, xp, n, xp + n, anp); xp[n] = 0; MPN_INCR_U (xp, n+1, cy); } else mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp); } /* Here the CRT recomposition begins. xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1) Division by 2 is a bitwise rotation. Assumes xp normalised mod (B^n+1). 
The residue class [0] is represented by [B^n-1]; except when both inputs are ZERO. */ #if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc #if HAVE_NATIVE_mpn_rsh1add_nc cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */ hi = cy << (GMP_NUMB_BITS - 1); cy = 0; /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi overflows, i.e. a further increment will not overflow again. */ #else /* ! _nc */ cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */ hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */ cy >>= 1; /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */ #endif #if GMP_NAIL_BITS == 0 add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi); #else cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1); rp[n-1] ^= hi; #endif #else /* ! HAVE_NATIVE_mpn_rsh1add_n */ #if HAVE_NATIVE_mpn_add_nc cy = mpn_add_nc(rp, rp, xp, n, xp[n]); #else /* ! _nc */ cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */ #endif cy += (rp[0]&1); mpn_rshift(rp, rp, n, 1); ASSERT (cy <= 2); hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */ cy >>= 1; /* We can have cy != 0 only if hi = 0... */ ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0); rp[n-1] |= hi; /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */ #endif ASSERT (cy <= 1); /* Next increment can not overflow, read the previous comments about cy. */ ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0)); MPN_INCR_U(rp, n, cy); /* Compute the highest half: ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n */ if (UNLIKELY (an + bn < rn)) { /* Note that in this case, the only way the result can equal zero mod B^{rn} - 1 is if one of the inputs is zero, and then the output of both the recursive calls and this CRT reconstruction is zero, not B^{rn} - 1. Which is good, since the latter representation doesn't fit in the output area.*/ cy = mpn_sub_n (rp + n, rp, xp, an + bn - n); /* FIXME: This subtraction of the high parts is not really necessary, we do it to get the carry out, and for sanity checking. */ cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n, xp + an + bn - n, rn - (an + bn), cy); ASSERT (an + bn == rn - 1 || mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn))); cy = mpn_sub_1 (rp, rp, an + bn, cy); ASSERT (cy == (xp + an + bn - n)[0]); } else { cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n); /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO. DECR will affect _at most_ the lowest n limbs. */ MPN_DECR_U (rp, 2*n, cy); } #undef a0 #undef a1 #undef b0 #undef b1 #undef xp #undef sp1 } }
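/* Illustration only: the CRT recomposition above with B^n scaled down to 16, so everything fits in a word. Given xm = x mod 15 and xp = x mod 17, x == -xp*16 + 17*((xp + xm)/2 mod 15) (mod 255), because 16 == 1 (mod 15) and 16 == -1 (mod 17). Division by 2 mod 15 is multiplication by 8, the scalar analogue of the bitwise rotation. */
#include <assert.h>
#include <stdint.h>

static void crt_demo (void)
{
  uint32_t x = 13 * 11;                       /* made-up product */
  uint32_t xm = x % 15, xp = x % 17;          /* the two half-size residues */
  int32_t r = -(int32_t) (16 * xp)
            + 17 * (int32_t) (((xp + xm) * 8) % 15);
  assert (((r % 255) + 255) % 255 == (int32_t) (x % 255));
}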
mp_limb_t mpn_sbpi1_bdiv_qr (mp_ptr qp, mp_ptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_limb_t dinv) { mp_size_t qn; mp_size_t i; mp_limb_t rh; mp_limb_t ql; ASSERT (dn > 0); ASSERT (nn > dn); ASSERT ((dp[0] & 1) != 0); /* FIXME: Add ASSERTs for allowable overlapping; i.e., that qp = np is OK, but some over N/Q overlaps will not work. */ qn = nn - dn; rh = 0; /* To complete the negation, this value is added to q. */ ql = 1; while (qn > dn) { for (i = 0; i < dn; i++) { mp_limb_t q; q = dinv * np[i]; np[i] = mpn_addmul_1 (np + i, dp, dn, q); qp[i] = ~q; } rh += mpn_add (np + dn, np + dn, qn, np, dn); ql = mpn_add_1 (qp, qp, dn, ql); qp += dn; qn -= dn; np += dn; nn -= dn; } for (i = 0; i < qn; i++) { mp_limb_t q; q = dinv * np[i]; np[i] = mpn_addmul_1 (np + i, dp, dn, q); qp[i] = ~q; } rh += mpn_add_n (np + dn, np + dn, np, qn); ql = mpn_add_1 (qp, qp, qn, ql); if (UNLIKELY (ql > 0)) { /* q == 0 */ ASSERT (rh == 0); return 0; } else { mp_limb_t cy; cy = mpn_sub_n (np + qn, np + qn, dp, dn); ASSERT (cy >= rh); return cy - rh; } }
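/* Illustration only: Hensel (2-adic) division produces each quotient limb as q = dinv * n0 mod B, where dinv is a precomputed modular inverse of the odd low divisor limb (this particular variant also folds a negation into the bookkeeping). For odd d such an inverse mod 2^64 follows from Newton's iteration x <- x*(2 - d*x), which doubles the number of correct low bits per step. A standalone sketch of the same idea. */
#include <assert.h>
#include <stdint.h>

static uint64_t binvert64 (uint64_t d)
{
  uint64_t x;
  assert (d & 1);        /* inverses mod 2^64 exist for odd d only */
  x = d;                 /* d*d == 1 (mod 8): correct to 3 bits */
  x *= 2 - d * x;        /* 6 bits */
  x *= 2 - d * x;        /* 12 bits */
  x *= 2 - d * x;        /* 24 bits */
  x *= 2 - d * x;        /* 48 bits */
  x *= 2 - d * x;        /* 96 >= 64 bits */
  assert (d * x == 1);   /* multiplication wraps mod 2^64 */
  return x;
}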
void mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags, mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5, mp_size_t w6n, mp_ptr tp) { mp_size_t m; mp_limb_t cy; m = 2*n + 1; #define w0 rp #define w2 (rp + 2*n) #define w6 (rp + 6*n) ASSERT (w6n > 0); ASSERT (w6n <= 2*n); /* Using formulas similar to Marco Bodrato's W5 = W5 + W4 W1 =(W4 - W1)/2 W4 = W4 - W0 W4 =(W4 - W1)/4 - W6*16 W3 =(W2 - W3)/2 W2 = W2 - W3 W5 = W5 - W2*65 May be negative. W2 = W2 - W6 - W0 W5 =(W5 + W2*45)/2 Now >= 0 again. W4 =(W4 - W2)/3 W2 = W2 - W4 W1 = W5 - W1 May be negative. W5 =(W5 - W3*8)/9 W3 = W3 - W5 W1 =(W1/15 + W5)/2 Now >= 0 again. W5 = W5 - W1 where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1), W4 = f(2), W5 = f(1/2), W6 = f(oo), Note that most intermediate results are positive; the ones that may be negative are represented in two's complement. We must never shift right a value that may be negative, since that would invalidate the sign bit. On the other hand, divexact by odd numbers work fine with two's complement. */ mpn_add_n (w5, w5, w4, m); if (flags & toom7_w1_neg) { #ifdef HAVE_NATIVE_mpn_rsh1add_n mpn_rsh1add_n (w1, w1, w4, m); #else mpn_add_n (w1, w1, w4, m); ASSERT (!(w1[0] & 1)); mpn_rshift (w1, w1, m, 1); #endif } else { #ifdef HAVE_NATIVE_mpn_rsh1sub_n mpn_rsh1sub_n (w1, w4, w1, m); #else mpn_sub_n (w1, w4, w1, m); ASSERT (!(w1[0] & 1)); mpn_rshift (w1, w1, m, 1); #endif } mpn_sub (w4, w4, m, w0, 2*n); mpn_sub_n (w4, w4, w1, m); ASSERT (!(w4[0] & 3)); mpn_rshift (w4, w4, m, 2); /* w4>=0 */ tp[w6n] = mpn_lshift (tp, w6, w6n, 4); mpn_sub (w4, w4, m, tp, w6n+1); if (flags & toom7_w3_neg) { #ifdef HAVE_NATIVE_mpn_rsh1add_n mpn_rsh1add_n (w3, w3, w2, m); #else mpn_add_n (w3, w3, w2, m); ASSERT (!(w3[0] & 1)); mpn_rshift (w3, w3, m, 1); #endif } else { #ifdef HAVE_NATIVE_mpn_rsh1sub_n mpn_rsh1sub_n (w3, w2, w3, m); #else mpn_sub_n (w3, w2, w3, m); ASSERT (!(w3[0] & 1)); mpn_rshift (w3, w3, m, 1); #endif } mpn_sub_n (w2, w2, w3, m); mpn_submul_1 (w5, w2, m, 65); mpn_sub (w2, w2, m, w6, w6n); mpn_sub (w2, w2, m, w0, 2*n); mpn_addmul_1 (w5, w2, m, 45); ASSERT (!(w5[0] & 1)); mpn_rshift (w5, w5, m, 1); mpn_sub_n (w4, w4, w2, m); mpn_divexact_by3 (w4, w4, m); mpn_sub_n (w2, w2, w4, m); mpn_sub_n (w1, w5, w1, m); mpn_lshift (tp, w3, m, 3); mpn_sub_n (w5, w5, tp, m); mpn_divexact_by9 (w5, w5, m); mpn_sub_n (w3, w3, w5, m); mpn_divexact_by15 (w1, w1, m); mpn_add_n (w1, w1, w5, m); ASSERT (!(w1[0] & 1)); mpn_rshift (w1, w1, m, 1); /* w1>=0 now */ mpn_sub_n (w5, w5, w1, m); /* These bounds are valid for the 4x4 polynomial product of toom44, * and they are conservative for toom53 and toom62. */ ASSERT (w1[2*n] < 2); ASSERT (w2[2*n] < 3); ASSERT (w3[2*n] < 4); ASSERT (w4[2*n] < 3); ASSERT (w5[2*n] < 2); /* Addition chain. Note carries and the 2n'th limbs that need to be * added in. * * Special care is needed for w2[2n] and the corresponding carry, * since the "simple" way of adding it all together would overwrite * the limb at wp[2*n] and rp[4*n] (same location) with the sum of * the high half of w3 and the low half of w4. 
* * 7 6 5 4 3 2 1 0 * | | | | | | | | | * ||w3 (2n+1)| * ||w4 (2n+1)| * ||w5 (2n+1)| ||w1 (2n+1)| * + | w6 (w6n)| ||w2 (2n+1)| w0 (2n) | (share storage with r) * ----------------------------------------------- * r | | | | | | | | | * c7 c6 c5 c4 c3 Carries to propagate */ cy = mpn_add_n (rp + n, rp + n, w1, m); MPN_INCR_U (w2 + n + 1, n , cy); cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n); MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy); cy = mpn_add_n (rp + 4*n, w3 + n, w4, n); MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy); cy = mpn_add_n (rp + 5*n, w4 + n, w5, n); MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy); if (w6n > n + 1) ASSERT_NOCARRY (mpn_add (rp + 6*n, rp + 6*n, w6n, w5 + n, n + 1)); else { ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n)); #if WANT_ASSERT { mp_size_t i; for (i = w6n; i <= n; i++) ASSERT (w5[n + i] == 0); } #endif } }
void mpn_toom2_sqr (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_ptr scratch) { const int __gmpn_cpuvec_initialized = 1; mp_size_t n, s; mp_limb_t cy, cy2; mp_ptr asm1; #define a0 ap #define a1 (ap + n) s = an >> 1; n = an - s; ASSERT (0 < s && s <= n && s >= n - 1); asm1 = pp; /* Compute asm1. */ if (s == n) { if (mpn_cmp (a0, a1, n) < 0) { mpn_sub_n (asm1, a1, a0, n); } else { mpn_sub_n (asm1, a0, a1, n); } } else /* n - s == 1 */ { if (a0[s] == 0 && mpn_cmp (a0, a1, s) < 0) { mpn_sub_n (asm1, a1, a0, s); asm1[s] = 0; } else { asm1[s] = a0[s] - mpn_sub_n (asm1, a0, a1, s); } } #define v0 pp /* 2n */ #define vinf (pp + 2 * n) /* s+s */ #define vm1 scratch /* 2n */ #define scratch_out scratch + 2 * n /* vm1, 2n limbs */ TOOM2_SQR_REC (vm1, asm1, n, scratch_out); /* vinf, s+s limbs */ TOOM2_SQR_REC (vinf, a1, s, scratch_out); /* v0, 2n limbs */ TOOM2_SQR_REC (v0, ap, n, scratch_out); /* H(v0) + L(vinf) */ cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n); /* L(v0) + H(v0) */ cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n); /* L(vinf) + H(vinf) */ cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + s - n); cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n); ASSERT (cy + 1 <= 3); ASSERT (cy2 <= 2); MPN_INCR_U (pp + 2 * n, s + s, cy2); if (LIKELY (cy <= 2)) MPN_INCR_U (pp + 3 * n, s + s - n, cy); else MPN_DECR_U (pp + 3 * n, s + s - n, 1); }
static void ngcd_matrix_update_q (struct ngcd_matrix *M, mp_srcptr qp, mp_size_t qn, unsigned col) { ASSERT (col < 2); if (qn == 1) { mp_limb_t q = qp[0]; mp_limb_t c0, c1; c0 = mpn_addmul_1 (M->p[0][col], M->p[0][1-col], M->n, q); c1 = mpn_addmul_1 (M->p[1][col], M->p[1][1-col], M->n, q); M->p[0][col][M->n] = c0; M->p[1][col][M->n] = c1; M->n += (c0 | c1) != 0; } else { unsigned row; /* Carries for the unlikely case that we get both high words from the multiplication and carries from the addition. */ mp_limb_t c[2]; mp_size_t n; /* The matrix will not necessarily grow in size by qn, so we need normalization in order not to overflow M. */ for (n = M->n; n + qn > M->n; n--) { ASSERT (n > 0); if (M->p[0][1-col][n-1] > 0 || M->p[1][1-col][n-1] > 0) break; } ASSERT (qn + n <= M->alloc); for (row = 0; row < 2; row++) { if (qn <= n) mpn_mul (M->tp, M->p[row][1-col], n, qp, qn); else mpn_mul (M->tp, qp, qn, M->p[row][1-col], n); ASSERT (n + qn >= M->n); c[row] = mpn_add (M->p[row][col], M->tp, n + qn, M->p[row][col], M->n); } if (c[0] | c[1]) { M->n = n + qn + 1; M->p[0][col][n-1] = c[0]; M->p[1][col][n-1] = c[1]; } else { n += qn; n -= (M->p[0][col][n-1] | M->p[1][col][n-1]) == 0; if (n > M->n) M->n = n; } } ASSERT (M->n < M->alloc); }
/* Evaluates a polynomial of degree k > 2, in the points +2^shift and -2^shift. */ int mpn_toom_eval_pm2exp (mp_ptr xp2, mp_ptr xm2, unsigned k, mp_srcptr xp, mp_size_t n, mp_size_t hn, unsigned shift, mp_ptr tp) { unsigned i; int neg; #ifdef HAVE_NATIVE_mpn_addlsh_n mp_limb_t cy; #endif ASSERT (k >= 3); ASSERT (shift*k < GMP_NUMB_BITS); ASSERT (hn > 0); ASSERT (hn <= n); /* The degree k is also the number of full-size coefficients, so * that last coefficient, of size hn, starts at xp + k*n. */ #ifdef HAVE_NATIVE_mpn_addlsh_n xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2*shift); for (i = 4; i < k; i += 2) xp2[n] += mpn_addlsh_n (xp2, xp2, xp + i*n, n, i*shift); tp[n] = mpn_lshift (tp, xp+n, n, shift); for (i = 3; i < k; i+= 2) tp[n] += mpn_addlsh_n (tp, tp, xp+i*n, n, i*shift); if (k & 1) { cy = mpn_addlsh_n (tp, tp, xp+k*n, hn, k*shift); MPN_INCR_U (tp + hn, n+1 - hn, cy); } else { cy = mpn_addlsh_n (xp2, xp2, xp+k*n, hn, k*shift); MPN_INCR_U (xp2 + hn, n+1 - hn, cy); } #else /* !HAVE_NATIVE_mpn_addlsh_n */ xp2[n] = mpn_lshift (tp, xp+2*n, n, 2*shift); xp2[n] += mpn_add_n (xp2, xp, tp, n); for (i = 4; i < k; i += 2) { xp2[n] += mpn_lshift (tp, xp + ((mp_size_t) i)*n, n, i*shift); xp2[n] += mpn_add_n (xp2, xp2, tp, n); } tp[n] = mpn_lshift (tp, xp+n, n, shift); for (i = 3; i < k; i+= 2) { tp[n] += mpn_lshift (xm2, xp + ((mp_size_t) i)*n, n, i*shift); tp[n] += mpn_add_n (tp, tp, xm2, n); } xm2[hn] = mpn_lshift (xm2, xp + ((mp_size_t) k)*n, hn, k*shift); if (k & 1) mpn_add (tp, tp, n+1, xm2, hn+1); else mpn_add (xp2, xp2, n+1, xm2, hn+1); #endif /* !HAVE_NATIVE_mpn_addlsh_n */ neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? ~0 : 0; #ifdef HAVE_NATIVE_mpn_add_n_sub_n if (neg) mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1); else mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1); #else /* !HAVE_NATIVE_mpn_add_n_sub_n */ if (neg) mpn_sub_n (xm2, tp, xp2, n + 1); else mpn_sub_n (xm2, xp2, tp, n + 1); mpn_add_n (xp2, xp2, tp, n + 1); #endif /* !HAVE_NATIVE_mpn_add_n_sub_n */ /* FIXME: the following asserts are useless if (k+1)*shift >= GMP_LIMB_BITS */ ASSERT ((k+1)*shift >= GMP_LIMB_BITS || xp2[n] < ((CNST_LIMB(1)<<((k+1)*shift))-1)/((CNST_LIMB(1)<<shift)-1)); ASSERT ((k+2)*shift >= GMP_LIMB_BITS || xm2[n] < ((CNST_LIMB(1)<<((k+2)*shift))-((k&1)?(CNST_LIMB(1)<<shift):1))/((CNST_LIMB(1)<<(2*shift))-1)); return neg; }
/* Temporary storage: Needs n limbs for the quotient, at qp. tp must point to an area large enough for the resulting cofactor, plus one limb extra. All in all, 2N + 1 if N is a bound for both inputs and outputs. */ mp_size_t mpn_gcdext_subdiv_step (mp_ptr gp, mp_size_t *gn, mp_ptr up, mp_size_t *usizep, mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr u0, mp_ptr u1, mp_size_t *unp, mp_ptr qp, mp_ptr tp) { mp_size_t an, bn, un; mp_size_t qn; mp_size_t u0n; int swapped; an = bn = n; ASSERT (an > 0); ASSERT (ap[an-1] > 0 || bp[an-1] > 0); MPN_NORMALIZE (ap, an); MPN_NORMALIZE (bp, bn); un = *unp; swapped = 0; if (UNLIKELY (an == 0)) { return_b: MPN_COPY (gp, bp, bn); *gn = bn; MPN_NORMALIZE (u0, un); MPN_COPY (up, u0, un); *usizep = swapped ? un : -un; return 0; } else if (UNLIKELY (bn == 0)) { MPN_COPY (gp, ap, an); *gn = an; MPN_NORMALIZE (u1, un); MPN_COPY (up, u1, un); *usizep = swapped ? -un : un; return 0; } /* Arrange so that a > b, subtract an -= bn, and maintain normalization. */ if (an < bn) { MPN_PTR_SWAP (ap, an, bp, bn); MP_PTR_SWAP (u0, u1); swapped ^= 1; } else if (an == bn) { int c; MPN_CMP (c, ap, bp, an); if (UNLIKELY (c == 0)) { MPN_COPY (gp, ap, an); *gn = an; /* Must return the smallest cofactor, +u1 or -u0 */ MPN_CMP (c, u0, u1, un); ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1)); if (c < 0) { MPN_NORMALIZE (u0, un); MPN_COPY (up, u0, un); swapped ^= 1; } else { MPN_NORMALIZE_NOT_ZERO (u1, un); MPN_COPY (up, u1, un); } *usizep = swapped ? -un : un; return 0; } else if (c < 0) { MP_PTR_SWAP (ap, bp); MP_PTR_SWAP (u0, u1); swapped ^= 1; } } /* Reduce a -= b, u1 += u0 */ ASSERT_NOCARRY (mpn_sub (ap, ap, an, bp, bn)); MPN_NORMALIZE (ap, an); ASSERT (an > 0); u1[un] = mpn_add_n (u1, u1, u0, un); un += (u1[un] > 0); /* Arrange so that a > b, and divide a = q b + r */ if (an < bn) { MPN_PTR_SWAP (ap, an, bp, bn); MP_PTR_SWAP (u0, u1); swapped ^= 1; } else if (an == bn) { int c; MPN_CMP (c, ap, bp, an); if (UNLIKELY (c == 0)) goto return_b; else if (c < 0) { MP_PTR_SWAP (ap, bp); MP_PTR_SWAP (u0, u1); swapped ^= 1; } } /* Reduce a -= q b, u1 += q u0 */ qn = an - bn + 1; mpn_tdiv_qr (qp, ap, 0, ap, an, bp, bn); if (mpn_zero_p (ap, bn)) goto return_b; n = bn; /* Update u1 += q u0 */ u0n = un; MPN_NORMALIZE (u0, u0n); if (u0n > 0) { qn -= (qp[qn - 1] == 0); if (qn > u0n) mpn_mul (tp, qp, qn, u0, u0n); else mpn_mul (tp, u0, u0n, qp, qn); if (qn + u0n > un) { mp_size_t u1n = un; un = qn + u0n; un -= (tp[un-1] == 0); u1[un] = mpn_add (u1, tp, un, u1, u1n); } else { u1[un] = mpn_add (u1, u1, un, tp, qn + u0n); } un += (u1[un] > 0); } *unp = un; return n; }
void _gst_mpz_add (gst_mpz *sum, const gst_mpz *u, const gst_mpz *v) { mp_srcptr up, vp; mp_ptr sump; mp_size_t usize, vsize, sumsize; mp_size_t abs_usize; mp_size_t abs_vsize; usize = u->size; vsize = v->size; abs_usize = ABS (usize); abs_vsize = ABS (vsize); if (abs_usize < abs_vsize) { /* Swap U and V. */ { const gst_mpz *t = u; u = v; v = t; } { mp_size_t t = usize; usize = vsize; vsize = t; } { mp_size_t t = abs_usize; abs_usize = abs_vsize; abs_vsize = t; } } /* True: abs(USIZE) >= abs(VSIZE) */ /* If not space for sum (and possible carry), increase space. */ sumsize = abs_usize + 1; if (sum->alloc < sumsize) gst_mpz_realloc (sum, sumsize); /* These must be after realloc (u or v may be the same as sum). */ up = u->d; vp = v->d; sump = sum->d; if (usize >= 0) { if (vsize >= 0) { sumsize = mpn_add (sump, up, abs_usize, vp, abs_vsize); if (sumsize != 0) sump[abs_usize] = 1; sumsize = sumsize + abs_usize; } else { /* The signs are different. Need exact comparison to determine which operand to subtract from which. */ if (abs_usize == abs_vsize && mpn_cmp (up, vp, abs_usize) < 0) sumsize = -(abs_usize + mpn_sub (sump, vp, abs_usize, up, abs_usize)); else sumsize = (abs_usize + mpn_sub (sump, up, abs_usize, vp, abs_vsize)); } } else { if (vsize >= 0) { /* The signs are different. Need exact comparison to determine which operand to subtract from which. */ if (abs_usize == abs_vsize && mpn_cmp (up, vp, abs_usize) < 0) sumsize = (abs_usize + mpn_sub (sump, vp, abs_usize, up, abs_usize)); else sumsize = -(abs_usize + mpn_sub (sump, up, abs_usize, vp, abs_vsize)); } else { sumsize = mpn_add (sump, up, abs_usize, vp, abs_vsize); if (sumsize != 0) sump[abs_usize] = 1; sumsize = -(sumsize + abs_usize); } } sum->size = sumsize; }
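/* Illustration only: the case analysis above on word-sized operands (the gst_mpz format keeps the sign in the size field; this toy type keeps it separate). Same signs: add magnitudes and keep the sign. Different signs: compare magnitudes first, then subtract the smaller from the larger and take the larger one's sign, which is why the exact mpn_cmp is needed before choosing the subtraction order. */
#include <stdint.h>

typedef struct { int sign; uint64_t mag; } sm_int;   /* sign in {-1, 0, 1} */

static sm_int sm_add (sm_int u, sm_int v)
{
  sm_int r;
  if (u.sign == 0) return v;
  if (v.sign == 0) return u;
  if (u.sign == v.sign)      { r.sign = u.sign; r.mag = u.mag + v.mag; }
  else if (u.mag > v.mag)    { r.sign = u.sign; r.mag = u.mag - v.mag; }
  else if (u.mag < v.mag)    { r.sign = v.sign; r.mag = v.mag - u.mag; }
  else                       { r.sign = 0;      r.mag = 0; }   /* cancellation */
  return r;
}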
void mpn_toom22_mul (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr scratch) { const int __gmpn_cpuvec_initialized = 1; mp_size_t n, s, t; int vm1_neg; mp_limb_t cy, cy2; mp_ptr asm1; mp_ptr bsm1; #define a0 ap #define a1 (ap + n) #define b0 bp #define b1 (bp + n) s = an >> 1; n = an - s; t = bn - n; ASSERT (an >= bn); ASSERT (0 < s && s <= n && s >= n - 1); ASSERT (0 < t && t <= s); asm1 = pp; bsm1 = pp + n; vm1_neg = 0; /* Compute asm1. */ if (s == n) { if (mpn_cmp (a0, a1, n) < 0) { mpn_sub_n (asm1, a1, a0, n); vm1_neg = 1; } else { mpn_sub_n (asm1, a0, a1, n); } } else /* n - s == 1 */ { if (a0[s] == 0 && mpn_cmp (a0, a1, s) < 0) { mpn_sub_n (asm1, a1, a0, s); asm1[s] = 0; vm1_neg = 1; } else { asm1[s] = a0[s] - mpn_sub_n (asm1, a0, a1, s); } } /* Compute bsm1. */ if (t == n) { if (mpn_cmp (b0, b1, n) < 0) { mpn_sub_n (bsm1, b1, b0, n); vm1_neg ^= 1; } else { mpn_sub_n (bsm1, b0, b1, n); } } else { if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0) { mpn_sub_n (bsm1, b1, b0, t); MPN_ZERO (bsm1 + t, n - t); vm1_neg ^= 1; } else { mpn_sub (bsm1, b0, n, b1, t); } } #define v0 pp /* 2n */ #define vinf (pp + 2 * n) /* s+t */ #define vm1 scratch /* 2n */ #define scratch_out scratch + 2 * n /* vm1, 2n limbs */ TOOM22_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out); if (s > t) TOOM22_MUL_REC (vinf, a1, s, b1, t, scratch_out); else TOOM22_MUL_N_REC (vinf, a1, b1, s, scratch_out); /* v0, 2n limbs */ TOOM22_MUL_N_REC (v0, ap, bp, n, scratch_out); /* H(v0) + L(vinf) */ cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n); /* L(v0) + H(v0) */ cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n); /* L(vinf) + H(vinf) */ cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + t - n); if (vm1_neg) cy += mpn_add_n (pp + n, pp + n, vm1, 2 * n); else cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n); ASSERT (cy + 1 <= 3); ASSERT (cy2 <= 2); MPN_INCR_U (pp + 2 * n, s + t, cy2); if (LIKELY (cy <= 2)) /* if s+t==n, cy is zero, but we should not access pp[3*n] at all. */ MPN_INCR_U (pp + 3 * n, s + t - n, cy); else MPN_DECR_U (pp + 3 * n, s + t - n, 1); }
void mpn_toom4_mul_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) { mp_size_t ind; mp_limb_t cy, cy2, r30, r31; mp_ptr tp; mp_size_t sn, n1, n2, n3, n4, n5, n6, n7, n8, rpn, t4, h1; TMP_DECL; sn = (n + 3) / 4; h1 = n - 3*sn; #define a0 (up) #define a1 (up + sn) #define a2 (up + 2*sn) #define a3 (up + 3*sn) #define b0 (vp) #define b1 (vp + sn) #define b2 (vp + 2*sn) #define b3 (vp + 3*sn) t4 = 2*sn+2; /* allows mult of 2 integers of sn + 1 limbs */ TMP_MARK; tp = TMP_ALLOC_LIMBS(4*t4 + 5*(sn + 1)); #define u2 (tp + 4*t4) #define u3 (tp + 4*t4 + (sn+1)) #define u4 (tp + 4*t4 + 2*(sn+1)) #define u5 (tp + 4*t4 + 3*(sn+1)) #define u6 (tp + 4*t4 + 4*(sn+1)) u6[sn] = mpn_add(u6, a1, sn, a3, h1); u5[sn] = mpn_add_n(u5, a2, a0, sn); mpn_add_n(u3, u5, u6, sn + 1); n4 = sn + 1; if (mpn_cmp(u5, u6, sn + 1) >= 0) mpn_sub_n(u4, u5, u6, sn + 1); else { mpn_sub_n(u4, u6, u5, sn + 1); n4 = -n4; } u6[sn] = mpn_add(u6, b1, sn, b3, h1); u5[sn] = mpn_add_n(u5, b2, b0, sn); mpn_add_n(r2, u5, u6, sn + 1); n5 = sn + 1; if (mpn_cmp(u5, u6, sn + 1) >= 0) mpn_sub_n(u5, u5, u6, sn + 1); else { mpn_sub_n(u5, u6, u5, sn + 1); n5 = -n5; } MUL_TC4_UNSIGNED(r3, n3, u3, sn + 1, r2, sn + 1); /* 1 */ MUL_TC4(r4, n4, u4, n4, u5, n5); /* -1 */ #if HAVE_NATIVE_mpn_addlsh_n r1[sn] = mpn_addlsh_n(r1, a2, a0, sn, 2); mpn_lshift(r1, r1, sn + 1, 1); cy = mpn_addlsh_n(r2, a3, a1, h1, 2); #else r1[sn] = mpn_lshift(r1, a2, sn, 1); MPN_COPY(r2, a3, h1); r1[sn] += mpn_addmul_1(r1, a0, sn, 8); cy = mpn_addmul_1(r2, a1, h1, 4); #endif if (sn > h1) { cy2 = mpn_lshift(r2 + h1, a1 + h1, sn - h1, 2); cy = cy2 + mpn_add_1(r2 + h1, r2 + h1, sn - h1, cy); } r2[sn] = cy; mpn_add_n(u5, r1, r2, sn + 1); n6 = sn + 1; if (mpn_cmp(r1, r2, sn + 1) >= 0) mpn_sub_n(u6, r1, r2, sn + 1); else { mpn_sub_n(u6, r2, r1, sn + 1); n6 = -n6; } #if HAVE_NATIVE_mpn_addlsh_n r1[sn] = mpn_addlsh_n(r1, b2, b0, sn, 2); mpn_lshift(r1, r1, sn + 1, 1); cy = mpn_addlsh_n(r2, b3, b1, h1, 2); #else r1[sn] = mpn_lshift(r1, b2, sn, 1); MPN_COPY(r2, b3, h1); r1[sn] += mpn_addmul_1(r1, b0, sn, 8); cy = mpn_addmul_1(r2, b1, h1, 4); #endif if (sn > h1) { cy2 = mpn_lshift(r2 + h1, b1 + h1, sn - h1, 2); cy = cy2 + mpn_add_1(r2 + h1, r2 + h1, sn - h1, cy); } r2[sn] = cy; mpn_add_n(u2, r1, r2, sn + 1); n8 = sn + 1; if (mpn_cmp(r1, r2, sn + 1) >= 0) mpn_sub_n(r2, r1, r2, sn + 1); else { mpn_sub_n(r2, r2, r1, sn + 1); n8 = -n8; } r30 = r3[0]; r31 = r3[1]; MUL_TC4_UNSIGNED(r5, n5, u5, sn + 1, u2, sn + 1); /* 1/2 */ MUL_TC4(r6, n6, u6, n6, r2, n8); /* -1/2 */ r3[1] = r31; #if HAVE_NATIVE_mpn_addlsh1_n cy = mpn_addlsh1_n(u2, a2, a3, h1); if (sn > h1) cy = mpn_add_1(u2 + h1, a2 + h1, sn - h1, cy); u2[sn] = cy; u2[sn] = 2*u2[sn] + mpn_addlsh1_n(u2, a1, u2, sn); u2[sn] = 2*u2[sn] + mpn_addlsh1_n(u2, a0, u2, sn); #else MPN_COPY(u2, a0, sn); u2[sn] = mpn_addmul_1(u2, a1, sn, 2); u2[sn] += mpn_addmul_1(u2, a2, sn, 4); cy = mpn_addmul_1(u2, a3, h1, 8); if (sn > h1) cy = mpn_add_1(u2 + h1, u2 + h1, sn - h1, cy); u2[sn] += cy; #endif #if HAVE_NATIVE_mpn_addlsh1_n cy = mpn_addlsh1_n(r1, b2, b3, h1); if (sn > h1) cy = mpn_add_1(r1 + h1, b2 + h1, sn - h1, cy); r1[sn] = cy; r1[sn] = 2*r1[sn] + mpn_addlsh1_n(r1, b1, r1, sn); r1[sn] = 2*r1[sn] + mpn_addlsh1_n(r1, b0, r1, sn); #else MPN_COPY(r1, b0, sn); r1[sn] = mpn_addmul_1(r1, b1, sn, 2); r1[sn] += mpn_addmul_1(r1, b2, sn, 4); cy = mpn_addmul_1(r1, b3, h1, 8); if (sn > h1) cy = mpn_add_1(r1 + h1, r1 + h1, sn - h1, cy); r1[sn] += cy; #endif MUL_TC4_UNSIGNED(r2, n2, u2, sn + 1, r1, sn + 1); /* 2 */ MUL_TC4_UNSIGNED(r1, n1, a3, h1, b3, h1); /* oo */ 
MUL_TC4_UNSIGNED(r7, n7, a0, sn, b0, sn); /* 0 */ TC4_DENORM(r1, n1, t4 - 1); /* rp rp1 rp2 rp3 rp4 rp5 rp6 rp7 <----------- r7-----------><------------r5--------------> <-------------r3-------------> <-------------r6-------------> < -----------r2------------>{ } <-------------r4--------------> <--------------r1----> */ mpn_toom4_interpolate(rp, &rpn, sn, tp, t4 - 1, n4, n6, r30); if (rpn != 2*n) { MPN_ZERO((rp + rpn), 2*n - rpn); } TMP_FREE; }
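/* Illustration only: the addlsh1/addmul chains above evaluate each operand at t = 2 (the point marked "2") by Horner's rule, ((a3*2 + a2)*2 + a1)*2 + a0, one shift-and-add per coefficient. Scalar check with made-up one-word coefficients: */
#include <assert.h>
#include <stdint.h>

static void check_eval2 (void)
{
  uint64_t a0 = 3, a1 = 1, a2 = 4, a3 = 1;
  uint64_t horner = ((a3 * 2 + a2) * 2 + a1) * 2 + a0;
  assert (horner == a0 + 2*a1 + 4*a2 + 8*a3);
}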
/* Computes |v| = |(g - u a)| / b, where u may be positive or negative, and v is of the opposite sign. a, b are of size n, u and v at most size n, and v must have space for n+1 limbs. */ static mp_size_t compute_v (mp_ptr vp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, mp_srcptr gp, mp_size_t gn, mp_srcptr up, mp_size_t usize, mp_ptr tp) { mp_size_t size; mp_size_t an; mp_size_t bn; mp_size_t vn; ASSERT (n > 0); ASSERT (gn > 0); ASSERT (usize != 0); size = ABS (usize); ASSERT (size <= n); an = n; MPN_NORMALIZE (ap, an); if (an >= size) mpn_mul (tp, ap, an, up, size); else mpn_mul (tp, up, size, ap, an); size += an; size -= tp[size - 1] == 0; ASSERT (gn <= size); if (usize > 0) { /* |v| = -v = (u a - g) / b */ ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn)); MPN_NORMALIZE (tp, size); if (size == 0) return 0; } else { /* usize < 0 */ /* |v| = v = (c - u a) / b = (c + |u| a) / b */ mp_limb_t cy = mpn_add (tp, tp, size, gp, gn); if (cy) tp[size++] = cy; } /* Now divide t / b. There must be no remainder */ bn = n; MPN_NORMALIZE (bp, bn); ASSERT (size >= bn); vn = size + 1 - bn; ASSERT (vn <= n + 1); mpn_divexact (vp, tp, size, bp, bn); vn -= (vp[vn-1] == 0); return vn; }
void mpn_toom53_mul (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr scratch) { mp_size_t n, s, t; int vm1_neg, vmh_neg; mp_limb_t cy; mp_ptr gp, hp; mp_ptr as1, asm1, as2, ash, asmh; mp_ptr bs1, bsm1, bs2, bsh, bsmh; enum toom4_flags flags; TMP_DECL; #define a0 ap #define a1 (ap + n) #define a2 (ap + 2*n) #define a3 (ap + 3*n) #define a4 (ap + 4*n) #define b0 bp #define b1 (bp + n) #define b2 (bp + 2*n) n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3); s = an - 4 * n; t = bn - 2 * n; ASSERT (0 < s && s <= n); ASSERT (0 < t && t <= n); TMP_MARK; as1 = TMP_SALLOC_LIMBS (n + 1); asm1 = TMP_SALLOC_LIMBS (n + 1); as2 = TMP_SALLOC_LIMBS (n + 1); ash = TMP_SALLOC_LIMBS (n + 1); asmh = TMP_SALLOC_LIMBS (n + 1); bs1 = TMP_SALLOC_LIMBS (n + 1); bsm1 = TMP_SALLOC_LIMBS (n + 1); bs2 = TMP_SALLOC_LIMBS (n + 1); bsh = TMP_SALLOC_LIMBS (n + 1); bsmh = TMP_SALLOC_LIMBS (n + 1); gp = pp; hp = pp + n + 1; /* Compute as1 and asm1. */ gp[n] = mpn_add_n (gp, a0, a2, n); gp[n] += mpn_add (gp, gp, n, a4, s); hp[n] = mpn_add_n (hp, a1, a3, n); #if HAVE_NATIVE_mpn_addsub_n if (mpn_cmp (gp, hp, n + 1) < 0) { mpn_addsub_n (as1, asm1, hp, gp, n + 1); vm1_neg = 1; } else { mpn_addsub_n (as1, asm1, gp, hp, n + 1); vm1_neg = 0; } #else mpn_add_n (as1, gp, hp, n + 1); if (mpn_cmp (gp, hp, n + 1) < 0) { mpn_sub_n (asm1, hp, gp, n + 1); vm1_neg = 1; } else { mpn_sub_n (asm1, gp, hp, n + 1); vm1_neg = 0; } #endif /* Compute as2. */ #if !HAVE_NATIVE_mpn_addlsh_n ash[n] = mpn_lshift (ash, a2, n, 2); /* 4a2 */ #endif #if HAVE_NATIVE_mpn_addlsh1_n cy = mpn_addlsh1_n (as2, a3, a4, s); if (s != n) cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy); cy = 2 * cy + mpn_addlsh1_n (as2, a2, as2, n); cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n); as2[n] = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n); #else cy = mpn_lshift (as2, a4, s, 1); cy += mpn_add_n (as2, a3, as2, s); if (s != n) cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy); cy = 4 * cy + mpn_lshift (as2, as2, n, 2); cy += mpn_add_n (as2, a1, as2, n); cy = 2 * cy + mpn_lshift (as2, as2, n, 1); as2[n] = cy + mpn_add_n (as2, a0, as2, n); mpn_add_n (as2, ash, as2, n + 1); #endif /* Compute ash and asmh. */ #if HAVE_NATIVE_mpn_addlsh_n cy = mpn_addlsh_n (gp, a2, a0, n, 2); /* 4a0 + a2 */ cy = 4 * cy + mpn_addlsh_n (gp, a4, gp, n, 2); /* 16a0 + 4a2 + a4 */ /* FIXME s */ gp[n] = cy; cy = mpn_addlsh_n (hp, a3, a1, n, 2); /* 4a1 + a3 */ cy = 2 * cy + mpn_lshift (hp, hp, n, 1); /* 8a1 + 2a3 */ hp[n] = cy; #else gp[n] = mpn_lshift (gp, a0, n, 4); /* 16a0 */ mpn_add (gp, gp, n + 1, a4, s); /* 16a0 + a4 */ mpn_add_n (gp, ash, gp, n+1); /* 16a0 + 4a2 + a4 */ cy = mpn_lshift (hp, a1, n, 3); /* 8a1 */ cy += mpn_lshift (ash, a3, n, 1); /* 2a3 */ cy += mpn_add_n (hp, ash, hp, n); /* 8a1 + 2a3 */ hp[n] = cy; #endif #if HAVE_NATIVE_mpn_addsub_n if (mpn_cmp (gp, hp, n + 1) < 0) { mpn_addsub_n (ash, asmh, hp, gp, n + 1); vmh_neg = 1; } else { mpn_addsub_n (ash, asmh, gp, hp, n + 1); vmh_neg = 0; } #else mpn_add_n (ash, gp, hp, n + 1); if (mpn_cmp (gp, hp, n + 1) < 0) { mpn_sub_n (asmh, hp, gp, n + 1); vmh_neg = 1; } else { mpn_sub_n (asmh, gp, hp, n + 1); vmh_neg = 0; } #endif /* Compute bs1 and bsm1. */ bs1[n] = mpn_add (bs1, b0, n, b2, t); /* b0 + b2 */ #if HAVE_NATIVE_mpn_addsub_n if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0) { bs1[n] = mpn_addsub_n (bs1, bsm1, b1, bs1, n) >> 1; bsm1[n] = 0; vm1_neg ^= 1; }
/* Perform a few steps, using some of mpn_nhgcd2, subtraction and division. Reduces the size by almost one limb or more, but never below the given size s. Return new size for a and b, or 0 if no more steps are possible. M = NULL is allowed, if M is not needed. Needs temporary space for division, n + 1 limbs, and for ngcd_matrix1_vector, n limbs. */ mp_size_t mpn_ngcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s, struct ngcd_matrix *M, mp_ptr tp) { struct ngcd_matrix1 M1; mp_limb_t mask; mp_limb_t ah, al, bh, bl; mp_size_t an, bn, qn; mp_ptr qp; mp_ptr rp; int col; ASSERT (n > s); mask = ap[n-1] | bp[n-1]; ASSERT (mask > 0); if (n == s + 1) { if (mask < 4) goto subtract; ah = ap[n-1]; al = ap[n-2]; bh = bp[n-1]; bl = bp[n-2]; } else if (mask & GMP_NUMB_HIGHBIT) { ah = ap[n-1]; al = ap[n-2]; bh = bp[n-1]; bl = bp[n-2]; } else { int shift; count_leading_zeros (shift, mask); ah = MPN_EXTRACT_LIMB (shift, ap[n-1], ap[n-2]); al = MPN_EXTRACT_LIMB (shift, ap[n-2], ap[n-3]); bh = MPN_EXTRACT_LIMB (shift, bp[n-1], bp[n-2]); bl = MPN_EXTRACT_LIMB (shift, bp[n-2], bp[n-3]); } /* Try an mpn_nhgcd2 step */ if (mpn_nhgcd2 (ah, al, bh, bl, &M1)) { /* Multiply M <- M * M1 */ if (M) ngcd_matrix_mul_1 (M, &M1); /* Multiply M1^{-1} (a;b) */ return mpn_ngcd_matrix1_vector (&M1, n, ap, bp, tp); } subtract: /* There are two ways in which mpn_nhgcd2 can fail. Either one of ah and bh was too small, or ah, bh were (almost) equal. Perform one subtraction step (for possible cancellation of high limbs), followed by one division. */ /* Since we must ensure that #(a-b) > s, we handle cancellation of high limbs explicitly up front. (FIXME: Or is it better to just subtract, normalize, and use an addition to undo if it turns out that the difference is too small?) */ for (an = n; an > s; an--) if (ap[an-1] != bp[an-1]) break; if (an == s) return 0; /* Maintain a > b. When needed, swap a and b, and let col keep track of how to update M. */ if (ap[an-1] > bp[an-1]) { /* a is largest. In the subtraction step, we need to update column 1 of M */ col = 1; } else { MP_PTR_SWAP (ap, bp); col = 0; } bn = n; MPN_NORMALIZE (bp, bn); if (bn <= s) return 0; /* We have #a, #b > s. When is it possible that #(a-b) < s? For cancellation to happen, the numbers must be of the form a = x + 1, 0, ..., 0, al b = x , GMP_NUMB_MAX, ..., GMP_NUMB_MAX, bl where al, bl denote the least significant k limbs. If al < bl, then #(a-b) < k, and if also high(al) != 0, high(bl) != GMP_NUMB_MAX, then #(a-b) = k. If al >= bl, then #(a-b) = k + 1. */ if (ap[an-1] == bp[an-1] + 1) { mp_size_t k; int c; for (k = an-1; k > s; k--) if (ap[k-1] != 0 || bp[k-1] != GMP_NUMB_MAX) break; MPN_CMP (c, ap, bp, k); if (c < 0) { mp_limb_t cy; /* The limbs from k and up are cancelled. */ if (k == s) return 0; cy = mpn_sub_n (ap, ap, bp, k); ASSERT (cy == 1); an = k; } else { ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, k)); ap[k] = 1; an = k + 1; } } else ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, an)); ASSERT (an > s); ASSERT (ap[an-1] > 0); ASSERT (bn > s); ASSERT (bp[bn-1] > 0); if (M) ngcd_matrix_update_1 (M, col); if (an < bn) { MPN_PTR_SWAP (ap, an, bp, bn); col ^= 1; } else if (an == bn) { int c; MPN_CMP (c, ap, bp, an); if (c < 0) { MP_PTR_SWAP (ap, bp); col ^= 1; } } /* Divide a / b. Store first the quotient (qn limbs) and then the remainder (bn limbs) starting at tp. */ qn = an + 1 - bn; qp = tp; rp = tp + qn; /* FIXME: We could use an approximate division, that may return a too small quotient, and only guarantees that the size of r is almost the size of b. 
*/ mpn_tdiv_qr (qp, rp, 0, ap, an, bp, bn); qn -= (qp[qn -1] == 0); /* Normalize remainder */ an = bn; for ( ; an > s; an--) if (rp[an-1] > 0) break; if (an > s) /* Include leading zero limbs */ MPN_COPY (ap, rp, bn); else { /* Quotient is too large */ mp_limb_t cy; cy = mpn_add (ap, bp, bn, rp, an); if (cy > 0) { ASSERT (bn < n); ap[bn] = cy; bp[bn] = 0; bn++; } MPN_DECR_U (qp, qn, 1); qn -= (qp[qn-1] == 0); } if (qn > 0 && M) ngcd_matrix_update_q (M, qp, qn, col); return bn; }