mp_limb_t
mpn_addadd_n (mp_ptr t, mp_srcptr x, mp_srcptr y, mp_srcptr z, mp_size_t n)
{
  mp_limb_t ret;
  mp_srcptr a = x, b = y, c = z;

  ASSERT (n > 0);
  ASSERT_MPN (x, n);
  ASSERT_MPN (y, n);
  ASSERT_MPN (z, n);
  /* ASSERT_SPACE (t, n); */
  ASSERT (MPN_SAME_OR_SEPARATE_P (t, x, n));
  ASSERT (MPN_SAME_OR_SEPARATE_P (t, y, n));
  ASSERT (MPN_SAME_OR_SEPARATE_P (t, z, n));

  if (t == x)
    {
      if (t == y)
        {
          if (t == z)
            {
#ifdef HAVE_NATIVE_mpn_addlsh1_n
              return mpn_addlsh1_n (t, x, y, n);
#else
              return mpn_mul_1 (t, x, n, 3);
#endif
            }
        }
      else
        MP_SRCPTR_SWAP (b, c);
    }
  else
    {
      MP_SRCPTR_SWAP (a, c);
      if (t == y)
        MP_SRCPTR_SWAP (a, b);
    }

  ret = mpn_add_n (t, a, b, n);
  return ret + mpn_add_n (t, t, c, n);
}
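/* Illustrative only: where no native mpn_addlsh1_n exists, the
   {rp,n} = {ap,n} + 2*{bp,n} primitive that this and the following
   routines lean on can be emulated one limb at a time.  A minimal
   standalone sketch with a hypothetical helper name, assuming 64-bit
   limbs and no nail bits; the library instead composes
   mpn_add_n/mpn_lshift as in the fallback branches above. */
#include <stddef.h>
#include <stdint.h>

static uint64_t
addlsh1_fallback (uint64_t *rp, const uint64_t *ap, const uint64_t *bp, size_t n)
{
  uint64_t cy = 0;                      /* running carry, at most 2 */
  for (size_t i = 0; i < n; i++)
    {
      uint64_t b2 = bp[i] << 1;         /* low part of 2*bp[i] */
      uint64_t hi = bp[i] >> 63;        /* bit shifted out of the limb */
      uint64_t s = ap[i] + b2;
      uint64_t c = (s < ap[i]);         /* carry from the first add */
      uint64_t r = s + cy;
      c += (r < s);                     /* carry from adding cy */
      rp[i] = r;
      cy = c + hi;
    }
  return cy;                            /* 0, 1 or 2, like mpn_addlsh1_n */
}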
void
mpz_lucnum2_ui (mpz_ptr ln, mpz_ptr lnsub1, unsigned long n)
{
  mp_ptr lp, l1p, f1p;
  mp_size_t size;
  mp_limb_t c;
  TMP_DECL;

  ASSERT (ln != lnsub1);

  /* handle small n quickly, and hide the special case for L[-1]=-1 */
  if (n <= FIB_TABLE_LUCNUM_LIMIT)
    {
      mp_limb_t f = FIB_TABLE (n);
      mp_limb_t f1 = FIB_TABLE ((int) n - 1);

      /* L[n] = F[n] + 2F[n-1] */
      PTR(ln)[0] = f + 2 * f1;
      SIZ(ln) = 1;

      /* L[n-1] = 2F[n] - F[n-1], but allow for L[-1]=-1 */
      PTR(lnsub1)[0] = (n == 0 ? 1 : 2 * f - f1);
      SIZ(lnsub1) = (n == 0 ? -1 : 1);
      return;
    }

  TMP_MARK;
  size = MPN_FIB2_SIZE (n);
  f1p = TMP_ALLOC_LIMBS (size);

  MPZ_REALLOC (ln, size + 1);
  MPZ_REALLOC (lnsub1, size + 1);
  lp = PTR(ln);
  l1p = PTR(lnsub1);

  size = mpn_fib2_ui (l1p, f1p, n);

  /* L[n] = F[n] + 2F[n-1] */
#if HAVE_NATIVE_mpn_addlsh1_n
  c = mpn_addlsh1_n (lp, l1p, f1p, size);
#else
  c = mpn_lshift1 (lp, f1p, size);
  c += mpn_add_n (lp, lp, l1p, size);
#endif
  lp[size] = c;
  SIZ(ln) = size + (c != 0);

  /* L[n-1] = 2F[n] - F[n-1] */
  c = mpn_double (l1p, size);
  c -= mpn_sub_n (l1p, l1p, f1p, size);
  ASSERT ((mp_limb_signed_t) c >= 0);
  l1p[size] = c;
  SIZ(lnsub1) = size + (c != 0);

  TMP_FREE;
}
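/* Illustrative only: the two identities used above, L[n] = F[n] + 2*F[n-1]
   and L[n-1] = 2*F[n] - F[n-1] (with F[-1] = 1 and L[-1] = -1), checked on
   machine integers for small n.  Standalone sketch, not part of the
   library. */
#include <assert.h>

int
main (void)
{
  long f1 = 1, f = 0;           /* F[n-1], F[n] starting at n = 0 */
  long l1 = -1, l = 2;          /* L[n-1], L[n] starting at n = 0 */
  int n;
  for (n = 0; n < 40; n++)
    {
      assert (l == f + 2 * f1);         /* L[n]   = F[n] + 2F[n-1] */
      assert (l1 == 2 * f - f1);        /* L[n-1] = 2F[n] - F[n-1] */
      long ft = f + f1;  f1 = f;  f = ft;
      long lt = l + l1;  l1 = l;  l = lt;
    }
  return 0;
}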
void
tc4_addlsh1_unsigned (mp_ptr rp, mp_size_t *rn, mp_srcptr xp, mp_size_t xn)
{
  if (xn)
    {
      if (xn >= *rn)
        {
          mp_limb_t cy;
          if (xn > *rn)
            MPN_ZERO (rp + *rn, xn - *rn);
#if HAVE_NATIVE_mpn_addlsh1_n
          cy = mpn_addlsh1_n (rp, rp, xp, xn);
#else
          cy = mpn_add_n (rp, rp, xp, xn);
          cy += mpn_add_n (rp, rp, xp, xn);
#endif
          if (cy)
            {
              rp[xn] = cy;
              *rn = xn + 1;
            }
          else
            *rn = xn;
        }
      else
        {
          mp_limb_t cy;
#if HAVE_NATIVE_mpn_addlsh1_n
          cy = mpn_addlsh1_n (rp, rp, xp, xn);
#else
          cy = mpn_add_n (rp, rp, xp, xn);
          cy += mpn_add_n (rp, rp, xp, xn);
#endif
          if (cy)
            cy = mpn_add_1 (rp + xn, rp + xn, *rn - xn, cy);
          if (cy)
            {
              rp[*rn] = cy;
              (*rn)++;
            }
        }
    }
}
/* Define our own squaring function, which uses mpn_sqr_basecase for its
   allowed sizes, but its own code for larger sizes.  */
static void
mpn_local_sqr (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tp)
{
  mp_size_t i;

  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (rp, 2*n, up, n));

  if (BELOW_THRESHOLD (n, SQR_BASECASE_LIM))
    {
      mpn_sqr_basecase (rp, up, n);
      return;
    }

  {
    mp_limb_t ul, lpl;
    ul = up[0];
    umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
    rp[0] = lpl >> GMP_NAIL_BITS;
  }
  if (n > 1)
    {
      mp_limb_t cy;

      cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
      tp[n - 1] = cy;
      for (i = 2; i < n; i++)
        {
          mp_limb_t cy;
          cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
          tp[n + i - 2] = cy;
        }
      MPN_SQR_DIAGONAL (rp + 2, up + 1, n - 1);

      {
        mp_limb_t cy;
#if HAVE_NATIVE_mpn_addlsh1_n
        cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);
#else
        cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
        cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);
#endif
        rp[2 * n - 1] += cy;
      }
    }
}
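/* Illustrative only: the structure above -- diagonal squares plus the
   doubled off-diagonal products (the doubling is what mpn_addlsh1_n fuses
   into the final addition) -- in miniature, squaring a 64-bit value split
   into two 32-bit halves.  All arithmetic is mod 2^64.  Standalone sketch,
   not part of the library. */
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint64_t u = 0x1234567890abcdefULL;
  uint64_t a0 = u & 0xffffffffULL, a1 = u >> 32;

  /* u^2 = a0^2 + 2*a0*a1*2^32 + a1^2*2^64; the last term vanishes mod 2^64
     and <<33 performs the doubling and the shift in one step. */
  uint64_t sq = a0 * a0 + ((a0 * a1) << 33);
  assert (sq == u * u);
  return 0;
}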
void
mpn_toom53_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg, vmh_neg;
  mp_limb_t cy;
  mp_ptr gp, hp;
  mp_ptr as1, asm1, as2, ash, asmh;
  mp_ptr bs1, bsm1, bs2, bsh, bsmh;
  enum toom4_flags flags;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3);

  s = an - 4 * n;
  t = bn - 2 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  as1  = TMP_SALLOC_LIMBS (n + 1);
  asm1 = TMP_SALLOC_LIMBS (n + 1);
  as2  = TMP_SALLOC_LIMBS (n + 1);
  ash  = TMP_SALLOC_LIMBS (n + 1);
  asmh = TMP_SALLOC_LIMBS (n + 1);
  bs1  = TMP_SALLOC_LIMBS (n + 1);
  bsm1 = TMP_SALLOC_LIMBS (n + 1);
  bs2  = TMP_SALLOC_LIMBS (n + 1);
  bsh  = TMP_SALLOC_LIMBS (n + 1);
  bsmh = TMP_SALLOC_LIMBS (n + 1);

  gp = pp;
  hp = pp + n + 1;

  /* Compute as1 and asm1.  */
  gp[n]  = mpn_add_n (gp, a0, a2, n);
  gp[n] += mpn_add (gp, gp, n, a4, s);
  hp[n]  = mpn_add_n (hp, a1, a3, n);
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_addsub_n (as1, asm1, hp, gp, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_addsub_n (as1, asm1, gp, hp, n + 1);
      vm1_neg = 0;
    }
#else
  mpn_add_n (as1, gp, hp, n + 1);
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_sub_n (asm1, hp, gp, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_sub_n (asm1, gp, hp, n + 1);
      vm1_neg = 0;
    }
#endif

  /* Compute as2.  */
#if !HAVE_NATIVE_mpn_addlsh_n
  ash[n] = mpn_lshift (ash, a2, n, 2);                  /* 4a2 */
#endif
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (as2, a3, a4, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a2, as2, n);
  cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
  as2[n] = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy = mpn_lshift (as2, a4, s, 1);
  cy += mpn_add_n (as2, a3, as2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy);
  cy = 4 * cy + mpn_lshift (as2, as2, n, 2);
  cy += mpn_add_n (as2, a1, as2, n);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  as2[n] = cy + mpn_add_n (as2, a0, as2, n);
  mpn_add_n (as2, ash, as2, n + 1);
#endif

  /* Compute ash and asmh.  */
#if HAVE_NATIVE_mpn_addlsh_n
  cy = mpn_addlsh_n (gp, a2, a0, n, 2);                 /* 4a0 + a2 */
  cy = 4 * cy + mpn_addlsh_n (gp, a4, gp, n, 2);        /* 16a0 + 4a2 + a4 */
  /* FIXME s */
  gp[n] = cy;
  cy = mpn_addlsh_n (hp, a3, a1, n, 2);                 /* 4a1 + a3 */
  cy = 2 * cy + mpn_lshift (hp, hp, n, 1);              /* 8a1 + 2a3 */
  hp[n] = cy;
#else
  gp[n] = mpn_lshift (gp, a0, n, 4);                    /* 16a0 */
  mpn_add (gp, gp, n + 1, a4, s);                       /* 16a0 + a4 */
  mpn_add_n (gp, ash, gp, n + 1);                       /* 16a0 + 4a2 + a4 */
  cy  = mpn_lshift (hp, a1, n, 3);                      /* 8a1 */
  cy += mpn_lshift (ash, a3, n, 1);                     /* 2a3 */
  cy += mpn_add_n (hp, ash, hp, n);                     /* 8a1 + 2a3 */
  hp[n] = cy;
#endif
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_addsub_n (ash, asmh, hp, gp, n + 1);
      vmh_neg = 1;
    }
  else
    {
      mpn_addsub_n (ash, asmh, gp, hp, n + 1);
      vmh_neg = 0;
    }
#else
  mpn_add_n (ash, gp, hp, n + 1);
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_sub_n (asmh, hp, gp, n + 1);
      vmh_neg = 1;
    }
  else
    {
      mpn_sub_n (asmh, gp, hp, n + 1);
      vmh_neg = 0;
    }
#endif

  /* Compute bs1 and bsm1.  */
  bs1[n] = mpn_add (bs1, b0, n, b2, t);                 /* b0 + b2 */
#if HAVE_NATIVE_mpn_addsub_n
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      bs1[n] = mpn_addsub_n (bs1, bsm1, b1, bs1, n) >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
void
mpn_toom3_sqr_n (mp_ptr c, mp_srcptr a, mp_size_t n, mp_ptr t)
{
  mp_size_t k, k1, kk1, r, twok, twor;
  mp_limb_t cy, saved, vinf0, cinf0;
  mp_ptr trec;
  int sa;
  mp_ptr c1, c2, c3, c4;

  ASSERT(GMP_NUMB_BITS >= 6);
  ASSERT(n >= 17);              /* so that r <> 0 and 5k+3 <= 2n */

  /* the algorithm is the same as mpn_mul_n_tc3, with b=a */

  k = (n + 2) / 3;              /* ceil(n/3) */
  twok = 2 * k;
  k1 = k + 1;
  kk1 = k + k1;
  r = n - twok;                 /* last chunk */
  twor = 2 * r;

  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;

  trec = t + 4 * k + 3;         /* trec = v2 + (2k+2) */

  cy = mpn_add_n (c, a, a + twok, r);
  if (r < k)
    __GMPN_ADD_1 (cy, c + r, a + r, k - r, cy);
  c3[2] = (c1[0] = cy) + mpn_add_n (c2 + 2, c, a + k, k);

#define v2 (t + 2*k + 1)
#define vinf (t + 4*k + 2)

  TOOM3_SQR_REC (t, c2 + 2, k1, trec);

  sa = (c[k] != 0) ? 1 : mpn_cmp (c, a + k, k);
  c[k] = (sa >= 0) ? c[k] - mpn_sub_n (c, c, a + k, k)
                   : mpn_sub_n (c, a + k, c, k);

  TOOM3_SQR_REC (c2, c, k1, trec);

#ifdef HAVE_NATIVE_mpn_addlsh1_n
  c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
  if (r < k)
    __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
#else
  c[r] = mpn_lshift (c, a + twok, r, 1);
  if (r < k)
    MPN_ZERO (c + r + 1, k - r);
  c1[0] += mpn_add_n (c, c, a + k, k);
  mpn_lshift (c, c, k1, 1);
  c1[0] += mpn_add_n (c, c, a, k);
#endif

  TOOM3_SQR_REC (v2, c, k1, trec);

  TOOM3_SQR_REC (c, a, k, trec);

#ifdef HAVE_NATIVE_mpn_addlsh1_n
  mpn_addlsh1_n (v2, v2, c2, kk1);
#else
  mpn_lshift (t + 4 * k + 2, c2, kk1, 1);
  mpn_add_n (v2, v2, t + 4 * k + 2, kk1);
#endif

  saved = c4[0];
  TOOM3_SQR_REC (c4, a + twok, r, trec);
  cinf0 = mpn_add_n (vinf, c4, c, twor);
  vinf0 = c4[0];
  c4[0] = saved;

  toom3_interpolate (c, t, v2, c2, vinf, k, r, 1, vinf0, cinf0, vinf + twor);

#undef v2
#undef vinf
}
void
mpn_toom3_mul_n (mp_ptr c, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr t)
{
  mp_size_t k, k1, kk1, r, twok, twor;
  mp_limb_t cy, cc, saved, vinf0, cinf0;
  mp_ptr trec;
  int sa, sb;
  mp_ptr c1, c2, c3, c4, c5;

  ASSERT(GMP_NUMB_BITS >= 6);
  ASSERT(n >= 17);              /* so that r <> 0 and 5k+3 <= 2n */

  /* The algorithm is the following:

     0. k = ceil(n/3), r = n - 2k, B = 2^(GMP_NUMB_BITS), t = B^k
     1. split a and b in three parts each a0, a1, a2 and b0, b1, b2
        with a0, a1, b0, b1 of k limbs, and a2, b2 of r limbs
     2. v0   <- a0*b0
        v1   <- (a0+a1+a2)*(b0+b1+b2)
        v2   <- (a0+2*a1+4*a2)*(b0+2*b1+4*b2)
        vm1  <- (a0-a1+a2)*(b0-b1+b2)
        vinf <- a2*b2
        t1   <- (3*v0+2*vm1+v2)/6-2*vinf
        t2   <- (v1+vm1)/2
     3. result is c0+c1*t+c2*t^2+c3*t^3+c4*t^4 where
        c0   <- v0
        c1   <- v1 - t1
        c2   <- t2 - v0 - vinf
        c3   <- t1 - t2
        c4   <- vinf
  */

  k = (n + 2) / 3;              /* ceil(n/3) */
  twok = 2 * k;
  k1 = k + 1;
  kk1 = k + k1;
  r = n - twok;                 /* last chunk */
  twor = 2 * r;

  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  c5 = c4 + k;

  trec = t + 4 * k + 3;         /* trec = v2 + (2k+2) */

  /* put a0+a2 in {c, k+1}, and b0+b2 in {c+k+1, k+1};
     put a0+a1+a2 in {c+2k+2, k+1} and b0+b1+b2 in {c+3k+3, k+1}
     [requires 4k+4 <= 2n, ie. n >= 8] */
  cy = mpn_add_n (c, a, a + twok, r);
  cc = mpn_add_n (c1 + 1, b, b + twok, r);
  if (r < k)
    {
      __GMPN_ADD_1 (cy, c + r, a + r, k - r, cy);
      __GMPN_ADD_1 (cc, c1 + 1 + r, b + r, k - r, cc);
    }
  c3[2] = (c1[0] = cy) + mpn_add_n (c2 + 2, c, a + k, k);
  c4[3] = (c2[1] = cc) + mpn_add_n (c3 + 3, c1 + 1, b + k, k);

#define v2 (t + 2*k + 1)
#define vinf (t + 4*k + 2)

  /* compute v1 := (a0+a1+a2)*(b0+b1+b2) in {t, 2k+1};
     since v1 < 9*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 4 */
  TOOM3_MUL_REC (t, c2 + 2, c3 + 3, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
                                       v1
  */

  /* put |a0-a1+a2| in {c, k+1} and |b0-b1+b2| in {c+4k+2, k+1} */
  /* sa = sign(a0-a1+a2) */
  sa = (c[k] != 0) ? 1 : mpn_cmp (c, a + k, k);
  c[k] = (sa >= 0) ? c[k] - mpn_sub_n (c, c, a + k, k)
                   : mpn_sub_n (c, a + k, c, k);
  /* b0+b2 is in {c+k+1, k+1} now */
  sb = (c2[1] != 0) ? 1 : mpn_cmp (c1 + 1, b + k, k);
  c5[2] = (sb >= 0) ? c2[1] - mpn_sub_n (c4 + 2, c1 + 1, b + k, k)
                    : mpn_sub_n (c4 + 2, b + k, c1 + 1, k);
  sa *= sb;                     /* sign of vm1 */

  /* compute vm1 := (a0-a1+a2)*(b0-b1+b2) in {c+2k, 2k+1};
     since |vm1| < 4*B^(2k), vm1 uses only 2k+1 limbs */
  TOOM3_MUL_REC (c2, c, c4 + 2, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
             vm1                       v1
  */

  /* compute a0+2a1+4a2 in {c, k+1} and b0+2b1+4b2 in {c+4k+2, k+1}
     [requires 5k+3 <= 2n, i.e. n >= 17] */
#ifdef HAVE_NATIVE_mpn_addlsh1_n
  c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
  c5[2] = mpn_addlsh1_n (c4 + 2, b + k, b + twok, r);
  if (r < k)
    {
      __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
      __GMPN_ADD_1 (c5[2], c4 + 2 + r, b + k + r, k - r, c5[2]);
    }
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
  c5[2] = 2 * c5[2] + mpn_addlsh1_n (c4 + 2, b, c4 + 2, k);
#else
  c[r] = mpn_lshift (c, a + twok, r, 1);
  c4[r + 2] = mpn_lshift (c4 + 2, b + twok, r, 1);
  if (r < k)
    {
      MPN_ZERO (c + r + 1, k - r);
      MPN_ZERO (c4 + r + 3, k - r);
    }
  c1[0] += mpn_add_n (c, c, a + k, k);
  c5[2] += mpn_add_n (c4 + 2, c4 + 2, b + k, k);
  mpn_lshift (c, c, k1, 1);
  mpn_lshift (c4 + 2, c4 + 2, k1, 1);
  c1[0] += mpn_add_n (c, c, a, k);
  c5[2] += mpn_add_n (c4 + 2, c4 + 2, b, k);
#endif

  /* compute v2 := (a0+2a1+4a2)*(b0+2b1+4b2) in {t+2k+1, 2k+1}
     v2 < 49*B^k so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */
  TOOM3_MUL_REC (v2, c, c4 + 2, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
             vm1                       v1       v2
  */

  /* compute v0 := a0*b0 in {c, 2k} */
  TOOM3_MUL_REC (c, a, b, k, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0      vm1                       v1       v2
  */

  /* now compute (3v0+2vm1+v2)/6 = [v0 + (2vm1+v2)/3]/2
     v2 <- v2+2vm1 = 3*(a0*b0+2*a0*b2+2*a1*b1+2*a1*b2+2*a2*b0+2*a2*b1+6*a2*b2),
     thus 0 <= v2 < 51*B^(2k) < 2^6*B^(2k)
     Uses temporary space {t+4k+2,2k+1}, requires T(n) >= 6k+3.
  */
  if (sa >= 0)
    {
#ifdef HAVE_NATIVE_mpn_addlsh1_n
      mpn_addlsh1_n (v2, v2, c2, kk1);
#else
      /* we can use vinf=t+4k+2 as workspace since it is not full yet */
      mpn_lshift (vinf, c2, kk1, 1);
      mpn_add_n (v2, v2, vinf, kk1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_sublsh1_n
      mpn_sublsh1_n (v2, v2, c2, kk1);
#else
      /* we can use vinf=t+4k+2 as workspace since it is not full yet */
      mpn_lshift (vinf, c2, kk1, 1);
      mpn_sub_n (v2, v2, vinf, kk1);
#endif
    }

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0      vm1                       v1       v2+2vm1
  */

  /* compute vinf := a2*b2 in {t+4k+2, 2r}: first put it in {c4, 2r},
     then copy it in {t+4k+2, 2r} */
  saved = c4[0];
  TOOM3_MUL_REC (c4, a + twok, b + twok, r, trec);
  cinf0 = mpn_add_n (vinf, c4, c, twor);        /* {v0,2r} + {vinf,2r} */
  vinf0 = c4[0];
  c4[0] = saved;

  toom3_interpolate (c, t, v2, c2, vinf, k, r, sa, vinf0, cinf0, vinf + twor);

#undef v2
#undef vinf
}
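/* Illustrative only: the five Toom-3 point evaluations and the t1/t2
   interpolation formulas from the comment above, checked on small scalar
   "digits" in base B = 100.  Everything fits in int64_t and both divisions
   are exact; names are ad hoc, not library code. */
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  const int64_t B = 100;
  int64_t a0 = 12, a1 = 34, a2 = 56;    /* a = a0 + a1*B + a2*B^2 */
  int64_t b0 = 78, b1 = 90, b2 = 21;

  int64_t v0   = a0 * b0;
  int64_t v1   = (a0 + a1 + a2) * (b0 + b1 + b2);
  int64_t vm1  = (a0 - a1 + a2) * (b0 - b1 + b2);
  int64_t v2   = (a0 + 2*a1 + 4*a2) * (b0 + 2*b1 + 4*b2);
  int64_t vinf = a2 * b2;

  int64_t t1 = (3*v0 + 2*vm1 + v2) / 6 - 2*vinf;        /* exact */
  int64_t t2 = (v1 + vm1) / 2;                          /* exact */

  int64_t c0 = v0, c1 = v1 - t1, c2 = t2 - v0 - vinf,
          c3 = t1 - t2, c4 = vinf;

  int64_t a = a0 + a1*B + a2*B*B, b = b0 + b1*B + b2*B*B;
  assert (c0 + c1*B + c2*B*B + c3*B*B*B + c4*B*B*B*B == a * b);
  return 0;
}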
void
mpn_toom4_mul_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
{
  mp_size_t ind;
  mp_limb_t cy, cy2, r30, r31;
  mp_ptr tp;
  mp_size_t sn, n1, n2, n3, n4, n5, n6, n7, n8, rpn, t4, h1;
  TMP_DECL;

  sn = (n + 3) / 4;

  h1 = n - 3 * sn;

#define a0 (up)
#define a1 (up + sn)
#define a2 (up + 2*sn)
#define a3 (up + 3*sn)
#define b0 (vp)
#define b1 (vp + sn)
#define b2 (vp + 2*sn)
#define b3 (vp + 3*sn)

  t4 = 2 * sn + 2;              /* allows mult of 2 integers of sn + 1 limbs */

  TMP_MARK;

  tp = TMP_ALLOC_LIMBS (4 * t4 + 5 * (sn + 1));

#define u2 (tp + 4*t4)
#define u3 (tp + 4*t4 + (sn+1))
#define u4 (tp + 4*t4 + 2*(sn+1))
#define u5 (tp + 4*t4 + 3*(sn+1))
#define u6 (tp + 4*t4 + 4*(sn+1))

  u6[sn] = mpn_add (u6, a1, sn, a3, h1);
  u5[sn] = mpn_add_n (u5, a2, a0, sn);
  mpn_add_n (u3, u5, u6, sn + 1);
  n4 = sn + 1;
  if (mpn_cmp (u5, u6, sn + 1) >= 0)
    mpn_sub_n (u4, u5, u6, sn + 1);
  else
    {
      mpn_sub_n (u4, u6, u5, sn + 1);
      n4 = -n4;
    }

  u6[sn] = mpn_add (u6, b1, sn, b3, h1);
  u5[sn] = mpn_add_n (u5, b2, b0, sn);
  mpn_add_n (r2, u5, u6, sn + 1);
  n5 = sn + 1;
  if (mpn_cmp (u5, u6, sn + 1) >= 0)
    mpn_sub_n (u5, u5, u6, sn + 1);
  else
    {
      mpn_sub_n (u5, u6, u5, sn + 1);
      n5 = -n5;
    }

  MUL_TC4_UNSIGNED (r3, n3, u3, sn + 1, r2, sn + 1);    /* 1 */
  MUL_TC4 (r4, n4, u4, n4, u5, n5);                     /* -1 */

#if HAVE_NATIVE_mpn_addlsh_n
  r1[sn] = mpn_addlsh_n (r1, a2, a0, sn, 2);
  mpn_lshift (r1, r1, sn + 1, 1);
  cy = mpn_addlsh_n (r2, a3, a1, h1, 2);
#else
  r1[sn] = mpn_lshift (r1, a2, sn, 1);
  MPN_COPY (r2, a3, h1);
  r1[sn] += mpn_addmul_1 (r1, a0, sn, 8);
  cy = mpn_addmul_1 (r2, a1, h1, 4);
#endif
  if (sn > h1)
    {
      cy2 = mpn_lshift (r2 + h1, a1 + h1, sn - h1, 2);
      cy = cy2 + mpn_add_1 (r2 + h1, r2 + h1, sn - h1, cy);
    }
  r2[sn] = cy;
  mpn_add_n (u5, r1, r2, sn + 1);
  n6 = sn + 1;
  if (mpn_cmp (r1, r2, sn + 1) >= 0)
    mpn_sub_n (u6, r1, r2, sn + 1);
  else
    {
      mpn_sub_n (u6, r2, r1, sn + 1);
      n6 = -n6;
    }

#if HAVE_NATIVE_mpn_addlsh_n
  r1[sn] = mpn_addlsh_n (r1, b2, b0, sn, 2);
  mpn_lshift (r1, r1, sn + 1, 1);
  cy = mpn_addlsh_n (r2, b3, b1, h1, 2);
#else
  r1[sn] = mpn_lshift (r1, b2, sn, 1);
  MPN_COPY (r2, b3, h1);
  r1[sn] += mpn_addmul_1 (r1, b0, sn, 8);
  cy = mpn_addmul_1 (r2, b1, h1, 4);
#endif
  if (sn > h1)
    {
      cy2 = mpn_lshift (r2 + h1, b1 + h1, sn - h1, 2);
      cy = cy2 + mpn_add_1 (r2 + h1, r2 + h1, sn - h1, cy);
    }
  r2[sn] = cy;
  mpn_add_n (u2, r1, r2, sn + 1);
  n8 = sn + 1;
  if (mpn_cmp (r1, r2, sn + 1) >= 0)
    mpn_sub_n (r2, r1, r2, sn + 1);
  else
    {
      mpn_sub_n (r2, r2, r1, sn + 1);
      n8 = -n8;
    }

  r30 = r3[0];
  r31 = r3[1];
  MUL_TC4_UNSIGNED (r5, n5, u5, sn + 1, u2, sn + 1);    /* 1/2 */
  MUL_TC4 (r6, n6, u6, n6, r2, n8);                     /* -1/2 */
  r3[1] = r31;

#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (u2, a2, a3, h1);
  if (sn > h1)
    cy = mpn_add_1 (u2 + h1, a2 + h1, sn - h1, cy);
  u2[sn] = cy;
  u2[sn] = 2 * u2[sn] + mpn_addlsh1_n (u2, a1, u2, sn);
  u2[sn] = 2 * u2[sn] + mpn_addlsh1_n (u2, a0, u2, sn);
#else
  MPN_COPY (u2, a0, sn);
  u2[sn] = mpn_addmul_1 (u2, a1, sn, 2);
  u2[sn] += mpn_addmul_1 (u2, a2, sn, 4);
  cy = mpn_addmul_1 (u2, a3, h1, 8);
  if (sn > h1)
    cy = mpn_add_1 (u2 + h1, u2 + h1, sn - h1, cy);
  u2[sn] += cy;
#endif

#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (r1, b2, b3, h1);
  if (sn > h1)
    cy = mpn_add_1 (r1 + h1, b2 + h1, sn - h1, cy);
  r1[sn] = cy;
  r1[sn] = 2 * r1[sn] + mpn_addlsh1_n (r1, b1, r1, sn);
  r1[sn] = 2 * r1[sn] + mpn_addlsh1_n (r1, b0, r1, sn);
#else
  MPN_COPY (r1, b0, sn);
  r1[sn] = mpn_addmul_1 (r1, b1, sn, 2);
  r1[sn] += mpn_addmul_1 (r1, b2, sn, 4);
  cy = mpn_addmul_1 (r1, b3, h1, 8);
  if (sn > h1)
    cy = mpn_add_1 (r1 + h1, r1 + h1, sn - h1, cy);
  r1[sn] += cy;
#endif

  MUL_TC4_UNSIGNED (r2, n2, u2, sn + 1, r1, sn + 1);    /* 2 */

  MUL_TC4_UNSIGNED (r1, n1, a3, h1, b3, h1);            /* oo */
  MUL_TC4_UNSIGNED (r7, n7, a0, sn, b0, sn);            /* 0 */

  TC4_DENORM (r1, n1, t4 - 1);

  /*  rp    rp1   rp2   rp3   rp4   rp5   rp6   rp7
      <----------- r7 -----------><------------ r5 -------------->
            <------------- r3 ------------->
            <------------- r6 ------------->    <----------- r2 ----------->{ }
                  <------------- r4 -------------->
                  <-------------- r1 ---->
  */

  mpn_toom4_interpolate (rp, &rpn, sn, tp, t4 - 1, n4, n6, r30);

  if (rpn != 2 * n)
    MPN_ZERO (rp + rpn, 2 * n - rpn);

  TMP_FREE;
}
void
mpn_toom53_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  mp_limb_t cy;
  mp_ptr gp;
  mp_ptr as1, asm1, as2, asm2, ash;
  mp_ptr bs1, bsm1, bs2, bsm2, bsh;
  mp_ptr tmp;
  enum toom7_flags flags;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3);

  s = an - 4 * n;
  t = bn - 2 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  tmp = TMP_ALLOC_LIMBS (10 * (n + 1));
  as1  = tmp; tmp += n + 1;
  asm1 = tmp; tmp += n + 1;
  as2  = tmp; tmp += n + 1;
  asm2 = tmp; tmp += n + 1;
  ash  = tmp; tmp += n + 1;
  bs1  = tmp; tmp += n + 1;
  bsm1 = tmp; tmp += n + 1;
  bs2  = tmp; tmp += n + 1;
  bsm2 = tmp; tmp += n + 1;
  bsh  = tmp; tmp += n + 1;

  gp = pp;

  /* Compute as1 and asm1.  */
  flags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, gp));

  /* Compute as2 and asm2.  */
  flags = (enum toom7_flags) (flags | toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, gp));

  /* Compute ash = 16 a0 + 8 a1 + 4 a2 + 2 a3 + a4
     = 2*(2*(2*(2*a0 + a1) + a2) + a3) + a4  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (ash, a1, a0, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
  if (s < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (ash, a4, ash, s);
      ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
      MPN_INCR_U (ash + s, n+1-s, cy2);
    }
  else
    ash[n] = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
#else
  cy = mpn_lshift (ash, a0, n, 1);
  cy += mpn_add_n (ash, ash, a1, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a2, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a3, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  ash[n] = cy + mpn_add (ash, ash, n, a4, s);
#endif

  /* Compute bs1 and bsm1.  */
  bs1[n] = mpn_add (bs1, b0, n, b2, t);         /* b0 + b2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      bs1[n] = mpn_add_n_sub_n (bs1, bsm1, b1, bs1, n) >> 1;
      bsm1[n] = 0;
      flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
    }
void
check (void)
{
  mp_limb_t  wp[100], xp[100], yp[100];
  mp_size_t  size = 100;

  refmpn_zero (xp, size);
  refmpn_zero (yp, size);
  refmpn_zero (wp, size);

  pre ("mpn_add_n");
  mpn_add_n (wp, xp, yp, size);
  post ();

#if HAVE_NATIVE_mpn_add_nc
  pre ("mpn_add_nc");
  mpn_add_nc (wp, xp, yp, size, CNST_LIMB(0));
  post ();
#endif

#if HAVE_NATIVE_mpn_addlsh1_n
  pre ("mpn_addlsh1_n");
  mpn_addlsh1_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_and_n
  pre ("mpn_and_n");
  mpn_and_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_andn_n
  pre ("mpn_andn_n");
  mpn_andn_n (wp, xp, yp, size);
  post ();
#endif

  pre ("mpn_addmul_1");
  mpn_addmul_1 (wp, xp, size, yp[0]);
  post ();

#if HAVE_NATIVE_mpn_addmul_1c
  pre ("mpn_addmul_1c");
  mpn_addmul_1c (wp, xp, size, yp[0], CNST_LIMB(0));
  post ();
#endif

#if HAVE_NATIVE_mpn_com_n
  pre ("mpn_com_n");
  mpn_com_n (wp, xp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_copyd
  pre ("mpn_copyd");
  mpn_copyd (wp, xp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_copyi
  pre ("mpn_copyi");
  mpn_copyi (wp, xp, size);
  post ();
#endif

  pre ("mpn_divexact_1");
  mpn_divexact_1 (wp, xp, size, CNST_LIMB(123));
  post ();

  pre ("mpn_divexact_by3c");
  mpn_divexact_by3c (wp, xp, size, CNST_LIMB(0));
  post ();

  pre ("mpn_divrem_1");
  mpn_divrem_1 (wp, (mp_size_t) 0, xp, size, CNST_LIMB(123));
  post ();

#if HAVE_NATIVE_mpn_divrem_1c
  pre ("mpn_divrem_1c");
  mpn_divrem_1c (wp, (mp_size_t) 0, xp, size, CNST_LIMB(123), CNST_LIMB(122));
  post ();
#endif

  pre ("mpn_gcd_1");
  xp[0] |= 1;
  notdead += (unsigned long) mpn_gcd_1 (xp, size, CNST_LIMB(123));
  post ();

#if HAVE_NATIVE_mpn_gcd_finda
  pre ("mpn_gcd_finda");
  xp[0] |= 1;
  xp[1] |= 1;
  notdead += mpn_gcd_finda (xp);
  post ();
#endif

  pre ("mpn_hamdist");
  notdead += mpn_hamdist (xp, yp, size);
  post ();

#if HAVE_NATIVE_mpn_ior_n
  pre ("mpn_ior_n");
  mpn_ior_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_iorn_n
  pre ("mpn_iorn_n");
  mpn_iorn_n (wp, xp, yp, size);
  post ();
#endif

  pre ("mpn_lshift");
  mpn_lshift (wp, xp, size, 1);
  post ();

  pre ("mpn_mod_1");
  notdead += mpn_mod_1 (xp, size, CNST_LIMB(123));
  post ();

#if HAVE_NATIVE_mpn_mod_1c
  pre ("mpn_mod_1c");
  notdead += mpn_mod_1c (xp, size, CNST_LIMB(123), CNST_LIMB(122));
  post ();
#endif

#if GMP_NUMB_BITS % 4 == 0
  pre ("mpn_mod_34lsub1");
  notdead += mpn_mod_34lsub1 (xp, size);
  post ();
#endif

  pre ("mpn_modexact_1_odd");
  notdead += mpn_modexact_1_odd (xp, size, CNST_LIMB(123));
  post ();

  pre ("mpn_modexact_1c_odd");
  notdead += mpn_modexact_1c_odd (xp, size, CNST_LIMB(123), CNST_LIMB(456));
  post ();

  pre ("mpn_mul_1");
  mpn_mul_1 (wp, xp, size, yp[0]);
  post ();

#if HAVE_NATIVE_mpn_mul_1c
  pre ("mpn_mul_1c");
  mpn_mul_1c (wp, xp, size, yp[0], CNST_LIMB(0));
  post ();
#endif

#if HAVE_NATIVE_mpn_mul_2
  pre ("mpn_mul_2");
  mpn_mul_2 (wp, xp, size-1, yp);
  post ();
#endif

  pre ("mpn_mul_basecase");
  mpn_mul_basecase (wp, xp, (mp_size_t) 3, yp, (mp_size_t) 3);
  post ();

#if HAVE_NATIVE_mpn_nand_n
  pre ("mpn_nand_n");
  mpn_nand_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_nior_n
  pre ("mpn_nior_n");
  mpn_nior_n (wp, xp, yp, size);
  post ();
#endif

  pre ("mpn_popcount");
  notdead += mpn_popcount (xp, size);
  post ();

  pre ("mpn_preinv_mod_1");
  notdead += mpn_preinv_mod_1 (xp, size, GMP_NUMB_MAX,
                               refmpn_invert_limb (GMP_NUMB_MAX));
  post ();

#if USE_PREINV_DIVREM_1 || HAVE_NATIVE_mpn_preinv_divrem_1
  pre ("mpn_preinv_divrem_1");
  mpn_preinv_divrem_1 (wp, (mp_size_t) 0, xp, size,
                       GMP_NUMB_MAX, refmpn_invert_limb (GMP_NUMB_MAX), 0);
  post ();
#endif

#if HAVE_NATIVE_mpn_rsh1add_n
  pre ("mpn_rsh1add_n");
  mpn_rsh1add_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_rsh1sub_n
  pre ("mpn_rsh1sub_n");
  mpn_rsh1sub_n (wp, xp, yp, size);
  post ();
#endif

  pre ("mpn_rshift");
  mpn_rshift (wp, xp, size, 1);
  post ();

  pre ("mpn_sqr_basecase");
  mpn_sqr_basecase (wp, xp, (mp_size_t) 3);
  post ();

  pre ("mpn_submul_1");
  mpn_submul_1 (wp, xp, size, yp[0]);
  post ();

#if HAVE_NATIVE_mpn_submul_1c
  pre ("mpn_submul_1c");
  mpn_submul_1c (wp, xp, size, yp[0], CNST_LIMB(0));
  post ();
#endif

  pre ("mpn_sub_n");
  mpn_sub_n (wp, xp, yp, size);
  post ();

#if HAVE_NATIVE_mpn_sub_nc
  pre ("mpn_sub_nc");
  mpn_sub_nc (wp, xp, yp, size, CNST_LIMB(0));
  post ();
#endif

#if HAVE_NATIVE_mpn_sublsh1_n
  pre ("mpn_sublsh1_n");
  mpn_sublsh1_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_udiv_qrnnd
  pre ("mpn_udiv_qrnnd");
  mpn_udiv_qrnnd (&wp[0], CNST_LIMB(122), xp[0], CNST_LIMB(123));
  post ();
#endif

#if HAVE_NATIVE_mpn_udiv_qrnnd_r
  pre ("mpn_udiv_qrnnd_r");
  mpn_udiv_qrnnd_r (CNST_LIMB(122), xp[0], CNST_LIMB(123), &wp[0]);
  post ();
#endif

#if HAVE_NATIVE_mpn_umul_ppmm
  pre ("mpn_umul_ppmm");
  mpn_umul_ppmm (&wp[0], xp[0], yp[0]);
  post ();
#endif

#if HAVE_NATIVE_mpn_umul_ppmm_r
  pre ("mpn_umul_ppmm_r");
  mpn_umul_ppmm_r (&wp[0], xp[0], yp[0]);
  post ();
#endif

#if HAVE_NATIVE_mpn_xor_n
  pre ("mpn_xor_n");
  mpn_xor_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_xnor_n
  pre ("mpn_xnor_n");
  mpn_xnor_n (wp, xp, yp, size);
  post ();
#endif
}
void
mpn_toom4_sqr (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_ptr scratch)
{
  mp_size_t n, s;
  mp_limb_t cy;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)

  n = (an + 3) >> 2;
  s = an - 3 * n;

  ASSERT (0 < s && s <= n);

  /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrite the
   * following limb, so these must be computed in order, and we need a
   * one limb gap to tp. */
#define v0    pp                                /* 2n */
#define v1    (pp + 2 * n)                      /* 2n+1 */
#define vinf  (pp + 6 * n)                      /* 2s */
#define v2    scratch                           /* 2n+1 */
#define vm2   (scratch + 2 * n + 1)             /* 2n+1 */
#define vh    (scratch + 4 * n + 2)             /* 2n+1 */
#define vm1   (scratch + 6 * n + 3)             /* 2n+1 */
#define tp    (scratch + 8*n + 5)

  /* No overlap with v1 */
#define apx   pp                                /* n+1 */
#define amx   (pp + 4*n + 2)                    /* n+1 */

  /* Total scratch need: 8*n + 5 + scratch for recursive calls.  This
     gives roughly 32 n/3 + log term. */

  /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3. */
  mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp);

  TOOM4_SQR_REC (v2, apx, n + 1, tp);   /* v2, 2n+1 limbs */
  TOOM4_SQR_REC (vm2, amx, n + 1, tp);  /* vm2, 2n+1 limbs */

  /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3). */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (apx, a1, a0, n);
  cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
  if (s < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (apx, a3, apx, s);
      apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
      MPN_INCR_U (apx + s, n+1-s, cy2);
    }
  else
    apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
#else
  cy = mpn_lshift (apx, a0, n, 1);
  cy += mpn_add_n (apx, apx, a1, n);
  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
  cy += mpn_add_n (apx, apx, a2, n);
  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
  apx[n] = cy + mpn_add (apx, apx, n, a3, s);
#endif

  ASSERT (apx[n] < 15);

  TOOM4_SQR_REC (vh, apx, n + 1, tp);   /* vh, 2n+1 limbs */

  /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3. */
  mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp);

  TOOM4_SQR_REC (v1, apx, n + 1, tp);   /* v1, 2n+1 limbs */
  TOOM4_SQR_REC (vm1, amx, n + 1, tp);  /* vm1, 2n+1 limbs */

  TOOM4_SQR_REC (v0, a0, n, tp);
  TOOM4_SQR_REC (vinf, a3, s, tp);      /* vinf, 2s limbs */

  mpn_toom_interpolate_7pts (pp, n, 0, vm2, vm1, v2, vh, 2*s, tp);
}
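/* Illustrative only: the carry recurrence cy = 2*cy + mpn_addlsh1_n (...)
   above is Horner evaluation of 8*a0 + 4*a1 + 2*a2 + a3, one doubling per
   step, with the bits that overflow the n-limb window accumulating in cy.
   The same scheme on scalars; names are ad hoc. */
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint64_t a0 = 5, a1 = 7, a2 = 11, a3 = 13;
  uint64_t acc = 2*a0 + a1;     /* like mpn_addlsh1_n (apx, a1, a0, n) */
  acc = 2*acc + a2;             /* like cy = 2*cy + mpn_addlsh1_n (...) */
  acc = 2*acc + a3;
  assert (acc == 8*a0 + 4*a1 + 2*a2 + a3);
  return 0;
}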
void
mpn_toom44_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  mp_limb_t cy;
  enum toom7_flags flags;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)
#define b3  (bp + 3*n)

  ASSERT (an >= bn);

  n = (an + 3) >> 2;

  s = an - 3 * n;
  t = bn - 3 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);
  ASSERT (s >= t);

  /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrite the
   * following limb, so these must be computed in order, and we need a
   * one limb gap to tp. */
#define v0    pp                                /* 2n */
#define v1    (pp + 2 * n)                      /* 2n+1 */
#define vinf  (pp + 6 * n)                      /* s+t */
#define v2    scratch                           /* 2n+1 */
#define vm2   (scratch + 2 * n + 1)             /* 2n+1 */
#define vh    (scratch + 4 * n + 2)             /* 2n+1 */
#define vm1   (scratch + 6 * n + 3)             /* 2n+1 */
#define tp    (scratch + 8*n + 5)

  /* apx and bpx must not overlap with v1 */
#define apx   pp                                /* n+1 */
#define amx   (pp + n + 1)                      /* n+1 */
#define bmx   (pp + 2*n + 2)                    /* n+1 */
#define bpx   (pp + 4*n + 2)                    /* n+1 */

  /* Total scratch need: 8*n + 5 + scratch for recursive calls.  This
     gives roughly 32 n/3 + log term. */

  /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3. */
  flags = (enum toom7_flags) (toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp));

  /* Compute bpx = b0 + 2 b1 + 4 b2 + 8 b3 and bmx = b0 - 2 b1 + 4 b2 - 8 b3. */
  flags = (enum toom7_flags) (flags ^ toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (bpx, bmx, bp, n, t, tp));

  TOOM44_MUL_N_REC (v2, apx, bpx, n + 1, tp);   /* v2, 2n+1 limbs */
  TOOM44_MUL_N_REC (vm2, amx, bmx, n + 1, tp);  /* vm2, 2n+1 limbs */

  /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3). */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (apx, a1, a0, n);
  cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
  if (s < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (apx, a3, apx, s);
      apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
      MPN_INCR_U (apx + s, n+1-s, cy2);
    }
  else
    apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
#else
  cy = mpn_lshift (apx, a0, n, 1);
  cy += mpn_add_n (apx, apx, a1, n);
  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
  cy += mpn_add_n (apx, apx, a2, n);
  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
  apx[n] = cy + mpn_add (apx, apx, n, a3, s);
#endif

  /* Compute bpx = 8 b0 + 4 b1 + 2 b2 + b3 = (((2*b0 + b1) * 2 + b2) * 2 + b3). */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (bpx, b1, b0, n);
  cy = 2*cy + mpn_addlsh1_n (bpx, b2, bpx, n);
  if (t < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (bpx, b3, bpx, t);
      bpx[n] = 2*cy + mpn_lshift (bpx + t, bpx + t, n - t, 1);
      MPN_INCR_U (bpx + t, n+1-t, cy2);
    }
  else
    bpx[n] = 2*cy + mpn_addlsh1_n (bpx, b3, bpx, n);
#else
  cy = mpn_lshift (bpx, b0, n, 1);
  cy += mpn_add_n (bpx, bpx, b1, n);
  cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
  cy += mpn_add_n (bpx, bpx, b2, n);
  cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
  bpx[n] = cy + mpn_add (bpx, bpx, n, b3, t);
#endif

  ASSERT (apx[n] < 15);
  ASSERT (bpx[n] < 15);

  TOOM44_MUL_N_REC (vh, apx, bpx, n + 1, tp);   /* vh, 2n+1 limbs */

  /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3. */
  flags = (enum toom7_flags) (flags | toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp));

  /* Compute bpx = b0 + b1 + b2 + b3 and bmx = b0 - b1 + b2 - b3. */
  flags = (enum toom7_flags) (flags ^ toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (bpx, bmx, bp, n, t, tp));

  TOOM44_MUL_N_REC (vm1, amx, bmx, n + 1, tp);  /* vm1, 2n+1 limbs */
  /* Clobbers amx, bmx. */
  TOOM44_MUL_N_REC (v1, apx, bpx, n + 1, tp);   /* v1, 2n+1 limbs */

  TOOM44_MUL_N_REC (v0, a0, b0, n, tp);
  if (s > t)
    mpn_mul (vinf, a3, s, b3, t);
  else
    TOOM44_MUL_N_REC (vinf, a3, b3, s, tp);     /* vinf, s+t limbs */

  mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t, tp);
}
void
mpn_toom42_mul (mp_ptr c, mp_srcptr a, mp_size_t an, mp_srcptr b, mp_size_t bn, mp_ptr t)
{
  mp_size_t k, k1, kk1, r, r2, twok, threek, rr2, n1, n2;
  mp_limb_t cy, cc, saved, vinf0, c20, c21;
  mp_ptr trec;
  int sa, sb;
  mp_ptr c1, c2, c3, c4, t1, t2, t3, t4;

  ASSERT(GMP_NUMB_BITS >= 6);

  k = (an + 3) / 4;             /* ceil(an/4) */
  ASSERT(bn > k);
  ASSERT(bn <= 2*k);
  ASSERT(an >= 20);

  twok = 2 * k;
  threek = 3 * k;
  k1 = k + 1;
  kk1 = k + k1;
  r = an - threek;              /* last chunk */
  r2 = bn - k;                  /* last chunk */
  rr2 = r + r2;

  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  t1 = t + k;
  t2 = t1 + k;
  t3 = t2 + k;
  t4 = t3 + k;

  trec = t + 4 * k + 4;

  /* put a0+a2 in {t, k+1}, and b0+b1 in {t1 + 1, k+1};
     put a1+a3 in {t3 + 3, k+1}, put a0+a1+a2+a3 in {t2 + 2, k+1} */
  t[k] = mpn_add_n (t, a, a + twok, k);
  t4[3] = mpn_add_n (t3 + 3, a + k, a + threek, r);
  if (k > r)
    t4[3] = mpn_add_1 (t3 + r + 3, a + k + r, k - r, t4[3]);
  mpn_add_n (t2 + 2, t, t3 + 3, k1);
  t2[1] = mpn_add_n (t1 + 1, b, b + k, r2);
  if (k > r2)
    t2[1] = mpn_add_1 (t1 + 1 + r2, b + r2, k - r2, t2[1]);

  /* compute v1 := (a0+a1+a2+a3)*(b0+b1) in {c2, 2k+1};
     since v1 < 6*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 3 */
  TOOM3_MUL_REC (c2, t2 + 2, t1 + 1, k1, trec);
  ASSERT(c2[k+k] < 6);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
             v1
  */

  /* put |a0-a1+a2-a3| in {t2 + 2, k+1} and |b0-b1| in {t3 + 3, k+1} */
  /* sa = sign(a0-a1+a2-a3) */
  /* sb = sign(b0-b1) */
  sa = mpn_cmp (t, t3 + 3, k1);
  if (sa >= 0)
    mpn_sub_n (t2 + 2, t, t3 + 3, k1);
  else
    mpn_sub_n (t2 + 2, t3 + 3, t, k1);
  n1 = k;
  n2 = r2;
  MPN_NORMALIZE(b, n1);
  MPN_NORMALIZE(b + k, n2);
  if (n1 != n2)
    sb = (n1 > n2) ? 1 : -1;
  else
    sb = mpn_cmp (b, b + k, n2);
  if (sb >= 0)
    {
      t4[3] = mpn_sub_n (t3 + 3, b, b + k, r2);
      if (k > r2)
        t4[3] = -mpn_sub_1 (t3 + 3 + r2, b + r2, k - r2, t4[3]);
    }
  else
    {
      mpn_sub_n (t3 + 3, b + k, b, r2);
      MPN_ZERO(t3 + r2 + 3, k1 - r2);
    }
  sa *= sb;                     /* sign of vm1 */

  /* compute vm1 := (a0-a1+a2-a3)*(b0-b1) in {t, 2k+1};
     since |vm1| < 2*B^(2k), vm1 uses only 2k+1 limbs */
  TOOM3_MUL_REC (t, t2 + 2, t3 + 3, k1, trec);
  ASSERT(t[k+k] < 2);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
             v1
     {t, 2k+1} {t+2k+1, 2k + 1}
      vm1
  */

  c20 = c2[0];                  /* save c2[0] and c2[1] giving space 2k+2 at c */
  c21 = c2[1];

  /* compute a0+2a1+4a2+8a3 in {c, k+1} and b0+2b1 in {c1 + 1, k+1} */
#if HAVE_NATIVE_mpn_addlsh1_n
  c1[0] = mpn_addlsh1_n (c, a + twok, a + threek, r);
  if (r < k)
    {
      c1[0] = mpn_add_1 (c + r, a + twok + r, k - r, c1[0]);
    }
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a + k, c, k);
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
  c2[1] = mpn_addlsh1_n (c1 + 1, b, b + k, r2);
  if (r2 < k)
    {
      c2[1] = mpn_add_1 (c1 + 1 + r2, b + r2, k - r2, c2[1]);
    }
#else
  c[r] = mpn_lshift1 (c, a + threek, r);
  if (r < k)
    {
      MPN_ZERO(c + r + 1, k - r);
    }
  c1[0] += mpn_add_n (c, c, a + twok, k);
  mpn_double (c, k1);
  c1[0] += mpn_add_n (c, c, a + k, k);
  mpn_double (c, k1);
  c1[0] += mpn_add_n (c, c, a, k);
  c1[r2 + 1] = mpn_lshift1 (c1 + 1, b + k, r2);
  if (r2 < k)
    {
      MPN_ZERO(c1 + r2 + 2, k - r2);
    }
  c2[1] += mpn_add_n (c1 + 1, c1 + 1, b, k);
#endif

#define v2 (t + 2*k + 1)
  /* compute v2 := (a0+2a1+4a2+8a3)*(b0+2b1) in {t+2k+1, 2k+1}
     v2 < 45*B^k so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */
  TOOM3_MUL_REC (v2, c, c1 + 1, k1, trec);
  ASSERT(v2[k+k] < 45);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
             v1
     {t, 2k+1} {t+2k+1, 2k + 1}
      vm1       v2
  */

  c2[0] = c20;                  /* restore c2[0] and c2[1] */
  c2[1] = c21;

  /* compute v0 := a0*b0 in {c, 2k} */
  TOOM3_MUL_REC (c, a, b, k, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
     v0      v1
     {t, 2k+1} {t+2k+1, 2k + 1}
      vm1       v2
  */

#define vinf (c + 4*k)
  /* compute vinf := a3*b1 in {c4, r + r2} */
  saved = c4[0];
  if (r == r2)
    TOOM3_MUL_REC (c4, a + threek, b + k, r, trec);
  else if (r > r2)
    mpn_mul (c4, a + threek, r, b + k, r2);
  else
    mpn_mul (c4, b + k, r2, a + threek, r);
  vinf0 = c4[0];
  c4[0] = saved;

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
     v0      v1          {-}vinf
     {t, 2k+1} {t+2k+1, 2k + 1}
      vm1       v2
     vinf0 = {-}
  */

  mpn_toom3_interpolate (c, c2, v2, t, vinf, k, rr2, sa, vinf0, t4 + 2);

#undef v2
#undef vinf
}
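/* Illustrative only: like sa/sb/vm1_neg above, the mpn layer works on
   absolute values and tracks signs in side flags, because limb vectors are
   unsigned; the sign of a product of two signed evaluations is the XOR of
   the operands' signs.  Scalar sketch with ad-hoc names, not library code. */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

int
main (void)
{
  int64_t am1 = 12 - 57;                /* a evaluated at -1, may be < 0 */
  int64_t bm1 = 90 - 34;                /* b evaluated at -1 */
  uint64_t aabs = (uint64_t) llabs (am1);
  uint64_t babs = (uint64_t) llabs (bm1);
  int neg = (am1 < 0) ^ (bm1 < 0);      /* the sa *= sb / flag-XOR step */
  int64_t vm1 = neg ? -(int64_t) (aabs * babs) : (int64_t) (aabs * babs);
  assert (vm1 == am1 * bm1);
  return 0;
}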
void
mpn_toom3_mul (mp_ptr c, mp_srcptr a, mp_size_t an, mp_srcptr b, mp_size_t bn, mp_ptr t)
{
  mp_size_t k, k1, kk1, r, r2, twok, rr2;
  mp_limb_t cy, cc, saved, vinf0, c20, c21;
  mp_ptr trec;
  int sa, sb;
  mp_ptr c1, c2, c3, c4, t1, t2, t3, t4;

  ASSERT(GMP_NUMB_BITS >= 6);

  k = (an + 2) / 3;             /* ceil(an/3) */
  ASSERT(bn > 2*k);
  ASSERT(an >= 20);

  twok = 2 * k;
  k1 = k + 1;
  kk1 = k + k1;
  r = an - twok;                /* last chunk */
  r2 = bn - twok;               /* last chunk */
  rr2 = r + r2;

  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  t1 = t + k;
  t2 = t1 + k;
  t3 = t2 + k;
  t4 = t3 + k;

  trec = t + 4 * k + 4;

  /* put a0+a2 in {t, k+1}, and b0+b2 in {t1 + 1, k+1};
     put a0+a1+a2 in {t2 + 2, k+1} and b0+b1+b2 in {t3 + 3, k+1} */
  cy = mpn_add_n (t, a, a + twok, r);
  cc = mpn_add_n (t1 + 1, b, b + twok, r2);
  if (r < k)
    {
      __GMPN_ADD_1 (cy, t + r, a + r, k - r, cy);
    }
  if (r2 < k)
    {
      __GMPN_ADD_1 (cc, t1 + 1 + r2, b + r2, k - r2, cc);
    }
  t3[2] = (t1[0] = cy) + mpn_add_n (t2 + 2, t, a + k, k);
  t4[3] = (t2[1] = cc) + mpn_add_n (t3 + 3, t1 + 1, b + k, k);

  /* compute v1 := (a0+a1+a2)*(b0+b1+b2) in {c2, 2k+1};
     since v1 < 9*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 4 */
  TOOM3_MUL_REC (c2, t2 + 2, t3 + 3, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
             v1
  */

  /* put |a0-a1+a2| in {t2 + 2, k+1} and |b0-b1+b2| in {t3 + 3, k+1} */
  /* sa = sign(a0-a1+a2) */
  /* sb = sign(b0-b1+b2) */
  sa = (t[k] != 0) ? 1 : mpn_cmp (t, a + k, k);
  t3[2] = (sa >= 0) ? t[k] - mpn_sub_n (t2 + 2, t, a + k, k)
                    : mpn_sub_n (t2 + 2, a + k, t, k);
  /* b0+b2 is in {t1 + 1, k+1} now */
  sb = (t2[1] != 0) ? 1 : mpn_cmp (t1 + 1, b + k, k);
  t4[3] = (sb >= 0) ? t2[1] - mpn_sub_n (t3 + 3, t1 + 1, b + k, k)
                    : mpn_sub_n (t3 + 3, b + k, t1 + 1, k);
  sa *= sb;                     /* sign of vm1 */

  /* compute vm1 := (a0-a1+a2)*(b0-b1+b2) in {t, 2k+1};
     since |vm1| < 4*B^(2k), vm1 uses only 2k+1 limbs */
  TOOM3_MUL_REC (t, t2 + 2, t3 + 3, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
             v1
     {t, 2k+1} {t+2k+1, 2k + 1}
      vm1
  */

  c20 = c2[0];                  /* save c2[0] and c2[1] giving space 2k+2 at c */
  c21 = c2[1];

  /* compute a0+2a1+4a2 in {c, k+1} and b0+2b1+4b2 in {c1 + 1, k+1} */
#if HAVE_NATIVE_mpn_addlsh1_n
  c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
  c2[1] = mpn_addlsh1_n (c1 + 1, b + k, b + twok, r2);
  if (r < k)
    {
      __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
    }
  if (r2 < k)
    {
      __GMPN_ADD_1 (c2[1], c1 + 1 + r2, b + k + r2, k - r2, c2[1]);
    }
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
  c2[1] = 2 * c2[1] + mpn_addlsh1_n (c1 + 1, b, c1 + 1, k);
#else
  c[r] = mpn_lshift1 (c, a + twok, r);
  c1[r2 + 1] = mpn_lshift1 (c1 + 1, b + twok, r2);
  if (r < k)
    {
      MPN_ZERO(c + r + 1, k - r);
    }
  if (r2 < k)
    {
      MPN_ZERO(c1 + r2 + 2, k - r2);
    }
  c1[0] += mpn_add_n (c, c, a + k, k);
  c2[1] += mpn_add_n (c1 + 1, c1 + 1, b + k, k);
  mpn_double (c, k1);
  mpn_double (c1 + 1, k1);
  c1[0] += mpn_add_n (c, c, a, k);
  c2[1] += mpn_add_n (c1 + 1, c1 + 1, b, k);
#endif

#define v2 (t + 2*k + 1)
  /* compute v2 := (a0+2a1+4a2)*(b0+2b1+4b2) in {t+2k+1, 2k+1}
     v2 < 49*B^k so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */
  TOOM3_MUL_REC (v2, c, c1 + 1, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
             v1
     {t, 2k+1} {t+2k+1, 2k + 1}
      vm1       v2
  */

  c2[0] = c20;                  /* restore c2[0] and c2[1] */
  c2[1] = c21;

  /* compute v0 := a0*b0 in {c, 2k} */
  TOOM3_MUL_REC (c, a, b, k, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
     v0      v1
     {t, 2k+1} {t+2k+1, 2k + 1}
      vm1       v2
  */

#define vinf (c + 4*k)
  /* compute vinf := a2*b2 in {c4, r + r2} */
  saved = c4[0];
  if (r == r2)
    TOOM3_MUL_REC (c4, a + twok, b + twok, r, trec);
  else if (r > r2)
    mpn_mul (c4, a + twok, r, b + twok, r2);
  else
    mpn_mul (c4, b + twok, r2, a + twok, r);
  vinf0 = c4[0];
  c4[0] = saved;

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
     v0      v1          {-}vinf
     {t, 2k+1} {t+2k+1, 2k + 1}
      vm1       v2
     vinf0 = {-}
  */

  mpn_toom3_interpolate (c, c2, v2, t, vinf, k, rr2, sa, vinf0, t4 + 2);

#undef v2
#undef vinf
}
/* The necessary temporary space T(n) satisfies T(n)=0 for n < THRESHOLD,
   and T(n) <= max(2n+2, 6k+3, 4k+3+T(k+1)) otherwise, where k = ceil(n/3).

   Assuming T(n) >= 2n, 6k+3 <= 4k+3+T(k+1).
   Similarly, 2n+2 <= 6k+2 <= 4k+3+T(k+1).

   With T(n) = 2n+S(n), this simplifies to S(n) <= 9 + S(k+1).
   Since THRESHOLD >= 17, we have n/(k+1) >= 19/8,
   thus S(n) <= S(n/(19/8)) + 9, thus S(n) <= 9*log(n)/log(19/8) <= 8*log2(n).

   We need in addition 2*r for mpn_sublsh1_n, so the total is at most
   8/3*n + 8*log2(n). */
void
mpn_toom3_mul_n (mp_ptr c, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr t)
{
  mp_size_t k, k1, kk1, r, twok, rr2;
  mp_limb_t cy, cc, saved, vinf0;
  mp_ptr trec;
  int sa, sb;
  mp_ptr c1, c2, c3, c4, c5, t1, t2, t3, t4;

  ASSERT(GMP_NUMB_BITS >= 6);
  ASSERT(n >= 17);              /* so that r <> 0 and 5k+3 <= 2n */

  k = (n + 2) / 3;              /* ceil(n/3) */
  twok = 2 * k;
  k1 = k + 1;
  kk1 = k + k1;
  r = n - twok;                 /* last chunk */
  rr2 = 2*r;

  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  c5 = c4 + k;
  t1 = t + k;
  t2 = t1 + k;
  t3 = t2 + k;
  t4 = t3 + k;

  trec = t + 4 * k + 4;

  /* put a0+a2 in {c, k+1}, and b0+b2 in {c4 + 2, k+1};
     put a0+a1+a2 in {t2 + 1, k+1} and b0+b1+b2 in {t3 + 2, k+1} */
  c1[0] = mpn_add_n (c, a, a + twok, r);
  c5[2] = mpn_add_n (c4 + 2, b, b + twok, r);
  if (r < k)
    {
      c1[0] = mpn_add_1 (c + r, a + r, k - r, c1[0]);
      c5[2] = mpn_add_1 (c4 + 2 + r, b + r, k - r, c5[2]);
    }
  t3[1] = c1[0] + mpn_add_n (t2 + 1, c, a + k, k);
  t4[2] = c5[2] + mpn_add_n (t3 + 2, c4 + 2, b + k, k);
  ASSERT(c1[0] < 2);
  ASSERT(c5[2] < 2);
  ASSERT(t3[1] < 3);
  ASSERT(t4[2] < 3);

  /* compute v1 := (a0+a1+a2)*(b0+b1+b2) in {c2, 2k+1};
     since v1 < 9*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 4 */
  TOOM3_MUL_REC (c2, t2 + 1, t3 + 2, k1, trec);
  ASSERT(c2[k+k] < 9);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
             v1
  */

  /* put |a0-a1+a2| in {c, k+1} and |b0-b1+b2| in {c4 + 2, k+1} */
  /* sa = sign(a0-a1+a2) */
  /* sb = sign(b0-b1+b2) */
  sa = (c[k] != 0) ? 1 : mpn_cmp (c, a + k, k);
  c[k] = (sa >= 0) ? c[k] - mpn_sub_n (c, c, a + k, k)
                   : mpn_sub_n (c, a + k, c, k);
  /* b0+b2 is in {c4+2, k+1} now */
  sb = (c5[2] != 0) ? 1 : mpn_cmp (c4 + 2, b + k, k);
  c5[2] = (sb >= 0) ? c5[2] - mpn_sub_n (c4 + 2, c4 + 2, b + k, k)
                    : mpn_sub_n (c4 + 2, b + k, c4 + 2, k);
  ASSERT(c[k] < 2);
  ASSERT(c5[2] < 2);
  sa *= sb;                     /* sign of vm1 */

  /* compute vm1 := (a0-a1+a2)*(b0-b1+b2) in {t, 2k+1};
     since |vm1| < 4*B^(2k), vm1 uses only 2k+1 limbs */
  TOOM3_MUL_REC (t, c, c4 + 2, k1, trec);
  ASSERT(t[k+k] < 4);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
             v1
     {t, 2k+1} {t+2k+1, 2k + 1}
      vm1
  */

  /* compute a0+2a1+4a2 in {c, k+1} and b0+2b1+4b2 in {c4 + 2, k+1} */
#if HAVE_NATIVE_mpn_addlsh1_n
  c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
  c5[2] = mpn_addlsh1_n (c4 + 2, b + k, b + twok, r);
  if (r < k)
    {
      c1[0] = mpn_add_1 (c + r, a + k + r, k - r, c1[0]);
      c5[2] = mpn_add_1 (c4 + 2 + r, b + k + r, k - r, c5[2]);
    }
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
  c5[2] = 2 * c5[2] + mpn_addlsh1_n (c4 + 2, b, c4 + 2, k);
#else
  c[r] = mpn_lshift1 (c, a + twok, r);
  c4[r + 2] = mpn_lshift1 (c4 + 2, b + twok, r);
  if (r < k)
    {
      MPN_ZERO(c + r + 1, k - r);
      MPN_ZERO(c4 + r + 3, k - r);
    }
  c1[0] += mpn_add_n (c, c, a + k, k);
  c5[2] += mpn_add_n (c4 + 2, c4 + 2, b + k, k);
  mpn_double (c, k1);
  mpn_double (c4 + 2, k1);
  c1[0] += mpn_add_n (c, c, a, k);
  c5[2] += mpn_add_n (c4 + 2, c4 + 2, b, k);
#endif
  ASSERT(c[k] < 7);
  ASSERT(c5[2] < 7);

#define v2 (t + 2*k + 1)
  /* compute v2 := (a0+2a1+4a2)*(b0+2b1+4b2) in {t+2k+1, 2k+1}
     v2 < 49*B^k so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */
  TOOM3_MUL_REC (v2, c, c4 + 2, k1, trec);
  ASSERT(v2[k+k] < 49);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
             v1
     {t, 2k+1} {t+2k+1, 2k + 1}
      vm1       v2
  */

  /* compute v0 := a0*b0 in {c, 2k} */
  TOOM3_MUL_REC (c, a, b, k, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
     v0      v1
     {t, 2k+1} {t+2k+1, 2k + 1}
      vm1       v2
  */

#define vinf (c + 4*k)
  /* compute vinf := a2*b2 in {c4, 2r} */
  saved = c4[0];
  TOOM3_MUL_REC (c4, a + twok, b + twok, r, trec);
  vinf0 = c4[0];
  c4[0] = saved;

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
     v0      v1          {-}vinf
     {t, 2k+1} {t+2k+1, 2k + 1}
      vm1       v2
     vinf0 = {-}
  */

  mpn_toom3_interpolate (c, c2, v2, t, vinf, k, rr2, sa, vinf0, t4 + 2);

#undef v2
#undef vinf
}
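/* Illustrative only: a numeric check of the scratch-space recurrence in
   the comment before this function, with 17 standing in for the real
   THRESHOLD.  T(n) = max(2n+2, 6k+3, 4k+3+T(k+1)) with k = ceil(n/3), and
   the claimed closed form is T(n) <= 2n + 8*log2(n) (plus the extra
   2r <= 2n/3 for mpn_sublsh1_n noted above).  Link with -lm. */
#include <assert.h>
#include <math.h>

static long
T (long n)
{
  long k, a, b, c, m;
  if (n < 17)
    return 0;
  k = (n + 2) / 3;
  a = 2 * n + 2;
  b = 6 * k + 3;
  c = 4 * k + 3 + T (k + 1);
  m = a > b ? a : b;
  return m > c ? m : c;
}

int
main (void)
{
  long n;
  for (n = 17; n < 1000000; n = 2 * n + 1)
    assert (T (n) <= 2 * n + (long) (8 * log2 ((double) n)) + 8);
  return 0;
}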
void
mpn_toom3_sqr_n (mp_ptr c, mp_srcptr a, mp_size_t n, mp_ptr t)
{
  mp_size_t k, k1, kk1, r, twok, rr2;
  mp_limb_t cy, cc, saved, vinf0;
  mp_ptr trec;
  int sa;
  mp_ptr c1, c2, c3, c4, c5, t1, t2, t3, t4;

  ASSERT(GMP_NUMB_BITS >= 6);
  ASSERT(n >= 17);              /* so that r <> 0 and 5k+3 <= 2n */

  k = (n + 2) / 3;              /* ceil(n/3) */
  twok = 2 * k;
  k1 = k + 1;
  kk1 = k + k1;
  r = n - twok;                 /* last chunk */
  rr2 = 2*r;

  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  c5 = c4 + k;
  t1 = t + k;
  t2 = t1 + k;
  t3 = t2 + k;
  t4 = t3 + k;

  trec = t + 4 * k + 3;

  /* put a0+a2 in {c, k+1}; put a0+a1+a2 in {t2 + 1, k+1} */
  cy = mpn_add_n (c, a, a + twok, r);
  if (r < k)
    {
      __GMPN_ADD_1 (cy, c + r, a + r, k - r, cy);
    }
  t3[1] = (c1[0] = cy) + mpn_add_n (t2 + 1, c, a + k, k);

  /* compute v1 := (a0+a1+a2)^2 in {c2, 2k+1};
     since v1 < 9*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 4 */
  TOOM3_SQR_REC (c2, t2 + 1, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
             v1
  */

  /* put |a0-a1+a2| in {c, k+1} */
  sa = (c[k] != 0) ? 1 : mpn_cmp (c, a + k, k);
  c[k] = (sa >= 0) ? c[k] - mpn_sub_n (c, c, a + k, k)
                   : mpn_sub_n (c, a + k, c, k);

  /* compute vm1 := (a0-a1+a2)^2 in {t, 2k+1};
     since |vm1| < 4*B^(2k), vm1 uses only 2k+1 limbs */
  TOOM3_SQR_REC (t, c, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
             v1
     {t, 2k+1} {t+2k+1, 2k + 1}
      vm1
  */

  /* compute a0+2a1+4a2 in {c, k+1} */
#if HAVE_NATIVE_mpn_addlsh1_n
  c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
  if (r < k)
    {
      __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
    }
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
#else
  c[r] = mpn_lshift1 (c, a + twok, r);
  if (r < k)
    {
      MPN_ZERO(c + r + 1, k - r);
    }
  c1[0] += mpn_add_n (c, c, a + k, k);
  mpn_double (c, k1);
  c1[0] += mpn_add_n (c, c, a, k);
#endif

#define v2 (t + 2*k + 1)
  /* compute v2 := (a0+2a1+4a2)^2 in {t+2k+1, 2k+1}
     v2 < 49*B^k so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */
  TOOM3_SQR_REC (v2, c, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
             v1
     {t, 2k+1} {t+2k+1, 2k + 1}
      vm1       v2
  */

  /* compute v0 := a0^2 in {c, 2k} */
  TOOM3_SQR_REC (c, a, k, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
     v0      v1
     {t, 2k+1} {t+2k+1, 2k + 1}
      vm1       v2
  */

#define vinf (c + 4*k)
  /* compute vinf := a2^2 in {c4, 2r} */
  saved = c4[0];
  TOOM3_SQR_REC (c4, a + twok, r, trec);
  vinf0 = c4[0];
  c4[0] = saved;

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
     v0      v1          {-}vinf
     {t, 2k+1} {t+2k+1, 2k + 1}
      vm1       v2
     vinf0 = {-}
  */

  mpn_toom3_interpolate (c, c2, v2, t, vinf, k, rr2, 1, vinf0, t4 + 2);

#undef v2
#undef vinf
}