/* Needs n+1 limbs of temporary storage. */
/* Evaluate a degree-3 polynomial at the points +2 and -2.
   The coefficients x0..x3 are stored contiguously at xp, each n limbs
   wide except the top coefficient x3, which is x3n limbs (0 < x3n <= n).

     xp2 <- x0 + 2*x1 + 4*x2 + 8*x3          (value at +2, n+1 limbs)
     xm2 <- |x0 - 2*x1 + 4*x2 - 8*x3|        (absolute value at -2)

   Returns ~0 if the value at -2 is negative, 0 otherwise, so the caller
   can track the sign separately from the magnitude in xm2.  */
int
mpn_toom_eval_dgr3_pm2 (mp_ptr xp2, mp_ptr xm2, mp_srcptr xp,
                        mp_size_t n, mp_size_t x3n, mp_ptr tp)
{
  mp_limb_t cy;
  int neg;

  ASSERT (x3n > 0);
  ASSERT (x3n <= n);

  /* (x0 + 4 * x2) +/- (2 x1 + 8 x_3):
     xp2 accumulates the even part x0 + 4*x2, tp the odd part x1 + 4*x3
     (doubled to 2*x1 + 8*x3 by the mpn_lshift below).  */
#if HAVE_NATIVE_mpn_addlsh_n || HAVE_NATIVE_mpn_addlsh2_n
#if HAVE_NATIVE_mpn_addlsh2_n
  xp2[n] = mpn_addlsh2_n (xp2, xp, xp + 2*n, n);
  cy = mpn_addlsh2_n (tp, xp + n, xp + 3*n, x3n);
#else /* HAVE_NATIVE_mpn_addlsh_n */
  xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2);
  cy = mpn_addlsh_n (tp, xp + n, xp + 3*n, x3n, 2);
#endif
  /* x3 is short; propagate the carry across the rest of x1.  */
  if (x3n < n)
    cy = mpn_add_1 (tp + x3n, xp + n + x3n, n - x3n, cy);
  tp[n] = cy;
#else
  /* Generic fallback: explicit shift-then-add.  */
  cy = mpn_lshift (tp, xp + 2*n, n, 2);             /* 4*x2 */
  xp2[n] = cy + mpn_add_n (xp2, tp, xp, n);         /* x0 + 4*x2 */
  tp[x3n] = mpn_lshift (tp, xp + 3*n, x3n, 2);      /* 4*x3 */
  if (x3n < n)
    tp[n] = mpn_add (tp, xp + n, n, tp, x3n + 1);   /* x1 + 4*x3 */
  else
    tp[n] += mpn_add_n (tp, xp + n, tp, n);
#endif
  mpn_lshift (tp, tp, n+1, 1);                      /* 2*x1 + 8*x3 */

  /* Sign of the value at -2: negative iff odd part exceeds even part.  */
  neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? ~0 : 0;

#if HAVE_NATIVE_mpn_sumdiff_n
  /* Fused sum/difference; operand order chosen so the difference is
     non-negative.  */
  if (neg)
    mpn_sumdiff_n (xp2, xm2, tp, xp2, n + 1);
  else
    mpn_sumdiff_n (xp2, xm2, xp2, tp, n + 1);
#else
  if (neg)
    mpn_sub_n (xm2, tp, xp2, n + 1);
  else
    mpn_sub_n (xm2, xp2, tp, n + 1);
  mpn_add_n (xp2, xp2, tp, n + 1);
#endif

  /* Top limbs are bounded by the coefficient weights:
     1+2+4+8 = 15 at +2, and at most 4+8-2 = 10 in magnitude at -2.  */
  ASSERT (xp2[n] < 15);
  ASSERT (xm2[n] < 10);

  return neg;
}
/* Multiply {ap,an} by {bp,bn} with an unbalanced Toom scheme splitting
   a into 5 pieces and b into 3 pieces (hence "53").
   NOTE(review): this chunk is truncated mid-function (it continues past
   the visible source); only comments are added here.  */
void
mpn_toom53_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg, vmh_neg;        /* sign trackers for the -1 and -1/2 points */
  mp_limb_t cy;
  mp_ptr gp, hp;               /* even/odd part accumulators (in pp) */
  mp_ptr as1, asm1, as2, ash, asmh;   /* a evaluated at 1, -1, 2, 1/2, -1/2 */
  mp_ptr bs1, bsm1, bs2, bsh, bsmh;   /* b evaluated at the same points */
  enum toom4_flags flags;      /* presumably consumed later in the
                                  truncated remainder — TODO confirm */
  TMP_DECL;

  /* Coefficient views: a = a0 + a1*X + ... + a4*X^4, b = b0 + b1*X + b2*X^2,
     with X = 2^(n*GMP_NUMB_BITS).  a4 has s limbs, b2 has t limbs.  */
#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  /* Block size n = max(ceil(an/5), ceil(bn/3)).  */
  n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3);

  s = an - 4 * n;
  t = bn - 2 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  as1  = TMP_SALLOC_LIMBS (n + 1);
  asm1 = TMP_SALLOC_LIMBS (n + 1);
  as2  = TMP_SALLOC_LIMBS (n + 1);
  ash  = TMP_SALLOC_LIMBS (n + 1);
  asmh = TMP_SALLOC_LIMBS (n + 1);
  bs1  = TMP_SALLOC_LIMBS (n + 1);
  bsm1 = TMP_SALLOC_LIMBS (n + 1);
  bs2  = TMP_SALLOC_LIMBS (n + 1);
  bsh  = TMP_SALLOC_LIMBS (n + 1);
  bsmh = TMP_SALLOC_LIMBS (n + 1);

  /* Scratch inside the product area is still free at this point.  */
  gp = pp;
  hp = pp + n + 1;

  /* Compute as1 and asm1:
     gp = a0 + a2 + a4 (even part), hp = a1 + a3 (odd part),
     then as1 = gp + hp = a(1) and asm1 = |gp - hp| = |a(-1)|.  */
  gp[n]  = mpn_add_n (gp, a0, a2, n);
  gp[n] += mpn_add (gp, gp, n, a4, s);
  hp[n]  = mpn_add_n (hp, a1, a3, n);
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_addsub_n (as1, asm1, hp, gp, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_addsub_n (as1, asm1, gp, hp, n + 1);
      vm1_neg = 0;
    }
#else
  mpn_add_n (as1, gp, hp, n + 1);
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_sub_n (asm1, hp, gp, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_sub_n (asm1, gp, hp, n + 1);
      vm1_neg = 0;
    }
#endif

  /* Compute as2.
     as2 = a(2) = a0 + 2*a1 + 4*a2 + 8*a3 + 16*a4, built by Horner-style
     double-and-add.  */
#if !HAVE_NATIVE_mpn_addlsh_n
  ash[n] = mpn_lshift (ash, a2, n, 2);          /* 4a2 */
#endif
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (as2, a3, a4, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a2, as2, n);
  cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
  as2[n] = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  /* Fallback builds a0 + 2*a1 + 8*a3 + 16*a4 and folds in the
     precomputed 4a2 (in ash) at the end.  */
  cy = mpn_lshift (as2, a4, s, 1);
  cy += mpn_add_n (as2, a3, as2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy);
  cy = 4 * cy + mpn_lshift (as2, as2, n, 2);
  cy += mpn_add_n (as2, a1, as2, n);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  as2[n] = cy + mpn_add_n (as2, a0, as2, n);
  mpn_add_n (as2, ash, as2, n + 1);
#endif

  /* Compute ash and asmh:
     gp = 16a0 + 4a2 + a4, hp = 8a1 + 2a3, so that
     ash = gp + hp = 16*a(1/2) and asmh = |gp - hp| = 16*|a(-1/2)|.  */
#if HAVE_NATIVE_mpn_addlsh_n
  cy = mpn_addlsh_n (gp, a2, a0, n, 2);          /* 4a0 + a2 */
  cy = 4 * cy + mpn_addlsh_n (gp, a4, gp, n, 2); /* 16a0 + 4a2 + a4 */ /* FIXME s */
  gp[n] = cy;
  cy = mpn_addlsh_n (hp, a3, a1, n, 2);          /* 4a1 + a3 */
  cy = 2 * cy + mpn_lshift (hp, hp, n, 1);       /* 8a1 + 2a3 */
  hp[n] = cy;
#else
  gp[n] = mpn_lshift (gp, a0, n, 4);             /* 16a0 */
  mpn_add (gp, gp, n + 1, a4, s);                /* 16a0 + a4 */
  mpn_add_n (gp, ash, gp, n+1);                  /* 16a0 + 4a2 + a4 */
  cy = mpn_lshift (hp, a1, n, 3);                /* 8a1 */
  cy += mpn_lshift (ash, a3, n, 1);              /* 2a3 */
  cy += mpn_add_n (hp, ash, hp, n);              /* 8a1 + 2a3 */
  hp[n] = cy;
#endif
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_addsub_n (ash, asmh, hp, gp, n + 1);
      vmh_neg = 1;
    }
  else
    {
      mpn_addsub_n (ash, asmh, gp, hp, n + 1);
      vmh_neg = 0;
    }
#else
  mpn_add_n (ash, gp, hp, n + 1);
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_sub_n (asmh, hp, gp, n + 1);
      vmh_neg = 1;
    }
  else
    {
      mpn_sub_n (asmh, gp, hp, n + 1);
      vmh_neg = 0;
    }
#endif

  /* Compute bs1 and bsm1.  */
  bs1[n] = mpn_add (bs1, b0, n, b2, t);          /* b0 + b2 */
#if HAVE_NATIVE_mpn_addsub_n
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      bs1[n] = mpn_addsub_n (bs1, bsm1, b1, bs1, n) >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
/* Evaluates a polynomial of degree k > 2, in the points +2^shift and -2^shift. */
/* Coefficients are stored contiguously at xp, each n limbs wide except the
   last one, which is hn limbs (0 < hn <= n) and starts at xp + k*n.
   On return:
     xp2 <- sum_i x_i * 2^(i*shift)                 (value at +2^shift)
     xm2 <- |sum_i x_i * (-2^shift)^i|              (|value at -2^shift|)
   Returns ~0 if the value at -2^shift is negative, 0 otherwise.
   tp is n+1 limbs of scratch.
   NOTE(review): other eval routines in this file test the HAVE_NATIVE_*
   macros with `#if`, this one uses `#ifdef` — if the build system can
   define these macros to 0, the two forms select differently; confirm
   the config convention.  */
int
mpn_toom_eval_pm2exp (mp_ptr xp2, mp_ptr xm2, unsigned k,
                      mp_srcptr xp, mp_size_t n, mp_size_t hn, unsigned shift,
                      mp_ptr tp)
{
  unsigned i;
  int neg;
#ifdef HAVE_NATIVE_mpn_addlsh_n
  mp_limb_t cy;
#endif

  ASSERT (k >= 3);
  ASSERT (shift*k < GMP_NUMB_BITS);   /* all shift counts stay in one limb */

  ASSERT (hn > 0);
  ASSERT (hn <= n);

  /* The degree k is also the number of full-size coefficients, so
   * that last coefficient, of size hn, starts at xp + k*n. */

  /* xp2 accumulates the even-index terms, tp the odd-index terms; the
     short top coefficient x_k is folded into whichever parity k has.  */
#ifdef HAVE_NATIVE_mpn_addlsh_n
  xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2*shift);
  for (i = 4; i < k; i += 2)
    xp2[n] += mpn_addlsh_n (xp2, xp2, xp + i*n, n, i*shift);

  tp[n] = mpn_lshift (tp, xp+n, n, shift);
  for (i = 3; i < k; i+= 2)
    tp[n] += mpn_addlsh_n (tp, tp, xp+i*n, n, i*shift);

  if (k & 1)
    {
      cy = mpn_addlsh_n (tp, tp, xp+k*n, hn, k*shift);
      MPN_INCR_U (tp + hn, n+1 - hn, cy);
    }
  else
    {
      cy = mpn_addlsh_n (xp2, xp2, xp+k*n, hn, k*shift);
      MPN_INCR_U (xp2 + hn, n+1 - hn, cy);
    }
#else /* !HAVE_NATIVE_mpn_addlsh_n */
  xp2[n] = mpn_lshift (tp, xp+2*n, n, 2*shift);
  xp2[n] += mpn_add_n (xp2, xp, tp, n);
  for (i = 4; i < k; i += 2)
    {
      xp2[n] += mpn_lshift (tp, xp + ((mp_size_t) i)*n, n, i*shift);
      xp2[n] += mpn_add_n (xp2, xp2, tp, n);
    }

  tp[n] = mpn_lshift (tp, xp+n, n, shift);
  for (i = 3; i < k; i+= 2)
    {
      /* xm2 is free at this point and serves as extra scratch.  */
      tp[n] += mpn_lshift (xm2, xp + ((mp_size_t) i)*n, n, i*shift);
      tp[n] += mpn_add_n (tp, tp, xm2, n);
    }

  xm2[hn] = mpn_lshift (xm2, xp + ((mp_size_t) k)*n, hn, k*shift);
  if (k & 1)
    mpn_add (tp, tp, n+1, xm2, hn+1);
  else
    mpn_add (xp2, xp2, n+1, xm2, hn+1);
#endif /* !HAVE_NATIVE_mpn_addlsh_n */

  /* Negative at -2^shift iff the odd part exceeds the even part.  */
  neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? ~0 : 0;

#ifdef HAVE_NATIVE_mpn_add_n_sub_n
  /* Fused sum/difference; operand order keeps the difference positive.  */
  if (neg)
    mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1);
  else
    mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1);
#else /* !HAVE_NATIVE_mpn_add_n_sub_n */
  if (neg)
    mpn_sub_n (xm2, tp, xp2, n + 1);
  else
    mpn_sub_n (xm2, xp2, tp, n + 1);

  mpn_add_n (xp2, xp2, tp, n + 1);
#endif /* !HAVE_NATIVE_mpn_add_n_sub_n */

  /* FIXME: the following asserts are useless if (k+1)*shift >= GMP_LIMB_BITS */
  ASSERT ((k+1)*shift >= GMP_LIMB_BITS ||
	  xp2[n] < ((CNST_LIMB(1)<<((k+1)*shift))-1)/((CNST_LIMB(1)<<shift)-1));
  ASSERT ((k+2)*shift >= GMP_LIMB_BITS ||
	  xm2[n] < ((CNST_LIMB(1)<<((k+2)*shift))-((k&1)?(CNST_LIMB(1)<<shift):1))/((CNST_LIMB(1)<<(2*shift))-1));

  return neg;
}
/* Balanced Toom-4 multiplication: rp[0..2n-1] <- {up,n} * {vp,n}.
   The operands are split into 4 pieces of sn limbs (the last piece h1
   limbs) and the product is recovered from 7 point evaluations:
   1, -1, 1/2, -1/2, 2, 0 and infinity (see the / * 1 * / etc. markers
   below).  r1..r7, MUL_TC4, MUL_TC4_UNSIGNED, TC4_DENORM and
   mpn_toom4_interpolate are project macros/functions defined elsewhere
   in this file/project.  */
void
mpn_toom4_mul_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
{
  mp_size_t ind;   /* NOTE(review): appears unused in this function */
  mp_limb_t cy, cy2, r30, r31;
  mp_ptr tp;
  mp_size_t sn, n1, n2, n3, n4, n5, n6, n7, n8, rpn, t4, h1;

  TMP_DECL;

  sn = (n + 3) / 4;     /* full piece size */
  h1 = n - 3*sn;        /* size of the (short) top pieces a3, b3 */

  /* Piece views of the operands.  */
#define a0 (up)
#define a1 (up + sn)
#define a2 (up + 2*sn)
#define a3 (up + 3*sn)
#define b0 (vp)
#define b1 (vp + sn)
#define b2 (vp + 2*sn)
#define b3 (vp + 3*sn)

   t4 = 2*sn+2; // allows mult of 2 integers of sn + 1 limbs

   TMP_MARK;

   tp = TMP_ALLOC_LIMBS(4*t4 + 5*(sn + 1));

  /* Scratch buffers for the evaluated points.  */
#define u2 (tp + 4*t4)
#define u3 (tp + 4*t4 + (sn+1))
#define u4 (tp + 4*t4 + 2*(sn+1))
#define u5 (tp + 4*t4 + 3*(sn+1))
#define u6 (tp + 4*t4 + 4*(sn+1))

   /* Evaluate a at +1 (u3) and -1 (u4, with sign in n4):
      u6 = a1 + a3 (odd part), u5 = a0 + a2 (even part).  */
   u6[sn] = mpn_add(u6, a1, sn, a3, h1);
   u5[sn] = mpn_add_n(u5, a2, a0, sn);
   mpn_add_n(u3, u5, u6, sn + 1);
   n4 = sn + 1;
   if (mpn_cmp(u5, u6, sn + 1) >= 0)
     mpn_sub_n(u4, u5, u6, sn + 1);
   else
     {
       mpn_sub_n(u4, u6, u5, sn + 1);
       n4 = -n4;   /* negative length encodes a negative value */
     }

   /* Same for b: b(1) goes to r2, |b(-1)| stays in u5 (sign in n5).  */
   u6[sn] = mpn_add(u6, b1, sn, b3, h1);
   u5[sn] = mpn_add_n(u5, b2, b0, sn);
   mpn_add_n(r2, u5, u6, sn + 1);
   n5 = sn + 1;
   if (mpn_cmp(u5, u6, sn + 1) >= 0)
     mpn_sub_n(u5, u5, u6, sn + 1);
   else
     {
       mpn_sub_n(u5, u6, u5, sn + 1);
       n5 = -n5;
     }

   MUL_TC4_UNSIGNED(r3, n3, u3, sn + 1, r2, sn + 1); /* 1 */
   MUL_TC4(r4, n4, u4, n4, u5, n5); /* -1 */

   /* Evaluate a at 1/2 scaled by 8: r1 = 8a0 + 2a2, r2 = 4a1 + a3,
      then u5 = sum (value at +1/2), u6 = |difference| (at -1/2).  */
#if HAVE_NATIVE_mpn_addlsh_n
   r1[sn] = mpn_addlsh_n(r1, a2, a0, sn, 2);
   mpn_lshift(r1, r1, sn + 1, 1);
   cy = mpn_addlsh_n(r2, a3, a1, h1, 2);
#else
   r1[sn] = mpn_lshift(r1, a2, sn, 1);
   MPN_COPY(r2, a3, h1);
   r1[sn] += mpn_addmul_1(r1, a0, sn, 8);
   cy = mpn_addmul_1(r2, a1, h1, 4);
#endif
   if (sn > h1)
     {
       /* a3 is short; finish 4*a1 over the remaining limbs.  */
       cy2 = mpn_lshift(r2 + h1, a1 + h1, sn - h1, 2);
       cy = cy2 + mpn_add_1(r2 + h1, r2 + h1, sn - h1, cy);
     }
   r2[sn] = cy;
   mpn_add_n(u5, r1, r2, sn + 1);
   n6 = sn + 1;
   if (mpn_cmp(r1, r2, sn + 1) >= 0)
     mpn_sub_n(u6, r1, r2, sn + 1);
   else
     {
       mpn_sub_n(u6, r2, r1, sn + 1);
       n6 = -n6;
     }

   /* Same 1/2-point evaluation for b: u2 = sum, r2 = |difference|.  */
#if HAVE_NATIVE_mpn_addlsh_n
   r1[sn] = mpn_addlsh_n(r1, b2, b0, sn, 2);
   mpn_lshift(r1, r1, sn + 1, 1);
   cy = mpn_addlsh_n(r2, b3, b1, h1, 2);
#else
   r1[sn] = mpn_lshift(r1, b2, sn, 1);
   MPN_COPY(r2, b3, h1);
   r1[sn] += mpn_addmul_1(r1, b0, sn, 8);
   cy = mpn_addmul_1(r2, b1, h1, 4);
#endif
   if (sn > h1)
     {
       cy2 = mpn_lshift(r2 + h1, b1 + h1, sn - h1, 2);
       cy = cy2 + mpn_add_1(r2 + h1, r2 + h1, sn - h1, cy);
     }
   r2[sn] = cy;
   mpn_add_n(u2, r1, r2, sn + 1);
   n8 = sn + 1;
   if (mpn_cmp(r1, r2, sn + 1) >= 0)
     mpn_sub_n(r2, r1, r2, sn + 1);
   else
     {
       mpn_sub_n(r2, r2, r1, sn + 1);
       n8 = -n8;
     }

   /* The next products clobber the low limbs of r3; save them so the
      second one can be restored (r30 is handed to the interpolator).  */
   r30 = r3[0];
   r31 = r3[1];
   MUL_TC4_UNSIGNED(r5, n5, u5, sn + 1, u2, sn + 1); /* 1/2 */
   MUL_TC4(r6, n6, u6, n6, r2, n8); /* -1/2 */
   r3[1] = r31;

   /* Evaluate a at 2 into u2: a0 + 2a1 + 4a2 + 8a3 (Horner).  */
#if HAVE_NATIVE_mpn_addlsh1_n
   cy = mpn_addlsh1_n(u2, a2, a3, h1);
   if (sn > h1)
     cy = mpn_add_1(u2 + h1, a2 + h1, sn - h1, cy);
   u2[sn] = cy;
   u2[sn] = 2*u2[sn] + mpn_addlsh1_n(u2, a1, u2, sn);
   u2[sn] = 2*u2[sn] + mpn_addlsh1_n(u2, a0, u2, sn);
#else
   MPN_COPY(u2, a0, sn);
   u2[sn] = mpn_addmul_1(u2, a1, sn, 2);
   u2[sn] += mpn_addmul_1(u2, a2, sn, 4);
   cy = mpn_addmul_1(u2, a3, h1, 8);
   if (sn > h1)
     cy = mpn_add_1(u2 + h1, u2 + h1, sn - h1, cy);
   u2[sn] += cy;
#endif

   /* Evaluate b at 2 into r1 the same way.  */
#if HAVE_NATIVE_mpn_addlsh1_n
   cy = mpn_addlsh1_n(r1, b2, b3, h1);
   if (sn > h1)
     cy = mpn_add_1(r1 + h1, b2 + h1, sn - h1, cy);
   r1[sn] = cy;
   r1[sn] = 2*r1[sn] + mpn_addlsh1_n(r1, b1, r1, sn);
   r1[sn] = 2*r1[sn] + mpn_addlsh1_n(r1, b0, r1, sn);
#else
   MPN_COPY(r1, b0, sn);
   r1[sn] = mpn_addmul_1(r1, b1, sn, 2);
   r1[sn] += mpn_addmul_1(r1, b2, sn, 4);
   cy = mpn_addmul_1(r1, b3, h1, 8);
   if (sn > h1)
     cy = mpn_add_1(r1 + h1, r1 + h1, sn - h1, cy);
   r1[sn] += cy;
#endif

   MUL_TC4_UNSIGNED(r2, n2, u2, sn + 1, r1, sn + 1); /* 2 */
   MUL_TC4_UNSIGNED(r1, n1, a3, h1, b3, h1); /* oo */
   MUL_TC4_UNSIGNED(r7, n7, a0, sn, b0, sn); /* 0 */

   TC4_DENORM(r1, n1, t4 - 1);

   /* Layout of the seven partial products relative to rp:
      rp rp1 rp2 rp3 rp4 rp5 rp6 rp7
      <----------- r7-----------><------------r5-------------->
      <-------------r3------------->
      <-------------r6------------->
      < -----------r2------------>{ }
      <-------------r4-------------->
      <--------------r1---->
      (NOTE(review): this diagram's original alignment was lost in
      formatting; offsets are indicative only.)  */
   mpn_toom4_interpolate(rp, &rpn, sn, tp, t4 - 1, n4, n6, r30);

   /* Interpolation may produce fewer than 2n limbs; zero the rest.  */
   if (rpn != 2*n)
     {
       MPN_ZERO((rp + rpn), 2*n - rpn);
     }

   TMP_FREE;
}