/* Divide the |xn|-limb operand {x, |xn|} exactly by 3, storing the quotient
   at rp and its signed, normalized limb count in *rn.  The sign of *rn
   follows the sign of xn; if the quotient's top limb is zero, |*rn| shrinks
   by one.  xn == 0 yields *rn = 0 and no division is performed. */
void tc4_divexact_by3(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn)
{
   mp_size_t xu;

   if (xn == 0)
   {
      *rn = 0;
      return;
   }

   xu = ABS(xn);
   mpn_divexact_by3(rp, x, xu);

   /* Normalize: a zero high limb steps the signed size towards zero. */
   if (rp[xu - 1] != CNST_LIMB(0))
      *rn = xn;
   else
      *rn = (xn > 0) ? xn - 1 : xn + 1;
}
/* put in {c, 2n} where n = 2k+r the value of {v0,2k} (already in place)
   + B^k * [{v1, 2k+1} - {t1, 2k+1}]
   + B^(2k) * [{t2, 2k+1} - {v0+vinf, 2k}]
   + B^(3k) * [{t1, 2k+1} - {t2, 2k+1}]
   + B^(4k) * {vinf,2r} (high 2r-1 limbs already in place)
   where
     {t1, 2k+1} = (3*{v0,2k}+2*sa*{vm1,2k+1}+{v2,2k+1})/6-2*{vinf,2r}
     {t2, 2k+1} = ({v1, 2k+1} + sa * {vm1, 2k+1})/2
   (sa is the sign of {vm1, 2k+1}).

   {vinf, 2r} stores the content of {v0, 2r} + {vinf, 2r}, with carry in
   cinf0.  vinf0 is the low limb of vinf.

   ws is temporary space, and should have at least 2r limbs.

   Think about: The evaluated point a-b+c stands a good chance of having a
   zero carry limb, a+b+c would have a 1/4 chance, and 4*a+2*b+c a 1/8
   chance, roughly.  Perhaps this could be tested and stripped.  Doing so
   before recursing would be better than stripping at the start of
   mpn_toom3_mul_n/sqr_n, since then the recursion could be based on the new
   size.  Although in truth the kara vs toom3 crossover is never so exact
   that one limb either way makes a difference.

   A small value like 1 or 2 for the carry could perhaps also be handled
   with an add_n or addlsh1_n.  Would that be faster than an extra limb on a
   (recursed) multiply/square? */
static void
toom3_interpolate (mp_ptr c, mp_srcptr v1, mp_ptr v2, mp_ptr vm1,
                   mp_ptr vinf, mp_size_t k, mp_size_t r, int sa,
                   mp_limb_t vinf0, mp_limb_t cinf0, mp_ptr ws)
{
  mp_limb_t cy, saved;
  unsigned long twok = k + k;
  unsigned long kk1 = twok + 1;
  unsigned long twor = r + r;
  mp_ptr c1, c2, c3, c4, c5;
  mp_limb_t cout; /* final carry, should be zero at the end */

  /* c1..c5 mark the k-limb boundaries of the 2n-limb result area. */
  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  c5 = c4 + k;

#define v0 (c)
  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0      |vm1|       hi(vinf)      v1       v2+2vm1      vinf
                                                               +lo(v0) */

  ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1));    /* v2 <- v2 / 3 */
#ifdef HAVE_NATIVE_mpn_rsh1add_n
  mpn_rsh1add_n (v2, v2, v0, twok); /* v2 <- (lo(v2)+v0) / 2, exact */
  cy = v2[twok] & 1; /* add high limb of v2 divided by 2 */
  v2[twok] >>= 1;
  MPN_INCR_U (v2 + twok - 1, 2, cy << (GMP_NUMB_BITS - 1));
#else
  v2[twok] += mpn_add_n (v2, v2, v0, twok);
  mpn_rshift (v2, v2, kk1, 1);
#endif

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0      |vm1|       hi(vinf)      v1     (3v0+2vm1+v2)    vinf
                                                     /6         +lo(v0) */

  /* vm1 <- t2 := (v1 + sa*vm1) / 2
     t2 = a0*b0+a0*b2+a1*b1+a2*b0+a2*b2 >= 0
     No carry comes out from {v1, kk1} +/- {vm1, kk1},
     and the division by two is exact */
  if (sa >= 0)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (vm1, v1, vm1, kk1);
#else
      mpn_add_n (vm1, vm1, v1, kk1);
      mpn_rshift (vm1, vm1, kk1, 1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (vm1, v1, vm1, kk1);
#else
      mpn_sub_n (vm1, v1, vm1, kk1);
      mpn_rshift (vm1, vm1, kk1, 1);
#endif
    }

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       t2         hi(vinf)      v1         t1        vinf+lo(v0) */

  /* subtract 2*vinf to v2, result is t1 :=
     a0*b0+a0*b2+a1*b1+a1*b2+a2*b0+a2*b1+a2*b2 >= 0 */
  /* temporarily graft vinf's true low limb onto c4 so {c4, twor} is vinf */
  saved = c4[0];
  c4[0] = vinf0;
#ifdef HAVE_NATIVE_mpn_sublsh1_n
  cy = mpn_sublsh1_n (v2, v2, c4, twor);
#else
  cy = mpn_lshift (ws, c4, twor, 1);
  cy += mpn_sub_n (v2, v2, ws, twor);
#endif
  MPN_DECR_U (v2 + twor, kk1 - twor, cy);
  c4[0] = saved;

  /* subtract {t2, 2k+1} in {c+3k, 2k+1} i.e. in {t2+k, 2k+1}:
     by chunks of k limbs from right to left to avoid overlap */
#define t2 (vm1)
  /* a borrow may occur in one of the 2 following __GMPN_SUB_1 calls, but
     since the final result is nonnegative, it will be compensated later
     on */
  __GMPN_SUB_1 (cout, c5, c5, twor - k, t2[twok]);
  cy = mpn_sub_n (c4, c4, t2 + k, k);
  __GMPN_SUB_1 (cout, c5, c5, twor - k, cy);
  cy = mpn_sub_n (c3, c3, t2, k);
  __GMPN_SUB_1 (cout, c4, c4, twor, cy);

  /* don't forget to add vinf0 in {c+4k, ...} */
  __GMPN_ADD_1 (cout, c4, c4, twor, vinf0);

  /* c    c+k   c+2k  c+3k     c+4k      t   t+2k+1  t+4k+2
     v0         t2    -t2   vinf(full)   v1    t1    vinf+lo(v0) */

  /* subtract v0+vinf in {c+2k, ...} */
  cy = cinf0 + mpn_sub_n (c2, c2, vinf, twor);
  if (twor < twok)
    {
      __GMPN_SUB_1 (cy, c2 + twor, c2 + twor, twok - twor, cy);
      cy += mpn_sub_n (c2 + twor, c2 + twor, v0 + twor, twok - twor);
    }
  __GMPN_SUB_1 (cout, c4, c4, twor, cy); /* 2n-4k = 2r */

  /* c    c+k   c+2k  c+3k   c+4k    t   t+2k+1  t+4k+2
     v0         t2-v0 -t2    vinf    v1    t1    vinf
               -vinf                             +lo(v0) */

  /* subtract t1 in {c+k, ...} */
  cy = mpn_sub_n (c1, c1, v2, kk1);
  __GMPN_SUB_1 (cout, c3 + 1, c3 + 1, twor + k - 1, cy); /* 2n-(3k+1)=k+2r-1 */

  /* add t1 in {c+3k, ...} */
  cy = mpn_add_n (c3, c3, v2, kk1);
  __GMPN_ADD_1 (cout, c5 + 1, c5 + 1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */

  /* add v1 in {c+k, ...} */
  cy = mpn_add_n (c1, c1, v1, kk1);
  __GMPN_ADD_1 (cout, c3 + 1, c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */
#undef v0
#undef t2
}
/* Toom 4 interpolation. Interpolates the value at 2^(sn*B) of a polynomial p(x) with 7 coefficients given the values p(oo), p(2), p(1), p(-1), 2^6*p(1/2), 2^6*p(-1/2), p(0). The output is placed in rp and the final number of limbs of the output is given in rpn. The 4th and 6th values may be negative, and if so, n4 and n6 should be set to a negative value respectively. To save space we pass r3, r5, r7 in place in the output rp. The other r's are stored separately in space tp. The low limb of r3 is stored in r30, as it will be overwritten by the high limb of r5. rp rp1 rp2 rp3 rp4 rp5 rp6 rp7 <----------- r7-----------><------------r5--------------> <-------------r3-------------> We assume that r1 is stored at tp, r2 at (tp + t4), r4 at (tp + 2*t4) and r6 (tp + 3*t4). Each of these r's has t4 = s4 + 1 limbs allocated. */ void mpn_toom4_interpolate(mp_ptr rp, mp_size_t * rpn, mp_size_t sn, mp_ptr tp, mp_size_t s4, mp_size_t n4, mp_size_t n6, mp_limb_t r30) { mp_size_t n1, n2, n3, n5, n7, t4; mp_limb_t saved, saved2, cy; t4 = s4 + 1; mpn_add_n(r2, r2, r5, s4); if (n6 < 0) mpn_add_n(r6, r5, r6, s4); else mpn_sub_n(r6, r5, r6, s4); /* r6 is now in twos complement format */ saved = r3[0]; r3[0] = r30; if (n4 < 0) mpn_add_n(r4, r3, r4, s4); else mpn_sub_n(r4, r3, r4, s4); r3[0] = saved; /* r4 is now in twos complement format */ mpn_sub_n(r5, r5, r1, s4); #if HAVE_NATIVE_mpn_sublsh_n r5[s4-1] -= mpn_sublsh_n(r5, r5, r7, s4-1, 6); #else r5[s4-1] -= mpn_submul_1(r5, r7, s4-1, 64); #endif TC4_RSHIFT1(r4, s4); saved = r3[0]; r3[0] = r30; mpn_sub_n(r3, r3, r4, s4); r30 = r3[0]; r3[0] = saved; mpn_double(r5, s4); mpn_sub_n(r5, r5, r6, s4); saved = r3[0]; r3[0] = r30; mpn_submul_1(r2, r3, s4, 65); r3[0] = saved; saved2 = r7[s4-1]; r7[s4-1] = CNST_LIMB(0); // r7 is always positive so no sign extend needed saved = r3[0]; r3[0] = r30; #if HAVE_NATIVE_mpn_subadd_n mpn_subadd_n(r3, r3, r7, r1, s4); #else mpn_sub_n(r3, r3, r7, s4); mpn_sub_n(r3, r3, r1, s4); #endif r7[s4-1] = saved2; 
r30 = r3[0]; mpn_addmul_1(r2, r3, s4, 45); #if HAVE_NATIVE_mpn_sublsh_n cy = mpn_sublsh_n(r5, r5, r3, s4 - 1, 3); #else cy = mpn_submul_1(r5, r3, s4 - 1, 8); #endif r3[0] = saved; r3[0] -= (cy + 8*r3[s4-1]); mpn_rshift(r5, r5, s4, 3); mpn_divexact_by3(r5, r5, s4); mpn_sub_n(r6, r6, r2, s4); #if HAVE_NATIVE_mpn_sublsh_n mpn_sublsh_n(r2, r2, r4, s4, 4); #else mpn_submul_1(r2, r4, s4, 16); #endif mpn_rshift(r2, r2, s4, 1); mpn_divexact_by3(r2, r2, s4); mpn_divexact_by3(r2, r2, s4); saved = r3[0]; r3[0] = r30; cy = mpn_sub_n(r3, r3, r5, s4 - 1); r30 = r3[0]; r3[0] = saved; r3[s4-1] -= (cy + r5[s4-1]); mpn_sub_n(r4, r4, r2, s4); mpn_addmul_1(r6, r2, s4, 30); mpn_divexact_byfobm1(r6, r6, s4, CNST_LIMB(15), CNST_LIMB(~0/15)); mpn_rshift(r6, r6, s4, 2); mpn_sub_n(r2, r2, r6, s4); TC4_NORM(r1, n1, s4); TC4_NORM(r2, n2, s4); (*rpn) = 6*sn+1; cy = mpn_add_1(r3, r3, *rpn - 4*sn, r30); /* don't forget to add r3[0] back in */ if (cy) { rp[*rpn] = cy; (*rpn)++; } tc4_copy(rp, rpn, 5*sn, r2, n2); tc4_copy(rp, rpn, 6*sn, r1, n1); tc4_copy(rp, rpn, sn, r6, s4); tc4_copy(rp, rpn, 3*sn, r4, s4); }
/* Interpolation for Toom variants evaluated at 8 points.  The partial
   products already sit in pp (with r5 and r1 in place), while r3 and r7
   are supplied separately; spt is the size of the short high product r1,
   and ws is scratch of at least 3n+1 limbs. */
void
mpn_toom_interpolate_8pts (mp_ptr pp, mp_size_t n,
                           mp_ptr r3, mp_ptr r7,
                           mp_size_t spt, mp_ptr ws)
{
  mp_limb_signed_t cy;   /* signed: the recomposition below tests 0 > cy */
  mp_ptr r5, r1;
  r5 = (pp + 3 * n);			/* 3n+1 */
  r1 = (pp + 7 * n);			/* spt */

  /******************************* interpolation *****************************/

  DO_mpn_subrsh(r3+n, 2 * n + 1, pp, 2 * n, 4, ws);
  cy = DO_mpn_sublsh_n (r3, r1, spt, 12, ws);
  MPN_DECR_U (r3 + spt, 3 * n + 1 - spt, cy);

  DO_mpn_subrsh(r5+n, 2 * n + 1, pp, 2 * n, 2, ws);
  cy = DO_mpn_sublsh_n (r5, r1, spt, 6, ws);
  MPN_DECR_U (r5 + spt, 3 * n + 1 - spt, cy);

  r7[3*n] -= mpn_sub_n (r7+n, r7+n, pp, 2 * n);
  cy = mpn_sub_n (r7, r7, r1, spt);
  MPN_DECR_U (r7 + spt, 3 * n + 1 - spt, cy);

  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));
  ASSERT_NOCARRY(mpn_rshift(r3, r3, 3 * n + 1, 2));

  ASSERT_NOCARRY(mpn_sub_n (r5, r5, r7, 3 * n + 1));

  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));

  mpn_divexact_by45 (r3, r3, 3 * n + 1);

  ASSERT_NOCARRY(mpn_divexact_by3 (r5, r5, 3 * n + 1));

  ASSERT_NOCARRY(DO_mpn_sublsh2_n (r5, r3, 3 * n + 1, ws));

  /* last interpolation steps are mixed with recomposition */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
     |_H r1|_L r1|____||_H r5|_M_r5|_L r5|_____|_H r8|_L r8|pp

    summation scheme for remaining operations:
     |____8|n___7|n___6|n___5|n___4|n___3|n___2|n____|n____|pp
     |_H r1|_L r1|____||_H*r5|_M r5|_L r5|_____|_H_r8|_L r8|pp
	  ||_H r3|_M r3|_L*r3|
				  ||_H_r7|_M_r7|_L_r7|
		      ||-H r3|-M r3|-L*r3|
				  ||-H*r5|-M_r5|-L_r5|
  */

  cy = mpn_add_n (pp + n, pp + n, r7, n); /* Hr8+Lr7-Lr5 */
  cy-= mpn_sub_n (pp + n, pp + n, r5, n);
  if (0 > cy)
    MPN_DECR_U (r7 + n, 2*n + 1, 1);
  else
    MPN_INCR_U (r7 + n, 2*n + 1, cy);

  cy = mpn_sub_n (pp + 2*n, r7 + n, r5 + n, n); /* Mr7-Mr5 */
  MPN_DECR_U (r7 + 2*n, n + 1, cy);

  cy = mpn_add_n (pp + 3*n, r5, r7+ 2*n, n+1); /* Hr7+Lr5 */
  r5[3*n]+= mpn_add_n (r5 + 2*n, r5 + 2*n, r3, n); /* Hr5+Lr3 */
  cy-= mpn_sub_n (pp + 3*n, pp + 3*n, r5 + 2*n, n+1); /* Hr7-Hr5+Lr5-Lr3 */
  if (UNLIKELY(0 > cy))
    MPN_DECR_U (r5 + n + 1, 2*n, 1);
  else
    MPN_INCR_U (r5 + n + 1, 2*n, cy);

  ASSERT_NOCARRY(mpn_sub_n(pp + 4*n, r5 + n, r3 + n, 2*n +1)); /* Mr5-Mr3,Hr5-Hr3 */

  cy = mpn_add_1 (pp + 6*n, r3 + n, n, pp[6*n]);
  MPN_INCR_U (r3 + 2*n, n + 1, cy);
  cy = mpn_add_n (pp + 7*n, pp + 7*n, r3 + 2*n, n);
  if (LIKELY(spt != n))
    MPN_INCR_U (pp + 8*n, spt - n, cy + r3[3*n]);
  else
    /* no room to propagate: both the carry and r3's top limb must be zero.
       BUG FIX: was `r3[3*n] | cy == 0`, which C precedence parses as
       `r3[3*n] | (cy == 0)` — true whenever r3[3*n] != 0, i.e. the assert
       passed exactly when it should have failed. */
    ASSERT ((r3[3*n] | cy) == 0);
}
/* Interpolation for Toom variants with 7 evaluation points.  w1..w5 hold
   the separately-stored values (2n+1 limbs each), while w0, w2, w6 live
   in the result area rp; w6n is the length of the top value, flags says
   which of w1/w3 arrived negated, and tp is scratch of at least 2n+1
   limbs.  The result (6n+w6n limbs) is accumulated into rp. */
void
mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags,
                           mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5,
                           mp_size_t w6n, mp_ptr tp)
{
  mp_size_t m;
  mp_limb_t cy;

  m = 2*n + 1;
#define w0 rp
#define w2 (rp + 2*n)
#define w6 (rp + 6*n)

  ASSERT (w6n > 0);
  ASSERT (w6n <= 2*n);

  /* Using formulas similar to Marco Bodrato's

     W5 = W5 + W4
     W1 =(W4 - W1)/2
     W4 = W4 - W0
     W4 =(W4 - W1)/4 - W6*16
     W3 =(W2 - W3)/2
     W2 = W2 - W3

     W5 = W5 - W2*65      May be negative.
     W2 = W2 - W6 - W0
     W5 =(W5 + W2*45)/2   Now >= 0 again.
     W4 =(W4 - W2)/3
     W2 = W2 - W4

     W1 = W5 - W1         May be negative.
     W5 =(W5 - W3*8)/9
     W3 = W3 - W5
     W1 =(W1/15 + W5)/2   Now >= 0 again.
     W5 = W5 - W1

     where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1),
	   W4 = f(2), W5 = f(1/2), W6 = f(oo),

     Note that most intermediate results are positive; the ones that may
     be negative are represented in two's complement.  We must never
     shift right a value that may be negative, since that would
     invalidate the sign bit.  On the other hand, divexact by odd
     numbers work fine with two's complement. */

  mpn_add_n (w5, w5, w4, m);
  if (flags & toom7_w1_neg)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (w1, w1, w4, m);
#else
      mpn_add_n (w1, w1, w4, m);
      ASSERT (!(w1[0] & 1));
      mpn_rshift (w1, w1, m, 1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (w1, w4, w1, m);
#else
      mpn_sub_n (w1, w4, w1, m);
      ASSERT (!(w1[0] & 1));
      mpn_rshift (w1, w1, m, 1);
#endif
    }
  mpn_sub (w4, w4, m, w0, 2*n);
  mpn_sub_n (w4, w4, w1, m);
  ASSERT (!(w4[0] & 3));
  mpn_rshift (w4, w4, m, 2); /* w4>=0 */

  tp[w6n] = mpn_lshift (tp, w6, w6n, 4);
  mpn_sub (w4, w4, m, tp, w6n+1);

  if (flags & toom7_w3_neg)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (w3, w3, w2, m);
#else
      mpn_add_n (w3, w3, w2, m);
      ASSERT (!(w3[0] & 1));
      mpn_rshift (w3, w3, m, 1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (w3, w2, w3, m);
#else
      mpn_sub_n (w3, w2, w3, m);
      ASSERT (!(w3[0] & 1));
      mpn_rshift (w3, w3, m, 1);
#endif
    }

  mpn_sub_n (w2, w2, w3, m);

  mpn_submul_1 (w5, w2, m, 65);
  mpn_sub (w2, w2, m, w6, w6n);
  mpn_sub (w2, w2, m, w0, 2*n);

  mpn_addmul_1 (w5, w2, m, 45);
  ASSERT (!(w5[0] & 1));
  mpn_rshift (w5, w5, m, 1);
  mpn_sub_n (w4, w4, w2, m);

  mpn_divexact_by3 (w4, w4, m);
  mpn_sub_n (w2, w2, w4, m);

  mpn_sub_n (w1, w5, w1, m);
  mpn_lshift (tp, w3, m, 3);
  mpn_sub_n (w5, w5, tp, m);
  mpn_divexact_by9 (w5, w5, m);
  mpn_sub_n (w3, w3, w5, m);

  mpn_divexact_by15 (w1, w1, m);
  mpn_add_n (w1, w1, w5, m);
  ASSERT (!(w1[0] & 1));
  mpn_rshift (w1, w1, m, 1); /* w1>=0 now */

  mpn_sub_n (w5, w5, w1, m);

  /* These bounds are valid for the 4x4 polynomial product of toom44,
   * and they are conservative for toom53 and toom62. */
  ASSERT (w1[2*n] < 2);
  ASSERT (w2[2*n] < 3);
  ASSERT (w3[2*n] < 4);
  ASSERT (w4[2*n] < 3);
  ASSERT (w5[2*n] < 2);

  /* Addition chain. Note carries and the 2n'th limbs that need to be
   * added in.
   *
   * Special care is needed for w2[2n] and the corresponding carry,
   * since the "simple" way of adding it all together would overwrite
   * the limb at wp[2*n] and rp[4*n] (same location) with the sum of
   * the high half of w3 and the low half of w4.
   *
   *         7    6    5    4    3    2    1    0
   *    |    |    |    |    |    |    |    |    |
   *                  ||w3 (2n+1)|
   *             ||w4 (2n+1)|
   *        ||w5 (2n+1)|         ||w1 (2n+1)|
   *  + | w6 (w6n)|        ||w2 (2n+1)| w0 (2n) |  (share storage with r)
   *  -----------------------------------------------
   *  r |    |    |    |    |    |    |    |    |
   *        c7   c6   c5   c4   c3                 Carries to propagate
   */

  cy = mpn_add_n (rp + n, rp + n, w1, m);
  MPN_INCR_U (w2 + n + 1, n , cy);
  cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n);
  MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy);
  cy = mpn_add_n (rp + 4*n, w3 + n, w4, n);
  MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy);
  cy = mpn_add_n (rp + 5*n, w4 + n, w5, n);
  MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy);
  if (w6n > n + 1)
    ASSERT_NOCARRY (mpn_add (rp + 6*n, rp + 6*n, w6n, w5 + n, n + 1));
  else
    {
      ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n));
#if WANT_ASSERT
      {
        /* the part of w5 that did not fit must be all zero */
        mp_size_t i;
        for (i = w6n; i <= n; i++)
          ASSERT (w5[n + i] == 0);
      }
#endif
    }
}
/* Toom-3 interpolation from 5 evaluation points (in-place variant, no
   scratch parameter: the freed vm1 area is recycled internally).
   c holds v0, v1 and the high part of vinf in place; v2 and vm1 are
   separate (2k+1 limbs each); twor is the length of vinf; sa != 0 means
   vm1 is negative; vinf0 is the true low limb of vinf (its slot in c is
   still occupied by v1's top limb).
   NOTE(review): a second definition of this same symbol appears later in
   this file (the ws-taking variant) — only one of the two can be linked. */
void
mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1,
                           mp_size_t k, mp_size_t twor, int sa,
                           mp_limb_t vinf0)
{
  mp_limb_t cy, saved;
  mp_size_t twok;
  mp_size_t kk1;
  mp_ptr c1, v1, c3, vinf;

  twok = k + k;
  kk1 = twok + 1;

  c1 = c + k;
  v1 = c1 + k;
  c3 = v1 + k;
  vinf = c3 + k;

#define v0 (c)
  /* (1) v2 <- v2-vm1 < v2+|vm1|,       (16 8 4 2 1) - (1 -1 1 -1 1) =
     thus 0 <= v2 < 50*B^(2k) < 2^6*B^(2k)             (15 9 3 3 0)
  */
  if (sa)
    ASSERT_NOCARRY (mpn_add_n (v2, v2, vm1, kk1));
  else
    ASSERT_NOCARRY (mpn_sub_n (v2, v2, vm1, kk1));

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       v1       hi(vinf)       |vm1|     v2-vm1      EMPTY */

  ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1));    /* v2 <- v2 / 3 */
						    /* (5 3 1 1 0)*/

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       v1      hi(vinf)       |vm1|   (v2-vm1)/3    EMPTY */

  /* (2) vm1 <- tm1 := (v1 - vm1) / 2  [(1 1 1 1 1) - (1 -1 1 -1 1)] / 2 =
     tm1 >= 0                                         (0  1 0  1 0)
     No carry comes out from {v1, kk1} +/- {vm1, kk1},
     and the division by two is exact.
     If (sa!=0) the sign of vm1 is negative */
  if (sa)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (vm1, v1, vm1, kk1);
#else
      ASSERT_NOCARRY (mpn_add_n (vm1, v1, vm1, kk1));
      ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (vm1, v1, vm1, kk1);
#else
      ASSERT_NOCARRY (mpn_sub_n (vm1, v1, vm1, kk1));
      ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
#endif
    }

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       v1        hi(vinf)       tm1     (v2-vm1)/3    EMPTY */

  /* (3) v1 <- t1 := v1 - v0    (1 1 1 1 1) - (0 0 0 0 1) = (1 1 1 1 0)
     t1 >= 0
  */
  vinf[0] -= mpn_sub_n (v1, v1, c, twok);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0     v1-v0        hi(vinf)       tm1     (v2-vm1)/3    EMPTY */

  /* (4) v2 <- t2 := ((v2-vm1)/3-t1)/2 = (v2-vm1-3*t1)/6
     t2 >= 0                  [(5 3 1 1 0) - (1 1 1 1 0)]/2 = (2 1 0 0 0)
  */
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (v2, v2, v1, kk1);
#else
  ASSERT_NOCARRY (mpn_sub_n (v2, v2, v1, kk1));
  ASSERT_NOCARRY (mpn_rshift (v2, v2, kk1, 1));
#endif

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0     v1-v0        hi(vinf)     tm1    (v2-vm1-3t1)/6    EMPTY */

  /* (5) v1 <- t1-tm1           (1 1 1 1 0) - (0 1 0 1 0) = (1 0 1 0 0)
     result is v1 >= 0
  */
  ASSERT_NOCARRY (mpn_sub_n (v1, v1, vm1, kk1));

  /* We do not need to read the value in vm1, so we add it in {c+k, ...} */
  cy = mpn_add_n (c1, c1, vm1, kk1);
  MPN_INCR_U (c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */
  /* Memory allocated for vm1 is now free, it can be recycled ...*/

  /* (6) v2 <- v2 - 2*vinf,     (2 1 0 0 0) - 2*(1 0 0 0 0) = (0 1 0 0 0)
     result is v2 >= 0 */
  saved = vinf[0];       /* Remember v1's highest limb (will be overwritten). */
  vinf[0] = vinf0;       /* Set the right value for vinf0                     */
#ifdef HAVE_NATIVE_mpn_sublsh1_n
  cy = mpn_sublsh1_n (v2, v2, vinf, twor);
#else
  /* Overwrite unused vm1 */
  cy = mpn_lshift (vm1, vinf, twor, 1);
  cy += mpn_sub_n (v2, v2, vm1, twor);
#endif
  MPN_DECR_U (v2 + twor, kk1 - twor, cy);

  /* Current matrix is
     [1 0 0 0 0; vinf
      0 1 0 0 0; v2
      1 0 1 0 0; v1
      0 1 0 1 0; vm1
      0 0 0 0 1] v0
     Some values already are in-place (we added vm1 in the correct position)
      | vinf|  v1 |  v0 |
	   | vm1 |
     One still is in a separated area
	| +v2 |
     We have to compute v1-=vinf; vm1 -= v2,
	   |-vinf|
	| -v2 |
     Carefully reordering operations we can avoid to compute twice the sum
     of the high half of v2 plus the low half of vinf.
  */

  /* Add the high half of t2 in {vinf} */
  if ( LIKELY(twor > k + 1) ) { /* This is the expected flow  */
    cy = mpn_add_n (vinf, vinf, v2 + k, k + 1);
    MPN_INCR_U (c3 + kk1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */
  } else { /* triggered only by very unbalanced cases like
	      (k+k+(k-2))x(k+k+1) , should be handled by toom32 */
    ASSERT_NOCARRY (mpn_add_n (vinf, vinf, v2 + k, twor));
  }
  /* (7) v1 <- v1 - vinf,       (1 0 1 0 0) - (1 0 0 0 0) = (0 0 1 0 0)
     result is >= 0 */
  /* Side effect: we also subtracted (high half) vm1 -= v2 */
  cy = mpn_sub_n (v1, v1, vinf, twor);          /* vinf is at most twor long.  */
  vinf0 = vinf[0];                     /* Save again the right value for vinf0 */
  vinf[0] = saved;
  MPN_DECR_U (v1 + twor, kk1 - twor, cy);       /* Treat the last limbs.       */

  /* (8) vm1 <- vm1-v2          (0 1 0 1 0) - (0 1 0 0 0) = (0 0 0 1 0)
     Operate only on the low half.
  */
  cy = mpn_sub_n (c1, c1, v2, k);
  MPN_DECR_U (v1, kk1, cy);

  /********************* Beginning the final phase **********************/

  /* Most of the recomposition was done */

  /* add t2 in {c+3k, ...}, but only the low half */
  cy = mpn_add_n (c3, c3, v2, k);
  vinf[0] += cy;
  ASSERT(vinf[0] >= cy); /* No carry */
  MPN_INCR_U (vinf, twor, vinf0); /* Add vinf0, propagate carry. */

#undef v0
}
/* Toom-3 interpolation from 5 evaluation points (older variant taking a
   scratch area ws of at least twor limbs).  Same contract as the
   in-place variant above except that sa <= 0 signals a negative vm1.
   NOTE(review): this duplicates the symbol defined earlier in this file —
   only one of the two definitions can actually be compiled/linked. */
void
mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1,
                           mp_size_t k, mp_size_t twor, int sa,
                           mp_limb_t vinf0, mp_ptr ws)
{
  mp_limb_t cy, saved;
  mp_size_t twok = k + k;
  mp_size_t kk1 = twok + 1;
  mp_ptr c1, v1, c3, vinf, c5;
  mp_limb_t cout; /* final carry, should be zero at the end */

  c1 = c + k;
  v1 = c1 + k;
  c3 = v1 + k;
  vinf = c3 + k;
  c5 = vinf + k;

#define v0 (c)
  /* (1) v2 <- v2-vm1 < v2+|vm1|,       (16 8 4 2 1) - (1 -1 1 -1 1) =
     thus 0 <= v2 < 50*B^(2k) < 2^6*B^(2k)             (15 9 3 3 0)
  */
  if (sa <= 0)
    mpn_add_n (v2, v2, vm1, kk1);
  else
    mpn_sub_n (v2, v2, vm1, kk1);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       v1       hi(vinf)       |vm1|     v2-vm1      EMPTY */

  ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1));    /* v2 <- v2 / 3 */
						    /* (5 3 1 1 0)*/

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       v1      hi(vinf)       |vm1|   (v2-vm1)/3    EMPTY */

  /* (2) vm1 <- tm1 := (v1 - sa*vm1) / 2  [(1 1 1 1 1) - (1 -1 1 -1 1)] / 2 =
     tm1 >= 0                                            (0  1 0  1 0)
     No carry comes out from {v1, kk1} +/- {vm1, kk1},
     and the division by two is exact */
  if (sa <= 0)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (vm1, v1, vm1, kk1);
#else
      mpn_add_n (vm1, v1, vm1, kk1);
      mpn_rshift (vm1, vm1, kk1, 1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (vm1, v1, vm1, kk1);
#else
      mpn_sub_n (vm1, v1, vm1, kk1);
      mpn_rshift (vm1, vm1, kk1, 1);
#endif
    }

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       v1        hi(vinf)       tm1     (v2-vm1)/3    EMPTY */

  /* (3) v1 <- t1 := v1 - v0    (1 1 1 1 1) - (0 0 0 0 1) = (1 1 1 1 0)
     t1 >= 0
  */
  vinf[0] -= mpn_sub_n (v1, v1, c, twok);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0     v1-v0        hi(vinf)       tm1     (v2-vm1)/3    EMPTY */

  /* (4) v2 <- t2 := ((v2-vm1)/3-t1)/2 = (v2-vm1-3*t1)/6
     t2 >= 0                  [(5 3 1 1 0) - (1 1 1 1 0)]/2 = (2 1 0 0 0)
  */
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (v2, v2, v1, kk1);
#else
  mpn_sub_n (v2, v2, v1, kk1);
  mpn_rshift (v2, v2, kk1, 1);
#endif

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0     v1-v0        hi(vinf)     tm1    (v2-vm1-3t1)/6    EMPTY */

  /* (5) v1 <- t1-tm1           (1 1 1 1 0) - (0 1 0 1 0) = (1 0 1 0 0)
     result is v1 >= 0
  */
  mpn_sub_n (v1, v1, vm1, kk1);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0   v1-v0-tm1      hi(vinf)     tm1    (v2-vm1-3t1)/6    EMPTY */

  /* (6) v2 <- v2 - 2*vinf,     (2 1 0 0 0) - 2*(1 0 0 0 0) = (0 1 0 0 0)
     result is v2 >= 0 */
  saved = vinf[0];       /* Remember v1's highest limb (will be overwritten). */
  vinf[0] = vinf0;       /* Set the right value for vinf0                     */
#ifdef HAVE_NATIVE_mpn_sublsh1_n
  cy = mpn_sublsh1_n (v2, v2, vinf, twor);
#else
  cy = mpn_lshift (ws, vinf, twor, 1);
  cy += mpn_sub_n (v2, v2, ws, twor);
#endif
  MPN_DECR_U (v2 + twor, kk1 - twor, cy);

  /* (7) v1 <- v1 - vinf,       (1 0 1 0 0) - (1 0 0 0 0) = (0 0 1 0 0)
     result is >= 0 */
  cy = mpn_sub_n (v1, v1, vinf, twor);          /* vinf is at most twor long.  */
  vinf[0] = saved;
  MPN_DECR_U (v1 + twor, kk1 - twor, cy);       /* Treat the last limbs.       */
  __GMPN_ADD_1 (cout, vinf, vinf, twor, vinf0); /* Add vinf0, propagate carry. */

  /* (8) vm1 <- vm1-t2          (0 1 0 1 0) - (0 1 0 0 0) = (0 0 0 1 0)
     vm1 >= 0
  */
  mpn_sub_n (vm1, vm1, v2, kk1);            /* No overlapping here.        */

  /********************* Beginning the final phase **********************/

  /* {c,2k} {c+2k,2k  } {c+4k ,2r } {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       t1      hi(t1)+vinf  tm1    (v2-vm1-3t1)/6    EMPTY */

  /* (9) add t2 in {c+3k, ...} */
  cy = mpn_add_n (c3, c3, v2, kk1);
  __GMPN_ADD_1 (cout, c5 + 1, c5 + 1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */

  /* c   c+k  c+2k  c+3k  c+4k   t   t+2k+1  t+4k+2
     v0       t1    vinf  tm1    t2
		    +t2                              */

  /* add vm1 in {c+k, ...} */
  cy = mpn_add_n (c1, c1, vm1, kk1);
  __GMPN_ADD_1 (cout, c3 + 1, c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */

  /* c   c+k  c+2k  c+3k  c+4k   t   t+2k+1  t+4k+2
     v0  tm1  t1    vinf  tm1    t2
	 +t2                                         */

#undef v0
  /* NOTE(review): t2 is never #defined in this variant; this #undef is a
     harmless no-op kept from the sibling implementation. */
#undef t2
}
/* Interpolation for Toom variants with 6 evaluation points.  w5, w3, w0
   live in the result area pp; w4, w2, w1 are separate (2n+1 limbs each);
   w0n is the length of the top value, and flags says which of w2/w4
   arrived negated.  The interpolated result is recomposed into pp. */
void
mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags,
                           mp_ptr w4, mp_ptr w2, mp_ptr w1,
                           mp_size_t w0n)
{
  mp_limb_t cy;
  /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */
  mp_limb_t cy4, cy6, embankment;

  ASSERT( n > 0 );
  ASSERT( 2*n >= w0n && w0n > 0 );

#define w5  pp					/* 2n   */
#define w3  (pp + 2 * n)			/* 2n+1 */
#define w0  (pp + 5 * n)			/* w0n  */

  /* Interpolate with sequence:
     W2 =(W1 - W2)>>2
     W1 =(W1 - W5)>>1
     W1 =(W1 - W2)>>1
     W4 =(W3 - W4)>>1
     W2 =(W2 - W4)/3
     W3 = W3 - W4 - W5
     W1 =(W1 - W3)/3
     // Last steps are mixed with recomposition...
     W2 = W2 - W0<<2
     W4 = W4 - W2
     W3 = W3 - W1
     W2 = W2 - W0
  */

  /* W2 =(W1 - W2)>>2 */
  if (flags & toom6_vm2_neg)
    mpn_add_n (w2, w1, w2, 2 * n + 1);
  else
    mpn_sub_n (w2, w1, w2, 2 * n + 1);
  mpn_rshift (w2, w2, 2 * n + 1, 2);

  /* W1 =(W1 - W5)>>1 */
  w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n);
  mpn_rshift (w1, w1, 2 * n + 1, 1);

  /* W1 =(W1 - W2)>>1 */
#if HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1);
#else
  mpn_sub_n (w1, w1, w2, 2 * n + 1);
  mpn_rshift (w1, w1, 2 * n + 1, 1);
#endif

  /* W4 =(W3 - W4)>>1 */
  if (flags & toom6_vm1_neg)
    {
#if HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (w4, w3, w4, 2 * n + 1);
#else
      mpn_add_n (w4, w3, w4, 2 * n + 1);
      mpn_rshift (w4, w4, 2 * n + 1, 1);
#endif
    }
  else
    {
#if HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1);
#else
      mpn_sub_n (w4, w3, w4, 2 * n + 1);
      mpn_rshift (w4, w4, 2 * n + 1, 1);
#endif
    }

  /* W2 =(W2 - W4)/3 */
  mpn_sub_n (w2, w2, w4, 2 * n + 1);
  mpn_divexact_by3 (w2, w2, 2 * n + 1);

  /* W3 = W3 - W4 - W5 */
  mpn_sub_n (w3, w3, w4, 2 * n + 1);
  w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n);

  /* W1 =(W1 - W3)/3 */
  mpn_sub_n (w1, w1, w3, 2 * n + 1);
  mpn_divexact_by3 (w1, w1, 2 * n + 1);

  /*
    [1 0 0 0 0 0;
     0 1 0 0 0 0;
     1 0 1 0 0 0;
     0 1 0 1 0 0;
     1 0 1 0 1 0;
     0 0 0 0 0 1]

    pp[] prior to operations:
     |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|

    summation scheme for remaining operations:
     |______________5|n_____4|n_____3|n_____2|n______|n______|pp
     |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
				    || H w4  | L w4  |
		    || H w2  | L w2  |
	    || H w1  | L w1  |
		    ||-H w1  |-L w1 |
	     |-H w0  |-L w0 ||-H w2  |-L w2 |
  */
  cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1);
  MPN_INCR_U (pp + 3 * n + 1, n, cy);

  /* W2 -= W0<<2 */
#if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n_ip1
#if HAVE_NATIVE_mpn_sublsh2_n_ip1
  cy = mpn_sublsh2_n_ip1 (w2, w0, w0n);
#else
  cy = mpn_sublsh_n (w2, w2, w0, w0n, 2);
#endif
#else
  /* {W4,2*n+1} is now free and can be overwritten. */
  cy = mpn_lshift(w4, w0, w0n, 2);
  cy+= mpn_sub_n(w2, w2, w4, w0n);
#endif
  MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy);

  /* W4L = W4L - W2L */
  cy = mpn_sub_n (pp + n, pp + n, w2, n);
  MPN_DECR_U (w3, 2 * n + 1, cy);

  /* W3H = W3H + W2L */
  cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n);
  /* W1L + W2H */
  cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n);
  MPN_INCR_U (w1 + n, n + 1, cy);

  /* W0 = W0 + W1H */
  if (LIKELY (w0n > n))
    cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n);
  else
    cy6 = mpn_add_n (w0, w0, w1 + n, w0n);

  /*
    summation scheme for the next operation:
     |...____5|n_____4|n_____3|n_____2|n______|n______|pp
     |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__|
		     ...-w0___|-w1_w2 |
  */
  /* if(LIKELY(w0n>n)) the two operands below DO overlap! */
  cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n);

  /* embankment is a "dirty trick" to avoid carry/borrow propagation
     beyond allocated memory */
  embankment = w0[w0n - 1] - 1;
  w0[w0n - 1] = 1;
  if (LIKELY (w0n > n)) {
    if (cy4 > cy6)
      MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6);
    else
      MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4);
    MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy);
    MPN_INCR_U (w0 + n, w0n - n, cy6);
  } else {
    MPN_INCR_U (pp + 4 * n, w0n + n, cy4);
    MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6);
  }
  w0[w0n - 1] += embankment;

#undef w5
#undef w3
#undef w0
}
/* Binary-compatibility shim: mpn_divexact_by3 was a real function in
   gmp 3.0.1, but as of gmp 3.1 it is a macro over mpn_divexact_by3c.
   Keeping this definition preserves the old exported entry point for
   callers linked against the function symbol. */
mp_limb_t
__MPN (divexact_by3) (mp_ptr dst, mp_srcptr src, mp_size_t size)
{
  mp_limb_t ret;

  /* Delegate to the macro, which expands to mpn_divexact_by3c. */
  ret = mpn_divexact_by3 (dst, src, size);
  return ret;
}
/* We have {v0,2k} {v1,2k+1} {c+4k+1,r+r2-1}
	     v0       v1         {-}vinf

   vinf0 is the first limb of vinf, which is overwritten by v1

   {vm1,2k+1} {v2, 2k+1}

   ws is temporary space

   sa is the sign of vm1

   rr2 is r+r2

   We want to compute

     t1 <- (3*v0+2*vm1+v2)/6-2*vinf
     t2 <- (v1+vm1)/2

   then the result is c0+c1*t+c2*t^2+c3*t^3+c4*t^4 where

     c0 <- v0
     c1 <- v1 - t1
     c2 <- t2 - v0 - vinf
     c3 <- t1 - t2
     c4 <- vinf
*/
void
mpn_toom3_interpolate (mp_ptr c, mp_ptr v1, mp_ptr v2, mp_ptr vm1,
                       mp_ptr vinf, mp_size_t k, mp_size_t rr2, int sa,
                       mp_limb_t vinf0, mp_ptr ws)
{
  mp_limb_t cy, saved;
  mp_size_t twok = k + k;
  mp_size_t kk1 = twok + 1;
  mp_ptr c1, c2, c3, c4, c5;
  mp_limb_t cout; /* final carry, should be zero at the end */

  /* c1..c5 mark the k-limb boundaries of the result area. */
  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  c5 = c4 + k;

#define v0 (c)
  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
       v0       v1          {-}vinf

     {vm1,2k+1} {v2, 2k+1}
  */

  /* v2 <- v2 - vm1 */
  if (sa < 0)
  {
     mpn_add_n(v2, v2, vm1, kk1);
  } else
  {
     mpn_sub_n(v2, v2, vm1, kk1);
  }

  ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1));    /* v2 <- v2 / 3 */

  /* vm1 <- t2 := (v1 - sa*vm1) / 2 */
  if (sa < 0)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (vm1, v1, vm1, kk1);
#else
      mpn_add_n (vm1, vm1, v1, kk1);
      mpn_half (vm1, kk1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (vm1, v1, vm1, kk1);
#else
      mpn_sub_n (vm1, v1, vm1, kk1);
      mpn_half (vm1, kk1);
#endif
    }

  /* v1 <- v1 - v0 - vinf */

  /* temporarily graft vinf's true low limb onto c4 so {c4, rr2} is vinf */
  saved = c4[0];
  c4[0] = vinf0;
#if HAVE_NATIVE_mpn_subadd_n
  cy = mpn_subadd_n(v1, v1, v0, c4, rr2);
#else
  cy = mpn_sub_n(v1, v1, v0, rr2);
  cy += mpn_sub_n(v1, v1, c4, rr2);
#endif
  c4[0] = saved;
  if (rr2 < twok) /* finish subtracting v0 above vinf's extent */
    {
      v1[twok] -= mpn_sub_n(v1 + rr2, v1 + rr2, v0 + rr2, twok - rr2);
      MPN_DECR_U(v1 + rr2, kk1 - rr2, cy);
    }
  else
    v1[twok] -= cy;

  saved = c4[0];
  c4[0] = vinf0;
  /* subtract 5*vinf from v2, */
  cy = mpn_submul_1 (v2, c4, rr2, CNST_LIMB(5));
  MPN_DECR_U (v2 + rr2, kk1 - rr2, cy);
  c4[0] = saved;

  /* v2 = (v2 - v1)/2 (exact) */
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (v2, v2, v1, kk1);
#else
  mpn_sub_n (v2, v2, v1, kk1);
  mpn_half (v2, kk1);
#endif

  /* v1 = v1 - vm1 */
  mpn_sub_n(v1, v1, vm1, kk1);

  /* vm1 = vm1 - v2 and add vm1 in {c+k, ...} */
#if HAVE_NATIVE_mpn_addsub_n
  cy = mpn_addsub_n(c1, c1, vm1, v2, kk1);
#else
  mpn_sub_n(vm1, vm1, v2, kk1);
  cy = mpn_add_n (c1, c1, vm1, kk1);
#endif
  ASSERT_NOCARRY (mpn_add_1(c3 + 1, c3 + 1, rr2 + k - 1, cy)); /* 4k+rr2-(3k+1) = rr2+k-1 */

  /* don't forget to add vinf0 in {c+4k, ...} */
  ASSERT_NOCARRY (mpn_add_1(c4, c4, rr2, vinf0));

  /* add v2 in {c+3k, ...} */
  if (rr2 <= k + 1)
     ASSERT_NOCARRY (mpn_add_n (c3, c3, v2, k+rr2));
  else
  {
     cy = mpn_add_n (c3, c3, v2, kk1);
     if (cy) ASSERT_NOCARRY (mpn_add_1(c5 + 1, c5 + 1, rr2 - k - 1, cy)); /* 4k+rr2-(5k+1) = rr2-k-1 */
  }
#undef v0
}