/* Multiply {a, n} by {b, n} with the Toom-3 algorithm and write the 2n-limb
   product to {c, 2n}.  t is scratch space (the comments below give the size
   requirements, T(n) >= 6k+3 for the interpolation step).  The recursive
   sub-products go through TOOM3_MUL_REC, defined elsewhere in this file.  */
void
mpn_toom3_mul_n (mp_ptr c, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr t)
{
  mp_size_t k, k1, kk1, r, twok, twor;
  mp_limb_t cy, cc, saved, vinf0, cinf0;
  mp_ptr trec;
  int sa, sb;
  mp_ptr c1, c2, c3, c4, c5;

  ASSERT(GMP_NUMB_BITS >= 6);
  ASSERT(n >= 17);		/* so that r <> 0 and 5k+3 <= 2n */

  /* The algorithm is the following:

     0. k = ceil(n/3), r = n - 2k, B = 2^(GMP_NUMB_BITS), t = B^k
     1. split a and b in three parts each a0, a1, a2 and b0, b1, b2
	with a0, a1, b0, b1 of k limbs, and a2, b2 of r limbs
     2. v0   <- a0*b0
	v1   <- (a0+a1+a2)*(b0+b1+b2)
	v2   <- (a0+2*a1+4*a2)*(b0+2*b1+4*b2)
	vm1  <- (a0-a1+a2)*(b0-b1+b2)
	vinf <- a2*b2
	t1   <- (3*v0+2*vm1+v2)/6-2*vinf
	t2   <- (v1+vm1)/2
     3. result is c0+c1*t+c2*t^2+c3*t^3+c4*t^4 where
	c0 <- v0
	c1 <- v1 - t1
	c2 <- t2 - v0 - vinf
	c3 <- t1 - t2
	c4 <- vinf
  */

  k = (n + 2) / 3;		/* ceil(n/3) */
  twok = 2 * k;
  k1 = k + 1;
  kk1 = k + k1;
  r = n - twok;			/* last chunk */
  twor = 2 * r;

  /* aliases into the product area, k limbs apart */
  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  c5 = c4 + k;

  trec = t + 4 * k + 3;		/* trec = v2 + (2k+2) */

  /* put a0+a2 in {c, k+1}, and b0+b2 in {c+k+1, k+1};
     put a0+a1+a2 in {c+2k+2, k+1} and b0+b1+b2 in {c+3k+3,k+1}
     [requires 4k+4 <= 2n, ie. n >= 8] */
  cy = mpn_add_n (c, a, a + twok, r);
  cc = mpn_add_n (c1 + 1, b, b + twok, r);
  if (r < k)
    {
      /* a2/b2 are shorter than a0/b0: propagate the carries across the
	 remaining k-r limbs of a0 and b0 */
      __GMPN_ADD_1 (cy, c + r, a + r, k - r, cy);
      __GMPN_ADD_1 (cc, c1 + 1 + r, b + r, k - r, cc);
    }
  /* add a1 (resp. b1), storing the carry limbs c1[0]/c2[1] on the way */
  c3[2] = (c1[0] = cy) + mpn_add_n (c2 + 2, c, a + k, k);
  c4[3] = (c2[1] = cc) + mpn_add_n (c3 + 3, c1 + 1, b + k, k);

#define v2 (t+2*k+1)
#define vinf (t+4*k+2)

  /* compute v1 := (a0+a1+a2)*(b0+b1+b2) in {t, 2k+1};
     since v1 < 9*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 4 */
  TOOM3_MUL_REC (t, c2 + 2, c3 + 3, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
					v1
  */

  /* put |a0-a1+a2| in {c, k+1} and |b0-b1+b2| in {c+4k+2,k+1} */
  /* sa = sign(a0-a1+a2) */
  sa = (c[k] != 0) ? 1 : mpn_cmp (c, a + k, k);
  c[k] = (sa >= 0) ? c[k] - mpn_sub_n (c, c, a + k, k)
		   : mpn_sub_n (c, a + k, c, k);
  /* b0+b2 is in {c+k+1, k+1} now */
  sb = (c2[1] != 0) ? 1 : mpn_cmp (c1 + 1, b + k, k);
  c5[2] = (sb >= 0) ? c2[1] - mpn_sub_n (c4 + 2, c1 + 1, b + k, k)
		    : mpn_sub_n (c4 + 2, b + k, c1 + 1, k);
  sa *= sb;			/* sign of vm1 */

  /* compute vm1 := (a0-a1+a2)*(b0-b1+b2) in {c+2k, 2k+1};
     since |vm1| < 4*B^(2k), vm1 uses only 2k+1 limbs */
  TOOM3_MUL_REC (c2, c, c4 + 2, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
	      vm1			v1
  */

  /* compute a0+2a1+4a2 in {c, k+1} and b0+2b1+4b2 in {c+4k+2, k+1}
     [requires 5k+3 <= 2n, i.e. n >= 17] */
#ifdef HAVE_NATIVE_mpn_addlsh1_n
  /* Horner evaluation: ((a2*2 + a1)*2 + a0), one addlsh1 per level */
  c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
  c5[2] = mpn_addlsh1_n (c4 + 2, b + k, b + twok, r);
  if (r < k)
    {
      __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
      __GMPN_ADD_1 (c5[2], c4 + 2 + r, b + k + r, k - r, c5[2]);
    }
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
  c5[2] = 2 * c5[2] + mpn_addlsh1_n (c4 + 2, b, c4 + 2, k);
#else
  /* same evaluation with explicit shifts and adds */
  c[r] = mpn_lshift (c, a + twok, r, 1);
  c4[r + 2] = mpn_lshift (c4 + 2, b + twok, r, 1);
  if (r < k)
    {
      MPN_ZERO(c + r + 1, k - r);
      MPN_ZERO(c4 + r + 3, k - r);
    }
  c1[0] += mpn_add_n (c, c, a + k, k);
  c5[2] += mpn_add_n (c4 + 2, c4 + 2, b + k, k);
  mpn_lshift (c, c, k1, 1);
  mpn_lshift (c4 + 2, c4 + 2, k1, 1);
  c1[0] += mpn_add_n (c, c, a, k);
  c5[2] += mpn_add_n (c4 + 2, c4 + 2, b, k);
#endif

  /* compute v2 := (a0+2a1+4a2)*(b0+2b1+4b2) in {t+2k+1, 2k+1}
     v2 < 49*B^k so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */
  TOOM3_MUL_REC (v2, c, c4 + 2, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
	      vm1			v1	  v2
  */

  /* compute v0 := a0*b0 in {c, 2k} */
  TOOM3_MUL_REC (c, a, b, k, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0	      vm1			v1	  v2
  */

  /* now compute (3v0+2vm1+v2)/6 = [v0 + (2vm1+v2)/3]/2
     v2 <- v2+2vm1 = 3*(a0*b0+2*a0*b2+2*a1*b1+2*a1*b2+2*a2*b0+2*a2*b1+6*a2*b2),
     thus 0 <= v2 < 51*B^(2k) < 2^6*B^(2k)
     Uses temporary space {t+4k+2,2k+1}, requires T(n) >= 6k+3. */
  if (sa >= 0)
    {
#ifdef HAVE_NATIVE_mpn_addlsh1_n
      mpn_addlsh1_n (v2, v2, c2, kk1);
#else
      /* we can use vinf=t+4k+2 as workspace since it is not full yet */
      mpn_lshift (vinf, c2, kk1, 1);
      mpn_add_n (v2, v2, vinf, kk1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_sublsh1_n
      mpn_sublsh1_n (v2, v2, c2, kk1);
#else
      /* we can use vinf=t+4k+2 as workspace since it is not full yet */
      mpn_lshift (vinf, c2, kk1, 1);
      mpn_sub_n (v2, v2, vinf, kk1);
#endif
    }

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0	      vm1			v1	  v2+2vm1
  */

  /* compute vinf := a2*b2 in {t+4k+2, 2r}: first put it in {c4, 2r},
     then copy it in {t+4k+2,2r} */
  saved = c4[0];		/* c4[0] belongs to vm1's high limb; restore below */
  TOOM3_MUL_REC (c4, a + twok, b + twok, r, trec);
  cinf0 = mpn_add_n (vinf, c4, c, twor);	/* {v0,2r} + {vinf,2r} */
  vinf0 = c4[0];
  c4[0] = saved;

  toom3_interpolate (c, t, v2, c2, vinf, k, r, sa, vinf0, cinf0, vinf + twor);

#undef v2
#undef vinf
}
/* put in {c, 2n} where n = 2k+r the value of {v0,2k} (already in place)
   + B^k * [{v1, 2k+1} - {t1, 2k+1}]
   + B^(2k) * [{t2, 2k+1} - {v0+vinf, 2k}]
   + B^(3k) * [{t1, 2k+1} - {t2, 2k+1}]
   + B^(4k) * {vinf,2r} (high 2r-1 limbs already in place)
   where {t1, 2k+1} = (3*{v0,2k}+2*sa*{vm1,2k+1}+{v2,2k+1})/6-2*{vinf,2r}
	 {t2, 2k+1} = ({v1, 2k+1} + sa * {vm1, 2k+1})/2
   (sa is the sign of {vm1, 2k+1}).

   {vinf, 2r} stores the content of {v0, 2r} + {vinf, 2r}, with carry
   in cinf0.  vinf0 is the low limb of vinf.

   ws is temporary space, and should have at least 2r limbs.

   Think about:

   The evaluated point a-b+c stands a good chance of having a zero carry
   limb, a+b+c would have a 1/4 chance, and 4*a+2*b+c a 1/8 chance,
   roughly.  Perhaps this could be tested and stripped.  Doing so before
   recursing would be better than stripping at the start of
   mpn_toom3_mul_n/sqr_n, since then the recursion could be based on the
   new size.  Although in truth the kara vs toom3 crossover is never so
   exact that one limb either way makes a difference.

   A small value like 1 or 2 for the carry could perhaps also be handled
   with an add_n or addlsh1_n.  Would that be faster than an extra limb on
   a (recursed) multiply/square?
*/
static void
toom3_interpolate (mp_ptr c, mp_srcptr v1, mp_ptr v2, mp_ptr vm1,
		   mp_ptr vinf, mp_size_t k, mp_size_t r, int sa,
		   mp_limb_t vinf0, mp_limb_t cinf0, mp_ptr ws)
{
  mp_limb_t cy, saved;
  unsigned long twok = k + k;
  unsigned long kk1 = twok + 1;
  unsigned long twor = r + r;
  mp_ptr c1, c2, c3, c4, c5;
  mp_limb_t cout; /* final carry, should be zero at the end */

  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  c5 = c4 + k;

#define v0 (c)
  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0      |vm1|       hi(vinf)      v1       v2+2vm1       vinf+lo(v0) */

  ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1));    /* v2 <- v2 / 3 */
#ifdef HAVE_NATIVE_mpn_rsh1add_n
  mpn_rsh1add_n (v2, v2, v0, twok); /* v2 <- (lo(v2)+v0) / 2, exact */
  cy = v2[twok] & 1; /* add high limb of v2 divided by 2 */
  v2[twok] >>= 1;
  MPN_INCR_U (v2 + twok - 1, 2, cy << (GMP_NUMB_BITS - 1));
#else
  v2[twok] += mpn_add_n (v2, v2, v0, twok);
  mpn_rshift (v2, v2, kk1, 1);
#endif

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0      |vm1|       hi(vinf)      v1      (3v0+2vm1+v2)  vinf+lo(v0)
					       /6                        */

  /* vm1 <- t2 := (v1 + sa*vm1) / 2
     t2 = a0*b0+a0*b2+a1*b1+a2*b0+a2*b2 >= 0
     No carry comes out from {v1, kk1} +/- {vm1, kk1},
     and the division by two is exact */
  if (sa >= 0)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (vm1, v1, vm1, kk1);
#else
      mpn_add_n (vm1, vm1, v1, kk1);
      mpn_rshift (vm1, vm1, kk1, 1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (vm1, v1, vm1, kk1);
#else
      mpn_sub_n (vm1, v1, vm1, kk1);
      mpn_rshift (vm1, vm1, kk1, 1);
#endif
    }

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0      t2          hi(vinf)      v1       t1            vinf+lo(v0) */

  /* subtract 2*vinf to v2, result is
     t1 := a0*b0+a0*b2+a1*b1+a1*b2+a2*b0+a2*b1+a2*b2 >= 0 */
  saved = c4[0];	/* c4[0] currently holds hi(vinf)'s low limb */
  c4[0] = vinf0;	/* temporarily restore vinf's true low limb */
#ifdef HAVE_NATIVE_mpn_sublsh1_n
  cy = mpn_sublsh1_n (v2, v2, c4, twor);
#else
  cy = mpn_lshift (ws, c4, twor, 1);
  cy += mpn_sub_n (v2, v2, ws, twor);
#endif
  MPN_DECR_U (v2 + twor, kk1 - twor, cy);
  c4[0] = saved;

  /* subtract {t2, 2k+1} in {c+3k, 2k+1} i.e. in {t2+k, 2k+1}:
     by chunks of k limbs from right to left to avoid overlap */
#define t2 (vm1)
  /* a borrow may occur in one of the 2 following __GMPN_SUB_1 calls,
     but since the final result is nonnegative, it will be compensated
     later on */
  __GMPN_SUB_1 (cout, c5, c5, twor - k, t2[twok]);
  cy = mpn_sub_n (c4, c4, t2 + k, k);
  __GMPN_SUB_1 (cout, c5, c5, twor - k, cy);
  cy = mpn_sub_n (c3, c3, t2, k);
  __GMPN_SUB_1 (cout, c4, c4, twor, cy);

  /* don't forget to add vinf0 in {c+4k, ...} */
  __GMPN_ADD_1 (cout, c4, c4, twor, vinf0);

  /* c   c+k  c+2k  c+3k  c+4k      t   t+2k+1  t+4k+2
     v0        t2  -t2   hi(vinf)   v1  t1      vinf+lo(v0) */

  /* subtract v0+vinf in {c+2k, ...} */
  cy = cinf0 + mpn_sub_n (c2, c2, vinf, twor);
  if (twor < twok)
    {
      __GMPN_SUB_1 (cy, c2 + twor, c2 + twor, twok - twor, cy);
      cy += mpn_sub_n (c2 + twor, c2 + twor, v0 + twor, twok - twor);
    }
  __GMPN_SUB_1 (cout, c4, c4, twor, cy); /* 2n-4k = 2r */

  /* c   c+k  c+2k     c+3k  c+4k   t   t+2k+1  t+4k+2
     v0       t2-v0    -t2   vinf   v1  t1      vinf+lo(v0)
	      -vinf                                         */

  /* subtract t1 in {c+k, ...} */
  cy = mpn_sub_n (c1, c1, v2, kk1);
  __GMPN_SUB_1 (cout, c3 + 1, c3 + 1, twor + k - 1, cy); /* 2n-(3k+1)=k+2r-1 */

  /* c   c+k   c+2k     c+3k  c+4k   t   t+2k+1  t+4k+2
     v0  -t1   t2-v0    -t2   vinf   v1  t1      vinf+lo(v0)
	       -vinf                                         */

  /* add t1 in {c+3k, ...} */
  cy = mpn_add_n (c3, c3, v2, kk1);
  __GMPN_ADD_1 (cout, c5 + 1, c5 + 1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */

  /* c   c+k   c+2k     c+3k   c+4k   t   t+2k+1  t+4k+2
     v0  -t1   t2-v0    t1-t2  vinf   v1  t1      vinf+lo(v0)
	       -vinf                                          */

  /* add v1 in {c+k, ...} */
  cy = mpn_add_n (c1, c1, v1, kk1);
  __GMPN_ADD_1 (cout, c3 + 1, c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */

  /* c   c+k     c+2k     c+3k   c+4k   t   t+2k+1  t+4k+2
     v0  v1-t1   t2-v0    t1-t2  vinf   v1  t1      vinf+lo(v0)
		 -vinf                                          */
#undef v0
#undef t2
}
/* Interpolate the five Toom-3 evaluation products laid out as
   v0 = {c,2k}, v1 = {c+2k,2k+1}, hi(vinf) = {c+4k+1,2r-1}, |vm1| and v2
   in the caller's scratch, and recompose the final number into {c, 2n}
   (n = 2k+r).  sa is the sign of vm1, vinf0 the low limb of vinf, and
   ws is scratch of at least 2r limbs.
   NOTE(review): this file contains a second, conflicting definition of
   mpn_toom_interpolate_5pts (different signature, no ws argument) —
   apparently two revisions concatenated; only one can be compiled.  */
void
mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1,
			   mp_size_t k, mp_size_t twor, int sa,
			   mp_limb_t vinf0, mp_ptr ws)
{
  mp_limb_t cy, saved;
  mp_size_t twok = k + k;
  mp_size_t kk1 = twok + 1;
  mp_ptr c1, v1, c3, vinf, c5;
  mp_limb_t cout; /* final carry, should be zero at the end */

  c1 = c + k;
  v1 = c1 + k;
  c3 = v1 + k;
  vinf = c3 + k;
  c5 = vinf + k;

#define v0 (c)
  /* (1) v2 <- v2-vm1 < v2+|vm1|,		(16 8 4 2 1) - (1 -1 1 -1 1) =
     thus 0 <= v2 < 50*B^(2k) < 2^6*B^(2k)	(15 9 3 3 0)
  */
  if (sa <= 0)
    mpn_add_n (v2, v2, vm1, kk1);
  else
    mpn_sub_n (v2, v2, vm1, kk1);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0      v1          hi(vinf)      |vm1|    v2-vm1        EMPTY */

  ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1));    /* v2 <- v2 / 3 */
						      /* (5 3 1 1 0)*/

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0      v1          hi(vinf)      |vm1|    (v2-vm1)/3    EMPTY */

  /* (2) vm1 <- tm1 := (v1 - sa*vm1) / 2  [(1 1 1 1 1) - (1 -1 1 -1 1)] / 2 =
     tm1 >= 0					(0  1 0  1 0)
     No carry comes out from {v1, kk1} +/- {vm1, kk1},
     and the division by two is exact */
  if (sa <= 0)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (vm1, v1, vm1, kk1);
#else
      mpn_add_n (vm1, v1, vm1, kk1);
      mpn_rshift (vm1, vm1, kk1, 1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (vm1, v1, vm1, kk1);
#else
      mpn_sub_n (vm1, v1, vm1, kk1);
      mpn_rshift (vm1, vm1, kk1, 1);
#endif
    }

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0      v1          hi(vinf)      tm1      (v2-vm1)/3    EMPTY */

  /* (3) v1 <- t1 := v1 - v0	(1 1 1 1 1) - (0 0 0 0 1) = (1 1 1 1 0)
     t1 >= 0
  */
  vinf[0] -= mpn_sub_n (v1, v1, c, twok);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0      v1-v0       hi(vinf)      tm1      (v2-vm1)/3    EMPTY */

  /* (4) v2 <- t2 := ((v2-vm1)/3-t1)/2 = (v2-vm1-3*t1)/6
     t2 >= 0			[(5 3 1 1 0) - (1 1 1 1 0)]/2 = (2 1 0 0 0)
  */
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (v2, v2, v1, kk1);
#else
  mpn_sub_n (v2, v2, v1, kk1);
  mpn_rshift (v2, v2, kk1, 1);
#endif

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0      v1-v0       hi(vinf)      tm1    (v2-vm1-3t1)/6  EMPTY */

  /* (5) v1 <- t1-tm1		(1 1 1 1 0) - (0 1 0 1 0) = (1 0 1 0 0)
     result is v1 >= 0
  */
  mpn_sub_n (v1, v1, vm1, kk1);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0      v1-v0-tm1   hi(vinf)      tm1    (v2-vm1-3t1)/6  EMPTY */

  /* (6) v2 <- v2 - 2*vinf,	(2 1 0 0 0) - 2*(1 0 0 0 0) = (0 1 0 0 0)
     result is v2 >= 0 */
  saved = vinf[0];	/* Remember v1's highest limb (will be overwritten). */
  vinf[0] = vinf0;	/* Set the right value for vinf0 */
#ifdef HAVE_NATIVE_mpn_sublsh1_n
  cy = mpn_sublsh1_n (v2, v2, vinf, twor);
#else
  cy = mpn_lshift (ws, vinf, twor, 1);
  cy += mpn_sub_n (v2, v2, ws, twor);
#endif
  MPN_DECR_U (v2 + twor, kk1 - twor, cy);

  /* (7) v1 <- v1 - vinf,	(1 0 1 0 0) - (1 0 0 0 0) = (0 0 1 0 0)
     result is >= 0 */
  cy = mpn_sub_n (v1, v1, vinf, twor);	/* vinf is at most twor long. */
  vinf[0] = saved;
  MPN_DECR_U (v1 + twor, kk1 - twor, cy); /* Treat the last limbs. */
  __GMPN_ADD_1 (cout, vinf, vinf, twor, vinf0); /* Add vinf0, propagate carry. */

  /* (8) vm1 <- vm1-t2		(0 1 0 1 0) - (0 1 0 0 0) = (0 0 0 1 0)
     vm1 >= 0
  */
  mpn_sub_n (vm1, vm1, v2, kk1); /* No overlapping here. */

  /********************* Beginning the final phase **********************/

  /* {c,2k} {c+2k,2k } {c+4k ,2r } {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0      t1         hi(t1)+vinf  tm1    (v2-vm1-3t1)/6  EMPTY */

  /* (9) add t2 in {c+3k, ...} */
  cy = mpn_add_n (c3, c3, v2, kk1);
  __GMPN_ADD_1 (cout, c5 + 1, c5 + 1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */

  /* c   c+k  c+2k  c+3k  c+4k      t   t+2k+1  t+4k+2
     v0       t1    +t2   vinf      tm1  t2             */

  /* add vm1 in {c+k, ...} */
  cy = mpn_add_n (c1, c1, vm1, kk1);
  __GMPN_ADD_1 (cout, c3 + 1, c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */

  /* c   c+k   c+2k  c+3k  c+4k      t   t+2k+1  t+4k+2
     v0  +tm1  t1    +t2   vinf      tm1  t2             */

#undef v0
/* NOTE(review): t2 is never #defined in this function; this #undef is a
   harmless leftover from an earlier revision. */
#undef t2
}
/* Interpolate the five Toom-3 evaluation products v0 = {c,2k},
   v1 = {c+2k,2k+1}, hi(vinf) = {c+4k+1,2r-1}, |vm1| and v2, and
   recompose the final number into {c, 2n} (n = 2k+r).  sa is nonzero
   iff vm1 is negative; vinf0 is the low limb of vinf.  This revision
   needs no external scratch: it recycles vm1's area once vm1 is consumed.
   NOTE(review): the file also contains an older definition of this
   function with a different signature (extra ws argument); only one of
   the two can be compiled.  */
void
mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1,
			   mp_size_t k, mp_size_t twor, int sa,
			   mp_limb_t vinf0)
{
  mp_limb_t cy, saved;
  mp_size_t twok;
  mp_size_t kk1;
  mp_ptr c1, v1, c3, vinf;

  twok = k + k;
  kk1 = twok + 1;

  c1 = c + k;
  v1 = c1 + k;
  c3 = v1 + k;
  vinf = c3 + k;

#define v0 (c)
  /* (1) v2 <- v2-vm1 < v2+|vm1|,		(16 8 4 2 1) - (1 -1 1 -1 1) =
     thus 0 <= v2 < 50*B^(2k) < 2^6*B^(2k)	(15 9 3 3 0)
  */
  if (sa)
    ASSERT_NOCARRY (mpn_add_n (v2, v2, vm1, kk1));
  else
    ASSERT_NOCARRY (mpn_sub_n (v2, v2, vm1, kk1));

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0      v1          hi(vinf)      |vm1|    v2-vm1        EMPTY */

  ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1));    /* v2 <- v2 / 3 */
						      /* (5 3 1 1 0)*/

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0      v1          hi(vinf)      |vm1|    (v2-vm1)/3    EMPTY */

  /* (2) vm1 <- tm1 := (v1 - vm1) / 2  [(1 1 1 1 1) - (1 -1 1 -1 1)] / 2 =
     tm1 >= 0					(0  1 0  1 0)
     No carry comes out from {v1, kk1} +/- {vm1, kk1},
     and the division by two is exact.
     If (sa!=0) the sign of vm1 is negative */
  if (sa)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (vm1, v1, vm1, kk1);
#else
      ASSERT_NOCARRY (mpn_add_n (vm1, v1, vm1, kk1));
      ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (vm1, v1, vm1, kk1);
#else
      ASSERT_NOCARRY (mpn_sub_n (vm1, v1, vm1, kk1));
      ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
#endif
    }

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0      v1          hi(vinf)      tm1      (v2-vm1)/3    EMPTY */

  /* (3) v1 <- t1 := v1 - v0	(1 1 1 1 1) - (0 0 0 0 1) = (1 1 1 1 0)
     t1 >= 0
  */
  vinf[0] -= mpn_sub_n (v1, v1, c, twok);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0      v1-v0       hi(vinf)      tm1      (v2-vm1)/3    EMPTY */

  /* (4) v2 <- t2 := ((v2-vm1)/3-t1)/2 = (v2-vm1-3*t1)/6
     t2 >= 0			[(5 3 1 1 0) - (1 1 1 1 0)]/2 = (2 1 0 0 0)
  */
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (v2, v2, v1, kk1);
#else
  ASSERT_NOCARRY (mpn_sub_n (v2, v2, v1, kk1));
  ASSERT_NOCARRY (mpn_rshift (v2, v2, kk1, 1));
#endif

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
     v0      v1-v0       hi(vinf)      tm1    (v2-vm1-3t1)/6  EMPTY */

  /* (5) v1 <- t1-tm1		(1 1 1 1 0) - (0 1 0 1 0) = (1 0 1 0 0)
     result is v1 >= 0
  */
  ASSERT_NOCARRY (mpn_sub_n (v1, v1, vm1, kk1));

  /* We do not need to read the value in vm1, so we add it in {c+k, ...} */
  cy = mpn_add_n (c1, c1, vm1, kk1);
  MPN_INCR_U (c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */
  /* Memory allocated for vm1 is now free, it can be recycled ...*/

  /* (6) v2 <- v2 - 2*vinf,	(2 1 0 0 0) - 2*(1 0 0 0 0) = (0 1 0 0 0)
     result is v2 >= 0 */
  saved = vinf[0];	/* Remember v1's highest limb (will be overwritten). */
  vinf[0] = vinf0;	/* Set the right value for vinf0 */
#ifdef HAVE_NATIVE_mpn_sublsh1_n
  cy = mpn_sublsh1_n (v2, v2, vinf, twor);
#else
  /* Overwrite unused vm1 */
  cy = mpn_lshift (vm1, vinf, twor, 1);
  cy += mpn_sub_n (v2, v2, vm1, twor);
#endif
  MPN_DECR_U (v2 + twor, kk1 - twor, cy);

  /* Current matrix is
     [1 0 0 0 0; vinf
      0 1 0 0 0; v2
      1 0 1 0 0; v1
      0 1 0 1 0; vm1
      0 0 0 0 1] v0
     Some values already are in-place (we added vm1 in the correct position)
      | vinf|  v1 |  v0 |
	      | vm1 |
     One still is in a separated area
	| +v2 |
     We have to compute v1-=vinf; vm1 -= v2,
	  |-vinf|
	      | -v2 |
     Carefully reordering operations we can avoid to compute twice the sum
     of the high half of v2 plus the low half of vinf.
  */

  /* Add the high half of t2 in {vinf} */
  if ( LIKELY(twor > k + 1) ) /* This is the expected flow  */
    {
      cy = mpn_add_n (vinf, vinf, v2 + k, k + 1);
      MPN_INCR_U (c3 + kk1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */
    }
  else /* triggered only by very unbalanced cases like
	  (k+k+(k-2))x(k+k+1) , should be handled by toom32 */
    {
      ASSERT_NOCARRY (mpn_add_n (vinf, vinf, v2 + k, twor));
    }
  /* (7) v1 <- v1 - vinf,	(1 0 1 0 0) - (1 0 0 0 0) = (0 0 1 0 0)
     result is >= 0 */
  /* Side effect: we also subtracted (high half) vm1 -= v2 */
  cy = mpn_sub_n (v1, v1, vinf, twor);	/* vinf is at most twor long. */
  vinf0 = vinf[0];	/* Save again the right value for vinf0 */
  vinf[0] = saved;
  MPN_DECR_U (v1 + twor, kk1 - twor, cy); /* Treat the last limbs. */

  /* (8) vm1 <- vm1-v2		(0 1 0 1 0) - (0 1 0 0 0) = (0 0 0 1 0)
     Operate only on the low half.
  */
  cy = mpn_sub_n (c1, c1, v2, k);
  MPN_DECR_U (v1, kk1, cy);

  /********************* Beginning the final phase **********************/

  /* Most of the recomposition was done */

  /* add t2 in {c+3k, ...}, but only the low half */
  cy = mpn_add_n (c3, c3, v2, k);
  vinf[0] += cy;
  ASSERT(vinf[0] >= cy); /* No carry */
  MPN_INCR_U (vinf, twor, vinf0); /* Add vinf0, propagate carry. */

#undef v0
}
void check (void) { mp_limb_t wp[100], xp[100], yp[100]; mp_size_t size = 100; refmpn_zero (xp, size); refmpn_zero (yp, size); refmpn_zero (wp, size); pre ("mpn_add_n"); mpn_add_n (wp, xp, yp, size); post (); #if HAVE_NATIVE_mpn_add_nc pre ("mpn_add_nc"); mpn_add_nc (wp, xp, yp, size, CNST_LIMB(0)); post (); #endif #if HAVE_NATIVE_mpn_addlsh1_n pre ("mpn_addlsh1_n"); mpn_addlsh1_n (wp, xp, yp, size); post (); #endif #if HAVE_NATIVE_mpn_and_n pre ("mpn_and_n"); mpn_and_n (wp, xp, yp, size); post (); #endif #if HAVE_NATIVE_mpn_andn_n pre ("mpn_andn_n"); mpn_andn_n (wp, xp, yp, size); post (); #endif pre ("mpn_addmul_1"); mpn_addmul_1 (wp, xp, size, yp[0]); post (); #if HAVE_NATIVE_mpn_addmul_1c pre ("mpn_addmul_1c"); mpn_addmul_1c (wp, xp, size, yp[0], CNST_LIMB(0)); post (); #endif #if HAVE_NATIVE_mpn_com_n pre ("mpn_com_n"); mpn_com_n (wp, xp, size); post (); #endif #if HAVE_NATIVE_mpn_copyd pre ("mpn_copyd"); mpn_copyd (wp, xp, size); post (); #endif #if HAVE_NATIVE_mpn_copyi pre ("mpn_copyi"); mpn_copyi (wp, xp, size); post (); #endif pre ("mpn_divexact_1"); mpn_divexact_1 (wp, xp, size, CNST_LIMB(123)); post (); pre ("mpn_divexact_by3c"); mpn_divexact_by3c (wp, xp, size, CNST_LIMB(0)); post (); pre ("mpn_divrem_1"); mpn_divrem_1 (wp, (mp_size_t) 0, xp, size, CNST_LIMB(123)); post (); #if HAVE_NATIVE_mpn_divrem_1c pre ("mpn_divrem_1c"); mpn_divrem_1c (wp, (mp_size_t) 0, xp, size, CNST_LIMB(123), CNST_LIMB(122)); post (); #endif pre ("mpn_gcd_1"); xp[0] |= 1; notdead += (unsigned long) mpn_gcd_1 (xp, size, CNST_LIMB(123)); post (); #if HAVE_NATIVE_mpn_gcd_finda pre ("mpn_gcd_finda"); xp[0] |= 1; xp[1] |= 1; notdead += mpn_gcd_finda (xp); post (); #endif pre ("mpn_hamdist"); notdead += mpn_hamdist (xp, yp, size); post (); #if HAVE_NATIVE_mpn_ior_n pre ("mpn_ior_n"); mpn_ior_n (wp, xp, yp, size); post (); #endif #if HAVE_NATIVE_mpn_iorn_n pre ("mpn_iorn_n"); mpn_iorn_n (wp, xp, yp, size); post (); #endif pre ("mpn_lshift"); mpn_lshift (wp, xp, size, 1); post (); pre 
("mpn_mod_1"); notdead += mpn_mod_1 (xp, size, CNST_LIMB(123)); post (); #if HAVE_NATIVE_mpn_mod_1c pre ("mpn_mod_1c"); notdead += mpn_mod_1c (xp, size, CNST_LIMB(123), CNST_LIMB(122)); post (); #endif #if GMP_NUMB_BITS % 4 == 0 pre ("mpn_mod_34lsub1"); notdead += mpn_mod_34lsub1 (xp, size); post (); #endif pre ("mpn_modexact_1_odd"); notdead += mpn_modexact_1_odd (xp, size, CNST_LIMB(123)); post (); pre ("mpn_modexact_1c_odd"); notdead += mpn_modexact_1c_odd (xp, size, CNST_LIMB(123), CNST_LIMB(456)); post (); pre ("mpn_mul_1"); mpn_mul_1 (wp, xp, size, yp[0]); post (); #if HAVE_NATIVE_mpn_mul_1c pre ("mpn_mul_1c"); mpn_mul_1c (wp, xp, size, yp[0], CNST_LIMB(0)); post (); #endif #if HAVE_NATIVE_mpn_mul_2 pre ("mpn_mul_2"); mpn_mul_2 (wp, xp, size-1, yp); post (); #endif pre ("mpn_mul_basecase"); mpn_mul_basecase (wp, xp, (mp_size_t) 3, yp, (mp_size_t) 3); post (); #if HAVE_NATIVE_mpn_nand_n pre ("mpn_nand_n"); mpn_nand_n (wp, xp, yp, size); post (); #endif #if HAVE_NATIVE_mpn_nior_n pre ("mpn_nior_n"); mpn_nior_n (wp, xp, yp, size); post (); #endif pre ("mpn_popcount"); notdead += mpn_popcount (xp, size); post (); pre ("mpn_preinv_mod_1"); notdead += mpn_preinv_mod_1 (xp, size, GMP_NUMB_MAX, refmpn_invert_limb (GMP_NUMB_MAX)); post (); #if USE_PREINV_DIVREM_1 || HAVE_NATIVE_mpn_preinv_divrem_1 pre ("mpn_preinv_divrem_1"); mpn_preinv_divrem_1 (wp, (mp_size_t) 0, xp, size, GMP_NUMB_MAX, refmpn_invert_limb (GMP_NUMB_MAX), 0); post (); #endif #if HAVE_NATIVE_mpn_rsh1add_n pre ("mpn_rsh1add_n"); mpn_rsh1add_n (wp, xp, yp, size); post (); #endif #if HAVE_NATIVE_mpn_rsh1sub_n pre ("mpn_rsh1sub_n"); mpn_rsh1sub_n (wp, xp, yp, size); post (); #endif pre ("mpn_rshift"); mpn_rshift (wp, xp, size, 1); post (); pre ("mpn_sqr_basecase"); mpn_sqr_basecase (wp, xp, (mp_size_t) 3); post (); pre ("mpn_submul_1"); mpn_submul_1 (wp, xp, size, yp[0]); post (); #if HAVE_NATIVE_mpn_submul_1c pre ("mpn_submul_1c"); mpn_submul_1c (wp, xp, size, yp[0], CNST_LIMB(0)); post (); #endif pre 
("mpn_sub_n"); mpn_sub_n (wp, xp, yp, size); post (); #if HAVE_NATIVE_mpn_sub_nc pre ("mpn_sub_nc"); mpn_sub_nc (wp, xp, yp, size, CNST_LIMB(0)); post (); #endif #if HAVE_NATIVE_mpn_sublsh1_n pre ("mpn_sublsh1_n"); mpn_sublsh1_n (wp, xp, yp, size); post (); #endif #if HAVE_NATIVE_mpn_udiv_qrnnd pre ("mpn_udiv_qrnnd"); mpn_udiv_qrnnd (&wp[0], CNST_LIMB(122), xp[0], CNST_LIMB(123)); post (); #endif #if HAVE_NATIVE_mpn_udiv_qrnnd_r pre ("mpn_udiv_qrnnd_r"); mpn_udiv_qrnnd (CNST_LIMB(122), xp[0], CNST_LIMB(123), &wp[0]); post (); #endif #if HAVE_NATIVE_mpn_umul_ppmm pre ("mpn_umul_ppmm"); mpn_umul_ppmm (&wp[0], xp[0], yp[0]); post (); #endif #if HAVE_NATIVE_mpn_umul_ppmm_r pre ("mpn_umul_ppmm_r"); mpn_umul_ppmm_r (&wp[0], xp[0], yp[0]); post (); #endif #if HAVE_NATIVE_mpn_xor_n pre ("mpn_xor_n"); mpn_xor_n (wp, xp, yp, size); post (); #endif #if HAVE_NATIVE_mpn_xnor_n pre ("mpn_xnor_n"); mpn_xnor_n (wp, xp, yp, size); post (); #endif }