/*
   Toom 4 interpolation. Interpolates the value at 2^(sn*B) of a
   polynomial p(x) with 7 coefficients given the values
   p(oo), p(2), p(1), p(-1), 2^6*p(1/2), 2^6*p(-1/2), p(0).
   The output is placed in rp and the final number of limbs of the
   output is given in rpn.
   The 4th and 6th values may be negative, and if so, n4 and n6
   should be set to a negative value respectively.
   To save space we pass r3, r5, r7 in place in the output rp.
   The other r's are stored separately in space tp.
   The low limb of r3 is stored in r30, as it will be overwritten
   by the high limb of r5.

   rp          rp1          rp2           rp3          rp4           rp5         rp6           rp7
   <----------- r7-----------><------------r5-------------->
                                                           <-------------r3------------->

   We assume that r1 is stored at tp, r2 at (tp + t4), r4 at (tp + 2*t4)
   and r6 (tp + 3*t4). Each of these r's has t4 = s4 + 1 limbs allocated.

   Parameters:
     rp   output area; also holds r7, r5 and r3 in place on entry.
     rpn  out: number of limbs of the result written to rp.
     sn   limb stride between consecutive coefficients in rp.
     tp   scratch area holding r1, r2, r4 and r6.
     s4   number of limbs each r value is processed with.
     n4   negative if the 4th value (r4) is negative.
     n6   negative if the 6th value (r6) is negative.
     r30  the true low limb of r3 (see above).

   NOTE(review): r1..r7 are not declared in this function; presumably they
   are macros defined elsewhere in the file in terms of rp, tp, sn and t4,
   as the layout described above suggests. Confirm before renaming any of
   those locals.
*/
void mpn_toom4_interpolate(mp_ptr rp, mp_size_t * rpn, mp_size_t sn,
                           mp_ptr tp, mp_size_t s4, mp_size_t n4, mp_size_t n6, mp_limb_t r30)
{
   /* NOTE(review): n3, n5 and n7 are not referenced by any statement
      visible in this function. */
   mp_size_t n1, n2, n3, n5, n7, t4;
   mp_limb_t saved, saved2, cy;

   /* Limbs allocated per r value in tp (see header comment). */
   t4 = s4 + 1;

   /* r2 += r5 */
   mpn_add_n(r2, r2, r5, s4);

   /* r6 = r5 - (signed r6): add when r6 was flagged negative. */
   if (n6 < 0)
      mpn_add_n(r6, r5, r6, s4);
   else
      mpn_sub_n(r6, r5, r6, s4);
   /* r6 is now in twos complement format */

   /* r3[0] currently holds a limb shared with r5 (see header comment), so
      the real low limb r30 is swapped in for the duration of each
      operation that reads or writes r3, and the shared limb is put back
      afterwards. This pattern recurs below. */
   saved = r3[0];
   r3[0] = r30;
   /* r4 = r3 - (signed r4) */
   if (n4 < 0)
      mpn_add_n(r4, r3, r4, s4);
   else
      mpn_sub_n(r4, r3, r4, s4);
   r3[0] = saved;
   /* r4 is now in twos complement format */

   /* r5 -= r1 */
   mpn_sub_n(r5, r5, r1, s4);

   /* r5 -= 64*r7, computed on the low s4-1 limbs with the returned borrow
      taken from the top limb of r5. */
#if HAVE_NATIVE_mpn_sublsh_n
   r5[s4-1] -= mpn_sublsh_n(r5, r5, r7, s4-1, 6);
#else
   r5[s4-1] -= mpn_submul_1(r5, r7, s4-1, 64);
#endif

   /* Presumably halves r4 in place (macro defined elsewhere; confirm). */
   TC4_RSHIFT1(r4, s4);

   /* r3 -= r4, with the true low limb swapped in; the updated low limb is
      saved back into r30. */
   saved = r3[0];
   r3[0] = r30;
   mpn_sub_n(r3, r3, r4, s4);
   r30 = r3[0];
   r3[0] = saved;

   /* Presumably r5 *= 2 (helper defined elsewhere; confirm). */
   mpn_double(r5, s4);

   /* r5 -= r6 */
   mpn_sub_n(r5, r5, r6, s4);

   /* r2 -= 65*r3 (r3 is only read here, so r30 is unchanged). */
   saved = r3[0];
   r3[0] = r30;
   mpn_submul_1(r2, r3, s4, 65);
   r3[0] = saved;

   /* Temporarily clear limb s4-1 of r7 so that r7 can be used as an
      s4-limb operand; its previous content is restored below. */
   saved2 = r7[s4-1];
   r7[s4-1] = CNST_LIMB(0); // r7 is always positive so no sign extend needed
   saved = r3[0];
   r3[0] = r30;
   /* r3 = r3 - r7 - r1 */
#if HAVE_NATIVE_mpn_subadd_n
   mpn_subadd_n(r3, r3, r7, r1, s4);
#else
   mpn_sub_n(r3, r3, r7, s4);
   mpn_sub_n(r3, r3, r1, s4);
#endif
   r7[s4-1] = saved2;
   r30 = r3[0];

   /* r2 += 45*r3 (r3[0] still holds the true low limb at this point). */
   mpn_addmul_1(r2, r3, s4, 45);

   /* r5 -= 8*r3 on the low s4-1 limbs. The remaining top-limb
      contribution (borrow plus 8*r3[s4-1]) is applied through r3[0] once
      the shared limb has been put back there. */
#if HAVE_NATIVE_mpn_sublsh_n
   cy = mpn_sublsh_n(r5, r5, r3, s4 - 1, 3);
#else
   cy = mpn_submul_1(r5, r3, s4 - 1, 8);
#endif
   r3[0] = saved;
   r3[0] -= (cy + 8*r3[s4-1]);

   /* r5 = (r5 >> 3) / 3 */
   mpn_rshift(r5, r5, s4, 3);

   mpn_divexact_by3(r5, r5, s4);

   /* r6 -= r2 */
   mpn_sub_n(r6, r6, r2, s4);

   /* r2 -= 16*r4 */
#if HAVE_NATIVE_mpn_sublsh_n
   mpn_sublsh_n(r2, r2, r4, s4, 4);
#else
   mpn_submul_1(r2, r4, s4, 16);
#endif

   /* r2 = (r2 >> 1) / 9, done as two exact divisions by 3. */
   mpn_rshift(r2, r2, s4, 1);

   mpn_divexact_by3(r2, r2, s4);

   mpn_divexact_by3(r2, r2, s4);

   /* r3 -= r5 on the low s4-1 limbs (true low limb swapped in), then the
      top limb of r3 is adjusted by the borrow and the top limb of r5. */
   saved = r3[0];
   r3[0] = r30;
   cy = mpn_sub_n(r3, r3, r5, s4 - 1);
   r30 = r3[0];
   r3[0] = saved;
   r3[s4-1] -= (cy + r5[s4-1]);

   /* r4 -= r2 */
   mpn_sub_n(r4, r4, r2, s4);

   /* r6 += 30*r2 */
   mpn_addmul_1(r6, r2, s4, 30);

   /* Presumably an exact division of r6 by 15 (helper defined elsewhere).
      NOTE(review): whether `~0/15` yields (B-1)/15 depends on how
      CNST_LIMB expands its argument; confirm against its definition. */
   mpn_divexact_byfobm1(r6, r6, s4, CNST_LIMB(15), CNST_LIMB(~0/15));

   /* r6 >>= 2 */
   mpn_rshift(r6, r6, s4, 2);

   /* r2 -= r6 */
   mpn_sub_n(r2, r2, r6, s4);

   /* Presumably sets n1 and n2 to the normalised sizes of r1 and r2
      (macro defined elsewhere; confirm). */
   TC4_NORM(r1, n1, s4);
   TC4_NORM(r2, n2, s4);

   /* Initial result length; r3 is then completed by adding its true low
      limb over the 2*sn+1 limbs from r3 up to the current end of the
      result. */
   (*rpn) = 6*sn+1;
   cy = mpn_add_1(r3, r3, *rpn - 4*sn, r30); /* don't forget to add r3[0] back in */
   if (cy)
   {
      /* Carry out of the top: extend the result by one limb. */
      rp[*rpn] = cy;
      (*rpn)++;
   }

   /* Presumably combines each remaining coefficient into rp at the given
      limb offset, updating *rpn as needed (helper defined elsewhere;
      confirm). */
   tc4_copy(rp, rpn, 5*sn, r2, n2);
   tc4_copy(rp, rpn, 6*sn, r1, n1);

   tc4_copy(rp, rpn, sn, r6, s4);
   tc4_copy(rp, rpn, 3*sn, r4, s4);
}
/* Interpolation and recomposition step for six evaluation values
   W0..W5, performed in place.

   pp     result area. On entry it already holds W5 at pp (2n limbs),
          W3 at pp + 2n (2n+1 limbs) and W0 at pp + 5n (w0n limbs).
   n      limb size of one piece; must be > 0.
   flags  toom6_vm2_neg: W2 is added to W1 instead of subtracted.
          toom6_vm1_neg: W4 is added to W3 instead of subtracted.
   w4, w2, w1
          the remaining values, each accessed as 2n+1 limbs. All three
          are modified; w4 is additionally reused as scratch when no
          native sublsh routine is available.
   w0n    number of limbs of W0; must satisfy 0 < w0n <= 2n.  */
void
mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags,
			   mp_ptr w4, mp_ptr w2, mp_ptr w1,
			   mp_size_t w0n)
{
  mp_limb_t cy;
  /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */
  mp_limb_t cy4, cy6, embankment;

  ASSERT( n > 0 );
  ASSERT( 2*n >= w0n && w0n > 0 );

  /* Values that live inside the result area; the trailing comment on each
     line is its size in limbs.  */
#define w5  pp					/* 2n   */
#define w3  (pp + 2 * n)			/* 2n+1 */
#define w0  (pp + 5 * n)			/* w0n  */

  /* Interpolate with sequence:
     W2 =(W1 - W2)>>2
     W1 =(W1 - W5)>>1
     W1 =(W1 - W2)>>1
     W4 =(W3 - W4)>>1
     W2 =(W2 - W4)/3
     W3 = W3 - W4 - W5
     W1 =(W1 - W3)/3
     // Last steps are mixed with recomposition...
     W2 = W2 - W0<<2
     W4 = W4 - W2
     W3 = W3 - W1
     W2 = W2 - W0
  */

  /* W2 =(W1 - W2)>>2 */
  /* When the flag marks W2 as negative, subtracting it becomes an
     addition of its magnitude.  */
  if (flags & toom6_vm2_neg)
    mpn_add_n (w2, w1, w2, 2 * n + 1);
  else
    mpn_sub_n (w2, w1, w2, 2 * n + 1);
  mpn_rshift (w2, w2, 2 * n + 1, 2);

  /* W1 =(W1 - W5)>>1 */
  /* W5 has only 2n limbs, so the borrow is applied to the top limb of W1
     by hand.  */
  w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n);
  mpn_rshift (w1, w1, 2 * n + 1, 1);

  /* W1 =(W1 - W2)>>1 */
#if HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1);
#else
  mpn_sub_n (w1, w1, w2, 2 * n + 1);
  mpn_rshift (w1, w1, 2 * n + 1, 1);
#endif

  /* W4 =(W3 - W4)>>1 */
  /* Same sign handling as for W2 above, driven by toom6_vm1_neg.  */
  if (flags & toom6_vm1_neg)
    {
#if HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (w4, w3, w4, 2 * n + 1);
#else
      mpn_add_n (w4, w3, w4, 2 * n + 1);
      mpn_rshift (w4, w4, 2 * n + 1, 1);
#endif
    }
  else
    {
#if HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1);
#else
      mpn_sub_n (w4, w3, w4, 2 * n + 1);
      mpn_rshift (w4, w4, 2 * n + 1, 1);
#endif
    }

  /* W2 =(W2 - W4)/3 */
  mpn_sub_n (w2, w2, w4, 2 * n + 1);
  mpn_divexact_by3 (w2, w2, 2 * n + 1);

  /* W3 = W3 - W4 - W5 */
  mpn_sub_n (w3, w3, w4, 2 * n + 1);
  w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n);

  /* W1 =(W1 - W3)/3 */
  mpn_sub_n (w1, w1, w3, 2 * n + 1);
  mpn_divexact_by3 (w1, w1, 2 * n + 1);

  /*
    [1 0 0 0 0 0;
     0 1 0 0 0 0;
     1 0 1 0 0 0;
     0 1 0 1 0 0;
     1 0 1 0 1 0;
     0 0 0 0 0 1]

    pp[] prior to operations:
     |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|

    summation scheme for remaining operations:
     |______________5|n_____4|n_____3|n_____2|n______|n______|pp
     |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
                                    || H w4  | L w4  |
                    || H w2  | L w2  |
            || H w1  | L w1  |
                            ||-H w1  |-L w1  |
                     |-H w0  |-L w0 ||-H w2  |-L w2  |
  */
  /* Add W4 into the result at offset n and propagate the carry above
     it.  */
  cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1);
  MPN_INCR_U (pp + 3 * n + 1, n, cy);

  /* W2 -= W0<<2 */
#if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n_ip1
#if HAVE_NATIVE_mpn_sublsh2_n_ip1
  cy = mpn_sublsh2_n_ip1 (w2, w0, w0n);
#else
  cy = mpn_sublsh_n (w2, w2, w0, w0n, 2);
#endif
#else
  /* {W4,2*n+1} is now free and can be overwritten. */
  cy = mpn_lshift(w4, w0, w0n, 2);
  cy+= mpn_sub_n(w2, w2, w4, w0n);
#endif
  /* Propagate the borrow through the limbs of W2 above W0's length.  */
  MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy);

  /* W4L = W4L - W2L */
  cy = mpn_sub_n (pp + n, pp + n, w2, n);
  MPN_DECR_U (w3, 2 * n + 1, cy);

  /* W3H = W3H + W2L */
  /* cy4 collects the top limb of W3 plus the carry out of this
     addition; it is applied further down.  */
  cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n);
  /* W1L + W2H */
  /* Note that this store at pp + 4n overwrites w3[2 * n], which was read
     into cy4 just above.  */
  cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n);
  MPN_INCR_U (w1 + n, n + 1, cy);

  /* W0 = W0 + W1H */
  /* cy6 is the carry still owed above the limbs added here; when W0 is
     longer than n limbs it also includes the top limb of W1.  */
  if (LIKELY (w0n > n))
    cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n);
  else
    cy6 = mpn_add_n (w0, w0, w1 + n, w0n);

  /*
    summation scheme for the next operation:
     |...____5|n_____4|n_____3|n_____2|n______|n______|pp
     |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__|
                     ...-w0___|-w1_w2 |
  */
  /* if(LIKELY(w0n>n)) the two operands below DO overlap! */
  cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n);

  /* embankment is a "dirty trick" to avoid carry/borrow propagation
     beyond allocated memory */
  /* The top limb of W0 is saved (minus one) and forced to 1 while the
     pending carries and borrows are applied, then added back at the
     end.  */
  embankment = w0[w0n - 1] - 1;
  w0[w0n - 1] = 1;
  if (LIKELY (w0n > n)) {
    /* Apply the net of cy4 and cy6 at offset 4n as a single increment or
       decrement.  */
    if (cy4 > cy6)
      MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6);
    else
      MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4);
    MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy);
    MPN_INCR_U (w0 + n, w0n - n, cy6);
  } else {
    MPN_INCR_U (pp + 4 * n, w0n + n, cy4);
    MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6);
  }
  w0[w0n - 1] += embankment;

#undef w5
#undef w3
#undef w0

}