/* mpn_sqr_n -- square {a,n} and write the 2n-limb result to {p,2n}.

   p  destination, 2n limbs, must not overlap {a,n}
   a  source operand, n limbs
   n  operand size in limbs, n >= 1

   Dispatches on n between mul_basecase, sqr_basecase, Karatsuba, Toom-3
   and (when built with WANT_FFT) the FFT.  A dead "#if 0" early return
   for n == 0 was removed: n >= 1 is part of the contract and is asserted
   below, so the guard could never matter.  */
void
mpn_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));

  if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
    { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */
      mpn_mul_basecase (p, a, n, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD))
    {
      mpn_sqr_basecase (p, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast!  The ASSERT
	 guarantees the compile-time bound covers every n in this range.  */
      mp_limb_t ws[MPN_KARA_SQR_N_TSIZE (SQR_TOOM3_THRESHOLD_LIMIT-1)];
      ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT);
      mpn_kara_sqr_n (p, a, n, ws);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (BELOW_THRESHOLD (n, SQR_FFT_THRESHOLD))
#else
  else if (BELOW_THRESHOLD (n, MPN_TOOM3_MAX_N))
#endif
    {
      /* Toom-3 with stack-based temporary workspace.  */
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (MPN_TOOM3_SQR_N_TSIZE (n));
      mpn_toom3_sqr_n (p, a, n, ws);
      TMP_SFREE;
    }
  else
#if WANT_FFT || TUNE_PROGRAM_BUILD
    {
      /* The current FFT code allocates its own space.  That should
	 probably change.  */
      mpn_mul_fft_full (p, a, n, a, n);
    }
#else
    {
      /* Toom3 for large operands.  Use workspace from the heap, as stack
	 space may be limited.  Since n is at least MUL_TOOM3_THRESHOLD,
	 multiplication will take much longer than malloc()/free().  */
      mp_ptr ws;
      mp_size_t ws_size;

      ws_size = MPN_TOOM3_SQR_N_TSIZE (n);
      ws = __GMP_ALLOCATE_FUNC_LIMBS (ws_size);
      mpn_toom3_sqr_n (p, a, n, ws);
      __GMP_FREE_FUNC_LIMBS (ws, ws_size);
    }
#endif
}
void mpn_sqr (mp_ptr p, mp_srcptr a, mp_size_t n) { ASSERT (n >= 1); ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n)); #if 0 /* FIXME: Can this be removed? */ if (n == 0) return; #endif if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */ mpn_mul_basecase (p, a, n, a, n); } else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD)) { mpn_sqr_basecase (p, a, n); } else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) { /* Allocate workspace of fixed size on stack: fast! */ mp_limb_t ws[MPN_KARA_SQR_N_TSIZE (SQR_TOOM3_THRESHOLD_LIMIT-1)]; ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT); mpn_kara_sqr_n (p, a, n, ws); } else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD)) { mp_ptr ws; TMP_SDECL; TMP_SMARK; ws = TMP_SALLOC_LIMBS (MPN_TOOM3_SQR_N_TSIZE (n)); mpn_toom3_sqr_n (p, a, n, ws); TMP_SFREE; } else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD)) { mpn_toom4_sqr_n (p, a, n); } #if WANT_FFT || TUNE_PROGRAM_BUILD else if (BELOW_THRESHOLD (n, SQR_FFT_FULL_THRESHOLD)) #else else #endif { mpn_toom8_sqr_n (p, a, n); } #if WANT_FFT || TUNE_PROGRAM_BUILD else {
/* mpn_mul_n -- multiply {a,n} by {b,n} and write the 2n-limb result to
   {p,2n}.

   p  destination, 2n limbs; must not overlap either source
   a  first source operand, n limbs
   b  second source operand, n limbs
   n  operand size in limbs, n >= 1

   Dispatches on n between basecase, Karatsuba, Toom-3, Toom-4, Toom-8h
   and (when built with WANT_FFT) the FFT.  */
void
mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n));

  if (BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD))
    {
      mpn_mul_basecase (p, a, n, b, n);
    }
  else if (BELOW_THRESHOLD (n, MUL_TOOM3_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast!  The ASSERT
	 guarantees the compile-time bound covers every n in this range.  */
      mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD_LIMIT-1)];
      ASSERT (MUL_TOOM3_THRESHOLD <= MUL_TOOM3_THRESHOLD_LIMIT);
      mpn_kara_mul_n (p, a, b, n, ws);
    }
  else if (BELOW_THRESHOLD (n, MUL_TOOM4_THRESHOLD))
    {
      /* Toom-3 with stack-based temporary workspace.  */
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (MPN_TOOM3_MUL_N_TSIZE (n));
      mpn_toom3_mul_n (p, a, b, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD))
    {
      mpn_toom4_mul_n (p, a, b, n);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  /* With FFT available, Toom-8h covers only the band up to the FFT
     threshold; the final else below goes to the FFT.  */
  else if (BELOW_THRESHOLD (n, MUL_FFT_FULL_THRESHOLD))
    {
      mpn_toom8h_mul (p, a, n, b, n);
    }
#endif
  else
#if WANT_FFT || TUNE_PROGRAM_BUILD
    {
      mpn_mul_fft_main(p, a, n, b, n);
    }
#else
    {
      /* Toom8 for large operands.  */
      mpn_toom8h_mul (p, a, n, b, n);
    }
#endif
}
void mpn_mullow_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) { if (BELOW_THRESHOLD (n, MULLOW_BASECASE_THRESHOLD)) { /* Allocate workspace of fixed size on stack: fast! */ mp_limb_t ws[MUL_BASECASE_ALLOC]; mpn_mul_basecase (ws, xp, n, yp, n); MPN_COPY (rp, ws, n); } else if (BELOW_THRESHOLD (n, MULLOW_DC_THRESHOLD)) { mpn_mullow_basecase (rp, xp, yp, n); } else if (BELOW_THRESHOLD (n, MULLOW_MUL_N_THRESHOLD)) { /* Divide-and-conquer */ mp_size_t n2 = n >> 1; /* floor(n/2) */ mp_size_t n1 = n - n2; /* ceil(n/2) */ mp_ptr tp; TMP_SDECL; TMP_SMARK; tp = TMP_SALLOC_LIMBS (n1); /* Split as x = x1 2^(n1 GMP_NUMB_BITS) + x0, y = y1 2^(n2 GMP_NUMB_BITS) + y0 */ /* x0 * y0 */ mpn_mul_n (rp, xp, yp, n2); if (n1 != n2) rp[2 * n2] = mpn_addmul_1 (rp + n2, yp, n2, xp[n2]); /* x1 * y0 * 2^(n1 GMP_NUMB_BITS) */ mpn_mullow_n (tp, xp + n1, yp, n2); mpn_add_n (rp + n1, rp + n1, tp, n2); /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */ mpn_mullow_n (tp, yp + n2, xp, n1); mpn_add_n (rp + n2, rp + n2, tp, n1); TMP_SFREE; } else {
void mpn_toom53_mul (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr scratch) { mp_size_t n, s, t; int vm1_neg, vmh_neg; mp_limb_t cy; mp_ptr gp, hp; mp_ptr as1, asm1, as2, ash, asmh; mp_ptr bs1, bsm1, bs2, bsh, bsmh; enum toom4_flags flags; TMP_DECL; #define a0 ap #define a1 (ap + n) #define a2 (ap + 2*n) #define a3 (ap + 3*n) #define a4 (ap + 4*n) #define b0 bp #define b1 (bp + n) #define b2 (bp + 2*n) n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3); s = an - 4 * n; t = bn - 2 * n; ASSERT (0 < s && s <= n); ASSERT (0 < t && t <= n); TMP_MARK; as1 = TMP_SALLOC_LIMBS (n + 1); asm1 = TMP_SALLOC_LIMBS (n + 1); as2 = TMP_SALLOC_LIMBS (n + 1); ash = TMP_SALLOC_LIMBS (n + 1); asmh = TMP_SALLOC_LIMBS (n + 1); bs1 = TMP_SALLOC_LIMBS (n + 1); bsm1 = TMP_SALLOC_LIMBS (n + 1); bs2 = TMP_SALLOC_LIMBS (n + 1); bsh = TMP_SALLOC_LIMBS (n + 1); bsmh = TMP_SALLOC_LIMBS (n + 1); gp = pp; hp = pp + n + 1; /* Compute as1 and asm1. */ gp[n] = mpn_add_n (gp, a0, a2, n); gp[n] += mpn_add (gp, gp, n, a4, s); hp[n] = mpn_add_n (hp, a1, a3, n); #if HAVE_NATIVE_mpn_addsub_n if (mpn_cmp (gp, hp, n + 1) < 0) { mpn_addsub_n (as1, asm1, hp, gp, n + 1); vm1_neg = 1; } else { mpn_addsub_n (as1, asm1, gp, hp, n + 1); vm1_neg = 0; } #else mpn_add_n (as1, gp, hp, n + 1); if (mpn_cmp (gp, hp, n + 1) < 0) { mpn_sub_n (asm1, hp, gp, n + 1); vm1_neg = 1; } else { mpn_sub_n (asm1, gp, hp, n + 1); vm1_neg = 0; } #endif /* Compute as2. 
*/ #if !HAVE_NATIVE_mpn_addlsh_n ash[n] = mpn_lshift (ash, a2, n, 2); /* 4a2 */ #endif #if HAVE_NATIVE_mpn_addlsh1_n cy = mpn_addlsh1_n (as2, a3, a4, s); if (s != n) cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy); cy = 2 * cy + mpn_addlsh1_n (as2, a2, as2, n); cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n); as2[n] = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n); #else cy = mpn_lshift (as2, a4, s, 1); cy += mpn_add_n (as2, a3, as2, s); if (s != n) cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy); cy = 4 * cy + mpn_lshift (as2, as2, n, 2); cy += mpn_add_n (as2, a1, as2, n); cy = 2 * cy + mpn_lshift (as2, as2, n, 1); as2[n] = cy + mpn_add_n (as2, a0, as2, n); mpn_add_n (as2, ash, as2, n + 1); #endif /* Compute ash and asmh. */ #if HAVE_NATIVE_mpn_addlsh_n cy = mpn_addlsh_n (gp, a2, a0, n, 2); /* 4a0 + a2 */ cy = 4 * cy + mpn_addlsh_n (gp, a4, gp, n, 2); /* 16a0 + 4a2 + a4 */ /* FIXME s */ gp[n] = cy; cy = mpn_addlsh_n (hp, a3, a1, n, 2); /* 4a1 + a3 */ cy = 2 * cy + mpn_lshift (hp, hp, n, 1); /* 8a1 + 2a3 */ hp[n] = cy; #else gp[n] = mpn_lshift (gp, a0, n, 4); /* 16a0 */ mpn_add (gp, gp, n + 1, a4, s); /* 16a0 + a4 */ mpn_add_n (gp, ash, gp, n+1); /* 16a0 + 4a2 + a4 */ cy = mpn_lshift (hp, a1, n, 3); /* 8a1 */ cy += mpn_lshift (ash, a3, n, 1); /* 2a3 */ cy += mpn_add_n (hp, ash, hp, n); /* 8a1 + 2a3 */ hp[n] = cy; #endif #if HAVE_NATIVE_mpn_addsub_n if (mpn_cmp (gp, hp, n + 1) < 0) { mpn_addsub_n (ash, asmh, hp, gp, n + 1); vmh_neg = 1; } else { mpn_addsub_n (ash, asmh, gp, hp, n + 1); vmh_neg = 0; } #else mpn_add_n (ash, gp, hp, n + 1); if (mpn_cmp (gp, hp, n + 1) < 0) { mpn_sub_n (asmh, hp, gp, n + 1); vmh_neg = 1; } else { mpn_sub_n (asmh, gp, hp, n + 1); vmh_neg = 0; } #endif /* Compute bs1 and bsm1. */ bs1[n] = mpn_add (bs1, b0, n, b2, t); /* b0 + b2 */ #if HAVE_NATIVE_mpn_addsub_n if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0) { bs1[n] = mpn_addsub_n (bs1, bsm1, b1, bs1, n) >> 1; bsm1[n] = 0; vm1_neg ^= 1; }
/* mpn_dcpi1_divappr_q -- divide-and-conquer approximate quotient of
   {np,nn} by {dp,dn}, quotient (nn-dn+1 limbs, counting the high limb
   returned) written to {qp,nn-dn}.

   qp    quotient destination, nn-dn limbs; high limb is the return value
   np    numerator, nn limbs; clobbered (used as partial remainder)
   nn    numerator size, nn > dn
   dp    normalized divisor, dn limbs, dn >= 6, high bit of dp[dn-1] set
   dinv  precomputed 3/2 inverse of the two high divisor limbs (inv32)

   The quotient is developed in blocks of dn limbs from the most
   significant end down; only the last block is approximate.  NOTE: the
   approximation may exceed the true quotient by a small amount — callers
   are expected to compensate.  (Standard dcpi1 contract — confirm against
   gmp-impl.h.)  */
mp_limb_t
mpn_dcpi1_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
		     mp_srcptr dp, mp_size_t dn, gmp_pi1_t *dinv)
{
  mp_size_t qn;
  mp_limb_t qh, cy, qsave;
  mp_ptr tp;
  TMP_DECL;

  TMP_MARK;

  ASSERT (dn >= 6);
  ASSERT (nn > dn);
  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);

  qn = nn - dn;
  /* Work from the high ends of all three operands.  */
  qp += qn;
  np += nn;
  dp += dn;

  if (qn >= dn)
    {
      qn++;			/* pretend we'll need an extra limb */
      /* Reduce qn mod dn without division, optimizing small operations.  */
      do
	qn -= dn;
      while (qn > dn);

      qp -= qn;			/* point at low limb of next quotient block */
      np -= qn;			/* point in the middle of partial remainder */

      tp = TMP_SALLOC_LIMBS (dn);

      /* Perform the typically smaller block first.  */
      if (qn == 1)
	{
	  mp_limb_t q, n2, n1, n0, d1, d0;

	  /* Handle qh up front, for simplicity. */
	  qh = mpn_cmp (np - dn + 1, dp - dn, dn) >= 0;
	  if (qh)
	    ASSERT_NOCARRY (mpn_sub_n (np - dn + 1, np - dn + 1, dp - dn, dn));

	  /* A single iteration of schoolbook: One 3/2 division,
	     followed by the bignum update and adjustment. */
	  n2 = np[0];
	  n1 = np[-1];
	  n0 = np[-2];
	  d1 = dp[-1];
	  d0 = dp[-2];

	  ASSERT (n2 < d1 || (n2 == d1 && n1 <= d0));

	  if (UNLIKELY (n2 == d1) && n1 == d0)
	    {
	      /* Quotient digit saturates; subtract q*d directly.  */
	      q = GMP_NUMB_MASK;
	      cy = mpn_submul_1 (np - dn, dp - dn, dn, q);
	      ASSERT (cy == n2);
	    }
	  else
	    {
	      udiv_qr_3by2 (q, n1, n0, n2, n1, n0, d1, d0, dinv->inv32);

	      if (dn > 2)
		{
		  mp_limb_t cy, cy1;
		  cy = mpn_submul_1 (np - dn, dp - dn, dn - 2, q);

		  /* Propagate the borrow from the low part into n0, n1.  */
		  cy1 = n0 < cy;
		  n0 = (n0 - cy) & GMP_NUMB_MASK;
		  cy = n1 < cy1;
		  n1 = (n1 - cy1) & GMP_NUMB_MASK;
		  np[-2] = n0;

		  if (UNLIKELY (cy != 0))
		    {
		      /* q was one too large; add back and decrement q.  */
		      n1 += d1 + mpn_add_n (np - dn, np - dn, dp - dn, dn - 1);
		      qh -= (q == 0);
		      q = (q - 1) & GMP_NUMB_MASK;
		    }
		}
	      else
		np[-2] = n0;

	      np[-1] = n1;
	    }
	  qp[0] = q;
	}
      else
	{
	  if (qn == 2)
	    qh = mpn_divrem_2 (qp, 0L, np - 2, 4, dp - 2);
	  else if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
	    qh = mpn_sbpi1_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv->inv32);
	  else
	    qh = mpn_dcpi1_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp);

	  if (qn != dn)
	    {
	      /* Multiply the first quotient block by the low dn-qn divisor
		 limbs and subtract from the partial remainder.  */
	      if (qn > dn - qn)
		mpn_mul (tp, qp, qn, dp - dn, dn - qn);
	      else
		mpn_mul (tp, dp - dn, dn - qn, qp, qn);

	      cy = mpn_sub_n (np - dn, np - dn, tp, dn);
	      if (qh != 0)
		cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);

	      /* Adjust: each borrow means the quotient was one too large.  */
	      while (cy != 0)
		{
		  qh -= mpn_sub_1 (qp, qp, qn, 1);
		  cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
		}
	    }
	}

      /* Full dn-limb quotient blocks, high to low.  */
      qn = nn - dn - qn + 1;
      while (qn > dn)
	{
	  qp -= dn;
	  np -= dn;
	  mpn_dcpi1_div_qr_n (qp, np - dn, dp - dn, dn, dinv, tp);
	  qn -= dn;
	}

      /* Since we pretended we'd need an extra quotient limb before, we
	 now have made sure the code above left just dn-1=qn quotient
	 limbs to develop.  Develop that plus a guard limb. */
      qn--;
      qp -= qn;
      np -= dn;
      qsave = qp[qn];
      mpn_dcpi1_divappr_q_n (qp, np - dn, dp - dn, dn, dinv, tp);
      /* Drop the guard limb: shift the quotient down one position.  */
      MPN_COPY_INCR (qp, qp + 1, qn);
      qp[qn] = qsave;
    }
  else    /* (qn < dn) */
    {
      mp_ptr q2p;
#if 0				/* not possible since we demand nn > dn */
      if (qn == 0)
	{
	  qh = mpn_cmp (np - dn, dp - dn, dn) >= 0;
	  if (qh)
	    mpn_sub_n (np - dn, np - dn, dp - dn, dn);
	  TMP_FREE;
	  return qh;
	}
#endif

      qp -= qn;			/* point at low limb of next quotient block */
      np -= qn;			/* point in the middle of partial remainder */

      q2p = TMP_SALLOC_LIMBS (qn + 1);
      /* Should we at all check DC_DIVAPPR_Q_THRESHOLD here, or rely on
	 callers not to be silly?  */
      if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD))
	{
	  qh = mpn_sbpi1_divappr_q (q2p, np - qn - 2, 2 * (qn + 1),
				    dp - (qn + 1), qn + 1, dinv->inv32);
	}
      else
	{
	  /* It is tempting to use qp for recursive scratch and put quotient
	     in tp, but the recursive scratch needs one limb too many.  */
	  tp = TMP_SALLOC_LIMBS (qn + 1);
	  qh = mpn_dcpi1_divappr_q_n (q2p, np - qn - 2, dp - (qn + 1),
				      qn + 1, dinv, tp);
	}
      /* Discard the guard limb of the qn+1 limb approximate quotient.  */
      MPN_COPY (qp, q2p + 1, qn);
    }

  TMP_FREE;
  return qh;
}
/* mpn_dcpi1_div_qr -- divide-and-conquer division of {np,nn} by {dp,dn},
   exact quotient and remainder.

   qp    quotient destination, nn-dn limbs; high limb is the return value
   np    numerator, nn limbs; on return holds the dn-limb remainder in
	 its low limbs
   nn    numerator size, nn - dn >= 3
   dp    normalized divisor, dn limbs, dn >= 6, high bit of dp[dn-1] set
   dinv  precomputed 3/2 inverse of the two high divisor limbs (inv32)

   The quotient is developed in blocks of dn limbs from the most
   significant end down; the first (high) block is the partial-size one.  */
mp_limb_t
mpn_dcpi1_div_qr (mp_ptr qp, mp_ptr np, mp_size_t nn,
		  mp_srcptr dp, mp_size_t dn, gmp_pi1_t *dinv)
{
  mp_size_t qn;
  mp_limb_t qh, cy;
  mp_ptr tp;
  TMP_DECL;

  TMP_MARK;

  ASSERT (dn >= 6);		/* to adhere to mpn_sbpi1_div_qr's limits */
  ASSERT (nn - dn >= 3);	/* to adhere to mpn_sbpi1_div_qr's limits */
  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);

  tp = TMP_SALLOC_LIMBS (dn);

  qn = nn - dn;
  /* Work from the high ends of all three operands.  */
  qp += qn;
  np += nn;
  dp += dn;

  if (qn > dn)
    {
      /* Reduce qn mod dn without division, optimizing small operations.  */
      do
	qn -= dn;
      while (qn > dn);

      qp -= qn;			/* point at low limb of next quotient block */
      np -= qn;			/* point in the middle of partial remainder */

      /* Perform the typically smaller block first.  */
      if (qn == 1)
	{
	  mp_limb_t q, n2, n1, n0, d1, d0;

	  /* Handle qh up front, for simplicity. */
	  qh = mpn_cmp (np - dn + 1, dp - dn, dn) >= 0;
	  if (qh)
	    ASSERT_NOCARRY (mpn_sub_n (np - dn + 1, np - dn + 1, dp - dn, dn));

	  /* A single iteration of schoolbook: One 3/2 division,
	     followed by the bignum update and adjustment. */
	  n2 = np[0];
	  n1 = np[-1];
	  n0 = np[-2];
	  d1 = dp[-1];
	  d0 = dp[-2];

	  ASSERT (n2 < d1 || (n2 == d1 && n1 <= d0));

	  if (UNLIKELY (n2 == d1) && n1 == d0)
	    {
	      /* Quotient digit saturates; subtract q*d directly.  */
	      q = GMP_NUMB_MASK;
	      cy = mpn_submul_1 (np - dn, dp - dn, dn, q);
	      ASSERT (cy == n2);
	    }
	  else
	    {
	      udiv_qr_3by2 (q, n1, n0, n2, n1, n0, d1, d0, dinv->inv32);

	      if (dn > 2)
		{
		  mp_limb_t cy, cy1;
		  cy = mpn_submul_1 (np - dn, dp - dn, dn - 2, q);

		  /* Propagate the borrow from the low part into n0, n1.  */
		  cy1 = n0 < cy;
		  n0 = (n0 - cy) & GMP_NUMB_MASK;
		  cy = n1 < cy1;
		  n1 = (n1 - cy1) & GMP_NUMB_MASK;
		  np[-2] = n0;

		  if (UNLIKELY (cy != 0))
		    {
		      /* q was one too large; add back and decrement q.  */
		      n1 += d1 + mpn_add_n (np - dn, np - dn, dp - dn, dn - 1);
		      qh -= (q == 0);
		      q = (q - 1) & GMP_NUMB_MASK;
		    }
		}
	      else
		np[-2] = n0;

	      np[-1] = n1;
	    }
	  qp[0] = q;
	}
      else
	{
	  /* Do a 2qn / qn division */
	  if (qn == 2)
	    qh = mpn_divrem_2 (qp, 0L, np - 2, 4, dp - 2); /* FIXME: obsolete function. Use 5/3 division? */
	  else if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
	    qh = mpn_sbpi1_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv->inv32);
	  else
	    qh = mpn_dcpi1_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp);

	  if (qn != dn)
	    {
	      /* Multiply the first quotient block by the low dn-qn divisor
		 limbs and subtract from the partial remainder.  */
	      if (qn > dn - qn)
		mpn_mul (tp, qp, qn, dp - dn, dn - qn);
	      else
		mpn_mul (tp, dp - dn, dn - qn, qp, qn);

	      cy = mpn_sub_n (np - dn, np - dn, tp, dn);
	      if (qh != 0)
		cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);

	      /* Adjust: each borrow means the quotient was one too large.  */
	      while (cy != 0)
		{
		  qh -= mpn_sub_1 (qp, qp, qn, 1);
		  cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
		}
	    }
	}

      /* Remaining full dn-limb quotient blocks, high to low.  */
      qn = nn - dn - qn;
      do
	{
	  qp -= dn;
	  np -= dn;
	  mpn_dcpi1_div_qr_n (qp, np - dn, dp - dn, dn, dinv, tp);
	  qn -= dn;
	}
      while (qn > 0);
    }
  else
    {
      /* Single block: qn <= dn.  */
      qp -= qn;			/* point at low limb of next quotient block */
      np -= qn;			/* point in the middle of partial remainder */

      if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
	qh = mpn_sbpi1_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv->inv32);
      else
	qh = mpn_dcpi1_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp);

      if (qn != dn)
	{
	  /* Multiply the quotient by the low dn-qn divisor limbs and
	     subtract from the partial remainder.  */
	  if (qn > dn - qn)
	    mpn_mul (tp, qp, qn, dp - dn, dn - qn);
	  else
	    mpn_mul (tp, dp - dn, dn - qn, qp, qn);

	  cy = mpn_sub_n (np - dn, np - dn, tp, dn);
	  if (qh != 0)
	    cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);

	  /* Adjust: each borrow means the quotient was one too large.  */
	  while (cy != 0)
	    {
	      qh -= mpn_sub_1 (qp, qp, qn, 1);
	      cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
	    }
	}
    }

  TMP_FREE;
  return qh;
}
int main (int argc, char **argv) { mp_ptr ap, bp, rp, refp; mp_size_t max_n, n; gmp_randstate_ptr rands; long test, reps = 1000; TMP_SDECL; TMP_SMARK; tests_start (); TESTS_REPS (reps, argv, argc); rands = RANDS; max_n = 32; ap = TMP_SALLOC_LIMBS (max_n); bp = TMP_SALLOC_LIMBS (max_n); rp = TMP_SALLOC_LIMBS (max_n); refp = TMP_SALLOC_LIMBS (max_n); for (test = 0; test < reps; test++) { for (n = 1; n <= max_n; n++) { mpn_random2 (ap, n); mpn_random2 (bp, n); refmpn_and_n (refp, ap, bp, n); mpn_and_n (rp, ap, bp, n); check_one (refp, rp, ap, bp, n, "and_n"); refmpn_ior_n (refp, ap, bp, n); mpn_ior_n (rp, ap, bp, n); check_one (refp, rp, ap, bp, n, "ior_n"); refmpn_xor_n (refp, ap, bp, n); mpn_xor_n (rp, ap, bp, n); check_one (refp, rp, ap, bp, n, "xor_n"); refmpn_andn_n (refp, ap, bp, n); mpn_andn_n (rp, ap, bp, n); check_one (refp, rp, ap, bp, n, "andn_n"); refmpn_iorn_n (refp, ap, bp, n); mpn_iorn_n (rp, ap, bp, n); check_one (refp, rp, ap, bp, n, "iorn_n"); refmpn_nand_n (refp, ap, bp, n); mpn_nand_n (rp, ap, bp, n); check_one (refp, rp, ap, bp, n, "nand_n"); refmpn_nior_n (refp, ap, bp, n); mpn_nior_n (rp, ap, bp, n); check_one (refp, rp, ap, bp, n, "nior_n"); refmpn_xnor_n (refp, ap, bp, n); mpn_xnor_n (rp, ap, bp, n); check_one (refp, rp, ap, bp, n, "xnor_n"); refmpn_com (refp, ap, n); mpn_com (rp, ap, n); check_one (refp, rp, ap, bp, n, "com"); } } TMP_SFREE; tests_end (); return 0; }
/* mpn_dcpi1_bdiv_q -- divide-and-conquer Hensel (bit-reversed) division:
   compute {qp,nn} = {np,nn} / {dp,dn} mod B^nn, where B = 2^GMP_NUMB_BITS.

   qp    quotient destination, nn limbs
   np    numerator, nn limbs; clobbered
   nn    numerator size, nn >= dn
   dp    divisor, dn limbs, dn >= 2, must be odd (dp[0] & 1)
   dinv  precomputed inverse of dp[0] mod B

   Unlike ordinary division, Hensel division develops the quotient from
   the LOW end upward, in blocks of dn limbs.  */
void
mpn_dcpi1_bdiv_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
		  mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
{
  mp_size_t qn;
  mp_limb_t cy;
  mp_ptr tp;
  TMP_DECL;

  TMP_MARK;

  ASSERT (dn >= 2);
  ASSERT (nn - dn >= 0);
  ASSERT (dp[0] & 1);

  tp = TMP_SALLOC_LIMBS (dn);

  qn = nn;

  if (qn > dn)
    {
      /* Reduce qn mod dn in a super-efficient manner. */
      do
	qn -= dn;
      while (qn > dn);

      /* Perform the typically smaller block first. */
      if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD))
	cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv);
      else
	cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, qn, dinv, tp);

      if (qn != dn)
	{
	  /* Multiply the first quotient block by the high dn-qn divisor
	     limbs and subtract the product from the numerator.  */
	  if (qn > dn - qn)
	    mpn_mul (tp, qp, qn, dp + qn, dn - qn);
	  else
	    mpn_mul (tp, dp + qn, dn - qn, qp, qn);
	  mpn_incr_u (tp + qn, cy);

	  mpn_sub (np + qn, np + qn, nn - qn, tp, dn);
	  cy = 0;
	}

      np += qn;
      qp += qn;

      /* Full dn-limb quotient blocks, low to high; cy carries the borrow
	 from one block into the next.  */
      qn = nn - qn;
      while (qn > dn)
	{
	  mpn_sub_1 (np + dn, np + dn, qn - dn, cy);
	  cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, dn, dinv, tp);
	  qp += dn;
	  np += dn;
	  qn -= dn;
	}
      /* Final block: quotient only, no remainder needed.  */
      mpn_dcpi1_bdiv_q_n (qp, np, dp, dn, dinv, tp);
    }
  else
    {
      /* Single block: qn <= dn, use only the low qn divisor limbs.  */
      if (BELOW_THRESHOLD (qn, DC_BDIV_Q_THRESHOLD))
	mpn_sbpi1_bdiv_q (qp, np, qn, dp, qn, dinv);
      else
	mpn_dcpi1_bdiv_q_n (qp, np, dp, qn, dinv, tp);
    }

  TMP_FREE;
}
/* mpn_preinv_dc_divappr_q -- divide-and-conquer approximate quotient of
   {np,nn} by {dp,dn} using a precomputed divisor inverse.

   qp   quotient destination, nn-dn limbs; high limb is the return value
   np   numerator, nn limbs; clobbered (used as partial remainder)
   nn   numerator size
   dp   divisor, dn limbs (presumably normalized — no ASSERT here,
	TODO confirm against callers)
   dip  precomputed inverse of the high divisor limb(s)

   The quotient is developed in blocks of dn limbs from the most
   significant end down; only the last block is approximate.  */
mp_limb_t
mpn_preinv_dc_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
			 mp_srcptr dp, mp_size_t dn, mp_srcptr dip)
{
  mp_size_t qn;
  mp_limb_t qh, cy, qsave;
  mp_ptr tp;
  TMP_DECL;

  TMP_MARK;

  tp = TMP_SALLOC_LIMBS (dn+1);

  qn = nn - dn;
  /* Work from the high ends of all three operands.  */
  qp += qn;
  np += nn;
  dp += dn;

  if (qn > dn)
    {
      qn++;			/* pretend we'll need an extra limb */
      /* Reduce qn mod dn without division, optimizing small operations.  */
      do
	qn -= dn;
      while (qn > dn);

      qp -= qn;			/* point at low limb of next quotient block */
      np -= qn;			/* point in the middle of partial remainder */

      /* Perform the typically smaller block first.  */
      if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
	qh = mpn_sb_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dip);
      else
	qh = mpn_dc_div_qr_n (qp, np - qn, dp - qn, qn, dip, tp);

      if (qn != dn)
	{
	  /* Multiply the first quotient block by the low dn-qn divisor
	     limbs and subtract from the partial remainder.  */
	  if (qn > dn - qn)
	    mpn_mul (tp, qp, qn, dp - dn, dn - qn);
	  else
	    mpn_mul (tp, dp - dn, dn - qn, qp, qn);

	  cy = mpn_sub_n (np - dn, np - dn, tp, dn);
	  if (qh != 0)
	    cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);

	  /* Adjust: each borrow means the quotient was one too large.  */
	  while (cy != 0)
	    {
	      qh -= mpn_sub_1 (qp, qp, qn, 1);
	      cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
	    }
	}

      /* Full dn-limb quotient blocks, high to low.  */
      qn = nn - dn - qn + 1;
      while (qn > dn)
	{
	  qp -= dn;
	  np -= dn;
	  mpn_dc_div_qr_n (qp, np - dn, dp - dn, dn, dip, tp);
	  qn -= dn;
	}

      /* Since we pretended we'd need an extra quotient limb before, we
	 now have made sure the code above left just dn-1=qn quotient
	 limbs to develop.  Develop that plus a guard limb. */
      qn--;
      qp -= qn;
      np -= dn;
      qsave = qp[qn];
      mpn_dc_divappr_q_n (qp, np - dn, dp - dn, dn, dip, tp);
      /* Drop the guard limb: shift the quotient down one position.  */
      MPN_COPY_INCR (qp, qp + 1, qn);
      qp[qn] = qsave;
    }
  else
    {
      if (qn == 0)
	{
	  /* No quotient limbs: just compare and possibly subtract once.  */
	  qh = mpn_cmp (np - dn, dp - dn, dn) >= 0;
	  if (qh)
	    mpn_sub_n (np - dn, np - dn, dp - dn, dn);
	  TMP_FREE;
	  return qh;
	}

      qp -= qn;			/* point at low limb of next quotient block */
      np -= qn;			/* point in the middle of partial remainder */

      if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD))
	/* Full precision.  Optimal? */
	qh = mpn_sb_divappr_q (qp, np - dn, nn, dp - dn, dn, dip);
      else
	{
	  /* Put quotient in tp, use qp as temporary, since qp lacks a limb. */
	  qh = mpn_dc_divappr_q_n (tp, np - qn - 2, dp - (qn + 1),
				   qn + 1, dip, qp);
	  /* Discard the guard limb of the qn+1 limb approximate quotient.  */
	  MPN_COPY (qp, tp + 1, qn);
	}
    }

  TMP_FREE;
  return qh;
}
/* mpn_sqr -- square {a,n} and write the 2n-limb result to {p,2n}.

   p  destination, 2n limbs, must not overlap {a,n}
   a  source operand, n limbs
   n  operand size in limbs, n >= 1

   Picks the cheapest applicable algorithm by size: basecase
   multiplication, basecase squaring, then Toom-2/3/4/6/8 squaring,
   and finally the FFT for the largest operands.  */
void
mpn_sqr (mp_ptr p, mp_srcptr a, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));

  if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
    {
      /* mul_basecase is faster than sqr_basecase on small sizes sometimes */
      mpn_mul_basecase (p, a, n, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))
    {
      mpn_sqr_basecase (p, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))
    {
      /* Fixed-size workspace straight on the stack; the ASSERT checks
	 the compile-time bound covers every n in this range.  */
      mp_limb_t wsp[mpn_toom2_sqr_itch (SQR_TOOM3_THRESHOLD_LIMIT-1)];
      ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT);
      mpn_toom2_sqr (p, a, n, wsp);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))
    {
      /* Toom-3, scratch space from the stack allocator.  */
      mp_ptr wsp;
      TMP_SDECL;
      TMP_SMARK;
      wsp = TMP_SALLOC_LIMBS (mpn_toom3_sqr_itch (n));
      mpn_toom3_sqr (p, a, n, wsp);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD))
    {
      /* Toom-4, scratch space from the stack allocator.  */
      mp_ptr wsp;
      TMP_SDECL;
      TMP_SMARK;
      wsp = TMP_SALLOC_LIMBS (mpn_toom4_sqr_itch (n));
      mpn_toom4_sqr (p, a, n, wsp);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD))
    {
      /* Toom-6, scratch space from the stack allocator.  */
      mp_ptr wsp;
      TMP_SDECL;
      TMP_SMARK;
      wsp = TMP_SALLOC_LIMBS (mpn_toom6_sqr_itch (n));
      mpn_toom6_sqr (p, a, n, wsp);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, SQR_FFT_THRESHOLD))
    {
      /* Toom-8; the larger scratch comes from the general allocator.  */
      mp_ptr wsp;
      TMP_DECL;
      TMP_MARK;
      wsp = TMP_ALLOC_LIMBS (mpn_toom8_sqr_itch (n));
      mpn_toom8_sqr (p, a, n, wsp);
      TMP_FREE;
    }
  else
    {
      /* The current FFT code allocates its own space.  That should
	 probably change.  */
      mpn_fft_mul (p, a, n, a, n);
    }
}
void mpn_toom53_mul (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr scratch) { mp_size_t n, s, t; mp_limb_t cy; mp_ptr gp; mp_ptr as1, asm1, as2, asm2, ash; mp_ptr bs1, bsm1, bs2, bsm2, bsh; enum toom7_flags flags; TMP_DECL; #define a0 ap #define a1 (ap + n) #define a2 (ap + 2*n) #define a3 (ap + 3*n) #define a4 (ap + 4*n) #define b0 bp #define b1 (bp + n) #define b2 (bp + 2*n) n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3); s = an - 4 * n; t = bn - 2 * n; ASSERT (0 < s && s <= n); ASSERT (0 < t && t <= n); TMP_MARK; as1 = TMP_SALLOC_LIMBS (n + 1); asm1 = TMP_SALLOC_LIMBS (n + 1); as2 = TMP_SALLOC_LIMBS (n + 1); asm2 = TMP_SALLOC_LIMBS (n + 1); ash = TMP_SALLOC_LIMBS (n + 1); bs1 = TMP_SALLOC_LIMBS (n + 1); bsm1 = TMP_SALLOC_LIMBS (n + 1); bs2 = TMP_SALLOC_LIMBS (n + 1); bsm2 = TMP_SALLOC_LIMBS (n + 1); bsh = TMP_SALLOC_LIMBS (n + 1); gp = pp; /* Compute as1 and asm1. */ flags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, gp)); /* Compute as2 and asm2. */ flags = (enum toom7_flags) (flags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, gp))); /* Compute ash = 16 a0 + 8 a1 + 4 a2 + 2 a3 + a4 = 2*(2*(2*(2*a0 + a1) + a2) + a3) + a4 */ #if HAVE_NATIVE_mpn_addlsh1_n cy = mpn_addlsh1_n (ash, a1, a0, n); cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n); cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n); if (s < n) { mp_limb_t cy2; cy2 = mpn_addlsh1_n (ash, a4, ash, s); ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1); MPN_INCR_U (ash + s, n+1-s, cy2); } else ash[n] = 2*cy + mpn_addlsh1_n (ash, a4, ash, n); #else cy = mpn_lshift (ash, a0, n, 1); cy += mpn_add_n (ash, ash, a1, n); cy = 2*cy + mpn_lshift (ash, ash, n, 1); cy += mpn_add_n (ash, ash, a2, n); cy = 2*cy + mpn_lshift (ash, ash, n, 1); cy += mpn_add_n (ash, ash, a3, n); cy = 2*cy + mpn_lshift (ash, ash, n, 1); ash[n] = cy + mpn_add (ash, ash, n, a4, s); #endif /* Compute bs1 and bsm1. 
*/ bs1[n] = mpn_add (bs1, b0, n, b2, t); /* b0 + b2 */ #if HAVE_NATIVE_mpn_add_n_sub_n if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0) { bs1[n] = mpn_add_n_sub_n (bs1, bsm1, b1, bs1, n) >> 1; bsm1[n] = 0; flags = (enum toom7_flags) (flags ^ toom7_w3_neg); }