void
mpn_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));

#if 0
  /* FIXME: Can this be removed? */
  if (n == 0)
    return;
#endif

  if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
    { /* mul_basecase is sometimes faster than sqr_basecase on small sizes */
      mpn_mul_basecase (p, a, n, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD))
    {
      mpn_sqr_basecase (p, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast! */
      mp_limb_t ws[MPN_KARA_SQR_N_TSIZE (SQR_TOOM3_THRESHOLD_LIMIT - 1)];
      ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT);
      mpn_kara_sqr_n (p, a, n, ws);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (BELOW_THRESHOLD (n, SQR_FFT_THRESHOLD))
#else
  else if (BELOW_THRESHOLD (n, MPN_TOOM3_MAX_N))
#endif
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (MPN_TOOM3_SQR_N_TSIZE (n));
      mpn_toom3_sqr_n (p, a, n, ws);
      TMP_SFREE;
    }
  else
#if WANT_FFT || TUNE_PROGRAM_BUILD
    {
      /* The current FFT code allocates its own space.  That should probably
         change.  */
      mpn_mul_fft_full (p, a, n, a, n);
    }
#else
    {
      /* Toom3 for large operands.  Use workspace from the heap, as stack
         space may be limited.  Since n is at least SQR_TOOM3_THRESHOLD, the
         multiplication will take much longer than malloc()/free().  */
      mp_ptr ws;
      mp_size_t ws_size;
      ws_size = MPN_TOOM3_SQR_N_TSIZE (n);
      ws = __GMP_ALLOCATE_FUNC_LIMBS (ws_size);
      mpn_toom3_sqr_n (p, a, n, ws);
      __GMP_FREE_FUNC_LIMBS (ws, ws_size);
    }
#endif
}
void
mpn_sqr (mp_ptr p, mp_srcptr a, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));

#if 0
  /* FIXME: Can this be removed? */
  if (n == 0)
    return;
#endif

  if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
    { /* mul_basecase is sometimes faster than sqr_basecase on small sizes */
      mpn_mul_basecase (p, a, n, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD))
    {
      mpn_sqr_basecase (p, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast! */
      mp_limb_t ws[MPN_KARA_SQR_N_TSIZE (SQR_TOOM3_THRESHOLD_LIMIT - 1)];
      ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT);
      mpn_kara_sqr_n (p, a, n, ws);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (MPN_TOOM3_SQR_N_TSIZE (n));
      mpn_toom3_sqr_n (p, a, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD))
    {
      mpn_toom4_sqr_n (p, a, n);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (BELOW_THRESHOLD (n, SQR_FFT_FULL_THRESHOLD))
#else
  else
#endif
    {
      mpn_toom8_sqr_n (p, a, n);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else
    {
      /* FFT squaring for the largest operands (assumed call, by analogy with
         mpn_sqr_n above); the current FFT code allocates its own space.  */
      mpn_mul_fft_full (p, a, n, a, n);
    }
#endif
}
void
mpn_sec_sqr (mp_ptr rp, mp_srcptr ap, mp_size_t an, mp_ptr tp)
{
#ifndef SQR_BASECASE_LIM
  /* If SQR_BASECASE_LIM is not defined, mpn_sqr_basecase is not limited and
     can be used for any operand size.  */
  mpn_sqr_basecase (rp, ap, an);
#else
  /* Else mpn_sqr_basecase is valid only up to SQR_BASECASE_LIM limbs; use
     mpn_mul_basecase, which works for any size.  */
  mpn_mul_basecase (rp, ap, an, ap, an);
#endif
}
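/* A minimal usage sketch (hypothetical caller, not part of the library):
   squaring an an-limb operand in constant time via the public GMP entry
   points mpn_sec_sqr and mpn_sec_sqr_itch.  The scratch area tp must hold
   mpn_sec_sqr_itch (an) limbs and rp must hold 2*an limbs.  */
#if 0   /* example only, not compiled with the library sources */
#include <stdlib.h>
#include <gmp.h>

static void
demo_sec_sqr (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t an)
{
  mp_size_t itch = mpn_sec_sqr_itch (an);           /* scratch size in limbs */
  mp_limb_t *tp = malloc (itch * sizeof (mp_limb_t));
  mpn_sec_sqr (rp, ap, an, tp);                     /* rp{2*an} = ap{an}^2 */
  free (tp);
}
#endif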
/* (rp,2n) = (xp,n)^2 with temporary space (tp, 2*n+C).  */
void
mpn_kara_sqr_n (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp)
{
  mp_size_t n2, n3;
  mp_srcptr xl, xh;
  mp_ptr dx;
  mp_limb_t c;

  n2 = n >> 1;
  xl = xp;
  xh = xp + n2;
  n3 = n - n2;
  dx = rp + 2 * n2;

  /* Compute dx = |xh - xl| (for odd n, xl is treated as zero-extended).  */
  if ((n & 1) == 0)
    {
      if (mpn_cmp (xh, xl, n2) >= 0)
        mpn_sub_n (dx, xh, xl, n2);
      else
        mpn_sub_n (dx, xl, xh, n2);
    }
  else
    {
      if (xh[n2] != 0 || mpn_cmp (xh, xl, n2) >= 0)
        {
          c = mpn_sub_n (dx, xh, xl, n2);
          dx[n2] = xh[n2] - c;
        }
      else
        {
          mpn_sub_n (dx, xl, xh, n2);
          dx[n2] = 0;
        }
    }

  /* Pointwise squarings: xl^2 and xh^2 into rp, dx^2 into tp.  */
  if (BELOW_THRESHOLD (n3, MUL_KARATSUBA_THRESHOLD))
    {
      mpn_mul_basecase (rp, xl, n2, xl, n2);
      mpn_mul_basecase (tp, dx, n3, dx, n3);
      mpn_mul_basecase (rp + 2 * n2, xh, n3, xh, n3);
    }
  else if (BELOW_THRESHOLD (n3, SQR_KARATSUBA_THRESHOLD))
    {
      mpn_sqr_basecase (rp, xl, n2);
      mpn_sqr_basecase (tp, dx, n3);
      mpn_sqr_basecase (rp + 2 * n2, xh, n3);
    }
  else
    {
      mpn_kara_sqr_n (rp, xl, n2, tp + 2 * n3);
      mpn_kara_sqr_n (tp, dx, n3, tp + 2 * n3);
      mpn_kara_sqr_n (rp + 2 * n2, xh, n3, tp + 2 * n3);
    }

  mpn_karasub (rp, tp, n);
}
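/* Added note: writing x = xh*B^n2 + xl with B = 2^GMP_NUMB_BITS and
   n3 = n - n2, the routine above uses the squaring form of Karatsuba's
   identity,

     x^2 = xh^2 * B^(2*n2) + (xh^2 + xl^2 - (xh-xl)^2) * B^n2 + xl^2,

   which follows from 2*xh*xl = xh^2 + xl^2 - (xh-xl)^2.  Only three
   half-size squarings are needed: xl^2 and xh^2 go directly into rp,
   |xh - xl|^2 into tp (the absolute difference suffices because
   (xh-xl)^2 = (xl-xh)^2), and mpn_karasub then performs the middle-term
   additions and subtraction in a single pass.  */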
void
mpn_sqr_n (mp_ptr prodp, mp_srcptr up, mp_size_t un)
{
  ASSERT (un >= 1);
  ASSERT (! MPN_OVERLAP_P (prodp, 2 * un, up, un));

  /* FIXME: Can this be removed? */
  if (un == 0)
    return;

  if (BELOW_THRESHOLD (un, SQR_BASECASE_THRESHOLD))
    { /* mul_basecase is sometimes faster than sqr_basecase on small sizes */
      mpn_mul_basecase (prodp, up, un, up, un);
    }
  else if (BELOW_THRESHOLD (un, SQR_KARATSUBA_THRESHOLD))
    { /* plain schoolbook squaring */
      mpn_sqr_basecase (prodp, up, un);
    }
  else if (BELOW_THRESHOLD (un, SQR_TOOM3_THRESHOLD))
    { /* Karatsuba squaring */
      mp_ptr tspace;
      TMP_DECL (marker);
      TMP_MARK (marker);
      tspace = TMP_ALLOC_LIMBS (MPN_KARA_SQR_N_TSIZE (un));
      mpn_kara_sqr_n (prodp, up, un, tspace);
      TMP_FREE (marker);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (BELOW_THRESHOLD (un, SQR_FFT_THRESHOLD))
#else
  else
#endif
    {
      /* Toom3 squaring.  Use workspace from the heap, as stack may be
         limited.  Since un is at least SQR_TOOM3_THRESHOLD, the
         multiplication will take much longer than malloc()/free().  */
      mp_ptr tspace;
      mp_size_t tsize;
      tsize = MPN_TOOM3_SQR_N_TSIZE (un);
      tspace = __GMP_ALLOCATE_FUNC_LIMBS (tsize);
      mpn_toom3_sqr_n (prodp, up, un, tspace);
      __GMP_FREE_FUNC_LIMBS (tspace, tsize);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else
    {
      /* FFT squaring for the largest operands (assumed call, by analogy
         with the mpn_sqr_n variant above); the FFT code allocates its own
         space.  */
      mpn_mul_fft_full (prodp, up, un, up, un);
    }
#endif
}
/* Define our own squaring function, which uses mpn_sqr_basecase for its
   allowed sizes, but its own code for larger sizes.  */
static void
mpn_local_sqr (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tp)
{
  mp_size_t i;

  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (rp, 2 * n, up, n));

  if (BELOW_THRESHOLD (n, SQR_BASECASE_LIM))
    {
      mpn_sqr_basecase (rp, up, n);
      return;
    }

  {
    mp_limb_t ul, lpl;
    ul = up[0];
    umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
    rp[0] = lpl >> GMP_NAIL_BITS;
  }
  if (n > 1)
    {
      mp_limb_t cy;

      cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
      tp[n - 1] = cy;
      for (i = 2; i < n; i++)
        {
          mp_limb_t cy;
          cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
          tp[n + i - 2] = cy;
        }
      MPN_SQR_DIAGONAL (rp + 2, up + 1, n - 1);

      {
        mp_limb_t cy;
#if HAVE_NATIVE_mpn_addlsh1_n
        cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);
#else
        cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
        cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);
#endif
        rp[2 * n - 1] += cy;
      }
    }
}
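/* A self-contained sketch (illustration only, not GMP code) of the scheme
   mpn_local_sqr uses: with 8-bit "limbs" and wide accumulators, the square
   of an n-limb number is the diagonal terms u[i]^2 plus twice the triangle
   of cross products u[i]*u[j], i < j.  mpn_local_sqr computes the triangle
   with mpn_mul_1/mpn_addmul_1 into tp, the diagonal with MPN_SQR_DIAGONAL,
   and folds them together with addlsh1 (or lshift + add).  */
#if 0   /* example only, not compiled with the library sources */
#include <stdint.h>

static void
toy_sqr (uint8_t *rp, const uint8_t *up, int n)  /* rp holds 2*n limbs, n <= 8 */
{
  unsigned tp[16] = { 0 };   /* off-diagonal triangle, wide to absorb carries */
  unsigned acc[16] = { 0 };
  int i, j, k;

  for (i = 0; i < n; i++)               /* cross products u[i]*u[j], i < j */
    for (j = i + 1; j < n; j++)
      tp[i + j] += (unsigned) up[i] * up[j];
  for (i = 0; i < n; i++)               /* diagonal squares u[i]^2 */
    acc[2 * i] += (unsigned) up[i] * up[i];
  for (k = 0; k < 2 * n; k++)           /* square = diagonal + 2 * triangle */
    acc[k] += 2 * tp[k];
  for (k = 0; k < 2 * n; k++)           /* propagate carries, base 2^8 */
    {
      if (k + 1 < 2 * n)
        acc[k + 1] += acc[k] >> 8;
      rp[k] = (uint8_t) acc[k];
    }
}
#endif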
void
mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
{
  mp_limb_t w, w0, w1;
  mp_size_t n2;
  mp_srcptr x, y;
  mp_size_t i;

  n2 = n >> 1;
  ASSERT (n2 > 0);

  if ((n & 1) != 0)
    {
      /* Odd length. */
      mp_size_t n1, n3, nm1;

      n3 = n - n2;

      w = a[n2];
      if (w != 0)
        w -= mpn_sub_n (p, a, a + n3, n2);
      else
        {
          i = n2;
          do
            {
              --i;
              w0 = a[i];
              w1 = a[n3 + i];
            }
          while (w0 == w1 && i != 0);
          if (w0 < w1)
            {
              x = a + n3;
              y = a;
            }
          else
            {
              x = a;
              y = a + n3;
            }
          mpn_sub_n (p, x, y, n2);
        }
      p[n2] = w;

      n1 = n + 1;

      /* n2 is always either n3 or n3-1 so maybe the two sets of tests here
         could be combined.  But that's not important, since the tests will
         take a minuscule amount of time compared to the function calls.  */
      if (BELOW_THRESHOLD (n3, SQR_BASECASE_THRESHOLD))
        {
          mpn_mul_basecase (ws, p, n3, p, n3);
          mpn_mul_basecase (p,  a, n3, a, n3);
        }
      else if (BELOW_THRESHOLD (n3, SQR_KARATSUBA_THRESHOLD))
        {
          mpn_sqr_basecase (ws, p, n3);
          mpn_sqr_basecase (p,  a, n3);
        }
      else
        {
          mpn_kara_sqr_n (ws, p, n3, ws + n1);   /* (x-y)^2 */
          mpn_kara_sqr_n (p,  a, n3, ws + n1);   /* x^2 */
        }
      if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
        mpn_mul_basecase (p + n1, a + n3, n2, a + n3, n2);
      else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
        mpn_sqr_basecase (p + n1, a + n3, n2);
      else
        mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1);   /* y^2 */

      /* Since x^2+y^2-(x-y)^2 = 2xy >= 0 there's no need to track the
         borrow from mpn_sub_n.  If it occurs then it'll be cancelled by a
         carry from ws[n].  Further, since 2xy fits in n1 limbs there won't
         be any carry out of ws[n] other than cancelling that borrow.  */

      mpn_sub_n (ws, p, ws, n1);             /* x^2-(x-y)^2 */

      nm1 = n - 1;
      if (mpn_add_n (ws, p + n1, ws, nm1))   /* x^2+y^2-(x-y)^2 = 2xy */
        {
          mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
          ws[nm1] = x;
          if (x == 0)
            ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
        }
      if (mpn_add_n (p + n3, p + n3, ws, n1))
        {
          mpn_incr_u (p + n1 + n3, 1);
        }
    }
  else
    {
      /* Even length. */
      i = n2;
      do
        {
          --i;
          w0 = a[i];
          w1 = a[n2 + i];
        }
      while (w0 == w1 && i != 0);
      if (w0 < w1)
        {
          x = a + n2;
          y = a;
        }
      else
        {
          x = a;
          y = a + n2;
        }
      mpn_sub_n (p, x, y, n2);

      /* Pointwise products. */
      if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
        {
          mpn_mul_basecase (ws,    p,      n2, p,      n2);
          mpn_mul_basecase (p,     a,      n2, a,      n2);
          mpn_mul_basecase (p + n, a + n2, n2, a + n2, n2);
        }
      else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
        {
          mpn_sqr_basecase (ws,    p,      n2);
          mpn_sqr_basecase (p,     a,      n2);
          mpn_sqr_basecase (p + n, a + n2, n2);
        }
      else
        {
          mpn_kara_sqr_n (ws,    p,      n2, ws + n);
          mpn_kara_sqr_n (p,     a,      n2, ws + n);
          mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
        }

      /* Interpolate. */
      w = -mpn_sub_n (ws, p, ws, n);
      w += mpn_add_n (ws, p + n, ws, n);
      w += mpn_add_n (p + n2, p + n2, ws, n);
      MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
    }
}
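/* The checks below apparently exercise fat-binary dispatch: __gmpn_cpuvec is
   reset to initial_cpuvec before each test, and each function is called
   twice, so the first call goes through the CPU-detecting resolver that
   patches the function table and the second call runs the resolved native
   routine; both calls are verified against known small results.  (This
   reading of the i < 2 loops is an inference, not taken from the original
   comments.)  */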
void
check_functions (void)
{
  mp_limb_t  wp[2], wp2[2], xp[2], yp[2], r;
  int  i;

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 123;
      yp[0] = 456;
      mpn_add_n (wp, xp, yp, (mp_size_t) 1);
      ASSERT_ALWAYS (wp[0] == 579);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 123;
      wp[0] = 456;
      r = mpn_addmul_1 (wp, xp, (mp_size_t) 1, CNST_LIMB(2));
      ASSERT_ALWAYS (wp[0] == 702);
      ASSERT_ALWAYS (r == 0);
    }

#if HAVE_NATIVE_mpn_copyd
  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 123;
      xp[1] = 456;
      mpn_copyd (xp + 1, xp, (mp_size_t) 1);
      ASSERT_ALWAYS (xp[1] == 123);
    }
#endif

#if HAVE_NATIVE_mpn_copyi
  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 123;
      xp[1] = 456;
      mpn_copyi (xp, xp + 1, (mp_size_t) 1);
      ASSERT_ALWAYS (xp[0] == 456);
    }
#endif

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 1605;
      mpn_divexact_1 (wp, xp, (mp_size_t) 1, CNST_LIMB(5));
      ASSERT_ALWAYS (wp[0] == 321);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 1296;
      r = mpn_divexact_by3c (wp, xp, (mp_size_t) 1, CNST_LIMB(0));
      ASSERT_ALWAYS (wp[0] == 432);
      ASSERT_ALWAYS (r == 0);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 578;
      r = mpn_divexact_byfobm1 (wp, xp, (mp_size_t) 1, CNST_LIMB(17),
                                CNST_LIMB(-1) / CNST_LIMB(17));
      ASSERT_ALWAYS (wp[0] == 34);
      ASSERT_ALWAYS (r == 0);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 287;
      r = mpn_divrem_1 (wp, (mp_size_t) 1, xp, (mp_size_t) 1, CNST_LIMB(7));
      ASSERT_ALWAYS (wp[1] == 41);
      ASSERT_ALWAYS (wp[0] == 0);
      ASSERT_ALWAYS (r == 0);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 290;
      r = mpn_divrem_euclidean_qr_1 (wp, 0, xp, (mp_size_t) 1, CNST_LIMB(7));
      ASSERT_ALWAYS (wp[0] == 41);
      ASSERT_ALWAYS (r == 3);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 12;
      r = mpn_gcd_1 (xp, (mp_size_t) 1, CNST_LIMB(9));
      ASSERT_ALWAYS (r == 3);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 0x1001;
      mpn_lshift (wp, xp, (mp_size_t) 1, 1);
      ASSERT_ALWAYS (wp[0] == 0x2002);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 14;
      r = mpn_mod_1 (xp, (mp_size_t) 1, CNST_LIMB(4));
      ASSERT_ALWAYS (r == 2);
    }

#if (GMP_NUMB_BITS % 4) == 0
  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      int  bits = (GMP_NUMB_BITS / 4) * 3;
      mp_limb_t  mod = (CNST_LIMB(1) << bits) - 1;
      mp_limb_t  want = GMP_NUMB_MAX % mod;
      xp[0] = GMP_NUMB_MAX;
      r = mpn_mod_34lsub1 (xp, (mp_size_t) 1);
      ASSERT_ALWAYS (r % mod == want);
    }
#endif

  /* DECL_modexact_1c_odd ((*modexact_1c_odd)); */

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 14;
      r = mpn_mul_1 (wp, xp, (mp_size_t) 1, CNST_LIMB(4));
      ASSERT_ALWAYS (wp[0] == 56);
      ASSERT_ALWAYS (r == 0);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 5;
      yp[0] = 7;
      mpn_mul_basecase (wp, xp, (mp_size_t) 1, yp, (mp_size_t) 1);
      ASSERT_ALWAYS (wp[0] == 35);
      ASSERT_ALWAYS (wp[1] == 0);
    }

#if HAVE_NATIVE_mpn_preinv_divrem_1 && GMP_NAIL_BITS == 0
  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 0x101;
      r = mpn_preinv_divrem_1 (wp, (mp_size_t) 1, xp, (mp_size_t) 1,
                               GMP_LIMB_HIGHBIT,
                               refmpn_invert_limb (GMP_LIMB_HIGHBIT), 0);
      ASSERT_ALWAYS (wp[0] == 0x202);
      ASSERT_ALWAYS (wp[1] == 0);
      ASSERT_ALWAYS (r == 0);
    }
#endif

#if GMP_NAIL_BITS == 0
  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = GMP_LIMB_HIGHBIT + 123;
      r = mpn_preinv_mod_1 (xp, (mp_size_t) 1, GMP_LIMB_HIGHBIT,
                            refmpn_invert_limb (GMP_LIMB_HIGHBIT));
      ASSERT_ALWAYS (r == 123);
    }
#endif

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 5;
      modlimb_invert (r, xp[0]);
      r = -r;
      yp[0] = 43;
      yp[1] = 75;
      mpn_redc_1 (wp, yp, xp, (mp_size_t) 1, r);
      ASSERT_ALWAYS (wp[0] == 78);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 5;
      yp[0] = 3;
      mpn_sumdiff_n (wp, wp2, xp, yp, 1);
      ASSERT_ALWAYS (wp[0] == 8);
      ASSERT_ALWAYS (wp2[0] == 2);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 0x8008;
      mpn_rshift (wp, xp, (mp_size_t) 1, 1);
      ASSERT_ALWAYS (wp[0] == 0x4004);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 5;
      mpn_sqr_basecase (wp, xp, (mp_size_t) 1);
      ASSERT_ALWAYS (wp[0] == 25);
      ASSERT_ALWAYS (wp[1] == 0);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 999;
      yp[0] = 666;
      mpn_sub_n (wp, xp, yp, (mp_size_t) 1);
      ASSERT_ALWAYS (wp[0] == 333);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 123;
      wp[0] = 456;
      r = mpn_submul_1 (wp, xp, (mp_size_t) 1, CNST_LIMB(2));
      ASSERT_ALWAYS (wp[0] == 210);
      ASSERT_ALWAYS (r == 0);
    }
}
void
check (void)
{
  mp_limb_t  wp[100], xp[100], yp[100];
  mp_size_t  size = 100;

  refmpn_zero (xp, size);
  refmpn_zero (yp, size);
  refmpn_zero (wp, size);

  pre ("mpn_add_n");
  mpn_add_n (wp, xp, yp, size);
  post ();

#if HAVE_NATIVE_mpn_add_nc
  pre ("mpn_add_nc");
  mpn_add_nc (wp, xp, yp, size, CNST_LIMB(0));
  post ();
#endif

#if HAVE_NATIVE_mpn_addlsh1_n
  pre ("mpn_addlsh1_n");
  mpn_addlsh1_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_and_n
  pre ("mpn_and_n");
  mpn_and_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_andn_n
  pre ("mpn_andn_n");
  mpn_andn_n (wp, xp, yp, size);
  post ();
#endif

  pre ("mpn_addmul_1");
  mpn_addmul_1 (wp, xp, size, yp[0]);
  post ();

#if HAVE_NATIVE_mpn_addmul_1c
  pre ("mpn_addmul_1c");
  mpn_addmul_1c (wp, xp, size, yp[0], CNST_LIMB(0));
  post ();
#endif

#if HAVE_NATIVE_mpn_com_n
  pre ("mpn_com_n");
  mpn_com_n (wp, xp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_copyd
  pre ("mpn_copyd");
  mpn_copyd (wp, xp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_copyi
  pre ("mpn_copyi");
  mpn_copyi (wp, xp, size);
  post ();
#endif

  pre ("mpn_divexact_1");
  mpn_divexact_1 (wp, xp, size, CNST_LIMB(123));
  post ();

  pre ("mpn_divexact_by3c");
  mpn_divexact_by3c (wp, xp, size, CNST_LIMB(0));
  post ();

  pre ("mpn_divrem_1");
  mpn_divrem_1 (wp, (mp_size_t) 0, xp, size, CNST_LIMB(123));
  post ();

#if HAVE_NATIVE_mpn_divrem_1c
  pre ("mpn_divrem_1c");
  mpn_divrem_1c (wp, (mp_size_t) 0, xp, size, CNST_LIMB(123), CNST_LIMB(122));
  post ();
#endif

  pre ("mpn_gcd_1");
  xp[0] |= 1;
  notdead += (unsigned long) mpn_gcd_1 (xp, size, CNST_LIMB(123));
  post ();

#if HAVE_NATIVE_mpn_gcd_finda
  pre ("mpn_gcd_finda");
  xp[0] |= 1;
  xp[1] |= 1;
  notdead += mpn_gcd_finda (xp);
  post ();
#endif

  pre ("mpn_hamdist");
  notdead += mpn_hamdist (xp, yp, size);
  post ();

#if HAVE_NATIVE_mpn_ior_n
  pre ("mpn_ior_n");
  mpn_ior_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_iorn_n
  pre ("mpn_iorn_n");
  mpn_iorn_n (wp, xp, yp, size);
  post ();
#endif

  pre ("mpn_lshift");
  mpn_lshift (wp, xp, size, 1);
  post ();

  pre ("mpn_mod_1");
  notdead += mpn_mod_1 (xp, size, CNST_LIMB(123));
  post ();

#if HAVE_NATIVE_mpn_mod_1c
  pre ("mpn_mod_1c");
  notdead += mpn_mod_1c (xp, size, CNST_LIMB(123), CNST_LIMB(122));
  post ();
#endif

#if GMP_NUMB_BITS % 4 == 0
  pre ("mpn_mod_34lsub1");
  notdead += mpn_mod_34lsub1 (xp, size);
  post ();
#endif

  pre ("mpn_modexact_1_odd");
  notdead += mpn_modexact_1_odd (xp, size, CNST_LIMB(123));
  post ();

  pre ("mpn_modexact_1c_odd");
  notdead += mpn_modexact_1c_odd (xp, size, CNST_LIMB(123), CNST_LIMB(456));
  post ();

  pre ("mpn_mul_1");
  mpn_mul_1 (wp, xp, size, yp[0]);
  post ();

#if HAVE_NATIVE_mpn_mul_1c
  pre ("mpn_mul_1c");
  mpn_mul_1c (wp, xp, size, yp[0], CNST_LIMB(0));
  post ();
#endif

#if HAVE_NATIVE_mpn_mul_2
  pre ("mpn_mul_2");
  mpn_mul_2 (wp, xp, size - 1, yp);
  post ();
#endif

  pre ("mpn_mul_basecase");
  mpn_mul_basecase (wp, xp, (mp_size_t) 3, yp, (mp_size_t) 3);
  post ();

#if HAVE_NATIVE_mpn_nand_n
  pre ("mpn_nand_n");
  mpn_nand_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_nior_n
  pre ("mpn_nior_n");
  mpn_nior_n (wp, xp, yp, size);
  post ();
#endif

  pre ("mpn_popcount");
  notdead += mpn_popcount (xp, size);
  post ();

  pre ("mpn_preinv_mod_1");
  notdead += mpn_preinv_mod_1 (xp, size, GMP_NUMB_MAX,
                               refmpn_invert_limb (GMP_NUMB_MAX));
  post ();

#if USE_PREINV_DIVREM_1 || HAVE_NATIVE_mpn_preinv_divrem_1
  pre ("mpn_preinv_divrem_1");
  mpn_preinv_divrem_1 (wp, (mp_size_t) 0, xp, size, GMP_NUMB_MAX,
                       refmpn_invert_limb (GMP_NUMB_MAX), 0);
  post ();
#endif

#if HAVE_NATIVE_mpn_rsh1add_n
  pre ("mpn_rsh1add_n");
  mpn_rsh1add_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_rsh1sub_n
  pre ("mpn_rsh1sub_n");
  mpn_rsh1sub_n (wp, xp, yp, size);
  post ();
#endif

  pre ("mpn_rshift");
  mpn_rshift (wp, xp, size, 1);
  post ();

  pre ("mpn_sqr_basecase");
  mpn_sqr_basecase (wp, xp, (mp_size_t) 3);
  post ();

  pre ("mpn_submul_1");
  mpn_submul_1 (wp, xp, size, yp[0]);
  post ();

#if HAVE_NATIVE_mpn_submul_1c
  pre ("mpn_submul_1c");
  mpn_submul_1c (wp, xp, size, yp[0], CNST_LIMB(0));
  post ();
#endif

  pre ("mpn_sub_n");
  mpn_sub_n (wp, xp, yp, size);
  post ();

#if HAVE_NATIVE_mpn_sub_nc
  pre ("mpn_sub_nc");
  mpn_sub_nc (wp, xp, yp, size, CNST_LIMB(0));
  post ();
#endif

#if HAVE_NATIVE_mpn_sublsh1_n
  pre ("mpn_sublsh1_n");
  mpn_sublsh1_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_udiv_qrnnd
  pre ("mpn_udiv_qrnnd");
  mpn_udiv_qrnnd (&wp[0], CNST_LIMB(122), xp[0], CNST_LIMB(123));
  post ();
#endif

#if HAVE_NATIVE_mpn_udiv_qrnnd_r
  pre ("mpn_udiv_qrnnd_r");
  mpn_udiv_qrnnd_r (CNST_LIMB(122), xp[0], CNST_LIMB(123), &wp[0]);
  post ();
#endif

#if HAVE_NATIVE_mpn_umul_ppmm
  pre ("mpn_umul_ppmm");
  mpn_umul_ppmm (&wp[0], xp[0], yp[0]);
  post ();
#endif

#if HAVE_NATIVE_mpn_umul_ppmm_r
  pre ("mpn_umul_ppmm_r");
  mpn_umul_ppmm_r (xp[0], yp[0], &wp[0]);
  post ();
#endif

#if HAVE_NATIVE_mpn_xor_n
  pre ("mpn_xor_n");
  mpn_xor_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_xnor_n
  pre ("mpn_xnor_n");
  mpn_xnor_n (wp, xp, yp, size);
  post ();
#endif
}
void
mpn_sqr (mp_ptr p, mp_srcptr a, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));

  if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
    { /* mul_basecase is sometimes faster than sqr_basecase on small sizes */
      mpn_mul_basecase (p, a, n, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))
    {
      mpn_sqr_basecase (p, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast! */
      mp_limb_t ws[mpn_toom2_sqr_itch (SQR_TOOM3_THRESHOLD_LIMIT - 1)];
      ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT);
      mpn_toom2_sqr (p, a, n, ws);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (mpn_toom3_sqr_itch (n));
      mpn_toom3_sqr (p, a, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD))
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (mpn_toom4_sqr_itch (n));
      mpn_toom4_sqr (p, a, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD))
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (mpn_toom6_sqr_itch (n));
      mpn_toom6_sqr (p, a, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, SQR_FFT_THRESHOLD))
    {
      mp_ptr ws;
      TMP_DECL;
      TMP_MARK;
      ws = TMP_ALLOC_LIMBS (mpn_toom8_sqr_itch (n));
      mpn_toom8_sqr (p, a, n, ws);
      TMP_FREE;
    }
  else
    {
      /* The current FFT code allocates its own space.  That should probably
         change.  */
      mpn_fft_mul (p, a, n, a, n);
    }
}
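/* A minimal usage sketch (hypothetical caller, not part of the library):
   mpn_sqr is an internal entry point declared in gmp-impl.h.  The product
   area must hold 2*n limbs and must not overlap the input.  */
#if 0   /* example only, not compiled with the library sources */
static void
demo_sqr (void)
{
  mp_limb_t a[4] = { 4, 3, 2, 1 };   /* least significant limb first */
  mp_limb_t p[8];                    /* 2*n limbs for the square */
  mpn_sqr (p, a, (mp_size_t) 4);
}
#endif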