/* Put in rp[0..n] the n+1 low limbs of {np, n} * {mp, n}. Assume 2n limbs are allocated at rp. */ void mpfr_mullow_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp, mp_size_t n) { mp_size_t k; MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */ k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4); MPFR_ASSERTD (k == -1 || k == 0 || (2 * k >= n && k < n)); if (k < 0) mpn_mul_basecase (rp, np, n, mp, n); else if (k == 0) mpfr_mullow_n_basecase (rp, np, mp, n); else if (n > MUL_FFT_THRESHOLD) mpn_mul_n (rp, np, mp, n); else { mp_size_t l = n - k; mpn_mul_n (rp, np, mp, k); /* fills rp[0..2k-1] */ mpfr_mullow_n (rp + n, np + k, mp, l); /* fills rp[n..n+2l-1] */ mpn_add_n (rp + k, rp + k, rp + n, l + 1); mpfr_mullow_n (rp + n, np, mp + k, l); /* fills rp[n..n+2l-1] */ mpn_add_n (rp + k, rp + k, rp + n, l + 1); } }
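/* Illustrative aside (not MPFR code): with 32-bit limbs and base B = 2^32, the N low limbs of a product are determined by the partial products x[i]*y[j] with i + j < N alone, which is the fact a low short product exploits. The function name and the size N below are made up for the demonstration. */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define N 8

/* schoolbook product of the N-limb x and y; if low_only is nonzero, only
   the partial products with i + j < N are accumulated and only the low N
   limbs of the result are kept */
static void toy_mul (uint32_t r[2 * N], const uint32_t x[N],
                     const uint32_t y[N], int low_only)
{
  int i, j, lim;
  for (i = 0; i < 2 * N; i++)
    r[i] = 0;
  for (i = 0; i < N; i++)
    {
      uint64_t carry = 0;
      lim = low_only ? N - i : N;
      for (j = 0; j < lim; j++)
        {
          uint64_t t = (uint64_t) x[i] * y[j] + r[i + j] + carry;
          r[i + j] = (uint32_t) t;
          carry = t >> 32;
        }
      if (!low_only)
        r[i + N] = (uint32_t) carry;   /* high limb; discarded in low mode */
    }
}

int main (void)
{
  uint32_t x[N], y[N], full[2 * N], low[2 * N];
  int i, iter;
  srand (1);
  for (iter = 0; iter < 1000; iter++)
    {
      for (i = 0; i < N; i++)
        { x[i] = (uint32_t) rand (); y[i] = (uint32_t) rand (); }
      toy_mul (full, x, y, 0);
      toy_mul (low, x, y, 1);
      for (i = 0; i < N; i++)
        assert (low[i] == full[i]);    /* the N low limbs agree exactly */
    }
  return 0;
}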
/* Put in {rp, 2n} the product {xp, n} * {yp, n}, with the high half rp[n..2n-1] computed exactly: small sizes use mpn_mul_basecase and large sizes mpn_mul_n directly; in between a short product is used and, if its low-part error (at most n - 2 ulps of rp[n-1]) could carry into rp[n], the full product is recomputed. */ void mpn_mulhigh_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) { mp_limb_t t; ASSERT(n > 0); ASSERT_MPN(xp, n); ASSERT_MPN(yp, n); ASSERT(!MPN_OVERLAP_P(rp, 2 * n, xp, n)); ASSERT(!MPN_OVERLAP_P(rp, 2 * n, yp, n)); if (BELOW_THRESHOLD(n, MULHIGH_BASECASE_THRESHOLD)) { mpn_mul_basecase(rp, xp, n, yp, n); return; } if (ABOVE_THRESHOLD (n, MULHIGH_MUL_THRESHOLD)) { mpn_mul_n(rp, xp, yp, n); return; } mpn_mulshort_n(rp, xp, yp, n); t = rp[n - 1] + n - 2; if (UNLIKELY(t < n - 2)) mpn_mul_n(rp, xp, yp, n); return; }
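/* Illustrative aside (not MPIR code): the fallback test above, t = rp[n-1] + n - 2 followed by t < n - 2, is the standard unsigned wrap-around check. For unsigned limbs, a + b overflows exactly when the wrapped sum is smaller than one of the addends, which is how the code decides whether the short product's error bound could spill a carry into rp[n]. A minimal demonstration: */
#include <assert.h>
#include <stdint.h>

static int add_would_overflow (uint64_t a, uint64_t b)
{
  uint64_t t = a + b;          /* wraps modulo 2^64 on overflow */
  return t < b;                /* true iff a + b >= 2^64 */
}

int main (void)
{
  assert (!add_would_overflow (5, 7));
  assert (add_would_overflow (UINT64_MAX, 1));
  assert (add_would_overflow (UINT64_MAX - 3, 6));
  assert (!add_would_overflow (UINT64_MAX - 6, 6));
  return 0;
}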
/* Put in rp[n..2n-1] an approximation of the n high limbs of {np, n} * {mp, n}. The error is less than n ulps of rp[n] (and the approximation is always less or equal to the truncated full product). Implements Algorithm ShortMul from [1]. */ void mpfr_mulhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp, mp_size_t n) { mp_size_t k; MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */ k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4); /* Algorithm ShortMul from [1] requires k >= (n+3)/2, which translates into k >= (n+4)/2 in the C language. */ MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n)); if (k < 0) mpn_mul_basecase (rp, np, n, mp, n); /* result is exact, no error */ else if (k == 0) mpfr_mulhigh_n_basecase (rp, np, mp, n); /* basecase error < n ulps */ else if (n > MUL_FFT_THRESHOLD) mpn_mul_n (rp, np, mp, n); /* result is exact, no error */ else { mp_size_t l = n - k; mp_limb_t cy; mpn_mul_n (rp + 2 * l, np + l, mp + l, k); /* fills rp[2l..2n-1] */ mpfr_mulhigh_n (rp, np + k, mp, l); /* fills rp[l-1..2l-1] */ cy = mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1); mpfr_mulhigh_n (rp, np, mp + k, l); /* fills rp[l-1..2l-1] */ cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1); mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */ } }
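/* Illustrative aside (not MPFR code): a toy check of the "error less than n ulps of rp[n], never above the truncated product" property, mirroring the idea of the ShortMul algorithm referenced above. The toy basecase keeps only the partial products x[i]*y[j] with i + j >= N - 1; all names and the size N are made up. */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define N 8

/* schoolbook product into 2N limbs; if short_only is nonzero, partial
   products with i + j < N - 1 are skipped (basecase short product) */
static void toy_mul (uint32_t r[2 * N], const uint32_t x[N],
                     const uint32_t y[N], int short_only)
{
  int i, j;
  for (i = 0; i < 2 * N; i++)
    r[i] = 0;
  for (i = 0; i < N; i++)
    {
      uint64_t carry = 0;
      j = (short_only && N - 1 - i > 0) ? N - 1 - i : 0;
      for (; j < N; j++)
        {
          uint64_t t = (uint64_t) x[i] * y[j] + r[i + j] + carry;
          r[i + j] = (uint32_t) t;
          carry = t >> 32;
        }
      r[i + N] = (uint32_t) carry;
    }
}

int main (void)
{
  uint32_t x[N], y[N], full[2 * N], shortp[2 * N], diff[2 * N];
  int i, iter;
  srand (1);
  for (iter = 0; iter < 1000; iter++)
    {
      uint64_t borrow = 0;
      for (i = 0; i < N; i++)
        { x[i] = (uint32_t) rand (); y[i] = (uint32_t) rand (); }
      toy_mul (full, x, y, 0);
      toy_mul (shortp, x, y, 1);
      /* diff = full - shortp, with borrow propagation */
      for (i = 0; i < 2 * N; i++)
        {
          uint64_t t = (uint64_t) full[i] - shortp[i] - borrow;
          diff[i] = (uint32_t) t;
          borrow = (t >> 32) & 1;
        }
      assert (borrow == 0);            /* short product <= true product */
      for (i = N + 1; i < 2 * N; i++)
        assert (diff[i] == 0);         /* error is below N * B^N, ... */
      assert (diff[N] < N);            /* ... i.e. < N ulps of limb N */
    }
  return 0;
}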
void mpn_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n) { ASSERT (n >= 1); ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n)); #if 0 /* FIXME: Can this be removed? */ if (n == 0) return; #endif if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */ mpn_mul_basecase (p, a, n, a, n); } else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD)) { mpn_sqr_basecase (p, a, n); } else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) { /* Allocate workspace of fixed size on stack: fast! */ mp_limb_t ws[MPN_KARA_SQR_N_TSIZE (SQR_TOOM3_THRESHOLD_LIMIT-1)]; ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT); mpn_kara_sqr_n (p, a, n, ws); } #if WANT_FFT || TUNE_PROGRAM_BUILD else if (BELOW_THRESHOLD (n, SQR_FFT_THRESHOLD)) #else else if (BELOW_THRESHOLD (n, MPN_TOOM3_MAX_N)) #endif { mp_ptr ws; TMP_SDECL; TMP_SMARK; ws = TMP_SALLOC_LIMBS (MPN_TOOM3_SQR_N_TSIZE (n)); mpn_toom3_sqr_n (p, a, n, ws); TMP_SFREE; } else #if WANT_FFT || TUNE_PROGRAM_BUILD { /* The current FFT code allocates its own space. That should probably change. */ mpn_mul_fft_full (p, a, n, a, n); } #else { /* Toom3 for large operands. Use workspace from the heap, as stack space may be limited. Since n is at least MUL_TOOM3_THRESHOLD, multiplication will take much longer than malloc()/free(). */ mp_ptr ws; mp_size_t ws_size; ws_size = MPN_TOOM3_SQR_N_TSIZE (n); ws = __GMP_ALLOCATE_FUNC_LIMBS (ws_size); mpn_toom3_sqr_n (p, a, n, ws); __GMP_FREE_FUNC_LIMBS (ws, ws_size); } #endif }
void mpn_sec_mul (mp_ptr rp, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp) { mpn_mul_basecase (rp, ap, an, bp, bn); }
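/* Illustrative aside: mpn_sec_mul delegates to mpn_mul_basecase, whose running time depends only on the operand sizes, not their values. A minimal usage sketch of the public side-channel-silent interface (mpn_sec_mul / mpn_sec_mul_itch in gmp.h, GMP >= 6), assuming a build with 64-bit limbs and no nails: */
#include <assert.h>
#include <stdlib.h>
#include <gmp.h>

int main (void)
{
  mp_limb_t ap[2] = { 5, 1 };                    /* {ap,2} = 2^64 + 5 */
  mp_limb_t bp[1] = { 3 };                       /* {bp,1} = 3        */
  mp_limb_t rp[3];                               /* needs an + bn limbs */
  mp_size_t itch = mpn_sec_mul_itch (2, 1);      /* scratch size in limbs */
  /* +1 so the pointer is non-NULL even when no scratch is needed */
  mp_limb_t *tp = malloc ((itch + 1) * sizeof (mp_limb_t));

  mpn_sec_mul (rp, ap, 2, bp, 1, tp);            /* rp = ap * bp */
  /* 3 * (2^64 + 5) = 3*2^64 + 15 */
  assert (rp[0] == 15 && rp[1] == 3 && rp[2] == 0);
  free (tp);
  return 0;
}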
// (rp,2n) = (xp,n)*(yp,n) with temp space (tp, 2*n + C)
void
mpn_kara_mul_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp)
{
  mp_size_t n2, n3;
  mp_srcptr xl, yl, xh, yh;
  mp_ptr dx, dy;
  int suboradd;
  mp_limb_t c;

  n2 = n >> 1;
  suboradd = -1;
  xl = xp; xh = xp + n2;
  yl = yp; yh = yp + n2;
  n3 = n - n2;
  dx = rp + 2 * n2;
  dy = dx + n3;

  if ((n & 1) == 0)
    {
      if (mpn_cmp (xh, xl, n2) >= 0)
        mpn_sub_n (dx, xh, xl, n2);
      else
        { mpn_sub_n (dx, xl, xh, n2); suboradd = -suboradd; }
      if (mpn_cmp (yh, yl, n2) >= 0)
        mpn_sub_n (dy, yh, yl, n2);
      else
        { mpn_sub_n (dy, yl, yh, n2); suboradd = -suboradd; }
    }
  else
    {
      if (xh[n2] != 0 || mpn_cmp (xh, xl, n2) >= 0)
        { c = mpn_sub_n (dx, xh, xl, n2); dx[n2] = xh[n2] - c; }
      else
        { mpn_sub_n (dx, xl, xh, n2); dx[n2] = 0; suboradd = -suboradd; }
      if (yh[n2] != 0 || mpn_cmp (yh, yl, n2) >= 0)
        { c = mpn_sub_n (dy, yh, yl, n2); dy[n2] = yh[n2] - c; }
      else
        { mpn_sub_n (dy, yl, yh, n2); dy[n2] = 0; suboradd = -suboradd; }
    }

  if (BELOW_THRESHOLD (n3, MUL_KARATSUBA_THRESHOLD))
    {
      mpn_mul_basecase (rp, xl, n2, yl, n2);
      mpn_mul_basecase (tp, dx, n3, dy, n3);
      mpn_mul_basecase (rp + 2 * n2, xh, n3, yh, n3);
    }
  else
    {
      mpn_kara_mul_n (rp, xl, yl, n2, tp + 2 * n3);
      mpn_kara_mul_n (tp, dx, dy, n3, tp + 2 * n3);
      mpn_kara_mul_n (rp + 2 * n2, xh, yh, n3, tp + 2 * n3);
    }

  if (suboradd == -1)
    mpn_karasub (rp, tp, n);
  else
    mpn_karaadd (rp, tp, n);
  return;
}
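/* Illustrative aside (not MPIR code): the "suboradd" bookkeeping above implements the subtractive Karatsuba identity xh*yl + xl*yh = xh*yh + xl*yl - (xh - xl)*(yh - yl), with absolute differences so no intermediate goes negative. A toy check on one 64-bit word split into 32-bit halves (assumes a compiler with unsigned __int128, e.g. GCC/Clang): */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

int main (void)
{
  int iter;
  srand (2);
  for (iter = 0; iter < 100000; iter++)
    {
      uint64_t x = ((uint64_t) rand () << 32) ^ (uint64_t) rand ();
      uint64_t y = ((uint64_t) rand () << 32) ^ (uint64_t) rand ();
      uint64_t xl = x & 0xffffffffu, xh = x >> 32;
      uint64_t yl = y & 0xffffffffu, yh = y >> 32;

      /* the three "pointwise" products */
      uint64_t hh = xh * yh, ll = xl * yl;
      uint64_t dx = xh >= xl ? xh - xl : xl - xh;
      uint64_t dy = yh >= yl ? yh - yl : yl - yh;
      uint64_t dd = dx * dy;
      /* sign of (xh - xl)*(yh - yl): add dd when the differences have
         opposite signs, subtract it otherwise (mirrors "suboradd") */
      int opposite = (xh >= xl) != (yh >= yl);

      unsigned __int128 mid = (unsigned __int128) hh + ll;
      mid = opposite ? mid + dd : mid - dd;     /* = xh*yl + xl*yh */

      unsigned __int128 prod =
        ((unsigned __int128) hh << 64) + (mid << 32) + ll;
      assert (prod == (unsigned __int128) x * y);
    }
  return 0;
}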
void mpn_sqr (mp_ptr p, mp_srcptr a, mp_size_t n) { ASSERT (n >= 1); ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n)); #if 0 /* FIXME: Can this be removed? */ if (n == 0) return; #endif if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */ mpn_mul_basecase (p, a, n, a, n); } else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD)) { mpn_sqr_basecase (p, a, n); } else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) { /* Allocate workspace of fixed size on stack: fast! */ mp_limb_t ws[MPN_KARA_SQR_N_TSIZE (SQR_TOOM3_THRESHOLD_LIMIT-1)]; ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT); mpn_kara_sqr_n (p, a, n, ws); } else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD)) { mp_ptr ws; TMP_SDECL; TMP_SMARK; ws = TMP_SALLOC_LIMBS (MPN_TOOM3_SQR_N_TSIZE (n)); mpn_toom3_sqr_n (p, a, n, ws); TMP_SFREE; } else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD)) { mpn_toom4_sqr_n (p, a, n); } #if WANT_FFT || TUNE_PROGRAM_BUILD else if (BELOW_THRESHOLD (n, SQR_FFT_FULL_THRESHOLD)) #else else #endif { mpn_toom8_sqr_n (p, a, n); } #if WANT_FFT || TUNE_PROGRAM_BUILD else { /* The body of this final branch was truncated in the original; presumably it is the full-FFT squaring path, analogous to the FFT branch of mpn_mul_n below (the exact call is an assumption). */ mpn_mul_fft_main (p, a, n, a, n); } #endif }
void mpn_sec_sqr (mp_ptr rp, mp_srcptr ap, mp_size_t an, mp_ptr tp) { #ifndef SQR_BASECASE_LIM /* If SQR_BASECASE_LIM is not defined, mpn_sqr_basecase works for any operand size, so use it. */ mpn_sqr_basecase (rp, ap, an); #else /* Otherwise mpn_sqr_basecase is only guaranteed up to that size limit, so fall back to mpn_mul_basecase, which handles any size. */ mpn_mul_basecase (rp, ap, an, ap, an); #endif }
void mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n) { ASSERT (n >= 1); ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n)); ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n)); if (BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD)) { mpn_mul_basecase (p, a, n, b, n); } else if (BELOW_THRESHOLD (n, MUL_TOOM3_THRESHOLD)) { /* Allocate workspace of fixed size on stack: fast! */ mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD_LIMIT-1)]; ASSERT (MUL_TOOM3_THRESHOLD <= MUL_TOOM3_THRESHOLD_LIMIT); mpn_kara_mul_n (p, a, b, n, ws); } else if (BELOW_THRESHOLD (n, MUL_TOOM4_THRESHOLD)) { mp_ptr ws; TMP_SDECL; TMP_SMARK; ws = TMP_SALLOC_LIMBS (MPN_TOOM3_MUL_N_TSIZE (n)); mpn_toom3_mul_n (p, a, b, n, ws); TMP_SFREE; } else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD)) { mpn_toom4_mul_n (p, a, b, n); } #if WANT_FFT || TUNE_PROGRAM_BUILD else if (BELOW_THRESHOLD (n, MUL_FFT_FULL_THRESHOLD)) { mpn_toom8h_mul (p, a, n, b, n); } #endif else #if WANT_FFT || TUNE_PROGRAM_BUILD { mpn_mul_fft_main(p, a, n, b, n); } #else { /* Toom8 for large operands. */ mpn_toom8h_mul (p, a, n, b, n); } #endif }
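/* Illustrative aside: whichever algorithm the dispatcher above selects, the contract is the same: {p, 2n} receives the full product and must not overlap the inputs. A minimal usage sketch through the public mpn interface of gmp.h (or mpir.h), assuming 64-bit limbs: */
#include <assert.h>
#include <gmp.h>

int main (void)
{
  /* a = 2^64 + 2, b = 2^64 + 3 as 2-limb numbers */
  mp_limb_t a[2] = { 2, 1 };
  mp_limb_t b[2] = { 3, 1 };
  mp_limb_t p[4];                    /* product needs 2n limbs */

  mpn_mul_n (p, a, b, 2);
  /* (2^64 + 2)(2^64 + 3) = 2^128 + 5*2^64 + 6 */
  assert (p[0] == 6 && p[1] == 5 && p[2] == 1 && p[3] == 0);
  return 0;
}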
// (rp,2n) = (xp,n)^2 with temp space (tp, 2*n + C)
void
mpn_kara_sqr_n (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp)
{
  mp_size_t n2, n3;
  mp_srcptr xl, xh;
  mp_ptr dx;
  mp_limb_t c;

  n2 = n >> 1;
  xl = xp; xh = xp + n2;
  n3 = n - n2;
  dx = rp + 2 * n2;

  if ((n & 1) == 0)
    {
      if (mpn_cmp (xh, xl, n2) >= 0)
        mpn_sub_n (dx, xh, xl, n2);
      else
        mpn_sub_n (dx, xl, xh, n2);
    }
  else
    {
      if (xh[n2] != 0 || mpn_cmp (xh, xl, n2) >= 0)
        { c = mpn_sub_n (dx, xh, xl, n2); dx[n2] = xh[n2] - c; }
      else
        { mpn_sub_n (dx, xl, xh, n2); dx[n2] = 0; }
    }

  if (BELOW_THRESHOLD (n3, MUL_KARATSUBA_THRESHOLD))
    {
      mpn_mul_basecase (rp, xl, n2, xl, n2);
      mpn_mul_basecase (tp, dx, n3, dx, n3);
      mpn_mul_basecase (rp + 2 * n2, xh, n3, xh, n3);
    }
  else if (BELOW_THRESHOLD (n3, SQR_KARATSUBA_THRESHOLD))
    {
      mpn_sqr_basecase (rp, xl, n2);
      mpn_sqr_basecase (tp, dx, n3);
      mpn_sqr_basecase (rp + 2 * n2, xh, n3);
    }
  else
    {
      mpn_kara_sqr_n (rp, xl, n2, tp + 2 * n3);
      mpn_kara_sqr_n (tp, dx, n3, tp + 2 * n3);
      mpn_kara_sqr_n (rp + 2 * n2, xh, n3, tp + 2 * n3);
    }

  mpn_karasub (rp, tp, n);
  return;
}
void mpn_sqr_n (mp_ptr prodp, mp_srcptr up, mp_size_t un) { ASSERT (un >= 1); ASSERT (! MPN_OVERLAP_P (prodp, 2*un, up, un)); /* FIXME: Can this be removed? */ if (un == 0) return; if (BELOW_THRESHOLD (un, SQR_BASECASE_THRESHOLD)) { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */ mpn_mul_basecase (prodp, up, un, up, un); } else if (BELOW_THRESHOLD (un, SQR_KARATSUBA_THRESHOLD)) { /* plain schoolbook multiplication */ mpn_sqr_basecase (prodp, up, un); } else if (BELOW_THRESHOLD (un, SQR_TOOM3_THRESHOLD)) { /* karatsuba multiplication */ mp_ptr tspace; TMP_DECL (marker); TMP_MARK (marker); tspace = TMP_ALLOC_LIMBS (MPN_KARA_SQR_N_TSIZE (un)); mpn_kara_sqr_n (prodp, up, un, tspace); TMP_FREE (marker); } #if WANT_FFT || TUNE_PROGRAM_BUILD else if (BELOW_THRESHOLD (un, SQR_FFT_THRESHOLD)) #else else #endif { /* Toom3 multiplication. Use workspace from the heap, as stack may be limited. Since n is at least MUL_TOOM3_THRESHOLD, the multiplication will take much longer than malloc()/free(). */ mp_ptr tspace; mp_size_t tsize; tsize = MPN_TOOM3_SQR_N_TSIZE (un); tspace = __GMP_ALLOCATE_FUNC_LIMBS (tsize); mpn_toom3_sqr_n (prodp, up, un, tspace); __GMP_FREE_FUNC_LIMBS (tspace, tsize); } #if WANT_FFT || TUNE_PROGRAM_BUILD else { /* The current FFT code allocates its own space. That should probably change. */ mpn_mul_fft_full (prodp, up, un, up, un); } #endif }
void mpn_mullow_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) { if (BELOW_THRESHOLD (n, MULLOW_BASECASE_THRESHOLD)) { /* Allocate workspace of fixed size on stack: fast! */ mp_limb_t ws[MUL_BASECASE_ALLOC]; mpn_mul_basecase (ws, xp, n, yp, n); MPN_COPY (rp, ws, n); } else if (BELOW_THRESHOLD (n, MULLOW_DC_THRESHOLD)) { mpn_mullow_basecase (rp, xp, yp, n); } else if (BELOW_THRESHOLD (n, MULLOW_MUL_N_THRESHOLD)) { /* Divide-and-conquer */ mp_size_t n2 = n >> 1; /* floor(n/2) */ mp_size_t n1 = n - n2; /* ceil(n/2) */ mp_ptr tp; TMP_SDECL; TMP_SMARK; tp = TMP_SALLOC_LIMBS (n1); /* Split as x = x1 2^(n1 GMP_NUMB_BITS) + x0, y = y1 2^(n2 GMP_NUMB_BITS) + y0 */ /* x0 * y0 */ mpn_mul_n (rp, xp, yp, n2); if (n1 != n2) rp[2 * n2] = mpn_addmul_1 (rp + n2, yp, n2, xp[n2]); /* x1 * y0 * 2^(n1 GMP_NUMB_BITS) */ mpn_mullow_n (tp, xp + n1, yp, n2); mpn_add_n (rp + n1, rp + n1, tp, n2); /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */ mpn_mullow_n (tp, yp + n2, xp, n1); mpn_add_n (rp + n2, rp + n2, tp, n1); TMP_SFREE; } else { /* For really large operands, use plain mpn_mul_n and discard the upper n limbs of the result (this final branch was truncated in the original). */ mp_ptr tp; TMP_DECL; TMP_MARK; tp = TMP_ALLOC_LIMBS (2 * n); mpn_mul_n (tp, xp, yp, n); MPN_COPY (rp, tp, n); TMP_FREE; } }
void check (void) { mp_limb_t wp[100], xp[100], yp[100]; mp_size_t size = 100; refmpn_zero (xp, size); refmpn_zero (yp, size); refmpn_zero (wp, size); pre ("mpn_add_n"); mpn_add_n (wp, xp, yp, size); post (); #if HAVE_NATIVE_mpn_add_nc pre ("mpn_add_nc"); mpn_add_nc (wp, xp, yp, size, CNST_LIMB(0)); post (); #endif #if HAVE_NATIVE_mpn_addlsh1_n pre ("mpn_addlsh1_n"); mpn_addlsh1_n (wp, xp, yp, size); post (); #endif #if HAVE_NATIVE_mpn_and_n pre ("mpn_and_n"); mpn_and_n (wp, xp, yp, size); post (); #endif #if HAVE_NATIVE_mpn_andn_n pre ("mpn_andn_n"); mpn_andn_n (wp, xp, yp, size); post (); #endif pre ("mpn_addmul_1"); mpn_addmul_1 (wp, xp, size, yp[0]); post (); #if HAVE_NATIVE_mpn_addmul_1c pre ("mpn_addmul_1c"); mpn_addmul_1c (wp, xp, size, yp[0], CNST_LIMB(0)); post (); #endif #if HAVE_NATIVE_mpn_com_n pre ("mpn_com_n"); mpn_com_n (wp, xp, size); post (); #endif #if HAVE_NATIVE_mpn_copyd pre ("mpn_copyd"); mpn_copyd (wp, xp, size); post (); #endif #if HAVE_NATIVE_mpn_copyi pre ("mpn_copyi"); mpn_copyi (wp, xp, size); post (); #endif pre ("mpn_divexact_1"); mpn_divexact_1 (wp, xp, size, CNST_LIMB(123)); post (); pre ("mpn_divexact_by3c"); mpn_divexact_by3c (wp, xp, size, CNST_LIMB(0)); post (); pre ("mpn_divrem_1"); mpn_divrem_1 (wp, (mp_size_t) 0, xp, size, CNST_LIMB(123)); post (); #if HAVE_NATIVE_mpn_divrem_1c pre ("mpn_divrem_1c"); mpn_divrem_1c (wp, (mp_size_t) 0, xp, size, CNST_LIMB(123), CNST_LIMB(122)); post (); #endif pre ("mpn_gcd_1"); xp[0] |= 1; notdead += (unsigned long) mpn_gcd_1 (xp, size, CNST_LIMB(123)); post (); #if HAVE_NATIVE_mpn_gcd_finda pre ("mpn_gcd_finda"); xp[0] |= 1; xp[1] |= 1; notdead += mpn_gcd_finda (xp); post (); #endif pre ("mpn_hamdist"); notdead += mpn_hamdist (xp, yp, size); post (); #if HAVE_NATIVE_mpn_ior_n pre ("mpn_ior_n"); mpn_ior_n (wp, xp, yp, size); post (); #endif #if HAVE_NATIVE_mpn_iorn_n pre ("mpn_iorn_n"); mpn_iorn_n (wp, xp, yp, size); post (); #endif pre ("mpn_lshift"); mpn_lshift (wp, xp, size, 1); post (); pre ("mpn_mod_1"); notdead += mpn_mod_1 (xp, size, CNST_LIMB(123)); post (); #if HAVE_NATIVE_mpn_mod_1c pre ("mpn_mod_1c"); notdead += mpn_mod_1c (xp, size, CNST_LIMB(123), CNST_LIMB(122)); post (); #endif #if GMP_NUMB_BITS % 4 == 0 pre ("mpn_mod_34lsub1"); notdead += mpn_mod_34lsub1 (xp, size); post (); #endif pre ("mpn_modexact_1_odd"); notdead += mpn_modexact_1_odd (xp, size, CNST_LIMB(123)); post (); pre ("mpn_modexact_1c_odd"); notdead += mpn_modexact_1c_odd (xp, size, CNST_LIMB(123), CNST_LIMB(456)); post (); pre ("mpn_mul_1"); mpn_mul_1 (wp, xp, size, yp[0]); post (); #if HAVE_NATIVE_mpn_mul_1c pre ("mpn_mul_1c"); mpn_mul_1c (wp, xp, size, yp[0], CNST_LIMB(0)); post (); #endif #if HAVE_NATIVE_mpn_mul_2 pre ("mpn_mul_2"); mpn_mul_2 (wp, xp, size-1, yp); post (); #endif pre ("mpn_mul_basecase"); mpn_mul_basecase (wp, xp, (mp_size_t) 3, yp, (mp_size_t) 3); post (); #if HAVE_NATIVE_mpn_nand_n pre ("mpn_nand_n"); mpn_nand_n (wp, xp, yp, size); post (); #endif #if HAVE_NATIVE_mpn_nior_n pre ("mpn_nior_n"); mpn_nior_n (wp, xp, yp, size); post (); #endif pre ("mpn_popcount"); notdead += mpn_popcount (xp, size); post (); pre ("mpn_preinv_mod_1"); notdead += mpn_preinv_mod_1 (xp, size, GMP_NUMB_MAX, refmpn_invert_limb (GMP_NUMB_MAX)); post (); #if USE_PREINV_DIVREM_1 || HAVE_NATIVE_mpn_preinv_divrem_1 pre ("mpn_preinv_divrem_1"); mpn_preinv_divrem_1 (wp, (mp_size_t) 0, xp, size, GMP_NUMB_MAX, refmpn_invert_limb (GMP_NUMB_MAX), 0); post (); #endif #if HAVE_NATIVE_mpn_rsh1add_n pre ("mpn_rsh1add_n"); mpn_rsh1add_n (wp, xp, yp, size); 
post (); #endif #if HAVE_NATIVE_mpn_rsh1sub_n pre ("mpn_rsh1sub_n"); mpn_rsh1sub_n (wp, xp, yp, size); post (); #endif pre ("mpn_rshift"); mpn_rshift (wp, xp, size, 1); post (); pre ("mpn_sqr_basecase"); mpn_sqr_basecase (wp, xp, (mp_size_t) 3); post (); pre ("mpn_submul_1"); mpn_submul_1 (wp, xp, size, yp[0]); post (); #if HAVE_NATIVE_mpn_submul_1c pre ("mpn_submul_1c"); mpn_submul_1c (wp, xp, size, yp[0], CNST_LIMB(0)); post (); #endif pre ("mpn_sub_n"); mpn_sub_n (wp, xp, yp, size); post (); #if HAVE_NATIVE_mpn_sub_nc pre ("mpn_sub_nc"); mpn_sub_nc (wp, xp, yp, size, CNST_LIMB(0)); post (); #endif #if HAVE_NATIVE_mpn_sublsh1_n pre ("mpn_sublsh1_n"); mpn_sublsh1_n (wp, xp, yp, size); post (); #endif #if HAVE_NATIVE_mpn_udiv_qrnnd pre ("mpn_udiv_qrnnd"); mpn_udiv_qrnnd (&wp[0], CNST_LIMB(122), xp[0], CNST_LIMB(123)); post (); #endif #if HAVE_NATIVE_mpn_udiv_qrnnd_r pre ("mpn_udiv_qrnnd_r"); mpn_udiv_qrnnd_r (CNST_LIMB(122), xp[0], CNST_LIMB(123), &wp[0]); post (); #endif #if HAVE_NATIVE_mpn_umul_ppmm pre ("mpn_umul_ppmm"); mpn_umul_ppmm (&wp[0], xp[0], yp[0]); post (); #endif #if HAVE_NATIVE_mpn_umul_ppmm_r pre ("mpn_umul_ppmm_r"); mpn_umul_ppmm_r (xp[0], yp[0], &wp[0]); post (); #endif #if HAVE_NATIVE_mpn_xor_n pre ("mpn_xor_n"); mpn_xor_n (wp, xp, yp, size); post (); #endif #if HAVE_NATIVE_mpn_xnor_n pre ("mpn_xnor_n"); mpn_xnor_n (wp, xp, yp, size); post (); #endif }
void mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) { mp_limb_t w, w0, w1; mp_size_t n2; mp_srcptr x, y; mp_size_t i; int sign; n2 = n >> 1; ASSERT (n2 > 0); if ((n & 1) != 0) { /* Odd length. */ mp_size_t n1, n3, nm1; n3 = n - n2; sign = 0; w = a[n2]; if (w != 0) w -= mpn_sub_n (p, a, a + n3, n2); else { i = n2; do { --i; w0 = a[i]; w1 = a[n3 + i]; } while (w0 == w1 && i != 0); if (w0 < w1) { x = a + n3; y = a; sign = ~0; } else { x = a; y = a + n3; } mpn_sub_n (p, x, y, n2); } p[n2] = w; w = b[n2]; if (w != 0) w -= mpn_sub_n (p + n3, b, b + n3, n2); else { i = n2; do { --i; w0 = b[i]; w1 = b[n3 + i]; } while (w0 == w1 && i != 0); if (w0 < w1) { x = b + n3; y = b; sign = ~sign; } else { x = b; y = b + n3; } mpn_sub_n (p + n3, x, y, n2); } p[n] = w; n1 = n + 1; if (n2 < MUL_KARATSUBA_THRESHOLD) { if (n3 < MUL_KARATSUBA_THRESHOLD) { mpn_mul_basecase (ws, p, n3, p + n3, n3); mpn_mul_basecase (p, a, n3, b, n3); } else { mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1); mpn_kara_mul_n (p, a, b, n3, ws + n1); } mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2); } else { mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1); mpn_kara_mul_n (p, a, b, n3, ws + n1); mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1); } if (sign) mpn_add_n (ws, p, ws, n1); else mpn_sub_n (ws, p, ws, n1); nm1 = n - 1; if (mpn_add_n (ws, p + n1, ws, nm1)) { mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK; ws[nm1] = x; if (x == 0) ws[n] = (ws[n] + 1) & GMP_NUMB_MASK; } if (mpn_add_n (p + n3, p + n3, ws, n1)) { mpn_incr_u (p + n1 + n3, 1); } } else { /* Even length. */ i = n2; do { --i; w0 = a[i]; w1 = a[n2 + i]; } while (w0 == w1 && i != 0); sign = 0; if (w0 < w1) { x = a + n2; y = a; sign = ~0; } else { x = a; y = a + n2; } mpn_sub_n (p, x, y, n2); i = n2; do { --i; w0 = b[i]; w1 = b[n2 + i]; } while (w0 == w1 && i != 0); if (w0 < w1) { x = b + n2; y = b; sign = ~sign; } else { x = b; y = b + n2; } mpn_sub_n (p + n2, x, y, n2); /* Pointwise products. */ if (n2 < MUL_KARATSUBA_THRESHOLD) { mpn_mul_basecase (ws, p, n2, p + n2, n2); mpn_mul_basecase (p, a, n2, b, n2); mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2); } else { mpn_kara_mul_n (ws, p, p + n2, n2, ws + n); mpn_kara_mul_n (p, a, b, n2, ws + n); mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n); } /* Interpolate. */ if (sign) w = mpn_add_n (ws, p, ws, n); else w = -mpn_sub_n (ws, p, ws, n); w += mpn_add_n (ws, p + n, ws, n); w += mpn_add_n (p + n2, p + n2, ws, n); MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w); } }
void mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) { mp_limb_t w, w0, w1; mp_size_t n2; mp_srcptr x, y; mp_size_t i; n2 = n >> 1; ASSERT (n2 > 0); if ((n & 1) != 0) { /* Odd length. */ mp_size_t n1, n3, nm1; n3 = n - n2; w = a[n2]; if (w != 0) w -= mpn_sub_n (p, a, a + n3, n2); else { i = n2; do { --i; w0 = a[i]; w1 = a[n3 + i]; } while (w0 == w1 && i != 0); if (w0 < w1) { x = a + n3; y = a; } else { x = a; y = a + n3; } mpn_sub_n (p, x, y, n2); } p[n2] = w; n1 = n + 1; /* n2 is always either n3 or n3-1 so maybe the two sets of tests here could be combined. But that's not important, since the tests will take a miniscule amount of time compared to the function calls. */ if (BELOW_THRESHOLD (n3, SQR_BASECASE_THRESHOLD)) { mpn_mul_basecase (ws, p, n3, p, n3); mpn_mul_basecase (p, a, n3, a, n3); } else if (BELOW_THRESHOLD (n3, SQR_KARATSUBA_THRESHOLD)) { mpn_sqr_basecase (ws, p, n3); mpn_sqr_basecase (p, a, n3); } else { mpn_kara_sqr_n (ws, p, n3, ws + n1); /* (x-y)^2 */ mpn_kara_sqr_n (p, a, n3, ws + n1); /* x^2 */ } if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD)) mpn_mul_basecase (p + n1, a + n3, n2, a + n3, n2); else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD)) mpn_sqr_basecase (p + n1, a + n3, n2); else mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1); /* y^2 */ /* Since x^2+y^2-(x-y)^2 = 2xy >= 0 there's no need to track the borrow from mpn_sub_n. If it occurs then it'll be cancelled by a carry from ws[n]. Further, since 2xy fits in n1 limbs there won't be any carry out of ws[n] other than cancelling that borrow. */ mpn_sub_n (ws, p, ws, n1); /* x^2-(x-y)^2 */ nm1 = n - 1; if (mpn_add_n (ws, p + n1, ws, nm1)) /* x^2+y^2-(x-y)^2 = 2xy */ { mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK; ws[nm1] = x; if (x == 0) ws[n] = (ws[n] + 1) & GMP_NUMB_MASK; } if (mpn_add_n (p + n3, p + n3, ws, n1)) { mpn_incr_u (p + n1 + n3, 1); } } else { /* Even length. */ i = n2; do { --i; w0 = a[i]; w1 = a[n2 + i]; } while (w0 == w1 && i != 0); if (w0 < w1) { x = a + n2; y = a; } else { x = a; y = a + n2; } mpn_sub_n (p, x, y, n2); /* Pointwise products. */ if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD)) { mpn_mul_basecase (ws, p, n2, p, n2); mpn_mul_basecase (p, a, n2, a, n2); mpn_mul_basecase (p + n, a + n2, n2, a + n2, n2); } else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD)) { mpn_sqr_basecase (ws, p, n2); mpn_sqr_basecase (p, a, n2); mpn_sqr_basecase (p + n, a + n2, n2); } else { mpn_kara_sqr_n (ws, p, n2, ws + n); mpn_kara_sqr_n (p, a, n2, ws + n); mpn_kara_sqr_n (p + n, a + n2, n2, ws + n); } /* Interpolate. */ w = -mpn_sub_n (ws, p, ws, n); w += mpn_add_n (ws, p + n, ws, n); w += mpn_add_n (p + n2, p + n2, ws, n); MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w); } }
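/* Illustrative aside (not GMP code): the interpolation above relies on x^2 + y^2 - (x-y)^2 = 2xy >= 0, so no sign tracking is needed when squaring. A toy check of the same recombination on a 64-bit word split into 32-bit halves (assumes unsigned __int128, e.g. GCC/Clang): */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

int main (void)
{
  int iter;
  srand (3);
  for (iter = 0; iter < 100000; iter++)
    {
      uint64_t a = ((uint64_t) rand () << 32) ^ (uint64_t) rand ();
      uint64_t lo = a & 0xffffffffu, hi = a >> 32;

      uint64_t hh = hi * hi, ll = lo * lo;
      uint64_t d  = hi >= lo ? hi - lo : lo - hi;
      uint64_t dd = d * d;                       /* (hi - lo)^2 */

      /* hh + ll - dd = 2*hi*lo >= 0, so the subtraction cannot underflow */
      unsigned __int128 mid = (unsigned __int128) hh + ll - dd;
      assert (mid == (unsigned __int128) 2 * hi * lo);

      unsigned __int128 sq =
        ((unsigned __int128) hh << 64) + (mid << 32) + ll;
      assert (sq == (unsigned __int128) a * a);
    }
  return 0;
}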
void check_functions (void) { mp_limb_t wp[2], wp2[2], xp[2], yp[2], r; int i; memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 123; yp[0] = 456; mpn_add_n (wp, xp, yp, (mp_size_t) 1); ASSERT_ALWAYS (wp[0] == 579); } memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 123; wp[0] = 456; r = mpn_addmul_1 (wp, xp, (mp_size_t) 1, CNST_LIMB(2)); ASSERT_ALWAYS (wp[0] == 702); ASSERT_ALWAYS (r == 0); } #if HAVE_NATIVE_mpn_copyd memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 123; xp[1] = 456; mpn_copyd (xp+1, xp, (mp_size_t) 1); ASSERT_ALWAYS (xp[1] == 123); } #endif #if HAVE_NATIVE_mpn_copyi memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 123; xp[1] = 456; mpn_copyi (xp, xp+1, (mp_size_t) 1); ASSERT_ALWAYS (xp[0] == 456); } #endif memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 1605; mpn_divexact_1 (wp, xp, (mp_size_t) 1, CNST_LIMB(5)); ASSERT_ALWAYS (wp[0] == 321); } memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 1296; r = mpn_divexact_by3c (wp, xp, (mp_size_t) 1, CNST_LIMB(0)); ASSERT_ALWAYS (wp[0] == 432); ASSERT_ALWAYS (r == 0); } memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 578; r = mpn_divexact_byfobm1 (wp, xp, (mp_size_t) 1, CNST_LIMB(17),CNST_LIMB(-1)/CNST_LIMB(17)); ASSERT_ALWAYS (wp[0] == 34); ASSERT_ALWAYS (r == 0); } memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 287; r = mpn_divrem_1 (wp, (mp_size_t) 1, xp, (mp_size_t) 1, CNST_LIMB(7)); ASSERT_ALWAYS (wp[1] == 41); ASSERT_ALWAYS (wp[0] == 0); ASSERT_ALWAYS (r == 0); } memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 290; r = mpn_divrem_euclidean_qr_1 (wp, 0, xp, (mp_size_t) 1, CNST_LIMB(7)); ASSERT_ALWAYS (wp[0] == 41); ASSERT_ALWAYS (r == 3); } memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 12; r = mpn_gcd_1 (xp, (mp_size_t) 1, CNST_LIMB(9)); ASSERT_ALWAYS (r == 3); } memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 0x1001; mpn_lshift (wp, xp, (mp_size_t) 1, 1); ASSERT_ALWAYS (wp[0] == 0x2002); } memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 14; r = mpn_mod_1 (xp, (mp_size_t) 1, CNST_LIMB(4)); ASSERT_ALWAYS (r == 2); } #if (GMP_NUMB_BITS % 4) == 0 memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { int bits = (GMP_NUMB_BITS / 4) * 3; mp_limb_t mod = (CNST_LIMB(1) << bits) - 1; mp_limb_t want = GMP_NUMB_MAX % mod; xp[0] = GMP_NUMB_MAX; r = mpn_mod_34lsub1 (xp, (mp_size_t) 1); ASSERT_ALWAYS (r % mod == want); } #endif // DECL_modexact_1c_odd ((*modexact_1c_odd)); memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 14; r = mpn_mul_1 (wp, xp, (mp_size_t) 1, CNST_LIMB(4)); ASSERT_ALWAYS (wp[0] == 56); ASSERT_ALWAYS (r == 0); } memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 5; yp[0] = 7; mpn_mul_basecase (wp, xp, (mp_size_t) 1, yp, (mp_size_t) 1); ASSERT_ALWAYS (wp[0] == 35); ASSERT_ALWAYS (wp[1] == 0); } #if HAVE_NATIVE_mpn_preinv_divrem_1 && GMP_NAIL_BITS == 0 memcpy (&__gmpn_cpuvec, 
&initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 0x101; r = mpn_preinv_divrem_1 (wp, (mp_size_t) 1, xp, (mp_size_t) 1, GMP_LIMB_HIGHBIT, refmpn_invert_limb (GMP_LIMB_HIGHBIT), 0); ASSERT_ALWAYS (wp[0] == 0x202); ASSERT_ALWAYS (wp[1] == 0); ASSERT_ALWAYS (r == 0); } #endif #if GMP_NAIL_BITS == 0 memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = GMP_LIMB_HIGHBIT+123; r = mpn_preinv_mod_1 (xp, (mp_size_t) 1, GMP_LIMB_HIGHBIT, refmpn_invert_limb (GMP_LIMB_HIGHBIT)); ASSERT_ALWAYS (r == 123); } #endif memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 5; modlimb_invert(r,xp[0]); r=-r; yp[0]=43; yp[1]=75; mpn_redc_1 (wp, yp, xp, (mp_size_t) 1,r); ASSERT_ALWAYS (wp[0] == 78); } memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0]=5; yp[0]=3; mpn_sumdiff_n (wp, wp2,xp, yp,1); ASSERT_ALWAYS (wp[0] == 8); ASSERT_ALWAYS (wp2[0] == 2); } memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 0x8008; mpn_rshift (wp, xp, (mp_size_t) 1, 1); ASSERT_ALWAYS (wp[0] == 0x4004); } memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 5; mpn_sqr_basecase (wp, xp, (mp_size_t) 1); ASSERT_ALWAYS (wp[0] == 25); ASSERT_ALWAYS (wp[1] == 0); } memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 999; yp[0] = 666; mpn_sub_n (wp, xp, yp, (mp_size_t) 1); ASSERT_ALWAYS (wp[0] == 333); } memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec)); for (i = 0; i < 2; i++) { xp[0] = 123; wp[0] = 456; r = mpn_submul_1 (wp, xp, (mp_size_t) 1, CNST_LIMB(2)); ASSERT_ALWAYS (wp[0] == 210); ASSERT_ALWAYS (r == 0); } }
/* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0] Requires that mp[n-1..0] is odd. FIXME: is this true? Requires that ep[en-1..0] is > 1. Uses scratch space at tp of 3n+1 limbs. */ void mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn, mp_srcptr ep, mp_size_t en, mp_srcptr mp, mp_size_t n, mp_ptr tp) { mp_limb_t minv; int cnt; mp_bitcnt_t ebi; int windowsize, this_windowsize; mp_limb_t expbits; mp_ptr pp, this_pp; long i; int cnd; ASSERT (en > 1 || (en == 1 && ep[0] > 0)); ASSERT (n >= 1 && ((mp[0] & 1) != 0)); count_leading_zeros (cnt, ep[en - 1]); ebi = (mp_bitcnt_t) en * GMP_LIMB_BITS - cnt; windowsize = win_size (ebi); binvert_limb (minv, mp[0]); minv = -minv; pp = tp + 4 * n; this_pp = pp; this_pp[n] = 1; redcify (this_pp, this_pp + n, 1, mp, n, tp + 6 * n); this_pp += n; redcify (this_pp, bp, bn, mp, n, tp + 6 * n); /* Precompute powers of b and put them in the temporary area at pp. */ for (i = (1 << windowsize) - 2; i > 0; i--) { mpn_mul_basecase (tp, this_pp, n, pp + n, n); this_pp += n; mpn_redc_1_sec (this_pp, tp, mp, n, minv); } expbits = getbits (ep, ebi, windowsize); if (ebi < windowsize) ebi = 0; else ebi -= windowsize; #if WANT_CACHE_SECURITY mpn_tabselect (rp, pp, n, 1 << windowsize, expbits); #else MPN_COPY (rp, pp + n * expbits, n); #endif while (ebi != 0) { expbits = getbits (ep, ebi, windowsize); this_windowsize = windowsize; if (ebi < windowsize) { this_windowsize -= windowsize - ebi; ebi = 0; } else ebi -= windowsize; do { mpn_local_sqr (tp, rp, n, tp + 2 * n); mpn_redc_1_sec (rp, tp, mp, n, minv); this_windowsize--; } while (this_windowsize != 0); #if WANT_CACHE_SECURITY mpn_tabselect (tp + 2*n, pp, n, 1 << windowsize, expbits); mpn_mul_basecase (tp, rp, n, tp + 2*n, n); #else mpn_mul_basecase (tp, rp, n, pp + n * expbits, n); #endif mpn_redc_1_sec (rp, tp, mp, n, minv); } MPN_COPY (tp, rp, n); MPN_ZERO (tp + n, n); mpn_redc_1_sec (rp, tp, mp, n, minv); cnd = mpn_sub_n (tp, rp, mp, n); /* we need just retval */ mpn_subcnd_n (rp, rp, mp, n, !cnd); }
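/* Illustrative aside (not GMP code): mpn_powm_sec above is a fixed-window, table-driven exponentiation in Montgomery form: precompute b^0..b^(2^w - 1), then for each window perform w squarings followed by one table-selected multiply. The control flow is easier to see in a toy 64-bit version below, which uses plain modular arithmetic instead of REDC and an ordinary (non-constant-time) table lookup; the function names and window size are made up. Assumes unsigned __int128 (GCC/Clang). */
#include <assert.h>
#include <stdint.h>

static uint64_t mulmod (uint64_t a, uint64_t b, uint64_t m)
{
  return (uint64_t) ((unsigned __int128) a * b % m);
}

/* b^e mod m with a fixed window of W bits */
static uint64_t powmod_fixed_window (uint64_t b, uint64_t e, uint64_t m)
{
  enum { W = 4 };
  uint64_t table[1 << W];
  uint64_t r;
  int i, j, bits;

  /* precompute table[i] = b^i mod m, as the loop over "pp" does above */
  table[0] = 1 % m;
  for (i = 1; i < (1 << W); i++)
    table[i] = mulmod (table[i - 1], b, m);

  /* bit length of e, rounded up to a multiple of W */
  for (bits = 64; bits > 0 && ((e >> (bits - 1)) & 1) == 0; bits--)
    ;
  bits = (bits + W - 1) / W * W;

  r = 1 % m;
  for (i = bits - W; i >= 0; i -= W)
    {
      for (j = 0; j < W; j++)
        r = mulmod (r, r, m);                          /* W squarings */
      r = mulmod (r, table[(e >> i) & ((1 << W) - 1)], m);  /* window mul */
    }
  return r;
}

int main (void)
{
  assert (powmod_fixed_window (3, 5, 7) == 5);          /* 243 mod 7  */
  assert (powmod_fixed_window (2, 10, 1000) == 24);     /* 1024 mod 1000 */
  assert (powmod_fixed_window (123456789, 0, 97) == 1);
  return 0;
}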
mp_limb_t mpn_mul (mp_ptr prodp, mp_srcptr up, mp_size_t un, mp_srcptr vp, mp_size_t vn) { mp_size_t l, k; mp_limb_t c; ASSERT (un >= vn); ASSERT (vn >= 1); ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un)); ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn)); if (un == vn) { if (up == vp) { mpn_sqr (prodp, up, un); return prodp[2 * un - 1]; } else { mpn_mul_n (prodp, up, vp, un); return prodp[2 * un - 1]; } } if (vn < MUL_KARATSUBA_THRESHOLD) { /* plain schoolbook multiplication */ if (un <= MUL_BASECASE_MAX_UN) mpn_mul_basecase (prodp, up, un, vp, vn); else { /* We have un >> MUL_BASECASE_MAX_UN > vn. For better memory locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply these pieces with the vp[] operand. After each such partial multiplication (but the last) we copy the most significant vn limbs into a temporary buffer since that part would otherwise be overwritten by the next multiplication. After the next multiplication, we add it back. This illustrates the situation: -->vn<-- | |<------- un ------->| _____________________| X /| /XX__________________/ | _____________________ | X / | /XX__________________/ | _____________________ | / / | /____________________/ | ================================================================== The parts marked with X are the parts whose sums are copied into the temporary buffer. */ mp_limb_t tp[MUL_KARATSUBA_THRESHOLD_LIMIT]; mp_limb_t cy; ASSERT (MUL_KARATSUBA_THRESHOLD <= MUL_KARATSUBA_THRESHOLD_LIMIT); mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn); prodp += MUL_BASECASE_MAX_UN; MPN_COPY (tp, prodp, vn); /* preserve high triangle */ up += MUL_BASECASE_MAX_UN; un -= MUL_BASECASE_MAX_UN; while (un > MUL_BASECASE_MAX_UN) { mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn); cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */ mpn_incr_u (prodp + vn, cy); /* safe? */ prodp += MUL_BASECASE_MAX_UN; MPN_COPY (tp, prodp, vn); /* preserve high triangle */ up += MUL_BASECASE_MAX_UN; un -= MUL_BASECASE_MAX_UN; } if (un > vn) { mpn_mul_basecase (prodp, up, un, vp, vn); } else { ASSERT_ALWAYS (un > 0); mpn_mul_basecase (prodp, vp, vn, up, un); } cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */ mpn_incr_u (prodp + vn, cy); /* safe? 
*/ } return prodp[un + vn - 1]; } if (ABOVE_THRESHOLD (un + vn, 2*MUL_FFT_FULL_THRESHOLD) && ABOVE_THRESHOLD (3*vn, MUL_FFT_FULL_THRESHOLD)) { mpn_mul_fft_main (prodp, up, un, vp, vn); return prodp[un + vn - 1]; } k = (un + 3)/4; // ceil(un/4) #if GMP_NUMB_BITS == 32 if ((ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM8H_THRESHOLD)) && (vn>=86) && (5*un <= 11*vn)) #else if ((ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM8H_THRESHOLD)) && (vn>=86) && (4*un <= 13*vn)) #endif { mpn_toom8h_mul(prodp, up, un, vp, vn); return prodp[un + vn - 1]; } if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM4_THRESHOLD)) { if (vn > 3*k) { mpn_toom4_mul(prodp, up, un, vp, vn); return prodp[un + vn - 1]; } else { l = (un + 4)/5; // ceil(un/5) if ((((vn > 9*k/4) && (un+vn <= 6*MUL_TOOM4_THRESHOLD)) || ((vn > 2*l) && (un+vn > 6*MUL_TOOM4_THRESHOLD))) && (vn <= 3*l)) { mpn_toom53_mul(prodp, up, un, vp, vn); return prodp[un + vn - 1]; } } } if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM3_THRESHOLD) && (vn > k)) { mp_ptr ws; TMP_DECL; TMP_MARK; if (vn < 2*k) // un/2 >= vn > un/4 { ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE(un)); mpn_toom42_mul(prodp, up, un, vp, vn, ws); TMP_FREE; return prodp[un + vn - 1]; } l = (un+2)/3; //ceil(u/3) if (vn > 2*l) // un >= vn > 2un/3 { ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE(un)); mpn_toom3_mul(prodp, up, un, vp, vn, ws); TMP_FREE; return prodp[un + vn - 1]; } else // 2un/3 >= vn > un/3 { ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE(un)); mpn_toom32_mul(prodp, up, un, vp, vn, ws); TMP_FREE; return prodp[un + vn - 1]; } } mpn_mul_n (prodp, up, vp, vn); if (un != vn) { mp_limb_t t; mp_ptr ws; TMP_DECL; TMP_MARK; prodp += vn; l = vn; up += vn; un -= vn; if (un < vn) { /* Swap u's and v's. */ MPN_SRCPTR_SWAP (up,un, vp,vn); } ws = TMP_ALLOC_LIMBS ((vn >= MUL_KARATSUBA_THRESHOLD ? vn : un) + vn); t = 0; while (vn >= MUL_KARATSUBA_THRESHOLD) { mpn_mul_n (ws, up, vp, vn); if (l <= 2*vn) { t += mpn_add_n (prodp, prodp, ws, l); if (l != 2*vn) { t = mpn_add_1 (prodp + l, ws + l, 2*vn - l, t); l = 2*vn; } } else { c = mpn_add_n (prodp, prodp, ws, 2*vn); t += mpn_add_1 (prodp + 2*vn, prodp + 2*vn, l - 2*vn, c); } prodp += vn; l -= vn; up += vn; un -= vn; if (un < vn) { /* Swap u's and v's. */ MPN_SRCPTR_SWAP (up,un, vp,vn); } } if (vn != 0) { mpn_mul_basecase (ws, up, un, vp, vn); if (l <= un + vn) { t += mpn_add_n (prodp, prodp, ws, l); if (l != un + vn) t = mpn_add_1 (prodp + l, ws + l, un + vn - l, t); } else { c = mpn_add_n (prodp, prodp, ws, un + vn); t += mpn_add_1 (prodp + un + vn, prodp + un + vn, l - un - vn, c); } } TMP_FREE; } return prodp[un + vn - 1]; }
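/* Illustrative aside (not GMP/MPIR code): a toy re-enactment, with 32-bit limbs, of the blockwise schoolbook scheme used in the un >> vn branch of mpn_mul above (the part with the ASCII diagram): multiply BLOCK limbs of the long operand at a time, save the top VN limbs of each partial product, and add them back after the next block. UN, VN and BLOCK are made-up sizes standing in for un, vn and MUL_BASECASE_MAX_UN. */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define UN 20
#define VN 3
#define BLOCK 7

/* schoolbook {rp, un+vn} = {up, un} * {vp, vn} */
static void toy_mul (uint32_t *rp, const uint32_t *up, int un,
                     const uint32_t *vp, int vn)
{
  int i, j;
  memset (rp, 0, (size_t) (un + vn) * sizeof *rp);
  for (i = 0; i < un; i++)
    {
      uint64_t carry = 0;
      for (j = 0; j < vn; j++)
        {
          uint64_t t = (uint64_t) up[i] * vp[j] + rp[i + j] + carry;
          rp[i + j] = (uint32_t) t;
          carry = t >> 32;
        }
      rp[i + vn] = (uint32_t) carry;
    }
}

/* {rp, n} += {sp, n}, returning the carry out */
static uint32_t toy_add_n (uint32_t *rp, const uint32_t *sp, int n)
{
  uint64_t carry = 0;
  int i;
  for (i = 0; i < n; i++)
    {
      carry += (uint64_t) rp[i] + sp[i];
      rp[i] = (uint32_t) carry;
      carry >>= 32;
    }
  return (uint32_t) carry;
}

/* propagate a small increment, like mpn_incr_u; the caller guarantees the
   carry dies out before the end of the freshly written limbs */
static void toy_incr (uint32_t *rp, uint32_t cy)
{
  while (cy != 0)
    {
      uint64_t t = (uint64_t) *rp + cy;
      *rp++ = (uint32_t) t;
      cy = (uint32_t) (t >> 32);
    }
}

int main (void)
{
  uint32_t up[UN], vp[VN], want[UN + VN], got[UN + VN], save[VN];
  int i, done;
  srand (4);
  for (i = 0; i < UN; i++) up[i] = (uint32_t) rand ();
  for (i = 0; i < VN; i++) vp[i] = (uint32_t) rand ();

  toy_mul (want, up, UN, vp, VN);                  /* reference product */

  toy_mul (got, up, BLOCK, vp, VN);                /* first block */
  memcpy (save, got + BLOCK, sizeof save);         /* preserve high limbs */
  done = BLOCK;
  while (UN - done > BLOCK)
    {
      toy_mul (got + done, up + done, BLOCK, vp, VN);
      toy_incr (got + done + VN, toy_add_n (got + done, save, VN));
      memcpy (save, got + done + BLOCK, sizeof save);
      done += BLOCK;
    }
  toy_mul (got + done, up + done, UN - done, vp, VN);   /* last block */
  toy_incr (got + done + VN, toy_add_n (got + done, save, VN));

  for (i = 0; i < UN + VN; i++)
    assert (got[i] == want[i]);
  return 0;
}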
void mpn_sqr (mp_ptr p, mp_srcptr a, mp_size_t n) { ASSERT (n >= 1); ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n)); if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */ mpn_mul_basecase (p, a, n, a, n); } else if (BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD)) { mpn_sqr_basecase (p, a, n); } else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) { /* Allocate workspace of fixed size on stack: fast! */ mp_limb_t ws[mpn_toom2_sqr_itch (SQR_TOOM3_THRESHOLD_LIMIT-1)]; ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT); mpn_toom2_sqr (p, a, n, ws); } else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD)) { mp_ptr ws; TMP_SDECL; TMP_SMARK; ws = TMP_SALLOC_LIMBS (mpn_toom3_sqr_itch (n)); mpn_toom3_sqr (p, a, n, ws); TMP_SFREE; } else if (BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD)) { mp_ptr ws; TMP_SDECL; TMP_SMARK; ws = TMP_SALLOC_LIMBS (mpn_toom4_sqr_itch (n)); mpn_toom4_sqr (p, a, n, ws); TMP_SFREE; } else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD)) { mp_ptr ws; TMP_SDECL; TMP_SMARK; ws = TMP_SALLOC_LIMBS (mpn_toom6_sqr_itch (n)); mpn_toom6_sqr (p, a, n, ws); TMP_SFREE; } else if (BELOW_THRESHOLD (n, SQR_FFT_THRESHOLD)) { mp_ptr ws; TMP_DECL; TMP_MARK; ws = TMP_ALLOC_LIMBS (mpn_toom8_sqr_itch (n)); mpn_toom8_sqr (p, a, n, ws); TMP_FREE; } else { /* The current FFT code allocates its own space. That should probably change. */ mpn_fft_mul (p, a, n, a, n); } }
/* Compute a short (truncated) product of {xp, n} and {yp, n} into {rp, 2n}: only the partial products needed for the high limbs are accumulated, so the result is at most the true product and only its high limbs are meaningful. */ static void mpn_mulshort_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) { mp_size_t m; mp_limb_t t; mp_ptr rpn2; ASSERT(n >= 1); ASSERT_MPN(xp, n); ASSERT_MPN(yp, n); ASSERT(!MPN_OVERLAP_P (rp, 2 * n, xp, n)); ASSERT(!MPN_OVERLAP_P (rp, 2 * n, yp, n)); if (BELOW_THRESHOLD(n, MULHIGH_BASECASE_THRESHOLD)) { mpn_mul_basecase(rp, xp, n, yp, n); return; } if (BELOW_THRESHOLD (n, MULHIGH_DC_THRESHOLD)) { mpn_mulshort_n_basecase(rp, xp, yp, n); return; } /* choose optimal m s.t. n + 2 <= 2m, m < n */ ASSERT (n >= 4); m = 87 * n / 128; if (2 * m < n + 2) m = (n + 1) / 2 + 1; if (m >= n) m = n - 1; ASSERT (n + 2 <= 2 * m); ASSERT (m < n); rpn2 = rp + n - 2; mpn_mul_n (rp + n - m + n - m, xp + n - m, yp + n - m, m); mpn_mulshort_n (rp, xp, yp + m, n - m); ASSERT_NOCARRY (mpn_add (rpn2, rpn2, n + 2, rpn2 - m, n - m + 2)); mpn_mulshort_n (rp, xp + m, yp, n - m); ASSERT_NOCARRY (mpn_add (rpn2, rpn2, n + 2, rpn2 - m, n - m + 2)); umul_ppmm (rp[1], t, xp[m - 1], yp[n - m - 1] << GMP_NAIL_BITS); rp[0] = t >> GMP_NAIL_BITS; ASSERT_NOCARRY (mpn_add (rpn2, rpn2, n + 2, rp, 2)); umul_ppmm (rp[1], t, xp[n - m - 1], yp[m - 1] << GMP_NAIL_BITS); rp[0] = t >> GMP_NAIL_BITS; ASSERT_NOCARRY (mpn_add (rpn2, rpn2, n + 2, rp, 2)); return; }
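/* Illustrative aside: the splitting parameter must satisfy n + 2 <= 2m and m < n, and m = 87n/128 (about 0.68 n) needs the two adjustments above for small n. A quick standalone check that the adjusted m always lands in range for n >= 4, mirroring the ASSERTs: */
#include <assert.h>

int main (void)
{
  long n, m;
  for (n = 4; n <= 1000000; n++)
    {
      m = 87 * n / 128;
      if (2 * m < n + 2)
        m = (n + 1) / 2 + 1;
      if (m >= n)
        m = n - 1;
      assert (n + 2 <= 2 * m && m < n);
    }
  return 0;
}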