/* Compute the n low limbs of {np, n} * {mp, n} and store them at {rp, n}.
   Below MPN_MUL_LO_THRESHOLD a per-size tuning table selects the method:
   0 = full mpn_mul_n, 1 = quadratic basecase short product,
   k >= 2 = divide-and-conquer split point.  Above the threshold the
   split point is fixed at k = 3n/4.
   NOTE(review): the full-product and D&C paths write into rp beyond n
   limbs, so rp is assumed to hold 2n limbs — confirm with callers. */
void
ecm_mul_lo_n (mp_ptr rp, mp_srcptr np, mp_srcptr mp, mp_size_t n)
{
  mp_size_t k;

  if (n < MPN_MUL_LO_THRESHOLD)
    {
      switch (k = mpn_mul_lo_threshold[n])
        {
        case 0:
          {
            /* Full product is fastest at this size. */
            mpn_mul_n (rp, np, mp, n);
            return;
          }
        case 1:
          {
            /* Quadratic short product. */
            ecm_mul_lo_basecase (rp, np, mp, n);
            return;
          }
          /* else fall through to the divide-and-conquer code below,
             with k taken from the table. */
        }
    }
  else
    k = (mp_size_t) (0.75 * (double) n);

  /* Divide and conquer:
     low_n(N*M) = low_2k(N0*M0) + B^k * (low_{n-k}(N1*M0') + low_{n-k}(N0'*M1))
     truncated to n limbs.  The two recursive short products are computed
     in the upper part of rp and folded in with additions whose carries
     are discarded, since only n limbs of result are wanted. */
  mpn_mul_n (rp, np, mp, k);
  rp += k;
  n -= k;
  ecm_mul_lo_n (rp + n, np + k, mp, n);
  mpn_add_n (rp, rp, rp + n, n);
  ecm_mul_lo_n (rp + n, np, mp + k, n);
  mpn_add_n (rp, rp, rp + n, n);
}
/* Put in rp[n..2n-1] an approximation of the n high limbs
   of {np, n} * {mp, n}.  The error is less than n ulps of rp[n] (and the
   approximation is always less or equal to the truncated full product).
   Implements Algorithm ShortMul from [1]. */
void
mpfr_mulhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
                mp_size_t n)
{
  mp_size_t k;

  MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
  /* k = -1: use a full product; k = 0: use the quadratic basecase short
     product; otherwise k is the divide-and-conquer split point. */
  k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
  /* Algorithm ShortMul from [1] requires k >= (n+3)/2, which translates
     into k >= (n+4)/2 in the C language. */
  MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
  if (k < 0)
    mpn_mul_basecase (rp, np, n, mp, n); /* result is exact, no error */
  else if (k == 0)
    mpfr_mulhigh_n_basecase (rp, np, mp, n); /* basecase error < n ulps */
  else if (n > MUL_FFT_THRESHOLD)
    mpn_mul_n (rp, np, mp, n); /* result is exact, no error */
  else
    {
      mp_size_t l = n - k;
      mp_limb_t cy;

      /* Exact product of the two high parts. */
      mpn_mul_n (rp + 2 * l, np + l, mp + l, k); /* fills rp[2l..2n-1] */
      /* Two recursive short products for the cross terms; each fills
         the l+1 limbs starting at rp[l-1], which are then added at the
         proper offset, the carries being propagated into the top. */
      mpfr_mulhigh_n (rp, np + k, mp, l); /* fills rp[l-1..2l-1] */
      cy = mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
      mpfr_mulhigh_n (rp, np, mp + k, l); /* fills rp[l-1..2l-1] */
      cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
      mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
    }
}
/* Put in {rp, 2n} an approximation of {xp, n} * {yp, n} whose n high
   limbs are usable; if the short product's error bound cannot be
   certified (see the wrap check below), the exact full product is
   computed instead.  rp must always hold 2n limbs. */
void
mpn_mulhigh_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
{
  mp_limb_t t;

  ASSERT (n > 0);
  ASSERT_MPN (xp, n);
  ASSERT_MPN (yp, n);
  ASSERT (!MPN_OVERLAP_P (rp, 2 * n, xp, n));
  ASSERT (!MPN_OVERLAP_P (rp, 2 * n, yp, n));

  if (BELOW_THRESHOLD (n, MULHIGH_BASECASE_THRESHOLD))
    {
      /* Small sizes: plain schoolbook full product is fastest. */
      mpn_mul_basecase (rp, xp, n, yp, n);
      return;
    }
  if (ABOVE_THRESHOLD (n, MULHIGH_MUL_THRESHOLD))
    {
      /* Large sizes: the subquadratic full multiply wins anyway. */
      mpn_mul_n (rp, xp, yp, n);
      return;
    }
  mpn_mulshort_n (rp, xp, yp, n);
  /* The short product can undershoot the true value; if adding n-2 to
     rp[n-1] wraps, a carry could reach the high limbs, so fall back to
     an exact full product. */
  t = rp[n - 1] + n - 2;
  if (UNLIKELY (t < n - 2))
    mpn_mul_n (rp, xp, yp, n);
  return;
}
/* Put in rp[0..n] the n+1 low limbs of {np, n} * {mp, n}.
   Assume 2n limbs are allocated at rp. */
void
mpfr_mullow_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
               mp_size_t n)
{
  mp_size_t k;

  MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
  /* k = -1: full product; k = 0: quadratic basecase short product;
     otherwise k is the divide-and-conquer split point (reusing the
     mulhigh tuning table). */
  k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
  MPFR_ASSERTD (k == -1 || k == 0 || (2 * k >= n && k < n));
  if (k < 0)
    mpn_mul_basecase (rp, np, n, mp, n);
  else if (k == 0)
    mpfr_mullow_n_basecase (rp, np, mp, n);
  else if (n > MUL_FFT_THRESHOLD)
    mpn_mul_n (rp, np, mp, n);
  else
    {
      mp_size_t l = n - k;

      /* Exact product of the two low parts. */
      mpn_mul_n (rp, np, mp, k);             /* fills rp[0..2k] */
      /* Cross terms: two recursive low short products computed in the
         scratch upper half of rp, then added at offset k.  Carries out
         of rp[n] are irrelevant since only n+1 limbs are wanted. */
      mpfr_mullow_n (rp + n, np + k, mp, l); /* fills rp[n..n+2l] */
      mpn_add_n (rp + k, rp + k, rp + n, l + 1);
      mpfr_mullow_n (rp + n, np, mp + k, l); /* fills rp[n..n+2l] */
      mpn_add_n (rp + k, rp + k, rp + n, l + 1);
    }
}
/* res := a * b for old-FLINT fmpz_t (a limb vector whose element 0 holds
   the signed limb count and elements 1.. hold the magnitude).
   Strategy by total size: small products go through a temporary on the
   small stack allocator (so res may alias an input), medium ones through
   the regular stack allocator, and large ones call F_mpn_mul writing
   directly into res. */
void
fmpz_mul (fmpz_t res, const fmpz_t a, const fmpz_t b)
{
  long a0 = a[0];
  long b0 = b[0];
  unsigned long sizea = FLINT_ABS (a0);
  unsigned long sizeb = FLINT_ABS (b0);

  /* Strip high zero limbs so mpn_mul sees exact operand sizes. */
  while ((!a[sizea]) && (sizea)) sizea--;
  while ((!b[sizeb]) && (sizeb)) sizeb--;

  mp_limb_t mslimb;
  fmpz_t temp;

  if ((sizea == 0) || (sizeb == 0))
    {
      res[0] = 0;
    }
  else if (sizea + sizeb < 100)
    {
      temp = (fmpz_t) flint_stack_alloc_small (sizea + sizeb + 1);
      /* mpn_mul requires the first operand to be the longer one. */
      if (sizea > sizeb)
        mslimb = mpn_mul (temp+1, a+1, sizea, b+1, sizeb);
      else if (sizea == sizeb)
        {
          mpn_mul_n (temp+1, a+1, b+1, sizeb);
          mslimb = temp[2*sizeb];
        }
      else
        mslimb = mpn_mul (temp+1, b+1, sizeb, a+1, sizea);
      /* Size is sizea+sizeb, minus one if the top limb is zero;
         sign of the result is sign(a) * sign(b). */
      temp[0] = sizea + sizeb - (mslimb == 0);
      F_mpn_copy (res, temp, temp[0]+1);
      if ((long) (a0 ^ b0) < 0) res[0] = -res[0];
      flint_stack_release_small ();
    }
  else if (sizea + sizeb < 2*FLINT_FFT_LIMBS_CROSSOVER)
    {
      /* Same as above, but via the regular stack allocator. */
      temp = (fmpz_t) flint_stack_alloc (sizea + sizeb + 1);
      if (sizea > sizeb)
        mslimb = mpn_mul (temp+1, a+1, sizea, b+1, sizeb);
      else if (sizea == sizeb)
        {
          mpn_mul_n (temp+1, a+1, b+1, sizeb);
          mslimb = temp[2*sizeb];
        }
      else
        mslimb = mpn_mul (temp+1, b+1, sizeb, a+1, sizea);
      temp[0] = sizea + sizeb - (mslimb == 0);
      F_mpn_copy (res, temp, temp[0]+1);
      if ((long) (a0 ^ b0) < 0) res[0] = -res[0];
      flint_stack_release ();
    }
  else
    {
      /* FFT range: multiply straight into res.
         NOTE(review): this path has no temporary, so res must not alias
         a or b here — confirm callers. */
      if (sizea >= sizeb)
        mslimb = F_mpn_mul (res+1, a+1, sizea, b+1, sizeb);
      else
        mslimb = F_mpn_mul (res+1, b+1, sizeb, a+1, sizea);
      res[0] = sizea + sizeb - (mslimb == 0);
      if ((long) (a0 ^ b0) < 0) res[0] = -res[0];
    }
}
/* Multiplication in the curve's prime field: rp <- ap * bp reduced via
   the curve's reduce hook.  rp must provide 2*ecc->size limbs, since
   the full double-length product is formed before reduction. */
void
ecc_modp_mul (const struct ecc_curve *ecc, mp_limb_t *rp,
              const mp_limb_t *ap, const mp_limb_t *bp)
{
  /* Full product first, then the curve-specific reduction in place. */
  mpn_mul_n (rp, ap, bp, ecc->size);
  ecc->reduce (ecc, rp);
}
/* Compute in {x_new, m} a truncated inverse from the m most significant
   limbs of the n-limb inverse {xp, n} of {ap-original, n}: copy the high
   part and correct it upward until the residual B^(2m) - X*A is <= A.
   (Cleanup: removed the unused local `cy`.) */
void
mpn_invert_trunc (mp_ptr x_new, mp_size_t m, mp_srcptr xp, mp_size_t n,
                  mp_srcptr ap)
{
  mp_ptr tp;
  TMP_DECL;

  TMP_MARK;
  tp = TMP_ALLOC_LIMBS (2 * m);

  /* Start from the m high limbs of the full inverse. */
  MPN_COPY (x_new, xp + n - m, m);
  ap += (n - m);

  /* X * A plus the implicit leading limb of X (X = B^m + x_new).
     The carry of the second add is discarded; NOTE(review): assumed not
     to occur because X*A < B^(2m) for a truncated inverse — confirm. */
  mpn_mul_n (tp, x_new, ap, m);
  mpn_add_n (tp + m, tp + m, ap, m); /* A * msb(X) */

  /* now check B^(2m) - X*A <= A */
  mpn_not (tp, 2 * m);
  mpn_add_1 (tp, tp, 2 * m, 1); /* B^(2m) - X*A */

  /* Increment X while the residual still exceeds A. */
  while (tp[m] || mpn_cmp (tp, ap, m) > 0)
    {
      mpn_add_1 (x_new, x_new, m, 1);
      tp[m] -= mpn_sub_n (tp, tp, ap, m);
    }

  TMP_FREE;
}
/* Compute e^x by summing the Taylor series sum(x^n / n!) in fixed
   point until the term drops below the tolerance 2^(FRAC - TOL). */
static void
exp_mpn (mp1 ex, mp1 x)
{
  unsigned int n;
  mp1 xp;        /* current term x^n / n! */
  mp2 tmp;       /* double-width product scratch */
  mp_limb_t chk;
  mp1 tol;       /* convergence tolerance */

  memset (xp, 0, sizeof (mp1));
  memset (ex, 0, sizeof (mp1));
  /* xp := 1.0 in fixed point (the n = 0 term). */
  xp[FRAC / mpbpl] = (mp_limb_t)1 << FRAC % mpbpl;
  memset (tol, 0, sizeof (mp1));
  tol[(FRAC - TOL) / mpbpl] = (mp_limb_t)1 << (FRAC - TOL) % mpbpl;

  n = 0;
  do
    {
      /* Calculate sum(x^n/n!) until the next term is sufficiently small. */
      mpn_mul_n (tmp, xp, x, SZ);
      assert (tmp[SZ * 2 - 1] == 0);
      if (n > 0)
        /* Divide by n: turns x^(n-1)/(n-1)! * x into x^n/n!. */
        mpn_divmod_1 (xp, tmp + FRAC / mpbpl, SZ, n);
      chk = mpn_add_n (ex, ex, xp, SZ);
      assert (chk == 0);  /* the running sum must not overflow */
      ++n;
      assert (n < 80); /* Catch too-high TOL. */
    }
  while (n < 10 || mpn_cmp (xp, tol, SZ) >= 0);
}
/* Check that {xp, n} is a valid inverse of {ap, n}: verifies both
   X*A < B^(2n) and B^(2n) - X*A <= A.  Returns 1 on success, 0 on
   failure.  (Fix: the early `return 0` previously skipped TMP_FREE,
   leaking the TMP allocation on that path.) */
int
test_invert (mp_ptr xp, mp_srcptr ap, mp_size_t n)
{
  int res;
  mp_ptr tp, up;
  mp_limb_t cy;
  TMP_DECL;

  TMP_MARK;
  tp = TMP_ALLOC_LIMBS (2 * n);
  up = TMP_ALLOC_LIMBS (2 * n);

  /* first check X*A < B^(2*n) */
  mpn_mul_n (tp, xp, ap, n);
  cy = mpn_add_n (tp + n, tp + n, ap, n); /* A * msb(X) */
  if (cy != 0)
    res = 0;                 /* X*A overflowed B^(2n): not an inverse */
  else
    {
      /* now check B^(2n) - X*A <= A */
      mpn_com_n (tp, tp, 2 * n);
      mpn_add_1 (tp, tp, 2 * n, 1); /* B^(2n) - X*A */
      MPN_ZERO (up, 2 * n);
      MPN_COPY (up, ap, n);
      res = mpn_cmp (tp, up, 2 * n) <= 0;
    }

  TMP_FREE;                  /* always release TMP allocations */
  return res;
}
/* Calculate 2^x as e^(x * log 2). */
static void
exp2_mpn (mp1 ex, mp1 x)
{
  mp2 prod;

  /* Double-width fixed-point product x * log(2). */
  mpn_mul_n (prod, x, mp_log2, SZ);
  assert (prod[SZ * 2 - 1] == 0);
  /* Hand the realigned fixed-point value to the exp series. */
  exp_mpn (ex, prod + FRAC / mpbpl);
}
/* Build-time probe: reference mpn_sqr when the library provides it,
   otherwise fall back to mpn_mul_n.  Only meant to compile and link;
   never executed with real data. */
int
main ()
{
  mp_limb_t *dst = NULL, *src = NULL;
  long len = 0;

#ifdef mpn_sqr
  mpn_sqr (dst, src, len);
#else
  mpn_mul_n (dst, src, src, len);
#endif
  return 0;
}
/* Compute {ip, n} = floor((B^(2n) - 1) / D) - B^n for D = {dp, n},
   the inverse used by the division routines.  D must be normalised
   (high bit set); ip must not overlap dp or scratch, and scratch must
   provide mpn_invertappr_itch(n) limbs.
   (Cleanup: dropped the TMP_DECL/TMP_MARK/TMP_FREE machinery — nothing
   in this function allocates from TMP, so it was dead overhead.) */
void
mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (n == 1)
    invert_limb (*ip, *dp);
  else
    {
      if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD))
        {
          /* Maximum scratch needed by this branch: 2*n */
          mp_size_t i;
          mp_ptr xp;

          xp = scratch; /* 2 * n limbs */
          /* Build the dividend B^(2n) - D*B^n - 1: low half all ones,
             high half the one's complement of D. */
          for (i = n - 1; i >= 0; i--)
            xp[i] = GMP_NUMB_MAX;
          mpn_com (xp + n, dp, n);
          if (n == 2)
            {
              mpn_divrem_2 (ip, 0, xp, 4, dp);
            }
          else
            {
              gmp_pi1_t inv;
              invert_pi1 (inv, dp[n-1], dp[n-2]);
              /* FIXME: should we use dcpi1_div_q, for big sizes? */
              mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32);
            }
        }
      else
        {
          /* Use approximated inverse; correct the result if needed. */
          mp_limb_t e; /* The possible error in the approximate inverse */

          ASSERT (mpn_invert_itch (n) >= mpn_invertappr_itch (n));
          e = mpn_ni_invertappr (ip, dp, n, scratch);
          if (UNLIKELY (e))
            {
              /* Assume the error can only be "0" (no error) or "1". */
              /* Code to detect and correct the "off by one" approximation. */
              mpn_mul_n (scratch, ip, dp, n);
              ASSERT_NOCARRY (mpn_add_n (scratch + n, scratch + n, dp, n));
              if (! mpn_add (scratch, scratch, 2*n, dp, n))
                MPN_INCR_U (ip, n, 1); /* The value was wrong, correct it. */
            }
        }
    }
}
/* Compute {ip, n} = floor((B^(2n) - 1) / D) - B^n for D = {dp, n},
   the inverse used by the division routines.  D must be normalised
   (high bit set); ip must not overlap dp or scratch, and scratch must
   provide mpn_invertappr_itch(n) limbs. */
void
mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (n == 1)
    invert_limb (*ip, *dp);
  else if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD))
    {
      /* Maximum scratch needed by this branch: 2*n */
      mp_size_t i;
      mp_ptr xp;

      xp = scratch; /* 2 * n limbs */
      /* n > 1 here */
      /* Build the dividend B^(2n) - D*B^n - 1: low half all ones,
         high half the one's complement of D. */
      i = n;
      do
        xp[--i] = GMP_NUMB_MAX;
      while (i);
      mpn_com (xp + n, dp, n);
      if (n == 2)
        {
          mpn_divrem_2 (ip, 0, xp, 4, dp);
        }
      else
        {
          gmp_pi1_t inv;
          invert_pi1 (inv, dp[n-1], dp[n-2]);
          /* FIXME: should we use dcpi1_div_q, for big sizes? */
          mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32);
        }
    }
  else
    {
      /* Use approximated inverse; correct the result if needed. */
      mp_limb_t e; /* The possible error in the approximate inverse */

      ASSERT ( mpn_invert_itch (n) >= mpn_invertappr_itch (n) );
      e = mpn_ni_invertappr (ip, dp, n, scratch);
      if (UNLIKELY (e))
        {
          /* Assume the error can only be "0" (no error) or "1". */
          /* Code to detect and correct the "off by one" approximation. */
          mpn_mul_n (scratch, ip, dp, n);
          e = mpn_add_n (scratch, scratch, dp, n); /* FIXME: we only need e.*/
          if (LIKELY(e)) /* The high part can not give a carry by itself. */
            e = mpn_add_nc (scratch + n, scratch + n, dp, n, e); /* FIXME:e */
          /* If the value was wrong (no carry), correct it (increment). */
          e ^= CNST_LIMB (1);
          MPN_INCR_U (ip, n, e);
        }
    }
}
/* Inputs are {ap,rn} and {bp,rn}; output is {rp,rn}, computation is
   mod B^rn - 1, and values are semi-normalised; zero is represented
   as either 0 or B^n - 1.  Needs a scratch of 2rn limbs at tp.
   tp==rp is allowed. */
void
mpn_bc_mulmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
                    mp_ptr tp)
{
  mp_limb_t carry;

  ASSERT (0 < rn);

  /* Full 2rn-limb product, then fold: B^rn == 1 (mod B^rn - 1), so
     the high half is simply added onto the low half. */
  mpn_mul_n (tp, ap, bp, rn);
  carry = mpn_add_n (rp, tp, tp + rn, rn);
  /* If carry == 1, the value of rp is at most B^rn - 2, so there can
     be no overflow when adding in the carry. */
  MPN_INCR_U (rp, rn, carry);
}
/* Per-thread partial dot product: accumulate vect1[i]*vect2[i] over
   this thread's index range [VStart[Tid], VEnd[Tid]] into the
   (2*LIMBS+1)-limb accumulator temp_sum[Tid]. */
void
vectvectmul (vector *vect1, vector *vect2, int Tid)
{
  int idx;

  MPN_ZERO (temp_sum[Tid], 2 * LIMBS + 1);
  for (idx = VStart[Tid]; idx <= VEnd[Tid]; ++idx)
    {
      /* Double-length product of one coordinate pair, then add it into
         the wide accumulator (the extra limb absorbs carries). */
      mpn_mul_n (temp_prod[Tid], vect1[idx], vect2[idx], LIMBS);
      mpn_add (temp_sum[Tid], temp_sum[Tid], 2 * LIMBS + 1,
               temp_prod[Tid], 2 * LIMBS);
    }
}
/* c := a * b in the prime field: full 2t-limb product followed by
   division by the field prime, keeping only the remainder in c->data. */
static void fp_mul(element_ptr c, element_ptr a, element_ptr b) {
  fp_field_data_ptr p = c->field->data;
  size_t t = p->limbs;
  /* Stack scratch: 2t limbs for the product, t+1 for the quotient
     (computed but discarded).  NOTE(review): _alloca has no failure
     check; assumes field sizes keep t small — confirm. */
  mp_limb_t *tmp = _alloca(2 * t * sizeof(mp_limb_t));
  mp_limb_t *qp = _alloca((t + 1) * sizeof(mp_limb_t));

  mpn_mul_n(tmp, a->data, b->data, t);
  /* Remainder of the product modulo the prime goes straight into c. */
  mpn_tdiv_qr(qp, c->data, 0, tmp, 2 * t, p->primelimbs, t);
}
/* Inputs are {ap,rn+1} and {bp,rn+1}; output is {rp,rn+1}, in
   semi-normalised representation, computation is mod B^rn + 1.  Needs
   a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
   Output is normalised. */
static void
mpn_bc_mulmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
                    mp_ptr tp)
{
  mp_limb_t fold;

  ASSERT (0 < rn);

  /* Full product of the two (rn+1)-limb semi-normalised operands. */
  mpn_mul_n (tp, ap, bp, rn + 1);
  ASSERT (tp[2*rn+1] == 0);
  ASSERT (tp[2*rn] < GMP_NUMB_MAX);
  /* Fold using B^rn == -1 (mod B^rn + 1): low half minus high half,
     with the borrow and the top limb folded back in additively. */
  fold = tp[2*rn] + mpn_sub_n (rp, tp, tp + rn, rn);
  rp[rn] = 0;
  MPN_INCR_U (rp, rn + 1, fold);
}
void mpn_mullow_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) { if (BELOW_THRESHOLD (n, MULLOW_BASECASE_THRESHOLD)) { /* Allocate workspace of fixed size on stack: fast! */ mp_limb_t ws[MUL_BASECASE_ALLOC]; mpn_mul_basecase (ws, xp, n, yp, n); MPN_COPY (rp, ws, n); } else if (BELOW_THRESHOLD (n, MULLOW_DC_THRESHOLD)) { mpn_mullow_basecase (rp, xp, yp, n); } else if (BELOW_THRESHOLD (n, MULLOW_MUL_N_THRESHOLD)) { /* Divide-and-conquer */ mp_size_t n2 = n >> 1; /* floor(n/2) */ mp_size_t n1 = n - n2; /* ceil(n/2) */ mp_ptr tp; TMP_SDECL; TMP_SMARK; tp = TMP_SALLOC_LIMBS (n1); /* Split as x = x1 2^(n1 GMP_NUMB_BITS) + x0, y = y1 2^(n2 GMP_NUMB_BITS) + y0 */ /* x0 * y0 */ mpn_mul_n (rp, xp, yp, n2); if (n1 != n2) rp[2 * n2] = mpn_addmul_1 (rp + n2, yp, n2, xp[n2]); /* x1 * y0 * 2^(n1 GMP_NUMB_BITS) */ mpn_mullow_n (tp, xp + n1, yp, n2); mpn_add_n (rp + n1, rp + n1, tp, n2); /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */ mpn_mullow_n (tp, yp + n2, xp, n1); mpn_add_n (rp + n2, rp + n2, tp, n1); TMP_SFREE; } else {
/* Kronecker-substitution multiplication of nmod polynomials:
   out[0..len1+len2-2] = in1 * in2 with coefficients reduced mod `mod`.
   If bits == 0, the packing width is derived from the operands;
   otherwise the caller's precomputed width is used.  Squaring
   (in1 == in2) packs only once and uses mpn_mul_n.
   NOTE(review): the length bound uses FLINT_BIT_COUNT(len2), which
   assumes len2 is the shorter length — confirm callers pass
   len1 >= len2 >= 1. */
void
_nmod_poly_mul_KS (mp_ptr out, mp_srcptr in1, long len1,
                   mp_srcptr in2, long len2, mp_bitcnt_t bits, nmod_t mod)
{
  long len_out = len1 + len2 - 1, limbs1, limbs2;
  mp_ptr mpn1, mpn2, res;

  if (bits == 0)
    {
      mp_bitcnt_t bits1, bits2, loglen;

      /* Enough bits per packed coefficient that the convolution sums
         cannot overlap: max coefficient bits plus log2 of the length. */
      bits1 = _nmod_vec_max_bits (in1, len1);
      bits2 = (in1 == in2) ? bits1 : _nmod_vec_max_bits (in2, len2);
      loglen = FLINT_BIT_COUNT (len2);
      bits = bits1 + bits2 + loglen;
    }

  limbs1 = (len1 * bits - 1) / FLINT_BITS + 1;
  limbs2 = (len2 * bits - 1) / FLINT_BITS + 1;

  /* When squaring, pack a single operand and alias the other. */
  mpn1 = (mp_ptr) malloc (sizeof (mp_limb_t) * limbs1);
  mpn2 = (in1 == in2) ? mpn1 : (mp_ptr) malloc (sizeof (mp_limb_t) * limbs2);

  _nmod_poly_bit_pack (mpn1, in1, len1, bits);
  if (in1 != in2)
    _nmod_poly_bit_pack (mpn2, in2, len2, bits);

  res = (mp_ptr) malloc (sizeof (mp_limb_t) * (limbs1 + limbs2));

  if (in1 != in2)
    mpn_mul (res, mpn1, limbs1, mpn2, limbs2);
  else
    mpn_mul_n (res, mpn1, mpn1, limbs1);

  /* Unpack the big-integer product back into coefficients mod `mod`. */
  _nmod_poly_bit_unpack (out, len_out, res, bits, mod);

  /* mpn2 aliases mpn1 when squaring, so each buffer is freed once. */
  free (mpn2);
  if (in1 != in2)
    free (mpn1);
  free (res);
}
/* Divide-and-conquer division step: divide {np, 3n} by the 2n-limb
   divisor {dp, 2n}, producing an n-limb quotient at qp (plus the
   returned high quotient limb, 0 or 1) and leaving the remainder in
   {np, 2n}.  scratch must hold 2n limbs. */
static mp_limb_t
mpn_dc_div_3_by_2 (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
                   mp_ptr scratch)
{
  mp_size_t twon = n + n;
  mp_limb_t qhl, cc;

  /* Quotient estimate from the high parts: {np+n, 2n} / {dp+n, n}. */
  if (n < DIV_DC_THRESHOLD)
    qhl = mpn_sb_divrem_mn (qp, np + n, twon, dp + n, n);
  else
    qhl = mpn_dc_div_2_by_1 (qp, np + n, dp + n, n, scratch);

  /* Subtract q times the low half of the divisor; the estimate may be
     too large, leaving a borrow in cc. */
  mpn_mul_n (scratch, qp, dp, n);
  cc = mpn_sub_n (np, np, scratch, twon);
  if (qhl != 0)
    cc += mpn_sub_n (np + n, np + n, dp, n);

  /* Add the divisor back (decrementing q) until the remainder is
     nonnegative again. */
  while (cc != 0)
    {
      qhl -= mpn_sub_1 (qp, qp, n, (mp_limb_t) 1);
      cc -= mpn_add_n (np, np, dp, twon);
    }
  return qhl;
}
/* Test driver: checks mpn_mullow_n and mpn_mulhigh_n against the full
   product mpn_mul_n on random operands of sizes 1..99.
   (Cleanup: removed the many unused locals — bp, xn, b, zn, qpn, j, k,
   i, l, i1, k1, j1, i2, k2, j2 — left over from a template.) */
int
main (void)
{
  unsigned long n, c;
  mp_limb_t xp[1000], yp[1000], mp[1000], lp[1000], hp[1000];
  gmp_randstate_t rands;

  tests_start ();
  gmp_randinit_default (rands);

  for (n = 1; n < 100; n++)
    {
      for (c = 0; c < 10; c++)
        {
          mpn_randomb (xp, rands, n);
          mpn_randomb (yp, rands, n);

          mpn_mul_n (mp, xp, yp, n);     /* reference full product */
          mpn_mullow_n (lp, xp, yp, n);  /* low n limbs must match */
          mpn_mulhigh_n (hp, xp, yp, n); /* high n limbs must match */

          if (mpn_cmp (mp, lp, n) != 0)
            {
              printf ("mpn_mullow_n error %ld\n", n);
              abort ();
            }
          if (mpn_cmp (mp + n, hp + n, n) != 0)
            {
              printf ("mpn_mulhigh_n error %ld\n", n);
              abort ();
            }
        }
    }

  gmp_randclear (rands);
  tests_end ();
  exit (0);
}
/* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0]
   Requires that mp[n-1..0] is odd.  Requires that ep[en-1..0] is > 1.
   Uses scratch space tp[3n..0], i.e., 3n+1 words.
   Fixed-window Montgomery exponentiation intended to resist
   side-channel attacks (see the WANT_CACHE_SECURITY table scan). */
void
mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
              mp_srcptr ep, mp_size_t en,
              mp_srcptr mp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t mip[2];        /* negated inverse of mp mod B (1 or 2 limbs) */
  int cnt;
  long ebi;                /* number of significant exponent bits */
  int windowsize, this_windowsize;
  mp_limb_t expbits;
  mp_ptr pp, this_pp, last_pp;
  long i;
  int redc_x;              /* REDC variant selector (1 or 2) */
  TMP_DECL;

  ASSERT (en > 1 || (en == 1 && ep[0] > 1));
  ASSERT (n >= 1 && ((mp[0] & 1) != 0));

  TMP_MARK;

  count_leading_zeros (cnt, ep[en - 1]);
  ebi = en * GMP_LIMB_BITS - cnt;

  windowsize = win_size (ebi);

  if (BELOW_THRESHOLD (n, REDC_2_THRESHOLD))
    {
      binvert_limb (mip[0], mp[0]);
      mip[0] = -mip[0];
      redc_x = 1;
    }
#if defined (HAVE_NATIVE_mpn_addmul_2)
  else
    {
      mpn_binvert (mip, mp, 2, tp);
      mip[0] = -mip[0]; mip[1] = ~mip[1];
      redc_x = 2;
    }
#endif
  /* NOTE(review): if HAVE_NATIVE_mpn_addmul_2 is undefined and
     n >= REDC_2_THRESHOLD, redc_x is left uninitialized — confirm the
     build configuration rules this combination out. */
#if 0
  mpn_binvert (mip, mp, n, tp);
  redc_x = 0;
#endif

  pp = TMP_ALLOC_LIMBS (n << windowsize);

  this_pp = pp;
  this_pp[n] = 1;
  redcify (this_pp, this_pp + n, 1, mp, n);  /* pp[0] = 1 in Montgomery form */
  this_pp += n;
  redcify (this_pp, bp, bn, mp, n);          /* pp[1] = b in Montgomery form */

  /* Precompute powers of b and put them in the temporary area at pp. */
  for (i = (1 << windowsize) - 2; i > 0; i--)
    {
      last_pp = this_pp;
      this_pp += n;
      mpn_mul_n (tp, last_pp, pp + n, n);
      MPN_REDC_X (this_pp, tp, mp, n, mip);
    }

  /* Top window (possibly partial) seeds the result. */
  expbits = getbits (ep, ebi, windowsize);
  ebi -= windowsize;
  if (ebi < 0)
    ebi = 0;

  MPN_COPY (rp, pp + n * expbits, n);

  while (ebi != 0)
    {
      expbits = getbits (ep, ebi, windowsize);
      ebi -= windowsize;
      this_windowsize = windowsize;
      if (ebi < 0)
        {
          /* Last window may be shorter than windowsize bits. */
          this_windowsize += ebi;
          ebi = 0;
        }

      /* One squaring per window bit. */
      do
        {
          mpn_sqr_n (tp, rp, n);
          MPN_REDC_X (rp, tp, mp, n, mip);
          this_windowsize--;
        }
      while (this_windowsize != 0);

#if WANT_CACHE_SECURITY
      /* Scan the whole table so the memory access pattern does not
         depend on the secret window value. */
      mpn_tabselect (tp + 2*n, pp, n, 1 << windowsize, expbits);
      mpn_mul_n (tp, rp, tp + 2*n, n);
#else
      mpn_mul_n (tp, rp, pp + n * expbits, n);
#endif
      MPN_REDC_X (rp, tp, mp, n, mip);
    }

  /* Convert out of Montgomery form and reduce into [0, m). */
  MPN_COPY (tp, rp, n);
  MPN_ZERO (tp + n, n);
  MPN_REDC_X (rp, tp, mp, n, mip);
  if (mpn_cmp (rp, mp, n) >= 0)
    mpn_sub_n (rp, rp, mp, n);
  TMP_FREE;
}
/* Computes an approximate quotient of { np, 2*dn } by { dp, dn } which
   is either correct or one too large.  We require dp to be normalised
   and inv to be a precomputed inverse given by mpn_invert.  Returns the
   high quotient limb (0 or 1); the top half of np is used as workspace. */
mp_limb_t
mpn_inv_divappr_q_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t dn,
                     mp_srcptr inv)
{
  mp_limb_t cy, lo, ret = 0, ret2 = 0;
  mp_ptr tp;
  TMP_DECL;

  TMP_MARK;

  ASSERT (dp[dn-1] & GMP_LIMB_HIGHBIT);
  ASSERT (mpn_is_invert (inv, dp, dn));

  /* Reduce the top half below D; the stripped quotient bit is ret2. */
  if (mpn_cmp (np + dn, dp, dn) >= 0)
    {
      ret2 = 1;
      mpn_sub_n (np + dn, np + dn, dp, dn);
    }

  tp = TMP_ALLOC_LIMBS (2*dn + 1);
  /* Estimate q from N' * (B^dn + inv) / B^(dn+1) using the top dn+1
     limbs of N; the "+ 1" in the mpn_add_1 below accounts for the
     implicit B^dn term so the estimate only errs on the high side. */
  mpn_mul (tp, np + dn - 1, dn + 1, inv, dn);
  add_ssaaaa (cy, lo, 0, np[dn - 1], 0, tp[dn]);
  ret += mpn_add_n (qp, tp + dn + 1, np + dn, dn);
  ret += mpn_add_1 (qp, qp, dn, cy + 1);

  /* Let X = B^dn + inv, D = { dp, dn }, N = { np, 2*dn }, then
     DX < B^{2*dn} <= D(X+1), thus
     Let N' = { np + n - 1, n + 1 }
     N'X/B^{dn+1} < B^{dn-1}N'/D <= N'X/B^{dn+1} + N'/B^{dn+1} < N'X/B^{dn+1} + 1
     N'X/B^{dn+1} < N/D <= N'X/B^{dn+1} + 1 + 2/B
     There is either one integer in this range, or two.  However, in the
     latter case the left hand bound is either an integer or < 2/B below
     one. */

  if (UNLIKELY (ret == 1))
    {
      ret -= mpn_sub_1 (qp, qp, dn, 1);
      ASSERT (ret == 0);
    }

  if (UNLIKELY ((lo == ~CNST_LIMB(0)) || (lo == ~CNST_LIMB(1))))
    {
      /* Special case, multiply out to get accurate quotient */
      ret -= mpn_sub_1 (qp, qp, dn, 1);
      if (UNLIKELY (ret == ~CNST_LIMB(0)))
        ret += mpn_add_1 (qp, qp, dn, 1);
      /* ret is now guaranteed to be 0 */
      ASSERT (ret == 0);
      mpn_mul_n (tp, qp, dp, dn);
      mpn_sub_n (tp, np, tp, dn+1);
      /* Increment q while the remainder still exceeds D. */
      while (tp[dn] || mpn_cmp (tp, dp, dn) >= 0)
        {
          ret += mpn_add_1 (qp, qp, dn, 1);
          tp[dn] -= mpn_sub_n (tp, tp, dp, dn);
        }
      /* Not possible for ret == 2 as we have qp*dp <= np */
      ASSERT (ret + ret2 < 2);
    }

  TMP_FREE;
  return ret + ret2;
}
/* c := a^2 over FP_DIGS limbs; c receives the full double-length
   square (no modular reduction here). */
void
fp_sqrn_low (dig_t *c, const dig_t *a)
{
  /* The generic balanced multiply handles the a == a squaring case. */
  mpn_mul_n (c, a, a, FP_DIGS);
}
/* Computes the quotient and remainder of { np, 2*dn } by { dp, dn }.
   We require dp to be normalised and inv to be a precomputed inverse
   of { dp, dn } given by mpn_invert.  Returns the high quotient limb
   (0 or 1); the remainder is left in { np, dn }.
   (Cleanup: removed the unused local `i`; renamed the inner FFT-branch
   carries so they no longer shadow the outer `cy`.) */
mp_limb_t
mpn_inv_div_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t dn,
                  mp_srcptr inv)
{
  mp_limb_t cy, lo, ret = 0, ret2 = 0;
  mp_size_t m;
  mp_ptr tp;
  TMP_DECL;

  TMP_MARK;

  ASSERT (mpn_is_invert (inv, dp, dn));

  /* Reduce the top half below D; the stripped quotient bit is ret2. */
  if (mpn_cmp (np + dn, dp, dn) >= 0)
    {
      ret2 = 1;
      mpn_sub_n (np + dn, np + dn, dp, dn);
    }

  tp = TMP_ALLOC_LIMBS (2*dn + 1);
  /* Quotient estimate from the precomputed inverse. */
  mpn_mul (tp, np + dn - 1, dn + 1, inv, dn);
  add_ssaaaa (cy, lo, 0, np[dn - 1], 0, tp[dn]);
  ret += mpn_add_n (qp, tp + dn + 1, np + dn, dn);
  ret += mpn_add_1 (qp, qp, dn, cy);

  /* Let X = B^dn + inv, D = { dp, dn }, N = { np, 2*dn }, then
     DX < B^{2*dn} <= D(X+1), thus
     Let N' = { np + n - 1, n + 1 }
     N'X/B^{dn+1} < B^{dn-1}N'/D <= N'X/B^{dn+1} + N'/B^{dn+1} < N'X/B^{dn+1} + 1
     N'X/B^{dn+1} < N/D <= N'X/B^{dn+1} + 1 + 2/B
     There is either one integer in this range, or two.  However, in the
     latter case the left hand bound is either an integer or < 2/B below
     one. */

  if (UNLIKELY (ret == 1))
    {
      ret -= mpn_sub_1 (qp, qp, dn, 1);
      ASSERT (ret == 0);
    }

  /* Take the lower candidate; the final loop below adjusts upward. */
  ret -= mpn_sub_1 (qp, qp, dn, 1);
  if (UNLIKELY (ret == ~CNST_LIMB(0)))
    ret += mpn_add_1 (qp, qp, dn, 1);

  /* ret is now guaranteed to be 0 or 1 */
  ASSERT (ret == 0);

  m = dn + 1;

  if ((dn <= MPN_FFT_MUL_N_MINSIZE) || (ret))
    {
      /* Plain full product q * d. */
      mpn_mul_n (tp, qp, dp, dn);
    }
  else
    {
      /* Compute q * d mod (B^m + 1) by FFT and reconstruct the needed
         low part of the product from the known high limbs of N. */
      mp_limb_t fft_cy, add_cy;

      if (m >= FFT_MULMOD_2EXPP1_CUTOFF)
        m = mpir_fft_adjust_limbs (m);
      fft_cy = mpn_mulmod_Bexpp1_fft (tp, m, qp, dn, dp, dn);
      /* fft_cy, {tp, m} = qp * dp mod (B^m+1) */
      (void) fft_cy;
      add_cy = mpn_add_n (tp, tp, np + m, 2*dn - m);
      mpn_add_1 (tp + 2*dn - m, tp + 2*dn - m, 2*m - 2*dn, add_cy);

      /* Make correction */
      mpn_sub_1 (tp, tp, m, tp[0] - dp[0]*qp[0]);
    }

  mpn_sub_n (np, np, tp, m);
  MPN_ZERO (np + m, 2*dn - m);

  /* Final adjustment: while the remainder is >= D, increment q. */
  while (np[dn] || mpn_cmp (np, dp, dn) >= 0)
    {
      ret += mpn_add_1 (qp, qp, dn, 1);
      np[dn] -= mpn_sub_n (np, np, dp, dn);
    }

  /* Not possible for ret == 2 as we have qp*dp <= np */
  ASSERT (ret + ret2 < 2);

  TMP_FREE;
  return ret + ret2;
}
/* c := a * b: full 2*size-limb product of two size-limb operands. */
void
bn_muln_low (dig_t *c, const dig_t *a, const dig_t *b, int size)
{
  /* Delegate straight to the balanced low-level multiply. */
  mpn_mul_n (c, a, b, size);
}
/* Evaluate the exponential Taylor series sum_{k<N} x^k / k! of a
   fixed-point x (xn limbs), writing xn+1 limbs to y (integer part in
   y[xn]) and a ulp error bound to error[0].  Uses rectangular
   splitting with the precomputed factorial tables.
   NOTE(review): assumes the fixed-point x represents a value < 1 —
   confirm with callers. */
void
_arb_exp_taylor_rs (mp_ptr y, mp_limb_t * error,
                    mp_srcptr x, mp_size_t xn, ulong N)
{
  mp_ptr s, t, xpow;
  mp_limb_t new_denom, old_denom, c;
  slong power, k, m;
  TMP_INIT;

  TMP_START;

  if (N >= FACTORIAL_TAB_SIZE - 1)
    {
      flint_printf ("_arb_exp_taylor_rs: N too large!\n");
      abort ();
    }

  if (N <= 3)
    {
      /* Tiny series evaluated directly. */
      if (N <= 1)
        {
          /* 0 or 1 terms: result is exactly 0 or 1. */
          flint_mpn_zero (y, xn);
          y[xn] = N;
          error[0] = 0;
        }
      else if (N == 2)
        {
          /* 1 + x, exact. */
          flint_mpn_copyi (y, x, xn);
          y[xn] = 1;
          error[0] = 0;
        }
      else
        {
          /* 1 + x + x^2 / 2 */
          t = TMP_ALLOC_LIMBS (2 * xn);
          mpn_sqr (t, x, xn);
          mpn_rshift (t + xn, t + xn, xn, 1);
          y[xn] = mpn_add_n (y, x, t + xn, xn) + 1;
          error[0] = 2;
        }
    }
  else
    {
      /* Choose m ~= sqrt(num_terms) (m must be even, >= 2) */
      /* TODO: drop evenness assumption since we don't have sign issues here? */
      /* TODO: then just need to fix power construction below... */
      m = 2;
      while (m * m < N)
        m += 2;

      /* todo: merge allocations */
      xpow = TMP_ALLOC_LIMBS ((m + 1) * xn);
      s = TMP_ALLOC_LIMBS (xn + 2);
      t = TMP_ALLOC_LIMBS (2 * xn + 2); /* todo: 1 limb too much? */

      /* higher index --->  */
      /*        |   ---xn---  |
         xpow = |   <temp>    | x^m | x^(m-1) | ... | x^2 | x | */

#define XPOW_WRITE(__k) (xpow + (m - (__k)) * xn)
#define XPOW_READ(__k)  (xpow + (m - (__k) + 1) * xn)

      /* Build x, x^2, ..., x^m with one sqr/mul per pair of entries. */
      flint_mpn_copyi (XPOW_READ(1), x, xn);
      mpn_sqr (XPOW_WRITE(2), XPOW_READ(1), xn);

      for (k = 4; k <= m; k += 2)
        {
          mpn_mul_n (XPOW_WRITE(k - 1), XPOW_READ(k / 2), XPOW_READ(k / 2 - 1), xn);
          mpn_sqr (XPOW_WRITE(k), XPOW_READ(k / 2), xn);
        }

      flint_mpn_zero (s, xn + 1);

      /* todo: skip one nonscalar multiplication (use x^m)
         when starting on x^0 */
      power = (N - 1) % m;

      /* Horner evaluation in x^m, with scalar addmuls for the inner
         polynomial coefficients (rectangular splitting). */
      for (k = N - 1; k >= 0; k--)
        {
          c = factorial_tab_numer[k];
          new_denom = factorial_tab_denom[k];
          old_denom = factorial_tab_denom[k+1];

          /* change denominators */
          if (new_denom != old_denom && k < N - 1)
            {
              mpn_divrem_1 (s, 0, s, xn + 1, old_denom);
            }

          if (power == 0)
            {
              /* add c * x^0 -- only top limb is affected */
              s[xn] += c;

              /* Outer polynomial evaluation: multiply by x^m */
              if (k != 0)
                {
                  mpn_mul (t, s, xn + 1, XPOW_READ(m), xn);
                  flint_mpn_copyi (s, t + xn, xn + 1);
                }

              power = m - 1;
            }
          else
            {
              s[xn] += mpn_addmul_1 (s, XPOW_READ(power), xn, c);
              power--;
            }
        }

      /* finally divide by denominator */
      mpn_divrem_1 (y, 0, s, xn + 1, factorial_tab_denom[0]);

      /* error bound (ulp) */
      error[0] = 2;
    }

  TMP_END;
}
void _arb_atan_taylor_naive(mp_ptr y, mp_limb_t * error, mp_srcptr x, mp_size_t xn, ulong N, int alternating) { ulong k; mp_ptr s, t, x1, x2, u; mp_size_t nn = xn + 1; if (N == 0) { flint_mpn_zero(y, xn); error[0] = 0; return; } if (N == 1) { flint_mpn_copyi(y, x, xn); error[0] = 0; } s = flint_malloc(sizeof(mp_limb_t) * nn); t = flint_malloc(sizeof(mp_limb_t) * nn); u = flint_malloc(sizeof(mp_limb_t) * 2 * nn); x1 = flint_malloc(sizeof(mp_limb_t) * nn); x2 = flint_malloc(sizeof(mp_limb_t) * nn); flint_mpn_zero(s, nn); flint_mpn_zero(t, nn); flint_mpn_zero(u, 2 * nn); flint_mpn_zero(x1, nn); flint_mpn_zero(x2, nn); /* x1 = x */ flint_mpn_copyi(x1 + 1, x, xn); /* x2 = x * x */ mpn_mul_n(u, x1, x1, nn); flint_mpn_copyi(x2, u + nn, nn); /* s = t = x */ flint_mpn_copyi(s, x1, nn); flint_mpn_copyi(t, x1, nn); for (k = 1; k < N; k++) { /* t = t * x2 */ mpn_mul_n(u, t, x2, nn); flint_mpn_copyi(t, u + nn, nn); /* u = t / (2k+1) */ mpn_divrem_1(u, 0, t, nn, 2 * k + 1); if (alternating & k) mpn_sub_n(s, s, u, nn); else mpn_add_n(s, s, u, nn); } flint_mpn_copyi(y, s + 1, xn); error[0] = 2; flint_free(s); flint_free(t); flint_free(u); flint_free(x1); flint_free(x2); }
/* ret + (xp, n) = (yp, n)*(zp, n) % 2^b + 1
   needs (tp, 2n) temp space, everything reduced mod 2^b
   inputs, outputs are fully reduced
   N.B: 2n is not the same as 2b rounded up to nearest limb! */
inline static int
mpn_mulmod_2expp1_internal (mp_ptr xp, mp_srcptr yp, mp_srcptr zp,
                            mpir_ui b, mp_ptr tp)
{
  mp_size_t n, k;
  mp_limb_t c;
  TMP_DECL;

  n = BITS_TO_LIMBS (b);
  k = GMP_NUMB_BITS * n - b;  /* number of unused bits in the top limb */

  ASSERT (b > 0);
  ASSERT (n > 0);
  ASSERT_MPN (yp, n);
  ASSERT_MPN (zp, n);
  ASSERT (!MPN_OVERLAP_P (tp, 2 * n, yp, n));
  ASSERT (!MPN_OVERLAP_P (tp, 2 * n, zp, n));
  ASSERT (MPN_SAME_OR_SEPARATE_P (xp, tp, n));
  ASSERT (MPN_SAME_OR_SEPARATE_P (xp, tp + n, n));
  ASSERT (k == 0 || yp[n - 1] >> (GMP_NUMB_BITS - k) == 0);
  ASSERT (k == 0 || zp[n - 1] >> (GMP_NUMB_BITS - k) == 0);

#ifndef TUNE_PROGRAM_BUILD
  /* Whole-limb modulus of FFT-friendly size: use the FFT mulmod. */
  if (k == 0 && n > FFT_MULMOD_2EXPP1_CUTOFF && n == mpir_fft_adjust_limbs (n))
    {
      mp_bitcnt_t depth1, depth = 1;
      mp_size_t w1, off;
      mp_ptr tx, ty, tz;
      mp_limb_t ret;

      TMP_MARK;

      tx = TMP_BALLOC_LIMBS (3*n + 3);
      ty = tx + n + 1;
      tz = ty + n + 1;

      MPN_COPY (ty, yp, n);
      MPN_COPY (tz, zp, n);
      ty[n] = 0;
      tz[n] = 0;

      /* Choose FFT depth and wordsize from the tuned table. */
      while ((((mp_limb_t)1)<<depth) < b) depth++;

      if (depth < 12) off = mulmod_2expp1_table_n[0];
      else off = mulmod_2expp1_table_n[MIN(depth, FFT_N_NUM + 11) - 12];
      depth1 = depth/2 - off;

      w1 = b/(((mp_limb_t)1)<<(2*depth1));

      mpir_fft_mulmod_2expp1 (tx, ty, tz, n, depth1, w1);

      MPN_COPY (xp, tx, n);
      ret = tx[n];

      TMP_FREE;

      return ret;
    }
#endif

  if (yp == zp)
    mpn_sqr (tp, yp, n);
  else
    mpn_mul_n (tp, yp, zp, n);

  if (k == 0)
    {
      /* Whole-limb case: hi*B^n == -hi (mod B^n + 1), so subtract the
         high half from the low half and fold the borrow back in. */
      c = mpn_sub_n (xp, tp, tp + n, n);

      return mpn_add_1 (xp, xp, n, c);
    }

  /* Partial top limb: the product must be split at bit b, not at a
     limb boundary, so shift the high part left by k before subtracting. */
  c = tp[n - 1];
  tp[n - 1] &= GMP_NUMB_MASK >> k;

#if HAVE_NATIVE_mpn_sublsh_nc
  c = mpn_sublsh_nc (xp, tp, tp + n, n, k, c);
#else
  {
    mp_limb_t c1;
    c1 = mpn_lshift (tp + n, tp + n, n, k);
    tp[n] |= c >> (GMP_NUMB_BITS - k);
    c = mpn_sub_n (xp, tp, tp + n, n) + c1;
  }
#endif

  c = mpn_add_1 (xp, xp, n, c);
  xp[n - 1] &= GMP_NUMB_MASK >> k;

  return c;
}
/* Evaluate sin and/or cos Taylor series of a fixed-point x (xn limbs)
   with N terms each, by rectangular splitting in x^2.  ysin (and ycos
   unless sinonly is set) receive xn limbs; error[0] gets a ulp bound.
   When `alternating` is set the term signs alternate (circular
   functions); otherwise all terms are positive — NOTE(review):
   presumably the hyperbolic variant, confirm with callers. */
void
_arb_sin_cos_taylor_rs (mp_ptr ysin, mp_ptr ycos, mp_limb_t * error,
                        mp_srcptr x, mp_size_t xn, ulong N,
                        int sinonly, int alternating)
{
  mp_ptr s, t, xpow;
  mp_limb_t new_denom, old_denom, c;
  slong power, k, m;
  int cosorsin;  /* 0 = cos pass, 1 = sin pass */
  TMP_INIT;

  TMP_START;

  if (2 * N >= FACTORIAL_TAB_SIZE - 1)
    {
      flint_printf ("_arb_sin_cos_taylor_rs: N too large!\n");
      abort ();
    }

  if (N <= 1)
    {
      if (N == 0)
        {
          flint_mpn_zero (ysin, xn);
          if (!sinonly)
            flint_mpn_zero (ycos, xn);
          error[0] = 0;
        }
      else if (N == 1)
        {
          /* One term: sin ~= x; cos ~= 1, stored as all ones, i.e.
             1 ulp below 1 to keep the value strictly below 1. */
          flint_mpn_copyi (ysin, x, xn);
          if (!sinonly)
            flint_mpn_store (ycos, xn, LIMB_ONES);
          error[0] = 1;
        }
    }
  else
    {
      /* Choose m ~= sqrt(num_terms) (m must be even, >= 2) */
      m = 2;
      while (m * m < N)
        m += 2;

      /* todo: merge allocations */
      xpow = TMP_ALLOC_LIMBS ((m + 1) * xn);
      s = TMP_ALLOC_LIMBS (xn + 2);
      t = TMP_ALLOC_LIMBS (2 * xn + 2); /* todo: 1 limb too much? */

      /* higher index --->  */
      /*        |   ---xn---  |
         xpow = |   <temp>    | x^m | x^(m-1) | ... | x^2 | x | */

#define XPOW_WRITE(__k) (xpow + (m - (__k)) * xn)
#define XPOW_READ(__k)  (xpow + (m - (__k) + 1) * xn)

      /* Table holds powers of x^2, the series variable. */
      mpn_sqr (XPOW_WRITE(1), x, xn);
      mpn_sqr (XPOW_WRITE(2), XPOW_READ(1), xn);

      for (k = 4; k <= m; k += 2)
        {
          mpn_mul_n (XPOW_WRITE(k - 1), XPOW_READ(k / 2), XPOW_READ(k / 2 - 1), xn);
          mpn_sqr (XPOW_WRITE(k), XPOW_READ(k / 2), xn);
        }

      for (cosorsin = sinonly; cosorsin < 2; cosorsin++)
        {
          flint_mpn_zero (s, xn + 1);

          /* todo: skip one nonscalar multiplication (use x^m)
             when starting on x^0 */
          power = (N - 1) % m;

          for (k = N - 1; k >= 0; k--)
            {
              c = factorial_tab_numer[2 * k + cosorsin];
              new_denom = factorial_tab_denom[2 * k + cosorsin];
              old_denom = factorial_tab_denom[2 * k + cosorsin + 2];

              /* change denominators */
              if (new_denom != old_denom && k < N - 1)
                {
                  /* For negative partial sums, bias the top limb before
                     the unsigned division, then undo the bias. */
                  if (alternating && (k % 2 == 0))
                    s[xn] += old_denom;

                  mpn_divrem_1 (s, 0, s, xn + 1, old_denom);

                  if (alternating && (k % 2 == 0))
                    s[xn] -= 1;
                }

              if (power == 0)
                {
                  /* add c * x^0 -- only top limb is affected */
                  if (alternating & k)
                    s[xn] -= c;
                  else
                    s[xn] += c;

                  /* Outer polynomial evaluation: multiply by x^m */
                  if (k != 0)
                    {
                      mpn_mul (t, s, xn + 1, XPOW_READ(m), xn);
                      flint_mpn_copyi (s, t + xn, xn + 1);
                    }

                  power = m - 1;
                }
              else
                {
                  if (alternating & k)
                    s[xn] -= mpn_submul_1 (s, XPOW_READ(power), xn, c);
                  else
                    s[xn] += mpn_addmul_1 (s, XPOW_READ(power), xn, c);

                  power--;
                }
            }

          /* finally divide by denominator */
          if (cosorsin == 0)
            {
              mpn_divrem_1 (t, 0, s, xn + 1, factorial_tab_denom[0]);

              /* perturb down to a number < 1 if necessary. note that
                 this does not invalidate the error bound: 1 - ulp is
                 either 1 ulp too small or must be closer to the exact
                 value */
              if (t[xn] == 0)
                flint_mpn_copyi (ycos, t, xn);
              else
                flint_mpn_store (ycos, xn, LIMB_ONES);
            }
          else
            {
              mpn_divrem_1 (s, 0, s, xn + 1, factorial_tab_denom[0]);
              /* Multiply the even-power sum by x to get the sin series. */
              mpn_mul (t, s, xn + 1, x, xn);
              flint_mpn_copyi (ysin, t + xn, xn);
            }
        }

      /* error bound (ulp) */
      error[0] = 2;
    }

  TMP_END;
}