예제 #1
0
파일: invert.c 프로젝트: Macaulay2/mpir
void mpn_invert_trunc(mp_ptr x_new, mp_size_t m, mp_srcptr xp, mp_size_t n, mp_srcptr ap)
{
  mp_ptr tp;
  mp_limb_t cy;
  TMP_DECL;

  TMP_MARK;
  tp = TMP_ALLOC_LIMBS (2 * m);
  
  MPN_COPY(x_new, xp + n - m, m);
  ap += (n - m);

  mpn_mul_n (tp, x_new, ap, m);
  mpn_add_n (tp + m, tp + m, ap, m); /* A * msb(X) */
  
  /* now check B^(2n) - X*A <= A */
  mpn_not (tp, 2 * m);
  mpn_add_1 (tp, tp, 2 * m, 1); /* B^(2m) - X*A */
  
  while (tp[m] || mpn_cmp (tp, ap, m) > 0)
  {
     mpn_add_1(x_new, x_new, m, 1);
     tp[m] -= mpn_sub_n(tp, tp, ap, m);
  }
  TMP_FREE;
}
예제 #2
0
static mp_limb_t
mpn_dc_div_2_by_1 (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  mp_limb_t qhl, cc;
  mp_size_t n2 = n/2;

  if (n % 2 != 0)
    {
      mp_ptr qp1 = qp + 1;
      qhl = mpn_dc_div_3_by_2 (qp1 + n2, np + 2 + n2, dp + 1, n2, scratch);
      qhl += mpn_add_1 (qp1 + n2, qp1 + n2, n2,
			mpn_dc_div_3_by_2 (qp1, np + 2, dp + 1, n2, scratch));

      cc = mpn_submul_1 (np + 1, qp1, n - 1, dp[0]);
      cc = mpn_sub_1 (np + n, np + n, 1, cc);
      if (qhl != 0)
	cc += mpn_sub_1 (np + n, np + n, 1, dp[0]);
      while (cc != 0)
	{
	  qhl -= mpn_sub_1 (qp1, qp1, n - 1, (mp_limb_t) 1);
	  cc -= mpn_add_n (np + 1, np + 1, dp, n);
	}
      qhl += mpn_add_1 (qp1, qp1, n - 1,
			mpn_sb_divrem_mn (qp, np, n + 1, dp, n));
    }
  else
    {
      qhl = mpn_dc_div_3_by_2 (qp + n2, np + n2, dp, n2, scratch);
      qhl += mpn_add_1 (qp + n2, qp + n2, n2,
			mpn_dc_div_3_by_2 (qp, np, dp, n2, scratch));
    }
  return qhl;
}
예제 #3
0
void bn_sqra_low(dig_t *c, const dig_t *a, int size) {
	dig_t carry, digit = *a;

	carry = mpn_addmul_1(c, a, size, digit);
	mpn_add_1(c + size, c + size, size, carry);
	if (size > 1) {
		carry = mpn_addmul_1(c + 1, a + 1, size - 1, digit);
		mpn_add_1(c + size, c + size, size, carry);
	}
}
예제 #4
0
int
test_invert (mp_ptr xp, mp_srcptr ap, mp_size_t n)
{
  int res = 1;
  mp_size_t i;
  mp_ptr tp, up;
  mp_limb_t cy;
  TMP_DECL;

  TMP_MARK;
  tp = TMP_ALLOC_LIMBS (2 * n);
  up = TMP_ALLOC_LIMBS (2 * n);

  /* first check X*A < B^(2*n) */
  mpn_mul_n (tp, xp, ap, n);
  cy = mpn_add_n (tp + n, tp + n, ap, n); /* A * msb(X) */
  if (cy != 0)
    return 0;

  /* now check B^(2n) - X*A <= A */
  mpn_com_n (tp, tp, 2 * n);
  mpn_add_1 (tp, tp, 2 * n, 1); /* B^(2n) - X*A */
  MPN_ZERO (up, 2 * n);
  MPN_COPY (up, ap, n);
  res = mpn_cmp (tp, up, 2 * n) <= 0;
  TMP_FREE;
  return res;
}
예제 #5
0
파일: mulders.c 프로젝트: SESA/EbbRT-mpfr
/* Put in  rp[n..2n-1] an approximation of the n high limbs
   of {np, n}^2. The error is less than n ulps of rp[n]. */
void
mpfr_sqrhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mp_size_t n)
{
  mp_size_t k;

  MPFR_ASSERTN (MPFR_SQRHIGH_TAB_SIZE > 2); /* ensures k < n */
  k = MPFR_LIKELY (n < MPFR_SQRHIGH_TAB_SIZE) ? sqrhigh_ktab[n]
    : (n+4)/2; /* ensures that k >= (n+3)/2 */
  MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
  if (k < 0)
    /* we can't use mpn_sqr_basecase here, since it requires
       n <= SQR_KARATSUBA_THRESHOLD, where SQR_KARATSUBA_THRESHOLD
       is not exported by GMP */
    mpn_sqr_n (rp, np, n);
  else if (k == 0)
    mpfr_mulhigh_n_basecase (rp, np, np, n);
  else
    {
      mp_size_t l = n - k;
      mp_limb_t cy;

      mpn_sqr_n (rp + 2 * l, np + l, k);          /* fills rp[2l..2n-1] */
      mpfr_mulhigh_n (rp, np, np + k, l);         /* fills rp[l-1..2l-1] */
      /* {rp+n-1,l+1} += 2 * {rp+l-1,l+1} */
      cy = mpn_lshift (rp + l - 1, rp + l - 1, l + 1, 1);
      cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
      mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
    }
}
예제 #6
0
파일: add_one_ulp.c 프로젝트: mahdiz/mpclib
/* sets x to x+sign(x)*ulp(x) */
int
mpfr_add_one_ulp (mpfr_ptr x, mp_rnd_t rnd_mode)
{
  mp_size_t xn;
  int sh;
  mp_limb_t *xp;

  if (MPFR_IS_NAN(x))
    MPFR_RET_NAN;

  if (MPFR_IS_INF(x) || MPFR_IS_ZERO(x))
    return 0;

  xn = 1 + (MPFR_PREC(x) - 1) / BITS_PER_MP_LIMB;
  sh = xn * BITS_PER_MP_LIMB - MPFR_PREC(x);
  xp = MPFR_MANT(x);
  if (mpn_add_1 (xp, xp, xn, MP_LIMB_T_ONE << sh)) /* got 1.0000... */
    {
      mp_exp_t exp = MPFR_EXP(x);
      if (exp == __mpfr_emax)
        return mpfr_set_overflow(x, rnd_mode, MPFR_SIGN(x));
      else
        {
          MPFR_EXP(x)++;
          xp[xn-1] = GMP_LIMB_HIGHBIT;
        }
    }
  return 0;
}
예제 #7
0
REGPARM_ATTR (1) static void
cfdiv_q_2exp (mpz_ptr w, mpz_srcptr u, mp_bitcnt_t cnt, int dir)
{
  mp_size_t  wsize, usize, abs_usize, limb_cnt, i;
  mp_srcptr  up;
  mp_ptr     wp;
  mp_limb_t  round, rmask;

  usize = SIZ (u);
  abs_usize = ABS (usize);
  limb_cnt = cnt / GMP_NUMB_BITS;
  wsize = abs_usize - limb_cnt;
  if (wsize <= 0)
    {
      /* u < 2**cnt, so result 1, 0 or -1 according to rounding */
      PTR(w)[0] = 1;
      SIZ(w) = (usize == 0 || (usize ^ dir) < 0 ? 0 : dir);
      return;
    }

  /* +1 limb to allow for mpn_add_1 below */
  MPZ_REALLOC (w, wsize+1);

  /* Check for rounding if direction matches u sign.
     Set round if we're skipping non-zero limbs.  */
  up = PTR(u);
  round = 0;
  rmask = ((usize ^ dir) >= 0 ? MP_LIMB_T_MAX : 0);
  if (rmask != 0)
    for (i = 0; i < limb_cnt && round == 0; i++)
      round = up[i];

  wp = PTR(w);
  cnt %= GMP_NUMB_BITS;
  if (cnt != 0)
    {
      round |= rmask & mpn_rshift (wp, up + limb_cnt, wsize, cnt);
      wsize -= (wp[wsize - 1] == 0);
    }
  else
    MPN_COPY_INCR (wp, up + limb_cnt, wsize);

  if (round != 0)
    {
      if (wsize != 0)
	{
	  mp_limb_t cy;
	  cy = mpn_add_1 (wp, wp, wsize, CNST_LIMB(1));
	  wp[wsize] = cy;
	  wsize += cy;
	}
      else
	{
	  /* We shifted something to zero.  */
	  wp[0] = 1;
	  wsize = 1;
	}
    }
  SIZ(w) = (usize >= 0 ? wsize : -wsize);
}
예제 #8
0
파일: mulders.c 프로젝트: SESA/EbbRT-mpfr
/* Put in  rp[n..2n-1] an approximation of the n high limbs
   of {np, n} * {mp, n}. The error is less than n ulps of rp[n] (and the
   approximation is always less or equal to the truncated full product).

   Implements Algorithm ShortMul from [1].
*/
void
mpfr_mulhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
                mp_size_t n)
{
  mp_size_t k;

  MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
  k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
  /* Algorithm ShortMul from [1] requires k >= (n+3)/2, which translates
     into k >= (n+4)/2 in the C language. */
  MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
  if (k < 0)
    mpn_mul_basecase (rp, np, n, mp, n); /* result is exact, no error */
  else if (k == 0)
    mpfr_mulhigh_n_basecase (rp, np, mp, n); /* basecase error < n ulps */
  else if (n > MUL_FFT_THRESHOLD)
    mpn_mul_n (rp, np, mp, n); /* result is exact, no error */
  else
    {
      mp_size_t l = n - k;
      mp_limb_t cy;

      mpn_mul_n (rp + 2 * l, np + l, mp + l, k); /* fills rp[2l..2n-1] */
      mpfr_mulhigh_n (rp, np + k, mp, l);        /* fills rp[l-1..2l-1] */
      cy = mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
      mpfr_mulhigh_n (rp, np, mp + k, l);        /* fills rp[l-1..2l-1] */
      cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
      mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
    }
}
예제 #9
0
파일: wrappers.c 프로젝트: jaspervdj/ghc
/* Twos-complement version of 'integer_gmp_mpn_rshift' for performing
 * arithmetic right shifts on "negative" MPNs.
 *
 * Same pre-conditions as 'integer_gmp_mpn_rshift'
 *
 * This variant is needed to operate on MPNs interpreted as negative
 * numbers, which require "rounding" towards minus infinity iff a
 * non-zero bit is shifted out.
 */
mp_limb_t
integer_gmp_mpn_rshift_2c (mp_limb_t rp[], const mp_limb_t sp[],
                           const mp_size_t sn, const mp_bitcnt_t count)
{
  const mp_size_t    limb_shift = count / GMP_NUMB_BITS;
  const unsigned int bit_shift  = count % GMP_NUMB_BITS;
  const mp_size_t    rn         = sn - limb_shift;

  // whether non-zero bits were shifted out
  bool nz_shift_out = false;

  if (bit_shift) {
    if (mpn_rshift(rp, &sp[limb_shift], rn, bit_shift))
      nz_shift_out = true;
  } else
    memcpy(rp, &sp[limb_shift], rn*sizeof(mp_limb_t));

  if (!nz_shift_out)
    for (unsigned i = 0; i < limb_shift; i++)
      if (sp[i]) {
        nz_shift_out = true;
        break;
      }

  // round if non-zero bits were shifted out
  if (nz_shift_out)
    if (mpn_add_1(rp, rp, rn, 1))
      abort(); /* should never happen */

  return rp[rn-1];
}
예제 #10
0
void fp_rdcn_low(dig_t *c, dig_t *a) {
	int i;
	dig_t r, c0, c1, u, *tmp;
	const dig_t *m;

	u = *(fp_prime_get_rdc());
	m = fp_prime_get();

	tmp = a;

	c1 = 0;
	for (i = 0; i < FP_DIGS; i++, tmp++) {
		r = (dig_t)(*tmp * u);
		c0 = mpn_addmul_1(tmp, m, FP_DIGS, r);
		c1 += mpn_add_1(tmp + FP_DIGS, tmp + FP_DIGS, FP_DIGS - i, c0);
	}
	for (i = 0; i < FP_DIGS; i++, tmp++) {
		c[i] = *tmp;
	}
	for (i = 0; i < c1; i++) {
		fp_subn_low(c, c, m);
	}
	if (fp_cmp(c, m) != CMP_LT) {
		fp_subn_low(c, c, m);
	}
}
예제 #11
0
/* Needs n+1 limbs of temporary storage. */
int
mpn_toom_eval_dgr3_pm2 (mp_ptr xp2, mp_ptr xm2,
			mp_srcptr xp, mp_size_t n, mp_size_t x3n, mp_ptr tp)
{
  mp_limb_t cy;
  int neg;

  ASSERT (x3n > 0);
  ASSERT (x3n <= n);

  /* (x0 + 4 * x2) +/- (2 x1 + 8 x_3) */
#if HAVE_NATIVE_mpn_addlsh_n || HAVE_NATIVE_mpn_addlsh2_n
#if HAVE_NATIVE_mpn_addlsh2_n
  xp2[n] = mpn_addlsh2_n (xp2, xp, xp + 2*n, n);

  cy = mpn_addlsh2_n (tp, xp + n, xp + 3*n, x3n);
#else /* HAVE_NATIVE_mpn_addlsh_n */
  xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2);

  cy = mpn_addlsh_n (tp, xp + n, xp + 3*n, x3n, 2);
#endif
  if (x3n < n)
    cy = mpn_add_1 (tp + x3n, xp + n + x3n, n - x3n, cy);
  tp[n] = cy;
#else
  cy = mpn_lshift (tp, xp + 2*n, n, 2);
  xp2[n] = cy + mpn_add_n (xp2, tp, xp, n);

  tp[x3n] = mpn_lshift (tp, xp + 3*n, x3n, 2);
  if (x3n < n)
    tp[n] = mpn_add (tp, xp + n, n, tp, x3n + 1);
  else
    tp[n] += mpn_add_n (tp, xp + n, tp, n);
#endif
  mpn_lshift (tp, tp, n+1, 1);

  neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? ~0 : 0;

#if HAVE_NATIVE_mpn_sumdiff_n
  if (neg)
    mpn_sumdiff_n (xp2, xm2, tp, xp2, n + 1);
  else
    mpn_sumdiff_n (xp2, xm2, xp2, tp, n + 1);
#else
  if (neg)
    mpn_sub_n (xm2, tp, xp2, n + 1);
  else
    mpn_sub_n (xm2, xp2, tp, n + 1);

  mpn_add_n (xp2, xp2, tp, n + 1);
#endif

  ASSERT (xp2[n] < 15);
  ASSERT (xm2[n] < 10);

  return neg;
}
예제 #12
0
void
mpn_binvert (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr scratch)
{
  mp_ptr xp;
  mp_size_t rn, newrn;
  mp_size_t sizes[NPOWS], *sizp;
  mp_limb_t di;

  /* Compute the computation precisions from highest to lowest, leaving the
     base case size in 'rn'.  */
  sizp = sizes;
  for (rn = n; ABOVE_THRESHOLD (rn, BINV_NEWTON_THRESHOLD); rn = (rn + 1) >> 1)
    *sizp++ = rn;

  xp = scratch;

  /* Compute a base value using a low-overhead O(n^2) algorithm.  FIXME: We
     should call some divide-and-conquer lsb division function here for an
     operand subrange.  */
  MPN_ZERO (xp, rn);
  xp[0] = 1;
  binvert_limb (di, up[0]);
  if (BELOW_THRESHOLD (rn, DC_BDIV_Q_THRESHOLD))
    mpn_sb_bdiv_q (rp, xp, rn, up, rn, -di);
  else
    mpn_dc_bdiv_q (rp, xp, rn, up, rn, -di);

  /* Use Newton iterations to get the desired precision.  */
  for (; rn < n; rn = newrn)
    {
      newrn = *--sizp;

#if WANT_FFT
      if (ABOVE_THRESHOLD (newrn, 2 * MUL_FFT_MODF_THRESHOLD))
	{
	  int k;
	  mp_size_t m, i;

	  k = mpn_fft_best_k (newrn, 0);
	  m = mpn_fft_next_size (newrn, k);
	  mpn_mul_fft (xp, m, up, newrn, rp, rn, k);
	  for (i = rn - 1; i >= 0; i--)
	    if (xp[i] > (i == 0))
	      {
		mpn_add_1 (xp + rn, xp + rn, newrn - rn, 1);
		break;
	      }
	}
      else
#endif
	mpn_mul (xp, up, newrn, rp, rn);
      mpn_mullow_n (rp + rn, rp, xp + rn, newrn - rn);
      mpn_neg_n (rp + rn, rp + rn, newrn - rn);
    }
}
예제 #13
0
파일: mpz.c 프로젝트: indeyets/smalltalk
static void
gst_mpz_sub_ui (gst_mpz *dif, const gst_mpz *min, mp_limb_t sub)
{
    mp_srcptr minp;
    mp_ptr difp;
    mp_size_t minsize, difsize;
    mp_size_t abs_minsize;

    minsize = min->size;
    abs_minsize = ABS (minsize);

    /* If not space for SUM (and possible carry), increase space.  */
    difsize = abs_minsize + 1;
    if (dif->alloc < difsize)
        gst_mpz_realloc (dif, difsize);

    /* These must be after realloc (ADD1 may be the same as SUM).  */
    minp = min->d;
    difp = dif->d;

    if (sub == 0)
    {
        MPN_COPY (difp, minp, abs_minsize);
        dif->size = minsize;
        return;
    }
    if (abs_minsize == 0)
    {
        difp[0] = sub;
        dif->size = -1;
        return;
    }

    if (minsize < 0)
    {
        difsize = mpn_add_1 (difp, minp, abs_minsize, sub);
        if (difsize != 0)
            difp[abs_minsize] = 1;
        difsize = -(difsize + abs_minsize);
    }
    else
    {
        /* The signs are different.  Need exact comparision to determine
        which operand to subtract from which.  */
        if (abs_minsize == 1 && minp[0] < sub)
            difsize = -(abs_minsize
                        + mpn_sub_1 (difp, &sub, 1, *minp));
        else
            difsize = (abs_minsize
                       + mpn_sub_1 (difp, minp, abs_minsize, sub));
    }

    dif->size = difsize;
}
예제 #14
0
void
mpn_sbpi1_bdiv_q (mp_ptr qp,
		  mp_ptr np, mp_size_t nn,
		  mp_srcptr dp, mp_size_t dn,
		  mp_limb_t dinv)
{
  mp_size_t i;
  mp_limb_t cy, q;

  ASSERT (dn > 0);
  ASSERT (nn >= dn);
  ASSERT ((dp[0] & 1) != 0);
  /* FIXME: Add ASSERTs for allowable overlapping; i.e., that qp = np is OK,
     but some over N/Q overlaps will not work.  */

  for (i = nn - dn; i > 0; i--)
    {
      q = dinv * np[0];
      cy = mpn_addmul_1 (np, dp, dn, q);
      mpn_add_1 (np + dn, np + dn, i, cy);
      ASSERT (np[0] == 0);
      qp[0] = ~q;
      qp++;
      np++;
    }

  for (i = dn; i > 1; i--)
    {
      q = dinv * np[0];
      mpn_addmul_1 (np, dp, i, q);
      ASSERT (np[0] == 0);
      qp[0] = ~q;
      qp++;
      np++;
    }

  /* Final limb */
  q = dinv * np[0];
  qp[0] = ~q;
  mpn_add_1 (qp - nn + 1, qp - nn + 1, nn, 1);
}
예제 #15
0
/* 
   c is the top bits of the inputs, (fully reduced)
   c & 2 is the top bit of y
   c & 1 is the top bit of z
*/
int
mpn_mulmod_2expp1_basecase (mp_ptr xp, mp_srcptr yp, mp_srcptr zp, 
                                           int c, mpir_ui b, mp_ptr tp)
{
  int cy, cz;
  mp_size_t n, k;

  cy = c & 2;
  cz = c & 1;

  n = BITS_TO_LIMBS (b);
  k = GMP_NUMB_BITS * n - b;

  ASSERT(b > 0);
  ASSERT(n > 0);
  ASSERT_MPN(yp, n);
  ASSERT_MPN(zp, n);
  ASSERT(!MPN_OVERLAP_P (tp, 2 * n, yp, n));
  ASSERT(!MPN_OVERLAP_P (tp, 2 * n, zp, n));
  ASSERT(MPN_SAME_OR_SEPARATE_P (xp, tp, n));
  ASSERT(MPN_SAME_OR_SEPARATE_P (xp, tp + n, n));
  ASSERT(k == 0 || yp[n - 1] >> (GMP_NUMB_BITS - k) == 0);
  ASSERT(k == 0 || zp[n - 1] >> (GMP_NUMB_BITS - k) == 0);

#if WANT_ASSERT
  {
     mp_size_t t = n;

     MPN_NORMALIZE(yp, t);
     ASSERT(cy == 0 || t == 0);
     
     t = n; 
     MPN_NORMALIZE(zp, t);
     ASSERT(cz == 0 || t == 0);
  }
#endif

  if (LIKELY (cy == 0))
    {
      if (LIKELY (cz == 0))
	{
	  c = mpn_mulmod_2expp1_internal (xp, yp, zp, b, tp);
	}
      else
	{
	  c = mpn_neg_n (xp, yp, n);
	  c = mpn_add_1 (xp, xp, n, c);
	  xp[n - 1] &= GMP_NUMB_MASK >> k;
	}
    }
  else
    {
      if (LIKELY (cz == 0))
예제 #16
0
void
mpn_sbpi1_bdiv_q (mp_ptr qp,
		  mp_ptr np, mp_size_t nn,
		  mp_srcptr dp, mp_size_t dn,
		  mp_limb_t dinv)
{
  mp_size_t i;
  mp_limb_t cy, q;

  ASSERT (dn > 0);
  ASSERT (nn >= dn);
  ASSERT ((dp[0] & 1) != 0);

  for (i = nn - dn; i > 0; i--)
    {
      q = dinv * np[0];
      qp[0] = ~q;
      qp++;
      cy = mpn_addmul_1 (np, dp, dn, q);
      mpn_add_1 (np + dn, np + dn, i, cy);
      ASSERT (np[0] == 0);
      np++;
    }

  for (i = dn; i > 1; i--)
    {
      q = dinv * np[0];
      qp[0] = ~q;
      qp++;
      mpn_addmul_1 (np, dp, i, q);
      ASSERT (np[0] == 0);
      np++;
    }

  /* Final limb */
  q = dinv * np[0];
  qp[0] = ~q;
  mpn_add_1 (qp - nn + 1, qp - nn + 1, nn, 1);
}
예제 #17
0
void tc4_copy (mp_ptr yp, mp_size_t * yn, mp_size_t offset, mp_srcptr xp, mp_size_t xn)
{
  mp_size_t yu = ABS(*yn);
  mp_size_t xu = ABS(xn);
  mp_limb_t cy = 0;

  if (xn == 0)
    return;

  if (offset < yu) /* low part of x overlaps with y */
  {
      if (offset + xu <= yu) /* x entirely inside y */
      {
          cy = mpn_add_n (yp + offset, yp + offset, xp, xu);
          if (offset + xu < yu)
            cy = mpn_add_1 (yp + offset + xu, yp + offset + xu,
                            yu - (offset + xu), cy);
      } else
        cy = mpn_add_n (yp + offset, yp + offset, xp, yu - offset);
      /* now cy is the carry at yp + yu */
      if (xu + offset > yu) /* high part of x exceeds y */
      {
          MPN_COPY (yp + yu, xp + yu - offset, xu + offset - yu);
          cy = mpn_add_1 (yp + yu, yp + yu, xu + offset - yu, cy);
          yu = xu + offset;
      }
      /* now cy is the carry at yp + yn */
      if (cy)
        yp[yu++] = cy;
      MPN_NORMALIZE(yp, yu);
      *yn = yu;
  } else /* x does not overlap */
  {
      if (offset > yu)
        MPN_ZERO (yp + yu, offset - yu);
      MPN_COPY (yp + offset, xp, xu);
      *yn = offset + xu;
  }
}
예제 #18
0
파일: montfp.cpp 프로젝트: mahdiz/mpclib
// Montgomery reduction.
// Algorithm II.4 from Blake, Seroussi and Smart.
static void mont_reduce(mp_limb_t *x, mp_limb_t *y, fptr p) {
	size_t t = p->limbs;
	size_t i;
	mp_limb_t flag = 0;
	for (i = 0; i < t; i++) {
		mp_limb_t u = y[i] * p->negpinv;
		mp_limb_t carry = mpn_addmul_1(&y[i], p->primelimbs, t, u);
		//mpn_add_1(&y[i+t], &y[i+t], t - i + 1, carry);
		flag += mpn_add_1(&y[i + t], &y[i + t], t - i, carry);
	}
	if (flag || mpn_cmp(&y[t], p->primelimbs, t) >= 0) {
		mpn_sub_n(x, &y[t], p->primelimbs, t);
	}
	else {
		// TODO: GMP set might be faster.
		memcpy(x, &y[t], t * sizeof(mp_limb_t));
	}
}
static void
mpf_normalize(mpf_t op)
{
    Py_ssize_t size, prec, toclear, temp, i;
    mp_limb_t bit1, rem, carry;

    prec = mpf_get_prec(op);
    size = mpf_size(op);
    toclear = size - ((prec / GMP_NUMB_BITS) + 1);
    if(toclear>0) {
        bit1 = (op->_mp_d[toclear-1] & ((mp_limb_t)1 << (GMP_NUMB_BITS - 1))) ? 1 : 0;
        rem = (op->_mp_d[toclear-1] & (((mp_limb_t)1 << (GMP_NUMB_BITS - 1)) - 1)) ? 1 : 0;
        carry = bit1 && ((op->_mp_d[toclear] & 1) || rem);
    } else {
        carry = 0;
    }
    if(options.debug) {
        fprintf(stderr, "prec %ld size %ld toclear %ld carry %ld\n",
               prec, size, toclear, carry);
        for(i=0; i<size; i++)
            fprintf(stderr,"[%zd]=%lx\n", i, op->_mp_d[i]);
    }
    temp = toclear;
    if(temp>0) {
        op->_mp_d[--temp] = 0;
    }
    if(carry) {
        if(options.debug) {
            fprintf(stderr, "adding carry bit\n");
        }
        carry = mpn_add_1(op->_mp_d + toclear, op->_mp_d + toclear, size-toclear, carry);
        if(carry) {
            if(options.debug) {
                fprintf(stderr, "carry bit extended\n");
            }
            op->_mp_d[size-1] = 1;
            op->_mp_exp++;
        }
    }
    if(options.debug) {
        for(i=0; i<size; i++)
            fprintf(stderr,"[%zd]=%lx\n", i, op->_mp_d[i]);
    }
}
예제 #20
0
mp_limb_t
mpn_dc_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)
{
  mp_limb_t cy;
  mp_limb_t xp[2], dip[2];

  ASSERT (dn >= 2);

  cy = mpn_add_1 (xp, dp + dn - 2, 2, 1);
  if (cy != 0)
    dip[0] = dip[1] = 0;
  else
    {
      mp_limb_t scratch[10];	/* FIXME */
      mpn_invert (dip, xp, 2, scratch);
    }

  return mpn_preinv_dc_divappr_q (qp, np, nn, dp, dn, dip);
}
예제 #21
0
파일: fmpz.c 프로젝트: hperl/flint
void fmpz_sub_ui(fmpz_t output, const fmpz_t input, const unsigned long x)
{
   unsigned long carry;
   
   if (x)
   {
      if (!input[0])
      {
         output[1] = x;
         output[0] = -1L;
      } else if ((long) input[0] < 0)
      {
         carry = mpn_add_1(output + 1, input + 1, ABS(input[0]), x); 
         output[0] = input[0];
         if (carry)
         {
            output[ABS(output[0])+1] = carry;
            output[0]--;
         }
      } else if ((long) input[0] > 1L)
      {
         mpn_sub_1(output + 1, input + 1, input[0], x); 
         output[0] = input[0];
         NORM(output);
      } else
      {
         if (x <= input[1]) 
         {
            output[1] = input[1] - x;
            if (!output[1]) output[0] = 0;
            else output[0] = 1L;
         } else
         {
            output[1] = x - input[1];
            output[0] = -1L;
         } 
      }
   } else
   {
      fmpz_set(output, input);
   }
}
예제 #22
0
파일: fmpz.c 프로젝트: hperl/flint
void __fmpz_add_ui_inplace(fmpz_t output, const unsigned long x)
{
   unsigned long carry;
   
   if (x)
   {
      if (!output[0])
      {
         output[1] = x;
         output[0] = 1;
      } else 
      {
         carry = mpn_add_1(output + 1, output + 1, output[0], x); 
         if (carry)
         {
            output[output[0]+1] = carry;
            output[0]++;
         }
      } 
   }
}
예제 #23
0
파일: butterfly_lshB.c 프로젝트: HRF92/mpir
void mpir_butterfly_lshB(mp_ptr t, mp_ptr u, mp_ptr i1, 
                       mp_ptr i2, mp_size_t limbs, mp_size_t x, mp_size_t y)
{
   mp_limb_t cy, cy1, cy2;

   if (x == 0)
   {
      if (y == 0)
         cy = mpn_sumdiff_n(t + x, u + y, i1, i2, limbs + 1);
      else
      {
         cy = mpn_sumdiff_n(t, u + y, i1, i2, limbs - y);
         u[limbs] = -(cy&1);
         cy1 = cy>>1;
         cy = mpn_sumdiff_n(t + limbs - y, u, i2 + limbs - y, i1 + limbs - y, y);
         t[limbs] = cy>>1;
         mpn_add_1(t + limbs - y, t + limbs - y, y + 1, cy1);
         cy1 = -(cy&1) + (i2[limbs] - i1[limbs]);
         mpn_addmod_2expp1_1(u + y, limbs - y, cy1);
         cy1 = -(i1[limbs] + i2[limbs]);
         mpn_addmod_2expp1_1(t, limbs, cy1);
      }
   } else if (y == 0)
예제 #24
0
void tc4_addlsh1_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr xp, mp_size_t xn)
{
	if (xn)
	{
		if (xn >= *rn)
		{
            mp_limb_t cy;
			if (xn > *rn) MPN_ZERO(rp + *rn, xn - *rn);
#if HAVE_NATIVE_mpn_addlsh1_n
            cy = mpn_addlsh1_n(rp, rp, xp, xn);
#else
            cy = mpn_add_n(rp, rp, xp, xn);
            cy += mpn_add_n(rp, rp, xp, xn);
#endif
			if (cy) 
			{
				rp[xn] = cy;
				*rn = xn + 1;
			} else *rn = xn;
		} else
	   {
		   mp_limb_t cy;
#if HAVE_NATIVE_mpn_addlsh1_n
            cy = mpn_addlsh1_n(rp, rp, xp, xn);
#else
            cy = mpn_add_n(rp, rp, xp, xn);
            cy += mpn_add_n(rp, rp, xp, xn);
#endif
	      if (cy) cy = mpn_add_1(rp + xn, rp + xn, *rn - xn, cy);
		   if (cy) 
		   {
			   rp[*rn] = cy;
			   (*rn)++;
		   }
		}
	}
}
예제 #25
0
파일: fmpz.c 프로젝트: hperl/flint
void fmpz_sub_ui_inplace(fmpz_t output, const unsigned long x)
{
   unsigned long carry;
   
   if (x)
   {
      if (!output[0])
      {
         output[1] = x;
         output[0] = -1L;
      } else if ((long) output[0] < 0)
      {
         carry = mpn_add_1(output + 1, output + 1, ABS(output[0]), x); 
         if (carry)
         {
            output[ABS(output[0])+1] = carry;
            output[0]--;
         }
      } else if ((long) output[0] > 1L)
      {
         mpn_sub_1(output + 1, output + 1, output[0], x); 
         NORM(output);
      } else
      {
         if (x <= output[1]) 
         {
            output[1] -= x;
            if (!output[1]) output[0] = 0;
         } else
         {
            output[1] = x - output[1];
            output[0] = -1L;
         } 
      }
   }
}
예제 #26
0
void
mpn_toom53_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg, vmh_neg;
  mp_limb_t cy;
  mp_ptr gp, hp;
  mp_ptr as1, asm1, as2, ash, asmh;
  mp_ptr bs1, bsm1, bs2, bsh, bsmh;
  enum toom4_flags flags;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3);

  s = an - 4 * n;
  t = bn - 2 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  as1  = TMP_SALLOC_LIMBS (n + 1);
  asm1 = TMP_SALLOC_LIMBS (n + 1);
  as2  = TMP_SALLOC_LIMBS (n + 1);
  ash  = TMP_SALLOC_LIMBS (n + 1);
  asmh = TMP_SALLOC_LIMBS (n + 1);

  bs1  = TMP_SALLOC_LIMBS (n + 1);
  bsm1 = TMP_SALLOC_LIMBS (n + 1);
  bs2  = TMP_SALLOC_LIMBS (n + 1);
  bsh  = TMP_SALLOC_LIMBS (n + 1);
  bsmh = TMP_SALLOC_LIMBS (n + 1);

  gp = pp;
  hp = pp + n + 1;

  /* Compute as1 and asm1.  */
  gp[n]  = mpn_add_n (gp, a0, a2, n);
  gp[n] += mpn_add   (gp, gp, n, a4, s);
  hp[n]  = mpn_add_n (hp, a1, a3, n);
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_addsub_n (as1, asm1, hp, gp, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_addsub_n (as1, asm1, gp, hp, n + 1);
      vm1_neg = 0;
    }
#else
  mpn_add_n (as1, gp, hp, n + 1);
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_sub_n (asm1, hp, gp, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_sub_n (asm1, gp, hp, n + 1);
      vm1_neg = 0;
    }
#endif

  /* Compute as2.  */
#if !HAVE_NATIVE_mpn_addlsh_n
  ash[n] = mpn_lshift (ash, a2, n, 2);			/*        4a2       */
#endif
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a3, a4, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a2, as2, n);
  cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
  as2[n] = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy  = mpn_lshift (as2, a4, s, 1);
  cy += mpn_add_n (as2, a3, as2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy);
  cy = 4 * cy + mpn_lshift (as2, as2, n, 2);
  cy += mpn_add_n (as2, a1, as2, n);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  as2[n] = cy + mpn_add_n (as2, a0, as2, n);
  mpn_add_n (as2, ash, as2, n + 1);
#endif

  /* Compute ash and asmh.  */
#if HAVE_NATIVE_mpn_addlsh_n
  cy  = mpn_addlsh_n (gp, a2, a0, n, 2);		/* 4a0  +  a2       */
  cy = 4 * cy + mpn_addlsh_n (gp, a4, gp, n, 2);	/* 16a0 + 4a2 +  a4 */ /* FIXME s */
  gp[n] = cy;
  cy  = mpn_addlsh_n (hp, a3, a1, n, 2);		/*  4a1 +  a3       */
  cy = 2 * cy + mpn_lshift (hp, hp, n, 1);		/*  8a1 + 2a3       */
  hp[n] = cy;
#else
  gp[n] = mpn_lshift (gp, a0, n, 4);			/* 16a0             */
  mpn_add (gp, gp, n + 1, a4, s);			/* 16a0 +        a4 */
  mpn_add_n (gp, ash, gp, n+1);				/* 16a0 + 4a2 +  a4 */
  cy  = mpn_lshift (hp, a1, n, 3);			/*  8a1             */
  cy += mpn_lshift (ash, a3, n, 1);			/*        2a3       */
  cy += mpn_add_n (hp, ash, hp, n);			/*  8a1 + 2a3       */
  hp[n] = cy;
#endif
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_addsub_n (ash, asmh, hp, gp, n + 1);
      vmh_neg = 1;
    }
  else
    {
      mpn_addsub_n (ash, asmh, gp, hp, n + 1);
      vmh_neg = 0;
    }
#else
  mpn_add_n (ash, gp, hp, n + 1);
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_sub_n (asmh, hp, gp, n + 1);
      vmh_neg = 1;
    }
  else
    {
      mpn_sub_n (asmh, gp, hp, n + 1);
      vmh_neg = 0;
    }
#endif

  /* Compute bs1 and bsm1.  */
  bs1[n] = mpn_add (bs1, b0, n, b2, t);		/* b0 + b2 */
#if HAVE_NATIVE_mpn_addsub_n
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      bs1[n] = mpn_addsub_n (bs1, bsm1, b1, bs1, n) >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
예제 #27
0
/*
   ret + (xp, n) = (yp, n)*(zp, n) % 2^b + 1  
   needs (tp, 2n) temp space, everything reduced mod 2^b 
   inputs, outputs are fully reduced
  
   N.B: 2n is not the same as 2b rounded up to nearest limb!
*/
inline static int
mpn_mulmod_2expp1_internal (mp_ptr xp, mp_srcptr yp, mp_srcptr zp,
			                                         mpir_ui b, mp_ptr tp)
{
  mp_size_t n, k;
  mp_limb_t c;

  TMP_DECL;

  n = BITS_TO_LIMBS (b);
  k = GMP_NUMB_BITS * n - b;

  ASSERT(b > 0);
  ASSERT(n > 0);
  ASSERT_MPN(yp, n);
  ASSERT_MPN(zp, n);
  ASSERT(!MPN_OVERLAP_P (tp, 2 * n, yp, n));
  ASSERT(!MPN_OVERLAP_P (tp, 2 * n, zp, n));
  ASSERT(MPN_SAME_OR_SEPARATE_P (xp, tp, n));
  ASSERT(MPN_SAME_OR_SEPARATE_P (xp, tp + n, n));
  ASSERT(k == 0 || yp[n - 1] >> (GMP_NUMB_BITS - k) == 0);
  ASSERT(k == 0 || zp[n - 1] >> (GMP_NUMB_BITS - k) == 0);

#ifndef TUNE_PROGRAM_BUILD
  if (k == 0 && n > FFT_MULMOD_2EXPP1_CUTOFF && n == mpir_fft_adjust_limbs(n))
  {
      mp_bitcnt_t depth1, depth = 1;
      mp_size_t w1, off;
      mp_ptr tx, ty, tz;
      mp_limb_t ret;

      TMP_MARK;

      tx = TMP_BALLOC_LIMBS(3*n + 3);
      ty = tx + n + 1;
      tz = ty + n + 1;

      MPN_COPY(ty, yp, n);
      MPN_COPY(tz, zp, n);
      ty[n] = 0;
      tz[n] = 0;

      while ((((mp_limb_t)1)<<depth) < b) depth++;
   
      if (depth < 12) off = mulmod_2expp1_table_n[0];
      else off = mulmod_2expp1_table_n[MIN(depth, FFT_N_NUM + 11) - 12];
      depth1 = depth/2 - off;
   
      w1 = b/(((mp_limb_t)1)<<(2*depth1));

      mpir_fft_mulmod_2expp1(tx, ty, tz, n, depth1, w1);

      MPN_COPY(xp, tx, n);
      ret = tx[n];
      
      TMP_FREE;

	   return ret;
  }
#endif

  if (yp == zp)
     mpn_sqr(tp, yp, n);
  else
     mpn_mul_n (tp, yp, zp, n);

  if (k == 0)
    {
      c = mpn_sub_n (xp, tp, tp + n, n);

      return mpn_add_1 (xp, xp, n, c);
    }

  c = tp[n - 1];
  tp[n - 1] &= GMP_NUMB_MASK >> k;

#if HAVE_NATIVE_mpn_sublsh_nc
  c = mpn_sublsh_nc (xp, tp, tp + n, n, k, c);
#else
  {
    mp_limb_t c1;
    c1 = mpn_lshift (tp + n, tp + n, n, k);
    tp[n] |= c >> (GMP_NUMB_BITS - k);
    c = mpn_sub_n (xp, tp, tp + n, n) + c1;
  }
#endif

  c = mpn_add_1 (xp, xp, n, c);
  xp[n - 1] &= GMP_NUMB_MASK >> k;

  return c;
}
예제 #28
0
  if (LIKELY (cy == 0))
    {
      if (LIKELY (cz == 0))
	{
	  c = mpn_mulmod_2expp1_internal (xp, yp, zp, b, tp);
	}
      else
	{
	  c = mpn_neg_n (xp, yp, n);
	  c = mpn_add_1 (xp, xp, n, c);
	  xp[n - 1] &= GMP_NUMB_MASK >> k;
	}
    }
  else
    {
      if (LIKELY (cz == 0))
	{
	  c = mpn_neg_n (xp, zp, n);
	  c = mpn_add_1 (xp, xp, n, c);
	  xp[n - 1] &= GMP_NUMB_MASK >> k;
	}
      else
	{
	  c = 0;
	  xp[0] = 1;
	  MPN_ZERO (xp + 1, n - 1);
	}
    }
  return c;
}
void
mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
{
  mp_limb_t cy;
  mp_size_t n3;
  mp_size_t n3p1;
  n3 = 3 * n;
  n3p1 = n3 + 1;

#define   r4    (pp + n3)			/* 3n+1 */
#define   r2    (pp + 7 * n)			/* 3n+1 */
#define   r0    (pp +11 * n)			/* s+t <= 2*n */

  /******************************* interpolation *****************************/
  if (half != 0) {
    cy = mpn_sub_n (r3, r3, r0, spt);
    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);

    cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);

    cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
    MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
  };

  r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
#else
  ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
  mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
  MP_PTR_SWAP(r1, wsi);
#endif

  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
#else
  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
  MP_PTR_SWAP(r5, wsi);
#endif

  r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);

#if AORSMUL_FASTER_AORS_AORSLSH
  mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
#else
  mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
  DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
#endif
  /* A division by 2835x4 follows. Warning: the operand can be negative! */
  mpn_divexact_by2835x4(r4, r4, n3p1);
  if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
    r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));

#if AORSMUL_FASTER_2AORSLSH
  mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
#else
  DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
  DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
#endif
  mpn_divexact_by255(r5, r5, n3p1);

  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));

#if AORSMUL_FASTER_3AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
#else
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
#endif
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
  mpn_divexact_by42525(r1, r1, n3p1);

#if AORSMUL_FASTER_AORS_2AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
#else
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
  ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
#endif
  mpn_divexact_by9x4(r2, r2, n3p1);

  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));

  mpn_sub_n (r4, r2, r4, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));

  mpn_add_n (r5, r5, r1, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));

  /* last interpolation steps... */
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
  /* ... could be mixed with recomposition
	||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
  */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp

    summation scheme for remaining operations:
    |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
  */

  cy = mpn_add_n (pp + n, pp + n, r5, n);
  cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
#if HAVE_NATIVE_mpn_add_nc
  cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
#else
  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
  cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
#endif
  MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);

  pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
  cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
#else
  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
  cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);

  pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
  if (half) {
    cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
#if HAVE_NATIVE_mpn_add_nc
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
    }
#else
    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
    }
#endif
  } else {
    ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
  }

#undef   r0
#undef   r2
#undef   r4
}
예제 #30
0
/*
   Toom 4 interpolation. Interpolates the value at 2^(sn*B) of a 
	polynomial p(x) with 7 coefficients given the values 
	p(oo), p(2), p(1), p(-1), 2^6*p(1/2), 2^6*p(-1/2), p(0).
	The output is placed in rp and the final number of limbs of the
	output is given in rpn.
	The 4th and 6th values may be negative, and if so, n4 and n6 
	should be set to a negative value respectively.
   To save space we pass r3, r5, r7 in place in the output rp.
	The other r's are stored separately in space tp.
	The low limb of r3 is stored in r30, as it will be overwritten
	by the high limb of r5.

rp          rp1          rp2           rp3          rp4           rp5         rp6           rp7
<----------- r7-----------><------------r5-------------->            
                                                      <-------------r3------------->

   We assume that r1 is stored at tp, r2 at (tp + t4), r4 at (tp + 2*t4) 
	and r6 (tp + 3*t4). Each of these r's has t4 = s4 + 1 limbs allocated.
*/
void mpn_toom4_interpolate(mp_ptr rp, mp_size_t * rpn, mp_size_t sn,  
		       mp_ptr tp, mp_size_t s4, mp_size_t n4, mp_size_t n6, mp_limb_t r30)
{
	mp_size_t n1, n2, n3, n5, n7, t4;
	mp_limb_t saved, saved2, cy;

   t4 = s4 + 1; 
   
	mpn_add_n(r2, r2, r5, s4);

   if (n6 < 0) 
		mpn_add_n(r6, r5, r6, s4);
	else
      mpn_sub_n(r6, r5, r6, s4);
	/* r6 is now in twos complement format */

	saved = r3[0];
	r3[0] = r30;
	if (n4 < 0) 
		mpn_add_n(r4, r3, r4, s4);
	else
      mpn_sub_n(r4, r3, r4, s4);
	r3[0] = saved;
	/* r4 is now in twos complement format */
	
	mpn_sub_n(r5, r5, r1, s4);

#if HAVE_NATIVE_mpn_sublsh_n
	r5[s4-1] -= mpn_sublsh_n(r5, r5, r7, s4-1, 6);
#else
	r5[s4-1] -= mpn_submul_1(r5, r7, s4-1, 64);
#endif
   
   TC4_RSHIFT1(r4, s4); 
	
	saved = r3[0];
	r3[0] = r30;
	mpn_sub_n(r3, r3, r4, s4);
	r30 = r3[0];
	r3[0] = saved;

	mpn_double(r5, s4); 

	mpn_sub_n(r5, r5, r6, s4);

   saved = r3[0];
	r3[0] = r30;
	mpn_submul_1(r2, r3, s4, 65);
   r3[0] = saved;
	
	saved2 = r7[s4-1];
	r7[s4-1] = CNST_LIMB(0); // r7 is always positive so no sign extend needed
	saved = r3[0];
	r3[0] = r30;
#if HAVE_NATIVE_mpn_subadd_n
	mpn_subadd_n(r3, r3, r7, r1, s4);
#else
    mpn_sub_n(r3, r3, r7, s4);
    mpn_sub_n(r3, r3, r1, s4);
#endif
	r7[s4-1] = saved2;
   r30 = r3[0];
	
   mpn_addmul_1(r2, r3, s4, 45);

#if HAVE_NATIVE_mpn_sublsh_n
   cy = mpn_sublsh_n(r5, r5, r3, s4 - 1, 3);
#else
   cy = mpn_submul_1(r5, r3, s4 - 1, 8);
#endif
   r3[0] = saved;
	r3[0] -= (cy + 8*r3[s4-1]);
   
	mpn_rshift(r5, r5, s4, 3); 

	mpn_divexact_by3(r5, r5, s4); 
   
	mpn_sub_n(r6, r6, r2, s4);

#if HAVE_NATIVE_mpn_sublsh_n
	mpn_sublsh_n(r2, r2, r4, s4, 4);
#else
	mpn_submul_1(r2, r4, s4, 16);
#endif
   
   mpn_rshift(r2, r2, s4, 1); 

	mpn_divexact_by3(r2, r2, s4); 

   mpn_divexact_by3(r2, r2, s4); 
   
   saved = r3[0];
	r3[0] = r30;
   cy = mpn_sub_n(r3, r3, r5, s4 - 1);
   r30 = r3[0];
	r3[0] = saved;
	r3[s4-1] -= (cy + r5[s4-1]);
   
	mpn_sub_n(r4, r4, r2, s4);
	
	mpn_addmul_1(r6, r2, s4, 30);

   mpn_divexact_byfobm1(r6, r6, s4, CNST_LIMB(15), CNST_LIMB(~0/15));

	mpn_rshift(r6, r6, s4, 2);

	mpn_sub_n(r2, r2, r6, s4);

	TC4_NORM(r1, n1, s4);
   TC4_NORM(r2, n2, s4);
   
   (*rpn) = 6*sn+1;
	cy = mpn_add_1(r3, r3, *rpn - 4*sn, r30); /* don't forget to add r3[0] back in */
   if (cy) 
	{
		rp[*rpn] = cy;
	   (*rpn)++;
	}

	tc4_copy(rp, rpn, 5*sn, r2, n2);
   tc4_copy(rp, rpn, 6*sn, r1, n1);

	tc4_copy(rp, rpn, sn, r6, s4);
   tc4_copy(rp, rpn, 3*sn, r4, s4); 
}