예제 #1
0
파일: mul_n.c 프로젝트: angavrilov/ecl-sse
void
mpn_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));

#if 0
  /* FIXME: Can this be removed? */
  if (n == 0)
    return;
#endif

  if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
    { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */
      mpn_mul_basecase (p, a, n, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD))
    {
      mpn_sqr_basecase (p, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast! */
      mp_limb_t ws[MPN_KARA_SQR_N_TSIZE (SQR_TOOM3_THRESHOLD_LIMIT-1)];
      ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT);
      mpn_kara_sqr_n (p, a, n, ws);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (BELOW_THRESHOLD (n, SQR_FFT_THRESHOLD))
#else
  else if (BELOW_THRESHOLD (n, MPN_TOOM3_MAX_N))
#endif
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (MPN_TOOM3_SQR_N_TSIZE (n));
      mpn_toom3_sqr_n (p, a, n, ws);
      TMP_SFREE;
    }
  else
#if WANT_FFT || TUNE_PROGRAM_BUILD
    {
      /* The current FFT code allocates its own space.  That should probably
	 change.  */
      mpn_mul_fft_full (p, a, n, a, n);
    }
#else
    {
      /* Toom3 for large operands.  Use workspace from the heap, as stack space
      may be limited.  Since n is at least MUL_TOOM3_THRESHOLD, multiplication
      will take much longer than malloc()/free().  */
      mp_ptr ws;  mp_size_t ws_size;
      ws_size = MPN_TOOM3_SQR_N_TSIZE (n);
      ws = __GMP_ALLOCATE_FUNC_LIMBS (ws_size);
      mpn_toom3_sqr_n (p, a, n, ws);
      __GMP_FREE_FUNC_LIMBS (ws, ws_size);
    }
#endif
}
예제 #2
0
mp_size_t
mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn)
{
  if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
      || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
    return 3*rn + 2*mn;
  else
    return 3*(rn + mn) + 5;
}
예제 #3
0
void
mpn_sqr (mp_ptr p, mp_srcptr a, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));

#if 0
  /* FIXME: Can this be removed? */
  if (n == 0)
    return;
#endif

  if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
    { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */
      mpn_mul_basecase (p, a, n, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD))
    {
      mpn_sqr_basecase (p, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast! */
      mp_limb_t ws[MPN_KARA_SQR_N_TSIZE (SQR_TOOM3_THRESHOLD_LIMIT-1)];
      ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT);
      mpn_kara_sqr_n (p, a, n, ws);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (MPN_TOOM3_SQR_N_TSIZE (n));
      mpn_toom3_sqr_n (p, a, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD))
    {
       mpn_toom4_sqr_n (p, a, n);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (BELOW_THRESHOLD (n, SQR_FFT_FULL_THRESHOLD))
#else
  else 
#endif
    {
       mpn_toom8_sqr_n (p, a, n);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else
    {
예제 #4
0
void
mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n));

  if (BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD))
    {
      mpn_mul_basecase (p, a, n, b, n);
    }
  else if (BELOW_THRESHOLD (n, MUL_TOOM3_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast! */
      mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD_LIMIT-1)];
      ASSERT (MUL_TOOM3_THRESHOLD <= MUL_TOOM3_THRESHOLD_LIMIT);
      mpn_kara_mul_n (p, a, b, n, ws);
    }
  else if (BELOW_THRESHOLD (n, MUL_TOOM4_THRESHOLD))
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (MPN_TOOM3_MUL_N_TSIZE (n));
      mpn_toom3_mul_n (p, a, b, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD))
    {
       mpn_toom4_mul_n (p, a, b, n);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (BELOW_THRESHOLD (n, MUL_FFT_FULL_THRESHOLD))
    {
       mpn_toom8h_mul (p, a, n, b, n);
    }
#endif
  else
#if WANT_FFT || TUNE_PROGRAM_BUILD
    {
       mpn_mul_fft_main(p, a, n, b, n); 
    }
#else
    {
      /* Toom8 for large operands. */
      mpn_toom8h_mul (p, a, n, b, n);
    }
#endif
}
예제 #5
0
파일: mulhigh_n.c 프로젝트: HRF92/mpir
/* (rp, 2n) = (xp, n)*(yp, n) */
void
mpn_mulhigh_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
{
  mp_limb_t t;

  ASSERT(n > 0);
  ASSERT_MPN(xp, n);
  ASSERT_MPN(yp, n);
  ASSERT(!MPN_OVERLAP_P(rp, 2 * n, xp, n));
  ASSERT(!MPN_OVERLAP_P(rp, 2 * n, yp, n));
  
  if (BELOW_THRESHOLD(n, MULHIGH_BASECASE_THRESHOLD))
    {
      mpn_mul_basecase(rp, xp, n, yp, n);
      
      return;
    }
  
  if (ABOVE_THRESHOLD (n, MULHIGH_MUL_THRESHOLD))
    {
      mpn_mul_n(rp, xp, yp, n);
      
      return;
    }

  mpn_mulshort_n(rp, xp, yp, n);
  t = rp[n - 1] + n - 2;
  
  if (UNLIKELY(t < n - 2))
    mpn_mul_n(rp, xp, yp, n);
  
  return;
}
예제 #6
0
파일: mul.c 프로젝트: mahdiz/mpclib
void
mpn_sqr_n (mp_ptr prodp,
	   mp_srcptr up, mp_size_t un)
{
  ASSERT (un >= 1);
  ASSERT (! MPN_OVERLAP_P (prodp, 2*un, up, un));

  /* FIXME: Can this be removed? */
  if (un == 0)
    return;

  if (BELOW_THRESHOLD (un, SQR_BASECASE_THRESHOLD))
    { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */
      mpn_mul_basecase (prodp, up, un, up, un);
    }
  else if (BELOW_THRESHOLD (un, SQR_KARATSUBA_THRESHOLD))
    { /* plain schoolbook multiplication */
      mpn_sqr_basecase (prodp, up, un);
    }
  else if (BELOW_THRESHOLD (un, SQR_TOOM3_THRESHOLD))
    { /* karatsuba multiplication */
      mp_ptr tspace;
      TMP_DECL (marker);
      TMP_MARK (marker);
      tspace = TMP_ALLOC_LIMBS (MPN_KARA_SQR_N_TSIZE (un));
      mpn_kara_sqr_n (prodp, up, un, tspace);
      TMP_FREE (marker);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (BELOW_THRESHOLD (un, SQR_FFT_THRESHOLD))
#else
  else
#endif
    { /* Toom3 multiplication.
	 Use workspace from the heap, as stack may be limited.  Since n is
	 at least MUL_TOOM3_THRESHOLD, the multiplication will take much
	 longer than malloc()/free().  */
      mp_ptr     tspace;
      mp_size_t  tsize;
      tsize = MPN_TOOM3_SQR_N_TSIZE (un);
      tspace = __GMP_ALLOCATE_FUNC_LIMBS (tsize);
      mpn_toom3_sqr_n (prodp, up, un, tspace);
      __GMP_FREE_FUNC_LIMBS (tspace, tsize);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else
    {
예제 #7
0
mp_limb_t
mpn_dcpi1_div_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
		    gmp_pi1_t *dinv, mp_ptr tp)
{
  mp_size_t lo, hi;
  mp_limb_t cy, qh, ql;

  lo = n >> 1;			/* floor(n/2) */
  hi = n - lo;			/* ceil(n/2) */

  if (BELOW_THRESHOLD (hi, DC_DIV_QR_THRESHOLD))
    qh = mpn_sbpi1_div_qr (qp + lo, np + 2 * lo, 2 * hi, dp + lo, hi, dinv->inv32);
  else
    qh = mpn_dcpi1_div_qr_n (qp + lo, np + 2 * lo, dp + lo, hi, dinv, tp);

  mpn_mul (tp, qp + lo, hi, dp, lo);

  cy = mpn_sub_n (np + lo, np + lo, tp, n);
  if (qh != 0)
    cy += mpn_sub_n (np + n, np + n, dp, lo);

  while (cy != 0)
    {
      qh -= mpn_sub_1 (qp + lo, qp + lo, hi, 1);
      cy -= mpn_add_n (np + lo, np + lo, dp, n);
    }

  if (BELOW_THRESHOLD (lo, DC_DIV_QR_THRESHOLD))
    ql = mpn_sbpi1_div_qr (qp, np + hi, 2 * lo, dp + hi, lo, dinv->inv32);
  else
    ql = mpn_dcpi1_div_qr_n (qp, np + hi, dp + hi, lo, dinv, tp);

  mpn_mul (tp, dp, hi, qp, lo);

  cy = mpn_sub_n (np, np, tp, n);
  if (ql != 0)
    cy += mpn_sub_n (np + lo, np + lo, dp, hi);

  while (cy != 0)
    {
      mpn_sub_1 (qp, qp, lo, 1);
      cy -= mpn_add_n (np, np, dp, n);
    }

  return qh;
}
예제 #8
0
파일: divis.c 프로젝트: angavrilov/ecl-sse
int
mpn_divisible_p (mp_srcptr ap, mp_size_t asize,
		 mp_srcptr dp, mp_size_t dsize)
{
  mp_limb_t  alow, dlow, dmask;
  mp_ptr     qp, rp;
  mp_size_t  i;
  TMP_DECL;

  ASSERT (asize >= 0);
  ASSERT (asize == 0 || ap[asize-1] != 0);
  ASSERT (dsize >= 1);
  ASSERT (dp[dsize-1] != 0);
  ASSERT_MPN (ap, asize);
  ASSERT_MPN (dp, dsize);

  /* When a<d only a==0 is divisible.
     Notice this test covers all cases of asize==0. */
  if (asize < dsize)
    return (asize == 0);

  /* Strip low zero limbs from d, requiring a==0 on those. */
  for (;;)
    {
      alow = *ap;
      dlow = *dp;

      if (dlow != 0)
	break;

      if (alow != 0)
	return 0;  /* a has fewer low zero limbs than d, so not divisible */

      /* a!=0 and d!=0 so won't get to size==0 */
      asize--; ASSERT (asize >= 1);
      dsize--; ASSERT (dsize >= 1);
      ap++;
      dp++;
    }

  /* a must have at least as many low zero bits as d */
  dmask = LOW_ZEROS_MASK (dlow);
  if ((alow & dmask) != 0)
    return 0;

  if (dsize == 1)
    {
      if (BELOW_THRESHOLD (asize, MODEXACT_1_ODD_THRESHOLD))
	return mpn_mod_1 (ap, asize, dlow) == 0;

      if ((dlow & 1) == 0)
	{
	  unsigned  twos;
	  count_trailing_zeros (twos, dlow);
	  dlow >>= twos;
	}
      return mpn_modexact_1_odd (ap, asize, dlow) == 0;
    }
예제 #9
0
void
mpn_mullow_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
{
  if (BELOW_THRESHOLD (n, MULLOW_BASECASE_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast! */
      mp_limb_t ws[MUL_BASECASE_ALLOC];
      mpn_mul_basecase (ws, xp, n, yp, n);
      MPN_COPY (rp, ws, n);
    }
  else if (BELOW_THRESHOLD (n, MULLOW_DC_THRESHOLD))
    {
      mpn_mullow_basecase (rp, xp, yp, n);
    }
  else if (BELOW_THRESHOLD (n, MULLOW_MUL_N_THRESHOLD))
    {
      /* Divide-and-conquer */
      mp_size_t n2 = n >> 1;		/* floor(n/2) */
      mp_size_t n1 = n - n2;		/* ceil(n/2) */
      mp_ptr tp;
      TMP_SDECL;
      TMP_SMARK;
      tp = TMP_SALLOC_LIMBS (n1);

      /* Split as x = x1 2^(n1 GMP_NUMB_BITS) + x0,
                  y = y1 2^(n2 GMP_NUMB_BITS) + y0 */

      /* x0 * y0 */
      mpn_mul_n (rp, xp, yp, n2);
      if (n1 != n2)
	rp[2 * n2] = mpn_addmul_1 (rp + n2, yp, n2, xp[n2]);

      /* x1 * y0 * 2^(n1 GMP_NUMB_BITS) */
      mpn_mullow_n (tp, xp + n1, yp, n2);
      mpn_add_n (rp + n1, rp + n1, tp, n2);

      /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */
      mpn_mullow_n (tp, yp + n2, xp, n1);
      mpn_add_n (rp + n2, rp + n2, tp, n1);
      TMP_SFREE;
    }
  else
    {
예제 #10
0
void
mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
		  mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
		  mp_ptr tp)
{
  if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
      || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
    {
      mp_ptr p0, p1;
      unsigned i;

      /* Temporary storage: 3 rn + 2 mn */
      p0 = tp + rn;
      p1 = p0 + rn + mn;

      for (i = 0; i < 2; i++)
	{
	  MPN_COPY (tp, r0, rn);

	  if (rn >= mn)
	    {
	      mpn_mul (p0, r0, rn, m0, mn);
	      mpn_mul (p1, r1, rn, m3, mn);
	      mpn_mul (r0, r1, rn, m2, mn);
	      mpn_mul (r1, tp, rn, m1, mn);
	    }
	  else
	    {
	      mpn_mul (p0, m0, mn, r0, rn);
	      mpn_mul (p1, m3, mn, r1, rn);
	      mpn_mul (r0, m2, mn, r1, rn);
	      mpn_mul (r1, m1, mn, tp, rn);
	    }
	  r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn);
	  r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn);

	  r0 = r2; r1 = r3;
	}
    }
  else
    mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn,
			       m0, m1, m2, m3, mn, tp);
}
예제 #11
0
mp_size_t
mpn_mulmod_bnm1_next_size (mp_size_t n)
{
  mp_size_t nh;

  if (BELOW_THRESHOLD (n,     MULMOD_BNM1_THRESHOLD))
    return n;
  if (BELOW_THRESHOLD (n, 4 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
    return (n + (2-1)) & (-2);
  if (BELOW_THRESHOLD (n, 8 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
    return (n + (4-1)) & (-4);

  nh = (n + 1) >> 1;

  if (BELOW_THRESHOLD (nh, MUL_FFT_MODF_THRESHOLD))
    return (n + (8-1)) & (-8);

  return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 0));
}
예제 #12
0
void
mpn_binvert (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr scratch)
{
  mp_ptr xp;
  mp_size_t rn, newrn;
  mp_size_t sizes[NPOWS], *sizp;
  mp_limb_t di;

  /* Compute the computation precisions from highest to lowest, leaving the
     base case size in 'rn'.  */
  sizp = sizes;
  for (rn = n; ABOVE_THRESHOLD (rn, BINV_NEWTON_THRESHOLD); rn = (rn + 1) >> 1)
    *sizp++ = rn;

  xp = scratch;

  /* Compute a base value using a low-overhead O(n^2) algorithm.  FIXME: We
     should call some divide-and-conquer lsb division function here for an
     operand subrange.  */
  MPN_ZERO (xp, rn);
  xp[0] = 1;
  binvert_limb (di, up[0]);
  if (BELOW_THRESHOLD (rn, DC_BDIV_Q_THRESHOLD))
    mpn_sb_bdiv_q (rp, xp, rn, up, rn, -di);
  else
    mpn_dc_bdiv_q (rp, xp, rn, up, rn, -di);

  /* Use Newton iterations to get the desired precision.  */
  for (; rn < n; rn = newrn)
    {
      newrn = *--sizp;

#if WANT_FFT
      if (ABOVE_THRESHOLD (newrn, 2 * MUL_FFT_MODF_THRESHOLD))
	{
	  int k;
	  mp_size_t m, i;

	  k = mpn_fft_best_k (newrn, 0);
	  m = mpn_fft_next_size (newrn, k);
	  mpn_mul_fft (xp, m, up, newrn, rp, rn, k);
	  for (i = rn - 1; i >= 0; i--)
	    if (xp[i] > (i == 0))
	      {
		mpn_add_1 (xp + rn, xp + rn, newrn - rn, 1);
		break;
	      }
	}
      else
#endif
	mpn_mul (xp, up, newrn, rp, rn);
      mpn_mullow_n (rp + rn, rp, xp + rn, newrn - rn);
      mpn_neg_n (rp + rn, rp + rn, newrn - rn);
    }
}
예제 #13
0
mp_limb_t
mpn_dc_divappr_q_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
		    mp_srcptr dip, mp_ptr tp)
{
  mp_size_t lo, hi;
  mp_limb_t cy, qh, ql;

  lo = n >> 1;			/* floor(n/2) */
  hi = n - lo;			/* ceil(n/2) */

  if (BELOW_THRESHOLD (hi, DC_DIV_QR_THRESHOLD))
    qh = mpn_sb_div_qr (qp + lo, np + 2 * lo, 2 * hi, dp + lo, hi, dip);
  else
    qh = mpn_dc_div_qr_n (qp + lo, np + 2 * lo, dp + lo, hi, dip, tp);

  mpn_mul (tp, qp + lo, hi, dp, lo);

  cy = mpn_sub_n (np + lo, np + lo, tp, n);
  if (qh != 0)
    cy += mpn_sub_n (np + n, np + n, dp, lo);

  while (cy != 0)
    {
      qh -= mpn_sub_1 (qp + lo, qp + lo, hi, 1);
      cy -= mpn_add_n (np + lo, np + lo, dp, n);
    }

  if (BELOW_THRESHOLD (lo, DC_DIVAPPR_Q_THRESHOLD))
    ql = mpn_sb_divappr_q (qp, np + hi, 2 * lo, dp + hi, lo, dip);
  else
    ql = mpn_dc_divappr_q_n (qp, np + hi, dp + hi, lo, dip, tp);

  if (UNLIKELY (ql != 0))
    {
      mp_size_t i;
      for (i = 0; i < lo; i++)
	qp[i] = GMP_NUMB_MASK;
    }

  return qh;
}
예제 #14
0
void
mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (n == 1)
    invert_limb (*ip, *dp);
  else {
    TMP_DECL;

    TMP_MARK;
    if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD))
      {
	/* Maximum scratch needed by this branch: 2*n */
	mp_size_t i;
	mp_ptr xp;

	xp = scratch;				/* 2 * n limbs */
	for (i = n - 1; i >= 0; i--)
	  xp[i] = GMP_NUMB_MAX;
	mpn_com (xp + n, dp, n);
	if (n == 2) {
	  mpn_divrem_2 (ip, 0, xp, 4, dp);
	} else {
	  gmp_pi1_t inv;
	  invert_pi1 (inv, dp[n-1], dp[n-2]);
	  /* FIXME: should we use dcpi1_div_q, for big sizes? */
	  mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32);
	}
      }
    else { /* Use approximated inverse; correct the result if needed. */
      mp_limb_t e; /* The possible error in the approximate inverse */

      ASSERT ( mpn_invert_itch (n) >= mpn_invertappr_itch (n) );
      e = mpn_ni_invertappr (ip, dp, n, scratch);

      if (UNLIKELY (e)) { /* Assume the error can only be "0" (no error) or "1". */
	/* Code to detect and correct the "off by one" approximation. */
	mpn_mul_n (scratch, ip, dp, n);
	ASSERT_NOCARRY (mpn_add_n (scratch + n, scratch + n, dp, n));
	if (! mpn_add (scratch, scratch, 2*n, dp, n))
	  MPN_INCR_U (ip, n, 1); /* The value was wrong, correct it.  */
      }
    }
    TMP_FREE;
  }
}
예제 #15
0
void
mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (n == 1)
    invert_limb (*ip, *dp);
  else if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD))
    {
	/* Maximum scratch needed by this branch: 2*n */
	mp_size_t i;
	mp_ptr xp;

	xp = scratch;				/* 2 * n limbs */
	/* n > 1 here */
	i = n;
	do
	  xp[--i] = GMP_NUMB_MAX;
	while (i);
	mpn_com (xp + n, dp, n);
	if (n == 2) {
	  mpn_divrem_2 (ip, 0, xp, 4, dp);
	} else {
	  gmp_pi1_t inv;
	  invert_pi1 (inv, dp[n-1], dp[n-2]);
	  /* FIXME: should we use dcpi1_div_q, for big sizes? */
	  mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32);
	}
    }
  else { /* Use approximated inverse; correct the result if needed. */
      mp_limb_t e; /* The possible error in the approximate inverse */

      ASSERT ( mpn_invert_itch (n) >= mpn_invertappr_itch (n) );
      e = mpn_ni_invertappr (ip, dp, n, scratch);

      if (UNLIKELY (e)) { /* Assume the error can only be "0" (no error) or "1". */
	/* Code to detect and correct the "off by one" approximation. */
	mpn_mul_n (scratch, ip, dp, n);
	e = mpn_add_n (scratch, scratch, dp, n); /* FIXME: we only need e.*/
	if (LIKELY(e)) /* The high part can not give a carry by itself. */
	  e = mpn_add_nc (scratch + n, scratch + n, dp, n, e); /* FIXME:e */
	/* If the value was wrong (no carry), correct it (increment). */
	e ^= CNST_LIMB (1);
	MPN_INCR_U (ip, n, e);
      }
  }
}
예제 #16
0
mp_limb_t mpn_rsh_divrem_hensel_qr_1(mp_ptr qp, mp_srcptr xp, 
                               mp_size_t n, mp_limb_t d, int s, mp_limb_t cin)
{
   ASSERT(n > 0);
   ASSERT(s >= 0);
   ASSERT_MPN(xp, n);
   ASSERT(MPN_SAME_OR_SEPARATE_P(qp, xp, n));
   ASSERT(d%2 == 1);

   if (BELOW_THRESHOLD(n, RSH_DIVREM_HENSEL_QR_1_THRESHOLD))
      return mpn_rsh_divrem_hensel_qr_1_1(qp, xp, n, d, s, cin);

   return mpn_rsh_divrem_hensel_qr_1_2(qp, xp, n, d, s, cin);
}
예제 #17
0
static void
mod (mp_ptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, gmp_pi1_t *dinv, mp_ptr tp)
{
  mp_ptr qp;
  TMP_DECL;
  TMP_MARK;

  qp = tp;

  if (dn == 1)
    np[0] = mpn_divrem_1 (qp, (mp_size_t) 0, np, nn, dp[0]);
  else if (dn == 2)
    mpn_div_qr_2n_pi1 (qp, np, np, nn, dp[1], dp[0], dinv->inv32);
  else if (BELOW_THRESHOLD (dn, DC_DIV_QR_THRESHOLD) ||
	   BELOW_THRESHOLD (nn - dn, DC_DIV_QR_THRESHOLD))
    mpn_sbpi1_div_qr (qp, np, nn, dp, dn, dinv->inv32);
  else if (BELOW_THRESHOLD (dn, MUPI_DIV_QR_THRESHOLD) ||   /* fast condition */
	   BELOW_THRESHOLD (nn, 2 * MU_DIV_QR_THRESHOLD) || /* fast condition */
	   (double) (2 * (MU_DIV_QR_THRESHOLD - MUPI_DIV_QR_THRESHOLD)) * dn /* slow... */
	   + (double) MUPI_DIV_QR_THRESHOLD * nn > (double) dn * nn)    /* ...condition */
    {
      mpn_dcpi1_div_qr (qp, np, nn, dp, dn, dinv);
    }
  else
    {
      /* We need to allocate separate remainder area, since mpn_mu_div_qr does
	 not handle overlap between the numerator and remainder areas.
	 FIXME: Make it handle such overlap.  */
      mp_ptr rp = TMP_ALLOC_LIMBS (dn);
      mp_size_t itch = mpn_mu_div_qr_itch (nn, dn, 0);
      mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
      mpn_mu_div_qr (qp, rp, np, nn, dp, dn, scratch);
      MPN_COPY (np, rp, dn);
    }

  TMP_FREE;
}
예제 #18
0
// (rp,2n)=(xp,n)^2 with temp space (tp,2*n+C)
void	mpn_kara_sqr_n(mp_ptr rp,mp_srcptr xp,mp_size_t n,mp_ptr tp)
{mp_size_t n2,n3;mp_srcptr xl,xh;mp_ptr dx;mp_limb_t c;

n2=n>>1;
xl=xp;xh=xp+n2;n3=n-n2;
dx=rp+2*n2;
if((n&1)==0)
  {if(mpn_cmp(xh,xl,n2)>=0){mpn_sub_n(dx,xh,xl,n2);}else{mpn_sub_n(dx,xl,xh,n2);}}
else
  {if(xh[n2]!=0 || mpn_cmp(xh,xl,n2)>=0){c=mpn_sub_n(dx,xh,xl,n2);dx[n2]=xh[n2]-c;}else{mpn_sub_n(dx,xl,xh,n2);dx[n2]=0;}}
if(BELOW_THRESHOLD(n3,MUL_KARATSUBA_THRESHOLD))
  {mpn_mul_basecase(rp,xl,n2,xl,n2);
   mpn_mul_basecase(tp,dx,n3,dx,n3);
   mpn_mul_basecase(rp+2*n2,xh,n3,xh,n3);}
else if(BELOW_THRESHOLD(n3,SQR_KARATSUBA_THRESHOLD))
  {mpn_sqr_basecase(rp,xl,n2);
   mpn_sqr_basecase(tp,dx,n3);
   mpn_sqr_basecase(rp+2*n2,xh,n3);}
else
  {mpn_kara_sqr_n(rp,xl,n2,tp+2*n3);
   mpn_kara_sqr_n(tp,dx,n3,tp+2*n3);   
   mpn_kara_sqr_n(rp+2*n2,xh,n3,tp+2*n3);}
mpn_karasub(rp,tp,n);
return;}
예제 #19
0
/*
    mpn_dc_sqrlo requires a scratch space of 2*n limbs at tp.
    It accepts tp == rp.
*/
static void
mpn_dc_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp)
{
  mp_size_t n2, n1;
  ASSERT (n >= 2);
  ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
  ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));

  /* Divide-and-conquer */

  /* We need fractional approximation of the value 0 < a <= 1/2
     giving the minimum in the function k=(1-a)^e/(1-2*a^e).
  */
  if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11)))
    n1 = n >> 1;
  else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11)))
예제 #20
0
/* Define our own squaring function, which uses mpn_sqr_basecase for its
   allowed sizes, but its own code for larger sizes.  */
static void
mpn_local_sqr (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tp)
{
  mp_size_t i;

  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (rp, 2*n, up, n));

  if (BELOW_THRESHOLD (n, SQR_BASECASE_LIM))
    {
      mpn_sqr_basecase (rp, up, n);
      return;
    }

  {
    mp_limb_t ul, lpl;
    ul = up[0];
    umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
    rp[0] = lpl >> GMP_NAIL_BITS;
  }
  if (n > 1)
    {
      mp_limb_t cy;

      cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
      tp[n - 1] = cy;
      for (i = 2; i < n; i++)
	{
	  mp_limb_t cy;
	  cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
	  tp[n + i - 2] = cy;
	}
      MPN_SQR_DIAGONAL (rp + 2, up + 1, n - 1);

      {
	mp_limb_t cy;
#if HAVE_NATIVE_mpn_addlsh1_n
	cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);
#else
	cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
	cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);
#endif
	rp[2 * n - 1] += cy;
      }
    }
}
예제 #21
0
int
mpz_divisible_ui_p (mpz_srcptr a, mpir_ui d)
{
  mp_size_t  asize;
  mp_ptr     ap;
  unsigned   twos;

  asize = SIZ(a);
  if (UNLIKELY (d == 0))
    return (asize == 0);

  if (asize == 0)  /* 0 divisible by any d */
    return 1;

  /* For nails don't try to be clever if d is bigger than a limb, just fake
     up an mpz_t and go to the main mpz_divisible_p.  */
  if (d > GMP_NUMB_MAX)
    {
      mp_limb_t  dlimbs[2];
      mpz_t      dz;
      ALLOC(dz) = 2;
      PTR(dz) = dlimbs;
      mpz_set_ui (dz, d);
      return mpz_divisible_p (a, dz);
    }

  ap = PTR(a);
  asize = ABS(asize);  /* ignore sign of a */

  if (BELOW_THRESHOLD (asize, MODEXACT_1_ODD_THRESHOLD))
    return mpn_mod_1 (ap, asize, (mp_limb_t) d) == 0;

  if (! (d & 1))
    {
      /* Strip low zero bits to get odd d required by modexact.  If d==e*2^n
         and a is divisible by 2^n and by e, then it's divisible by d. */

      if ((ap[0] & LOW_ZEROS_MASK (d)) != 0)
        return 0;

      count_trailing_zeros (twos, (mp_limb_t) d);
      d >>= twos;
    }

  return mpn_modexact_1_odd (ap, asize, (mp_limb_t) d) == 0;
}
예제 #22
0
파일: hgcd.c 프로젝트: qsnake/mpir
mp_size_t
mpn_hgcd_itch (mp_size_t n)
{
  unsigned k;
  int count;
  mp_size_t nscaled;

  if (BELOW_THRESHOLD (n, HGCD_THRESHOLD))
    return MPN_HGCD_LEHMER_ITCH (n);

  /* Get the recursion depth. */
  nscaled = (n - 1) / (HGCD_THRESHOLD - 1);
  count_leading_zeros (count, nscaled);
  k = GMP_LIMB_BITS - count;

  return 20 * ((n+3) / 4) + 22 * k
    + MPN_HGCD_LEHMER_ITCH (HGCD_THRESHOLD);
}
예제 #23
0
/* Identical to mpn_hgcd_itch. FIXME: Do we really need to add
   HGCD_THRESHOLD at the end? */
mp_size_t
mpn_hgcd_appr_itch (mp_size_t n)
{
  if (BELOW_THRESHOLD (n, HGCD_APPR_THRESHOLD))
    return n;
  else
    {
      unsigned k;
      int count;
      mp_size_t nscaled;

      /* Get the recursion depth. */
      nscaled = (n - 1) / (HGCD_APPR_THRESHOLD - 1);
      count_leading_zeros (count, nscaled);
      k = GMP_LIMB_BITS - count;

      return 20 * ((n+3) / 4) + 22 * k + HGCD_THRESHOLD;
    }
}
예제 #24
0
mp_size_t
mpn_hgcd_reduce_itch (mp_size_t n, mp_size_t p)
{
  mp_size_t itch;
  if (BELOW_THRESHOLD (n, HGCD_REDUCE_THRESHOLD))
    {
      itch = mpn_hgcd_itch (n-p);

      /* For arbitrary p, the storage for _adjust is 2*(p + M->n) = 2 *
	 (p + ceil((n-p)/2) - 1 <= n + p - 1 */
      if (itch < n + p - 1)
	itch = n + p - 1;
    }
  else
    {
      itch = 2*(n-p) + mpn_hgcd_itch (n-p);
      /* Currently, hgcd_matrix_apply allocates its own storage. */
    }
  return itch;
}
예제 #25
0
// (rp,2n)=(xp,n)*(yp,n) with temp space (tp,2*n+C)
void	mpn_kara_mul_n(mp_ptr rp,mp_srcptr xp,mp_srcptr yp,mp_size_t n,mp_ptr tp)
{mp_size_t n2,n3;mp_srcptr xl,yl,xh,yh;mp_ptr dx,dy;int suboradd;mp_limb_t c;

n2=n>>1;suboradd=-1;
xl=xp;xh=xp+n2;yl=yp;yh=yp+n2;n3=n-n2;
dx=rp+2*n2;dy=dx+n3;
if((n&1)==0)
  {if(mpn_cmp(xh,xl,n2)>=0){mpn_sub_n(dx,xh,xl,n2);}else{mpn_sub_n(dx,xl,xh,n2);suboradd=-suboradd;}
   if(mpn_cmp(yh,yl,n2)>=0){mpn_sub_n(dy,yh,yl,n2);}else{mpn_sub_n(dy,yl,yh,n2);suboradd=-suboradd;}}
else
  {if(xh[n2]!=0 || mpn_cmp(xh,xl,n2)>=0){c=mpn_sub_n(dx,xh,xl,n2);dx[n2]=xh[n2]-c;}else{mpn_sub_n(dx,xl,xh,n2);dx[n2]=0;suboradd=-suboradd;}
   if(yh[n2]!=0 || mpn_cmp(yh,yl,n2)>=0){c=mpn_sub_n(dy,yh,yl,n2);dy[n2]=yh[n2]-c;}else{mpn_sub_n(dy,yl,yh,n2);dy[n2]=0;suboradd=-suboradd;}}
if(BELOW_THRESHOLD(n3,MUL_KARATSUBA_THRESHOLD))
  {mpn_mul_basecase(rp,xl,n2,yl,n2);
   mpn_mul_basecase(tp,dx,n3,dy,n3);
   mpn_mul_basecase(rp+2*n2,xh,n3,yh,n3);}
else
  {mpn_kara_mul_n(rp,xl,yl,n2,tp+2*n3);
   mpn_kara_mul_n(tp,dx,dy,n3,tp+2*n3);   
   mpn_kara_mul_n(rp+2*n2,xh,yh,n3,tp+2*n3);}
if(suboradd==-1){mpn_karasub(rp,tp,n);}else{mpn_karaadd(rp,tp,n);}
return;}
예제 #26
0
/* FIXME: Document storage need. */
mp_size_t
mpn_hgcd_reduce (struct hgcd_matrix *M,
		 mp_ptr ap, mp_ptr bp, mp_size_t n, mp_size_t p,
		 mp_ptr tp)
{
  mp_size_t nn;
  if (BELOW_THRESHOLD (n, HGCD_REDUCE_THRESHOLD))
    {
      nn = mpn_hgcd (ap + p, bp + p, n - p, M, tp);
      if (nn > 0)
	/* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
	   = 2 (n - 1) */
	return mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp);
    }
  else
    {
      MPN_COPY (tp, ap + p, n - p);
      MPN_COPY (tp + n - p, bp + p, n - p);
      if (mpn_hgcd_appr (tp, tp + n - p, n - p, M, tp + 2*(n-p)))
	return hgcd_matrix_apply (M, ap, bp, n);
    }
  return 0;
}
예제 #27
0
파일: mul_n.c 프로젝트: angavrilov/ecl-sse
void
mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
{
  mp_limb_t w, w0, w1;
  mp_size_t n2;
  mp_srcptr x, y;
  mp_size_t i;

  n2 = n >> 1;
  ASSERT (n2 > 0);

  if ((n & 1) != 0)
    {
      /* Odd length. */
      mp_size_t n1, n3, nm1;

      n3 = n - n2;

      w = a[n2];
      if (w != 0)
	w -= mpn_sub_n (p, a, a + n3, n2);
      else
	{
	  i = n2;
	  do
	    {
	      --i;
	      w0 = a[i];
	      w1 = a[n3 + i];
	    }
	  while (w0 == w1 && i != 0);
	  if (w0 < w1)
	    {
	      x = a + n3;
	      y = a;
	    }
	  else
	    {
	      x = a;
	      y = a + n3;
	    }
	  mpn_sub_n (p, x, y, n2);
	}
      p[n2] = w;

      n1 = n + 1;

      /* n2 is always either n3 or n3-1 so maybe the two sets of tests here
	 could be combined.  But that's not important, since the tests will
	 take a miniscule amount of time compared to the function calls.  */
      if (BELOW_THRESHOLD (n3, SQR_BASECASE_THRESHOLD))
	{
	  mpn_mul_basecase (ws, p, n3, p, n3);
	  mpn_mul_basecase (p,  a, n3, a, n3);
	}
      else if (BELOW_THRESHOLD (n3, SQR_KARATSUBA_THRESHOLD))
	{
	  mpn_sqr_basecase (ws, p, n3);
	  mpn_sqr_basecase (p,  a, n3);
	}
      else
	{
	  mpn_kara_sqr_n   (ws, p, n3, ws + n1);	 /* (x-y)^2 */
	  mpn_kara_sqr_n   (p,  a, n3, ws + n1);	 /* x^2	    */
	}
      if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
	mpn_mul_basecase (p + n1, a + n3, n2, a + n3, n2);
      else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
	mpn_sqr_basecase (p + n1, a + n3, n2);
      else
	mpn_kara_sqr_n   (p + n1, a + n3, n2, ws + n1);	 /* y^2	    */

      /* Since x^2+y^2-(x-y)^2 = 2xy >= 0 there's no need to track the
	 borrow from mpn_sub_n.	 If it occurs then it'll be cancelled by a
	 carry from ws[n].  Further, since 2xy fits in n1 limbs there won't
	 be any carry out of ws[n] other than cancelling that borrow. */

      mpn_sub_n (ws, p, ws, n1);	     /* x^2-(x-y)^2 */

      nm1 = n - 1;
      if (mpn_add_n (ws, p + n1, ws, nm1))   /* x^2+y^2-(x-y)^2 = 2xy */
	{
	  mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
	  ws[nm1] = x;
	  if (x == 0)
	    ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
	}
      if (mpn_add_n (p + n3, p + n3, ws, n1))
	{
	  mpn_incr_u (p + n1 + n3, 1);
	}
    }
  else
    {
      /* Even length. */
      i = n2;
      do
	{
	  --i;
	  w0 = a[i];
	  w1 = a[n2 + i];
	}
      while (w0 == w1 && i != 0);
      if (w0 < w1)
	{
	  x = a + n2;
	  y = a;
	}
      else
	{
	  x = a;
	  y = a + n2;
	}
      mpn_sub_n (p, x, y, n2);

      /* Pointwise products. */
      if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
	{
	  mpn_mul_basecase (ws,    p,      n2, p,      n2);
	  mpn_mul_basecase (p,     a,      n2, a,      n2);
	  mpn_mul_basecase (p + n, a + n2, n2, a + n2, n2);
	}
      else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
	{
	  mpn_sqr_basecase (ws,    p,      n2);
	  mpn_sqr_basecase (p,     a,      n2);
	  mpn_sqr_basecase (p + n, a + n2, n2);
	}
      else
	{
	  mpn_kara_sqr_n (ws,    p,      n2, ws + n);
	  mpn_kara_sqr_n (p,     a,      n2, ws + n);
	  mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
	}

      /* Interpolate. */
      w = -mpn_sub_n (ws, p, ws, n);
      w += mpn_add_n (ws, p + n, ws, n);
      w += mpn_add_n (p + n2, p + n2, ws, n);
      MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
    }
}
예제 #28
0
mp_limb_t
mpn_dcpi1_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
		     mp_srcptr dp, mp_size_t dn, gmp_pi1_t *dinv)
{
  mp_size_t qn;
  mp_limb_t qh, cy, qsave;
  mp_ptr tp;
  TMP_DECL;

  TMP_MARK;

  ASSERT (dn >= 6);
  ASSERT (nn > dn);
  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);

  qn = nn - dn;
  qp += qn;
  np += nn;
  dp += dn;

  if (qn >= dn)
    {
      qn++;			/* pretend we'll need an extra limb */
      /* Reduce qn mod dn without division, optimizing small operations.  */
      do
	qn -= dn;
      while (qn > dn);

      qp -= qn;			/* point at low limb of next quotient block */
      np -= qn;			/* point in the middle of partial remainder */

      tp = TMP_SALLOC_LIMBS (dn);

      /* Perform the typically smaller block first.  */
      if (qn == 1)
	{
	  mp_limb_t q, n2, n1, n0, d1, d0;

	  /* Handle qh up front, for simplicity. */
	  qh = mpn_cmp (np - dn + 1, dp - dn, dn) >= 0;
	  if (qh)
	    ASSERT_NOCARRY (mpn_sub_n (np - dn + 1, np - dn + 1, dp - dn, dn));

	  /* A single iteration of schoolbook: One 3/2 division,
	     followed by the bignum update and adjustment. */
	  n2 = np[0];
	  n1 = np[-1];
	  n0 = np[-2];
	  d1 = dp[-1];
	  d0 = dp[-2];

	  ASSERT (n2 < d1 || (n2 == d1 && n1 <= d0));

	  if (UNLIKELY (n2 == d1) && n1 == d0)
	    {
	      q = GMP_NUMB_MASK;
	      cy = mpn_submul_1 (np - dn, dp - dn, dn, q);
	      ASSERT (cy == n2);
	    }
	  else
	    {
	      udiv_qr_3by2 (q, n1, n0, n2, n1, n0, d1, d0, dinv->inv32);

	      if (dn > 2)
		{
		  mp_limb_t cy, cy1;
		  cy = mpn_submul_1 (np - dn, dp - dn, dn - 2, q);

		  cy1 = n0 < cy;
		  n0 = (n0 - cy) & GMP_NUMB_MASK;
		  cy = n1 < cy1;
		  n1 = (n1 - cy1) & GMP_NUMB_MASK;
		  np[-2] = n0;

		  if (UNLIKELY (cy != 0))
		    {
		      n1 += d1 + mpn_add_n (np - dn, np - dn, dp - dn, dn - 1);
		      qh -= (q == 0);
		      q = (q - 1) & GMP_NUMB_MASK;
		    }
		}
	      else
		np[-2] = n0;

	      np[-1] = n1;
	    }
	  qp[0] = q;
	}
      else
	{
	  if (qn == 2)
	    qh = mpn_divrem_2 (qp, 0L, np - 2, 4, dp - 2);
	  else if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
	    qh = mpn_sbpi1_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv->inv32);
	  else
	    qh = mpn_dcpi1_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp);

	  if (qn != dn)
	    {
	      if (qn > dn - qn)
		mpn_mul (tp, qp, qn, dp - dn, dn - qn);
	      else
		mpn_mul (tp, dp - dn, dn - qn, qp, qn);

	      cy = mpn_sub_n (np - dn, np - dn, tp, dn);
	      if (qh != 0)
		cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);

	      while (cy != 0)
		{
		  qh -= mpn_sub_1 (qp, qp, qn, 1);
		  cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
		}
	    }
	}
      qn = nn - dn - qn + 1;
      while (qn > dn)
	{
	  qp -= dn;
	  np -= dn;
	  mpn_dcpi1_div_qr_n (qp, np - dn, dp - dn, dn, dinv, tp);
	  qn -= dn;
	}

      /* Since we pretended we'd need an extra quotient limb before, we now
	 have made sure the code above left just dn-1=qn quotient limbs to
	 develop.  Develop that plus a guard limb. */
      qn--;
      qp -= qn;
      np -= dn;
      qsave = qp[qn];
      mpn_dcpi1_divappr_q_n (qp, np - dn, dp - dn, dn, dinv, tp);
      MPN_COPY_INCR (qp, qp + 1, qn);
      qp[qn] = qsave;
    }
  else    /* (qn < dn) */
    {
      mp_ptr q2p;
#if 0				/* not possible since we demand nn > dn */
      if (qn == 0)
	{
	  qh = mpn_cmp (np - dn, dp - dn, dn) >= 0;
	  if (qh)
	    mpn_sub_n (np - dn, np - dn, dp - dn, dn);
	  TMP_FREE;
	  return qh;
	}
#endif

      qp -= qn;			/* point at low limb of next quotient block */
      np -= qn;			/* point in the middle of partial remainder */

      q2p = TMP_SALLOC_LIMBS (qn + 1);
      /* Should we at all check DC_DIVAPPR_Q_THRESHOLD here, or reply on
	 callers not to be silly?  */
      if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD))
	{
	  qh = mpn_sbpi1_divappr_q (q2p, np - qn - 2, 2 * (qn + 1),
				    dp - (qn + 1), qn + 1, dinv->inv32);
	}
      else
	{
	  /* It is tempting to use qp for recursive scratch and put quotient in
	     tp, but the recursive scratch needs one limb too many.  */
	  tp = TMP_SALLOC_LIMBS (qn + 1);
	  qh = mpn_dcpi1_divappr_q_n (q2p, np - qn - 2, dp - (qn + 1), qn + 1, dinv, tp);
	}
      MPN_COPY (qp, q2p + 1, qn);
    }

  TMP_FREE;
  return qh;
}
예제 #29
0
mp_limb_t
mpn_dc_div_qr (mp_ptr qp,
		  mp_ptr np, mp_size_t nn,
		  mp_srcptr dp, mp_size_t dn,
		  mp_limb_t dinv)
{
  mp_size_t qn;
  mp_limb_t qh, cy;
  mp_ptr tp;
  TMP_DECL;

  TMP_MARK;

  ASSERT (dn >= 6);		/* to adhere to mpn_sb_div_qr's limits */
  ASSERT (nn - dn >= 3);	/* to adhere to mpn_sb_div_qr's limits */
  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);

  tp = TMP_ALLOC_LIMBS (DC_DIVAPPR_Q_N_ITCH(dn));

  qn = nn - dn;
  qp += qn;
  np += nn;
  dp += dn;

  if (qn > dn)
    {
      /* Reduce qn mod dn without division, optimizing small operations.  */
      do
	qn -= dn;
      while (qn > dn);

      qp -= qn;			/* point at low limb of next quotient block */
      np -= qn;			/* point in the middle of partial remainder */

      /* Perform the typically smaller block first.  */
      if (qn == 1)
	{
	  mp_limb_t q, n2, n1, n0, d1, d0, d11, d01;

	  /* Handle qh up front, for simplicity. */
	  qh = mpn_cmp (np - dn + 1, dp - dn, dn) >= 0;
	  if (qh)
	    ASSERT_NOCARRY (mpn_sub_n (np - dn + 1, np - dn + 1, dp - dn, dn));

	  /* A single iteration of schoolbook: One 3/2 division,
	     followed by the bignum update and adjustment. */
	  n2 = np[0];
	  n1 = np[-1];
	  n0 = np[-2];
	  d1 = dp[-1];
	  d0 = dp[-2];
     d01 = d0 + 1;
     d11 = d1 + (d01 < d0);

	  ASSERT (n2 < d1 || (n2 == d1 && n1 <= d0));

	  if (UNLIKELY (n2 == d1) && n1 == d0)
	    {
	      q = GMP_NUMB_MASK;
	      cy = mpn_submul_1 (np - dn, dp - dn, dn, q);
	      ASSERT (cy == n2);
	    }
	  else
	    {
	      mpir_divrem32_preinv2 (q, n1, n0, n2, n1, n0, d11, d01, d1, d0, dinv);

	      if (dn > 2)
		{
		  mp_limb_t cy, cy1;
		  cy = mpn_submul_1 (np - dn, dp - dn, dn - 2, q);

		  cy1 = n0 < cy;
		  n0 = (n0 - cy) & GMP_NUMB_MASK;
		  cy = n1 < cy1;
		  n1 = (n1 - cy1) & GMP_NUMB_MASK;
		  np[-2] = n0;

		  if (UNLIKELY (cy != 0))
		    {
		      n1 += d1 + mpn_add_n (np - dn, np - dn, dp - dn, dn - 1);
		      qh -= (q == 0);
		      q = (q - 1) & GMP_NUMB_MASK;
		    }
		}
	      else
		np[-2] = n0;

	      np[-1] = n1;
	    }
	  qp[0] = q;
	}
      else
	{
      /* Do a 2qn / qn division */
	  if (qn == 2)
	    qh = mpn_divrem_2 (qp, 0L, np - 2, 4, dp - 2); /* FIXME: obsolete function. Use 5/3 division? */
	  else if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
	    qh = mpn_sb_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv);
	  else
	    qh = mpn_dc_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp);

	  if (qn != dn)
	    {
	      if (qn > dn - qn)
		mpn_mul (tp, qp, qn, dp - dn, dn - qn);
	      else
		mpn_mul (tp, dp - dn, dn - qn, qp, qn);

	      cy = mpn_sub_n (np - dn, np - dn, tp, dn);
	      if (qh != 0)
		cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);

	      while (cy != 0)
		{
		  qh -= mpn_sub_1 (qp, qp, qn, 1);
		  cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
		}
	    }
	}

      qn = nn - dn - qn;
      do
	{
	  qp -= dn;
	  np -= dn;
	  ASSERT_NOCARRY(mpn_dc_div_qr_n (qp, np - dn, dp - dn, dn, dinv, tp));
	  qn -= dn;
	}
      while (qn > 0);
    }
  else
    {
      qp -= qn;			/* point at low limb of next quotient block */
      np -= qn;			/* point in the middle of partial remainder */

      if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
	qh = mpn_sb_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv);
      else
	qh = mpn_dc_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp);

      if (qn != dn)
	{
	  if (qn > dn - qn)
	    mpn_mul (tp, qp, qn, dp - dn, dn - qn);
	  else
	    mpn_mul (tp, dp - dn, dn - qn, qp, qn);

	  cy = mpn_sub_n (np - dn, np - dn, tp, dn);
	  if (qh != 0)
	    cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);

	  while (cy != 0)
	    {
	      qh -= mpn_sub_1 (qp, qp, qn, 1);
	      cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
	    }
	}
    }

  TMP_FREE;
  return qh;
}
예제 #30
0
/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1)
 *
 * The result is expected to be ZERO if and only if one of the operand
 * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
 * B^rn-1. This should not be a problem if mulmod_bnm1 is used to
 * combine results and obtain a natural number when one knows in
 * advance that the final value is less than (B^rn-1).
 * Moreover it should not be a problem if mulmod_bnm1 is used to
 * compute the full product with an+bn <= rn, because this condition
 * implies (B^an-1)(B^bn-1) < (B^rn-1) .
 *
 * Requires 0 < bn <= an <= rn and an + bn > rn/2
 * Scratch need: rn + (need for recursive call OR rn + 4). This gives
 *
 * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4
 */
void
mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp)
{
  ASSERT (0 < bn);
  ASSERT (bn <= an);
  ASSERT (an <= rn);

  if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD))
    {
      if (UNLIKELY (bn < rn))
	{
	  if (UNLIKELY (an + bn <= rn))
	    {
	      mpn_mul (rp, ap, an, bp, bn);
	    }
	  else
	    {
	      mp_limb_t cy;
	      mpn_mul (tp, ap, an, bp, bn);
	      cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn);
	      MPN_INCR_U (rp, rn, cy);
	    }
	}
      else
	mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp);
    }
  else
    {
      mp_size_t n;
      mp_limb_t cy;
      mp_limb_t hi;

      n = rn >> 1;

      /* We need at least an + bn >= n, to be able to fit one of the
	 recursive products at rp. Requiring strict inequality makes
	 the coded slightly simpler. If desired, we could avoid this
	 restriction by initially halving rn as long as rn is even and
	 an + bn <= rn/2. */

      ASSERT (an + bn > n);

      /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
	 and crt together as

	 x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
      */

#define a0 ap
#define a1 (ap + n)
#define b0 bp
#define b1 (bp + n)

#define xp  tp	/* 2n + 2 */
      /* am1  maybe in {xp, n} */
      /* bm1  maybe in {xp + n, n} */
#define sp1 (tp + 2*n + 2)
      /* ap1  maybe in {sp1, n + 1} */
      /* bp1  maybe in {sp1 + n + 1, n + 1} */

      {
	mp_srcptr am1, bm1;
	mp_size_t anm, bnm;
	mp_ptr so;

	bm1 = b0;
	bnm = bn;
	if (LIKELY (an > n))
	  {
	    am1 = xp;
	    cy = mpn_add (xp, a0, n, a1, an - n);
	    MPN_INCR_U (xp, n, cy);
	    anm = n;
	    so = xp + n;
	    if (LIKELY (bn > n))
	      {
		bm1 = so;
		cy = mpn_add (so, b0, n, b1, bn - n);
		MPN_INCR_U (so, n, cy);
		bnm = n;
		so += n;
	      }
	  }
	else
	  {
	    so = xp;
	    am1 = a0;
	    anm = an;
	  }

	mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so);
      }

      {
	int       k;
	mp_srcptr ap1, bp1;
	mp_size_t anp, bnp;

	bp1 = b0;
	bnp = bn;
	if (LIKELY (an > n)) {
	  ap1 = sp1;
	  cy = mpn_sub (sp1, a0, n, a1, an - n);
	  sp1[n] = 0;
	  MPN_INCR_U (sp1, n + 1, cy);
	  anp = n + ap1[n];
	  if (LIKELY (bn > n)) {
	    bp1 = sp1 + n + 1;
	    cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n);
	    sp1[2*n+1] = 0;
	    MPN_INCR_U (sp1 + n + 1, n + 1, cy);
	    bnp = n + bp1[n];
	  }
	} else {
	  ap1 = a0;
	  anp = an;
	}

	if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
	  k=0;
	else
	  {
	    int mask;
	    k = mpn_fft_best_k (n, 0);
	    mask = (1<<k) - 1;
	    while (n & mask) {k--; mask >>=1;};
	  }
	if (k >= FFT_FIRST_K)
	  xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k);
	else if (UNLIKELY (bp1 == b0))
	  {
	    ASSERT (anp + bnp <= 2*n+1);
	    ASSERT (anp + bnp > n);
	    ASSERT (anp >= bnp);
	    mpn_mul (xp, ap1, anp, bp1, bnp);
	    anp = anp + bnp - n;
	    ASSERT (anp <= n || xp[2*n]==0);
	    anp-= anp > n;
	    cy = mpn_sub (xp, xp, n, xp + n, anp);
	    xp[n] = 0;
	    MPN_INCR_U (xp, n+1, cy);
	  }
	else
	  mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp);
      }

      /* Here the CRT recomposition begins.

	 xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
	 Division by 2 is a bitwise rotation.

	 Assumes xp normalised mod (B^n+1).

	 The residue class [0] is represented by [B^n-1]; except when
	 both input are ZERO.
      */

#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
#if HAVE_NATIVE_mpn_rsh1add_nc
      cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
      hi = cy << (GMP_NUMB_BITS - 1);
      cy = 0;
      /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
	 overflows, i.e. a further increment will not overflow again. */
#else /* ! _nc */
      cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
	 the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
#endif
#if GMP_NAIL_BITS == 0
      add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
#else
      cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
      rp[n-1] ^= hi;
#endif
#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
#if HAVE_NATIVE_mpn_add_nc
      cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
#else /* ! _nc */
      cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
#endif
      cy += (rp[0]&1);
      mpn_rshift(rp, rp, n, 1);
      ASSERT (cy <= 2);
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* We can have cy != 0 only if hi = 0... */
      ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
      rp[n-1] |= hi;
      /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
#endif
      ASSERT (cy <= 1);
      /* Next increment can not overflow, read the previous comments about cy. */
      ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
      MPN_INCR_U(rp, n, cy);

      /* Compute the highest half:
	 ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
       */
      if (UNLIKELY (an + bn < rn))
	{
	  /* Note that in this case, the only way the result can equal
	     zero mod B^{rn} - 1 is if one of the inputs is zero, and
	     then the output of both the recursive calls and this CRT
	     reconstruction is zero, not B^{rn} - 1. Which is good,
	     since the latter representation doesn't fit in the output
	     area.*/
	  cy = mpn_sub_n (rp + n, rp, xp, an + bn - n);

	  /* FIXME: This subtraction of the high parts is not really
	     necessary, we do it to get the carry out, and for sanity
	     checking. */
	  cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n,
				   xp + an + bn - n, rn - (an + bn), cy);
	  ASSERT (an + bn == rn - 1 ||
		  mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn)));
	  cy = mpn_sub_1 (rp, rp, an + bn, cy);
	  ASSERT (cy == (xp + an + bn - n)[0]);
	}
      else
	{
	  cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
	  /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
	     DECR will affect _at most_ the lowest n limbs. */
	  MPN_DECR_U (rp, 2*n, cy);
	}
#undef a0
#undef a1
#undef b0
#undef b1
#undef xp
#undef sp1
    }
}