Example No. 1
/* (rp, 2n) = (xp, n)*(yp, n) */
void
mpn_mulhigh_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
{
  mp_limb_t t;

  ASSERT(n > 0);
  ASSERT_MPN(xp, n);
  ASSERT_MPN(yp, n);
  ASSERT(!MPN_OVERLAP_P(rp, 2 * n, xp, n));
  ASSERT(!MPN_OVERLAP_P(rp, 2 * n, yp, n));
  
  if (BELOW_THRESHOLD(n, MULHIGH_BASECASE_THRESHOLD))
    {
      mpn_mul_basecase(rp, xp, n, yp, n);
      
      return;
    }
  
  if (ABOVE_THRESHOLD (n, MULHIGH_MUL_THRESHOLD))
    {
      mpn_mul_n(rp, xp, yp, n);
      
      return;
    }

  mpn_mulshort_n(rp, xp, yp, n);
  t = rp[n - 1] + n - 2;
  
  if (UNLIKELY(t < n - 2))
    mpn_mul_n(rp, xp, yp, n);
  
  return;
}
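A minimal test sketch (not part of MPIR): it compares the limbs from index n upward of mpn_mulhigh_n with a full product from the public mpn_mul_n. Since the dispatcher above falls back to an exact multiply whenever carries from the omitted low terms could reach limb n, those top limbs should agree; the limb at index n - 1 is only an approximation. The sketch assumes it is compiled inside the MPIR source tree, where gmp-impl.h supplies the internal prototype.

#include <stdlib.h>
#include "mpir.h"
#include "gmp-impl.h"

static int
check_mulhigh (mp_size_t n)
{
  mp_ptr xp = malloc (n * sizeof (mp_limb_t));
  mp_ptr yp = malloc (n * sizeof (mp_limb_t));
  mp_ptr hi = malloc (2 * n * sizeof (mp_limb_t));
  mp_ptr full = malloc (2 * n * sizeof (mp_limb_t));
  int ok;

  mpn_random (xp, n);
  mpn_random (yp, n);

  mpn_mulhigh_n (hi, xp, yp, n);
  mpn_mul_n (full, xp, yp, n);

  ok = mpn_cmp (hi + n, full + n, n) == 0;  /* only the top n limbs are compared */

  free (xp); free (yp); free (hi); free (full);
  return ok;
}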
Example No. 2
// k degree poly so have k+1 coeffs and first k are size n
// k>3 so we can do the first add unconditionally 
int	mpn_toom_eval_pm1(mp_ptr pp,mp_ptr mp,unsigned int k,mp_srcptr xp,mp_size_t n,mp_size_t m,mp_ptr tp)
{int isneg=0;unsigned int i;

ASSERT(k>3);ASSERT(n>=m);ASSERT(m>0);ASSERT_MPN(xp,n*k+m);
//ASSERT_SPACE(pp,n+1);ASSERT_SPACE(mp,n+1);ASSERT_SPACE(tp,n+1);
ASSERT(!MPN_OVERLAP_P(pp,n+1,mp,n+1));ASSERT(!MPN_OVERLAP_P(pp,n+1,xp,n*k+m));ASSERT(!MPN_OVERLAP_P(pp,n+1,tp,n+1));
ASSERT(!MPN_OVERLAP_P(mp,n+1,xp,n*k+m));ASSERT(!MPN_OVERLAP_P(xp,n*k+m,tp,n+1));
#if ! HAVE_NATIVE_mpn_sumdiff_n
ASSERT(!MPN_OVERLAP_P(mp,n+1,tp,n+1));
#endif
#if HAVE_NATIVE_mpn_addadd_n
if(k==4){pp[n]=mpn_add_n(pp,xp,xp+2*n,n);tp[n]=mpn_add_n(tp,xp+n,xp+3*n,n);}else
if(k==5){pp[n]=mpn_addadd_n(pp,xp,xp+2*n,xp+4*n,n);tp[n]=mpn_add_n(tp,xp+n,xp+3*n,n);}else
  {pp[n]=mpn_addadd_n(pp,xp,xp+2*n,xp+4*n,n);tp[n]=mpn_addadd_n(tp,xp+n,xp+3*n,xp+5*n,n);
   for(i=7;i<k-2;i+=4){pp[n]+=mpn_addadd_n(pp,pp,xp+(i-1)*n,xp+(i+1)*n,n);tp[n]+=mpn_addadd_n(tp,tp,xp+i*n,xp+(i+2)*n,n);}
   if(k%4==3){pp[n]+=mpn_add_n(pp,pp,xp+(k-1)*n,n);}
   if(k%4==0){pp[n]+=mpn_add_n(pp,pp,xp+(k-2)*n,n);tp[n]+=mpn_add_n(tp,tp,xp+(k-1)*n,n);}
   if(k%4==1){pp[n]+=mpn_addadd_n(pp,pp,xp+(k-3)*n,xp+(k-1)*n,n);tp[n]+=mpn_add_n(tp,tp,xp+(k-2)*n,n);}}
if(k%2==0){pp[n]+=mpn_add(pp,pp,n,xp+k*n,m);}else{tp[n]+=mpn_add(tp,tp,n,xp+k*n,m);}
#else
// pp accumulates the even blocks  xp+0  xp+2n  xp+4n  xp+6n ... xp+jn  where j<=k-1
// tp accumulates the odd blocks   xp+n  xp+3n  xp+5n  xp+7n ... xp+jn  where j<=k-1
pp[n]=mpn_add_n(pp,xp,xp+2*n,n);tp[n]=mpn_add_n(tp,xp+n,xp+3*n,n);
for(i=5;i<k;i+=2){pp[n]+=mpn_add_n(pp,pp,xp+(i-1)*n,n);tp[n]+=mpn_add_n(tp,tp,xp+i*n,n);}
if(k%2==1){pp[n]+=mpn_add_n(pp,pp,xp+(k-1)*n,n);tp[n]+=mpn_add(tp,tp,n,xp+k*n,m);}else{pp[n]+=mpn_add(pp,pp,n,xp+k*n,m);}
#endif
if(mpn_cmp(tp,pp,n+1)>0)isneg=-1;
#if HAVE_NATIVE_mpn_sumdiff_n
if(isneg==0){mpn_sumdiff_n(pp,mp,pp,tp,n+1);}else{mpn_sumdiff_n(pp,mp,tp,pp,n+1);}
#else
if(isneg==0){mpn_sub_n(mp,pp,tp,n+1);}else{mpn_sub_n(mp,tp,pp,n+1);}
mpn_add_n(pp,pp,tp,n+1);  
#endif
return isneg;}
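For reference, a hedged mpz sketch (not MPIR code; a nail-free build is assumed) of what this evaluation computes: treating {xp, k*n + m} as a degree-k polynomial f whose coefficient a_i is the i-th block of n limbs (the last block has m limbs), pp above receives f(1), mp receives |f(-1)|, and the return value is -1 exactly when f(-1) is negative.

#include <gmp.h>

/* block_bits should be n * GMP_NUMB_BITS; p1 gets f(1), m1 gets |f(-1)|. */
static int
ref_eval_pm1 (mpz_t p1, mpz_t m1, const mpz_t x, unsigned int k,
              mp_bitcnt_t block_bits)
{
  mpz_t a, even, odd, t;
  unsigned int i;
  int neg;

  mpz_inits (a, even, odd, t, NULL);
  mpz_set (t, x);
  for (i = 0; i <= k; i++)
    {
      mpz_fdiv_r_2exp (a, t, block_bits);   /* coefficient a_i         */
      mpz_fdiv_q_2exp (t, t, block_bits);   /* move to the next block  */
      if (i % 2 == 0)
        mpz_add (even, even, a);
      else
        mpz_add (odd, odd, a);
    }
  mpz_add (p1, even, odd);                  /* f(1)           */
  mpz_sub (m1, even, odd);                  /* f(-1), signed  */
  neg = mpz_sgn (m1) < 0;
  mpz_abs (m1, m1);
  mpz_clears (a, even, odd, t, NULL);
  return neg ? -1 : 0;
}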
Example No. 3
/* (rp, 2n) = (xp, n)*(yp, n) / B^n */ 
inline static void
mpn_mulshort_n_basecase(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
{
  mp_size_t i, k;

#if GMP_NAIL_BITS==0
  mp_limb_t t1, t2, t3;
#endif

  ASSERT(n >= 3);  /* this restriction doesn't make a lot of sense in general */
  ASSERT_MPN(xp, n);
  ASSERT_MPN(yp, n);
  ASSERT(!MPN_OVERLAP_P (rp, 2 * n, xp, n));
  ASSERT(!MPN_OVERLAP_P (rp, 2 * n, yp, n));

  k = n - 2; /* so want short product sum_(i + j >= k) x[i]y[j]B^(i + j) */

#if GMP_NAIL_BITS!=0
  rp[n] = mpn_mul_1(rp + k, xp + k, 2, yp[0]);
#else

  umul_ppmm(t1, rp[k], xp[k], yp[0]);
  umul_ppmm(t3, t2, xp[k + 1], yp[0]);
  add_ssaaaa(rp[n], rp[k + 1], t3, t2, 0, t1);
#endif

  for (i = 1; i <= n - 2; i++)
     rp[n + i] = mpn_addmul_1 (rp + k, xp + k - i, 2 + i, yp[i]);
  
  rp[n + n - 1] = mpn_addmul_1 (rp + n - 1, xp, n, yp[n - 1]);
  
  return;
}
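The short product built by this basecase is exactly the set of terms x[i]*y[j] with i + j >= n - 2, accumulated at positions n - 2 through 2n - 1. A toy model with 32-bit limbs (an illustration of that index set, not MPIR code) is sketched below; r[0 .. n-3] is deliberately left untouched, as in the code above.

#include <stdint.h>

static void
toy_mulshort (uint32_t *r, const uint32_t *x, const uint32_t *y, int n)
{
  int i, j, p;

  for (p = n - 2; p < 2 * n; p++)
    r[p] = 0;

  for (i = 0; i < n; i++)
    for (j = 0; j < n; j++)
      {
        uint64_t t, s, c;

        if (i + j < n - 2)
          continue;                          /* skip the low triangle */
        t = (uint64_t) x[i] * y[j];
        p = i + j;
        s = (uint64_t) r[p] + (uint32_t) t;  /* add the low half      */
        r[p] = (uint32_t) s;
        c = (t >> 32) + (s >> 32);           /* high half plus carry  */
        for (p = p + 1; c != 0; p++)
          {
            s = (uint64_t) r[p] + c;
            r[p] = (uint32_t) s;
            c = s >> 32;
          }
      }
}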
Example No. 4
void
mpn_mulmid_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1));
  ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n));

  if (n < MULMID_TOOM42_THRESHOLD)
    {
      mpn_mulmid_basecase (rp, ap, 2*n - 1, bp, n);
    }
  else
    {
      mp_size_t k;
      mp_ptr scratch;
      TMP_DECL;

      k = mpn_toom42_mulmid_itch (n);

      if (k <= 1000)
        k = 1000;

      TMP_MARK;
      scratch = TMP_ALLOC_LIMBS (k);
      mpn_toom42_mulmid (rp, ap, bp, n, scratch);
      TMP_FREE;
    }
}
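A hedged mpz reference for the middle-product convention assumed here (my reading of the mpn_mulmid interface, not code from GMP or MPIR): the n + 2 result limbs hold only the diagonal band of the full product, i.e. the terms ap[i]*bp[j] with n - 1 <= i + j <= 2n - 2, divided by B^(n-1), without the carries that a full product would bring in from below.

#include <gmp.h>

static void
ref_mulmid_n (mpz_t r, const mp_limb_t *ap, const mp_limb_t *bp, long n)
{
  mpz_t a, b, term;
  long i, j;

  mpz_inits (a, b, term, NULL);
  mpz_set_ui (r, 0);
  for (i = 0; i < 2 * n - 1; i++)
    for (j = 0; j < n; j++)
      {
        if (i + j < n - 1 || i + j > 2 * n - 2)
          continue;                          /* outside the band */
        mpz_import (a, 1, -1, sizeof (mp_limb_t), 0, 0, &ap[i]);
        mpz_import (b, 1, -1, sizeof (mp_limb_t), 0, 0, &bp[j]);
        mpz_mul (term, a, b);
        mpz_mul_2exp (term, term, (mp_bitcnt_t) (i + j) * GMP_NUMB_BITS);
        mpz_add (r, r, term);
      }
  mpz_fdiv_q_2exp (r, r, (mp_bitcnt_t) (n - 1) * GMP_NUMB_BITS);  /* exact */
  mpz_clears (a, b, term, NULL);
}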
Example No. 5
void
my__gmpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)
{
	ASSERT_ALWAYS (qxn == 0);

	ASSERT (nn >= 0);
	ASSERT (dn >= 0);
	ASSERT (dn == 0 || dp[dn - 1] != 0);
	ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1 + qxn, np, nn));
	ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1 + qxn, dp, dn));

	int adjust;
	gmp_pi1_t dinv;
	TMP_DECL;
	TMP_MARK;
								 /* conservative tests for quotient size */
	adjust = np[nn - 1] >= dp[dn - 1];
	mp_ptr n2p, d2p;
	mp_limb_t cy;
	int cnt;

	qp[nn - dn] = 0;			 /* zero high quotient limb */
	count_leading_zeros (cnt, dp[dn - 1]);
	cnt -= GMP_NAIL_BITS;
	d2p = TMP_ALLOC_LIMBS (dn);
	mpn_lshift (d2p, dp, dn, cnt);

	for (int i = 0; i < dn; i++)
	{
		printf("d2p %08lx\n", (unsigned long) d2p[i]);
	}


	n2p = TMP_ALLOC_LIMBS (nn + 1);
	cy = mpn_lshift (n2p, np, nn, cnt);
	for (int i = 0; i < nn; i++)
	{
		printf("n2p %08lx\n", (unsigned long) n2p[i]);
	}
	n2p[nn] = cy;
	nn += adjust;

        printf("d2p[dn-1] = %08lx\nd2p[dn-2] = %08lx\n", d2p[dn-1], d2p[dn-2]);
	invert_pi1 (dinv, d2p[dn - 1], d2p[dn - 2]);
        printf("dinv %08lx\n", dinv.inv32);
	my_mpn_sbpi1_div_qr (qp, n2p, nn, d2p, dn, dinv.inv32);
	for (int i = 0; i < nn; i++)
	{
		printf("inside qp %08lx\n", (unsigned long) qp[i]);
	}
	n2p[nn] = cy;

	mpn_rshift (rp, n2p, dn, cnt);
	TMP_FREE;
	return;

}
Example No. 6
/* 
   c is the top bits of the inputs, (fully reduced)
   c & 2 is the top bit of y
   c & 1 is the top bit of z
*/
int
mpn_mulmod_2expp1_basecase (mp_ptr xp, mp_srcptr yp, mp_srcptr zp, 
                                           int c, mpir_ui b, mp_ptr tp)
{
  int cy, cz;
  mp_size_t n, k;

  cy = c & 2;
  cz = c & 1;

  n = BITS_TO_LIMBS (b);
  k = GMP_NUMB_BITS * n - b;

  ASSERT(b > 0);
  ASSERT(n > 0);
  ASSERT_MPN(yp, n);
  ASSERT_MPN(zp, n);
  ASSERT(!MPN_OVERLAP_P (tp, 2 * n, yp, n));
  ASSERT(!MPN_OVERLAP_P (tp, 2 * n, zp, n));
  ASSERT(MPN_SAME_OR_SEPARATE_P (xp, tp, n));
  ASSERT(MPN_SAME_OR_SEPARATE_P (xp, tp + n, n));
  ASSERT(k == 0 || yp[n - 1] >> (GMP_NUMB_BITS - k) == 0);
  ASSERT(k == 0 || zp[n - 1] >> (GMP_NUMB_BITS - k) == 0);

#if WANT_ASSERT
  {
     mp_size_t t = n;

     MPN_NORMALIZE(yp, t);
     ASSERT(cy == 0 || t == 0);
     
     t = n; 
     MPN_NORMALIZE(zp, t);
     ASSERT(cz == 0 || t == 0);
  }
#endif

  if (LIKELY (cy == 0))
    {
      if (LIKELY (cz == 0))
	{
	  c = mpn_mulmod_2expp1_internal (xp, yp, zp, b, tp);
	}
      else
	{
	  c = mpn_neg_n (xp, yp, n);
	  c = mpn_add_1 (xp, xp, n, c);
	  xp[n - 1] &= GMP_NUMB_MASK >> k;
	}
    }
  else
    {
      if (LIKELY (cz == 0))
Example No. 7
void
mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (n == 1)
    invert_limb (*ip, *dp);
  else {
    TMP_DECL;

    TMP_MARK;
    if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD))
      {
	/* Maximum scratch needed by this branch: 2*n */
	mp_size_t i;
	mp_ptr xp;

	xp = scratch;				/* 2 * n limbs */
	for (i = n - 1; i >= 0; i--)
	  xp[i] = GMP_NUMB_MAX;
	mpn_com (xp + n, dp, n);
	if (n == 2) {
	  mpn_divrem_2 (ip, 0, xp, 4, dp);
	} else {
	  gmp_pi1_t inv;
	  invert_pi1 (inv, dp[n-1], dp[n-2]);
	  /* FIXME: should we use dcpi1_div_q, for big sizes? */
	  mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32);
	}
      }
    else { /* Use approximated inverse; correct the result if needed. */
      mp_limb_t e; /* The possible error in the approximate inverse */

      ASSERT ( mpn_invert_itch (n) >= mpn_invertappr_itch (n) );
      e = mpn_ni_invertappr (ip, dp, n, scratch);

      if (UNLIKELY (e)) { /* Assume the error can only be "0" (no error) or "1". */
	/* Code to detect and correct the "off by one" approximation. */
	mpn_mul_n (scratch, ip, dp, n);
	ASSERT_NOCARRY (mpn_add_n (scratch + n, scratch + n, dp, n));
	if (! mpn_add (scratch, scratch, 2*n, dp, n))
	  MPN_INCR_U (ip, n, 1); /* The value was wrong, correct it.  */
      }
    }
    TMP_FREE;
  }
}
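A checking sketch in mpz for the convention this routine follows (stated here as an assumption, matching the mpn_invertappr documentation): for a normalised divisor, {ip, n} = floor((B^(2n) - 1) / {dp, n}) - B^n, with B = 2^GMP_NUMB_BITS.

#include <gmp.h>

static int
check_invert (const mp_limb_t *ip, const mp_limb_t *dp, long n)
{
  mpz_t d, i_ref, i_got;
  mp_bitcnt_t nb = (mp_bitcnt_t) n * GMP_NUMB_BITS;
  int ok;

  mpz_inits (d, i_ref, i_got, NULL);
  mpz_import (d, n, -1, sizeof (mp_limb_t), 0, 0, dp);
  mpz_import (i_got, n, -1, sizeof (mp_limb_t), 0, 0, ip);

  mpz_setbit (i_ref, 2 * nb);      /* B^(2n)                                    */
  mpz_sub_ui (i_ref, i_ref, 1);    /* B^(2n) - 1                                */
  mpz_fdiv_q (i_ref, i_ref, d);    /* the quotient lies in [B^n, 2*B^n)         */
  mpz_clrbit (i_ref, nb);          /* so clearing bit n*GMP_NUMB_BITS is  - B^n */

  ok = mpz_cmp (i_ref, i_got) == 0;
  mpz_clears (d, i_ref, i_got, NULL);
  return ok;
}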
Example No. 8
void
mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n));

  if (BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD))
    {
      mpn_mul_basecase (p, a, n, b, n);
    }
  else if (BELOW_THRESHOLD (n, MUL_TOOM3_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast! */
      mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD_LIMIT-1)];
      ASSERT (MUL_TOOM3_THRESHOLD <= MUL_TOOM3_THRESHOLD_LIMIT);
      mpn_kara_mul_n (p, a, b, n, ws);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (BELOW_THRESHOLD (n, MUL_FFT_THRESHOLD))
#else
  else if (BELOW_THRESHOLD (n, MPN_TOOM3_MAX_N))
#endif
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (MPN_TOOM3_MUL_N_TSIZE (n));
      mpn_toom3_mul_n (p, a, b, n, ws);
      TMP_SFREE;
    }
  else
#if WANT_FFT || TUNE_PROGRAM_BUILD
    {
      /* The current FFT code allocates its own space.  That should probably
	 change.  */
      mpn_mul_fft_full (p, a, n, b, n);
    }
#else
    {
      /* Toom3 for large operands.  Use workspace from the heap, as stack space
      may be limited.  Since n is at least MUL_TOOM3_THRESHOLD, multiplication
      will take much longer than malloc()/free().  */
      mp_ptr ws;  mp_size_t ws_size;
      ws_size = MPN_TOOM3_MUL_N_TSIZE (n);
      ws = __GMP_ALLOCATE_FUNC_LIMBS (ws_size);
      mpn_toom3_mul_n (p, a, b, n, ws);
      __GMP_FREE_FUNC_LIMBS (ws, ws_size);
    }
#endif
}
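A minimal usage sketch of the public mpn_mul_n entry point shown above: operands are limb arrays, least significant limb first, and the product needs 2n limbs at p. The %Mx conversion is gmp_printf's format for a single mp_limb_t.

#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  mp_limb_t a[2] = { 3, 1 };   /* value 1*B + 3 */
  mp_limb_t b[2] = { 5, 2 };   /* value 2*B + 5 */
  mp_limb_t p[4];              /* result: 2*B^2 + 11*B + 15, limbs {15, 11, 2, 0} */
  mp_size_t i;

  mpn_mul_n (p, a, b, 2);

  for (i = 3; i >= 0; i--)
    gmp_printf ("%Mx ", p[i]); /* most significant limb first */
  gmp_printf ("\n");
  return 0;
}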
Example No. 9
void
mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (n == 1)
    invert_limb (*ip, *dp);
  else if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD))
    {
	/* Maximum scratch needed by this branch: 2*n */
	mp_size_t i;
	mp_ptr xp;

	xp = scratch;				/* 2 * n limbs */
	/* n > 1 here */
	i = n;
	do
	  xp[--i] = GMP_NUMB_MAX;
	while (i);
	mpn_com (xp + n, dp, n);
	if (n == 2) {
	  mpn_divrem_2 (ip, 0, xp, 4, dp);
	} else {
	  gmp_pi1_t inv;
	  invert_pi1 (inv, dp[n-1], dp[n-2]);
	  /* FIXME: should we use dcpi1_div_q, for big sizes? */
	  mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32);
	}
    }
  else { /* Use approximated inverse; correct the result if needed. */
      mp_limb_t e; /* The possible error in the approximate inverse */

      ASSERT ( mpn_invert_itch (n) >= mpn_invertappr_itch (n) );
      e = mpn_ni_invertappr (ip, dp, n, scratch);

      if (UNLIKELY (e)) { /* Assume the error can only be "0" (no error) or "1". */
	/* Code to detect and correct the "off by one" approximation. */
	mpn_mul_n (scratch, ip, dp, n);
	e = mpn_add_n (scratch, scratch, dp, n); /* FIXME: we only need e.*/
	if (LIKELY(e)) /* The high part can not give a carry by itself. */
	  e = mpn_add_nc (scratch + n, scratch + n, dp, n, e); /* FIXME:e */
	/* If the value was wrong (no carry), correct it (increment). */
	e ^= CNST_LIMB (1);
	MPN_INCR_U (ip, n, e);
      }
  }
}
Example No. 10
void
mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n));

  if (BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD))
    {
      mpn_mul_basecase (p, a, n, b, n);
    }
  else if (BELOW_THRESHOLD (n, MUL_TOOM3_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast! */
      mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD_LIMIT-1)];
      ASSERT (MUL_TOOM3_THRESHOLD <= MUL_TOOM3_THRESHOLD_LIMIT);
      mpn_kara_mul_n (p, a, b, n, ws);
    }
  else if (BELOW_THRESHOLD (n, MUL_TOOM4_THRESHOLD))
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (MPN_TOOM3_MUL_N_TSIZE (n));
      mpn_toom3_mul_n (p, a, b, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD))
    {
       mpn_toom4_mul_n (p, a, b, n);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (BELOW_THRESHOLD (n, MUL_FFT_FULL_THRESHOLD))
    {
       mpn_toom8h_mul (p, a, n, b, n);
    }
#endif
  else
#if WANT_FFT || TUNE_PROGRAM_BUILD
    {
       mpn_mul_fft_main(p, a, n, b, n); 
    }
#else
    {
      /* Toom8 for large operands. */
      mpn_toom8h_mul (p, a, n, b, n);
    }
#endif
}
Example No. 11
mp_limb_t	mpn_sumdiff_n(mp_ptr s,mp_ptr d,mp_srcptr x,mp_srcptr y,mp_size_t n)
{mp_limb_t ret;mp_ptr t;

ASSERT(n>0);
ASSERT_MPN(x,n);ASSERT_MPN(y,n);//ASSERT_SPACE(s,n);ASSERT_SPACE(d,n);
ASSERT(MPN_SAME_OR_SEPARATE_P(s,x,n));
ASSERT(MPN_SAME_OR_SEPARATE_P(s,y,n));
ASSERT(MPN_SAME_OR_SEPARATE_P(d,x,n));
ASSERT(MPN_SAME_OR_SEPARATE_P(d,y,n));
ASSERT(!MPN_OVERLAP_P(s,n,d,n));

if( (s==x && d==y)||(s==y && d==x) )
  {t=__GMP_ALLOCATE_FUNC_LIMBS(n);
   ret=mpn_sub_n(t,x,y,n);
   ret+=2*mpn_add_n(s,x,y,n);
   MPN_COPY(d,t,n);
   __GMP_FREE_FUNC_LIMBS(t,n);
   return ret;}
if(s==x || s==y)
  {ret=mpn_sub_n(d,x,y,n);
   ret+=2*mpn_add_n(s,x,y,n);
   return ret;}
ret=2*mpn_add_n(s,x,y,n);
ret+=mpn_sub_n(d,x,y,n);
return ret;}
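A usage sketch for the packed return value: bit 1 is the carry out of the addition and bit 0 is the borrow out of the subtraction. mpn_sumdiff_n is an internal MPIR entry point, so the sketch assumes it is compiled inside the source tree where gmp-impl.h declares it.

#include "mpir.h"
#include "gmp-impl.h"

static void
sumdiff_demo (void)
{
  /* x = 1*B + 7, y = 2*B + 5, least significant limb first */
  mp_limb_t x[2] = { 7, 1 };
  mp_limb_t y[2] = { 5, 2 };
  mp_limb_t s[2], d[2], ret;

  ret = mpn_sumdiff_n (s, d, x, y, 2);

  /* s = {12, 3} (the sum), d = x - y reduced mod B^2; the addition does not
     carry and the subtraction borrows, so ret == 2*0 + 1 == 1.  */
}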
Example No. 12
/*
    mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp.
    It accepts tp == rp.
*/
static void
mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp)
{
  mp_size_t n2, n1;
  ASSERT (n >= 2);
  ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
  ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
  ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));

  /* Divide-and-conquer */

  /* We need fractional approximation of the value 0 < a <= 1/2
     giving the minimum in the function k=(1-a)^e/(1-2*a^e).
  */
  if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11)))
    n1 = n >> 1;
  else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11)))
Example No. 13
void
mpn_sqr (mp_ptr p, mp_srcptr a, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));

#if 0
  /* FIXME: Can this be removed? */
  if (n == 0)
    return;
#endif

  if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
    { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */
      mpn_mul_basecase (p, a, n, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD))
    {
      mpn_sqr_basecase (p, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast! */
      mp_limb_t ws[MPN_KARA_SQR_N_TSIZE (SQR_TOOM3_THRESHOLD_LIMIT-1)];
      ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT);
      mpn_kara_sqr_n (p, a, n, ws);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (MPN_TOOM3_SQR_N_TSIZE (n));
      mpn_toom3_sqr_n (p, a, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD))
    {
       mpn_toom4_sqr_n (p, a, n);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (BELOW_THRESHOLD (n, SQR_FFT_FULL_THRESHOLD))
#else
  else 
#endif
    {
       mpn_toom8_sqr_n (p, a, n);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else
    {
Example No. 14
File: mul.c  Project: mahdiz/mpclib
void
mpn_sqr_n (mp_ptr prodp,
	   mp_srcptr up, mp_size_t un)
{
  ASSERT (un >= 1);
  ASSERT (! MPN_OVERLAP_P (prodp, 2*un, up, un));

  /* FIXME: Can this be removed? */
  if (un == 0)
    return;

  if (BELOW_THRESHOLD (un, SQR_BASECASE_THRESHOLD))
    { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */
      mpn_mul_basecase (prodp, up, un, up, un);
    }
  else if (BELOW_THRESHOLD (un, SQR_KARATSUBA_THRESHOLD))
    { /* plain schoolbook multiplication */
      mpn_sqr_basecase (prodp, up, un);
    }
  else if (BELOW_THRESHOLD (un, SQR_TOOM3_THRESHOLD))
    { /* karatsuba multiplication */
      mp_ptr tspace;
      TMP_DECL (marker);
      TMP_MARK (marker);
      tspace = TMP_ALLOC_LIMBS (MPN_KARA_SQR_N_TSIZE (un));
      mpn_kara_sqr_n (prodp, up, un, tspace);
      TMP_FREE (marker);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (BELOW_THRESHOLD (un, SQR_FFT_THRESHOLD))
#else
  else
#endif
    { /* Toom3 multiplication.
	 Use workspace from the heap, as stack may be limited.  Since n is
	 at least MUL_TOOM3_THRESHOLD, the multiplication will take much
	 longer than malloc()/free().  */
      mp_ptr     tspace;
      mp_size_t  tsize;
      tsize = MPN_TOOM3_SQR_N_TSIZE (un);
      tspace = __GMP_ALLOCATE_FUNC_LIMBS (tsize);
      mpn_toom3_sqr_n (prodp, up, un, tspace);
      __GMP_FREE_FUNC_LIMBS (tspace, tsize);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else
    {
Example No. 15
/* Define our own squaring function, which uses mpn_sqr_basecase for its
   allowed sizes, but its own code for larger sizes.  */
static void
mpn_local_sqr (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tp)
{
  mp_size_t i;

  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (rp, 2*n, up, n));

  if (BELOW_THRESHOLD (n, SQR_BASECASE_LIM))
    {
      mpn_sqr_basecase (rp, up, n);
      return;
    }

  {
    mp_limb_t ul, lpl;
    ul = up[0];
    umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
    rp[0] = lpl >> GMP_NAIL_BITS;
  }
  if (n > 1)
    {
      mp_limb_t cy;

      cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
      tp[n - 1] = cy;
      for (i = 2; i < n; i++)
	{
	  mp_limb_t cy;
	  cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
	  tp[n + i - 2] = cy;
	}
      MPN_SQR_DIAGONAL (rp + 2, up + 1, n - 1);

      {
	mp_limb_t cy;
#if HAVE_NATIVE_mpn_addlsh1_n
	cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);
#else
	cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
	cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);
#endif
	rp[2 * n - 1] += cy;
      }
    }
}
Example No. 16
mp_limb_t
mpn_sb_divrem_mn (mp_ptr qp,
		  mp_ptr np, mp_size_t nn,
		  mp_srcptr dp, mp_size_t dn)
{
  mp_limb_t most_significant_q_limb = 0;
  mp_size_t qn = nn - dn;
  mp_size_t i;
  mp_limb_t dx, d1, n0;
  mp_limb_t dxinv;
  int use_preinv;

  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (np, nn, dp, dn));
  ASSERT (! MPN_OVERLAP_P (qp, nn-dn, dp, dn));
  ASSERT (! MPN_OVERLAP_P (qp, nn-dn, np, nn) || qp+dn >= np);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, dn);

  np += qn;
  dx = dp[dn - 1];
  d1 = dp[dn - 2];
  n0 = np[dn - 1];

  if (n0 >= dx)
    {
      if (n0 > dx || mpn_cmp (np, dp, dn - 1) >= 0)
	{
	  mpn_sub_n (np, np, dp, dn);
	  most_significant_q_limb = 1;
	}
    }

  /* use_preinv is possibly a constant, but it's left to the compiler to
     optimize away the unused code in that case.  */
  use_preinv = ABOVE_THRESHOLD (qn, DIV_SB_PREINV_THRESHOLD);
  if (use_preinv)
    invert_limb (dxinv, dx);

  for (i = qn - 1; i >= 0; i--)
    {
      mp_limb_t q;
      mp_limb_t nx;
      mp_limb_t cy_limb;

      nx = np[dn - 1];		/* FIXME: could get value from r1 */
      np--;

      if (nx == dx)
	{
	  /* This might over-estimate q, but it's probably not worth
	     the extra code here to find out.  */
	  q = GMP_NUMB_MASK;

#if 1
	  cy_limb = mpn_submul_1 (np, dp, dn, q);
#else
	  /* This should be faster on many machines */
	  cy_limb = mpn_sub_n (np + 1, np + 1, dp, dn);
	  cy = mpn_add_n (np, np, dp, dn);
	  np[dn] += cy;
#endif

	  if (nx != cy_limb)
	    {
	      mpn_add_n (np, np, dp, dn);
	      q--;
	    }

	  qp[i] = q;
	}
      else
	{
	  mp_limb_t rx, r1, r0, p1, p0;

	  /* "workaround" avoids a problem with gcc 2.7.2.3 i386 register usage
	     when np[dn-1] is used in an asm statement like umul_ppmm in
	     udiv_qrnnd_preinv.  The symptom is seg faults due to registers
	     being clobbered.  gcc 2.95 i386 doesn't have the problem. */
	  {
	    mp_limb_t  workaround = np[dn - 1];
	    if (use_preinv)
	      udiv_qrnnd_preinv (q, r1, nx, workaround, dx, dxinv);
	    else
	      {
		udiv_qrnnd (q, r1, nx, workaround << GMP_NAIL_BITS,
			    dx << GMP_NAIL_BITS);
		r1 >>= GMP_NAIL_BITS;
	      }
	  }
	  umul_ppmm (p1, p0, d1, q << GMP_NAIL_BITS);
	  p0 >>= GMP_NAIL_BITS;

	  r0 = np[dn - 2];
	  rx = 0;
	  if (r1 < p1 || (r1 == p1 && r0 < p0))
	    {
	      p1 -= p0 < d1;
	      p0 = (p0 - d1) & GMP_NUMB_MASK;
	      q--;
	      r1 = (r1 + dx) & GMP_NUMB_MASK;
	      rx = r1 < dx;
	    }

	  p1 += r0 < p0;	/* cannot carry! */
	  rx -= r1 < p1;	/* may become 11..1 if q is still too large */
	  r1 = (r1 - p1) & GMP_NUMB_MASK;
	  r0 = (r0 - p0) & GMP_NUMB_MASK;

	  cy_limb = mpn_submul_1 (np, dp, dn - 2, q);

	  /* Check if we've over-estimated q, and adjust as needed.  */
	  {
	    mp_limb_t cy1, cy2;
	    cy1 = r0 < cy_limb;
	    r0 = (r0 - cy_limb) & GMP_NUMB_MASK;
	    cy2 = r1 < cy1;
	    r1 -= cy1;
	    np[dn - 1] = r1;
	    np[dn - 2] = r0;
	    if (cy2 != rx)
	      {
		mpn_add_n (np, np, dp, dn);
		q--;
	      }
	  }
	  qp[i] = q;
	}
    }

  /* ______ ______ ______
    |__rx__|__r1__|__r0__|		partial remainder
	    ______ ______
	 - |__p1__|__p0__|		partial product to subtract
	    ______ ______
	 - |______|cylimb|

     rx is -1, 0 or 1.  If rx=1, then q is correct (it should match
     carry out).  If rx=-1 then q is too large.  If rx=0, then q might
     be too large, but it is most likely correct.
  */

  return most_significant_q_limb;
}
Example No. 17
void
mpn_tdiv_q (mp_ptr qp,
	   mp_srcptr np, mp_size_t nn,
	   mp_srcptr dp, mp_size_t dn)
{
  mp_ptr new_dp, new_np, tp, rp, scratch;
  mp_limb_t cy, dh, qh;
  mp_size_t new_nn, qn;
  mp_limb_t dinv;
  int cnt;
  TMP_DECL;
  TMP_MARK;

  ASSERT (nn >= dn);
  ASSERT (dn > 0);
  ASSERT (dp[dn - 1] != 0);
  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, np, nn));
  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, dp, dn));

  ASSERT_ALWAYS (FUDGE >= 2);
  
  if (dn == 1)
    {
      mpn_divrem_1 (qp, 0L, np, nn, dp[dn - 1]);
      return;
    }

  scratch = TMP_ALLOC_LIMBS(nn + 1);
  
  qn = nn - dn + 1;		/* Quotient size, high limb might be zero */

  if (qn + FUDGE >= dn)
    {
      /* |________________________|
                          |_______|  */
      new_np = scratch;

      dh = dp[dn - 1];
      if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0))
	{
	  count_leading_zeros (cnt, dh);

	  cy = mpn_lshift (new_np, np, nn, cnt);
	  new_np[nn] = cy;
	  new_nn = nn + (cy != 0);

	  new_dp = TMP_ALLOC_LIMBS (dn);
	  mpn_lshift (new_dp, dp, dn, cnt);

	  if (dn == 2)
	    {
	      qh = mpn_divrem_2 (qp, 0L, new_np, new_nn, new_dp);
	    }
	  else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) ||
		   BELOW_THRESHOLD (new_nn - dn, DC_DIV_Q_THRESHOLD))
	    {
          invert_1(dinv, new_dp[dn - 1], new_dp[dn - 2]);
	      qh = mpn_sb_div_q (qp, new_np, new_nn, new_dp, dn, dinv);
	    }
	  else if (BELOW_THRESHOLD (dn, INV_DIV_Q_THRESHOLD) || 
		   BELOW_THRESHOLD (nn, 2 * INV_DIV_Q_THRESHOLD)) 
	    {
          invert_1(dinv, new_dp[dn - 1], new_dp[dn - 2]);
          qh = mpn_dc_div_q (qp, new_np, new_nn, new_dp, dn, dinv);
	    }
	  else
	    {
           mp_ptr inv = TMP_ALLOC_LIMBS(dn);
           mpn_invert(inv, new_dp, dn);
           qh = mpn_inv_div_q (qp, new_np, new_nn, new_dp, dn, inv);
	    }
	  if (cy == 0)
	    qp[qn - 1] = qh;
	  else if (UNLIKELY (qh != 0))
	    {
	      /* This happens only when the quotient is close to B^n and
		 mpn_*_divappr_q returned B^n.  */
	      mp_size_t i, n;
	      n = new_nn - dn;
	      for (i = 0; i < n; i++)
		qp[i] = GMP_NUMB_MAX;
	      qh = 0;		/* currently ignored */
	    }
	}
      else  /* divisor is already normalised */
	{
	  if (new_np != np)
	    MPN_COPY (new_np, np, nn);

	  if (dn == 2)
	    {
	      qh = mpn_divrem_2 (qp, 0L, new_np, nn, dp);
	    }
	  else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) ||
		   BELOW_THRESHOLD (nn - dn, DC_DIV_Q_THRESHOLD))
	    {
           invert_1(dinv, dh, dp[dn - 2]);
           qh = mpn_sb_div_q (qp, new_np, nn, dp, dn, dinv);
	    }
	  else if (BELOW_THRESHOLD (dn, INV_DIV_Q_THRESHOLD) || 
		   BELOW_THRESHOLD (nn, 2 * INV_DIV_Q_THRESHOLD))
	    {
           invert_1(dinv, dh, dp[dn - 2]);
           qh = mpn_dc_div_q (qp, new_np, nn, dp, dn, dinv);
	    }
	  else
	    {
           mp_ptr inv = TMP_ALLOC_LIMBS(dn);
           mpn_invert(inv, dp, dn);
           qh = mpn_inv_div_q (qp, new_np, nn, dp, dn, inv);
	    }
	  qp[nn - dn] = qh;
	}
    }
  else
    {
      /* |________________________|
                |_________________|  */
      tp = TMP_ALLOC_LIMBS (qn + 1);

      new_np = scratch;
      new_nn = 2 * qn + 1;
      if (new_np == np)
	/* We need {np,nn} to remain untouched until the final adjustment, so
	   we need to allocate separate space for new_np.  */
	new_np = TMP_ALLOC_LIMBS (new_nn + 1);


      dh = dp[dn - 1];
      if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0))
	{
	  count_leading_zeros (cnt, dh);

	  cy = mpn_lshift (new_np, np + nn - new_nn, new_nn, cnt);
	  new_np[new_nn] = cy;

	  new_nn += (cy != 0);

	  new_dp = TMP_ALLOC_LIMBS (qn + 1);
	  mpn_lshift (new_dp, dp + dn - (qn + 1), qn + 1, cnt);
	  new_dp[0] |= dp[dn - (qn + 1) - 1] >> (GMP_NUMB_BITS - cnt);

	  if (qn + 1 == 2)
	    {
	      qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp);
	    }
	  else if (BELOW_THRESHOLD (qn - 1, DC_DIVAPPR_Q_THRESHOLD))
	    {
          invert_1(dinv, new_dp[qn], new_dp[qn - 1]);
	      qh = mpn_sb_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv);
	    }
	  else if (BELOW_THRESHOLD (qn - 1, INV_DIVAPPR_Q_THRESHOLD))
	    {
          invert_1(dinv, new_dp[qn], new_dp[qn - 1]);
	      qh = mpn_dc_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv);
	    }
	  else
	    {
           mp_ptr inv = TMP_ALLOC_LIMBS(qn + 1);
           mpn_invert(inv, new_dp, qn + 1);
           qh = mpn_inv_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, inv); 
	    }
	  if (cy == 0)
	    tp[qn] = qh;
	  else if (UNLIKELY (qh != 0))
	    {
	      /* This happens only when the quotient is close to B^n and
		 mpn_*_divappr_q returned B^n.  */
	      mp_size_t i, n;
	      n = new_nn - (qn + 1);
	      for (i = 0; i < n; i++)
		tp[i] = GMP_NUMB_MAX;
	      qh = 0;		/* currently ignored */
	    }
	}
      else  /* divisor is already normalised */
	{
Example No. 18
mp_limb_t
mpn_div_qr_1n_pi2 (mp_ptr qp,
		   mp_srcptr up, mp_size_t un,
		   struct precomp_div_1_pi2 *pd)
{
  mp_limb_t most_significant_q_limb;
  mp_size_t i;
  mp_limb_t r, u2, u1, u0;
  mp_limb_t d0, di1, di0;
  mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
  mp_limb_t cnd;

  ASSERT (un >= 2);
  ASSERT ((pd->d & GMP_NUMB_HIGHBIT) != 0);
  ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
  ASSERT_MPN (up, un);

#define q3 q3a
#define q2 q2b
#define q1 q1b

  up += un - 3;
  r = up[2];
  d0 = pd->d;

  most_significant_q_limb = (r >= d0);
  r -= d0 & -most_significant_q_limb;

  qp += un - 3;
  qp[2] = most_significant_q_limb;

  di1 = pd->dip[1];
  di0 = pd->dip[0];

  for (i = un - 3; i >= 0; i -= 2)
    {
      u2 = r;
      u1 = up[1];
      u0 = up[0];

      /* Dividend in {r,u1,u0} */

      umul_ppmm (q1d,q0d, u1, di0);
      umul_ppmm (q2b,q1b, u1, di1);
      q2b++;				/* cannot spill */
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);

      umul_ppmm (q2c,q1c, u2,  di0);
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
      umul_ppmm (q3a,q2a, u2, di1);

      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);

      q3 += r;

      r = u0 - q2 * d0;

      cnd = (r >= q1);
      r += d0 & -cnd;
      sub_ddmmss (q3,q2,  q3,q2,  0,cnd);

      if (UNLIKELY (r >= d0))
	{
	  r -= d0;
	  add_ssaaaa (q3,q2,  q3,q2,  0,1);
	}

      qp[0] = q2;
      qp[1] = q3;

      up -= 2;
      qp -= 2;
    }

  if ((un & 1) == 0)
    {
      u2 = r;
      u1 = up[1];

      udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
      qp[1] = q3;
    }

  return r;

#undef q3
#undef q2
#undef q1
}
Example No. 19
/* (rp, 2n) = (xp, n)*(yp, n) */
static void
mpn_mulshort_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
{
  mp_size_t m;
  mp_limb_t t;
  mp_ptr rpn2;

  ASSERT(n >= 1);
  ASSERT_MPN(xp, n);
  ASSERT_MPN(yp, n);
  ASSERT(!MPN_OVERLAP_P (rp, 2 * n, xp, n));
  ASSERT(!MPN_OVERLAP_P (rp, 2 * n, yp, n));
  
  if (BELOW_THRESHOLD(n, MULHIGH_BASECASE_THRESHOLD))
    {
      mpn_mul_basecase(rp, xp, n, yp, n);
      
      return;
    }

  if (BELOW_THRESHOLD (n, MULHIGH_DC_THRESHOLD))
    {
      mpn_mulshort_n_basecase(rp, xp, yp, n);
      
      return;
    }

  /* choose optimal m s.t. n + 2 <= 2m,  m < n */
  ASSERT (n >= 4);

  m = 87 * n / 128;
  
  if (2 * m < n + 2)
    m = (n + 1) / 2 + 1;
  
  if (m >= n)
    m = n - 1;
  
  ASSERT (n + 2 <= 2 * m);
  ASSERT (m < n);
  
  rpn2 = rp + n - 2;
  
  mpn_mul_n (rp + n - m + n - m, xp + n - m, yp + n - m, m);
  mpn_mulshort_n (rp, xp, yp + m, n - m);

  ASSERT_NOCARRY (mpn_add (rpn2, rpn2, n + 2, rpn2 - m, n - m + 2));
  
  mpn_mulshort_n (rp, xp + m, yp, n - m);
  
  ASSERT_NOCARRY (mpn_add (rpn2, rpn2, n + 2, rpn2 - m, n - m + 2));
  
  umul_ppmm (rp[1], t, xp[m - 1], yp[n - m - 1] << GMP_NAIL_BITS);
  rp[0] = t >> GMP_NAIL_BITS;
  
  ASSERT_NOCARRY (mpn_add (rpn2, rpn2, n + 2, rp, 2));
  
  umul_ppmm (rp[1], t, xp[n - m - 1], yp[m - 1] << GMP_NAIL_BITS);
  rp[0] = t >> GMP_NAIL_BITS;
  
  ASSERT_NOCARRY (mpn_add (rpn2, rpn2, n + 2, rp, 2));
  
  return;
}
Example No. 20
void
mpn_mul_basecase (mp_ptr rp,
		  mp_srcptr up, mp_size_t un,
		  mp_srcptr vp, mp_size_t vn)
{
  ASSERT (un >= vn);
  ASSERT (vn >= 1);
  ASSERT (! MPN_OVERLAP_P (rp, un+vn, up, un));
  ASSERT (! MPN_OVERLAP_P (rp, un+vn, vp, vn));

  /* We first multiply by the low order limb (or depending on optional function
     availability, limbs).  This result can be stored, not added, to rp.  We
     also avoid a loop for zeroing this way.  */

#if HAVE_NATIVE_mpn_mul_2
  if (vn >= 2)
    {
      rp[un + 1] = mpn_mul_2 (rp, up, un, vp);
      rp += 2, vp += 2, vn -= 2;
    }
  else
    {
      rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
      return;
    }
#else
  rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
  rp += 1, vp += 1, vn -= 1;
#endif

  /* Now accumulate the product of up[] and the next higher limb (or
     depending on optional function availability, limbs) from vp[].  */

#define MAX_LEFT MP_SIZE_T_MAX

#if HAVE_NATIVE_mpn_addmul_4
  while (vn >= 4)
    {
      rp[un + 4 - 1] = mpn_addmul_4 (rp, up, un, vp);
      rp += 4, vp += 4, vn -= 4;
    }
#undef MAX_LEFT
#define MAX_LEFT 3
#endif

#if HAVE_NATIVE_mpn_addmul_3
  while (vn >= 3)
    {
      rp[un + 3 - 1] = mpn_addmul_3 (rp, up, un, vp);
      rp += 3, vp += 3, vn -= 3;
      if (MAX_LEFT - 3 <= 3)
	break;
    }
#undef MAX_LEFT
#define MAX_LEFT 2
#endif

#if HAVE_NATIVE_mpn_addmul_2
  while (vn >= 2)
    {
      rp[un + 2 - 1] = mpn_addmul_2 (rp, up, un, vp);
      rp += 2, vp += 2, vn -= 2;
      if (MAX_LEFT - 2 <= 2)
	break;
    }
#undef MAX_LEFT
#define MAX_LEFT 1
#endif

  while (vn >= 1)
    {
      rp[un] = mpn_addmul_1 (rp, up, un, vp[0]);
      rp += 1, vp += 1, vn -= 1;
      if (MAX_LEFT - 1 <= 1)
	break;
    }
}
Example No. 21
mp_limb_t
mpn_mul (mp_ptr prodp,
	 mp_srcptr up, mp_size_t un,
	 mp_srcptr vp, mp_size_t vn)
{
  mp_size_t l, k;
  mp_limb_t c;

  ASSERT (un >= vn);
  ASSERT (vn >= 1);
  ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
  ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));

  if (un == vn)
   {
    if (up == vp)
    {
      mpn_sqr (prodp, up, un);
      return prodp[2 * un - 1];
    }
    else
    {
      mpn_mul_n (prodp, up, vp, un);
      return prodp[2 * un - 1];
    }
   }

  if (vn < MUL_KARATSUBA_THRESHOLD)
    { /* plain schoolbook multiplication */
      if (un <= MUL_BASECASE_MAX_UN)
	mpn_mul_basecase (prodp, up, un, vp, vn);
      else
	{
	  /* We have un >> MUL_BASECASE_MAX_UN > vn.  For better memory
	     locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
	     these pieces with the vp[] operand.  After each such partial
	     multiplication (but the last) we copy the most significant vn
	     limbs into a temporary buffer since that part would otherwise be
	     overwritten by the next multiplication.  After the next
	     multiplication, we add it back.  This illustrates the situation:

                                                    -->vn<--
                                                      |  |<------- un ------->|
                                                         _____________________|
                                                        X                    /|
                                                      /XX__________________/  |
                                    _____________________                     |
                                   X                    /                     |
                                 /XX__________________/                       |
               _____________________                                          |
              /                    /                                          |
            /____________________/                                            |
	    ==================================================================

	    The parts marked with X are the parts whose sums are copied into
	    the temporary buffer.  */

	  mp_limb_t tp[MUL_KARATSUBA_THRESHOLD_LIMIT];
	  mp_limb_t cy;
          ASSERT (MUL_KARATSUBA_THRESHOLD <= MUL_KARATSUBA_THRESHOLD_LIMIT);

	  mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
	  prodp += MUL_BASECASE_MAX_UN;
	  MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
	  up += MUL_BASECASE_MAX_UN;
	  un -= MUL_BASECASE_MAX_UN;
	  while (un > MUL_BASECASE_MAX_UN)
	    {
	      mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
	      cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
	      mpn_incr_u (prodp + vn, cy);		/* safe? */
	      prodp += MUL_BASECASE_MAX_UN;
	      MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
	      up += MUL_BASECASE_MAX_UN;
	      un -= MUL_BASECASE_MAX_UN;
	    }
	  if (un > vn)
	    {
	      mpn_mul_basecase (prodp, up, un, vp, vn);
	    }
	  else
	    {
	      ASSERT_ALWAYS (un > 0);
	      mpn_mul_basecase (prodp, vp, vn, up, un);
	    }
	  cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
	  mpn_incr_u (prodp + vn, cy);		/* safe? */
	}
      return prodp[un + vn - 1];
  }

  if (ABOVE_THRESHOLD (un + vn, 2*MUL_FFT_FULL_THRESHOLD)
      && ABOVE_THRESHOLD (3*vn, MUL_FFT_FULL_THRESHOLD))
    {
      mpn_mul_fft_main (prodp, up, un, vp, vn);
      return prodp[un + vn - 1];
    }

  k = (un + 3)/4; // ceil(un/4)

#if GMP_NUMB_BITS == 32
  if ((ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM8H_THRESHOLD)) && (vn>=86) && (5*un <= 11*vn))
#else
  if ((ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM8H_THRESHOLD)) && (vn>=86) && (4*un <= 13*vn))
#endif
  {
      mpn_toom8h_mul(prodp, up, un, vp, vn);
      return prodp[un + vn - 1];
  }
  
  if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM4_THRESHOLD))
  {
          if (vn > 3*k)
          {
             mpn_toom4_mul(prodp, up, un, vp, vn);
             return prodp[un + vn - 1];
          } else
          {
             l = (un + 4)/5; // ceil(un/5)
             if ((((vn > 9*k/4) && (un+vn <= 6*MUL_TOOM4_THRESHOLD)) 
                 || ((vn > 2*l) && (un+vn > 6*MUL_TOOM4_THRESHOLD)))
                 && (vn <= 3*l))
             {
                mpn_toom53_mul(prodp, up, un, vp, vn);
                return prodp[un + vn - 1];
             }
          }
  } 
  
  if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM3_THRESHOLD) && (vn > k))
  {
          mp_ptr ws;
          TMP_DECL;
          TMP_MARK;

          if (vn < 2*k) // un/2 >= vn > un/4
          {
                  ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE(un));
                  mpn_toom42_mul(prodp, up, un, vp, vn, ws);
                  TMP_FREE;
                  return prodp[un + vn - 1];
          }

          l = (un+2)/3; //ceil(un/3)
          if (vn > 2*l) // un >= vn > 2un/3
          {
                  ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE(un));
                  mpn_toom3_mul(prodp, up, un, vp, vn, ws);
                  TMP_FREE;
                  return prodp[un + vn - 1];
          } else // 2un/3 >= vn > un/3
          {
                  ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE(un));
                  mpn_toom32_mul(prodp, up, un, vp, vn, ws);
                  TMP_FREE;
                  return prodp[un + vn - 1];
          }
  }

  mpn_mul_n (prodp, up, vp, vn);

  if (un != vn)
    { mp_limb_t t;
      mp_ptr ws;
      TMP_DECL;
      TMP_MARK;

      prodp += vn;
      l = vn;
      up += vn;
      un -= vn;

      if (un < vn)
	{
	  /* Swap u's and v's. */
	  MPN_SRCPTR_SWAP (up,un, vp,vn);
	}

      ws = TMP_ALLOC_LIMBS ((vn >= MUL_KARATSUBA_THRESHOLD ? vn : un) + vn);

      t = 0;
      while (vn >= MUL_KARATSUBA_THRESHOLD)
	{
	  mpn_mul_n (ws, up, vp, vn);
	  if (l <= 2*vn)
	    {
	      t += mpn_add_n (prodp, prodp, ws, l);
	      if (l != 2*vn)
		{
		  t = mpn_add_1 (prodp + l, ws + l, 2*vn - l, t);
		  l = 2*vn;
		}
	    }
	  else
	    {
	      c = mpn_add_n (prodp, prodp, ws, 2*vn);
	      t += mpn_add_1 (prodp + 2*vn, prodp + 2*vn, l - 2*vn, c);
	    }
	  prodp += vn;
	  l -= vn;
	  up += vn;
	  un -= vn;
	  if (un < vn)
	    {
	      /* Swap u's and v's. */
	      MPN_SRCPTR_SWAP (up,un, vp,vn);
	    }
		}

      if (vn != 0)
	{
	  mpn_mul_basecase (ws, up, un, vp, vn);
	  if (l <= un + vn)
	    {
	      t += mpn_add_n (prodp, prodp, ws, l);
	      if (l != un + vn)
		t = mpn_add_1 (prodp + l, ws + l, un + vn - l, t);
	    }
	  else
	    {
	      c = mpn_add_n (prodp, prodp, ws, un + vn);
	      t += mpn_add_1 (prodp + un + vn, prodp + un + vn, l - un - vn, c);
	    }
	}

      TMP_FREE;
  }

  return prodp[un + vn - 1];
}
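A minimal usage sketch of this public entry point: mpn_mul requires un >= vn >= 1, writes un + vn limbs at prodp, and returns the most significant limb, which may be zero.

#include <gmp.h>

static void
mul_demo (void)
{
  mp_limb_t u[3] = { 1, 2, 3 };    /* 3*B^2 + 2*B + 1 */
  mp_limb_t v[2] = { 4, 5 };       /* 5*B + 4         */
  mp_limb_t p[5];
  mp_limb_t high;

  high = mpn_mul (p, u, 3, v, 2);  /* high == p[4], zero in this example */
}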
Example No. 22
mp_limb_t
mpn_bdivmod (mp_ptr qp, mp_ptr up, mp_size_t usize,
	     mp_srcptr vp, mp_size_t vsize, unsigned long int d)
{
  mp_limb_t v_inv;

  ASSERT (usize >= 1);
  ASSERT (vsize >= 1);
  ASSERT (usize * GMP_NUMB_BITS >= d);
  ASSERT (! MPN_OVERLAP_P (up, usize, vp, vsize));
  ASSERT (! MPN_OVERLAP_P (qp, d/GMP_NUMB_BITS, vp, vsize));
  ASSERT (MPN_SAME_OR_INCR2_P (qp, d/GMP_NUMB_BITS, up, usize));
  ASSERT_MPN (up, usize);
  ASSERT_MPN (vp, vsize);

  /* 1/V mod 2^GMP_NUMB_BITS. */
  binvert_limb (v_inv, vp[0]);

  /* Fast code for two cases previously used by the accel part of mpn_gcd.
     (Could probably remove this now it's inlined there.) */
  if (usize == 2 && vsize == 2 &&
      (d == GMP_NUMB_BITS || d == 2*GMP_NUMB_BITS))
    {
      mp_limb_t hi, lo;
      mp_limb_t q = (up[0] * v_inv) & GMP_NUMB_MASK;
      umul_ppmm (hi, lo, q, vp[0] << GMP_NAIL_BITS);
      up[0] = 0;
      up[1] -= hi + q*vp[1];
      qp[0] = q;
      if (d == 2*GMP_NUMB_BITS)
        {
          q = (up[1] * v_inv) & GMP_NUMB_MASK;
          up[1] = 0;
          qp[1] = q;
        }
      return 0;
    }

  /* Main loop.  */
  while (d >= GMP_NUMB_BITS)
    {
      mp_limb_t q = (up[0] * v_inv) & GMP_NUMB_MASK;
      mp_limb_t b = mpn_submul_1 (up, vp, MIN (usize, vsize), q);
      if (usize > vsize)
	mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
      d -= GMP_NUMB_BITS;
      up += 1, usize -= 1;
      *qp++ = q;
    }

  if (d)
    {
      mp_limb_t b;
      mp_limb_t q = (up[0] * v_inv) & (((mp_limb_t)1<<d) - 1);
      if (q <= 1)
	{
	  if (q == 0)
	    return 0;
	  else
	    b = mpn_sub_n (up, up, vp, MIN (usize, vsize));
	}
      else
	b = mpn_submul_1 (up, vp, MIN (usize, vsize), q);

      if (usize > vsize)
	mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
      return q;
    }

  return 0;
}
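The binvert_limb step above computes 1/V mod B for odd V. A self-contained sketch of the same Newton lifting on 64-bit limbs follows (an illustration of the idea only; the real macro in gmp-impl.h seeds the iteration from a small table rather than from v itself). Each multiplication step doubles the number of correct low bits.

#include <stdint.h>

static uint64_t
limb_binvert (uint64_t v)         /* v must be odd */
{
  uint64_t inv = v;               /* v*v == 1 (mod 8), so 3 bits correct */
  inv *= 2 - v * inv;             /*  6 bits */
  inv *= 2 - v * inv;             /* 12 bits */
  inv *= 2 - v * inv;             /* 24 bits */
  inv *= 2 - v * inv;             /* 48 bits */
  inv *= 2 - v * inv;             /* 96 >= 64 bits */
  return inv;                     /* v * inv == 1 (mod 2^64) */
}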
Example No. 23
/* (rp, 2n) = (xp, n)*(yp, n) / B^n */ 
inline static void
mpn_mulshort_n_basecase(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
{
  mp_size_t i, k;

  ASSERT(n >= 3);  /* this restriction doesn't make a lot of sense in general */
  ASSERT_MPN(xp, n);
  ASSERT_MPN(yp, n);
  ASSERT(!MPN_OVERLAP_P (rp, 2 * n, xp, n));
  ASSERT(!MPN_OVERLAP_P (rp, 2 * n, yp, n));

  k = n - 2; /* so want short product sum_(i + j >= k) x[i]y[j]B^(i + j) */
  i = 0;

  /* Multiply w limbs from y + i to (2 + i + w - 1) limbs from x + (n - 2 - i - w + 1)
     and put it into r + (n - 2 - w + 1), "overflow" (i.e. last) limb into
     r + (n + w - 1) for i between 0 and n - 2.
     i == n - w needs special treatment. */

  /* We first multiply by the low order limb (or depending on optional function
     availability, limbs).  This result can be stored, not added, to rp.  We
     also avoid a loop for zeroing this way.  */

#if HAVE_NATIVE_mpn_mul_2
  rp[n + 1] = mpn_mul_2 (rp + k - 1, xp + k - 1, 2 + 1, yp);
  i += 2;
#else
  rp[n] = mpn_mul_1 (rp + k, xp + k, 2, yp[0]);
  i += 1;
#endif

#if HAVE_NATIVE_mpn_addmul_6
  while (i < n - 6)
    {
      rp[n + i + 6 - 1] = mpn_addmul_6 (rp + k - 6 + 1, xp + k - i - 6 + 1, 2 + i + 6 - 1, yp + i);
      i += 6;
    }
  if (i == n - 6)
    {
      rp[n + n - 1] = mpn_addmul_6 (rp + i, xp, n, yp + i);
      return;
    }
#endif

#if HAVE_NATIVE_mpn_addmul_5
  while (i < n - 5)
    {
      rp[n + i + 5 - 1] = mpn_addmul_5 (rp + k - 5 + 1, xp + k - i - 5 + 1, 2 + i + 5 - 1, yp + i);
      i += 5;
    }
  if (i == n - 5)
    {
      rp[n + n - 1] = mpn_addmul_5 (rp + i, xp, n, yp + i);
      return;
    }
#endif

#if HAVE_NATIVE_mpn_addmul_4
  while (i < n - 4)
    {
      rp[n + i + 4 - 1] = mpn_addmul_4 (rp + k - 4 + 1, xp + k - i - 4 + 1, 2 + i + 4 - 1, yp + i);
      i += 4;
    }
  if (i == n - 4)
    {
      rp[n + n - 1] = mpn_addmul_4 (rp + i, xp, n, yp + i);
      return;
    }
#endif

#if HAVE_NATIVE_mpn_addmul_3
  while (i < n - 3)
    {
      rp[n + i + 3 - 1] = mpn_addmul_3 (rp + k - 3 + 1, xp + k - i - 3 + 1, 2 + i + 3 - 1, yp + i);
      i += 3;
    }
  if (i == n - 3)
    {
      rp[n + n - 1] = mpn_addmul_3 (rp + i, xp, n, yp + i);
      return;
    }
#endif

#if HAVE_NATIVE_mpn_addmul_2
  while (i < n - 2)
    {
      rp[n + i + 2 - 1] = mpn_addmul_2 (rp + k - 2 + 1, xp + k - i - 2 + 1, 2 + i + 2 - 1, yp + i);
      i += 2;
    }
  if (i == n - 2)
    {
      rp[n + n - 1] = mpn_addmul_2 (rp + i, xp, n, yp + i);
      return;
    }
#endif

  while (i < n - 1)
    {
      rp[n + i] = mpn_addmul_1 (rp + k, xp + k - i, 2 + i, yp[i]);
      i += 1;
    }
  rp[n + n - 1] = mpn_addmul_1 (rp + i, xp, n, yp[i]);
  return;
}
Example No. 24
/*
  Computes {np, n} / {dp, n} mod B^n, using divide-and-conquer
  algorithm, switching to classical for n <= BDIV_Q_DC_THRESHOLD.

  Also computes a 2 limb "overflow". See sb_bdiv_q.c for a definition.

  scratch is workspace.
*/
void
mpn_dc_bdiv_q_n (mp_ptr qp, mp_ptr wp, mp_ptr np, mp_srcptr dp,
		   mp_size_t n, mp_limb_t dinv, mp_ptr scratch)
{
  mp_size_t s, t;
  mp_limb_t cy;

  ASSERT (n >= 6);
  ASSERT (! MPN_OVERLAP_P (qp, n, np, n));
  ASSERT (! MPN_OVERLAP_P (qp, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (wp, 2, np, n));
  ASSERT (! MPN_OVERLAP_P (wp, 2, dp, n));
  ASSERT (! MPN_OVERLAP_P (np, n, dp, n));

  /*
    Example with s = 4, t = 3, n = 7:

         C
         C C
         C C C
  qp  .  A B B B
      .  A A B B B
      1  A A A B B B
      0  A A A A B B B
         0 1 ...
           dp
  */

  t = n / 2;    /*  t = floor(n/2)  */
  s = n - t;    /*  s = ceil(n/2)   */

  /*  recurse into low half of quotient (region A)  */
  if (s <= DC_BDIV_Q_THRESHOLD)
    mpn_sb_bdiv_q (qp, wp, np, s, dp, s, dinv);
  else
    mpn_dc_bdiv_q_n (qp, wp, np, dp, s, dinv, scratch);

  /*  remove region B and overflow from A from N
      (if n odd, do first row of B separately --- we could have used
      mpn_mulmid, but this saves some logic) */
  mpn_mulmid_n (scratch, dp + 1, qp + (n & 1), t);
  if (n & 1)
    {
      cy = mpn_addmul_1 (scratch, dp + s, t, qp[0]);
      MPN_INCR_U (scratch + t, 2, cy);
    }
  ADDC_LIMB (cy, scratch[0], scratch[0], wp[0]);      /* overflow from A */
  MPN_INCR_U (scratch + 1, t + 1, wp[1] + cy);
  cy = mpn_sub_n (np + s, np + s, scratch, t);
  MPN_INCR_U (scratch + t, 2, cy);

  /*  recurse into top half of quotient (region C)
      (this does not overwrite {scratch + t, 2}, because n >= 6 implies
      t >= 3 implies floor(t/2) + 2 <= t) */
  if (t <= DC_BDIV_Q_THRESHOLD)
    mpn_sb_bdiv_q (qp + s, wp, np + s, t, dp, t, dinv);
  else
    mpn_dc_bdiv_q_n (qp + s, wp, np + s, dp, t, dinv, scratch);

  /*  combine overflows from B and C  */
  ADDC_LIMB (cy, wp[0], wp[0], scratch[t]);
  wp[1] += scratch[t + 1] + cy;
}
Example No. 25
mp_limb_t
mpn_divrem (mp_ptr qp, mp_size_t qxn,
	    mp_ptr np, mp_size_t nn,
	    mp_srcptr dp, mp_size_t dn)
{
  ASSERT (qxn >= 0);
  ASSERT (nn >= dn);
  ASSERT (dn >= 1);
  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (np, nn, dp, dn));
  ASSERT (! MPN_OVERLAP_P (qp, nn-dn+qxn, np, nn) || qp==np+dn+qxn);
  ASSERT (! MPN_OVERLAP_P (qp, nn-dn+qxn, dp, dn));
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, dn);

  if (dn == 1)
    {
      mp_limb_t ret;
      mp_ptr q2p;
      mp_size_t qn;
      TMP_DECL;

      TMP_MARK;
      q2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB);

      np[0] = mpn_divrem_1 (q2p, qxn, np, nn, dp[0]);
      qn = nn + qxn - 1;
      MPN_COPY (qp, q2p, qn);
      ret = q2p[qn];

      TMP_FREE;
      return ret;
    }
  else if (dn == 2)
    {
      return mpn_divrem_2 (qp, qxn, np, nn, dp);
    }
  else
    {
      mp_ptr rp, q2p;
      mp_limb_t qhl;
      mp_size_t qn;
      TMP_DECL;

      TMP_MARK;
      if (UNLIKELY (qxn != 0))
	{
	  mp_ptr n2p;
	  n2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB);
	  MPN_ZERO (n2p, qxn);
	  MPN_COPY (n2p + qxn, np, nn);
	  q2p = (mp_ptr) TMP_ALLOC ((nn - dn + qxn + 1) * BYTES_PER_MP_LIMB);
	  rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
	  mpn_tdiv_qr (q2p, rp, 0L, n2p, nn + qxn, dp, dn);
	  MPN_COPY (np, rp, dn);
	  qn = nn - dn + qxn;
	  MPN_COPY (qp, q2p, qn);
	  qhl = q2p[qn];
	}
      else
	{
	  q2p = (mp_ptr) TMP_ALLOC ((nn - dn + 1) * BYTES_PER_MP_LIMB);
	  rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
	  mpn_tdiv_qr (q2p, rp, 0L, np, nn, dp, dn);
	  MPN_COPY (np, rp, dn);	/* overwrite np area with remainder */
	  qn = nn - dn;
	  MPN_COPY (qp, q2p, qn);
	  qhl = q2p[qn];
	}
      TMP_FREE;
      return qhl;
    }
}
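This wrapper is built on the public mpn_tdiv_qr; a minimal direct call is sketched below. The requirements are nn >= dn >= 1, dp[dn-1] != 0, room for nn - dn + 1 quotient limbs at qp and dn remainder limbs at rp, and a qxn argument of zero (the only value currently supported).

#include <gmp.h>

static void
tdiv_demo (mp_ptr qp, mp_ptr rp,
           mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)
{
  /* {np, nn} = {qp, nn-dn+1} * {dp, dn} + {rp, dn}, with the remainder < d */
  mpn_tdiv_qr (qp, rp, 0, np, nn, dp, dn);
}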
Example No. 26
void
mpn_sqr (mp_ptr p, mp_srcptr a, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));

  if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
    { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */
      mpn_mul_basecase (p, a, n, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))
    {
      mpn_sqr_basecase (p, a, n);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast! */
      mp_limb_t ws[mpn_toom2_sqr_itch (SQR_TOOM3_THRESHOLD_LIMIT-1)];
      ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT);
      mpn_toom2_sqr (p, a, n, ws);
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (mpn_toom3_sqr_itch (n));
      mpn_toom3_sqr (p, a, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD))
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (mpn_toom4_sqr_itch (n));
      mpn_toom4_sqr (p, a, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD))
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (mpn_toom6_sqr_itch (n));
      mpn_toom6_sqr (p, a, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, SQR_FFT_THRESHOLD))
    {
      mp_ptr ws;
      TMP_DECL;
      TMP_MARK;
      ws = TMP_ALLOC_LIMBS (mpn_toom8_sqr_itch (n));
      mpn_toom8_sqr (p, a, n, ws);
      TMP_FREE;
    }
  else
    {
      /* The current FFT code allocates its own space.  That should probably
	 change.  */
      mpn_fft_mul (p, a, n, a, n);
    }
}
Example No. 27
/*
   ret + (xp, n) = (yp, n)*(zp, n) % 2^b + 1  
   needs (tp, 2n) temp space, everything reduced mod 2^b 
   inputs, outputs are fully reduced
  
   N.B: 2n is not the same as 2b rounded up to nearest limb!
*/
inline static int
mpn_mulmod_2expp1_internal (mp_ptr xp, mp_srcptr yp, mp_srcptr zp,
			                                         mpir_ui b, mp_ptr tp)
{
  mp_size_t n, k;
  mp_limb_t c;

  TMP_DECL;

  n = BITS_TO_LIMBS (b);
  k = GMP_NUMB_BITS * n - b;

  ASSERT(b > 0);
  ASSERT(n > 0);
  ASSERT_MPN(yp, n);
  ASSERT_MPN(zp, n);
  ASSERT(!MPN_OVERLAP_P (tp, 2 * n, yp, n));
  ASSERT(!MPN_OVERLAP_P (tp, 2 * n, zp, n));
  ASSERT(MPN_SAME_OR_SEPARATE_P (xp, tp, n));
  ASSERT(MPN_SAME_OR_SEPARATE_P (xp, tp + n, n));
  ASSERT(k == 0 || yp[n - 1] >> (GMP_NUMB_BITS - k) == 0);
  ASSERT(k == 0 || zp[n - 1] >> (GMP_NUMB_BITS - k) == 0);

#ifndef TUNE_PROGRAM_BUILD
  if (k == 0 && n > FFT_MULMOD_2EXPP1_CUTOFF && n == mpir_fft_adjust_limbs(n))
  {
      mp_bitcnt_t depth1, depth = 1;
      mp_size_t w1, off;
      mp_ptr tx, ty, tz;
      mp_limb_t ret;

      TMP_MARK;

      tx = TMP_BALLOC_LIMBS(3*n + 3);
      ty = tx + n + 1;
      tz = ty + n + 1;

      MPN_COPY(ty, yp, n);
      MPN_COPY(tz, zp, n);
      ty[n] = 0;
      tz[n] = 0;

      while ((((mp_limb_t)1)<<depth) < b) depth++;
   
      if (depth < 12) off = mulmod_2expp1_table_n[0];
      else off = mulmod_2expp1_table_n[MIN(depth, FFT_N_NUM + 11) - 12];
      depth1 = depth/2 - off;
   
      w1 = b/(((mp_limb_t)1)<<(2*depth1));

      mpir_fft_mulmod_2expp1(tx, ty, tz, n, depth1, w1);

      MPN_COPY(xp, tx, n);
      ret = tx[n];
      
      TMP_FREE;

	   return ret;
  }
#endif

  if (yp == zp)
     mpn_sqr(tp, yp, n);
  else
     mpn_mul_n (tp, yp, zp, n);

  if (k == 0)
    {
      c = mpn_sub_n (xp, tp, tp + n, n);

      return mpn_add_1 (xp, xp, n, c);
    }

  c = tp[n - 1];
  tp[n - 1] &= GMP_NUMB_MASK >> k;

#if HAVE_NATIVE_mpn_sublsh_nc
  c = mpn_sublsh_nc (xp, tp, tp + n, n, k, c);
#else
  {
    mp_limb_t c1;
    c1 = mpn_lshift (tp + n, tp + n, n, k);
    tp[n] |= c >> (GMP_NUMB_BITS - k);
    c = mpn_sub_n (xp, tp, tp + n, n) + c1;
  }
#endif

  c = mpn_add_1 (xp, xp, n, c);
  xp[n - 1] &= GMP_NUMB_MASK >> k;

  return c;
}
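A reference sketch in mpz (not MPIR code) of the wrap-around identity this routine relies on: since 2^b == -1 (mod 2^b + 1), a product t = lo + hi*2^b reduces to lo - hi (mod 2^b + 1), which is exactly what the subtraction and conditional add-back above implement limb by limb.

#include <gmp.h>

static void
ref_mulmod_2expp1 (mpz_t r, const mpz_t y, const mpz_t z, unsigned long b)
{
  mpz_t t, hi, lo, mod;

  mpz_inits (t, hi, lo, mod, NULL);

  mpz_mul (t, y, z);
  mpz_fdiv_q_2exp (hi, t, b);      /* hi = t >> b          */
  mpz_fdiv_r_2exp (lo, t, b);      /* lo = t mod 2^b       */

  mpz_ui_pow_ui (mod, 2, b);
  mpz_add_ui (mod, mod, 1);        /* mod = 2^b + 1        */

  mpz_sub (r, lo, hi);
  mpz_mod (r, r, mod);             /* equals t mod (2^b+1) */

  mpz_clears (t, hi, lo, mod, NULL);
}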
Example No. 28
void
mpn_mul_basecase (mp_ptr rp,
		  mp_srcptr up, mp_size_t un,
		  mp_srcptr vp, mp_size_t vn)
{
  ASSERT (un >= vn);
  ASSERT (vn >= 1);
  ASSERT (! MPN_OVERLAP_P (rp, un+vn, up, un));
  ASSERT (! MPN_OVERLAP_P (rp, un+vn, vp, vn));

  /* We first multiply by the low order limb (or depending on optional function
     availability, limbs).  This result can be stored, not added, to rp.  We
     also avoid a loop for zeroing this way.  */

#ifdef HAVE_NATIVE_mpn_mul_2
  if (vn >= 2)
    {
      rp[un + 1] = mpn_mul_2 (rp, up, un, vp);
      rp += 2, vp += 2, vn -= 2;
    }
  else
    {
      rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
      return;
    }
#else
  rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
  rp += 1, vp += 1, vn -= 1;
#endif

  /* Now accumulate the product of up[] and the next higher limb (or depending
     on optional function availability, limbs) from vp[].  */

#define MAX_LEFT MP_SIZE_T_MAX	/* Used to simplify loops into if statements */


#ifdef HAVE_NATIVE_mpn_addmul_6
  while (vn >= 6)
    {
      rp[un + 6 - 1] = mpn_addmul_6 (rp, up, un, vp);
      if (MAX_LEFT == 6)
	return;
      rp += 6, vp += 6, vn -= 6;
      if (MAX_LEFT < 2 * 6)
	break;
    }
#undef MAX_LEFT
#define MAX_LEFT (6 - 1)
#endif

#ifdef HAVE_NATIVE_mpn_addmul_5
  while (vn >= 5)
    {
      rp[un + 5 - 1] = mpn_addmul_5 (rp, up, un, vp);
      if (MAX_LEFT == 5)
	return;
      rp += 5, vp += 5, vn -= 5;
      if (MAX_LEFT < 2 * 5)
	break;
    }
#undef MAX_LEFT
#define MAX_LEFT (5 - 1)
#endif

#ifdef HAVE_NATIVE_mpn_addmul_4
  while (vn >= 4)
    {
      rp[un + 4 - 1] = mpn_addmul_4 (rp, up, un, vp);
      if (MAX_LEFT == 4)
	return;
      rp += 4, vp += 4, vn -= 4;
      if (MAX_LEFT < 2 * 4)
	break;
    }
#undef MAX_LEFT
#define MAX_LEFT (4 - 1)
#endif

#ifdef HAVE_NATIVE_mpn_addmul_3
  while (vn >= 3)
    {
      rp[un + 3 - 1] = mpn_addmul_3 (rp, up, un, vp);
      if (MAX_LEFT == 3)
	return;
      rp += 3, vp += 3, vn -= 3;
      if (MAX_LEFT < 2 * 3)
	break;
    }
#undef MAX_LEFT
#define MAX_LEFT (3 - 1)
#endif

#ifdef HAVE_NATIVE_mpn_addmul_2
  while (vn >= 2)
    {
      rp[un + 2 - 1] = mpn_addmul_2 (rp, up, un, vp);
      if (MAX_LEFT == 2)
	return;
      rp += 2, vp += 2, vn -= 2;
      if (MAX_LEFT < 2 * 2)
	break;
    }
#undef MAX_LEFT
#define MAX_LEFT (2 - 1)
#endif

  while (vn >= 1)
    {
      rp[un] = mpn_addmul_1 (rp, up, un, vp[0]);
      if (MAX_LEFT == 1)
	return;
      rp += 1, vp += 1, vn -= 1;
    }
}