Example #1
mp_limb_t
mpn_sb_div_qr (mp_ptr qp,
		  mp_ptr np, mp_size_t nn,
		  mp_srcptr dp, mp_size_t dn,
		  mp_limb_t dinv)
{
  mp_limb_t qh;
  mp_size_t i;
  mp_limb_t n1, n0;
  mp_limb_t d1, d0, d01, d11;
  mp_limb_t cy, cy1, cy2;
  mp_limb_t q;

  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

  np += nn;

  qh = mpn_cmp (np - dn, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (np - dn, np - dn, dp, dn);

  d1 = dp[dn - 1];

  if (dn <= SB_DIV_QR_SMALL_THRESHOLD)
    {
      np--;

      for (i = nn - dn - 1; i >= 0; i--)
        {
          /* fetch next word */
          cy = *np--;

          mpir_divapprox32_preinv2 (q, cy, np[0], dinv);

          /* np -= dp*q */
          cy -= mpn_submul_1 (np - dn + 1, dp, dn, q);

          /* correct if remainder is too large */
          if (UNLIKELY (cy || np[0] >= d1))
            {
              if (cy || mpn_cmp (np - dn + 1, dp, dn) >= 0)
                {
                  q++;
                  mpn_sub_n (np - dn + 1, np - dn + 1, dp, dn);
                }
            }
          qp[i] = q;
        }
    }
  else
    {
      qp += nn - dn;

      dn -= 2;			/* offset dn by 2 for main division loops,
				   saving two iterations in mpn_submul_1.  */
      d0 = dp[dn];

      d01 = d0 + 1;
      d11 = d1 + (d01 == 0);

      np -= 2;

      n1 = np[1];

      for (i = nn - (dn + 2); i > 0; i--)
        {
          np--;
          if (UNLIKELY (n1 == d1) && np[1] == d0)
            {
              q = GMP_NUMB_MASK;
              mpn_submul_1 (np - dn, dp, dn + 2, q);
              n1 = np[1];	/* update n1, last loop's value will now be invalid */
            }
          else
            {
              mpir_divrem32_preinv2 (q, n1, n0, n1, np[1], np[0], d11, d01, d1, d0, dinv);

              cy2 = mpn_submul_1 (np - dn, dp, dn, q);

              sub_333 (cy, n1, n0, 0, n1, n0, 0, 0, cy2);

              np[0] = n0;

              if (UNLIKELY (cy != 0))
                {
                  n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1);
                  q--;
                }
            }

          *--qp = q;
        }
      np[1] = n1;
    }

  return qh;
}
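The estimate-and-correct loop above is easier to follow on tiny digits. Below is a self-contained sketch of the same schoolbook scheme in base 256, with plain ints instead of limbs and no precomputed inverse; it assumes a normalised divisor (top digit >= B/2) and an input whose top dn digits are already below the divisor, so the qh adjustment is not needed. With a normalised divisor the single-digit estimate exceeds the true quotient digit by at most 2.

#include <assert.h>

#define B 256

/* {np,nn} / {dp,dn}, little-endian base-256 digits; quotient digits go to
   qp[0..nn-dn-1], the remainder is left in np[0..dn-1] */
static void
sb_div_qr_base256 (int *qp, int *np, int nn, const int *dp, int dn)
{
  int i, j;

  for (i = nn - dn - 1; i >= 0; i--)
    {
      /* estimate the next quotient digit from the top of the window */
      long num = (long) np[i + dn] * B + np[i + dn - 1];
      int q = (int) (num / dp[dn - 1]);
      if (q > B - 1)
        q = B - 1;

      /* np -= dp*q over the current window, like mpn_submul_1 */
      long borrow = 0;
      for (j = 0; j < dn; j++)
        {
          long t = (long) np[i + j] - (long) q * dp[j] - borrow;
          borrow = (t < 0) ? (B - 1 - t) / B : 0;
          np[i + j] = (int) (t + borrow * B);
        }
      np[i + dn] -= (int) borrow;

      /* correct if the remainder went negative (q at most 2 too large) */
      while (np[i + dn] < 0)
        {
          int carry = 0;
          q--;
          for (j = 0; j < dn; j++)
            {
              int t = np[i + j] + dp[j] + carry;
              carry = t >= B;
              np[i + j] = t - carry * B;
            }
          np[i + dn] += carry;
        }
      qp[i] = q;
    }
}

int main (void)
{
  int np[4] = { 0x21, 0x43, 0x65, 0x07 };   /* 0x07654321, low digit first */
  int dp[2] = { 0xCD, 0xAB };               /* 0xABCD, top digit >= B/2 */
  int qp[2];

  sb_div_qr_base256 (qp, np, 4, dp, 2);
  assert (qp[0] + B * qp[1] == 0x07654321 / 0xABCD);
  assert (np[0] + B * np[1] == 0x07654321 % 0xABCD);
  return 0;
}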
Example #2
void
mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
{
  mp_limb_t w, w0, w1;
  mp_size_t n2;
  mp_srcptr x, y;
  mp_size_t i;
  int sign;

  n2 = n >> 1;
  ASSERT (n2 > 0);

  if ((n & 1) != 0)
    {
      /* Odd length. */
      mp_size_t n1, n3, nm1;

      n3 = n - n2;

      sign = 0;
      w = a[n2];
      if (w != 0)
	w -= mpn_sub_n (p, a, a + n3, n2);
      else
	{
	  i = n2;
	  do
	    {
	      --i;
	      w0 = a[i];
	      w1 = a[n3 + i];
	    }
	  while (w0 == w1 && i != 0);
	  if (w0 < w1)
	    {
	      x = a + n3;
	      y = a;
	      sign = ~0;
	    }
	  else
	    {
	      x = a;
	      y = a + n3;
	    }
	  mpn_sub_n (p, x, y, n2);
	}
      p[n2] = w;

      w = b[n2];
      if (w != 0)
	w -= mpn_sub_n (p + n3, b, b + n3, n2);
      else
	{
	  i = n2;
	  do
	    {
	      --i;
	      w0 = b[i];
	      w1 = b[n3 + i];
	    }
	  while (w0 == w1 && i != 0);
	  if (w0 < w1)
	    {
	      x = b + n3;
	      y = b;
	      sign = ~sign;
	    }
	  else
	    {
	      x = b;
	      y = b + n3;
	    }
	  mpn_sub_n (p + n3, x, y, n2);
	}
      p[n] = w;

      n1 = n + 1;
      if (n2 < MUL_KARATSUBA_THRESHOLD)
	{
	  if (n3 < MUL_KARATSUBA_THRESHOLD)
	    {
	      mpn_mul_basecase (ws, p, n3, p + n3, n3);
	      mpn_mul_basecase (p, a, n3, b, n3);
	    }
	  else
	    {
	      mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
	      mpn_kara_mul_n (p, a, b, n3, ws + n1);
	    }
	  mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
	}
      else
	{
	  mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
	  mpn_kara_mul_n (p, a, b, n3, ws + n1);
	  mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
	}

      if (sign)
	mpn_add_n (ws, p, ws, n1);
      else
	mpn_sub_n (ws, p, ws, n1);

      nm1 = n - 1;
      if (mpn_add_n (ws, p + n1, ws, nm1))
	{
	  mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
	  ws[nm1] = x;
	  if (x == 0)
	    ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
	}
      if (mpn_add_n (p + n3, p + n3, ws, n1))
	{
	  mpn_incr_u (p + n1 + n3, 1);
	}
    }
  else
    {
      /* Even length. */
      i = n2;
      do
	{
	  --i;
	  w0 = a[i];
	  w1 = a[n2 + i];
	}
      while (w0 == w1 && i != 0);
      sign = 0;
      if (w0 < w1)
	{
	  x = a + n2;
	  y = a;
	  sign = ~0;
	}
      else
	{
	  x = a;
	  y = a + n2;
	}
      mpn_sub_n (p, x, y, n2);

      i = n2;
      do
	{
	  --i;
	  w0 = b[i];
	  w1 = b[n2 + i];
	}
      while (w0 == w1 && i != 0);
      if (w0 < w1)
	{
	  x = b + n2;
	  y = b;
	  sign = ~sign;
	}
      else
	{
	  x = b;
	  y = b + n2;
	}
      mpn_sub_n (p + n2, x, y, n2);

      /* Pointwise products. */
      if (n2 < MUL_KARATSUBA_THRESHOLD)
	{
	  mpn_mul_basecase (ws, p, n2, p + n2, n2);
	  mpn_mul_basecase (p, a, n2, b, n2);
	  mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
	}
      else
	{
	  mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
	  mpn_kara_mul_n (p, a, b, n2, ws + n);
	  mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
	}

      /* Interpolate. */
      if (sign)
	w = mpn_add_n (ws, p, ws, n);
      else
	w = -mpn_sub_n (ws, p, ws, n);
      w += mpn_add_n (ws, p + n, ws, n);
      w += mpn_add_n (p + n2, p + n2, ws, n);
      MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
    }
}
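For reference, a sketch of the three-multiplication identity the routine implements, on 16-bit halves of ordinary 32-bit words; the `neg` flag plays the role of `sign` above. Only the algebra is shown, none of the limb handling:

#include <assert.h>
#include <stdint.h>

int main (void)
{
  uint32_t a = 0x12345678u, b = 0x9abcdef0u;
  uint32_t a0 = a & 0xffffu, a1 = a >> 16;
  uint32_t b0 = b & 0xffffu, b1 = b >> 16;

  uint64_t p = (uint64_t) a0 * b0;               /* low product  */
  uint64_t q = (uint64_t) a1 * b1;               /* high product */

  /* middle term via the difference product, with its sign tracked
     the way mpn_kara_mul_n tracks `sign` */
  int neg = (a0 < a1) != (b0 < b1);
  uint64_t m = (uint64_t) (a0 < a1 ? a1 - a0 : a0 - a1)
             * (b0 < b1 ? b1 - b0 : b0 - b1);
  uint64_t mid = neg ? p + q + m : p + q - m;    /* = a0*b1 + a1*b0 */

  assert ((uint64_t) a * b == (q << 32) + (mid << 16) + p);
  return 0;
}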
Example #3
void
mpn_toom3_mul_n (mp_ptr c, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr t)
{
  mp_size_t k, k1, kk1, r, twok, twor;
  mp_limb_t cy, cc, saved, vinf0, cinf0;
  mp_ptr trec;
  int sa, sb;
  mp_ptr c1, c2, c3, c4, c5;

  ASSERT(GMP_NUMB_BITS >= 6);
  ASSERT(n >= 17); /* so that r != 0 and 5k+3 <= 2n */

  /*
  The algorithm is the following:

  0. k = ceil(n/3), r = n - 2k, B = 2^(GMP_NUMB_BITS), t = B^k
  1. split a and b in three parts each a0, a1, a2 and b0, b1, b2
     with a0, a1, b0, b1 of k limbs, and a2, b2 of r limbs
  2. v0   <- a0*b0
     v1   <- (a0+a1+a2)*(b0+b1+b2)
     v2   <- (a0+2*a1+4*a2)*(b0+2*b1+4*b2)
     vm1  <- (a0-a1+a2)*(b0-b1+b2)
     vinf <- a2*b2
     t1   <- (3*v0+2*vm1+v2)/6-2*vinf
     t2   <- (v1+vm1)/2
  3. result is c0+c1*t+c2*t^2+c3*t^3+c4*t^4 where
     c0   <- v0
     c1   <- v1 - t1
     c2   <- t2 - v0 - vinf
     c3   <- t1 - t2
     c4   <- vinf
  */

  k = (n + 2) / 3; /* ceil(n/3) */
  twok = 2 * k;
  k1 = k + 1;
  kk1 = k + k1;
  r = n - twok;   /* last chunk */
  twor = 2 * r;

  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  c5 = c4 + k;

  trec = t + 4 * k + 3; /* trec = v2 + (2k+2) */

  /* put a0+a2 in {c, k+1}, and b0+b2 in {c+k+1, k+1};
     put a0+a1+a2 in {c+2k+2, k+1} and b0+b1+b2 in {c+3k+3,k+1}
     [requires 4k+4 <= 2n, ie. n >= 8] */
  cy = mpn_add_n (c, a, a + twok, r);
  cc = mpn_add_n (c1 + 1, b, b + twok, r);
  if (r < k)
    {
      __GMPN_ADD_1 (cy, c + r, a + r, k - r, cy);
      __GMPN_ADD_1 (cc, c1 + 1 + r, b + r, k - r, cc);
    }
  c3[2] = (c1[0] = cy) + mpn_add_n (c2 + 2, c, a + k, k);
  c4[3] = (c2[1] = cc) + mpn_add_n (c3 + 3, c1 + 1, b + k, k);

#define v2 (t+2*k+1)
#define vinf (t+4*k+2)

  /* compute v1 := (a0+a1+a2)*(b0+b1+b2) in {t, 2k+1};
     since v1 < 9*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 4 */
  TOOM3_MUL_REC (t, c2 + 2, c3 + 3, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
					v1
  */

  /* put |a0-a1+a2| in {c, k+1} and |b0-b1+b2| in {c+4k+2,k+1} */
  /* sa = sign(a0-a1+a2) */
  sa = (c[k] != 0) ? 1 : mpn_cmp (c, a + k, k);
  c[k] = (sa >= 0) ? c[k] - mpn_sub_n (c, c, a + k, k)
		   : mpn_sub_n (c, a + k, c, k);
  /* b0+b2 is in {c+k+1, k+1} now */
  sb = (c2[1] != 0) ? 1 : mpn_cmp (c1 + 1, b + k, k);
  c5[2] = (sb >= 0) ? c2[1] - mpn_sub_n (c4 + 2, c1 + 1, b + k, k)
		    : mpn_sub_n (c4 + 2, b + k, c1 + 1, k);
  sa *= sb; /* sign of vm1 */

  /* compute vm1 := (a0-a1+a2)*(b0-b1+b2) in {c+2k, 2k+1};
     since |vm1| < 4*B^(2k), vm1 uses only 2k+1 limbs */
  TOOM3_MUL_REC (c2, c, c4 + 2, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
		vm1                      v1
  */

  /* compute a0+2a1+4a2 in {c, k+1} and b0+2b1+4b2 in {c+4k+2, k+1}
     [requires 5k+3 <= 2n, i.e. n >= 17] */
#ifdef HAVE_NATIVE_mpn_addlsh1_n
  c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
  c5[2] = mpn_addlsh1_n (c4 + 2, b + k, b + twok, r);
  if (r < k)
    {
      __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
      __GMPN_ADD_1 (c5[2], c4 + 2 + r, b + k + r, k - r, c5[2]);
    }
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
  c5[2] = 2 * c5[2] + mpn_addlsh1_n (c4 + 2, b, c4 + 2, k);
#else
  c[r] = mpn_lshift (c, a + twok, r, 1);
  c4[r + 2] = mpn_lshift (c4 + 2, b + twok, r, 1);
  if (r < k)
    {
      MPN_ZERO(c + r + 1, k - r);
      MPN_ZERO(c4 + r + 3, k - r);
    }
  c1[0] += mpn_add_n (c, c, a + k, k);
  c5[2] += mpn_add_n (c4 + 2, c4 + 2, b + k, k);
  mpn_lshift (c, c, k1, 1);
  mpn_lshift (c4 + 2, c4 + 2, k1, 1);
  c1[0] += mpn_add_n (c, c, a, k);
  c5[2] += mpn_add_n (c4 + 2, c4 + 2, b, k);
#endif

  /* compute v2 := (a0+2a1+4a2)*(b0+2b1+4b2) in {t+2k+1, 2k+1}
     v2 < 49*B^k so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */
  TOOM3_MUL_REC (v2, c, c4 + 2, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
		vm1                      v1         v2
  */

  /* compute v0 := a0*b0 in {c, 2k} */
  TOOM3_MUL_REC (c, a, b, k, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       vm1                      v1         v2
  */

  /* now compute (3v0+2vm1+v2)/6 = [v0 + (2vm1+v2)/3]/2
     v2 <- v2+2vm1 = 3*(a0*b0+2*a0*b2+2*a1*b1+2*a1*b2+2*a2*b0+2*a2*b1+6*a2*b2),
     thus 0 <= v2 < 51*B^(2k) < 2^6*B^(2k)
     Uses temporary space {t+4k+2,2k+1}, requires T(n) >= 6k+3.
  */
  if (sa >= 0)
    {
#ifdef HAVE_NATIVE_mpn_addlsh1_n
      mpn_addlsh1_n (v2, v2, c2, kk1);
#else
      /* we can use vinf=t+4k+2 as workspace since it is not full yet */
      mpn_lshift (vinf, c2, kk1, 1);
      mpn_add_n (v2, v2, vinf, kk1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_sublsh1_n
      mpn_sublsh1_n (v2, v2, c2, kk1);
#else
      /* we can use vinf=t+4k+2 as workspace since it is not full yet */
      mpn_lshift (vinf, c2, kk1, 1);
      mpn_sub_n (v2, v2, vinf, kk1);
#endif
    }

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       vm1                      v1       v2+2vm1             */

  /* compute vinf := a2*b2 in {t+4k+2, 2r}: first put it in {c4, 2r},
     then copy it in {t+4k+2,2r} */
  saved = c4[0];
  TOOM3_MUL_REC (c4, a + twok, b + twok, r, trec);
  cinf0 = mpn_add_n (vinf, c4, c, twor); /* {v0,2r} + {vinf,2r} */
  vinf0 = c4[0];
  c4[0] = saved;

  toom3_interpolate (c, t, v2, c2, vinf, k, r, sa, vinf0, cinf0, vinf + twor);

#undef v2
#undef vinf
}
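The t1/t2 formulas quoted in the algorithm comment at the top can be checked on single words. This is only the algebra of steps 2-3, with every value exact in a long (a sketch, no limb arithmetic):

#include <assert.h>

int main (void)
{
  long a0 = 5, a1 = 7, a2 = 3, b0 = 2, b1 = 8, b2 = 6;

  long v0   = a0 * b0;
  long v1   = (a0 + a1 + a2) * (b0 + b1 + b2);
  long v2   = (a0 + 2*a1 + 4*a2) * (b0 + 2*b1 + 4*b2);
  long vm1  = (a0 - a1 + a2) * (b0 - b1 + b2);
  long vinf = a2 * b2;

  long t1 = (3*v0 + 2*vm1 + v2) / 6 - 2*vinf;   /* exact division */
  long t2 = (v1 + vm1) / 2;                     /* exact division */

  long c0 = v0, c1 = v1 - t1, c2 = t2 - v0 - vinf, c3 = t1 - t2, c4 = vinf;

  /* compare with the schoolbook coefficients of (a0+a1*x+a2*x^2)(b0+b1*x+b2*x^2) */
  assert (c0 == a0*b0);
  assert (c1 == a0*b1 + a1*b0);
  assert (c2 == a0*b2 + a1*b1 + a2*b0);
  assert (c3 == a1*b2 + a2*b1);
  assert (c4 == a2*b2);
  return 0;
}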
Example #4
/*
   Toom 4 interpolation. Interpolates the value at 2^(sn*B) of a 
	polynomial p(x) with 7 coefficients given the values 
	p(oo), p(2), p(1), p(-1), 2^6*p(1/2), 2^6*p(-1/2), p(0).
	The output is placed in rp and the final number of limbs of the
	output is given in rpn.
	The 4th and 6th values may be negative, and if so, n4 and n6 
	should be set to a negative value respectively.
   To save space we pass r3, r5, r7 in place in the output rp.
	The other r's are stored separately in space tp.
	The low limb of r3 is stored in r30, as it will be overwritten
	by the high limb of r5.

rp          rp1          rp2           rp3          rp4           rp5         rp6           rp7
<----------- r7-----------><------------r5-------------->            
                                                      <-------------r3------------->

   We assume that r1 is stored at tp, r2 at (tp + t4), r4 at (tp + 2*t4) 
	and r6 at (tp + 3*t4). Each of these r's has t4 = s4 + 1 limbs allocated.
*/
void mpn_toom4_interpolate(mp_ptr rp, mp_size_t * rpn, mp_size_t sn,  
		       mp_ptr tp, mp_size_t s4, mp_size_t n4, mp_size_t n6, mp_limb_t r30)
{
	mp_size_t n1, n2, n3, n5, n7, t4;
	mp_limb_t saved, saved2, cy;

   t4 = s4 + 1; 
   
	mpn_add_n(r2, r2, r5, s4);

   if (n6 < 0) 
		mpn_add_n(r6, r5, r6, s4);
	else
      mpn_sub_n(r6, r5, r6, s4);
	/* r6 is now in twos complement format */

	saved = r3[0];
	r3[0] = r30;
	if (n4 < 0) 
		mpn_add_n(r4, r3, r4, s4);
	else
      mpn_sub_n(r4, r3, r4, s4);
	r3[0] = saved;
	/* r4 is now in twos complement format */
	
	mpn_sub_n(r5, r5, r1, s4);

#if HAVE_NATIVE_mpn_sublsh_n
	r5[s4-1] -= mpn_sublsh_n(r5, r5, r7, s4-1, 6);
#else
	r5[s4-1] -= mpn_submul_1(r5, r7, s4-1, 64);
#endif
   
   TC4_RSHIFT1(r4, s4); 
	
	saved = r3[0];
	r3[0] = r30;
	mpn_sub_n(r3, r3, r4, s4);
	r30 = r3[0];
	r3[0] = saved;

	mpn_double(r5, s4); 

	mpn_sub_n(r5, r5, r6, s4);

   saved = r3[0];
	r3[0] = r30;
	mpn_submul_1(r2, r3, s4, 65);
   r3[0] = saved;
	
	saved2 = r7[s4-1];
	r7[s4-1] = CNST_LIMB(0); // r7 is always positive so no sign extend needed
	saved = r3[0];
	r3[0] = r30;
#if HAVE_NATIVE_mpn_subadd_n
	mpn_subadd_n(r3, r3, r7, r1, s4);
#else
    mpn_sub_n(r3, r3, r7, s4);
    mpn_sub_n(r3, r3, r1, s4);
#endif
	r7[s4-1] = saved2;
   r30 = r3[0];
	
   mpn_addmul_1(r2, r3, s4, 45);

#if HAVE_NATIVE_mpn_sublsh_n
   cy = mpn_sublsh_n(r5, r5, r3, s4 - 1, 3);
#else
   cy = mpn_submul_1(r5, r3, s4 - 1, 8);
#endif
   r3[0] = saved;
	r3[0] -= (cy + 8*r3[s4-1]);
   
	mpn_rshift(r5, r5, s4, 3); 

	mpn_divexact_by3(r5, r5, s4); 
   
	mpn_sub_n(r6, r6, r2, s4);

#if HAVE_NATIVE_mpn_sublsh_n
	mpn_sublsh_n(r2, r2, r4, s4, 4);
#else
	mpn_submul_1(r2, r4, s4, 16);
#endif
   
   mpn_rshift(r2, r2, s4, 1); 

	mpn_divexact_by3(r2, r2, s4); 

   mpn_divexact_by3(r2, r2, s4); 
   
   saved = r3[0];
	r3[0] = r30;
   cy = mpn_sub_n(r3, r3, r5, s4 - 1);
   r30 = r3[0];
	r3[0] = saved;
	r3[s4-1] -= (cy + r5[s4-1]);
   
	mpn_sub_n(r4, r4, r2, s4);
	
	mpn_addmul_1(r6, r2, s4, 30);

   mpn_divexact_byfobm1(r6, r6, s4, CNST_LIMB(15), CNST_LIMB(~0/15));

	mpn_rshift(r6, r6, s4, 2);

	mpn_sub_n(r2, r2, r6, s4);

	TC4_NORM(r1, n1, s4);
   TC4_NORM(r2, n2, s4);
   
   (*rpn) = 6*sn+1;
	cy = mpn_add_1(r3, r3, *rpn - 4*sn, r30); /* don't forget to add r3[0] back in */
   if (cy) 
	{
		rp[*rpn] = cy;
	   (*rpn)++;
	}

	tc4_copy(rp, rpn, 5*sn, r2, n2);
   tc4_copy(rp, rpn, 6*sn, r1, n1);

	tc4_copy(rp, rpn, sn, r6, s4);
   tc4_copy(rp, rpn, 3*sn, r4, s4); 
}
Example #5
/* Evaluates a polynomial of degree k > 2, in the points +2^shift and -2^shift. */
int
mpn_toom_eval_pm2exp (mp_ptr xp2, mp_ptr xm2, unsigned k,
		      mp_srcptr xp, mp_size_t n, mp_size_t hn, unsigned shift,
		      mp_ptr tp)
{
  unsigned i;
  int neg;
#ifdef HAVE_NATIVE_mpn_addlsh_n
  mp_limb_t cy;
#endif

  ASSERT (k >= 3);
  ASSERT (shift*k < GMP_NUMB_BITS);

  ASSERT (hn > 0);
  ASSERT (hn <= n);

  /* The degree k is also the number of full-size coefficients, so
   * that last coefficient, of size hn, starts at xp + k*n. */

#ifdef HAVE_NATIVE_mpn_addlsh_n
  xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2*shift);
  for (i = 4; i < k; i += 2)
    xp2[n] += mpn_addlsh_n (xp2, xp2, xp + i*n, n, i*shift);

  tp[n] = mpn_lshift (tp, xp+n, n, shift);
  for (i = 3; i < k; i+= 2)
    tp[n] += mpn_addlsh_n (tp, tp, xp+i*n, n, i*shift);

  if (k & 1)
    {
      cy = mpn_addlsh_n (tp, tp, xp+k*n, hn, k*shift);
      MPN_INCR_U (tp + hn, n+1 - hn, cy);
    }
  else
    {
      cy = mpn_addlsh_n (xp2, xp2, xp+k*n, hn, k*shift);
      MPN_INCR_U (xp2 + hn, n+1 - hn, cy);
    }

#else /* !HAVE_NATIVE_mpn_addlsh_n */
  xp2[n] = mpn_lshift (tp, xp+2*n, n, 2*shift);
  xp2[n] += mpn_add_n (xp2, xp, tp, n);
  for (i = 4; i < k; i += 2)
    {
      xp2[n] += mpn_lshift (tp, xp + ((mp_size_t) i)*n, n, i*shift);
      xp2[n] += mpn_add_n (xp2, xp2, tp, n);
    }

  tp[n] = mpn_lshift (tp, xp+n, n, shift);
  for (i = 3; i < k; i+= 2)
    {
      tp[n] += mpn_lshift (xm2, xp + ((mp_size_t) i)*n, n, i*shift);
      tp[n] += mpn_add_n (tp, tp, xm2, n);
    }

  xm2[hn] = mpn_lshift (xm2, xp + ((mp_size_t) k)*n, hn, k*shift);
  if (k & 1)
    mpn_add (tp, tp, n+1, xm2, hn+1);
  else
    mpn_add (xp2, xp2, n+1, xm2, hn+1);
#endif /* !HAVE_NATIVE_mpn_addlsh_n */

  neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? ~0 : 0;

#ifdef HAVE_NATIVE_mpn_add_n_sub_n
  if (neg)
    mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1);
  else
    mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1);
#else /* !HAVE_NATIVE_mpn_add_n_sub_n */
  if (neg)
    mpn_sub_n (xm2, tp, xp2, n + 1);
  else
    mpn_sub_n (xm2, xp2, tp, n + 1);

  mpn_add_n (xp2, xp2, tp, n + 1);
#endif /* !HAVE_NATIVE_mpn_add_n_sub_n */

  /* FIXME: the following asserts are useless if (k+1)*shift >= GMP_LIMB_BITS */
  ASSERT ((k+1)*shift >= GMP_LIMB_BITS ||
	  xp2[n] < ((CNST_LIMB(1)<<((k+1)*shift))-1)/((CNST_LIMB(1)<<shift)-1));
  ASSERT ((k+2)*shift >= GMP_LIMB_BITS ||
	  xm2[n] < ((CNST_LIMB(1)<<((k+2)*shift))-((k&1)?(CNST_LIMB(1)<<shift):1))/((CNST_LIMB(1)<<(2*shift))-1));

  return neg;
}
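On single words the function reduces to: accumulate the even-index and the odd-index terms separately; their sum is x(2^shift), and the absolute difference together with the returned sign gives x(-2^shift). A sketch with k = 3 and one-word coefficients:

#include <assert.h>

int main (void)
{
  long c0 = 9, c1 = 2, c2 = 7, c3 = 1;       /* degree-3 polynomial */
  unsigned shift = 2;
  long t = 1L << shift;                      /* the point 2^shift */

  long even = c0 + c2 * t * t;               /* accumulated into xp2 */
  long odd  = c1 * t + c3 * t * t * t;       /* accumulated into tp  */

  long xp2 = even + odd;                     /* x(+2^shift) */
  int  neg = (even < odd) ? ~0 : 0;          /* the return value */
  long xm2 = neg ? odd - even : even - odd;  /* |x(-2^shift)| */

  assert (xp2 == c0 + c1*t + c2*t*t + c3*t*t*t);
  assert ((neg ? -xm2 : xm2) == c0 - c1*t + c2*t*t - c3*t*t*t);
  return 0;
}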
Example #6
  MPFR_ASSERTD ((n+4)/2 <= k && k < n); /* bounds from [1] */
  MPFR_TMP_MARK (marker);
  l = n - k;
  /* first divide the most significant 2k limbs from N by the most significant
     k limbs of D */
  qh = mpn_divrem (qp + l, 0, np + 2 * l, 2 * k, dp + l, k); /* exact */

  /* it remains {np,2l+k} = {np,n+l} as remainder */

  /* now we have to subtract high(Q1)*D0 where Q1=qh*B^k+{qp+l,k} and
     D0={dp,l} */
  tp = MPFR_TMP_LIMBS_ALLOC (2 * l);
  mpfr_mulhigh_n (tp, qp + k, dp, l);
  /* we are only interested in the upper l limbs from {tp,2l} */
  cy = mpn_sub_n (np + n, np + n, tp + l, l);
  if (qh)
    cy += mpn_sub_n (np + n, np + n, dp, l);
  while (cy > 0) /* Q1 was too large: subtract 1 from Q1 and add D to np+l */
    {
      qh -= mpn_sub_1 (qp + l, qp + l, k, MPFR_LIMB_ONE);
      cy -= mpn_add_n (np + l, np + l, dp, n);
    }

  /* now it remains {np,n+l} to divide by D */
  cy = mpfr_divhigh_n (qp, np + k, dp + k, l);
  qh += mpn_add_1 (qp + l, qp + l, k, cy);
  MPFR_TMP_FREE(marker);

  return qh;
}
Example #7
/* FIXME:
    x Take scratch parameter, and figure out scratch need.

    x Use some fallback for small M->n?
*/
static mp_size_t
hgcd_matrix_apply (const struct hgcd_matrix *M,
		   mp_ptr ap, mp_ptr bp,
		   mp_size_t n)
{
  mp_size_t an, bn, un, vn, nn;
  mp_size_t mn[2][2];
  mp_size_t modn;
  mp_ptr tp, sp, scratch;
  mp_limb_t cy;
  unsigned i, j;

  TMP_DECL;

  ASSERT ( (ap[n-1] | bp[n-1]) > 0);

  an = n;
  MPN_NORMALIZE (ap, an);
  bn = n;
  MPN_NORMALIZE (bp, bn);

  for (i = 0; i < 2; i++)
    for (j = 0; j < 2; j++)
      {
	mp_size_t k;
	k = M->n;
	MPN_NORMALIZE (M->p[i][j], k);
	mn[i][j] = k;
      }

  ASSERT (mn[0][0] > 0);
  ASSERT (mn[1][1] > 0);
  ASSERT ( (mn[0][1] | mn[1][0]) > 0);

  TMP_MARK;

  if (mn[0][1] == 0)
    {
      /* A unchanged, M = (1, 0; q, 1) */
      ASSERT (mn[0][0] == 1);
      ASSERT (M->p[0][0][0] == 1);
      ASSERT (mn[1][1] == 1);
      ASSERT (M->p[1][1][0] == 1);

      /* Put B <-- B - q A */
      nn = submul (bp, bn, ap, an, M->p[1][0], mn[1][0]);
    }
  else if (mn[1][0] == 0)
    {
      /* B unchanged, M = (1, q; 0, 1) */
      ASSERT (mn[0][0] == 1);
      ASSERT (M->p[0][0][0] == 1);
      ASSERT (mn[1][1] == 1);
      ASSERT (M->p[1][1][0] == 1);

      /* Put A  <-- A - q * B */
      nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]);
    }
  else
    {
      /* A = m00 a + m01 b  ==> a <= A / m00, b <= A / m01.
	 B = m10 a + m11 b  ==> a <= B / m10, b <= B / m11. */
      un = MIN (an - mn[0][0], bn - mn[1][0]) + 1;
      vn = MIN (an - mn[0][1], bn - mn[1][1]) + 1;

      nn = MAX (un, vn);
      /* In the range of interest, mulmod_bnm1 should always beat mullo. */
      modn = mpn_mulmod_bnm1_next_size (nn + 1);

      scratch = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (modn, modn, M->n));
      tp = TMP_ALLOC_LIMBS (modn);
      sp = TMP_ALLOC_LIMBS (modn);

      ASSERT (n <= 2*modn);

      if (n > modn)
	{
	  cy = mpn_add (ap, ap, modn, ap + modn, n - modn);
	  MPN_INCR_U (ap, modn, cy);

	  cy = mpn_add (bp, bp, modn, bp + modn, n - modn);
	  MPN_INCR_U (bp, modn, cy);

	  n = modn;
	}

      mpn_mulmod_bnm1 (tp, modn, ap, n, M->p[1][1], mn[1][1], scratch);
      mpn_mulmod_bnm1 (sp, modn, bp, n, M->p[0][1], mn[0][1], scratch);

      /* FIXME: Handle the small n case in some better way. */
      if (n + mn[1][1] < modn)
	MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]);
      if (n + mn[0][1] < modn)
	MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]);

      cy = mpn_sub_n (tp, tp, sp, modn);
      MPN_DECR_U (tp, modn, cy);

      ASSERT (mpn_zero_p (tp + nn, modn - nn));

      mpn_mulmod_bnm1 (sp, modn, ap, n, M->p[1][0], mn[1][0], scratch);
      MPN_COPY (ap, tp, nn);
      mpn_mulmod_bnm1 (tp, modn, bp, n, M->p[0][0], mn[0][0], scratch);

      if (n + mn[1][0] < modn)
	MPN_ZERO (sp + n + mn[1][0], modn - n - mn[1][0]);
      if (n + mn[0][0] < modn)
	MPN_ZERO (tp + n + mn[0][0], modn - n - mn[0][0]);

      cy = mpn_sub_n (tp, tp, sp, modn);
      MPN_DECR_U (tp, modn, cy);

      ASSERT (mpn_zero_p (tp + nn, modn - nn));
      MPN_COPY (bp, tp, nn);

      while ( (ap[nn-1] | bp[nn-1]) == 0)
	{
	  nn--;
	  ASSERT (nn > 0);
	}
    }
  TMP_FREE;

  return nn;
}
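The general branch computes a <- m11*A - m01*B and b <- m00*B - m10*A via wraparound products mod B^modn - 1. Since hgcd matrices have determinant 1, this is simply the inverse matrix applied exactly; the scalar analogue (a sketch, values chosen so det M = 1):

#include <assert.h>

int main (void)
{
  /* M = (m00 m01; m10 m11) with determinant 1, as hgcd produces */
  long m00 = 3, m01 = 2, m10 = 4, m11 = 3;   /* det = 9 - 8 = 1 */
  long a = 5, b = 7;
  long A = m00 * a + m01 * b;
  long B = m10 * a + m11 * b;

  /* the update performed by the function: recover a, b from A, B */
  long a2 = m11 * A - m01 * B;   /* = (m00*m11 - m01*m10) * a = a */
  long b2 = m00 * B - m10 * A;   /* = (m00*m11 - m01*m10) * b = b */
  assert (a2 == a && b2 == b);
  return 0;
}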
Example #8
void
check_functions (void)
{
  mp_limb_t  wp[2], wp2[2], xp[2], yp[2], r;
  int  i;

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 123;
      yp[0] = 456;
      mpn_add_n (wp, xp, yp, (mp_size_t) 1);
      ASSERT_ALWAYS (wp[0] == 579);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 123;
      wp[0] = 456;
      r = mpn_addmul_1 (wp, xp, (mp_size_t) 1, CNST_LIMB(2));
      ASSERT_ALWAYS (wp[0] == 702);
      ASSERT_ALWAYS (r == 0);
    }

#if HAVE_NATIVE_mpn_copyd
  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 123;
      xp[1] = 456;
      mpn_copyd (xp+1, xp, (mp_size_t) 1);
      ASSERT_ALWAYS (xp[1] == 123);
    }
#endif

#if HAVE_NATIVE_mpn_copyi
  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 123;
      xp[1] = 456;
      mpn_copyi (xp, xp+1, (mp_size_t) 1);
      ASSERT_ALWAYS (xp[0] == 456);
    }
#endif

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 1605;
      mpn_divexact_1 (wp, xp, (mp_size_t) 1, CNST_LIMB(5));
      ASSERT_ALWAYS (wp[0] == 321);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 1296;
      r = mpn_divexact_by3c (wp, xp, (mp_size_t) 1, CNST_LIMB(0));
      ASSERT_ALWAYS (wp[0] == 432);
      ASSERT_ALWAYS (r == 0);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 578;
      r = mpn_divexact_byfobm1 (wp, xp, (mp_size_t) 1, CNST_LIMB(17),CNST_LIMB(-1)/CNST_LIMB(17));
      ASSERT_ALWAYS (wp[0] == 34);
      ASSERT_ALWAYS (r == 0);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 287;
      r = mpn_divrem_1 (wp, (mp_size_t) 1, xp, (mp_size_t) 1, CNST_LIMB(7));
      ASSERT_ALWAYS (wp[1] == 41);
      ASSERT_ALWAYS (wp[0] == 0);
      ASSERT_ALWAYS (r == 0);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 290;
      r = mpn_divrem_euclidean_qr_1 (wp, 0, xp, (mp_size_t) 1, CNST_LIMB(7));
      ASSERT_ALWAYS (wp[0] == 41);
      ASSERT_ALWAYS (r == 3);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 12;
      r = mpn_gcd_1 (xp, (mp_size_t) 1, CNST_LIMB(9));
      ASSERT_ALWAYS (r == 3);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 0x1001;
      mpn_lshift (wp, xp, (mp_size_t) 1, 1);
      ASSERT_ALWAYS (wp[0] == 0x2002);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 14;
      r = mpn_mod_1 (xp, (mp_size_t) 1, CNST_LIMB(4));
      ASSERT_ALWAYS (r == 2);
    }

#if (GMP_NUMB_BITS % 4) == 0
  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      int  bits = (GMP_NUMB_BITS / 4) * 3;
      mp_limb_t  mod = (CNST_LIMB(1) << bits) - 1;
      mp_limb_t  want = GMP_NUMB_MAX % mod;
      xp[0] = GMP_NUMB_MAX;
      r = mpn_mod_34lsub1 (xp, (mp_size_t) 1);
      ASSERT_ALWAYS (r % mod == want);
    }
#endif

  //   DECL_modexact_1c_odd ((*modexact_1c_odd)); 

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 14;
      r = mpn_mul_1 (wp, xp, (mp_size_t) 1, CNST_LIMB(4));
      ASSERT_ALWAYS (wp[0] == 56);
      ASSERT_ALWAYS (r == 0);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 5;
      yp[0] = 7;
      mpn_mul_basecase (wp, xp, (mp_size_t) 1, yp, (mp_size_t) 1);
      ASSERT_ALWAYS (wp[0] == 35);
      ASSERT_ALWAYS (wp[1] == 0);
    }

#if HAVE_NATIVE_mpn_preinv_divrem_1 && GMP_NAIL_BITS == 0
  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 0x101;
      r = mpn_preinv_divrem_1 (wp, (mp_size_t) 1, xp, (mp_size_t) 1,
                               GMP_LIMB_HIGHBIT,
                               refmpn_invert_limb (GMP_LIMB_HIGHBIT), 0);
      ASSERT_ALWAYS (wp[0] == 0x202);
      ASSERT_ALWAYS (wp[1] == 0);
      ASSERT_ALWAYS (r == 0);
    }
#endif

#if GMP_NAIL_BITS == 0
  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = GMP_LIMB_HIGHBIT+123;
      r = mpn_preinv_mod_1 (xp, (mp_size_t) 1, GMP_LIMB_HIGHBIT,
                            refmpn_invert_limb (GMP_LIMB_HIGHBIT));
      ASSERT_ALWAYS (r == 123);
    }
#endif


  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 5;
      modlimb_invert (r, xp[0]);
      r = -r;
      yp[0] = 43;
      yp[1] = 75;
      mpn_redc_1 (wp, yp, xp, (mp_size_t) 1, r);
      ASSERT_ALWAYS (wp[0] == 78);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 5;
      yp[0] = 3;
      mpn_sumdiff_n (wp, wp2, xp, yp, 1);
      ASSERT_ALWAYS (wp[0] == 8);
      ASSERT_ALWAYS (wp2[0] == 2);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 0x8008;
      mpn_rshift (wp, xp, (mp_size_t) 1, 1);
      ASSERT_ALWAYS (wp[0] == 0x4004);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 5;
      mpn_sqr_basecase (wp, xp, (mp_size_t) 1);
      ASSERT_ALWAYS (wp[0] == 25);
      ASSERT_ALWAYS (wp[1] == 0);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 999;
      yp[0] = 666;
      mpn_sub_n (wp, xp, yp, (mp_size_t) 1);
      ASSERT_ALWAYS (wp[0] == 333);
    }

  memcpy (&__gmpn_cpuvec, &initial_cpuvec, sizeof (__gmpn_cpuvec));
  for (i = 0; i < 2; i++)
    {
      xp[0] = 123;
      wp[0] = 456;
      r = mpn_submul_1 (wp, xp, (mp_size_t) 1, CNST_LIMB(2));
      ASSERT_ALWAYS (wp[0] == 210);
      ASSERT_ALWAYS (r == 0);
    }
}
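Every block above makes its call twice: the first call goes through __gmpn_cpuvec while it still holds initialization stubs, which install the CPU-specific routines and re-dispatch; the second call then runs the installed routine directly. A minimal model of such self-initializing dispatch (a sketch, not GMP's actual mechanism):

#include <assert.h>

static int add_generic (int x, int y) { return x + y; }

static int add_init (int x, int y);             /* forward declaration */
static int (*add_ptr) (int, int) = add_init;    /* starts at the stub  */

static int
add_init (int x, int y)
{
  add_ptr = add_generic;   /* pick the "CPU-specific" routine once */
  return add_ptr (x, y);   /* then complete the first call */
}

int main (void)
{
  int i;
  for (i = 0; i < 2; i++)  /* first call initializes, second re-tests */
    assert (add_ptr (123, 456) == 579);
  return 0;
}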
Example #9
void
mpn_toom_interpolate_8pts (mp_ptr pp, mp_size_t n,
			   mp_ptr r3, mp_ptr r7,
			   mp_size_t spt, mp_ptr ws)
{
  mp_limb_signed_t cy;
  mp_ptr r5, r1;
  r5 = (pp + 3 * n);			/* 3n+1 */
  r1 = (pp + 7 * n);			/* spt */

  /******************************* interpolation *****************************/

  DO_mpn_subrsh(r3+n, 2 * n + 1, pp, 2 * n, 4, ws);
  cy = DO_mpn_sublsh_n (r3, r1, spt, 12, ws);
  MPN_DECR_U (r3 + spt, 3 * n + 1 - spt, cy);

  DO_mpn_subrsh(r5+n, 2 * n + 1, pp, 2 * n, 2, ws);
  cy = DO_mpn_sublsh_n (r5, r1, spt, 6, ws);
  MPN_DECR_U (r5 + spt, 3 * n + 1 - spt, cy);

  r7[3*n] -= mpn_sub_n (r7+n, r7+n, pp, 2 * n);
  cy = mpn_sub_n (r7, r7, r1, spt);
  MPN_DECR_U (r7 + spt, 3 * n + 1 - spt, cy);

  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));
  ASSERT_NOCARRY(mpn_rshift(r3, r3, 3 * n + 1, 2));

  ASSERT_NOCARRY(mpn_sub_n (r5, r5, r7, 3 * n + 1));

  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));

  mpn_divexact_by45 (r3, r3, 3 * n + 1);

  ASSERT_NOCARRY(mpn_divexact_by3 (r5, r5, 3 * n + 1));

  ASSERT_NOCARRY(DO_mpn_sublsh2_n (r5, r3, 3 * n + 1, ws));

  /* last interpolation steps... */
  /* ... are mixed with recomposition */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
     |_H r1|_L r1|____||_H r5|_M_r5|_L r5|_____|_H r8|_L r8|pp

    summation scheme for remaining operations:
     |____8|n___7|n___6|n___5|n___4|n___3|n___2|n____|n____|pp
     |_H r1|_L r1|____||_H*r5|_M r5|_L r5|_____|_H_r8|_L r8|pp
	  ||_H r3|_M r3|_L*r3|
				  ||_H_r7|_M_r7|_L_r7|
		      ||-H r3|-M r3|-L*r3|
				  ||-H*r5|-M_r5|-L_r5|
  */

  cy = mpn_add_n (pp + n, pp + n, r7, n); /* Hr8+Lr7-Lr5 */
  cy-= mpn_sub_n (pp + n, pp + n, r5, n);
  if (0 > cy)
    MPN_DECR_U (r7 + n, 2*n + 1, 1);
  else
    MPN_INCR_U (r7 + n, 2*n + 1, cy);

  cy = mpn_sub_n (pp + 2*n, r7 + n, r5 + n, n); /* Mr7-Mr5 */
  MPN_DECR_U (r7 + 2*n, n + 1, cy);

  cy = mpn_add_n (pp + 3*n, r5, r7+ 2*n, n+1); /* Hr7+Lr5 */
  r5[3*n]+= mpn_add_n (r5 + 2*n, r5 + 2*n, r3, n); /* Hr5+Lr3 */
  cy-= mpn_sub_n (pp + 3*n, pp + 3*n, r5 + 2*n, n+1); /* Hr7-Hr5+Lr5-Lr3 */
  if (UNLIKELY(0 > cy))
    MPN_DECR_U (r5 + n + 1, 2*n, 1);
  else
    MPN_INCR_U (r5 + n + 1, 2*n, cy);

  ASSERT_NOCARRY(mpn_sub_n(pp + 4*n, r5 + n, r3 + n, 2*n +1)); /* Mr5-Mr3,Hr5-Hr3 */

  cy = mpn_add_1 (pp + 6*n, r3 + n, n, pp[6*n]);
  MPN_INCR_U (r3 + 2*n, n + 1, cy);
  cy = mpn_add_n (pp + 7*n, pp + 7*n, r3 + 2*n, n);
  if (LIKELY(spt != n))
    MPN_INCR_U (pp + 8*n, spt - n, cy + r3[3*n]);
  else
    ASSERT ((r3[3*n] | cy) == 0);
}
Example #10
void
mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1,
			   mp_size_t k, mp_size_t twor, int sa,
			   mp_limb_t vinf0)
{
  mp_limb_t cy, saved;
  mp_size_t twok;
  mp_size_t kk1;
  mp_ptr c1, v1, c3, vinf;

  twok = k + k;
  kk1 = twok + 1;

  c1 = c  + k;
  v1 = c1 + k;
  c3 = v1 + k;
  vinf = c3 + k;

#define v0 (c)
  /* (1) v2 <- v2-vm1 < v2+|vm1|,       (16 8 4 2 1) - (1 -1 1 -1  1) =
     thus 0 <= v2 < 50*B^(2k) < 2^6*B^(2k)             (15 9 3  3  0)
  */
  if (sa)
    ASSERT_NOCARRY (mpn_add_n (v2, v2, vm1, kk1));
  else
    ASSERT_NOCARRY (mpn_sub_n (v2, v2, vm1, kk1));

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       v1       hi(vinf)       |vm1|     v2-vm1      EMPTY */

  ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1));    /* v2 <- v2 / 3 */
						      /* (5 3 1 1 0)*/

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       v1      hi(vinf)       |vm1|     (v2-vm1)/3    EMPTY */

  /* (2) vm1 <- tm1 := (v1 - vm1) / 2  [(1 1 1 1 1) - (1 -1 1 -1 1)] / 2 =
     tm1 >= 0                                         (0  1 0  1 0)
     No carry comes out from {v1, kk1} +/- {vm1, kk1},
     and the division by two is exact.
     If (sa!=0) the sign of vm1 is negative */
  if (sa)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (vm1, v1, vm1, kk1);
#else
      ASSERT_NOCARRY (mpn_add_n (vm1, v1, vm1, kk1));
      ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (vm1, v1, vm1, kk1);
#else
      ASSERT_NOCARRY (mpn_sub_n (vm1, v1, vm1, kk1));
      ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
#endif
    }

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       v1        hi(vinf)       tm1     (v2-vm1)/3    EMPTY */

  /* (3) v1 <- t1 := v1 - v0    (1 1 1 1 1) - (0 0 0 0 1) = (1 1 1 1 0)
     t1 >= 0
  */
  vinf[0] -= mpn_sub_n (v1, v1, c, twok);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0     v1-v0        hi(vinf)       tm1     (v2-vm1)/3    EMPTY */

  /* (4) v2 <- t2 := ((v2-vm1)/3-t1)/2 = (v2-vm1-3*t1)/6
     t2 >= 0                  [(5 3 1 1 0) - (1 1 1 1 0)]/2 = (2 1 0 0 0)
  */
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (v2, v2, v1, kk1);
#else
  ASSERT_NOCARRY (mpn_sub_n (v2, v2, v1, kk1));
  ASSERT_NOCARRY (mpn_rshift (v2, v2, kk1, 1));
#endif

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0     v1-v0        hi(vinf)     tm1    (v2-vm1-3t1)/6    EMPTY */

  /* (5) v1 <- t1-tm1           (1 1 1 1 0) - (0 1 0 1 0) = (1 0 1 0 0)
     result is v1 >= 0
  */
  ASSERT_NOCARRY (mpn_sub_n (v1, v1, vm1, kk1));

  /* We do not need to read the value in vm1, so we add it in {c+k, ...} */
  cy = mpn_add_n (c1, c1, vm1, kk1);
  MPN_INCR_U (c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */
  /* Memory allocated for vm1 is now free, it can be recycled ...*/

  /* (6) v2 <- v2 - 2*vinf,     (2 1 0 0 0) - 2*(1 0 0 0 0) = (0 1 0 0 0)
     result is v2 >= 0 */
  saved = vinf[0];       /* Remember v1's highest limb (will be overwritten). */
  vinf[0] = vinf0;       /* Set the right value for vinf0                     */
#ifdef HAVE_NATIVE_mpn_sublsh1_n
  cy = mpn_sublsh1_n (v2, v2, vinf, twor);
#else
  /* Overwrite unused vm1 */
  cy = mpn_lshift (vm1, vinf, twor, 1);
  cy += mpn_sub_n (v2, v2, vm1, twor);
#endif
  MPN_DECR_U (v2 + twor, kk1 - twor, cy);

  /* Current matrix is
     [1 0 0 0 0; vinf
      0 1 0 0 0; v2
      1 0 1 0 0; v1
      0 1 0 1 0; vm1
      0 0 0 0 1] v0
     Some values are already in place (we added vm1 in the correct position)
     | vinf|  v1 |  v0 |
	      | vm1 |
     One still is in a separated area
	| +v2 |
     We have to compute v1-=vinf; vm1 -= v2,
	   |-vinf|
	      | -v2 |
     Carefully reordering operations we can avoid to compute twice the sum
     of the high half of v2 plus the low half of vinf.
  */

  /* Add the high half of t2 in {vinf} */
  if ( LIKELY(twor > k + 1) ) { /* This is the expected flow  */
    cy = mpn_add_n (vinf, vinf, v2 + k, k + 1);
    MPN_INCR_U (c3 + kk1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */
  } else { /* triggered only by very unbalanced cases like
	      (k+k+(k-2))x(k+k+1) , should be handled by toom32 */
    ASSERT_NOCARRY (mpn_add_n (vinf, vinf, v2 + k, twor));
  }
  /* (7) v1 <- v1 - vinf,       (1 0 1 0 0) - (1 0 0 0 0) = (0 0 1 0 0)
     result is >= 0 */
  /* Side effect: we also subtracted (high half) vm1 -= v2 */
  cy = mpn_sub_n (v1, v1, vinf, twor);          /* vinf is at most twor long.  */
  vinf0 = vinf[0];                     /* Save again the right value for vinf0 */
  vinf[0] = saved;
  MPN_DECR_U (v1 + twor, kk1 - twor, cy);       /* Treat the last limbs.       */

  /* (8) vm1 <- vm1-v2          (0 1 0 1 0) - (0 1 0 0 0) = (0 0 0 1 0)
     Operate only on the low half.
  */
  cy = mpn_sub_n (c1, c1, v2, k);
  MPN_DECR_U (v1, kk1, cy);

  /********************* Beginning the final phase **********************/

  /* Most of the recomposition was done */

  /* add t2 in {c+3k, ...}, but only the low half */
  cy = mpn_add_n (c3, c3, v2, k);
  vinf[0] += cy;
  ASSERT(vinf[0] >= cy); /* No carry */
  MPN_INCR_U (vinf, twor, vinf0); /* Add vinf0, propagate carry. */

#undef v0
}
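The numbered steps (1)-(8) above, replayed on single words; signed longs absorb the sa and HAVE_NATIVE branches, and each division is exact (a check of the algebra only, none of the limb handling):

#include <assert.h>

int main (void)
{
  long c0 = 4, c1 = 9, c2 = 2, c3 = 7, c4 = 5;   /* target coefficients */

  long v0   = c0;                                 /* f(0)  */
  long v1   = c0 + c1 + c2 + c3 + c4;             /* f(1)  */
  long v2   = c0 + 2*c1 + 4*c2 + 8*c3 + 16*c4;    /* f(2)  */
  long vm1  = c0 - c1 + c2 - c3 + c4;             /* f(-1) */
  long vinf = c4;                                 /* f(oo) */

  v2  = (v2 - vm1) / 3;     /* (1) */
  vm1 = (v1 - vm1) / 2;     /* (2) */
  v1  = v1 - v0;            /* (3) */
  v2  = (v2 - v1) / 2;      /* (4) */
  v1  = v1 - vm1;           /* (5) */
  v2  = v2 - 2 * vinf;      /* (6) */
  v1  = v1 - vinf;          /* (7) */
  vm1 = vm1 - v2;           /* (8) */

  assert (v1 == c2 && v2 == c3 && vm1 == c1);
  return 0;
}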
Example #11
int main()
{
    slong iter;
    flint_rand_t state;

    flint_printf("atan_taylor_rs....");
    fflush(stdout);

    flint_randinit(state);
    _flint_rand_init_gmp(state);

    for (iter = 0; iter < 100000 * arb_test_multiplier(); iter++)
    {
        mp_ptr x, y1, y2, t;
        mp_limb_t err1, err2;
        ulong N;
        mp_size_t xn;
        int alternating, cmp, result;

        N = n_randint(state, 256);
        alternating = n_randint(state, 2);
        xn = 1 + n_randint(state, 20);

        x = flint_malloc(sizeof(mp_limb_t) * xn);
        y1 = flint_malloc(sizeof(mp_limb_t) * xn);
        y2 = flint_malloc(sizeof(mp_limb_t) * xn);
        t = flint_malloc(sizeof(mp_limb_t) * xn);

        flint_mpn_rrandom(x, state->gmp_state, xn);
        x[xn - 1] &= (LIMB_ONES >> 4);

        _arb_atan_taylor_naive(y1, &err1, x, xn, N, alternating);
        _arb_atan_taylor_rs(y2, &err2, x, xn, N, alternating);

        cmp = mpn_cmp(y1, y2, xn);

        if (cmp == 0)
        {
            result = 1;
        }
        else if (cmp > 0)
        {
            mpn_sub_n(t, y1, y2, xn);
            result = flint_mpn_zero_p(t + 1, xn - 1) && (t[0] <= err2);
        }
        else
        {
            mpn_sub_n(t, y2, y1, xn);
            result = flint_mpn_zero_p(t + 1, xn - 1) && (t[0] <= err2);
        }

        if (!result)
        {
            flint_printf("FAIL\n");
            flint_printf("N = %wd xn = %wd alternating = %d\n", N, xn, alternating);
            flint_printf("x =");
            flint_mpn_debug(x, xn);
            flint_printf("y1 =");
            flint_mpn_debug(y1, xn);
            flint_printf("y2 =");
            flint_mpn_debug(y2, xn);
            abort();
        }

        flint_free(x);
        flint_free(y1);
        flint_free(y2);
        flint_free(t);
    }

    flint_randclear(state);
    flint_cleanup();
    flint_printf("PASS\n");
    return EXIT_SUCCESS;
}
Example #12
mp_limb_t
mpn_sb_divrem_mn (mp_ptr qp,
		  mp_ptr np, mp_size_t nn,
		  mp_srcptr dp, mp_size_t dn)
{
  mp_limb_t most_significant_q_limb = 0;
  mp_size_t qn = nn - dn;
  mp_size_t i;
  mp_limb_t dx, d1, n0;
  mp_limb_t dxinv;
  int use_preinv;

  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (np, nn, dp, dn));
  ASSERT (! MPN_OVERLAP_P (qp, nn-dn, dp, dn));
  ASSERT (! MPN_OVERLAP_P (qp, nn-dn, np, nn) || qp+dn >= np);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, dn);

  np += qn;
  dx = dp[dn - 1];
  d1 = dp[dn - 2];
  n0 = np[dn - 1];

  if (n0 >= dx)
    {
      if (n0 > dx || mpn_cmp (np, dp, dn - 1) >= 0)
	{
	  mpn_sub_n (np, np, dp, dn);
	  most_significant_q_limb = 1;
	}
    }

  /* use_preinv is possibly a constant, but it's left to the compiler to
     optimize away the unused code in that case.  */
  use_preinv = ABOVE_THRESHOLD (qn, DIV_SB_PREINV_THRESHOLD);
  if (use_preinv)
    invert_limb (dxinv, dx);

  for (i = qn - 1; i >= 0; i--)
    {
      mp_limb_t q;
      mp_limb_t nx;
      mp_limb_t cy_limb;

      nx = np[dn - 1];		/* FIXME: could get value from r1 */
      np--;

      if (nx == dx)
	{
	  /* This might over-estimate q, but it's probably not worth
	     the extra code here to find out.  */
	  q = GMP_NUMB_MASK;

#if 1
	  cy_limb = mpn_submul_1 (np, dp, dn, q);
#else
	  /* This should be faster on many machines */
	  cy_limb = mpn_sub_n (np + 1, np + 1, dp, dn);
	  cy = mpn_add_n (np, np, dp, dn);
	  np[dn] += cy;
#endif

	  if (nx != cy_limb)
	    {
	      mpn_add_n (np, np, dp, dn);
	      q--;
	    }

	  qp[i] = q;
	}
      else
	{
	  mp_limb_t rx, r1, r0, p1, p0;

	  /* "workaround" avoids a problem with gcc 2.7.2.3 i386 register usage
	     when np[dn-1] is used in an asm statement like umul_ppmm in
	     udiv_qrnnd_preinv.  The symptom is seg faults due to registers
	     being clobbered.  gcc 2.95 i386 doesn't have the problem. */
	  {
	    mp_limb_t  workaround = np[dn - 1];
	    if (use_preinv)
	      udiv_qrnnd_preinv (q, r1, nx, workaround, dx, dxinv);
	    else
	      {
		udiv_qrnnd (q, r1, nx, workaround << GMP_NAIL_BITS,
			    dx << GMP_NAIL_BITS);
		r1 >>= GMP_NAIL_BITS;
	      }
	  }
	  umul_ppmm (p1, p0, d1, q << GMP_NAIL_BITS);
	  p0 >>= GMP_NAIL_BITS;

	  r0 = np[dn - 2];
	  rx = 0;
	  if (r1 < p1 || (r1 == p1 && r0 < p0))
	    {
	      p1 -= p0 < d1;
	      p0 = (p0 - d1) & GMP_NUMB_MASK;
	      q--;
	      r1 = (r1 + dx) & GMP_NUMB_MASK;
	      rx = r1 < dx;
	    }

	  p1 += r0 < p0;	/* cannot carry! */
	  rx -= r1 < p1;	/* may become 11..1 if q is still too large */
	  r1 = (r1 - p1) & GMP_NUMB_MASK;
	  r0 = (r0 - p0) & GMP_NUMB_MASK;

	  cy_limb = mpn_submul_1 (np, dp, dn - 2, q);

	  /* Check if we've over-estimated q, and adjust as needed.  */
	  {
	    mp_limb_t cy1, cy2;
	    cy1 = r0 < cy_limb;
	    r0 = (r0 - cy_limb) & GMP_NUMB_MASK;
	    cy2 = r1 < cy1;
	    r1 -= cy1;
	    np[dn - 1] = r1;
	    np[dn - 2] = r0;
	    if (cy2 != rx)
	      {
		mpn_add_n (np, np, dp, dn);
		q--;
	      }
	  }
	  qp[i] = q;
	}
    }

  /* ______ ______ ______
    |__rx__|__r1__|__r0__|		partial remainder
	    ______ ______
	 - |__p1__|__p0__|		partial product to subtract
	    ______ ______
	 - |______|cylimb|

     rx is -1, 0 or 1.  If rx=1, then q is correct (it should match
     carry out).  If rx=-1 then q is too large.  If rx=0, then q might
     be too large, but it is most likely correct.
  */

  return most_significant_q_limb;
}
Example #13
void
mpn_toom2_sqr (mp_ptr pp,
	       mp_srcptr ap, mp_size_t an,
	       mp_ptr scratch)
{
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s;
  mp_limb_t cy, cy2;
  mp_ptr asm1;

#define a0  ap
#define a1  (ap + n)

  s = an >> 1;
  n = an - s;

  ASSERT (0 < s && s <= n && s >= n - 1);

  asm1 = pp;

  /* Compute asm1.  */
  if (s == n)
    {
      if (mpn_cmp (a0, a1, n) < 0)
	{
	  mpn_sub_n (asm1, a1, a0, n);
	}
      else
	{
	  mpn_sub_n (asm1, a0, a1, n);
	}
    }
  else /* n - s == 1 */
    {
      if (a0[s] == 0 && mpn_cmp (a0, a1, s) < 0)
	{
	  mpn_sub_n (asm1, a1, a0, s);
	  asm1[s] = 0;
	}
      else
	{
	  asm1[s] = a0[s] - mpn_sub_n (asm1, a0, a1, s);
	}
    }

#define v0	pp				/* 2n */
#define vinf	(pp + 2 * n)			/* s+s */
#define vm1	scratch				/* 2n */
#define scratch_out	scratch + 2 * n

  /* vm1, 2n limbs */
  TOOM2_SQR_REC (vm1, asm1, n, scratch_out);

  /* vinf, s+s limbs */
  TOOM2_SQR_REC (vinf, a1, s, scratch_out);

  /* v0, 2n limbs */
  TOOM2_SQR_REC (v0, ap, n, scratch_out);

  /* H(v0) + L(vinf) */
  cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n);

  /* L(v0) + H(v0) */
  cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n);

  /* L(vinf) + H(vinf) */
  cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + s - n);

  cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n);

  ASSERT (cy + 1  <= 3);
  ASSERT (cy2 <= 2);

  MPN_INCR_U (pp + 2 * n, s + s, cy2);
  if (LIKELY (cy <= 2))
    MPN_INCR_U (pp + 3 * n, s + s - n, cy);
  else
    MPN_DECR_U (pp + 3 * n, s + s - n, 1);
}
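Since (a0 - a1)^2 is never negative, squaring needs no sign bookkeeping, which is why this routine, unlike mpn_kara_mul_n earlier, carries no `sign` variable. The recombination identity on 16-bit halves of a single word (a sketch):

#include <assert.h>
#include <stdint.h>

int main (void)
{
  uint32_t a = 0xfedcba98u;
  uint32_t a0 = a & 0xffffu, a1 = a >> 16;

  uint64_t v0   = (uint64_t) a0 * a0;
  uint64_t vinf = (uint64_t) a1 * a1;
  uint32_t d    = a0 < a1 ? a1 - a0 : a0 - a1;   /* |a0 - a1| */
  uint64_t vm1  = (uint64_t) d * d;              /* never negative */

  /* middle coefficient: 2*a0*a1 = v0 + vinf - (a0 - a1)^2 */
  uint64_t mid = v0 + vinf - vm1;

  assert ((uint64_t) a * a == (vinf << 32) + (mid << 16) + v0);
  return 0;
}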
Example #14
/* 
   Computes an approximate quotient of { np, 2*dn } by { dp, dn } which is
   either correct or one too large. We require dp to be normalised and inv
   to be a precomputed inverse given by mpn_invert.
*/
mp_limb_t 
mpn_inv_divappr_q_n(mp_ptr qp, mp_ptr np, 
                              mp_srcptr dp, mp_size_t dn, mp_srcptr inv)
{
   mp_limb_t cy, lo, ret = 0, ret2 = 0;
   mp_ptr tp;
   TMP_DECL;

   TMP_MARK;

   ASSERT(dp[dn-1] & GMP_LIMB_HIGHBIT);
   ASSERT(mpn_is_invert(inv, dp, dn));

   if (mpn_cmp(np + dn, dp, dn) >= 0)
   {
      ret2 = 1;
      mpn_sub_n(np + dn, np + dn, dp, dn);
   }
   
   tp = TMP_ALLOC_LIMBS(2*dn + 1);
   mpn_mul(tp, np + dn - 1, dn + 1, inv, dn);
   add_ssaaaa(cy, lo, 0, np[dn - 1], 0, tp[dn]);
   ret += mpn_add_n(qp, tp + dn + 1, np + dn, dn);
   ret += mpn_add_1(qp, qp, dn, cy + 1);

   /* 
      Let X = B^dn + inv, D = { dp, dn }, N = { np, 2*dn }, then
      DX < B^{2*dn} <= D(X+1), thus
	  Let N' = { np + dn - 1, dn + 1 }
	  N'X/B^{dn+1} < B^{dn-1}N'/D <= N'X/B^{dn+1} + N'/B^{dn+1} < N'X/B^{dn+1} + 1
      N'X/B^{dn+1} < N/D <=  N'X/B^{dn+1} + 1 + 2/B
      There is either one integer in this range, or two. However, in the latter case
	  the left hand bound is either an integer or < 2/B below one.
   */
    
   if (UNLIKELY(ret == 1))
   {
      ret -= mpn_sub_1(qp, qp, dn, 1);
      ASSERT(ret == 0);
   }
  
   if (UNLIKELY((lo == ~CNST_LIMB(0)) || (lo == ~CNST_LIMB(1)))) 
   {
	   /* Special case, multiply out to get accurate quotient */
	   ret -= mpn_sub_1(qp, qp, dn, 1);
       if (UNLIKELY(ret == ~CNST_LIMB(0)))
          ret += mpn_add_1(qp, qp, dn, 1);
       /* ret is now guaranteed to be 0*/
       ASSERT(ret == 0);
       mpn_mul_n(tp, qp, dp, dn);
       mpn_sub_n(tp, np, tp, dn+1);
       while (tp[dn] || mpn_cmp(tp, dp, dn) >= 0)
	   {
		   ret += mpn_add_1(qp, qp, dn, 1);
		   tp[dn] -= mpn_sub_n(tp, tp, dp, dn);
	   }
       /* Not possible for ret == 2 as we have qp*dp <= np */
       ASSERT(ret + ret2 < 2);
   }

   TMP_FREE;

   return ret + ret2;
}
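The inequality DX < B^{2*dn} <= D(X+1) in the comment can be exercised with one-word operands. Here B = 2^16, X = floor((B^3 - 1)/d) plays the role of B^dn + inv, and the estimate floor(n*X/B^3) + 1 is verified to be the true quotient or one too large (a sketch of the approximation argument, not of the routine):

#include <assert.h>
#include <stdint.h>

int main (void)
{
  const uint32_t B = 1u << 16;
  uint32_t d, n;

  for (d = B / 2; d < B; d += 97)                /* normalised divisors */
    {
      /* X = floor((B^3 - 1)/d), so that d*X < B^3 <= d*(X + 1) */
      uint64_t X = (((uint64_t) 1 << 48) - 1) / d;

      for (n = 0; n < d << 8; n += 1009)         /* a subset of n < d*B */
        {
          uint32_t q = (uint32_t) (((uint64_t) n * X) >> 48) + 1;
          assert (q == n / d || q == n / d + 1); /* correct or one too large */
        }
    }
  return 0;
}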
Example #15
/*
  Computes {np, n} / {dp, n} mod B^n, using divide-and-conquer
  algorithm, switching to classical for n <= BDIV_Q_DC_THRESHOLD.

  Also computes a 2 limb "overflow". See sb_bdiv_q.c for a definition.

  scratch is workspace.
*/
void
mpn_dc_bdiv_q_n (mp_ptr qp, mp_ptr wp, mp_ptr np, mp_srcptr dp,
		   mp_size_t n, mp_limb_t dinv, mp_ptr scratch)
{
  mp_size_t s, t;
  mp_limb_t cy;

  ASSERT (n >= 6);
  ASSERT (! MPN_OVERLAP_P (qp, n, np, n));
  ASSERT (! MPN_OVERLAP_P (qp, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (wp, 2, np, n));
  ASSERT (! MPN_OVERLAP_P (wp, 2, dp, n));
  ASSERT (! MPN_OVERLAP_P (np, n, dp, n));

  /*
    Example with s = 4, t = 3, n = 7:

         C
         C C
         C C C
  qp  .  A B B B
      .  A A B B B
      1  A A A B B B
      0  A A A A B B B
         0 1 ...
           dp
  */

  t = n / 2;    /*  t = floor(n/2)  */
  s = n - t;    /*  s = ceil(n/2)   */

  /*  recurse into low half of quotient (region A)  */
  if (s <= DC_BDIV_Q_THRESHOLD)
    mpn_sb_bdiv_q (qp, wp, np, s, dp, s, dinv);
  else
    mpn_dc_bdiv_q_n (qp, wp, np, dp, s, dinv, scratch);

  /*  remove region B and overflow from A from N
      (if n odd, do first row of B separately --- we could have used
      mpn_mulmid, but this saves some logic) */
  mpn_mulmid_n (scratch, dp + 1, qp + (n & 1), t);
  if (n & 1)
    {
      cy = mpn_addmul_1 (scratch, dp + s, t, qp[0]);
      MPN_INCR_U (scratch + t, 2, cy);
    }
  ADDC_LIMB (cy, scratch[0], scratch[0], wp[0]);      /* overflow from A */
  MPN_INCR_U (scratch + 1, t + 1, wp[1] + cy);
  cy = mpn_sub_n (np + s, np + s, scratch, t);
  MPN_INCR_U (scratch + t, 2, cy);

  /*  recurse into top half of quotient (region C)
      (this does not overwrite {scratch + t, 2}, because n >= 6 implies
      t >= 3 implies floor(t/2) + 2 <= t) */
  if (t <= DC_BDIV_Q_THRESHOLD)
    mpn_sb_bdiv_q (qp + s, wp, np + s, t, dp, t, dinv);
  else
    mpn_dc_bdiv_q_n (qp + s, wp, np + s, dp, t, dinv, scratch);

  /*  combine overflows from B and C  */
  ADDC_LIMB (cy, wp[0], wp[0], scratch[t]);
  wp[1] += scratch[t + 1] + cy;
}
Example #16
void
mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags,
			   mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5,
			   mp_size_t w6n, mp_ptr tp)
{
  mp_size_t m;
  mp_limb_t cy;

  m = 2*n + 1;
#define w0 rp
#define w2 (rp + 2*n)
#define w6 (rp + 6*n)

  ASSERT (w6n > 0);
  ASSERT (w6n <= 2*n);

  /* Using formulas similar to Marco Bodrato's

     W5 = W5 + W4
     W1 =(W4 - W1)/2
     W4 = W4 - W0
     W4 =(W4 - W1)/4 - W6*16
     W3 =(W2 - W3)/2
     W2 = W2 - W3

     W5 = W5 - W2*65      May be negative.
     W2 = W2 - W6 - W0
     W5 =(W5 + W2*45)/2   Now >= 0 again.
     W4 =(W4 - W2)/3
     W2 = W2 - W4

     W1 = W5 - W1         May be negative.
     W5 =(W5 - W3*8)/9
     W3 = W3 - W5
     W1 =(W1/15 + W5)/2   Now >= 0 again.
     W5 = W5 - W1

     where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1),
	   W4 = f(2), W5 = f(1/2), W6 = f(oo),

     Note that most intermediate results are positive; the ones that
     may be negative are represented in two's complement. We must
     never shift right a value that may be negative, since that would
     invalidate the sign bit. On the other hand, divexact by odd
     numbers work fine with two's complement.
  */

  mpn_add_n (w5, w5, w4, m);
  if (flags & toom7_w1_neg)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (w1, w1, w4, m);
#else
      mpn_add_n (w1, w1, w4, m);  ASSERT (!(w1[0] & 1));
      mpn_rshift (w1, w1, m, 1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (w1, w4, w1, m);
#else
      mpn_sub_n (w1, w4, w1, m);  ASSERT (!(w1[0] & 1));
      mpn_rshift (w1, w1, m, 1);
#endif
    }
  mpn_sub (w4, w4, m, w0, 2*n);
  mpn_sub_n (w4, w4, w1, m);  ASSERT (!(w4[0] & 3));
  mpn_rshift (w4, w4, m, 2); /* w4>=0 */

  tp[w6n] = mpn_lshift (tp, w6, w6n, 4);
  mpn_sub (w4, w4, m, tp, w6n+1);

  if (flags & toom7_w3_neg)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (w3, w3, w2, m);
#else
      mpn_add_n (w3, w3, w2, m);  ASSERT (!(w3[0] & 1));
      mpn_rshift (w3, w3, m, 1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (w3, w2, w3, m);
#else
      mpn_sub_n (w3, w2, w3, m);  ASSERT (!(w3[0] & 1));
      mpn_rshift (w3, w3, m, 1);
#endif
    }

  mpn_sub_n (w2, w2, w3, m);

  mpn_submul_1 (w5, w2, m, 65);
  mpn_sub (w2, w2, m, w6, w6n);
  mpn_sub (w2, w2, m, w0, 2*n);

  mpn_addmul_1 (w5, w2, m, 45);  ASSERT (!(w5[0] & 1));
  mpn_rshift (w5, w5, m, 1);
  mpn_sub_n (w4, w4, w2, m);

  mpn_divexact_by3 (w4, w4, m);
  mpn_sub_n (w2, w2, w4, m);

  mpn_sub_n (w1, w5, w1, m);
  mpn_lshift (tp, w3, m, 3);
  mpn_sub_n (w5, w5, tp, m);
  mpn_divexact_by9 (w5, w5, m);
  mpn_sub_n (w3, w3, w5, m);

  mpn_divexact_by15 (w1, w1, m);
  mpn_add_n (w1, w1, w5, m);  ASSERT (!(w1[0] & 1));
  mpn_rshift (w1, w1, m, 1); /* w1>=0 now */
  mpn_sub_n (w5, w5, w1, m);

  /* These bounds are valid for the 4x4 polynomial product of toom44,
   * and they are conservative for toom53 and toom62. */
  ASSERT (w1[2*n] < 2);
  ASSERT (w2[2*n] < 3);
  ASSERT (w3[2*n] < 4);
  ASSERT (w4[2*n] < 3);
  ASSERT (w5[2*n] < 2);

  /* Addition chain. Note carries and the 2n'th limbs that need to be
   * added in.
   *
   * Special care is needed for w2[2n] and the corresponding carry,
   * since the "simple" way of adding it all together would overwrite
   * the limb at wp[2*n] and rp[4*n] (same location) with the sum of
   * the high half of w3 and the low half of w4.
   *
   *         7    6    5    4    3    2    1    0
   *    |    |    |    |    |    |    |    |    |
   *                  ||w3 (2n+1)|
   *             ||w4 (2n+1)|
   *        ||w5 (2n+1)|        ||w1 (2n+1)|
   *  + | w6 (w6n)|        ||w2 (2n+1)| w0 (2n) |  (share storage with r)
   *  -----------------------------------------------
   *  r |    |    |    |    |    |    |    |    |
   *        c7   c6   c5   c4   c3                 Carries to propagate
   */

  cy = mpn_add_n (rp + n, rp + n, w1, m);
  MPN_INCR_U (w2 + n + 1, n , cy);
  cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n);
  MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy);
  cy = mpn_add_n (rp + 4*n, w3 + n, w4, n);
  MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy);
  cy = mpn_add_n (rp + 5*n, w4 + n, w5, n);
  MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy);
  if (w6n > n + 1)
    ASSERT_NOCARRY (mpn_add (rp + 6*n, rp + 6*n, w6n, w5 + n, n + 1));
  else
    {
      ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n));
#if WANT_ASSERT
      {
	mp_size_t i;
	for (i = w6n; i <= n; i++)
	  ASSERT (w5[n + i] == 0);
      }
#endif
    }
}
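The formula block in the comment above can be replayed verbatim on single words: with signed longs the two's-complement caveats disappear, every division is exact, and the sequence recovers the coefficients. A sanity check of the algebra only, not of the carry handling:

#include <assert.h>

int main (void)
{
  /* f(x) = c0 + c1 x + ... + c6 x^6, as for a 4x4 product polynomial */
  long c0 = 3, c1 = 1, c2 = 4, c3 = 1, c4 = 5, c5 = 9, c6 = 2;

  long W0 = c0;                                                /* f(0)      */
  long W1 = c0 - 2*c1 + 4*c2 - 8*c3 + 16*c4 - 32*c5 + 64*c6;   /* f(-2)     */
  long W2 = c0 + c1 + c2 + c3 + c4 + c5 + c6;                  /* f(1)      */
  long W3 = c0 - c1 + c2 - c3 + c4 - c5 + c6;                  /* f(-1)     */
  long W4 = c0 + 2*c1 + 4*c2 + 8*c3 + 16*c4 + 32*c5 + 64*c6;   /* f(2)      */
  long W5 = 64*c0 + 32*c1 + 16*c2 + 8*c3 + 4*c4 + 2*c5 + c6;   /* 64*f(1/2) */
  long W6 = c6;                                                /* f(oo)     */

  W5 = W5 + W4;
  W1 = (W4 - W1) / 2;
  W4 = W4 - W0;
  W4 = (W4 - W1) / 4 - W6 * 16;
  W3 = (W2 - W3) / 2;
  W2 = W2 - W3;

  W5 = W5 - W2 * 65;           /* may be negative */
  W2 = W2 - W6 - W0;
  W5 = (W5 + W2 * 45) / 2;     /* >= 0 again */
  W4 = (W4 - W2) / 3;
  W2 = W2 - W4;

  W1 = W5 - W1;                /* may be negative */
  W5 = (W5 - W3 * 8) / 9;
  W3 = W3 - W5;
  W1 = (W1 / 15 + W5) / 2;     /* >= 0 again */
  W5 = W5 - W1;

  assert (W1 == c1 && W2 == c2 && W3 == c3 && W4 == c4 && W5 == c5);
  return 0;
}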
Example #17
mp_limb_t
mpn_dcpi1_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
		     mp_srcptr dp, mp_size_t dn, gmp_pi1_t *dinv)
{
  mp_size_t qn;
  mp_limb_t qh, cy, qsave;
  mp_ptr tp;
  TMP_DECL;

  TMP_MARK;

  ASSERT (dn >= 6);
  ASSERT (nn > dn);
  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);

  qn = nn - dn;
  qp += qn;
  np += nn;
  dp += dn;

  if (qn >= dn)
    {
      qn++;			/* pretend we'll need an extra limb */
      /* Reduce qn mod dn without division, optimizing small operations.  */
      do
	qn -= dn;
      while (qn > dn);

      qp -= qn;			/* point at low limb of next quotient block */
      np -= qn;			/* point in the middle of partial remainder */

      tp = TMP_SALLOC_LIMBS (dn);

      /* Perform the typically smaller block first.  */
      if (qn == 1)
	{
	  mp_limb_t q, n2, n1, n0, d1, d0;

	  /* Handle qh up front, for simplicity. */
	  qh = mpn_cmp (np - dn + 1, dp - dn, dn) >= 0;
	  if (qh)
	    ASSERT_NOCARRY (mpn_sub_n (np - dn + 1, np - dn + 1, dp - dn, dn));

	  /* A single iteration of schoolbook: One 3/2 division,
	     followed by the bignum update and adjustment. */
	  n2 = np[0];
	  n1 = np[-1];
	  n0 = np[-2];
	  d1 = dp[-1];
	  d0 = dp[-2];

	  ASSERT (n2 < d1 || (n2 == d1 && n1 <= d0));

	  if (UNLIKELY (n2 == d1) && n1 == d0)
	    {
	      q = GMP_NUMB_MASK;
	      cy = mpn_submul_1 (np - dn, dp - dn, dn, q);
	      ASSERT (cy == n2);
	    }
	  else
	    {
	      udiv_qr_3by2 (q, n1, n0, n2, n1, n0, d1, d0, dinv->inv32);

	      if (dn > 2)
		{
		  mp_limb_t cy, cy1;
		  cy = mpn_submul_1 (np - dn, dp - dn, dn - 2, q);

		  cy1 = n0 < cy;
		  n0 = (n0 - cy) & GMP_NUMB_MASK;
		  cy = n1 < cy1;
		  n1 = (n1 - cy1) & GMP_NUMB_MASK;
		  np[-2] = n0;

		  if (UNLIKELY (cy != 0))
		    {
		      n1 += d1 + mpn_add_n (np - dn, np - dn, dp - dn, dn - 1);
		      qh -= (q == 0);
		      q = (q - 1) & GMP_NUMB_MASK;
		    }
		}
	      else
		np[-2] = n0;

	      np[-1] = n1;
	    }
	  qp[0] = q;
	}
      else
	{
	  if (qn == 2)
	    qh = mpn_divrem_2 (qp, 0L, np - 2, 4, dp - 2);
	  else if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
	    qh = mpn_sbpi1_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv->inv32);
	  else
	    qh = mpn_dcpi1_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp);

	  if (qn != dn)
	    {
	      if (qn > dn - qn)
		mpn_mul (tp, qp, qn, dp - dn, dn - qn);
	      else
		mpn_mul (tp, dp - dn, dn - qn, qp, qn);

	      cy = mpn_sub_n (np - dn, np - dn, tp, dn);
	      if (qh != 0)
		cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);

	      while (cy != 0)
		{
		  qh -= mpn_sub_1 (qp, qp, qn, 1);
		  cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
		}
	    }
	}
      qn = nn - dn - qn + 1;
      while (qn > dn)
	{
	  qp -= dn;
	  np -= dn;
	  mpn_dcpi1_div_qr_n (qp, np - dn, dp - dn, dn, dinv, tp);
	  qn -= dn;
	}

      /* Since we pretended we'd need an extra quotient limb before, we now
	 have made sure the code above left just dn-1=qn quotient limbs to
	 develop.  Develop that plus a guard limb. */
      qn--;
      qp -= qn;
      np -= dn;
      qsave = qp[qn];
      mpn_dcpi1_divappr_q_n (qp, np - dn, dp - dn, dn, dinv, tp);
      MPN_COPY_INCR (qp, qp + 1, qn);
      qp[qn] = qsave;
    }
  else    /* (qn < dn) */
    {
      mp_ptr q2p;
#if 0				/* not possible since we demand nn > dn */
      if (qn == 0)
	{
	  qh = mpn_cmp (np - dn, dp - dn, dn) >= 0;
	  if (qh)
	    mpn_sub_n (np - dn, np - dn, dp - dn, dn);
	  TMP_FREE;
	  return qh;
	}
#endif

      qp -= qn;			/* point at low limb of next quotient block */
      np -= qn;			/* point in the middle of partial remainder */

      q2p = TMP_SALLOC_LIMBS (qn + 1);
      /* Should we at all check DC_DIVAPPR_Q_THRESHOLD here, or rely on
	 callers not to be silly?  */
      if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD))
	{
	  qh = mpn_sbpi1_divappr_q (q2p, np - qn - 2, 2 * (qn + 1),
				    dp - (qn + 1), qn + 1, dinv->inv32);
	}
      else
	{
	  /* It is tempting to use qp for recursive scratch and put quotient in
	     tp, but the recursive scratch needs one limb too many.  */
	  tp = TMP_SALLOC_LIMBS (qn + 1);
	  qh = mpn_dcpi1_divappr_q_n (q2p, np - qn - 2, dp - (qn + 1), qn + 1, dinv, tp);
	}
      MPN_COPY (qp, q2p + 1, qn);
    }

  TMP_FREE;
  return qh;
}
Example #18
mp_limb_t
mpn_sbpi1_bdiv_qr (mp_ptr qp,
		   mp_ptr np, mp_size_t nn,
		   mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
{
  mp_size_t qn;
  mp_size_t i;
  mp_limb_t rh;
  mp_limb_t ql;

  ASSERT (dn > 0);
  ASSERT (nn > dn);
  ASSERT ((dp[0] & 1) != 0);
  /* FIXME: Add ASSERTs for allowable overlapping; i.e., that qp = np is OK,
     but some other N/Q overlaps will not work.  */

  qn = nn - dn;

  rh = 0;

  /* To complete the negation, this value is added to q. */
  ql = 1;
  while (qn > dn)
    {
      for (i = 0; i < dn; i++)
	{
	  mp_limb_t q;

	  q = dinv * np[i];
	  np[i] = mpn_addmul_1 (np + i, dp, dn, q);
	  qp[i] = ~q;
	}
      rh += mpn_add (np + dn, np + dn, qn, np, dn);
      ql = mpn_add_1 (qp, qp, dn, ql);

      qp += dn; qn -= dn;
      np += dn; nn -= dn;
    }

  for (i = 0; i < qn; i++)
    {
      mp_limb_t q;

      q = dinv * np[i];
      np[i] = mpn_addmul_1 (np + i, dp, dn, q);
      qp[i] = ~q;
    }

  rh += mpn_add_n (np + dn, np + dn, np, qn);
  ql = mpn_add_1 (qp, qp, qn, ql);

  if (UNLIKELY (ql > 0))
    {
      /* q == 0 */
      ASSERT (rh == 0);
      return 0;
    }
  else
    {
      mp_limb_t cy;

      cy = mpn_sub_n (np + qn, np + qn, dp, dn);
      ASSERT (cy >= rh);
      return cy - rh;
    }
}
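The digit step q = dinv * np[i] relies on dinv = -1/dp[0] mod B, so that np[i] + q*d0 is 0 mod B and mpn_addmul_1 clears the low limb exactly (the routine stores ~q and fixes that up through ql, accumulating the negated quotient). A one-word model, with the inverse obtained by Newton iteration (a sketch, base 2^32 in place of B):

#include <assert.h>
#include <stdint.h>

int main (void)
{
  uint32_t d = 0x12345679u;          /* divisor, must be odd */
  uint32_t inv = d;                  /* d*d == 1 mod 8: correct to 3 bits */
  int i;

  /* Newton iteration doubles the precision: 3 -> 6 -> 12 -> 24 -> 48 bits */
  for (i = 0; i < 4; i++)
    inv *= 2 - d * inv;
  assert (d * inv == 1u);            /* inverse mod 2^32 */

  uint32_t dinv = -inv;              /* -1/d mod 2^32, the dinv above */
  uint32_t n = 0xdeadbeefu;
  uint32_t q = dinv * n;             /* the digit the loop would pick */
  assert (n + q * d == 0u);          /* low word cancels exactly */
  return 0;
}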
Example #19
/* Put in Q={qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
   with the most significant limb of the quotient as return value (0 or 1).
   Assumes the most significant bit of D is set. Clobbers N.

   The approximate quotient Q satisfies - 2(n-1) < N/D - Q <= 4.
*/
static mp_limb_t
mpfr_divhigh_n_basecase (mpfr_limb_ptr qp, mpfr_limb_ptr np,
                         mpfr_limb_srcptr dp, mp_size_t n)
{
  mp_limb_t qh, d1, d0, dinv, q2, q1, q0;
  mpfr_pi1_t dinv2;

  np += n;

  if ((qh = (mpn_cmp (np, dp, n) >= 0)))
    mpn_sub_n (np, np, dp, n);

  /* now {np, n} is less than D={dp, n}, which implies np[n-1] <= dp[n-1] */

  d1 = dp[n - 1];

  if (n == 1)
    {
      invert_limb (dinv, d1);
      umul_ppmm (q1, q0, np[0], dinv);
      qp[0] = np[0] + q1;
      return qh;
    }

  /* now n >= 2 */
  d0 = dp[n - 2];
  invert_pi1 (dinv2, d1, d0);
  /* dinv2.inv32 = floor ((B^3 - 1) / (d0 + d1 B)) - B */
  while (n > 1)
    {
      /* Invariant: it remains to reduce n limbs from N (in addition to the
         initial low n limbs).
         Since n >= 2 here, necessarily we had n >= 2 initially, which means
         that in addition to the limb np[n-1] to reduce, we have at least 2
         extra limbs, thus accessing np[n-3] is valid. */

      /* warning: we can have np[n-1]=d1 and np[n-2]=d0, but since {np,n} < D,
         the largest possible partial quotient is B-1 */
      if (MPFR_UNLIKELY(np[n - 1] == d1 && np[n - 2] == d0))
        q2 = ~ (mp_limb_t) 0;
      else
        udiv_qr_3by2 (q2, q1, q0, np[n - 1], np[n - 2], np[n - 3],
                      d1, d0, dinv2.inv32);
      /* since q2 = floor((np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0)),
         we have q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0),
         thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0)
         and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1)
         thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0
         which proves that at most one correction is needed */
      q0 = mpn_submul_1 (np - 1, dp, n, q2);
      if (MPFR_UNLIKELY(q0 > np[n - 1]))
        {
          mpn_add_n (np - 1, np - 1, dp, n);
          q2 --;
        }
      qp[--n] = q2;
      dp ++;
    }

  /* we have B+dinv2 = floor((B^3-1)/(d1*B+d0)) < B^2/d1
     q1 = floor(np[0]*(B+dinv2)/B) <= floor(np[0]*B/d1)
        <= floor((np[0]*B+np[1])/d1)
     thus q1 is not larger than the true quotient.
     q1 > np[0]*(B+dinv2)/B - 1 > np[0]*(B^3-1)/(d1*B+d0)/B - 2
     For d1*B+d0 <> B^2/2, we have B+dinv2 = floor(B^3/(d1*B+d0))
     thus q1 > np[0]*B^2/(d1*B+d0) - 2, i.e.,
     (d1*B+d0)*q1 > np[0]*B^2 - 2*(d1*B+d0)
     d1*B*q1 > np[0]*B^2 - 2*d1*B - 2*d0 - d0*q1 >= np[0]*B^2 - 2*d1*B - B^2
     thus q1 > np[0]*B/d1 - 2 - B/d1 > np[0]*B/d1 - 4.

     For d1*B+d0 = B^2/2, dinv2 = B-1 thus q1 > np[0]*(2B-1)/B - 1 >
     np[0]*B/d1 - 2.

     In all cases, if q = floor((np[0]*B+np[1])/d1), we have:
     q - 4 <= q1 <= q
  */
  umul_ppmm (q1, q0, np[0], dinv2.inv32);
  qp[0] = np[0] + q1;

  return qh;
}
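/* The preinverses above follow the definitions stated in the comments:
   invert_limb(d) is floor((B^2-1)/d) - B for a normalized d, and
   dinv2.inv32 is the analogous 3/2 preinverse from invert_pi1.  A sketch
   of the single-limb case, assuming 64-bit limbs and the availability of
   unsigned __int128; invert_limb_ref is an illustrative name, not MPFR's
   implementation. */
#include <stdint.h>

static uint64_t
invert_limb_ref (uint64_t d)    /* d must have its high bit set */
{
  /* B^2 - 1 - B*d: complementing d<<64 sets the low limb to B-1 and the
     high limb to B-1-d, which is exactly the wanted numerator */
  unsigned __int128 num = ~((unsigned __int128) d << 64);
  return (uint64_t) (num / d);  /* floor((B^2-1)/d) - B, fits in one limb */
}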
Example #20
0
/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1)
 *
 * The result is expected to be ZERO if and only if one of the operands
 * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
 * B^rn-1. This should not be a problem if mulmod_bnm1 is used to
 * combine results and obtain a natural number when one knows in
 * advance that the final value is less than (B^rn-1).
 * Moreover it should not be a problem if mulmod_bnm1 is used to
 * compute the full product with an+bn <= rn, because this condition
 * implies (B^an-1)(B^bn-1) < (B^rn-1) .
 *
 * Requires 0 < bn <= an <= rn and an + bn > rn/2
 * Scratch need: rn + (need for recursive call OR rn + 4). This gives
 *
 * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4
 */
void
mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp)
{
  ASSERT (0 < bn);
  ASSERT (bn <= an);
  ASSERT (an <= rn);

  if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD))
    {
      if (UNLIKELY (bn < rn))
	{
	  if (UNLIKELY (an + bn <= rn))
	    {
	      mpn_mul (rp, ap, an, bp, bn);
	    }
	  else
	    {
	      mp_limb_t cy;
	      mpn_mul (tp, ap, an, bp, bn);
	      cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn);
	      MPN_INCR_U (rp, rn, cy);
	    }
	}
      else
	mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp);
    }
  else
    {
      mp_size_t n;
      mp_limb_t cy;
      mp_limb_t hi;

      n = rn >> 1;

      /* We need at least an + bn >= n, to be able to fit one of the
	 recursive products at rp. Requiring strict inequality makes
	 the code slightly simpler. If desired, we could avoid this
	 restriction by initially halving rn as long as rn is even and
	 an + bn <= rn/2. */

      ASSERT (an + bn > n);

      /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
	 and crt together as

	 x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
      */

#define a0 ap
#define a1 (ap + n)
#define b0 bp
#define b1 (bp + n)

#define xp  tp	/* 2n + 2 */
      /* am1  maybe in {xp, n} */
      /* bm1  maybe in {xp + n, n} */
#define sp1 (tp + 2*n + 2)
      /* ap1  maybe in {sp1, n + 1} */
      /* bp1  maybe in {sp1 + n + 1, n + 1} */

      {
	mp_srcptr am1, bm1;
	mp_size_t anm, bnm;
	mp_ptr so;

	bm1 = b0;
	bnm = bn;
	if (LIKELY (an > n))
	  {
	    am1 = xp;
	    cy = mpn_add (xp, a0, n, a1, an - n);
	    MPN_INCR_U (xp, n, cy);
	    anm = n;
	    so = xp + n;
	    if (LIKELY (bn > n))
	      {
		bm1 = so;
		cy = mpn_add (so, b0, n, b1, bn - n);
		MPN_INCR_U (so, n, cy);
		bnm = n;
		so += n;
	      }
	  }
	else
	  {
	    so = xp;
	    am1 = a0;
	    anm = an;
	  }

	mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so);
      }

      {
	int       k;
	mp_srcptr ap1, bp1;
	mp_size_t anp, bnp;

	bp1 = b0;
	bnp = bn;
	if (LIKELY (an > n)) {
	  ap1 = sp1;
	  cy = mpn_sub (sp1, a0, n, a1, an - n);
	  sp1[n] = 0;
	  MPN_INCR_U (sp1, n + 1, cy);
	  anp = n + ap1[n];
	  if (LIKELY (bn > n)) {
	    bp1 = sp1 + n + 1;
	    cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n);
	    sp1[2*n+1] = 0;
	    MPN_INCR_U (sp1 + n + 1, n + 1, cy);
	    bnp = n + bp1[n];
	  }
	} else {
	  ap1 = a0;
	  anp = an;
	}

	if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
	  k=0;
	else
	  {
	    int mask;
	    k = mpn_fft_best_k (n, 0);
	    mask = (1<<k) - 1;
	    while (n & mask) { k--; mask >>= 1; }
	  }
	if (k >= FFT_FIRST_K)
	  xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k);
	else if (UNLIKELY (bp1 == b0))
	  {
	    ASSERT (anp + bnp <= 2*n+1);
	    ASSERT (anp + bnp > n);
	    ASSERT (anp >= bnp);
	    mpn_mul (xp, ap1, anp, bp1, bnp);
	    anp = anp + bnp - n;
	    ASSERT (anp <= n || xp[2*n]==0);
	    anp -= anp > n;
	    cy = mpn_sub (xp, xp, n, xp + n, anp);
	    xp[n] = 0;
	    MPN_INCR_U (xp, n+1, cy);
	  }
	else
	  mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp);
      }

      /* Here the CRT recomposition begins.

	 xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
	 Division by 2 is a bitwise rotation.

	 Assumes xp normalised mod (B^n+1).

	 The residue class [0] is represented by [B^n-1]; except when
	 both inputs are ZERO.
      */

#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
#if HAVE_NATIVE_mpn_rsh1add_nc
      cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
      hi = cy << (GMP_NUMB_BITS - 1);
      cy = 0;
      /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
	 overflows, i.e. a further increment will not overflow again. */
#else /* ! _nc */
      cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
	 the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
#endif
#if GMP_NAIL_BITS == 0
      add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
#else
      cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
      rp[n-1] ^= hi;
#endif
#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
#if HAVE_NATIVE_mpn_add_nc
      cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
#else /* ! _nc */
      cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
#endif
      cy += (rp[0]&1);
      mpn_rshift(rp, rp, n, 1);
      ASSERT (cy <= 2);
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* We can have cy != 0 only if hi = 0... */
      ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
      rp[n-1] |= hi;
      /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
#endif
      ASSERT (cy <= 1);
      /* Next increment can not overflow, read the previous comments about cy. */
      ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
      MPN_INCR_U(rp, n, cy);

      /* Compute the highest half:
	 ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
       */
      if (UNLIKELY (an + bn < rn))
	{
	  /* Note that in this case, the only way the result can equal
	     zero mod B^{rn} - 1 is if one of the inputs is zero, and
	     then the output of both the recursive calls and this CRT
	     reconstruction is zero, not B^{rn} - 1. Which is good,
	     since the latter representation doesn't fit in the output
	     area.*/
	  cy = mpn_sub_n (rp + n, rp, xp, an + bn - n);

	  /* FIXME: This subtraction of the high parts is not really
	     necessary, we do it to get the carry out, and for sanity
	     checking. */
	  cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n,
				   xp + an + bn - n, rn - (an + bn), cy);
	  ASSERT (an + bn == rn - 1 ||
		  mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn)));
	  cy = mpn_sub_1 (rp, rp, an + bn, cy);
	  ASSERT (cy == (xp + an + bn - n)[0]);
	}
      else
	{
	  cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
	  /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
	     DECR will affect _at most_ the lowest n limbs. */
	  MPN_DECR_U (rp, 2*n, cy);
	}
#undef a0
#undef a1
#undef b0
#undef b1
#undef xp
#undef sp1
    }
}
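/* A tiny scalar check of the CRT identity used above,
   x = -xp*B^n + (B^n + 1) * ((xp + xm)/2 mod (B^n - 1)),
   taken at B^n = 16, so the modulus is B^2n - 1 = 255 = 15 * 17 and the
   division by 2 becomes multiplication by 8 (2*8 = 16 == 1 mod 15).
   Illustrative only; it exercises the identity, not the mpn code. */
#include <assert.h>

int
main (void)
{
  unsigned Bn = 16, m1 = Bn - 1, p1 = Bn + 1, m = Bn * Bn - 1;
  unsigned x;

  for (x = 0; x < m; x++)
    {
      unsigned xm = x % m1;                       /* residue mod B^n - 1 */
      unsigned xp = x % p1;                       /* residue mod B^n + 1 */
      unsigned h = ((xp + xm) * (Bn / 2)) % m1;   /* (xp + xm)/2 mod 15 */
      unsigned r = (p1 * h + m - (xp * Bn) % m) % m;
      assert (r == x);                            /* recombination is exact */
    }
  return 0;
}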
Example #21
0
void
mpn_toom4_mul_n (mp_ptr rp, mp_srcptr up,
		          mp_srcptr vp, mp_size_t n)
{
  mp_size_t ind;
  mp_limb_t cy, cy2, r30, r31;
  mp_ptr tp;
  mp_size_t sn, n1, n2, n3, n4, n5, n6, n7, n8, rpn, t4, h1;
  TMP_DECL;

  sn = (n + 3) / 4;

  h1 = n - 3*sn;
  
#define a0 (up)
#define a1 (up + sn)
#define a2 (up + 2*sn)
#define a3 (up + 3*sn)
#define b0 (vp)
#define b1 (vp + sn)
#define b2 (vp + 2*sn)
#define b3 (vp + 3*sn)

   t4 = 2*sn+2; // allows mult of 2 integers of sn + 1 limbs

   TMP_MARK;

   tp = TMP_ALLOC_LIMBS(4*t4 + 5*(sn + 1));

#define u2 (tp + 4*t4)
#define u3 (tp + 4*t4 + (sn+1))
#define u4 (tp + 4*t4 + 2*(sn+1))
#define u5 (tp + 4*t4 + 3*(sn+1))
#define u6 (tp + 4*t4 + 4*(sn+1))

   u6[sn] = mpn_add(u6, a1, sn, a3, h1);
   u5[sn] = mpn_add_n(u5, a2, a0, sn);
   mpn_add_n(u3, u5, u6, sn + 1);
   n4 = sn + 1;
   if (mpn_cmp(u5, u6, sn + 1) >= 0)
      mpn_sub_n(u4, u5, u6, sn + 1);
   else
   {  
      mpn_sub_n(u4, u6, u5, sn + 1);
      n4 = -n4;
   }

   u6[sn] = mpn_add(u6, b1, sn, b3, h1);
   u5[sn] = mpn_add_n(u5, b2, b0, sn);
   mpn_add_n(r2, u5, u6, sn + 1);
   n5 = sn + 1;
   if (mpn_cmp(u5, u6, sn + 1) >= 0)
      mpn_sub_n(u5, u5, u6, sn + 1);
   else
   {  
      mpn_sub_n(u5, u6, u5, sn + 1);
      n5 = -n5;
   }
 
   MUL_TC4_UNSIGNED(r3, n3, u3, sn + 1, r2, sn + 1); /* 1 */
   MUL_TC4(r4, n4, u4, n4, u5, n5); /* -1 */
   
#if HAVE_NATIVE_mpn_addlsh_n
   r1[sn] = mpn_addlsh_n(r1, a2, a0, sn, 2);
   mpn_lshift(r1, r1, sn + 1, 1);
   cy = mpn_addlsh_n(r2, a3, a1, h1, 2);
#else
   r1[sn] = mpn_lshift(r1, a2, sn, 1);
   MPN_COPY(r2, a3, h1);
   r1[sn] += mpn_addmul_1(r1, a0, sn, 8);
   cy = mpn_addmul_1(r2, a1, h1, 4);
#endif
   if (sn > h1) 
   {
      cy2 = mpn_lshift(r2 + h1, a1 + h1, sn - h1, 2);
      cy = cy2 + mpn_add_1(r2 + h1, r2 + h1, sn - h1, cy);
   }
   r2[sn] = cy;
   mpn_add_n(u5, r1, r2, sn + 1);
   n6 = sn + 1;
   if (mpn_cmp(r1, r2, sn + 1) >= 0)
      mpn_sub_n(u6, r1, r2, sn + 1);
   else
   {  
      mpn_sub_n(u6, r2, r1, sn + 1);
      n6 = -n6;
   }
 
#if HAVE_NATIVE_mpn_addlsh_n
   r1[sn] = mpn_addlsh_n(r1, b2, b0, sn, 2);
   mpn_lshift(r1, r1, sn + 1, 1);
   cy = mpn_addlsh_n(r2, b3, b1, h1, 2);
#else
   r1[sn] = mpn_lshift(r1, b2, sn, 1);
   MPN_COPY(r2, b3, h1);
   r1[sn] += mpn_addmul_1(r1, b0, sn, 8);
   cy = mpn_addmul_1(r2, b1, h1, 4);
#endif
   if (sn > h1) 
   {
      cy2 = mpn_lshift(r2 + h1, b1 + h1, sn - h1, 2);
      cy = cy2 + mpn_add_1(r2 + h1, r2 + h1, sn - h1, cy);
   }
   r2[sn] = cy;
   mpn_add_n(u2, r1, r2, sn + 1);
   n8 = sn + 1;
   if (mpn_cmp(r1, r2, sn + 1) >= 0)
      mpn_sub_n(r2, r1, r2, sn + 1);
   else
   {  
      mpn_sub_n(r2, r2, r1, sn + 1);
      n8 = -n8;
   }
    
   r30 = r3[0];
   r31 = r3[1];
   MUL_TC4_UNSIGNED(r5, n5, u5, sn + 1, u2, sn + 1); /* 1/2 */
   MUL_TC4(r6, n6, u6, n6, r2, n8); /* -1/2 */
   r3[1] = r31;

#if HAVE_NATIVE_mpn_addlsh1_n
   cy = mpn_addlsh1_n(u2, a2, a3, h1);
   if (sn > h1)
      cy = mpn_add_1(u2 + h1, a2 + h1, sn - h1, cy); 
   u2[sn] = cy;
   u2[sn] = 2*u2[sn] + mpn_addlsh1_n(u2, a1, u2, sn);     
   u2[sn] = 2*u2[sn] + mpn_addlsh1_n(u2, a0, u2, sn);     
#else
   MPN_COPY(u2, a0, sn);
   u2[sn] = mpn_addmul_1(u2, a1, sn, 2);
   u2[sn] += mpn_addmul_1(u2, a2, sn, 4);
   cy = mpn_addmul_1(u2, a3, h1, 8);
   if (sn > h1) cy = mpn_add_1(u2 + h1, u2 + h1, sn - h1, cy);
   u2[sn] += cy;
#endif

#if HAVE_NATIVE_mpn_addlsh1_n
   cy = mpn_addlsh1_n(r1, b2, b3, h1);
   if (sn > h1)
      cy = mpn_add_1(r1 + h1, b2 + h1, sn - h1, cy); 
   r1[sn] = cy;
   r1[sn] = 2*r1[sn] + mpn_addlsh1_n(r1, b1, r1, sn);     
   r1[sn] = 2*r1[sn] + mpn_addlsh1_n(r1, b0, r1, sn);     
#else
   MPN_COPY(r1, b0, sn);
   r1[sn] = mpn_addmul_1(r1, b1, sn, 2);
   r1[sn] += mpn_addmul_1(r1, b2, sn, 4);
   cy = mpn_addmul_1(r1, b3, h1, 8);
   if (sn > h1) cy = mpn_add_1(r1 + h1, r1 + h1, sn - h1, cy);
   r1[sn] += cy;
#endif
   
   MUL_TC4_UNSIGNED(r2, n2, u2, sn + 1, r1, sn + 1); /* 2 */
   
   MUL_TC4_UNSIGNED(r1, n1, a3, h1, b3, h1); /* oo */
   MUL_TC4_UNSIGNED(r7, n7, a0, sn, b0, sn); /* 0 */

   TC4_DENORM(r1, n1, t4 - 1);

/*	rp        rp1          rp2           rp3          rp4           rp5         rp6           rp7
<----------- r7-----------><------------r5-------------->            
                                                       <-------------r3------------->

              <-------------r6------------->                        < -----------r2------------>{           }
                                         <-------------r4-------------->         <--------------r1---->
*/

   mpn_toom4_interpolate(rp, &rpn, sn, tp, t4 - 1, n4, n6, r30);

   if (rpn != 2*n) 
   {
	  MPN_ZERO((rp + rpn), 2*n - rpn);
   }

   TMP_FREE;
}
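/* The MUL_TC4* calls above implement Toom-4 by evaluation at the points
   marked in the comments (1, -1, 2, 1/2, -1/2, 0, oo): seven values
   determine the seven coefficients of the degree-6 product polynomial.
   A scalar illustration of the point-value principle for x = 1 and
   x = -1, with plain C integers standing in for the sn-limb pieces
   (a sketch, not limb code): */
#include <assert.h>

int
main (void)
{
  long a[4] = {3, 1, 4, 1}, b[4] = {5, 9, 2, 6}, c[7] = {0};
  long a1, b1, am, bm, c1 = 0, cm = 0, s = 1;
  int i, j;

  for (i = 0; i < 4; i++)          /* schoolbook product c(x) = a(x)*b(x) */
    for (j = 0; j < 4; j++)
      c[i + j] += a[i] * b[j];

  a1 = a[0] + a[1] + a[2] + a[3];  /* a(1), cf. u3 above */
  b1 = b[0] + b[1] + b[2] + b[3];
  am = a[0] - a[1] + a[2] - a[3];  /* +-a(-1), cf. u4 above */
  bm = b[0] - b[1] + b[2] - b[3];

  for (i = 0; i < 7; i++)
    {
      c1 += c[i];                  /* c(1) */
      cm += s * c[i];              /* c(-1) */
      s = -s;
    }
  assert (c1 == a1 * b1 && cm == am * bm);
  return 0;
}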
Example #22
0
mp_limb_t
mpn_dc_div_qr (mp_ptr qp,
		  mp_ptr np, mp_size_t nn,
		  mp_srcptr dp, mp_size_t dn,
		  mp_limb_t dinv)
{
  mp_size_t qn;
  mp_limb_t qh, cy;
  mp_ptr tp;
  TMP_DECL;

  TMP_MARK;

  ASSERT (dn >= 6);		/* to adhere to mpn_sb_div_qr's limits */
  ASSERT (nn - dn >= 3);	/* to adhere to mpn_sb_div_qr's limits */
  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);

  tp = TMP_ALLOC_LIMBS (DC_DIVAPPR_Q_N_ITCH(dn));

  qn = nn - dn;
  qp += qn;
  np += nn;
  dp += dn;

  if (qn > dn)
    {
      /* Reduce qn mod dn without division, optimizing small operations.  */
      do
	qn -= dn;
      while (qn > dn);

      qp -= qn;			/* point at low limb of next quotient block */
      np -= qn;			/* point in the middle of partial remainder */

      /* Perform the typically smaller block first.  */
      if (qn == 1)
	{
	  mp_limb_t q, n2, n1, n0, d1, d0, d11, d01;

	  /* Handle qh up front, for simplicity. */
	  qh = mpn_cmp (np - dn + 1, dp - dn, dn) >= 0;
	  if (qh)
	    ASSERT_NOCARRY (mpn_sub_n (np - dn + 1, np - dn + 1, dp - dn, dn));

	  /* A single iteration of schoolbook: One 3/2 division,
	     followed by the bignum update and adjustment. */
	  n2 = np[0];
	  n1 = np[-1];
	  n0 = np[-2];
	  d1 = dp[-1];
	  d0 = dp[-2];
     d01 = d0 + 1;
     d11 = d1 + (d01 < d0);

	  ASSERT (n2 < d1 || (n2 == d1 && n1 <= d0));

	  if (UNLIKELY (n2 == d1) && n1 == d0)
	    {
	      q = GMP_NUMB_MASK;
	      cy = mpn_submul_1 (np - dn, dp - dn, dn, q);
	      ASSERT (cy == n2);
	    }
	  else
	    {
	      mpir_divrem32_preinv2 (q, n1, n0, n2, n1, n0, d11, d01, d1, d0, dinv);

	      if (dn > 2)
		{
		  mp_limb_t cy, cy1;
		  cy = mpn_submul_1 (np - dn, dp - dn, dn - 2, q);

		  cy1 = n0 < cy;
		  n0 = (n0 - cy) & GMP_NUMB_MASK;
		  cy = n1 < cy1;
		  n1 = (n1 - cy1) & GMP_NUMB_MASK;
		  np[-2] = n0;

		  if (UNLIKELY (cy != 0))
		    {
		      n1 += d1 + mpn_add_n (np - dn, np - dn, dp - dn, dn - 1);
		      qh -= (q == 0);
		      q = (q - 1) & GMP_NUMB_MASK;
		    }
		}
	      else
		np[-2] = n0;

	      np[-1] = n1;
	    }
	  qp[0] = q;
	}
      else
	{
      /* Do a 2qn / qn division */
	  if (qn == 2)
	    qh = mpn_divrem_2 (qp, 0L, np - 2, 4, dp - 2); /* FIXME: obsolete function. Use 5/3 division? */
	  else if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
	    qh = mpn_sb_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv);
	  else
	    qh = mpn_dc_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp);

	  if (qn != dn)
	    {
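	      /* mpn_mul wants its longer operand first, hence the branch */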
	      if (qn > dn - qn)
		mpn_mul (tp, qp, qn, dp - dn, dn - qn);
	      else
		mpn_mul (tp, dp - dn, dn - qn, qp, qn);

	      cy = mpn_sub_n (np - dn, np - dn, tp, dn);
	      if (qh != 0)
		cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);

	      while (cy != 0)
		{
		  qh -= mpn_sub_1 (qp, qp, qn, 1);
		  cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
		}
	    }
	}

      qn = nn - dn - qn;
      do
	{
	  qp -= dn;
	  np -= dn;
	  ASSERT_NOCARRY(mpn_dc_div_qr_n (qp, np - dn, dp - dn, dn, dinv, tp));
	  qn -= dn;
	}
      while (qn > 0);
    }
  else
    {
      qp -= qn;			/* point at low limb of next quotient block */
      np -= qn;			/* point in the middle of partial remainder */

      if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
	qh = mpn_sb_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv);
      else
	qh = mpn_dc_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp);

      if (qn != dn)
	{
	  if (qn > dn - qn)
	    mpn_mul (tp, qp, qn, dp - dn, dn - qn);
	  else
	    mpn_mul (tp, dp - dn, dn - qn, qp, qn);

	  cy = mpn_sub_n (np - dn, np - dn, tp, dn);
	  if (qh != 0)
	    cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);

	  while (cy != 0)
	    {
	      qh -= mpn_sub_1 (qp, qp, qn, 1);
	      cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
	    }
	}
    }

  TMP_FREE;
  return qh;
}
Example #23
0
void
mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
{
  mp_limb_t cy;
  mp_size_t n3;
  mp_size_t n3p1;
  n3 = 3 * n;
  n3p1 = n3 + 1;

#define   r4    (pp + n3)			/* 3n+1 */
#define   r2    (pp + 7 * n)			/* 3n+1 */
#define   r0    (pp + 11 * n)			/* s+t <= 2*n */

  /******************************* interpolation *****************************/
  if (half != 0) {
    cy = mpn_sub_n (r3, r3, r0, spt);
    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);

    cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);

    cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
    MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
  };

  r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
#else
  ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
  mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
  MP_PTR_SWAP(r1, wsi);
#endif

  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
#else
  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
  MP_PTR_SWAP(r5, wsi);
#endif

  r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);

#if AORSMUL_FASTER_AORS_AORSLSH
  mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
#else
  mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
  DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
#endif
  /* A division by 2835x4 follows. Warning: the operand can be negative! */
  mpn_divexact_by2835x4(r4, r4, n3p1);
  if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
    r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));

#if AORSMUL_FASTER_2AORSLSH
  mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
#else
  DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
  DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
#endif
  mpn_divexact_by255(r5, r5, n3p1);

  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));

#if AORSMUL_FASTER_3AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
#else
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
#endif
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
  mpn_divexact_by42525(r1, r1, n3p1);

#if AORSMUL_FASTER_AORS_2AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
#else
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
  ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
#endif
  mpn_divexact_by9x4(r2, r2, n3p1);

  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));

  mpn_sub_n (r4, r2, r4, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));

  mpn_add_n (r5, r5, r1, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));

  /* last interpolation steps... */
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
  /* ... could be mixed with recomposition
	||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
  */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp

    summation scheme for remaining operations:
    |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
  */

  cy = mpn_add_n (pp + n, pp + n, r5, n);
  cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
#if HAVE_NATIVE_mpn_add_nc
  cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
#else
  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
  cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
#endif
  MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);

  pp[2 * n3] += mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
  cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
#else
  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
  cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);

  pp[10 * n] += mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
  if (half) {
    cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
#if HAVE_NATIVE_mpn_add_nc
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
    }
#else
    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
    }
#endif
  } else {
    ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
  }

#undef   r0
#undef   r2
#undef   r4
}
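/* The mpn_divexact_by* calls above divide exactly by small odd constants
   (255, 42525, ...).  A minimal sketch of the underlying exact-division
   idea for a single odd one-limb divisor, Jebelean-style: multiply each
   limb by the divisor's 2-adic inverse and propagate a borrow.  The names
   and the use of unsigned __int128 are assumptions of this sketch, not
   library code; an even divisor such as 2835*4 would need its power of
   two handled separately, e.g. by a right shift. */
#include <stdint.h>

static uint64_t
inv2adic (uint64_t d)               /* 2-adic inverse of an odd d */
{
  uint64_t v = d;
  int bits;
  for (bits = 3; bits < 64; bits *= 2)
    v *= 2 - d * v;                 /* Newton lifting, doubles precision */
  return v;                         /* d * v == 1 mod 2^64 */
}

/* {qp,n} <- {xp,n} / c, valid only when the division is exact and c odd */
static void
divexact_1_sketch (uint64_t *qp, const uint64_t *xp, int n, uint64_t c)
{
  uint64_t inv = inv2adic (c), borrow = 0;
  int i;

  for (i = 0; i < n; i++)
    {
      uint64_t s = xp[i] - borrow;          /* wraps; the wrap is a borrow */
      uint64_t q = s * inv;                 /* q*c == s mod 2^64, exact limb */
      unsigned __int128 p = (unsigned __int128) q * c;
      qp[i] = q;
      borrow = (uint64_t) (p >> 64) + (xp[i] < borrow);
    }
}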
Example #24
0
mp_limb_t
mpn_sb_div_q (mp_ptr qp,
		 mp_ptr np, mp_size_t nn,
		 mp_srcptr dp, mp_size_t dn,
		 mp_limb_t dinv)
{
  mp_limb_t qh;
  mp_size_t qn, i;
  mp_limb_t n1, n0;
  mp_limb_t d1, d0, d11, d01;
  mp_limb_t cy, cy1;
  mp_limb_t q;
  mp_limb_t flag;

  mp_size_t dn_orig = dn, qn_orig;
  mp_srcptr dp_orig = dp;
  mp_ptr np_orig = np;

  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

  np += nn;

  qn = nn - dn;
  if (qn + 1 < dn)
    {
      dp += dn - (qn + 1);
      dn = qn + 1;
    }

  qh = mpn_cmp (np - dn, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (np - dn, np - dn, dp, dn);

  if (dn <= SB_DIVAPPR_Q_SMALL_THRESHOLD)
     {
   qn_orig = qn;

   /* Reduce until dn - 2 >= qn */
   for (qn--, np--; qn > dn - 2; qn--)
     {
       /* fetch next word */
       cy = np[0];

       np--;
       mpir_divapprox32_preinv2(q, cy, np[0], dinv);
      
	    /* np -= dp*q */
       cy -= mpn_submul_1(np - dn + 1, dp, dn, q);

       /* correct if remainder is too large */
       if (UNLIKELY(cy || np[0] >= dp[dn - 1]))
         {
           if (cy || mpn_cmp(np - dn + 1, dp, dn) >= 0)
             {
               q++;
               cy -= mpn_sub_n(np - dn + 1, np - dn + 1, dp, dn);
             }
         }

       qp[qn] = q;
     }
   
   qn++;
   dp = dp + dn - qn - 1; /* make dp length qn + 1 */
   
   flag = ~CNST_LIMB(0);

   for ( ; qn > 0; qn--)
     {
       /* fetch next word */
       cy = np[0];
 
       np--;
       /* rare case where truncation ruins normalisation */
       if (cy > dp[qn] || (cy == dp[qn] && mpn_cmp(np - qn + 1, dp, qn) >= 0))
         {
           __div_helper(qp, np - qn, dp, qn);
           flag = 0;
           break;
         }
       
       mpir_divapprox32_preinv2(q, cy, np[0], dinv);
         
       /* np -= dp*q */
       cy -= mpn_submul_1(np - qn, dp, qn + 1, q);

       /* correct if remainder is too large */
       if (UNLIKELY(cy || np[0] >= dp[qn]))
         {
           if (cy || mpn_cmp(np - qn, dp, qn + 1) >= 0)
             {
               q++;
               cy -= mpn_sub_n(np - qn, np - qn, dp, qn + 1);
             }
         }
       
       qp[qn - 1] = q;
       dp++;
     }

     np--;
     n1 = np[1];
     qn = qn_orig;
     } 
  else
     {
  qp += qn;

  dn -= 2;			/* offset dn by 2 for main division loops,
				   saving two iterations in mpn_submul_1.  */
  d1 = dp[dn + 1];
  d0 = dp[dn + 0];

  d01 = d0 + 1;
  d11 = d1 + (d01 < d0);
  
  np -= 2;

  n1 = np[1];

  for (i = qn - (dn + 2); i >= 0; i--)
    {
      np--;
      if (UNLIKELY (n1 == d1) && np[1] == d0)
	{
	  q = GMP_NUMB_MASK;
	  mpn_submul_1 (np - dn, dp, dn + 2, q);
	  n1 = np[1];		/* update n1, last loop's value will now be invalid */
	}
      else
	{
	  mpir_divrem32_preinv2 (q, n1, n0, n1, np[1], np[0], d11, d01, d1, d0, dinv);

	  cy = mpn_submul_1 (np - dn, dp, dn, q);

	  cy1 = n0 < cy;
	  n0 = (n0 - cy) & GMP_NUMB_MASK;
	  cy = n1 < cy1;
	  n1 -= cy1;
	  np[0] = n0;

	  if (UNLIKELY (cy != 0))
	    {
	      n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1);
	      q--;
	    }
	}

      *--qp = q;
    }

  flag = ~CNST_LIMB(0);

  if (dn >= 0)
    {
      for (i = dn; i > 0; i--)
	{
	  np--;
	  if (UNLIKELY (n1 >= (d1 & flag)))
	    {
	      q = GMP_NUMB_MASK;
	      cy = mpn_submul_1 (np - dn, dp, dn + 2, q);

	      if (UNLIKELY (n1 != cy))
		{
		  if (n1 < (cy & flag))
		    {
		      q--;
		      mpn_add_n (np - dn, np - dn, dp, dn + 2);
		    }
		  else
		    flag = 0;
		}
	      n1 = np[1];
	    }
	  else
	    {
	      mpir_divrem32_preinv2 (q, n1, n0, n1, np[1], np[0], d11, d01, d1, d0, dinv);

	      cy = mpn_submul_1 (np - dn, dp, dn, q);

	      cy1 = n0 < cy;
	      n0 = (n0 - cy) & GMP_NUMB_MASK;
	      cy = n1 < cy1;
	      n1 -= cy1;
	      np[0] = n0;

	      if (UNLIKELY (cy != 0))
		{
		  n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1);
		  q--;
		}
	    }

	  *--qp = q;

	  /* Truncate operands.  */
	  dn--;
	  dp++;
	}

      np--;
      if (UNLIKELY (n1 >= (d1 & flag)))
	{
	  q = GMP_NUMB_MASK;
	  cy = mpn_submul_1 (np, dp, 2, q);

	  if (UNLIKELY (n1 != cy))
	    {
	      if (n1 < (cy & flag))
		{
		  q--;
		  add_ssaaaa (np[1], np[0], np[1], np[0], dp[1], dp[0]);
		}
	      else
		flag = 0;
	    }
	  n1 = np[1];
	}
      else
	{
	  mpir_divrem32_preinv2 (q, n1, n0, n1, np[1], np[0], d11, d01, d1, d0, dinv);

	  np[0] = n0;
	  np[1] = n1;
	}

      *--qp = q;
    }
  ASSERT_ALWAYS (np[1] == n1);
  }

  np += 2;

  dn = dn_orig;
  if (UNLIKELY (n1 < (dn & flag)))
    {
      mp_limb_t q, x;

      /* The quotient may be too large if the remainder is small.  Recompute
	 for above ignored operand parts, until the remainder spills.

	 FIXME: The quality of this code isn't the same as the code above.
	 1. We don't compute things in an optimal order, high-to-low, in order
	    to terminate as quickly as possible.
	 2. We mess with pointers and sizes, adding and subtracting and
	    adjusting to get things right.  It surely could be streamlined.
	 3. The only termination criteria are that we determine that the
	    quotient needs to be adjusted, or that we have recomputed
	    everything.  We should stop when the remainder is so large
	    that no additional subtracting could make it spill.
	 4. If nothing else, we should not do two loops of submul_1 over the
	    data, instead handle both the triangularization and chopping at
	    once.  */

      x = n1;

      if (dn > 2)
	{
	  /* Compensate for triangularization.  */
	  mp_limb_t y;

	  dp = dp_orig;
	  if (qn + 1 < dn)
	    {
	      dp += dn - (qn + 1);
	      dn = qn + 1;
	    }

	  y = np[-2];

	  for (i = dn - 3; i >= 0; i--)
	    {
	      q = qp[i];
	      cy = mpn_submul_1 (np - (dn - i), dp, dn - i - 2, q);

	      if (y < cy)
		{
		  if (x == 0)
		    {
		      cy = mpn_sub_1 (qp, qp, qn, 1);
		      ASSERT_ALWAYS (cy == 0);
		      return qh - cy;
		    }
		  x--;
		}
	      y -= cy;
	    }
	  np[-2] = y;
	}

      dn = dn_orig;
      if (qn + 1 < dn)
	{
	  /* Compensate for ignored dividend and divisor tails.  */

	  dp = dp_orig;
	  np = np_orig;

	  if (qh != 0)
	    {
	      cy = mpn_sub_n (np + qn, np + qn, dp, dn - (qn + 1));
	      if (cy != 0)
		{
		  if (x == 0)
		    {
		      if (qn != 0)
			cy = mpn_sub_1 (qp, qp, qn, 1);
		      return qh - cy;
		    }
		  x--;
		}
	    }

	  if (qn == 0)
	    return qh;

	  for (i = dn - qn - 2; i >= 0; i--)
	    {
	      cy = mpn_submul_1 (np + i, qp, qn, dp[i]);
	      cy = mpn_sub_1 (np + qn + i, np + qn + i, dn - qn - i - 1, cy);
	      if (cy != 0)
		{
		  if (x == 0)
		    {
		      cy = mpn_sub_1 (qp, qp, qn, 1);
		      return qh;
		    }
		  x--;
		}
	    }
	}
    }

  return qh;
}
Example #25
0
void
mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
{
  mp_limb_t w, w0, w1;
  mp_size_t n2;
  mp_srcptr x, y;
  mp_size_t i;

  n2 = n >> 1;
  ASSERT (n2 > 0);

  if ((n & 1) != 0)
    {
      /* Odd length. */
      mp_size_t n1, n3, nm1;

      n3 = n - n2;

      w = a[n2];
      if (w != 0)
	w -= mpn_sub_n (p, a, a + n3, n2);
      else
	{
	  i = n2;
	  do
	    {
	      --i;
	      w0 = a[i];
	      w1 = a[n3 + i];
	    }
	  while (w0 == w1 && i != 0);
	  if (w0 < w1)
	    {
	      x = a + n3;
	      y = a;
	    }
	  else
	    {
	      x = a;
	      y = a + n3;
	    }
	  mpn_sub_n (p, x, y, n2);
	}
      p[n2] = w;

      n1 = n + 1;

      /* n2 is always either n3 or n3-1 so maybe the two sets of tests here
	 could be combined.  But that's not important, since the tests will
	 take a minuscule amount of time compared to the function calls.  */
      if (BELOW_THRESHOLD (n3, SQR_BASECASE_THRESHOLD))
	{
	  mpn_mul_basecase (ws, p, n3, p, n3);
	  mpn_mul_basecase (p,  a, n3, a, n3);
	}
      else if (BELOW_THRESHOLD (n3, SQR_KARATSUBA_THRESHOLD))
	{
	  mpn_sqr_basecase (ws, p, n3);
	  mpn_sqr_basecase (p,  a, n3);
	}
      else
	{
	  mpn_kara_sqr_n   (ws, p, n3, ws + n1);	 /* (x-y)^2 */
	  mpn_kara_sqr_n   (p,  a, n3, ws + n1);	 /* x^2	    */
	}
      if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
	mpn_mul_basecase (p + n1, a + n3, n2, a + n3, n2);
      else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
	mpn_sqr_basecase (p + n1, a + n3, n2);
      else
	mpn_kara_sqr_n   (p + n1, a + n3, n2, ws + n1);	 /* y^2	    */

      /* Since x^2+y^2-(x-y)^2 = 2xy >= 0 there's no need to track the
	 borrow from mpn_sub_n.	 If it occurs then it'll be cancelled by a
	 carry from ws[n].  Further, since 2xy fits in n1 limbs there won't
	 be any carry out of ws[n] other than cancelling that borrow. */

      mpn_sub_n (ws, p, ws, n1);	     /* x^2-(x-y)^2 */

      nm1 = n - 1;
      if (mpn_add_n (ws, p + n1, ws, nm1))   /* x^2+y^2-(x-y)^2 = 2xy */
	{
	  mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
	  ws[nm1] = x;
	  if (x == 0)
	    ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
	}
      if (mpn_add_n (p + n3, p + n3, ws, n1))
	{
	  mpn_incr_u (p + n1 + n3, 1);
	}
    }
  else
    {
      /* Even length. */
      i = n2;
      do
	{
	  --i;
	  w0 = a[i];
	  w1 = a[n2 + i];
	}
      while (w0 == w1 && i != 0);
      if (w0 < w1)
	{
	  x = a + n2;
	  y = a;
	}
      else
	{
	  x = a;
	  y = a + n2;
	}
      mpn_sub_n (p, x, y, n2);

      /* Pointwise products. */
      if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
	{
	  mpn_mul_basecase (ws,    p,      n2, p,      n2);
	  mpn_mul_basecase (p,     a,      n2, a,      n2);
	  mpn_mul_basecase (p + n, a + n2, n2, a + n2, n2);
	}
      else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
	{
	  mpn_sqr_basecase (ws,    p,      n2);
	  mpn_sqr_basecase (p,     a,      n2);
	  mpn_sqr_basecase (p + n, a + n2, n2);
	}
      else
	{
	  mpn_kara_sqr_n (ws,    p,      n2, ws + n);
	  mpn_kara_sqr_n (p,     a,      n2, ws + n);
	  mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
	}

      /* Interpolate. */
      w = -mpn_sub_n (ws, p, ws, n);
      w += mpn_add_n (ws, p + n, ws, n);
      w += mpn_add_n (p + n2, p + n2, ws, n);
      MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
    }
}
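/* Scalar form of the identity that drives the interpolation above:
   x^2 + y^2 - (x - y)^2 = 2xy >= 0, which is why the borrow from the
   mpn_sub_n needs no tracking.  A one-line check, illustrative only: */
#include <assert.h>

int
main (void)
{
  unsigned long x = 123456789UL, y = 987654321UL;
  /* the identity also holds modulo 2^64: the wrap of (x - y) squares away */
  assert (x * x + y * y - (x - y) * (x - y) == 2 * x * y);
  return 0;
}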
Example #26
0
void
mpz_powm_ui (mpz_ptr r, mpz_srcptr b, unsigned long int el, mpz_srcptr m)
{
  mp_ptr xp, tp, qp, mp, bp;
  mp_size_t xn, tn, mn, bn;
  int m_zero_cnt;
  int c;
  mp_limb_t e;
  TMP_DECL;

  mp = PTR(m);
  mn = ABSIZ(m);
  if (mn == 0)
    DIVIDE_BY_ZERO;

  if (el == 0)
    {
      /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0
	 depending on whether MOD equals 1.  */
      SIZ(r) = (mn == 1 && mp[0] == 1) ? 0 : 1;
      PTR(r)[0] = 1;
      return;
    }

  TMP_MARK;

  /* Normalize m (i.e. make its most significant bit set) as required by
     division functions below.  */
  count_leading_zeros (m_zero_cnt, mp[mn - 1]);
  m_zero_cnt -= GMP_NAIL_BITS;
  if (m_zero_cnt != 0)
    {
      mp_ptr new_mp = TMP_ALLOC_LIMBS (mn);
      mpn_lshift (new_mp, mp, mn, m_zero_cnt);
      mp = new_mp;
    }

  bn = ABSIZ(b);
  bp = PTR(b);
  if (bn > mn)
    {
      /* Reduce possibly huge base.  Use a function call to reduce, since we
	 don't want the quotient allocation to live until function return.  */
      mp_ptr new_bp = TMP_ALLOC_LIMBS (mn);
      reduce (new_bp, bp, bn, mp, mn);
      bp = new_bp;
      bn = mn;
      /* Canonicalize the base, since we are potentially going to multiply with
	 it quite a few times.  */
      MPN_NORMALIZE (bp, bn);
    }

  if (bn == 0)
    {
      SIZ(r) = 0;
      TMP_FREE;
      return;
    }

  tp = TMP_ALLOC_LIMBS (2 * mn + 1);
  xp = TMP_ALLOC_LIMBS (mn);

  qp = TMP_ALLOC_LIMBS (mn + 1);

  MPN_COPY (xp, bp, bn);
  xn = bn;

  e = el;
  count_leading_zeros (c, e);
  e = (e << c) << 1;		/* shift the exp bits to the left, lose msb */
  c = BITS_PER_MP_LIMB - 1 - c;

  /* Main loop. */

  /* If m is already normalized (high bit of high limb set), and b is the
     same size, but a bigger value, and e==1, then there's no modular
     reductions done and we can end up with a result out of range at the
     end. */
  if (c == 0)
    {
      if (xn == mn && mpn_cmp (xp, mp, mn) >= 0)
        mpn_sub_n (xp, xp, mp, mn);
      goto finishup;
    }

  while (c != 0)
    {
      mpn_sqr_n (tp, xp, xn);
      tn = 2 * xn; tn -= tp[tn - 1] == 0;
      if (tn < mn)
	{
	  MPN_COPY (xp, tp, tn);
	  xn = tn;
	}
      else
	{
	  mpn_tdiv_qr (qp, xp, 0L, tp, tn, mp, mn);
	  xn = mn;
	}

      if ((mp_limb_signed_t) e < 0)
	{
	  mpn_mul (tp, xp, xn, bp, bn);
	  tn = xn + bn; tn -= tp[tn - 1] == 0;
	  if (tn < mn)
	    {
	      MPN_COPY (xp, tp, tn);
	      xn = tn;
	    }
	  else
	    {
	      mpn_tdiv_qr (qp, xp, 0L, tp, tn, mp, mn);
	      xn = mn;
	    }
	}
      e <<= 1;
      c--;
    }

 finishup:
  /* We shifted m left m_zero_cnt steps.  Adjust the result by reducing
     it with the original MOD.  */
  if (m_zero_cnt != 0)
    {
      mp_limb_t cy;
      cy = mpn_lshift (tp, xp, xn, m_zero_cnt);
      tp[xn] = cy; xn += cy != 0;

      if (xn < mn)
	{
	  MPN_COPY (xp, tp, xn);
	}
      else
	{
	  mpn_tdiv_qr (qp, xp, 0L, tp, xn, mp, mn);
	  xn = mn;
	}
      mpn_rshift (xp, xp, xn, m_zero_cnt);
    }
  MPN_NORMALIZE (xp, xn);

  if ((el & 1) != 0 && SIZ(b) < 0 && xn != 0)
    {
      mp = PTR(m);			/* want original, unnormalized m */
      mpn_sub (xp, mp, mn, xp, xn);
      xn = mn;
      MPN_NORMALIZE (xp, xn);
    }
  MPZ_REALLOC (r, xn);
  SIZ (r) = xn;
  MPN_COPY (PTR(r), xp, xn);

  TMP_FREE;
}
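/* The main loop above is left-to-right binary exponentiation: square at
   every exponent bit, and multiply by the base when the bit (kept in the
   top bit of e) is set.  The same scheme on single-word operands, as an
   illustrative sketch (assumes unsigned __int128; not the mpz code): */
#include <stdint.h>

static uint64_t
powmod_sketch (uint64_t b, uint64_t e, uint64_t m)   /* m must be nonzero */
{
  uint64_t base = b % m, r;
  int i;

  if (e == 0)
    return 1 % m;               /* mirrors the el == 0 case above */

  for (i = 63; ((e >> i) & 1) == 0; i--)
    ;                           /* find the most significant set bit */

  r = base;
  for (i--; i >= 0; i--)
    {
      r = (uint64_t) (((unsigned __int128) r * r) % m);        /* square */
      if ((e >> i) & 1)
        r = (uint64_t) (((unsigned __int128) r * base) % m);   /* multiply */
    }
  return r;
}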
Example #27
0
/* put in {c, 2n} where n = 2k+r the value of {v0,2k} (already in place)
   + B^k * [{v1, 2k+1} - {t1, 2k+1}]
   + B^(2k) * [{t2, 2k+1} - {v0+vinf, 2k}]
   + B^(3k) * [{t1, 2k+1} - {t2, 2k+1}]
   + B^(4k) * {vinf,2r} (high 2r-1 limbs already in place)
   where {t1, 2k+1} = (3*{v0,2k}+2*sa*{vm1,2k+1}+{v2,2k+1})/6-2*{vinf,2r}
	 {t2, 2k+1} = ({v1, 2k+1} + sa * {vm1, 2k+1})/2
   (sa is the sign of {vm1, 2k+1}).

   {vinf, 2r} stores the content of {v0, 2r} + {vinf, 2r}, with carry in cinf0.
   vinf0 is the low limb of vinf.

   ws is temporary space, and should have at least 2r limbs.

   Think about:

   The evaluated point a-b+c stands a good chance of having a zero carry
   limb, a+b+c would have a 1/4 chance, and 4*a+2*b+c a 1/8 chance, roughly.
   Perhaps this could be tested and stripped.  Doing so before recursing
   would be better than stripping at the start of mpn_toom3_mul_n/sqr_n,
   since then the recursion could be based on the new size.  Although in
   truth the kara vs toom3 crossover is never so exact that one limb either
   way makes a difference.

   A small value like 1 or 2 for the carry could perhaps also be handled
   with an add_n or addlsh1_n.  Would that be faster than an extra limb on a
   (recursed) multiply/square?
*/
static void
toom3_interpolate (mp_ptr c, mp_srcptr v1, mp_ptr v2, mp_ptr vm1,
		   mp_ptr vinf, mp_size_t k, mp_size_t r, int sa,
		   mp_limb_t vinf0, mp_limb_t cinf0, mp_ptr ws)
{
  mp_limb_t cy, saved;
  unsigned long twok = k + k;
  unsigned long kk1 = twok + 1;
  unsigned long twor = r + r;
  mp_ptr c1, c2, c3, c4, c5;
  mp_limb_t cout; /* final carry, should be zero at the end */

  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  c5 = c4 + k;

#define v0 (c)
  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0      |vm1|       hi(vinf)       v1       v2+2vm1      vinf
							      +lo(v0) */

  ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1));    /* v2 <- v2 / 3 */
#ifdef HAVE_NATIVE_mpn_rsh1add_n
  mpn_rsh1add_n (v2, v2, v0, twok); /* v2 <- (lo(v2)+v0) / 2, exact */
  cy = v2[twok] & 1; /* add high limb of v2 divided by 2 */
  v2[twok] >>= 1;
  MPN_INCR_U (v2 + twok - 1, 2, cy << (GMP_NUMB_BITS - 1));
#else
  v2[twok] += mpn_add_n (v2, v2, v0, twok);
  mpn_rshift (v2, v2, kk1, 1);
#endif

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0      |vm1|      hi(vinf)       v1    (3v0+2vm1+v2)    vinf
						    /6         +lo(v0) */

  /* vm1 <- t2 := (v1 + sa*vm1) / 2
     t2 = a0*b0+a0*b2+a1*b1+a2*b0+a2*b2 >= 0
     No carry comes out from {v1, kk1} +/- {vm1, kk1},
     and the division by two is exact */
  if (sa >= 0)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (vm1, v1, vm1, kk1);
#else
      mpn_add_n (vm1, vm1, v1, kk1);
      mpn_rshift (vm1, vm1, kk1, 1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (vm1, v1, vm1, kk1);
#else
      mpn_sub_n (vm1, v1, vm1, kk1);
      mpn_rshift (vm1, vm1, kk1, 1);
#endif
    }

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       t2        hi(vinf)       v1         t1       vinf+lo(v0) */

  /* subtract 2*vinf to v2,
     result is t1 := a0*b0+a0*b2+a1*b1+a1*b2+a2*b0+a2*b1+a2*b2 >= 0 */
  saved = c4[0];
  c4[0] = vinf0;
#ifdef HAVE_NATIVE_mpn_sublsh1_n
  cy = mpn_sublsh1_n (v2, v2, c4, twor);
#else
  cy = mpn_lshift (ws, c4, twor, 1);
  cy += mpn_sub_n (v2, v2, ws, twor);
#endif
  MPN_DECR_U (v2 + twor, kk1 - twor, cy);
  c4[0] = saved;

  /* subtract {t2, 2k+1} in {c+3k, 2k+1} i.e. in {t2+k, 2k+1}:
     by chunks of k limbs from right to left to avoid overlap */
#define t2 (vm1)
  /* a borrow may occur in one of the 2 following __GMPN_SUB_1 calls, but since
     the final result is nonnegative, it will be compensated later on */
  __GMPN_SUB_1 (cout, c5, c5, twor - k, t2[twok]);
  cy = mpn_sub_n (c4, c4, t2 + k, k);
  __GMPN_SUB_1 (cout, c5, c5, twor - k, cy);
  cy = mpn_sub_n (c3, c3, t2, k);
  __GMPN_SUB_1 (cout, c4, c4, twor, cy);

  /* don't forget to add vinf0 in {c+4k, ...} */
  __GMPN_ADD_1 (cout, c4, c4, twor, vinf0);

  /* c  c+k c+2k c+3k c+4k+1   t  t+2k+1 t+4k+2
     v0     t2        hi(vinf) v1 t1     vinf
		 -t2                    +lo(v0)
  */

  /* c  c+k c+2k c+3k c+4k     t  t+2k+1 t+4k+2
     v0     t2        vinf     v1 t1     vinf
		 -t2                    +lo(v0)
  */

  /* subtract v0+vinf in {c+2k, ...} */
  cy = cinf0 + mpn_sub_n (c2, c2, vinf, twor);
  if (twor < twok)
    {
      __GMPN_SUB_1 (cy, c2 + twor, c2 + twor, twok - twor, cy);
      cy += mpn_sub_n (c2 + twor, c2 + twor, v0 + twor, twok - twor);
    }
  __GMPN_SUB_1 (cout, c4, c4, twor, cy); /* 2n-4k = 2r */

  /* c   c+k  c+2k  c+3k  c+4k      t   t+2k+1  t+4k+2
     v0       t2          vinf      v1  t1      vinf
	      -v0   -t2                        +lo(v0)
	      -vinf                                    */

  /* subtract t1 in {c+k, ...} */
  cy = mpn_sub_n (c1, c1, v2, kk1);
  __GMPN_SUB_1 (cout, c3 + 1, c3 + 1, twor + k - 1, cy); /* 2n-(3k+1)=k+2r-1 */

  /* c   c+k  c+2k  c+3k  c+4k      t   t+2k+1  t+4k+2
     v0       t2          vinf      v1  t1      vinf
	 -t1  -v0   -t2
	      -vinf                                    */

  /* add t1 in {c+3k, ...} */
  cy = mpn_add_n (c3, c3, v2, kk1);
  __GMPN_ADD_1 (cout, c5 + 1, c5 + 1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */

  /* c   c+k  c+2k  c+3k  c+4k      t   t+2k+1  t+4k+2
     v0       t2    t1    vinf      v1  t1      vinf
	 -t1  -v0   -t2
	      -vinf                                    */

  /* add v1 in {c+k, ...} */
  cy = mpn_add_n (c1, c1, v1, kk1);
  __GMPN_ADD_1 (cout, c3 + 1, c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */

  /* c   c+k  c+2k  c+3k  c+4k      t   t+2k+1  t+4k+2
     v0  v1   t2    t1    vinf      v1  t1      vinf
	 -t1  -v0   -t2
	      -vinf                                    */
#undef v0
#undef t2
}
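/* A scalar check of the t1/t2 formulas in the comment above.  For the
   degree-4 product c(x) = c0 + c1 x + ... + c4 x^4 of two quadratics,
   with v0 = c(0), v1 = c(1), vm1 = c(-1) (sa folded into the sign),
   v2 = c(2) and vinf = c4, the definitions give t2 = c0 + c2 + c4 and
   t1 = c0 + c2 + c3 + c4, so the recomposition deltas recover the middle
   coefficients.  Illustrative plain-integer code, not limb arithmetic: */
#include <assert.h>

int
main (void)
{
  long a[3] = {7, 3, 2}, b[3] = {5, 1, 4}, c[5] = {0};
  long v0, v1, vm1, v2, vinf, t1, t2;
  int i, j;

  for (i = 0; i < 3; i++)
    for (j = 0; j < 3; j++)
      c[i + j] += a[i] * b[j];

  v0   = c[0];
  v1   = c[0] + c[1] + c[2] + c[3] + c[4];
  vm1  = c[0] - c[1] + c[2] - c[3] + c[4];
  v2   = c[0] + 2*c[1] + 4*c[2] + 8*c[3] + 16*c[4];
  vinf = c[4];

  t1 = (3*v0 + 2*vm1 + v2) / 6 - 2*vinf;   /* exact division by 6 */
  t2 = (v1 + vm1) / 2;                     /* exact division by 2 */

  assert (v1 - t1 == c[1]);                /* B^k  term */
  assert (t2 - v0 - vinf == c[2]);         /* B^2k term */
  assert (t1 - t2 == c[3]);                /* B^3k term */
  return 0;
}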
Example #28
0
mp_limb_t
mpn_sbpi1_div_qr (mp_ptr qp,
		  mp_ptr np, mp_size_t nn,
		  mp_srcptr dp, mp_size_t dn,
		  mp_limb_t dinv)
{
  mp_limb_t qh;
  mp_size_t i;
  mp_limb_t n1, n0;
  mp_limb_t d1, d0;
  mp_limb_t cy, cy1;
  mp_limb_t q;

  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

  np += nn;

  qh = mpn_cmp (np - dn, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (np - dn, np - dn, dp, dn);

  qp += nn - dn;

  dn -= 2;			/* offset dn by 2 for main division loops,
				   saving two iterations in mpn_submul_1.  */
  d1 = dp[dn + 1];
  d0 = dp[dn + 0];

  np -= 2;

  n1 = np[1];

  for (i = nn - (dn + 2); i > 0; i--)
    {
      np--;
      if (UNLIKELY (n1 == d1) && np[1] == d0)
	{
	  q = GMP_NUMB_MASK;
	  mpn_submul_1 (np - dn, dp, dn + 2, q);
	  n1 = np[1];		/* update n1, last loop's value will now be invalid */
	}
      else
	{
	  udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);

	  cy = mpn_submul_1 (np - dn, dp, dn, q);

	  cy1 = n0 < cy;
	  n0 = (n0 - cy) & GMP_NUMB_MASK;
	  cy = n1 < cy1;
	  n1 = (n1 - cy1) & GMP_NUMB_MASK;
	  np[0] = n0;

	  if (UNLIKELY (cy != 0))
	    {
	      n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1);
	      q--;
	    }
	}

      *--qp = q;
    }
  np[1] = n1;

  return qh;
}
Example #29
0
void
mpn_toom3_sqr_n (mp_ptr c, mp_srcptr a, mp_size_t n, mp_ptr t)
{
  mp_size_t k, k1, kk1, r, twok, twor;
  mp_limb_t cy, saved, vinf0, cinf0;
  mp_ptr trec;
  int sa;
  mp_ptr c1, c2, c3, c4;

  ASSERT(GMP_NUMB_BITS >= 6);
  ASSERT(n >= 17); /* so that r != 0 and 5k+3 <= 2n */

  /* the algorithm is the same as mpn_mul_n_tc3, with b=a */

  k = (n + 2) / 3; /* ceil(n/3) */
  twok = 2 * k;
  k1 = k + 1;
  kk1 = k + k1;
  r = n - twok;   /* last chunk */
  twor = 2 * r;

  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;

  trec = t + 4 * k + 3; /* trec = v2 + (2k+2) */

  cy = mpn_add_n (c, a, a + twok, r);
  if (r < k)
    __GMPN_ADD_1 (cy, c + r, a + r, k - r, cy);
  c3[2] = (c1[0] = cy) + mpn_add_n (c2 + 2, c, a + k, k);

#define v2 (t+2*k+1)
#define vinf (t+4*k+2)

  TOOM3_SQR_REC (t, c2 + 2, k1, trec);

  sa = (c[k] != 0) ? 1 : mpn_cmp (c, a + k, k);
  c[k] = (sa >= 0) ? c[k] - mpn_sub_n (c, c, a + k, k)
    : mpn_sub_n (c, a + k, c, k);

  TOOM3_SQR_REC (c2, c, k1, trec);

#ifdef HAVE_NATIVE_mpn_addlsh1_n
  c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
  if (r < k)
    __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
#else
  c[r] = mpn_lshift (c, a + twok, r, 1);
  if (r < k)
    MPN_ZERO(c + r + 1, k - r);
  c1[0] += mpn_add_n (c, c, a + k, k);
  mpn_lshift (c, c, k1, 1);
  c1[0] += mpn_add_n (c, c, a, k);
#endif

  TOOM3_SQR_REC (v2, c, k1, trec);

  TOOM3_SQR_REC (c, a, k, trec);

#ifdef HAVE_NATIVE_mpn_addlsh1_n
  mpn_addlsh1_n (v2, v2, c2, kk1);
#else
  mpn_lshift (t + 4 * k + 2, c2, kk1, 1);
  mpn_add_n (v2, v2, t + 4 * k + 2, kk1);
#endif

  saved = c4[0];
  TOOM3_SQR_REC (c4, a + twok, r, trec);
  cinf0 = mpn_add_n (vinf, c4, c, twor);
  vinf0 = c4[0];
  c4[0] = saved;

  toom3_interpolate (c, t, v2, c2, vinf, k, r, 1, vinf0, cinf0, vinf + twor);

#undef v2
#undef vinf
}
Example #30
0
mp_limb_t mpn_divrem_euclidean_qr_2(mp_ptr qp, mp_ptr xp, mp_size_t xn, mp_srcptr dp)
{
   mp_size_t qn;
   mp_limb_t qf, t[2], t1[2], q, h, l, d1, d2, i;
   int c1, c3, c4;

   ASSERT(xn >= 2);
   ASSERT_MPN(dp, 2);
   ASSERT_MPN(xp, xn);
   ASSERT(dp[1] != 0);

   qn = xn - 1;

   /* ASSERT(!MPN_OVERLAP_P(qp, qn, xp, xn)); */ /* FIXME: correct this overlap requirement */
   ASSERT((dp[1]>>(GMP_NUMB_BITS - 1)) != 0);

   h = 0;
   d1 = dp[1];
   d2 = dp[0];
   
   invert_limb(i, d1);
   
   l = xp[xn - 1];
   qn = xn - 2;
   t[0] = xp[qn];

   if (l < d1)
   { 
      h = t[1] = l;
      l = t[0] = xp[qn];
      qf = 0;
   }
   else
   {
      qf = 1;
      t[1] = l - d1;
      t1[1] = 0;
      t1[0] = d2;
   
      if (mpn_sub_n(t, t, t1, 2))
      {
         qf--;
         mpn_add_n(t, t, dp, 2);
      }
   
      h = t[1];
      l = t[0];
   }

   for (qn = xn - 3; qn >= 0; qn--)
   {
      t[0] = xp[qn];
    
      if (h < d1)
      {
         udiv_qrnnd_preinv(q, t[1], h, l, d1, i);
         umul_ppmm(t1[1], t1[0], q, d2);
         if (mpn_sub_n(t, t, t1, 2))
         {
            q--;
            if (mpn_add_n(t, t, dp, 2) == 0)
            {
               q--;
               
               ASSERT_CARRY(mpn_add_n(t, t, dp, 2));
            }
         }
      }
      else
      {
         ASSERT(h == d1);
         q = -1;
         t[1] = l;
         c3 = mpn_add_n(t, t, dp, 2);
         c1 = mpn_sub_1(t + 1, t + 1, 1, d2);
         c4 = c3 - c1;
       
         if (l >= d1)
         {
            ASSERT(c3 != 0);
            ASSERT(c4 == 0);
         } /* our guess is B + 1, so q = B - 1 is correct */
         else
         {
            ASSERT(c4 <= 0); /* our guess is B so q = B - 1 or B - 2 */
            if (c4 != 0)
            {
               q--;
               mpn_add_n(t, t, dp, 2);
            }
         }       
      }
    
      h = t[1];
      l = t[0];
      qp[qn] = q;
   }

   xp[1] = t[1];
   xp[0] = t[0];

   return qf;
}
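/* Reference for the udiv_qrnnd_preinv step above: a 2/1 division with the
   precomputed inverse i = floor((B^2-1)/d1) - B (cf. invert_limb), in the
   style of the Moller-Granlund division by invariant integers.  A sketch
   under the assumptions of 64-bit limbs and unsigned __int128; the names
   are illustrative, and the library macro is branch-free where this uses
   plain ifs.  Requires d normalized and nh < d. */
#include <stdint.h>

static uint64_t
udiv_qrnnd_preinv_sketch (uint64_t *r, uint64_t nh, uint64_t nl,
                          uint64_t d, uint64_t di)
{
  unsigned __int128 p = (unsigned __int128) nh * di;
  /* candidate quotient: high half of di*nh + <nh+1, nl>; any 128-bit
     wraparound here is harmless, the mod-B corrections below absorb it */
  unsigned __int128 s = p + ((unsigned __int128) (nh + 1) << 64 | nl);
  uint64_t qh = (uint64_t) (s >> 64);
  uint64_t ql = (uint64_t) s;
  uint64_t rem = nl - qh * d;      /* low limb of n - q*d, wraps freely */

  if (rem > ql)                    /* quotient one too large */
    {
      qh--;
      rem += d;
    }
  if (rem >= d)                    /* rarely, one too small */
    {
      qh++;
      rem -= d;
    }
  *r = rem;
  return qh;
}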