예제 #1
0
mp_limb_t	mpn_addadd_n(mp_ptr t,mp_srcptr x,mp_srcptr y,mp_srcptr z,mp_size_t n)
{mp_limb_t ret;
 mp_srcptr a=x,b=y,c=z;

ASSERT(n>0);
ASSERT_MPN(x,n);ASSERT_MPN(y,n);ASSERT_MPN(z,n);//ASSERT_SPACE(t,n);
ASSERT(MPN_SAME_OR_SEPARATE_P(t,x,n));
ASSERT(MPN_SAME_OR_SEPARATE_P(t,y,n));
ASSERT(MPN_SAME_OR_SEPARATE_P(t,z,n));
if(t==x)
  {if(t==y)
     {if(t==z)
        {
         #ifdef HAVE_NATIVE_mpn_addlsh1_n
         return mpn_addlsh1_n(t,x,y,n);
         #else
         return mpn_mul_1(t,x,n,3);
         #endif  
        }
     }
   else
     {MP_SRCPTR_SWAP(b,c);}
  }
else
  {MP_SRCPTR_SWAP(a,c);if(t==y)MP_SRCPTR_SWAP(a,b);}
ret=mpn_add_n(t,a,b,n);return ret+mpn_add_n(t,t,c,n);}
예제 #2
0
파일: lucnum2_ui.c 프로젝트: qsnake/mpir
void
mpz_lucnum2_ui (mpz_ptr ln, mpz_ptr lnsub1, unsigned long n)
{
  mp_ptr     lp, l1p, f1p;
  mp_size_t  size;
  mp_limb_t  c;
  TMP_DECL;

  ASSERT (ln != lnsub1);

  /* handle small n quickly, and hide the special case for L[-1]=-1 */
  if (n <= FIB_TABLE_LUCNUM_LIMIT)
    {
      mp_limb_t  f  = FIB_TABLE (n);
      mp_limb_t  f1 = FIB_TABLE ((int) n - 1);

      /* L[n] = F[n] + 2F[n-1] */
      PTR(ln)[0] = f + 2*f1;
      SIZ(ln) = 1;

      /* L[n-1] = 2F[n] - F[n-1], but allow for L[-1]=-1 */
      PTR(lnsub1)[0] = (n == 0 ? 1 : 2*f - f1);
      SIZ(lnsub1) = (n == 0 ? -1 : 1);

      return;
    }

  TMP_MARK;
  size = MPN_FIB2_SIZE (n);
  f1p = TMP_ALLOC_LIMBS (size);

  MPZ_REALLOC (ln,     size+1);
  MPZ_REALLOC (lnsub1, size+1);
  lp  = PTR(ln);
  l1p = PTR(lnsub1);

  size = mpn_fib2_ui (l1p, f1p, n);

  /* L[n] = F[n] + 2F[n-1] */
#if HAVE_NATIVE_mpn_addlsh1_n
  c = mpn_addlsh1_n (lp, l1p, f1p, size);
#else
  c = mpn_lshift1 (lp, f1p, size);
  c += mpn_add_n (lp, lp, l1p, size);
#endif
  lp[size] = c;
  SIZ(ln) = size + (c != 0);

  /* L[n-1] = 2F[n] - F[n-1] */
  c = mpn_double (l1p, size);
  c -= mpn_sub_n (l1p, l1p, f1p, size);
  ASSERT ((mp_limb_signed_t) c >= 0);
  l1p[size] = c;
  SIZ(lnsub1) = size + (c != 0);

  TMP_FREE;
}
예제 #3
0
void tc4_addlsh1_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr xp, mp_size_t xn)
{
	if (xn)
	{
		if (xn >= *rn)
		{
            mp_limb_t cy;
			if (xn > *rn) MPN_ZERO(rp + *rn, xn - *rn);
#if HAVE_NATIVE_mpn_addlsh1_n
            cy = mpn_addlsh1_n(rp, rp, xp, xn);
#else
            cy = mpn_add_n(rp, rp, xp, xn);
            cy += mpn_add_n(rp, rp, xp, xn);
#endif
			if (cy) 
			{
				rp[xn] = cy;
				*rn = xn + 1;
			} else *rn = xn;
		} else
	   {
		   mp_limb_t cy;
#if HAVE_NATIVE_mpn_addlsh1_n
            cy = mpn_addlsh1_n(rp, rp, xp, xn);
#else
            cy = mpn_add_n(rp, rp, xp, xn);
            cy += mpn_add_n(rp, rp, xp, xn);
#endif
	      if (cy) cy = mpn_add_1(rp + xn, rp + xn, *rn - xn, cy);
		   if (cy) 
		   {
			   rp[*rn] = cy;
			   (*rn)++;
		   }
		}
	}
}
예제 #4
0
/* Define our own squaring function, which uses mpn_sqr_basecase for its
   allowed sizes, but its own code for larger sizes.  */
static void
mpn_local_sqr (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tp)
{
  mp_size_t i;

  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (rp, 2*n, up, n));

  if (BELOW_THRESHOLD (n, SQR_BASECASE_LIM))
    {
      mpn_sqr_basecase (rp, up, n);
      return;
    }

  {
    mp_limb_t ul, lpl;
    ul = up[0];
    umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
    rp[0] = lpl >> GMP_NAIL_BITS;
  }
  if (n > 1)
    {
      mp_limb_t cy;

      cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
      tp[n - 1] = cy;
      for (i = 2; i < n; i++)
	{
	  mp_limb_t cy;
	  cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
	  tp[n + i - 2] = cy;
	}
      MPN_SQR_DIAGONAL (rp + 2, up + 1, n - 1);

      {
	mp_limb_t cy;
#if HAVE_NATIVE_mpn_addlsh1_n
	cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);
#else
	cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
	cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);
#endif
	rp[2 * n - 1] += cy;
      }
    }
}
예제 #5
0
void
mpn_toom53_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg, vmh_neg;
  mp_limb_t cy;
  mp_ptr gp, hp;
  mp_ptr as1, asm1, as2, ash, asmh;
  mp_ptr bs1, bsm1, bs2, bsh, bsmh;
  enum toom4_flags flags;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3);

  s = an - 4 * n;
  t = bn - 2 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  as1  = TMP_SALLOC_LIMBS (n + 1);
  asm1 = TMP_SALLOC_LIMBS (n + 1);
  as2  = TMP_SALLOC_LIMBS (n + 1);
  ash  = TMP_SALLOC_LIMBS (n + 1);
  asmh = TMP_SALLOC_LIMBS (n + 1);

  bs1  = TMP_SALLOC_LIMBS (n + 1);
  bsm1 = TMP_SALLOC_LIMBS (n + 1);
  bs2  = TMP_SALLOC_LIMBS (n + 1);
  bsh  = TMP_SALLOC_LIMBS (n + 1);
  bsmh = TMP_SALLOC_LIMBS (n + 1);

  gp = pp;
  hp = pp + n + 1;

  /* Compute as1 and asm1.  */
  gp[n]  = mpn_add_n (gp, a0, a2, n);
  gp[n] += mpn_add   (gp, gp, n, a4, s);
  hp[n]  = mpn_add_n (hp, a1, a3, n);
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_addsub_n (as1, asm1, hp, gp, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_addsub_n (as1, asm1, gp, hp, n + 1);
      vm1_neg = 0;
    }
#else
  mpn_add_n (as1, gp, hp, n + 1);
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_sub_n (asm1, hp, gp, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_sub_n (asm1, gp, hp, n + 1);
      vm1_neg = 0;
    }
#endif

  /* Compute as2.  */
#if !HAVE_NATIVE_mpn_addlsh_n
  ash[n] = mpn_lshift (ash, a2, n, 2);			/*        4a2       */
#endif
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a3, a4, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a2, as2, n);
  cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
  as2[n] = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy  = mpn_lshift (as2, a4, s, 1);
  cy += mpn_add_n (as2, a3, as2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a3 + s, n - s, cy);
  cy = 4 * cy + mpn_lshift (as2, as2, n, 2);
  cy += mpn_add_n (as2, a1, as2, n);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  as2[n] = cy + mpn_add_n (as2, a0, as2, n);
  mpn_add_n (as2, ash, as2, n + 1);
#endif

  /* Compute ash and asmh.  */
#if HAVE_NATIVE_mpn_addlsh_n
  cy  = mpn_addlsh_n (gp, a2, a0, n, 2);		/* 4a0  +  a2       */
  cy = 4 * cy + mpn_addlsh_n (gp, a4, gp, n, 2);	/* 16a0 + 4a2 +  a4 */ /* FIXME s */
  gp[n] = cy;
  cy  = mpn_addlsh_n (hp, a3, a1, n, 2);		/*  4a1 +  a3       */
  cy = 2 * cy + mpn_lshift (hp, hp, n, 1);		/*  8a1 + 2a3       */
  hp[n] = cy;
#else
  gp[n] = mpn_lshift (gp, a0, n, 4);			/* 16a0             */
  mpn_add (gp, gp, n + 1, a4, s);			/* 16a0 +        a4 */
  mpn_add_n (gp, ash, gp, n+1);				/* 16a0 + 4a2 +  a4 */
  cy  = mpn_lshift (hp, a1, n, 3);			/*  8a1             */
  cy += mpn_lshift (ash, a3, n, 1);			/*        2a3       */
  cy += mpn_add_n (hp, ash, hp, n);			/*  8a1 + 2a3       */
  hp[n] = cy;
#endif
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_addsub_n (ash, asmh, hp, gp, n + 1);
      vmh_neg = 1;
    }
  else
    {
      mpn_addsub_n (ash, asmh, gp, hp, n + 1);
      vmh_neg = 0;
    }
#else
  mpn_add_n (ash, gp, hp, n + 1);
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_sub_n (asmh, hp, gp, n + 1);
      vmh_neg = 1;
    }
  else
    {
      mpn_sub_n (asmh, gp, hp, n + 1);
      vmh_neg = 0;
    }
#endif

  /* Compute bs1 and bsm1.  */
  bs1[n] = mpn_add (bs1, b0, n, b2, t);		/* b0 + b2 */
#if HAVE_NATIVE_mpn_addsub_n
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      bs1[n] = mpn_addsub_n (bs1, bsm1, b1, bs1, n) >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
예제 #6
0
파일: mul_n.c 프로젝트: angavrilov/ecl-sse
void
mpn_toom3_sqr_n (mp_ptr c, mp_srcptr a, mp_size_t n, mp_ptr t)
{
  mp_size_t k, k1, kk1, r, twok, twor;
  mp_limb_t cy, saved, vinf0, cinf0;
  mp_ptr trec;
  int sa;
  mp_ptr c1, c2, c3, c4;

  ASSERT(GMP_NUMB_BITS >= 6);
  ASSERT(n >= 17); /* so that r <> 0 and 5k+3 <= 2n */

  /* the algorithm is the same as mpn_mul_n_tc3, with b=a */

  k = (n + 2) / 3; /* ceil(n/3) */
  twok = 2 * k;
  k1 = k + 1;
  kk1 = k + k1;
  r = n - twok;   /* last chunk */
  twor = 2 * r;

  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;

  trec = t + 4 * k + 3; /* trec = v2 + (2k+2) */

  cy = mpn_add_n (c, a, a + twok, r);
  if (r < k)
    __GMPN_ADD_1 (cy, c + r, a + r, k - r, cy);
  c3[2] = (c1[0] = cy) + mpn_add_n (c2 + 2, c, a + k, k);

#define v2 (t+2*k+1)
#define vinf (t+4*k+2)

  TOOM3_SQR_REC (t, c2 + 2, k1, trec);

  sa = (c[k] != 0) ? 1 : mpn_cmp (c, a + k, k);
  c[k] = (sa >= 0) ? c[k] - mpn_sub_n (c, c, a + k, k)
    : mpn_sub_n (c, a + k, c, k);

  TOOM3_SQR_REC (c2, c, k1, trec);

#ifdef HAVE_NATIVE_mpn_addlsh1_n
  c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
  if (r < k)
    __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
#else
  c[r] = mpn_lshift (c, a + twok, r, 1);
  if (r < k)
    MPN_ZERO(c + r + 1, k - r);
  c1[0] += mpn_add_n (c, c, a + k, k);
  mpn_lshift (c, c, k1, 1);
  c1[0] += mpn_add_n (c, c, a, k);
#endif

  TOOM3_SQR_REC (v2, c, k1, trec);

  TOOM3_SQR_REC (c, a, k, trec);

#ifdef HAVE_NATIVE_mpn_addlsh1_n
  mpn_addlsh1_n (v2, v2, c2, kk1);
#else
  mpn_lshift (t + 4 * k + 2, c2, kk1, 1);
  mpn_add_n (v2, v2, t + 4 * k + 2, kk1);
#endif

  saved = c4[0];
  TOOM3_SQR_REC (c4, a + twok, r, trec);
  cinf0 = mpn_add_n (vinf, c4, c, twor);
  vinf0 = c4[0];
  c4[0] = saved;

  toom3_interpolate (c, t, v2, c2, vinf, k, r, 1, vinf0, cinf0, vinf + twor);

#undef v2
#undef vinf
}
예제 #7
0
파일: mul_n.c 프로젝트: angavrilov/ecl-sse
void
mpn_toom3_mul_n (mp_ptr c, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr t)
{
  mp_size_t k, k1, kk1, r, twok, twor;
  mp_limb_t cy, cc, saved, vinf0, cinf0;
  mp_ptr trec;
  int sa, sb;
  mp_ptr c1, c2, c3, c4, c5;

  ASSERT(GMP_NUMB_BITS >= 6);
  ASSERT(n >= 17); /* so that r <> 0 and 5k+3 <= 2n */

  /*
  The algorithm is the following:

  0. k = ceil(n/3), r = n - 2k, B = 2^(GMP_NUMB_BITS), t = B^k
  1. split a and b in three parts each a0, a1, a2 and b0, b1, b2
     with a0, a1, b0, b1 of k limbs, and a2, b2 of r limbs
  2. v0   <- a0*b0
     v1   <- (a0+a1+a2)*(b0+b1+b2)
     v2   <- (a0+2*a1+4*a2)*(b0+2*b1+4*b2)
     vm1  <- (a0-a1+a2)*(b0-b1+b2)
     vinf <- a2*b2
     t1   <- (3*v0+2*vm1+v2)/6-2*vinf
     t2   <- (v1+vm1)/2
  3. result is c0+c1*t+c2*t^2+c3*t^3+c4*t^4 where
     c0   <- v0
     c1   <- v1 - t1
     c2   <- t2 - v0 - vinf
     c3   <- t1 - t2
     c4   <- vinf
  */

  k = (n + 2) / 3; /* ceil(n/3) */
  twok = 2 * k;
  k1 = k + 1;
  kk1 = k + k1;
  r = n - twok;   /* last chunk */
  twor = 2 * r;

  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  c5 = c4 + k;

  trec = t + 4 * k + 3; /* trec = v2 + (2k+2) */

  /* put a0+a2 in {c, k+1}, and b0+b2 in {c+k+1, k+1};
     put a0+a1+a2 in {c+2k+2, k+1} and b0+b1+b2 in {c+3k+3,k+1}
     [requires 4k+4 <= 2n, ie. n >= 8] */
  cy = mpn_add_n (c, a, a + twok, r);
  cc = mpn_add_n (c1 + 1, b, b + twok, r);
  if (r < k)
    {
      __GMPN_ADD_1 (cy, c + r, a + r, k - r, cy);
      __GMPN_ADD_1 (cc, c1 + 1 + r, b + r, k - r, cc);
    }
  c3[2] = (c1[0] = cy) + mpn_add_n (c2 + 2, c, a + k, k);
  c4[3] = (c2[1] = cc) + mpn_add_n (c3 + 3, c1 + 1, b + k, k);

#define v2 (t+2*k+1)
#define vinf (t+4*k+2)

  /* compute v1 := (a0+a1+a2)*(b0+b1+b2) in {t, 2k+1};
     since v1 < 9*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 4 */
  TOOM3_MUL_REC (t, c2 + 2, c3 + 3, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
					v1
  */

  /* put |a0-a1+a2| in {c, k+1} and |b0-b1+b2| in {c+4k+2,k+1} */
  /* sa = sign(a0-a1+a2) */
  sa = (c[k] != 0) ? 1 : mpn_cmp (c, a + k, k);
  c[k] = (sa >= 0) ? c[k] - mpn_sub_n (c, c, a + k, k)
		   : mpn_sub_n (c, a + k, c, k);
  /* b0+b2 is in {c+k+1, k+1} now */
  sb = (c2[1] != 0) ? 1 : mpn_cmp (c1 + 1, b + k, k);
  c5[2] = (sb >= 0) ? c2[1] - mpn_sub_n (c4 + 2, c1 + 1, b + k, k)
		    : mpn_sub_n (c4 + 2, b + k, c1 + 1, k);
  sa *= sb; /* sign of vm1 */

  /* compute vm1 := (a0-a1+a2)*(b0-b1+b2) in {c+2k, 2k+1};
     since |vm1| < 4*B^(2k), vm1 uses only 2k+1 limbs */
  TOOM3_MUL_REC (c2, c, c4 + 2, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
		vm1                      v1
  */

  /* compute a0+2a1+4a2 in {c, k+1} and b0+2b1+4b2 in {c+4k+2, k+1}
     [requires 5k+3 <= 2n, i.e. n >= 17] */
#ifdef HAVE_NATIVE_mpn_addlsh1_n
  c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
  c5[2] = mpn_addlsh1_n (c4 + 2, b + k, b + twok, r);
  if (r < k)
    {
      __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
      __GMPN_ADD_1 (c5[2], c4 + 2 + r, b + k + r, k - r, c5[2]);
    }
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
  c5[2] = 2 * c5[2] + mpn_addlsh1_n (c4 + 2, b, c4 + 2, k);
#else
  c[r] = mpn_lshift (c, a + twok, r, 1);
  c4[r + 2] = mpn_lshift (c4 + 2, b + twok, r, 1);
  if (r < k)
    {
      MPN_ZERO(c + r + 1, k - r);
      MPN_ZERO(c4 + r + 3, k - r);
    }
  c1[0] += mpn_add_n (c, c, a + k, k);
  c5[2] += mpn_add_n (c4 + 2, c4 + 2, b + k, k);
  mpn_lshift (c, c, k1, 1);
  mpn_lshift (c4 + 2, c4 + 2, k1, 1);
  c1[0] += mpn_add_n (c, c, a, k);
  c5[2] += mpn_add_n (c4 + 2, c4 + 2, b, k);
#endif

  /* compute v2 := (a0+2a1+4a2)*(b0+2b1+4b2) in {t+2k+1, 2k+1}
     v2 < 49*B^k so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */
  TOOM3_MUL_REC (v2, c, c4 + 2, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
		vm1                      v1         v2
  */

  /* compute v0 := a0*b0 in {c, 2k} */
  TOOM3_MUL_REC (c, a, b, k, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       vm1                      v1         v2
  */

  /* now compute (3v0+2vm1+v2)/6 = [v0 + (2vm1+v2)/3]/2
     v2 <- v2+2vm1 = 3*(a0*b0+2*a0*b2+2*a1*b1+2*a1*b2+2*a2*b0+2*a2*b1+6*a2*b2),
     thus 0 <= v2 < 51*B^(2k) < 2^6*B^(2k)
     Uses temporary space {t+4k+2,2k+1}, requires T(n) >= 6k+3.
  */
  if (sa >= 0)
    {
#ifdef HAVE_NATIVE_mpn_addlsh1_n
      mpn_addlsh1_n (v2, v2, c2, kk1);
#else
      /* we can use vinf=t+4k+2 as workspace since it is not full yet */
      mpn_lshift (vinf, c2, kk1, 1);
      mpn_add_n (v2, v2, vinf, kk1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_sublsh1_n
      mpn_sublsh1_n (v2, v2, c2, kk1);
#else
      /* we can use vinf=t+4k+2 as workspace since it is not full yet */
      mpn_lshift (vinf, c2, kk1, 1);
      mpn_sub_n (v2, v2, vinf, kk1);
#endif
    }

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       vm1                      v1       v2+2vm1             */

  /* compute vinf := a2*b2 in {t+4k+2, 2r}: first put it in {c4, 2r},
     then copy it in {t+4k+2,2r} */
  saved = c4[0];
  TOOM3_MUL_REC (c4, a + twok, b + twok, r, trec);
  cinf0 = mpn_add_n (vinf, c4, c, twor); /* {v0,2r} + {vinf,2r} */
  vinf0 = c4[0];
  c4[0] = saved;

  toom3_interpolate (c, t, v2, c2, vinf, k, r, sa, vinf0, cinf0, vinf + twor);

#undef v2
#undef vinf
}
예제 #8
0
void
mpn_toom4_mul_n (mp_ptr rp, mp_srcptr up,
		          mp_srcptr vp, mp_size_t n)
{
  mp_size_t ind;
  mp_limb_t cy, cy2, r30, r31;
  mp_ptr tp;
  mp_size_t sn, n1, n2, n3, n4, n5, n6, n7, n8, rpn, t4, h1;
  TMP_DECL;

  sn = (n + 3) / 4;

  h1 = n - 3*sn;
  
#define a0 (up)
#define a1 (up + sn)
#define a2 (up + 2*sn)
#define a3 (up + 3*sn)
#define b0 (vp)
#define b1 (vp + sn)
#define b2 (vp + 2*sn)
#define b3 (vp + 3*sn)

   t4 = 2*sn+2; // allows mult of 2 integers of sn + 1 limbs

   TMP_MARK;

   tp = TMP_ALLOC_LIMBS(4*t4 + 5*(sn + 1));

#define u2 (tp + 4*t4)
#define u3 (tp + 4*t4 + (sn+1))
#define u4 (tp + 4*t4 + 2*(sn+1))
#define u5 (tp + 4*t4 + 3*(sn+1))
#define u6 (tp + 4*t4 + 4*(sn+1))

   u6[sn] = mpn_add(u6, a1, sn, a3, h1);
   u5[sn] = mpn_add_n(u5, a2, a0, sn);
   mpn_add_n(u3, u5, u6, sn + 1);
   n4 = sn + 1;
   if (mpn_cmp(u5, u6, sn + 1) >= 0)
      mpn_sub_n(u4, u5, u6, sn + 1);
   else
   {  
      mpn_sub_n(u4, u6, u5, sn + 1);
      n4 = -n4;
   }

   u6[sn] = mpn_add(u6, b1, sn, b3, h1);
   u5[sn] = mpn_add_n(u5, b2, b0, sn);
   mpn_add_n(r2, u5, u6, sn + 1);
   n5 = sn + 1;
   if (mpn_cmp(u5, u6, sn + 1) >= 0)
      mpn_sub_n(u5, u5, u6, sn + 1);
   else
   {  
      mpn_sub_n(u5, u6, u5, sn + 1);
      n5 = -n5;
   }
 
   MUL_TC4_UNSIGNED(r3, n3, u3, sn + 1, r2, sn + 1); /* 1 */
   MUL_TC4(r4, n4, u4, n4, u5, n5); /* -1 */
   
#if HAVE_NATIVE_mpn_addlsh_n
   r1[sn] = mpn_addlsh_n(r1, a2, a0, sn, 2);
   mpn_lshift(r1, r1, sn + 1, 1);
   cy = mpn_addlsh_n(r2, a3, a1, h1, 2);
#else
   r1[sn] = mpn_lshift(r1, a2, sn, 1);
   MPN_COPY(r2, a3, h1);
   r1[sn] += mpn_addmul_1(r1, a0, sn, 8);
   cy = mpn_addmul_1(r2, a1, h1, 4);
#endif
   if (sn > h1) 
   {
      cy2 = mpn_lshift(r2 + h1, a1 + h1, sn - h1, 2);
      cy = cy2 + mpn_add_1(r2 + h1, r2 + h1, sn - h1, cy);
   }
   r2[sn] = cy;
   mpn_add_n(u5, r1, r2, sn + 1);
   n6 = sn + 1;
   if (mpn_cmp(r1, r2, sn + 1) >= 0)
      mpn_sub_n(u6, r1, r2, sn + 1);
   else
   {  
      mpn_sub_n(u6, r2, r1, sn + 1);
      n6 = -n6;
   }
 
#if HAVE_NATIVE_mpn_addlsh_n
   r1[sn] = mpn_addlsh_n(r1, b2, b0, sn, 2);
   mpn_lshift(r1, r1, sn + 1, 1);
   cy = mpn_addlsh_n(r2, b3, b1, h1, 2);
#else
   r1[sn] = mpn_lshift(r1, b2, sn, 1);
   MPN_COPY(r2, b3, h1);
   r1[sn] += mpn_addmul_1(r1, b0, sn, 8);
   cy = mpn_addmul_1(r2, b1, h1, 4);
#endif
   if (sn > h1) 
   {
      cy2 = mpn_lshift(r2 + h1, b1 + h1, sn - h1, 2);
      cy = cy2 + mpn_add_1(r2 + h1, r2 + h1, sn - h1, cy);
   }
   r2[sn] = cy;
   mpn_add_n(u2, r1, r2, sn + 1);
   n8 = sn + 1;
   if (mpn_cmp(r1, r2, sn + 1) >= 0)
      mpn_sub_n(r2, r1, r2, sn + 1);
   else
   {  
      mpn_sub_n(r2, r2, r1, sn + 1);
      n8 = -n8;
   }
    
   r30 = r3[0];
   r31 = r3[1];
   MUL_TC4_UNSIGNED(r5, n5, u5, sn + 1, u2, sn + 1); /* 1/2 */
   MUL_TC4(r6, n6, u6, n6, r2, n8); /* -1/2 */
   r3[1] = r31;

#if HAVE_NATIVE_mpn_addlsh1_n
   cy = mpn_addlsh1_n(u2, a2, a3, h1);
   if (sn > h1)
      cy = mpn_add_1(u2 + h1, a2 + h1, sn - h1, cy); 
   u2[sn] = cy;
   u2[sn] = 2*u2[sn] + mpn_addlsh1_n(u2, a1, u2, sn);     
   u2[sn] = 2*u2[sn] + mpn_addlsh1_n(u2, a0, u2, sn);     
#else
   MPN_COPY(u2, a0, sn);
   u2[sn] = mpn_addmul_1(u2, a1, sn, 2);
   u2[sn] += mpn_addmul_1(u2, a2, sn, 4);
   cy = mpn_addmul_1(u2, a3, h1, 8);
   if (sn > h1) cy = mpn_add_1(u2 + h1, u2 + h1, sn - h1, cy);
   u2[sn] += cy;
#endif

#if HAVE_NATIVE_mpn_addlsh1_n
   cy = mpn_addlsh1_n(r1, b2, b3, h1);
   if (sn > h1)
      cy = mpn_add_1(r1 + h1, b2 + h1, sn - h1, cy); 
   r1[sn] = cy;
   r1[sn] = 2*r1[sn] + mpn_addlsh1_n(r1, b1, r1, sn);     
   r1[sn] = 2*r1[sn] + mpn_addlsh1_n(r1, b0, r1, sn);     
#else
   MPN_COPY(r1, b0, sn);
   r1[sn] = mpn_addmul_1(r1, b1, sn, 2);
   r1[sn] += mpn_addmul_1(r1, b2, sn, 4);
   cy = mpn_addmul_1(r1, b3, h1, 8);
   if (sn > h1) cy = mpn_add_1(r1 + h1, r1 + h1, sn - h1, cy);
   r1[sn] += cy;
#endif
   
   MUL_TC4_UNSIGNED(r2, n2, u2, sn + 1, r1, sn + 1); /* 2 */
   
   MUL_TC4_UNSIGNED(r1, n1, a3, h1, b3, h1); /* oo */
   MUL_TC4_UNSIGNED(r7, n7, a0, sn, b0, sn); /* 0 */

   TC4_DENORM(r1, n1, t4 - 1);

/*	rp        rp1          rp2           rp3          rp4           rp5         rp6           rp7
<----------- r7-----------><------------r5-------------->            
                                                       <-------------r3------------->

              <-------------r6------------->                        < -----------r2------------>{           }
                                         <-------------r4-------------->         <--------------r1---->
*/

   mpn_toom4_interpolate(rp, &rpn, sn, tp, t4 - 1, n4, n6, r30);

   if (rpn != 2*n) 
   {
	  MPN_ZERO((rp + rpn), 2*n - rpn);
   }

   TMP_FREE;
}
예제 #9
0
파일: toom53_mul.c 프로젝트: da2ce7/gmp
void
mpn_toom53_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  mp_size_t n, s, t;
  mp_limb_t cy;
  mp_ptr gp;
  mp_ptr as1, asm1, as2, asm2, ash;
  mp_ptr bs1, bsm1, bs2, bsm2, bsh;
  mp_ptr tmp;
  enum toom7_flags flags;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3);

  s = an - 4 * n;
  t = bn - 2 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  tmp = TMP_ALLOC_LIMBS (10 * (n + 1));
  as1  = tmp; tmp += n + 1;
  asm1 = tmp; tmp += n + 1;
  as2  = tmp; tmp += n + 1;
  asm2 = tmp; tmp += n + 1;
  ash  = tmp; tmp += n + 1;
  bs1  = tmp; tmp += n + 1;
  bsm1 = tmp; tmp += n + 1;
  bs2  = tmp; tmp += n + 1;
  bsm2 = tmp; tmp += n + 1;
  bsh  = tmp; tmp += n + 1;

  gp = pp;

  /* Compute as1 and asm1.  */
  flags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, gp));

  /* Compute as2 and asm2. */
  flags = (enum toom7_flags) (flags | toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, gp));

  /* Compute ash = 16 a0 + 8 a1 + 4 a2 + 2 a3 + a4
     = 2*(2*(2*(2*a0 + a1) + a2) + a3) + a4  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (ash, a1, a0, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
  if (s < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (ash, a4, ash, s);
      ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
      MPN_INCR_U (ash + s, n+1-s, cy2);
    }
  else
    ash[n] = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
#else
  cy = mpn_lshift (ash, a0, n, 1);
  cy += mpn_add_n (ash, ash, a1, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a2, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a3, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  ash[n] = cy + mpn_add (ash, ash, n, a4, s);
#endif

  /* Compute bs1 and bsm1.  */
  bs1[n] = mpn_add (bs1, b0, n, b2, t);		/* b0 + b2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      bs1[n] = mpn_add_n_sub_n (bs1, bsm1, b1, bs1, n) >> 1;
      bsm1[n] = 0;
      flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
    }
예제 #10
0
void
check (void)
{
  mp_limb_t  wp[100], xp[100], yp[100];
  mp_size_t  size = 100;

  refmpn_zero (xp, size);
  refmpn_zero (yp, size);
  refmpn_zero (wp, size);

  pre ("mpn_add_n");
  mpn_add_n (wp, xp, yp, size);
  post ();

#if HAVE_NATIVE_mpn_add_nc
  pre ("mpn_add_nc");
  mpn_add_nc (wp, xp, yp, size, CNST_LIMB(0));
  post ();
#endif

#if HAVE_NATIVE_mpn_addlsh1_n
  pre ("mpn_addlsh1_n");
  mpn_addlsh1_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_and_n
  pre ("mpn_and_n");
  mpn_and_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_andn_n
  pre ("mpn_andn_n");
  mpn_andn_n (wp, xp, yp, size);
  post ();
#endif

  pre ("mpn_addmul_1");
  mpn_addmul_1 (wp, xp, size, yp[0]);
  post ();

#if HAVE_NATIVE_mpn_addmul_1c
  pre ("mpn_addmul_1c");
  mpn_addmul_1c (wp, xp, size, yp[0], CNST_LIMB(0));
  post ();
#endif

#if HAVE_NATIVE_mpn_com_n
  pre ("mpn_com_n");
  mpn_com_n (wp, xp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_copyd
  pre ("mpn_copyd");
  mpn_copyd (wp, xp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_copyi
  pre ("mpn_copyi");
  mpn_copyi (wp, xp, size);
  post ();
#endif

  pre ("mpn_divexact_1");
  mpn_divexact_1 (wp, xp, size, CNST_LIMB(123));
  post ();

  pre ("mpn_divexact_by3c");
  mpn_divexact_by3c (wp, xp, size, CNST_LIMB(0));
  post ();

  pre ("mpn_divrem_1");
  mpn_divrem_1 (wp, (mp_size_t) 0, xp, size, CNST_LIMB(123));
  post ();

#if HAVE_NATIVE_mpn_divrem_1c
  pre ("mpn_divrem_1c");
  mpn_divrem_1c (wp, (mp_size_t) 0, xp, size, CNST_LIMB(123), CNST_LIMB(122));
  post ();
#endif

  pre ("mpn_gcd_1");
  xp[0] |= 1;
  notdead += (unsigned long) mpn_gcd_1 (xp, size, CNST_LIMB(123));
  post ();

#if HAVE_NATIVE_mpn_gcd_finda
  pre ("mpn_gcd_finda");
  xp[0] |= 1;
  xp[1] |= 1;
  notdead += mpn_gcd_finda (xp);
  post ();
#endif

  pre ("mpn_hamdist");
  notdead += mpn_hamdist (xp, yp, size);
  post ();

#if HAVE_NATIVE_mpn_ior_n
  pre ("mpn_ior_n");
  mpn_ior_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_iorn_n
  pre ("mpn_iorn_n");
  mpn_iorn_n (wp, xp, yp, size);
  post ();
#endif

  pre ("mpn_lshift");
  mpn_lshift (wp, xp, size, 1);
  post ();

  pre ("mpn_mod_1");
  notdead += mpn_mod_1 (xp, size, CNST_LIMB(123));
  post ();

#if HAVE_NATIVE_mpn_mod_1c
  pre ("mpn_mod_1c");
  notdead += mpn_mod_1c (xp, size, CNST_LIMB(123), CNST_LIMB(122));
  post ();
#endif

#if GMP_NUMB_BITS % 4 == 0
  pre ("mpn_mod_34lsub1");
  notdead += mpn_mod_34lsub1 (xp, size);
  post ();
#endif

  pre ("mpn_modexact_1_odd");
  notdead += mpn_modexact_1_odd (xp, size, CNST_LIMB(123));
  post ();

  pre ("mpn_modexact_1c_odd");
  notdead += mpn_modexact_1c_odd (xp, size, CNST_LIMB(123), CNST_LIMB(456));
  post ();

  pre ("mpn_mul_1");
  mpn_mul_1 (wp, xp, size, yp[0]);
  post ();

#if HAVE_NATIVE_mpn_mul_1c
  pre ("mpn_mul_1c");
  mpn_mul_1c (wp, xp, size, yp[0], CNST_LIMB(0));
  post ();
#endif

#if HAVE_NATIVE_mpn_mul_2
  pre ("mpn_mul_2");
  mpn_mul_2 (wp, xp, size-1, yp);
  post ();
#endif

  pre ("mpn_mul_basecase");
  mpn_mul_basecase (wp, xp, (mp_size_t) 3, yp, (mp_size_t) 3);
  post ();

#if HAVE_NATIVE_mpn_nand_n
  pre ("mpn_nand_n");
  mpn_nand_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_nior_n
  pre ("mpn_nior_n");
  mpn_nior_n (wp, xp, yp, size);
  post ();
#endif

  pre ("mpn_popcount");
  notdead += mpn_popcount (xp, size);
  post ();

  pre ("mpn_preinv_mod_1");
  notdead += mpn_preinv_mod_1 (xp, size, GMP_NUMB_MAX,
                               refmpn_invert_limb (GMP_NUMB_MAX));
  post ();

#if USE_PREINV_DIVREM_1 || HAVE_NATIVE_mpn_preinv_divrem_1
  pre ("mpn_preinv_divrem_1");
  mpn_preinv_divrem_1 (wp, (mp_size_t) 0, xp, size, GMP_NUMB_MAX,
                       refmpn_invert_limb (GMP_NUMB_MAX), 0);
  post ();
#endif

#if HAVE_NATIVE_mpn_rsh1add_n
  pre ("mpn_rsh1add_n");
  mpn_rsh1add_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_rsh1sub_n
  pre ("mpn_rsh1sub_n");
  mpn_rsh1sub_n (wp, xp, yp, size);
  post ();
#endif

  pre ("mpn_rshift");
  mpn_rshift (wp, xp, size, 1);
  post ();

  pre ("mpn_sqr_basecase");
  mpn_sqr_basecase (wp, xp, (mp_size_t) 3);
  post ();

  pre ("mpn_submul_1");
  mpn_submul_1 (wp, xp, size, yp[0]);
  post ();

#if HAVE_NATIVE_mpn_submul_1c
  pre ("mpn_submul_1c");
  mpn_submul_1c (wp, xp, size, yp[0], CNST_LIMB(0));
  post ();
#endif

  pre ("mpn_sub_n");
  mpn_sub_n (wp, xp, yp, size);
  post ();

#if HAVE_NATIVE_mpn_sub_nc
  pre ("mpn_sub_nc");
  mpn_sub_nc (wp, xp, yp, size, CNST_LIMB(0));
  post ();
#endif

#if HAVE_NATIVE_mpn_sublsh1_n
  pre ("mpn_sublsh1_n");
  mpn_sublsh1_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_udiv_qrnnd
  pre ("mpn_udiv_qrnnd");
  mpn_udiv_qrnnd (&wp[0], CNST_LIMB(122), xp[0], CNST_LIMB(123));
  post ();
#endif

#if HAVE_NATIVE_mpn_udiv_qrnnd_r
  pre ("mpn_udiv_qrnnd_r");
  mpn_udiv_qrnnd (CNST_LIMB(122), xp[0], CNST_LIMB(123), &wp[0]);
  post ();
#endif

#if HAVE_NATIVE_mpn_umul_ppmm
  pre ("mpn_umul_ppmm");
  mpn_umul_ppmm (&wp[0], xp[0], yp[0]);
  post ();
#endif

#if HAVE_NATIVE_mpn_umul_ppmm_r
  pre ("mpn_umul_ppmm_r");
  mpn_umul_ppmm_r (&wp[0], xp[0], yp[0]);
  post ();
#endif

#if HAVE_NATIVE_mpn_xor_n
  pre ("mpn_xor_n");
  mpn_xor_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_xnor_n
  pre ("mpn_xnor_n");
  mpn_xnor_n (wp, xp, yp, size);
  post ();
#endif
}
예제 #11
0
void
mpn_toom4_sqr (mp_ptr pp,
               mp_srcptr ap, mp_size_t an,
               mp_ptr scratch)
{
    mp_size_t n, s;
    mp_limb_t cy;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)

    n = (an + 3) >> 2;

    s = an - 3 * n;

    ASSERT (0 < s && s <= n);

    /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the
     * following limb, so these must be computed in order, and we need a
     * one limb gap to tp. */
#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 6 * n)			/* s+t */
#define v2    scratch				/* 2n+1 */
#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
#define vh    (scratch + 4 * n + 2)		/* 2n+1 */
#define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
#define tp (scratch + 8*n + 5)

    /* No overlap with v1 */
#define apx   pp				/* n+1 */
#define amx   (pp + 4*n + 2)			/* n+1 */

    /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
       gives roughly 32 n/3 + log term. */

    /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3.  */
    mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp);

    TOOM4_SQR_REC (v2, apx, n + 1, tp);	/* v2,  2n+1 limbs */
    TOOM4_SQR_REC (vm2, amx, n + 1, tp);	/* vm2,  2n+1 limbs */

    /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3 */
#if HAVE_NATIVE_mpn_addlsh1_n
    cy = mpn_addlsh1_n (apx, a1, a0, n);
    cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
    if (s < n)
    {
        mp_limb_t cy2;
        cy2 = mpn_addlsh1_n (apx, a3, apx, s);
        apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
        MPN_INCR_U (apx + s, n+1-s, cy2);
    }
    else
        apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
#else
    cy = mpn_lshift (apx, a0, n, 1);
    cy += mpn_add_n (apx, apx, a1, n);
    cy = 2*cy + mpn_lshift (apx, apx, n, 1);
    cy += mpn_add_n (apx, apx, a2, n);
    cy = 2*cy + mpn_lshift (apx, apx, n, 1);
    apx[n] = cy + mpn_add (apx, apx, n, a3, s);
#endif

    ASSERT (apx[n] < 15);

    TOOM4_SQR_REC (vh, apx, n + 1, tp);	/* vh,  2n+1 limbs */

    /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3.  */
    mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp);

    TOOM4_SQR_REC (v1, apx, n + 1, tp);	/* v1,  2n+1 limbs */
    TOOM4_SQR_REC (vm1, amx, n + 1, tp);	/* vm1,  2n+1 limbs */

    TOOM4_SQR_REC (v0, a0, n, tp);
    TOOM4_SQR_REC (vinf, a3, s, tp);	/* vinf, 2s limbs */

    mpn_toom_interpolate_7pts (pp, n, 0, vm2, vm1, v2, vh, 2*s, tp);
}
예제 #12
0
void
mpn_toom44_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  mp_size_t n, s, t;
  mp_limb_t cy;
  enum toom7_flags flags;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)
#define b3  (bp + 3*n)

  ASSERT (an >= bn);

  n = (an + 3) >> 2;

  s = an - 3 * n;
  t = bn - 3 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);
  ASSERT (s >= t);

  /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the
   * following limb, so these must be computed in order, and we need a
   * one limb gap to tp. */
#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 6 * n)			/* s+t */
#define v2    scratch				/* 2n+1 */
#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
#define vh    (scratch + 4 * n + 2)		/* 2n+1 */
#define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
#define tp (scratch + 8*n + 5)

  /* apx and bpx must not overlap with v1 */
#define apx   pp				/* n+1 */
#define amx   (pp + n + 1)			/* n+1 */
#define bmx   (pp + 2*n + 2)			/* n+1 */
#define bpx   (pp + 4*n + 2)			/* n+1 */

  /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
     gives roughly 32 n/3 + log term. */

  /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3.  */
  flags = (enum toom7_flags) (toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp));

  /* Compute bpx = b0 + 2 b1 + 4 b2 + 8 b3 and bmx = b0 - 2 b1 + 4 b2 - 8 b3.  */
  flags = (enum toom7_flags) (flags ^ toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (bpx, bmx, bp, n, t, tp));

  TOOM44_MUL_N_REC (v2, apx, bpx, n + 1, tp);	/* v2,  2n+1 limbs */
  TOOM44_MUL_N_REC (vm2, amx, bmx, n + 1, tp);	/* vm2,  2n+1 limbs */

  /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3 */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (apx, a1, a0, n);
  cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
  if (s < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (apx, a3, apx, s);
      apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
      MPN_INCR_U (apx + s, n+1-s, cy2);
    }
  else
    apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
#else
  cy = mpn_lshift (apx, a0, n, 1);
  cy += mpn_add_n (apx, apx, a1, n);
  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
  cy += mpn_add_n (apx, apx, a2, n);
  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
  apx[n] = cy + mpn_add (apx, apx, n, a3, s);
#endif

  /* Compute bpx = 8 b0 + 4 b1 + 2 b2 + b3 = (((2*b0 + b1) * 2 + b2) * 2 + b3 */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (bpx, b1, b0, n);
  cy = 2*cy + mpn_addlsh1_n (bpx, b2, bpx, n);
  if (t < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (bpx, b3, bpx, t);
      bpx[n] = 2*cy + mpn_lshift (bpx + t, bpx + t, n - t, 1);
      MPN_INCR_U (bpx + t, n+1-t, cy2);
    }
  else
    bpx[n] = 2*cy + mpn_addlsh1_n (bpx, b3, bpx, n);
#else
  cy = mpn_lshift (bpx, b0, n, 1);
  cy += mpn_add_n (bpx, bpx, b1, n);
  cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
  cy += mpn_add_n (bpx, bpx, b2, n);
  cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
  bpx[n] = cy + mpn_add (bpx, bpx, n, b3, t);
#endif

  ASSERT (apx[n] < 15);
  ASSERT (bpx[n] < 15);

  TOOM44_MUL_N_REC (vh, apx, bpx, n + 1, tp);	/* vh,  2n+1 limbs */

  /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3.  */
  flags = (enum toom7_flags) (flags | toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp));

  /* Compute bpx = b0 + b1 + b2 + b3 bnd bmx = b0 - b1 + b2 - b3.  */
  flags = (enum toom7_flags) (flags ^ toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (bpx, bmx, bp, n, t, tp));

  TOOM44_MUL_N_REC (vm1, amx, bmx, n + 1, tp);	/* vm1,  2n+1 limbs */
  /* Clobbers amx, bmx. */
  TOOM44_MUL_N_REC (v1, apx, bpx, n + 1, tp);	/* v1,  2n+1 limbs */

  TOOM44_MUL_N_REC (v0, a0, b0, n, tp);
  if (s > t)
    mpn_mul (vinf, a3, s, b3, t);
  else
    TOOM44_MUL_N_REC (vinf, a3, b3, s, tp);	/* vinf, s+t limbs */

  mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t, tp);
}
예제 #13
0
void
mpn_toom42_mul (mp_ptr c, mp_srcptr a, mp_size_t an, mp_srcptr b, mp_size_t bn, mp_ptr t)
{
  mp_size_t k, k1, kk1, r, r2, twok, threek, rr2, n1, n2;
  mp_limb_t cy, cc, saved, vinf0, c20, c21;
  mp_ptr trec;
  int sa, sb;
  mp_ptr c1, c2, c3, c4, t1, t2, t3, t4;

  ASSERT(GMP_NUMB_BITS >= 6);

  k = (an + 3) / 4; /* ceil(an/4) */
  ASSERT(bn > k);
  ASSERT(bn <= 2*k);
  ASSERT(an >= 20);
  
  twok = 2 * k;
  threek = 3 * k;
  k1 = k + 1;
  kk1 = k + k1;
  r = an - threek;   /* last chunk */
  r2 = bn - k;   /* last chunk */
  rr2 = r + r2;

  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  
  t1 = t + k;
  t2 = t1 + k;
  t3 = t2 + k;
  t4 = t3 + k;

  trec = t + 4 * k + 4; 

  /* put a0+a2 in {t, k+1}, and b0+b1 in {t1 + 1, k+1};
     put a1+a3 in {t3+3, k+1}, put a0+a1+a2+a3 in {t2 + 2, k+1} 
  */
  t[k] = mpn_add_n (t, a, a + twok, k);
  
  t4[3] = mpn_add_n (t3 + 3, a + k, a + threek, r);
  if (k > r)
     t4[3] = mpn_add_1(t3 + r + 3, a + k + r, k - r, t4[3]);

  mpn_add_n (t2 + 2, t, t3 + 3, k1);

  t2[1] = mpn_add_n (t1 + 1, b, b + k, r2);
  if (k > r2)
	  t2[1] = mpn_add_1(t1 + 1 + r2, b + r2, k - r2, t2[1]);

  /* compute v1 := (a0+a1+a2+a3)*(b0+b1) in {c2, 2k+1};
     since v1 < 6*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 3 */
  TOOM3_MUL_REC (c2, t2 + 2, t1 + 1, k1, trec);

  ASSERT(c2[k+k] < 6);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
					v1
  */

  /* put |a0-a1+a2-a3| in {t2 + 2, k+1} and |b0-b1| in {t3 + 3,k+1} */
  /* sa = sign(a0-a1+a2-a3) */
  /* sb = sign(b0-b1) */
  sa = mpn_cmp (t, t3 + 3, k1);
  if (sa >= 0) mpn_sub_n (t2 + 2, t, t3 + 3, k1);
  else mpn_sub_n (t2 + 2, t3 + 3, t, k1);

  n1 = k;
  n2 = r2;
  MPN_NORMALIZE(b, n1);
  MPN_NORMALIZE(b+k, n2);
  if (n1 != n2) sb = (n1 > n2) ? 1 : -1;
  else sb = mpn_cmp (b, b + k, n2);

  if (sb >= 0) 
  {
	  t4[3] = mpn_sub_n (t3 + 3, b, b + k, r2);
	  if (k > r2) t4[3] = -mpn_sub_1(t3 + 3 + r2, b + r2, k - r2, t4[3]);
  } else
  {
	  mpn_sub_n (t3 + 3, b + k, b, r2);
	  MPN_ZERO(t3 + r2 + 3, k1 - r2);
  }
  sa *= sb; /* sign of vm1 */

  /* compute vm1 := (a0-a1+a2-a3)*(b0-b1) in {t, 2k+1};
     since |vm1| < 2*B^(2k), vm1 uses only 2k+1 limbs */
  TOOM3_MUL_REC (t, t2 + 2, t3 + 3, k1, trec);

  ASSERT(t[k+k] < 2);
  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
					v1

	  {t, 2k+1} {t+2k+1, 2k + 1}
	     vm1
  */

  c20 = c2[0]; /* save c2[0] and c2[1] giving space 2k+2 at c */
  c21 = c2[1];

  /* 
     compute a0+2a1+4a2+8a3 in {c, k+1} and b0+2b1 in {c1 + 1, k+1}
  */
#if HAVE_NATIVE_mpn_addlsh1_n
  c1[0] = mpn_addlsh1_n (c, a + twok, a + threek, r);
  if (r < k)
    {
      c1[0] = mpn_add_1 (c + r, a + twok + r, k - r, c1[0]);
    }
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a + k, c, k);
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
  
  c2[1] = mpn_addlsh1_n (c1 + 1, b, b + k, r2);
  if (r2 < k)
    {
		 c2[1] = mpn_add_1(c1 + 1 + r2, b + r2, k - r2, c2[1]);
    }
#else
  c[r] = mpn_lshift1 (c, a + threek, r);
  if (r < k)
    {
      MPN_ZERO(c + r + 1, k - r);
    }
  c1[0] += mpn_add_n (c, c, a + twok, k);
  mpn_double (c, k1);
  c1[0] += mpn_add_n (c, c, a + k, k);
  mpn_double (c, k1);
  c1[0] += mpn_add_n (c, c, a, k);
  
  c1[r2 + 1] = mpn_lshift1 (c1 + 1, b + k, r2);
  if (r2 < k)
    {
      MPN_ZERO(c1 + r2 + 2, k - r2);
    }
  c2[1] += mpn_add_n (c1 + 1, c1 + 1, b, k);
#endif

#define v2 (t+2*k+1)

  /* compute v2 := (a0+2a1+4a2+8a3)*(b0+2b1) in {t+2k+1, 2k+1}
     v2 < 45*B^k so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */
  TOOM3_MUL_REC (v2, c, c1 + 1, k1, trec);

  ASSERT(v2[k+k] < 45);
  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
					v1

	  {t, 2k+1} {t+2k+1, 2k + 1}
	     vm1        v2
  */

  c2[0] = c20; /* restore c2[0] and c2[1] */
  c2[1] = c21;

  /* compute v0 := a0*b0 in {c, 2k} */
  TOOM3_MUL_REC (c, a, b, k, trec);

 /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
		v0 		v1

	  {t, 2k+1} {t+2k+1, 2k + 1}
	     vm1        v2
  */

#define vinf (c+4*k)

  /* compute vinf := a3*b1 in {c4, r + r2},
  */
  saved = c4[0];

  if (r == r2) TOOM3_MUL_REC (c4, a + threek, b + k, r, trec);
  else if (r > r2) mpn_mul(c4, a + threek, r, b + k, r2);
  else mpn_mul(c4, b + k, r2, a + threek, r);

  vinf0 = c4[0];
  c4[0] = saved;

 /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
		v0 		v1        {-}vinf

	  {t, 2k+1} {t+2k+1, 2k + 1}
	     vm1        v2

	  vinf0 = {-}
  */

  mpn_toom3_interpolate (c, c2, v2, t, vinf, k, rr2, sa, vinf0, t4+2);

#undef v2
#undef vinf
}
예제 #14
0
void
mpn_toom3_mul (mp_ptr c, mp_srcptr a, mp_size_t an, mp_srcptr b, mp_size_t bn, mp_ptr t)
{
  mp_size_t k, k1, kk1, r, r2, twok, rr2;
  mp_limb_t cy, cc, saved, vinf0, c20, c21;
  mp_ptr trec;
  int sa, sb;
  mp_ptr c1, c2, c3, c4, t1, t2, t3, t4;

  ASSERT(GMP_NUMB_BITS >= 6);

  k = (an + 2) / 3; /* ceil(an/3) */
  ASSERT(bn > 2*k);
  ASSERT(an >= 20);
  
  twok = 2 * k;
  k1 = k + 1;
  kk1 = k + k1;
  r = an - twok;   /* last chunk */
  r2 = bn - twok;   /* last chunk */
  rr2 = r + r2;

  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  
  t1 = t + k;
  t2 = t1 + k;
  t3 = t2 + k;
  t4 = t3 + k;

  trec = t + 4 * k + 4; 

  /* put a0+a2 in {t, k+1}, and b0+b2 in {t1 + 1, k+1};
     put a0+a1+a2 in {t2 + 2, k+1} and b0+b1+b2 in {t3 + 3,k+1}
  */
  cy = mpn_add_n (t, a, a + twok, r);
  cc = mpn_add_n (t1 + 1, b, b + twok, r2);
  if (r < k)
    {
      __GMPN_ADD_1 (cy, t + r, a + r, k - r, cy);
    }
  if (r2 < k)
    {
	  __GMPN_ADD_1 (cc, t1 + 1 + r2, b + r2, k - r2, cc);
    }
  t3[2] = (t1[0] = cy) + mpn_add_n (t2 + 2, t, a + k, k);
  t4[3] = (t2[1] = cc) + mpn_add_n (t3 + 3, t1 + 1, b + k, k);

  /* compute v1 := (a0+a1+a2)*(b0+b1+b2) in {c2, 2k+1};
     since v1 < 9*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 4 */
  TOOM3_MUL_REC (c2, t2 + 2, t3 + 3, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
					v1
  */

  /* put |a0-a1+a2| in {t2 + 2, k+1} and |b0-b1+b2| in {t3 + 3,k+1} */
  /* sa = sign(a0-a1+a2) */
  /* sb = sign(b0-b1+b2) */
  sa = (t[k] != 0) ? 1 : mpn_cmp (t, a + k, k);
  t3[2] = (sa >= 0) ? t[k] - mpn_sub_n (t2 + 2, t, a + k, k)
		   : mpn_sub_n (t2 + 2, a + k, t, k);
  /* b0+b2 is in {c+k+1, k+1} now */
  sb = (t2[1] != 0) ? 1 : mpn_cmp (t1 + 1, b + k, k);
  t4[3] = (sb >= 0) ? t2[1] - mpn_sub_n (t3 + 3, t1 + 1, b + k, k)
		    : mpn_sub_n (t3 + 3, b + k, t1 + 1, k);
  sa *= sb; /* sign of vm1 */

  /* compute vm1 := (a0-a1+a2)*(b0-b1+b2) in {t, 2k+1};
     since |vm1| < 4*B^(2k), vm1 uses only 2k+1 limbs */
  TOOM3_MUL_REC (t, t2 + 2, t3 + 3, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
					v1

	  {t, 2k+1} {t+2k+1, 2k + 1}
	     vm1
  */

  c20 = c2[0]; /* save c2[0] and c2[1] giving space 2k+2 at c */
  c21 = c2[1];

  /* 
     compute a0+2a1+4a2 in {c, k+1} and b0+2b1+4b2 in {c1 + 1, k+1}
  */
#if HAVE_NATIVE_mpn_addlsh1_n
  c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
  c2[1] = mpn_addlsh1_n (c1 + 1, b + k, b + twok, r2);
  if (r < k)
    {
      __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
    }
  if (r2 < k)
    {__GMPN_ADD_1 (c2[1], c1 + 1 + r2, b + k + r2, k - r2, c2[1]);
    }
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
  c2[1] = 2 * c2[1] + mpn_addlsh1_n (c1 + 1, b, c1 + 1, k);
#else
  c[r] = mpn_lshift1 (c, a + twok, r);
  c1[r2 + 1] = mpn_lshift1 (c1 + 1, b + twok, r2);
  if (r  < k)
    {
      MPN_ZERO(c + r + 1, k - r);
    }
  if (r2 < k)
    {
      MPN_ZERO(c1 + r2 + 2, k - r2);
    }
  c1[0] += mpn_add_n (c, c, a + k, k);
  c2[1] += mpn_add_n (c1 + 1, c1 + 1, b + k, k);
  mpn_double (c, k1);
  mpn_double (c1 + 1, k1);
  c1[0] += mpn_add_n (c, c, a, k);
  c2[1] += mpn_add_n (c1 + 1, c1 + 1, b, k);
#endif

#define v2 (t+2*k+1)

  /* compute v2 := (a0+2a1+4a2)*(b0+2b1+4b2) in {t+2k+1, 2k+1}
     v2 < 49*B^k so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */
  TOOM3_MUL_REC (v2, c, c1 + 1, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
					v1

	  {t, 2k+1} {t+2k+1, 2k + 1}
	     vm1        v2
  */

  c2[0] = c20; /* restore c2[0] and c2[1] */
  c2[1] = c21;

  /* compute v0 := a0*b0 in {c, 2k} */
  TOOM3_MUL_REC (c, a, b, k, trec);

 /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
		v0 		v1

	  {t, 2k+1} {t+2k+1, 2k + 1}
	     vm1        v2
  */

#define vinf (c+4*k)

  /* compute vinf := a2*b2 in {c4, r + r2},
  */
  saved = c4[0];

  if (r == r2) TOOM3_MUL_REC (c4, a + twok, b + twok, r, trec);
  else if (r > r2) mpn_mul(c4, a + twok, r, b + twok, r2);
  else mpn_mul(c4, b + twok, r2, a + twok, r);

  vinf0 = c4[0];
  c4[0] = saved;

 /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
		v0 		v1        {-}vinf

	  {t, 2k+1} {t+2k+1, 2k + 1}
	     vm1        v2

	  vinf0 = {-}
  */

  mpn_toom3_interpolate (c, c2, v2, t, vinf, k, rr2, sa, vinf0, t4+2);

#undef v2
#undef vinf
}
예제 #15
0
/* The necessary temporary space T(n) satisfies T(n)=0 for n < THRESHOLD,
   and T(n) <= max(2n+2, 6k+3, 4k+3+T(k+1)) otherwise, where k = ceil(n/3).

   Assuming T(n) >= 2n, 6k+3 <= 4k+3+T(k+1).
   Similarly, 2n+2 <= 6k+2 <= 4k+3+T(k+1).

   With T(n) = 2n+S(n), this simplifies to S(n) <= 9 + S(k+1).
   Since THRESHOLD >= 17, we have n/(k+1) >= 19/8
   thus S(n) <= S(n/(19/8)) + 9 thus S(n) <= 9*log(n)/log(19/8) <= 8*log2(n).

   We need in addition 2*r for mpn_sublsh1_n, so the total is at most
   8/3*n+8*log2(n).
*/
void
mpn_toom3_mul_n (mp_ptr c, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr t)
{
  mp_size_t k, k1, kk1, r, twok, rr2;
  mp_limb_t cy, cc, saved, vinf0;
  mp_ptr trec;
  int sa, sb;
  mp_ptr c1, c2, c3, c4, c5, t1, t2, t3, t4;

  ASSERT(GMP_NUMB_BITS >= 6);

  k = (n + 2) / 3; /* ceil(n/3) */
  ASSERT(GMP_NUMB_BITS >= 6);
  ASSERT(n >= 17); /* so that r <> 0 and 5k+3 <= 2n */

  twok = 2 * k;
  k1 = k + 1;
  kk1 = k + k1;
  r = n - twok;   /* last chunk */
  rr2 = 2*r;

  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  c5 = c4 + k;
  
  t1 = t + k;
  t2 = t1 + k;
  t3 = t2 + k;
  t4 = t3 + k;

  trec = t + 4 * k + 4; 

  /* put a0+a2 in {c, k+1}, and b0+b2 in {c4 + 2, k+1};
     put a0+a1+a2 in {t2 + 1, k+1} and b0+b1+b2 in {t3 + 2,k+1}
  */
  c1[0] = mpn_add_n (c, a, a + twok, r);
  c5[2] = mpn_add_n (c4 + 2, b, b + twok, r);
  if (r < k)
    {
      c1[0] = mpn_add_1 (c + r, a + r, k - r, c1[0]);
	   c5[2] = mpn_add_1 (c4 + 2 + r, b + r, k - r, c5[2]);
    }
  t3[1] = c1[0] + mpn_add_n (t2 + 1, c, a + k, k);
  t4[2] = c5[2] + mpn_add_n (t3 + 2, c4 + 2, b + k, k);

  ASSERT(c1[0] < 2);
  ASSERT(c5[2] < 2);
  ASSERT(t3[1] < 3);
  ASSERT(t4[2] < 3);

  /* compute v1 := (a0+a1+a2)*(b0+b1+b2) in {c2, 2k+1};
     since v1 < 9*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 4 */
  TOOM3_MUL_REC (c2, t2 + 1, t3 + 2, k1, trec);

  ASSERT(c2[k+k] < 9);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
					v1
  */

  /* put |a0-a1+a2| in {c,k+1} and |b0-b1+b2| in {c4 + 2,k+1} */
  /* sa = sign(a0-a1+a2) */
  /* sb = sign(b0-b1+b2) */
  sa = (c[k] != 0) ? 1 : mpn_cmp (c, a + k, k);
  c[k] = (sa >= 0) ? c[k] - mpn_sub_n (c, c, a + k, k)
		   : mpn_sub_n (c, a + k, c, k);
  /* b0+b2 is in {c4+2, k+1} now */
  sb = (c5[2] != 0) ? 1 : mpn_cmp (c4 + 2, b + k, k);
  c5[2] = (sb >= 0) ? c5[2] - mpn_sub_n (c4 + 2, c4 + 2, b + k, k)
		    : mpn_sub_n (c4 + 2, b + k, c4 + 2, k);
  
  ASSERT(c[k] < 2);
  ASSERT(c5[2] < 2);

  sa *= sb; /* sign of vm1 */

  /* compute vm1 := (a0-a1+a2)*(b0-b1+b2) in {t, 2k+1};
     since |vm1| < 4*B^(2k), vm1 uses only 2k+1 limbs */
  TOOM3_MUL_REC (t, c, c4 + 2, k1, trec);

  ASSERT(t[k+k] < 4);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
					v1

	  {t, 2k+1} {t+2k+1, 2k + 1}
	     vm1
  */

  /* 
     compute a0+2a1+4a2 in {c, k+1} and b0+2b1+4b2 in {c4 + 2, k+1}
  */
#if HAVE_NATIVE_mpn_addlsh1_n
  c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
  c5[2] = mpn_addlsh1_n (c4 + 2, b + k, b + twok, r);
  if (r < k)
    {
      c1[0] = mpn_add_1(c + r, a + k + r, k - r, c1[0]);
      c5[2] = mpn_add_1(c4 + 2 + r, b + k + r, k - r, c5[2]);
    }
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
  c5[2] = 2 * c5[2] + mpn_addlsh1_n (c4 + 2, b, c4 + 2, k);
#else
  c[r] = mpn_lshift1 (c, a + twok, r);
  c4[r + 2] = mpn_lshift1 (c4 + 2, b + twok, r);
  if (r < k)
    {
      MPN_ZERO(c + r + 1, k - r);
      MPN_ZERO(c4 + r + 3, k - r);
    }
  c1[0] += mpn_add_n (c, c, a + k, k);
  c5[2] += mpn_add_n (c4 + 2, c4 + 2, b + k, k);
  mpn_double (c, k1);
  mpn_double (c4 + 2, k1);
  c1[0] += mpn_add_n (c, c, a, k);
  c5[2] += mpn_add_n (c4 + 2, c4 + 2, b, k);
#endif

  ASSERT(c[k] < 7);
  ASSERT(c5[2] < 7);

#define v2 (t+2*k+1)

  /* compute v2 := (a0+2a1+4a2)*(b0+2b1+4b2) in {t+2k+1, 2k+1}
     v2 < 49*B^k so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */
  TOOM3_MUL_REC (v2, c, c4 + 2, k1, trec);

  ASSERT(v2[k+k] < 49);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
					v1

	  {t, 2k+1} {t+2k+1, 2k + 1}
	     vm1        v2
  */

  /* compute v0 := a0*b0 in {c, 2k} */
  TOOM3_MUL_REC (c, a, b, k, trec);

 /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
		v0 		v1

	  {t, 2k+1} {t+2k+1, 2k + 1}
	     vm1        v2
  */

#define vinf (c+4*k)

  /* compute vinf := a2*b2 in {c4, r + r2},
  */
  saved = c4[0];

  TOOM3_MUL_REC (c4, a + twok, b + twok, r, trec);
  
  vinf0 = c4[0];
  c4[0] = saved;
  
 /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
		v0 		v1        {-}vinf

	  {t, 2k+1} {t+2k+1, 2k + 1}
	     vm1        v2

	  vinf0 = {-}
  */

  mpn_toom3_interpolate (c, c2, v2, t, vinf, k, rr2, sa, vinf0, t4+2);

#undef v2
#undef vinf
}
예제 #16
0
void
mpn_toom3_sqr_n (mp_ptr c, mp_srcptr a, mp_size_t n, mp_ptr t)
{
  mp_size_t k, k1, kk1, r, twok, rr2;
  mp_limb_t cy, cc, saved, vinf0;
  mp_ptr trec;
  int sa;
  mp_ptr c1, c2, c3, c4, c5, t1, t2, t3, t4;

  ASSERT(GMP_NUMB_BITS >= 6);

  k = (n + 2) / 3; /* ceil(n/3) */
  ASSERT(GMP_NUMB_BITS >= 6);
  ASSERT(n >= 17); /* so that r <> 0 and 5k+3 <= 2n */

  twok = 2 * k;
  k1 = k + 1;
  kk1 = k + k1;
  r = n - twok;   /* last chunk */
  rr2 = 2*r;

  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  c5 = c4 + k;
  
  t1 = t + k;
  t2 = t1 + k;
  t3 = t2 + k;
  t4 = t3 + k;

  trec = t + 4 * k + 3; 

  /* put a0+a2 in {c, k+1}
     put a0+a1+a2 in {t2 + 1, k+1}
  */
  cy = mpn_add_n (c, a, a + twok, r);
  if (r < k)
    {
      __GMPN_ADD_1 (cy, c + r, a + r, k - r, cy);
   }
  t3[1] = (c1[0] = cy) + mpn_add_n (t2 + 1, c, a + k, k);

  /* compute v1 := (a0+a1+a2)^2 in {c2, 2k+1};
     since v1 < 9*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 4 */
  TOOM3_SQR_REC (c2, t2 + 1, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
					v1
  */

  /* put |a0-a1+a2| in {c,k+1} */
  sa = (c[k] != 0) ? 1 : mpn_cmp (c, a + k, k);
  c[k] = (sa >= 0) ? c[k] - mpn_sub_n (c, c, a + k, k)
		   : mpn_sub_n (c, a + k, c, k);
  
  /* compute vm1 := (a0-a1+a2)^2 in {t, 2k+1};
     since |vm1| < 4*B^(2k), vm1 uses only 2k+1 limbs */
  TOOM3_SQR_REC (t, c, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
					v1

	  {t, 2k+1} {t+2k+1, 2k + 1}
	     vm1
  */

  /* 
     compute a0+2a1+4a2 in {c, k+1}
  */
#if HAVE_NATIVE_mpn_addlsh1_n
  c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
  if (r < k)
    {
      __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
    }
  c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
#else
  c[r] = mpn_lshift1 (c, a + twok, r);
  if (r < k)
    {
      MPN_ZERO(c + r + 1, k - r);
    }
  c1[0] += mpn_add_n (c, c, a + k, k);
  mpn_double (c, k1);
  c1[0] += mpn_add_n (c, c, a, k);
#endif

#define v2 (t+2*k+1)

  /* compute v2 := (a0+2a1+4a2)^2 in {t+2k+1, 2k+1}
     v2 < 49*B^k so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */
  TOOM3_SQR_REC (v2, c, k1, trec);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
					v1

	  {t, 2k+1} {t+2k+1, 2k + 1}
	     vm1        v2
  */

  /* compute v0 := a0^2 in {c, 2k} */
  TOOM3_SQR_REC (c, a, k, trec);

 /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
		v0 		v1

	  {t, 2k+1} {t+2k+1, 2k + 1}
	     vm1        v2
  */

#define vinf (c+4*k)

  /* compute vinf := a2*b2 in {c4, r + r2},
  */
  saved = c4[0];

  TOOM3_SQR_REC (c4, a + twok, r, trec);
  
  vinf0 = c4[0];
  c4[0] = saved;

 /* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1} 
		v0 		v1        {-}vinf

	  {t, 2k+1} {t+2k+1, 2k + 1}
	     vm1        v2

	  vinf0 = {-}
  */

  mpn_toom3_interpolate (c, c2, v2, t, vinf, k, rr2, 1, vinf0, t4+2);

#undef v2
#undef vinf
}