Example 1
void
mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (n == 1)
    invert_limb (*ip, *dp);
  else if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD))
    {
	/* Maximum scratch needed by this branch: 2*n */
	mp_size_t i;
	mp_ptr xp;

	xp = scratch;				/* 2 * n limbs */
	/* n > 1 here */
	i = n;
	do
	  xp[--i] = GMP_NUMB_MAX;
	while (i);
	mpn_com (xp + n, dp, n);
	if (n == 2) {
	  mpn_divrem_2 (ip, 0, xp, 4, dp);
	} else {
	  gmp_pi1_t inv;
	  invert_pi1 (inv, dp[n-1], dp[n-2]);
	  /* FIXME: should we use dcpi1_div_q, for big sizes? */
	  mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32);
	}
    }
  else { /* Use approximated inverse; correct the result if needed. */
      mp_limb_t e; /* The possible error in the approximate inverse */

      ASSERT ( mpn_invert_itch (n) >= mpn_invertappr_itch (n) );
      e = mpn_ni_invertappr (ip, dp, n, scratch);

      if (UNLIKELY (e)) { /* Assume the error can only be "0" (no error) or "1". */
	/* Code to detect and correct the "off by one" approximation. */
	mpn_mul_n (scratch, ip, dp, n);
	e = mpn_add_n (scratch, scratch, dp, n); /* FIXME: we only need e.*/
	if (LIKELY(e)) /* The high part can not give a carry by itself. */
	  e = mpn_add_nc (scratch + n, scratch + n, dp, n, e); /* FIXME:e */
	/* If the value was wrong (no carry), correct it (increment). */
	e ^= CNST_LIMB (1);
	MPN_INCR_U (ip, n, e);
      }
  }
}
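
A minimal caller sketch for the routine above (illustrative, not part of the
original example). It assumes the internal GMP header gmp-impl.h, which
declares mpn_invert, mpn_invert_itch and the TMP_* scratch allocators; the
name invert_demo is made up here.

static void
invert_demo (mp_ptr ip, mp_srcptr dp, mp_size_t n)
{
  mp_ptr scratch;
  TMP_DECL;

  ASSERT (n > 0);
  ASSERT (dp[n - 1] & GMP_NUMB_HIGHBIT);	/* divisor must be normalized */

  TMP_MARK;
  scratch = TMP_ALLOC_LIMBS (mpn_invert_itch (n));
  mpn_invert (ip, dp, n, scratch);		/* {ip,n} gets the inverse */
  TMP_FREE;
}
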
void
mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
{
  mp_limb_t cy;
  mp_size_t n3;
  mp_size_t n3p1;
  n3 = 3 * n;
  n3p1 = n3 + 1;

#define   r4    (pp + n3)			/* 3n+1 */
#define   r2    (pp + 7 * n)			/* 3n+1 */
#define   r0    (pp +11 * n)			/* s+t <= 2*n */

  /******************************* interpolation *****************************/
  if (half != 0) {
    cy = mpn_sub_n (r3, r3, r0, spt);
    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);

    cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);

    cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
    MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
  }

  r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
#else
  ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
  mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
  MP_PTR_SWAP(r1, wsi);
#endif

  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
#else
  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
  MP_PTR_SWAP(r5, wsi);
#endif

  r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);

#if AORSMUL_FASTER_AORS_AORSLSH
  mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
#else
  mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
  DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
#endif
  /* A division by 2835x4 follows. Warning: the operand can be negative! */
  mpn_divexact_by2835x4(r4, r4, n3p1);
  if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
    r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));

#if AORSMUL_FASTER_2AORSLSH
  mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
#else
  DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
  DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
#endif
  mpn_divexact_by255(r5, r5, n3p1);

  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));

#if AORSMUL_FASTER_3AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
#else
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
#endif
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
  mpn_divexact_by42525(r1, r1, n3p1);

#if AORSMUL_FASTER_AORS_2AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
#else
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
  ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
#endif
  mpn_divexact_by9x4(r2, r2, n3p1);

  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));

  mpn_sub_n (r4, r2, r4, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));

  mpn_add_n (r5, r5, r1, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));

  /* last interpolation steps... */
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
  /* ... could be mixed with recomposition
	||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
  */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp

    summation scheme for remaining operations:
    |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
  */

  cy = mpn_add_n (pp + n, pp + n, r5, n);
  cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
#if HAVE_NATIVE_mpn_add_nc
  cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
#else
  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
  cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
#endif
  MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);

  pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
  cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
#else
  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
  cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);

  pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
  if (half) {
    cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
#if HAVE_NATIVE_mpn_add_nc
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
    }
#else
    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
    }
#endif
  } else {
    ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
  }

#undef   r0
#undef   r2
#undef   r4
}
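
The interpolation above relies on the DO_mpn_sublsh_n and DO_mpn_subrsh
helpers defined earlier in the same file. As a hedged point of reference, a
fallback definition of DO_mpn_sublsh_n for builds without a native
mpn_sublsh_n could look like the following sketch; the upstream definition
may differ in detail.

static mp_limb_t
DO_mpn_sublsh_n (mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s,
		 mp_ptr ws)
{
  mp_limb_t cy;

  /* dst -= src << s; the return value combines the bits shifted out of
     the top limb with the borrow from the subtraction.  */
  cy = mpn_lshift (ws, src, n, s);
  return cy + mpn_sub_n (dst, dst, ws, n);
}
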
Example 3
/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1)
 *
 * The result is expected to be ZERO if and only if one of the operands
 * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
 * B^rn-1. This should not be a problem if mulmod_bnm1 is used to
 * combine results and obtain a natural number when one knows in
 * advance that the final value is less than (B^rn-1).
 * Moreover it should not be a problem if mulmod_bnm1 is used to
 * compute the full product with an+bn <= rn, because this condition
 * implies (B^an-1)(B^bn-1) < (B^rn-1) .
 *
 * Requires 0 < bn <= an <= rn and an + bn > rn/2
 * Scratch need: rn + (need for recursive call OR rn + 4). This gives
 *
 * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4
 */
void
mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp)
{
  ASSERT (0 < bn);
  ASSERT (bn <= an);
  ASSERT (an <= rn);

  if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD))
    {
      if (UNLIKELY (bn < rn))
	{
	  if (UNLIKELY (an + bn <= rn))
	    {
	      mpn_mul (rp, ap, an, bp, bn);
	    }
	  else
	    {
	      mp_limb_t cy;
	      mpn_mul (tp, ap, an, bp, bn);
	      cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn);
	      MPN_INCR_U (rp, rn, cy);
	    }
	}
      else
	mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp);
    }
  else
    {
      mp_size_t n;
      mp_limb_t cy;
      mp_limb_t hi;

      n = rn >> 1;

      /* We need at least an + bn >= n, to be able to fit one of the
	 recursive products at rp. Requiring strict inequality makes
	 the code slightly simpler. If desired, we could avoid this
	 restriction by initially halving rn as long as rn is even and
	 an + bn <= rn/2. */

      ASSERT (an + bn > n);

      /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
	 and crt together as

	 x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
      */
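
      /* Illustrative worked instance of the identity above (added for
	 this sketch, with small numbers B = 10, n = 1): for x = 57 we
	 get xm = 57 mod 9 = 3 and xp = 57 mod 11 = 2, hence
	 (xp + xm)/2 mod 9 = (2 + 3 + 9)/2 = 7 (the modulus is added
	 because 2 + 3 is odd), and -xp * B^n + (B^n + 1) * 7
	 = -20 + 77 = 57 as expected.  */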

#define a0 ap
#define a1 (ap + n)
#define b0 bp
#define b1 (bp + n)

#define xp  tp	/* 2n + 2 */
      /* am1  maybe in {xp, n} */
      /* bm1  maybe in {xp + n, n} */
#define sp1 (tp + 2*n + 2)
      /* ap1  maybe in {sp1, n + 1} */
      /* bp1  maybe in {sp1 + n + 1, n + 1} */

      {
	mp_srcptr am1, bm1;
	mp_size_t anm, bnm;
	mp_ptr so;

	bm1 = b0;
	bnm = bn;
	if (LIKELY (an > n))
	  {
	    am1 = xp;
	    cy = mpn_add (xp, a0, n, a1, an - n);
	    MPN_INCR_U (xp, n, cy);
	    anm = n;
	    so = xp + n;
	    if (LIKELY (bn > n))
	      {
		bm1 = so;
		cy = mpn_add (so, b0, n, b1, bn - n);
		MPN_INCR_U (so, n, cy);
		bnm = n;
		so += n;
	      }
	  }
	else
	  {
	    so = xp;
	    am1 = a0;
	    anm = an;
	  }

	mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so);
      }

      {
	int       k;
	mp_srcptr ap1, bp1;
	mp_size_t anp, bnp;

	bp1 = b0;
	bnp = bn;
	if (LIKELY (an > n)) {
	  ap1 = sp1;
	  cy = mpn_sub (sp1, a0, n, a1, an - n);
	  sp1[n] = 0;
	  MPN_INCR_U (sp1, n + 1, cy);
	  anp = n + ap1[n];
	  if (LIKELY (bn > n)) {
	    bp1 = sp1 + n + 1;
	    cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n);
	    sp1[2*n+1] = 0;
	    MPN_INCR_U (sp1 + n + 1, n + 1, cy);
	    bnp = n + bp1[n];
	  }
	} else {
	  ap1 = a0;
	  anp = an;
	}

	if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
	  k=0;
	else
	  {
	    int mask;
	    k = mpn_fft_best_k (n, 0);
	    mask = (1<<k) - 1;
	    while (n & mask) {k--; mask >>=1;};
	  }
	if (k >= FFT_FIRST_K)
	  xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k);
	else if (UNLIKELY (bp1 == b0))
	  {
	    ASSERT (anp + bnp <= 2*n+1);
	    ASSERT (anp + bnp > n);
	    ASSERT (anp >= bnp);
	    mpn_mul (xp, ap1, anp, bp1, bnp);
	    anp = anp + bnp - n;
	    ASSERT (anp <= n || xp[2*n]==0);
	    anp-= anp > n;
	    cy = mpn_sub (xp, xp, n, xp + n, anp);
	    xp[n] = 0;
	    MPN_INCR_U (xp, n+1, cy);
	  }
	else
	  mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp);
      }

      /* Here the CRT recomposition begins.

	 xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
	 Division by 2 is a bitwise rotation.

	 Assumes xp normalised mod (B^n+1).

	 The residue class [0] is represented by [B^n-1]; except when
	 both input are ZERO.
      */
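
      /* Illustrative example of the halving step (added for this
	 sketch): mod B^n - 1 an odd x is halved as (x + B^n - 1)/2,
	 which for the binary limb base amounts to shifting x right one
	 bit and rotating its low bit into the top position; e.g. with
	 B = 10, n = 1: 5/2 mod 9 = (5 + 9)/2 = 7.  */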

#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
#if HAVE_NATIVE_mpn_rsh1add_nc
      cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
      hi = cy << (GMP_NUMB_BITS - 1);
      cy = 0;
      /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
	 overflows, i.e. a further increment will not overflow again. */
#else /* ! _nc */
      cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* cy = 1 only if xp[n] = 1, i.e. {xp,n} = ZERO; this implies that
	 the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
#endif
#if GMP_NAIL_BITS == 0
      add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
#else
      cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
      rp[n-1] ^= hi;
#endif
#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
#if HAVE_NATIVE_mpn_add_nc
      cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
#else /* ! _nc */
      cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
#endif
      cy += (rp[0]&1);
      mpn_rshift(rp, rp, n, 1);
      ASSERT (cy <= 2);
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* We can have cy != 0 only if hi = 0... */
      ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
      rp[n-1] |= hi;
      /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
#endif
      ASSERT (cy <= 1);
      /* Next increment can not overflow, read the previous comments about cy. */
      ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
      MPN_INCR_U(rp, n, cy);

      /* Compute the highest half:
	 ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
       */
      if (UNLIKELY (an + bn < rn))
	{
	  /* Note that in this case, the only way the result can equal
	     zero mod B^{rn} - 1 is if one of the inputs is zero, and
	     then the output of both the recursive calls and this CRT
	     reconstruction is zero, not B^{rn} - 1. Which is good,
	     since the latter representation doesn't fit in the output
	     area.*/
	  cy = mpn_sub_n (rp + n, rp, xp, an + bn - n);

	  /* FIXME: This subtraction of the high parts is not really
	     necessary, we do it to get the carry out, and for sanity
	     checking. */
	  cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n,
				   xp + an + bn - n, rn - (an + bn), cy);
	  ASSERT (an + bn == rn - 1 ||
		  mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn)));
	  cy = mpn_sub_1 (rp, rp, an + bn, cy);
	  ASSERT (cy == (xp + an + bn - n)[0]);
	}
      else
	{
	  cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
	  /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
	     DECR will affect _at most_ the lowest n limbs. */
	  MPN_DECR_U (rp, 2*n, cy);
	}
#undef a0
#undef a1
#undef b0
#undef b1
#undef xp
#undef sp1
    }
}
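
A minimal caller sketch for mpn_mulmod_bnm1, based only on the requirements
stated in its header comment (0 < bn <= an <= rn and an + bn > rn/2) and on
the 2*rn + 4 scratch bound derived there. The name mulmod_bnm1_demo and the
TMP_* allocation are illustrative; a real caller would normally size the
scratch with the dedicated itch macro rather than this bound.

static void
mulmod_bnm1_demo (mp_ptr rp, mp_size_t rn,
		  mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn)
{
  mp_ptr tp;
  TMP_DECL;

  ASSERT (0 < bn && bn <= an && an <= rn);
  ASSERT (an + bn > rn / 2);

  TMP_MARK;
  tp = TMP_ALLOC_LIMBS (2 * rn + 4);	/* S(n) <= 2*rn + 4, see above */
  mpn_mulmod_bnm1 (rp, rn, ap, an, bp, bn, tp);
  TMP_FREE;
}
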
Example 4
void
check (void)
{
  mp_limb_t  wp[100], xp[100], yp[100];
  mp_size_t  size = 100;

  refmpn_zero (xp, size);
  refmpn_zero (yp, size);
  refmpn_zero (wp, size);

  pre ("mpn_add_n");
  mpn_add_n (wp, xp, yp, size);
  post ();

#if HAVE_NATIVE_mpn_add_nc
  pre ("mpn_add_nc");
  mpn_add_nc (wp, xp, yp, size, CNST_LIMB(0));
  post ();
#endif

#if HAVE_NATIVE_mpn_addlsh1_n
  pre ("mpn_addlsh1_n");
  mpn_addlsh1_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_and_n
  pre ("mpn_and_n");
  mpn_and_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_andn_n
  pre ("mpn_andn_n");
  mpn_andn_n (wp, xp, yp, size);
  post ();
#endif

  pre ("mpn_addmul_1");
  mpn_addmul_1 (wp, xp, size, yp[0]);
  post ();

#if HAVE_NATIVE_mpn_addmul_1c
  pre ("mpn_addmul_1c");
  mpn_addmul_1c (wp, xp, size, yp[0], CNST_LIMB(0));
  post ();
#endif

#if HAVE_NATIVE_mpn_com_n
  pre ("mpn_com_n");
  mpn_com_n (wp, xp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_copyd
  pre ("mpn_copyd");
  mpn_copyd (wp, xp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_copyi
  pre ("mpn_copyi");
  mpn_copyi (wp, xp, size);
  post ();
#endif

  pre ("mpn_divexact_1");
  mpn_divexact_1 (wp, xp, size, CNST_LIMB(123));
  post ();

  pre ("mpn_divexact_by3c");
  mpn_divexact_by3c (wp, xp, size, CNST_LIMB(0));
  post ();

  pre ("mpn_divrem_1");
  mpn_divrem_1 (wp, (mp_size_t) 0, xp, size, CNST_LIMB(123));
  post ();

#if HAVE_NATIVE_mpn_divrem_1c
  pre ("mpn_divrem_1c");
  mpn_divrem_1c (wp, (mp_size_t) 0, xp, size, CNST_LIMB(123), CNST_LIMB(122));
  post ();
#endif

  pre ("mpn_gcd_1");
  xp[0] |= 1;
  notdead += (unsigned long) mpn_gcd_1 (xp, size, CNST_LIMB(123));
  post ();

#if HAVE_NATIVE_mpn_gcd_finda
  pre ("mpn_gcd_finda");
  xp[0] |= 1;
  xp[1] |= 1;
  notdead += mpn_gcd_finda (xp);
  post ();
#endif

  pre ("mpn_hamdist");
  notdead += mpn_hamdist (xp, yp, size);
  post ();

#if HAVE_NATIVE_mpn_ior_n
  pre ("mpn_ior_n");
  mpn_ior_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_iorn_n
  pre ("mpn_iorn_n");
  mpn_iorn_n (wp, xp, yp, size);
  post ();
#endif

  pre ("mpn_lshift");
  mpn_lshift (wp, xp, size, 1);
  post ();

  pre ("mpn_mod_1");
  notdead += mpn_mod_1 (xp, size, CNST_LIMB(123));
  post ();

#if HAVE_NATIVE_mpn_mod_1c
  pre ("mpn_mod_1c");
  notdead += mpn_mod_1c (xp, size, CNST_LIMB(123), CNST_LIMB(122));
  post ();
#endif

#if GMP_NUMB_BITS % 4 == 0
  pre ("mpn_mod_34lsub1");
  notdead += mpn_mod_34lsub1 (xp, size);
  post ();
#endif

  pre ("mpn_modexact_1_odd");
  notdead += mpn_modexact_1_odd (xp, size, CNST_LIMB(123));
  post ();

  pre ("mpn_modexact_1c_odd");
  notdead += mpn_modexact_1c_odd (xp, size, CNST_LIMB(123), CNST_LIMB(456));
  post ();

  pre ("mpn_mul_1");
  mpn_mul_1 (wp, xp, size, yp[0]);
  post ();

#if HAVE_NATIVE_mpn_mul_1c
  pre ("mpn_mul_1c");
  mpn_mul_1c (wp, xp, size, yp[0], CNST_LIMB(0));
  post ();
#endif

#if HAVE_NATIVE_mpn_mul_2
  pre ("mpn_mul_2");
  mpn_mul_2 (wp, xp, size-1, yp);
  post ();
#endif

  pre ("mpn_mul_basecase");
  mpn_mul_basecase (wp, xp, (mp_size_t) 3, yp, (mp_size_t) 3);
  post ();

#if HAVE_NATIVE_mpn_nand_n
  pre ("mpn_nand_n");
  mpn_nand_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_nior_n
  pre ("mpn_nior_n");
  mpn_nior_n (wp, xp, yp, size);
  post ();
#endif

  pre ("mpn_popcount");
  notdead += mpn_popcount (xp, size);
  post ();

  pre ("mpn_preinv_mod_1");
  notdead += mpn_preinv_mod_1 (xp, size, GMP_NUMB_MAX,
                               refmpn_invert_limb (GMP_NUMB_MAX));
  post ();

#if USE_PREINV_DIVREM_1 || HAVE_NATIVE_mpn_preinv_divrem_1
  pre ("mpn_preinv_divrem_1");
  mpn_preinv_divrem_1 (wp, (mp_size_t) 0, xp, size, GMP_NUMB_MAX,
                       refmpn_invert_limb (GMP_NUMB_MAX), 0);
  post ();
#endif

#if HAVE_NATIVE_mpn_rsh1add_n
  pre ("mpn_rsh1add_n");
  mpn_rsh1add_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_rsh1sub_n
  pre ("mpn_rsh1sub_n");
  mpn_rsh1sub_n (wp, xp, yp, size);
  post ();
#endif

  pre ("mpn_rshift");
  mpn_rshift (wp, xp, size, 1);
  post ();

  pre ("mpn_sqr_basecase");
  mpn_sqr_basecase (wp, xp, (mp_size_t) 3);
  post ();

  pre ("mpn_submul_1");
  mpn_submul_1 (wp, xp, size, yp[0]);
  post ();

#if HAVE_NATIVE_mpn_submul_1c
  pre ("mpn_submul_1c");
  mpn_submul_1c (wp, xp, size, yp[0], CNST_LIMB(0));
  post ();
#endif

  pre ("mpn_sub_n");
  mpn_sub_n (wp, xp, yp, size);
  post ();

#if HAVE_NATIVE_mpn_sub_nc
  pre ("mpn_sub_nc");
  mpn_sub_nc (wp, xp, yp, size, CNST_LIMB(0));
  post ();
#endif

#if HAVE_NATIVE_mpn_sublsh1_n
  pre ("mpn_sublsh1_n");
  mpn_sublsh1_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_udiv_qrnnd
  pre ("mpn_udiv_qrnnd");
  mpn_udiv_qrnnd (&wp[0], CNST_LIMB(122), xp[0], CNST_LIMB(123));
  post ();
#endif

#if HAVE_NATIVE_mpn_udiv_qrnnd_r
  pre ("mpn_udiv_qrnnd_r");
  mpn_udiv_qrnnd_r (CNST_LIMB(122), xp[0], CNST_LIMB(123), &wp[0]);
  post ();
#endif

#if HAVE_NATIVE_mpn_umul_ppmm
  pre ("mpn_umul_ppmm");
  mpn_umul_ppmm (&wp[0], xp[0], yp[0]);
  post ();
#endif

#if HAVE_NATIVE_mpn_umul_ppmm_r
  pre ("mpn_umul_ppmm_r");
  mpn_umul_ppmm_r (&wp[0], xp[0], yp[0]);
  post ();
#endif

#if HAVE_NATIVE_mpn_xor_n
  pre ("mpn_xor_n");
  mpn_xor_n (wp, xp, yp, size);
  post ();
#endif

#if HAVE_NATIVE_mpn_xnor_n
  pre ("mpn_xnor_n");
  mpn_xnor_n (wp, xp, yp, size);
  post ();
#endif
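
  /* Illustrative pattern (an addition for this sketch, not in the
     original list) for exercising one more native-only routine: guard
     it with its HAVE_NATIVE_* symbol and bracket the call with
     pre()/post().  mpn_rsblsh1_n computes 2*{yp,size} - {xp,size}.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  pre ("mpn_rsblsh1_n");
  mpn_rsblsh1_n (wp, xp, yp, size);
  post ();
#endif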
}
Example 5
/* mpn_addsub_n.
   r1[] = s1[] + s2[]
   r2[] = s1[] - s2[]
   All operands have n limbs.
   In-place operations allowed.  */
mp_limb_t
mpn_addsub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
{
  mp_limb_t acyn, acyo;		/* carry for add */
  mp_limb_t scyn, scyo;		/* carry for subtract */
  mp_size_t off;		/* offset in operands */
  mp_size_t this_n;		/* size of current chunk */

  /* We alternately add and subtract in chunks that fit into the (L1)
     cache.  Since the chunks are several hundred limbs, the function call
     overhead is insignificant, but we get much better locality.  */

  /* We have three variants of the inner loop; the proper one is chosen
     depending on whether r1 or r2 is the same operand as s1 or s2.  */

  if (r1p != s1p && r1p != s2p)
    {
      /* r1 is not identical to either input operand.  We can therefore write
	 to r1 directly, without using temporary storage.  */
      acyo = 0;
      scyo = 0;
      for (off = 0; off < n; off += PART_SIZE)
	{
	  this_n = MIN (n - off, PART_SIZE);
#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
	  acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
#else
	  acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
	  acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
#endif
#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
#else
	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
#endif
	}
    }
  else if (r2p != s1p && r2p != s2p)
    {
      /* r2 is not identical to either input operand.  We can therefore write
	 to r2 directly, without using temporary storage.  */
      acyo = 0;
      scyo = 0;
      for (off = 0; off < n; off += PART_SIZE)
	{
	  this_n = MIN (n - off, PART_SIZE);
#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
#else
	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
#endif
#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
	  acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
#else
	  acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
	  acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
#endif
	}
    }
  else
    {
      /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2==s2, or vice
	 versa).  We need temporary storage.  */
      mp_limb_t tp[PART_SIZE];
      acyo = 0;
      scyo = 0;
      for (off = 0; off < n; off += PART_SIZE)
	{
	  this_n = MIN (n - off, PART_SIZE);
#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
	  acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo);
#else
	  acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n);
	  acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo);
#endif
#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
#else
	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
#endif
	  MPN_COPY (r1p + off, tp, this_n);
	}
    }

  return 2 * acyo + scyo;
}
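
A small usage sketch for mpn_addsub_n (illustrative, not from the original
source), decoding the packed return value: bit 1 holds the carry from the
addition, bit 0 the borrow from the subtraction.

static void
addsub_demo (void)
{
  mp_limb_t s1[3] = { 5, 0, 0 };
  mp_limb_t s2[3] = { 7, 0, 0 };
  mp_limb_t sum[3], diff[3];
  mp_limb_t cys;

  cys = mpn_addsub_n (sum, diff, s1, s2, 3);
  /* cys = 2 * (carry of s1 + s2) + (borrow of s1 - s2) */
  ASSERT ((cys >> 1) == 0);	/* 5 + 7 does not carry out */
  ASSERT ((cys & 1) == 1);	/* 5 - 7 borrows, and the borrow propagates */
}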