Пример #1
0
/*
   ret + (xp, n) = (yp, n)*(zp, n) % 2^b + 1  
   needs (tp, 2n) temp space, everything reduced mod 2^b 
   inputs, outputs are fully reduced
  
   N.B: 2n is not the same as 2b rounded up to nearest limb!
*/
inline static int
mpn_mulmod_2expp1_internal (mp_ptr xp, mp_srcptr yp, mp_srcptr zp,
			                                         mpir_ui b, mp_ptr tp)
{
  mp_size_t n, k;
  mp_limb_t c;

  TMP_DECL;

  n = BITS_TO_LIMBS (b);
  k = GMP_NUMB_BITS * n - b;

  ASSERT(b > 0);
  ASSERT(n > 0);
  ASSERT_MPN(yp, n);
  ASSERT_MPN(zp, n);
  ASSERT(!MPN_OVERLAP_P (tp, 2 * n, yp, n));
  ASSERT(!MPN_OVERLAP_P (tp, 2 * n, zp, n));
  ASSERT(MPN_SAME_OR_SEPARATE_P (xp, tp, n));
  ASSERT(MPN_SAME_OR_SEPARATE_P (xp, tp + n, n));
  ASSERT(k == 0 || yp[n - 1] >> (GMP_NUMB_BITS - k) == 0);
  ASSERT(k == 0 || zp[n - 1] >> (GMP_NUMB_BITS - k) == 0);

#ifndef TUNE_PROGRAM_BUILD
  if (k == 0 && n > FFT_MULMOD_2EXPP1_CUTOFF && n == mpir_fft_adjust_limbs(n))
  {
      mp_bitcnt_t depth1, depth = 1;
      mp_size_t w1, off;
      mp_ptr tx, ty, tz;
      mp_limb_t ret;

      TMP_MARK;

      tx = TMP_BALLOC_LIMBS(3*n + 3);
      ty = tx + n + 1;
      tz = ty + n + 1;

      MPN_COPY(ty, yp, n);
      MPN_COPY(tz, zp, n);
      ty[n] = 0;
      tz[n] = 0;

      while ((((mp_limb_t)1)<<depth) < b) depth++;
   
      if (depth < 12) off = mulmod_2expp1_table_n[0];
      else off = mulmod_2expp1_table_n[MIN(depth, FFT_N_NUM + 11) - 12];
      depth1 = depth/2 - off;
   
      w1 = b/(((mp_limb_t)1)<<(2*depth1));

      mpir_fft_mulmod_2expp1(tx, ty, tz, n, depth1, w1);

      MPN_COPY(xp, tx, n);
      ret = tx[n];
      
      TMP_FREE;

	   return ret;
  }
#endif

  if (yp == zp)
     mpn_sqr(tp, yp, n);
  else
     mpn_mul_n (tp, yp, zp, n);

  if (k == 0)
    {
      c = mpn_sub_n (xp, tp, tp + n, n);

      return mpn_add_1 (xp, xp, n, c);
    }

  c = tp[n - 1];
  tp[n - 1] &= GMP_NUMB_MASK >> k;

#if HAVE_NATIVE_mpn_sublsh_nc
  c = mpn_sublsh_nc (xp, tp, tp + n, n, k, c);
#else
  {
    mp_limb_t c1;
    c1 = mpn_lshift (tp + n, tp + n, n, k);
    tp[n] |= c >> (GMP_NUMB_BITS - k);
    c = mpn_sub_n (xp, tp, tp + n, n) + c1;
  }
#endif

  c = mpn_add_1 (xp, xp, n, c);
  xp[n - 1] &= GMP_NUMB_MASK >> k;

  return c;
}
Пример #2
0
mp_limb_t
mpn_mu_div_q (mp_ptr qp,
	      mp_srcptr np, mp_size_t nn,
	      mp_srcptr dp, mp_size_t dn,
	      mp_ptr scratch)
{
  mp_ptr tp, rp;
  mp_size_t qn;
  mp_limb_t cy, qh;
  TMP_DECL;

  TMP_MARK;

  qn = nn - dn;

  tp = TMP_BALLOC_LIMBS (qn + 1);

  if (qn >= dn)			/* nn >= 2*dn + 1 */
    {
       /* |_______________________|   dividend
			 |________|   divisor  */

      rp = TMP_BALLOC_LIMBS (nn + 1);
      MPN_COPY (rp + 1, np, nn);
      rp[0] = 0;

      qh = mpn_cmp (rp + 1 + nn - dn, dp, dn) >= 0;
      if (qh != 0)
	mpn_sub_n (rp + 1 + nn - dn, rp + 1 + nn - dn, dp, dn);

      cy = mpn_mu_divappr_q (tp, rp, nn + 1, dp, dn, scratch);

      if (UNLIKELY (cy != 0))
	{
	  /* Since the partial remainder fed to mpn_preinv_mu_divappr_q was
	     canonically reduced, replace the returned value of B^(qn-dn)+eps
	     by the largest possible value.  */
	  mp_size_t i;
	  for (i = 0; i < qn + 1; i++)
	    tp[i] = GMP_NUMB_MAX;
	}

      /* The max error of mpn_mu_divappr_q is +4.  If the low quotient limb is
	 smaller than the max error, we cannot trust the quotient.  */
      if (tp[0] > 4)
	{
	  MPN_COPY (qp, tp + 1, qn);
	}
      else
	{
	  mp_limb_t cy;
	  mp_ptr pp;

	  pp = rp;
	  mpn_mul (pp, tp + 1, qn, dp, dn);

	  cy = (qh != 0) ? mpn_add_n (pp + qn, pp + qn, dp, dn) : 0;

	  if (cy || mpn_cmp (pp, np, nn) > 0) /* At most is wrong by one, no cycle. */
	    qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
	  else /* Same as above */
	    MPN_COPY (qp, tp + 1, qn);
	}
    }
  else
    {
       /* |_______________________|   dividend
		 |________________|   divisor  */

      /* FIXME: When nn = 2dn-1, qn becomes dn-1, and the numerator size passed
	 here becomes 2dn, i.e., more than nn.  This shouldn't hurt, since only
	 the most significant dn-1 limbs will actually be read, but it is not
	 pretty.  */

      qh = mpn_mu_divappr_q (tp, np + nn - (2 * qn + 2), 2 * qn + 2,
			     dp + dn - (qn + 1), qn + 1, scratch);

      /* The max error of mpn_mu_divappr_q is +4, but we get an additional
         error from the divisor truncation.  */
      if (tp[0] > 6)
	{
	  MPN_COPY (qp, tp + 1, qn);
	}
      else
	{
	  mp_limb_t cy;

	  /* FIXME: a shorter product should be enough; we may use already
	     allocated space... */
	  rp = TMP_BALLOC_LIMBS (nn);
	  mpn_mul (rp, dp, dn, tp + 1, qn);

	  cy = (qh != 0) ? mpn_add_n (rp + qn, rp + qn, dp, dn) : 0;

	  if (cy || mpn_cmp (rp, np, nn) > 0) /* At most is wrong by one, no cycle. */
	    qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
	  else /* Same as above */
	    MPN_COPY (qp, tp + 1, qn);
	}
    }

  TMP_FREE;
  return qh;
}