예제 #1
0
void
mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
{
  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));
  ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n));

  if (BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD))
    {
      mpn_mul_basecase (p, a, n, b, n);
    }
  else if (BELOW_THRESHOLD (n, MUL_TOOM3_THRESHOLD))
    {
      /* Allocate workspace of fixed size on stack: fast! */
      mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD_LIMIT-1)];
      ASSERT (MUL_TOOM3_THRESHOLD <= MUL_TOOM3_THRESHOLD_LIMIT);
      mpn_kara_mul_n (p, a, b, n, ws);
    }
  else if (BELOW_THRESHOLD (n, MUL_TOOM4_THRESHOLD))
    {
      mp_ptr ws;
      TMP_SDECL;
      TMP_SMARK;
      ws = TMP_SALLOC_LIMBS (MPN_TOOM3_MUL_N_TSIZE (n));
      mpn_toom3_mul_n (p, a, b, n, ws);
      TMP_SFREE;
    }
  else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD))
    {
       mpn_toom4_mul_n (p, a, b, n);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (BELOW_THRESHOLD (n, MUL_FFT_FULL_THRESHOLD))
    {
       mpn_toom8h_mul (p, a, n, b, n);
    }
#endif
  else
#if WANT_FFT || TUNE_PROGRAM_BUILD
    {
       mpn_mul_fft_main(p, a, n, b, n); 
    }
#else
    {
      /* Toom8 for large operands. */
      mpn_toom8h_mul (p, a, n, b, n);
    }
#endif
}
예제 #2
0
mp_limb_t
mpn_mul (mp_ptr prodp,
	 mp_srcptr up, mp_size_t un,
	 mp_srcptr vp, mp_size_t vn)
{
  mp_size_t l, k;
  mp_limb_t c;

  ASSERT (un >= vn);
  ASSERT (vn >= 1);
  ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
  ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));

  if (un == vn)
   {
    if (up == vp)
    {
      mpn_sqr (prodp, up, un);
      return prodp[2 * un - 1];
    }
    else
    {
      mpn_mul_n (prodp, up, vp, un);
      return prodp[2 * un - 1];
    }
   }

  if (vn < MUL_KARATSUBA_THRESHOLD)
    { /* plain schoolbook multiplication */
      if (un <= MUL_BASECASE_MAX_UN)
	mpn_mul_basecase (prodp, up, un, vp, vn);
      else
	{
	  /* We have un >> MUL_BASECASE_MAX_UN > vn.  For better memory
	     locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
	     these pieces with the vp[] operand.  After each such partial
	     multiplication (but the last) we copy the most significant vn
	     limbs into a temporary buffer since that part would otherwise be
	     overwritten by the next multiplication.  After the next
	     multiplication, we add it back.  This illustrates the situation:

                                                    -->vn<--
                                                      |  |<------- un ------->|
                                                         _____________________|
                                                        X                    /|
                                                      /XX__________________/  |
                                    _____________________                     |
                                   X                    /                     |
                                 /XX__________________/                       |
               _____________________                                          |
              /                    /                                          |
            /____________________/                                            |
	    ==================================================================

	    The parts marked with X are the parts whose sums are copied into
	    the temporary buffer.  */

	  mp_limb_t tp[MUL_KARATSUBA_THRESHOLD_LIMIT];
	  mp_limb_t cy;
          ASSERT (MUL_KARATSUBA_THRESHOLD <= MUL_KARATSUBA_THRESHOLD_LIMIT);

	  mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
	  prodp += MUL_BASECASE_MAX_UN;
	  MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
	  up += MUL_BASECASE_MAX_UN;
	  un -= MUL_BASECASE_MAX_UN;
	  while (un > MUL_BASECASE_MAX_UN)
	    {
	      mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
	      cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
	      mpn_incr_u (prodp + vn, cy);		/* safe? */
	      prodp += MUL_BASECASE_MAX_UN;
	      MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
	      up += MUL_BASECASE_MAX_UN;
	      un -= MUL_BASECASE_MAX_UN;
	    }
	  if (un > vn)
	    {
	      mpn_mul_basecase (prodp, up, un, vp, vn);
	    }
	  else
	    {
	      ASSERT_ALWAYS (un > 0);
	      mpn_mul_basecase (prodp, vp, vn, up, un);
	    }
	  cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
	  mpn_incr_u (prodp + vn, cy);		/* safe? */
	}
      return prodp[un + vn - 1];
  }

  if (ABOVE_THRESHOLD (un + vn, 2*MUL_FFT_FULL_THRESHOLD)
      && ABOVE_THRESHOLD (3*vn, MUL_FFT_FULL_THRESHOLD))
    {
      mpn_mul_fft_main (prodp, up, un, vp, vn);
      return prodp[un + vn - 1];
    }

  k = (un + 3)/4; // ceil(un/4)

#if GMP_NUMB_BITS == 32
  if ((ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM8H_THRESHOLD)) && (vn>=86) && (5*un <= 11*vn))
#else
  if ((ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM8H_THRESHOLD)) && (vn>=86) && (4*un <= 13*vn))
#endif
  {
      mpn_toom8h_mul(prodp, up, un, vp, vn);
      return prodp[un + vn - 1];
  }
  
  if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM4_THRESHOLD))
  {
          if (vn > 3*k)
          {
             mpn_toom4_mul(prodp, up, un, vp, vn);
             return prodp[un + vn - 1];
          } else
          {
             l = (un + 4)/5; // ceil(un/5)
             if ((((vn > 9*k/4) && (un+vn <= 6*MUL_TOOM4_THRESHOLD)) 
                 || ((vn > 2*l) && (un+vn > 6*MUL_TOOM4_THRESHOLD)))
                 && (vn <= 3*l))
             {
                mpn_toom53_mul(prodp, up, un, vp, vn);
                return prodp[un + vn - 1];
             }
          }
  } 
  
  if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM3_THRESHOLD) && (vn > k))
  {
          mp_ptr ws;
          TMP_DECL;
          TMP_MARK;

          if (vn < 2*k) // un/2 >= vn > un/4
          {
                  ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE(un));
                  mpn_toom42_mul(prodp, up, un, vp, vn, ws);
                  TMP_FREE;
                  return prodp[un + vn - 1];
          }

          l = (un+2)/3; //ceil(u/3)
          if (vn > 2*l) // un >= vn > 2un/3
          {
                  ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE(un));
                  mpn_toom3_mul(prodp, up, un, vp, vn, ws);
                  TMP_FREE;
                  return prodp[un + vn - 1];
          } else // 2un/3 >= vn > un/3
          {
                  ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE(un));
                  mpn_toom32_mul(prodp, up, un, vp, vn, ws);
                  TMP_FREE;
                  return prodp[un + vn - 1];
          }
  }

  mpn_mul_n (prodp, up, vp, vn);

  if (un != vn)
    { mp_limb_t t;
      mp_ptr ws;
      TMP_DECL;
      TMP_MARK;

      prodp += vn;
      l = vn;
      up += vn;
      un -= vn;

      if (un < vn)
	{
	  /* Swap u's and v's. */
	  MPN_SRCPTR_SWAP (up,un, vp,vn);
	}

      ws = TMP_ALLOC_LIMBS ((vn >= MUL_KARATSUBA_THRESHOLD ? vn : un) + vn);

      t = 0;
      while (vn >= MUL_KARATSUBA_THRESHOLD)
	{
	  mpn_mul_n (ws, up, vp, vn);
	  if (l <= 2*vn)
	    {
	      t += mpn_add_n (prodp, prodp, ws, l);
	      if (l != 2*vn)
		{
		  t = mpn_add_1 (prodp + l, ws + l, 2*vn - l, t);
		  l = 2*vn;
		}
	    }
	  else
	    {
	      c = mpn_add_n (prodp, prodp, ws, 2*vn);
	      t += mpn_add_1 (prodp + 2*vn, prodp + 2*vn, l - 2*vn, c);
	    }
	  prodp += vn;
	  l -= vn;
	  up += vn;
	  un -= vn;
	  if (un < vn)
	    {
	      /* Swap u's and v's. */
	      MPN_SRCPTR_SWAP (up,un, vp,vn);
	    }
		}

      if (vn != 0)
	{
	  mpn_mul_basecase (ws, up, un, vp, vn);
	  if (l <= un + vn)
	    {
	      t += mpn_add_n (prodp, prodp, ws, l);
	      if (l != un + vn)
		t = mpn_add_1 (prodp + l, ws + l, un + vn - l, t);
	    }
	  else
	    {
	      c = mpn_add_n (prodp, prodp, ws, un + vn);
	      t += mpn_add_1 (prodp + un + vn, prodp + un + vn, l - un - vn, c);
	    }
	}

      TMP_FREE;
  }

  return prodp[un + vn - 1];
}
예제 #3
0
int
main(void)
{
    mp_bitcnt_t depth, w;
    
    flint_rand_t state;

    printf("mul_fft_main....");
    fflush(stdout);

    flint_randinit(state);
    _flint_rand_init_gmp(state);

    for (depth = 6; depth <= 13; depth++)
    {
        for (w = 1; w <= 3 - (depth >= 12); w++)
        {
            int iter = 1 + 200*(depth <= 8) + 80*(depth <= 9) + 10*(depth <= 10), i;
            
            for (i = 0; i < iter; i++)
            {
               mp_size_t n = (1UL<<depth);
               mp_bitcnt_t bits1 = (n*w - (depth + 1))/2; 
               mp_size_t len1 = 2*n + n_randint(state, 2*n) + 1;
               mp_size_t len2 = 2*n + 2 - len1 + n_randint(state, 2*n);

               mp_bitcnt_t b1 = len1*bits1, b2;
               mp_size_t n1, n2;
               mp_size_t j;
               mp_limb_t * i1, *i2, *r1, *r2;

               if (len2 <= 0)
                  len2 = 2*n + n_randint(state, 2*n) + 1;
               
               b2 = len2*bits1;
               
               n1 = (b1 - 1)/FLINT_BITS + 1;
               n2 = (b2 - 1)/FLINT_BITS + 1;
                    
               if (n1 < n2) /* ensure b1 >= b2 */
               {
                  mp_size_t t = n1;
                  mp_bitcnt_t tb = b1;
                  n1 = n2;
                  b1 = b2;
                  n2 = t;
                  b2 = tb;
               }

               i1 = flint_malloc(3*(n1 + n2)*sizeof(mp_limb_t));
               i2 = i1 + n1;
               r1 = i2 + n2;
               r2 = r1 + n1 + n2;
   
               mpn_urandomb(i1, state->gmp_state, b1);
               mpn_urandomb(i2, state->gmp_state, b2);
  
               mpn_mul(r2, i1, n1, i2, n2);
               mpn_mul_fft_main(r1, i1, n1, i2, n2);
           
               for (j = 0; j < n1 + n2; j++)
               {
                   if (r1[j] != r2[j]) 
                   {
                       printf("error in limb %ld, %lx != %lx\n", j, r1[j], r2[j]);
                       abort();
                   }
               }

               flint_free(i1);
            }
        }
    }

    flint_randclear(state);
    
    printf("PASS\n");
    return 0;
}