Beispiel #1
0
/*
   Modular multiplication entry point working on (limbs + 1)-limb operands
   i1 and i2, writing a (limbs + 1)-limb result to r.  Judging by the
   routines it dispatches to, the modulus is 2^(limbs*GMP_LIMB_BITS) + 1
   (NOTE(review): confirm against the callers).

   tt is scratch space that is only handed to the basecase routine.

   Returns 0 when one operand has its top limb set (shortcut paths below),
   otherwise the top limb r[limbs] of the result.
*/
int mpn_mulmod_Bexpp1(mp_ptr r, mp_srcptr i1, mp_srcptr i2, mp_size_t limbs, mp_ptr tt)
{
   mp_size_t bits = limbs * GMP_LIMB_BITS;
   mp_bitcnt_t log_bits, fft_depth;
   mp_size_t fft_w, adjust;

   /* bit 1 <- top limb of i1, bit 0 <- top limb of i2 */
   mp_limb_t top = 2 * i1[limbs] + i2[limbs];

   /*
      A set top limb is handled without multiplying: the result is the
      normalised negation of the other operand (presumably because such an
      operand represents -1 modulo the modulus).  i2 is tested first.
   */
   if (top & 1)
   {
      mpn_neg_n(r, i1, limbs + 1);
      mpn_normmod_2expp1(r, limbs);
      return 0;
   }

   if (top & 2)
   {
      mpn_neg_n(r, i2, limbs + 1);
      mpn_normmod_2expp1(r, limbs);
      return 0;
   }

   /* small operands: use the basecase routine (nothing to do for 0 bits) */
   if (limbs <= FFT_MULMOD_2EXPP1_CUTOFF)
   {
      r[limbs] = bits ? mpn_mulmod_2expp1_basecase(r, i1, i2, top, bits, tt) : 0;
      return r[limbs];
   }

   /* smallest log_bits >= 1 with 2^log_bits >= bits */
   for (log_bits = 1; (((mp_limb_t) 1) << log_bits) < bits; log_bits++)
      ;

   /* tuned depth adjustment; the table index is clamped at both ends */
   adjust = (log_bits < 12)
          ? mulmod_2expp1_table_n[0]
          : mulmod_2expp1_table_n[MIN(log_bits, FFT_N_NUM + 11) - 12];

   fft_depth = log_bits/2 - adjust;
   fft_w = bits/(((mp_limb_t) 1) << (2*fft_depth));

   mpir_fft_mulmod_2expp1(r, i1, i2, limbs, fft_depth, fft_w);

   return r[limbs];
}
Beispiel #2
0
/*
   FFT based multiplication of i1 by i2 into r1, where r1 receives
   r_limbs + 1 limbs.  Judging by the name and the final normalisation call
   the product is taken modulo 2^(r_limbs*GMP_LIMB_BITS) + 1
   (NOTE(review): confirm against the callers).

   Each operand is split into 2n coefficients of bits1 bits, with
   n = 2^depth.  The coefficients are transformed with a negacyclic FFT,
   multiplied pointwise modulo 2^(n*w) + 1 and transformed back.  A separate
   convolution on the low limb of every coefficient (ii0, jj0 -> r) is used
   to correct the top limb / sign of each product coefficient before the
   coefficients are recombined.

   When i1 == i2 (same pointer) the second split and transform are skipped
   and the first operand's data is reused.

   All temporary space comes from the TMP_* allocator and is released
   before returning.
*/
void mpir_fft_mulmod_2expp1(mp_ptr r1, mp_srcptr i1, mp_srcptr i2, 
                 mp_size_t r_limbs, mp_bitcnt_t depth, mp_bitcnt_t w)
{
   mp_size_t n = (((mp_size_t)1)<<depth);
   /* number of operand bits placed in each of the 2n coefficients */
   mp_bitcnt_t bits1 = (r_limbs*GMP_LIMB_BITS)/(2*n);
   
   /* each coefficient is stored in limbs + 1 = size limbs */
   mp_size_t limb_add, limbs = (n*w)/GMP_LIMB_BITS;
   mp_size_t size = limbs + 1;
   mp_size_t i, j, ll;

   mp_limb_t * ptr;
   mp_limb_t ** ii, ** jj, *tt, *t1, *t2, *s1, *r, *ii0, *jj0;
   mp_limb_t c;
   TMP_DECL;

   TMP_MARK;
   /*
      One allocation, laid out as:
         2n          coefficient pointers ii[0..2n-1]
         2n * size   coefficient data
         2n          ii0 (low limb of every coefficient)
         3 * size    t1, t2, s1 (temporaries handed to the FFT routines)
         2n          r   (output of the low limb convolution)
         2 * size    tt  (scratch for the pointwise multiplications)
   */
   ii = TMP_BALLOC_MP_PTRS(2*(n + n*size) + 4*n + 5*size);
   for (i = 0, ptr = (mp_ptr) ii + 2*n; i < 2*n; i++, ptr += size) 
   {
      ii[i] = ptr;
   }
   ii0 = ptr;
   t1 = ii0 + 2*n;
   t2 = t1 + size;
   s1 = t2 + size;
   r = s1 + size;
   tt = r + 2*n;
   
   if (i1 != i2)
   {
      /* second operand: 2n pointers, 2n * size data, 2n limbs for jj0 */
      jj = TMP_BALLOC_MP_PTRS(2*(n + n*size) + 2*n);
      for (i = 0, ptr = (mp_ptr) jj + 2*n; i < 2*n; i++, ptr += size) 
      {
         jj[i] = ptr;
      }
      jj0 = ptr;
   } else
   {
      /* squaring: alias the second operand's buffers to the first's */
      jj = ii;
      jj0 = ii0;
   }

   /* split i1 into coefficients; zero the ones the split did not fill */
   j = mpir_fft_split_bits(ii, i1, r_limbs, bits1, limbs);
   for ( ; j < 2*n; j++)
      mpn_zero(ii[j], limbs + 1);

   /* keep the low limb of every coefficient before it is transformed */
   for (i = 0; i < 2*n; i++)
      ii0[i] = ii[i][0];
 
   mpir_fft_negacyclic(ii, n, w, &t1, &t2, &s1);
   for (j = 0; j < 2*n; j++)
      mpn_normmod_2expp1(ii[j], limbs);

   if (i1 != i2)
   {
      /* same preparation for the second operand */
      j = mpir_fft_split_bits(jj, i2, r_limbs, bits1, limbs);
      for ( ; j < 2*n; j++)
          mpn_zero(jj[j], limbs + 1);

      for (i = 0; i < 2*n; i++)
         jj0[i] = jj[i][0];

      mpir_fft_negacyclic(jj, n, w, &t1, &t2, &s1);
   }
      
   /* pointwise products modulo 2^(n*w) + 1, stored back into ii[j] */
   for (j = 0; j < 2*n; j++)
   {
      if (i1 != i2) mpn_normmod_2expp1(jj[j], limbs);
      /* c packs the two top limbs: bit 1 from ii[j], bit 0 from jj[j] */
      c = 2*ii[j][limbs] + jj[j][limbs];

      ii[j][limbs] = mpn_mulmod_2expp1_basecase(ii[j], ii[j], jj[j], c, n*w, tt);
   }
   
   mpir_ifft_negacyclic(ii, n, w, &t1, &t2, &s1);
   
   /*
      convolution of the saved low limbs into r[0..2n-1]; presumably the
      negacyclic convolution modulo one limb, matching the transform above
      (NOTE(review): confirm in mpir_fft_naive_convolution_1)
   */
   mpir_fft_naive_convolution_1(r, ii0, jj0, 2*n);

   for (j = 0; j < 2*n; j++)
   {
      mp_limb_t t, cy2;
      
      /*
         presumably removes the factor 2^(depth + 1) = 2n introduced by the
         forward/inverse transform pair -- TODO confirm
      */
      mpn_div_2expmod_2expp1(ii[j], ii[j], limbs, depth + 1);
      mpn_normmod_2expp1(ii[j], limbs);

      /*
         Add (r[j] - low limb of ii[j]) to the coefficient, treating it as
         limbs + 1 limbs.  The old top limb t is then added to the new top
         limb; add_ssaaaa leaves the carry out of that one limb addition in
         r[j], and the carry from mpn_add_1 is added on top.
      */
      t = ii[j][limbs];
      ii[j][limbs] = r[j] - ii[j][0];
      cy2 = mpn_add_1(ii[j], ii[j], limbs + 1, ii[j][limbs]);
      add_ssaaaa(r[j], ii[j][limbs], 0, ii[j][limbs], 0, t);
      if (cy2) r[j]++;
   }
   
   /* recombine the first 2n - 1 coefficients; the last is handled below */
   mpn_zero(r1, r_limbs + 1);
   mpir_fft_combine_bits(r1, ii, 2*n - 1, bits1, limbs + 1, r_limbs + 1);
   
   /* 
      as the negacyclic convolution has effectively done subtractions
      some of the coefficients will be negative, so need to subtract p
   */
   ll = 0;
   /* coefficient j starts limb_add * j limbs into r1 */
   limb_add = bits1/GMP_LIMB_BITS;
   
   for (j = 0; j < 2*n - 2; j++)
   {   
      /* r[j] != 0: a carry was recorded for this coefficient above */
      if (r[j]) 
         mpn_sub_1(r1 + ll + 1, r1 + ll + 1, r_limbs - ll, 1);
      else if ((mp_limb_signed_t) ii[j][limbs] < 0) /* coefficient was -ve */
      {
         mpn_sub_1(r1 + ll + 1, r1 + ll + 1, r_limbs - ll, 1);
         mpn_sub_1(r1 + ll + limbs + 1, r1 + ll + limbs + 1, r_limbs - limbs - ll, 1);
      }

      ll += limb_add;
   }
   /* penultimate coefficient, top bit was already ignored */
   if (r[j] || (mp_limb_signed_t) ii[j][limbs] < 0) /* coefficient was -ve */
      mpn_sub_1(r1 + ll + 1, r1 + ll + 1, r_limbs - ll, 1);
   
   /* final coefficient wraps around */
   /* its low limb_add limbs are added at the top end of r1 ... */
   if (limb_add)
      r1[r_limbs] += mpn_add_n(r1 + r_limbs - limb_add, r1 + r_limbs - limb_add, ii[2*n - 1], limb_add);
   /* ... and its remaining limbs are subtracted from the bottom of r1 */
   c = mpn_sub_n(r1, r1, ii[2*n - 1] + limb_add, limbs + 1 - limb_add);
   /* propagate the borrow (as -c) into the limbs above the subtracted part */
   mpn_addmod_2expp1_1(r1 + limbs + 1 - limb_add, r_limbs - limbs - 1 + limb_add, -c);
   mpn_normmod_2expp1(r1, r_limbs);
   
   TMP_FREE;
}
Beispiel #3
0
/*
   FFT tuning driver.  Times the FFT multiplication routines with different
   depth offsets and prints the resulting tuning header (fft_tuning.h) on
   stdout:

      FFT_TAB                   best offset for mpn_mul_trunc_sqrt2, for
                                depth = 6..10 and w = 1, 2
      MULMOD_TAB                best offset for mpir_fft_mulmod_2expp1, for
                                depth = 12, 13, ... and w = 1, 2; rows are
                                added until the best offset found for w = 2
                                is 1
      FFT_N_NUM                 number of entries in MULMOD_TAB
      FFT_MULMOD_2EXPP1_CUTOFF  derived from the last size at which the
                                basecase routine beat the best FFT timing

   Returns 0 on success.  If a work buffer cannot be allocated, a message
   is written to stderr and 1 is returned; the header printed so far on
   stdout is then incomplete.
*/
int
main(void)
{
    mp_bitcnt_t depth, w, depth1, w1;
    clock_t start, end;
    double elapsed;
    double best = 0.0;
    mp_size_t best_off, off, best_d, best_w;

    gmp_randstate_t state;

    printf("/* fft_tuning.h -- autogenerated by tune-fft */\n\n");
    printf("#ifndef FFT_TUNING_H\n");
    printf("#define FFT_TUNING_H\n\n");
    printf("#include \"mpir.h\"\n\n");
    printf("#define FFT_TAB \\\n");
    fflush(stdout);

    gmp_randinit_default(state);

    printf("   { "); fflush(stdout);
    for (depth = 6; depth <= 10; depth++)
    {
        printf("{ "); fflush(stdout);
        for (w = 1; w <= 2; w++)
        {
            /* fewer iterations as the operands grow */
            int iters = 100*((mp_size_t) 1 << (3*(10 - depth)/2)), i;
            
            mp_size_t n = ((mp_limb_t)1<<depth);
            mp_bitcnt_t bits1 = (n*w - (depth + 1))/2; 
            mp_size_t len1 = 2*n;
            mp_size_t len2 = 2*n;

            mp_bitcnt_t b1 = len1*bits1, b2 = len2*bits1;
            mp_size_t n1, n2;
            mp_limb_t * i1, *i2, *r1;
   
            n1 = (b1 - 1)/GMP_LIMB_BITS + 1;
            n2 = (b2 - 1)/GMP_LIMB_BITS + 1;
                    
            /* one buffer: i1 (n1 limbs), i2 (n2 limbs), r1 (n1 + n2 limbs) */
            i1 = malloc(2*(n1 + n2)*sizeof(mp_limb_t));
            if (i1 == NULL)
            {
                fprintf(stderr, "tune-fft: out of memory\n");
                gmp_randclear(state);
                return 1;
            }
            i2 = i1 + n1;
            r1 = i2 + n2;
   
            mpn_urandomb(i1, state, b1);
            mpn_urandomb(i2, state, b2);
  
            best_off = -1;
            
            /* lowering depth by off while scaling w by 4^off keeps n^2*w fixed */
            for (off = 0; off <= 4; off++)
            {
               start = clock();
               for (i = 0; i < iters; i++)
                  mpn_mul_trunc_sqrt2(r1, i1, n1, i2, n2, depth - off, w*((mp_size_t)1 << (off*2)));
               end = clock();
               
               elapsed = ((double) (end - start)) / CLOCKS_PER_SEC;
               
               if (elapsed < best || best_off == -1)
               {
                  best_off = off;
                  best = elapsed;
               }
            }
           
            /* cast so that the argument matches %ld whatever mp_size_t is */
            printf("%ld", (long) best_off); 
            if (w != 2) printf(",");
            printf(" "); fflush(stdout);

            free(i1);
        }
        printf("}");
        if (depth != 10) printf(",");
        printf(" "); fflush(stdout);
    }

    printf("}\n\n");
    
    best_d = 12;
    best_w = 1;
    best_off = -1;

    printf("#define MULMOD_TAB \\\n");
    fflush(stdout);
    printf("   { "); fflush(stdout);
    /* runs until the best offset found for w = 2 at some depth is 1 */
    for (depth = 12; best_off != 1 ; depth++)
    {
        for (w = 1; w <= 2; w++)
        {
            int iters, i;
            mp_size_t n = ((mp_limb_t)1<<depth);
            mp_bitcnt_t bits = n*w;
            mp_size_t int_limbs = (bits - 1)/GMP_LIMB_BITS + 1;
            mp_limb_t * i1, * i2, * r1, * tt;
        
            /*
               halve the iteration count for every extra level of depth,
               but always run at least once (depth is unsigned, so the two
               ranges are handled separately to keep the shift counts valid)
            */
            if (depth <= 21) iters = 32*((mp_size_t) 1 << (21 - depth));
            else iters = MAX(32/((mp_size_t) 1 << (depth - 21)), 1);

            /*
               one buffer of 6*(int_limbs + 1) limbs: i1 and i2 take
               int_limbs + 1 each, r1 and tt take 2*(int_limbs + 1) each
            */
            i1 = malloc(6*(int_limbs+1)*sizeof(mp_limb_t));
            if (i1 == NULL)
            {
                fprintf(stderr, "tune-fft: out of memory\n");
                gmp_randclear(state);
                return 1;
            }
            i2 = i1 + int_limbs + 1;
            r1 = i2 + int_limbs + 1;
            tt = r1 + 2*(int_limbs + 1);
                
            mpn_urandomb(i1, state, int_limbs*GMP_LIMB_BITS);
            mpn_urandomb(i2, state, int_limbs*GMP_LIMB_BITS);
            /* operands are int_limbs + 1 limbs long with a zero top limb */
            i1[int_limbs] = 0;
            i2[int_limbs] = 0;

            /* depth1 = half of the smallest exponent with 2^exponent >= bits */
            depth1 = 1;
            while ((((mp_limb_t)1)<<depth1) < bits) depth1++;
            depth1 = depth1/2;

            w1 = bits/(((mp_limb_t)1)<<(2*depth1));

            best_off = -1;
            
            for (off = 0; off <= 4; off++)
            {
               start = clock();
               for (i = 0; i < iters; i++)
                  mpir_fft_mulmod_2expp1(r1, i1, i2, int_limbs, depth1 - off, w1*((mp_size_t)1 << (off*2)));
               end = clock();
               
               elapsed = ((double) (end - start)) / CLOCKS_PER_SEC;
               
               if (best_off == -1 || elapsed < best)
               {
                  best_off = off;
                  best = elapsed;
               }
            }
           
            /* compare the best FFT timing with the basecase routine */
            start = clock();
            for (i = 0; i < iters; i++)
                mpn_mulmod_2expp1_basecase(r1, i1, i2, 0, bits, tt);
            end = clock();
               
            elapsed = ((double) (end - start)) / CLOCKS_PER_SEC;
            if (elapsed < best)
            {
                /* basecase won: record the next size up, i.e. (d, 2) or (d + 1, 1) */
                best_d = depth + (w == 2);
                best_w = w + 1 - 2*(w == 2);
            }

            printf("%ld", (long) best_off); 
            if (w != 2) printf(", ");
            fflush(stdout);

            free(i1);
        }
        printf(", "); fflush(stdout);
    }
    printf("1 }\n\n");
    
    /* two entries per depth that was run, plus the trailing 1 */
    printf("#define FFT_N_NUM %ld\n\n", (long) (2*(depth - 12) + 1));
    
    printf("#define FFT_MULMOD_2EXPP1_CUTOFF %ld\n\n", (long) (((mp_limb_t) 1 << best_d)*best_w/(2*GMP_LIMB_BITS)));
    
    gmp_randclear(state);
    
    printf("#endif\n");
    return 0;
}