/*
   Multiply (i1, limbs + 1) by (i2, limbs + 1) and store the result in
   (r, limbs + 1).  The bit size handed to the multiplication routines is
   limbs * GMP_LIMB_BITS.  tt is scratch space and is only passed on to the
   basecase routine.

   The two top limbs are packed as 2*i1[limbs] + i2[limbs].  For top limbs
   that are 0 or 1, bit 0 of that value flags i2 and bit 1 flags i1.  When a
   flag is set the result is the negation of the *other* operand, normalised
   with mpn_normmod_2expp1, and 0 is returned (presumably because a flagged
   operand is congruent to -1 -- confirm against callers).

   Otherwise the return value is r[limbs] as left by the basecase or FFT
   multiplication.
*/
int mpn_mulmod_Bexpp1(mp_ptr r, mp_srcptr i1, mp_srcptr i2, mp_size_t limbs, mp_ptr tt)
{
   mp_size_t nbits = limbs * GMP_LIMB_BITS;
   mp_bitcnt_t fft_depth, lg;
   mp_size_t fft_w, adj;
   mp_limb_t top = 2 * i1[limbs] + i2[limbs];

   if (top & 3)
   {
      /* bit 0 wins over bit 1, exactly like an if / else-if chain */
      mp_srcptr keep = (top & 1) ? i1 : i2;

      mpn_neg_n(r, keep, limbs + 1);
      mpn_normmod_2expp1(r, limbs);
      return 0;
   }

   /* small operands: basecase multiplication (nothing to do for zero bits) */
   if (limbs <= FFT_MULMOD_2EXPP1_CUTOFF)
   {
      r[limbs] = nbits ? mpn_mulmod_2expp1_basecase(r, i1, i2, top, nbits, tt) : 0;
      return r[limbs];
   }

   /* lg = smallest value >= 1 with 2^lg >= nbits */
   for (lg = 1; (((mp_limb_t) 1) << lg) < nbits; lg++)
   {
   }

   /* tuned depth adjustment; the table index is clamped at both ends */
   if (lg >= 12)
      adj = mulmod_2expp1_table_n[MIN(lg, FFT_N_NUM + 11) - 12];
   else
      adj = mulmod_2expp1_table_n[0];

   fft_depth = lg/2 - adj;
   fft_w = nbits/(((mp_limb_t) 1) << (2*fft_depth));

   mpir_fft_mulmod_2expp1(r, i1, i2, limbs, fft_depth, fft_w);

   return r[limbs];
}
/*
   ret + (xp, n) = (yp, n)*(zp, n) % 2^b + 1, with n = BITS_TO_LIMBS (b).

   Requirements, as checked by the assertions below:
     - b > 0;
     - the bits of yp[n - 1] and zp[n - 1] above bit position b are zero
       (everything is reduced mod 2^b);
     - (tp, 2n) is scratch space that does not overlap either input;
     - xp is either identical to, or disjoint from, each half of tp.

   Inputs and outputs are fully reduced.  N.B.: 2n limbs is not the same as
   2b bits rounded up to the nearest limb!

   Unless TUNE_PROGRAM_BUILD is defined, operands that fill whole limbs,
   exceed FFT_MULMOD_2EXPP1_CUTOFF and have an FFT-compatible size are
   handed to mpir_fft_mulmod_2expp1 using temporary buffers.
*/
inline static int
mpn_mulmod_2expp1_internal (mp_ptr xp, mp_srcptr yp, mp_srcptr zp,
                            mpir_ui b, mp_ptr tp)
{
  mp_size_t n = BITS_TO_LIMBS (b);
  mp_size_t k = GMP_NUMB_BITS * n - b;   /* unused bits in the top limb */
  mp_ptr hi = tp + n;                    /* upper half of the scratch area */
  mp_limb_t spill, cy;
  TMP_DECL;

  ASSERT(b > 0);
  ASSERT(n > 0);
  ASSERT_MPN(yp, n);
  ASSERT_MPN(zp, n);
  ASSERT(!MPN_OVERLAP_P (tp, 2 * n, yp, n));
  ASSERT(!MPN_OVERLAP_P (tp, 2 * n, zp, n));
  ASSERT(MPN_SAME_OR_SEPARATE_P (xp, tp, n));
  ASSERT(MPN_SAME_OR_SEPARATE_P (xp, tp + n, n));
  ASSERT(k == 0 || yp[n - 1] >> (GMP_NUMB_BITS - k) == 0);
  ASSERT(k == 0 || zp[n - 1] >> (GMP_NUMB_BITS - k) == 0);

#ifndef TUNE_PROGRAM_BUILD
  if (k == 0 && n > FFT_MULMOD_2EXPP1_CUTOFF && n == mpir_fft_adjust_limbs(n))
    {
      mp_bitcnt_t fft_depth, lg;
      mp_size_t fft_w, adj;
      mp_ptr rbuf, ybuf, zbuf;
      mp_limb_t top;

      TMP_MARK;

      /* three buffers of n + 1 limbs each: result, first and second input */
      rbuf = TMP_BALLOC_LIMBS(3*n + 3);
      ybuf = rbuf + n + 1;
      zbuf = ybuf + n + 1;

      /* copy the inputs and give each an extra zero top limb */
      MPN_COPY(ybuf, yp, n);
      ybuf[n] = 0;
      MPN_COPY(zbuf, zp, n);
      zbuf[n] = 0;

      /* lg = smallest value >= 1 with 2^lg >= b */
      for (lg = 1; (((mp_limb_t) 1) << lg) < b; lg++)
        {
        }

      /* tuned depth adjustment; the table index is clamped at both ends */
      if (lg >= 12)
        adj = mulmod_2expp1_table_n[MIN(lg, FFT_N_NUM + 11) - 12];
      else
        adj = mulmod_2expp1_table_n[0];

      fft_depth = lg/2 - adj;
      fft_w = b/(((mp_limb_t) 1) << (2*fft_depth));

      mpir_fft_mulmod_2expp1(rbuf, ybuf, zbuf, n, fft_depth, fft_w);

      MPN_COPY(xp, rbuf, n);
      top = rbuf[n];

      TMP_FREE;

      return top;
    }
#endif

  /* full 2n-limb product into tp */
  if (yp == zp)
    mpn_sqr(tp, yp, n);
  else
    mpn_mul_n (tp, yp, zp, n);

  if (k == 0)
    {
      /* whole limbs: low half minus high half, then add the borrow back */
      cy = mpn_sub_n (xp, tp, hi, n);
      return mpn_add_1 (xp, xp, n, cy);
    }

  /* remember the limb straddling bit b, then trim the low part to b bits */
  spill = tp[n - 1];
  tp[n - 1] = spill & (GMP_NUMB_MASK >> k);

#if HAVE_NATIVE_mpn_sublsh_nc
  cy = mpn_sublsh_nc (xp, tp, hi, n, k, spill);
#else
  {
    /* align the high part by shifting it up k bits and pulling in the
       bits of the straddling limb that lie above bit b */
    mp_limb_t out = mpn_lshift (hi, hi, n, k);

    hi[0] |= spill >> (GMP_NUMB_BITS - k);
    cy = mpn_sub_n (xp, tp, hi, n) + out;
  }
#endif

  cy = mpn_add_1 (xp, xp, n, cy);
  xp[n - 1] &= GMP_NUMB_MASK >> k;

  return cy;
}
int main(void) { mp_bitcnt_t depth, w, depth1, w1; clock_t start, end; double elapsed; double best = 0.0; mp_size_t best_off, off, best_d, best_w; gmp_randstate_t state; printf("/* fft_tuning.h -- autogenerated by tune-fft */\n\n"); printf("#ifndef FFT_TUNING_H\n"); printf("#define FFT_TUNING_H\n\n"); printf("#include \"mpir.h\"\n\n"); printf("#define FFT_TAB \\\n"); fflush(stdout); gmp_randinit_default(state); printf(" { "); fflush(stdout); for (depth = 6; depth <= 10; depth++) { printf("{ "); fflush(stdout); for (w = 1; w <= 2; w++) { int iters = 100*((mp_size_t) 1 << (3*(10 - depth)/2)), i; mp_size_t n = ((mp_limb_t)1<<depth); mp_bitcnt_t bits1 = (n*w - (depth + 1))/2; mp_size_t len1 = 2*n; mp_size_t len2 = 2*n; mp_bitcnt_t b1 = len1*bits1, b2 = len2*bits1; mp_size_t n1, n2; mp_size_t j; mp_limb_t * i1, *i2, *r1; n1 = (b1 - 1)/GMP_LIMB_BITS + 1; n2 = (b2 - 1)/GMP_LIMB_BITS + 1; i1 = malloc(2*(n1 + n2)*sizeof(mp_limb_t)); i2 = i1 + n1; r1 = i2 + n2; mpn_urandomb(i1, state, b1); mpn_urandomb(i2, state, b2); best_off = -1; for (off = 0; off <= 4; off++) { start = clock(); for (i = 0; i < iters; i++) mpn_mul_trunc_sqrt2(r1, i1, n1, i2, n2, depth - off, w*((mp_size_t)1 << (off*2))); end = clock(); elapsed = ((double) (end - start)) / CLOCKS_PER_SEC; if (elapsed < best || best_off == -1) { best_off = off; best = elapsed; } } printf("%ld", best_off); if (w != 2) printf(","); printf(" "); fflush(stdout); free(i1); } printf("}"); if (depth != 10) printf(","); printf(" "); fflush(stdout); } printf("}\n\n"); best_d = 12; best_w = 1; best_off = -1; printf("#define MULMOD_TAB \\\n"); fflush(stdout); printf(" { "); fflush(stdout); for (depth = 12; best_off != 1 ; depth++) { for (w = 1; w <= 2; w++) { int iters = 100*((mp_size_t) 1 << (3*(18 - depth)/2)), i; mp_size_t n = ((mp_limb_t)1<<depth); mp_bitcnt_t bits = n*w; mp_size_t int_limbs = (bits - 1)/GMP_LIMB_BITS + 1; mp_size_t j; mp_limb_t c, * i1, * i2, * r1, * tt; if (depth <= 21) iters = 32*((mp_size_t) 1 << (21 - 
depth)); else iters = MAX(32/((mp_size_t) 1 << (depth - 21)), 1); i1 = malloc(6*(int_limbs+1)*sizeof(mp_limb_t)); i2 = i1 + int_limbs + 1; r1 = i2 + int_limbs + 1; tt = r1 + 2*(int_limbs + 1); mpn_urandomb(i1, state, int_limbs*GMP_LIMB_BITS); mpn_urandomb(i2, state, int_limbs*GMP_LIMB_BITS); i1[int_limbs] = 0; i2[int_limbs] = 0; depth1 = 1; while ((((mp_limb_t)1)<<depth1) < bits) depth1++; depth1 = depth1/2; w1 = bits/(((mp_limb_t)1)<<(2*depth1)); best_off = -1; for (off = 0; off <= 4; off++) { start = clock(); for (i = 0; i < iters; i++) mpir_fft_mulmod_2expp1(r1, i1, i2, int_limbs, depth1 - off, w1*((mp_size_t)1 << (off*2))); end = clock(); elapsed = ((double) (end - start)) / CLOCKS_PER_SEC; if (best_off == -1 || elapsed < best) { best_off = off; best = elapsed; } } start = clock(); for (i = 0; i < iters; i++) mpn_mulmod_2expp1_basecase(r1, i1, i2, 0, bits, tt); end = clock(); elapsed = ((double) (end - start)) / CLOCKS_PER_SEC; if (elapsed < best) { best_d = depth + (w == 2); best_w = w + 1 - 2*(w == 2); } printf("%ld", best_off); if (w != 2) printf(", "); fflush(stdout); free(i1); } printf(", "); fflush(stdout); } printf("1 }\n\n"); printf("#define FFT_N_NUM %ld\n\n", 2*(depth - 12) + 1); printf("#define FFT_MULMOD_2EXPP1_CUTOFF %ld\n\n", ((mp_limb_t) 1 << best_d)*best_w/(2*GMP_LIMB_BITS)); gmp_randclear(state); printf("#endif\n"); return 0; }