void tunetoom(long tablesz) { long high, n; int k; double T3[1], TK[1], TW[1], T4[1]; double mint; unsigned long *a, *b, *c, *d, *t; high = tablesz; if (high < BESTMIN) high = BESTMIN; if (high > GF2X_TOOM_TUNING_LIMIT) { fprintf(stderr, "Increase constant GF2X_TOOM_TUNING_LIMIT in thresholds.h to %ld\n", high); exit(1); } a = (unsigned long *) malloc(high * sizeof(unsigned long)); b = (unsigned long *) malloc(high * sizeof(unsigned long)); c = (unsigned long *) malloc(2 * high * sizeof(unsigned long)); d = (unsigned long *) malloc(2 * high * sizeof(unsigned long)); t = (unsigned long *) malloc(gf2x_toomspace(high) * sizeof(unsigned long)); for (n = BESTMIN + 1; n <= high; ) { srandom(1); TK[0] = T3[0] = TW[0] = T4[0] = 0.0; printf("%ld ", n); fflush(stdout); random_wordstring(a, n); random_wordstring(b, n); if (n >= GF2X_MUL_KARA_THRESHOLD) TIME(TK[0], gf2x_mul_kara(c, a, b, n, t)); if (n >= MINI_GF2X_MUL_TOOM_THRESHOLD) { TIME(T3[0], gf2x_mul_tc3(d, a, b, n, t)); check(a, n, b, n, "Kara", c, "TC3", d); } if (n >= MINI_GF2X_MUL_TOOMW_THRESHOLD) { TIME(TW[0], gf2x_mul_tc3w(d, a, b, n, t)); check(a, n, b, n, "Kara", c, "TC3W", d); } if (n >= MINI_GF2X_MUL_TOOM4_THRESHOLD) { TIME(T4[0], gf2x_mul_tc4(d, a, b, n, t)); check(a, n, b, n, "Kara", c, "TC4", d); } printf("TC2:%1.2e TC3:%1.2e TC3W:%1.2e TC4:%1.2e ", TK[0], T3[0], TW[0], T4[0]); mint = TK[0]; k = GF2X_SELECT_KARA; if ((T3[0] < mint) && (n >= MINI_GF2X_MUL_TOOM_THRESHOLD)) { mint = T3[0]; k = GF2X_SELECT_TC3; } if ((TW[0] < mint) && (n >= MINI_GF2X_MUL_TOOMW_THRESHOLD)) { mint = TW[0]; k = GF2X_SELECT_TC3W; } if ((T4[0] < mint) && (n >= MINI_GF2X_MUL_TOOM4_THRESHOLD)) { mint = T4[0]; k = GF2X_SELECT_TC4; } printf("best:%1.2e %s\n", mint, gf2x_toom_select_string[k]); fprintf(rp, "toom %ld %d\n", n, k); fflush(stdout); long nn = MAX(n * mulstep, n + 1); for( ; n < nn && n <= high ; n++) { best_tab[n - 1] = k; } } free(a); free(b); free(c); free(d); free(t); return; }
void tuneutoom(long tabsz) { long high; int k; double T3[1], TK[1]; double mint; unsigned long *a, *b, *c, *d, *t; high = tabsz; if (high < BESTMINU) high = BESTMINU; if (high > GF2X_TOOM_TUNING_LIMIT) { fprintf(stderr, "Increase constant GF2X_TOOM_TUNING_LIMIT in thresholds.c to %ld\n", high); exit(1); } long sa = high; long sb = (sa + 1) / 2; long sp1 = gf2x_toomuspace(sa); // space for mul_toom3u long sp2 = gf2x_toomspace(sb) + 2 * sb; // space for mul21 long sp = (sp1 > sp2) ? sp1 : sp2; a = (unsigned long *) malloc(sa * sizeof(unsigned long)); b = (unsigned long *) malloc(sb * sizeof(unsigned long)); c = (unsigned long *) malloc(3 * sb * sizeof(unsigned long)); d = (unsigned long *) malloc(3 * sb * sizeof(unsigned long)); t = (unsigned long *) malloc(sp * sizeof(unsigned long)); for (sa = BESTMINU + 1; sa <= high; ) { sb = (sa + 1) / 2; random_wordstring(a, sa); random_wordstring(b, sb); TK[0] = T3[0] = 0.0; printf("%ld ", sa); fflush(stdout); TIME(TK[0], gf2x_mul21(c, a, sa, b, t)); if (sa >= MINI_GF2X_MUL_TOOMU_THRESHOLD) { TIME(T3[0], gf2x_mul_tc3u(d, a, sa, b, t)); checku(c, d, sa + sb); } printf("default:%1.2e TC3U:%1.2e ", TK[0], T3[0]); mint = TK[0]; k = GF2X_SELECT_UNB_DFLT; if ((T3[0] < mint) && (sa >= MINI_GF2X_MUL_TOOMU_THRESHOLD)) { mint = T3[0]; k = GF2X_SELECT_UNB_TC3U; } printf("best:%1.2e %s\n", mint, gf2x_utoom_select_string[k]); fflush(stdout); fprintf(rp, "utoom %ld %d\n", sa, k); long nn = MAX(sa * mulstep, sa + 1); for( ; sa < nn && sa <= high ; sa++) { best_utab[sa - 1] = k; } } free(a); free(b); free(c); free(d); free(t); return; }
int main(int argc, char *argv[]) { long minn, maxn, mid, n, n2, ns, i; long besti; /* 0 for TC, 1, 2, ... for FFT(K0*3^(bestK-1)) */ long bestK; long K, K0 = 3; /* try K0, 3*K0, 9*K0 */ double T[4]; /* T[0] is for TC, T[1] for K0, T[2] for 3*K0, T[3] for 9*K0 */ double t1[4], t2[4]; unsigned long *a, *b, *c, *t, *u, *v; int nsz = 0; int tc_takes_too_long = 0; const char * reference = "TC"; maxn = 1000000; // default minn = GF2X_MUL_FFT_BEGIN_TUNE / 2 + 1; char * progname = argc ? argv[0] : ""; argc--,argv++; for( ; argc ; argc--,argv++) { int r; if (strcmp(argv[0], "--help") == 0) { usage(0); } if (strcmp(argv[0], "--no-toom") == 0) { tc_takes_too_long = 1; reference = "F1(K0)"; continue; } r = handle_tuning_mulstep(&argc, &argv); if (r < 0) usage(1); else if (r) continue; r = handle_tuning_outfile(&argc, &argv); if (r < 0) usage(1); else if (r) continue; if (strcmp(argv[0], "-k0") == 0) { argc--,argv++; if (! argc) usage(1); K0 = atoi(argv[0]); continue; } if (nsz == 0) { maxn = atoi(argv[0]); nsz++; continue; } if (nsz == 1) { minn = maxn; maxn = atoi(argv[0]); nsz++; continue; } usage(1); } if (nsz == 0) usage(1); set_tuning_output(); { char date[40]; time_t t; size_t u; struct utsname buf; time(&t); ctime_r(&t, date); u = strlen(date); for (; u && isspace(date[u - 1]); date[--u] = '\0'); uname(&buf); /* strip the dirname */ char * ptr = strrchr(progname, '/'); if (ptr) { ptr++; } else { ptr = progname; } fprintf(rp, "info-fft \"%s -s %.2f %ld run on %s on %s ; based on %s\"\n", ptr,mulstep,maxn,buf.nodename,date,GF2X_TOOM_TUNING_INFO); } printf("Tuning FFT multiplication to wordsize %ld\n\n", maxn); a = (unsigned long *) malloc(maxn * sizeof(unsigned long)); b = (unsigned long *) malloc(maxn * sizeof(unsigned long)); c = (unsigned long *) malloc(2 * maxn * sizeof(unsigned long)); u = (unsigned long *) malloc(2 * maxn * sizeof(unsigned long)); v = (unsigned long *) malloc(2 * maxn * sizeof(unsigned long)); t = (unsigned long *) malloc(gf2x_toomspace(maxn) * sizeof(unsigned long)); random_wordstring(a, maxn); random_wordstring(b, maxn); /* Skip n if (2*n < GF2X_MUL_FFT_BEGIN_TUNE) as this is too small for the FFT */ for (n = minn; n <= maxn;) { n2 = next_step(n, 3 * K0); // End of interval if (n2 > maxn) // Only go as far n2 = maxn; // as maxn. mid = (n + n2) / 2; // Mid-point printf("%ld..%ld ", n, n2); fflush(stdout); if (tc_takes_too_long) { T[0] = DBL_MAX; } else { TIME(T[0], gf2x_mul_toom(u, a, b, mid, t)); // Time Toom-Cook printf("TC:%1.1e ", T[0]); } fflush(stdout); besti = 0; bestK = 1; K = K0; i = 1; ugly_label: for ( ; i <= 3; i++, K *= 3) { TIME(t1[i], gf2x_mul_fft(c, a, mid, b, mid, K)); if (tc_takes_too_long) { memcpy(u, c, 2 * maxn * sizeof(unsigned long)); } check(a, mid, b, mid, reference, u, "F1", c); if (K >= GF2X_WORDSIZE) { TIME(t2[i], gf2x_mul_fft(v, a, mid, b, mid, -K)); check(a, mid, b, mid, "F1", c, "F2", v); } else { t2[i] = DBL_MAX; } if (t1[i] < t2[i]) { T[i] = t1[i]; printf("F1(%ld):%1.1e ", K, T[i]); } else { T[i] = t2[i]; printf("F2(%ld):%1.1e ", K, T[i]); } fflush(stdout); if (T[i] < T[besti]) { besti = i; bestK = (t2[i] > t1[i]) ? K : -K; /* -K for FFT2(|K|) */ } } if (T[3] < T[1] && T[3] < T[2]) { if (besti) { if (besti == 1) abort(); besti--; } K0 *= 3; /* K just stays as it was */ i = 3; T[1] = T[2]; T[2] = T[3]; goto ugly_label; /* Notice that we can't loop forever here. If we have T[3] < * T[2], this will ensure T[2] < T[1] at the next turn, * thereby forcing the other case not to happen */ } else if (T[1] < T[2] && T[1] < T[3] && K0 > 3) { K0 /= 3; } /* OK, this stair is done */ if (bestK == 1) printf("TC"); else { if (bestK > 0) printf("F1(%ld)", bestK); else printf("F2(%ld)", -bestK); } printf("\n"); fflush(stdout); if (T[0] >= 4 * T[besti] && !tc_takes_too_long) { printf("TC is taking too long, disabling for next sizes\n"); tc_takes_too_long = 1; reference = "F1(K0)"; } /* go to next size */ ns = n; n = next_step(n, 3 * K0); /* middle value of K */ if (n > n2) n = n2; /* end of last stair if K0 increased */ n++; if (n < mid) { /* redo the last stair if K0 decreased */ n = ns; } else { fprintf(rp, "fft %ld %ld\n", ns == minn ? 1 : ns, ns == minn ? 1 : bestK); } } free(a); free(b); free(c); free(t); free(u); free(v); return 0; }