/*
 * Accumulate into c the products of the operands a and b, where b has sb
 * words and a is read as (sb + 1) / 2 words.
 *
 * The first sa + sb words of c are cleared, then the longer operand is cut
 * into slices as long as the shorter one; each slice is multiplied with
 * gf2x_mul_toom and the 2 * sa word result is XORed into c at the slice
 * offset.  Whatever is left of the longer operand then becomes the new
 * shorter operand and the process repeats until the shorter operand has
 * zero or one word.
 *
 * stk is working storage: its first 2 * sa words (with the initial sa) hold
 * each partial product, the remainder is handed to gf2x_mul_toom.
 * NOTE(review): the required size of stk is not visible here -- confirm
 * against the caller.
 */
static void gf2x_mul21(unsigned long *c, const unsigned long *b, long sb,
                       const unsigned long *a, unsigned long *stk)
{
    long sa = (sb + 1) / 2;
    const long total = sa + sb;
    unsigned long *prod = stk;              /* partial product buffer */
    unsigned long *scratch = stk + 2 * sa;  /* passed on to gf2x_mul_toom */
    long k;

    for (k = 0; k < total; k++)
        c[k] = 0;

    while (sa != 0) {
        long off;
        const unsigned long *rest;
        long rest_len;

        if (sa == 1) {
            /* Single-word operand: one pass over b, the returned word is
             * folded into c[sb]. */
            c[sb] ^= gf2x_addmul_1_n(c, c, b, sb, a[0]);
            return;
        }

        /* Consume b in slices of sa words. */
        off = 0;
        while (off + sa <= sb) {
            unsigned long *dst = c + off;

            gf2x_mul_toom(prod, a, b + off, sa, scratch);  /* Generic Toom-Cook mult. */
            for (k = 0; k < 2 * sa; k++)
                dst[k] ^= prod[k];
            off += sa;
        }

        /* Swap roles: the unprocessed tail of b becomes the short operand,
         * the old short operand becomes the long one. */
        rest = b + off;
        rest_len = sb - off;
        b = a;
        a = rest;
        sb = sa;
        sa = rest_len;
        c += off;
    }
}
int main(int argc, char *argv[]) { long minn, maxn, mid, n, n2, ns, i; long besti; /* 0 for TC, 1, 2, ... for FFT(K0*3^(bestK-1)) */ long bestK; long K, K0 = 3; /* try K0, 3*K0, 9*K0 */ double T[4]; /* T[0] is for TC, T[1] for K0, T[2] for 3*K0, T[3] for 9*K0 */ double t1[4], t2[4]; unsigned long *a, *b, *c, *t, *u, *v; int nsz = 0; int tc_takes_too_long = 0; const char * reference = "TC"; maxn = 1000000; // default minn = GF2X_MUL_FFT_BEGIN_TUNE / 2 + 1; char * progname = argc ? argv[0] : ""; argc--,argv++; for( ; argc ; argc--,argv++) { int r; if (strcmp(argv[0], "--help") == 0) { usage(0); } if (strcmp(argv[0], "--no-toom") == 0) { tc_takes_too_long = 1; reference = "F1(K0)"; continue; } r = handle_tuning_mulstep(&argc, &argv); if (r < 0) usage(1); else if (r) continue; r = handle_tuning_outfile(&argc, &argv); if (r < 0) usage(1); else if (r) continue; if (strcmp(argv[0], "-k0") == 0) { argc--,argv++; if (! argc) usage(1); K0 = atoi(argv[0]); continue; } if (nsz == 0) { maxn = atoi(argv[0]); nsz++; continue; } if (nsz == 1) { minn = maxn; maxn = atoi(argv[0]); nsz++; continue; } usage(1); } if (nsz == 0) usage(1); set_tuning_output(); { char date[40]; time_t t; size_t u; struct utsname buf; time(&t); ctime_r(&t, date); u = strlen(date); for (; u && isspace(date[u - 1]); date[--u] = '\0'); uname(&buf); /* strip the dirname */ char * ptr = strrchr(progname, '/'); if (ptr) { ptr++; } else { ptr = progname; } fprintf(rp, "info-fft \"%s -s %.2f %ld run on %s on %s ; based on %s\"\n", ptr,mulstep,maxn,buf.nodename,date,GF2X_TOOM_TUNING_INFO); } printf("Tuning FFT multiplication to wordsize %ld\n\n", maxn); a = (unsigned long *) malloc(maxn * sizeof(unsigned long)); b = (unsigned long *) malloc(maxn * sizeof(unsigned long)); c = (unsigned long *) malloc(2 * maxn * sizeof(unsigned long)); u = (unsigned long *) malloc(2 * maxn * sizeof(unsigned long)); v = (unsigned long *) malloc(2 * maxn * sizeof(unsigned long)); t = (unsigned long *) malloc(gf2x_toomspace(maxn) * 
sizeof(unsigned long)); random_wordstring(a, maxn); random_wordstring(b, maxn); /* Skip n if (2*n < GF2X_MUL_FFT_BEGIN_TUNE) as this is too small for the FFT */ for (n = minn; n <= maxn;) { n2 = next_step(n, 3 * K0); // End of interval if (n2 > maxn) // Only go as far n2 = maxn; // as maxn. mid = (n + n2) / 2; // Mid-point printf("%ld..%ld ", n, n2); fflush(stdout); if (tc_takes_too_long) { T[0] = DBL_MAX; } else { TIME(T[0], gf2x_mul_toom(u, a, b, mid, t)); // Time Toom-Cook printf("TC:%1.1e ", T[0]); } fflush(stdout); besti = 0; bestK = 1; K = K0; i = 1; ugly_label: for ( ; i <= 3; i++, K *= 3) { TIME(t1[i], gf2x_mul_fft(c, a, mid, b, mid, K)); if (tc_takes_too_long) { memcpy(u, c, 2 * maxn * sizeof(unsigned long)); } check(a, mid, b, mid, reference, u, "F1", c); if (K >= GF2X_WORDSIZE) { TIME(t2[i], gf2x_mul_fft(v, a, mid, b, mid, -K)); check(a, mid, b, mid, "F1", c, "F2", v); } else { t2[i] = DBL_MAX; } if (t1[i] < t2[i]) { T[i] = t1[i]; printf("F1(%ld):%1.1e ", K, T[i]); } else { T[i] = t2[i]; printf("F2(%ld):%1.1e ", K, T[i]); } fflush(stdout); if (T[i] < T[besti]) { besti = i; bestK = (t2[i] > t1[i]) ? K : -K; /* -K for FFT2(|K|) */ } } if (T[3] < T[1] && T[3] < T[2]) { if (besti) { if (besti == 1) abort(); besti--; } K0 *= 3; /* K just stays as it was */ i = 3; T[1] = T[2]; T[2] = T[3]; goto ugly_label; /* Notice that we can't loop forever here. 
If we have T[3] < * T[2], this will ensure T[2] < T[1] at the next turn, * thereby forcing the other case not to happen */ } else if (T[1] < T[2] && T[1] < T[3] && K0 > 3) { K0 /= 3; } /* OK, this stair is done */ if (bestK == 1) printf("TC"); else { if (bestK > 0) printf("F1(%ld)", bestK); else printf("F2(%ld)", -bestK); } printf("\n"); fflush(stdout); if (T[0] >= 4 * T[besti] && !tc_takes_too_long) { printf("TC is taking too long, disabling for next sizes\n"); tc_takes_too_long = 1; reference = "F1(K0)"; } /* go to next size */ ns = n; n = next_step(n, 3 * K0); /* middle value of K */ if (n > n2) n = n2; /* end of last stair if K0 increased */ n++; if (n < mid) { /* redo the last stair if K0 decreased */ n = ns; } else { fprintf(rp, "fft %ld %ld\n", ns == minn ? 1 : ns, ns == minn ? 1 : bestK); } } free(a); free(b); free(c); free(t); free(u); free(v); return 0; }