/*
 * Release what a problem points to, but not the bench_problem itself:
 * the inphys/outphys buffers and the sz/vecsz tensors.
 */
void problem_free(bench_problem *p)
{
    /* When outphys and inphys are the same pointer, the buffer is
       released exactly once, by the inphys branch below. */
    int separate_output = (p->outphys != 0) && (p->outphys != p->inphys);

    if (separate_output)
        bench_free(p->outphys);

    if (p->inphys != 0)
        bench_free(p->inphys);

    tensor_destroy(p->sz);
    tensor_destroy(p->vecsz);
}
/*
 * Tear down a problem completely: everything problem_free() releases,
 * then the k array (if any), the pstring, and finally the structure.
 * p must be non-null (asserted).
 */
void problem_destroy(bench_problem *p)
{
    BENCH_ASSERT(p);

    /* buffers and tensors go first */
    problem_free(p);

    if (p->k != 0)
        bench_free(p->k);

    bench_free(p->pstring);
    bench_free(p);
}
/*
 * (Re)allocate the file-level local_in / local_out buffers.
 *
 * nreal   - number of bench_real elements per buffer; when <= 0 both
 *           pointers are simply left null.
 * inplace - nonzero: local_out aliases local_in instead of getting
 *           its own allocation.
 *
 * The new buffers are zero-filled.
 */
static void alloc_local(ptrdiff_t nreal, int inplace)
{
    ptrdiff_t idx;

    /* discard the previous buffers; a shared buffer is freed once */
    bench_free(local_in);
    if (local_out != local_in)
        bench_free(local_out);
    local_out = 0;
    local_in = 0;

    if (nreal <= 0)
        return;

    local_in = (bench_real *) bench_malloc(nreal * sizeof(bench_real));
    local_out = inplace
        ? local_in
        : (bench_real *) bench_malloc(nreal * sizeof(bench_real));

    for (idx = 0; idx < nreal; ++idx) {
        local_out[idx] = 0.0;
        local_in[idx] = 0.0;
    }
}
/*
 * Run the impulse, linearity and shift checks on a PROBLEM_REAL problem
 * and store the results in e->i, e->l and e->s.
 *
 * rounds - number of rounds per check; 0 selects the default of 20.
 * tol    - tolerance forwarded to the individual checks.
 *
 * Returns without touching *e when either tensor rank is not finite.
 * When the problem is neither in-place nor allowed to destroy its
 * input, preserves_input() is run as well.
 */
void verify_rdft2(bench_problem *p, int rounds, double tol, errors *e)
{
    C *inA, *inB, *inC, *outA, *outB, *outC, *tmp;
    dofft_rdft2_closure cl;
    int n, vecn, N;
    int shift_kind;

    BENCH_ASSERT(p->kind == PROBLEM_REAL);

    /* give up */
    if (!FINITE_RNK(p->sz->rnk) || !FINITE_RNK(p->vecsz->rnk))
        return;

    cl.k.apply = rdft2_apply;
    cl.k.recopy_input = 0;
    cl.p = p;

    /* default value */
    if (rounds == 0)
        rounds = 20;

    n = tensor_sz(p->sz);
    vecn = tensor_sz(p->vecsz);
    N = n * vecn;

    /* seven scratch arrays of N complex values each */
    inA = (C *) bench_malloc(N * sizeof(C));
    inB = (C *) bench_malloc(N * sizeof(C));
    inC = (C *) bench_malloc(N * sizeof(C));
    outA = (C *) bench_malloc(N * sizeof(C));
    outB = (C *) bench_malloc(N * sizeof(C));
    outC = (C *) bench_malloc(N * sizeof(C));
    tmp = (C *) bench_malloc(N * sizeof(C));

    e->i = impulse(&cl.k, n, vecn, inA, inB, inC, outA, outB, outC,
                   tmp, rounds, tol);
    e->l = linear(&cl.k, 1, N, inA, inB, inC, outA, outB, outC,
                  tmp, rounds, tol);

    /* only one shift test is run, selected by the sign of the problem */
    shift_kind = (p->sign < 0) ? TIME_SHIFT : FREQ_SHIFT;
    e->s = 0.0;
    e->s = dmax(e->s, tf_shift(&cl.k, 1, p->sz, n, vecn, p->sign,
                               inA, inB, outA, outB, tmp,
                               rounds, tol, shift_kind));

    if (!p->in_place && !p->destroy_input)
        preserves_input(&cl.k, p->sign < 0 ? mkreal : mkhermitian1,
                        N, inA, inB, outB, rounds);

    /* release in reverse order of allocation */
    bench_free(tmp);
    bench_free(outC);
    bench_free(outB);
    bench_free(outA);
    bench_free(inC);
    bench_free(inB);
    bench_free(inA);
}
/*
 * Run accuracy_test() on a PROBLEM_COMPLEX problem whose size tensor
 * has rank 1 and whose vector tensor has rank 0 (both asserted).
 * Results are written to t[] by accuracy_test().
 */
void accuracy_dft(bench_problem *p, int rounds, int impulse_rounds, double t[6])
{
    dofft_dft_closure cl;
    C *buf_a, *buf_b;
    int len;

    BENCH_ASSERT(p->kind == PROBLEM_COMPLEX);
    BENCH_ASSERT(p->sz->rnk == 1);
    BENCH_ASSERT(p->vecsz->rnk == 0);

    cl.k.apply = dft_apply;
    cl.k.recopy_input = 0;
    cl.p = p;

    len = tensor_sz(p->sz);

    /* two work arrays of len complex values */
    buf_a = (C *) bench_malloc(len * sizeof(C));
    buf_b = (C *) bench_malloc(len * sizeof(C));

    /* no input constraint is applied (second argument is 0) */
    accuracy_test(&cl.k, 0, p->sign, len, buf_a, buf_b,
                  rounds, impulse_rounds, t);

    bench_free(buf_b);
    bench_free(buf_a);
}
void speed(const char *param) { double *t; int iter, k; bench_problem *p; double tmin, y; t = (double *) bench_malloc(time_repeat * sizeof(double)); p = problem_parse(param); BENCH_ASSERT(can_do(p)); problem_alloc(p); problem_zero(p); timer_start(); setup(p); p->setup_time = timer_stop(); start_over: for (iter = 1; iter < (1<<30); iter *= 2) { tmin = 1.0e20; for (k = 0; k < time_repeat; ++k) { timer_start(); doit(iter, p); y = timer_stop(); if (y < 0) /* yes, it happens */ goto start_over; t[k] = y; if (y < tmin) tmin = y; } if (tmin >= time_min) goto done; } goto start_over; /* this also happens */ done: done(p); for (k = 0; k < time_repeat; ++k) { t[k] /= iter; } report(p, t, time_repeat); problem_destroy(p); bench_free(t); return; }
/*
 * Run accuracy_test() on a PROBLEM_R2R problem of rank 1 with a rank-0
 * vector tensor (both asserted).  The kind of the single dimension,
 * p->k[0], selects the input constraint and the length n0 that is
 * passed on to accuracy_test().
 */
void accuracy_r2r(bench_problem *p, int rounds, int impulse_rounds, double t[6])
{
    dofft_r2r_closure cl;
    aconstrain constrain = 0;
    C *buf_a, *buf_b;
    int n;
    int n0 = 1;

    BENCH_ASSERT(p->kind == PROBLEM_R2R);
    BENCH_ASSERT(p->sz->rnk == 1);
    BENCH_ASSERT(p->vecsz->rnk == 0);

    cl.k.apply = r2r_apply;
    cl.k.recopy_input = 0;
    cl.p = p;

    n = tensor_sz(p->sz);

    /* per-kind constraint function and length n0, derived from n */
    switch (p->k[0]) {
    case R2R_R2HC:
        constrain = mkreal;
        n0 = n;
        break;
    case R2R_HC2R:
        constrain = mkhermitian1;
        n0 = n;
        break;
    case R2R_REDFT00:
        constrain = mkre00;
        n0 = 2*(n-1);
        break;
    case R2R_RODFT00:
        constrain = mkro00;
        n0 = 2*(n+1);
        break;
    case R2R_REDFT01:
        constrain = mkre01;
        n0 = 4*n;
        break;
    case R2R_REDFT10:
        constrain = mkre10;
        n0 = 4*n;
        break;
    case R2R_RODFT01:
        constrain = mkro01;
        n0 = 4*n;
        break;
    case R2R_RODFT10:
        constrain = mkio10;
        n0 = 4*n;
        break;
    case R2R_REDFT11:
        constrain = mkre11;
        n0 = 8*n;
        break;
    case R2R_RODFT11:
        constrain = mkro11;
        n0 = 8*n;
        break;
    default:
        BENCH_ASSERT(0); /* not yet implemented */
    }

    cl.n0 = n0;

    buf_a = (C *) bench_malloc(n0 * sizeof(C));
    buf_b = (C *) bench_malloc(n0 * sizeof(C));

    /* the sign argument is always -1 here */
    accuracy_test(&cl.k, constrain, -1, n0, buf_a, buf_b,
                  rounds, impulse_rounds, t);

    bench_free(buf_b);
    bench_free(buf_a);
}
/*
 * Run accuracy_test() on a PROBLEM_REAL problem of rank 1 with a rank-0
 * vector tensor (both asserted).  The input constraint is mkreal for a
 * negative sign and mkhermitian1 otherwise.
 */
void accuracy_rdft2(bench_problem *p, int rounds, int impulse_rounds, double t[6])
{
    dofft_rdft2_closure cl;
    C *buf_a, *buf_b;
    int len;

    BENCH_ASSERT(p->kind == PROBLEM_REAL);
    BENCH_ASSERT(p->sz->rnk == 1);
    BENCH_ASSERT(p->vecsz->rnk == 0);

    cl.k.apply = rdft2_apply;
    cl.k.recopy_input = 0;
    cl.p = p;

    len = tensor_sz(p->sz);

    buf_a = (C *) bench_malloc(len * sizeof(C));
    buf_b = (C *) bench_malloc(len * sizeof(C));

    accuracy_test(&cl.k, p->sign < 0 ? mkreal : mkhermitian1, p->sign,
                  len, buf_a, buf_b, rounds, impulse_rounds, t);

    bench_free(buf_b);
    bench_free(buf_a);
}
/*
 * Run the impulse, linearity and both shift checks on a
 * PROBLEM_COMPLEX problem and store the results in e->i, e->l and e->s
 * (e->s is the larger of the two shift results, and at least 0).
 *
 * rounds - number of rounds per check; 0 selects the default of 20.
 * tol    - tolerance forwarded to the individual checks.
 *
 * When the problem is neither in-place nor allowed to destroy its
 * input, preserves_input() is run as well.
 */
void verify_dft(bench_problem *p, int rounds, double tol, errors *e)
{
    C *inA, *inB, *inC, *outA, *outB, *outC, *tmp;
    dofft_dft_closure cl;
    int n, vecn, N;

    BENCH_ASSERT(p->kind == PROBLEM_COMPLEX);

    cl.k.apply = dft_apply;
    cl.k.recopy_input = 0;
    cl.p = p;

    /* default value */
    if (rounds == 0)
        rounds = 20;

    n = tensor_sz(p->sz);
    vecn = tensor_sz(p->vecsz);
    N = n * vecn;

    /* seven scratch arrays of N complex values each */
    inA = (C *) bench_malloc(N * sizeof(C));
    inB = (C *) bench_malloc(N * sizeof(C));
    inC = (C *) bench_malloc(N * sizeof(C));
    outA = (C *) bench_malloc(N * sizeof(C));
    outB = (C *) bench_malloc(N * sizeof(C));
    outC = (C *) bench_malloc(N * sizeof(C));
    tmp = (C *) bench_malloc(N * sizeof(C));

    e->i = impulse(&cl.k, n, vecn, inA, inB, inC, outA, outB, outC,
                   tmp, rounds, tol);
    e->l = linear(&cl.k, 0, N, inA, inB, inC, outA, outB, outC,
                  tmp, rounds, tol);

    /* time shift first, then frequency shift; keep the larger result */
    e->s = 0.0;
    e->s = dmax(e->s, tf_shift(&cl.k, 0, p->sz, n, vecn, p->sign,
                               inA, inB, outA, outB, tmp,
                               rounds, tol, TIME_SHIFT));
    e->s = dmax(e->s, tf_shift(&cl.k, 0, p->sz, n, vecn, p->sign,
                               inA, inB, outA, outB, tmp,
                               rounds, tol, FREQ_SHIFT));

    if (!p->in_place && !p->destroy_input)
        preserves_input(&cl.k, 0, N, inA, inB, outB, rounds);

    /* release in reverse order of allocation */
    bench_free(tmp);
    bench_free(outC);
    bench_free(outB);
    bench_free(outA);
    bench_free(inC);
    bench_free(inB);
    bench_free(inA);
}
static void alloc_rnk(int rnk_) { rnk = rnk_; bench_free(local_ni); if (rnk == 0) local_ni = 0; else local_ni = (ptrdiff_t *) bench_malloc(sizeof(ptrdiff_t) * rnk * (8 + n_pes * 4)); local_starti = local_ni + rnk; local_no = local_ni + 2 * rnk; local_starto = local_ni + 3 * rnk; istrides = local_ni + 4 * rnk; ostrides = local_ni + 5 * rnk; total_ni = local_ni + 6 * rnk; total_no = local_ni + 7 * rnk; all_local_ni = local_ni + 8 * rnk; all_local_starti = local_ni + (8 + n_pes) * rnk; all_local_no = local_ni + (8 + 2 * n_pes) * rnk; all_local_starto = local_ni + (8 + 3 * n_pes) * rnk; }
/*
 * Pack `in` into all_local_in, one block per process, and distribute
 * the blocks with MPI_Scatterv (root 0) into each process's local_in.
 *
 * The staging buffer is reallocated to iNtot elements first whenever
 * all_local_in_alloc is set; the flag is then cleared.
 */
static void do_scatter_in(bench_real *in)
{
    bench_real *block;
    int pe;

    if (all_local_in_alloc) {
        bench_free(all_local_in);
        all_local_in = (bench_real *) bench_malloc(iNtot * sizeof(bench_real));
        all_local_in_alloc = 0;
    }

    /* block pe starts isend_cnt[0] + ... + isend_cnt[pe-1] elements in */
    for (pe = 0, block = all_local_in; pe < n_pes; block += isend_cnt[pe], ++pe) {
        copy_block_in(block, rnk,
                      all_local_ni + pe * rnk,
                      all_local_starti + pe * rnk,
                      vn, istrides, vn, in);
    }

    MPI_Scatterv(all_local_in, isend_cnt, isend_off, BENCH_MPI_TYPE,
                 local_in, isend_cnt[my_pe], BENCH_MPI_TYPE,
                 0, MPI_COMM_WORLD);
}
/*
 * Collect every process's local_out into all_local_out with
 * MPI_Gatherv (root 0), broadcast the gathered data from root 0, and
 * unpack it block by block into `out`.
 *
 * The staging buffer is reallocated to oNtot elements first whenever
 * all_local_out_alloc is set; the flag is then cleared.
 */
static void do_gather_out(bench_real *out)
{
    bench_real *block;
    int pe;

    if (all_local_out_alloc) {
        bench_free(all_local_out);
        all_local_out = (bench_real *) bench_malloc(oNtot * sizeof(bench_real));
        all_local_out_alloc = 0;
    }

    MPI_Gatherv(local_out, orecv_cnt[my_pe], BENCH_MPI_TYPE,
                all_local_out, orecv_cnt, orecv_off, BENCH_MPI_TYPE,
                0, MPI_COMM_WORLD);
    MPI_Bcast(all_local_out, oNtot, BENCH_MPI_TYPE, 0, MPI_COMM_WORLD);

    /* block pe starts orecv_cnt[0] + ... + orecv_cnt[pe-1] elements in */
    for (pe = 0, block = all_local_out; pe < n_pes; block += orecv_cnt[pe], ++pe) {
        copy_block_out(block, rnk,
                       all_local_no + pe * rnk,
                       all_local_starto + pe * rnk,
                       vn, ostrides, vn, out);
    }
}
/*
 * Run the impulse, linearity and shift checks on a real-to-real
 * problem and store the results in e->i, e->l and e->s.
 *
 * rounds - number of rounds per check; 0 selects the default of 20.
 * tol    - tolerance forwarded to the individual checks.
 *
 * A dim_stuff entry is built for every dimension of p->sz from the
 * transform kind p->k[i].  An unrecognized kind trips BENCH_ASSERT and
 * the function returns without running any check.
 *
 * When the problem is neither in-place nor allowed to destroy its
 * input, an additional loop checks that the transform leaves the input
 * unchanged.
 */
void verify_r2r(bench_problem *p, int rounds, double tol, errors *e)
{
    R *inA, *inB, *inC, *outA, *outB, *outC, *tmp;
    info nfo;
    int n, vecn, N;
    double impulse_amp = 1.0;
    dim_stuff *d;
    int i;

    if (rounds == 0)
        rounds = 20;    /* default value */

    n = tensor_sz(p->sz);
    vecn = tensor_sz(p->vecsz);
    N = n * vecn;

    d = (dim_stuff *) bench_malloc(sizeof(dim_stuff) * p->sz->rnk);
    for (i = 0; i < p->sz->rnk; ++i) {
        int n0, i0, k0;
        trigfun ti, ts;

        d[i].n = n0 = p->sz->dims[i].n;

        /* kinds that compare greater than R2R_DHT use a length of
           2*(n-1), 2*(n+1) or 2*n depending on the kind */
        if (p->k[i] > R2R_DHT)
            n0 = 2 * (n0 + (p->k[i] == R2R_REDFT00 ? -1 :
                            (p->k[i] == R2R_RODFT00 ? 1 : 0)));

        /* per-kind index offsets (i0, k0), trig functions (ti, ts), and
           the factor accumulated into the impulse amplitude */
        switch (p->k[i]) {
        case R2R_R2HC:
            i0 = k0 = 0;
            ti = realhalf;
            ts = coshalf;
            break;
        case R2R_DHT:
            i0 = k0 = 0;
            ti = unity;
            ts = cos00;
            break;
        case R2R_HC2R:
            i0 = k0 = 0;
            ti = unity;
            ts = cos00;
            break;
        case R2R_REDFT00:
            i0 = k0 = 0;
            ti = ts = cos00;
            break;
        case R2R_REDFT01:
            i0 = k0 = 0;
            ti = ts = cos01;
            break;
        case R2R_REDFT10:
            i0 = k0 = 0;
            ti = cos10;
            impulse_amp *= 2.0;
            ts = cos00;
            break;
        case R2R_REDFT11:
            i0 = k0 = 0;
            ti = cos11;
            impulse_amp *= 2.0;
            ts = cos01;
            break;
        case R2R_RODFT00:
            i0 = k0 = 1;
            ti = sin00;
            impulse_amp *= 2.0;
            ts = cos00;
            break;
        case R2R_RODFT01:
            i0 = 1;
            k0 = 0;
            ti = sin01;
            impulse_amp *= n == 1 ? 1.0 : 2.0;
            ts = cos01;
            break;
        case R2R_RODFT10:
            i0 = 0;
            k0 = 1;
            ti = sin10;
            impulse_amp *= 2.0;
            ts = cos00;
            break;
        case R2R_RODFT11:
            i0 = k0 = 0;
            ti = sin11;
            impulse_amp *= 2.0;
            ts = cos01;
            break;
        default:
            BENCH_ASSERT(0);
            /* release the per-dimension table before bailing out so
               that it is not leaked if the assertion returns */
            bench_free(d);
            return;
        }

        d[i].n0 = n0;
        d[i].i0 = i0;
        d[i].k0 = k0;
        d[i].ti = ti;
        d[i].ts = ts;
    }

    /* seven scratch arrays of N real values each */
    inA = (R *) bench_malloc(N * sizeof(R));
    inB = (R *) bench_malloc(N * sizeof(R));
    inC = (R *) bench_malloc(N * sizeof(R));
    outA = (R *) bench_malloc(N * sizeof(R));
    outB = (R *) bench_malloc(N * sizeof(R));
    outC = (R *) bench_malloc(N * sizeof(R));
    tmp = (R *) bench_malloc(N * sizeof(R));

    /* nfo.probsz borrows p->sz; the other three tensors are created
       here and destroyed at the end of this function */
    nfo.p = p;
    nfo.probsz = p->sz;
    nfo.totalsz = tensor_append(p->vecsz, nfo.probsz);
    nfo.pckdsz = verify_pack(nfo.totalsz, 1);
    nfo.pckdvecsz = verify_pack(p->vecsz, tensor_sz(nfo.probsz));

    e->i = rimpulse(d, impulse_amp, n, vecn, &nfo,
                    inA, inB, inC, outA, outB, outC, tmp, rounds, tol);
    e->l = rlinear(N, &nfo, inA, inB, inC, outA, outB, outC, tmp,
                   rounds, tol);
    e->s = t_shift(n, vecn, &nfo, inA, inB, outA, outB, tmp,
                   rounds, tol, d);

    /* grr, verify-lib.c:preserves_input() only works for complex */
    if (!p->in_place && !p->destroy_input) {
        bench_tensor *totalsz_swap, *pckdsz_swap;

        totalsz_swap = tensor_copy_swapio(nfo.totalsz);
        pckdsz_swap = tensor_copy_swapio(nfo.pckdsz);

        for (i = 0; i < rounds; ++i) {
            rarand(inA, N);
            dofft(&nfo, inA, outB);
            /* copy the problem's input buffer back out and compare it
               with what was fed in, using a tolerance of 0.0 */
            cpyr((R *) nfo.p->in, totalsz_swap, inB, pckdsz_swap);
            racmp(inB, inA, N, "preserves_input", 0.0);
        }

        tensor_destroy(totalsz_swap);
        tensor_destroy(pckdsz_swap);
    }

    tensor_destroy(nfo.totalsz);
    tensor_destroy(nfo.pckdsz);
    tensor_destroy(nfo.pckdvecsz);

    bench_free(tmp);
    bench_free(outC);
    bench_free(outB);
    bench_free(outA);
    bench_free(inC);
    bench_free(inB);
    bench_free(inA);
    bench_free(d);
}
void speed(const char *param, int setup_only) { double *t; int iter = 0, k; bench_problem *p; double tmin, y; t = (double *) bench_malloc(time_repeat * sizeof(double)); for (k = 0; k < time_repeat; ++k) t[k] = 0; p = problem_parse(param); BENCH_ASSERT(can_do(p)); if (!no_speed_allocation) { problem_alloc(p); problem_zero(p); } timer_start(LIBBENCH_TIMER); setup(p); p->setup_time = bench_cost_postprocess(timer_stop(LIBBENCH_TIMER)); /* reset the input to zero again, because the planner in paranoid mode sets it to random values, thus making the benchmark diverge. */ if (!no_speed_allocation) problem_zero(p); if (setup_only) goto done; start_over: for (iter = 1; iter < (1<<30); iter *= 2) { tmin = 1.0e20; for (k = 0; k < time_repeat; ++k) { timer_start(LIBBENCH_TIMER); doit(iter, p); y = bench_cost_postprocess(timer_stop(LIBBENCH_TIMER)); if (y < 0) /* yes, it happens */ goto start_over; t[k] = y; if (y < tmin) tmin = y; } if (tmin >= time_min) goto done; } goto start_over; /* this also happens */ done: done(p); if (iter) for (k = 0; k < time_repeat; ++k) t[k] /= iter; else for (k = 0; k < time_repeat; ++k) t[k] = 0; report(p, t, time_repeat); if (!no_speed_allocation) problem_destroy(p); bench_free(t); return; }
int bench_main(int argc, char *argv[]) { double tmin = 0.0; double tol; int repeat = 0; int rounds = 10; int iarounds = 0; int arounds = 1; /* this is too low for precise results */ int c; int index; char *short_options = make_short_options(long_options); check_alignment(&tol); report = report_verbose; /* default */ verbose = 0; tol = SINGLE_PRECISION ? 1.0e-3 : 1.0e-10; bench_srand(1); while ((c = getopt_long (argc, argv, short_options, long_options, &index)) != -1) { switch (c) { case 't' : tmin = strtod(optarg, 0); break; case 'r': repeat = atoi(optarg); break; case 's': timer_init(tmin, repeat); speed(optarg); break; case 'd': report_can_do(optarg); break; case 'o': useropt(optarg); break; case 'v': if (optarg) verbose = atoi(optarg); else ++verbose; break; case 'y': verify(optarg, rounds, tol); break; case 'a': accuracy(optarg, arounds, iarounds); break; case 'i': report_info(optarg); break; case 'I': report_info_all(); break; case 'h': usage(argv[0], long_options); break; case 300: /* --report-mflops */ report = report_mflops; break; case 310: /* --report-time */ report = report_time; break; case 320: /* --report-benchmark */ report = report_benchmark; break; case 330: /* --report-verbose */ report = report_verbose; break; case 400: /* --print-time-min */ timer_init(tmin, repeat); ovtpvt("%g\n", time_min); break; case 401: /* --verify-rounds */ rounds = atoi(optarg); break; case 402: /* --print-precision */ if (SINGLE_PRECISION) ovtpvt("single\n"); else if (LDOUBLE_PRECISION) ovtpvt("long-double\n"); else if (DOUBLE_PRECISION) ovtpvt("double\n"); else ovtpvt("unknown %d\n", sizeof(bench_real)); break; case 403: /* --verify-tolerance */ tol = strtod(optarg, 0); break; case 404: /* --random-seed */ bench_srand(atoi(optarg)); break; case 405: /* --accuracy-rounds */ arounds = atoi(optarg); break; case 406: /* --impulse-accuracy-rounds */ iarounds = atoi(optarg); break; case '?': /* `getopt_long' already printed an error message. 
*/ break; default: abort (); } } /* assume that any remaining arguments are problems to be benchmarked */ while (optind < argc) { timer_init(tmin, repeat); speed(argv[optind++]); } cleanup(); bench_free(short_options); return 0; }
/*
 * Post-benchmark hook: for a problem of rank 1, release WSAVE.
 * Nothing is done for any other rank.
 */
void done(struct problem *p)
{
    UNUSED(p);

    if (p->rank != 1)
        return;

    bench_free(WSAVE);
}
/*
 * Free a tensor: its dims array first (via bench_free0), then the
 * tensor structure itself.  sz is dereferenced unconditionally.
 */
void tensor_destroy(bench_tensor *sz)
{
    /* the dims array must go before the structure that points to it */
    bench_free0(sz->dims);

    bench_free(sz);
}
/*
 * Parse a tensor description starting at s.
 *
 * The text is one or more dimensions separated by 'x' or 'X'.  Each
 * dimension is
 *
 *     n [ ':' is [ ':' os ] ] [ kind ]
 *
 * where n, is and os are read with parseint().  A missing os defaults
 * to is; when no stride is given at all, both are 0.  The optional kind
 * is one of
 *
 *     f/F -> R2R_R2HC      b/B -> R2R_HC2R      h/H -> R2R_DHT
 *     e/E followed by a number equal to 0, 1, 10 or 11 -> R2R_REDFTxx
 *     o/O followed by a number equal to 0, 1, 10 or 11 -> R2R_RODFTxx
 *
 * and defaults to R2R_R2HC.  Any other number after e/o trips
 * BENCH_ASSERT.
 *
 * tp - receives the newly built tensor (from mktensor()).
 * k  - if non-null, receives a newly allocated array holding the kind
 *      of each dimension, in the same order as the tensor's dims.
 *
 * Returns the position in s just past the parsed text.
 */
static const char *parsetensor(const char *s, bench_tensor **tp, r2r_kind_t **k)
{
    struct dimlist *head = 0;
    struct dimlist *node;
    bench_tensor *t;
    int rnk = 0;
    int i;

    for (;;) {
        node = (struct dimlist *) bench_malloc(sizeof(struct dimlist));

        /* push onto the front: the list ends up in reverse order */
        node->cdr = head;
        head = node;
        ++rnk;

        s = parseint(s, &node->car.n);

        if (*s != ':') {
            node->car.is = 0;
            node->car.os = 0;
        } else {
            /* read input stride */
            ++s;
            s = parseint(s, &node->car.is);
            if (*s == ':') {
                /* read output stride */
                ++s;
                s = parseint(s, &node->car.os);
            } else {
                /* default */
                node->car.os = node->car.is;
            }
        }

        switch (*s) {
        case 'f':
        case 'F':
            node->k = R2R_R2HC;
            ++s;
            break;
        case 'b':
        case 'B':
            node->k = R2R_HC2R;
            ++s;
            break;
        case 'h':
        case 'H':
            node->k = R2R_DHT;
            ++s;
            break;
        case 'e':
        case 'E':
        case 'o':
        case 'O': {
            char letter = *(s++);
            int is_even = (letter == 'e' || letter == 'E');
            int ab;

            s = parseint(s, &ab);
            switch (ab) {
            case 0:
                node->k = is_even ? R2R_REDFT00 : R2R_RODFT00;
                break;
            case 1:
                node->k = is_even ? R2R_REDFT01 : R2R_RODFT01;
                break;
            case 10:
                node->k = is_even ? R2R_REDFT10 : R2R_RODFT10;
                break;
            case 11:
                node->k = is_even ? R2R_REDFT11 : R2R_RODFT11;
                break;
            default:
                BENCH_ASSERT(0);
                break;
            }
            break;
        }
        default:
            node->k = R2R_R2HC;
            break;
        }

        /* another dimension follows only after an 'x' separator */
        if (*s != 'x' && *s != 'X')
            break;
        ++s;
    }

    /* now we have a dimlist.  Build bench_tensor, etc. */

    if (k && rnk > 0) {
        *k = (r2r_kind_t *) bench_malloc(sizeof(r2r_kind_t) * rnk);
        /* the list is reversed, so fill the array from the back */
        node = head;
        for (i = rnk - 1; i >= 0; --i) {
            BENCH_ASSERT(node);
            (*k)[i] = node->k;
            node = node->cdr;
        }
    }

    t = mktensor(rnk);

    /* pop each node into its dims slot, last dimension first */
    for (i = rnk - 1; i >= 0; --i) {
        BENCH_ASSERT(head);
        node = head;
        head = node->cdr;

        t->dims[i].n = node->car.n;
        t->dims[i].is = node->car.is;
        t->dims[i].os = node->car.os;

        bench_free(node);
    }

    *tp = t;
    return s;
}