/* Check if the vecsz/sz strides are consistent with the problem
   being in-place for vecsz.dim[vdim], or for all dimensions
   if vdim == RNK_MINFTY.  We can't just use tensor_inplace_strides
   because rdft transforms have the unfortunate property of
   differing input and output sizes.  This routine is not
   exhaustive; we only return 1 for the most common case. */
int X(rdft2_inplace_strides)(const problem_rdft2 *p, int vdim)
{
     INT N, Nc;
     INT rs, cs;
     int i;

     for (i = 0; i + 1 < p->sz->rnk; ++i)
          if (p->sz->dims[i].is != p->sz->dims[i].os)
               return 0;

     if (!FINITE_RNK(p->vecsz->rnk) || p->vecsz->rnk == 0)
          return 1;
     if (!FINITE_RNK(vdim)) { /* check all vector dimensions */
          for (vdim = 0; vdim < p->vecsz->rnk; ++vdim)
               if (!X(rdft2_inplace_strides)(p, vdim))
                    return 0;
          return 1;
     }

     A(vdim < p->vecsz->rnk);
     if (p->sz->rnk == 0)
          return(p->vecsz->dims[vdim].is == p->vecsz->dims[vdim].os);

     N = X(tensor_sz)(p->sz);
     Nc = (N / p->sz->dims[p->sz->rnk-1].n) *
          (p->sz->dims[p->sz->rnk-1].n/2 + 1);
     X(rdft2_strides)(p->kind, p->sz->dims + p->sz->rnk - 1, &rs, &cs);

     /* the factor of 2 comes from the fact that RS is the stride
        of p->r0 and p->r1, which is twice as large as the strides
        in the r2r case */
     return(p->vecsz->dims[vdim].is == p->vecsz->dims[vdim].os
            && (X(iabs)(2 * p->vecsz->dims[vdim].os)
                >= X(imax)(2 * Nc * X(iabs)(cs), N * X(iabs)(rs))));
}
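/* Illustrative sketch (not part of the original source): a worked example of
   the size bookkeeping above, assuming a rank-2 R2HC problem of logical size
   n0 x n1 = 10 x 10 with a contiguous last dimension.  The helper name is
   hypothetical; it merely repeats the N/Nc arithmetic of
   X(rdft2_inplace_strides) with plain integers. */
static void sketch_rdft2_sizes(void)
{
     int n0 = 10, n1 = 10;
     int N = n0 * n1;                  /* 100 real elements per transform */
     int Nc = (N / n1) * (n1 / 2 + 1); /* 10 * 6 = 60 complex elements */

     /* with these values the stride test above reduces to
        |2 * os| >= max(2 * 60 * |cs|, 100 * |rs|)
        for the chosen vector dimension */
     (void)Nc;
}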
/* Check if the vecsz/sz strides are consistent with the problem
   being in-place for vecsz.dim[vdim], or for all dimensions
   if vdim == RNK_MINFTY.  We can't just use tensor_inplace_strides
   because rdft transforms have the unfortunate property of
   differing input and output sizes.  This routine is not
   exhaustive; we only return 1 for the most common case. */
int X(rdft2_inplace_strides)(const problem_rdft2 *p, int vdim)
{
     int N, Nc;
     int is, os;
     int i;

     for (i = 0; i + 1 < p->sz->rnk; ++i)
          if (p->sz->dims[i].is != p->sz->dims[i].os)
               return 0;

     if (!FINITE_RNK(p->vecsz->rnk) || p->vecsz->rnk == 0)
          return 1;
     if (!FINITE_RNK(vdim)) { /* check all vector dimensions */
          for (vdim = 0; vdim < p->vecsz->rnk; ++vdim)
               if (!X(rdft2_inplace_strides)(p, vdim))
                    return 0;
          return 1;
     }

     A(vdim < p->vecsz->rnk);
     if (p->sz->rnk == 0)
          return(p->vecsz->dims[vdim].is == p->vecsz->dims[vdim].os);

     N = X(tensor_sz)(p->sz);
     Nc = (N / p->sz->dims[p->sz->rnk-1].n) *
          (p->sz->dims[p->sz->rnk-1].n/2 + 1);
     X(rdft2_strides)(p->kind, p->sz->dims + p->sz->rnk - 1, &is, &os);

     return(p->vecsz->dims[vdim].is == p->vecsz->dims[vdim].os
            && X(iabs)(p->vecsz->dims[vdim].os)
               >= X(imax)(Nc * X(iabs)(os), N * X(iabs)(is)));
}
/* The inverse of X(tensor_append): splits the sz tensor into
   tensor a followed by tensor b, where a's rank is arnk. */
void X(tensor_split)(const tensor *sz, tensor **a, int arnk, tensor **b)
{
     A(FINITE_RNK(sz->rnk) && FINITE_RNK(arnk));

     *a = X(tensor_copy_sub)(sz, 0, arnk);
     *b = X(tensor_copy_sub)(sz, arnk, sz->rnk - arnk);
}
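/* Illustrative usage sketch (not part of the original source), assuming the
   kernel routines shown elsewhere in this collection are available: peel the
   leading loop dimension off a finite-rank tensor.  The helper name is
   hypothetical. */
static void sketch_split_leading_dim(const tensor *sz)
{
     tensor *loop, *rest;
     X(tensor_split)(sz, &loop, 1, &rest); /* loop->rnk == 1 */
     /* ... plan or iterate over loop and rest here ... */
     X(tensor_destroy2)(loop, rest);
}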
tensor *X(mktensor)(int rnk)
{
     tensor *x;

     A(rnk >= 0);

#if defined(STRUCT_HACK_KR)
     if (FINITE_RNK(rnk) && rnk > 1)
          x = (tensor *)MALLOC(sizeof(tensor) + (rnk - 1) * sizeof(iodim),
                               TENSORS);
     else
          x = (tensor *)MALLOC(sizeof(tensor), TENSORS);
#elif defined(STRUCT_HACK_C99)
     if (FINITE_RNK(rnk))
          x = (tensor *)MALLOC(sizeof(tensor) + rnk * sizeof(iodim),
                               TENSORS);
     else
          x = (tensor *)MALLOC(sizeof(tensor), TENSORS);
#else
     x = (tensor *)MALLOC(sizeof(tensor), TENSORS);
     if (FINITE_RNK(rnk) && rnk > 0)
          x->dims = (iodim *)MALLOC(sizeof(iodim) * rnk, TENSORS);
     else
          x->dims = 0;
#endif

     x->rnk = rnk;
     return x;
}
void verify_rdft2(bench_problem *p, int rounds, double tol, errors *e)
{
     C *inA, *inB, *inC, *outA, *outB, *outC, *tmp;
     int n, vecn, N;
     dofft_rdft2_closure k;

     BENCH_ASSERT(p->kind == PROBLEM_REAL);

     if (!FINITE_RNK(p->sz->rnk) || !FINITE_RNK(p->vecsz->rnk))
          return; /* give up */

     k.k.apply = rdft2_apply;
     k.k.recopy_input = 0;
     k.p = p;

     if (rounds == 0)
          rounds = 20;  /* default value */

     n = tensor_sz(p->sz);
     vecn = tensor_sz(p->vecsz);
     N = n * vecn;

     inA = (C *) bench_malloc(N * sizeof(C));
     inB = (C *) bench_malloc(N * sizeof(C));
     inC = (C *) bench_malloc(N * sizeof(C));
     outA = (C *) bench_malloc(N * sizeof(C));
     outB = (C *) bench_malloc(N * sizeof(C));
     outC = (C *) bench_malloc(N * sizeof(C));
     tmp = (C *) bench_malloc(N * sizeof(C));

     e->i = impulse(&k.k, n, vecn, inA, inB, inC, outA, outB, outC,
                    tmp, rounds, tol);
     e->l = linear(&k.k, 1, N, inA, inB, inC, outA, outB, outC,
                   tmp, rounds, tol);

     e->s = 0.0;
     if (p->sign < 0)
          e->s = dmax(e->s, tf_shift(&k.k, 1, p->sz, n, vecn, p->sign,
                                     inA, inB, outA, outB,
                                     tmp, rounds, tol, TIME_SHIFT));
     else
          e->s = dmax(e->s, tf_shift(&k.k, 1, p->sz, n, vecn, p->sign,
                                     inA, inB, outA, outB,
                                     tmp, rounds, tol, FREQ_SHIFT));

     if (!p->in_place && !p->destroy_input)
          preserves_input(&k.k, p->sign < 0 ? mkreal : mkhermitian1,
                          N, inA, inB, outB, rounds);

     bench_free(tmp);
     bench_free(outC);
     bench_free(outB);
     bench_free(outA);
     bench_free(inC);
     bench_free(inB);
     bench_free(inA);
}
static int applicable0(const solver *ego_, const problem *p_, int *rp)
{
     const problem_rdft *p = (const problem_rdft *) p_;
     const S *ego = (const S *)ego_;
     return (1
             && FINITE_RNK(p->sz->rnk) && FINITE_RNK(p->vecsz->rnk)
             && p->sz->rnk >= 2
             && picksplit(ego, p->sz, rp)
          );
}
tensor *X(tensor_append)(const tensor *a, const tensor *b)
{
     if (!FINITE_RNK(a->rnk) || !FINITE_RNK(b->rnk)) {
          return X(mktensor)(RNK_MINFTY);
     } else {
          tensor *x = X(mktensor)(a->rnk + b->rnk);
          dimcpy(x->dims, a->dims, a->rnk);
          dimcpy(x->dims + a->rnk, b->dims, b->rnk);
          return x;
     }
}
/* do what I mean */
static bench_tensor *dwim(bench_tensor *t, bench_iodim **last_iodim,
                          n_transform nti, n_transform nto,
                          bench_iodim *dt)
{
     int i;
     bench_iodim *d, *d1;

     if (!FINITE_RNK(t->rnk) || t->rnk < 1)
          return t;

     i = t->rnk;
     d1 = *last_iodim;

     while (--i >= 0) {
          d = t->dims + i;
          if (!d->is)
               d->is = d1->is * transform_n(d1->n, d1==dt ? nti : SAME);
          if (!d->os)
               d->os = d1->os * transform_n(d1->n, d1==dt ? nto : SAME);
          d1 = d;
     }

     *last_iodim = d1;
     return t;
}
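/* Illustrative trace (not part of the original source), assuming
   transform_n(n, SAME) == n as the name suggests: for
   t->dims = { {n=5, is=0, os=0}, {n=4, is=0, os=0} } and an initial
   *last_iodim of {n=1, is=1, os=1}, the loop fills only the zero strides,
   working from the fastest dimension outward, and yields is/os = 1 for the
   last dimension and is/os = 4 for the first, i.e. row-major defaults. */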
static void dimcpy(iodim *dst, const iodim *src, int rnk)
{
     int i;
     if (FINITE_RNK(rnk))
          for (i = 0; i < rnk; ++i)
               dst[i] = src[i];
}
/* Like tensor_compress, but also compress into one dimension any
   group of dimensions that form a contiguous block of indices with
   some stride.  (This can safely be done for transform vector sizes.) */
tensor *X(tensor_compress_contiguous)(const tensor *sz)
{
     int i, rnk;
     tensor *sz2, *x;

     if (X(tensor_sz)(sz) == 0)
          return X(mktensor)(RNK_MINFTY);

     sz2 = X(tensor_compress)(sz);
     A(FINITE_RNK(sz2->rnk));

     if (sz2->rnk < 2)           /* nothing to compress */
          return sz2;

     for (i = rnk = 1; i < sz2->rnk; ++i)
          if (!strides_contig(sz2->dims + i - 1, sz2->dims + i))
               ++rnk;

     x = X(mktensor)(rnk);
     x->dims[0] = sz2->dims[0];
     for (i = rnk = 1; i < sz2->rnk; ++i) {
          if (strides_contig(sz2->dims + i - 1, sz2->dims + i)) {
               x->dims[rnk - 1].n *= sz2->dims[i].n;
               x->dims[rnk - 1].is = sz2->dims[i].is;
               x->dims[rnk - 1].os = sz2->dims[i].os;
          } else {
               A(rnk < x->rnk);
               x->dims[rnk++] = sz2->dims[i];
          }
     }

     X(tensor_destroy)(sz2);
     return x;
}
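/* Illustrative worked example (not part of the original source), assuming
   strides_contig(a, b) tests a->is == b->is * b->n and likewise for os: the
   dimensions {n=3, is=8, os=8} and {n=4, is=2, os=2} form one contiguous
   block of indices and are merged by the loop above into the single
   dimension {n=12, is=2, os=2}, keeping the stride of the fastest-varying
   member. */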
/* Like tensor_copy, but eliminate n == 1 dimensions, which
   never affect any transform or transform vector.

   Also, we sort the tensor into a canonical order of decreasing
   is.  In general, processing a loop/array in order of decreasing
   stride will improve locality; sorting also makes the analysis in
   fftw_tensor_contiguous (below) easier.  The choice of is over os
   is mostly arbitrary, and hopefully shouldn't affect things much.
   Normally, either the os will be in the same order as is (for
   e.g. multi-dimensional transforms) or will be in opposite order
   (e.g. for Cooley-Tukey recursion).  (Both forward and backwards
   traversal of the tensor are considered e.g. by vrank-geq1, so
   sorting in increasing vs. decreasing order is not really
   important.) */
tensor *X(tensor_compress)(const tensor *sz)
{
     int i, rnk;
     tensor *x;

     A(FINITE_RNK(sz->rnk));
     for (i = rnk = 0; i < sz->rnk; ++i) {
          A(sz->dims[i].n > 0);
          if (sz->dims[i].n != 1)
               ++rnk;
     }

     x = X(mktensor)(rnk);
     for (i = rnk = 0; i < sz->rnk; ++i) {
          if (sz->dims[i].n != 1)
               x->dims[rnk++] = sz->dims[i];
     }

     if (rnk > 1) {
          qsort(x->dims, (size_t)x->rnk, sizeof(iodim),
                (int (*)(const void *, const void *))X(dimcmp));
     }

     return x;
}
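/* Illustrative worked example (not part of the original source): compressing
   { {n=1, is=100, os=100}, {n=4, is=1, os=1}, {n=5, is=4, os=4} } drops the
   n == 1 dimension and sorts the remainder by decreasing is via X(dimcmp),
   giving { {n=5, is=4, os=4}, {n=4, is=1, os=1} }. */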
problem *X(mkproblem_dft)(const tensor *sz, const tensor *vecsz,
                          R *ri, R *ii, R *ro, R *io)
{
     problem_dft *ego =
          (problem_dft *)X(mkproblem)(sizeof(problem_dft), &padt);

     A((ri == ro) == (ii == io)); /* both in place or both out of place */
     A(X(tensor_kosherp)(sz));
     A(X(tensor_kosherp)(vecsz));

     /* enforce pointer equality if untainted pointers are equal */
     if (UNTAINT(ri) == UNTAINT(ro))
          ri = ro = JOIN_TAINT(ri, ro);
     if (UNTAINT(ii) == UNTAINT(io))
          ii = io = JOIN_TAINT(ii, io);

     /* more correctness conditions: */
     A(TAINTOF(ri) == TAINTOF(ii));
     A(TAINTOF(ro) == TAINTOF(io));

     ego->sz = X(tensor_compress)(sz);
     ego->vecsz = X(tensor_compress_contiguous)(vecsz);
     ego->ri = ri;
     ego->ii = ii;
     ego->ro = ro;
     ego->io = io;

     A(FINITE_RNK(ego->sz->rnk));
     return &(ego->super);
}
static int applicable0(const solver *ego_, const problem *p_,
                       const planner *plnr)
{
     const S *ego = (const S *) ego_;
     const problem_rdft *p = (const problem_rdft *) p_;
     return (1
             && FINITE_RNK(p->vecsz->rnk)

             /* problem must be a nontrivial transform, not just a copy */
             && p->sz->rnk > 0

             && (0

                 /* problem must be in-place & require some
                    rearrangement of the data */
                 || (p->I == p->O
                     && !(X(tensor_inplace_strides2)(p->sz, p->vecsz)))

                 /* or problem must be out of place, transforming
                    from stride 1/2 to bigger stride, for apply_after */
                 || (p->I != p->O && ego->adt->apply == apply_after
                     && !NO_DESTROY_INPUTP(plnr)
                     && X(tensor_min_istride)(p->sz) <= 2
                     && X(tensor_min_ostride)(p->sz) > 2)

                 /* or problem must be out of place, transforming
                    to stride 1/2 from bigger stride, for apply_before */
                 || (p->I != p->O && ego->adt->apply == apply_before
                     && X(tensor_min_ostride)(p->sz) <= 2
                     && X(tensor_min_istride)(p->sz) > 2)
                  )
          );
}
static int applicable0(const problem *p_)
{
     const problem_dft *p = (const problem_dft *) p_;
     return ((p->sz->rnk == 1 && p->vecsz->rnk == 0)
             || (p->sz->rnk == 0 && FINITE_RNK(p->vecsz->rnk))
          );
}
static int tensor_rowmajor_transposedp(bench_tensor *t)
{
     bench_iodim *d;
     int i;

     BENCH_ASSERT(FINITE_RNK(t->rnk));
     if (t->rnk < 2)
          return 0;

     d = t->dims;
     if (d[0].is != d[1].is * d[1].n
         || d[0].os != d[1].is
         || d[1].os != d[0].os * d[0].n)
          return 0;
     if (t->rnk > 2 && d[1].is != d[2].is * d[2].n)
          return 0;
     for (i = 2; i + 1 < t->rnk; ++i) {
          d = t->dims + i;
          if (d[0].is != d[1].is * d[1].n
              || d[0].os != d[1].os * d[1].n)
               return 0;
     }

     if (t->rnk > 2 && t->dims[t->rnk-1].is != t->dims[t->rnk-1].os)
          return 0;
     return 1;
}
problem *X(mkproblem_rdft2)(const tensor *sz, const tensor *vecsz,
                            R *r0, R *r1, R *cr, R *ci, rdft_kind kind)
{
     problem_rdft2 *ego;

     A(kind == R2HC || kind == R2HCII || kind == HC2R || kind == HC2RIII);
     A(X(tensor_kosherp)(sz));
     A(X(tensor_kosherp)(vecsz));
     A(FINITE_RNK(sz->rnk));

     /* require in-place problems to use r0 == cr */
     if (UNTAINT(r0) == UNTAINT(ci))
          return X(mkproblem_unsolvable)();

     /* FIXME: should check UNTAINT(r1) == UNTAINT(cr) but only if
        odd elements exist, which requires compressing the tensors
        first */

     if (UNTAINT(r0) == UNTAINT(cr))
          r0 = cr = JOIN_TAINT(r0, cr);

     ego = (problem_rdft2 *)X(mkproblem)(sizeof(problem_rdft2), &padt);

     if (sz->rnk > 1) { /* have to compress rnk-1 dims separately, ugh */
          tensor *szc = X(tensor_copy_except)(sz, sz->rnk - 1);
          tensor *szr = X(tensor_copy_sub)(sz, sz->rnk - 1, 1);
          tensor *szcc = X(tensor_compress)(szc);
          if (szcc->rnk > 0)
               ego->sz = X(tensor_append)(szcc, szr);
          else
               ego->sz = X(tensor_compress)(szr);
          X(tensor_destroy2)(szc, szr);
          X(tensor_destroy)(szcc);
     } else {
          ego->sz = X(tensor_compress)(sz);
     }
     ego->vecsz = X(tensor_compress_contiguous)(vecsz);
     ego->r0 = r0;
     ego->r1 = r1;
     ego->cr = cr;
     ego->ci = ci;
     ego->kind = kind;

     A(FINITE_RNK(ego->sz->rnk));
     return &(ego->super);
}
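/* Illustrative worked example (not part of the original source): for
   sz = { {n=3}, {n=1}, {n=10} }, the last dimension is held aside as
   szr = { {n=10} }, the rest is compressed to szcc = { {n=3} }, and the two
   are re-appended, so ego->sz becomes { {n=3}, {n=10} }.  The n == 1
   dimension is dropped without ever merging the last dimension into its
   neighbours, presumably because that is the dimension whose real and
   complex strides differ in an rdft2 problem. */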
static void transpose_tensor(bench_tensor *t)
{
     if (!FINITE_RNK(t->rnk) || t->rnk < 2)
          return;

     t->dims[0].os = t->dims[1].os;
     t->dims[1].os = t->dims[0].os * t->dims[0].n;
}
/* Return whether sz is distributed for k according to a simple
   1d block distribution in the first or second dimensions */
int XM(is_block1d)(const dtensor *sz, block_kind k)
{
     int i;
     if (!FINITE_RNK(sz->rnk)) return 0;
     for (i = 0; i < sz->rnk && num_blocks_kind(sz->dims + i, k) == 1; ++i)
          ;
     return(i < sz->rnk && i < 2 && XM(is_local_after)(i + 1, sz, k));
}
/* returns whether sz is local for dims >= dim */
int XM(is_local_after)(int dim, const dtensor *sz, block_kind k)
{
     if (FINITE_RNK(sz->rnk))
          for (; dim < sz->rnk; ++dim)
               if (XM(num_blocks)(sz->dims[dim].n, sz->dims[dim].b[k]) > 1)
                    return 0;
     return 1;
}
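/* Illustrative worked example (not part of the original source), assuming
   num_blocks_kind(d, k) counts the blocks of dimension d for kind k as its
   name suggests: for a rank-2 dtensor whose first dimension is split into 4
   blocks and whose second dimension is a single block,
   XM(is_local_after)(1, sz, k) returns 1 and XM(is_block1d)(sz, k) accepts
   the layout as a simple 1d block distribution; if both dimensions were
   split, XM(is_block1d) would return 0. */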
static int applicable(const solver *ego_, const problem *p_)
{
     const problem_dft *p = (const problem_dft *) p_;
     UNUSED(ego_);

     return 0
          /* case 1 : -infty vector rank */
          || (!FINITE_RNK(p->vecsz->rnk))

          /* case 2 : rank-0 in-place dft */
          || (1
              && p->sz->rnk == 0
              && FINITE_RNK(p->vecsz->rnk)
              && p->ro == p->ri
              && X(tensor_inplace_strides)(p->vecsz)
               );
}
/* Like X(tensor_copy), but copy only rnk dimensions starting
   with start_dim. */
tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk)
{
     tensor *x;

     A(FINITE_RNK(sz->rnk) && start_dim + rnk <= sz->rnk);
     x = X(mktensor)(rnk);
     dimcpy(x->dims, sz->dims + start_dim, rnk);
     return x;
}
problem *XM(mkproblem_rdft)(const dtensor *sz, INT vn,
                            R *I, R *O,
                            MPI_Comm comm,
                            const rdft_kind *kind, unsigned flags)
{
     problem_mpi_rdft *ego;
     int i, rnk = sz->rnk;
     int n_pes;

     A(XM(dtensor_validp)(sz) && FINITE_RNK(sz->rnk));

     MPI_Comm_size(comm, &n_pes);
     A(n_pes >= XM(num_blocks_total)(sz, IB)
       && n_pes >= XM(num_blocks_total)(sz, OB));
     A(vn >= 0);

#if defined(STRUCT_HACK_KR)
     ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft)
                                             + sizeof(rdft_kind)
                                             * (rnk > 0 ? rnk - 1 : 0),
                                             &padt);
#elif defined(STRUCT_HACK_C99)
     ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft)
                                             + sizeof(rdft_kind) * rnk,
                                             &padt);
#else
     ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft), &padt);
     ego->kind = (rdft_kind *) MALLOC(sizeof(rdft_kind) * rnk, PROBLEMS);
#endif

     /* enforce pointer equality if untainted pointers are equal */
     if (UNTAINT(I) == UNTAINT(O))
          I = O = JOIN_TAINT(I, O);

     ego->sz = XM(dtensor_canonical)(sz, 0);
     ego->vn = vn;
     ego->I = I;
     ego->O = O;
     for (i = 0; i < ego->sz->rnk; ++i)
          ego->kind[i] = kind[i];

     /* canonicalize: replace TRANSPOSED_IN with TRANSPOSED_OUT by
        swapping the first two dimensions (for rnk > 1) */
     if ((flags & TRANSPOSED_IN) && ego->sz->rnk > 1) {
          rdft_kind k = ego->kind[0];
          ddim dim0 = ego->sz->dims[0];
          ego->sz->dims[0] = ego->sz->dims[1];
          ego->sz->dims[1] = dim0;
          ego->kind[0] = ego->kind[1];
          ego->kind[1] = k;
          flags &= ~TRANSPOSED_IN;
          flags ^= TRANSPOSED_OUT;
     }
     ego->flags = flags;

     MPI_Comm_dup(comm, &ego->comm);

     return &(ego->super);
}
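/* Illustrative note on the flag algebra above (not part of the original
   source): a problem passed in with TRANSPOSED_IN | TRANSPOSED_OUT ends up,
   after the first two dimensions are swapped, with no transposition flags at
   all, while a problem passed in with only TRANSPOSED_IN ends up as
   TRANSPOSED_OUT on the swapped dtensor. */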
static int applicable0(const solver *ego_, const problem *p_,
                       const planner *plnr, int *pdim0, int *pdim1)
{
     const problem_dft *p = (const problem_dft *) p_;
     UNUSED(ego_);
     UNUSED(plnr);
     return (1
             && FINITE_RNK(p->vecsz->rnk) && FINITE_RNK(p->sz->rnk)

             /* FIXME: can/should we relax this constraint? */
             && X(tensor_inplace_strides2)(p->vecsz, p->sz)

             && pickdim(p->vecsz, p->sz, pdim0, pdim1)

             /* output should not *already* include the transpose
                (in which case we duplicate the regular indirect.c) */
             && (p->sz->dims[*pdim1].os != p->vecsz->dims[*pdim0].is)
          );
}
static int applicable(const S *ego, const problem *p_)
{
     const problem_rdft *p = (const problem_rdft *) p_;
     P pln;
     return (1
             && p->sz->rnk == 0
             && FINITE_RNK(p->vecsz->rnk)
             && fill_iodim(&pln, p)
             && ego->applicable(&pln, p)
          );
}
/* return true (1) iff *any* strides of sz decrease when we
   tensor_inplace_copy(sz, k). */
static int tensor_strides_decrease(const tensor *sz, inplace_kind k)
{
     if (FINITE_RNK(sz->rnk)) {
          int i;
          for (i = 0; i < sz->rnk; ++i)
               if ((sz->dims[i].os - sz->dims[i].is)
                   * (k == INPLACE_OS ? (INT)1 : (INT)-1) < 0)
                    return 1;
     }
     return 0;
}
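/* Illustrative worked example (not part of the original source): with
   k == INPLACE_OS the test above reduces to os < is for some dimension, so a
   dimension with is = 4, os = 2 makes the function return 1 while is = 2,
   os = 4 does not; with any other kind the inequality is reversed (is < os
   triggers the return). */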
int X(tensor_inplace_strides)(const tensor *sz)
{
     int i;
     A(FINITE_RNK(sz->rnk));
     for (i = 0; i < sz->rnk; ++i) {
          const iodim *p = sz->dims + i;
          if (p->is != p->os)
               return 0;
     }
     return 1;
}
static void transpose_tensor(bench_tensor *t)
{
     int i;

     if (!FINITE_RNK(t->rnk) || t->rnk < 1)
          return;

     t->dims[0].os = t->dims[t->rnk - 1].os;
     for (i = 1; i < t->rnk; ++i)
          t->dims[i].os = t->dims[i-1].os * t->dims[i-1].n;
}
/* Given a non-idle process which_pe, computes the coordinate
   vector coords[rnk] giving the coordinates of a block in the
   matrix of blocks.  k specifies whether we are talking about
   the input or output data distribution. */
void XM(block_coords)(const dtensor *sz, block_kind k,
                      int which_pe, INT *coords)
{
     int i;
     A(!XM(idle_process)(sz, k, which_pe) && FINITE_RNK(sz->rnk));
     for (i = sz->rnk - 1; i >= 0; --i) {
          INT nb = num_blocks_kind(sz->dims + i, k);
          coords[i] = which_pe % nb;
          which_pe /= nb;
     }
}
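/* Illustrative worked example (not part of the original source): decoding
   process 7 of a rank-2 distribution with 3 blocks along dimension 0 and 4
   blocks along dimension 1, using the same last-dimension-fastest rule as
   XM(block_coords) but with plain integers.  The helper name is
   hypothetical. */
static void sketch_block_coords(void)
{
     int nb[2] = { 3, 4 };
     int which_pe = 7, coords[2], i;
     for (i = 1; i >= 0; --i) {
          coords[i] = which_pe % nb[i]; /* coords[1] = 3, then coords[0] = 1 */
          which_pe /= nb[i];
     }
     (void)coords;
}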
/* Like X(tensor_copy), but copy all of the dimensions *except*
   except_dim. */
tensor *X(tensor_copy_except)(const tensor *sz, int except_dim)
{
     tensor *x;

     A(FINITE_RNK(sz->rnk) && sz->rnk >= 1 && except_dim < sz->rnk);
     x = X(mktensor)(sz->rnk - 1);
     dimcpy(x->dims, sz->dims, except_dim);
     dimcpy(x->dims + except_dim, sz->dims + except_dim + 1,
            x->rnk - except_dim);
     return x;
}
static int rowmajor_kosherp(int rnk, const int *n)
{
     int i;

     if (!FINITE_RNK(rnk))
          return 0;
     if (rnk < 0)
          return 0;

     for (i = 0; i < rnk; ++i)
          if (n[i] <= 0)
               return 0;

     return 1;
}