/* Return the radix r for a 1d MPI transform of a distributed dimension d, with the given flags and transform size. That is, decomposes d.n as r * m, Cooley-Tukey style. Also computes the block sizes rblock and mblock. Returns 0 if such a decomposition is not feasible. This is unfortunately somewhat complicated. A distributed Cooley-Tukey algorithm works as follows (see dft-rank1.c): d.n is initially distributed as an m x r array with block size mblock[IB]. Then it is internally transposed to an r x m array with block size rblock[IB]. Then it is internally transposed to m x r again with block size mblock[OB]. Finally, it is transposed to r x m with block size rblock[IB]. If flags & SCRAMBLED_IN, then the first transpose is skipped (the array starts out as r x m). If flags & SCRAMBLED_OUT, then the last transpose is skipped (the array ends up as m x r). To make sure the forward and backward transforms use the same "scrambling" format, we swap r and m when sign != FFT_SIGN. There are some downsides to this, especially in the case where either m or r is not divisible by n_pes. For one thing, it means that in general we can't use the same block size for the input and output. For another thing, it means that we can't in general honor a user's "requested" block sizes in d.b[]. Therefore, for simplicity, we simply ignore d.b[] for now. */ INT XM(choose_radix)(ddim d, int n_pes, unsigned flags, int sign, INT rblock[2], INT mblock[2]) { INT r, m; UNUSED(flags); /* we would need this if we paid attention to d.b[*] */ /* If n_pes is a factor of d.n, then choose r to be d.n / n_pes. This not only ensures that the input (the m dimension) is equally distributed if possible, and at the r dimension is maximally equally distributed (if d.n/n_pes >= n_pes), it also makes one of the local transpositions in the algorithm trivial. */ if (d.n % n_pes == 0 /* it's good if n_pes divides d.n ...*/ && d.n / n_pes >= n_pes /* .. unless we can't use n_pes processes */) r = d.n / n_pes; else { /* n_pes does not divide d.n, pick a factor close to sqrt(d.n) */ for (r = X(isqrt)(d.n); d.n % r != 0; ++r) ; } if (r == 1 || r == d.n) return 0; /* punt if we can't reduce size */ if (sign != FFT_SIGN) { /* swap {m,r} so that scrambling is reversible */ m = r; r = d.n / m; } else m = d.n / r; rblock[IB] = rblock[OB] = XM(default_block)(r, n_pes); mblock[IB] = mblock[OB] = XM(default_block)(m, n_pes); return r; }
int XM(dft_serial_applicable)(const problem_mpi_dft *p) { return (1 && p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */ && ((XM(is_local)(p->sz, IB) && XM(is_local)(p->sz, OB)) || p->vn == 0)); }
static plan *mkplan(const solver *ego, const problem *p_, planner *plnr) { const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_; P *pln; plan *cld; int my_pe; R *r0, *r1, *cr, *ci; static const plan_adt padt = { XM(rdft2_solve), awake, print, destroy }; UNUSED(ego); /* check whether applicable: */ if (!XM(rdft2_serial_applicable)(p)) return (plan *) 0; if (p->kind == R2HC) { r1 = (r0 = p->I) + p->vn; ci = (cr = p->O) + 1; } else { r1 = (r0 = p->O) + p->vn; ci = (cr = p->I) + 1; } MPI_Comm_rank(p->comm, &my_pe); if (my_pe == 0 && p->vn > 0) { INT ivs = 1 + (p->kind == HC2R), ovs = 1 + (p->kind == R2HC); int i, rnk = p->sz->rnk; tensor *sz = X(mktensor)(p->sz->rnk); sz->dims[rnk - 1].is = sz->dims[rnk - 1].os = 2 * p->vn; sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n / 2 + 1; for (i = rnk - 1; i > 0; --i) { sz->dims[i - 1].is = sz->dims[i - 1].os = sz->dims[i].is * sz->dims[i].n; sz->dims[i - 1].n = p->sz->dims[i - 1].n; } sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n; cld = X(mkplan_d)(plnr, X(mkproblem_rdft2_d)(sz, X(mktensor_1d)(p->vn,ivs,ovs), r0, r1, cr, ci, p->kind)); } else { /* idle process: make nop plan */ cld = X(mkplan_d)(plnr, X(mkproblem_rdft2_d)(X(mktensor_0d)(), X(mktensor_1d)(0,0,0), cr, ci, cr, ci, HC2R)); } if (XM(any_true)(!cld, p->comm)) return (plan *) 0; pln = MKPLAN_MPI_RDFT2(P, &padt, p->kind == R2HC ? apply_r2c : apply_c2r); pln->cld = cld; pln->vn = p->vn; X(ops_cpy)(&cld->ops, &pln->super.super.ops); return &(pln->super.super); }
problem *XM(mkproblem_rdft_d)(dtensor *sz, INT vn, R *I, R *O, MPI_Comm comm, const rdft_kind *kind, unsigned flags) { problem *p = XM(mkproblem_rdft)(sz, vn, I, O, comm, kind, flags); XM(dtensor_destroy)(sz); return p; }
problem *XM(mkproblem_rdft)(const dtensor *sz, INT vn, R *I, R *O, MPI_Comm comm, const rdft_kind *kind, unsigned flags) { problem_mpi_rdft *ego; int i, rnk = sz->rnk; int n_pes; A(XM(dtensor_validp)(sz) && FINITE_RNK(sz->rnk)); MPI_Comm_size(comm, &n_pes); A(n_pes >= XM(num_blocks_total)(sz, IB) && n_pes >= XM(num_blocks_total)(sz, OB)); A(vn >= 0); #if defined(STRUCT_HACK_KR) ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft) + sizeof(rdft_kind) * (rnk > 0 ? rnk - 1 : 0), &padt); #elif defined(STRUCT_HACK_C99) ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft) + sizeof(rdft_kind) * rnk, &padt); #else ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft), &padt); ego->kind = (rdft_kind *) MALLOC(sizeof(rdft_kind) * rnk, PROBLEMS); #endif /* enforce pointer equality if untainted pointers are equal */ if (UNTAINT(I) == UNTAINT(O)) I = O = JOIN_TAINT(I, O); ego->sz = XM(dtensor_canonical)(sz, 0); ego->vn = vn; ego->I = I; ego->O = O; for (i = 0; i< ego->sz->rnk; ++i) ego->kind[i] = kind[i]; /* canonicalize: replace TRANSPOSED_IN with TRANSPOSED_OUT by swapping the first two dimensions (for rnk > 1) */ if ((flags & TRANSPOSED_IN) && ego->sz->rnk > 1) { rdft_kind k = ego->kind[0]; ddim dim0 = ego->sz->dims[0]; ego->sz->dims[0] = ego->sz->dims[1]; ego->sz->dims[1] = dim0; ego->kind[0] = ego->kind[1]; ego->kind[1] = k; flags &= ~TRANSPOSED_IN; flags ^= TRANSPOSED_OUT; } ego->flags = flags; MPI_Comm_dup(comm, &ego->comm); return &(ego->super); }
problem *XM(mkproblem_dft_d)(dtensor *sz, INT vn, R *I, R *O, MPI_Comm comm, int sign, unsigned flags) { problem *p = XM(mkproblem_dft)(sz, vn, I, O, comm, sign, flags); XM(dtensor_destroy)(sz); return p; }
static plan *mkplan(const solver *ego, const problem *p_, planner *plnr) { const problem_mpi_dft *p = (const problem_mpi_dft *) p_; P *pln; plan *cld; int my_pe; R *ri, *ii, *ro, *io; static const plan_adt padt = { XM(dft_solve), awake, print, destroy }; UNUSED(ego); /* check whether applicable: */ if (!XM(dft_serial_applicable)(p)) return (plan *) 0; X(extract_reim)(p->sign, p->I, &ri, &ii); X(extract_reim)(p->sign, p->O, &ro, &io); MPI_Comm_rank(p->comm, &my_pe); if (my_pe == 0 && p->vn > 0) { int i, rnk = p->sz->rnk; tensor *sz = X(mktensor)(p->sz->rnk); sz->dims[rnk - 1].is = sz->dims[rnk - 1].os = 2 * p->vn; sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n; for (i = rnk - 1; i > 0; --i) { sz->dims[i - 1].is = sz->dims[i - 1].os = sz->dims[i].is * sz->dims[i].n; sz->dims[i - 1].n = p->sz->dims[i - 1].n; } cld = X(mkplan_d)(plnr, X(mkproblem_dft_d)(sz, X(mktensor_1d)(p->vn, 2, 2), ri, ii, ro, io)); } else { /* idle process: make nop plan */ cld = X(mkplan_d)(plnr, X(mkproblem_dft_d)(X(mktensor_0d)(), X(mktensor_1d)(0,0,0), ri, ii, ro, io)); } if (XM(any_true)(!cld, p->comm)) return (plan *) 0; pln = MKPLAN_MPI_DFT(P, &padt, apply); pln->cld = cld; pln->roff = ro - p->O; pln->ioff = io - p->O; X(ops_cpy)(&cld->ops, &pln->super.super.ops); return &(pln->super.super); }
/* check whether the recursive transposes fit within the space that must have been allocated on each process for this transpose; this must be modified if the subdivision in mkplan is changed! */ static int enough_space(INT nx, INT ny, INT block, INT tblock, int r, int n_pes) { int pe; int m = n_pes / r; for (pe = 0; pe < n_pes; ++pe) { INT space = transpose_space(nx, ny, block, tblock, pe); INT b1 = XM(block)(nx, r * block, pe / r); INT b2 = XM(block)(ny, m * tblock, pe % r); if (transpose_space(b1, ny, block, m*tblock, pe % r) > space || transpose_space(nx, b2, r*block, tblock, pe / r) > space) return 0; } return 1; }
static int applicable(const S *ego, const problem *p_, const planner *plnr) { const problem_mpi_dft *p = (const problem_mpi_dft *) p_; return (1 && p->sz->rnk > 1 && p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */ && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr) && p->I != p->O)) && XM(is_local_after)(1, p->sz, IB) && XM(is_local_after)(1, p->sz, OB) && (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */ || !XM(dft_serial_applicable)(p)) ); }
INT XM(total_block)(const dtensor *sz, block_kind k, int which_pe) { if (XM(idle_process)(sz, k, which_pe)) return 0; else { int i; INT N = 1, *coords; STACK_MALLOC(INT*, coords, sizeof(INT) * sz->rnk); XM(block_coords)(sz, k, which_pe, coords); for (i = 0; i < sz->rnk; ++i) N *= XM(block)(sz->dims[i].n, sz->dims[i].b[k], coords[i]); STACK_FREE(coords); return N; } }
static void destroy(problem *ego_) { problem_mpi_dft *ego = (problem_mpi_dft *) ego_; XM(dtensor_destroy)(ego->sz); MPI_Comm_free(&ego->comm); X(ifree)(ego_); }
/* wrappers for fftw init and cleanup */ void PX(init) (void){ #ifdef _OPENMP X(init_threads)(); PX(plan_with_nthreads)(omp_get_max_threads()); #endif XM(init)(); }
static int applicable(const S *ego, const problem *p_, const planner *plnr) { const problem_mpi_dft *p = (const problem_mpi_dft *) p_; return (1 && p->sz->rnk > 1 && p->flags == TRANSPOSED_OUT && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr) && p->I != p->O)) && XM(is_local_after)(1, p->sz, IB) && XM(is_local_after)(2, p->sz, OB) && XM(num_blocks)(p->sz->dims[0].n, p->sz->dims[0].b[OB]) == 1 && (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */ || !XM(dft_serial_applicable)(p)) ); }
/* needed to synchronize planner bogosity flag, in case non-MPI problems on a subset of processes encountered bogus wisdom */ static wisdom_state_t bogosity_hook(wisdom_state_t state, const problem *p) { MPI_Comm comm = problem_comm(p); if (comm != MPI_COMM_NULL /* an MPI problem */ && XM(any_true)(state == WISDOM_IS_BOGUS, comm)) /* bogus somewhere */ return WISDOM_IS_BOGUS; return state; }
/* Return whether sz is distributed for k according to a simple 1d block distribution in the first or second dimensions */ int XM(is_block1d)(const dtensor *sz, block_kind k) { int i; if (!FINITE_RNK(sz->rnk)) return 0; for (i = 0; i < sz->rnk && num_blocks_kind(sz->dims + i, k) == 1; ++i) ; return(i < sz->rnk && i < 2 && XM(is_local_after)(i + 1, sz, k)); }
/* returns whether sz is local for dims >= dim */ int XM(is_local_after)(int dim, const dtensor *sz, block_kind k) { if (FINITE_RNK(sz->rnk)) for (; dim < sz->rnk; ++dim) if (XM(num_blocks)(sz->dims[dim].n, sz->dims[dim].b[k]) > 1) return 0; return 1; }
/* For a given block size and dimension n, compute the block size b and the starting offset s on the given process. */ INT XM(block)(INT n, INT block, int which_block) { INT n_blocks = XM(num_blocks)(n, block); if (which_block >= n_blocks) return 0; else return ((which_block == n_blocks - 1) ? (n - which_block * block) : block); }
static void hash(const problem *p_, md5 *m) { const problem_mpi_dft *p = (const problem_mpi_dft *) p_; int i; X(md5puts)(m, "mpi-dft"); X(md5int)(m, p->I == p->O); /* don't include alignment -- may differ between processes X(md5int)(m, X(alignment_of)(p->I)); X(md5int)(m, X(alignment_of)(p->O)); ... note that applicability of MPI plans does not depend on alignment (although optimality may, in principle). */ XM(dtensor_md5)(m, p->sz); X(md5INT)(m, p->vn); X(md5int)(m, p->sign); X(md5int)(m, p->flags); MPI_Comm_size(p->comm, &i); X(md5int)(m, i); A(XM(md5_equal)(*m, p->comm)); }
static void destroy(problem *ego_) { problem_mpi_rdft *ego = (problem_mpi_rdft *) ego_; XM(dtensor_destroy)(ego->sz); MPI_Comm_free(&ego->comm); #if !defined(STRUCT_HACK_C99) && !defined(STRUCT_HACK_KR) X(ifree0)(ego->kind); #endif X(ifree)(ego_); }
problem *XM(mkproblem_dft)(const dtensor *sz, INT vn, R *I, R *O, MPI_Comm comm, int sign, unsigned flags) { problem_mpi_dft *ego = (problem_mpi_dft *)X(mkproblem)(sizeof(problem_mpi_dft), &padt); int n_pes; A(XM(dtensor_validp)(sz) && FINITE_RNK(sz->rnk)); MPI_Comm_size(comm, &n_pes); A(n_pes >= XM(num_blocks_total)(sz, IB) && n_pes >= XM(num_blocks_total)(sz, OB)); A(vn >= 0); A(sign == -1 || sign == 1); /* enforce pointer equality if untainted pointers are equal */ if (UNTAINT(I) == UNTAINT(O)) I = O = JOIN_TAINT(I, O); ego->sz = XM(dtensor_canonical)(sz, 1); ego->vn = vn; ego->I = I; ego->O = O; ego->sign = sign; /* canonicalize: replace TRANSPOSED_IN with TRANSPOSED_OUT by swapping the first two dimensions (for rnk > 1) */ if ((flags & TRANSPOSED_IN) && ego->sz->rnk > 1) { ddim dim0 = ego->sz->dims[0]; ego->sz->dims[0] = ego->sz->dims[1]; ego->sz->dims[1] = dim0; flags &= ~TRANSPOSED_IN; flags ^= TRANSPOSED_OUT; } ego->flags = flags; MPI_Comm_dup(comm, &ego->comm); return &(ego->super); }
gtransp_plan PX(plan_global_transp)( INT N0, INT N1, INT h0, INT h1, INT hm, INT blk0, INT blk1, MPI_Comm comm, R *in, R *out, unsigned transp_flag, unsigned fftw_flags ) { gtransp_plan ths = NULL; INT N[2], blk[2]; N[0] = N1 * h1; blk[0] = blk1 * h1; N[1] = N0 * h0; blk[1] = blk0 * h0; /* For strange distributions (e.g. 4 data points on 3 processes) * all processes in a row get no data. */ if(N[0]*N[1]*h0*h1*hm==0) return NULL; ths = gtransp_mkplan(); /* PFFTs and FFTWs transpose flags are complementary */ if( ~transp_flag & PFFT_TRANSPOSED_OUT) fftw_flags |= FFTW_MPI_TRANSPOSED_OUT; if( ~transp_flag & PFFT_TRANSPOSED_IN) fftw_flags |= FFTW_MPI_TRANSPOSED_IN; #if PFFT_BUGFIX_FORGET_PARALLEL_FFTW_WISDOM X(forget_wisdom)(); #endif #if PFFT_DEBUG_GTRANSP ths->dbg = gtransp_mkdbg(N, hm, blk, in, out, comm, fftw_flags); #endif ths->plan.plan = XM(plan_many_transpose)( N[0], N[1], hm, blk[0], blk[1], in, out, comm, fftw_flags); ths->plan.plannedin = in; ths->plan.plannedout = out; ths->plan.execute = (PX(fftw_execute))(XM(execute_r2r)); return ths; }
static void zero(const problem *ego_) { const problem_mpi_rdft *ego = (const problem_mpi_rdft *) ego_; R *I = ego->I; INT i, N; int my_pe; MPI_Comm_rank(ego->comm, &my_pe); N = ego->vn * XM(total_block)(ego->sz, IB, my_pe); for (i = 0; i < N; ++i) I[i] = K(0.0); }
/* Given a non-idle process which_pe, computes the coordinate vector coords[rnk] giving the coordinates of a block in the matrix of blocks. k specifies whether we are talking about the input or output data distribution. */ void XM(block_coords)(const dtensor *sz, block_kind k, int which_pe, INT *coords) { int i; A(!XM(idle_process)(sz, k, which_pe) && FINITE_RNK(sz->rnk)); for (i = sz->rnk - 1; i >= 0; --i) { INT nb = num_blocks_kind(sz->dims + i, k); coords[i] = which_pe % nb; which_pe /= nb; } }
void XM(init)(void) { if (!mpi_inited) { planner *plnr = X(the_planner)(); plnr->cost_hook = cost_hook; plnr->wisdom_ok_hook = wisdom_ok_hook; plnr->nowisdom_hook = nowisdom_hook; plnr->bogosity_hook = bogosity_hook; XM(conf_standard)(plnr); mpi_inited = 1; } }
static void zero(const problem *ego_) { const problem_mpi_transpose *ego = (const problem_mpi_transpose *) ego_; R *I = ego->I; INT i, N = ego->vn * ego->ny; int my_pe; MPI_Comm_rank(ego->comm, &my_pe); N *= XM(block)(ego->nx, ego->block, my_pe); for (i = 0; i < N; ++i) I[i] = K(0.0); }
static int applicable(const S *ego, const problem *p_, const planner *plnr) { const problem_mpi_dft *p = (const problem_mpi_dft *) p_; int n_pes; MPI_Comm_size(p->comm, &n_pes); return (1 && p->sz->rnk == 1 && !(p->flags & ~RANK1_BIGVEC_ONLY) && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr) && p->I != p->O)) && (p->vn >= n_pes /* TODO: relax this, using more memory? */ || (p->flags & RANK1_BIGVEC_ONLY)) && XM(rearrange_applicable)(ego->rearrange, p->sz->dims[0], p->vn, n_pes) && (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */ || !XM(dft_serial_applicable)(p)) ); }
static void print(const problem *ego_, printer *p) { const problem_mpi_dft *ego = (const problem_mpi_dft *) ego_; int i; p->print(p, "(mpi-dft %d %d %d ", ego->I == ego->O, X(alignment_of)(ego->I), X(alignment_of)(ego->O)); XM(dtensor_print)(ego->sz, p); p->print(p, " %D %d %d", ego->vn, ego->sign, ego->flags); MPI_Comm_size(ego->comm, &i); p->print(p, " %d)", i); }
static void print(const problem *ego_, printer *p) { const problem_mpi_rdft *ego = (const problem_mpi_rdft *) ego_; int i; p->print(p, "(mpi-rdft %d %d %d ", ego->I == ego->O, X(alignment_of)(ego->I), X(alignment_of)(ego->O)); XM(dtensor_print)(ego->sz, p); for (i = 0; i < ego->sz->rnk; ++i) p->print(p, " %d", (int)ego->kind[i]); p->print(p, " %D %d", ego->vn, ego->flags); MPI_Comm_size(ego->comm, &i); p->print(p, " %d)", i); }
problem *XM(mkproblem_transpose)(INT nx, INT ny, INT vn, R *I, R *O, INT block, INT tblock, MPI_Comm comm, unsigned flags) { problem_mpi_transpose *ego = (problem_mpi_transpose *)X(mkproblem)(sizeof(problem_mpi_transpose), &padt); A(nx > 0 && ny > 0 && vn > 0); A(block > 0 && XM(num_blocks_ok)(nx, block, comm) && tblock > 0 && XM(num_blocks_ok)(ny, tblock, comm)); /* enforce pointer equality if untainted pointers are equal */ if (UNTAINT(I) == UNTAINT(O)) I = O = JOIN_TAINT(I, O); ego->nx = nx; ego->ny = ny; ego->vn = vn; ego->I = I; ego->O = O; ego->block = block > nx ? nx : block; ego->tblock = tblock > ny ? ny : tblock; /* canonicalize flags: we can freely assume that the data is "transposed" if one of the dimensions is 1. */ if (ego->block == 1) flags |= TRANSPOSED_IN; if (ego->tblock == 1) flags |= TRANSPOSED_OUT; ego->flags = flags; MPI_Comm_dup(comm, &ego->comm); return &(ego->super); }
static void hash(const problem *p_, md5 *m) { const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_; int i; X(md5puts)(m, "mpi-dft"); X(md5int)(m, p->I == p->O); X(md5int)(m, X(alignment_of)(p->I)); X(md5int)(m, X(alignment_of)(p->O)); XM(dtensor_md5)(m, p->sz); X(md5INT)(m, p->vn); for (i = 0; i < p->sz->rnk; ++i) X(md5int)(m, p->kind[i]); X(md5int)(m, p->flags); MPI_Comm_size(p->comm, &i); X(md5int)(m, i); }