static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) { const S *ego = (const S *) ego_; const problem_mpi_transpose *p; P *pln; plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0; INT b, bt, vn, rest_Ioff, rest_Ooff; INT *sbs, *sbo, *rbs, *rbo; int pe, my_pe, n_pes, sort_pe = -1, ascending = 1; R *I, *O; static const plan_adt padt = { XM(transpose_solve), awake, print, destroy }; UNUSED(ego); if (!applicable(ego, p_, plnr)) return (plan *) 0; p = (const problem_mpi_transpose *) p_; vn = p->vn; I = p->I; O = p->O; MPI_Comm_rank(p->comm, &my_pe); MPI_Comm_size(p->comm, &n_pes); b = XM(block)(p->nx, p->block, my_pe); if (!(p->flags & TRANSPOSED_IN)) { /* b x ny x vn -> ny x b x vn */ cld1 = X(mkplan_f_d)(plnr, X(mkproblem_rdft_0_d)(X(mktensor_3d) (b, p->ny * vn, vn, p->ny, vn, b * vn, vn, 1, 1), I, O), 0, 0, NO_SLOW); if (XM(any_true)(!cld1, p->comm)) goto nada; } if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = O; if (XM(any_true)(!XM(mkplans_posttranspose)(p, plnr, I, O, my_pe, &cld2, &cld2rest, &cld3, &rest_Ioff, &rest_Ooff), p->comm)) goto nada; pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply); pln->cld1 = cld1; pln->cld2 = cld2; pln->cld2rest = cld2rest; pln->rest_Ioff = rest_Ioff; pln->rest_Ooff = rest_Ooff; pln->cld3 = cld3; pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr); MPI_Comm_dup(p->comm, &pln->comm); n_pes = (int) X(imax)(XM(num_blocks)(p->nx, p->block), XM(num_blocks)(p->ny, p->tblock)); /* Compute sizes/offsets of blocks to exchange between processors */ sbs = (INT *) MALLOC(4 * n_pes * sizeof(INT), PLANS); sbo = sbs + n_pes; rbs = sbo + n_pes; rbo = rbs + n_pes; b = XM(block)(p->nx, p->block, my_pe); bt = XM(block)(p->ny, p->tblock, my_pe); for (pe = 0; pe < n_pes; ++pe) { INT db, dbt; /* destination block sizes */ db = XM(block)(p->nx, p->block, pe); dbt = XM(block)(p->ny, p->tblock, pe); sbs[pe] = b * dbt * vn; sbo[pe] = pe * (b * p->tblock) * vn; rbs[pe] = db * bt * vn; rbo[pe] = pe * (p->block * bt) * vn; if (db * dbt > 0 && db * p->tblock != p->block * dbt) { A(sort_pe == -1); /* only one process should need sorting */ sort_pe = pe; ascending = db * p->tblock > p->block * dbt; } } pln->n_pes = n_pes; pln->my_pe = my_pe; pln->send_block_sizes = sbs; pln->send_block_offsets = sbo; pln->recv_block_sizes = rbs; pln->recv_block_offsets = rbo; if (my_pe >= n_pes) { pln->sched = 0; /* this process is not doing anything */ } else { pln->sched = (int *) MALLOC(n_pes * sizeof(int), PLANS); fill1_comm_sched(pln->sched, my_pe, n_pes); if (sort_pe >= 0) sort1_comm_sched(pln->sched, n_pes, sort_pe, ascending); } X(ops_zero)(&pln->super.super.ops); if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops); if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops); if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops); if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops); /* FIXME: should MPI operations be counted in "other" somehow? */ return &(pln->super.super); nada: X(plan_destroy_internal)(cld3); X(plan_destroy_internal)(cld2rest); X(plan_destroy_internal)(cld2); X(plan_destroy_internal)(cld1); return (plan *) 0; }
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) { const S *ego = (const S *) ego_; const problem_mpi_transpose *p; P *pln; plan *cld1 = 0, *cld2 = 0, *cld2rest = 0; INT b, bt, nxb, vn, Ioff = 0, Ooff = 0; R *I; int *sbs, *sbo, *rbs, *rbo; int pe, my_pe, n_pes; int equal_blocks = 1; static const plan_adt padt = { XM(transpose_solve), awake, print, destroy }; if (!applicable(ego, p_, plnr)) return (plan *) 0; p = (const problem_mpi_transpose *) p_; vn = p->vn; MPI_Comm_rank(p->comm, &my_pe); MPI_Comm_size(p->comm, &n_pes); b = XM(block)(p->nx, p->block, my_pe); if (p->flags & TRANSPOSED_IN) { /* I is already transposed */ if (ego->copy_transposed_in) { cld1 = X(mkplan_f_d)(plnr, X(mkproblem_rdft_0_d)(X(mktensor_1d) (b * p->ny * vn, 1, 1), I = p->I, p->O), 0, 0, NO_SLOW); if (XM(any_true)(!cld1, p->comm)) goto nada; } else I = p->O; /* final transpose is in-place */ } else { /* transpose b x ny x vn -> ny x b x vn */ cld1 = X(mkplan_f_d)(plnr, X(mkproblem_rdft_0_d)(X(mktensor_3d) (b, p->ny * vn, vn, p->ny, vn, b * vn, vn, 1, 1), I = p->I, p->O), 0, 0, NO_SLOW); if (XM(any_true)(!cld1, p->comm)) goto nada; } bt = XM(block)(p->ny, p->tblock, my_pe); nxb = (p->nx + p->block - 1) / p->block; if (p->nx != nxb * p->block) nxb -= 1; /* number of equal-sized blocks */ if (!(p->flags & TRANSPOSED_OUT)) { INT nx = p->nx * vn; b = p->block * vn; cld2 = X(mkplan_f_d)(plnr, X(mkproblem_rdft_0_d)(X(mktensor_3d) (nxb, bt * b, b, bt, b, nx, b, 1, 1), I, p->O), 0, 0, NO_SLOW); if (XM(any_true)(!cld2, p->comm)) goto nada; if (p->nx != nxb * p->block) { /* leftover blocks to transpose */ Ioff = bt * b * nxb; Ooff = b * nxb; b = nx - nxb * b; cld2rest = X(mkplan_f_d)(plnr, X(mkproblem_rdft_0_d)(X(mktensor_2d) (bt, b, nx, b, 1, 1), I + Ioff, p->O + Ooff), 0, 0, NO_SLOW); if (XM(any_true)(!cld2rest, p->comm)) goto nada; } } else { /* TRANSPOSED_OUT */ b = p->block; cld2 = X(mkplan_f_d)(plnr, X(mkproblem_rdft_0_d)(X(mktensor_4d) (nxb, bt * b*vn, bt * b*vn, bt, b*vn, vn, b, vn, bt*vn, vn, 1, 1), I, p->O), 0, 0, NO_SLOW); if (XM(any_true)(!cld2, p->comm)) goto nada; if (p->nx != nxb * p->block) { /* leftover blocks to transpose */ Ioff = Ooff = bt * b * nxb * vn; b = p->nx - nxb * b; cld2rest = X(mkplan_f_d)(plnr, X(mkproblem_rdft_0_d)(X(mktensor_3d) (bt, b*vn, vn, b, vn, bt*vn, vn, 1, 1), I + Ioff, p->O + Ooff), 0, 0, NO_SLOW); if (XM(any_true)(!cld2rest, p->comm)) goto nada; } } pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply); pln->cld1 = cld1; pln->cld2 = cld2; pln->cld2rest = cld2rest; pln->rest_Ioff = Ioff; pln->rest_Ooff = Ooff; MPI_Comm_dup(p->comm, &pln->comm); /* Compute sizes/offsets of blocks to send for all-to-all command. TODO: In the special case where all block sizes are equal, we could use the MPI_Alltoall command. It's not clear whether/why this would be any faster, though. */ sbs = (int *) MALLOC(4 * n_pes * sizeof(int), PLANS); sbo = sbs + n_pes; rbs = sbo + n_pes; rbo = rbs + n_pes; b = XM(block)(p->nx, p->block, my_pe); bt = XM(block)(p->ny, p->tblock, my_pe); for (pe = 0; pe < n_pes; ++pe) { INT db, dbt; /* destination block sizes */ db = XM(block)(p->nx, p->block, pe); dbt = XM(block)(p->ny, p->tblock, pe); /* MPI requires type "int" here; apparently it has no 64-bit API? Grrr. */ sbs[pe] = (int) (b * dbt * vn); sbo[pe] = (int) (pe * (b * p->tblock) * vn); rbs[pe] = (int) (db * bt * vn); rbo[pe] = (int) (pe * (p->block * bt) * vn); if (sbs[pe] != (b * p->tblock) * vn || rbs[pe] != (p->block * bt) * vn) equal_blocks = 0; } pln->send_block_sizes = sbs; pln->send_block_offsets = sbo; pln->recv_block_sizes = rbs; pln->recv_block_offsets = rbo; pln->equal_blocks = equal_blocks; X(ops_zero)(&pln->super.super.ops); if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops); if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops); if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops); /* FIXME: should MPI operations be counted in "other" somehow? */ return &(pln->super.super); nada: X(plan_destroy_internal)(cld2rest); X(plan_destroy_internal)(cld2); X(plan_destroy_internal)(cld1); return (plan *) 0; }
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) { const S *ego = (const S *) ego_; const problem_mpi_transpose *p; P *pln; plan *cld1 = 0, *cldtr = 0, *cldtm = 0; R *I, *O; int me, np, r, m; INT b; MPI_Comm comm2; static const plan_adt padt = { XM(transpose_solve), awake, print, destroy }; UNUSED(ego); if (!applicable(ego, p_, plnr, &r)) return (plan *) 0; p = (const problem_mpi_transpose *) p_; MPI_Comm_size(p->comm, &np); MPI_Comm_rank(p->comm, &me); m = np / r; A(r * m == np); I = p->I; O = p->O; b = XM(block)(p->nx, p->block, me); A(p->tblock * np == p->ny); /* this is currently required for cld1 */ if (p->flags & TRANSPOSED_IN) { /* m x r x (bt x b x vn) -> r x m x (bt x b x vn) */ INT vn = p->vn * b * p->tblock; cld1 = X(mkplan_f_d)(plnr, X(mkproblem_rdft_0_d)(X(mktensor_3d) (m, r*vn, vn, r, vn, m*vn, vn, 1, 1), I, O), 0, 0, NO_SLOW); } else if (I != O) { /* combine cld1 with TRANSPOSED_IN permutation */ /* b x m x r x bt x vn -> r x m x bt x b x vn */ INT vn = p->vn; INT bt = p->tblock; cld1 = X(mkplan_f_d)(plnr, X(mkproblem_rdft_0_d)(X(mktensor_5d) (b, m*r*bt*vn, vn, m, r*bt*vn, bt*b*vn, r, bt*vn, m*bt*b*vn, bt, vn, b*vn, vn, 1, 1), I, O), 0, 0, NO_SLOW); } else { /* TRANSPOSED_IN permutation must be separate for in-place */ /* b x (m x r) x bt x vn -> b x (r x m) x bt x vn */ INT vn = p->vn * p->tblock; cld1 = X(mkplan_f_d)(plnr, X(mkproblem_rdft_0_d)(X(mktensor_4d) (m, r*vn, vn, r, vn, m*vn, vn, 1, 1, b, np*vn, np*vn), I, O), 0, 0, NO_SLOW); } if (XM(any_true)(!cld1, p->comm)) goto nada; if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = O; b = XM(block)(p->nx, r * p->block, me / r); MPI_Comm_split(p->comm, me / r, me, &comm2); if (b) cldtr = X(mkplan_d)(plnr, XM(mkproblem_transpose) (b, p->ny, p->vn, O, I, p->block, m * p->tblock, comm2, p->I != p->O ? TRANSPOSED_IN : (p->flags & TRANSPOSED_IN))); MPI_Comm_free(&comm2); if (XM(any_true)(b && !cldtr, p->comm)) goto nada; b = XM(block)(p->ny, m * p->tblock, me % r); MPI_Comm_split(p->comm, me % r, me, &comm2); if (b) cldtm = X(mkplan_d)(plnr, XM(mkproblem_transpose) (p->nx, b, p->vn, I, O, r * p->block, p->tblock, comm2, TRANSPOSED_IN | (p->flags & TRANSPOSED_OUT))); MPI_Comm_free(&comm2); if (XM(any_true)(b && !cldtm, p->comm)) goto nada; pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply); pln->cld1 = cld1; pln->cldtr = cldtr; pln->cldtm = cldtm; pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr); pln->r = r; pln->nam = ego->nam; pln->super.super.ops = cld1->ops; if (cldtr) X(ops_add2)(&cldtr->ops, &pln->super.super.ops); if (cldtm) X(ops_add2)(&cldtm->ops, &pln->super.super.ops); return &(pln->super.super); nada: X(plan_destroy_internal)(cldtm); X(plan_destroy_internal)(cldtr); X(plan_destroy_internal)(cld1); return (plan *) 0; }
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) { const S *ego = (const S *) ego_; const problem_mpi_transpose *p; P *pln; plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0; INT b, bt, vn, rest_Ioff, rest_Ooff; R *I; int *sbs, *sbo, *rbs, *rbo; int pe, my_pe, n_pes; int equal_blocks = 1; static const plan_adt padt = { XM(transpose_solve), awake, print, destroy }; if (!applicable(ego, p_, plnr)) return (plan *) 0; p = (const problem_mpi_transpose *) p_; vn = p->vn; MPI_Comm_rank(p->comm, &my_pe); MPI_Comm_size(p->comm, &n_pes); b = XM(block)(p->nx, p->block, my_pe); if (p->flags & TRANSPOSED_IN) { /* I is already transposed */ if (ego->copy_transposed_in) { cld1 = X(mkplan_f_d)(plnr, X(mkproblem_rdft_0_d)(X(mktensor_1d) (b * p->ny * vn, 1, 1), I = p->I, p->O), 0, 0, NO_SLOW); if (XM(any_true)(!cld1, p->comm)) goto nada; } else I = p->O; /* final transpose is in-place */ } else { /* transpose b x ny x vn -> ny x b x vn */ cld1 = X(mkplan_f_d)(plnr, X(mkproblem_rdft_0_d)(X(mktensor_3d) (b, p->ny * vn, vn, p->ny, vn, b * vn, vn, 1, 1), I = p->I, p->O), 0, 0, NO_SLOW); if (XM(any_true)(!cld1, p->comm)) goto nada; } if (XM(any_true)(!XM(mkplans_posttranspose)(p, plnr, I, p->O, my_pe, &cld2, &cld2rest, &cld3, &rest_Ioff, &rest_Ooff), p->comm)) goto nada; pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply); pln->cld1 = cld1; pln->cld2 = cld2; pln->cld2rest = cld2rest; pln->rest_Ioff = rest_Ioff; pln->rest_Ooff = rest_Ooff; pln->cld3 = cld3; MPI_Comm_dup(p->comm, &pln->comm); /* Compute sizes/offsets of blocks to send for all-to-all command. */ sbs = (int *) MALLOC(4 * n_pes * sizeof(int), PLANS); sbo = sbs + n_pes; rbs = sbo + n_pes; rbo = rbs + n_pes; b = XM(block)(p->nx, p->block, my_pe); bt = XM(block)(p->ny, p->tblock, my_pe); for (pe = 0; pe < n_pes; ++pe) { INT db, dbt; /* destination block sizes */ db = XM(block)(p->nx, p->block, pe); dbt = XM(block)(p->ny, p->tblock, pe); if (db != p->block || dbt != p->tblock) equal_blocks = 0; /* MPI requires type "int" here; apparently it has no 64-bit API? Grrr. */ sbs[pe] = (int) (b * dbt * vn); sbo[pe] = (int) (pe * (b * p->tblock) * vn); rbs[pe] = (int) (db * bt * vn); rbo[pe] = (int) (pe * (p->block * bt) * vn); } pln->send_block_sizes = sbs; pln->send_block_offsets = sbo; pln->recv_block_sizes = rbs; pln->recv_block_offsets = rbo; pln->equal_blocks = equal_blocks; X(ops_zero)(&pln->super.super.ops); if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops); if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops); if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops); if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops); /* FIXME: should MPI operations be counted in "other" somehow? */ return &(pln->super.super); nada: X(plan_destroy_internal)(cld3); X(plan_destroy_internal)(cld2rest); X(plan_destroy_internal)(cld2); X(plan_destroy_internal)(cld1); return (plan *) 0; }