/* Reorder the npes-entry communication schedule sched so that the
   schedule of process sortpe comes out ascending (ascending != 0) or
   descending (ascending == 0).

   In-place transposes need this when the problem does not divide
   evenly among the processes: one process then has incoming blocks
   that are bigger/smaller than its outgoing blocks, and it must
   receive them in descending/ascending order, respectively, so that
   no data is overwritten before it has been sent. */
static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending)
{
     /* one allocation, two halves: the first receives the schedule
        produced by fill1_comm_sched for sortpe, the second receives
        the permuted copy of sched */
     int *work = (int *) MALLOC(npes * sizeof(int) * 2, OTHER);
     int *order = work;
     int *sorted = work + npes;
     int step;

     fill1_comm_sched(order, sortpe, npes);

     /* entry `step` of sched moves to slot order[step], counted from
        the front (ascending) or from the back (descending) */
     for (step = 0; step < npes; ++step) {
          int slot = ascending ? order[step] : npes - 1 - order[step];
          sorted[slot] = sched[step];
     }

     for (step = 0; step < npes; ++step)
          sched[step] = sorted[step];

     X(ifree)(work);
}
/* Build the plan for the MPI transpose problem p_.

   Returns a null plan when the solver is not applicable, or when
   XM(any_true) reports over p->comm that creating one of the child
   plans failed (presumably on any process -- confirm against
   XM(any_true)).  On success the plan owns a duplicate of p->comm, the
   child plans, one array holding the send/receive block sizes and
   offsets, and (for participating processes) a communication
   schedule. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          XM(transpose_solve), awake, print, destroy
     };
     const S *ego = (const S *) ego_;
     const problem_mpi_transpose *p;
     P *pln;
     plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
     plan *children[4];
     INT my_b, my_bt, vn, rest_Ioff, rest_Ooff;
     INT *send_sz, *send_off, *recv_sz, *recv_off;
     int peer, my_pe, n_pes, k;
     int sort_pe = -1, ascending = 1;
     int failed;
     R *I, *O;

     UNUSED(ego);

     if (!applicable(ego, p_, plnr))
          return (plan *) 0;

     p = (const problem_mpi_transpose *) p_;
     vn = p->vn;
     I = p->I;
     O = p->O;

     MPI_Comm_rank(p->comm, &my_pe);
     MPI_Comm_size(p->comm, &n_pes);

     my_b = XM(block)(p->nx, p->block, my_pe);

     if (!(p->flags & TRANSPOSED_IN)) {
          /* local step: b x ny x vn -> ny x b x vn */
          cld1 = X(mkplan_f_d)(plnr,
                               X(mkproblem_rdft_0_d)(X(mktensor_3d)
                                                     (my_b, p->ny * vn, vn,
                                                      p->ny, vn, my_b * vn,
                                                      vn, 1, 1),
                                                     I, O),
                               0, 0, NO_SLOW);
          if (XM(any_true)(!cld1, p->comm))
               goto nada;
     }

     /* when the input must be preserved, the remaining steps read
        from O instead of I */
     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr))
          I = O;

     failed = !XM(mkplans_posttranspose)(p, plnr, I, O, my_pe,
                                         &cld2, &cld2rest, &cld3,
                                         &rest_Ioff, &rest_Ooff);
     if (XM(any_true)(failed, p->comm))
          goto nada;

     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);

     pln->cld1 = cld1;
     pln->cld2 = cld2;
     pln->cld2rest = cld2rest;
     pln->rest_Ioff = rest_Ioff;
     pln->rest_Ooff = rest_Ooff;
     pln->cld3 = cld3;
     if (ego->preserve_input)
          pln->preserve_input = 2;
     else
          pln->preserve_input = NO_DESTROY_INPUTP(plnr);

     MPI_Comm_dup(p->comm, &pln->comm);

     /* from here on n_pes is no longer the communicator size but the
        larger of the block counts of the two transposed dimensions */
     n_pes = (int) X(imax)(XM(num_blocks)(p->nx, p->block),
                           XM(num_blocks)(p->ny, p->tblock));

     /* Sizes/offsets of the blocks exchanged with each process: a
        single allocation carved into four arrays of n_pes entries. */
     send_sz = (INT *) MALLOC(4 * n_pes * sizeof(INT), PLANS);
     send_off = send_sz + n_pes;
     recv_sz = send_off + n_pes;
     recv_off = recv_sz + n_pes;

     my_b = XM(block)(p->nx, p->block, my_pe);
     my_bt = XM(block)(p->ny, p->tblock, my_pe);

     for (peer = 0; peer < n_pes; ++peer) {
          /* block sizes of the destination process */
          INT peer_b = XM(block)(p->nx, p->block, peer);
          INT peer_bt = XM(block)(p->ny, p->tblock, peer);

          send_sz[peer] = my_b * peer_bt * vn;
          send_off[peer] = peer * (my_b * p->tblock) * vn;
          recv_sz[peer] = peer_b * my_bt * vn;
          recv_off[peer] = peer * (p->block * my_bt) * vn;

          /* a process with nonempty blocks whose sizes do not match
             the nominal block/tblock needs a sorted schedule */
          if (peer_b * peer_bt > 0) {
               INT b_tblock = peer_b * p->tblock;
               INT block_bt = p->block * peer_bt;

               if (b_tblock != block_bt) {
                    A(sort_pe == -1); /* only one process should need sorting */
                    sort_pe = peer;
                    ascending = b_tblock > block_bt;
               }
          }
     }

     pln->n_pes = n_pes;
     pln->my_pe = my_pe;
     pln->send_block_sizes = send_sz;
     pln->send_block_offsets = send_off;
     pln->recv_block_sizes = recv_sz;
     pln->recv_block_offsets = recv_off;

     if (my_pe < n_pes) {
          pln->sched = (int *) MALLOC(n_pes * sizeof(int), PLANS);
          fill1_comm_sched(pln->sched, my_pe, n_pes);
          if (sort_pe >= 0)
               sort1_comm_sched(pln->sched, n_pes, sort_pe, ascending);
     }
     else {
          pln->sched = 0; /* this process is not doing anything */
     }

     /* the plan's operation count is the sum over the child plans
        that exist */
     X(ops_zero)(&pln->super.super.ops);
     children[0] = cld1;
     children[1] = cld2;
     children[2] = cld2rest;
     children[3] = cld3;
     for (k = 0; k < 4; ++k)
          if (children[k])
               X(ops_add2)(&children[k]->ops, &pln->super.super.ops);
     /* FIXME: should MPI operations be counted in "other" somehow? */

     return &(pln->super.super);

 nada:
     X(plan_destroy_internal)(cld3);
     X(plan_destroy_internal)(cld2rest);
     X(plan_destroy_internal)(cld2);
     X(plan_destroy_internal)(cld1);
     return (plan *) 0;
}
/* Check the schedule sched for npes processes, report the result on
   stdout, and then print the schedule itself. */
static void report_comm_schedule(int **sched, int npes)
{
     int steps = check_comm_schedule(sched, npes);

     if (steps)
          printf("schedule OK (takes %d steps to complete).\n", steps);
     else
          printf("schedule not OK.\n");
     print_comm_schedule(sched, npes);
}

/* Test driver for the communication-schedule routines.

   Usage: prog [npes [sortpe]]

   With npes: build, check and print the schedule for npes processes.
   If sortpe is also given, additionally re-create the schedule of
   process sortpe with fill1_comm_sched, then sort the full schedule
   for sortpe and invert it, checking and printing after each step.

   Without arguments: run the fill/sort/invert checks for npes = 1, 2,
   ... and every sortpe, until interrupted or until a check fails.

   Exit status: 0 on success, 1 for bad arguments, 2/3/4 for a
   fill/sort/invert check failure, 5 or 6 when out of memory. */
int main(int argc, char **argv)
{
     int **sched = NULL;
     int *sched1 = NULL;
     int npes = -1, sortpe = -1, i;
     int rc = 0;

     if (argc >= 2) {
          npes = atoi(argv[1]);
          if (npes <= 0) {
               fprintf(stderr, "npes must be positive!\n");
               return 1;
          }
     }
     if (argc >= 3) {
          sortpe = atoi(argv[2]);
          if (sortpe < 0 || sortpe >= npes) {
               fprintf(stderr, "sortpe must be between 0 and npes-1.\n");
               return 1;
          }
     }

     if (npes != -1) {
          printf("Computing schedule for npes = %d:\n", npes);
          sched = make_comm_schedule(npes);
          if (!sched) {
               fprintf(stderr, "Out of memory!\n");
               rc = 6;
               goto cleanup;
          }
          report_comm_schedule(sched, npes);

          if (sortpe != -1) {
               /* field width wide enough for the largest pe index */
               const int width = npes < 10 ? 1 : (npes < 100 ? 2 : 3);

               printf("\nRe-creating schedule for pe = %d...\n", sortpe);
               sched1 = malloc((size_t) npes * sizeof *sched1);
               if (!sched1) {
                    fprintf(stderr, "Out of memory!\n");
                    rc = 6;
                    goto cleanup;
               }
               for (i = 0; i < npes; ++i)
                    sched1[i] = -1;
               fill1_comm_sched(sched1, sortpe, npes);
               printf(" =");
               for (i = 0; i < npes; ++i)
                    printf(" %*d", width, sched1[i]);
               printf("\n");

               printf("\nSorting schedule for sortpe = %d...\n", sortpe);
               sort_comm_schedule(sched, npes, sortpe);
               report_comm_schedule(sched, npes);

               printf("\nInverting schedule...\n");
               invert_comm_schedule(sched, npes);
               report_comm_schedule(sched, npes);
          }
          /* sched (and sched1, if any) are released at cleanup, also
             when no sortpe was given */
     }
     else {
          printf("Doing infinite tests...\n");
          for (npes = 1; ; ++npes) {
               sched1 = malloc((size_t) npes * sizeof *sched1);
               printf("npes = %d...", npes);
               sched = make_comm_schedule(npes);
               if (!sched1 || !sched) {
                    fprintf(stderr, "Out of memory!\n");
                    rc = 5;
                    goto cleanup;
               }

               for (sortpe = 0; sortpe < npes; ++sortpe) {
                    empty_comm_schedule(sched, npes);
                    fill_comm_schedule(sched, npes);
                    if (!check_comm_schedule(sched, npes)) {
                         fprintf(stderr,
                                 "\n -- fill error for sortpe = %d!\n",
                                 sortpe);
                         rc = 2;
                         goto cleanup;
                    }

                    /* the single-process schedule must agree with row
                       sortpe of the full schedule; a mismatch is
                       reported for each differing entry but does not
                       stop the run */
                    for (i = 0; i < npes; ++i)
                         sched1[i] = -1;
                    fill1_comm_sched(sched1, sortpe, npes);
                    for (i = 0; i < npes; ++i)
                         if (sched1[i] != sched[sortpe][i])
                              fprintf(stderr,
                                      "\n -- fill1 error for pe = %d!\n",
                                      sortpe);

                    sort_comm_schedule(sched, npes, sortpe);
                    if (!check_comm_schedule(sched, npes)) {
                         fprintf(stderr,
                                 "\n -- sort error for sortpe = %d!\n",
                                 sortpe);
                         rc = 3;
                         goto cleanup;
                    }

                    invert_comm_schedule(sched, npes);
                    if (!check_comm_schedule(sched, npes)) {
                         fprintf(stderr,
                                 "\n -- invert error for sortpe = %d!\n",
                                 sortpe);
                         rc = 4;
                         goto cleanup;
                    }
               }

               free_comm_schedule(sched, npes);
               sched = NULL;
               printf("OK\n");
               if (npes % 50 == 0)
                    printf("(...Hit Ctrl-C to stop...)\n");
               free(sched1);
               sched1 = NULL;
          }
     }

 cleanup:
     /* npes still holds the size the live schedule was created with */
     if (sched)
          free_comm_schedule(sched, npes);
     free(sched1);
     return rc;
}