コード例 #1
0
ファイル: transpose-pairwise.c プロジェクト: 376473984/fftw3
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     const S *ego = (const S *) ego_;
     const problem_mpi_transpose *p;
     P *pln;
     plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
     INT b, bt, vn, rest_Ioff, rest_Ooff;
     INT *sbs, *sbo, *rbs, *rbo;
     int pe, my_pe, n_pes, sort_pe = -1, ascending = 1;
     R *I, *O;
     static const plan_adt padt = {
          XM(transpose_solve), awake, print, destroy
     };

     UNUSED(ego);

     if (!applicable(ego, p_, plnr))
          return (plan *) 0;

     p = (const problem_mpi_transpose *) p_;
     vn = p->vn;
     I = p->I; O = p->O;

     MPI_Comm_rank(p->comm, &my_pe);
     MPI_Comm_size(p->comm, &n_pes);

     b = XM(block)(p->nx, p->block, my_pe);
     
     if (!(p->flags & TRANSPOSED_IN)) { /* b x ny x vn -> ny x b x vn */
	  cld1 = X(mkplan_f_d)(plnr, 
			       X(mkproblem_rdft_0_d)(X(mktensor_3d)
						     (b, p->ny * vn, vn,
						      p->ny, vn, b * vn,
						      vn, 1, 1),
						     I, O),
			       0, 0, NO_SLOW);
	  if (XM(any_true)(!cld1, p->comm)) goto nada;
     }
     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = O;

     if (XM(any_true)(!XM(mkplans_posttranspose)(p, plnr, I, O, my_pe,
						 &cld2, &cld2rest, &cld3,
						 &rest_Ioff, &rest_Ooff),
		      p->comm)) goto nada;

     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);

     pln->cld1 = cld1;
     pln->cld2 = cld2;
     pln->cld2rest = cld2rest;
     pln->rest_Ioff = rest_Ioff;
     pln->rest_Ooff = rest_Ooff;
     pln->cld3 = cld3;
     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);

     MPI_Comm_dup(p->comm, &pln->comm);

     n_pes = (int) X(imax)(XM(num_blocks)(p->nx, p->block),
			   XM(num_blocks)(p->ny, p->tblock));

     /* Compute sizes/offsets of blocks to exchange between processors */
     sbs = (INT *) MALLOC(4 * n_pes * sizeof(INT), PLANS);
     sbo = sbs + n_pes;
     rbs = sbo + n_pes;
     rbo = rbs + n_pes;
     b = XM(block)(p->nx, p->block, my_pe);
     bt = XM(block)(p->ny, p->tblock, my_pe);
     for (pe = 0; pe < n_pes; ++pe) {
	  INT db, dbt; /* destination block sizes */
	  db = XM(block)(p->nx, p->block, pe);
	  dbt = XM(block)(p->ny, p->tblock, pe);

	  sbs[pe] = b * dbt * vn;
	  sbo[pe] = pe * (b * p->tblock) * vn;
	  rbs[pe] = db * bt * vn;
	  rbo[pe] = pe * (p->block * bt) * vn;

	  if (db * dbt > 0 && db * p->tblock != p->block * dbt) {
	       A(sort_pe == -1); /* only one process should need sorting */
	       sort_pe = pe;
	       ascending = db * p->tblock > p->block * dbt;
	  }
     }
     pln->n_pes = n_pes;
     pln->my_pe = my_pe;
     pln->send_block_sizes = sbs;
     pln->send_block_offsets = sbo;
     pln->recv_block_sizes = rbs;
     pln->recv_block_offsets = rbo;

     if (my_pe >= n_pes) {
	  pln->sched = 0; /* this process is not doing anything */
     }
     else {
	  pln->sched = (int *) MALLOC(n_pes * sizeof(int), PLANS);
	  fill1_comm_sched(pln->sched, my_pe, n_pes);
	  if (sort_pe >= 0)
	       sort1_comm_sched(pln->sched, n_pes, sort_pe, ascending);
     }

     X(ops_zero)(&pln->super.super.ops);
     if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops);
     if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
     if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
     if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops);
     /* FIXME: should MPI operations be counted in "other" somehow? */

     return &(pln->super.super);

 nada:
     X(plan_destroy_internal)(cld3);
     X(plan_destroy_internal)(cld2rest);
     X(plan_destroy_internal)(cld2);
     X(plan_destroy_internal)(cld1);
     return (plan *) 0;
}
コード例 #2
0
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     const S *ego = (const S *) ego_;
     const problem_mpi_transpose *p;
     P *pln;
     plan *cld1 = 0, *cld2 = 0, *cld2rest = 0;
     INT b, bt, nxb, vn, Ioff = 0, Ooff = 0;
     R *I;
     int *sbs, *sbo, *rbs, *rbo;
     int pe, my_pe, n_pes;
     int equal_blocks = 1;
     static const plan_adt padt = {
          XM(transpose_solve), awake, print, destroy
     };

     if (!applicable(ego, p_, plnr))
          return (plan *) 0;

     p = (const problem_mpi_transpose *) p_;
     vn = p->vn;

     MPI_Comm_rank(p->comm, &my_pe);
     MPI_Comm_size(p->comm, &n_pes);

     b = XM(block)(p->nx, p->block, my_pe);

     if (p->flags & TRANSPOSED_IN) { /* I is already transposed */
	  if (ego->copy_transposed_in) {
	       cld1 = X(mkplan_f_d)(plnr,
				  X(mkproblem_rdft_0_d)(X(mktensor_1d)
							(b * p->ny * vn, 1, 1),
							I = p->I, p->O),
				    0, 0, NO_SLOW);
	       if (XM(any_true)(!cld1, p->comm)) goto nada;
	  }
	  else
	       I = p->O; /* final transpose is in-place */
     }
     else { /* transpose b x ny x vn -> ny x b x vn */
	  cld1 = X(mkplan_f_d)(plnr, 
			       X(mkproblem_rdft_0_d)(X(mktensor_3d)
						     (b, p->ny * vn, vn,
						      p->ny, vn, b * vn,
						      vn, 1, 1),
						     I = p->I, p->O),
			       0, 0, NO_SLOW);
	  if (XM(any_true)(!cld1, p->comm)) goto nada;
     }
	  
     bt = XM(block)(p->ny, p->tblock, my_pe);
     nxb = (p->nx + p->block - 1) / p->block;
     if (p->nx != nxb * p->block)
	  nxb -= 1; /* number of equal-sized blocks */
     if (!(p->flags & TRANSPOSED_OUT)) {
	  INT nx = p->nx * vn;
	  b = p->block * vn;
	  cld2 = X(mkplan_f_d)(plnr, 
			       X(mkproblem_rdft_0_d)(X(mktensor_3d)
						     (nxb, bt * b, b,
						      bt, b, nx,
						      b, 1, 1),
						     I, p->O),
			       0, 0, NO_SLOW);
	  if (XM(any_true)(!cld2, p->comm)) goto nada;
	  
	  if (p->nx != nxb * p->block) { /* leftover blocks to transpose */
	       Ioff = bt * b * nxb;
	       Ooff = b * nxb;
	       b = nx - nxb * b;
	       cld2rest = X(mkplan_f_d)(plnr,
					X(mkproblem_rdft_0_d)(X(mktensor_2d)
							      (bt, b, nx,
							       b, 1, 1),
							      I + Ioff,
							      p->O + Ooff),
					0, 0, NO_SLOW);
	       if (XM(any_true)(!cld2rest, p->comm)) goto nada;
	  }
     }
     else { /* TRANSPOSED_OUT */
	  b = p->block;
	  cld2 = X(mkplan_f_d)(plnr, 
			     X(mkproblem_rdft_0_d)(X(mktensor_4d)
						   (nxb, bt * b*vn, bt * b*vn,
						    bt, b*vn, vn,
						    b, vn, bt*vn,
						    vn, 1, 1),
						   I, p->O),
			       0, 0, NO_SLOW);
	  if (XM(any_true)(!cld2, p->comm)) goto nada;
	  
	  if (p->nx != nxb * p->block) { /* leftover blocks to transpose */
	       Ioff = Ooff = bt * b * nxb * vn;
	       b = p->nx - nxb * b;
	       cld2rest = X(mkplan_f_d)(plnr,
					X(mkproblem_rdft_0_d)(X(mktensor_3d)
							      (bt, b*vn, vn,
							       b, vn, bt*vn,
							       vn, 1, 1),
							      I + Ioff,
							      p->O + Ooff),
					0, 0, NO_SLOW);
	       if (XM(any_true)(!cld2rest, p->comm)) goto nada;
	  }
     }

     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);

     pln->cld1 = cld1;
     pln->cld2 = cld2;
     pln->cld2rest = cld2rest;
     pln->rest_Ioff = Ioff;
     pln->rest_Ooff = Ooff;

     MPI_Comm_dup(p->comm, &pln->comm);

     /* Compute sizes/offsets of blocks to send for all-to-all
        command.  TODO: In the special case where all block sizes are
        equal, we could use the MPI_Alltoall command.  It's not clear
        whether/why this would be any faster, though. */
     sbs = (int *) MALLOC(4 * n_pes * sizeof(int), PLANS);
     sbo = sbs + n_pes;
     rbs = sbo + n_pes;
     rbo = rbs + n_pes;
     b = XM(block)(p->nx, p->block, my_pe);
     bt = XM(block)(p->ny, p->tblock, my_pe);
     for (pe = 0; pe < n_pes; ++pe) {
	  INT db, dbt; /* destination block sizes */
	  db = XM(block)(p->nx, p->block, pe);
	  dbt = XM(block)(p->ny, p->tblock, pe);

	  /* MPI requires type "int" here; apparently it
	     has no 64-bit API?  Grrr. */
	  sbs[pe] = (int) (b * dbt * vn);
	  sbo[pe] = (int) (pe * (b * p->tblock) * vn);
	  rbs[pe] = (int) (db * bt * vn);
	  rbo[pe] = (int) (pe * (p->block * bt) * vn);
	  if (sbs[pe] != (b * p->tblock) * vn
	      || rbs[pe] != (p->block * bt) * vn)
	       equal_blocks = 0;
     }
     pln->send_block_sizes = sbs;
     pln->send_block_offsets = sbo;
     pln->recv_block_sizes = rbs;
     pln->recv_block_offsets = rbo;
     pln->equal_blocks = equal_blocks;

     X(ops_zero)(&pln->super.super.ops);
     if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops);
     if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
     if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
     /* FIXME: should MPI operations be counted in "other" somehow? */

     return &(pln->super.super);

 nada:
     X(plan_destroy_internal)(cld2rest);
     X(plan_destroy_internal)(cld2);
     X(plan_destroy_internal)(cld1);
     return (plan *) 0;
}
コード例 #3
0
ファイル: transpose-recurse.c プロジェクト: Aegisub/fftw3
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     const S *ego = (const S *) ego_;
     const problem_mpi_transpose *p;
     P *pln;
     plan *cld1 = 0, *cldtr = 0, *cldtm = 0;
     R *I, *O;
     int me, np, r, m;
     INT b;
     MPI_Comm comm2;
     static const plan_adt padt = {
          XM(transpose_solve), awake, print, destroy
     };

     UNUSED(ego);

     if (!applicable(ego, p_, plnr, &r))
          return (plan *) 0;

     p = (const problem_mpi_transpose *) p_;

     MPI_Comm_size(p->comm, &np);
     MPI_Comm_rank(p->comm, &me);
     m = np / r;
     A(r * m == np);

     I = p->I; O = p->O;

     b = XM(block)(p->nx, p->block, me);
     A(p->tblock * np == p->ny); /* this is currently required for cld1 */
     if (p->flags & TRANSPOSED_IN) { 
          /* m x r x (bt x b x vn) -> r x m x (bt x b x vn) */
	  INT vn = p->vn * b * p->tblock;
	  cld1 = X(mkplan_f_d)(plnr,
                               X(mkproblem_rdft_0_d)(X(mktensor_3d)
						     (m, r*vn, vn,
						      r, vn, m*vn,
						      vn, 1, 1),
                                                     I, O),
                               0, 0, NO_SLOW);
     }
     else if (I != O) { /* combine cld1 with TRANSPOSED_IN permutation */
          /* b x m x r x bt x vn -> r x m x bt x b x vn */
	  INT vn = p->vn;
	  INT bt = p->tblock;
	  cld1 = X(mkplan_f_d)(plnr,
                               X(mkproblem_rdft_0_d)(X(mktensor_5d)
						     (b, m*r*bt*vn, vn,
						      m, r*bt*vn, bt*b*vn,
						      r, bt*vn, m*bt*b*vn,
						      bt, vn, b*vn,
						      vn, 1, 1),
                                                     I, O),
                               0, 0, NO_SLOW);
     }
     else { /* TRANSPOSED_IN permutation must be separate for in-place */
	  /* b x (m x r) x bt x vn -> b x (r x m) x bt x vn */
	  INT vn = p->vn * p->tblock;
	  cld1 = X(mkplan_f_d)(plnr,
                               X(mkproblem_rdft_0_d)(X(mktensor_4d)
						     (m, r*vn, vn,
						      r, vn, m*vn,
						      vn, 1, 1,
						      b, np*vn, np*vn),
                                                     I, O),
                               0, 0, NO_SLOW);
     }
     if (XM(any_true)(!cld1, p->comm)) goto nada;

     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = O;

     b = XM(block)(p->nx, r * p->block, me / r);
     MPI_Comm_split(p->comm, me / r, me, &comm2);
     if (b)
	  cldtr = X(mkplan_d)(plnr, XM(mkproblem_transpose)
			      (b, p->ny, p->vn,
			       O, I, p->block, m * p->tblock, comm2, 
			       p->I != p->O
			       ? TRANSPOSED_IN : (p->flags & TRANSPOSED_IN)));
     MPI_Comm_free(&comm2);
     if (XM(any_true)(b && !cldtr, p->comm)) goto nada;
     
     b = XM(block)(p->ny, m * p->tblock, me % r);
     MPI_Comm_split(p->comm, me % r, me, &comm2);
     if (b)
	  cldtm = X(mkplan_d)(plnr, XM(mkproblem_transpose)
			      (p->nx, b, p->vn,
			       I, O, r * p->block, p->tblock, comm2, 
			       TRANSPOSED_IN | (p->flags & TRANSPOSED_OUT)));
     MPI_Comm_free(&comm2);
     if (XM(any_true)(b && !cldtm, p->comm)) goto nada;

     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);

     pln->cld1 = cld1;
     pln->cldtr = cldtr;
     pln->cldtm = cldtm;
     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
     pln->r = r;
     pln->nam = ego->nam;

     pln->super.super.ops = cld1->ops;
     if (cldtr) X(ops_add2)(&cldtr->ops, &pln->super.super.ops);
     if (cldtm) X(ops_add2)(&cldtm->ops, &pln->super.super.ops);

     return &(pln->super.super);

 nada:
     X(plan_destroy_internal)(cldtm);
     X(plan_destroy_internal)(cldtr);
     X(plan_destroy_internal)(cld1);
     return (plan *) 0;
}
コード例 #4
0
ファイル: transpose-alltoall.c プロジェクト: 376473984/fftw3
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     const S *ego = (const S *) ego_;
     const problem_mpi_transpose *p;
     P *pln;
     plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
     INT b, bt, vn, rest_Ioff, rest_Ooff;
     R *I;
     int *sbs, *sbo, *rbs, *rbo;
     int pe, my_pe, n_pes;
     int equal_blocks = 1;
     static const plan_adt padt = {
          XM(transpose_solve), awake, print, destroy
     };

     if (!applicable(ego, p_, plnr))
          return (plan *) 0;

     p = (const problem_mpi_transpose *) p_;
     vn = p->vn;

     MPI_Comm_rank(p->comm, &my_pe);
     MPI_Comm_size(p->comm, &n_pes);

     b = XM(block)(p->nx, p->block, my_pe);

     if (p->flags & TRANSPOSED_IN) { /* I is already transposed */
	  if (ego->copy_transposed_in) {
	       cld1 = X(mkplan_f_d)(plnr,
				  X(mkproblem_rdft_0_d)(X(mktensor_1d)
							(b * p->ny * vn, 1, 1),
							I = p->I, p->O),
				    0, 0, NO_SLOW);
	       if (XM(any_true)(!cld1, p->comm)) goto nada;
	  }
	  else
	       I = p->O; /* final transpose is in-place */
     }
     else { /* transpose b x ny x vn -> ny x b x vn */
	  cld1 = X(mkplan_f_d)(plnr, 
			       X(mkproblem_rdft_0_d)(X(mktensor_3d)
						     (b, p->ny * vn, vn,
						      p->ny, vn, b * vn,
						      vn, 1, 1),
						     I = p->I, p->O),
			       0, 0, NO_SLOW);
	  if (XM(any_true)(!cld1, p->comm)) goto nada;
     }
	  
     if (XM(any_true)(!XM(mkplans_posttranspose)(p, plnr, I, p->O, my_pe,
						 &cld2, &cld2rest, &cld3,
						 &rest_Ioff, &rest_Ooff),
		      p->comm)) goto nada;

     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);

     pln->cld1 = cld1;
     pln->cld2 = cld2;
     pln->cld2rest = cld2rest;
     pln->rest_Ioff = rest_Ioff;
     pln->rest_Ooff = rest_Ooff;
     pln->cld3 = cld3;

     MPI_Comm_dup(p->comm, &pln->comm);

     /* Compute sizes/offsets of blocks to send for all-to-all command. */
     sbs = (int *) MALLOC(4 * n_pes * sizeof(int), PLANS);
     sbo = sbs + n_pes;
     rbs = sbo + n_pes;
     rbo = rbs + n_pes;
     b = XM(block)(p->nx, p->block, my_pe);
     bt = XM(block)(p->ny, p->tblock, my_pe);
     for (pe = 0; pe < n_pes; ++pe) {
	  INT db, dbt; /* destination block sizes */
	  db = XM(block)(p->nx, p->block, pe);
	  dbt = XM(block)(p->ny, p->tblock, pe);
	  if (db != p->block || dbt != p->tblock)
	       equal_blocks = 0;

	  /* MPI requires type "int" here; apparently it
	     has no 64-bit API?  Grrr. */
	  sbs[pe] = (int) (b * dbt * vn);
	  sbo[pe] = (int) (pe * (b * p->tblock) * vn);
	  rbs[pe] = (int) (db * bt * vn);
	  rbo[pe] = (int) (pe * (p->block * bt) * vn);
     }
     pln->send_block_sizes = sbs;
     pln->send_block_offsets = sbo;
     pln->recv_block_sizes = rbs;
     pln->recv_block_offsets = rbo;
     pln->equal_blocks = equal_blocks;

     X(ops_zero)(&pln->super.super.ops);
     if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops);
     if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
     if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
     if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops);
     /* FIXME: should MPI operations be counted in "other" somehow? */

     return &(pln->super.super);

 nada:
     X(plan_destroy_internal)(cld3);
     X(plan_destroy_internal)(cld2rest);
     X(plan_destroy_internal)(cld2);
     X(plan_destroy_internal)(cld1);
     return (plan *) 0;
}