/* Return the radix r for a 1d MPI transform of a distributed dimension d,
   with the given flags and transform size.   That is, decomposes d.n
   as r * m, Cooley-Tukey style.  Also computes the block sizes rblock
   and mblock.  Returns 0 if such a decomposition is not feasible.
   This is unfortunately somewhat complicated.

   A distributed Cooley-Tukey algorithm works as follows (see dft-rank1.c):

   d.n is initially distributed as an m x r array with block size mblock[IB].
   Then it is internally transposed to an r x m array with block size
   rblock[IB].  Then it is internally transposed to m x r again with block
   size mblock[OB].  Finally, it is transposed to r x m with block
   size rblock[OB].

   If flags & SCRAMBLED_IN, then the first transpose is skipped (the array
   starts out as r x m).  If flags & SCRAMBLED_OUT, then the last transpose
   is skipped (the array ends up as m x r).  To make sure the forward
   and backward transforms use the same "scrambling" format, we swap r
   and m when sign != FFT_SIGN.

   There are some downsides to this, especially in the case where
   either m or r is not divisible by n_pes.  For one thing, it means
   that in general we can't use the same block size for the input and
   output.  For another thing, it means that we can't in general honor
   a user's "requested" block sizes in d.b[].  Therefore, for simplicity,
   we simply ignore d.b[] for now.
*/
INT XM(choose_radix)(ddim d, int n_pes, unsigned flags, int sign,
		     INT rblock[2], INT mblock[2])
{
     INT r, m;

     UNUSED(flags); /* we would need this if we paid attention to d.b[*] */

     /* Prefer r = d.n / n_pes when n_pes divides d.n: the input (m)
        dimension is then equally distributed, the r dimension is as
        equally distributed as possible (when d.n/n_pes >= n_pes), and
        one of the local transpositions in the algorithm becomes
        trivial. */
     if (d.n % n_pes == 0 && d.n / n_pes >= n_pes)
	  r = d.n / n_pes;
     else {
	  /* otherwise, use the smallest factor of d.n >= sqrt(d.n) */
	  r = X(isqrt)(d.n);
	  while (d.n % r != 0)
	       ++r;
     }

     if (r == 1 || r == d.n)
	  return 0; /* no nontrivial factorization: punt */

     /* swap {m,r} for the backward sign so that the "scrambled"
        format of the forward and backward transforms matches */
     if (sign == FFT_SIGN)
	  m = d.n / r;
     else {
	  m = r;
	  r = d.n / m;
     }

     rblock[IB] = rblock[OB] = XM(default_block)(r, n_pes);
     mblock[IB] = mblock[OB] = XM(default_block)(m, n_pes);

     return r;
}
Example #2
0
/* Whether the single-process (serial) dft solver can handle p. */
int XM(dft_serial_applicable)(const problem_mpi_dft *p)
{
     /* TRANSPOSED/SCRAMBLED_IN/OUT are not supported here */
     if (p->flags != 0)
	  return 0;
     /* trivially applicable when there is nothing to transform */
     if (p->vn == 0)
	  return 1;
     /* otherwise all data must already reside on a single process */
     return XM(is_local)(p->sz, IB) && XM(is_local)(p->sz, OB);
}
/* Plan an MPI rdft2 (real <-> halfcomplex-as-complex) problem by
   delegating the entire transform to one serial rdft2 plan on process
   0; every other process gets a zero-size no-op plan.  Only
   applicable (see XM(rdft2_serial_applicable)) when the data is not
   actually distributed. */
static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
{
     const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
     P *pln;
     plan *cld;
     int my_pe;
     R *r0, *r1, *cr, *ci; /* real (r0/r1) and complex (cr/ci) halves */
     static const plan_adt padt = {
          XM(rdft2_solve), awake, print, destroy
     };

     UNUSED(ego);

     /* check whether applicable: */
     if (!XM(rdft2_serial_applicable)(p))
          return (plan *) 0;

     /* the real data lives in I for R2HC and in O for HC2R; the
        complex data is interleaved (real at cr, imaginary at cr+1) */
     if (p->kind == R2HC) {
	  r1 = (r0 = p->I) + p->vn;
	  ci = (cr = p->O) + 1;
     }
     else {
	  r1 = (r0 = p->O) + p->vn;
	  ci = (cr = p->I) + 1;
     }

     MPI_Comm_rank(p->comm, &my_pe);
     if (my_pe == 0 && p->vn > 0) {
	  /* vector stride is 2 on the complex side, 1 on the real side */
	  INT ivs = 1 + (p->kind == HC2R), ovs = 1 + (p->kind == R2HC);
	  int i, rnk = p->sz->rnk;
	  tensor *sz = X(mktensor)(p->sz->rnk);
	  /* compute row-major strides from the *physical* size of the
	     last dimension (n/2+1 complex elements) ... */
	  sz->dims[rnk - 1].is = sz->dims[rnk - 1].os = 2 * p->vn;
	  sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n / 2 + 1;
	  for (i = rnk - 1; i > 0; --i) {
	       sz->dims[i - 1].is = sz->dims[i - 1].os = 
		    sz->dims[i].is * sz->dims[i].n;
	       sz->dims[i - 1].n = p->sz->dims[i - 1].n;
	  }
	  /* ... and then restore the last dimension to its logical
	     (real) size, which is what the rdft2 problem expects */
	  sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n;

	  cld = X(mkplan_d)(plnr,
			    X(mkproblem_rdft2_d)(sz,
						 X(mktensor_1d)(p->vn,ivs,ovs),
						 r0, r1, cr, ci, p->kind));
     }
     else { /* idle process: make nop plan */
	  cld = X(mkplan_d)(plnr,
			    X(mkproblem_rdft2_d)(X(mktensor_0d)(),
						 X(mktensor_1d)(0,0,0),
						 cr, ci, cr, ci, HC2R));
     }
     /* all processes must agree on success/failure (collective) */
     if (XM(any_true)(!cld, p->comm)) return (plan *) 0;

     pln = MKPLAN_MPI_RDFT2(P, &padt, p->kind == R2HC ? apply_r2c : apply_c2r);
     pln->cld = cld;
     pln->vn = p->vn;
     X(ops_cpy)(&cld->ops, &pln->super.super.ops);
     return &(pln->super.super);
}
/* As XM(mkproblem_rdft), but takes ownership of (and destroys) sz. */
problem *XM(mkproblem_rdft_d)(dtensor *sz, INT vn,
			      R *I, R *O, 
			      MPI_Comm comm,
			      const rdft_kind *kind, unsigned flags)
{
     problem *prb = XM(mkproblem_rdft)(sz, vn, I, O, comm, kind, flags);
     XM(dtensor_destroy)(sz); /* problem keeps its own canonical copy */
     return prb;
}
/* Create an MPI rdft problem for the distributed size sz, vector
   length vn, arrays I/O, communicator, per-dimension transform kinds
   (rnk entries, copied), and flags. */
problem *XM(mkproblem_rdft)(const dtensor *sz, INT vn,
			    R *I, R *O,
			    MPI_Comm comm,
			    const rdft_kind *kind, unsigned flags)
{
     problem_mpi_rdft *ego;
     int i, rnk = sz->rnk;
     int n_pes;

     A(XM(dtensor_validp)(sz) && FINITE_RNK(sz->rnk));
     MPI_Comm_size(comm, &n_pes);
     /* the communicator must have enough processes for the block
        decomposition of both the input and the output distribution */
     A(n_pes >= XM(num_blocks_total)(sz, IB)
       && n_pes >= XM(num_blocks_total)(sz, OB));
     A(vn >= 0);

     /* the kind[] array is appended to the problem struct via the
        "struct hack" when supported; otherwise it is allocated
        separately (and freed separately in destroy) */
#if defined(STRUCT_HACK_KR)
     ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft)
					     + sizeof(rdft_kind)
					     * (rnk > 0 ? rnk - 1 : 0), &padt);
#elif defined(STRUCT_HACK_C99)
     ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft)
					     + sizeof(rdft_kind) * rnk, &padt);
#else
     ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft), &padt);
     ego->kind = (rdft_kind *) MALLOC(sizeof(rdft_kind) * rnk, PROBLEMS);
#endif

     /* enforce pointer equality if untainted pointers are equal */
     if (UNTAINT(I) == UNTAINT(O))
	  I = O = JOIN_TAINT(I, O);

     ego->sz = XM(dtensor_canonical)(sz, 0);
     ego->vn = vn;
     ego->I = I;
     ego->O = O;
     for (i = 0; i< ego->sz->rnk; ++i)
	  ego->kind[i] = kind[i];

     /* canonicalize: replace TRANSPOSED_IN with TRANSPOSED_OUT by
        swapping the first two dimensions (for rnk > 1); the
        per-dimension kinds must be swapped along with the dims */
     if ((flags & TRANSPOSED_IN) && ego->sz->rnk > 1) {
	  rdft_kind k = ego->kind[0];
	  ddim dim0 = ego->sz->dims[0];
	  ego->sz->dims[0] = ego->sz->dims[1];
	  ego->sz->dims[1] = dim0;
	  ego->kind[0] = ego->kind[1];
	  ego->kind[1] = k;
	  flags &= ~TRANSPOSED_IN;
	  flags ^= TRANSPOSED_OUT; /* XOR: IN+OUT together cancel out */
     }
     ego->flags = flags;

     /* the problem owns a duplicate of comm (freed in destroy) */
     MPI_Comm_dup(comm, &ego->comm);

     return &(ego->super);
}
Example #6
0
/* As XM(mkproblem_dft), but takes ownership of (and destroys) sz. */
problem *XM(mkproblem_dft_d)(dtensor *sz, INT vn,
                             R *I, R *O,
                             MPI_Comm comm,
                             int sign,
                             unsigned flags)
{
    problem *prb = XM(mkproblem_dft)(sz, vn, I, O, comm, sign, flags);
    XM(dtensor_destroy)(sz); /* problem keeps its own canonical copy */
    return prb;
}
Example #7
0
/* Plan an MPI dft by delegating the whole (interleaved-complex)
   transform to a serial dft plan on process 0; all other processes
   get a zero-size no-op plan.  Only applicable (see
   XM(dft_serial_applicable)) when the data is not actually
   distributed. */
static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
{
     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
     P *pln;
     plan *cld;
     int my_pe;
     R *ri, *ii, *ro, *io; /* real/imaginary parts of input and output */
     static const plan_adt padt = {
          XM(dft_solve), awake, print, destroy
     };

     UNUSED(ego);

     /* check whether applicable: */
     if (!XM(dft_serial_applicable)(p))
          return (plan *) 0;

     /* locate real/imaginary parts according to the transform sign */
     X(extract_reim)(p->sign, p->I, &ri, &ii);
     X(extract_reim)(p->sign, p->O, &ro, &io);

     MPI_Comm_rank(p->comm, &my_pe);
     if (my_pe == 0 && p->vn > 0) {
	  int i, rnk = p->sz->rnk;
	  tensor *sz = X(mktensor)(p->sz->rnk);
	  /* build a row-major tensor with interleaved-complex strides:
	     the last dimension is contiguous in units of 2*vn reals */
	  sz->dims[rnk - 1].is = sz->dims[rnk - 1].os = 2 * p->vn;
	  sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n;
	  for (i = rnk - 1; i > 0; --i) {
	       sz->dims[i - 1].is = sz->dims[i - 1].os = 
		    sz->dims[i].is * sz->dims[i].n;
	       sz->dims[i - 1].n = p->sz->dims[i - 1].n;
	  }
	  
	  cld = X(mkplan_d)(plnr,
			    X(mkproblem_dft_d)(sz,
					       X(mktensor_1d)(p->vn, 2, 2),
					       ri, ii, ro, io));
     }
     else { /* idle process: make nop plan */
	  cld = X(mkplan_d)(plnr,
			    X(mkproblem_dft_d)(X(mktensor_0d)(),
					       X(mktensor_1d)(0,0,0),
					       ri, ii, ro, io));
     }
     /* all processes must agree on success/failure (collective) */
     if (XM(any_true)(!cld, p->comm)) return (plan *) 0;

     pln = MKPLAN_MPI_DFT(P, &padt, apply);
     pln->cld = cld;
     /* record the offsets of the re/im parts within O for apply() */
     pln->roff = ro - p->O;
     pln->ioff = io - p->O;
     X(ops_cpy)(&cld->ops, &pln->super.super.ops);
     return &(pln->super.super);
}
Example #8
0
/* check whether the recursive transposes fit within the space
   that must have been allocated on each process for this transpose;
   this must be modified if the subdivision in mkplan is changed! */
static int enough_space(INT nx, INT ny, INT block, INT tblock,
			int r, int n_pes)
{
     int pe;
     /* processes are subdivided into an r x m grid -- assumes r
        divides n_pes; TODO confirm against mkplan's subdivision */
     int m = n_pes / r;
     for (pe = 0; pe < n_pes; ++pe) {
	  /* space available on pe for the full nx x ny transpose */
	  INT space = transpose_space(nx, ny, block, tblock, pe);
	  /* pe's local slab sizes in the two recursive sub-transposes */
	  INT b1 = XM(block)(nx, r * block, pe / r);
	  INT b2 = XM(block)(ny, m * tblock, pe % r);
	  /* both sub-transposes must fit within the allocated space */
	  if (transpose_space(b1, ny, block, m*tblock, pe % r) > space
	      || transpose_space(nx, b2, r*block, tblock, pe / r) > space)
	       return 0;
     }
     return 1;
}
Example #9
0
/* Applicability test for this rank > 1 dft solver. */
static int applicable(const S *ego, const problem *p_,
		      const planner *plnr)
{
     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;

     /* multidimensional problems only, with the default data layout
        (TRANSPOSED/SCRAMBLED_IN/OUT not supported) */
     if (p->sz->rnk <= 1 || p->flags != 0)
	  return 0;

     /* if the input must be preserved, we may not destroy it */
     if (ego->preserve_input && (NO_DESTROY_INPUTP(plnr) || p->I == p->O))
	  return 0;

     /* only the first dimension may be distributed, on both sides */
     if (!XM(is_local_after)(1, p->sz, IB)
	 || !XM(is_local_after)(1, p->sz, OB))
	  return 0;

     /* this solver counts as "slow" whenever dft-serial applies */
     if (NO_SLOWP(plnr) && XM(dft_serial_applicable)(p))
	  return 0;

     return 1;
}
Example #10
0
/* Total number of local elements on process which_pe under the
   distribution k: the product of the local block sizes along every
   dimension (0 for idle processes). */
INT XM(total_block)(const dtensor *sz, block_kind k, int which_pe)
{
     int i;
     INT n = 1, *coords;

     if (XM(idle_process)(sz, k, which_pe))
	  return 0;

     STACK_MALLOC(INT*, coords, sizeof(INT) * sz->rnk);
     XM(block_coords)(sz, k, which_pe, coords);
     for (i = 0; i < sz->rnk; ++i)
	  n *= XM(block)(sz->dims[i].n, sz->dims[i].b[k], coords[i]);
     STACK_FREE(coords);
     return n;
}
Example #11
0
/* Deallocate an mpi-dft problem: its duplicated communicator, its
   distributed-size descriptor, and the problem struct itself. */
static void destroy(problem *ego_)
{
    problem_mpi_dft *p = (problem_mpi_dft *) ego_;
    MPI_Comm_free(&p->comm);
    XM(dtensor_destroy)(p->sz);
    X(ifree)(ego_);
}
Example #12
0
/* wrappers for fftw init and cleanup */
void PX(init) (void){
#ifdef _OPENMP
  /* OpenMP builds: initialize FFTW threading and default to one
     thread per available core */
  X(init_threads)();
  PX(plan_with_nthreads)(omp_get_max_threads());
#endif
  XM(init)();
}
/* Applicability test for this rank > 1, transposed-output dft solver. */
static int applicable(const S *ego, const problem *p_,
		      const planner *plnr)
{
     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;

     /* multidimensional problems producing transposed output only */
     if (p->sz->rnk <= 1 || p->flags != TRANSPOSED_OUT)
	  return 0;

     /* if the input must be preserved, we may not destroy it */
     if (ego->preserve_input && (NO_DESTROY_INPUTP(plnr) || p->I == p->O))
	  return 0;

     /* input: only dim 0 may be distributed; output: only dims 0/1,
        and dim 0 of the output must form a single block */
     if (!XM(is_local_after)(1, p->sz, IB)
	 || !XM(is_local_after)(2, p->sz, OB)
	 || XM(num_blocks)(p->sz->dims[0].n, p->sz->dims[0].b[OB]) != 1)
	  return 0;

     /* this solver counts as "slow" whenever dft-serial applies */
     if (NO_SLOWP(plnr) && XM(dft_serial_applicable)(p))
	  return 0;

     return 1;
}
Example #14
0
/* Synchronize the planner bogosity flag across processes, in case
   non-MPI problems on a subset of processes encountered bogus
   wisdom: if any process saw bogus wisdom, all must agree. */
static wisdom_state_t bogosity_hook(wisdom_state_t state, const problem *p)
{
     MPI_Comm comm = problem_comm(p);

     /* non-MPI problems (no communicator) are left untouched */
     if (comm == MPI_COMM_NULL)
	  return state;

     return XM(any_true)(state == WISDOM_IS_BOGUS, comm)
	  ? WISDOM_IS_BOGUS : state;
}
Example #15
0
/* Return whether sz is distributed for k according to a simple
   1d block distribution in the first or second dimensions */
int XM(is_block1d)(const dtensor *sz, block_kind k)
{
     int dim = 0;

     if (!FINITE_RNK(sz->rnk))
	  return 0;

     /* find the first dimension that is actually split into blocks */
     while (dim < sz->rnk && num_blocks_kind(sz->dims + dim, k) == 1)
	  ++dim;

     /* it must exist, be dim 0 or 1, and all later dims must be local */
     return (dim < sz->rnk && dim < 2
	     && XM(is_local_after)(dim + 1, sz, k));
}
Example #16
0
/* returns whether sz is local (a single block) for all dims >= dim */
int XM(is_local_after)(int dim, const dtensor *sz, block_kind k)
{
     if (!FINITE_RNK(sz->rnk))
	  return 1; /* rank-infinite sizes have no dims to check */
     for (; dim < sz->rnk; ++dim) {
	  if (XM(num_blocks)(sz->dims[dim].n, sz->dims[dim].b[k]) > 1)
	       return 0;
     }
     return 1;
}
/* For a dimension of size n divided into blocks of (maximum) size
   block, return the size of block number which_block: block itself
   for all but the last block, which may be smaller; 0 for
   out-of-range block indices. */
INT XM(block)(INT n, INT block, int which_block)
{
     INT nb = XM(num_blocks)(n, block);

     if (which_block >= nb)
	  return 0; /* no such block */
     if (which_block < nb - 1)
	  return block;
     return n - which_block * block; /* last block gets the remainder */
}
Example #18
0
/* Hash an mpi-dft problem for planner/wisdom lookup.  Everything
   that determines plan applicability goes in, and the result must
   agree across all processes in the communicator (checked by the
   final assertion). */
static void hash(const problem *p_, md5 *m)
{
    const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
    int i;
    X(md5puts)(m, "mpi-dft");
    X(md5int)(m, p->I == p->O);
    /* don't include alignment -- may differ between processes
    X(md5int)(m, X(alignment_of)(p->I));
    X(md5int)(m, X(alignment_of)(p->O));
    ... note that applicability of MPI plans does not depend
        on alignment (although optimality may, in principle). */
    XM(dtensor_md5)(m, p->sz);
    X(md5INT)(m, p->vn);
    X(md5int)(m, p->sign);
    X(md5int)(m, p->flags);
    /* include the number of processes (but not the ranks) */
    MPI_Comm_size(p->comm, &i);
    X(md5int)(m, i);
    A(XM(md5_equal)(*m, p->comm));
}
/* Deallocate an mpi-rdft problem; the kind array is freed separately
   only when it was MALLOCed separately (no struct hack available). */
static void destroy(problem *ego_)
{
     problem_mpi_rdft *ego = (problem_mpi_rdft *) ego_;
     XM(dtensor_destroy)(ego->sz);
     MPI_Comm_free(&ego->comm);
#if !defined(STRUCT_HACK_C99) && !defined(STRUCT_HACK_KR)
     X(ifree0)(ego->kind);
#endif
     X(ifree)(ego_);
}
Example #20
0
/* Create an MPI dft problem for the distributed size sz, vector
   length vn, interleaved-complex arrays I/O, communicator, transform
   sign, and flags. */
problem *XM(mkproblem_dft)(const dtensor *sz, INT vn,
                           R *I, R *O,
                           MPI_Comm comm,
                           int sign,
                           unsigned flags)
{
    problem_mpi_dft *ego =
        (problem_mpi_dft *)X(mkproblem)(sizeof(problem_mpi_dft), &padt);
    int n_pes;

    A(XM(dtensor_validp)(sz) && FINITE_RNK(sz->rnk));
    MPI_Comm_size(comm, &n_pes);
    /* the communicator must have enough processes for the block
       decomposition of both the input and the output distribution */
    A(n_pes >= XM(num_blocks_total)(sz, IB)
      && n_pes >= XM(num_blocks_total)(sz, OB));
    A(vn >= 0);
    A(sign == -1 || sign == 1);

    /* enforce pointer equality if untainted pointers are equal */
    if (UNTAINT(I) == UNTAINT(O))
        I = O = JOIN_TAINT(I, O);

    ego->sz = XM(dtensor_canonical)(sz, 1);
    ego->vn = vn;
    ego->I = I;
    ego->O = O;
    ego->sign = sign;

    /* canonicalize: replace TRANSPOSED_IN with TRANSPOSED_OUT by
       swapping the first two dimensions (for rnk > 1) */
    if ((flags & TRANSPOSED_IN) && ego->sz->rnk > 1) {
        ddim dim0 = ego->sz->dims[0];
        ego->sz->dims[0] = ego->sz->dims[1];
        ego->sz->dims[1] = dim0;
        flags &= ~TRANSPOSED_IN;
        flags ^= TRANSPOSED_OUT; /* XOR: IN+OUT together cancel out */
    }
    ego->flags = flags;

    /* the problem owns a duplicate of comm (freed in destroy) */
    MPI_Comm_dup(comm, &ego->comm);

    return &(ego->super);
}
Example #21
0
/* Create a PFFT global-transposition plan for an N0 x N1 matrix of
   h0/h1/hm-sized tuples, distributed with block sizes blk0/blk1, by
   wrapping FFTW's MPI transpose.  Returns NULL when there is no data
   to move. */
gtransp_plan PX(plan_global_transp)(
    INT N0, INT N1, INT h0, INT h1, INT hm, INT blk0, INT blk1,
    MPI_Comm comm, R *in, R *out,
    unsigned transp_flag, unsigned fftw_flags
    )
{
  gtransp_plan ths = NULL;
  
  INT N[2], blk[2];

  /* fold the tuple sizes h0/h1 into the matrix dimensions and block
     sizes handed to FFTW */
  N[0] = N1 * h1; blk[0] = blk1 * h1;
  N[1] = N0 * h0; blk[1] = blk0 * h0;

  /* For strange distributions (e.g. 4 data points on 3 processes)
   * all processes in a row get no data. */
  if(N[0]*N[1]*h0*h1*hm==0)
    return NULL;

  ths = gtransp_mkplan();

  /* PFFTs and FFTWs transpose flags are complementary: a PFFT
     transposed flag that is NOT set maps to the FFTW flag being set */
  if( ~transp_flag & PFFT_TRANSPOSED_OUT)
    fftw_flags |= FFTW_MPI_TRANSPOSED_OUT;
  if( ~transp_flag & PFFT_TRANSPOSED_IN)
    fftw_flags |= FFTW_MPI_TRANSPOSED_IN;

#if PFFT_BUGFIX_FORGET_PARALLEL_FFTW_WISDOM
  X(forget_wisdom)();
#endif

#if PFFT_DEBUG_GTRANSP
  ths->dbg = gtransp_mkdbg(N, hm, blk, in, out, comm, fftw_flags);
#endif

  ths->plan.plan = XM(plan_many_transpose)(
      N[0], N[1], hm, blk[0], blk[1], in, out,
      comm, fftw_flags);
  /* record the arrays the plan was created for (presumably compared
     at execute time against the actual arrays -- confirm) */
  ths->plan.plannedin = in;
  ths->plan.plannedout = out;
  ths->plan.execute = (PX(fftw_execute))(XM(execute_r2r));
  return ths;
}
static void zero(const problem *ego_)
{
     const problem_mpi_rdft *ego = (const problem_mpi_rdft *) ego_;
     R *I = ego->I;
     INT i, N;
     int my_pe;

     MPI_Comm_rank(ego->comm, &my_pe);
     N = ego->vn * XM(total_block)(ego->sz, IB, my_pe);
     for (i = 0; i < N; ++i) I[i] = K(0.0);
}
Example #23
0
/* Given a non-idle process which_pe, computes the coordinate
   vector coords[rnk] giving the coordinates of a block in the
   matrix of blocks.  k specifies whether we are talking about
   the input or output data distribution. */
void XM(block_coords)(const dtensor *sz, block_kind k, int which_pe, 
		     INT *coords)
{
     int dim;
     A(!XM(idle_process)(sz, k, which_pe) && FINITE_RNK(sz->rnk));
     /* peel the process index apart like a mixed-radix number,
        least-significant (last) dimension first */
     for (dim = sz->rnk - 1; dim >= 0; --dim) {
	  INT nblocks = num_blocks_kind(sz->dims + dim, k);
	  coords[dim] = which_pe % nblocks;
	  which_pe /= nblocks;
     }
}
Example #24
0
/* One-time MPI initialization: install the MPI-aware planner hooks
   and register the standard MPI solver configuration.  Subsequent
   calls are no-ops (guarded by mpi_inited). */
void XM(init)(void)
{
     if (!mpi_inited) {
	  planner *plnr = X(the_planner)();
	  plnr->cost_hook = cost_hook;
	  plnr->wisdom_ok_hook = wisdom_ok_hook;
	  plnr->nowisdom_hook = nowisdom_hook;
	  plnr->bogosity_hook = bogosity_hook;
          XM(conf_standard)(plnr);
	  mpi_inited = 1;	  
     }
}
Example #25
0
/* Zero-initialize this process's local rows of the input matrix. */
static void zero(const problem *ego_)
{
    const problem_mpi_transpose *p = (const problem_mpi_transpose *) ego_;
    R *in = p->I;
    INT j, total;
    int my_pe;

    MPI_Comm_rank(p->comm, &my_pe);
    /* local row count times the size of one row (ny * vn reals) */
    total = p->vn * p->ny * XM(block)(p->nx, p->block, my_pe);
    for (j = 0; j < total; ++j)
        in[j] = K(0.0);
}
/* Applicability test for this rank-1 (big-vector) dft solver. */
static int applicable(const S *ego, const problem *p_,
		      const planner *plnr)
{
     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
     int n_pes;

     MPI_Comm_size(p->comm, &n_pes);

     /* one-dimensional problems only, and no flags other than
        RANK1_BIGVEC_ONLY */
     if (p->sz->rnk != 1 || (p->flags & ~RANK1_BIGVEC_ONLY))
	  return 0;

     /* if the input must be preserved, we may not destroy it */
     if (ego->preserve_input && (NO_DESTROY_INPUTP(plnr) || p->I == p->O))
	  return 0;

     /* need at least one vector element per process, unless forced
        via RANK1_BIGVEC_ONLY (TODO: relax this, using more memory?) */
     if (p->vn < n_pes && !(p->flags & RANK1_BIGVEC_ONLY))
	  return 0;

     if (!XM(rearrange_applicable)(ego->rearrange,
				   p->sz->dims[0], p->vn, n_pes))
	  return 0;

     /* this solver counts as "slow" whenever dft-serial applies */
     if (NO_SLOWP(plnr) && XM(dft_serial_applicable)(p))
	  return 0;

     return 1;
}
Example #27
0
/* Print a textual representation of an mpi-dft problem.  Unlike
   hash(), this does include the pointer alignments. */
static void print(const problem *ego_, printer *p)
{
    const problem_mpi_dft *ego = (const problem_mpi_dft *) ego_;
    int i;
    p->print(p, "(mpi-dft %d %d %d ",
             ego->I == ego->O,
             X(alignment_of)(ego->I),
             X(alignment_of)(ego->O));
    XM(dtensor_print)(ego->sz, p);
    p->print(p, " %D %d %d", ego->vn, ego->sign, ego->flags);
    /* number of processes in the communicator */
    MPI_Comm_size(ego->comm, &i);
    p->print(p, " %d)", i);
}
/* Print a textual representation of an mpi-rdft problem, including
   the per-dimension transform kinds. */
static void print(const problem *ego_, printer *p)
{
     const problem_mpi_rdft *ego = (const problem_mpi_rdft *) ego_;
     int i;
     p->print(p, "(mpi-rdft %d %d %d ", 
	      ego->I == ego->O,
	      X(alignment_of)(ego->I),
	      X(alignment_of)(ego->O));
     XM(dtensor_print)(ego->sz, p);
     /* one transform kind per dimension */
     for (i = 0; i < ego->sz->rnk; ++i)
          p->print(p, " %d", (int)ego->kind[i]);
     p->print(p, " %D %d", ego->vn, ego->flags);
     /* number of processes in the communicator */
     MPI_Comm_size(ego->comm, &i); p->print(p, " %d)", i);
}
Example #29
0
/* Create an MPI transpose problem: transpose an nx x ny matrix of
   vn-tuples, distributed row-wise with block size "block" on input
   and column-wise with block size tblock on output. */
problem *XM(mkproblem_transpose)(INT nx, INT ny, INT vn,
                                 R *I, R *O,
                                 INT block, INT tblock,
                                 MPI_Comm comm,
                                 unsigned flags)
{
    problem_mpi_transpose *ego =
        (problem_mpi_transpose *)X(mkproblem)(sizeof(problem_mpi_transpose), &padt);

    A(nx > 0 && ny > 0 && vn > 0);
    A(block > 0 && XM(num_blocks_ok)(nx, block, comm)
      && tblock > 0 && XM(num_blocks_ok)(ny, tblock, comm));

    /* enforce pointer equality if untainted pointers are equal */
    if (UNTAINT(I) == UNTAINT(O))
        I = O = JOIN_TAINT(I, O);

    ego->nx = nx;
    ego->ny = ny;
    ego->vn = vn;
    ego->I = I;
    ego->O = O;
    /* clamp: a block cannot be larger than the whole dimension */
    ego->block = block > nx ? nx : block;
    ego->tblock = tblock > ny ? ny : tblock;

    /* canonicalize flags: we can freely assume that the data is
    "transposed" if one of the dimensions is 1. */
    if (ego->block == 1)
        flags |= TRANSPOSED_IN;
    if (ego->tblock == 1)
        flags |= TRANSPOSED_OUT;
    ego->flags = flags;

    /* the problem owns a duplicate of comm (freed in destroy) */
    MPI_Comm_dup(comm, &ego->comm);

    return &(ego->super);
}
/* Hash an mpi-rdft problem for planner/wisdom lookup. */
static void hash(const problem *p_, md5 *m)
{
     const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
     int i;
     /* fixed: was "mpi-dft", which mislabeled rdft problems with the
        same prefix used by the mpi-dft hash (cf. print: "(mpi-rdft") */
     X(md5puts)(m, "mpi-rdft");
     X(md5int)(m, p->I == p->O);
     /* NOTE(review): the mpi-dft hash deliberately omits these
        alignments because they may differ between processes; confirm
        whether the same reasoning should apply here */
     X(md5int)(m, X(alignment_of)(p->I));
     X(md5int)(m, X(alignment_of)(p->O));
     XM(dtensor_md5)(m, p->sz);
     X(md5INT)(m, p->vn);
     /* one transform kind per dimension */
     for (i = 0; i < p->sz->rnk; ++i)
	  X(md5int)(m, p->kind[i]);
     X(md5int)(m, p->flags);
     /* include the number of processes (but not the ranks) */
     MPI_Comm_size(p->comm, &i); X(md5int)(m, i);
}