Beispiel #1
0
/*@
  Dataloop_free - deallocate the resources used to store a dataloop

Input/output Parameters:
. dataloop - pointer to dataloop structure
@*/
void PREPEND_PREFIX(Dataloop_free)(DLOOP_Dataloop **dataloop)
{

    if (*dataloop == NULL) return;

#ifdef DLOOP_DEBUG_MEMORY
    DLOOP_dbg_printf("DLOOP_Dataloop_free: freeing loop @ %x.\n",
		     (int) *dataloop);
#endif

    memset(*dataloop, 0, sizeof(DLOOP_Dataloop_common));
    DLOOP_Free(*dataloop);
    *dataloop = NULL;
    return;
}
static int DLOOP_Dataloop_create_flattened_struct(DLOOP_Count count,
						  const int *blklens,
						  const MPI_Aint *disps,
						  const DLOOP_Type *oldtypes,
						  DLOOP_Dataloop **dlp_p,
						  MPI_Aint *dlsz_p,
						  int *dldepth_p,
						  int flag)
{
    /* arbitrary types, convert to bytes and use indexed */
    int i, err, nr_blks = 0;
    DLOOP_Size *tmp_blklens;
    MPI_Aint *tmp_disps; /* since we're calling another fn that takes
			    this type as an input parameter */
    DLOOP_Offset bytes;
    DLOOP_Segment *segp;

    int first_ind;
    DLOOP_Size last_ind;

    segp = MPIR_Segment_alloc();
    /* --BEGIN ERROR HANDLING-- */
    if (!segp) {
	return DLOOP_Dataloop_create_struct_memory_error();
    }
    /* --END ERROR HANDLING-- */

    /* use segment code once to count contiguous regions */
    for (i=0; i < count; i++)
    {
	int is_basic;

	/* ignore type elements with a zero blklen */
	if (blklens[i] == 0) continue;

	is_basic = (DLOOP_Handle_hasloop_macro(oldtypes[i])) ? 0 : 1;

	if (is_basic && (oldtypes[i] != MPI_LB &&
			 oldtypes[i] != MPI_UB))
	{
	    nr_blks++;
	}
        else /* derived type; get a count of contig blocks */
        {
            DLOOP_Count tmp_nr_blks, sz;

            DLOOP_Handle_get_size_macro(oldtypes[i], sz);

            /* if the derived type has some data to contribute,
             * add to flattened representation */
            if (sz > 0) {
                err = MPIR_Segment_init(NULL,
                                                   (DLOOP_Count) blklens[i],
                                                   oldtypes[i],
                                                   segp,
                                                   flag);
                if (err) return err;

                bytes = SEGMENT_IGNORE_LAST;

                MPIR_Segment_count_contig_blocks(segp,
                                                            0,
                                                            &bytes,
                                                            &tmp_nr_blks);

                nr_blks += tmp_nr_blks;
            }
        }
    }

    /* it's possible for us to get to this point only to realize that
     * there isn't any data in this type. in that case do what we always
     * do: store a simple contig of zero ints and call it done.
     */
    if (nr_blks == 0) {
	MPIR_Segment_free(segp);
	err = MPIR_Dataloop_create_contiguous(0,
							 MPI_INT,
							 dlp_p,
							 dlsz_p,
							 dldepth_p,
							 flag);
	return err;

    }

    nr_blks += 2; /* safety measure */

    tmp_blklens = (DLOOP_Size *) DLOOP_Malloc(nr_blks * sizeof(DLOOP_Size), MPL_MEM_DATATYPE);
    /* --BEGIN ERROR HANDLING-- */
    if (!tmp_blklens) {
	MPIR_Segment_free(segp);
	return DLOOP_Dataloop_create_struct_memory_error();
    }
    /* --END ERROR HANDLING-- */


    tmp_disps = (MPI_Aint *) DLOOP_Malloc(nr_blks * sizeof(MPI_Aint), MPL_MEM_DATATYPE);
    /* --BEGIN ERROR HANDLING-- */
    if (!tmp_disps) {
	DLOOP_Free(tmp_blklens);
	MPIR_Segment_free(segp);
	return DLOOP_Dataloop_create_struct_memory_error();
    }
    /* --END ERROR HANDLING-- */

    /* use segment code again to flatten the type */
    first_ind = 0;
    for (i=0; i < count; i++)
    {
	int is_basic;
	DLOOP_Count sz = -1;

	is_basic = (DLOOP_Handle_hasloop_macro(oldtypes[i])) ? 0 : 1;
	if (!is_basic) DLOOP_Handle_get_size_macro(oldtypes[i], sz);

	/* we're going to use the segment code to flatten the type.
	 * we put in our displacement as the buffer location, and use
	 * the blocklength as the count value to get N contiguous copies
	 * of the type.
	 *
	 * Note that we're going to get back values in bytes, so that will
	 * be our new element type.
	 */
	if (oldtypes[i] != MPI_UB &&
	    oldtypes[i] != MPI_LB &&
	    blklens[i] != 0 &&
	    (is_basic || sz > 0))
	{
	    err = MPIR_Segment_init((char *) DLOOP_OFFSET_CAST_TO_VOID_PTR disps[i],
					 (DLOOP_Count) blklens[i],
					 oldtypes[i],
					 segp,
					 0 /* homogeneous */);
            if (err) return err;

	    last_ind = nr_blks - first_ind;
	    bytes = SEGMENT_IGNORE_LAST;
	    MPIR_Segment_mpi_flatten(segp,
						0,
						&bytes,
						&tmp_blklens[first_ind],
						&tmp_disps[first_ind],
						&last_ind);
            if (err) return err;
	    first_ind += last_ind;
	}
    }
    nr_blks = first_ind;

#if 0
    if (MPL_DBG_SELECTED(MPIR_DBG_DATATYPE,VERBOSE)) {
	MPL_DBG_OUT(MPIR_DBG_DATATYPE,"--- start of flattened type ---");
        for (i=0; i < nr_blks; i++) {
	MPL_DBG_OUT_FMT(MPIR_DBG_DATATYPE,(MPL_DBG_FDEST,
				   "a[%d] = (%d, " DLOOP_OFFSET_FMT_DEC_SPEC ")", i,
				   tmp_blklens[i], tmp_disps[i]));
	}
	MPL_DBG_OUT(MPIR_DBG_DATATYPE,"--- end of flattened type ---");
    }
#endif

    MPIR_Segment_free(segp);

    err = MPIR_Dataloop_create_indexed(nr_blks,
						  tmp_blklens,
						  tmp_disps,
						  1, /* disp in bytes */
						  MPI_BYTE,
						  dlp_p,
						  dlsz_p,
						  dldepth_p,
						  flag);

    DLOOP_Free(tmp_blklens);
    DLOOP_Free(tmp_disps);

    return err;
}
static int DLOOP_Dataloop_create_basic_all_bytes_struct(
	       DLOOP_Count count,
	       const int *blklens,
	       const MPI_Aint *disps,
	       const DLOOP_Type *oldtypes,
	       DLOOP_Dataloop **dlp_p,
	       MPI_Aint *dlsz_p,
	       int *dldepth_p,
	       int flag)
{
    int i, err, cur_pos = 0;
    DLOOP_Size *tmp_blklens;
    MPI_Aint *tmp_disps;

    /* count is an upper bound on number of type instances */
    tmp_blklens = (DLOOP_Size *) DLOOP_Malloc(count * sizeof(DLOOP_Size), MPL_MEM_DATATYPE);

    /* --BEGIN ERROR HANDLING-- */
    if (!tmp_blklens)
    {
	return DLOOP_Dataloop_create_struct_memory_error();
    }
    /* --END ERROR HANDLING-- */

    tmp_disps = (MPI_Aint *) DLOOP_Malloc(count * sizeof(MPI_Aint), MPL_MEM_DATATYPE);

    /* --BEGIN ERROR HANDLING-- */
    if (!tmp_disps)
    {
	DLOOP_Free(tmp_blklens);
	return DLOOP_Dataloop_create_struct_memory_error();
    }
    /* --END ERROR HANDLING-- */

    for (i=0; i < count; i++)
    {
	if (oldtypes[i] != MPI_LB && oldtypes[i] != MPI_UB && blklens[i] != 0)
	{
	    DLOOP_Offset sz;

	    DLOOP_Handle_get_size_macro(oldtypes[i], sz);
	    tmp_blklens[cur_pos] = (int) sz * blklens[i];
	    tmp_disps[cur_pos]   = disps[i];
	    cur_pos++;
	}
    }
    err = MPIR_Dataloop_create_indexed(cur_pos,
						  tmp_blklens,
						  tmp_disps,
						  1, /* disp in bytes */
						  MPI_BYTE,
						  dlp_p,
						  dlsz_p,
						  dldepth_p,
						  flag);

    DLOOP_Free(tmp_blklens);
    DLOOP_Free(tmp_disps);

    return err;
}
static int DLOOP_Dataloop_create_unique_type_struct(DLOOP_Count count,
						    const int *blklens,
						    const MPI_Aint *disps,
						    const DLOOP_Type *oldtypes,
						    int type_pos,
						    DLOOP_Dataloop **dlp_p,
						    MPI_Aint *dlsz_p,
						    int *dldepth_p,
						    int flag)
{
    /* the same type used more than once in the array; type_pos
     * indexes to the first of these.
     */
    int i, err, cur_pos = 0;
    DLOOP_Size *tmp_blklens;
    DLOOP_Offset *tmp_disps;

    /* count is an upper bound on number of type instances */
    tmp_blklens = (DLOOP_Size *) DLOOP_Malloc(count * sizeof(DLOOP_Size), MPL_MEM_DATATYPE);
    /* --BEGIN ERROR HANDLING-- */
    if (!tmp_blklens) {
	/* TODO: ??? */
	return DLOOP_Dataloop_create_struct_memory_error();
    }
    /* --END ERROR HANDLING-- */

    tmp_disps = (DLOOP_Offset *)
	DLOOP_Malloc(count * sizeof(DLOOP_Offset), MPL_MEM_DATATYPE);
    /* --BEGIN ERROR HANDLING-- */
    if (!tmp_disps) {
	DLOOP_Free(tmp_blklens);
	/* TODO: ??? */
	return DLOOP_Dataloop_create_struct_memory_error();
    }
    /* --END ERROR HANDLING-- */

    for (i=type_pos; i < count; i++)
    {
	if (oldtypes[i] == oldtypes[type_pos] && blklens != 0)
	{
	    tmp_blklens[cur_pos] = blklens[i];
	    tmp_disps[cur_pos]   = disps[i];
	    cur_pos++;
	}
    }

    err = MPIR_Dataloop_create_indexed(cur_pos,
						  tmp_blklens,
						  tmp_disps,
						  1, /* disp in bytes */
						  oldtypes[type_pos],
						  dlp_p,
						  dlsz_p,
						  dldepth_p,
						  flag);

    DLOOP_Free(tmp_blklens);
    DLOOP_Free(tmp_disps);

    return err;

}
Beispiel #5
0
void PREPEND_PREFIX(Dataloop_create)(MPI_Datatype type,
				     DLOOP_Dataloop **dlp_p,
				     int *dlsz_p,
				     int *dldepth_p,
				     int flag)
{
    int i;
    int err;

    int nr_ints, nr_aints, nr_types, combiner;
    MPI_Datatype *types;
    int *ints;
    MPI_Aint *aints;

    DLOOP_Dataloop *old_dlp;
    int old_dlsz, old_dldepth;

    int dummy1, dummy2, dummy3, type0_combiner, ndims;
    MPI_Datatype tmptype;

    MPI_Aint stride;
    MPI_Aint *disps;

    MPIR_Type_get_envelope_impl(type, &nr_ints, &nr_aints, &nr_types, &combiner);

    /* some named types do need dataloops; handle separately. */
    if (combiner == MPI_COMBINER_NAMED) {
	DLOOP_Dataloop_create_named(type, dlp_p, dlsz_p, dldepth_p, flag);
	return;
    }
    else if (combiner == MPI_COMBINER_F90_REAL ||
             combiner == MPI_COMBINER_F90_COMPLEX ||
             combiner == MPI_COMBINER_F90_INTEGER)
    {
        MPI_Datatype f90basetype;
        DLOOP_Handle_get_basic_type_macro(type, f90basetype);
        PREPEND_PREFIX(Dataloop_create_contiguous)(1 /* count */,
                                                   f90basetype,
                                                   dlp_p, dlsz_p,
                                                   dldepth_p,
                                                   flag);
        return;
    }

    /* Q: should we also check for "hasloop", or is the COMBINER
     *    check above enough to weed out everything that wouldn't
     *    have a loop?
     */
    DLOOP_Handle_get_loopptr_macro(type, old_dlp, flag);
    if (old_dlp != NULL) {
	/* dataloop already created; just return it. */
	*dlp_p = old_dlp;
	DLOOP_Handle_get_loopsize_macro(type, *dlsz_p, flag);
	DLOOP_Handle_get_loopdepth_macro(type, *dldepth_p, flag);
	return;
    }

    PREPEND_PREFIX(Type_access_contents)(type, &ints, &aints, &types);

    /* first check for zero count on types where that makes sense */
    switch(combiner) {
	case MPI_COMBINER_CONTIGUOUS:
	case MPI_COMBINER_VECTOR:
	case MPI_COMBINER_HVECTOR_INTEGER:
	case MPI_COMBINER_HVECTOR:
	case MPI_COMBINER_INDEXED_BLOCK:
	case MPI_COMBINER_HINDEXED_BLOCK:
	case MPI_COMBINER_INDEXED:
	case MPI_COMBINER_HINDEXED_INTEGER:
	case MPI_COMBINER_HINDEXED:
	case MPI_COMBINER_STRUCT_INTEGER:
	case MPI_COMBINER_STRUCT:
	    if (ints[0] == 0) {
		PREPEND_PREFIX(Dataloop_create_contiguous)(0,
							   MPI_INT,
							   dlp_p,
							   dlsz_p,
							   dldepth_p,
							   flag);
		goto clean_exit;
	    }
	    break;
	default:
	    break;
    }

    /* recurse, processing types "below" this one before processing
     * this one, if those type don't already have dataloops.
     *
     * note: in the struct case below we'll handle any additional
     *       types "below" the current one.
     */
    MPIR_Type_get_envelope_impl(types[0], &dummy1, &dummy2, &dummy3, &type0_combiner);
    if (type0_combiner != MPI_COMBINER_NAMED)
    {
	DLOOP_Handle_get_loopptr_macro(types[0], old_dlp, flag);
	if (old_dlp == NULL)
	{
	    /* no dataloop already present; create and store one */
	    PREPEND_PREFIX(Dataloop_create)(types[0],
					    &old_dlp,
					    &old_dlsz,
					    &old_dldepth,
					    flag);

	    DLOOP_Handle_set_loopptr_macro(types[0], old_dlp, flag);
	    DLOOP_Handle_set_loopsize_macro(types[0], old_dlsz, flag);
	    DLOOP_Handle_set_loopdepth_macro(types[0], old_dldepth, flag);
	}
	else {
	    DLOOP_Handle_get_loopsize_macro(types[0], old_dlsz, flag);
	    DLOOP_Handle_get_loopdepth_macro(types[0], old_dldepth, flag);
	}
    }
       
    switch(combiner)
    {
	case MPI_COMBINER_DUP:
	    if (type0_combiner != MPI_COMBINER_NAMED) {
		PREPEND_PREFIX(Dataloop_dup)(old_dlp, old_dlsz, dlp_p);
		*dlsz_p    = old_dlsz;
		*dldepth_p = old_dldepth;
	    }
	    else {
		PREPEND_PREFIX(Dataloop_create_contiguous)(1,
							   types[0], 
							   dlp_p, dlsz_p,
							   dldepth_p,
							   flag);
	    }
	    break;
	case MPI_COMBINER_RESIZED:
	    if (type0_combiner != MPI_COMBINER_NAMED) {
		PREPEND_PREFIX(Dataloop_dup)(old_dlp, old_dlsz, dlp_p);
		*dlsz_p    = old_dlsz;
		*dldepth_p = old_dldepth;
	    }
	    else {
		PREPEND_PREFIX(Dataloop_create_contiguous)(1,
							   types[0], 
							   dlp_p, dlsz_p,
							   dldepth_p,
							   flag);

		(*dlp_p)->el_extent = aints[1]; /* extent */
	    }
	    break;
	case MPI_COMBINER_CONTIGUOUS:
	    PREPEND_PREFIX(Dataloop_create_contiguous)(ints[0] /* count */,
						       types[0] /* oldtype */,
						       dlp_p, dlsz_p,
						       dldepth_p,
						       flag);
	    break;
	case MPI_COMBINER_VECTOR:
	    PREPEND_PREFIX(Dataloop_create_vector)(ints[0] /* count */,
						   ints[1] /* blklen */,
						   ints[2] /* stride */,
						   0 /* stride not bytes */,
						   types[0] /* oldtype */,
						   dlp_p, dlsz_p, dldepth_p,
						   flag);
	    break;
	case MPI_COMBINER_HVECTOR_INTEGER:
	case MPI_COMBINER_HVECTOR:
	    /* fortran hvector has integer stride in bytes */
	    if (combiner == MPI_COMBINER_HVECTOR_INTEGER) {
		stride = (MPI_Aint) ints[2];
	    }
	    else {
		stride = aints[0];
	    }

	    PREPEND_PREFIX(Dataloop_create_vector)(ints[0] /* count */,
						   ints[1] /* blklen */,
						   stride,
						   1 /* stride in bytes */,
						   types[0] /* oldtype */,
						   dlp_p, dlsz_p, dldepth_p,
						   flag);
	    break;
	case MPI_COMBINER_INDEXED_BLOCK:
	    PREPEND_PREFIX(Dataloop_create_blockindexed)(ints[0] /* count */,
							 ints[1] /* blklen */,
							 &ints[2] /* disps */,
							 0 /* disp not bytes */,
							 types[0] /* oldtype */,
							 dlp_p, dlsz_p,
							 dldepth_p,
							 flag);
	    break;
	case MPI_COMBINER_HINDEXED_BLOCK:
            disps = (MPI_Aint *) DLOOP_Malloc(ints[0] * sizeof(MPI_Aint));
            for (i = 0; i < ints[0]; i++)
                disps[i] = aints[i];
	    PREPEND_PREFIX(Dataloop_create_blockindexed)(ints[0] /* count */,
							 ints[1] /* blklen */,
							 disps /* disps */,
							 1 /* disp is bytes */,
							 types[0] /* oldtype */,
							 dlp_p, dlsz_p,
							 dldepth_p,
							 flag);
            DLOOP_Free(disps);
	    break;
	case MPI_COMBINER_INDEXED:
	    PREPEND_PREFIX(Dataloop_create_indexed)(ints[0] /* count */,
						    &ints[1] /* blklens */,
						    &ints[ints[0]+1] /* disp */,
						    0 /* disp not in bytes */,
						    types[0] /* oldtype */,
						    dlp_p, dlsz_p, dldepth_p,
						    flag);
	    break;
	case MPI_COMBINER_HINDEXED_INTEGER:
	case MPI_COMBINER_HINDEXED:
	    if (combiner == MPI_COMBINER_HINDEXED_INTEGER) {
		disps = (MPI_Aint *) DLOOP_Malloc(ints[0] * sizeof(MPI_Aint));

		for (i=0; i < ints[0]; i++) {
		    disps[i] = (MPI_Aint) ints[ints[0] + 1 + i];
		}
	    }
	    else {
		disps = aints;
	    }

	    PREPEND_PREFIX(Dataloop_create_indexed)(ints[0] /* count */,
						    &ints[1] /* blklens */,
						    disps,
						    1 /* disp in bytes */,
						    types[0] /* oldtype */,
						    dlp_p, dlsz_p, dldepth_p,
						    flag);

	    if (combiner == MPI_COMBINER_HINDEXED_INTEGER) {
		DLOOP_Free(disps);
	    }

	    break;
	case MPI_COMBINER_STRUCT_INTEGER:
	case MPI_COMBINER_STRUCT:
	    for (i = 1; i < ints[0]; i++) {
		int type_combiner;
		MPIR_Type_get_envelope_impl(types[i], &dummy1, &dummy2, &dummy3, &type_combiner);

		if (type_combiner != MPI_COMBINER_NAMED) {
		    DLOOP_Handle_get_loopptr_macro(types[i], old_dlp, flag);
		    if (old_dlp == NULL)
		    {
			PREPEND_PREFIX(Dataloop_create)(types[i],
							&old_dlp,
							&old_dlsz,
							&old_dldepth,
							flag);
			
			DLOOP_Handle_set_loopptr_macro(types[i], old_dlp,
						       flag);
			DLOOP_Handle_set_loopsize_macro(types[i], old_dlsz,
							flag);
			DLOOP_Handle_set_loopdepth_macro(types[i], old_dldepth,
							 flag);
		    }
		}
	    }
	    if (combiner == MPI_COMBINER_STRUCT_INTEGER) {
		disps = (MPI_Aint *) DLOOP_Malloc(ints[0] * sizeof(MPI_Aint));

		for (i=0; i < ints[0]; i++) {
		    disps[i] = (MPI_Aint) ints[ints[0] + 1 + i];
		}
	    }
	    else {
		disps = aints;
	    }

            err = PREPEND_PREFIX(Dataloop_create_struct)(ints[0] /* count */,
                                                         &ints[1] /* blklens */,
                                                         disps,
                                                         types /* oldtype array */,
                                                         dlp_p, dlsz_p, dldepth_p,
                                                         flag);
            /* TODO if/when this function returns error codes, propagate this failure instead */
            DLOOP_Assert(0 == err);
            /* if (err) return err; */

	    if (combiner == MPI_COMBINER_STRUCT_INTEGER) {
		DLOOP_Free(disps);
	    }
	    break;
	case MPI_COMBINER_SUBARRAY:
	    ndims = ints[0];
	    PREPEND_PREFIX(Type_convert_subarray)(ndims,
						  &ints[1] /* sizes */,
						  &ints[1+ndims] /* subsizes */,
						  &ints[1+2*ndims] /* starts */,
						  ints[1+3*ndims] /* order */,
						  types[0],
						  &tmptype);

	    PREPEND_PREFIX(Dataloop_create)(tmptype,
					    dlp_p,
					    dlsz_p,
					    dldepth_p,
					    flag);
	    
	    MPIR_Type_free_impl(&tmptype);
	    break;
	case MPI_COMBINER_DARRAY:
	    ndims = ints[2];
	    PREPEND_PREFIX(Type_convert_darray)(ints[0] /* size */,
						ints[1] /* rank */,
						ndims,
						&ints[3] /* gsizes */,
						&ints[3+ndims] /*distribs */,
						&ints[3+2*ndims] /* dargs */,
						&ints[3+3*ndims] /* psizes */,
						ints[3+4*ndims] /* order */,
						types[0],
						&tmptype);

	    PREPEND_PREFIX(Dataloop_create)(tmptype,
					    dlp_p,
					    dlsz_p,
					    dldepth_p,
					    flag);

	    MPIR_Type_free_impl(&tmptype);
	    break;
	default:
	    DLOOP_Assert(0);
	    break;
    }

 clean_exit:

    PREPEND_PREFIX(Type_release_contents)(type, &ints, &aints, &types);

    /* for now we just leave the intermediate dataloops in place.
     * could remove them to save space if we wanted.
     */

    return;
}
Beispiel #6
0
int MPIR_Type_convert_darray(int size,
                             int rank,
                             int ndims,
                             int *array_of_gsizes,
                             int *array_of_distribs,
                             int *array_of_dargs,
                             int *array_of_psizes,
                             int order, MPI_Datatype oldtype, MPI_Datatype * newtype)
{
    int mpi_errno = MPI_SUCCESS;
    MPI_Datatype type_old, type_new = MPI_DATATYPE_NULL, types[3];
    int procs, tmp_rank, i, tmp_size, blklens[3], *coords;
    MPI_Aint *st_offsets, orig_extent, disps[3];

    MPIR_Datatype_get_extent_macro(oldtype, orig_extent);

/* calculate position in Cartesian grid as MPI would (row-major
   ordering) */
    coords = (int *) DLOOP_Malloc(ndims * sizeof(int), MPL_MEM_DATATYPE);
    procs = size;
    tmp_rank = rank;
    for (i = 0; i < ndims; i++) {
        procs = procs / array_of_psizes[i];
        coords[i] = tmp_rank / procs;
        tmp_rank = tmp_rank % procs;
    }

    st_offsets = (MPI_Aint *) DLOOP_Malloc(ndims * sizeof(MPI_Aint), MPL_MEM_DATATYPE);
    type_old = oldtype;

    if (order == MPI_ORDER_FORTRAN) {
        /* dimension 0 changes fastest */
        for (i = 0; i < ndims; i++) {
            switch (array_of_distribs[i]) {
                case MPI_DISTRIBUTE_BLOCK:
                    mpi_errno = MPII_Type_block(array_of_gsizes, i, ndims,
                                                array_of_psizes[i],
                                                coords[i], array_of_dargs[i],
                                                order, orig_extent,
                                                type_old, &type_new, st_offsets + i);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                    break;
                case MPI_DISTRIBUTE_CYCLIC:
                    mpi_errno = MPII_Type_cyclic(array_of_gsizes, i, ndims,
                                                 array_of_psizes[i], coords[i],
                                                 array_of_dargs[i], order,
                                                 orig_extent, type_old, &type_new, st_offsets + i);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                    break;
                case MPI_DISTRIBUTE_NONE:
                    /* treat it as a block distribution on 1 process */
                    mpi_errno = MPII_Type_block(array_of_gsizes, i, ndims, 1, 0,
                                                MPI_DISTRIBUTE_DFLT_DARG, order,
                                                orig_extent, type_old, &type_new, st_offsets + i);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                    break;
            }
            if (i)
                MPIR_Type_free_impl(&type_old);
            type_old = type_new;
        }

        /* add displacement and UB */
        disps[1] = st_offsets[0];
        tmp_size = 1;
        for (i = 1; i < ndims; i++) {
            tmp_size *= array_of_gsizes[i - 1];
            disps[1] += ((MPI_Aint) tmp_size) * st_offsets[i];
        }
        /* rest done below for both Fortran and C order */
    }

    else {      /* order == MPI_ORDER_C */

        /* dimension ndims-1 changes fastest */
        for (i = ndims - 1; i >= 0; i--) {
            switch (array_of_distribs[i]) {
                case MPI_DISTRIBUTE_BLOCK:
                    mpi_errno = MPII_Type_block(array_of_gsizes, i, ndims, array_of_psizes[i],
                                                coords[i], array_of_dargs[i], order,
                                                orig_extent, type_old, &type_new, st_offsets + i);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                    break;
                case MPI_DISTRIBUTE_CYCLIC:
                    mpi_errno = MPII_Type_cyclic(array_of_gsizes, i, ndims,
                                                 array_of_psizes[i], coords[i],
                                                 array_of_dargs[i], order,
                                                 orig_extent, type_old, &type_new, st_offsets + i);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                    break;
                case MPI_DISTRIBUTE_NONE:
                    /* treat it as a block distribution on 1 process */
                    mpi_errno = MPII_Type_block(array_of_gsizes, i, ndims, array_of_psizes[i],
                                                coords[i], MPI_DISTRIBUTE_DFLT_DARG, order,
                                                orig_extent, type_old, &type_new, st_offsets + i);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                    break;
            }
            if (i != ndims - 1)
                MPIR_Type_free_impl(&type_old);
            type_old = type_new;
        }

        /* add displacement and UB */
        disps[1] = st_offsets[ndims - 1];
        tmp_size = 1;
        for (i = ndims - 2; i >= 0; i--) {
            tmp_size *= array_of_gsizes[i + 1];
            disps[1] += ((MPI_Aint) tmp_size) * st_offsets[i];
        }
    }

    disps[1] *= orig_extent;

    disps[2] = orig_extent;
    for (i = 0; i < ndims; i++)
        disps[2] *= (MPI_Aint) (array_of_gsizes[i]);

    disps[0] = 0;
    blklens[0] = blklens[1] = blklens[2] = 1;
    types[0] = MPI_LB;
    types[1] = type_new;
    types[2] = MPI_UB;

    DLOOP_Free(st_offsets);
    DLOOP_Free(coords);

    mpi_errno = MPIR_Type_struct_impl(3, blklens, disps, types, newtype);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    MPIR_Type_free_impl(&type_new);


  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}