/*@
  DLOOP_Dataloop_create_named - create a dataloop for a "named" type
  if necessary.

  "named" types are ones for which MPI_Type_get_envelope() returns a
  combiner of MPI_COMBINER_NAMED. some types that fit this category,
  such as MPI_SHORT_INT, have multiple elements with potential gaps
  and padding. these types need dataloops for correct processing.
@*/
static void DLOOP_Dataloop_create_named(MPI_Datatype type,
                                        DLOOP_Dataloop **dlp_p,
                                        int *dlsz_p,
                                        int *dldepth_p,
                                        int flag)
{
    DLOOP_Dataloop *loop_ptr;

    /* Only the multi-element pairtypes need a dataloop; every other
     * named type is handled without one, so report "no loop" and exit.
     *
     * note: not dealing with MPI_2INT because size == extent
     *       in all cases for that type.
     */
    if (type != MPI_FLOAT_INT &&
        type != MPI_DOUBLE_INT &&
        type != MPI_LONG_INT &&
        type != MPI_SHORT_INT &&
        type != MPI_LONG_DOUBLE_INT)
    {
        /* no other combiners need dataloops; exit. */
        *dlp_p = NULL;
        *dlsz_p = 0;
        *dldepth_p = 0;
        return;
    }

    DLOOP_Handle_get_loopptr_macro(type, loop_ptr, flag);
    if (loop_ptr == NULL) {
        /* no dataloop yet -- build one.
         *
         * note: MPICH always precreates these, so we will never call
         *       Dataloop_create_pairtype() from here in the MPICH case.
         */
        PREPEND_PREFIX(Dataloop_create_pairtype)(type, dlp_p, dlsz_p,
                                                 dldepth_p, flag);
    }
    else {
        /* dataloop already created; just return it. */
        *dlp_p = loop_ptr;
        DLOOP_Handle_get_loopsize_macro(type, *dlsz_p, flag);
        DLOOP_Handle_get_loopdepth_macro(type, *dldepth_p, flag);
    }
    return;
}
/*@
  Dataloop_contiguous - create the dataloop representation for a
  contiguous datatype

  Input Parameters:
+ int icount,
. DLOOP_Type oldtype
- int flag

  Output Parameters:
+ DLOOP_Dataloop **dlp_p,
. DLOOP_Size *dlsz_p,
- int *dldepth_p,

.N Errors
.N Returns 0 on success, -1 on failure.
@*/
int PREPEND_PREFIX(Dataloop_create_contiguous)(DLOOP_Count icount,
                                               DLOOP_Type oldtype,
                                               DLOOP_Dataloop **dlp_p,
                                               DLOOP_Size *dlsz_p,
                                               int *dldepth_p,
                                               int flag)
{
    DLOOP_Count count;
    int is_builtin, apply_contig_coalescing = 0;
    int new_loop_depth;
    DLOOP_Size new_loop_sz;
    DLOOP_Dataloop *new_dlp;

    count = icount;

    /* builtin types have no dataloop of their own */
    is_builtin = (DLOOP_Handle_hasloop_macro(oldtype)) ? 0 : 1;

    if (is_builtin) {
        new_loop_depth = 1;
    }
    else {
        int old_loop_depth = 0;
        DLOOP_Offset old_size = 0, old_extent = 0;
        DLOOP_Dataloop *old_loop_ptr;

        DLOOP_Handle_get_loopdepth_macro(oldtype, old_loop_depth, flag);
        DLOOP_Handle_get_loopptr_macro(oldtype, old_loop_ptr, flag);
        DLOOP_Handle_get_size_macro(oldtype, old_size);
        DLOOP_Handle_get_extent_macro(oldtype, old_extent);

        /* if we have a simple combination of contigs, coalesce:
         * a contig of a dense contig (size == extent, so no gaps)
         * collapses into a single contig with a multiplied count. */
        if (((old_loop_ptr->kind & DLOOP_KIND_MASK) == DLOOP_KIND_CONTIG) &&
            (old_size == old_extent))
        {
            /* will just copy contig and multiply count */
            apply_contig_coalescing = 1;
            new_loop_depth = old_loop_depth;
        }
        else {
            new_loop_depth = old_loop_depth + 1;
        }
    }

    if (is_builtin) {
        DLOOP_Offset basic_sz = 0;

        PREPEND_PREFIX(Dataloop_alloc)(DLOOP_KIND_CONTIG, count, &new_dlp,
                                       &new_loop_sz);
        /* --BEGIN ERROR HANDLING-- */
        if (!new_dlp) return -1;
        /* --END ERROR HANDLING-- */

        DLOOP_Handle_get_size_macro(oldtype, basic_sz);
        /* leaf loop: mark FINAL so traversal stops here */
        new_dlp->kind = DLOOP_KIND_CONTIG | DLOOP_FINAL_MASK;

        if (flag == DLOOP_DATALOOP_ALL_BYTES)
        {
            /* byte representation: fold the element size into the count */
            count *= basic_sz;
            new_dlp->el_size = 1;
            new_dlp->el_extent = 1;
            new_dlp->el_type = MPI_BYTE;
        }
        else
        {
            new_dlp->el_size = basic_sz;
            new_dlp->el_extent = new_dlp->el_size;
            new_dlp->el_type = oldtype;
        }

        new_dlp->loop_params.c_t.count = count;
    }
    else
    {
        /* user-defined base type (oldtype) */
        DLOOP_Dataloop *old_loop_ptr;
        MPI_Aint old_loop_sz = 0;

        DLOOP_Handle_get_loopptr_macro(oldtype, old_loop_ptr, flag);
        DLOOP_Handle_get_loopsize_macro(oldtype, old_loop_sz, flag);

        if (apply_contig_coalescing)
        {
            /* make a copy of the old loop and multiply the count */
            PREPEND_PREFIX(Dataloop_dup)(old_loop_ptr, old_loop_sz, &new_dlp);
            /* --BEGIN ERROR HANDLING-- */
            if (!new_dlp) return -1;
            /* --END ERROR HANDLING-- */

            new_dlp->loop_params.c_t.count *= count;

            new_loop_sz = old_loop_sz;
            /* depth is unchanged; re-read it from the old type */
            DLOOP_Handle_get_loopdepth_macro(oldtype, new_loop_depth, flag);
        }
        else
        {
            /* allocate space for new loop including copy of old */
            PREPEND_PREFIX(Dataloop_alloc_and_copy)(DLOOP_KIND_CONTIG,
                                                    count,
                                                    old_loop_ptr,
                                                    old_loop_sz,
                                                    &new_dlp,
                                                    &new_loop_sz);
            /* --BEGIN ERROR HANDLING-- */
            if (!new_dlp) return -1;
            /* --END ERROR HANDLING-- */

            new_dlp->kind = DLOOP_KIND_CONTIG;
            DLOOP_Handle_get_size_macro(oldtype, new_dlp->el_size);
            DLOOP_Handle_get_extent_macro(oldtype, new_dlp->el_extent);
            DLOOP_Handle_get_basic_type_macro(oldtype, new_dlp->el_type);

            new_dlp->loop_params.c_t.count = count;
        }
    }

    *dlp_p = new_dlp;
    *dlsz_p = new_loop_sz;
    *dldepth_p = new_loop_depth;

    return 0;
}
/*@ Dataloop_create_blockindexed - create blockindexed dataloop Arguments: + DLOOP_Count count . void *displacement_array (array of either MPI_Aints or ints) . int displacement_in_bytes (boolean) . MPI_Datatype old_type . DLOOP_Dataloop **output_dataloop_ptr . int output_dataloop_size . int output_dataloop_depth - int flag .N Errors .N Returns 0 on success, -1 on failure. @*/ int MPIR_Dataloop_create_blockindexed(DLOOP_Count icount, DLOOP_Count iblklen, const void *disp_array, int dispinbytes, DLOOP_Type oldtype, DLOOP_Dataloop ** dlp_p, DLOOP_Size * dlsz_p, int *dldepth_p, int flag) { int err, is_builtin, is_vectorizable = 1; int i, old_loop_depth; DLOOP_Size new_loop_sz; DLOOP_Count contig_count, count, blklen; DLOOP_Offset old_extent, eff_disp0, eff_disp1, last_stride; DLOOP_Dataloop *new_dlp; count = (DLOOP_Count) icount; /* avoid subsequent casting */ blklen = (DLOOP_Count) iblklen; /* if count or blklen are zero, handle with contig code, call it a int */ if (count == 0 || blklen == 0) { err = MPIR_Dataloop_create_contiguous(0, MPI_INT, dlp_p, dlsz_p, dldepth_p, flag); return err; } is_builtin = (DLOOP_Handle_hasloop_macro(oldtype)) ? 0 : 1; if (is_builtin) { DLOOP_Handle_get_size_macro(oldtype, old_extent); old_loop_depth = 0; } else { DLOOP_Handle_get_extent_macro(oldtype, old_extent); DLOOP_Handle_get_loopdepth_macro(oldtype, old_loop_depth); } contig_count = MPIR_Type_blockindexed_count_contig(count, blklen, disp_array, dispinbytes, old_extent); /* optimization: * * if contig_count == 1 and block starts at displacement 0, * store it as a contiguous rather than a blockindexed dataloop. */ if ((contig_count == 1) && ((!dispinbytes && ((int *) disp_array)[0] == 0) || (dispinbytes && ((MPI_Aint *) disp_array)[0] == 0))) { err = MPIR_Dataloop_create_contiguous(icount * iblklen, oldtype, dlp_p, dlsz_p, dldepth_p, flag); return err; } /* optimization: * * if contig_count == 1 store it as a blockindexed with one * element rather than as a lot of individual blocks. 
*/ if (contig_count == 1) { /* adjust count and blklen and drop through */ blklen *= count; count = 1; iblklen *= icount; icount = 1; } /* optimization: * * if displacements start at zero and result in a fixed stride, * store it as a vector rather than a blockindexed dataloop. */ eff_disp0 = (dispinbytes) ? ((DLOOP_Offset) ((MPI_Aint *) disp_array)[0]) : (((DLOOP_Offset) ((int *) disp_array)[0]) * old_extent); if (count > 1 && eff_disp0 == (DLOOP_Offset) 0) { eff_disp1 = (dispinbytes) ? ((DLOOP_Offset) ((MPI_Aint *) disp_array)[1]) : (((DLOOP_Offset) ((int *) disp_array)[1]) * old_extent); last_stride = eff_disp1 - eff_disp0; for (i = 2; i < count; i++) { eff_disp0 = eff_disp1; eff_disp1 = (dispinbytes) ? ((DLOOP_Offset) ((MPI_Aint *) disp_array)[i]) : (((DLOOP_Offset) ((int *) disp_array)[i]) * old_extent); if (eff_disp1 - eff_disp0 != last_stride) { is_vectorizable = 0; break; } } if (is_vectorizable) { err = MPIR_Dataloop_create_vector(count, blklen, last_stride, 1, /* strideinbytes */ oldtype, dlp_p, dlsz_p, dldepth_p, flag); return err; } } /* TODO: optimization: * * if displacements result in a fixed stride, but first displacement * is not zero, store it as a blockindexed (blklen == 1) of a vector. */ /* TODO: optimization: * * if a blockindexed of a contig, absorb the contig into the blocklen * parameter and keep the same overall depth */ /* otherwise storing as a blockindexed dataloop */ /* Q: HOW CAN WE TELL IF IT IS WORTH IT TO STORE AS AN * INDEXED WITH FEWER CONTIG BLOCKS (IF CONTIG_COUNT IS SMALL)? 
*/ if (is_builtin) { MPIR_Dataloop_alloc(DLOOP_KIND_BLOCKINDEXED, count, &new_dlp, &new_loop_sz); /* --BEGIN ERROR HANDLING-- */ if (!new_dlp) return -1; /* --END ERROR HANDLING-- */ new_dlp->kind = DLOOP_KIND_BLOCKINDEXED | DLOOP_FINAL_MASK; if (flag == DLOOP_DATALOOP_ALL_BYTES) { blklen *= old_extent; new_dlp->el_size = 1; new_dlp->el_extent = 1; new_dlp->el_type = MPI_BYTE; } else { new_dlp->el_size = old_extent; new_dlp->el_extent = old_extent; new_dlp->el_type = oldtype; } } else { DLOOP_Dataloop *old_loop_ptr = NULL; MPI_Aint old_loop_sz = 0; DLOOP_Handle_get_loopptr_macro(oldtype, old_loop_ptr); DLOOP_Handle_get_loopsize_macro(oldtype, old_loop_sz); MPIR_Dataloop_alloc_and_copy(DLOOP_KIND_BLOCKINDEXED, count, old_loop_ptr, old_loop_sz, &new_dlp, &new_loop_sz); /* --BEGIN ERROR HANDLING-- */ if (!new_dlp) return -1; /* --END ERROR HANDLING-- */ new_dlp->kind = DLOOP_KIND_BLOCKINDEXED; DLOOP_Handle_get_size_macro(oldtype, new_dlp->el_size); DLOOP_Handle_get_extent_macro(oldtype, new_dlp->el_extent); DLOOP_Handle_get_basic_type_macro(oldtype, new_dlp->el_type); } new_dlp->loop_params.bi_t.count = count; new_dlp->loop_params.bi_t.blocksize = blklen; /* copy in displacement parameters * * regardless of dispinbytes, we store displacements in bytes in loop. */ DLOOP_Type_blockindexed_array_copy(count, disp_array, new_dlp->loop_params.bi_t.offset_array, dispinbytes, old_extent); *dlp_p = new_dlp; *dlsz_p = new_loop_sz; *dldepth_p = old_loop_depth + 1; return 0; }
/* Dataloop_create_indexed - create the dataloop representation for an
 * indexed datatype (per-block lengths and displacements).
 *
 * Tries several optimizations (contig, single-block blockindexed,
 * uniform-blocklength blockindexed) before falling back to a true
 * indexed dataloop.
 *
 * Returns 0/MPI_SUCCESS on success, -1 on allocation failure.
 */
int PREPEND_PREFIX(Dataloop_create_indexed)(DLOOP_Count icount,
                                            const DLOOP_Size *blocklength_array,
                                            const void *displacement_array,
                                            int dispinbytes,
                                            MPI_Datatype oldtype,
                                            DLOOP_Dataloop **dlp_p,
                                            DLOOP_Size *dlsz_p,
                                            int *dldepth_p,
                                            int flag)
{
    int err, is_builtin;
    int old_loop_depth;
    MPI_Aint i;
    DLOOP_Size new_loop_sz, blksz;
    DLOOP_Count first;
    DLOOP_Count old_type_count = 0, contig_count, count;
    DLOOP_Offset old_extent;
    struct DLOOP_Dataloop *new_dlp;

    count = (DLOOP_Count) icount; /* avoid subsequent casting */

    /* if count is zero, handle with contig code, call it an int */
    if (count == 0)
    {
        err = PREPEND_PREFIX(Dataloop_create_contiguous)(0,
                                                         MPI_INT,
                                                         dlp_p,
                                                         dlsz_p,
                                                         dldepth_p,
                                                         flag);
        return err;
    }

    /* Skip any initial zero-length blocks */
    for (first = 0; first < count; first++)
        if ((DLOOP_Count) blocklength_array[first])
            break;

    is_builtin = (DLOOP_Handle_hasloop_macro(oldtype)) ? 0 : 1;

    if (is_builtin)
    {
        DLOOP_Handle_get_extent_macro(oldtype, old_extent);
        old_loop_depth = 0;
    }
    else
    {
        DLOOP_Handle_get_extent_macro(oldtype, old_extent);
        DLOOP_Handle_get_loopdepth_macro(oldtype, old_loop_depth, flag);
    }

    /* total number of old-type elements across all (non-skipped) blocks */
    for (i=first; i < count; i++)
    {
        old_type_count += (DLOOP_Count) blocklength_array[i];
    }

    contig_count = PREPEND_PREFIX(Type_indexed_count_contig)(count,
                                                             blocklength_array,
                                                             displacement_array,
                                                             dispinbytes,
                                                             old_extent);

    /* if contig_count is zero (no data), handle with contig code */
    if (contig_count == 0)
    {
        err = PREPEND_PREFIX(Dataloop_create_contiguous)(0,
                                                         MPI_INT,
                                                         dlp_p,
                                                         dlsz_p,
                                                         dldepth_p,
                                                         flag);
        return err;
    }

    /* optimization:
     *
     * if contig_count == 1 and block starts at displacement 0,
     * store it as a contiguous rather than an indexed dataloop.
     */
    if ((contig_count == 1) &&
        ((!dispinbytes && ((int *) displacement_array)[first] == 0) ||
         (dispinbytes && ((MPI_Aint *) displacement_array)[first] == 0)))
    {
        err = PREPEND_PREFIX(Dataloop_create_contiguous)(old_type_count,
                                                         oldtype,
                                                         dlp_p,
                                                         dlsz_p,
                                                         dldepth_p,
                                                         flag);
        return err;
    }

    /* optimization:
     *
     * if contig_count == 1 (and displacement != 0), store this as
     * a single element blockindexed rather than a lot of individual
     * blocks.
     */
    if (contig_count == 1)
    {
        const void *disp_arr_tmp; /* no ternary assignment to avoid clang warnings */
        if (dispinbytes)
            disp_arr_tmp = &(((const MPI_Aint *)displacement_array)[first]);
        else
            disp_arr_tmp = &(((const int *)displacement_array)[first]);
        err = PREPEND_PREFIX(Dataloop_create_blockindexed)(1,
                                                           old_type_count,
                                                           disp_arr_tmp,
                                                           dispinbytes,
                                                           oldtype,
                                                           dlp_p,
                                                           dlsz_p,
                                                           dldepth_p,
                                                           flag);
        return err;
    }

    /* optimization:
     *
     * if block length is the same for all blocks, store it as a
     * blockindexed rather than an indexed dataloop.
     */
    blksz = blocklength_array[first];
    for (i = first+1; i < count; i++)
    {
        if (blocklength_array[i] != blksz)
        {
            /* decrement so the equality test below fails and we fall
             * through to the general indexed representation */
            blksz--;
            break;
        }
    }
    if (blksz == blocklength_array[first])
    {
        const void *disp_arr_tmp; /* no ternary assignment to avoid clang warnings */
        if (dispinbytes)
            disp_arr_tmp = &(((const MPI_Aint *)displacement_array)[first]);
        else
            disp_arr_tmp = &(((const int *)displacement_array)[first]);
        err = PREPEND_PREFIX(Dataloop_create_blockindexed)(icount-first,
                                                           blksz,
                                                           disp_arr_tmp,
                                                           dispinbytes,
                                                           oldtype,
                                                           dlp_p,
                                                           dlsz_p,
                                                           dldepth_p,
                                                           flag);
        return err;
    }

    /* note: blockindexed looks for the vector optimization */

    /* TODO: optimization:
     *
     * if an indexed of a contig, absorb the contig into the blocklen array
     * and keep the same overall depth
     */

    /* otherwise storing as an indexed dataloop */

    if (is_builtin)
    {
        PREPEND_PREFIX(Dataloop_alloc)(DLOOP_KIND_INDEXED,
                                       count,
                                       &new_dlp,
                                       &new_loop_sz);
        /* --BEGIN ERROR HANDLING-- */
        if (!new_dlp) return -1;
        /* --END ERROR HANDLING-- */

        new_dlp->kind = DLOOP_KIND_INDEXED | DLOOP_FINAL_MASK;

        if (flag == DLOOP_DATALOOP_ALL_BYTES)
        {
            /* blocklengths are modified below */
            new_dlp->el_size = 1;
            new_dlp->el_extent = 1;
            new_dlp->el_type = MPI_BYTE;
        }
        else
        {
            new_dlp->el_size = old_extent;
            new_dlp->el_extent = old_extent;
            new_dlp->el_type = oldtype;
        }
    }
    else
    {
        DLOOP_Dataloop *old_loop_ptr = NULL;
        MPI_Aint old_loop_sz = 0;

        DLOOP_Handle_get_loopptr_macro(oldtype, old_loop_ptr, flag);
        DLOOP_Handle_get_loopsize_macro(oldtype, old_loop_sz, flag);

        /* note: allocated with contig_count, not count -- adjacent blocks
         * are coalesced by the array-copy helper below */
        PREPEND_PREFIX(Dataloop_alloc_and_copy)(DLOOP_KIND_INDEXED,
                                                contig_count,
                                                old_loop_ptr,
                                                old_loop_sz,
                                                &new_dlp,
                                                &new_loop_sz);
        /* --BEGIN ERROR HANDLING-- */
        if (!new_dlp) return -1;
        /* --END ERROR HANDLING-- */

        new_dlp->kind = DLOOP_KIND_INDEXED;
        DLOOP_Handle_get_size_macro(oldtype, new_dlp->el_size);
        DLOOP_Handle_get_extent_macro(oldtype, new_dlp->el_extent);
        DLOOP_Handle_get_basic_type_macro(oldtype, new_dlp->el_type);
    }

    new_dlp->loop_params.i_t.count = contig_count;
    new_dlp->loop_params.i_t.total_blocks = old_type_count;

    /* copy in blocklength and displacement parameters (in that order)
     *
     * regardless of dispinbytes, we store displacements in bytes in loop.
     */
    DLOOP_Type_indexed_array_copy(count,
                                  contig_count,
                                  blocklength_array,
                                  displacement_array,
                                  new_dlp->loop_params.i_t.blocksize_array,
                                  new_dlp->loop_params.i_t.offset_array,
                                  dispinbytes,
                                  old_extent);

    if (is_builtin && (flag == DLOOP_DATALOOP_ALL_BYTES))
    {
        DLOOP_Count *tmp_blklen_array =
            new_dlp->loop_params.i_t.blocksize_array;

        for (i=0; i < contig_count; i++)
        {
            /* increase block lengths so they are in bytes */
            tmp_blklen_array[i] *= old_extent;
        }

        new_dlp->loop_params.i_t.total_blocks *= old_extent;
    }

    *dlp_p = new_dlp;
    *dlsz_p = new_loop_sz;
    *dldepth_p = old_loop_depth + 1;

    return MPI_SUCCESS;
}
/*@
  Dataloop_create_struct - create the dataloop representation for a
  struct datatype

  Input Parameters:
+ count - number of blocks in vector
. blklens - number of elements in each block
. disps - offsets of blocks from start of type in bytes
- oldtypes - types (using handle) of datatypes on which vector is based

  Output Parameters:
+ dlp_p - pointer to address in which to place pointer to new dataloop
- dlsz_p - pointer to address in which to place size of new dataloop

  Return Value:
  0 on success, -1 on failure.

  Notes:
  This function relies on others, like Dataloop_create_indexed, to create
  types in some cases. This call (like all the rest) takes int blklens
  and MPI_Aint displacements, so it's possible to overflow when working
  with a particularly large struct type in some cases. This isn't
  detected or corrected in this code at this time.
@*/
int MPIR_Dataloop_create_struct(DLOOP_Count count,
                                const int *blklens,
                                const MPI_Aint *disps,
                                const DLOOP_Type *oldtypes,
                                DLOOP_Dataloop **dlp_p,
                                MPI_Aint *dlsz_p,
                                int *dldepth_p,
                                int flag)
{
    int err, i, nr_basics = 0, nr_derived = 0, type_pos = 0;

    DLOOP_Type first_basic = MPI_DATATYPE_NULL,
        first_derived = MPI_DATATYPE_NULL;

    /* variables used in general case only */
    int loop_idx, new_loop_depth;
    int old_loop_depth = 0;
    MPI_Aint new_loop_sz, old_loop_sz = 0;

    DLOOP_Dataloop *new_dlp, *curpos;

    /* if count is zero, handle with contig code, call it a int */
    if (count == 0)
    {
        err = MPIR_Dataloop_create_contiguous(0,
                                              MPI_INT,
                                              dlp_p,
                                              dlsz_p,
                                              dldepth_p,
                                              flag);
        return err;
    }

    /* browse the old types and characterize: count basics vs. derived
     * types, and remember whether a single distinct type covers all
     * instances (first_basic / first_derived stay non-NULL only then). */
    for (i=0; i < count; i++)
    {
        /* ignore type elements with a zero blklen */
        if (blklens[i] == 0) continue;

        if (oldtypes[i] != MPI_LB && oldtypes[i] != MPI_UB)
        {
            int is_builtin;

            is_builtin = (DLOOP_Handle_hasloop_macro(oldtypes[i])) ? 0 : 1;

            if (is_builtin)
            {
                if (nr_basics == 0)
                {
                    first_basic = oldtypes[i];
                    type_pos = i;
                }
                else if (oldtypes[i] != first_basic)
                {
                    first_basic = MPI_DATATYPE_NULL;
                }
                nr_basics++;
            }
            else /* derived type */
            {
                if (nr_derived == 0)
                {
                    first_derived = oldtypes[i];
                    type_pos = i;
                }
                else if (oldtypes[i] != first_derived)
                {
                    first_derived = MPI_DATATYPE_NULL;
                }
                nr_derived++;
            }
        }
    }

    /* note on optimizations:
     *
     * because LB, UB, and extent calculations are handled as part of
     * the Datatype, we can safely ignore them in all our calculations
     * here.
     */

    /* optimization:
     *
     * if there were only MPI_LBs and MPI_UBs in the struct type,
     * treat it as a zero-element contiguous (just as count == 0).
     */
    if (nr_basics == 0 && nr_derived == 0)
    {
        err = MPIR_Dataloop_create_contiguous(0,
                                              MPI_INT,
                                              dlp_p,
                                              dlsz_p,
                                              dldepth_p,
                                              flag);
        return err;
    }

    /* optimization:
     *
     * if there is only one unique instance of a type in the struct, treat it
     * as a blockindexed type.
     *
     * notes:
     *
     * if the displacement happens to be zero, the blockindexed code will
     * optimize this into a contig.
     */
    if (nr_basics + nr_derived == 1)
    {
        /* type_pos is index to only real type in array */
        err = MPIR_Dataloop_create_blockindexed
            (1, /* count */
             blklens[type_pos],
             &disps[type_pos],
             1, /* displacement in bytes */
             oldtypes[type_pos],
             dlp_p,
             dlsz_p,
             dldepth_p,
             flag);

        return err;
    }

    /* optimization:
     *
     * if there only one unique type (more than one instance) in the
     * struct, treat it as an indexed type.
     *
     * notes:
     *
     * this will apply to a single type with an LB/UB, as those
     * are handled elsewhere.
     *
     */
    if (((nr_derived == 0) && (first_basic != MPI_DATATYPE_NULL)) ||
        ((nr_basics == 0) && (first_derived != MPI_DATATYPE_NULL)))
    {
        return DLOOP_Dataloop_create_unique_type_struct(count,
                                                        blklens,
                                                        disps,
                                                        oldtypes,
                                                        type_pos,
                                                        dlp_p,
                                                        dlsz_p,
                                                        dldepth_p,
                                                        flag);
    }

    /* optimization:
     *
     * if there are no derived types and caller indicated either a
     * homogeneous system or the "all bytes" conversion, convert
     * everything to bytes and use an indexed type.
     */
    if (nr_derived == 0 && ((flag == DLOOP_DATALOOP_HOMOGENEOUS) ||
                            (flag == DLOOP_DATALOOP_ALL_BYTES)))
    {
        return DLOOP_Dataloop_create_basic_all_bytes_struct(count,
                                                            blklens,
                                                            disps,
                                                            oldtypes,
                                                            dlp_p,
                                                            dlsz_p,
                                                            dldepth_p,
                                                            flag);
    }

    /* optimization:
     *
     * if caller asked for homogeneous or all bytes representation,
     * flatten the type and store it as an indexed type so that
     * there are no branches in the dataloop tree.
     */
    if ((flag == DLOOP_DATALOOP_HOMOGENEOUS) ||
        (flag == DLOOP_DATALOOP_ALL_BYTES))
    {
        return DLOOP_Dataloop_create_flattened_struct(count,
                                                      blklens,
                                                      disps,
                                                      oldtypes,
                                                      dlp_p,
                                                      dlsz_p,
                                                      dldepth_p,
                                                      flag);
    }

    /* scan through types and gather derived type info: the new depth is
     * one more than the deepest child; old_loop_sz accumulates the total
     * space needed to hold copies of all child dataloops. */
    for (i=0; i < count; i++)
    {
        /* ignore type elements with a zero blklen */
        if (blklens[i] == 0) continue;

        if (DLOOP_Handle_hasloop_macro(oldtypes[i]))
        {
            int tmp_loop_depth;
            MPI_Aint tmp_loop_sz;

            DLOOP_Handle_get_loopdepth_macro(oldtypes[i], tmp_loop_depth, flag);
            DLOOP_Handle_get_loopsize_macro(oldtypes[i], tmp_loop_sz, flag);

            if (tmp_loop_depth > old_loop_depth)
            {
                old_loop_depth = tmp_loop_depth;
            }

            old_loop_sz += tmp_loop_sz;
        }
    }

    /* general case below: 2 or more distinct types that are either
     * basics or derived, and for which we want to preserve the types
     * themselves.
     */

    if (nr_basics > 0)
    {
        /* basics introduce an extra level of depth (a contig wrapper is
         * built for each basic below), so the new depth must be at
         * least 2 if there are basics. */
        new_loop_depth = ((old_loop_depth+1) > 2) ? (old_loop_depth+1) : 2;
    }
    else
    {
        new_loop_depth = old_loop_depth + 1;
    }

    MPIR_Dataloop_struct_alloc((DLOOP_Count) nr_basics + nr_derived,
                               old_loop_sz,
                               nr_basics,
                               &curpos,
                               &new_dlp,
                               &new_loop_sz);
    /* --BEGIN ERROR HANDLING-- */
    if (!new_dlp)
    {
        return DLOOP_Dataloop_create_struct_memory_error();
    }
    /* --END ERROR HANDLING-- */

    new_dlp->kind = DLOOP_KIND_STRUCT;
    new_dlp->el_size = -1; /* not valid for struct */
    new_dlp->el_extent = -1; /* not valid for struct; see el_extent_array */
    new_dlp->el_type = MPI_DATATYPE_NULL; /* not valid for struct */

    new_dlp->loop_params.s_t.count = (DLOOP_Count) nr_basics + nr_derived;

    /* note: curpos points to first byte in "old dataloop" region of
     * newly allocated space.
     */

    for (i=0, loop_idx = 0; i < count; i++)
    {
        int is_builtin;

        /* ignore type elements with a zero blklen */
        if (blklens[i] == 0) continue;

        is_builtin = (DLOOP_Handle_hasloop_macro(oldtypes[i])) ? 0 : 1;

        if (is_builtin)
        {
            DLOOP_Dataloop *dummy_dlp;
            int dummy_depth;
            MPI_Aint dummy_sz;

            /* LBs and UBs already taken care of -- skip them */
            if (oldtypes[i] == MPI_LB || oldtypes[i] == MPI_UB)
            {
                continue;
            }

            /* build a contig dataloop for this basic and point to that
             *
             * optimization:
             *
             * push the count (blklen) from the struct down into the
             * contig so we can process more at the leaf.
             */
            err = MPIR_Dataloop_create_contiguous(blklens[i],
                                                  oldtypes[i],
                                                  &dummy_dlp,
                                                  &dummy_sz,
                                                  &dummy_depth,
                                                  flag);

            /* --BEGIN ERROR HANDLING-- */
            if (err) {
                /* TODO: FREE ALLOCATED RESOURCES */
                return -1;
            }
            /* --END ERROR HANDLING-- */

            /* copy the new contig loop into place in the struct memory
             * region
             */
            MPIR_Dataloop_copy(curpos, dummy_dlp, dummy_sz);
            new_dlp->loop_params.s_t.dataloop_array[loop_idx] = curpos;
            curpos = (DLOOP_Dataloop *) ((char *) curpos + dummy_sz);

            /* we stored the block size in the contig -- use 1 here */
            new_dlp->loop_params.s_t.blocksize_array[loop_idx] = 1;
            new_dlp->loop_params.s_t.el_extent_array[loop_idx] =
                ((DLOOP_Offset) blklens[i]) * dummy_dlp->el_extent;
            MPIR_Dataloop_free(&dummy_dlp);
        }
        else
        {
            DLOOP_Dataloop *old_loop_ptr;
            DLOOP_Offset old_extent;

            DLOOP_Handle_get_loopptr_macro(oldtypes[i], old_loop_ptr, flag);
            /* NOTE(review): this reuses (overwrites) the accumulated
             * old_loop_sz from the sizing pass above; at this point it is
             * only needed per-element, so that appears intentional. */
            DLOOP_Handle_get_loopsize_macro(oldtypes[i], old_loop_sz, flag);
            DLOOP_Handle_get_extent_macro(oldtypes[i], old_extent);

            MPIR_Dataloop_copy(curpos, old_loop_ptr, old_loop_sz);
            new_dlp->loop_params.s_t.dataloop_array[loop_idx] = curpos;
            curpos = (DLOOP_Dataloop *) ((char *) curpos + old_loop_sz);

            new_dlp->loop_params.s_t.blocksize_array[loop_idx] =
                (DLOOP_Count) blklens[i];
            new_dlp->loop_params.s_t.el_extent_array[loop_idx] =
                old_extent;
        }
        new_dlp->loop_params.s_t.offset_array[loop_idx] =
            (DLOOP_Offset) disps[i];
        loop_idx++;
    }

    *dlp_p = new_dlp;
    *dlsz_p = new_loop_sz;
    *dldepth_p = new_loop_depth;

    return 0;
}
/*@
  Dataloop_create_vector

  Arguments:
+ int icount
. int iblocklength
. MPI_Aint astride
. int strideinbytes
. MPI_Datatype oldtype
. DLOOP_Dataloop **dlp_p
. int *dlsz_p
. int *dldepth_p
- int flag

  Returns 0 on success, -1 on failure.
@*/
int PREPEND_PREFIX(Dataloop_create_vector)(int icount,
                                           int iblocklength,
                                           MPI_Aint astride,
                                           int strideinbytes,
                                           DLOOP_Type oldtype,
                                           DLOOP_Dataloop **dlp_p,
                                           int *dlsz_p,
                                           int *dldepth_p,
                                           int flag)
{
    int err, is_builtin;
    int new_loop_sz, new_loop_depth;

    DLOOP_Count count, blocklength;
    DLOOP_Offset stride;
    DLOOP_Dataloop *new_dlp;

    count = (DLOOP_Count) icount; /* avoid subsequent casting */
    blocklength = (DLOOP_Count) iblocklength;
    stride = (DLOOP_Offset) astride;

    /* if count or blocklength are zero, handle with contig code,
     * call it a int
     */
    if (count == 0 || blocklength == 0)
    {
        err = PREPEND_PREFIX(Dataloop_create_contiguous)(0,
                                                         MPI_INT,
                                                         dlp_p,
                                                         dlsz_p,
                                                         dldepth_p,
                                                         flag);
        return err;
    }

    /* optimization:
     *
     * if count == 1, store as a contiguous rather than a vector dataloop.
     */
    if (count == 1)
    {
        err = PREPEND_PREFIX(Dataloop_create_contiguous)(iblocklength,
                                                         oldtype,
                                                         dlp_p,
                                                         dlsz_p,
                                                         dldepth_p,
                                                         flag);
        return err;
    }

    is_builtin = (DLOOP_Handle_hasloop_macro(oldtype)) ? 0 : 1;

    /* NOTE(review): new_loop_sz computed here is overwritten by the
     * alloc/alloc_and_copy calls below; only new_loop_depth from this
     * section is actually used -- presumably leftover code, confirm. */
    if (is_builtin)
    {
        new_loop_sz = sizeof(DLOOP_Dataloop);
        new_loop_depth = 1;
    }
    else
    {
        int old_loop_sz = 0, old_loop_depth = 0;

        DLOOP_Handle_get_loopsize_macro(oldtype, old_loop_sz, flag);
        DLOOP_Handle_get_loopdepth_macro(oldtype, old_loop_depth, flag);

        /* TODO: ACCOUNT FOR PADDING IN LOOP_SZ HERE */
        new_loop_sz = sizeof(DLOOP_Dataloop) + old_loop_sz;
        new_loop_depth = old_loop_depth + 1;
    }

    if (is_builtin)
    {
        DLOOP_Offset basic_sz = 0;

        PREPEND_PREFIX(Dataloop_alloc)(DLOOP_KIND_VECTOR,
                                       count,
                                       &new_dlp,
                                       &new_loop_sz);
        /* --BEGIN ERROR HANDLING-- */
        if (!new_dlp) return -1;
        /* --END ERROR HANDLING-- */

        DLOOP_Handle_get_size_macro(oldtype, basic_sz);
        new_dlp->kind = DLOOP_KIND_VECTOR | DLOOP_FINAL_MASK;

        if (flag == DLOOP_DATALOOP_ALL_BYTES)
        {
            /* byte representation: fold element size into block length */
            blocklength *= basic_sz;
            new_dlp->el_size = 1;
            new_dlp->el_extent = 1;
            new_dlp->el_type = MPI_BYTE;

            if(!strideinbytes)
                /* the stride was specified in units of oldtype, now
                 * that we're using bytes, rather than oldtype, we
                 * need to update stride. */
                stride *= basic_sz;
        }
        else
        {
            new_dlp->el_size = basic_sz;
            new_dlp->el_extent = new_dlp->el_size;
            new_dlp->el_type = oldtype;
        }
    }
    else /* user-defined base type (oldtype) */
    {
        DLOOP_Dataloop *old_loop_ptr;
        int old_loop_sz = 0;

        DLOOP_Handle_get_loopptr_macro(oldtype, old_loop_ptr, flag);
        DLOOP_Handle_get_loopsize_macro(oldtype, old_loop_sz, flag);

        PREPEND_PREFIX(Dataloop_alloc_and_copy)(DLOOP_KIND_VECTOR,
                                                count,
                                                old_loop_ptr,
                                                old_loop_sz,
                                                &new_dlp,
                                                &new_loop_sz);
        /* --BEGIN ERROR HANDLING-- */
        if (!new_dlp) return -1;
        /* --END ERROR HANDLING-- */

        new_dlp->kind = DLOOP_KIND_VECTOR;
        DLOOP_Handle_get_size_macro(oldtype, new_dlp->el_size);
        DLOOP_Handle_get_extent_macro(oldtype, new_dlp->el_extent);
        DLOOP_Handle_get_basic_type_macro(oldtype, new_dlp->el_type);
    }

    /* vector-specific members
     *
     * stride stored in dataloop is always in bytes for local rep of type
     */
    new_dlp->loop_params.v_t.count = count;
    new_dlp->loop_params.v_t.blocksize = blocklength;
    new_dlp->loop_params.v_t.stride = (strideinbytes) ? stride :
        stride * new_dlp->el_extent;

    *dlp_p = new_dlp;
    *dlsz_p = new_loop_sz;
    *dldepth_p = new_loop_depth;

    return 0;
}
/* Dataloop_create - top-level entry point: build the dataloop
 * representation for an arbitrary MPI datatype.
 *
 * Dispatches on the type's combiner (from MPI_Type_get_envelope),
 * recursing into constituent types first so their dataloops exist and
 * are cached on the type handles before this type's loop is built.
 */
void PREPEND_PREFIX(Dataloop_create)(MPI_Datatype type,
                                     DLOOP_Dataloop **dlp_p,
                                     int *dlsz_p,
                                     int *dldepth_p,
                                     int flag)
{
    int i;
    int err;

    int nr_ints, nr_aints, nr_types, combiner;
    MPI_Datatype *types;
    int *ints;
    MPI_Aint *aints;

    DLOOP_Dataloop *old_dlp;
    int old_dlsz, old_dldepth;

    int dummy1, dummy2, dummy3, type0_combiner, ndims;
    MPI_Datatype tmptype;

    MPI_Aint stride;
    MPI_Aint *disps;

    MPIR_Type_get_envelope_impl(type, &nr_ints, &nr_aints, &nr_types, &combiner);

    /* some named types do need dataloops; handle separately. */
    if (combiner == MPI_COMBINER_NAMED)
    {
        DLOOP_Dataloop_create_named(type, dlp_p, dlsz_p, dldepth_p, flag);
        return;
    }
    else if (combiner == MPI_COMBINER_F90_REAL ||
             combiner == MPI_COMBINER_F90_COMPLEX ||
             combiner == MPI_COMBINER_F90_INTEGER)
    {
        /* F90 parameterized types reduce to a contig of their base type */
        MPI_Datatype f90basetype;
        DLOOP_Handle_get_basic_type_macro(type, f90basetype);
        PREPEND_PREFIX(Dataloop_create_contiguous)(1 /* count */,
                                                   f90basetype,
                                                   dlp_p, dlsz_p,
                                                   dldepth_p,
                                                   flag);
        return;
    }

    /* Q: should we also check for "hasloop", or is the COMBINER
     *    check above enough to weed out everything that wouldn't
     *    have a loop?
     */
    DLOOP_Handle_get_loopptr_macro(type, old_dlp, flag);
    if (old_dlp != NULL)
    {
        /* dataloop already created; just return it. */
        *dlp_p = old_dlp;
        DLOOP_Handle_get_loopsize_macro(type, *dlsz_p, flag);
        DLOOP_Handle_get_loopdepth_macro(type, *dldepth_p, flag);
        return;
    }

    PREPEND_PREFIX(Type_access_contents)(type, &ints, &aints, &types);

    /* first check for zero count on types where that makes sense */
    switch(combiner) {
        case MPI_COMBINER_CONTIGUOUS:
        case MPI_COMBINER_VECTOR:
        case MPI_COMBINER_HVECTOR_INTEGER:
        case MPI_COMBINER_HVECTOR:
        case MPI_COMBINER_INDEXED_BLOCK:
        case MPI_COMBINER_HINDEXED_BLOCK:
        case MPI_COMBINER_INDEXED:
        case MPI_COMBINER_HINDEXED_INTEGER:
        case MPI_COMBINER_HINDEXED:
        case MPI_COMBINER_STRUCT_INTEGER:
        case MPI_COMBINER_STRUCT:
            if (ints[0] == 0) {
                PREPEND_PREFIX(Dataloop_create_contiguous)(0, MPI_INT,
                                                           dlp_p, dlsz_p,
                                                           dldepth_p, flag);
                goto clean_exit;
            }
            break;
        default:
            break;
    }

    /* recurse, processing types "below" this one before processing
     * this one, if those type don't already have dataloops.
     *
     * note: in the struct case below we'll handle any additional
     *       types "below" the current one.
     */
    MPIR_Type_get_envelope_impl(types[0], &dummy1, &dummy2, &dummy3,
                                &type0_combiner);
    if (type0_combiner != MPI_COMBINER_NAMED)
    {
        DLOOP_Handle_get_loopptr_macro(types[0], old_dlp, flag);
        if (old_dlp == NULL)
        {
            /* no dataloop already present; create and store one */
            PREPEND_PREFIX(Dataloop_create)(types[0],
                                            &old_dlp,
                                            &old_dlsz,
                                            &old_dldepth,
                                            flag);

            DLOOP_Handle_set_loopptr_macro(types[0], old_dlp, flag);
            DLOOP_Handle_set_loopsize_macro(types[0], old_dlsz, flag);
            DLOOP_Handle_set_loopdepth_macro(types[0], old_dldepth, flag);
        }
        else {
            DLOOP_Handle_get_loopsize_macro(types[0], old_dlsz, flag);
            DLOOP_Handle_get_loopdepth_macro(types[0], old_dldepth, flag);
        }
    }

    switch(combiner)
    {
        case MPI_COMBINER_DUP:
            if (type0_combiner != MPI_COMBINER_NAMED) {
                PREPEND_PREFIX(Dataloop_dup)(old_dlp, old_dlsz, dlp_p);
                *dlsz_p     = old_dlsz;
                *dldepth_p  = old_dldepth;
            }
            else {
                PREPEND_PREFIX(Dataloop_create_contiguous)(1,
                                                           types[0],
                                                           dlp_p, dlsz_p,
                                                           dldepth_p,
                                                           flag);
            }
            break;
        case MPI_COMBINER_RESIZED:
            if (type0_combiner != MPI_COMBINER_NAMED) {
                PREPEND_PREFIX(Dataloop_dup)(old_dlp, old_dlsz, dlp_p);
                *dlsz_p     = old_dlsz;
                *dldepth_p  = old_dldepth;
            }
            else {
                PREPEND_PREFIX(Dataloop_create_contiguous)(1,
                                                           types[0],
                                                           dlp_p, dlsz_p,
                                                           dldepth_p,
                                                           flag);
                /* only the named-type branch patches the extent here;
                 * derived types carry extent on their own handle */
                (*dlp_p)->el_extent = aints[1]; /* extent */
            }
            break;
        case MPI_COMBINER_CONTIGUOUS:
            PREPEND_PREFIX(Dataloop_create_contiguous)(ints[0] /* count */,
                                                       types[0] /* oldtype */,
                                                       dlp_p, dlsz_p,
                                                       dldepth_p,
                                                       flag);
            break;
        case MPI_COMBINER_VECTOR:
            PREPEND_PREFIX(Dataloop_create_vector)(ints[0] /* count */,
                                                   ints[1] /* blklen */,
                                                   ints[2] /* stride */,
                                                   0 /* stride not bytes */,
                                                   types[0] /* oldtype */,
                                                   dlp_p, dlsz_p,
                                                   dldepth_p,
                                                   flag);
            break;
        case MPI_COMBINER_HVECTOR_INTEGER:
        case MPI_COMBINER_HVECTOR:
            /* fortran hvector has integer stride in bytes */
            if (combiner == MPI_COMBINER_HVECTOR_INTEGER) {
                stride = (MPI_Aint) ints[2];
            }
            else {
                stride = aints[0];
            }

            PREPEND_PREFIX(Dataloop_create_vector)(ints[0] /* count */,
                                                   ints[1] /* blklen */,
                                                   stride,
                                                   1 /* stride in bytes */,
                                                   types[0] /* oldtype */,
                                                   dlp_p, dlsz_p,
                                                   dldepth_p,
                                                   flag);
            break;
        case MPI_COMBINER_INDEXED_BLOCK:
            PREPEND_PREFIX(Dataloop_create_blockindexed)(ints[0] /* count */,
                                                         ints[1] /* blklen */,
                                                         &ints[2] /* disps */,
                                                         0 /* disp not bytes */,
                                                         types[0] /* oldtype */,
                                                         dlp_p, dlsz_p,
                                                         dldepth_p,
                                                         flag);
            break;
        case MPI_COMBINER_HINDEXED_BLOCK:
            /* NOTE(review): DLOOP_Malloc result is not NULL-checked
             * before the fill loop -- confirm allocator aborts on OOM. */
            disps = (MPI_Aint *) DLOOP_Malloc(ints[0] * sizeof(MPI_Aint));
            for (i = 0; i < ints[0]; i++)
                disps[i] = aints[i];
            PREPEND_PREFIX(Dataloop_create_blockindexed)(ints[0] /* count */,
                                                         ints[1] /* blklen */,
                                                         disps /* disps */,
                                                         1 /* disp is bytes */,
                                                         types[0] /* oldtype */,
                                                         dlp_p, dlsz_p,
                                                         dldepth_p,
                                                         flag);
            DLOOP_Free(disps);
            break;
        case MPI_COMBINER_INDEXED:
            PREPEND_PREFIX(Dataloop_create_indexed)(ints[0] /* count */,
                                                    &ints[1] /* blklens */,
                                                    &ints[ints[0]+1] /* disp */,
                                                    0 /* disp not in bytes */,
                                                    types[0] /* oldtype */,
                                                    dlp_p, dlsz_p,
                                                    dldepth_p,
                                                    flag);
            break;
        case MPI_COMBINER_HINDEXED_INTEGER:
        case MPI_COMBINER_HINDEXED:
            if (combiner == MPI_COMBINER_HINDEXED_INTEGER) {
                /* fortran variant: displacements arrive as ints; widen
                 * them into a temporary MPI_Aint array */
                disps = (MPI_Aint *) DLOOP_Malloc(ints[0] * sizeof(MPI_Aint));

                for (i=0; i < ints[0]; i++) {
                    disps[i] = (MPI_Aint) ints[ints[0] + 1 + i];
                }
            }
            else {
                disps = aints;
            }

            PREPEND_PREFIX(Dataloop_create_indexed)(ints[0] /* count */,
                                                    &ints[1] /* blklens */,
                                                    disps,
                                                    1 /* disp in bytes */,
                                                    types[0] /* oldtype */,
                                                    dlp_p, dlsz_p,
                                                    dldepth_p,
                                                    flag);

            if (combiner == MPI_COMBINER_HINDEXED_INTEGER) {
                DLOOP_Free(disps);
            }

            break;
        case MPI_COMBINER_STRUCT_INTEGER:
        case MPI_COMBINER_STRUCT:
            /* types[0] was already recursed above; process the rest of
             * the member types here so all children are cached first */
            for (i = 1; i < ints[0]; i++) {
                int type_combiner;
                MPIR_Type_get_envelope_impl(types[i], &dummy1, &dummy2,
                                            &dummy3, &type_combiner);

                if (type_combiner != MPI_COMBINER_NAMED) {
                    DLOOP_Handle_get_loopptr_macro(types[i], old_dlp, flag);
                    if (old_dlp == NULL)
                    {
                        PREPEND_PREFIX(Dataloop_create)(types[i],
                                                        &old_dlp,
                                                        &old_dlsz,
                                                        &old_dldepth,
                                                        flag);

                        DLOOP_Handle_set_loopptr_macro(types[i], old_dlp,
                                                       flag);
                        DLOOP_Handle_set_loopsize_macro(types[i], old_dlsz,
                                                        flag);
                        DLOOP_Handle_set_loopdepth_macro(types[i], old_dldepth,
                                                         flag);
                    }
                }
            }
            if (combiner == MPI_COMBINER_STRUCT_INTEGER) {
                /* fortran variant: widen int displacements to MPI_Aint */
                disps = (MPI_Aint *) DLOOP_Malloc(ints[0] * sizeof(MPI_Aint));

                for (i=0; i < ints[0]; i++) {
                    disps[i] = (MPI_Aint) ints[ints[0] + 1 + i];
                }
            }
            else {
                disps = aints;
            }
            err = PREPEND_PREFIX(Dataloop_create_struct)(ints[0] /* count */,
                                                         &ints[1] /* blklens */,
                                                         disps,
                                                         types /* oldtype array */,
                                                         dlp_p, dlsz_p,
                                                         dldepth_p,
                                                         flag);
            /* TODO if/when this function returns error codes, propagate this failure instead */
            DLOOP_Assert(0 == err);
            /* if (err) return err; */

            if (combiner == MPI_COMBINER_STRUCT_INTEGER) {
                DLOOP_Free(disps);
            }
            break;
        case MPI_COMBINER_SUBARRAY:
            /* subarray/darray: convert to an equivalent derived type,
             * build the loop from that, then free the temporary type */
            ndims = ints[0];
            PREPEND_PREFIX(Type_convert_subarray)(ndims,
                                                  &ints[1] /* sizes */,
                                                  &ints[1+ndims] /* subsizes */,
                                                  &ints[1+2*ndims] /* starts */,
                                                  ints[1+3*ndims] /* order */,
                                                  types[0],
                                                  &tmptype);

            PREPEND_PREFIX(Dataloop_create)(tmptype,
                                            dlp_p,
                                            dlsz_p,
                                            dldepth_p,
                                            flag);

            MPIR_Type_free_impl(&tmptype);
            break;
        case MPI_COMBINER_DARRAY:
            ndims = ints[2];
            PREPEND_PREFIX(Type_convert_darray)(ints[0] /* size */,
                                                ints[1] /* rank */,
                                                ndims,
                                                &ints[3] /* gsizes */,
                                                &ints[3+ndims] /*distribs */,
                                                &ints[3+2*ndims] /* dargs */,
                                                &ints[3+3*ndims] /* psizes */,
                                                ints[3+4*ndims] /* order */,
                                                types[0],
                                                &tmptype);

            PREPEND_PREFIX(Dataloop_create)(tmptype,
                                            dlp_p,
                                            dlsz_p,
                                            dldepth_p,
                                            flag);

            MPIR_Type_free_impl(&tmptype);
            break;
        default:
            DLOOP_Assert(0);
            break;
    }

 clean_exit:
    PREPEND_PREFIX(Type_release_contents)(type, &ints, &aints, &types);

    /* for now we just leave the intermediate dataloops in place.
     * could remove them to save space if we wanted.
     */

    return;
}