/* Segment_unpack - unpack data from a contiguous stream buffer into the
 * (possibly noncontiguous) user buffer described by the segment.
 *
 * Parameters:
 * segp      - segment describing the user buffer layout
 * first     - first byte of the stream at which to begin unpacking
 * lastp     - in/out: last byte to unpack; on return, index of the byte
 *             after the last one actually unpacked
 * streambuf - contiguous source buffer
 */
void PREPEND_PREFIX(Segment_unpack)(DLOOP_Segment *segp,
                                    DLOOP_Offset first,
                                    DLOOP_Offset *lastp,
                                    void *streambuf)
{
    struct PREPEND_PREFIX(m2m_params) params;

    DBG_SEGMENT(printf( "Segment_unpack...\n" ));

    /* experimenting with discarding buf value in the segment, keeping in
     * per-use structure instead. would require moving the parameters around a
     * bit. */
    params.userbuf   = segp->ptr;
    params.streambuf = streambuf;
    params.direction = DLOOP_M2M_TO_USERBUF;

    /* bug fix: final argument was the mojibake "¶ms" (an HTML-entity
     * mangling of "&params"); restored to the address of params. */
    PREPEND_PREFIX(Segment_manipulate)(segp, first, lastp,
                                       PREPEND_PREFIX(Segment_contig_m2m),
                                       PREPEND_PREFIX(Segment_vector_m2m),
                                       PREPEND_PREFIX(Segment_blkidx_m2m),
                                       PREPEND_PREFIX(Segment_index_m2m),
                                       NULL, /* size fn */
                                       &params);
    return;
}
/*@
Dataloop_copy - Copy an arbitrary dataloop structure, updating
pointers as necessary

Input Parameters:
+ dest - pointer to destination region
. src - pointer to original dataloop structure
- size - size of dataloop structure

Notes:
Assumes the source dataloop was allocated in the usual way: the whole
structure lives in one contiguous region with the root of the tree first
in the array. That lets us block-copy the entire region (pointers and
all) and then fix up the internal pointers by a constant offset.
@*/
void PREPEND_PREFIX(Dataloop_copy)(void *dest, void *src, DLOOP_Size size)
{
    /* Distance between the two regions; every internal pointer in the
     * copy must be rebased by this amount after the raw copy. */
    DLOOP_Offset shift = (DLOOP_Offset) ((char *) dest - (char *) src);

#ifdef DLOOP_DEBUG_MEMORY
    DLOOP_dbg_printf("DLOOP_Dataloop_copy: copying from %x to %x (%z bytes).\n", (int) src, (int) dest, (size_t)size);
#endif

    /* block-copy the whole contiguous region, pointers included */
    DLOOP_Memcpy(dest, src, size);

    /* traverse the new structure, rebasing internal pointers by shift */
    PREPEND_PREFIX(Dataloop_update)(dest, shift);
}
/* MPID_Segment_mpi_flatten - flatten a type into a representation * appropriate for passing to hindexed create. * * NOTE: blocks will be in units of bytes when returned. * * WARNING: there's potential for overflow here as we convert from * various types into an index of bytes. * * Parameters: * segp - pointer to segment structure * first - first byte in segment to pack * lastp - in/out parameter describing last byte to pack (and afterwards * the last byte _actually_ packed) * NOTE: actually returns index of byte _after_ last one packed * blklens, disps - the usual blocklength and displacement arrays for MPI * lengthp - in/out parameter describing length of array (and afterwards * the amount of the array that has actual data) */ void PREPEND_PREFIX(Segment_mpi_flatten)(DLOOP_Segment *segp, DLOOP_Offset first, DLOOP_Offset *lastp, DLOOP_Size *blklens, MPI_Aint *disps, DLOOP_Size *lengthp) { struct PREPEND_PREFIX(mpi_flatten_params) params; DLOOP_Assert(*lengthp > 0); params.index = 0; params.length = *lengthp; params.blklens = blklens; params.disps = disps; PREPEND_PREFIX(Segment_manipulate)(segp, first, lastp, DLOOP_Leaf_contig_mpi_flatten, DLOOP_Leaf_vector_mpi_flatten, DLOOP_Leaf_blkidx_mpi_flatten, DLOOP_Leaf_index_mpi_flatten, NULL, ¶ms); /* last value already handled by MPID_Segment_manipulate */ *lengthp = params.index; return; }
/* DLOOP_Segment_count_contig_blocks()
 *
 * Count number of contiguous regions in segment between first and last.
 */
void PREPEND_PREFIX(Segment_count_contig_blocks)(DLOOP_Segment *segp,
                                                 DLOOP_Offset first,
                                                 DLOOP_Offset *lastp,
                                                 DLOOP_Count *countp)
{
    struct PREPEND_PREFIX(contig_blocks_params) params;

    params.count    = 0;
    params.last_loc = 0;

    /* FIXME: The blkidx and index functions are not used since they
     * optimize the count by coalescing contiguous segments, while
     * functions using the count do not optimize in the same way
     * (e.g., flatten code) */

    /* bug fix: final argument was the mojibake "(void *) ¶ms" (an
     * HTML-entity mangling of "&params"); restored to the address of
     * params. */
    PREPEND_PREFIX(Segment_manipulate)(segp, first, lastp,
                                       DLOOP_Leaf_contig_count_block,
                                       DLOOP_Leaf_vector_count_block,
                                       DLOOP_Leaf_blkidx_count_block,
                                       DLOOP_Leaf_index_count_block,
                                       NULL, /* size fn */
                                       (void *) &params);

    *countp = params.count;
    return;
}
/*@
Dataloop_alloc - allocate the resources used to store a dataloop with
no old loops associated with it.

Input Parameters:
+ kind - kind of dataloop to allocate
. count - number of elements in dataloop (kind dependent)
. new_loop_p - address at which to store new dataloop pointer
- new_loop_sz_p - pointer to integer in which to store new loop size

Notes:
The count parameter passed into this function will often be different
from the count passed in at the MPI layer due to optimizations.
@*/
void PREPEND_PREFIX(Dataloop_alloc)(int kind,
                                    DLOOP_Count count,
                                    DLOOP_Dataloop **new_loop_p,
                                    MPI_Aint *new_loop_sz_p)
{
    /* Thin wrapper: delegate to the general allocate-and-copy routine
     * with no old loop (NULL pointer, zero size). */
    PREPEND_PREFIX(Dataloop_alloc_and_copy)(kind, count, NULL, 0,
                                            new_loop_p, new_loop_sz_p);
}
/*@
DLOOP_Dataloop_create_named - create a dataloop for a "named" type
if necessary.

"named" types are ones for which MPI_Type_get_envelope() returns a
combiner of MPI_COMBINER_NAMED. some types that fit this category,
such as MPI_SHORT_INT, have multiple elements with potential gaps and
padding. these types need dataloops for correct processing.
@*/
static void DLOOP_Dataloop_create_named(MPI_Datatype type,
                                        DLOOP_Dataloop **dlp_p,
                                        int *dlsz_p,
                                        int *dldepth_p,
                                        int flag)
{
    DLOOP_Dataloop *dlp;

    /* Guard clause: anything other than the pairtypes below needs no
     * dataloop at all. (MPI_2INT is excluded because size == extent in
     * all cases for that type.) */
    if (type != MPI_FLOAT_INT && type != MPI_DOUBLE_INT &&
        type != MPI_LONG_INT && type != MPI_SHORT_INT &&
        type != MPI_LONG_DOUBLE_INT)
    {
        *dlp_p = NULL;
        *dlsz_p = 0;
        *dldepth_p = 0;
        return;
    }

    /* special case: pairtypes need dataloops too.
     *
     * note: MPICH always precreates these, so we will never call
     * Dataloop_create_pairtype() from here in the MPICH case. */
    DLOOP_Handle_get_loopptr_macro(type, dlp, flag);
    if (dlp == NULL) {
        PREPEND_PREFIX(Dataloop_create_pairtype)(type, dlp_p, dlsz_p,
                                                 dldepth_p, flag);
    }
    else {
        /* dataloop already created; just return it. */
        *dlp_p = dlp;
        DLOOP_Handle_get_loopsize_macro(type, *dlsz_p, flag);
        DLOOP_Handle_get_loopdepth_macro(type, *dldepth_p, flag);
    }
}
/*@
Dataloop_create_pairtype - create dataloop for a pairtype

Arguments:
+ MPI_Datatype type - the pairtype
. DLOOP_Dataloop **output_dataloop_ptr
. int output_dataloop_size
. int output_dataloop_depth
- int flag

.N Errors
.N Returns 0 on success, -1 on failure.

Note:
This function simply creates the appropriate input parameters for use
with Dataloop_create_struct and then calls that function.

This same function could be used to create dataloops for any type
that actually consists of two distinct elements.
@*/
int PREPEND_PREFIX(Dataloop_create_pairtype)(MPI_Datatype type,
                                             DLOOP_Dataloop **dlp_p,
                                             int *dlsz_p,
                                             int *dldepth_p,
                                             int flag)
{
    int blocks[2] = { 1, 1 };
    MPI_Aint disps[2];
    MPI_Datatype types[2];

    DLOOP_Assert(type == MPI_FLOAT_INT || type == MPI_DOUBLE_INT ||
                 type == MPI_LONG_INT || type == MPI_SHORT_INT ||
                 type == MPI_LONG_DOUBLE_INT || type == MPI_2INT);

    /* PAIRTYPE_CONTENTS fills in disps[] and types[] for the given
     * element pair. */
    switch(type) {
        case MPI_FLOAT_INT:
            PAIRTYPE_CONTENTS(MPI_FLOAT, float, MPI_INT, int);
            break;
        case MPI_DOUBLE_INT:
            PAIRTYPE_CONTENTS(MPI_DOUBLE, double, MPI_INT, int);
            break;
        case MPI_LONG_INT:
            PAIRTYPE_CONTENTS(MPI_LONG, long, MPI_INT, int);
            break;
        case MPI_SHORT_INT:
            PAIRTYPE_CONTENTS(MPI_SHORT, short, MPI_INT, int);
            break;
        case MPI_LONG_DOUBLE_INT:
            PAIRTYPE_CONTENTS(MPI_LONG_DOUBLE, long double, MPI_INT, int);
            break;
        case MPI_2INT:
            PAIRTYPE_CONTENTS(MPI_INT, int, MPI_INT, int);
            break;
        default:
            /* Unreachable given the assertion above; bail out with the
             * documented failure code rather than calling
             * Dataloop_create_struct with uninitialized disps/types
             * when asserts are compiled out (NDEBUG). */
            return -1;
    }

    return PREPEND_PREFIX(Dataloop_create_struct)(2, blocks, disps, types,
                                                  dlp_p, dlsz_p, dldepth_p,
                                                  flag);
}
/*@
Dataloop_dup - make a copy of a dataloop

Notes:
Despite the original comment claiming "Returns 0 on success, -1 on
failure", this function returns void. Allocation failure is reported
by setting *new_loop_p to NULL; callers must check for that. The new
loop is heap-allocated via DLOOP_Malloc and owned by the caller.
@*/
void PREPEND_PREFIX(Dataloop_dup)(DLOOP_Dataloop *old_loop,
                                  DLOOP_Count old_loop_sz,
                                  DLOOP_Dataloop **new_loop_p)
{
    DLOOP_Dataloop *new_loop;

    DLOOP_Assert(old_loop != NULL);
    DLOOP_Assert(old_loop_sz > 0);

    /* allocate a fresh region of the same size */
    new_loop = (DLOOP_Dataloop *) DLOOP_Malloc(old_loop_sz);
    if (new_loop == NULL) {
        /* out of memory: signal failure through the output pointer */
        *new_loop_p = NULL;
        return;
    }

    /* deep-copy, rebasing internal pointers to the new region */
    PREPEND_PREFIX(Dataloop_copy)(new_loop, old_loop, old_loop_sz);
    *new_loop_p = new_loop;
    return;
}
/*@
Dataloop_contiguous - create the dataloop representation for a
contiguous datatype

Input Parameters:
+ int icount,
. DLOOP_Type oldtype
- int flag

Output Parameters:
+ DLOOP_Dataloop **dlp_p,
. DLOOP_Size *dlsz_p,
- int *dldepth_p,

.N Errors
.N Returns 0 on success, -1 on failure.
@*/
int PREPEND_PREFIX(Dataloop_create_contiguous)(DLOOP_Count icount,
                                               DLOOP_Type oldtype,
                                               DLOOP_Dataloop **dlp_p,
                                               DLOOP_Size *dlsz_p,
                                               int *dldepth_p,
                                               int flag)
{
    DLOOP_Count count;
    int is_builtin, apply_contig_coalescing = 0;
    int new_loop_depth;
    DLOOP_Size new_loop_sz;
    DLOOP_Dataloop *new_dlp;

    count = icount;

    /* builtin types have no dataloop of their own */
    is_builtin = (DLOOP_Handle_hasloop_macro(oldtype)) ? 0 : 1;

    /* First pass: decide the depth of the new loop, and whether we can
     * coalesce a contig-of-contig into a single contig loop. */
    if (is_builtin) {
        new_loop_depth = 1;
    }
    else {
        int old_loop_depth = 0;
        DLOOP_Offset old_size = 0, old_extent = 0;
        DLOOP_Dataloop *old_loop_ptr;

        DLOOP_Handle_get_loopdepth_macro(oldtype, old_loop_depth, flag);
        DLOOP_Handle_get_loopptr_macro(oldtype, old_loop_ptr, flag);
        DLOOP_Handle_get_size_macro(oldtype, old_size);
        DLOOP_Handle_get_extent_macro(oldtype, old_extent);

        /* if we have a simple combination of contigs, coalesce.
         * size == extent means there are no gaps, so a contig of a
         * contig is itself just one contig with a larger count. */
        if (((old_loop_ptr->kind & DLOOP_KIND_MASK) == DLOOP_KIND_CONTIG) &&
            (old_size == old_extent))
        {
            /* will just copy contig and multiply count */
            apply_contig_coalescing = 1;
            new_loop_depth = old_loop_depth;
        }
        else {
            new_loop_depth = old_loop_depth + 1;
        }
    }

    if (is_builtin) {
        DLOOP_Offset basic_sz = 0;

        PREPEND_PREFIX(Dataloop_alloc)(DLOOP_KIND_CONTIG, count, &new_dlp,
                                       &new_loop_sz);
        /* --BEGIN ERROR HANDLING-- */
        if (!new_dlp) return -1;
        /* --END ERROR HANDLING-- */

        DLOOP_Handle_get_size_macro(oldtype, basic_sz);
        new_dlp->kind = DLOOP_KIND_CONTIG | DLOOP_FINAL_MASK;

        if (flag == DLOOP_DATALOOP_ALL_BYTES) {
            /* byte-oriented loop: count is scaled to bytes and the
             * element becomes a single byte */
            count *= basic_sz;
            new_dlp->el_size = 1;
            new_dlp->el_extent = 1;
            new_dlp->el_type = MPI_BYTE;
        }
        else {
            new_dlp->el_size = basic_sz;
            new_dlp->el_extent = new_dlp->el_size;
            new_dlp->el_type = oldtype;
        }

        new_dlp->loop_params.c_t.count = count;
    }
    else {
        /* user-defined base type (oldtype) */
        DLOOP_Dataloop *old_loop_ptr;
        MPI_Aint old_loop_sz = 0;

        DLOOP_Handle_get_loopptr_macro(oldtype, old_loop_ptr, flag);
        DLOOP_Handle_get_loopsize_macro(oldtype, old_loop_sz, flag);

        if (apply_contig_coalescing) {
            /* make a copy of the old loop and multiply the count */
            PREPEND_PREFIX(Dataloop_dup)(old_loop_ptr, old_loop_sz, &new_dlp);
            /* --BEGIN ERROR HANDLING-- */
            if (!new_dlp) return -1;
            /* --END ERROR HANDLING-- */

            new_dlp->loop_params.c_t.count *= count;

            new_loop_sz = old_loop_sz;
            /* depth is unchanged from the old type */
            DLOOP_Handle_get_loopdepth_macro(oldtype, new_loop_depth, flag);
        }
        else {
            /* allocate space for new loop including copy of old */
            PREPEND_PREFIX(Dataloop_alloc_and_copy)(DLOOP_KIND_CONTIG, count,
                                                    old_loop_ptr, old_loop_sz,
                                                    &new_dlp, &new_loop_sz);
            /* --BEGIN ERROR HANDLING-- */
            if (!new_dlp) return -1;
            /* --END ERROR HANDLING-- */

            new_dlp->kind = DLOOP_KIND_CONTIG;
            DLOOP_Handle_get_size_macro(oldtype, new_dlp->el_size);
            DLOOP_Handle_get_extent_macro(oldtype, new_dlp->el_extent);
            DLOOP_Handle_get_basic_type_macro(oldtype, new_dlp->el_type);

            new_dlp->loop_params.c_t.count = count;
        }
    }

    *dlp_p = new_dlp;
    *dlsz_p = new_loop_sz;
    *dldepth_p = new_loop_depth;

    return 0;
}
/* Dataloop_create_indexed - create a dataloop for an (h)indexed type.
 *
 * displacement_array points at int displacements (in units of oldtype
 * extent) when dispinbytes is 0, or MPI_Aint byte displacements when
 * dispinbytes is nonzero. Several degenerate cases are detected and
 * stored as cheaper loop kinds (contig, blockindexed) before falling
 * back to a true indexed dataloop.
 *
 * Returns MPI_SUCCESS (or the result of a delegated create) on success,
 * -1 on allocation failure.
 */
int PREPEND_PREFIX(Dataloop_create_indexed)(DLOOP_Count icount,
                                            const DLOOP_Size *blocklength_array,
                                            const void *displacement_array,
                                            int dispinbytes,
                                            MPI_Datatype oldtype,
                                            DLOOP_Dataloop **dlp_p,
                                            DLOOP_Size *dlsz_p,
                                            int *dldepth_p,
                                            int flag)
{
    int err, is_builtin;
    int old_loop_depth;
    MPI_Aint i;
    DLOOP_Size new_loop_sz, blksz;
    DLOOP_Count first;
    DLOOP_Count old_type_count = 0, contig_count, count;
    DLOOP_Offset old_extent;
    struct DLOOP_Dataloop *new_dlp;

    count = (DLOOP_Count) icount; /* avoid subsequent casting */

    /* if count is zero, handle with contig code, call it an int */
    if (count == 0) {
        err = PREPEND_PREFIX(Dataloop_create_contiguous)(0, MPI_INT, dlp_p,
                                                         dlsz_p, dldepth_p,
                                                         flag);
        return err;
    }

    /* Skip any initial zero-length blocks */
    for (first = 0; first < count; first++)
        if ((DLOOP_Count) blocklength_array[first]) break;

    is_builtin = (DLOOP_Handle_hasloop_macro(oldtype)) ? 0 : 1;

    if (is_builtin) {
        DLOOP_Handle_get_extent_macro(oldtype, old_extent);
        old_loop_depth = 0;
    }
    else {
        DLOOP_Handle_get_extent_macro(oldtype, old_extent);
        DLOOP_Handle_get_loopdepth_macro(oldtype, old_loop_depth, flag);
    }

    /* total number of old-type elements across all (nonzero) blocks */
    for (i=first; i < count; i++) {
        old_type_count += (DLOOP_Count) blocklength_array[i];
    }

    /* number of contiguous regions after merging adjacent blocks */
    contig_count = PREPEND_PREFIX(Type_indexed_count_contig)(count,
                                                             blocklength_array,
                                                             displacement_array,
                                                             dispinbytes,
                                                             old_extent);

    /* if contig_count is zero (no data), handle with contig code */
    if (contig_count == 0) {
        err = PREPEND_PREFIX(Dataloop_create_contiguous)(0, MPI_INT, dlp_p,
                                                         dlsz_p, dldepth_p,
                                                         flag);
        return err;
    }

    /* optimization:
     *
     * if contig_count == 1 and block starts at displacement 0,
     * store it as a contiguous rather than an indexed dataloop.
     */
    if ((contig_count == 1) &&
        ((!dispinbytes && ((int *) displacement_array)[first] == 0) ||
         (dispinbytes && ((MPI_Aint *) displacement_array)[first] == 0)))
    {
        err = PREPEND_PREFIX(Dataloop_create_contiguous)(old_type_count,
                                                         oldtype, dlp_p,
                                                         dlsz_p, dldepth_p,
                                                         flag);
        return err;
    }

    /* optimization:
     *
     * if contig_count == 1 (and displacement != 0), store this as
     * a single element blockindexed rather than a lot of individual
     * blocks.
     */
    if (contig_count == 1) {
        const void *disp_arr_tmp; /* no ternary assignment to avoid clang warnings */
        if (dispinbytes)
            disp_arr_tmp = &(((const MPI_Aint *)displacement_array)[first]);
        else
            disp_arr_tmp = &(((const int *)displacement_array)[first]);
        err = PREPEND_PREFIX(Dataloop_create_blockindexed)(1, old_type_count,
                                                           disp_arr_tmp,
                                                           dispinbytes,
                                                           oldtype, dlp_p,
                                                           dlsz_p, dldepth_p,
                                                           flag);
        return err;
    }

    /* optimization:
     *
     * if block length is the same for all blocks, store it as a
     * blockindexed rather than an indexed dataloop.
     *
     * note: the "blksz--" below deliberately spoils blksz on the first
     * mismatch, so the comparison after the loop only succeeds when
     * every block length matched. */
    blksz = blocklength_array[first];
    for (i = first+1; i < count; i++) {
        if (blocklength_array[i] != blksz) {
            blksz--;
            break;
        }
    }
    if (blksz == blocklength_array[first]) {
        const void *disp_arr_tmp; /* no ternary assignment to avoid clang warnings */
        if (dispinbytes)
            disp_arr_tmp = &(((const MPI_Aint *)displacement_array)[first]);
        else
            disp_arr_tmp = &(((const int *)displacement_array)[first]);
        err = PREPEND_PREFIX(Dataloop_create_blockindexed)(icount-first,
                                                           blksz,
                                                           disp_arr_tmp,
                                                           dispinbytes,
                                                           oldtype, dlp_p,
                                                           dlsz_p, dldepth_p,
                                                           flag);

        return err;
    }

    /* note: blockindexed looks for the vector optimization */

    /* TODO: optimization:
     *
     * if an indexed of a contig, absorb the contig into the blocklen array
     * and keep the same overall depth
     */

    /* otherwise storing as an indexed dataloop */

    if (is_builtin) {
        PREPEND_PREFIX(Dataloop_alloc)(DLOOP_KIND_INDEXED, count, &new_dlp,
                                       &new_loop_sz);
        /* --BEGIN ERROR HANDLING-- */
        if (!new_dlp) return -1;
        /* --END ERROR HANDLING-- */

        new_dlp->kind = DLOOP_KIND_INDEXED | DLOOP_FINAL_MASK;

        if (flag == DLOOP_DATALOOP_ALL_BYTES) {
            /* blocklengths are modified below */
            new_dlp->el_size = 1;
            new_dlp->el_extent = 1;
            new_dlp->el_type = MPI_BYTE;
        }
        else {
            new_dlp->el_size = old_extent;
            new_dlp->el_extent = old_extent;
            new_dlp->el_type = oldtype;
        }
    }
    else {
        DLOOP_Dataloop *old_loop_ptr = NULL;
        MPI_Aint old_loop_sz = 0;

        DLOOP_Handle_get_loopptr_macro(oldtype, old_loop_ptr, flag);
        DLOOP_Handle_get_loopsize_macro(oldtype, old_loop_sz, flag);

        PREPEND_PREFIX(Dataloop_alloc_and_copy)(DLOOP_KIND_INDEXED,
                                                contig_count, old_loop_ptr,
                                                old_loop_sz, &new_dlp,
                                                &new_loop_sz);
        /* --BEGIN ERROR HANDLING-- */
        if (!new_dlp) return -1;
        /* --END ERROR HANDLING-- */

        new_dlp->kind = DLOOP_KIND_INDEXED;
        DLOOP_Handle_get_size_macro(oldtype, new_dlp->el_size);
        DLOOP_Handle_get_extent_macro(oldtype, new_dlp->el_extent);
        DLOOP_Handle_get_basic_type_macro(oldtype, new_dlp->el_type);
    }

    new_dlp->loop_params.i_t.count = contig_count;
    new_dlp->loop_params.i_t.total_blocks = old_type_count;

    /* copy in blocklength and displacement parameters (in that order)
     *
     * regardless of dispinbytes, we store displacements in bytes in loop.
     */
    DLOOP_Type_indexed_array_copy(count, contig_count, blocklength_array,
                                  displacement_array,
                                  new_dlp->loop_params.i_t.blocksize_array,
                                  new_dlp->loop_params.i_t.offset_array,
                                  dispinbytes, old_extent);

    if (is_builtin && (flag == DLOOP_DATALOOP_ALL_BYTES)) {
        DLOOP_Count *tmp_blklen_array =
            new_dlp->loop_params.i_t.blocksize_array;

        for (i=0; i < contig_count; i++) {
            /* increase block lengths so they are in bytes */
            tmp_blklen_array[i] *= old_extent;
        }

        new_dlp->loop_params.i_t.total_blocks *= old_extent;
    }

    *dlp_p = new_dlp;
    *dlsz_p = new_loop_sz;
    *dldepth_p = old_loop_depth + 1;

    return MPI_SUCCESS;
}
/*@
Dataloop_print - dump a dataloop tree to stdout for debugging
purposes

Input Parameters:
+ dataloop - root of tree to dump
- depth - starting depth; used to help keep up with where we are in the tree

Notes:
Recurses into child loops unless DLOOP_FINAL_MASK marks the current
loop as a leaf.
@*/
void PREPEND_PREFIX(Dataloop_print)(struct DLOOP_Dataloop *dataloop,
                                    int depth)
{
    int i;

    if (dataloop == NULL) {
        DLOOP_dbg_printf("dataloop is NULL (probably basic type)\n");
        return;
    }

    DLOOP_dbg_printf("loc=%p, treedepth=%d, kind=%d, el_extent=" DLOOP_OFFSET_FMT_DEC_SPEC "\n",
                     dataloop,
                     (int) depth,
                     (int) dataloop->kind,
                     (DLOOP_Offset) dataloop->el_extent);
    switch(dataloop->kind & DLOOP_KIND_MASK) {
        case DLOOP_KIND_CONTIG:
            DLOOP_dbg_printf("\tCONTIG: count=%d, datatype=%p\n",
                             (int) dataloop->loop_params.c_t.count,
                             dataloop->loop_params.c_t.dataloop);
            if (!(dataloop->kind & DLOOP_FINAL_MASK))
                PREPEND_PREFIX(Dataloop_print)(dataloop->loop_params.c_t.dataloop,
                                               depth+1);
            break;
        case DLOOP_KIND_VECTOR:
            DLOOP_dbg_printf("\tVECTOR: count=%d, blksz=%d, stride=" DLOOP_OFFSET_FMT_DEC_SPEC ", datatype=%p\n",
                             (int) dataloop->loop_params.v_t.count,
                             (int) dataloop->loop_params.v_t.blocksize,
                             (DLOOP_Offset) dataloop->loop_params.v_t.stride,
                             dataloop->loop_params.v_t.dataloop);
            if (!(dataloop->kind & DLOOP_FINAL_MASK))
                PREPEND_PREFIX(Dataloop_print)(dataloop->loop_params.v_t.dataloop,
                                               depth+1);
            break;
        case DLOOP_KIND_BLOCKINDEXED:
            DLOOP_dbg_printf("\tBLOCKINDEXED: count=%d, blksz=%d, datatype=%p\n",
                             (int) dataloop->loop_params.bi_t.count,
                             (int) dataloop->loop_params.bi_t.blocksize,
                             dataloop->loop_params.bi_t.dataloop);
            /* print out offsets later */
            if (!(dataloop->kind & DLOOP_FINAL_MASK))
                PREPEND_PREFIX(Dataloop_print)(dataloop->loop_params.bi_t.dataloop,
                                               depth+1);
            break;
        case DLOOP_KIND_INDEXED:
            DLOOP_dbg_printf("\tINDEXED: count=%d, datatype=%p\n",
                             (int) dataloop->loop_params.i_t.count,
                             dataloop->loop_params.i_t.dataloop);
            /* print out blocksizes and offsets later */
            if (!(dataloop->kind & DLOOP_FINAL_MASK))
                PREPEND_PREFIX(Dataloop_print)(dataloop->loop_params.i_t.dataloop,
                                               depth+1);
            break;
        case DLOOP_KIND_STRUCT:
            DLOOP_dbg_printf("\tSTRUCT: count=%d\n",
                             (int) dataloop->loop_params.s_t.count);
            DLOOP_dbg_printf("\tblocksizes:\n");
            for (i=0; i < dataloop->loop_params.s_t.count; i++)
                DLOOP_dbg_printf("\t\t%d\n",
                                 (int) dataloop->loop_params.s_t.blocksize_array[i]);
            DLOOP_dbg_printf("\toffsets:\n");
            for (i=0; i < dataloop->loop_params.s_t.count; i++)
                DLOOP_dbg_printf("\t\t" DLOOP_OFFSET_FMT_DEC_SPEC "\n",
                                 (DLOOP_Offset) dataloop->loop_params.s_t.offset_array[i]);
            DLOOP_dbg_printf("\tdatatypes:\n");
            for (i=0; i < dataloop->loop_params.s_t.count; i++)
                DLOOP_dbg_printf("\t\t%p\n",
                                 dataloop->loop_params.s_t.dataloop_array[i]);
            /* leaf struct: nothing to recurse into */
            if (dataloop->kind & DLOOP_FINAL_MASK) break;

            for (i=0; i < dataloop->loop_params.s_t.count; i++) {
                PREPEND_PREFIX(Dataloop_print)(dataloop->loop_params.s_t.dataloop_array[i],depth+1);
            }
            break;
        default:
            /* unknown loop kind: structure is corrupt */
            DLOOP_Assert(0);
            break;
    }
    return;
}
/*@
Dataloop_stream_size - return the size of the data described by the
dataloop

Input Parameters:
+ dl_p   - pointer to dataloop for which we will return the size
- sizefn - function for determining size of types in the
           corresponding stream (passing NULL will instead result in
           el_size values being used)
@*/
DLOOP_Offset PREPEND_PREFIX(Dataloop_stream_size)(struct DLOOP_Dataloop *dl_p,
                                                  DLOOP_Offset (*sizefn)(DLOOP_Type el_type))
{
    DLOOP_Offset tmp_sz, tmp_ct = 1;

    /* Walk down the (single-child) chain of loops, accumulating the
     * element count; structs are the one branching kind and are
     * handled by recursion. */
    for (;;) {
        if ((dl_p->kind & DLOOP_KIND_MASK) == DLOOP_KIND_STRUCT) {
            int i;

            tmp_sz = 0;
            for (i = 0; i < dl_p->loop_params.s_t.count; i++) {
                tmp_sz += (DLOOP_Offset)(dl_p->loop_params.s_t.blocksize_array[i]) *
                    PREPEND_PREFIX(Dataloop_stream_size)(dl_p->loop_params.s_t.dataloop_array[i], sizefn);
            }
            return tmp_sz * tmp_ct;
        }

        switch (dl_p->kind & DLOOP_KIND_MASK) {
            case DLOOP_KIND_CONTIG:
                tmp_ct *= (DLOOP_Offset)(dl_p->loop_params.c_t.count);
#ifdef DLOOP_DEBUG_SIZE
                DLOOP_dbg_printf("stream_size: contig: ct = %d; new tot_ct = " DLOOP_OFFSET_FMT_DEC_SPEC "\n",
                                 (int) dl_p->loop_params.c_t.count,
                                 (DLOOP_Offset) tmp_ct);
#endif
                break;
            case DLOOP_KIND_VECTOR:
                tmp_ct *= (DLOOP_Offset)(dl_p->loop_params.v_t.count) *
                    (DLOOP_Offset)(dl_p->loop_params.v_t.blocksize);
#ifdef DLOOP_DEBUG_SIZE
                DLOOP_dbg_printf("stream_size: vector: ct = %d; blk = %d; new tot_ct = " DLOOP_OFFSET_FMT_DEC_SPEC "\n",
                                 (int) dl_p->loop_params.v_t.count,
                                 (int) dl_p->loop_params.v_t.blocksize,
                                 (DLOOP_Offset) tmp_ct);
#endif
                break;
            case DLOOP_KIND_BLOCKINDEXED:
                tmp_ct *= (DLOOP_Offset)(dl_p->loop_params.bi_t.count) *
                    (DLOOP_Offset)(dl_p->loop_params.bi_t.blocksize);
#ifdef DLOOP_DEBUG_SIZE
                DLOOP_dbg_printf("stream_size: blkindexed: blks = %d; new tot_ct = " DLOOP_OFFSET_FMT_DEC_SPEC "\n",
                                 (int) dl_p->loop_params.bi_t.count *
                                 (int) dl_p->loop_params.bi_t.blocksize,
                                 (DLOOP_Offset) tmp_ct);
#endif
                break;
            case DLOOP_KIND_INDEXED:
                tmp_ct *= (DLOOP_Offset)(dl_p->loop_params.i_t.total_blocks);
#ifdef DLOOP_DEBUG_SIZE
                /* bug fix: debug label previously said "contig" for the
                 * indexed case (copy-paste error) */
                DLOOP_dbg_printf("stream_size: indexed: blks = %d; new tot_ct = " DLOOP_OFFSET_FMT_DEC_SPEC "\n",
                                 (int) dl_p->loop_params.i_t.total_blocks,
                                 (DLOOP_Offset) tmp_ct);
#endif
                break;
            default:
                /* --BEGIN ERROR HANDLING-- */
                DLOOP_Assert(0);
                break;
                /* --END ERROR HANDLING-- */
        }

        if (dl_p->kind & DLOOP_FINAL_MASK) break;
        else {
            DLOOP_Assert(dl_p->loop_params.cm_t.dataloop != NULL);
            dl_p = dl_p->loop_params.cm_t.dataloop;
        }
    }

    /* call fn for size using bottom type, or use size if fnptr is NULL */
    tmp_sz = ((sizefn) ? sizefn(dl_p->el_type) : dl_p->el_size);

    return tmp_sz * tmp_ct;
}
/*@
Dataloop_alloc_and_copy - allocate the resources used to store a
dataloop and copy in old dataloop as appropriate

Input Parameters:
+ kind - kind of dataloop to allocate
. count - number of elements in dataloop (kind dependent)
. old_loop - pointer to old dataloop (or NULL for none)
. old_loop_sz - size of old dataloop (should be zero if old_loop is NULL)
. new_loop_p - address at which to store new dataloop pointer
- new_loop_sz_p - pointer to integer in which to store new loop size

Notes:
The count parameter passed into this function will often be different
from the count passed in at the MPI layer.

The new loop is one contiguous allocation: the DLOOP_Dataloop struct
first, then the kind-dependent auxiliary arrays, then a copy of the
old loop at the very end. On allocation failure *new_loop_p is set
to NULL.
@*/
void PREPEND_PREFIX(Dataloop_alloc_and_copy)(int kind,
                                             DLOOP_Count count,
                                             DLOOP_Dataloop *old_loop,
                                             DLOOP_Size old_loop_sz,
                                             DLOOP_Dataloop **new_loop_p,
                                             DLOOP_Size *new_loop_sz_p)
{
    DLOOP_Size new_loop_sz = 0;
    int align_sz = 8; /* default aligns everything to 8-byte boundaries */
    int epsilon;
    DLOOP_Size loop_sz = sizeof(DLOOP_Dataloop);
    DLOOP_Size off_sz = 0, blk_sz = 0, ptr_sz = 0, extent_sz = 0;
    char *pos;
    DLOOP_Dataloop *new_loop;

#ifdef HAVE_MAX_STRUCT_ALIGNMENT
    if (align_sz > HAVE_MAX_STRUCT_ALIGNMENT) {
        align_sz = HAVE_MAX_STRUCT_ALIGNMENT;
    }
#endif

    if (old_loop != NULL) {
        /* old loop sizes are always produced aligned; verify */
        DLOOP_Assert((old_loop_sz % align_sz) == 0);
    }

    /* calculate the space that we actually need for everything.
     * NOTE: the cases below intentionally fall through -- each kind
     * needs its own arrays plus all the arrays of the kinds below it. */
    switch (kind) {
        case DLOOP_KIND_STRUCT:
            /* need space for dataloop pointers and extents */
            ptr_sz = count * sizeof(DLOOP_Dataloop *);
            extent_sz = count * sizeof(DLOOP_Offset);
            /* fallthrough */
        case DLOOP_KIND_INDEXED:
            /* need space for block sizes */
            blk_sz = count * sizeof(DLOOP_Count);
            /* fallthrough */
        case DLOOP_KIND_BLOCKINDEXED:
            /* need space for block offsets */
            off_sz = count * sizeof(DLOOP_Offset);
            /* fallthrough */
        case DLOOP_KIND_CONTIG:
        case DLOOP_KIND_VECTOR:
            break;
        default:
            DLOOP_Assert(0);
    }

    /* pad everything that we're going to allocate */
    epsilon = loop_sz % align_sz;
    if (epsilon) loop_sz += align_sz - epsilon;

    epsilon = off_sz % align_sz;
    if (epsilon) off_sz += align_sz - epsilon;

    epsilon = blk_sz % align_sz;
    if (epsilon) blk_sz += align_sz - epsilon;

    epsilon = ptr_sz % align_sz;
    if (epsilon) ptr_sz += align_sz - epsilon;

    epsilon = extent_sz % align_sz;
    if (epsilon) extent_sz += align_sz - epsilon;

    new_loop_sz += loop_sz + off_sz + blk_sz + ptr_sz + extent_sz + old_loop_sz;

    /* allocate space */
    new_loop = (DLOOP_Dataloop *) DLOOP_Malloc(new_loop_sz);
    if (new_loop == NULL) {
        /* out of memory: report via the output pointer */
        *new_loop_p = NULL;
        return;
    }
#ifdef DLOOP_DEBUG_MEMORY
    DLOOP_dbg_printf("DLOOP_Dataloop_alloc_and_copy: new loop @ %x (tot sz = %z, loop = %z, off = %z, blk = %z, ptr = %z, extent = %z, old = %z)\n",
                     (int) new_loop, new_loop_sz, loop_sz, off_sz, blk_sz,
                     ptr_sz, extent_sz, old_loop_sz);
#endif

    /* set all the pointers in the new dataloop structure */
    switch (kind) {
        case DLOOP_KIND_STRUCT:
            /* order is:
             * - pointers
             * - blocks
             * - offsets
             * - extents
             */
            new_loop->loop_params.s_t.dataloop_array =
                (DLOOP_Dataloop **) (((char *) new_loop) + loop_sz);
            new_loop->loop_params.s_t.blocksize_array =
                (DLOOP_Count *) (((char *) new_loop) + loop_sz + ptr_sz);
            new_loop->loop_params.s_t.offset_array =
                (DLOOP_Offset *) (((char *) new_loop) + loop_sz + ptr_sz + blk_sz);
            new_loop->loop_params.s_t.el_extent_array =
                (DLOOP_Offset *) (((char *) new_loop) + loop_sz + ptr_sz + blk_sz + off_sz);
            break;
        case DLOOP_KIND_INDEXED:
            /* order is:
             * - blocks
             * - offsets
             */
            new_loop->loop_params.i_t.blocksize_array =
                (DLOOP_Count *) (((char *) new_loop) + loop_sz);
            new_loop->loop_params.i_t.offset_array =
                (DLOOP_Offset *) (((char *) new_loop) + loop_sz + blk_sz);
            if (old_loop == NULL) {
                new_loop->loop_params.i_t.dataloop = NULL;
            }
            else {
                /* old loop copy lives at the end of the allocation */
                new_loop->loop_params.i_t.dataloop =
                    (DLOOP_Dataloop *) (((char *) new_loop) + (new_loop_sz - old_loop_sz));
            }
            break;
        case DLOOP_KIND_BLOCKINDEXED:
            new_loop->loop_params.bi_t.offset_array =
                (DLOOP_Offset *) (((char *) new_loop) + loop_sz);
            if (old_loop == NULL) {
                new_loop->loop_params.bi_t.dataloop = NULL;
            }
            else {
                new_loop->loop_params.bi_t.dataloop =
                    (DLOOP_Dataloop *) (((char *) new_loop) + (new_loop_sz - old_loop_sz));
            }
            break;
        case DLOOP_KIND_CONTIG:
            if (old_loop == NULL) {
                new_loop->loop_params.c_t.dataloop = NULL;
            }
            else {
                new_loop->loop_params.c_t.dataloop =
                    (DLOOP_Dataloop *) (((char *) new_loop) + (new_loop_sz - old_loop_sz));
            }
            break;
        case DLOOP_KIND_VECTOR:
            if (old_loop == NULL) {
                new_loop->loop_params.v_t.dataloop = NULL;
            }
            else {
                new_loop->loop_params.v_t.dataloop =
                    (DLOOP_Dataloop *) (((char *) new_loop) + (new_loop_sz - old_loop_sz));
            }
            break;
        default:
            DLOOP_Assert(0);
    }

    pos = ((char *) new_loop) + (new_loop_sz - old_loop_sz);
    if (old_loop != NULL) {
        /* deep-copy the old loop into the tail of the new allocation */
        PREPEND_PREFIX(Dataloop_copy)(pos, old_loop, old_loop_sz);
    }

    *new_loop_p = new_loop;
    *new_loop_sz_p = new_loop_sz;

    return;
}
/* Dataloop_create - build (or look up) the dataloop representation for a
 * derived MPI datatype.
 *
 * Decodes the type's envelope/contents and dispatches on the combiner,
 * recursing so that constituent types get dataloops first.  Results are
 * returned through dlp_p / dlsz_p / dldepth_p; already-built loops are
 * returned directly from the type without rebuilding.
 */
void PREPEND_PREFIX(Dataloop_create)(MPI_Datatype type,
                                     DLOOP_Dataloop **dlp_p,
                                     int *dlsz_p,
                                     int *dldepth_p,
                                     int flag)
{
    int i;
    int err;

    int nr_ints, nr_aints, nr_types, combiner;
    MPI_Datatype *types;
    int *ints;
    MPI_Aint *aints;

    DLOOP_Dataloop *old_dlp;
    int old_dlsz, old_dldepth;

    int dummy1, dummy2, dummy3, type0_combiner, ndims;
    MPI_Datatype tmptype;

    MPI_Aint stride;
    MPI_Aint *disps;

    MPIR_Type_get_envelope_impl(type, &nr_ints, &nr_aints, &nr_types,
                                &combiner);

    /* Builtin (named) types and the F90 parameterized types are handled
     * by dedicated paths; everything below assumes a derived type. */
    if (combiner == MPI_COMBINER_NAMED) {
        DLOOP_Dataloop_create_named(type, dlp_p, dlsz_p, dldepth_p, flag);
        return;
    }
    else if (combiner == MPI_COMBINER_F90_REAL ||
             combiner == MPI_COMBINER_F90_COMPLEX ||
             combiner == MPI_COMBINER_F90_INTEGER)
    {
        MPI_Datatype f90basetype;

        DLOOP_Handle_get_basic_type_macro(type, f90basetype);
        PREPEND_PREFIX(Dataloop_create_contiguous)(1 /* count */,
                                                   f90basetype,
                                                   dlp_p, dlsz_p,
                                                   dldepth_p, flag);
        return;
    }

    /* Q: should we also check for "hasloop", or is the COMBINER
     * check above enough to weed out everything that wouldn't
     * have a loop?
     */
    DLOOP_Handle_get_loopptr_macro(type, old_dlp, flag);
    if (old_dlp != NULL) {
        /* dataloop already created; just return it. */
        *dlp_p = old_dlp;
        DLOOP_Handle_get_loopsize_macro(type, *dlsz_p, flag);
        DLOOP_Handle_get_loopdepth_macro(type, *dldepth_p, flag);
        return;
    }

    PREPEND_PREFIX(Type_access_contents)(type, &ints, &aints, &types);

    /* first check for zero count on types where that makes sense;
     * such types degenerate to an empty contig loop */
    switch (combiner) {
        case MPI_COMBINER_CONTIGUOUS:
        case MPI_COMBINER_VECTOR:
        case MPI_COMBINER_HVECTOR_INTEGER:
        case MPI_COMBINER_HVECTOR:
        case MPI_COMBINER_INDEXED_BLOCK:
        case MPI_COMBINER_HINDEXED_BLOCK:
        case MPI_COMBINER_INDEXED:
        case MPI_COMBINER_HINDEXED_INTEGER:
        case MPI_COMBINER_HINDEXED:
        case MPI_COMBINER_STRUCT_INTEGER:
        case MPI_COMBINER_STRUCT:
            if (ints[0] == 0) {
                PREPEND_PREFIX(Dataloop_create_contiguous)(0, MPI_INT,
                                                           dlp_p, dlsz_p,
                                                           dldepth_p, flag);
                goto clean_exit;
            }
            break;
        default:
            break;
    }

    /* recurse, processing types "below" this one before processing
     * this one, if those types don't already have dataloops.
     *
     * note: in the struct case below we'll handle any additional
     * types "below" the current one.
     */
    MPIR_Type_get_envelope_impl(types[0], &dummy1, &dummy2, &dummy3,
                                &type0_combiner);
    if (type0_combiner != MPI_COMBINER_NAMED) {
        DLOOP_Handle_get_loopptr_macro(types[0], old_dlp, flag);
        if (old_dlp == NULL) {
            /* no dataloop already present; create and store one */
            PREPEND_PREFIX(Dataloop_create)(types[0], &old_dlp, &old_dlsz,
                                            &old_dldepth, flag);
            DLOOP_Handle_set_loopptr_macro(types[0], old_dlp, flag);
            DLOOP_Handle_set_loopsize_macro(types[0], old_dlsz, flag);
            DLOOP_Handle_set_loopdepth_macro(types[0], old_dldepth, flag);
        }
        else {
            DLOOP_Handle_get_loopsize_macro(types[0], old_dlsz, flag);
            DLOOP_Handle_get_loopdepth_macro(types[0], old_dldepth, flag);
        }
    }

    switch (combiner) {
        case MPI_COMBINER_DUP:
            if (type0_combiner != MPI_COMBINER_NAMED) {
                PREPEND_PREFIX(Dataloop_dup)(old_dlp, old_dlsz, dlp_p);
                *dlsz_p    = old_dlsz;
                *dldepth_p = old_dldepth;
            }
            else {
                PREPEND_PREFIX(Dataloop_create_contiguous)(1, types[0],
                                                           dlp_p, dlsz_p,
                                                           dldepth_p, flag);
            }
            break;
        case MPI_COMBINER_RESIZED:
            if (type0_combiner != MPI_COMBINER_NAMED) {
                PREPEND_PREFIX(Dataloop_dup)(old_dlp, old_dlsz, dlp_p);
                *dlsz_p    = old_dlsz;
                *dldepth_p = old_dldepth;
            }
            else {
                PREPEND_PREFIX(Dataloop_create_contiguous)(1, types[0],
                                                           dlp_p, dlsz_p,
                                                           dldepth_p, flag);
                (*dlp_p)->el_extent = aints[1]; /* extent */
            }
            break;
        case MPI_COMBINER_CONTIGUOUS:
            PREPEND_PREFIX(Dataloop_create_contiguous)(ints[0] /* count */,
                                                       types[0] /* oldtype */,
                                                       dlp_p, dlsz_p,
                                                       dldepth_p, flag);
            break;
        case MPI_COMBINER_VECTOR:
            PREPEND_PREFIX(Dataloop_create_vector)(ints[0] /* count */,
                                                   ints[1] /* blklen */,
                                                   ints[2] /* stride */,
                                                   0 /* stride not bytes */,
                                                   types[0] /* oldtype */,
                                                   dlp_p, dlsz_p,
                                                   dldepth_p, flag);
            break;
        case MPI_COMBINER_HVECTOR_INTEGER:
        case MPI_COMBINER_HVECTOR:
            /* fortran hvector has integer stride in bytes */
            if (combiner == MPI_COMBINER_HVECTOR_INTEGER) {
                stride = (MPI_Aint) ints[2];
            }
            else {
                stride = aints[0];
            }
            PREPEND_PREFIX(Dataloop_create_vector)(ints[0] /* count */,
                                                   ints[1] /* blklen */,
                                                   stride,
                                                   1 /* stride in bytes */,
                                                   types[0] /* oldtype */,
                                                   dlp_p, dlsz_p,
                                                   dldepth_p, flag);
            break;
        case MPI_COMBINER_INDEXED_BLOCK:
            PREPEND_PREFIX(Dataloop_create_blockindexed)(ints[0] /* count */,
                                                         ints[1] /* blklen */,
                                                         &ints[2] /* disps */,
                                                         0 /* disp not bytes */,
                                                         types[0] /* oldtype */,
                                                         dlp_p, dlsz_p,
                                                         dldepth_p, flag);
            break;
        case MPI_COMBINER_HINDEXED_BLOCK:
            /* displacements arrive as MPI_Aints; copy into a scratch array
             * for the create routine */
            disps = (MPI_Aint *) DLOOP_Malloc(ints[0] * sizeof(MPI_Aint));
            for (i = 0; i < ints[0]; i++)
                disps[i] = aints[i];
            PREPEND_PREFIX(Dataloop_create_blockindexed)(ints[0] /* count */,
                                                         ints[1] /* blklen */,
                                                         disps /* disps */,
                                                         1 /* disp is bytes */,
                                                         types[0] /* oldtype */,
                                                         dlp_p, dlsz_p,
                                                         dldepth_p, flag);
            DLOOP_Free(disps);
            break;
        case MPI_COMBINER_INDEXED:
            PREPEND_PREFIX(Dataloop_create_indexed)(ints[0] /* count */,
                                                    &ints[1] /* blklens */,
                                                    &ints[ints[0]+1] /* disp */,
                                                    0 /* disp not in bytes */,
                                                    types[0] /* oldtype */,
                                                    dlp_p, dlsz_p,
                                                    dldepth_p, flag);
            break;
        case MPI_COMBINER_HINDEXED_INTEGER:
        case MPI_COMBINER_HINDEXED:
            if (combiner == MPI_COMBINER_HINDEXED_INTEGER) {
                /* fortran form stores displacements as integers following
                 * the blocklengths; widen them to MPI_Aint */
                disps = (MPI_Aint *) DLOOP_Malloc(ints[0] * sizeof(MPI_Aint));
                for (i = 0; i < ints[0]; i++) {
                    disps[i] = (MPI_Aint) ints[ints[0] + 1 + i];
                }
            }
            else {
                disps = aints;
            }
            PREPEND_PREFIX(Dataloop_create_indexed)(ints[0] /* count */,
                                                    &ints[1] /* blklens */,
                                                    disps,
                                                    1 /* disp in bytes */,
                                                    types[0] /* oldtype */,
                                                    dlp_p, dlsz_p,
                                                    dldepth_p, flag);
            if (combiner == MPI_COMBINER_HINDEXED_INTEGER) {
                DLOOP_Free(disps);
            }
            break;
        case MPI_COMBINER_STRUCT_INTEGER:
        case MPI_COMBINER_STRUCT:
            /* ensure every remaining constituent derived type has a
             * dataloop (types[0] was handled above) */
            for (i = 1; i < ints[0]; i++) {
                int type_combiner;

                MPIR_Type_get_envelope_impl(types[i], &dummy1, &dummy2,
                                            &dummy3, &type_combiner);
                if (type_combiner != MPI_COMBINER_NAMED) {
                    DLOOP_Handle_get_loopptr_macro(types[i], old_dlp, flag);
                    if (old_dlp == NULL) {
                        PREPEND_PREFIX(Dataloop_create)(types[i], &old_dlp,
                                                        &old_dlsz,
                                                        &old_dldepth, flag);
                        DLOOP_Handle_set_loopptr_macro(types[i], old_dlp,
                                                       flag);
                        DLOOP_Handle_set_loopsize_macro(types[i], old_dlsz,
                                                        flag);
                        DLOOP_Handle_set_loopdepth_macro(types[i],
                                                         old_dldepth, flag);
                    }
                }
            }
            if (combiner == MPI_COMBINER_STRUCT_INTEGER) {
                disps = (MPI_Aint *) DLOOP_Malloc(ints[0] * sizeof(MPI_Aint));
                for (i = 0; i < ints[0]; i++) {
                    disps[i] = (MPI_Aint) ints[ints[0] + 1 + i];
                }
            }
            else {
                disps = aints;
            }
            err = PREPEND_PREFIX(Dataloop_create_struct)(ints[0] /* count */,
                                                         &ints[1] /* blklens */,
                                                         disps,
                                                         types /* oldtype array */,
                                                         dlp_p, dlsz_p,
                                                         dldepth_p, flag);
            /* TODO if/when this function returns error codes, propagate this
             * failure instead */
            DLOOP_Assert(0 == err);
            /* if (err) return err; */
            if (combiner == MPI_COMBINER_STRUCT_INTEGER) {
                DLOOP_Free(disps);
            }
            break;
        case MPI_COMBINER_SUBARRAY:
            ndims = ints[0];
            PREPEND_PREFIX(Type_convert_subarray)(ndims,
                                                  &ints[1] /* sizes */,
                                                  &ints[1+ndims] /* subsizes */,
                                                  &ints[1+2*ndims] /* starts */,
                                                  ints[1+3*ndims] /* order */,
                                                  types[0], &tmptype);
            PREPEND_PREFIX(Dataloop_create)(tmptype, dlp_p, dlsz_p,
                                            dldepth_p, flag);
            MPIR_Type_free_impl(&tmptype);
            break;
        case MPI_COMBINER_DARRAY:
            ndims = ints[2];
            PREPEND_PREFIX(Type_convert_darray)(ints[0] /* size */,
                                                ints[1] /* rank */,
                                                ndims,
                                                &ints[3] /* gsizes */,
                                                &ints[3+ndims] /* distribs */,
                                                &ints[3+2*ndims] /* dargs */,
                                                &ints[3+3*ndims] /* psizes */,
                                                ints[3+4*ndims] /* order */,
                                                types[0], &tmptype);
            PREPEND_PREFIX(Dataloop_create)(tmptype, dlp_p, dlsz_p,
                                            dldepth_p, flag);
            MPIR_Type_free_impl(&tmptype);
            break;
        default:
            DLOOP_Assert(0);
            break;
    }

 clean_exit:

    PREPEND_PREFIX(Type_release_contents)(type, &ints, &aints, &types);

    /* for now we just leave the intermediate dataloops in place.
     * could remove them to save space if we wanted. */
    return;
}
/*@
  Dataloop_update - update pointers after a copy operation

  Input Parameters:
+ dataloop - pointer to loop to update
- ptrdiff - value indicating offset between old and new pointer values

  Recursively walks a dataloop tree and adds ptrdiff to every embedded
  pointer.  Used after the tree has been relocated as one contiguous
  region (see Dataloop_copy), where all internal pointers still refer to
  the old location.
@*/
void PREPEND_PREFIX(Dataloop_update)(DLOOP_Dataloop *dataloop,
                                     DLOOP_Offset ptrdiff)
{
    /* OPT: only declare these variables down in the Struct case */
    int i;
    DLOOP_Dataloop **looparray;

    switch (dataloop->kind & DLOOP_KIND_MASK) {
        case DLOOP_KIND_CONTIG:
        case DLOOP_KIND_VECTOR:
            /* Contig and vector share the cm_t layout, so one path covers
             * both.  Each pointer fix-up is conceptually just
             *     ((char *) ptr) += ptrdiff;
             * but casting on the LHS draws compiler warnings, hence the
             * verbose round trip through DLOOP_Offset below. */
            if (!(dataloop->kind & DLOOP_FINAL_MASK)) {
                DLOOP_Assert(dataloop->loop_params.cm_t.dataloop);
                DLOOP_Ensure_Offset_fits_in_pointer(DLOOP_VOID_PTR_CAST_TO_OFFSET
                    (char *) dataloop->loop_params.cm_t.dataloop + ptrdiff);
                dataloop->loop_params.cm_t.dataloop = (DLOOP_Dataloop *)
                    DLOOP_OFFSET_CAST_TO_VOID_PTR
                    (DLOOP_VOID_PTR_CAST_TO_OFFSET
                     (char *) dataloop->loop_params.cm_t.dataloop + ptrdiff);
                PREPEND_PREFIX(Dataloop_update)(dataloop->loop_params.cm_t.dataloop,
                                                ptrdiff);
            }
            break;
        case DLOOP_KIND_BLOCKINDEXED:
            /* fix the offset array, then descend unless this is a leaf */
            DLOOP_Assert(dataloop->loop_params.bi_t.offset_array);
            DLOOP_Ensure_Offset_fits_in_pointer(DLOOP_VOID_PTR_CAST_TO_OFFSET
                (char *) dataloop->loop_params.bi_t.offset_array + ptrdiff);
            dataloop->loop_params.bi_t.offset_array = (DLOOP_Offset *)
                DLOOP_OFFSET_CAST_TO_VOID_PTR
                (DLOOP_VOID_PTR_CAST_TO_OFFSET
                 (char *) dataloop->loop_params.bi_t.offset_array + ptrdiff);

            if (!(dataloop->kind & DLOOP_FINAL_MASK)) {
                DLOOP_Assert(dataloop->loop_params.bi_t.dataloop);
                DLOOP_Ensure_Offset_fits_in_pointer(DLOOP_VOID_PTR_CAST_TO_OFFSET
                    (char *) dataloop->loop_params.bi_t.dataloop + ptrdiff);
                dataloop->loop_params.bi_t.dataloop = (DLOOP_Dataloop *)
                    DLOOP_OFFSET_CAST_TO_VOID_PTR
                    (DLOOP_VOID_PTR_CAST_TO_OFFSET
                     (char *) dataloop->loop_params.bi_t.dataloop + ptrdiff);
                PREPEND_PREFIX(Dataloop_update)(dataloop->loop_params.bi_t.dataloop,
                                                ptrdiff);
            }
            break;
        case DLOOP_KIND_INDEXED:
            /* fix blocksize and offset arrays, then descend unless leaf */
            DLOOP_Assert(dataloop->loop_params.i_t.blocksize_array);
            DLOOP_Ensure_Offset_fits_in_pointer(DLOOP_VOID_PTR_CAST_TO_OFFSET
                (char *) dataloop->loop_params.i_t.blocksize_array + ptrdiff);
            dataloop->loop_params.i_t.blocksize_array = (DLOOP_Count *)
                DLOOP_OFFSET_CAST_TO_VOID_PTR
                (DLOOP_VOID_PTR_CAST_TO_OFFSET
                 (char *) dataloop->loop_params.i_t.blocksize_array + ptrdiff);

            DLOOP_Assert(dataloop->loop_params.i_t.offset_array);
            DLOOP_Ensure_Offset_fits_in_pointer(DLOOP_VOID_PTR_CAST_TO_OFFSET
                (char *) dataloop->loop_params.i_t.offset_array + ptrdiff);
            dataloop->loop_params.i_t.offset_array = (DLOOP_Offset *)
                DLOOP_OFFSET_CAST_TO_VOID_PTR
                (DLOOP_VOID_PTR_CAST_TO_OFFSET
                 (char *) dataloop->loop_params.i_t.offset_array + ptrdiff);

            if (!(dataloop->kind & DLOOP_FINAL_MASK)) {
                DLOOP_Assert(dataloop->loop_params.i_t.dataloop);
                DLOOP_Ensure_Offset_fits_in_pointer(DLOOP_VOID_PTR_CAST_TO_OFFSET
                    (char *) dataloop->loop_params.i_t.dataloop + ptrdiff);
                dataloop->loop_params.i_t.dataloop = (DLOOP_Dataloop *)
                    DLOOP_OFFSET_CAST_TO_VOID_PTR
                    (DLOOP_VOID_PTR_CAST_TO_OFFSET
                     (char *) dataloop->loop_params.i_t.dataloop + ptrdiff);
                PREPEND_PREFIX(Dataloop_update)(dataloop->loop_params.i_t.dataloop,
                                                ptrdiff);
            }
            break;
        case DLOOP_KIND_STRUCT:
            /* fix blocksize, offset, and (unless leaf) the per-member
             * dataloop array plus each entry in it */
            DLOOP_Assert(dataloop->loop_params.s_t.blocksize_array);
            DLOOP_Ensure_Offset_fits_in_pointer(DLOOP_VOID_PTR_CAST_TO_OFFSET
                (char *) dataloop->loop_params.s_t.blocksize_array + ptrdiff);
            dataloop->loop_params.s_t.blocksize_array = (DLOOP_Count *)
                DLOOP_OFFSET_CAST_TO_VOID_PTR
                (DLOOP_VOID_PTR_CAST_TO_OFFSET
                 (char *) dataloop->loop_params.s_t.blocksize_array + ptrdiff);

            DLOOP_Assert(dataloop->loop_params.s_t.offset_array);
            DLOOP_Ensure_Offset_fits_in_pointer(DLOOP_VOID_PTR_CAST_TO_OFFSET
                (char *) dataloop->loop_params.s_t.offset_array + ptrdiff);
            dataloop->loop_params.s_t.offset_array = (DLOOP_Offset *)
                DLOOP_OFFSET_CAST_TO_VOID_PTR
                (DLOOP_VOID_PTR_CAST_TO_OFFSET
                 (char *) dataloop->loop_params.s_t.offset_array + ptrdiff);

            if (dataloop->kind & DLOOP_FINAL_MASK)
                break;

            DLOOP_Assert(dataloop->loop_params.s_t.dataloop_array);
            DLOOP_Ensure_Offset_fits_in_pointer(DLOOP_VOID_PTR_CAST_TO_OFFSET
                (char *) dataloop->loop_params.s_t.dataloop_array + ptrdiff);
            dataloop->loop_params.s_t.dataloop_array = (DLOOP_Dataloop **)
                DLOOP_OFFSET_CAST_TO_VOID_PTR
                (DLOOP_VOID_PTR_CAST_TO_OFFSET
                 (char *) dataloop->loop_params.s_t.dataloop_array + ptrdiff);

            /* fix the N dataloop pointers too */
            looparray = dataloop->loop_params.s_t.dataloop_array;
            for (i = 0; i < dataloop->loop_params.s_t.count; i++) {
                DLOOP_Assert(looparray[i]);
                DLOOP_Ensure_Offset_fits_in_pointer(DLOOP_VOID_PTR_CAST_TO_OFFSET
                    (char *) looparray[i] + ptrdiff);
                looparray[i] = (DLOOP_Dataloop *)
                    DLOOP_OFFSET_CAST_TO_VOID_PTR
                    (DLOOP_VOID_PTR_CAST_TO_OFFSET
                     (char *) looparray[i] + ptrdiff);
            }

            for (i = 0; i < dataloop->loop_params.s_t.count; i++) {
                PREPEND_PREFIX(Dataloop_update)(looparray[i], ptrdiff);
            }
            break;
        default:
            /* --BEGIN ERROR HANDLING-- */
            DLOOP_Assert(0);
            break;
            /* --END ERROR HANDLING-- */
    }
    return;
}
/*@
   Dataloop_create_vector - build a vector dataloop

   Arguments:
+  int icount
.  int iblocklength
.  MPI_Aint astride
.  int strideinbytes
.  MPI_Datatype oldtype
.  DLOOP_Dataloop **dlp_p
.  int *dlsz_p
.  int *dldepth_p
-  int flag

   Degenerate inputs (zero count/blocklength, or count of one) are stored
   as contig loops instead.  The stride stored in the dataloop is always
   expressed in bytes, regardless of strideinbytes.

   Returns 0 on success, -1 on failure.
@*/
int PREPEND_PREFIX(Dataloop_create_vector)(int icount,
                                           int iblocklength,
                                           MPI_Aint astride,
                                           int strideinbytes,
                                           DLOOP_Type oldtype,
                                           DLOOP_Dataloop **dlp_p,
                                           int *dlsz_p,
                                           int *dldepth_p,
                                           int flag)
{
    int err, is_builtin;
    int new_loop_sz, new_loop_depth;

    DLOOP_Count count, blocklength;
    DLOOP_Offset stride;
    DLOOP_Dataloop *new_dlp;

    /* widen once up front to avoid casting everywhere below */
    count       = (DLOOP_Count) icount;
    blocklength = (DLOOP_Count) iblocklength;
    stride      = (DLOOP_Offset) astride;

    /* a zero count or blocklength means no data at all: store it as an
     * empty contig of MPI_INT */
    if (count == 0 || blocklength == 0) {
        err = PREPEND_PREFIX(Dataloop_create_contiguous)(0, MPI_INT,
                                                         dlp_p, dlsz_p,
                                                         dldepth_p, flag);
        return err;
    }

    /* optimization: a single-block vector is just a contig */
    if (count == 1) {
        err = PREPEND_PREFIX(Dataloop_create_contiguous)(iblocklength,
                                                         oldtype,
                                                         dlp_p, dlsz_p,
                                                         dldepth_p, flag);
        return err;
    }

    is_builtin = (DLOOP_Handle_hasloop_macro(oldtype)) ? 0 : 1;

    if (is_builtin) {
        new_loop_sz    = sizeof(DLOOP_Dataloop);
        new_loop_depth = 1;
    }
    else {
        int old_loop_sz = 0, old_loop_depth = 0;

        DLOOP_Handle_get_loopsize_macro(oldtype, old_loop_sz, flag);
        DLOOP_Handle_get_loopdepth_macro(oldtype, old_loop_depth, flag);

        /* TODO: ACCOUNT FOR PADDING IN LOOP_SZ HERE */
        new_loop_sz    = sizeof(DLOOP_Dataloop) + old_loop_sz;
        new_loop_depth = old_loop_depth + 1;
    }

    if (is_builtin) {
        DLOOP_Offset basic_sz = 0;

        PREPEND_PREFIX(Dataloop_alloc)(DLOOP_KIND_VECTOR, count, &new_dlp,
                                       &new_loop_sz);
        /* --BEGIN ERROR HANDLING-- */
        if (!new_dlp)
            return -1;
        /* --END ERROR HANDLING-- */

        DLOOP_Handle_get_size_macro(oldtype, basic_sz);
        new_dlp->kind = DLOOP_KIND_VECTOR | DLOOP_FINAL_MASK;

        if (flag == DLOOP_DATALOOP_ALL_BYTES) {
            /* flatten the element type to bytes: the blocklength (and a
             * stride given in units of oldtype) must be rescaled by the
             * element size */
            blocklength      *= basic_sz;
            new_dlp->el_size   = 1;
            new_dlp->el_extent = 1;
            new_dlp->el_type   = MPI_BYTE;

            if (!strideinbytes)
                stride *= basic_sz;
        }
        else {
            new_dlp->el_size   = basic_sz;
            new_dlp->el_extent = new_dlp->el_size;
            new_dlp->el_type   = oldtype;
        }
    }
    else {
        /* user-defined base type (oldtype): embed a copy of its loop */
        DLOOP_Dataloop *old_loop_ptr;
        int old_loop_sz = 0;

        DLOOP_Handle_get_loopptr_macro(oldtype, old_loop_ptr, flag);
        DLOOP_Handle_get_loopsize_macro(oldtype, old_loop_sz, flag);

        PREPEND_PREFIX(Dataloop_alloc_and_copy)(DLOOP_KIND_VECTOR, count,
                                                old_loop_ptr, old_loop_sz,
                                                &new_dlp, &new_loop_sz);
        /* --BEGIN ERROR HANDLING-- */
        if (!new_dlp)
            return -1;
        /* --END ERROR HANDLING-- */

        new_dlp->kind = DLOOP_KIND_VECTOR;
        DLOOP_Handle_get_size_macro(oldtype, new_dlp->el_size);
        DLOOP_Handle_get_extent_macro(oldtype, new_dlp->el_extent);
        DLOOP_Handle_get_basic_type_macro(oldtype, new_dlp->el_type);
    }

    /* vector-specific members
     *
     * stride stored in dataloop is always in bytes for local rep of type
     */
    new_dlp->loop_params.v_t.count     = count;
    new_dlp->loop_params.v_t.blocksize = blocklength;
    new_dlp->loop_params.v_t.stride    = (strideinbytes) ? stride :
        stride * new_dlp->el_extent;

    *dlp_p     = new_dlp;
    *dlsz_p    = new_loop_sz;
    *dldepth_p = new_loop_depth;

    return 0;
}
/* DLOOP_Type_calc_footprint_struct - calculate size, lb, ub, extent, and
 * alignsize for a struct type
 *
 * Accumulates the footprint of every member with a nonzero blocklength.
 * "Sticky" LB/UB markers (MPI_LB / MPI_UB members) pin the computed
 * bounds: once one is seen, only sticky values may move that bound.
 * True bounds (actual data extent) ignore stickiness entirely.
 *
 * Fixes relative to the previous version:
 * - the true_lb / true_ub trackers used inverted comparisons, keeping
 *   the MAXIMUM true_lb and MINIMUM true_ub instead of min/max;
 * - the all-zero-blocklength early return left tfp->true_lb and
 *   tfp->true_ub uninitialized.
 */
static void DLOOP_Type_calc_footprint_struct(MPI_Datatype type,
                                             int struct_combiner,
                                             int *ints,
                                             MPI_Aint *aints,
                                             MPI_Datatype *types,
                                             DLOOP_Type_footprint *tfp)
{
    int i, found_sticky_lb = 0, found_sticky_ub = 0, first_iter = 1;
    DLOOP_Offset tmp_lb, tmp_ub, tmp_extent, tmp_true_lb, tmp_true_ub;
    DLOOP_Offset max_alignsz = 0, tmp_size = 0, min_lb = 0, max_ub = 0;
    DLOOP_Offset min_true_lb = 0, max_true_ub = 0;
    int nr_ints, nr_aints, nr_types, combiner;

    /* used to store parameters for constituent types */
    DLOOP_Type_footprint cfp;
    DLOOP_Offset size, lb, ub, true_lb, true_ub, extent, alignsz;
    int sticky_lb, sticky_ub;

    /* find first non-zero blocklength element */
    for (i = 0; i < ints[0] && ints[i+1] == 0; i++);

    if (i == ints[0]) {
        /* all zero-length blocks: footprint is empty; zero every output
         * field (including the true bounds) */
        tfp->size = tfp->lb = tfp->ub = tfp->extent = tfp->alignsz = 0;
        tfp->true_lb = tfp->true_ub = 0;
        tfp->has_sticky_lb = tfp->has_sticky_ub = 0;
        return;
    }

    for (; i < ints[0]; i++) {
        /* skip zero blocklength elements */
        if (ints[i+1] == 0)
            continue;

        MPIR_Type_get_envelope_impl(types[i], &nr_ints, &nr_aints,
                                    &nr_types, &combiner);

        /* opt: could just inline assignments for combiner == NAMED case */
        PREPEND_PREFIX(Type_calc_footprint)(types[i], &cfp);
        size      = cfp.size;
        lb        = cfp.lb;
        ub        = cfp.ub;
        true_lb   = cfp.true_lb;
        true_ub   = cfp.true_ub;
        extent    = cfp.extent;
        alignsz   = cfp.alignsz;
        sticky_lb = cfp.has_sticky_lb;
        sticky_ub = cfp.has_sticky_ub;

        DLOOP_DATATYPE_BLOCK_LB_UB(ints[i+1] /* blklen */,
                                   aints[i] /* disp */,
                                   lb, ub, extent,
                                   tmp_lb, tmp_ub);
        tmp_true_lb = tmp_lb + (true_lb - lb);
        tmp_true_ub = tmp_ub + (true_ub - ub);

        tmp_size += size * (DLOOP_Offset) ints[i+1];

        if (combiner == MPI_COMBINER_NAMED) {
            /* NOTE: This is a special case. If a user creates a struct
             * with a named type at a non-zero displacement, the
             * alignment may be different than expected due to
             * special compiler rules for this case. Thus we must
             * over-ride the value that we obtained from
             * Type_calc_footprint() above.
             */
            alignsz = DLOOP_Named_type_alignsize(types[i], aints[i]);
        }

        if (max_alignsz < alignsz)
            max_alignsz = alignsz;

        /* We save this LB if:
         * (1) this is our first iteration where we saw a nonzero blklen,
         * (2) we haven't found a sticky LB and this LB is lower than
         *     any we have previously seen,
         * (3) we haven't found a sticky LB and this one is sticky, or
         * (4) this sticky LB is lower than any we have previously seen.
         */
        if ((first_iter) ||
            (!found_sticky_lb && min_lb > tmp_lb) ||
            (!found_sticky_lb && sticky_lb) ||
            (sticky_lb && min_lb > tmp_lb))
        {
            min_lb = tmp_lb;
            if (sticky_lb)
                found_sticky_lb = 1;
        }

        if ((first_iter) ||
            (!found_sticky_ub && max_ub < tmp_ub) ||
            (!found_sticky_ub && sticky_ub) ||
            (sticky_ub && max_ub < tmp_ub))
        {
            max_ub = tmp_ub;
            if (sticky_ub)
                found_sticky_ub = 1;
        }

        /* True bounds are not subject to stickiness: track the smallest
         * true_lb and the largest true_ub seen so far.  (The previous
         * code had these comparisons inverted.) */
        if ((first_iter) || (tmp_true_lb < min_true_lb)) {
            min_true_lb = tmp_true_lb;
        }
        if ((first_iter) || (tmp_true_ub > max_true_ub)) {
            max_true_ub = tmp_true_ub;
        }

        first_iter = 0;
    }

    /* calculate extent, not including potential padding */
    tmp_extent = max_ub - min_lb;

    /* account for padding if no sticky LB/UB is found */
    if ((!found_sticky_lb) && (!found_sticky_ub)) {
        DLOOP_Offset epsilon;

        epsilon = (max_alignsz > 0) ? tmp_extent % max_alignsz : 0;
        if (epsilon) {
            max_ub += (max_alignsz - epsilon);
            tmp_extent = max_ub - min_lb;
        }
    }

    tfp->size          = tmp_size;
    tfp->lb            = min_lb;
    tfp->ub            = max_ub;
    tfp->true_lb       = min_true_lb;
    tfp->true_ub       = max_true_ub;
    tfp->extent        = tmp_extent;
    tfp->alignsz       = max_alignsz;
    tfp->has_sticky_lb = found_sticky_lb;
    tfp->has_sticky_ub = found_sticky_ub;
    return;
}
/* Type_calc_footprint - compute size, lb/ub, true_lb/true_ub, extent,
 * alignment, and sticky-bound flags for a datatype, by recursing on the
 * combiner structure returned from the type's envelope/contents.
 *
 * Fixes relative to the previous version:
 * - MPI_COMBINER_HINDEXED was missing from the zero-count knock-out
 *   list, so a zero-count hindexed type fell through to the general
 *   indexed path and left tfp->true_lb / true_ub unset;
 * - the HINDEXED_BLOCK case read displacements from ints[] where the
 *   envelope supplies them in aints[] (consistent with how
 *   Dataloop_create handles HINDEXED_BLOCK);
 * - the indexed-family all-zero-blocklength branch now also zeroes
 *   true_lb / true_ub;
 * - removed an unused local (mpi_errno).
 */
void PREPEND_PREFIX(Type_calc_footprint)(MPI_Datatype type,
                                         DLOOP_Type_footprint *tfp)
{
    int nr_ints, nr_aints, nr_types, combiner;
    int *ints;
    MPI_Aint *aints;
    MPI_Datatype *types;

    /* used to store parameters for constituent types */
    DLOOP_Offset size = 0, lb = 0, ub = 0, true_lb = 0, true_ub = 0;
    DLOOP_Offset extent = 0, alignsz = 0;
    int has_sticky_lb, has_sticky_ub;

    /* used for vector/hvector/hvector_integer calculations */
    DLOOP_Offset stride;

    /* used for indexed/hindexed calculations */
    DLOOP_Offset disp;

    /* used for calculations on types with more than one block of data */
    DLOOP_Offset i, min_lb, max_ub, ntypes, tmp_lb, tmp_ub;

    /* used for processing subarray and darray types */
    int ndims;
    MPI_Datatype tmptype;

    MPIR_Type_get_envelope_impl(type, &nr_ints, &nr_aints, &nr_types,
                                &combiner);

    if (combiner == MPI_COMBINER_NAMED) {
        int mpisize;
        MPI_Aint mpiextent;

        MPIR_Type_size_impl(type, &mpisize);
        MPIR_Type_extent_impl(type, &mpiextent);
        tfp->size    = (DLOOP_Offset) mpisize;
        tfp->lb      = 0;
        tfp->ub      = (DLOOP_Offset) mpiextent;
        tfp->true_lb = 0;
        tfp->true_ub = (DLOOP_Offset) mpiextent;
        tfp->extent  = (DLOOP_Offset) mpiextent;
        tfp->alignsz = DLOOP_Named_type_alignsize(type, (MPI_Aint) 0);
        tfp->has_sticky_lb = (type == MPI_LB) ? 1 : 0;
        tfp->has_sticky_ub = (type == MPI_UB) ? 1 : 0;

        goto clean_exit;
    }

    /* get access to contents; need it immediately to check for zero count */
    PREPEND_PREFIX(Type_access_contents)(type, &ints, &aints, &types);

    /* knock out all the zero count cases (HINDEXED included, so that its
     * zero-count form cannot leave true_lb/true_ub unset below) */
    if ((combiner == MPI_COMBINER_CONTIGUOUS ||
         combiner == MPI_COMBINER_VECTOR ||
         combiner == MPI_COMBINER_HVECTOR_INTEGER ||
         combiner == MPI_COMBINER_HVECTOR ||
         combiner == MPI_COMBINER_INDEXED_BLOCK ||
         combiner == MPI_COMBINER_HINDEXED_BLOCK ||
         combiner == MPI_COMBINER_INDEXED ||
         combiner == MPI_COMBINER_HINDEXED_INTEGER ||
         combiner == MPI_COMBINER_HINDEXED ||
         combiner == MPI_COMBINER_STRUCT_INTEGER ||
         combiner == MPI_COMBINER_STRUCT) && ints[0] == 0)
    {
        tfp->size = tfp->lb = tfp->ub = tfp->extent = tfp->alignsz = 0;
        tfp->true_lb = tfp->true_ub = 0;
        tfp->has_sticky_lb = tfp->has_sticky_ub = 0;
        goto clean_exit;
    }

    if (combiner != MPI_COMBINER_STRUCT &&
        combiner != MPI_COMBINER_STRUCT_INTEGER)
    {
        DLOOP_Type_footprint cfp;

        PREPEND_PREFIX(Type_calc_footprint)(types[0], &cfp);
        size          = cfp.size;
        lb            = cfp.lb;
        ub            = cfp.ub;
        true_lb       = cfp.true_lb;
        true_ub       = cfp.true_ub;
        extent        = cfp.extent;
        alignsz       = cfp.alignsz;
        has_sticky_lb = cfp.has_sticky_lb;
        has_sticky_ub = cfp.has_sticky_ub;

        /* initialize some common values so we don't have to assign
         * them in every case below. */
        tfp->alignsz       = alignsz;
        tfp->has_sticky_lb = has_sticky_lb;
        tfp->has_sticky_ub = has_sticky_ub;
    }

    switch (combiner) {
        case MPI_COMBINER_DUP:
            tfp->size    = size;
            tfp->lb      = lb;
            tfp->ub      = ub;
            tfp->true_lb = true_lb;
            tfp->true_ub = true_ub;
            tfp->extent  = extent;
            break;
        case MPI_COMBINER_RESIZED:
            tfp->size    = size;
            tfp->lb      = aints[0]; /* lb */
            tfp->ub      = aints[0] + aints[1];
            tfp->true_lb = true_lb;
            tfp->true_ub = true_ub;
            tfp->extent  = aints[1]; /* extent */
            tfp->has_sticky_lb = 1;
            tfp->has_sticky_ub = 1;
            break;
        case MPI_COMBINER_CONTIGUOUS:
            DLOOP_DATATYPE_CONTIG_LB_UB(ints[0] /* count */,
                                        lb, ub, extent,
                                        tfp->lb, tfp->ub);
            tfp->true_lb = tfp->lb + (true_lb - lb);
            tfp->true_ub = tfp->ub + (true_ub - ub);
            tfp->size    = (DLOOP_Offset) ints[0] * size;
            tfp->extent  = tfp->ub - tfp->lb;
            break;
        case MPI_COMBINER_VECTOR:
        case MPI_COMBINER_HVECTOR:
        case MPI_COMBINER_HVECTOR_INTEGER:
            if (combiner == MPI_COMBINER_VECTOR)
                stride = (DLOOP_Offset) ints[2] * extent;
            else if (combiner == MPI_COMBINER_HVECTOR)
                stride = aints[0];
            else /* HVECTOR_INTEGER */
                stride = (DLOOP_Offset) ints[2];

            DLOOP_DATATYPE_VECTOR_LB_UB(ints[0] /* count */,
                                        stride /* stride in bytes */,
                                        ints[1] /* blklen */,
                                        lb, ub, extent,
                                        tfp->lb, tfp->ub);
            tfp->true_lb = tfp->lb + (true_lb - lb);
            tfp->true_ub = tfp->ub + (true_ub - ub);
            tfp->size    = (DLOOP_Offset) ints[0] *
                           (DLOOP_Offset) ints[1] * size;
            tfp->extent  = tfp->ub - tfp->lb;
            break;
        case MPI_COMBINER_INDEXED_BLOCK:
            /* prime min_lb and max_ub */
            DLOOP_DATATYPE_BLOCK_LB_UB(ints[1] /* blklen */,
                                       (DLOOP_Offset) ints[2] * extent /* disp */,
                                       lb, ub, extent,
                                       min_lb, max_ub);
            for (i = 1; i < ints[0]; i++) {
                DLOOP_DATATYPE_BLOCK_LB_UB(ints[1] /* blklen */,
                                           (DLOOP_Offset) ints[i+2] * extent /* disp */,
                                           lb, ub, extent,
                                           tmp_lb, tmp_ub);
                if (tmp_lb < min_lb)
                    min_lb = tmp_lb;
                if (tmp_ub > max_ub)
                    max_ub = tmp_ub;
            }
            tfp->size    = (DLOOP_Offset) ints[0] *
                           (DLOOP_Offset) ints[1] * size;
            tfp->lb      = min_lb;
            tfp->ub      = max_ub;
            tfp->true_lb = min_lb + (true_lb - lb);
            tfp->true_ub = max_ub + (true_ub - ub);
            tfp->extent  = tfp->ub - tfp->lb;
            break;
        case MPI_COMBINER_HINDEXED_BLOCK:
            /* byte displacements come from aints[], not ints[]
             * (consistent with Dataloop_create's handling) */
            /* prime min_lb and max_ub */
            DLOOP_DATATYPE_BLOCK_LB_UB(ints[1] /* blklen */,
                                       aints[0] /* disp */,
                                       lb, ub, extent,
                                       min_lb, max_ub);
            for (i = 1; i < ints[0]; i++) {
                DLOOP_DATATYPE_BLOCK_LB_UB(ints[1] /* blklen */,
                                           aints[i] /* disp */,
                                           lb, ub, extent,
                                           tmp_lb, tmp_ub);
                if (tmp_lb < min_lb)
                    min_lb = tmp_lb;
                if (tmp_ub > max_ub)
                    max_ub = tmp_ub;
            }
            tfp->size    = (DLOOP_Offset) ints[0] *
                           (DLOOP_Offset) ints[1] * size;
            tfp->lb      = min_lb;
            tfp->ub      = max_ub;
            tfp->true_lb = min_lb + (true_lb - lb);
            tfp->true_ub = max_ub + (true_ub - ub);
            tfp->extent  = tfp->ub - tfp->lb;
            break;
        case MPI_COMBINER_INDEXED:
        case MPI_COMBINER_HINDEXED_INTEGER:
        case MPI_COMBINER_HINDEXED:
            /* find first non-zero blocklength element */
            for (i = 0; i < ints[0] && ints[i+1] == 0; i++);

            if (i == ints[0]) {
                /* all zero blocklengths: zero every output field */
                tfp->size = tfp->lb = tfp->ub = tfp->extent =
                    tfp->alignsz = 0;
                tfp->true_lb = tfp->true_ub = 0;
                tfp->has_sticky_lb = tfp->has_sticky_ub = 0;
            }
            else {
                /* prime min_lb, max_ub, count */
                ntypes = ints[i+1];
                if (combiner == MPI_COMBINER_INDEXED)
                    disp = (DLOOP_Offset) ints[ints[0]+i+1] * extent;
                else if (combiner == MPI_COMBINER_HINDEXED_INTEGER)
                    disp = (DLOOP_Offset) ints[ints[0]+i+1];
                else /* MPI_COMBINER_HINDEXED */
                    disp = aints[i];

                DLOOP_DATATYPE_BLOCK_LB_UB(ints[i+1] /* blklen */,
                                           disp,
                                           lb, ub, extent,
                                           min_lb, max_ub);

                for (i++; i < ints[0]; i++) {
                    /* skip zero blocklength elements */
                    if (ints[i+1] == 0)
                        continue;

                    ntypes += ints[i+1];
                    if (combiner == MPI_COMBINER_INDEXED)
                        disp = (DLOOP_Offset) ints[ints[0]+i+1] * extent;
                    else if (combiner == MPI_COMBINER_HINDEXED_INTEGER)
                        disp = (DLOOP_Offset) ints[ints[0]+i+1];
                    else /* MPI_COMBINER_HINDEXED */
                        disp = aints[i];

                    DLOOP_DATATYPE_BLOCK_LB_UB(ints[i+1], disp,
                                               lb, ub, extent,
                                               tmp_lb, tmp_ub);
                    if (tmp_lb < min_lb)
                        min_lb = tmp_lb;
                    if (tmp_ub > max_ub)
                        max_ub = tmp_ub;
                }

                tfp->size    = ntypes * size;
                tfp->lb      = min_lb;
                tfp->ub      = max_ub;
                tfp->true_lb = min_lb + (true_lb - lb);
                tfp->true_ub = max_ub + (true_ub - ub);
                tfp->extent  = tfp->ub - tfp->lb;
            }
            break;
        case MPI_COMBINER_STRUCT_INTEGER:
            /* not supported in this path */
            DLOOP_Assert(combiner != MPI_COMBINER_STRUCT_INTEGER);
            break;
        case MPI_COMBINER_STRUCT:
            /* sufficiently complicated to pull out into separate fn */
            DLOOP_Type_calc_footprint_struct(type, combiner, ints, aints,
                                             types, tfp);
            break;
        case MPI_COMBINER_SUBARRAY:
            ndims = ints[0];
            PREPEND_PREFIX(Type_convert_subarray)(ndims,
                                                  &ints[1] /* sizes */,
                                                  &ints[1+ndims] /* subsz */,
                                                  &ints[1+2*ndims] /* strts */,
                                                  ints[1+3*ndims] /* order */,
                                                  types[0], &tmptype);
            PREPEND_PREFIX(Type_calc_footprint)(tmptype, tfp);
            MPIR_Type_free_impl(&tmptype);
            break;
        case MPI_COMBINER_DARRAY:
            ndims = ints[2];
            PREPEND_PREFIX(Type_convert_darray)(ints[0] /* size */,
                                                ints[1] /* rank */,
                                                ndims,
                                                &ints[3] /* gsizes */,
                                                &ints[3+ndims] /* distribs */,
                                                &ints[3+2*ndims] /* dargs */,
                                                &ints[3+3*ndims] /* psizes */,
                                                ints[3+4*ndims] /* order */,
                                                types[0], &tmptype);
            PREPEND_PREFIX(Type_calc_footprint)(tmptype, tfp);
            MPIR_Type_free_impl(&tmptype);
            break;
        case MPI_COMBINER_F90_REAL:
        case MPI_COMBINER_F90_COMPLEX:
        case MPI_COMBINER_F90_INTEGER:
        default:
            DLOOP_Assert(0);
            break;
    }

 clean_exit:
    PREPEND_PREFIX(Type_release_contents)(type, &ints, &aints, &types);
    return;
}