/*
 * Build the list of I/O-aggregator ranks for this file and publish it
 * through fd->hints (cb_nodes + ranklist).  The actual selection is
 * delegated to ADIOI_BGL_compute_agg_ranklist_serial_do(); this wrapper
 * only manages the scratch buffer and copies the result into the hints.
 */
static void ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd,
                                                    const ADIOI_BGL_ConfInfo_t *confInfo,
                                                    ADIOI_BGL_ProcInfo_t *all_procInfo,
                                                    int *aggrsInPset )
{
#   if AGG_DEBUG
    int i;
#   endif
    /* scratch list sized for the worst case: every process an aggregator */
    int *scratch_ranklist = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int));
    int n_aggrs;

#   if AGG_DEBUG
    for (i=0; i<confInfo->nProcs; i++) {
        DBG_FPRINTF(stderr, "\tcpuid %1d, rank = %6d\n", all_procInfo[i].cpuid, all_procInfo[i].rank );
    }
#   endif

    /* select the aggregators; n_aggrs is how many entries were filled in */
    n_aggrs = ADIOI_BGL_compute_agg_ranklist_serial_do (confInfo, all_procInfo, aggrsInPset, scratch_ranklist);

#   define VERIFY 0
#   if VERIFY
    DBG_FPRINTF(stderr, "\tconfInfo = %3d,%3d,%3d,%3d,%3d,%3d,%.4f; naggs = %d\n",
                confInfo->PsetSize        ,
                confInfo->numPsets        ,
                confInfo->isVNM           ,
                confInfo->virtualPsetSize ,
                confInfo->nProcs          ,
                confInfo->nAggrs          ,
                confInfo->aggRatio        ,
                n_aggrs );
#   endif

#   if AGG_DEBUG
    for (i=0; i<n_aggrs; i++) {
        DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, scratch_ranklist[i] );
    }
#   endif

    /* publish through the hints, releasing any previously stored ranklist */
    if (fd->hints->ranklist != NULL)
        ADIOI_Free (fd->hints->ranklist);
    fd->hints->cb_nodes = n_aggrs;
    fd->hints->ranklist = (int *) ADIOI_Malloc (n_aggrs * sizeof(int));
    memcpy( fd->hints->ranklist, scratch_ranklist, n_aggrs*sizeof(int) );

    ADIOI_Free( scratch_ranklist );
}
/*
 * Two-phase collective read driver: each aggregator reads file data in
 * chunks of at most coll_bufsize bytes and ships the pieces to the
 * processes that requested them (others_req), which place them into the
 * user buffer.  Sets *error_code to the first I/O error, MPI_SUCCESS
 * otherwise.
 */
static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
                 datatype, int nprocs,
                 int myrank, ADIOI_Access
                 *others_req, ADIO_Offset *offset_list,
                 ADIO_Offset *len_list, int contig_access_count, ADIO_Offset
                 min_st_offset, ADIO_Offset fd_size,
                 ADIO_Offset *fd_start, ADIO_Offset *fd_end,
                 int *buf_idx, int *error_code)
{
    /* Read in sizes of no more than coll_bufsize, an info parameter.
       Send data to appropriate processes.
       Place recd. data in user buf.
       The idea is to reduce the amount of extra memory required for
       collective I/O. If all data were read all at once, which is much
       easier, it would require temp space more than the size of user_buf,
       which is often unacceptable. For example, to read a distributed
       array from a file, where each local array is 8Mbytes, requiring
       at least another 8Mbytes of temp space is unacceptable. */

    int i, j, m, ntimes, max_ntimes, buftype_is_contig;
    /* st_loc/end_loc: first/last byte offset this process must read;
       -1/-1 means this process performs no I/O at all. */
    ADIO_Offset st_loc=-1, end_loc=-1, off, done, real_off, req_off;
    char *read_buf = NULL, *tmp_buf;
    int *curr_offlen_ptr, *count, *send_size, *recv_size;
    int *partial_send, *recd_from_proc, *start_pos;
    /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets*/
    ADIO_Offset real_size, size, for_curr_iter, for_next_iter;
    int req_len, flag, rank;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf=NULL;
    MPI_Aint buftype_extent;
    int coll_bufsize;

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */

    /* calculate the number of reads of size coll_bufsize to be done by
       each process and the max among all processes.  That gives the no.
       of communication phases as well.  coll_bufsize is obtained from
       the hints object. */

    coll_bufsize = fd->hints->cb_buffer_size;

    /* grab some initial values for st_loc and end_loc from the first
       process with a nonzero request count */
    for (i=0; i < nprocs; i++) {
        if (others_req[i].count) {
            st_loc = others_req[i].offsets[0];
            end_loc = others_req[i].offsets[0];
            break;
        }
    }

    /* now find the real values by scanning every off-len pair */
    for (i=0; i < nprocs; i++)
        for (j=0; j<others_req[i].count; j++) {
            st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
            end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j]
                                          + others_req[i].lens[j] - 1));
        }

    /* calculate ntimes, the number of times this process must perform I/O
     * operations in order to complete all the requests it has received.
     * the need for multiple I/O operations comes from the restriction that
     * we only use coll_bufsize bytes of memory for internal buffering.
     */
    if ((st_loc==-1) && (end_loc==-1)) {
        /* this process does no I/O. */
        ntimes = 0;
    }
    else {
        /* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/
        ntimes = (int) ((end_loc - st_loc + coll_bufsize)/coll_bufsize);
    }

    /* every process participates in max_ntimes exchange phases, even those
       with ntimes==0 (they may still receive data) */
    MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm);

    read_buf = fd->io_buf;  /* Allocated at open time */

    curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* its use is explained below. calloc initializes to 0. */

    count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    partial_send = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* if only a portion of the last off-len pair is sent to a process
       in a particular iteration, the length sent is stored here.
       calloc initializes to 0. */

    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be sent to each proc. in an iteration */

    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    recd_from_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data recd. so far from each proc. Used in
       ADIOI_Fill_user_buffer. initialized to 0 here. */

    start_pos = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* used to store the starting value of curr_offlen_ptr[i] in
       this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
        /* noncontiguous user buffer: look up its flattened representation */
        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;
    }
    MPI_Type_extent(datatype, &buftype_extent);

    done = 0;       /* bytes of this process's file range handled so far */
    off = st_loc;   /* file offset of the next chunk to read */
    for_curr_iter = for_next_iter = 0;  /* bytes carried over between phases */

    MPI_Comm_rank(fd->comm, &rank);

    for (m=0; m<ntimes; m++) {
        /* read buf of size coll_bufsize (or less) */
        /* go through all others_req and check if any are satisfied
           by the current read */

        /* since MPI guarantees that displacements in filetypes are in
           monotonically nondecreasing order, I can maintain a pointer
           (curr_offlen_ptr) to current off-len pair for each process
           in others_req and scan further only from there. There is still a
           problem of filetypes such as:

           (1, 2, 3 are not process nos. They are just numbers for three
           chunks of data, specified by a filetype.)

           1  -------!--
           2    -----!----
           3       --!-----

           where ! indicates where the current read_size limitation cuts
           through the filetype.  I resolve this by reading up to !, but
           filling the communication buffer only for 1. I copy the portion
           left over for 2 into a tmp_buf for use in the next iteration.
           i.e., 2 and 3 will be satisfied in the next iteration. This
           simplifies filling in the user's buf at the other end, as only
           one off-len pair with incomplete data will be sent. I also don't
           need to send the individual offsets and lens along with the data,
           as the data is being sent in a particular order. */

        /* off = start offset in the file for the data actually read in
                 this iteration
           size = size of data read corresponding to off
           real_off = off minus whatever data was retained in memory from
                 previous iteration for cases like 2, 3 illustrated above
           real_size = size plus the extra corresponding to real_off
           req_off = off in file for a particular contiguous request
                 minus what was satisfied in previous iteration
           req_size = size corresponding to req_off */

        size = ADIOI_MIN((unsigned)coll_bufsize, end_loc-st_loc+1-done);
        real_off = off - for_curr_iter;
        real_size = size + for_curr_iter;

        for (i=0; i<nprocs; i++) count[i] = send_size[i] = 0;
        for_next_iter = 0;

        for (i=0; i<nprocs; i++) {
#ifdef RDCOLL_DEBUG
            DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n", rank, i, others_req[i].count);
#endif
            if (others_req[i].count) {
                start_pos[i] = curr_offlen_ptr[i];
                for (j=curr_offlen_ptr[i]; j<others_req[i].count; j++) {
                    if (partial_send[i]) {
                        /* this request may have been partially
                           satisfied in the previous iteration. */
                        req_off = others_req[i].offsets[j] + partial_send[i];
                        req_len = others_req[i].lens[j] - partial_send[i];
                        partial_send[i] = 0;
                        /* modify the off-len pair to reflect this change */
                        others_req[i].offsets[j] = req_off;
                        others_req[i].lens[j] = req_len;
                    }
                    else {
                        req_off = others_req[i].offsets[j];
                        req_len = others_req[i].lens[j];
                    }
                    if (req_off < real_off + real_size) {
                        /* at least part of this request is covered by the
                           data now in read_buf */
                        count[i]++;
                        ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+req_off-real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf+req_off-real_off));
                        MPI_Address(read_buf+req_off-real_off,
                                    &(others_req[i].mem_ptrs[j]));
                        ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));
                        send_size[i] += (int)(ADIOI_MIN(real_off + real_size - req_off,
                                                        (ADIO_Offset)(unsigned)req_len));
                        if (real_off+real_size-req_off < (ADIO_Offset)(unsigned)req_len) {
                            /* request extends past what was read: remember
                               how much of it was satisfied */
                            partial_send[i] = (int) (real_off + real_size - req_off);
                            if ((j+1 < others_req[i].count) &&
                                (others_req[i].offsets[j+1] < real_off+real_size)) {
                                /* this is the case illustrated in the
                                   figure above. */
                                for_next_iter = ADIOI_MAX(for_next_iter,
                                        real_off + real_size - others_req[i].offsets[j+1]);
                                /* max because it must cover requests
                                   from different processes */
                            }
                            break;
                        }
                    }
                    else break;
                }
                curr_offlen_ptr[i] = j;
            }
        }

        /* flag is set if any process needs data from this read phase */
        flag = 0;
        for (i=0; i<nprocs; i++)
            if (count[i]) flag = 1;

        if (flag) {
            ADIOI_Assert(size == (int)size);
            ADIO_ReadContig(fd, read_buf+for_curr_iter, (int)size, MPI_BYTE,
                            ADIO_EXPLICIT_OFFSET, off, &status, error_code);
            /* NOTE(review): this early return leaves the seven temp arrays
               allocated above unfreed — confirm whether a cleanup path is
               wanted on I/O error. */
            if (*error_code != MPI_SUCCESS) return;
        }

        for_curr_iter = for_next_iter;

        ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
                              send_size, recv_size, count, start_pos,
                              partial_send, recd_from_proc, nprocs, myrank,
                              buftype_is_contig, contig_access_count,
                              min_st_offset, fd_size, fd_start, fd_end,
                              others_req, m, buftype_extent, buf_idx);

        if (for_next_iter) {
            /* carry the tail of this read over to the front of the next
               iteration's buffer; the io_buf is reallocated big enough to
               hold the carried-over bytes plus a full coll_bufsize chunk */
            tmp_buf = (char *) ADIOI_Malloc(for_next_iter);
            ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+real_size-for_next_iter) == (ADIO_Offset)(MPIR_Upint)(read_buf+real_size-for_next_iter));
            ADIOI_Assert((for_next_iter+coll_bufsize) == (size_t)(for_next_iter+coll_bufsize));
            memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter);
            ADIOI_Free(fd->io_buf);
            fd->io_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
            memcpy(fd->io_buf, tmp_buf, for_next_iter);
            read_buf = fd->io_buf;
            ADIOI_Free(tmp_buf);
        }

        off += size;
        done += size;
    }

    /* extra phases: nothing left to send, but other processes may still
       have data to exchange with us */
    for (i=0; i<nprocs; i++) count[i] = send_size[i] = 0;
    for (m=ntimes; m<max_ntimes; m++)
        /* nothing to send, but check for recv. */
        ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
                              send_size, recv_size, count, start_pos,
                              partial_send, recd_from_proc, nprocs, myrank,
                              buftype_is_contig, contig_access_count,
                              min_st_offset, fd_size, fd_start, fd_end,
                              others_req, m, buftype_extent, buf_idx);

    ADIOI_Free(curr_offlen_ptr);
    ADIOI_Free(count);
    ADIOI_Free(partial_send);
    ADIOI_Free(send_size);
    ADIOI_Free(recv_size);
    ADIOI_Free(recd_from_proc);
    ADIOI_Free(start_pos);
}
/*
 * For this process's own request (bufcount elements of datatype at the
 * given file position), compute the list of contiguous (offset, length)
 * pairs in the file, plus the overall start and end byte offsets.
 * Results are returned through the *_ptr out-parameters; the two lists
 * are allocated here and owned by the caller.  For ADIO_INDIVIDUAL file
 * pointers, fd->fp_ind is advanced past the accessed region.
 */
void ADIOI_Calc_my_off_len(ADIO_File fd, int bufcount, MPI_Datatype
                datatype, int file_ptr_type, ADIO_Offset
                offset, ADIO_Offset **offset_list_ptr, ADIO_Offset
                **len_list_ptr, ADIO_Offset *start_offset_ptr,
                ADIO_Offset *end_offset_ptr, int
               *contig_access_count_ptr)
{
    MPI_Count filetype_size, etype_size;
    MPI_Count buftype_size;
    int i, j, k;
    ADIO_Offset i_offset;
    /* frd_size: remaining length of the current contiguous filetype block */
    ADIO_Offset frd_size=0, old_frd_size=0;
    int st_index=0;
    ADIO_Offset n_filetypes, etype_in_filetype;
    ADIO_Offset abs_off_in_filetype=0;
    ADIO_Offset bufsize;
    ADIO_Offset sum, n_etypes_in_filetype, size_in_filetype;
    int contig_access_count, filetype_is_contig;
    ADIO_Offset *len_list;
    MPI_Aint filetype_extent, filetype_lb;
    ADIOI_Flatlist_node *flat_file;
    ADIO_Offset *offset_list, off, end_offset=0, disp;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5028, 0, NULL);
#endif

    /* For this process's request, calculate the list of offsets and
       lengths in the file and determine the start and end offsets. */

    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size_x(fd->filetype, &filetype_size);
    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_lb(fd->filetype, &filetype_lb);
    MPI_Type_size_x(datatype, &buftype_size);
    etype_size = fd->etype_size;

    if ( ! filetype_size ) {
        /* zero-size filetype: produce an empty (zero-length) access */
        *contig_access_count_ptr = 0;
        *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
        *len_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
        /* 2 is for consistency. everywhere I malloc one more than needed */

        offset_list = *offset_list_ptr;
        len_list = *len_list_ptr;
        offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
                         fd->disp + (ADIO_Offset)etype_size * offset;
        len_list[0] = 0;
        *start_offset_ptr = offset_list[0];
        *end_offset_ptr = offset_list[0] + len_list[0] - 1;
        return;
    }

    if (filetype_is_contig) {
        /* contiguous filetype: the whole request is one contiguous run */
        *contig_access_count_ptr = 1;
        *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
        *len_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
        /* 2 is for consistency. everywhere I malloc one more than needed */

        offset_list = *offset_list_ptr;
        len_list = *len_list_ptr;
        offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
                         fd->disp + (ADIO_Offset)etype_size * offset;
        len_list[0] = (ADIO_Offset)bufcount * (ADIO_Offset)buftype_size;
        *start_offset_ptr = offset_list[0];
        *end_offset_ptr = offset_list[0] + len_list[0] - 1;

        /* update file pointer */
        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = *end_offset_ptr + 1;
    }
    else {
        /* First calculate what size of offset_list and len_list to allocate */

        /* filetype already flattened in ADIO_Open or ADIO_Fcntl */
        flat_file = ADIOI_Flatlist;
        while (flat_file->type != fd->filetype) flat_file = flat_file->next;
        disp = fd->disp;

#ifdef RDCOLL_DEBUG
        {
            int ii;
            DBG_FPRINTF(stderr, "flattened %3lld : ", flat_file->count );
            for (ii=0; ii<flat_file->count; ii++) {
                DBG_FPRINTF(stderr, "%16lld:%-16lld", flat_file->indices[ii], flat_file->blocklens[ii] );
            }
            DBG_FPRINTF(stderr, "\n" );
        }
#endif

        if (file_ptr_type == ADIO_INDIVIDUAL) {
            /* Wei-keng reworked type processing to be a bit more efficient */
            offset = fd->fp_ind - disp;
            n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
            offset -= (ADIO_Offset)n_filetypes * filetype_extent;
            /* now offset is local to this extent */

            /* find the block where offset is located, skip blocklens[i]==0 */
            for (i=0; i<flat_file->count; i++) {
                ADIO_Offset dist;
                if (flat_file->blocklens[i] == 0) continue;
                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
                /* frd_size is from offset to the end of block i */
                if (dist == 0) {
                    /* offset sits exactly at the end of block i:
                       start from the next block */
                    i++;
                    offset = flat_file->indices[i];
                    frd_size = flat_file->blocklens[i];
                    break;
                }
                if (dist > 0) {
                    frd_size = dist;
                    break;
                }
            }
            st_index = i; /* starting index in flat_file->indices[] */
            offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
        }
        else {
            /* explicit offset: 'offset' counts etypes from the start of
               the view; translate to (filetype repetition, byte offset) */
            n_etypes_in_filetype = filetype_size/etype_size;
            n_filetypes = offset / n_etypes_in_filetype;
            etype_in_filetype = offset % n_etypes_in_filetype;
            size_in_filetype = etype_in_filetype * etype_size;

            sum = 0;
            for (i=0; i<flat_file->count; i++) {
                sum += flat_file->blocklens[i];
                if (sum > size_in_filetype) {
                    st_index = i;
                    frd_size = sum - size_in_filetype;
                    abs_off_in_filetype = flat_file->indices[i] +
                        size_in_filetype - (sum - flat_file->blocklens[i]);
                    break;
                }
            }

            /* abs. offset in bytes in the file */
            offset = disp + n_filetypes* (ADIO_Offset)filetype_extent +
                     abs_off_in_filetype;
        }

        /* calculate how much space to allocate for offset_list, len_list
           by walking the flattened filetype once without recording */
        old_frd_size = frd_size;
        contig_access_count = i_offset = 0;
        j = st_index;
        bufsize = (ADIO_Offset)buftype_size * (ADIO_Offset)bufcount;
        frd_size = ADIOI_MIN(frd_size, bufsize);
        while (i_offset < bufsize) {
            if (frd_size) contig_access_count++;
            i_offset += frd_size;
            j = (j + 1) % flat_file->count;
            frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
        }

        /* allocate space for offset_list and len_list */
        *offset_list_ptr = (ADIO_Offset *)
                 ADIOI_Malloc((contig_access_count+1)*sizeof(ADIO_Offset));
        *len_list_ptr = (ADIO_Offset *)
                 ADIOI_Malloc((contig_access_count+1)*sizeof(ADIO_Offset));
        /* +1 to avoid a 0-size malloc */
        offset_list = *offset_list_ptr;
        len_list = *len_list_ptr;

        /* find start offset, end offset, and fill in offset_list and
           len_list by walking the flattened filetype a second time */
        *start_offset_ptr = offset; /* calculated above */

        i_offset = k = 0;
        j = st_index;
        off = offset;
        frd_size = ADIOI_MIN(old_frd_size, bufsize);
        while (i_offset < bufsize) {
            if (frd_size) {
                offset_list[k] = off;
                len_list[k] = frd_size;
                k++;
            }
            i_offset += frd_size;
            end_offset = off + frd_size - 1;

            /* Note: end_offset points to the last byte-offset that will be
               accessed.  e.g., if start_offset=0 and 100 bytes to be read,
               end_offset=99 */

            if (off + frd_size < disp + flat_file->indices[j] +
                flat_file->blocklens[j] +
                n_filetypes* (ADIO_Offset)filetype_extent)
            {
                off += frd_size;
                /* did not reach end of contiguous block in filetype.
                 * no more I/O needed. off is incremented by frd_size. */
            }
            else {
                /* advance to the next nonzero block, wrapping to the next
                   filetype repetition whenever j wraps to 0 */
                j = (j+1) % flat_file->count;
                n_filetypes += (j == 0) ? 1 : 0;
                while (flat_file->blocklens[j]==0) {
                    j = (j+1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    /* hit end of flattened filetype; start at beginning
                     * again */
                }
                off = disp + flat_file->indices[j] +
                      n_filetypes* (ADIO_Offset)filetype_extent;
                frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
            }
        }

        /* update file pointer */
        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

        *contig_access_count_ptr = contig_access_count;
        *end_offset_ptr = end_offset;
    }

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5029, 0, NULL);
#endif
}
int main(int argc, char ** argv) { pami_client_t client; pami_context_t context; pami_result_t status = PAMI_ERROR; /* create PAMI client */ RC( PAMI_Client_create("TEST", &client, NULL, 0) ); DBG_FPRINTF((stderr,"Client created successfully at 0x%p\n",client)); /* create PAMI context */ RC( PAMI_Context_createv(client, NULL, 0, &context, 1) ); DBG_FPRINTF((stderr,"Context created successfully at 0x%p\n",context)); /* ------------------------------------------------------------------------ */ pami_extension_t extension; const char ext_name[] = "EXT_hfi_extension"; const char sym_name[] = "hfi_pkt_counters"; hfi_pkt_counters_fn hfi_counters = NULL; hfi_pkt_counter_t pkt_counter; /* open PAMI extension */ RC( PAMI_Extension_open (client, ext_name, &extension) ); DBG_FPRINTF((stderr,"Open %s successfully.\n", ext_name)); /* load PAMI extension function */ hfi_counters = (hfi_pkt_counters_fn) PAMI_Extension_symbol (extension, sym_name); if (hfi_counters == (void *)NULL) { fprintf (stderr, "Error. 
Failed to load %s function in %s\n", sym_name, ext_name); return 1; } DBG_FPRINTF((stderr,"Loaded function %s in %s successfully.\n", sym_name, ext_name)); /* invoke PAMI extension function */ RC( hfi_counters(context, &pkt_counter) ); DBG_FPRINTF((stderr,"Function %s invoked successfully.\n", sym_name)); printf( "Pkt sent = %lu\n" "Pkt sent dropped = %lu\n" "Ind pkt sent = %lu\n" "Pkt recv = %lu\n" "Pkt recv dropped = %lu\n" "Ind pkt recv = %lu\n" "Imm pkt sent = %lu\n", pkt_counter.total_packets_sent, pkt_counter.packets_send_drop, pkt_counter.indicate_packet_sent, pkt_counter.total_packets_recv, pkt_counter.packets_recv_drop, pkt_counter.indicate_packet_recv, pkt_counter.immediate_packet_sent); /* close PAMI extension */ RC( PAMI_Extension_close (extension) ); DBG_FPRINTF((stderr,"Close %s successfully.\n", ext_name)); /* ------------------------------------------------------------------------ */ /* destroy PAMI context */ RC( PAMI_Context_destroyv(&context, 1) ); DBG_FPRINTF((stderr, "PAMI context destroyed successfully\n")); /* destroy PAMI client */ RC( PAMI_Client_destroy(&client) ); DBG_FPRINTF((stderr, "PAMI client destroyed successfully\n")); return 0; }
/*
 * Test for the "EXT_hfi_extension" PAMI extension's hfi_remote_update
 * function: each task atomically adds 'operand' to one slot of its
 * partner's (task id XOR 1) table via the HFI, with barriers and fences
 * bracketing the update, then verifies the local slot received the add.
 * Returns 0 after the verification printout, 1 on setup failure.
 */
int main(int argc, char ** argv)
{
  pami_client_t client;
  pami_context_t context;
  pami_result_t status = PAMI_ERROR;          /* consumed by the RC() macro */
  pami_configuration_t pami_config;
  pami_geometry_t world_geo;
  size_t barrier_alg_num[2];                  /* [0]=always-works count, [1]=must-query count */
  pami_algorithm_t* bar_always_works_algo = NULL;
  pami_metadata_t* bar_always_works_md = NULL;
  pami_algorithm_t* bar_must_query_algo = NULL;
  pami_metadata_t* bar_must_query_md = NULL;
  pami_xfer_t barrier;
  int my_id;
  volatile int is_fence_done = 0;             /* set by the fence_done callback */
  volatile int is_barrier_done = 0;           /* set by the barrier_done callback */

  /* create PAMI client */
  RC( PAMI_Client_create("TEST", &client, NULL, 0) );
  DBG_FPRINTF((stderr,"Client created successfully at 0x%p\n",client));

  /* create PAMI context */
  RC( PAMI_Context_createv(client, NULL, 0, &context, 1) );
  DBG_FPRINTF((stderr,"Context created successfully at 0x%p\n",context));

  /* query my task id */
  bzero(&pami_config, sizeof(pami_configuration_t));
  pami_config.name = PAMI_CLIENT_TASK_ID;
  RC( PAMI_Client_query(client, &pami_config, 1) );
  my_id = pami_config.value.intval;
  DBG_FPRINTF((stderr,"My task id is %d\n", my_id));

  /* get the world geometry */
  RC( PAMI_Geometry_world(client, &world_geo) );
  DBG_FPRINTF((stderr,"World geometry is at 0x%p\n",world_geo));

  /* query number of barrier algorithms */
  RC( PAMI_Geometry_algorithms_num(world_geo, PAMI_XFER_BARRIER,
                                   barrier_alg_num) );
  DBG_FPRINTF((stderr,"%d-%d algorithms are available for barrier op\n",
               barrier_alg_num[0], barrier_alg_num[1]));
  if (barrier_alg_num[0] <= 0)
  {
    fprintf (stderr, "Error. No (%lu) algorithm is available for barrier op\n",
             barrier_alg_num[0]);
    return 1;
  }

  /* query barrier algorithm list */
  bar_always_works_algo =
      (pami_algorithm_t*)malloc(sizeof(pami_algorithm_t)*barrier_alg_num[0]);
  bar_always_works_md =
      (pami_metadata_t*)malloc(sizeof(pami_metadata_t)*barrier_alg_num[0]);
  bar_must_query_algo =
      (pami_algorithm_t*)malloc(sizeof(pami_algorithm_t)*barrier_alg_num[1]);
  bar_must_query_md =
      (pami_metadata_t*)malloc(sizeof(pami_metadata_t)*barrier_alg_num[1]);

  RC( PAMI_Geometry_algorithms_query(world_geo, PAMI_XFER_BARRIER,
                                     bar_always_works_algo,
                                     bar_always_works_md,
                                     barrier_alg_num[0],
                                     bar_must_query_algo,
                                     bar_must_query_md,
                                     barrier_alg_num[1]) );
  DBG_FPRINTF((stderr,"Algorithm [%s] at 0x%p will be used for barrier op\n",
               bar_always_works_md[0].name, bar_always_works_algo[0]));

  /* begin PAMI fence */
  RC( PAMI_Fence_begin(context) );
  DBG_FPRINTF((stderr,"PAMI fence begins\n"));

  /* ------------------------------------------------------------------ */
  pami_extension_t extension;
  const char ext_name[] = "EXT_hfi_extension";
  const char sym_name[] = "hfi_remote_update";
  hfi_remote_update_fn remote_update = NULL;
  hfi_remote_update_info_t remote_info;
  pami_memregion_t mem_region;
  size_t mem_region_sz = 0;
  unsigned long long operand = 1234;         /* value atomically added remotely */
  unsigned long long orig_val = 0;           /* local slot value before the update */
  int offset = (operand)%MAX_TABLE_SZ;       /* table slot targeted by the update */

  /* initialize table for remote update operation */
  int i;
  for (i = 0; i < MAX_TABLE_SZ; i ++) {
    table[i] = (unsigned long long) i;
  }
  orig_val = table[offset];

  /* open PAMI extension */
  RC( PAMI_Extension_open (client, ext_name, &extension) );
  DBG_FPRINTF((stderr,"Open %s successfully.\n", ext_name));

  /* load PAMI extension function */
  remote_update = (hfi_remote_update_fn)
      PAMI_Extension_symbol (extension, sym_name);
  if (remote_update == (void *)NULL)
  {
    fprintf (stderr, "Error. Failed to load %s function in %s\n",
             sym_name, ext_name);
    return 1;
  } else {
    DBG_FPRINTF((stderr,"Loaded function %s in %s successfully.\n",
                 sym_name, ext_name));
  }

  /* create a memory region for remote update operation */
  RC( PAMI_Memregion_create(context, table,
                            MAX_TABLE_SZ*sizeof(unsigned long long),
                            &mem_region_sz, &mem_region) );
  DBG_FPRINTF((stderr,"%d-byte PAMI memory region created successfully.\n",
               mem_region_sz));

  /* perform a PAMI barrier so all tasks have initialized their tables */
  is_barrier_done = 0;
  barrier.cb_done = barrier_done;
  barrier.cookie = (void*)&is_barrier_done;
  barrier.algorithm = bar_always_works_algo[0];
  RC( PAMI_Collective(context, &barrier) );
  DBG_FPRINTF((stderr,"PAMI barrier op invoked successfully.\n"));
  while (is_barrier_done == 0)
    PAMI_Context_advance(context, 1000);
  DBG_FPRINTF((stderr,"PAMI barrier op finished successfully.\n"));

  RC( PAMI_Context_lock(context) );

  /* prepare remote update info: add 'operand' to the partner task's slot */
  remote_info.dest = my_id^1;
  remote_info.op = 0; /* op_add */
  remote_info.atomic_operand = operand;
  remote_info.dest_buf = (unsigned long long)(&(table[offset]));

  /* invoke remote update PAMI extension function */
  RC( remote_update(context, 1, &remote_info) );
  DBG_FPRINTF((stderr,"Function %s invoked successfully.\n", sym_name));

  RC( PAMI_Context_unlock(context) );

  /* perform a PAMI fence to make sure the update has completed */
  is_fence_done = 0;
  RC( PAMI_Fence_all(context, fence_done, (void*)&is_fence_done) );
  DBG_FPRINTF((stderr,"PAMI_Fence_all invoked successfully.\n"));
  while (is_fence_done == 0)
    PAMI_Context_advance(context, 1000);
  DBG_FPRINTF((stderr,"PAMI_Fence_all finished successfully.\n"));

  /* perform a PAMI barrier so no task verifies before its partner updates */
  is_barrier_done = 0;
  barrier.cb_done = barrier_done;
  barrier.cookie = (void*)&is_barrier_done;
  barrier.algorithm = bar_always_works_algo[0];
  RC( PAMI_Collective(context, &barrier) );
  DBG_FPRINTF((stderr,"PAMI barrier op invoked successfully.\n"));
  while (is_barrier_done == 0)
    PAMI_Context_advance(context, 1000);
  DBG_FPRINTF((stderr,"PAMI barrier op finished successfully.\n"));

  /* verify data after remote update operation */
  if (table[offset] != orig_val + operand)
  {
    printf("Data verification at offset %d with operand %lu failed: "
           "[%lu expected with %lu updated]\n",
           offset, operand, orig_val+operand, table[offset]);
  } else {
    printf("Data verification at offset %d with operand %lu passed: "
           "[%lu expected with %lu updated].\n",
           offset, operand, orig_val+operand, table[offset]);
  }

  /* destroy the memory region after remote update operation */
  RC( PAMI_Memregion_destroy(context, &mem_region) );
  DBG_FPRINTF((stderr,"PAMI memory region removed successfully.\n"));

  /* close PAMI extension */
  RC( PAMI_Extension_close (extension) );
  DBG_FPRINTF((stderr,"Close %s successfully.\n", ext_name));
  /* ------------------------------------------------------------------ */

  /* end PAMI fence */
  RC( PAMI_Fence_end(context) );
  DBG_FPRINTF((stderr,"PAMI fence ends\n"));

  /* destroy PAMI context */
  RC( PAMI_Context_destroyv(&context, 1) );
  DBG_FPRINTF((stderr, "PAMI context destroyed successfully\n"));

  /* destroy PAMI client */
  RC( PAMI_Client_destroy(&client) );
  DBG_FPRINTF((stderr, "PAMI client destroyed successfully\n"));

  return 0;
}
int main(int argc, char ** argv) { pami_client_t client; pami_context_t context; pami_result_t status = PAMI_ERROR; status = PAMI_Client_create("TEST", &client, NULL, 0); if(status != PAMI_SUCCESS) { fprintf (stderr, "Error. Unable to initialize pami client. result = %d\n", status); return 1; } DBG_FPRINTF((stderr,"Client %p\n",client)); status = PAMI_Context_createv(client, NULL, 0, &context, 1); if(status != PAMI_SUCCESS) { fprintf (stderr, "Error. Unable to create pami context. result = %d\n", status); return 1; } /* ------------------------------------------------------------------------ */ pami_extension_t extension; status = PAMI_Extension_open (client, "EXT_torus_network", &extension); if(status != PAMI_SUCCESS) { fprintf (stderr, "Error. The \"EXT_torus_network\" extension is not implemented. result = %d\n", status); return 1; } pami_extension_torus_information_fn pamix_torus_info = (pami_extension_torus_information_fn) PAMI_Extension_symbol (extension, "information"); if (pamix_torus_info == (void *)NULL) { fprintf (stderr, "Error. The \"EXT_torus_network\" extension function \"information\" is not implemented. 
result = %d\n", status); return 1; } const pami_extension_torus_information_t * info = pamix_torus_info (); fprintf (stdout, "Torus Dimensions: %zu\n", info->dims); char str[1024]; size_t i, nchars; for (nchars=i=0; i<(info->dims-1); i++) nchars += snprintf (&str[nchars],1023-nchars, "%zu,", info->coord[i]); nchars += snprintf (&str[nchars],1023-nchars, "%zu", info->coord[info->dims-1]); fprintf (stdout, "Torus Coordinates: [%s]\n", str); for (nchars=i=0; i<(info->dims-1); i++) nchars += snprintf (&str[nchars],1023-nchars, "%zu,", info->size[i]); nchars += snprintf (&str[nchars],1023-nchars, "%zu", info->size[info->dims-1]); fprintf (stdout, "Torus Size: [%s]\n", str); for (nchars=i=0; i<(info->dims-1); i++) nchars += snprintf (&str[nchars],1023-nchars, "%zu,", info->torus[i]); nchars += snprintf (&str[nchars],1023-nchars, "%zu", info->torus[info->dims-1]); fprintf (stdout, "Torus Wrap: [%s]\n", str); pami_extension_torus_task2torus_fn pamix_torus_task2torus = (pami_extension_torus_task2torus_fn) PAMI_Extension_symbol (extension, "task2torus"); if (pamix_torus_task2torus == (void *)NULL) { fprintf (stderr, "Error. The \"EXT_torus_network\" extension function \"task2torus\" is not implemented. result = %d\n", status); return 1; } pami_task_t task = 1; size_t * coord = (size_t *) malloc (sizeof(size_t) * info->dims); status = pamix_torus_task2torus (task, coord); if (status != PAMI_SUCCESS) { fprintf (stderr, "Error. Unable to query the torus coordinates of task 1\n"); return 1; } for (nchars=i=0; i<(info->dims-1); i++) nchars += snprintf (&str[nchars],1023-nchars, "%zu,", coord[i]); nchars += snprintf (&str[nchars],1023-nchars, "%zu", coord[i]); fprintf (stdout, "Task 1 Torus Coordinates: [%s]\n", str); pami_extension_torus_torus2task_fn pamix_torus_torus2task = (pami_extension_torus_torus2task_fn) PAMI_Extension_symbol (extension, "torus2task"); if (pamix_torus_torus2task == (void *)NULL) { fprintf (stderr, "Error. 
The \"EXT_torus_network\" extension function \"torus2task\" is not implemented. result = %d\n", status); return 1; } /*coord[0] = 0; */ /*coord[1] = 0; */ /*coord[2] = 0; */ /*coord[3] = 1; */ status = pamix_torus_torus2task (coord, &task); if (status != PAMI_SUCCESS) { fprintf (stderr, "Error. Unable to query the task for coordinates [%zu,%zu,%zu,%zu]\n",coord[0],coord[1],coord[2],coord[3]); return 1; } for (nchars=i=0; i<(info->dims-1); i++) nchars += snprintf (&str[nchars],1023-nchars, "%zu,", coord[i]); nchars += snprintf (&str[nchars],1023-nchars, "%zu", coord[i]); fprintf (stdout, "Task at Torus Coordinates [%s]: %d\n", str, task);; status = PAMI_Extension_close (extension); if(status != PAMI_SUCCESS) { fprintf (stderr, "Error. The \"EXT_torus_network\" extension could not be closed. result = %d\n", status); return 1; } /* ------------------------------------------------------------------------ */ DBG_FPRINTF((stderr, "PAMI_Context_destroyv(&context, 1);\n")); status = PAMI_Context_destroyv(&context, 1); if(status != PAMI_SUCCESS) { fprintf(stderr, "Error. Unable to destroy pami context. result = %d\n", status); return 1; } DBG_FPRINTF((stderr, "PAMI_Client_destroy(&client);\n")); status = PAMI_Client_destroy(&client); if(status != PAMI_SUCCESS) { fprintf(stderr, "Error. Unable to finalize pami client. result = %d\n", status); return 1; } DBG_FPRINTF((stderr, "return 0;\n")); return 0; }
/* Open a file through the Blue Gene ADIO driver.
 *
 * fd         - ADIO file handle.  On success fd->fd_sys holds the POSIX
 *              descriptor and fd->fs_ptr a freshly allocated ADIOI_BG_fs
 *              record (blksize filled in by scaleable_stat()).
 * error_code - MPI_SUCCESS on success, otherwise an MPI error code
 *              mapped from errno.
 */
void ADIOI_BG_Open(ADIO_File fd, int *error_code)
{
    int perm, old_mask, amode;
    static char myname[] = "ADIOI_BG_OPEN";

    /* set internal variables for tuning environment variables */
    ad_bg_get_env_vars();

    if (fd->perm == ADIO_PERM_NULL) {
        /* umask() both sets and returns the mask, so call it twice to
         * read the current value without changing it */
        old_mask = umask(022);
        umask(old_mask);
        /* BUGFIX: was "old_mask ^ 0666".  XOR turns ON any umask bits
         * that fall outside 0666 (e.g. umask 0777 produced mode 0111,
         * execute-only).  Clearing the masked bits is the correct way
         * to honor the umask. */
        perm = ~old_mask & 0666;
    }
    else perm = fd->perm;

    /* translate the ADIO access-mode flags into POSIX open(2) flags */
    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
        amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
        amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
        amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
        amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
        amode = amode | O_EXCL;

#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
    fd->fd_sys = open(fd->filename, amode, perm);
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif
    DBG_FPRINTF(stderr,"open('%s',%#X,%#X) rc=%d, errno=%d\n",fd->filename,amode,perm,fd->fd_sys,errno);
    fd->fd_direct = -1;

    /* ADIO_APPEND: position both the ADIO file pointer and the cached
     * system position at end-of-file */
    if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

    if(fd->fd_sys != -1)
    {
        /* Initialize the ad_bg file system specific information */
        ADIOI_BG_assert(fd->fs_ptr == NULL);
        fd->fs_ptr = (ADIOI_BG_fs*) ADIOI_Malloc(sizeof(ADIOI_BG_fs));

        ((ADIOI_BG_fs*)fd->fs_ptr)->blksize = 1048576; /* default to 1M */

        /* default is no fsync aggregation */
        ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr =
            ADIOI_BG_FSYNC_AGGREGATION_DISABLED;

#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_a, 0, NULL);
#endif
        /* queries the real file-system blocksize into fs_ptr->blksize */
        scaleable_stat(fd);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
#endif
        /* file domain code will get terribly confused in a hard-to-debug
         * way if gpfs blocksize not sensible */
        ADIOI_BG_assert( ((ADIOI_BG_fs*)fd->fs_ptr)->blksize > 0);
    }

    /* map errno from a failed open(2) to the matching MPI error class */
    if (fd->fd_sys == -1) {
        if (errno == ENAMETOOLONG)
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_BAD_FILE,
                                               "**filenamelong",
                                               "**filenamelong %s %d",
                                               fd->filename,
                                               (int) strlen(fd->filename));
                                               /* cast: strlen() is size_t but
                                                * the %d slot expects int */
        else if (errno == ENOENT)
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_NO_SUCH_FILE,
                                               "**filenoexist",
                                               "**filenoexist %s",
                                               fd->filename);
        else if (errno == ENOTDIR || errno == ELOOP)
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_BAD_FILE,
                                               "**filenamedir",
                                               "**filenamedir %s",
                                               fd->filename);
        else if (errno == EACCES) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_ACCESS,
                                               "**fileaccess",
                                               "**fileaccess %s",
                                               fd->filename );
        }
        else if (errno == EROFS) {
            /* Read only file or file system and write access requested */
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_READ_ONLY,
                                               "**ioneedrd", 0 );
        }
        else {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
        }
    }
    else *error_code = MPI_SUCCESS;
}
/* Open a file through the GPFS ADIO driver.
 *
 * May be called on a subset of fd->comm (deferred open); only the first
 * aggregator (or a COMM_SELF opener) stats the file for the real block
 * size -- ADIOI_GEN_Opencoll broadcasts it to the rest.
 *
 * fd         - ADIO file handle; fd->fd_sys receives the POSIX descriptor,
 *              fd->blksize the file-system block size, fd->null_fd an
 *              optional /dev/null descriptor for the devnullio mode.
 * error_code - MPI_SUCCESS on success, otherwise an MPI error code
 *              mapped from errno via ADIOI_Err_create_code().
 */
void ADIOI_GPFS_Open(ADIO_File fd, int *error_code)
{
    int perm, old_mask, amode, rank, rc;
    static char myname[] = "ADIOI_GPFS_OPEN";

    /* set internal variables for tuning environment variables */
    ad_gpfs_get_env_vars();

    if (fd->perm == ADIO_PERM_NULL) {
        /* umask() both sets and returns the mask, so call it twice to
         * read the current value without changing it */
        old_mask = umask(022);
        umask(old_mask);
        /* BUGFIX: was "old_mask ^ 0666".  XOR turns ON any umask bits
         * that fall outside 0666 (e.g. umask 0777 produced mode 0111,
         * execute-only).  Clearing the masked bits is the correct way
         * to honor the umask. */
        perm = ~old_mask & 0666;
    }
    else perm = fd->perm;

    /* translate the ADIO access-mode flags into POSIX open(2) flags */
    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
        amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
        amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
        amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
        amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
        amode = amode | O_EXCL;

#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
    fd->fd_sys = open(fd->filename, amode, perm);
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif
    DBG_FPRINTF(stderr,"open('%s',%#X,%#X) rc=%d, errno=%d\n",fd->filename,amode,perm,fd->fd_sys,errno);
    fd->fd_direct = -1;

    /* devnullio benchmark mode: all data lands in /dev/null */
    if (gpfsmpio_devnullio == 1) {
        fd->null_fd = open("/dev/null", O_RDWR);
    } else {
        fd->null_fd = -1;
    }

    /* ADIO_APPEND: position both the ADIO file pointer and the cached
     * system position at end-of-file */
    if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

    if(fd->fd_sys != -1)
    {
        fd->blksize = 1048576; /* default to 1M */

#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_a, 0, NULL);
#endif
        /* in this fs-specific routine, we might not be called over entire
         * communicator (deferred open).  Collect statistics on one process.
         * ADIOI_GEN_Opencoll (common-code caller) will take care of the
         * broadcast  */

        MPI_Comm_rank(fd->comm, &rank);
        if ((rank == fd->hints->ranklist[0]) || (fd->comm == MPI_COMM_SELF)) {
            struct stat64 gpfs_statbuf;
            /* Get the (real) underlying file system block size */
            rc = stat64(fd->filename, &gpfs_statbuf);
            if (rc >= 0)
            {
                fd->blksize = gpfs_statbuf.st_blksize;
                DBGV_FPRINTF(stderr,"Successful stat '%s'.  Blocksize=%ld\n",
                        fd->filename,gpfs_statbuf.st_blksize);
            }
            else
            {
                DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
                        fd->filename,rc,errno);
            }
        }
        /* all other ranks have incorrect fd->blocksize, but ADIOI_GEN_Opencoll
         * will take care of that in both standard and deferred-open case */
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
#endif

#ifdef HAVE_GPFS_FCNTL_H
        /* in parallel workload, might be helpful to immediately release block
         * tokens.  Or, system call overhead will outweigh any benefits... */
        if (getenv("ROMIO_GPFS_FREE_LOCKS")!=NULL)
            gpfs_free_all_locks(fd->fd_sys);
#endif
    }

    if (fd->fd_sys == -1)  {
        *error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
    }
    else *error_code = MPI_SUCCESS;
}
/* State-machine step of the nonblocking read-exchange phase: post all
 * receives for data this process needs, then build derived datatypes over
 * the already-read collective buffer and post the matching sends.
 *
 * If any receives were posted, parks the request in
 * ADIOI_IRC_STATE_R_IEXCHANGE_DATA_RECV and returns so the poll function
 * can wait on vars->req2; otherwise falls straight through to
 * ADIOI_R_Iexchange_data_fill().
 *
 * nbc_req    - nonblocking collective request; all working state lives in
 *              nbc_req->data.rd.red_vars.
 * error_code - passed through to ADIOI_R_Iexchange_data_fill() when no
 *              receives are pending.
 */
static void ADIOI_R_Iexchange_data_recv(ADIOI_NBC_Request *nbc_req, int *error_code)
{
    /* unpack the persistent state carried between state-machine steps */
    ADIOI_R_Iexchange_data_vars *vars = nbc_req->data.rd.red_vars;
    ADIO_File fd = vars->fd;
    int *send_size = vars->send_size;
    int *recv_size = vars->recv_size;
    int *count = vars->count;
    int *start_pos = vars->start_pos;
    int *partial_send = vars->partial_send;
    int nprocs = vars->nprocs;
    int myrank = vars->myrank;
    ADIOI_Access *others_req = vars->others_req;
    int iter = vars->iter;
    int *buf_idx = vars->buf_idx;

    int i, j, k = 0, tmp = 0, nprocs_recv, nprocs_send;
    char **recv_buf = NULL;
    MPI_Datatype send_type;

    /* count communication partners in each direction */
    nprocs_recv = 0;
    for (i = 0; i < nprocs; i++) if (recv_size[i]) nprocs_recv++;
    vars->nprocs_recv = nprocs_recv;

    nprocs_send = 0;
    for (i = 0; i < nprocs; i++) if (send_size[i]) nprocs_send++;
    vars->nprocs_send = nprocs_send;

    /* one request slot per send and per recv; recvs occupy the first
     * nprocs_recv slots, sends follow */
    vars->req2 = (MPI_Request *)
        ADIOI_Malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request));
    /* +1 to avoid a 0-size malloc */

    /* post recvs. if buftype_is_contig, data can be directly recd. into
       user buf at location given by buf_idx. else use recv_buf. */
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5032, 0, NULL);
#endif

    if (vars->buftype_is_contig) {
        j = 0;
        for (i = 0; i < nprocs; i++)
            if (recv_size[i]) {
                /* tag myrank+i+100*iter pairs uniquely with the matching
                 * Isend on rank i for this iteration */
                MPI_Irecv(((char *)vars->buf) + buf_idx[i], recv_size[i],
                          MPI_BYTE, i, myrank+i+100*iter, fd->comm,
                          vars->req2 + j);
                j++;
                buf_idx[i] += recv_size[i];
            }
    }
    else {
        /* allocate memory for recv_buf and post receives */
        recv_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char*));
        vars->recv_buf = recv_buf;
        for (i = 0; i < nprocs; i++)
            if (recv_size[i]) recv_buf[i] = (char *)ADIOI_Malloc(recv_size[i]);

        j = 0;
        for (i = 0; i < nprocs; i++)
            if (recv_size[i]) {
                MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i,
                          myrank+i+100*iter, fd->comm,
                          vars->req2 + j);
                j++;
#ifdef RDCOLL_DEBUG
                DBG_FPRINTF(stderr, "node %d, recv_size %d, tag %d \n",
                            myrank, recv_size[i], myrank+i+100*iter);
#endif
            }
    }

    /* create derived datatypes and send data */
    j = 0;
    for (i = 0; i < nprocs; i++) {
        if (send_size[i]) {
            /* take care if the last off-len pair is a partial send */
            if (partial_send[i]) {
                /* temporarily clamp the last length to the part read so
                 * far; restored from tmp after the Isend is posted */
                k = start_pos[i] + count[i] - 1;
                tmp = others_req[i].lens[k];
                others_req[i].lens[k] = partial_send[i];
            }
            ADIOI_Type_create_hindexed_x(count[i],
                    &(others_req[i].lens[start_pos[i]]),
                    &(others_req[i].mem_ptrs[start_pos[i]]),
                    MPI_BYTE, &send_type);
            /* absolute displacement; use MPI_BOTTOM in send */
            MPI_Type_commit(&send_type);
            MPI_Isend(MPI_BOTTOM, 1, send_type, i, myrank+i+100*iter,
                      fd->comm, vars->req2 + nprocs_recv + j);
            /* safe to free immediately: MPI keeps the type alive until
             * the pending Isend completes */
            MPI_Type_free(&send_type);
            if (partial_send[i]) others_req[i].lens[k] = tmp;
            j++;
        }
    }

    /* wait on the receives */
    if (nprocs_recv) {
        nbc_req->data.rd.state = ADIOI_IRC_STATE_R_IEXCHANGE_DATA_RECV;
        return;
    }

    /* nothing to receive: go straight to filling the user buffer */
    ADIOI_R_Iexchange_data_fill(nbc_req, error_code);
}
/* One iteration (vars->m) of the nonblocking two-phase read loop: decide
 * how much of the file domain to read this round, scan others_req to work
 * out which remote requests the read satisfies (filling count[], send_size[],
 * partial_send[], mem_ptrs), then either post the nonblocking contiguous
 * read (parking in state ..._L1_BEGIN) or, if nothing needs reading, go
 * straight to the data-exchange step.
 *
 * nbc_req    - nonblocking collective request; loop state lives in
 *              nbc_req->data.rd.rae_vars and the per-iteration exchange
 *              state is allocated here into nbc_req->data.rd.red_vars.
 * error_code - set by ADIO_IreadContig / downstream exchange routines.
 */
static void ADIOI_Iread_and_exch_l1_begin(ADIOI_NBC_Request *nbc_req,
                                          int *error_code)
{
    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
    ADIO_File fd;
    int nprocs;
    ADIOI_Access *others_req;

    int i, j;
    ADIO_Offset real_off, req_off;
    char *read_buf;
    int *curr_offlen_ptr, *count, *send_size;
    int *partial_send, *start_pos;
    ADIO_Offset size, real_size, for_next_iter;
    int req_len, flag;

    ADIOI_R_Iexchange_data_vars *red_vars = NULL;

    /* loop exit condition */
    if (vars->m >= vars->ntimes) {
        ADIOI_Iread_and_exch_reset(nbc_req, error_code);
        return;
    }

    fd = vars->fd;
    nprocs = vars->nprocs;
    others_req = vars->others_req;

    read_buf = vars->read_buf;
    curr_offlen_ptr = vars->curr_offlen_ptr;
    count = vars->count;
    send_size = vars->send_size;
    partial_send = vars->partial_send;
    start_pos = vars->start_pos;

    /* read buf of size coll_bufsize (or less) */
    /* go through all others_req and check if any are satisfied
       by the current read */

    /* since MPI guarantees that displacements in filetypes are in
       monotonically nondecreasing order, I can maintain a pointer
       (curr_offlen_ptr) to
       current off-len pair for each process in others_req and scan
       further only from there. There is still a problem of filetypes
       such as:  (1, 2, 3 are not process nos. They are just numbers for
       three chunks of data, specified by a filetype.)

       1  -------!--
       2    -----!----
       3       --!-----

       where ! indicates where the current read_size limitation cuts
       through the filetype.  I resolve this by reading up to !, but
       filling the communication buffer only for 1. I copy the portion
       left over for 2 into a tmp_buf for use in the next
       iteration. i.e., 2 and 3 will be satisfied in the next
       iteration. This simplifies filling in the user's buf at the
       other end, as only one off-len pair with incomplete data
       will be sent. I also don't need to send the individual
       offsets and lens along with the data, as the data is being
       sent in a particular order. */

    /* off = start offset in the file for the data actually read in
             this iteration
       size = size of data read corresponding to off
       real_off = off minus whatever data was retained in memory from
                  previous iteration for cases like 2, 3 illustrated above
       real_size = size plus the extra corresponding to real_off
       req_off = off in file for a particular contiguous request minus
                 what was satisfied in previous iteration
       req_size = size corresponding to req_off */

    /* last iteration may read less than a full coll_bufsize */
    size = ADIOI_MIN((unsigned)vars->coll_bufsize,
                     vars->end_loc - vars->st_loc + 1 - vars->done);
    real_off = vars->off - vars->for_curr_iter;
    real_size = size + vars->for_curr_iter;

    vars->size = size;
    vars->real_size = real_size;

    for (i = 0; i < nprocs; i++) count[i] = send_size[i] = 0;
    for_next_iter = 0;

    for (i = 0; i < nprocs; i++) {
#ifdef RDCOLL_DEBUG
        DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n",
                    vars->myrank, i, others_req[i].count);
#endif
        if (others_req[i].count) {
            start_pos[i] = curr_offlen_ptr[i];
            for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) {
                if (partial_send[i]) {
                    /* this request may have been partially satisfied in the
                       previous iteration. */
                    req_off = others_req[i].offsets[j] + partial_send[i];
                    req_len = others_req[i].lens[j] - partial_send[i];
                    partial_send[i] = 0;
                    /* modify the off-len pair to reflect this change */
                    others_req[i].offsets[j] = req_off;
                    others_req[i].lens[j] = req_len;
                }
                else {
                    req_off = others_req[i].offsets[j];
                    req_len = others_req[i].lens[j];
                }
                if (req_off < real_off + real_size) {
                    /* this off-len pair is (at least partly) covered by
                     * the data read this iteration */
                    count[i]++;
                    ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf) + req_off - real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf + req_off - real_off));
                    /* record where in read_buf this request's data lives;
                     * used later to build the hindexed send type */
                    MPI_Address(read_buf + req_off - real_off,
                                &(others_req[i].mem_ptrs[j]));
                    ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));
                    send_size[i] += (int)(ADIOI_MIN(real_off + real_size - req_off,
                                                    (ADIO_Offset)(unsigned)req_len));
                    if (real_off + real_size - req_off < (ADIO_Offset)(unsigned)req_len) {
                        /* request only partially covered; remember how much
                         * was satisfied and stop scanning this process */
                        partial_send[i] = (int)(real_off + real_size - req_off);
                        if ((j+1 < others_req[i].count) &&
                            (others_req[i].offsets[j+1] < real_off + real_size)) {
                            /* this is the case illustrated in the
                               figure above. */
                            for_next_iter = ADIOI_MAX(for_next_iter,
                                    real_off + real_size - others_req[i].offsets[j+1]);
                            /* max because it must cover requests
                               from different processes */
                        }
                        break;
                    }
                }
                else break;
            }
            curr_offlen_ptr[i] = j;
        }
    }
    vars->for_next_iter = for_next_iter;

    /* flag: does anyone need data from this iteration's read? */
    flag = 0;
    for (i = 0; i < nprocs; i++)
        if (count[i]) flag = 1;

    /* create a struct for ADIOI_R_Iexchange_data() */
    red_vars = (ADIOI_R_Iexchange_data_vars *)ADIOI_Calloc(
            1, sizeof(ADIOI_R_Iexchange_data_vars));
    nbc_req->data.rd.red_vars = red_vars;
    red_vars->fd = vars->fd;
    red_vars->buf = vars->buf;
    red_vars->flat_buf = vars->flat_buf;
    red_vars->offset_list = vars->offset_list;
    red_vars->len_list = vars->len_list;
    red_vars->send_size = vars->send_size;
    red_vars->recv_size = vars->recv_size;
    red_vars->count = vars->count;
    red_vars->start_pos = vars->start_pos;
    red_vars->partial_send = vars->partial_send;
    red_vars->recd_from_proc = vars->recd_from_proc;
    red_vars->nprocs = vars->nprocs;
    red_vars->myrank = vars->myrank;
    red_vars->buftype_is_contig = vars->buftype_is_contig;
    red_vars->contig_access_count = vars->contig_access_count;
    red_vars->min_st_offset = vars->min_st_offset;
    red_vars->fd_size = vars->fd_size;
    red_vars->fd_start = vars->fd_start;
    red_vars->fd_end = vars->fd_end;
    red_vars->others_req = vars->others_req;
    red_vars->iter = vars->m;
    red_vars->buftype_extent = vars->buftype_extent;
    red_vars->buf_idx = vars->buf_idx;
    red_vars->next_fn = ADIOI_Iread_and_exch_l1_end;

    if (flag) {
        ADIOI_Assert(size == (int)size);
        /* read past the data carried over from the previous iteration */
        ADIO_IreadContig(fd, read_buf+vars->for_curr_iter, (int)size,
                         MPI_BYTE, ADIO_EXPLICIT_OFFSET, vars->off,
                         &vars->req2, error_code);
        nbc_req->data.rd.state = ADIOI_IRC_STATE_IREAD_AND_EXCH_L1_BEGIN;
        return;
    }

    /* no read needed this iteration; exchange directly */
    ADIOI_R_Iexchange_data(nbc_req, error_code);
}
/* Nonblocking version of ADIOI_GEN_ReadStridedColl() */
/* Kicks off a nonblocking two-phase collective read: allocates the
 * state-machine structures, registers a generalized request with MPI, and
 * starts the nonblocking allgathers of everyone's start/end offsets (or
 * skips straight to independent I/O when cb_read is disabled).
 *
 * Parameters mirror ADIO_ReadStridedColl: fd, user buf/count/datatype,
 * file_ptr_type/offset select the file position.  On return *request is a
 * generalized request the caller completes with MPI_Wait/Test;
 * *error_code reports any failure from the initial MPI_Iallgather calls.
 */
void ADIOI_GEN_IreadStridedColl(ADIO_File fd, void *buf, int count,
                                MPI_Datatype datatype, int file_ptr_type,
                                ADIO_Offset offset, MPI_Request *request,
                                int *error_code)
{
    /* Uses a generalized version of the extended two-phase method described
       in "An Extended Two-Phase Method for Accessing Sections of
       Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
       Scientific Programming, (5)4:301--317, Winter 1996.
       http://www.mcs.anl.gov/home/thakur/ext2ph.ps */

    ADIOI_NBC_Request *nbc_req = NULL;
    ADIOI_GEN_IreadStridedColl_vars *vars = NULL;
    int nprocs, myrank;
#ifdef RDCOLL_DEBUG
    int i;
#endif

    /* FIXME: need an implementation of ADIOI_IOIstridedColl
    if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
        ADIOI_IOIstridedColl(fd, buf, count, ADIOI_READ, datatype,
                             file_ptr_type, offset, request, error_code);
        return;
    }
    */

    /* top-level struct keeping the status of function progress */
    nbc_req = (ADIOI_NBC_Request *)ADIOI_Calloc(1, sizeof(ADIOI_NBC_Request));
    nbc_req->rdwr = ADIOI_READ;

    /* create a generalized request */
    if (ADIOI_GEN_greq_class == 0) {
        /* first call: register the read state machine's callbacks once */
        MPIX_Grequest_class_create(ADIOI_GEN_irc_query_fn,
                                   ADIOI_GEN_irc_free_fn, MPIU_Greq_cancel_fn,
                                   ADIOI_GEN_irc_poll_fn, ADIOI_GEN_irc_wait_fn,
                                   &ADIOI_GEN_greq_class);
    }
    MPIX_Grequest_class_allocate(ADIOI_GEN_greq_class, nbc_req, request);
    /* keep our own handle so the poll/wait callbacks can complete it */
    memcpy(&nbc_req->req, request, sizeof(MPI_Request));

    /* create a struct for parameters and variables */
    vars = (ADIOI_GEN_IreadStridedColl_vars *)ADIOI_Calloc(
            1, sizeof(ADIOI_GEN_IreadStridedColl_vars));
    nbc_req->data.rd.rsc_vars = vars;

    /* save the parameters */
    vars->fd = fd;
    vars->buf = buf;
    vars->count = count;
    vars->datatype = datatype;
    vars->file_ptr_type = file_ptr_type;
    vars->offset = offset;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);
    vars->nprocs = nprocs;
    vars->myrank = myrank;

    /* number of aggregators, cb_nodes, is stored in the hints */
    vars->nprocs_for_coll = fd->hints->cb_nodes;
    vars->orig_fp = fd->fp_ind;

    /* only check for interleaving if cb_read isn't disabled */
    if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
        /* For this process's request, calculate the list of offsets and
           lengths in the file and determine the start and end offsets. */

        /* Note: end_offset points to the last byte-offset that will be accessed.
           e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/

        ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
                              &vars->offset_list, &vars->len_list,
                              &vars->start_offset, &vars->end_offset,
                              &vars->contig_access_count);

#ifdef RDCOLL_DEBUG
        for (i = 0; i < vars->contig_access_count; i++) {
            DBG_FPRINTF(stderr, "rank %d  off %lld  len %lld\n",
                        myrank, vars->offset_list[i], vars->len_list[i]);
        }
#endif

        /* each process communicates its start and end offsets to other
           processes. The result is an array each of start and end offsets
           stored in order of process rank. */

        vars->st_offsets = (ADIO_Offset *)ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
        vars->end_offsets = (ADIO_Offset *)ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));

        /* nonblocking allgathers; the poll function resumes the state
         * machine once both complete */
        *error_code = MPI_Iallgather(&vars->start_offset, 1, ADIO_OFFSET,
                                     vars->st_offsets, 1, ADIO_OFFSET,
                                     fd->comm, &vars->req_offset[0]);
        if (*error_code != MPI_SUCCESS) return;
        *error_code = MPI_Iallgather(&vars->end_offset, 1, ADIO_OFFSET,
                                     vars->end_offsets, 1, ADIO_OFFSET,
                                     fd->comm, &vars->req_offset[1]);

        nbc_req->data.rd.state = ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL;
        return;
    }

    /* collective buffering disabled: fall back to independent I/O */
    ADIOI_GEN_IreadStridedColl_indio(nbc_req, error_code);
}
/* ADIOI_Count_contiguous_blocks
 *
 * Returns number of contiguous blocks in type, and also updates
 * curr_index to reflect the space for the additional blocks.
 *
 * Recursively decodes the datatype with MPI_Type_get_envelope /
 * MPI_Type_get_contents and mirrors the index arithmetic that
 * ADIOI_Flatten() performs, so that the caller can size the
 * blocklens/indices arrays before flattening.
 *
 * ASSUMES THAT TYPE IS NOT A BASIC!!! */
MPI_Count ADIOI_Count_contiguous_blocks(MPI_Datatype datatype, MPI_Count *curr_index)
{
    int i, n;
    MPI_Count count=0, prev_index, num, basic_num;
    int top_count, combiner, old_combiner, old_is_contig;
    int nints, nadds, ntypes, old_nints, old_nadds, old_ntypes;
    int *ints;
    MPI_Aint *adds; /* Make no assumptions about +/- sign on these */
    MPI_Datatype *types;

    /* decode the outermost constructor of this datatype */
    MPI_Type_get_envelope(datatype, &nints, &nadds, &ntypes, &combiner);
    /* +1 on each size avoids a 0-byte malloc for constructors with no
     * entries of that kind */
    ints = (int *) ADIOI_Malloc((nints+1)*sizeof(int));
    adds = (MPI_Aint *) ADIOI_Malloc((nadds+1)*sizeof(MPI_Aint));
    types = (MPI_Datatype *) ADIOI_Malloc((ntypes+1)*sizeof(MPI_Datatype));
    MPI_Type_get_contents(datatype, nints, nadds, ntypes, ints, adds, types);

    switch (combiner) {
#ifdef MPIIMPL_HAVE_MPI_COMBINER_DUP
    case MPI_COMBINER_DUP:
        /* a dup contributes exactly what its inner type contributes */
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
                              &old_ntypes, &old_combiner);
        ADIOI_Datatype_iscontig(types[0], &old_is_contig);
        if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
            count = ADIOI_Count_contiguous_blocks(types[0], curr_index);
        else {
                count = 1;
                (*curr_index)++;
        }
        break;
#endif
#ifdef MPIIMPL_HAVE_MPI_COMBINER_SUBARRAY
    case MPI_COMBINER_SUBARRAY:
        {
            /* rebuild the subarray as an explicit derived type and count
             * that instead; ints[] layout: ndims, sizes, subsizes, starts,
             * order (see MPI_Type_create_subarray contents) */
            int dims = ints[0];
            MPI_Datatype stype;

            ADIO_Type_create_subarray(dims,
                                      &ints[1],        /* sizes */
                                      &ints[dims+1],   /* subsizes */
                                      &ints[2*dims+1], /* starts */
                                      ints[3*dims+1],  /* order */
                                      types[0],        /* type */
                                      &stype);
            count = ADIOI_Count_contiguous_blocks(stype, curr_index);
            /* curr_index will have already been updated; just pass
             * count back up.
             */
            MPI_Type_free(&stype);
        }
        break;
#endif
#ifdef MPIIMPL_HAVE_MPI_COMBINER_DARRAY
    case MPI_COMBINER_DARRAY:
        {
            /* same trick for distributed arrays; ints[] layout: size, rank,
             * ndims, gsizes, distribs, dargs, psizes, order */
            int dims = ints[2];
            MPI_Datatype dtype;

            ADIO_Type_create_darray(ints[0],         /* size */
                                    ints[1],         /* rank */
                                    dims,
                                    &ints[3],        /* gsizes */
                                    &ints[dims+3],   /* distribs */
                                    &ints[2*dims+3], /* dargs */
                                    &ints[3*dims+3], /* psizes */
                                    ints[4*dims+3],  /* order */
                                    types[0],
                                    &dtype);
            count = ADIOI_Count_contiguous_blocks(dtype, curr_index);
            /* curr_index will have already been updated; just pass
             * count back up.
             */
            MPI_Type_free(&dtype);
        }
        break;
#endif
    case MPI_COMBINER_CONTIGUOUS:
        top_count = ints[0];
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
                              &old_ntypes, &old_combiner);
        ADIOI_Datatype_iscontig(types[0], &old_is_contig);

        prev_index = *curr_index;
        if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
            count = ADIOI_Count_contiguous_blocks(types[0], curr_index);
        else count = 1;

        if (prev_index == *curr_index)
            /* simplest case, made up of basic or contiguous types */
            (*curr_index)++;
        else {
            /* made up of noncontiguous derived types */
            /* the inner type's blocks are replicated top_count times */
            num = *curr_index - prev_index;
            count *= top_count;
            *curr_index += (top_count - 1)*num;
        }
        break;

    case MPI_COMBINER_VECTOR:
    case MPI_COMBINER_HVECTOR:
    case MPI_COMBINER_HVECTOR_INTEGER:
        top_count = ints[0];
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
                              &old_ntypes, &old_combiner);
        ADIOI_Datatype_iscontig(types[0], &old_is_contig);

        prev_index = *curr_index;
        if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
            count = ADIOI_Count_contiguous_blocks(types[0], curr_index);
        else count = 1;

        if (prev_index == *curr_index) {
            /* simplest case, vector of basic or contiguous types */
            count = top_count;
            *curr_index += count;
        }
        else {
            /* vector of noncontiguous derived types */
            num = *curr_index - prev_index;

            /* The noncontiguous types have to be replicated blocklen times
               and then strided. */
            /* ints[1] is the vector blocklength */
            count *= ints[1] * top_count;

            /* First one */
            *curr_index += (ints[1] - 1)*num;

            /* Now repeat with strides. */
            num = *curr_index - prev_index;
            *curr_index += (top_count - 1)*num;
        }
        break;

    case MPI_COMBINER_INDEXED:
    case MPI_COMBINER_HINDEXED:
    case MPI_COMBINER_HINDEXED_INTEGER:
        top_count = ints[0];
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
                              &old_ntypes, &old_combiner);
        ADIOI_Datatype_iscontig(types[0], &old_is_contig);

        prev_index = *curr_index;
        if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
            count = ADIOI_Count_contiguous_blocks(types[0], curr_index);
        else count = 1;

        if (prev_index == *curr_index) {
            /* simplest case, indexed type made up of basic or contiguous types */
            count = top_count;
            *curr_index += count;
        }
        else {
            /* indexed type made up of noncontiguous derived types */
            basic_num = *curr_index - prev_index;

            /* The noncontiguous types have to be replicated blocklens[i]
               times and then strided. */
            /* ints[1..top_count] are the per-block blocklens */
            *curr_index += (ints[1]-1) * basic_num;
            count *= ints[1];

            /* Now repeat with strides. */
            for (i=1; i<top_count; i++) {
                count += ints[1+i] * basic_num;
                *curr_index += ints[1+i] * basic_num;
            }
        }
        break;

#if defined HAVE_DECL_MPI_COMBINER_HINDEXED_BLOCK && HAVE_DECL_MPI_COMBINER_HINDEXED_BLOCK
    case MPI_COMBINER_HINDEXED_BLOCK:
#endif
    case MPI_COMBINER_INDEXED_BLOCK:
        top_count = ints[0];
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
                              &old_ntypes, &old_combiner);
        ADIOI_Datatype_iscontig(types[0], &old_is_contig);

        prev_index = *curr_index;
        if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
            count = ADIOI_Count_contiguous_blocks(types[0], curr_index);
        else count = 1;

        if (prev_index == *curr_index) {
            /* simplest case, indexed type made up of basic or contiguous types */
            count = top_count;
            *curr_index += count;
        }
        else {
            /* indexed type made up of noncontiguous derived types */
            basic_num = *curr_index - prev_index;

            /* The noncontiguous types have to be replicated blocklens[i]
               times and then strided. */
            /* ints[1] is the single shared blocklength */
            *curr_index += (ints[1]-1) * basic_num;
            count *= ints[1];

            /* Now repeat with strides. */
            *curr_index += (top_count-1) * count;
            count *= top_count;
        }
        break;

    case MPI_COMBINER_STRUCT:
    case MPI_COMBINER_STRUCT_INTEGER:
        top_count = ints[0];
        count = 0;
        /* sum contributions of each member; ints[1+n] is member n's
         * blocklen */
        for (n=0; n<top_count; n++) {
            MPI_Type_get_envelope(types[n], &old_nints, &old_nadds,
                                  &old_ntypes, &old_combiner);
            ADIOI_Datatype_iscontig(types[n], &old_is_contig);

            prev_index = *curr_index;
            if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
                count += ADIOI_Count_contiguous_blocks(types[n], curr_index);

            if (prev_index == *curr_index) {
                /* simplest case, current type is basic or contiguous types */
                count++;
                (*curr_index)++;
            }
            else {
                /* current type made up of noncontiguous derived types */
                /* The current type has to be replicated blocklens[n] times */
                num = *curr_index - prev_index;
                count += (ints[1+n]-1)*num;
                (*curr_index) += (ints[1+n]-1)*num;
            }
        }
        break;

    case MPI_COMBINER_RESIZED:
        /* treat it as a struct with lb, type, ub */

        /* add 2 for lb and ub */
        (*curr_index) += 2;
        count += 2;

        /* add for datatype */
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
                              &old_ntypes, &old_combiner);
        ADIOI_Datatype_iscontig(types[0], &old_is_contig);

        if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) {
            count += ADIOI_Count_contiguous_blocks(types[0], curr_index);
        }
        else {
            /* basic or contiguous type */
            count++;
            (*curr_index)++;
        }
        break;

    default:
        /* TODO: FIXME */
        DBG_FPRINTF(stderr, "Error: Unsupported datatype passed to ADIOI_Count_contiguous_blocks, combiner = %d\n", combiner);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

#ifndef MPISGI
/* There is a bug in SGI's impl. of MPI_Type_get_contents. It doesn't
   return new datatypes. Therefore no need to free. */
    for (i=0; i<ntypes; i++) {
        MPI_Type_get_envelope(types[i], &old_nints, &old_nadds, &old_ntypes,
                              &old_combiner);
        if (old_combiner != MPI_COMBINER_NAMED) MPI_Type_free(types+i);
    }
#endif

    ADIOI_Free(ints);
    ADIOI_Free(adds);
    ADIOI_Free(types);
    return count;
}
/* flatten datatype and add it to Flatlist */
/* Entry point for datatype flattening: if the datatype is noncontiguous
 * and not already cached on the global ADIOI_Flatlist, append a new
 * ADIOI_Flatlist_node for it, size its blocklens/indices arrays via
 * ADIOI_Count_contiguous_blocks(), and fill them in (with
 * MPIR_Type_flatten when the MPI implementation provides it, otherwise
 * with ADIOI_Flatten + ADIOI_Optimize_flattened).
 *
 * NOTE(review): appending via "flat = prev" assumes ADIOI_Flatlist always
 * has at least one node (prev would be NULL on an empty list) -- the head
 * is presumably created at init time elsewhere; confirm before reuse.
 */
void ADIOI_Flatten_datatype(MPI_Datatype datatype)
{
#ifdef HAVE_MPIR_TYPE_FLATTEN
    MPI_Aint flatten_idx;
#endif
    MPI_Count curr_index=0;
    int is_contig;
    ADIOI_Flatlist_node *flat, *prev=0;

    /* check if necessary to flatten. */

    /* is it entirely contiguous? */
    ADIOI_Datatype_iscontig(datatype, &is_contig);
#ifdef FLATTEN_DEBUG
    DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: is_contig %#X\n",is_contig);
#endif
    /* contiguous types never need a flattened representation */
    if (is_contig) return;

    /* has it already been flattened? */
    flat = ADIOI_Flatlist;
    while (flat) {
        if (flat->type == datatype) {
#ifdef FLATTEN_DEBUG
            DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: found datatype %#X\n", datatype);
#endif
            return;
        }
        else {
            prev = flat;
            flat = flat->next;
        }
    }

    /* flatten and add to the list */
    /* prev is the tail of the list after the search loop above */
    flat = prev;
    flat->next = (ADIOI_Flatlist_node *)ADIOI_Malloc(sizeof(ADIOI_Flatlist_node));
    flat = flat->next;

    flat->type = datatype;
    flat->next = NULL;
    flat->blocklens = NULL;
    flat->indices = NULL;

    /* first pass: count blocks so the arrays can be sized exactly */
    flat->count = ADIOI_Count_contiguous_blocks(datatype, &curr_index);
#ifdef FLATTEN_DEBUG
    DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: count %llX, cur_idx = %#llX\n",flat->count,curr_index);
#endif
/*    DBG_FPRINTF(stderr, "%d\n", flat->count);*/

    if (flat->count) {
        flat->blocklens = (ADIO_Offset *) ADIOI_Malloc(flat->count * sizeof(ADIO_Offset));
        flat->indices = (ADIO_Offset *) ADIOI_Malloc(flat->count * sizeof(ADIO_Offset));
    }

    /* second pass: fill the arrays */
    curr_index = 0;
#ifdef HAVE_MPIR_TYPE_FLATTEN
    /* implementation-provided fast path */
    flatten_idx = (MPI_Aint) flat->count;
    MPIR_Type_flatten(datatype, flat->indices, flat->blocklens, &flatten_idx);
#ifdef FLATTEN_DEBUG
    DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: MPIR_Type_flatten\n");
#endif
#else
    ADIOI_Flatten(datatype, flat, 0, &curr_index);
#ifdef FLATTEN_DEBUG
    DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: ADIOI_Flatten\n");
#endif

    /* merge adjacent contiguous entries produced by the flattening */
    ADIOI_Optimize_flattened(flat);
#endif
/* debug */
#ifdef FLATTEN_DEBUG
    {
        int i;
        for (i=0; i<flat->count; i++)
            DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: i %#X, blocklens %#llX, indices %#llX\n",
                        i, flat->blocklens[i], flat->indices[i] );
    }
#endif
}
/* ADIOI_Flatten() * * Assumption: input datatype is not a basic!!!! */ void ADIOI_Flatten(MPI_Datatype datatype, ADIOI_Flatlist_node *flat, ADIO_Offset st_offset, MPI_Count *curr_index) { int i, k, m, n, basic_num, nonzeroth, is_hindexed_block=0; int combiner, old_combiner, old_is_contig; int nints, nadds, ntypes, old_nints, old_nadds, old_ntypes; /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset top_count; MPI_Count j, old_size, prev_index, num; MPI_Aint old_extent;/* Assume extents are non-negative */ int *ints; MPI_Aint *adds; /* Make no assumptions about +/- sign on these */ MPI_Datatype *types; MPI_Type_get_envelope(datatype, &nints, &nadds, &ntypes, &combiner); ints = (int *) ADIOI_Malloc((nints+1)*sizeof(int)); adds = (MPI_Aint *) ADIOI_Malloc((nadds+1)*sizeof(MPI_Aint)); types = (MPI_Datatype *) ADIOI_Malloc((ntypes+1)*sizeof(MPI_Datatype)); MPI_Type_get_contents(datatype, nints, nadds, ntypes, ints, adds, types); #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: st_offset %#llX, curr_index %#llX\n",st_offset,*curr_index); DBG_FPRINTF(stderr,"ADIOI_Flatten:: nints %#X, nadds %#X, ntypes %#X\n",nints, nadds, ntypes); for(i=0; i< nints; ++i) { DBG_FPRINTF(stderr,"ADIOI_Flatten:: ints[%d]=%#X\n",i,ints[i]); } for(i=0; i< nadds; ++i) { DBG_FPRINTF(stderr,"ADIOI_Flatten:: adds[%d]="MPI_AINT_FMT_HEX_SPEC"\n",i,adds[i]); } for(i=0; i< ntypes; ++i) { DBG_FPRINTF(stderr,"ADIOI_Flatten:: types[%d]=%#llX\n",i,(unsigned long long)(unsigned long)types[i]); } #endif /* Chapter 4, page 83: when processing datatypes, note this item from the * standard: Most datatype constructors have replication count or block length arguments. Allowed values are non-negative integers. If the value is zero, no elements are generated in the type map and there is no effect on datatype bounds or extent. 
*/ switch (combiner) { #ifdef MPIIMPL_HAVE_MPI_COMBINER_DUP case MPI_COMBINER_DUP: #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_DUP\n"); #endif MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) ADIOI_Flatten(types[0], flat, st_offset, curr_index); break; #endif #ifdef MPIIMPL_HAVE_MPI_COMBINER_SUBARRAY case MPI_COMBINER_SUBARRAY: { int dims = ints[0]; MPI_Datatype stype; #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_SUBARRAY\n"); #endif ADIO_Type_create_subarray(dims, &ints[1], /* sizes */ &ints[dims+1], /* subsizes */ &ints[2*dims+1], /* starts */ ints[3*dims+1], /* order */ types[0], /* type */ &stype); ADIOI_Flatten(stype, flat, st_offset, curr_index); MPI_Type_free(&stype); } break; #endif #ifdef MPIIMPL_HAVE_MPI_COMBINER_DARRAY case MPI_COMBINER_DARRAY: { int dims = ints[2]; MPI_Datatype dtype; #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_DARRAY\n"); #endif ADIO_Type_create_darray(ints[0], /* size */ ints[1], /* rank */ dims, &ints[3], /* gsizes */ &ints[dims+3], /* distribs */ &ints[2*dims+3], /* dargs */ &ints[3*dims+3], /* psizes */ ints[4*dims+3], /* order */ types[0], &dtype); #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_DARRAY <ADIOI_Flatten(dtype, flat->indices[%#X] %#llX, flat->blocklens[%#X] %#llX, st_offset %#llX, curr_index %#llX);\n", 0, flat->indices[0], 0, flat->blocklens[0], st_offset, *curr_index); #endif ADIOI_Flatten(dtype, flat, st_offset, curr_index); #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_DARRAY >ADIOI_Flatten(dtype, flat->indices[%#X] %#llX, flat->blocklens[%#X] %#llX, st_offset %#llX, curr_index %#llX);\n", 0, flat->indices[0], 0, flat->blocklens[0], st_offset, *curr_index); #endif MPI_Type_free(&dtype); } break; #endif case MPI_COMBINER_CONTIGUOUS: #ifdef 
FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_CONTIGUOUS\n"); #endif top_count = ints[0]; MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) ADIOI_Flatten(types[0], flat, st_offset, curr_index); if (prev_index == *curr_index) { /* simplest case, made up of basic or contiguous types */ j = *curr_index; flat->indices[j] = st_offset; MPI_Type_size_x(types[0], &old_size); flat->blocklens[j] = top_count * old_size; #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",j, flat->indices[j], j, flat->blocklens[j]); #endif (*curr_index)++; } else { /* made up of noncontiguous derived types */ j = *curr_index; num = *curr_index - prev_index; /* The noncontiguous types have to be replicated count times */ MPI_Type_extent(types[0], &old_extent); for (m=1; m<top_count; m++) { for (i=0; i<num; i++) { flat->indices[j] = flat->indices[j-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[j] = flat->blocklens[j-num]; #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: derived flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",j, flat->indices[j], j, flat->blocklens[j]); #endif j++; } } *curr_index = j; } break; case MPI_COMBINER_VECTOR: #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_VECTOR\n"); #endif top_count = ints[0]; MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) ADIOI_Flatten(types[0], flat, st_offset, curr_index); if (prev_index == *curr_index) { /* simplest case, vector of basic or contiguous types */ /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ 
ADIO_Offset blocklength = ints[1], stride = ints[2]; j = *curr_index; flat->indices[j] = st_offset; MPI_Type_size_x(types[0], &old_size); flat->blocklens[j] = blocklength * old_size; for (i=j+1; i<j+top_count; i++) { flat->indices[i] = flat->indices[i-1] + stride * old_size; flat->blocklens[i] = flat->blocklens[j]; } *curr_index = i; } else { /* vector of noncontiguous derived types */ /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset blocklength = ints[1], stride = ints[2]; j = *curr_index; num = *curr_index - prev_index; /* The noncontiguous types have to be replicated blocklen times and then strided. Replicate the first one. */ MPI_Type_extent(types[0], &old_extent); for (m=1; m<blocklength; m++) { for (i=0; i<num; i++) { flat->indices[j] = flat->indices[j-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[j] = flat->blocklens[j-num]; j++; } } *curr_index = j; /* Now repeat with strides. */ num = *curr_index - prev_index; for (i=1; i<top_count; i++) { for (m=0; m<num; m++) { flat->indices[j] = flat->indices[j-num] + stride * ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[j] = flat->blocklens[j-num]; j++; } } *curr_index = j; } break; case MPI_COMBINER_HVECTOR: case MPI_COMBINER_HVECTOR_INTEGER: #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_HVECTOR_INTEGER\n"); #endif top_count = ints[0]; MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) ADIOI_Flatten(types[0], flat, st_offset, curr_index); if (prev_index == *curr_index) { /* simplest case, vector of basic or contiguous types */ /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset blocklength = ints[1]; j = *curr_index; flat->indices[j] = st_offset; MPI_Type_size_x(types[0], &old_size); 
flat->blocklens[j] = blocklength * old_size; for (i=j+1; i<j+top_count; i++) { flat->indices[i] = flat->indices[i-1] + adds[0]; flat->blocklens[i] = flat->blocklens[j]; } *curr_index = i; } else { /* vector of noncontiguous derived types */ /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset blocklength = ints[1]; j = *curr_index; num = *curr_index - prev_index; /* The noncontiguous types have to be replicated blocklen times and then strided. Replicate the first one. */ MPI_Type_extent(types[0], &old_extent); for (m=1; m<blocklength; m++) { for (i=0; i<num; i++) { flat->indices[j] = flat->indices[j-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[j] = flat->blocklens[j-num]; j++; } } *curr_index = j; /* Now repeat with strides. */ num = *curr_index - prev_index; for (i=1; i<top_count; i++) { for (m=0; m<num; m++) { flat->indices[j] = flat->indices[j-num] + adds[0]; flat->blocklens[j] = flat->blocklens[j-num]; j++; } } *curr_index = j; } break; case MPI_COMBINER_INDEXED: #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_INDEXED\n"); #endif top_count = ints[0]; MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); MPI_Type_extent(types[0], &old_extent); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) { /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset stride = ints[top_count+1]; ADIOI_Flatten(types[0], flat, st_offset+stride* ADIOI_AINT_CAST_TO_OFFSET old_extent, curr_index); } if (prev_index == *curr_index) { /* simplest case, indexed type made up of basic or contiguous types */ j = *curr_index; for (i=j, nonzeroth=i; i<j+top_count; i++) { /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset blocklength = ints[1+i-j], stride = ints[top_count+1+i-j]; if (blocklength > 
0) { flat->indices[nonzeroth] = st_offset + stride* ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[nonzeroth] = blocklength* ADIOI_AINT_CAST_TO_OFFSET old_extent; nonzeroth++; } else { flat->count--; /* don't count/consider any zero-length blocklens */ } } *curr_index = i; } else { /* indexed type made up of noncontiguous derived types */ j = *curr_index; num = *curr_index - prev_index; basic_num = num; /* The noncontiguous types have to be replicated blocklens[i] times and then strided. Replicate the first one. */ for (m=1; m<ints[1]; m++) { for (i=0, nonzeroth = j; i<num; i++) { if (flat->blocklens[j-num] > 0) { flat->indices[nonzeroth] = flat->indices[nonzeroth-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[nonzeroth] = flat->blocklens[nonzeroth-num]; j++; nonzeroth++; } else { flat->count --; } } } *curr_index = j; /* Now repeat with strides. */ for (i=1; i<top_count; i++) { num = *curr_index - prev_index; prev_index = *curr_index; for (m=0, nonzeroth=j; m<basic_num; m++) { /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset stride = ints[top_count+1+i]-ints[top_count+i]; if (flat->blocklens[j-num] > 0 ) { flat->indices[nonzeroth] = flat->indices[j-num] + stride* ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[nonzeroth] = flat->blocklens[j-num]; j++; nonzeroth++; } else { flat->count--; } } *curr_index = j; for (m=1; m<ints[1+i]; m++) { for (k=0, nonzeroth=j; k<basic_num; k++) { if (flat->blocklens[j-basic_num] > 0) { flat->indices[nonzeroth] = flat->indices[j-basic_num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[nonzeroth] = flat->blocklens[j-basic_num]; j++; nonzeroth++; } else { flat->count --; } } } *curr_index = j; } } break; #if defined HAVE_DECL_MPI_COMBINER_HINDEXED_BLOCK && HAVE_DECL_MPI_COMBINER_HINDEXED_BLOCK case MPI_COMBINER_HINDEXED_BLOCK: is_hindexed_block=1; /* deliberate fall-through */ #endif case MPI_COMBINER_INDEXED_BLOCK: #ifdef FLATTEN_DEBUG 
DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_INDEXED_BLOCK\n"); #endif top_count = ints[0]; MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); MPI_Type_extent(types[0], &old_extent); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) { /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset stride = ints[1+1]; if (is_hindexed_block) { ADIOI_Flatten(types[0], flat, st_offset+adds[0], curr_index); } else { ADIOI_Flatten(types[0], flat, st_offset+stride* ADIOI_AINT_CAST_TO_OFFSET old_extent, curr_index); } } if (prev_index == *curr_index) { /* simplest case, indexed type made up of basic or contiguous types */ j = *curr_index; for (i=j; i<j+top_count; i++) { /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset blocklength = ints[1]; if (is_hindexed_block) { flat->indices[i] = st_offset + adds[i-j]; } else { ADIO_Offset stride = ints[1+1+i-j]; flat->indices[i] = st_offset + stride* ADIOI_AINT_CAST_TO_OFFSET old_extent; } flat->blocklens[i] = blocklength* ADIOI_AINT_CAST_TO_OFFSET old_extent; } *curr_index = i; } else { /* vector of noncontiguous derived types */ j = *curr_index; num = *curr_index - prev_index; /* The noncontiguous types have to be replicated blocklens[i] times and then strided. Replicate the first one. */ for (m=1; m<ints[1]; m++) { for (i=0; i<num; i++) { if (is_hindexed_block) { /* this is the one place the hindexed case uses the * extent of a type */ MPI_Type_extent(types[0], &old_extent); } flat->indices[j] = flat->indices[j-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[j] = flat->blocklens[j-num]; j++; } } *curr_index = j; /* Now repeat with strides. 
*/ num = *curr_index - prev_index; for (i=1; i<top_count; i++) { for (m=0; m<num; m++) { if (is_hindexed_block) { flat->indices[j] = flat->indices[j-num] + adds[i] - adds[i-1]; } else { /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset stride = ints[2+i]-ints[1+i]; flat->indices[j] = flat->indices[j-num] + stride* ADIOI_AINT_CAST_TO_OFFSET old_extent; } flat->blocklens[j] = flat->blocklens[j-num]; j++; } } *curr_index = j; } break; case MPI_COMBINER_HINDEXED: case MPI_COMBINER_HINDEXED_INTEGER: #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_HINDEXED_INTEGER\n"); #endif top_count = ints[0]; MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) { ADIOI_Flatten(types[0], flat, st_offset+adds[0], curr_index); } if (prev_index == *curr_index) { /* simplest case, indexed type made up of basic or contiguous types */ j = *curr_index; MPI_Type_size_x(types[0], &old_size); for (i=j, nonzeroth=j; i<j+top_count; i++) { if (ints[1+i-j] > 0) { /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ ADIO_Offset blocklength = ints[1+i-j]; flat->indices[nonzeroth] = st_offset + adds[i-j]; flat->blocklens[nonzeroth] = blocklength*old_size; nonzeroth++; } else { flat->count--; } } *curr_index = i; } else { /* indexed type made up of noncontiguous derived types */ j = *curr_index; num = *curr_index - prev_index; basic_num = num; /* The noncontiguous types have to be replicated blocklens[i] times and then strided. Replicate the first one. 
*/ MPI_Type_extent(types[0], &old_extent); for (m=1; m<ints[1]; m++) { for (i=0, nonzeroth=j; i<num; i++) { if (flat->blocklens[j-num] > 0) { flat->indices[nonzeroth] = flat->indices[j-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[nonzeroth] = flat->blocklens[j-num]; j++; nonzeroth++; } else { flat->count--; } } } *curr_index = j; /* Now repeat with strides. */ for (i=1; i<top_count; i++) { num = *curr_index - prev_index; prev_index = *curr_index; for (m=0, nonzeroth=j; m<basic_num; m++) { if (flat->blocklens[j-num] > 0) { flat->indices[nonzeroth] = flat->indices[j-num] + adds[i] - adds[i-1]; flat->blocklens[nonzeroth] = flat->blocklens[j-num]; j++; nonzeroth++; } else { flat->count--; } } *curr_index = j; for (m=1; m<ints[1+i]; m++) { for (k=0,nonzeroth=j; k<basic_num; k++) { if (flat->blocklens[j-basic_num] >0) { flat->indices[nonzeroth] = flat->indices[j-basic_num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[nonzeroth] = flat->blocklens[j-basic_num]; j++; nonzeroth++; } } } *curr_index = j; } } break; case MPI_COMBINER_STRUCT: case MPI_COMBINER_STRUCT_INTEGER: #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_STRUCT_INTEGER\n"); #endif top_count = ints[0]; for (n=0; n<top_count; n++) { MPI_Type_get_envelope(types[n], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[n], &old_is_contig); prev_index = *curr_index; if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) ADIOI_Flatten(types[n], flat, st_offset+adds[n], curr_index); if (prev_index == *curr_index) { /* simplest case, current type is basic or contiguous types */ /* By using ADIO_Offset we preserve +/- sign and avoid >2G integer arithmetic problems */ if (ints[1+n] > 0 || types[n] == MPI_LB || types[n] == MPI_UB) { ADIO_Offset blocklength = ints[1+n]; j = *curr_index; flat->indices[j] = st_offset + adds[n]; MPI_Type_size_x(types[n], &old_size); flat->blocklens[j] = blocklength * old_size; #ifdef FLATTEN_DEBUG 
DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple adds[%#X] "MPI_AINT_FMT_HEX_SPEC", flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",n,adds[n],j, flat->indices[j], j, flat->blocklens[j]); #endif (*curr_index)++; } } else { /* current type made up of noncontiguous derived types */ j = *curr_index; num = *curr_index - prev_index; /* The current type has to be replicated blocklens[n] times */ MPI_Type_extent(types[n], &old_extent); for (m=1; m<ints[1+n]; m++) { for (i=0; i<num; i++) { flat->indices[j] = flat->indices[j-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent; flat->blocklens[j] = flat->blocklens[j-num]; #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple old_extent "MPI_AINT_FMT_HEX_SPEC", flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",old_extent,j, flat->indices[j], j, flat->blocklens[j]); #endif j++; } } *curr_index = j; } } break; case MPI_COMBINER_RESIZED: #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_RESIZED\n"); #endif /* This is done similar to a type_struct with an lb, datatype, ub */ /* handle the Lb */ j = *curr_index; flat->indices[j] = st_offset + adds[0]; /* this zero-length blocklens[] element, unlike eleswhere in the * flattening code, is correct and is used to indicate a lower bound * marker */ flat->blocklens[j] = 0; #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple adds[%#X] "MPI_AINT_FMT_HEX_SPEC", flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",0,adds[0],j, flat->indices[j], j, flat->blocklens[j]); #endif (*curr_index)++; /* handle the datatype */ MPI_Type_get_envelope(types[0], &old_nints, &old_nadds, &old_ntypes, &old_combiner); ADIOI_Datatype_iscontig(types[0], &old_is_contig); if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) { ADIOI_Flatten(types[0], flat, st_offset+adds[0], curr_index); } else { /* current type is basic or contiguous */ j = *curr_index; flat->indices[j] = st_offset; MPI_Type_size_x(types[0], &old_size); flat->blocklens[j] = 
old_size; #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple adds[%#X] "MPI_AINT_FMT_HEX_SPEC", flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",0,adds[0],j, flat->indices[j], j, flat->blocklens[j]); #endif (*curr_index)++; } /* take care of the extent as a UB */ j = *curr_index; flat->indices[j] = st_offset + adds[0] + adds[1]; /* again, zero-element ok: an upper-bound marker explicitly set by the * constructor of this resized type */ flat->blocklens[j] = 0; #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple adds[%#X] "MPI_AINT_FMT_HEX_SPEC", flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",1,adds[1],j, flat->indices[j], j, flat->blocklens[j]); #endif (*curr_index)++; break; default: /* TODO: FIXME (requires changing prototypes to return errors...) */ DBG_FPRINTF(stderr, "Error: Unsupported datatype passed to ADIOI_Flatten\n"); MPI_Abort(MPI_COMM_WORLD, 1); } #ifndef MPISGI /* There is a bug in SGI's impl. of MPI_Type_get_contents. It doesn't return new datatypes. Therefore no need to free. */ for (i=0; i<ntypes; i++) { MPI_Type_get_envelope(types[i], &old_nints, &old_nadds, &old_ntypes, &old_combiner); if (old_combiner != MPI_COMBINER_NAMED) MPI_Type_free(types+i); } #endif ADIOI_Free(ints); ADIOI_Free(adds); ADIOI_Free(types); #ifdef FLATTEN_DEBUG DBG_FPRINTF(stderr,"ADIOI_Flatten:: return st_offset %#llX, curr_index %#llX\n",st_offset,*curr_index); #endif }
/* Generic collective (two-phase) strided read entry point.
 *
 * Uses a generalized version of the extended two-phase method described
 * in "An Extended Two-Phase Method for Accessing Sections of
 * Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
 * Scientific Programming, (5)4:301--317, Winter 1996.
 * http://www.mcs.anl.gov/home/thakur/ext2ph.ps
 *
 * Parameters:
 *   fd            - open ADIO file handle (hints control aggregation)
 *   buf           - user buffer receiving the data
 *   count, datatype - user buffer description
 *   file_ptr_type - ADIO_EXPLICIT_OFFSET or ADIO_INDIVIDUAL
 *   offset        - file offset (in etypes) when explicit
 *   status        - MPI status to fill (byte count, if supported)
 *   error_code    - set to MPI_SUCCESS or an I/O error class
 *
 * Collective over fd->comm: every process in the communicator must call
 * this (MPI_Allgather / alltoall phases inside). Falls back to
 * independent contiguous/strided reads when aggregation is disabled or
 * accesses are not interleaved.
 */
void ADIOI_GEN_ReadStridedColl(ADIO_File fd, void *buf, int count,
                               MPI_Datatype datatype, int file_ptr_type,
                               ADIO_Offset offset, ADIO_Status *status,
                               int *error_code)
{
    ADIOI_Access *my_req;
    /* array of nprocs structures, one for each other process in
       whose file domain this process's request lies */

    ADIOI_Access *others_req;
    /* array of nprocs structures, one for each other process
       whose request lies in this process's file domain. */

    int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank;
    int contig_access_count=0, interleave_count = 0, buftype_is_contig;
    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
    ADIO_Offset start_offset, end_offset, orig_fp, fd_size, min_st_offset, off;
    ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
        *fd_end = NULL, *end_offsets = NULL;
    ADIO_Offset *len_list = NULL;
    int *buf_idx = NULL;

#ifdef HAVE_STATUS_SET_BYTES
    MPI_Count bufsize, size;
#endif

    /* persistent-file-realm mode bypasses this algorithm entirely */
    if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
	ADIOI_IOStridedColl (fd, buf, count, ADIOI_READ, datatype,
			file_ptr_type, offset, status, error_code);
	return;
    }

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);

    /* number of aggregators, cb_nodes, is stored in the hints */
    nprocs_for_coll = fd->hints->cb_nodes;
    /* remember the individual file pointer so it can be restored if we
     * end up taking the non-aggregated path below */
    orig_fp = fd->fp_ind;

    /* only check for interleaving if cb_read isn't disabled */
    if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
	/* For this process's request, calculate the list of offsets and
	   lengths in the file and determine the start and end offsets. */

	/* Note: end_offset points to the last byte-offset that will be accessed.
	   e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/

	ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
			      &offset_list, &len_list, &start_offset,
			      &end_offset, &contig_access_count);

#ifdef RDCOLL_DEBUG
	for (i=0; i<contig_access_count; i++) {
	    DBG_FPRINTF(stderr, "rank %d  off %lld  len %lld\n",
			myrank, offset_list[i], len_list[i]);
	}
#endif

	/* each process communicates its start and end offsets to other
	   processes. The result is an array each of start and end offsets
	   stored in order of process rank. */

	st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
	end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));

	MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
		      ADIO_OFFSET, fd->comm);
	MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
		      ADIO_OFFSET, fd->comm);

	/* are the accesses of different processes interleaved? */
	for (i=1; i<nprocs; i++)
	    if ((st_offsets[i] < end_offsets[i-1]) &&
                (st_offsets[i] <= end_offsets[i]))
                interleave_count++;
	/* This is a rudimentary check for interleaving, but should suffice
	   for the moment. */
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);

    /* skip aggregation entirely when disabled, or when cb_read=auto and no
     * interleaving was detected (independent reads will do just as well) */
    if (fd->hints->cb_read == ADIOI_HINT_DISABLE
	|| (!interleave_count && (fd->hints->cb_read == ADIOI_HINT_AUTO)))
    {
	/* don't do aggregation */
	if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
	    ADIOI_Free(offset_list);
	    ADIOI_Free(len_list);
	    ADIOI_Free(st_offsets);
	    ADIOI_Free(end_offsets);
	}

	fd->fp_ind = orig_fp;
	ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

	if (buftype_is_contig && filetype_is_contig) {
	    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
		off = fd->disp + (fd->etype_size) * offset;
		ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
			       off, status, error_code);
	    }
	    else ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
				 0, status, error_code);
	}
	else ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type,
			      offset, status, error_code);

	return;
    }

    /* We're going to perform aggregation of I/O.  Here we call
     * ADIOI_Calc_file_domains() to determine what processes will handle I/O
     * to what regions.  We pass nprocs_for_coll into this function; it is
     * used to determine how many processes will perform I/O, which is also
     * the number of regions into which the range of bytes must be divided.
     * These regions are called "file domains", or FDs.
     *
     * When this function returns, fd_start, fd_end, fd_size, and
     * min_st_offset will be filled in.  fd_start holds the starting byte
     * location for each file domain.  fd_end holds the ending byte location.
     * min_st_offset holds the minimum byte location that will be accessed.
     *
     * Both fd_start[] and fd_end[] are indexed by an aggregator number; this
     * needs to be mapped to an actual rank in the communicator later.
     *
     */
    ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs,
			    nprocs_for_coll, &min_st_offset,
			    &fd_start, &fd_end,
			    fd->hints->min_fdomain_size, &fd_size,
			    fd->hints->striping_unit);

    /* calculate where the portions of the access requests of this process
     * are located in terms of the file domains.  this could be on the same
     * process or on other processes.  this function fills in:
     * count_my_req_procs - number of processes (including this one) for which
     *     this process has requests in their file domain
     * count_my_req_per_proc - count of requests for each process, indexed
     *     by rank of the process
     * my_req[] - array of data structures describing the requests to be
     *     performed by each process (including self).  indexed by rank.
     * buf_idx[] - array of locations into which data can be directly moved;
     *     this is only valid for contiguous buffer case
     */
    ADIOI_Calc_my_req(fd, offset_list, len_list, contig_access_count,
		      min_st_offset, fd_start, fd_end, fd_size,
		      nprocs, &count_my_req_procs,
		      &count_my_req_per_proc, &my_req,
		      &buf_idx);

    /* perform a collective communication in order to distribute the
     * data calculated above.  fills in the following:
     * count_others_req_procs - number of processes (including this
     *     one) which have requests in this process's file domain.
     * count_others_req_per_proc[] - number of separate contiguous
     *     requests from proc i lie in this process's file domain.
     */
    ADIOI_Calc_others_req(fd, count_my_req_procs,
			  count_my_req_per_proc, my_req,
			  nprocs, myrank, &count_others_req_procs,
			  &others_req);

    /* my_req[] and count_my_req_per_proc aren't needed at this point, so
     * let's free the memory
     */
    ADIOI_Free(count_my_req_per_proc);
    for (i=0; i<nprocs; i++) {
	if (my_req[i].count) {
	    ADIOI_Free(my_req[i].offsets);
	    ADIOI_Free(my_req[i].lens);
	}
    }
    ADIOI_Free(my_req);

    /* read data in sizes of no more than ADIOI_Coll_bufsize,
     * communicate, and fill user buf.
     */
    ADIOI_Read_and_exch(fd, buf, datatype, nprocs, myrank,
                        others_req, offset_list,
			len_list, contig_access_count, min_st_offset,
			fd_size, fd_start, fd_end, buf_idx, error_code);

    /* the flattened representation of the buffer type was cached by the
     * exchange phase; drop it now that the read is done */
    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);

    /* free all memory allocated for collective I/O */
    for (i=0; i<nprocs; i++) {
	if (others_req[i].count) {
	    ADIOI_Free(others_req[i].offsets);
	    ADIOI_Free(others_req[i].lens);
	    ADIOI_Free(others_req[i].mem_ptrs);
	}
    }
    ADIOI_Free(others_req);

    ADIOI_Free(buf_idx);
    ADIOI_Free(offset_list);
    ADIOI_Free(len_list);
    ADIOI_Free(st_offsets);
    ADIOI_Free(end_offsets);
    ADIOI_Free(fd_start);
    ADIOI_Free(fd_end);

#ifdef HAVE_STATUS_SET_BYTES
    MPI_Type_size_x(datatype, &size);
    bufsize = size * count;
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status. The right way is to
       keep track of how much data was actually read and placed in buf
       during collective I/O. */
#endif

    fd->fp_sys_posn = -1;   /* set it to null. */
}
/* One communication phase of the two-phase collective read: exchange the
 * per-process transfer sizes, post receives for the data this process
 * needs, and send (via absolute-displacement derived datatypes over
 * MPI_BOTTOM) the file-domain data this process read for others.
 *
 * Key in/out state:
 *   send_size[]/recv_size[] - bytes to send to / receive from each rank
 *     (recv_size is filled here by the MPI_Alltoall of send_size)
 *   count[]/start_pos[]     - how many off-len pairs of others_req[i] to
 *     send this iteration, and where they start
 *   partial_send[]          - nonzero if the last off-len pair for rank i
 *     is only partially sent; the length is temporarily patched in
 *     others_req[i].lens and restored after the Isend's datatype is built
 *   buf_idx[]               - (contiguous buffer only) receive offsets into
 *     the user buffer; advanced here by the amount received
 *   recd_from_proc[]        - progress bookkeeping used when scattering
 *     received data into a noncontiguous user buffer
 *   iter                    - phase number; part of the message tag
 *     (myrank+i+100*iter) to keep phases distinct
 *
 * Collective over fd->comm.
 */
static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
                         *flat_buf, ADIO_Offset *offset_list, ADIO_Offset
                         *len_list, int *send_size, int *recv_size,
                         int *count, int *start_pos, int *partial_send,
                         int *recd_from_proc, int nprocs,
                         int myrank, int
                         buftype_is_contig, int contig_access_count,
                         ADIO_Offset min_st_offset, ADIO_Offset fd_size,
                         ADIO_Offset *fd_start, ADIO_Offset *fd_end,
                         ADIOI_Access *others_req,
                         int iter, MPI_Aint buftype_extent, int *buf_idx)
{
    int i, j, k=0, tmp=0, nprocs_recv, nprocs_send;
    char **recv_buf = NULL;
    MPI_Request *requests;
    MPI_Datatype send_type;
    MPI_Status *statuses;

    /* exchange send_size info so that each process knows how much to
       receive from whom and how much memory to allocate. */
    MPI_Alltoall(send_size, 1, MPI_INT, recv_size, 1, MPI_INT, fd->comm);

    nprocs_recv = 0;
    for (i=0; i < nprocs; i++) if (recv_size[i]) nprocs_recv++;

    nprocs_send = 0;
    for (i=0; i<nprocs; i++) if (send_size[i]) nprocs_send++;

    requests = (MPI_Request *)
	ADIOI_Malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request));
    /* +1 to avoid a 0-size malloc */

    /* post recvs. if buftype_is_contig, data can be directly recd. into
       user buf at location given by buf_idx. else use recv_buf. */

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5032, 0, NULL);
#endif

    if (buftype_is_contig) {
	j = 0;
	for (i=0; i < nprocs; i++)
	    if (recv_size[i]) {
		MPI_Irecv(((char *) buf) + buf_idx[i], recv_size[i],
			  MPI_BYTE, i, myrank+i+100*iter, fd->comm,
			  requests+j);
		j++;
		buf_idx[i] += recv_size[i];
	    }
    }
    else {
	/* allocate memory for recv_buf and post receives */
	recv_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char*));
	for (i=0; i < nprocs; i++)
	    if (recv_size[i]) recv_buf[i] =
				  (char *) ADIOI_Malloc(recv_size[i]);

	j = 0;
	for (i=0; i < nprocs; i++)
	    if (recv_size[i]) {
		MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i,
			  myrank+i+100*iter, fd->comm, requests+j);
		j++;
#ifdef RDCOLL_DEBUG
		DBG_FPRINTF(stderr, "node %d, recv_size %d, tag %d \n",
			myrank, recv_size[i], myrank+i+100*iter);
#endif
	    }
    }

    /* create derived datatypes and send data */

    j = 0;
    for (i=0; i<nprocs; i++) {
	if (send_size[i]) {
	    /* take care if the last off-len pair is a partial send */
	    if (partial_send[i]) {
		k = start_pos[i] + count[i] - 1;
		tmp = others_req[i].lens[k];
		others_req[i].lens[k] = partial_send[i];
	    }
	    ADIOI_Type_create_hindexed_x(count[i],
		     &(others_req[i].lens[start_pos[i]]),
		     &(others_req[i].mem_ptrs[start_pos[i]]),
		     MPI_BYTE, &send_type);
	    /* absolute displacement; use MPI_BOTTOM in send */
	    MPI_Type_commit(&send_type);
	    MPI_Isend(MPI_BOTTOM, 1, send_type, i, myrank+i+100*iter,
		      fd->comm, requests+nprocs_recv+j);
	    MPI_Type_free(&send_type);
	    /* restore the patched length so others_req stays intact for
	     * subsequent iterations */
	    if (partial_send[i]) others_req[i].lens[k] = tmp;
	    j++;
	}
    }

    statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send+nprocs_recv+1) * \
				     sizeof(MPI_Status));
    /* +1 to avoid a 0-size malloc */

    /* wait on the receives */
    if (nprocs_recv) {
#ifdef NEEDS_MPI_TEST
	j = 0;
	while (!j) MPI_Testall(nprocs_recv, requests, &j, statuses);
#else
	MPI_Waitall(nprocs_recv, requests, statuses);
#endif

	/* if noncontiguous, to the copies from the recv buffers */
	if (!buftype_is_contig)
	    ADIOI_Fill_user_buffer(fd, buf, flat_buf, recv_buf,
				   offset_list, len_list, (unsigned*)recv_size,
				   requests, statuses, recd_from_proc,
				   nprocs, contig_access_count,
				   min_st_offset, fd_size, fd_start, fd_end,
				   buftype_extent);
    }

    /* wait on the sends*/
    MPI_Waitall(nprocs_send, requests+nprocs_recv, statuses+nprocs_recv);

    ADIOI_Free(statuses);
    ADIOI_Free(requests);

    if (!buftype_is_contig) {
	for (i=0; i < nprocs; i++)
	    if (recv_size[i]) ADIOI_Free(recv_buf[i]);
	ADIOI_Free(recv_buf);
    }
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5033, 0, NULL);
#endif
}
/*
 * Compute a dynamic access range based file domain partition among I/O aggregators,
 * which align to the GPFS block size
 * Divide the I/O workload among "nprocs_for_coll" processes. This is
 * done by (logically) dividing the file into file domains (FDs); each
 * process may directly access only its own file domain.
 * Additional effort is to make sure that each I/O aggregator get
 * a file domain that aligns to the GPFS block size. So, there will
 * not be any false sharing of GPFS file blocks among multiple I/O nodes.
 *
 * The common version of this now accepts a min_fd_size and striping_unit.
 * It doesn't seem necessary here (using GPFS block sizes) but keep it in mind
 * (e.g. we could pass striping unit instead of using fs_ptr->blksize).
 *
 * Outputs: *min_st_offset_ptr, *fd_start_ptr / *fd_end_ptr (each
 * ADIOI_Malloc'd arrays of nprocs_for_coll entries, caller frees), and
 * *fd_size_ptr.  NOTE(review): *fd_size_ptr is the size of aggregator 0's
 * domain *after* the lower-boundary rounding offset has been subtracted;
 * callers presumably use it only as a nominal domain size -- confirm.
 */
void ADIOI_BGL_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
                                      ADIO_Offset *end_offsets,
                                      int          nprocs,
                                      int          nprocs_for_coll,
                                      ADIO_Offset *min_st_offset_ptr,
                                      ADIO_Offset **fd_start_ptr,
                                      ADIO_Offset **fd_end_ptr,
                                      ADIO_Offset *fd_size_ptr,
                                      void        *fs_ptr)
{
    ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
    int i, aggr;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5004, 0, NULL);
#endif

#   if AGG_DEBUG
    static char myname[] = "ADIOI_BGL_GPFS_Calc_file_domains";
    DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n",
	    myname,__LINE__,nprocs_for_coll);
#   endif

    /* file system block size; a null fs_ptr or a 0 blksize falls back to
     * the 1 MiB default */
    __blksize_t blksize = 1048576; /* default to 1M */
    if(fs_ptr && ((ADIOI_BGL_fs*)fs_ptr)->blksize) /* ignore null ptr or 0 blksize */
	blksize = ((ADIOI_BGL_fs*)fs_ptr)->blksize;
#   if AGG_DEBUG
    DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
#   endif

    /* find min of start offsets and max of end offsets of all processes */
    min_st_offset  = st_offsets [0];
    max_end_offset = end_offsets[0];
    for (i=1; i<nprocs; i++) {
	min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
	max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
    }

    //	DBG_FPRINTF(stderr, "_calc_file_domains, min_st_offset, max_ = %qd, %qd\n", min_st_offset, max_end_offset );

    /* determine the "file domain (FD)" of each process, i.e., the portion of
       the file that will be "owned" by each process */

    /* gpfs_lb/gpfs_ub: the access range [min_st_offset, max_end_offset]
     * rounded outward to whole blksize blocks; the *_rdoff values are the
     * rounding slack at each end, subtracted back off the first/last
     * domain below so the domains cover exactly the requested range */
    ADIO_Offset gpfs_ub       = (max_end_offset +blksize-1) / blksize * blksize - 1;
    ADIO_Offset gpfs_lb       = min_st_offset / blksize * blksize;
    ADIO_Offset gpfs_ub_rdoff = (max_end_offset +blksize-1) / blksize * blksize - 1 - max_end_offset;
    ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize;
    ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;

    int         naggs    = nprocs_for_coll;

    /* Tweak the file domains so that no fd is smaller than a threshold.  We
     * have to strike a balance between efficency and parallelism: somewhere
     * between 10k processes sending 32-byte requests and one process sending
     * a 320k request is a (system-dependent) sweet spot

     This is from the common code - the new min_fd_size parm that we didn't
     implement.  (And common code uses a different declaration of fd_size so
     beware)  */

    /* this is not entirely sufficient on BlueGene: we must be mindful of
     * imbalance over psets.  the hint processing code has already picked, say,
     * 8 processors per pset, so if we go increasing fd_size we'll end up with
     * some psets with 8 processors and some psets with none.  */
    /*
    if (fd_size < min_fd_size)
	fd_size = min_fd_size;
    */
    fd_size              = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_start_ptr        = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_end_ptr          = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    fd_start             = *fd_start_ptr;
    fd_end               = *fd_end_ptr;

    /* distribute the whole blocks as evenly as possible: naggs_small
     * aggregators get nb_cn_small blocks each, the remaining naggs_large
     * aggregators get one extra block each */
    ADIO_Offset n_gpfs_blk    = fd_gpfs_range / blksize;
    ADIO_Offset nb_cn_small   = n_gpfs_blk/naggs;
    ADIO_Offset naggs_large   = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
    ADIO_Offset naggs_small   = naggs - naggs_large;

    /* nb_cn_small * blksize: evenly split file domain among processors:
     *      equivalent to fd_gpfs_rnage/naggs
     * (nb_cn_small+1) * blksize: keeps file domain at least 'blksize' big
     */
    for (i=0; i<naggs; i++)
	if (i < naggs_small) fd_size[i] = nb_cn_small     * blksize;
	else                 fd_size[i] = (nb_cn_small+1) * blksize;
    /*potential optimization: if n_gpfs_blk smalller than
     * naggs, slip in some zero-sized file
     * domains to spread the work across all psets. */

#   if AGG_DEBUG
     DBG_FPRINTF(stderr,"%s(%d): "
		   "gpfs_ub       %llu, "
		   "gpfs_lb       %llu, "
		   "gpfs_ub_rdoff %llu, "
		   "gpfs_lb_rdoff %llu, "
		   "fd_gpfs_range %llu, "
		   "n_gpfs_blk    %llu, "
		   "nb_cn_small   %llu, "
		   "naggs_large   %llu, "
		   "naggs_small   %llu, "
		   "\n",
		   myname,__LINE__,
		   gpfs_ub      ,
		   gpfs_lb      ,
		   gpfs_ub_rdoff,
		   gpfs_lb_rdoff,
		   fd_gpfs_range,
		   n_gpfs_blk   ,
		   nb_cn_small  ,
		   naggs_large  ,
		   naggs_small
		   );
#   endif

    /* pull the rounding slack back off the first and last domains so the
     * union of all domains is exactly [min_st_offset, max_end_offset]
     * (when naggs==1 both corrections land on the same entry) */
    fd_size[0]       -= gpfs_lb_rdoff;
    fd_size[naggs-1] -= gpfs_ub_rdoff;

    /* compute the file domain for each aggr */
    ADIO_Offset offset = min_st_offset;
    for (aggr=0; aggr<naggs; aggr++) {
	fd_start[aggr] = offset;
	fd_end  [aggr] = offset + fd_size[aggr] - 1;
	offset += fd_size[aggr];
    }

    *fd_size_ptr = fd_size[0];
    *min_st_offset_ptr = min_st_offset;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5005, 0, NULL);
#endif
    ADIOI_Free (fd_size);
}
/*
 * Have a single rank stat() the file and statfs() the file system, then
 * broadcast the results, so thousands of processes do not issue identical
 * metadata requests against the same file ("scaleable" stat).
 *
 * Side effects: stores the file system block size and the fsync
 * aggregation strategy into the ADIOI_BG_fs structure at fd->fs_ptr.
 */
static void scaleable_stat(ADIO_File fd)
{
    struct stat64 bg_stat;
    struct statfs bg_statfs;
    int rank, rc;
    char * dir;
    /* buf[0] = file system block size, buf[1] = file system magic number.
     * Initialized to -1 so that if stat()/statfs() fail on the querying
     * rank we broadcast a well-defined "unknown" sentinel instead of
     * uninitialized stack contents (reading those would be UB).  buf[1] = -1
     * matches the "bogus magic number" convention already used below. */
    long buf[2] = { -1, -1 };

    MPI_Comm_rank(fd->comm, &rank);

    /* only the first aggregator touches the file system */
    if (rank == fd->hints->ranklist[0]) {
	/* Get the (real) underlying file system block size */
	rc = stat64(fd->filename, &bg_stat);
	if (rc >= 0)
	{
	    buf[0] = bg_stat.st_blksize;
	    DBGV_FPRINTF(stderr,"Successful stat '%s'. Blocksize=%ld\n",
		    fd->filename,bg_stat.st_blksize);
	}
	else
	{
	    /* buf[0] stays -1: downstream code treats non-positive blksize
	     * as "unknown" rather than reading garbage */
	    DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
		    fd->filename,rc,errno);
	}
	/* Get the (real) underlying file system type so we can
	 * plan our fsync scaling strategy */
	rc = statfs(fd->filename,&bg_statfs);
	if (rc >= 0)
	{
	    DBGV_FPRINTF(stderr,"Successful statfs '%s'. Magic number=%#lX\n",
		    fd->filename,bg_statfs.f_type);
	    buf[1] = bg_statfs.f_type;
	}
	else
	{
	    DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",
		    fd->filename,rc,errno);
	    /* the file may not exist yet; the parent directory lives on the
	     * same file system, so statfs that instead */
	    ADIO_FileSysType_parentdir(fd->filename, &dir);
	    rc = statfs(dir,&bg_statfs);
	    if (rc >= 0)
	    {
		DBGV_FPRINTF(stderr,"Successful statfs '%s'. Magic number=%#lX\n",dir,bg_statfs.f_type);
		buf[1] = bg_statfs.f_type;
	    }
	    else
	    {
		/* Hmm. Guess we'll assume the worst-case, that it's not GPFS
		 * or BGLOCKLESSMPIO_F_TYPE (default PVFS2) below */
		buf[1] = -1; /* bogus magic number */
		DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",dir,rc,errno);
	    }
	    free(dir); /* allocated by ADIO_FileSysType_parentdir */
	}
    }
    /* now we can broadcast the stat/statfs data to everyone else */
    if (fd->comm != MPI_COMM_SELF) { /* if indep open, there's no one to talk to*/
	if (fd->agg_comm != MPI_COMM_NULL) /* deferred open: only a subset of processes participate */
	    MPI_Bcast(buf, 2, MPI_LONG, fd->hints->ranklist[0], fd->agg_comm);
	else
	    MPI_Bcast(buf, 2, MPI_LONG, fd->hints->ranklist[0], fd->comm);
	/* NOTE(review): the bcast root is fd->hints->ranklist[0], a rank in
	 * fd->comm; this assumes the same rank numbering holds in agg_comm —
	 * confirm against how agg_comm is constructed. */
    }
    bg_stat.st_blksize = buf[0];
    bg_statfs.f_type = buf[1];

    /* data from stat64 */
    /* store the blksize in the file system specific storage */
    ((ADIOI_BG_fs*)fd->fs_ptr)->blksize = bg_stat.st_blksize;

    /* data from statfs */
    if ((bg_statfs.f_type == GPFS_SUPER_MAGIC) ||
	(bg_statfs.f_type == bglocklessmpio_f_type))
    {
	((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr =
	    ADIOI_BG_FSYNC_AGGREGATION_ENABLED;
	/* Only one rank is an "fsync aggregator" because only one
	 * fsync is needed */
	if (rank == fd->hints->ranklist[0]) {
	    ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr |=
		ADIOI_BG_FSYNC_AGGREGATOR;
	    DBG_FPRINTF(stderr,"fsync aggregator %d\n",rank);
	}
	else ; /* aggregation enabled but this rank is not an aggregator*/
    }
    else ; /* Other filesystems default to no fsync aggregation */
}
/*
 * ADIOI_BGL_Calc_my_req() overrides ADIOI_Calc_my_req for the default implementation
 * is specific for static file domain partitioning.
 *
 * ADIOI_Calc_my_req() - calculate what portions of the access requests
 * of this process are located in the file domains of various processes
 * (including this one)
 *
 * Outputs (all allocated here; caller owns and must free):
 *   *count_my_req_per_proc_ptr - per-process count of this rank's contiguous
 *                                pieces that fall in that process's file domain
 *   *count_my_req_procs_ptr   - number of processes with a nonzero count
 *   *my_req_ptr               - per-process lists of (offset, len) pieces
 *   *buf_idx_ptr              - per-process starting index into a contiguous
 *                               user buffer (-1 if no data for that process)
 */
void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
			   int contig_access_count, ADIO_Offset min_st_offset,
			   ADIO_Offset *fd_start, ADIO_Offset *fd_end,
			   ADIO_Offset fd_size, int nprocs,
			   int *count_my_req_procs_ptr,
			   int **count_my_req_per_proc_ptr,
			   ADIOI_Access **my_req_ptr,
			   int **buf_idx_ptr)
/* Possibly reconsider if buf_idx's are ok as int's, or should they be aints/offsets?
   They are used as memory buffer indices so it seems like the 2G limit is in effect */
{
    int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
    int i, l, proc;
    ADIO_Offset fd_len, rem_len, curr_idx, off;
    ADIOI_Access *my_req;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5024, 0, NULL);
#endif

    *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int));
    count_my_req_per_proc = *count_my_req_per_proc_ptr;
    /* count_my_req_per_proc[i] gives the no. of contig. requests of this
       process in process i's file domain. calloc initializes to zero.
       I'm allocating memory of size nprocs, so that I can do an
       MPI_Alltoall later on.*/

    buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* buf_idx is relevant only if buftype_is_contig.
       buf_idx[i] gives the index into user_buf where data received
       from proc. i should be placed. This allows receives to be done
       without extra buffer. This can't be done if buftype is not contig. */

    /* initialize buf_idx to -1 */
    for (i=0; i < nprocs; i++) buf_idx[i] = -1;

    /* one pass just to calculate how much space to allocate for my_req;
     * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
     */
    for (i=0; i < contig_access_count; i++) {
	/* short circuit offset/len processing if len == 0
	 * 	(zero-byte  read/write */
	if (len_list[i] == 0)
		continue;
	off = offset_list[i];
	fd_len = len_list[i];
	/* note: we set fd_len to be the total size of the access.  then
	 * ADIOI_Calc_aggregator() will modify the value to return the
	 * amount that was available from the file domain that holds the
	 * first part of the access.
	 */
	proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
				     fd_start, fd_end);
	count_my_req_per_proc[proc]++;

	/* figure out how much data is remaining in the access (i.e. wasn't
	 * part of the file domain that had the starting byte); we'll take
	 * care of this data (if there is any) in the while loop below.
	 */
	rem_len = len_list[i] - fd_len;

	while (rem_len > 0) {
	    off += fd_len; /* point to first remaining byte */
	    fd_len = rem_len; /* save remaining size, pass to calc */
	    proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len,
					 fd_size, fd_start, fd_end);

	    count_my_req_per_proc[proc]++;

	    rem_len -= fd_len; /* reduce remaining length by amount from fd */
	}
    }

    /* now allocate space for my_req, offset, and len */

    *my_req_ptr = (ADIOI_Access *)
	ADIOI_Malloc(nprocs*sizeof(ADIOI_Access));
    my_req = *my_req_ptr;

    count_my_req_procs = 0;
    for (i=0; i < nprocs; i++) {
	if (count_my_req_per_proc[i]) {
	    my_req[i].offsets = (ADIO_Offset *)
		ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
	    my_req[i].lens = (int *)
		ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(int));
	    count_my_req_procs++;
	}
	my_req[i].count = 0;  /* will be incremented where needed
				      later */
    }

    /* now fill in my_req */
    curr_idx = 0;
    for (i=0; i<contig_access_count; i++) {
	/* short circuit offset/len processing if len == 0
	 * 	(zero-byte  read/write */
	if (len_list[i] == 0)
		continue;
	off = offset_list[i];
	fd_len = len_list[i];
	/* same protocol as the counting pass: fd_len is trimmed by the
	 * aggregator call to the piece that fits in proc's file domain */
	proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
				     fd_start, fd_end);

	/* for each separate contiguous access from this process */
	if (buf_idx[proc] == -1)
  {
    ADIOI_Assert(curr_idx == (int) curr_idx);
    buf_idx[proc] = (int) curr_idx;
  }

	l = my_req[proc].count;
	curr_idx += fd_len;

	rem_len = len_list[i] - fd_len;

	/* store the proc, offset, and len information in an array
	 * of structures, my_req. Each structure contains the
	 * offsets and lengths located in that process's FD,
	 * and the associated count.
	 */
	my_req[proc].offsets[l] = off;
  ADIOI_Assert(fd_len == (int) fd_len);
	my_req[proc].lens[l] = (int) fd_len;
	my_req[proc].count++;

	while (rem_len > 0) {
	    off += fd_len;
	    fd_len = rem_len;
	    proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len,
					 fd_size, fd_start, fd_end);

	    if (buf_idx[proc] == -1)
      {
        ADIOI_Assert(curr_idx == (int) curr_idx);
        buf_idx[proc] = (int) curr_idx;
      }

	    l = my_req[proc].count;
	    curr_idx += fd_len;
	    rem_len -= fd_len;

	    my_req[proc].offsets[l] = off;
      ADIOI_Assert(fd_len == (int) fd_len);
	    my_req[proc].lens[l] = (int) fd_len;
	    my_req[proc].count++;
	}
    }

#ifdef AGG_DEBUG
    for (i=0; i<nprocs; i++) {
	if (count_my_req_per_proc[i] > 0) {
	    DBG_FPRINTF(stderr, "data needed from %d (count = %d):\n", i,
		    my_req[i].count);
	    for (l=0; l < my_req[i].count; l++) {
		DBG_FPRINTF(stderr, " off[%d] = %lld, len[%d] = %d\n", l,
			my_req[i].offsets[l], l, my_req[i].lens[l]);
	    }
	}
	DBG_FPRINTF(stderr, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
    }
#endif

    *count_my_req_procs_ptr = count_my_req_procs;
    *buf_idx_ptr = buf_idx;
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5025, 0, NULL);
#endif
}
void ad_gpfs_get_env_vars() { char *x, *dummy; gpfsmpio_comm = 0; x = getenv( "GPFSMPIO_COMM" ); if (x) gpfsmpio_comm = atoi(x); gpfsmpio_timing = 0; x = getenv( "GPFSMPIO_TIMING" ); if (x) gpfsmpio_timing = atoi(x); gpfsmpio_tunegather = 1; x = getenv( "GPFSMPIO_TUNEGATHER" ); if (x) gpfsmpio_tunegather = atoi(x); gpfsmpio_tuneblocking = 1; x = getenv( "GPFSMPIO_TUNEBLOCKING" ); if (x) gpfsmpio_tuneblocking = atoi(x); bglocklessmpio_f_type = PVFS2_SUPER_MAGIC; x = getenv( "BGLOCKLESSMPIO_F_TYPE" ); if (x) bglocklessmpio_f_type = strtol(x,&dummy,0); DBG_FPRINTF(stderr,"BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n", bglocklessmpio_f_type,bglocklessmpio_f_type); /* note: this value will be 'sanity checked' in ADIOI_BG_persInfo_init(), * when we know a bit more about what "largest possible value" and * "smallest possible value" should be */ gpfsmpio_bg_nagg_pset = ADIOI_BG_NAGG_PSET_DFLT; x = getenv("GPFSMPIO_NAGG_PSET"); if (x) gpfsmpio_bg_nagg_pset = atoi(x); gpfsmpio_pthreadio = 0; x = getenv( "GPFSMPIO_PTHREADIO" ); if (x) gpfsmpio_pthreadio = atoi(x); gpfsmpio_p2pcontig = 0; x = getenv( "GPFSMPIO_P2PCONTIG" ); if (x) gpfsmpio_p2pcontig = atoi(x); gpfsmpio_write_aggmethod = 0; x = getenv( "GPFSMPIO_WRITE_AGGMETHOD" ); if (x) gpfsmpio_write_aggmethod = atoi(x); gpfsmpio_read_aggmethod = 0; x = getenv( "GPFSMPIO_READ_AGGMETHOD" ); if (x) gpfsmpio_read_aggmethod = atoi(x); gpfsmpio_balancecontig = 0; x = getenv( "GPFSMPIO_BALANCECONTIG" ); if (x) gpfsmpio_balancecontig = atoi(x); gpfsmpio_devnullio = 0; x = getenv( "GPFSMPIO_DEVNULLIO" ); if (x) gpfsmpio_devnullio = atoi(x); gpfsmpio_bridgeringagg = 0; x = getenv( "GPFSMPIO_BRIDGERINGAGG" ); if (x) gpfsmpio_bridgeringagg = atoi(x); gpfsmpio_onesided_no_rmw = 0; x = getenv( "GPFSMPIO_ONESIDED_NO_RMW" ); if (x) gpfsmpio_onesided_no_rmw = atoi(x); gpfsmpio_onesided_always_rmw = 0; x = getenv( "GPFSMPIO_ONESIDED_ALWAYS_RMW" ); if (x) gpfsmpio_onesided_always_rmw = atoi(x); if (gpfsmpio_onesided_always_rmw) gpfsmpio_onesided_no_rmw = 
1; gpfsmpio_onesided_inform_rmw = 0; x = getenv( "GPFSMPIO_ONESIDED_INFORM_RMW" ); if (x) gpfsmpio_onesided_inform_rmw = atoi(x); }
/*
 * Compute a dynamic access range based file domain partition among I/O aggregators,
 * which align to the GPFS block size
 * Divide the I/O workload among "nprocs_for_coll" processes. This is
 * done by (logically) dividing the file into file domains (FDs); each
 * process may directly access only its own file domain.
 * Additional effort is to make sure that each I/O aggregator get
 * a file domain that aligns to the GPFS block size.  So, there will
 * not be any false sharing of GPFS file blocks among multiple I/O nodes.
 *
 * The common version of this now accepts a min_fd_size and striping_unit.
 * It doesn't seem necessary here (using GPFS block sizes) but keep it in mind
 * (e.g. we could pass striping unit instead of using fs_ptr->blksize).
 *
 * Outputs: *fd_start_ptr / *fd_end_ptr are allocated here (caller frees);
 * *fd_size_ptr receives the size of the FIRST file domain (after the
 * lower-boundary rounding adjustment below); *min_st_offset_ptr receives
 * the global minimum start offset.
 */
void ADIOI_GPFS_Calc_file_domains(ADIO_File fd,
	                          ADIO_Offset *st_offsets,
	                          ADIO_Offset *end_offsets,
	                          int          nprocs,
	                          int          nprocs_for_coll,
	                          ADIO_Offset *min_st_offset_ptr,
	                          ADIO_Offset **fd_start_ptr,
	                          ADIO_Offset **fd_end_ptr,
	                          ADIO_Offset *fd_size_ptr,
	                          void        *fs_ptr)
{
    ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
    int i, aggr;
    TRACE_ERR("Entering ADIOI_GPFS_Calc_file_domains\n");
    blksize_t blksize;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5004, 0, NULL);
#endif

# if AGG_DEBUG
    static char myname[] = "ADIOI_GPFS_Calc_file_domains";
    DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n",
	    myname,__LINE__,nprocs_for_coll);
# endif
    if (fd->blksize <= 0)
	/* default to 1M if blksize unset */
	fd->blksize = 1048576;
    blksize = fd->blksize;

# if AGG_DEBUG
    DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
# endif
    /* find min of start offsets and max of end offsets of all processes */
    min_st_offset  = st_offsets [0];
    max_end_offset = end_offsets[0];
    for (i=1; i<nprocs; i++) {
	min_st_offset  = ADIOI_MIN(min_st_offset, st_offsets[i]);
	max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
    }

    /* DBG_FPRINTF(stderr, "_calc_file_domains, min_st_offset, max_ = %qd, %qd\n",
       min_st_offset, max_end_offset );*/

    /* determine the "file domain (FD)" of each process, i.e., the portion of
       the file that will be "owned" by each process */

    /* gpfs_lb/gpfs_ub: access range rounded outward to GPFS block
       boundaries; *_rdoff record how much the rounding added so it can be
       subtracted back off the first and last domains below */
    ADIO_Offset gpfs_ub       = (max_end_offset +blksize-1) / blksize * blksize - 1;
    ADIO_Offset gpfs_lb       = min_st_offset / blksize * blksize;
    ADIO_Offset gpfs_ub_rdoff = (max_end_offset +blksize-1) / blksize * blksize - 1 - max_end_offset;
    ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize;
    ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;

    int         naggs    = nprocs_for_coll;

    /* Tweak the file domains so that no fd is smaller than a threshold.  We
     * have to strike a balance between efficency and parallelism: somewhere
     * between 10k processes sending 32-byte requests and one process sending a
     * 320k request is a (system-dependent) sweet spot

    This is from the common code - the new min_fd_size parm that we didn't implement.
    (And common code uses a different declaration of fd_size so beware)

    if (fd_size < min_fd_size)
	fd_size = min_fd_size;
    */
    fd_size              = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_start_ptr        = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_end_ptr          = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    fd_start             = *fd_start_ptr;
    fd_end               = *fd_end_ptr;

    /* each process will have a file domain of some number of gpfs blocks, but
     * the division of blocks is not likely to be even.  Some file domains will
     * be "large" and others "small"
     *
     * Example: consider 17 blocks distributed over 3 aggregators.
     * nb_cn_small = 17/3 = 5
     * naggs_large = 17 - 3*(17/3) = 17 - 15 = 2
     * naggs_small = 3 - 2 = 1
     *
     * and you end up with file domains of {5-blocks, 6-blocks, 6-blocks}
     *
     * what about (relatively) small files?  say, a file of 1000 blocks
     * distributed over 2064 aggregators:
     * nb_cn_small = 1000/2064 = 0
     * naggs_large = 1000 - 2064*(1000/2064) = 1000
     * naggs_small = 2064 - 1000 = 1064
     * and you end up with domains of {0, 0, 0, ... 1, 1, 1 ...}
     *
     * it might be a good idea instead of having all the zeros up front, to
     * "mix" those zeros into the fd_size array.  that way, no pset/bridge-set
     * is left with zero work.  In fact, even if the small file domains aren't
     * zero, it's probably still a good idea to mix the "small" file domains
     * across the fd_size array to keep the io nodes in balance */
    ADIO_Offset n_gpfs_blk    = fd_gpfs_range / blksize;
    ADIO_Offset nb_cn_small   = n_gpfs_blk/naggs;
    ADIO_Offset naggs_large   = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
    ADIO_Offset naggs_small   = naggs - naggs_large;

#ifdef BGQPLATFORM
    if (gpfsmpio_balancecontig == 1) {
	/* File domains blocks are assigned to aggregators in a breadth-first
	 * fashion relative to the ions - additionally, file domains on the
	 * aggregators sharing the same bridgeset and ion have contiguous
	 * offsets. */

	// initialize everything to small
	for (i=0; i<naggs; i++)
	    fd_size[i] = nb_cn_small * blksize;

	// go thru and distribute the large across the bridges

	/* bridgelistoffset: agg rank list offsets using the bridgelist - each
	 * entry is created by adding up the indexes for the aggs from all
	 * previous bridges */
	int *bridgelistoffset =
	    (int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));
	/* tmpbridgelistnum: copy of the bridgelistnum whose entries can be
	 * decremented to keep track of bridge assignments during the actual
	 * large block assignments to the agg rank list*/
	int *tmpbridgelistnum =
	    (int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));

	int j;
	for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++) {
	    int k, bridgerankoffset = 0;
	    for (k=0;k<j;k++) {
		bridgerankoffset += fd->hints->fs_hints.bg.bridgelistnum[k];
	    }
	    bridgelistoffset[j] = bridgerankoffset;
	}

	for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++)
	    tmpbridgelistnum[j] = fd->hints->fs_hints.bg.bridgelistnum[j];
	int bridgeiter = 0;

	/* distribute the large blocks across the aggs going breadth-first
	 * across the bridgelist - this distributes the fd sizes across the
	 * ions, so later in the file domain assignment when it iterates thru
	 * the ranklist the offsets will be contiguous within the bridge and
	 * ion as well */
	for (j=0;j<naggs_large;j++) {
	    int foundbridge = 0;
	    int numbridgelistpasses = 0;
	    while (!foundbridge) {
		if (tmpbridgelistnum[bridgeiter] > 0) {
		    foundbridge = 1;
		    /*
		     printf("bridgeiter is %d tmpbridgelistnum[bridgeiter] is %d bridgelistoffset[bridgeiter] is %d\n",bridgeiter,tmpbridgelistnum[bridgeiter],bridgelistoffset[bridgeiter]);
		     printf("naggs is %d bridgeiter is %d bridgelistoffset[bridgeiter] is %d tmpbridgelistnum[bridgeiter] is %d\n",naggs, bridgeiter,bridgelistoffset[bridgeiter],tmpbridgelistnum[bridgeiter]);
		     printf("naggs is %d bridgeiter is %d setting fd_size[%d]\n",naggs, bridgeiter,bridgelistoffset[bridgeiter]+(fd->hints->bridgelistnum[bridgeiter]-tmpbridgelistnum[bridgeiter]));
		     */
		    int currentbridgelistnum =
			(fd->hints->fs_hints.bg.bridgelistnum[bridgeiter]-
			 tmpbridgelistnum[bridgeiter]);
		    int currentfdsizeindex = bridgelistoffset[bridgeiter] +
			currentbridgelistnum;
		    fd_size[currentfdsizeindex] = (nb_cn_small+1) * blksize;
		    tmpbridgelistnum[bridgeiter]--;
		}
		if (bridgeiter == (fd->hints->fs_hints.bg.numbridges-1)) {
		    /* guard against infinite loop - should only ever make 1 pass
		     * thru bridgelist */
		    ADIOI_Assert(numbridgelistpasses == 0);
		    numbridgelistpasses++;
		    bridgeiter = 0;
		}
		else
		    bridgeiter++;
	    }
	}
	ADIOI_Free(tmpbridgelistnum);
	ADIOI_Free(bridgelistoffset);
    }
    else {
	/* BG/L- and BG/P-style distribution of file domains: simple allocation of
	 * file domins to each aggregator */
	for (i=0; i<naggs; i++) {
	    if (i < naggs_large) {
		fd_size[i] = (nb_cn_small+1) * blksize;
	    } else {
		fd_size[i] = nb_cn_small * blksize;
	    }
	}
    }
#ifdef balancecontigtrace
    int myrank;
    MPI_Comm_rank(fd->comm,&myrank);
    if (myrank == 0) {
	fprintf(stderr,"naggs_small is %d nb_cn_small is %d\n",naggs_small,nb_cn_small);
	for (i=0; i<naggs; i++) {
	    fprintf(stderr,"fd_size[%d] set to %d agg rank is %d\n",i,fd_size[i],fd->hints->ranklist[i]);
	}
    }
#endif

#else // not BGQ platform
    /* large domains first, then small - same as the BGQ non-balanced case */
    for (i=0; i<naggs; i++) {
	if (i < naggs_large) {
	    fd_size[i] = (nb_cn_small+1) * blksize;
	} else {
	    fd_size[i] = nb_cn_small * blksize;
	}
    }
#endif

# if AGG_DEBUG
    DBG_FPRINTF(stderr,"%s(%d): "
		"gpfs_ub %llu, "
		"gpfs_lb %llu, "
		"gpfs_ub_rdoff %llu, "
		"gpfs_lb_rdoff %llu, "
		"fd_gpfs_range %llu, "
		"n_gpfs_blk %llu, "
		"nb_cn_small %llu, "
		"naggs_large %llu, "
		"naggs_small %llu, "
		"\n",
		myname,__LINE__,
		gpfs_ub       ,
		gpfs_lb       ,
		gpfs_ub_rdoff,
		gpfs_lb_rdoff,
		fd_gpfs_range,
		n_gpfs_blk   ,
		nb_cn_small  ,
		naggs_large  ,
		naggs_small  );
# endif

    /* pull the block-alignment rounding back off the boundary domains so the
     * overall partition covers exactly [min_st_offset, max_end_offset] */
    fd_size[0]       -= gpfs_lb_rdoff;
    fd_size[naggs-1] -= gpfs_ub_rdoff;

    /* compute the file domain for each aggr */
    ADIO_Offset offset = min_st_offset;
    for (aggr=0; aggr<naggs; aggr++) {
	fd_start[aggr] = offset;
	fd_end  [aggr] = offset + fd_size[aggr] - 1;
	offset += fd_size[aggr];
    }

    *fd_size_ptr = fd_size[0];
    *min_st_offset_ptr = min_st_offset;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5005, 0, NULL);
#endif
    ADIOI_Free (fd_size);
    TRACE_ERR("Leaving ADIOI_GPFS_Calc_file_domains\n");
}