/*
 * Split the byte range covered by a collective access into
 * nprocs_for_coll contiguous file domains.
 *
 * Inputs
 *   fh               only fh->f_size is read: the number of valid entries
 *                    in start_offsets[] and end_offsets[]
 *   start_offsets    first offset accessed, one entry per index < f_size
 *   end_offsets      last offset accessed, one entry per index < f_size
 *   min_fd_size      lower bound applied to the computed domain size
 *   striping_unit    if > 0, every domain end is moved to the nearest
 *                    multiple of striping_unit (rounding down when the
 *                    lower boundary is strictly closer, up otherwise)
 *   nprocs_for_coll  number of domains to create; must be > 0 because it
 *                    is used as a divisor
 *
 * Outputs
 *   *min_st_offset_ptr  smallest value found in start_offsets[]
 *   *fd_size_ptr        domain size actually used
 *   *fd_st_ptr          malloc'ed array of nprocs_for_coll domain starts
 *   *fd_end_ptr         malloc'ed array of nprocs_for_coll domain ends
 *
 * Both arrays are handed to the caller, who must free() them.  A domain
 * that would start beyond the largest end offset is marked empty by
 * setting its start and end to -1.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE if an allocation
 * fails.  On failure no memory remains allocated by this function; if
 * the second allocation is the one that failed, *fd_st_ptr is reset to
 * NULL.
 */
int mca_fcoll_two_phase_domain_partition (mca_io_ompio_file_t *fh,
                                          OMPI_MPI_OFFSET_TYPE *start_offsets,
                                          OMPI_MPI_OFFSET_TYPE *end_offsets,
                                          OMPI_MPI_OFFSET_TYPE *min_st_offset_ptr,
                                          OMPI_MPI_OFFSET_TYPE **fd_st_ptr,
                                          OMPI_MPI_OFFSET_TYPE **fd_end_ptr,
                                          int min_fd_size,
                                          OMPI_MPI_OFFSET_TYPE *fd_size_ptr,
                                          int striping_unit,
                                          int nprocs_for_coll)
{
    OMPI_MPI_OFFSET_TYPE min_st_offset, max_end_offset, fd_size;
    OMPI_MPI_OFFSET_TYPE *fd_start = NULL, *fd_end = NULL;
    int i;

    /* Overall extent of the access: [min_st_offset, max_end_offset]. */
    min_st_offset  = start_offsets[0];
    max_end_offset = end_offsets[0];
    for (i = 1; i < fh->f_size; i++) {
        min_st_offset  = OMPIO_MIN(min_st_offset, start_offsets[i]);
        max_end_offset = OMPIO_MAX(max_end_offset, end_offsets[i]);
    }

    /* Ceiling division of the extent by the number of domains. */
    fd_size = ((max_end_offset - min_st_offset + 1) + nprocs_for_coll - 1) / nprocs_for_coll;
    if (fd_size < min_fd_size) {
        fd_size = min_fd_size;
    }

    *fd_st_ptr = (OMPI_MPI_OFFSET_TYPE *) malloc(nprocs_for_coll * sizeof(OMPI_MPI_OFFSET_TYPE));
    if (NULL == *fd_st_ptr) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    *fd_end_ptr = (OMPI_MPI_OFFSET_TYPE *) malloc(nprocs_for_coll * sizeof(OMPI_MPI_OFFSET_TYPE));
    if (NULL == *fd_end_ptr) {
        /* Do not leak the first array, and do not leave the caller with
         * a pointer it might free a second time. */
        free(*fd_st_ptr);
        *fd_st_ptr = NULL;
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    fd_start = *fd_st_ptr;
    fd_end   = *fd_end_ptr;

    if (striping_unit > 0) {
        /* Lock-boundary based partitioning: the nominal end of domain i
         * is min_st_offset + (i+1)*fd_size, then snapped to the nearest
         * multiple of striping_unit. */
        for (i = 0; i < nprocs_for_coll; i++) {
            OMPI_MPI_OFFSET_TYPE end_off = min_st_offset + fd_size * (i + 1);
            int rem_front = end_off % striping_unit;
            int rem_back  = striping_unit - rem_front;

            fd_start[i] = (0 == i) ? min_st_offset : fd_end[i-1] + 1;

            if (rem_front < rem_back) {
                end_off -= rem_front;
            } else {
                end_off += rem_back;
            }
            fd_end[i] = end_off - 1;
        }
        /* The last domain always extends to the end of the access. */
        fd_end[nprocs_for_coll-1] = max_end_offset;
    } else {
        /* Plain even partitioning: every domain is exactly fd_size long. */
        for (i = 0; i < nprocs_for_coll; i++) {
            fd_start[i] = (0 == i) ? min_st_offset : fd_end[i-1] + 1;
            fd_end[i]   = fd_start[i] + fd_size - 1;
        }
    }

    /* Clamp to the accessed range; domains entirely past it become empty. */
    for (i = 0; i < nprocs_for_coll; i++) {
        if (fd_start[i] > max_end_offset) {
            fd_start[i] = fd_end[i] = -1;
        }
        if (fd_end[i] > max_end_offset) {
            fd_end[i] = max_end_offset;
        }
    }

    *fd_size_ptr = fd_size;
    *min_st_offset_ptr = min_st_offset;

    return OMPI_SUCCESS;
}
static int two_phase_exch_and_write(mca_io_ompio_file_t *fh, void *buf, MPI_Datatype datatype, mca_io_ompio_access_array_t *others_req, struct iovec *offset_len, int contig_access_count, OMPI_MPI_OFFSET_TYPE min_st_offset, OMPI_MPI_OFFSET_TYPE fd_size, OMPI_MPI_OFFSET_TYPE *fd_start, OMPI_MPI_OFFSET_TYPE *fd_end, Flatlist_node *flat_buf, size_t *buf_idx, int striping_unit, int *aggregator_list) { int i, j, ntimes, max_ntimes, m; int *curr_offlen_ptr=NULL, *count=NULL, *send_size=NULL, *recv_size=NULL; int *partial_recv=NULL, *start_pos=NULL, req_len, flag; int *sent_to_proc=NULL, ret = OMPI_SUCCESS; int *send_buf_idx=NULL, *curr_to_proc=NULL, *done_to_proc=NULL; OMPI_MPI_OFFSET_TYPE st_loc=-1, end_loc=-1, off, done; OMPI_MPI_OFFSET_TYPE size=0, req_off, len; MPI_Aint buftype_extent; int hole; size_t byte_size; MPI_Datatype byte = MPI_BYTE; #if DEBUG_ON int ii,jj; #endif char *write_buf=NULL; opal_datatype_type_size(&byte->super, &byte_size); for (i = 0; i < fh->f_size; i++){ if (others_req[i].count) { st_loc = others_req[i].offsets[0]; end_loc = others_req[i].offsets[0]; break; } } for (i=0;i<fh->f_size;i++){ for(j=0;j< others_req[i].count; j++){ st_loc = OMPIO_MIN(st_loc, others_req[i].offsets[j]); end_loc = OMPIO_MAX(end_loc, (others_req[i].offsets[j] + others_req[i].lens[j] - 1)); } } ntimes = (int) ((end_loc - st_loc + mca_fcoll_two_phase_cycle_buffer_size)/mca_fcoll_two_phase_cycle_buffer_size); if ((st_loc == -1) && (end_loc == -1)) { ntimes = 0; } fh->f_comm->c_coll.coll_allreduce (&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fh->f_comm, fh->f_comm->c_coll.coll_allreduce_module); if (ntimes){ write_buf = (char *) malloc (mca_fcoll_two_phase_cycle_buffer_size); if ( NULL == write_buf ){ return OMPI_ERR_OUT_OF_RESOURCE; } } curr_offlen_ptr = (int *) calloc(fh->f_size, sizeof(int)); if ( NULL == curr_offlen_ptr ){ return OMPI_ERR_OUT_OF_RESOURCE; } count = (int *) malloc(fh->f_size*sizeof(int)); if ( NULL == count ){ return OMPI_ERR_OUT_OF_RESOURCE; } 
partial_recv = (int *)calloc(fh->f_size, sizeof(int)); if ( NULL == partial_recv ){ return OMPI_ERR_OUT_OF_RESOURCE; } send_size = (int *) calloc(fh->f_size,sizeof(int)); if ( NULL == send_size ){ return OMPI_ERR_OUT_OF_RESOURCE; } recv_size = (int *) calloc(fh->f_size,sizeof(int)); if ( NULL == recv_size ){ return OMPI_ERR_OUT_OF_RESOURCE; } send_buf_idx = (int *) malloc(fh->f_size*sizeof(int)); if ( NULL == send_buf_idx ){ return OMPI_ERR_OUT_OF_RESOURCE; } sent_to_proc = (int *) calloc(fh->f_size, sizeof(int)); if ( NULL == sent_to_proc){ return OMPI_ERR_OUT_OF_RESOURCE; } curr_to_proc = (int *) malloc(fh->f_size*sizeof(int)); if ( NULL == curr_to_proc ){ return OMPI_ERR_OUT_OF_RESOURCE; } done_to_proc = (int *) malloc(fh->f_size*sizeof(int)); if ( NULL == done_to_proc ){ return OMPI_ERR_OUT_OF_RESOURCE; } start_pos = (int *) malloc(fh->f_size*sizeof(int)); if ( NULL == start_pos ){ return OMPI_ERR_OUT_OF_RESOURCE; } done = 0; off = st_loc; ompi_datatype_type_extent(datatype, &buftype_extent); for (m=0;m <ntimes; m++){ for (i=0; i< fh->f_size; i++) count[i] = recv_size[i] = 0; size = OMPIO_MIN((unsigned)mca_fcoll_two_phase_cycle_buffer_size, end_loc-st_loc+1-done); for (i=0;i<fh->f_size;i++){ if(others_req[i].count){ start_pos[i] = curr_offlen_ptr[i]; for (j=curr_offlen_ptr[i]; j<others_req[i].count; j++) { if (partial_recv[i]) { /* this request may have been partially satisfied in the previous iteration. 
*/ req_off = others_req[i].offsets[j] + partial_recv[i]; req_len = others_req[i].lens[j] - partial_recv[i]; partial_recv[i] = 0; /* modify the off-len pair to reflect this change */ others_req[i].offsets[j] = req_off; others_req[i].lens[j] = req_len; } else { req_off = others_req[i].offsets[j]; req_len = others_req[i].lens[j]; } if (req_off < off + size) { count[i]++; #if DEBUG_ON printf("%d: req_off : %lld, off : %lld, size : %lld, count[%d]: %d\n", fh->f_rank, req_off, off, size,i, count[i]); #endif MPI_Address(write_buf+req_off-off, &(others_req[i].mem_ptrs[j])); #if DEBUG_ON printf("%d : mem_ptrs : %ld\n", fh->f_rank, others_req[i].mem_ptrs[j]); #endif recv_size[i] += (int) (OMPIO_MIN(off + size - req_off, (unsigned)req_len)); if (off+size-req_off < (unsigned)req_len){ partial_recv[i] = (int)(off + size - req_off); break; } } else break; } curr_offlen_ptr[i] = j; } } ret = two_phase_exchage_data(fh, buf, write_buf, offset_len,send_size, start_pos,recv_size,off,size, count, partial_recv, sent_to_proc, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, flat_buf, others_req, send_buf_idx, curr_to_proc, done_to_proc, m, buf_idx, buftype_extent, striping_unit, aggregator_list, &hole); if ( OMPI_SUCCESS != ret ){ goto exit; } flag = 0; for (i=0; i<fh->f_size; i++) if (count[i]) flag = 1; if (flag){ #if TIME_BREAKDOWN start_write_time = MPI_Wtime(); #endif #if DEBUG_ON printf("rank : %d enters writing\n", fh->f_rank); printf("size : %ld, off : %ld\n",size, off); for (ii=0, jj=0;jj<size;jj+=4, ii++){ printf("%d : write_buf[%d]: %d\n", fh->f_rank, ii,((int *)write_buf[jj])); } #endif len = size * byte_size; fh->f_io_array = (mca_io_ompio_io_array_t *)malloc (sizeof(mca_io_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } fh->f_io_array[0].offset =(IOVBASE_TYPE *)(intptr_t) off; fh->f_io_array[0].length = len; fh->f_io_array[0].memory_address = write_buf; fh->f_num_of_io_entries = 1; #if 
DEBUG_ON for (i=0 ; i<fh->f_num_of_io_entries ; i++) { printf("%d: ADDRESS: %p OFFSET: %ld LENGTH: %d\n", fh->f_rank, fh->f_io_array[i].memory_address, fh->f_io_array[i].offset, fh->f_io_array[i].length); } #endif if (fh->f_num_of_io_entries){ if (OMPI_SUCCESS != fh->f_fbtl->fbtl_pwritev (fh, NULL)) { opal_output(1, "WRITE FAILED\n"); return OMPI_ERROR; } } #if TIME_BREAKDOWN end_write_time = MPI_Wtime(); write_time += (end_write_time - start_write_time); #endif } /***************** DONE WRITING *****************************************/ /****RESET **********************/ fh->f_num_of_io_entries = 0; if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } off += size; done += size; } for (i=0; i<fh->f_size; i++) count[i] = recv_size[i] = 0; for (m=ntimes; m<max_ntimes; m++) { ret = two_phase_exchage_data(fh, buf, write_buf, offset_len,send_size, start_pos,recv_size,off,size, count, partial_recv, sent_to_proc, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, flat_buf,others_req, send_buf_idx, curr_to_proc, done_to_proc, m, buf_idx, buftype_extent, striping_unit, aggregator_list, &hole); if ( OMPI_SUCCESS != ret ){ goto exit; } } exit: if (ntimes){ if ( NULL != write_buf ){ free(write_buf); } } if ( NULL != curr_offlen_ptr ){ free(curr_offlen_ptr); } if ( NULL != count ){ free(count); } if ( NULL != partial_recv ){ free(partial_recv); } if ( NULL != send_size ){ free(send_size); } if ( NULL != recv_size ){ free(recv_size); } if ( NULL != sent_to_proc ){ free(sent_to_proc); } if ( NULL != start_pos ){ free(start_pos); } if ( NULL != send_buf_idx ){ free(send_buf_idx); } if ( NULL != curr_to_proc ){ free(curr_to_proc); } if ( NULL != done_to_proc ){ free(done_to_proc); } return ret; }
static int two_phase_read_and_exch(mca_io_ompio_file_t *fh, void *buf, MPI_Datatype datatype, mca_io_ompio_access_array_t *others_req, struct iovec *offset_len, int contig_access_count, OMPI_MPI_OFFSET_TYPE min_st_offset, OMPI_MPI_OFFSET_TYPE fd_size, OMPI_MPI_OFFSET_TYPE *fd_start, OMPI_MPI_OFFSET_TYPE *fd_end, Flatlist_node *flat_buf, size_t *buf_idx, int striping_unit, int *aggregator_list){ int ret=OMPI_SUCCESS, i = 0, j = 0, ntimes = 0, max_ntimes = 0; int m = 0; int *curr_offlen_ptr=NULL, *count=NULL, *send_size=NULL, *recv_size=NULL; int *partial_send=NULL, *start_pos=NULL, req_len=0, flag=0; int *recd_from_proc=NULL; MPI_Aint buftype_extent=0; size_t byte_size = 0; OMPI_MPI_OFFSET_TYPE st_loc=-1, end_loc=-1, off=0, done=0, for_next_iter=0; OMPI_MPI_OFFSET_TYPE size=0, req_off=0, real_size=0, real_off=0, len=0; OMPI_MPI_OFFSET_TYPE for_curr_iter=0; char *read_buf=NULL, *tmp_buf=NULL; MPI_Datatype byte = MPI_BYTE; opal_datatype_type_size(&byte->super, &byte_size); for (i = 0; i < fh->f_size; i++){ if (others_req[i].count) { st_loc = others_req[i].offsets[0]; end_loc = others_req[i].offsets[0]; break; } } for (i=0;i<fh->f_size;i++){ for(j=0;j< others_req[i].count; j++){ st_loc = OMPIO_MIN(st_loc, others_req[i].offsets[j]); end_loc = OMPIO_MAX(end_loc, (others_req[i].offsets[j] + others_req[i].lens[j] - 1)); } } ntimes = (int)((end_loc - st_loc + mca_fcoll_two_phase_cycle_buffer_size)/ mca_fcoll_two_phase_cycle_buffer_size); if ((st_loc == -1) && (end_loc == -1)){ ntimes = 0; } fh->f_comm->c_coll.coll_allreduce (&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fh->f_comm, fh->f_comm->c_coll.coll_allreduce_module); if (ntimes){ read_buf = (char *) calloc (mca_fcoll_two_phase_cycle_buffer_size, sizeof(char)); if ( NULL == read_buf ){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } curr_offlen_ptr = (int *)calloc (fh->f_size, sizeof(int)); if (NULL == curr_offlen_ptr){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } count = (int *)calloc (fh->f_size, sizeof(int)); if (NULL 
== count){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } partial_send = (int *)calloc(fh->f_size, sizeof(int)); if ( NULL == partial_send ){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } send_size = (int *)malloc(fh->f_size * sizeof(int)); if (NULL == send_size){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } recv_size = (int *)malloc(fh->f_size * sizeof(int)); if (NULL == recv_size){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } recd_from_proc = (int *)calloc(fh->f_size,sizeof(int)); if (NULL == recd_from_proc){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } start_pos = (int *) calloc(fh->f_size, sizeof(int)); if ( NULL == start_pos ){ ret = OMPI_ERR_OUT_OF_RESOURCE; return ret; } done = 0; off = st_loc; for_curr_iter = for_next_iter = 0; ompi_datatype_type_extent(datatype, &buftype_extent); for (m=0; m<ntimes; m++) { size = OMPIO_MIN((unsigned)mca_fcoll_two_phase_cycle_buffer_size, end_loc-st_loc+1-done); real_off = off - for_curr_iter; real_size = size + for_curr_iter; for (i=0; i<fh->f_size; i++) count[i] = send_size[i] = 0; for_next_iter = 0; for (i=0; i<fh->f_size; i++) { if (others_req[i].count) { start_pos[i] = curr_offlen_ptr[i]; for (j=curr_offlen_ptr[i]; j<others_req[i].count; j++) { if (partial_send[i]) { /* this request may have been partially satisfied in the previous iteration. 
*/ req_off = others_req[i].offsets[j] + partial_send[i]; req_len = others_req[i].lens[j] - partial_send[i]; partial_send[i] = 0; /* modify the off-len pair to reflect this change */ others_req[i].offsets[j] = req_off; others_req[i].lens[j] = req_len; } else { req_off = others_req[i].offsets[j]; req_len = others_req[i].lens[j]; } if (req_off < real_off + real_size) { count[i]++; MPI_Address(read_buf+req_off-real_off, &(others_req[i].mem_ptrs[j])); send_size[i] += (int)(OMPIO_MIN(real_off + real_size - req_off, (OMPI_MPI_OFFSET_TYPE)req_len)); if (real_off+real_size-req_off < (OMPI_MPI_OFFSET_TYPE)req_len) { partial_send[i] = (int) (real_off + real_size - req_off); if ((j+1 < others_req[i].count) && (others_req[i].offsets[j+1] < real_off+real_size)) { /* this is the case illustrated in the figure above. */ for_next_iter = OMPIO_MAX(for_next_iter, real_off + real_size - others_req[i].offsets[j+1]); /* max because it must cover requests from different processes */ } break; } } else break; } curr_offlen_ptr[i] = j; } } flag = 0; for (i=0; i<fh->f_size; i++) if (count[i]) flag = 1; if (flag) { #if TIME_BREAKDOWN start_read_time = MPI_Wtime(); #endif len = size * byte_size; fh->f_io_array = (mca_io_ompio_io_array_t *)calloc (1,sizeof(mca_io_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } fh->f_io_array[0].offset = (IOVBASE_TYPE *)(intptr_t)off; fh->f_io_array[0].length = len; fh->f_io_array[0].memory_address = read_buf+for_curr_iter; fh->f_num_of_io_entries = 1; if (fh->f_num_of_io_entries){ if (OMPI_SUCCESS != fh->f_fbtl->fbtl_preadv (fh, NULL)) { opal_output(1, "READ FAILED\n"); return OMPI_ERROR; } } #if 0 int ii; printf("%d: len/4 : %lld\n", fh->f_rank, len/4); for (ii = 0; ii < len/4 ;ii++){ printf("%d: read_buf[%d]: %ld\n", fh->f_rank, ii, (int *)read_buf[ii]); } #endif fh->f_num_of_io_entries = 0; if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } #if TIME_BREAKDOWN 
end_read_time = MPI_Wtime(); read_time += (end_read_time - start_read_time); #endif } for_curr_iter = for_next_iter; for (i=0; i< fh->f_size; i++){ recv_size[i] = 0; } two_phase_exchange_data(fh, buf, offset_len, send_size, start_pos, recv_size, count, partial_send, recd_from_proc, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, flat_buf, others_req, m, buf_idx, buftype_extent, striping_unit, aggregator_list); if (for_next_iter){ tmp_buf = (char *) calloc (for_next_iter, sizeof(char)); memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter); free(read_buf); read_buf = (char *)malloc(for_next_iter+mca_fcoll_two_phase_cycle_buffer_size); memcpy(read_buf, tmp_buf, for_next_iter); free(tmp_buf); } off += size; done += size; } for (i=0; i<fh->f_size; i++) count[i] = send_size[i] = 0; for (m=ntimes; m<max_ntimes; m++) two_phase_exchange_data(fh, buf, offset_len, send_size, start_pos, recv_size, count, partial_send, recd_from_proc, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, flat_buf, others_req, m, buf_idx, buftype_extent, striping_unit, aggregator_list); if (ntimes){ free(read_buf); read_buf = NULL; } if (NULL != curr_offlen_ptr){ free(curr_offlen_ptr); curr_offlen_ptr = NULL; } if (NULL != count){ free(count); count = NULL; } if (NULL != partial_send){ free(partial_send); partial_send = NULL; } if (NULL != send_size){ free(send_size); send_size = NULL; } if (NULL != recv_size){ free(recv_size); recv_size = NULL; } if (NULL != recd_from_proc){ free(recd_from_proc); recd_from_proc = NULL; } if (NULL != start_pos){ free(start_pos); start_pos = NULL; } exit: return ret; }
/*
 * two_phase_fill_send_buffer: walk the caller's contig_access_count
 * offset/length pairs and feed the per-destination send buffers.
 *
 * Setup: for every index below fh->f_size, send_buf_idx[] and
 * curr_to_proc[] are zeroed and done_to_proc[] is loaded from
 * sent_to_proc[].  The user-buffer cursor starts at flat_buf->indices[0]
 * with flat_buf->blocklens[0] bytes in the current piece.
 *
 * For each pair, iov_base is treated as a file offset and iov_len as the
 * remaining length.  mca_fcoll_two_phase_calc_aggregator() returns the
 * target index p and is passed &len, so it may shorten len.  While
 * send_buf_idx[p] < send_size[p], the macros TWO_PHASE_BUF_INCR /
 * TWO_PHASE_BUF_COPY are invoked; they are defined outside this block
 * (presumably they advance the user-buffer cursor and copy into
 * send_buf[p] -- confirm against the macro definitions).
 *
 * Once send_buf_idx[p] == send_size[p], send_buf[p] is posted with a
 * non-blocking isend of send_size[p] MPI_BYTEs to rank p using tag
 * fh->f_rank+p+100*iter; the request is stored at requests+jj and jj is
 * incremented.  A failing isend makes the function return that error
 * code immediately.
 *
 * NOTE(review): the body appears truncated in this file -- the closing
 * of the while loop, of the for loop and of the function (including the
 * final return) are not present below.
 */
static int two_phase_fill_send_buffer(mca_io_ompio_file_t *fh, void *buf, Flatlist_node *flat_buf, char **send_buf, struct iovec *offset_length, int *send_size, MPI_Request *requests, int *sent_to_proc, int contig_access_count, OMPI_MPI_OFFSET_TYPE min_st_offset, OMPI_MPI_OFFSET_TYPE fd_size, OMPI_MPI_OFFSET_TYPE *fd_start, OMPI_MPI_OFFSET_TYPE *fd_end, int *send_buf_idx, int *curr_to_proc, int *done_to_proc, int iter, MPI_Aint buftype_extent, int striping_unit, int *aggregator_list){ int i, p, flat_buf_idx; OMPI_MPI_OFFSET_TYPE flat_buf_sz, size_in_buf, buf_incr, size; int jj, n_buftypes, ret=OMPI_SUCCESS; OMPI_MPI_OFFSET_TYPE off, len, rem_len, user_buf_idx; for (i=0; i < fh->f_size; i++) { send_buf_idx[i] = curr_to_proc[i] = 0; done_to_proc[i] = sent_to_proc[i]; } jj = 0; user_buf_idx = flat_buf->indices[0]; flat_buf_idx = 0; n_buftypes = 0; flat_buf_sz = flat_buf->blocklens[0]; for (i=0; i<contig_access_count; i++) { off = (OMPI_MPI_OFFSET_TYPE) (intptr_t)offset_length[i].iov_base; rem_len = (OMPI_MPI_OFFSET_TYPE)offset_length[i].iov_len; while (rem_len != 0) { len = rem_len; p = mca_fcoll_two_phase_calc_aggregator(fh, off, min_st_offset, &len, fd_size, fd_start, fd_end, striping_unit, mca_fcoll_two_phase_num_io_procs, aggregator_list); if (send_buf_idx[p] < send_size[p]) { if (curr_to_proc[p]+len > done_to_proc[p]) { if (done_to_proc[p] > curr_to_proc[p]) { size = OMPIO_MIN(curr_to_proc[p] + len - done_to_proc[p], send_size[p]-send_buf_idx[p]); buf_incr = done_to_proc[p] - curr_to_proc[p]; TWO_PHASE_BUF_INCR buf_incr = curr_to_proc[p] + len - done_to_proc[p]; curr_to_proc[p] = done_to_proc[p] + size; TWO_PHASE_BUF_COPY } else { size = OMPIO_MIN(len,send_size[p]-send_buf_idx[p]); buf_incr = len; curr_to_proc[p] += size; TWO_PHASE_BUF_COPY } if (send_buf_idx[p] == send_size[p]) { ret = MCA_PML_CALL(isend(send_buf[p], send_size[p], MPI_BYTE, p, fh->f_rank+p+100*iter, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, requests+jj)); if ( OMPI_SUCCESS != ret ){ return ret; } 
jj++; } } else { curr_to_proc[p] += len; buf_incr = len; TWO_PHASE_BUF_INCR } }
static void two_phase_fill_user_buffer(mca_io_ompio_file_t *fh, void *buf, Flatlist_node *flat_buf, char **recv_buf, struct iovec *offset_length, unsigned *recv_size, MPI_Request *requests, int *recd_from_proc, int contig_access_count, OMPI_MPI_OFFSET_TYPE min_st_offset, OMPI_MPI_OFFSET_TYPE fd_size, OMPI_MPI_OFFSET_TYPE *fd_start, OMPI_MPI_OFFSET_TYPE *fd_end, MPI_Aint buftype_extent, int striping_unit, int *aggregator_list){ int i = 0, p = 0, flat_buf_idx = 0; OMPI_MPI_OFFSET_TYPE flat_buf_sz = 0, size_in_buf = 0, buf_incr = 0, size = 0; int n_buftypes = 0; OMPI_MPI_OFFSET_TYPE off=0, len=0, rem_len=0, user_buf_idx=0; unsigned *curr_from_proc=NULL, *done_from_proc=NULL, *recv_buf_idx=NULL; curr_from_proc = (unsigned *) malloc (fh->f_size * sizeof(unsigned)); done_from_proc = (unsigned *) malloc (fh->f_size * sizeof(unsigned)); recv_buf_idx = (unsigned *) malloc (fh->f_size * sizeof(unsigned)); for (i=0; i < fh->f_size; i++) { recv_buf_idx[i] = curr_from_proc[i] = 0; done_from_proc[i] = recd_from_proc[i]; } user_buf_idx = flat_buf->indices[0]; flat_buf_idx = 0; n_buftypes = 0; flat_buf_sz = flat_buf->blocklens[0]; /* flat_buf_idx = current index into flattened buftype flat_buf_sz = size of current contiguous component in flattened buf */ for (i=0; i<contig_access_count; i++) { off = (OMPI_MPI_OFFSET_TYPE)(intptr_t)offset_length[i].iov_base; rem_len = (OMPI_MPI_OFFSET_TYPE)offset_length[i].iov_len; /* this request may span the file domains of more than one process */ while (rem_len != 0) { len = rem_len; /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no * longer than the single region that processor "p" is responsible * for. 
*/ p = mca_fcoll_two_phase_calc_aggregator(fh, off, min_st_offset, &len, fd_size, fd_start, fd_end, striping_unit, mca_fcoll_two_phase_num_io_procs, aggregator_list); if (recv_buf_idx[p] < recv_size[p]) { if (curr_from_proc[p]+len > done_from_proc[p]) { if (done_from_proc[p] > curr_from_proc[p]) { size = OMPIO_MIN(curr_from_proc[p] + len - done_from_proc[p], recv_size[p]-recv_buf_idx[p]); buf_incr = done_from_proc[p] - curr_from_proc[p]; TWO_PHASE_BUF_INCR buf_incr = curr_from_proc[p]+len-done_from_proc[p]; curr_from_proc[p] = done_from_proc[p] + size; TWO_PHASE_BUF_COPY } else { size = OMPIO_MIN(len,recv_size[p]-recv_buf_idx[p]); buf_incr = len; curr_from_proc[p] += (unsigned) size; TWO_PHASE_BUF_COPY } } else { curr_from_proc[p] += (unsigned) len; buf_incr = len; TWO_PHASE_BUF_INCR } }