/* Sets error_code to MPI_SUCCESS if successful, or creates an error code
 * in the case of error.
 */
static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
                                  ADIOI_Flatlist_node * flat_buf, ADIO_Offset * offset_list,
                                  ADIO_Offset * len_list, int *send_size, int *recv_size,
                                  ADIO_Offset off, int size, int *count, int *start_pos,
                                  int *partial_recv, int *sent_to_proc, int nprocs,
                                  int myrank, int buftype_is_contig, int contig_access_count,
                                  ADIO_Offset min_st_offset, ADIO_Offset fd_size,
                                  ADIO_Offset * fd_start, ADIO_Offset * fd_end,
                                  ADIOI_Access * others_req, int *send_buf_idx,
                                  int *curr_to_proc, int *done_to_proc, int *hole, int iter,
                                  MPI_Aint buftype_extent, MPI_Aint * buf_idx, int *error_code)
{
    int i, j, k, *tmp_len, nprocs_recv, nprocs_send, err;
    char **send_buf = NULL;
    MPI_Request *requests, *send_req;
    MPI_Datatype *recv_types;
    MPI_Status *statuses, status;
    int *srt_len = NULL, sum;
    ADIO_Offset *srt_off = NULL;
    static char myname[] = "ADIOI_W_EXCHANGE_DATA";

    /* exchange recv_size info so that each process knows how much to
     * send to whom. */

    MPI_Alltoall(recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fd->comm);

    /* create derived datatypes for recv */

    nprocs_send = 0;
    nprocs_recv = 0;
    sum = 0;
    for (i = 0; i < nprocs; i++) {
        sum += count[i];
        if (recv_size[i])
            nprocs_recv++;
        if (send_size[i])
            nprocs_send++;
    }

    recv_types = (MPI_Datatype *)
        ADIOI_Malloc((nprocs_recv + 1) * sizeof(MPI_Datatype));
    /* +1 to avoid a 0-size malloc */

    tmp_len = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    j = 0;
    for (i = 0; i < nprocs; i++) {
        if (recv_size[i]) {
            /* take care if the last off-len pair is a partial recv */
            if (partial_recv[i]) {
                k = start_pos[i] + count[i] - 1;
                tmp_len[i] = others_req[i].lens[k];
                others_req[i].lens[k] = partial_recv[i];
            }
            ADIOI_Type_create_hindexed_x(count[i],
                                         &(others_req[i].lens[start_pos[i]]),
                                         &(others_req[i].mem_ptrs[start_pos[i]]),
                                         MPI_BYTE, recv_types + j);
            /* absolute displacements; use MPI_BOTTOM in recv */
            MPI_Type_commit(recv_types + j);
            j++;
        }
    }

    /* To avoid a read-modify-write, check if there are holes in the
     * data to be written. For this, merge the (sorted) offset lists
     * others_req using a heap-merge. */

    /* valgrind-detected optimization: if there is no work on this process we do
     * not need to search for holes */
    if (sum) {
        srt_off = (ADIO_Offset *) ADIOI_Malloc(sum * sizeof(ADIO_Offset));
        srt_len = (int *) ADIOI_Malloc(sum * sizeof(int));

        ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos, nprocs, nprocs_recv, sum);
    }

    /* for partial recvs, restore original lengths */
    for (i = 0; i < nprocs; i++)
        if (partial_recv[i]) {
            k = start_pos[i] + count[i] - 1;
            others_req[i].lens[k] = tmp_len[i];
        }
    ADIOI_Free(tmp_len);

    /* check if there are any holes. If yes, must do read-modify-write.
     * holes can be in three places. 'middle' is what you'd expect: the
     * processes are operating on noncontiguous data. But holes can also show
     * up at the beginning or end of the file domain (see John Bent ROMIO REQ
     * #835). Missing these holes would result in us writing more data than
     * received by everyone else. */
    *hole = 0;
    if (sum) {
        if (off != srt_off[0])  /* hole at the front */
            *hole = 1;
        else {  /* coalesce the sorted offset-length pairs */
            for (i = 1; i < sum; i++) {
                if (srt_off[i] <= srt_off[0] + srt_len[0]) {
                    /* ok to cast: operating on cb_buffer_size chunks */
                    int new_len = (int) srt_off[i] + srt_len[i] - (int) srt_off[0];
                    if (new_len > srt_len[0])
                        srt_len[0] = new_len;
                } else
                    break;
            }
            if (i < sum || size != srt_len[0])  /* hole in middle or end */
                *hole = 1;
        }

        ADIOI_Free(srt_off);
        ADIOI_Free(srt_len);
    }

    if (nprocs_recv) {
        if (*hole) {
            ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
                            ADIO_EXPLICIT_OFFSET, off, &status, &err);
            /* --BEGIN ERROR HANDLING-- */
            if (err != MPI_SUCCESS) {
                *error_code = MPIO_Err_create_code(err,
                                                   MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__,
                                                   MPI_ERR_IO, "**ioRMWrdwr", 0);
                return;
            }
            /* --END ERROR HANDLING-- */
        }
    }

    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) * sizeof(MPI_Request));
        send_req = requests;
    } else {
        requests = (MPI_Request *)
            ADIOI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Request));
        /* +1 to avoid a 0-size malloc */

        /* post receives */
        j = 0;
        for (i = 0; i < nprocs; i++) {
            if (recv_size[i]) {
                MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i, myrank + i + 100 * iter,
                          fd->comm, requests + j);
                j++;
            }
        }
        send_req = requests + nprocs_recv;
    }

    /* post sends. if buftype_is_contig, data can be directly sent from
     * user buf at location given by buf_idx. else use send_buf. */

#ifdef AGGREGATION_PROFILE
    MPE_Log_event(5032, 0, NULL);
#endif
    if (buftype_is_contig) {
        j = 0;
        for (i = 0; i < nprocs; i++)
            if (send_size[i]) {
                MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
                          MPI_BYTE, i, myrank + i + 100 * iter, fd->comm, send_req + j);
                j++;
                buf_idx[i] += send_size[i];
            }
    } else if (nprocs_send) {
        /* buftype is not contig */
        size_t msgLen = 0;
        for (i = 0; i < nprocs; i++)
            msgLen += send_size[i];
        send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
        send_buf[0] = (char *) ADIOI_Malloc(msgLen * sizeof(char));
        for (i = 1; i < nprocs; i++)
            send_buf[i] = send_buf[i - 1] + send_size[i - 1];

        ADIOI_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list, len_list,
                               send_size, send_req, sent_to_proc, nprocs, myrank,
                               contig_access_count, min_st_offset, fd_size,
                               fd_start, fd_end, send_buf_idx, curr_to_proc,
                               done_to_proc, iter, buftype_extent);
        /* the send is done in ADIOI_Fill_send_buffer */
    }

    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        j = 0;
        for (i = 0; i < nprocs; i++) {
            MPI_Status wkl_status;
            if (recv_size[i]) {
                MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i, myrank + i + 100 * iter,
                         fd->comm, &wkl_status);
                j++;
            }
        }
    }

    for (i = 0; i < nprocs_recv; i++)
        MPI_Type_free(recv_types + i);
    ADIOI_Free(recv_types);

#ifdef MPI_STATUSES_IGNORE
    statuses = MPI_STATUSES_IGNORE;
#else
    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) * sizeof(MPI_Status));
        /* +1 to avoid a 0-size malloc */
    } else {
        statuses = (MPI_Status *)
            ADIOI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Status));
        /* +1 to avoid a 0-size malloc */
    }
#endif

#ifdef NEEDS_MPI_TEST
    i = 0;
    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        while (!i)
            MPI_Testall(nprocs_send, send_req, &i, statuses);
    } else {
        while (!i)
            MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses);
    }
#else
    if (fd->atomicity)
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        MPI_Waitall(nprocs_send, send_req, statuses);
    else
        MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses);
#endif

#ifdef AGGREGATION_PROFILE
    MPE_Log_event(5033, 0, NULL);
#endif

#ifndef MPI_STATUSES_IGNORE
    ADIOI_Free(statuses);
#endif
    ADIOI_Free(requests);
    if (!buftype_is_contig && nprocs_send) {
        ADIOI_Free(send_buf[0]);
        ADIOI_Free(send_buf);
    }
}
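/*
 * Illustrative sketch (not part of the ROMIO code above): the hole check in
 * ADIOI_W_Exchange_data() scans offset/length pairs that ADIOI_Heap_merge()
 * has sorted by offset and decides whether they completely cover the region
 * [off, off + size).  The helper below is a minimal, self-contained version
 * of that coverage check; the names check_for_hole and the main() driver are
 * hypothetical and exist only for this example, so the block is guarded out
 * of compilation with #if 0.
 */
#if 0
#include <stdio.h>

static int check_for_hole(const long long *srt_off, const int *srt_len,
                          int sum, long long off, int size)
{
    long long covered_end;
    int i;

    if (sum == 0 || srt_off[0] != off)
        return 1;               /* hole at the front of the region */

    covered_end = srt_off[0] + srt_len[0];
    for (i = 1; i < sum; i++) {
        if (srt_off[i] > covered_end)
            return 1;           /* gap between consecutive extents */
        if (srt_off[i] + srt_len[i] > covered_end)
            covered_end = srt_off[i] + srt_len[i];      /* extend coverage */
    }
    return (covered_end < off + size);  /* hole at the end of the region */
}

int main(void)
{
    long long offs[] = { 0, 100, 300 };
    int lens[] = { 100, 150, 100 };
    /* extents cover [0,250) and [300,400): a hole in the middle of [0,400) */
    printf("hole = %d\n", check_for_hole(offs, lens, 3, 0, 400));
    return 0;
}
#endif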
static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node *flat_buf,
                                  ADIO_Offset *offset_list, ADIO_Offset *len_list,
                                  int *send_size, int *recv_size, int *count,
                                  int *start_pos, int *partial_send,
                                  int *recd_from_proc, int nprocs, int myrank,
                                  int buftype_is_contig, int contig_access_count,
                                  ADIO_Offset min_st_offset, ADIO_Offset fd_size,
                                  ADIO_Offset *fd_start, ADIO_Offset *fd_end,
                                  ADIOI_Access *others_req, int iter,
                                  MPI_Aint buftype_extent, int *buf_idx)
{
    int i, j, k=0, tmp=0, nprocs_recv, nprocs_send;
    char **recv_buf = NULL;
    MPI_Request *requests;
    MPI_Datatype send_type;
    MPI_Status *statuses;

    /* exchange send_size info so that each process knows how much to
       receive from whom and how much memory to allocate. */

    MPI_Alltoall(send_size, 1, MPI_INT, recv_size, 1, MPI_INT, fd->comm);

    nprocs_recv = 0;
    for (i=0; i < nprocs; i++) if (recv_size[i]) nprocs_recv++;

    nprocs_send = 0;
    for (i=0; i<nprocs; i++) if (send_size[i]) nprocs_send++;

    requests = (MPI_Request *)
        ADIOI_Malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request));
    /* +1 to avoid a 0-size malloc */

    /* post recvs. if buftype_is_contig, data can be directly recd. into
       user buf at location given by buf_idx. else use recv_buf. */

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5032, 0, NULL);
#endif

    if (buftype_is_contig) {
        j = 0;
        for (i=0; i < nprocs; i++)
            if (recv_size[i]) {
                MPI_Irecv(((char *) buf) + buf_idx[i], recv_size[i],
                          MPI_BYTE, i, myrank+i+100*iter, fd->comm,
                          requests+j);
                j++;
                buf_idx[i] += recv_size[i];
            }
    }
    else {
        /* allocate memory for recv_buf and post receives */
        recv_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char*));
        for (i=0; i < nprocs; i++)
            if (recv_size[i])
                recv_buf[i] = (char *) ADIOI_Malloc(recv_size[i]);

        j = 0;
        for (i=0; i < nprocs; i++)
            if (recv_size[i]) {
                MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i,
                          myrank+i+100*iter, fd->comm, requests+j);
                j++;
#ifdef RDCOLL_DEBUG
                DBG_FPRINTF(stderr, "node %d, recv_size %d, tag %d \n",
                            myrank, recv_size[i], myrank+i+100*iter);
#endif
            }
    }

    /* create derived datatypes and send data */

    j = 0;
    for (i=0; i<nprocs; i++) {
        if (send_size[i]) {
            /* take care if the last off-len pair is a partial send */
            if (partial_send[i]) {
                k = start_pos[i] + count[i] - 1;
                tmp = others_req[i].lens[k];
                others_req[i].lens[k] = partial_send[i];
            }
            ADIOI_Type_create_hindexed_x(count[i],
                                         &(others_req[i].lens[start_pos[i]]),
                                         &(others_req[i].mem_ptrs[start_pos[i]]),
                                         MPI_BYTE, &send_type);
            /* absolute displacement; use MPI_BOTTOM in send */
            MPI_Type_commit(&send_type);
            MPI_Isend(MPI_BOTTOM, 1, send_type, i, myrank+i+100*iter,
                      fd->comm, requests+nprocs_recv+j);
            MPI_Type_free(&send_type);
            if (partial_send[i]) others_req[i].lens[k] = tmp;
            j++;
        }
    }

    statuses = (MPI_Status *)
        ADIOI_Malloc((nprocs_send+nprocs_recv+1) * sizeof(MPI_Status));
    /* +1 to avoid a 0-size malloc */

    /* wait on the receives */
    if (nprocs_recv) {
#ifdef NEEDS_MPI_TEST
        j = 0;
        while (!j) MPI_Testall(nprocs_recv, requests, &j, statuses);
#else
        MPI_Waitall(nprocs_recv, requests, statuses);
#endif

        /* if noncontiguous, do the copies from the recv buffers */
        if (!buftype_is_contig)
            ADIOI_Fill_user_buffer(fd, buf, flat_buf, recv_buf,
                                   offset_list, len_list, (unsigned*)recv_size,
                                   requests, statuses, recd_from_proc,
                                   nprocs, contig_access_count,
                                   min_st_offset, fd_size, fd_start, fd_end,
                                   buftype_extent);
    }

    /* wait on the sends */
    MPI_Waitall(nprocs_send, requests+nprocs_recv, statuses+nprocs_recv);

    ADIOI_Free(statuses);
    ADIOI_Free(requests);

    if (!buftype_is_contig) {
        for (i=0; i < nprocs; i++)
            if (recv_size[i]) ADIOI_Free(recv_buf[i]);
        ADIOI_Free(recv_buf);
    }
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5033, 0, NULL);
#endif
}
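/*
 * Illustrative sketch (not part of the ROMIO code above): the basic exchange
 * pattern used by ADIOI_R_Exchange_data().  Each process first learns how
 * many bytes to expect from every peer via MPI_Alltoall, then posts one
 * MPI_Irecv and one MPI_Isend per nonzero peer.  The tag
 * (myrank + i + 100 * iter) is symmetric in the two ranks, so sender and
 * receiver compute the same value.  Buffer management is simplified and all
 * names are hypothetical; the block is guarded out of compilation with #if 0.
 */
#if 0
#include <mpi.h>
#include <stdlib.h>

static void exchange_sizes_and_data(MPI_Comm comm, int iter,
                                    const int *send_size, char **send_buf,
                                    int *recv_size, char **recv_buf)
{
    int nprocs, myrank, i, nreq = 0;
    MPI_Request *reqs;

    MPI_Comm_size(comm, &nprocs);
    MPI_Comm_rank(comm, &myrank);

    /* each process tells every peer how many bytes it will send */
    MPI_Alltoall(send_size, 1, MPI_INT, recv_size, 1, MPI_INT, comm);

    reqs = (MPI_Request *) malloc(2 * nprocs * sizeof(MPI_Request));

    /* post receives first, then sends, using the symmetric tag */
    for (i = 0; i < nprocs; i++)
        if (recv_size[i]) {
            recv_buf[i] = (char *) malloc(recv_size[i]);
            MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i,
                      myrank + i + 100 * iter, comm, &reqs[nreq++]);
        }
    for (i = 0; i < nprocs; i++)
        if (send_size[i])
            MPI_Isend(send_buf[i], send_size[i], MPI_BYTE, i,
                      myrank + i + 100 * iter, comm, &reqs[nreq++]);

    MPI_Waitall(nreq, reqs, MPI_STATUSES_IGNORE);
    free(reqs);
}
#endif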
/* Sets error_code to MPI_SUCCESS if successful, or creates an error code
 * in the case of error.
 */
static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, const void *buf,
                                         char *write_buf,
                                         ADIOI_Flatlist_node *flat_buf,
                                         ADIO_Offset *offset_list,
                                         ADIO_Offset *len_list, int *send_size,
                                         int *recv_size, ADIO_Offset off,
                                         int size, int *count, int *start_pos,
                                         int *sent_to_proc, int nprocs,
                                         int myrank, int buftype_is_contig,
                                         int contig_access_count,
                                         int *striping_info,
                                         ADIOI_Access *others_req,
                                         int *send_buf_idx,
                                         int *curr_to_proc, int *done_to_proc,
                                         int *hole, int iter,
                                         MPI_Aint buftype_extent, int *buf_idx,
                                         ADIO_Offset **srt_off, int **srt_len,
                                         int *srt_num, int *error_code)
{
    int i, j, nprocs_recv, nprocs_send, err;
    char **send_buf = NULL;
    MPI_Request *requests, *send_req;
    MPI_Datatype *recv_types;
    MPI_Status *statuses, status;
    int sum_recv;
    int data_sieving = *hole;
    static char myname[] = "ADIOI_W_EXCHANGE_DATA";

    /* create derived datatypes for recv */
    nprocs_recv = 0;
    for (i = 0; i < nprocs; i++)
        if (recv_size[i])
            nprocs_recv++;

    recv_types = (MPI_Datatype *)
        ADIOI_Malloc((nprocs_recv + 1) * sizeof(MPI_Datatype));
    /* +1 to avoid a 0-size malloc */

    j = 0;
    for (i = 0; i < nprocs; i++) {
        if (recv_size[i]) {
            ADIOI_Type_create_hindexed_x(count[i],
                                         &(others_req[i].lens[start_pos[i]]),
                                         &(others_req[i].mem_ptrs[start_pos[i]]),
                                         MPI_BYTE, recv_types + j);
            /* absolute displacements; use MPI_BOTTOM in recv */
            MPI_Type_commit(recv_types + j);
            j++;
        }
    }

    /* To avoid a read-modify-write,
     * check if there are holes in the data to be written.
     * For this, merge the (sorted) offset lists others_req using a heap-merge.
     */

    *srt_num = 0;
    for (i = 0; i < nprocs; i++)
        *srt_num += count[i];
    if (*srt_off)
        *srt_off = (ADIO_Offset *) ADIOI_Realloc(*srt_off, (*srt_num + 1) * sizeof(ADIO_Offset));
    else
        *srt_off = (ADIO_Offset *) ADIOI_Malloc((*srt_num + 1) * sizeof(ADIO_Offset));
    if (*srt_len)
        *srt_len = (int *) ADIOI_Realloc(*srt_len, (*srt_num + 1) * sizeof(int));
    else
        *srt_len = (int *) ADIOI_Malloc((*srt_num + 1) * sizeof(int));
    /* +1 to avoid a 0-size malloc */

    ADIOI_Heap_merge(others_req, count, *srt_off, *srt_len, start_pos, nprocs,
                     nprocs_recv, *srt_num);

    /* check if there are any holes */
    *hole = 0;
    for (i = 0; i < *srt_num - 1; i++) {
        if ((*srt_off)[i] + (*srt_len)[i] < (*srt_off)[i + 1]) {
            *hole = 1;
            break;
        }
    }
    /* In some cases (see John Bent ROMIO REQ # 835), an odd interaction
     * between aggregation, nominally contiguous regions, and cb_buffer_size
     * should be handled with a read-modify-write; otherwise we would write out
     * more data than we receive from everyone else (inclusive). So override
     * hole detection.
     */
    if (*hole == 0) {
        sum_recv = 0;
        for (i = 0; i < nprocs; i++)
            sum_recv += recv_size[i];
        if (size > sum_recv)
            *hole = 1;
    }
    /* check the hint for data sieving */
    if (data_sieving == ADIOI_HINT_ENABLE && nprocs_recv && *hole) {
        ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
                        ADIO_EXPLICIT_OFFSET, off, &status, &err);
        // --BEGIN ERROR HANDLING--
        if (err != MPI_SUCCESS) {
            *error_code = MPIO_Err_create_code(err,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_IO, "**ioRMWrdwr", 0);
            ADIOI_Free(recv_types);
            return;
        }
        // --END ERROR HANDLING--
    }

    nprocs_send = 0;
    for (i = 0; i < nprocs; i++)
        if (send_size[i])
            nprocs_send++;

    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) * sizeof(MPI_Request));
        send_req = requests;
    } else {
        requests = (MPI_Request *)
            ADIOI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Request));
        /* +1 to avoid a 0-size malloc */

        /* post receives */
        j = 0;
        for (i = 0; i < nprocs; i++) {
            if (recv_size[i]) {
                MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i, myrank + i + 100 * iter,
                          fd->comm, requests + j);
                j++;
            }
        }
        send_req = requests + nprocs_recv;
    }

    /* post sends.
     * if buftype_is_contig, data can be directly sent from
     * user buf at location given by buf_idx. else use send_buf.
     */
    if (buftype_is_contig) {
        j = 0;
        for (i = 0; i < nprocs; i++)
            if (send_size[i]) {
                ADIOI_Assert(buf_idx[i] != -1);
                MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
                          MPI_BYTE, i, myrank + i + 100 * iter, fd->comm, send_req + j);
                j++;
            }
    } else if (nprocs_send) {
        /* buftype is not contig */
        send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
        for (i = 0; i < nprocs; i++)
            if (send_size[i])
                send_buf[i] = (char *) ADIOI_Malloc(send_size[i]);

        ADIOI_LUSTRE_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list,
                                      len_list, send_size, send_req,
                                      sent_to_proc, nprocs, myrank,
                                      contig_access_count, striping_info,
                                      send_buf_idx, curr_to_proc, done_to_proc,
                                      iter, buftype_extent);
        /* the send is done in ADIOI_Fill_send_buffer */
    }

    /* bug fix from Wei-keng Liao and Kenin Coloma */
    if (fd->atomicity) {
        j = 0;
        for (i = 0; i < nprocs; i++) {
            MPI_Status wkl_status;
            if (recv_size[i]) {
                MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i, myrank + i + 100 * iter,
                         fd->comm, &wkl_status);
                j++;
            }
        }
    }

    for (i = 0; i < nprocs_recv; i++)
        MPI_Type_free(recv_types + i);
    ADIOI_Free(recv_types);

    /* bug fix from Wei-keng Liao and Kenin Coloma */
    /* +1 to avoid a 0-size malloc */
    if (fd->atomicity) {
        statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) * sizeof(MPI_Status));
    } else {
        statuses = (MPI_Status *)
            ADIOI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Status));
    }

#ifdef NEEDS_MPI_TEST
    i = 0;
    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        while (!i)
            MPI_Testall(nprocs_send, send_req, &i, statuses);
    } else {
        while (!i)
            MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses);
    }
#else
    /* bug fix from Wei-keng Liao and Kenin Coloma */
    if (fd->atomicity)
        MPI_Waitall(nprocs_send, send_req, statuses);
    else
        MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses);
#endif
    ADIOI_Free(statuses);
    ADIOI_Free(requests);

    if (!buftype_is_contig && nprocs_send) {
        for (i = 0; i < nprocs; i++)
            if (send_size[i])
                ADIOI_Free(send_buf[i]);
        ADIOI_Free(send_buf);
    }
}
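/*
 * Illustrative sketch (not part of the ROMIO code above): how the recv_types
 * built in the exchange routines work.  MPI_Type_create_hindexed with
 * absolute addresses (obtained via MPI_Get_address) as displacements lets a
 * single receive scatter incoming bytes directly into noncontiguous memory
 * regions; the buffer argument is then MPI_BOTTOM.  The helper name and
 * parameters are hypothetical, error handling is omitted, and the block is
 * guarded out of compilation with #if 0.
 */
#if 0
#include <mpi.h>

static void recv_into_two_regions(char *region_a, int len_a,
                                  char *region_b, int len_b,
                                  int src, int tag, MPI_Comm comm)
{
    int blens[2] = { len_a, len_b };
    MPI_Aint displs[2];
    MPI_Datatype scatter_type;

    /* absolute addresses serve as the displacements */
    MPI_Get_address(region_a, &displs[0]);
    MPI_Get_address(region_b, &displs[1]);

    MPI_Type_create_hindexed(2, blens, displs, MPI_BYTE, &scatter_type);
    MPI_Type_commit(&scatter_type);

    /* MPI_BOTTOM plus absolute displacements places data at region_a/region_b */
    MPI_Recv(MPI_BOTTOM, 1, scatter_type, src, tag, comm, MPI_STATUS_IGNORE);

    MPI_Type_free(&scatter_type);
}
#endif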
static void ADIOI_R_Iexchange_data_recv(ADIOI_NBC_Request *nbc_req, int *error_code)
{
    ADIOI_R_Iexchange_data_vars *vars = nbc_req->data.rd.red_vars;
    ADIO_File fd = vars->fd;
    int *send_size = vars->send_size;
    int *recv_size = vars->recv_size;
    int *count = vars->count;
    int *start_pos = vars->start_pos;
    int *partial_send = vars->partial_send;
    int nprocs = vars->nprocs;
    int myrank = vars->myrank;
    ADIOI_Access *others_req = vars->others_req;
    int iter = vars->iter;
    int *buf_idx = vars->buf_idx;

    int i, j, k = 0, tmp = 0, nprocs_recv, nprocs_send;
    char **recv_buf = NULL;
    MPI_Datatype send_type;

    nprocs_recv = 0;
    for (i = 0; i < nprocs; i++) if (recv_size[i]) nprocs_recv++;
    vars->nprocs_recv = nprocs_recv;

    nprocs_send = 0;
    for (i = 0; i < nprocs; i++) if (send_size[i]) nprocs_send++;
    vars->nprocs_send = nprocs_send;

    vars->req2 = (MPI_Request *)
        ADIOI_Malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request));
    /* +1 to avoid a 0-size malloc */

    /* post recvs. if buftype_is_contig, data can be directly recd. into
       user buf at location given by buf_idx. else use recv_buf. */

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5032, 0, NULL);
#endif

    if (vars->buftype_is_contig) {
        j = 0;
        for (i = 0; i < nprocs; i++)
            if (recv_size[i]) {
                MPI_Irecv(((char *)vars->buf) + buf_idx[i], recv_size[i],
                          MPI_BYTE, i, myrank+i+100*iter, fd->comm,
                          vars->req2 + j);
                j++;
                buf_idx[i] += recv_size[i];
            }
    }
    else {
        /* allocate memory for recv_buf and post receives */
        recv_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char*));
        vars->recv_buf = recv_buf;
        for (i = 0; i < nprocs; i++)
            if (recv_size[i])
                recv_buf[i] = (char *)ADIOI_Malloc(recv_size[i]);

        j = 0;
        for (i = 0; i < nprocs; i++)
            if (recv_size[i]) {
                MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i,
                          myrank+i+100*iter, fd->comm,
                          vars->req2 + j);
                j++;
#ifdef RDCOLL_DEBUG
                DBG_FPRINTF(stderr, "node %d, recv_size %d, tag %d \n",
                            myrank, recv_size[i], myrank+i+100*iter);
#endif
            }
    }

    /* create derived datatypes and send data */

    j = 0;
    for (i = 0; i < nprocs; i++) {
        if (send_size[i]) {
            /* take care if the last off-len pair is a partial send */
            if (partial_send[i]) {
                k = start_pos[i] + count[i] - 1;
                tmp = others_req[i].lens[k];
                others_req[i].lens[k] = partial_send[i];
            }
            ADIOI_Type_create_hindexed_x(count[i],
                                         &(others_req[i].lens[start_pos[i]]),
                                         &(others_req[i].mem_ptrs[start_pos[i]]),
                                         MPI_BYTE, &send_type);
            /* absolute displacement; use MPI_BOTTOM in send */
            MPI_Type_commit(&send_type);
            MPI_Isend(MPI_BOTTOM, 1, send_type, i, myrank+i+100*iter,
                      fd->comm, vars->req2 + nprocs_recv + j);
            MPI_Type_free(&send_type);
            if (partial_send[i]) others_req[i].lens[k] = tmp;
            j++;
        }
    }

    /* wait on the receives */
    if (nprocs_recv) {
        nbc_req->data.rd.state = ADIOI_IRC_STATE_R_IEXCHANGE_DATA_RECV;
        return;
    }

    ADIOI_R_Iexchange_data_fill(nbc_req, error_code);
}
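/*
 * Illustrative sketch (not part of the ROMIO code above): the continuation
 * style used by the nonblocking path.  Instead of blocking in MPI_Waitall,
 * ADIOI_R_Iexchange_data_recv() records its position in a state field and
 * returns; a progress routine later tests the outstanding requests and, once
 * they complete, runs the next step (the fill).  The struct and names below
 * are hypothetical and the block is guarded out of compilation with #if 0.
 */
#if 0
#include <mpi.h>

enum nb_phase { PHASE_RECV_PENDING, PHASE_DONE };

struct nb_state {
    enum nb_phase phase;
    int nreq;
    MPI_Request *reqs;
};

/* called repeatedly by a progress loop; returns 1 once the operation finished */
static int nb_progress(struct nb_state *st)
{
    int done = 0;

    if (st->phase == PHASE_RECV_PENDING) {
        MPI_Testall(st->nreq, st->reqs, &done, MPI_STATUSES_IGNORE);
        if (!done)
            return 0;           /* come back later; never block here */
        /* ... copy received data into the user buffer here ... */
        st->phase = PHASE_DONE;
    }
    return 1;
}
#endif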