/* Sets error_code to MPI_SUCCESS if successful, or creates an error code * in the case of error. */ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf, ADIOI_Flatlist_node * flat_buf, ADIO_Offset * offset_list, ADIO_Offset * len_list, int *send_size, int *recv_size, ADIO_Offset off, int size, int *count, int *start_pos, int *partial_recv, int *sent_to_proc, int nprocs, int myrank, int buftype_is_contig, int contig_access_count, ADIO_Offset min_st_offset, ADIO_Offset fd_size, ADIO_Offset * fd_start, ADIO_Offset * fd_end, ADIOI_Access * others_req, int *send_buf_idx, int *curr_to_proc, int *done_to_proc, int *hole, int iter, MPI_Aint buftype_extent, MPI_Aint * buf_idx, int *error_code) { int i, j, k, *tmp_len, nprocs_recv, nprocs_send, err; char **send_buf = NULL; MPI_Request *requests, *send_req; MPI_Datatype *recv_types; MPI_Status *statuses, status; int *srt_len = NULL, sum; ADIO_Offset *srt_off = NULL; static char myname[] = "ADIOI_W_EXCHANGE_DATA"; /* exchange recv_size info so that each process knows how much to send to whom. */ MPI_Alltoall(recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fd->comm); /* create derived datatypes for recv */ nprocs_send = 0; nprocs_recv = 0; sum = 0; for (i = 0; i < nprocs; i++) { sum += count[i]; if (recv_size[i]) nprocs_recv++; if (send_size[i]) nprocs_send++; } recv_types = (MPI_Datatype *) ADIOI_Malloc((nprocs_recv + 1) * sizeof(MPI_Datatype)); /* +1 to avoid a 0-size malloc */ tmp_len = (int *) ADIOI_Malloc(nprocs * sizeof(int)); j = 0; for (i = 0; i < nprocs; i++) { if (recv_size[i]) { /* take care if the last off-len pair is a partial recv */ if (partial_recv[i]) { k = start_pos[i] + count[i] - 1; tmp_len[i] = others_req[i].lens[k]; others_req[i].lens[k] = partial_recv[i]; } ADIOI_Type_create_hindexed_x(count[i], &(others_req[i].lens[start_pos[i]]), &(others_req[i].mem_ptrs[start_pos[i]]), MPI_BYTE, recv_types + j); /* absolute displacements; use MPI_BOTTOM in recv */ MPI_Type_commit(recv_types + j); j++; } } /* To avoid a read-modify-write, check if there are holes in the * data to be written. For this, merge the (sorted) offset lists * others_req using a heap-merge. */ /* valgrind-detcted optimization: if there is no work on this process we do * not need to search for holes */ if (sum) { srt_off = (ADIO_Offset *) ADIOI_Malloc(sum * sizeof(ADIO_Offset)); srt_len = (int *) ADIOI_Malloc(sum * sizeof(int)); ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos, nprocs, nprocs_recv, sum); } /* for partial recvs, restore original lengths */ for (i = 0; i < nprocs; i++) if (partial_recv[i]) { k = start_pos[i] + count[i] - 1; others_req[i].lens[k] = tmp_len[i]; } ADIOI_Free(tmp_len); /* check if there are any holes. If yes, must do read-modify-write. * holes can be in three places. 'middle' is what you'd expect: the * processes are operating on noncontigous data. But holes can also show * up at the beginning or end of the file domain (see John Bent ROMIO REQ * #835). Missing these holes would result in us writing more data than * recieved by everyone else. */ *hole = 0; if (sum) { if (off != srt_off[0]) /* hole at the front */ *hole = 1; else { /* coalesce the sorted offset-length pairs */ for (i = 1; i < sum; i++) { if (srt_off[i] <= srt_off[0] + srt_len[0]) { /* ok to cast: operating on cb_buffer_size chunks */ int new_len = (int) srt_off[i] + srt_len[i] - (int) srt_off[0]; if (new_len > srt_len[0]) srt_len[0] = new_len; } else break; } if (i < sum || size != srt_len[0]) /* hole in middle or end */ *hole = 1; } ADIOI_Free(srt_off); ADIOI_Free(srt_len); } if (nprocs_recv) { if (*hole) { ADIO_ReadContig(fd, write_buf, size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, off, &status, &err); /* --BEGIN ERROR HANDLING-- */ if (err != MPI_SUCCESS) { *error_code = MPIO_Err_create_code(err, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**ioRMWrdwr", 0); return; } /* --END ERROR HANDLING-- */ } } if (fd->atomicity) { /* bug fix from Wei-keng Liao and Kenin Coloma */ requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) * sizeof(MPI_Request)); send_req = requests; } else { requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Request)); /* +1 to avoid a 0-size malloc */ /* post receives */ j = 0; for (i = 0; i < nprocs; i++) { if (recv_size[i]) { MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i, myrank + i + 100 * iter, fd->comm, requests + j); j++; } } send_req = requests + nprocs_recv; } /* post sends. if buftype_is_contig, data can be directly sent from user buf at location given by buf_idx. else use send_buf. */ #ifdef AGGREGATION_PROFILE MPE_Log_event(5032, 0, NULL); #endif if (buftype_is_contig) { j = 0; for (i = 0; i < nprocs; i++) if (send_size[i]) { MPI_Isend(((char *) buf) + buf_idx[i], send_size[i], MPI_BYTE, i, myrank + i + 100 * iter, fd->comm, send_req + j); j++; buf_idx[i] += send_size[i]; } } else if (nprocs_send) { /* buftype is not contig */ size_t msgLen = 0; for (i = 0; i < nprocs; i++) msgLen += send_size[i]; send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *)); send_buf[0] = (char *) ADIOI_Malloc(msgLen * sizeof(char)); for (i = 1; i < nprocs; i++) send_buf[i] = send_buf[i - 1] + send_size[i - 1]; ADIOI_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list, len_list, send_size, send_req, sent_to_proc, nprocs, myrank, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, send_buf_idx, curr_to_proc, done_to_proc, iter, buftype_extent); /* the send is done in ADIOI_Fill_send_buffer */ } if (fd->atomicity) { /* bug fix from Wei-keng Liao and Kenin Coloma */ j = 0; for (i = 0; i < nprocs; i++) { MPI_Status wkl_status; if (recv_size[i]) { MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i, myrank + i + 100 * iter, fd->comm, &wkl_status); j++; } } } for (i = 0; i < nprocs_recv; i++) MPI_Type_free(recv_types + i); ADIOI_Free(recv_types); #ifdef MPI_STATUSES_IGNORE statuses = MPI_STATUSES_IGNORE; #else if (fd->atomicity) { /* bug fix from Wei-keng Liao and Kenin Coloma */ statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) * sizeof(MPI_Status)); /* +1 to avoid a 0-size malloc */ } else { statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Status)); /* +1 to avoid a 0-size malloc */ } #endif #ifdef NEEDS_MPI_TEST i = 0; if (fd->atomicity) { /* bug fix from Wei-keng Liao and Kenin Coloma */ while (!i) MPI_Testall(nprocs_send, send_req, &i, statuses); } else { while (!i) MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses); } #else if (fd->atomicity) /* bug fix from Wei-keng Liao and Kenin Coloma */ MPI_Waitall(nprocs_send, send_req, statuses); else MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses); #endif #ifdef AGGREGATION_PROFILE MPE_Log_event(5033, 0, NULL); #endif #ifndef MPI_STATUSES_IGNORE ADIOI_Free(statuses); #endif ADIOI_Free(requests); if (!buftype_is_contig && nprocs_send) { ADIOI_Free(send_buf[0]); ADIOI_Free(send_buf); } }
/* Sets error_code to MPI_SUCCESS if successful, or creates an error code * in the case of error. */ static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf, ADIOI_Flatlist_node *flat_buf, ADIO_Offset *offset_list, ADIO_Offset *len_list, int *send_size, int *recv_size, ADIO_Offset off, int size, int *count, int *start_pos, int *sent_to_proc, int nprocs, int myrank, int buftype_is_contig, int contig_access_count, int *striping_info, ADIOI_Access *others_req, int *send_buf_idx, int *curr_to_proc, int *done_to_proc, int *hole, int iter, MPI_Aint buftype_extent, int *buf_idx, ADIO_Offset **srt_off, int **srt_len, int *srt_num, int *error_code) { int i, j, nprocs_recv, nprocs_send, err; char **send_buf = NULL; MPI_Request *requests, *send_req; MPI_Datatype *recv_types; MPI_Status *statuses, status; int sum_recv; int data_sieving = *hole; static char myname[] = "ADIOI_W_EXCHANGE_DATA"; /* create derived datatypes for recv */ nprocs_recv = 0; for (i = 0; i < nprocs; i++) if (recv_size[i]) nprocs_recv++; recv_types = (MPI_Datatype *) ADIOI_Malloc((nprocs_recv + 1) * sizeof(MPI_Datatype)); /* +1 to avoid a 0-size malloc */ j = 0; for (i = 0; i < nprocs; i++) { if (recv_size[i]) { ADIOI_Type_create_hindexed_x(count[i], &(others_req[i].lens[start_pos[i]]), &(others_req[i].mem_ptrs[start_pos[i]]), MPI_BYTE, recv_types + j); /* absolute displacements; use MPI_BOTTOM in recv */ MPI_Type_commit(recv_types + j); j++; } } /* To avoid a read-modify-write, * check if there are holes in the data to be written. * For this, merge the (sorted) offset lists others_req using a heap-merge. */ *srt_num = 0; for (i = 0; i < nprocs; i++) *srt_num += count[i]; if (*srt_off) *srt_off = (ADIO_Offset *) ADIOI_Realloc(*srt_off, (*srt_num + 1) * sizeof(ADIO_Offset)); else *srt_off = (ADIO_Offset *) ADIOI_Malloc((*srt_num + 1) * sizeof(ADIO_Offset)); if (*srt_len) *srt_len = (int *) ADIOI_Realloc(*srt_len, (*srt_num + 1) * sizeof(int)); else *srt_len = (int *) ADIOI_Malloc((*srt_num + 1) * sizeof(int)); /* +1 to avoid a 0-size malloc */ ADIOI_Heap_merge(others_req, count, *srt_off, *srt_len, start_pos, nprocs, nprocs_recv, *srt_num); /* check if there are any holes */ *hole = 0; for (i = 0; i < *srt_num - 1; i++) { if ((*srt_off)[i] + (*srt_len)[i] < (*srt_off)[i + 1]) { *hole = 1; break; } } /* In some cases (see John Bent ROMIO REQ # 835), an odd interaction * between aggregation, nominally contiguous regions, and cb_buffer_size * should be handled with a read-modify-write (otherwise we will write out * more data than we receive from everyone else (inclusive), so override * hole detection */ if (*hole == 0) { sum_recv = 0; for (i = 0; i < nprocs; i++) sum_recv += recv_size[i]; if (size > sum_recv) *hole = 1; } /* check the hint for data sieving */ if (data_sieving == ADIOI_HINT_ENABLE && nprocs_recv && *hole) { ADIO_ReadContig(fd, write_buf, size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, off, &status, &err); // --BEGIN ERROR HANDLING-- if (err != MPI_SUCCESS) { *error_code = MPIO_Err_create_code(err, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**ioRMWrdwr", 0); ADIOI_Free(recv_types); return; } // --END ERROR HANDLING-- } nprocs_send = 0; for (i = 0; i < nprocs; i++) if (send_size[i]) nprocs_send++; if (fd->atomicity) { /* bug fix from Wei-keng Liao and Kenin Coloma */ requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) * sizeof(MPI_Request)); send_req = requests; } else { requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1)* sizeof(MPI_Request)); /* +1 to avoid a 0-size malloc */ /* post receives */ j = 0; for (i = 0; i < nprocs; i++) { if (recv_size[i]) { MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i, myrank + i + 100 * iter, fd->comm, requests + j); j++; } } send_req = requests + nprocs_recv; } /* post sends. * if buftype_is_contig, data can be directly sent from * user buf at location given by buf_idx. else use send_buf. */ if (buftype_is_contig) { j = 0; for (i = 0; i < nprocs; i++) if (send_size[i]) { ADIOI_Assert(buf_idx[i] != -1); MPI_Isend(((char *) buf) + buf_idx[i], send_size[i], MPI_BYTE, i, myrank + i + 100 * iter, fd->comm, send_req + j); j++; } } else if (nprocs_send) { /* buftype is not contig */ send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *)); for (i = 0; i < nprocs; i++) if (send_size[i]) send_buf[i] = (char *) ADIOI_Malloc(send_size[i]); ADIOI_LUSTRE_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list, len_list, send_size, send_req, sent_to_proc, nprocs, myrank, contig_access_count, striping_info, send_buf_idx, curr_to_proc, done_to_proc, iter, buftype_extent); /* the send is done in ADIOI_Fill_send_buffer */ } /* bug fix from Wei-keng Liao and Kenin Coloma */ if (fd->atomicity) { j = 0; for (i = 0; i < nprocs; i++) { MPI_Status wkl_status; if (recv_size[i]) { MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i, myrank + i + 100 * iter, fd->comm, &wkl_status); j++; } } } for (i = 0; i < nprocs_recv; i++) MPI_Type_free(recv_types + i); ADIOI_Free(recv_types); /* bug fix from Wei-keng Liao and Kenin Coloma */ /* +1 to avoid a 0-size malloc */ if (fd->atomicity) { statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) * sizeof(MPI_Status)); } else { statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Status)); } #ifdef NEEDS_MPI_TEST i = 0; if (fd->atomicity) { /* bug fix from Wei-keng Liao and Kenin Coloma */ while (!i) MPI_Testall(nprocs_send, send_req, &i, statuses); } else { while (!i) MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses); } #else /* bug fix from Wei-keng Liao and Kenin Coloma */ if (fd->atomicity) MPI_Waitall(nprocs_send, send_req, statuses); else MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses); #endif ADIOI_Free(statuses); ADIOI_Free(requests); if (!buftype_is_contig && nprocs_send) { for (i = 0; i < nprocs; i++) if (send_size[i]) ADIOI_Free(send_buf[i]); ADIOI_Free(send_buf); } }