Exemple #1
0
/* Sets error_code to MPI_SUCCESS if successful, or creates an error code
 * in the case of error.
 */
static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
                                  ADIOI_Flatlist_node * flat_buf, ADIO_Offset
                                  * offset_list, ADIO_Offset * len_list, int *send_size,
                                  int *recv_size, ADIO_Offset off, int size,
                                  int *count, int *start_pos,
                                  int *partial_recv,
                                  int *sent_to_proc, int nprocs,
                                  int myrank, int
                                  buftype_is_contig, int contig_access_count,
                                  ADIO_Offset min_st_offset,
                                  ADIO_Offset fd_size,
                                  ADIO_Offset * fd_start, ADIO_Offset * fd_end,
                                  ADIOI_Access * others_req,
                                  int *send_buf_idx, int *curr_to_proc,
                                  int *done_to_proc, int *hole, int iter,
                                  MPI_Aint buftype_extent, MPI_Aint * buf_idx, int *error_code)
{
    int i, j, k, *tmp_len, nprocs_recv, nprocs_send, err;
    char **send_buf = NULL;
    MPI_Request *requests, *send_req;
    MPI_Datatype *recv_types;
    MPI_Status *statuses, status;
    int *srt_len = NULL, sum;
    ADIO_Offset *srt_off = NULL;
    static char myname[] = "ADIOI_W_EXCHANGE_DATA";

/* exchange recv_size info so that each process knows how much to
   send to whom. */

    MPI_Alltoall(recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fd->comm);

    /* create derived datatypes for recv */

    nprocs_send = 0;
    nprocs_recv = 0;
    sum = 0;
    for (i = 0; i < nprocs; i++) {
        sum += count[i];
        if (recv_size[i])
            nprocs_recv++;
        if (send_size[i])
            nprocs_send++;
    }

    recv_types = (MPI_Datatype *)
        ADIOI_Malloc((nprocs_recv + 1) * sizeof(MPI_Datatype));
/* +1 to avoid a 0-size malloc */

    tmp_len = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    j = 0;
    for (i = 0; i < nprocs; i++) {
        if (recv_size[i]) {
/* take care if the last off-len pair is a partial recv */
            if (partial_recv[i]) {
                k = start_pos[i] + count[i] - 1;
                tmp_len[i] = others_req[i].lens[k];
                others_req[i].lens[k] = partial_recv[i];
            }
            ADIOI_Type_create_hindexed_x(count[i],
                                         &(others_req[i].lens[start_pos[i]]),
                                         &(others_req[i].mem_ptrs[start_pos[i]]),
                                         MPI_BYTE, recv_types + j);
            /* absolute displacements; use MPI_BOTTOM in recv */
            MPI_Type_commit(recv_types + j);
            j++;
        }
    }

    /* To avoid a read-modify-write, check if there are holes in the
     * data to be written. For this, merge the (sorted) offset lists
     * others_req using a heap-merge. */

    /* valgrind-detcted optimization: if there is no work on this process we do
     * not need to search for holes */
    if (sum) {
        srt_off = (ADIO_Offset *) ADIOI_Malloc(sum * sizeof(ADIO_Offset));
        srt_len = (int *) ADIOI_Malloc(sum * sizeof(int));

        ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos, nprocs, nprocs_recv, sum);
    }

    /* for partial recvs, restore original lengths */
    for (i = 0; i < nprocs; i++)
        if (partial_recv[i]) {
            k = start_pos[i] + count[i] - 1;
            others_req[i].lens[k] = tmp_len[i];
        }
    ADIOI_Free(tmp_len);

    /* check if there are any holes. If yes, must do read-modify-write.
     * holes can be in three places.  'middle' is what you'd expect: the
     * processes are operating on noncontigous data.  But holes can also show
     * up at the beginning or end of the file domain (see John Bent ROMIO REQ
     * #835). Missing these holes would result in us writing more data than
     * recieved by everyone else. */

    *hole = 0;
    if (sum) {
        if (off != srt_off[0])  /* hole at the front */
            *hole = 1;
        else {  /* coalesce the sorted offset-length pairs */
            for (i = 1; i < sum; i++) {
                if (srt_off[i] <= srt_off[0] + srt_len[0]) {
                    /* ok to cast: operating on cb_buffer_size chunks */
                    int new_len = (int) srt_off[i] + srt_len[i] - (int) srt_off[0];
                    if (new_len > srt_len[0])
                        srt_len[0] = new_len;
                } else
                    break;
            }
            if (i < sum || size != srt_len[0])  /* hole in middle or end */
                *hole = 1;
        }

        ADIOI_Free(srt_off);
        ADIOI_Free(srt_len);
    }

    if (nprocs_recv) {
        if (*hole) {
            ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
                            ADIO_EXPLICIT_OFFSET, off, &status, &err);
            /* --BEGIN ERROR HANDLING-- */
            if (err != MPI_SUCCESS) {
                *error_code = MPIO_Err_create_code(err,
                                                   MPIR_ERR_RECOVERABLE, myname,
                                                   __LINE__, MPI_ERR_IO, "**ioRMWrdwr", 0);
                return;
            }
            /* --END ERROR HANDLING-- */
        }
    }

    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        requests = (MPI_Request *)
            ADIOI_Malloc((nprocs_send + 1) * sizeof(MPI_Request));
        send_req = requests;
    } else {
        requests = (MPI_Request *)
            ADIOI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Request));
        /* +1 to avoid a 0-size malloc */

        /* post receives */
        j = 0;
        for (i = 0; i < nprocs; i++) {
            if (recv_size[i]) {
                MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i, myrank + i + 100 * iter,
                          fd->comm, requests + j);
                j++;
            }
        }
        send_req = requests + nprocs_recv;
    }

/* post sends. if buftype_is_contig, data can be directly sent from
   user buf at location given by buf_idx. else use send_buf. */

#ifdef AGGREGATION_PROFILE
    MPE_Log_event(5032, 0, NULL);
#endif
    if (buftype_is_contig) {
        j = 0;
        for (i = 0; i < nprocs; i++)
            if (send_size[i]) {
                MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
                          MPI_BYTE, i, myrank + i + 100 * iter, fd->comm, send_req + j);
                j++;
                buf_idx[i] += send_size[i];
            }
    } else if (nprocs_send) {
        /* buftype is not contig */
        size_t msgLen = 0;
        for (i = 0; i < nprocs; i++)
            msgLen += send_size[i];
        send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
        send_buf[0] = (char *) ADIOI_Malloc(msgLen * sizeof(char));
        for (i = 1; i < nprocs; i++)
            send_buf[i] = send_buf[i - 1] + send_size[i - 1];

        ADIOI_Fill_send_buffer(fd, buf, flat_buf, send_buf,
                               offset_list, len_list, send_size,
                               send_req,
                               sent_to_proc, nprocs, myrank,
                               contig_access_count,
                               min_st_offset, fd_size, fd_start, fd_end,
                               send_buf_idx, curr_to_proc, done_to_proc, iter, buftype_extent);
        /* the send is done in ADIOI_Fill_send_buffer */
    }

    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        j = 0;
        for (i = 0; i < nprocs; i++) {
            MPI_Status wkl_status;
            if (recv_size[i]) {
                MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i, myrank + i + 100 * iter,
                         fd->comm, &wkl_status);
                j++;
            }
        }
    }

    for (i = 0; i < nprocs_recv; i++)
        MPI_Type_free(recv_types + i);
    ADIOI_Free(recv_types);

#ifdef MPI_STATUSES_IGNORE
    statuses = MPI_STATUSES_IGNORE;
#else
    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) * sizeof(MPI_Status));
        /* +1 to avoid a 0-size malloc */
    } else {
        statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) *
                                               sizeof(MPI_Status));
        /* +1 to avoid a 0-size malloc */
    }
#endif

#ifdef NEEDS_MPI_TEST
    i = 0;
    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        while (!i)
            MPI_Testall(nprocs_send, send_req, &i, statuses);
    } else {
        while (!i)
            MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses);
    }
#else
    if (fd->atomicity)
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        MPI_Waitall(nprocs_send, send_req, statuses);
    else
        MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses);
#endif

#ifdef AGGREGATION_PROFILE
    MPE_Log_event(5033, 0, NULL);
#endif
#ifndef MPI_STATUSES_IGNORE
    ADIOI_Free(statuses);
#endif
    ADIOI_Free(requests);
    if (!buftype_is_contig && nprocs_send) {
        ADIOI_Free(send_buf[0]);
        ADIOI_Free(send_buf);
    }
}
Exemple #2
0
/* Sets error_code to MPI_SUCCESS if successful, or creates an error code
 * in the case of error.
 */
static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, const void *buf,
					 char *write_buf,
					 ADIOI_Flatlist_node *flat_buf,
					 ADIO_Offset *offset_list,
					 ADIO_Offset *len_list, int *send_size,
					 int *recv_size, ADIO_Offset off,
					 int size, int *count,
					 int *start_pos, 
					 int *sent_to_proc, int nprocs,
					 int myrank, int buftype_is_contig,
					 int contig_access_count,
					 int *striping_info,
					 ADIOI_Access *others_req,
					 int *send_buf_idx,
					 int *curr_to_proc, int *done_to_proc,
                                         int *hole, int iter,
                                         MPI_Aint buftype_extent,
					 int *buf_idx,
                          ADIO_Offset **srt_off, int **srt_len, int *srt_num,
                          int *error_code)
{
    int i, j, nprocs_recv, nprocs_send, err;
    char **send_buf = NULL;
    MPI_Request *requests, *send_req;
    MPI_Datatype *recv_types;
    MPI_Status *statuses, status;
    int sum_recv;
    int data_sieving = *hole;
    static char myname[] = "ADIOI_W_EXCHANGE_DATA";

    /* create derived datatypes for recv */
    nprocs_recv = 0;
    for (i = 0; i < nprocs; i++)
	if (recv_size[i])
	    nprocs_recv++;

    recv_types = (MPI_Datatype *) ADIOI_Malloc((nprocs_recv + 1) *
					       sizeof(MPI_Datatype));
    /* +1 to avoid a 0-size malloc */

    j = 0;
    for (i = 0; i < nprocs; i++) {
	if (recv_size[i]) {
	    ADIOI_Type_create_hindexed_x(count[i],
			      &(others_req[i].lens[start_pos[i]]),
			      &(others_req[i].mem_ptrs[start_pos[i]]),
			      MPI_BYTE, recv_types + j);
	    /* absolute displacements; use MPI_BOTTOM in recv */
	    MPI_Type_commit(recv_types + j);
	    j++;
	}
    }

    /* To avoid a read-modify-write,
     * check if there are holes in the data to be written.
     * For this, merge the (sorted) offset lists others_req using a heap-merge.
     */

    *srt_num = 0;
    for (i = 0; i < nprocs; i++)
        *srt_num += count[i];
    if (*srt_off)
        *srt_off = (ADIO_Offset *) ADIOI_Realloc(*srt_off, (*srt_num + 1) * sizeof(ADIO_Offset));
    else
        *srt_off = (ADIO_Offset *) ADIOI_Malloc((*srt_num + 1) * sizeof(ADIO_Offset));
    if (*srt_len)
        *srt_len = (int *) ADIOI_Realloc(*srt_len, (*srt_num + 1) * sizeof(int));
    else
        *srt_len = (int *) ADIOI_Malloc((*srt_num + 1) * sizeof(int));
    /* +1 to avoid a 0-size malloc */

    ADIOI_Heap_merge(others_req, count, *srt_off, *srt_len, start_pos,
		     nprocs, nprocs_recv, *srt_num);

    /* check if there are any holes */
    *hole = 0;
    for (i = 0; i < *srt_num - 1; i++) {
        if ((*srt_off)[i] + (*srt_len)[i] < (*srt_off)[i + 1]) {
            *hole = 1;
	    break;
	}
    }
    /* In some cases (see John Bent ROMIO REQ # 835), an odd interaction
     * between aggregation, nominally contiguous regions, and cb_buffer_size
     * should be handled with a read-modify-write (otherwise we will write out
     * more data than we receive from everyone else (inclusive), so override
     * hole detection
     */
    if (*hole == 0) {
        sum_recv = 0;
        for (i = 0; i < nprocs; i++)
            sum_recv += recv_size[i];
	if (size > sum_recv)
	    *hole = 1;
    }
    /* check the hint for data sieving */
    if (data_sieving == ADIOI_HINT_ENABLE && nprocs_recv && *hole) {
        ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
                        ADIO_EXPLICIT_OFFSET, off, &status, &err);
        // --BEGIN ERROR HANDLING--
        if (err != MPI_SUCCESS) {
            *error_code = MPIO_Err_create_code(err,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_IO,
                                               "**ioRMWrdwr", 0);
            ADIOI_Free(recv_types);
            return;
        }
        // --END ERROR HANDLING--
    }

    nprocs_send = 0;
    for (i = 0; i < nprocs; i++)
	if (send_size[i])
	    nprocs_send++;

    if (fd->atomicity) {
	/* bug fix from Wei-keng Liao and Kenin Coloma */
	requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) *
                                                sizeof(MPI_Request));
	send_req = requests;
    } else {
	requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1)*
                                                sizeof(MPI_Request));
	/* +1 to avoid a 0-size malloc */

	/* post receives */
	j = 0;
	for (i = 0; i < nprocs; i++) {
	    if (recv_size[i]) {
		MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i,
			  myrank + i + 100 * iter, fd->comm, requests + j);
		j++;
	    }
	}
	send_req = requests + nprocs_recv;
    }

    /* post sends.
     * if buftype_is_contig, data can be directly sent from
     * user buf at location given by buf_idx. else use send_buf.
     */
    if (buftype_is_contig) {
	j = 0;
	for (i = 0; i < nprocs; i++)
	    if (send_size[i]) {
                ADIOI_Assert(buf_idx[i] != -1);
		MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
			  MPI_BYTE, i, myrank + i + 100 * iter, fd->comm,
			  send_req + j);
		j++;
	    }
    } else
        if (nprocs_send) {
	/* buftype is not contig */
	send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
	for (i = 0; i < nprocs; i++)
	    if (send_size[i])
		send_buf[i] = (char *) ADIOI_Malloc(send_size[i]);

	ADIOI_LUSTRE_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list,
                                      len_list, send_size, send_req,
                                      sent_to_proc, nprocs, myrank,
                                      contig_access_count, striping_info,
                                      send_buf_idx, curr_to_proc, done_to_proc,
                                      iter, buftype_extent);
	/* the send is done in ADIOI_Fill_send_buffer */
    }

	/* bug fix from Wei-keng Liao and Kenin Coloma */
    if (fd->atomicity) {
	j = 0;
	for (i = 0; i < nprocs; i++) {
	    MPI_Status wkl_status;
	    if (recv_size[i]) {
		MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i,
			 myrank + i + 100 * iter, fd->comm, &wkl_status);
		j++;
	    }
	}
    }

    for (i = 0; i < nprocs_recv; i++)
	MPI_Type_free(recv_types + i);
    ADIOI_Free(recv_types);

	/* bug fix from Wei-keng Liao and Kenin Coloma */
	/* +1 to avoid a 0-size malloc */
    if (fd->atomicity) {
	statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) *
					       sizeof(MPI_Status));
    } else {
	statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) *
					       sizeof(MPI_Status));
    }

#ifdef NEEDS_MPI_TEST
    i = 0;
    if (fd->atomicity) {
	/* bug fix from Wei-keng Liao and Kenin Coloma */
	while (!i)
	    MPI_Testall(nprocs_send, send_req, &i, statuses);
    } else {
	while (!i)
	    MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses);
    }
#else
	/* bug fix from Wei-keng Liao and Kenin Coloma */
    if (fd->atomicity)
	MPI_Waitall(nprocs_send, send_req, statuses);
    else
	MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses);
#endif
    ADIOI_Free(statuses);
    ADIOI_Free(requests);
    if (!buftype_is_contig && nprocs_send) {
	for (i = 0; i < nprocs; i++)
	    if (send_size[i])
		ADIOI_Free(send_buf[i]);
	ADIOI_Free(send_buf);
    }
}