Beispiel #1
0
static void ADIOI_Iread_and_exch_l1_end(ADIOI_NBC_Request *nbc_req,
                                        int *error_code)
{
    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
    ADIO_File fd = vars->fd;
    ADIO_Offset size = vars->size;
    ADIO_Offset real_size = vars->real_size;
    ADIO_Offset for_next_iter = vars->for_next_iter;
    char *read_buf = vars->read_buf;
    char *tmp_buf;

    vars->for_curr_iter = for_next_iter;

    if (for_next_iter) {
        tmp_buf = (char *)ADIOI_Malloc(for_next_iter);
        ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+real_size-for_next_iter) == (ADIO_Offset)(MPIR_Upint)(read_buf+real_size-for_next_iter));
        ADIOI_Assert((for_next_iter+vars->coll_bufsize) == (size_t)(for_next_iter+vars->coll_bufsize));
        memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter);
        ADIOI_Free(fd->io_buf);
        fd->io_buf = (char *)ADIOI_Malloc(for_next_iter+vars->coll_bufsize);
        memcpy(fd->io_buf, tmp_buf, for_next_iter);
        vars->read_buf = fd->io_buf;
        ADIOI_Free(tmp_buf);
    }

    vars->off += size;
    vars->done += size;

    /* increment m and go back to the beginning of m loop */
    vars->m++;
    ADIOI_Iread_and_exch_l1_begin(nbc_req, error_code);
}
Beispiel #2
0
/* ADIOI_GEN_IwriteContig
 *
 * This code handles only the case where ROMIO_HAVE_WORKING_AIO is 
 * defined. We post an asynchronous I/O operations using the appropriate aio
 * routines.  Otherwise, the ADIOI_Fns_struct will point to the FAKE
 * version.
 */
void ADIOI_GEN_IwriteContig(ADIO_File fd, const void *buf, int count,
			    MPI_Datatype datatype, int file_ptr_type,
			    ADIO_Offset offset, ADIO_Request *request,
			    int *error_code)
{
    MPI_Count len, typesize;
    int aio_errno = 0;
    static char myname[] = "ADIOI_GEN_IWRITECONTIG";

    MPI_Type_size_x(datatype, &typesize);
    len = count * typesize;
    ADIOI_Assert(len == (int)((ADIO_Offset)count * (ADIO_Offset)typesize)); /* the count is an int parm */

    if (file_ptr_type == ADIO_INDIVIDUAL) offset = fd->fp_ind;
    /* Cast away the const'ness of 'buf' as ADIOI_GEN_aio is used for
     * both read and write calls */
    aio_errno = ADIOI_GEN_aio(fd, (char *) buf, len, offset, 1, request);
    if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind += len;

    fd->fp_sys_posn = -1;

    /* --BEGIN ERROR HANDLING-- */
    if (aio_errno != 0) {
	MPIO_ERR_CREATE_CODE_ERRNO(myname, aio_errno, error_code);
	return;
    }
    /* --END ERROR HANDLING-- */

    *error_code = MPI_SUCCESS;
}
/* Generic implementation of IwriteContig calls the blocking WriteContig
 * immediately.
 */
void ADIOI_FAKE_IwriteContig(ADIO_File fd, const void *buf, int count,
			    MPI_Datatype datatype, int file_ptr_type,
			    ADIO_Offset offset, ADIO_Request *request,
			    int *error_code)
{
    ADIO_Status status;
    MPI_Offset len;
    int typesize;
    MPI_Offset nbytes=0;

    MPI_Type_size(datatype, &typesize);
    len = (MPI_Offset)count * (MPI_Offset)typesize;

    /* Call the blocking function.  It will create an error code
     * if necessary.
     */
    ADIOI_Assert(len == (int) len); /* the count is an int parm */
    ADIO_WriteContig(fd, buf, (int)len, MPI_BYTE, file_ptr_type, offset,
		     &status, error_code);  
    if (*error_code == MPI_SUCCESS) {
	MPI_Type_size(datatype, &typesize);
	nbytes = (MPI_Offset)count*(MPI_Offset)typesize;
    }
    MPIO_Completed_request_create(&fd, nbytes, error_code, request);

}
Beispiel #4
0
void ADIOI_GEN_Resize(ADIO_File fd, ADIO_Offset size, int *error_code)
{
    int err, rank;
    static char myname[] = "ADIOI_GEN_RESIZE";

    MPI_Comm_rank(fd->comm, &rank);

    /* first aggregator performs ftruncate() */
    if (rank == fd->hints->ranklist[0]) {
    ADIOI_Assert(size == (off_t) size);
	err = ftruncate(fd->fd_sys, (off_t)size);
    }

    /* bcast return value */
    MPI_Bcast(&err, 1, MPI_INT, fd->hints->ranklist[0], fd->comm);

    /* --BEGIN ERROR HANDLING-- */
    if (err == -1) {
	*error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
	return;
    }
    /* --END ERROR HANDLING-- */

    *error_code = MPI_SUCCESS;
}
Beispiel #5
0
/* ADIOI_GEN_IreadContig
 *
 * This code handles two distinct cases.  If ROMIO_HAVE_WORKING_AIO is not
 * defined, then I/O is performed in a blocking manner.  Otherwise we post
 * an asynchronous I/O operation using the appropriate aio routines. 
 *
 * In the aio case we rely on ADIOI_GEN_aio(), which is implemented in
 * common/ad_iwrite.c.
 */
void ADIOI_GEN_IreadContig(ADIO_File fd, void *buf, int count, 
			   MPI_Datatype datatype, int file_ptr_type,
			   ADIO_Offset offset, MPI_Request *request,
			   int *error_code)  
{
    MPI_Count len, typesize;
    int aio_errno = 0;
    static char myname[] = "ADIOI_GEN_IREADCONTIG";

    MPI_Type_size_x(datatype, &typesize);
    ADIOI_Assert((count * typesize) == ((ADIO_Offset)(unsigned)count * (ADIO_Offset)typesize));
    len = count * typesize;

    if (file_ptr_type == ADIO_INDIVIDUAL) offset = fd->fp_ind;
    aio_errno = ADIOI_GEN_aio(fd, buf, len, offset, 0, request);
    if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind += len;

    fd->fp_sys_posn = -1;

    /* --BEGIN ERROR HANDLING-- */
    if (aio_errno != 0) {
	MPIO_ERR_CREATE_CODE_ERRNO(myname, aio_errno, error_code);
	return;
    }
    /* --END ERROR HANDLING-- */
    
    *error_code = MPI_SUCCESS;
}
Beispiel #6
0
/* ADIOI_cb_delete_name_array() - attribute destructor
 */
int ADIOI_cb_delete_name_array(MPI_Comm comm, 
			 int keyval, 
			 void *attr_val, 
			 void *extra)
{
    ADIO_cb_name_array array;

    ADIOI_UNREFERENCED_ARG(comm);
    ADIOI_UNREFERENCED_ARG(extra);

    array = (ADIO_cb_name_array) attr_val;
    ADIOI_Assert(array != NULL);
    array->refct--;

    if (array->refct <= 0) {
	/* time to free the structures (names, array of ptrs to names, struct)
	 */
	if (array->namect) {
	    /* Note that array->names[i], where i > 0, 
	     * are just pointers into the allocated region array->names[0]
	     */
	    ADIOI_Free(array->names[0]);
	}
	if (array->names != NULL) ADIOI_Free(array->names);
	ADIOI_Free(array);
    }
    return MPI_SUCCESS;
}
Beispiel #7
0
void *ADIOI_IO_Thread_Func(void *vptr_args) {
    ADIOI_IO_ThreadFuncData *args = (ADIOI_IO_ThreadFuncData*)vptr_args;

    ADIOI_Assert(args->size == (int)(args->size));

    if (args->io_kind == ADIOI_READ) {
	ADIO_ReadContig(args->fd, args->buf, args->size, MPI_BYTE,
		ADIO_EXPLICIT_OFFSET, args->offset,
		args->status, &(args->error_code));
    } else {
	ADIO_WriteContig(args->fd, args->buf, args->size, MPI_BYTE,
		ADIO_EXPLICIT_OFFSET, args->offset,
		args->status, &(args->error_code));
    }
    pthread_exit(&(args->error_code));
    return NULL;
}
Beispiel #8
0
static int type_create_contiguous_x(MPI_Count count,
                                    MPI_Datatype oldtype, MPI_Datatype *newtype)
{
    /* to make 'count' fit MPI-3 type processing routines (which take integer
     * counts), we construct a type consisting of N INT_MAX chunks followed by
     * a remainder.  e.g for a count of 4000000000 bytes you would end up with
     * one 2147483647-byte chunk followed immediately by a 1852516353-byte
     * chunk */
    MPI_Datatype chunks, remainder;
    MPI_Aint lb, extent, disps[2];
    int blocklens[2];
    MPI_Datatype types[2];

    /* truly stupendously large counts will overflow an integer with this math,
     * but that is a problem for a few decades from now.  Sorry, few decades
     * from now! */
    ADIOI_Assert(count/INT_MAX == (int)(count/INT_MAX));
    int c = (int)(count/INT_MAX); /* OK to cast until 'count' is 256 bits */
    int r = count%INT_MAX;

    MPI_Type_vector(c, INT_MAX, INT_MAX, oldtype, &chunks);
    MPI_Type_contiguous(r, oldtype, &remainder);

    MPI_Type_get_extent(oldtype, &lb, &extent);

    blocklens[0] = 1;
    blocklens[1] = 1;
    disps[0]     = 0;
    disps[1]     = c*extent*INT_MAX;
    types[0]     = chunks;
    types[1]     = remainder;

    MPI_Type_create_struct(2, blocklens, disps, types, newtype);

    MPI_Type_free(&chunks);
    MPI_Type_free(&remainder);

    return MPI_SUCCESS;
}
Beispiel #9
0
/* If successful, error_code is set to MPI_SUCCESS.  Otherwise an error
 * code is created and returned in error_code.
 */
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, const void *buf,
					MPI_Datatype datatype, int nprocs,
					int myrank, ADIOI_Access *others_req,
                                        ADIOI_Access *my_req,
					ADIO_Offset *offset_list,
                                        ADIO_Offset *len_list, 
					int contig_access_count,
                                        int *striping_info, int **buf_idx,
                                        int *error_code)
{
    /* Send data to appropriate processes and write in sizes of no more
     * than lustre stripe_size.
     * The idea is to reduce the amount of extra memory required for
     * collective I/O. If all data were written all at once, which is much
     * easier, it would require temp space more than the size of user_buf,
     * which is often unacceptable. For example, to write a distributed
     * array to a file, where each local array is 8Mbytes, requiring
     * at least another 8Mbytes of temp space is unacceptable.
     */

    int hole, i, j, m, flag, ntimes = 1 , max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
    ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
    ADIO_Offset max_size, step_size = 0;
    int real_size, req_len, send_len;
    int *recv_curr_offlen_ptr, *recv_count, *recv_size;
    int *send_curr_offlen_ptr, *send_size;
    int *sent_to_proc, *recv_start_pos;
    int *send_buf_idx, *curr_to_proc, *done_to_proc;
    int *this_buf_idx;
    char *write_buf = NULL;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf = NULL;
    MPI_Aint buftype_extent;
    int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2];
    int data_sieving = 0;
    ADIO_Offset *srt_off = NULL;
    int *srt_len = NULL;
    int srt_num = 0;
    ADIO_Offset block_offset;
    int block_len;

    *error_code = MPI_SUCCESS;	/* changed below if error */
    /* only I/O errors are currently reported */

    /* calculate the number of writes of stripe size to be done.
     * That gives the no. of communication phases as well.
     * Note:
     *   Because we redistribute data in stripe-contiguous pattern for Lustre,
     *   each process has the same no. of communication phases.
     */

    for (i = 0; i < nprocs; i++) {
	if (others_req[i].count) {
	    st_loc = others_req[i].offsets[0];
	    end_loc = others_req[i].offsets[0];
	    break;
	}
    }
    for (i = 0; i < nprocs; i++) {
	for (j = 0; j < others_req[i].count; j++) {
	    st_loc = MPL_MIN(st_loc, others_req[i].offsets[j]);
	    end_loc = MPL_MAX(end_loc, (others_req[i].offsets[j] +
                                          others_req[i].lens[j] - 1));
	}
    }
    /* this process does no writing. */
    if ((st_loc == -1) && (end_loc == -1))
	ntimes = 0;
    MPI_Allreduce(&end_loc, &max_end_loc, 1, MPI_LONG_LONG_INT, MPI_MAX, fd->comm);
    /* avoid min_st_loc be -1 */
    if (st_loc == -1)
        st_loc = max_end_loc;
    MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm);
    /* align downward */
    min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size;

    /* Each time, only avail_cb_nodes number of IO clients perform IO,
     * so, step_size=avail_cb_nodes*stripe_size IO will be performed at most,
     * and ntimes=whole_file_portion/step_size
     */
    step_size = (ADIO_Offset) avail_cb_nodes * stripe_size;
    max_ntimes = (max_end_loc - min_st_loc + 1) / step_size
        + (((max_end_loc - min_st_loc + 1) % step_size) ? 1 : 0);
/*     max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1); */
    if (ntimes)
	write_buf = (char *) ADIOI_Malloc(stripe_size);

    /* calculate the start offset for each iteration */
    off_list = (ADIO_Offset *) ADIOI_Malloc(max_ntimes * sizeof(ADIO_Offset));
    for (m = 0; m < max_ntimes; m ++)
        off_list[m] = max_end_loc;
    for (i = 0; i < nprocs; i++) {
        for (j = 0; j < others_req[i].count; j ++) {
            req_off = others_req[i].offsets[j];
            m = (int)((req_off - min_st_loc) / step_size);
            off_list[m] = MPL_MIN(off_list[m], req_off);
        }
    }

    recv_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    send_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* their use is explained below. calloc initializes to 0. */

    recv_count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be sent to each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration. */

    sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data sent to each proc so far. Used in
       ADIOI_Fill_send_buffer. initialized to 0 here. */

    send_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    curr_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    done_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* Above three are used in ADIOI_Fill_send_buffer */

    this_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));

    recv_start_pos = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* used to store the starting value of recv_curr_offlen_ptr[i] in
       this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
	flat_buf = ADIOI_Flatten_and_find(datatype);
    }
    MPI_Type_extent(datatype, &buftype_extent);
    /* I need to check if there are any outstanding nonblocking writes to
     * the file, which could potentially interfere with the writes taking
     * place in this collective write call. Since this is not likely to be
     * common, let me do the simplest thing possible here: Each process
     * completes all pending nonblocking operations before completing.
     */
    /*ADIOI_Complete_async(error_code);
    if (*error_code != MPI_SUCCESS) return;
    MPI_Barrier(fd->comm);
    */

    iter_st_off = min_st_loc;

    /* Although we have recognized the data according to OST index,
     * a read-modify-write will be done if there is a hole between the data.
     * For example: if blocksize=60, xfersize=30 and stripe_size=100,
     * then rank0 will collect data [0, 30] and [60, 90] then write. There
     * is a hole in [30, 60], which will cause a read-modify-write in [0, 90].
     *
     * To reduce its impact on the performance, we can disable data sieving
     * by hint "ds_in_coll".
     */
    /* check the hint for data sieving */
    data_sieving = fd->hints->fs_hints.lustre.ds_in_coll;

    for (m = 0; m < max_ntimes; m++) {
	/* go through all others_req and my_req to check which will be received
         * and sent in this iteration.
         */

	/* Note that MPI guarantees that displacements in filetypes are in
	   monotonically nondecreasing order and that, for writes, the
	   filetypes cannot specify overlapping regions in the file. This
	   simplifies implementation a bit compared to reads. */

	/*
           off         = start offset in the file for the data to be written in
                         this iteration
           iter_st_off = start offset of this iteration
           real_size   = size of data written (bytes) corresponding to off
           max_size    = possible maximum size of data written in this iteration
           req_off     = offset in the file for a particular contiguous request minus
                         what was satisfied in previous iteration
           send_off    = offset the request needed by other processes in this iteration
           req_len     = size corresponding to req_off
           send_len    = size corresponding to send_off
         */

	/* first calculate what should be communicated */
	for (i = 0; i < nprocs; i++)
	    recv_count[i] = recv_size[i] = send_size[i] = 0;

        off = off_list[m];
        max_size = MPL_MIN(step_size, max_end_loc - iter_st_off + 1);
        real_size = (int) MPL_MIN((off / stripe_size + 1) * stripe_size -
                                    off,
                                    end_loc - off + 1);

	for (i = 0; i < nprocs; i++) {
            if (my_req[i].count) {
                this_buf_idx[i] = buf_idx[i][send_curr_offlen_ptr[i]];
                for (j = send_curr_offlen_ptr[i]; j < my_req[i].count; j++) {
                    send_off = my_req[i].offsets[j];
                    send_len = my_req[i].lens[j];
                    if (send_off < iter_st_off + max_size) {
                        send_size[i] += send_len;
                    } else {
                        break;
                    }
                }
                send_curr_offlen_ptr[i] = j;
            }
	    if (others_req[i].count) {
		recv_start_pos[i] = recv_curr_offlen_ptr[i];
		for (j = recv_curr_offlen_ptr[i]; j < others_req[i].count; j++) {
                    req_off = others_req[i].offsets[j];
                    req_len = others_req[i].lens[j];
		    if (req_off < iter_st_off + max_size) {
			recv_count[i]++;
                        ADIOI_Assert((((ADIO_Offset)(MPIU_Upint)write_buf)+req_off-off) == (ADIO_Offset)(MPIU_Upint)(write_buf+req_off-off));
			MPI_Address(write_buf + req_off - off,
				    &(others_req[i].mem_ptrs[j]));
                        recv_size[i] += req_len;
		    } else {
			break;
                    }
		}
		recv_curr_offlen_ptr[i] = j;
	    }
	}
        /* use variable "hole" to pass data_sieving flag into W_Exchange_data */
        hole = data_sieving;
	ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                                     len_list, send_size, recv_size, off, real_size,
                                     recv_count, recv_start_pos,
                                     sent_to_proc, nprocs, myrank,
                                     buftype_is_contig, contig_access_count,
                                     striping_info, others_req, send_buf_idx,
                                     curr_to_proc, done_to_proc, &hole, m,
                                  buftype_extent, this_buf_idx,
                                  &srt_off, &srt_len, &srt_num, error_code);

	if (*error_code != MPI_SUCCESS)
            goto over;

	flag = 0;
	for (i = 0; i < nprocs; i++)
	    if (recv_count[i]) {
		flag = 1;
		break;
	    }
	if (flag) {
            /* check whether to do data sieving */
            if(data_sieving == ADIOI_HINT_ENABLE) {
	        ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
			         ADIO_EXPLICIT_OFFSET, off, &status,
			         error_code);
            } else {
                /* if there is no hole, write data in one time;
                 * otherwise, write data in several times */
                if (!hole) {
                    ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, off, &status,
                                     error_code);
                } else {
                    block_offset = -1;
                    block_len = 0;
                    for (i = 0; i < srt_num; ++i) {
                        if (srt_off[i] < off + real_size &&
                            srt_off[i] >= off) {
                            if (block_offset == -1) {
                                block_offset = srt_off[i];
                                block_len = srt_len[i];
                            } else {
                                if (srt_off[i] == block_offset + block_len) {
                                    block_len += srt_len[i];
                                } else {
                                    ADIO_WriteContig(fd,
                                                     write_buf + block_offset - off,
                                                     block_len,
                                                     MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                                                     block_offset, &status,
                                                     error_code);
	                            if (*error_code != MPI_SUCCESS)
		                        goto over;
                                    block_offset = srt_off[i];
                                    block_len = srt_len[i];
                                }
                            }
                        }
                    }
                    if (block_offset != -1) {
                        ADIO_WriteContig(fd,
                                         write_buf + block_offset - off,
                                         block_len,
                                         MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                                         block_offset, &status,
                                         error_code);
                        if (*error_code != MPI_SUCCESS)
                            goto over;
                    }
                }
            }
	    if (*error_code != MPI_SUCCESS)
		goto over;
	}
        iter_st_off += max_size;
    }
over:
    if (srt_off)
        ADIOI_Free(srt_off);
    if (srt_len)
        ADIOI_Free(srt_len);
    if (ntimes)
	ADIOI_Free(write_buf);
    ADIOI_Free(recv_curr_offlen_ptr);
    ADIOI_Free(send_curr_offlen_ptr);
    ADIOI_Free(recv_count);
    ADIOI_Free(send_size);
    ADIOI_Free(recv_size);
    ADIOI_Free(sent_to_proc);
    ADIOI_Free(recv_start_pos);
    ADIOI_Free(send_buf_idx);
    ADIOI_Free(curr_to_proc);
    ADIOI_Free(done_to_proc);
    ADIOI_Free(this_buf_idx);
    ADIOI_Free(off_list);
}
Beispiel #10
0
void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
				   *flat_buf, char **recv_buf, ADIO_Offset 
				   *offset_list, ADIO_Offset *len_list, 
				   unsigned *recv_size, 
				   MPI_Request *requests, MPI_Status *statuses,
				   int *recd_from_proc, int nprocs,
				   int contig_access_count, 
				   ADIO_Offset min_st_offset, 
				   ADIO_Offset fd_size, ADIO_Offset *fd_start, 
				   ADIO_Offset *fd_end,
				   MPI_Aint buftype_extent)
{

/* this function is only called if buftype is not contig */

    int i, p, flat_buf_idx;
    ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size;
    int n_buftypes;
    ADIO_Offset off, len, rem_len, user_buf_idx;
    /* Not sure unsigned is necessary, but it makes the math safer */
    unsigned *curr_from_proc, *done_from_proc, *recv_buf_idx;

    ADIOI_UNREFERENCED_ARG(requests);
    ADIOI_UNREFERENCED_ARG(statuses);

/*  curr_from_proc[p] = amount of data recd from proc. p that has already
                        been accounted for so far
    done_from_proc[p] = amount of data already recd from proc. p and 
                        filled into user buffer in previous iterations
    user_buf_idx = current location in user buffer 
    recv_buf_idx[p] = current location in recv_buf of proc. p  */
    curr_from_proc = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned));
    done_from_proc = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned));
    recv_buf_idx   = (unsigned *) ADIOI_Malloc(nprocs * sizeof(unsigned));

    for (i=0; i < nprocs; i++) {
	recv_buf_idx[i] = curr_from_proc[i] = 0;
	done_from_proc[i] = recd_from_proc[i];
    }

    user_buf_idx = flat_buf->indices[0];
    flat_buf_idx = 0;
    n_buftypes = 0;
    flat_buf_sz = flat_buf->blocklens[0];

    /* flat_buf_idx = current index into flattened buftype
       flat_buf_sz = size of current contiguous component in 
                flattened buf */

    for (i=0; i<contig_access_count; i++) { 
	off     = offset_list[i];
	rem_len = len_list[i];

	/* this request may span the file domains of more than one process */
	while (rem_len != 0) {
	    len = rem_len;
	    /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no
	     * longer than the single region that processor "p" is responsible
	     * for.
	     */
	    p = ADIOI_Calc_aggregator(fd,
				      off,
				      min_st_offset,
				      &len,
				      fd_size,
				      fd_start,
				      fd_end);

	    if (recv_buf_idx[p] < recv_size[p]) {
		if (curr_from_proc[p]+len > done_from_proc[p]) {
		    if (done_from_proc[p] > curr_from_proc[p]) {
			size = ADIOI_MIN(curr_from_proc[p] + len - 
			      done_from_proc[p], recv_size[p]-recv_buf_idx[p]);
			buf_incr = done_from_proc[p] - curr_from_proc[p];
			ADIOI_BUF_INCR
			buf_incr = curr_from_proc[p]+len-done_from_proc[p];
      ADIOI_Assert((done_from_proc[p] + size) == (unsigned)((ADIO_Offset)done_from_proc[p] + size));
			curr_from_proc[p] = done_from_proc[p] + size;
			ADIOI_BUF_COPY
		    }
		    else {
			size = ADIOI_MIN(len,recv_size[p]-recv_buf_idx[p]);
			buf_incr = len;
      ADIOI_Assert((curr_from_proc[p] + size) == (unsigned)((ADIO_Offset)curr_from_proc[p] + size));
			curr_from_proc[p] += (unsigned) size;
			ADIOI_BUF_COPY
		    }
		}
		else {
        ADIOI_Assert((curr_from_proc[p] + len) == (unsigned)((ADIO_Offset)curr_from_proc[p] + len));
		    curr_from_proc[p] += (unsigned) len;
		    buf_incr = len;
		    ADIOI_BUF_INCR
		}
	    }
Beispiel #11
0
static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
			 datatype, int nprocs,
			 int myrank, ADIOI_Access
			 *others_req, ADIO_Offset *offset_list,
			 ADIO_Offset *len_list, int contig_access_count, ADIO_Offset
                         min_st_offset, ADIO_Offset fd_size,
			 ADIO_Offset *fd_start, ADIO_Offset *fd_end,
                         int *buf_idx, int *error_code)
{
/* Read in sizes of no more than coll_bufsize, an info parameter.
   Send data to appropriate processes. 
   Place recd. data in user buf.
   The idea is to reduce the amount of extra memory required for
   collective I/O. If all data were read all at once, which is much
   easier, it would require temp space more than the size of user_buf,
   which is often unacceptable. For example, to read a distributed
   array from a file, where each local array is 8Mbytes, requiring
   at least another 8Mbytes of temp space is unacceptable. */

    int i, j, m, ntimes, max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc=-1, end_loc=-1, off, done, real_off, req_off;
    char *read_buf = NULL, *tmp_buf;
    int *curr_offlen_ptr, *count, *send_size, *recv_size;
    int *partial_send, *recd_from_proc, *start_pos;
    /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets*/
    ADIO_Offset real_size, size, for_curr_iter, for_next_iter;
    int req_len, flag, rank;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf=NULL;
    MPI_Aint buftype_extent;
    int coll_bufsize;

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */
    
/* calculate the number of reads of size coll_bufsize
   to be done by each process and the max among all processes.
   That gives the no. of communication phases as well.
   coll_bufsize is obtained from the hints object. */

    coll_bufsize = fd->hints->cb_buffer_size;

    /* grab some initial values for st_loc and end_loc */
    for (i=0; i < nprocs; i++) {
	if (others_req[i].count) {
	    st_loc = others_req[i].offsets[0];
	    end_loc = others_req[i].offsets[0];
	    break;
	}
    }

    /* now find the real values */
    for (i=0; i < nprocs; i++)
	for (j=0; j<others_req[i].count; j++) {
	    st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
	    end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j]
					  + others_req[i].lens[j] - 1));
	}

    /* calculate ntimes, the number of times this process must perform I/O
     * operations in order to complete all the requests it has received.
     * the need for multiple I/O operations comes from the restriction that
     * we only use coll_bufsize bytes of memory for internal buffering.
     */
    if ((st_loc==-1) && (end_loc==-1)) {
	/* this process does no I/O. */
	ntimes = 0;
    }
    else {
	/* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/
	ntimes = (int) ((end_loc - st_loc + coll_bufsize)/coll_bufsize);
    }

    MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm); 

    read_buf = fd->io_buf;  /* Allocated at open time */

    curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); 
    /* its use is explained below. calloc initializes to 0. */

    count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    partial_send = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* if only a portion of the last off-len pair is sent to a process 
       in a particular iteration, the length sent is stored here.
       calloc initializes to 0. */

    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be sent to each proc. in an iteration */

    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    recd_from_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data recd. so far from each proc. Used in
       ADIOI_Fill_user_buffer. initialized to 0 here. */

    start_pos = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* used to store the starting value of curr_offlen_ptr[i] in 
       this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
	ADIOI_Flatten_datatype(datatype);
	flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;
    }
    MPI_Type_extent(datatype, &buftype_extent);

    done = 0;
    off = st_loc;
    for_curr_iter = for_next_iter = 0;

    MPI_Comm_rank(fd->comm, &rank);

    for (m=0; m<ntimes; m++) {
       /* read buf of size coll_bufsize (or less) */
       /* go through all others_req and check if any are satisfied
          by the current read */

       /* since MPI guarantees that displacements in filetypes are in 
          monotonically nondecreasing order, I can maintain a pointer
	  (curr_offlen_ptr) to 
          current off-len pair for each process in others_req and scan
          further only from there. There is still a problem of filetypes
          such as:  (1, 2, 3 are not process nos. They are just numbers for
          three chunks of data, specified by a filetype.)

                   1  -------!--
                   2    -----!----
                   3       --!-----

          where ! indicates where the current read_size limitation cuts 
          through the filetype.  I resolve this by reading up to !, but
          filling the communication buffer only for 1. I copy the portion
          left over for 2 into a tmp_buf for use in the next
	  iteration. i.e., 2 and 3 will be satisfied in the next
	  iteration. This simplifies filling in the user's buf at the
	  other end, as only one off-len pair with incomplete data
	  will be sent. I also don't need to send the individual
	  offsets and lens along with the data, as the data is being
	  sent in a particular order. */ 

          /* off = start offset in the file for the data actually read in 
                   this iteration 
             size = size of data read corresponding to off
             real_off = off minus whatever data was retained in memory from
                  previous iteration for cases like 2, 3 illustrated above
             real_size = size plus the extra corresponding to real_off
             req_off = off in file for a particular contiguous request 
                       minus what was satisfied in previous iteration
             req_size = size corresponding to req_off */

	size = ADIOI_MIN((unsigned)coll_bufsize, end_loc-st_loc+1-done); 
	real_off = off - for_curr_iter;
	real_size = size + for_curr_iter;

	for (i=0; i<nprocs; i++) count[i] = send_size[i] = 0;
	for_next_iter = 0;

	for (i=0; i<nprocs; i++) {
#ifdef RDCOLL_DEBUG
	    DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n", rank, i, others_req[i].count); 
#endif
	    if (others_req[i].count) {
		start_pos[i] = curr_offlen_ptr[i];
		for (j=curr_offlen_ptr[i]; j<others_req[i].count;
		     j++) {
		    if (partial_send[i]) {
			/* this request may have been partially
			   satisfied in the previous iteration. */
			req_off = others_req[i].offsets[j] +
			    partial_send[i]; 
                        req_len = others_req[i].lens[j] -
			    partial_send[i];
			partial_send[i] = 0;
			/* modify the off-len pair to reflect this change */
			others_req[i].offsets[j] = req_off;
			others_req[i].lens[j] = req_len;
		    }
		    else {
			req_off = others_req[i].offsets[j];
                        req_len = others_req[i].lens[j];
		    }
		    if (req_off < real_off + real_size) {
			count[i]++;
      ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+req_off-real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf+req_off-real_off));
			MPI_Address(read_buf+req_off-real_off, 
                               &(others_req[i].mem_ptrs[j]));
      ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));
			send_size[i] += (int)(ADIOI_MIN(real_off + real_size - req_off, 
                                      (ADIO_Offset)(unsigned)req_len)); 

			if (real_off+real_size-req_off < (ADIO_Offset)(unsigned)req_len) {
			    partial_send[i] = (int) (real_off + real_size - req_off);
			    if ((j+1 < others_req[i].count) && 
                                 (others_req[i].offsets[j+1] < 
                                     real_off+real_size)) { 
				/* this is the case illustrated in the
				   figure above. */
				for_next_iter = ADIOI_MAX(for_next_iter,
					  real_off + real_size - others_req[i].offsets[j+1]); 
				/* max because it must cover requests 
				   from different processes */
			    }
			    break;
			}
		    }
		    else break;
		}
		curr_offlen_ptr[i] = j;
	    }
	}

	flag = 0;
	for (i=0; i<nprocs; i++)
	    if (count[i]) flag = 1;

	if (flag) {
      ADIOI_Assert(size == (int)size);
	    ADIO_ReadContig(fd, read_buf+for_curr_iter, (int)size, MPI_BYTE,
			    ADIO_EXPLICIT_OFFSET, off, &status, error_code);
	    if (*error_code != MPI_SUCCESS) return;
	}
	
	for_curr_iter = for_next_iter;
	
	ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
			    send_size, recv_size, count, 
       			    start_pos, partial_send, recd_from_proc, nprocs,
			    myrank, 
			    buftype_is_contig, contig_access_count,
			    min_st_offset, fd_size, fd_start, fd_end,
			    others_req, 
                            m, buftype_extent, buf_idx); 


	if (for_next_iter) {
	    tmp_buf = (char *) ADIOI_Malloc(for_next_iter);
      ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+real_size-for_next_iter) == (ADIO_Offset)(MPIR_Upint)(read_buf+real_size-for_next_iter));
      ADIOI_Assert((for_next_iter+coll_bufsize) == (size_t)(for_next_iter+coll_bufsize));
	    memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter);
	    ADIOI_Free(fd->io_buf);
	    fd->io_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
	    memcpy(fd->io_buf, tmp_buf, for_next_iter);
	    read_buf = fd->io_buf;
	    ADIOI_Free(tmp_buf);
	}

	off += size;
	done += size;
    }

    for (i=0; i<nprocs; i++) count[i] = send_size[i] = 0;
    for (m=ntimes; m<max_ntimes; m++) 
/* nothing to send, but check for recv. */
	ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
			    send_size, recv_size, count, 
			    start_pos, partial_send, recd_from_proc, nprocs,
			    myrank, 
			    buftype_is_contig, contig_access_count,
			    min_st_offset, fd_size, fd_start, fd_end,
			    others_req, m,
                            buftype_extent, buf_idx); 

    ADIOI_Free(curr_offlen_ptr);
    ADIOI_Free(count);
    ADIOI_Free(partial_send);
    ADIOI_Free(send_size);
    ADIOI_Free(recv_size);
    ADIOI_Free(recd_from_proc);
    ADIOI_Free(start_pos);
}
Beispiel #12
0
void ADIOI_GPFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
/* if fd->info is null, create a new info object.
   Initialize fd->info to default values.
   Initialize fd->hints to default values.
   Examine the info object passed by the user. If it contains values that
   ROMIO understands, override the default. */

    MPI_Info info;
    char *value;
    int flag, intval, nprocs=0, nprocs_is_valid = 0;
    static char myname[] = "ADIOI_GPFS_SETINFO";

    int did_anything = 0;

    if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info));
    info = fd->info;

    /* Note that fd->hints is allocated at file open time; thus it is
     * not necessary to allocate it, or check for allocation, here.
     */

    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    ADIOI_Assert ((value != NULL));

    /* initialize info and hints to default values if they haven't been
     * previously initialized
     */
    if (!fd->hints->initialized) {

	ad_get_env_vars();
	ad_gpfs_get_env_vars();
	did_anything = 1;

	/* buffer size for collective I/O */
	ADIOI_Info_set(info, "cb_buffer_size", ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);
	fd->hints->cb_buffer_size = atoi(ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);

	/* default is to let romio automatically decide when to use
	 * collective buffering
	 */
	ADIOI_Info_set(info, "romio_cb_read", "enable");
	fd->hints->cb_read = ADIOI_HINT_ENABLE;
	ADIOI_Info_set(info, "romio_cb_write", "enable");
	fd->hints->cb_write = ADIOI_HINT_ENABLE;

   	if ( fd->hints->cb_config_list != NULL ) ADIOI_Free (fd->hints->cb_config_list);
	fd->hints->cb_config_list = NULL;

	/* number of processes that perform I/O in collective I/O */
	MPI_Comm_size(fd->comm, &nprocs);
	nprocs_is_valid = 1;
	MPL_snprintf(value, MPI_MAX_INFO_VAL+1, "%d", nprocs);
	ADIOI_Info_set(info, "cb_nodes", value);
	fd->hints->cb_nodes = -1;

	/* hint indicating that no indep. I/O will be performed on this file */
	ADIOI_Info_set(info, "romio_no_indep_rw", "false");
	fd->hints->no_indep_rw = 0;

	/* gpfs is not implementing file realms (ADIOI_IOStridedColl),
	   initialize to disabled it. 	   */
	/* hint instructing the use of persistent file realms */
	ADIOI_Info_set(info, "romio_cb_pfr", "disable");
	fd->hints->cb_pfr = ADIOI_HINT_DISABLE;

	/* hint guiding the assignment of persistent file realms */
	ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
	fd->hints->cb_fr_type = ADIOI_FR_AAR;

	/* hint to align file realms with a certain byte value */
	ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
	fd->hints->cb_fr_alignment = 1;

	/* hint to set a threshold percentage for a datatype's size/extent at
	 * which data sieving should be done in collective I/O */
	ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
	fd->hints->cb_ds_threshold = 0;

	/* hint to switch between point-to-point or all-to-all for two-phase */
	ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
	fd->hints->cb_alltoall = ADIOI_HINT_AUTO;

	 /* deferred_open derived from no_indep_rw and cb_{read,write} */
	fd->hints->deferred_open = 0;

	/* buffer size for data sieving in independent reads */
	ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);
	fd->hints->ind_rd_buffer_size = atoi(ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);

	/* buffer size for data sieving in independent writes */
	ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);
	fd->hints->ind_wr_buffer_size = atoi(ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);


    ADIOI_Info_set(info, "romio_ds_read", "automatic");
    fd->hints->ds_read = ADIOI_HINT_AUTO;
    ADIOI_Info_set(info, "romio_ds_write", "automatic");
    fd->hints->ds_write = ADIOI_HINT_AUTO;

    /* still to do: tune this a bit for a variety of file systems. there's
	 * no good default value so just leave it unset */
    fd->hints->min_fdomain_size = 0;
    fd->hints->striping_unit = 0;

    fd->hints->initialized = 1;
    }

    /* add in user's info if supplied */
    if (users_info != MPI_INFO_NULL) {
	ADIOI_Info_check_and_install_int(fd, users_info, "cb_buffer_size",
		&(fd->hints->cb_buffer_size), myname, error_code);
	/* new hints for enabling/disabling coll. buffering on
	 * reads/writes
	 */
	ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_read",
		&(fd->hints->cb_read), myname, error_code);
	if (fd->hints->cb_read == ADIOI_HINT_DISABLE) {
	    /* romio_cb_read overrides no_indep_rw */
	    ADIOI_Info_set(info, "romio_no_indep_rw", "false");
	    fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
	}
	ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_write",
		&(fd->hints->cb_write), myname, error_code);
	if (fd->hints->cb_write == ADIOI_HINT_DISABLE) {
	    /* romio_cb_write overrides no_indep_rw */
	    ADIOI_Info_set(info, "romio_no_indep_rw", "false");
	    fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
	}
	/* Has the user indicated all I/O will be done collectively? */
	ADIOI_Info_check_and_install_true(fd, users_info, "romio_no_indep_rw",
		&(fd->hints->no_indep_rw), myname, error_code);
	if (fd->hints->no_indep_rw == 1) {
	    /* if 'no_indep_rw' set, also hint that we will do
	     * collective buffering: if we aren't doing independent io,
	     * then we have to do collective  */
	    ADIOI_Info_set(info, "romio_cb_write", "enable");
	    ADIOI_Info_set(info, "romio_cb_read", "enable");
	    fd->hints->cb_read = 1;
	    fd->hints->cb_write = 1;
	}

	/* new hints for enabling/disabling data sieving on
	 * reads/writes
	 */
	ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_read",
		&(fd->hints->ds_read), myname, error_code);
	ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_write",
		&(fd->hints->ds_write), myname, error_code);

	ADIOI_Info_check_and_install_int(fd, users_info, "ind_wr_buffer_size",
		&(fd->hints->ind_wr_buffer_size), myname, error_code);
	ADIOI_Info_check_and_install_int(fd, users_info, "ind_rd_buffer_size",
		&(fd->hints->ind_rd_buffer_size), myname, error_code);

	memset( value, 0, MPI_MAX_INFO_VAL+1 );
	ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL,
			value, &flag);
	if ( flag && ((intval = atoi(value)) > 0) ) {
		ADIOI_Info_set(info, "romio_min_fdomain_size", value);
		fd->hints->min_fdomain_size = intval;
	}
  /* Now we use striping unit in common code so we should
     process hints for it. */
	ADIOI_Info_check_and_install_int(fd, users_info, "striping_unit",
		&(fd->hints->striping_unit), myname, error_code);

#ifdef BGQPLATFORM
	memset( value, 0, MPI_MAX_INFO_VAL+1 );
        ADIOI_Info_get(users_info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL,
		     value, &flag);
	if (flag && ((intval = atoi(value)) > 0)) {

	    did_anything = 1;
	    ADIOI_Info_set(info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, value);
	    fd->hints->cb_nodes = intval;
	}
#endif
    }

    /* special CB aggregator assignment */
    if (did_anything) {
#ifdef BGQPLATFORM
	ADIOI_BG_gen_agg_ranklist(fd, fd->hints->cb_nodes);
#elif PEPLATFORM
	ADIOI_PE_gen_agg_ranklist(fd);
#endif
    }

    /* deferred_open won't be set by callers, but if the user doesn't
     * explicitly disable collecitve buffering (two-phase) and does hint that
     * io w/o independent io is going on, we'll set this internal hint as a
     * convenience */
    if ( ( (fd->hints->cb_read != ADIOI_HINT_DISABLE) \
			    && (fd->hints->cb_write != ADIOI_HINT_DISABLE)\
			    && fd->hints->no_indep_rw ) ) {
	    fd->hints->deferred_open = 1;
    } else {
	    /* setting romio_no_indep_rw enable and romio_cb_{read,write}
	     * disable at the same time doesn't make sense. honor
	     * romio_cb_{read,write} and force the no_indep_rw hint to
	     * 'disable' */
	    ADIOI_Info_set(info, "romio_no_indep_rw", "false");
	    fd->hints->no_indep_rw = 0;
	    fd->hints->deferred_open = 0;
    }

    /* BobC commented this out, but since hint processing runs on both bg and
     * bglockless, we need to keep DS writes enabled on gpfs and disabled on
     * PVFS */
    if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
    /* disable data sieving for fs that do not
       support file locking */
       	ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
		     value, &flag);
	if (flag) {
	    /* get rid of this value if it is set */
	    ADIOI_Info_delete(info, "ind_wr_buffer_size");
	}
	/* note: leave ind_wr_buffer_size alone; used for other cases
	 * as well. -- Rob Ross, 04/22/2003
	 */
	ADIOI_Info_set(info, "romio_ds_write", "disable");
	fd->hints->ds_write = ADIOI_HINT_DISABLE;
    }

    ADIOI_Free(value);

    *error_code = MPI_SUCCESS;
}
Beispiel #13
0
void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count, 
			  MPI_Datatype datatype, int file_ptr_type,
			  ADIO_Offset offset, ADIO_Status *status,
			  int *error_code)
{
    int err = -1, datatype_size;
    ADIO_Offset len;
    static char myname[] = "ADIOI_GEN_READCONTIG";

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5034, 0, NULL);
#endif
    MPI_Type_size(datatype, &datatype_size);
    len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
    ADIOI_Assert(len == (unsigned int) len); /* read takes an unsigned int parm */

    if (file_ptr_type == ADIO_INDIVIDUAL) {
	offset = fd->fp_ind;
    }

    if (fd->fp_sys_posn != offset) {
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
	err = lseek(fd->fd_sys, offset, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
	/* --BEGIN ERROR HANDLING-- */
	if (err == -1) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE,
					       myname, __LINE__,
					       MPI_ERR_IO, "**io",
					       "**io %s", strerror(errno));
	    fd->fp_sys_posn = -1;
	    return;
	}
	/* --END ERROR HANDLING-- */
    }

#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
#endif
    err = read(fd->fd_sys, buf, (unsigned int)len);
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
#endif
    /* --BEGIN ERROR HANDLING-- */
    if (err == -1) {
	*error_code = MPIO_Err_create_code(MPI_SUCCESS,
					   MPIR_ERR_RECOVERABLE,
					   myname, __LINE__,
					   MPI_ERR_IO, "**io",
					   "**io %s", strerror(errno));
	fd->fp_sys_posn = -1;
	return;
    }
    /* --END ERROR HANDLING-- */

    fd->fp_sys_posn = offset + err;

    if (file_ptr_type == ADIO_INDIVIDUAL) {
	fd->fp_ind += err; 
    }

#ifdef HAVE_STATUS_SET_BYTES
    if (err != -1) MPIR_Status_set_bytes(status, datatype, err);
#endif

    *error_code = MPI_SUCCESS;
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5035, 0, NULL);
#endif
}
Beispiel #14
0
/* Some of this code is from the old Calc_my_off_len() function.
 * It calculates the 1st and last byte accessed */
void ADIOI_Calc_bounds(ADIO_File fd, int count, MPI_Datatype buftype,
                       int file_ptr_type, ADIO_Offset offset,
                       ADIO_Offset * st_offset, ADIO_Offset * end_offset)
{
    MPI_Count filetype_size, buftype_size, etype_size;
    int sum;
    MPI_Aint filetype_extent;
    ADIO_Offset total_io;
    int filetype_is_contig;
    ADIO_Offset i, remainder;
    ADIOI_Flatlist_node *flat_file;

    ADIO_Offset st_byte_off, end_byte_off;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event(5000, 0, NULL);
#endif

    if (!count) {
        /* Max signed positive value for ADIO_Offset
         * (arch. dependent?).  is there a better way? */
        memset(st_offset, 8, sizeof(ADIO_Offset));
        *st_offset = *st_offset / 2;
        *end_offset = -1;
        return;
    }

    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size_x(fd->filetype, &filetype_size);
    ADIOI_Assert(filetype_size != 0);
    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size_x(fd->etype, &etype_size);
    MPI_Type_size_x(buftype, &buftype_size);

    total_io = buftype_size * count;

    if (filetype_is_contig) {
        if (file_ptr_type == ADIO_INDIVIDUAL)
            st_byte_off = fd->fp_ind;
        else
            st_byte_off = fd->disp + etype_size * offset;

        end_byte_off = st_byte_off + total_io - 1;
    } else {
        flat_file = ADIOI_Flatten_and_find(fd->filetype);

        /* we need to take care of some weirdness since fd->fp_ind
         * points at an accessible byte in file.  the first accessible
         * byte in the file is not necessarily the first byte, nor is
         * it necessarily the first off/len pair in the filetype. */
        if (file_ptr_type == ADIO_INDIVIDUAL) {
            st_byte_off = fd->fp_ind;
            /* find end byte of I/O (may be in middle of an etype) */

            /* calculate byte starting point of first filetype */
            end_byte_off = (ADIO_Offset)
                ((fd->fp_ind - fd->disp - flat_file->indices[0]) /
                 filetype_extent) * filetype_extent + fd->disp + flat_file->indices[0];
            /* number of absolute bytes into first filetype */
            remainder = (fd->fp_ind - fd->disp - flat_file->indices[0]) % filetype_extent;
            if (remainder) {
                /* find how many file viewable bytes into first filetype */
                sum = 0;
                for (i = 0; i < flat_file->count; i++) {
                    sum += flat_file->blocklens[i];
                    if ((flat_file->indices[i] - flat_file->indices[0] +
                         flat_file->blocklens[i]) >= remainder) {
                        sum -= (flat_file->blocklens[i] - (sum - remainder));
                        break;
                    }
                }
                total_io += sum;
            }
            /* byte starting point of last filetype */
            end_byte_off += (total_io - 1) / filetype_size * filetype_extent;
            /* number of bytes into last filetype */
            remainder = total_io % filetype_size;
            if (!remainder) {
                for (i = flat_file->count - 1; i >= 0; i--) {
                    if (flat_file->blocklens[i])
                        break;
                }
                ADIOI_Assert(i > -1);
                end_byte_off += flat_file->indices[i] + flat_file->blocklens[i] - 1;
                end_byte_off -= flat_file->indices[0];
            } else {
                sum = 0;
                for (i = 0; i < flat_file->count; i++) {
                    sum += flat_file->blocklens[i];
                    if (sum >= remainder) {
                        end_byte_off += flat_file->indices[i] +
                            flat_file->blocklens[i] - sum + remainder - 1;
                        break;
                    }
                }
                end_byte_off -= flat_file->indices[0];
            }
        } else {
            /* find starting byte of I/O (must be aligned with an etype) */
            /* byte starting point of starting filetype */
            st_byte_off = fd->disp + ((offset * etype_size) / filetype_size) * filetype_extent;
            /* number of file viewable bytes into starting filetype */
            remainder = (etype_size * offset) % filetype_size;

            sum = 0;
            for (i = 0; i < flat_file->count; i++) {
                sum += flat_file->blocklens[i];
                if (sum >= remainder) {
                    if (sum == remainder)
                        st_byte_off += flat_file->indices[i + 1];
                    else
                        st_byte_off += flat_file->indices[i] +
                            flat_file->blocklens[i] - sum + remainder;
                    break;
                }
            }

            /* find end byte of I/O (may be in middle of an etype) */
            /* byte starting point of last filetype */
            end_byte_off = fd->disp + (offset * etype_size + total_io) /
                filetype_size * filetype_extent;
            /* number of bytes into last filetype */
            remainder = (offset * etype_size + total_io) % filetype_size;

            if (!remainder) {
                /* the last non-zero off/len pair */
                for (i = flat_file->count - 1; i >= 0; i--) {
                    if (flat_file->blocklens[i])
                        break;
                }
                ADIOI_Assert(i >= 0);
                /* back up a whole filetype, and put back up to the
                 * last byte of the last non-zero offlen pair */
                /* end_byte_off = (end_byte_off - filetype_extent) +
                 * flat_file->indices[i] +
                 * flat_file->blocklens[i] - 1; */
                /* equivalent of above commented out equation */
                end_byte_off -= filetype_extent - flat_file->indices[i] -
                    flat_file->blocklens[i] + 1;
            } else {
                sum = 0;
                for (i = 0; i < flat_file->count; i++) {
                    sum += flat_file->blocklens[i];
                    if (sum >= remainder) {
                        end_byte_off += flat_file->indices[i] +
                            flat_file->blocklens[i] - sum + remainder - 1;
                        break;
                    }
                }
            }
        }
    }

    *st_offset = st_byte_off;
    *end_offset = end_byte_off;
#ifdef DEBUG
    printf("st_offset = %lld\nend_offset = %lld\n", st_byte_off, end_byte_off);
#endif
#ifdef AGGREGATION_PROFILE
    MPE_Log_event(5001, 0, NULL);
#endif
}
Beispiel #15
0
/* If successful, error_code is set to MPI_SUCCESS.  Otherwise an error
 * code is created and returned in error_code.
 */
static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
                                 datatype, int nprocs,
                                 int myrank,
                                 ADIOI_Access
                                 * others_req, ADIO_Offset * offset_list,
                                 ADIO_Offset * len_list, int contig_access_count,
                                 ADIO_Offset min_st_offset, ADIO_Offset fd_size,
                                 ADIO_Offset * fd_start, ADIO_Offset * fd_end,
                                 MPI_Aint * buf_idx, int *error_code)
{
/* Send data to appropriate processes and write in sizes of no more
   than coll_bufsize.
   The idea is to reduce the amount of extra memory required for
   collective I/O. If all data were written all at once, which is much
   easier, it would require temp space more than the size of user_buf,
   which is often unacceptable. For example, to write a distributed
   array to a file, where each local array is 8Mbytes, requiring
   at least another 8Mbytes of temp space is unacceptable. */

    /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets */
    ADIO_Offset size = 0;
    int hole, i, j, m, ntimes, max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc = -1, end_loc = -1, off, done, req_off;
    char *write_buf = NULL;
    int *curr_offlen_ptr, *count, *send_size, req_len, *recv_size;
    int *partial_recv, *sent_to_proc, *start_pos, flag;
    int *send_buf_idx, *curr_to_proc, *done_to_proc;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf = NULL;
    MPI_Aint buftype_extent;
    int info_flag, coll_bufsize;
    char *value;
    static char myname[] = "ADIOI_EXCH_AND_WRITE";

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */

/* calculate the number of writes of size coll_bufsize
   to be done by each process and the max among all processes.
   That gives the no. of communication phases as well. */

    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
    ADIOI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag);
    coll_bufsize = atoi(value);
    ADIOI_Free(value);


    for (i = 0; i < nprocs; i++) {
        if (others_req[i].count) {
            st_loc = others_req[i].offsets[0];
            end_loc = others_req[i].offsets[0];
            break;
        }
    }

    for (i = 0; i < nprocs; i++)
        for (j = 0; j < others_req[i].count; j++) {
            st_loc = MPL_MIN(st_loc, others_req[i].offsets[j]);
            end_loc = MPL_MAX(end_loc, (others_req[i].offsets[j]
                                        + others_req[i].lens[j] - 1));
        }

/* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/

    ntimes = (int) ((end_loc - st_loc + coll_bufsize) / coll_bufsize);

    if ((st_loc == -1) && (end_loc == -1)) {
        ntimes = 0;     /* this process does no writing. */
    }

    MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm);

    write_buf = fd->io_buf;

    curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs * 10, sizeof(int));
    /* its use is explained below. calloc initializes to 0. */

    count = curr_offlen_ptr + nprocs;
    /* to store count of how many off-len pairs per proc are satisfied
     * in an iteration. */

    partial_recv = count + nprocs;
    /* if only a portion of the last off-len pair is recd. from a process
     * in a particular iteration, the length recd. is stored here.
     * calloc initializes to 0. */

    send_size = partial_recv + nprocs;
    /* total size of data to be sent to each proc. in an iteration.
     * Of size nprocs so that I can use MPI_Alltoall later. */

    recv_size = send_size + nprocs;
    /* total size of data to be recd. from each proc. in an iteration. */

    sent_to_proc = recv_size + nprocs;
    /* amount of data sent to each proc so far. Used in
     * ADIOI_Fill_send_buffer. initialized to 0 here. */

    send_buf_idx = sent_to_proc + nprocs;
    curr_to_proc = send_buf_idx + nprocs;
    done_to_proc = curr_to_proc + nprocs;
    /* Above three are used in ADIOI_Fill_send_buffer */

    start_pos = done_to_proc + nprocs;
    /* used to store the starting value of curr_offlen_ptr[i] in
     * this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
        flat_buf = ADIOI_Flatten_and_find(datatype);
    }
    MPI_Type_extent(datatype, &buftype_extent);


/* I need to check if there are any outstanding nonblocking writes to
   the file, which could potentially interfere with the writes taking
   place in this collective write call. Since this is not likely to be
   common, let me do the simplest thing possible here: Each process
   completes all pending nonblocking operations before completing. */

    /*ADIOI_Complete_async(error_code);
     * if (*error_code != MPI_SUCCESS) return;
     * MPI_Barrier(fd->comm);
     */

    done = 0;
    off = st_loc;

    for (m = 0; m < ntimes; m++) {
        /* go through all others_req and check which will be satisfied
         * by the current write */

        /* Note that MPI guarantees that displacements in filetypes are in
         * monotonically nondecreasing order and that, for writes, the
         * filetypes cannot specify overlapping regions in the file. This
         * simplifies implementation a bit compared to reads. */

        /* off = start offset in the file for the data to be written in
         * this iteration
         * size = size of data written (bytes) corresponding to off
         * req_off = off in file for a particular contiguous request
         * minus what was satisfied in previous iteration
         * req_size = size corresponding to req_off */

        /* first calculate what should be communicated */

        for (i = 0; i < nprocs; i++)
            count[i] = recv_size[i] = 0;

        size = MPL_MIN((unsigned) coll_bufsize, end_loc - st_loc + 1 - done);

        for (i = 0; i < nprocs; i++) {
            if (others_req[i].count) {
                start_pos[i] = curr_offlen_ptr[i];
                for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) {
                    if (partial_recv[i]) {
                        /* this request may have been partially
                         * satisfied in the previous iteration. */
                        req_off = others_req[i].offsets[j] + partial_recv[i];
                        req_len = others_req[i].lens[j] - partial_recv[i];
                        partial_recv[i] = 0;
                        /* modify the off-len pair to reflect this change */
                        others_req[i].offsets[j] = req_off;
                        others_req[i].lens[j] = req_len;
                    } else {
                        req_off = others_req[i].offsets[j];
                        req_len = others_req[i].lens[j];
                    }
                    if (req_off < off + size) {
                        count[i]++;
                        ADIOI_Assert((((ADIO_Offset) (uintptr_t) write_buf) + req_off - off) ==
                                     (ADIO_Offset) (uintptr_t) (write_buf + req_off - off));
                        MPI_Address(write_buf + req_off - off, &(others_req[i].mem_ptrs[j]));
                        ADIOI_Assert((off + size - req_off) == (int) (off + size - req_off));
                        recv_size[i] += (int) (MPL_MIN(off + size - req_off, (unsigned) req_len));

                        if (off + size - req_off < (unsigned) req_len) {
                            partial_recv[i] = (int) (off + size - req_off);

                            /* --BEGIN ERROR HANDLING-- */
                            if ((j + 1 < others_req[i].count) &&
                                (others_req[i].offsets[j + 1] < off + size)) {
                                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                                   MPIR_ERR_RECOVERABLE,
                                                                   myname,
                                                                   __LINE__,
                                                                   MPI_ERR_ARG,
                                                                   "Filetype specifies overlapping write regions (which is illegal according to the MPI-2 specification)",
                                                                   0);
                                /* allow to continue since additional
                                 * communication might have to occur
                                 */
                            }
                            /* --END ERROR HANDLING-- */
                            break;
                        }
                    } else
                        break;
                }
                curr_offlen_ptr[i] = j;
            }
        }

        ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                              len_list, send_size, recv_size, off, size, count,
                              start_pos, partial_recv,
                              sent_to_proc, nprocs, myrank,
                              buftype_is_contig, contig_access_count,
                              min_st_offset, fd_size, fd_start, fd_end,
                              others_req, send_buf_idx, curr_to_proc,
                              done_to_proc, &hole, m, buftype_extent, buf_idx, error_code);
        if (*error_code != MPI_SUCCESS)
            return;

        flag = 0;
        for (i = 0; i < nprocs; i++)
            if (count[i])
                flag = 1;

        if (flag) {
            ADIOI_Assert(size == (int) size);
            ADIO_WriteContig(fd, write_buf, (int) size, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                             off, &status, error_code);
            if (*error_code != MPI_SUCCESS)
                return;
        }

        off += size;
        done += size;
    }

    for (i = 0; i < nprocs; i++)
        count[i] = recv_size[i] = 0;
    for (m = ntimes; m < max_ntimes; m++) {
        ADIOI_Assert(size == (int) size);
        /* nothing to recv, but check for send. */
        ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                              len_list, send_size, recv_size, off, (int) size, count,
                              start_pos, partial_recv,
                              sent_to_proc, nprocs, myrank,
                              buftype_is_contig, contig_access_count,
                              min_st_offset, fd_size, fd_start, fd_end,
                              others_req, send_buf_idx,
                              curr_to_proc, done_to_proc, &hole, m,
                              buftype_extent, buf_idx, error_code);
        if (*error_code != MPI_SUCCESS)
            return;
    }

    ADIOI_Free(curr_offlen_ptr);
}
Beispiel #16
0
/* ADIOI_cb_gather_name_array() - gather a list of processor names from all processes
 *                          in a communicator and store them on rank 0.
 *
 * This is a collective call on the communicator(s) passed in.
 *
 * Obtains a rank-ordered list of processor names from the processes in
 * "dupcomm".
 *
 * Returns 0 on success, -1 on failure.
 *
 * NOTE: Needs some work to cleanly handle out of memory cases!  
 */
int ADIOI_cb_gather_name_array(MPI_Comm comm,
			       MPI_Comm dupcomm,
			       ADIO_cb_name_array *arrayp)
{
    char my_procname[MPI_MAX_PROCESSOR_NAME], **procname = 0;
    int *procname_len = NULL, my_procname_len, *disp = NULL, i;
    int commsize, commrank, found;
    ADIO_cb_name_array array = NULL;
    int alloc_size;

    if (ADIOI_cb_config_list_keyval == MPI_KEYVAL_INVALID) {
        /* cleaned up by ADIOI_End_call */
	MPI_Keyval_create((MPI_Copy_function *) ADIOI_cb_copy_name_array, 
			  (MPI_Delete_function *) ADIOI_cb_delete_name_array,
			  &ADIOI_cb_config_list_keyval, NULL);
    }
    else {
	MPI_Attr_get(comm, ADIOI_cb_config_list_keyval, (void *) &array, &found);
        if (found) {
            ADIOI_Assert(array != NULL);
	    *arrayp = array;
	    return 0;
	}
    }

    MPI_Comm_size(dupcomm, &commsize);
    MPI_Comm_rank(dupcomm, &commrank);

    MPI_Get_processor_name(my_procname, &my_procname_len);

    /* allocate space for everything */
    array = (ADIO_cb_name_array) ADIOI_Malloc(sizeof(*array));
    if (array == NULL) {
	return -1;
    }
    array->refct = 2; /* we're going to associate this with two comms */

    if (commrank == 0) {
	/* process 0 keeps the real list */
	array->namect = commsize;

	array->names = (char **) ADIOI_Malloc(sizeof(char *) * commsize);
	if (array->names == NULL) {
	    return -1;
	}
	procname = array->names; /* simpler to read */

	procname_len = (int *) ADIOI_Malloc(commsize * sizeof(int));
	if (procname_len == NULL) { 
	    return -1;
	}
    }
    else {
	/* everyone else just keeps an empty list as a placeholder */
	array->namect = 0;
	array->names = NULL;
    }
    /* gather lengths first */
    MPI_Gather(&my_procname_len, 1, MPI_INT, 
	       procname_len, 1, MPI_INT, 0, dupcomm);

    if (commrank == 0) {
#ifdef CB_CONFIG_LIST_DEBUG
	for (i=0; i < commsize; i++) {
	    FPRINTF(stderr, "len[%d] = %d\n", i, procname_len[i]);
	}
#endif

	alloc_size = 0;
	for (i=0; i < commsize; i++) {
	    /* add one to the lengths because we need to count the
	     * terminator, and we are going to use this list of lengths
	     * again in the gatherv.  
	     */
	    alloc_size += ++procname_len[i];
	}
	
	procname[0] = ADIOI_Malloc(alloc_size);
	if (procname[0] == NULL) {
	    return -1;
	}

	for (i=1; i < commsize; i++) {
	    procname[i] = procname[i-1] + procname_len[i-1];
	}
	
	/* create our list of displacements for the gatherv.  we're going
	 * to do everything relative to the start of the region allocated
	 * for procname[0]
	 */
	disp = ADIOI_Malloc(commsize * sizeof(int));
	disp[0] = 0;
	for (i=1; i < commsize; i++) {
	    disp[i] = (int) (procname[i] - procname[0]);
	}

    }

    /* now gather strings */
    if (commrank == 0) {
	MPI_Gatherv(my_procname, my_procname_len + 1, MPI_CHAR, 
		    procname[0], procname_len, disp, MPI_CHAR,
		    0, dupcomm);
    }
    else {
	/* if we didn't do this, we would need to allocate procname[]
	 * on all processes...which seems a little silly.
	 */
	MPI_Gatherv(my_procname, my_procname_len + 1, MPI_CHAR, 
		    NULL, NULL, NULL, MPI_CHAR, 0, dupcomm);
    }

    if (commrank == 0) {
	/* no longer need the displacements or lengths */
	ADIOI_Free(disp);
	ADIOI_Free(procname_len);

#ifdef CB_CONFIG_LIST_DEBUG
	for (i=0; i < commsize; i++) {
	    FPRINTF(stderr, "name[%d] = %s\n", i, procname[i]);
	}
#endif
    }

    /* store the attribute; we want to store SOMETHING on all processes
     * so that they can all tell if we have gone through this procedure 
     * or not for the given communicator.
     *
     * specifically we put it on both the original comm, so we can find
     * it next time an open is performed on this same comm, and on the
     * dupcomm, so we can use it in I/O operations.
     */
    MPI_Attr_put(comm, ADIOI_cb_config_list_keyval, array);
    MPI_Attr_put(dupcomm, ADIOI_cb_config_list_keyval, array);
    *arrayp = array;
    return 0;
}
Beispiel #17
0
void ADIOI_GEN_WriteStrided_naive(ADIO_File fd, const void *buf, int count,
                       MPI_Datatype buftype, int file_ptr_type,
                       ADIO_Offset offset, ADIO_Status *status, int
                       *error_code)
{
    /* offset is in units of etype relative to the filetype. */

    ADIOI_Flatlist_node *flat_buf, *flat_file;
    /* bwr == buffer write; fwr == file write */
    ADIO_Offset bwr_size, fwr_size=0, sum, size_in_filetype; 
    int b_index;
    MPI_Count bufsize;
    ADIO_Offset n_etypes_in_filetype;
    ADIO_Offset size, n_filetypes, etype_in_filetype;
    ADIO_Offset abs_off_in_filetype=0, req_len;
    MPI_Count filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent, lb; 
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off;
    ADIO_Offset off, req_off, disp, end_offset=0, start_off;
    ADIO_Status status1;

    *error_code = MPI_SUCCESS;  /* changed below if error */

    ADIOI_Datatype_iscontig(buftype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size_x(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
	MPIR_Status_set_bytes(status, buftype, 0);
#endif
	*error_code = MPI_SUCCESS; 
	return;
    }

    MPI_Type_get_extent(fd->filetype, &lb, &filetype_extent);
    MPI_Type_size_x(buftype, &buftype_size);
    MPI_Type_get_extent(buftype, &lb, &buftype_extent);
    etype_size = fd->etype_size;

    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
    bufsize = buftype_size * count;

    /* contiguous in buftype and filetype is handled elsewhere */

    if (!buftype_is_contig && filetype_is_contig) {
    	int b_count;
	/* noncontiguous in memory, contiguous in file. */

	flat_buf = ADIOI_Flatten_and_find(buftype);

        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : 
              fd->disp + (ADIO_Offset)etype_size * offset;

	start_off = off;
	end_offset = off + bufsize - 1;

	/* if atomicity is true, lock (exclusive) the region to be accessed */
        if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS))
	{
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
	}

	/* for each region in the buffer, grab the data and put it in
	 * place
	 */
        for (b_count=0; b_count < count; b_count++) {
            for (b_index=0; b_index < flat_buf->count; b_index++) {
                userbuf_off = (ADIO_Offset)b_count*(ADIO_Offset)buftype_extent + 
		              flat_buf->indices[b_index];
		req_off = off;
		req_len = flat_buf->blocklens[b_index];

    ADIOI_Assert(req_len == (int) req_len);
    ADIOI_Assert((((ADIO_Offset)(MPIU_Upint)buf) + userbuf_off) == (ADIO_Offset)(MPIU_Upint)((MPIU_Upint)buf + userbuf_off));
		ADIO_WriteContig(fd, 
				(char *) buf + userbuf_off,
				(int)req_len, 
				MPI_BYTE, 
		    		ADIO_EXPLICIT_OFFSET,
				req_off,
				&status1,
				error_code);
		if (*error_code != MPI_SUCCESS) return;

		/* off is (potentially) used to save the final offset later */
                off += flat_buf->blocklens[b_index];
            }
	}

        if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS))
	{
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
	}

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

    }

    else {  /* noncontiguous in file */
    	int f_index, st_index = 0;
      ADIO_Offset st_fwr_size, st_n_filetypes;
	int flag;

        /* First we're going to calculate a set of values for use in all
	 * the noncontiguous in file cases:
	 * start_off - starting byte position of data in file
	 * end_offset - last byte offset to be acessed in the file
	 * st_n_filetypes - how far into the file we start in terms of
	 *                  whole filetypes
	 * st_index - index of block in first filetype that we will be
	 *            starting in (?)
	 * st_fwr_size - size of the data in the first filetype block
	 *               that we will write (accounts for being part-way
	 *               into writing this block of the filetype
	 *
	 */

	/* filetype already flattened in ADIO_Open */
	flat_file = ADIOI_Flatlist;
	while (flat_file->type != fd->filetype) flat_file = flat_file->next;
	disp = fd->disp;

	if (file_ptr_type == ADIO_INDIVIDUAL) {
	    start_off = fd->fp_ind; /* in bytes */
	    n_filetypes = -1;
	    flag = 0;
	    while (!flag) {
                n_filetypes++;
		for (f_index=0; f_index < flat_file->count; f_index++) {
		    if (disp + flat_file->indices[f_index] + 
                       n_filetypes*(ADIO_Offset)filetype_extent + 
		       flat_file->blocklens[f_index] >= start_off) 
		    {
		    	/* this block contains our starting position */

			st_index = f_index;
			fwr_size = disp + flat_file->indices[f_index] + 
		 	           n_filetypes*(ADIO_Offset)filetype_extent + 
				   flat_file->blocklens[f_index] - start_off;
			flag = 1;
			break;
		    }
		}
	    }
	}
	else {
	    n_etypes_in_filetype = filetype_size/etype_size;
	    n_filetypes = offset / n_etypes_in_filetype;
	    etype_in_filetype = offset % n_etypes_in_filetype;
	    size_in_filetype = etype_in_filetype * etype_size;
 
	    sum = 0;
	    for (f_index=0; f_index < flat_file->count; f_index++) {
		sum += flat_file->blocklens[f_index];
		if (sum > size_in_filetype) {
		    st_index = f_index;
		    fwr_size = sum - size_in_filetype;
		    abs_off_in_filetype = flat_file->indices[f_index] +
			                  size_in_filetype - 
			                  (sum - flat_file->blocklens[f_index]);
		    break;
		}
	    }

	    /* abs. offset in bytes in the file */
	    start_off = disp + n_filetypes*(ADIO_Offset)filetype_extent + 
	    	        abs_off_in_filetype;
	}

	st_fwr_size = fwr_size;
	st_n_filetypes = n_filetypes;

	/* start_off, st_n_filetypes, st_index, and st_fwr_size are 
	 * all calculated at this point
	 */

        /* Calculate end_offset, the last byte-offset that will be accessed.
         * e.g., if start_off=0 and 100 bytes to be written, end_offset=99
	 */
	userbuf_off = 0;
	f_index = st_index;
	off = start_off;
	fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
	while (userbuf_off < bufsize) {
	    userbuf_off += fwr_size;
	    end_offset = off + fwr_size - 1;

	    if (f_index < (flat_file->count - 1)) f_index++;
	    else {
		f_index = 0;
		n_filetypes++;
	    }

	    off = disp + flat_file->indices[f_index] + 
	          n_filetypes*(ADIO_Offset)filetype_extent;
	    fwr_size = ADIOI_MIN(flat_file->blocklens[f_index], 
	                         bufsize-(unsigned)userbuf_off);
	}

	/* End of calculations.  At this point the following values have
	 * been calculated and are ready for use:
	 * - start_off
	 * - end_offset
	 * - st_n_filetypes
	 * - st_index
	 * - st_fwr_size
	 */

	/* if atomicity is true, lock (exclusive) the region to be accessed */
        if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS))
	{
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
	}

	if (buftype_is_contig && !filetype_is_contig) {
	    /* contiguous in memory, noncontiguous in file. should be the
	     * most common case.
	     */

	    userbuf_off = 0;
	    f_index = st_index;
	    off = start_off;
	    n_filetypes = st_n_filetypes;
	    fwr_size = ADIOI_MIN(st_fwr_size, bufsize);

	    /* while there is still space in the buffer, write more data */
	    while (userbuf_off < bufsize) {
                if (fwr_size) { 
                    /* TYPE_UB and TYPE_LB can result in 
                       fwr_size = 0. save system call in such cases */ 
		    req_off = off;
		    req_len = fwr_size;

        ADIOI_Assert(req_len == (int) req_len);
        ADIOI_Assert((((ADIO_Offset)(MPIU_Upint)buf) + userbuf_off) == (ADIO_Offset)(MPIU_Upint)((MPIU_Upint)buf + userbuf_off));
		    ADIO_WriteContig(fd, 
				    (char *) buf + userbuf_off,
				    (int)req_len, 
				    MPI_BYTE, 
				    ADIO_EXPLICIT_OFFSET,
				    req_off,
				    &status1,
				    error_code);
		    if (*error_code != MPI_SUCCESS) return;
		}
		userbuf_off += fwr_size;

                if (off + fwr_size < disp + flat_file->indices[f_index] +
                   flat_file->blocklens[f_index] + 
		   n_filetypes*(ADIO_Offset)filetype_extent)
		{
		    /* important that this value be correct, as it is
		     * used to set the offset in the fd near the end of
		     * this function.
		     */
                    off += fwr_size;
		}
                /* did not reach end of contiguous block in filetype.
                 * no more I/O needed. off is incremented by fwr_size.
		 */
                else {
		    if (f_index < (flat_file->count - 1)) f_index++;
		    else {
			f_index = 0;
			n_filetypes++;
		    }
		    off = disp + flat_file->indices[f_index] + 
                          n_filetypes*(ADIO_Offset)filetype_extent;
		    fwr_size = ADIOI_MIN(flat_file->blocklens[f_index], 
		                         bufsize-(unsigned)userbuf_off);
		}
	    }
	}
	else {
	    ADIO_Offset i_offset, tmp_bufsize = 0;
	    /* noncontiguous in memory as well as in file */

	    flat_buf = ADIOI_Flatten_and_find(buftype);

	    b_index = buf_count = 0;
	    i_offset = flat_buf->indices[0];
	    f_index = st_index;
	    off = start_off;
	    n_filetypes = st_n_filetypes;
	    fwr_size = st_fwr_size;
	    bwr_size = flat_buf->blocklens[0];

	    /* while we haven't read size * count bytes, keep going */
	    while (tmp_bufsize < bufsize) {
    		ADIO_Offset new_bwr_size = bwr_size, new_fwr_size = fwr_size;

		size = ADIOI_MIN(fwr_size, bwr_size);
		if (size) {
		    req_off = off;
		    req_len = size;
		    userbuf_off = i_offset;

        ADIOI_Assert(req_len == (int) req_len);
        ADIOI_Assert((((ADIO_Offset)(MPIU_Upint)buf) + userbuf_off) == (ADIO_Offset)(MPIU_Upint)((MPIU_Upint)buf + userbuf_off));
		    ADIO_WriteContig(fd, 
				    (char *) buf + userbuf_off,
				    (int)req_len, 
				    MPI_BYTE, 
				    ADIO_EXPLICIT_OFFSET,
				    req_off,
				    &status1,
				    error_code);
		    if (*error_code != MPI_SUCCESS) return;
		}

		if (size == fwr_size) {
		    /* reached end of contiguous block in file */
		    if (f_index < (flat_file->count - 1)) f_index++;
		    else {
			f_index = 0;
			n_filetypes++;
		    }

		    off = disp + flat_file->indices[f_index] + 
                          n_filetypes*(ADIO_Offset)filetype_extent;

		    new_fwr_size = flat_file->blocklens[f_index];
		    if (size != bwr_size) {
			i_offset += size;
			new_bwr_size -= size;
		    }
		}

		if (size == bwr_size) {
		    /* reached end of contiguous block in memory */

		    b_index = (b_index + 1)%flat_buf->count;
		    buf_count++;
		    i_offset = (ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
			flat_buf->indices[b_index];
		    new_bwr_size = flat_buf->blocklens[b_index];
		    if (size != fwr_size) {
			off += size;
			new_fwr_size -= size;
		    }
		}
		tmp_bufsize += size;
		fwr_size = new_fwr_size;
                bwr_size = new_bwr_size;
	    }
	}

	/* unlock the file region if we locked it */
        if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS))
	{
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
	}

	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
    } /* end of (else noncontiguous in file) */

    fd->fp_sys_posn = -1;   /* mark it as invalid. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, buftype, bufsize);
    /* This is a temporary way of filling in status. The right way is to 
     * keep track of how much data was actually written and placed in buf 
     */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(buftype);
}
void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count, 
                     MPI_Datatype datatype, int file_ptr_type,
		     ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
    int err=-1, datatype_size;
    ADIO_Offset len;
    static char myname[] = "ADIOI_BGL_WRITECONTIG";
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5036, 0, NULL);
#endif
#if BGL_PROFILE
		/* timing */
		double io_time, io_time2;

		if (bglmpio_timing) { 
		    io_time = MPI_Wtime(); 
		    bglmpio_prof_cw[ BGLMPIO_CIO_DATA_SIZE ] += len;
		}
#endif
			  
    MPI_Type_size(datatype, &datatype_size);
    len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
    ADIOI_Assert(len == (unsigned int) len); /* write takes an unsigned int parm */

#if BGL_PROFILE

    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
        	if (bglmpio_timing2) io_time2 = MPI_Wtime();
	if (fd->fp_sys_posn != offset)
	    lseek(fd->fd_sys, offset, SEEK_SET);
        	if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
        	if (bglmpio_timing2) io_time2 = MPI_Wtime();
	err = write(fd->fd_sys, buf, (unsigned int)len);
        	if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
	fd->fp_sys_posn = offset + err;
	/* individual file pointer not updated */        
    }
    else { /* write from curr. location of ind. file pointer */
	offset = fd->fp_ind;
	        if (bglmpio_timing2) io_time2 = MPI_Wtime();
	if (fd->fp_sys_posn != fd->fp_ind)
	    lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
        	if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
        	if (bglmpio_timing2) io_time2 = MPI_Wtime();
	err = write(fd->fd_sys, buf, (unsigned int)len);
        	if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
	fd->fp_ind += err;
	fd->fp_sys_posn = fd->fp_ind;
    }

#else	/* BGL_PROFILE */

    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
	if (fd->fp_sys_posn != offset)
	    lseek(fd->fd_sys, offset, SEEK_SET);
	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
	err = write(fd->fd_sys, buf, (unsigned int)len);
	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
	fd->fp_sys_posn = offset + err;
	/* individual file pointer not updated */        
    }
    else { /* write from curr. location of ind. file pointer */
	offset = fd->fp_ind;
	if (fd->fp_sys_posn != fd->fp_ind)
	    lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
	ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
	err = write(fd->fd_sys, buf, (unsigned int)len);
	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
	fd->fp_ind += err;
	fd->fp_sys_posn = fd->fp_ind;
    }

#endif	/* BGL_PROFILE */

#if BGL_PROFILE
		if (bglmpio_timing) bglmpio_prof_cw[ BGLMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
#endif

    /* --BEGIN ERROR HANDLING-- */
    if (err == -1) {
	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
					   myname, __LINE__, MPI_ERR_IO,
					   "**io",
					   "**io %s", strerror(errno));
	return;
    }
    /* --END ERROR HANDLING-- */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, err);
#endif

    *error_code = MPI_SUCCESS;
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5037, 0, NULL);
#endif
}
void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
                       MPI_Datatype datatype, int file_ptr_type,
                       ADIO_Offset offset, ADIO_Status *status, int
                       *error_code)
{
/* offset is in units of etype relative to the filetype. */



    ADIOI_Flatlist_node *flat_buf, *flat_file;
    ADIO_Offset i_offset, sum, size_in_filetype;
    int i, j, k, err=-1, st_index=0;
    int n_etypes_in_filetype;
    ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes;
    ADIO_Offset abs_off_in_filetype=0;
    int filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent; 
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off;
    ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
    char *writebuf, *value;
    unsigned bufsize, writebuf_len, max_bufsize, write_sz;
    int err_flag=0, info_flag;
    ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len;
    static char myname[] = "ADIOI_BGL_WRITESTRIDED";

    if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
    	/* if user has disabled data sieving on reads, use naive
	 * approach instead.
	 */
      /*FPRINTF(stderr, "ADIOI_GEN_WriteStrided_naive(%d):\n", __LINE__);*/
      ADIOI_GEN_WriteStrided_naive(fd, 
				    buf,
				    count,
				    datatype,
				    file_ptr_type,
				    offset,
				    status,
				    error_code);
    	return;
    }
    /*FPRINTF(stderr, "%s(%d):\n",myname, __LINE__);*/

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
	MPIR_Status_set_bytes(status, datatype, 0);
#endif
	*error_code = MPI_SUCCESS; 
	return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
    bufsize = buftype_size * count;

/* get max_bufsize from the info object. */

    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, 
                 &info_flag);
    max_bufsize = atoi(value);
    ADIOI_Free(value);

    if (!buftype_is_contig && filetype_is_contig) {

/* noncontiguous in memory, contiguous in file. */

	ADIOI_Flatten_datatype(datatype);
	flat_buf = ADIOI_Flatlist;
	while (flat_buf->type != datatype) flat_buf = flat_buf->next;

        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : 
                 fd->disp + etype_size * offset;

        start_off = off;
	end_offset = off + bufsize - 1;
        writebuf_off = off;
        writebuf = (char *) ADIOI_Malloc(max_bufsize);
        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));

/* if atomicity is true, lock the region to be accessed */
        if (fd->atomicity) 
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        for (j=0; j<count; j++) 
        {
          int i;
            for (i=0; i<flat_buf->count; i++) {
                userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
		req_off = off;
		req_len = flat_buf->blocklens[i];
		ADIOI_BUFFERED_WRITE_WITHOUT_READ
                off += flat_buf->blocklens[i];
            }
        }

        /* write the buffer out finally */
	lseek(fd->fd_sys, writebuf_off, SEEK_SET); 
	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
	err = write(fd->fd_sys, writebuf, writebuf_len); 
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
        if (err == -1) err_flag = 1; 

        if (fd->atomicity) 
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

	ADIOI_Free(writebuf); /* malloced in the buffered_write macro */

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
	if (err_flag) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_IO, "**io",
					       "**io %s", strerror(errno));
	}
	else *error_code = MPI_SUCCESS;
    }

    else {  /* noncontiguous in file */

/* filetype already flattened in ADIO_Open */
	flat_file = ADIOI_Flatlist;
	while (flat_file->type != fd->filetype) flat_file = flat_file->next;
	disp = fd->disp;

	if (file_ptr_type == ADIO_INDIVIDUAL) {
	/* Wei-keng reworked type processing to be a bit more efficient */
            offset       = fd->fp_ind - disp;
            n_filetypes  = (offset - flat_file->indices[0]) / filetype_extent;
            offset      -= (ADIO_Offset)n_filetypes * filetype_extent;
            /* now offset is local to this extent */

            /* find the block where offset is located, skip blocklens[i]==0 */
            for (i=0; i<flat_file->count; i++) {
                ADIO_Offset dist;
                if (flat_file->blocklens[i] == 0) continue;
                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
                /* fwr_size is from offset to the end of block i */
                if (dist == 0) {
                    i++;
                    offset   = flat_file->indices[i];
                    fwr_size = flat_file->blocklens[i];
                    break;
                }
                if (dist > 0) {
                    fwr_size = dist;
                    break;
                }
            }
            st_index = i;  /* starting index in flat_file->indices[] */
            offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
	}
	else {
    int i;
	    n_etypes_in_filetype = filetype_size/etype_size;
	    n_filetypes = offset / n_etypes_in_filetype;
	    etype_in_filetype = offset % n_etypes_in_filetype;
	    size_in_filetype = etype_in_filetype * etype_size;
 
	    sum = 0;
	    for (i=0; i<flat_file->count; i++) {
		sum += flat_file->blocklens[i];
		if (sum > size_in_filetype) {
		    st_index = i;
		    fwr_size = sum - size_in_filetype;
		    abs_off_in_filetype = flat_file->indices[i] +
			size_in_filetype - (sum - flat_file->blocklens[i]);
		    break;
		}
	    }

	    /* abs. offset in bytes in the file */
	    offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + 
		    abs_off_in_filetype;
	}

        start_off = offset;
        /* Wei-keng Liao:write request is within single flat_file contig block*/
	/* this could happen, for example, with subarray types that are
	 * actually fairly contiguous */
        if (buftype_is_contig && bufsize <= fwr_size) {
            ADIO_WriteContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                             offset, status, error_code);

	    if (file_ptr_type == ADIO_INDIVIDUAL) {
                /* update MPI-IO file pointer to point to the first byte 
		 * that can be accessed in the fileview. */
                fd->fp_ind = offset + bufsize;
                if (bufsize == fwr_size) {
                    do {
                        st_index++;
                        if (st_index == flat_file->count) {
                            st_index = 0;
                            n_filetypes++;
                        }
                    } while (flat_file->blocklens[st_index] == 0);
                    fd->fp_ind = disp + flat_file->indices[st_index]
                               + (ADIO_Offset)n_filetypes*filetype_extent;
                }
            }
	    fd->fp_sys_posn = -1;   /* set it to null. */ 
#ifdef HAVE_STATUS_SET_BYTES
	    MPIR_Status_set_bytes(status, datatype, bufsize);
#endif 
            return;
        }

       /* Calculate end_offset, the last byte-offset that will be accessed.
         e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/

	st_fwr_size = fwr_size;
	st_n_filetypes = n_filetypes;
	i_offset = 0;
	j = st_index;
	off = offset;
	fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
	while (i_offset < bufsize) {
	    i_offset += fwr_size;
	    end_offset = off + fwr_size - 1;

            j = (j+1) % flat_file->count;
            n_filetypes += (j == 0) ? 1 : 0;
            while (flat_file->blocklens[j]==0) {
                j = (j+1) % flat_file->count;
                n_filetypes += (j == 0) ? 1 : 0;
            }

	    off = disp + flat_file->indices[j] + 
		    n_filetypes*(ADIO_Offset)filetype_extent;
	    fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
	}

/* if atomicity is true, lock the region to be accessed */
        if (fd->atomicity) 
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        /* initial read for the read-modify-write */
        writebuf_off = offset;
        writebuf = (char *) ADIOI_Malloc(max_bufsize);
        writebuf_len = (unsigned)(ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
	lseek(fd->fd_sys, writebuf_off, SEEK_SET); 
	err = read(fd->fd_sys, writebuf, writebuf_len); 
        if (err == -1) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE,
					       myname, __LINE__,
					       MPI_ERR_IO,
					       "ADIOI_BGL_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.", 0);
	    return;
        } 

	if (buftype_is_contig && !filetype_is_contig) {

/* contiguous in memory, noncontiguous in file. should be the most
   common case. */

	    i_offset = 0;
	    j = st_index;
	    off = offset;
	    n_filetypes = st_n_filetypes;
	    fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
	    while (i_offset < bufsize) {
                if (fwr_size) { 
                    /* TYPE_UB and TYPE_LB can result in 
                       fwr_size = 0. save system call in such cases */ 
		    /* lseek(fd->fd_sys, off, SEEK_SET);
		    err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/

		    req_off = off;
		    req_len = fwr_size;
		    userbuf_off = i_offset;
		    ADIOI_BUFFERED_WRITE
		}
		i_offset += fwr_size;

                if (off + fwr_size < disp + flat_file->indices[j] +
                   flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
                       off += fwr_size;
                /* did not reach end of contiguous block in filetype.
                   no more I/O needed. off is incremented by fwr_size. */
                else {
                    j = (j+1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    while (flat_file->blocklens[j]==0) {
                        j = (j+1) % flat_file->count;
                        n_filetypes += (j == 0) ? 1 : 0;
                    }
		    off = disp + flat_file->indices[j] + 
                                    n_filetypes*(ADIO_Offset)filetype_extent;
		    fwr_size = ADIOI_MIN(flat_file->blocklens[j], 
				    bufsize-i_offset);
		}
	    }
	}
	else {
Beispiel #20
0
/* #define IO_DEBUG 1 */
void ADIOI_NOLOCK_WriteStrided(ADIO_File fd, const void *buf, int count,
			     MPI_Datatype datatype, int file_ptr_type,
			     ADIO_Offset offset, ADIO_Status *status, int
			     *error_code)
{
/* borrowed from old-school PVFS (v1) code. A driver for file systems that
 * cannot or do not support client-side buffering
 * Does not do data sieving optimization
 * Does contain write-combining optimization for noncontig in memory, contig in
 * file 
 */

/* offset is in units of etype relative to the filetype. */

    ADIOI_Flatlist_node *flat_buf, *flat_file;
    int j, k, st_index=0;
    off_t err_lseek=-1;
    ssize_t err=-1;
    ADIO_Offset fwr_size=0, bwr_size, new_bwr_size, new_fwr_size, i_offset, num;
    ADIO_Offset bufsize, n_etypes_in_filetype;
    ADIO_Offset n_filetypes, etype_in_filetype, size, sum;
    ADIO_Offset abs_off_in_filetype=0, size_in_filetype;
    MPI_Count filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent, indx;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset off, disp;
    int flag, err_flag=0;
    static char myname[] = "ADIOI_NOLOCK_WRITESTRIDED";
#ifdef IO_DEBUG
    int rank,nprocs;
#endif

    /* --BEGIN ERROR HANDLING-- */
    if (fd->atomicity) {
	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
					   myname, __LINE__,
					   MPI_ERR_INTERN,
					   "Atomic mode set in I/O function", 0);
	return;
    }
    /* --END ERROR HANDLING-- */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size_x(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
	MPIR_Status_set_bytes(status, datatype, 0);
#endif
	*error_code = MPI_SUCCESS; 
	return;
    }

#ifdef IO_DEBUG
    MPI_Comm_rank(fd->comm, &rank);
    MPI_Comm_size(fd->comm, &nprocs);
#endif

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size_x(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;
    
    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
    bufsize = buftype_size * count;

    if (!buftype_is_contig && filetype_is_contig) {
	char *combine_buf, *combine_buf_ptr;
	ADIO_Offset combine_buf_remain;
/* noncontiguous in memory, contiguous in file. use writev */

	ADIOI_Flatten_datatype(datatype);
	flat_buf = ADIOI_Flatlist;
	while (flat_buf->type != datatype) flat_buf = flat_buf->next;

	/* allocate our "combine buffer" to pack data into before writing */
	combine_buf = (char *) ADIOI_Malloc(fd->hints->ind_wr_buffer_size);
	combine_buf_ptr = combine_buf;
	combine_buf_remain = fd->hints->ind_wr_buffer_size;

	/* seek to the right spot in the file */
	if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
	    off = fd->disp + etype_size * offset;
	    lseek(fd->fd_sys, off, SEEK_SET);
	}
	else off = lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);

	/* loop through all the flattened pieces.  combine into buffer until
	 * no more will fit, then write.
	 *
	 * special case of a given piece being bigger than the combine buffer
	 * is also handled.
	 */
	for (j=0; j<count; j++) {
    int i;
	    for (i=0; i<flat_buf->count; i++) {
		if (flat_buf->blocklens[i] > combine_buf_remain && combine_buf != combine_buf_ptr) {
		    /* there is data in the buffer; write out the buffer so far */
#ifdef IO_DEBUG
		    printf("[%d/%d] nc mem c file (0) writing loc = %Ld sz = %Ld\n", 
				    rank, nprocs, off, 
				    fd->hints->ind_wr_buffer_size-combine_buf_remain);
#endif
#ifdef ADIOI_MPE_LOGGING
		    MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
		    err = write(fd->fd_sys,
				     combine_buf,
				     fd->hints->ind_wr_buffer_size - combine_buf_remain);
#ifdef ADIOI_MPE_LOGGING
		    MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
		    if (err == -1) err_flag = 1;

		    /* reset our buffer info */
		    combine_buf_ptr = combine_buf;
		    combine_buf_remain = fd->hints->ind_wr_buffer_size;
		}

		/* TODO: heuristic for when to not bother to use combine buffer? */
		if (flat_buf->blocklens[i] >= combine_buf_remain) {
		    /* special case: blocklen is as big as or bigger than the combine buf;
		     * write directly
		     */
#ifdef IO_DEBUG
		    printf("[%d/%d] nc mem c file (1) writing loc = %Ld sz = %d\n", 
				    rank, nprocs, off, 
				    flat_buf->blocklens[i]);
#endif
        ADIOI_Assert(flat_buf->blocklens[i] == (unsigned)flat_buf->blocklens[i]);
        ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)buf) + (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i]) == (ADIO_Offset)((MPIR_Upint)buf + (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i]));
#ifdef ADIOI_MPE_LOGGING
		    MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
		    err = write(fd->fd_sys,
				     ((char *) buf) + (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i],
				     (unsigned)flat_buf->blocklens[i]);
#ifdef ADIOI_MPE_LOGGING
		    MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
		    if (err == -1) err_flag = 1;
		    off += flat_buf->blocklens[i]; /* keep up with the final file offset too */
		}
		else {
		    /* copy more data into combine buffer */
		    memcpy(combine_buf_ptr,
			   ((char *) buf) + j*buftype_extent + flat_buf->indices[i],
			   flat_buf->blocklens[i]);
		    combine_buf_ptr += flat_buf->blocklens[i];
		    combine_buf_remain -= flat_buf->blocklens[i];
		    off += flat_buf->blocklens[i]; /* keep up with the final file offset too */
		}
	    }
	}

	if (combine_buf_ptr != combine_buf) {
	    /* data left in buffer to write */
#ifdef IO_DEBUG
	    printf("[%d/%d] nc mem c file (2) writing loc = %Ld sz = %Ld\n", 
			    rank, nprocs, off, 
			     fd->hints->ind_wr_buffer_size-combine_buf_remain);
#endif
#ifdef ADIOI_MPE_LOGGING
	    MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
	    err = write(fd->fd_sys,
			     combine_buf,
			     fd->hints->ind_wr_buffer_size - combine_buf_remain);
#ifdef ADIOI_MPE_LOGGING
	    MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
	    if (err == -1) err_flag = 1;
	}

	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

	ADIOI_Free(combine_buf);

	if (err_flag) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_IO, "**io",
					       "**io %s", strerror(errno));
	}
	else *error_code = MPI_SUCCESS;
    } /* if (!buftype_is_contig && filetype_is_contig)  ... */

    else {  /* noncontiguous in file */

/* split up into several contiguous writes */

/* find starting location in the file */

/* filetype already flattened in ADIO_Open */
	flat_file = ADIOI_Flatlist;
	while (flat_file->type != fd->filetype) flat_file = flat_file->next;
        disp = fd->disp;

	if (file_ptr_type == ADIO_INDIVIDUAL) {
	    offset = fd->fp_ind; /* in bytes */
            n_filetypes = -1;
            flag = 0;
            while (!flag) {
                int i;
                n_filetypes++;
                for (i=0; i<flat_file->count; i++) {
                    if (disp + flat_file->indices[i] + 
                        n_filetypes*(ADIO_Offset)filetype_extent + flat_file->blocklens[i] 
                            >= offset) {
                        st_index = i;
                        fwr_size = disp + flat_file->indices[i] + 
                                n_filetypes*(ADIO_Offset)filetype_extent
                                 + flat_file->blocklens[i] - offset;
                        flag = 1;
                        break;
                    }
                }
            }
	}
	else {
            int i;
	    n_etypes_in_filetype = filetype_size/etype_size;
	    n_filetypes = offset / n_etypes_in_filetype;
	    etype_in_filetype = offset % n_etypes_in_filetype;
	    size_in_filetype = etype_in_filetype * etype_size;
 
	    sum = 0;
	    for (i=0; i<flat_file->count; i++) {
		sum += flat_file->blocklens[i];
		if (sum > size_in_filetype) {
		    st_index = i;
		    fwr_size = sum - size_in_filetype;
		    abs_off_in_filetype = flat_file->indices[i] +
			size_in_filetype - (sum - flat_file->blocklens[i]);
		    break;
		}
	    }

	    /* abs. offset in bytes in the file */
            offset = disp + n_filetypes*(ADIO_Offset)filetype_extent + abs_off_in_filetype;
	}

	if (buftype_is_contig && !filetype_is_contig) {

/* contiguous in memory, noncontiguous in file. should be the most
   common case. */

	    i_offset = 0;
	    j = st_index;
	    off = offset;
	    fwr_size = ADIOI_MIN(fwr_size, bufsize);
	    while (i_offset < bufsize) {
                if (fwr_size) { 
                    /* TYPE_UB and TYPE_LB can result in 
                       fwr_size = 0. save system call in such cases */ 
#ifdef ADIOI_MPE_LOGGING
		    MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL);
#endif
#ifdef IO_DEBUG
		    printf("[%d/%d] c mem nc file writing loc = %Ld sz = %d\n", 
			    rank, nprocs, off, fwr_size);
#endif
		    err_lseek = lseek(fd->fd_sys, off, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
		    MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL);
#endif
		    if (err_lseek == -1) err_flag = 1;
#ifdef ADIOI_MPE_LOGGING
		    MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
#endif
		    err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);
#ifdef ADIOI_MPE_LOGGING
		    MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
#endif
		    if (err == -1) err_flag = 1;
		}
		i_offset += fwr_size;

                if (off + fwr_size < disp + flat_file->indices[j] +
                   flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
                       off += fwr_size;
                /* did not reach end of contiguous block in filetype.
                   no more I/O needed. off is incremented by fwr_size. */
                else {
		    if (j < (flat_file->count - 1)) j++;
		    else {
			j = 0;
			n_filetypes++;
		    }
		    off = disp + flat_file->indices[j] + 
                                        n_filetypes*(ADIO_Offset)filetype_extent;
		    fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
		}
	    }
	}
	else {
/* noncontiguous in memory as well as in file */

	    ADIOI_Flatten_datatype(datatype);
	    flat_buf = ADIOI_Flatlist;
	    while (flat_buf->type != datatype) flat_buf = flat_buf->next;

	    k = num = buf_count = 0;
	    indx = flat_buf->indices[0];
	    j = st_index;
	    off = offset;
	    bwr_size = flat_buf->blocklens[0];

	    while (num < bufsize) {
		size = ADIOI_MIN(fwr_size, bwr_size);
		if (size) {
#ifdef IO_DEBUG
		    printf("[%d/%d] nc mem nc file writing loc = %Ld sz = %d\n", 
				    rank, nprocs, off, size);
#endif
#ifdef ADIOI_MPE_LOGGING
		    MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
		    lseek(fd->fd_sys, off, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING 
		    MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
		    if (err == -1) err_flag = 1;
#ifdef ADIOI_MPE_LOGGING 
		    MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
                    ADIOI_Assert(size == (size_t) size);
                    ADIOI_Assert(off == (off_t) off);
		    err = write(fd->fd_sys, ((char *) buf) + indx, size);
#ifdef ADIOI_MPE_LOGGING
		    MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
		    if (err == -1) err_flag = 1;
		}

		new_fwr_size = fwr_size;
		new_bwr_size = bwr_size;

		if (size == fwr_size) {
/* reached end of contiguous block in file */
                    if (j < (flat_file->count - 1)) j++;
                    else {
                        j = 0;
                        n_filetypes++;
                    }

                    off = disp + flat_file->indices[j] + 
                                   n_filetypes*(ADIO_Offset)filetype_extent;

		    new_fwr_size = flat_file->blocklens[j];
		    if (size != bwr_size) {
			indx += size;
			new_bwr_size -= size;
		    }
		}

		if (size == bwr_size) {
/* reached end of contiguous block in memory */

		    k = (k + 1)%flat_buf->count;
		    buf_count++;
		    indx = buftype_extent*(buf_count/flat_buf->count) +
			flat_buf->indices[k]; 
		    new_bwr_size = flat_buf->blocklens[k];
		    if (size != fwr_size) {
			off += size;
			new_fwr_size -= size;
		    }
		}
		num += size;
		fwr_size = new_fwr_size;
                bwr_size = new_bwr_size;
	    }
	}

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
	if (err_flag) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_IO, "**io",
					       "**io %s", strerror(errno));
	}
	else *error_code = MPI_SUCCESS;
    }

    fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to 
   keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}
Beispiel #21
0
/* Sets error_code to MPI_SUCCESS if successful, or creates an error code
 * in the case of error.
 */
static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, const void *buf,
					 char *write_buf,
					 ADIOI_Flatlist_node *flat_buf,
					 ADIO_Offset *offset_list,
					 ADIO_Offset *len_list, int *send_size,
					 int *recv_size, ADIO_Offset off,
					 int size, int *count,
					 int *start_pos, 
					 int *sent_to_proc, int nprocs,
					 int myrank, int buftype_is_contig,
					 int contig_access_count,
					 int *striping_info,
					 ADIOI_Access *others_req,
					 int *send_buf_idx,
					 int *curr_to_proc, int *done_to_proc,
                                         int *hole, int iter,
                                         MPI_Aint buftype_extent,
					 int *buf_idx,
                          ADIO_Offset **srt_off, int **srt_len, int *srt_num,
                          int *error_code)
{
    int i, j, nprocs_recv, nprocs_send, err;
    char **send_buf = NULL;
    MPI_Request *requests, *send_req;
    MPI_Datatype *recv_types;
    MPI_Status *statuses, status;
    int sum_recv;
    int data_sieving = *hole;
    static char myname[] = "ADIOI_W_EXCHANGE_DATA";

    /* create derived datatypes for recv */
    nprocs_recv = 0;
    for (i = 0; i < nprocs; i++)
	if (recv_size[i])
	    nprocs_recv++;

    recv_types = (MPI_Datatype *) ADIOI_Malloc((nprocs_recv + 1) *
					       sizeof(MPI_Datatype));
    /* +1 to avoid a 0-size malloc */

    j = 0;
    for (i = 0; i < nprocs; i++) {
	if (recv_size[i]) {
	    ADIOI_Type_create_hindexed_x(count[i],
			      &(others_req[i].lens[start_pos[i]]),
			      &(others_req[i].mem_ptrs[start_pos[i]]),
			      MPI_BYTE, recv_types + j);
	    /* absolute displacements; use MPI_BOTTOM in recv */
	    MPI_Type_commit(recv_types + j);
	    j++;
	}
    }

    /* To avoid a read-modify-write,
     * check if there are holes in the data to be written.
     * For this, merge the (sorted) offset lists others_req using a heap-merge.
     */

    *srt_num = 0;
    for (i = 0; i < nprocs; i++)
        *srt_num += count[i];
    if (*srt_off)
        *srt_off = (ADIO_Offset *) ADIOI_Realloc(*srt_off, (*srt_num + 1) * sizeof(ADIO_Offset));
    else
        *srt_off = (ADIO_Offset *) ADIOI_Malloc((*srt_num + 1) * sizeof(ADIO_Offset));
    if (*srt_len)
        *srt_len = (int *) ADIOI_Realloc(*srt_len, (*srt_num + 1) * sizeof(int));
    else
        *srt_len = (int *) ADIOI_Malloc((*srt_num + 1) * sizeof(int));
    /* +1 to avoid a 0-size malloc */

    ADIOI_Heap_merge(others_req, count, *srt_off, *srt_len, start_pos,
		     nprocs, nprocs_recv, *srt_num);

    /* check if there are any holes */
    *hole = 0;
    for (i = 0; i < *srt_num - 1; i++) {
        if ((*srt_off)[i] + (*srt_len)[i] < (*srt_off)[i + 1]) {
            *hole = 1;
	    break;
	}
    }
    /* In some cases (see John Bent ROMIO REQ # 835), an odd interaction
     * between aggregation, nominally contiguous regions, and cb_buffer_size
     * should be handled with a read-modify-write (otherwise we will write out
     * more data than we receive from everyone else (inclusive), so override
     * hole detection
     */
    if (*hole == 0) {
        sum_recv = 0;
        for (i = 0; i < nprocs; i++)
            sum_recv += recv_size[i];
	if (size > sum_recv)
	    *hole = 1;
    }
    /* check the hint for data sieving */
    if (data_sieving == ADIOI_HINT_ENABLE && nprocs_recv && *hole) {
        ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
                        ADIO_EXPLICIT_OFFSET, off, &status, &err);
        // --BEGIN ERROR HANDLING--
        if (err != MPI_SUCCESS) {
            *error_code = MPIO_Err_create_code(err,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_IO,
                                               "**ioRMWrdwr", 0);
            ADIOI_Free(recv_types);
            return;
        }
        // --END ERROR HANDLING--
    }

    nprocs_send = 0;
    for (i = 0; i < nprocs; i++)
	if (send_size[i])
	    nprocs_send++;

    if (fd->atomicity) {
	/* bug fix from Wei-keng Liao and Kenin Coloma */
	requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) *
                                                sizeof(MPI_Request));
	send_req = requests;
    } else {
	requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1)*
                                                sizeof(MPI_Request));
	/* +1 to avoid a 0-size malloc */

	/* post receives */
	j = 0;
	for (i = 0; i < nprocs; i++) {
	    if (recv_size[i]) {
		MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i,
			  myrank + i + 100 * iter, fd->comm, requests + j);
		j++;
	    }
	}
	send_req = requests + nprocs_recv;
    }

    /* post sends.
     * if buftype_is_contig, data can be directly sent from
     * user buf at location given by buf_idx. else use send_buf.
     */
    if (buftype_is_contig) {
	j = 0;
	for (i = 0; i < nprocs; i++)
	    if (send_size[i]) {
                ADIOI_Assert(buf_idx[i] != -1);
		MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
			  MPI_BYTE, i, myrank + i + 100 * iter, fd->comm,
			  send_req + j);
		j++;
	    }
    } else
        if (nprocs_send) {
	/* buftype is not contig */
	send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
	for (i = 0; i < nprocs; i++)
	    if (send_size[i])
		send_buf[i] = (char *) ADIOI_Malloc(send_size[i]);

	ADIOI_LUSTRE_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list,
                                      len_list, send_size, send_req,
                                      sent_to_proc, nprocs, myrank,
                                      contig_access_count, striping_info,
                                      send_buf_idx, curr_to_proc, done_to_proc,
                                      iter, buftype_extent);
	/* the send is done in ADIOI_Fill_send_buffer */
    }

	/* bug fix from Wei-keng Liao and Kenin Coloma */
    if (fd->atomicity) {
	j = 0;
	for (i = 0; i < nprocs; i++) {
	    MPI_Status wkl_status;
	    if (recv_size[i]) {
		MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i,
			 myrank + i + 100 * iter, fd->comm, &wkl_status);
		j++;
	    }
	}
    }

    for (i = 0; i < nprocs_recv; i++)
	MPI_Type_free(recv_types + i);
    ADIOI_Free(recv_types);

	/* bug fix from Wei-keng Liao and Kenin Coloma */
	/* +1 to avoid a 0-size malloc */
    if (fd->atomicity) {
	statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) *
					       sizeof(MPI_Status));
    } else {
	statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) *
					       sizeof(MPI_Status));
    }

#ifdef NEEDS_MPI_TEST
    i = 0;
    if (fd->atomicity) {
	/* bug fix from Wei-keng Liao and Kenin Coloma */
	while (!i)
	    MPI_Testall(nprocs_send, send_req, &i, statuses);
    } else {
	while (!i)
	    MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses);
    }
#else
	/* bug fix from Wei-keng Liao and Kenin Coloma */
    if (fd->atomicity)
	MPI_Waitall(nprocs_send, send_req, statuses);
    else
	MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses);
#endif
    ADIOI_Free(statuses);
    ADIOI_Free(requests);
    if (!buftype_is_contig && nprocs_send) {
	for (i = 0; i < nprocs; i++)
	    if (send_size[i])
		ADIOI_Free(send_buf[i]);
	ADIOI_Free(send_buf);
    }
}
Beispiel #22
0
/* Avery Ching and Kenin Columa's reworked two-phase algorithm.  Key features
 * - persistent file domains
 * - an option to use alltoall instead of point-to-point
 */
void ADIOI_IOStridedColl(ADIO_File fd, void *buf, int count, int rdwr,
                         MPI_Datatype datatype, int file_ptr_type,
                         ADIO_Offset offset, ADIO_Status * status, int *error_code)
{
    ADIO_Offset min_st_offset = 0, max_end_offset = 0;
    ADIO_Offset st_end_offset[2];
    ADIO_Offset *all_st_end_offsets = NULL;
    int filetype_is_contig, buftype_is_contig, is_contig;
    ADIO_Offset off;
    int interleave_count = 0, i, nprocs, myrank, nprocs_for_coll;
    int cb_enable;
    ADIO_Offset bufsize;
    MPI_Aint extent;
#ifdef DEBUG2
    MPI_Aint bufextent;
#endif
    MPI_Count size;
    int agg_rank;

    ADIO_Offset agg_disp;       /* aggregated file offset */
    MPI_Datatype agg_dtype;     /* aggregated file datatype */

    int aggregators_done = 0;
    ADIO_Offset buffered_io_size = 0;

    int *alltoallw_disps;

    int *alltoallw_counts;
    int *client_alltoallw_counts;
    int *agg_alltoallw_counts;

    char *cb_buf = NULL;

    MPI_Datatype *client_comm_dtype_arr;        /* aggregator perspective */
    MPI_Datatype *agg_comm_dtype_arr;   /* client perspective */
    ADIO_Offset *client_comm_sz_arr;    /* aggregator perspective */
    ADIO_Offset *agg_comm_sz_arr;       /* client perspective */

    /* file views for each client and aggregator */
    view_state *client_file_view_state_arr = NULL;
    view_state *agg_file_view_state_arr = NULL;
    /* mem views for local process */
    view_state *my_mem_view_state_arr = NULL;

    MPI_Status *agg_comm_statuses = NULL;
    MPI_Request *agg_comm_requests = NULL;
    MPI_Status *client_comm_statuses = NULL;
    MPI_Request *client_comm_requests = NULL;
    int aggs_client_count = 0;
    int clients_agg_count = 0;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);
#ifdef DEBUG
    fprintf(stderr, "p%d: entering ADIOI_IOStridedColl\n", myrank);
#endif
#ifdef AGGREGATION_PROFILE
    if (rdwr == ADIOI_READ)
        MPE_Log_event(5010, 0, NULL);
    else
        MPE_Log_event(5012, 0, NULL);
#endif

    /* I need to check if there are any outstanding nonblocking writes
     * to the file, which could potentially interfere with the writes
     * taking place in this collective write call. Since this is not
     * likely to be common, let me do the simplest thing possible here:
     * Each process completes all pending nonblocking operations before
     * completing. */

    nprocs_for_coll = fd->hints->cb_nodes;

    if (rdwr == ADIOI_READ)
        cb_enable = fd->hints->cb_read;
    else
        cb_enable = fd->hints->cb_write;

    /* only check for interleaving if cb_read isn't disabled */
    if (cb_enable != ADIOI_HINT_DISABLE) {
        /* find the starting and ending byte of my I/O access */
        ADIOI_Calc_bounds(fd, count, datatype, file_ptr_type, offset,
                          &st_end_offset[0], &st_end_offset[1]);

        /* allocate an array of start/end pairs */
        all_st_end_offsets = (ADIO_Offset *)
            ADIOI_Malloc(2 * nprocs * sizeof(ADIO_Offset));
        MPI_Allgather(st_end_offset, 2, ADIO_OFFSET, all_st_end_offsets, 2, ADIO_OFFSET, fd->comm);

        min_st_offset = all_st_end_offsets[0];
        max_end_offset = all_st_end_offsets[1];

        for (i = 1; i < nprocs; i++) {
            /* are the accesses of different processes interleaved? */
            if ((all_st_end_offsets[i * 2] < all_st_end_offsets[i * 2 - 1]) &&
                (all_st_end_offsets[i * 2] <= all_st_end_offsets[i * 2 + 1]))
                interleave_count++;
            /* This is a rudimentary check for interleaving, but should
             * suffice for the moment. */

            min_st_offset = MPL_MIN(all_st_end_offsets[i * 2], min_st_offset);
            max_end_offset = MPL_MAX(all_st_end_offsets[i * 2 + 1], max_end_offset);
        }
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    if ((cb_enable == ADIOI_HINT_DISABLE || (!interleave_count && (cb_enable == ADIOI_HINT_AUTO)))
        && (fd->hints->cb_pfr != ADIOI_HINT_ENABLE)) {
        if (cb_enable != ADIOI_HINT_DISABLE) {
            ADIOI_Free(all_st_end_offsets);
        }

        if (buftype_is_contig && filetype_is_contig) {
            if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
                off = fd->disp + (fd->etype_size) * offset;
                if (rdwr == ADIOI_READ)
                    ADIO_ReadContig(fd, buf, count, datatype,
                                    ADIO_EXPLICIT_OFFSET, off, status, error_code);
                else
                    ADIO_WriteContig(fd, buf, count, datatype,
                                     ADIO_EXPLICIT_OFFSET, off, status, error_code);
            } else {
                if (rdwr == ADIOI_READ)
                    ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                                    0, status, error_code);
                else
                    ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                                     0, status, error_code);
            }
        } else {
            if (rdwr == ADIOI_READ)
                ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type,
                                 offset, status, error_code);
            else
                ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
                                  offset, status, error_code);
        }
        return;
    }

    MPI_Type_extent(datatype, &extent);
#ifdef DEBUG2
    bufextent = extent * count;
#endif
    MPI_Type_size_x(datatype, &size);
    bufsize = size * (MPI_Count) count;

    /* Calculate file realms */
    if ((fd->hints->cb_pfr != ADIOI_HINT_ENABLE) || (fd->file_realm_types == NULL))
        ADIOI_Calc_file_realms(fd, min_st_offset, max_end_offset);

    my_mem_view_state_arr = (view_state *)
        ADIOI_Calloc(1, nprocs * sizeof(view_state));
    agg_file_view_state_arr = (view_state *)
        ADIOI_Calloc(1, nprocs * sizeof(view_state));
    client_comm_sz_arr = (ADIO_Offset *)
        ADIOI_Calloc(1, nprocs * sizeof(ADIO_Offset));

    if (fd->is_agg) {
        client_file_view_state_arr = (view_state *)
            ADIOI_Calloc(1, nprocs * sizeof(view_state));
    } else {
        client_file_view_state_arr = NULL;
    }

    /* Alltoallw doesn't like a null array even if the counts are
     * zero.  If you do not include this code, it will fail. */
    client_comm_dtype_arr = (MPI_Datatype *)
        ADIOI_Calloc(1, nprocs * sizeof(MPI_Datatype));
    if (!fd->is_agg)
        for (i = 0; i < nprocs; i++)
            client_comm_dtype_arr[i] = MPI_BYTE;

    ADIOI_Exch_file_views(myrank, nprocs, file_ptr_type, fd, count,
                          datatype, offset, my_mem_view_state_arr,
                          agg_file_view_state_arr, client_file_view_state_arr);

    agg_comm_sz_arr = (ADIO_Offset *)
        ADIOI_Calloc(1, nprocs * sizeof(ADIO_Offset));
    agg_comm_dtype_arr = (MPI_Datatype *)
        ADIOI_Malloc(nprocs * sizeof(MPI_Datatype));
    if (fd->is_agg) {
        ADIOI_Build_agg_reqs(fd, rdwr, nprocs,
                             client_file_view_state_arr,
                             client_comm_dtype_arr, client_comm_sz_arr, &agg_disp, &agg_dtype);
        buffered_io_size = 0;
        for (i = 0; i < nprocs; i++) {
            if (client_comm_sz_arr[i] > 0)
                buffered_io_size += client_comm_sz_arr[i];
        }
    }
#ifdef USE_PRE_REQ
    else {
        /* Example use of ADIOI_Build_client_pre_req. to an
         * appropriate section */

        for (i = 0; i < fd->hints->cb_nodes; i++) {
            agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
#ifdef AGGREGATION_PROFILE
            MPE_Log_event(5040, 0, NULL);
#endif
            ADIOI_Build_client_pre_req(fd, agg_rank, (i + myrank) % fd->hints->cb_nodes,
                                       &(my_mem_view_state_arr[agg_rank]),
                                       &(agg_file_view_state_arr[agg_rank]),
                                       2 * 1024 * 1024, 64 * 1024);
#ifdef AGGREGATION_PROFILE
            MPE_Log_event(5041, 0, NULL);
#endif
        }
    }
#endif


    if (fd->is_agg)
        cb_buf = (char *) ADIOI_Malloc(fd->hints->cb_buffer_size);
    alltoallw_disps = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    alltoallw_counts = client_alltoallw_counts = (int *)
        ADIOI_Calloc(2 * nprocs, sizeof(int));
    agg_alltoallw_counts = &alltoallw_counts[nprocs];

    if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
        /* aggregators pre-post all Irecv's for incoming data from clients */
        if ((fd->is_agg) && (rdwr == ADIOI_WRITE))
            post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                 client_comm_dtype_arr,
                                 client_comm_sz_arr, &agg_comm_requests, &aggs_client_count);
    }
    /* Aggregators send amounts for data requested to clients */
    Exch_data_amounts(fd, nprocs, client_comm_sz_arr, agg_comm_sz_arr,
                      client_alltoallw_counts, agg_alltoallw_counts, &aggregators_done);

#ifdef DEBUG
    fprintf(stderr, "client_alltoallw_counts[ ");
    for (i = 0; i < nprocs; i++) {
        fprintf(stderr, "%d ", client_alltoallw_counts[i]);
    }
    fprintf(stderr, "]\n");
    fprintf(stderr, "agg_alltoallw_counts[ ");
    for (i = 0; i < nprocs; i++) {
        fprintf(stderr, "%d ", agg_alltoallw_counts[i]);
    }
    fprintf(stderr, "]\n");
#endif

    /* keep looping while aggregators still have I/O to do */
    while (aggregators_done != nprocs_for_coll) {
        if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
            /* clients should build datatypes for local memory locations
             * for data communication with aggregators and post
             * communication as the datatypes are built */

            client_comm_requests = (MPI_Request *)
                ADIOI_Calloc(fd->hints->cb_nodes, sizeof(MPI_Request));

            for (i = 0; i < fd->hints->cb_nodes; i++) {
                clients_agg_count = 0;
                agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
                if (agg_comm_sz_arr[agg_rank] > 0) {
                    ADIOI_Build_client_req(fd, agg_rank,
                                           (i + myrank) % fd->hints->cb_nodes,
                                           &(my_mem_view_state_arr[agg_rank]),
                                           &(agg_file_view_state_arr[agg_rank]),
                                           agg_comm_sz_arr[agg_rank],
                                           &(agg_comm_dtype_arr[agg_rank]));

#ifdef AGGREGATION_PROFILE
                    if (i == 0)
                        MPE_Log_event(5038, 0, NULL);
#endif
                    post_client_comm(fd, rdwr, agg_rank, buf,
                                     agg_comm_dtype_arr[agg_rank],
                                     agg_alltoallw_counts[agg_rank],
                                     &client_comm_requests[clients_agg_count]);
                    clients_agg_count++;
                }
            }
#ifdef AGGREGATION_PROFILE
            if (!clients_agg_count)
                MPE_Log_event(5039, 0, NULL);
#endif

            if (rdwr == ADIOI_READ) {
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_READ, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
#ifdef DEBUG
                fprintf(stderr, "expecting from [agg](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    MPI_Type_size_x(agg_comm_dtype_arr[i], &size);
                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                            size, agg_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "]\n");
                if (fd->is_agg) {
                    fprintf(stderr, "sending to [client](disp,size,cnt)=");
                    for (i = 0; i < nprocs; i++) {
                        if (fd->is_agg)
                            MPI_Type_size_x(client_comm_dtype_arr[i], &size);
                        else
                            size = -1;

                        fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                                size, client_alltoallw_counts[i]);
                        if (i != nprocs - 1)
                            fprintf(stderr, ",");
                    }
                    fprintf(stderr, "\n");
                }
                fflush(NULL);
#endif
                /* aggregators post all Isends for outgoing data to clients */
                if (fd->is_agg)
                    post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                         client_comm_dtype_arr,
                                         client_comm_sz_arr,
                                         &agg_comm_requests, &aggs_client_count);

                if (fd->is_agg && aggs_client_count) {
#ifdef MPI_STATUSES_IGNORE
                    agg_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    agg_comm_statuses = ADIOI_Malloc(aggs_client_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(aggs_client_count, agg_comm_requests, agg_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5033, 0, NULL);
#endif
                    ADIOI_Free(agg_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(agg_comm_statuses);
#endif
                }

                if (clients_agg_count) {
#ifdef MPI_STATUSES_IGNORE
                    client_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    client_comm_statuses = ADIOI_Malloc(clients_agg_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(clients_agg_count, client_comm_requests, client_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5039, 0, NULL);
#endif
                    ADIOI_Free(client_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(client_comm_statuses);
#endif
                }
#ifdef DEBUG2
                fprintf(stderr, "buffered_io_size = %lld\n", buffered_io_size);
                if (fd->is_agg && buffered_io_size) {
                    fprintf(stderr, "buf = [");
                    for (i = 0; i < bufextent; i++)
                        fprintf(stderr, "%c", ((char *) buf)[i]);
                    fprintf(stderr, "]\n");
                    fprintf(stderr, "cb_buf = [");
                    for (i = 0; i < buffered_io_size; i++)
                        fprintf(stderr, "%c", cb_buf[i]);
                    fprintf(stderr, "]\n");
                    fflush(NULL);
                }
#endif
            } else {    /* Write Case */
#ifdef DEBUG
                fprintf(stderr, "sending to [agg](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    MPI_Type_size_x(agg_comm_dtype_arr[i], &size);
                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                            size, agg_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "]\n");
                fprintf(stderr, "expecting from [client](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    if (fd->is_agg)
                        MPI_Type_size_x(client_comm_dtype_arr[i], &size);
                    else
                        size = -1;

                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                            size, client_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "\n");
                fflush(NULL);
#endif
#ifdef DEBUG
                fprintf(stderr, "buffered_io_size = %lld\n", buffered_io_size);
#endif

                if (clients_agg_count) {
#ifdef MPI_STATUSES_IGNORE
                    client_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    client_comm_statuses = ADIOI_Malloc(clients_agg_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(clients_agg_count, client_comm_requests, client_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5039, 0, NULL);
#endif
                    ADIOI_Free(client_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(client_comm_statuses);
#endif
                }
#ifdef DEBUG2
                if (bufextent) {
                    fprintf(stderr, "buf = [");
                    for (i = 0; i < bufextent; i++)
                        fprintf(stderr, "%c", ((char *) buf)[i]);
                    fprintf(stderr, "]\n");
                }
#endif

                if (fd->is_agg && buffered_io_size) {
                    ADIOI_Assert(aggs_client_count != 0);
                    /* make sure we actually have the data to write out */
#ifdef MPI_STATUSES_IGNORE
                    agg_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    agg_comm_statuses = (MPI_Status *)
                        ADIOI_Malloc(aggs_client_count * sizeof(MPI_Status));
#endif

                    MPI_Waitall(aggs_client_count, agg_comm_requests, agg_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5033, 0, NULL);
#endif
                    ADIOI_Free(agg_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(agg_comm_statuses);
#endif
#ifdef DEBUG2
                    fprintf(stderr, "cb_buf = [");
                    for (i = 0; i < buffered_io_size; i++)
                        fprintf(stderr, "%c", cb_buf[i]);
                    fprintf(stderr, "]\n");
                    fflush(NULL);
#endif
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_WRITE, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }

            }
        } else {
            /* Alltoallw version of everything */
            ADIOI_Build_client_reqs(fd, nprocs, my_mem_view_state_arr,
                                    agg_file_view_state_arr, agg_comm_sz_arr, agg_comm_dtype_arr);

            if (rdwr == ADIOI_READ) {
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_READ, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5032, 0, NULL);
#endif
                MPI_Alltoallw(cb_buf, client_alltoallw_counts, alltoallw_disps,
                              client_comm_dtype_arr,
                              buf, agg_alltoallw_counts, alltoallw_disps,
                              agg_comm_dtype_arr, fd->comm);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5033, 0, NULL);
#endif
            } else {    /* Write Case */
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5032, 0, NULL);
#endif
                MPI_Alltoallw(buf, agg_alltoallw_counts, alltoallw_disps,
                              agg_comm_dtype_arr,
                              cb_buf, client_alltoallw_counts, alltoallw_disps,
                              client_comm_dtype_arr, fd->comm);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5033, 0, NULL);
#endif
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_WRITE, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
            }
        }

        /* Free (uncommit) datatypes for reuse */
        if (fd->is_agg) {
            if (buffered_io_size > 0) {
                for (i = 0; i < nprocs; i++) {
                    if (client_comm_sz_arr[i] > 0)
                        MPI_Type_free(&client_comm_dtype_arr[i]);
                }
            }
        }
        for (i = 0; i < nprocs; i++) {
            if (agg_comm_sz_arr[i] > 0)
                MPI_Type_free(&agg_comm_dtype_arr[i]);
        }

        /* figure out next set up requests */
        if (fd->is_agg) {
            ADIOI_Build_agg_reqs(fd, rdwr, nprocs,
                                 client_file_view_state_arr,
                                 client_comm_dtype_arr, client_comm_sz_arr, &agg_disp, &agg_dtype);
            buffered_io_size = 0;
            for (i = 0; i < nprocs; i++) {
                if (client_comm_sz_arr[i] > 0)
                    buffered_io_size += client_comm_sz_arr[i];
            }
        }
#ifdef USE_PRE_REQ
        else {
            /* Example use of ADIOI_Build_client_pre_req. to an
             * appropriate section */
            for (i = 0; i < fd->hints->cb_nodes; i++) {
                agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5040, 0, NULL);
#endif
                ADIOI_Build_client_pre_req(fd, agg_rank, (i + myrank) % fd->hints->cb_nodes,
                                           &(my_mem_view_state_arr[agg_rank]),
                                           &(agg_file_view_state_arr[agg_rank]),
                                           2 * 1024 * 1024, 64 * 1024);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5041, 0, NULL);
#endif
            }
        }
#endif

        /* aggregators pre-post all Irecv's for incoming data from
         * clients.  if nothing is needed, agg_comm_requests is not
         * allocated */
        if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
            if ((fd->is_agg) && (rdwr == ADIOI_WRITE))
                post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                     client_comm_dtype_arr,
                                     client_comm_sz_arr, &agg_comm_requests, &aggs_client_count);
        }

        /* Aggregators send amounts for data requested to clients */
        Exch_data_amounts(fd, nprocs, client_comm_sz_arr, agg_comm_sz_arr,
                          client_alltoallw_counts, agg_alltoallw_counts, &aggregators_done);

    }

    /* Clean up */

    if (fd->hints->cb_pfr != ADIOI_HINT_ENABLE) {
        /* AAR, FSIZE, and User provided uniform File realms */
        if (1) {
            MPI_Type_free(&fd->file_realm_types[0]);
        } else {
            for (i = 0; i < fd->hints->cb_nodes; i++) {
                ADIOI_Datatype_iscontig(fd->file_realm_types[i], &is_contig);
                MPI_Type_free(&fd->file_realm_types[i]);
            }
        }
        ADIOI_Free(fd->file_realm_types);
        ADIOI_Free(fd->file_realm_st_offs);
    }


    if (fd->is_agg) {
        if (buffered_io_size > 0)
            MPI_Type_free(&agg_dtype);
        for (i = 0; i < nprocs; i++) {
            MPI_Type_free(&client_comm_dtype_arr[i]);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p->indices);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p->blocklens);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p);
        }
        ADIOI_Free(client_file_view_state_arr);
        ADIOI_Free(cb_buf);
    }
    for (i = 0; i < nprocs; i++)
        if (agg_comm_sz_arr[i] > 0)
            MPI_Type_free(&agg_comm_dtype_arr[i]);

    ADIOI_Free(client_comm_sz_arr);
    ADIOI_Free(client_comm_dtype_arr);
    ADIOI_Free(my_mem_view_state_arr);
    ADIOI_Free(agg_file_view_state_arr);
    ADIOI_Free(agg_comm_sz_arr);
    ADIOI_Free(agg_comm_dtype_arr);
    ADIOI_Free(alltoallw_disps);
    ADIOI_Free(alltoallw_counts);
    ADIOI_Free(all_st_end_offsets);

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status.  The right way is
     * to keep track of how much data was actually read and placed in
     * buf during collective I/O. */
#endif
    fd->fp_sys_posn = -1;       /* set it to null. */
#ifdef AGGREGATION_PROFILE
    if (rdwr == ADIOI_READ)
        MPE_Log_event(5011, 0, NULL);
    else
        MPE_Log_event(5013, 0, NULL);
#endif
}
/* ADIOI_Calc_my_req() - calculate what portions of the access requests
 * of this process are located in the file domains of various processes
 * (including this one)
 */
void ADIOI_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list, 
		       int contig_access_count, ADIO_Offset 
		       min_st_offset, ADIO_Offset *fd_start,
		       ADIO_Offset *fd_end, ADIO_Offset fd_size,
                       int nprocs,
                       int *count_my_req_procs_ptr,
		       int **count_my_req_per_proc_ptr,
		       ADIOI_Access **my_req_ptr,
		       int **buf_idx_ptr)
/* Possibly reconsider if buf_idx's are ok as int's, or should they be aints/offsets? 
   They are used as memory buffer indices so it seems like the 2G limit is in effect */
{
    int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
    int i, l, proc;
    ADIO_Offset fd_len, rem_len, curr_idx, off;
    ADIOI_Access *my_req;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5024, 0, NULL);
#endif

    *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int)); 
    count_my_req_per_proc = *count_my_req_per_proc_ptr;
/* count_my_req_per_proc[i] gives the no. of contig. requests of this
   process in process i's file domain. calloc initializes to zero.
   I'm allocating memory of size nprocs, so that I can do an 
   MPI_Alltoall later on.*/

    buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int));
/* buf_idx is relevant only if buftype_is_contig.
   buf_idx[i] gives the index into user_buf where data received
   from proc. i should be placed. This allows receives to be done
   without extra buffer. This can't be done if buftype is not contig. */
   
    /* initialize buf_idx to -1 */
    for (i=0; i < nprocs; i++) buf_idx[i] = -1;

    /* one pass just to calculate how much space to allocate for my_req;
     * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
     */
    for (i=0; i < contig_access_count; i++) {
	/* short circuit offset/len processing if len == 0 
	 * 	(zero-byte  read/write */
	if (len_list[i] == 0) 
		continue;
	off = offset_list[i];
	fd_len = len_list[i];
	/* note: we set fd_len to be the total size of the access.  then
	 * ADIOI_Calc_aggregator() will modify the value to return the 
	 * amount that was available from the file domain that holds the
	 * first part of the access.
	 */
	proc = ADIOI_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, 
				     fd_start, fd_end);
	count_my_req_per_proc[proc]++;

	/* figure out how much data is remaining in the access (i.e. wasn't 
	 * part of the file domain that had the starting byte); we'll take 
	 * care of this data (if there is any) in the while loop below.
	 */
	rem_len = len_list[i] - fd_len;

	while (rem_len != 0) {
	    off += fd_len; /* point to first remaining byte */
	    fd_len = rem_len; /* save remaining size, pass to calc */
	    proc = ADIOI_Calc_aggregator(fd, off, min_st_offset, &fd_len, 
					 fd_size, fd_start, fd_end);

	    count_my_req_per_proc[proc]++;
	    rem_len -= fd_len; /* reduce remaining length by amount from fd */
	}
    }

/* now allocate space for my_req, offset, and len */

    *my_req_ptr = (ADIOI_Access *)
	ADIOI_Malloc(nprocs*sizeof(ADIOI_Access)); 
    my_req = *my_req_ptr;

    count_my_req_procs = 0;
    for (i=0; i < nprocs; i++) {
	if (count_my_req_per_proc[i]) {
	    my_req[i].offsets = (ADIO_Offset *)
		ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
	    my_req[i].lens = (int *)
		ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(int));
	    count_my_req_procs++;
	}	    
	my_req[i].count = 0;  /* will be incremented where needed
				      later */
    }

/* now fill in my_req */
    curr_idx = 0;
    for (i=0; i<contig_access_count; i++) { 
	/* short circuit offset/len processing if len == 0 
	 * 	(zero-byte  read/write */
	if (len_list[i] == 0)
		continue;
	off = offset_list[i];
	fd_len = len_list[i];
	proc = ADIOI_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, 
				     fd_start, fd_end);

	/* for each separate contiguous access from this process */
	if (buf_idx[proc] == -1) 
  {
    ADIOI_Assert(curr_idx == (int) curr_idx);
    buf_idx[proc] = (int) curr_idx;
  }

	l = my_req[proc].count;
	curr_idx += fd_len; 

	rem_len = len_list[i] - fd_len;

	/* store the proc, offset, and len information in an array
         * of structures, my_req. Each structure contains the 
         * offsets and lengths located in that process's FD, 
	 * and the associated count. 
	 */
	my_req[proc].offsets[l] = off;
  ADIOI_Assert(fd_len == (int) fd_len);
	my_req[proc].lens[l] = (int) fd_len;
	my_req[proc].count++;

	while (rem_len != 0) {
	    off += fd_len;
	    fd_len = rem_len;
	    proc = ADIOI_Calc_aggregator(fd, off, min_st_offset, &fd_len, 
					 fd_size, fd_start, fd_end);

	    if (buf_idx[proc] == -1) 
      {
        ADIOI_Assert(curr_idx == (int) curr_idx);
        buf_idx[proc] = (int) curr_idx;
      }

	    l = my_req[proc].count;
	    curr_idx += fd_len;
	    rem_len -= fd_len;

	    my_req[proc].offsets[l] = off;
      ADIOI_Assert(fd_len == (int) fd_len);
	    my_req[proc].lens[l] = (int) fd_len;
	    my_req[proc].count++;
	}
    }

#ifdef AGG_DEBUG
    for (i=0; i<nprocs; i++) {
	if (count_my_req_per_proc[i] > 0) {
	    FPRINTF(stdout, "data needed from %d (count = %d):\n", i, 
		    my_req[i].count);
	    for (l=0; l < my_req[i].count; l++) {
		FPRINTF(stdout, "   off[%d] = %lld, len[%d] = %d\n", l,
			my_req[i].offsets[l], l, my_req[i].lens[l]);
	    }
	FPRINTF(stdout, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
	}
    }
#endif

    *count_my_req_procs_ptr = count_my_req_procs;
    *buf_idx_ptr = buf_idx;
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5025, 0, NULL);
#endif
}
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
			       MPI_Datatype datatype, int file_ptr_type,
			       ADIO_Offset offset, ADIO_Status * status,
			       int *error_code)
{
    /* offset is in units of etype relative to the filetype. */
    ADIOI_Flatlist_node *flat_buf, *flat_file;
    ADIO_Offset i_offset, sum, size_in_filetype;
    int i, j, k, st_index=0;
    int n_etypes_in_filetype;
    ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes;
    ADIO_Offset abs_off_in_filetype=0;
    int filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off;
    ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
    char *writebuf;
    unsigned bufsize, writebuf_len, write_sz;
    ADIO_Status status1;
    ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len;
    int stripe_size;
    static char myname[] = "ADIOI_LUSTRE_WriteStrided";

    if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
	/* if user has disabled data sieving on writes, use naive
	 * approach instead.
	 */
	ADIOI_GEN_WriteStrided_naive(fd,
				     buf,
				     count,
				     datatype,
				     file_ptr_type,
				     offset, status, error_code);
	return;
    }

    *error_code = MPI_SUCCESS;	/* changed below if error */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size(fd->filetype, &filetype_size);
    if (!filetype_size) {
#ifdef HAVE_STATUS_SET_BYTES
	MPIR_Status_set_bytes(status, datatype, 0);
#endif
	*error_code = MPI_SUCCESS;
	return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
    bufsize = buftype_size * count;

    /* get striping info */
    stripe_size = fd->hints->striping_unit;

    /* Different buftype to different filetype */
    if (!buftype_is_contig && filetype_is_contig) {
        /* noncontiguous in memory, contiguous in file. */
	ADIOI_Flatten_datatype(datatype);
	flat_buf = ADIOI_Flatlist;
	while (flat_buf->type != datatype)
	    flat_buf = flat_buf->next;

	off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
            fd->disp + (ADIO_Offset)etype_size * offset;

	start_off = off;
	end_offset = start_off + bufsize - 1;
        /* write stripe size buffer each time */
	writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
        writebuf_off = 0;
        writebuf_len = 0;

        /* if atomicity is true, lock the region to be accessed */
	if (fd->atomicity)
	    ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);

	for (j = 0; j < count; j++) {
	    for (i = 0; i < flat_buf->count; i++) {
                userbuf_off = (ADIO_Offset)j * (ADIO_Offset)buftype_extent +
                    flat_buf->indices[i];
		req_off = off;
		req_len = flat_buf->blocklens[i];
		ADIOI_BUFFERED_WRITE_WITHOUT_READ
		off += flat_buf->blocklens[i];
	    }
        }

	/* write the buffer out finally */
	ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
			 ADIO_EXPLICIT_OFFSET, writebuf_off, &status1,
			 error_code);

	if (fd->atomicity)
	    ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
	if (*error_code != MPI_SUCCESS) {
            ADIOI_Free(writebuf);
	    return;
        }
	ADIOI_Free(writebuf);
	if (file_ptr_type == ADIO_INDIVIDUAL)
	    fd->fp_ind = off;
    } else {
        /* noncontiguous in file */
        /* filetype already flattened in ADIO_Open */
	flat_file = ADIOI_Flatlist;
	while (flat_file->type != fd->filetype)
	    flat_file = flat_file->next;
	disp = fd->disp;

	if (file_ptr_type == ADIO_INDIVIDUAL) {
            /* Wei-keng reworked type processing to be a bit more efficient */
            offset       = fd->fp_ind - disp;
            n_filetypes  = (offset - flat_file->indices[0]) / filetype_extent;
            offset      -= (ADIO_Offset)n_filetypes * filetype_extent;
            /* now offset is local to this extent */

            /* find the block where offset is located, skip blocklens[i]==0 */
            for (i=0; i<flat_file->count; i++) {
                ADIO_Offset dist;
                if (flat_file->blocklens[i] == 0) continue;
                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
                /* fwr_size is from offset to the end of block i */
                if (dist == 0) {
                    i++;
                    offset   = flat_file->indices[i];
                    fwr_size = flat_file->blocklens[i];
			break;
		    }
                if (dist > 0) {
                    fwr_size = dist;
                    break;
		}
	    }
            st_index = i;  /* starting index in flat_file->indices[] */
            offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
        }
        else {
            n_etypes_in_filetype = filetype_size/etype_size;
            n_filetypes = offset / n_etypes_in_filetype;
            etype_in_filetype = offset % n_etypes_in_filetype;
	    size_in_filetype = etype_in_filetype * etype_size;

	    sum = 0;
	    for (i = 0; i < flat_file->count; i++) {
		sum += flat_file->blocklens[i];
		if (sum > size_in_filetype) {
		    st_index = i;
		    fwr_size = sum - size_in_filetype;
		    abs_off_in_filetype = flat_file->indices[i] +
			size_in_filetype - (sum - flat_file->blocklens[i]);
		    break;
		}
	    }

	    /* abs. offset in bytes in the file */
	    offset = disp + (ADIO_Offset) n_filetypes *filetype_extent +
		     abs_off_in_filetype;
	}

	start_off = offset;

        /* Wei-keng Liao:write request is within single flat_file
         * contig block*/
        /* this could happen, for example, with subarray types that are
         * actually fairly contiguous */
        if (buftype_is_contig && bufsize <= fwr_size) {
            req_off = start_off;
            req_len = bufsize;
            end_offset = start_off + bufsize - 1;
	    writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
	    memset(writebuf, -1, ADIOI_MIN(bufsize, stripe_size));
            writebuf_off = 0;
            writebuf_len = 0;
            userbuf_off = 0;
            ADIOI_BUFFERED_WRITE_WITHOUT_READ
            /* write the buffer out finally */
            ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
                             ADIO_EXPLICIT_OFFSET, writebuf_off, &status1,
                             error_code);

            if (file_ptr_type == ADIO_INDIVIDUAL) {
                /* update MPI-IO file pointer to point to the first byte
                 * that can be accessed in the fileview. */
                fd->fp_ind = offset + bufsize;
                if (bufsize == fwr_size) {
                    do {
                        st_index++;
                        if (st_index == flat_file->count) {
                            st_index = 0;
                            n_filetypes++;
                        }
                    } while (flat_file->blocklens[st_index] == 0);
                    fd->fp_ind = disp + flat_file->indices[st_index]
                        + (ADIO_Offset)n_filetypes*filetype_extent;
                }
            }
            fd->fp_sys_posn = -1;   /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
            MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
            ADIOI_Free(writebuf);
            return;
        }

	    /* Calculate end_offset, the last byte-offset that will be accessed.
           e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/

	    st_fwr_size = fwr_size;
	    st_n_filetypes = n_filetypes;
        i_offset = 0;
	    j = st_index;
	    off = offset;
	    fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
        while (i_offset < bufsize) {
            i_offset += fwr_size;
		end_offset = off + fwr_size - 1;

            j = (j+1) % flat_file->count;
            n_filetypes += (j == 0) ? 1 : 0;
            while (flat_file->blocklens[j]==0) {
                j = (j+1) % flat_file->count;
                n_filetypes += (j == 0) ? 1 : 0;
		}

		off = disp + flat_file->indices[j] +
                n_filetypes*(ADIO_Offset)filetype_extent;
            fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
	    }

/* if atomicity is true, lock the region to be accessed */
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

	    writebuf_off = 0;
	    writebuf_len = 0;
	    writebuf = (char *) ADIOI_Malloc(stripe_size);
	    memset(writebuf, -1, stripe_size);

	    if (buftype_is_contig && !filetype_is_contig) {

/* contiguous in memory, noncontiguous in file. should be the most
		   common case. */

            i_offset = 0;
		j = st_index;
		off = offset;
		n_filetypes = st_n_filetypes;
		fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
            while (i_offset < bufsize) {
		    if (fwr_size) {
			/* TYPE_UB and TYPE_LB can result in
			   fwr_size = 0. save system call in such cases */
                    /* lseek(fd->fd_sys, off, SEEK_SET);
                       err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/

			req_off = off;
			req_len = fwr_size;
                    userbuf_off = i_offset;
			ADIOI_BUFFERED_WRITE
                    }
                i_offset += fwr_size;

		    if (off + fwr_size < disp + flat_file->indices[j] +
		                         flat_file->blocklens[j] +
                    n_filetypes*(ADIO_Offset)filetype_extent)
		        off += fwr_size;
		    /* did not reach end of contiguous block in filetype.
		    no more I/O needed. off is incremented by fwr_size. */
		    else {
                    j = (j+1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    while (flat_file->blocklens[j]==0) {
                        j = (j+1) % flat_file->count;
                        n_filetypes += (j == 0) ? 1 : 0;
			}
			off = disp + flat_file->indices[j] +
                        n_filetypes*(ADIO_Offset)filetype_extent;
			fwr_size = ADIOI_MIN(flat_file->blocklens[j],
                                         bufsize-i_offset);
		    }
		}
        }
        else {
Beispiel #25
0
void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
                            * flat_buf, char **send_buf, ADIO_Offset
                            * offset_list, ADIO_Offset * len_list, int *send_size,
                            MPI_Request * requests, int *sent_to_proc,
                            int nprocs, int myrank,
                            int contig_access_count,
                            ADIO_Offset min_st_offset, ADIO_Offset fd_size,
                            ADIO_Offset * fd_start, ADIO_Offset * fd_end,
                            int *send_buf_idx, int *curr_to_proc,
                            int *done_to_proc, int iter, MPI_Aint buftype_extent)
{
/* this function is only called if buftype is not contig */

    int i, p, flat_buf_idx;
    ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size;
    int jj, n_buftypes;
    ADIO_Offset off, len, rem_len, user_buf_idx;

/*  curr_to_proc[p] = amount of data sent to proc. p that has already
    been accounted for so far
    done_to_proc[p] = amount of data already sent to proc. p in
    previous iterations
    user_buf_idx = current location in user buffer
    send_buf_idx[p] = current location in send_buf of proc. p  */

    for (i = 0; i < nprocs; i++) {
        send_buf_idx[i] = curr_to_proc[i] = 0;
        done_to_proc[i] = sent_to_proc[i];
    }
    jj = 0;

    user_buf_idx = flat_buf->indices[0];
    flat_buf_idx = 0;
    n_buftypes = 0;
    flat_buf_sz = flat_buf->blocklens[0];

    /* flat_buf_idx = current index into flattened buftype
     * flat_buf_sz = size of current contiguous component in
     * flattened buf */

    for (i = 0; i < contig_access_count; i++) {
        off = offset_list[i];
        rem_len = len_list[i];

        /*this request may span the file domains of more than one process */
        while (rem_len != 0) {
            len = rem_len;
            /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no
             * longer than the single region that processor "p" is responsible
             * for.
             */
            p = ADIOI_Calc_aggregator(fd, off, min_st_offset, &len, fd_size, fd_start, fd_end);

            if (send_buf_idx[p] < send_size[p]) {
                if (curr_to_proc[p] + len > done_to_proc[p]) {
                    if (done_to_proc[p] > curr_to_proc[p]) {
                        size = MPL_MIN(curr_to_proc[p] + len -
                                       done_to_proc[p], send_size[p] - send_buf_idx[p]);
                        buf_incr = done_to_proc[p] - curr_to_proc[p];
                        ADIOI_BUF_INCR
                            ADIOI_Assert((curr_to_proc[p] + len - done_to_proc[p]) ==
                                         (unsigned) (curr_to_proc[p] + len - done_to_proc[p]));
                        buf_incr = curr_to_proc[p] + len - done_to_proc[p];
                        ADIOI_Assert((done_to_proc[p] + size) ==
                                     (unsigned) (done_to_proc[p] + size));
                        /* ok to cast: bounded by cb buffer size */
                        curr_to_proc[p] = done_to_proc[p] + (int) size;
                    ADIOI_BUF_COPY} else {
                        size = MPL_MIN(len, send_size[p] - send_buf_idx[p]);
                        buf_incr = len;
                        ADIOI_Assert((curr_to_proc[p] + size) ==
                                     (unsigned) ((ADIO_Offset) curr_to_proc[p] + size));
                        curr_to_proc[p] += size;
                    ADIOI_BUF_COPY}
                    if (send_buf_idx[p] == send_size[p]) {
                        MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p,
                                  myrank + p + 100 * iter, fd->comm, requests + jj);
                        jj++;
                    }
                } else {
                    ADIOI_Assert((curr_to_proc[p] + len) ==
                                 (unsigned) ((ADIO_Offset) curr_to_proc[p] + len));
                    curr_to_proc[p] += len;
                    buf_incr = len;
                ADIOI_BUF_INCR}
            } else {
Beispiel #26
0
static void ADIOI_Iread_and_exch_l1_begin(ADIOI_NBC_Request *nbc_req,
                                          int *error_code)
{
    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
    ADIO_File fd;
    int nprocs;
    ADIOI_Access *others_req;

    int i, j;
    ADIO_Offset real_off, req_off;
    char *read_buf;
    int *curr_offlen_ptr, *count, *send_size;
    int *partial_send, *start_pos;
    ADIO_Offset size, real_size, for_next_iter;
    int req_len, flag;

    ADIOI_R_Iexchange_data_vars *red_vars = NULL;

    /* loop exit condition */
    if (vars->m >= vars->ntimes) {
        ADIOI_Iread_and_exch_reset(nbc_req, error_code);
        return;
    }

    fd = vars->fd;
    nprocs = vars->nprocs;
    others_req = vars->others_req;

    read_buf = vars->read_buf;
    curr_offlen_ptr = vars->curr_offlen_ptr;
    count = vars->count;
    send_size = vars->send_size;
    partial_send = vars->partial_send;
    start_pos = vars->start_pos;

    /* read buf of size coll_bufsize (or less) */
    /* go through all others_req and check if any are satisfied
       by the current read */

    /* since MPI guarantees that displacements in filetypes are in
       monotonically nondecreasing order, I can maintain a pointer
       (curr_offlen_ptr) to
       current off-len pair for each process in others_req and scan
       further only from there. There is still a problem of filetypes
       such as:  (1, 2, 3 are not process nos. They are just numbers for
       three chunks of data, specified by a filetype.)

       1  -------!--
       2    -----!----
       3       --!-----

       where ! indicates where the current read_size limitation cuts
       through the filetype.  I resolve this by reading up to !, but
       filling the communication buffer only for 1. I copy the portion
       left over for 2 into a tmp_buf for use in the next
       iteration. i.e., 2 and 3 will be satisfied in the next
       iteration. This simplifies filling in the user's buf at the
       other end, as only one off-len pair with incomplete data
       will be sent. I also don't need to send the individual
       offsets and lens along with the data, as the data is being
       sent in a particular order. */

    /* off = start offset in the file for the data actually read in
             this iteration
       size = size of data read corresponding to off
       real_off = off minus whatever data was retained in memory from
             previous iteration for cases like 2, 3 illustrated above
       real_size = size plus the extra corresponding to real_off
       req_off = off in file for a particular contiguous request
                 minus what was satisfied in previous iteration
       req_size = size corresponding to req_off */

    size = ADIOI_MIN((unsigned)vars->coll_bufsize,
                     vars->end_loc - vars->st_loc + 1 - vars->done);
    real_off = vars->off - vars->for_curr_iter;
    real_size = size + vars->for_curr_iter;

    vars->size = size;
    vars->real_size = real_size;

    for (i = 0; i < nprocs; i++) count[i] = send_size[i] = 0;
    for_next_iter = 0;

    for (i = 0; i < nprocs; i++) {
#ifdef RDCOLL_DEBUG
        DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n",
                    vars->myrank, i, others_req[i].count);
#endif
        if (others_req[i].count) {
            start_pos[i] = curr_offlen_ptr[i];
            for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) {
                if (partial_send[i]) {
                    /* this request may have been partially
                       satisfied in the previous iteration. */
                    req_off = others_req[i].offsets[j] + partial_send[i];
                    req_len = others_req[i].lens[j] - partial_send[i];
                    partial_send[i] = 0;
                    /* modify the off-len pair to reflect this change */
                    others_req[i].offsets[j] = req_off;
                    others_req[i].lens[j] = req_len;
                }
                else {
                    req_off = others_req[i].offsets[j];
                    req_len = others_req[i].lens[j];
                }
                if (req_off < real_off + real_size) {
                    count[i]++;
                    ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf) + req_off - real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf + req_off - real_off));
                    MPI_Address(read_buf + req_off - real_off,
                                &(others_req[i].mem_ptrs[j]));
                    ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));
                    send_size[i] += (int)(ADIOI_MIN(real_off + real_size - req_off,
                                                    (ADIO_Offset)(unsigned)req_len));

                    if (real_off + real_size - req_off < (ADIO_Offset)(unsigned)req_len) {
                        partial_send[i] = (int)(real_off + real_size - req_off);
                        if ((j+1 < others_req[i].count) &&
                            (others_req[i].offsets[j+1] < real_off + real_size)) {
                            /* this is the case illustrated in the
                               figure above. */
                            for_next_iter = ADIOI_MAX(for_next_iter,
                                    real_off + real_size - others_req[i].offsets[j+1]);
                            /* max because it must cover requests
                               from different processes */
                        }
                        break;
                    }
                }
                else break;
            }
            curr_offlen_ptr[i] = j;
        }
    }
    vars->for_next_iter = for_next_iter;

    flag = 0;
    for (i = 0; i < nprocs; i++)
        if (count[i]) flag = 1;

    /* create a struct for ADIOI_R_Iexchange_data() */
    red_vars = (ADIOI_R_Iexchange_data_vars *)ADIOI_Calloc(
            1, sizeof(ADIOI_R_Iexchange_data_vars));
    nbc_req->data.rd.red_vars = red_vars;
    red_vars->fd = vars->fd;
    red_vars->buf = vars->buf;
    red_vars->flat_buf = vars->flat_buf;
    red_vars->offset_list = vars->offset_list;
    red_vars->len_list = vars->len_list;
    red_vars->send_size = vars->send_size;
    red_vars->recv_size = vars->recv_size;
    red_vars->count = vars->count;
    red_vars->start_pos = vars->start_pos;
    red_vars->partial_send = vars->partial_send;
    red_vars->recd_from_proc = vars->recd_from_proc;
    red_vars->nprocs = vars->nprocs;
    red_vars->myrank = vars->myrank;
    red_vars->buftype_is_contig = vars->buftype_is_contig;
    red_vars->contig_access_count = vars->contig_access_count;
    red_vars->min_st_offset = vars->min_st_offset;
    red_vars->fd_size = vars->fd_size;
    red_vars->fd_start = vars->fd_start;
    red_vars->fd_end = vars->fd_end;
    red_vars->others_req = vars->others_req;
    red_vars->iter = vars->m;
    red_vars->buftype_extent = vars->buftype_extent;
    red_vars->buf_idx = vars->buf_idx;
    red_vars->next_fn = ADIOI_Iread_and_exch_l1_end;

    if (flag) {
        ADIOI_Assert(size == (int)size);
        ADIO_IreadContig(fd, read_buf+vars->for_curr_iter, (int)size,
                         MPI_BYTE, ADIO_EXPLICIT_OFFSET, vars->off,
                         &vars->req2, error_code);

        nbc_req->data.rd.state = ADIOI_IRC_STATE_IREAD_AND_EXCH_L1_BEGIN;
        return;
    }

    ADIOI_R_Iexchange_data(nbc_req, error_code);
}
Beispiel #27
0
void ADIOI_GEN_OpenColl(ADIO_File fd, int rank, 
	int access_mode, int *error_code)
{
    int orig_amode_excl, orig_amode_wronly;
    MPI_Comm tmp_comm;
    MPI_Datatype stats_type;  /* deferred open: some processes might not
				 open the file, so we'll exchange some
				 information with those non-aggregators */

    orig_amode_excl = access_mode;

    if (access_mode & ADIO_CREATE ){
       if(rank == fd->hints->ranklist[0]) {
	   /* remove delete_on_close flag if set */
	   if (access_mode & ADIO_DELETE_ON_CLOSE)
	       fd->access_mode = access_mode ^ ADIO_DELETE_ON_CLOSE;
	   else 
	       fd->access_mode = access_mode;
	       
	   tmp_comm = fd->comm;
	   fd->comm = MPI_COMM_SELF;
	   (*(fd->fns->ADIOI_xxx_Open))(fd, error_code);
	   fd->comm = tmp_comm;
	   MPI_Bcast(error_code, 1, MPI_INT, \
		     fd->hints->ranklist[0], fd->comm);
	   /* if no error, close the file and reopen normally below */
	   if (*error_code == MPI_SUCCESS) 
	       (*(fd->fns->ADIOI_xxx_Close))(fd, error_code);

	   fd->access_mode = access_mode; /* back to original */
       }
       else MPI_Bcast(error_code, 1, MPI_INT, fd->hints->ranklist[0], fd->comm);

       if (*error_code != MPI_SUCCESS) {
	   return;
       } 
       else {
           /* turn off CREAT (and EXCL if set) for real multi-processor open */
           access_mode ^= ADIO_CREATE; 
	   if (access_mode & ADIO_EXCL)
		   access_mode ^= ADIO_EXCL;
       }
    }
    fd->blksize = 1024*1024*4; /* this large default value should be good for
				 most file systems.  any ROMIO driver is free
				 to stat the file and find an optimial value */

    /* if we are doing deferred open, non-aggregators should return now */
    if (fd->hints->deferred_open ) {
        if (!(fd->is_agg)) {
	    char value[MPI_MAX_INFO_VAL+1];
            /* we might have turned off EXCL for the aggregators.
             * restore access_mode that non-aggregators get the right
             * value from get_amode */
            fd->access_mode = orig_amode_excl;
	    /* In file-system specific open, a driver might collect some
	     * information via stat().  Deferred open means not every process
	     * participates in fs-specific open, but they all participate in
	     * this open call.  Broadcast a bit of information in case
	     * lower-level file system driver (e.g. 'bluegene') collected it
	     * (not all do)*/
	    stats_type = make_stats_type(fd);
	    MPI_Bcast(MPI_BOTTOM, 1, stats_type, fd->hints->ranklist[0], fd->comm);
	    ADIOI_Assert(fd->blksize > 0);
	    /* some file systems (e.g. lustre) will inform the user via the
	     * info object about the file configuration.  deferred open,
	     * though, skips that step for non-aggregators.  we do the
	     * info-setting here */
	    sprintf(value, "%d", fd->hints->striping_unit);
	    ADIOI_Info_set(fd->info, "striping_unit", value);

	    sprintf(value, "%d", fd->hints->striping_factor);
	    ADIOI_Info_set(fd->info, "striping_factor", value);

	    sprintf(value, "%d", fd->hints->start_iodevice);
	    ADIOI_Info_set(fd->info, "romio_lustre_start_iodevice", value);

	    *error_code = MPI_SUCCESS;
	    MPI_Type_free(&stats_type);
	    return;
	}
    }

/* For writing with data sieving, a read-modify-write is needed. If 
   the file is opened for write_only, the read will fail. Therefore,
   if write_only, open the file as read_write, but record it as write_only
   in fd, so that get_amode returns the right answer. */

    /* observation from David Knaak: file systems that do not support data
     * sieving do not need to change the mode */

    orig_amode_wronly = access_mode;
    if ( (access_mode & ADIO_WRONLY) &&
	    ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) ) {
	access_mode = access_mode ^ ADIO_WRONLY;
	access_mode = access_mode | ADIO_RDWR;
    }
    fd->access_mode = access_mode;

    (*(fd->fns->ADIOI_xxx_Open))(fd, error_code);

    /* if error, may be it was due to the change in amode above. 
       therefore, reopen with access mode provided by the user.*/ 
    fd->access_mode = orig_amode_wronly;  
    if (*error_code != MPI_SUCCESS) 
        (*(fd->fns->ADIOI_xxx_Open))(fd, error_code);

    /* if we turned off EXCL earlier, then we should turn it back on */
    if (fd->access_mode != orig_amode_excl) fd->access_mode = orig_amode_excl;

    /* broadcast information to all proceses in
     * communicator, not just those who participated in open */

    stats_type = make_stats_type(fd);
    MPI_Bcast(MPI_BOTTOM, 1, stats_type, fd->hints->ranklist[0], fd->comm);
    MPI_Type_free(&stats_type);
    /* file domain code will get terribly confused in a hard-to-debug way if
     * gpfs blocksize not sensible */
    ADIOI_Assert( fd->blksize > 0);

    /* for deferred open: this process has opened the file (because if we are
     * not an aggregaor and we are doing deferred open, we returned earlier)*/
    fd->is_open = 1;

}
Beispiel #28
0
void ADIOI_P2PContigWriteAggregation(ADIO_File fd,
	const void *buf,
	int *error_code,
	ADIO_Offset *st_offsets,
	ADIO_Offset *end_offsets,
	ADIO_Offset *fd_start,
	ADIO_Offset* fd_end)
{

    *error_code = MPI_SUCCESS; /* initialize to success */

#ifdef ROMIO_GPFS
    double startTimeBase,endTimeBase;
#endif

    MPI_Status status;
    pthread_t io_thread;
    void *thread_ret;
    ADIOI_IO_ThreadFuncData io_thread_args;

    int nprocs,myrank;
    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);

	ADIO_Offset myOffsetStart = st_offsets[myrank], myOffsetEnd = end_offsets[myrank];

    int myAggRank = -1; /* if I am an aggregor this is my index into fd->hints->ranklist */
    int iAmUsedAgg = 0;

#ifdef ROMIO_GPFS
    startTimeBase = MPI_Wtime();
#endif

    int naggs = fd->hints->cb_nodes;
    int coll_bufsize = fd->hints->cb_buffer_size;
#ifdef ROMIO_GPFS
    if (gpfsmpio_pthreadio == 1) {
	/* split buffer in half for a kind of double buffering with the threads*/
	coll_bufsize = fd->hints->cb_buffer_size/2;
    }
#endif

    int j;
    for (j=0;j<naggs;j++) {
	if (fd->hints->ranklist[j] == myrank) {
	    myAggRank = j;
	    if (fd_end[j] > fd_start[j]) {
		iAmUsedAgg = 1;
	    }
	}
    }

    /* Determine how much data and to whom I need to send.  For source proc
     * targets, also determine the target file domain offsets locally to
     * reduce communication overhead */
    int *targetAggsForMyData = (int *)ADIOI_Malloc(naggs * sizeof(int));
    ADIO_Offset *targetAggsForMyDataFDStart = (ADIO_Offset *)ADIOI_Malloc(naggs * sizeof(ADIO_Offset));
    ADIO_Offset *targetAggsForMyDataFDEnd = (ADIO_Offset *)ADIOI_Malloc(naggs * sizeof(ADIO_Offset));
    int numTargetAggs = 0;
    int i;
    for (i=0;i<naggs;i++) {
	if ( ((myOffsetStart >= fd_start[i]) &&  (myOffsetStart <= fd_end[i])) || ((myOffsetEnd >= fd_start[i]) &&  (myOffsetEnd <= fd_end[i]))) {
	    targetAggsForMyData[numTargetAggs] = fd->hints->ranklist[i];
	    targetAggsForMyDataFDStart[numTargetAggs] = fd_start[i];
	    targetAggsForMyDataFDEnd[numTargetAggs] = fd_end[i];
	    numTargetAggs++;
	}
    }

    /* these 3 arrays track info on the procs that feed an aggregtor */
    int *sourceProcsForMyData=NULL;
    int *remainingDataAmountToGetPerProc=NULL;
    ADIO_Offset *remainingDataOffsetToGetPerProc=NULL;

    int numSourceProcs = 0;

    if (iAmUsedAgg) { /* for the used aggregators figure out how much data I
			 need from what procs */

	/* count numSourceProcs so we know how large to make the arrays */
	for (i=0;i<nprocs;i++)
	    if ( ((st_offsets[i] >= fd_start[myAggRank]) &&  (st_offsets[i] <= fd_end[myAggRank])) || ((end_offsets[i] >= fd_start[myAggRank]) &&  (end_offsets[i] <= fd_end[myAggRank])))
		numSourceProcs++;

	sourceProcsForMyData = (int *)ADIOI_Malloc(numSourceProcs * sizeof(int));
	remainingDataAmountToGetPerProc = (int *)ADIOI_Malloc(numSourceProcs * sizeof(int));
	remainingDataOffsetToGetPerProc = (ADIO_Offset *)ADIOI_Malloc(numSourceProcs * sizeof(ADIO_Offset));

	/* everybody has the st_offsets and end_offsets for all ranks so if I am a
	 * used aggregator go thru them and figure out which ranks have data that
	 * falls into my file domain assigned to me */
	numSourceProcs = 0;
	for (i=0;i<nprocs;i++) {
	    if ( ((st_offsets[i] >= fd_start[myAggRank]) &&  (st_offsets[i] <= fd_end[myAggRank])) || ((end_offsets[i] >= fd_start[myAggRank]) &&  (end_offsets[i] <= fd_end[myAggRank]))) {
		sourceProcsForMyData[numSourceProcs] = i;
		if ( ((st_offsets[i] >= fd_start[myAggRank]) &&  (st_offsets[i] <= fd_end[myAggRank])) && ((end_offsets[i] >= fd_start[myAggRank]) &&  (end_offsets[i] <= fd_end[myAggRank]))) {
		    remainingDataAmountToGetPerProc[numSourceProcs] = (end_offsets[i] - st_offsets[i])+1;
		    remainingDataOffsetToGetPerProc[numSourceProcs] = st_offsets[i];
		}
		else if ((st_offsets[i] >= fd_start[myAggRank]) &&  (st_offsets[i] <= fd_end[myAggRank])) {/* starts in this fd and goes past it */
		    remainingDataAmountToGetPerProc[numSourceProcs] = (fd_end[myAggRank] - st_offsets[i]) +1;
		    remainingDataOffsetToGetPerProc[numSourceProcs] = st_offsets[i];
		}
		else { /* starts in fd before this and ends in it */
		    remainingDataAmountToGetPerProc[numSourceProcs] = (end_offsets[i] - fd_start[myAggRank]) +1;
		    remainingDataOffsetToGetPerProc[numSourceProcs] = fd_start[myAggRank];
		}
#ifdef p2pcontigtrace
		printf("getting %ld bytes from source proc %d in fd rank %d with borders %ld to %ld\n",remainingDataAmountToGetPerProc[numSourceProcs],i,fd->hints->ranklist[myAggRank],fd_start[myAggRank],fd_end[myAggRank]);
#endif
		numSourceProcs++;
	    }
	}
    }

    int *amountOfDataReqestedByTargetAgg = (int *)ADIOI_Malloc(naggs * sizeof(int));
    for (i=0;i<numTargetAggs;i++) {
	amountOfDataReqestedByTargetAgg[i] = 0;
    }

    int totalAmountDataReceived = 0;
    MPI_Request *mpiSizeToSendRequest = (MPI_Request *) ADIOI_Malloc(numTargetAggs * sizeof(MPI_Request));
    MPI_Request *mpiRecvDataRequest = (MPI_Request *) ADIOI_Malloc(numSourceProcs * sizeof(MPI_Request));
    MPI_Request *mpiSendDataSizeRequest = (MPI_Request *) ADIOI_Malloc(numSourceProcs * sizeof(MPI_Request));

    MPI_Request *mpiSendDataToTargetAggRequest = (MPI_Request *) ADIOI_Malloc(numTargetAggs * sizeof(MPI_Request));
    MPI_Status mpiWaitAnyStatusFromTargetAggs,mpiWaitAnyStatusFromSourceProcs;
    MPI_Status mpiIsendStatusForSize,  mpiIsendStatusForData;

    /* use the write buffer allocated in the file_open */
    char *write_buf0 = fd->io_buf;
    char *write_buf1 = fd->io_buf + coll_bufsize;

    /* start off pointing to the first buffer. If we use the 2nd buffer (threaded
     * case) we'll swap later */
    char *write_buf = write_buf0;

    /* compute number of rounds */
    ADIO_Offset numberOfRounds = (ADIO_Offset)((((ADIO_Offset)(end_offsets[nprocs-1]-st_offsets[0]))/((ADIO_Offset)((ADIO_Offset)coll_bufsize*(ADIO_Offset)naggs)))) + 1;

    int currentWriteBuf = 0;
    int useIOBuffer = 0;
#ifdef ROMIO_GPFS
    if (gpfsmpio_pthreadio && (numberOfRounds>1)) {
	useIOBuffer = 1;
	io_thread = pthread_self();
    }
#endif

    ADIO_Offset currentRoundFDStart = 0;
    ADIO_Offset currentRoundFDEnd = 0;

    if (iAmUsedAgg) {
	currentRoundFDStart = fd_start[myAggRank];
    }

    int *dataSizeGottenThisRoundPerProc = (int *)ADIOI_Malloc(numSourceProcs * sizeof(int));
    int *mpiRequestMapPerProc = (int *)ADIOI_Malloc(numSourceProcs * sizeof(int));
    int *targetAggIndexesForMyDataThisRound = (int *)ADIOI_Malloc(numTargetAggs * sizeof(int));
    int *sendBufferOffsetsThisRound = (int *)ADIOI_Malloc(numTargetAggs * sizeof(int));
    int *bufferAmountsToSendThisRound = (int *)ADIOI_Malloc(numTargetAggs * sizeof(int));

#ifdef ROMIO_GPFS
    endTimeBase = MPI_Wtime();
    gpfsmpio_prof_cw[GPFSMPIO_CIO_T_MYREQ] += (endTimeBase-startTimeBase);
    startTimeBase = MPI_Wtime();
#endif

    /* each iteration of this loop writes a coll_bufsize portion of the file
     * domain */
    int roundIter;
    for (roundIter=0;roundIter<numberOfRounds;roundIter++) {

	/* determine what target aggs I need to send data to this round */
	int numTargetAggsThisRound = 0;
	for (i=0;i<numTargetAggs;i++) {
	    if ( ((myOffsetStart >= targetAggsForMyDataFDStart[i]) && (myOffsetStart <= targetAggsForMyDataFDEnd[i])) ||
		    ((myOffsetEnd >= targetAggsForMyDataFDStart[i]) && (myOffsetEnd <= targetAggsForMyDataFDEnd[i]))) {
		/* we know that we need to send data to this target agg at some point, now need to figure out how much this round */

		/* here are the offsets currently being collected by the aggregator during this round */
		ADIO_Offset currentRoundFDStartForMyTargetAgg = (ADIO_Offset)((ADIO_Offset)targetAggsForMyDataFDStart[i] + (ADIO_Offset)((ADIO_Offset)roundIter*(ADIO_Offset)coll_bufsize));
		ADIO_Offset currentRoundFDEndForMyTargetAgg = (ADIO_Offset)((ADIO_Offset)targetAggsForMyDataFDStart[i] + (ADIO_Offset)((ADIO_Offset)(roundIter+1)*(ADIO_Offset)coll_bufsize) - (ADIO_Offset)1);
		if (currentRoundFDEndForMyTargetAgg > targetAggsForMyDataFDEnd[i])
		    currentRoundFDEndForMyTargetAgg = targetAggsForMyDataFDEnd[i];

#ifdef p2pcontigtrace
		printf("roundIter %d target iter %d targetAggsForMyData is %d myOffsetStart is %ld myOffsetEnd is %ld targetAggsForMyDataFDStart is %ld targetAggsForMyDataFDEnd is %ld currentRoundFDStartForMyTargetAgg is %ld currentRoundFDEndForMyTargetAgg is %ld\n",
			roundIter,i,targetAggsForMyData[i],myOffsetStart,myOffsetEnd,
			targetAggsForMyDataFDStart[i],targetAggsForMyDataFDEnd[i],
			currentRoundFDStartForMyTargetAgg,currentRoundFDEndForMyTargetAgg);
#endif

		/* send the portion of my data that is within
		 * currentRoundFDStartForMyTargetAgg to
		 * currentRoundFDEndForMyTargetAgg */
		/* find the offset into the send buffer and the amount
		 * of data to send */
		int sendBufferOffset = 0;
		int bufferAmountToSend = 0;

		if ((myOffsetStart >= currentRoundFDStartForMyTargetAgg) && (myOffsetStart <= currentRoundFDEndForMyTargetAgg)) {
		    if (myOffsetEnd > currentRoundFDEndForMyTargetAgg)
			bufferAmountToSend = (currentRoundFDEndForMyTargetAgg - myOffsetStart) +1;
		    else
			bufferAmountToSend = (myOffsetEnd - myOffsetStart) +1;
		}
		else if ((myOffsetEnd >= currentRoundFDStartForMyTargetAgg) && (myOffsetEnd <= currentRoundFDEndForMyTargetAgg)) {
		    sendBufferOffset = (int) (currentRoundFDStartForMyTargetAgg - myOffsetStart);
		    if (myOffsetEnd > currentRoundFDEndForMyTargetAgg)
			bufferAmountToSend = (currentRoundFDEndForMyTargetAgg - currentRoundFDStartForMyTargetAgg) +1;
		    else
			bufferAmountToSend = (myOffsetEnd - currentRoundFDStartForMyTargetAgg) +1;
		}
		else if ((myOffsetStart <= currentRoundFDStartForMyTargetAgg) && (myOffsetEnd >= currentRoundFDEndForMyTargetAgg)) {
		    sendBufferOffset = (int) (currentRoundFDStartForMyTargetAgg - myOffsetStart);
		    bufferAmountToSend = (currentRoundFDEndForMyTargetAgg - currentRoundFDStartForMyTargetAgg) +1;
		}

		if (bufferAmountToSend > 0) { /* we have data to send this round */
		    targetAggIndexesForMyDataThisRound[numTargetAggsThisRound] = i;
		    sendBufferOffsetsThisRound[numTargetAggsThisRound] = sendBufferOffset;
		    bufferAmountsToSendThisRound[numTargetAggsThisRound] = bufferAmountToSend;
#ifdef p2pcontigtrace
		    printf("bufferAmountToSend is %d sendBufferOffset is %d\n",bufferAmountToSend,sendBufferOffset);
#endif
		    /* only need to be pinged by the agg for rounds after the first one - for the first one just
		     * send the data without being pinged */
		    if (roundIter > 0)
			MPI_Irecv(&amountOfDataReqestedByTargetAgg[numTargetAggsThisRound],1,
				MPI_INT,targetAggsForMyData[i],0,
				fd->comm,&mpiSizeToSendRequest[numTargetAggsThisRound]);
		    numTargetAggsThisRound++;

		}
	    }
	}

	/* determine what offsets define the portion of the file domain the agg is writing this round */
	if (iAmUsedAgg) {
	    if ((fd_end[myAggRank] - currentRoundFDStart) < coll_bufsize) {
		currentRoundFDEnd = fd_end[myAggRank];
	    }
	    else
		currentRoundFDEnd = currentRoundFDStart + coll_bufsize - 1;
#ifdef p2pcontigtrace
	    printf("currentRoundFDStart is %ld currentRoundFDEnd is %ld within file domeain %ld to %ld\n",currentRoundFDStart,currentRoundFDEnd,fd_start[myAggRank],fd_end[myAggRank]);
#endif
	}

	int irecv,isend;
	int numSourceProcsSentData = 0;

	/* the aggs send the amount of data they need to their source procs */
	for (i=0;i<numSourceProcs;i++) {
	    if ((remainingDataOffsetToGetPerProc[i] >= currentRoundFDStart) && (remainingDataOffsetToGetPerProc[i] <= currentRoundFDEnd)) {
		if ((remainingDataOffsetToGetPerProc[i] + remainingDataAmountToGetPerProc[i]) <= currentRoundFDEnd)
		    dataSizeGottenThisRoundPerProc[i] = remainingDataAmountToGetPerProc[i];
		else
		    dataSizeGottenThisRoundPerProc[i] = (currentRoundFDEnd - remainingDataOffsetToGetPerProc[i]) +1;
	    }
	    else if (((remainingDataOffsetToGetPerProc[i]+remainingDataAmountToGetPerProc[i]) >= currentRoundFDStart) && ((remainingDataOffsetToGetPerProc[i]+remainingDataAmountToGetPerProc[i]) <= currentRoundFDEnd)) {
		if ((remainingDataOffsetToGetPerProc[i]) >= currentRoundFDStart)
		    dataSizeGottenThisRoundPerProc[i] = remainingDataAmountToGetPerProc[i];
		else
		    dataSizeGottenThisRoundPerProc[i] = (remainingDataOffsetToGetPerProc[i]-currentRoundFDStart) +1;
	    }
	    else
		dataSizeGottenThisRoundPerProc[i] = 0;

#ifdef p2pcontigtrace
	    printf("dataSizeGottenThisRoundPerProc[%d] set to %d - remainingDataOffsetToGetPerProc is %d remainingDataAmountToGetPerProc is %d currentRoundFDStart is %d currentRoundFDEnd is %d\n",i,dataSizeGottenThisRoundPerProc[i],remainingDataOffsetToGetPerProc[i],remainingDataAmountToGetPerProc[i],currentRoundFDStart,currentRoundFDEnd);
#endif
	    if (dataSizeGottenThisRoundPerProc[i] > 0) {
		if (roundIter > 0) {
		    MPI_Isend(&dataSizeGottenThisRoundPerProc[i],1,MPI_INT,
			    sourceProcsForMyData[i],0,fd->comm,
			    &mpiSendDataSizeRequest[numSourceProcsSentData]);
		    numSourceProcsSentData++;
		}
	    }
	}

	int numDataSendToWaitFor = 0;
	/* the source procs send the requested data to the aggs */
	for (i = 0; i < numTargetAggsThisRound; i++) {

		/* the source procs aren't pinged by the target aggs on the first round */
	    if (roundIter > 0) {

		MPI_Waitany(numTargetAggsThisRound,mpiSizeToSendRequest,
			&irecv,&mpiWaitAnyStatusFromTargetAggs);

#ifdef p2pcontigtrace
		printf("irecv is %d amountOfDataReqestedByTargetAgg is %d bufferAmountsToSendThisRound is %d sendBufferOffsetsThisRound is %d targetAggsForMyData is %d\n",irecv,amountOfDataReqestedByTargetAgg[irecv], bufferAmountsToSendThisRound[irecv], sendBufferOffsetsThisRound[irecv],targetAggsForMyData[targetAggIndexesForMyDataThisRound[irecv]]);
#endif
		ADIOI_Assert(amountOfDataReqestedByTargetAgg[irecv] == bufferAmountsToSendThisRound[irecv]);
		MPI_Isend(&((char*)buf)[sendBufferOffsetsThisRound[irecv]],
			bufferAmountsToSendThisRound[irecv],MPI_BYTE,
			targetAggsForMyData[targetAggIndexesForMyDataThisRound[irecv]],
			0,fd->comm,&mpiSendDataToTargetAggRequest[irecv]);

	    }
	    else {
#ifdef p2pcontigtrace
		printf("i is %d bufferAmountsToSendThisRound is %d sendBufferOffsetsThisRound is %d targetAggsForMyData is %d\n",i, bufferAmountsToSendThisRound[i], sendBufferOffsetsThisRound[i],targetAggsForMyData[targetAggIndexesForMyDataThisRound[i]]);
#endif
		MPI_Isend(&((char*)buf)[sendBufferOffsetsThisRound[i]],bufferAmountsToSendThisRound[i],MPI_BYTE,
			targetAggsForMyData[targetAggIndexesForMyDataThisRound[i]],0,fd->comm,&mpiSendDataToTargetAggRequest[i]);
	    }
        numDataSendToWaitFor++;
	}

#ifdef ROMIO_GPFS
	gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_SETUP] += (endTimeBase-startTimeBase);
	startTimeBase = MPI_Wtime();
#endif

	/* the aggs receive the data from the source procs */
	int numDataRecvToWaitFor = 0;
	for (i=0;i<numSourceProcs;i++) {

	    int currentWBOffset = 0;
	    for (j=0;j<i;j++)
		currentWBOffset += dataSizeGottenThisRoundPerProc[j];

	    /* only receive from source procs that will send > 0 count data */
	    if (dataSizeGottenThisRoundPerProc[i] > 0) {
#ifdef p2pcontigtrace
		printf("receiving data from rank %d dataSizeGottenThisRoundPerProc is %d currentWBOffset is %d\n",sourceProcsForMyData[i],dataSizeGottenThisRoundPerProc[i],currentWBOffset);
#endif
		MPI_Irecv(&((char*)write_buf)[currentWBOffset],dataSizeGottenThisRoundPerProc[i],
			MPI_BYTE,sourceProcsForMyData[i],0,
			fd->comm,&mpiRecvDataRequest[numDataRecvToWaitFor]);
		mpiRequestMapPerProc[numDataRecvToWaitFor] = i;
		numDataRecvToWaitFor++;
	    }

#ifdef p2pcontigtrace
	    printf("MPI_Irecv from rank %d\n",targetAggsForMyData[i]);
#endif
	}

	int totalDataReceivedThisRound = 0;
	for (i = 0; i < numDataRecvToWaitFor; i++) {
	    MPI_Waitany(numDataRecvToWaitFor,mpiRecvDataRequest,
		    &irecv,&mpiWaitAnyStatusFromSourceProcs);
	    totalDataReceivedThisRound +=
		dataSizeGottenThisRoundPerProc[mpiRequestMapPerProc[irecv]];
	    totalAmountDataReceived +=
		dataSizeGottenThisRoundPerProc[mpiRequestMapPerProc[irecv]];

#ifdef p2pcontigtrace
	    printf("numDataRecvToWaitFor is %d was sent %d bytes data for %d remaining bytes from rank %d irecv index %d\n",numDataRecvToWaitFor,dataSizeGottenThisRoundPerProc[mpiRequestMapPerProc[irecv]],remainingDataAmountToGetPerProc[mpiRequestMapPerProc[irecv]],sourceProcsForMyData[mpiRequestMapPerProc[irecv]],irecv);
#endif
	    remainingDataAmountToGetPerProc[mpiRequestMapPerProc[irecv]] -=
		dataSizeGottenThisRoundPerProc[mpiRequestMapPerProc[irecv]];
	    remainingDataOffsetToGetPerProc[mpiRequestMapPerProc[irecv]] +=
		dataSizeGottenThisRoundPerProc[mpiRequestMapPerProc[irecv]];

	}

	/* clean up the MPI_Request object for the MPI_Isend which told the
	 * source procs how much data to send */
        for (i=0;i<numSourceProcsSentData;i++) {
           MPI_Waitany(numSourceProcsSentData,mpiSendDataSizeRequest,
		   &isend,&mpiIsendStatusForSize);
        }


#ifdef ROMIO_GPFS
        endTimeBase = MPI_Wtime();
	gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_NET] += (endTimeBase-startTimeBase);
#endif
	/* the aggs now write the data */
	if (numDataRecvToWaitFor > 0) {

#ifdef p2pcontigtrace
	    printf("totalDataReceivedThisRound is %d\n",totalDataReceivedThisRound);
#endif
	    if (!useIOBuffer) {

		ADIO_WriteContig(fd, write_buf, (int)totalDataReceivedThisRound,
			MPI_BYTE, ADIO_EXPLICIT_OFFSET,
			currentRoundFDStart, &status, error_code);
	    } else { /* use the thread writer */

		if(!pthread_equal(io_thread, pthread_self())) {
		    pthread_join(io_thread, &thread_ret);
		    *error_code = *(int *)thread_ret;
		    if (*error_code != MPI_SUCCESS) return;
		    io_thread = pthread_self();

		}
		io_thread_args.fd = fd;
		/* do a little pointer shuffling: background I/O works from one
		 * buffer while two-phase machinery fills up another */

		if (currentWriteBuf == 0) {
		    io_thread_args.buf = write_buf0;
		    currentWriteBuf = 1;
		    write_buf = write_buf1;
		}
		else {
		    io_thread_args.buf = write_buf1;
		    currentWriteBuf = 0;
		    write_buf = write_buf0;
		}
		io_thread_args.io_kind = ADIOI_WRITE;
		io_thread_args.size = totalDataReceivedThisRound;
		io_thread_args.offset = currentRoundFDStart;
		io_thread_args.status = &status;
		io_thread_args.error_code = *error_code;
		if ( (pthread_create(&io_thread, NULL,
				ADIOI_IO_Thread_Func, &(io_thread_args))) != 0)
		    io_thread = pthread_self();

	    }

	} /* numDataRecvToWaitFor > 0 */

	if (iAmUsedAgg)
	    currentRoundFDStart += coll_bufsize;
        for (i = 0; i < numDataSendToWaitFor; i++) {
          MPI_Wait(&mpiSendDataToTargetAggRequest[i],
		  &mpiIsendStatusForData);
        }

    } /* for-loop roundIter */

#ifdef ROMIO_GPFS
    endTimeBase = MPI_Wtime();
    gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH] += (endTimeBase-startTimeBase);
#endif

    if (useIOBuffer) { /* thread writer cleanup */

	if ( !pthread_equal(io_thread, pthread_self()) ) {
	    pthread_join(io_thread, &thread_ret);
	    *error_code = *(int *)thread_ret;
	}

    }



    if (iAmUsedAgg) {
	ADIOI_Free(sourceProcsForMyData);
	ADIOI_Free(remainingDataAmountToGetPerProc);
	ADIOI_Free(remainingDataOffsetToGetPerProc);
    }

    ADIOI_Free(targetAggsForMyData);
    ADIOI_Free(targetAggsForMyDataFDStart);
    ADIOI_Free(targetAggsForMyDataFDEnd);
    ADIOI_Free(targetAggIndexesForMyDataThisRound);
    ADIOI_Free(sendBufferOffsetsThisRound);
    ADIOI_Free(bufferAmountsToSendThisRound);
    ADIOI_Free(amountOfDataReqestedByTargetAgg);
    ADIOI_Free(mpiSizeToSendRequest);
    ADIOI_Free(mpiRecvDataRequest);
    ADIOI_Free(mpiSendDataSizeRequest);
    ADIOI_Free(mpiSendDataToTargetAggRequest);
    ADIOI_Free(dataSizeGottenThisRoundPerProc);
    ADIOI_Free(mpiRequestMapPerProc);

    /* TODO: still need a barrier here? */
    MPI_Barrier(fd->comm);
    return;
}
Beispiel #29
0
void ADIOI_GEN_ReadStrided(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Status *status, int
                           *error_code)
{


    /* offset is in units of etype relative to the filetype. */

    ADIOI_Flatlist_node *flat_buf, *flat_file;
    ADIO_Offset i_offset, new_brd_size, brd_size, size;
    int i, j, k, st_index=0;
    MPI_Count num, bufsize;
    int n_etypes_in_filetype;
    ADIO_Offset n_filetypes, etype_in_filetype, st_n_filetypes, size_in_filetype;
    ADIO_Offset abs_off_in_filetype=0, new_frd_size, frd_size=0, st_frd_size;
    MPI_Count filetype_size, etype_size, buftype_size, partial_read;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off, req_len, sum;
    ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off;
    char *readbuf, *tmp_buf, *value;
    int info_flag;
    unsigned max_bufsize, readbuf_len;
    ADIO_Status status1;

    if (fd->hints->ds_read == ADIOI_HINT_DISABLE) {
        /* if user has disabled data sieving on reads, use naive
        * approach instead.
        */
        ADIOI_GEN_ReadStrided_naive(fd,
                                    buf,
                                    count,
                                    datatype,
                                    file_ptr_type,
                                    offset,
                                    status,
                                    error_code);
        return;
    }

    *error_code = MPI_SUCCESS;  /* changed below if error */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size_x(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, 0);
#endif
        *error_code = MPI_SUCCESS;
        return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size_x(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(MPI_Count)buftype_size * (ADIO_Offset)count));
    bufsize = buftype_size * count;

    /* get max_bufsize from the info object. */

    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value,
                   &info_flag);
    max_bufsize = atoi(value);
    ADIOI_Free(value);


    if (!buftype_is_contig && filetype_is_contig) {

        /* noncontiguous in memory, contiguous in file. */

        flat_buf = ADIOI_Flatten_and_find(datatype);

        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
              fd->disp + (ADIO_Offset)etype_size * offset;

        start_off = off;
        end_offset = off + bufsize - 1;
        readbuf_off = off;
        readbuf = (char *) ADIOI_Malloc(max_bufsize);
        readbuf_len = (unsigned) (MPL_MIN(max_bufsize, end_offset-readbuf_off+1));

        /* if atomicity is true, lock (exclusive) the region to be accessed */
        if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS))
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        ADIO_ReadContig(fd, readbuf, readbuf_len, MPI_BYTE,
                        ADIO_EXPLICIT_OFFSET, readbuf_off, &status1, error_code);
        if (*error_code != MPI_SUCCESS) return;

        for (j=0; j<count; j++)
        {
            for (i=0; i<flat_buf->count; i++) {
                userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
                req_off = off;
                req_len = flat_buf->blocklens[i];
                ADIOI_BUFFERED_READ
                off += flat_buf->blocklens[i];
            }
        }

        if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS))
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

        ADIOI_Free(readbuf);
    }

    else {  /* noncontiguous in file */

        flat_file = ADIOI_Flatten_and_find(fd->filetype);
        disp = fd->disp;

        if (file_ptr_type == ADIO_INDIVIDUAL) {
            /* Wei-keng reworked type processing to be a bit more efficient */
            offset       = fd->fp_ind - disp;
            n_filetypes  = (offset - flat_file->indices[0]) / filetype_extent;
            offset -= (ADIO_Offset)n_filetypes * filetype_extent;
            /* now offset is local to this extent */

            /* find the block where offset is located, skip blocklens[i]==0 */
            for (i=0; i<flat_file->count; i++) {
                ADIO_Offset dist;
                if (flat_file->blocklens[i] == 0) continue;
                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
                /* frd_size is from offset to the end of block i */
                if (dist == 0) {
                    i++;
                    offset   = flat_file->indices[i];
                    frd_size = flat_file->blocklens[i];
                    break;
                }
                if (dist > 0) {
                    frd_size = dist;
                    break;
                }
            }
            st_index = i;  /* starting index in flat_file->indices[] */
            offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
        }
        else {
            n_etypes_in_filetype = filetype_size/etype_size;
            n_filetypes = offset / n_etypes_in_filetype;
            etype_in_filetype = offset % n_etypes_in_filetype;
            size_in_filetype = etype_in_filetype * etype_size;

            sum = 0;
            for (i=0; i<flat_file->count; i++) {
                sum += flat_file->blocklens[i];
                if (sum > size_in_filetype) {
                    st_index = i;
                    frd_size = sum - size_in_filetype;
                    abs_off_in_filetype = flat_file->indices[i] +
                                          size_in_filetype - (sum - flat_file->blocklens[i]);
                    break;
                }
            }

            /* abs. offset in bytes in the file */
            offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
                     abs_off_in_filetype;
        }

        start_off = offset;

        /* Wei-keng Liao: read request is within a single flat_file contig
         * block e.g. with subarray types that actually describe the whole
         * array */
        if (buftype_is_contig && bufsize <= frd_size) {
            /* a count of bytes can overflow. operate on original type instead */
            ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                            offset, status, error_code);

            if (file_ptr_type == ADIO_INDIVIDUAL) {
                /* update MPI-IO file pointer to point to the first byte that
                * can be accessed in the fileview. */
                fd->fp_ind = offset + bufsize;
                if (bufsize == frd_size) {
                    do {
                        st_index++;
                        if (st_index == flat_file->count) {
                            st_index = 0;
                            n_filetypes++;
                        }
                    } while (flat_file->blocklens[st_index] == 0);
                    fd->fp_ind = disp + flat_file->indices[st_index]
                                 + n_filetypes*filetype_extent;
                }
            }
            fd->fp_sys_posn = -1;   /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
            MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
            return;
        }

        /* Calculate end_offset, the last byte-offset that will be accessed.
          e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/

        st_frd_size = frd_size;
        st_n_filetypes = n_filetypes;
        i_offset = 0;
        j = st_index;
        off = offset;
        frd_size = MPL_MIN(st_frd_size, bufsize);
        while (i_offset < bufsize) {
            i_offset += frd_size;
            end_offset = off + frd_size - 1;

            j = (j+1) % flat_file->count;
            n_filetypes += (j == 0) ? 1 : 0;
            while (flat_file->blocklens[j]==0) {
                j = (j+1) % flat_file->count;
                n_filetypes += (j == 0) ? 1 : 0;
            }
            off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent;
            frd_size = MPL_MIN(flat_file->blocklens[j], bufsize-i_offset);
        }

        /* if atomicity is true, lock (exclusive) the region to be accessed */
        if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS))
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        readbuf_off = 0;
        readbuf_len = 0;
        readbuf = (char *) ADIOI_Malloc(max_bufsize);

        if (buftype_is_contig && !filetype_is_contig) {

            /* contiguous in memory, noncontiguous in file. should be the most
               common case. */

            i_offset = 0;
            j = st_index;
            off = offset;
            n_filetypes = st_n_filetypes;
            frd_size = MPL_MIN(st_frd_size, bufsize);
            while (i_offset < bufsize) {
                if (frd_size) {
                    /* TYPE_UB and TYPE_LB can result in
                       frd_size = 0. save system call in such cases */
                    /* lseek(fd->fd_sys, off, SEEK_SET);
                    err = read(fd->fd_sys, ((char *) buf) + i, frd_size);*/

                    req_off = off;
                    req_len = frd_size;
                    userbuf_off = i_offset;
                    ADIOI_BUFFERED_READ
                }
                i_offset += frd_size;

                if (off + frd_size < disp + flat_file->indices[j] +
                        flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
                    off += frd_size;
                /* did not reach end of contiguous block in filetype.
                   no more I/O needed. off is incremented by frd_size. */
                else {
                    j = (j+1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    while (flat_file->blocklens[j]==0) {
                        j = (j+1) % flat_file->count;
                        n_filetypes += (j == 0) ? 1 : 0;
                    }
                    off = disp + flat_file->indices[j] +
                          n_filetypes*(ADIO_Offset)filetype_extent;
                    frd_size = MPL_MIN(flat_file->blocklens[j], bufsize-i_offset);
                }
            }
        }
        else {
Beispiel #30
0
/*
 * This is more general aggregator search function which does not base on the assumption
 * that each aggregator hosts the file domain with the same size
 */
int ADIOI_GPFS_Calc_aggregator(ADIO_File fd,
			      ADIO_Offset off,
			      ADIO_Offset min_off,
			      ADIO_Offset *len,
			      ADIO_Offset fd_size,
			      ADIO_Offset *fd_start,
			      ADIO_Offset *fd_end)
{
    int rank_index, rank;
    ADIO_Offset avail_bytes;
    TRACE_ERR("Entering ADIOI_GPFS_Calc_aggregator\n");

    ADIOI_Assert ( (off <= fd_end[fd->hints->cb_nodes-1] && off >= min_off && fd_start[0] >= min_off ) );

    /* binary search --> rank_index is returned */
    int ub = fd->hints->cb_nodes;
    int lb = 0;
    /* get an index into our array of aggregators */
    /* Common code for striping - bg doesn't use it but it's
       here to make diff'ing easier.
    rank_index = (int) ((off - min_off + fd_size)/ fd_size - 1);

    if (fd->hints->striping_unit > 0) {
        * wkliao: implementation for file domain alignment
           fd_start[] and fd_end[] have been aligned with file lock
	   boundaries when returned from ADIOI_Calc_file_domains() so cannot
	   just use simple arithmatic as above *
        rank_index = 0;
        while (off > fd_end[rank_index]) rank_index++;
    }
    bg does it's own striping below
    */
    rank_index = fd->hints->cb_nodes / 2;
    while ( off < fd_start[rank_index] || off > fd_end[rank_index] ) {
	if ( off > fd_end  [rank_index] ) {
	    lb = rank_index;
	    rank_index = (rank_index + ub) / 2;
	}
	else
	if ( off < fd_start[rank_index] ) {
	    ub = rank_index;
	    rank_index = (rank_index + lb) / 2;
	}
    }
    /* we index into fd_end with rank_index, and fd_end was allocated to be no
     * bigger than fd->hins->cb_nodes.   If we ever violate that, we're
     * overrunning arrays.  Obviously, we should never ever hit this abort */
    if (rank_index >= fd->hints->cb_nodes || rank_index < 0) {
        FPRINTF(stderr, "Error in ADIOI_Calc_aggregator(): rank_index(%d) >= fd->hints->cb_nodes (%d) fd_size=%lld off=%lld\n",
			rank_index,fd->hints->cb_nodes,fd_size,off);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    /* DBG_FPRINTF ("ADIOI_GPFS_Calc_aggregator: rank_index = %d\n",
       rank_index ); */

    /*
     * remember here that even in Rajeev's original code it was the case that
     * different aggregators could end up with different amounts of data to
     * aggregate.  here we use fd_end[] to make sure that we know how much
     * data this aggregator is working with.
     *
     * the +1 is to take into account the end vs. length issue.
     */
    avail_bytes = fd_end[rank_index] + 1 - off;
    if (avail_bytes < *len && avail_bytes > 0) {
        /* this file domain only has part of the requested contig. region */

        *len = avail_bytes;
    }

    /* map our index to a rank */
    /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */
    rank = fd->hints->ranklist[rank_index];
    TRACE_ERR("Leaving ADIOI_GPFS_Calc_aggregator\n");

    return rank;
}