Exemple #1
0
static void big_get(void *buf, ptl_size_t left_to_get, MPIDI_VC_t *vc, ptl_match_bits_t match_bits, MPIR_Request *rreq)
{
    int ret;
    MPID_nem_ptl_vc_area *vc_ptl;
    ptl_size_t start, get_sz;

    vc_ptl = VC_PTL(vc);
    start = (ptl_size_t)buf;

    /* we need to handle all events */
    REQ_PTL(rreq)->event_handler = handler_recv_big_get;

    while (left_to_get > 0) {
        /* get up to the maximum allowed by the portals interface */
        get_sz = MPL_MIN(MPIDI_nem_ptl_ni_limits.max_msg_size, left_to_get);

        ret = MPID_nem_ptl_rptl_get(MPIDI_nem_ptl_global_md, start, get_sz, vc_ptl->id, vc_ptl->ptg, match_bits, 0, rreq);
        DBG_MSG_GET("global", get_sz, vc->pg_rank, match_bits);
        MPL_DBG_MSG_P(MPIDI_CH3_DBG_CHANNEL, VERBOSE, "   buf=%p", (char *)start);
        MPIR_Assert(ret == 0);

        /* account for what has been sent */
        start += get_sz;
        left_to_get -= get_sz;
        REQ_PTL(rreq)->num_gets++;
    }
}
Exemple #2
0
void MPIDI_OFI_mr_key_free(uint64_t index)
{
    uint64_t int_index, bitpos, numbits;
    numbits = sizeof(uint64_t) * 8;
    int_index = (index + 1) / numbits;
    bitpos = index % numbits;

    mr_key_allocator.last_free_mr_key = MPL_MIN(int_index, mr_key_allocator.last_free_mr_key);
    mr_key_allocator.bitmask[int_index] |= (0x1ULL << bitpos);
}
Exemple #3
0
/* If successful, error_code is set to MPI_SUCCESS.  Otherwise an error
 * code is created and returned in error_code.
 */
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, const void *buf,
					MPI_Datatype datatype, int nprocs,
					int myrank, ADIOI_Access *others_req,
                                        ADIOI_Access *my_req,
					ADIO_Offset *offset_list,
                                        ADIO_Offset *len_list, 
					int contig_access_count,
                                        int *striping_info, int **buf_idx,
                                        int *error_code)
{
    /* Send data to appropriate processes and write in sizes of no more
     * than lustre stripe_size.
     * The idea is to reduce the amount of extra memory required for
     * collective I/O. If all data were written all at once, which is much
     * easier, it would require temp space more than the size of user_buf,
     * which is often unacceptable. For example, to write a distributed
     * array to a file, where each local array is 8Mbytes, requiring
     * at least another 8Mbytes of temp space is unacceptable.
     */

    int hole, i, j, m, flag, ntimes = 1 , max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
    ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
    ADIO_Offset max_size, step_size = 0;
    int real_size, req_len, send_len;
    int *recv_curr_offlen_ptr, *recv_count, *recv_size;
    int *send_curr_offlen_ptr, *send_size;
    int *sent_to_proc, *recv_start_pos;
    int *send_buf_idx, *curr_to_proc, *done_to_proc;
    int *this_buf_idx;
    char *write_buf = NULL;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf = NULL;
    MPI_Aint buftype_extent;
    int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2];
    int data_sieving = 0;
    ADIO_Offset *srt_off = NULL;
    int *srt_len = NULL;
    int srt_num = 0;
    ADIO_Offset block_offset;
    int block_len;

    *error_code = MPI_SUCCESS;	/* changed below if error */
    /* only I/O errors are currently reported */

    /* calculate the number of writes of stripe size to be done.
     * That gives the no. of communication phases as well.
     * Note:
     *   Because we redistribute data in stripe-contiguous pattern for Lustre,
     *   each process has the same no. of communication phases.
     */

    for (i = 0; i < nprocs; i++) {
	if (others_req[i].count) {
	    st_loc = others_req[i].offsets[0];
	    end_loc = others_req[i].offsets[0];
	    break;
	}
    }
    for (i = 0; i < nprocs; i++) {
	for (j = 0; j < others_req[i].count; j++) {
	    st_loc = MPL_MIN(st_loc, others_req[i].offsets[j]);
	    end_loc = MPL_MAX(end_loc, (others_req[i].offsets[j] +
                                          others_req[i].lens[j] - 1));
	}
    }
    /* this process does no writing. */
    if ((st_loc == -1) && (end_loc == -1))
	ntimes = 0;
    MPI_Allreduce(&end_loc, &max_end_loc, 1, MPI_LONG_LONG_INT, MPI_MAX, fd->comm);
    /* avoid min_st_loc be -1 */
    if (st_loc == -1)
        st_loc = max_end_loc;
    MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm);
    /* align downward */
    min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size;

    /* Each time, only avail_cb_nodes number of IO clients perform IO,
     * so, step_size=avail_cb_nodes*stripe_size IO will be performed at most,
     * and ntimes=whole_file_portion/step_size
     */
    step_size = (ADIO_Offset) avail_cb_nodes * stripe_size;
    max_ntimes = (max_end_loc - min_st_loc + 1) / step_size
        + (((max_end_loc - min_st_loc + 1) % step_size) ? 1 : 0);
/*     max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1); */
    if (ntimes)
	write_buf = (char *) ADIOI_Malloc(stripe_size);

    /* calculate the start offset for each iteration */
    off_list = (ADIO_Offset *) ADIOI_Malloc(max_ntimes * sizeof(ADIO_Offset));
    for (m = 0; m < max_ntimes; m ++)
        off_list[m] = max_end_loc;
    for (i = 0; i < nprocs; i++) {
        for (j = 0; j < others_req[i].count; j ++) {
            req_off = others_req[i].offsets[j];
            m = (int)((req_off - min_st_loc) / step_size);
            off_list[m] = MPL_MIN(off_list[m], req_off);
        }
    }

    recv_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    send_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* their use is explained below. calloc initializes to 0. */

    recv_count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be sent to each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration. */

    sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data sent to each proc so far. Used in
       ADIOI_Fill_send_buffer. initialized to 0 here. */

    send_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    curr_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    done_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* Above three are used in ADIOI_Fill_send_buffer */

    this_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));

    recv_start_pos = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* used to store the starting value of recv_curr_offlen_ptr[i] in
       this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
	flat_buf = ADIOI_Flatten_and_find(datatype);
    }
    MPI_Type_extent(datatype, &buftype_extent);
    /* I need to check if there are any outstanding nonblocking writes to
     * the file, which could potentially interfere with the writes taking
     * place in this collective write call. Since this is not likely to be
     * common, let me do the simplest thing possible here: Each process
     * completes all pending nonblocking operations before completing.
     */
    /*ADIOI_Complete_async(error_code);
    if (*error_code != MPI_SUCCESS) return;
    MPI_Barrier(fd->comm);
    */

    iter_st_off = min_st_loc;

    /* Although we have recognized the data according to OST index,
     * a read-modify-write will be done if there is a hole between the data.
     * For example: if blocksize=60, xfersize=30 and stripe_size=100,
     * then rank0 will collect data [0, 30] and [60, 90] then write. There
     * is a hole in [30, 60], which will cause a read-modify-write in [0, 90].
     *
     * To reduce its impact on the performance, we can disable data sieving
     * by hint "ds_in_coll".
     */
    /* check the hint for data sieving */
    data_sieving = fd->hints->fs_hints.lustre.ds_in_coll;

    for (m = 0; m < max_ntimes; m++) {
	/* go through all others_req and my_req to check which will be received
         * and sent in this iteration.
         */

	/* Note that MPI guarantees that displacements in filetypes are in
	   monotonically nondecreasing order and that, for writes, the
	   filetypes cannot specify overlapping regions in the file. This
	   simplifies implementation a bit compared to reads. */

	/*
           off         = start offset in the file for the data to be written in
                         this iteration
           iter_st_off = start offset of this iteration
           real_size   = size of data written (bytes) corresponding to off
           max_size    = possible maximum size of data written in this iteration
           req_off     = offset in the file for a particular contiguous request minus
                         what was satisfied in previous iteration
           send_off    = offset the request needed by other processes in this iteration
           req_len     = size corresponding to req_off
           send_len    = size corresponding to send_off
         */

	/* first calculate what should be communicated */
	for (i = 0; i < nprocs; i++)
	    recv_count[i] = recv_size[i] = send_size[i] = 0;

        off = off_list[m];
        max_size = MPL_MIN(step_size, max_end_loc - iter_st_off + 1);
        real_size = (int) MPL_MIN((off / stripe_size + 1) * stripe_size -
                                    off,
                                    end_loc - off + 1);

	for (i = 0; i < nprocs; i++) {
            if (my_req[i].count) {
                this_buf_idx[i] = buf_idx[i][send_curr_offlen_ptr[i]];
                for (j = send_curr_offlen_ptr[i]; j < my_req[i].count; j++) {
                    send_off = my_req[i].offsets[j];
                    send_len = my_req[i].lens[j];
                    if (send_off < iter_st_off + max_size) {
                        send_size[i] += send_len;
                    } else {
                        break;
                    }
                }
                send_curr_offlen_ptr[i] = j;
            }
	    if (others_req[i].count) {
		recv_start_pos[i] = recv_curr_offlen_ptr[i];
		for (j = recv_curr_offlen_ptr[i]; j < others_req[i].count; j++) {
                    req_off = others_req[i].offsets[j];
                    req_len = others_req[i].lens[j];
		    if (req_off < iter_st_off + max_size) {
			recv_count[i]++;
                        ADIOI_Assert((((ADIO_Offset)(MPIU_Upint)write_buf)+req_off-off) == (ADIO_Offset)(MPIU_Upint)(write_buf+req_off-off));
			MPI_Address(write_buf + req_off - off,
				    &(others_req[i].mem_ptrs[j]));
                        recv_size[i] += req_len;
		    } else {
			break;
                    }
		}
		recv_curr_offlen_ptr[i] = j;
	    }
	}
        /* use variable "hole" to pass data_sieving flag into W_Exchange_data */
        hole = data_sieving;
	ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                                     len_list, send_size, recv_size, off, real_size,
                                     recv_count, recv_start_pos,
                                     sent_to_proc, nprocs, myrank,
                                     buftype_is_contig, contig_access_count,
                                     striping_info, others_req, send_buf_idx,
                                     curr_to_proc, done_to_proc, &hole, m,
                                  buftype_extent, this_buf_idx,
                                  &srt_off, &srt_len, &srt_num, error_code);

	if (*error_code != MPI_SUCCESS)
            goto over;

	flag = 0;
	for (i = 0; i < nprocs; i++)
	    if (recv_count[i]) {
		flag = 1;
		break;
	    }
	if (flag) {
            /* check whether to do data sieving */
            if(data_sieving == ADIOI_HINT_ENABLE) {
	        ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
			         ADIO_EXPLICIT_OFFSET, off, &status,
			         error_code);
            } else {
                /* if there is no hole, write data in one time;
                 * otherwise, write data in several times */
                if (!hole) {
                    ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, off, &status,
                                     error_code);
                } else {
                    block_offset = -1;
                    block_len = 0;
                    for (i = 0; i < srt_num; ++i) {
                        if (srt_off[i] < off + real_size &&
                            srt_off[i] >= off) {
                            if (block_offset == -1) {
                                block_offset = srt_off[i];
                                block_len = srt_len[i];
                            } else {
                                if (srt_off[i] == block_offset + block_len) {
                                    block_len += srt_len[i];
                                } else {
                                    ADIO_WriteContig(fd,
                                                     write_buf + block_offset - off,
                                                     block_len,
                                                     MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                                                     block_offset, &status,
                                                     error_code);
	                            if (*error_code != MPI_SUCCESS)
		                        goto over;
                                    block_offset = srt_off[i];
                                    block_len = srt_len[i];
                                }
                            }
                        }
                    }
                    if (block_offset != -1) {
                        ADIO_WriteContig(fd,
                                         write_buf + block_offset - off,
                                         block_len,
                                         MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                                         block_offset, &status,
                                         error_code);
                        if (*error_code != MPI_SUCCESS)
                            goto over;
                    }
                }
            }
	    if (*error_code != MPI_SUCCESS)
		goto over;
	}
        iter_st_off += max_size;
    }
over:
    if (srt_off)
        ADIOI_Free(srt_off);
    if (srt_len)
        ADIOI_Free(srt_len);
    if (ntimes)
	ADIOI_Free(write_buf);
    ADIOI_Free(recv_curr_offlen_ptr);
    ADIOI_Free(send_curr_offlen_ptr);
    ADIOI_Free(recv_count);
    ADIOI_Free(send_size);
    ADIOI_Free(recv_size);
    ADIOI_Free(sent_to_proc);
    ADIOI_Free(recv_start_pos);
    ADIOI_Free(send_buf_idx);
    ADIOI_Free(curr_to_proc);
    ADIOI_Free(done_to_proc);
    ADIOI_Free(this_buf_idx);
    ADIOI_Free(off_list);
}
Exemple #4
0
void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Status * status, int
                            *error_code)
{
/* offset is in units of etype relative to the filetype. */

    ADIOI_Flatlist_node *flat_buf, *flat_file;
    int i, j, k, err = -1, bwr_size, st_index = 0;
    ADIO_Offset i_offset, sum, size_in_filetype;
    ADIO_Offset num, size, n_etypes_in_filetype;
    MPI_Count bufsize;
    ADIO_Offset n_filetypes, etype_in_filetype;
    ADIO_Offset abs_off_in_filetype = 0;
    int req_len;
    MPI_Count filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off;
    ADIO_Offset off, req_off, disp, end_offset = 0, writebuf_off, start_off;
    char *writebuf = NULL, *value;
    int st_n_filetypes, writebuf_len, write_sz;
    ADIO_Offset fwr_size = 0, new_fwr_size, st_fwr_size;
    int new_bwr_size, err_flag = 0, info_flag, max_bufsize;
    static char myname[] = "ADIOI_NFS_WRITESTRIDED";

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size_x(fd->filetype, &filetype_size);
    if (!filetype_size) {
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, 0);
#endif
        *error_code = MPI_SUCCESS;
        return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size_x(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    bufsize = buftype_size * count;

/* get max_bufsize from the info object. */

    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
    ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag);
    max_bufsize = atoi(value);
    ADIOI_Free(value);

    if (!buftype_is_contig && filetype_is_contig) {

/* noncontiguous in memory, contiguous in file. */

        flat_buf = ADIOI_Flatten_and_find(datatype);

        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : fd->disp + etype_size * offset;

        start_off = off;
        end_offset = off + bufsize - 1;
        writebuf_off = off;
        writebuf = (char *) ADIOI_Malloc(max_bufsize);
        writebuf_len = (int) (MPL_MIN(max_bufsize, end_offset - writebuf_off + 1));

/* if atomicity is true, lock the region to be accessed */
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);

        for (j = 0; j < count; j++)
            for (i = 0; i < flat_buf->count; i++) {
                userbuf_off = j * buftype_extent + flat_buf->indices[i];
                req_off = off;
                req_len = flat_buf->blocklens[i];
                ADIOI_BUFFERED_WRITE_WITHOUT_READ off += flat_buf->blocklens[i];
            }

        /* write the buffer out finally */
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL);
#endif
        lseek(fd->fd_sys, writebuf_off, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL);
#endif
        if (!(fd->atomicity))
            ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
#endif
        err = write(fd->fd_sys, writebuf, writebuf_len);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
#endif
        if (!(fd->atomicity))
            ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
        if (err == -1)
            err_flag = 1;

        if (fd->atomicity)
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);

        if (file_ptr_type == ADIO_INDIVIDUAL)
            fd->fp_ind = off;
        if (err_flag) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
        } else
            *error_code = MPI_SUCCESS;
    }

    else {      /* noncontiguous in file */

        flat_file = ADIOI_Flatten_and_find(fd->filetype);
        disp = fd->disp;

        if (file_ptr_type == ADIO_INDIVIDUAL) {
            /* Wei-keng reworked type processing to be a bit more efficient */
            offset = fd->fp_ind - disp;
            n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
            offset -= (ADIO_Offset) n_filetypes *filetype_extent;
            /* now offset is local to this extent */

            /* find the block where offset is located, skip blocklens[i]==0 */
            for (i = 0; i < flat_file->count; i++) {
                ADIO_Offset dist;
                if (flat_file->blocklens[i] == 0)
                    continue;
                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
                /* fwr_size is from offset to the end of block i */
                if (dist == 0) {
                    i++;
                    offset = flat_file->indices[i];
                    fwr_size = flat_file->blocklens[i];
                    break;
                }
                if (dist > 0) {
                    fwr_size = dist;
                    break;
                }
            }
            st_index = i;       /* starting index in flat_file->indices[] */
            offset += disp + (ADIO_Offset) n_filetypes *filetype_extent;
        } else {
            n_etypes_in_filetype = filetype_size / etype_size;
            n_filetypes = offset / n_etypes_in_filetype;
            etype_in_filetype = offset % n_etypes_in_filetype;
            size_in_filetype = etype_in_filetype * etype_size;

            sum = 0;
            for (i = 0; i < flat_file->count; i++) {
                sum += flat_file->blocklens[i];
                if (sum > size_in_filetype) {
                    st_index = i;
                    fwr_size = sum - size_in_filetype;
                    abs_off_in_filetype = flat_file->indices[i] +
                        size_in_filetype - (sum - flat_file->blocklens[i]);
                    break;
                }
            }

            /* abs. offset in bytes in the file */
            offset = disp + (ADIO_Offset) n_filetypes *filetype_extent + abs_off_in_filetype;
        }

        start_off = offset;
        /* Wei-keng Liao:write request is within single flat_file contig block */
        /* this could happen, for example, with subarray types that are
         * actually fairly contiguous */
        if (buftype_is_contig && bufsize <= fwr_size) {
            /* though MPI api has an integer 'count' parameter, derived
             * datatypes might describe more bytes than can fit into an integer.
             * if we've made it this far, we can pass a count of original
             * datatypes, instead of a count of bytes (which might overflow)
             * Other WriteContig calls in this path are operating on data
             * sieving buffer */
            ADIO_WriteContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                             offset, status, error_code);

            if (file_ptr_type == ADIO_INDIVIDUAL) {
                /* update MPI-IO file pointer to point to the first byte
                 * that can be accessed in the fileview. */
                fd->fp_ind = offset + bufsize;
                if (bufsize == fwr_size) {
                    do {
                        st_index++;
                        if (st_index == flat_file->count) {
                            st_index = 0;
                            n_filetypes++;
                        }
                    } while (flat_file->blocklens[st_index] == 0);
                    fd->fp_ind = disp + flat_file->indices[st_index]
                    + (ADIO_Offset) n_filetypes *filetype_extent;
                }
            }
            fd->fp_sys_posn = -1;       /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
            MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
            goto fn_exit;
        }

        /* Calculate end_offset, the last byte-offset that will be accessed.
         * e.g., if start_offset=0 and 100 bytes to be write, end_offset=99 */

        st_fwr_size = fwr_size;
        st_n_filetypes = n_filetypes;
        i_offset = 0;
        j = st_index;
        off = offset;
        fwr_size = MPL_MIN(st_fwr_size, bufsize);
        while (i_offset < bufsize) {
            i_offset += fwr_size;
            end_offset = off + fwr_size - 1;

            j = (j + 1) % flat_file->count;
            n_filetypes += (j == 0) ? 1 : 0;
            while (flat_file->blocklens[j] == 0) {
                j = (j + 1) % flat_file->count;
                n_filetypes += (j == 0) ? 1 : 0;
            }

            off = disp + flat_file->indices[j] + n_filetypes * (ADIO_Offset) filetype_extent;
            fwr_size = MPL_MIN(flat_file->blocklens[j], bufsize - i_offset);
        }

/* if atomicity is true, lock the region to be accessed */
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);

        /* initial read for the read-modify-write */
        writebuf_off = offset;
        writebuf = (char *) ADIOI_Malloc(max_bufsize);
        memset(writebuf, -1, max_bufsize);
        writebuf_len = (int) (MPL_MIN(max_bufsize, end_offset - writebuf_off + 1));
        if (!(fd->atomicity))
            ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_lseek_a, 0, NULL);
#endif
        lseek(fd->fd_sys, writebuf_off, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_lseek_b, 0, NULL);
#endif
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
#endif
        err = read(fd->fd_sys, writebuf, writebuf_len);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
#endif
        if (err == -1) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_IO,
                                               "ADIOI_NFS_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.",
                                               0);
            goto fn_exit;
        }

        if (buftype_is_contig && !filetype_is_contig) {

/* contiguous in memory, noncontiguous in file. should be the most
   common case. */

            i_offset = 0;
            j = st_index;
            off = offset;
            n_filetypes = st_n_filetypes;
            fwr_size = MPL_MIN(st_fwr_size, bufsize);
            while (i_offset < bufsize) {
                if (fwr_size) {
                    /* TYPE_UB and TYPE_LB can result in
                     * fwr_size = 0. save system call in such cases */
                    /* lseek(fd->fd_sys, off, SEEK_SET);
                     * err = write(fd->fd_sys, ((char *) buf) + i, fwr_size); */

                    req_off = off;
                    req_len = fwr_size;
                    userbuf_off = i_offset;
                ADIOI_BUFFERED_WRITE}
                i_offset += fwr_size;

                if (off + fwr_size < disp + flat_file->indices[j] +
                    flat_file->blocklens[j] + n_filetypes * (ADIO_Offset) filetype_extent)
                    off += fwr_size;
                /* did not reach end of contiguous block in filetype.
                 * no more I/O needed. off is incremented by fwr_size. */
                else {
                    j = (j + 1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    while (flat_file->blocklens[j] == 0) {
                        j = (j + 1) % flat_file->count;
                        n_filetypes += (j == 0) ? 1 : 0;
                    }
                    off = disp + flat_file->indices[j] +
                        n_filetypes * (ADIO_Offset) filetype_extent;
                    fwr_size = MPL_MIN(flat_file->blocklens[j], bufsize - i_offset);
                }
            }
        } else {
PMPI_LOCAL int MPIR_Type_block(const int *array_of_gsizes,
                               int dim,
                               int ndims,
                               int nprocs,
                               int rank,
                               int darg,
                               int order,
                               MPI_Aint orig_extent,
                               MPI_Datatype type_old, MPI_Datatype * type_new, MPI_Aint * st_offset)
{
/* nprocs = no. of processes in dimension dim of grid
   rank = coordinate of this process in dimension dim */
    int mpi_errno, blksize, global_size, mysize, i, j;
    MPI_Aint stride;

    global_size = array_of_gsizes[dim];

    if (darg == MPI_DISTRIBUTE_DFLT_DARG)
        blksize = (global_size + nprocs - 1) / nprocs;
    else {
        blksize = darg;

#ifdef HAVE_ERROR_CHECKING
        if (blksize <= 0) {
            mpi_errno = MPIR_Err_create_code(MPI_SUCCESS,
                                             MPIR_ERR_RECOVERABLE,
                                             __func__,
                                             __LINE__,
                                             MPI_ERR_ARG,
                                             "**darrayblock", "**darrayblock %d", blksize);
            return mpi_errno;
        }
        if (blksize * nprocs < global_size) {
            mpi_errno = MPIR_Err_create_code(MPI_SUCCESS,
                                             MPIR_ERR_RECOVERABLE,
                                             __func__,
                                             __LINE__,
                                             MPI_ERR_ARG,
                                             "**darrayblock2",
                                             "**darrayblock2 %d %d", blksize * nprocs, global_size);
            return mpi_errno;
        }
#endif
    }

    j = global_size - blksize * rank;
    mysize = MPL_MIN(blksize, j);
    if (mysize < 0)
        mysize = 0;

    stride = orig_extent;
    if (order == MPI_ORDER_FORTRAN) {
        if (dim == 0) {
            mpi_errno = MPIR_Type_contiguous(mysize, type_old, type_new);
            /* --BEGIN ERROR HANDLING-- */
            if (mpi_errno != MPI_SUCCESS) {
                mpi_errno =
                    MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__,
                                         MPI_ERR_OTHER, "**fail", 0);
                return mpi_errno;
            }
            /* --END ERROR HANDLING-- */
        } else {
            for (i = 0; i < dim; i++)
                stride *= (MPI_Aint) (array_of_gsizes[i]);
            mpi_errno = MPIR_Type_vector(mysize, 1, stride, 1,  /* stride in bytes */
                                         type_old, type_new);
            /* --BEGIN ERROR HANDLING-- */
            if (mpi_errno != MPI_SUCCESS) {
                mpi_errno =
                    MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__,
                                         MPI_ERR_OTHER, "**fail", 0);
                return mpi_errno;
            }
            /* --END ERROR HANDLING-- */
        }
    } else {
        if (dim == ndims - 1) {
            mpi_errno = MPIR_Type_contiguous(mysize, type_old, type_new);
            /* --BEGIN ERROR HANDLING-- */
            if (mpi_errno != MPI_SUCCESS) {
                mpi_errno =
                    MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__,
                                         MPI_ERR_OTHER, "**fail", 0);
                return mpi_errno;
            }
            /* --END ERROR HANDLING-- */
        } else {
            for (i = ndims - 1; i > dim; i--)
                stride *= (MPI_Aint) (array_of_gsizes[i]);
            mpi_errno = MPIR_Type_vector(mysize, 1, stride, 1,  /* stride in bytes */
                                         type_old, type_new);
            /* --BEGIN ERROR HANDLING-- */
            if (mpi_errno != MPI_SUCCESS) {
                mpi_errno =
                    MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__,
                                         MPI_ERR_OTHER, "**fail", 0);
                return mpi_errno;
            }
            /* --END ERROR HANDLING-- */
        }
    }

    *st_offset = (MPI_Aint) blksize *(MPI_Aint) rank;
    /* in terms of no. of elements of type oldtype in this dimension */
    if (mysize == 0)
        *st_offset = 0;

    return MPI_SUCCESS;
}
PMPI_LOCAL int MPIR_Type_cyclic(const int *array_of_gsizes,
                                int dim,
                                int ndims,
                                int nprocs,
                                int rank,
                                int darg,
                                int order,
                                MPI_Aint orig_extent,
                                MPI_Datatype type_old,
                                MPI_Datatype * type_new, MPI_Aint * st_offset)
{
/* nprocs = no. of processes in dimension dim of grid
   rank = coordinate of this process in dimension dim */
    int mpi_errno, blksize, i, blklens[3], st_index, end_index, local_size, rem, count;
    MPI_Aint stride, disps[3];
    MPI_Datatype type_tmp, type_indexed, types[3];

    if (darg == MPI_DISTRIBUTE_DFLT_DARG)
        blksize = 1;
    else
        blksize = darg;

#ifdef HAVE_ERROR_CHECKING
    if (blksize <= 0) {
        mpi_errno = MPIR_Err_create_code(MPI_SUCCESS,
                                         MPIR_ERR_RECOVERABLE,
                                         __func__,
                                         __LINE__,
                                         MPI_ERR_ARG,
                                         "**darraycyclic", "**darraycyclic %d", blksize);
        return mpi_errno;
    }
#endif

    st_index = rank * blksize;
    end_index = array_of_gsizes[dim] - 1;

    if (end_index < st_index)
        local_size = 0;
    else {
        local_size = ((end_index - st_index + 1) / (nprocs * blksize)) * blksize;
        rem = (end_index - st_index + 1) % (nprocs * blksize);
        local_size += MPL_MIN(rem, blksize);
    }

    count = local_size / blksize;
    rem = local_size % blksize;

    stride = (MPI_Aint) nprocs *(MPI_Aint) blksize *orig_extent;
    if (order == MPI_ORDER_FORTRAN)
        for (i = 0; i < dim; i++)
            stride *= (MPI_Aint) (array_of_gsizes[i]);
    else
        for (i = ndims - 1; i > dim; i--)
            stride *= (MPI_Aint) (array_of_gsizes[i]);

    mpi_errno = MPIR_Type_vector(count, blksize, stride, 1,     /* stride in bytes */
                                 type_old, type_new);
    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno != MPI_SUCCESS) {
        mpi_errno =
            MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__, MPI_ERR_OTHER,
                                 "**fail", 0);
        return mpi_errno;
    }
    /* --END ERROR HANDLING-- */

    if (rem) {
        /* if the last block is of size less than blksize, include
         * it separately using MPI_Type_struct */

        types[0] = *type_new;
        types[1] = type_old;
        disps[0] = 0;
        disps[1] = (MPI_Aint) count *stride;
        blklens[0] = 1;
        blklens[1] = rem;

        mpi_errno = MPIR_Type_struct(2, blklens, disps, types, &type_tmp);
        MPIR_Type_free_impl(type_new);
        *type_new = type_tmp;

        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno != MPI_SUCCESS) {
            mpi_errno =
                MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__,
                                     MPI_ERR_OTHER, "**fail", 0);
            return mpi_errno;
        }
        /* --END ERROR HANDLING-- */
    }

    /* In the first iteration, we need to set the displacement in that
     * dimension correctly. */
    if (((order == MPI_ORDER_FORTRAN) && (dim == 0)) ||
        ((order == MPI_ORDER_C) && (dim == ndims - 1))) {
        disps[0] = 0;
        disps[1] = (MPI_Aint) rank *(MPI_Aint) blksize *orig_extent;
        disps[2] = orig_extent * (MPI_Aint) (array_of_gsizes[dim]);

/* Instead of using MPI_LB/MPI_UB, which have been removed from MPI in MPI-3,
   use MPI_Type_create_resized. Use hindexed_block to set the starting displacement
   of the datatype (disps[1]) and type_create_resized to set lb to 0 (disps[0])
   and extent to disps[2], which makes ub = disps[2].
 */
        mpi_errno = MPIR_Type_blockindexed(1, 1, &disps[1], 1,  /* 1 means disp is in bytes */
                                           *type_new, &type_indexed);

        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno != MPI_SUCCESS) {
            mpi_errno =
                MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__,
                                     MPI_ERR_OTHER, "**fail", 0);
            return mpi_errno;
        }
        /* --END ERROR HANDLING-- */

        mpi_errno = MPIR_Type_create_resized(type_indexed, 0, disps[2], &type_tmp);

        MPIR_Type_free_impl(&type_indexed);
        MPIR_Type_free_impl(type_new);
        *type_new = type_tmp;

        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno != MPI_SUCCESS) {
            mpi_errno =
                MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__,
                                     MPI_ERR_OTHER, "**fail", 0);
            return mpi_errno;
        }
        /* --END ERROR HANDLING-- */

        *st_offset = 0; /* set it to 0 because it is taken care of in
                         * the struct above */
    } else {
        *st_offset = (MPI_Aint) rank *(MPI_Aint) blksize;
        /* st_offset is in terms of no. of elements of type oldtype in
         * this dimension */
    }

    if (local_size == 0)
        *st_offset = 0;

    return MPI_SUCCESS;
}
Exemple #7
0
static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model, MPIR_Info * info,
                    MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int i;
    MPIR_Comm *win_comm_ptr;
    int win_target_pool_size;
    MPIR_CHKPMEM_DECL(5);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_WIN_INIT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_WIN_INIT);

    MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    if (initRMAoptions) {

        MPIDI_CH3_RMA_Init_sync_pvars();
        MPIDI_CH3_RMA_Init_pkthandler_pvars();

        initRMAoptions = 0;
    }
    MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);

    *win_ptr = (MPIR_Win *) MPIR_Handle_obj_alloc(&MPIR_Win_mem);
    MPIR_ERR_CHKANDJUMP1(!(*win_ptr), mpi_errno, MPI_ERR_OTHER, "**nomem",
                         "**nomem %s", "MPIR_Win_mem");

    mpi_errno = MPIR_Comm_dup_impl(comm_ptr, &win_comm_ptr);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    MPIR_Object_set_ref(*win_ptr, 1);

    /* (*win_ptr)->errhandler is set by upper level; */
    /* (*win_ptr)->base is set by caller; */
    (*win_ptr)->size = size;
    (*win_ptr)->disp_unit = disp_unit;
    (*win_ptr)->create_flavor = create_flavor;
    (*win_ptr)->model = model;
    (*win_ptr)->attributes = NULL;
    (*win_ptr)->comm_ptr = win_comm_ptr;

    (*win_ptr)->at_completion_counter = 0;
    (*win_ptr)->shm_base_addrs = NULL;
    /* (*win_ptr)->basic_info_table[] is set by caller; */
    (*win_ptr)->current_lock_type = MPID_LOCK_NONE;
    (*win_ptr)->shared_lock_ref_cnt = 0;
    (*win_ptr)->target_lock_queue_head = NULL;
    (*win_ptr)->shm_allocated = FALSE;
    (*win_ptr)->states.access_state = MPIDI_RMA_NONE;
    (*win_ptr)->states.exposure_state = MPIDI_RMA_NONE;
    (*win_ptr)->num_targets_with_pending_net_ops = 0;
    (*win_ptr)->start_ranks_in_win_grp = NULL;
    (*win_ptr)->start_grp_size = 0;
    (*win_ptr)->lock_all_assert = 0;
    (*win_ptr)->lock_epoch_count = 0;
    (*win_ptr)->outstanding_locks = 0;
    (*win_ptr)->current_target_lock_data_bytes = 0;
    (*win_ptr)->sync_request_cnt = 0;
    (*win_ptr)->active = FALSE;
    (*win_ptr)->next = NULL;
    (*win_ptr)->prev = NULL;
    (*win_ptr)->outstanding_acks = 0;

    /* Initialize the info flags */
    (*win_ptr)->info_args.no_locks = 0;
    (*win_ptr)->info_args.accumulate_ordering = MPIDI_ACC_ORDER_RAR | MPIDI_ACC_ORDER_RAW |
        MPIDI_ACC_ORDER_WAR | MPIDI_ACC_ORDER_WAW;
    (*win_ptr)->info_args.accumulate_ops = MPIDI_ACC_OPS_SAME_OP_NO_OP;
    (*win_ptr)->info_args.same_size = 0;
    (*win_ptr)->info_args.same_disp_unit = FALSE;
    (*win_ptr)->info_args.alloc_shared_noncontig = 0;
    (*win_ptr)->info_args.alloc_shm = FALSE;
    if ((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE ||
        (*win_ptr)->create_flavor == MPI_WIN_FLAVOR_SHARED) {
        (*win_ptr)->info_args.alloc_shm = TRUE;
    }

    /* Set info_args on window based on info provided by user */
    mpi_errno = MPID_Win_set_info((*win_ptr), info);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    MPIR_CHKPMEM_MALLOC((*win_ptr)->op_pool_start, MPIDI_RMA_Op_t *,
                        sizeof(MPIDI_RMA_Op_t) * MPIR_CVAR_CH3_RMA_OP_WIN_POOL_SIZE, mpi_errno,
                        "RMA op pool", MPL_MEM_RMA);
    (*win_ptr)->op_pool_head = NULL;
    for (i = 0; i < MPIR_CVAR_CH3_RMA_OP_WIN_POOL_SIZE; i++) {
        (*win_ptr)->op_pool_start[i].pool_type = MPIDI_RMA_POOL_WIN;
        DL_APPEND((*win_ptr)->op_pool_head, &((*win_ptr)->op_pool_start[i]));
    }

    win_target_pool_size =
        MPL_MIN(MPIR_CVAR_CH3_RMA_TARGET_WIN_POOL_SIZE, MPIR_Comm_size(win_comm_ptr));
    MPIR_CHKPMEM_MALLOC((*win_ptr)->target_pool_start, MPIDI_RMA_Target_t *,
                        sizeof(MPIDI_RMA_Target_t) * win_target_pool_size, mpi_errno,
                        "RMA target pool", MPL_MEM_RMA);
    (*win_ptr)->target_pool_head = NULL;
    for (i = 0; i < win_target_pool_size; i++) {
        (*win_ptr)->target_pool_start[i].pool_type = MPIDI_RMA_POOL_WIN;
        DL_APPEND((*win_ptr)->target_pool_head, &((*win_ptr)->target_pool_start[i]));
    }

    (*win_ptr)->num_slots = MPL_MIN(MPIR_CVAR_CH3_RMA_SLOTS_SIZE, MPIR_Comm_size(win_comm_ptr));
    MPIR_CHKPMEM_MALLOC((*win_ptr)->slots, MPIDI_RMA_Slot_t *,
                        sizeof(MPIDI_RMA_Slot_t) * (*win_ptr)->num_slots, mpi_errno, "RMA slots",
                        MPL_MEM_RMA);
    for (i = 0; i < (*win_ptr)->num_slots; i++) {
        (*win_ptr)->slots[i].target_list_head = NULL;
    }

    MPIR_CHKPMEM_MALLOC((*win_ptr)->target_lock_entry_pool_start,
                        MPIDI_RMA_Target_lock_entry_t *,
                        sizeof(MPIDI_RMA_Target_lock_entry_t) *
                        MPIR_CVAR_CH3_RMA_TARGET_LOCK_ENTRY_WIN_POOL_SIZE, mpi_errno,
                        "RMA lock entry pool", MPL_MEM_RMA);
    (*win_ptr)->target_lock_entry_pool_head = NULL;
    for (i = 0; i < MPIR_CVAR_CH3_RMA_TARGET_LOCK_ENTRY_WIN_POOL_SIZE; i++) {
        DL_APPEND((*win_ptr)->target_lock_entry_pool_head,
                      &((*win_ptr)->target_lock_entry_pool_start[i]));
    }

    if (MPIDI_RMA_Win_inactive_list_head == NULL && MPIDI_RMA_Win_active_list_head == NULL) {
        /* this is the first window, register RMA progress hook */
        mpi_errno = MPID_Progress_register_hook(MPIDI_CH3I_RMA_Make_progress_global,
                                                &MPIDI_CH3I_RMA_Progress_hook_id);
        if (mpi_errno) {
            MPIR_ERR_POP(mpi_errno);
        }
    }

    DL_APPEND(MPIDI_RMA_Win_inactive_list_head, (*win_ptr));

    if (MPIDI_CH3U_Win_hooks.win_init != NULL) {
        mpi_errno =
            MPIDI_CH3U_Win_hooks.win_init(size, disp_unit, create_flavor, model, info, comm_ptr,
                                          win_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_WIN_INIT);
    return mpi_errno;
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}
Exemple #8
0
void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
                            * flat_buf, char **send_buf, ADIO_Offset
                            * offset_list, ADIO_Offset * len_list, int *send_size,
                            MPI_Request * requests, int *sent_to_proc,
                            int nprocs, int myrank,
                            int contig_access_count,
                            ADIO_Offset min_st_offset, ADIO_Offset fd_size,
                            ADIO_Offset * fd_start, ADIO_Offset * fd_end,
                            int *send_buf_idx, int *curr_to_proc,
                            int *done_to_proc, int iter, MPI_Aint buftype_extent)
{
/* this function is only called if buftype is not contig */

    int i, p, flat_buf_idx;
    ADIO_Offset flat_buf_sz, size_in_buf, buf_incr, size;
    int jj, n_buftypes;
    ADIO_Offset off, len, rem_len, user_buf_idx;

/*  curr_to_proc[p] = amount of data sent to proc. p that has already
    been accounted for so far
    done_to_proc[p] = amount of data already sent to proc. p in
    previous iterations
    user_buf_idx = current location in user buffer
    send_buf_idx[p] = current location in send_buf of proc. p  */

    for (i = 0; i < nprocs; i++) {
        send_buf_idx[i] = curr_to_proc[i] = 0;
        done_to_proc[i] = sent_to_proc[i];
    }
    jj = 0;

    user_buf_idx = flat_buf->indices[0];
    flat_buf_idx = 0;
    n_buftypes = 0;
    flat_buf_sz = flat_buf->blocklens[0];

    /* flat_buf_idx = current index into flattened buftype
     * flat_buf_sz = size of current contiguous component in
     * flattened buf */

    for (i = 0; i < contig_access_count; i++) {
        off = offset_list[i];
        rem_len = len_list[i];

        /*this request may span the file domains of more than one process */
        while (rem_len != 0) {
            len = rem_len;
            /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no
             * longer than the single region that processor "p" is responsible
             * for.
             */
            p = ADIOI_Calc_aggregator(fd, off, min_st_offset, &len, fd_size, fd_start, fd_end);

            if (send_buf_idx[p] < send_size[p]) {
                if (curr_to_proc[p] + len > done_to_proc[p]) {
                    if (done_to_proc[p] > curr_to_proc[p]) {
                        size = MPL_MIN(curr_to_proc[p] + len -
                                       done_to_proc[p], send_size[p] - send_buf_idx[p]);
                        buf_incr = done_to_proc[p] - curr_to_proc[p];
                        ADIOI_BUF_INCR
                            ADIOI_Assert((curr_to_proc[p] + len - done_to_proc[p]) ==
                                         (unsigned) (curr_to_proc[p] + len - done_to_proc[p]));
                        buf_incr = curr_to_proc[p] + len - done_to_proc[p];
                        ADIOI_Assert((done_to_proc[p] + size) ==
                                     (unsigned) (done_to_proc[p] + size));
                        /* ok to cast: bounded by cb buffer size */
                        curr_to_proc[p] = done_to_proc[p] + (int) size;
                    ADIOI_BUF_COPY} else {
                        size = MPL_MIN(len, send_size[p] - send_buf_idx[p]);
                        buf_incr = len;
                        ADIOI_Assert((curr_to_proc[p] + size) ==
                                     (unsigned) ((ADIO_Offset) curr_to_proc[p] + size));
                        curr_to_proc[p] += size;
                    ADIOI_BUF_COPY}
                    if (send_buf_idx[p] == send_size[p]) {
                        MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p,
                                  myrank + p + 100 * iter, fd->comm, requests + jj);
                        jj++;
                    }
                } else {
                    ADIOI_Assert((curr_to_proc[p] + len) ==
                                 (unsigned) ((ADIO_Offset) curr_to_proc[p] + len));
                    curr_to_proc[p] += len;
                    buf_incr = len;
                ADIOI_BUF_INCR}
            } else {
Exemple #9
0
/* Copied from ADIOI_PVFS2_OldWriteStrided.  It would be good to have fewer
 * copies of this code... */
void ADIOI_ZOIDFS_WriteStrided(ADIO_File fd, void *buf, int count,
			MPI_Datatype datatype, int file_ptr_type,
			ADIO_Offset offset, ADIO_Status *status,
			int *error_code)
{
    /* as with all the other WriteStrided functions, offset is in units of
     * etype relative to the filetype */

    /* Since zoidfs does not support file locking, can't do buffered writes
       as on Unix */

    ADIOI_Flatlist_node *flat_buf, *flat_file;
    int i, j, k, bwr_size, fwr_size=0, st_index=0;
    int sum, n_etypes_in_filetype, size_in_filetype;
    MPI_Count bufsize;
    int n_filetypes, etype_in_filetype;
    ADIO_Offset abs_off_in_filetype=0;
    MPI_Count filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset off, disp, start_off, initial_off;
    int flag, st_fwr_size, st_n_filetypes;
    int err_flag=0;

    size_t mem_list_count, file_list_count;
    const void ** mem_offsets;
    uint64_t *file_offsets;
    size_t *mem_lengths;
    uint64_t *file_lengths;
    int total_blks_to_write;

    int max_mem_list, max_file_list;

    int b_blks_wrote;
    int f_data_wrote;
    int size_wrote=0, n_write_lists, extra_blks;

    int end_bwr_size, end_fwr_size;
    int start_k, start_j, new_file_write, new_buffer_write;
    int start_mem_offset;
    ADIOI_ZOIDFS_object *zoidfs_obj_ptr;
    MPI_Offset total_bytes_written=0;
    static char myname[] = "ADIOI_ZOIDFS_WRITESTRIDED";

    /* note: I don't know what zoidfs will do if you pass it a super-long list,
     * so let's keep with the PVFS limit for now */
#define MAX_ARRAY_SIZE 64

    /* --BEGIN ERROR HANDLING-- */
    if (fd->atomicity) {
	*error_code = MPIO_Err_create_code(MPI_SUCCESS,
					   MPIR_ERR_RECOVERABLE,
					   myname, __LINE__,
					   MPI_ERR_ARG,
					   "Atomic noncontiguous writes are not supported by ZOIDFS", 0);
	return;
    }
    /* --END ERROR HANDLING-- */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    /* the HDF5 tests showed a bug in this list processing code (see many many
     * lines down below).  We added a workaround, but common HDF5 file types
     * are actually contiguous and do not need the expensive workarond */
    if (!filetype_is_contig) {
	flat_file = ADIOI_Flatlist;
	while (flat_file->type != fd->filetype) flat_file = flat_file->next;
	if (flat_file->count == 1 && !buftype_is_contig)
	    filetype_is_contig = 1;
    }

    MPI_Type_size_x(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
	MPIR_Status_set_bytes(status, datatype, 0);
#endif
	*error_code = MPI_SUCCESS; 
	return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size_x(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;
    
    bufsize = buftype_size * count;

    zoidfs_obj_ptr = (ADIOI_ZOIDFS_object*)fd->fs_ptr;

    if (!buftype_is_contig && filetype_is_contig) {

/* noncontiguous in memory, contiguous in file.  */
        uint64_t file_offsets;
	uint64_t file_lengths;

	flat_buf = ADIOI_Flatten_and_find(datatype);
	
	if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
	    off = fd->disp + etype_size * offset;
	}
	else off = fd->fp_ind;

	file_list_count = 1;
	file_offsets = off;
	file_lengths = 0;
	total_blks_to_write = count*flat_buf->count;
	b_blks_wrote = 0;

	/* allocate arrays according to max usage */
	if (total_blks_to_write > MAX_ARRAY_SIZE)
	    mem_list_count = MAX_ARRAY_SIZE;
	else mem_list_count = total_blks_to_write;
	mem_offsets = (void*)ADIOI_Malloc(mem_list_count*sizeof(void*));
	mem_lengths = (size_t*)ADIOI_Malloc(mem_list_count*sizeof(size_t));

	j = 0;
	/* step through each block in memory, filling memory arrays */
	while (b_blks_wrote < total_blks_to_write) {
	    for (i=0; i<flat_buf->count; i++) {
		mem_offsets[b_blks_wrote % MAX_ARRAY_SIZE] = 
		    buf + 
		     j*buftype_extent + 
		     flat_buf->indices[i];
		mem_lengths[b_blks_wrote % MAX_ARRAY_SIZE] = 
		    flat_buf->blocklens[i];
		file_lengths += flat_buf->blocklens[i];
		b_blks_wrote++;
		if (!(b_blks_wrote % MAX_ARRAY_SIZE) ||
		    (b_blks_wrote == total_blks_to_write)) {

		    /* in the case of the last write list call,
		       adjust mem_list_count */
		    if (b_blks_wrote == total_blks_to_write) {
		        mem_list_count = total_blks_to_write % MAX_ARRAY_SIZE;
			/* in case last write list call fills max arrays */
			if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE;
		    }
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
		    NO_STALE(err_flag, fd, zoidfs_obj_ptr,
				    zoidfs_write(zoidfs_obj_ptr, 
					    mem_list_count,
					    mem_offsets, mem_lengths, 
					    1, &file_offsets, &file_lengths, ZOIDFS_NO_OP_HINT));

		    /* --BEGIN ERROR HANDLING-- */
		    if (err_flag != ZFS_OK) {
			*error_code = MPIO_Err_create_code(MPI_SUCCESS,
							   MPIR_ERR_RECOVERABLE,
							   myname, __LINE__,
							   ADIOI_ZOIDFS_error_convert(err_flag),
							   "Error in zoidfs_write", 0);
			break;
		    }
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
		    total_bytes_written += file_lengths;
		  
		    /* in the case of error or the last write list call, 
		     * leave here */
		    /* --BEGIN ERROR HANDLING-- */
		    if (err_flag) {
			*error_code = MPIO_Err_create_code(MPI_SUCCESS,
							   MPIR_ERR_RECOVERABLE,
							   myname, __LINE__,
							   ADIOI_ZOIDFS_error_convert(err_flag),
							   "Error in zoidfs_write", 0);
			break;
		    }
		    /* --END ERROR HANDLING-- */
		    if (b_blks_wrote == total_blks_to_write) break;

		    file_offsets += file_lengths;
		    file_lengths = 0;
		} 
	    } /* for (i=0; i<flat_buf->count; i++) */
	    j++;
	} /* while (b_blks_wrote < total_blks_to_write) */
	ADIOI_Free(mem_offsets);
	ADIOI_Free(mem_lengths);

	if (file_ptr_type == ADIO_INDIVIDUAL) 
	    fd->fp_ind += total_bytes_written;

	if (!err_flag)  *error_code = MPI_SUCCESS;

	fd->fp_sys_posn = -1;   /* clear this. */

#ifdef HAVE_STATUS_SET_BYTES
	MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to 
   keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif

	ADIOI_Delete_flattened(datatype);
	return;
    } /* if (!buftype_is_contig && filetype_is_contig) */

    /* already know that file is noncontiguous from above */
    /* noncontiguous in file */

/* filetype already flattened in ADIO_Open */
    flat_file = ADIOI_Flatlist;
    while (flat_file->type != fd->filetype) flat_file = flat_file->next;

    disp = fd->disp;
    initial_off = offset;

    /* for each case - ADIO_Individual pointer or explicit, find offset
       (file offset in bytes), n_filetypes (how many filetypes into file 
       to start), fwr_size (remaining amount of data in present file
       block), and st_index (start point in terms of blocks in starting
       filetype) */
    if (file_ptr_type == ADIO_INDIVIDUAL) {
        offset = fd->fp_ind; /* in bytes */
	n_filetypes = -1;
	flag = 0;
	while (!flag) {
	    n_filetypes++;
	    for (i=0; i<flat_file->count; i++) {
	        if (disp + flat_file->indices[i] + 
		    ((ADIO_Offset) n_filetypes)*filetype_extent +
		      flat_file->blocklens[i] >= offset) {
		  st_index = i;
		  fwr_size = disp + flat_file->indices[i] + 
		    ((ADIO_Offset) n_filetypes)*filetype_extent
		    + flat_file->blocklens[i] - offset;
		  flag = 1;
		  break;
		}
	    }
	} /* while (!flag) */
    } /* if (file_ptr_type == ADIO_INDIVIDUAL) */
    else {
        n_etypes_in_filetype = filetype_size/etype_size;
	n_filetypes = (int) (offset / n_etypes_in_filetype);
	etype_in_filetype = (int) (offset % n_etypes_in_filetype);
	size_in_filetype = etype_in_filetype * etype_size;
	
	sum = 0;
	for (i=0; i<flat_file->count; i++) {
	    sum += flat_file->blocklens[i];
	    if (sum > size_in_filetype) {
	        st_index = i;
		fwr_size = sum - size_in_filetype;
		abs_off_in_filetype = flat_file->indices[i] +
		    size_in_filetype - (sum - flat_file->blocklens[i]);
		break;
	    }
	}

	/* abs. offset in bytes in the file */
	offset = disp + ((ADIO_Offset) n_filetypes)*filetype_extent +
	    abs_off_in_filetype;
    } /* else [file_ptr_type != ADIO_INDIVIDUAL] */

    start_off = offset;
    st_fwr_size = fwr_size;
    st_n_filetypes = n_filetypes;
    
    if (buftype_is_contig && !filetype_is_contig) {

/* contiguous in memory, noncontiguous in file. should be the most
   common case. */

	/* only one memory off-len pair, so no array */
        size_t mem_lengths;
	size_t mem_offsets;
        
	i = 0;
	j = st_index;
	off = offset;
	n_filetypes = st_n_filetypes;
        
	mem_list_count = 1;
        
	/* determine how many blocks in file to write */
	f_data_wrote = MPL_MIN(st_fwr_size, bufsize);
	total_blks_to_write = 1;
	if (j < (flat_file->count -1)) j++;
	else {
	    j = 0;
	    n_filetypes++;
	}
	while (f_data_wrote < bufsize) {
	    f_data_wrote += flat_file->blocklens[j];
	    total_blks_to_write++;
	    if (j<(flat_file->count-1)) j++;
	    else j = 0; 
	}
	    
	j = st_index;
	n_filetypes = st_n_filetypes;
	n_write_lists = total_blks_to_write/MAX_ARRAY_SIZE;
	extra_blks = total_blks_to_write%MAX_ARRAY_SIZE;
        
	mem_offsets = (size_t)buf;
	mem_lengths = 0;
        
	/* if at least one full writelist, allocate file arrays
	   at max array size and don't free until very end */
	if (n_write_lists) {
	    file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
						  sizeof(int64_t));
	    file_lengths = (uint64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
						  sizeof(uint64_t));
	}
	/* if there's no full writelist allocate file arrays according
	   to needed size (extra_blks) */
	else {
	    file_offsets = (int64_t*)ADIOI_Malloc(extra_blks*
                                                  sizeof(int64_t));
            file_lengths = (uint64_t*)ADIOI_Malloc(extra_blks*
                                                  sizeof(uint64_t));
        }
        
        /* for file arrays that are of MAX_ARRAY_SIZE, build arrays */
        for (i=0; i<n_write_lists; i++) {
            file_list_count = MAX_ARRAY_SIZE;
            if(!i) {
                file_offsets[0] = offset;
                file_lengths[0] = st_fwr_size;
                mem_lengths = st_fwr_size;
            }
            for (k=0; k<MAX_ARRAY_SIZE; k++) {
                if (i || k) {
                    file_offsets[k] = disp + 
			((ADIO_Offset)n_filetypes)*filetype_extent
			+ flat_file->indices[j];
                    file_lengths[k] = flat_file->blocklens[j];
                    mem_lengths += file_lengths[k];
                }
                if (j<(flat_file->count - 1)) j++;
                else {
                    j = 0;
                    n_filetypes++;
                }
            } /* for (k=0; k<MAX_ARRAY_SIZE; k++) */
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
	    NO_STALE(err_flag, fd, zoidfs_obj_ptr,
			    zoidfs_write(zoidfs_obj_ptr,
				    1, buf, &mem_lengths,
				    file_list_count, 
				    file_offsets, file_lengths, ZOIDFS_NO_OP_HINT));

#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
	    /* --BEGIN ERROR HANDLING-- */
	    if (err_flag != ZFS_OK) {
		*error_code = MPIO_Err_create_code(MPI_SUCCESS,
						   MPIR_ERR_RECOVERABLE,
						   myname, __LINE__,
						   ADIOI_ZOIDFS_error_convert(err_flag),
						   "Error in zoidfs_write", 0);
		goto error_state;
	    }
	    /* --END ERROR HANDLING-- */
	    total_bytes_written += mem_lengths;

            mem_offsets += mem_lengths;
            mem_lengths = 0;

        } /* for (i=0; i<n_write_lists; i++) */

        /* for file arrays smaller than MAX_ARRAY_SIZE (last write_list call) */
        if (extra_blks) {
            file_list_count = extra_blks;
            if(!i) {
                file_offsets[0] = offset;
                file_lengths[0] = MPL_MIN(st_fwr_size, bufsize);
            }
            for (k=0; k<extra_blks; k++) {
                if(i || k) {
                    file_offsets[k] = disp + 
			((ADIO_Offset)n_filetypes)*filetype_extent +
			flat_file->indices[j];
		    /* XXX: double-check these casts  */
                    if (k == (extra_blks - 1)) {
                        file_lengths[k] = bufsize 
				- mem_lengths - mem_offsets +  (size_t)buf;
                    }
                    else file_lengths[k] = flat_file->blocklens[j];
                } /* if(i || k) */
                mem_lengths += file_lengths[k];
                if (j<(flat_file->count - 1)) j++;
                else {
                    j = 0;
                    n_filetypes++;
                }
            } /* for (k=0; k<extra_blks; k++) */

#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
	    NO_STALE(err_flag, fd, zoidfs_obj_ptr, 
			    zoidfs_write(zoidfs_obj_ptr, 1, 
				    (const void **)&mem_offsets, 
				    &mem_lengths,
				    file_list_count, 
				    file_offsets, file_lengths, ZOIDFS_NO_OP_HINT));

#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
	    /* --BEGIN ERROR HANDLING-- */
	    if (err_flag != 0) {
		*error_code = MPIO_Err_create_code(MPI_SUCCESS,
						   MPIR_ERR_RECOVERABLE,
						   myname, __LINE__,
						   ADIOI_ZOIDFS_error_convert(err_flag),
						   "Error in zoidfs_write", 0);
		goto error_state;
	    }
	    /* --END ERROR HANDLING-- */
	    total_bytes_written += mem_lengths;
        }
    } 
    else {
        /* noncontiguous in memory as well as in file */

	flat_buf = ADIOI_Flatten_and_find(datatype);

	size_wrote = 0;
	n_filetypes = st_n_filetypes;
	fwr_size = st_fwr_size;
	bwr_size = flat_buf->blocklens[0];
	buf_count = 0;
	start_mem_offset = 0;
	start_k = k = 0;
	start_j = st_index;
	max_mem_list = 0;
	max_file_list = 0;

	/* run through and file max_file_list and max_mem_list so that you 
	   can allocate the file and memory arrays less than MAX_ARRAY_SIZE
	   if possible */

	while (size_wrote < bufsize) {
	    k = start_k;
	    new_buffer_write = 0;
	    mem_list_count = 0;
	    while ((mem_list_count < MAX_ARRAY_SIZE) && 
		   (new_buffer_write < bufsize-size_wrote)) {
	        /* find mem_list_count and file_list_count such that both are
		   less than MAX_ARRAY_SIZE, the sum of their lengths are
		   equal, and the sum of all the data written and data to be
		   written in the next immediate write list is less than
		   bufsize */
	        if(mem_list_count) {
		    if((new_buffer_write + flat_buf->blocklens[k] + 
			size_wrote) > bufsize) {
		        end_bwr_size = new_buffer_write + 
			    flat_buf->blocklens[k] - (bufsize - size_wrote);
			new_buffer_write = bufsize - size_wrote;
		    }
		    else {
		        new_buffer_write += flat_buf->blocklens[k];
			end_bwr_size = flat_buf->blocklens[k];
		    }
		}
		else {
		    if (bwr_size > (bufsize - size_wrote)) {
		        new_buffer_write = bufsize - size_wrote;
			bwr_size = new_buffer_write;
		    }
		    else new_buffer_write = bwr_size;
		}
		mem_list_count++;
		k = (k + 1)%flat_buf->count;
	     } /* while ((mem_list_count < MAX_ARRAY_SIZE) && 
	       (new_buffer_write < bufsize-size_wrote)) */
	    j = start_j;
	    new_file_write = 0;
	    file_list_count = 0;
	    while ((file_list_count < MAX_ARRAY_SIZE) && 
		   (new_file_write < new_buffer_write)) { 
	        if(file_list_count) {
		    if((new_file_write + flat_file->blocklens[j]) > 
		       new_buffer_write) {
		        end_fwr_size = new_buffer_write - new_file_write;
			new_file_write = new_buffer_write;
			j--;
		    }
		    else {
		        new_file_write += flat_file->blocklens[j];
			end_fwr_size = flat_file->blocklens[j];
		    }
		}
		else {
		    if (fwr_size > new_buffer_write) {
		        new_file_write = new_buffer_write;
			fwr_size = new_file_write;
		    }
		    else new_file_write = fwr_size;
		}
		file_list_count++;
		if (j < (flat_file->count - 1)) j++;
		else j = 0;
		
		k = start_k;
		if ((new_file_write < new_buffer_write) && 
		    (file_list_count == MAX_ARRAY_SIZE)) {
		    new_buffer_write = 0;
		    mem_list_count = 0;
		    while (new_buffer_write < new_file_write) {
		        if(mem_list_count) {
			    if((new_buffer_write + flat_buf->blocklens[k]) >
			       new_file_write) {
			        end_bwr_size = new_file_write - 
				    new_buffer_write;
				new_buffer_write = new_file_write;
				k--;
			    }
			    else {
			        new_buffer_write += flat_buf->blocklens[k];
				end_bwr_size = flat_buf->blocklens[k];
			    }
			}
			else {
			    new_buffer_write = bwr_size;
			    if (bwr_size > (bufsize - size_wrote)) {
			        new_buffer_write = bufsize - size_wrote;
				bwr_size = new_buffer_write;
			    }
			}
			mem_list_count++;
			k = (k + 1)%flat_buf->count;
		    } /* while (new_buffer_write < new_file_write) */
		} /* if ((new_file_write < new_buffer_write) &&
		     (file_list_count == MAX_ARRAY_SIZE)) */
	    } /* while ((mem_list_count < MAX_ARRAY_SIZE) && 
		 (new_buffer_write < bufsize-size_wrote)) */

	    /*  fakes filling the writelist arrays of lengths found above  */
	    k = start_k;
	    j = start_j;
	    for (i=0; i<mem_list_count; i++) {	     
		if(i) {
		    if (i == (mem_list_count - 1)) {
			if (flat_buf->blocklens[k] == end_bwr_size)
			    bwr_size = flat_buf->blocklens[(k+1)%
							  flat_buf->count];
			else {
			    bwr_size = flat_buf->blocklens[k] - end_bwr_size;
			    k--;
			    buf_count--;
			}
		    }
		}
		buf_count++;
		k = (k + 1)%flat_buf->count;
	    } /* for (i=0; i<mem_list_count; i++) */
	    for (i=0; i<file_list_count; i++) {
		if (i) {
		    if (i == (file_list_count - 1)) {
			if (flat_file->blocklens[j] == end_fwr_size)
			    fwr_size = flat_file->blocklens[(j+1)%
							  flat_file->count];   
			else {
			    fwr_size = flat_file->blocklens[j] - end_fwr_size;
			    j--;
			}
		    }
		}
		if (j < flat_file->count - 1) j++;
		else {
		    j = 0;
		    n_filetypes++;
		}
	    } /* for (i=0; i<file_list_count; i++) */
	    size_wrote += new_buffer_write;
	    start_k = k;
	    start_j = j;
	    if (max_mem_list < mem_list_count)
	        max_mem_list = mem_list_count;
	    if (max_file_list < file_list_count)
	        max_file_list = file_list_count;
	} /* while (size_wrote < bufsize) */

	/* one last check before we actually carry out the operation:
	 * this code has hard-to-fix bugs when a noncontiguous file type has
	 * such large pieces that the sum of the lengths of the memory type is
	 * not larger than one of those pieces (and vice versa for large memory
	 * types and many pices of file types.  In these cases, give up and
	 * fall back to naive reads and writes.  The testphdf5 test created a
	 * type with two very large memory regions and 600 very small file
	 * regions.  The same test also created a type with one very large file
	 * region and many (700) very small memory regions.  both cases caused
	 * problems for this code */

	if ( ( (file_list_count == 1) && 
		    (new_file_write < flat_file->blocklens[0] ) ) ||
		((mem_list_count == 1) && 
		    (new_buffer_write < flat_buf->blocklens[0]) ) ||
		((file_list_count == MAX_ARRAY_SIZE) && 
		    (new_file_write < flat_buf->blocklens[0]) ) ||
		( (mem_list_count == MAX_ARRAY_SIZE) &&
		    (new_buffer_write < flat_file->blocklens[0])) )
	{
	    ADIOI_Delete_flattened(datatype);
	    ADIOI_GEN_WriteStrided_naive(fd, buf, count, datatype,
		    file_ptr_type, initial_off, status, error_code);
	    return;
	}


	mem_offsets = (void *)ADIOI_Malloc(max_mem_list*sizeof(void *));
	mem_lengths = (size_t*)ADIOI_Malloc(max_mem_list*sizeof(size_t));
	file_offsets = (uint64_t *)ADIOI_Malloc(max_file_list*sizeof(uint64_t));
	file_lengths = (uint64_t*)ADIOI_Malloc(max_file_list*sizeof(uint64_t));
	    
	size_wrote = 0;
	n_filetypes = st_n_filetypes;
	fwr_size = st_fwr_size;
	bwr_size = flat_buf->blocklens[0];
	buf_count = 0;
	start_mem_offset = 0;
	start_k = k = 0;
	start_j = st_index;

	/*  this section calculates mem_list_count and file_list_count
	    and also finds the possibly odd sized last array elements
	    in new_fwr_size and new_bwr_size  */
	
	while (size_wrote < bufsize) {
	    k = start_k;
	    new_buffer_write = 0;
	    mem_list_count = 0;
	    while ((mem_list_count < MAX_ARRAY_SIZE) && 
		   (new_buffer_write < bufsize-size_wrote)) {
	        /* find mem_list_count and file_list_count such that both are
		   less than MAX_ARRAY_SIZE, the sum of their lengths are
		   equal, and the sum of all the data written and data to be
		   written in the next immediate write list is less than
		   bufsize */
	        if(mem_list_count) {
		    if((new_buffer_write + flat_buf->blocklens[k] + 
			size_wrote) > bufsize) {
		        end_bwr_size = new_buffer_write + 
			    flat_buf->blocklens[k] - (bufsize - size_wrote);
			new_buffer_write = bufsize - size_wrote;
		    }
		    else {
		        new_buffer_write += flat_buf->blocklens[k];
			end_bwr_size = flat_buf->blocklens[k];
		    }
		}
		else {
		    if (bwr_size > (bufsize - size_wrote)) {
		        new_buffer_write = bufsize - size_wrote;
			bwr_size = new_buffer_write;
		    }
		    else new_buffer_write = bwr_size;
		}
		mem_list_count++;
		k = (k + 1)%flat_buf->count;
	     } /* while ((mem_list_count < MAX_ARRAY_SIZE) && 
	       (new_buffer_write < bufsize-size_wrote)) */
	    j = start_j;
	    new_file_write = 0;
	    file_list_count = 0;
	    while ((file_list_count < MAX_ARRAY_SIZE) && 
		   (new_file_write < new_buffer_write)) {
	        if(file_list_count) {
		    if((new_file_write + flat_file->blocklens[j]) > 
		       new_buffer_write) {
		        end_fwr_size = new_buffer_write - new_file_write;
			new_file_write = new_buffer_write;
			j--;
		    }
		    else {
		        new_file_write += flat_file->blocklens[j];
			end_fwr_size = flat_file->blocklens[j];
		    }
		}
		else {
		    if (fwr_size > new_buffer_write) {
		        new_file_write = new_buffer_write;
			fwr_size = new_file_write;
		    }
		    else new_file_write = fwr_size;
		}
		file_list_count++;
		if (j < (flat_file->count - 1)) j++;
		else j = 0;
		
		k = start_k;
		if ((new_file_write < new_buffer_write) && 
		    (file_list_count == MAX_ARRAY_SIZE)) {
		    new_buffer_write = 0;
		    mem_list_count = 0;
		    while (new_buffer_write < new_file_write) {
		        if(mem_list_count) {
			    if((new_buffer_write + flat_buf->blocklens[k]) >
			       new_file_write) {
			        end_bwr_size = new_file_write -
				  new_buffer_write;
				new_buffer_write = new_file_write;
				k--;
			    }
			    else {
			        new_buffer_write += flat_buf->blocklens[k];
				end_bwr_size = flat_buf->blocklens[k];
			    }
			}
			else {
			    new_buffer_write = bwr_size;
			    if (bwr_size > (bufsize - size_wrote)) {
			        new_buffer_write = bufsize - size_wrote;
				bwr_size = new_buffer_write;
			    }
			}
			mem_list_count++;
			k = (k + 1)%flat_buf->count;
		    } /* while (new_buffer_write < new_file_write) */
		} /* if ((new_file_write < new_buffer_write) &&
		     (file_list_count == MAX_ARRAY_SIZE)) */
	    } /* while ((mem_list_count < MAX_ARRAY_SIZE) && 
		 (new_buffer_write < bufsize-size_wrote)) */

	    /*  fills the allocated writelist arrays  */
	    k = start_k;
	    j = start_j;
	    for (i=0; i<mem_list_count; i++) {	     
	        mem_offsets[i] = buf + 
			buftype_extent* (buf_count/flat_buf->count) +
				  flat_buf->indices[k];
		
		if(!i) {
		    mem_lengths[0] = bwr_size;
		    mem_offsets[0] += flat_buf->blocklens[k] - bwr_size;
		}
		else {
		    if (i == (mem_list_count - 1)) {
		        mem_lengths[i] = end_bwr_size;
			if (flat_buf->blocklens[k] == end_bwr_size)
			    bwr_size = flat_buf->blocklens[(k+1)%
							  flat_buf->count];
			else {
			    bwr_size = flat_buf->blocklens[k] - end_bwr_size;
			    k--;
			    buf_count--;
			}
		    }
		    else {
		        mem_lengths[i] = flat_buf->blocklens[k];
		    }
		}
		buf_count++;
		k = (k + 1)%flat_buf->count;
	    } /* for (i=0; i<mem_list_count; i++) */
	    for (i=0; i<file_list_count; i++) {
	        file_offsets[i] = disp + flat_file->indices[j] + 
		    ((ADIO_Offset)n_filetypes) * filetype_extent;
	        if (!i) {
		    file_lengths[0] = fwr_size;
		    file_offsets[0] += flat_file->blocklens[j] - fwr_size;
		}
		else {
		    if (i == (file_list_count - 1)) {
		        file_lengths[i] = end_fwr_size;
			if (flat_file->blocklens[j] == end_fwr_size)
			    fwr_size = flat_file->blocklens[(j+1)%
							  flat_file->count];   
			else {
			    fwr_size = flat_file->blocklens[j] - end_fwr_size;
			    j--;
			}
		    }
		    else file_lengths[i] = flat_file->blocklens[j];
		}
		if (j < flat_file->count - 1) j++;
		else {
		    j = 0;
		    n_filetypes++;
		}
	    } /* for (i=0; i<file_list_count; i++) */

#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
	    NO_STALE(err_flag, fd, zoidfs_obj_ptr,
			    zoidfs_write(zoidfs_obj_ptr, 
				    mem_list_count, mem_offsets, mem_lengths, 
				    file_list_count, 
				    file_offsets, file_lengths, ZOIDFS_NO_OP_HINT));
	    /* --BEGIN ERROR HANDLING-- */
	    if (err_flag != ZFS_OK) {
		*error_code = MPIO_Err_create_code(MPI_SUCCESS,
						   MPIR_ERR_RECOVERABLE,
						   myname, __LINE__,
						   ADIOI_ZOIDFS_error_convert(err_flag),
						   "Error in zoidfs_write", 0);
		goto error_state;
	    }
	    /* --END ERROR HANDLING-- */
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
	    size_wrote += new_buffer_write;
	    total_bytes_written += new_buffer_write; /* XXX: is this right? */
	    start_k = k;
	    start_j = j;
	} /* while (size_wrote < bufsize) */
	ADIOI_Free(mem_offsets);
	ADIOI_Free(mem_lengths);
    }
    /* when incrementing fp_ind, need to also take into account the file type:
     * consider an N-element 1-d subarray with a lb and ub: ( |---xxxxx-----|
     * if we wrote N elements, offset needs to point at beginning of type, not
     * at empty region at offset N+1).  
     *
     * As we discussed on mpich-discuss in may/june 2009, the code below might
     * look wierd, but by putting fp_ind at the last byte written, the next
     * time we run through the strided code we'll update the fp_ind to the
     * right location. */
    if (file_ptr_type == ADIO_INDIVIDUAL) {
	fd->fp_ind = file_offsets[file_list_count-1]+
	    file_lengths[file_list_count-1];
    }
    ADIOI_Free(file_offsets);
    ADIOI_Free(file_lengths);

    *error_code = MPI_SUCCESS;

error_state:
    fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to 
   keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}
Exemple #10
0
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, const void *buf, int count,
			       MPI_Datatype datatype, int file_ptr_type,
			       ADIO_Offset offset, ADIO_Status * status,
			       int *error_code)
{
    /* offset is in units of etype relative to the filetype. */
    ADIOI_Flatlist_node *flat_buf, *flat_file;
    ADIO_Offset i_offset, sum, size_in_filetype;
    int i, j, k, st_index=0;
    int n_etypes_in_filetype;
    ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes;
    ADIO_Offset abs_off_in_filetype=0;
    MPI_Count filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off;
    ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
    char *writebuf;
    unsigned bufsize, writebuf_len, write_sz;
    ADIO_Status status1;
    ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len;
    int stripe_size;
    static char myname[] = "ADIOI_LUSTRE_WriteStrided";

    if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
	/* if user has disabled data sieving on writes, use naive
	 * approach instead.
	 */
	ADIOI_GEN_WriteStrided_naive(fd,
				     buf,
				     count,
				     datatype,
				     file_ptr_type,
				     offset, status, error_code);
	return;
    }

    *error_code = MPI_SUCCESS;	/* changed below if error */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size_x(fd->filetype, &filetype_size);
    if (!filetype_size) {
#ifdef HAVE_STATUS_SET_BYTES
	MPIR_Status_set_bytes(status, datatype, 0);
#endif
	*error_code = MPI_SUCCESS;
	return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size_x(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
    bufsize = buftype_size * count;

    /* get striping info */
    stripe_size = fd->hints->striping_unit;

    /* Different buftype to different filetype */
    if (!buftype_is_contig && filetype_is_contig) {
        /* noncontiguous in memory, contiguous in file. */
	flat_buf = ADIOI_Flatten_and_find(datatype);

	off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
            fd->disp + (ADIO_Offset)etype_size * offset;

	start_off = off;
	end_offset = start_off + bufsize - 1;
        /* write stripe size buffer each time */
	writebuf = (char *) ADIOI_Malloc(MPL_MIN(bufsize, stripe_size));
        writebuf_off = 0;
        writebuf_len = 0;

        /* if atomicity is true, lock the region to be accessed */
	if (fd->atomicity)
	    ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);

	for (j = 0; j < count; j++) {
	    for (i = 0; i < flat_buf->count; i++) {
                userbuf_off = (ADIO_Offset)j * (ADIO_Offset)buftype_extent +
                    flat_buf->indices[i];
		req_off = off;
		req_len = flat_buf->blocklens[i];
		ADIOI_BUFFERED_WRITE_WITHOUT_READ
		off += flat_buf->blocklens[i];
	    }
        }

	/* write the buffer out finally */
	ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
			 ADIO_EXPLICIT_OFFSET, writebuf_off, &status1,
			 error_code);

	if (fd->atomicity)
	    ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
	if (*error_code != MPI_SUCCESS) {
            ADIOI_Free(writebuf);
	    return;
        }
	ADIOI_Free(writebuf);
	if (file_ptr_type == ADIO_INDIVIDUAL)
	    fd->fp_ind = off;
    } else {
        /* noncontiguous in file */
	flat_file = ADIOI_Flatten_and_find(fd->filetype);
	disp = fd->disp;

	if (file_ptr_type == ADIO_INDIVIDUAL) {
            /* Wei-keng reworked type processing to be a bit more efficient */
            offset       = fd->fp_ind - disp;
            n_filetypes  = (offset - flat_file->indices[0]) / filetype_extent;
            offset      -= (ADIO_Offset)n_filetypes * filetype_extent;
            /* now offset is local to this extent */

            /* find the block where offset is located, skip blocklens[i]==0 */
            for (i=0; i<flat_file->count; i++) {
                ADIO_Offset dist;
                if (flat_file->blocklens[i] == 0) continue;
                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
                /* fwr_size is from offset to the end of block i */
                if (dist == 0) {
                    i++;
                    offset   = flat_file->indices[i];
                    fwr_size = flat_file->blocklens[i];
			break;
		    }
                if (dist > 0) {
                    fwr_size = dist;
                    break;
		}
	    }
            st_index = i;  /* starting index in flat_file->indices[] */
            offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
        }
        else {
            n_etypes_in_filetype = filetype_size/etype_size;
            n_filetypes = offset / n_etypes_in_filetype;
            etype_in_filetype = offset % n_etypes_in_filetype;
	    size_in_filetype = etype_in_filetype * etype_size;

	    sum = 0;
	    for (i = 0; i < flat_file->count; i++) {
		sum += flat_file->blocklens[i];
		if (sum > size_in_filetype) {
		    st_index = i;
		    fwr_size = sum - size_in_filetype;
		    abs_off_in_filetype = flat_file->indices[i] +
			size_in_filetype - (sum - flat_file->blocklens[i]);
		    break;
		}
	    }

	    /* abs. offset in bytes in the file */
	    offset = disp + (ADIO_Offset) n_filetypes *filetype_extent +
		     abs_off_in_filetype;
	}

	start_off = offset;

        /* Wei-keng Liao:write request is within single flat_file
         * contig block*/
        /* this could happen, for example, with subarray types that are
         * actually fairly contiguous */
        if (buftype_is_contig && bufsize <= fwr_size) {
            req_off = start_off;
            req_len = bufsize;
            end_offset = start_off + bufsize - 1;
	    writebuf = (char *) ADIOI_Malloc(MPL_MIN(bufsize, stripe_size));
	    memset(writebuf, -1, MPL_MIN(bufsize, stripe_size));
            writebuf_off = 0;
            writebuf_len = 0;
            userbuf_off = 0;
            ADIOI_BUFFERED_WRITE_WITHOUT_READ
            /* write the buffer out finally */
            ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
                             ADIO_EXPLICIT_OFFSET, writebuf_off, &status1,
                             error_code);

            if (file_ptr_type == ADIO_INDIVIDUAL) {
                /* update MPI-IO file pointer to point to the first byte
                 * that can be accessed in the fileview. */
                fd->fp_ind = offset + bufsize;
                if (bufsize == fwr_size) {
                    do {
                        st_index++;
                        if (st_index == flat_file->count) {
                            st_index = 0;
                            n_filetypes++;
                        }
                    } while (flat_file->blocklens[st_index] == 0);
                    fd->fp_ind = disp + flat_file->indices[st_index]
                        + (ADIO_Offset)n_filetypes*filetype_extent;
                }
            }
            fd->fp_sys_posn = -1;   /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
            MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
            ADIOI_Free(writebuf);
            return;
        }

	    /* Calculate end_offset, the last byte-offset that will be accessed.
           e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/

	    st_fwr_size = fwr_size;
	    st_n_filetypes = n_filetypes;
        i_offset = 0;
	    j = st_index;
	    off = offset;
	    fwr_size = MPL_MIN(st_fwr_size, bufsize);
        while (i_offset < bufsize) {
            i_offset += fwr_size;
		end_offset = off + fwr_size - 1;

            j = (j+1) % flat_file->count;
            n_filetypes += (j == 0) ? 1 : 0;
            while (flat_file->blocklens[j]==0) {
                j = (j+1) % flat_file->count;
                n_filetypes += (j == 0) ? 1 : 0;
		}

		off = disp + flat_file->indices[j] +
                n_filetypes*(ADIO_Offset)filetype_extent;
            fwr_size = MPL_MIN(flat_file->blocklens[j], bufsize-i_offset);
	    }

/* if atomicity is true, lock the region to be accessed */
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

	    writebuf_off = 0;
	    writebuf_len = 0;
	    writebuf = (char *) ADIOI_Malloc(stripe_size);
	    memset(writebuf, -1, stripe_size);

	    if (buftype_is_contig && !filetype_is_contig) {

/* contiguous in memory, noncontiguous in file. should be the most
		   common case. */

            i_offset = 0;
		j = st_index;
		off = offset;
		n_filetypes = st_n_filetypes;
		fwr_size = MPL_MIN(st_fwr_size, bufsize);
            while (i_offset < bufsize) {
		    if (fwr_size) {
			/* TYPE_UB and TYPE_LB can result in
			   fwr_size = 0. save system call in such cases */
                    /* lseek(fd->fd_sys, off, SEEK_SET);
                       err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/

			req_off = off;
			req_len = fwr_size;
                    userbuf_off = i_offset;
			ADIOI_BUFFERED_WRITE
                    }
                i_offset += fwr_size;

		    if (off + fwr_size < disp + flat_file->indices[j] +
		                         flat_file->blocklens[j] +
                    n_filetypes*(ADIO_Offset)filetype_extent)
		        off += fwr_size;
		    /* did not reach end of contiguous block in filetype.
		    no more I/O needed. off is incremented by fwr_size. */
		    else {
                    j = (j+1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    while (flat_file->blocklens[j]==0) {
                        j = (j+1) % flat_file->count;
                        n_filetypes += (j == 0) ? 1 : 0;
			}
			off = disp + flat_file->indices[j] +
                        n_filetypes*(ADIO_Offset)filetype_extent;
			fwr_size = MPL_MIN(flat_file->blocklens[j],
                                         bufsize-i_offset);
		    }
		}
        }
        else {
Exemple #11
0
void ADIOI_PVFS_WriteStrided(ADIO_File fd, void *buf, int count,
			     MPI_Datatype datatype, int file_ptr_type,
			     ADIO_Offset offset, ADIO_Status *status, int
			     *error_code)
{
/* Since PVFS does not support file locking, can't do buffered writes
   as on Unix */

/* offset is in units of etype relative to the filetype. */

    ADIOI_Flatlist_node *flat_buf, *flat_file;
    int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0;
    int num, size, sum, n_etypes_in_filetype, size_in_filetype;
    MPI_Count bufsize;
    int n_filetypes, etype_in_filetype;
    ADIO_Offset abs_off_in_filetype=0;
    MPI_Count filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent, indx;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset off, disp;
    int flag, new_bwr_size, new_fwr_size, err_flag=0;
    static char myname[] = "ADIOI_PVFS_WRITESTRIDED";

#ifdef HAVE_PVFS_LISTIO
    if ( fd->hints->fs_hints.pvfs.listio_write == ADIOI_HINT_ENABLE ) {
	    ADIOI_PVFS_WriteStridedListIO(fd, buf, count, datatype, 
			    file_ptr_type, offset, status, error_code);
	    return;
    }
#endif
    /* if hint set to DISABLE or AUTOMATIC, don't use listio */

    /* --BEGIN ERROR HANDLING-- */
    if (fd->atomicity) {
	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
					   myname, __LINE__,
					   MPI_ERR_INTERN,
					   "Atomic mode set in PVFS I/O function", 0);
	return;
    }
    /* --END ERROR HANDLING-- */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size_x(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
	MPIR_Status_set_bytes(status, datatype, 0);
#endif
	*error_code = MPI_SUCCESS; 
	return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size_x(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;
    
    bufsize = buftype_size * count;

    if (!buftype_is_contig && filetype_is_contig) {
	char *combine_buf, *combine_buf_ptr;
	ADIO_Offset combine_buf_remain;
/* noncontiguous in memory, contiguous in file. use writev */

	flat_buf = ADIOI_Flatten_and_find(datatype);

	/* allocate our "combine buffer" to pack data into before writing */
	combine_buf = (char *) ADIOI_Malloc(fd->hints->ind_wr_buffer_size);
	combine_buf_ptr = combine_buf;
	combine_buf_remain = fd->hints->ind_wr_buffer_size;

	/* seek to the right spot in the file */
	if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
	    off = fd->disp + etype_size * offset;
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
	    pvfs_lseek64(fd->fd_sys, off, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
	}
	else {
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
            off = pvfs_lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
        }

	/* loop through all the flattened pieces.  combine into buffer until
	 * no more will fit, then write.
	 *
	 * special case of a given piece being bigger than the combine buffer
	 * is also handled.
	 */
	for (j=0; j<count; j++) {
	    for (i=0; i<flat_buf->count; i++) {
		if (flat_buf->blocklens[i] > combine_buf_remain && combine_buf != combine_buf_ptr) {
		    /* there is data in the buffer; write out the buffer so far */
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
		    err = pvfs_write(fd->fd_sys,
				     combine_buf,
				     fd->hints->ind_wr_buffer_size - combine_buf_remain);
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
		    if (err == -1) err_flag = 1;

		    /* reset our buffer info */
		    combine_buf_ptr = combine_buf;
		    combine_buf_remain = fd->hints->ind_wr_buffer_size;
		}

		/* TODO: heuristic for when to not bother to use combine buffer? */
		if (flat_buf->blocklens[i] >= combine_buf_remain) {
		    /* special case: blocklen is as big as or bigger than the combine buf;
		     * write directly
		     */
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
		    err = pvfs_write(fd->fd_sys,
				     ((char *) buf) + j*buftype_extent + flat_buf->indices[i],
				     flat_buf->blocklens[i]);
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
		    if (err == -1) err_flag = 1;
		    off += flat_buf->blocklens[i]; /* keep up with the final file offset too */
		}
		else {
		    /* copy more data into combine buffer */
		    memcpy(combine_buf_ptr,
			   ((char *) buf) + j*buftype_extent + flat_buf->indices[i],
			   flat_buf->blocklens[i]);
		    combine_buf_ptr += flat_buf->blocklens[i];
		    combine_buf_remain -= flat_buf->blocklens[i];
		    off += flat_buf->blocklens[i]; /* keep up with the final file offset too */
		}
	    }
	}

	if (combine_buf_ptr != combine_buf) {
	    /* data left in buffer to write */
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
	    err = pvfs_write(fd->fd_sys,
			     combine_buf,
			     fd->hints->ind_wr_buffer_size - combine_buf_remain);
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
	    if (err == -1) err_flag = 1;
	}

	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

	ADIOI_Free(combine_buf);

	if (err_flag) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_IO, "**io",
					       "**io %s", strerror(errno));
	}
	else *error_code = MPI_SUCCESS;
    } /* if (!buftype_is_contig && filetype_is_contig)  ... */

    else {  /* noncontiguous in file */

/* split up into several contiguous writes */

/* find starting location in the file */

/* filetype already flattened in ADIO_Open */
	flat_file = ADIOI_Flatlist;
	while (flat_file->type != fd->filetype) flat_file = flat_file->next;
        disp = fd->disp;

	if (file_ptr_type == ADIO_INDIVIDUAL) {
	    offset = fd->fp_ind; /* in bytes */
            n_filetypes = -1;
            flag = 0;
            while (!flag) {
                n_filetypes++;
                for (i=0; i<flat_file->count; i++) {
                    if (disp + flat_file->indices[i] + 
                        (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] 
                            >= offset) {
                        st_index = i;
                        fwr_size = disp + flat_file->indices[i] + 
                                (ADIO_Offset) n_filetypes*filetype_extent
                                 + flat_file->blocklens[i] - offset;
                        flag = 1;
                        break;
                    }
                }
            }
	}
	else {
	    n_etypes_in_filetype = filetype_size/etype_size;
	    n_filetypes = (int) (offset / n_etypes_in_filetype);
	    etype_in_filetype = (int) (offset % n_etypes_in_filetype);
	    size_in_filetype = etype_in_filetype * etype_size;
 
	    sum = 0;
	    for (i=0; i<flat_file->count; i++) {
		sum += flat_file->blocklens[i];
		if (sum > size_in_filetype) {
		    st_index = i;
		    fwr_size = sum - size_in_filetype;
		    abs_off_in_filetype = flat_file->indices[i] +
			size_in_filetype - (sum - flat_file->blocklens[i]);
		    break;
		}
	    }

	    /* abs. offset in bytes in the file */
            offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype;
	}

	if (buftype_is_contig && !filetype_is_contig) {

/* contiguous in memory, noncontiguous in file. should be the most
   common case. */

	    i = 0;
	    j = st_index;
	    off = offset;
	    fwr_size = MPL_MIN(fwr_size, bufsize);
	    while (i < bufsize) {
                if (fwr_size) { 
                    /* TYPE_UB and TYPE_LB can result in 
                       fwr_size = 0. save system call in such cases */ 
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
		    pvfs_lseek64(fd->fd_sys, off, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
		    err = pvfs_write(fd->fd_sys, ((char *) buf) + i, fwr_size);
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
		    if (err == -1) err_flag = 1;
		}
		i += fwr_size;

                if (off + fwr_size < disp + flat_file->indices[j] +
                   flat_file->blocklens[j] + (ADIO_Offset) n_filetypes*filetype_extent)
                       off += fwr_size;
                /* did not reach end of contiguous block in filetype.
                   no more I/O needed. off is incremented by fwr_size. */
                else {
		    if (j < (flat_file->count - 1)) j++;
		    else {
			j = 0;
			n_filetypes++;
		    }
		    off = disp + flat_file->indices[j] + 
                                        (ADIO_Offset) n_filetypes*filetype_extent;
		    fwr_size = MPL_MIN(flat_file->blocklens[j], bufsize-i);
		}
	    }
	}
	else {
/* noncontiguous in memory as well as in file */

	    flat_buf = ADIOI_Flatten_and_find(datatype);

	    k = num = buf_count = 0;
	    indx = flat_buf->indices[0];
	    j = st_index;
	    off = offset;
	    bwr_size = flat_buf->blocklens[0];

	    while (num < bufsize) {
		size = MPL_MIN(fwr_size, bwr_size);
		if (size) {
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
		    pvfs_lseek64(fd->fd_sys, off, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_write_a, 0, NULL );
#endif
		    err = pvfs_write(fd->fd_sys, ((char *) buf) + indx, size);
#ifdef ADIOI_MPE_LOGGING
                    MPE_Log_event( ADIOI_MPE_write_b, 0, NULL );
#endif
		    if (err == -1) err_flag = 1;
		}

		new_fwr_size = fwr_size;
		new_bwr_size = bwr_size;

		if (size == fwr_size) {
/* reached end of contiguous block in file */
                    if (j < (flat_file->count - 1)) j++;
                    else {
                        j = 0;
                        n_filetypes++;
                    }

                    off = disp + flat_file->indices[j] + 
                                   (ADIO_Offset) n_filetypes*filetype_extent;

		    new_fwr_size = flat_file->blocklens[j];
		    if (size != bwr_size) {
			indx += size;
			new_bwr_size -= size;
		    }
		}

		if (size == bwr_size) {
/* reached end of contiguous block in memory */

		    k = (k + 1)%flat_buf->count;
		    buf_count++;
		    indx = buftype_extent*(buf_count/flat_buf->count) +
			flat_buf->indices[k]; 
		    new_bwr_size = flat_buf->blocklens[k];
		    if (size != fwr_size) {
			off += size;
			new_fwr_size -= size;
		    }
		}
		num += size;
		fwr_size = new_fwr_size;
                bwr_size = new_bwr_size;
	    }
	}

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
	if (err_flag) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_IO, "**io",
					       "**io %s", strerror(errno));
	}
	else *error_code = MPI_SUCCESS;
    }

    fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to 
   keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}
Exemple #12
0
void ADIOI_PVFS_WriteStridedListIO(ADIO_File fd, void *buf, int count,
                       MPI_Datatype datatype, int file_ptr_type,
                       ADIO_Offset offset, ADIO_Status *status, int
                       *error_code) 
{
/* Since PVFS does not support file locking, can't do buffered writes
   as on Unix */

/* offset is in units of etype relative to the filetype. */

    ADIOI_Flatlist_node *flat_buf, *flat_file;
    int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0;
    int size, sum, n_etypes_in_filetype, size_in_filetype;
    MPI_Count bufsize;
    int n_filetypes, etype_in_filetype;
    ADIO_Offset abs_off_in_filetype=0;
    MPI_Count filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off;
    ADIO_Offset off, disp, start_off;
    int flag, st_fwr_size, st_n_filetypes;
    int new_bwr_size, new_fwr_size, err_flag=0;

    int mem_list_count, file_list_count;
    char ** mem_offsets;
    int64_t *file_offsets;
    int *mem_lengths;
    int32_t *file_lengths;
    int total_blks_to_write;

    int max_mem_list, max_file_list;

    int b_blks_wrote;
    int f_data_wrote;
    int size_wrote=0, n_write_lists, extra_blks;

    int end_bwr_size, end_fwr_size;
    int start_k, start_j, new_file_write, new_buffer_write;
    int start_mem_offset;
#define MAX_ARRAY_SIZE 1024
    static char myname[] = "ADIOI_PVFS_WRITESTRIDED";

/* PFS file pointer modes are not relevant here, because PFS does
   not support strided accesses. */

    /* --BEGIN ERROR HANDLING-- */
    if (fd->atomicity) {
	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
					   myname, __LINE__,
					   MPI_ERR_INTERN,
					   "Atomic mode set in PVFS I/O function", 0);
	return;
    }
    /* --END ERROR HANDLING-- */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size_x(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
	MPIR_Status_set_bytes(status, datatype, 0);
#endif
	*error_code = MPI_SUCCESS; 
	return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size_x(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;
    
    bufsize = buftype_size * count;

    if (!buftype_is_contig && filetype_is_contig) {

/* noncontiguous in memory, contiguous in file.  */
        int64_t file_offsets;
	int32_t file_lengths;

	flat_buf = ADIOI_Flatten_and_find(datatype);
	
	if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
	    off = fd->disp + etype_size * offset;
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
	    pvfs_lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
	}
	else {
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL );
#endif
            off = pvfs_lseek64(fd->fd_sys, fd->fp_ind, SEEK_SET);
#ifdef ADIOI_MPE_LOGGING
            MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL );
#endif
        }

	file_list_count = 1;
	file_offsets = off;
	file_lengths = 0;
	total_blks_to_write = count*flat_buf->count;
	b_blks_wrote = 0;

	/* allocate arrays according to max usage */
	if (total_blks_to_write > MAX_ARRAY_SIZE)
	    mem_list_count = MAX_ARRAY_SIZE;
	else mem_list_count = total_blks_to_write;
	mem_offsets = (char**)ADIOI_Malloc(mem_list_count*sizeof(char*));
	mem_lengths = (int*)ADIOI_Malloc(mem_list_count*sizeof(int));

	j = 0;
	/* step through each block in memory, filling memory arrays */
	while (b_blks_wrote < total_blks_to_write) {
	    for (i=0; i<flat_buf->count; i++) {
		mem_offsets[b_blks_wrote % MAX_ARRAY_SIZE] = 
		    ((char*)buf + j*buftype_extent + flat_buf->indices[i]);
		mem_lengths[b_blks_wrote % MAX_ARRAY_SIZE] = 
		    flat_buf->blocklens[i];
		file_lengths += flat_buf->blocklens[i];
		b_blks_wrote++;
		if (!(b_blks_wrote % MAX_ARRAY_SIZE) ||
		    (b_blks_wrote == total_blks_to_write)) {

		    /* in the case of the last read list call,
		       adjust mem_list_count */
		    if (b_blks_wrote == total_blks_to_write) {
		        mem_list_count = total_blks_to_write % MAX_ARRAY_SIZE;
			/* in case last read list call fills max arrays */
			if (!mem_list_count) mem_list_count = MAX_ARRAY_SIZE;
		    }

		    pvfs_write_list(fd->fd_sys ,mem_list_count, mem_offsets,
				   mem_lengths, file_list_count,
				   &file_offsets, &file_lengths);
		  
		    /* in the case of the last read list call, leave here */
		    if (b_blks_wrote == total_blks_to_write) break;

		    file_offsets += file_lengths;
		    file_lengths = 0;
		} 
	    } /* for (i=0; i<flat_buf->count; i++) */
	    j++;
	} /* while (b_blks_wrote < total_blks_to_write) */
	ADIOI_Free(mem_offsets);
	ADIOI_Free(mem_lengths);

	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

	if (err_flag) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_IO, "**io",
					       "**io %s", strerror(errno));
	}
	else *error_code = MPI_SUCCESS;

	fd->fp_sys_posn = -1;   /* clear this. */

#ifdef HAVE_STATUS_SET_BYTES
	MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to 
   keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif

	ADIOI_Delete_flattened(datatype);
	return;
    } /* if (!buftype_is_contig && filetype_is_contig) */

    /* already know that file is noncontiguous from above */
    /* noncontiguous in file */

/* filetype already flattened in ADIO_Open */
    flat_file = ADIOI_Flatlist;
    while (flat_file->type != fd->filetype) flat_file = flat_file->next;

    disp = fd->disp;

    /* for each case - ADIO_Individual pointer or explicit, find offset
       (file offset in bytes), n_filetypes (how many filetypes into file 
       to start), fwr_size (remaining amount of data in present file
       block), and st_index (start point in terms of blocks in starting
       filetype) */
    if (file_ptr_type == ADIO_INDIVIDUAL) {
        offset = fd->fp_ind; /* in bytes */
	n_filetypes = -1;
	flag = 0;
	while (!flag) {
	    n_filetypes++;
	    for (i=0; i<flat_file->count; i++) {
	        if (disp + flat_file->indices[i] + 
		    (ADIO_Offset) n_filetypes*filetype_extent +
		      flat_file->blocklens[i] >= offset) {
		  st_index = i;
		  fwr_size = disp + flat_file->indices[i] + 
		    (ADIO_Offset) n_filetypes*filetype_extent
		    + flat_file->blocklens[i] - offset;
		  flag = 1;
		  break;
		}
	    }
	} /* while (!flag) */
    } /* if (file_ptr_type == ADIO_INDIVIDUAL) */
    else {
        n_etypes_in_filetype = filetype_size/etype_size;
	n_filetypes = (int) (offset / n_etypes_in_filetype);
	etype_in_filetype = (int) (offset % n_etypes_in_filetype);
	size_in_filetype = etype_in_filetype * etype_size;
	
	sum = 0;
	for (i=0; i<flat_file->count; i++) {
	    sum += flat_file->blocklens[i];
	    if (sum > size_in_filetype) {
	        st_index = i;
		fwr_size = sum - size_in_filetype;
		abs_off_in_filetype = flat_file->indices[i] +
		    size_in_filetype - (sum - flat_file->blocklens[i]);
		break;
	    }
	}

	/* abs. offset in bytes in the file */
	offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
	    abs_off_in_filetype;
    } /* else [file_ptr_type != ADIO_INDIVIDUAL] */

    start_off = offset;
    st_fwr_size = fwr_size;
    st_n_filetypes = n_filetypes;
    
    if (buftype_is_contig && !filetype_is_contig) {

/* contiguous in memory, noncontiguous in file. should be the most
   common case. */

        int mem_lengths;
	char *mem_offsets;
        
	i = 0;
	j = st_index;
	off = offset;
	n_filetypes = st_n_filetypes;
        
	mem_list_count = 1;
        
	/* determine how many blocks in file to read */
	f_data_wrote = MPL_MIN(st_fwr_size, bufsize);
	total_blks_to_write = 1;
	j++;
	while (f_data_wrote < bufsize) {
	    f_data_wrote += flat_file->blocklens[j];
	    total_blks_to_write++;
	    if (j<(flat_file->count-1)) j++;
	    else j = 0; 
	}
	    
	j = st_index;
	n_filetypes = st_n_filetypes;
	n_write_lists = total_blks_to_write/MAX_ARRAY_SIZE;
	extra_blks = total_blks_to_write%MAX_ARRAY_SIZE;
        
	mem_offsets = buf;
	mem_lengths = 0;
        
	/* if at least one full readlist, allocate file arrays
	   at max array size and don't free until very end */
	if (n_write_lists) {
	    file_offsets = (int64_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
						  sizeof(int64_t));
	    file_lengths = (int32_t*)ADIOI_Malloc(MAX_ARRAY_SIZE*
						  sizeof(int32_t));
	}
	/* if there's no full readlist allocate file arrays according
	   to needed size (extra_blks) */
	else {
	    file_offsets = (int64_t*)ADIOI_Malloc(extra_blks*
                                                  sizeof(int64_t));
            file_lengths = (int32_t*)ADIOI_Malloc(extra_blks*
                                                  sizeof(int32_t));
        }
        
        /* for file arrays that are of MAX_ARRAY_SIZE, build arrays */
        for (i=0; i<n_write_lists; i++) {
            file_list_count = MAX_ARRAY_SIZE;
            if(!i) {
                file_offsets[0] = offset;
                file_lengths[0] = st_fwr_size;
                mem_lengths = st_fwr_size;
            }
            for (k=0; k<MAX_ARRAY_SIZE; k++) {
                if (i || k) {
                    file_offsets[k] = disp + n_filetypes*filetype_extent
                      + flat_file->indices[j];
                    file_lengths[k] = flat_file->blocklens[j];
                    mem_lengths += file_lengths[k];
                }
                if (j<(flat_file->count - 1)) j++;
                else {
                    j = 0;
                    n_filetypes++;
                }
            } /* for (k=0; k<MAX_ARRAY_SIZE; k++) */
            pvfs_write_list(fd->fd_sys, mem_list_count,
                           &mem_offsets, &mem_lengths,
                           file_list_count, file_offsets,
                           file_lengths);
            mem_offsets += mem_lengths;
            mem_lengths = 0;
        } /* for (i=0; i<n_write_lists; i++) */

        /* for file arrays smaller than MAX_ARRAY_SIZE (last read_list call) */
        if (extra_blks) {
            file_list_count = extra_blks;
            if(!i) {
                file_offsets[0] = offset;
                file_lengths[0] = st_fwr_size;
            }
            for (k=0; k<extra_blks; k++) {
                if(i || k) {
                    file_offsets[k] = disp + n_filetypes*filetype_extent +
                      flat_file->indices[j];
                    if (k == (extra_blks - 1)) {
                        file_lengths[k] = bufsize - (int32_t) mem_lengths
                          - (int32_t) mem_offsets + (int32_t)  buf;
                    }
                    else file_lengths[k] = flat_file->blocklens[j];
                } /* if(i || k) */
                mem_lengths += file_lengths[k];
                if (j<(flat_file->count - 1)) j++;
                else {
                    j = 0;
                    n_filetypes++;
                }
            } /* for (k=0; k<extra_blks; k++) */
            pvfs_write_list(fd->fd_sys, mem_list_count, &mem_offsets,
                           &mem_lengths, file_list_count, file_offsets,
                           file_lengths);
        }
    } 
    else {
        /* noncontiguous in memory as well as in file */

	flat_buf = ADIOI_Flatten_and_find(datatype);

	size_wrote = 0;
	n_filetypes = st_n_filetypes;
	fwr_size = st_fwr_size;
	bwr_size = flat_buf->blocklens[0];
	buf_count = 0;
	start_mem_offset = 0;
	start_k = k = 0;
	start_j = st_index;
	max_mem_list = 0;
	max_file_list = 0;

	/* run through and file max_file_list and max_mem_list so that you 
	   can allocate the file and memory arrays less than MAX_ARRAY_SIZE
	   if possible */

	while (size_wrote < bufsize) {
	    k = start_k;
	    new_buffer_write = 0;
	    mem_list_count = 0;
	    while ((mem_list_count < MAX_ARRAY_SIZE) && 
		   (new_buffer_write < bufsize-size_wrote)) {
	        /* find mem_list_count and file_list_count such that both are
		   less than MAX_ARRAY_SIZE, the sum of their lengths are
		   equal, and the sum of all the data read and data to be
		   read in the next immediate read list is less than
		   bufsize */
	        if(mem_list_count) {
		    if((new_buffer_write + flat_buf->blocklens[k] + 
			size_wrote) > bufsize) {
		        end_bwr_size = new_buffer_write + 
			    flat_buf->blocklens[k] - (bufsize - size_wrote);
			new_buffer_write = bufsize - size_wrote;
		    }
		    else {
		        new_buffer_write += flat_buf->blocklens[k];
			end_bwr_size = flat_buf->blocklens[k];
		    }
		}
		else {
		    if (bwr_size > (bufsize - size_wrote)) {
		        new_buffer_write = bufsize - size_wrote;
			bwr_size = new_buffer_write;
		    }
		    else new_buffer_write = bwr_size;
		}
		mem_list_count++;
		k = (k + 1)%flat_buf->count;
	     } /* while ((mem_list_count < MAX_ARRAY_SIZE) && 
	       (new_buffer_write < bufsize-size_wrote)) */
	    j = start_j;
	    new_file_write = 0;
	    file_list_count = 0;
	    while ((file_list_count < MAX_ARRAY_SIZE) && 
		   (new_file_write < new_buffer_write)) {
	        if(file_list_count) {
		    if((new_file_write + flat_file->blocklens[j]) > 
		       new_buffer_write) {
		        end_fwr_size = new_buffer_write - new_file_write;
			new_file_write = new_buffer_write;
			j--;
		    }
		    else {
		        new_file_write += flat_file->blocklens[j];
			end_fwr_size = flat_file->blocklens[j];
		    }
		}
		else {
		    if (fwr_size > new_buffer_write) {
		        new_file_write = new_buffer_write;
			fwr_size = new_file_write;
		    }
		    else new_file_write = fwr_size;
		}
		file_list_count++;
		if (j < (flat_file->count - 1)) j++;
		else j = 0;
		
		k = start_k;
		if ((new_file_write < new_buffer_write) && 
		    (file_list_count == MAX_ARRAY_SIZE)) {
		    new_buffer_write = 0;
		    mem_list_count = 0;
		    while (new_buffer_write < new_file_write) {
		        if(mem_list_count) {
			    if((new_buffer_write + flat_buf->blocklens[k]) >
			       new_file_write) {
			        end_bwr_size = new_file_write - 
				    new_buffer_write;
				new_buffer_write = new_file_write;
				k--;
			    }
			    else {
			        new_buffer_write += flat_buf->blocklens[k];
				end_bwr_size = flat_buf->blocklens[k];
			    }
			}
			else {
			    new_buffer_write = bwr_size;
			    if (bwr_size > (bufsize - size_wrote)) {
			        new_buffer_write = bufsize - size_wrote;
				bwr_size = new_buffer_write;
			    }
			}
			mem_list_count++;
			k = (k + 1)%flat_buf->count;
		    } /* while (new_buffer_write < new_file_write) */
		} /* if ((new_file_write < new_buffer_write) &&
		     (file_list_count == MAX_ARRAY_SIZE)) */
	    } /* while ((mem_list_count < MAX_ARRAY_SIZE) && 
		 (new_buffer_write < bufsize-size_wrote)) */

	    /*  fakes filling the writelist arrays of lengths found above  */
	    k = start_k;
	    j = start_j;
	    for (i=0; i<mem_list_count; i++) {	     
		if(i) {
		    if (i == (mem_list_count - 1)) {
			if (flat_buf->blocklens[k] == end_bwr_size)
			    bwr_size = flat_buf->blocklens[(k+1)%
							  flat_buf->count];
			else {
			    bwr_size = flat_buf->blocklens[k] - end_bwr_size;
			    k--;
			    buf_count--;
			}
		    }
		}
		buf_count++;
		k = (k + 1)%flat_buf->count;
	    } /* for (i=0; i<mem_list_count; i++) */
	    for (i=0; i<file_list_count; i++) {
		if (i) {
		    if (i == (file_list_count - 1)) {
			if (flat_file->blocklens[j] == end_fwr_size)
			    fwr_size = flat_file->blocklens[(j+1)%
							  flat_file->count];   
			else {
			    fwr_size = flat_file->blocklens[j] - end_fwr_size;
			    j--;
			}
		    }
		}
		if (j < flat_file->count - 1) j++;
		else {
		    j = 0;
		    n_filetypes++;
		}
	    } /* for (i=0; i<file_list_count; i++) */
	    size_wrote += new_buffer_write;
	    start_k = k;
	    start_j = j;
	    if (max_mem_list < mem_list_count)
	        max_mem_list = mem_list_count;
	    if (max_file_list < file_list_count)
	        max_file_list = file_list_count;
	} /* while (size_wrote < bufsize) */

	mem_offsets = (char **)ADIOI_Malloc(max_mem_list*sizeof(char *));
	mem_lengths = (int *)ADIOI_Malloc(max_mem_list*sizeof(int));
	file_offsets = (int64_t *)ADIOI_Malloc(max_file_list*sizeof(int64_t));
	file_lengths = (int32_t *)ADIOI_Malloc(max_file_list*sizeof(int32_t));
	    
	size_wrote = 0;
	n_filetypes = st_n_filetypes;
	fwr_size = st_fwr_size;
	bwr_size = flat_buf->blocklens[0];
	buf_count = 0;
	start_mem_offset = 0;
	start_k = k = 0;
	start_j = st_index;

	/*  this section calculates mem_list_count and file_list_count
	    and also finds the possibly odd sized last array elements
	    in new_fwr_size and new_bwr_size  */
	
	while (size_wrote < bufsize) {
	    k = start_k;
	    new_buffer_write = 0;
	    mem_list_count = 0;
	    while ((mem_list_count < MAX_ARRAY_SIZE) && 
		   (new_buffer_write < bufsize-size_wrote)) {
	        /* find mem_list_count and file_list_count such that both are
		   less than MAX_ARRAY_SIZE, the sum of their lengths are
		   equal, and the sum of all the data read and data to be
		   read in the next immediate read list is less than
		   bufsize */
	        if(mem_list_count) {
		    if((new_buffer_write + flat_buf->blocklens[k] + 
			size_wrote) > bufsize) {
		        end_bwr_size = new_buffer_write + 
			    flat_buf->blocklens[k] - (bufsize - size_wrote);
			new_buffer_write = bufsize - size_wrote;
		    }
		    else {
		        new_buffer_write += flat_buf->blocklens[k];
			end_bwr_size = flat_buf->blocklens[k];
		    }
		}
		else {
		    if (bwr_size > (bufsize - size_wrote)) {
		        new_buffer_write = bufsize - size_wrote;
			bwr_size = new_buffer_write;
		    }
		    else new_buffer_write = bwr_size;
		}
		mem_list_count++;
		k = (k + 1)%flat_buf->count;
	     } /* while ((mem_list_count < MAX_ARRAY_SIZE) && 
	       (new_buffer_write < bufsize-size_wrote)) */
	    j = start_j;
	    new_file_write = 0;
	    file_list_count = 0;
	    while ((file_list_count < MAX_ARRAY_SIZE) && 
		   (new_file_write < new_buffer_write)) {
	        if(file_list_count) {
		    if((new_file_write + flat_file->blocklens[j]) > 
		       new_buffer_write) {
		        end_fwr_size = new_buffer_write - new_file_write;
			new_file_write = new_buffer_write;
			j--;
		    }
		    else {
		        new_file_write += flat_file->blocklens[j];
			end_fwr_size = flat_file->blocklens[j];
		    }
		}
		else {
		    if (fwr_size > new_buffer_write) {
		        new_file_write = new_buffer_write;
			fwr_size = new_file_write;
		    }
		    else new_file_write = fwr_size;
		}
		file_list_count++;
		if (j < (flat_file->count - 1)) j++;
		else j = 0;
		
		k = start_k;
		if ((new_file_write < new_buffer_write) && 
		    (file_list_count == MAX_ARRAY_SIZE)) {
		    new_buffer_write = 0;
		    mem_list_count = 0;
		    while (new_buffer_write < new_file_write) {
		        if(mem_list_count) {
			    if((new_buffer_write + flat_buf->blocklens[k]) >
			       new_file_write) {
			        end_bwr_size = new_file_write -
				  new_buffer_write;
				new_buffer_write = new_file_write;
				k--;
			    }
			    else {
			        new_buffer_write += flat_buf->blocklens[k];
				end_bwr_size = flat_buf->blocklens[k];
			    }
			}
			else {
			    new_buffer_write = bwr_size;
			    if (bwr_size > (bufsize - size_wrote)) {
			        new_buffer_write = bufsize - size_wrote;
				bwr_size = new_buffer_write;
			    }
			}
			mem_list_count++;
			k = (k + 1)%flat_buf->count;
		    } /* while (new_buffer_write < new_file_write) */
		} /* if ((new_file_write < new_buffer_write) &&
		     (file_list_count == MAX_ARRAY_SIZE)) */
	    } /* while ((mem_list_count < MAX_ARRAY_SIZE) && 
		 (new_buffer_write < bufsize-size_wrote)) */

	    /*  fills the allocated readlist arrays  */
	    k = start_k;
	    j = start_j;
	    for (i=0; i<mem_list_count; i++) {	     
	        mem_offsets[i] = ((char*)buf + buftype_extent*
					 (buf_count/flat_buf->count) +
					 (int)flat_buf->indices[k]);
		
		if(!i) {
		    mem_lengths[0] = bwr_size;
		    mem_offsets[0] += flat_buf->blocklens[k] - bwr_size;
		}
		else {
		    if (i == (mem_list_count - 1)) {
		        mem_lengths[i] = end_bwr_size;
			if (flat_buf->blocklens[k] == end_bwr_size)
			    bwr_size = flat_buf->blocklens[(k+1)%
							  flat_buf->count];
			else {
			    bwr_size = flat_buf->blocklens[k] - end_bwr_size;
			    k--;
			    buf_count--;
			}
		    }
		    else {
		        mem_lengths[i] = flat_buf->blocklens[k];
		    }
		}
		buf_count++;
		k = (k + 1)%flat_buf->count;
	    } /* for (i=0; i<mem_list_count; i++) */
	    for (i=0; i<file_list_count; i++) {
	        file_offsets[i] = disp + flat_file->indices[j] + n_filetypes *
		    filetype_extent;
	        if (!i) {
		    file_lengths[0] = fwr_size;
		    file_offsets[0] += flat_file->blocklens[j] - fwr_size;
		}
		else {
		    if (i == (file_list_count - 1)) {
		        file_lengths[i] = end_fwr_size;
			if (flat_file->blocklens[j] == end_fwr_size)
			    fwr_size = flat_file->blocklens[(j+1)%
							  flat_file->count];   
			else {
			    fwr_size = flat_file->blocklens[j] - end_fwr_size;
			    j--;
			}
		    }
		    else file_lengths[i] = flat_file->blocklens[j];
		}
		if (j < flat_file->count - 1) j++;
		else {
		    j = 0;
		    n_filetypes++;
		}
	    } /* for (i=0; i<file_list_count; i++) */
	    pvfs_write_list(fd->fd_sys,mem_list_count, mem_offsets,
			   mem_lengths, file_list_count, file_offsets,
			   file_lengths);
	    size_wrote += new_buffer_write;
	    start_k = k;
	    start_j = j;
	} /* while (size_wrote < bufsize) */
	ADIOI_Free(mem_offsets);
	ADIOI_Free(mem_lengths);
    }
    ADIOI_Free(file_offsets);
    ADIOI_Free(file_lengths);

    if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
    if (err_flag) {
	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
					   myname, __LINE__, MPI_ERR_IO,
					   "**io",
					   "**io %s", strerror(errno));
    }
    else *error_code = MPI_SUCCESS;

    fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to 
   keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}
/*@
  MPIDU_Type_create_pairtype - create necessary data structures for certain
  pair types (all but MPI_2INT etc., which never have the size != extent
  issue).

  This function is different from the other MPIDU_Type_create functions in that
  it fills in an already- allocated MPIDU_Datatype.  This is important for
  allowing configure-time determination of the MPI type values (these types
  are stored in the "direct" space, for those familiar with how MPICH deals
  with type allocation).

Input Parameters:
+ type - name of pair type (e.g. MPI_FLOAT_INT)
- new_dtp - pointer to previously allocated datatype structure, which is
            filled in by this function

  Return Value:
  MPI_SUCCESS on success, MPI errno on failure.

  Note:
  Section 4.9.3 (MINLOC and MAXLOC) of the MPI-1 standard specifies that
  these types should be built as if by the following (e.g. MPI_FLOAT_INT):

    type[0] = MPI_FLOAT
    type[1] = MPI_INT
    disp[0] = 0
    disp[1] = sizeof(float) <---- questionable displacement!
    block[0] = 1
    block[1] = 1
    MPI_TYPE_STRUCT(2, block, disp, type, MPI_FLOAT_INT)

  However, this is a relatively naive approach that does not take struct
  padding into account when setting the displacement of the second element.
  Thus in our implementation we have chosen to instead use the actual
  difference in starting locations of the two types in an actual struct.
@*/
int MPIDU_Type_create_pairtype(MPI_Datatype type,
			      MPIDU_Datatype *new_dtp)
{
    int err, mpi_errno = MPI_SUCCESS;
    int type_size, alignsize;
    MPI_Aint type_extent, true_ub, el_size;

    /* handle is filled in by MPIR_Handle_obj_alloc() */
    MPIR_Object_set_ref(new_dtp, 1);
    new_dtp->is_permanent = 1;
    new_dtp->is_committed = 1; /* predefined types are pre-committed */
    new_dtp->attributes   = NULL;
    new_dtp->cache_id     = 0;
    new_dtp->name[0]      = 0;
    new_dtp->contents     = NULL;

    new_dtp->dataloop       = NULL;
    new_dtp->dataloop_size  = -1;
    new_dtp->dataloop_depth = -1;
    new_dtp->hetero_dloop       = NULL;
    new_dtp->hetero_dloop_size  = -1;
    new_dtp->hetero_dloop_depth = -1;

    switch(type) {
	case MPI_FLOAT_INT:
            PAIRTYPE_SIZE_EXTENT(MPI_FLOAT, float, MPI_INT, int,
                                 type_size, type_extent, el_size, true_ub, alignsize);
	    break;
	case MPI_DOUBLE_INT:
            PAIRTYPE_SIZE_EXTENT(MPI_DOUBLE, double, MPI_INT, int,
                                 type_size, type_extent, el_size, true_ub, alignsize);
	    break;
	case MPI_LONG_INT:
            PAIRTYPE_SIZE_EXTENT(MPI_LONG, long, MPI_INT, int,
                                 type_size, type_extent, el_size, true_ub, alignsize);
	    break;
	case MPI_SHORT_INT:
            PAIRTYPE_SIZE_EXTENT(MPI_SHORT, short, MPI_INT, int,
                                 type_size, type_extent, el_size, true_ub, alignsize);
	    break;
	case MPI_LONG_DOUBLE_INT:
            PAIRTYPE_SIZE_EXTENT(MPI_LONG_DOUBLE, long double, MPI_INT, int,
                                 type_size, type_extent, el_size, true_ub, alignsize);
	    break;
	default:
	    /* --BEGIN ERROR HANDLING-- */
	    mpi_errno = MPIR_Err_create_code(MPI_SUCCESS,
					     MPIR_ERR_RECOVERABLE,
					     "MPIDU_Type_create_pairtype",
					     __LINE__,
					     MPI_ERR_OTHER,
					     "**dtype", 0);
	    return mpi_errno;
	    /* --END ERROR HANDLING-- */
    }

    new_dtp->n_builtin_elements      = 2;
    new_dtp->builtin_element_size    = el_size;
    new_dtp->basic_type          = type;

    new_dtp->has_sticky_lb   = 0;
    new_dtp->true_lb         = 0;
    new_dtp->lb              = 0;

    new_dtp->has_sticky_ub   = 0;
    new_dtp->true_ub         = true_ub;

    new_dtp->size            = type_size;
    new_dtp->ub              = type_extent; /* possible padding */
    new_dtp->extent          = type_extent;
    new_dtp->alignsize       = alignsize;

   /* place maximum on alignment based on padding rules */
    /* There are some really wierd rules for structure alignment;
       these capture the ones of which we are aware. */
    switch(type) {
	case MPI_SHORT_INT:
	case MPI_LONG_INT:
#ifdef HAVE_MAX_INTEGER_ALIGNMENT
	    new_dtp->alignsize       = MPL_MIN(new_dtp->alignsize,
						HAVE_MAX_INTEGER_ALIGNMENT);
#endif
	    break;
	case MPI_FLOAT_INT:
#ifdef HAVE_MAX_FP_ALIGNMENT
	    new_dtp->alignsize       = MPL_MIN(new_dtp->alignsize,
						HAVE_MAX_FP_ALIGNMENT);
#endif
	    break;
	case MPI_DOUBLE_INT:
#ifdef HAVE_MAX_DOUBLE_FP_ALIGNMENT
	    new_dtp->alignsize       = MPL_MIN(new_dtp->alignsize,
						HAVE_MAX_DOUBLE_FP_ALIGNMENT);
#elif defined(HAVE_MAX_FP_ALIGNMENT)
	    new_dtp->alignsize       = MPL_MIN(new_dtp->alignsize,
						HAVE_MAX_FP_ALIGNMENT);
#endif
	    break;
	case MPI_LONG_DOUBLE_INT:
#ifdef HAVE_MAX_LONG_DOUBLE_FP_ALIGNMENT
	    new_dtp->alignsize       = MPL_MIN(new_dtp->alignsize,
					HAVE_MAX_LONG_DOUBLE_FP_ALIGNMENT);
#elif defined(HAVE_MAX_FP_ALIGNMENT)
	    new_dtp->alignsize       = MPL_MIN(new_dtp->alignsize,
						HAVE_MAX_FP_ALIGNMENT);
#endif
	    break;
    }

    new_dtp->is_contig       = (((MPI_Aint) type_size) == type_extent) ? 1 : 0;
    new_dtp->max_contig_blocks = (((MPI_Aint) type_size) == type_extent) ? 1 : 2;

    /* fill in dataloops -- only case where we precreate dataloops
     *
     * this is necessary because these types aren't committed by the
     * user, which is the other place where we create dataloops. so
     * if the user uses one of these w/out building some more complex
     * type and then committing it, then the dataloop will be missing.
     */

#ifdef MPID_NEEDS_DLOOP_ALL_BYTES
    /* If MPID implementation needs use to reduce everything to
       a byte stream, do that. */
    err = MPIDU_Dataloop_create_pairtype(type,
					&(new_dtp->dataloop),
					&(new_dtp->dataloop_size),
					&(new_dtp->dataloop_depth),
					MPIDU_DATALOOP_ALL_BYTES);
#else
    err = MPIDU_Dataloop_create_pairtype(type,
					&(new_dtp->dataloop),
					&(new_dtp->dataloop_size),
					&(new_dtp->dataloop_depth),
					MPIDU_DATALOOP_HOMOGENEOUS);
#endif

    if (!err) {
	err = MPIDU_Dataloop_create_pairtype(type,
					    &(new_dtp->hetero_dloop),
					    &(new_dtp->hetero_dloop_size),
					    &(new_dtp->hetero_dloop_depth),
					    MPIDU_DATALOOP_HETEROGENEOUS);
    }

#ifdef MPID_Type_commit_hook
    if (!err) {
        err =  MPID_Type_commit_hook(new_dtp);
    }
#endif /* MPID_Type_commit_hook */

    /* --BEGIN ERROR HANDLING-- */
    if (err) {
	mpi_errno = MPIR_Err_create_code(MPI_SUCCESS,
					 MPIR_ERR_RECOVERABLE,
					 "MPIDU_Dataloop_create_pairtype",
					 __LINE__,
					 MPI_ERR_OTHER,
					 "**nomem",
					 0);
	return mpi_errno;

    }
    /* --END ERROR HANDLING-- */

    return mpi_errno;
}
Exemple #14
0
/* Avery Ching and Kenin Columa's reworked two-phase algorithm.  Key features
 * - persistent file domains
 * - an option to use alltoall instead of point-to-point
 */
void ADIOI_IOStridedColl(ADIO_File fd, void *buf, int count, int rdwr,
                         MPI_Datatype datatype, int file_ptr_type,
                         ADIO_Offset offset, ADIO_Status * status, int *error_code)
{
    ADIO_Offset min_st_offset = 0, max_end_offset = 0;
    ADIO_Offset st_end_offset[2];
    ADIO_Offset *all_st_end_offsets = NULL;
    int filetype_is_contig, buftype_is_contig, is_contig;
    ADIO_Offset off;
    int interleave_count = 0, i, nprocs, myrank, nprocs_for_coll;
    int cb_enable;
    ADIO_Offset bufsize;
    MPI_Aint extent;
#ifdef DEBUG2
    MPI_Aint bufextent;
#endif
    MPI_Count size;
    int agg_rank;

    ADIO_Offset agg_disp;       /* aggregated file offset */
    MPI_Datatype agg_dtype;     /* aggregated file datatype */

    int aggregators_done = 0;
    ADIO_Offset buffered_io_size = 0;

    int *alltoallw_disps;

    int *alltoallw_counts;
    int *client_alltoallw_counts;
    int *agg_alltoallw_counts;

    char *cb_buf = NULL;

    MPI_Datatype *client_comm_dtype_arr;        /* aggregator perspective */
    MPI_Datatype *agg_comm_dtype_arr;   /* client perspective */
    ADIO_Offset *client_comm_sz_arr;    /* aggregator perspective */
    ADIO_Offset *agg_comm_sz_arr;       /* client perspective */

    /* file views for each client and aggregator */
    view_state *client_file_view_state_arr = NULL;
    view_state *agg_file_view_state_arr = NULL;
    /* mem views for local process */
    view_state *my_mem_view_state_arr = NULL;

    MPI_Status *agg_comm_statuses = NULL;
    MPI_Request *agg_comm_requests = NULL;
    MPI_Status *client_comm_statuses = NULL;
    MPI_Request *client_comm_requests = NULL;
    int aggs_client_count = 0;
    int clients_agg_count = 0;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);
#ifdef DEBUG
    fprintf(stderr, "p%d: entering ADIOI_IOStridedColl\n", myrank);
#endif
#ifdef AGGREGATION_PROFILE
    if (rdwr == ADIOI_READ)
        MPE_Log_event(5010, 0, NULL);
    else
        MPE_Log_event(5012, 0, NULL);
#endif

    /* I need to check if there are any outstanding nonblocking writes
     * to the file, which could potentially interfere with the writes
     * taking place in this collective write call. Since this is not
     * likely to be common, let me do the simplest thing possible here:
     * Each process completes all pending nonblocking operations before
     * completing. */

    nprocs_for_coll = fd->hints->cb_nodes;

    if (rdwr == ADIOI_READ)
        cb_enable = fd->hints->cb_read;
    else
        cb_enable = fd->hints->cb_write;

    /* only check for interleaving if cb_read isn't disabled */
    if (cb_enable != ADIOI_HINT_DISABLE) {
        /* find the starting and ending byte of my I/O access */
        ADIOI_Calc_bounds(fd, count, datatype, file_ptr_type, offset,
                          &st_end_offset[0], &st_end_offset[1]);

        /* allocate an array of start/end pairs */
        all_st_end_offsets = (ADIO_Offset *)
            ADIOI_Malloc(2 * nprocs * sizeof(ADIO_Offset));
        MPI_Allgather(st_end_offset, 2, ADIO_OFFSET, all_st_end_offsets, 2, ADIO_OFFSET, fd->comm);

        min_st_offset = all_st_end_offsets[0];
        max_end_offset = all_st_end_offsets[1];

        for (i = 1; i < nprocs; i++) {
            /* are the accesses of different processes interleaved? */
            if ((all_st_end_offsets[i * 2] < all_st_end_offsets[i * 2 - 1]) &&
                (all_st_end_offsets[i * 2] <= all_st_end_offsets[i * 2 + 1]))
                interleave_count++;
            /* This is a rudimentary check for interleaving, but should
             * suffice for the moment. */

            min_st_offset = MPL_MIN(all_st_end_offsets[i * 2], min_st_offset);
            max_end_offset = MPL_MAX(all_st_end_offsets[i * 2 + 1], max_end_offset);
        }
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    if ((cb_enable == ADIOI_HINT_DISABLE || (!interleave_count && (cb_enable == ADIOI_HINT_AUTO)))
        && (fd->hints->cb_pfr != ADIOI_HINT_ENABLE)) {
        if (cb_enable != ADIOI_HINT_DISABLE) {
            ADIOI_Free(all_st_end_offsets);
        }

        if (buftype_is_contig && filetype_is_contig) {
            if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
                off = fd->disp + (fd->etype_size) * offset;
                if (rdwr == ADIOI_READ)
                    ADIO_ReadContig(fd, buf, count, datatype,
                                    ADIO_EXPLICIT_OFFSET, off, status, error_code);
                else
                    ADIO_WriteContig(fd, buf, count, datatype,
                                     ADIO_EXPLICIT_OFFSET, off, status, error_code);
            } else {
                if (rdwr == ADIOI_READ)
                    ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                                    0, status, error_code);
                else
                    ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                                     0, status, error_code);
            }
        } else {
            if (rdwr == ADIOI_READ)
                ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type,
                                 offset, status, error_code);
            else
                ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
                                  offset, status, error_code);
        }
        return;
    }

    MPI_Type_extent(datatype, &extent);
#ifdef DEBUG2
    bufextent = extent * count;
#endif
    MPI_Type_size_x(datatype, &size);
    bufsize = size * (MPI_Count) count;

    /* Calculate file realms */
    if ((fd->hints->cb_pfr != ADIOI_HINT_ENABLE) || (fd->file_realm_types == NULL))
        ADIOI_Calc_file_realms(fd, min_st_offset, max_end_offset);

    my_mem_view_state_arr = (view_state *)
        ADIOI_Calloc(1, nprocs * sizeof(view_state));
    agg_file_view_state_arr = (view_state *)
        ADIOI_Calloc(1, nprocs * sizeof(view_state));
    client_comm_sz_arr = (ADIO_Offset *)
        ADIOI_Calloc(1, nprocs * sizeof(ADIO_Offset));

    if (fd->is_agg) {
        client_file_view_state_arr = (view_state *)
            ADIOI_Calloc(1, nprocs * sizeof(view_state));
    } else {
        client_file_view_state_arr = NULL;
    }

    /* Alltoallw doesn't like a null array even if the counts are
     * zero.  If you do not include this code, it will fail. */
    client_comm_dtype_arr = (MPI_Datatype *)
        ADIOI_Calloc(1, nprocs * sizeof(MPI_Datatype));
    if (!fd->is_agg)
        for (i = 0; i < nprocs; i++)
            client_comm_dtype_arr[i] = MPI_BYTE;

    ADIOI_Exch_file_views(myrank, nprocs, file_ptr_type, fd, count,
                          datatype, offset, my_mem_view_state_arr,
                          agg_file_view_state_arr, client_file_view_state_arr);

    agg_comm_sz_arr = (ADIO_Offset *)
        ADIOI_Calloc(1, nprocs * sizeof(ADIO_Offset));
    agg_comm_dtype_arr = (MPI_Datatype *)
        ADIOI_Malloc(nprocs * sizeof(MPI_Datatype));
    if (fd->is_agg) {
        ADIOI_Build_agg_reqs(fd, rdwr, nprocs,
                             client_file_view_state_arr,
                             client_comm_dtype_arr, client_comm_sz_arr, &agg_disp, &agg_dtype);
        buffered_io_size = 0;
        for (i = 0; i < nprocs; i++) {
            if (client_comm_sz_arr[i] > 0)
                buffered_io_size += client_comm_sz_arr[i];
        }
    }
#ifdef USE_PRE_REQ
    else {
        /* Example use of ADIOI_Build_client_pre_req. to an
         * appropriate section */

        for (i = 0; i < fd->hints->cb_nodes; i++) {
            agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
#ifdef AGGREGATION_PROFILE
            MPE_Log_event(5040, 0, NULL);
#endif
            ADIOI_Build_client_pre_req(fd, agg_rank, (i + myrank) % fd->hints->cb_nodes,
                                       &(my_mem_view_state_arr[agg_rank]),
                                       &(agg_file_view_state_arr[agg_rank]),
                                       2 * 1024 * 1024, 64 * 1024);
#ifdef AGGREGATION_PROFILE
            MPE_Log_event(5041, 0, NULL);
#endif
        }
    }
#endif


    if (fd->is_agg)
        cb_buf = (char *) ADIOI_Malloc(fd->hints->cb_buffer_size);
    alltoallw_disps = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    alltoallw_counts = client_alltoallw_counts = (int *)
        ADIOI_Calloc(2 * nprocs, sizeof(int));
    agg_alltoallw_counts = &alltoallw_counts[nprocs];

    if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
        /* aggregators pre-post all Irecv's for incoming data from clients */
        if ((fd->is_agg) && (rdwr == ADIOI_WRITE))
            post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                 client_comm_dtype_arr,
                                 client_comm_sz_arr, &agg_comm_requests, &aggs_client_count);
    }
    /* Aggregators send amounts for data requested to clients */
    Exch_data_amounts(fd, nprocs, client_comm_sz_arr, agg_comm_sz_arr,
                      client_alltoallw_counts, agg_alltoallw_counts, &aggregators_done);

#ifdef DEBUG
    fprintf(stderr, "client_alltoallw_counts[ ");
    for (i = 0; i < nprocs; i++) {
        fprintf(stderr, "%d ", client_alltoallw_counts[i]);
    }
    fprintf(stderr, "]\n");
    fprintf(stderr, "agg_alltoallw_counts[ ");
    for (i = 0; i < nprocs; i++) {
        fprintf(stderr, "%d ", agg_alltoallw_counts[i]);
    }
    fprintf(stderr, "]\n");
#endif

    /* keep looping while aggregators still have I/O to do */
    while (aggregators_done != nprocs_for_coll) {
        if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
            /* clients should build datatypes for local memory locations
             * for data communication with aggregators and post
             * communication as the datatypes are built */

            client_comm_requests = (MPI_Request *)
                ADIOI_Calloc(fd->hints->cb_nodes, sizeof(MPI_Request));

            for (i = 0; i < fd->hints->cb_nodes; i++) {
                clients_agg_count = 0;
                agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
                if (agg_comm_sz_arr[agg_rank] > 0) {
                    ADIOI_Build_client_req(fd, agg_rank,
                                           (i + myrank) % fd->hints->cb_nodes,
                                           &(my_mem_view_state_arr[agg_rank]),
                                           &(agg_file_view_state_arr[agg_rank]),
                                           agg_comm_sz_arr[agg_rank],
                                           &(agg_comm_dtype_arr[agg_rank]));

#ifdef AGGREGATION_PROFILE
                    if (i == 0)
                        MPE_Log_event(5038, 0, NULL);
#endif
                    post_client_comm(fd, rdwr, agg_rank, buf,
                                     agg_comm_dtype_arr[agg_rank],
                                     agg_alltoallw_counts[agg_rank],
                                     &client_comm_requests[clients_agg_count]);
                    clients_agg_count++;
                }
            }
#ifdef AGGREGATION_PROFILE
            if (!clients_agg_count)
                MPE_Log_event(5039, 0, NULL);
#endif

            if (rdwr == ADIOI_READ) {
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_READ, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
#ifdef DEBUG
                fprintf(stderr, "expecting from [agg](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    MPI_Type_size_x(agg_comm_dtype_arr[i], &size);
                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                            size, agg_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "]\n");
                if (fd->is_agg) {
                    fprintf(stderr, "sending to [client](disp,size,cnt)=");
                    for (i = 0; i < nprocs; i++) {
                        if (fd->is_agg)
                            MPI_Type_size_x(client_comm_dtype_arr[i], &size);
                        else
                            size = -1;

                        fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                                size, client_alltoallw_counts[i]);
                        if (i != nprocs - 1)
                            fprintf(stderr, ",");
                    }
                    fprintf(stderr, "\n");
                }
                fflush(NULL);
#endif
                /* aggregators post all Isends for outgoing data to clients */
                if (fd->is_agg)
                    post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                         client_comm_dtype_arr,
                                         client_comm_sz_arr,
                                         &agg_comm_requests, &aggs_client_count);

                if (fd->is_agg && aggs_client_count) {
#ifdef MPI_STATUSES_IGNORE
                    agg_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    agg_comm_statuses = ADIOI_Malloc(aggs_client_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(aggs_client_count, agg_comm_requests, agg_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5033, 0, NULL);
#endif
                    ADIOI_Free(agg_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(agg_comm_statuses);
#endif
                }

                if (clients_agg_count) {
#ifdef MPI_STATUSES_IGNORE
                    client_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    client_comm_statuses = ADIOI_Malloc(clients_agg_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(clients_agg_count, client_comm_requests, client_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5039, 0, NULL);
#endif
                    ADIOI_Free(client_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(client_comm_statuses);
#endif
                }
#ifdef DEBUG2
                fprintf(stderr, "buffered_io_size = %lld\n", buffered_io_size);
                if (fd->is_agg && buffered_io_size) {
                    fprintf(stderr, "buf = [");
                    for (i = 0; i < bufextent; i++)
                        fprintf(stderr, "%c", ((char *) buf)[i]);
                    fprintf(stderr, "]\n");
                    fprintf(stderr, "cb_buf = [");
                    for (i = 0; i < buffered_io_size; i++)
                        fprintf(stderr, "%c", cb_buf[i]);
                    fprintf(stderr, "]\n");
                    fflush(NULL);
                }
#endif
            } else {    /* Write Case */
#ifdef DEBUG
                fprintf(stderr, "sending to [agg](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    MPI_Type_size_x(agg_comm_dtype_arr[i], &size);
                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                            size, agg_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "]\n");
                fprintf(stderr, "expecting from [client](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    if (fd->is_agg)
                        MPI_Type_size_x(client_comm_dtype_arr[i], &size);
                    else
                        size = -1;

                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                            size, client_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "\n");
                fflush(NULL);
#endif
#ifdef DEBUG
                fprintf(stderr, "buffered_io_size = %lld\n", buffered_io_size);
#endif

                if (clients_agg_count) {
#ifdef MPI_STATUSES_IGNORE
                    client_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    client_comm_statuses = ADIOI_Malloc(clients_agg_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(clients_agg_count, client_comm_requests, client_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5039, 0, NULL);
#endif
                    ADIOI_Free(client_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(client_comm_statuses);
#endif
                }
#ifdef DEBUG2
                if (bufextent) {
                    fprintf(stderr, "buf = [");
                    for (i = 0; i < bufextent; i++)
                        fprintf(stderr, "%c", ((char *) buf)[i]);
                    fprintf(stderr, "]\n");
                }
#endif

                if (fd->is_agg && buffered_io_size) {
                    ADIOI_Assert(aggs_client_count != 0);
                    /* make sure we actually have the data to write out */
#ifdef MPI_STATUSES_IGNORE
                    agg_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    agg_comm_statuses = (MPI_Status *)
                        ADIOI_Malloc(aggs_client_count * sizeof(MPI_Status));
#endif

                    MPI_Waitall(aggs_client_count, agg_comm_requests, agg_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5033, 0, NULL);
#endif
                    ADIOI_Free(agg_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(agg_comm_statuses);
#endif
#ifdef DEBUG2
                    fprintf(stderr, "cb_buf = [");
                    for (i = 0; i < buffered_io_size; i++)
                        fprintf(stderr, "%c", cb_buf[i]);
                    fprintf(stderr, "]\n");
                    fflush(NULL);
#endif
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_WRITE, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }

            }
        } else {
            /* Alltoallw version of everything */
            ADIOI_Build_client_reqs(fd, nprocs, my_mem_view_state_arr,
                                    agg_file_view_state_arr, agg_comm_sz_arr, agg_comm_dtype_arr);

            if (rdwr == ADIOI_READ) {
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_READ, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5032, 0, NULL);
#endif
                MPI_Alltoallw(cb_buf, client_alltoallw_counts, alltoallw_disps,
                              client_comm_dtype_arr,
                              buf, agg_alltoallw_counts, alltoallw_disps,
                              agg_comm_dtype_arr, fd->comm);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5033, 0, NULL);
#endif
            } else {    /* Write Case */
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5032, 0, NULL);
#endif
                MPI_Alltoallw(buf, agg_alltoallw_counts, alltoallw_disps,
                              agg_comm_dtype_arr,
                              cb_buf, client_alltoallw_counts, alltoallw_disps,
                              client_comm_dtype_arr, fd->comm);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5033, 0, NULL);
#endif
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_WRITE, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
            }
        }

        /* Free (uncommit) datatypes for reuse */
        if (fd->is_agg) {
            if (buffered_io_size > 0) {
                for (i = 0; i < nprocs; i++) {
                    if (client_comm_sz_arr[i] > 0)
                        MPI_Type_free(&client_comm_dtype_arr[i]);
                }
            }
        }
        for (i = 0; i < nprocs; i++) {
            if (agg_comm_sz_arr[i] > 0)
                MPI_Type_free(&agg_comm_dtype_arr[i]);
        }

        /* figure out next set up requests */
        if (fd->is_agg) {
            ADIOI_Build_agg_reqs(fd, rdwr, nprocs,
                                 client_file_view_state_arr,
                                 client_comm_dtype_arr, client_comm_sz_arr, &agg_disp, &agg_dtype);
            buffered_io_size = 0;
            for (i = 0; i < nprocs; i++) {
                if (client_comm_sz_arr[i] > 0)
                    buffered_io_size += client_comm_sz_arr[i];
            }
        }
#ifdef USE_PRE_REQ
        else {
            /* Example use of ADIOI_Build_client_pre_req. to an
             * appropriate section */
            for (i = 0; i < fd->hints->cb_nodes; i++) {
                agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5040, 0, NULL);
#endif
                ADIOI_Build_client_pre_req(fd, agg_rank, (i + myrank) % fd->hints->cb_nodes,
                                           &(my_mem_view_state_arr[agg_rank]),
                                           &(agg_file_view_state_arr[agg_rank]),
                                           2 * 1024 * 1024, 64 * 1024);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5041, 0, NULL);
#endif
            }
        }
#endif

        /* aggregators pre-post all Irecv's for incoming data from
         * clients.  if nothing is needed, agg_comm_requests is not
         * allocated */
        if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
            if ((fd->is_agg) && (rdwr == ADIOI_WRITE))
                post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                     client_comm_dtype_arr,
                                     client_comm_sz_arr, &agg_comm_requests, &aggs_client_count);
        }

        /* Aggregators send amounts for data requested to clients */
        Exch_data_amounts(fd, nprocs, client_comm_sz_arr, agg_comm_sz_arr,
                          client_alltoallw_counts, agg_alltoallw_counts, &aggregators_done);

    }

    /* Clean up */

    if (fd->hints->cb_pfr != ADIOI_HINT_ENABLE) {
        /* AAR, FSIZE, and User provided uniform File realms */
        if (1) {
            MPI_Type_free(&fd->file_realm_types[0]);
        } else {
            for (i = 0; i < fd->hints->cb_nodes; i++) {
                ADIOI_Datatype_iscontig(fd->file_realm_types[i], &is_contig);
                MPI_Type_free(&fd->file_realm_types[i]);
            }
        }
        ADIOI_Free(fd->file_realm_types);
        ADIOI_Free(fd->file_realm_st_offs);
    }


    if (fd->is_agg) {
        if (buffered_io_size > 0)
            MPI_Type_free(&agg_dtype);
        for (i = 0; i < nprocs; i++) {
            MPI_Type_free(&client_comm_dtype_arr[i]);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p->indices);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p->blocklens);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p);
        }
        ADIOI_Free(client_file_view_state_arr);
        ADIOI_Free(cb_buf);
    }
    for (i = 0; i < nprocs; i++)
        if (agg_comm_sz_arr[i] > 0)
            MPI_Type_free(&agg_comm_dtype_arr[i]);

    ADIOI_Free(client_comm_sz_arr);
    ADIOI_Free(client_comm_dtype_arr);
    ADIOI_Free(my_mem_view_state_arr);
    ADIOI_Free(agg_file_view_state_arr);
    ADIOI_Free(agg_comm_sz_arr);
    ADIOI_Free(agg_comm_dtype_arr);
    ADIOI_Free(alltoallw_disps);
    ADIOI_Free(alltoallw_counts);
    ADIOI_Free(all_st_end_offsets);

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status.  The right way is
     * to keep track of how much data was actually read and placed in
     * buf during collective I/O. */
#endif
    fd->fp_sys_posn = -1;       /* set it to null. */
#ifdef AGGREGATION_PROFILE
    if (rdwr == ADIOI_READ)
        MPE_Log_event(5011, 0, NULL);
    else
        MPE_Log_event(5013, 0, NULL);
#endif
}
/* MPIR_Type_get_basic_type_elements()
 *
 * Arguments:
 * - bytes_p - input/output byte count
 * - count - maximum number of this type to subtract from the bytes; a count
 *           of -1 indicates use as many as we like
 * - datatype - input datatype
 *
 * Returns number of elements available given the two constraints of number of
 * bytes and count of types.  Also reduces the byte count by the amount taken
 * up by the types.
 *
 * Assumptions:
 * - the type passed to this function must be a basic *or* a pairtype
 *   (which aren't basic types)
 * - the count is not zero (otherwise we can't tell between a "no more
 *   complete types" case and a "zero count" case)
 *
 * As per section 4.9.3 of the MPI 1.1 specification, the two-part reduction
 * types are to be treated as structs of the constituent types.  So we have to
 * do something special to handle them correctly in here.
 *
 * As per section 3.12.5 get_count and get_elements report the same value for
 * basic datatypes; I'm currently interpreting this to *not* include these
 * reduction types, as they are considered structs.
 */
PMPI_LOCAL MPI_Count MPIR_Type_get_basic_type_elements(MPI_Count *bytes_p,
                                                       MPI_Count count,
                                                       MPI_Datatype datatype)
{
    MPI_Count elements, usable_bytes, used_bytes, type1_sz, type2_sz;

    if (count == 0) return 0;

    /* determine the maximum number of bytes we should take from the
     * byte count.
     */
    if (count < 0) {
        usable_bytes = *bytes_p;
    }
    else {
        usable_bytes = MPL_MIN(*bytes_p,
                           count * MPID_Datatype_get_basic_size(datatype));
    }

    switch (datatype) {
        /* we don't get valid fortran datatype handles in all cases... */
#ifdef HAVE_FORTRAN_BINDING
        case MPI_2REAL:
            type1_sz = type2_sz = MPID_Datatype_get_basic_size(MPI_REAL);
            break;
        case MPI_2DOUBLE_PRECISION:
            type1_sz = type2_sz =
                MPID_Datatype_get_basic_size(MPI_DOUBLE_PRECISION);
            break;
        case MPI_2INTEGER:
            type1_sz = type2_sz = MPID_Datatype_get_basic_size(MPI_INTEGER);
            break;
#endif
        case MPI_2INT:
            type1_sz = type2_sz = MPID_Datatype_get_basic_size(MPI_INT);
            break;
        case MPI_FLOAT_INT:
            type1_sz = MPID_Datatype_get_basic_size(MPI_FLOAT);
            type2_sz = MPID_Datatype_get_basic_size(MPI_INT);
            break;
        case MPI_DOUBLE_INT:
            type1_sz = MPID_Datatype_get_basic_size(MPI_DOUBLE);
            type2_sz = MPID_Datatype_get_basic_size(MPI_INT);
            break;
        case MPI_LONG_INT:
            type1_sz = MPID_Datatype_get_basic_size(MPI_LONG);
            type2_sz = MPID_Datatype_get_basic_size(MPI_INT);
            break;
        case MPI_SHORT_INT:
            type1_sz = MPID_Datatype_get_basic_size(MPI_SHORT);
            type2_sz = MPID_Datatype_get_basic_size(MPI_INT);
            break;
        case MPI_LONG_DOUBLE_INT:
            type1_sz = MPID_Datatype_get_basic_size(MPI_LONG_DOUBLE);
            type2_sz = MPID_Datatype_get_basic_size(MPI_INT);
            break;
        default:
            /* all other types.  this is more complicated than
             * necessary for handling these types, but it puts us in the
             * same code path for all the basics, so we stick with it.
             */
            type1_sz = type2_sz = MPID_Datatype_get_basic_size(datatype);
            break;
    }

    /* determine the number of elements in the region */
    elements = 2 * (usable_bytes / (type1_sz + type2_sz));
    if (usable_bytes % (type1_sz + type2_sz) >= type1_sz) elements++;

    /* determine how many bytes we used up with those elements */
    used_bytes = ((elements / 2) * (type1_sz + type2_sz));
    if (elements % 2 == 1) used_bytes += type1_sz;

    *bytes_p -= used_bytes;

    return elements;
}
Exemple #16
0
/* If successful, error_code is set to MPI_SUCCESS.  Otherwise an error
 * code is created and returned in error_code.
 */
static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
                                 datatype, int nprocs,
                                 int myrank,
                                 ADIOI_Access
                                 * others_req, ADIO_Offset * offset_list,
                                 ADIO_Offset * len_list, int contig_access_count,
                                 ADIO_Offset min_st_offset, ADIO_Offset fd_size,
                                 ADIO_Offset * fd_start, ADIO_Offset * fd_end,
                                 MPI_Aint * buf_idx, int *error_code)
{
/* Send data to appropriate processes and write in sizes of no more
   than coll_bufsize.
   The idea is to reduce the amount of extra memory required for
   collective I/O. If all data were written all at once, which is much
   easier, it would require temp space more than the size of user_buf,
   which is often unacceptable. For example, to write a distributed
   array to a file, where each local array is 8Mbytes, requiring
   at least another 8Mbytes of temp space is unacceptable. */

    /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets */
    ADIO_Offset size = 0;
    int hole, i, j, m, ntimes, max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc = -1, end_loc = -1, off, done, req_off;
    char *write_buf = NULL;
    int *curr_offlen_ptr, *count, *send_size, req_len, *recv_size;
    int *partial_recv, *sent_to_proc, *start_pos, flag;
    int *send_buf_idx, *curr_to_proc, *done_to_proc;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf = NULL;
    MPI_Aint buftype_extent;
    int info_flag, coll_bufsize;
    char *value;
    static char myname[] = "ADIOI_EXCH_AND_WRITE";

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */

/* calculate the number of writes of size coll_bufsize
   to be done by each process and the max among all processes.
   That gives the no. of communication phases as well. */

    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
    ADIOI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag);
    coll_bufsize = atoi(value);
    ADIOI_Free(value);


    for (i = 0; i < nprocs; i++) {
        if (others_req[i].count) {
            st_loc = others_req[i].offsets[0];
            end_loc = others_req[i].offsets[0];
            break;
        }
    }

    for (i = 0; i < nprocs; i++)
        for (j = 0; j < others_req[i].count; j++) {
            st_loc = MPL_MIN(st_loc, others_req[i].offsets[j]);
            end_loc = MPL_MAX(end_loc, (others_req[i].offsets[j]
                                        + others_req[i].lens[j] - 1));
        }

/* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/

    ntimes = (int) ((end_loc - st_loc + coll_bufsize) / coll_bufsize);

    if ((st_loc == -1) && (end_loc == -1)) {
        ntimes = 0;     /* this process does no writing. */
    }

    MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm);

    write_buf = fd->io_buf;

    curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs * 10, sizeof(int));
    /* its use is explained below. calloc initializes to 0. */

    count = curr_offlen_ptr + nprocs;
    /* to store count of how many off-len pairs per proc are satisfied
     * in an iteration. */

    partial_recv = count + nprocs;
    /* if only a portion of the last off-len pair is recd. from a process
     * in a particular iteration, the length recd. is stored here.
     * calloc initializes to 0. */

    send_size = partial_recv + nprocs;
    /* total size of data to be sent to each proc. in an iteration.
     * Of size nprocs so that I can use MPI_Alltoall later. */

    recv_size = send_size + nprocs;
    /* total size of data to be recd. from each proc. in an iteration. */

    sent_to_proc = recv_size + nprocs;
    /* amount of data sent to each proc so far. Used in
     * ADIOI_Fill_send_buffer. initialized to 0 here. */

    send_buf_idx = sent_to_proc + nprocs;
    curr_to_proc = send_buf_idx + nprocs;
    done_to_proc = curr_to_proc + nprocs;
    /* Above three are used in ADIOI_Fill_send_buffer */

    start_pos = done_to_proc + nprocs;
    /* used to store the starting value of curr_offlen_ptr[i] in
     * this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
        flat_buf = ADIOI_Flatten_and_find(datatype);
    }
    MPI_Type_extent(datatype, &buftype_extent);


/* I need to check if there are any outstanding nonblocking writes to
   the file, which could potentially interfere with the writes taking
   place in this collective write call. Since this is not likely to be
   common, let me do the simplest thing possible here: Each process
   completes all pending nonblocking operations before completing. */

    /*ADIOI_Complete_async(error_code);
     * if (*error_code != MPI_SUCCESS) return;
     * MPI_Barrier(fd->comm);
     */

    done = 0;
    off = st_loc;

    for (m = 0; m < ntimes; m++) {
        /* go through all others_req and check which will be satisfied
         * by the current write */

        /* Note that MPI guarantees that displacements in filetypes are in
         * monotonically nondecreasing order and that, for writes, the
         * filetypes cannot specify overlapping regions in the file. This
         * simplifies implementation a bit compared to reads. */

        /* off = start offset in the file for the data to be written in
         * this iteration
         * size = size of data written (bytes) corresponding to off
         * req_off = off in file for a particular contiguous request
         * minus what was satisfied in previous iteration
         * req_size = size corresponding to req_off */

        /* first calculate what should be communicated */

        for (i = 0; i < nprocs; i++)
            count[i] = recv_size[i] = 0;

        size = MPL_MIN((unsigned) coll_bufsize, end_loc - st_loc + 1 - done);

        for (i = 0; i < nprocs; i++) {
            if (others_req[i].count) {
                start_pos[i] = curr_offlen_ptr[i];
                for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) {
                    if (partial_recv[i]) {
                        /* this request may have been partially
                         * satisfied in the previous iteration. */
                        req_off = others_req[i].offsets[j] + partial_recv[i];
                        req_len = others_req[i].lens[j] - partial_recv[i];
                        partial_recv[i] = 0;
                        /* modify the off-len pair to reflect this change */
                        others_req[i].offsets[j] = req_off;
                        others_req[i].lens[j] = req_len;
                    } else {
                        req_off = others_req[i].offsets[j];
                        req_len = others_req[i].lens[j];
                    }
                    if (req_off < off + size) {
                        count[i]++;
                        ADIOI_Assert((((ADIO_Offset) (uintptr_t) write_buf) + req_off - off) ==
                                     (ADIO_Offset) (uintptr_t) (write_buf + req_off - off));
                        MPI_Address(write_buf + req_off - off, &(others_req[i].mem_ptrs[j]));
                        ADIOI_Assert((off + size - req_off) == (int) (off + size - req_off));
                        recv_size[i] += (int) (MPL_MIN(off + size - req_off, (unsigned) req_len));

                        if (off + size - req_off < (unsigned) req_len) {
                            partial_recv[i] = (int) (off + size - req_off);

                            /* --BEGIN ERROR HANDLING-- */
                            if ((j + 1 < others_req[i].count) &&
                                (others_req[i].offsets[j + 1] < off + size)) {
                                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                                   MPIR_ERR_RECOVERABLE,
                                                                   myname,
                                                                   __LINE__,
                                                                   MPI_ERR_ARG,
                                                                   "Filetype specifies overlapping write regions (which is illegal according to the MPI-2 specification)",
                                                                   0);
                                /* allow to continue since additional
                                 * communication might have to occur
                                 */
                            }
                            /* --END ERROR HANDLING-- */
                            break;
                        }
                    } else
                        break;
                }
                curr_offlen_ptr[i] = j;
            }
        }

        ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                              len_list, send_size, recv_size, off, size, count,
                              start_pos, partial_recv,
                              sent_to_proc, nprocs, myrank,
                              buftype_is_contig, contig_access_count,
                              min_st_offset, fd_size, fd_start, fd_end,
                              others_req, send_buf_idx, curr_to_proc,
                              done_to_proc, &hole, m, buftype_extent, buf_idx, error_code);
        if (*error_code != MPI_SUCCESS)
            return;

        flag = 0;
        for (i = 0; i < nprocs; i++)
            if (count[i])
                flag = 1;

        if (flag) {
            ADIOI_Assert(size == (int) size);
            ADIO_WriteContig(fd, write_buf, (int) size, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                             off, &status, error_code);
            if (*error_code != MPI_SUCCESS)
                return;
        }

        off += size;
        done += size;
    }

    for (i = 0; i < nprocs; i++)
        count[i] = recv_size[i] = 0;
    for (m = ntimes; m < max_ntimes; m++) {
        ADIOI_Assert(size == (int) size);
        /* nothing to recv, but check for send. */
        ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                              len_list, send_size, recv_size, off, (int) size, count,
                              start_pos, partial_recv,
                              sent_to_proc, nprocs, myrank,
                              buftype_is_contig, contig_access_count,
                              min_st_offset, fd_size, fd_start, fd_end,
                              others_req, send_buf_idx,
                              curr_to_proc, done_to_proc, &hole, m,
                              buftype_extent, buf_idx, error_code);
        if (*error_code != MPI_SUCCESS)
            return;
    }

    ADIOI_Free(curr_offlen_ptr);
}
Exemple #17
0
void ADIOI_XFS_WriteContig(ADIO_File fd, void *buf, int count, 
                     MPI_Datatype datatype, int file_ptr_type,
		     ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
    int diff, size;
    MPI_Count err=-1, datatype_size;
    ssize_t len;
    void *newbuf;
    static char myname[] = "ADIOI_XFS_WRITECONTIG";

    MPI_Type_size_x(datatype, &datatype_size);
    len = datatype_size * count;

    fd->fp_sys_posn = -1; /* set it to null, since we are using pwrite */

    if (file_ptr_type == ADIO_INDIVIDUAL) offset = fd->fp_ind;

    if (!(fd->direct_write)) {    /* direct I/O not enabled */
	err = pwrite(fd->fd_sys, buf, len, offset);
	if (err < 0) {goto leaving;}
    } else {       /* direct I/O enabled */

	/* (1) if mem_aligned && file_aligned 
                    use direct I/O to write up to correct io_size
                    use buffered I/O for remaining  */

	if (!(((long) buf) % fd->d_mem) && !(offset % fd->d_miniosz)) {
	    err = ADIOI_XFS_Aligned_Mem_File_Write(fd, buf, len, offset);
	    if (err < 0) {goto leaving;}

        /* (2) if !file_aligned
                    use buffered I/O to write up to file_aligned
                    At that point, if still mem_aligned, use (1)
   		        else copy into aligned buf and then use (1) */
	} else if (offset % fd->d_miniosz) {
	    diff = fd->d_miniosz - (offset % fd->d_miniosz);
	    diff = MPL_MIN(diff, len);
	    err = pwrite(fd->fd_sys, buf, diff, offset);
	    if (err < 0) {goto leaving;}

	    buf = ((char *) buf) + diff;
	    offset += diff;
	    size = len - diff;
	    if (!(((long) buf) % fd->d_mem)) {
		err = ADIOI_XFS_Aligned_Mem_File_Write(fd, buf, size, offset);
		if (err < 0) {goto leaving;}
	    }
	    else {
		newbuf = (void *) memalign(XFS_MEMALIGN, size);
		if (newbuf) {
		    memcpy(newbuf, buf, size);
		    err = ADIOI_XFS_Aligned_Mem_File_Write(fd, newbuf, size, offset);
		    ADIOI_Free(newbuf);
		    if (err < 0) {goto leaving;}
		} else {
		    err = pwrite(fd->fd_sys, buf, size, offset);
		    if (err < 0) {goto leaving;}
		}
	    }
	}

        /* (3) if !mem_aligned && file_aligned
    	            copy into aligned buf, then use (1)  */
	else {
	    newbuf = (void *) memalign(XFS_MEMALIGN, len);
	    if (newbuf) {
		memcpy(newbuf, buf, len);
		err = ADIOI_XFS_Aligned_Mem_File_Write(fd, newbuf, len, offset);
		ADIOI_Free(newbuf);
	    } else {
		 err = pwrite(fd->fd_sys, buf, len, offset);
	    }

	    if (err < 0) {goto leaving;}
	}
    }

    if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind += len;

#ifdef HAVE_STATUS_SET_BYTES
    if (err != -1) MPIR_Status_set_bytes(status, datatype, len);
#endif
leaving:
    if (err == -1) {
	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
					   myname, __LINE__, MPI_ERR_IO, "**io",
					   "**io %s", strerror(errno));
    }
    else *error_code = MPI_SUCCESS;
}
Exemple #18
0
void ADIOI_GEN_ReadStrided(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Status *status, int
                           *error_code)
{


    /* offset is in units of etype relative to the filetype. */

    ADIOI_Flatlist_node *flat_buf, *flat_file;
    ADIO_Offset i_offset, new_brd_size, brd_size, size;
    int i, j, k, st_index=0;
    MPI_Count num, bufsize;
    int n_etypes_in_filetype;
    ADIO_Offset n_filetypes, etype_in_filetype, st_n_filetypes, size_in_filetype;
    ADIO_Offset abs_off_in_filetype=0, new_frd_size, frd_size=0, st_frd_size;
    MPI_Count filetype_size, etype_size, buftype_size, partial_read;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off, req_len, sum;
    ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off;
    char *readbuf, *tmp_buf, *value;
    int info_flag;
    unsigned max_bufsize, readbuf_len;
    ADIO_Status status1;

    if (fd->hints->ds_read == ADIOI_HINT_DISABLE) {
        /* if user has disabled data sieving on reads, use naive
        * approach instead.
        */
        ADIOI_GEN_ReadStrided_naive(fd,
                                    buf,
                                    count,
                                    datatype,
                                    file_ptr_type,
                                    offset,
                                    status,
                                    error_code);
        return;
    }

    *error_code = MPI_SUCCESS;  /* changed below if error */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size_x(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, 0);
#endif
        *error_code = MPI_SUCCESS;
        return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size_x(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(MPI_Count)buftype_size * (ADIO_Offset)count));
    bufsize = buftype_size * count;

    /* get max_bufsize from the info object. */

    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value,
                   &info_flag);
    max_bufsize = atoi(value);
    ADIOI_Free(value);


    if (!buftype_is_contig && filetype_is_contig) {

        /* noncontiguous in memory, contiguous in file. */

        flat_buf = ADIOI_Flatten_and_find(datatype);

        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
              fd->disp + (ADIO_Offset)etype_size * offset;

        start_off = off;
        end_offset = off + bufsize - 1;
        readbuf_off = off;
        readbuf = (char *) ADIOI_Malloc(max_bufsize);
        readbuf_len = (unsigned) (MPL_MIN(max_bufsize, end_offset-readbuf_off+1));

        /* if atomicity is true, lock (exclusive) the region to be accessed */
        if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS))
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        ADIO_ReadContig(fd, readbuf, readbuf_len, MPI_BYTE,
                        ADIO_EXPLICIT_OFFSET, readbuf_off, &status1, error_code);
        if (*error_code != MPI_SUCCESS) return;

        for (j=0; j<count; j++)
        {
            for (i=0; i<flat_buf->count; i++) {
                userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
                req_off = off;
                req_len = flat_buf->blocklens[i];
                ADIOI_BUFFERED_READ
                off += flat_buf->blocklens[i];
            }
        }

        if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS))
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

        ADIOI_Free(readbuf);
    }

    else {  /* noncontiguous in file */

        flat_file = ADIOI_Flatten_and_find(fd->filetype);
        disp = fd->disp;

        if (file_ptr_type == ADIO_INDIVIDUAL) {
            /* Wei-keng reworked type processing to be a bit more efficient */
            offset       = fd->fp_ind - disp;
            n_filetypes  = (offset - flat_file->indices[0]) / filetype_extent;
            offset -= (ADIO_Offset)n_filetypes * filetype_extent;
            /* now offset is local to this extent */

            /* find the block where offset is located, skip blocklens[i]==0 */
            for (i=0; i<flat_file->count; i++) {
                ADIO_Offset dist;
                if (flat_file->blocklens[i] == 0) continue;
                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
                /* frd_size is from offset to the end of block i */
                if (dist == 0) {
                    i++;
                    offset   = flat_file->indices[i];
                    frd_size = flat_file->blocklens[i];
                    break;
                }
                if (dist > 0) {
                    frd_size = dist;
                    break;
                }
            }
            st_index = i;  /* starting index in flat_file->indices[] */
            offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
        }
        else {
            n_etypes_in_filetype = filetype_size/etype_size;
            n_filetypes = offset / n_etypes_in_filetype;
            etype_in_filetype = offset % n_etypes_in_filetype;
            size_in_filetype = etype_in_filetype * etype_size;

            sum = 0;
            for (i=0; i<flat_file->count; i++) {
                sum += flat_file->blocklens[i];
                if (sum > size_in_filetype) {
                    st_index = i;
                    frd_size = sum - size_in_filetype;
                    abs_off_in_filetype = flat_file->indices[i] +
                                          size_in_filetype - (sum - flat_file->blocklens[i]);
                    break;
                }
            }

            /* abs. offset in bytes in the file */
            offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
                     abs_off_in_filetype;
        }

        start_off = offset;

        /* Wei-keng Liao: read request is within a single flat_file contig
         * block e.g. with subarray types that actually describe the whole
         * array */
        if (buftype_is_contig && bufsize <= frd_size) {
            /* a count of bytes can overflow. operate on original type instead */
            ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                            offset, status, error_code);

            if (file_ptr_type == ADIO_INDIVIDUAL) {
                /* update MPI-IO file pointer to point to the first byte that
                * can be accessed in the fileview. */
                fd->fp_ind = offset + bufsize;
                if (bufsize == frd_size) {
                    do {
                        st_index++;
                        if (st_index == flat_file->count) {
                            st_index = 0;
                            n_filetypes++;
                        }
                    } while (flat_file->blocklens[st_index] == 0);
                    fd->fp_ind = disp + flat_file->indices[st_index]
                                 + n_filetypes*filetype_extent;
                }
            }
            fd->fp_sys_posn = -1;   /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
            MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
            return;
        }

        /* Calculate end_offset, the last byte-offset that will be accessed.
          e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/

        st_frd_size = frd_size;
        st_n_filetypes = n_filetypes;
        i_offset = 0;
        j = st_index;
        off = offset;
        frd_size = MPL_MIN(st_frd_size, bufsize);
        while (i_offset < bufsize) {
            i_offset += frd_size;
            end_offset = off + frd_size - 1;

            j = (j+1) % flat_file->count;
            n_filetypes += (j == 0) ? 1 : 0;
            while (flat_file->blocklens[j]==0) {
                j = (j+1) % flat_file->count;
                n_filetypes += (j == 0) ? 1 : 0;
            }
            off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent;
            frd_size = MPL_MIN(flat_file->blocklens[j], bufsize-i_offset);
        }

        /* if atomicity is true, lock (exclusive) the region to be accessed */
        if ((fd->atomicity) && ADIO_Feature(fd, ADIO_LOCKS))
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        readbuf_off = 0;
        readbuf_len = 0;
        readbuf = (char *) ADIOI_Malloc(max_bufsize);

        if (buftype_is_contig && !filetype_is_contig) {

            /* contiguous in memory, noncontiguous in file. should be the most
               common case. */

            i_offset = 0;
            j = st_index;
            off = offset;
            n_filetypes = st_n_filetypes;
            frd_size = MPL_MIN(st_frd_size, bufsize);
            while (i_offset < bufsize) {
                if (frd_size) {
                    /* TYPE_UB and TYPE_LB can result in
                       frd_size = 0. save system call in such cases */
                    /* lseek(fd->fd_sys, off, SEEK_SET);
                    err = read(fd->fd_sys, ((char *) buf) + i, frd_size);*/

                    req_off = off;
                    req_len = frd_size;
                    userbuf_off = i_offset;
                    ADIOI_BUFFERED_READ
                }
                i_offset += frd_size;

                if (off + frd_size < disp + flat_file->indices[j] +
                        flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
                    off += frd_size;
                /* did not reach end of contiguous block in filetype.
                   no more I/O needed. off is incremented by frd_size. */
                else {
                    j = (j+1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    while (flat_file->blocklens[j]==0) {
                        j = (j+1) % flat_file->count;
                        n_filetypes += (j == 0) ? 1 : 0;
                    }
                    off = disp + flat_file->indices[j] +
                          n_filetypes*(ADIO_Offset)filetype_extent;
                    frd_size = MPL_MIN(flat_file->blocklens[j], bufsize-i_offset);
                }
            }
        }
        else {
Exemple #19
0
/* this used to be implemented in every file system as an fcntl, but the code
 * is identical for all file systems without a real "preallocate" system call.
 * This naive approach will get the job done, but not in a terribly efficient
 * manner.
 */
void ADIOI_GEN_Prealloc(ADIO_File fd, ADIO_Offset diskspace, int *error_code) 
{
	ADIO_Offset curr_fsize, alloc_size, size, len, done, ntimes;
	ADIO_Status status;
	int i;
	char *buf;
	ADIO_Fcntl_t *fcntl_struct;
	static char myname[] = "ADIOI_GEN_PREALLOC";

	/* will be called by one process only */
	/* On file systems with no preallocation function, we have to 
           explicitly write 
           to allocate space. Since there could be holes in the file, 
           we need to read up to the current file size, write it back, 
           and then write beyond that depending on how much 
           preallocation is needed.
           read/write in sizes of no more than ADIOI_PREALLOC_BUFSZ */

	/*curr_fsize = fd->fp_ind; */
	fcntl_struct = (ADIO_Fcntl_t *) ADIOI_Malloc(sizeof(ADIO_Fcntl_t));
	ADIO_Fcntl(fd, ADIO_FCNTL_GET_FSIZE, fcntl_struct, error_code);

	curr_fsize = fcntl_struct->fsize; /* don't rely on fd->fp_ind: might be
					    working on a pre-existing file */
	alloc_size = diskspace;

	size = MPL_MIN(curr_fsize, alloc_size);
	
	ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ;
	buf = (char *) ADIOI_Malloc(ADIOI_PREALLOC_BUFSZ);
	done = 0;

	for (i=0; i<ntimes; i++) {
	    len = MPL_MIN(size-done, ADIOI_PREALLOC_BUFSZ);
	    ADIO_ReadContig(fd, buf, 
                      (int)len, /* len is ADIO_Offset but is <= ADIOI_PREALLOC_BUFSZ (16M),
                              so it fits in an int parameter */
                      MPI_BYTE, ADIO_EXPLICIT_OFFSET, done,
			    &status, error_code);
	    if (*error_code != MPI_SUCCESS) {
		*error_code = MPIO_Err_create_code(MPI_SUCCESS,
						   MPIR_ERR_RECOVERABLE,
						   myname, __LINE__,
						   MPI_ERR_IO, 
						   "**iopreallocrdwr",
						   0);
                return;  
	    }
	    ADIO_WriteContig(fd, buf, 
                       (int)len, /* len is ADIO_Offset but is <= ADIOI_PREALLOC_BUFSZ (16M),
                               so it fits in an int parameter */
                       MPI_BYTE, ADIO_EXPLICIT_OFFSET, 
                             done, &status, error_code);
	    if (*error_code != MPI_SUCCESS) return;
	    done += len;
	}

	if (alloc_size > curr_fsize) {
	    memset(buf, 0, ADIOI_PREALLOC_BUFSZ); 
	    size = alloc_size - curr_fsize;
	    ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ;
	    for (i=0; i<ntimes; i++) {
		len = MPL_MIN(alloc_size-done, ADIOI_PREALLOC_BUFSZ);
		ADIO_WriteContig(fd, buf, 
                     (int)len, /* len is ADIO_Offset but is <= ADIOI_PREALLOC_BUFSZ (16M),
                             so it fits in an int parameter */
                     MPI_BYTE, ADIO_EXPLICIT_OFFSET, 
				 done, &status, error_code);
		if (*error_code != MPI_SUCCESS) return;
		done += len;  
	    }
	}
	ADIOI_Free(fcntl_struct);
	ADIOI_Free(buf);
	*error_code = MPI_SUCCESS;
}