Пример #1
0
void ADIOI_Calc_file_domains(ADIO_Offset *st_offsets, ADIO_Offset
			     *end_offsets, int nprocs, int nprocs_for_coll,
			     ADIO_Offset *min_st_offset_ptr,
			     ADIO_Offset **fd_start_ptr, ADIO_Offset
			     **fd_end_ptr, int min_fd_size,
			     ADIO_Offset *fd_size_ptr,
			     int striping_unit)
{
    /* Split the aggregate access range [lowest, highest] of all "nprocs"
     * processes into "nprocs_for_coll" consecutive file domains (FDs).
     *
     * Outputs:
     *   *fd_start_ptr, *fd_end_ptr  arrays of nprocs_for_coll entries,
     *                               allocated here with ADIOI_Malloc;
     *                               a domain lying entirely past the end
     *                               of the range is marked with -1/-1
     *   *fd_size_ptr                nominal size of one domain
     *   *min_st_offset_ptr          smallest start offset over all processes
     */
    ADIO_Offset lowest, highest, chunk;
    ADIO_Offset *dom_begin, *dom_end;
    int p;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5004, 0, NULL);
#endif

#ifdef AGG_DEBUG
    FPRINTF(stderr, "ADIOI_Calc_file_domains: %d aggregator(s)\n", 
	    nprocs_for_coll);
#endif

    /* overall extent of the access: smallest start, largest end */
    lowest  = st_offsets[0];
    highest = end_offsets[0];
    for (p = 1; p < nprocs; p++) {
        lowest  = ADIOI_MIN(lowest, st_offsets[p]);
        highest = ADIOI_MAX(highest, end_offsets[p]);
    }

    /* equal partitioning with ceiling division (HPF block distribution) */
    chunk = ((highest - lowest + 1) + nprocs_for_coll - 1) / nprocs_for_coll;

    /* Never let a domain shrink below the threshold: many tiny requests
     * and one huge request are both bad, the sweet spot is system
     * dependent and supplied by the caller as min_fd_size. */
    if (min_fd_size > chunk) {
        chunk = min_fd_size;
    }

    dom_begin = (ADIO_Offset *)
        ADIOI_Malloc(nprocs_for_coll*sizeof(ADIO_Offset));
    dom_end = (ADIO_Offset *)
        ADIOI_Malloc(nprocs_for_coll*sizeof(ADIO_Offset));
    *fd_start_ptr = dom_begin;
    *fd_end_ptr   = dom_end;

    if (striping_unit > 0) {
        /* Wei-keng Liao: align the end of every file domain to the nearest
         * file lock boundary (a multiple of the striping_unit hint).
         * Other alignment strategies could be tried here. */
        for (p = 0; p < nprocs_for_coll; p++) {
            ADIO_Offset boundary;
            int below, above;

            /* a domain starts right after its predecessor */
            if (p == 0) {
                dom_begin[p] = lowest;
            }
            else {
                dom_begin[p] = dom_end[p-1] + 1;
            }

            /* unaligned end of domain p, then snap it to the closer
             * boundary; a tie is resolved upward */
            boundary = lowest + chunk * (p+1);
            below    = boundary % striping_unit;
            above    = striping_unit - below;
            if (below < above) {
                boundary -= below;
            }
            else {
                boundary += above;
            }
            dom_end[p] = boundary - 1;
        }
        /* the last domain always reaches the end of the access range */
        dom_end[nprocs_for_coll-1] = highest;
    }
    else {
        /* no striping hint: plain back-to-back domains of "chunk" bytes */
        for (p = 0; p < nprocs_for_coll; p++) {
            if (p == 0) {
                dom_begin[p] = lowest;
            }
            else {
                dom_begin[p] = dom_end[p-1] + 1;
            }
            dom_end[p] = dom_begin[p] + chunk - 1;
        }
    }

    /* The range is generally not divisible by the number of aggregators
     * (e.g. 97 bytes over 16 processes with ceiling division), so the
     * trailing domains may be short or empty.  Mark empty ones with -1
     * and clip the rest to the end of the range. */
    for (p = 0; p < nprocs_for_coll; p++) {
        if (dom_begin[p] > highest) {
            dom_begin[p] = -1;
            dom_end[p]   = -1;
        }
        if (dom_end[p] > highest) {
            dom_end[p] = highest;
        }
    }

    *fd_size_ptr = chunk;
    *min_st_offset_ptr = lowest;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5005, 0, NULL);
#endif
}
Пример #2
0
/* ADIOI_Exch_and_write - exchange data with the other processes and write
 * the requests collected in others_req, at most coll_bufsize bytes per
 * iteration.
 *
 * If successful, error_code is set to MPI_SUCCESS.  Otherwise an error
 * code is created and returned in error_code.  The scratch arrays
 * allocated by this routine are released on every exit path, including
 * the early exits taken when an exchange or a write reports an error.
 */
static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
				 datatype, int nprocs,
				 int myrank,
				 ADIOI_Access
				 *others_req, ADIO_Offset *offset_list,
				 ADIO_Offset *len_list, int contig_access_count,
				 ADIO_Offset min_st_offset, ADIO_Offset fd_size,
				 ADIO_Offset *fd_start, ADIO_Offset *fd_end,
				 int *buf_idx, int *error_code)
{
/* Send data to appropriate processes and write in sizes of no more
   than coll_bufsize.
   The idea is to reduce the amount of extra memory required for
   collective I/O. If all data were written all at once, which is much
   easier, it would require temp space more than the size of user_buf,
   which is often unacceptable. For example, to write a distributed
   array to a file, where each local array is 8Mbytes, requiring
   at least another 8Mbytes of temp space is unacceptable. */

    /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets*/
    ADIO_Offset size=0;
    int hole, i, j, m, ntimes, max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc=-1, end_loc=-1, off, done, req_off;
    char *write_buf=NULL;
    int *curr_offlen_ptr, *count, *send_size, req_len, *recv_size;
    int *partial_recv, *sent_to_proc, *start_pos, flag;
    int *send_buf_idx, *curr_to_proc, *done_to_proc;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf=NULL;
    MPI_Aint buftype_extent;
    int info_flag, coll_bufsize;
    char *value;
    static char myname[] = "ADIOI_EXCH_AND_WRITE";

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */

/* calculate the number of writes of size coll_bufsize
   to be done by each process and the max among all processes.
   That gives the no. of communication phases as well. */

    /* NOTE(review): info_flag is not checked, so this relies on the
       "cb_buffer_size" hint always being present in fd->info - confirm. */
    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    ADIOI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value,
                 &info_flag);
    coll_bufsize = atoi(value);
    ADIOI_Free(value);


    /* grab some initial values for st_loc and end_loc */
    for (i=0; i < nprocs; i++) {
	if (others_req[i].count) {
	    st_loc = others_req[i].offsets[0];
	    end_loc = others_req[i].offsets[0];
	    break;
	}
    }

    /* now find the real values */
    for (i=0; i < nprocs; i++)
	for (j=0; j < others_req[i].count; j++) {
	    st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
	    end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j]
				       + others_req[i].lens[j] - 1));
	}

    /* Only divide by coll_bufsize when there is something to write; this
       mirrors the read path and keeps the division out of the way for a
       process that received no requests. */
    if ((st_loc==-1) && (end_loc==-1)) {
	ntimes = 0; /* this process does no writing. */
    }
    else {
	/* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/
	ntimes = (int) ((end_loc - st_loc + coll_bufsize)/coll_bufsize);
    }

    MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX,
		  fd->comm);

    write_buf = fd->io_buf;

    curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* its use is explained below. calloc initializes to 0. */

    count = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    partial_recv = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* if only a portion of the last off-len pair is recd. from a process
       in a particular iteration, the length recd. is stored here.
       calloc initializes to 0. */

    send_size = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* total size of data to be sent to each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    recv_size = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration.*/

    sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data sent to each proc so far. Used in
       ADIOI_Fill_send_buffer. initialized to 0 here. */

    send_buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    curr_to_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    done_to_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* Above three are used in ADIOI_Fill_send_buffer*/

    start_pos = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* used to store the starting value of curr_offlen_ptr[i] in
       this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
	ADIOI_Flatten_datatype(datatype);
	flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;
    }
    MPI_Type_extent(datatype, &buftype_extent);


/* I need to check if there are any outstanding nonblocking writes to
   the file, which could potentially interfere with the writes taking
   place in this collective write call. Since this is not likely to be
   common, let me do the simplest thing possible here: Each process
   completes all pending nonblocking operations before completing. */

    /*ADIOI_Complete_async(error_code);
    if (*error_code != MPI_SUCCESS) return;
    MPI_Barrier(fd->comm);
    */

    done = 0;
    off = st_loc;

    for (m=0; m < ntimes; m++) {
       /* go through all others_req and check which will be satisfied
          by the current write */

       /* Note that MPI guarantees that displacements in filetypes are in
          monotonically nondecreasing order and that, for writes, the
	  filetypes cannot specify overlapping regions in the file. This
	  simplifies implementation a bit compared to reads. */

          /* off = start offset in the file for the data to be written in
                   this iteration
             size = size of data written (bytes) corresponding to off
             req_off = off in file for a particular contiguous request
                       minus what was satisfied in previous iteration
             req_len = size corresponding to req_off */

	/* first calculate what should be communicated */

	for (i=0; i < nprocs; i++) count[i] = recv_size[i] = 0;

	size = ADIOI_MIN((unsigned)coll_bufsize, end_loc-st_loc+1-done);

	for (i=0; i < nprocs; i++) {
	    if (others_req[i].count) {
		start_pos[i] = curr_offlen_ptr[i];
		for (j=curr_offlen_ptr[i]; j<others_req[i].count; j++) {
		    if (partial_recv[i]) {
			/* this request may have been partially
			   satisfied in the previous iteration. */
			req_off = others_req[i].offsets[j] +
			    partial_recv[i];
                        req_len = others_req[i].lens[j] -
			    partial_recv[i];
			partial_recv[i] = 0;
			/* modify the off-len pair to reflect this change */
			others_req[i].offsets[j] = req_off;
			others_req[i].lens[j] = req_len;
		    }
		    else {
			req_off = others_req[i].offsets[j];
                        req_len = others_req[i].lens[j];
		    }
		    if (req_off < off + size) {
			count[i]++;
      ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)write_buf)+req_off-off) == (ADIO_Offset)(MPIR_Upint)(write_buf+req_off-off));
			MPI_Address(write_buf+req_off-off,
                               &(others_req[i].mem_ptrs[j]));
      ADIOI_Assert((off + size - req_off) == (int)(off + size - req_off));
			recv_size[i] += (int)(ADIOI_MIN(off + size - req_off,
                                      (unsigned)req_len));

			if (off+size-req_off < (unsigned)req_len)
			{
			    partial_recv[i] = (int) (off + size - req_off);

			    /* --BEGIN ERROR HANDLING-- */
			    if ((j+1 < others_req[i].count) &&
                                 (others_req[i].offsets[j+1] < off+size))
			    {
				*error_code = MPIO_Err_create_code(MPI_SUCCESS,
								   MPIR_ERR_RECOVERABLE,
								   myname,
								   __LINE__,
								   MPI_ERR_ARG,
								   "Filetype specifies overlapping write regions (which is illegal according to the MPI-2 specification)", 0);
				/* allow to continue since additional
				 * communication might have to occur
				 */
			    }
			    /* --END ERROR HANDLING-- */
			    break;
			}
		    }
		    else break;
		}
		curr_offlen_ptr[i] = j;
	    }
	}

	ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                            len_list, send_size, recv_size, off, size, count,
                            start_pos, partial_recv,
                            sent_to_proc, nprocs, myrank,
			    buftype_is_contig, contig_access_count,
			    min_st_offset, fd_size, fd_start, fd_end,
			    others_req, send_buf_idx, curr_to_proc,
                            done_to_proc, &hole, m, buftype_extent, buf_idx,
			    error_code);
        /* leave through the cleanup label so the scratch arrays are freed */
        if (*error_code != MPI_SUCCESS) goto fn_exit;

	/* write only if at least one request fell into this window */
	flag = 0;
	for (i=0; i<nprocs; i++)
	    if (count[i]) flag = 1;

	if (flag) {
      ADIOI_Assert(size == (int)size);
	    ADIO_WriteContig(fd, write_buf, (int)size, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                        off, &status, error_code);
	    if (*error_code != MPI_SUCCESS) goto fn_exit;
	}

	off += size;
	done += size;
    }

    /* Processes with fewer write phases than the global maximum still
       have to take part in the remaining communication phases. */
    for (i=0; i<nprocs; i++) count[i] = recv_size[i] = 0;
    for (m=ntimes; m<max_ntimes; m++) {
	ADIOI_Assert(size == (int)size);
	/* nothing to recv, but check for send. */
	ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                            len_list, send_size, recv_size, off, (int)size, count,
                            start_pos, partial_recv,
                            sent_to_proc, nprocs, myrank,
			    buftype_is_contig, contig_access_count,
			    min_st_offset, fd_size, fd_start, fd_end,
			    others_req, send_buf_idx,
                            curr_to_proc, done_to_proc, &hole, m,
                            buftype_extent, buf_idx, error_code);
        if (*error_code != MPI_SUCCESS) goto fn_exit;
    }

fn_exit:
    /* single release point for everything allocated above; reached both
       on success and after an error */
    ADIOI_Free(curr_offlen_ptr);
    ADIOI_Free(count);
    ADIOI_Free(partial_recv);
    ADIOI_Free(send_size);
    ADIOI_Free(recv_size);
    ADIOI_Free(sent_to_proc);
    ADIOI_Free(start_pos);
    ADIOI_Free(send_buf_idx);
    ADIOI_Free(curr_to_proc);
    ADIOI_Free(done_to_proc);
}
Пример #3
0
/* ADIOI_Read_and_exch - read the requests collected in others_req, at
 * most coll_bufsize bytes per iteration, and exchange the data with the
 * other processes.
 *
 * error_code is set to MPI_SUCCESS on entry and is overwritten by a
 * failing ADIO_ReadContig.  The scratch arrays allocated by this routine
 * are released on every exit path, including the early exit taken after
 * a failed read.
 */
static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
			 datatype, int nprocs,
			 int myrank, ADIOI_Access
			 *others_req, ADIO_Offset *offset_list,
			 ADIO_Offset *len_list, int contig_access_count, ADIO_Offset
                         min_st_offset, ADIO_Offset fd_size,
			 ADIO_Offset *fd_start, ADIO_Offset *fd_end,
                         int *buf_idx, int *error_code)
{
/* Read in sizes of no more than coll_bufsize, an info parameter.
   Send data to appropriate processes. 
   Place recd. data in user buf.
   The idea is to reduce the amount of extra memory required for
   collective I/O. If all data were read all at once, which is much
   easier, it would require temp space more than the size of user_buf,
   which is often unacceptable. For example, to read a distributed
   array from a file, where each local array is 8Mbytes, requiring
   at least another 8Mbytes of temp space is unacceptable. */

    int i, j, m, ntimes, max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc=-1, end_loc=-1, off, done, real_off, req_off;
    char *read_buf = NULL, *tmp_buf;
    int *curr_offlen_ptr, *count, *send_size, *recv_size;
    int *partial_send, *recd_from_proc, *start_pos;
    /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets*/
    ADIO_Offset real_size, size, for_curr_iter, for_next_iter;
    int req_len, flag, rank;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf=NULL;
    MPI_Aint buftype_extent;
    int coll_bufsize;

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */
    
/* calculate the number of reads of size coll_bufsize
   to be done by each process and the max among all processes.
   That gives the no. of communication phases as well.
   coll_bufsize is obtained from the hints object. */

    coll_bufsize = fd->hints->cb_buffer_size;

    /* grab some initial values for st_loc and end_loc */
    for (i=0; i < nprocs; i++) {
	if (others_req[i].count) {
	    st_loc = others_req[i].offsets[0];
	    end_loc = others_req[i].offsets[0];
	    break;
	}
    }

    /* now find the real values */
    for (i=0; i < nprocs; i++)
	for (j=0; j<others_req[i].count; j++) {
	    st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
	    end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j]
					  + others_req[i].lens[j] - 1));
	}

    /* calculate ntimes, the number of times this process must perform I/O
     * operations in order to complete all the requests it has received.
     * the need for multiple I/O operations comes from the restriction that
     * we only use coll_bufsize bytes of memory for internal buffering.
     */
    if ((st_loc==-1) && (end_loc==-1)) {
	/* this process does no I/O. */
	ntimes = 0;
    }
    else {
	/* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/
	ntimes = (int) ((end_loc - st_loc + coll_bufsize)/coll_bufsize);
    }

    MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm); 

    read_buf = fd->io_buf;  /* Allocated at open time */

    curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); 
    /* its use is explained below. calloc initializes to 0. */

    count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    partial_send = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* if only a portion of the last off-len pair is sent to a process 
       in a particular iteration, the length sent is stored here.
       calloc initializes to 0. */

    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be sent to each proc. in an iteration */

    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    recd_from_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data recd. so far from each proc. Used in
       ADIOI_Fill_user_buffer. initialized to 0 here. */

    start_pos = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* used to store the starting value of curr_offlen_ptr[i] in 
       this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
	ADIOI_Flatten_datatype(datatype);
	flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;
    }
    MPI_Type_extent(datatype, &buftype_extent);

    done = 0;
    off = st_loc;
    for_curr_iter = for_next_iter = 0;

    MPI_Comm_rank(fd->comm, &rank);

    for (m=0; m<ntimes; m++) {
       /* read buf of size coll_bufsize (or less) */
       /* go through all others_req and check if any are satisfied
          by the current read */

       /* since MPI guarantees that displacements in filetypes are in 
          monotonically nondecreasing order, I can maintain a pointer
	  (curr_offlen_ptr) to 
          current off-len pair for each process in others_req and scan
          further only from there. There is still a problem of filetypes
          such as:  (1, 2, 3 are not process nos. They are just numbers for
          three chunks of data, specified by a filetype.)

                   1  -------!--
                   2    -----!----
                   3       --!-----

          where ! indicates where the current read_size limitation cuts 
          through the filetype.  I resolve this by reading up to !, but
          filling the communication buffer only for 1. I copy the portion
          left over for 2 into a tmp_buf for use in the next
	  iteration. i.e., 2 and 3 will be satisfied in the next
	  iteration. This simplifies filling in the user's buf at the
	  other end, as only one off-len pair with incomplete data
	  will be sent. I also don't need to send the individual
	  offsets and lens along with the data, as the data is being
	  sent in a particular order. */ 

          /* off = start offset in the file for the data actually read in 
                   this iteration 
             size = size of data read corresponding to off
             real_off = off minus whatever data was retained in memory from
                  previous iteration for cases like 2, 3 illustrated above
             real_size = size plus the extra corresponding to real_off
             req_off = off in file for a particular contiguous request 
                       minus what was satisfied in previous iteration
             req_len = size corresponding to req_off */

	size = ADIOI_MIN((unsigned)coll_bufsize, end_loc-st_loc+1-done); 
	real_off = off - for_curr_iter;
	real_size = size + for_curr_iter;

	for (i=0; i<nprocs; i++) count[i] = send_size[i] = 0;
	for_next_iter = 0;

	for (i=0; i<nprocs; i++) {
#ifdef RDCOLL_DEBUG
	    DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n", rank, i, others_req[i].count); 
#endif
	    if (others_req[i].count) {
		start_pos[i] = curr_offlen_ptr[i];
		for (j=curr_offlen_ptr[i]; j<others_req[i].count;
		     j++) {
		    if (partial_send[i]) {
			/* this request may have been partially
			   satisfied in the previous iteration. */
			req_off = others_req[i].offsets[j] +
			    partial_send[i]; 
                        req_len = others_req[i].lens[j] -
			    partial_send[i];
			partial_send[i] = 0;
			/* modify the off-len pair to reflect this change */
			others_req[i].offsets[j] = req_off;
			others_req[i].lens[j] = req_len;
		    }
		    else {
			req_off = others_req[i].offsets[j];
                        req_len = others_req[i].lens[j];
		    }
		    if (req_off < real_off + real_size) {
			count[i]++;
      ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+req_off-real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf+req_off-real_off));
			MPI_Address(read_buf+req_off-real_off, 
                               &(others_req[i].mem_ptrs[j]));
      ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));
			send_size[i] += (int)(ADIOI_MIN(real_off + real_size - req_off, 
                                      (ADIO_Offset)(unsigned)req_len)); 

			if (real_off+real_size-req_off < (ADIO_Offset)(unsigned)req_len) {
			    partial_send[i] = (int) (real_off + real_size - req_off);
			    if ((j+1 < others_req[i].count) && 
                                 (others_req[i].offsets[j+1] < 
                                     real_off+real_size)) { 
				/* this is the case illustrated in the
				   figure above. */
				for_next_iter = ADIOI_MAX(for_next_iter,
					  real_off + real_size - others_req[i].offsets[j+1]); 
				/* max because it must cover requests 
				   from different processes */
			    }
			    break;
			}
		    }
		    else break;
		}
		curr_offlen_ptr[i] = j;
	    }
	}

	/* read only if at least one request fell into this window */
	flag = 0;
	for (i=0; i<nprocs; i++)
	    if (count[i]) flag = 1;

	if (flag) {
      ADIOI_Assert(size == (int)size);
	    ADIO_ReadContig(fd, read_buf+for_curr_iter, (int)size, MPI_BYTE,
			    ADIO_EXPLICIT_OFFSET, off, &status, error_code);
	    /* leave through the cleanup label so the scratch arrays are
	       freed */
	    if (*error_code != MPI_SUCCESS) goto fn_exit;
	}
	
	for_curr_iter = for_next_iter;
	
	ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
			    send_size, recv_size, count, 
       			    start_pos, partial_send, recd_from_proc, nprocs,
			    myrank, 
			    buftype_is_contig, contig_access_count,
			    min_st_offset, fd_size, fd_start, fd_end,
			    others_req, 
                            m, buftype_extent, buf_idx); 


	if (for_next_iter) {
	    /* Carry the tail of this window over to the front of a freshly
	       allocated io_buf, which is enlarged by for_next_iter so the
	       next read of up to coll_bufsize bytes still fits behind it. */
	    tmp_buf = (char *) ADIOI_Malloc(for_next_iter);
      ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+real_size-for_next_iter) == (ADIO_Offset)(MPIR_Upint)(read_buf+real_size-for_next_iter));
      ADIOI_Assert((for_next_iter+coll_bufsize) == (size_t)(for_next_iter+coll_bufsize));
	    memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter);
	    ADIOI_Free(fd->io_buf);
	    fd->io_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
	    memcpy(fd->io_buf, tmp_buf, for_next_iter);
	    read_buf = fd->io_buf;
	    ADIOI_Free(tmp_buf);
	}

	off += size;
	done += size;
    }

    /* Processes with fewer read phases than the global maximum still have
       to take part in the remaining communication phases. */
    for (i=0; i<nprocs; i++) count[i] = send_size[i] = 0;
    for (m=ntimes; m<max_ntimes; m++) 
/* nothing to send, but check for recv. */
	ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
			    send_size, recv_size, count, 
			    start_pos, partial_send, recd_from_proc, nprocs,
			    myrank, 
			    buftype_is_contig, contig_access_count,
			    min_st_offset, fd_size, fd_start, fd_end,
			    others_req, m,
                            buftype_extent, buf_idx); 

fn_exit:
    /* single release point for everything allocated above; reached both
       on success and after a failed read */
    ADIOI_Free(curr_offlen_ptr);
    ADIOI_Free(count);
    ADIOI_Free(partial_send);
    ADIOI_Free(send_size);
    ADIOI_Free(recv_size);
    ADIOI_Free(recd_from_proc);
    ADIOI_Free(start_pos);
}
Пример #4
0
void ADIOI_Calc_others_req(ADIO_File fd, int count_my_req_procs,
                           int *count_my_req_per_proc,
                           ADIOI_Access *my_req,
                           int nprocs, int myrank,
                           int *count_others_req_procs_ptr,
                           ADIOI_Access **others_req_ptr)
{
    /* Work out which requests of the other processes fall into this
     * process's file domain and fetch their offset/length lists.
     *
     * Outputs:
     *   *others_req_ptr              array of nprocs entries allocated here;
     *                                entry i describes the contiguous
     *                                requests of process i that lie in this
     *                                process's file domain
     *   *count_others_req_procs_ptr  number of processes (this one included)
     *                                with at least one such request
     */
    int *incoming_counts;       /* incoming_counts[i]: no. of requests from i */
    int n_requesters = 0;
    int peer, slot, n_statuses;
    MPI_Request *outgoing, *incoming;
    MPI_Status *wait_statuses;
    ADIOI_Access *requests;

    /* step 1: every process tells every other how many pairs to expect */
    incoming_counts = (int *) ADIOI_Malloc(nprocs*sizeof(int));

    MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
                 incoming_counts, 1, MPI_INT, fd->comm);

    requests = (ADIOI_Access *) ADIOI_Malloc(nprocs*sizeof(ADIOI_Access));
    *others_req_ptr = requests;

    /* step 2: allocate receive space for every process that has requests */
    for (peer = 0; peer < nprocs; peer++) {
        int npairs = incoming_counts[peer];

        requests[peer].count = npairs;
        if (!npairs) {
            continue;
        }

        requests[peer].offsets = (ADIO_Offset *)
            ADIOI_Malloc(npairs*sizeof(ADIO_Offset));
        requests[peer].lens = (int *)
            ADIOI_Malloc(npairs*sizeof(int));
        requests[peer].mem_ptrs = (MPI_Aint *)
            ADIOI_Malloc(npairs*sizeof(MPI_Aint));
        n_requesters++;
    }

    /* step 3: exchange the offset and length lists; two messages per
       partner, hence the factor 2.  +1 to avoid a 0-size malloc */
    outgoing = (MPI_Request *)
        ADIOI_Malloc(2*(count_my_req_procs+1)*sizeof(MPI_Request));
    incoming = (MPI_Request *)
        ADIOI_Malloc(2*(n_requesters+1)*sizeof(MPI_Request));

    slot = 0;
    for (peer = 0; peer < nprocs; peer++) {
        if (!requests[peer].count) {
            continue;
        }
        MPI_Irecv(requests[peer].offsets, requests[peer].count,
                  ADIO_OFFSET, peer, peer+myrank, fd->comm,
                  &incoming[slot]);
        slot++;
        MPI_Irecv(requests[peer].lens, requests[peer].count,
                  MPI_INT, peer, peer+myrank+1, fd->comm,
                  &incoming[slot]);
        slot++;
    }

    slot = 0;
    for (peer = 0; peer < nprocs; peer++) {
        if (!my_req[peer].count) {
            continue;
        }
        MPI_Isend(my_req[peer].offsets, my_req[peer].count,
                  ADIO_OFFSET, peer, peer+myrank, fd->comm,
                  &outgoing[slot]);
        slot++;
        MPI_Isend(my_req[peer].lens, my_req[peer].count,
                  MPI_INT, peer, peer+myrank+1, fd->comm,
                  &outgoing[slot]);
        slot++;
    }

    /* one status array, large enough for the bigger of the two waits;
       +1 to avoid a 0-size malloc */
    n_statuses = 1 + 2*ADIOI_MAX(count_my_req_procs,n_requesters);
    wait_statuses = (MPI_Status *) ADIOI_Malloc(n_statuses*sizeof(MPI_Status));

    MPI_Waitall(2*count_my_req_procs, outgoing, wait_statuses);
    MPI_Waitall(2*n_requesters, incoming, wait_statuses);

    ADIOI_Free(outgoing);
    ADIOI_Free(incoming);
    ADIOI_Free(wait_statuses);
    ADIOI_Free(incoming_counts);

    *count_others_req_procs_ptr = n_requesters;
}
Пример #5
0
/* ADIOI_Exch_file_views - Sends all the aggregators the file
 * views and file view states of the clients.  It fills in the
 * client_file_view_state_arr for the aggregators and the
 * my_mem_view_state for the client.  It also initializes the
 * agg_file_view_state for all clients, which is the view for each
 * aggregator of a client's filetype.
 *
 * Parameters:
 *   myrank          - not referenced by this function.
 *   nprocs          - bound on the peer ranks that are looped over; also
 *                     the length of the per-rank count arrays.
 *   file_ptr_type   - forwarded unchanged to ADIOI_init_view_state().
 *   fd              - open file; filetype, fp_ind, disp, hints, is_agg and
 *                     comm are read from it.
 *   count, datatype - the memory access is "count" instances of "datatype".
 *   off             - recorded as the byte_off of the file view state.
 *   my_mem_view_state_arr, agg_file_view_state_arr
 *                   - outputs, written at index fd->hints->ranklist[i] for
 *                     every aggregator i.
 *   client_file_view_state_arr
 *                   - output, written only when fd->is_agg is set, at the
 *                     index of each rank that announced a non-zero count.
 *
 * The exchange has two phases: first a fixed-size record (flattened
 * filetype count plus six offsets) is exchanged, either with MPI_Alltoall
 * or, when the cb_alltoall hint is disabled, with point-to-point messages
 * to/from the aggregators; then the indices and blocklens arrays of the
 * flattened filetype are sent to the aggregators.
 *
 * If MPI_Alltoall fails, a message is printed and the function returns
 * early without filling client_file_view_state_arr. */
void ADIOI_Exch_file_views(int myrank, int nprocs, int file_ptr_type,
                           ADIO_File fd, int count,
                           MPI_Datatype datatype, ADIO_Offset off,
                           view_state *my_mem_view_state_arr,
                           view_state *agg_file_view_state_arr,
                           view_state *client_file_view_state_arr)
{
    /* Convert my own fileview to an ADIOI_Flattened type and a
     * disp. MPI_Alltoall the count of ADIOI_Flatlist nodes.
     * MPI_Isend/Irecv the block_lens, indices of ADIOI_Flatlist node
     * to/from each of the aggregators with the rest of the file view
     * state. */

    int i = -1, j = -1;
    amount_and_extra_data_t *send_count_arr = NULL;
    amount_and_extra_data_t *recv_count_arr = NULL;
    int send_req_arr_sz = 0;
    int recv_req_arr_sz = 0;
    MPI_Request *send_req_arr = NULL, *recv_req_arr = NULL;
    MPI_Status *statuses = NULL;
    ADIO_Offset disp_off_sz_ext_typesz[6];
    MPI_Aint memtype_extent, filetype_extent, lb;
    int ret = -1;

    /* parameters for datatypes */
    ADIOI_Flatlist_node *flat_mem_p = NULL, *flat_file_p = NULL;
    MPI_Count memtype_sz = -1;
    /* Must start at 0: it is tested as a boolean further down, and a -1
     * initializer would make non-contiguous memtypes take the contiguous
     * path as well. */
    int memtype_is_contig = 0;
    ADIO_Offset filetype_sz = -1;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5014, 0, NULL);
#endif
    /* The memtype will be freed after the call.  The filetype will be
     * freed in the close and should have been flattened in the file
     * view. */
    MPI_Type_size_x(datatype, &memtype_sz);
    MPI_Type_get_extent(datatype, &lb, &memtype_extent);
    if (memtype_sz == memtype_extent) {
        /* size == extent is used as the contiguity test; the whole access
         * is then described by a single block of memtype_sz*count bytes */
        memtype_is_contig = 1;
        flat_mem_p = ADIOI_Add_contig_flattened(datatype);
        flat_mem_p->blocklens[0] = memtype_sz*count;
    }
    else {
        flat_mem_p = ADIOI_Flatten_and_find(datatype);
    }

    MPI_Type_get_extent(fd->filetype, &lb, &filetype_extent);
    MPI_Type_size_x(fd->filetype, &filetype_sz);
    if (filetype_extent == filetype_sz) {
        /* contiguous filetype: one block covering the whole access */
        flat_file_p = ADIOI_Add_contig_flattened(fd->filetype);
        flat_file_p->blocklens[0] = memtype_sz*count;
        filetype_extent = memtype_sz*count;
        filetype_sz = filetype_extent;
    }
    else {
        /* look the already-flattened filetype up in the global list */
        flat_file_p = ADIOI_Flatlist;
        while (flat_file_p->type != fd->filetype)
            flat_file_p = flat_file_p->next;
    }

    /* the six offsets that accompany the flattened-type count */
    disp_off_sz_ext_typesz[0] = fd->fp_ind;
    disp_off_sz_ext_typesz[1] = fd->disp;
    disp_off_sz_ext_typesz[2] = off;
    disp_off_sz_ext_typesz[3] = memtype_sz*count;
    disp_off_sz_ext_typesz[4] = (ADIO_Offset) filetype_extent;
    disp_off_sz_ext_typesz[5] = (ADIO_Offset) filetype_sz;

    if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
        /* alltoall path: both arrays are indexed by rank */
        recv_count_arr = ADIOI_Calloc(nprocs, sizeof(amount_and_extra_data_t));
        send_count_arr = ADIOI_Calloc(nprocs, sizeof(amount_and_extra_data_t));
    } else {
        /* point-to-point path: send_count_arr is indexed by aggregator
         * number (0 .. cb_nodes-1), not by rank */
        send_count_arr = ADIOI_Calloc(fd->hints->cb_nodes,
                                      sizeof(amount_and_extra_data_t));

        /* only aggregators receive data */
        if (fd->is_agg) {
            recv_count_arr = ADIOI_Calloc(nprocs,
                                          sizeof(amount_and_extra_data_t));
            recv_req_arr = ADIOI_Malloc (nprocs * sizeof(MPI_Request));
            for (i=0; i < nprocs; i++)
                MPI_Irecv (&recv_count_arr[i], sizeof(amount_and_extra_data_t),
                           MPI_BYTE, i, COUNT_EXCH, fd->comm, &recv_req_arr[i]);
        }

        /* only send data to aggregators */
        send_req_arr = ADIOI_Calloc (fd->hints->cb_nodes, sizeof(MPI_Request));
        for (i=0; i < fd->hints->cb_nodes; i++) {
            send_count_arr[i].count    = flat_file_p->count;
            send_count_arr[i].fp_ind   = disp_off_sz_ext_typesz[0];
            send_count_arr[i].disp     = disp_off_sz_ext_typesz[1];
            send_count_arr[i].byte_off = disp_off_sz_ext_typesz[2];
            send_count_arr[i].sz       = disp_off_sz_ext_typesz[3];
            send_count_arr[i].ext      = disp_off_sz_ext_typesz[4];
            send_count_arr[i].type_sz  = disp_off_sz_ext_typesz[5];
            MPI_Isend (&send_count_arr[i], sizeof(amount_and_extra_data_t),
                       MPI_BYTE, fd->hints->ranklist[i], COUNT_EXCH, fd->comm,
                       &send_req_arr[i]);
        }
    }


    /* Every client has to build mem and file view_states for each aggregator.
     * We initialize their values here.  and we also initialize
     * send_count_arr */

    if (memtype_is_contig) {
        /* if memory is contiguous, we now replace memtype_sz and
         * memtype_extent with the full access size */
        memtype_sz *= count;
        memtype_extent = memtype_sz;
    }

    for (i = 0; i < fd->hints->cb_nodes; i++)
    {
        int tmp_agg_idx = fd->hints->ranklist[i];
        memset(&(my_mem_view_state_arr[tmp_agg_idx]), 0, sizeof(view_state));
        my_mem_view_state_arr[tmp_agg_idx].sz          =
            disp_off_sz_ext_typesz[3];
        my_mem_view_state_arr[tmp_agg_idx].ext         =
            (ADIO_Offset) memtype_extent;
        my_mem_view_state_arr[tmp_agg_idx].type_sz     =
            (ADIO_Offset) memtype_sz;
        my_mem_view_state_arr[tmp_agg_idx].flat_type_p = flat_mem_p;
        ADIOI_init_view_state(file_ptr_type,
                              1,
                              &(my_mem_view_state_arr[tmp_agg_idx]),
                              TEMP_OFF);
        ADIOI_init_view_state(file_ptr_type,
                              1,
                              &(my_mem_view_state_arr[tmp_agg_idx]),
                              REAL_OFF);

        memset(&(agg_file_view_state_arr[tmp_agg_idx]), 0, sizeof(view_state));
        agg_file_view_state_arr[tmp_agg_idx].fp_ind    =
            disp_off_sz_ext_typesz[0];
        agg_file_view_state_arr[tmp_agg_idx].disp      =
            disp_off_sz_ext_typesz[1];
        agg_file_view_state_arr[tmp_agg_idx].byte_off  =
            disp_off_sz_ext_typesz[2];
        agg_file_view_state_arr[tmp_agg_idx].sz        =
            disp_off_sz_ext_typesz[3];
        agg_file_view_state_arr[tmp_agg_idx].ext       =
            disp_off_sz_ext_typesz[4];
        agg_file_view_state_arr[tmp_agg_idx].type_sz   =
            disp_off_sz_ext_typesz[5];
        agg_file_view_state_arr[tmp_agg_idx].flat_type_p = flat_file_p;

        ADIOI_init_view_state(file_ptr_type,
                              1,
                              &(agg_file_view_state_arr[tmp_agg_idx]),
                              TEMP_OFF);
        ADIOI_init_view_state(file_ptr_type,
                              1,
                              &(agg_file_view_state_arr[tmp_agg_idx]),
                              REAL_OFF);

        if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
            /* alltoall path: only the slots of aggregator ranks get a
             * non-zero count; every other slot stays zeroed by calloc */
            send_count_arr[tmp_agg_idx].count    = flat_file_p->count;
            send_count_arr[tmp_agg_idx].fp_ind   = disp_off_sz_ext_typesz[0];
            send_count_arr[tmp_agg_idx].disp     = disp_off_sz_ext_typesz[1];
            send_count_arr[tmp_agg_idx].byte_off = disp_off_sz_ext_typesz[2];
            send_count_arr[tmp_agg_idx].sz       = disp_off_sz_ext_typesz[3];
            send_count_arr[tmp_agg_idx].ext      = disp_off_sz_ext_typesz[4];
            send_count_arr[tmp_agg_idx].type_sz  = disp_off_sz_ext_typesz[5];
        }
    }

#ifdef DEBUG2
    fprintf(stderr, "my own flattened memtype: ");
    ADIOI_Print_flatlist_node(flat_mem_p);
    fprintf(stderr, "my own flattened filetype: ");
    ADIOI_Print_flatlist_node(flat_file_p);
#endif

    if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
        ret = MPI_Alltoall(send_count_arr, sizeof(amount_and_extra_data_t),
                           MPI_BYTE,
                           recv_count_arr, sizeof(amount_and_extra_data_t),
                           MPI_BYTE, fd->comm);
        if (ret != MPI_SUCCESS)
        {
            fprintf(stderr, "ADIOI_Exchange_file_views: MPI_Alltoall failed "
                    "with error %d", ret);
            /* release the count arrays so the early return does not leak */
            ADIOI_Free(send_count_arr);
            ADIOI_Free(recv_count_arr);
            return;
        }
    } else {
        /* complete the point-to-point count exchange posted above */
        statuses = (MPI_Status *) ADIOI_Malloc(1 + nprocs * sizeof(MPI_Status));
        if (fd->is_agg) {
            MPI_Waitall(nprocs, recv_req_arr, statuses);
            ADIOI_Free(recv_req_arr);
            recv_req_arr = NULL;   /* may or may not be re-allocated below */
        }
        MPI_Waitall(fd->hints->cb_nodes, send_req_arr, statuses);
        ADIOI_Free(statuses);
        statuses = NULL;
        ADIOI_Free(send_req_arr);
        send_req_arr = NULL;
    }
#ifdef DEBUG2
    if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
        fprintf(stderr, "send_count_arr:");
        for (i = 0; i < nprocs; i++)
        {
            fprintf(stderr, "[%d]=%d ", i, send_count_arr[i].count);
        }
        fprintf(stderr, "\n");
        fprintf(stderr, "recv_count_arr:");
        for (i = 0; i < nprocs; i++)
        {
            fprintf(stderr, "[%d]=%d ", i, recv_count_arr[i].count);
        }
        fprintf(stderr, "\n");
    } else {
        fprintf(stderr, "send_count_arr:");
        for (i = 0; i < fd->hints->cb_nodes; i++)
        {
            fprintf(stderr, "[%d]=%d ", i, send_count_arr[i].count);
        }
        fprintf(stderr, "\n");
        if (fd->is_agg) {
            fprintf(stderr, "recv_count_arr:");
            for (i = 0; i < nprocs; i++)
            {
                fprintf(stderr, "[%d]=%d ", i, recv_count_arr[i].count);
            }
            fprintf(stderr, "\n");
        }
    }
#endif

    if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
        /* send_count_arr only has cb_nodes entries on this path */
        for (i=0; i < fd->hints->cb_nodes; i++)
            if (send_count_arr[i].count > 0)
                send_req_arr_sz++;
    }
    /* Figure out how many counts to send/recv */
    for (i = 0; i < nprocs; i++)
    {
        if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
            if (send_count_arr[i].count > 0)
                send_req_arr_sz++;
        }
        /* Only aggregators should recv*/
        if (fd->is_agg) {
            if (recv_count_arr[i].count > 0)
            {
                if ((client_file_view_state_arr[i].flat_type_p =
                     (ADIOI_Flatlist_node *) ADIOI_Malloc(
                         sizeof(ADIOI_Flatlist_node))) == NULL)
                {
                    fprintf(stderr, "ADIOI_Exchange_file_views: malloc "
                            "flat_type_p failed\n");
                    /* Continuing would dereference the NULL pointer just
                     * below, and returning would leave the peers blocked
                     * in the exchange, so stop the job. */
                    MPI_Abort(MPI_COMM_WORLD, 1);
                }
                client_file_view_state_arr[i].flat_type_p->count =
                    recv_count_arr[i].count;
                client_file_view_state_arr[i].flat_type_p->indices =
                    (ADIO_Offset *) ADIOI_Calloc(recv_count_arr[i].count,
                                                 sizeof(ADIO_Offset));
                client_file_view_state_arr[i].flat_type_p->blocklens =
                    (ADIO_Offset *) ADIOI_Calloc(recv_count_arr[i].count,
                                                 sizeof(ADIO_Offset));

                /* Copy the extra data out of the stuff we Alltoall'd.
                 * This copies six consecutive ADIO_Offset values starting
                 * at fp_ind, so it relies on both structs laying those
                 * fields out identically -- TODO confirm against the
                 * struct definitions. */
                memcpy (&client_file_view_state_arr[i].fp_ind,
                        &recv_count_arr[i].fp_ind,
                        6*sizeof(ADIO_Offset));

                recv_req_arr_sz++;
            }
        }
    }

    /* Since ADIOI_Calloc may do other things we add the +1
     * to avoid a 0-size malloc */
    send_req_arr = (MPI_Request *) ADIOI_Calloc(2*(send_req_arr_sz)+1,
                                                sizeof(MPI_Request));

    /* two requests per peer: one for indices, one for blocklens */
    j = 0;
    if (recv_req_arr_sz > 0) {
        assert (fd->is_agg);
        recv_req_arr = (MPI_Request *) ADIOI_Calloc(2*(recv_req_arr_sz),
                                                    sizeof(MPI_Request));
        for (i = 0; i < nprocs; i++) {
            if (recv_count_arr[i].count > 0) {
                MPI_Irecv(client_file_view_state_arr[i].flat_type_p->indices,
                          recv_count_arr[i].count, ADIO_OFFSET, i,
                          INDICES, fd->comm, &recv_req_arr[j]);
                j++;
                MPI_Irecv(client_file_view_state_arr[i].flat_type_p->blocklens,
                          recv_count_arr[i].count, ADIO_OFFSET, i,
                          BLOCK_LENS, fd->comm, &recv_req_arr[j]);
                j++;
            }
        }
    }

    if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
        j = 0;
        for (i = 0; i < nprocs; i++) {
            if (send_count_arr[i].count > 0) {
                MPI_Isend(flat_file_p->indices,
                          send_count_arr[i].count, ADIO_OFFSET, i,
                          INDICES, fd->comm, &send_req_arr[j]);
                j++;
                MPI_Isend(flat_file_p->blocklens,
                          send_count_arr[i].count, ADIO_OFFSET, i,
                          BLOCK_LENS, fd->comm, &send_req_arr[j]);
                j++;
            }
        }
    } else {
        j = 0;
        for (i = 0; i < fd->hints->cb_nodes; i++) {
            if (send_count_arr[i].count > 0) {
                MPI_Isend(flat_file_p->indices,
                          send_count_arr[i].count, ADIO_OFFSET,
                          fd->hints->ranklist[i], INDICES, fd->comm,
                          &send_req_arr[j]);
                j++;
                MPI_Isend(flat_file_p->blocklens,
                          send_count_arr[i].count, ADIO_OFFSET,
                          fd->hints->ranklist[i], BLOCK_LENS, fd->comm,
                          &send_req_arr[j]);
                j++;
            }
        }
    }

    /* Since ADIOI_Malloc may do other things we add the +1
     * to avoid a 0-size malloc */
    statuses = (MPI_Status *)
        ADIOI_Malloc(1 + 2 * ADIOI_MAX(send_req_arr_sz,recv_req_arr_sz)
                     * sizeof(MPI_Status));

    if (send_req_arr_sz > 0) {
        MPI_Waitall(2 * send_req_arr_sz, send_req_arr, statuses);
    }
    if (recv_req_arr_sz > 0) {
        MPI_Waitall(2 * recv_req_arr_sz, recv_req_arr, statuses);
    }
    ADIOI_Free(statuses);

    /* Release the bookkeeping arrays whether or not any request was
     * posted: the count arrays and send_req_arr are allocated even when
     * the matching *_req_arr_sz is 0.  How ADIOI_Free treats a NULL
     * argument is not visible here, so each pointer is checked first. */
    if (send_req_arr != NULL)
        ADIOI_Free(send_req_arr);
    if (send_count_arr != NULL)
        ADIOI_Free(send_count_arr);
    if (recv_req_arr != NULL)
        ADIOI_Free(recv_req_arr);
    if (recv_count_arr != NULL)
        ADIOI_Free(recv_count_arr);

    if (fd->is_agg == 1)
    {
        ADIOI_init_view_state(file_ptr_type,
                              nprocs,
                              client_file_view_state_arr,
                              TEMP_OFF);
        ADIOI_init_view_state(file_ptr_type,
                              nprocs,
                              client_file_view_state_arr,
                              REAL_OFF);
    }

#ifdef DEBUG
    if (fd->is_agg == 1)
    {
        ADIOI_Flatlist_node *fr_node_p = ADIOI_Flatlist;
        for (i = 0; i < nprocs; i++)
        {
            fprintf(stderr, "client_file_view_state_arr[%d]=(fp_ind=%Ld,"
                    "disp=%Ld,byte_off=%Ld,sz=%Ld,ext=%Ld\n", i,
                    client_file_view_state_arr[i].fp_ind,
                    client_file_view_state_arr[i].disp,
                    client_file_view_state_arr[i].byte_off,
                    client_file_view_state_arr[i].sz,
                    client_file_view_state_arr[i].ext);
        }

        /* stop at the end of the list so the assert below can fire
         * instead of the loop dereferencing a NULL node */
        while (fr_node_p != NULL &&
               fr_node_p->type !=
               fd->file_realm_types[fd->my_cb_nodes_index])
            fr_node_p = fr_node_p->next;
        assert(fr_node_p != NULL);

        fprintf(stderr, "my file realm (idx=%d,st_off=%Ld) ",
                fd->my_cb_nodes_index,
                fd->file_realm_st_offs[fd->my_cb_nodes_index]);
        ADIOI_Print_flatlist_node(fr_node_p);
    }
#endif

#ifdef DEBUG2
    if (fd->is_agg == 1)
    {
        for (i = 0; i < nprocs; i++)
        {
            fprintf(stderr, "client_file_view_state_arr[%d]: ", i);
            ADIOI_Print_flatlist_node(
                client_file_view_state_arr[i].flat_type_p);
        }
    }
#endif
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5015, 0, NULL);
#endif
}
Пример #6
0
/*
 * Split the byte range touched by a collective operation into one file
 * domain (FD) per I/O aggregator, with the internal domain boundaries
 * placed on multiples of the GPFS block size so that aggregators do not
 * share a file block.
 *
 * The access range [min start offset, max end offset] over all "nprocs"
 * processes is widened to block boundaries, the resulting whole blocks are
 * dealt out to the "nprocs_for_coll" aggregators (the later aggregators
 * receive one extra block when the division is not exact), and finally the
 * first and last domains are trimmed by the amounts that were added when
 * widening.
 *
 * Outputs:
 *   *min_st_offset_ptr - smallest start offset over all processes
 *   *fd_start_ptr      - newly allocated array of domain start offsets
 *   *fd_end_ptr        - newly allocated array of domain end offsets
 *   *fd_size_ptr       - size of the first aggregator's domain
 *
 * fs_ptr is read as an ADIOI_BGL_fs; when it is NULL or reports a zero
 * block size, a block size of 1048576 bytes is used.
 *
 * The common version of this routine also takes a min_fd_size and a
 * striping_unit.  Neither is implemented here.  Raising the domain size to
 * a minimum would not be sufficient on BlueGene anyway: the hint code has
 * already chosen a number of aggregators per pset, so growing domains
 * would leave some psets with work and others with none.
 */
void ADIOI_BGL_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
                                      ADIO_Offset *end_offsets,
                                      int          nprocs,
                                      int          nprocs_for_coll,
                                      ADIO_Offset *min_st_offset_ptr,
                                      ADIO_Offset **fd_start_ptr,
                                      ADIO_Offset **fd_end_ptr,
                                      ADIO_Offset *fd_size_ptr,
                                      void        *fs_ptr)
{
    ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
    int p, aggr;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5004, 0, NULL);
#endif

#   if AGG_DEBUG
    static char myname[] = "ADIOI_BGL_GPFS_Calc_file_domains";
    DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n", 
	    myname,__LINE__,nprocs_for_coll);
#   endif

    /* block size: take it from the file system descriptor when one is
     * supplied and non-zero, otherwise fall back to 1M */
    __blksize_t blksize = 1048576;
    if (fs_ptr != NULL) {
        ADIOI_BGL_fs *bgl_fs = (ADIOI_BGL_fs *) fs_ptr;
        if (bgl_fs->blksize)
            blksize = bgl_fs->blksize;
    }
#   if AGG_DEBUG
    DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
#   endif

    /* overall access range across all processes */
    min_st_offset  = st_offsets[0];
    max_end_offset = end_offsets[0];
    for (p = 1; p < nprocs; p++) {
        min_st_offset  = ADIOI_MIN(min_st_offset, st_offsets[p]);
        max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[p]);
    }

    /* widen the range to block boundaries; the *_rdoff values are the
     * distances between the real range ends and the block-rounded ones */
    ADIO_Offset gpfs_lb       = min_st_offset / blksize * blksize;
    ADIO_Offset gpfs_ub       = (max_end_offset + blksize - 1) / blksize * blksize - 1;
    ADIO_Offset gpfs_lb_rdoff = min_st_offset - gpfs_lb;
    ADIO_Offset gpfs_ub_rdoff = gpfs_ub - max_end_offset;
    ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;

    int naggs = nprocs_for_coll;

    fd_size       = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_start_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_end_ptr   = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    fd_start      = *fd_start_ptr;
    fd_end        = *fd_end_ptr;

    /* deal out whole blocks: the first naggs_small aggregators get
     * nb_cn_small blocks each, the remaining naggs_large get one more */
    ADIO_Offset n_gpfs_blk  = fd_gpfs_range / blksize;
    ADIO_Offset nb_cn_small = n_gpfs_blk / naggs;
    ADIO_Offset naggs_large = n_gpfs_blk % naggs;
    ADIO_Offset naggs_small = naggs - naggs_large;

    for (aggr = 0; aggr < naggs; aggr++) {
        ADIO_Offset nblocks = (aggr < naggs_small) ? nb_cn_small
                                                   : nb_cn_small + 1;
        fd_size[aggr] = nblocks * blksize;
    }
    /* potential optimization: if n_gpfs_blk is smaller than naggs, slip in
     * some zero-sized file domains to spread the work across all psets. */

#   if AGG_DEBUG
     DBG_FPRINTF(stderr,"%s(%d): "
                   "gpfs_ub       %llu, "
                   "gpfs_lb       %llu, "
                   "gpfs_ub_rdoff %llu, "
                   "gpfs_lb_rdoff %llu, "
                   "fd_gpfs_range %llu, "
                   "n_gpfs_blk    %llu, "
                   "nb_cn_small   %llu, "
                   "naggs_large   %llu, "
                   "naggs_small   %llu, "
                   "\n",
                   myname,__LINE__,
                   gpfs_ub      ,
                   gpfs_lb      ,
                   gpfs_ub_rdoff,
                   gpfs_lb_rdoff,
                   fd_gpfs_range,
                   n_gpfs_blk   ,
                   nb_cn_small  ,
                   naggs_large  ,
                   naggs_small
                   );
#   endif

    /* trim the two outer domains back to the real access range */
    fd_size[0]         -= gpfs_lb_rdoff;
    fd_size[naggs - 1] -= gpfs_ub_rdoff;

    /* lay the domains out back to back, starting at the first accessed byte */
    ADIO_Offset offset = min_st_offset;
    for (aggr = 0; aggr < naggs; aggr++) {
        fd_start[aggr] = offset;
        offset        += fd_size[aggr];
        fd_end[aggr]   = offset - 1;
    }

    *fd_size_ptr       = fd_size[0];
    *min_st_offset_ptr = min_st_offset;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5005, 0, NULL);
#endif
    ADIOI_Free (fd_size);
}
Пример #7
0
void ADIOI_Calc_file_domains(ADIO_Offset *st_offsets, ADIO_Offset
                             *end_offsets, int nprocs, int nprocs_for_coll,
                             ADIO_Offset *min_st_offset_ptr,
                             ADIO_Offset **fd_start_ptr, ADIO_Offset
                             **fd_end_ptr, ADIO_Offset *fd_size_ptr)
{
    /* Divide the I/O workload among "nprocs_for_coll" processes by
       (logically) cutting the accessed part of the file into equally
       sized file domains (FDs); each process may directly access only
       its own file domain.

       *fd_size_ptr is both input and output: on entry it carries the
       alignment (0 means no alignment), on return the domain size that
       was chosen.  *fd_start_ptr and *fd_end_ptr receive newly allocated
       arrays of nprocs_for_coll entries; *min_st_offset_ptr receives the
       (possibly aligned-down) start of the overall access range. */

    const ADIO_Offset alignment = *fd_size_ptr;
    ADIO_Offset range_lo, range_hi, domain_sz, next_start;
    ADIO_Offset *starts, *ends;
    int k;

#ifdef AGG_DEBUG
    FPRINTF(stderr, "ADIOI_Calc_file_domains: %d aggregator(s)\n",
            nprocs_for_coll);
#endif

    /* overall access range: smallest start and largest end offset */
    range_lo = st_offsets[0];
    range_hi = end_offsets[0];
    for (k = 1; k < nprocs; k++) {
        range_lo = ADIOI_MIN(range_lo, st_offsets[k]);
        range_hi = ADIOI_MAX(range_hi, end_offsets[k]);
    }

    /* with an alignment, the range start is moved down to a multiple
       of it before the range is measured */
    if (alignment)
        range_lo = ALIGNDOWN(range_lo, alignment);

    /* ceiling division as in HPF block distribution */
    domain_sz = ((range_hi - range_lo + 1) + nprocs_for_coll - 1)
                / nprocs_for_coll;

    /* ... and the domain size is rounded up to a multiple of it */
    if (alignment)
        domain_sz = (domain_sz + alignment - 1) / alignment * alignment;

    starts = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll*sizeof(ADIO_Offset));
    *fd_start_ptr = starts;
    ends = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll*sizeof(ADIO_Offset));
    *fd_end_ptr = ends;

    /* Lay the domains out back to back.  When the range is not evenly
       divisible (e.g. a range of 97 over 16 processes) the trailing
       domains are short or empty: a domain starting past the end of the
       range is marked with -1/-1, and a domain reaching past it is cut
       off at the end of the range.  next_start is advanced from the
       uncut end so the cut does not shift the following domains. */
    next_start = range_lo;
    for (k = 0; k < nprocs_for_coll; k++) {
        starts[k] = next_start;
        ends[k] = next_start + domain_sz - 1;
        next_start = ends[k] + 1;

        if (starts[k] > range_hi)
            starts[k] = ends[k] = -1;
        if (ends[k] > range_hi)
            ends[k] = range_hi;
    }

    *fd_size_ptr = domain_sz;
    *min_st_offset_ptr = range_lo;
}
Пример #8
0
static void ADIOI_Iread_and_exch_l1_begin(ADIOI_NBC_Request *nbc_req,
                                          int *error_code)
{
    /* Start of one iteration of the read-and-exchange loop: work out
       which of the other processes' requests fall inside this
       iteration's read window, build the state block for
       ADIOI_R_Iexchange_data(), and either start the contiguous read
       (when some request is satisfied) or go straight to the exchange. */

    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
    ADIOI_R_Iexchange_data_vars *red_vars = NULL;
    ADIO_File fd;
    ADIOI_Access *others_req;
    char *read_buf;
    int *curr_offlen_ptr, *count, *send_size;
    int *partial_send, *start_pos;
    ADIO_Offset size, real_size, real_off, req_off, for_next_iter;
    int nprocs, req_len, any_satisfied;
    int p, k;

    /* loop exit condition */
    if (vars->m >= vars->ntimes) {
        ADIOI_Iread_and_exch_reset(nbc_req, error_code);
        return;
    }

    fd = vars->fd;
    nprocs = vars->nprocs;
    others_req = vars->others_req;
    read_buf = vars->read_buf;

    curr_offlen_ptr = vars->curr_offlen_ptr;
    start_pos = vars->start_pos;
    count = vars->count;
    send_size = vars->send_size;
    partial_send = vars->partial_send;

    /* Read a buffer of coll_bufsize bytes (or less), then go through
       all of others_req and see which requests that read satisfies.

       MPI guarantees that displacements in filetypes are monotonically
       nondecreasing, so a per-process cursor (curr_offlen_ptr) into the
       off-len pairs is enough: scanning resumes from there each time.
       That still leaves filetypes such as the following (1, 2, 3 are
       three chunks of data specified by a filetype, not process
       numbers):

       1  -------!--
       2    -----!----
       3       --!-----

       where ! marks where the current read size cuts through the
       filetype.  The read goes up to !, but the communication buffer is
       filled only for 1; the part left over for 2 is kept for the next
       iteration, in which 2 and 3 are satisfied.  Only one off-len pair
       with incomplete data is ever sent, which keeps filling the user's
       buffer at the other end simple, and since the data goes out in a
       fixed order the individual offsets and lengths need not be sent
       with it. */

    /* vars->off = start offset in the file of the data actually read
                   in this iteration
       size      = size of the data read at that offset
       real_off  = that offset minus whatever was retained in memory
                   from the previous iteration (cases 2, 3 above)
       real_size = size plus the extra corresponding to real_off
       req_off   = file offset of one contiguous request, minus what
                   was satisfied in the previous iteration
       req_len   = length corresponding to req_off */

    size = ADIOI_MIN((unsigned)vars->coll_bufsize,
                     vars->end_loc - vars->st_loc + 1 - vars->done);
    real_size = size + vars->for_curr_iter;
    real_off = vars->off - vars->for_curr_iter;

    vars->size = size;
    vars->real_size = real_size;

    for (p = 0; p < nprocs; p++) {
        count[p] = 0;
        send_size[p] = 0;
    }
    for_next_iter = 0;

    for (p = 0; p < nprocs; p++) {
#ifdef RDCOLL_DEBUG
        DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n",
                    vars->myrank, p, others_req[p].count);
#endif
        if (!others_req[p].count)
            continue;   /* this process asked nothing of us */

        start_pos[p] = curr_offlen_ptr[p];
        for (k = curr_offlen_ptr[p]; k < others_req[p].count; k++) {
            ADIO_Offset avail, wanted;

            if (partial_send[p]) {
                /* this request may have been partially satisfied in
                   the previous iteration: skip the part already sent
                   and record that in the off-len pair itself */
                req_off = others_req[p].offsets[k] + partial_send[p];
                req_len = others_req[p].lens[k] - partial_send[p];
                partial_send[p] = 0;
                others_req[p].offsets[k] = req_off;
                others_req[p].lens[k] = req_len;
            }
            else {
                req_off = others_req[p].offsets[k];
                req_len = others_req[p].lens[k];
            }

            /* first request starting at or past the window end: done
               with this process for this iteration */
            if (req_off >= real_off + real_size)
                break;

            count[p]++;
            ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf) + req_off - real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf + req_off - real_off));
            MPI_Address(read_buf + req_off - real_off,
                        &(others_req[p].mem_ptrs[k]));
            ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));

            /* bytes of this request that lie inside the window versus
               bytes the request asks for */
            avail = real_off + real_size - req_off;
            wanted = (ADIO_Offset)(unsigned)req_len;
            send_size[p] += (int)(ADIOI_MIN(avail, wanted));

            if (avail < wanted) {
                /* the window ends inside this request */
                partial_send[p] = (int)avail;
                if ((k+1 < others_req[p].count) &&
                    (others_req[p].offsets[k+1] < real_off + real_size)) {
                    /* the case illustrated in the figure above; max
                       because it must cover requests from different
                       processes */
                    for_next_iter = ADIOI_MAX(for_next_iter,
                            real_off + real_size - others_req[p].offsets[k+1]);
                }
                break;
            }
        }
        curr_offlen_ptr[p] = k;
    }
    vars->for_next_iter = for_next_iter;

    /* is any request satisfied by this iteration's window? */
    any_satisfied = 0;
    for (p = 0; p < nprocs; p++) {
        if (count[p]) {
            any_satisfied = 1;
            break;
        }
    }

    /* create a struct for ADIOI_R_Iexchange_data() */
    red_vars = (ADIOI_R_Iexchange_data_vars *)ADIOI_Calloc(
            1, sizeof(ADIOI_R_Iexchange_data_vars));
    nbc_req->data.rd.red_vars = red_vars;

    red_vars->fd = vars->fd;
    red_vars->nprocs = vars->nprocs;
    red_vars->myrank = vars->myrank;
    red_vars->iter = vars->m;

    red_vars->buf = vars->buf;
    red_vars->flat_buf = vars->flat_buf;
    red_vars->buftype_is_contig = vars->buftype_is_contig;
    red_vars->buftype_extent = vars->buftype_extent;
    red_vars->buf_idx = vars->buf_idx;

    red_vars->offset_list = vars->offset_list;
    red_vars->len_list = vars->len_list;
    red_vars->contig_access_count = vars->contig_access_count;

    red_vars->send_size = vars->send_size;
    red_vars->recv_size = vars->recv_size;
    red_vars->count = vars->count;
    red_vars->start_pos = vars->start_pos;
    red_vars->partial_send = vars->partial_send;
    red_vars->recd_from_proc = vars->recd_from_proc;

    red_vars->min_st_offset = vars->min_st_offset;
    red_vars->fd_size = vars->fd_size;
    red_vars->fd_start = vars->fd_start;
    red_vars->fd_end = vars->fd_end;
    red_vars->others_req = vars->others_req;

    red_vars->next_fn = ADIOI_Iread_and_exch_l1_end;

    if (!any_satisfied) {
        /* nothing to read in this iteration: go straight to the exchange */
        ADIOI_R_Iexchange_data(nbc_req, error_code);
        return;
    }

    /* start the read; the data lands after the bytes carried over from
       the previous iteration */
    ADIOI_Assert(size == (int)size);
    ADIO_IreadContig(fd, read_buf+vars->for_curr_iter, (int)size,
                     MPI_BYTE, ADIO_EXPLICIT_OFFSET, vars->off,
                     &vars->req2, error_code);

    nbc_req->data.rd.state = ADIOI_IRC_STATE_IREAD_AND_EXCH_L1_BEGIN;
}
Пример #9
0
static void ADIOI_Iread_and_exch(ADIOI_NBC_Request *nbc_req, int *error_code)
{
    /* Set-up step of the non-blocking read-and-exchange.

       Data is read in pieces of no more than coll_bufsize (an info
       parameter), sent to the appropriate processes and placed in the
       user buffer.  The point is to bound the extra memory collective
       I/O needs: reading everything at once would be simpler but would
       require temporary space larger than the user buffer, which is
       often unacceptable (e.g. reading a distributed array whose local
       part is 8 Mbytes would need at least another 8 Mbytes).

       This routine works out the range of file offsets requested of this
       process, derives the number of read iterations from it, starts a
       non-blocking max-reduction of that number over fd->comm, and
       allocates the per-process bookkeeping arrays used by the
       iterations. */

    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
    ADIO_File fd = vars->fd;
    MPI_Datatype datatype = vars->datatype;
    int nprocs = vars->nprocs;
    ADIOI_Access *others_req = vars->others_req;

    ADIO_Offset st_loc = -1, end_loc = -1;
    int coll_bufsize;
    int p, k;

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */

    /* coll_bufsize is obtained from the hints object */
    coll_bufsize = fd->hints->cb_buffer_size;
    vars->coll_bufsize = coll_bufsize;

    /* seed st_loc and end_loc from the first process that has any
       request; both stay -1 when nobody requested anything */
    p = 0;
    while (p < nprocs && !others_req[p].count)
        p++;
    if (p < nprocs)
        st_loc = end_loc = others_req[p].offsets[0];

    /* now find the real values over every off-len pair */
    for (p = 0; p < nprocs; p++) {
        for (k = 0; k < others_req[p].count; k++) {
            ADIO_Offset first = others_req[p].offsets[k];
            ADIO_Offset last = others_req[p].offsets[k]
                               + others_req[p].lens[k] - 1;

            st_loc = ADIOI_MIN(st_loc, first);
            end_loc = ADIOI_MAX(end_loc, last);
        }
    }

    vars->st_loc = st_loc;
    vars->end_loc = end_loc;

    /* ntimes is the number of I/O operations this process must perform
     * to complete all the requests it has received; more than one is
     * needed because only coll_bufsize bytes are used for internal
     * buffering.  The maximum over all processes gives the number of
     * communication phases as well.
     */
    if ((st_loc != -1) || (end_loc != -1)) {
        /* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/
        vars->ntimes = (int)((end_loc - st_loc + coll_bufsize) / coll_bufsize);
    }
    else {
        /* this process does no I/O. */
        vars->ntimes = 0;
    }

    *error_code = MPI_Iallreduce(&vars->ntimes, &vars->max_ntimes, 1, MPI_INT,
                                 MPI_MAX, fd->comm, &vars->req1);

    vars->read_buf = fd->io_buf;  /* Allocated at open time */

    /* per-process cursor into the off-len pairs; calloc starts every
       cursor at 0 */
    vars->curr_offlen_ptr = (int *)ADIOI_Calloc(nprocs, sizeof(int));

    /* how many off-len pairs per process are satisfied in an iteration */
    vars->count = (int *)ADIOI_Malloc(nprocs * sizeof(int));

    /* when only part of the last off-len pair is sent to a process in
       an iteration, the length sent is kept here; calloc starts at 0 */
    vars->partial_send = (int *)ADIOI_Calloc(nprocs, sizeof(int));

    /* total size of data to be sent to each process in an iteration */
    vars->send_size = (int *)ADIOI_Malloc(nprocs * sizeof(int));

    /* total size of data to be received from each process in an
       iteration; of size nprocs so that MPI_Alltoall can be used later */
    vars->recv_size = (int *)ADIOI_Malloc(nprocs * sizeof(int));

    /* amount of data received so far from each process, used in
       ADIOI_Fill_user_buffer; starts at 0 */
    vars->recd_from_proc = (int *)ADIOI_Calloc(nprocs, sizeof(int));

    /* starting value of curr_offlen_ptr[i] in the current iteration */
    vars->start_pos = (int *)ADIOI_Malloc(nprocs*sizeof(int));

    /* a non-contiguous buffer type is flattened and its node is looked
       up in the global flattened-type list */
    ADIOI_Datatype_iscontig(datatype, &vars->buftype_is_contig);
    if (!vars->buftype_is_contig) {
        ADIOI_Flatlist_node *node;

        ADIOI_Flatten_datatype(datatype);
        for (node = ADIOI_Flatlist; node->type != datatype; node = node->next)
            ;
        vars->flat_buf = node;
    }
    MPI_Type_extent(datatype, &vars->buftype_extent);

    vars->done = 0;
    vars->off = st_loc;
    vars->for_curr_iter = vars->for_next_iter = 0;

    /* set the state to wait until the MPI_Iallreduce started above
       finishes. */
    nbc_req->data.rd.state = ADIOI_IRC_STATE_IREAD_AND_EXCH;
}
Пример #10
0
/* If successful, error_code is set to MPI_SUCCESS.  Otherwise an error
 * code is created and returned in error_code.
 *
 * Runs the exchange-and-write phases of a collective write: the file range
 * touched by the requests is walked in windows of
 * avail_cb_nodes * stripe_size bytes, and in every window this process
 * (a) exchanges data with the other processes through
 * ADIOI_LUSTRE_W_Exchange_data and (b) writes what it received with
 * ADIO_WriteContig.
 *
 * Parameters, as they are used in this function:
 *   fd                  - file handle; fd->comm is used for the reductions
 *                         and fd->hints->fs_hints.lustre.ds_in_coll selects
 *                         the write strategy.
 *   buf                 - user buffer; only forwarded to
 *                         ADIOI_LUSTRE_W_Exchange_data.
 *   datatype            - type of buf; tested for contiguity, flattened when
 *                         non-contiguous, and its extent is queried.
 *   nprocs              - number of entries walked in others_req, my_req and
 *                         buf_idx (presumably the size of fd->comm - TODO
 *                         confirm against the caller).
 *   myrank              - only forwarded to ADIOI_LUSTRE_W_Exchange_data.
 *   others_req          - per-process (offset, len) lists describing what
 *                         this process receives; mem_ptrs[j] is filled in
 *                         here with an address inside write_buf.
 *   my_req              - per-process (offset, len) lists describing what
 *                         this process sends.
 *   offset_list,
 *   len_list,
 *   contig_access_count - only forwarded to ADIOI_LUSTRE_W_Exchange_data.
 *   striping_info       - [0] is read as the stripe size and [2] as the
 *                         number of available aggregators; the array is also
 *                         forwarded to ADIOI_LUSTRE_W_Exchange_data.
 *   buf_idx             - buf_idx[i][k] is copied into this_buf_idx[i] for
 *                         the first not-yet-sent entry k of my_req[i].
 *   error_code          - set to MPI_SUCCESS on entry; the loop stops at the
 *                         first call that leaves a different value in it.
 *
 * Every buffer allocated in this function is released at the "over" label,
 * on both the normal and the error path.
 */
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
					MPI_Datatype datatype, int nprocs,
					int myrank, ADIOI_Access *others_req,
                                        ADIOI_Access *my_req,
					ADIO_Offset *offset_list,
                                        ADIO_Offset *len_list, 
					int contig_access_count,
                                        int *striping_info, int **buf_idx,
                                        int *error_code)
{
    /* Send data to appropriate processes and write in sizes of no more
     * than lustre stripe_size.
     * The idea is to reduce the amount of extra memory required for
     * collective I/O. If all data were written all at once, which is much
     * easier, it would require temp space more than the size of user_buf,
     * which is often unacceptable. For example, to write a distributed
     * array to a file, where each local array is 8Mbytes, requiring
     * at least another 8Mbytes of temp space is unacceptable.
     */

    /* ntimes is used only as a flag: non-zero means this process has
     * something to write and therefore owns a write_buf. */
    int hole, i, j, m, flag, ntimes = 1 , max_ntimes, buftype_is_contig;
    /* -1 in st_loc/end_loc means "no request seen yet" */
    ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
    ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
    ADIO_Offset max_size, step_size = 0;
    int real_size, req_len, send_len;
    int *recv_curr_offlen_ptr, *recv_count, *recv_size;
    int *send_curr_offlen_ptr, *send_size;
    int *sent_to_proc, *recv_start_pos;
    int *send_buf_idx, *curr_to_proc, *done_to_proc;
    int *this_buf_idx;
    char *write_buf = NULL;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf = NULL;
    MPI_Aint buftype_extent;
    int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2];
    int data_sieving = 0;
    /* srt_off / srt_len / srt_num are handed by address to
     * ADIOI_LUSTRE_W_Exchange_data and are never assigned in this function;
     * they are read in the "hole" write path and freed at "over". */
    ADIO_Offset *srt_off = NULL;
    int *srt_len = NULL;
    int srt_num = 0;
    /* current run of adjacent pieces being coalesced in the "hole" path */
    ADIO_Offset block_offset;
    int block_len;

    *error_code = MPI_SUCCESS;	/* changed below if error */
    /* only I/O errors are currently reported */

    /* calculate the number of writes of stripe size to be done.
     * That gives the no. of communication phases as well.
     * Note:
     *   Because we redistribute data in stripe-contiguous pattern for Lustre,
     *   each process has the same no. of communication phases.
     */

    /* seed st_loc/end_loc from the first process that has any request, so
     * that the MIN/MAX scan below does not compare against the -1 marker */
    for (i = 0; i < nprocs; i++) {
	if (others_req[i].count) {
	    st_loc = others_req[i].offsets[0];
	    end_loc = others_req[i].offsets[0];
	    break;
	}
    }
    /* lowest start offset and highest end offset (inclusive) over all
     * requests this process has to serve */
    for (i = 0; i < nprocs; i++) {
	for (j = 0; j < others_req[i].count; j++) {
	    st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
	    end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j] +
                                          others_req[i].lens[j] - 1));
	}
    }
    /* this process does no writing. */
    if ((st_loc == -1) && (end_loc == -1))
	ntimes = 0;
    MPI_Allreduce(&end_loc, &max_end_loc, 1, MPI_LONG_LONG_INT, MPI_MAX, fd->comm);
    /* avoid min_st_loc be -1 */
    /* a process without requests contributes max_end_loc instead, so its -1
     * marker cannot win the MPI_MIN reduction */
    if (st_loc == -1)
        st_loc = max_end_loc;
    MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm);
    /* align downward */
    min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size;

    /* Each time, only avail_cb_nodes number of IO clients perform IO,
     * so, step_size=avail_cb_nodes*stripe_size IO will be performed at most,
     * and ntimes=whole_file_portion/step_size
     */
    step_size = (ADIO_Offset) avail_cb_nodes * stripe_size;
    /* ceiling division of the global byte range by step_size; both inputs
     * come out of the reductions above */
    max_ntimes = (max_end_loc - min_st_loc + 1) / step_size
        + (((max_end_loc - min_st_loc + 1) % step_size) ? 1 : 0);
/*     max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1); */
    /* one stripe of staging space, only on processes that actually write */
    if (ntimes)
	write_buf = (char *) ADIOI_Malloc(stripe_size);

    /* calculate the start offset for each iteration */
    /* off_list[m] ends up as the smallest request offset that falls into
     * window m; windows without any request keep the max_end_loc default */
    off_list = (ADIO_Offset *) ADIOI_Malloc(max_ntimes * sizeof(ADIO_Offset));
    for (m = 0; m < max_ntimes; m ++)
        off_list[m] = max_end_loc;
    for (i = 0; i < nprocs; i++) {
        for (j = 0; j < others_req[i].count; j ++) {
            req_off = others_req[i].offsets[j];
            m = (int)((req_off - min_st_loc) / step_size);
            off_list[m] = ADIOI_MIN(off_list[m], req_off);
        }
    }

    recv_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    send_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* their use is explained below. calloc initializes to 0. */

    recv_count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be sent to each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration. */

    sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data sent to each proc so far. Used in
       ADIOI_Fill_send_buffer. initialized to 0 here. */

    send_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    curr_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    done_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* Above three are used in ADIOI_Fill_send_buffer */

    this_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* per-iteration snapshot of buf_idx[i][send_curr_offlen_ptr[i]] */

    recv_start_pos = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* used to store the starting value of recv_curr_offlen_ptr[i] in
       this iteration */

    /* flat_buf stays NULL for a contiguous buffer type; otherwise it is the
     * ADIOI_Flatlist node whose type matches datatype */
    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
	ADIOI_Flatten_datatype(datatype);
	flat_buf = ADIOI_Flatlist;
	while (flat_buf->type != datatype)
	    flat_buf = flat_buf->next;
    }
    MPI_Type_extent(datatype, &buftype_extent);
    /* I need to check if there are any outstanding nonblocking writes to
     * the file, which could potentially interfere with the writes taking
     * place in this collective write call. Since this is not likely to be
     * common, let me do the simplest thing possible here: Each process
     * completes all pending nonblocking operations before completing.
     */
    /*ADIOI_Complete_async(error_code);
    if (*error_code != MPI_SUCCESS) return;
    MPI_Barrier(fd->comm);
    */

    iter_st_off = min_st_loc;

    /* Although we have recognized the data according to OST index,
     * a read-modify-write will be done if there is a hole between the data.
     * For example: if blocksize=60, xfersize=30 and stripe_size=100,
     * then rank0 will collect data [0, 30] and [60, 90] then write. There
     * is a hole in [30, 60], which will cause a read-modify-write in [0, 90].
     *
     * To reduce its impact on the performance, we can disable data sieving
     * by hint "ds_in_coll".
     */
    /* check the hint for data sieving */
    data_sieving = fd->hints->fs_hints.lustre.ds_in_coll;

    for (m = 0; m < max_ntimes; m++) {
	/* go through all others_req and my_req to check which will be received
         * and sent in this iteration.
         */

	/* Note that MPI guarantees that displacements in filetypes are in
	   monotonically nondecreasing order and that, for writes, the
	   filetypes cannot specify overlapping regions in the file. This
	   simplifies implementation a bit compared to reads. */

	/*
           off         = start offset in the file for the data to be written in
                         this iteration
           iter_st_off = start offset of this iteration
           real_size   = size of data written (bytes) corresponding to off
           max_size    = possible maximum size of data written in this iteration
           req_off     = offset in the file for a particular contiguous request minus
                         what was satisfied in previous iteration
           send_off    = offset the request needed by other processes in this iteration
           req_len     = size corresponding to req_off
           send_len    = size corresponding to send_off
         */

	/* first calculate what should be communicated */
	for (i = 0; i < nprocs; i++)
	    recv_count[i] = recv_size[i] = send_size[i] = 0;

        off = off_list[m];
        /* the last window is clipped to the global end offset */
        max_size = ADIOI_MIN(step_size, max_end_loc - iter_st_off + 1);
        /* bytes from off up to the end of the stripe containing off, but
         * never beyond this process's own last byte (end_loc) */
        real_size = (int) ADIOI_MIN((off / stripe_size + 1) * stripe_size -
                                    off,
                                    end_loc - off + 1);

	for (i = 0; i < nprocs; i++) {
            /* send side: consume every my_req[i] entry that starts inside
             * the current window and remember where to resume next time */
            if (my_req[i].count) {
                this_buf_idx[i] = buf_idx[i][send_curr_offlen_ptr[i]];
                for (j = send_curr_offlen_ptr[i]; j < my_req[i].count; j++) {
                    send_off = my_req[i].offsets[j];
                    send_len = my_req[i].lens[j];
                    if (send_off < iter_st_off + max_size) {
                        send_size[i] += send_len;
                    } else {
                        break;
                    }
                }
                send_curr_offlen_ptr[i] = j;
            }
            /* receive side: same walk over others_req[i]; each accepted
             * entry is pointed at its position inside write_buf, which is
             * addressed relative to off */
	    if (others_req[i].count) {
		recv_start_pos[i] = recv_curr_offlen_ptr[i];
		for (j = recv_curr_offlen_ptr[i]; j < others_req[i].count; j++) {
                    req_off = others_req[i].offsets[j];
                    req_len = others_req[i].lens[j];
		    if (req_off < iter_st_off + max_size) {
			recv_count[i]++;
                        /* sanity check: the pointer arithmetic used below must
                         * agree with the same sum done in ADIO_Offset integers */
                        ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)write_buf)+req_off-off) == (ADIO_Offset)(MPIR_Upint)(write_buf+req_off-off));
			MPI_Address(write_buf + req_off - off,
				    &(others_req[i].mem_ptrs[j]));
                        recv_size[i] += req_len;
		    } else {
			break;
                    }
		}
		recv_curr_offlen_ptr[i] = j;
	    }
	}
        /* use variable "hole" to pass data_sieving flag into W_Exchange_data */
        hole = data_sieving;
	ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                                     len_list, send_size, recv_size, off, real_size,
                                     recv_count, recv_start_pos,
                                     sent_to_proc, nprocs, myrank,
                                     buftype_is_contig, contig_access_count,
                                     striping_info, others_req, send_buf_idx,
                                     curr_to_proc, done_to_proc, &hole, m,
                                  buftype_extent, this_buf_idx,
                                  &srt_off, &srt_len, &srt_num, error_code);

	if (*error_code != MPI_SUCCESS)
            goto over;

	/* flag = 1 when at least one process delivered data to this process
	 * in this iteration, i.e. there is something to write */
	flag = 0;
	for (i = 0; i < nprocs; i++)
	    if (recv_count[i]) {
		flag = 1;
		break;
	    }
	if (flag) {
            /* check whether to do data sieving */
            if(data_sieving == ADIOI_HINT_ENABLE) {
	        ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
			         ADIO_EXPLICIT_OFFSET, off, &status,
			         error_code);
            } else {
                /* if there is no hole, write data in one time;
                 * otherwise, write data in several times */
                if (!hole) {
                    ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, off, &status,
                                     error_code);
                } else {
                    /* walk the srt_off/srt_len pieces that start inside
                     * [off, off + real_size); pieces that directly follow
                     * one another are merged into one block and each block
                     * is written with its own ADIO_WriteContig call.
                     * block_offset == -1 means "no block open yet". */
                    block_offset = -1;
                    block_len = 0;
                    for (i = 0; i < srt_num; ++i) {
                        if (srt_off[i] < off + real_size &&
                            srt_off[i] >= off) {
                            if (block_offset == -1) {
                                block_offset = srt_off[i];
                                block_len = srt_len[i];
                            } else {
                                if (srt_off[i] == block_offset + block_len) {
                                    block_len += srt_len[i];
                                } else {
                                    /* gap found: flush the block collected
                                     * so far, then start a new one */
                                    ADIO_WriteContig(fd,
                                                     write_buf + block_offset - off,
                                                     block_len,
                                                     MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                                                     block_offset, &status,
                                                     error_code);
	                            if (*error_code != MPI_SUCCESS)
		                        goto over;
                                    block_offset = srt_off[i];
                                    block_len = srt_len[i];
                                }
                            }
                        }
                    }
                    /* flush the last open block, if any piece matched */
                    if (block_offset != -1) {
                        ADIO_WriteContig(fd,
                                         write_buf + block_offset - off,
                                         block_len,
                                         MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                                         block_offset, &status,
                                         error_code);
                        if (*error_code != MPI_SUCCESS)
                            goto over;
                    }
                }
            }
	    if (*error_code != MPI_SUCCESS)
		goto over;
	}
        /* advance to the next window */
        iter_st_off += max_size;
    }
over:
    /* common exit for success and failure: release every buffer owned by
     * this function (write_buf exists only when ntimes is non-zero) */
    if (srt_off)
        ADIOI_Free(srt_off);
    if (srt_len)
        ADIOI_Free(srt_len);
    if (ntimes)
	ADIOI_Free(write_buf);
    ADIOI_Free(recv_curr_offlen_ptr);
    ADIOI_Free(send_curr_offlen_ptr);
    ADIOI_Free(recv_count);
    ADIOI_Free(send_size);
    ADIOI_Free(recv_size);
    ADIOI_Free(sent_to_proc);
    ADIOI_Free(recv_start_pos);
    ADIOI_Free(send_buf_idx);
    ADIOI_Free(curr_to_proc);
    ADIOI_Free(done_to_proc);
    ADIOI_Free(this_buf_idx);
    ADIOI_Free(off_list);
}
Пример #11
0
/* Avery Ching and Kenin Columa's reworked two-phase algorithm.  Key features
 * - persistent file domains
 * - an option to use alltoall instead of point-to-point
 */
void ADIOI_IOStridedColl (ADIO_File fd, void *buf, int count, int rdwr,
			  MPI_Datatype datatype, int file_ptr_type,
			  ADIO_Offset offset, ADIO_Status *status,
			  int *error_code)
{
    ADIO_Offset min_st_offset=0, max_end_offset=0;
    ADIO_Offset st_end_offset[2];
    ADIO_Offset *all_st_end_offsets = NULL;
    int filetype_is_contig, buftype_is_contig, is_contig;
    ADIO_Offset orig_fp, off;
    int interleave_count = 0, i, nprocs, myrank, nprocs_for_coll;
    int cb_enable;
    ADIO_Offset bufsize;
    MPI_Aint extent, bufextent;
    int size;
    int agg_rank;

    ADIO_Offset agg_disp; /* aggregated file offset */
    MPI_Datatype agg_dtype; /* aggregated file datatype */

    int aggregators_done = 0;
    ADIO_Offset buffered_io_size = 0;

    int *alltoallw_disps;

    int *alltoallw_counts;
    int *client_alltoallw_counts;
    int *agg_alltoallw_counts;

    char *cb_buf = NULL;

    MPI_Datatype *client_comm_dtype_arr; /* aggregator perspective */
    MPI_Datatype *agg_comm_dtype_arr;    /* client perspective */
    ADIO_Offset *client_comm_sz_arr;     /* aggregator perspective */
    ADIO_Offset *agg_comm_sz_arr;        /* client perspective */

    /* file views for each client and aggregator */
    view_state *client_file_view_state_arr = NULL;
    view_state *agg_file_view_state_arr    = NULL;
    /* mem views for local process */
    view_state *my_mem_view_state_arr      = NULL;

    MPI_Status *agg_comm_statuses     = NULL;
    MPI_Request *agg_comm_requests    = NULL;
    MPI_Status *client_comm_statuses  = NULL;
    MPI_Request *client_comm_requests = NULL;
    int aggs_client_count = 0;
    int clients_agg_count = 0;

    MPI_Comm_size (fd->comm, &nprocs);
    MPI_Comm_rank (fd->comm, &myrank);
#ifdef DEBUG
    fprintf (stderr, "p%d: entering ADIOI_IOStridedColl\n", myrank);
#endif
#ifdef AGGREGATION_PROFILE
    if (rdwr == ADIOI_READ)
	MPE_Log_event (5010, 0, NULL);
    else
	MPE_Log_event (5012, 0, NULL);
#endif

    /* I need to check if there are any outstanding nonblocking writes
       to the file, which could potentially interfere with the writes
       taking place in this collective write call. Since this is not
       likely to be common, let me do the simplest thing possible here:
       Each process completes all pending nonblocking operations before
       completing. */

    nprocs_for_coll = fd->hints->cb_nodes;
    orig_fp = fd->fp_ind;

    if (rdwr == ADIOI_READ)
	cb_enable = fd->hints->cb_read;
    else
	cb_enable = fd->hints->cb_write;

    /* only check for interleaving if cb_read isn't disabled */
    if (cb_enable != ADIOI_HINT_DISABLE) {
	/* find the starting and ending byte of my I/O access */
	ADIOI_Calc_bounds (fd, count, datatype, file_ptr_type, offset,
			   &st_end_offset[0], &st_end_offset[1]);

	/* allocate an array of start/end pairs */
	all_st_end_offsets = (ADIO_Offset *)
	    ADIOI_Malloc (2*nprocs*sizeof(ADIO_Offset));
	MPI_Allgather (st_end_offset, 2, ADIO_OFFSET, all_st_end_offsets, 2,
		       ADIO_OFFSET, fd->comm);

	min_st_offset = all_st_end_offsets[0];
	max_end_offset = all_st_end_offsets[1];

	for (i=1; i<nprocs; i++) {
	    /* are the accesses of different processes interleaved? */
	    if ((all_st_end_offsets[i*2] < all_st_end_offsets[i*2-1]) &&
		(all_st_end_offsets[i*2] <= all_st_end_offsets[i*2+1]))
		interleave_count++;
	    /* This is a rudimentary check for interleaving, but should
	     * suffice for the moment. */
	    
	    min_st_offset = ADIOI_MIN(all_st_end_offsets[i*2],
				      min_st_offset);
	    max_end_offset = ADIOI_MAX(all_st_end_offsets[i*2+1],
				       max_end_offset);
	}
    }

    ADIOI_Datatype_iscontig (datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig (fd->filetype, &filetype_is_contig);

    if ((cb_enable == ADIOI_HINT_DISABLE
	 || (!interleave_count && (cb_enable == ADIOI_HINT_AUTO)))
	&& (fd->hints->cb_pfr != ADIOI_HINT_ENABLE)){
	if (cb_enable != ADIOI_HINT_DISABLE) {
	    ADIOI_Free (all_st_end_offsets);
	}

	if (buftype_is_contig && filetype_is_contig) {
	    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
		off = fd->disp + (fd->etype_size) * offset;
		if (rdwr == ADIOI_READ)
		    ADIO_ReadContig(fd, buf, count, datatype,
				    ADIO_EXPLICIT_OFFSET, off, status,
				    error_code);
		else
		    ADIO_WriteContig(fd, buf, count, datatype,
				     ADIO_EXPLICIT_OFFSET, off, status,
				     error_code);
	    }
	    else {
		if (rdwr == ADIOI_READ)
		    ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
				    0, status, error_code);
		else
		    ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
				     0, status, error_code);
	    }
	}
	else {
	    if (rdwr == ADIOI_READ)
		ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type,
				 offset, status, error_code);
	    else
		ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
				  offset, status, error_code);
	}
	return;
    }

    MPI_Type_extent(datatype, &extent);
    bufextent = extent * count;
    MPI_Type_size(datatype, &size);
    bufsize = size * count;

    /* Calculate file realms */
    if ((fd->hints->cb_pfr != ADIOI_HINT_ENABLE) ||
	(fd->file_realm_types == NULL))
	ADIOI_Calc_file_realms (fd, min_st_offset, max_end_offset);

    my_mem_view_state_arr = (view_state *)
	ADIOI_Calloc (1, nprocs * sizeof(view_state));
    agg_file_view_state_arr = (view_state *)
	ADIOI_Calloc (1, nprocs * sizeof(view_state));
    client_comm_sz_arr = (ADIO_Offset *)
	ADIOI_Calloc (1, nprocs * sizeof(ADIO_Offset));

    if (fd->is_agg) {
	client_file_view_state_arr = (view_state *)
	    ADIOI_Calloc (1, nprocs * sizeof(view_state));
    }
    else {
	client_file_view_state_arr = NULL;
    }

    /* Alltoallw doesn't like a null array even if the counts are
     * zero.  If you do not include this code, it will fail. */
    client_comm_dtype_arr = (MPI_Datatype *)
	ADIOI_Calloc (1, nprocs * sizeof(MPI_Datatype));
    if (!fd->is_agg)
	for (i = 0; i < nprocs; i++)
	    client_comm_dtype_arr[i] = MPI_BYTE;

    ADIOI_Exch_file_views (myrank, nprocs, file_ptr_type, fd, count,
			   datatype, offset, my_mem_view_state_arr,
			   agg_file_view_state_arr,
			   client_file_view_state_arr);

    agg_comm_sz_arr = (ADIO_Offset *)
	ADIOI_Calloc (1, nprocs * sizeof(ADIO_Offset));
    agg_comm_dtype_arr = (MPI_Datatype *)
	ADIOI_Malloc (nprocs * sizeof(MPI_Datatype));
    if (fd->is_agg) {
	ADIOI_Build_agg_reqs (fd, rdwr, nprocs,
			      client_file_view_state_arr,
			      client_comm_dtype_arr,
			      client_comm_sz_arr,
			      &agg_disp,
			      &agg_dtype);
	buffered_io_size = 0;
	for (i=0; i <nprocs; i++) {
	    if (client_comm_sz_arr[i] > 0)
		buffered_io_size += client_comm_sz_arr[i];
	}
    }
#ifdef USE_PRE_REQ
    else 
    {
	/* Example use of ADIOI_Build_client_pre_req. to an
	 * appropriate section */
	
	for (i = 0; i < fd->hints->cb_nodes; i++)
	{
	    agg_rank = fd->hints->ranklist[(i+myrank)%fd->hints->cb_nodes];
#ifdef AGGREGATION_PROFILE
	    MPE_Log_event (5040, 0, NULL);
#endif
	    ADIOI_Build_client_pre_req(
		fd, agg_rank, (i+myrank)%fd->hints->cb_nodes,
		&(my_mem_view_state_arr[agg_rank]),
		&(agg_file_view_state_arr[agg_rank]),
		2*1024*1024, 
		64*1024);
#ifdef AGGREGATION_PROFILE
	    MPE_Log_event (5041, 0, NULL);
#endif
	}
    }
#endif


    if (fd->is_agg)
	cb_buf = (char *) ADIOI_Malloc (fd->hints->cb_buffer_size);
    alltoallw_disps  = (int *) ADIOI_Calloc (nprocs, sizeof(int));
    alltoallw_counts = client_alltoallw_counts = (int *)
	ADIOI_Calloc (2*nprocs, sizeof(int));
    agg_alltoallw_counts = &alltoallw_counts[nprocs];

    if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
        /* aggregators pre-post all Irecv's for incoming data from clients */
        if ((fd->is_agg) && (rdwr == ADIOI_WRITE))
	    post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
			     client_comm_dtype_arr,
			     client_comm_sz_arr,
			     &agg_comm_requests,
			     &aggs_client_count);
    }
    /* Aggregators send amounts for data requested to clients */
    Exch_data_amounts (fd, nprocs, client_comm_sz_arr, agg_comm_sz_arr,
		       client_alltoallw_counts, agg_alltoallw_counts,
		       &aggregators_done);

#ifdef DEBUG
    fprintf (stderr, "client_alltoallw_counts[ ");
    for (i=0; i<nprocs; i++) {
	fprintf (stderr, "%d ", client_alltoallw_counts[i]);
    }
    fprintf (stderr, "]\n");
    fprintf (stderr, "agg_alltoallw_counts[ ");
    for (i=0; i<nprocs; i++) {
	fprintf (stderr,"%d ", agg_alltoallw_counts[i]);
    }
    fprintf (stderr, "]\n");
#endif

    /* keep looping while aggregators still have I/O to do */
    while (aggregators_done != nprocs_for_coll) {
	if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
	/* clients should build datatypes for local memory locations
	   for data communication with aggregators and post
	   communication as the datatypes are built */

	client_comm_requests = (MPI_Request *)
	    ADIOI_Calloc (fd->hints->cb_nodes, sizeof(MPI_Request));

	for (i = 0; i < fd->hints->cb_nodes; i++)
	{
	    clients_agg_count = 0;
	    agg_rank = fd->hints->ranklist[(i+myrank)%fd->hints->cb_nodes];
	    if (agg_comm_sz_arr[agg_rank] > 0) {
	        ADIOI_Build_client_req(fd, agg_rank,
				       (i+myrank)%fd->hints->cb_nodes,
				       &(my_mem_view_state_arr[agg_rank]),
				       &(agg_file_view_state_arr[agg_rank]),
				       agg_comm_sz_arr[agg_rank], 
				       &(agg_comm_dtype_arr[agg_rank]));

#ifdef AGGREGATION_PROFILE
		if (i == 0)
		    MPE_Log_event (5038, 0, NULL);
#endif
		post_client_comm (fd, rdwr, agg_rank, buf,
				  agg_comm_dtype_arr[agg_rank],
				  agg_alltoallw_counts[agg_rank],
				  &client_comm_requests[clients_agg_count]);
		clients_agg_count++;
	    }
	}
#ifdef AGGREGATION_PROFILE
	if (!clients_agg_count)
	    MPE_Log_event(5039, 0, NULL);
#endif

	if (rdwr == ADIOI_READ) {
	    if (fd->is_agg && buffered_io_size) {
		ADIOI_IOFiletype (fd, cb_buf, buffered_io_size, MPI_BYTE,
				  ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
				  ADIOI_READ, status, error_code);
		if (*error_code != MPI_SUCCESS) return;
		MPI_Type_free (&agg_dtype);
	    }

#ifdef DEBUG
	    fprintf (stderr, "expecting from [agg](disp,size,cnt)=");
	    for (i=0; i < nprocs; i++) {
		MPI_Type_size (agg_comm_dtype_arr[i], &size);
		fprintf (stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i], 
			 size, agg_alltoallw_counts[i]);
		if (i != nprocs - 1)
		    fprintf(stderr, ",");
	    }
	    fprintf (stderr, "]\n");
	    if (fd->is_agg) {
		fprintf (stderr, "sending to [client](disp,size,cnt)=");
		for (i=0; i < nprocs; i++) {
		    if (fd->is_agg)
			MPI_Type_size (client_comm_dtype_arr[i], &size);
		    else
			size = -1;
		    
		    fprintf (stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i], 
			     size, client_alltoallw_counts[i]);
		    if (i != nprocs - 1)
			fprintf(stderr, ",");
		}
		fprintf (stderr,"\n");
	    }
	    fflush (NULL);
#endif
	    /* aggregators post all Isends for outgoing data to clients */
	    if (fd->is_agg)
		post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
				     client_comm_dtype_arr,
				     client_comm_sz_arr,
				     &agg_comm_requests,
				     &aggs_client_count);

	    if (fd->is_agg && aggs_client_count) {
		agg_comm_statuses = ADIOI_Malloc(aggs_client_count *
						 sizeof(MPI_Status));
		MPI_Waitall(aggs_client_count, agg_comm_requests,
			    agg_comm_statuses);
#ifdef AGGREGATION_PROFILE
		MPE_Log_event (5033, 0, NULL);
#endif
		ADIOI_Free (agg_comm_requests);
		ADIOI_Free (agg_comm_statuses);
	    }

	    if (clients_agg_count) {
		client_comm_statuses = ADIOI_Malloc(clients_agg_count *
						    sizeof(MPI_Status));
		MPI_Waitall(clients_agg_count, client_comm_requests,
			    client_comm_statuses);
#ifdef AGGREGATION_PROFILE
		MPE_Log_event (5039, 0, NULL);
#endif
		ADIOI_Free (client_comm_requests);
		ADIOI_Free (client_comm_statuses);
	    }

#ifdef DEBUG2
	    fprintf (stderr, "buffered_io_size = %lld\n", buffered_io_size);
	    if (fd->is_agg && buffered_io_size) {
		fprintf (stderr, "buf = [");
		for (i=0; i<bufextent; i++)
		    fprintf (stderr, "%c", ((char *) buf)[i]);
		fprintf (stderr, "]\n");
		fprintf (stderr, "cb_buf = [");
		for (i=0; i<buffered_io_size; i++)
		    fprintf (stderr, "%c", cb_buf[i]);
		fprintf (stderr, "]\n");
		fflush (NULL);
	    }
#endif
	}
	else { /* Write Case */
#ifdef DEBUG
	    fprintf (stderr, "sending to [agg](disp,size,cnt)=");
	    for (i=0; i < nprocs; i++) {
		MPI_Type_size (agg_comm_dtype_arr[i], &size);
		fprintf (stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i], 
			 size, agg_alltoallw_counts[i]);
		if (i != nprocs - 1)
		    fprintf(stderr, ",");
	    }
	    fprintf (stderr, "]\n");
	    fprintf (stderr, "expecting from [client](disp,size,cnt)=");
	    for (i=0; i < nprocs; i++) {
		if (fd->is_agg)
		    MPI_Type_size (client_comm_dtype_arr[i], &size);
		else
		    size = -1;
		
		fprintf (stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i], 
			 size, client_alltoallw_counts[i]);
		if (i != nprocs - 1)
		    fprintf(stderr, ",");
	    }
	    fprintf (stderr,"\n");
	    fflush (NULL);
#endif
#ifdef DEBUG
	    fprintf (stderr, "buffered_io_size = %lld\n", buffered_io_size);
#endif
	    
	    if (clients_agg_count) {
		client_comm_statuses = ADIOI_Malloc(clients_agg_count *
						    sizeof(MPI_Status));
		MPI_Waitall(clients_agg_count, client_comm_requests,
			    client_comm_statuses);
#ifdef AGGREGATION_PROFILE
		MPE_Log_event (5039, 0, NULL);
#endif
		ADIOI_Free(client_comm_requests);
		ADIOI_Free(client_comm_statuses);
	    }
#ifdef DEBUG2
	    if (bufextent) {
		fprintf (stderr, "buf = [");
		for (i=0; i<bufextent; i++)
		    fprintf (stderr, "%c", ((char *) buf)[i]);
		fprintf (stderr, "]\n");
	    }
#endif

	    if (fd->is_agg && buffered_io_size) {
		assert (aggs_client_count != 0);
		/* make sure we actually have the data to write out */
		agg_comm_statuses = (MPI_Status *)
		    ADIOI_Malloc (aggs_client_count*sizeof(MPI_Status));
		
		MPI_Waitall (aggs_client_count, agg_comm_requests,
			     agg_comm_statuses);
#ifdef AGGREGATION_PROFILE
		MPE_Log_event (5033, 0, NULL);
#endif
		ADIOI_Free (agg_comm_requests);
		ADIOI_Free (agg_comm_statuses);
#ifdef DEBUG2
		fprintf (stderr, "cb_buf = [");
		for (i=0; i<buffered_io_size; i++)
		    fprintf (stderr, "%c", cb_buf[i]);
		fprintf (stderr, "]\n");
		fflush (NULL);
#endif
		ADIOI_IOFiletype (fd, cb_buf, buffered_io_size, MPI_BYTE,
				  ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
				  ADIOI_WRITE, status, error_code);
		if (*error_code != MPI_SUCCESS) return;
		MPI_Type_free (&agg_dtype);
	    }

	}
	} else {
	/* Alltoallw version of everything */
	ADIOI_Build_client_reqs(fd, nprocs, my_mem_view_state_arr,
				agg_file_view_state_arr,
				agg_comm_sz_arr, agg_comm_dtype_arr);

	if (rdwr == ADIOI_READ) {
	    if (fd->is_agg && buffered_io_size) {
		ADIOI_IOFiletype (fd, cb_buf, buffered_io_size, MPI_BYTE,
				  ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
				  ADIOI_READ, status, error_code);
		if (*error_code != MPI_SUCCESS) return;
		MPI_Type_free (&agg_dtype);
	    }

#ifdef AGGREGATION_PROFILE
	    MPE_Log_event (5032, 0, NULL);
#endif
	    MPI_Alltoallw (cb_buf, client_alltoallw_counts, alltoallw_disps,
			   client_comm_dtype_arr,
			   buf, agg_alltoallw_counts , alltoallw_disps,
			   agg_comm_dtype_arr,
			   fd->comm);
#ifdef AGGREGATION_PROFILE
	    MPE_Log_event (5033, 0, NULL);
#endif
	}
	else { /* Write Case */
#ifdef AGGREGATION_PROFILE
	    MPE_Log_event (5032, 0, NULL);
#endif
	    MPI_Alltoallw (buf, agg_alltoallw_counts, alltoallw_disps,
			   agg_comm_dtype_arr,
			   cb_buf, client_alltoallw_counts, alltoallw_disps,
			   client_comm_dtype_arr,
			   fd->comm);
#ifdef AGGREGATION_PROFILE
	    MPE_Log_event (5033, 0, NULL);
#endif
	    if (fd->is_agg && buffered_io_size) {
		ADIOI_IOFiletype (fd, cb_buf, buffered_io_size, MPI_BYTE,
				  ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
				  ADIOI_WRITE, status, error_code);
		if (*error_code != MPI_SUCCESS) return;
		MPI_Type_free (&agg_dtype);
	    }
	}
	}

	/* Free (uncommit) datatypes for reuse */
	if (fd->is_agg) {
	    if (buffered_io_size > 0) {
		for (i=0; i<nprocs; i++) {
		    if (client_comm_sz_arr[i] > 0)
			MPI_Type_free (&client_comm_dtype_arr[i]);
		}
	    }
	}
	for (i=0; i<nprocs; i++) {
	    if (agg_comm_sz_arr[i] > 0)
		MPI_Type_free (&agg_comm_dtype_arr[i]);
	}

	/* figure out next set up requests */
	if (fd->is_agg) {
	    ADIOI_Build_agg_reqs (fd, rdwr, nprocs,
				  client_file_view_state_arr,
				  client_comm_dtype_arr,
				  client_comm_sz_arr,
				  &agg_disp,
				  &agg_dtype);
	    buffered_io_size = 0;
	    for (i=0; i <nprocs; i++) {
		if (client_comm_sz_arr[i] > 0)
		    buffered_io_size += client_comm_sz_arr[i];
	    }
	}
#ifdef USE_PRE_REQ
	else {
	    /* Example use of ADIOI_Build_client_pre_req. to an
	     * appropriate section */
	    for (i = 0; i < fd->hints->cb_nodes; i++)
	    {
		agg_rank = fd->hints->ranklist[(i+myrank)%fd->hints->cb_nodes];
#ifdef AGGREGATION_PROFILE
		MPE_Log_event (5040, 0, NULL);
#endif
		ADIOI_Build_client_pre_req(
		    fd, agg_rank, (i+myrank)%fd->hints->cb_nodes,
		    &(my_mem_view_state_arr[agg_rank]),
		    &(agg_file_view_state_arr[agg_rank]),
		    2*1024*1024, 
		    64*1024);
#ifdef AGGREGATION_PROFILE
		MPE_Log_event (5041, 0, NULL);
#endif
	    }
	}
#endif
	
	/* aggregators pre-post all Irecv's for incoming data from
	 * clients.  if nothing is needed, agg_comm_requests is not
	 * allocated */
	if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
	    if ((fd->is_agg) && (rdwr == ADIOI_WRITE))
	        post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
				 client_comm_dtype_arr,
				 client_comm_sz_arr,
				 &agg_comm_requests,
				 &aggs_client_count);
	}

	/* Aggregators send amounts for data requested to clients */
	Exch_data_amounts (fd, nprocs, client_comm_sz_arr, agg_comm_sz_arr,
			   client_alltoallw_counts, agg_alltoallw_counts,
			   &aggregators_done);

    }

    /* Clean up */
	
    if (fd->hints->cb_pfr != ADIOI_HINT_ENABLE) {
	/* AAR, FSIZE, and User provided uniform File realms */
	if (1) {
	    ADIOI_Delete_flattened (fd->file_realm_types[0]);
	    MPI_Type_free (&fd->file_realm_types[0]);
	}
	else {
	    for (i=0; i<fd->hints->cb_nodes; i++) {
		ADIOI_Datatype_iscontig(fd->file_realm_types[i], &is_contig);
		if (!is_contig)
		    ADIOI_Delete_flattened(fd->file_realm_types[i]);
		MPI_Type_free (&fd->file_realm_types[i]);
	    }
	}
	ADIOI_Free (fd->file_realm_types);
	ADIOI_Free (fd->file_realm_st_offs);
    }

    /* This memtype must be deleted from the ADIOI_Flatlist or else it
     * will match incorrectly with other datatypes which use this
     * pointer. */
    ADIOI_Delete_flattened(datatype);
    ADIOI_Delete_flattened(fd->filetype);

    if (fd->is_agg) {
	if (buffered_io_size > 0)
	    MPI_Type_free (&agg_dtype);
	for (i=0; i<nprocs; i++) {
	    MPI_Type_free (&client_comm_dtype_arr[i]);
	    ADIOI_Free (client_file_view_state_arr[i].flat_type_p->indices);
	    ADIOI_Free (client_file_view_state_arr[i].flat_type_p->blocklens);
	    ADIOI_Free (client_file_view_state_arr[i].flat_type_p);
	}
	ADIOI_Free (client_file_view_state_arr);
	ADIOI_Free (cb_buf);
    } 
    for (i = 0; i<nprocs; i++)
	if (agg_comm_sz_arr[i] > 0)
	    MPI_Type_free (&agg_comm_dtype_arr[i]);
    
    ADIOI_Free (client_comm_sz_arr);
    ADIOI_Free (client_comm_dtype_arr);
    ADIOI_Free (my_mem_view_state_arr);
    ADIOI_Free (agg_file_view_state_arr);
    ADIOI_Free (agg_comm_sz_arr);
    ADIOI_Free (agg_comm_dtype_arr);
    ADIOI_Free (alltoallw_disps);
    ADIOI_Free (alltoallw_counts);
    ADIOI_Free (all_st_end_offsets);

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status.  The right way is
     * to keep track of how much data was actually read and placed in
     * buf during collective I/O. */
#endif
    fd->fp_sys_posn = -1; /* set it to null. */
#ifdef AGGREGATION_PROFILE
    if (rdwr == ADIOI_READ)
	MPE_Log_event (5011, 0, NULL);
    else
	MPE_Log_event (5013, 0, NULL);
#endif
}
Пример #12
0
/*
 * ADIOI_GPFS_Calc_file_domains - split the aggregate access range of a
 * collective operation into one file domain (FD) per I/O aggregator, sized
 * in whole GPFS blocks so that aggregators do not share file blocks.
 *
 * Parameters:
 *   fd                - open file; fd->blksize supplies the block size.  If it
 *                       is <= 0 it is overwritten with 1 MiB.  On BGQPLATFORM
 *                       builds fd->hints->fs_hints.bg is read as well.
 *   st_offsets        - nprocs entries, first offset accessed by each process
 *   end_offsets       - nprocs entries, last offset accessed by each process
 *   nprocs            - number of entries in st_offsets / end_offsets
 *   nprocs_for_coll   - number of aggregators (must be > 0: it is a divisor)
 *   min_st_offset_ptr - out: minimum of st_offsets[]
 *   fd_start_ptr      - out: newly ADIOI_Malloc'ed array of nprocs_for_coll
 *                       FD start offsets
 *   fd_end_ptr        - out: newly ADIOI_Malloc'ed array of nprocs_for_coll
 *                       FD end offsets (inclusive)
 *   fd_size_ptr       - out: size of the first file domain, after trimming
 *   fs_ptr            - not referenced by this function
 *
 * The two output arrays are not freed here; presumably the caller releases
 * them with ADIOI_Free -- confirm against the call site.
 *
 * Layout: the access range is widened to block multiples, the resulting
 * blocks are dealt out so that domain sizes differ by at most one block, and
 * the first/last domains are then trimmed so the domains together cover
 * exactly [min(st_offsets), max(end_offsets)].  A domain that received no
 * blocks is represented by fd_end < fd_start.
 *
 * The common (non-GPFS) version of this routine also accepts min_fd_size and
 * striping_unit.  Neither is implemented here: fd->blksize plays the role of
 * the striping unit and no minimum domain size is enforced.
 */
void ADIOI_GPFS_Calc_file_domains(ADIO_File fd,
                                  ADIO_Offset *st_offsets,
                                  ADIO_Offset *end_offsets,
                                  int          nprocs,
                                  int          nprocs_for_coll,
                                  ADIO_Offset *min_st_offset_ptr,
                                  ADIO_Offset **fd_start_ptr,
                                  ADIO_Offset **fd_end_ptr,
                                  ADIO_Offset *fd_size_ptr,
                                  void        *fs_ptr)
{
    ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
    int i, aggr;
    TRACE_ERR("Entering ADIOI_GPFS_Calc_file_domains\n");
    blksize_t blksize;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5004, 0, NULL);
#endif

#   if AGG_DEBUG
    static char myname[] = "ADIOI_GPFS_Calc_file_domains";
    DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n",
                myname, __LINE__, nprocs_for_coll);
#   endif

    /* Default to 1 MiB if the block size is unset.  The default is stored
     * back into the file structure, not just used locally. */
    if (fd->blksize <= 0)
        fd->blksize = 1048576;
    blksize = fd->blksize;

#   if AGG_DEBUG
    /* blksize_t has no portable conversion specifier: cast to match %ld */
    DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,(long) blksize);
#   endif

    /* find min of start offsets and max of end offsets of all processes */
    min_st_offset  = st_offsets [0];
    max_end_offset = end_offsets[0];
    for (i = 1; i < nprocs; i++) {
        min_st_offset  = ADIOI_MIN(min_st_offset,  st_offsets[i]);
        max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
    }

    /* Widen the access range to block multiples:
     *   gpfs_lb       - min_st_offset rounded down to a multiple of blksize
     *   gpfs_ub       - max_end_offset rounded up to a multiple of blksize,
     *                   minus one
     *   gpfs_lb_rdoff - bytes added below min_st_offset by the rounding
     *   gpfs_ub_rdoff - bytes added above max_end_offset by the rounding
     *
     * NOTE(review): when max_end_offset is itself an exact multiple of
     * blksize, gpfs_ub equals max_end_offset - 1 and gpfs_ub_rdoff is -1.
     * The trimming step below then grows the last domain by one byte, so the
     * covered range is still correct, but that domain ends one byte into the
     * next block. */
    ADIO_Offset gpfs_ub       = (max_end_offset + blksize - 1) / blksize * blksize - 1;
    ADIO_Offset gpfs_lb       = min_st_offset / blksize * blksize;
    ADIO_Offset gpfs_ub_rdoff = gpfs_ub - max_end_offset;
    ADIO_Offset gpfs_lb_rdoff = min_st_offset - gpfs_lb;
    ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;

    int         naggs    = nprocs_for_coll;

    /* Per-aggregator domain sizes; scratch space released before returning. */
    fd_size       = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_start_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_end_ptr   = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    fd_start      = *fd_start_ptr;
    fd_end        = *fd_end_ptr;

    /* Each aggregator gets a whole number of blocks, but the division is not
     * likely to be even: some domains are "large" (nb_cn_small + 1 blocks)
     * and the rest "small" (nb_cn_small blocks).
     *
     * Example: 17 blocks over 3 aggregators
     *   nb_cn_small = 17 / 3 = 5
     *   naggs_large = 17 % 3 = 2
     *   naggs_small = 3 - 2  = 1
     * which gives two 6-block domains and one 5-block domain.
     *
     * Relatively small files, e.g. 1000 blocks over 2064 aggregators:
     *   nb_cn_small = 0, naggs_large = 1000, naggs_small = 1064
     * so 1000 domains hold one block each and 1064 domains are empty.
     *
     * It might be a good idea to "mix" the small (possibly empty) domains
     * across the fd_size array rather than grouping them, so that no
     * pset/bridge-set is left without work and the I/O nodes stay balanced. */
    ADIO_Offset n_gpfs_blk    = fd_gpfs_range / blksize;
    ADIO_Offset nb_cn_small   = n_gpfs_blk / naggs;
    ADIO_Offset naggs_large   = n_gpfs_blk % naggs;
    ADIO_Offset naggs_small   = naggs - naggs_large;  /* diagnostics only */

#ifdef BGQPLATFORM
    if (gpfsmpio_balancecontig == 1) {
        /* File domain blocks are assigned to aggregators in a breadth-first
         * fashion relative to the ions - additionally, file domains on the
         * aggregators sharing the same bridgeset and ion have contiguous
         * offsets. */

        /* start with every domain small ... */
        for (i = 0; i < naggs; i++)
            fd_size[i] = nb_cn_small * blksize;

        /* ... then spread the large domains across the bridges */

        /* bridgelistoffset[j]: index into the aggregator rank list of the
         * first aggregator of bridge j, i.e. the sum of bridgelistnum[] over
         * all previous bridges */
        int *bridgelistoffset =
            (int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));
        /* tmpbridgelistnum[j]: working copy of bridgelistnum[j], decremented
         * each time a large domain is assigned to bridge j */
        int *tmpbridgelistnum =
            (int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));

        int j;
        int bridgerankoffset = 0;   /* running prefix sum of bridgelistnum[] */
        for (j = 0; j < fd->hints->fs_hints.bg.numbridges; j++) {
            bridgelistoffset[j] = bridgerankoffset;
            tmpbridgelistnum[j] = fd->hints->fs_hints.bg.bridgelistnum[j];
            bridgerankoffset   += fd->hints->fs_hints.bg.bridgelistnum[j];
        }
        int bridgeiter = 0;

        /* Distribute the large domains across the aggregators going
         * breadth-first over the bridge list.  This spreads the domain sizes
         * across the ions, so that when the domains are later laid out in
         * rank-list order the offsets are contiguous within each bridge and
         * ion. */
        for (j = 0; j < naggs_large; j++) {
            int foundbridge = 0;
            int numbridgelistpasses = 0;
            while (!foundbridge) {
                if (tmpbridgelistnum[bridgeiter] > 0) {
                    foundbridge = 1;
                    /* number of aggregators on this bridge that already hold
                     * a large domain == slot of the next one to enlarge */
                    int currentbridgelistnum =
                        (fd->hints->fs_hints.bg.bridgelistnum[bridgeiter] -
                         tmpbridgelistnum[bridgeiter]);
                    int currentfdsizeindex = bridgelistoffset[bridgeiter] +
                        currentbridgelistnum;
                    fd_size[currentfdsizeindex] = (nb_cn_small + 1) * blksize;
                    tmpbridgelistnum[bridgeiter]--;
                }
                /* advance to the next bridge whether or not this one was used */
                if (bridgeiter == (fd->hints->fs_hints.bg.numbridges - 1)) {
                    /* guard against infinite loop - should only ever make 1
                     * pass through the bridge list per large domain */
                    ADIOI_Assert(numbridgelistpasses == 0);
                    numbridgelistpasses++;
                    bridgeiter = 0;
                }
                else
                    bridgeiter++;
            }
        }
        ADIOI_Free(tmpbridgelistnum);
        ADIOI_Free(bridgelistoffset);

    } else {
        /* BG/L- and BG/P-style distribution of file domains: simple
         * allocation, large domains first */
        for (i = 0; i < naggs; i++) {
            if (i < naggs_large) {
                fd_size[i] = (nb_cn_small + 1) * blksize;
            } else {
                fd_size[i] = nb_cn_small * blksize;
            }
        }
    }
#ifdef balancecontigtrace
    int myrank;
    MPI_Comm_rank(fd->comm,&myrank);
    if (myrank == 0) {
        /* ADIO_Offset values are cast to long long to match %lld; they were
         * previously passed to %d. */
        fprintf(stderr,"naggs_small is %lld nb_cn_small is %lld\n",
                (long long) naggs_small, (long long) nb_cn_small);
        for (i = 0; i < naggs; i++) {
            fprintf(stderr,"fd_size[%d] set to %lld agg rank is %d\n",
                    i, (long long) fd_size[i], fd->hints->ranklist[i]);
        }
    }
#endif

#else /* not BGQ platform */
    /* simple allocation, large domains first */
    for (i = 0; i < naggs; i++) {
        if (i < naggs_large) {
            fd_size[i] = (nb_cn_small + 1) * blksize;
        } else {
            fd_size[i] = nb_cn_small * blksize;
        }
    }

#endif


#   if AGG_DEBUG
     /* casts make the arguments match the %llu conversions exactly */
     DBG_FPRINTF(stderr,"%s(%d): "
                   "gpfs_ub       %llu, "
                   "gpfs_lb       %llu, "
                   "gpfs_ub_rdoff %llu, "
                   "gpfs_lb_rdoff %llu, "
                   "fd_gpfs_range %llu, "
                   "n_gpfs_blk    %llu, "
                   "nb_cn_small   %llu, "
                   "naggs_large   %llu, "
                   "naggs_small   %llu, "
                   "\n",
                   myname,__LINE__,
                   (unsigned long long) gpfs_ub      ,
                   (unsigned long long) gpfs_lb      ,
                   (unsigned long long) gpfs_ub_rdoff,
                   (unsigned long long) gpfs_lb_rdoff,
                   (unsigned long long) fd_gpfs_range,
                   (unsigned long long) n_gpfs_blk   ,
                   (unsigned long long) nb_cn_small  ,
                   (unsigned long long) naggs_large  ,
                   (unsigned long long) naggs_small
                   );
#   endif

    /* Trim the rounding back off so the first domain starts at min_st_offset
     * and the last one ends at max_end_offset.  If the last domain was given
     * no blocks its size becomes <= 0 here, which the loop below turns into
     * an empty range (fd_end < fd_start). */
    fd_size[0]       -= gpfs_lb_rdoff;
    fd_size[naggs-1] -= gpfs_ub_rdoff;

    /* lay the domains out back to back, in aggregator order */
    ADIO_Offset offset = min_st_offset;
    for (aggr = 0; aggr < naggs; aggr++) {
        fd_start[aggr] = offset;
        fd_end  [aggr] = offset + fd_size[aggr] - 1;
        offset += fd_size[aggr];
    }

    *fd_size_ptr = fd_size[0];
    *min_st_offset_ptr = min_st_offset;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5005, 0, NULL);
#endif
    ADIOI_Free (fd_size);
    TRACE_ERR("Leaving ADIOI_GPFS_Calc_file_domains\n");
}