/* 
 * compute the aggregator ranklist and store it in the fd->hints struct
 */ 
static void 
ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd, 
					const ADIOI_BGL_ConfInfo_t *confInfo, 
					ADIOI_BGL_ProcInfo_t *all_procInfo,
					int *aggrsInPset )
{
#   if AGG_DEBUG
    int i; 
#   endif
    int naggs; 
    int *tmp_ranklist;

  /* compute the ranklist of IO aggregators and put into tmp_ranklist */
    tmp_ranklist = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int));

#   if AGG_DEBUG
    for (i=0; i<confInfo->nProcs; i++) {
      DBG_FPRINTF(stderr, "\tcpuid %1d, rank = %6d\n", all_procInfo[i].cpuid, all_procInfo[i].rank );
    }
#   endif

    naggs = ADIOI_BGL_compute_agg_ranklist_serial_do(confInfo, all_procInfo,
                                                     aggrsInPset, tmp_ranklist);

#   define VERIFY 0
#   if VERIFY
    DBG_FPRINTF(stderr, "\tconfInfo = %3d,%3d,%3d,%3d,%3d,%3d,%.4f; naggs = %d\n", 
	    confInfo->PsetSize        ,
	    confInfo->numPsets        ,
	    confInfo->isVNM           ,
	    confInfo->virtualPsetSize ,
	    confInfo->nProcs          ,
	    confInfo->nAggrs          ,
	    confInfo->aggRatio        ,
	    naggs );
#   endif

#   if AGG_DEBUG
    for (i=0; i<naggs; i++) {
      DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
    }
#   endif

  /* copy the ranklist of IO aggregators to fd->hints */
    if(fd->hints->ranklist != NULL) ADIOI_Free (fd->hints->ranklist);

    fd->hints->cb_nodes = naggs;
    fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
    memcpy( fd->hints->ranklist, tmp_ranklist, naggs*sizeof(int) );

  /* free the temporary ranklist */
    ADIOI_Free( tmp_ranklist );
    return;
}
Example #2
static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
			 datatype, int nprocs,
			 int myrank, ADIOI_Access
			 *others_req, ADIO_Offset *offset_list,
			 ADIO_Offset *len_list, int contig_access_count, ADIO_Offset
                         min_st_offset, ADIO_Offset fd_size,
			 ADIO_Offset *fd_start, ADIO_Offset *fd_end,
                         int *buf_idx, int *error_code)
{
/* Read in sizes of no more than coll_bufsize, an info parameter.
   Send data to appropriate processes. 
   Place recd. data in user buf.
   The idea is to reduce the amount of extra memory required for
   collective I/O. If all data were read all at once, which is much
   easier, it would require temp space more than the size of user_buf,
   which is often unacceptable. For example, to read a distributed
   array from a file, where each local array is 8Mbytes, requiring
   at least another 8Mbytes of temp space is unacceptable. */

    int i, j, m, ntimes, max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc=-1, end_loc=-1, off, done, real_off, req_off;
    char *read_buf = NULL, *tmp_buf;
    int *curr_offlen_ptr, *count, *send_size, *recv_size;
    int *partial_send, *recd_from_proc, *start_pos;
    /* end_loc-st_loc could exceed what fits in an int, so make these ADIO_Offsets */
    ADIO_Offset real_size, size, for_curr_iter, for_next_iter;
    int req_len, flag, rank;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf=NULL;
    MPI_Aint buftype_extent;
    int coll_bufsize;

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */
    
/* calculate the number of reads of size coll_bufsize
   to be done by each process and the max among all processes.
   That gives the no. of communication phases as well.
   coll_bufsize is obtained from the hints object. */

    coll_bufsize = fd->hints->cb_buffer_size;

    /* grab some initial values for st_loc and end_loc */
    for (i=0; i < nprocs; i++) {
	if (others_req[i].count) {
	    st_loc = others_req[i].offsets[0];
	    end_loc = others_req[i].offsets[0];
	    break;
	}
    }

    /* now find the real values */
    for (i=0; i < nprocs; i++)
	for (j=0; j<others_req[i].count; j++) {
	    st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
	    end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j]
					  + others_req[i].lens[j] - 1));
	}

    /* calculate ntimes, the number of times this process must perform I/O
     * operations in order to complete all the requests it has received.
     * the need for multiple I/O operations comes from the restriction that
     * we only use coll_bufsize bytes of memory for internal buffering.
     */
    if ((st_loc==-1) && (end_loc==-1)) {
	/* this process does no I/O. */
	ntimes = 0;
    }
    else {
	/* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/
	ntimes = (int) ((end_loc - st_loc + coll_bufsize)/coll_bufsize);
    }
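    /* Worked example (hypothetical sizes): if this process is responsible for
     * end_loc-st_loc+1 = 10 MB of file data and coll_bufsize = 4 MB, the
     * expression above gives ntimes = 3, i.e. ceiling(10/4) reads of at most
     * coll_bufsize bytes each. */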

    MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm); 

    read_buf = fd->io_buf;  /* Allocated at open time */

    curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); 
    /* its use is explained below. calloc initializes to 0. */

    count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    partial_send = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* if only a portion of the last off-len pair is sent to a process 
       in a particular iteration, the length sent is stored here.
       calloc initializes to 0. */

    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be sent to each proc. in an iteration */

    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    recd_from_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data recd. so far from each proc. Used in
       ADIOI_Fill_user_buffer. initialized to 0 here. */

    start_pos = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* used to store the starting value of curr_offlen_ptr[i] in 
       this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
	ADIOI_Flatten_datatype(datatype);
	flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;
    }
    MPI_Type_extent(datatype, &buftype_extent);

    done = 0;
    off = st_loc;
    for_curr_iter = for_next_iter = 0;

    MPI_Comm_rank(fd->comm, &rank);

    for (m=0; m<ntimes; m++) {
       /* read buf of size coll_bufsize (or less) */
       /* go through all others_req and check if any are satisfied
          by the current read */

       /* since MPI guarantees that displacements in filetypes are in 
          monotonically nondecreasing order, I can maintain a pointer
	  (curr_offlen_ptr) to 
          current off-len pair for each process in others_req and scan
          further only from there. There is still a problem of filetypes
          such as:  (1, 2, 3 are not process nos. They are just numbers for
          three chunks of data, specified by a filetype.)

                   1  -------!--
                   2    -----!----
                   3       --!-----

          where ! indicates where the current read_size limitation cuts 
          through the filetype.  I resolve this by reading up to !, but
          filling the communication buffer only for 1. I copy the portion
          left over for 2 into a tmp_buf for use in the next
	  iteration. i.e., 2 and 3 will be satisfied in the next
	  iteration. This simplifies filling in the user's buf at the
	  other end, as only one off-len pair with incomplete data
	  will be sent. I also don't need to send the individual
	  offsets and lens along with the data, as the data is being
	  sent in a particular order. */ 

          /* off = start offset in the file for the data actually read in 
                   this iteration 
             size = size of data read corresponding to off
             real_off = off minus whatever data was retained in memory from
                  previous iteration for cases like 2, 3 illustrated above
             real_size = size plus the extra corresponding to real_off
             req_off = off in file for a particular contiguous request 
                       minus what was satisfied in previous iteration
             req_size = size corresponding to req_off */
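          /* Hypothetical illustration: with coll_bufsize = 4 MB, off = 8 MB
             and 1 KB carried over from the previous iteration
             (for_curr_iter = 1 KB), this iteration reads size <= 4 MB starting
             at off, while real_off = 8 MB - 1 KB and real_size = size + 1 KB
             describe everything sitting in read_buf that can satisfy
             requests. */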

	size = ADIOI_MIN((unsigned)coll_bufsize, end_loc-st_loc+1-done); 
	real_off = off - for_curr_iter;
	real_size = size + for_curr_iter;

	for (i=0; i<nprocs; i++) count[i] = send_size[i] = 0;
	for_next_iter = 0;

	for (i=0; i<nprocs; i++) {
#ifdef RDCOLL_DEBUG
	    DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n", rank, i, others_req[i].count); 
#endif
	    if (others_req[i].count) {
		start_pos[i] = curr_offlen_ptr[i];
		for (j=curr_offlen_ptr[i]; j<others_req[i].count;
		     j++) {
		    if (partial_send[i]) {
			/* this request may have been partially
			   satisfied in the previous iteration. */
			req_off = others_req[i].offsets[j] +
			    partial_send[i]; 
                        req_len = others_req[i].lens[j] -
			    partial_send[i];
			partial_send[i] = 0;
			/* modify the off-len pair to reflect this change */
			others_req[i].offsets[j] = req_off;
			others_req[i].lens[j] = req_len;
		    }
		    else {
			req_off = others_req[i].offsets[j];
                        req_len = others_req[i].lens[j];
		    }
		    if (req_off < real_off + real_size) {
			count[i]++;
      ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+req_off-real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf+req_off-real_off));
			MPI_Address(read_buf+req_off-real_off, 
                               &(others_req[i].mem_ptrs[j]));
      ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));
			send_size[i] += (int)(ADIOI_MIN(real_off + real_size - req_off, 
                                      (ADIO_Offset)(unsigned)req_len)); 

			if (real_off+real_size-req_off < (ADIO_Offset)(unsigned)req_len) {
			    partial_send[i] = (int) (real_off + real_size - req_off);
			    if ((j+1 < others_req[i].count) && 
                                 (others_req[i].offsets[j+1] < 
                                     real_off+real_size)) { 
				/* this is the case illustrated in the
				   figure above. */
				for_next_iter = ADIOI_MAX(for_next_iter,
					  real_off + real_size - others_req[i].offsets[j+1]); 
				/* max because it must cover requests 
				   from different processes */
			    }
			    break;
			}
		    }
		    else break;
		}
		curr_offlen_ptr[i] = j;
	    }
	}

	flag = 0;
	for (i=0; i<nprocs; i++)
	    if (count[i]) flag = 1;

	if (flag) {
      ADIOI_Assert(size == (int)size);
	    ADIO_ReadContig(fd, read_buf+for_curr_iter, (int)size, MPI_BYTE,
			    ADIO_EXPLICIT_OFFSET, off, &status, error_code);
	    if (*error_code != MPI_SUCCESS) return;
	}
	
	for_curr_iter = for_next_iter;
	
	ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
			    send_size, recv_size, count, 
       			    start_pos, partial_send, recd_from_proc, nprocs,
			    myrank, 
			    buftype_is_contig, contig_access_count,
			    min_st_offset, fd_size, fd_start, fd_end,
			    others_req, 
                            m, buftype_extent, buf_idx); 


	if (for_next_iter) {
	    tmp_buf = (char *) ADIOI_Malloc(for_next_iter);
      ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+real_size-for_next_iter) == (ADIO_Offset)(MPIR_Upint)(read_buf+real_size-for_next_iter));
      ADIOI_Assert((for_next_iter+coll_bufsize) == (size_t)(for_next_iter+coll_bufsize));
	    memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter);
	    ADIOI_Free(fd->io_buf);
	    fd->io_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
	    memcpy(fd->io_buf, tmp_buf, for_next_iter);
	    read_buf = fd->io_buf;
	    ADIOI_Free(tmp_buf);
	}

	off += size;
	done += size;
    }

    for (i=0; i<nprocs; i++) count[i] = send_size[i] = 0;
    for (m=ntimes; m<max_ntimes; m++) 
/* nothing to send, but check for recv. */
	ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
			    send_size, recv_size, count, 
			    start_pos, partial_send, recd_from_proc, nprocs,
			    myrank, 
			    buftype_is_contig, contig_access_count,
			    min_st_offset, fd_size, fd_start, fd_end,
			    others_req, m,
                            buftype_extent, buf_idx); 

    ADIOI_Free(curr_offlen_ptr);
    ADIOI_Free(count);
    ADIOI_Free(partial_send);
    ADIOI_Free(send_size);
    ADIOI_Free(recv_size);
    ADIOI_Free(recd_from_proc);
    ADIOI_Free(start_pos);
}
Example #3
void ADIOI_Calc_my_off_len(ADIO_File fd, int bufcount, MPI_Datatype
			    datatype, int file_ptr_type, ADIO_Offset
			    offset, ADIO_Offset **offset_list_ptr, ADIO_Offset
			    **len_list_ptr, ADIO_Offset *start_offset_ptr,
			    ADIO_Offset *end_offset_ptr, int
			   *contig_access_count_ptr)
{
    MPI_Count filetype_size, etype_size;
    MPI_Count buftype_size;
    int i, j, k;
    ADIO_Offset i_offset;
    ADIO_Offset frd_size=0, old_frd_size=0;
    int st_index=0;
    ADIO_Offset n_filetypes, etype_in_filetype;
    ADIO_Offset abs_off_in_filetype=0;
    ADIO_Offset bufsize;
    ADIO_Offset sum, n_etypes_in_filetype, size_in_filetype;
    int contig_access_count, filetype_is_contig;
    ADIO_Offset *len_list;
    MPI_Aint filetype_extent, filetype_lb;
    ADIOI_Flatlist_node *flat_file;
    ADIO_Offset *offset_list, off, end_offset=0, disp;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5028, 0, NULL);
#endif
    
/* For this process's request, calculate the list of offsets and
   lengths in the file and determine the start and end offsets. */

    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size_x(fd->filetype, &filetype_size);
    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_lb(fd->filetype, &filetype_lb);
    MPI_Type_size_x(datatype, &buftype_size);
    etype_size = fd->etype_size;

    if ( ! filetype_size ) {
	*contig_access_count_ptr = 0;
	*offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
	*len_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
        /* 2 is for consistency. everywhere I malloc one more than needed */

	offset_list = *offset_list_ptr;
	len_list = *len_list_ptr;
        offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : 
                 fd->disp + (ADIO_Offset)etype_size * offset;
	len_list[0] = 0;
	*start_offset_ptr = offset_list[0];
	*end_offset_ptr = offset_list[0] + len_list[0] - 1;
	
	return;
    }

    if (filetype_is_contig) {
	*contig_access_count_ptr = 1;        
	*offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
	*len_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
        /* 2 is for consistency. everywhere I malloc one more than needed */

	offset_list = *offset_list_ptr;
	len_list = *len_list_ptr;
        offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : 
                 fd->disp + (ADIO_Offset)etype_size * offset;
	len_list[0] = (ADIO_Offset)bufcount * (ADIO_Offset)buftype_size;
	*start_offset_ptr = offset_list[0];
	*end_offset_ptr = offset_list[0] + len_list[0] - 1;

	/* update file pointer */
	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = *end_offset_ptr + 1;
    }

    else {

       /* First calculate what size of offset_list and len_list to allocate */
   
       /* filetype already flattened in ADIO_Open or ADIO_Fcntl */
	flat_file = ADIOI_Flatlist;
	while (flat_file->type != fd->filetype) flat_file = flat_file->next;
	disp = fd->disp;

#ifdef RDCOLL_DEBUG 
        {
            int ii;
            DBG_FPRINTF(stderr, "flattened %3lld : ", flat_file->count );
            for (ii=0; ii<flat_file->count; ii++) {
                DBG_FPRINTF(stderr, "%16lld:%-16lld", flat_file->indices[ii], flat_file->blocklens[ii] );
            }
            DBG_FPRINTF(stderr, "\n" );
        }
#endif
	if (file_ptr_type == ADIO_INDIVIDUAL) {
           /* Wei-keng reworked type processing to be a bit more efficient */
            offset       = fd->fp_ind - disp;
            n_filetypes  = (offset - flat_file->indices[0]) / filetype_extent;
             offset     -= (ADIO_Offset)n_filetypes * filetype_extent;
	     	/* now offset is local to this extent */
 
            /* find the block where offset is located, skip blocklens[i]==0 */
            for (i=0; i<flat_file->count; i++) {
                ADIO_Offset dist;
                if (flat_file->blocklens[i] == 0) continue;
                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
                /* frd_size is from offset to the end of block i */
		if (dist == 0) {
			i++;
			offset   = flat_file->indices[i];
			frd_size = flat_file->blocklens[i];
			break;
		}
		if (dist > 0) {
                    frd_size = dist;
		    break;
		}
	    }
            st_index = i;  /* starting index in flat_file->indices[] */
            offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
        }
	else {
	    n_etypes_in_filetype = filetype_size/etype_size;
	    n_filetypes = offset / n_etypes_in_filetype;
	    etype_in_filetype = offset % n_etypes_in_filetype;
	    size_in_filetype = etype_in_filetype * etype_size;
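	    /* Hypothetical illustration: with filetype_size = 1024 bytes and
	       etype_size = 4, n_etypes_in_filetype = 256.  An explicit offset
	       of 600 etypes then yields n_filetypes = 2 whole filetypes
	       skipped, etype_in_filetype = 88, and size_in_filetype = 352
	       bytes into the next filetype. */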
 
	    sum = 0;
	    for (i=0; i<flat_file->count; i++) {
		sum += flat_file->blocklens[i];
		if (sum > size_in_filetype) {
		    st_index = i;
		    frd_size = sum - size_in_filetype;
		    abs_off_in_filetype = flat_file->indices[i] +
			size_in_filetype - (sum - flat_file->blocklens[i]);
		    break;
		}
	    }

	    /* abs. offset in bytes in the file */
	    offset = disp + n_filetypes* (ADIO_Offset)filetype_extent + 
		abs_off_in_filetype;
	}

         /* calculate how much space to allocate for offset_list, len_list */

	old_frd_size = frd_size;
	contig_access_count = i_offset = 0;
	j = st_index;
	bufsize = (ADIO_Offset)buftype_size * (ADIO_Offset)bufcount;
	frd_size = ADIOI_MIN(frd_size, bufsize);
	while (i_offset < bufsize) {
	    if (frd_size) contig_access_count++;
	    i_offset += frd_size;
	    j = (j + 1) % flat_file->count;
	    frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
	}

        /* allocate space for offset_list and len_list */

	*offset_list_ptr = (ADIO_Offset *)
	         ADIOI_Malloc((contig_access_count+1)*sizeof(ADIO_Offset));  
	*len_list_ptr = (ADIO_Offset *) ADIOI_Malloc((contig_access_count+1)*sizeof(ADIO_Offset));
        /* +1 to avoid a 0-size malloc */

	offset_list = *offset_list_ptr;
	len_list = *len_list_ptr;

      /* find start offset, end offset, and fill in offset_list and len_list */

	*start_offset_ptr = offset; /* calculated above */

	i_offset = k = 0;
	j = st_index;
	off = offset;
	frd_size = ADIOI_MIN(old_frd_size, bufsize);
	while (i_offset < bufsize) {
	    if (frd_size) {
		offset_list[k] = off;
		len_list[k] = frd_size;
		k++;
	    }
	    i_offset += frd_size;
	    end_offset = off + frd_size - 1;

     /* Note: end_offset points to the last byte-offset that will be accessed.
         e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/

	    if (off + frd_size < disp + flat_file->indices[j] +
		flat_file->blocklens[j] + 
		 n_filetypes* (ADIO_Offset)filetype_extent)
	    {
		off += frd_size;
		/* did not reach end of contiguous block in filetype.
		 * no more I/O needed. off is incremented by frd_size. 
		 */
	    }
	    else {
		j = (j+1) % flat_file->count;
                n_filetypes += (j == 0) ? 1 : 0;
                while (flat_file->blocklens[j]==0) {
			j = (j+1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    /* hit end of flattened filetype; start at beginning 
		     * again */
		}
		off = disp + flat_file->indices[j] + 
		     n_filetypes* (ADIO_Offset)filetype_extent;
		frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
	    }
	}

	/* update file pointer */
	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

	*contig_access_count_ptr = contig_access_count;
	 *end_offset_ptr = end_offset;
    }
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5029, 0, NULL);
#endif
}
Example #4
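/* The RC() error-checking macro and the DBG_FPRINTF() debug-print macro used
 * below are defined by the test harness and are not part of this excerpt
 * (neither is the hfi extension header that declares hfi_pkt_counters_fn and
 * hfi_pkt_counter_t).  A minimal sketch of what such helpers might look like,
 * assuming the usual pami_result_t return-code convention -- illustrative
 * placeholders, not the original definitions: */
#include <stdio.h>
#include <pami.h>

#ifndef RC
#define RC(call)                                                          \
  do {                                                                    \
    pami_result_t rc_ = (call);                                           \
    if (rc_ != PAMI_SUCCESS) {                                            \
      fprintf(stderr, "Error. %s failed with result = %d\n", #call, rc_); \
      return 1;                                                           \
    }                                                                     \
  } while (0)
#endif

#ifndef DBG_FPRINTF
#define DBG_FPRINTF(args) fprintf args  /* callers pass ((stderr, "...", ...)) */
#endif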
int main(int argc, char ** argv)
{
  pami_client_t     client;
  pami_context_t    context;
  pami_result_t     status = PAMI_ERROR;
  
  /* create PAMI client */
  RC( PAMI_Client_create("TEST", &client, NULL, 0) );
  DBG_FPRINTF((stderr,"Client created successfully at 0x%p\n",client));

  /* create PAMI context */
  RC( PAMI_Context_createv(client, NULL, 0, &context, 1) );
  DBG_FPRINTF((stderr,"Context created successfully at 0x%p\n",context));

  /* ------------------------------------------------------------------------ */

  pami_extension_t      extension;
  const char            ext_name[] = "EXT_hfi_extension";
  const char            sym_name[] = "hfi_pkt_counters";
  hfi_pkt_counters_fn   hfi_counters = NULL;
  hfi_pkt_counter_t     pkt_counter;

  /* open PAMI extension */
  RC( PAMI_Extension_open (client, ext_name, &extension) );
  DBG_FPRINTF((stderr,"Open %s successfully.\n", ext_name));

  /* load PAMI extension function */
  hfi_counters = (hfi_pkt_counters_fn) 
      PAMI_Extension_symbol (extension, sym_name);
  if (hfi_counters == (void *)NULL)
  {
    fprintf (stderr, "Error. Failed to load %s function in %s\n",
             sym_name, ext_name); 
    return 1;
  } 
  DBG_FPRINTF((stderr,"Loaded function %s in %s successfully.\n", 
              sym_name, ext_name));

  /* invoke PAMI extension function */
  RC( hfi_counters(context, &pkt_counter) );
  DBG_FPRINTF((stderr,"Function %s invoked successfully.\n", 
              sym_name));
  printf( "Pkt sent =         %lu\n"
          "Pkt sent dropped = %lu\n"
          "Ind pkt sent =     %lu\n"
          "Pkt recv =         %lu\n"
          "Pkt recv dropped = %lu\n"
          "Ind pkt recv =     %lu\n"
          "Imm pkt sent =     %lu\n",
          pkt_counter.total_packets_sent,
          pkt_counter.packets_send_drop,
          pkt_counter.indicate_packet_sent,
          pkt_counter.total_packets_recv,
          pkt_counter.packets_recv_drop,
          pkt_counter.indicate_packet_recv,
          pkt_counter.immediate_packet_sent);

  /* close PAMI extension */
  RC( PAMI_Extension_close (extension) );
  DBG_FPRINTF((stderr,"Close %s successfully.\n", ext_name));

  /* ------------------------------------------------------------------------ */
  /* destroy PAMI context */
  RC( PAMI_Context_destroyv(&context, 1) );
  DBG_FPRINTF((stderr, "PAMI context destroyed successfully\n"));

  /* destroy PAMI client */
  RC( PAMI_Client_destroy(&client) );
  DBG_FPRINTF((stderr, "PAMI client destroyed successfully\n"));

  return 0;
}
Example #5
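/* This example references a global table[] of MAX_TABLE_SZ entries (the
 * target of the remote update) and two completion callbacks, barrier_done and
 * fence_done, none of which appear in the excerpt.  A minimal sketch of what
 * they might look like, assuming the standard pami_event_function callback
 * signature -- the names and table size are placeholders, not the original
 * test's definitions: */
#include <pami.h>

#define MAX_TABLE_SZ 1024                    /* hypothetical table size */
unsigned long long table[MAX_TABLE_SZ];      /* updated remotely via the hfi extension */

/* Each callback simply sets the integer pointed to by the cookie so the
 * while (... == 0) PAMI_Context_advance(...) polling loops can exit. */
static void barrier_done(pami_context_t context, void *cookie,
                         pami_result_t result)
{
  (void)context; (void)result;
  *(volatile int *)cookie = 1;
}

static void fence_done(pami_context_t context, void *cookie,
                       pami_result_t result)
{
  (void)context; (void)result;
  *(volatile int *)cookie = 1;
}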
int main(int argc, char ** argv)
{
    pami_client_t         client;
    pami_context_t        context;
    pami_result_t         status = PAMI_ERROR;
    pami_configuration_t  pami_config;
    pami_geometry_t       world_geo;
    size_t                barrier_alg_num[2];
    pami_algorithm_t*     bar_always_works_algo = NULL;
    pami_metadata_t*      bar_always_works_md = NULL;
    pami_algorithm_t*     bar_must_query_algo = NULL;
    pami_metadata_t*      bar_must_query_md   = NULL;
    pami_xfer_t           barrier;
    int                   my_id;
    volatile int          is_fence_done   = 0;
    volatile int          is_barrier_done = 0;

    /* create PAMI client */
    RC( PAMI_Client_create("TEST", &client, NULL, 0) );
    DBG_FPRINTF((stderr,"Client created successfully at 0x%p\n",client));

    /* create PAMI context */
    RC( PAMI_Context_createv(client, NULL, 0, &context, 1) );
    DBG_FPRINTF((stderr,"Context created successfully at 0x%p\n",context));

    /* query my task id */
    bzero(&pami_config, sizeof(pami_configuration_t));
    pami_config.name = PAMI_CLIENT_TASK_ID;
    RC( PAMI_Client_query(client, &pami_config, 1) );
    my_id = pami_config.value.intval;
    DBG_FPRINTF((stderr,"My task id is %d\n", my_id));

    /* get the world geometry */
    RC( PAMI_Geometry_world(client, &world_geo) );
    DBG_FPRINTF((stderr,"World geometry is at 0x%p\n",world_geo));

    /* query number of barrier algorithms */
    RC( PAMI_Geometry_algorithms_num(world_geo, PAMI_XFER_BARRIER, 
                barrier_alg_num) );
    DBG_FPRINTF((stderr,"%d-%d algorithms are available for barrier op\n",
                barrier_alg_num[0], barrier_alg_num[1]));
    if (barrier_alg_num[0] <= 0) {
        fprintf (stderr, "Error. No (%lu) algorithm is available for barrier op\n",
                barrier_alg_num[0]);
        return 1;
    }

    /* query barrier algorithm list */
    bar_always_works_algo =
        (pami_algorithm_t*)malloc(sizeof(pami_algorithm_t)*barrier_alg_num[0]);
    bar_always_works_md =
        (pami_metadata_t*)malloc(sizeof(pami_metadata_t)*barrier_alg_num[0]);
    bar_must_query_algo =
        (pami_algorithm_t*)malloc(sizeof(pami_algorithm_t)*barrier_alg_num[1]);
    bar_must_query_md =
        (pami_metadata_t*)malloc(sizeof(pami_metadata_t)*barrier_alg_num[1]);

    RC( PAMI_Geometry_algorithms_query(world_geo, PAMI_XFER_BARRIER,
                bar_always_works_algo, bar_always_works_md, barrier_alg_num[0],
                bar_must_query_algo, bar_must_query_md, barrier_alg_num[1]) );
    DBG_FPRINTF((stderr,"Algorithm [%s] at 0x%p will be used for barrier op\n",
                bar_always_works_md[0].name, bar_always_works_algo[0]));

    /* begin PAMI fence */
    RC( PAMI_Fence_begin(context) );
    DBG_FPRINTF((stderr,"PAMI fence begins\n"));

    /* ------------------------------------------------------------------------ */

    pami_extension_t          extension;
    const char                ext_name[] = "EXT_hfi_extension";
    const char                sym_name[] = "hfi_remote_update";
    hfi_remote_update_fn      remote_update = NULL;
    hfi_remote_update_info_t  remote_info;
    pami_memregion_t          mem_region;
    size_t                    mem_region_sz = 0;
    unsigned long long        operand = 1234;
    unsigned long long        orig_val = 0;
    int                       offset = (operand)%MAX_TABLE_SZ;

    /* initialize table for remote update operation */
    int i;
    for (i = 0; i < MAX_TABLE_SZ; i ++) {
        table[i] = (unsigned long long) i;
    }
    orig_val = table[offset];

    /* open PAMI extension */
    RC( PAMI_Extension_open (client, ext_name, &extension) );
    DBG_FPRINTF((stderr,"Open %s successfully.\n", ext_name));

    /* load PAMI extension function */
    remote_update = (hfi_remote_update_fn) 
        PAMI_Extension_symbol (extension, sym_name);
    if (remote_update == (void *)NULL)
    {
        fprintf (stderr, "Error. Failed to load %s function in %s\n",
                 sym_name, ext_name); 
        return 1;
    } else {
        DBG_FPRINTF((stderr,"Loaded function %s in %s successfully.\n", 
                    sym_name, ext_name));
    }

    /* create a memory region for remote update operation */
    RC( PAMI_Memregion_create(context, table, 
                MAX_TABLE_SZ*sizeof(unsigned long long),
                &mem_region_sz, &mem_region) );
    DBG_FPRINTF((stderr,"%d-byte PAMI memory region created successfully.\n",
                mem_region_sz));

    /* perform a PAMI barrier */
    is_barrier_done = 0;
    barrier.cb_done = barrier_done;
    barrier.cookie = (void*)&is_barrier_done;
    barrier.algorithm = bar_always_works_algo[0];
    RC( PAMI_Collective(context, &barrier) );
    DBG_FPRINTF((stderr,"PAMI barrier op invoked successfully.\n"));
    while (is_barrier_done == 0)
        PAMI_Context_advance(context, 1000);
    DBG_FPRINTF((stderr,"PAMI barrier op finished successfully.\n"));

    RC( PAMI_Context_lock(context) );

    /* prepare remote update info */
    remote_info.dest = my_id^1;
    remote_info.op = 0;           /* op_add */
    remote_info.atomic_operand = operand;
    remote_info.dest_buf = (unsigned long long)(&(table[offset]));

    /* invoke remote update PAMI extension function */
    RC( remote_update(context, 1, &remote_info) );
    DBG_FPRINTF((stderr,"Function %s invoked successfully.\n", 
                sym_name));

    RC( PAMI_Context_unlock(context) );

    /* perform a PAMI fence */
    is_fence_done = 0;
    RC( PAMI_Fence_all(context, fence_done, (void*)&is_fence_done) );
    DBG_FPRINTF((stderr,"PAMI_Fence_all invoked successfully.\n")); 
    while (is_fence_done == 0)
        PAMI_Context_advance(context, 1000);
    DBG_FPRINTF((stderr,"PAMI_Fence_all finished successfully.\n")); 

    /* perform a PAMI barrier */
    is_barrier_done = 0;
    barrier.cb_done = barrier_done;
    barrier.cookie = (void*)&is_barrier_done;
    barrier.algorithm = bar_always_works_algo[0];
    RC( PAMI_Collective(context, &barrier) );
    DBG_FPRINTF((stderr,"PAMI barrier op invoked successfully.\n"));
    while (is_barrier_done == 0)
        PAMI_Context_advance(context, 1000);
    DBG_FPRINTF((stderr,"PAMI barrier op finished successfully.\n"));

    /* verify data after remote update operation */
    if (table[offset] != orig_val + operand) {
        printf("Data verification at offset %d with operand %lu failed: "
                "[%lu expected with %lu updated]\n",
                offset, operand, orig_val+operand, table[offset]);
    } else {
        printf("Data verification at offset %d with operand %lu passed: "
                "[%lu expected with %lu updated].\n",
                offset, operand, orig_val+operand, table[offset]);
    }

    /* destroy the memory region after remote update operation */
    RC( PAMI_Memregion_destroy(context, &mem_region) );
    DBG_FPRINTF((stderr,"PAMI memory region removed successfully.\n"));

    /* close PAMI extension */
    RC( PAMI_Extension_close (extension) );
    DBG_FPRINTF((stderr,"Close %s successfully.\n", ext_name));

    /* ------------------------------------------------------------------------ */

    /* end PAMI fence */
    RC( PAMI_Fence_end(context) );
    DBG_FPRINTF((stderr,"PAMI fence ends\n"));

    /* destroy PAMI context */
    RC( PAMI_Context_destroyv(&context, 1) );
    DBG_FPRINTF((stderr, "PAMI context destroyed successfully\n"));

    /* destroy PAMI client */
    RC( PAMI_Client_destroy(&client) );
    DBG_FPRINTF((stderr, "PAMI client destroyed successfully\n"));

    return 0;
}
Example #6
int main(int argc, char ** argv)
{
  pami_client_t client;
  pami_context_t context;
  pami_result_t status = PAMI_ERROR;

  status = PAMI_Client_create("TEST", &client, NULL, 0);
  if(status != PAMI_SUCCESS)
  {
    fprintf (stderr, "Error. Unable to initialize pami client. result = %d\n", status);
    return 1;
  }
  DBG_FPRINTF((stderr,"Client %p\n",client));

  status = PAMI_Context_createv(client, NULL, 0, &context, 1);
  if(status != PAMI_SUCCESS)
  {
    fprintf (stderr, "Error. Unable to create pami context. result = %d\n", status);
    return 1;
  }

  /* ------------------------------------------------------------------------ */

  pami_extension_t extension;
  status = PAMI_Extension_open (client, "EXT_torus_network", &extension);
  if(status != PAMI_SUCCESS)
  {
    fprintf (stderr, "Error. The \"EXT_torus_network\" extension is not implemented. result = %d\n", status);
    return 1;
  }

  pami_extension_torus_information_fn pamix_torus_info =
    (pami_extension_torus_information_fn) PAMI_Extension_symbol (extension, "information");
  if (pamix_torus_info == (void *)NULL)
  {
    fprintf (stderr, "Error. The \"EXT_torus_network\" extension function \"information\" is not implemented. result = %d\n", status);
    return 1;
  }

  const pami_extension_torus_information_t * info = pamix_torus_info ();

  fprintf (stdout, "Torus Dimensions:  %zu\n", info->dims);

  char str[1024];
  size_t i, nchars;

  for (nchars=i=0; i<(info->dims-1); i++)
    nchars += snprintf (&str[nchars],1023-nchars, "%zu,", info->coord[i]);
  nchars += snprintf (&str[nchars],1023-nchars, "%zu", info->coord[info->dims-1]);
  fprintf (stdout, "Torus Coordinates: [%s]\n", str);

  for (nchars=i=0; i<(info->dims-1); i++)
    nchars += snprintf (&str[nchars],1023-nchars, "%zu,", info->size[i]);
  nchars += snprintf (&str[nchars],1023-nchars, "%zu", info->size[info->dims-1]);
  fprintf (stdout, "Torus Size:        [%s]\n", str);

  for (nchars=i=0; i<(info->dims-1); i++)
    nchars += snprintf (&str[nchars],1023-nchars, "%zu,", info->torus[i]);
  nchars += snprintf (&str[nchars],1023-nchars, "%zu", info->torus[info->dims-1]);
  fprintf (stdout, "Torus Wrap:        [%s]\n", str);



  pami_extension_torus_task2torus_fn pamix_torus_task2torus =
    (pami_extension_torus_task2torus_fn) PAMI_Extension_symbol (extension, "task2torus");
  if (pamix_torus_task2torus == (void *)NULL)
  {
    fprintf (stderr, "Error. The \"EXT_torus_network\" extension function \"task2torus\" is not implemented. result = %d\n", status);
    return 1;
  }

  pami_task_t task = 1;
  size_t * coord = (size_t *) malloc (sizeof(size_t) * info->dims);
  status = pamix_torus_task2torus (task, coord);
  if (status != PAMI_SUCCESS)
  {
    fprintf (stderr, "Error.  Unable to query the torus coordinates of task 1\n");
    return 1;
  }

  for (nchars=i=0; i<(info->dims-1); i++)
    nchars += snprintf (&str[nchars],1023-nchars, "%zu,", coord[i]);
  nchars += snprintf (&str[nchars],1023-nchars, "%zu", coord[i]);
  fprintf (stdout, "Task 1 Torus Coordinates:     [%s]\n", str);


  pami_extension_torus_torus2task_fn pamix_torus_torus2task =
    (pami_extension_torus_torus2task_fn) PAMI_Extension_symbol (extension, "torus2task");
  if (pamix_torus_torus2task == (void *)NULL)
  {
    fprintf (stderr, "Error. The \"EXT_torus_network\" extension function \"torus2task\" is not implemented. result = %d\n", status);
    return 1;
  }

  /*coord[0] = 0; */
  /*coord[1] = 0; */
  /*coord[2] = 0; */
  /*coord[3] = 1; */
  status = pamix_torus_torus2task (coord, &task);
  if (status != PAMI_SUCCESS)
  {
    fprintf (stderr, "Error.  Unable to query the task for coordinates [%zu,%zu,%zu,%zu]\n",coord[0],coord[1],coord[2],coord[3]);
    return 1;
  }
  for (nchars=i=0; i<(info->dims-1); i++)
    nchars += snprintf (&str[nchars],1023-nchars, "%zu,", coord[i]);
  nchars += snprintf (&str[nchars],1023-nchars, "%zu", coord[i]);
  fprintf (stdout, "Task at Torus Coordinates [%s]: %d\n", str, task);;






  status = PAMI_Extension_close (extension);
  if(status != PAMI_SUCCESS)
  {
    fprintf (stderr, "Error. The \"EXT_torus_network\" extension could not be closed. result = %d\n", status);
    return 1;
  }


  /* ------------------------------------------------------------------------ */
  DBG_FPRINTF((stderr, "PAMI_Context_destroyv(&context, 1);\n"));
  status = PAMI_Context_destroyv(&context, 1);
  if(status != PAMI_SUCCESS)
  {
    fprintf(stderr, "Error. Unable to destroy pami context. result = %d\n", status);
    return 1;
  }

  DBG_FPRINTF((stderr, "PAMI_Client_destroy(&client);\n"));
  status = PAMI_Client_destroy(&client);
  if(status != PAMI_SUCCESS)
  {
    fprintf(stderr, "Error. Unable to finalize pami client. result = %d\n", status);
    return 1;
  }

  DBG_FPRINTF((stderr, "return 0;\n"));
  return 0;
}
Example #7
void ADIOI_BG_Open(ADIO_File fd, int *error_code)
{
  int perm, old_mask, amode;
  static char myname[] = "ADIOI_BG_OPEN";

  /* set internal variables for tuning environment variables */
  ad_bg_get_env_vars();    

  if (fd->perm == ADIO_PERM_NULL)  {
    old_mask = umask(022);
    umask(old_mask);
    perm = old_mask ^ 0666;
  }
  else perm = fd->perm;
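  /* Worked example: with a typical umask of 022 (whose bits are a subset of
   * 0666), perm = 022 ^ 0666 = 0644, i.e. rw-r--r--, the same mode a plain
   * creat() with mode 0666 would produce. */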

    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
	amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
	amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
	amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
	amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
	amode = amode | O_EXCL;
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
    fd->fd_sys = open(fd->filename, amode, perm);
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif
  DBG_FPRINTF(stderr,"open('%s',%#X,%#X) rc=%d, errno=%d\n",fd->filename,amode,perm,fd->fd_sys,errno);
  fd->fd_direct = -1;

  if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
    fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

    if(fd->fd_sys != -1)
    {

        /* Initialize the ad_bg file system specific information */
        ADIOI_BG_assert(fd->fs_ptr == NULL);
        fd->fs_ptr = (ADIOI_BG_fs*) ADIOI_Malloc(sizeof(ADIOI_BG_fs));

        ((ADIOI_BG_fs*)fd->fs_ptr)->blksize = 1048576; /* default to 1M */

        /* default is no fsync aggregation */
        ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr = 
	    ADIOI_BG_FSYNC_AGGREGATION_DISABLED; 


#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_a, 0, NULL);
#endif
        scaleable_stat(fd);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
#endif
	/* file domain code will get terribly confused in a hard-to-debug way
	 * if gpfs blocksize not sensible */
        ADIOI_BG_assert( ((ADIOI_BG_fs*)fd->fs_ptr)->blksize > 0);
    }

  if (fd->fd_sys == -1)  {
    if (errno == ENAMETOOLONG)
      *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                         MPIR_ERR_RECOVERABLE, myname,
                                         __LINE__, MPI_ERR_BAD_FILE,
                                         "**filenamelong",
                                         "**filenamelong %s %d",
                                         fd->filename,
                                         strlen(fd->filename));
    else if (errno == ENOENT)
      *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                         MPIR_ERR_RECOVERABLE, myname,
                                         __LINE__, MPI_ERR_NO_SUCH_FILE,
                                         "**filenoexist",
                                         "**filenoexist %s",
                                         fd->filename);
    else if (errno == ENOTDIR || errno == ELOOP)
      *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                         MPIR_ERR_RECOVERABLE,
                                         myname, __LINE__,
                                         MPI_ERR_BAD_FILE,
                                         "**filenamedir",
                                         "**filenamedir %s",
                                         fd->filename);
    else if (errno == EACCES)    {
      *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                         MPIR_ERR_RECOVERABLE, myname,
                                         __LINE__, MPI_ERR_ACCESS,
                                         "**fileaccess",
                                         "**fileaccess %s", 
                                         fd->filename );
    }
    else if (errno == EROFS)    {
      /* Read only file or file system and write access requested */
      *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                         MPIR_ERR_RECOVERABLE, myname,
                                         __LINE__, MPI_ERR_READ_ONLY,
                                         "**ioneedrd", 0 );
    }
    else    {
      *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                         MPIR_ERR_RECOVERABLE, myname,
                                         __LINE__, MPI_ERR_IO, "**io",
                                         "**io %s", strerror(errno));
    }
  }
  else *error_code = MPI_SUCCESS;
}
Example #8
void ADIOI_GPFS_Open(ADIO_File fd, int *error_code)
{
  int perm, old_mask, amode, rank, rc;
  static char myname[] = "ADIOI_GPFS_OPEN";

  /* set internal variables for tuning environment variables */
  ad_gpfs_get_env_vars();

  if (fd->perm == ADIO_PERM_NULL)  {
    old_mask = umask(022);
    umask(old_mask);
    perm = old_mask ^ 0666;
  }
  else perm = fd->perm;

    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
	amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
	amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
	amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
	amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
	amode = amode | O_EXCL;
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
    fd->fd_sys = open(fd->filename, amode, perm);
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif
  DBG_FPRINTF(stderr,"open('%s',%#X,%#X) rc=%d, errno=%d\n",fd->filename,amode,perm,fd->fd_sys,errno);
  fd->fd_direct = -1;

  if (gpfsmpio_devnullio == 1) {
      fd->null_fd = open("/dev/null", O_RDWR);
  } else {
      fd->null_fd = -1;
  }

  if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
    fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

    if(fd->fd_sys != -1)
    {

        fd->blksize = 1048576; /* default to 1M */

#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_a, 0, NULL);
#endif
	/* in this fs-specific routine, we might not be called over entire
	 * communicator (deferred open).  Collect statistics on one process.
	 * ADIOI_GEN_Opencoll (common-code caller) will take care of the
	 * broadcast */

	MPI_Comm_rank(fd->comm, &rank);
	if ((rank == fd->hints->ranklist[0]) || (fd->comm == MPI_COMM_SELF)) {
	    struct stat64 gpfs_statbuf;
	    /* Get the (real) underlying file system block size */
	    rc = stat64(fd->filename, &gpfs_statbuf);
	    if (rc >= 0)
	    {
		fd->blksize = gpfs_statbuf.st_blksize;
		DBGV_FPRINTF(stderr,"Successful stat '%s'.  Blocksize=%ld\n",
			fd->filename,gpfs_statbuf.st_blksize);
	    }
	    else
	    {
		DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
			fd->filename,rc,errno);
	    }
	}
	/* all other ranks have incorrect fd->blocksize, but ADIOI_GEN_Opencoll
	 * will take care of that in both standard and deferred-open case */

#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
#endif

#ifdef HAVE_GPFS_FCNTL_H
	/* in parallel workload, might be helpful to immediately release block
	 * tokens.  Or, system call overhead will outweigh any benefits... */
	if (getenv("ROMIO_GPFS_FREE_LOCKS")!=NULL)
	    gpfs_free_all_locks(fd->fd_sys);

#endif
    }

  if (fd->fd_sys == -1)  {
      *error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
  }
  else *error_code = MPI_SUCCESS;
}
Example #9
static void ADIOI_R_Iexchange_data_recv(ADIOI_NBC_Request *nbc_req,
                                        int *error_code)
{
    ADIOI_R_Iexchange_data_vars *vars = nbc_req->data.rd.red_vars;
    ADIO_File fd = vars->fd;
    int *send_size = vars->send_size;
    int *recv_size = vars->recv_size;
    int *count = vars->count;
    int *start_pos = vars->start_pos;
    int *partial_send = vars->partial_send;
    int nprocs = vars->nprocs;
    int myrank = vars->myrank;
    ADIOI_Access *others_req = vars->others_req;
    int iter = vars->iter;
    int *buf_idx = vars->buf_idx;

    int i, j, k = 0, tmp = 0, nprocs_recv, nprocs_send;
    char **recv_buf = NULL;
    MPI_Datatype send_type;

    nprocs_recv = 0;
    for (i = 0; i < nprocs; i++) if (recv_size[i]) nprocs_recv++;
    vars->nprocs_recv = nprocs_recv;

    nprocs_send = 0;
    for (i = 0; i < nprocs; i++) if (send_size[i]) nprocs_send++;
    vars->nprocs_send = nprocs_send;

    vars->req2 = (MPI_Request *)
        ADIOI_Malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request));
    /* +1 to avoid a 0-size malloc */

    /* post recvs. if buftype_is_contig, data can be directly recd. into
       user buf at location given by buf_idx. else use recv_buf. */

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5032, 0, NULL);
#endif

    if (vars->buftype_is_contig) {
        j = 0;
        for (i = 0; i < nprocs; i++)
            if (recv_size[i]) {
                MPI_Irecv(((char *)vars->buf) + buf_idx[i], recv_size[i],
                          MPI_BYTE, i, myrank+i+100*iter, fd->comm,
                          vars->req2 + j);
                j++;
                buf_idx[i] += recv_size[i];
            }
    }
    else {
        /* allocate memory for recv_buf and post receives */
        recv_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char*));
        vars->recv_buf = recv_buf;
        for (i = 0; i < nprocs; i++)
            if (recv_size[i]) recv_buf[i] = (char *)ADIOI_Malloc(recv_size[i]);

        j = 0;
        for (i = 0; i < nprocs; i++)
            if (recv_size[i]) {
                MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i,
                          myrank+i+100*iter, fd->comm,
                          vars->req2 + j);
                j++;
#ifdef RDCOLL_DEBUG
                DBG_FPRINTF(stderr, "node %d, recv_size %d, tag %d \n",
                            myrank, recv_size[i], myrank+i+100*iter);
#endif
            }
    }
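    /* Note on tags: a receive from rank i is posted with tag myrank+i+100*iter,
     * and rank i's matching send to this rank (below) uses its own
     * myrank+dest+100*iter = i+myrank+100*iter, so the two tags are equal.
     * Hypothetical ranks: rank 2 receiving from rank 5 in iteration 1 posts
     * tag 2+5+100 = 107, exactly the tag rank 5 attaches to its send. */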

    /* create derived datatypes and send data */

    j = 0;
    for (i = 0; i < nprocs; i++) {
        if (send_size[i]) {
            /* take care if the last off-len pair is a partial send */
            if (partial_send[i]) {
                k = start_pos[i] + count[i] - 1;
                tmp = others_req[i].lens[k];
                others_req[i].lens[k] = partial_send[i];
            }
            ADIOI_Type_create_hindexed_x(count[i],
                    &(others_req[i].lens[start_pos[i]]),
                    &(others_req[i].mem_ptrs[start_pos[i]]),
                    MPI_BYTE, &send_type);
            /* absolute displacement; use MPI_BOTTOM in send */
            MPI_Type_commit(&send_type);
            MPI_Isend(MPI_BOTTOM, 1, send_type, i, myrank+i+100*iter,
                      fd->comm, vars->req2 + nprocs_recv + j);
            MPI_Type_free(&send_type);
            if (partial_send[i]) others_req[i].lens[k] = tmp;
            j++;
        }
    }

    /* wait on the receives */
    if (nprocs_recv) {
        nbc_req->data.rd.state = ADIOI_IRC_STATE_R_IEXCHANGE_DATA_RECV;
        return;
    }

    ADIOI_R_Iexchange_data_fill(nbc_req, error_code);
}
Example #10
static void ADIOI_Iread_and_exch_l1_begin(ADIOI_NBC_Request *nbc_req,
                                          int *error_code)
{
    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
    ADIO_File fd;
    int nprocs;
    ADIOI_Access *others_req;

    int i, j;
    ADIO_Offset real_off, req_off;
    char *read_buf;
    int *curr_offlen_ptr, *count, *send_size;
    int *partial_send, *start_pos;
    ADIO_Offset size, real_size, for_next_iter;
    int req_len, flag;

    ADIOI_R_Iexchange_data_vars *red_vars = NULL;

    /* loop exit condition */
    if (vars->m >= vars->ntimes) {
        ADIOI_Iread_and_exch_reset(nbc_req, error_code);
        return;
    }

    fd = vars->fd;
    nprocs = vars->nprocs;
    others_req = vars->others_req;

    read_buf = vars->read_buf;
    curr_offlen_ptr = vars->curr_offlen_ptr;
    count = vars->count;
    send_size = vars->send_size;
    partial_send = vars->partial_send;
    start_pos = vars->start_pos;

    /* read buf of size coll_bufsize (or less) */
    /* go through all others_req and check if any are satisfied
       by the current read */

    /* since MPI guarantees that displacements in filetypes are in
       monotonically nondecreasing order, I can maintain a pointer
       (curr_offlen_ptr) to
       current off-len pair for each process in others_req and scan
       further only from there. There is still a problem of filetypes
       such as:  (1, 2, 3 are not process nos. They are just numbers for
       three chunks of data, specified by a filetype.)

       1  -------!--
       2    -----!----
       3       --!-----

       where ! indicates where the current read_size limitation cuts
       through the filetype.  I resolve this by reading up to !, but
       filling the communication buffer only for 1. I copy the portion
       left over for 2 into a tmp_buf for use in the next
       iteration. i.e., 2 and 3 will be satisfied in the next
       iteration. This simplifies filling in the user's buf at the
       other end, as only one off-len pair with incomplete data
       will be sent. I also don't need to send the individual
       offsets and lens along with the data, as the data is being
       sent in a particular order. */

    /* off = start offset in the file for the data actually read in
             this iteration
       size = size of data read corresponding to off
       real_off = off minus whatever data was retained in memory from
             previous iteration for cases like 2, 3 illustrated above
       real_size = size plus the extra corresponding to real_off
       req_off = off in file for a particular contiguous request
                 minus what was satisfied in previous iteration
       req_size = size corresponding to req_off */

    size = ADIOI_MIN((unsigned)vars->coll_bufsize,
                     vars->end_loc - vars->st_loc + 1 - vars->done);
    real_off = vars->off - vars->for_curr_iter;
    real_size = size + vars->for_curr_iter;

    vars->size = size;
    vars->real_size = real_size;

    for (i = 0; i < nprocs; i++) count[i] = send_size[i] = 0;
    for_next_iter = 0;

    for (i = 0; i < nprocs; i++) {
#ifdef RDCOLL_DEBUG
        DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n",
                    vars->myrank, i, others_req[i].count);
#endif
        if (others_req[i].count) {
            start_pos[i] = curr_offlen_ptr[i];
            for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) {
                if (partial_send[i]) {
                    /* this request may have been partially
                       satisfied in the previous iteration. */
                    req_off = others_req[i].offsets[j] + partial_send[i];
                    req_len = others_req[i].lens[j] - partial_send[i];
                    partial_send[i] = 0;
                    /* modify the off-len pair to reflect this change */
                    others_req[i].offsets[j] = req_off;
                    others_req[i].lens[j] = req_len;
                }
                else {
                    req_off = others_req[i].offsets[j];
                    req_len = others_req[i].lens[j];
                }
                if (req_off < real_off + real_size) {
                    count[i]++;
                    ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf) + req_off - real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf + req_off - real_off));
                    MPI_Address(read_buf + req_off - real_off,
                                &(others_req[i].mem_ptrs[j]));
                    ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));
                    send_size[i] += (int)(ADIOI_MIN(real_off + real_size - req_off,
                                                    (ADIO_Offset)(unsigned)req_len));

                    if (real_off + real_size - req_off < (ADIO_Offset)(unsigned)req_len) {
                        partial_send[i] = (int)(real_off + real_size - req_off);
                        if ((j+1 < others_req[i].count) &&
                            (others_req[i].offsets[j+1] < real_off + real_size)) {
                            /* this is the case illustrated in the
                               figure above. */
                            for_next_iter = ADIOI_MAX(for_next_iter,
                                    real_off + real_size - others_req[i].offsets[j+1]);
                            /* max because it must cover requests
                               from different processes */
                        }
                        break;
                    }
                }
                else break;
            }
            curr_offlen_ptr[i] = j;
        }
    }
    vars->for_next_iter = for_next_iter;

    flag = 0;
    for (i = 0; i < nprocs; i++)
        if (count[i]) flag = 1;

    /* create a struct for ADIOI_R_Iexchange_data() */
    red_vars = (ADIOI_R_Iexchange_data_vars *)ADIOI_Calloc(
            1, sizeof(ADIOI_R_Iexchange_data_vars));
    nbc_req->data.rd.red_vars = red_vars;
    red_vars->fd = vars->fd;
    red_vars->buf = vars->buf;
    red_vars->flat_buf = vars->flat_buf;
    red_vars->offset_list = vars->offset_list;
    red_vars->len_list = vars->len_list;
    red_vars->send_size = vars->send_size;
    red_vars->recv_size = vars->recv_size;
    red_vars->count = vars->count;
    red_vars->start_pos = vars->start_pos;
    red_vars->partial_send = vars->partial_send;
    red_vars->recd_from_proc = vars->recd_from_proc;
    red_vars->nprocs = vars->nprocs;
    red_vars->myrank = vars->myrank;
    red_vars->buftype_is_contig = vars->buftype_is_contig;
    red_vars->contig_access_count = vars->contig_access_count;
    red_vars->min_st_offset = vars->min_st_offset;
    red_vars->fd_size = vars->fd_size;
    red_vars->fd_start = vars->fd_start;
    red_vars->fd_end = vars->fd_end;
    red_vars->others_req = vars->others_req;
    red_vars->iter = vars->m;
    red_vars->buftype_extent = vars->buftype_extent;
    red_vars->buf_idx = vars->buf_idx;
    red_vars->next_fn = ADIOI_Iread_and_exch_l1_end;

    if (flag) {
        ADIOI_Assert(size == (int)size);
        ADIO_IreadContig(fd, read_buf+vars->for_curr_iter, (int)size,
                         MPI_BYTE, ADIO_EXPLICIT_OFFSET, vars->off,
                         &vars->req2, error_code);

        nbc_req->data.rd.state = ADIOI_IRC_STATE_IREAD_AND_EXCH_L1_BEGIN;
        return;
    }

    ADIOI_R_Iexchange_data(nbc_req, error_code);
}
Example #11
/* Nonblocking version of ADIOI_GEN_ReadStridedColl() */
void ADIOI_GEN_IreadStridedColl(ADIO_File fd, void *buf, int count,
                   MPI_Datatype datatype, int file_ptr_type,
                   ADIO_Offset offset, MPI_Request *request,
                   int *error_code)
{
    /* Uses a generalized version of the extended two-phase method described
       in "An Extended Two-Phase Method for Accessing Sections of
       Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
       Scientific Programming, (5)4:301--317, Winter 1996.
       http://www.mcs.anl.gov/home/thakur/ext2ph.ps */

    ADIOI_NBC_Request *nbc_req = NULL;
    ADIOI_GEN_IreadStridedColl_vars *vars = NULL;
    int nprocs, myrank;
#ifdef RDCOLL_DEBUG
    int i;
#endif

    /* FIXME: need an implementation of ADIOI_IOIstridedColl
    if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
        ADIOI_IOIstridedColl(fd, buf, count, ADIOI_READ, datatype,
                             file_ptr_type, offset, request, error_code);
        return;
    }
    */

    /* top-level struct keeping the status of function progress */
    nbc_req = (ADIOI_NBC_Request *)ADIOI_Calloc(1, sizeof(ADIOI_NBC_Request));
    nbc_req->rdwr = ADIOI_READ;

    /* create a generalized request */
    if (ADIOI_GEN_greq_class == 0) {
        MPIX_Grequest_class_create(ADIOI_GEN_irc_query_fn,
                ADIOI_GEN_irc_free_fn, MPIU_Greq_cancel_fn,
                ADIOI_GEN_irc_poll_fn, ADIOI_GEN_irc_wait_fn,
                &ADIOI_GEN_greq_class);
    }
    MPIX_Grequest_class_allocate(ADIOI_GEN_greq_class, nbc_req, request);
    memcpy(&nbc_req->req, request, sizeof(MPI_Request));

    /* create a struct for parameters and variables */
    vars = (ADIOI_GEN_IreadStridedColl_vars *)ADIOI_Calloc(
            1, sizeof(ADIOI_GEN_IreadStridedColl_vars));
    nbc_req->data.rd.rsc_vars = vars;

    /* save the parameters */
    vars->fd = fd;
    vars->buf = buf;
    vars->count = count;
    vars->datatype = datatype;
    vars->file_ptr_type = file_ptr_type;
    vars->offset = offset;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);
    vars->nprocs = nprocs;
    vars->myrank = myrank;

    /* number of aggregators, cb_nodes, is stored in the hints */
    vars->nprocs_for_coll = fd->hints->cb_nodes;
    vars->orig_fp = fd->fp_ind;

    /* only check for interleaving if cb_read isn't disabled */
    if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
        /* For this process's request, calculate the list of offsets and
           lengths in the file and determine the start and end offsets. */

        /* Note: end_offset points to the last byte-offset that will be accessed.
           e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/

        ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
                              &vars->offset_list, &vars->len_list,
                              &vars->start_offset, &vars->end_offset,
                              &vars->contig_access_count);

#ifdef RDCOLL_DEBUG
        for (i = 0; i < vars->contig_access_count; i++) {
            DBG_FPRINTF(stderr, "rank %d  off %lld  len %lld\n",
                        myrank, vars->offset_list[i], vars->len_list[i]);
        }
#endif

        /* each process communicates its start and end offsets to other
           processes. The result is an array each of start and end offsets
           stored in order of process rank. */
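        /* Hypothetical illustration with 3 ranks: if rank 0 accesses bytes
           0..99, rank 1 bytes 50..149, and rank 2 bytes 200..299, the two
           allgathers below yield st_offsets = {0,50,200} and
           end_offsets = {99,149,299}; the overlap between ranks 0 and 1 is
           the kind of interleaving the subsequent check looks for. */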

        vars->st_offsets = (ADIO_Offset *)ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
        vars->end_offsets = (ADIO_Offset *)ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));

        *error_code = MPI_Iallgather(&vars->start_offset, 1, ADIO_OFFSET,
                                     vars->st_offsets, 1, ADIO_OFFSET,
                                     fd->comm, &vars->req_offset[0]);
        if (*error_code != MPI_SUCCESS) return;
        *error_code = MPI_Iallgather(&vars->end_offset, 1, ADIO_OFFSET,
                                     vars->end_offsets, 1, ADIO_OFFSET,
                                     fd->comm, &vars->req_offset[1]);

        nbc_req->data.rd.state = ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL;
        return;
    }

    ADIOI_GEN_IreadStridedColl_indio(nbc_req, error_code);
}
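The nonblocking variant above only posts the two MPI_Iallgather calls and records a state (ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL); the interleave check runs later, once both requests stored in vars->req_offset[] complete. As a minimal sketch (an assumed helper, not ROMIO's poll code), the test it eventually applies is the same rudimentary one used by the blocking ADIOI_GEN_ReadStridedColl shown later in this listing; ADIO_Offset is assumed to come from adio.h.

/* Hedged sketch (assumed helper, not ROMIO's poll function): the rudimentary
 * interleave test applied once both offset allgathers have completed. */
static int count_interleaved(const ADIO_Offset *st_offsets,
                             const ADIO_Offset *end_offsets, int nprocs)
{
    int i, interleave_count = 0;

    for (i = 1; i < nprocs; i++)
        if ((st_offsets[i] < end_offsets[i - 1]) &&
            (st_offsets[i] <= end_offsets[i]))
            interleave_count++;

    return interleave_count;   /* 0 => accesses are not interleaved */
}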
Example #12
0
File: flatten.c Project: ORNL/ompi
/* ADIOI_Count_contiguous_blocks
 *
 * Returns number of contiguous blocks in type, and also updates
 * curr_index to reflect the space for the additional blocks.
 *
 * ASSUMES THAT TYPE IS NOT A BASIC!!!
 */
MPI_Count ADIOI_Count_contiguous_blocks(MPI_Datatype datatype, MPI_Count *curr_index)
{
    int i, n;
    MPI_Count count=0, prev_index, num, basic_num;
    int top_count, combiner, old_combiner, old_is_contig;
    int nints, nadds, ntypes, old_nints, old_nadds, old_ntypes;
    int *ints;
    MPI_Aint *adds; /* Make no assumptions about +/- sign on these */
    MPI_Datatype *types;

    MPI_Type_get_envelope(datatype, &nints, &nadds, &ntypes, &combiner);
    ints = (int *) ADIOI_Malloc((nints+1)*sizeof(int));
    adds = (MPI_Aint *) ADIOI_Malloc((nadds+1)*sizeof(MPI_Aint));
    types = (MPI_Datatype *) ADIOI_Malloc((ntypes+1)*sizeof(MPI_Datatype));
    MPI_Type_get_contents(datatype, nints, nadds, ntypes, ints, adds, types);

    switch (combiner) {
#ifdef MPIIMPL_HAVE_MPI_COMBINER_DUP
    case MPI_COMBINER_DUP:
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
                              &old_ntypes, &old_combiner); 
	ADIOI_Datatype_iscontig(types[0], &old_is_contig);
	if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
	    count = ADIOI_Count_contiguous_blocks(types[0], curr_index);
	else {
		count = 1;
		(*curr_index)++;
	}
        break;
#endif
#ifdef MPIIMPL_HAVE_MPI_COMBINER_SUBARRAY
    case MPI_COMBINER_SUBARRAY:
        {
	    int dims = ints[0];
	    MPI_Datatype stype;

	    ADIO_Type_create_subarray(dims,
				      &ints[1],        /* sizes */
				      &ints[dims+1],   /* subsizes */
				      &ints[2*dims+1], /* starts */
				      ints[3*dims+1],  /* order */
				      types[0],        /* type */
				      &stype);
	    count = ADIOI_Count_contiguous_blocks(stype, curr_index);
	    /* curr_index will have already been updated; just pass
	     * count back up.
	     */
	    MPI_Type_free(&stype);

	}
	break;
#endif
#ifdef MPIIMPL_HAVE_MPI_COMBINER_DARRAY
    case MPI_COMBINER_DARRAY:
	{
	    int dims = ints[2];
	    MPI_Datatype dtype;

	    ADIO_Type_create_darray(ints[0],         /* size */
				    ints[1],         /* rank */
				    dims,
				    &ints[3],        /* gsizes */
				    &ints[dims+3],   /* distribs */
				    &ints[2*dims+3], /* dargs */
				    &ints[3*dims+3], /* psizes */
				    ints[4*dims+3],  /* order */
				    types[0],
				    &dtype);
	    count = ADIOI_Count_contiguous_blocks(dtype, curr_index);
	    /* curr_index will have already been updated; just pass
	     * count back up.
	     */
	    MPI_Type_free(&dtype);
	}
	break;
#endif
    case MPI_COMBINER_CONTIGUOUS:
        top_count = ints[0];
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
                              &old_ntypes, &old_combiner); 
	ADIOI_Datatype_iscontig(types[0], &old_is_contig);

	prev_index = *curr_index;
	if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
	    count = ADIOI_Count_contiguous_blocks(types[0], curr_index);
	else count = 1;

	if (prev_index == *curr_index) 
/* simplest case, made up of basic or contiguous types */
	    (*curr_index)++;
	else {
/* made up of noncontiguous derived types */
	    num = *curr_index - prev_index;
	    count *= top_count;
	    *curr_index += (top_count - 1)*num;
	}
	break;

    case MPI_COMBINER_VECTOR:
    case MPI_COMBINER_HVECTOR:
    case MPI_COMBINER_HVECTOR_INTEGER: 
        top_count = ints[0];
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
                              &old_ntypes, &old_combiner); 
	ADIOI_Datatype_iscontig(types[0], &old_is_contig);

	prev_index = *curr_index;
	if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
	    count = ADIOI_Count_contiguous_blocks(types[0], curr_index);
	else count = 1;

	if (prev_index == *curr_index) {
/* simplest case, vector of basic or contiguous types */
	    count = top_count;
	    *curr_index += count;
	}
	else {
/* vector of noncontiguous derived types */
	    num = *curr_index - prev_index;

/* The noncontiguous types have to be replicated blocklen times
   and then strided. */
	    count *= ints[1] * top_count;

/* First one */
	    *curr_index += (ints[1] - 1)*num;

/* Now repeat with strides. */
	    num = *curr_index - prev_index;
	    *curr_index += (top_count - 1)*num;
	}
	break;

    case MPI_COMBINER_INDEXED: 
    case MPI_COMBINER_HINDEXED:
    case MPI_COMBINER_HINDEXED_INTEGER:
        top_count = ints[0];
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
                              &old_ntypes, &old_combiner); 
	ADIOI_Datatype_iscontig(types[0], &old_is_contig);

	prev_index = *curr_index;
	if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
	    count = ADIOI_Count_contiguous_blocks(types[0], curr_index);
	else count = 1;

	if (prev_index == *curr_index) {
/* simplest case, indexed type made up of basic or contiguous types */
	    count = top_count;
	    *curr_index += count;
	}
	else {
/* indexed type made up of noncontiguous derived types */
	    basic_num = *curr_index - prev_index;

/* The noncontiguous types have to be replicated blocklens[i] times
   and then strided. */
	    *curr_index += (ints[1]-1) * basic_num;
	    count *= ints[1];

/* Now repeat with strides. */
	    for (i=1; i<top_count; i++) {
		count += ints[1+i] * basic_num;
		*curr_index += ints[1+i] * basic_num;
	    }
	}
	break;

#if defined HAVE_DECL_MPI_COMBINER_HINDEXED_BLOCK && HAVE_DECL_MPI_COMBINER_HINDEXED_BLOCK
    case MPI_COMBINER_HINDEXED_BLOCK:
#endif
    case MPI_COMBINER_INDEXED_BLOCK:
        top_count = ints[0];
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
                              &old_ntypes, &old_combiner); 
	ADIOI_Datatype_iscontig(types[0], &old_is_contig);

	prev_index = *curr_index;
	if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
	    count = ADIOI_Count_contiguous_blocks(types[0], curr_index);
	else count = 1;

	if (prev_index == *curr_index) {
/* simplest case, indexed type made up of basic or contiguous types */
	    count = top_count;
	    *curr_index += count;
	}
	else {
/* indexed type made up of noncontiguous derived types */
	    basic_num = *curr_index - prev_index;

/* The noncontiguous types have to be replicated blocklens[i] times
   and then strided. */
	    *curr_index += (ints[1]-1) * basic_num;
	    count *= ints[1];

/* Now repeat with strides. */
	    *curr_index += (top_count-1) * count;
	    count *= top_count;
	}
	break;

    case MPI_COMBINER_STRUCT: 
    case MPI_COMBINER_STRUCT_INTEGER: 
        top_count = ints[0];
	count = 0;
	for (n=0; n<top_count; n++) {
            MPI_Type_get_envelope(types[n], &old_nints, &old_nadds,
                                  &old_ntypes, &old_combiner); 
	    ADIOI_Datatype_iscontig(types[n], &old_is_contig);

	    prev_index = *curr_index;
	    if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
	    count += ADIOI_Count_contiguous_blocks(types[n], curr_index);

	    if (prev_index == *curr_index) {
/* simplest case, current type is basic or contiguous types */
		count++;
		(*curr_index)++;
	    }
	    else {
/* current type made up of noncontiguous derived types */
/* The current type has to be replicated blocklens[n] times */

		num = *curr_index - prev_index;
		count += (ints[1+n]-1)*num;
		(*curr_index) += (ints[1+n]-1)*num;
	    }
	}
	break;

    case MPI_COMBINER_RESIZED: 
	/* treat it as a struct with lb, type, ub */

	/* add 2 for lb and ub */
	(*curr_index) += 2;
	count += 2;

	/* add for datatype */ 
	MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
                                  &old_ntypes, &old_combiner); 
	ADIOI_Datatype_iscontig(types[0], &old_is_contig);

	if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) {
	    count += ADIOI_Count_contiguous_blocks(types[0], curr_index);
	}
	else {
        /* basic or contiguous type */
	    count++;
	    (*curr_index)++;
	}
	break;

    default:
	/* TODO: FIXME */
	DBG_FPRINTF(stderr, "Error: Unsupported datatype passed to ADIOI_Count_contiguous_blocks, combiner = %d\n", combiner);
	MPI_Abort(MPI_COMM_WORLD, 1);
    }

#ifndef MPISGI
/* There is a bug in SGI's impl. of MPI_Type_get_contents. It doesn't
   return new datatypes. Therefore no need to free. */
    for (i=0; i<ntypes; i++) {
 	MPI_Type_get_envelope(types[i], &old_nints, &old_nadds, &old_ntypes,
 			      &old_combiner);
 	if (old_combiner != MPI_COMBINER_NAMED) MPI_Type_free(types+i);
    }
#endif

    ADIOI_Free(ints);
    ADIOI_Free(adds);
    ADIOI_Free(types);
    return count;
}
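A hedged usage sketch of the counting routine above: for a vector of a basic type, the VECTOR "simplest case" branch applies and the block count equals the vector count. The example assumes MPI has been initialized and that adioi.h declares ADIOI_Count_contiguous_blocks; the function name is illustrative only.

/* Hedged usage sketch (assumes MPI_Init has been called). */
#include <stdio.h>
#include <mpi.h>

void example_count_vector_blocks(void)
{
    MPI_Datatype vec;
    MPI_Count nblocks, idx = 0;

    /* 4 blocks of 2 ints, stride 5 ints: noncontiguous, basic oldtype */
    MPI_Type_vector(4, 2, 5, MPI_INT, &vec);
    MPI_Type_commit(&vec);

    nblocks = ADIOI_Count_contiguous_blocks(vec, &idx);
    printf("contiguous blocks = %lld\n", (long long) nblocks);  /* expect 4 */

    MPI_Type_free(&vec);
}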
Example #13
0
File: flatten.c Project: ORNL/ompi
/* flatten datatype and add it to Flatlist */
void ADIOI_Flatten_datatype(MPI_Datatype datatype)
{
#ifdef HAVE_MPIR_TYPE_FLATTEN
    MPI_Aint flatten_idx;
#endif
    MPI_Count curr_index=0;
    int is_contig;
    ADIOI_Flatlist_node *flat, *prev=0;

    /* check if necessary to flatten. */
 
    /* is it entirely contiguous? */
    ADIOI_Datatype_iscontig(datatype, &is_contig);
  #ifdef FLATTEN_DEBUG 
  DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: is_contig %#X\n",is_contig);
  #endif
    if (is_contig) return;

    /* has it already been flattened? */
    flat = ADIOI_Flatlist;
    while (flat) {
	if (flat->type == datatype) {
      #ifdef FLATTEN_DEBUG 
      DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: found datatype %#X\n", datatype);
      #endif
		return;
	}
	else {
	    prev = flat;
	    flat = flat->next;
	}
    }

    /* flatten and add to the list */
    flat = prev;
    flat->next = (ADIOI_Flatlist_node *)ADIOI_Malloc(sizeof(ADIOI_Flatlist_node));
    flat = flat->next;

    flat->type = datatype;
    flat->next = NULL;
    flat->blocklens = NULL;
    flat->indices = NULL;

    flat->count = ADIOI_Count_contiguous_blocks(datatype, &curr_index);
#ifdef FLATTEN_DEBUG 
    DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: count %llX, cur_idx = %#llX\n",flat->count,curr_index);
#endif
/*    DBG_FPRINTF(stderr, "%d\n", flat->count);*/

    if (flat->count) {
	flat->blocklens = (ADIO_Offset *) ADIOI_Malloc(flat->count * sizeof(ADIO_Offset));
	flat->indices = (ADIO_Offset *) ADIOI_Malloc(flat->count * sizeof(ADIO_Offset));
    }
	
    curr_index = 0;
#ifdef HAVE_MPIR_TYPE_FLATTEN
    flatten_idx = (MPI_Aint) flat->count;
    MPIR_Type_flatten(datatype, flat->indices, flat->blocklens, &flatten_idx);
  #ifdef FLATTEN_DEBUG 
  DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: MPIR_Type_flatten\n");
  #endif
#else
    ADIOI_Flatten(datatype, flat, 0, &curr_index);
  #ifdef FLATTEN_DEBUG 
  DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: ADIOI_Flatten\n");
  #endif

    ADIOI_Optimize_flattened(flat);
#endif
/* debug */
#ifdef FLATTEN_DEBUG
    {
	int i;
	for (i=0; i<flat->count; i++) 
      DBG_FPRINTF(stderr,"ADIOI_Flatten_datatype:: i %#X, blocklens %#llX, indices %#llX\n",
              i,
              flat->blocklens[i],
              flat->indices[i]
             );
  }
#endif

}
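Once ADIOI_Flatten_datatype() has run, the flattened (offset, length) pairs live on the global ADIOI_Flatlist. Below is a minimal sketch of looking them up again, mirroring the "has it already been flattened?" loop above; the helper name is ours, not ROMIO's.

/* Hedged sketch (helper name is ours): find the flattened representation of a
 * datatype after ADIOI_Flatten_datatype() has been called on it. */
static ADIOI_Flatlist_node *lookup_flattened(MPI_Datatype datatype)
{
    ADIOI_Flatlist_node *flat = ADIOI_Flatlist;

    while (flat && flat->type != datatype)
        flat = flat->next;

    /* NULL if the type was contiguous and therefore never flattened;
     * otherwise flat->count entries in flat->indices / flat->blocklens */
    return flat;
}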
Example #14
0
File: flatten.c Project: ORNL/ompi
/* ADIOI_Flatten()
 *
 * Assumption: input datatype is not a basic!!!!
 */
void ADIOI_Flatten(MPI_Datatype datatype, ADIOI_Flatlist_node *flat, 
		  ADIO_Offset st_offset, MPI_Count *curr_index)
{
    int i, k, m, n, basic_num, nonzeroth, is_hindexed_block=0;
    int combiner, old_combiner, old_is_contig;
    int nints, nadds, ntypes, old_nints, old_nadds, old_ntypes;
    /* By using ADIO_Offset we preserve +/- sign and 
         avoid >2G integer arithmetic problems */
    ADIO_Offset top_count;
    MPI_Count j, old_size, prev_index, num;
    MPI_Aint old_extent;/* Assume extents are non-negative */
    int *ints;
    MPI_Aint *adds; /* Make no assumptions about +/- sign on these */
    MPI_Datatype *types;
    MPI_Type_get_envelope(datatype, &nints, &nadds, &ntypes, &combiner);
    ints = (int *) ADIOI_Malloc((nints+1)*sizeof(int));
    adds = (MPI_Aint *) ADIOI_Malloc((nadds+1)*sizeof(MPI_Aint));
    types = (MPI_Datatype *) ADIOI_Malloc((ntypes+1)*sizeof(MPI_Datatype));
    MPI_Type_get_contents(datatype, nints, nadds, ntypes, ints, adds, types);

  #ifdef FLATTEN_DEBUG 
  DBG_FPRINTF(stderr,"ADIOI_Flatten:: st_offset %#llX, curr_index %#llX\n",st_offset,*curr_index);
  DBG_FPRINTF(stderr,"ADIOI_Flatten:: nints %#X, nadds %#X, ntypes %#X\n",nints, nadds, ntypes);
  for(i=0; i< nints; ++i)
  {
    DBG_FPRINTF(stderr,"ADIOI_Flatten:: ints[%d]=%#X\n",i,ints[i]);
  }
  for(i=0; i< nadds; ++i)
  {
    DBG_FPRINTF(stderr,"ADIOI_Flatten:: adds[%d]="MPI_AINT_FMT_HEX_SPEC"\n",i,adds[i]);
  }
  for(i=0; i< ntypes; ++i)
  {
    DBG_FPRINTF(stderr,"ADIOI_Flatten:: types[%d]=%#llX\n",i,(unsigned long long)(unsigned long)types[i]);
  }
  #endif
  /* Chapter 4, page 83: when processing datatypes, note this item from the
   * standard:
	 Most datatype constructors have replication count or block length
	 arguments.  Allowed values are non-negative integers. If the value is
	 zero, no elements are generated in the type map and there is no effect
	 on datatype bounds or extent.  */

    switch (combiner) {
#ifdef MPIIMPL_HAVE_MPI_COMBINER_DUP
    case MPI_COMBINER_DUP:
    #ifdef FLATTEN_DEBUG 
    DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_DUP\n");
    #endif
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
			      &old_ntypes, &old_combiner); 
        ADIOI_Datatype_iscontig(types[0], &old_is_contig);
	if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
            ADIOI_Flatten(types[0], flat, st_offset, curr_index);
        break;
#endif
#ifdef MPIIMPL_HAVE_MPI_COMBINER_SUBARRAY
    case MPI_COMBINER_SUBARRAY:
        {
	    int dims = ints[0];
	    MPI_Datatype stype;
      #ifdef FLATTEN_DEBUG 
      DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_SUBARRAY\n");
      #endif

	    ADIO_Type_create_subarray(dims,
				      &ints[1],        /* sizes */
				      &ints[dims+1],   /* subsizes */
				      &ints[2*dims+1], /* starts */
				      ints[3*dims+1],  /* order */
				      types[0],        /* type */
				      &stype);
	    ADIOI_Flatten(stype, flat, st_offset, curr_index);
	    MPI_Type_free(&stype);
	}
	break;
#endif
#ifdef MPIIMPL_HAVE_MPI_COMBINER_DARRAY
    case MPI_COMBINER_DARRAY:
	{
	    int dims = ints[2];
	    MPI_Datatype dtype;
      #ifdef FLATTEN_DEBUG 
      DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_DARRAY\n");
      #endif

	    ADIO_Type_create_darray(ints[0],         /* size */
				    ints[1],         /* rank */
				    dims,
				    &ints[3],        /* gsizes */
				    &ints[dims+3],   /* distribs */
				    &ints[2*dims+3], /* dargs */
				    &ints[3*dims+3], /* psizes */
				    ints[4*dims+3],  /* order */
				    types[0],
				    &dtype);
      #ifdef FLATTEN_DEBUG 
      DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_DARRAY <ADIOI_Flatten(dtype, flat->indices[%#X] %#llX, flat->blocklens[%#X] %#llX, st_offset %#llX, curr_index %#llX);\n",
              0, flat->indices[0], 0, flat->blocklens[0], st_offset, *curr_index);
      #endif
	    ADIOI_Flatten(dtype, flat, st_offset, curr_index);
      #ifdef FLATTEN_DEBUG 
      DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_DARRAY >ADIOI_Flatten(dtype, flat->indices[%#X] %#llX, flat->blocklens[%#X] %#llX, st_offset %#llX, curr_index %#llX);\n",
              0, flat->indices[0], 0, flat->blocklens[0], st_offset, *curr_index);
      #endif
	    MPI_Type_free(&dtype);
	}
	break;
#endif
    case MPI_COMBINER_CONTIGUOUS:
    #ifdef FLATTEN_DEBUG 
    DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_CONTIGUOUS\n");
    #endif
	top_count = ints[0];
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
			      &old_ntypes, &old_combiner); 
        ADIOI_Datatype_iscontig(types[0], &old_is_contig);

	prev_index = *curr_index;
	if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
	    ADIOI_Flatten(types[0], flat, st_offset, curr_index);

	if (prev_index == *curr_index) {
/* simplest case, made up of basic or contiguous types */
	    j = *curr_index;
	    flat->indices[j] = st_offset;
	    MPI_Type_size_x(types[0], &old_size);
	    flat->blocklens[j] = top_count * old_size;
      #ifdef FLATTEN_DEBUG 
      DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",j, flat->indices[j], j, flat->blocklens[j]);
      #endif
	    (*curr_index)++;
	}
	else {
/* made up of noncontiguous derived types */
	    j = *curr_index;
	    num = *curr_index - prev_index;

/* The noncontiguous types have to be replicated count times */
	    MPI_Type_extent(types[0], &old_extent);
	    for (m=1; m<top_count; m++) {
		for (i=0; i<num; i++) {
		    flat->indices[j] = flat->indices[j-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent;
		    flat->blocklens[j] = flat->blocklens[j-num];
          #ifdef FLATTEN_DEBUG 
          DBG_FPRINTF(stderr,"ADIOI_Flatten:: derived flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",j, flat->indices[j], j, flat->blocklens[j]);
          #endif
		    j++;
		}
	    }
	    *curr_index = j;
	}
	break;

    case MPI_COMBINER_VECTOR: 
    #ifdef FLATTEN_DEBUG 
    DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_VECTOR\n");
    #endif
	top_count = ints[0];
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
			      &old_ntypes, &old_combiner); 
        ADIOI_Datatype_iscontig(types[0], &old_is_contig);

	prev_index = *curr_index;
	if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
	    ADIOI_Flatten(types[0], flat, st_offset, curr_index);

	if (prev_index == *curr_index) {
/* simplest case, vector of basic or contiguous types */
    /* By using ADIO_Offset we preserve +/- sign and 
         avoid >2G integer arithmetic problems */
    ADIO_Offset blocklength = ints[1], stride = ints[2];
	    j = *curr_index;
	    flat->indices[j] = st_offset;
	    MPI_Type_size_x(types[0], &old_size);
	    flat->blocklens[j] = blocklength * old_size;
	    for (i=j+1; i<j+top_count; i++) {
		flat->indices[i] = flat->indices[i-1] + stride * old_size;
		flat->blocklens[i] = flat->blocklens[j];
	    }
	    *curr_index = i;
	}
	else {
/* vector of noncontiguous derived types */
    /* By using ADIO_Offset we preserve +/- sign and 
         avoid >2G integer arithmetic problems */
    ADIO_Offset blocklength = ints[1], stride = ints[2];

	    j = *curr_index;
	    num = *curr_index - prev_index;

/* The noncontiguous types have to be replicated blocklen times
   and then strided. Replicate the first one. */
	    MPI_Type_extent(types[0], &old_extent);
	    for (m=1; m<blocklength; m++) {
		for (i=0; i<num; i++) {
		    flat->indices[j] = flat->indices[j-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent;
		    flat->blocklens[j] = flat->blocklens[j-num];
		    j++;
		}
	    }
	    *curr_index = j;

/* Now repeat with strides. */
	    num = *curr_index - prev_index;
	    for (i=1; i<top_count; i++) {
 		for (m=0; m<num; m++) {
		   flat->indices[j] =  flat->indices[j-num] + stride * ADIOI_AINT_CAST_TO_OFFSET old_extent;
		   flat->blocklens[j] = flat->blocklens[j-num];
		   j++;
		}
	    }
	    *curr_index = j;
	}
	break;

    case MPI_COMBINER_HVECTOR: 
    case MPI_COMBINER_HVECTOR_INTEGER: 
    #ifdef FLATTEN_DEBUG 
    DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_HVECTOR_INTEGER\n");
    #endif
	top_count = ints[0];
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
			      &old_ntypes, &old_combiner); 
        ADIOI_Datatype_iscontig(types[0], &old_is_contig);

	prev_index = *curr_index;
	if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
	    ADIOI_Flatten(types[0], flat, st_offset, curr_index);

	if (prev_index == *curr_index) {
/* simplest case, vector of basic or contiguous types */
    /* By using ADIO_Offset we preserve +/- sign and 
         avoid >2G integer arithmetic problems */
    ADIO_Offset blocklength = ints[1];
	    j = *curr_index;
	    flat->indices[j] = st_offset;
	    MPI_Type_size_x(types[0], &old_size);
	    flat->blocklens[j] = blocklength * old_size;
	    for (i=j+1; i<j+top_count; i++) {
		flat->indices[i] = flat->indices[i-1] + adds[0];
		flat->blocklens[i] = flat->blocklens[j];
	    }
	    *curr_index = i;
	}
	else {
/* vector of noncontiguous derived types */
    /* By using ADIO_Offset we preserve +/- sign and 
         avoid >2G integer arithmetic problems */
    ADIO_Offset blocklength = ints[1];

	    j = *curr_index;
	    num = *curr_index - prev_index;

/* The noncontiguous types have to be replicated blocklen times
   and then strided. Replicate the first one. */
	    MPI_Type_extent(types[0], &old_extent);
	    for (m=1; m<blocklength; m++) {
		for (i=0; i<num; i++) {
		    flat->indices[j] = flat->indices[j-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent;
		    flat->blocklens[j] = flat->blocklens[j-num];
		    j++;
		}
	    }
	    *curr_index = j;

/* Now repeat with strides. */
	    num = *curr_index - prev_index;
	    for (i=1; i<top_count; i++) {
 		for (m=0; m<num; m++) {
		   flat->indices[j] =  flat->indices[j-num] + adds[0];
		   flat->blocklens[j] = flat->blocklens[j-num];
		   j++;
		}
	    }
	    *curr_index = j;
	}
	break;

    case MPI_COMBINER_INDEXED: 
    #ifdef FLATTEN_DEBUG 
    DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_INDEXED\n");
    #endif
	top_count = ints[0];
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
			      &old_ntypes, &old_combiner); 
        ADIOI_Datatype_iscontig(types[0], &old_is_contig);
	MPI_Type_extent(types[0], &old_extent);

	prev_index = *curr_index;
	if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
  {
    /* By using ADIO_Offset we preserve +/- sign and 
         avoid >2G integer arithmetic problems */
    ADIO_Offset stride = ints[top_count+1];
        ADIOI_Flatten(types[0], flat,
         st_offset+stride* ADIOI_AINT_CAST_TO_OFFSET old_extent, curr_index);
  }

	if (prev_index == *curr_index) {
/* simplest case, indexed type made up of basic or contiguous types */
	    j = *curr_index;
	    for (i=j, nonzeroth=i; i<j+top_count; i++) {
    /* By using ADIO_Offset we preserve +/- sign and 
         avoid >2G integer arithmetic problems */
    ADIO_Offset blocklength = ints[1+i-j], stride = ints[top_count+1+i-j];
		if (blocklength > 0) {
		    flat->indices[nonzeroth] =
			st_offset + stride* ADIOI_AINT_CAST_TO_OFFSET old_extent;
		    flat->blocklens[nonzeroth] =
			blocklength* ADIOI_AINT_CAST_TO_OFFSET old_extent;
		    nonzeroth++;
		} else {
		    flat->count--; /* don't count/consider any zero-length blocklens */
		}
	    }
	    *curr_index = i;
	}
	else {
/* indexed type made up of noncontiguous derived types */

	    j = *curr_index;
	    num = *curr_index - prev_index;
	    basic_num = num;

/* The noncontiguous types have to be replicated blocklens[i] times
   and then strided. Replicate the first one. */
	    for (m=1; m<ints[1]; m++) {
		for (i=0, nonzeroth = j; i<num; i++) {
		    if (flat->blocklens[j-num] > 0) {
			flat->indices[nonzeroth] =
			    flat->indices[nonzeroth-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent;
			flat->blocklens[nonzeroth] =
			    flat->blocklens[nonzeroth-num];
			j++;
			nonzeroth++;
		    } else {
			flat->count --;
		    }
		}
	    }
	    *curr_index = j;

/* Now repeat with strides. */
	    for (i=1; i<top_count; i++) {
		num = *curr_index - prev_index;
		prev_index = *curr_index;
		for (m=0, nonzeroth=j; m<basic_num; m++) {
      /* By using ADIO_Offset we preserve +/- sign and 
         avoid >2G integer arithmetic problems */
      ADIO_Offset stride = ints[top_count+1+i]-ints[top_count+i];
		    if (flat->blocklens[j-num] > 0 ) {
			flat->indices[nonzeroth] =
			    flat->indices[j-num] + stride* ADIOI_AINT_CAST_TO_OFFSET old_extent;
			flat->blocklens[nonzeroth] = flat->blocklens[j-num];
			j++;
			nonzeroth++;
		    } else {
			flat->count--;
		    }
		}
		*curr_index = j;
		for (m=1; m<ints[1+i]; m++) {
                    for (k=0, nonzeroth=j; k<basic_num; k++) {
			if (flat->blocklens[j-basic_num] > 0) {
			    flat->indices[nonzeroth] =
				flat->indices[j-basic_num] + ADIOI_AINT_CAST_TO_OFFSET old_extent;
			    flat->blocklens[nonzeroth] = flat->blocklens[j-basic_num];
			    j++;
			    nonzeroth++;
			} else {
			    flat->count --;
			}
                    }
                }
		*curr_index = j;
	    }
	}
	break;

#if defined HAVE_DECL_MPI_COMBINER_HINDEXED_BLOCK && HAVE_DECL_MPI_COMBINER_HINDEXED_BLOCK
    case MPI_COMBINER_HINDEXED_BLOCK:
	is_hindexed_block=1;
	/* deliberate fall-through */
#endif
    case MPI_COMBINER_INDEXED_BLOCK:
    #ifdef FLATTEN_DEBUG 
    DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_INDEXED_BLOCK\n");
    #endif
	top_count = ints[0];
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
			      &old_ntypes, &old_combiner); 
        ADIOI_Datatype_iscontig(types[0], &old_is_contig);
	MPI_Type_extent(types[0], &old_extent);

	prev_index = *curr_index;
	if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
  {
      /* By using ADIO_Offset we preserve +/- sign and 
         avoid >2G integer arithmetic problems */
      ADIO_Offset stride = ints[1+1];
	if (is_hindexed_block) {
	    ADIOI_Flatten(types[0], flat,
		    st_offset+adds[0], curr_index);
	} else {
	    ADIOI_Flatten(types[0], flat,
		    st_offset+stride* ADIOI_AINT_CAST_TO_OFFSET old_extent, curr_index);
	}
  }

	if (prev_index == *curr_index) {
/* simplest case, indexed type made up of basic or contiguous types */
	    j = *curr_index;
	    for (i=j; i<j+top_count; i++) {
      /* By using ADIO_Offset we preserve +/- sign and 
         avoid >2G integer arithmetic problems */
		ADIO_Offset blocklength = ints[1];
		if (is_hindexed_block) {
		    flat->indices[i] = st_offset + adds[i-j];
		} else {
		    ADIO_Offset stride = ints[1+1+i-j];
		    flat->indices[i] = st_offset +
			stride* ADIOI_AINT_CAST_TO_OFFSET old_extent;
		}
		flat->blocklens[i] = blocklength* ADIOI_AINT_CAST_TO_OFFSET old_extent;
	    }
	    *curr_index = i;
	}
	else {
/* vector of noncontiguous derived types */

	    j = *curr_index;
	    num = *curr_index - prev_index;

/* The noncontiguous types have to be replicated blocklens[i] times
   and then strided. Replicate the first one. */
	    for (m=1; m<ints[1]; m++) {
		for (i=0; i<num; i++) {
		    if (is_hindexed_block) {
			/* this is the one place the hindexed case uses the
			 * extent of a type */
			MPI_Type_extent(types[0], &old_extent);
		    }
		    flat->indices[j] = flat->indices[j-num] +
			ADIOI_AINT_CAST_TO_OFFSET old_extent;
		    flat->blocklens[j] = flat->blocklens[j-num];
		    j++;
		}
	    }
	    *curr_index = j;

/* Now repeat with strides. */
	    num = *curr_index - prev_index;
	    for (i=1; i<top_count; i++) {
		for (m=0; m<num; m++) {
		    if (is_hindexed_block) {
			flat->indices[j] = flat->indices[j-num] +
			    adds[i] - adds[i-1];
		    } else {
			/* By using ADIO_Offset we preserve +/- sign and
			   avoid >2G integer arithmetic problems */
			ADIO_Offset stride = ints[2+i]-ints[1+i];
			flat->indices[j] = flat->indices[j-num] +
			    stride* ADIOI_AINT_CAST_TO_OFFSET old_extent;
		    }
		    flat->blocklens[j] = flat->blocklens[j-num];
		    j++;
		}
	    }
	    *curr_index = j;
	}
	break;

    case MPI_COMBINER_HINDEXED: 
    case MPI_COMBINER_HINDEXED_INTEGER:
    #ifdef FLATTEN_DEBUG 
    DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_HINDEXED_INTEGER\n");
    #endif
	top_count = ints[0];
        MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
			      &old_ntypes, &old_combiner); 
        ADIOI_Datatype_iscontig(types[0], &old_is_contig);

	prev_index = *curr_index;
	if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
  {
        ADIOI_Flatten(types[0], flat, st_offset+adds[0], curr_index); 
  }

	if (prev_index == *curr_index) {
/* simplest case, indexed type made up of basic or contiguous types */
	    j = *curr_index;
	    MPI_Type_size_x(types[0], &old_size);
	    for (i=j, nonzeroth=j; i<j+top_count; i++) {
		if (ints[1+i-j] > 0) {
		    /* By using ADIO_Offset we preserve +/- sign and
		       avoid >2G integer arithmetic problems */
		    ADIO_Offset blocklength = ints[1+i-j];
		    flat->indices[nonzeroth] = st_offset + adds[i-j];
		    flat->blocklens[nonzeroth] = blocklength*old_size;
		    nonzeroth++;
		} else {
		    flat->count--;
		}
	    }
	    *curr_index = i;
	}
	else {
/* indexed type made up of noncontiguous derived types */

	    j = *curr_index;
	    num = *curr_index - prev_index;
	    basic_num = num;

/* The noncontiguous types have to be replicated blocklens[i] times
   and then strided. Replicate the first one. */
	    MPI_Type_extent(types[0], &old_extent);
	    for (m=1; m<ints[1]; m++) {
		for (i=0, nonzeroth=j; i<num; i++) {
		    if (flat->blocklens[j-num] > 0) {
			flat->indices[nonzeroth] =
			    flat->indices[j-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent;
			flat->blocklens[nonzeroth] = flat->blocklens[j-num];
			j++;
			nonzeroth++;
		    } else {
			flat->count--;
		    }
		}
	    }
	    *curr_index = j;

/* Now repeat with strides. */
	    for (i=1; i<top_count; i++) {
		num = *curr_index - prev_index;
		prev_index = *curr_index;
		for (m=0, nonzeroth=j; m<basic_num; m++) {
		    if (flat->blocklens[j-num] > 0) {
			flat->indices[nonzeroth] =
			    flat->indices[j-num] + adds[i] - adds[i-1];
			flat->blocklens[nonzeroth] = flat->blocklens[j-num];
			j++;
			nonzeroth++;
		    } else {
			flat->count--;
		    }
		}
		*curr_index = j;
		for (m=1; m<ints[1+i]; m++) {
		    for (k=0,nonzeroth=j; k<basic_num; k++) {
			if (flat->blocklens[j-basic_num] >0) {
			    flat->indices[nonzeroth] =
				flat->indices[j-basic_num] + ADIOI_AINT_CAST_TO_OFFSET old_extent;
			    flat->blocklens[nonzeroth] = flat->blocklens[j-basic_num];
			    j++;
			    nonzeroth++;
			}
		    }
		}
		*curr_index = j;
	    }
	}
	break;

    case MPI_COMBINER_STRUCT: 
    case MPI_COMBINER_STRUCT_INTEGER: 
    #ifdef FLATTEN_DEBUG 
    DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_STRUCT_INTEGER\n");
    #endif
	top_count = ints[0];
	for (n=0; n<top_count; n++) {
	    MPI_Type_get_envelope(types[n], &old_nints, &old_nadds,
				  &old_ntypes, &old_combiner); 
            ADIOI_Datatype_iscontig(types[n], &old_is_contig);

	    prev_index = *curr_index;
            if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig))
		ADIOI_Flatten(types[n], flat, st_offset+adds[n], curr_index);

	    if (prev_index == *curr_index) {
/* simplest case, current type is basic or contiguous types */
        /* By using ADIO_Offset we preserve +/- sign and 
           avoid >2G integer arithmetic problems */
		if (ints[1+n] > 0 || types[n] == MPI_LB || types[n] == MPI_UB) {
		    ADIO_Offset blocklength = ints[1+n];
		    j = *curr_index;
		    flat->indices[j] = st_offset + adds[n];
		    MPI_Type_size_x(types[n], &old_size);
		    flat->blocklens[j] = blocklength * old_size;
#ifdef FLATTEN_DEBUG
		    DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple adds[%#X] "MPI_AINT_FMT_HEX_SPEC", flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",n,adds[n],j, flat->indices[j], j, flat->blocklens[j]);
#endif
		    (*curr_index)++;
		}
	    }
	    else {
/* current type made up of noncontiguous derived types */

		j = *curr_index;
		num = *curr_index - prev_index;

/* The current type has to be replicated blocklens[n] times */
		MPI_Type_extent(types[n], &old_extent);
		for (m=1; m<ints[1+n]; m++) {
		    for (i=0; i<num; i++) {
			flat->indices[j] =
			    flat->indices[j-num] + ADIOI_AINT_CAST_TO_OFFSET old_extent;
			flat->blocklens[j] = flat->blocklens[j-num];
#ifdef FLATTEN_DEBUG
			DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple old_extent "MPI_AINT_FMT_HEX_SPEC", flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",old_extent,j, flat->indices[j], j, flat->blocklens[j]);
#endif
			j++;
		    }
		}
		*curr_index = j;
	    }
	}
 	break;

    case MPI_COMBINER_RESIZED: 
    #ifdef FLATTEN_DEBUG 
    DBG_FPRINTF(stderr,"ADIOI_Flatten:: MPI_COMBINER_RESIZED\n");
    #endif

    /* This is done similar to a type_struct with an lb, datatype, ub */

    /* handle the Lb */
	j = *curr_index;
	flat->indices[j] = st_offset + adds[0];
	/* this zero-length blocklens[] element, unlike elsewhere in the
	 * flattening code, is correct and is used to indicate a lower-bound
	 * marker */
	flat->blocklens[j] = 0;

        #ifdef FLATTEN_DEBUG 
        DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple adds[%#X] "MPI_AINT_FMT_HEX_SPEC", flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",0,adds[0],j, flat->indices[j], j, flat->blocklens[j]);
        #endif

	(*curr_index)++;

	/* handle the datatype */

	MPI_Type_get_envelope(types[0], &old_nints, &old_nadds,
			      &old_ntypes, &old_combiner); 
	ADIOI_Datatype_iscontig(types[0], &old_is_contig);

	if ((old_combiner != MPI_COMBINER_NAMED) && (!old_is_contig)) {
	    ADIOI_Flatten(types[0], flat, st_offset+adds[0], curr_index);
	}
	else {
            /* current type is basic or contiguous */
	    j = *curr_index;
	    flat->indices[j] = st_offset;
	    MPI_Type_size_x(types[0], &old_size);
	    flat->blocklens[j] = old_size;

            #ifdef FLATTEN_DEBUG 
	    DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple adds[%#X] "MPI_AINT_FMT_HEX_SPEC", flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",0,adds[0],j, flat->indices[j], j, flat->blocklens[j]);
            #endif

	    (*curr_index)++;
	}

	/* take care of the extent as a UB */
	j = *curr_index;
	flat->indices[j] = st_offset + adds[0] + adds[1];
	/* again, zero-element ok: an upper-bound marker explicitly set by the
	 * constructor of this resized type */
	flat->blocklens[j] = 0;

        #ifdef FLATTEN_DEBUG 
        DBG_FPRINTF(stderr,"ADIOI_Flatten:: simple adds[%#X] "MPI_AINT_FMT_HEX_SPEC", flat->indices[%#llX] %#llX, flat->blocklens[%#llX] %#llX\n",1,adds[1],j, flat->indices[j], j, flat->blocklens[j]);
        #endif

	(*curr_index)++;

 	break;

    default:
	/* TODO: FIXME (requires changing prototypes to return errors...) */
	DBG_FPRINTF(stderr, "Error: Unsupported datatype passed to ADIOI_Flatten\n");
	MPI_Abort(MPI_COMM_WORLD, 1);
    }

#ifndef MPISGI
/* There is a bug in SGI's impl. of MPI_Type_get_contents. It doesn't
   return new datatypes. Therefore no need to free. */
    for (i=0; i<ntypes; i++) {
 	MPI_Type_get_envelope(types[i], &old_nints, &old_nadds, &old_ntypes,
 			      &old_combiner);
 	if (old_combiner != MPI_COMBINER_NAMED) MPI_Type_free(types+i);
    }
#endif

    ADIOI_Free(ints);
    ADIOI_Free(adds);
    ADIOI_Free(types);

  #ifdef FLATTEN_DEBUG 
  DBG_FPRINTF(stderr,"ADIOI_Flatten:: return st_offset %#llX, curr_index %#llX\n",st_offset,*curr_index);
  #endif

}
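A hedged worked example of what the MPI_COMBINER_VECTOR branch above produces, assuming a 4-byte MPI_INT and an initialized MPI environment; the function name is illustrative only.

#include <mpi.h>

void example_flatten_vector(void)
{
    MPI_Datatype vec;

    /* 3 blocks of 2 ints with stride 4 ints */
    MPI_Type_vector(3, 2, 4, MPI_INT, &vec);
    MPI_Type_commit(&vec);

    ADIOI_Flatten_datatype(vec);
    /* Per the vector "simplest case" branch above, the expected flattened
     * form is 3 byte ranges:
     *     blocklens = { 8, 8, 8 }      (2 ints * 4 bytes)
     *     indices   = { 0, 16, 32 }    (stride of 4 ints * 4 bytes)       */

    MPI_Type_free(&vec);
}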
Example #15
0
void ADIOI_GEN_ReadStridedColl(ADIO_File fd, void *buf, int count,
			       MPI_Datatype datatype, int file_ptr_type,
			       ADIO_Offset offset, ADIO_Status *status, int
			       *error_code)
{
/* Uses a generalized version of the extended two-phase method described
   in "An Extended Two-Phase Method for Accessing Sections of 
   Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
   Scientific Programming, (5)4:301--317, Winter 1996. 
   http://www.mcs.anl.gov/home/thakur/ext2ph.ps */

    ADIOI_Access *my_req; 
    /* array of nprocs structures, one for each other process in
       whose file domain this process's request lies */
    
    ADIOI_Access *others_req;
    /* array of nprocs structures, one for each other process
       whose request lies in this process's file domain. */

    int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank;
    int contig_access_count=0, interleave_count = 0, buftype_is_contig;
    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
    ADIO_Offset start_offset, end_offset, orig_fp, fd_size, min_st_offset, off;
    ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
	*fd_end = NULL, *end_offsets = NULL;
    ADIO_Offset *len_list = NULL;
    int *buf_idx = NULL;

#ifdef HAVE_STATUS_SET_BYTES
    MPI_Count bufsize, size;
#endif

    if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
        ADIOI_IOStridedColl (fd, buf, count, ADIOI_READ, datatype, 
			file_ptr_type, offset, status, error_code);
        return;
    }


    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);

    /* number of aggregators, cb_nodes, is stored in the hints */
    nprocs_for_coll = fd->hints->cb_nodes;
    orig_fp = fd->fp_ind;

    /* only check for interleaving if cb_read isn't disabled */
    if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
    /* For this process's request, calculate the list of offsets and
       lengths in the file and determine the start and end offsets. */

    /* Note: end_offset points to the last byte-offset that will be accessed.
       e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/

	ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
			      &offset_list, &len_list, &start_offset,
			      &end_offset, &contig_access_count); 
    
#ifdef RDCOLL_DEBUG
    for (i=0; i<contig_access_count; i++) {
	      DBG_FPRINTF(stderr, "rank %d  off %lld  len %lld\n", 
			      myrank, offset_list[i], len_list[i]);
	      }
#endif

	/* each process communicates its start and end offsets to other 
	   processes. The result is an array each of start and end offsets
	   stored in order of process rank. */ 
    
	st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
	end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));

	MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
		      ADIO_OFFSET, fd->comm);
	MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
		      ADIO_OFFSET, fd->comm);

	/* are the accesses of different processes interleaved? */
	for (i=1; i<nprocs; i++)
	    if ((st_offsets[i] < end_offsets[i-1]) && 
                (st_offsets[i] <= end_offsets[i]))
                interleave_count++;
	/* This is a rudimentary check for interleaving, but should suffice
	   for the moment. */
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);

    if (fd->hints->cb_read == ADIOI_HINT_DISABLE
	|| (!interleave_count && (fd->hints->cb_read == ADIOI_HINT_AUTO))) 
    {
	/* don't do aggregation */
	if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
	    ADIOI_Free(offset_list);
	    ADIOI_Free(len_list);
	    ADIOI_Free(st_offsets);
	    ADIOI_Free(end_offsets);
	}

	fd->fp_ind = orig_fp;
	ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

	if (buftype_is_contig && filetype_is_contig) {
	    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
		off = fd->disp + (fd->etype_size) * offset;
		ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                       off, status, error_code);
	    }
	    else ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                       0, status, error_code);
	}
	else ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type,
                       offset, status, error_code);

	return;
    }

    /* We're going to perform aggregation of I/O.  Here we call
     * ADIOI_Calc_file_domains() to determine what processes will handle I/O
     * to what regions.  We pass nprocs_for_coll into this function; it is
     * used to determine how many processes will perform I/O, which is also
     * the number of regions into which the range of bytes must be divided.
     * These regions are called "file domains", or FDs.
     *
     * When this function returns, fd_start, fd_end, fd_size, and
     * min_st_offset will be filled in.  fd_start holds the starting byte
     * location for each file domain.  fd_end holds the ending byte location.
     * min_st_offset holds the minimum byte location that will be accessed.
     *
     * Both fd_start[] and fd_end[] are indexed by an aggregator number; this
     * needs to be mapped to an actual rank in the communicator later.
     *
     */
    ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs,
			    nprocs_for_coll, &min_st_offset,
			    &fd_start, &fd_end, 
			    fd->hints->min_fdomain_size, &fd_size,
			    fd->hints->striping_unit);

    /* calculate where the portions of the access requests of this process 
     * are located in terms of the file domains.  this could be on the same
     * process or on other processes.  this function fills in:
     * count_my_req_procs - number of processes (including this one) for which
     *     this process has requests in their file domain
     * count_my_req_per_proc - count of requests for each process, indexed
     *     by rank of the process
     * my_req[] - array of data structures describing the requests to be
     *     performed by each process (including self).  indexed by rank.
     * buf_idx[] - array of locations into which data can be directly moved;
     *     this is only valid for contiguous buffer case
     */
    ADIOI_Calc_my_req(fd, offset_list, len_list, contig_access_count,
		      min_st_offset, fd_start, fd_end, fd_size,
		      nprocs, &count_my_req_procs, 
		      &count_my_req_per_proc, &my_req,
		      &buf_idx);

    /* perform a collective communication in order to distribute the
     * data calculated above.  fills in the following:
     * count_others_req_procs - number of processes (including this
     *     one) which have requests in this process's file domain.
     * count_others_req_per_proc[] - number of separate contiguous
     *     requests from proc i lie in this process's file domain.
     */
    ADIOI_Calc_others_req(fd, count_my_req_procs, 
			  count_my_req_per_proc, my_req, 
			  nprocs, myrank, &count_others_req_procs, 
			  &others_req); 

    /* my_req[] and count_my_req_per_proc aren't needed at this point, so 
     * let's free the memory 
     */
    ADIOI_Free(count_my_req_per_proc);
    for (i=0; i<nprocs; i++) {
	if (my_req[i].count) {
	    ADIOI_Free(my_req[i].offsets);
	    ADIOI_Free(my_req[i].lens);
	}
    }
    ADIOI_Free(my_req);


    /* read data in sizes of no more than ADIOI_Coll_bufsize, 
     * communicate, and fill user buf. 
     */
    ADIOI_Read_and_exch(fd, buf, datatype, nprocs, myrank,
                        others_req, offset_list,
			len_list, contig_access_count, min_st_offset,
			fd_size, fd_start, fd_end, buf_idx, error_code);

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);

    /* free all memory allocated for collective I/O */
    for (i=0; i<nprocs; i++) {
	if (others_req[i].count) {
	    ADIOI_Free(others_req[i].offsets);
	    ADIOI_Free(others_req[i].lens);
	    ADIOI_Free(others_req[i].mem_ptrs);
	}
    }
    ADIOI_Free(others_req);

    ADIOI_Free(buf_idx);
    ADIOI_Free(offset_list);
    ADIOI_Free(len_list);
    ADIOI_Free(st_offsets);
    ADIOI_Free(end_offsets);
    ADIOI_Free(fd_start);
    ADIOI_Free(fd_end);

#ifdef HAVE_STATUS_SET_BYTES
    MPI_Type_size_x(datatype, &size);
    bufsize = size * count;
    MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to 
   keep track of how much data was actually read and placed in buf 
   during collective I/O. */
#endif

    fd->fp_sys_posn = -1;   /* set it to null. */
}
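For reference, a minimal sketch of the even split that ADIOI_Calc_file_domains() performs in its simplest case, ignoring the min_fdomain_size and striping_unit adjustments: the byte range [min_st_offset, max_end_offset] is cut into nprocs_for_coll nearly equal pieces. The helper and the simplification are ours, not the actual ROMIO routine.

/* Hedged sketch (assumed helper): the simplest even split over aggregators. */
static void simple_calc_file_domains(ADIO_Offset min_st_offset,
                                     ADIO_Offset max_end_offset,
                                     int nprocs_for_coll,
                                     ADIO_Offset *fd_start,
                                     ADIO_Offset *fd_end)
{
    int i;
    /* ceiling of (range / nprocs_for_coll), where range = max - min + 1 */
    ADIO_Offset fd_size =
        (max_end_offset - min_st_offset + nprocs_for_coll) / nprocs_for_coll;

    for (i = 0; i < nprocs_for_coll; i++) {
        fd_start[i] = min_st_offset + i * fd_size;
        fd_end[i]   = fd_start[i] + fd_size - 1;
    }
    /* the last fd_end may extend past max_end_offset; no request reaches
     * beyond it, so this is harmless for the sketch */
}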
Example #16
0
static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
			 *flat_buf, ADIO_Offset *offset_list, ADIO_Offset
                         *len_list, int *send_size, int *recv_size,
			 int *count, int *start_pos, int *partial_send, 
			 int *recd_from_proc, int nprocs, 
			 int myrank, int
			 buftype_is_contig, int contig_access_count,
			 ADIO_Offset min_st_offset, ADIO_Offset fd_size,
			 ADIO_Offset *fd_start, ADIO_Offset *fd_end, 
			 ADIOI_Access *others_req, 
                         int iter, MPI_Aint buftype_extent, int *buf_idx)
{
    int i, j, k=0, tmp=0, nprocs_recv, nprocs_send;
    char **recv_buf = NULL; 
    MPI_Request *requests;
    MPI_Datatype send_type;
    MPI_Status *statuses;

/* exchange send_size info so that each process knows how much to
   receive from whom and how much memory to allocate. */

    MPI_Alltoall(send_size, 1, MPI_INT, recv_size, 1, MPI_INT, fd->comm);

    nprocs_recv = 0;
    for (i=0; i < nprocs; i++) if (recv_size[i]) nprocs_recv++;

    nprocs_send = 0;
    for (i=0; i<nprocs; i++) if (send_size[i]) nprocs_send++;

    requests = (MPI_Request *)
	ADIOI_Malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request));
/* +1 to avoid a 0-size malloc */

/* post recvs. if buftype_is_contig, data can be directly recd. into
   user buf at location given by buf_idx. else use recv_buf. */

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5032, 0, NULL);
#endif

    if (buftype_is_contig) {
	j = 0;
	for (i=0; i < nprocs; i++) 
	    if (recv_size[i]) {
		MPI_Irecv(((char *) buf) + buf_idx[i], recv_size[i], 
		  MPI_BYTE, i, myrank+i+100*iter, fd->comm, requests+j);
		j++;
		buf_idx[i] += recv_size[i];
	    }
    }
    else {
/* allocate memory for recv_buf and post receives */
	recv_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char*));
	for (i=0; i < nprocs; i++) 
	    if (recv_size[i]) recv_buf[i] = 
                                  (char *) ADIOI_Malloc(recv_size[i]);

	j = 0;
	for (i=0; i < nprocs; i++) 
	    if (recv_size[i]) {
		MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i, 
			  myrank+i+100*iter, fd->comm, requests+j);
		j++;
#ifdef RDCOLL_DEBUG
		DBG_FPRINTF(stderr, "node %d, recv_size %d, tag %d \n", 
			    myrank, recv_size[i], myrank+i+100*iter); 
#endif
	    }
    }

/* create derived datatypes and send data */

    j = 0;
    for (i=0; i<nprocs; i++) {
	if (send_size[i]) {
/* take care if the last off-len pair is a partial send */
	    if (partial_send[i]) {
		k = start_pos[i] + count[i] - 1;
		tmp = others_req[i].lens[k];
		others_req[i].lens[k] = partial_send[i];
	    }
	    ADIOI_Type_create_hindexed_x(count[i],
		  &(others_req[i].lens[start_pos[i]]),
	            &(others_req[i].mem_ptrs[start_pos[i]]), 
			 MPI_BYTE, &send_type);
	    /* absolute displacement; use MPI_BOTTOM in send */
	    MPI_Type_commit(&send_type);
	    MPI_Isend(MPI_BOTTOM, 1, send_type, i, myrank+i+100*iter,
		      fd->comm, requests+nprocs_recv+j);
	    MPI_Type_free(&send_type);
	    if (partial_send[i]) others_req[i].lens[k] = tmp;
	    j++;
	}
    }

    statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send+nprocs_recv+1) * \
                                     sizeof(MPI_Status)); 
     /* +1 to avoid a 0-size malloc */

    /* wait on the receives */
    if (nprocs_recv) {
#ifdef NEEDS_MPI_TEST
	j = 0;
	while (!j) MPI_Testall(nprocs_recv, requests, &j, statuses);
#else
	MPI_Waitall(nprocs_recv, requests, statuses);
#endif

	/* if noncontiguous, do the copies from the recv buffers */
	if (!buftype_is_contig) 
	    ADIOI_Fill_user_buffer(fd, buf, flat_buf, recv_buf,
				   offset_list, len_list, (unsigned*)recv_size, 
				   requests, statuses, recd_from_proc, 
				   nprocs, contig_access_count,
				   min_st_offset, fd_size, fd_start, fd_end,
				   buftype_extent);
    }

    /* wait on the sends*/
    MPI_Waitall(nprocs_send, requests+nprocs_recv, statuses+nprocs_recv);

    ADIOI_Free(statuses);
    ADIOI_Free(requests);

    if (!buftype_is_contig) {
	for (i=0; i < nprocs; i++) 
	    if (recv_size[i]) ADIOI_Free(recv_buf[i]);
	ADIOI_Free(recv_buf);
    }
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5033, 0, NULL);
#endif
}
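The message tags used above deserve a note: both sides compute "own rank + peer rank + 100*iter", which is symmetric in the two ranks, so each Isend matches the corresponding Irecv for that pair and iteration. A small illustrative helper (hypothetical, not in ROMIO):

static int exch_tag(int my_rank, int peer_rank, int iter)
{
    return my_rank + peer_rank + 100 * iter;
}

/* receiver r posts: MPI_Irecv(..., s, exch_tag(r, s, iter), comm, &req);
 * sender   s posts: MPI_Isend(..., r, exch_tag(s, r, iter), comm, &req);
 * exch_tag(r, s, iter) == exch_tag(s, r, iter), so each pair matches. */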
Example #17
0
/*
 * Compute a dynamic, access-range-based file domain partition among I/O
 * aggregators, aligned to the GPFS block size.
 * Divide the I/O workload among "nprocs_for_coll" processes. This is
 * done by (logically) dividing the file into file domains (FDs); each
 * process may directly access only its own file domain.
 * Additional effort is made to ensure that each I/O aggregator gets
 * a file domain that aligns to the GPFS block size, so there is no
 * false sharing of GPFS file blocks among multiple I/O nodes.
 *
 * The common version of this now accepts a min_fd_size and striping_unit.
 * It doesn't seem necessary here (using GPFS block sizes) but keep it in mind
 * (e.g. we could pass the striping unit instead of using fs_ptr->blksize).
 */
void ADIOI_BGL_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
                                      ADIO_Offset *end_offsets,
                                      int          nprocs,
                                      int          nprocs_for_coll,
                                      ADIO_Offset *min_st_offset_ptr,
                                      ADIO_Offset **fd_start_ptr,
                                      ADIO_Offset **fd_end_ptr,
                                      ADIO_Offset *fd_size_ptr,
                                      void        *fs_ptr)
{
    ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
    int i, aggr;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5004, 0, NULL);
#endif

#   if AGG_DEBUG
    static char myname[] = "ADIOI_BGL_GPFS_Calc_file_domains";
    DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n", 
	    myname,__LINE__,nprocs_for_coll);
#   endif
    __blksize_t blksize = 1048576; /* default to 1M */
    if(fs_ptr && ((ADIOI_BGL_fs*)fs_ptr)->blksize) /* ignore null ptr or 0 blksize */
      blksize = ((ADIOI_BGL_fs*)fs_ptr)->blksize;
#   if AGG_DEBUG
    DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
#   endif
/* find min of start offsets and max of end offsets of all processes */
    min_st_offset  = st_offsets [0];
    max_end_offset = end_offsets[0];
    for (i=1; i<nprocs; i++) {
        min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
        max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
    }

    // DBG_FPRINTF(stderr, "_calc_file_domains, min_st_offset, max_ = %qd, %qd\n", min_st_offset, max_end_offset );

    /* determine the "file domain (FD)" of each process, i.e., the portion of
       the file that will be "owned" by each process */

    ADIO_Offset gpfs_ub       = (max_end_offset +blksize-1) / blksize * blksize - 1;
    ADIO_Offset gpfs_lb       = min_st_offset / blksize * blksize;
    ADIO_Offset gpfs_ub_rdoff = (max_end_offset +blksize-1) / blksize * blksize - 1 - max_end_offset;
    ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize;
    ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;

    int         naggs    = nprocs_for_coll;

    /* Tweak the file domains so that no fd is smaller than a threshold.  We
     * have to strike a balance between efficiency and parallelism: somewhere
     * between 10k processes sending 32-byte requests and one process sending a
     * 320k request is a (system-dependent) sweet spot.
     *
     * This is from the common code - the new min_fd_size param that we didn't
     * implement.  (And common code uses a different declaration of fd_size,
     * so beware.)  */
     

    /* this is not entirely sufficient on BlueGene: we must be mindful of
     * imbalance over psets.  the hint processing code has already picked, say,
     * 8 processors per pset, so if we go increasing fd_size we'll end up with
     * some psets with 8 processors and some psets with none.  */
    /*
    if (fd_size < min_fd_size)
        fd_size = min_fd_size;
	*/
    fd_size              = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_start_ptr        = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_end_ptr          = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    fd_start             = *fd_start_ptr;
    fd_end               = *fd_end_ptr;

    ADIO_Offset n_gpfs_blk    = fd_gpfs_range / blksize;
    ADIO_Offset nb_cn_small   = n_gpfs_blk/naggs;
    ADIO_Offset naggs_large   = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
    ADIO_Offset naggs_small   = naggs - naggs_large;

    /* nb_cn_small * blksize: evenly split file domain among processors:
     *      equivalent to fd_gpfs_range/naggs
     * (nb_cn_small+1) * blksize: keeps file domain at least 'blksize' big
     */
    for (i=0; i<naggs; i++)
        if (i < naggs_small) fd_size[i] = nb_cn_small     * blksize;
			else fd_size[i] = (nb_cn_small+1) * blksize;
			/* potential optimization: if n_gpfs_blk is smaller
			 * than naggs, slip in some zero-sized file
			 * domains to spread the work across all psets.  */

#   if AGG_DEBUG
     DBG_FPRINTF(stderr,"%s(%d): "
                   "gpfs_ub       %llu, "
                   "gpfs_lb       %llu, "
                   "gpfs_ub_rdoff %llu, "
                   "gpfs_lb_rdoff %llu, "
                   "fd_gpfs_range %llu, "
                   "n_gpfs_blk    %llu, "
                   "nb_cn_small   %llu, "
                   "naggs_large   %llu, "
                   "naggs_small   %llu, "
                   "\n",
                   myname,__LINE__,
                   gpfs_ub      ,
                   gpfs_lb      ,
                   gpfs_ub_rdoff,
                   gpfs_lb_rdoff,
                   fd_gpfs_range,
                   n_gpfs_blk   ,
                   nb_cn_small  ,
                   naggs_large  ,
                   naggs_small
                   );
#   endif

    fd_size[0]       -= gpfs_lb_rdoff;
    fd_size[naggs-1] -= gpfs_ub_rdoff;

    /* compute the file domain for each aggr */
    ADIO_Offset offset = min_st_offset;
    for (aggr=0; aggr<naggs; aggr++) {
        fd_start[aggr] = offset;
        fd_end  [aggr] = offset + fd_size[aggr] - 1;
        offset += fd_size[aggr];
    }

    *fd_size_ptr = fd_size[0];
    *min_st_offset_ptr = min_st_offset;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5005, 0, NULL);
#endif
    ADIOI_Free (fd_size);
}
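A self-contained, hedged worked example of the block-aligned partition above, with numbers chosen purely for illustration: blksize = 1 MiB, an access range that starts 0.25 MiB into block 0 and ends 0.5 MiB into block 9, and 4 aggregators. Every interior file-domain boundary lands on a GPFS block boundary; only the first and last domains are trimmed by the lb/ub read offsets.

#include <assert.h>

void example_gpfs_fd_partition(void)
{
    const long long MiB = 1048576LL, blksize = 1 * MiB;
    long long min_st  = MiB / 4;                 /* 0.25 MiB into block 0  */
    long long max_end = 9 * MiB + MiB / 2 - 1;   /* 0.5 MiB into block 9   */
    long long gpfs_lb = min_st / blksize * blksize;
    long long gpfs_ub = (max_end + blksize - 1) / blksize * blksize - 1;
    long long range   = gpfs_ub - gpfs_lb + 1;          /* 10 MiB           */
    long long n_blk   = range / blksize;                /* 10 blocks        */
    int naggs = 4;
    long long nb_small    = n_blk / naggs;              /* 2                */
    long long naggs_large = n_blk - naggs * nb_small;   /* 2                */
    long long naggs_small = naggs - naggs_large;        /* 2                */
    long long fd_size[4], fd_start[4], fd_end[4], off = min_st;
    int i;

    for (i = 0; i < naggs; i++)
        fd_size[i] = (i < naggs_small ? nb_small : nb_small + 1) * blksize;
    fd_size[0]         -= min_st - gpfs_lb;   /* trim leading partial block  */
    fd_size[naggs - 1] -= gpfs_ub - max_end;  /* trim trailing partial block */

    for (i = 0; i < naggs; i++) {
        fd_start[i] = off;
        fd_end[i]   = off + fd_size[i] - 1;
        off += fd_size[i];
    }
    /* interior FD boundaries fall on 1 MiB block boundaries: 2, 4 and 7 MiB */
    assert(fd_start[1] == 2 * MiB && fd_start[2] == 4 * MiB &&
           fd_start[3] == 7 * MiB && fd_end[3]   == max_end);
}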
Example #18
0
static void scaleable_stat(ADIO_File fd)
{
    struct stat64 bg_stat;
    struct statfs bg_statfs;
    int rank, rc;
    char * dir;
    long buf[2];
    MPI_Comm_rank(fd->comm, &rank);

    if (rank == fd->hints->ranklist[0]) {
	/* Get the (real) underlying file system block size */
	rc = stat64(fd->filename, &bg_stat);
	if (rc >= 0)
	{
	    buf[0] = bg_stat.st_blksize;
	    DBGV_FPRINTF(stderr,"Successful stat '%s'.  Blocksize=%ld\n",
		    fd->filename,bg_stat.st_blksize);
	}
	else
	{
	    DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
		    fd->filename,rc,errno);
	}
	/* Get the (real) underlying file system type so we can 
	 * plan our fsync scaling strategy */
	rc = statfs(fd->filename,&bg_statfs);
	if (rc >= 0)
	{
	    DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#lX\n",
		    fd->filename,bg_statfs.f_type);
	    buf[1] = bg_statfs.f_type;
	}
	else
	{
	    DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",
		    fd->filename,rc,errno);
	    ADIO_FileSysType_parentdir(fd->filename, &dir);
	    rc = statfs(dir,&bg_statfs);
	    if (rc >= 0)
	    {
		DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#lX\n",dir,bg_statfs.f_type);
		buf[1] = bg_statfs.f_type;
	    }
	    else
	    {
		/* Hmm.  Guess we'll assume the worst-case, that it's not GPFS
		 * or BGLOCKLESSMPIO_F_TYPE (default PVFS2) below */
		buf[1] = -1; /* bogus magic number */
		DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",dir,rc,errno);
	    }
	    free(dir);
	}
    }
    /* now we can broadcast the stat/statfs data to everyone else */
    if (fd->comm != MPI_COMM_SELF) { /* if indep open, there's no one to talk to*/
	if (fd->agg_comm != MPI_COMM_NULL) /* deferred open: only a subset of
					      processes participate */
	    MPI_Bcast(buf, 2, MPI_LONG, fd->hints->ranklist[0], fd->agg_comm);
	else
	    MPI_Bcast(buf, 2, MPI_LONG, fd->hints->ranklist[0], fd->comm);
    }
    bg_stat.st_blksize = buf[0];
    bg_statfs.f_type = buf[1];

    /* data from stat64 */
    /* store the blksize in the file system specific storage */
    ((ADIOI_BG_fs*)fd->fs_ptr)->blksize = bg_stat.st_blksize;

    /* data from statfs */
   if ((bg_statfs.f_type == GPFS_SUPER_MAGIC) ||
       (bg_statfs.f_type == bglocklessmpio_f_type))
   {
      ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr = 
            ADIOI_BG_FSYNC_AGGREGATION_ENABLED;

      /* Only one rank is an "fsync aggregator" because only one 
      * fsync is needed */
      if (rank == fd->hints->ranklist[0])
      {
         ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr |= 
            ADIOI_BG_FSYNC_AGGREGATOR;
         DBG_FPRINTF(stderr,"fsync aggregator %d\n",rank);
      }
      else 
         ; /* aggregation enabled but this rank is not an aggregator*/
   }
   else
      ; /* Other filesystems default to no fsync aggregation */
}
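/*
 * Illustrative sketch, not part of the ADIO source: scaleable_stat() follows a
 * "one rank queries the file system, everyone else receives the answer by
 * broadcast" pattern.  The hypothetical helper below shows the same idea in
 * isolation, assuming a plain MPI program where rank 0 plays the role of
 * fd->hints->ranklist[0].
 */
#include <mpi.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <stdio.h>

static long query_blksize_once(const char *path, MPI_Comm comm)
{
    int rank;
    long blksize = -1;               /* -1 signals "stat failed" */
    struct stat sb;

    MPI_Comm_rank(comm, &rank);
    if (rank == 0 && stat(path, &sb) == 0)
        blksize = (long) sb.st_blksize;

    /* one metadata operation total, instead of one per process */
    MPI_Bcast(&blksize, 1, MPI_LONG, 0, comm);
    return blksize;
}

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    long bs = query_blksize_once(argc > 1 ? argv[1] : ".", MPI_COMM_WORLD);
    printf("blocksize = %ld\n", bs);
    MPI_Finalize();
    return 0;
}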
Example #19
/* 
 * ADIOI_BGL_Calc_my_req() overrides ADIOI_Calc_my_req() because the default
 * implementation is specific to static file domain partitioning.
 *
 * ADIOI_Calc_my_req() - calculate what portions of the access requests
 * of this process are located in the file domains of various processes
 * (including this one)
 */
void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list, 
			   int contig_access_count, ADIO_Offset 
			   min_st_offset, ADIO_Offset *fd_start,
			   ADIO_Offset *fd_end, ADIO_Offset fd_size,
			   int nprocs,
			   int *count_my_req_procs_ptr,
			   int **count_my_req_per_proc_ptr,
			   ADIOI_Access **my_req_ptr,
			   int **buf_idx_ptr)
/* Possibly reconsider if buf_idx's are ok as int's, or should they be aints/offsets? 
   They are used as memory buffer indices so it seems like the 2G limit is in effect */
{
    int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
    int i, l, proc;
    ADIO_Offset fd_len, rem_len, curr_idx, off;
    ADIOI_Access *my_req;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5024, 0, NULL);
#endif

    *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int)); 
    count_my_req_per_proc = *count_my_req_per_proc_ptr;
/* count_my_req_per_proc[i] gives the no. of contig. requests of this
   process in process i's file domain. calloc initializes to zero.
   I'm allocating memory of size nprocs, so that I can do an 
   MPI_Alltoall later on.*/

    buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int));
/* buf_idx is relevant only if buftype_is_contig.
   buf_idx[i] gives the index into user_buf where data received
   from proc. i should be placed. This allows receives to be done
   without extra buffer. This can't be done if buftype is not contig. */
   
    /* initialize buf_idx to -1 */
    for (i=0; i < nprocs; i++) buf_idx[i] = -1;

    /* one pass just to calculate how much space to allocate for my_req;
     * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
     */
    for (i=0; i < contig_access_count; i++) {
	/* short circuit offset/len processing if len == 0
	 * (zero-byte read/write) */
	if (len_list[i] == 0) 
		continue;
	off = offset_list[i];
	fd_len = len_list[i];
	/* note: we set fd_len to be the total size of the access.  then
	 * ADIOI_Calc_aggregator() will modify the value to return the 
	 * amount that was available from the file domain that holds the
	 * first part of the access.
	 */
	proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, 
				     fd_start, fd_end);
	count_my_req_per_proc[proc]++;

	/* figure out how much data is remaining in the access (i.e. wasn't 
	 * part of the file domain that had the starting byte); we'll take 
	 * care of this data (if there is any) in the while loop below.
	 */
	rem_len = len_list[i] - fd_len;

	while (rem_len > 0) {
	    off += fd_len; /* point to first remaining byte */
	    fd_len = rem_len; /* save remaining size, pass to calc */
	    proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, 
					 fd_size, fd_start, fd_end);

	    count_my_req_per_proc[proc]++;
	    rem_len -= fd_len; /* reduce remaining length by amount from fd */
	}
    }

/* now allocate space for my_req, offset, and len */

    *my_req_ptr = (ADIOI_Access *)
	ADIOI_Malloc(nprocs*sizeof(ADIOI_Access)); 
    my_req = *my_req_ptr;

    count_my_req_procs = 0;
    for (i=0; i < nprocs; i++) {
	if (count_my_req_per_proc[i]) {
	    my_req[i].offsets = (ADIO_Offset *)
		ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
	    my_req[i].lens = (int *)
		ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(int));
	    count_my_req_procs++;
	}	    
	my_req[i].count = 0;  /* will be incremented where needed later */
    }

/* now fill in my_req */
    curr_idx = 0;
    for (i=0; i<contig_access_count; i++) { 
	/* short circuit offset/len processing if len == 0
	 * (zero-byte read/write) */
	if (len_list[i] == 0)
		continue;
	off = offset_list[i];
	fd_len = len_list[i];
	proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, 
				     fd_start, fd_end);

	/* for each separate contiguous access from this process */
	if (buf_idx[proc] == -1) {
	    ADIOI_Assert(curr_idx == (int) curr_idx);
	    buf_idx[proc] = (int) curr_idx;
	}

	l = my_req[proc].count;
	curr_idx += fd_len;

	rem_len = len_list[i] - fd_len;

	/* store the proc, offset, and len information in an array
         * of structures, my_req. Each structure contains the 
         * offsets and lengths located in that process's FD, 
	 * and the associated count. 
	 */
	my_req[proc].offsets[l] = off;
	ADIOI_Assert(fd_len == (int) fd_len);
	my_req[proc].lens[l] = (int) fd_len;
	my_req[proc].count++;

	while (rem_len > 0) {
	    off += fd_len;
	    fd_len = rem_len;
	    proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, 
					 fd_size, fd_start, fd_end);

	    if (buf_idx[proc] == -1) {
		ADIOI_Assert(curr_idx == (int) curr_idx);
		buf_idx[proc] = (int) curr_idx;
	    }

	    l = my_req[proc].count;
	    curr_idx += fd_len;
	    rem_len -= fd_len;

	    my_req[proc].offsets[l] = off;
	    ADIOI_Assert(fd_len == (int) fd_len);
	    my_req[proc].lens[l] = (int) fd_len;
	    my_req[proc].count++;
	}
    }

#ifdef AGG_DEBUG
    for (i=0; i<nprocs; i++) {
	if (count_my_req_per_proc[i] > 0) {
	    DBG_FPRINTF(stderr, "data needed from %d (count = %d):\n", i, 
		    my_req[i].count);
	    for (l=0; l < my_req[i].count; l++) {
		DBG_FPRINTF(stderr, "   off[%d] = %lld, len[%d] = %d\n", l,
			my_req[i].offsets[l], l, my_req[i].lens[l]);
	    }
	}
	DBG_FPRINTF(stderr, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
    }
#endif

    *count_my_req_procs_ptr = count_my_req_procs;
    *buf_idx_ptr = buf_idx;
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5025, 0, NULL);
#endif
}
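/*
 * Illustrative sketch, not part of the ADIO source: how one contiguous access
 * that straddles a file-domain boundary gets split across aggregators,
 * mirroring the ADIOI_BGL_Calc_aggregator() / while (rem_len > 0) pattern
 * above.  find_owner() is a hypothetical stand-in that, like the real
 * routine, caps *len at the end of the owning file domain.
 */
#include <stdio.h>

#define NAGGS 3

static const long long fd_start[NAGGS] = {  0, 100, 200};
static const long long fd_end  [NAGGS] = { 99, 199, 299};

static int find_owner(long long off, long long *len)
{
    int a;
    for (a = 0; a < NAGGS; a++) {
        if (off >= fd_start[a] && off <= fd_end[a]) {
            long long avail = fd_end[a] - off + 1;
            if (*len > avail) *len = avail;   /* only what this domain holds */
            return a;
        }
    }
    return -1;   /* off beyond the last domain; not reached in this sketch */
}

int main(void)
{
    long long off = 150, len = 120;           /* spans domains 1 and 2 */
    while (len > 0) {
        long long piece = len;
        int owner = find_owner(off, &piece);
        printf("aggr %d gets off=%lld len=%lld\n", owner, off, piece);
        off += piece;                         /* advance past what was placed */
        len -= piece;
    }
    return 0;
}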
Example #20
void ad_gpfs_get_env_vars() {
    char *x, *dummy;

    gpfsmpio_comm   = 0;
    x = getenv( "GPFSMPIO_COMM"         );
    if (x) gpfsmpio_comm         = atoi(x);
    gpfsmpio_timing = 0;
    x = getenv( "GPFSMPIO_TIMING"       );
    if (x) gpfsmpio_timing       = atoi(x);
    gpfsmpio_tunegather = 1;
    x = getenv( "GPFSMPIO_TUNEGATHER"   );
    if (x) gpfsmpio_tunegather   = atoi(x);
    gpfsmpio_tuneblocking = 1;
    x = getenv( "GPFSMPIO_TUNEBLOCKING" );
    if (x) gpfsmpio_tuneblocking = atoi(x);
    bglocklessmpio_f_type = PVFS2_SUPER_MAGIC;
    x = getenv( "BGLOCKLESSMPIO_F_TYPE" );
    if (x) bglocklessmpio_f_type = strtol(x,&dummy,0);
    DBG_FPRINTF(stderr,"BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n",
            bglocklessmpio_f_type,bglocklessmpio_f_type);
    /* note: this value will be 'sanity checked' in ADIOI_BG_persInfo_init(),
     * when we know a bit more about what "largest possible value" and
     * "smallest possible value" should be */
    gpfsmpio_bg_nagg_pset = ADIOI_BG_NAGG_PSET_DFLT;
    x = getenv("GPFSMPIO_NAGG_PSET");
    if (x) gpfsmpio_bg_nagg_pset = atoi(x);

    gpfsmpio_pthreadio = 0;
    x = getenv( "GPFSMPIO_PTHREADIO" );
    if (x) gpfsmpio_pthreadio = atoi(x);

    gpfsmpio_p2pcontig = 0;
    x = getenv( "GPFSMPIO_P2PCONTIG" );
    if (x) gpfsmpio_p2pcontig = atoi(x);

    gpfsmpio_write_aggmethod = 0;
    x = getenv( "GPFSMPIO_WRITE_AGGMETHOD" );
    if (x) gpfsmpio_write_aggmethod = atoi(x);

    gpfsmpio_read_aggmethod = 0;
    x = getenv( "GPFSMPIO_READ_AGGMETHOD" );
    if (x) gpfsmpio_read_aggmethod = atoi(x);

    gpfsmpio_balancecontig = 0;
    x = getenv( "GPFSMPIO_BALANCECONTIG" );
    if (x) gpfsmpio_balancecontig = atoi(x);

    gpfsmpio_devnullio = 0;
    x = getenv( "GPFSMPIO_DEVNULLIO" );
    if (x) gpfsmpio_devnullio = atoi(x);

    gpfsmpio_bridgeringagg = 0;
    x = getenv( "GPFSMPIO_BRIDGERINGAGG" );
    if (x) gpfsmpio_bridgeringagg = atoi(x);

    gpfsmpio_onesided_no_rmw = 0;
    x = getenv( "GPFSMPIO_ONESIDED_NO_RMW" );
    if (x) gpfsmpio_onesided_no_rmw = atoi(x);

    gpfsmpio_onesided_always_rmw = 0;
    x = getenv( "GPFSMPIO_ONESIDED_ALWAYS_RMW" );
    if (x) gpfsmpio_onesided_always_rmw = atoi(x);
    if (gpfsmpio_onesided_always_rmw)
      gpfsmpio_onesided_no_rmw = 1;

    gpfsmpio_onesided_inform_rmw = 0;
    x = getenv( "GPFSMPIO_ONESIDED_INFORM_RMW" );
    if (x) gpfsmpio_onesided_inform_rmw = atoi(x);
}
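/*
 * Illustrative sketch, not part of the ADIO source: each tunable above follows
 * the same "set a default, then let the environment override it" pattern.
 * env_int() is a hypothetical helper that makes the pattern explicit; the two
 * defaults below match the ones used in ad_gpfs_get_env_vars().
 */
#include <stdio.h>
#include <stdlib.h>

static int env_int(const char *name, int dflt)
{
    const char *x = getenv(name);
    return x ? atoi(x) : dflt;        /* unset variable keeps the default */
}

int main(void)
{
    int tunegather = env_int("GPFSMPIO_TUNEGATHER", 1);
    int timing     = env_int("GPFSMPIO_TIMING", 0);
    printf("tunegather=%d timing=%d\n", tunegather, timing);
    return 0;
}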
Example #21
/*
 * Compute a dynamic, access-range-based file domain partition among the I/O
 * aggregators, aligned to the GPFS block size.
 * Divide the I/O workload among "nprocs_for_coll" processes. This is
 * done by (logically) dividing the file into file domains (FDs); each
 * process may directly access only its own file domain.
 * Additional effort is made to ensure that each I/O aggregator gets a
 * file domain that aligns to the GPFS block size, so there is no false
 * sharing of GPFS file blocks among multiple I/O nodes.
 *
 * The common version of this now accepts a min_fd_size and striping_unit.
 * It doesn't seem necessary here (using GPFS block sizes) but keep it in mind
 * (e.g. we could pass striping unit instead of using fs_ptr->blksize).
 */
void ADIOI_GPFS_Calc_file_domains(ADIO_File fd,
	                              ADIO_Offset *st_offsets,
                                      ADIO_Offset *end_offsets,
                                      int          nprocs,
                                      int          nprocs_for_coll,
                                      ADIO_Offset *min_st_offset_ptr,
                                      ADIO_Offset **fd_start_ptr,
                                      ADIO_Offset **fd_end_ptr,
                                      ADIO_Offset *fd_size_ptr,
                                      void        *fs_ptr)
{
    ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
    int i, aggr;
    blksize_t blksize;
    TRACE_ERR("Entering ADIOI_GPFS_Calc_file_domains\n");

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5004, 0, NULL);
#endif

#   if AGG_DEBUG
    static char myname[] = "ADIOI_GPFS_Calc_file_domains";
    DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n",
	    myname,__LINE__,nprocs_for_coll);
#   endif
    if (fd->blksize <= 0)
	/* default to 1M if blksize unset */
	fd->blksize = 1048576;
    blksize = fd->blksize;

#   if AGG_DEBUG
    DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
#   endif
/* find min of start offsets and max of end offsets of all processes */
    min_st_offset  = st_offsets [0];
    max_end_offset = end_offsets[0];
    for (i=1; i<nprocs; i++) {
        min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
        max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
    }

    /* DBG_FPRINTF(stderr, "_calc_file_domains, min_st_offset, max_end_offset"
       " = %qd, %qd\n", min_st_offset, max_end_offset ); */

    /* determine the "file domain (FD)" of each process, i.e., the portion of
       the file that will be "owned" by each process */

    ADIO_Offset gpfs_ub       = (max_end_offset +blksize-1) / blksize * blksize - 1;
    ADIO_Offset gpfs_lb       = min_st_offset / blksize * blksize;
    ADIO_Offset gpfs_ub_rdoff = (max_end_offset +blksize-1) / blksize * blksize - 1 - max_end_offset;
    ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize;
    ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;

    int         naggs    = nprocs_for_coll;

    /* Tweak the file domains so that no fd is smaller than a threshold.  We
     * have to strike a balance between efficiency and parallelism: somewhere
     * between 10k processes sending 32-byte requests and one process sending a
     * 320k request is a (system-dependent) sweet spot

    This is from the common code - the new min_fd_size parm that we didn't implement.
    (And common code uses a different declaration of fd_size so beware)

    if (fd_size < min_fd_size)
        fd_size = min_fd_size;
    */
    fd_size              = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_start_ptr        = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_end_ptr          = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    fd_start             = *fd_start_ptr;
    fd_end               = *fd_end_ptr;

    /* each process will have a file domain of some number of gpfs blocks, but
     * the division of blocks is not likely to be even.  Some file domains will
     * be "large" and others "small"
     *
     * Example: consider  17 blocks distributed over 3 aggregators.
     * nb_cn_small = 17/3 = 5
     * naggs_large = 17 - 3*(17/3) = 17 - 15  = 2
     * naggs_small = 3 - 2 = 1
     *
     * and you end up with file domains of {5-blocks, 6-blocks, 6-blocks}
     *
     * what about (relatively) small files?  say, a file of 1000 blocks
     * distributed over 2064 aggregators:
     * nb_cn_small = 1000/2064 = 0
     * naggs_large = 1000 - 2064*(1000/2064) = 1000
     * naggs_small = 2064 - 1000 = 1064
     * and you end up with domains of {0, 0, 0, ... 1, 1, 1 ...}
     *
     * it might be a good idea instead of having all the zeros up front, to
     * "mix" those zeros into the fd_size array.  that way, no pset/bridge-set
     * is left with zero work.  In fact, even if the small file domains aren't
     * zero, it's probably still a good idea to mix the "small" file domains
     * across the fd_size array to keep the io nodes in balance */


    ADIO_Offset n_gpfs_blk    = fd_gpfs_range / blksize;
    ADIO_Offset nb_cn_small   = n_gpfs_blk/naggs;
    ADIO_Offset naggs_large   = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
    ADIO_Offset naggs_small   = naggs - naggs_large;

#ifdef BGQPLATFORM
    if (gpfsmpio_balancecontig == 1) {
	/* File domains blocks are assigned to aggregators in a breadth-first
	 * fashion relative to the ions - additionally, file domains on the
	 * aggregators sharing the same bridgeset and ion have contiguous
	 * offsets. */

	// initialize everything to small
	for (i=0; i<naggs; i++)
	    fd_size[i] = nb_cn_small     * blksize;

	// go thru and distribute the large across the bridges

	/* bridelistoffset: agg rank list offsets using the bridgelist - each
	 * entry is created by adding up the indexes for the aggs from all
	 * previous bridges */
	int *bridgelistoffset =
	    (int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));
	/* tmpbridgelistnum: copy of the bridgelistnum whose entries can be
	 * decremented to keep track of bridge assignments during the actual
	 * large block assignments to the agg rank list*/
	int *tmpbridgelistnum =
	    (int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));

	int j;
	for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++) {
	    int k, bridgerankoffset = 0;
	    for (k=0;k<j;k++) {
		bridgerankoffset += fd->hints->fs_hints.bg.bridgelistnum[k];
	    }
	    bridgelistoffset[j] = bridgerankoffset;
	}

	for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++)
	    tmpbridgelistnum[j] = fd->hints->fs_hints.bg.bridgelistnum[j];
	int bridgeiter = 0;

	/* distribute the large blocks across the aggs going breadth-first
	 * across the bridgelist - this distributes the fd sizes across the
	 * ions, so later in the file domain assignment when it iterates thru
	 * the ranklist the offsets will be contiguous within the bridge and
	 * ion as well */
	for (j=0;j<naggs_large;j++) {
	    int foundbridge = 0;
	    int numbridgelistpasses = 0;
	    while (!foundbridge) {
		if (tmpbridgelistnum[bridgeiter] > 0) {
		    foundbridge = 1;
		    /*
		       printf("bridgeiter is %d tmpbridgelistnum[bridgeiter] is %d bridgelistoffset[bridgeiter] is %d\n",bridgeiter,tmpbridgelistnum[bridgeiter],bridgelistoffset[bridgeiter]);
		       printf("naggs is %d bridgeiter is %d bridgelistoffset[bridgeiter] is %d tmpbridgelistnum[bridgeiter] is %d\n",naggs, bridgeiter,bridgelistoffset[bridgeiter],tmpbridgelistnum[bridgeiter]);
		       printf("naggs is %d bridgeiter is %d setting fd_size[%d]\n",naggs, bridgeiter,bridgelistoffset[bridgeiter]+(fd->hints->bridgelistnum[bridgeiter]-tmpbridgelistnum[bridgeiter]));
		     */
		    int currentbridgelistnum =
			(fd->hints->fs_hints.bg.bridgelistnum[bridgeiter]-
			 tmpbridgelistnum[bridgeiter]);
		    int currentfdsizeindex = bridgelistoffset[bridgeiter] +
			currentbridgelistnum;
		    fd_size[currentfdsizeindex] = (nb_cn_small+1) * blksize;
		    tmpbridgelistnum[bridgeiter]--;
		}
		if (bridgeiter == (fd->hints->fs_hints.bg.numbridges-1)) {
		    /* guard against infinite loop - should only ever make 1 pass
		     * thru bridgelist */
		    ADIOI_Assert(numbridgelistpasses == 0);
		    numbridgelistpasses++;
		    bridgeiter = 0;
		}
		else
		    bridgeiter++;
	    }
	}
	ADIOI_Free(tmpbridgelistnum);
	ADIOI_Free(bridgelistoffset);

    } else {
	/* BG/L- and BG/P-style distribution of file domains: simple allocation of
	 * file domains to each aggregator */
	for (i=0; i<naggs; i++) {
	    if (i < naggs_large) {
		fd_size[i] = (nb_cn_small+1) * blksize;
	    } else {
		fd_size[i] = nb_cn_small     * blksize;
	    }
	}
    }
#ifdef balancecontigtrace
    int myrank;
    MPI_Comm_rank(fd->comm,&myrank);
    if (myrank == 0) {
      fprintf(stderr,"naggs_small is %d nb_cn_small is %d\n",naggs_small,nb_cn_small);
	for (i=0; i<naggs; i++) {
	    fprintf(stderr,"fd_size[%d] set to %d agg rank is %d\n",i,fd_size[i],fd->hints->ranklist[i]);
	}
    }
#endif

#else // not BGQ platform
	for (i=0; i<naggs; i++) {
	    if (i < naggs_large) {
		fd_size[i] = (nb_cn_small+1) * blksize;
	    } else {
		fd_size[i] = nb_cn_small     * blksize;
	    }
    }

#endif


#   if AGG_DEBUG
     DBG_FPRINTF(stderr,"%s(%d): "
                   "gpfs_ub       %llu, "
                   "gpfs_lb       %llu, "
                   "gpfs_ub_rdoff %llu, "
                   "gpfs_lb_rdoff %llu, "
                   "fd_gpfs_range %llu, "
                   "n_gpfs_blk    %llu, "
                   "nb_cn_small   %llu, "
                   "naggs_large   %llu, "
                   "naggs_small   %llu, "
                   "\n",
                   myname,__LINE__,
                   gpfs_ub      ,
                   gpfs_lb      ,
                   gpfs_ub_rdoff,
                   gpfs_lb_rdoff,
                   fd_gpfs_range,
                   n_gpfs_blk   ,
                   nb_cn_small  ,
                   naggs_large  ,
                   naggs_small
                   );
#   endif

    fd_size[0]       -= gpfs_lb_rdoff;
    fd_size[naggs-1] -= gpfs_ub_rdoff;

    /* compute the file domain for each aggr */
    ADIO_Offset offset = min_st_offset;
    for (aggr=0; aggr<naggs; aggr++) {
        fd_start[aggr] = offset;
        fd_end  [aggr] = offset + fd_size[aggr] - 1;
        offset += fd_size[aggr];
    }

    *fd_size_ptr = fd_size[0];
    *min_st_offset_ptr = min_st_offset;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5005, 0, NULL);
#endif
    ADIOI_Free (fd_size);
    TRACE_ERR("Leaving ADIOI_GPFS_Calc_file_domains\n");
}
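/*
 * Illustrative sketch, not part of the ADIO source: the large/small domain
 * split computed in ADIOI_GPFS_Calc_file_domains(), checked against the
 * 17-blocks-over-3-aggregators example from the comment above.  naggs_large
 * aggregators receive (nb_cn_small+1) blocks, naggs_small aggregators receive
 * nb_cn_small blocks, and the two groups always add back up to n_gpfs_blk.
 */
#include <stdio.h>

int main(void)
{
    long long n_gpfs_blk  = 17, naggs = 3;            /* values from the comment */
    long long nb_cn_small = n_gpfs_blk / naggs;               /* 5 */
    long long naggs_large = n_gpfs_blk - naggs * nb_cn_small; /* 2 */
    long long naggs_small = naggs - naggs_large;              /* 1 */
    long long total = naggs_large * (nb_cn_small + 1)
                    + naggs_small * nb_cn_small;              /* 17 again */

    printf("%lld large domains of %lld blocks, %lld small of %lld (total %lld)\n",
           naggs_large, nb_cn_small + 1, naggs_small, nb_cn_small, total);
    return 0;
}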