Ejemplo n.º 1
0
/*
 * Partition the file range accessed by all processes into
 * nprocs_for_coll contiguous file domains.
 *
 * fh                 file handle; only fh->f_size is read here (number of
 *                    entries in start_offsets / end_offsets).
 * start_offsets      per-process first accessed file offset.
 * end_offsets        per-process last accessed file offset.
 * min_st_offset_ptr  [out] smallest value found in start_offsets.
 * fd_st_ptr          [out] newly malloc'ed array of nprocs_for_coll domain
 *                    start offsets.
 * fd_end_ptr         [out] newly malloc'ed array of nprocs_for_coll domain
 *                    end offsets (inclusive).
 * min_fd_size        lower bound applied to the computed domain size.
 * fd_size_ptr        [out] domain size that was used.
 * striping_unit      when > 0, every domain end is moved to the nearest
 *                    multiple of this value.
 * nprocs_for_coll    number of domains; must be > 0 because it is used as
 *                    a divisor.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE when an allocation
 * fails.  On failure no array is left allocated: *fd_st_ptr is NULL, and
 * *fd_end_ptr is NULL if its allocation was attempted.  On success the
 * caller owns both arrays and releases them with free().
 */
int mca_fcoll_two_phase_domain_partition (mca_io_ompio_file_t *fh,
                                          OMPI_MPI_OFFSET_TYPE *start_offsets,
                                          OMPI_MPI_OFFSET_TYPE *end_offsets,
                                          OMPI_MPI_OFFSET_TYPE *min_st_offset_ptr,
                                          OMPI_MPI_OFFSET_TYPE **fd_st_ptr,
                                          OMPI_MPI_OFFSET_TYPE **fd_end_ptr,
                                          int min_fd_size,
                                          OMPI_MPI_OFFSET_TYPE *fd_size_ptr,
                                          int striping_unit,
                                          int nprocs_for_coll){

    OMPI_MPI_OFFSET_TYPE min_st_offset, max_end_offset, fd_size;
    OMPI_MPI_OFFSET_TYPE *fd_start = NULL, *fd_end = NULL;
    int i;

    /* Global extent of the access: smallest start, largest end. */
    min_st_offset = start_offsets[0];
    max_end_offset = end_offsets[0];

    for (i = 0; i < fh->f_size; i++) {
        min_st_offset = OMPIO_MIN(min_st_offset, start_offsets[i]);
        max_end_offset = OMPIO_MAX(max_end_offset, end_offsets[i]);
    }

    /* Ceiling division of the extent by the number of domains. */
    fd_size = ((max_end_offset - min_st_offset + 1) + nprocs_for_coll - 1)/nprocs_for_coll;

    if (fd_size < min_fd_size) {
        fd_size = min_fd_size;
    }

    *fd_st_ptr = (OMPI_MPI_OFFSET_TYPE *)
        malloc(nprocs_for_coll*sizeof(OMPI_MPI_OFFSET_TYPE));

    if ( NULL == *fd_st_ptr ) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    *fd_end_ptr = (OMPI_MPI_OFFSET_TYPE *)
        malloc(nprocs_for_coll*sizeof(OMPI_MPI_OFFSET_TYPE));

    if ( NULL == *fd_end_ptr ) {
        /* Do not hand back a half-built result: release the first array
           and clear the pointer so a caller that frees it stays safe. */
        free(*fd_st_ptr);
        *fd_st_ptr = NULL;
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    fd_start = *fd_st_ptr;
    fd_end = *fd_end_ptr;

    if (striping_unit > 0) {
        /* Lock Boundary based domain partitioning */
        int rem_front, rem_back;
        OMPI_MPI_OFFSET_TYPE end_off;

        for (i = 0; i < nprocs_for_coll; i++) {
            /* Domain 0 starts at the global minimum; every other domain
               starts right after the previous one ends. */
            fd_start[i] = (0 == i) ? min_st_offset : fd_end[i-1] + 1;

            /* Align the nominal end to the nearest multiple of
               striping_unit (round down when closer, otherwise up). */
            end_off   = min_st_offset + fd_size * (i+1);
            rem_front = end_off % striping_unit;
            rem_back  = striping_unit - rem_front;
            if (rem_front < rem_back) {
                end_off -= rem_front;
            }
            else {
                end_off += rem_back;
            }
            fd_end[i] = end_off - 1;
        }
        /* The last domain always runs to the end of the accessed range. */
        fd_end[nprocs_for_coll-1] = max_end_offset;
    }
    else {
        /* Plain equal-size domains laid back to back. */
        for (i = 0; i < nprocs_for_coll; i++) {
            fd_start[i] = (0 == i) ? min_st_offset : fd_end[i-1] + 1;
            fd_end[i] = fd_start[i] + fd_size - 1;
        }
    }

    /* Domains lying entirely past the accessed range are marked empty
       with -1/-1; a domain that overshoots is clipped to the range end. */
    for (i = 0; i < nprocs_for_coll; i++) {
        if (fd_start[i] > max_end_offset) {
            fd_start[i] = fd_end[i] = -1;
        }
        if (fd_end[i] > max_end_offset) {
            fd_end[i] = max_end_offset;
        }
    }

    *fd_size_ptr = fd_size;
    *min_st_offset_ptr = min_st_offset;

    return OMPI_SUCCESS;
}
/*
 * Read the file range requested from this process in cycles of at most
 * mca_fcoll_two_phase_cycle_buffer_size bytes and hand each cycle to
 * two_phase_exchange_data().
 *
 * others_req[i] lists the (offset, length) pairs process i wants from the
 * range handled here; offsets/lens are modified in place when a request is
 * only partially covered by a cycle, and mem_ptrs[j] receives the address
 * inside the cycle buffer where request j begins.
 *
 * The number of cycles is agreed on with an allreduce (MPI_MAX) so that a
 * process with fewer (or zero) cycles still calls
 * two_phase_exchange_data() the same number of times as everybody else.
 *
 * All remaining parameters are forwarded untouched to
 * two_phase_exchange_data().
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE on allocation failure,
 * OMPI_ERROR when the read fails, or the error code of the allreduce.
 * Every exit path releases all buffers allocated here.
 */
static int two_phase_read_and_exch(mca_io_ompio_file_t *fh,
                                   void *buf,
                                   MPI_Datatype datatype,
                                   mca_io_ompio_access_array_t *others_req,
                                   struct iovec *offset_len,
                                   int contig_access_count,
                                   OMPI_MPI_OFFSET_TYPE min_st_offset,
                                   OMPI_MPI_OFFSET_TYPE fd_size,
                                   OMPI_MPI_OFFSET_TYPE *fd_start,
                                   OMPI_MPI_OFFSET_TYPE *fd_end,
                                   Flatlist_node *flat_buf,
                                   size_t *buf_idx, int striping_unit,
                                   int *aggregator_list){


  int ret=OMPI_SUCCESS, i = 0, j = 0, ntimes = 0, max_ntimes = 0;
  int m = 0;
  int *curr_offlen_ptr=NULL, *count=NULL, *send_size=NULL, *recv_size=NULL;
  int *partial_send=NULL, *start_pos=NULL, req_len=0, flag=0;
  int *recd_from_proc=NULL;
  MPI_Aint buftype_extent=0;
  size_t byte_size = 0;
  OMPI_MPI_OFFSET_TYPE st_loc=-1, end_loc=-1, off=0, done=0, for_next_iter=0;
  OMPI_MPI_OFFSET_TYPE size=0, req_off=0, real_size=0, real_off=0, len=0;
  OMPI_MPI_OFFSET_TYPE for_curr_iter=0;
  char *read_buf=NULL, *tmp_buf=NULL;
  MPI_Datatype byte = MPI_BYTE;

  opal_datatype_type_size(&byte->super,
                          &byte_size);

  /* Seed st_loc/end_loc from the first process that has any request;
     both stay at -1 when nobody requests anything from this process. */
  for (i = 0; i < fh->f_size; i++){
    if (others_req[i].count) {
      st_loc = others_req[i].offsets[0];
      end_loc = others_req[i].offsets[0];
      break;
    }
  }

  /* Lowest requested offset and highest requested byte. */
  for (i=0;i<fh->f_size;i++){
    for(j=0;j< others_req[i].count; j++){
      st_loc =
        OMPIO_MIN(st_loc, others_req[i].offsets[j]);
      end_loc =
        OMPIO_MAX(end_loc, (others_req[i].offsets[j] +
                            others_req[i].lens[j] - 1));
    }
  }

  /* Number of cycles: ceil((end_loc - st_loc + 1) / cycle_buffer_size). */
  ntimes = (int)((end_loc - st_loc + mca_fcoll_two_phase_cycle_buffer_size)/
                 mca_fcoll_two_phase_cycle_buffer_size);

  if ((st_loc == -1) && (end_loc == -1)){
    ntimes = 0;
  }

  ret = fh->f_comm->c_coll.coll_allreduce (&ntimes,
                                           &max_ntimes,
                                           1,
                                           MPI_INT,
                                           MPI_MAX,
                                           fh->f_comm,
                                           fh->f_comm->c_coll.coll_allreduce_module);
  if (OMPI_SUCCESS != ret){
    goto exit;
  }

  if (ntimes){
    read_buf = (char *) calloc (mca_fcoll_two_phase_cycle_buffer_size,
                                sizeof(char));
    if ( NULL == read_buf ){
      ret =  OMPI_ERR_OUT_OF_RESOURCE;
      goto exit;
    }
  }

  curr_offlen_ptr = (int *)calloc (fh->f_size,
                                   sizeof(int));
  if (NULL == curr_offlen_ptr){
    ret = OMPI_ERR_OUT_OF_RESOURCE;
    goto exit;
  }

  count = (int *)calloc (fh->f_size,
                         sizeof(int));
  if (NULL == count){
    ret = OMPI_ERR_OUT_OF_RESOURCE;
    goto exit;
  }

  partial_send = (int *)calloc(fh->f_size, sizeof(int));
  if ( NULL == partial_send ){
    ret = OMPI_ERR_OUT_OF_RESOURCE;
    goto exit;
  }

  send_size = (int *)malloc(fh->f_size * sizeof(int));
  if (NULL == send_size){
    ret = OMPI_ERR_OUT_OF_RESOURCE;
    goto exit;
  }

  recv_size = (int *)malloc(fh->f_size * sizeof(int));
  if (NULL == recv_size){
    ret = OMPI_ERR_OUT_OF_RESOURCE;
    goto exit;
  }

  recd_from_proc = (int *)calloc(fh->f_size,sizeof(int));
  if (NULL == recd_from_proc){
    ret = OMPI_ERR_OUT_OF_RESOURCE;
    goto exit;
  }

  start_pos = (int *) calloc(fh->f_size, sizeof(int));
  if ( NULL == start_pos ){
    ret = OMPI_ERR_OUT_OF_RESOURCE;
    goto exit;
  }

  done = 0;
  off = st_loc;
  for_curr_iter = for_next_iter = 0;

  ompi_datatype_type_extent(datatype, &buftype_extent);

  for (m=0; m<ntimes; m++) {

    /* Bytes to read in this cycle, and the window the buffer covers once
       the bytes carried over from the previous cycle are included. */
    size = OMPIO_MIN((unsigned)mca_fcoll_two_phase_cycle_buffer_size, end_loc-st_loc+1-done);
    real_off = off - for_curr_iter;
    real_size = size + for_curr_iter;

    for (i=0; i<fh->f_size; i++) count[i] = send_size[i] = 0;
    for_next_iter = 0;

    for (i=0; i<fh->f_size; i++) {
      if (others_req[i].count) {
        start_pos[i] = curr_offlen_ptr[i];
        for (j=curr_offlen_ptr[i]; j<others_req[i].count;
             j++) {
          if (partial_send[i]) {
            /* this request may have been partially
               satisfied in the previous iteration. */
            req_off = others_req[i].offsets[j] +
              partial_send[i];
            req_len = others_req[i].lens[j] -
              partial_send[i];
            partial_send[i] = 0;
            /* modify the off-len pair to reflect this change */
            others_req[i].offsets[j] = req_off;
            others_req[i].lens[j] = req_len;
          }
          else {
            req_off = others_req[i].offsets[j];
            req_len = others_req[i].lens[j];
          }
          if (req_off < real_off + real_size) {
            count[i]++;
            MPI_Address(read_buf+req_off-real_off,
                        &(others_req[i].mem_ptrs[j]));

            send_size[i] += (int)(OMPIO_MIN(real_off + real_size - req_off,
                                            (OMPI_MPI_OFFSET_TYPE)req_len));

            if (real_off+real_size-req_off < (OMPI_MPI_OFFSET_TYPE)req_len) {
              /* The request runs past the window: remember how much of
                 it is served now and finish it in the next cycle. */
              partial_send[i] = (int) (real_off + real_size - req_off);
              if ((j+1 < others_req[i].count) &&
                  (others_req[i].offsets[j+1] <
                   real_off+real_size)) {
                /* The following request also starts inside the current
                   window, so the tail of the buffer from that offset on
                   has to be kept for the next cycle. */
                for_next_iter = OMPIO_MAX(for_next_iter,
                                          real_off + real_size - others_req[i].offsets[j+1]);
                /* max because it must cover requests
                   from different processes */
              }
              break;
            }
          }
          else break;
        }
        curr_offlen_ptr[i] = j;
      }
    }

    /* Only touch the file when at least one request falls in the window. */
    flag = 0;
    for (i=0; i<fh->f_size; i++)
      if (count[i]) flag = 1;

    if (flag) {

#if TIME_BREAKDOWN
        start_read_time = MPI_Wtime();
#endif

      len = size * byte_size;
      fh->f_io_array = (mca_io_ompio_io_array_t *)calloc
        (1,sizeof(mca_io_ompio_io_array_t));
      if (NULL == fh->f_io_array) {
        opal_output(1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
      }
      /* New data lands after the bytes carried over from the last cycle. */
      fh->f_io_array[0].offset = (IOVBASE_TYPE *)(intptr_t)off;
      fh->f_io_array[0].length = len;
      fh->f_io_array[0].memory_address =
        read_buf+for_curr_iter;
      fh->f_num_of_io_entries = 1;

      if (OMPI_SUCCESS != fh->f_fbtl->fbtl_preadv (fh, NULL)) {
        opal_output(1, "READ FAILED\n");
        /* Leave the file handle without a dangling io array. */
        fh->f_num_of_io_entries = 0;
        free (fh->f_io_array);
        fh->f_io_array = NULL;
        ret = OMPI_ERROR;
        goto exit;
      }

      fh->f_num_of_io_entries = 0;
      free (fh->f_io_array);
      fh->f_io_array = NULL;

#if TIME_BREAKDOWN
        end_read_time = MPI_Wtime();
        read_time += (end_read_time - start_read_time);
#endif

    }

    for_curr_iter = for_next_iter;

    for (i=0; i< fh->f_size; i++){
      recv_size[i]  = 0;
    }
    two_phase_exchange_data(fh, buf, offset_len,
                            send_size, start_pos, recv_size, count,
                            partial_send, recd_from_proc,
                            contig_access_count,
                            min_st_offset, fd_size, fd_start, fd_end,
                            flat_buf, others_req, m, buf_idx,
                            buftype_extent, striping_unit, aggregator_list);

    if (for_next_iter){
      /* Build the buffer for the next cycle: the carried-over tail of the
         current window first, followed by room for one full cycle. */
      tmp_buf = (char *) malloc (for_next_iter+mca_fcoll_two_phase_cycle_buffer_size);
      if ( NULL == tmp_buf ){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
      }
      memcpy(tmp_buf,
             read_buf+real_size-for_next_iter,
             for_next_iter);
      free(read_buf);
      read_buf = tmp_buf;
      tmp_buf = NULL;
    }

    off += size;
    done += size;
  }

  /* Take part in the remaining exchange rounds with nothing to send. */
  for (i=0; i<fh->f_size; i++) count[i] = send_size[i] = 0;
  for (m=ntimes; m<max_ntimes; m++)
    two_phase_exchange_data(fh, buf, offset_len, send_size,
                            start_pos, recv_size, count,
                            partial_send, recd_from_proc,
                            contig_access_count,
                            min_st_offset, fd_size, fd_start, fd_end,
                            flat_buf, others_req, m, buf_idx,
                            buftype_extent, striping_unit, aggregator_list);

 exit:
  /* Single cleanup point; free(NULL) is a no-op, so buffers that were
     never allocated need no guard. */
  free(read_buf);
  free(curr_offlen_ptr);
  free(count);
  free(partial_send);
  free(send_size);
  free(recv_size);
  free(recd_from_proc);
  free(start_pos);

  return ret;

}
/*
 * Write the file range other processes want written through this process,
 * in cycles of at most mca_fcoll_two_phase_cycle_buffer_size bytes.  Each
 * cycle first calls two_phase_exchage_data() with the cycle buffer and
 * then, if any request falls into the cycle, writes the buffer to the
 * file.
 *
 * others_req[i] lists the (offset, length) pairs process i wants written;
 * offsets/lens are modified in place when a request is only partially
 * covered by a cycle, and mem_ptrs[j] receives the address inside the
 * cycle buffer where request j begins.
 *
 * The number of cycles is agreed on with an allreduce (MPI_MAX) so that a
 * process with fewer (or zero) cycles still calls
 * two_phase_exchage_data() as often as everybody else.
 *
 * All remaining parameters are forwarded untouched to
 * two_phase_exchage_data().
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE on allocation failure,
 * OMPI_ERROR when the write fails, or the error code returned by the
 * allreduce or by two_phase_exchage_data().  Every exit path releases all
 * buffers allocated here.
 */
static int two_phase_exch_and_write(mca_io_ompio_file_t *fh,
                                    void *buf,
                                    MPI_Datatype datatype,
                                    mca_io_ompio_access_array_t *others_req,
                                    struct iovec *offset_len,
                                    int contig_access_count,
                                    OMPI_MPI_OFFSET_TYPE min_st_offset,
                                    OMPI_MPI_OFFSET_TYPE fd_size,
                                    OMPI_MPI_OFFSET_TYPE *fd_start,
                                    OMPI_MPI_OFFSET_TYPE *fd_end,
                                    Flatlist_node *flat_buf,
                                    size_t *buf_idx, int striping_unit,
                                    int *aggregator_list)

{


    int i, j, ntimes, max_ntimes, m;
    int *curr_offlen_ptr=NULL, *count=NULL, *send_size=NULL, *recv_size=NULL;
    int *partial_recv=NULL, *start_pos=NULL, req_len, flag;
    int *sent_to_proc=NULL, ret = OMPI_SUCCESS;
    int *send_buf_idx=NULL, *curr_to_proc=NULL, *done_to_proc=NULL;
    OMPI_MPI_OFFSET_TYPE st_loc=-1, end_loc=-1, off, done;
    OMPI_MPI_OFFSET_TYPE size=0, req_off, len;
    MPI_Aint buftype_extent;
    int  hole;
    size_t byte_size;
    MPI_Datatype byte = MPI_BYTE;
    #if DEBUG_ON
    int ii,jj;
    #endif

    char *write_buf=NULL;


    opal_datatype_type_size(&byte->super,
                            &byte_size);

    /* Seed st_loc/end_loc from the first process that has any request;
       both stay at -1 when there is nothing to write here. */
    for (i = 0; i < fh->f_size; i++){
        if (others_req[i].count) {
            st_loc = others_req[i].offsets[0];
            end_loc = others_req[i].offsets[0];
            break;
        }
    }

    /* Lowest requested offset and highest requested byte. */
    for (i=0;i<fh->f_size;i++){
        for(j=0;j< others_req[i].count; j++){
            st_loc = OMPIO_MIN(st_loc, others_req[i].offsets[j]);
            end_loc = OMPIO_MAX(end_loc, (others_req[i].offsets[j] + others_req[i].lens[j] - 1));

        }
    }


    /* Number of cycles: ceil((end_loc - st_loc + 1) / cycle_buffer_size). */
    ntimes = (int) ((end_loc - st_loc + mca_fcoll_two_phase_cycle_buffer_size)/mca_fcoll_two_phase_cycle_buffer_size);

    if ((st_loc == -1) && (end_loc == -1)) {
        ntimes = 0;
    }

    ret = fh->f_comm->c_coll.coll_allreduce (&ntimes,
                                             &max_ntimes,
                                             1,
                                             MPI_INT,
                                             MPI_MAX,
                                             fh->f_comm,
                                             fh->f_comm->c_coll.coll_allreduce_module);
    if ( OMPI_SUCCESS != ret ){
        goto exit;
    }

    /* Every allocation failure goes through the common cleanup at exit
       so that the buffers obtained before it are released. */
    if (ntimes){
        write_buf = (char *) malloc (mca_fcoll_two_phase_cycle_buffer_size);
        if ( NULL == write_buf ){
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

    curr_offlen_ptr = (int *) calloc(fh->f_size, sizeof(int));
    if ( NULL == curr_offlen_ptr ){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    count = (int *) malloc(fh->f_size*sizeof(int));
    if ( NULL == count ){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    partial_recv = (int *)calloc(fh->f_size, sizeof(int));
    if ( NULL == partial_recv ){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    send_size = (int *) calloc(fh->f_size,sizeof(int));
    if ( NULL == send_size ){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    recv_size = (int *) calloc(fh->f_size,sizeof(int));
    if ( NULL == recv_size ){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    send_buf_idx = (int *) malloc(fh->f_size*sizeof(int));
    if ( NULL == send_buf_idx ){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    sent_to_proc = (int *) calloc(fh->f_size, sizeof(int));
    if ( NULL == sent_to_proc){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    curr_to_proc = (int *) malloc(fh->f_size*sizeof(int));
    if ( NULL == curr_to_proc ){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    done_to_proc = (int *) malloc(fh->f_size*sizeof(int));
    if ( NULL == done_to_proc ){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    start_pos = (int *) malloc(fh->f_size*sizeof(int));
    if ( NULL == start_pos ){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }


    done = 0;
    off = st_loc;

    ompi_datatype_type_extent(datatype, &buftype_extent);
    for (m=0;m <ntimes; m++){
        for (i=0; i< fh->f_size; i++) count[i] = recv_size[i] = 0;

        /* Bytes covered by this cycle: [off, off + size). */
        size = OMPIO_MIN((unsigned)mca_fcoll_two_phase_cycle_buffer_size,
                         end_loc-st_loc+1-done);
        for (i=0;i<fh->f_size;i++){
            if(others_req[i].count){
                start_pos[i] = curr_offlen_ptr[i];
                for (j=curr_offlen_ptr[i]; j<others_req[i].count; j++) {
                    if (partial_recv[i]) {
                        /* this request may have been partially
                           satisfied in the previous iteration. */
                        req_off = others_req[i].offsets[j] +
                            partial_recv[i];
                        req_len = others_req[i].lens[j] -
                            partial_recv[i];
                        partial_recv[i] = 0;
                        /* modify the off-len pair to reflect this change */
                        others_req[i].offsets[j] = req_off;
                        others_req[i].lens[j] = req_len;
                    }
                    else {
                        req_off = others_req[i].offsets[j];
                        req_len = others_req[i].lens[j];
                    }
                    if (req_off < off + size) {
                        count[i]++;
                        #if DEBUG_ON
                        printf("%d: req_off : %lld, off : %lld, size : %lld, count[%d]: %d\n", fh->f_rank,
                               req_off,
                               off,
                               size,i,
                               count[i]);
                        #endif
                        MPI_Address(write_buf+req_off-off,
                                    &(others_req[i].mem_ptrs[j]));
                        #if DEBUG_ON
                        printf("%d : mem_ptrs : %ld\n", fh->f_rank,
                               others_req[i].mem_ptrs[j]);
                        #endif
                        recv_size[i] += (int) (OMPIO_MIN(off + size - req_off,
                                                         (unsigned)req_len));

                        if (off+size-req_off < (unsigned)req_len){
                            /* The request runs past this cycle: remember
                               how much of it is handled now and finish it
                               in the next cycle. */
                            partial_recv[i] = (int)(off + size - req_off);
                            break;
                        }
                    }
                    else break;
                }
                curr_offlen_ptr[i] = j;
            }
        }

        ret = two_phase_exchage_data(fh, buf, write_buf,
                                     offset_len,send_size,
                                     start_pos,recv_size,off,size,
                                     count, partial_recv, sent_to_proc,
                                     contig_access_count,
                                     min_st_offset,
                                     fd_size, fd_start,
                                     fd_end, flat_buf, others_req,
                                     send_buf_idx, curr_to_proc,
                                     done_to_proc, m, buf_idx,
                                     buftype_extent, striping_unit,
                                     aggregator_list, &hole);

        if ( OMPI_SUCCESS != ret ){
            goto exit;
        }

        /* Only touch the file when at least one request falls in the cycle. */
        flag = 0;
        for (i=0; i<fh->f_size; i++)
            if (count[i]) flag = 1;

        if (flag){

#if TIME_BREAKDOWN
            start_write_time = MPI_Wtime();
#endif

            #if DEBUG_ON
            printf("rank : %d enters writing\n", fh->f_rank);
            printf("size : %ld, off : %ld\n",size, off);
            for (ii=0, jj=0;jj<size;jj+=4, ii++){
                printf("%d : write_buf[%d]: %d\n", fh->f_rank, ii,((int *)write_buf[jj]));
            }
            #endif
            len = size * byte_size;
            fh->f_io_array = (mca_io_ompio_io_array_t *)malloc
                (sizeof(mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            fh->f_io_array[0].offset  =(IOVBASE_TYPE *)(intptr_t) off;
            fh->f_io_array[0].length = len;
            fh->f_io_array[0].memory_address = write_buf;
            fh->f_num_of_io_entries = 1;

            #if DEBUG_ON
            for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
                printf("%d: ADDRESS: %p  OFFSET: %ld   LENGTH: %d\n",
                       fh->f_rank,
                       fh->f_io_array[i].memory_address,
                       fh->f_io_array[i].offset,
                       fh->f_io_array[i].length);
            }
            #endif

            if (OMPI_SUCCESS != fh->f_fbtl->fbtl_pwritev (fh, NULL)) {
                opal_output(1, "WRITE FAILED\n");
                /* Leave the file handle without a dangling io array. */
                fh->f_num_of_io_entries = 0;
                free (fh->f_io_array);
                fh->f_io_array = NULL;
                ret = OMPI_ERROR;
                goto exit;
            }
#if TIME_BREAKDOWN
              end_write_time = MPI_Wtime();
              write_time += (end_write_time - start_write_time);
#endif

        }
        /***************** DONE WRITING *****************************************/
        /****RESET **********************/
        fh->f_num_of_io_entries = 0;
        if (NULL != fh->f_io_array) {
            free (fh->f_io_array);
            fh->f_io_array = NULL;
        }

        off += size;
        done += size;

    }

    /* Take part in the remaining exchange rounds with nothing to receive;
       off and size keep whatever value the last cycle left in them. */
    for (i=0; i<fh->f_size; i++) count[i] = recv_size[i] = 0;
    for (m=ntimes; m<max_ntimes; m++) {
        ret = two_phase_exchage_data(fh, buf, write_buf,
                                     offset_len,send_size,
                                     start_pos,recv_size,off,size,
                                     count, partial_recv, sent_to_proc,
                                     contig_access_count,
                                     min_st_offset,
                                     fd_size, fd_start,
                                     fd_end, flat_buf,others_req,
                                     send_buf_idx, curr_to_proc,
                                     done_to_proc, m, buf_idx,
                                     buftype_extent, striping_unit,
                                     aggregator_list, &hole);
        if ( OMPI_SUCCESS != ret ){
            goto exit;
        }
    }

 exit:
    /* Single cleanup point; free(NULL) is a no-op, so buffers that were
       never allocated need no guard. */
    free(write_buf);
    free(curr_offlen_ptr);
    free(count);
    free(partial_recv);
    free(send_size);
    free(recv_size);
    free(sent_to_proc);
    free(start_pos);
    free(send_buf_idx);
    free(curr_to_proc);
    free(done_to_proc);

    return ret;
}
/*
 * Collective read entry point of the two_phase fcoll component.
 *
 * Steps performed, in order:
 *   1. Unless the file handle is flagged OMPIO_CONTIGUOUS_MEMORY, decode
 *      the user datatype into an iovec list and rebase every entry to an
 *      offset relative to buf.
 *   2. Determine the number of aggregators and pick their ranks evenly
 *      spaced over the communicator (i * f_size / num_aggregators).
 *   3. Generate the current file view and allgather every process' first
 *      and last accessed file offset.
 *   4. Partition the accessed range into file domains, compute this
 *      process' requests per aggregator and the requests other processes
 *      have for it.
 *   5. Call two_phase_read_and_exch() to do the reading and exchange.
 *
 * fh        file handle.
 * buf       user receive buffer.
 * count     number of datatype elements to read.
 * datatype  datatype describing the layout of buf.
 * status    receives the byte count in _ucount unless MPI_STATUS_IGNORE.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE on allocation failure, or
 * the error code of the first helper/collective that failed.
 */
int
mca_fcoll_two_phase_file_read_all (mca_io_ompio_file_t *fh,
                                   void *buf,
                                   int count,
                                   struct ompi_datatype_t *datatype,
                                   ompi_status_public_t *status)
{

    int ret = OMPI_SUCCESS, i = 0, j = 0, interleave_count = 0, striping_unit = 0;
    MPI_Aint recv_buf_addr = 0;
    uint32_t iov_count = 0, ti = 0;
    struct iovec *decoded_iov = NULL, *temp_iov = NULL, *iov = NULL;
    size_t max_data = 0;
    long long_max_data = 0, long_total_bytes = 0;
    int domain_size=0, *count_my_req_per_proc=NULL, count_my_req_procs = 0;
    int count_other_req_procs;
    size_t *buf_indices=NULL;
    int *aggregator_list = NULL, local_count = 0, local_size = 0;
    int two_phase_num_io_procs=1;
    OMPI_MPI_OFFSET_TYPE start_offset = 0, end_offset = 0, fd_size = 0;
    OMPI_MPI_OFFSET_TYPE *start_offsets=NULL, *end_offsets=NULL;
    OMPI_MPI_OFFSET_TYPE *fd_start=NULL, *fd_end=NULL, min_st_offset = 0;
    Flatlist_node *flat_buf=NULL;
    mca_io_ompio_access_array_t *my_req=NULL, *others_req=NULL;
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    mca_common_ompio_print_entry nentry;
#endif
//    if (opal_datatype_is_predefined(&datatype->super)) {
//	fh->f_flags = fh->f_flags |  OMPIO_CONTIGUOUS_MEMORY;
//    }

    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
        ret =   fh->f_decode_datatype ((struct mca_io_ompio_file_t *)fh,
                                       datatype,
                                       count,
                                       buf,
                                       &max_data,
                                       &temp_iov,
                                       &iov_count);
        if (OMPI_SUCCESS != ret ){
            goto exit;
        }

        recv_buf_addr = (size_t)(buf);
        decoded_iov  = (struct iovec *) calloc
            (iov_count, sizeof(struct iovec));
        /* A zero-sized request may legitimately yield NULL, so only treat
           NULL as out-of-memory when entries were actually requested. */
        if ( NULL == decoded_iov && 0 < iov_count ){
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        /* Store each segment as an offset relative to the start of buf. */
        for (ti = 0; ti < iov_count; ti++){

            decoded_iov[ti].iov_base = (IOVBASE_TYPE *)
                ((OPAL_PTRDIFF_TYPE)temp_iov[ti].iov_base - recv_buf_addr);
            decoded_iov[ti].iov_len = temp_iov[ti].iov_len;
#if DEBUG
            printf("d_offset[%d]: %ld, d_len[%d]: %ld\n",
                   ti, (OPAL_PTRDIFF_TYPE)decoded_iov[ti].iov_base,
                   ti, decoded_iov[ti].iov_len);
#endif
        }

    }
    else{
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
        status->_ucount = max_data;
    }

    /* -1 means the aggregator count has not been fixed yet; have it
       computed and take the resulting value from the file handle. */
    fh->f_get_num_aggregators (&two_phase_num_io_procs);
    if (-1 == two_phase_num_io_procs ){
        ret = fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *)fh,
                                          two_phase_num_io_procs,
                                          max_data);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }

        two_phase_num_io_procs = fh->f_final_num_aggrs;

    }

    if (two_phase_num_io_procs > fh->f_size){
        two_phase_num_io_procs = fh->f_size;
    }

    aggregator_list = (int *) calloc (two_phase_num_io_procs, sizeof(int));
    if (NULL == aggregator_list){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    /* Spread the aggregator ranks evenly over the communicator. */
    for (i=0; i< two_phase_num_io_procs; i++){
        aggregator_list[i] = i * fh->f_size / two_phase_num_io_procs;
    }

    ret = fh->f_generate_current_file_view ((struct mca_io_ompio_file_t *)fh,
                                            max_data,
                                            &iov,
                                            &local_count);

    if (OMPI_SUCCESS != ret){
        goto exit;
    }

    long_max_data = (long) max_data;
    ret = fh->f_comm->c_coll.coll_allreduce (&long_max_data,
                                             &long_total_bytes,
                                             1,
                                             MPI_LONG,
                                             MPI_SUM,
                                             fh->f_comm,
                                             fh->f_comm->c_coll.coll_allreduce_module);

    if ( OMPI_SUCCESS != ret ) {
        goto exit;
    }

    if (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {

        /* This data structure translates between OMPIO and ROMIO; it is a
           little hacky, but it allows reusing ROMIO's code for handling
           non-contiguous file types.  The flattened datatype held in
           decoded_iov is copied into flat_buf. */

        flat_buf = (Flatlist_node *)calloc(1, sizeof(Flatlist_node));
        if ( NULL == flat_buf ){
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        flat_buf->type = datatype;
        flat_buf->next = NULL;
        flat_buf->count = 0;
        flat_buf->indices = NULL;
        flat_buf->blocklens = NULL;

        if ( 0 < count ) {
            local_size = OMPIO_MAX(1,iov_count/count);
        }
        else {
            local_size = 0;
        }


        if ( 0 < local_size ) {
            flat_buf->indices =
                (OMPI_MPI_OFFSET_TYPE *)calloc(local_size,
                                               sizeof(OMPI_MPI_OFFSET_TYPE));
            if (NULL == flat_buf->indices){
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            flat_buf->blocklens =
                (OMPI_MPI_OFFSET_TYPE *)calloc(local_size,
                                               sizeof(OMPI_MPI_OFFSET_TYPE));
            if ( NULL == flat_buf->blocklens ){
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }
        flat_buf->count = local_size;
        /* local_size is forced to at least 1 when count > 0, so it can
           exceed iov_count when iov_count is 0; never read past the end
           of decoded_iov (the calloc'ed entries then stay zero). */
        for (j = 0 ; j < local_size && (uint32_t)j < iov_count ; ++j) {
            flat_buf->indices[j] = (OMPI_MPI_OFFSET_TYPE)(intptr_t)decoded_iov[j].iov_base;
            flat_buf->blocklens[j] = decoded_iov[j].iov_len;
        }

#if DEBUG
        printf("flat_buf count: %d\n",
               flat_buf->count);
        for(i=0;i<flat_buf->count;i++){
            printf("%d: blocklen[%d] : %lld, indices[%d]: %lld\n",
                   fh->f_rank, i, flat_buf->blocklens[i], i ,flat_buf->indices[i]);
        }
#endif
    }

#if DEBUG
    printf("%d: total_bytes:%ld, local_count: %d\n",
           fh->f_rank, long_total_bytes, local_count);
    for (i=0 ; i<local_count ; i++) {
        printf("%d: fcoll:two_phase:read_all:OFFSET:%ld,LENGTH:%ld\n",
               fh->f_rank,
               (size_t)iov[i].iov_base,
               (size_t)iov[i].iov_len);
    }
#endif

    /* First and last file offset accessed by this process.
       NOTE(review): iov[0] is read even when local_count is 0; this
       presumes f_generate_current_file_view always returns at least one
       entry -- confirm against that function. */
    start_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)iov[0].iov_base;
    if ( 0 < local_count ) {
        end_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)iov[local_count-1].iov_base +
            (OMPI_MPI_OFFSET_TYPE)(intptr_t)iov[local_count-1].iov_len - 1;
    }
    else {
        end_offset = 0;
    }
#if DEBUG
    printf("%d: START OFFSET:%ld, END OFFSET:%ld\n",
           fh->f_rank,
           (size_t)start_offset,
           (size_t)end_offset);
#endif

    start_offsets = (OMPI_MPI_OFFSET_TYPE *)calloc
        (fh->f_size, sizeof(OMPI_MPI_OFFSET_TYPE));

    if ( NULL == start_offsets ){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    end_offsets = (OMPI_MPI_OFFSET_TYPE *)calloc
        (fh->f_size, sizeof(OMPI_MPI_OFFSET_TYPE));

    if (NULL == end_offsets){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    /* Every process learns the access range of every other process. */
    ret = fh->f_comm->c_coll.coll_allgather(&start_offset,
                                            1,
                                            OMPI_OFFSET_DATATYPE,
                                            start_offsets,
                                            1,
                                            OMPI_OFFSET_DATATYPE,
                                            fh->f_comm,
                                            fh->f_comm->c_coll.coll_allgather_module);

    if ( OMPI_SUCCESS != ret ){
        goto exit;
    }

    ret = fh->f_comm->c_coll.coll_allgather(&end_offset,
                                            1,
                                            OMPI_OFFSET_DATATYPE,
                                            end_offsets,
                                            1,
                                            OMPI_OFFSET_DATATYPE,
                                            fh->f_comm,
                                            fh->f_comm->c_coll.coll_allgather_module);


    if ( OMPI_SUCCESS != ret ){
        goto exit;
    }

#if DEBUG
    for (i=0;i<fh->f_size;i++){
        printf("%d: start[%d]:%ld,end[%d]:%ld\n",
               fh->f_rank,i,
               (size_t)start_offsets[i],i,
               (size_t)end_offsets[i]);
    }
#endif

    /* Count neighbouring ranks whose ranges overlap; the result is only
       reported by the DEBUG output below. */
    for (i=1; i<fh->f_size; i++){
        if ((start_offsets[i] < end_offsets[i-1]) &&
            (start_offsets[i] <= end_offsets[i])){
            interleave_count++;
        }
    }

#if DEBUG
    printf("%d: interleave_count:%d\n",
           fh->f_rank,interleave_count);
#endif

    ret = mca_fcoll_two_phase_domain_partition(fh,
                                               start_offsets,
                                               end_offsets,
                                               &min_st_offset,
                                               &fd_start,
                                               &fd_end,
                                               domain_size,
                                               &fd_size,
                                               striping_unit,
                                               two_phase_num_io_procs);
    if (OMPI_SUCCESS != ret){
        goto exit;
    }

#if DEBUG
    for (i=0;i<two_phase_num_io_procs;i++){
        printf("fd_start[%d] : %lld, fd_end[%d] : %lld, local_count: %d\n",
               i, fd_start[i], i, fd_end[i], local_count);
    }
#endif

    ret = mca_fcoll_two_phase_calc_my_requests (fh,
                                                iov,
                                                local_count,
                                                min_st_offset,
                                                fd_start,
                                                fd_end,
                                                fd_size,
                                                &count_my_req_procs,
                                                &count_my_req_per_proc,
                                                &my_req,
                                                &buf_indices,
                                                striping_unit,
                                                two_phase_num_io_procs,
                                                aggregator_list);
    if ( OMPI_SUCCESS != ret ){
        goto exit;
    }

    ret = mca_fcoll_two_phase_calc_others_requests(fh,
                                                   count_my_req_procs,
                                                   count_my_req_per_proc,
                                                   my_req,
                                                   &count_other_req_procs,
                                                   &others_req);
    if (OMPI_SUCCESS != ret ){
        goto exit;
    }

#if DEBUG
    printf("%d count_other_req_procs : %d\n",
           fh->f_rank,
           count_other_req_procs);
#endif

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif


    /* NOTE(review): 15 arguments are passed here, including
       two_phase_num_io_procs; confirm this matches the prototype of
       two_phase_read_and_exch in the translation unit being built. */
    ret = two_phase_read_and_exch(fh,
                                  buf,
                                  datatype,
                                  others_req,
                                  iov,
                                  local_count,
                                  min_st_offset,
                                  fd_size,
                                  fd_start,
                                  fd_end,
                                  flat_buf,
                                  buf_indices,
                                  striping_unit,
                                  two_phase_num_io_procs,
                                  aggregator_list);


    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rexch = MPI_Wtime();
    read_exch += (end_rexch - start_rexch);
    nentry.time[0] = read_time;
    nentry.time[1] = rcomm_time;
    nentry.time[2] = read_exch;
    if (isread_aggregator(fh->f_rank,
                          two_phase_num_io_procs,
                          aggregator_list)){
        nentry.aggregator = 1;
    }
    else{
        nentry.aggregator = 0;
    }
    nentry.nprocs_for_coll = two_phase_num_io_procs;


    if (!mca_common_ompio_full_print_queue(fh->f_coll_read_time)){
        mca_common_ompio_register_print_entry(fh->f_coll_read_time,
                                              nentry);
    }
#endif


exit:
    /* Common cleanup; every pointer below is either NULL or owned here.
       NOTE(review): temp_iov, iov and the per-entry arrays inside
       my_req / others_req are not released in this function -- confirm
       who owns them. */
    if (flat_buf != NULL){
        free (flat_buf->blocklens);
        free (flat_buf->indices);
        free (flat_buf);
    }

    free (start_offsets);
    free (end_offsets);
    free (aggregator_list);
    free (fd_start);
    free (decoded_iov);
    free (buf_indices);
    free (count_my_req_per_proc);
    free (my_req);
    free (others_req);
    free (fd_end);

    return ret;
}