/*
 * Split the byte range accessed by all processes into nprocs_for_coll
 * contiguous file domains.
 *
 * fh                 file handle; only fh->f_size is read here, as the
 *                    number of entries in start_offsets / end_offsets
 * start_offsets      per-process first accessed offset
 * end_offsets        per-process last accessed offset
 * min_st_offset_ptr  [out] smallest value found in start_offsets
 * fd_st_ptr          [out] malloc'ed array of nprocs_for_coll domain starts
 * fd_end_ptr         [out] malloc'ed array of nprocs_for_coll domain ends
 *                    (inclusive)
 * min_fd_size        lower bound applied to the computed domain size
 * fd_size_ptr        [out] domain size that was used
 * striping_unit      when > 0, the end of every domain except the last is
 *                    rounded to the nearest multiple of this value
 * nprocs_for_coll    number of domains to create (used as a divisor, so it
 *                    must be non-zero)
 *
 * Domains that would begin past the largest end offset are marked with
 * fd_start[i] = fd_end[i] = -1; domain ends are clipped to the largest end
 * offset.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE if either array cannot
 * be allocated.  On failure nothing stays allocated and both *fd_st_ptr and
 * *fd_end_ptr are set to NULL.  On success this function does not free the
 * two arrays; they are handed to the caller through the out parameters.
 */
int mca_fcoll_two_phase_domain_partition (mca_io_ompio_file_t *fh,
					  OMPI_MPI_OFFSET_TYPE *start_offsets,
					  OMPI_MPI_OFFSET_TYPE *end_offsets,
					  OMPI_MPI_OFFSET_TYPE *min_st_offset_ptr,
					  OMPI_MPI_OFFSET_TYPE **fd_st_ptr,
					  OMPI_MPI_OFFSET_TYPE **fd_end_ptr,
					  int min_fd_size,
					  OMPI_MPI_OFFSET_TYPE *fd_size_ptr,
					  int striping_unit,
					  int nprocs_for_coll){

    OMPI_MPI_OFFSET_TYPE min_st_offset, max_end_offset, fd_size;
    OMPI_MPI_OFFSET_TYPE *fd_start = NULL, *fd_end = NULL;
    int i;

    /* Overall [min_st_offset, max_end_offset] range over all processes. */
    min_st_offset = start_offsets[0];
    max_end_offset = end_offsets[0];
    for (i = 1; i < fh->f_size; i++) {
        min_st_offset = OMPIO_MIN(min_st_offset, start_offsets[i]);
        max_end_offset = OMPIO_MAX(max_end_offset, end_offsets[i]);
    }

    /* Range length divided by nprocs_for_coll, rounded up, but never
       smaller than min_fd_size. */
    fd_size = ((max_end_offset - min_st_offset + 1) + nprocs_for_coll - 1)/nprocs_for_coll;
    if (fd_size < min_fd_size) {
        fd_size = min_fd_size;
    }

    /* Allocate both arrays before publishing either one, so that a failure
       of the second allocation cannot leave the first one behind. */
    fd_start = (OMPI_MPI_OFFSET_TYPE *)
        malloc(nprocs_for_coll*sizeof(OMPI_MPI_OFFSET_TYPE));
    fd_end = (OMPI_MPI_OFFSET_TYPE *)
        malloc(nprocs_for_coll*sizeof(OMPI_MPI_OFFSET_TYPE));

    if (NULL == fd_start || NULL == fd_end) {
        free(fd_start);
        free(fd_end);
        *fd_st_ptr = NULL;
        *fd_end_ptr = NULL;
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    *fd_st_ptr = fd_start;
    *fd_end_ptr = fd_end;

    if (striping_unit > 0) {
        /* Lock Boundary based domain partitioning */
        int rem_front, rem_back;
        OMPI_MPI_OFFSET_TYPE end_off;

        /* align fd_end[i] to the nearest file lock boundary */
        for (i = 0; i < nprocs_for_coll; i++) {
            fd_start[i] = (0 == i) ? min_st_offset : fd_end[i-1] + 1;
            end_off     = min_st_offset + fd_size * (i+1);
            rem_front   = end_off % striping_unit;
            rem_back    = striping_unit - rem_front;
            if (rem_front < rem_back) {
                end_off -= rem_front;   /* closer to the boundary below */
            }
            else {
                end_off += rem_back;    /* closer to the boundary above */
            }
            fd_end[i] = end_off - 1;
        }
        /* The last domain always ends exactly at the last accessed byte. */
        fd_end[nprocs_for_coll-1] = max_end_offset;
    }
    else {
        /* Plain equal-sized domains laid out back to back. */
        for (i = 0; i < nprocs_for_coll; i++) {
            fd_start[i] = (0 == i) ? min_st_offset : fd_end[i-1] + 1;
            fd_end[i]   = fd_start[i] + fd_size - 1;
        }
    }

    /* Mark domains that lie completely past the accessed range as empty and
       clip the ones that only overshoot it. */
    for (i = 0; i < nprocs_for_coll; i++) {
        if (fd_start[i] > max_end_offset) {
            fd_start[i] = fd_end[i] = -1;
        }
        if (fd_end[i] > max_end_offset) {
            fd_end[i] = max_end_offset;
        }
    }

    *fd_size_ptr = fd_size;
    *min_st_offset_ptr = min_st_offset;

    return OMPI_SUCCESS;
}
/*
 * Write side of the two-phase exchange: repeatedly gather data into a cycle
 * buffer via two_phase_exchage_data() and write that buffer to the file.
 *
 * The range [st_loc, end_loc] spanned by all entries of others_req is
 * processed in windows of at most mca_fcoll_two_phase_cycle_buffer_size
 * bytes.  The local number of windows (ntimes) is combined over fh->f_comm
 * with an allreduce (MPI_MAX); after its own windows are done this process
 * keeps calling two_phase_exchage_data() with zeroed count / recv_size
 * until max_ntimes calls have been made.
 *
 * For every window, the entries of others_req[i] that begin inside the
 * window are counted in count[i], their target address inside write_buf is
 * stored in others_req[i].mem_ptrs[], and the number of bytes falling in
 * the window is accumulated in recv_size[i].  An entry that extends past
 * the window is remembered in partial_recv[i] and its offset/len pair is
 * advanced in place at the start of the next window.  If any count[i] is
 * non-zero, `size` bytes starting at file offset `off` are written from
 * write_buf through fh->f_fbtl->fbtl_pwritev().
 *
 * NOTE(review): others_req is presumably indexed by rank and holds the
 * regions of that rank that fall into this process's file domain - confirm
 * against the caller.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE on allocation failure,
 * OMPI_ERROR when the write fails, or the error code returned by the
 * allreduce or by two_phase_exchage_data().  Every buffer allocated in this
 * function is released on all paths.
 */
static int two_phase_exch_and_write(mca_io_ompio_file_t *fh,
				    void *buf,
				    MPI_Datatype datatype,
				    mca_io_ompio_access_array_t *others_req,
				    struct iovec *offset_len,
				    int contig_access_count,
				    OMPI_MPI_OFFSET_TYPE min_st_offset,
				    OMPI_MPI_OFFSET_TYPE fd_size,
				    OMPI_MPI_OFFSET_TYPE *fd_start,
				    OMPI_MPI_OFFSET_TYPE *fd_end,
				    Flatlist_node *flat_buf,
				    size_t *buf_idx, int striping_unit,
				    int *aggregator_list)

{
    int i, j, ntimes = 0, max_ntimes = 0, m;
    int *curr_offlen_ptr=NULL, *count=NULL, *send_size=NULL, *recv_size=NULL;
    int *partial_recv=NULL, *start_pos=NULL, req_len, flag;
    int *sent_to_proc=NULL, ret = OMPI_SUCCESS;
    int *send_buf_idx=NULL, *curr_to_proc=NULL, *done_to_proc=NULL;
    OMPI_MPI_OFFSET_TYPE st_loc=-1, end_loc=-1, off, done;
    OMPI_MPI_OFFSET_TYPE size=0, req_off, len;
    MPI_Aint buftype_extent;
    int  hole;
    size_t byte_size;
    MPI_Datatype byte = MPI_BYTE;
    #if DEBUG_ON
    int ii,jj;
    #endif

    char *write_buf=NULL;


    opal_datatype_type_size(&byte->super,
                            &byte_size);

    /* Seed st_loc / end_loc from the first non-empty request list; both
       stay at -1 when there is no request at all. */
    for (i = 0; i < fh->f_size; i++){
        if (others_req[i].count) {
            st_loc = others_req[i].offsets[0];
            end_loc = others_req[i].offsets[0];
            break;
        }
    }

    for (i=0;i<fh->f_size;i++){
        for(j=0;j< others_req[i].count; j++){
            st_loc = OMPIO_MIN(st_loc, others_req[i].offsets[j]);
            end_loc = OMPIO_MAX(end_loc, (others_req[i].offsets[j] + others_req[i].lens[j] - 1));
        }
    }

    /* Number of cycle-buffer sized windows needed to cover the range. */
    ntimes = (int) ((end_loc - st_loc + mca_fcoll_two_phase_cycle_buffer_size)/mca_fcoll_two_phase_cycle_buffer_size);

    if ((st_loc == -1) && (end_loc == -1)) {
        ntimes = 0;
    }

    ret = fh->f_comm->c_coll.coll_allreduce (&ntimes,
                                             &max_ntimes,
                                             1,
                                             MPI_INT,
                                             MPI_MAX,
                                             fh->f_comm,
                                             fh->f_comm->c_coll.coll_allreduce_module);
    if ( OMPI_SUCCESS != ret ){
        goto exit;
    }

    if (ntimes){
        write_buf = (char *) malloc (mca_fcoll_two_phase_cycle_buffer_size);
        if ( NULL == write_buf ){
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

    /* Per-process scratch arrays.  They are checked together so that a
       single failure path releases whatever was obtained. */
    curr_offlen_ptr = (int *) calloc(fh->f_size, sizeof(int));
    count           = (int *) malloc(fh->f_size*sizeof(int));
    partial_recv    = (int *) calloc(fh->f_size, sizeof(int));
    send_size       = (int *) calloc(fh->f_size,sizeof(int));
    recv_size       = (int *) calloc(fh->f_size,sizeof(int));
    send_buf_idx    = (int *) malloc(fh->f_size*sizeof(int));
    sent_to_proc    = (int *) calloc(fh->f_size, sizeof(int));
    curr_to_proc    = (int *) malloc(fh->f_size*sizeof(int));
    done_to_proc    = (int *) malloc(fh->f_size*sizeof(int));
    start_pos       = (int *) malloc(fh->f_size*sizeof(int));

    if ( NULL == curr_offlen_ptr || NULL == count        ||
         NULL == partial_recv    || NULL == send_size    ||
         NULL == recv_size       || NULL == send_buf_idx ||
         NULL == sent_to_proc    || NULL == curr_to_proc ||
         NULL == done_to_proc    || NULL == start_pos ){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }


    done = 0;
    off = st_loc;

    ompi_datatype_type_extent(datatype, &buftype_extent);
    for (m=0;m <ntimes; m++){
        for (i=0; i< fh->f_size; i++) count[i] = recv_size[i] = 0;

        /* Window handled in this pass: [off, off+size). */
        size = OMPIO_MIN((unsigned)mca_fcoll_two_phase_cycle_buffer_size,
                         end_loc-st_loc+1-done);
        for (i=0;i<fh->f_size;i++){
            if(others_req[i].count){
                start_pos[i] = curr_offlen_ptr[i];
                for (j=curr_offlen_ptr[i]; j<others_req[i].count; j++) {
                    if (partial_recv[i]) {
                        /* this request may have been partially
                           satisfied in the previous iteration. */
                        req_off = others_req[i].offsets[j] +
                            partial_recv[i];
                        req_len = others_req[i].lens[j] -
                            partial_recv[i];
                        partial_recv[i] = 0;
                        /* modify the off-len pair to reflect this change */
                        others_req[i].offsets[j] = req_off;
                        others_req[i].lens[j] = req_len;
                    }
                    else {
                        req_off = others_req[i].offsets[j];
                        req_len = others_req[i].lens[j];
                    }
                    if (req_off < off + size) {
                        count[i]++;
                        #if DEBUG_ON
                        printf("%d: req_off : %lld, off : %lld, size : %lld, count[%d]: %d\n", fh->f_rank,
                               req_off,
                               off,
                               size,i,
                               count[i]);
                        #endif
                        /* Record where inside write_buf this piece lands. */
                        MPI_Address(write_buf+req_off-off,
                                    &(others_req[i].mem_ptrs[j]));
                        #if DEBUG_ON
                        printf("%d : mem_ptrs : %ld\n", fh->f_rank,
                               others_req[i].mem_ptrs[j]);
                        #endif
                        recv_size[i] += (int) (OMPIO_MIN(off + size - req_off,
                                                         (unsigned)req_len));

                        if (off+size-req_off < (unsigned)req_len){
                            /* Request crosses the end of the window; the
                               remainder is picked up in the next pass. */
                            partial_recv[i] = (int)(off + size - req_off);
                            break;
                        }
                    }
                    else break;
                }
                curr_offlen_ptr[i] = j;
            }
        }

        ret = two_phase_exchage_data(fh, buf, write_buf,
                                     offset_len,send_size,
                                     start_pos,recv_size,off,size,
                                     count, partial_recv, sent_to_proc,
                                     contig_access_count,
                                     min_st_offset,
                                     fd_size, fd_start,
                                     fd_end, flat_buf, others_req,
                                     send_buf_idx, curr_to_proc,
                                     done_to_proc, m, buf_idx,
                                     buftype_extent, striping_unit,
                                     aggregator_list, &hole);

        if ( OMPI_SUCCESS != ret ){
            goto exit;
        }

        /* Only touch the file if at least one request fell in the window. */
        flag = 0;
        for (i=0; i<fh->f_size; i++)
            if (count[i]) flag = 1;

        if (flag){

#if TIME_BREAKDOWN
            start_write_time = MPI_Wtime();
#endif

            /* NOTE(review): the debug dump below passes a char cast to
               (int *) to a %d conversion; it is kept as found. */
            #if DEBUG_ON
            printf("rank : %d enters writing\n", fh->f_rank);
            printf("size : %ld, off : %ld\n",size, off);
            for (ii=0, jj=0;jj<size;jj+=4, ii++){
                printf("%d : write_buf[%d]: %d\n", fh->f_rank, ii,((int *)write_buf[jj]));
            }
            #endif
            len = size * byte_size;
            fh->f_io_array = (mca_io_ompio_io_array_t *)malloc
                (sizeof(mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            fh->f_io_array[0].offset  =(IOVBASE_TYPE *)(intptr_t) off;
            fh->f_io_array[0].length = len;
            fh->f_io_array[0].memory_address = write_buf;
            fh->f_num_of_io_entries = 1;

            #if DEBUG_ON
            for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
                printf("%d: ADDRESS: %p  OFFSET: %ld   LENGTH: %d\n",
                       fh->f_rank,
                       fh->f_io_array[i].memory_address,
                       fh->f_io_array[i].offset,
                       fh->f_io_array[i].length);
            }
            #endif

            if (fh->f_num_of_io_entries){
                if (OMPI_SUCCESS != fh->f_fbtl->fbtl_pwritev (fh, NULL)) {
                    opal_output(1, "WRITE FAILED\n");
                    /* Drop the io array built above before bailing out. */
                    free (fh->f_io_array);
                    fh->f_io_array = NULL;
                    fh->f_num_of_io_entries = 0;
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }
#if TIME_BREAKDOWN
            end_write_time = MPI_Wtime();
            write_time += (end_write_time - start_write_time);
#endif

        }
        /***************** DONE WRITING *****************************************/
        /****RESET **********************/
        fh->f_num_of_io_entries = 0;
        if (NULL != fh->f_io_array) {
            free (fh->f_io_array);
            fh->f_io_array = NULL;
        }

        off += size;
        done += size;

    }

    /* Keep taking part in the exchange until max_ntimes calls were made. */
    for (i=0; i<fh->f_size; i++) count[i] = recv_size[i] = 0;
    for (m=ntimes; m<max_ntimes; m++) {
        ret = two_phase_exchage_data(fh, buf, write_buf,
                                     offset_len,send_size,
                                     start_pos,recv_size,off,size,
                                     count, partial_recv, sent_to_proc,
                                     contig_access_count,
                                     min_st_offset,
                                     fd_size, fd_start,
                                     fd_end, flat_buf,others_req,
                                     send_buf_idx, curr_to_proc,
                                     done_to_proc, m, buf_idx,
                                     buftype_extent, striping_unit,
                                     aggregator_list, &hole);
        if ( OMPI_SUCCESS != ret ){
            goto exit;
        }
    }

 exit:

    /* All pointers start out NULL and free(NULL) is a no-op, so this is
       safe no matter how far the function got. */
    free(write_buf);
    free(curr_offlen_ptr);
    free(count);
    free(partial_recv);
    free(send_size);
    free(recv_size);
    free(sent_to_proc);
    free(start_pos);
    free(send_buf_idx);
    free(curr_to_proc);
    free(done_to_proc);

    return ret;
}
/*
 * Read side of the two-phase exchange: repeatedly read a window of the
 * file into a cycle buffer and hand it to two_phase_exchange_data().
 *
 * The range [st_loc, end_loc] spanned by all entries of others_req is
 * processed in windows of at most mca_fcoll_two_phase_cycle_buffer_size
 * bytes.  The local number of windows (ntimes) is combined over fh->f_comm
 * with an allreduce (MPI_MAX); after its own windows are done this process
 * keeps calling two_phase_exchange_data() with zeroed count / send_size
 * until max_ntimes calls have been made.
 *
 * For every window, the entries of others_req[i] that begin inside the
 * window are counted in count[i], their source address inside read_buf is
 * stored in others_req[i].mem_ptrs[], and the number of bytes available is
 * accumulated in send_size[i].  An entry that extends past the window is
 * remembered in partial_send[i].  When the entry following such a partial
 * one also starts inside the current window, the tail of the buffer from
 * that entry's offset onwards (for_next_iter bytes) is carried over to the
 * front of the buffer for the next pass, so it does not have to be read
 * again.
 *
 * NOTE(review): others_req is presumably indexed by rank and holds the
 * regions of that rank that fall into this process's file domain - confirm
 * against the caller.
 * NOTE(review): the result of two_phase_exchange_data() is not checked
 * here; its return type is not visible from this function.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE on allocation failure,
 * OMPI_ERROR when the read fails, or the error code of the allreduce.
 * Every buffer allocated in this function is released on all paths.
 */
static int two_phase_read_and_exch(mca_io_ompio_file_t *fh,
				   void *buf,
				   MPI_Datatype datatype,
				   mca_io_ompio_access_array_t *others_req,
				   struct iovec *offset_len,
				   int contig_access_count,
				   OMPI_MPI_OFFSET_TYPE min_st_offset,
				   OMPI_MPI_OFFSET_TYPE fd_size,
				   OMPI_MPI_OFFSET_TYPE *fd_start,
				   OMPI_MPI_OFFSET_TYPE *fd_end,
				   Flatlist_node *flat_buf,
				   size_t *buf_idx, int striping_unit,
				   int *aggregator_list){


  int ret=OMPI_SUCCESS, i = 0, j = 0, ntimes = 0, max_ntimes = 0;
  int m = 0;
  int *curr_offlen_ptr=NULL, *count=NULL, *send_size=NULL, *recv_size=NULL;
  int *partial_send=NULL, *start_pos=NULL, req_len=0, flag=0;
  int *recd_from_proc=NULL;
  MPI_Aint buftype_extent=0;
  size_t byte_size = 0;
  OMPI_MPI_OFFSET_TYPE st_loc=-1, end_loc=-1, off=0, done=0, for_next_iter=0;
  OMPI_MPI_OFFSET_TYPE size=0, req_off=0, real_size=0, real_off=0, len=0;
  OMPI_MPI_OFFSET_TYPE for_curr_iter=0;
  char *read_buf=NULL, *tmp_buf=NULL;
  MPI_Datatype byte = MPI_BYTE;

  opal_datatype_type_size(&byte->super,
                          &byte_size);

  /* Seed st_loc / end_loc from the first non-empty request list; both stay
     at -1 when there is no request at all. */
  for (i = 0; i < fh->f_size; i++){
    if (others_req[i].count) {
      st_loc = others_req[i].offsets[0];
      end_loc = others_req[i].offsets[0];
      break;
    }
  }

  for (i=0;i<fh->f_size;i++){
    for(j=0;j< others_req[i].count; j++){
      st_loc =
        OMPIO_MIN(st_loc, others_req[i].offsets[j]);
      end_loc =
        OMPIO_MAX(end_loc, (others_req[i].offsets[j] +
                            others_req[i].lens[j] - 1));
    }
  }

  /* Number of cycle-buffer sized windows needed to cover the range. */
  ntimes = (int)((end_loc - st_loc + mca_fcoll_two_phase_cycle_buffer_size)/
                 mca_fcoll_two_phase_cycle_buffer_size);

  if ((st_loc == -1) && (end_loc == -1)){
    ntimes = 0;
  }

  ret = fh->f_comm->c_coll.coll_allreduce (&ntimes,
                                           &max_ntimes,
                                           1,
                                           MPI_INT,
                                           MPI_MAX,
                                           fh->f_comm,
                                           fh->f_comm->c_coll.coll_allreduce_module);
  if (OMPI_SUCCESS != ret){
    goto exit;
  }

  if (ntimes){
    read_buf = (char *) calloc (mca_fcoll_two_phase_cycle_buffer_size,
                               sizeof(char));
    if ( NULL == read_buf ){
      ret =  OMPI_ERR_OUT_OF_RESOURCE;
      goto exit;
    }
  }

  /* Per-process scratch arrays.  They are checked together so that a single
     failure path releases whatever was obtained. */
  curr_offlen_ptr = (int *)calloc (fh->f_size, sizeof(int));
  count           = (int *)calloc (fh->f_size, sizeof(int));
  partial_send    = (int *)calloc (fh->f_size, sizeof(int));
  send_size       = (int *)malloc (fh->f_size * sizeof(int));
  recv_size       = (int *)malloc (fh->f_size * sizeof(int));
  recd_from_proc  = (int *)calloc (fh->f_size, sizeof(int));
  start_pos       = (int *)calloc (fh->f_size, sizeof(int));

  if (NULL == curr_offlen_ptr || NULL == count     ||
      NULL == partial_send    || NULL == send_size ||
      NULL == recv_size       || NULL == recd_from_proc ||
      NULL == start_pos){
    ret = OMPI_ERR_OUT_OF_RESOURCE;
    goto exit;
  }

  done = 0;
  off = st_loc;
  for_curr_iter = for_next_iter = 0;

  ompi_datatype_type_extent(datatype, &buftype_extent);

  for (m=0; m<ntimes; m++) {

    size = OMPIO_MIN((unsigned)mca_fcoll_two_phase_cycle_buffer_size, end_loc-st_loc+1-done);
    /* read_buf starts with for_curr_iter bytes carried over from the
       previous pass, so the buffer actually covers
       [real_off, real_off + real_size). */
    real_off = off - for_curr_iter;
    real_size = size + for_curr_iter;

    for (i=0; i<fh->f_size; i++) count[i] = send_size[i] = 0;
    for_next_iter = 0;

    for (i=0; i<fh->f_size; i++) {
      if (others_req[i].count) {
        start_pos[i] = curr_offlen_ptr[i];
        for (j=curr_offlen_ptr[i]; j<others_req[i].count;
             j++) {
          if (partial_send[i]) {
            /* this request may have been partially
               satisfied in the previous iteration. */
            req_off = others_req[i].offsets[j] +
              partial_send[i];
            req_len = others_req[i].lens[j] -
              partial_send[i];
            partial_send[i] = 0;
            /* modify the off-len pair to reflect this change */
            others_req[i].offsets[j] = req_off;
            others_req[i].lens[j] = req_len;
          }
          else {
            req_off = others_req[i].offsets[j];
            req_len = others_req[i].lens[j];
          }
          if (req_off < real_off + real_size) {
            count[i]++;
            /* Record where inside read_buf this piece is found. */
            MPI_Address(read_buf+req_off-real_off,
                        &(others_req[i].mem_ptrs[j]));

            send_size[i] += (int)(OMPIO_MIN(real_off + real_size - req_off,
                                            (OMPI_MPI_OFFSET_TYPE)req_len));

            if (real_off+real_size-req_off < (OMPI_MPI_OFFSET_TYPE)req_len) {
              partial_send[i] = (int) (real_off + real_size - req_off);
              if ((j+1 < others_req[i].count) &&
                  (others_req[i].offsets[j+1] <
                   real_off+real_size)) {
                /* the next request of this process also starts inside
                   the current buffer: keep the tail for the next pass */
                for_next_iter = OMPIO_MAX(for_next_iter,
                                          real_off + real_size - others_req[i].offsets[j+1]);
                /* max because it must cover requests
                   from different processes */
              }
              break;
            }
          }
          else break;
        }
        curr_offlen_ptr[i] = j;
      }
    }

    /* Only touch the file if at least one request fell in the window. */
    flag = 0;
    for (i=0; i<fh->f_size; i++)
      if (count[i]) flag = 1;

    if (flag) {

#if TIME_BREAKDOWN
        start_read_time = MPI_Wtime();
#endif

      len = size * byte_size;
      fh->f_io_array = (mca_io_ompio_io_array_t *)calloc
        (1,sizeof(mca_io_ompio_io_array_t));
      if (NULL == fh->f_io_array) {
        opal_output(1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
      }
      fh->f_io_array[0].offset = (IOVBASE_TYPE *)(intptr_t)off;
      fh->f_io_array[0].length = len;
      /* New data goes behind the bytes carried over from the last pass. */
      fh->f_io_array[0].memory_address =
        read_buf+for_curr_iter;
      fh->f_num_of_io_entries = 1;

      if (fh->f_num_of_io_entries){
        if (OMPI_SUCCESS != fh->f_fbtl->fbtl_preadv (fh, NULL)) {
          opal_output(1, "READ FAILED\n");
          /* Drop the io array built above before bailing out. */
          free (fh->f_io_array);
          fh->f_io_array = NULL;
          fh->f_num_of_io_entries = 0;
          ret = OMPI_ERROR;
          goto exit;
        }
      }

      fh->f_num_of_io_entries = 0;
      if (NULL != fh->f_io_array) {
        free (fh->f_io_array);
        fh->f_io_array = NULL;
      }

#if TIME_BREAKDOWN
        end_read_time = MPI_Wtime();
        read_time += (end_read_time - start_read_time);
#endif

    }

    for_curr_iter = for_next_iter;

    for (i=0; i< fh->f_size; i++){
      recv_size[i]  = 0;
    }
    two_phase_exchange_data(fh, buf, offset_len,
                            send_size, start_pos, recv_size, count,
                            partial_send, recd_from_proc,
                            contig_access_count,
                            min_st_offset, fd_size, fd_start, fd_end,
                            flat_buf, others_req, m, buf_idx,
                            buftype_extent, striping_unit, aggregator_list);

    if (for_next_iter){
      /* Build the buffer for the next pass: the last for_next_iter bytes
         of the current buffer go to its front, followed by room for one
         full cycle buffer.  The old buffer is released only after the new
         one was obtained, so a failure here leaks nothing. */
      tmp_buf = (char *) malloc (for_next_iter+mca_fcoll_two_phase_cycle_buffer_size);
      if (NULL == tmp_buf){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
      }
      memcpy(tmp_buf,
             read_buf+real_size-for_next_iter,
             for_next_iter);
      free(read_buf);
      read_buf = tmp_buf;
      tmp_buf = NULL;
    }

    off += size;
    done += size;
  }

  /* Keep taking part in the exchange until max_ntimes calls were made. */
  for (i=0; i<fh->f_size; i++) count[i] = send_size[i] = 0;
  for (m=ntimes; m<max_ntimes; m++)
    two_phase_exchange_data(fh, buf, offset_len, send_size,
                            start_pos, recv_size, count,
                            partial_send, recd_from_proc,
                            contig_access_count,
                            min_st_offset, fd_size, fd_start, fd_end,
                            flat_buf, others_req, m, buf_idx,
                            buftype_extent, striping_unit, aggregator_list);

 exit:
  /* All pointers start out NULL and free(NULL) is a no-op, so this is safe
     no matter how far the function got. */
  free(read_buf);
  free(curr_offlen_ptr);
  free(count);
  free(partial_send);
  free(send_size);
  free(recv_size);
  free(recd_from_proc);
  free(start_pos);

  return ret;

}
static int two_phase_fill_send_buffer(mca_io_ompio_file_t *fh,
				      void *buf,
				      Flatlist_node *flat_buf,
				      char **send_buf,
				      struct iovec *offset_length,
				      int *send_size,
				      MPI_Request *requests,
				      int *sent_to_proc,
				      int contig_access_count, 
				      OMPI_MPI_OFFSET_TYPE min_st_offset,
				      OMPI_MPI_OFFSET_TYPE fd_size,
				      OMPI_MPI_OFFSET_TYPE *fd_start,
				      OMPI_MPI_OFFSET_TYPE *fd_end,
				      int *send_buf_idx,
				      int *curr_to_proc, 
				      int *done_to_proc,
				      int iter, MPI_Aint buftype_extent,
				      int striping_unit, int *aggregator_list){

    int i, p, flat_buf_idx;
    OMPI_MPI_OFFSET_TYPE flat_buf_sz, size_in_buf, buf_incr, size;
    int jj, n_buftypes, ret=OMPI_SUCCESS;
    OMPI_MPI_OFFSET_TYPE off, len, rem_len, user_buf_idx;

    for (i=0; i < fh->f_size; i++) {
	send_buf_idx[i] = curr_to_proc[i] = 0;
	done_to_proc[i] = sent_to_proc[i];
    }
    jj = 0;
    
    user_buf_idx = flat_buf->indices[0];
    flat_buf_idx = 0;
    n_buftypes = 0;
    flat_buf_sz = flat_buf->blocklens[0];
    
    for (i=0; i<contig_access_count; i++) { 
	
      off     = (OMPI_MPI_OFFSET_TYPE) (intptr_t)offset_length[i].iov_base;
	rem_len = (OMPI_MPI_OFFSET_TYPE)offset_length[i].iov_len;
	

	while (rem_len != 0) {
	    len = rem_len;
	    p = mca_fcoll_two_phase_calc_aggregator(fh,
						    off,
						    min_st_offset,
						    &len,
						    fd_size,
						    fd_start,
						    fd_end,
						    striping_unit,
						    mca_fcoll_two_phase_num_io_procs,
						    aggregator_list);

	    if (send_buf_idx[p] < send_size[p]) {
		if (curr_to_proc[p]+len > done_to_proc[p]) {
		    if (done_to_proc[p] > curr_to_proc[p]) {
			size = OMPIO_MIN(curr_to_proc[p] + len - 
					 done_to_proc[p], send_size[p]-send_buf_idx[p]);
			buf_incr = done_to_proc[p] - curr_to_proc[p];
			TWO_PHASE_BUF_INCR
		        buf_incr = curr_to_proc[p] + len - done_to_proc[p];
			curr_to_proc[p] = done_to_proc[p] + size;
		        TWO_PHASE_BUF_COPY
		    }
		    else {
			size = OMPIO_MIN(len,send_size[p]-send_buf_idx[p]);
			buf_incr = len;
			curr_to_proc[p] += size;
			TWO_PHASE_BUF_COPY
		    }
		    if (send_buf_idx[p] == send_size[p]) {

		      ret = MCA_PML_CALL(isend(send_buf[p],
					       send_size[p],
					       MPI_BYTE,
					       p,
					       fh->f_rank+p+100*iter,
					       MCA_PML_BASE_SEND_STANDARD, 
					       fh->f_comm,
					       requests+jj));	
		      
		      if ( OMPI_SUCCESS != ret ){
			return ret;
		      }
		      jj++;
		    }
		}
		else {
		    curr_to_proc[p] += len;
		    buf_incr = len;
		    TWO_PHASE_BUF_INCR
		}
	    }
static void two_phase_fill_user_buffer(mca_io_ompio_file_t *fh,
				       void *buf, 
				       Flatlist_node *flat_buf,
				       char **recv_buf,
				       struct iovec *offset_length, 
				       unsigned *recv_size, 
				       MPI_Request *requests, 
				       int *recd_from_proc,
				       int contig_access_count, 
				       OMPI_MPI_OFFSET_TYPE min_st_offset, 
				       OMPI_MPI_OFFSET_TYPE fd_size, 
				       OMPI_MPI_OFFSET_TYPE *fd_start, 
				       OMPI_MPI_OFFSET_TYPE *fd_end,
				       MPI_Aint buftype_extent,
				       int striping_unit, int *aggregator_list){
  
  int i = 0, p = 0, flat_buf_idx = 0;
  OMPI_MPI_OFFSET_TYPE flat_buf_sz = 0, size_in_buf = 0, buf_incr = 0, size = 0;
  int n_buftypes = 0;
  OMPI_MPI_OFFSET_TYPE off=0, len=0, rem_len=0, user_buf_idx=0;
  unsigned *curr_from_proc=NULL, *done_from_proc=NULL, *recv_buf_idx=NULL;
  
  curr_from_proc = (unsigned *) malloc (fh->f_size * sizeof(unsigned));
  done_from_proc = (unsigned *) malloc (fh->f_size * sizeof(unsigned));
  recv_buf_idx = (unsigned *) malloc (fh->f_size * sizeof(unsigned));
  
  
  for (i=0; i < fh->f_size; i++) {
    recv_buf_idx[i] = curr_from_proc[i] = 0;
    done_from_proc[i] = recd_from_proc[i];
  }
  
  

  
  user_buf_idx = flat_buf->indices[0];
  flat_buf_idx = 0;
  n_buftypes = 0;
  flat_buf_sz = flat_buf->blocklens[0];
  
  /* flat_buf_idx = current index into flattened buftype
     flat_buf_sz = size of current contiguous component in 
     flattened buf */
  
  for (i=0; i<contig_access_count; i++) { 
    
    off     = (OMPI_MPI_OFFSET_TYPE)(intptr_t)offset_length[i].iov_base;
    rem_len = (OMPI_MPI_OFFSET_TYPE)offset_length[i].iov_len;
    
    /* this request may span the file domains of more than one process */
    while (rem_len != 0) {
      len = rem_len;
      /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no
       * longer than the single region that processor "p" is responsible
       * for.
       */
      p = mca_fcoll_two_phase_calc_aggregator(fh,
					      off,
					      min_st_offset,
					      &len,
					      fd_size,
					      fd_start,
					      fd_end,
					      striping_unit,
					      mca_fcoll_two_phase_num_io_procs,
					      aggregator_list);
      
      if (recv_buf_idx[p] < recv_size[p]) {
	if (curr_from_proc[p]+len > done_from_proc[p]) {
	  if (done_from_proc[p] > curr_from_proc[p]) {
	    size = OMPIO_MIN(curr_from_proc[p] + len - 
			     done_from_proc[p], recv_size[p]-recv_buf_idx[p]);
	    buf_incr = done_from_proc[p] - curr_from_proc[p];
	    TWO_PHASE_BUF_INCR
	    buf_incr = curr_from_proc[p]+len-done_from_proc[p];
	    curr_from_proc[p] = done_from_proc[p] + size;
	    TWO_PHASE_BUF_COPY
	   }
	  else {
	    size = OMPIO_MIN(len,recv_size[p]-recv_buf_idx[p]);
	    buf_incr = len;
	    curr_from_proc[p] += (unsigned) size;
	    TWO_PHASE_BUF_COPY
	  }
	}
	else {
	  curr_from_proc[p] += (unsigned) len;
	  buf_incr = len;
	  TWO_PHASE_BUF_INCR
	}
      }