int
mca_fcoll_dynamic_file_write_all (mca_io_ompio_file_t *fh,
                                  void *buf,
                                  int count,
                                  struct ompi_datatype_t *datatype,
                                  ompi_status_public_t *status)
{
    MPI_Aint total_bytes_written = 0;  /* total bytes that have been written */
    MPI_Aint total_bytes = 0;          /* total bytes to be written */
    MPI_Aint bytes_to_write_in_cycle = 0; /* bytes left to be written in the current cycle */
    MPI_Aint bytes_per_cycle = 0;      /* total bytes written in each cycle by each process */
    int index = 0;
    int cycles = 0;
    int i=0, j=0, l=0;
    int n=0; /* current position in total_bytes_per_process array */
    MPI_Aint bytes_remaining = 0; /* how many bytes are left to consume from the
                                current entry of the global iovec */
    int bytes_sent = 0, ret =0;
    int blocks=0, entries_per_aggregator=0;

    /* iovec structure and count of the buffer passed in */
    uint32_t iov_count = 0;
    struct iovec *decoded_iov = NULL;
    int iov_index = 0;
    char *send_buf = NULL;
    size_t current_position = 0;
    struct iovec *local_iov_array=NULL, *global_iov_array=NULL;
    local_io_array *file_offsets_for_agg=NULL;
    /* global iovec at the writers that contains the iovecs created from
       file_set_view */
    uint32_t total_fview_count = 0;
    int local_count = 0, temp_pindex;
    int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL;
    int current_index = 0, temp_index=0;

    char *global_buf = NULL;
    MPI_Aint global_count = 0;
    
    
    /* array that contains the sorted indices of the global_iov */
    int *sorted = NULL, *sorted_file_offsets=NULL;
    int *displs = NULL;
    int dynamic_num_io_procs;
    size_t max_data = 0, datatype_size = 0; 
    int **blocklen_per_process=NULL;
    MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL;
    ompi_datatype_t **recvtype = NULL;
    MPI_Aint *total_bytes_per_process = NULL;
    MPI_Request *send_req=NULL, *recv_req=NULL;
    int recv_req_count=0;
    

#if TIME_BREAKDOWN
    double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0;
    double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0;
    double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0;
    print_entry nentry;
#endif
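
    /* Descriptive note: this routine implements a two-phase collective
       write.  Each process decodes its memory layout and file view into
       iovecs; the file-view iovecs are gathered and sorted at the
       aggregators; then, cycle by cycle, the aggregators collect at most
       bytes_per_cycle bytes from the group, build an io_array describing
       the merged accesses, and hand it to the fbtl for the actual write. */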


//    if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) {
//        fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY;
//    }

    /***************************************************************************
     ** In case the data is not contiguous in memory, decode it into an iovec **
     ***************************************************************************/
    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
      ret =   ompi_io_ompio_decode_datatype (fh,
					     datatype,
					     count,
					     buf,
					     &max_data,
					     &decoded_iov,
					     &iov_count);
      if (OMPI_SUCCESS != ret ){
	goto exit;
      }
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
	status->_ucount = max_data;
    }
	
    mca_io_ompio_get_num_aggregators ( &dynamic_num_io_procs );
    ret = ompi_io_ompio_set_aggregator_props (fh, 
					      dynamic_num_io_procs,
					      max_data);
	
    if (OMPI_SUCCESS != ret){
	goto exit;
    }


    total_bytes_per_process = (MPI_Aint*)malloc
        (fh->f_procs_per_group*sizeof(MPI_Aint));
    if (NULL == total_bytes_per_process) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }

    ret = ompi_io_ompio_allgather_array (&max_data,
					 1,
					 MPI_LONG,
					 total_bytes_per_process,
					 1,
					 MPI_LONG,
					 fh->f_aggregator_index,
					 fh->f_procs_in_group,
					 fh->f_procs_per_group,
					 fh->f_comm);
    
    if( OMPI_SUCCESS != ret){
      goto exit;
    }
    for (i=0 ; i<fh->f_procs_per_group ; i++) {
        total_bytes += total_bytes_per_process[i];
    }

    if (NULL != total_bytes_per_process) {
        free (total_bytes_per_process);
        total_bytes_per_process = NULL;
    }
    
    /*********************************************************************
     *** Generate the File offsets/lengths corresponding to this write ***
     ********************************************************************/
    ret = ompi_io_ompio_generate_current_file_view(fh,
						   max_data,
						   &local_iov_array,
						   &local_count);
    if (ret != OMPI_SUCCESS){
      goto exit;
    }

#if DEBUG_ON    
    for (i=0 ; i<local_count ; i++) {
   
      printf("%d: OFFSET: %d   LENGTH: %ld\n",
	     fh->f_rank,
	     local_iov_array[i].iov_base,
	     local_iov_array[i].iov_len);

    }
#endif   

    /*************************************************************
     *** ALLGather the File View information at all processes ***
     *************************************************************/

    fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == fview_count) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }

    ret = ompi_io_ompio_allgather_array (&local_count,
					 1,
					 MPI_INT,
					 fview_count,
					 1,
					 MPI_INT,
					 fh->f_aggregator_index,
					 fh->f_procs_in_group,
					 fh->f_procs_per_group,
					 fh->f_comm);
    
    if( OMPI_SUCCESS != ret){
      goto exit;
    }

    displs = (int*) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == displs) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }

    displs[0] = 0;
    total_fview_count = fview_count[0];
    for (i=1 ; i<fh->f_procs_per_group ; i++) {
        total_fview_count += fview_count[i];
        displs[i] = displs[i-1] + fview_count[i-1];
    }
    
#if DEBUG_ON
    printf("total_fview_count : %d\n", total_fview_count);
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        for (i=0 ; i<fh->f_procs_per_group ; i++) {
            printf ("%d: PROCESS: %d  ELEMENTS: %d  DISPLS: %d\n",
                    fh->f_rank,
                    i,
                    fview_count[i],
                    displs[i]);
        }
    }
#endif
    
    /* allocate the global iovec  */

    if (0 != total_fview_count) {
      global_iov_array = (struct iovec*) malloc (total_fview_count *
						    sizeof(struct iovec));
      if (NULL == global_iov_array){
	opal_output(1, "OUT OF MEMORY\n");
	ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
      }

    }
    
    ret = ompi_io_ompio_allgatherv_array (local_iov_array,
					  local_count,
					  fh->f_iov_type,
					  global_iov_array,
					  fview_count,
					  displs,
					  fh->f_iov_type,
					  fh->f_aggregator_index,
					  fh->f_procs_in_group,
					  fh->f_procs_per_group,
					  fh->f_comm);
    if (OMPI_SUCCESS != ret){
      goto exit;
    }

    /* sort it */
    if (0 != total_fview_count) {
        sorted = (int *)malloc (total_fview_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }
	ompi_io_ompio_sort_iovec (global_iov_array, total_fview_count, sorted);
    }

    if (NULL != local_iov_array){
      free(local_iov_array);
      local_iov_array = NULL;
    }

    if (NULL != displs){
      free(displs);
      displs=NULL;
    }

    
#if DEBUG_ON
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
      uint32_t tv=0;
      for (tv=0 ; tv<total_fview_count ; tv++) {
	printf("%d: OFFSET: %lld   LENGTH: %ld\n",
	       fh->f_rank,
	       global_iov_array[sorted[tv]].iov_base,
	       global_iov_array[sorted[tv]].iov_len);
      }
    }
#endif
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	displs_per_process = (MPI_Aint **)malloc (fh->f_procs_per_group * sizeof (MPI_Aint*));
	if (NULL == displs_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	for(i=0;i<fh->f_procs_per_group;i++){
	  blocklen_per_process[i] = NULL;
	  displs_per_process[i] = NULL;
	}
    }
    

    mca_io_ompio_get_bytes_per_agg ( (int *)&bytes_per_cycle );
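    /* An aggregator stages at most bytes_per_cycle bytes per cycle, so the
       cycle count is the total payload divided by that cap, rounded up. */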
    cycles = ceil((double)total_bytes/bytes_per_cycle);


    n = 0; 
    bytes_remaining = 0;
    current_index = 0;
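    /* These three variables carry the walk through the sorted global iovec
       across cycles: current_index is the entry currently being consumed,
       bytes_remaining is what is left of it, and n is the rank owning it. */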




#if TIME_BREAKDOWN
      start_exch = MPI_Wtime();
#endif
    
    for (index = 0; index < cycles; index++) {

      /* Get ready for the next cycle: initialize and free buffers */
      if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	
	if (NULL == recvtype){
	  recvtype = (ompi_datatype_t **) 
	    malloc (fh->f_procs_per_group  * sizeof(ompi_datatype_t *));
	    if (NULL == recvtype) {
	      opal_output (1, "OUT OF MEMORY\n");
	      ret = OMPI_ERR_OUT_OF_RESOURCE;
	      goto exit;
	    }
	}
	
	for(l=0;l<fh->f_procs_per_group;l++){
	  disp_index[l] =  1;
	 
	  if (NULL != blocklen_per_process[l]){
	    free(blocklen_per_process[l]);
	    blocklen_per_process[l] = NULL;
	  }
	  if (NULL != displs_per_process[l]){
	    free(displs_per_process[l]);
	    displs_per_process[l] = NULL;
	  }
	  blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
	  if (NULL == blocklen_per_process[l]) {
	    opal_output (1, "OUT OF MEMORY for blocklen\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	  }
	  displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
	  if (NULL == displs_per_process[l]){      
	      opal_output (1, "OUT OF MEMORY for displs\n");
	      ret = OMPI_ERR_OUT_OF_RESOURCE;
	      goto exit;
	  }
	}
	
	if (NULL != sorted_file_offsets){
	  free(sorted_file_offsets);
	  sorted_file_offsets = NULL;
	}
	
	if(NULL != file_offsets_for_agg){
	  free(file_offsets_for_agg);
	  file_offsets_for_agg = NULL;
	}
	
	if (NULL != memory_displacements){
	  free(memory_displacements);
	  memory_displacements = NULL;
	}
	
      }
      
      if (cycles-1 == index) {
	bytes_to_write_in_cycle = total_bytes - bytes_per_cycle*index;
      }
      else {
	bytes_to_write_in_cycle = bytes_per_cycle;
      }
      
#if DEBUG_ON
      if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	printf ("****%d: CYCLE %d   Bytes %lld**********\n",
		fh->f_rank,
		index, 
		bytes_to_write_in_cycle);
      }
#endif
      /**********************************************************
       **Gather the Data from all the processes at the writers **
       *********************************************************/
      
      /* Calculate how much data will be contributed in this cycle 
	 by each process*/
      bytes_sent = 0;
#if DEBUG_ON
      printf("bytes_to_write_in_cycle: %ld, cycle : %d\n", bytes_to_write_in_cycle,
	       index);
#endif
      /* The blocklen and displs calculation is done only at the aggregators! */
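      /* Walk the sorted global iovec until this cycle's quota is consumed.
         Three cases per entry: a partially consumed entry finishes within
         this cycle; the quota runs out inside an entry (the leftover is
         remembered in bytes_remaining); or a fresh entry fits entirely and
         the walk advances to the next one. */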


      while (bytes_to_write_in_cycle) {
	
	blocks = fview_count[0];
	for (j=0 ; j<fh->f_procs_per_group ; j++) {
	  if (sorted[current_index] < blocks) {
	    n = j;
	    break;
	  }
	  else {
	    blocks += fview_count[j+1];
	  }
	}
	
	if (bytes_remaining) {
	  if (bytes_remaining <= bytes_to_write_in_cycle) {

	    if (fh->f_procs_in_group[fh->f_aggregator_index] == 
		fh->f_rank) {
	      blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining;
	      displs_per_process[n][disp_index[n] - 1] = 
		(OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base + 
		(global_iov_array[sorted[current_index]].iov_len
		 - bytes_remaining);
	      
	    }
	    if (fh->f_procs_in_group[n] == fh->f_rank) {
	      bytes_sent += bytes_remaining;
	    }
	    current_index ++;
	    bytes_to_write_in_cycle -= bytes_remaining;
	    bytes_remaining = 0;
	    if (fh->f_procs_in_group[fh->f_aggregator_index] == 
		fh->f_rank) {		
	    /* In this case the entry is fully consumed, so allocate room for
	       the next displacement and block length */
     
	      blocklen_per_process[n] = (int *) realloc
		((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
	      displs_per_process[n] = (MPI_Aint *) realloc
		((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
	      blocklen_per_process[n][disp_index[n]] = 0;
	      displs_per_process[n][disp_index[n]] = 0;
	      disp_index[n] += 1;
	    }
	    continue;
	  }
	  else {
	    if (fh->f_procs_in_group[fh->f_aggregator_index] == 
		fh->f_rank) {
	      blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle;
	      displs_per_process[n][disp_index[n] - 1] = 
		(OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
                (global_iov_array[sorted[current_index]].iov_len
                 - bytes_remaining);
	    }
	    
	    if (fh->f_procs_in_group[n] == fh->f_rank) {
	      bytes_sent += bytes_to_write_in_cycle;
	    }
	    bytes_remaining -= bytes_to_write_in_cycle;
	    bytes_to_write_in_cycle = 0;
	    break;
	  }
	}
	else {
	  if (bytes_to_write_in_cycle < 
	      (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) {
	    if (fh->f_procs_in_group[fh->f_aggregator_index] == 
		fh->f_rank) {

	      blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle;
	      displs_per_process[n][disp_index[n] - 1] = 
		(OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ;


	    }
	    if (fh->f_procs_in_group[n] == fh->f_rank) {
	      bytes_sent += bytes_to_write_in_cycle;
	      
	    }
	    bytes_remaining = global_iov_array[sorted[current_index]].iov_len - 
	      bytes_to_write_in_cycle;
	    bytes_to_write_in_cycle = 0;
	    break;
	  }
	  else {
	    if (fh->f_procs_in_group[fh->f_aggregator_index] == 
		fh->f_rank) {
	      blocklen_per_process[n][disp_index[n] - 1] =
		global_iov_array[sorted[current_index]].iov_len;
	      displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)
		global_iov_array[sorted[current_index]].iov_base;
	      
	      blocklen_per_process[n] = 
		(int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
	      displs_per_process[n] = (MPI_Aint *)realloc
		((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
	      blocklen_per_process[n][disp_index[n]] = 0;
	      displs_per_process[n][disp_index[n]] = 0;
	      disp_index[n] += 1;
	      /* The total length of this entry has been consumed, so realloc
		 room for the next block length and displacement */
	    }
	    if (fh->f_procs_in_group[n] == fh->f_rank) {
	      bytes_sent += global_iov_array[sorted[current_index]].iov_len;
	    }
	    bytes_to_write_in_cycle -= 
	      global_iov_array[sorted[current_index]].iov_len;
	    current_index ++;
	    continue;
	  }
	}
      }
      

      /* Calculate the displacement of where to put the data and allocate
	 the receive buffer (global_buf) */
      if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	entries_per_aggregator=0;
	for (i=0;i<fh->f_procs_per_group; i++){
	  for (j=0;j<disp_index[i];j++){
	    if (blocklen_per_process[i][j] > 0) 
	      entries_per_aggregator++ ;
	  }
	}

#if DEBUG_ON
	printf("%d: cycle: %d, bytes_sent: %d\n ",fh->f_rank,index,
		 bytes_sent);
	printf("%d : Entries per aggregator : %d\n",fh->f_rank,entries_per_aggregator);
#endif
	
	if (entries_per_aggregator > 0){
	  file_offsets_for_agg = (local_io_array *)
	    malloc(entries_per_aggregator*sizeof(local_io_array));
	  if (NULL == file_offsets_for_agg) {
	    opal_output (1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	  }
	  
	  sorted_file_offsets = (int *)
	    malloc (entries_per_aggregator*sizeof(int));
	  if (NULL == sorted_file_offsets){
	    opal_output (1, "OUT OF MEMORY\n");
	    ret =  OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	  }
	  
	  /*Moving file offsets to an IO array!*/
	  temp_index = 0;
	 	  
	  for (i=0;i<fh->f_procs_per_group; i++){
	    for(j=0;j<disp_index[i];j++){
	      if (blocklen_per_process[i][j] > 0){
		file_offsets_for_agg[temp_index].length =
		  blocklen_per_process[i][j];
		file_offsets_for_agg[temp_index].process_id = i;
		file_offsets_for_agg[temp_index].offset = 
		  displs_per_process[i][j];
		temp_index++;
 
#if DEBUG_ON
		printf("************Cycle: %d,  Aggregator: %d ***************\n", 
		       index+1,fh->f_rank);
		
		printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
		       fh->f_procs_in_group[i],j,
		     blocklen_per_process[i][j],j,
		     displs_per_process[i][j],
		     fh->f_rank);
#endif
	      }
	    }
	  }
	}
	else{
	  continue;
	}
	/* Sort the displacements for each aggregator*/
	local_heap_sort (file_offsets_for_agg,
			 entries_per_aggregator,
			 sorted_file_offsets); 
	
	/*create contiguous memory displacements 
	  based on blocklens on the same displs array
	  and map it to this aggregator's actual 
	  file-displacements (this is in the io-array created above)*/
	memory_displacements = (MPI_Aint *) malloc 
	    (entries_per_aggregator * sizeof(MPI_Aint));
	if (NULL == memory_displacements) {
	  opal_output (1, "OUT OF MEMORY\n");
	  ret = OMPI_ERR_OUT_OF_RESOURCE;
	  goto exit;
	}
	
	memory_displacements[sorted_file_offsets[0]] = 0;
	for (i=1; i<entries_per_aggregator; i++){
	  memory_displacements[sorted_file_offsets[i]] = 
	    memory_displacements[sorted_file_offsets[i-1]] + 
	    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
	}
	
	temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
	if (NULL == temp_disp_index) {
	  opal_output (1, "OUT OF MEMORY\n");
	  ret = OMPI_ERR_OUT_OF_RESOURCE;
	  goto exit;
	}
	
	/*Now update the displacements array  with memory offsets*/
	global_count = 0;
	for (i=0;i<entries_per_aggregator;i++){
	  temp_pindex =
	    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
	  displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] =
	    memory_displacements[sorted_file_offsets[i]];
	  if (temp_disp_index[temp_pindex] < disp_index[temp_pindex])
	      temp_disp_index[temp_pindex] += 1;
	  else{
	    printf("temp_disp_index[%d]: %d is larger than or equal to disp_index[%d]: %d\n",
		   temp_pindex, temp_disp_index[temp_pindex],
		   temp_pindex, disp_index[temp_pindex]);
	  }
	  global_count += 
	    file_offsets_for_agg[sorted_file_offsets[i]].length;
	}
	  
	if (NULL != temp_disp_index){
	  free(temp_disp_index);
	  temp_disp_index = NULL;
	}

#if DEBUG_ON

	printf("************Cycle: %d,  Aggregator: %d ***************\n", 
	       index+1,fh->f_rank);
	for (i=0;i<fh->f_procs_per_group; i++){
	  for(j=0;j<disp_index[i];j++){
	    if (blocklen_per_process[i][j] > 0){
	      printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
		     fh->f_procs_in_group[i],j,
		     blocklen_per_process[i][j],j,
		     displs_per_process[i][j],
		     fh->f_rank);
	      
	    }
	  }
	}
	printf("************Cycle: %d,  Aggregator: %d ***************\n", 
	       index+1,fh->f_rank);
	for (i=0; i<entries_per_aggregator;i++){
	  printf("%d: OFFSET: %lld   LENGTH: %ld, Mem-offset: %ld\n",
		   file_offsets_for_agg[sorted_file_offsets[i]].process_id,
		 file_offsets_for_agg[sorted_file_offsets[i]].offset,
		 file_offsets_for_agg[sorted_file_offsets[i]].length,
		 memory_displacements[sorted_file_offsets[i]]);
	}
	printf("%d : global_count : %ld, bytes_sent : %d\n",
	       fh->f_rank,global_count, bytes_sent);
#endif
#if TIME_BREAKDOWN
	  start_comm_time = MPI_Wtime();
#endif

	
	global_buf  = (char *) malloc (global_count);
	if (NULL == global_buf){
	    opal_output(1, "OUT OF MEMORY");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	  goto exit;
	}
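
	/* For every member of the group, build an hindexed datatype over
	   global_buf out of that process's block lengths and memory
	   displacements, so that a single irecv per sender scatters its
	   data directly into file-sorted order. */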

	recv_req_count = 0;
	for (i=0;i<fh->f_procs_per_group; i++){
	    
	  ompi_datatype_create_hindexed(disp_index[i],
					blocklen_per_process[i],
					displs_per_process[i],
					MPI_BYTE,
					&recvtype[i]);
	  ompi_datatype_commit(&recvtype[i]); 
	  
	  opal_datatype_type_size(&recvtype[i]->super, 
				  &datatype_size);
	  
	  if (datatype_size){
	    
	    recv_req = (MPI_Request *)realloc 
			((void *)recv_req, (recv_req_count + 1)*sizeof(MPI_Request));
	    if (NULL == recv_req) {
	      opal_output (1, "OUT OF MEMORY\n");
	      ret = OMPI_ERR_OUT_OF_RESOURCE;
	      goto exit;
	    }
	    
	    ret = MCA_PML_CALL(irecv(global_buf,
				     1,
				     recvtype[i],
				     fh->f_procs_in_group[i],
				     123,
				     fh->f_comm,
				     &recv_req[recv_req_count]));
	    recv_req_count++;
	    
	    if (OMPI_SUCCESS != ret){
	      goto exit;
	    }
	  }
	}
	
      }
      
      

      if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
	send_buf = &((char*)buf)[total_bytes_written];
      }
      else if (bytes_sent) {
	/* allocate a send buffer and copy the data that needs
	   to be sent into it, since the data is non-contiguous
	   in memory */
	OPAL_PTRDIFF_TYPE mem_address;
	size_t remaining = 0;
	size_t temp_position = 0;
	
	send_buf = malloc (bytes_sent);
	if (NULL == send_buf) {
	  opal_output (1, "OUT OF MEMORY\n");
	  ret = OMPI_ERR_OUT_OF_RESOURCE;
	  goto exit;
	}
	
	remaining = bytes_sent;
	  
	while (remaining) {
	  mem_address = (OPAL_PTRDIFF_TYPE)
	    (decoded_iov[iov_index].iov_base) + current_position;

	  if (remaining >= 
	      (decoded_iov[iov_index].iov_len - current_position)) {
	    memcpy (send_buf+temp_position,
		    (IOVBASE_TYPE *)mem_address,
		    decoded_iov[iov_index].iov_len - current_position);
	    remaining = remaining - 
	      (decoded_iov[iov_index].iov_len - current_position);
	    temp_position = temp_position +
	      (decoded_iov[iov_index].iov_len - current_position);
	    iov_index = iov_index + 1;
	    current_position = 0;
	  }
	  else {
	    memcpy (send_buf+temp_position,
		    (IOVBASE_TYPE *) mem_address,
		    remaining);
	    current_position = current_position + remaining;
	    remaining = 0;
	  }
	}
      }
	total_bytes_written += bytes_sent;

	/* Gather the send buffer from each process into the appropriate
	   locations at the aggregators */
	
	send_req = (MPI_Request *) malloc (sizeof(MPI_Request));
	if (NULL == send_req){
	  opal_output (1, "OUT OF MEMORY\n");
	  ret = OMPI_ERR_OUT_OF_RESOURCE;
	  goto exit;
	}

	
	if (bytes_sent){

	  ret = MCA_PML_CALL(isend(send_buf,
				   bytes_sent,
				   MPI_BYTE,
				   fh->f_procs_in_group[fh->f_aggregator_index],
				   123,
				   MCA_PML_BASE_SEND_STANDARD, 
				   fh->f_comm,
				   send_req));	

	    
	    if ( OMPI_SUCCESS != ret ){
		goto exit;
	    }

	    ret = ompi_request_wait(send_req, MPI_STATUS_IGNORE);
	    if (OMPI_SUCCESS != ret){
		goto exit;
	    }
	} 
	if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {       
	  ret = ompi_request_wait_all (recv_req_count,
				       recv_req,
				       MPI_STATUS_IGNORE);
	  
	  if (OMPI_SUCCESS != ret){
	    goto exit;
	  }
	}
#if DEBUG_ON
	if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){
	  printf("************Cycle: %d,  Aggregator: %d ***************\n", 
		 index+1,fh->f_rank);
	  for (i=0 ; i<global_count/4 ; i++)
	    printf (" RECV %d \n",((int *)global_buf)[i]);
	}
#endif




    
    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
	if (NULL != send_buf) {
	    free (send_buf);
	    send_buf = NULL;
	}
    }

#if TIME_BREAKDOWN
      end_comm_time = MPI_Wtime();
      comm_time += (end_comm_time - start_comm_time);
#endif


    /**********************************************************
     **************** DONE GATHERING OF DATA ******************
     *********************************************************/
    
        /**********************************************************
         ******* Create the io array, and pass it to fbtl *********
         *********************************************************/

	if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {

#if TIME_BREAKDOWN
	    start_write_time = MPI_Wtime();
#endif
	  
	  fh->f_io_array = (mca_io_ompio_io_array_t *) malloc 
	    (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
	  if (NULL == fh->f_io_array) {
	    opal_output(1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	  }
	  
	  fh->f_num_of_io_entries = 0;
	  /*First entry for every aggregator*/
	  fh->f_io_array[fh->f_num_of_io_entries].offset = 
	    (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
	  fh->f_io_array[fh->f_num_of_io_entries].length = 
	    file_offsets_for_agg[sorted_file_offsets[0]].length;
	  fh->f_io_array[fh->f_num_of_io_entries].memory_address = 
	    global_buf+memory_displacements[sorted_file_offsets[0]];
	  fh->f_num_of_io_entries++;

	  for (i=1;i<entries_per_aggregator;i++){
	    /* If the entries are contiguous merge them,
	       else make a new entry */
	    if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + 
		file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
		file_offsets_for_agg[sorted_file_offsets[i]].offset){
	      fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
		file_offsets_for_agg[sorted_file_offsets[i]].length;	
	    }
	    else {
	      fh->f_io_array[fh->f_num_of_io_entries].offset = 
		(IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
	      fh->f_io_array[fh->f_num_of_io_entries].length = 
		file_offsets_for_agg[sorted_file_offsets[i]].length;
	      fh->f_io_array[fh->f_num_of_io_entries].memory_address = 
		global_buf+memory_displacements[sorted_file_offsets[i]];
	      fh->f_num_of_io_entries++;
	    }
	    
	  }
	 
#if DEBUG_ON
	  printf("*************************** %d\n", fh->f_num_of_io_entries);
	  for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
	    printf(" ADDRESS: %p  OFFSET: %ld   LENGTH: %ld\n",
		   fh->f_io_array[i].memory_address,
		   (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset,
		   fh->f_io_array[i].length);
	  }
	  
#endif


	  if (fh->f_num_of_io_entries) {
	    if ( 0 >  fh->f_fbtl->fbtl_pwritev (fh)) {
	      opal_output (1, "WRITE FAILED\n");
	      ret = OMPI_ERROR;
	      goto exit;
	    }
	  }
#if TIME_BREAKDOWN
	  end_write_time = MPI_Wtime();
	  write_time += end_write_time - start_write_time;
#endif	  


	}
	
	
	if (NULL != send_req){
	  free(send_req);
	  send_req = NULL;
	}
	
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	  fh->f_num_of_io_entries = 0;
	  if (NULL != fh->f_io_array) {
	    free (fh->f_io_array);
	    fh->f_io_array = NULL;
	  }
	  for (i =0; i< fh->f_procs_per_group; i++) 
	    ompi_datatype_destroy(recvtype+i);
	  if (NULL != recvtype){
	      free(recvtype);
	      recvtype=NULL;
	  }
	  if (NULL != recv_req){
	    free(recv_req);
	    recv_req = NULL;
	    
	  }
	  if (NULL != global_buf) {
	    free (global_buf);
	    global_buf = NULL;
	  }
	  
	}
	
    }

#if TIME_BREAKDOWN
      end_exch = MPI_Wtime();
      exch_write += end_exch - start_exch;
      nentry.time[0] = write_time;
      nentry.time[1] = comm_time;
      nentry.time[2] = exch_write;
      if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank)
	nentry.aggregator = 1;
      else
	nentry.aggregator = 0;
      nentry.nprocs_for_coll = dynamic_num_io_procs;
      if (!ompi_io_ompio_full_print_queue(WRITE_PRINT_QUEUE)){
	ompi_io_ompio_register_print_entry(WRITE_PRINT_QUEUE,
					   nentry);
      } 
#endif


 exit:
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
      if (NULL != fh->f_io_array) {
	free (fh->f_io_array);
	fh->f_io_array = NULL;
      }
      if (NULL != disp_index){
	free(disp_index);
	disp_index = NULL;
      }
      if (NULL != recvtype){
	free(recvtype);
	recvtype=NULL;
      }
      if (NULL != recv_req){
	free(recv_req);
	recv_req = NULL;
      }
      if (NULL != global_buf) {
	free (global_buf);
	global_buf = NULL;
      }
      for(l=0;l<fh->f_procs_per_group;l++){
	if (NULL != blocklen_per_process[l]){
	  free(blocklen_per_process[l]);
	  blocklen_per_process[l] = NULL;
	}
	if (NULL != displs_per_process[l]){
	  free(displs_per_process[l]);
	  displs_per_process[l] = NULL;
	}
      }
      if (NULL != blocklen_per_process){
	free(blocklen_per_process);
	blocklen_per_process = NULL;
      }
      if (NULL != displs_per_process){
	free(displs_per_process);
	displs_per_process = NULL;
      }

    }
    
    if (NULL != sorted) {
        free (sorted);
        sorted = NULL;
    }
    if (NULL != global_iov_array) {
        free (global_iov_array);
	global_iov_array = NULL;
    }
    if (NULL != fview_count) {
        free (fview_count);
        fview_count = NULL;
    }
    if (NULL != decoded_iov) {
        free (decoded_iov);
        decoded_iov = NULL;
    }
    
    if (NULL != send_req){
      free(send_req);
      send_req = NULL;
    }

    return OMPI_SUCCESS;
}
int
mca_fcoll_two_phase_file_write_all (mca_io_ompio_file_t *fh,
                                    void *buf,
                                    int count,
                                    struct ompi_datatype_t *datatype,
                                    ompi_status_public_t *status)
{

    

  int i, j,interleave_count=0, striping_unit=0;
  uint32_t iov_count=0,ti;
  struct iovec *decoded_iov=NULL, *temp_iov=NULL;
  size_t max_data = 0, total_bytes = 0; 
  int domain_size=0, *count_my_req_per_proc=NULL, count_my_req_procs;
  int count_other_req_procs,  ret=OMPI_SUCCESS;
  size_t *buf_indices=NULL;
  int local_count = 0, local_size=0,*aggregator_list = NULL;
  struct iovec *iov = NULL;
  
  OMPI_MPI_OFFSET_TYPE start_offset, end_offset, fd_size;
  OMPI_MPI_OFFSET_TYPE *start_offsets=NULL, *end_offsets=NULL;
  OMPI_MPI_OFFSET_TYPE *fd_start=NULL, *fd_end=NULL, min_st_offset;
  Flatlist_node *flat_buf=NULL;
  mca_io_ompio_access_array_t *my_req=NULL, *others_req=NULL;
  MPI_Aint send_buf_addr;
#if TIME_BREAKDOWN
  print_entry nentry;
#endif
  
  
  if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) {
    fh->f_flags = fh->f_flags |  OMPIO_CONTIGUOUS_MEMORY;
  }
  
  
  if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
    
    ret =   ompi_io_ompio_decode_datatype (fh,
					   datatype,
					   count,
					   buf,
					   &max_data,
					   &temp_iov,
					   &iov_count);
    if (OMPI_SUCCESS != ret ){
      goto exit;
    }

    send_buf_addr = (OPAL_PTRDIFF_TYPE)buf;
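
    /* The decoded iovec holds absolute memory addresses; convert them to
       displacements relative to the start of the user buffer so they can
       be recombined with buf later on. */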
    decoded_iov = (struct iovec *)malloc 
      (iov_count * sizeof(struct iovec));
    if (NULL == decoded_iov) {
      ret = OMPI_ERR_OUT_OF_RESOURCE;
      goto exit;
    }

    for (ti = 0; ti < iov_count; ti ++){
      decoded_iov[ti].iov_base = (IOVBASE_TYPE *)(
	(OPAL_PTRDIFF_TYPE)temp_iov[ti].iov_base - 
	send_buf_addr);
      decoded_iov[ti].iov_len = 
	temp_iov[ti].iov_len ;
      #if DEBUG_ON
      printf("d_offset[%d]: %ld, d_len[%d]: %ld\n",
	     ti, (OPAL_PTRDIFF_TYPE)decoded_iov[ti].iov_base,
	     ti, decoded_iov[ti].iov_len);
      #endif
    }
    
  }
  else{
    max_data = count * datatype->super.size;
  }
    
  if ( MPI_STATUS_IGNORE != status ) {
    status->_ucount = max_data;
  }

  
  if(-1 == mca_fcoll_two_phase_num_io_procs){
    ret = ompi_io_ompio_set_aggregator_props (fh, 
					      mca_fcoll_two_phase_num_io_procs,
					      max_data);
    if ( OMPI_SUCCESS != ret){
      return  ret;
    }
    
    mca_fcoll_two_phase_num_io_procs = 
      ceil((float)fh->f_size/fh->f_procs_per_group);
    
  }
  
  if (mca_fcoll_two_phase_num_io_procs > fh->f_size){
    mca_fcoll_two_phase_num_io_procs = fh->f_size;
  }
  
#if DEBUG_ON
  printf("Number of aggregators : %ld\n", mca_fcoll_two_phase_num_io_procs);
#endif

  aggregator_list = (int *) malloc (mca_fcoll_two_phase_num_io_procs *
				    sizeof(int));
  
  if ( NULL == aggregator_list ) {
    return OMPI_ERR_OUT_OF_RESOURCE;
  }
  
  for (i =0; i< mca_fcoll_two_phase_num_io_procs; i++){
    aggregator_list[i] = i;
  }
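
  /* With two_phase, the first mca_fcoll_two_phase_num_io_procs ranks of the
     communicator act as the aggregators. */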
  
  
  ret = ompi_io_ompio_generate_current_file_view (fh, 
						  max_data, 
						  &iov, 
						  &local_count);
  
  
  if ( OMPI_SUCCESS != ret ){
    goto exit;
  }
  
  
  /* max_data and total_bytes are size_t; size_t is assumed to match
     MPI_LONG here, as is done for the offset allgathers below */
  ret = fh->f_comm->c_coll.coll_allreduce (&max_data,
					   &total_bytes,
					   1,
					   MPI_LONG,
					   MPI_SUM,
					   fh->f_comm,
					   fh->f_comm->c_coll.coll_allreduce_module);
  
  if ( OMPI_SUCCESS != ret ) {
    goto exit;
  }
  
  
    
  if (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
    
    /* This data structure translates between OMPIO and ROMIO; it's a little
       hacky, but it helps us reuse ROMIO's code for handling non-contiguous
       file types */
    flat_buf = (Flatlist_node *)malloc(sizeof(Flatlist_node));
    if ( NULL == flat_buf ){
      ret = OMPI_ERR_OUT_OF_RESOURCE;
      goto exit;
    }
    
    flat_buf->type = datatype;
    flat_buf->next = NULL;
    flat_buf->count = 0;
       
    
    local_size = iov_count/count;
    
    flat_buf->indices = 
      (OMPI_MPI_OFFSET_TYPE *)malloc(local_size * 
					       sizeof(OMPI_MPI_OFFSET_TYPE));
    if ( NULL == flat_buf->indices ){
      ret = OMPI_ERR_OUT_OF_RESOURCE;
      goto exit;
      
    }
      
    flat_buf->blocklens = 
      (OMPI_MPI_OFFSET_TYPE *)malloc(local_size * 
				     sizeof(OMPI_MPI_OFFSET_TYPE));
    if ( NULL == flat_buf->blocklens ){
      ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
    }
    
    flat_buf->count = local_size;
    i=0;j=0;
    while(j < local_size){
      flat_buf->indices[j] = (OMPI_MPI_OFFSET_TYPE)(intptr_t)decoded_iov[i].iov_base;
      flat_buf->blocklens[j] = decoded_iov[i].iov_len;
      if(i < (int)iov_count)
	i+=1;
      j+=1;
    }
    
#if DEBUG_ON
    printf("flat_buf_count : %d\n", flat_buf->count);
    for(i=0;i<flat_buf->count;i++){
      printf("%d: blocklen[%d] : %lld, indices[%d]: %lld \n",
	     fh->f_rank, i, flat_buf->blocklens[i], i ,flat_buf->indices[i]);
	 
    }
#endif
  }
  
#if DEBUG_ON
    printf("%d: fcoll:two_phase:write_all->total_bytes:%ld, local_count: %d\n",
	   fh->f_rank,total_bytes, local_count);
    for (i=0 ; i<local_count ; i++) {
      printf("%d: fcoll:two_phase:write_all:OFFSET:%ld,LENGTH:%ld\n",
	     fh->f_rank,
	     (size_t)iov[i].iov_base,
	     (size_t)iov[i].iov_len);
    }
    
    
#endif
    
    start_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)iov[0].iov_base;
    end_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)iov[local_count-1].iov_base +
      (OMPI_MPI_OFFSET_TYPE)iov[local_count-1].iov_len - 1; 
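
    /* Each process computes the first and last file byte it touches; the
       two allgathers below give every rank the full access picture, which
       the domain partitioning then works from. */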
    
#if DEBUG_ON
    printf("%d: fcoll:two_phase:write_all:START OFFSET:%ld,END OFFSET:%ld\n",
	   fh->f_rank,
	   (size_t)start_offset,
	   (size_t)end_offset);
    
#endif

    start_offsets = (OMPI_MPI_OFFSET_TYPE *)malloc
	(fh->f_size*sizeof(OMPI_MPI_OFFSET_TYPE));
 
    if ( NULL == start_offsets ){
      ret = OMPI_ERR_OUT_OF_RESOURCE;
      goto exit; 
    }

    end_offsets = (OMPI_MPI_OFFSET_TYPE *)malloc
	(fh->f_size*sizeof(OMPI_MPI_OFFSET_TYPE));
    
    if ( NULL == end_offsets ){
      ret =  OMPI_ERR_OUT_OF_RESOURCE;
      goto exit;
    }
    
    
    ret = fh->f_comm->c_coll.coll_allgather(&start_offset,
					    1,
					    MPI_LONG,
					    start_offsets,
					    1,
					    MPI_LONG,
					    fh->f_comm,
					    fh->f_comm->c_coll.coll_allgather_module);

    if ( OMPI_SUCCESS != ret ){
      goto exit;
    }


    ret = fh->f_comm->c_coll.coll_allgather(&end_offset,
					    1,
					    MPI_LONG,
					    end_offsets,
					    1,
					    MPI_LONG,
					    fh->f_comm,
					    fh->f_comm->c_coll.coll_allgather_module);


    if ( OMPI_SUCCESS != ret ){
      goto exit;
    }
				      
#if DEBUG_ON
    for (i=0;i<fh->f_size;i++){
	printf("%d: fcoll:two_phase:write_all:start[%d]:%ld,end[%d]:%ld\n",
	       fh->f_rank,i,
	       (size_t)start_offsets[i],i,
	       (size_t)end_offsets[i]);
    }
#endif
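
    /* A non-zero interleave_count means that the per-rank file regions
       overlap, i.e. the accesses are interleaved across processes. */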



    for (i=1; i<fh->f_size; i++){
      if ((start_offsets[i] < end_offsets[i-1]) && 
	  (start_offsets[i] <= end_offsets[i])){
	interleave_count++;
      }
    }

#if DEBUG_ON
    printf("%d: fcoll:two_phase:write_all:interleave_count:%d\n",
	   fh->f_rank,interleave_count);
#endif 
    
    
    ret = mca_fcoll_two_phase_domain_partition(fh,
					       start_offsets,
					       end_offsets,
					       &min_st_offset,
					       &fd_start,
					       &fd_end,
					       domain_size, 
					       &fd_size,
					       striping_unit,
					       mca_fcoll_two_phase_num_io_procs);
    if ( OMPI_SUCCESS != ret ){
      goto exit;
    }
    
    
#if  DEBUG_ON
	for (i=0;i<mca_fcoll_two_phase_num_io_procs;i++){
	  printf("fd_start[%d] : %lld, fd_end[%d] : %lld, local_count: %d\n",
		   i, fd_start[i], i, fd_end[i], local_count);
	}
#endif
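
	/* First determine which aggregator domains this rank's own request
	   falls into (my_req), then the dual view: which ranks' requests
	   fall into the domains this rank aggregates (others_req). */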
	
	
	ret = mca_fcoll_two_phase_calc_my_requests (fh,
						    iov,
						    local_count,
						    min_st_offset,
						    fd_start,
						    fd_end,
						    fd_size,
						    &count_my_req_procs,
						    &count_my_req_per_proc,
						    &my_req,
						    &buf_indices,
						    striping_unit,
						    mca_fcoll_two_phase_num_io_procs,
						    aggregator_list);
	if ( OMPI_SUCCESS != ret ){
	  goto exit;
	}
	
	

	ret = mca_fcoll_two_phase_calc_others_requests(fh,
						       count_my_req_procs,
						       count_my_req_per_proc,
						       my_req,
						       &count_other_req_procs,
						       &others_req);
	if (OMPI_SUCCESS != ret ){
	  goto exit;
	}
	
	
#if DEBUG_ON
	printf("count_other_req_procs : %d\n", count_other_req_procs);
#endif

#if TIME_BREAKDOWN
	  start_exch = MPI_Wtime();
#endif
	
	ret = two_phase_exch_and_write(fh,
				       buf,
				       datatype,
				       others_req,
				       iov,
				       local_count,
				       min_st_offset,
				       fd_size,
				       fd_start,
				       fd_end,
				       flat_buf,
				       buf_indices,
				       striping_unit,
				       aggregator_list);

	if (OMPI_SUCCESS != ret){
	  goto exit;
	}
	

#if TIME_BREAKDOWN
	end_exch = MPI_Wtime();
	exch_write += (end_exch - start_exch);
	
	nentry.time[0] = write_time;
	nentry.time[1] = comm_time;
	nentry.time[2] = exch_write;
	if (is_aggregator(fh->f_rank,
			  mca_fcoll_two_phase_num_io_procs,
			  aggregator_list)){
	  nentry.aggregator = 1;
	}
	else{
	  nentry.aggregator = 0;
	}
	nentry.nprocs_for_coll = mca_fcoll_two_phase_num_io_procs;
	if (!ompi_io_ompio_full_print_queue(WRITE_PRINT_QUEUE)){
	  ompi_io_ompio_register_print_entry(WRITE_PRINT_QUEUE,
					     nentry);
	}
#endif

 exit:
	if (flat_buf != NULL) {

	  if (flat_buf->blocklens != NULL) {
	    free (flat_buf->blocklens);
	  }
	  
	  if (flat_buf->indices != NULL) {
	    free (flat_buf->indices);
	  }
	  free (flat_buf);

	}



	if (start_offsets != NULL) {
	  free(start_offsets);
	}
	
	if (end_offsets != NULL){
	  free(end_offsets);
	}
	if (aggregator_list != NULL){
	  free(aggregator_list);
	}

	return ret;
}
int
mca_fcoll_static_file_write_all (mca_io_ompio_file_t *fh,
                                 void *buf,
                                 int count,
                                 struct ompi_datatype_t *datatype,
                                 ompi_status_public_t *status)
{



    size_t max_data = 0, bytes_per_cycle=0;
    struct iovec *iov=NULL, *decoded_iov=NULL;
    uint32_t iov_count=0, iov_index=0;
    int i=0,j=0,l=0, temp_index;
    int ret=OMPI_SUCCESS, cycles, local_cycles, *bytes_per_process=NULL;
    int index, *disp_index=NULL, **blocklen_per_process=NULL;
    int *iovec_count_per_process=NULL, *displs=NULL;
    size_t total_bytes_written=0;
    MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL;
    MPI_Aint bytes_to_write_in_cycle=0, global_iov_count=0, global_count=0;

    local_io_array *local_iov_array =NULL, *global_iov_array=NULL;
    local_io_array *file_offsets_for_agg=NULL;
    int *sorted=NULL, *sorted_file_offsets=NULL, temp_pindex, *temp_disp_index=NULL;
    char *send_buf=NULL, *global_buf=NULL;
    int iov_size=0, current_position=0, *current_index=NULL;
    int *bytes_remaining=NULL, entries_per_aggregator=0;
    ompi_datatype_t **recvtype = NULL;
    MPI_Request *send_req=NULL, *recv_req=NULL;
    /* For creating datatype of type io_array */
    int blocklen[3] = {1, 1, 1};
    int static_num_io_procs=1;
    OPAL_PTRDIFF_TYPE d[3], base;
    ompi_datatype_t *types[3];
    ompi_datatype_t *io_array_type=MPI_DATATYPE_NULL;
    /*----------------------------------------------*/
#if TIME_BREAKDOWN
    double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0;
    double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0;
    double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0;
    print_entry nentry;
#endif


#if DEBUG_ON
    MPI_Aint gc_in;
#endif
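
    /* Descriptive note: unlike the dynamic version, this algorithm ships
       each rank's file view to the aggregators once, as (offset, length,
       process_id) triples via a derived datatype; each aggregator then
       walks the merged, sorted list itself, cycle by cycle. */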

//  if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) {
//    fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY;
//  }

    /* In case the data is not contiguous in memory, decode it into an iovec */
    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
        ret = ompi_io_ompio_decode_datatype (fh,
                                             datatype,
                                             count,
                                             buf,
                                             &max_data,
                                             &decoded_iov,
                                             &iov_count);
        if (OMPI_SUCCESS != ret) {
            goto exit;
        }
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
        status->_ucount = max_data;
    }

    mca_io_ompio_get_num_aggregators ( & static_num_io_procs );
    ret = ompi_io_ompio_set_aggregator_props (fh,
                                              static_num_io_procs,
                                              max_data);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }


    /* io_array datatype for use in communication */
    types[0] = &ompi_mpi_long.dt;
    types[1] = &ompi_mpi_long.dt;
    types[2] = &ompi_mpi_int.dt;
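
    /* Note: local_iov_array is still NULL at this point.  Subtracting the
       base cancels the (null) address, so d[] ends up holding the byte
       offsets of the struct members -- effectively an offsetof()
       computation. */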

    d[0] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0];
    d[1] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].length;
    d[2] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].process_id;
    base = d[0];
    for (i=0 ; i<3 ; i++) {
        d[i] -= base;
    }
    ompi_datatype_create_struct (3,
                                 blocklen,
                                 d,
                                 types,
                                 &io_array_type);
    ompi_datatype_commit (&io_array_type);
    /* #########################################################*/



    ret = ompi_io_ompio_generate_current_file_view(fh,
            max_data,
            &iov,
            &iov_size);
    if (ret != OMPI_SUCCESS) {
        fprintf(stderr,"Current File View Generation Error\n");
        goto exit;
    }

    if (0 == iov_size) {
        iov_size  = 1;
    }

    local_iov_array = (local_io_array *)malloc (iov_size * sizeof(local_io_array));
    if ( NULL == local_iov_array) {
        fprintf(stderr,"local_iov_array allocation error\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }


    for (j=0; j < iov_size; j++) {
        local_iov_array[j].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)
                                    iov[j].iov_base;
        local_iov_array[j].length = (size_t)iov[j].iov_len;
        local_iov_array[j].process_id = fh->f_rank;

    }

    mca_io_ompio_get_bytes_per_agg ( (int *) &bytes_per_cycle);


    local_cycles = ceil((double)max_data/bytes_per_cycle);
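    /* Every rank must take part in the same number of collective cycles,
       so agree on the maximum cycle count across the group. */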
    ret = fh->f_comm->c_coll.coll_allreduce (&local_cycles,
            &cycles,
            1,
            MPI_INT,
            MPI_MAX,
            fh->f_comm,
            fh->f_comm->c_coll.coll_allreduce_module);

    if (OMPI_SUCCESS != ret) {
        fprintf(stderr,"local cycles allreduce!\n");
        goto exit;
    }

    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {

        disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int ));
        if (NULL == bytes_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        bytes_remaining = (int *) malloc (fh->f_procs_per_group * sizeof(int));
        if (NULL == bytes_remaining) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        current_index = (int *) malloc (fh->f_procs_per_group * sizeof(int));
        if (NULL == current_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        displs_per_process = (MPI_Aint **)
                             malloc (fh->f_procs_per_group * sizeof (MPI_Aint*));

        if (NULL == displs_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        for(i=0; i<fh->f_procs_per_group; i++) {
            current_index[i] = 0;
            bytes_remaining[i] =0;
            blocklen_per_process[i] = NULL;
            displs_per_process[i] = NULL;
        }
    }

    iovec_count_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int));
    if (NULL == iovec_count_per_process) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    displs = (int *) malloc (fh->f_procs_per_group * sizeof(int));
    if (NULL == displs) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    ret = ompi_io_ompio_allgather_array (&iov_size,
                                         1,
                                         MPI_INT,
                                         iovec_count_per_process,
                                         1,
                                         MPI_INT,
                                         fh->f_aggregator_index,
                                         fh->f_procs_in_group,
                                         fh->f_procs_per_group,
                                         fh->f_comm);

    if( OMPI_SUCCESS != ret) {
        fprintf(stderr,"iov size allgatherv array!\n");
        goto exit;
    }


    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        displs[0] = 0;
        global_iov_count = iovec_count_per_process[0];
        for (i=1 ; i<fh->f_procs_per_group ; i++) {
            global_iov_count += iovec_count_per_process[i];
            displs[i] = displs[i-1] + iovec_count_per_process[i-1];
        }
    }


    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        global_iov_array = (local_io_array *) malloc (global_iov_count *
                           sizeof(local_io_array));
        if (NULL == global_iov_array) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

    ret = ompi_io_ompio_gatherv_array (local_iov_array,
                                       iov_size,
                                       io_array_type,
                                       global_iov_array,
                                       iovec_count_per_process,
                                       displs,
                                       io_array_type,
                                       fh->f_aggregator_index,
                                       fh->f_procs_in_group,
                                       fh->f_procs_per_group,
                                       fh->f_comm);
    if (OMPI_SUCCESS != ret) {
        fprintf(stderr,"global_iov_array gather error!\n");
        goto exit;
    }

    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {

        if ( 0 == global_iov_count) {
            global_iov_count =  1;
        }

        sorted = (int *)malloc (global_iov_count * sizeof(int));
        sorted = (int *)malloc (global_iov_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        local_heap_sort (global_iov_array, global_iov_count, sorted);
    }

#if DEBUG_ON

    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        for (gc_in=0; gc_in<global_iov_count; gc_in++) {
            printf("%d: Offset[%ld]: %lld, Length[%ld]: %ld\n",
                   global_iov_array[gc_in].process_id,
                   gc_in, global_iov_array[gc_in].offset,
                   gc_in, global_iov_array[gc_in].length);
        }
    }
#endif

#if TIME_BREAKDOWN
    start_exch = MPI_Wtime();
#endif


    for (index = 0; index < cycles; index++) {
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            if (NULL == recvtype) {
                recvtype = (ompi_datatype_t **)
                           malloc (fh->f_procs_per_group  * sizeof(ompi_datatype_t *));
                if (NULL == recvtype) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }
            for(l=0; l<fh->f_procs_per_group; l++) {
                disp_index[l] =  1;
                if (NULL != blocklen_per_process[l]) {
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
                if (NULL != displs_per_process[l]) {
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                if (NULL == blocklen_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for blocklen\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }
            if (NULL != sorted_file_offsets) {
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }

            if(NULL != file_offsets_for_agg) {
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }

            if (NULL != memory_displacements) {
                free(memory_displacements);
                memory_displacements = NULL;
            }

        }
        if (local_cycles > index) {
            if ((index == local_cycles-1) && (max_data % bytes_per_cycle)) {
                bytes_to_write_in_cycle = max_data % bytes_per_cycle;
            }
            else if (max_data <= bytes_per_cycle) {
                bytes_to_write_in_cycle = max_data;
            }
            else {
                bytes_to_write_in_cycle = bytes_per_cycle;
            }
        }
        else {
            bytes_to_write_in_cycle = 0;
        }
#if DEBUG_ON
        /*    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {*/
        printf ("***%d: CYCLE %d   Bytes %ld**********\n",
                fh->f_rank,
                index,
                bytes_to_write_in_cycle);
        /* }*/
#endif
        /**********************************************************
         **Gather the Data from all the processes at the writers **
         *********************************************************/

        /* gather from each process how many bytes each will be sending */
        ompi_io_ompio_gather_array (&bytes_to_write_in_cycle,
                                    1,
                                    MPI_INT,
                                    bytes_per_process,
                                    1,
                                    MPI_INT,
                                    fh->f_aggregator_index,
                                    fh->f_procs_in_group,
                                    fh->f_procs_per_group,
                                    fh->f_comm);

        /*
           Each aggregator needs to collect bytes_to_write_in_cycle bytes
           from every process in its group, which together add up to at
           most bytes_per_cycle.
        */
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            for (i=0; i<fh->f_procs_per_group; i++) {

#if DEBUG_ON
                printf ("%d : bytes_per_process : %d\n",
                        fh->f_procs_in_group[i],
                        bytes_per_process[i]);
#endif

                while (bytes_per_process[i] > 0) {
                    if (get_process_id(global_iov_array[sorted[current_index[i]]].process_id,
                                       fh) == i) { /* current id owns this entry!*/

                        /*Add and subtract length and create
                          blocklength and displs array*/
                        if (bytes_remaining[i]) {
                            /* Remaining bytes in the current entry of
                               the global offset array */
                            if (bytes_remaining[i] <= bytes_per_process[i]) {
                                blocklen_per_process[i][disp_index[i] - 1] = bytes_remaining[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);

                                blocklen_per_process[i] = (int *) realloc
                                                          ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *)realloc
                                                        ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                bytes_per_process[i] -= bytes_remaining[i];
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                bytes_remaining[i] = 0;
                                disp_index[i] += 1;
                                /* This entry has been used up; move to this
                                   process's next entry and make current_index
                                   point there */
                                current_index[i]  = find_next_index(i,
                                                                    current_index[i],
                                                                    fh,
                                                                    global_iov_array,
                                                                    global_iov_count,
                                                                    sorted);
                                if (current_index[i] == -1) {
                                    /* No more entries left, so it's all done; exit! */
                                    break;
                                }
                                continue;
                            }
                            else {
                                blocklen_per_process[i][disp_index[i] - 1] = bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);
                                bytes_remaining[i] -= bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                        }
                        else {
                            if (bytes_per_process[i] <
                                    global_iov_array[sorted[current_index[i]]].length) {
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;

                                bytes_remaining[i] =
                                    global_iov_array[sorted[current_index[i]]].length -
                                    bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                            else {
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].length;
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;
                                blocklen_per_process[i] = (int *) realloc
                                    ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *) realloc
                                    ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                if (NULL == blocklen_per_process[i] ||
                                    NULL == displs_per_process[i]) {
                                    opal_output (1, "OUT OF MEMORY\n");
                                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                                    goto exit;
                                }
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                disp_index[i] += 1;
                                bytes_per_process[i] -=
                                    global_iov_array[sorted[current_index[i]]].length;
                                current_index[i] = find_next_index(i,
                                                                   current_index[i],
                                                                   fh,
                                                                   global_iov_array,
                                                                   global_iov_count,
                                                                   sorted);
                                if (current_index[i] == -1) {
                                    break;
                                }
                            }
                        }
                    }
                    else {
                        current_index[i] = find_next_index(i,
                                                           current_index[i],
                                                           fh,
                                                           global_iov_array,
                                                           global_iov_count,
                                                           sorted);
                        if (current_index[i] == -1) {
                            bytes_per_process[i] = 0; /* no more entries left
                                                         to service this request */
                            continue;
                        }
                    }
                }
            }
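            /* count the non-empty blocks collected for this cycle across all
               group members; this determines the size of the offset/length
               arrays that are sorted and turned into the write plan below */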
            entries_per_aggregator=0;
            for (i=0; i<fh->f_procs_per_group; i++) {
                for (j=0; j<disp_index[i]; j++) {
                    if (blocklen_per_process[i][j] > 0) {
                        entries_per_aggregator++;
#if DEBUG_ON
                        printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
                               fh->f_procs_in_group[i],j,
                               blocklen_per_process[i][j],j,
                               displs_per_process[i][j],
                               fh->f_rank);

#endif
                    }

                }
            }

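            /* flatten the per-process block lists into a single array of
               (length, owner, file offset) triples so they can be sorted by
               file offset; if no process has data this cycle, skip ahead to
               the next cycle */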
            if (entries_per_aggregator > 0) {
                file_offsets_for_agg = (local_io_array *)
                                       malloc(entries_per_aggregator*sizeof(local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                sorted_file_offsets = (int *)
                                      malloc (entries_per_aggregator*sizeof(int));
                if (NULL == sorted_file_offsets) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret =  OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                temp_index = 0;
                for (i=0; i<fh->f_procs_per_group; i++) {
                    for(j=0; j<disp_index[i]; j++) {
                        if (blocklen_per_process[i][j] > 0) {
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;
                        }
                    }
                }
            }
            else {
                continue;
            }
            local_heap_sort (file_offsets_for_agg,
                             entries_per_aggregator,
                             sorted_file_offsets);

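            /* assign every entry its position in the aggregator's staging
               buffer (global_buf): in ascending file-offset order each entry
               starts where its predecessor ends, i.e. an exclusive prefix
               sum over the sorted lengths */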
            memory_displacements = (MPI_Aint *) malloc
                                   (entries_per_aggregator * sizeof(MPI_Aint));
            if (NULL == memory_displacements) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++) {
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }

            temp_disp_index = (int *) calloc (fh->f_procs_per_group, sizeof (int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
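            /* from here on displs_per_process is re-used for the receive
               side: walking the entries in ascending file-offset order,
               store each entry's displacement inside global_buf in the
               owning process' next slot (the file offsets themselves survive
               in file_offsets_for_agg); since each process' blocks were
               generated in ascending file-offset order above, the
               blocklen/displacement pairing stays consistent */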
            global_count = 0;
            for (i=0; i<entries_per_aggregator; i++) {
                temp_pindex =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_pindex] < disp_index[temp_pindex]) {
                    temp_disp_index[temp_pindex] += 1;
                }
                else {
                    opal_output (1, "temp_disp_index[%d]: %d exceeds disp_index[%d]: %d\n",
                                 temp_pindex, temp_disp_index[temp_pindex],
                                 temp_pindex, disp_index[temp_pindex]);
                }
                global_count +=
                    file_offsets_for_agg[sorted_file_offsets[i]].length;
            }
            if (NULL != temp_disp_index) {
                free(temp_disp_index);
                temp_disp_index = NULL;
            }

#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0; i<entries_per_aggregator; i++) {
                printf("%d: OFFSET: %lld   LENGTH: %ld, Mem-offset: %ld, disp : %d\n",
                       file_offsets_for_agg[sorted_file_offsets[i]].process_id,
                       file_offsets_for_agg[sorted_file_offsets[i]].offset,
                       file_offsets_for_agg[sorted_file_offsets[i]].length,
                       memory_displacements[sorted_file_offsets[i]],
                       disp_index[ file_offsets_for_agg[sorted_file_offsets[i]].process_id]);
            }
#endif

#if DEBUG_ON
            printf("%d: global_count : %ld, bytes_to_write_in_cycle : %ld, procs_per_group: %d\n",
                   fh->f_rank,
                   global_count,
                   bytes_to_write_in_cycle,
                   fh->f_procs_per_group);
#endif
#if TIME_BREAKDOWN
            start_comm_time = MPI_Wtime();
#endif
            global_buf = (char *) malloc (global_count);
            if (NULL == global_buf) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            recv_req = (MPI_Request *)
                       malloc (fh->f_procs_per_group * sizeof(MPI_Request));
            if (NULL == recv_req) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
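            /* build one hindexed datatype per group member: the block
               lengths describe that sender's contribution, and the
               displacements now point into global_buf, so each irecv
               scatters the incoming bytes directly to their
               file-offset-ordered positions */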
            for (i=0; i<fh->f_procs_per_group; i++) {
                ompi_datatype_create_hindexed(disp_index[i],
                                              blocklen_per_process[i],
                                              displs_per_process[i],
                                              MPI_BYTE,
                                              &recvtype[i]);
                ompi_datatype_commit(&recvtype[i]);
                ret = MCA_PML_CALL(irecv(global_buf,
                                         1,
                                         recvtype[i],
                                         fh->f_procs_in_group[i],
                                         123,
                                         fh->f_comm,
                                         &recv_req[i]));
                if (OMPI_SUCCESS != ret) {
                    fprintf(stderr,"irecv Error!\n");
                    goto exit;
                }
            }
        }

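        /* all processes, aggregators included, now send their share of this
           cycle's data to their aggregator: a contiguous user buffer is
           sent in place, otherwise the data is packed into a temporary
           send buffer first */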
        if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
            send_buf = &((char*)buf)[total_bytes_written];
        }
        else if (bytes_to_write_in_cycle) {
            /* allocate a send buffer and copy the data that needs
               to be sent into it in case the data is non-contiguous
               in memory */
            OPAL_PTRDIFF_TYPE mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;

            send_buf = malloc (bytes_to_write_in_cycle);
            if (NULL == send_buf) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            remaining = bytes_to_write_in_cycle;

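            /* walk the decoded iovec from where the previous cycle left off
               (iov_index and current_position persist across cycles) and
               copy until this cycle's quota has been packed */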
            while (remaining) {
                mem_address = (OPAL_PTRDIFF_TYPE)
                              (decoded_iov[iov_index].iov_base) + current_position;

                if (remaining >=
                        (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *)mem_address,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                                (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                                    (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *)mem_address,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }
        }
        total_bytes_written += bytes_to_write_in_cycle;

        send_req = (MPI_Request *) malloc (sizeof(MPI_Request));
        if (NULL == send_req) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

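        /* ship this cycle's contribution to the aggregator of this group;
           tag 123 matches the irecvs posted above */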
        ret = MCA_PML_CALL(isend(send_buf,
                                 bytes_to_write_in_cycle,
                                 MPI_BYTE,
                                 fh->f_procs_in_group[fh->f_aggregator_index],
                                 123,
                                 MCA_PML_BASE_SEND_STANDARD,
                                 fh->f_comm,
                                 send_req));

        if (OMPI_SUCCESS != ret) {
            fprintf(stderr,"isend error!\n");
            goto exit;
        }

        ret = ompi_request_wait (send_req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != ret) {
            goto exit;
        }

        /* the pack buffer is allocated per cycle for the non-contiguous
           case and is no longer needed once the send has completed */
        if (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) && NULL != send_buf) {
            free (send_buf);
            send_buf = NULL;
        }

        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            ret = ompi_request_wait_all (fh->f_procs_per_group,
                                         recv_req,
                                         MPI_STATUSES_IGNORE);
            if (OMPI_SUCCESS != ret) {
                goto exit;
            }

#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
                for (i=0 ; i<global_count/4 ; i++)
                    printf (" RECV %d \n",((int *)global_buf)[i]);
            }
#endif
        }
#if TIME_BREAKDOWN
        end_comm_time = MPI_Wtime();
        comm_time += end_comm_time - start_comm_time;
#endif



        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
                             (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            fh->f_num_of_io_entries = 0;
            /*First entry for every aggregator*/
            fh->f_io_array[fh->f_num_of_io_entries].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[fh->f_num_of_io_entries].length =
                file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;
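            /* coalesce entries that are contiguous in the file: if a segment
               ends exactly where the next one begins, grow the previous
               io_array entry instead of emitting a new one, reducing the
               number of segments handed to the fbtl */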
            for (i=1; i<entries_per_aggregator; i++) {
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                        file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                        file_offsets_for_agg[sorted_file_offsets[i]].offset) {
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else {
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }
            }
#if DEBUG_ON
            printf("*************************** %d\n", fh->f_num_of_io_entries);
            for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
                printf(" ADDRESS: %p  OFFSET: %ld   LENGTH: %ld\n",
                       fh->f_io_array[i].memory_address,
                       (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset,
                       fh->f_io_array[i].length);
            }
#endif

#if TIME_BREAKDOWN
            start_write_time = MPI_Wtime();
#endif

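            /* hand the merged io_array to the fbtl component, which performs
               the actual (vectored) write of this cycle's data */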
            if (fh->f_num_of_io_entries) {
                if ( 0 >  fh->f_fbtl->fbtl_pwritev (fh)) {
                    opal_output (1, "WRITE FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }

#if TIME_BREAKDOWN
            end_write_time = MPI_Wtime();
            write_time += end_write_time - start_write_time;
#endif

        }
        if (NULL != send_req) {
            free(send_req);
            send_req = NULL;
        }

        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            fh->f_num_of_io_entries = 0;
            if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
            if (NULL != recvtype) {
                for (i = 0; i < fh->f_procs_per_group; i++) {
                    ompi_datatype_destroy (recvtype + i);
                }
                free (recvtype);
                recvtype = NULL;
            }
            if (NULL != recv_req) {
                free (recv_req);
                recv_req = NULL;
            }
            if (NULL != global_buf) {
                free (global_buf);
                global_buf = NULL;
            }
            /* these arrays are allocated anew in every cycle; release them
               here so an aggregator does not leak one set per cycle */
            if (NULL != file_offsets_for_agg) {
                free (file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }
            if (NULL != sorted_file_offsets) {
                free (sorted_file_offsets);
                sorted_file_offsets = NULL;
            }
            if (NULL != memory_displacements) {
                free (memory_displacements);
                memory_displacements = NULL;
            }
        }
    }

#if TIME_BREAKDOWN
    end_exch = MPI_Wtime();
    exch_write += end_exch - start_exch;
    nentry.time[0] = write_time;
    nentry.time[1] = comm_time;
    nentry.time[2] = exch_write;
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank)
        nentry.aggregator = 1;
    else
        nentry.aggregator = 0;
    nentry.nprocs_for_coll = dynamic_num_io_procs;
    if (!ompi_io_ompio_full_print_queue(WRITE_PRINT_QUEUE)) {
        ompi_io_ompio_register_print_entry(WRITE_PRINT_QUEUE,
                                           nentry);
    }
#endif



exit:
    if (NULL != decoded_iov) {
        free(decoded_iov);
        decoded_iov = NULL;
    }

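    /* the per-process bookkeeping arrays below exist only on the
       aggregators, so only they release them */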
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {

        if (NULL != disp_index) {
            free(disp_index);
            disp_index = NULL;
        }

        if (NULL != local_iov_array) {
            free(local_iov_array);
            local_iov_array = NULL;
        }
        for (l = 0; l < fh->f_procs_per_group; l++) {
            if (NULL != blocklen_per_process && NULL != blocklen_per_process[l]) {
                free(blocklen_per_process[l]);
                blocklen_per_process[l] = NULL;
            }
            if (NULL != displs_per_process && NULL != displs_per_process[l]) {
                free(displs_per_process[l]);
                displs_per_process[l] = NULL;
            }
        }
        if (NULL != blocklen_per_process) {
            free(blocklen_per_process);
            blocklen_per_process = NULL;
        }
        if (NULL != displs_per_process) {
            free(displs_per_process);
            displs_per_process = NULL;
        }
        if (NULL != bytes_remaining) {
            free(bytes_remaining);
            bytes_remaining = NULL;
        }
        if (NULL != current_index) {
            free(current_index);
            current_index = NULL;
        }
    }
    return ret;
}