示例#1
0
/**
 * Conversion function. They deal with data-types in 3 ways, always making local copies.
 * In order to allow performance testings, there are 3 functions:
 *  - one copying directly from one memory location to another one using the
 *    data-type copy function.
 *  - one which use a 2 convertors created with the same data-type
 *  - and one using 2 convertors created from different data-types.
 *
 */
static int local_copy_ddt_count( opal_datatype_t const * const pdt, int count )
{
    OPAL_PTRDIFF_TYPE lb, extent;
    size_t malloced_size;
    char *odst, *osrc;
    void *pdst, *psrc;
    TIMER_DATA_TYPE start, end;
    long total_time;
    int errors = 0;

    malloced_size = compute_memory_size(pdt, count);
    opal_datatype_get_extent( pdt, &lb, &extent );

    odst = (char*)malloc( malloced_size );
    osrc = (char*)malloc( malloced_size );

    {
        for( size_t i = 0; i < malloced_size; i++ )
            osrc[i] = i % 128 + 32;
        memcpy(odst, osrc, malloced_size);
    }
    pdst = odst - lb;
    psrc = osrc - lb;

    cache_trash();  /* make sure the cache is useless */

    GET_TIME( start );
    if( OPAL_SUCCESS != opal_datatype_copy_content_same_ddt( pdt, count, pdst, psrc ) ) {
        printf( "Unable to copy the datatype in the function local_copy_ddt_count."
                " Is the datatype committed ?\n" );
    }
    GET_TIME( end );
    total_time = ELAPSED_TIME( start, end );
    printf( "direct local copy in %ld microsec\n", total_time );
    if(outputFlags & VALIDATE_DATA) {
        for( size_t i = 0; i < malloced_size; i++ ) {
            if( odst[i] != osrc[i] ) {
                printf("error at position %lu (%d != %d)\n",
                       (unsigned long)i, (int)(odst[i]), (int)(osrc[i]));
                errors++;
                if(outputFlags & QUIT_ON_FIRST_ERROR) {
                    opal_datatype_dump(pdt);
                    assert(0); exit(-1);
                }
            }
        }
        if( 0 == errors ) {
            printf("Validation check succesfully passed\n");
        } else {
            printf("Found %d errors. Giving up!\n", errors);
            exit(-1);
        }
    }
    free( odst );
    free( osrc );

    return (0 == errors ? OPAL_SUCCESS : errors);
}
int mca_io_ompio_set_view_internal(mca_io_ompio_file_t *fh,
				   OMPI_MPI_OFFSET_TYPE disp,
				   ompi_datatype_t *etype,
				   ompi_datatype_t *filetype,
				   char *datarep,
				   ompi_info_t *info)
{


    size_t max_data = 0;
    MPI_Aint lb,ub;    

    fh->f_iov_count   = 0;
    fh->f_disp        = disp;
    fh->f_offset      = disp;
    fh->f_total_bytes = 0;
    
    ompi_io_ompio_decode_datatype (fh, 
                                   filetype, 
                                   1, 
                                   NULL, 
                                   &max_data,
                                   &fh->f_decoded_iov, 
                                   &fh->f_iov_count);

    opal_datatype_get_extent(&filetype->super, &lb, &fh->f_view_extent);
    opal_datatype_type_ub   (&filetype->super, &ub);
    opal_datatype_type_size (&etype->super, &fh->f_etype_size);
    opal_datatype_type_size (&filetype->super, &fh->f_view_size);
    ompi_datatype_duplicate (etype, &fh->f_etype);
    ompi_datatype_duplicate (filetype, &fh->f_filetype);
    
    fh->f_cc_size = get_contiguous_chunk_size (fh);

    if (opal_datatype_is_contiguous_memory_layout(&etype->super,1)) {
        if (opal_datatype_is_contiguous_memory_layout(&filetype->super,1) && 
	    fh->f_view_extent == (OPAL_PTRDIFF_TYPE)fh->f_view_size ) {
            fh->f_flags |= OMPIO_CONTIGUOUS_FVIEW;
        }
    }



    
    return OMPI_SUCCESS;
}
示例#3
0
int mpich_typeub( void )
{
   int errs = 0;
   OPAL_PTRDIFF_TYPE extent, lb, extent1, extent2, extent3;
   OPAL_PTRDIFF_TYPE displ[2];
   int blens[2];
   opal_datatype_t *type1, *type2, *type3, *types[2];

   opal_datatype_create_vector( 2, 1, 4, &opal_datatype_int4, &type1 );
   opal_datatype_commit( type1 );
   opal_datatype_get_extent( type1, &lb, &extent );
   extent1 = 5 * sizeof(int);
   if (extent != extent1) {
      printf("EXTENT 1 %ld != %ld\n", (long)extent, (long)extent1);
      errs++;
      printf("extent(type1)=%ld\n",(long)extent);
   }

   blens[0] = 1;
   blens[1] = 1;
   displ[0] = 0;
   displ[1] = sizeof(int)*4;
   types[0] = type1;
   types[1] = (opal_datatype_t*)&opal_datatype_ub;
   extent2  = displ[1];

   /*    using MPI_UB and Type_struct, monkey with the extent, making it 16
    */
   opal_datatype_create_struct( 2, blens, displ, types, &type2 );
   opal_datatype_commit( type2 );
   opal_datatype_get_extent( type2, &lb, &extent );
   if (extent != extent2) {
      printf("EXTENT 2 %ld != %ld\n", (long)extent, (long)extent2);
      errs++;
      printf("extent(type2)=%ld\n",(long)extent);
   }

   /*    monkey with the extent again, making it 4
    *     ===> MPICH gives 4
    *     ===> MPIF gives 16, the old extent
    */
   displ[1] = sizeof(int);
   types[0] = type2;
   types[1] = (opal_datatype_t*)&opal_datatype_ub;
   extent3  = extent2;

   opal_datatype_create_struct( 2, blens, displ, types, &type3 );
   opal_datatype_commit( type3 );

   opal_datatype_get_extent( type3, &lb, &extent );
   if (extent != extent3) {
      printf("EXTENT 3 %ld != %ld\n", (long)extent, (long)extent3);
      errs++;
      printf("extent(type3)=%ld\n",(long)extent);
   }

   OBJ_RELEASE( type1 ); /*assert( type1 == NULL );*/
   OBJ_RELEASE( type2 ); /*assert( type2 == NULL );*/
   OBJ_RELEASE( type3 ); assert( type3 == NULL );
   return errs;
}
int
mca_fcoll_dynamic_gen2_file_read_all (mca_io_ompio_file_t *fh,
                                 void *buf,
                                 int count,
                                 struct ompi_datatype_t *datatype,
                                 ompi_status_public_t *status)
{
    MPI_Aint position = 0;
    MPI_Aint total_bytes = 0;          /* total bytes to be read */
    MPI_Aint bytes_to_read_in_cycle = 0; /* left to be read in a cycle*/
    MPI_Aint bytes_per_cycle = 0;      /* total read in each cycle by each process*/
    int index = 0, ret=OMPI_SUCCESS;
    int cycles = 0;
    int i=0, j=0, l=0;
    int n=0; /* current position in total_bytes_per_process array */
    MPI_Aint bytes_remaining = 0; /* how many bytes have been read from the current
                                     value from total_bytes_per_process */
    int *sorted_file_offsets=NULL, entries_per_aggregator=0;
    int bytes_received = 0;
    int blocks = 0;
    /* iovec structure and count of the buffer passed in */
    uint32_t iov_count = 0;
    struct iovec *decoded_iov = NULL;
    int iov_index = 0;
    size_t current_position = 0;
    struct iovec *local_iov_array=NULL, *global_iov_array=NULL;
    char *receive_buf = NULL;
    MPI_Aint *memory_displacements=NULL;
    /* global iovec at the readers that contain the iovecs created from
       file_set_view */
    uint32_t total_fview_count = 0;
    int local_count = 0;
    int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL;
    int current_index=0, temp_index=0;
    int **blocklen_per_process=NULL;
    MPI_Aint **displs_per_process=NULL;
    char *global_buf = NULL;
    MPI_Aint global_count = 0;
    mca_io_ompio_local_io_array *file_offsets_for_agg=NULL;

    /* array that contains the sorted indices of the global_iov */
    int *sorted = NULL;
    int *displs = NULL;
    int dynamic_gen2_num_io_procs;
    size_t max_data = 0;
    MPI_Aint *total_bytes_per_process = NULL;
    ompi_datatype_t **sendtype = NULL;
    MPI_Request *send_req=NULL, recv_req=NULL;
    int my_aggregator =-1;
    bool recvbuf_is_contiguous=false;
    size_t ftype_size;
    OPAL_PTRDIFF_TYPE ftype_extent, lb;


#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0;
    double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0;
    double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0;
    mca_io_ompio_print_entry nentry;
#endif

    /**************************************************************************
     ** 1. In case the data is not contigous in memory, decode it into an iovec
     **************************************************************************/

    opal_datatype_type_size ( &datatype->super, &ftype_size );
    opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent );

    if ( (ftype_extent == (OPAL_PTRDIFF_TYPE) ftype_size)             &&
        opal_datatype_is_contiguous_memory_layout(&datatype->super,1) &&
        0 == lb ) {
        recvbuf_is_contiguous = true;
    }


    if (! recvbuf_is_contiguous ) {
        ret = fh->f_decode_datatype ((struct mca_io_ompio_file_t *)fh,
                                     datatype,
                                     count,
                                     buf,
                                     &max_data,
                                     &decoded_iov,
                                     &iov_count);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
        status->_ucount = max_data;
    }

    fh->f_get_num_aggregators ( &dynamic_gen2_num_io_procs);
    ret = fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh,
                                      dynamic_gen2_num_io_procs,
                                      max_data);
    if (OMPI_SUCCESS != ret){
        goto exit;
    }
    my_aggregator = fh->f_procs_in_group[fh->f_aggregator_index];

    /**************************************************************************
     ** 2. Determine the total amount of data to be written
     **************************************************************************/
    total_bytes_per_process = (MPI_Aint*)malloc(fh->f_procs_per_group*sizeof(MPI_Aint));
    if (NULL == total_bytes_per_process) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fcoll_base_coll_allgather_array (&max_data,
                                           1,
                                           MPI_LONG,
                                           total_bytes_per_process,
                                           1,
                                           MPI_LONG,
                                           fh->f_aggregator_index,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);
    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

    for (i=0 ; i<fh->f_procs_per_group ; i++) {
        total_bytes += total_bytes_per_process[i];
    }

    if (NULL != total_bytes_per_process) {
        free (total_bytes_per_process);
        total_bytes_per_process = NULL;
    }

    /*********************************************************************
     *** 3. Generate the File offsets/lengths corresponding to this write
     ********************************************************************/
    ret = fh->f_generate_current_file_view ((struct mca_io_ompio_file_t *) fh,
                                            max_data,
                                            &local_iov_array,
                                            &local_count);

    if (ret != OMPI_SUCCESS){
        goto exit;
    }

    /*************************************************************
     *** 4. Allgather the File View information at all processes
     *************************************************************/

    fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == fview_count) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fcoll_base_coll_allgather_array (&local_count,
                                           1,
                                           MPI_INT,
                                           fview_count,
                                           1,
                                           MPI_INT,
                                           fh->f_aggregator_index,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);
    
    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

    displs = (int*)malloc (fh->f_procs_per_group*sizeof(int));
    if (NULL == displs) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    displs[0] = 0;
    total_fview_count = fview_count[0];
    for (i=1 ; i<fh->f_procs_per_group ; i++) {
        total_fview_count += fview_count[i];
        displs[i] = displs[i-1] + fview_count[i-1];
    }

#if DEBUG_ON
    if (my_aggregator == fh->f_rank) {
    for (i=0 ; i<fh->f_procs_per_group ; i++) {
    printf ("%d: PROCESS: %d  ELEMENTS: %d  DISPLS: %d\n",
        fh->f_rank,
        i,
        fview_count[i],
        displs[i]);
}
}
#endif

    /* allocate the global iovec  */
    if (0 != total_fview_count) {
        global_iov_array = (struct iovec*)malloc (total_fview_count *
                                                  sizeof(struct iovec));
        if (NULL == global_iov_array) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret =  fcoll_base_coll_allgatherv_array (local_iov_array,
                                             local_count,
                                             fh->f_iov_type,
                                             global_iov_array,
                                             fview_count,
                                             displs,
                                             fh->f_iov_type,
                                             fh->f_aggregator_index,
                                             fh->f_procs_in_group,
                                             fh->f_procs_per_group,
                                             fh->f_comm);

    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

    /****************************************************************************************
     *** 5. Sort the global offset/lengths list based on the offsets.
     *** The result of the sort operation is the 'sorted', an integer array,
     *** which contains the indexes of the global_iov_array based on the offset.
     *** For example, if global_iov_array[x].offset is followed by global_iov_array[y].offset
     *** in the file, and that one is followed by global_iov_array[z].offset, than
     *** sorted[0] = x, sorted[1]=y and sorted[2]=z;
     ******************************************************************************************/
    if (0 != total_fview_count) {
       sorted = (int *)malloc (total_fview_count * sizeof(int));
      if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        fh->f_sort_iovec (global_iov_array, total_fview_count, sorted);
    }

    if (NULL != local_iov_array) {
        free (local_iov_array);
        local_iov_array = NULL;
    }

#if DEBUG_ON
    if (my_aggregator == fh->f_rank) {
        for (i=0 ; i<total_fview_count ; i++) {
            printf("%d: OFFSET: %p   LENGTH: %d\n",
                   fh->f_rank,
                   global_iov_array[sorted[i]].iov_base,
                   global_iov_array[sorted[i]].iov_len);
        }
    }
#endif

    /*************************************************************
     *** 6. Determine the number of cycles required to execute this
     ***    operation
     *************************************************************/
    fh->f_get_bytes_per_agg ( (int *) &bytes_per_cycle);
    cycles = ceil((double)total_bytes/bytes_per_cycle);

    if ( my_aggregator == fh->f_rank) {
      disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
      if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        displs_per_process = (MPI_Aint **)malloc (fh->f_procs_per_group * sizeof (MPI_Aint*));
        if (NULL == displs_per_process){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        for (i=0;i<fh->f_procs_per_group;i++){
            blocklen_per_process[i] = NULL;
            displs_per_process[i] = NULL;
        }

	send_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request));
	if (NULL == send_req){
	    opal_output ( 1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	global_buf = (char *) malloc (bytes_per_cycle);
	if (NULL == global_buf){
	    opal_output(1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	sendtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *));
	if (NULL == sendtype) {
            opal_output (1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	for(l=0;l<fh->f_procs_per_group;l++){
            sendtype[l] = MPI_DATATYPE_NULL;
	}
    }




#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif
    n = 0;
    bytes_remaining = 0;
    current_index = 0;

    for (index = 0; index < cycles; index++) {
        /**********************************************************************
         ***  7a. Getting ready for next cycle: initializing and freeing buffers
	 **********************************************************************/
        if (my_aggregator == fh->f_rank) {
             if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
            fh->f_num_of_io_entries = 0;

            if (NULL != sendtype){
                for (i =0; i< fh->f_procs_per_group; i++) {
		    if ( MPI_DATATYPE_NULL != sendtype[i] ) {
                        ompi_datatype_destroy(&sendtype[i]);
                        sendtype[i] = MPI_DATATYPE_NULL;
                    }
		}
            }

            for(l=0;l<fh->f_procs_per_group;l++){
                disp_index[l] =  1;

                if (NULL != blocklen_per_process[l]){
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
                if (NULL != displs_per_process[l]){
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                if (NULL == blocklen_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for blocklen\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l]){
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }

            if (NULL != sorted_file_offsets){
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }

            if(NULL != file_offsets_for_agg){
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }
            if (NULL != memory_displacements){
                free(memory_displacements);
                memory_displacements = NULL;
            }
        }  /* (my_aggregator == fh->f_rank */

        /**************************************************************************
         ***  7b. Determine the number of bytes to be actually read in this cycle
	 **************************************************************************/
        if (cycles-1 == index) {
            bytes_to_read_in_cycle = total_bytes - bytes_per_cycle*index;
        }
        else {
            bytes_to_read_in_cycle = bytes_per_cycle;
        }

#if DEBUG_ON
        if (my_aggregator == fh->f_rank) {
            printf ("****%d: CYCLE %d   Bytes %d**********\n",
                    fh->f_rank,
                    index,
                    bytes_to_write_in_cycle);
        }
#endif

        /*****************************************************************
         *** 7c. Calculate how much data will be contributed in this cycle
	 ***     by each process
         *****************************************************************/
        bytes_received = 0;

        while (bytes_to_read_in_cycle) {
            /* This next block identifies which process is the holder
            ** of the sorted[current_index] element;
            */
            blocks = fview_count[0];
            for (j=0 ; j<fh->f_procs_per_group ; j++) {
                if (sorted[current_index] < blocks) {
                    n = j;
                    break;
                }
                else {
                    blocks += fview_count[j+1];
                }
            }

            if (bytes_remaining) {
                /* Finish up a partially used buffer from the previous  cycle */
                if (bytes_remaining <= bytes_to_read_in_cycle) {
                    /* Data fits completely into the block */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len - bytes_remaining);

                        blocklen_per_process[n] = (int *) realloc
                            ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
                        displs_per_process[n] = (MPI_Aint *) realloc
                            ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_remaining;
                    }
                    current_index ++;
                    bytes_to_read_in_cycle -= bytes_remaining;
                    bytes_remaining = 0;
                    continue;
                }
                else {
                     /* the remaining data from the previous cycle is larger than the
                        bytes_to_write_in_cycle, so we have to segment again */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len
                             - bytes_remaining);
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_to_read_in_cycle;
                    }
                    bytes_remaining -= bytes_to_read_in_cycle;
                    bytes_to_read_in_cycle = 0;
                    break;
                }
            }
            else {
                /* No partially used entry available, have to start a new one */
                if (bytes_to_read_in_cycle <
                    (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) {
                    /* This entry has more data than we can sendin one cycle */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ;
                    }

                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_to_read_in_cycle;
                    }
                    bytes_remaining = global_iov_array[sorted[current_index]].iov_len -
                        bytes_to_read_in_cycle;
                    bytes_to_read_in_cycle = 0;
                    break;
                }
                else {
                    /* Next data entry is less than bytes_to_write_in_cycle */
                    if (my_aggregator ==  fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] =
                            global_iov_array[sorted[current_index]].iov_len;
                        displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)
                            global_iov_array[sorted[current_index]].iov_base;
                        blocklen_per_process[n] =
                            (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
                        displs_per_process[n] = (MPI_Aint *)realloc
                            ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received +=
                            global_iov_array[sorted[current_index]].iov_len;
                    }
                    bytes_to_read_in_cycle -=
                        global_iov_array[sorted[current_index]].iov_len;
                    current_index ++;
                    continue;
                }
            }
        } /* end while (bytes_to_read_in_cycle) */

        /*************************************************************************
	 *** 7d. Calculate the displacement on where to put the data and allocate
         ***     the recieve buffer (global_buf)
	 *************************************************************************/
        if (my_aggregator == fh->f_rank) {
            entries_per_aggregator=0;
            for (i=0;i<fh->f_procs_per_group; i++){
                for (j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0)
                        entries_per_aggregator++ ;
                }
            }
            if (entries_per_aggregator > 0){
                file_offsets_for_agg = (mca_io_ompio_local_io_array *)
                    malloc(entries_per_aggregator*sizeof(mca_io_ompio_local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                sorted_file_offsets = (int *)
                    malloc (entries_per_aggregator*sizeof(int));
                if (NULL == sorted_file_offsets){
                    opal_output (1, "OUT OF MEMORY\n");
                    ret =  OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                /*Moving file offsets to an IO array!*/
                temp_index = 0;
                global_count = 0;
                for (i=0;i<fh->f_procs_per_group; i++){
                    for(j=0;j<disp_index[i];j++){
                        if (blocklen_per_process[i][j] > 0){
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            global_count += blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;
                        }
                    }
                }
            }
            else{
                continue;
            }

             /* Sort the displacements for each aggregator */
            read_heap_sort (file_offsets_for_agg,
                            entries_per_aggregator,
                            sorted_file_offsets);

            memory_displacements = (MPI_Aint *) malloc
                (entries_per_aggregator * sizeof(MPI_Aint));
            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++){
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }

             /**********************************************************
	     *** 7e. Create the io array, and pass it to fbtl
	     *********************************************************/
            fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
                (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            fh->f_num_of_io_entries = 0;
            fh->f_io_array[0].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[0].length =
                file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[0].memory_address =
                global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;
            for (i=1;i<entries_per_aggregator;i++){
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                    file_offsets_for_agg[sorted_file_offsets[i]].offset){
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else{
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }
            }


#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_read_time = MPI_Wtime();
#endif

            if (fh->f_num_of_io_entries) {
                if ( 0 >  fh->f_fbtl->fbtl_preadv (fh)) {
                    opal_output (1, "READ FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_read_time = MPI_Wtime();
            read_time += end_read_time - start_read_time;
#endif
            /**********************************************************
             ******************** DONE READING ************************
             *********************************************************/

            temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            for (i=0; i<entries_per_aggregator; i++){
                temp_index =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_index][temp_disp_index[temp_index]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_index] < disp_index[temp_index]){
                    temp_disp_index[temp_index] += 1;
                }
                else{
                    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
                           temp_index, temp_disp_index[temp_index],
                           temp_index, disp_index[temp_index]);
                }
            }
            if (NULL != temp_disp_index){
                free(temp_disp_index);
                temp_disp_index = NULL;
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_rcomm_time = MPI_Wtime();
#endif
            for (i=0;i<fh->f_procs_per_group;i++){
                send_req[i] = MPI_REQUEST_NULL;
                if ( 0 < disp_index[i] ) {
                    ompi_datatype_create_hindexed(disp_index[i],
                                                  blocklen_per_process[i],
                                                  displs_per_process[i],
                                                  MPI_BYTE,
                                                  &sendtype[i]);
                    ompi_datatype_commit(&sendtype[i]);
                    ret = MCA_PML_CALL (isend(global_buf,
                                              1,
                                              sendtype[i],
                                              fh->f_procs_in_group[i],
                                              123,
                                              MCA_PML_BASE_SEND_STANDARD,
                                              fh->f_comm,
                                              &send_req[i]));
                    if(OMPI_SUCCESS != ret){
                        goto exit;
                    }
                }
            }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_rcomm_time = MPI_Wtime();
            rcomm_time += end_rcomm_time - start_rcomm_time;
#endif
        }

        /**********************************************************
         *** 7f.  Scatter the Data from the readers
         *********************************************************/
        if ( recvbuf_is_contiguous ) {
            receive_buf = &((char*)buf)[position];
        }
        else if (bytes_received) {
            /* allocate a receive buffer and copy the data that needs
               to be received into it in case the data is non-contigous
               in memory */
            receive_buf = malloc (bytes_received);
            if (NULL == receive_buf) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        start_rcomm_time = MPI_Wtime();
#endif
        ret = MCA_PML_CALL(irecv(receive_buf,
                                 bytes_received,
                                 MPI_BYTE,
                                 my_aggregator,
                                 123,
                                 fh->f_comm,
                                 &recv_req));
        if (OMPI_SUCCESS != ret){
            goto exit;
        }


        if (my_aggregator == fh->f_rank){
            ret = ompi_request_wait_all (fh->f_procs_per_group,
                                         send_req,
                                         MPI_STATUS_IGNORE);
            if (OMPI_SUCCESS != ret){
                goto exit;
            }
        }

        ret = ompi_request_wait (&recv_req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }
        position += bytes_received;

        /* If data is not contigous in memory, copy the data from the
           receive buffer into the buffer passed in */
        if (!recvbuf_is_contiguous ) {
            OPAL_PTRDIFF_TYPE mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;

            remaining = bytes_received;

            while (remaining) {
                mem_address = (OPAL_PTRDIFF_TYPE)
                    (decoded_iov[iov_index].iov_base) + current_position;

                if (remaining >=
                    (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                        (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                        (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else {
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }

            if (NULL != receive_buf) {
                free (receive_buf);
                receive_buf = NULL;
            }
        }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time += end_rcomm_time - start_rcomm_time;
#endif
    } /* end for (index=0; index < cycles; index ++) */

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rexch = MPI_Wtime();
    read_exch += end_rexch - start_rexch;
    nentry.time[0] = read_time;
    nentry.time[1] = rcomm_time;
    nentry.time[2] = read_exch;
    if (my_aggregator == fh->f_rank)
        nentry.aggregator = 1;
    else
        nentry.aggregator = 0;
    nentry.nprocs_for_coll = dynamic_gen2_num_io_procs;
    if (!fh->f_full_print_queue(READ_PRINT_QUEUE)){
        fh->f_register_print_entry(READ_PRINT_QUEUE,
                                   nentry);
    }
#endif

exit:
    if (!recvbuf_is_contiguous) {
        if (NULL != receive_buf) {
            free (receive_buf);
            receive_buf = NULL;
        }
    }
    if (NULL != global_buf) {
        free (global_buf);
        global_buf = NULL;
    }
    if (NULL != sorted) {
        free (sorted);
        sorted = NULL;
    }
    if (NULL != global_iov_array) {
        free (global_iov_array);
        global_iov_array = NULL;
    }
    if (NULL != fview_count) {
        free (fview_count);
        fview_count = NULL;
    }
    if (NULL != decoded_iov) {
        free (decoded_iov);
        decoded_iov = NULL;
    }
    if (NULL != local_iov_array){
        free(local_iov_array);
        local_iov_array=NULL;
    }

    if (NULL != displs) {
        free (displs);
        displs = NULL;
    }
    if (my_aggregator == fh->f_rank) {

        if (NULL != sorted_file_offsets){
            free(sorted_file_offsets);
            sorted_file_offsets = NULL;
        }
        if (NULL != file_offsets_for_agg){
            free(file_offsets_for_agg);
            file_offsets_for_agg = NULL;
        }
        if (NULL != memory_displacements){
            free(memory_displacements);
            memory_displacements= NULL;
        }
        if (NULL != sendtype){
            for (i = 0; i < fh->f_procs_per_group; i++) {
                if ( MPI_DATATYPE_NULL != sendtype[i] ) {
                    ompi_datatype_destroy(&sendtype[i]);
                }
            }
            free(sendtype);
            sendtype=NULL;
        }

        if (NULL != disp_index){
            free(disp_index);
            disp_index = NULL;
        }

        if ( NULL != blocklen_per_process){
            for(l=0;l<fh->f_procs_per_group;l++){
                if (NULL != blocklen_per_process[l]){
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
            }

            free(blocklen_per_process);
            blocklen_per_process = NULL;
        }

        if (NULL != displs_per_process){
            for (l=0; i<fh->f_procs_per_group; l++){
                if (NULL != displs_per_process[l]){
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
            }
            free(displs_per_process);
            displs_per_process = NULL;
        }
        if ( NULL != send_req ) {
            free ( send_req );
            send_req = NULL;
        }
    }
    return ret;
}
示例#5
0
int mca_io_ompio_set_view_internal(mca_io_ompio_file_t *fh,
				   OMPI_MPI_OFFSET_TYPE disp,
				   ompi_datatype_t *etype,
				   ompi_datatype_t *filetype,
				   char *datarep,
				   ompi_info_t *info)
{

    size_t max_data = 0;
    int i;
    int num_groups = 0;
    contg *contg_groups;

    size_t ftype_size;
    OPAL_PTRDIFF_TYPE ftype_extent, lb, ub;
    ompi_datatype_t *newfiletype;

    if ( NULL != fh->f_etype ) {
        ompi_datatype_destroy (&fh->f_etype);
    }
    if ( NULL != fh->f_filetype ) {
        ompi_datatype_destroy (&fh->f_filetype);
    }
    if ( NULL != fh->f_orig_filetype ) {
        ompi_datatype_destroy (&fh->f_orig_filetype);
    }
    if (NULL != fh->f_decoded_iov) {
        free (fh->f_decoded_iov);
        fh->f_decoded_iov = NULL;
    }

    if (NULL != fh->f_datarep) {
        free (fh->f_datarep);
        fh->f_datarep = NULL;
    }

    /* Reset the flags first */
    fh->f_flags = 0;

    fh->f_flags |= OMPIO_FILE_VIEW_IS_SET;
    fh->f_datarep = strdup (datarep);
    ompi_datatype_duplicate (filetype, &fh->f_orig_filetype );

    opal_datatype_get_extent(&filetype->super, &lb, &ftype_extent);
    opal_datatype_type_size (&filetype->super, &ftype_size);

    if ( etype == filetype                             &&
	 ompi_datatype_is_predefined (filetype )       &&
	 ftype_extent == (OPAL_PTRDIFF_TYPE)ftype_size ){
	ompi_datatype_create_contiguous(MCA_IO_DEFAULT_FILE_VIEW_SIZE,
					&ompi_mpi_byte.dt,
					&newfiletype);
	ompi_datatype_commit (&newfiletype);
    }
    else {
        newfiletype = filetype;
    }



    fh->f_iov_count   = 0;
    fh->f_disp        = disp;
    fh->f_offset      = disp;
    fh->f_total_bytes = 0;

    ompi_io_ompio_decode_datatype (fh,
                                   newfiletype,
                                   1,
                                   NULL,
                                   &max_data,
                                   &fh->f_decoded_iov,
                                   &fh->f_iov_count);

    opal_datatype_get_extent(&newfiletype->super, &lb, &fh->f_view_extent);
    opal_datatype_type_ub   (&newfiletype->super, &ub);
    opal_datatype_type_size (&etype->super, &fh->f_etype_size);
    opal_datatype_type_size (&newfiletype->super, &fh->f_view_size);
    ompi_datatype_duplicate (etype, &fh->f_etype);
    ompi_datatype_duplicate (newfiletype, &fh->f_filetype);

    fh->f_cc_size = get_contiguous_chunk_size (fh);

    if (opal_datatype_is_contiguous_memory_layout(&etype->super,1)) {
        if (opal_datatype_is_contiguous_memory_layout(&filetype->super,1) &&
	    fh->f_view_extent == (OPAL_PTRDIFF_TYPE)fh->f_view_size ) {
            fh->f_flags |= OMPIO_CONTIGUOUS_FVIEW;
        }
    }

    contg_groups = (contg*) calloc ( 1, fh->f_size * sizeof(contg));
    if (NULL == contg_groups) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    for( i = 0; i < fh->f_size; i++){
       contg_groups[i].procs_in_contg_group = (int*)calloc (1,fh->f_size * sizeof(int));
       if(NULL == contg_groups[i].procs_in_contg_group){
          int j;
          opal_output (1, "OUT OF MEMORY\n");
          for(j=0; j<i; j++) {
              free(contg_groups[j].procs_in_contg_group);
          }
          free(contg_groups);
          return OMPI_ERR_OUT_OF_RESOURCE;
       }
    }

    if( OMPI_SUCCESS != mca_io_ompio_fview_based_grouping(fh,
                                                          &num_groups,
                                                          contg_groups)){
       opal_output(1, "mca_io_ompio_fview_based_grouping() failed\n");
       free(contg_groups);
       return OMPI_ERROR;
    }
    if( !( (fh->f_comm->c_flags & OMPI_COMM_CART) &&
           (num_groups == 1 || num_groups == fh->f_size)) ) {
          mca_io_ompio_finalize_initial_grouping(fh,
                                                 num_groups,
                                                 contg_groups);
    }
    for( i = 0; i < fh->f_size; i++){
       free(contg_groups[i].procs_in_contg_group);
    }
    free(contg_groups);

    if ( etype == filetype                              &&
	 ompi_datatype_is_predefined (filetype )        &&
	 ftype_extent == (OPAL_PTRDIFF_TYPE)ftype_size ){
	ompi_datatype_destroy ( &newfiletype );
    }


    if (OMPI_SUCCESS != mca_fcoll_base_file_select (fh, NULL)) {
        opal_output(1, "mca_fcoll_base_file_select() failed\n");
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
示例#6
0
static int local_copy_with_convertor( opal_datatype_t const * const pdt, int count, int chunk )
{
    OPAL_PTRDIFF_TYPE lb, extent;
    void *pdst = NULL, *psrc = NULL, *ptemp = NULL;
    char *odst, *osrc;
    opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL;
    struct iovec iov;
    uint32_t iov_count;
    size_t max_data, length = 0, malloced_size;
    int32_t done1 = 0, done2 = 0, errors = 0;
    TIMER_DATA_TYPE start, end, unpack_start, unpack_end;
    long total_time, unpack_time = 0;

    malloced_size = compute_memory_size(pdt, count);
    opal_datatype_get_extent( pdt, &lb, &extent );

    odst = (char*)malloc( malloced_size );
    osrc = (char*)malloc( malloced_size );
    ptemp = malloc( chunk );

    {
        for( size_t i = 0; i < malloced_size; osrc[i] = i % 128 + 32, i++ );
        memcpy(odst, osrc, malloced_size);
    }
    pdst  = odst - lb;
    psrc  = osrc - lb;

    send_convertor = opal_convertor_create( remote_arch, 0 );
    if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, pdt, count, psrc ) ) {
        printf( "Unable to create the send convertor. Is the datatype committed ?\n" );
        goto clean_and_return;
    }

    recv_convertor = opal_convertor_create( remote_arch, 0 );
    if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, pdt, count, pdst ) ) {
        printf( "Unable to create the recv convertor. Is the datatype committed ?\n" );
        goto clean_and_return;
    }

    cache_trash();  /* make sure the cache is useless */

    GET_TIME( start );
    while( (done1 & done2) != 1 ) {
        /* They are supposed to finish in exactly the same time. */
        if( done1 | done2 ) {
            printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor\n",
                    (done1 ? "finish" : "not finish"),
                    (done2 ? "finish" : "not finish") );
        }

        max_data = chunk;
        iov_count = 1;
        iov.iov_base = ptemp;
        iov.iov_len = chunk;

        if( done1 == 0 ) {
            done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data );
        }

        if( done2 == 0 ) {
            GET_TIME( unpack_start );
            done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data );
            GET_TIME( unpack_end );
            unpack_time += ELAPSED_TIME( unpack_start, unpack_end );
        }

        length += max_data;
        if( outputFlags & RESET_CONVERTORS ) {
            struct dt_stack_t stack[1+send_convertor->stack_pos];
            int i, stack_pos = send_convertor->stack_pos;
            size_t pos;

            if( 0 == done1 ) {
                memcpy(stack, send_convertor->pStack, (1+send_convertor->stack_pos) * sizeof(struct dt_stack_t));
                pos = 0;
                opal_convertor_set_position(send_convertor, &pos);
                pos = length;
                opal_convertor_set_position(send_convertor, &pos);
                assert(pos == length);
                for(i = 0; i <= stack_pos; i++ ) {
                    if( stack[i].index != send_convertor->pStack[i].index )
                        {errors = 1; printf("send stack[%d].index differs (orig %d != new %d) (completed %lu/%lu)\n",
                                            i, stack[i].index, send_convertor->pStack[i].index,
                                            length, pdt->size * count);}
                    if( stack[i].count != send_convertor->pStack[i].count ) {
                        if( stack[i].type == send_convertor->pStack[i].type ) {
                            {errors = 1; printf("send stack[%d].count differs (orig %lu != new %lu) (completed %lu/%lu)\n",
                                                    i, stack[i].count, send_convertor->pStack[i].count,
                                                    length, pdt->size * count);}
                        } else {
                            if( (OPAL_DATATYPE_MAX_PREDEFINED <= stack[i].type) || (OPAL_DATATYPE_MAX_PREDEFINED <= send_convertor->pStack[i].type) )
                                {errors = 1; printf("send stack[%d].type wrong (orig %d != new %d) (completed %lu/%lu)\n",
                                                    i, (int)stack[i].type, (int)send_convertor->pStack[i].type,
                                                    length, pdt->size * count);}
                            else if( (stack[i].count * opal_datatype_basicDatatypes[stack[i].type]->size) !=
                                     (send_convertor->pStack[i].count * opal_datatype_basicDatatypes[send_convertor->pStack[i].type]->size) )
                                {errors = 1; printf("send stack[%d].type*count differs (orig (%d,%lu) != new (%d, %lu)) (completed %lu/%lu)\n",
                                                    i, (int)stack[i].type, stack[i].count,
                                                    (int)send_convertor->pStack[i].type, send_convertor->pStack[i].count,
                                                    length, pdt->size * count);}
                        }
                    }
                    if( stack[i].disp != send_convertor->pStack[i].disp )
                        {errors = 1; printf("send stack[%d].disp differs (orig %p != new %p) (completed %lu/%lu)\n",
                                            i, (void*)stack[i].disp, (void*)send_convertor->pStack[i].disp,
                                            length, pdt->size * count);}
                    if(0 != errors) {assert(0); exit(-1);}
                }
            }
            if( 0 == done2 ) {
                memcpy(stack, recv_convertor->pStack, (1+recv_convertor->stack_pos) * sizeof(struct dt_stack_t));
                pos = 0;
                opal_convertor_set_position(recv_convertor, &pos);
                pos = length;
                opal_convertor_set_position(recv_convertor, &pos);
                assert(pos == length);
                for(i = 0; i <= stack_pos; i++ ) {
                    if( stack[i].index != recv_convertor->pStack[i].index )
                        {errors = 1; printf("recv stack[%d].index differs (orig %d != new %d) (completed %lu/%lu)\n",
                                            i, stack[i].index, recv_convertor->pStack[i].index,
                                            length, pdt->size * count);}
                    if( stack[i].count != recv_convertor->pStack[i].count ) {
                        if( stack[i].type == recv_convertor->pStack[i].type ) {
                            {errors = 1; printf("recv stack[%d].count differs (orig %lu != new %lu) (completed %lu/%lu)\n",
                                                    i, stack[i].count, recv_convertor->pStack[i].count,
                                                    length, pdt->size * count);}
                        } else {
                            if( (OPAL_DATATYPE_MAX_PREDEFINED <= stack[i].type) || (OPAL_DATATYPE_MAX_PREDEFINED <= recv_convertor->pStack[i].type) )
                                {errors = 1; printf("recv stack[%d].type wrong (orig %d != new %d) (completed %lu/%lu)\n",
                                                    i, (int)stack[i].type, (int)recv_convertor->pStack[i].type,
                                                    length, pdt->size * count);}
                            else if( (stack[i].count * opal_datatype_basicDatatypes[stack[i].type]->size) !=
                                     (recv_convertor->pStack[i].count * opal_datatype_basicDatatypes[recv_convertor->pStack[i].type]->size) )
                                {errors = 1; printf("recv stack[%d].type*count differs (orig (%d,%lu) != new (%d, %lu)) (completed %lu/%lu)\n",
                                                    i, (int)stack[i].type, stack[i].count,
                                                    (int)recv_convertor->pStack[i].type, recv_convertor->pStack[i].count,
                                                    length, pdt->size * count);}
                        }
                    }
                    if( stack[i].disp != recv_convertor->pStack[i].disp )
                        {errors = 1; printf("recv stack[%d].disp differs (orig %p != new %p) (completed %lu/%lu)\n",
                                            i, (void*)stack[i].disp, (void*)recv_convertor->pStack[i].disp,
                                            length, pdt->size * count);}
                    if(0 != errors) {assert(0); exit(-1);}
                }
            }
        }
    }
    GET_TIME( end );
    total_time = ELAPSED_TIME( start, end );
    printf( "copying same data-type using convertors in %ld microsec\n", total_time );
    printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time,
            total_time - unpack_time );

    if(outputFlags & VALIDATE_DATA) {
        for( size_t i = errors = 0; i < malloced_size; i++ ) {
            if( odst[i] != osrc[i] ) {
                printf("error at position %lu (%d != %d)\n",
                       (unsigned long)i, (int)(odst[i]), (int)(osrc[i]));
                errors++;
                if(outputFlags & QUIT_ON_FIRST_ERROR) {
                    opal_datatype_dump(pdt);
                    assert(0); exit(-1);
                }
            }
        }
        if( 0 == errors ) {
            printf("Validation check succesfully passed\n");
        } else {
            printf("Found %d errors. Giving up!\n", errors);
            exit(-1);
        }
    }
 clean_and_return:
    if( NULL != send_convertor ) OBJ_RELEASE( send_convertor );
    if( NULL != recv_convertor ) OBJ_RELEASE( recv_convertor );

    if( NULL != odst ) free( odst );
    if( NULL != osrc ) free( osrc );
    if( NULL != ptemp ) free( ptemp );
    return (0 == errors ? OPAL_SUCCESS : errors);
}
示例#7
0
static int
local_copy_with_convertor_2datatypes( opal_datatype_t const * const send_type, int send_count,
                                      opal_datatype_t const * const recv_type, int recv_count,
                                      int chunk )
{
    OPAL_PTRDIFF_TYPE send_lb, send_extent, recv_lb, recv_extent;
    void *pdst = NULL, *psrc = NULL, *ptemp = NULL;
    char *odst, *osrc;
    opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL;
    struct iovec iov;
    uint32_t iov_count;
    size_t max_data, length = 0, send_malloced_size, recv_malloced_size;;
    int32_t done1 = 0, done2 = 0;
    TIMER_DATA_TYPE start, end, unpack_start, unpack_end;
    long total_time, unpack_time = 0;

    send_malloced_size = compute_memory_size(send_type, send_count);
    recv_malloced_size = compute_memory_size(recv_type, recv_count);

    opal_datatype_get_extent( send_type, &send_lb, &send_extent );
    opal_datatype_get_extent( recv_type, &recv_lb, &recv_extent );

    odst = (char*)malloc( recv_malloced_size );
    osrc = (char*)malloc( send_malloced_size );
    ptemp = malloc( chunk );

    /* fill up the receiver with ZEROS */
    {
        for( size_t i = 0; i < send_malloced_size; i++ )
            osrc[i] = i % 128 + 32;
    }
    memset( odst, 0, recv_malloced_size );
    pdst  = odst - recv_lb;
    psrc  = osrc - send_lb;

    send_convertor = opal_convertor_create( remote_arch, 0 );
    if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, send_type, send_count, psrc ) ) {
        printf( "Unable to create the send convertor. Is the datatype committed ?\n" );
        goto clean_and_return;
    }
    recv_convertor = opal_convertor_create( remote_arch, 0 );
    if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, recv_type, recv_count, pdst ) ) {
        printf( "Unable to create the recv convertor. Is the datatype committed ?\n" );
        goto clean_and_return;
    }

    cache_trash();  /* make sure the cache is useless */

    GET_TIME( start );
    while( (done1 & done2) != 1 ) {
        /* They are supposed to finish in exactly the same time. */
        if( done1 | done2 ) {
            printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor_2datatypes\n",
                    (done1 ? "finish" : "not finish"),
                    (done2 ? "finish" : "not finish") );
        }

        max_data = chunk;
        iov_count = 1;
        iov.iov_base = ptemp;
        iov.iov_len = chunk;

        if( done1 == 0 ) {
            done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data );
        }

        if( done2 == 0 ) {
            GET_TIME( unpack_start );
            done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data );
            GET_TIME( unpack_end );
            unpack_time += ELAPSED_TIME( unpack_start, unpack_end );
        }

        length += max_data;

        if( outputFlags & RESET_CONVERTORS ) {
            size_t pos = 0;
            opal_convertor_set_position(send_convertor, &pos);
            pos = length;
            opal_convertor_set_position(send_convertor, &pos);
            assert(pos == length);

            pos = 0;
            opal_convertor_set_position(recv_convertor, &pos);
            pos = length;
            opal_convertor_set_position(recv_convertor, &pos);
            assert(pos == length);
        }
    }
    GET_TIME( end );
    total_time = ELAPSED_TIME( start, end );
    printf( "copying different data-types using convertors in %ld microsec\n", total_time );
    printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time,
            total_time - unpack_time );
 clean_and_return:
    if( send_convertor != NULL ) {
        OBJ_RELEASE( send_convertor ); assert( send_convertor == NULL );
    }
    if( recv_convertor != NULL ) {
        OBJ_RELEASE( recv_convertor ); assert( recv_convertor == NULL );
    }
    if( NULL != odst ) free( odst );
    if( NULL != osrc ) free( osrc );
    if( NULL != ptemp ) free( ptemp );
    return OPAL_SUCCESS;
}
示例#8
0
int mca_io_ompio_set_view_internal(mca_io_ompio_file_t *fh,
                                   OMPI_MPI_OFFSET_TYPE disp,
                                   ompi_datatype_t *etype,
                                   ompi_datatype_t *filetype,
                                   char *datarep,
                                   ompi_info_t *info)
{

    size_t max_data = 0;
    int i;
    int num_groups = 0;
    contg *contg_groups;



    MPI_Aint lb,ub;

    fh->f_iov_count   = 0;
    fh->f_disp        = disp;
    fh->f_offset      = disp;
    fh->f_total_bytes = 0;

    ompi_io_ompio_decode_datatype (fh,
                                   filetype,
                                   1,
                                   NULL,
                                   &max_data,
                                   &fh->f_decoded_iov,
                                   &fh->f_iov_count);

    opal_datatype_get_extent(&filetype->super, &lb, &fh->f_view_extent);
    opal_datatype_type_ub   (&filetype->super, &ub);
    opal_datatype_type_size (&etype->super, &fh->f_etype_size);
    opal_datatype_type_size (&filetype->super, &fh->f_view_size);
    ompi_datatype_duplicate (etype, &fh->f_etype);
    ompi_datatype_duplicate (filetype, &fh->f_filetype);

    fh->f_cc_size = get_contiguous_chunk_size (fh);

    if (opal_datatype_is_contiguous_memory_layout(&etype->super,1)) {
        if (opal_datatype_is_contiguous_memory_layout(&filetype->super,1) &&
                fh->f_view_extent == (OPAL_PTRDIFF_TYPE)fh->f_view_size ) {
            fh->f_flags |= OMPIO_CONTIGUOUS_FVIEW;
        }
    }


    contg_groups = (contg*) calloc ( 1, fh->f_size * sizeof(contg));
    if (NULL == contg_groups) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    for( i = 0; i < fh->f_size; i++) {
        contg_groups[i].procs_in_contg_group = (int*)calloc (1,fh->f_size * sizeof(int));
        if(NULL == contg_groups[i].procs_in_contg_group) {
            int j;
            opal_output (1, "OUT OF MEMORY\n");
            for(j=0; j<i; j++) {
                free(contg_groups[j].procs_in_contg_group);
            }
            free(contg_groups);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    }
    if( OMPI_SUCCESS != mca_io_ompio_fview_based_grouping(fh,
            &num_groups,
            contg_groups)) {
        opal_output(1, "mca_io_ompio_fview_based_grouping() failed\n");
        free(contg_groups);
        return OMPI_ERROR;
    }
    if( !( (fh->f_comm->c_flags & OMPI_COMM_CART) &&
            (num_groups == 1 || num_groups == fh->f_size)) ) {
        mca_io_ompio_finalize_initial_grouping(fh,
                                               num_groups,
                                               contg_groups);
    }
    for( i = 0; i < fh->f_size; i++) {
        free(contg_groups[i].procs_in_contg_group);
    }
    free(contg_groups);


    return OMPI_SUCCESS;
}
示例#9
0
int mca_io_ompio_file_set_view (ompi_file_t *fp,
                                OMPI_MPI_OFFSET_TYPE disp,
                                ompi_datatype_t *etype,
                                ompi_datatype_t *filetype,
                                char *datarep,
                                ompi_info_t *info)
{
    mca_io_ompio_data_t *data;
    mca_io_ompio_file_t *fh;
    size_t ftype_size;
    OPAL_PTRDIFF_TYPE ftype_extent, lb;

    data = (mca_io_ompio_data_t *) fp->f_io_selected_data;
    fh = &data->ompio_fh;

    ompi_datatype_destroy (&fh->f_etype);
    ompi_datatype_destroy (&fh->f_filetype);
    ompi_datatype_destroy (&fh->f_orig_filetype);

    if (NULL != fh->f_decoded_iov) {
        free (fh->f_decoded_iov);
        fh->f_decoded_iov = NULL;
    }

    if (NULL != fh->f_datarep) {
        free (fh->f_datarep);
        fh->f_datarep = NULL;
    }

    /* Reset the flags first */
    fh->f_flags = 0;

    fh->f_flags |= OMPIO_FILE_VIEW_IS_SET;
    fh->f_datarep = strdup (datarep);
    ompi_datatype_duplicate (filetype, &fh->f_orig_filetype );

    opal_datatype_get_extent(&filetype->super, &lb, &ftype_extent);
    opal_datatype_type_size (&filetype->super, &ftype_size);

    if ( etype == filetype &&
            ompi_datatype_is_predefined (filetype ) &&
            ftype_extent == (OPAL_PTRDIFF_TYPE)ftype_size ) {
        ompi_datatype_t *newfiletype;
        ompi_datatype_create_contiguous(MCA_IO_DEFAULT_FILE_VIEW_SIZE,
                                        &ompi_mpi_byte.dt,
                                        &newfiletype);
        ompi_datatype_commit (&newfiletype);
        mca_io_ompio_set_view_internal (fh,
                                        disp,
                                        etype,
                                        newfiletype,
                                        datarep,
                                        info);
        ompi_datatype_destroy ( &newfiletype );
    }
    else {
        mca_io_ompio_set_view_internal (fh,
                                        disp,
                                        etype,
                                        filetype,
                                        datarep,
                                        info);
    }

    if (OMPI_SUCCESS != mca_fcoll_base_file_select (&data->ompio_fh,
            NULL)) {
        opal_output(1, "mca_fcoll_base_file_select() failed\n");
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
int
mca_fcoll_dynamic_file_write_all (ompio_file_t *fh,
                                  const void *buf,
                                  int count,
                                  struct ompi_datatype_t *datatype,
                                  ompi_status_public_t *status)
{
    MPI_Aint total_bytes_written = 0;  /* total bytes that have been written*/
    MPI_Aint total_bytes = 0;          /* total bytes to be written */
    MPI_Aint bytes_to_write_in_cycle = 0; /* left to be written in a cycle*/
    MPI_Aint bytes_per_cycle = 0;      /* total written in each cycle by each process*/
    int index = 0;
    int cycles = 0;
    int i=0, j=0, l=0;
    int n=0; /* current position in total_bytes_per_process array */
    MPI_Aint bytes_remaining = 0; /* how many bytes have been written from the current
                                     value from total_bytes_per_process */
    int bytes_sent = 0, ret =0;
    int blocks=0, entries_per_aggregator=0;

    /* iovec structure and count of the buffer passed in */
    uint32_t iov_count = 0;
    struct iovec *decoded_iov = NULL;
    int iov_index = 0;
    char *send_buf = NULL;
    size_t current_position = 0;
    struct iovec *local_iov_array=NULL, *global_iov_array=NULL;
    mca_io_ompio_local_io_array *file_offsets_for_agg=NULL;
    /* global iovec at the writers that contain the iovecs created from
       file_set_view */
    uint32_t total_fview_count = 0;
    int local_count = 0, temp_pindex;
    int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL;
    int current_index = 0, temp_index=0;

    char *global_buf = NULL;
    MPI_Aint global_count = 0;


    /* array that contains the sorted indices of the global_iov */
    int *sorted = NULL, *sorted_file_offsets=NULL;
    int *displs = NULL;
    int dynamic_num_io_procs;
    size_t max_data = 0, datatype_size = 0;
    int **blocklen_per_process=NULL;
    MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL;
    ompi_datatype_t **recvtype = NULL;
    MPI_Aint *total_bytes_per_process = NULL;
    MPI_Request send_req=NULL, *recv_req=NULL;
    int my_aggregator=-1;
    bool sendbuf_is_contiguous = false;
    size_t ftype_size;
    ptrdiff_t ftype_extent, lb;


#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0;
    double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0;
    double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0;
    mca_common_ompio_print_entry nentry;
#endif

    opal_datatype_type_size ( &datatype->super, &ftype_size );
    opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent );

    /**************************************************************************
     ** 1.  In case the data is not contigous in memory, decode it into an iovec
     **************************************************************************/
    if ( ( ftype_extent == (ptrdiff_t) ftype_size)             &&
         opal_datatype_is_contiguous_memory_layout(&datatype->super,1) &&
         0 == lb ) {
        sendbuf_is_contiguous = true;
    }



    if (! sendbuf_is_contiguous ) {
        ret =   mca_common_ompio_decode_datatype ((struct ompio_file_t *) fh,
                                                  datatype,
                                                  count,
                                                  buf,
                                                  &max_data,
                                                  &decoded_iov,
                                                  &iov_count);
        if (OMPI_SUCCESS != ret ){
            goto exit;
        }
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
	status->_ucount = max_data;
    }

    dynamic_num_io_procs = fh->f_get_mca_parameter_value ( "num_aggregators", strlen ("num_aggregators"));
    if ( OMPI_ERR_MAX == dynamic_num_io_procs ) {
        ret = OMPI_ERROR;
        goto exit;
    }
    ret = mca_common_ompio_set_aggregator_props ((struct ompio_file_t *) fh,
				                 dynamic_num_io_procs,
				                 max_data);

    if (OMPI_SUCCESS != ret){
	goto exit;
    }
    my_aggregator = fh->f_procs_in_group[0];
    /**************************************************************************
     ** 2. Determine the total amount of data to be written
     **************************************************************************/
    total_bytes_per_process = (MPI_Aint*)malloc
        (fh->f_procs_per_group*sizeof(MPI_Aint));
    if (NULL == total_bytes_per_process) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_comm_time = MPI_Wtime();
#endif
    ret = ompi_fcoll_base_coll_allgather_array (&max_data,
                                           1,
                                           MPI_LONG,
                                           total_bytes_per_process,
                                           1,
                                           MPI_LONG,
                                           0,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);
    
    if( OMPI_SUCCESS != ret){
	goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_comm_time = MPI_Wtime();
    comm_time += (end_comm_time - start_comm_time);
#endif

    for (i=0 ; i<fh->f_procs_per_group ; i++) {
        total_bytes += total_bytes_per_process[i];
    }

    if (NULL != total_bytes_per_process) {
        free (total_bytes_per_process);
        total_bytes_per_process = NULL;
    }

    /*********************************************************************
     *** 3. Generate the local offsets/lengths array corresponding to
     ***    this write operation
     ********************************************************************/
    ret = fh->f_generate_current_file_view( (struct ompio_file_t *) fh,
					    max_data,
					    &local_iov_array,
					    &local_count);
    if (ret != OMPI_SUCCESS){
	goto exit;
    }

#if DEBUG_ON
    for (i=0 ; i<local_count ; i++) {

        printf("%d: OFFSET: %d   LENGTH: %ld\n",
               fh->f_rank,
               local_iov_array[i].iov_base,
               local_iov_array[i].iov_len);

    }
#endif

    /*************************************************************
     *** 4. Allgather the offset/lengths array from all processes
     *************************************************************/
    fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == fview_count) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_comm_time = MPI_Wtime();
#endif
    ret = ompi_fcoll_base_coll_allgather_array (&local_count,
                                           1,
                                           MPI_INT,
                                           fview_count,
                                           1,
                                           MPI_INT,
                                           0,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);
    
    if( OMPI_SUCCESS != ret){
	goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_comm_time = MPI_Wtime();
    comm_time += (end_comm_time - start_comm_time);
#endif

    displs = (int*) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == displs) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }

    displs[0] = 0;
    total_fview_count = fview_count[0];
    for (i=1 ; i<fh->f_procs_per_group ; i++) {
        total_fview_count += fview_count[i];
        displs[i] = displs[i-1] + fview_count[i-1];
    }

#if DEBUG_ON
    printf("total_fview_count : %d\n", total_fview_count);
    if (my_aggregator == fh->f_rank) {
        for (i=0 ; i<fh->f_procs_per_group ; i++) {
            printf ("%d: PROCESS: %d  ELEMENTS: %d  DISPLS: %d\n",
                    fh->f_rank,
                    i,
                    fview_count[i],
                    displs[i]);
        }
    }
#endif

    /* allocate the global iovec  */

    if (0 != total_fview_count) {
        global_iov_array = (struct iovec*) malloc (total_fview_count *
                                                   sizeof(struct iovec));
        if (NULL == global_iov_array){
            opal_output(1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_comm_time = MPI_Wtime();
#endif
    ret = ompi_fcoll_base_coll_allgatherv_array (local_iov_array,
                                            local_count,
                                            fh->f_iov_type,
                                            global_iov_array,
                                            fview_count,
                                            displs,
                                            fh->f_iov_type,
                                            0,
                                            fh->f_procs_in_group,
                                            fh->f_procs_per_group,
                                            fh->f_comm);
    if (OMPI_SUCCESS != ret){
	goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_comm_time = MPI_Wtime();
    comm_time += (end_comm_time - start_comm_time);
#endif

    /****************************************************************************************
    *** 5. Sort the global offset/lengths list based on the offsets.
    *** The result of the sort operation is the 'sorted', an integer array,
    *** which contains the indexes of the global_iov_array based on the offset.
    *** For example, if global_iov_array[x].offset is followed by global_iov_array[y].offset
    *** in the file, and that one is followed by global_iov_array[z].offset, than
    *** sorted[0] = x, sorted[1]=y and sorted[2]=z;
    ******************************************************************************************/
    if (0 != total_fview_count) {
        sorted = (int *)malloc (total_fview_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }
	ompi_fcoll_base_sort_iovec (global_iov_array, total_fview_count, sorted);
    }

    if (NULL != local_iov_array){
	free(local_iov_array);
	local_iov_array = NULL;
    }

    if (NULL != displs){
	free(displs);
	displs=NULL;
    }


#if DEBUG_ON
    if (my_aggregator == fh->f_rank) {
        uint32_t tv=0;
        for (tv=0 ; tv<total_fview_count ; tv++) {
            printf("%d: OFFSET: %lld   LENGTH: %ld\n",
                   fh->f_rank,
                   global_iov_array[sorted[tv]].iov_base,
                   global_iov_array[sorted[tv]].iov_len);
        }
    }
#endif
    /*************************************************************
     *** 6. Determine the number of cycles required to execute this
     ***    operation
     *************************************************************/
    bytes_per_cycle = fh->f_bytes_per_agg;
    cycles = ceil((double)total_bytes/bytes_per_cycle);

    if (my_aggregator == fh->f_rank) {
        disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	blocklen_per_process = (int **)calloc (fh->f_procs_per_group, sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	displs_per_process = (MPI_Aint **)calloc (fh->f_procs_per_group, sizeof (MPI_Aint*));
	if (NULL == displs_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	recv_req = (MPI_Request *)malloc ((fh->f_procs_per_group)*sizeof(MPI_Request));
	if ( NULL == recv_req ) {
	    opal_output (1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	global_buf  = (char *) malloc (bytes_per_cycle);
	if (NULL == global_buf){
	    opal_output(1, "OUT OF MEMORY");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	recvtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group  * sizeof(ompi_datatype_t *));
	if (NULL == recvtype) {
	    opal_output (1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}
	for(l=0;l<fh->f_procs_per_group;l++){
            recvtype[l] = MPI_DATATYPE_NULL;
	}
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_exch = MPI_Wtime();
#endif
    n = 0;
    bytes_remaining = 0;
    current_index = 0;

    for (index = 0; index < cycles; index++) {
        /**********************************************************************
         ***  7a. Getting ready for next cycle: initializing and freeing buffers
	 **********************************************************************/
        if (my_aggregator == fh->f_rank) {
            if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
	    fh->f_num_of_io_entries = 0;

            if (NULL != recvtype){
                for (i =0; i< fh->f_procs_per_group; i++) {
                    if ( MPI_DATATYPE_NULL != recvtype[i] ) {
                        ompi_datatype_destroy(&recvtype[i]);
			recvtype[i] = MPI_DATATYPE_NULL;
                    }
                }
            }

            for(l=0;l<fh->f_procs_per_group;l++){
                disp_index[l] =  1;

                free(blocklen_per_process[l]);
                free(displs_per_process[l]);

                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l] || NULL == blocklen_per_process[l]){
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }

            if (NULL != sorted_file_offsets){
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }

            if(NULL != file_offsets_for_agg){
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }

            if (NULL != memory_displacements){
                free(memory_displacements);
                memory_displacements = NULL;
            }

        } /* (my_aggregator == fh->f_rank */

        /**************************************************************************
         ***  7b. Determine the number of bytes to be actually written in this cycle
	 **************************************************************************/
        if (cycles-1 == index) {
            bytes_to_write_in_cycle = total_bytes - bytes_per_cycle*index;
        }
        else {
            bytes_to_write_in_cycle = bytes_per_cycle;
        }

#if DEBUG_ON
        if (my_aggregator == fh->f_rank) {
            printf ("****%d: CYCLE %d   Bytes %lld**********\n",
                    fh->f_rank,
                    index,
                    bytes_to_write_in_cycle);
        }
#endif
        /**********************************************************
         **Gather the Data from all the processes at the writers **
         *********************************************************/

#if DEBUG_ON
        printf("bytes_to_write_in_cycle: %ld, cycle : %d\n", bytes_to_write_in_cycle,
	       index);
#endif

        /*****************************************************************
         *** 7c. Calculate how much data will be contributed in this cycle
	 ***     by each process
         *****************************************************************/
        bytes_sent = 0;

        /* The blocklen and displs calculation only done at aggregators!*/
        while (bytes_to_write_in_cycle) {

	    /* This next block identifies which process is the holder
	    ** of the sorted[current_index] element;
	    */
            blocks = fview_count[0];
            for (j=0 ; j<fh->f_procs_per_group ; j++) {
                if (sorted[current_index] < blocks) {
                    n = j;
                    break;
                }
                else {
                    blocks += fview_count[j+1];
                }
            }

            if (bytes_remaining) {
                /* Finish up a partially used buffer from the previous  cycle */

                if (bytes_remaining <= bytes_to_write_in_cycle) {
                    /* The data fits completely into the block */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining;
                        displs_per_process[n][disp_index[n] - 1] =
                            (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len
                             - bytes_remaining);

                        /* In this cases the length is consumed so allocating for
                           next displacement and blocklength*/
                        blocklen_per_process[n] = (int *) realloc
                            ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
                        displs_per_process[n] = (MPI_Aint *) realloc
                            ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_sent += bytes_remaining;
                    }
                    current_index ++;
                    bytes_to_write_in_cycle -= bytes_remaining;
                    bytes_remaining = 0;
                    continue;
                }
                else {
                    /* the remaining data from the previous cycle is larger than the
		       bytes_to_write_in_cycle, so we have to segment again */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len
                             - bytes_remaining);
                    }

                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_sent += bytes_to_write_in_cycle;
                    }
                    bytes_remaining -= bytes_to_write_in_cycle;
                    bytes_to_write_in_cycle = 0;
                    break;
                }
            }
            else {
                 /* No partially used entry available, have to start a new one */
                if (bytes_to_write_in_cycle <
                    (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) {
                     /* This entry has more data than we can sendin one cycle */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base ;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_sent += bytes_to_write_in_cycle;

                    }
                    bytes_remaining = global_iov_array[sorted[current_index]].iov_len -
                        bytes_to_write_in_cycle;
                    bytes_to_write_in_cycle = 0;
                    break;
                }
                else {
                    /* Next data entry is less than bytes_to_write_in_cycle */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] =
                            global_iov_array[sorted[current_index]].iov_len;
                        displs_per_process[n][disp_index[n] - 1] = (ptrdiff_t)
                            global_iov_array[sorted[current_index]].iov_base;

                        /*realloc for next blocklength
                          and assign this displacement and check for next displs as
                          the total length of this entry has been consumed!*/
                        blocklen_per_process[n] =
                            (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
                        displs_per_process[n] = (MPI_Aint *)realloc
                            ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_sent += global_iov_array[sorted[current_index]].iov_len;
                    }
                    bytes_to_write_in_cycle -=
                        global_iov_array[sorted[current_index]].iov_len;
                    current_index ++;
                    continue;
                }
            }
        }


        /*************************************************************************
	 *** 7d. Calculate the displacement on where to put the data and allocate
         ***     the recieve buffer (global_buf)
	 *************************************************************************/
        if (my_aggregator == fh->f_rank) {
            entries_per_aggregator=0;
            for (i=0;i<fh->f_procs_per_group; i++){
                for (j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0)
                        entries_per_aggregator++ ;
                }
            }

#if DEBUG_ON
            printf("%d: cycle: %d, bytes_sent: %d\n ",fh->f_rank,index,
                   bytes_sent);
            printf("%d : Entries per aggregator : %d\n",fh->f_rank,entries_per_aggregator);
#endif

            if (entries_per_aggregator > 0){
                file_offsets_for_agg = (mca_io_ompio_local_io_array *)
                    malloc(entries_per_aggregator*sizeof(mca_io_ompio_local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }

                sorted_file_offsets = (int *)
                    malloc (entries_per_aggregator*sizeof(int));
                if (NULL == sorted_file_offsets){
                    opal_output (1, "OUT OF MEMORY\n");
                    ret =  OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }

                /*Moving file offsets to an IO array!*/
                temp_index = 0;

                for (i=0;i<fh->f_procs_per_group; i++){
                    for(j=0;j<disp_index[i];j++){
                        if (blocklen_per_process[i][j] > 0){
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;

#if DEBUG_ON
                            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                                   index+1,fh->f_rank);

                            printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
                                   fh->f_procs_in_group[i],j,
                                   blocklen_per_process[i][j],j,
                                   displs_per_process[i][j],
                                   fh->f_rank);
#endif
                        }
                    }
                }
            }
            else{
                continue;
            }
            /* Sort the displacements for each aggregator*/
            local_heap_sort (file_offsets_for_agg,
                             entries_per_aggregator,
                             sorted_file_offsets);

            /*create contiguous memory displacements
              based on blocklens on the same displs array
              and map it to this aggregator's actual
              file-displacements (this is in the io-array created above)*/
            memory_displacements = (MPI_Aint *) malloc
                (entries_per_aggregator * sizeof(MPI_Aint));

            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++){
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }

            temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            /*Now update the displacements array  with memory offsets*/
            global_count = 0;
            for (i=0;i<entries_per_aggregator;i++){
                temp_pindex =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_pindex] < disp_index[temp_pindex])
                    temp_disp_index[temp_pindex] += 1;
                else{
                    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
                           temp_pindex, temp_disp_index[temp_pindex],
                           temp_pindex, disp_index[temp_pindex]);
                }
                global_count +=
                    file_offsets_for_agg[sorted_file_offsets[i]].length;
            }

            if (NULL != temp_disp_index){
                free(temp_disp_index);
                temp_disp_index = NULL;
            }

#if DEBUG_ON

            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0;i<fh->f_procs_per_group; i++){
                for(j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0){
                        printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
                               fh->f_procs_in_group[i],j,
                               blocklen_per_process[i][j],j,
                               displs_per_process[i][j],
                               fh->f_rank);

                    }
                }
            }
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0; i<entries_per_aggregator;i++){
                printf("%d: OFFSET: %lld   LENGTH: %ld, Mem-offset: %ld\n",
                       file_offsets_for_agg[sorted_file_offsets[i]].process_id,
                       file_offsets_for_agg[sorted_file_offsets[i]].offset,
                       file_offsets_for_agg[sorted_file_offsets[i]].length,
                       memory_displacements[sorted_file_offsets[i]]);
            }
            printf("%d : global_count : %ld, bytes_sent : %d\n",
                   fh->f_rank,global_count, bytes_sent);
#endif
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_comm_time = MPI_Wtime();
#endif
        /*************************************************************************
	 *** 7e. Perform the actual communication
	 *************************************************************************/
            for (i=0;i<fh->f_procs_per_group; i++) {
                recv_req[i] = MPI_REQUEST_NULL;
                if ( 0 < disp_index[i] ) {
                    ompi_datatype_create_hindexed(disp_index[i],
                                                  blocklen_per_process[i],
                                                  displs_per_process[i],
                                                  MPI_BYTE,
                                                  &recvtype[i]);
                    ompi_datatype_commit(&recvtype[i]);
                    opal_datatype_type_size(&recvtype[i]->super, &datatype_size);

                    if (datatype_size){
                        ret = MCA_PML_CALL(irecv(global_buf,
                                                 1,
                                                 recvtype[i],
                                                 fh->f_procs_in_group[i],
                                                 123,
                                                 fh->f_comm,
                                                 &recv_req[i]));
                        if (OMPI_SUCCESS != ret){
                            goto exit;
                        }
                    }
                }
            }
        } /* end if (my_aggregator == fh->f_rank ) */


        if ( sendbuf_is_contiguous ) {
            send_buf = &((char*)buf)[total_bytes_written];
        }
        else if (bytes_sent) {
            /* allocate a send buffer and copy the data that needs
               to be sent into it in case the data is non-contigous
               in memory */
            ptrdiff_t mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;

            send_buf = malloc (bytes_sent);
            if (NULL == send_buf) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            remaining = bytes_sent;

            while (remaining) {
                mem_address = (ptrdiff_t)
                    (decoded_iov[iov_index].iov_base) + current_position;

                if (remaining >=
                    (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *)mem_address,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                        (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                        (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *) mem_address,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }
	}
	total_bytes_written += bytes_sent;

	/* Gather the sendbuf from each process in appropritate locations in
           aggregators*/

	if (bytes_sent){
            ret = MCA_PML_CALL(isend(send_buf,
                                     bytes_sent,
                                     MPI_BYTE,
                                     my_aggregator,
                                     123,
                                     MCA_PML_BASE_SEND_STANDARD,
                                     fh->f_comm,
                                     &send_req));


	    if ( OMPI_SUCCESS != ret ){
		goto exit;
	    }

	    ret = ompi_request_wait(&send_req, MPI_STATUS_IGNORE);
	    if (OMPI_SUCCESS != ret){
		goto exit;
	    }
	}

	if (my_aggregator == fh->f_rank) {
            ret = ompi_request_wait_all (fh->f_procs_per_group,
                                         recv_req,
                                         MPI_STATUS_IGNORE);

            if (OMPI_SUCCESS != ret){
                goto exit;
            }
	}

#if DEBUG_ON
	if (my_aggregator == fh->f_rank){
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0 ; i<global_count/4 ; i++)
                printf (" RECV %d \n",((int *)global_buf)[i]);
	}
#endif

        if (! sendbuf_is_contiguous) {
            if (NULL != send_buf) {
                free (send_buf);
                send_buf = NULL;
            }
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_comm_time = MPI_Wtime();
        comm_time += (end_comm_time - start_comm_time);
#endif
        /**********************************************************
         *** 7f. Create the io array, and pass it to fbtl
         *********************************************************/

	if (my_aggregator == fh->f_rank) {

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
	    start_write_time = MPI_Wtime();
#endif

            fh->f_io_array = (mca_common_ompio_io_array_t *) malloc
                (entries_per_aggregator * sizeof (mca_common_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            fh->f_num_of_io_entries = 0;
            /*First entry for every aggregator*/
            fh->f_io_array[0].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[0].length =
                file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[0].memory_address =
                global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;

            for (i=1;i<entries_per_aggregator;i++){
                /* If the enrties are contiguous merge them,
                   else make a new entry */
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                    file_offsets_for_agg[sorted_file_offsets[i]].offset){
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else {
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }

            }

#if DEBUG_ON
            printf("*************************** %d\n", fh->f_num_of_io_entries);
            for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
                printf(" ADDRESS: %p  OFFSET: %ld   LENGTH: %ld\n",
                       fh->f_io_array[i].memory_address,
                       (ptrdiff_t)fh->f_io_array[i].offset,
                       fh->f_io_array[i].length);
            }

#endif

            if (fh->f_num_of_io_entries) {
                if ( 0 >  fh->f_fbtl->fbtl_pwritev (fh)) {
                    opal_output (1, "WRITE FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_write_time = MPI_Wtime();
            write_time += end_write_time - start_write_time;
#endif


	} /* end if (my_aggregator == fh->f_rank) */
    } /* end  for (index = 0; index < cycles; index++) */

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_exch = MPI_Wtime();
    exch_write += end_exch - start_exch;
    nentry.time[0] = write_time;
    nentry.time[1] = comm_time;
    nentry.time[2] = exch_write;
    if (my_aggregator == fh->f_rank)
	nentry.aggregator = 1;
    else
	nentry.aggregator = 0;
    nentry.nprocs_for_coll = dynamic_num_io_procs;
    if (!mca_common_ompio_full_print_queue(fh->f_coll_write_time)){
        mca_common_ompio_register_print_entry(fh->f_coll_write_time,
                                              nentry);
    }
#endif


exit :
    if (my_aggregator == fh->f_rank) {
        if (NULL != sorted_file_offsets){
            free(sorted_file_offsets);
            sorted_file_offsets = NULL;
        }
        if(NULL != file_offsets_for_agg){
            free(file_offsets_for_agg);
            file_offsets_for_agg = NULL;
        }
        if (NULL != memory_displacements){
            free(memory_displacements);
            memory_displacements = NULL;
        }
        if (NULL != recvtype){
            for (i =0; i< fh->f_procs_per_group; i++) {
                if ( MPI_DATATYPE_NULL != recvtype[i] ) {
                    ompi_datatype_destroy(&recvtype[i]);
                }
            }
            free(recvtype);
            recvtype=NULL;
        }

        if (NULL != fh->f_io_array) {
            free (fh->f_io_array);
            fh->f_io_array = NULL;
        }
        if (NULL != disp_index){
            free(disp_index);
            disp_index = NULL;
        }
        if (NULL != recvtype){
            free(recvtype);
            recvtype=NULL;
        }
        if (NULL != recv_req){
            free(recv_req);
            recv_req = NULL;
        }
        if (NULL != global_buf) {
            free (global_buf);
            global_buf = NULL;
        }
        for(l=0;l<fh->f_procs_per_group;l++){
            if (NULL != blocklen_per_process){
                free(blocklen_per_process[l]);
            }
            if (NULL != displs_per_process){
                free(displs_per_process[l]);
            }
        }

        free(blocklen_per_process);
        free(displs_per_process);
    }

    if (NULL != displs){
	free(displs);
	displs=NULL;
    }

    if (! sendbuf_is_contiguous) {
	if (NULL != send_buf) {
	    free (send_buf);
	    send_buf = NULL;
	}
    }
    if (NULL != global_buf) {
        free (global_buf);
        global_buf = NULL;
    }
    if (NULL != sorted) {
        free (sorted);
        sorted = NULL;
    }
    if (NULL != global_iov_array) {
        free (global_iov_array);
	global_iov_array = NULL;
    }
    if (NULL != fview_count) {
        free (fview_count);
        fview_count = NULL;
    }
    if (NULL != decoded_iov) {
        free (decoded_iov);
        decoded_iov = NULL;
    }


    return OMPI_SUCCESS;
}
int
mca_fcoll_static_file_read_all (mca_io_ompio_file_t *fh,
				void *buf,
				int count,
				struct ompi_datatype_t *datatype,
				ompi_status_public_t *status)
{

    int ret = OMPI_SUCCESS, iov_size=0, *bytes_remaining=NULL;
    int i, j, l,cycles=0, local_cycles=0, *current_index=NULL;
    int index, *disp_index=NULL, *bytes_per_process=NULL, current_position=0;
    int **blocklen_per_process=NULL, *iovec_count_per_process=NULL;
    int *displs=NULL, *sorted=NULL ,entries_per_aggregator=0;
    int *sorted_file_offsets=NULL, temp_index=0, position=0, *temp_disp_index=NULL;


    MPI_Aint **displs_per_process=NULL, global_iov_count=0, global_count=0;
    MPI_Aint *memory_displacements=NULL;
    int bytes_to_read_in_cycle=0;
    size_t max_data=0, bytes_per_cycle=0;
    uint32_t iov_count=0, iov_index=0;
    struct iovec *decoded_iov=NULL, *iov=NULL;
    mca_fcoll_static_local_io_array *local_iov_array=NULL, *global_iov_array=NULL;
    mca_fcoll_static_local_io_array *file_offsets_for_agg=NULL;

    char *global_buf=NULL, *receive_buf=NULL;

    int blocklen[3] = {1, 1, 1};
    int static_num_io_procs=1;
    OPAL_PTRDIFF_TYPE d[3], base;
    ompi_datatype_t *types[3];
    ompi_datatype_t *io_array_type=MPI_DATATYPE_NULL;
    ompi_datatype_t **sendtype = NULL;
    MPI_Request *send_req=NULL, recv_req=NULL;
    int my_aggregator=-1;
    bool recvbuf_is_contiguous=false;
    size_t ftype_size;
    OPAL_PTRDIFF_TYPE ftype_extent, lb;

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0;
    double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0;
    double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0;
    mca_common_ompio_print_entry nentry;
#endif
#if DEBUG_ON
    MPI_Aint gc_in;
#endif
    opal_datatype_type_size ( &datatype->super, &ftype_size );
    opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent );

    /**************************************************************************
     ** 1.  In case the data is not contigous in memory, decode it into an iovec
     **************************************************************************/
    if ( ( ftype_extent == (OPAL_PTRDIFF_TYPE) ftype_size)             &&
         opal_datatype_is_contiguous_memory_layout(&datatype->super,1) &&
         0 == lb ) {
        recvbuf_is_contiguous = true;
    }


    /* In case the data is not contigous in memory, decode it into an iovec */
    if (!recvbuf_is_contiguous  ) {
        fh->f_decode_datatype ( (struct mca_io_ompio_file_t *)fh,
                                datatype,
                                count,
                                buf,
                                &max_data,
                                &decoded_iov,
                                &iov_count);
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
        status->_ucount = max_data;
    }


    fh->f_get_num_aggregators ( &static_num_io_procs );
    fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh,
                                static_num_io_procs,
                                max_data);
    my_aggregator = fh->f_procs_in_group[fh->f_aggregator_index];

    /*  printf("max_data %ld\n", max_data);  */
    ret = fh->f_generate_current_file_view((struct mca_io_ompio_file_t *)fh,
                                           max_data,
                                           &iov,
                                           &iov_size);
    if (ret != OMPI_SUCCESS){
        goto exit;
    }

    if ( iov_size > 0 ) {
        local_iov_array = (mca_fcoll_static_local_io_array *)malloc (iov_size * sizeof(mca_fcoll_static_local_io_array));
        if ( NULL == local_iov_array){
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }


        for (j=0; j < iov_size; j++){
            local_iov_array[j].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)
                iov[j].iov_base;
            local_iov_array[j].length = (size_t)iov[j].iov_len;
            local_iov_array[j].process_id = fh->f_rank;

        }
    }
    else {
        /* Allocate at least one element to correctly create the derived
           data type */
        local_iov_array = (mca_fcoll_static_local_io_array *)malloc (sizeof(mca_fcoll_static_local_io_array));
        if ( NULL == local_iov_array){
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }


        local_iov_array[0].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) 0;
        local_iov_array[0].length = (size_t) 0;
        local_iov_array[0].process_id = fh->f_rank;
    }

    d[0] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0];
    d[1] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].length;
    d[2] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].process_id;
    base = d[0];
    for (i=0 ; i<3 ; i++) {
        d[i] -= base;
    }

    /* io_array datatype  for using in communication*/
    types[0] = &ompi_mpi_long.dt;
    types[1] = &ompi_mpi_long.dt;
    types[2] = &ompi_mpi_int.dt;

    ompi_datatype_create_struct (3,
                                 blocklen,
                                 d,
                                 types,
                                 &io_array_type);
    ompi_datatype_commit (&io_array_type);

    /* #########################################################*/
    fh->f_get_bytes_per_agg ( (int*) &bytes_per_cycle);
    local_cycles = ceil((double)max_data*fh->f_procs_per_group/bytes_per_cycle);

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif
    ret = fh->f_comm->c_coll.coll_allreduce (&local_cycles,
                                             &cycles,
                                             1,
                                             MPI_INT,
                                             MPI_MAX,
                                             fh->f_comm,
                                             fh->f_comm->c_coll.coll_allreduce_module);

    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time  += end_rcomm_time - start_rcomm_time;
#endif


    if (my_aggregator == fh->f_rank) {
        disp_index = (int *) malloc (fh->f_procs_per_group * sizeof(int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int ));
        if (NULL == bytes_per_process){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        bytes_remaining = (int *) calloc (fh->f_procs_per_group, sizeof(int));
        if (NULL == bytes_remaining){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        current_index = (int *) calloc (fh->f_procs_per_group, sizeof(int));
        if (NULL == current_index){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        blocklen_per_process = (int **)calloc (fh->f_procs_per_group, sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        displs_per_process = (MPI_Aint **)calloc (fh->f_procs_per_group, sizeof (MPI_Aint*));
        if (NULL == displs_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }


    iovec_count_per_process = (int *) calloc (fh->f_procs_per_group, sizeof(int));
    if (NULL == iovec_count_per_process){
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    displs = (int *) calloc (fh->f_procs_per_group, sizeof(int));
    if (NULL == displs){
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif
    ret = fcoll_base_coll_allgather_array (&iov_size,
                                           1,
                                           MPI_INT,
                                           iovec_count_per_process,
                                           1,
                                           MPI_INT,
                                           fh->f_aggregator_index,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);

    if( OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time  += end_rcomm_time - start_rcomm_time;
#endif

    if (my_aggregator == fh->f_rank) {
        displs[0] = 0;
        global_iov_count = iovec_count_per_process[0];
        for (i=1 ; i<fh->f_procs_per_group ; i++) {
            global_iov_count += iovec_count_per_process[i];
            displs[i] = displs[i-1] + iovec_count_per_process[i-1];
        }
    }


    if ( (my_aggregator == fh->f_rank) &&
         (global_iov_count >  0 )) {
        global_iov_array = (mca_fcoll_static_local_io_array *) malloc (global_iov_count *
                                                      sizeof(mca_fcoll_static_local_io_array));
        if (NULL == global_iov_array){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif
    ret = fcoll_base_coll_gatherv_array (local_iov_array,
                                         iov_size,
                                         io_array_type,
                                         global_iov_array,
                                         iovec_count_per_process,
                                         displs,
                                         io_array_type,
                                         fh->f_aggregator_index,
                                         fh->f_procs_in_group,
                                         fh->f_procs_per_group,
                                         fh->f_comm);

    if (OMPI_SUCCESS != ret){
        fprintf(stderr,"global_iov_array gather error!\n");
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time  += end_rcomm_time - start_rcomm_time;
#endif


    if (NULL != local_iov_array){
        free(local_iov_array);
        local_iov_array = NULL;
    }

    if ( ( my_aggregator == fh->f_rank) &&
         ( global_iov_count > 0 )) {
        sorted = (int *)malloc (global_iov_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        read_local_heap_sort (global_iov_array, global_iov_count, sorted);

        send_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request));
        if (NULL == send_req){
            opal_output ( 1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        sendtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *));
        if (NULL == sendtype) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        for ( i=0; i<fh->f_procs_per_group; i++ ) {
            sendtype[i] = MPI_DATATYPE_NULL;
        }

        if (NULL == bytes_per_process){
            bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int));
            if (NULL == bytes_per_process){
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }
    }

#if DEBUG_ON

    if (my_aggregator == fh->f_rank) {
        for (gc_in=0; gc_in<global_iov_count; gc_in++){
            printf("%d: Offset[%ld]: %lld, Length[%ld]: %ld\n",
                   global_iov_array[sorted[gc_in]].process_id,
                   gc_in, global_iov_array[sorted[gc_in]].offset,
                   gc_in, global_iov_array[sorted[gc_in]].length);
        }
    }
#endif

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif

    for (index = 0; index < cycles; index++){

        if (my_aggregator == fh->f_rank) {

            fh->f_num_of_io_entries = 0;
            if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
            if (NULL != global_buf) {
                free (global_buf);
                global_buf = NULL;
            }

            if (NULL != sorted_file_offsets){
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }
            if (NULL != file_offsets_for_agg){
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }
            if (NULL != memory_displacements){
                free(memory_displacements);
                memory_displacements= NULL;
            }

            if ( NULL != sendtype ) {
                for ( i=0; i<fh->f_procs_per_group; i++ ) {
                    if ( MPI_DATATYPE_NULL != sendtype[i] ) {
                        ompi_datatype_destroy (&sendtype[i] );
                        sendtype[i] = MPI_DATATYPE_NULL;
                    }
                }
            }

            for(l=0;l<fh->f_procs_per_group;l++){
                disp_index[l] =  1;
                if (NULL != blocklen_per_process[l]){
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
                if (NULL != displs_per_process[l]){
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                if (NULL == blocklen_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for blocklen\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l]){
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }
        }

        if (index < local_cycles ) {
            if ((index == local_cycles-1) && (max_data % (bytes_per_cycle/fh->f_procs_per_group))) {
                bytes_to_read_in_cycle = max_data - position;
            }
            else if (max_data <= bytes_per_cycle/fh->f_procs_per_group) {
                bytes_to_read_in_cycle = max_data;
            }
            else {
                bytes_to_read_in_cycle = bytes_per_cycle/fh->f_procs_per_group;
            }
        }
        else {
            bytes_to_read_in_cycle = 0;
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif
        fcoll_base_coll_gather_array (&bytes_to_read_in_cycle,
                                      1,
                                      MPI_INT,
                                      bytes_per_process,
                                      1,
                                      MPI_INT,
                                      fh->f_aggregator_index,
                                      fh->f_procs_in_group,
                                      fh->f_procs_per_group,
                                      fh->f_comm);

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time  += end_rcomm_time - start_rcomm_time;
#endif

        if (recvbuf_is_contiguous ) {
            receive_buf = &((char*)buf)[position];
        }
        else if (bytes_to_read_in_cycle) {
            receive_buf = (char *) malloc (bytes_to_read_in_cycle * sizeof(char));
            if ( NULL == receive_buf){
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }


#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        start_rcomm_time = MPI_Wtime();
#endif

        ret = MCA_PML_CALL(irecv(receive_buf,
                                 bytes_to_read_in_cycle,
                                 MPI_BYTE,
                                 my_aggregator,
                                 123,
                                 fh->f_comm,
                                 &recv_req));
        if (OMPI_SUCCESS != ret){
            goto exit;
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time  += end_rcomm_time - start_rcomm_time;
#endif


        if (my_aggregator == fh->f_rank) {
            for (i=0;i<fh->f_procs_per_group; i++){
                while (bytes_per_process[i] > 0){
                    /*printf("%d: bytes_per_process[%d]: %d, bytes_remaining[%d]: %d\n",
                      index, i, bytes_per_process[i], i, bytes_remaining[i]);*/
                    if (read_get_process_id(global_iov_array[sorted[current_index[i]]].process_id,
                                            fh) == i){ /* current id owns this entry!*/
                        if (bytes_remaining[i]){ /*Remaining bytes in the current entry of
                                                   the global offset array*/
                            if (bytes_remaining[i] <= bytes_per_process[i]){

                                blocklen_per_process[i][disp_index[i] - 1] = bytes_remaining[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);
                                blocklen_per_process[i] = (int *) realloc
                                    ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *)realloc
                                    ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                bytes_per_process[i] -= bytes_remaining[i];
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                disp_index[i] += 1;
                                bytes_remaining[i] = 0;
                                /* This entry has been used up, we need to move to the
                                   next entry of this process and make current_index point there*/
                                current_index[i]  = read_find_next_index(i,
                                                                         current_index[i],
                                                                         fh,
                                                                         global_iov_array,
                                                                         global_iov_count,
                                                                         sorted);
                                if (current_index[i] == -1){
                                    break;
                                }
                                continue;
                            }
                            else{
                                blocklen_per_process[i][disp_index[i] - 1] = bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);
                                bytes_remaining[i] -= bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                        }
                        else{
                            if (bytes_per_process[i] <
                                global_iov_array[sorted[current_index[i]]].length){
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;
                                bytes_remaining[i] =
                                    global_iov_array[sorted[current_index[i]]].length -
                                    bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                            else {
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].length;
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;
                                blocklen_per_process[i] =
                                    (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *)realloc
                                    ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                disp_index[i] += 1;
                                bytes_per_process[i] -=
                                    global_iov_array[sorted[current_index[i]]].length;
                                current_index[i] = read_find_next_index(i,
                                                                        current_index[i],
                                                                        fh,
                                                                        global_iov_array,
                                                                        global_iov_count,
                                                                        sorted);
                                if (current_index[i] == -1){
                                    break;
                                }
                            }
                        }
                    }
                    else{
                        current_index[i] = read_find_next_index(i,
                                                                current_index[i],
                                                                fh,
                                                                global_iov_array,
                                                                global_iov_count,
                                                                sorted);
                        if (current_index[i] == -1){
                            bytes_per_process[i] = 0; /* no more entries left
                                                         to service this request*/
                            continue;
                        }
                    }
                }
            }

            entries_per_aggregator=0;
            for (i=0;i<fh->f_procs_per_group;i++){
                for (j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0){
                        entries_per_aggregator++;
#if DEBUG_ON
                        printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
                               fh->f_procs_in_group[i],j,
                               blocklen_per_process[i][j],j,
                               displs_per_process[i][j],
                               fh->f_rank);

#endif
                    }
                }
            }

            if (entries_per_aggregator > 0){
                file_offsets_for_agg = (mca_fcoll_static_local_io_array *)
                    malloc(entries_per_aggregator*sizeof(mca_fcoll_static_local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                sorted_file_offsets = (int *) malloc (entries_per_aggregator * sizeof(int));
                if (NULL == sorted_file_offsets){
                    opal_output (1, "OUT OF MEMORY\n");
                    ret =  OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                temp_index=0;
                global_count = 0;
                for (i=0;i<fh->f_procs_per_group; i++){
                    for(j=0;j<disp_index[i]; j++){
                        if (blocklen_per_process[i][j] > 0){
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            global_count += blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;
                        }
                    }
                }
            }
            else{
                continue;
            }
            read_local_heap_sort (file_offsets_for_agg,
                                  entries_per_aggregator,
                                  sorted_file_offsets);
            memory_displacements = (MPI_Aint *) malloc
                (entries_per_aggregator * sizeof(MPI_Aint));
            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++){
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }

            global_buf = (char *) malloc (global_count * sizeof(char));
            if (NULL == global_buf){
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0; i<entries_per_aggregator;i++){
                printf("%d: OFFSET: %lld   LENGTH: %ld, Mem-offset: %ld, disp_index :%d\n",
                       file_offsets_for_agg[sorted_file_offsets[i]].process_id,
                       file_offsets_for_agg[sorted_file_offsets[i]].offset,
                       file_offsets_for_agg[sorted_file_offsets[i]].length,
                       memory_displacements[sorted_file_offsets[i]],
                       disp_index[i]);
            }
#endif

            fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
                (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret =  OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }



            fh->f_num_of_io_entries = 0;
            fh->f_io_array[0].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[0].length = file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[0].memory_address = global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;
            for (i=1;i<entries_per_aggregator;i++){
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                    file_offsets_for_agg[sorted_file_offsets[i]].offset){
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else{
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }
            }

#if DEBUG_ON
            printf("*************************** %d\n", fh->f_num_of_io_entries);
            for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
                printf(" ADDRESS: %p  OFFSET: %ld   LENGTH: %ld\n",
                       fh->f_io_array[i].memory_address,
                       (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset,
                       fh->f_io_array[i].length);
            }
#endif
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_read_time = MPI_Wtime();
#endif

            if (fh->f_num_of_io_entries) {
                if ( 0 > fh->f_fbtl->fbtl_preadv (fh)) {
                    opal_output (1, "READ FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_read_time = MPI_Wtime();
            read_time += end_read_time - start_read_time;
#endif


#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            if (my_aggregator == fh->f_rank){
                for (i=0 ; i<global_count/4 ; i++)
                    printf (" READ %d \n",((int *)global_buf)[i]);
            }
#endif

            temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            for (i=0; i<entries_per_aggregator; i++){
                temp_index =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_index][temp_disp_index[temp_index]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_index] < disp_index[temp_index]){
                    temp_disp_index[temp_index] += 1;
                }
                else{
                    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
                           temp_index, temp_disp_index[temp_index],
                           temp_index, disp_index[temp_index]);
                }
            }
            if (NULL != temp_disp_index){
                free(temp_disp_index);
                temp_disp_index = NULL;
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_rcomm_time = MPI_Wtime();
#endif

            for (i=0;i<fh->f_procs_per_group; i++){
                send_req[i] = MPI_REQUEST_NULL;
                ompi_datatype_create_hindexed(disp_index[i],
                                              blocklen_per_process[i],
                                              displs_per_process[i],
                                              MPI_BYTE,
                                              &sendtype[i]);
                ompi_datatype_commit(&sendtype[i]);
                ret = MCA_PML_CALL (isend(global_buf,
                                          1,
                                          sendtype[i],
                                          fh->f_procs_in_group[i],
                                          123,
                                          MCA_PML_BASE_SEND_STANDARD,
                                          fh->f_comm,
                                          &send_req[i]));
                if(OMPI_SUCCESS != ret){
                    goto exit;
                }
            }

            ret = ompi_request_wait_all (fh->f_procs_per_group,
                                         send_req,
                                         MPI_STATUS_IGNORE);
            if (OMPI_SUCCESS != ret){
                goto exit;
            }
        } /* if ( my_aggregator == fh->f_rank ) */

        ret = ompi_request_wait (&recv_req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

        position += bytes_to_read_in_cycle;

        if (!recvbuf_is_contiguous) {
            OPAL_PTRDIFF_TYPE mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;

            remaining = bytes_to_read_in_cycle;

            while (remaining && (iov_count > iov_index)){
                mem_address = (OPAL_PTRDIFF_TYPE)
                    (decoded_iov[iov_index].iov_base) + current_position;

                if (remaining >=
                    (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                        (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                        (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else{
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }
            if (NULL != receive_buf) {
                free (receive_buf);
                receive_buf = NULL;
            }
        }

    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rexch = MPI_Wtime();
    read_exch += end_rexch - start_rexch;
    nentry.time[0] = read_time;
    nentry.time[1] = rcomm_time;
    nentry.time[2] = read_exch;
    if (my_aggregator == fh->f_rank)
        nentry.aggregator = 1;
    else
        nentry.aggregator = 0;
    nentry.nprocs_for_coll = static_num_io_procs;
    if (!mca_common_ompio_full_print_queue(fh->f_coll_read_time)){
        mca_common_ompio_register_print_entry(fh->f_coll_read_time,
                                              nentry);
    }
#endif

exit:
    if (NULL != decoded_iov){
        free(decoded_iov);
        decoded_iov = NULL;
    }

    if (NULL != displs){
        free(displs);
        displs = NULL;
    }

    if (NULL != iovec_count_per_process){
        free(iovec_count_per_process);
        iovec_count_per_process=NULL;
    }

    if (NULL != local_iov_array){
        free(local_iov_array);
        local_iov_array=NULL;
    }

    if (NULL != global_iov_array){
        free(global_iov_array);
        global_iov_array=NULL;
    }

    if (my_aggregator == fh->f_rank) {

        for(l=0;l<fh->f_procs_per_group;l++){
            if (blocklen_per_process) {
                free(blocklen_per_process[l]);
            }
            if (NULL != displs_per_process[l]){
                free(displs_per_process[l]);
                displs_per_process[l] = NULL;
            }
        }
    }

    if (NULL != bytes_per_process){
        free(bytes_per_process);
        bytes_per_process =NULL;
    }

    if (NULL != disp_index){
        free(disp_index);
        disp_index =NULL;
    }

    if (NULL != displs_per_process){
        free(displs_per_process);
        displs_per_process = NULL;
    }

    if(NULL != bytes_remaining){
        free(bytes_remaining);
        bytes_remaining = NULL;
    }

    if(NULL != current_index){
        free(current_index);
        current_index = NULL;
    }

    if (NULL != blocklen_per_process){
        free(blocklen_per_process);
        blocklen_per_process =NULL;
    }

    if (NULL != bytes_remaining){
        free(bytes_remaining);
        bytes_remaining =NULL;
    }

    if (NULL != memory_displacements){
        free(memory_displacements);
        memory_displacements= NULL;
    }

    if (NULL != file_offsets_for_agg){
        free(file_offsets_for_agg);
        file_offsets_for_agg = NULL;
    }

    if (NULL != sorted_file_offsets){
        free(sorted_file_offsets);
        sorted_file_offsets = NULL;
    }

    if (NULL != sendtype){
        free(sendtype);
        sendtype=NULL;
    }

    if ( !recvbuf_is_contiguous ) {
        if (NULL != receive_buf){
            free(receive_buf);
            receive_buf=NULL;
        }
    }

    if (NULL != global_buf) {
        free(global_buf);
        global_buf = NULL;
    }

    if (NULL != sorted) {
        free(sorted);
        sorted = NULL;
    }

    if (NULL != send_req){
        free(send_req);
        send_req = NULL;
    }


    return ret;

}