Example #1
ompi_datatype_t* test_struct( void )
{
    ompi_datatype_t* types[] = { &ompi_mpi_float.dt  /* ompi_datatype_basicDatatypes[DT_FLOAT] */,
                                 NULL,
                                 &ompi_mpi_char.dt  /* ompi_datatype_basicDatatypes[DT_CHAR] */ };
    int lengths[] = { 2, 1, 3 };
    MPI_Aint disp[] = { 0, 16, 26 };
    ompi_datatype_t* pdt, *pdt1;

    printf( "test struct\n" );
    ompi_datatype_create_contiguous(0, &ompi_mpi_datatype_null.dt, &pdt1);
    ompi_datatype_add( pdt1, &ompi_mpi_double.dt, 1, 0, -1 );
    ompi_datatype_add( pdt1, &ompi_mpi_char.dt, 1, 8, -1 );
    if( outputFlags & DUMP_DATA_AFTER_COMMIT ) {
        ompi_datatype_dump( pdt1 );
    }

    types[1] = pdt1;

    ompi_datatype_create_struct( 3, lengths, disp, types, &pdt );
    OBJ_RELEASE( pdt1 ); /*assert( pdt1 == NULL );*/
    if( outputFlags & DUMP_DATA_AFTER_COMMIT ) {
        ompi_datatype_dump( pdt );
    }
    return pdt;
}
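For orientation, here is a rough public-API equivalent of the layout built above: MPI_Type_create_struct is the user-level counterpart of ompi_datatype_create_struct. This is only a sketch under the assumption that pdt1 behaves like a {double at 0, char at 8} struct; it is not part of the original test.

#include <mpi.h>

MPI_Datatype equivalent_test_struct( void )
{
    int inner_lengths[2] = { 1, 1 };
    MPI_Aint inner_disp[2] = { 0, 8 };
    MPI_Datatype inner_types[2] = { MPI_DOUBLE, MPI_CHAR };
    int lengths[3] = { 2, 1, 3 };
    MPI_Aint disp[3] = { 0, 16, 26 };
    MPI_Datatype inner, outer, types[3];

    /* inner pair: a double at offset 0 and a char at offset 8, like pdt1 */
    MPI_Type_create_struct( 2, inner_lengths, inner_disp, inner_types, &inner );
    types[0] = MPI_FLOAT; types[1] = inner; types[2] = MPI_CHAR;
    /* two floats at 0, the inner pair at 16, three chars at 26 */
    MPI_Type_create_struct( 3, lengths, disp, types, &outer );
    MPI_Type_free( &inner );
    MPI_Type_commit( &outer );
    return outer;
}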
Example #2
ompi_datatype_t* create_strange_dt( void )
{
    sdata_intern v[2];
    MPI_Aint displ[3];
    ompi_datatype_t* types[3] = { &ompi_mpi_int.dt };
    sstrange t[2];
    int pBlock[3] = {1, 10, 1}, dispi[3];
    ompi_datatype_t *pdt, *pdt1, *pdt2, *pdtTemp;

    dispi[0] = (int)((char*)&(v[0].i1) - (char*)&(v[0]));  /* 0 */
    dispi[1] = (int)(((char*)(&(v[0].i2)) - (char*)&(v[0])) / sizeof(int));  /* 2 */
    ompi_datatype_create_indexed_block( 2, 1, dispi, &ompi_mpi_int.dt, &pdtTemp );
#ifdef USE_RESIZED
    /* optional */
    displ[0] = 0;
    displ[1] = (char*)&(v[1]) - (char*)&(v[0]);
    ompi_datatype_create_resized( pdtTemp, displ[0], displ[1], &pdt1 );
    OBJ_RELEASE( pdtTemp ); assert( pdtTemp == NULL );
#else
    pdt1 = pdtTemp;
#endif  /* USE_RESIZED */

    types[1] = pdt1;
    types[2] = &ompi_mpi_int.dt;
    displ[0] = 0;
    displ[1] = (long)((char*)&(t[0].v[0]) - (char*)&(t[0]));
    displ[2] = (long)((char*)&(t[0].last) - (char*)&(t[0]));
    ompi_datatype_create_struct( 3, pBlock, displ, types, &pdtTemp );
#ifdef USE_RESIZED
    /* optional */
    displ[1] = (char*)&(t[1]) - (char*)&(t[0]);
    ompi_datatype_create_resized( pdtTemp, displ[0], displ[1], &pdt2 );
    OBJ_RELEASE( pdtTemp ); assert( pdtTemp == NULL );
#else
    pdt2 = pdtTemp;
#endif  /* USE_RESIZED */

    ompi_datatype_create_contiguous( SSTRANGE_CNT, pdt2, &pdt );

    OBJ_RELEASE( pdt1 );
    OBJ_RELEASE( pdt2 );
    printf( "\nStrange datatype BEFORE COMMIT\n" );
    if( outputFlags & DUMP_DATA_AFTER_COMMIT ) {
        ompi_datatype_dump( pdt );
    }
    ompi_datatype_commit( &pdt );
    printf( "\nStrange datatype AFTER COMMIT\n" );
    if( outputFlags & DUMP_DATA_AFTER_COMMIT ) {
        ompi_datatype_dump( pdt );
    }
    return pdt;
}
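The example does not show the type definitions it relies on. Layouts consistent with the displacement comments (dispi[1] == 2, i.e. i2 sits two ints past i1) and with pBlock[1] == 10 would look roughly like this; the exact definitions and the value of SSTRANGE_CNT live elsewhere in the test suite:

typedef struct {
    int i1;
    int gap;   /* the hole that the indexed-block displacement 2 skips over */
    int i2;
} sdata_intern;

typedef struct {
    int counter;
    sdata_intern v[10];   /* matches pBlock[1] == 10 */
    int last;
} sstrange;

#define SSTRANGE_CNT 10   /* placeholder; the real count is defined elsewhere */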
Example #3
ompi_datatype_t* test_struct_char_double( void )
{
    char_double_t data;
    int lengths[] = {1, 1};
    MPI_Aint displ[] = {0, 0};
    ompi_datatype_t *pdt;
    ompi_datatype_t* types[] = { &ompi_mpi_char.dt, &ompi_mpi_double.dt};

    displ[0] = (char*)&(data.c) - (char*)&(data);
    displ[1] = (char*)&(data.d) - (char*)&(data);

    ompi_datatype_create_struct( 2, lengths, displ, types, &pdt );
    ompi_datatype_commit( &pdt );
    if( outputFlags & DUMP_DATA_AFTER_COMMIT ) {
        ompi_datatype_dump( pdt );
    }
    return pdt;
}
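char_double_t is not defined in the excerpt; the pointer arithmetic above computes the (usually padded) offset of d, so the assumed layout is simply:

typedef struct {
    char   c;   /* offset 0 */
    double d;   /* offset 8 on most ABIs, due to alignment padding */
} char_double_t;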
Example #4
ompi_datatype_t* create_struct_constant_gap_resized_ddt( ompi_datatype_t* type )
{
    struct structure data[1];
    ompi_datatype_t *struct_type, *temp_type;
    ompi_datatype_t *types[2] = {type, type};
    int blocklens[2] = {1, 1};
    MPI_Aint disps[3];

    MPI_Get_address(&data[0].transfered_1, &disps[0]);
    MPI_Get_address(&data[0].transfered_2, &disps[1]);
    MPI_Get_address(&data[0], &disps[2]);
    disps[1] -= disps[2]; /* 16 */
    disps[0] -= disps[2]; /*  8 */

    ompi_datatype_create_struct(2, blocklens, disps, types, &temp_type);
    ompi_datatype_create_resized(temp_type, 0, sizeof(data[0]), &struct_type);
    ompi_datatype_commit(&struct_type);
    OBJ_RELEASE(temp_type); assert( temp_type == NULL );
    if( outputFlags & DUMP_DATA_AFTER_COMMIT ) {
        ompi_datatype_dump( struct_type );
    }

    return struct_type;
}
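struct structure is likewise not shown. A layout consistent with the computed displacements of 8 and 16, and with the constant gap that the resize preserves, would be:

struct structure {
    double not_transfered;   /* the gap: never sent */
    double transfered_1;     /* offset 8 */
    double transfered_2;     /* offset 16 */
};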
static int
cyclic(const int *gsize_array, int dim, int ndims, int nprocs,
       int rank, int darg, int order, ptrdiff_t orig_extent,
       ompi_datatype_t* type_old, ompi_datatype_t **type_new,
       ptrdiff_t *st_offset)
{
    int blksize, i, blklens[2], st_index, end_index, local_size, rem, count, rc;
    ptrdiff_t stride, disps[2];
    ompi_datatype_t *type_tmp, *types[2];

    if (darg == MPI_DISTRIBUTE_DFLT_DARG) {
        blksize = 1;
    } else {
        blksize = darg;
    }

    st_index = rank * blksize;
    end_index = gsize_array[dim] - 1;

    if (end_index < st_index) {
        local_size = 0;
    } else {
        local_size = ((end_index - st_index + 1)/(nprocs*blksize))*blksize;
        rem = (end_index - st_index + 1) % (nprocs*blksize);
        local_size += rem < blksize ? rem : blksize;
    }

    count = local_size / blksize;
    rem = local_size % blksize;

    stride = nprocs*blksize*orig_extent;
    if (order == MPI_ORDER_FORTRAN) {
        for (i=0; i<dim; i++) {
            stride *= gsize_array[i];
        }
    } else {
        for (i=ndims-1; i>dim; i--) {
            stride *= gsize_array[i];
        }
    }

    rc = ompi_datatype_create_hvector(count, blksize, stride, type_old, type_new);
    if (OMPI_SUCCESS != rc) return rc;

    if (rem) {
        /* if the last block is of size less than blksize, include
           it separately using MPI_Type_struct */

        types  [0] = *type_new; types  [1] = type_old;
        disps  [0] = 0;         disps  [1] = count*stride;
        blklens[0] = 1;         blklens[1] = rem;

        rc = ompi_datatype_create_struct(2, blklens, disps, types, &type_tmp);
        ompi_datatype_destroy(type_new);
        /* even in error condition, need to destroy type_new, so check
           for error after destroy. */
        if (OMPI_SUCCESS != rc) return rc;
        *type_new = type_tmp;
    }

    /* need to set the UB for block-cyclic to work */
    disps[0] = 0;         disps[1] = orig_extent;
    if (order == MPI_ORDER_FORTRAN) {
        for(i=0; i<=dim; i++) {
            disps[1] *= gsize_array[i];
        }
    } else {
        for(i=ndims-1; i>=dim; i--) {
            disps[1] *= gsize_array[i];
        }
    }
    rc = opal_datatype_resize( &(*type_new)->super, disps[0], disps[1] );
    if (OMPI_SUCCESS != rc) return rc;

    *st_offset = rank * blksize;
    /* in terms of no. of elements of type oldtype in this dimension */
    if (local_size == 0) *st_offset = 0;

    return OMPI_SUCCESS;
}
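A worked instance of the local_size arithmetic above may help (assumed values, not from the source): 10 elements distributed cyclically in blocks of 2 over 3 processes assigns blocks [0,1] to rank 0, [2,3] to rank 1, [4,5] to rank 2, [6,7] to rank 0 and [8,9] to rank 1, so rank 1 owns 4 elements.

#include <stdio.h>

int main(void)
{
    int gsize = 10, nprocs = 3, blksize = 2, rank = 1;
    int st_index  = rank * blksize;     /* 2 */
    int end_index = gsize - 1;          /* 9 */
    int local_size, rem;

    local_size = ((end_index - st_index + 1) / (nprocs * blksize)) * blksize; /* 2 */
    rem = (end_index - st_index + 1) % (nprocs * blksize);                    /* 2 */
    local_size += rem < blksize ? rem : blksize;                              /* 4 */
    printf("rank %d owns %d elements\n", rank, local_size);  /* rank 1 owns 4 */
    return 0;
}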
int32_t ompi_datatype_create_darray(int size,
                                    int rank,
                                    int ndims,
                                    int const* gsize_array,
                                    int const* distrib_array,
                                    int const* darg_array,
                                    int const* psize_array,
                                    int order,
                                    const ompi_datatype_t* oldtype,
                                    ompi_datatype_t** newtype)
{
    ompi_datatype_t *lastType;
    ptrdiff_t orig_extent, *st_offsets = NULL;
    int i, start_loop, end_loop, step;
    int *coords = NULL, rc = OMPI_SUCCESS;

    /* speedy corner case */
    if (ndims < 1) {
        /* Don't just return MPI_DATATYPE_NULL as that can't be
           MPI_TYPE_FREE()ed, and that seems bad */
        *newtype = ompi_datatype_create(0);
        ompi_datatype_add(*newtype, &ompi_mpi_datatype_null.dt, 0, 0, 0);
        return MPI_SUCCESS;
    }

    rc = ompi_datatype_type_extent(oldtype, &orig_extent);
    if (MPI_SUCCESS != rc) goto cleanup;

    /* calculate position in grid using row-major ordering */
    {
        int tmp_rank = rank, procs = size;

        coords = (int *) malloc(ndims * sizeof(int));
        for (i = 0 ; i < ndims ; i++) {
            procs = procs / psize_array[i];
            coords[i] = tmp_rank / procs;
            tmp_rank = tmp_rank % procs;
        }
    }

    st_offsets = (ptrdiff_t *) malloc(ndims * sizeof(ptrdiff_t));

    /* duplicate type here to 1) deal with constness without
       casting and 2) eliminate the need for a conditional destroy below.
       Lame, yes.  But cleaner code all around. */
    rc = ompi_datatype_duplicate(oldtype, &lastType);
    if (OMPI_SUCCESS != rc) goto cleanup;

    /* figure out ordering issues */
    if (MPI_ORDER_C == order) {
        start_loop = ndims - 1 ; step = -1; end_loop = -1;
    } else {
        start_loop = 0 ; step = 1; end_loop = ndims;
    }

    /* Build up array */
    for (i = start_loop; i != end_loop; i += step) {
        int nprocs, tmp_rank;

        switch(distrib_array[i]) {
        case MPI_DISTRIBUTE_BLOCK:
            rc = block(gsize_array, i, ndims, psize_array[i], coords[i],
                       darg_array[i], order, orig_extent,
                       lastType, newtype, st_offsets+i);
            break;
        case MPI_DISTRIBUTE_CYCLIC:
            rc = cyclic(gsize_array, i, ndims, psize_array[i], coords[i],
                        darg_array[i], order, orig_extent,
                        lastType, newtype, st_offsets+i);
            break;
        case MPI_DISTRIBUTE_NONE:
            /* treat it as a block distribution on 1 process */
            if (order == MPI_ORDER_C) {
                nprocs = psize_array[i]; tmp_rank = coords[i];
            } else {
                nprocs = 1; tmp_rank = 0;
            }

            rc = block(gsize_array, i, ndims, nprocs, tmp_rank,
                       MPI_DISTRIBUTE_DFLT_DARG, order, orig_extent,
                       lastType, newtype, st_offsets+i);
            break;
        default:
            rc = MPI_ERR_ARG;
        }
        ompi_datatype_destroy(&lastType);
        /* need to destroy the old type even in error condition, so
           don't check return code from above until after cleanup. */
        if (MPI_SUCCESS != rc) goto cleanup;
        lastType = *newtype;
    }


    /* set displacement and UB correctly. Use struct instead of
       resized for same reason as subarray */
    {
        ptrdiff_t displs[3], tmp_size;
        ompi_datatype_t *types[3];
        int blength[3] = { 1, 1, 1};

        displs[1] = st_offsets[start_loop];
        tmp_size = 1;
        for (i = start_loop + step ; i != end_loop ; i += step) {
            tmp_size *= gsize_array[i - step];
            displs[1] += tmp_size * st_offsets[i];
        }

        displs[0] = 0;
        displs[1] *= orig_extent;
        displs[2] = orig_extent;
        for (i = 0 ; i < ndims ; i++) {
            displs[2] *= gsize_array[i];
        }
        types[0] = MPI_LB; types[1] = lastType; types[2] = MPI_UB;

        rc = ompi_datatype_create_struct(3, blength, displs, types, newtype);
        ompi_datatype_destroy(&lastType);
        /* need to destroy the old type even in error condition, so
           don't check return code from above until after cleanup. */
        if (MPI_SUCCESS != rc) goto cleanup;
    }

 cleanup:
    if (NULL != st_offsets) free(st_offsets);
    if (NULL != coords) free(coords);
    return rc;
}
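At the user level this routine sits behind MPI_Type_create_darray. A hedged usage sketch, not taken from the source: a 100x100 array of doubles, block-distributed over a 2x2 process grid.

#include <mpi.h>

MPI_Datatype make_darray( int rank )
{
    MPI_Datatype newtype;
    int gsizes[2]   = { 100, 100 };
    int distribs[2] = { MPI_DISTRIBUTE_BLOCK, MPI_DISTRIBUTE_BLOCK };
    int dargs[2]    = { MPI_DISTRIBUTE_DFLT_DARG, MPI_DISTRIBUTE_DFLT_DARG };
    int psizes[2]   = { 2, 2 };

    MPI_Type_create_darray( 4 /* size */, rank, 2 /* ndims */,
                            gsizes, distribs, dargs, psizes,
                            MPI_ORDER_C, MPI_DOUBLE, &newtype );
    MPI_Type_commit( &newtype );
    return newtype;
}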
Example #7
static ompi_datatype_t* __ompi_datatype_create_from_args( int32_t* i, MPI_Aint* a,
                                                          ompi_datatype_t** d, int32_t type )
{
    ompi_datatype_t* datatype = NULL;

    switch(type){
        /******************************************************************/
    case MPI_COMBINER_DUP:
        /* should we duplicate d[0]? */
        /* ompi_datatype_set_args( datatype, 0, NULL, 0, NULL, 1, d[0], MPI_COMBINER_DUP ); */
        assert(0);  /* shouldn't happen */
        break;
        /******************************************************************/
    case MPI_COMBINER_CONTIGUOUS:
        ompi_datatype_create_contiguous( i[0], d[0], &datatype );
        ompi_datatype_set_args( datatype, 1, (const int **) &i, 0, NULL, 1, d, MPI_COMBINER_CONTIGUOUS );
        break;
        /******************************************************************/
    case MPI_COMBINER_VECTOR:
        ompi_datatype_create_vector( i[0], i[1], i[2], d[0], &datatype );
        {
            const int* a_i[3] = {&i[0], &i[1], &i[2]};
            ompi_datatype_set_args( datatype, 3, a_i, 0, NULL, 1, d, MPI_COMBINER_VECTOR );
        }
        break;
        /******************************************************************/
    case MPI_COMBINER_HVECTOR_INTEGER:
    case MPI_COMBINER_HVECTOR:
        ompi_datatype_create_hvector( i[0], i[1], a[0], d[0], &datatype );
        {
            const int* a_i[2] = {&i[0], &i[1]};
            ompi_datatype_set_args( datatype, 2, a_i, 1, a, 1, d, MPI_COMBINER_HVECTOR );
        }
        break;
        /******************************************************************/
    case MPI_COMBINER_INDEXED:  /* TO CHECK */
        ompi_datatype_create_indexed( i[0], &(i[1]), &(i[1+i[0]]), d[0], &datatype );
        {
            const int* a_i[3] = {&i[0], &i[1], &(i[1+i[0]])};
            ompi_datatype_set_args( datatype, 2 * i[0] + 1, a_i, 0, NULL, 1, d, MPI_COMBINER_INDEXED );
        }
        break;
        /******************************************************************/
    case MPI_COMBINER_HINDEXED_INTEGER:
    case MPI_COMBINER_HINDEXED:
        ompi_datatype_create_hindexed( i[0], &(i[1]), a, d[0], &datatype );
        {
            const int* a_i[2] = {&i[0], &i[1]};
            ompi_datatype_set_args( datatype, i[0] + 1, a_i, i[0], a, 1, d, MPI_COMBINER_HINDEXED );
        }
        break;
        /******************************************************************/
    case MPI_COMBINER_INDEXED_BLOCK:
        ompi_datatype_create_indexed_block( i[0], i[1], &(i[2]), d[0], &datatype );
        {
            const int* a_i[3] = {&i[0], &i[1], &i[2]};
            ompi_datatype_set_args( datatype, i[0] + 2, a_i, 0, NULL, 1, d, MPI_COMBINER_INDEXED_BLOCK );
        }
        break;
        /******************************************************************/
    case MPI_COMBINER_STRUCT_INTEGER:
    case MPI_COMBINER_STRUCT:
        ompi_datatype_create_struct( i[0], &(i[1]), a, d, &datatype );
        {
            const int* a_i[2] = {&i[0], &i[1]};
            ompi_datatype_set_args( datatype, i[0] + 1, a_i, i[0], a, i[0], d, MPI_COMBINER_STRUCT );
        }
        break;
        /******************************************************************/
    case MPI_COMBINER_SUBARRAY:
        ompi_datatype_create_subarray( i[0], &i[1 + 0 * i[0]], &i[1 + 1 * i[0]],
                                       &i[1 + 2 * i[0]], i[1 + 3 * i[0]],
                                       d[0], &datatype );
        {
            const int* a_i[5] = {&i[0], &i[1 + 0 * i[0]], &i[1 + 1 * i[0]], &i[1 + 2 * i[0]], &i[1 + 3 * i[0]]};
            ompi_datatype_set_args( datatype, 3 * i[0] + 2, a_i, 0, NULL, 1, d, MPI_COMBINER_SUBARRAY);
        }
        break;
        /******************************************************************/
    case MPI_COMBINER_DARRAY:
        ompi_datatype_create_darray( i[0] /* size */, i[1] /* rank */, i[2] /* ndims */,
                                     &i[3 + 0 * i[2]], &i[3 + 1 * i[2]],
                                     &i[3 + 2 * i[2]], &i[3 + 3 * i[2]],
                                     i[3 + 4 * i[2]], d[0], &datatype );
        {
            const int* a_i[8] = {&i[0], &i[1], &i[2], &i[3 + 0 * i[2]], &i[3 + 1 * i[2]], &i[3 + 2 * i[2]],
                                 &i[3 + 3 * i[2]], &i[3 + 4 * i[2]]};
            ompi_datatype_set_args( datatype, 4 * i[2] + 4, a_i, 0, NULL, 1, d, MPI_COMBINER_DARRAY);
        }
        break;
        /******************************************************************/
    case MPI_COMBINER_F90_REAL:
    case MPI_COMBINER_F90_COMPLEX:
        /*pArgs->i[0] = i[0][0];
          pArgs->i[1] = i[1][0];
        */
        break;
        /******************************************************************/
    case MPI_COMBINER_F90_INTEGER:
        /*pArgs->i[0] = i[0][0];*/
        break;
        /******************************************************************/
    case MPI_COMBINER_RESIZED:
        ompi_datatype_create_resized(d[0], a[0], a[1], &datatype);
        ompi_datatype_set_args( datatype, 0, NULL, 2, a, 1, d, MPI_COMBINER_RESIZED );
        break;
        /******************************************************************/
    case MPI_COMBINER_HINDEXED_BLOCK:
        ompi_datatype_create_hindexed_block( i[0], i[1], a, d[0], &datatype );
        {
            const int* a_i[2] = {&i[0], &i[1]};
            ompi_datatype_set_args( datatype, 2 + i[0], a_i, i[0], a, 1, d, MPI_COMBINER_HINDEXED_BLOCK );
        }
        break;
        /******************************************************************/
     default:
        break;
    }

    return datatype;
}
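To illustrate the flattened argument convention the function decodes (purely illustrative, since the function is static and the values are assumed): for MPI_COMBINER_VECTOR, i[] carries {count, blocklength, stride}, there are no address arguments, and d[0] is the base type.

static void vector_args_example( void )
{
    int32_t i_args[3] = { 4, 2, 8 };   /* count, blocklength, stride */
    MPI_Aint a_args[1] = { 0 };        /* unused for MPI_COMBINER_VECTOR */
    ompi_datatype_t* d_args[1] = { &ompi_mpi_int.dt };
    ompi_datatype_t* vec;

    /* equivalent to ompi_datatype_create_vector( 4, 2, 8, MPI_INT, &vec ) */
    vec = __ompi_datatype_create_from_args( i_args, a_args, d_args,
                                            MPI_COMBINER_VECTOR );
    (void)vec;
}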
int32_t ompi_datatype_create_subarray(int ndims,
                                      int const* size_array,
                                      int const* subsize_array,
                                      int const* start_array,
                                      int order,
                                      const ompi_datatype_t* oldtype,
                                      ompi_datatype_t** newtype)
{
    MPI_Datatype last_type;
    int32_t i, step, end_loop;
    MPI_Aint size, displ, extent;

    /**
     * If the oldtype contains the original MPI_LB and MPI_UB markers then we
     * are forced to follow the MPI standard suggestion and reset these 2
     * markers (MPI 3.0 page 96 line 37).  Otherwise we can simply resize the
     * datatype.
     */
    ompi_datatype_type_extent( oldtype, &extent );

    /* If ndims is zero then return the NULL datatype */
    if( ndims < 2 ) {
        if( 0 == ndims ) {
            *newtype = &ompi_mpi_datatype_null.dt;
            return MPI_SUCCESS;
        }
        ompi_datatype_create_contiguous( subsize_array[0], oldtype, &last_type );
        size = size_array[0];
        displ = start_array[0];
        goto replace_subarray_type;
    }

    if( MPI_ORDER_C == order ) {
        i = ndims - 1;
        step = -1;
        end_loop = -1;
    } else {
        i = 0;
        step = 1;
        end_loop = ndims;
    }

    /* As we know that ndims is at least 2 we can start by creating the
     * first two dimensions outside the loop, such that we don't have to
     * create a duplicate of the oldtype just to be able to free it.
     */
    ompi_datatype_create_vector( subsize_array[i+step], subsize_array[i], size_array[i],
                                 oldtype, newtype );

    last_type = *newtype;
    size = (MPI_Aint)size_array[i] * (MPI_Aint)size_array[i+step];
    displ = (MPI_Aint)start_array[i] + (MPI_Aint)start_array[i+step] * (MPI_Aint)size_array[i];
    for( i += 2 * step; i != end_loop; i += step ) {
        ompi_datatype_create_hvector( subsize_array[i], 1, size * extent,
                                      last_type, newtype );
        ompi_datatype_destroy( &last_type );

        displ += size * start_array[i];
        size *= size_array[i];
        last_type = *newtype;
    }

 replace_subarray_type:
    /*
     * Resized will only set the soft lb and ub markers without moving the real
     * data inside. Thus, in case the original data contains the hard markers
     * (MPI_LB or MPI_UB) we must force the displacement of the data upward to
     * the right position AND set the hard markers LB and UB.
     *
     * NTH: ompi_datatype_create_resized() does not do enough for the general
     * pack/unpack functions to work correctly. Until this is fixed always use
     * ompi_datatype_create_struct(). Once this is fixed remove 1 || below. To
     * verify that the regression is fixed run the subarray test in the Open MPI
     * ibm testsuite.
     */
    if(1 || oldtype->super.flags & (OPAL_DATATYPE_FLAG_USER_LB | OPAL_DATATYPE_FLAG_USER_UB) ) {
        MPI_Aint displs[3];
        MPI_Datatype types[3];
        int blength[3] = { 1, 1, 1 };

        displs[0] = 0; displs[1] = displ * extent; displs[2] = size * extent;
        types[0] = MPI_LB; types[1] = last_type; types[2] = MPI_UB;
        ompi_datatype_create_struct( 3, blength, displs, types, newtype );
    } else {
        ompi_datatype_create_resized(last_type, displ * extent, size * extent, newtype);
    }
    ompi_datatype_destroy( &last_type );

    return OMPI_SUCCESS;
}
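The public entry point for this routine is MPI_Type_create_subarray. A minimal usage sketch (assumed values): a 2x3 window starting at row 1, column 2 of a 6x8 C-ordered array of doubles.

#include <mpi.h>

MPI_Datatype make_window( void )
{
    MPI_Datatype sub;
    int sizes[2]    = { 6, 8 };
    int subsizes[2] = { 2, 3 };
    int starts[2]   = { 1, 2 };

    MPI_Type_create_subarray( 2, sizes, subsizes, starts,
                              MPI_ORDER_C, MPI_DOUBLE, &sub );
    MPI_Type_commit( &sub );
    return sub;
}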
Example #9
/*
 *	file_open_pvfs2: This is the same strategy as ROMIO's pvfs2 open
 *
 *	Function:	- opens a new file
 *	Accepts:	- same arguments as MPI_File_open()
 *	Returns:	- Success if new file handle
 */
int
mca_fs_pvfs2_file_open (struct ompi_communicator_t *comm,
                        const char* filename,
                        int access_mode,
                        struct ompi_info_t *info,
                        mca_io_ompio_file_t *fh)
{
    int ret;
    mca_fs_pvfs2 *pvfs2_fs;
    PVFS_fs_id pvfs2_id;
    char pvfs2_path[OMPIO_MAX_NAME] = {0};
    char * ncache_timeout;
    open_status o_status = {0, {0, 0}};
    struct ompi_datatype_t *open_status_type;
    struct ompi_datatype_t *types[2] = {&ompi_mpi_int.dt, &ompi_mpi_byte.dt};
    int lens[2] = {1, sizeof(PVFS_object_ref)};
    OPAL_PTRDIFF_TYPE offsets[2];
    char char_stripe[MPI_MAX_INFO_VAL];
    int flag;
    int fs_pvfs2_stripe_size = -1;
    int fs_pvfs2_stripe_width = -1;

    /* We are going to do what ROMIO does with one process resolving
     * the name and broadcasting to others */

    pvfs2_fs = (mca_fs_pvfs2 *) malloc(sizeof(mca_fs_pvfs2));
    if (NULL == pvfs2_fs) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if (!mca_fs_pvfs2_IS_INITIALIZED) {
        /* disable the pvfs2 ncache */
        ncache_timeout = getenv("PVFS2_NCACHE_TIMEOUT");
        if (ncache_timeout == NULL ) {
            setenv("PVFS2_NCACHE_TIMEOUT", "0", 1);
        }
        ret = PVFS_util_init_defaults();
        if (ret < 0) {
            PVFS_perror("PVFS_util_init_defaults", ret);
            return OMPI_ERROR;
        }
        mca_fs_pvfs2_IS_INITIALIZED = 1;
    }

    memset(&(pvfs2_fs->credentials), 0, sizeof(PVFS_credentials));
    PVFS_util_gen_credentials(&(pvfs2_fs->credentials));

    /* check for stripe size and stripe depth in the info object and
       update mca_fs_pvfs2_stripe_width and mca_fs_pvfs2_stripe_size
       before calling fake_an_open() */

    ompi_info_get (info, "stripe_size", MPI_MAX_INFO_VAL, char_stripe, &flag);
    if ( flag ) {
        sscanf ( char_stripe, "%d", &fs_pvfs2_stripe_size );
    }

    ompi_info_get (info, "stripe_width", MPI_MAX_INFO_VAL, char_stripe, &flag);
    if ( flag ) {
        sscanf ( char_stripe, "%d", &fs_pvfs2_stripe_width );
    }

    if (fs_pvfs2_stripe_size < 0) {
        fs_pvfs2_stripe_size = mca_fs_pvfs2_stripe_size;
    }

    if (fs_pvfs2_stripe_width < 0) {
        fs_pvfs2_stripe_width = mca_fs_pvfs2_stripe_width;
    }


    if (OMPIO_ROOT == fh->f_rank) {
        ret = PVFS_util_resolve(filename, &pvfs2_id, pvfs2_path, OMPIO_MAX_NAME);
        if (ret < 0 ) {
            PVFS_perror("PVFS_util_resolve", ret);
            o_status.error = -1;
        }
        else {
            fake_an_open (pvfs2_id,
                          pvfs2_path,
                          access_mode,
                          fs_pvfs2_stripe_width,
                          (PVFS_size)fs_pvfs2_stripe_size,
                          pvfs2_fs,
                          &o_status);
        }
        pvfs2_fs->object_ref = o_status.object_ref;
        fh->f_fs_ptr = pvfs2_fs;
    }

    /* broadcast status and (possibly valid) object reference */
    offsets[0] = (MPI_Aint)(&o_status.error);
    offsets[1] = (MPI_Aint)(&o_status.object_ref);

    ompi_datatype_create_struct (2, lens, offsets, types, &open_status_type);
    ompi_datatype_commit (&open_status_type);

    fh->f_comm->c_coll.coll_bcast (MPI_BOTTOM,
                                   1,
                                   open_status_type,
                                   OMPIO_ROOT,
                                   fh->f_comm,
                                   fh->f_comm->c_coll.coll_bcast_module);

    ompi_datatype_destroy (&open_status_type);

    if (o_status.error != 0) {
        /* No need to free the pvfs2_fs structure, since it will
           be deallocated in file_close in case of an error */
        fh->f_fs_ptr = NULL;
        return OMPI_ERROR;
    }

    pvfs2_fs->object_ref = o_status.object_ref;
    fh->f_fs_ptr = pvfs2_fs;

    /* update the internal ompio structure to store stripe
       size and stripe depth correctly.
       Hadi(to be done): For this read the stripe size and stripe depth from
       the file itself
    */

    if (fs_pvfs2_stripe_size > 0 && fs_pvfs2_stripe_width > 0) {
        fh->f_stripe_size = fs_pvfs2_stripe_size;
        fh->f_stripe_count = fs_pvfs2_stripe_width;
    }

    return OMPI_SUCCESS;
}
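Note the idiom used for the broadcast above: offsets[] holds the absolute addresses of o_status.error and o_status.object_ref, so the committed struct type is used with MPI_BOTTOM as the buffer. A minimal sketch of the same idiom with public MPI calls, assuming a simplified status struct:

#include <mpi.h>

static void bcast_with_absolute_addresses( void )
{
    struct { int error; long ref; } st = { 0, 0 };
    int lens[2] = { 1, 1 };
    MPI_Aint offs[2];
    MPI_Datatype t, types[2] = { MPI_INT, MPI_LONG };

    MPI_Get_address( &st.error, &offs[0] );   /* absolute addresses */
    MPI_Get_address( &st.ref,   &offs[1] );
    MPI_Type_create_struct( 2, lens, offs, types, &t );
    MPI_Type_commit( &t );
    /* MPI_BOTTOM plus absolute displacements addresses st directly */
    MPI_Bcast( MPI_BOTTOM, 1, t, 0, MPI_COMM_WORLD );
    MPI_Type_free( &t );
}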
int32_t ompi_datatype_create_subarray(int ndims,
                                      int const* size_array,
                                      int const* subsize_array,
                                      int const* start_array,
                                      int order,
                                      const ompi_datatype_t* oldtype,
                                      ompi_datatype_t** newtype)
{
    MPI_Datatype last_type;
    int32_t i, step, end_loop;
    MPI_Aint size, displ, extent;

    ompi_datatype_type_extent( oldtype, &extent );

    /* If ndims is zero then return the NULL datatype */
    if( ndims < 2 ) {
        if( 0 == ndims ) {
            *newtype = &ompi_mpi_datatype_null.dt;
            return MPI_SUCCESS;
        }
        ompi_datatype_create_contiguous( subsize_array[0], oldtype, &last_type );
        size = size_array[0];
        displ = start_array[0];
        goto replace_subarray_type;
    }

    if( MPI_ORDER_C == order ) {
        i = ndims - 1;
        step = -1;
        end_loop = -1;
    } else {
        i = 0;
        step = 1;
        end_loop = ndims;
    }

    /* As we know that ndims is at least 2 we can start by creating the
     * first two dimensions outside the loop, such that we don't have to
     * create a duplicate of the oldtype just to be able to free it.
     */
    ompi_datatype_create_vector( subsize_array[i+step], subsize_array[i], size_array[i],
                                 oldtype, newtype );

    last_type = *newtype;
    size = size_array[i] * size_array[i+step];
    displ = start_array[i] + start_array[i+step] * size_array[i];
    for( i += 2 * step; i != end_loop; i += step ) {
        ompi_datatype_create_hvector( subsize_array[i], 1, size * extent,
                                      last_type, newtype );
        ompi_datatype_destroy( &last_type );
        displ += size * start_array[i];
        size *= size_array[i];
        last_type = *newtype;
    }

replace_subarray_type:
    /**
     * We cannot use resized here. Resized will only set the soft lb and ub markers
     * without moving the real data inside. What we need is to force the displacement
     * of the data upward to the right position AND set the LB and UB. A type
     * struct is the function we need.
     */
    {
        MPI_Aint displs[3];
        MPI_Datatype types[3];
        int blength[3] = { 1, 1, 1 };

        displs[0] = 0;
        displs[1] = displ * extent;
        displs[2] = size * extent;
        types[0] = MPI_LB;
        types[1] = last_type;
        types[2] = MPI_UB;
        ompi_datatype_create_struct( 3, blength, displs, types, newtype );
    }
    ompi_datatype_destroy( &last_type );

    return OMPI_SUCCESS;
}
int
mca_fcoll_static_file_write_all (mca_io_ompio_file_t *fh,
                                 void *buf,
                                 int count,
                                 struct ompi_datatype_t *datatype,
                                 ompi_status_public_t *status)
{
    size_t max_data = 0, bytes_per_cycle=0;
    struct iovec *iov=NULL, *decoded_iov=NULL;
    uint32_t iov_count=0, iov_index=0;
    int i=0,j=0,l=0, temp_index;
    int ret=OMPI_SUCCESS, cycles, local_cycles, *bytes_per_process=NULL;
    int index, *disp_index=NULL, **blocklen_per_process=NULL;
    int *iovec_count_per_process=NULL, *displs=NULL;
    size_t total_bytes_written=0;
    MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL;
    MPI_Aint bytes_to_write_in_cycle=0, global_iov_count=0, global_count=0;
    
    local_io_array *local_iov_array =NULL, *global_iov_array=NULL;
    local_io_array *file_offsets_for_agg=NULL;
    int *sorted=NULL, *sorted_file_offsets=NULL, temp_pindex, *temp_disp_index=NULL;
    char *send_buf=NULL, *global_buf=NULL;
    int iov_size=0, current_position=0, *current_index=NULL;
    int *bytes_remaining=NULL, entries_per_aggregator=0;
    ompi_datatype_t **recvtype = NULL;
    MPI_Request *send_req=NULL, *recv_req=NULL;
    /* For creating datatype of type io_array */
    int blocklen[3] = {1, 1, 1};
    int static_num_io_procs=1;
    OPAL_PTRDIFF_TYPE d[3], base;
    ompi_datatype_t *types[3];
    ompi_datatype_t *io_array_type=MPI_DATATYPE_NULL;
    /*----------------------------------------------*/
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0;
    double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0;
    double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0;
    mca_io_ompio_print_entry nentry;
#endif
    
    
#if DEBUG_ON
    MPI_Aint gc_in;
#endif
    
//  if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) {
//    fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY;
//  }
    
    /* In case the data is not contiguous in memory, decode it into an iovec */
    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
        fh->f_decode_datatype ((struct mca_io_ompio_file_t *)fh,
                               datatype,
                               count,
                               buf,
                               &max_data,
                               &decoded_iov,
                               &iov_count);
    }
    else {
        max_data = count * datatype->super.size;
    }
    
    if ( MPI_STATUS_IGNORE != status ) {
        status->_ucount = max_data;
    }
    
    fh->f_get_num_aggregators ( & static_num_io_procs );
    fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *)fh,
				static_num_io_procs,
				max_data);
    
    
    /* io_array datatype for use in communication */
    types[0] = &ompi_mpi_long.dt;
    types[1] = &ompi_mpi_long.dt;
    types[2] = &ompi_mpi_int.dt;
    
    d[0] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0];
    d[1] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].length;
    d[2] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].process_id;
    base = d[0];
    for (i=0 ; i<3 ; i++) {
        d[i] -= base;
    }
    ompi_datatype_create_struct (3,
                                 blocklen,
                                 d,
                                 types,
                                 &io_array_type);
    ompi_datatype_commit (&io_array_type);
    /* #########################################################*/
    ret = fh->f_generate_current_file_view((struct mca_io_ompio_file_t *)fh,
                                           max_data,
                                           &iov,
                                           &iov_size);
    if (ret != OMPI_SUCCESS){
        fprintf(stderr,"Current File View Generation Error\n");
        goto exit;
    }
    
    if (0 == iov_size){
        iov_size  = 1;
    }
    
    local_iov_array = (local_io_array *)malloc (iov_size * sizeof(local_io_array));
    if ( NULL == local_iov_array){
        fprintf(stderr,"local_iov_array allocation error\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
    
    
    for (j=0; j < iov_size; j++){
        local_iov_array[j].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)
            iov[j].iov_base;
        local_iov_array[j].length = (size_t)iov[j].iov_len;
        local_iov_array[j].process_id = fh->f_rank;
        
    }
    
    fh->f_get_bytes_per_agg ( (int *) &bytes_per_cycle);
    
    
    local_cycles = ceil((double)max_data/bytes_per_cycle);
    ret = fh->f_comm->c_coll.coll_allreduce (&local_cycles,
                                             &cycles,
                                             1,
                                             MPI_INT,
                                             MPI_MAX,
                                             fh->f_comm,
                                             fh->f_comm->c_coll.coll_allreduce_module);
    
    if (OMPI_SUCCESS != ret){
        fprintf(stderr,"local cycles allreduce!\n");
        goto exit;
    }
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        
        disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int ));
        if (NULL == bytes_per_process){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        bytes_remaining = (int *) malloc (fh->f_procs_per_group * sizeof(int));
        if (NULL == bytes_remaining){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        current_index = (int *) malloc (fh->f_procs_per_group * sizeof(int));
        if (NULL == current_index){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        displs_per_process = (MPI_Aint **)
            malloc (fh->f_procs_per_group * sizeof (MPI_Aint*));
        
        if (NULL == displs_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        for(i=0;i<fh->f_procs_per_group;i++){
            current_index[i] = 0;
            bytes_remaining[i] =0;
            blocklen_per_process[i] = NULL;
            displs_per_process[i] = NULL;
        }
    }
    
    iovec_count_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int));
    if (NULL == iovec_count_per_process){
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
    
    displs = (int *) malloc (fh->f_procs_per_group * sizeof(int));
    if (NULL == displs){
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
    
    ret = fh->f_allgather_array (&iov_size,
                                 1,
                                 MPI_INT,
                                 iovec_count_per_process,
                                 1,
                                 MPI_INT,
                                 fh->f_aggregator_index,
                                 fh->f_procs_in_group,
                                 fh->f_procs_per_group,
                                 fh->f_comm);
    
    if( OMPI_SUCCESS != ret){
        fprintf(stderr,"iov size allgatherv array!\n");
        goto exit;
    }
    
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        displs[0] = 0;
        global_iov_count = iovec_count_per_process[0];
        for (i=1 ; i<fh->f_procs_per_group ; i++) {
            global_iov_count += iovec_count_per_process[i];
            displs[i] = displs[i-1] + iovec_count_per_process[i-1];
        }
    }
    
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        global_iov_array = (local_io_array *) malloc (global_iov_count *
                                                      sizeof(local_io_array));
        if (NULL == global_iov_array){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }
    
    ret = fh->f_gatherv_array (local_iov_array,
                               iov_size,
                               io_array_type,
                               global_iov_array,
                               iovec_count_per_process,
                               displs,
                               io_array_type,
                               fh->f_aggregator_index,
                               fh->f_procs_in_group,
                               fh->f_procs_per_group,
                               fh->f_comm);
    if (OMPI_SUCCESS != ret){
        fprintf(stderr,"global_iov_array gather error!\n");
        goto exit;
    }
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        
        if ( 0 == global_iov_count){
            global_iov_count =  1;
        }
        
        sorted = (int *)malloc (global_iov_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        local_heap_sort (global_iov_array, global_iov_count, sorted);
    }
    
#if DEBUG_ON
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        for (gc_in=0; gc_in<global_iov_count; gc_in++){
            printf("%d: Offset[%ld]: %lld, Length[%ld]: %ld\n",
                   global_iov_array[gc_in].process_id,
                   gc_in, global_iov_array[gc_in].offset,
                   gc_in, global_iov_array[gc_in].length);
        }
    }
#endif
    
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_exch = MPI_Wtime();
#endif
    
    
    for (index = 0; index < cycles; index++){
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            if (NULL == recvtype){
                recvtype = (ompi_datatype_t **)
                    malloc (fh->f_procs_per_group  * sizeof(ompi_datatype_t *));
                if (NULL == recvtype) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }
            for(l=0;l<fh->f_procs_per_group;l++){
                disp_index[l] =  1;
                if (NULL != blocklen_per_process[l]){
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
                if (NULL != displs_per_process[l]){
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                if (NULL == blocklen_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for blocklen\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l]){
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }
            if (NULL != sorted_file_offsets){
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }
            
            if(NULL != file_offsets_for_agg){
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }
            
            if (NULL != memory_displacements){
                free(memory_displacements);
                memory_displacements = NULL;
            }
            
        }
        if (local_cycles > index) {
            if ((index == local_cycles-1) && (max_data % bytes_per_cycle)) {
                bytes_to_write_in_cycle = max_data % bytes_per_cycle;
            }
            else if (max_data <= bytes_per_cycle) {
                bytes_to_write_in_cycle = max_data;
            }
            else {
                bytes_to_write_in_cycle = bytes_per_cycle;
            }
        }
        else {
            bytes_to_write_in_cycle = 0;
        }
#if DEBUG_ON
        /*    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {*/
        printf ("***%d: CYCLE %d   Bytes %ld**********\n",
                fh->f_rank,
                index,
                bytes_to_write_in_cycle);
        /* }*/
#endif
        /**********************************************************
         **Gather the Data from all the processes at the writers **
         *********************************************************/
        
        /* gather from each process how many bytes each will be sending */
        fh->f_gather_array (&bytes_to_write_in_cycle,
                            1,
                            MPI_INT,
                            bytes_per_process,
                            1,
                            MPI_INT,
                            fh->f_aggregator_index,
                            fh->f_procs_in_group,
                            fh->f_procs_per_group,
                            fh->f_comm);
        
        /*
          Each aggregator needs to collect bytes_to_write_in_cycle from
          every process in its group, which adds up to bytes_per_cycle.
        */
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            for (i=0;i<fh->f_procs_per_group; i++){
/*	    printf("bytes_per_process[%d]: %d\n", i, bytes_per_process[i]);
 */
                
#if DEBUG_ON
                printf ("%d : bytes_per_process : %d\n",
                        fh->f_procs_in_group[i],
                        bytes_per_process[i]);
#endif
                
                while (bytes_per_process[i] > 0){
                    if (get_process_id(global_iov_array[sorted[current_index[i]]].process_id,
                                       fh) == i){ /* current id owns this entry!*/
                        
                        /*Add and subtract length and create
                          blocklength and displs array*/
                        if (bytes_remaining[i]){ /*Remaining bytes in the current entry of
                                                   the global offset array*/
                            if (bytes_remaining[i] <= bytes_per_process[i]){
                                blocklen_per_process[i][disp_index[i] - 1] = bytes_remaining[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);
                                
                                blocklen_per_process[i] = (int *) realloc
                                    ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *)realloc
                                    ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                bytes_per_process[i] -= bytes_remaining[i];
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                bytes_remaining[i] = 0;
                                disp_index[i] += 1;
                                /* This entry has been used up, we need to move to the
                                   next entry of this process and make current_index point there*/
                                current_index[i]  = find_next_index(i,
                                                                    current_index[i],
                                                                    fh,
                                                                    global_iov_array,
                                                                    global_iov_count,
                                                                    sorted);
                                if (current_index[i] == -1){
                                    /* No more entries left, so it's all done! exit! */
                                    break;
                                }
                                continue;
                            }
                            else{
                                blocklen_per_process[i][disp_index[i] - 1] = bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);
                                bytes_remaining[i] -= bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                        }
                        else{
                            if (bytes_per_process[i] <
                                global_iov_array[sorted[current_index[i]]].length){
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;
                                
                                bytes_remaining[i] =
                                    global_iov_array[sorted[current_index[i]]].length -
                                    bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                            else {
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].length;
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;
                                blocklen_per_process[i] =
                                    (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *)realloc
                                    ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                disp_index[i] += 1;
                                bytes_per_process[i] -=
                                    global_iov_array[sorted[current_index[i]]].length;
                                current_index[i] = find_next_index(i,
                                                                   current_index[i],
                                                                   fh,
                                                                   global_iov_array,
                                                                   global_iov_count,
                                                                   sorted);
                                if (current_index[i] == -1){
                                    break;
                                }
                            }
                        }
                    }
                    else{
                        current_index[i] = find_next_index(i,
                                                           current_index[i],
                                                           fh,
                                                           global_iov_array,
                                                           global_iov_count,
                                                           sorted);
                        if (current_index[i] == -1){
                            bytes_per_process[i] = 0; /* no more entries left
                                                         to service this request*/
                            continue;
                        }
                    }
                }
            }
            entries_per_aggregator=0;
            for (i=0;i<fh->f_procs_per_group;i++){
                for (j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0){
                        entries_per_aggregator++;
#if DEBUG_ON
                        printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
                               fh->f_procs_in_group[i],j,
                               blocklen_per_process[i][j],j,
                               displs_per_process[i][j],
                               fh->f_rank);
                        
#endif
                    }
                    
                }
            }
            
            if (entries_per_aggregator > 0){
                file_offsets_for_agg = (local_io_array *)
                    malloc(entries_per_aggregator*sizeof(local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                sorted_file_offsets = (int *)
                    malloc (entries_per_aggregator*sizeof(int));
                if (NULL == sorted_file_offsets){
                    opal_output (1, "OUT OF MEMORY\n");
                    ret =  OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                temp_index = 0;
                for (i=0;i<fh->f_procs_per_group; i++){
                    for(j=0;j<disp_index[i];j++){
                        if (blocklen_per_process[i][j] > 0){
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;
                        }
                    }
                }
            }
            else{
                continue;
            }
            local_heap_sort (file_offsets_for_agg,
                             entries_per_aggregator,
                             sorted_file_offsets);
            
            memory_displacements = (MPI_Aint *) malloc
                (entries_per_aggregator * sizeof(MPI_Aint));
            if (NULL == memory_displacements) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++){
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }
            
            temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            global_count = 0;
            for (i=0;i<entries_per_aggregator;i++){
                temp_pindex =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_pindex] < disp_index[temp_pindex])
                    temp_disp_index[temp_pindex] += 1;
                else{
                    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
                           temp_pindex, temp_disp_index[temp_pindex],
                           temp_pindex, disp_index[temp_pindex]);
                }
                global_count +=
                    file_offsets_for_agg[sorted_file_offsets[i]].length;
            }
            if (NULL != temp_disp_index){
                free(temp_disp_index);
                temp_disp_index = NULL;
            }
            
#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0; i<entries_per_aggregator;i++){
                printf("%d: OFFSET: %lld   LENGTH: %ld, Mem-offset: %ld, disp : %d\n",
                       file_offsets_for_agg[sorted_file_offsets[i]].process_id,
                       file_offsets_for_agg[sorted_file_offsets[i]].offset,
                       file_offsets_for_agg[sorted_file_offsets[i]].length,
                       memory_displacements[sorted_file_offsets[i]],
                       disp_index[ file_offsets_for_agg[sorted_file_offsets[i]].process_id]);
            }
#endif
            
#if DEBUG_ON
            printf("%d: global_count : %ld, bytes_to_write_in_cycle : %ld, procs_per_group: %d\n",
                   fh->f_rank,
                   global_count,
                   bytes_to_write_in_cycle,
                   fh->f_procs_per_group);
#endif
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_comm_time = MPI_Wtime();
#endif
            global_buf  = (char *) malloc (global_count);
            if (NULL == global_buf){
                opal_output(1, "OUT OF MEMORY");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            
            recv_req = (MPI_Request *)
                malloc (fh->f_procs_per_group * sizeof(MPI_Request));
            if (NULL == recv_req){
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            for (i=0;i<fh->f_procs_per_group; i++){
                ompi_datatype_create_hindexed(disp_index[i],
                                              blocklen_per_process[i],
                                              displs_per_process[i],
                                              MPI_BYTE,
                                              &recvtype[i]);
                ompi_datatype_commit(&recvtype[i]);
                ret = MCA_PML_CALL(irecv(global_buf,
                                         1,
                                         recvtype[i],
                                         fh->f_procs_in_group[i],
                                         123,
                                         fh->f_comm,
                                         &recv_req[i]));
                if (OMPI_SUCCESS != ret){
                    fprintf(stderr,"irecv Error!\n");
                    goto exit;
                }
            }
        }
        
        if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
            send_buf = &((char*)buf)[total_bytes_written];
        }
        else if (bytes_to_write_in_cycle) {
            /* allocate a send buffer and copy the data that needs
               to be sent into it in case the data is non-contiguous
               in memory */
            OPAL_PTRDIFF_TYPE mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;
            
            send_buf = malloc (bytes_to_write_in_cycle);
            if (NULL == send_buf) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            remaining = bytes_to_write_in_cycle;
            
            while (remaining) {
                mem_address = (OPAL_PTRDIFF_TYPE)
                    (decoded_iov[iov_index].iov_base) + current_position;
                
                if (remaining >=
                    (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *)mem_address,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                        (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                        (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *)mem_address,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }
        }
        total_bytes_written += bytes_to_write_in_cycle;
        
        send_req = (MPI_Request *) malloc (sizeof(MPI_Request));
        if (NULL == send_req){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        ret = MCA_PML_CALL(isend(send_buf,
                                 bytes_to_write_in_cycle,
                                 MPI_BYTE,
                                 fh->f_procs_in_group[fh->f_aggregator_index],
                                 123,
                                 MCA_PML_BASE_SEND_STANDARD,
                                 fh->f_comm,
                                 send_req));
        
        if ( OMPI_SUCCESS != ret ){
            fprintf(stderr,"isend error!\n");
            goto exit;
        }
        
        ret = ompi_request_wait (send_req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }
        
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            ret = ompi_request_wait_all (fh->f_procs_per_group,
                                         recv_req,
                                         MPI_STATUSES_IGNORE);
            if (OMPI_SUCCESS != ret){
                goto exit;
            }
            
#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){
                for (i=0 ; i<global_count/4 ; i++)
                    printf (" RECV %d \n",((int *)global_buf)[i]);
            }
#endif
        }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_comm_time = MPI_Wtime();
        comm_time += end_comm_time - start_comm_time;
#endif

        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
                (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            fh->f_num_of_io_entries = 0;
            /*First entry for every aggregator*/
            fh->f_io_array[fh->f_num_of_io_entries].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[fh->f_num_of_io_entries].length =
                file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;
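            /* Merge any entry whose file offset starts exactly where the
               previous one ends, so the fbtl layer sees as few I/O
               requests as possible. */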
            for (i=1;i<entries_per_aggregator;i++){
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                    file_offsets_for_agg[sorted_file_offsets[i]].offset){
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else {
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }
            }
#if DEBUG_ON
            printf("*************************** %d\n", fh->f_num_of_io_entries);
            for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
                printf(" ADDRESS: %p  OFFSET: %ld   LENGTH: %ld\n",
                       fh->f_io_array[i].memory_address,
                       (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset,
                       fh->f_io_array[i].length);
            }
#endif
            
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_write_time = MPI_Wtime();
#endif
            
            if (fh->f_num_of_io_entries) {
                if ( 0 >  fh->f_fbtl->fbtl_pwritev (fh)) {
                    opal_output (1, "WRITE FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }
            
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_write_time = MPI_Wtime();
            write_time += end_write_time - start_write_time;
#endif
            
        }
        if (NULL != send_req){
            free(send_req);
            send_req = NULL;
        }
        
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            fh->f_num_of_io_entries = 0;
            if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
            for (i = 0; i < fh->f_procs_per_group; i++)
                ompi_datatype_destroy(recvtype+i);
            if (NULL != recvtype){
                free(recvtype);
                recvtype=NULL;
            }
            if (NULL != recv_req){
                free(recv_req);
                recv_req = NULL;
            }
            if (NULL != global_buf) {
                free (global_buf);
                global_buf = NULL;
            }
        }
    }
    
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_exch = MPI_Wtime();
    exch_write += end_exch - start_exch;
    nentry.time[0] = write_time;
    nentry.time[1] = comm_time;
    nentry.time[2] = exch_write;
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank)
        nentry.aggregator = 1;
    else
        nentry.aggregator = 0;
    nentry.nprocs_for_coll = static_num_io_procs;
    if (!fh->f_full_print_queue(WRITE_PRINT_QUEUE)){
        fh->f_register_print_entry(WRITE_PRINT_QUEUE,
                                   nentry);
    }
#endif

exit:
    if (NULL != decoded_iov){
        free(decoded_iov);
        decoded_iov = NULL;
    }
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        
        if (NULL != local_iov_array){
            free(local_iov_array);
            local_iov_array = NULL;
        }
        for(l=0;l<fh->f_procs_per_group;l++){
            if (NULL != blocklen_per_process[l]){
                free(blocklen_per_process[l]);
                blocklen_per_process[l] = NULL;
            }
            if (NULL != displs_per_process[l]){
                free(displs_per_process[l]);
                displs_per_process[l] = NULL;
            }
        }
    }
    
    if (NULL != send_buf){
        free(send_buf);
        send_buf = NULL;
    }
    
    if (NULL != global_buf){
        free(global_buf);
        global_buf = NULL;
    }
    
    if (NULL != recvtype){
        free(recvtype);
        recvtype = NULL;
    }
    
    if (NULL != sorted_file_offsets){
        free(sorted_file_offsets);
        sorted_file_offsets = NULL;
    }
    
    if (NULL != file_offsets_for_agg){
        free(file_offsets_for_agg);
        file_offsets_for_agg = NULL;
    }
    
    if (NULL != memory_displacements){
        free(memory_displacements);
        memory_displacements = NULL;
    }
    
    if (NULL != displs_per_process){
        free(displs_per_process);
        displs_per_process = NULL;
    }
    
    if (NULL != blocklen_per_process){
        free(blocklen_per_process);
        blocklen_per_process = NULL;
    }
    
    if(NULL != current_index){
        free(current_index);
        current_index = NULL;
    }
    
    if(NULL != bytes_remaining){
        free(bytes_remaining);
        bytes_remaining = NULL;
    }
    
    if (NULL != disp_index){
        free(disp_index);
        disp_index = NULL;
    }
    
    if (NULL != sorted) {
        free(sorted);
        sorted = NULL;
    }
    
    return ret;
}
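/*
 * A minimal standalone sketch (hypothetical types and names, not part of
 * the original source) of the coalescing rule used when building
 * fh->f_io_array above: entries whose file offsets are back-to-back are
 * merged into a single I/O request before being handed to the fbtl layer.
 */
#include <stddef.h>

typedef struct {
    long   offset;   /* file offset of the chunk */
    size_t length;   /* number of bytes */
} chunk_t;

/* Merge adjacent chunks of `in` (assumed sorted by offset) into `out`;
   returns the number of merged entries. `out` must hold at least n slots. */
static int coalesce_chunks(const chunk_t *in, int n, chunk_t *out)
{
    int m = 0;
    if (n <= 0) {
        return 0;
    }
    out[m++] = in[0];
    for (int i = 1; i < n; i++) {
        if (out[m-1].offset + (long)out[m-1].length == in[i].offset) {
            out[m-1].length += in[i].length;   /* contiguous: extend previous */
        } else {
            out[m++] = in[i];                  /* gap: start a new entry */
        }
    }
    return m;
}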
Exemple #12
static int unpack_ooo(void)
{
    ompi_datatype_t * t1;
    ompi_datatype_t * t2;
    ompi_datatype_t * type[2];
    ompi_datatype_t * newtype;
    MPI_Aint disp[2];
    int len[2], rc;

    rc = ompi_datatype_create_vector(2, 1, 2, MPI_INT, &t1);
    if (OMPI_SUCCESS != rc) {
        fprintf(stderr, "could not create vector t1\n");
        return 1;
    }
    rc = ompi_datatype_commit (&t1);
    if (OMPI_SUCCESS != rc) {
        fprintf(stderr, "could not commit vector t1\n");
        return 1;
    }

    rc = ompi_datatype_create_vector(2, 1, 2, MPI_DOUBLE, &t2);
    if (OMPI_SUCCESS != rc) {
        fprintf(stderr, "could not create vector t2\n");
        return 1;
    }
    rc = ompi_datatype_commit (&t2);
    if (OMPI_SUCCESS != rc) {
        fprintf(stderr, "could not commit vector t2\n");
        return 1;
    }

/*
 * btl=0x7f7823672580 bytes_received=992 data_offset=0
 * btl=0x7f7823260420 bytes_received=1325 data_offset=992
 * btl=0x7f7823672580 bytes_received=992 data_offset=2317
 * btl=0x7f7823672580 bytes_received=992 data_offset=3309
 * btl=0x7f7823672580 bytes_received=992 data_offset=4301
 * btl=0x7f7823672580 bytes_received=992 data_offset=5293
 * btl=0x7f7823672580 bytes_received=992 data_offset=6285
 * btl=0x7f7823672580 bytes_received=667 data_offset=7277
 */
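/* Each table below lists (bytes_received, data_offset) pairs in arrival
   order; the {0, -1} pair acts as the end-of-table sentinel. */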
    size_t test1[9][2] = {
        {992, 0},
        {1325, 992},
        {992, 2317},
        {992, 3309},
        {992, 4301},
        {992, 5293},
        {992, 6285},
        {667, 7277},
        {0, -1},
    };

/*
 * btl=0x7f80bc545580 bytes_received=992 data_offset=0
 * btl=0x7f80bc545580 bytes_received=992 data_offset=2317
 * btl=0x7f80bc545580 bytes_received=992 data_offset=3309
 * btl=0x7f80bc545580 bytes_received=992 data_offset=4301
 * btl=0x7f80bc545580 bytes_received=992 data_offset=5293
 * btl=0x7f80bc545580 bytes_received=992 data_offset=6285
 * btl=0x7f80bc133420 bytes_received=1325 data_offset=992
 * btl=0x7f80bc545580 bytes_received=667 data_offset=7277
 */
    size_t test2[9][2] = {
        {992, 0},
        {992, 2317},
        {992, 3309},
        {992, 4301},
        {992, 5293},
        {992, 6285},
        {1325, 992},
        {667, 7277},
        {0, -1},
    };

/* trimmed version of test2 */
    size_t test3[9][2] = {
        {992, 0},
        {4960, 2317},
        {1325, 992},
        {667, 7277},
        {0, -1},
    };

/* another test case */
    size_t test4[9][2] = {
        {992, 0},
        {992, 2976},
        {992, 1984},
        {992, 992},
        {3976, 3968},
        {0, -1},
    };

    disp[0] = (long)(&foo.i[0]) - (long)&foo;
    disp[1] = (long)(&foo.d[0]) - (long)&foo;

    type[0] = t1;
    type[1] = t2;

    len[0] = 1;
    len[1] = 1;

    rc = ompi_datatype_create_struct(2, len, disp, type, &newtype);
    if (OMPI_SUCCESS != rc) {
        fprintf(stderr, "could not create struct\n");
        return 1;
    }
    rc = ompi_datatype_commit (&newtype);
    if (OMPI_SUCCESS != rc) {
        fprintf(stderr, "could not create struct\n");
        return 1;
    }
    
    pbar = (struct pfoo_t *)malloc (N * sizeof(struct pfoo_t));
    if (NULL == pbar) {
        fprintf(stderr, "could not malloc pbar\n");
        return 1;
    }
    bar = (struct foo_t *)malloc (N * sizeof(struct foo_t));
    if (NULL == bar) {
        fprintf(stderr, "could not malloc bar\n");
        return 1;
    }

    if (0 != testcase(newtype, test1)) {
        printf ("test1 failed\n");
        return 2;
    }

    if (0 != testcase(newtype, test2)) {
        printf ("test2 failed\n");
        return 2;
    }

    if (0 != testcase(newtype, test3)) {
        printf ("test3 failed\n");
        return 2;
    }

    if (0 != testcase(newtype, test4)) {
        printf ("test4 failed\n");
        return 2;
    }


    /* test the automatic destruction of the data */
    ompi_datatype_destroy( &newtype ); assert( newtype == NULL );

    return rc;
}
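/*
 * A minimal sketch (hypothetical helper, not part of the original test) of
 * how a driver such as testcase() above can replay one of the segment
 * tables: each (bytes_received, data_offset) pair is fed to a partial
 * unpack until the {0, -1} sentinel row is reached.
 */
#include <stddef.h>

static void replay_segments(size_t segments[][2],
                            void (*unpack_fragment)(size_t len, size_t off))
{
    for (int i = 0; segments[i][0] != 0; i++) {
        unpack_fragment(segments[i][0], segments[i][1]);
    }
}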
int mca_common_ompio_set_file_defaults (mca_io_ompio_file_t *fh)
{

   if (NULL != fh) {
        ompi_datatype_t *types[2];
        int blocklen[2] = {1, 1};
        OPAL_PTRDIFF_TYPE d[2], base;
        int i;

        fh->f_io_array = NULL;
        fh->f_perm = OMPIO_PERM_NULL;
        fh->f_flags = 0;
        fh->f_bytes_per_agg = mca_io_ompio_bytes_per_agg;
        fh->f_datarep = strdup ("native");

        fh->f_offset = 0;
        fh->f_disp = 0;
        fh->f_position_in_file_view = 0;
        fh->f_index_in_file_view = 0;
        fh->f_total_bytes = 0;

        fh->f_init_procs_per_group = -1;
        fh->f_init_procs_in_group = NULL;

        fh->f_procs_per_group = -1;
        fh->f_procs_in_group = NULL;

        fh->f_init_num_aggrs = -1;
        fh->f_init_aggr_list = NULL;


        /* Default file View */
        fh->f_iov_type = MPI_DATATYPE_NULL;
        fh->f_stripe_size = mca_io_ompio_bytes_per_agg;
        /* Decoded iovec of the file-view */
        fh->f_decoded_iov = NULL;
        fh->f_etype = NULL;
        fh->f_filetype = NULL;
        fh->f_orig_filetype = NULL;

        mca_common_ompio_set_view(fh,
                                  0,
                                  &ompi_mpi_byte.dt,
                                  &ompi_mpi_byte.dt,
                                  "native",
                                  fh->f_info);


        /* Create a derived datatype describing one entry of the decoded iovec */
        types[0] = &ompi_mpi_long.dt;
        types[1] = &ompi_mpi_long.dt;

        d[0] = (OPAL_PTRDIFF_TYPE) fh->f_decoded_iov;
        d[1] = (OPAL_PTRDIFF_TYPE) &fh->f_decoded_iov[0].iov_len;

        base = d[0];
        for (i=0 ; i<2 ; i++) {
            d[i] -= base;
        }

        ompi_datatype_create_struct (2,
                                     blocklen,
                                     d,
                                     types,
                                     &fh->f_iov_type);
        ompi_datatype_commit (&fh->f_iov_type);

        return OMPI_SUCCESS;
    }
    else {
        return OMPI_ERROR;
    }
}
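/*
 * A minimal standalone sketch (not from the original source) of the same
 * pattern at the public MPI level: build a struct datatype whose layout
 * matches struct iovec, so an array of decoded iovec entries can be
 * communicated directly. Using MPI_AINT for both members assumes that
 * pointers and size_t have the same width as MPI_Aint on this platform.
 */
#include <mpi.h>
#include <sys/uio.h>

static MPI_Datatype make_iovec_type(void)
{
    struct iovec sample;
    int blocklen[2] = {1, 1};
    MPI_Aint disp[2], base;
    MPI_Datatype types[2] = {MPI_AINT, MPI_AINT};
    MPI_Datatype newtype;

    /* displacements of the two members relative to the struct base */
    MPI_Get_address(&sample, &base);
    MPI_Get_address(&sample.iov_base, &disp[0]);
    MPI_Get_address(&sample.iov_len, &disp[1]);
    disp[0] -= base;
    disp[1] -= base;

    MPI_Type_create_struct(2, blocklen, disp, types, &newtype);
    MPI_Type_commit(&newtype);
    return newtype;
}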
int
mca_fcoll_static_file_read_all (mca_io_ompio_file_t *fh,
                                void *buf,
                                int count,
                                struct ompi_datatype_t *datatype,
                                ompi_status_public_t *status)
{

    int ret = OMPI_SUCCESS, iov_size=0, *bytes_remaining=NULL;
    int i, j, l,cycles=0, local_cycles=0, *current_index=NULL;
    int index, *disp_index=NULL, *bytes_per_process=NULL, current_position=0;
    int **blocklen_per_process=NULL, *iovec_count_per_process=NULL;
    int *displs=NULL, *sorted=NULL ,entries_per_aggregator=0;
    int *sorted_file_offsets=NULL, temp_index=0, position=0, *temp_disp_index=NULL;


    MPI_Aint **displs_per_process=NULL, global_iov_count=0, global_count=0;
    MPI_Aint *memory_displacements=NULL;
    int bytes_to_read_in_cycle=0;
    size_t max_data=0, bytes_per_cycle=0;
    uint32_t iov_count=0, iov_index=0;
    struct iovec *decoded_iov=NULL, *iov=NULL;
    mca_fcoll_static_local_io_array *local_iov_array=NULL, *global_iov_array=NULL;
    mca_fcoll_static_local_io_array *file_offsets_for_agg=NULL;

    char *global_buf=NULL, *receive_buf=NULL;

    int blocklen[3] = {1, 1, 1};
    int static_num_io_procs=1;
    OPAL_PTRDIFF_TYPE d[3], base;
    ompi_datatype_t *types[3];
    ompi_datatype_t *io_array_type=MPI_DATATYPE_NULL;
    ompi_datatype_t **sendtype = NULL;
    MPI_Request *send_req=NULL, recv_req=NULL;
    int my_aggregator=-1;
    bool recvbuf_is_contiguous=false;
    size_t ftype_size;
    OPAL_PTRDIFF_TYPE ftype_extent, lb;

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0;
    double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0;
    double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0;
    mca_common_ompio_print_entry nentry;
#endif
#if DEBUG_ON
    MPI_Aint gc_in;
#endif
    opal_datatype_type_size ( &datatype->super, &ftype_size );
    opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent );

    /**************************************************************************
     ** 1.  In case the data is not contiguous in memory, decode it into an iovec
     **************************************************************************/
    if ( ( ftype_extent == (OPAL_PTRDIFF_TYPE) ftype_size)             &&
         opal_datatype_is_contiguous_memory_layout(&datatype->super,1) &&
         0 == lb ) {
        recvbuf_is_contiguous = true;
    }


    /* In case the data is not contiguous in memory, decode it into an iovec */
    if (!recvbuf_is_contiguous  ) {
        fh->f_decode_datatype ( (struct mca_io_ompio_file_t *)fh,
                                datatype,
                                count,
                                buf,
                                &max_data,
                                &decoded_iov,
                                &iov_count);
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
        status->_ucount = max_data;
    }


    fh->f_get_num_aggregators ( &static_num_io_procs );
    fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh,
                                static_num_io_procs,
                                max_data);
    my_aggregator = fh->f_procs_in_group[fh->f_aggregator_index];

    /*  printf("max_data %ld\n", max_data);  */
    ret = fh->f_generate_current_file_view((struct mca_io_ompio_file_t *)fh,
                                           max_data,
                                           &iov,
                                           &iov_size);
    if (ret != OMPI_SUCCESS){
        goto exit;
    }

    if ( iov_size > 0 ) {
        local_iov_array = (mca_fcoll_static_local_io_array *)malloc (iov_size * sizeof(mca_fcoll_static_local_io_array));
        if ( NULL == local_iov_array){
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }


        for (j=0; j < iov_size; j++){
            local_iov_array[j].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)
                iov[j].iov_base;
            local_iov_array[j].length = (size_t)iov[j].iov_len;
            local_iov_array[j].process_id = fh->f_rank;

        }
    }
    else {
        /* Allocate at least one element to correctly create the derived
           data type */
        local_iov_array = (mca_fcoll_static_local_io_array *)malloc (sizeof(mca_fcoll_static_local_io_array));
        if ( NULL == local_iov_array){
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }


        local_iov_array[0].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) 0;
        local_iov_array[0].length = (size_t) 0;
        local_iov_array[0].process_id = fh->f_rank;
    }

    d[0] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0];
    d[1] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].length;
    d[2] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].process_id;
    base = d[0];
    for (i=0 ; i<3 ; i++) {
        d[i] -= base;
    }

    /* io_array datatype used for communication */
    types[0] = &ompi_mpi_long.dt;
    types[1] = &ompi_mpi_long.dt;
    types[2] = &ompi_mpi_int.dt;

    ompi_datatype_create_struct (3,
                                 blocklen,
                                 d,
                                 types,
                                 &io_array_type);
    ompi_datatype_commit (&io_array_type);

    /* #########################################################*/
    fh->f_get_bytes_per_agg ( (int*) &bytes_per_cycle);
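    /* Each process moves at most bytes_per_cycle / f_procs_per_group bytes
       per cycle, hence the number of local cycles below; the allreduce
       then agrees on the global maximum. */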
    local_cycles = ceil((double)max_data*fh->f_procs_per_group/bytes_per_cycle);

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fh->f_comm->c_coll.coll_allreduce (&local_cycles,
                                             &cycles,
                                             1,
                                             MPI_INT,
                                             MPI_MAX,
                                             fh->f_comm,
                                             fh->f_comm->c_coll.coll_allreduce_module);

    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif


    if (my_aggregator == fh->f_rank) {
        disp_index = (int *) malloc (fh->f_procs_per_group * sizeof(int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int ));
        if (NULL == bytes_per_process){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        bytes_remaining = (int *) calloc (fh->f_procs_per_group, sizeof(int));
        if (NULL == bytes_remaining){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        current_index = (int *) calloc (fh->f_procs_per_group, sizeof(int));
        if (NULL == current_index){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        blocklen_per_process = (int **)calloc (fh->f_procs_per_group, sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        displs_per_process = (MPI_Aint **)calloc (fh->f_procs_per_group, sizeof (MPI_Aint*));
        if (NULL == displs_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }


    iovec_count_per_process = (int *) calloc (fh->f_procs_per_group, sizeof(int));
    if (NULL == iovec_count_per_process){
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    displs = (int *) calloc (fh->f_procs_per_group, sizeof(int));
    if (NULL == displs){
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fcoll_base_coll_allgather_array (&iov_size,
                                           1,
                                           MPI_INT,
                                           iovec_count_per_process,
                                           1,
                                           MPI_INT,
                                           fh->f_aggregator_index,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);

    if( OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

    if (my_aggregator == fh->f_rank) {
        displs[0] = 0;
        global_iov_count = iovec_count_per_process[0];
        for (i=1 ; i<fh->f_procs_per_group ; i++) {
            global_iov_count += iovec_count_per_process[i];
            displs[i] = displs[i-1] + iovec_count_per_process[i-1];
        }
    }


    if ( (my_aggregator == fh->f_rank) &&
         (global_iov_count >  0 )) {
        global_iov_array = (mca_fcoll_static_local_io_array *) malloc (global_iov_count *
                                                      sizeof(mca_fcoll_static_local_io_array));
        if (NULL == global_iov_array){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fcoll_base_coll_gatherv_array (local_iov_array,
                                         iov_size,
                                         io_array_type,
                                         global_iov_array,
                                         iovec_count_per_process,
                                         displs,
                                         io_array_type,
                                         fh->f_aggregator_index,
                                         fh->f_procs_in_group,
                                         fh->f_procs_per_group,
                                         fh->f_comm);

    if (OMPI_SUCCESS != ret){
        fprintf(stderr,"global_iov_array gather error!\n");
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif


    if (NULL != local_iov_array){
        free(local_iov_array);
        local_iov_array = NULL;
    }

    if ( ( my_aggregator == fh->f_rank) &&
         ( global_iov_count > 0 )) {
        sorted = (int *)malloc (global_iov_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        read_local_heap_sort (global_iov_array, global_iov_count, sorted);

        send_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request));
        if (NULL == send_req){
            opal_output ( 1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        sendtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *));
        if (NULL == sendtype) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        for ( i=0; i<fh->f_procs_per_group; i++ ) {
            sendtype[i] = MPI_DATATYPE_NULL;
        }

        if (NULL == bytes_per_process){
            bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int));
            if (NULL == bytes_per_process){
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }
    }

#if DEBUG_ON

    if (my_aggregator == fh->f_rank) {
        for (gc_in=0; gc_in<global_iov_count; gc_in++){
            printf("%d: Offset[%ld]: %lld, Length[%ld]: %ld\n",
                   global_iov_array[sorted[gc_in]].process_id,
                   gc_in, global_iov_array[sorted[gc_in]].offset,
                   gc_in, global_iov_array[sorted[gc_in]].length);
        }
    }
#endif

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif

    for (index = 0; index < cycles; index++){

        if (my_aggregator == fh->f_rank) {

            fh->f_num_of_io_entries = 0;
            if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
            if (NULL != global_buf) {
                free (global_buf);
                global_buf = NULL;
            }

            if (NULL != sorted_file_offsets){
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }
            if (NULL != file_offsets_for_agg){
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }
            if (NULL != memory_displacements){
                free(memory_displacements);
                memory_displacements= NULL;
            }

            if ( NULL != sendtype ) {
                for ( i=0; i<fh->f_procs_per_group; i++ ) {
                    if ( MPI_DATATYPE_NULL != sendtype[i] ) {
                        ompi_datatype_destroy (&sendtype[i] );
                        sendtype[i] = MPI_DATATYPE_NULL;
                    }
                }
            }

            for(l=0;l<fh->f_procs_per_group;l++){
                disp_index[l] =  1;
                if (NULL != blocklen_per_process[l]){
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
                if (NULL != displs_per_process[l]){
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                if (NULL == blocklen_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for blocklen\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l]){
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }
        }

        if (index < local_cycles ) {
            if ((index == local_cycles-1) && (max_data % (bytes_per_cycle/fh->f_procs_per_group))) {
                bytes_to_read_in_cycle = max_data - position;
            }
            else if (max_data <= bytes_per_cycle/fh->f_procs_per_group) {
                bytes_to_read_in_cycle = max_data;
            }
            else {
                bytes_to_read_in_cycle = bytes_per_cycle/fh->f_procs_per_group;
            }
        }
        else {
            bytes_to_read_in_cycle = 0;
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        start_rcomm_time = MPI_Wtime();
#endif
        fcoll_base_coll_gather_array (&bytes_to_read_in_cycle,
                                      1,
                                      MPI_INT,
                                      bytes_per_process,
                                      1,
                                      MPI_INT,
                                      fh->f_aggregator_index,
                                      fh->f_procs_in_group,
                                      fh->f_procs_per_group,
                                      fh->f_comm);

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time  += end_rcomm_time - start_rcomm_time;
#endif

        if (recvbuf_is_contiguous ) {
            receive_buf = &((char*)buf)[position];
        }
        else if (bytes_to_read_in_cycle) {
            receive_buf = (char *) malloc (bytes_to_read_in_cycle * sizeof(char));
            if ( NULL == receive_buf){
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }


#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        start_rcomm_time = MPI_Wtime();
#endif

        ret = MCA_PML_CALL(irecv(receive_buf,
                                 bytes_to_read_in_cycle,
                                 MPI_BYTE,
                                 my_aggregator,
                                 123,
                                 fh->f_comm,
                                 &recv_req));
        if (OMPI_SUCCESS != ret){
            goto exit;
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time  += end_rcomm_time - start_rcomm_time;
#endif


        if (my_aggregator == fh->f_rank) {
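            /* For every group member i, split its share of this cycle
               (bytes_per_process[i]) across the sorted global iovec entries
               it owns: current_index[i] tracks its next entry,
               bytes_remaining[i] the unconsumed tail of the current entry,
               and disp_index[i] the number of (blocklen, displacement)
               pairs built so far. */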
            for (i=0;i<fh->f_procs_per_group; i++){
                while (bytes_per_process[i] > 0){
                    /*printf("%d: bytes_per_process[%d]: %d, bytes_remaining[%d]: %d\n",
                      index, i, bytes_per_process[i], i, bytes_remaining[i]);*/
                    if (read_get_process_id(global_iov_array[sorted[current_index[i]]].process_id,
                                            fh) == i){ /* current id owns this entry!*/
                        if (bytes_remaining[i]){ /*Remaining bytes in the current entry of
                                                   the global offset array*/
                            if (bytes_remaining[i] <= bytes_per_process[i]){

                                blocklen_per_process[i][disp_index[i] - 1] = bytes_remaining[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);
                                blocklen_per_process[i] = (int *) realloc
                                    ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *)realloc
                                    ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                bytes_per_process[i] -= bytes_remaining[i];
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                disp_index[i] += 1;
                                bytes_remaining[i] = 0;
                                /* This entry has been used up, we need to move to the
                                   next entry of this process and make current_index point there*/
                                current_index[i]  = read_find_next_index(i,
                                                                         current_index[i],
                                                                         fh,
                                                                         global_iov_array,
                                                                         global_iov_count,
                                                                         sorted);
                                if (current_index[i] == -1){
                                    break;
                                }
                                continue;
                            }
                            else{
                                blocklen_per_process[i][disp_index[i] - 1] = bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);
                                bytes_remaining[i] -= bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                        }
                        else{
                            if (bytes_per_process[i] <
                                global_iov_array[sorted[current_index[i]]].length){
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;
                                bytes_remaining[i] =
                                    global_iov_array[sorted[current_index[i]]].length -
                                    bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                            else {
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].length;
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;
                                blocklen_per_process[i] =
                                    (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *)realloc
                                    ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                disp_index[i] += 1;
                                bytes_per_process[i] -=
                                    global_iov_array[sorted[current_index[i]]].length;
                                current_index[i] = read_find_next_index(i,
                                                                        current_index[i],
                                                                        fh,
                                                                        global_iov_array,
                                                                        global_iov_count,
                                                                        sorted);
                                if (current_index[i] == -1){
                                    break;
                                }
                            }
                        }
                    }
                    else{
                        current_index[i] = read_find_next_index(i,
                                                                current_index[i],
                                                                fh,
                                                                global_iov_array,
                                                                global_iov_count,
                                                                sorted);
                        if (current_index[i] == -1){
                            bytes_per_process[i] = 0; /* no more entries left
                                                         to service this request*/
                            continue;
                        }
                    }
                }
            }

            entries_per_aggregator=0;
            for (i=0;i<fh->f_procs_per_group;i++){
                for (j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0){
                        entries_per_aggregator++;
#if DEBUG_ON
                        printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
                               fh->f_procs_in_group[i],j,
                               blocklen_per_process[i][j],j,
                               displs_per_process[i][j],
                               fh->f_rank);

#endif
                    }
                }
            }

            if (entries_per_aggregator > 0){
                file_offsets_for_agg = (mca_fcoll_static_local_io_array *)
                    malloc(entries_per_aggregator*sizeof(mca_fcoll_static_local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                sorted_file_offsets = (int *) malloc (entries_per_aggregator * sizeof(int));
                if (NULL == sorted_file_offsets){
                    opal_output (1, "OUT OF MEMORY\n");
                    ret =  OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                temp_index=0;
                global_count = 0;
                for (i=0;i<fh->f_procs_per_group; i++){
                    for(j=0;j<disp_index[i]; j++){
                        if (blocklen_per_process[i][j] > 0){
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            global_count += blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;
                        }
                    }
                }
            }
            else{
                continue;
            }
            read_local_heap_sort (file_offsets_for_agg,
                                  entries_per_aggregator,
                                  sorted_file_offsets);
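            /* Assign each entry its position inside global_buf in ascending
               file-offset order, so the aggregated data can be read with a
               few large, contiguous file accesses. */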
            memory_displacements = (MPI_Aint *) malloc
                (entries_per_aggregator * sizeof(MPI_Aint));
            if (NULL == memory_displacements) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++){
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }

            global_buf = (char *) malloc (global_count * sizeof(char));
            if (NULL == global_buf){
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0; i<entries_per_aggregator;i++){
                printf("%d: OFFSET: %lld   LENGTH: %ld, Mem-offset: %ld, disp_index :%d\n",
                       file_offsets_for_agg[sorted_file_offsets[i]].process_id,
                       file_offsets_for_agg[sorted_file_offsets[i]].offset,
                       file_offsets_for_agg[sorted_file_offsets[i]].length,
                       memory_displacements[sorted_file_offsets[i]],
                       disp_index[i]);
            }
#endif

            fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
                (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret =  OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            fh->f_num_of_io_entries = 0;
            fh->f_io_array[0].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[0].length = file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[0].memory_address = global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;
            for (i=1;i<entries_per_aggregator;i++){
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                    file_offsets_for_agg[sorted_file_offsets[i]].offset){
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else{
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }
            }

#if DEBUG_ON
            printf("*************************** %d\n", fh->f_num_of_io_entries);
            for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
                printf(" ADDRESS: %p  OFFSET: %ld   LENGTH: %ld\n",
                       fh->f_io_array[i].memory_address,
                       (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset,
                       fh->f_io_array[i].length);
            }
#endif
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_read_time = MPI_Wtime();
#endif

            if (fh->f_num_of_io_entries) {
                if ( 0 > fh->f_fbtl->fbtl_preadv (fh)) {
                    opal_output (1, "READ FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_read_time = MPI_Wtime();
            read_time += end_read_time - start_read_time;
#endif


#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            if (my_aggregator == fh->f_rank){
                for (i=0 ; i<global_count/4 ; i++)
                    printf (" READ %d \n",((int *)global_buf)[i]);
            }
#endif
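            /* Rewrite displs_per_process from file offsets to memory
               displacements inside global_buf, so the same (blocklen,
               displacement) pairs now describe the send side of the
               scatter-back to the group members. */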

            temp_disp_index = (int *) calloc (fh->f_procs_per_group, sizeof(int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            for (i=0; i<entries_per_aggregator; i++){
                temp_index =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_index][temp_disp_index[temp_index]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_index] < disp_index[temp_index]){
                    temp_disp_index[temp_index] += 1;
                }
                else{
                    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
                           temp_index, temp_disp_index[temp_index],
                           temp_index, disp_index[temp_index]);
                }
            }
            if (NULL != temp_disp_index){
                free(temp_disp_index);
                temp_disp_index = NULL;
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_rcomm_time = MPI_Wtime();
#endif

            for (i=0;i<fh->f_procs_per_group; i++){
                send_req[i] = MPI_REQUEST_NULL;
                ompi_datatype_create_hindexed(disp_index[i],
                                              blocklen_per_process[i],
                                              displs_per_process[i],
                                              MPI_BYTE,
                                              &sendtype[i]);
                ompi_datatype_commit(&sendtype[i]);
                ret = MCA_PML_CALL (isend(global_buf,
                                          1,
                                          sendtype[i],
                                          fh->f_procs_in_group[i],
                                          123,
                                          MCA_PML_BASE_SEND_STANDARD,
                                          fh->f_comm,
                                          &send_req[i]));
                if(OMPI_SUCCESS != ret){
                    goto exit;
                }
            }

            ret = ompi_request_wait_all (fh->f_procs_per_group,
                                         send_req,
                                         MPI_STATUSES_IGNORE);
            if (OMPI_SUCCESS != ret){
                goto exit;
            }
        } /* if ( my_aggregator == fh->f_rank ) */

        ret = ompi_request_wait (&recv_req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

        position += bytes_to_read_in_cycle;

        if (!recvbuf_is_contiguous) {
            OPAL_PTRDIFF_TYPE mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;

            remaining = bytes_to_read_in_cycle;

            while (remaining && (iov_count > iov_index)){
                mem_address = (OPAL_PTRDIFF_TYPE)
                    (decoded_iov[iov_index].iov_base) + current_position;

                if (remaining >=
                    (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                        (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                        (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else{
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }
            if (NULL != receive_buf) {
                free (receive_buf);
                receive_buf = NULL;
            }
        }

    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rexch = MPI_Wtime();
    read_exch += end_rexch - start_rexch;
    nentry.time[0] = read_time;
    nentry.time[1] = rcomm_time;
    nentry.time[2] = read_exch;
    if (my_aggregator == fh->f_rank)
        nentry.aggregator = 1;
    else
        nentry.aggregator = 0;
    nentry.nprocs_for_coll = static_num_io_procs;
    if (!mca_common_ompio_full_print_queue(fh->f_coll_read_time)){
        mca_common_ompio_register_print_entry(fh->f_coll_read_time,
                                              nentry);
    }
#endif

exit:
    if (NULL != decoded_iov){
        free(decoded_iov);
        decoded_iov = NULL;
    }

    if (NULL != displs){
        free(displs);
        displs = NULL;
    }

    if (NULL != iovec_count_per_process){
        free(iovec_count_per_process);
        iovec_count_per_process=NULL;
    }

    if (NULL != local_iov_array){
        free(local_iov_array);
        local_iov_array=NULL;
    }

    if (NULL != global_iov_array){
        free(global_iov_array);
        global_iov_array=NULL;
    }

    if (my_aggregator == fh->f_rank) {

        for(l=0;l<fh->f_procs_per_group;l++){
            if (blocklen_per_process) {
                free(blocklen_per_process[l]);
            }
            if (NULL != displs_per_process[l]){
                free(displs_per_process[l]);
                displs_per_process[l] = NULL;
            }
        }
    }

    if (NULL != bytes_per_process){
        free(bytes_per_process);
        bytes_per_process =NULL;
    }

    if (NULL != disp_index){
        free(disp_index);
        disp_index =NULL;
    }

    if (NULL != displs_per_process){
        free(displs_per_process);
        displs_per_process = NULL;
    }

    if(NULL != bytes_remaining){
        free(bytes_remaining);
        bytes_remaining = NULL;
    }

    if(NULL != current_index){
        free(current_index);
        current_index = NULL;
    }

    if (NULL != blocklen_per_process){
        free(blocklen_per_process);
        blocklen_per_process =NULL;
    }

    if (NULL != memory_displacements){
        free(memory_displacements);
        memory_displacements= NULL;
    }

    if (NULL != file_offsets_for_agg){
        free(file_offsets_for_agg);
        file_offsets_for_agg = NULL;
    }

    if (NULL != sorted_file_offsets){
        free(sorted_file_offsets);
        sorted_file_offsets = NULL;
    }

    if (NULL != sendtype){
        free(sendtype);
        sendtype=NULL;
    }

    if ( !recvbuf_is_contiguous ) {
        if (NULL != receive_buf){
            free(receive_buf);
            receive_buf=NULL;
        }
    }

    if (NULL != global_buf) {
        free(global_buf);
        global_buf = NULL;
    }

    if (NULL != sorted) {
        free(sorted);
        sorted = NULL;
    }

    if (NULL != send_req){
        free(send_req);
        send_req = NULL;
    }


    return ret;

}
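/*
 * A minimal sketch (hypothetical names, not part of the original source) of
 * the unpack walk used at the end of each read cycle above: copy `remaining`
 * bytes from a contiguous staging buffer into a (possibly non-contiguous)
 * user buffer described by a decoded iovec, resuming inside a partially
 * consumed entry. The write path's pack loop is the mirror image of this.
 */
#include <stdint.h>
#include <string.h>
#include <sys/uio.h>

static void copy_from_staging(const char *staging, size_t remaining,
                              const struct iovec *iov, uint32_t iov_count,
                              uint32_t *iov_index, size_t *pos_in_entry)
{
    size_t staged = 0;

    while (remaining > 0 && *iov_index < iov_count) {
        char  *dst  = (char *)iov[*iov_index].iov_base + *pos_in_entry;
        size_t left = iov[*iov_index].iov_len - *pos_in_entry;

        if (remaining >= left) {           /* finish the current entry */
            memcpy(dst, staging + staged, left);
            staged       += left;
            remaining    -= left;
            *iov_index   += 1;
            *pos_in_entry = 0;
        } else {                           /* stop inside the entry */
            memcpy(dst, staging + staged, remaining);
            *pos_in_entry += remaining;
            remaining      = 0;
        }
    }
}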