示例#1
0
static int test_upper( unsigned int length )
{
    ompi_datatype_t *pdt;
    opal_convertor_t * pConv;
    int rc = OMPI_SUCCESS;
    unsigned int i, iov_count, split_chunk, total_length;
    size_t max_data;
    struct iovec iov[5];
    TIMER_DATA_TYPE start, end;
    long total_time;

    printf( "test upper matrix\n" );
    pdt = upper_matrix( length );
    /*dt_dump( pdt );*/

    total_length = length * (length + 1) * ( sizeof(double) / 2);

    pConv = opal_convertor_create( remote_arch, 0 );
    if( OMPI_SUCCESS != opal_convertor_prepare_for_send( pConv, &(pdt->super), 1, NULL ) ) {
        printf( "Cannot attach the datatype to a convertor\n" );
        return OMPI_ERROR;
    }

    GET_TIME( start );
    split_chunk = (length + 1) * sizeof(double);
    /*    split_chunk = (total_length + 1) * sizeof(double); */
    for( i = total_length; i > 0; ) {
        iov_count = 5;
        max_data = 0;
        opal_convertor_raw( pConv, iov, &iov_count, &max_data );
        i -= max_data;
    }
    GET_TIME( end );
    total_time = ELAPSED_TIME( start, end );
    printf( "complete raw in %ld microsec\n", total_time );

    /* test the automatic destruction pf the data */
    ompi_datatype_destroy( &pdt );
    assert( pdt == NULL );

    OBJ_RELEASE( pConv );
    return rc;
}
示例#2
0
/*
 *	gatherv_inter
 *
 *	Function:	- gatherv operation using a local gather on c_local_comm
 *	Accepts:	- same arguments as MPI_Gatherv()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_inter_gatherv_inter(const void *sbuf, int scount,
                             struct ompi_datatype_t *sdtype,
                             void *rbuf, const int *rcounts, const int *disps,
                             struct ompi_datatype_t *rdtype, int root,
                             struct ompi_communicator_t *comm,
                             mca_coll_base_module_t *module)
{
    int i, rank, size, size_local, total=0, err;
    int *count=NULL, *displace=NULL;
    char *ptmp=NULL;
    MPI_Aint incr;
    MPI_Aint extent;
    MPI_Aint lb;
    ompi_datatype_t *ndtype;

    if (MPI_PROC_NULL == root) { /* do nothing */
        return OMPI_SUCCESS;
    }
    size = ompi_comm_remote_size(comm);
    rank = ompi_comm_rank(comm);
    size_local = ompi_comm_size(comm);

    if (MPI_ROOT == root) { /* I am the root, receiving the data from zero. */
        ompi_datatype_create_indexed(size, rcounts, disps, rdtype, &ndtype);
        ompi_datatype_commit(&ndtype);

        err = MCA_PML_CALL(recv(rbuf, 1, ndtype, 0,
                                MCA_COLL_BASE_TAG_GATHERV,
                                comm, MPI_STATUS_IGNORE));
        ompi_datatype_destroy(&ndtype);
        return err;
    }

    if (0 == rank) {
        count = (int *)malloc(sizeof(int) * size_local);
        displace = (int *)malloc(sizeof(int) * size_local);
        if ((NULL == displace) || (NULL == count)) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

    err = comm->c_local_comm->c_coll.coll_gather(&scount, 1, MPI_INT,
                                                 count, 1, MPI_INT,
                                                 0, comm->c_local_comm,
                                                 comm->c_local_comm->c_coll.coll_gather_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }
    if(0 == rank) {
        displace[0] = 0;
        for (i = 1; i < size_local; i++) {
            displace[i] = displace[i-1] + count[i-1];
        }
        /* Perform the gatherv locally with the first process as root */
        err = ompi_datatype_get_extent(sdtype, &lb, &extent);
        if (OMPI_SUCCESS != err) {
            err = OMPI_ERROR;
            goto exit;
        }
        incr = 0;
        for (i = 0; i < size_local; i++) {
            incr = incr + extent*count[i];
        }
        if ( incr > 0 ) {
            ptmp = (char*)malloc(incr);
            if (NULL == ptmp) {
                err = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }
    }
    err = comm->c_local_comm->c_coll.coll_gatherv(sbuf, scount, sdtype,
                                                  ptmp, count, displace,
                                                  sdtype,0, comm->c_local_comm,
                                                  comm->c_local_comm->c_coll.coll_gatherv_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    if (0 == rank) {
        for (i = 0; i < size_local; i++) {
            total = total + count[i];
        }
        /* First process sends data to the root */
        err = MCA_PML_CALL(send(ptmp, total, sdtype, root,
                                MCA_COLL_BASE_TAG_GATHERV,
                                MCA_PML_BASE_SEND_STANDARD, comm));
    }

  exit:
    if (NULL != ptmp) {
        free(ptmp);
    }
    if (NULL != displace) {
        free(displace);
    }
    if (NULL != count) {
        free(count);
    }

    /* All done */
    return err;
}
int
mca_fcoll_static_file_read_all (mca_io_ompio_file_t *fh,
				void *buf,
				int count,
				struct ompi_datatype_t *datatype,
				ompi_status_public_t *status)
{

    int ret = OMPI_SUCCESS, iov_size=0, *bytes_remaining=NULL;
    int i, j, l,cycles=0, local_cycles=0, *current_index=NULL;
    int index, *disp_index=NULL, *bytes_per_process=NULL, current_position=0;
    int **blocklen_per_process=NULL, *iovec_count_per_process=NULL;
    int *displs=NULL, *sorted=NULL ,entries_per_aggregator=0;
    int *sorted_file_offsets=NULL, temp_index=0, position=0, *temp_disp_index=NULL;


    MPI_Aint **displs_per_process=NULL, global_iov_count=0, global_count=0;
    MPI_Aint *memory_displacements=NULL;
    int bytes_to_read_in_cycle=0;
    size_t max_data=0, bytes_per_cycle=0;
    uint32_t iov_count=0, iov_index=0;
    struct iovec *decoded_iov=NULL, *iov=NULL;
    mca_fcoll_static_local_io_array *local_iov_array=NULL, *global_iov_array=NULL;
    mca_fcoll_static_local_io_array *file_offsets_for_agg=NULL;

    char *global_buf=NULL, *receive_buf=NULL;

    int blocklen[3] = {1, 1, 1};
    int static_num_io_procs=1;
    OPAL_PTRDIFF_TYPE d[3], base;
    ompi_datatype_t *types[3];
    ompi_datatype_t *io_array_type=MPI_DATATYPE_NULL;
    ompi_datatype_t **sendtype = NULL;
    MPI_Request *send_req=NULL, recv_req=NULL;
    int my_aggregator=-1;
    bool recvbuf_is_contiguous=false;
    size_t ftype_size;
    OPAL_PTRDIFF_TYPE ftype_extent, lb;

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0;
    double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0;
    double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0;
    mca_common_ompio_print_entry nentry;
#endif
#if DEBUG_ON
    MPI_Aint gc_in;
#endif
    opal_datatype_type_size ( &datatype->super, &ftype_size );
    opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent );

    /**************************************************************************
     ** 1.  In case the data is not contigous in memory, decode it into an iovec
     **************************************************************************/
    if ( ( ftype_extent == (OPAL_PTRDIFF_TYPE) ftype_size)             &&
         opal_datatype_is_contiguous_memory_layout(&datatype->super,1) &&
         0 == lb ) {
        recvbuf_is_contiguous = true;
    }


    /* In case the data is not contigous in memory, decode it into an iovec */
    if (!recvbuf_is_contiguous  ) {
        fh->f_decode_datatype ( (struct mca_io_ompio_file_t *)fh,
                                datatype,
                                count,
                                buf,
                                &max_data,
                                &decoded_iov,
                                &iov_count);
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
        status->_ucount = max_data;
    }


    fh->f_get_num_aggregators ( &static_num_io_procs );
    fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh,
                                static_num_io_procs,
                                max_data);
    my_aggregator = fh->f_procs_in_group[fh->f_aggregator_index];

    /*  printf("max_data %ld\n", max_data);  */
    ret = fh->f_generate_current_file_view((struct mca_io_ompio_file_t *)fh,
                                           max_data,
                                           &iov,
                                           &iov_size);
    if (ret != OMPI_SUCCESS){
        goto exit;
    }

    if ( iov_size > 0 ) {
        local_iov_array = (mca_fcoll_static_local_io_array *)malloc (iov_size * sizeof(mca_fcoll_static_local_io_array));
        if ( NULL == local_iov_array){
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }


        for (j=0; j < iov_size; j++){
            local_iov_array[j].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)
                iov[j].iov_base;
            local_iov_array[j].length = (size_t)iov[j].iov_len;
            local_iov_array[j].process_id = fh->f_rank;

        }
    }
    else {
        /* Allocate at least one element to correctly create the derived
           data type */
        local_iov_array = (mca_fcoll_static_local_io_array *)malloc (sizeof(mca_fcoll_static_local_io_array));
        if ( NULL == local_iov_array){
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }


        local_iov_array[0].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) 0;
        local_iov_array[0].length = (size_t) 0;
        local_iov_array[0].process_id = fh->f_rank;
    }

    d[0] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0];
    d[1] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].length;
    d[2] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].process_id;
    base = d[0];
    for (i=0 ; i<3 ; i++) {
        d[i] -= base;
    }

    /* io_array datatype  for using in communication*/
    types[0] = &ompi_mpi_long.dt;
    types[1] = &ompi_mpi_long.dt;
    types[2] = &ompi_mpi_int.dt;

    ompi_datatype_create_struct (3,
                                 blocklen,
                                 d,
                                 types,
                                 &io_array_type);
    ompi_datatype_commit (&io_array_type);

    /* #########################################################*/
    fh->f_get_bytes_per_agg ( (int*) &bytes_per_cycle);
    local_cycles = ceil((double)max_data*fh->f_procs_per_group/bytes_per_cycle);

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif
    ret = fh->f_comm->c_coll.coll_allreduce (&local_cycles,
                                             &cycles,
                                             1,
                                             MPI_INT,
                                             MPI_MAX,
                                             fh->f_comm,
                                             fh->f_comm->c_coll.coll_allreduce_module);

    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time  += end_rcomm_time - start_rcomm_time;
#endif


    if (my_aggregator == fh->f_rank) {
        disp_index = (int *) malloc (fh->f_procs_per_group * sizeof(int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int ));
        if (NULL == bytes_per_process){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        bytes_remaining = (int *) calloc (fh->f_procs_per_group, sizeof(int));
        if (NULL == bytes_remaining){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        current_index = (int *) calloc (fh->f_procs_per_group, sizeof(int));
        if (NULL == current_index){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        blocklen_per_process = (int **)calloc (fh->f_procs_per_group, sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        displs_per_process = (MPI_Aint **)calloc (fh->f_procs_per_group, sizeof (MPI_Aint*));
        if (NULL == displs_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }


    iovec_count_per_process = (int *) calloc (fh->f_procs_per_group, sizeof(int));
    if (NULL == iovec_count_per_process){
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    displs = (int *) calloc (fh->f_procs_per_group, sizeof(int));
    if (NULL == displs){
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif
    ret = fcoll_base_coll_allgather_array (&iov_size,
                                           1,
                                           MPI_INT,
                                           iovec_count_per_process,
                                           1,
                                           MPI_INT,
                                           fh->f_aggregator_index,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);

    if( OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time  += end_rcomm_time - start_rcomm_time;
#endif

    if (my_aggregator == fh->f_rank) {
        displs[0] = 0;
        global_iov_count = iovec_count_per_process[0];
        for (i=1 ; i<fh->f_procs_per_group ; i++) {
            global_iov_count += iovec_count_per_process[i];
            displs[i] = displs[i-1] + iovec_count_per_process[i-1];
        }
    }


    if ( (my_aggregator == fh->f_rank) &&
         (global_iov_count >  0 )) {
        global_iov_array = (mca_fcoll_static_local_io_array *) malloc (global_iov_count *
                                                      sizeof(mca_fcoll_static_local_io_array));
        if (NULL == global_iov_array){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif
    ret = fcoll_base_coll_gatherv_array (local_iov_array,
                                         iov_size,
                                         io_array_type,
                                         global_iov_array,
                                         iovec_count_per_process,
                                         displs,
                                         io_array_type,
                                         fh->f_aggregator_index,
                                         fh->f_procs_in_group,
                                         fh->f_procs_per_group,
                                         fh->f_comm);

    if (OMPI_SUCCESS != ret){
        fprintf(stderr,"global_iov_array gather error!\n");
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time  += end_rcomm_time - start_rcomm_time;
#endif


    if (NULL != local_iov_array){
        free(local_iov_array);
        local_iov_array = NULL;
    }

    if ( ( my_aggregator == fh->f_rank) &&
         ( global_iov_count > 0 )) {
        sorted = (int *)malloc (global_iov_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        read_local_heap_sort (global_iov_array, global_iov_count, sorted);

        send_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request));
        if (NULL == send_req){
            opal_output ( 1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        sendtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *));
        if (NULL == sendtype) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        for ( i=0; i<fh->f_procs_per_group; i++ ) {
            sendtype[i] = MPI_DATATYPE_NULL;
        }

        if (NULL == bytes_per_process){
            bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int));
            if (NULL == bytes_per_process){
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }
    }

#if DEBUG_ON

    if (my_aggregator == fh->f_rank) {
        for (gc_in=0; gc_in<global_iov_count; gc_in++){
            printf("%d: Offset[%ld]: %lld, Length[%ld]: %ld\n",
                   global_iov_array[sorted[gc_in]].process_id,
                   gc_in, global_iov_array[sorted[gc_in]].offset,
                   gc_in, global_iov_array[sorted[gc_in]].length);
        }
    }
#endif

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif

    for (index = 0; index < cycles; index++){

        if (my_aggregator == fh->f_rank) {

            fh->f_num_of_io_entries = 0;
            if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
            if (NULL != global_buf) {
                free (global_buf);
                global_buf = NULL;
            }

            if (NULL != sorted_file_offsets){
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }
            if (NULL != file_offsets_for_agg){
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }
            if (NULL != memory_displacements){
                free(memory_displacements);
                memory_displacements= NULL;
            }

            if ( NULL != sendtype ) {
                for ( i=0; i<fh->f_procs_per_group; i++ ) {
                    if ( MPI_DATATYPE_NULL != sendtype[i] ) {
                        ompi_datatype_destroy (&sendtype[i] );
                        sendtype[i] = MPI_DATATYPE_NULL;
                    }
                }
            }

            for(l=0;l<fh->f_procs_per_group;l++){
                disp_index[l] =  1;
                if (NULL != blocklen_per_process[l]){
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
                if (NULL != displs_per_process[l]){
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                if (NULL == blocklen_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for blocklen\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l]){
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }
        }

        if (index < local_cycles ) {
            if ((index == local_cycles-1) && (max_data % (bytes_per_cycle/fh->f_procs_per_group))) {
                bytes_to_read_in_cycle = max_data - position;
            }
            else if (max_data <= bytes_per_cycle/fh->f_procs_per_group) {
                bytes_to_read_in_cycle = max_data;
            }
            else {
                bytes_to_read_in_cycle = bytes_per_cycle/fh->f_procs_per_group;
            }
        }
        else {
            bytes_to_read_in_cycle = 0;
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif
        fcoll_base_coll_gather_array (&bytes_to_read_in_cycle,
                                      1,
                                      MPI_INT,
                                      bytes_per_process,
                                      1,
                                      MPI_INT,
                                      fh->f_aggregator_index,
                                      fh->f_procs_in_group,
                                      fh->f_procs_per_group,
                                      fh->f_comm);

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time  += end_rcomm_time - start_rcomm_time;
#endif

        if (recvbuf_is_contiguous ) {
            receive_buf = &((char*)buf)[position];
        }
        else if (bytes_to_read_in_cycle) {
            receive_buf = (char *) malloc (bytes_to_read_in_cycle * sizeof(char));
            if ( NULL == receive_buf){
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }


#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        start_rcomm_time = MPI_Wtime();
#endif

        ret = MCA_PML_CALL(irecv(receive_buf,
                                 bytes_to_read_in_cycle,
                                 MPI_BYTE,
                                 my_aggregator,
                                 123,
                                 fh->f_comm,
                                 &recv_req));
        if (OMPI_SUCCESS != ret){
            goto exit;
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time  += end_rcomm_time - start_rcomm_time;
#endif


        if (my_aggregator == fh->f_rank) {
            for (i=0;i<fh->f_procs_per_group; i++){
                while (bytes_per_process[i] > 0){
                    /*printf("%d: bytes_per_process[%d]: %d, bytes_remaining[%d]: %d\n",
                      index, i, bytes_per_process[i], i, bytes_remaining[i]);*/
                    if (read_get_process_id(global_iov_array[sorted[current_index[i]]].process_id,
                                            fh) == i){ /* current id owns this entry!*/
                        if (bytes_remaining[i]){ /*Remaining bytes in the current entry of
                                                   the global offset array*/
                            if (bytes_remaining[i] <= bytes_per_process[i]){

                                blocklen_per_process[i][disp_index[i] - 1] = bytes_remaining[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);
                                blocklen_per_process[i] = (int *) realloc
                                    ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *)realloc
                                    ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                bytes_per_process[i] -= bytes_remaining[i];
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                disp_index[i] += 1;
                                bytes_remaining[i] = 0;
                                /* This entry has been used up, we need to move to the
                                   next entry of this process and make current_index point there*/
                                current_index[i]  = read_find_next_index(i,
                                                                         current_index[i],
                                                                         fh,
                                                                         global_iov_array,
                                                                         global_iov_count,
                                                                         sorted);
                                if (current_index[i] == -1){
                                    break;
                                }
                                continue;
                            }
                            else{
                                blocklen_per_process[i][disp_index[i] - 1] = bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);
                                bytes_remaining[i] -= bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                        }
                        else{
                            if (bytes_per_process[i] <
                                global_iov_array[sorted[current_index[i]]].length){
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;
                                bytes_remaining[i] =
                                    global_iov_array[sorted[current_index[i]]].length -
                                    bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                            else {
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].length;
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;
                                blocklen_per_process[i] =
                                    (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *)realloc
                                    ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                disp_index[i] += 1;
                                bytes_per_process[i] -=
                                    global_iov_array[sorted[current_index[i]]].length;
                                current_index[i] = read_find_next_index(i,
                                                                        current_index[i],
                                                                        fh,
                                                                        global_iov_array,
                                                                        global_iov_count,
                                                                        sorted);
                                if (current_index[i] == -1){
                                    break;
                                }
                            }
                        }
                    }
                    else{
                        current_index[i] = read_find_next_index(i,
                                                                current_index[i],
                                                                fh,
                                                                global_iov_array,
                                                                global_iov_count,
                                                                sorted);
                        if (current_index[i] == -1){
                            bytes_per_process[i] = 0; /* no more entries left
                                                         to service this request*/
                            continue;
                        }
                    }
                }
            }

            entries_per_aggregator=0;
            for (i=0;i<fh->f_procs_per_group;i++){
                for (j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0){
                        entries_per_aggregator++;
#if DEBUG_ON
                        printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
                               fh->f_procs_in_group[i],j,
                               blocklen_per_process[i][j],j,
                               displs_per_process[i][j],
                               fh->f_rank);

#endif
                    }
                }
            }

            if (entries_per_aggregator > 0){
                file_offsets_for_agg = (mca_fcoll_static_local_io_array *)
                    malloc(entries_per_aggregator*sizeof(mca_fcoll_static_local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                sorted_file_offsets = (int *) malloc (entries_per_aggregator * sizeof(int));
                if (NULL == sorted_file_offsets){
                    opal_output (1, "OUT OF MEMORY\n");
                    ret =  OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                temp_index=0;
                global_count = 0;
                for (i=0;i<fh->f_procs_per_group; i++){
                    for(j=0;j<disp_index[i]; j++){
                        if (blocklen_per_process[i][j] > 0){
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            global_count += blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;
                        }
                    }
                }
            }
            else{
                continue;
            }
            read_local_heap_sort (file_offsets_for_agg,
                                  entries_per_aggregator,
                                  sorted_file_offsets);
            memory_displacements = (MPI_Aint *) malloc
                (entries_per_aggregator * sizeof(MPI_Aint));
            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++){
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }

            global_buf = (char *) malloc (global_count * sizeof(char));
            if (NULL == global_buf){
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0; i<entries_per_aggregator;i++){
                printf("%d: OFFSET: %lld   LENGTH: %ld, Mem-offset: %ld, disp_index :%d\n",
                       file_offsets_for_agg[sorted_file_offsets[i]].process_id,
                       file_offsets_for_agg[sorted_file_offsets[i]].offset,
                       file_offsets_for_agg[sorted_file_offsets[i]].length,
                       memory_displacements[sorted_file_offsets[i]],
                       disp_index[i]);
            }
#endif

            fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
                (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret =  OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }



            fh->f_num_of_io_entries = 0;
            fh->f_io_array[0].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[0].length = file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[0].memory_address = global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;
            for (i=1;i<entries_per_aggregator;i++){
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                    file_offsets_for_agg[sorted_file_offsets[i]].offset){
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else{
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }
            }

#if DEBUG_ON
            printf("*************************** %d\n", fh->f_num_of_io_entries);
            for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
                printf(" ADDRESS: %p  OFFSET: %ld   LENGTH: %ld\n",
                       fh->f_io_array[i].memory_address,
                       (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset,
                       fh->f_io_array[i].length);
            }
#endif
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_read_time = MPI_Wtime();
#endif

            if (fh->f_num_of_io_entries) {
                if ( 0 > fh->f_fbtl->fbtl_preadv (fh)) {
                    opal_output (1, "READ FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_read_time = MPI_Wtime();
            read_time += end_read_time - start_read_time;
#endif


#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            if (my_aggregator == fh->f_rank){
                for (i=0 ; i<global_count/4 ; i++)
                    printf (" READ %d \n",((int *)global_buf)[i]);
            }
#endif

            temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            for (i=0; i<entries_per_aggregator; i++){
                temp_index =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_index][temp_disp_index[temp_index]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_index] < disp_index[temp_index]){
                    temp_disp_index[temp_index] += 1;
                }
                else{
                    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
                           temp_index, temp_disp_index[temp_index],
                           temp_index, disp_index[temp_index]);
                }
            }
            if (NULL != temp_disp_index){
                free(temp_disp_index);
                temp_disp_index = NULL;
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_rcomm_time = MPI_Wtime();
#endif

            for (i=0;i<fh->f_procs_per_group; i++){
                send_req[i] = MPI_REQUEST_NULL;
                ompi_datatype_create_hindexed(disp_index[i],
                                              blocklen_per_process[i],
                                              displs_per_process[i],
                                              MPI_BYTE,
                                              &sendtype[i]);
                ompi_datatype_commit(&sendtype[i]);
                ret = MCA_PML_CALL (isend(global_buf,
                                          1,
                                          sendtype[i],
                                          fh->f_procs_in_group[i],
                                          123,
                                          MCA_PML_BASE_SEND_STANDARD,
                                          fh->f_comm,
                                          &send_req[i]));
                if(OMPI_SUCCESS != ret){
                    goto exit;
                }
            }

            ret = ompi_request_wait_all (fh->f_procs_per_group,
                                         send_req,
                                         MPI_STATUS_IGNORE);
            if (OMPI_SUCCESS != ret){
                goto exit;
            }
        } /* if ( my_aggregator == fh->f_rank ) */

        ret = ompi_request_wait (&recv_req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

        position += bytes_to_read_in_cycle;

        if (!recvbuf_is_contiguous) {
            OPAL_PTRDIFF_TYPE mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;

            remaining = bytes_to_read_in_cycle;

            while (remaining && (iov_count > iov_index)){
                mem_address = (OPAL_PTRDIFF_TYPE)
                    (decoded_iov[iov_index].iov_base) + current_position;

                if (remaining >=
                    (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                        (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                        (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else{
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }
            if (NULL != receive_buf) {
                free (receive_buf);
                receive_buf = NULL;
            }
        }

    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rexch = MPI_Wtime();
    read_exch += end_rexch - start_rexch;
    nentry.time[0] = read_time;
    nentry.time[1] = rcomm_time;
    nentry.time[2] = read_exch;
    if (my_aggregator == fh->f_rank)
        nentry.aggregator = 1;
    else
        nentry.aggregator = 0;
    nentry.nprocs_for_coll = static_num_io_procs;
    if (!mca_common_ompio_full_print_queue(fh->f_coll_read_time)){
        mca_common_ompio_register_print_entry(fh->f_coll_read_time,
                                              nentry);
    }
#endif

exit:
    if (NULL != decoded_iov){
        free(decoded_iov);
        decoded_iov = NULL;
    }

    if (NULL != displs){
        free(displs);
        displs = NULL;
    }

    if (NULL != iovec_count_per_process){
        free(iovec_count_per_process);
        iovec_count_per_process=NULL;
    }

    if (NULL != local_iov_array){
        free(local_iov_array);
        local_iov_array=NULL;
    }

    if (NULL != global_iov_array){
        free(global_iov_array);
        global_iov_array=NULL;
    }

    if (my_aggregator == fh->f_rank) {

        for(l=0;l<fh->f_procs_per_group;l++){
            if (blocklen_per_process) {
                free(blocklen_per_process[l]);
            }
            if (NULL != displs_per_process[l]){
                free(displs_per_process[l]);
                displs_per_process[l] = NULL;
            }
        }
    }

    if (NULL != bytes_per_process){
        free(bytes_per_process);
        bytes_per_process =NULL;
    }

    if (NULL != disp_index){
        free(disp_index);
        disp_index =NULL;
    }

    if (NULL != displs_per_process){
        free(displs_per_process);
        displs_per_process = NULL;
    }

    if(NULL != bytes_remaining){
        free(bytes_remaining);
        bytes_remaining = NULL;
    }

    if(NULL != current_index){
        free(current_index);
        current_index = NULL;
    }

    if (NULL != blocklen_per_process){
        free(blocklen_per_process);
        blocklen_per_process =NULL;
    }

    if (NULL != bytes_remaining){
        free(bytes_remaining);
        bytes_remaining =NULL;
    }

    if (NULL != memory_displacements){
        free(memory_displacements);
        memory_displacements= NULL;
    }

    if (NULL != file_offsets_for_agg){
        free(file_offsets_for_agg);
        file_offsets_for_agg = NULL;
    }

    if (NULL != sorted_file_offsets){
        free(sorted_file_offsets);
        sorted_file_offsets = NULL;
    }

    if (NULL != sendtype){
        free(sendtype);
        sendtype=NULL;
    }

    if ( !recvbuf_is_contiguous ) {
        if (NULL != receive_buf){
            free(receive_buf);
            receive_buf=NULL;
        }
    }

    if (NULL != global_buf) {
        free(global_buf);
        global_buf = NULL;
    }

    if (NULL != sorted) {
        free(sorted);
        sorted = NULL;
    }

    if (NULL != send_req){
        free(send_req);
        send_req = NULL;
    }


    return ret;

}
int32_t ompi_datatype_create_darray(int size,
                                    int rank,
                                    int ndims,
                                    int const* gsize_array,
                                    int const* distrib_array,
                                    int const* darg_array,
                                    int const* psize_array,
                                    int order,
                                    const ompi_datatype_t* oldtype,
                                    ompi_datatype_t** newtype)
{
    ompi_datatype_t *lastType;
    ptrdiff_t orig_extent, *st_offsets = NULL;
    int i, start_loop, end_loop, step;
    int *coords = NULL, rc = OMPI_SUCCESS;

    /* speedy corner case */
    if (ndims < 1) {
        /* Don't just return MPI_DATATYPE_NULL as that can't be
           MPI_TYPE_FREE()ed, and that seems bad */
        *newtype = ompi_datatype_create(0);
        ompi_datatype_add(*newtype, &ompi_mpi_datatype_null.dt, 0, 0, 0);
        return MPI_SUCCESS;
    }

    rc = ompi_datatype_type_extent(oldtype, &orig_extent);
    if (MPI_SUCCESS != rc) goto cleanup;

    /* calculate position in grid using row-major ordering */
    {
        int tmp_rank = rank, procs = size;

        coords = (int *) malloc(ndims * sizeof(int));
        for (i = 0 ; i < ndims ; i++) {
            procs = procs / psize_array[i];
            coords[i] = tmp_rank / procs;
            tmp_rank = tmp_rank % procs;
        }
    }

    st_offsets = (ptrdiff_t *) malloc(ndims * sizeof(ptrdiff_t));

    /* duplicate type to here to 1) deal with constness without
       casting and 2) eliminate need to for conditional destroy below.
       Lame, yes.  But cleaner code all around. */
    rc = ompi_datatype_duplicate(oldtype, &lastType);
    if (OMPI_SUCCESS != rc) goto cleanup;

    /* figure out ordering issues */
    if (MPI_ORDER_C == order) {
        start_loop = ndims - 1 ; step = -1; end_loop = -1;
    } else {
        start_loop = 0 ; step = 1; end_loop = ndims;
    }

    /* Build up array */
    for (i = start_loop; i != end_loop; i += step) {
        int nprocs, tmp_rank;

        switch(distrib_array[i]) {
        case MPI_DISTRIBUTE_BLOCK:
            rc = block(gsize_array, i, ndims, psize_array[i], coords[i],
                       darg_array[i], order, orig_extent,
                       lastType, newtype, st_offsets+i);
            break;
        case MPI_DISTRIBUTE_CYCLIC:
            rc = cyclic(gsize_array, i, ndims, psize_array[i], coords[i],
                        darg_array[i], order, orig_extent,
                        lastType, newtype, st_offsets+i);
            break;
        case MPI_DISTRIBUTE_NONE:
            /* treat it as a block distribution on 1 process */
            if (order == MPI_ORDER_C) {
                nprocs = psize_array[i]; tmp_rank = coords[i];
            } else {
                nprocs = 1; tmp_rank = 0;
            }

            rc = block(gsize_array, i, ndims, nprocs, tmp_rank,
                       MPI_DISTRIBUTE_DFLT_DARG, order, orig_extent,
                       lastType, newtype, st_offsets+i);
            break;
        default:
            rc = MPI_ERR_ARG;
        }
        ompi_datatype_destroy(&lastType);
        /* need to destroy the old type even in error condition, so
           don't check return code from above until after cleanup. */
        if (MPI_SUCCESS != rc) goto cleanup;
        lastType = *newtype;
    }


    /* set displacement and UB correctly. Use struct instead of
       resized for same reason as subarray */
    {
        ptrdiff_t displs[3], tmp_size;
        ompi_datatype_t *types[3];
        int blength[3] = { 1, 1, 1};

        displs[1] = st_offsets[start_loop];
        tmp_size = 1;
        for (i = start_loop + step ; i != end_loop ; i += step) {
            tmp_size *= gsize_array[i - step];
            displs[1] += tmp_size * st_offsets[i];
        }

        displs[0] = 0;
        displs[1] *= orig_extent;
        displs[2] = orig_extent;
        for (i = 0 ; i < ndims ; i++) {
            displs[2] *= gsize_array[i];
        }
        types[0] = MPI_LB; types[1] = lastType; types[2] = MPI_UB;

        rc = ompi_datatype_create_struct(3, blength, displs, types, newtype);
        ompi_datatype_destroy(&lastType);
        /* need to destroy the old type even in error condition, so
           don't check return code from above until after cleanup. */
        if (MPI_SUCCESS != rc) goto cleanup;
    }

 cleanup:
    if (NULL != st_offsets) free(st_offsets);
    if (NULL != coords) free(coords);
    return OMPI_SUCCESS;
}
int32_t ompi_datatype_create_subarray(int ndims,
                                      int const* size_array,
                                      int const* subsize_array,
                                      int const* start_array,
                                      int order,
                                      const ompi_datatype_t* oldtype,
                                      ompi_datatype_t** newtype)
{
    MPI_Datatype last_type;
    int32_t i, step, end_loop;
    MPI_Aint size, displ, extent;

    /**
     * If the oldtype contains the original MPI_LB and MPI_UB markers then we
     * are forced to follow the MPI standard suggestion and reset these 2
     * markers (MPI 3.0 page 96 line 37).  Otherwise we can simply resize the
     * datatype.
     */
    ompi_datatype_type_extent( oldtype, &extent );

    /* If the ndims is zero then return the NULL datatype */
    if( ndims < 2 ) {
        if( 0 == ndims ) {
            *newtype = &ompi_mpi_datatype_null.dt;
            return MPI_SUCCESS;
        }
        ompi_datatype_create_contiguous( subsize_array[0], oldtype, &last_type );
        size = size_array[0];
        displ = start_array[0];
        goto replace_subarray_type;
    }

    if( MPI_ORDER_C == order ) {
        i = ndims - 1;
        step = -1;
        end_loop = -1;
    } else {
        i = 0;
        step = 1;
        end_loop = ndims;
    }

    /* As we know that the ndims is at least 1 we can start by creating the
     * first dimension data outside the loop, such that we dont have to create
     * a duplicate of the oldtype just to be able to free it.
     */
    ompi_datatype_create_vector( subsize_array[i+step], subsize_array[i], size_array[i],
                                 oldtype, newtype );

    last_type = *newtype;
    size = (MPI_Aint)size_array[i] * (MPI_Aint)size_array[i+step];
    displ = (MPI_Aint)start_array[i] + (MPI_Aint)start_array[i+step] * (MPI_Aint)size_array[i];
    for( i += 2 * step; i != end_loop; i += step ) {
        ompi_datatype_create_hvector( subsize_array[i], 1, size * extent,
                                      last_type, newtype );
        ompi_datatype_destroy( &last_type );

        displ += size * start_array[i];
        size *= size_array[i];
        last_type = *newtype;
    }

 replace_subarray_type:
    /**
      * We need to shift the content (useful data) of the datatype, so
      * we need to force the displacement to be moved. Therefore, we
      * cannot use resize as it will only set the soft lb and ub
      * markers without moving the data. Instead, we have to create a
      * new data, and insert the last_Type with the correct
      * displacement.
      */
    *newtype = ompi_datatype_create( last_type->super.desc.used );
    ompi_datatype_add( *newtype, last_type, 1, displ * extent, size * extent);
    ompi_datatype_destroy( &last_type );

    return OMPI_SUCCESS;
}
 int
 mca_fcoll_dynamic_file_read_all (mca_io_ompio_file_t *fh,
				  void *buf,
				  int count,
				  struct ompi_datatype_t *datatype,
				  ompi_status_public_t *status)
 {
     MPI_Aint position = 0;
     MPI_Aint total_bytes = 0;          /* total bytes to be read */
     MPI_Aint bytes_to_read_in_cycle = 0; /* left to be read in a cycle*/
     MPI_Aint bytes_per_cycle = 0;      /* total read in each cycle by each process*/
     int index = 0, ret=OMPI_SUCCESS;
     int cycles = 0;
     int i=0, j=0, l=0;
     int n=0; /* current position in total_bytes_per_process array */
     MPI_Aint bytes_remaining = 0; /* how many bytes have been read from the current
					 value from total_bytes_per_process */
     int *sorted_file_offsets=NULL, entries_per_aggregator=0;
     int bytes_received = 0;
     int blocks = 0;
     /* iovec structure and count of the buffer passed in */
     uint32_t iov_count = 0;
     struct iovec *decoded_iov = NULL;
     int iov_index = 0;
     size_t current_position = 0;
     struct iovec *local_iov_array=NULL, *global_iov_array=NULL;
     char *receive_buf = NULL;
     MPI_Aint *memory_displacements=NULL;
     /* global iovec at the readers that contain the iovecs created from
	file_set_view */
     uint32_t total_fview_count = 0;
     int local_count = 0;
     int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL;
     int current_index=0, temp_index=0;
     int **blocklen_per_process=NULL;
     MPI_Aint **displs_per_process=NULL;
     char *global_buf = NULL;
     MPI_Aint global_count = 0;
     local_io_array *file_offsets_for_agg=NULL;

     /* array that contains the sorted indices of the global_iov */
     int *sorted = NULL;
     int *displs = NULL;
     int dynamic_num_io_procs;
     size_t max_data = 0;
     int *bytes_per_process = NULL;
     MPI_Aint *total_bytes_per_process = NULL;
     ompi_datatype_t **sendtype = NULL;
     MPI_Request *send_req=NULL, *recv_req=NULL;


 #if TIME_BREAKDOWN
     double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0;
     double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0;
     double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0;
     print_entry nentry;
 #endif


//     if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) {
//	 fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY;
//     }
     /**************************************************************************
      ** In case the data is not contigous in memory, decode it into an iovec **
      **************************************************************************/
     if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
	 ret = fh->f_decode_datatype ((struct mca_io_ompio_file_t *)fh,
				    datatype,
				    count,
				    buf,
				    &max_data,
				    &decoded_iov,
				    &iov_count);
       if (OMPI_SUCCESS != ret){
	 goto exit;
       }
     }
     else {
	 max_data = count * datatype->super.size;
     }

     if ( MPI_STATUS_IGNORE != status ) {
	 status->_ucount = max_data;
     }

     fh->f_get_num_aggregators ( &dynamic_num_io_procs);
     ret = fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh,
				       dynamic_num_io_procs,
				       max_data);
     if (OMPI_SUCCESS != ret){
	 goto exit;
     }

     total_bytes_per_process = (MPI_Aint*)malloc
	 (fh->f_procs_per_group*sizeof(MPI_Aint));
     if (NULL == total_bytes_per_process) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
     }

     ret = fh->f_allgather_array (&max_data,
				  1,
				  MPI_LONG,
				  total_bytes_per_process,
				  1,
				  MPI_LONG,
				  fh->f_aggregator_index,
				  fh->f_procs_in_group,
				  fh->f_procs_per_group,
				  fh->f_comm);
     if (OMPI_SUCCESS != ret){
       goto exit;
     }

     for (i=0 ; i<fh->f_procs_per_group ; i++) {
	 total_bytes += total_bytes_per_process[i];
     }

     if (NULL != total_bytes_per_process) {
	 free (total_bytes_per_process);
	 total_bytes_per_process = NULL;
     }

     /*********************************************************************
      *** Generate the File offsets/lengths corresponding to this write ***
      ********************************************************************/
     ret = fh->f_generate_current_file_view ((struct mca_io_ompio_file_t *) fh,
					     max_data,
					     &local_iov_array,
					     &local_count);

     if (ret != OMPI_SUCCESS){
	 goto exit;
     }



     /* #########################################################*/

     /*************************************************************
      *** ALLGather the File View information at all processes ***
      *************************************************************/

     fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int));
     if (NULL == fview_count) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
     }

     ret = fh->f_allgather_array (&local_count,
				  1,
				  MPI_INT,
				  fview_count,
				  1,
				  MPI_INT,
				  fh->f_aggregator_index,
				  fh->f_procs_in_group,
				  fh->f_procs_per_group,
				  fh->f_comm);

     if (OMPI_SUCCESS != ret){
	 goto exit;
     }

     displs = (int*)malloc (fh->f_procs_per_group*sizeof(int));
     if (NULL == displs) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
     }

     displs[0] = 0;
     total_fview_count = fview_count[0];
     for (i=1 ; i<fh->f_procs_per_group ; i++) {
	 total_fview_count += fview_count[i];
	 displs[i] = displs[i-1] + fview_count[i-1];
     }

 #if DEBUG_ON
     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	 for (i=0 ; i<fh->f_procs_per_group ; i++) {
	     printf ("%d: PROCESS: %d  ELEMENTS: %d  DISPLS: %d\n",
		     fh->f_rank,
		     i,
		     fview_count[i],
		     displs[i]);
	 }
     }
 #endif

     /* allocate the global iovec  */
     if (0 != total_fview_count) {
       global_iov_array = (struct iovec*)malloc (total_fview_count *
						 sizeof(struct iovec));
       if (NULL == global_iov_array) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
       }
     }

     ret =  fh->f_allgatherv_array (local_iov_array,
				    local_count,
				    fh->f_iov_type,
				    global_iov_array,
				    fview_count,
				    displs,
				    fh->f_iov_type,
				    fh->f_aggregator_index,
				    fh->f_procs_in_group,
				    fh->f_procs_per_group,
				    fh->f_comm);

     if (OMPI_SUCCESS != ret){
       goto exit;
     }

     /* sort it */
     if (0 != total_fview_count) {
	 sorted = (int *)malloc (total_fview_count * sizeof(int));
	 if (NULL == sorted) {
	     opal_output (1, "OUT OF MEMORY\n");
	     ret = OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	 }
	 fh->f_sort_iovec (global_iov_array, total_fview_count, sorted);
     }

     if (NULL != local_iov_array) {
	 free (local_iov_array);
	 local_iov_array = NULL;
     }

 #if DEBUG_ON
     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	 for (i=0 ; i<total_fview_count ; i++) {
	     printf("%d: OFFSET: %p   LENGTH: %d\n",
		    fh->f_rank,
		    global_iov_array[sorted[i]].iov_base,
		    global_iov_array[sorted[i]].iov_len);
	 }
     }
 #endif

     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {

       disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
       if (NULL == disp_index) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
       }

       blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*));
       if (NULL == blocklen_per_process) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
       }

       displs_per_process = (MPI_Aint **)malloc (fh->f_procs_per_group * sizeof (MPI_Aint*));
       if (NULL == displs_per_process){
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
       }

       for (i=0;i<fh->f_procs_per_group;i++){
	 blocklen_per_process[i] = NULL;
	 displs_per_process[i] = NULL;
       }
     }


     /*
      * Calculate how many bytes are read in each cycle
      */
     fh->f_get_bytes_per_agg ( (int *) &bytes_per_cycle);
     cycles = ceil((double)total_bytes/bytes_per_cycle);

     n = 0;
     bytes_remaining = 0;
     current_index = 0;


 #if TIME_BREAKDOWN
     start_rexch = MPI_Wtime();
 #endif
     for (index = 0; index < cycles; index++) {
       /* Getting ready for next cycle
	  Initializing and freeing buffers */
       if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	 if (NULL == sendtype){
	   sendtype = (ompi_datatype_t **)
	     malloc (fh->f_procs_per_group  * sizeof(ompi_datatype_t *));
	   if (NULL == sendtype) {
	     opal_output (1, "OUT OF MEMORY\n");
	     ret = OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	   }
	 }

	 for(l=0;l<fh->f_procs_per_group;l++){

	   disp_index[l] =  1;

	   if (NULL != blocklen_per_process[l]){
	     free(blocklen_per_process[l]);
	     blocklen_per_process[l] = NULL;
	   }
	   if (NULL != displs_per_process[l]){
	     free(displs_per_process[l]);
	     displs_per_process[l] = NULL;
	   }
	   blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
	   if (NULL == blocklen_per_process[l]) {
	     opal_output (1, "OUT OF MEMORY for blocklen\n");
	     ret = OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	   }
	   displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
	   if (NULL == displs_per_process[l]){
	     opal_output (1, "OUT OF MEMORY for displs\n");
	     ret = OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	   }
	 }

	 if (NULL != sorted_file_offsets){
	   free(sorted_file_offsets);
	   sorted_file_offsets = NULL;
	 }

	 if(NULL != file_offsets_for_agg){
	   free(file_offsets_for_agg);
	   file_offsets_for_agg = NULL;
	 }
	 if (NULL != memory_displacements){
	   free(memory_displacements);
	   memory_displacements = NULL;
	 }
       }


       if (cycles-1 == index) {
	 bytes_to_read_in_cycle = total_bytes - bytes_per_cycle*index;
       }
       else {
	 bytes_to_read_in_cycle = bytes_per_cycle;
       }

 #if DEBUG_ON
       if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	 printf ("****%d: CYCLE %d   Bytes %d**********\n",
		 fh->f_rank,
		 index,
		 bytes_to_write_in_cycle);
       }
 #endif

       /* Calculate how much data will be contributed in this cycle
	    by each process*/
       bytes_received = 0;

       while (bytes_to_read_in_cycle) {
	 blocks = fview_count[0];
	 for (j=0 ; j<fh->f_procs_per_group ; j++) {
	   if (sorted[current_index] < blocks) {
	     n = j;
	     break;
	   }
	   else {
	     blocks += fview_count[j+1];
	   }
	 }
	 if (bytes_remaining) {
	   if (bytes_remaining <= bytes_to_read_in_cycle) {

	     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	       blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining;
	       displs_per_process[n][disp_index[n] - 1] =
		 (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
		 (global_iov_array[sorted[current_index]].iov_len - bytes_remaining);
	     }
	     if (fh->f_procs_in_group[n] == fh->f_rank) {
	       bytes_received += bytes_remaining;
	     }
	     current_index ++;
	     bytes_to_read_in_cycle -= bytes_remaining;
	     bytes_remaining = 0;
	     if (fh->f_procs_in_group[fh->f_aggregator_index] ==
		 fh->f_rank) {
	       blocklen_per_process[n] = (int *) realloc
		 ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
	       displs_per_process[n] = (MPI_Aint *) realloc
		 ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
	       blocklen_per_process[n][disp_index[n]] = 0;
	       displs_per_process[n][disp_index[n]] = 0;
	       disp_index[n] += 1;
	     }
	     continue;
	   }
	   else {
	     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	       blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle;
	       displs_per_process[n][disp_index[n] - 1] =
		 (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
		 (global_iov_array[sorted[current_index]].iov_len
		  - bytes_remaining);
	     }
	     if (fh->f_procs_in_group[n] == fh->f_rank) {
	       bytes_received += bytes_to_read_in_cycle;
	     }
	     bytes_remaining -= bytes_to_read_in_cycle;
	     bytes_to_read_in_cycle = 0;
	     break;
	   }
	 }
	 else {
	   if (bytes_to_read_in_cycle <
		   (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) {
	     if (fh->f_procs_in_group[fh->f_aggregator_index] ==
		 fh->f_rank) {

	       blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle;
	       displs_per_process[n][disp_index[n] - 1] =
		 (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ;
	     }

	     if (fh->f_procs_in_group[n] == fh->f_rank) {
	       bytes_received += bytes_to_read_in_cycle;
	     }
	     bytes_remaining = global_iov_array[sorted[current_index]].iov_len -
	       bytes_to_read_in_cycle;
	     bytes_to_read_in_cycle = 0;
	     break;
	   }
	   else {
	     if (fh->f_procs_in_group[fh->f_aggregator_index] ==
		 fh->f_rank) {
	       blocklen_per_process[n][disp_index[n] - 1] =
		 global_iov_array[sorted[current_index]].iov_len;
	       displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)
		 global_iov_array[sorted[current_index]].iov_base;
	       blocklen_per_process[n] =
		 (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
	       displs_per_process[n] = (MPI_Aint *)realloc
		 ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
	       blocklen_per_process[n][disp_index[n]] = 0;
	       displs_per_process[n][disp_index[n]] = 0;
	       disp_index[n] += 1;
	     }
	     if (fh->f_procs_in_group[n] == fh->f_rank) {
	       bytes_received +=
		 global_iov_array[sorted[current_index]].iov_len;
	     }
	     bytes_to_read_in_cycle -=
	       global_iov_array[sorted[current_index]].iov_len;
	     current_index ++;
	     continue;
	   }
	 }
       }
       /* Calculate the displacement on where to put the data and allocate
	  the recieve buffer (global_buf) */
       if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	 entries_per_aggregator=0;
	 for (i=0;i<fh->f_procs_per_group; i++){
	   for (j=0;j<disp_index[i];j++){
	     if (blocklen_per_process[i][j] > 0)
	       entries_per_aggregator++ ;
	   }
	 }
	 if (entries_per_aggregator > 0){
	   file_offsets_for_agg = (local_io_array *)
	     malloc(entries_per_aggregator*sizeof(local_io_array));
	   if (NULL == file_offsets_for_agg) {
	     opal_output (1, "OUT OF MEMORY\n");
	     ret = OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	   }
	   sorted_file_offsets = (int *)
	     malloc (entries_per_aggregator*sizeof(int));
	   if (NULL == sorted_file_offsets){
	     opal_output (1, "OUT OF MEMORY\n");
	     ret =  OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	   }
	   /*Moving file offsets to an IO array!*/
	   temp_index = 0;
	   global_count = 0;
	   for (i=0;i<fh->f_procs_per_group; i++){
	     for(j=0;j<disp_index[i];j++){
	       if (blocklen_per_process[i][j] > 0){
		   file_offsets_for_agg[temp_index].length =
		     blocklen_per_process[i][j];
		   global_count += blocklen_per_process[i][j];
		   file_offsets_for_agg[temp_index].process_id = i;
		   file_offsets_for_agg[temp_index].offset =
		     displs_per_process[i][j];
		   temp_index++;
	       }
	     }
	   }
	 }
	 else{
	   continue;
	 }

	 read_heap_sort (file_offsets_for_agg,
			 entries_per_aggregator,
			 sorted_file_offsets);

	 memory_displacements = (MPI_Aint *) malloc
	   (entries_per_aggregator * sizeof(MPI_Aint));
	 memory_displacements[sorted_file_offsets[0]] = 0;
	 for (i=1; i<entries_per_aggregator; i++){
	   memory_displacements[sorted_file_offsets[i]] =
	     memory_displacements[sorted_file_offsets[i-1]] +
	     file_offsets_for_agg[sorted_file_offsets[i-1]].length;
	 }

	 global_buf = (char *) malloc (global_count * sizeof(char));
	 if (NULL == global_buf){
	   opal_output(1, "OUT OF MEMORY\n");
	   ret = OMPI_ERR_OUT_OF_RESOURCE;
	   goto exit;
	 }

	  fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
	    (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
	  if (NULL == fh->f_io_array) {
	    opal_output(1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	  }

	 fh->f_num_of_io_entries = 0;
	 fh->f_io_array[fh->f_num_of_io_entries].offset =
	     (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
	 fh->f_io_array[fh->f_num_of_io_entries].length =
	   file_offsets_for_agg[sorted_file_offsets[0]].length;
	 fh->f_io_array[fh->f_num_of_io_entries].memory_address =
	   global_buf+memory_displacements[sorted_file_offsets[0]];
	 fh->f_num_of_io_entries++;
	 for (i=1;i<entries_per_aggregator;i++){
	   if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
	       file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
	       file_offsets_for_agg[sorted_file_offsets[i]].offset){
	     fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
	       file_offsets_for_agg[sorted_file_offsets[i]].length;
	   }
	   else{
	     fh->f_io_array[fh->f_num_of_io_entries].offset =
		 (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
	     fh->f_io_array[fh->f_num_of_io_entries].length =
	       file_offsets_for_agg[sorted_file_offsets[i]].length;
	     fh->f_io_array[fh->f_num_of_io_entries].memory_address =
	       global_buf+memory_displacements[sorted_file_offsets[i]];
	     fh->f_num_of_io_entries++;
	   }
	 }


 #if TIME_BREAKDOWN
	 start_read_time = MPI_Wtime();
 #endif

	 if (fh->f_num_of_io_entries) {
	   if ( 0 >  fh->f_fbtl->fbtl_preadv (fh)) {
	     opal_output (1, "READ FAILED\n");
	     ret = OMPI_ERROR;
	     goto exit;
	   }
	 }

 #if TIME_BREAKDOWN
	 end_read_time = MPI_Wtime();
	 read_time += end_read_time - start_read_time;
 #endif
	 /**********************************************************
	  ******************** DONE READING ************************
	  *********************************************************/

	 temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
	 if (NULL == temp_disp_index) {
	   opal_output (1, "OUT OF MEMORY\n");
	   ret = OMPI_ERR_OUT_OF_RESOURCE;
	   goto exit;
	 }
	 for (i=0; i<entries_per_aggregator; i++){
	   temp_index =
	     file_offsets_for_agg[sorted_file_offsets[i]].process_id;
	   displs_per_process[temp_index][temp_disp_index[temp_index]] =
	     memory_displacements[sorted_file_offsets[i]];
	   if (temp_disp_index[temp_index] < disp_index[temp_index]){
	     temp_disp_index[temp_index] += 1;
	   }
	   else{
	     printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
		    temp_index, temp_disp_index[temp_index],
		    temp_index, disp_index[temp_index]);
	   }
	 }
	 if (NULL != temp_disp_index){
	   free(temp_disp_index);
	   temp_disp_index = NULL;
	 }

	 send_req = (MPI_Request *)
	   malloc (fh->f_procs_per_group * sizeof(MPI_Request));
	 if (NULL == send_req){
	   opal_output ( 1, "OUT OF MEMORY\n");
	   ret = OMPI_ERR_OUT_OF_RESOURCE;
	   goto exit;
	 }
 #if TIME_BREAKDOWN
	 start_rcomm_time = MPI_Wtime();
 #endif
	 for (i=0;i<fh->f_procs_per_group;i++){
	   ompi_datatype_create_hindexed(disp_index[i],
					 blocklen_per_process[i],
					 displs_per_process[i],
					 MPI_BYTE,
					 &sendtype[i]);
	   ompi_datatype_commit(&sendtype[i]);
	   ret = MCA_PML_CALL (isend(global_buf,
				     1,
				     sendtype[i],
				     fh->f_procs_in_group[i],
				     123,
				     MCA_PML_BASE_SEND_STANDARD,
				     fh->f_comm,
				     &send_req[i]));
	   if(OMPI_SUCCESS != ret){
	       goto exit;
	   }
	 }
 #if TIME_BREAKDOWN
	 end_rcomm_time = MPI_Wtime();
	 rcomm_time += end_rcomm_time - start_rcomm_time;
 #endif
       }

       /**********************************************************
	********* Scatter the Data from the readers **************
	*********************************************************/
       if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
	 receive_buf = &((char*)buf)[position];
       }
       else if (bytes_received) {
	 /* allocate a receive buffer and copy the data that needs
	    to be received into it in case the data is non-contigous
	    in memory */
	 receive_buf = malloc (bytes_received);
	 if (NULL == receive_buf) {
	   opal_output (1, "OUT OF MEMORY\n");
	   ret = OMPI_ERR_OUT_OF_RESOURCE;
	   goto exit;
	 }
       }

 #if TIME_BREAKDOWN
       start_rcomm_time = MPI_Wtime();
 #endif
       recv_req = (MPI_Request *) malloc (sizeof (MPI_Request));
       if (NULL == recv_req){
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
       }

       ret = MCA_PML_CALL(irecv(receive_buf,
				bytes_received,
				MPI_BYTE,
				fh->f_procs_in_group[fh->f_aggregator_index],
				123,
				fh->f_comm,
				recv_req));
       if (OMPI_SUCCESS != ret){
	 goto exit;
       }


       if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){
	 ret = ompi_request_wait_all (fh->f_procs_per_group,
				    send_req,
				      MPI_STATUS_IGNORE);
	 if (OMPI_SUCCESS != ret){
	   goto exit;
	 }
       }

       ret = ompi_request_wait (recv_req, MPI_STATUS_IGNORE);
       if (OMPI_SUCCESS != ret){
	 goto exit;
       }
       position += bytes_received;

       /* If data is not contigous in memory, copy the data from the
	  receive buffer into the buffer passed in */
       if (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
	 OPAL_PTRDIFF_TYPE mem_address;
	 size_t remaining = 0;
	 size_t temp_position = 0;

	 remaining = bytes_received;

	 while (remaining) {
	   mem_address = (OPAL_PTRDIFF_TYPE)
	     (decoded_iov[iov_index].iov_base) + current_position;

	   if (remaining >=
	       (decoded_iov[iov_index].iov_len - current_position)) {
	     memcpy ((IOVBASE_TYPE *) mem_address,
		     receive_buf+temp_position,
		     decoded_iov[iov_index].iov_len - current_position);
	     remaining = remaining -
	       (decoded_iov[iov_index].iov_len - current_position);
	     temp_position = temp_position +
	       (decoded_iov[iov_index].iov_len - current_position);
	     iov_index = iov_index + 1;
	     current_position = 0;
	   }
	   else {
	     memcpy ((IOVBASE_TYPE *) mem_address,
		     receive_buf+temp_position,
		     remaining);
	     current_position = current_position + remaining;
	     remaining = 0;
	   }
	 }

	 if (NULL != receive_buf) {
	   free (receive_buf);
	 receive_buf = NULL;
	 }
       }
#if TIME_BREAKDOWN
       end_rcomm_time = MPI_Wtime();
       rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

       if (NULL != recv_req){
	 free(recv_req);
	 recv_req = NULL;
       }
       if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){
	 fh->f_num_of_io_entries = 0;
	 if (NULL != fh->f_io_array) {
	   free (fh->f_io_array);
	   fh->f_io_array = NULL;
	 }
	 if (NULL != global_buf) {
	   free (global_buf);
	   global_buf = NULL;
	 }
	 for (i = 0; i < fh->f_procs_per_group; i++)
	   ompi_datatype_destroy(sendtype+i);
	 if (NULL != sendtype){
	   free(sendtype);
	   sendtype=NULL;
	 }
	 if (NULL != send_req){
	   free(send_req);
	   send_req = NULL;
	 }
	 if (NULL != sorted_file_offsets){
	   free(sorted_file_offsets);
	   sorted_file_offsets = NULL;
	 }
	 if (NULL != file_offsets_for_agg){
	   free(file_offsets_for_agg);
	   file_offsets_for_agg = NULL;
	 }
	 if (NULL != bytes_per_process){
	   free(bytes_per_process);
	   bytes_per_process =NULL;
	 }
	 if (NULL != memory_displacements){
	   free(memory_displacements);
	   memory_displacements= NULL;
	 }
       }
     }

 #if TIME_BREAKDOWN
     end_rexch = MPI_Wtime();
     read_exch += end_rexch - start_rexch;
     nentry.time[0] = read_time;
     nentry.time[1] = rcomm_time;
     nentry.time[2] = read_exch;
     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank)
       nentry.aggregator = 1;
     else
       nentry.aggregator = 0;
     nentry.nprocs_for_coll = dynamic_num_io_procs;
     if (!fh->f_full_print_queue(READ_PRINT_QUEUE)){
       fh->f_register_print_entry(READ_PRINT_QUEUE,
				  nentry);
     }
 #endif

 exit:
     if (NULL != sorted) {
       free (sorted);
       sorted = NULL;
     }
     if (NULL != global_iov_array) {
       free (global_iov_array);
       global_iov_array = NULL;
     }
     if (NULL != fview_count) {
       free (fview_count);
       fview_count = NULL;
     }
     if (NULL != decoded_iov) {
       free (decoded_iov);
       decoded_iov = NULL;
     }
     if (NULL != local_iov_array){
       free(local_iov_array);
       local_iov_array=NULL;
     }

     if (NULL != displs) {
       free (displs);
       displs = NULL;
     }
     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {

       if (NULL != disp_index){
	 free(disp_index);
	 disp_index = NULL;
       }

       if ( NULL != blocklen_per_process){
	 for(l=0;l<fh->f_procs_per_group;l++){
	   if (NULL != blocklen_per_process[l]){
	     free(blocklen_per_process[l]);
	     blocklen_per_process[l] = NULL;
	   }
	 }

	 free(blocklen_per_process);
	 blocklen_per_process = NULL;
       }

       if (NULL != displs_per_process){
	 for (l=0; i<fh->f_procs_per_group; l++){
	   if (NULL != displs_per_process[l]){
	     free(displs_per_process[l]);
	     displs_per_process[l] = NULL;
	   }
	 }
	 free(displs_per_process);
	 displs_per_process = NULL;
       }

     }
     return ret;
 }
示例#7
0
int
ompio_io_ompio_file_close (mca_io_ompio_file_t *ompio_fh)
{
    int ret = OMPI_SUCCESS;
    int delete_flag = 0;
    char name[256];

    if(mca_io_ompio_coll_timing_info){
        strcpy (name, "WRITE");
        if (!ompi_io_ompio_empty_print_queue(WRITE_PRINT_QUEUE)){
            ret = ompi_io_ompio_print_time_info(WRITE_PRINT_QUEUE,
                                                name,
                                                ompio_fh);
            if (OMPI_SUCCESS != ret){
                printf("Error in print_time_info ");
            }

        }
        strcpy (name, "READ");
        if (!ompi_io_ompio_empty_print_queue(READ_PRINT_QUEUE)){
            ret = ompi_io_ompio_print_time_info(READ_PRINT_QUEUE,
                                                name,
                                                ompio_fh);
            if (OMPI_SUCCESS != ret){
                printf("Error in print_time_info ");
            }
        }
    }
    if ( ompio_fh->f_amode & MPI_MODE_DELETE_ON_CLOSE ) {
        delete_flag = 1;
    }

    /*close the sharedfp file*/
    if( NULL != ompio_fh->f_sharedfp ){
        ret = ompio_fh->f_sharedfp->sharedfp_file_close(ompio_fh);
    }
    if ( NULL != ompio_fh->f_fs ) {
	/* The pointer might not be set if file_close() is
	** called from the file destructor in case of an error
	** during file_open()
	*/
	ret = ompio_fh->f_fs->fs_file_close (ompio_fh);
    }
    if ( delete_flag && 0 == ompio_fh->f_rank ) {
        mca_io_ompio_file_delete ( ompio_fh->f_filename, MPI_INFO_NULL );
    }

    if ( NULL != ompio_fh->f_fs ) {
	mca_fs_base_file_unselect (ompio_fh);
    }
    if ( NULL != ompio_fh->f_fbtl ) {
	mca_fbtl_base_file_unselect (ompio_fh);
    }

    if ( NULL != ompio_fh->f_fcoll ) {
	mca_fcoll_base_file_unselect (ompio_fh);
    }
    if ( NULL != ompio_fh->f_sharedfp)  {
	mca_sharedfp_base_file_unselect (ompio_fh);
    }

    if (NULL != ompio_fh->f_io_array) {
        free (ompio_fh->f_io_array);
        ompio_fh->f_io_array = NULL;
    }

    if (NULL != ompio_fh->f_init_procs_in_group) {
        free (ompio_fh->f_init_procs_in_group);
        ompio_fh->f_init_procs_in_group = NULL;
    }
    if (NULL != ompio_fh->f_procs_in_group) {
        free (ompio_fh->f_procs_in_group);
        ompio_fh->f_procs_in_group = NULL;
    }

    if (NULL != ompio_fh->f_decoded_iov) {
        free (ompio_fh->f_decoded_iov);
        ompio_fh->f_decoded_iov = NULL;
    }

    if (NULL != ompio_fh->f_convertor) {
        free (ompio_fh->f_convertor);
        ompio_fh->f_convertor = NULL;
    }

    if (NULL != ompio_fh->f_datarep) {
        free (ompio_fh->f_datarep);
        ompio_fh->f_datarep = NULL;
    }


    if (MPI_DATATYPE_NULL != ompio_fh->f_iov_type) {
        ompi_datatype_destroy (&ompio_fh->f_iov_type);
    }

    if ( MPI_DATATYPE_NULL != ompio_fh->f_etype ) {
	ompi_datatype_destroy (&ompio_fh->f_etype);
    }
    if ( MPI_DATATYPE_NULL != ompio_fh->f_filetype ){
	ompi_datatype_destroy (&ompio_fh->f_filetype);
    }

    if ( MPI_DATATYPE_NULL != ompio_fh->f_orig_filetype ){
	ompi_datatype_destroy (&ompio_fh->f_orig_filetype);
    }


    if (MPI_COMM_NULL != ompio_fh->f_comm && (ompio_fh->f_flags & OMPIO_SHAREDFP_IS_SET) )  {
        ompi_comm_free (&ompio_fh->f_comm);
    }

    return ret;
}
int32_t ompi_datatype_create_subarray(int ndims,
                                      int const* size_array,
                                      int const* subsize_array,
                                      int const* start_array,
                                      int order,
                                      const ompi_datatype_t* oldtype,
                                      ompi_datatype_t** newtype)
{
    MPI_Datatype last_type;
    int32_t i, step, end_loop;
    MPI_Aint size, displ, extent;

    ompi_datatype_type_extent( oldtype, &extent );

    /* If the ndims is zero then return the NULL datatype */
    if( ndims < 2 ) {
        if( 0 == ndims ) {
            *newtype = &ompi_mpi_datatype_null.dt;
            return MPI_SUCCESS;
        }
        ompi_datatype_create_contiguous( subsize_array[0], oldtype, &last_type );
        size = size_array[0];
        displ = start_array[0];
        goto replace_subarray_type;
    }

    if( MPI_ORDER_C == order ) {
        i = ndims - 1;
        step = -1;
        end_loop = -1;
    } else {
        i = 0;
        step = 1;
        end_loop = ndims;
    }

    /* As we know that the ndims is at least 1 we can start by creating the
     * first dimension data outside the loop, such that we dont have to create
     * a duplicate of the oldtype just to be able to free it.
     */
    ompi_datatype_create_vector( subsize_array[i+step], subsize_array[i], size_array[i],
                                 oldtype, newtype );

    last_type = *newtype;
    size = size_array[i] * size_array[i+step];
    displ = start_array[i] + start_array[i+step] * size_array[i];
    for( i += 2 * step; i != end_loop; i += step ) {
        ompi_datatype_create_hvector( subsize_array[i], 1, size * extent,
                                      last_type, newtype );
        ompi_datatype_destroy( &last_type );
        displ += size * start_array[i];
        size *= size_array[i];
        last_type = *newtype;
    }

replace_subarray_type:
    /**
     * We cannot use resized here. Resized will only set the soft lb and ub markers
     * without moving the real data inside. What we need is to force the displacement
     * of the data upward to the right position AND set the LB and UB. A type
     * struct is the function we need.
     */
    {
        MPI_Aint displs[3];
        MPI_Datatype types[3];
        int blength[3] = { 1, 1, 1 };

        displs[0] = 0;
        displs[1] = displ * extent;
        displs[2] = size * extent;
        types[0] = MPI_LB;
        types[1] = last_type;
        types[2] = MPI_UB;
        ompi_datatype_create_struct( 3, blength, displs, types, newtype );
    }
    ompi_datatype_destroy( &last_type );

    return OMPI_SUCCESS;
}
示例#9
0
static int test_upper( unsigned int length )
{
    double *mat1, *mat2, *inbuf;
    ompi_datatype_t *pdt;
    opal_convertor_t * pConv;
    char *ptr;
    int rc;
    unsigned int i, j, iov_count, split_chunk, total_length;
    size_t max_data;
    struct iovec a;
    TIMER_DATA_TYPE start, end;
    long total_time;

    printf( "test upper matrix\n" );
    pdt = upper_matrix( length );
    /*dt_dump( pdt );*/

    mat1 = malloc( length * length * sizeof(double) );
    init_random_upper_matrix( length, mat1 );
    mat2 = calloc( length * length, sizeof(double) );

    total_length = length * (length + 1) * ( sizeof(double) / 2);
    inbuf = (double*)malloc( total_length );
    ptr = (char*)inbuf;
    /* copy upper matrix in the array simulating the input buffer */
    for( i = 0; i < length; i++ ) {
        uint32_t pos = i * length + i;
        for( j = i; j < length; j++, pos++ ) {
            *inbuf = mat1[pos];
            inbuf++;
        }
    }
    inbuf = (double*)ptr;
    pConv = opal_convertor_create( remote_arch, 0 );
    if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( pConv, &(pdt->super), 1, mat2 ) ) {
        printf( "Cannot attach the datatype to a convertor\n" );
        return OMPI_ERROR;
    }

    GET_TIME( start );
    split_chunk = (length + 1) * sizeof(double);
    /*    split_chunk = (total_length + 1) * sizeof(double); */
    for( i = total_length; i > 0; ) {
        if( i <= split_chunk ) {  /* equal test just to be able to set a breakpoint */
            split_chunk = i;
        }
        a.iov_base = ptr;
        a.iov_len = split_chunk;
        iov_count = 1;
        max_data = split_chunk;
        opal_convertor_unpack( pConv, &a, &iov_count, &max_data );
        ptr += max_data;
        i -= max_data;
        if( mat2[0] != inbuf[0] ) assert(0);
    }
    GET_TIME( end );
    total_time = ELAPSED_TIME( start, end );
    printf( "complete unpacking in %ld microsec\n", total_time );
    free( inbuf );
    rc = check_diag_matrix( length, mat1, mat2 );
    free( mat1 );
    free( mat2 );

    /* test the automatic destruction pf the data */
    ompi_datatype_destroy( &pdt ); assert( pdt == NULL );

    OBJ_RELEASE( pConv );
    return rc;
}
int
mca_fcoll_dynamic_file_write_all (mca_io_ompio_file_t *fh,
                                  void *buf,
                                  int count,
                                  struct ompi_datatype_t *datatype,
                                  ompi_status_public_t *status)
{
    MPI_Aint total_bytes_written = 0;  /* total bytes that have been written*/
    MPI_Aint total_bytes = 0;          /* total bytes to be written */
    MPI_Aint bytes_to_write_in_cycle = 0; /* left to be written in a cycle*/
    MPI_Aint bytes_per_cycle = 0;      /* total written in each cycle by each process*/
    int index = 0;
    int cycles = 0;
    int i=0, j=0, l=0;
    int n=0; /* current position in total_bytes_per_process array */
    MPI_Aint bytes_remaining = 0; /* how many bytes have been written from the current
                                value from total_bytes_per_process */
    int bytes_sent = 0, ret =0;
    int blocks=0, entries_per_aggregator=0;

    /* iovec structure and count of the buffer passed in */
    uint32_t iov_count = 0;
    struct iovec *decoded_iov = NULL;
    int iov_index = 0;
    char *send_buf = NULL;
    size_t current_position = 0;
    struct iovec *local_iov_array=NULL, *global_iov_array=NULL;
    local_io_array *file_offsets_for_agg=NULL;
    /* global iovec at the writers that contain the iovecs created from
       file_set_view */
    uint32_t total_fview_count = 0;
    int local_count = 0, temp_pindex;
    int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL;
    int current_index = 0, temp_index=0;

    char *global_buf = NULL;
    MPI_Aint global_count = 0;
    
    
    /* array that contains the sorted indices of the global_iov */
    int *sorted = NULL, *sorted_file_offsets=NULL;
    int *displs = NULL;
    int dynamic_num_io_procs;
    size_t max_data = 0, datatype_size = 0; 
    int **blocklen_per_process=NULL;
    MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL;
    ompi_datatype_t **recvtype = NULL;
    MPI_Aint *total_bytes_per_process = NULL;
    MPI_Request *send_req=NULL, *recv_req=NULL;
    int recv_req_count=0;
    

#if TIME_BREAKDOWN
    double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0;
    double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0;
    double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0;
    print_entry nentry;
#endif


//    if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) {
//        fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY;
//    }

    /**************************************************************************
     ** In case the data is not contigous in memory, decode it into an iovec **
     **************************************************************************/
    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
      ret =   ompi_io_ompio_decode_datatype (fh,
					     datatype,
					     count,
					     buf,
					     &max_data,
					     &decoded_iov,
					     &iov_count);
      if (OMPI_SUCCESS != ret ){
	goto exit;
      }
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
	status->_ucount = max_data;
    }
	
    mca_io_ompio_get_num_aggregators ( &dynamic_num_io_procs );
    ret = ompi_io_ompio_set_aggregator_props (fh, 
					      dynamic_num_io_procs,
					      max_data);
	
    if (OMPI_SUCCESS != ret){
	goto exit;
    }


    total_bytes_per_process = (MPI_Aint*)malloc
        (fh->f_procs_per_group*sizeof(MPI_Aint));
    if (NULL == total_bytes_per_process) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }

    ret = ompi_io_ompio_allgather_array (&max_data,
					 1,
					 MPI_LONG,
					 total_bytes_per_process,
					 1,
					 MPI_LONG,
					 fh->f_aggregator_index,
					 fh->f_procs_in_group,
					 fh->f_procs_per_group,
					 fh->f_comm);
    
    if( OMPI_SUCCESS != ret){
      goto exit;
    }
    for (i=0 ; i<fh->f_procs_per_group ; i++) {
        total_bytes += total_bytes_per_process[i];
    }

    if (NULL != total_bytes_per_process) {
        free (total_bytes_per_process);
        total_bytes_per_process = NULL;
    }
    
    /*********************************************************************
     *** Generate the File offsets/lengths corresponding to this write ***
     ********************************************************************/
    ret = ompi_io_ompio_generate_current_file_view(fh,
						   max_data,
						   &local_iov_array,
						   &local_count);
    if (ret != OMPI_SUCCESS){
      goto exit;
    }

#if DEBUG_ON    
    for (i=0 ; i<local_count ; i++) {
   
      printf("%d: OFFSET: %d   LENGTH: %ld\n",
	     fh->f_rank,
	     local_iov_array[i].iov_base,
	     local_iov_array[i].iov_len);

    }
#endif   

    /*************************************************************
     *** ALLGather the File View information at all processes ***
     *************************************************************/

    fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == fview_count) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }

    ret = ompi_io_ompio_allgather_array (&local_count,
					 1,
					 MPI_INT,
					 fview_count,
					 1,
					 MPI_INT,
					 fh->f_aggregator_index,
					 fh->f_procs_in_group,
					 fh->f_procs_per_group,
					 fh->f_comm);
    
    if( OMPI_SUCCESS != ret){
      goto exit;
    }

    displs = (int*) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == displs) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }

    displs[0] = 0;
    total_fview_count = fview_count[0];
    for (i=1 ; i<fh->f_procs_per_group ; i++) {
        total_fview_count += fview_count[i];
        displs[i] = displs[i-1] + fview_count[i-1];
    }
    
#if DEBUG_ON
    printf("total_fview_count : %d\n", total_fview_count);
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        for (i=0 ; i<fh->f_procs_per_group ; i++) {
            printf ("%d: PROCESS: %d  ELEMENTS: %d  DISPLS: %d\n",
                    fh->f_rank,
                    i,
                    fview_count[i],
                    displs[i]);
        }
    }
#endif
    
    /* allocate the global iovec  */

    if (0 != total_fview_count) {
      global_iov_array = (struct iovec*) malloc (total_fview_count *
						    sizeof(struct iovec));
      if (NULL == global_iov_array){
	opal_output(1, "OUT OF MEMORY\n");
	ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
      }

    }
    
    ret = ompi_io_ompio_allgatherv_array (local_iov_array,
					  local_count,
					  fh->f_iov_type,
					  global_iov_array,
					  fview_count,
					  displs,
					  fh->f_iov_type,
					  fh->f_aggregator_index,
					  fh->f_procs_in_group,
					  fh->f_procs_per_group,
					  fh->f_comm);
    if (OMPI_SUCCESS != ret){
      goto exit;
    }

    /* sort it */
    if (0 != total_fview_count) {
        sorted = (int *)malloc (total_fview_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }
	ompi_io_ompio_sort_iovec (global_iov_array, total_fview_count, sorted);
    }

    if (NULL != local_iov_array){
      free(local_iov_array);
      local_iov_array = NULL;
    }

    if (NULL != displs){
      free(displs);
      displs=NULL;
    }

    
#if DEBUG_ON
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
      uint32_t tv=0;
      for (tv=0 ; tv<total_fview_count ; tv++) {
	printf("%d: OFFSET: %lld   LENGTH: %ld\n",
	       fh->f_rank,
	       global_iov_array[sorted[tv]].iov_base,
	       global_iov_array[sorted[tv]].iov_len);
      }
    }
#endif
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	displs_per_process = (MPI_Aint **)malloc (fh->f_procs_per_group * sizeof (MPI_Aint*));
	if (NULL == displs_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	for(i=0;i<fh->f_procs_per_group;i++){
	  blocklen_per_process[i] = NULL;
	  displs_per_process[i] = NULL;
	}
    }
    

    mca_io_ompio_get_bytes_per_agg ( (int *)&bytes_per_cycle );
    cycles = ceil((double)total_bytes/bytes_per_cycle);


    n = 0; 
    bytes_remaining = 0;
    current_index = 0;




#if TIME_BREAKDOWN
      start_exch = MPI_Wtime();
#endif
    
    for (index = 0; index < cycles; index++) {

      /* Getting ready for next cycle 
	 Initializing and freeing buffers*/
      if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	
	if (NULL == recvtype){
	  recvtype = (ompi_datatype_t **) 
	    malloc (fh->f_procs_per_group  * sizeof(ompi_datatype_t *));
	    if (NULL == recvtype) {
	      opal_output (1, "OUT OF MEMORY\n");
	      ret = OMPI_ERR_OUT_OF_RESOURCE;
	      goto exit;
	    }
	}
	
	for(l=0;l<fh->f_procs_per_group;l++){
	  disp_index[l] =  1;
	 
	  if (NULL != blocklen_per_process[l]){
	    free(blocklen_per_process[l]);
	    blocklen_per_process[l] = NULL;
	  }
	  if (NULL != displs_per_process[l]){
	    free(displs_per_process[l]);
	    displs_per_process[l] = NULL;
	  }
	  blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
	  if (NULL == blocklen_per_process[l]) {
	    opal_output (1, "OUT OF MEMORY for blocklen\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	  }
	  displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
	  if (NULL == displs_per_process[l]){      
	      opal_output (1, "OUT OF MEMORY for displs\n");
	      ret = OMPI_ERR_OUT_OF_RESOURCE;
	      goto exit;
	  }
	}
	
	if (NULL != sorted_file_offsets){
	  free(sorted_file_offsets);
	  sorted_file_offsets = NULL;
	}
	
	if(NULL != file_offsets_for_agg){
	  free(file_offsets_for_agg);
	  file_offsets_for_agg = NULL;
	}
	
	if (NULL != memory_displacements){
	  free(memory_displacements);
	  memory_displacements = NULL;
	}
	
      }
      
      if (cycles-1 == index) {
	bytes_to_write_in_cycle = total_bytes - bytes_per_cycle*index;
      }
      else {
	bytes_to_write_in_cycle = bytes_per_cycle;
      }
      
#if DEBUG_ON
      if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	printf ("****%d: CYCLE %d   Bytes %lld**********\n",
		fh->f_rank,
		index, 
		bytes_to_write_in_cycle);
      }
#endif
      /**********************************************************
       **Gather the Data from all the processes at the writers **
       *********************************************************/
      
      /* Calculate how much data will be contributed in this cycle 
	 by each process*/
      bytes_sent = 0;
#if DEBUG_ON
      printf("bytes_to_write_in_cycle: %ld, cycle : %d\n", bytes_to_write_in_cycle,
	       index);
#endif
      /* The blocklen and displs calculation only done at aggregators!*/


      while (bytes_to_write_in_cycle) {
	
	blocks = fview_count[0];
	for (j=0 ; j<fh->f_procs_per_group ; j++) {
	  if (sorted[current_index] < blocks) {
	    n = j;
	    break;
	  }
	  else {
	    blocks += fview_count[j+1];
	  }
	}
	
	if (bytes_remaining) {
	  if (bytes_remaining <= bytes_to_write_in_cycle) {

	    if (fh->f_procs_in_group[fh->f_aggregator_index] == 
		fh->f_rank) {
	      blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining;
	      displs_per_process[n][disp_index[n] - 1] = 
		(OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base + 
		(global_iov_array[sorted[current_index]].iov_len
		 - bytes_remaining);
	      
	    }
	    if (fh->f_procs_in_group[n] == fh->f_rank) {
	      bytes_sent += bytes_remaining;
	    }
	    current_index ++;
	    bytes_to_write_in_cycle -= bytes_remaining;
	    bytes_remaining = 0;
	    if (fh->f_procs_in_group[fh->f_aggregator_index] == 
		fh->f_rank) {		
	    /* In this cases the length is consumed so allocating for
	       next displacement and blocklength*/
     
	      blocklen_per_process[n] = (int *) realloc
		((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
	      displs_per_process[n] = (MPI_Aint *) realloc
		((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
	      blocklen_per_process[n][disp_index[n]] = 0;
	      displs_per_process[n][disp_index[n]] = 0;
	      disp_index[n] += 1;
	    }
	    continue;
	  }
	  else {
	    if (fh->f_procs_in_group[fh->f_aggregator_index] == 
		fh->f_rank) {
	      blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle;
	      displs_per_process[n][disp_index[n] - 1] = 
		(OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
                (global_iov_array[sorted[current_index]].iov_len
                 - bytes_remaining);
	    }
	    
	    if (fh->f_procs_in_group[n] == fh->f_rank) {
	      bytes_sent += bytes_to_write_in_cycle;
	    }
	    bytes_remaining -= bytes_to_write_in_cycle;
	    bytes_to_write_in_cycle = 0;
	    break;
	  }
	}
	else {
	  if (bytes_to_write_in_cycle < 
	      (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) {
	    if (fh->f_procs_in_group[fh->f_aggregator_index] == 
		fh->f_rank) {

	      blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle;
	      displs_per_process[n][disp_index[n] - 1] = 
		(OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ;


	    }
	    if (fh->f_procs_in_group[n] == fh->f_rank) {
	      bytes_sent += bytes_to_write_in_cycle;
	      
	    }
	    bytes_remaining = global_iov_array[sorted[current_index]].iov_len - 
	      bytes_to_write_in_cycle;
	    bytes_to_write_in_cycle = 0;
	    break;
	  }
	  else {
	    if (fh->f_procs_in_group[fh->f_aggregator_index] == 
		fh->f_rank) {
	      blocklen_per_process[n][disp_index[n] - 1] =
		global_iov_array[sorted[current_index]].iov_len;
	      displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)
		global_iov_array[sorted[current_index]].iov_base;
	      
	      blocklen_per_process[n] = 
		(int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
	      displs_per_process[n] = (MPI_Aint *)realloc
		((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
	      blocklen_per_process[n][disp_index[n]] = 0;
	      displs_per_process[n][disp_index[n]] = 0;
	      disp_index[n] += 1;
	      /*realloc for next blocklength 
		and assign this displacement and check for next displs as
	      the total length of this entry has been consumed!*/
	    }
	    if (fh->f_procs_in_group[n] == fh->f_rank) {
	      bytes_sent += global_iov_array[sorted[current_index]].iov_len;
	    }
	    bytes_to_write_in_cycle -= 
	      global_iov_array[sorted[current_index]].iov_len;
	    current_index ++;
	    continue;
	  }
	}
      }
      

      /* Calculate the displacement on where to put the data and allocate
	 the recieve buffer (global_buf) */
      if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	entries_per_aggregator=0;
	for (i=0;i<fh->f_procs_per_group; i++){
	  for (j=0;j<disp_index[i];j++){
	    if (blocklen_per_process[i][j] > 0) 
	      entries_per_aggregator++ ;
	  }
	}

#if DEBUG_ON
	printf("%d: cycle: %d, bytes_sent: %d\n ",fh->f_rank,index,
		 bytes_sent);
	printf("%d : Entries per aggregator : %d\n",fh->f_rank,entries_per_aggregator);
#endif
	
	if (entries_per_aggregator > 0){
	  file_offsets_for_agg = (local_io_array *)
	    malloc(entries_per_aggregator*sizeof(local_io_array));
	  if (NULL == file_offsets_for_agg) {
	    opal_output (1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	  }
	  
	  sorted_file_offsets = (int *)
	    malloc (entries_per_aggregator*sizeof(int));
	  if (NULL == sorted_file_offsets){
	    opal_output (1, "OUT OF MEMORY\n");
	    ret =  OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	  }
	  
	  /*Moving file offsets to an IO array!*/
	  temp_index = 0;
	 	  
	  for (i=0;i<fh->f_procs_per_group; i++){
	    for(j=0;j<disp_index[i];j++){
	      if (blocklen_per_process[i][j] > 0){
		file_offsets_for_agg[temp_index].length =
		  blocklen_per_process[i][j];
		file_offsets_for_agg[temp_index].process_id = i;
		file_offsets_for_agg[temp_index].offset = 
		  displs_per_process[i][j];
		temp_index++;
 
#if DEBUG_ON
		printf("************Cycle: %d,  Aggregator: %d ***************\n", 
		       index+1,fh->f_rank);
		
		printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
		       fh->f_procs_in_group[i],j,
		     blocklen_per_process[i][j],j,
		     displs_per_process[i][j],
		     fh->f_rank);
#endif
	      }
	    }
	  }
	}
	else{
	  continue;
	}
	/* Sort the displacements for each aggregator*/
	local_heap_sort (file_offsets_for_agg,
			 entries_per_aggregator,
			 sorted_file_offsets); 
	
	/*create contiguous memory displacements 
	  based on blocklens on the same displs array
	  and map it to this aggregator's actual 
	  file-displacements (this is in the io-array created above)*/
	memory_displacements = (MPI_Aint *) malloc 
	    (entries_per_aggregator * sizeof(MPI_Aint));
	
	memory_displacements[sorted_file_offsets[0]] = 0;
	for (i=1; i<entries_per_aggregator; i++){
	  memory_displacements[sorted_file_offsets[i]] = 
	    memory_displacements[sorted_file_offsets[i-1]] + 
	    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
	}
	
	temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
	if (NULL == temp_disp_index) {
	  opal_output (1, "OUT OF MEMORY\n");
	  return OMPI_ERR_OUT_OF_RESOURCE;
	}
	
	/*Now update the displacements array  with memory offsets*/
	global_count = 0;
	for (i=0;i<entries_per_aggregator;i++){
	  temp_pindex =
	    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
	  displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] =
	    memory_displacements[sorted_file_offsets[i]];
	  if (temp_disp_index[temp_pindex] < disp_index[temp_pindex])
	      temp_disp_index[temp_pindex] += 1;
	  else{
	    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
		   temp_pindex, temp_disp_index[temp_pindex],
		   temp_pindex, disp_index[temp_pindex]);
	  }
	  global_count += 
	    file_offsets_for_agg[sorted_file_offsets[i]].length;
	}
	  
	if (NULL != temp_disp_index){
	  free(temp_disp_index);
	    temp_disp_index = NULL;
	}

#if DEBUG_ON

	printf("************Cycle: %d,  Aggregator: %d ***************\n", 
	       index+1,fh->f_rank);
	for (i=0;i<fh->f_procs_per_group; i++){
	  for(j=0;j<disp_index[i];j++){
	    if (blocklen_per_process[i][j] > 0){
	      printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
		     fh->f_procs_in_group[i],j,
		     blocklen_per_process[i][j],j,
		     displs_per_process[i][j],
		     fh->f_rank);
	      
	    }
	  }
	}
	printf("************Cycle: %d,  Aggregator: %d ***************\n", 
	       index+1,fh->f_rank);
	for (i=0; i<entries_per_aggregator;i++){
	  printf("%d: OFFSET: %lld   LENGTH: %ld, Mem-offset: %ld\n",
		   file_offsets_for_agg[sorted_file_offsets[i]].process_id,
		 file_offsets_for_agg[sorted_file_offsets[i]].offset,
		 file_offsets_for_agg[sorted_file_offsets[i]].length,
		 memory_displacements[sorted_file_offsets[i]]);
	}
	printf("%d : global_count : %ld, bytes_sent : %d\n",
	       fh->f_rank,global_count, bytes_sent);
#endif
#if TIME_BREAKDOWN
	  start_comm_time = MPI_Wtime();
#endif

	
	global_buf  = (char *) malloc (global_count);
	if (NULL == global_buf){
	    opal_output(1, "OUT OF MEMORY");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	  goto exit;
	}

	recv_req_count = 0;
	for (i=0;i<fh->f_procs_per_group; i++){
	    
	  ompi_datatype_create_hindexed(disp_index[i],
					blocklen_per_process[i],
					displs_per_process[i],
					MPI_BYTE,
					&recvtype[i]);
	  ompi_datatype_commit(&recvtype[i]); 
	  
	  opal_datatype_type_size(&recvtype[i]->super, 
				  &datatype_size);
	  
	  if (datatype_size){
	    
	    recv_req = (MPI_Request *)realloc 
			((void *)recv_req, (recv_req_count + 1)*sizeof(MPI_Request));
	    
	    ret = MCA_PML_CALL(irecv(global_buf,
				     1,
				     recvtype[i],
				     fh->f_procs_in_group[i],
				     123,
				     fh->f_comm,
				     &recv_req[recv_req_count]));
	    recv_req_count++;
	    
	    if (OMPI_SUCCESS != ret){
	      goto exit;
	    }
	  }
	}
	
      }
      
      

      if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
	send_buf = &((char*)buf)[total_bytes_written];
      }
      else if (bytes_sent) {
	/* allocate a send buffer and copy the data that needs
	   to be sent into it in case the data is non-contigous
	   in memory */
	OPAL_PTRDIFF_TYPE mem_address;
	size_t remaining = 0;
	size_t temp_position = 0;
	
	send_buf = malloc (bytes_sent);
	if (NULL == send_buf) {
	  opal_output (1, "OUT OF MEMORY\n");
	  return OMPI_ERR_OUT_OF_RESOURCE;
	}
	
	remaining = bytes_sent;
	  
	while (remaining) {
	  mem_address = (OPAL_PTRDIFF_TYPE)
	    (decoded_iov[iov_index].iov_base) + current_position;
	  
	    if (remaining >= 
		(decoded_iov[iov_index].iov_len - current_position)) {
	      memcpy (send_buf+temp_position,
		      (IOVBASE_TYPE *)mem_address,
		      decoded_iov[iov_index].iov_len - current_position);
	      remaining = remaining - 
		(decoded_iov[iov_index].iov_len - current_position);
	      temp_position = temp_position +
		(decoded_iov[iov_index].iov_len - current_position);
	      iov_index = iov_index + 1;
	      current_position = 0;
	    }
	    else {
	      memcpy (send_buf+temp_position,
		      (IOVBASE_TYPE *) mem_address,
		      remaining);
	      current_position = current_position + remaining;
	  remaining = 0;
	    }
	  }
	}
	total_bytes_written += bytes_sent;

	/* Gather the sendbuf from each process in appropritate locations in 
	 aggregators*/
	
	send_req = (MPI_Request *) malloc (sizeof(MPI_Request));
	if (NULL == send_req){
	  opal_output (1, "OUT OF MEMORY\n");
	  ret = OMPI_ERR_OUT_OF_RESOURCE;
	  goto exit;
	}

	
	if (bytes_sent){

	  ret = MCA_PML_CALL(isend(send_buf,
				   bytes_sent,
				   MPI_BYTE,
				   fh->f_procs_in_group[fh->f_aggregator_index],
				   123,
				   MCA_PML_BASE_SEND_STANDARD, 
				   fh->f_comm,
				   send_req));	

	    
	    if ( OMPI_SUCCESS != ret ){
		goto exit;
	    }

	    ret = ompi_request_wait(send_req, MPI_STATUS_IGNORE);
	    if (OMPI_SUCCESS != ret){
		goto exit;
	    }
	} 
	if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {       
	  ret = ompi_request_wait_all (recv_req_count,
				       recv_req,
				       MPI_STATUS_IGNORE);
	  
	  if (OMPI_SUCCESS != ret){
	    goto exit;
	  }
	}
#if DEBUG_ON
	if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){
	  printf("************Cycle: %d,  Aggregator: %d ***************\n", 
		 index+1,fh->f_rank);
	  for (i=0 ; i<global_count/4 ; i++)
	    printf (" RECV %d \n",((int *)global_buf)[i]);
	}
#endif




    
    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
	if (NULL != send_buf) {
	    free (send_buf);
	    send_buf = NULL;
	}
    }

#if TIME_BREAKDOWN
      end_comm_time = MPI_Wtime();
      comm_time += (end_comm_time - start_comm_time);
#endif


    /**********************************************************
     **************** DONE GATHERING OF DATA ******************
     *********************************************************/
    
        /**********************************************************
         ******* Create the io array, and pass it to fbtl *********
         *********************************************************/

	if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {

#if TIME_BREAKDOWN
	    start_write_time = MPI_Wtime();
#endif
	  
	  fh->f_io_array = (mca_io_ompio_io_array_t *) malloc 
	    (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
	  if (NULL == fh->f_io_array) {
	    opal_output(1, "OUT OF MEMORY\n");
	    return OMPI_ERR_OUT_OF_RESOURCE;
	  }
	  
	  fh->f_num_of_io_entries = 0;
	  /*First entry for every aggregator*/
	  fh->f_io_array[fh->f_num_of_io_entries].offset = 
	    (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
	  fh->f_io_array[fh->f_num_of_io_entries].length = 
	    file_offsets_for_agg[sorted_file_offsets[0]].length;
	  fh->f_io_array[fh->f_num_of_io_entries].memory_address = 
	    global_buf+memory_displacements[sorted_file_offsets[0]];
	  fh->f_num_of_io_entries++;

	  for (i=1;i<entries_per_aggregator;i++){
	    /* If the enrties are contiguous merge them,
	       else make a new entry */
	    if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + 
		file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
		file_offsets_for_agg[sorted_file_offsets[i]].offset){
	      fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
		file_offsets_for_agg[sorted_file_offsets[i]].length;	
	    }
	    else {
	      fh->f_io_array[fh->f_num_of_io_entries].offset = 
		(IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
	      fh->f_io_array[fh->f_num_of_io_entries].length = 
		file_offsets_for_agg[sorted_file_offsets[i]].length;
	      fh->f_io_array[fh->f_num_of_io_entries].memory_address = 
		global_buf+memory_displacements[sorted_file_offsets[i]];
	      fh->f_num_of_io_entries++;
	    }
	    
	  }
	 
#if DEBUG_ON
	  printf("*************************** %d\n", fh->f_num_of_io_entries);
	  for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
	    printf(" ADDRESS: %p  OFFSET: %ld   LENGTH: %ld\n",
		   fh->f_io_array[i].memory_address,
		   (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset,
		   fh->f_io_array[i].length);
	  }
	  
#endif


	  if (fh->f_num_of_io_entries) {
	    if ( 0 >  fh->f_fbtl->fbtl_pwritev (fh)) {
	      opal_output (1, "WRITE FAILED\n");
	      ret = OMPI_ERROR;
	      goto exit;
	    }
	  }
#if TIME_BREAKDOWN
	  end_write_time = MPI_Wtime();
	  write_time += end_write_time - start_write_time;
#endif	  


	}
	
	
	if (NULL != send_req){
	  free(send_req);
	  send_req = NULL;
	}
	
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	  fh->f_num_of_io_entries = 0;
	  if (NULL != fh->f_io_array) {
	    free (fh->f_io_array);
	    fh->f_io_array = NULL;
	  }
	  for (i =0; i< fh->f_procs_per_group; i++) 
	    ompi_datatype_destroy(recvtype+i);
	  if (NULL != recvtype){
	      free(recvtype);
	      recvtype=NULL;
	  }
	  if (NULL != recv_req){
	    free(recv_req);
	    recv_req = NULL;
	    
	  }
	  if (NULL != global_buf) {
	    free (global_buf);
	    global_buf = NULL;
	  }
	  
	}
	
    }

#if TIME_BREAKDOWN
      end_exch = MPI_Wtime();
      exch_write += end_exch - start_exch;
      nentry.time[0] = write_time;
      nentry.time[1] = comm_time;
      nentry.time[2] = exch_write;
      if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank)
	nentry.aggregator = 1;
      else
	nentry.aggregator = 0;
      nentry.nprocs_for_coll = dynamic_num_io_procs;
      if (!ompi_io_ompio_full_print_queue(WRITE_PRINT_QUEUE)){
	ompi_io_ompio_register_print_entry(WRITE_PRINT_QUEUE,
					   nentry);
      } 
#endif


 exit :
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
      if (NULL != fh->f_io_array) {
	free (fh->f_io_array);
	fh->f_io_array = NULL;
      }
      if (NULL != disp_index){
	free(disp_index);
	disp_index = NULL;
      }
      if (NULL != recvtype){
	free(recvtype);
	recvtype=NULL;
      }
      if (NULL != recv_req){
	free(recv_req);
	recv_req = NULL;
      }
      if (NULL != global_buf) {
	free (global_buf);
	global_buf = NULL;
      }
      for(l=0;l<fh->f_procs_per_group;l++){
	if (NULL != blocklen_per_process[l]){
	  free(blocklen_per_process[l]);
	  blocklen_per_process[l] = NULL;
	}
	if (NULL != displs_per_process[l]){
	  free(displs_per_process[l]);
	  displs_per_process[l] = NULL;
	}
      }
      if (NULL != blocklen_per_process){
	free(blocklen_per_process);
	blocklen_per_process = NULL;
      }
      if (NULL != displs_per_process){
	free(displs_per_process);
	displs_per_process = NULL;
      }

    }
    
    if (NULL != sorted) {
        free (sorted);
        sorted = NULL;
    }
    if (NULL != global_iov_array) {
        free (global_iov_array);
	global_iov_array = NULL;
    }
    if (NULL != fview_count) {
        free (fview_count);
        fview_count = NULL;
    }
    if (NULL != decoded_iov) {
        free (decoded_iov);
        decoded_iov = NULL;
    }
    
    if (NULL != send_req){
      free(send_req);
      send_req = NULL;
    }

    return OMPI_SUCCESS;
}
示例#11
0
int mca_io_ompio_file_set_view (ompi_file_t *fp,
                                OMPI_MPI_OFFSET_TYPE disp,
                                ompi_datatype_t *etype,
                                ompi_datatype_t *filetype,
                                char *datarep,
                                ompi_info_t *info)
{
    mca_io_ompio_data_t *data;
    mca_io_ompio_file_t *fh;
    size_t ftype_size;
    OPAL_PTRDIFF_TYPE ftype_extent, lb;

    data = (mca_io_ompio_data_t *) fp->f_io_selected_data;
    fh = &data->ompio_fh;

    ompi_datatype_destroy (&fh->f_etype);
    ompi_datatype_destroy (&fh->f_filetype);
    ompi_datatype_destroy (&fh->f_orig_filetype);

    if (NULL != fh->f_decoded_iov) {
        free (fh->f_decoded_iov);
        fh->f_decoded_iov = NULL;
    }

    if (NULL != fh->f_datarep) {
        free (fh->f_datarep);
        fh->f_datarep = NULL;
    }

    /* Reset the flags first */
    fh->f_flags = 0;

    fh->f_flags |= OMPIO_FILE_VIEW_IS_SET;
    fh->f_datarep = strdup (datarep);
    ompi_datatype_duplicate (filetype, &fh->f_orig_filetype );

    opal_datatype_get_extent(&filetype->super, &lb, &ftype_extent);
    opal_datatype_type_size (&filetype->super, &ftype_size);

    if ( etype == filetype &&
            ompi_datatype_is_predefined (filetype ) &&
            ftype_extent == (OPAL_PTRDIFF_TYPE)ftype_size ) {
        ompi_datatype_t *newfiletype;
        ompi_datatype_create_contiguous(MCA_IO_DEFAULT_FILE_VIEW_SIZE,
                                        &ompi_mpi_byte.dt,
                                        &newfiletype);
        ompi_datatype_commit (&newfiletype);
        mca_io_ompio_set_view_internal (fh,
                                        disp,
                                        etype,
                                        newfiletype,
                                        datarep,
                                        info);
        ompi_datatype_destroy ( &newfiletype );
    }
    else {
        mca_io_ompio_set_view_internal (fh,
                                        disp,
                                        etype,
                                        filetype,
                                        datarep,
                                        info);
    }

    if (OMPI_SUCCESS != mca_fcoll_base_file_select (&data->ompio_fh,
            NULL)) {
        opal_output(1, "mca_fcoll_base_file_select() failed\n");
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
示例#12
0
static int unpack_ooo(void)
{
    ompi_datatype_t * t1;
    ompi_datatype_t * t2;
    ompi_datatype_t * type[2];
    ompi_datatype_t * newtype;
    MPI_Aint disp[2];
    int len[2], rc;

    rc = ompi_datatype_create_vector(2, 1, 2, MPI_INT, &t1);
    if (OMPI_SUCCESS != rc) {
        fprintf(stderr, "could not create vector t1\n");
        return 1;
    }
    rc = ompi_datatype_commit (&t1);
    if (OMPI_SUCCESS != rc) {
        fprintf(stderr, "could not commit vector t1\n");
        return 1;
    }

    rc = ompi_datatype_create_vector(2, 1, 2, MPI_DOUBLE, &t2);
    if (OMPI_SUCCESS != rc) {
        fprintf(stderr, "could not create vector t2\n");
        return 1;
    }
    rc = ompi_datatype_commit (&t2);
    if (OMPI_SUCCESS != rc) {
        fprintf(stderr, "could not commit vector t2\n");
        return 1;
    }

/*
 * btl=0x7f7823672580 bytes_received=992 data_offset=0
 * btl=0x7f7823260420 bytes_received=1325 data_offset=992
 * btl=0x7f7823672580 bytes_received=992 data_offset=2317
 * btl=0x7f7823672580 bytes_received=992 data_offset=3309
 * btl=0x7f7823672580 bytes_received=992 data_offset=4301
 * btl=0x7f7823672580 bytes_received=992 data_offset=5293
 * btl=0x7f7823672580 bytes_received=992 data_offset=6285
 * btl=0x7f7823672580 bytes_received=667 data_offset=7277
 */
    size_t test1[9][2] = {
        {992, 0},
        {1325, 992},
        {992, 2317},
        {992, 3309},
        {992, 4301},
        {992, 5293},
        {992, 6285},
        {667, 7277},
        {0, -1},
    };

/*
 * btl=0x7f80bc545580 bytes_received=992 data_offset=0
 * btl=0x7f80bc545580 bytes_received=992 data_offset=2317
 * btl=0x7f80bc545580 bytes_received=992 data_offset=3309
 * btl=0x7f80bc545580 bytes_received=992 data_offset=4301
 * btl=0x7f80bc545580 bytes_received=992 data_offset=5293
 * btl=0x7f80bc545580 bytes_received=992 data_offset=6285
 * btl=0x7f80bc133420 bytes_received=1325 data_offset=992
 * btl=0x7f80bc545580 bytes_received=667 data_offset=7277
 */
    size_t test2[9][2] = {
        {992, 0},
        {992, 2317},
        {992, 3309},
        {992, 4301},
        {992, 5293},
        {992, 6285},
        {1325, 992},
        {667, 7277},
        {0, -1},
    };

/* trimmed version of test2 */
    size_t test3[9][2] = {
        {992, 0},
        {4960, 2317},
        {1325, 992},
        {667, 7277},
        {0, -1},
    };

/* an other test case */
    size_t test4[9][2] = {
        {992, 0},
        {992, 2976},
        {992, 1984},
        {992, 992},
        {3976, 3968},
        {0, -1},
    };

    disp[0] = (long)(&foo.i[0]) - (long)&foo;
    disp[1] = (long)(&foo.d[0]) - (long)&foo;

    type[0] = t1;
    type[1] = t2;

    len[0] = 1;
    len[1] = 1;

    rc = ompi_datatype_create_struct(2, len, disp, type, &newtype);
    if (OMPI_SUCCESS != rc) {
        fprintf(stderr, "could not create struct\n");
        return 1;
    }
    rc = ompi_datatype_commit (&newtype);
    if (OMPI_SUCCESS != rc) {
        fprintf(stderr, "could not create struct\n");
        return 1;
    }
    
    pbar = (struct pfoo_t *)malloc (N * sizeof(struct pfoo_t));
    if (NULL == pbar) {
        fprintf(stderr, "could not malloc pbar\n");
        return 1;
    }
    bar = (struct foo_t *)malloc (N * sizeof(struct foo_t));
    if (NULL == bar) {
        fprintf(stderr, "could not malloc bar\n");
        return 1;
    }

    if (0 != testcase(newtype, test1)) {
        printf ("test1 failed\n");
        return 2;
    }

    if (0 != testcase(newtype, test2)) {
        printf ("test2 failed\n");
        return 2;
    }

    if (0 != testcase(newtype, test3)) {
        printf ("test3 failed\n");
        return 2;
    }

    if (0 != testcase(newtype, test4)) {
        printf ("test4 failed\n");
        return 2;
    }


    /* test the automatic destruction pf the data */
    ompi_datatype_destroy( &newtype ); assert( newtype == NULL );

    return rc;
}
int
mca_fcoll_dynamic_file_write_all (ompio_file_t *fh,
                                  const void *buf,
                                  int count,
                                  struct ompi_datatype_t *datatype,
                                  ompi_status_public_t *status)
{
    MPI_Aint total_bytes_written = 0;  /* total bytes that have been written*/
    MPI_Aint total_bytes = 0;          /* total bytes to be written */
    MPI_Aint bytes_to_write_in_cycle = 0; /* left to be written in a cycle*/
    MPI_Aint bytes_per_cycle = 0;      /* total written in each cycle by each process*/
    int index = 0;
    int cycles = 0;
    int i=0, j=0, l=0;
    int n=0; /* current position in total_bytes_per_process array */
    MPI_Aint bytes_remaining = 0; /* how many bytes have been written from the current
                                     value from total_bytes_per_process */
    int bytes_sent = 0, ret =0;
    int blocks=0, entries_per_aggregator=0;

    /* iovec structure and count of the buffer passed in */
    uint32_t iov_count = 0;
    struct iovec *decoded_iov = NULL;
    int iov_index = 0;
    char *send_buf = NULL;
    size_t current_position = 0;
    struct iovec *local_iov_array=NULL, *global_iov_array=NULL;
    mca_io_ompio_local_io_array *file_offsets_for_agg=NULL;
    /* global iovec at the writers that contain the iovecs created from
       file_set_view */
    uint32_t total_fview_count = 0;
    int local_count = 0, temp_pindex;
    int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL;
    int current_index = 0, temp_index=0;

    char *global_buf = NULL;
    MPI_Aint global_count = 0;


    /* array that contains the sorted indices of the global_iov */
    int *sorted = NULL, *sorted_file_offsets=NULL;
    int *displs = NULL;
    int dynamic_num_io_procs;
    size_t max_data = 0, datatype_size = 0;
    int **blocklen_per_process=NULL;
    MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL;
    ompi_datatype_t **recvtype = NULL;
    MPI_Aint *total_bytes_per_process = NULL;
    MPI_Request send_req=NULL, *recv_req=NULL;
    int my_aggregator=-1;
    bool sendbuf_is_contiguous = false;
    size_t ftype_size;
    ptrdiff_t ftype_extent, lb;


#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0;
    double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0;
    double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0;
    mca_common_ompio_print_entry nentry;
#endif

    opal_datatype_type_size ( &datatype->super, &ftype_size );
    opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent );

    /**************************************************************************
     ** 1.  In case the data is not contigous in memory, decode it into an iovec
     **************************************************************************/
    if ( ( ftype_extent == (ptrdiff_t) ftype_size)             &&
         opal_datatype_is_contiguous_memory_layout(&datatype->super,1) &&
         0 == lb ) {
        sendbuf_is_contiguous = true;
    }



    if (! sendbuf_is_contiguous ) {
        ret =   mca_common_ompio_decode_datatype ((struct ompio_file_t *) fh,
                                                  datatype,
                                                  count,
                                                  buf,
                                                  &max_data,
                                                  &decoded_iov,
                                                  &iov_count);
        if (OMPI_SUCCESS != ret ){
            goto exit;
        }
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
	status->_ucount = max_data;
    }

    dynamic_num_io_procs = fh->f_get_mca_parameter_value ( "num_aggregators", strlen ("num_aggregators"));
    if ( OMPI_ERR_MAX == dynamic_num_io_procs ) {
        ret = OMPI_ERROR;
        goto exit;
    }
    ret = mca_common_ompio_set_aggregator_props ((struct ompio_file_t *) fh,
				                 dynamic_num_io_procs,
				                 max_data);

    if (OMPI_SUCCESS != ret){
	goto exit;
    }
    my_aggregator = fh->f_procs_in_group[0];
    /**************************************************************************
     ** 2. Determine the total amount of data to be written
     **************************************************************************/
    total_bytes_per_process = (MPI_Aint*)malloc
        (fh->f_procs_per_group*sizeof(MPI_Aint));
    if (NULL == total_bytes_per_process) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_comm_time = MPI_Wtime();
#endif
    ret = ompi_fcoll_base_coll_allgather_array (&max_data,
                                           1,
                                           MPI_LONG,
                                           total_bytes_per_process,
                                           1,
                                           MPI_LONG,
                                           0,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);
    
    if( OMPI_SUCCESS != ret){
	goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_comm_time = MPI_Wtime();
    comm_time += (end_comm_time - start_comm_time);
#endif

    for (i=0 ; i<fh->f_procs_per_group ; i++) {
        total_bytes += total_bytes_per_process[i];
    }

    if (NULL != total_bytes_per_process) {
        free (total_bytes_per_process);
        total_bytes_per_process = NULL;
    }

    /*********************************************************************
     *** 3. Generate the local offsets/lengths array corresponding to
     ***    this write operation
     ********************************************************************/
    ret = fh->f_generate_current_file_view( (struct ompio_file_t *) fh,
					    max_data,
					    &local_iov_array,
					    &local_count);
    if (ret != OMPI_SUCCESS){
	goto exit;
    }

#if DEBUG_ON
    for (i=0 ; i<local_count ; i++) {

        printf("%d: OFFSET: %d   LENGTH: %ld\n",
               fh->f_rank,
               local_iov_array[i].iov_base,
               local_iov_array[i].iov_len);

    }
#endif

    /*************************************************************
     *** 4. Allgather the offset/lengths array from all processes
     *************************************************************/
    fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == fview_count) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_comm_time = MPI_Wtime();
#endif
    ret = ompi_fcoll_base_coll_allgather_array (&local_count,
                                           1,
                                           MPI_INT,
                                           fview_count,
                                           1,
                                           MPI_INT,
                                           0,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);
    
    if( OMPI_SUCCESS != ret){
	goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_comm_time = MPI_Wtime();
    comm_time += (end_comm_time - start_comm_time);
#endif

    displs = (int*) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == displs) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }

    displs[0] = 0;
    total_fview_count = fview_count[0];
    for (i=1 ; i<fh->f_procs_per_group ; i++) {
        total_fview_count += fview_count[i];
        displs[i] = displs[i-1] + fview_count[i-1];
    }

#if DEBUG_ON
    printf("total_fview_count : %d\n", total_fview_count);
    if (my_aggregator == fh->f_rank) {
        for (i=0 ; i<fh->f_procs_per_group ; i++) {
            printf ("%d: PROCESS: %d  ELEMENTS: %d  DISPLS: %d\n",
                    fh->f_rank,
                    i,
                    fview_count[i],
                    displs[i]);
        }
    }
#endif

    /* allocate the global iovec  */

    if (0 != total_fview_count) {
        global_iov_array = (struct iovec*) malloc (total_fview_count *
                                                   sizeof(struct iovec));
        if (NULL == global_iov_array){
            opal_output(1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_comm_time = MPI_Wtime();
#endif
    ret = ompi_fcoll_base_coll_allgatherv_array (local_iov_array,
                                            local_count,
                                            fh->f_iov_type,
                                            global_iov_array,
                                            fview_count,
                                            displs,
                                            fh->f_iov_type,
                                            0,
                                            fh->f_procs_in_group,
                                            fh->f_procs_per_group,
                                            fh->f_comm);
    if (OMPI_SUCCESS != ret){
	goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_comm_time = MPI_Wtime();
    comm_time += (end_comm_time - start_comm_time);
#endif

    /****************************************************************************************
    *** 5. Sort the global offset/lengths list based on the offsets.
    *** The result of the sort operation is the 'sorted', an integer array,
    *** which contains the indexes of the global_iov_array based on the offset.
    *** For example, if global_iov_array[x].offset is followed by global_iov_array[y].offset
    *** in the file, and that one is followed by global_iov_array[z].offset, than
    *** sorted[0] = x, sorted[1]=y and sorted[2]=z;
    ******************************************************************************************/
    if (0 != total_fview_count) {
        sorted = (int *)malloc (total_fview_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }
	ompi_fcoll_base_sort_iovec (global_iov_array, total_fview_count, sorted);
    }

    if (NULL != local_iov_array){
	free(local_iov_array);
	local_iov_array = NULL;
    }

    if (NULL != displs){
	free(displs);
	displs=NULL;
    }


#if DEBUG_ON
    if (my_aggregator == fh->f_rank) {
        uint32_t tv=0;
        for (tv=0 ; tv<total_fview_count ; tv++) {
            printf("%d: OFFSET: %lld   LENGTH: %ld\n",
                   fh->f_rank,
                   global_iov_array[sorted[tv]].iov_base,
                   global_iov_array[sorted[tv]].iov_len);
        }
    }
#endif
    /*************************************************************
     *** 6. Determine the number of cycles required to execute this
     ***    operation
     *************************************************************/
    bytes_per_cycle = fh->f_bytes_per_agg;
    cycles = ceil((double)total_bytes/bytes_per_cycle);

    if (my_aggregator == fh->f_rank) {
        disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	blocklen_per_process = (int **)calloc (fh->f_procs_per_group, sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	displs_per_process = (MPI_Aint **)calloc (fh->f_procs_per_group, sizeof (MPI_Aint*));
	if (NULL == displs_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	recv_req = (MPI_Request *)malloc ((fh->f_procs_per_group)*sizeof(MPI_Request));
	if ( NULL == recv_req ) {
	    opal_output (1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	global_buf  = (char *) malloc (bytes_per_cycle);
	if (NULL == global_buf){
	    opal_output(1, "OUT OF MEMORY");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	recvtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group  * sizeof(ompi_datatype_t *));
	if (NULL == recvtype) {
	    opal_output (1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}
	for(l=0;l<fh->f_procs_per_group;l++){
            recvtype[l] = MPI_DATATYPE_NULL;
	}
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_exch = MPI_Wtime();
#endif
    n = 0;
    bytes_remaining = 0;
    current_index = 0;

    for (index = 0; index < cycles; index++) {
        /**********************************************************************
         ***  7a. Getting ready for next cycle: initializing and freeing buffers
	 **********************************************************************/
        if (my_aggregator == fh->f_rank) {
            if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
	    fh->f_num_of_io_entries = 0;

            if (NULL != recvtype){
                for (i =0; i< fh->f_procs_per_group; i++) {
                    if ( MPI_DATATYPE_NULL != recvtype[i] ) {
                        ompi_datatype_destroy(&recvtype[i]);
			recvtype[i] = MPI_DATATYPE_NULL;
                    }
                }
            }

            for(l=0;l<fh->f_procs_per_group;l++){
                disp_index[l] =  1;

                free(blocklen_per_process[l]);
                free(displs_per_process[l]);

                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l] || NULL == blocklen_per_process[l]){
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }

            if (NULL != sorted_file_offsets){
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }

            if(NULL != file_offsets_for_agg){
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }

            if (NULL != memory_displacements){
                free(memory_displacements);
                memory_displacements = NULL;
            }

        } /* (my_aggregator == fh->f_rank */

        /**************************************************************************
         ***  7b. Determine the number of bytes to be actually written in this cycle
	 **************************************************************************/
        if (cycles-1 == index) {
            bytes_to_write_in_cycle = total_bytes - bytes_per_cycle*index;
        }
        else {
            bytes_to_write_in_cycle = bytes_per_cycle;
        }

#if DEBUG_ON
        if (my_aggregator == fh->f_rank) {
            printf ("****%d: CYCLE %d   Bytes %lld**********\n",
                    fh->f_rank,
                    index,
                    bytes_to_write_in_cycle);
        }
#endif
        /**********************************************************
         **Gather the Data from all the processes at the writers **
         *********************************************************/

#if DEBUG_ON
        printf("bytes_to_write_in_cycle: %ld, cycle : %d\n", bytes_to_write_in_cycle,
	       index);
#endif

        /*****************************************************************
         *** 7c. Calculate how much data will be contributed in this cycle
	 ***     by each process
         *****************************************************************/
        bytes_sent = 0;

        /* The blocklen and displs calculation only done at aggregators!*/
        while (bytes_to_write_in_cycle) {

	    /* This next block identifies which process is the holder
	    ** of the sorted[current_index] element;
	    */
            blocks = fview_count[0];
            for (j=0 ; j<fh->f_procs_per_group ; j++) {
                if (sorted[current_index] < blocks) {
                    n = j;
                    break;
                }
                else {
                    blocks += fview_count[j+1];
                }
            }

            if (bytes_remaining) {
                /* Finish up a partially used buffer from the previous  cycle */

                if (bytes_remaining <= bytes_to_write_in_cycle) {
                    /* The data fits completely into the block */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining;
                        displs_per_process[n][disp_index[n] - 1] =
                            (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len
                             - bytes_remaining);

                        /* In this cases the length is consumed so allocating for
                           next displacement and blocklength*/
                        blocklen_per_process[n] = (int *) realloc
                            ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
                        displs_per_process[n] = (MPI_Aint *) realloc
                            ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_sent += bytes_remaining;
                    }
                    current_index ++;
                    bytes_to_write_in_cycle -= bytes_remaining;
                    bytes_remaining = 0;
                    continue;
                }
                else {
                    /* the remaining data from the previous cycle is larger than the
		       bytes_to_write_in_cycle, so we have to segment again */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len
                             - bytes_remaining);
                    }

                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_sent += bytes_to_write_in_cycle;
                    }
                    bytes_remaining -= bytes_to_write_in_cycle;
                    bytes_to_write_in_cycle = 0;
                    break;
                }
            }
            else {
                 /* No partially used entry available, have to start a new one */
                if (bytes_to_write_in_cycle <
                    (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) {
                     /* This entry has more data than we can sendin one cycle */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base ;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_sent += bytes_to_write_in_cycle;

                    }
                    bytes_remaining = global_iov_array[sorted[current_index]].iov_len -
                        bytes_to_write_in_cycle;
                    bytes_to_write_in_cycle = 0;
                    break;
                }
                else {
                    /* Next data entry is less than bytes_to_write_in_cycle */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] =
                            global_iov_array[sorted[current_index]].iov_len;
                        displs_per_process[n][disp_index[n] - 1] = (ptrdiff_t)
                            global_iov_array[sorted[current_index]].iov_base;

                        /*realloc for next blocklength
                          and assign this displacement and check for next displs as
                          the total length of this entry has been consumed!*/
                        blocklen_per_process[n] =
                            (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
                        displs_per_process[n] = (MPI_Aint *)realloc
                            ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_sent += global_iov_array[sorted[current_index]].iov_len;
                    }
                    bytes_to_write_in_cycle -=
                        global_iov_array[sorted[current_index]].iov_len;
                    current_index ++;
                    continue;
                }
            }
        }


        /*************************************************************************
	 *** 7d. Calculate the displacement on where to put the data and allocate
         ***     the recieve buffer (global_buf)
	 *************************************************************************/
        if (my_aggregator == fh->f_rank) {
            entries_per_aggregator=0;
            for (i=0;i<fh->f_procs_per_group; i++){
                for (j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0)
                        entries_per_aggregator++ ;
                }
            }

#if DEBUG_ON
            printf("%d: cycle: %d, bytes_sent: %d\n ",fh->f_rank,index,
                   bytes_sent);
            printf("%d : Entries per aggregator : %d\n",fh->f_rank,entries_per_aggregator);
#endif

            if (entries_per_aggregator > 0){
                file_offsets_for_agg = (mca_io_ompio_local_io_array *)
                    malloc(entries_per_aggregator*sizeof(mca_io_ompio_local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }

                sorted_file_offsets = (int *)
                    malloc (entries_per_aggregator*sizeof(int));
                if (NULL == sorted_file_offsets){
                    opal_output (1, "OUT OF MEMORY\n");
                    ret =  OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }

                /*Moving file offsets to an IO array!*/
                temp_index = 0;

                for (i=0;i<fh->f_procs_per_group; i++){
                    for(j=0;j<disp_index[i];j++){
                        if (blocklen_per_process[i][j] > 0){
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;

#if DEBUG_ON
                            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                                   index+1,fh->f_rank);

                            printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
                                   fh->f_procs_in_group[i],j,
                                   blocklen_per_process[i][j],j,
                                   displs_per_process[i][j],
                                   fh->f_rank);
#endif
                        }
                    }
                }
            }
            else{
                continue;
            }
            /* Sort the displacements for each aggregator*/
            local_heap_sort (file_offsets_for_agg,
                             entries_per_aggregator,
                             sorted_file_offsets);

            /*create contiguous memory displacements
              based on blocklens on the same displs array
              and map it to this aggregator's actual
              file-displacements (this is in the io-array created above)*/
            memory_displacements = (MPI_Aint *) malloc
                (entries_per_aggregator * sizeof(MPI_Aint));

            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++){
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }

            temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            /*Now update the displacements array  with memory offsets*/
            global_count = 0;
            for (i=0;i<entries_per_aggregator;i++){
                temp_pindex =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_pindex] < disp_index[temp_pindex])
                    temp_disp_index[temp_pindex] += 1;
                else{
                    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
                           temp_pindex, temp_disp_index[temp_pindex],
                           temp_pindex, disp_index[temp_pindex]);
                }
                global_count +=
                    file_offsets_for_agg[sorted_file_offsets[i]].length;
            }

            if (NULL != temp_disp_index){
                free(temp_disp_index);
                temp_disp_index = NULL;
            }

#if DEBUG_ON

            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0;i<fh->f_procs_per_group; i++){
                for(j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0){
                        printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
                               fh->f_procs_in_group[i],j,
                               blocklen_per_process[i][j],j,
                               displs_per_process[i][j],
                               fh->f_rank);

                    }
                }
            }
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0; i<entries_per_aggregator;i++){
                printf("%d: OFFSET: %lld   LENGTH: %ld, Mem-offset: %ld\n",
                       file_offsets_for_agg[sorted_file_offsets[i]].process_id,
                       file_offsets_for_agg[sorted_file_offsets[i]].offset,
                       file_offsets_for_agg[sorted_file_offsets[i]].length,
                       memory_displacements[sorted_file_offsets[i]]);
            }
            printf("%d : global_count : %ld, bytes_sent : %d\n",
                   fh->f_rank,global_count, bytes_sent);
#endif
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_comm_time = MPI_Wtime();
#endif
        /*************************************************************************
	 *** 7e. Perform the actual communication
	 *************************************************************************/
            for (i=0;i<fh->f_procs_per_group; i++) {
                recv_req[i] = MPI_REQUEST_NULL;
                if ( 0 < disp_index[i] ) {
                    ompi_datatype_create_hindexed(disp_index[i],
                                                  blocklen_per_process[i],
                                                  displs_per_process[i],
                                                  MPI_BYTE,
                                                  &recvtype[i]);
                    ompi_datatype_commit(&recvtype[i]);
                    opal_datatype_type_size(&recvtype[i]->super, &datatype_size);

                    if (datatype_size){
                        ret = MCA_PML_CALL(irecv(global_buf,
                                                 1,
                                                 recvtype[i],
                                                 fh->f_procs_in_group[i],
                                                 123,
                                                 fh->f_comm,
                                                 &recv_req[i]));
                        if (OMPI_SUCCESS != ret){
                            goto exit;
                        }
                    }
                }
            }
        } /* end if (my_aggregator == fh->f_rank ) */


        if ( sendbuf_is_contiguous ) {
            send_buf = &((char*)buf)[total_bytes_written];
        }
        else if (bytes_sent) {
            /* allocate a send buffer and copy the data that needs
               to be sent into it in case the data is non-contigous
               in memory */
            ptrdiff_t mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;

            send_buf = malloc (bytes_sent);
            if (NULL == send_buf) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            remaining = bytes_sent;

            while (remaining) {
                mem_address = (ptrdiff_t)
                    (decoded_iov[iov_index].iov_base) + current_position;

                if (remaining >=
                    (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *)mem_address,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                        (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                        (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *) mem_address,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }
	}
	total_bytes_written += bytes_sent;

	/* Gather the sendbuf from each process in appropritate locations in
           aggregators*/

	if (bytes_sent){
            ret = MCA_PML_CALL(isend(send_buf,
                                     bytes_sent,
                                     MPI_BYTE,
                                     my_aggregator,
                                     123,
                                     MCA_PML_BASE_SEND_STANDARD,
                                     fh->f_comm,
                                     &send_req));


	    if ( OMPI_SUCCESS != ret ){
		goto exit;
	    }

	    ret = ompi_request_wait(&send_req, MPI_STATUS_IGNORE);
	    if (OMPI_SUCCESS != ret){
		goto exit;
	    }
	}

	if (my_aggregator == fh->f_rank) {
            ret = ompi_request_wait_all (fh->f_procs_per_group,
                                         recv_req,
                                         MPI_STATUS_IGNORE);

            if (OMPI_SUCCESS != ret){
                goto exit;
            }
	}

#if DEBUG_ON
	if (my_aggregator == fh->f_rank){
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0 ; i<global_count/4 ; i++)
                printf (" RECV %d \n",((int *)global_buf)[i]);
	}
#endif

        if (! sendbuf_is_contiguous) {
            if (NULL != send_buf) {
                free (send_buf);
                send_buf = NULL;
            }
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_comm_time = MPI_Wtime();
        comm_time += (end_comm_time - start_comm_time);
#endif
        /**********************************************************
         *** 7f. Create the io array, and pass it to fbtl
         *********************************************************/

	if (my_aggregator == fh->f_rank) {

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
	    start_write_time = MPI_Wtime();
#endif

            fh->f_io_array = (mca_common_ompio_io_array_t *) malloc
                (entries_per_aggregator * sizeof (mca_common_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            fh->f_num_of_io_entries = 0;
            /*First entry for every aggregator*/
            fh->f_io_array[0].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[0].length =
                file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[0].memory_address =
                global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;

            for (i=1;i<entries_per_aggregator;i++){
                /* If the enrties are contiguous merge them,
                   else make a new entry */
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                    file_offsets_for_agg[sorted_file_offsets[i]].offset){
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else {
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }

            }

#if DEBUG_ON
            printf("*************************** %d\n", fh->f_num_of_io_entries);
            for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
                printf(" ADDRESS: %p  OFFSET: %ld   LENGTH: %ld\n",
                       fh->f_io_array[i].memory_address,
                       (ptrdiff_t)fh->f_io_array[i].offset,
                       fh->f_io_array[i].length);
            }

#endif

            if (fh->f_num_of_io_entries) {
                if ( 0 >  fh->f_fbtl->fbtl_pwritev (fh)) {
                    opal_output (1, "WRITE FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_write_time = MPI_Wtime();
            write_time += end_write_time - start_write_time;
#endif


	} /* end if (my_aggregator == fh->f_rank) */
    } /* end  for (index = 0; index < cycles; index++) */

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_exch = MPI_Wtime();
    exch_write += end_exch - start_exch;
    nentry.time[0] = write_time;
    nentry.time[1] = comm_time;
    nentry.time[2] = exch_write;
    if (my_aggregator == fh->f_rank)
	nentry.aggregator = 1;
    else
	nentry.aggregator = 0;
    nentry.nprocs_for_coll = dynamic_num_io_procs;
    if (!mca_common_ompio_full_print_queue(fh->f_coll_write_time)){
        mca_common_ompio_register_print_entry(fh->f_coll_write_time,
                                              nentry);
    }
#endif


exit :
    if (my_aggregator == fh->f_rank) {
        if (NULL != sorted_file_offsets){
            free(sorted_file_offsets);
            sorted_file_offsets = NULL;
        }
        if(NULL != file_offsets_for_agg){
            free(file_offsets_for_agg);
            file_offsets_for_agg = NULL;
        }
        if (NULL != memory_displacements){
            free(memory_displacements);
            memory_displacements = NULL;
        }
        if (NULL != recvtype){
            for (i =0; i< fh->f_procs_per_group; i++) {
                if ( MPI_DATATYPE_NULL != recvtype[i] ) {
                    ompi_datatype_destroy(&recvtype[i]);
                }
            }
            free(recvtype);
            recvtype=NULL;
        }

        if (NULL != fh->f_io_array) {
            free (fh->f_io_array);
            fh->f_io_array = NULL;
        }
        if (NULL != disp_index){
            free(disp_index);
            disp_index = NULL;
        }
        if (NULL != recvtype){
            free(recvtype);
            recvtype=NULL;
        }
        if (NULL != recv_req){
            free(recv_req);
            recv_req = NULL;
        }
        if (NULL != global_buf) {
            free (global_buf);
            global_buf = NULL;
        }
        for(l=0;l<fh->f_procs_per_group;l++){
            if (NULL != blocklen_per_process){
                free(blocklen_per_process[l]);
            }
            if (NULL != displs_per_process){
                free(displs_per_process[l]);
            }
        }

        free(blocklen_per_process);
        free(displs_per_process);
    }

    if (NULL != displs){
	free(displs);
	displs=NULL;
    }

    if (! sendbuf_is_contiguous) {
	if (NULL != send_buf) {
	    free (send_buf);
	    send_buf = NULL;
	}
    }
    if (NULL != global_buf) {
        free (global_buf);
        global_buf = NULL;
    }
    if (NULL != sorted) {
        free (sorted);
        sorted = NULL;
    }
    if (NULL != global_iov_array) {
        free (global_iov_array);
	global_iov_array = NULL;
    }
    if (NULL != fview_count) {
        free (fview_count);
        fview_count = NULL;
    }
    if (NULL != decoded_iov) {
        free (decoded_iov);
        decoded_iov = NULL;
    }


    return OMPI_SUCCESS;
}
示例#14
0
/*
 *	allgatherv_inter
 *
 *	Function:	- allgatherv using other MPI collectives
 *	Accepts:	- same as MPI_Allgatherv()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_inter_allgatherv_inter(const void *sbuf, int scount,
                                struct ompi_datatype_t *sdtype,
                                void *rbuf, const int *rcounts, const int *disps,
                                struct ompi_datatype_t *rdtype,
                                struct ompi_communicator_t *comm,
                               mca_coll_base_module_t *module)
{
    int i, rank, size, size_local, total=0, err;
    int *count=NULL,*displace=NULL;
    char *ptmp_free=NULL, *ptmp=NULL;
    ompi_datatype_t *ndtype = NULL;

    rank = ompi_comm_rank(comm);
    size_local = ompi_comm_size(comm->c_local_comm);
    size = ompi_comm_remote_size(comm);

    if (0 == rank) {
	count = (int *)malloc(sizeof(int) * size_local);
	displace = (int *)malloc(sizeof(int) * size_local);
	if ((NULL == count) || (NULL == displace)) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
	}
    }
    /* Local gather to get the scount of each process */
    err = comm->c_local_comm->c_coll->coll_gather(&scount, 1, MPI_INT,
						 count, 1, MPI_INT,
						 0, comm->c_local_comm,
                                                 comm->c_local_comm->c_coll->coll_gather_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }
    if(0 == rank) {
	displace[0] = 0;
	for (i = 1; i < size_local; i++) {
	    displace[i] = displace[i-1] + count[i-1];
	}
	total = 0;
	for (i = 0; i < size_local; i++) {
	    total = total + count[i];
	}
	if ( total > 0 ) {
            ptrdiff_t gap, span;
            span = opal_datatype_span(&sdtype->super, total, &gap);
	    ptmp_free = (char*)malloc(span);
	    if (NULL == ptmp_free) {
                err = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
	    }
            ptmp = ptmp_free - gap;
	}
    }
    err = comm->c_local_comm->c_coll->coll_gatherv(sbuf, scount, sdtype,
						  ptmp, count, displace,
						  sdtype,0, comm->c_local_comm,
                                                  comm->c_local_comm->c_coll->coll_gatherv_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    ompi_datatype_create_indexed(size,rcounts,disps,rdtype,&ndtype);
    ompi_datatype_commit(&ndtype);

    if (0 == rank) {
	/* Exchange data between roots */
        err = ompi_coll_base_sendrecv_actual(ptmp, total, sdtype, 0,
                                             MCA_COLL_BASE_TAG_ALLGATHERV,
	                                     rbuf, 1, ndtype, 0,
                                             MCA_COLL_BASE_TAG_ALLGATHERV,
                                             comm, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
    }

    /* bcast the message to all the local processes */
    err = comm->c_local_comm->c_coll->coll_bcast(rbuf, 1, ndtype,
						0, comm->c_local_comm,
                                                comm->c_local_comm->c_coll->coll_bcast_module);
  exit:
    if( NULL != ndtype ) {
        ompi_datatype_destroy(&ndtype);
    }
    if (NULL != ptmp_free) {
        free(ptmp_free);
    }
    if (NULL != displace) {
        free(displace);
    }
    if (NULL != count) {
        free(count);
    }

    return err;

}
int
ompio_io_ompio_file_close (mca_io_ompio_file_t *ompio_fh)
{
    int ret = OMPI_SUCCESS;
    int delete_flag = 0;
    char name[256];

    if(mca_io_ompio_coll_timing_info) {
        strcpy (name, "WRITE");
        if (!ompi_io_ompio_empty_print_queue(WRITE_PRINT_QUEUE)) {
            ret = ompi_io_ompio_print_time_info(WRITE_PRINT_QUEUE,
                                                name,
                                                ompio_fh);
            if (OMPI_SUCCESS != ret) {
                printf("Error in print_time_info ");
            }

        }
        strcpy (name, "READ");
        if (!ompi_io_ompio_empty_print_queue(READ_PRINT_QUEUE)) {
            ret = ompi_io_ompio_print_time_info(READ_PRINT_QUEUE,
                                                name,
                                                ompio_fh);
            if (OMPI_SUCCESS != ret) {
                printf("Error in print_time_info ");
            }
        }
    }
    if ( ompio_fh->f_amode & MPI_MODE_DELETE_ON_CLOSE ) {
        delete_flag = 1;
    }

    /*close the sharedfp file*/
    if(ompio_fh->f_sharedfp != NULL) {
        ret = ompio_fh->f_sharedfp->sharedfp_file_close(ompio_fh);
    }
    ret = ompio_fh->f_fs->fs_file_close (ompio_fh);
    if ( delete_flag && 0 == ompio_fh->f_rank ) {
        mca_io_ompio_file_delete ( ompio_fh->f_filename, MPI_INFO_NULL );
    }

    mca_fs_base_file_unselect (ompio_fh);
    mca_fbtl_base_file_unselect (ompio_fh);
    mca_fcoll_base_file_unselect (ompio_fh);
    /* mca_sharedfp_base_file_unselect (ompio_fh) ; EG?*/

    if (NULL != ompio_fh->f_io_array) {
        free (ompio_fh->f_io_array);
        ompio_fh->f_io_array = NULL;
    }

    if (NULL != ompio_fh->f_init_procs_in_group) {
        free (ompio_fh->f_init_procs_in_group);
        ompio_fh->f_init_procs_in_group = NULL;
    }
    if (NULL != ompio_fh->f_procs_in_group) {
        free (ompio_fh->f_procs_in_group);
        ompio_fh->f_procs_in_group = NULL;
    }

    if (NULL != ompio_fh->f_decoded_iov) {
        free (ompio_fh->f_decoded_iov);
        ompio_fh->f_decoded_iov = NULL;
    }

    if (NULL != ompio_fh->f_convertor) {
        free (ompio_fh->f_convertor);
        ompio_fh->f_convertor = NULL;
    }

    if (NULL != ompio_fh->f_datarep) {
        free (ompio_fh->f_datarep);
        ompio_fh->f_datarep = NULL;
    }


    if (MPI_DATATYPE_NULL != ompio_fh->f_iov_type) {
        ompi_datatype_destroy (&ompio_fh->f_iov_type);
    }

    if (MPI_COMM_NULL != ompio_fh->f_comm)  {
        ompi_comm_free (&ompio_fh->f_comm);
    }


    /*
    if (MPI_INFO_NULL != ompio_fh->f_info)
    {
        ompi_info_free (&ompio_fh->f_info);
    }
    */


    return ret;
}
示例#16
0
int
mca_fcoll_static_file_write_all (mca_io_ompio_file_t *fh,
                                 void *buf,
                                 int count,
                                 struct ompi_datatype_t *datatype,
                                 ompi_status_public_t *status)
{
    
    
    
    size_t max_data = 0, bytes_per_cycle=0;
    struct iovec *iov=NULL, *decoded_iov=NULL;
    uint32_t iov_count=0, iov_index=0;
    int i=0,j=0,l=0, temp_index;
    int ret=OMPI_SUCCESS, cycles, local_cycles, *bytes_per_process=NULL;
    int index, *disp_index=NULL, **blocklen_per_process=NULL;
    int *iovec_count_per_process=NULL, *displs=NULL;
    size_t total_bytes_written=0;
    MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL;
    MPI_Aint bytes_to_write_in_cycle=0, global_iov_count=0, global_count=0;
    
    local_io_array *local_iov_array =NULL, *global_iov_array=NULL;
    local_io_array *file_offsets_for_agg=NULL;
    int *sorted=NULL, *sorted_file_offsets=NULL, temp_pindex, *temp_disp_index=NULL;
    char *send_buf=NULL, *global_buf=NULL;
    int iov_size=0, current_position=0, *current_index=NULL;
    int *bytes_remaining=NULL, entries_per_aggregator=0;
    ompi_datatype_t **recvtype = NULL;
    MPI_Request *send_req=NULL, *recv_req=NULL;
    /* For creating datatype of type io_array */
    int blocklen[3] = {1, 1, 1};
    int static_num_io_procs=1;
    OPAL_PTRDIFF_TYPE d[3], base;
    ompi_datatype_t *types[3];
    ompi_datatype_t *io_array_type=MPI_DATATYPE_NULL;
    /*----------------------------------------------*/
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0;
    double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0;
    double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0;
    mca_io_ompio_print_entry nentry;
#endif
    
    
#if DEBUG_ON
    MPI_Aint gc_in;
#endif
    
//  if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) {
//    fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY;
//  }
    
    /* In case the data is not contigous in memory, decode it into an iovec */
    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
        fh->f_decode_datatype ((struct mca_io_ompio_file_t *)fh,
                               datatype,
                               count,
                               buf,
                               &max_data,
                               &decoded_iov,
                               &iov_count);
    }
    else {
        max_data = count * datatype->super.size;
    }
    
    if ( MPI_STATUS_IGNORE != status ) {
	status->_ucount = max_data;
    }
    
    fh->f_get_num_aggregators ( & static_num_io_procs );
    fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *)fh,
				static_num_io_procs,
				max_data);
    
    
    /* io_array datatype  for using in communication*/
    types[0] = &ompi_mpi_long.dt;
    types[1] = &ompi_mpi_long.dt;
    types[2] = &ompi_mpi_int.dt;
    
    d[0] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0];
    d[1] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].length;
    d[2] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].process_id;
    base = d[0];
    for (i=0 ; i<3 ; i++) {
        d[i] -= base;
    }
    ompi_datatype_create_struct (3,
                                 blocklen,
                                 d,
                                 types,
                                 &io_array_type);
    ompi_datatype_commit (&io_array_type);
    /* #########################################################*/
    
    
    
    ret = fh->f_generate_current_file_view((struct mca_io_ompio_file_t *)fh,
                                           max_data,
                                           &iov,
                                           &iov_size);
    if (ret != OMPI_SUCCESS){
        fprintf(stderr,"Current File View Generation Error\n");
        goto exit;
    }
    
    if (0 == iov_size){
        iov_size  = 1;
    }
    
    local_iov_array = (local_io_array *)malloc (iov_size * sizeof(local_io_array));
    if ( NULL == local_iov_array){
        fprintf(stderr,"local_iov_array allocation error\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
    
    
    for (j=0; j < iov_size; j++){
        local_iov_array[j].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)
            iov[j].iov_base;
        local_iov_array[j].length = (size_t)iov[j].iov_len;
        local_iov_array[j].process_id = fh->f_rank;
        
    }
    
    fh->f_get_bytes_per_agg ( (int *) &bytes_per_cycle);
    
    
    local_cycles = ceil((double)max_data/bytes_per_cycle);
    ret = fh->f_comm->c_coll.coll_allreduce (&local_cycles,
                                             &cycles,
                                             1,
                                             MPI_INT,
                                             MPI_MAX,
                                             fh->f_comm,
                                             fh->f_comm->c_coll.coll_allreduce_module);
    
    if (OMPI_SUCCESS != ret){
        fprintf(stderr,"local cycles allreduce!\n");
        goto exit;
    }
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        
        disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int ));
        if (NULL == bytes_per_process){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        bytes_remaining = (int *) malloc (fh->f_procs_per_group * sizeof(int));
        if (NULL == bytes_remaining){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        current_index = (int *) malloc (fh->f_procs_per_group * sizeof(int));
        if (NULL == current_index){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        displs_per_process = (MPI_Aint **)
            malloc (fh->f_procs_per_group * sizeof (MPI_Aint*));
        
        if (NULL == displs_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        for(i=0;i<fh->f_procs_per_group;i++){
            current_index[i] = 0;
            bytes_remaining[i] =0;
            blocklen_per_process[i] = NULL;
            displs_per_process[i] = NULL;
        }
    }
    
    iovec_count_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int));
    if (NULL == iovec_count_per_process){
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
    
    displs = (int *) malloc (fh->f_procs_per_group * sizeof(int));
    if (NULL == displs){
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
    
    ret = fh->f_allgather_array (&iov_size,
                                 1,
                                 MPI_INT,
                                 iovec_count_per_process,
                                 1,
                                 MPI_INT,
                                 fh->f_aggregator_index,
                                 fh->f_procs_in_group,
                                 fh->f_procs_per_group,
                                 fh->f_comm);
    
    if( OMPI_SUCCESS != ret){
        fprintf(stderr,"iov size allgatherv array!\n");
        goto exit;
    }
    
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        displs[0] = 0;
        global_iov_count = iovec_count_per_process[0];
        for (i=1 ; i<fh->f_procs_per_group ; i++) {
            global_iov_count += iovec_count_per_process[i];
            displs[i] = displs[i-1] + iovec_count_per_process[i-1];
        }
    }
    
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        global_iov_array = (local_io_array *) malloc (global_iov_count *
                                                      sizeof(local_io_array));
        if (NULL == global_iov_array){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }
    
    ret = fh->f_gatherv_array (local_iov_array,
                               iov_size,
                               io_array_type,
                               global_iov_array,
                               iovec_count_per_process,
                               displs,
                               io_array_type,
                               fh->f_aggregator_index,
                               fh->f_procs_in_group,
                               fh->f_procs_per_group,
                               fh->f_comm);
    if (OMPI_SUCCESS != ret){
        fprintf(stderr,"global_iov_array gather error!\n");
        goto exit;
    }
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        
        if ( 0 == global_iov_count){
            global_iov_count =  1;
        }
        
        sorted = (int *)malloc (global_iov_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        local_heap_sort (global_iov_array, global_iov_count, sorted);
    }
    
#if DEBUG_ON
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        for (gc_in=0; gc_in<global_iov_count; gc_in++){
            printf("%d: Offset[%ld]: %lld, Length[%ld]: %ld\n",
                   global_iov_array[gc_in].process_id,
                   gc_in, global_iov_array[gc_in].offset,
                   gc_in, global_iov_array[gc_in].length);
        }
    }
#endif
    
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_exch = MPI_Wtime();
#endif
    
    
    for (index = 0; index < cycles; index++){
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            if (NULL == recvtype){
                recvtype = (ompi_datatype_t **)
                    malloc (fh->f_procs_per_group  * sizeof(ompi_datatype_t *));
                if (NULL == recvtype) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }
            for(l=0;l<fh->f_procs_per_group;l++){
                disp_index[l] =  1;
                if (NULL != blocklen_per_process[l]){
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
                if (NULL != displs_per_process[l]){
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                if (NULL == blocklen_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for blocklen\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l]){
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }
            if (NULL != sorted_file_offsets){
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }
            
            if(NULL != file_offsets_for_agg){
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }
            
            if (NULL != memory_displacements){
                free(memory_displacements);
                memory_displacements = NULL;
            }
            
        }
        if (local_cycles > index) {
            if ((index == local_cycles-1) && (max_data % bytes_per_cycle)) {
                bytes_to_write_in_cycle = max_data % bytes_per_cycle;
            }
            else if (max_data <= bytes_per_cycle) {
                bytes_to_write_in_cycle = max_data;
            }
            else {
                bytes_to_write_in_cycle = bytes_per_cycle;
            }
        }
        else {
            bytes_to_write_in_cycle = 0;
        }
#if DEBUG_ON
        /*    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {*/
        printf ("***%d: CYCLE %d   Bytes %ld**********\n",
                fh->f_rank,
                index,
                bytes_to_write_in_cycle);
        /* }*/
#endif
        /**********************************************************
         **Gather the Data from all the processes at the writers **
         *********************************************************/
        
        /* gather from each process how many bytes each will be sending */
        fh->f_gather_array (&bytes_to_write_in_cycle,
                            1,
                            MPI_INT,
                            bytes_per_process,
                            1,
                            MPI_INT,
                            fh->f_aggregator_index,
                            fh->f_procs_in_group,
                            fh->f_procs_per_group,
                            fh->f_comm);
        
        /*
          For each aggregator
          it needs to get bytes_to_write_in_cycle from each process
          in group which adds up to bytes_per_cycle
          
        */
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            for (i=0;i<fh->f_procs_per_group; i++){
/*	    printf("bytes_per_process[%d]: %d\n", i, bytes_per_process[i]);
 */
                
#if DEBUG_ON
                printf ("%d : bytes_per_process : %d\n",
                        fh->f_procs_in_group[i],
                        bytes_per_process[i]);
#endif
                
                while (bytes_per_process[i] > 0){
                    if (get_process_id(global_iov_array[sorted[current_index[i]]].process_id,
                                       fh) == i){ /* current id owns this entry!*/
                        
                        /*Add and subtract length and create
                          blocklength and displs array*/
                        if (bytes_remaining[i]){ /*Remaining bytes in the current entry of
                                                   the global offset array*/
                            if (bytes_remaining[i] <= bytes_per_process[i]){
                                blocklen_per_process[i][disp_index[i] - 1] = bytes_remaining[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);
                                
                                blocklen_per_process[i] = (int *) realloc
                                    ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *)realloc
                                    ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                bytes_per_process[i] -= bytes_remaining[i];
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                bytes_remaining[i] = 0;
                                disp_index[i] += 1;
                                /* This entry has been used up, we need to move to the
                                   next entry of this process and make current_index point there*/
                                current_index[i]  = find_next_index(i,
                                                                    current_index[i],
                                                                    fh,
                                                                    global_iov_array,
                                                                    global_iov_count,
                                                                    sorted);
                                if (current_index[i] == -1){
                                    /* No more entries left, so Its all done! exit!*/
                                    break;
                                }
                                continue;
                            }
                            else{
                                blocklen_per_process[i][disp_index[i] - 1] = bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);
                                bytes_remaining[i] -= bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                        }
                        else{
                            if (bytes_per_process[i] <
                                global_iov_array[sorted[current_index[i]]].length){
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;
                                
                                bytes_remaining[i] =
                                    global_iov_array[sorted[current_index[i]]].length -
                                    bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                            else {
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].length;
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;
                                blocklen_per_process[i] =
                                    (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *)realloc
                                    ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                disp_index[i] += 1;
                                bytes_per_process[i] -=
                                    global_iov_array[sorted[current_index[i]]].length;
                                current_index[i] = find_next_index(i,
                                                                   current_index[i],
                                                                   fh,
                                                                   global_iov_array,
                                                                   global_iov_count,
                                                                   sorted);
                                if (current_index[i] == -1){
                                    break;
                                }
                            }
                        }
                    }
                    else{
                        current_index[i] = find_next_index(i,
                                                           current_index[i],
                                                           fh,
                                                           global_iov_array,
                                                           global_iov_count,
                                                           sorted);
                        if (current_index[i] == -1){
                            bytes_per_process[i] = 0; /* no more entries left
                                                         to service this request*/
                            continue;
                        }
                    }
                }
            }
            entries_per_aggregator=0;
            for (i=0;i<fh->f_procs_per_group;i++){
                for (j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0){
                        entries_per_aggregator++;
#if DEBUG_ON
                        printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
                               fh->f_procs_in_group[i],j,
                               blocklen_per_process[i][j],j,
                               displs_per_process[i][j],
                               fh->f_rank);
                        
#endif
                    }
                    
                }
            }
            
            if (entries_per_aggregator > 0){
                file_offsets_for_agg = (local_io_array *)
                    malloc(entries_per_aggregator*sizeof(local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                sorted_file_offsets = (int *)
                    malloc (entries_per_aggregator*sizeof(int));
                if (NULL == sorted_file_offsets){
                    opal_output (1, "OUT OF MEMORY\n");
                    ret =  OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                temp_index = 0;
                for (i=0;i<fh->f_procs_per_group; i++){
                    for(j=0;j<disp_index[i];j++){
                        if (blocklen_per_process[i][j] > 0){
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;
                        }
                    }
                }
            }
            else{
                continue;
            }
            local_heap_sort (file_offsets_for_agg,
                             entries_per_aggregator,
                             sorted_file_offsets);
            
            memory_displacements = (MPI_Aint *) malloc
                (entries_per_aggregator * sizeof(MPI_Aint));
            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++){
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }
            
            temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            global_count = 0;
            for (i=0;i<entries_per_aggregator;i++){
                temp_pindex =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_pindex] < disp_index[temp_pindex])
                    temp_disp_index[temp_pindex] += 1;
                else{
                    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
                           temp_pindex, temp_disp_index[temp_pindex],
                           temp_pindex, disp_index[temp_pindex]);
                }
                global_count +=
                    file_offsets_for_agg[sorted_file_offsets[i]].length;
            }
            if (NULL != temp_disp_index){
                free(temp_disp_index);
                temp_disp_index = NULL;
            }
            
#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0; i<entries_per_aggregator;i++){
                printf("%d: OFFSET: %lld   LENGTH: %ld, Mem-offset: %ld, disp : %d\n",
                       file_offsets_for_agg[sorted_file_offsets[i]].process_id,
                       file_offsets_for_agg[sorted_file_offsets[i]].offset,
                       file_offsets_for_agg[sorted_file_offsets[i]].length,
                       memory_displacements[sorted_file_offsets[i]],
                       disp_index[ file_offsets_for_agg[sorted_file_offsets[i]].process_id]);
            }
#endif
            
#if DEBUG_ON
            printf("%d: global_count : %ld, bytes_to_write_in_cycle : %ld, procs_per_group: %d\n",
                   fh->f_rank,
                   global_count,
                   bytes_to_write_in_cycle,
                   fh->f_procs_per_group);
#endif
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_comm_time = MPI_Wtime();
#endif
            global_buf  = (char *) malloc (global_count);
            if (NULL == global_buf){
                opal_output(1, "OUT OF MEMORY");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            
            recv_req = (MPI_Request *)
                malloc (fh->f_procs_per_group * sizeof(MPI_Request));
            if (NULL == recv_req){
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            for (i=0;i<fh->f_procs_per_group; i++){
                ompi_datatype_create_hindexed(disp_index[i],
                                              blocklen_per_process[i],
                                              displs_per_process[i],
                                              MPI_BYTE,
                                              &recvtype[i]);
                ompi_datatype_commit(&recvtype[i]);
                ret = MCA_PML_CALL(irecv(global_buf,
                                         1,
                                         recvtype[i],
                                         fh->f_procs_in_group[i],
                                         123,
                                         fh->f_comm,
                                         &recv_req[i]));
                if (OMPI_SUCCESS != ret){
                    fprintf(stderr,"irecv Error!\n");
                    goto exit;
                }
            }
        }
        
        if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
            send_buf = &((char*)buf)[total_bytes_written];
        }
        else if (bytes_to_write_in_cycle) {
            /* allocate a send buffer and copy the data that needs
               to be sent into it in case the data is non-contigous
               in memory */
            OPAL_PTRDIFF_TYPE mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;
            
            send_buf = malloc (bytes_to_write_in_cycle);
            if (NULL == send_buf) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            remaining = bytes_to_write_in_cycle;
            
            while (remaining) {
                mem_address = (OPAL_PTRDIFF_TYPE)
                    (decoded_iov[iov_index].iov_base) + current_position;
                
                if (remaining >=
                    (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *)mem_address,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                        (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                        (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *)mem_address,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }
        }
        total_bytes_written += bytes_to_write_in_cycle;
        
        send_req = (MPI_Request *) malloc (sizeof(MPI_Request));
        if (NULL == send_req){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        ret = MCA_PML_CALL(isend(send_buf,
                                 bytes_to_write_in_cycle,
                                 MPI_BYTE,
                                 fh->f_procs_in_group[fh->f_aggregator_index],
                                 123,
                                 MCA_PML_BASE_SEND_STANDARD,
                                 fh->f_comm,
                                 send_req));
        
        if ( OMPI_SUCCESS != ret ){
            fprintf(stderr,"isend error!\n");
            goto exit;
        }
        
        ret = ompi_request_wait (send_req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }
        
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            ret = ompi_request_wait_all (fh->f_procs_per_group,
                                         recv_req,
                                         MPI_STATUS_IGNORE);
            if (OMPI_SUCCESS != ret){
                goto exit;
            }
            
#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){
                for (i=0 ; i<global_count/4 ; i++)
                    printf (" RECV %d \n",((int *)global_buf)[i]);
            }
#endif
        }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_comm_time = MPI_Wtime();
        comm_time += end_comm_time - start_comm_time;
#endif
        
        
        
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
                (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            fh->f_num_of_io_entries = 0;
            /*First entry for every aggregator*/
            fh->f_io_array[fh->f_num_of_io_entries].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[fh->f_num_of_io_entries].length =
                file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;
            for (i=1;i<entries_per_aggregator;i++){
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                    file_offsets_for_agg[sorted_file_offsets[i]].offset){
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else {
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }
            }
#if DEBUG_ON
            printf("*************************** %d\n", fh->f_num_of_io_entries);
            for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
                printf(" ADDRESS: %p  OFFSET: %ld   LENGTH: %ld\n",
                       fh->f_io_array[i].memory_address,
                       (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset,
                       fh->f_io_array[i].length);
            }
#endif
            
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_write_time = MPI_Wtime();
#endif
            
            if (fh->f_num_of_io_entries) {
                if ( 0 >  fh->f_fbtl->fbtl_pwritev (fh)) {
                    opal_output (1, "WRITE FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }
            
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_write_time = MPI_Wtime();
            write_time += end_write_time - start_write_time;
#endif
            
        }
        if (NULL != send_req){
            free(send_req);
            send_req = NULL;
        }
        
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            fh->f_num_of_io_entries = 0;
            if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
            for (i = 0; i < fh->f_procs_per_group; i++)
                ompi_datatype_destroy(recvtype+i);
            if (NULL != recvtype){
                free(recvtype);
                recvtype=NULL;
            }
            if (NULL != recv_req){
                free(recv_req);
                recv_req = NULL;
            }
            if (NULL != global_buf) {
                free (global_buf);
                global_buf = NULL;
            }
        }
    }
    
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_exch = MPI_Wtime();
    exch_write += end_exch - start_exch;
    nentry.time[0] = write_time;
    nentry.time[1] = comm_time;
    nentry.time[2] = exch_write;
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank)
        nentry.aggregator = 1;
    else
        nentry.aggregator = 0;
    nentry.nprocs_for_coll = static_num_io_procs;
    if (!fh->f_full_print_queue(WRITE_PRINT_QUEUE)){
	fh->f_register_print_entry(WRITE_PRINT_QUEUE,
				   nentry);
    }
#endif
    
    
    
exit:
    if (NULL != decoded_iov){
        free(decoded_iov);
        decoded_iov = NULL;
    }
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        
        if (NULL != local_iov_array){
            free(local_iov_array);
            local_iov_array = NULL;
        }
        for(l=0;l<fh->f_procs_per_group;l++){
            if (NULL != blocklen_per_process[l]){
                free(blocklen_per_process[l]);
                blocklen_per_process[l] = NULL;
            }
            if (NULL != displs_per_process[l]){
                free(displs_per_process[l]);
                displs_per_process[l] = NULL;
            }
        }
    }
    
    if (NULL != send_buf){
        free(send_buf);
        send_buf = NULL;
    }
    
    if (NULL != global_buf){
        free(global_buf);
        global_buf = NULL;
    }
    
    if (NULL != recvtype){
        free(recvtype);
        recvtype = NULL;
    }
    
    if (NULL != sorted_file_offsets){
        free(sorted_file_offsets);
        sorted_file_offsets = NULL;
    }
    
    if (NULL != file_offsets_for_agg){
        free(file_offsets_for_agg);
        file_offsets_for_agg = NULL;
    }
    
    if (NULL != memory_displacements){
        free(memory_displacements);
        memory_displacements = NULL;
    }
    
    if (NULL != displs_per_process){
        free(displs_per_process);
        displs_per_process = NULL;
    }
    
    if (NULL != blocklen_per_process){
        free(blocklen_per_process);
        blocklen_per_process = NULL;
    }
    
    if(NULL != current_index){
        free(current_index);
        current_index = NULL;
    }
    
    if(NULL != bytes_remaining){
        free(bytes_remaining);
        bytes_remaining = NULL;
    }
    
    if (NULL != disp_index){
        free(disp_index);
        disp_index = NULL;
    }
    
    if (NULL != sorted) {
        free(sorted);
        sorted = NULL;
    }
    
    return ret;
}
int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
                                         struct ompi_datatype_t *sdtype,
                                         void* rbuf, int rcount,
                                         struct ompi_datatype_t *rdtype,
                                         struct ompi_communicator_t *comm,
                                         mca_coll_base_module_t *module)
{
    int i, k, line = -1, rank, size, err = 0, weallocated = 0;
    int sendto, recvfrom, distance, *displs = NULL, *blen = NULL;
    char *tmpbuf = NULL, *tmpbuf_free = NULL;
    ptrdiff_t rlb, slb, tlb, sext, rext, tsext;
    struct ompi_datatype_t *new_ddt;
#ifdef blahblah
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
#endif

    if (MPI_IN_PLACE == sbuf) {
        return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
                                                            comm, module);
    }

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:alltoall_intra_bruck rank %d", rank));

    err = ompi_datatype_get_extent (sdtype, &slb, &sext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    err = ompi_datatype_get_true_extent(sdtype, &tlb,  &tsext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }


#ifdef blahblah
    /* try and SAVE memory by using the data segment hung off 
       the communicator if possible */
    if (data->mcct_num_reqs >= size) { 
        /* we have enought preallocated for displments and lengths */
        displs = (int*) data->mcct_reqs;
        blen = (int *) (displs + size);
        weallocated = 0;
    } 
    else { /* allocate the buffers ourself */
#endif
        displs = (int *) malloc(size * sizeof(int));
        if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; }
        blen = (int *) malloc(size * sizeof(int));
        if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; }
        weallocated = 1;
#ifdef blahblah
    }
#endif

    /* tmp buffer allocation for message data */
    tmpbuf_free = (char *) malloc(tsext + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sext);
    if (tmpbuf_free == NULL) { line = __LINE__; err = -1; goto err_hndl; }
    tmpbuf = tmpbuf_free - slb;

    /* Step 1 - local rotation - shift up by rank */
    err = ompi_datatype_copy_content_same_ddt (sdtype, 
                                               (int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount),
                                               tmpbuf, 
                                               ((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext);
    if (err<0) {
        line = __LINE__; err = -1; goto err_hndl;
    }

    if (rank != 0) {
        err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount,
                                                   tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext, 
                                                   (char*) sbuf);
        if (err<0) {
            line = __LINE__; err = -1; goto err_hndl;
        }
    }

    /* perform communication step */
    for (distance = 1; distance < size; distance<<=1) {

        sendto = (rank + distance) % size;
        recvfrom = (rank - distance + size) % size;
        k = 0;

        /* create indexed datatype */
        for (i = 1; i < size; i++) {
            if (( i & distance) == distance) {
                displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount; 
                blen[k] = scount;
                k++;
            }
        }
        /* Set indexes and displacements */
        err = ompi_datatype_create_indexed(k, blen, displs, sdtype, &new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
        /* Commit the new datatype */
        err = ompi_datatype_commit(&new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }

        /* Sendreceive */
        err = ompi_coll_tuned_sendrecv ( tmpbuf, 1, new_ddt, sendto,
                                         MCA_COLL_BASE_TAG_ALLTOALL,
                                         rbuf, 1, new_ddt, recvfrom,
                                         MCA_COLL_BASE_TAG_ALLTOALL,
                                         comm, MPI_STATUS_IGNORE, rank );
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

        /* Copy back new data from recvbuf to tmpbuf */
        err = ompi_datatype_copy_content_same_ddt(new_ddt, 1,tmpbuf, (char *) rbuf);
        if (err < 0) { line = __LINE__; err = -1; goto err_hndl;  }

        /* free ddt */
        err = ompi_datatype_destroy(&new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
    } /* end of for (distance = 1... */

    /* Step 3 - local rotation - */
    for (i = 0; i < size; i++) {

        err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount,
                                                   ((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext), 
                                                   tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext);
        if (err < 0) { line = __LINE__; err = -1; goto err_hndl;  }
    }

    /* Step 4 - clean up */
    if (tmpbuf != NULL) free(tmpbuf_free);
    if (weallocated) {
        if (displs != NULL) free(displs);
        if (blen != NULL) free(blen);
    }
    return OMPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, 
                 rank));
    if (tmpbuf != NULL) free(tmpbuf_free);
    if (weallocated) {
        if (displs != NULL) free(displs);
        if (blen != NULL) free(blen);
    }
    return err;
}
示例#18
0
int main(int argc, char *argv[])
{
    opal_init_util(&argc, &argv);
    ompi_datatype_init();

    /* Simple contiguous data: MPI_INT32_T */
    {
        int32_t send_data[2] = {1234, 5678};
        int32_t recv_data[2] = {-1, -1};

        if( verbose ) {
            printf("send data %08x %08x \n", send_data[0], send_data[1]);
            printf("data "); dump_hex(&send_data, sizeof(int32_t) * 2); printf("\n");
        }
        (void)pack_unpack_datatype( send_data, &ompi_mpi_int32_t.dt, 2,
                                    recv_data, check_contiguous, (void*)&ompi_mpi_int32_t.dt );
        if( verbose ) {
            printf("recv "); dump_hex(&recv_data, sizeof(int32_t) * 2); printf("\n");
            printf("recv data %08x %08x \n", recv_data[0], recv_data[1]);
        }
        if( (send_data[0] != recv_data[0]) || (send_data[1] != recv_data[1]) ) {
            printf("Error during external32 pack/unack for contiguous types (MPI_INT32_T)\n");
            exit(-1);
        }
    }
    /* Simple contiguous data: MPI_INT16_T */
    {
        int16_t send_data[2] = {1234, 5678};
        int16_t recv_data[2] = {-1, -1};

        if( verbose ) {
            printf("send data %08x %08x \n", send_data[0], send_data[1]);
            printf("data "); dump_hex(&send_data, sizeof(int16_t) * 2); printf("\n");
        }
        (void)pack_unpack_datatype( send_data, &ompi_mpi_int16_t.dt, 2,
                                    recv_data, check_contiguous, (void*)&ompi_mpi_int16_t.dt );
        if( verbose ) {
            printf("recv "); dump_hex(&recv_data, sizeof(int16_t) * 2); printf("\n");
            printf("recv data %08x %08x \n", recv_data[0], recv_data[1]);
        }
        if( (send_data[0] != recv_data[0]) || (send_data[1] != recv_data[1]) ) {
            printf("Error during external32 pack/unack for contiguous types\n");
            exit(-1);
        }
    }

    /* Vector datatype */
    printf("\n\nVector datatype\n\n");
    {
        int count=2, blocklength=1, stride=2;
        int send_data[3] = {1234, 0, 5678};
        int recv_data[3] = {-1, -1, -1};
        ompi_datatype_t *ddt;

        ompi_datatype_create_vector ( count, blocklength, stride, &ompi_mpi_int.dt, &ddt );
        {
            const int* a_i[3] = {&count, &blocklength, &stride};
            ompi_datatype_t *type = &ompi_mpi_int.dt;

            ompi_datatype_set_args( ddt, 3, a_i, 0, NULL, 1, &type, MPI_COMBINER_VECTOR );
        }
        ompi_datatype_commit(&ddt);

        if( verbose ) {
            printf("send data %08x %x08x %08x \n", send_data[0], send_data[1], send_data[2]);
            printf("data "); dump_hex(&send_data, sizeof(int32_t) * 3); printf("\n");
        }
        (void)pack_unpack_datatype( send_data, ddt, 1, recv_data, check_vector, (void*)&ompi_mpi_int32_t.dt );
        if( verbose ) {
            printf("recv "); dump_hex(&recv_data, sizeof(int32_t) * 3); printf("\n");
            printf("recv data %08x %08x %08x \n", recv_data[0], recv_data[1], recv_data[2]);
        }
        ompi_datatype_destroy(&ddt);
        if( (send_data[0] != recv_data[0]) || (send_data[2] != recv_data[2]) ) {
            printf("Error during external32 pack/unack for vector types (MPI_INT32_T)\n");
            exit(-1);
        }
    }

    ompi_datatype_finalize();

    return 0;
}
/*
 *	scatterv_inter
 *
 *	Function:	- scatterv operation
 *	Accepts:	- same arguments as MPI_Scatterv()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_inter_scatterv_inter(void *sbuf, int *scounts,
                              int *disps, struct ompi_datatype_t *sdtype,
                              void *rbuf, int rcount,
                              struct ompi_datatype_t *rdtype, int root,
                              struct ompi_communicator_t *comm,
                              mca_coll_base_module_t *module)
{
    int i, rank, size, err, total, size_local;
    int *counts=NULL,*displace=NULL;
    char *ptmp=NULL;
    MPI_Aint incr;
    MPI_Aint extent;
    MPI_Aint lb;
    ompi_datatype_t *ndtype;

    /* Initialize */

    rank = ompi_comm_rank(comm);
    size = ompi_comm_remote_size(comm);
    size_local = ompi_comm_size(comm);

    if (MPI_PROC_NULL == root) {
        /* do nothing */
        err = OMPI_SUCCESS;
    } else if (MPI_ROOT != root) {
	if(0 == rank) {
	    /* local root recieves the counts from the root */
	    counts = (int *)malloc(sizeof(int) * size_local);
	    err = MCA_PML_CALL(recv(counts, size_local, MPI_INT,
				    root, MCA_COLL_BASE_TAG_SCATTERV,
				    comm, MPI_STATUS_IGNORE));
	    if (OMPI_SUCCESS != err) {
		return err;
	    }
	    /* calculate the whole buffer size and recieve it from root */
	    err = ompi_datatype_get_extent(rdtype, &lb, &extent);
	    if (OMPI_SUCCESS != err) {
		return OMPI_ERROR;
	    }
	    incr = 0;
	    for (i = 0; i < size_local; i++) {
		incr = incr + extent*counts[i];
	    }
	    if ( incr > 0 ) {
		ptmp = (char*)malloc(incr); 
		if (NULL == ptmp) {
		    return OMPI_ERR_OUT_OF_RESOURCE;
		}
	    }
	    total = 0;
	    for (i = 0; i < size_local; i++) {
		total = total + counts[i];
	    }
	    err = MCA_PML_CALL(recv(ptmp, total, rdtype,
				    root, MCA_COLL_BASE_TAG_SCATTERV,
				    comm, MPI_STATUS_IGNORE));
	    if (OMPI_SUCCESS != err) {
		return err;
	    }
	    /* set the local displacement i.e. no displacements here */
	    displace = (int *)malloc(sizeof(int) * size_local);
	    displace[0] = 0;
	    for (i = 1; i < size_local; i++) {
		displace[i] = displace[i-1] + counts[i-1];
	    }
	}
	/* perform the scatterv locally */
	err = comm->c_local_comm->c_coll.coll_scatterv(ptmp, counts, displace, 
						       rdtype, rbuf, rcount, 
						       rdtype, 0, comm->c_local_comm,
                                                       comm->c_local_comm->c_coll.coll_scatterv_module);
	if (OMPI_SUCCESS != err) {
	    return err;
	}

	if (NULL != ptmp) {
	    free(ptmp);
	}
	if (NULL != displace) {
	    free(displace);
	}
	if (NULL != counts) {
	    free(counts);
	}

    } else {
	err = MCA_PML_CALL(send(scounts, size, MPI_INT, 0,
				MCA_COLL_BASE_TAG_SCATTERV,
				MCA_PML_BASE_SEND_STANDARD, comm));
	if (OMPI_SUCCESS != err) {
	    return err;
	}

	ompi_datatype_create_indexed(size,scounts,disps,sdtype,&ndtype);
	ompi_datatype_commit(&ndtype);
	
	err = MCA_PML_CALL(send(sbuf, 1, ndtype, 0,
				MCA_COLL_BASE_TAG_SCATTERV,
				MCA_PML_BASE_SEND_STANDARD, comm));
	if (OMPI_SUCCESS != err) {
	    return err;
	}
	ompi_datatype_destroy(&ndtype);

    }

    /* All done */
    return err;
}
static int two_phase_exchange_data(mca_io_ompio_file_t *fh,
				   void *buf, struct iovec *offset_len,
				   int *send_size, int *start_pos, int *recv_size,
				   int *count, int *partial_send, 
				   int *recd_from_proc, int contig_access_count,
				   OMPI_MPI_OFFSET_TYPE min_st_offset,
				   OMPI_MPI_OFFSET_TYPE fd_size,
				   OMPI_MPI_OFFSET_TYPE *fd_start,
				   OMPI_MPI_OFFSET_TYPE *fd_end, 
				   Flatlist_node *flat_buf,
				   mca_io_ompio_access_array_t *others_req, 
				   int iter, size_t *buf_idx, 
				   MPI_Aint buftype_extent, int striping_unit,
				   int *aggregator_list)
{
  
  int i=0, j=0, k=0, tmp=0, nprocs_recv=0, nprocs_send=0;
  int ret = OMPI_SUCCESS;
  char **recv_buf = NULL;
  MPI_Request *requests=NULL;
  MPI_Datatype send_type;



#if TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif

  ret = fh->f_comm->c_coll.coll_alltoall (send_size,
					  1,
					  MPI_INT,
					  recv_size,
					  1,
					  MPI_INT,
					  fh->f_comm,
					  fh->f_comm->c_coll.coll_alltoall_module);

  if ( OMPI_SUCCESS != ret ){
    goto exit;
  }
  
  
#if DEBUG
  for (i=0; i<fh->f_size; i++){
    printf("%d: RS[%d]: %d\n", fh->f_rank,
	   i,
	   recv_size[i]);
  }
#endif

  
  nprocs_recv = 0;
  for (i=0; i < fh->f_size; i++) 
    if (recv_size[i]) nprocs_recv++;

  nprocs_send = 0;
  for (i=0; i< fh->f_size; i++) 
    if (send_size[i]) nprocs_send++;

  requests = (MPI_Request *)
    malloc((nprocs_send+nprocs_recv+1) *  sizeof(MPI_Request));
  
  if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
    j = 0;
    for (i=0; i < fh->f_size; i++){
      if (recv_size[i]){
	ret = MCA_PML_CALL(irecv(((char *) buf)+ buf_idx[i],
				 recv_size[i],
				 MPI_BYTE,
				 i,
				 fh->f_rank+i+100*iter,
				 fh->f_comm,
				 requests+j));
	
	if ( OMPI_SUCCESS != ret ){
	  return ret;
	}
	j++;
	buf_idx[i] += recv_size[i];
      }
    }
  }
  else{
    
    recv_buf = (char **)malloc(fh->f_size * sizeof(char *));
    if (NULL == recv_buf){
      ret = OMPI_ERR_OUT_OF_RESOURCE;
      goto exit;
    }

    for (i=0; i < fh->f_size; i++)
      if(recv_size[i]) recv_buf[i] = 
			 (char *) malloc (recv_size[i] *  sizeof(char));
    j = 0;
    for(i=0; i<fh->f_size; i++)
      if (recv_size[i]) {
	ret = MCA_PML_CALL(irecv(recv_buf[i],
				 recv_size[i],
				 MPI_BYTE,
				 i,
				 fh->f_rank+i+100*iter,
				 fh->f_comm,
				 requests+j));
	j++;
	
      }
  }
    
  

  j = 0;
  for (i = 0; i< fh->f_size; i++){
    if (send_size[i]){
      if (partial_send[i]){
	k = start_pos[i] + count[i] - 1;
	tmp = others_req[i].lens[k];
	others_req[i].lens[k] = partial_send[i];
      }

      ompi_datatype_create_hindexed(count[i],
				    &(others_req[i].lens[start_pos[i]]),
				    &(others_req[i].mem_ptrs[start_pos[i]]),
				    MPI_BYTE,
				    &send_type);

      ompi_datatype_commit(&send_type);

      ret = MCA_PML_CALL(isend(MPI_BOTTOM,
			       1,
			       send_type,
			       i,
			       fh->f_rank+i+100*iter,
			       MCA_PML_BASE_SEND_STANDARD,
			       fh->f_comm,
			       requests+nprocs_recv+j));
      ompi_datatype_destroy(&send_type);
      
      if (partial_send[i]) others_req[i].lens[k] = tmp;
      j++;
    }
  }


  if (nprocs_recv) {

    ret = ompi_request_wait_all(nprocs_recv,
				requests, 
				MPI_STATUS_IGNORE);
    
    
    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {

      two_phase_fill_user_buffer(fh, buf, flat_buf,
				 recv_buf, offset_len,
				 (unsigned *)recv_size, requests,
				 recd_from_proc, contig_access_count,
				 min_st_offset, fd_size, fd_start, fd_end,
				 buftype_extent, striping_unit, aggregator_list);
    }
  }

  ret = ompi_request_wait_all(nprocs_send,
			      requests+nprocs_recv, 
			      MPI_STATUS_IGNORE);

  if (NULL != requests){
    free(requests);
    requests = NULL;
  }
  
  if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)){
    for (i=0; i< fh->f_size; i++){
      if (recv_size[i]){
	free(recv_buf[i]);
      }
    }
    free(recv_buf);
  }

#if TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += (end_rcomm_time - start_rcomm_time);
#endif

 exit:
    return ret;

}
示例#21
0
/*
 *	file_open_pvfs2: This is the same strategy as ROMIO's pvfs2 open
 *
 *	Function:	- opens a new file
 *	Accepts:	- same arguments as MPI_File_open()
 *	Returns:	- Success if new file handle
 */
int
mca_fs_pvfs2_file_open (struct ompi_communicator_t *comm,
                        const char* filename,
                        int access_mode,
                        struct ompi_info_t *info,
                        mca_io_ompio_file_t *fh)
{
    int ret;
    mca_fs_pvfs2 *pvfs2_fs;
    PVFS_fs_id pvfs2_id;
    char pvfs2_path[OMPIO_MAX_NAME] = {0};
    char * ncache_timeout;
    open_status o_status = {0, {0, 0}};
    struct ompi_datatype_t *open_status_type;
    struct ompi_datatype_t *types[2] = {&ompi_mpi_int.dt, &ompi_mpi_byte.dt};
    int lens[2] = {1, sizeof(PVFS_object_ref)};
    OPAL_PTRDIFF_TYPE offsets[2];
    char char_stripe[MPI_MAX_INFO_KEY];
    int flag;
    int fs_pvfs2_stripe_size = -1;
    int fs_pvfs2_stripe_width = -1;

    /* We are going to do what ROMIO does with one process resolving
     * the name and broadcasting to others */

    pvfs2_fs = (mca_fs_pvfs2 *) malloc(sizeof(mca_fs_pvfs2));
    if (NULL == pvfs2_fs) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if (!mca_fs_pvfs2_IS_INITIALIZED) {
        /* disable the pvfs2 ncache */
        ncache_timeout = getenv("PVFS2_NCACHE_TIMEOUT");
        if (ncache_timeout == NULL ) {
            setenv("PVFS2_NCACHE_TIMEOUT", "0", 1);
        }
        ret = PVFS_util_init_defaults();
        if (ret < 0) {
            PVFS_perror("PVFS_util_init_defaults", ret);
            return OMPI_ERROR;
        }
        mca_fs_pvfs2_IS_INITIALIZED = 1;
    }

    memset(&(pvfs2_fs->credentials), 0, sizeof(PVFS_credentials));
    PVFS_util_gen_credentials(&(pvfs2_fs->credentials));

    /* check for stripe size and stripe depth in the info object and
       update mca_fs_pvfs2_stripe_width and mca_fs_pvfs2_stripe_size
       before calling fake_an_open() */

    ompi_info_get (info, "stripe_size", MPI_MAX_INFO_VAL, char_stripe, &flag);
    if ( flag ) {
        sscanf ( char_stripe, "%d", &fs_pvfs2_stripe_size );
    }

    ompi_info_get (info, "stripe_width", MPI_MAX_INFO_VAL, char_stripe, &flag);
    if ( flag ) {
        sscanf ( char_stripe, "%d", &fs_pvfs2_stripe_width );
    }

    if (fs_pvfs2_stripe_size < 0) {
        fs_pvfs2_stripe_size = mca_fs_pvfs2_stripe_size;
    }

    if (fs_pvfs2_stripe_width < 0) {
        fs_pvfs2_stripe_width = mca_fs_pvfs2_stripe_width;
    }


    if (OMPIO_ROOT == fh->f_rank) {
        ret = PVFS_util_resolve(filename, &pvfs2_id, pvfs2_path, OMPIO_MAX_NAME);
        if (ret < 0 ) {
            PVFS_perror("PVFS_util_resolve", ret);
	    o_status.error = -1;
        }
        else {
            fake_an_open (pvfs2_id,
                          pvfs2_path,
                          access_mode,
                          fs_pvfs2_stripe_width,
                          (PVFS_size)fs_pvfs2_stripe_size,
                          pvfs2_fs,
                          &o_status);
        }
        pvfs2_fs->object_ref = o_status.object_ref;
	fh->f_fs_ptr = pvfs2_fs;
    }

    /* broadcast status and (possibly valid) object reference */
    offsets[0] = (MPI_Aint)(&o_status.error);
    offsets[1] = (MPI_Aint)(&o_status.object_ref);

    ompi_datatype_create_struct (2, lens, offsets, types, &open_status_type);
    ompi_datatype_commit (&open_status_type);

    fh->f_comm->c_coll.coll_bcast (MPI_BOTTOM,
                                   1,
                                   open_status_type,
                                   OMPIO_ROOT,
                                   fh->f_comm,
                                   fh->f_comm->c_coll.coll_bcast_module);

    ompi_datatype_destroy (&open_status_type);

    if (o_status.error != 0) {
	/* No need to free the pvfs2_fs structure, since it will
	   be deallocated in file_close in case of an error */
	fh->f_fs_ptr = NULL;
	return OMPI_ERROR;
    }

    pvfs2_fs->object_ref = o_status.object_ref;
    fh->f_fs_ptr = pvfs2_fs;

    /* update the internal ompio structure to store stripe
       size and stripe depth correctly.
       Hadi(to be done): For this read the stripe size and stripe depth from
       the file itself
    */

    if (fs_pvfs2_stripe_size > 0 && fs_pvfs2_stripe_width > 0) {
        fh->f_stripe_size = fs_pvfs2_stripe_size;
        fh->f_stripe_count = fs_pvfs2_stripe_width;
    }

    return OMPI_SUCCESS;
}
static int two_phase_exchage_data(mca_io_ompio_file_t *fh,
				  void *buf,
				  char *write_buf,
				  struct iovec *offset_length,
				  int *send_size,int *start_pos,
				  int *recv_size,
				  OMPI_MPI_OFFSET_TYPE off,
				  OMPI_MPI_OFFSET_TYPE size, int *count,
				  int *partial_recv, int *sent_to_proc,
				  int contig_access_count,
				  OMPI_MPI_OFFSET_TYPE min_st_offset,
				  OMPI_MPI_OFFSET_TYPE fd_size,
				  OMPI_MPI_OFFSET_TYPE *fd_start,
				  OMPI_MPI_OFFSET_TYPE *fd_end,
				  Flatlist_node *flat_buf,
				  mca_io_ompio_access_array_t *others_req,
				  int *send_buf_idx, int *curr_to_proc,
				  int *done_to_proc, int iter,
				  size_t *buf_idx,MPI_Aint buftype_extent,
				  int striping_unit, int *aggregator_list,
				  int *hole){
  
    int *tmp_len=NULL, sum, *srt_len=NULL, nprocs_recv, nprocs_send,  k,i,j;
    int ret=OMPI_SUCCESS;
    MPI_Request *requests=NULL, *send_req=NULL;
    ompi_datatype_t **recv_types=NULL;
    OMPI_MPI_OFFSET_TYPE *srt_off=NULL;
    char **send_buf = NULL; 
    
#if TIME_BREAKDOWN
      start_comm_time = MPI_Wtime();
#endif
        
    ret = fh->f_comm->c_coll.coll_alltoall (recv_size,
					    1,
					    MPI_INT,
					    send_size,
					    1,
					    MPI_INT,
					    fh->f_comm,
					    fh->f_comm->c_coll.coll_alltoall_module);
    
    if ( OMPI_SUCCESS != ret ){
      return ret;
    }

    nprocs_recv = 0;
    for (i=0;i<fh->f_size;i++){
      if (recv_size[i]){
	nprocs_recv++;
      }
    }
    
    
    recv_types = (ompi_datatype_t **)
	malloc (( nprocs_recv + 1 ) * sizeof(ompi_datatype_t *));
    
    if ( NULL == recv_types ){
      ret = OMPI_ERR_OUT_OF_RESOURCE;
      goto exit;
    }

    tmp_len = (int *) malloc(fh->f_size*sizeof(int));
    
    if ( NULL == tmp_len ) {
      return OMPI_ERR_OUT_OF_RESOURCE;
    }

    j = 0;
    for (i=0;i<fh->f_size;i++){
	if (recv_size[i]) {
	    if (partial_recv[i]) {
		k = start_pos[i] + count[i] - 1;
		tmp_len[i] = others_req[i].lens[k];
		others_req[i].lens[k] = partial_recv[i];
	    }
	    ompi_datatype_create_hindexed(count[i], 
					  &(others_req[i].lens[start_pos[i]]),
					  &(others_req[i].mem_ptrs[start_pos[i]]), 
					  MPI_BYTE, recv_types+j);
	    ompi_datatype_commit(recv_types+j);
	    j++;
	}
    }

    sum = 0;
    for (i=0;i<fh->f_size;i++) sum += count[i];
    srt_off = (OMPI_MPI_OFFSET_TYPE *) 
      malloc((sum+1)*sizeof(OMPI_MPI_OFFSET_TYPE));
    
    if ( NULL == srt_off ){
      ret = OMPI_ERR_OUT_OF_RESOURCE;
      goto exit;
    }
    
    srt_len = (int *) malloc((sum+1)*sizeof(int));
    
    if ( NULL == srt_len ) {
      ret = OMPI_ERR_OUT_OF_RESOURCE;
      goto exit;
    }


    two_phase_heap_merge(others_req, count, srt_off, srt_len, start_pos, fh->f_size,fh->f_rank,  nprocs_recv, sum);


    for (i=0; i<fh->f_size; i++) 
        if (partial_recv[i]) {
            k = start_pos[i] + count[i] - 1;
            others_req[i].lens[k] = tmp_len[i];
        }
    
    if ( NULL != tmp_len ){
      free(tmp_len); 
    }

    *hole = 0;
    if (off != srt_off[0]){
	*hole = 1;
    }
    else{
	for (i=1;i<sum;i++){
	    if (srt_off[i] <= srt_off[0] + srt_len[0]){
		int new_len = srt_off[i] + srt_len[i] - srt_off[0];
		if(new_len > srt_len[0]) 
		    srt_len[0] = new_len;
	    }
	    else
		break;
	}
	if (i < sum || size != srt_len[0])
	    *hole = 1;
    }


    if ( NULL != srt_off ){
      free(srt_off);
    }
    if ( NULL != srt_len ){
      free(srt_len);
    }

    if (nprocs_recv){
	if (*hole){
	    if (off > 0){
		fh->f_io_array = (mca_io_ompio_io_array_t *)malloc 
		    (sizeof(mca_io_ompio_io_array_t));
		if (NULL == fh->f_io_array) {
		    opal_output(1, "OUT OF MEMORY\n");
		    return OMPI_ERR_OUT_OF_RESOURCE;
		}
		fh->f_io_array[0].offset  =(IOVBASE_TYPE *)(intptr_t) off;
		fh->f_num_of_io_entries = 1;
		fh->f_io_array[0].length = size;
		fh->f_io_array[0].memory_address = write_buf;
		if (fh->f_num_of_io_entries){
		    if (OMPI_SUCCESS != fh->f_fbtl->fbtl_preadv (fh, NULL)) {
			opal_output(1, "READ FAILED\n");
			return OMPI_ERROR;
		    }
		}
		
	    }
	    fh->f_num_of_io_entries = 0;
	    if (NULL != fh->f_io_array) {
		free (fh->f_io_array);
		fh->f_io_array = NULL;
	    }
	}
    }
    
    nprocs_send = 0;
    for (i=0; i <fh->f_size; i++) if (send_size[i]) nprocs_send++;

    #if DEBUG_ON
    printf("%d : nprocs_send : %d\n", fh->f_rank,nprocs_send);
    #endif

    requests = (MPI_Request *) 	
	malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request)); 

    if ( NULL == requests ){
      return OMPI_ERR_OUT_OF_RESOURCE;
    }
    
    j = 0;
    for (i=0; i<fh->f_size; i++) {
	if (recv_size[i]) {
	  ret = MCA_PML_CALL(irecv(MPI_BOTTOM,
				   1,
				   recv_types[j],
				   i,
				   fh->f_rank+i+100*iter,
				   fh->f_comm,
				   requests+j));

	  if ( OMPI_SUCCESS != ret ){
	    goto exit;
	  }
	  j++;
	}
    }
    send_req = requests + nprocs_recv;
    
    
    if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
	j = 0;
	for (i=0; i <fh->f_size; i++) 
	  if (send_size[i]) {
	    ret = MCA_PML_CALL(isend(((char *) buf) + buf_idx[i],
				     send_size[i],
				     MPI_BYTE,
				     i,
				     fh->f_rank+i+100*iter,
				     MCA_PML_BASE_SEND_STANDARD, 
				     fh->f_comm,
				     send_req+j));	

	    if ( OMPI_SUCCESS != ret ){
	      goto exit;
	    }
	    
	    j++;
	    buf_idx[i] += send_size[i];
	  }
    }
    else if(nprocs_send && (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY))){
      send_buf = (char **) malloc(fh->f_size*sizeof(char*));
      if ( NULL == send_buf ){
	ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
      }
      for (i=0; i < fh->f_size; i++){
	if (send_size[i]) {
	  send_buf[i] = (char *) malloc(send_size[i]);
	  
	  if ( NULL == send_buf[i] ){
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	  }
	}
      }
      
      ret = two_phase_fill_send_buffer(fh, buf,flat_buf, send_buf,
				       offset_length, send_size,
				       send_req,sent_to_proc,
				       contig_access_count, 
				       min_st_offset, fd_size,
				       fd_start, fd_end, send_buf_idx,
				       curr_to_proc, done_to_proc,
				       iter, buftype_extent, striping_unit,
				       aggregator_list);
      
      if ( OMPI_SUCCESS != ret ){
	goto exit;
      }
    }


    for (i=0; i<nprocs_recv; i++) ompi_datatype_destroy(recv_types+i);
    if (NULL != recv_types){
      free(recv_types);
      recv_types = NULL;
    }

    ret = ompi_request_wait_all (nprocs_send+nprocs_recv,
				 requests,
				 MPI_STATUS_IGNORE);
    

    if ( NULL != requests ){
      free(requests);
    }

#if TIME_BREAKDOWN
      end_comm_time = MPI_Wtime();
      comm_time += (end_comm_time - start_comm_time);
#endif

 exit:    
    return ret;
}
示例#23
0
int32_t ompi_datatype_create_subarray(int ndims,
                                      int const* size_array,
                                      int const* subsize_array,
                                      int const* start_array,
                                      int order,
                                      const ompi_datatype_t* oldtype,
                                      ompi_datatype_t** newtype)
{
    MPI_Datatype last_type;
    int32_t i, step, end_loop;
    MPI_Aint size, displ, extent;

    /**
     * If the oldtype contains the original MPI_LB and MPI_UB markers then we
     * are forced to follow the MPI standard suggestion and reset these 2
     * markers (MPI 3.0 page 96 line 37).  Otherwise we can simply resize the
     * datatype.
     */
    ompi_datatype_type_extent( oldtype, &extent );

    /* If the ndims is zero then return the NULL datatype */
    if( ndims < 2 ) {
        if( 0 == ndims ) {
            *newtype = &ompi_mpi_datatype_null.dt;
            return MPI_SUCCESS;
        }
        ompi_datatype_create_contiguous( subsize_array[0], oldtype, &last_type );
        size = size_array[0];
        displ = start_array[0];
        goto replace_subarray_type;
    }

    if( MPI_ORDER_C == order ) {
        i = ndims - 1;
        step = -1;
        end_loop = -1;
    } else {
        i = 0;
        step = 1;
        end_loop = ndims;
    }

    /* As we know that the ndims is at least 1 we can start by creating the
     * first dimension data outside the loop, such that we dont have to create
     * a duplicate of the oldtype just to be able to free it.
     */
    ompi_datatype_create_vector( subsize_array[i+step], subsize_array[i], size_array[i],
                                 oldtype, newtype );

    last_type = *newtype;
    size = (MPI_Aint)size_array[i] * (MPI_Aint)size_array[i+step];
    displ = (MPI_Aint)start_array[i] + (MPI_Aint)start_array[i+step] * (MPI_Aint)size_array[i];
    for( i += 2 * step; i != end_loop; i += step ) {
        ompi_datatype_create_hvector( subsize_array[i], 1, size * extent,
                                      last_type, newtype );
        ompi_datatype_destroy( &last_type );

        displ += size * start_array[i];
        size *= size_array[i];
        last_type = *newtype;
    }

 replace_subarray_type:
    /*
     * Resized will only set the soft lb and ub markers without moving the real
     * data inside. Thus, in case the original data contains the hard markers
     * (MPI_LB or MPI_UB) we must force the displacement of the data upward to
     * the right position AND set the hard markers LB and UB.
     *
     * NTH: ompi_datatype_create_resized() does not do enough for the general
     * pack/unpack functions to work correctly. Until this is fixed always use
     * ompi_datatype_create_struct(). Once this is fixed remove 1 || below. To
     * verify that the regression is fixed run the subarray test in the Open MPI
     * ibm testsuite.
     */
    if(1 || oldtype->super.flags & (OPAL_DATATYPE_FLAG_USER_LB | OPAL_DATATYPE_FLAG_USER_UB) ) {
        MPI_Aint displs[3];
        MPI_Datatype types[3];
        int blength[3] = { 1, 1, 1 };

        displs[0] = 0; displs[1] = displ * extent; displs[2] = size * extent;
        types[0] = MPI_LB; types[1] = last_type; types[2] = MPI_UB;
        ompi_datatype_create_struct( 3, blength, displs, types, newtype );
    } else {
        ompi_datatype_create_resized(last_type, displ * extent, size * extent, newtype);
    }
    ompi_datatype_destroy( &last_type );

    return OMPI_SUCCESS;
}
示例#24
0
int ompi_coll_base_alltoall_intra_bruck(const void *sbuf, int scount,
                                         struct ompi_datatype_t *sdtype,
                                         void* rbuf, int rcount,
                                         struct ompi_datatype_t *rdtype,
                                         struct ompi_communicator_t *comm,
                                         mca_coll_base_module_t *module)
{
    int i, k, line = -1, rank, size, err = 0;
    int sendto, recvfrom, distance, *displs = NULL, *blen = NULL;
    char *tmpbuf = NULL, *tmpbuf_free = NULL;
    OPAL_PTRDIFF_TYPE sext, rext, span, gap;
    struct ompi_datatype_t *new_ddt;

    if (MPI_IN_PLACE == sbuf) {
        return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
                                                            comm, module);
    }

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:alltoall_intra_bruck rank %d", rank));

    err = ompi_datatype_type_extent (sdtype, &sext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    err = ompi_datatype_type_extent (rdtype, &rext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    span = opal_datatype_span(&sdtype->super, (int64_t)size * scount, &gap);

    displs = (int *) malloc(size * sizeof(int));
    if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; }
    blen = (int *) malloc(size * sizeof(int));
    if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; }

    /* tmp buffer allocation for message data */
    tmpbuf_free = (char *)malloc(span);
    if (tmpbuf_free == NULL) { line = __LINE__; err = -1; goto err_hndl; }
    tmpbuf = tmpbuf_free - gap;

    /* Step 1 - local rotation - shift up by rank */
    err = ompi_datatype_copy_content_same_ddt (sdtype,
                                               (int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount),
                                               tmpbuf,
                                               ((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext);
    if (err<0) {
        line = __LINE__; err = -1; goto err_hndl;
    }

    if (rank != 0) {
        err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount,
                                                   tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext,
                                                   (char*) sbuf);
        if (err<0) {
            line = __LINE__; err = -1; goto err_hndl;
        }
    }

    /* perform communication step */
    for (distance = 1; distance < size; distance<<=1) {

        sendto = (rank + distance) % size;
        recvfrom = (rank - distance + size) % size;
        k = 0;

        /* create indexed datatype */
        for (i = 1; i < size; i++) {
            if (( i & distance) == distance) {
                displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount;
                blen[k] = scount;
                k++;
            }
        }
        /* Set indexes and displacements */
        err = ompi_datatype_create_indexed(k, blen, displs, sdtype, &new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
        /* Commit the new datatype */
        err = ompi_datatype_commit(&new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }

        /* Sendreceive */
        err = ompi_coll_base_sendrecv ( tmpbuf, 1, new_ddt, sendto,
                                         MCA_COLL_BASE_TAG_ALLTOALL,
                                         rbuf, 1, new_ddt, recvfrom,
                                         MCA_COLL_BASE_TAG_ALLTOALL,
                                         comm, MPI_STATUS_IGNORE, rank );
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

        /* Copy back new data from recvbuf to tmpbuf */
        err = ompi_datatype_copy_content_same_ddt(new_ddt, 1,tmpbuf, (char *) rbuf);
        if (err < 0) { line = __LINE__; err = -1; goto err_hndl;  }

        /* free ddt */
        err = ompi_datatype_destroy(&new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
    } /* end of for (distance = 1... */

    /* Step 3 - local rotation - */
    for (i = 0; i < size; i++) {

        err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount,
                                                   ((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext),
                                                   tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext);
        if (err < 0) { line = __LINE__; err = -1; goto err_hndl;  }
    }

    /* Step 4 - clean up */
    if (tmpbuf != NULL) free(tmpbuf_free);
    if (displs != NULL) free(displs);
    if (blen != NULL) free(blen);
    return OMPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
                 rank));
    (void)line;  // silence compiler warning
    if (tmpbuf != NULL) free(tmpbuf_free);
    if (displs != NULL) free(displs);
    if (blen != NULL) free(blen);
    return err;
}
int
mca_fcoll_dynamic_gen2_file_read_all (mca_io_ompio_file_t *fh,
                                 void *buf,
                                 int count,
                                 struct ompi_datatype_t *datatype,
                                 ompi_status_public_t *status)
{
    MPI_Aint position = 0;
    MPI_Aint total_bytes = 0;          /* total bytes to be read */
    MPI_Aint bytes_to_read_in_cycle = 0; /* left to be read in a cycle*/
    MPI_Aint bytes_per_cycle = 0;      /* total read in each cycle by each process*/
    int index = 0, ret=OMPI_SUCCESS;
    int cycles = 0;
    int i=0, j=0, l=0;
    int n=0; /* current position in total_bytes_per_process array */
    MPI_Aint bytes_remaining = 0; /* how many bytes have been read from the current
                                     value from total_bytes_per_process */
    int *sorted_file_offsets=NULL, entries_per_aggregator=0;
    int bytes_received = 0;
    int blocks = 0;
    /* iovec structure and count of the buffer passed in */
    uint32_t iov_count = 0;
    struct iovec *decoded_iov = NULL;
    int iov_index = 0;
    size_t current_position = 0;
    struct iovec *local_iov_array=NULL, *global_iov_array=NULL;
    char *receive_buf = NULL;
    MPI_Aint *memory_displacements=NULL;
    /* global iovec at the readers that contain the iovecs created from
       file_set_view */
    uint32_t total_fview_count = 0;
    int local_count = 0;
    int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL;
    int current_index=0, temp_index=0;
    int **blocklen_per_process=NULL;
    MPI_Aint **displs_per_process=NULL;
    char *global_buf = NULL;
    MPI_Aint global_count = 0;
    mca_io_ompio_local_io_array *file_offsets_for_agg=NULL;

    /* array that contains the sorted indices of the global_iov */
    int *sorted = NULL;
    int *displs = NULL;
    int dynamic_gen2_num_io_procs;
    size_t max_data = 0;
    MPI_Aint *total_bytes_per_process = NULL;
    ompi_datatype_t **sendtype = NULL;
    MPI_Request *send_req=NULL, recv_req=NULL;
    int my_aggregator =-1;
    bool recvbuf_is_contiguous=false;
    size_t ftype_size;
    OPAL_PTRDIFF_TYPE ftype_extent, lb;


#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0;
    double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0;
    double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0;
    mca_io_ompio_print_entry nentry;
#endif

    /**************************************************************************
     ** 1. In case the data is not contigous in memory, decode it into an iovec
     **************************************************************************/

    opal_datatype_type_size ( &datatype->super, &ftype_size );
    opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent );

    if ( (ftype_extent == (OPAL_PTRDIFF_TYPE) ftype_size)             &&
        opal_datatype_is_contiguous_memory_layout(&datatype->super,1) &&
        0 == lb ) {
        recvbuf_is_contiguous = true;
    }


    if (! recvbuf_is_contiguous ) {
        ret = fh->f_decode_datatype ((struct mca_io_ompio_file_t *)fh,
                                     datatype,
                                     count,
                                     buf,
                                     &max_data,
                                     &decoded_iov,
                                     &iov_count);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
        status->_ucount = max_data;
    }

    fh->f_get_num_aggregators ( &dynamic_gen2_num_io_procs);
    ret = fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh,
                                      dynamic_gen2_num_io_procs,
                                      max_data);
    if (OMPI_SUCCESS != ret){
        goto exit;
    }
    my_aggregator = fh->f_procs_in_group[fh->f_aggregator_index];

    /**************************************************************************
     ** 2. Determine the total amount of data to be written
     **************************************************************************/
    total_bytes_per_process = (MPI_Aint*)malloc(fh->f_procs_per_group*sizeof(MPI_Aint));
    if (NULL == total_bytes_per_process) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fcoll_base_coll_allgather_array (&max_data,
                                           1,
                                           MPI_LONG,
                                           total_bytes_per_process,
                                           1,
                                           MPI_LONG,
                                           fh->f_aggregator_index,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);
    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

    for (i=0 ; i<fh->f_procs_per_group ; i++) {
        total_bytes += total_bytes_per_process[i];
    }

    if (NULL != total_bytes_per_process) {
        free (total_bytes_per_process);
        total_bytes_per_process = NULL;
    }

    /*********************************************************************
     *** 3. Generate the File offsets/lengths corresponding to this write
     ********************************************************************/
    ret = fh->f_generate_current_file_view ((struct mca_io_ompio_file_t *) fh,
                                            max_data,
                                            &local_iov_array,
                                            &local_count);

    if (ret != OMPI_SUCCESS){
        goto exit;
    }

    /*************************************************************
     *** 4. Allgather the File View information at all processes
     *************************************************************/

    fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == fview_count) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fcoll_base_coll_allgather_array (&local_count,
                                           1,
                                           MPI_INT,
                                           fview_count,
                                           1,
                                           MPI_INT,
                                           fh->f_aggregator_index,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);
    
    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

    displs = (int*)malloc (fh->f_procs_per_group*sizeof(int));
    if (NULL == displs) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    displs[0] = 0;
    total_fview_count = fview_count[0];
    for (i=1 ; i<fh->f_procs_per_group ; i++) {
        total_fview_count += fview_count[i];
        displs[i] = displs[i-1] + fview_count[i-1];
    }

#if DEBUG_ON
    if (my_aggregator == fh->f_rank) {
    for (i=0 ; i<fh->f_procs_per_group ; i++) {
    printf ("%d: PROCESS: %d  ELEMENTS: %d  DISPLS: %d\n",
        fh->f_rank,
        i,
        fview_count[i],
        displs[i]);
}
}
#endif

    /* allocate the global iovec  */
    if (0 != total_fview_count) {
        global_iov_array = (struct iovec*)malloc (total_fview_count *
                                                  sizeof(struct iovec));
        if (NULL == global_iov_array) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret =  fcoll_base_coll_allgatherv_array (local_iov_array,
                                             local_count,
                                             fh->f_iov_type,
                                             global_iov_array,
                                             fview_count,
                                             displs,
                                             fh->f_iov_type,
                                             fh->f_aggregator_index,
                                             fh->f_procs_in_group,
                                             fh->f_procs_per_group,
                                             fh->f_comm);

    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

    /****************************************************************************************
     *** 5. Sort the global offset/lengths list based on the offsets.
     *** The result of the sort operation is the 'sorted', an integer array,
     *** which contains the indexes of the global_iov_array based on the offset.
     *** For example, if global_iov_array[x].offset is followed by global_iov_array[y].offset
     *** in the file, and that one is followed by global_iov_array[z].offset, than
     *** sorted[0] = x, sorted[1]=y and sorted[2]=z;
     ******************************************************************************************/
    if (0 != total_fview_count) {
       sorted = (int *)malloc (total_fview_count * sizeof(int));
      if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        fh->f_sort_iovec (global_iov_array, total_fview_count, sorted);
    }

    if (NULL != local_iov_array) {
        free (local_iov_array);
        local_iov_array = NULL;
    }

#if DEBUG_ON
    if (my_aggregator == fh->f_rank) {
        for (i=0 ; i<total_fview_count ; i++) {
            printf("%d: OFFSET: %p   LENGTH: %d\n",
                   fh->f_rank,
                   global_iov_array[sorted[i]].iov_base,
                   global_iov_array[sorted[i]].iov_len);
        }
    }
#endif

    /*************************************************************
     *** 6. Determine the number of cycles required to execute this
     ***    operation
     *************************************************************/
    fh->f_get_bytes_per_agg ( (int *) &bytes_per_cycle);
    cycles = ceil((double)total_bytes/bytes_per_cycle);

    if ( my_aggregator == fh->f_rank) {
      disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
      if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        displs_per_process = (MPI_Aint **)malloc (fh->f_procs_per_group * sizeof (MPI_Aint*));
        if (NULL == displs_per_process){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        for (i=0;i<fh->f_procs_per_group;i++){
            blocklen_per_process[i] = NULL;
            displs_per_process[i] = NULL;
        }

	send_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request));
	if (NULL == send_req){
	    opal_output ( 1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	global_buf = (char *) malloc (bytes_per_cycle);
	if (NULL == global_buf){
	    opal_output(1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	sendtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *));
	if (NULL == sendtype) {
            opal_output (1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	for(l=0;l<fh->f_procs_per_group;l++){
            sendtype[l] = MPI_DATATYPE_NULL;
	}
    }




#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif
    n = 0;
    bytes_remaining = 0;
    current_index = 0;

    for (index = 0; index < cycles; index++) {
        /**********************************************************************
         ***  7a. Getting ready for next cycle: initializing and freeing buffers
	 **********************************************************************/
        if (my_aggregator == fh->f_rank) {
             if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
            fh->f_num_of_io_entries = 0;

            if (NULL != sendtype){
                for (i =0; i< fh->f_procs_per_group; i++) {
		    if ( MPI_DATATYPE_NULL != sendtype[i] ) {
                        ompi_datatype_destroy(&sendtype[i]);
                        sendtype[i] = MPI_DATATYPE_NULL;
                    }
		}
            }

            for(l=0;l<fh->f_procs_per_group;l++){
                disp_index[l] =  1;

                if (NULL != blocklen_per_process[l]){
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
                if (NULL != displs_per_process[l]){
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                if (NULL == blocklen_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for blocklen\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l]){
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }

            if (NULL != sorted_file_offsets){
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }

            if(NULL != file_offsets_for_agg){
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }
            if (NULL != memory_displacements){
                free(memory_displacements);
                memory_displacements = NULL;
            }
        }  /* (my_aggregator == fh->f_rank */

        /**************************************************************************
         ***  7b. Determine the number of bytes to be actually read in this cycle
	 **************************************************************************/
        if (cycles-1 == index) {
            bytes_to_read_in_cycle = total_bytes - bytes_per_cycle*index;
        }
        else {
            bytes_to_read_in_cycle = bytes_per_cycle;
        }

#if DEBUG_ON
        if (my_aggregator == fh->f_rank) {
            printf ("****%d: CYCLE %d   Bytes %d**********\n",
                    fh->f_rank,
                    index,
                    bytes_to_write_in_cycle);
        }
#endif

        /*****************************************************************
         *** 7c. Calculate how much data will be contributed in this cycle
	 ***     by each process
         *****************************************************************/
        bytes_received = 0;

        while (bytes_to_read_in_cycle) {
            /* This next block identifies which process is the holder
            ** of the sorted[current_index] element;
            */
            blocks = fview_count[0];
            for (j=0 ; j<fh->f_procs_per_group ; j++) {
                if (sorted[current_index] < blocks) {
                    n = j;
                    break;
                }
                else {
                    blocks += fview_count[j+1];
                }
            }

            if (bytes_remaining) {
                /* Finish up a partially used buffer from the previous  cycle */
                if (bytes_remaining <= bytes_to_read_in_cycle) {
                    /* Data fits completely into the block */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len - bytes_remaining);

                        blocklen_per_process[n] = (int *) realloc
                            ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
                        displs_per_process[n] = (MPI_Aint *) realloc
                            ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_remaining;
                    }
                    current_index ++;
                    bytes_to_read_in_cycle -= bytes_remaining;
                    bytes_remaining = 0;
                    continue;
                }
                else {
                     /* the remaining data from the previous cycle is larger than the
                        bytes_to_write_in_cycle, so we have to segment again */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len
                             - bytes_remaining);
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_to_read_in_cycle;
                    }
                    bytes_remaining -= bytes_to_read_in_cycle;
                    bytes_to_read_in_cycle = 0;
                    break;
                }
            }
            else {
                /* No partially used entry available, have to start a new one */
                if (bytes_to_read_in_cycle <
                    (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) {
                    /* This entry has more data than we can sendin one cycle */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ;
                    }

                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_to_read_in_cycle;
                    }
                    bytes_remaining = global_iov_array[sorted[current_index]].iov_len -
                        bytes_to_read_in_cycle;
                    bytes_to_read_in_cycle = 0;
                    break;
                }
                else {
                    /* Next data entry is less than bytes_to_write_in_cycle */
                    if (my_aggregator ==  fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] =
                            global_iov_array[sorted[current_index]].iov_len;
                        displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)
                            global_iov_array[sorted[current_index]].iov_base;
                        blocklen_per_process[n] =
                            (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
                        displs_per_process[n] = (MPI_Aint *)realloc
                            ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received +=
                            global_iov_array[sorted[current_index]].iov_len;
                    }
                    bytes_to_read_in_cycle -=
                        global_iov_array[sorted[current_index]].iov_len;
                    current_index ++;
                    continue;
                }
            }
        } /* end while (bytes_to_read_in_cycle) */

        /*************************************************************************
	 *** 7d. Calculate the displacement on where to put the data and allocate
         ***     the recieve buffer (global_buf)
	 *************************************************************************/
        if (my_aggregator == fh->f_rank) {
            entries_per_aggregator=0;
            for (i=0;i<fh->f_procs_per_group; i++){
                for (j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0)
                        entries_per_aggregator++ ;
                }
            }
            if (entries_per_aggregator > 0){
                file_offsets_for_agg = (mca_io_ompio_local_io_array *)
                    malloc(entries_per_aggregator*sizeof(mca_io_ompio_local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                sorted_file_offsets = (int *)
                    malloc (entries_per_aggregator*sizeof(int));
                if (NULL == sorted_file_offsets){
                    opal_output (1, "OUT OF MEMORY\n");
                    ret =  OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                /*Moving file offsets to an IO array!*/
                temp_index = 0;
                global_count = 0;
                for (i=0;i<fh->f_procs_per_group; i++){
                    for(j=0;j<disp_index[i];j++){
                        if (blocklen_per_process[i][j] > 0){
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            global_count += blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;
                        }
                    }
                }
            }
            else{
                continue;
            }

             /* Sort the displacements for each aggregator */
            read_heap_sort (file_offsets_for_agg,
                            entries_per_aggregator,
                            sorted_file_offsets);

            memory_displacements = (MPI_Aint *) malloc
                (entries_per_aggregator * sizeof(MPI_Aint));
            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++){
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }

             /**********************************************************
	     *** 7e. Create the io array, and pass it to fbtl
	     *********************************************************/
            fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
                (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            fh->f_num_of_io_entries = 0;
            fh->f_io_array[0].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[0].length =
                file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[0].memory_address =
                global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;
            for (i=1;i<entries_per_aggregator;i++){
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                    file_offsets_for_agg[sorted_file_offsets[i]].offset){
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else{
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }
            }


#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_read_time = MPI_Wtime();
#endif

            if (fh->f_num_of_io_entries) {
                if ( 0 >  fh->f_fbtl->fbtl_preadv (fh)) {
                    opal_output (1, "READ FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_read_time = MPI_Wtime();
            read_time += end_read_time - start_read_time;
#endif
            /**********************************************************
             ******************** DONE READING ************************
             *********************************************************/

            temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            for (i=0; i<entries_per_aggregator; i++){
                temp_index =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_index][temp_disp_index[temp_index]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_index] < disp_index[temp_index]){
                    temp_disp_index[temp_index] += 1;
                }
                else{
                    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
                           temp_index, temp_disp_index[temp_index],
                           temp_index, disp_index[temp_index]);
                }
            }
            if (NULL != temp_disp_index){
                free(temp_disp_index);
                temp_disp_index = NULL;
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_rcomm_time = MPI_Wtime();
#endif
            for (i=0;i<fh->f_procs_per_group;i++){
                send_req[i] = MPI_REQUEST_NULL;
                if ( 0 < disp_index[i] ) {
                    ompi_datatype_create_hindexed(disp_index[i],
                                                  blocklen_per_process[i],
                                                  displs_per_process[i],
                                                  MPI_BYTE,
                                                  &sendtype[i]);
                    ompi_datatype_commit(&sendtype[i]);
                    ret = MCA_PML_CALL (isend(global_buf,
                                              1,
                                              sendtype[i],
                                              fh->f_procs_in_group[i],
                                              123,
                                              MCA_PML_BASE_SEND_STANDARD,
                                              fh->f_comm,
                                              &send_req[i]));
                    if(OMPI_SUCCESS != ret){
                        goto exit;
                    }
                }
            }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_rcomm_time = MPI_Wtime();
            rcomm_time += end_rcomm_time - start_rcomm_time;
#endif
        }

        /**********************************************************
         *** 7f.  Scatter the Data from the readers
         *********************************************************/
        if ( recvbuf_is_contiguous ) {
            receive_buf = &((char*)buf)[position];
        }
        else if (bytes_received) {
            /* allocate a receive buffer and copy the data that needs
               to be received into it in case the data is non-contigous
               in memory */
            receive_buf = malloc (bytes_received);
            if (NULL == receive_buf) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        start_rcomm_time = MPI_Wtime();
#endif
        ret = MCA_PML_CALL(irecv(receive_buf,
                                 bytes_received,
                                 MPI_BYTE,
                                 my_aggregator,
                                 123,
                                 fh->f_comm,
                                 &recv_req));
        if (OMPI_SUCCESS != ret){
            goto exit;
        }


        if (my_aggregator == fh->f_rank){
            ret = ompi_request_wait_all (fh->f_procs_per_group,
                                         send_req,
                                         MPI_STATUS_IGNORE);
            if (OMPI_SUCCESS != ret){
                goto exit;
            }
        }

        ret = ompi_request_wait (&recv_req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }
        position += bytes_received;

        /* If data is not contigous in memory, copy the data from the
           receive buffer into the buffer passed in */
        if (!recvbuf_is_contiguous ) {
            OPAL_PTRDIFF_TYPE mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;

            remaining = bytes_received;

            while (remaining) {
                mem_address = (OPAL_PTRDIFF_TYPE)
                    (decoded_iov[iov_index].iov_base) + current_position;

                if (remaining >=
                    (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                        (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                        (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else {
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }

            if (NULL != receive_buf) {
                free (receive_buf);
                receive_buf = NULL;
            }
        }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time += end_rcomm_time - start_rcomm_time;
#endif
    } /* end for (index=0; index < cycles; index ++) */

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rexch = MPI_Wtime();
    read_exch += end_rexch - start_rexch;
    nentry.time[0] = read_time;
    nentry.time[1] = rcomm_time;
    nentry.time[2] = read_exch;
    if (my_aggregator == fh->f_rank)
        nentry.aggregator = 1;
    else
        nentry.aggregator = 0;
    nentry.nprocs_for_coll = dynamic_gen2_num_io_procs;
    if (!fh->f_full_print_queue(READ_PRINT_QUEUE)){
        fh->f_register_print_entry(READ_PRINT_QUEUE,
                                   nentry);
    }
#endif

exit:
    if (!recvbuf_is_contiguous) {
        if (NULL != receive_buf) {
            free (receive_buf);
            receive_buf = NULL;
        }
    }
    if (NULL != global_buf) {
        free (global_buf);
        global_buf = NULL;
    }
    if (NULL != sorted) {
        free (sorted);
        sorted = NULL;
    }
    if (NULL != global_iov_array) {
        free (global_iov_array);
        global_iov_array = NULL;
    }
    if (NULL != fview_count) {
        free (fview_count);
        fview_count = NULL;
    }
    if (NULL != decoded_iov) {
        free (decoded_iov);
        decoded_iov = NULL;
    }
    if (NULL != local_iov_array){
        free(local_iov_array);
        local_iov_array=NULL;
    }

    if (NULL != displs) {
        free (displs);
        displs = NULL;
    }
    if (my_aggregator == fh->f_rank) {

        if (NULL != sorted_file_offsets){
            free(sorted_file_offsets);
            sorted_file_offsets = NULL;
        }
        if (NULL != file_offsets_for_agg){
            free(file_offsets_for_agg);
            file_offsets_for_agg = NULL;
        }
        if (NULL != memory_displacements){
            free(memory_displacements);
            memory_displacements= NULL;
        }
        if (NULL != sendtype){
            for (i = 0; i < fh->f_procs_per_group; i++) {
                if ( MPI_DATATYPE_NULL != sendtype[i] ) {
                    ompi_datatype_destroy(&sendtype[i]);
                }
            }
            free(sendtype);
            sendtype=NULL;
        }

        if (NULL != disp_index){
            free(disp_index);
            disp_index = NULL;
        }

        if ( NULL != blocklen_per_process){
            for(l=0;l<fh->f_procs_per_group;l++){
                if (NULL != blocklen_per_process[l]){
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
            }

            free(blocklen_per_process);
            blocklen_per_process = NULL;
        }

        if (NULL != displs_per_process){
            for (l=0; i<fh->f_procs_per_group; l++){
                if (NULL != displs_per_process[l]){
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
            }
            free(displs_per_process);
            displs_per_process = NULL;
        }
        if ( NULL != send_req ) {
            free ( send_req );
            send_req = NULL;
        }
    }
    return ret;
}
示例#26
0
int mca_io_ompio_set_view_internal(mca_io_ompio_file_t *fh,
				   OMPI_MPI_OFFSET_TYPE disp,
				   ompi_datatype_t *etype,
				   ompi_datatype_t *filetype,
				   char *datarep,
				   ompi_info_t *info)
{

    size_t max_data = 0;
    int i;
    int num_groups = 0;
    contg *contg_groups;

    size_t ftype_size;
    OPAL_PTRDIFF_TYPE ftype_extent, lb, ub;
    ompi_datatype_t *newfiletype;

    if ( NULL != fh->f_etype ) {
        ompi_datatype_destroy (&fh->f_etype);
    }
    if ( NULL != fh->f_filetype ) {
        ompi_datatype_destroy (&fh->f_filetype);
    }
    if ( NULL != fh->f_orig_filetype ) {
        ompi_datatype_destroy (&fh->f_orig_filetype);
    }
    if (NULL != fh->f_decoded_iov) {
        free (fh->f_decoded_iov);
        fh->f_decoded_iov = NULL;
    }

    if (NULL != fh->f_datarep) {
        free (fh->f_datarep);
        fh->f_datarep = NULL;
    }

    /* Reset the flags first */
    fh->f_flags = 0;

    fh->f_flags |= OMPIO_FILE_VIEW_IS_SET;
    fh->f_datarep = strdup (datarep);
    ompi_datatype_duplicate (filetype, &fh->f_orig_filetype );

    opal_datatype_get_extent(&filetype->super, &lb, &ftype_extent);
    opal_datatype_type_size (&filetype->super, &ftype_size);

    if ( etype == filetype                             &&
	 ompi_datatype_is_predefined (filetype )       &&
	 ftype_extent == (OPAL_PTRDIFF_TYPE)ftype_size ){
	ompi_datatype_create_contiguous(MCA_IO_DEFAULT_FILE_VIEW_SIZE,
					&ompi_mpi_byte.dt,
					&newfiletype);
	ompi_datatype_commit (&newfiletype);
    }
    else {
        newfiletype = filetype;
    }



    fh->f_iov_count   = 0;
    fh->f_disp        = disp;
    fh->f_offset      = disp;
    fh->f_total_bytes = 0;

    ompi_io_ompio_decode_datatype (fh,
                                   newfiletype,
                                   1,
                                   NULL,
                                   &max_data,
                                   &fh->f_decoded_iov,
                                   &fh->f_iov_count);

    opal_datatype_get_extent(&newfiletype->super, &lb, &fh->f_view_extent);
    opal_datatype_type_ub   (&newfiletype->super, &ub);
    opal_datatype_type_size (&etype->super, &fh->f_etype_size);
    opal_datatype_type_size (&newfiletype->super, &fh->f_view_size);
    ompi_datatype_duplicate (etype, &fh->f_etype);
    ompi_datatype_duplicate (newfiletype, &fh->f_filetype);

    fh->f_cc_size = get_contiguous_chunk_size (fh);

    if (opal_datatype_is_contiguous_memory_layout(&etype->super,1)) {
        if (opal_datatype_is_contiguous_memory_layout(&filetype->super,1) &&
	    fh->f_view_extent == (OPAL_PTRDIFF_TYPE)fh->f_view_size ) {
            fh->f_flags |= OMPIO_CONTIGUOUS_FVIEW;
        }
    }

    contg_groups = (contg*) calloc ( 1, fh->f_size * sizeof(contg));
    if (NULL == contg_groups) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    for( i = 0; i < fh->f_size; i++){
       contg_groups[i].procs_in_contg_group = (int*)calloc (1,fh->f_size * sizeof(int));
       if(NULL == contg_groups[i].procs_in_contg_group){
          int j;
          opal_output (1, "OUT OF MEMORY\n");
          for(j=0; j<i; j++) {
              free(contg_groups[j].procs_in_contg_group);
          }
          free(contg_groups);
          return OMPI_ERR_OUT_OF_RESOURCE;
       }
    }

    if( OMPI_SUCCESS != mca_io_ompio_fview_based_grouping(fh,
                                                          &num_groups,
                                                          contg_groups)){
       opal_output(1, "mca_io_ompio_fview_based_grouping() failed\n");
       free(contg_groups);
       return OMPI_ERROR;
    }
    if( !( (fh->f_comm->c_flags & OMPI_COMM_CART) &&
           (num_groups == 1 || num_groups == fh->f_size)) ) {
          mca_io_ompio_finalize_initial_grouping(fh,
                                                 num_groups,
                                                 contg_groups);
    }
    for( i = 0; i < fh->f_size; i++){
       free(contg_groups[i].procs_in_contg_group);
    }
    free(contg_groups);

    if ( etype == filetype                              &&
	 ompi_datatype_is_predefined (filetype )        &&
	 ftype_extent == (OPAL_PTRDIFF_TYPE)ftype_size ){
	ompi_datatype_destroy ( &newfiletype );
    }


    if (OMPI_SUCCESS != mca_fcoll_base_file_select (fh, NULL)) {
        opal_output(1, "mca_fcoll_base_file_select() failed\n");
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
static int
cyclic(const int *gsize_array, int dim, int ndims, int nprocs,
       int rank, int darg, int order, ptrdiff_t orig_extent,
       ompi_datatype_t* type_old, ompi_datatype_t **type_new,
       ptrdiff_t *st_offset)
{
    int blksize, i, blklens[2], st_index, end_index, local_size, rem, count, rc;
    ptrdiff_t stride, disps[2];
    ompi_datatype_t *type_tmp, *types[2];

    if (darg == MPI_DISTRIBUTE_DFLT_DARG) {
        blksize = 1;
    } else {
        blksize = darg;
    }

    st_index = rank * blksize;
    end_index = gsize_array[dim] - 1;

    if (end_index < st_index) {
        local_size = 0;
    } else {
        local_size = ((end_index - st_index + 1)/(nprocs*blksize))*blksize;
        rem = (end_index - st_index + 1) % (nprocs*blksize);
        local_size += rem < blksize ? rem : blksize;
    }

    count = local_size / blksize;
    rem = local_size % blksize;

    stride = nprocs*blksize*orig_extent;
    if (order == MPI_ORDER_FORTRAN) {
        for (i=0; i<dim; i++) {
            stride *= gsize_array[i];
        }
    } else {
        for (i=ndims-1; i>dim; i--) {
            stride *= gsize_array[i];
        }
    }

    rc = ompi_datatype_create_hvector(count, blksize, stride, type_old, type_new);
    if (OMPI_SUCCESS != rc) return rc;

    if (rem) {
        /* if the last block is of size less than blksize, include
           it separately using MPI_Type_struct */

        types  [0] = *type_new; types  [1] = type_old;
        disps  [0] = 0;         disps  [1] = count*stride;
        blklens[0] = 1;         blklens[1] = rem;

        rc = ompi_datatype_create_struct(2, blklens, disps, types, &type_tmp);
        ompi_datatype_destroy(type_new);
        /* even in error condition, need to destroy type_new, so check
           for error after destroy. */
        if (OMPI_SUCCESS != rc) return rc;
        *type_new = type_tmp;
    }

    /* need to set the UB for block-cyclic to work */
    disps[0] = 0;         disps[1] = orig_extent;
    if (order == MPI_ORDER_FORTRAN) {
        for(i=0; i<=dim; i++) {
            disps[1] *= gsize_array[i];
        }
    } else {
        for(i=ndims-1; i>=dim; i--) {
            disps[1] *= gsize_array[i];
        }
    }
    rc = opal_datatype_resize( &(*type_new)->super, disps[0], disps[1] );
    if (OMPI_SUCCESS != rc) return rc;

    *st_offset = rank * blksize;
    /* in terms of no. of elements of type oldtype in this dimension */
    if (local_size == 0) *st_offset = 0;

    return OMPI_SUCCESS;
}
/*
 *	allgatherv_intra
 *
 *	Function:	- allgatherv using other MPI collectives
 *	Accepts:	- same as MPI_Allgatherv()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_allgatherv_intra(void *sbuf, int scount,
                                struct ompi_datatype_t *sdtype,
                                void *rbuf, int *rcounts, int *disps,
                                struct ompi_datatype_t *rdtype,
                                struct ompi_communicator_t *comm,
                                mca_coll_base_module_t *module)
{
    int i, size, rank ;
    int err;
    MPI_Aint extent;
    MPI_Aint lb;
    char *send_buf = NULL;
    struct ompi_datatype_t *newtype, *send_type;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);
    /*
     * We don't have a root process defined. Arbitrarily assign root
     * to process with rank 0 (OMPI convention)
     */

    if (MPI_IN_PLACE == sbuf) {
        ompi_datatype_get_extent(rdtype, &lb, &extent);
        send_type = rdtype;
        send_buf = (char*)rbuf;
        for (i = 0; i < rank; ++i) {
            send_buf += (rcounts[i] * extent);
        }
    } else {
        send_buf = (char*)sbuf;
        send_type = sdtype;
    }

    err = comm->c_coll.coll_gatherv(send_buf,
                                    rcounts[rank], send_type,rbuf,
                                    rcounts, disps, rdtype, 0,
                                    comm, comm->c_coll.coll_gatherv_module);

    if (MPI_SUCCESS != err) {
        return err;
    }
    /*
     * we now have all the data in the root's rbuf. Need to
     * broadcast the data out to the other processes
     *
     * Need to define a datatype that captures the different vectors
     * from each process. MPI_TYPE_INDEXED with params 
     *                    size,rcount,displs,rdtype,newtype
     * should do the trick.
     * Use underlying ddt functions to create, and commit the
     * new datatype on each process, then broadcast and destroy the
     * datatype.
     */

    err = ompi_datatype_create_indexed(size,rcounts,disps,rdtype,&newtype);
    if (MPI_SUCCESS != err) {
        return err;
    }
    
    err = ompi_datatype_commit(&newtype);
    if(MPI_SUCCESS != err) {
       return err;
    }

    err = comm->c_coll.coll_bcast( rbuf, 1 ,newtype,0,comm,
                                   comm->c_coll.coll_bcast_module);

    ompi_datatype_destroy (&newtype);

    return err;
}