static int bcol_ptpcoll_barrier_recurs_dbl_new(
                                bcol_function_args_t *input_args,
                                struct coll_ml_function_t *const_args)
{
    /* local variables */
    uint64_t sequence_number;
    mca_bcol_ptpcoll_module_t *ptp_module =
                         (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;

    ompi_communicator_t *comm = ptp_module->super.sbgp_partner_module->group_comm;

    int rc, my_extra_partner_comm_rank = 0, exchange, completed,
        pair_comm_rank, pair_rank, delta, tag, num_reqs = 0,
        my_rank = ptp_module->super.sbgp_partner_module->my_index,
        n_exchange = ptp_module->super.sbgp_partner_module->n_levels_pow2;

    ompi_request_t **requests;
    ompi_free_list_item_t *item;

    mca_bcol_ptpcoll_collreq_t *collreq;

    OMPI_FREE_LIST_WAIT_MT(&ptp_module->collreqs_free, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        PTPCOLL_ERROR(("Free list waiting failed."));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    collreq = (mca_bcol_ptpcoll_collreq_t *) item;
    input_args->bcol_opaque_data = (void *) collreq;

    assert(PTPCOLL_EXTRA != ptp_module->pow_2type);

    requests = collreq->requests;

    /* TAG Calculation */
    sequence_number = input_args->sequence_num;

    /* keep the tag within the limit supported by the pml */
    tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptp_module->tag_mask);

    /* mark this as a collective tag, to avoid conflicts with user-level tags */
    tag = -tag;

    if (PTPCOLL_PROXY == ptp_module->pow_2type) {
        /* I will participate in the exchange - wait for signal from extra
         ** process */
        /*
         * recv from extra rank - my_extra_partner_comm_rank
         *  can use blocking recv, as no other communications
         *  need to take place.
         */
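        /*
         * The zero-byte irecv below carries no payload: a barrier only has to
         * establish that the extra rank has arrived, so an empty message is
         * used purely as the arrival signal.
         */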
        my_extra_partner_comm_rank =
                       ptp_module->super.sbgp_partner_module->group_list[ptp_module->proxy_extra_index];

        collreq->need_toserv_extra = 1;
        collreq->extra_partner_rank = my_extra_partner_comm_rank;

        rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
                    my_extra_partner_comm_rank, tag, comm,
                    &(requests[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("IRecv failed."));
            return rc;
        }

        completed = mca_bcol_ptpcoll_test_for_match(&requests[0], &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for irecv failed."));
            return rc;
        }

        if (!completed) {
            collreq->tag = tag;
            collreq->num_reqs = 1;
            collreq->exchange = 0;

            return BCOL_FN_STARTED;
        }
    } else {
        collreq->need_toserv_extra = 0;
    }

    /* Loop over exchange send/recv pairs */
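    /* Recursive doubling: in round k each rank pairs with (my_rank ^ 2^k),
     * so after n_exchange = log2(group size) rounds every rank has
     * synchronized, directly or transitively, with every other rank.
     * E.g. with n_exchange = 3 (an 8-rank power-of-two group), rank 5
     * exchanges with ranks 4 (delta 1), 7 (delta 2) and 1 (delta 4). */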
    delta = 1;
    for (exchange = 0; exchange < n_exchange; ++exchange) {

        /* rank of exchange partner within the group */
        pair_rank = my_rank ^ delta;

        /* rank within the communicator */
        pair_comm_rank =
            ptp_module->super.sbgp_partner_module->group_list[pair_rank];

        /* send to partner - we will wait for completion, as send
         *   completion is at the MPI level, and will not
         *   incur network level completion costs
         */
        rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                    pair_comm_rank, tag,
                    MCA_PML_BASE_SEND_STANDARD, comm,
                    &(requests[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("ISend failed."));
            return rc;
        }

        ++num_reqs;

        /* receive from partner */
        rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
                    pair_comm_rank, tag, comm,
                    &(requests[1])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("IRecv failed."));
            return rc;
        }

        ++num_reqs;

        PTPCOLL_VERBOSE(5, ("exchange - %d, pair_rank - %d, pair_comm_rank - %d",
                             exchange, pair_rank, pair_comm_rank));

        /* test for completion */
        completed =
            mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!completed) {
            collreq->tag = tag;
            collreq->num_reqs = num_reqs;

            collreq->exchange = exchange + 1;
            assert(collreq->exchange >= 0);

            return BCOL_FN_STARTED;
        }

        delta <<= 1; /* delta *= 2 */
    }

    if (PTPCOLL_PROXY == ptp_module->pow_2type) {
        /* send - let the extra rank know that we are done */
        rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                    my_extra_partner_comm_rank, tag,
                    MCA_PML_BASE_SEND_STANDARD, comm,
                    &(requests[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("ISend failed."));
            return rc;
        }

        completed = mca_bcol_ptpcoll_test_for_match(&requests[0], &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for isend failed."));
            return rc;
        }

        if (!completed) {
            collreq->tag = tag;
            collreq->num_reqs = 1;

            collreq->need_toserv_extra = 0;
            collreq->exchange = n_exchange;

            return BCOL_FN_STARTED;
        }
    }

    OMPI_FREE_LIST_RETURN_MT(&ptp_module->collreqs_free, (ompi_free_list_item_t *) collreq);
    return BCOL_FN_COMPLETE;
}
static int
mca_coll_basic_neighbor_alltoallv_cart(const void *sbuf, const int scounts[], const int sdisps[],
                                       struct ompi_datatype_t *sdtype, void *rbuf, const int rcounts[],
                                       const int rdisps[], struct ompi_datatype_t *rdtype,
                                       struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
{
    const mca_topo_base_comm_cart_2_2_0_t *cart = comm->c_topo->mtc.cart;
    const int rank = ompi_comm_rank (comm);
    int rc = MPI_SUCCESS, dim, i, nreqs;
    ptrdiff_t lb, rdextent, sdextent;
    ompi_request_t **reqs, **preqs;

    if( 0 == cart->ndims ) return OMPI_SUCCESS;

    ompi_datatype_get_extent(rdtype, &lb, &rdextent);
    ompi_datatype_get_extent(sdtype, &lb, &sdextent);
    reqs = preqs = ompi_coll_base_comm_get_reqs( module->base_data, 4 * cart->ndims );
    if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; }

    /* post receives first */
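    /* Each Cartesian dimension contributes at most two neighbors: the "source"
     * of a +1 shift at index i and the "destination" at index i+1, which is
     * why rcounts/rdisps (and later scounts/sdisps) are indexed in pairs and
     * up to 4 * ndims requests are obtained above. Receives are posted before
     * the sends so that incoming messages find preposted requests where
     * possible. */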
    for (dim = 0, nreqs = 0, i = 0; dim < cart->ndims ; ++dim, i += 2) {
        int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL;

        if (cart->dims[dim] > 1) {
            mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank);
        } else if (1 == cart->dims[dim] && cart->periods[dim]) {
            srank = drank = rank;
        }

        if (MPI_PROC_NULL != srank) {
            nreqs++;
            rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[i] * rdextent, rcounts[i], rdtype, srank,
                                    MCA_COLL_BASE_TAG_ALLTOALL, comm, preqs++));
            if (OMPI_SUCCESS != rc) break;
        }

        if (MPI_PROC_NULL != drank) {
            nreqs++;
            rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[i+1] * rdextent, rcounts[i+1], rdtype, drank,
                                    MCA_COLL_BASE_TAG_ALLTOALL, comm, preqs++));
            if (OMPI_SUCCESS != rc) break;
        }
    }

    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs( reqs, nreqs );
        return rc;
    }

    for (dim = 0, i = 0 ; dim < cart->ndims ; ++dim, i += 2) {
        int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL;

        if (cart->dims[dim] > 1) {
            mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank);
        } else if (1 == cart->dims[dim] && cart->periods[dim]) {
            srank = drank = rank;
        }

        if (MPI_PROC_NULL != srank) {
            nreqs++;
            /* remove cast from const when the pml layer is updated to take a const for the send buffer */
            rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[i] * sdextent, scounts[i], sdtype, srank,
                                    MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, preqs++));
            if (OMPI_SUCCESS != rc) break;
        }

        if (MPI_PROC_NULL != drank) {
            nreqs++;
            rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[i+1] * sdextent, scounts[i+1], sdtype, drank,
                                    MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, preqs++));
            if (OMPI_SUCCESS != rc) break;
        }
    }

    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs( reqs, nreqs );
        return rc;
    }

    rc = ompi_request_wait_all (nreqs, reqs, MPI_STATUSES_IGNORE);
    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs( reqs, nreqs );
    }
    return rc;
}
/**
 * This is a generic implementation of the reduce protocol. It uses the tree
 * provided as an argument and executes all operations on segments of
 * count_by_segment elements of the given datatype.
 * For the last communication it updates the count in order to limit
 * the number of elements to the original count (original_count).
 *
 * Note that for non-commutative operations we cannot avoid the memory copy
 * for the first block: thus we must copy sendbuf to accumbuf on intermediate
 * nodes to keep the optimized loop happy.
 */
int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_count,
                                    ompi_datatype_t* datatype, ompi_op_t* op,
                                    int root, ompi_communicator_t* comm,
                                    mca_coll_base_module_t *module,
                                    ompi_coll_tree_t* tree, int count_by_segment,
                                    int max_outstanding_reqs )
{
    char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL};
    char *accumbuf = NULL, *accumbuf_free = NULL;
    char *local_op_buffer = NULL, *sendtmpbuf = NULL;
    ptrdiff_t extent, lower_bound, segment_increment;
    size_t typelng;
    ompi_request_t* reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
    int num_segments, line, ret, segindex, i, rank;
    int recvcount, prevcount, inbi;

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_get_extent( datatype, &lower_bound, &extent );
    ompi_datatype_type_size( datatype, &typelng );
    num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    segment_increment = (ptrdiff_t)count_by_segment * extent;
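    /* e.g. original_count = 1000 with count_by_segment = 256 gives
     * num_segments = 4: three full segments of 256 elements and a final
     * segment carrying the remaining 232 elements. */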

    sendtmpbuf = (char*) sendbuf; 
    if( sendbuf == MPI_IN_PLACE ) { 
        sendtmpbuf = (char *)recvbuf; 
    }

    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_generic count %d, msg size %ld, segsize %ld, max_requests %d",
                 original_count, (unsigned long)((ptrdiff_t)num_segments * (ptrdiff_t)segment_increment),
                 (unsigned long)segment_increment, max_outstanding_reqs));

    rank = ompi_comm_rank(comm);

    /*printf("Tree of rank %d - ", rank);
    printf("Parent : %d - ", tree->tree_prev);
    printf("Child : ");
    for (i = 0; i < tree->tree_nextsize; i++)
        printf("%d ", tree->tree_next[i]);
    printf("\n");*/

    /* non-leaf nodes - wait for children to send me data & forward up 
       (if needed) */
    if( tree->tree_nextsize > 0 ) {
        ptrdiff_t true_lower_bound, true_extent, real_segment_size;
        ompi_datatype_get_true_extent( datatype, &true_lower_bound, 
                                       &true_extent );

        /* handle a non-existent recv buffer (i.e. it is NULL) and
           protect the recv buffer on non-root nodes */
        accumbuf = (char*)recvbuf;
        if( (NULL == accumbuf) || (root != rank) ) {
            /* Allocate temporary accumulator buffer. */
            accumbuf_free = (char*)malloc(true_extent + 
                                          (ptrdiff_t)(original_count - 1) * extent);
            if (accumbuf_free == NULL) { 
                line = __LINE__; ret = -1; goto error_hndl; 
            }
            accumbuf = accumbuf_free - lower_bound;
        } 

        /* If this is a non-commutative operation we must copy
           sendbuf to the accumbuf, in order to simplify the loops */
        if (!ompi_op_is_commute(op)) {
            ompi_datatype_copy_content_same_ddt(datatype, original_count, 
                                                (char*)accumbuf,
                                                (char*)sendtmpbuf);
        }
        /* Allocate two buffers for incoming segments */
        real_segment_size = true_extent + (ptrdiff_t)(count_by_segment - 1) * extent;
        inbuf_free[0] = (char*) malloc(real_segment_size);
        if( inbuf_free[0] == NULL ) { 
            line = __LINE__; ret = -1; goto error_hndl; 
        }
        inbuf[0] = inbuf_free[0] - lower_bound;
        /* if there is a chance to overlap communication -
           allocate a second buffer */
        if( (num_segments > 1) || (tree->tree_nextsize > 1) ) {
            inbuf_free[1] = (char*) malloc(real_segment_size);
            if( inbuf_free[1] == NULL ) { 
                line = __LINE__; ret = -1; goto error_hndl;
            }
            inbuf[1] = inbuf_free[1] - lower_bound;
        } 

        /* reset input buffer index and receive count */
        inbi = 0;
        recvcount = 0;
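        /* Double buffering: while the reduction is applied to the segment
         * sitting in inbuf[inbi ^ 1], the receive for the next segment (or
         * next child) has already been posted into inbuf[inbi], overlapping
         * computation with communication. */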
        /* for each segment */
        for( segindex = 0; segindex <= num_segments; segindex++ ) {
            prevcount = recvcount;
            /* recvcount - number of elements in current segment */
            recvcount = count_by_segment;
            if( segindex == (num_segments-1) )
                recvcount = original_count - (ptrdiff_t)count_by_segment * (ptrdiff_t)segindex;

            /* for each child */
            for( i = 0; i < tree->tree_nextsize; i++ ) {
                /**
                 * We try to overlap communication:
                 * either with next segment or with the next child
                 */
                /* post irecv for current segindex on current child */
                if( segindex < num_segments ) {
                    void* local_recvbuf = inbuf[inbi];
                    if( 0 == i ) {
                        /* for the first step (1st child per segment) and
                         * commutative operations we might be able to irecv
                         * directly into the accumulate buffer so that we can
                         * reduce(op) this with our sendbuf in one step, as
                         * ompi_op_reduce only has two buffer pointers;
                         * this avoids an extra memory copy.
                         *
                         * BUT if the operation is non-commutative, or
                         * we are the root and are using MPI_IN_PLACE, this is wrong!
                         */
                        if( (ompi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_recvbuf = accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment;
                        }
                    }
                    ret = MCA_PML_CALL(irecv(local_recvbuf, recvcount, datatype,
                                             tree->tree_next[i], 
                                             MCA_COLL_BASE_TAG_REDUCE, comm, 
                                             &reqs[inbi]));
                    if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;}
                }
                /* wait for previous req to complete, if any.
                   if there are no requests reqs[inbi ^1] will be 
                   MPI_REQUEST_NULL. */
                /* wait on data from last child for previous segment */
                ret = ompi_request_wait_all( 1, &reqs[inbi ^ 1], 
                                             MPI_STATUSES_IGNORE );
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }
                local_op_buffer = inbuf[inbi ^ 1];
                if( i > 0 ) {
                    /* our first operation is to combine our own [sendbuf] data
                     * with the data we received from downstream (but only if
                     * the operation is commutative and we are not the root
                     * using MPI_IN_PLACE)
                     */
                    if( 1 == i ) {
                        if( (ompi_op_is_commute(op)) && 
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_op_buffer = sendtmpbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment;
                        }
                    }
                    /* apply operation */
                    ompi_op_reduce(op, local_op_buffer, 
                                   accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, 
                                   recvcount, datatype );
                } else if ( segindex > 0 ) {
                    void* accumulator = accumbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment;
                    if( tree->tree_nextsize <= 1 ) {
                        if( (ompi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_op_buffer = sendtmpbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment;
                        }
                    }
                    ompi_op_reduce(op, local_op_buffer, accumulator, prevcount, 
                                   datatype );

                    /* all reductions on the data available at this step (i)
                     * are complete; pass the result up to the parent unless
                     * we are the root.
                     */
                    if (rank != tree->tree_root) {
                        /* send combined/accumulated data to parent */
                        ret = MCA_PML_CALL( send( accumulator, prevcount, 
                                                  datatype, tree->tree_prev, 
                                                  MCA_COLL_BASE_TAG_REDUCE,
                                                  MCA_PML_BASE_SEND_STANDARD, 
                                                  comm) );
                        if (ret != MPI_SUCCESS) { 
                            line = __LINE__; goto error_hndl;  
                        }
                    }

                    /* we stop when segindex = number of segments
                       (i.e. we do num_segments+1 steps for pipelining) */
                    if (segindex == num_segments) break;
                }

                /* update input buffer index */
                inbi = inbi ^ 1;
            } /* end of for each child */
        } /* end of for each segment */

        /* clean up */
        if( inbuf_free[0] != NULL) free(inbuf_free[0]);
        if( inbuf_free[1] != NULL) free(inbuf_free[1]);
        if( accumbuf_free != NULL ) free(accumbuf_free);
    }

    /* leaf nodes
       Depending on the value of max_outstanding_reqs and
       the number of segments we have two options:
       - send all segments using blocking send to the parent, or
       - avoid flooding the parent nodes by limiting the number of
         outstanding requests to max_outstanding_reqs.
       TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size
       for the current communication, synchronization should be used only
       when the message/segment size is smaller than the eager size.
    */
    else {

        /* If the number of segments is less than the maximum number of
           outstanding requests, or there is no limit on the number of
           outstanding requests, we send data to the parent using blocking sends */
        if ((0 == max_outstanding_reqs) || 
            (num_segments <= max_outstanding_reqs)) {
            
            segindex = 0;
            while ( original_count > 0) {
                if (original_count < count_by_segment) {
                    count_by_segment = original_count;
                }
                ret = MCA_PML_CALL( send((char*)sendbuf + 
                                         (ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
                                         count_by_segment, datatype,
                                         tree->tree_prev, 
                                         MCA_COLL_BASE_TAG_REDUCE,
                                         MCA_PML_BASE_SEND_STANDARD,
                                         comm) );
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                segindex++;
                original_count -= count_by_segment;
            }
        }

        /* Otherwise, introduce flow control:
           - post max_outstanding_reqs non-blocking synchronous sends,
           - for each remaining segment, wait for an ssend to complete and
             post the next one,
           - finally, wait for all outstanding sends to complete.
        */
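        /* E.g. with max_outstanding_reqs = 4 and 10 segments: four ssends are
           posted up front, then each completion triggers posting the next
           segment, so at most four sends are ever in flight. */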
        else {

            int creq = 0;
            ompi_request_t **sreq = NULL;

            sreq = (ompi_request_t**) calloc( max_outstanding_reqs,
                                              sizeof(ompi_request_t*) );
            if (NULL == sreq) { line = __LINE__; ret = -1; goto error_hndl; }

            /* post first group of requests */
            for (segindex = 0; segindex < max_outstanding_reqs; segindex++) {
                ret = MCA_PML_CALL( isend((char*)sendbuf +
                                          (ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
                                          count_by_segment, datatype,
                                          tree->tree_prev, 
                                          MCA_COLL_BASE_TAG_REDUCE,
                                          MCA_PML_BASE_SEND_SYNCHRONOUS, comm,
                                          &sreq[segindex]) );
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }
                original_count -= count_by_segment;
            }

            creq = 0;
            while ( original_count > 0 ) {
                /* wait on a posted request to complete */
                ret = ompi_request_wait(&sreq[creq], MPI_STATUS_IGNORE);
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }
                sreq[creq] = MPI_REQUEST_NULL;

                if( original_count < count_by_segment ) {
                    count_by_segment = original_count;
                }
                ret = MCA_PML_CALL( isend((char*)sendbuf + 
                                          (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, 
                                          count_by_segment, datatype, 
                                          tree->tree_prev, 
                                          MCA_COLL_BASE_TAG_REDUCE, 
                                          MCA_PML_BASE_SEND_SYNCHRONOUS, comm, 
                                          &sreq[creq]) );
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }
                creq = (creq + 1) % max_outstanding_reqs;
                segindex++;
                original_count -= count_by_segment;
            }

            /* Wait for the remaining requests to complete */
            ret = ompi_request_wait_all( max_outstanding_reqs, sreq, 
                                         MPI_STATUSES_IGNORE );
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }

            /* free requests */
            free(sreq);
        }
    }
    return OMPI_SUCCESS;

 error_hndl:  /* error handler */
    OPAL_OUTPUT (( ompi_coll_tuned_stream, 
                   "ERROR_HNDL: node %d file %s line %d error %d\n", 
                   rank, __FILE__, line, ret ));
    if( inbuf_free[0] != NULL ) free(inbuf_free[0]);
    if( inbuf_free[1] != NULL ) free(inbuf_free[1]);
    if( accumbuf_free != NULL ) free(accumbuf_free);
    return ret;
}
/* MPI_IN_PLACE all to all algorithm. TODO: implement a better one. */
int
mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount,
                                           struct ompi_datatype_t *rdtype,
                                           struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module)
{
    int i, j, size, rank, err = MPI_SUCCESS, line;
    OPAL_PTRDIFF_TYPE ext, gap;
    ompi_request_t *req;
    char *allocated_buffer = NULL, *tmp_buffer;
    size_t max_size;

    /* Initialize. */

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    /* If only one process, we're done. */
    if (1 == size) {
        return MPI_SUCCESS;
    }

    /* Find the largest receive amount */
    ompi_datatype_type_extent (rdtype, &ext);
    max_size = opal_datatype_span(&rdtype->super, rcount, &gap);

    /* Initiate all send/recv to/from others. */

    /* Allocate a temporary buffer */
    allocated_buffer = calloc (max_size, 1);
    if( NULL == allocated_buffer) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto error_hndl; }
    tmp_buffer = allocated_buffer - gap;
    max_size = ext * rcount;

    /* in-place alltoall slow algorithm (but works) */
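    /* Each unordered pair (i, j) with i < j is handled exactly once: the two
     * ranks involved copy their outgoing block into tmp_buffer and then swap
     * blocks with an irecv/send pair, while every other rank skips the
     * iteration. */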
    for (i = 0 ; i < size ; ++i) {
        for (j = i+1 ; j < size ; ++j) {
            if (i == rank) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer,
                                                       (char *) rbuf + j * max_size);
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * j, rcount, rdtype,
                                          j, MCA_COLL_BASE_TAG_ALLTOALL, comm, &req));
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }

                err = MCA_PML_CALL(send ((char *) tmp_buffer,  rcount, rdtype,
                                          j, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
                                          comm));
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
            } else if (j == rank) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer,
                                                       (char *) rbuf + i * max_size);
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * i, rcount, rdtype,
                                          i, MCA_COLL_BASE_TAG_ALLTOALL, comm, &req));
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }

                err = MCA_PML_CALL(send ((char *) tmp_buffer,  rcount, rdtype,
                                          i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
                                          comm));
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
            } else {
                continue;
            }

            /* Wait for the receive request to complete */
            err = ompi_request_wait ( &req, MPI_STATUS_IGNORE);
            if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
        }
    }

 error_hndl:
    /* Free the temporary buffer */
    if( NULL != allocated_buffer )
        free (allocated_buffer);

    if( MPI_SUCCESS != err ) {
        OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                     "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
                     rank));
        (void)line;  // silence compiler warning
    }

    /* All done */
    return err;
}
int mca_io_ompio_finalize_initial_grouping(mca_io_ompio_file_t *fh,
		                           int num_groups,
					   contg *contg_groups)
{

    int z = 0;
    int y = 0;
    int r = 0;

    MPI_Request *sendreq = NULL , *req = NULL;


    req = (MPI_Request *)malloc (2* sizeof(MPI_Request));
    if (NULL == req) {
       return OMPI_ERR_OUT_OF_RESOURCE;
    }

    fh->f_init_num_aggrs = num_groups;
    fh->f_init_aggr_list = (int*)malloc (fh->f_init_num_aggrs * sizeof(int));
    if (NULL == fh->f_init_aggr_list) {
        opal_output (1, "OUT OF MEMORY\n");
        free(req);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
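    /* A sketch of the handshake below: OMPIO_ROOT isends two messages to every
     * process -- the size of that process's contiguous group, then the list of
     * ranks in it -- while each rank (the root included) posts the matching
     * irecvs and waits for them in order before the aggregator list is built
     * and broadcast. */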

    if(OMPIO_ROOT == fh->f_rank){
       sendreq = (MPI_Request *)malloc ( 2 *fh->f_size * sizeof(MPI_Request));
       if (NULL == sendreq) {
           free(req);
           return OMPI_ERR_OUT_OF_RESOURCE;
       }

       for( z = 0 ;z < num_groups; z++){
           for( y = 0; y < contg_groups[z].procs_per_contg_group; y++){
               MCA_PML_CALL(isend(&contg_groups[z].procs_per_contg_group,
                                  1,
                                  MPI_INT,
                                  contg_groups[z].procs_in_contg_group[y],
                                  OMPIO_PROCS_PER_GROUP_TAG,
                                  MCA_PML_BASE_SEND_STANDARD,
                                  fh->f_comm,
                                  &sendreq[r++]));

               //send initial grouping distribution to all processes in the group
               MCA_PML_CALL(isend(contg_groups[z].procs_in_contg_group,
                                  contg_groups[z].procs_per_contg_group,
                                  MPI_INT,
                                  contg_groups[z].procs_in_contg_group[y],
                                  OMPIO_PROCS_IN_GROUP_TAG,
                                  MCA_PML_BASE_SEND_STANDARD,
                                  fh->f_comm,
                                  &sendreq[r++]));
           }
       }
    }

    //All processes receive initial procs per group from OMPIO_ROOT
    MCA_PML_CALL(irecv(&fh->f_init_procs_per_group,
                       1,
                       MPI_INT,
                       OMPIO_ROOT,
                       OMPIO_PROCS_PER_GROUP_TAG,
                       fh->f_comm,
                       &req[0]));

    ompi_request_wait (&req[0], MPI_STATUS_IGNORE);
    fh->f_init_procs_in_group = (int*)malloc (fh->f_init_procs_per_group * sizeof(int));
    if (NULL == fh->f_init_procs_in_group) {
        opal_output (1, "OUT OF MEMORY\n");
        free(req);
        if (NULL != sendreq) {
            free(sendreq);
        }
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    //All processes receive initial process distribution from OMPIO_ROOT
    MCA_PML_CALL(irecv(fh->f_init_procs_in_group,
                       fh->f_init_procs_per_group,
                       MPI_INT,
                       OMPIO_ROOT,
                       OMPIO_PROCS_IN_GROUP_TAG,
                       fh->f_comm,
                       &req[1]));

    ompi_request_wait (&req[1], MPI_STATUS_IGNORE);
    free (req);
    if(OMPIO_ROOT == fh->f_rank){
        ompi_request_wait_all (r, sendreq, MPI_STATUSES_IGNORE);
        free (sendreq);
    }


    /* set initial aggregator list: OMPIO_ROOT selects the first process of
       each contiguous group and broadcasts the resulting list */
    if (OMPIO_ROOT == fh->f_rank) {
        for (z = 0; z < num_groups; z++) {
            fh->f_init_aggr_list[z] = contg_groups[z].procs_in_contg_group[0];
        }
    }

    fh->f_comm->c_coll.coll_bcast (fh->f_init_aggr_list,
                                   num_groups,
                                   MPI_INT,
                                   OMPIO_ROOT,
                                   fh->f_comm,
                                   fh->f_comm->c_coll.coll_bcast_module);

    return OMPI_SUCCESS;
}
/* Arguments not used in this implementation:
 *  - bridgecomm
 *  - local_leader
 *  - remote_leader
 *  - send_first
 */
static int ompi_comm_allreduce_inter ( int *inbuf, int *outbuf, 
                                       int count, struct ompi_op_t *op, 
                                       ompi_communicator_t *intercomm,
                                       ompi_communicator_t *bridgecomm, 
                                       void* local_leader, 
                                       void* remote_leader, 
                                       int send_first )
{
    int local_rank, rsize;
    int i, rc;
    int *sbuf;
    int *tmpbuf=NULL;
    int *rcounts=NULL, scount=0;
    int *rdisps=NULL;

    if ( &ompi_mpi_op_sum.op != op && &ompi_mpi_op_prod.op != op &&
         &ompi_mpi_op_max.op != op && &ompi_mpi_op_min.op  != op ) {
        return MPI_ERR_OP;
    }

    if ( !OMPI_COMM_IS_INTER (intercomm)) {
        return MPI_ERR_COMM;
    }

    /* Allocate temporary arrays */
    rsize      = ompi_comm_remote_size (intercomm);
    local_rank = ompi_comm_rank ( intercomm );

    tmpbuf  = (int *) malloc ( count * sizeof(int));
    rdisps  = (int *) calloc ( rsize, sizeof(int));
    rcounts = (int *) calloc ( rsize, sizeof(int) );
    if ( OPAL_UNLIKELY (NULL == tmpbuf || NULL == rdisps || NULL == rcounts)) {
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
    
    /* Execute the inter-allreduce: the result of our group will
       be in the buffer of the remote group */
    rc = intercomm->c_coll.coll_allreduce ( inbuf, tmpbuf, count, MPI_INT,
                                            op, intercomm,
                                            intercomm->c_coll.coll_allreduce_module);
    if ( OMPI_SUCCESS != rc ) {
        goto exit;
    }

    if ( 0 == local_rank ) {
        MPI_Request req;

        /* for the allgatherv later */
        scount = count;

        /* local leaders exchange their data and determine the overall result
           for both groups */
        rc = MCA_PML_CALL(irecv (outbuf, count, MPI_INT, 0, 
                                 OMPI_COMM_ALLREDUCE_TAG,
                                 intercomm, &req));
        if ( OMPI_SUCCESS != rc ) {
            goto exit;
        }
        rc = MCA_PML_CALL(send (tmpbuf, count, MPI_INT, 0,
                                OMPI_COMM_ALLREDUCE_TAG,
                                MCA_PML_BASE_SEND_STANDARD,
                                intercomm));
        if ( OMPI_SUCCESS != rc ) {
            goto exit;
        }
        rc = ompi_request_wait ( &req, MPI_STATUS_IGNORE );
        if ( OMPI_SUCCESS != rc ) {
            goto exit;
        }

        if ( &ompi_mpi_op_max.op == op ) {
            for ( i = 0 ; i < count; i++ ) {
                if (tmpbuf[i] > outbuf[i]) outbuf[i] = tmpbuf[i];
            }
        }
        else if ( &ompi_mpi_op_min.op == op ) {
            for ( i = 0 ; i < count; i++ ) {
                if (tmpbuf[i] < outbuf[i]) outbuf[i] = tmpbuf[i];
            }
        }
        else if ( &ompi_mpi_op_sum.op == op ) {
            for ( i = 0 ; i < count; i++ ) {
                outbuf[i] += tmpbuf[i];
            }
        }
        else if ( &ompi_mpi_op_prod.op == op ) {
            for ( i = 0 ; i < count; i++ ) {
                outbuf[i] *= tmpbuf[i];
            }
        }
    }
 
    /* distribute the overall result to all processes in the other group.
       Instead of using bcast, we use allgatherv here to avoid a possible
       deadlock; otherwise we would need an algorithm to determine which
       group sends first in the inter-bcast and which receives the result
       first.
    */
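    /* Only the local leader contributes data (scount == count on rank 0 and 0
       elsewhere) and only rcounts[0] is non-zero, so this allgatherv simply
       delivers the remote leader's (identical) overall result to every rank
       of the local group, in both groups at once. */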
    rcounts[0] = count;
    sbuf       = outbuf;
    rc = intercomm->c_coll.coll_allgatherv (sbuf, scount, MPI_INT, outbuf,
                                            rcounts, rdisps, MPI_INT, 
                                            intercomm,
                                            intercomm->c_coll.coll_allgatherv_module);
    
 exit:
    if ( NULL != tmpbuf ) {
        free ( tmpbuf );
    }
    if ( NULL != rcounts ) {
        free ( rcounts );
    }
    if ( NULL != rdisps ) {
        free ( rdisps );
    }
    
    return (rc);
}
/*
 *  reduce_scatter_intra_basic_recursivehalving
 *
 *  Function:   - reduce scatter implementation using recursive-halving 
 *                algorithm
 *  Accepts:    - same as MPI_Reduce_scatter()
 *  Returns:    - MPI_SUCCESS or error code
 *  Limitation: - Works only for commutative operations.
 */
int
ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, 
                                                            void *rbuf, 
                                                            int *rcounts,
                                                            struct ompi_datatype_t *dtype,
                                                            struct ompi_op_t *op,
                                                            struct ompi_communicator_t *comm,
                                                            mca_coll_base_module_t *module)
{
    int i, rank, size, count, err = OMPI_SUCCESS;
    int tmp_size, remain = 0, tmp_rank, *disps = NULL;
    ptrdiff_t true_lb, true_extent, lb, extent, buf_size;
    char *recv_buf = NULL, *recv_buf_free = NULL;
    char *result_buf = NULL, *result_buf_free = NULL;
   
    /* Initialize */
    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
   
    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_basic_recursivehalving, rank %d", rank));

    /* Find displacements and the like */
    disps = (int*) malloc(sizeof(int) * size);
    if (NULL == disps) return OMPI_ERR_OUT_OF_RESOURCE;

    disps[0] = 0;
    for (i = 0; i < (size - 1); ++i) {
        disps[i + 1] = disps[i] + rcounts[i];
    }
    count = disps[size - 1] + rcounts[size - 1];

    /* short cut the trivial case */
    if (0 == count) {
        free(disps);
        return OMPI_SUCCESS;
    }

    /* get datatype information */
    ompi_datatype_get_extent(dtype, &lb, &extent);
    ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
    buf_size = true_extent + (ptrdiff_t)(count - 1) * extent;

    /* Handle MPI_IN_PLACE */
    if (MPI_IN_PLACE == sbuf) {
        sbuf = rbuf;
    }

    /* Allocate temporary receive buffer. */
    recv_buf_free = (char*) malloc(buf_size);
    if (NULL == recv_buf_free) {
        err = OMPI_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }
    recv_buf = recv_buf_free - lb;
   
    /* allocate temporary buffer for results */
    result_buf_free = (char*) malloc(buf_size);
    if (NULL == result_buf_free) {
        err = OMPI_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }
    result_buf = result_buf_free - lb;
   
    /* copy local buffer into the temporary results */
    err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype);
    if (OMPI_SUCCESS != err) goto cleanup;
   
    /* figure out power of two mapping: grow until larger than
       comm size, then go back one, to get the largest power of
       two less than comm size */
    tmp_size = opal_next_poweroftwo (size); 
    tmp_size >>= 1;
    remain = size - tmp_size;
   
    /* If comm size is not a power of two, have the first "remain"
       procs with an even rank send to rank + 1, leaving a power of
       two procs to do the rest of the algorithm */
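    /* E.g. for size = 6: tmp_size = 4, remain = 2, so even ranks 0 and 2 fold
       their data into ranks 1 and 3 and drop out, leaving ranks 1, 3, 4 and 5
       (tmp ranks 0..3) to run the power-of-two halving below. */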
    if (rank < 2 * remain) {
        if ((rank & 1) == 0) {
            err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1, 
                                    MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                    MCA_PML_BASE_SEND_STANDARD,
                                    comm));
            if (OMPI_SUCCESS != err) goto cleanup;
         
            /* we don't participate from here on out */
            tmp_rank = -1;
        } else {
            err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1,
                                    MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                    comm, MPI_STATUS_IGNORE));
         
            /* integrate their results into our temp results */
            ompi_op_reduce(op, recv_buf, result_buf, count, dtype);
         
            /* adjust rank to be the bottom "remain" ranks */
            tmp_rank = rank / 2;
        }
    } else {
        /* just need to adjust rank to show that the bottom "even
           remain" ranks dropped out */
        tmp_rank = rank - remain;
    }
   
    /* For ranks not kicked out by the above code, perform the
       recursive halving */
    if (tmp_rank >= 0) {
        int *tmp_disps = NULL, *tmp_rcounts = NULL;
        int mask, send_index, recv_index, last_index;
      
        /* recalculate disps and rcounts to account for the
           special "remainder" processes that are no longer doing
           anything */
        tmp_rcounts = (int*) malloc(tmp_size * sizeof(int));
        if (NULL == tmp_rcounts) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        tmp_disps = (int*) malloc(tmp_size * sizeof(int));
        if (NULL == tmp_disps) {
            free(tmp_rcounts);
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }

        for (i = 0 ; i < tmp_size ; ++i) {
            if (i < remain) {
                /* need to include old neighbor as well */
                tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2];
            } else {
                tmp_rcounts[i] = rcounts[i + remain];
            }
        }

        tmp_disps[0] = 0;
        for (i = 0; i < tmp_size - 1; ++i) {
            tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i];
        }

        /* do the recursive halving communication.  Don't use the
           dimension information on the communicator because I
           think the information is invalidated by our "shrinking"
           of the communicator */
        mask = tmp_size >> 1;
        send_index = recv_index = 0;
        last_index = tmp_size;
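        /* At each step the surviving block range [recv_index, last_index) is
           split in half: the lower tmp_rank of a pair keeps (and receives
           into) the lower half and sends the upper half to its peer, the
           higher tmp_rank does the opposite, and mask halves until every rank
           is left holding only its own block. */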
        while (mask > 0) {
            int tmp_peer, peer, send_count, recv_count;
            struct ompi_request_t *request;

            tmp_peer = tmp_rank ^ mask;
            peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain;

            /* figure out if we're sending, receiving, or both */
            send_count = recv_count = 0;
            if (tmp_rank < tmp_peer) {
                send_index = recv_index + mask;
                for (i = send_index ; i < last_index ; ++i) {
                    send_count += tmp_rcounts[i];
                }
                for (i = recv_index ; i < send_index ; ++i) {
                    recv_count += tmp_rcounts[i];
                }
            } else {
                recv_index = send_index + mask;
                for (i = send_index ; i < recv_index ; ++i) {
                    send_count += tmp_rcounts[i];
                }
                for (i = recv_index ; i < last_index ; ++i) {
                    recv_count += tmp_rcounts[i];
                }
            }

            /* actual data transfer.  Send from result_buf,
               receive into recv_buf */
            if (send_count > 0 && recv_count != 0) {
                err = MCA_PML_CALL(irecv(recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
                                         recv_count, dtype, peer,
                                         MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                         comm, &request));
                if (OMPI_SUCCESS != err) {
                    free(tmp_rcounts);
                    free(tmp_disps);
                    goto cleanup;
                }                                             
            }
            if (recv_count > 0 && send_count != 0) {
                err = MCA_PML_CALL(send(result_buf + (ptrdiff_t)tmp_disps[send_index] * extent,
                                        send_count, dtype, peer, 
                                        MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                        MCA_PML_BASE_SEND_STANDARD,
                                        comm));
                if (OMPI_SUCCESS != err) {
                    free(tmp_rcounts);
                    free(tmp_disps);
                    goto cleanup;
                }                                             
            }
            if (send_count > 0 && recv_count != 0) {
                err = ompi_request_wait(&request, MPI_STATUS_IGNORE);
                if (OMPI_SUCCESS != err) {
                    free(tmp_rcounts);
                    free(tmp_disps);
                    goto cleanup;
                }                                             
            }

            /* if we received something on this step, push it into
               the results buffer */
            if (recv_count > 0) {
                ompi_op_reduce(op, 
                               recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, 
                               result_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
                               recv_count, dtype);
            }

            /* update for next iteration */
            send_index = recv_index;
            last_index = recv_index + mask;
            mask >>= 1;
        }

        /* copy local results from results buffer into real receive buffer */
        if (0 != rcounts[rank]) {
            err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent,
                                       rcounts[rank], dtype, 
                                       rbuf, rcounts[rank], dtype);
            if (OMPI_SUCCESS != err) {
                free(tmp_rcounts);
                free(tmp_disps);
                goto cleanup;
            }                                             
        }

        free(tmp_rcounts);
        free(tmp_disps);
    }
/*
 *	allgather_inter
 *
 *	Function:	- allgather using other MPI collections
 *	Accepts:	- same as MPI_Allgather()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_allgather_inter(void *sbuf, int scount,
                               struct ompi_datatype_t *sdtype,
                               void *rbuf, int rcount,
                               struct ompi_datatype_t *rdtype,
                               struct ompi_communicator_t *comm,
                               mca_coll_base_module_t *module)
{
    int rank, root = 0, size, rsize, err, i;
    char *tmpbuf = NULL, *ptmp;
    ptrdiff_t rlb, slb, rextent, sextent, incr;
    ompi_request_t *req;
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
    ompi_request_t **reqs = basic_module->mccb_reqs;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    rsize = ompi_comm_remote_size(comm);

    /* Algorithm:
     * - a gather to the root in the remote group (simultaneously executed,
     *   that's why we cannot use coll_gather).
     * - exchange the temp results between the two roots
     * - inter-bcast (again simultaneous).
     */
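    /* Both groups of the intercommunicator execute these steps at the same
     * time with the sender/receiver roles mirrored, so the rooted collectives
     * (coll_gather/coll_bcast) cannot be reused directly: the two roots would
     * block on each other. The explicit isend/irecv pairs below break that
     * symmetry. */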

    /* Step one: gather operations: */
    if (rank != root) {
        /* send your data to root */
        err = MCA_PML_CALL(send(sbuf, scount, sdtype, root,
                                MCA_COLL_BASE_TAG_ALLGATHER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (OMPI_SUCCESS != err) {
            return err;
        }
    } else {
        /* receive a msg. from all other procs. */
        err = ompi_datatype_get_extent(rdtype, &rlb, &rextent);
        if (OMPI_SUCCESS != err) {
            return err;
        }
        err = ompi_datatype_get_extent(sdtype, &slb, &sextent);
        if (OMPI_SUCCESS != err) {
            return err;
        }

        /* Do a send-recv between the two root procs. to avoid deadlock */
        err = MCA_PML_CALL(isend(sbuf, scount, sdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER,
                                 MCA_PML_BASE_SEND_STANDARD,
                                 comm, &reqs[rsize]));
        if (OMPI_SUCCESS != err) {
            return err;
        }

        err = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                 &reqs[0]));
        if (OMPI_SUCCESS != err) {
            return err;
        }

        incr = rextent * rcount;
        ptmp = (char *) rbuf + incr;
        for (i = 1; i < rsize; ++i, ptmp += incr) {
            err = MCA_PML_CALL(irecv(ptmp, rcount, rdtype, i,
                                     MCA_COLL_BASE_TAG_ALLGATHER,
                                     comm, &reqs[i]));
            if (MPI_SUCCESS != err) {
                return err;
            }
        }

        err = ompi_request_wait_all(rsize + 1, reqs, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) {
            return err;
        }

        /* Step 2: exchange the results between the root processes */
        tmpbuf = (char *) malloc(scount * size * sextent);
        if (NULL == tmpbuf) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        err = MCA_PML_CALL(isend(rbuf, rsize * rcount, rdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER,
                                 MCA_PML_BASE_SEND_STANDARD, comm, &req));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = MCA_PML_CALL(recv(tmpbuf, size * scount, sdtype, 0,
                                MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = ompi_request_wait( &req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
    }


    /* Step 3: bcast the data to the remote group. This
     * happens in both groups simultaneously, thus we
     * cannot use coll_bcast (this would deadlock).
     */
    if (rank != root) {
        /* post the recv */
        err = MCA_PML_CALL(recv(rbuf, rsize * rcount, rdtype, 0,
                                MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

    } else {
        /* Send the data to every other process in the remote group
         * except rank zero, which already has it. */
        for (i = 1; i < rsize; i++) {
            err = MCA_PML_CALL(isend(tmpbuf, size * scount, sdtype, i,
                                     MCA_COLL_BASE_TAG_ALLGATHER,
                                     MCA_PML_BASE_SEND_STANDARD,
                                     comm, &reqs[i - 1]));
            if (OMPI_SUCCESS != err) {
                goto exit;
            }

        }

        err = ompi_request_wait_all(rsize - 1, reqs, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
    }

  exit:
    if (NULL != tmpbuf) {
        free(tmpbuf);
    }

    return err;
}
int
mca_fcoll_dynamic_file_write_all (mca_io_ompio_file_t *fh,
                                  void *buf,
                                  int count,
                                  struct ompi_datatype_t *datatype,
                                  ompi_status_public_t *status)
{
    MPI_Aint total_bytes_written = 0;  /* total bytes that have been written*/
    MPI_Aint total_bytes = 0;          /* total bytes to be written */
    MPI_Aint bytes_to_write_in_cycle = 0; /* left to be written in a cycle*/
    MPI_Aint bytes_per_cycle = 0;      /* total written in each cycle by each process*/
    int index = 0;
    int cycles = 0;
    int i=0, j=0, l=0;
    int n=0; /* current position in total_bytes_per_process array */
    MPI_Aint bytes_remaining = 0; /* how many bytes have been written from the current
                                     value of total_bytes_per_process */
    int bytes_sent = 0, ret =0;
    int blocks=0, entries_per_aggregator=0;

    /* iovec structure and count of the buffer passed in */
    uint32_t iov_count = 0;
    struct iovec *decoded_iov = NULL;
    int iov_index = 0;
    char *send_buf = NULL;
    size_t current_position = 0;
    struct iovec *local_iov_array=NULL, *global_iov_array=NULL;
    local_io_array *file_offsets_for_agg=NULL;
    /* global iovec at the writers that contain the iovecs created from
       file_set_view */
    uint32_t total_fview_count = 0;
    int local_count = 0, temp_pindex;
    int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL;
    int current_index = 0, temp_index=0;

    char *global_buf = NULL;
    MPI_Aint global_count = 0;
    
    
    /* array that contains the sorted indices of the global_iov */
    int *sorted = NULL, *sorted_file_offsets=NULL;
    int *displs = NULL;
    int dynamic_num_io_procs;
    size_t max_data = 0, datatype_size = 0; 
    int **blocklen_per_process=NULL;
    MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL;
    ompi_datatype_t **recvtype = NULL;
    MPI_Aint *total_bytes_per_process = NULL;
    MPI_Request *send_req=NULL, *recv_req=NULL;
    int recv_req_count=0;
    

#if TIME_BREAKDOWN
    double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0;
    double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0;
    double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0;
    print_entry nentry;
#endif


//    if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) {
//        fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY;
//    }

    /***************************************************************************
     ** In case the data is not contiguous in memory, decode it into an iovec **
     ***************************************************************************/
    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
      ret =   ompi_io_ompio_decode_datatype (fh,
					     datatype,
					     count,
					     buf,
					     &max_data,
					     &decoded_iov,
					     &iov_count);
      if (OMPI_SUCCESS != ret ){
	goto exit;
      }
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
	status->_ucount = max_data;
    }
	
    mca_io_ompio_get_num_aggregators ( &dynamic_num_io_procs );
    ret = ompi_io_ompio_set_aggregator_props (fh, 
					      dynamic_num_io_procs,
					      max_data);
	
    if (OMPI_SUCCESS != ret){
	goto exit;
    }


    total_bytes_per_process = (MPI_Aint*)malloc
        (fh->f_procs_per_group*sizeof(MPI_Aint));
    if (NULL == total_bytes_per_process) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }

    ret = ompi_io_ompio_allgather_array (&max_data,
					 1,
					 MPI_LONG,
					 total_bytes_per_process,
					 1,
					 MPI_LONG,
					 fh->f_aggregator_index,
					 fh->f_procs_in_group,
					 fh->f_procs_per_group,
					 fh->f_comm);
    
    if( OMPI_SUCCESS != ret){
      goto exit;
    }
    for (i=0 ; i<fh->f_procs_per_group ; i++) {
        total_bytes += total_bytes_per_process[i];
    }

    if (NULL != total_bytes_per_process) {
        free (total_bytes_per_process);
        total_bytes_per_process = NULL;
    }
    
    /*********************************************************************
     *** Generate the File offsets/lengths corresponding to this write ***
     ********************************************************************/
    ret = ompi_io_ompio_generate_current_file_view(fh,
						   max_data,
						   &local_iov_array,
						   &local_count);
    if (ret != OMPI_SUCCESS){
      goto exit;
    }

#if DEBUG_ON    
    for (i=0 ; i<local_count ; i++) {
   
      printf("%d: OFFSET: %d   LENGTH: %ld\n",
	     fh->f_rank,
	     local_iov_array[i].iov_base,
	     local_iov_array[i].iov_len);

    }
#endif   

    /*************************************************************
     *** ALLGather the File View information at all processes ***
     *************************************************************/

    fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == fview_count) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }

    ret = ompi_io_ompio_allgather_array (&local_count,
					 1,
					 MPI_INT,
					 fview_count,
					 1,
					 MPI_INT,
					 fh->f_aggregator_index,
					 fh->f_procs_in_group,
					 fh->f_procs_per_group,
					 fh->f_comm);
    
    if( OMPI_SUCCESS != ret){
      goto exit;
    }

    displs = (int*) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == displs) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }

    displs[0] = 0;
    total_fview_count = fview_count[0];
    for (i=1 ; i<fh->f_procs_per_group ; i++) {
        total_fview_count += fview_count[i];
        displs[i] = displs[i-1] + fview_count[i-1];
    }
    
#if DEBUG_ON
    printf("total_fview_count : %d\n", total_fview_count);
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        for (i=0 ; i<fh->f_procs_per_group ; i++) {
            printf ("%d: PROCESS: %d  ELEMENTS: %d  DISPLS: %d\n",
                    fh->f_rank,
                    i,
                    fview_count[i],
                    displs[i]);
        }
    }
#endif
    
    /* allocate the global iovec  */

    if (0 != total_fview_count) {
      global_iov_array = (struct iovec*) malloc (total_fview_count *
						    sizeof(struct iovec));
      if (NULL == global_iov_array){
	opal_output(1, "OUT OF MEMORY\n");
	ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
      }

    }
    
    ret = ompi_io_ompio_allgatherv_array (local_iov_array,
					  local_count,
					  fh->f_iov_type,
					  global_iov_array,
					  fview_count,
					  displs,
					  fh->f_iov_type,
					  fh->f_aggregator_index,
					  fh->f_procs_in_group,
					  fh->f_procs_per_group,
					  fh->f_comm);
    if (OMPI_SUCCESS != ret){
      goto exit;
    }

    /* sort it */
    if (0 != total_fview_count) {
        sorted = (int *)malloc (total_fview_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }
	ompi_io_ompio_sort_iovec (global_iov_array, total_fview_count, sorted);
    }

    if (NULL != local_iov_array){
      free(local_iov_array);
      local_iov_array = NULL;
    }

    if (NULL != displs){
      free(displs);
      displs=NULL;
    }

    
#if DEBUG_ON
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
      uint32_t tv=0;
      for (tv=0 ; tv<total_fview_count ; tv++) {
	printf("%d: OFFSET: %lld   LENGTH: %ld\n",
	       fh->f_rank,
	       global_iov_array[sorted[tv]].iov_base,
	       global_iov_array[sorted[tv]].iov_len);
      }
    }
#endif
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	blocklen_per_process = (int **) calloc (fh->f_procs_per_group, sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	displs_per_process = (MPI_Aint **) calloc (fh->f_procs_per_group, sizeof (MPI_Aint*));
	if (NULL == displs_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
        }

	for(i=0;i<fh->f_procs_per_group;i++){
	  blocklen_per_process[i] = NULL;
	  displs_per_process[i] = NULL;
	}
    }
    

    mca_io_ompio_get_bytes_per_agg ( (int *)&bytes_per_cycle );
    cycles = ceil((double)total_bytes/bytes_per_cycle);


    n = 0; 
    bytes_remaining = 0;
    current_index = 0;




#if TIME_BREAKDOWN
      start_exch = MPI_Wtime();
#endif
    
    for (index = 0; index < cycles; index++) {

      /* Getting ready for the next cycle:
	 initializing and freeing buffers */
      if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	
	if (NULL == recvtype){
	  recvtype = (ompi_datatype_t **) 
	    malloc (fh->f_procs_per_group  * sizeof(ompi_datatype_t *));
	    if (NULL == recvtype) {
	      opal_output (1, "OUT OF MEMORY\n");
	      ret = OMPI_ERR_OUT_OF_RESOURCE;
	      goto exit;
	    }
	}
	
	for(l=0;l<fh->f_procs_per_group;l++){
	  disp_index[l] =  1;
	 
	  if (NULL != blocklen_per_process[l]){
	    free(blocklen_per_process[l]);
	    blocklen_per_process[l] = NULL;
	  }
	  if (NULL != displs_per_process[l]){
	    free(displs_per_process[l]);
	    displs_per_process[l] = NULL;
	  }
	  blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
	  if (NULL == blocklen_per_process[l]) {
	    opal_output (1, "OUT OF MEMORY for blocklen\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	  }
	  displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
	  if (NULL == displs_per_process[l]){      
	      opal_output (1, "OUT OF MEMORY for displs\n");
	      ret = OMPI_ERR_OUT_OF_RESOURCE;
	      goto exit;
	  }
	}
	
	if (NULL != sorted_file_offsets){
	  free(sorted_file_offsets);
	  sorted_file_offsets = NULL;
	}
	
	if(NULL != file_offsets_for_agg){
	  free(file_offsets_for_agg);
	  file_offsets_for_agg = NULL;
	}
	
	if (NULL != memory_displacements){
	  free(memory_displacements);
	  memory_displacements = NULL;
	}
	
      }
      
      if (cycles-1 == index) {
	bytes_to_write_in_cycle = total_bytes - bytes_per_cycle*index;
      }
      else {
	bytes_to_write_in_cycle = bytes_per_cycle;
      }
      
#if DEBUG_ON
      if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	printf ("****%d: CYCLE %d   Bytes %lld**********\n",
		fh->f_rank,
		index, 
		bytes_to_write_in_cycle);
      }
#endif
      /**********************************************************
       **Gather the Data from all the processes at the writers **
       *********************************************************/
      
      /* Calculate how much data will be contributed in this cycle 
	 by each process*/
      bytes_sent = 0;
#if DEBUG_ON
      printf("bytes_to_write_in_cycle: %ld, cycle : %d\n", bytes_to_write_in_cycle,
	       index);
#endif
      /* The blocklen and displs calculation is only done at the aggregators! */


      while (bytes_to_write_in_cycle) {
	
	blocks = fview_count[0];
	for (j=0 ; j<fh->f_procs_per_group ; j++) {
	  if (sorted[current_index] < blocks) {
	    n = j;
	    break;
	  }
	  else {
	    blocks += fview_count[j+1];
	  }
	}
	
	if (bytes_remaining) {
	  if (bytes_remaining <= bytes_to_write_in_cycle) {

	    if (fh->f_procs_in_group[fh->f_aggregator_index] == 
		fh->f_rank) {
	      blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining;
	      displs_per_process[n][disp_index[n] - 1] = 
		(OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base + 
		(global_iov_array[sorted[current_index]].iov_len
		 - bytes_remaining);
	      
	    }
	    if (fh->f_procs_in_group[n] == fh->f_rank) {
	      bytes_sent += bytes_remaining;
	    }
	    current_index ++;
	    bytes_to_write_in_cycle -= bytes_remaining;
	    bytes_remaining = 0;
	    if (fh->f_procs_in_group[fh->f_aggregator_index] == 
		fh->f_rank) {		
	    /* In this case the length is fully consumed, so allocate room for
	       the next displacement and block length */
     
	      blocklen_per_process[n] = (int *) realloc
		((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
	      displs_per_process[n] = (MPI_Aint *) realloc
		((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
	      blocklen_per_process[n][disp_index[n]] = 0;
	      displs_per_process[n][disp_index[n]] = 0;
	      disp_index[n] += 1;
	    }
	    continue;
	  }
	  else {
	    if (fh->f_procs_in_group[fh->f_aggregator_index] == 
		fh->f_rank) {
	      blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle;
	      displs_per_process[n][disp_index[n] - 1] = 
		(OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
                (global_iov_array[sorted[current_index]].iov_len
                 - bytes_remaining);
	    }
	    
	    if (fh->f_procs_in_group[n] == fh->f_rank) {
	      bytes_sent += bytes_to_write_in_cycle;
	    }
	    bytes_remaining -= bytes_to_write_in_cycle;
	    bytes_to_write_in_cycle = 0;
	    break;
	  }
	}
	else {
	  if (bytes_to_write_in_cycle < 
	      (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) {
	    if (fh->f_procs_in_group[fh->f_aggregator_index] == 
		fh->f_rank) {

	      blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle;
	      displs_per_process[n][disp_index[n] - 1] = 
		(OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ;


	    }
	    if (fh->f_procs_in_group[n] == fh->f_rank) {
	      bytes_sent += bytes_to_write_in_cycle;
	      
	    }
	    bytes_remaining = global_iov_array[sorted[current_index]].iov_len - 
	      bytes_to_write_in_cycle;
	    bytes_to_write_in_cycle = 0;
	    break;
	  }
	  else {
	    if (fh->f_procs_in_group[fh->f_aggregator_index] == 
		fh->f_rank) {
	      blocklen_per_process[n][disp_index[n] - 1] =
		global_iov_array[sorted[current_index]].iov_len;
	      displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)
		global_iov_array[sorted[current_index]].iov_base;
	      
	      blocklen_per_process[n] = 
		(int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
	      displs_per_process[n] = (MPI_Aint *)realloc
		((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
	      blocklen_per_process[n][disp_index[n]] = 0;
	      displs_per_process[n][disp_index[n]] = 0;
	      disp_index[n] += 1;
	      /* realloc for the next block length, assign this displacement,
		 and move on to the next entry since the total length of this
		 entry has been consumed */
	    }
	    if (fh->f_procs_in_group[n] == fh->f_rank) {
	      bytes_sent += global_iov_array[sorted[current_index]].iov_len;
	    }
	    bytes_to_write_in_cycle -= 
	      global_iov_array[sorted[current_index]].iov_len;
	    current_index ++;
	    continue;
	  }
	}
      }
      

      /* Calculate the displacements describing where to place the data and
	 allocate the receive buffer (global_buf) */
      if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	entries_per_aggregator=0;
	for (i=0;i<fh->f_procs_per_group; i++){
	  for (j=0;j<disp_index[i];j++){
	    if (blocklen_per_process[i][j] > 0) 
	      entries_per_aggregator++ ;
	  }
	}

#if DEBUG_ON
	printf("%d: cycle: %d, bytes_sent: %d\n ",fh->f_rank,index,
		 bytes_sent);
	printf("%d : Entries per aggregator : %d\n",fh->f_rank,entries_per_aggregator);
#endif
	
	if (entries_per_aggregator > 0){
	  file_offsets_for_agg = (local_io_array *)
	    malloc(entries_per_aggregator*sizeof(local_io_array));
	  if (NULL == file_offsets_for_agg) {
	    opal_output (1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	  }
	  
	  sorted_file_offsets = (int *)
	    malloc (entries_per_aggregator*sizeof(int));
	  if (NULL == sorted_file_offsets){
	    opal_output (1, "OUT OF MEMORY\n");
	    ret =  OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	  }
	  
	  /*Moving file offsets to an IO array!*/
	  temp_index = 0;
	 	  
	  for (i=0;i<fh->f_procs_per_group; i++){
	    for(j=0;j<disp_index[i];j++){
	      if (blocklen_per_process[i][j] > 0){
		file_offsets_for_agg[temp_index].length =
		  blocklen_per_process[i][j];
		file_offsets_for_agg[temp_index].process_id = i;
		file_offsets_for_agg[temp_index].offset = 
		  displs_per_process[i][j];
		temp_index++;
 
#if DEBUG_ON
		printf("************Cycle: %d,  Aggregator: %d ***************\n", 
		       index+1,fh->f_rank);
		
		printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
		       fh->f_procs_in_group[i],j,
		     blocklen_per_process[i][j],j,
		     displs_per_process[i][j],
		     fh->f_rank);
#endif
	      }
	    }
	  }
	}
	else{
	  continue;
	}
	/* Sort the displacements for each aggregator*/
	local_heap_sort (file_offsets_for_agg,
			 entries_per_aggregator,
			 sorted_file_offsets); 
	
	/* Create contiguous memory displacements based on the block lengths
	   (reusing the same displs array) and map them to this aggregator's
	   actual file displacements (stored in the io-array created above) */
	memory_displacements = (MPI_Aint *) malloc 
	    (entries_per_aggregator * sizeof(MPI_Aint));
	if (NULL == memory_displacements) {
	  opal_output (1, "OUT OF MEMORY\n");
	  ret = OMPI_ERR_OUT_OF_RESOURCE;
	  goto exit;
	}
	
	memory_displacements[sorted_file_offsets[0]] = 0;
	for (i=1; i<entries_per_aggregator; i++){
	  memory_displacements[sorted_file_offsets[i]] = 
	    memory_displacements[sorted_file_offsets[i-1]] + 
	    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
	}
	
	temp_disp_index = (int *) calloc (fh->f_procs_per_group, sizeof (int));
	if (NULL == temp_disp_index) {
	  opal_output (1, "OUT OF MEMORY\n");
	  ret = OMPI_ERR_OUT_OF_RESOURCE;
	  goto exit;
	}
	
	/*Now update the displacements array  with memory offsets*/
	global_count = 0;
	for (i=0;i<entries_per_aggregator;i++){
	  temp_pindex =
	    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
	  displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] =
	    memory_displacements[sorted_file_offsets[i]];
	  if (temp_disp_index[temp_pindex] < disp_index[temp_pindex])
	      temp_disp_index[temp_pindex] += 1;
	  else{
	    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
		   temp_pindex, temp_disp_index[temp_pindex],
		   temp_pindex, disp_index[temp_pindex]);
	  }
	  global_count += 
	    file_offsets_for_agg[sorted_file_offsets[i]].length;
	}
	  
	if (NULL != temp_disp_index){
	  free(temp_disp_index);
	    temp_disp_index = NULL;
	}

#if DEBUG_ON

	printf("************Cycle: %d,  Aggregator: %d ***************\n", 
	       index+1,fh->f_rank);
	for (i=0;i<fh->f_procs_per_group; i++){
	  for(j=0;j<disp_index[i];j++){
	    if (blocklen_per_process[i][j] > 0){
	      printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
		     fh->f_procs_in_group[i],j,
		     blocklen_per_process[i][j],j,
		     displs_per_process[i][j],
		     fh->f_rank);
	      
	    }
	  }
	}
	printf("************Cycle: %d,  Aggregator: %d ***************\n", 
	       index+1,fh->f_rank);
	for (i=0; i<entries_per_aggregator;i++){
	  printf("%d: OFFSET: %lld   LENGTH: %ld, Mem-offset: %ld\n",
		   file_offsets_for_agg[sorted_file_offsets[i]].process_id,
		 file_offsets_for_agg[sorted_file_offsets[i]].offset,
		 file_offsets_for_agg[sorted_file_offsets[i]].length,
		 memory_displacements[sorted_file_offsets[i]]);
	}
	printf("%d : global_count : %ld, bytes_sent : %d\n",
	       fh->f_rank,global_count, bytes_sent);
#endif
#if TIME_BREAKDOWN
	  start_comm_time = MPI_Wtime();
#endif

	
	global_buf  = (char *) malloc (global_count);
	if (NULL == global_buf){
	    opal_output(1, "OUT OF MEMORY");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	  goto exit;
	}

	recv_req_count = 0;
	for (i=0;i<fh->f_procs_per_group; i++){
	    
	  ompi_datatype_create_hindexed(disp_index[i],
					blocklen_per_process[i],
					displs_per_process[i],
					MPI_BYTE,
					&recvtype[i]);
	  ompi_datatype_commit(&recvtype[i]); 
	  
	  opal_datatype_type_size(&recvtype[i]->super, 
				  &datatype_size);
	  
	  if (datatype_size){
	    
	    recv_req = (MPI_Request *)realloc 
			((void *)recv_req, (recv_req_count + 1)*sizeof(MPI_Request));
	    
	    ret = MCA_PML_CALL(irecv(global_buf,
				     1,
				     recvtype[i],
				     fh->f_procs_in_group[i],
				     123,
				     fh->f_comm,
				     &recv_req[recv_req_count]));
	    recv_req_count++;
	    
	    if (OMPI_SUCCESS != ret){
	      goto exit;
	    }
	  }
	}
	
      }
      
      

      if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
	send_buf = &((char*)buf)[total_bytes_written];
      }
      else if (bytes_sent) {
	/* allocate a send buffer and copy the data that needs
	   to be sent into it, in case the data is non-contiguous
	   in memory */
	OPAL_PTRDIFF_TYPE mem_address;
	size_t remaining = 0;
	size_t temp_position = 0;
	
	send_buf = malloc (bytes_sent);
	if (NULL == send_buf) {
	  opal_output (1, "OUT OF MEMORY\n");
	  ret = OMPI_ERR_OUT_OF_RESOURCE;
	  goto exit;
	}
	
	remaining = bytes_sent;
	  
	while (remaining) {
	  mem_address = (OPAL_PTRDIFF_TYPE)
	    (decoded_iov[iov_index].iov_base) + current_position;
	  
	  if (remaining >=
	      (decoded_iov[iov_index].iov_len - current_position)) {
	    memcpy (send_buf+temp_position,
		    (IOVBASE_TYPE *)mem_address,
		    decoded_iov[iov_index].iov_len - current_position);
	    remaining = remaining -
	      (decoded_iov[iov_index].iov_len - current_position);
	    temp_position = temp_position +
	      (decoded_iov[iov_index].iov_len - current_position);
	    iov_index = iov_index + 1;
	    current_position = 0;
	  }
	  else {
	    memcpy (send_buf+temp_position,
		    (IOVBASE_TYPE *) mem_address,
		    remaining);
	    current_position = current_position + remaining;
	    remaining = 0;
	  }
	}
	}
	total_bytes_written += bytes_sent;

	/* Gather the send buffers from each process into the appropriate
	   locations at the aggregators */
	
	send_req = (MPI_Request *) malloc (sizeof(MPI_Request));
	if (NULL == send_req){
	  opal_output (1, "OUT OF MEMORY\n");
	  ret = OMPI_ERR_OUT_OF_RESOURCE;
	  goto exit;
	}

	
	if (bytes_sent){

	  ret = MCA_PML_CALL(isend(send_buf,
				   bytes_sent,
				   MPI_BYTE,
				   fh->f_procs_in_group[fh->f_aggregator_index],
				   123,
				   MCA_PML_BASE_SEND_STANDARD, 
				   fh->f_comm,
				   send_req));	

	    
	    if ( OMPI_SUCCESS != ret ){
		goto exit;
	    }

	    ret = ompi_request_wait(send_req, MPI_STATUS_IGNORE);
	    if (OMPI_SUCCESS != ret){
		goto exit;
	    }
	} 
	if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {       
	  ret = ompi_request_wait_all (recv_req_count,
				       recv_req,
				       MPI_STATUSES_IGNORE);
	  
	  if (OMPI_SUCCESS != ret){
	    goto exit;
	  }
	}
#if DEBUG_ON
	if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){
	  printf("************Cycle: %d,  Aggregator: %d ***************\n", 
		 index+1,fh->f_rank);
	  for (i=0 ; i<global_count/4 ; i++)
	    printf (" RECV %d \n",((int *)global_buf)[i]);
	}
#endif




    
    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
	if (NULL != send_buf) {
	    free (send_buf);
	    send_buf = NULL;
	}
    }

#if TIME_BREAKDOWN
      end_comm_time = MPI_Wtime();
      comm_time += (end_comm_time - start_comm_time);
#endif


    /**********************************************************
     **************** DONE GATHERING OF DATA ******************
     *********************************************************/
    
        /**********************************************************
         ******* Create the io array, and pass it to fbtl *********
         *********************************************************/

	if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {

#if TIME_BREAKDOWN
	    start_write_time = MPI_Wtime();
#endif
	  
	  fh->f_io_array = (mca_io_ompio_io_array_t *) malloc 
	    (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
	  if (NULL == fh->f_io_array) {
	    opal_output(1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	  }
	  
	  fh->f_num_of_io_entries = 0;
	  /*First entry for every aggregator*/
	  fh->f_io_array[fh->f_num_of_io_entries].offset = 
	    (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
	  fh->f_io_array[fh->f_num_of_io_entries].length = 
	    file_offsets_for_agg[sorted_file_offsets[0]].length;
	  fh->f_io_array[fh->f_num_of_io_entries].memory_address = 
	    global_buf+memory_displacements[sorted_file_offsets[0]];
	  fh->f_num_of_io_entries++;

	  for (i=1;i<entries_per_aggregator;i++){
	    /* If the entries are contiguous merge them,
	       else make a new entry */
	    if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + 
		file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
		file_offsets_for_agg[sorted_file_offsets[i]].offset){
	      fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
		file_offsets_for_agg[sorted_file_offsets[i]].length;	
	    }
	    else {
	      fh->f_io_array[fh->f_num_of_io_entries].offset = 
		(IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
	      fh->f_io_array[fh->f_num_of_io_entries].length = 
		file_offsets_for_agg[sorted_file_offsets[i]].length;
	      fh->f_io_array[fh->f_num_of_io_entries].memory_address = 
		global_buf+memory_displacements[sorted_file_offsets[i]];
	      fh->f_num_of_io_entries++;
	    }
	    
	  }
	 
#if DEBUG_ON
	  printf("*************************** %d\n", fh->f_num_of_io_entries);
	  for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
	    printf(" ADDRESS: %p  OFFSET: %ld   LENGTH: %ld\n",
		   fh->f_io_array[i].memory_address,
		   (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset,
		   fh->f_io_array[i].length);
	  }
	  
#endif


	  if (fh->f_num_of_io_entries) {
	    if ( 0 >  fh->f_fbtl->fbtl_pwritev (fh)) {
	      opal_output (1, "WRITE FAILED\n");
	      ret = OMPI_ERROR;
	      goto exit;
	    }
	  }
#if TIME_BREAKDOWN
	  end_write_time = MPI_Wtime();
	  write_time += end_write_time - start_write_time;
#endif	  


	}
	
	
	if (NULL != send_req){
	  free(send_req);
	  send_req = NULL;
	}
	
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	  fh->f_num_of_io_entries = 0;
	  if (NULL != fh->f_io_array) {
	    free (fh->f_io_array);
	    fh->f_io_array = NULL;
	  }
	  for (i =0; i< fh->f_procs_per_group; i++) 
	    ompi_datatype_destroy(recvtype+i);
	  if (NULL != recvtype){
	      free(recvtype);
	      recvtype=NULL;
	  }
	  if (NULL != recv_req){
	    free(recv_req);
	    recv_req = NULL;
	    
	  }
	  if (NULL != global_buf) {
	    free (global_buf);
	    global_buf = NULL;
	  }
	  
	}
	
    }

#if TIME_BREAKDOWN
      end_exch = MPI_Wtime();
      exch_write += end_exch - start_exch;
      nentry.time[0] = write_time;
      nentry.time[1] = comm_time;
      nentry.time[2] = exch_write;
      if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank)
	nentry.aggregator = 1;
      else
	nentry.aggregator = 0;
      nentry.nprocs_for_coll = dynamic_num_io_procs;
      if (!ompi_io_ompio_full_print_queue(WRITE_PRINT_QUEUE)){
	ompi_io_ompio_register_print_entry(WRITE_PRINT_QUEUE,
					   nentry);
      } 
#endif


 exit :
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
      if (NULL != fh->f_io_array) {
	free (fh->f_io_array);
	fh->f_io_array = NULL;
      }
      if (NULL != disp_index){
	free(disp_index);
	disp_index = NULL;
      }
      if (NULL != recvtype){
	free(recvtype);
	recvtype=NULL;
      }
      if (NULL != recv_req){
	free(recv_req);
	recv_req = NULL;
      }
      if (NULL != global_buf) {
	free (global_buf);
	global_buf = NULL;
      }
      if (NULL != blocklen_per_process){
	for(l=0;l<fh->f_procs_per_group;l++){
	  if (NULL != blocklen_per_process[l]){
	    free(blocklen_per_process[l]);
	    blocklen_per_process[l] = NULL;
	  }
	}
	free(blocklen_per_process);
	blocklen_per_process = NULL;
      }
      if (NULL != displs_per_process){
	for(l=0;l<fh->f_procs_per_group;l++){
	  if (NULL != displs_per_process[l]){
	    free(displs_per_process[l]);
	    displs_per_process[l] = NULL;
	  }
	}
	free(displs_per_process);
	displs_per_process = NULL;
      }

    }
    
    if (NULL != sorted) {
        free (sorted);
        sorted = NULL;
    }
    if (NULL != global_iov_array) {
        free (global_iov_array);
	global_iov_array = NULL;
    }
    if (NULL != fview_count) {
        free (fview_count);
        fview_count = NULL;
    }
    if (NULL != decoded_iov) {
        free (decoded_iov);
        decoded_iov = NULL;
    }
    
    if (NULL != send_req){
      free(send_req);
      send_req = NULL;
    }

    return ret;
}
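The write path above ends by collapsing the sorted (offset, length) pairs into as few I/O entries as possible before handing them to the fbtl: whenever one entry ends exactly where the next one starts, the two are merged. Below is a minimal standalone sketch of that merge rule; it is not part of the OMPI source, and io_entry_t and merge_contiguous are illustrative names only.

#include <stdio.h>
#include <stddef.h>

typedef struct {
    size_t offset;    /* file offset of the entry */
    size_t length;    /* number of bytes to write */
} io_entry_t;

/* Collapse the sorted entries in[0..n-1] into out[]; returns the merged count. */
static int merge_contiguous(const io_entry_t *in, int n, io_entry_t *out)
{
    int m = 0;
    for (int i = 0; i < n; i++) {
        if (m > 0 && out[m - 1].offset + out[m - 1].length == in[i].offset) {
            out[m - 1].length += in[i].length;   /* extend the previous entry */
        } else {
            out[m++] = in[i];                    /* start a new entry */
        }
    }
    return m;
}

int main(void)
{
    io_entry_t in[]  = { {0, 4}, {4, 8}, {16, 4}, {20, 4} };
    io_entry_t out[4];
    int m = merge_contiguous(in, 4, out);
    for (int i = 0; i < m; i++)
        printf("entry %d: offset %zu length %zu\n", i, out[i].offset, out[i].length);
    return 0;   /* prints two merged entries: (0,12) and (16,8) */
}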
Example #10
int
ompi_coll_tuned_bcast_intra_generic( void* buffer,
                                     int original_count,
                                     struct ompi_datatype_t* datatype,
                                     int root,
                                     struct ompi_communicator_t* comm,
                                     mca_coll_base_module_t *module,
                                     uint32_t count_by_segment,
                                     ompi_coll_tree_t* tree )
{
    int err = 0, line, i, rank, size, segindex, req_index;
    int num_segments; /* Number of segments */
    int sendcount;    /* number of elements sent in this segment */
    size_t realsegsize, type_size;
    char *tmpbuf;
    ptrdiff_t extent, lb;
    ompi_request_t *recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
    ompi_request_t **send_reqs = NULL;
#endif

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);
    assert( size > 1 );

    ompi_datatype_get_extent (datatype, &lb, &extent);
    ompi_datatype_type_size( datatype, &type_size );
    num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    realsegsize = (ptrdiff_t)count_by_segment * extent;

    /* Set the buffer pointers */
    tmpbuf = (char *) buffer;

#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
    if( tree->tree_nextsize != 0 ) {
        send_reqs = (ompi_request_t**)malloc( (ptrdiff_t)tree->tree_nextsize *
                                              sizeof(ompi_request_t*) );
    }
#endif

    /* Root code */
    if( rank == root ) {
        /*
           For each segment:
           - send segment to all children.
           The last segment may have fewer elements than the other segments.
        */
        sendcount = count_by_segment;
        for( segindex = 0; segindex < num_segments; segindex++ ) {
            if( segindex == (num_segments - 1) ) {
                sendcount = original_count - segindex * count_by_segment;
            }
            for( i = 0; i < tree->tree_nextsize; i++ ) {
#if defined(COLL_TUNED_BCAST_USE_BLOCKING)
                err = MCA_PML_CALL(send(tmpbuf, sendcount, datatype,
                                        tree->tree_next[i],
                                        MCA_COLL_BASE_TAG_BCAST,
                                        MCA_PML_BASE_SEND_STANDARD, comm));
#else
                err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype,
                                         tree->tree_next[i],
                                         MCA_COLL_BASE_TAG_BCAST,
                                         MCA_PML_BASE_SEND_STANDARD, comm,
                                         &send_reqs[i]));
#endif  /* COLL_TUNED_BCAST_USE_BLOCKING */
                if (err != MPI_SUCCESS) {
                    line = __LINE__;
                    goto error_hndl;
                }
            }

#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
            /* complete the sends before starting the next sends */
            err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
                                         MPI_STATUSES_IGNORE );
            if (err != MPI_SUCCESS) {
                line = __LINE__;
                goto error_hndl;
            }
#endif /* not COLL_TUNED_BCAST_USE_BLOCKING */

            /* update tmp buffer */
            tmpbuf += realsegsize;

        }
    }

    /* Intermediate nodes code */
    else if( tree->tree_nextsize > 0 ) {
        /*
           Create the pipeline.
           1) Post the first receive
           2) For segments 1 .. num_segments
           - post new receive
           - wait on the previous receive to complete
           - send this data to children
           3) Wait on the last segment
           4) Compute number of elements in last segment.
           5) Send the last segment to children
        */
        req_index = 0;
        err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype,
                                 tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
                                 comm, &recv_reqs[req_index]));
        if (err != MPI_SUCCESS) {
            line = __LINE__;
            goto error_hndl;
        }

        for( segindex = 1; segindex < num_segments; segindex++ ) {

            req_index = req_index ^ 0x1;

            /* post new irecv */
            err = MCA_PML_CALL(irecv( tmpbuf + realsegsize, count_by_segment,
                                      datatype, tree->tree_prev,
                                      MCA_COLL_BASE_TAG_BCAST,
                                      comm, &recv_reqs[req_index]));
            if (err != MPI_SUCCESS) {
                line = __LINE__;
                goto error_hndl;
            }

            /* wait for and forward the previous segment to children */
            err = ompi_request_wait( &recv_reqs[req_index ^ 0x1],
                                     MPI_STATUS_IGNORE );
            if (err != MPI_SUCCESS) {
                line = __LINE__;
                goto error_hndl;
            }

            for( i = 0; i < tree->tree_nextsize; i++ ) {
#if defined(COLL_TUNED_BCAST_USE_BLOCKING)
                err = MCA_PML_CALL(send(tmpbuf, count_by_segment, datatype,
                                        tree->tree_next[i],
                                        MCA_COLL_BASE_TAG_BCAST,
                                        MCA_PML_BASE_SEND_STANDARD, comm));
#else
                err = MCA_PML_CALL(isend(tmpbuf, count_by_segment, datatype,
                                         tree->tree_next[i],
                                         MCA_COLL_BASE_TAG_BCAST,
                                         MCA_PML_BASE_SEND_STANDARD, comm,
                                         &send_reqs[i]));
#endif  /* COLL_TUNED_BCAST_USE_BLOCKING */
                if (err != MPI_SUCCESS) {
                    line = __LINE__;
                    goto error_hndl;
                }
            }

#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
            /* complete the sends before starting the next iteration */
            err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
                                         MPI_STATUSES_IGNORE );
            if (err != MPI_SUCCESS) {
                line = __LINE__;
                goto error_hndl;
            }
#endif  /* COLL_TUNED_BCAST_USE_BLOCKING */

            /* Update the receive buffer */
            tmpbuf += realsegsize;

        }

        /* Process the last segment */
        err = ompi_request_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE );
        if (err != MPI_SUCCESS) {
            line = __LINE__;
            goto error_hndl;
        }
        sendcount = original_count - (ptrdiff_t)(num_segments - 1) * count_by_segment;
        for( i = 0; i < tree->tree_nextsize; i++ ) {
#if defined(COLL_TUNED_BCAST_USE_BLOCKING)
            err = MCA_PML_CALL(send(tmpbuf, sendcount, datatype,
                                    tree->tree_next[i],
                                    MCA_COLL_BASE_TAG_BCAST,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
#else
            err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype,
                                     tree->tree_next[i],
                                     MCA_COLL_BASE_TAG_BCAST,
                                     MCA_PML_BASE_SEND_STANDARD, comm,
                                     &send_reqs[i]));
#endif  /* COLL_TUNED_BCAST_USE_BLOCKING */
            if (err != MPI_SUCCESS) {
                line = __LINE__;
                goto error_hndl;
            }
        }

#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
        err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
                                     MPI_STATUSES_IGNORE );
        if (err != MPI_SUCCESS) {
            line = __LINE__;
            goto error_hndl;
        }
#endif  /* COLL_TUNED_BCAST_USE_BLOCKING */
    }

    /* Leaf nodes */
    else {
        /*
           Receive all segments from parent in a loop:
           1) post irecv for the first segment
           2) for segments 1 .. num_segments
           - post irecv for the next segment
           - wait on the previous segment to arrive
           3) wait for the last segment
        */
        req_index = 0;
        err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype,
                                 tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
                                 comm, &recv_reqs[req_index]));
        if (err != MPI_SUCCESS) {
            line = __LINE__;
            goto error_hndl;
        }

        for( segindex = 1; segindex < num_segments; segindex++ ) {
            req_index = req_index ^ 0x1;
            tmpbuf += realsegsize;
            /* post receive for the next segment */
            err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype,
                                     tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
                                     comm, &recv_reqs[req_index]));
            if (err != MPI_SUCCESS) {
                line = __LINE__;
                goto error_hndl;
            }
            /* wait on the previous segment */
            err = ompi_request_wait( &recv_reqs[req_index ^ 0x1],
                                     MPI_STATUS_IGNORE );
            if (err != MPI_SUCCESS) {
                line = __LINE__;
                goto error_hndl;
            }
        }

        err = ompi_request_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE );
        if (err != MPI_SUCCESS) {
            line = __LINE__;
            goto error_hndl;
        }
    }

#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
    if( NULL != send_reqs ) free(send_reqs);
#endif

    return (MPI_SUCCESS);

error_hndl:
    OPAL_OUTPUT( (ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d",
                  __FILE__, line, err, rank) );
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
    if( NULL != send_reqs ) free(send_reqs);
#endif
    return (err);
}
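The generic bcast above cuts the user buffer into segments of count_by_segment elements and pipelines them down the tree; only the last segment can be shorter. The following hypothetical helper (not part of the OMPI tree) captures just the segment arithmetic:

#include <stdio.h>

/* Returns the element count of the last segment and reports the total number
 * of segments through *num_segments. */
static int last_segment_count(int original_count, int count_by_segment,
                              int *num_segments)
{
    *num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    return original_count - (*num_segments - 1) * count_by_segment;
}

int main(void)
{
    int num_segments;
    int last = last_segment_count(10, 4, &num_segments);
    printf("segments: %d, last segment count: %d\n", num_segments, last);
    return 0;   /* prints "segments: 3, last segment count: 2" */
}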
Example #11
int
ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
        int count,
        struct ompi_datatype_t* datatype,
        int root,
        struct ompi_communicator_t* comm,
        mca_coll_base_module_t *module,
        uint32_t segsize )
{
    int err=0, line, rank, size, segindex, i, lr, pair;
    uint32_t counts[2];
    int segcount[2];       /* Number of elements sent with each segment */
    int num_segments[2];   /* Number of segments */
    int sendcount[2];      /* the same as segcount, except for the last segment */
    size_t realsegsize[2], type_size;
    char *tmpbuf[2];
    ptrdiff_t type_extent, lb;
    ompi_request_t *base_req, *new_req;
    ompi_coll_tree_t *tree;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_split_bintree rank %d root %d ss %5d", rank, root, segsize));

    if (size == 1) {
        return MPI_SUCCESS;
    }

    /* setup the binary tree topology. */
    COLL_TUNED_UPDATE_BINTREE( comm, tuned_module, root );
    tree = data->cached_bintree;

    err = ompi_datatype_type_size( datatype, &type_size );

    /* Determine number of segments and number of elements per segment */
    counts[0] = count/2;
    if (count % 2 != 0) counts[0]++;
    counts[1] = count - counts[0];
    if ( segsize > 0 ) {
        /* Note that ompi_datatype_type_size() will never return a negative
           value in type_size; it returns an int [vs. an unsigned type]
           because of the MPI spec. */
        if (segsize < ((uint32_t) type_size)) {
            segsize = type_size; /* push segsize up to hold one type */
        }
        segcount[0] = segcount[1] = segsize / type_size;
        num_segments[0] = counts[0]/segcount[0];
        if ((counts[0] % segcount[0]) != 0) num_segments[0]++;
        num_segments[1] = counts[1]/segcount[1];
        if ((counts[1] % segcount[1]) != 0) num_segments[1]++;
    } else {
        segcount[0]     = counts[0];
        segcount[1]     = counts[1];
        num_segments[0] = num_segments[1] = 1;
    }

    /* if the message is too small to be split into segments */
    if( (counts[0] == 0 || counts[1] == 0) ||
            (segsize > ((ptrdiff_t)counts[0] * type_size)) ||
            (segsize > ((ptrdiff_t)counts[1] * type_size)) ) {
        /* call linear version here ! */
        return (ompi_coll_tuned_bcast_intra_chain ( buffer, count, datatype,
                root, comm, module,
                segsize, 1 ));
    }

    err = ompi_datatype_get_extent (datatype, &lb, &type_extent);

    /* Determine real segment size */
    realsegsize[0] = (ptrdiff_t)segcount[0] * type_extent;
    realsegsize[1] = (ptrdiff_t)segcount[1] * type_extent;

    /* set the buffer pointers */
    tmpbuf[0] = (char *) buffer;
    tmpbuf[1] = (char *) buffer + (ptrdiff_t)counts[0] * type_extent;

    /* Step 1:
       Root splits the buffer in 2 and sends segmented message down the branches.
       Left subtree of the tree receives first half of the buffer, while right
       subtree receives the remaining message.
    */

    /* determine if I am left (0) or right (1), (root is right) */
    lr = ((rank + size - root)%size + 1)%2;

    /* root code */
    if( rank == root ) {
        /* determine segment count */
        sendcount[0] = segcount[0];
        sendcount[1] = segcount[1];
        /* for each segment */
        for (segindex = 0; segindex < num_segments[0]; segindex++) {
            /* for each child */
            for( i = 0; i < tree->tree_nextsize && i < 2; i++ ) {
                if (segindex >= num_segments[i]) { /* no more segments */
                    continue;
                }
                /* determine how many elements are being sent in this round */
                if(segindex == (num_segments[i] - 1))
                    sendcount[i] = counts[i] - segindex*segcount[i];
                /* send data */
                err = MCA_PML_CALL(send(tmpbuf[i], sendcount[i], datatype,
                                        tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST,
                                        MCA_PML_BASE_SEND_STANDARD, comm));
                if (err != MPI_SUCCESS) {
                    line = __LINE__;
                    goto error_hndl;
                }
                /* update tmp buffer */
                tmpbuf[i] += realsegsize[i];
            }
        }
    }

    /* intermediate nodes code */
    else if( tree->tree_nextsize > 0 ) {
        /* Intermediate nodes:
         * An intermediate node receives segments for only one half of the data.
         * Which half is determined by whether the node belongs to the "left" or
         * "right" subtree. The topology building function builds the binary tree
         * such that odd "shifted ranks" ((rank + size - root)%size) are in the
         * left subtree, and even ones in the right subtree.
         *
         * Create the pipeline: post the first receive, then in the loop post the
         * next receive, wait for the previous receive to complete, and forward
         * that data to all children.
         */
        sendcount[lr] = segcount[lr];
        err = MCA_PML_CALL(irecv(tmpbuf[lr], sendcount[lr], datatype,
                                 tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
                                 comm, &base_req));
        if (err != MPI_SUCCESS) {
            line = __LINE__;
            goto error_hndl;
        }

        for( segindex = 1; segindex < num_segments[lr]; segindex++ ) {
            /* determine how many elements to expect in this round */
            if( segindex == (num_segments[lr] - 1))
                sendcount[lr] = counts[lr] - (ptrdiff_t)segindex * (ptrdiff_t)segcount[lr];
            /* post new irecv */
            err = MCA_PML_CALL(irecv( tmpbuf[lr] + realsegsize[lr], sendcount[lr],
                                      datatype, tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
                                      comm, &new_req));
            if (err != MPI_SUCCESS) {
                line = __LINE__;
                goto error_hndl;
            }

            /* wait for and forward current segment */
            err = ompi_request_wait_all( 1, &base_req, MPI_STATUSES_IGNORE );
            for( i = 0; i < tree->tree_nextsize; i++ ) {  /* send data to children (segcount[lr]) */
                err = MCA_PML_CALL(send( tmpbuf[lr], segcount[lr], datatype,
                                         tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST,
                                         MCA_PML_BASE_SEND_STANDARD, comm));
                if (err != MPI_SUCCESS) {
                    line = __LINE__;
                    goto error_hndl;
                }
            } /* end of for each child */

            /* update the base request */
            base_req = new_req;
            /* go to the next buffer (ie. the one corresponding to the next recv) */
            tmpbuf[lr] += realsegsize[lr];
        } /* end of for segindex */

        /* wait for the last segment and forward current segment */
        err = ompi_request_wait_all( 1, &base_req, MPI_STATUSES_IGNORE );
        for( i = 0; i < tree->tree_nextsize; i++ ) {  /* send data to children */
            err = MCA_PML_CALL(send(tmpbuf[lr], sendcount[lr], datatype,
                                    tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (err != MPI_SUCCESS) {
                line = __LINE__;
                goto error_hndl;
            }
        } /* end of for each child */
    }

    /* leaf nodes */
    else {
        /* Just consume segments as fast as possible */
        sendcount[lr] = segcount[lr];
        for (segindex = 0; segindex < num_segments[lr]; segindex++) {
            /* determine how many elements to expect in this round */
            if (segindex == (num_segments[lr] - 1))
                sendcount[lr] = counts[lr] - (ptrdiff_t)segindex * (ptrdiff_t)segcount[lr];
            /* receive segments */
            err = MCA_PML_CALL(recv(tmpbuf[lr], sendcount[lr], datatype,
                                    tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
                                    comm, MPI_STATUS_IGNORE));
            if (err != MPI_SUCCESS) {
                line = __LINE__;
                goto error_hndl;
            }
            /* update the initial pointer to the buffer */
            tmpbuf[lr] += realsegsize[lr];
        }
    }

    /* reset the buffer pointers */
    tmpbuf[0] = (char *) buffer;
    tmpbuf[1] = (char *) buffer + (ptrdiff_t)counts[0] * type_extent;

    /* Step 2:
       Find your immediate pair (identical node in opposite subtree) and SendRecv
       data buffer with them.
       The tree building function ensures that
       if (we are not root)
       if we are in the left subtree (lr == 0) our pair is (rank+1)%size.
       if we are in the right subtree (lr == 1) our pair is (rank-1)%size
       If we have an even number of nodes, rank (size-1) will pair up with the root.
    */
    if (lr == 0) {
        pair = (rank+1)%size;
    } else {
        pair = (rank+size-1)%size;
    }

    if ( (size%2) != 0 && rank != root) {

        err = ompi_coll_tuned_sendrecv( tmpbuf[lr], counts[lr], datatype,
                                        pair, MCA_COLL_BASE_TAG_BCAST,
                                        tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
                                        pair, MCA_COLL_BASE_TAG_BCAST,
                                        comm, MPI_STATUS_IGNORE, rank);
        if (err != MPI_SUCCESS) {
            line = __LINE__;
            goto error_hndl;
        }
    } else if ( (size%2) == 0 ) {
        /* root sends right buffer to the last node */
        if( rank == root ) {
            err = MCA_PML_CALL(send(tmpbuf[1], counts[1], datatype,
                                    (root+size-1)%size, MCA_COLL_BASE_TAG_BCAST,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (err != MPI_SUCCESS) {
                line = __LINE__;
                goto error_hndl;
            }

        }
        /* last node receives right buffer from the root */
        else if (rank == (root+size-1)%size) {
            err = MCA_PML_CALL(recv(tmpbuf[1], counts[1], datatype,
                                    root, MCA_COLL_BASE_TAG_BCAST,
                                    comm, MPI_STATUS_IGNORE));
            if (err != MPI_SUCCESS) {
                line = __LINE__;
                goto error_hndl;
            }
        }
        /* everyone else exchanges buffers */
        else {
            err = ompi_coll_tuned_sendrecv( tmpbuf[lr], counts[lr], datatype,
                                            pair, MCA_COLL_BASE_TAG_BCAST,
                                            tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
                                            pair, MCA_COLL_BASE_TAG_BCAST,
                                            comm, MPI_STATUS_IGNORE, rank);
            if (err != MPI_SUCCESS) {
                line = __LINE__;
                goto error_hndl;
            }
        }
    }
    return (MPI_SUCCESS);

error_hndl:
    OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
    return (err);
}
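Step 2 of the split-binary-tree bcast relies on a fixed pairing rule: each rank is classified as belonging to the left (lr == 0) or right (lr == 1) subtree relative to the root, and exchanges its half of the buffer with the mirror node in the opposite subtree. The standalone sketch below (illustrative only, using the same formulas as the code above) prints the lr/pair assignment for a small example group:

#include <stdio.h>

int main(void)
{
    int size = 6, root = 2;
    for (int rank = 0; rank < size; rank++) {
        int lr   = ((rank + size - root) % size + 1) % 2;    /* 0 = left, 1 = right (root is right) */
        int pair = (lr == 0) ? (rank + 1) % size
                             : (rank + size - 1) % size;     /* partner in the opposite subtree */
        printf("rank %d: lr=%d pair=%d\n", rank, lr, pair);
    }
    return 0;
}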
/*
 *	exscan_intra
 *
 *	Function:	- basic exscan operation
 *	Accepts:	- same arguments as MPI_Exscan()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_exscan_intra(void *sbuf, void *rbuf, int count,
                            struct ompi_datatype_t *dtype,
                            struct ompi_op_t *op,
                            struct ompi_communicator_t *comm,
                            mca_coll_base_module_t *module)
{
    int size, rank, err;
    ptrdiff_t true_lb, true_extent, lb, extent;
    char *free_buffer = NULL;
    char *reduce_buffer = NULL;
    char *source;
    MPI_Request req = MPI_REQUEST_NULL;

    /* Initialize. */

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    /* If we're rank 0, then we send our sbuf to the next rank */

    if (0 == rank) {
        return MCA_PML_CALL(send(sbuf, count, dtype, rank + 1,
                                 MCA_COLL_BASE_TAG_EXSCAN,
                                 MCA_PML_BASE_SEND_STANDARD, comm));
    }

    /* If we're the last rank, then just receive the result from the
     * prior rank */

    else if ((size - 1) == rank) {
        return MCA_PML_CALL(recv(rbuf, count, dtype, rank - 1,
                                 MCA_COLL_BASE_TAG_EXSCAN, comm,
                                 MPI_STATUS_IGNORE));
    }

    /* Otherwise, get the result from the prior rank, combine it with my
     * data, and send it to the next rank */

    /* Start the receive for the prior rank's answer */

    err = MCA_PML_CALL(irecv(rbuf, count, dtype, rank - 1,
                             MCA_COLL_BASE_TAG_EXSCAN, comm, &req));
    if (MPI_SUCCESS != err) {
        goto error;
    }

    /* Get a temporary buffer to perform the reduction into.  Rationale
     * for malloc'ing this size is provided in coll_basic_reduce.c. */

    ompi_ddt_get_extent(dtype, &lb, &extent);
    ompi_ddt_get_true_extent(dtype, &true_lb, &true_extent);

    free_buffer = (char*)malloc(true_extent + (count - 1) * extent);
    if (NULL == free_buffer) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    reduce_buffer = free_buffer - lb;

    if (ompi_op_is_commute(op)) {

        /* If we're commutative, we can copy my sbuf into the reduction
         * buffer before the receive completes */

        err = ompi_ddt_copy_content_same_ddt(dtype, count, 
                                             reduce_buffer, (char*)sbuf);
        if (MPI_SUCCESS != err) {
            goto error;
        }

        /* Now setup the reduction */

        source = (char*)rbuf;

        /* Finally, wait for the receive to complete (so that we can do
         * the reduction).  */

        err = ompi_request_wait(&req, MPI_STATUS_IGNORE);
        if (MPI_SUCCESS != err) {
            goto error;
        }
    } else {

        /* Setup the reduction */

        source = (char*)sbuf;

        /* If we're not commutative, we have to wait for the receive to
         * complete and then copy it into the reduce buffer */

        err = ompi_request_wait(&req, MPI_STATUS_IGNORE);
        if (MPI_SUCCESS != err) {
            goto error;
        }

        err = ompi_ddt_copy_content_same_ddt(dtype, count, 
                                             reduce_buffer, (char*)rbuf);
        if (MPI_SUCCESS != err) {
            goto error;
        }
    }

    /* Now reduce the received answer with my source into the answer
     * that we send off to the next rank */

    ompi_op_reduce(op, source, reduce_buffer, count, dtype);

    /* Send my result off to the next rank */

    err = MCA_PML_CALL(send(reduce_buffer, count, dtype, rank + 1,
                            MCA_COLL_BASE_TAG_EXSCAN,
                            MCA_PML_BASE_SEND_STANDARD, comm));

    /* Error */

  error:
    free(free_buffer);
    if (MPI_REQUEST_NULL != req) {
        ompi_request_cancel(req);
        ompi_request_wait(&req, MPI_STATUS_IGNORE);
    }

    /* All done */

    return err;
}
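The exscan above is a simple pipeline: rank 0 only sends its contribution, the last rank only receives, and every other rank receives the running reduction of ranks 0..r-1, keeps it as its own result, folds in its contribution, and forwards the new value. A serial model of that dataflow for intuition (no MPI; sum as the operation, values chosen arbitrarily):

#include <stdio.h>

int main(void)
{
    int contrib[4] = { 3, 1, 4, 1 };  /* one contribution per "rank" */
    int carry = contrib[0];           /* what rank 0 forwards to rank 1 */
    printf("rank 0: result undefined (per MPI_Exscan semantics)\n");
    for (int r = 1; r < 4; r++) {
        printf("rank %d: result %d\n", r, carry);  /* reduction of ranks 0..r-1 */
        carry += contrib[r];                       /* value forwarded downstream */
    }
    return 0;   /* results: rank1=3, rank2=4, rank3=8 */
}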
static int bcol_ptpcoll_barrier_recurs_dbl_extra_new(
                                bcol_function_args_t *input_args,
                                struct coll_ml_function_t *const_args)
{
   /* local variable */
    uint64_t sequence_number;
    int rc, completed, num_reqs = 2,
        tag, my_extra_partner_comm_rank;

    ompi_request_t **requests;
    ompi_free_list_item_t *item;

    mca_bcol_ptpcoll_collreq_t *collreq;

    mca_bcol_ptpcoll_module_t *ptp_module =
                         (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
    ompi_communicator_t *comm = ptp_module->super.sbgp_partner_module->group_comm;

    OMPI_FREE_LIST_WAIT_MT(&ptp_module->collreqs_free, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        PTPCOLL_ERROR(("Free list waiting failed."));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    collreq = (mca_bcol_ptpcoll_collreq_t *) item;
    input_args->bcol_opaque_data = (void *) collreq;

    requests = collreq->requests;

    /* TAG Calculation */
    sequence_number = input_args->sequence_num;

    /* Keep tag within the limit supported by the pml */
    tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptp_module->tag_mask);

    /* mark this as a collective tag, to avoid conflict with user-level flags */
    tag = -tag;

    /* I will not participate in the exchange - so just "register" as here,
     * signal the extra rank that I am here */

    my_extra_partner_comm_rank =
                 ptp_module->super.sbgp_partner_module->group_list[ptp_module->proxy_extra_index];

    rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                my_extra_partner_comm_rank, tag,
                MCA_PML_BASE_SEND_STANDARD, comm,
                &(requests[0])));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Send failed."));
        return rc;
    }

    /* Recv signal that the rest are done - my_extra_partner_comm_rank */
    rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
                my_extra_partner_comm_rank, tag, comm,
                &(requests[1])));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("IRecv failed."));
        return rc;
    }

    /* Test for completion */
    completed =
        mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Test for all failed."));
        return rc;
    }

    if (!completed) {
        return BCOL_FN_STARTED;
    }

    OMPI_FREE_LIST_RETURN_MT(&ptp_module->collreqs_free, (ompi_free_list_item_t *) collreq);
    return BCOL_FN_COMPLETE;
}
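Both barrier entry points above derive their tag the same way: offset the sequence number, mask it into the range the PML supports, and negate it so it can never collide with non-negative user-level tags. A minimal sketch of that computation follows; the constants are placeholders, not the real PTPCOLL_TAG_OFFSET/PTPCOLL_TAG_FACTOR/tag_mask values.

#include <stdio.h>

#define TAG_OFFSET 2
#define TAG_FACTOR 2
#define TAG_MASK   0xffff          /* illustrative tag mask */

static int collective_tag(unsigned long sequence_number)
{
    int tag = (int)((TAG_OFFSET + sequence_number * TAG_FACTOR) & TAG_MASK);
    return -tag;                   /* negative tags are reserved for internal collectives */
}

int main(void)
{
    for (unsigned long seq = 0; seq < 3; seq++)
        printf("sequence %lu -> tag %d\n", seq, collective_tag(seq));
    return 0;                      /* prints tags -2, -4, -6 */
}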
static int bcol_ptpcoll_barrier_recurs_dbl_new_progress(
                                bcol_function_args_t *input_args,
                                struct coll_ml_function_t *const_args)
{
   /* local variable */
    mca_bcol_ptpcoll_module_t *ptp_module =
                         (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;

    ompi_communicator_t *comm = ptp_module->super.sbgp_partner_module->group_comm;

    int rc, exchange, pair_comm_rank, tag,
        pair_rank, delta, num_reqs, completed,
        my_rank = ptp_module->super.sbgp_partner_module->my_index,
        n_exchange = ptp_module->super.sbgp_partner_module->n_levels_pow2;

    ompi_request_t **requests;
    mca_bcol_ptpcoll_collreq_t *collreq =
                    (mca_bcol_ptpcoll_collreq_t *) input_args->bcol_opaque_data;

    num_reqs = collreq->num_reqs;
    requests = collreq->requests;

    /* test for completion */
    completed =
        mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Test for all failed."));
        return rc;
    }

    if (!completed) {
          return BCOL_FN_STARTED;
    }

    assert(PTPCOLL_EXTRA != ptp_module->pow_2type);

    /* Continue loop over exchange send/recv pairs */
    num_reqs = 0;
    tag = collreq->tag;

    exchange = collreq->exchange;
    assert(exchange >= 0);

    delta = 1 << exchange;
    for (; exchange < n_exchange; ++exchange) {

        /* rank of exchange partner within the group */
        pair_rank = my_rank ^ delta;

        /* rank within the communicator */
        pair_comm_rank =
            ptp_module->super.sbgp_partner_module->group_list[pair_rank];

        /* send to partner - we will wait for completion, as send
         *   completion is at the MPI level, and will not
         *   incur network level completion costs
         */
        rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                    pair_comm_rank, tag,
                    MCA_PML_BASE_SEND_STANDARD, comm,
                    &(requests[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("ISend failed."));
            return rc;
        }

        ++num_reqs;

        /* receive from partner */
        rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
                    pair_comm_rank, tag, comm,
                    &(requests[1])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("IRecv failed."));
            return rc;
        }

        ++num_reqs;

        PTPCOLL_VERBOSE(5, ("exchange - %d, pair_rank - %d, pair_comm_rank - %d",
                             exchange, pair_rank, pair_comm_rank));

        /* test for completion */
        completed =
            mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!completed) {
            collreq->num_reqs = num_reqs;
            collreq->exchange = exchange + 1;
            assert(collreq->exchange >= 0);

            return BCOL_FN_STARTED;
        }

        delta <<= 1; /* delta *= 2 */
    }

    /* if non power of 2, may need to send message to "extra" proc */
    if (collreq->need_toserv_extra) {
        /* send - let the extra rank know that we are done */
        rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                    collreq->extra_partner_rank, tag,
                    MCA_PML_BASE_SEND_STANDARD, comm,
                    &(requests[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("ISend failed."));
            return rc;
        }

        completed = mca_bcol_ptpcoll_test_for_match(&requests[0], &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for isend failed."));
            return rc;
        }

        if (!completed) {
            collreq->num_reqs = 1;
            collreq->need_toserv_extra = 0;
            collreq->exchange = n_exchange;

            return BCOL_FN_STARTED;
        }
    }

    return BCOL_FN_COMPLETE;
}
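The progress function walks a recursive-doubling schedule: at exchange k each rank pairs with rank ^ (1 << k), so after log2(P) exchanges every rank has synchronized, directly or transitively, with every other rank in the power-of-two group. A standalone sketch (illustrative only) that prints the partner sequence for one rank:

#include <stdio.h>

int main(void)
{
    int n_exchange = 3;                 /* log2 of the power-of-two group size */
    int my_rank = 5;
    for (int exchange = 0; exchange < n_exchange; exchange++) {
        int delta = 1 << exchange;
        printf("exchange %d: partner %d\n", exchange, my_rank ^ delta);
    }
    return 0;   /* partners 4, 7, 1 for rank 5 in a group of 8 */
}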
Example #15
File: nbc.c Project: ICLDisco/ompi
static inline int NBC_Start_round(NBC_Handle *handle) {
  int num; /* number of operations */
  int res;
  char* ptr;
  MPI_Request *tmp;
  NBC_Fn_type type;
  NBC_Args_send     sendargs;
  NBC_Args_recv     recvargs;
  NBC_Args_op         opargs;
  NBC_Args_copy     copyargs;
  NBC_Args_unpack unpackargs;
  void *buf1,  *buf2;

  /* get round-schedule address */
  ptr = handle->schedule->data + handle->row_offset;

  NBC_GET_BYTES(ptr,num);
  NBC_DEBUG(10, "start_round round at offset %d : posting %i operations\n", handle->row_offset, num);

  for (int i = 0 ; i < num ; ++i) {
    int offset = (intptr_t)(ptr - handle->schedule->data);

    memcpy (&type, ptr, sizeof (type));
    switch(type) {
      case SEND:
        NBC_DEBUG(5,"  SEND (offset %li) ", offset);
        NBC_GET_BYTES(ptr,sendargs);
        NBC_DEBUG(5,"*buf: %p, count: %i, type: %p, dest: %i, tag: %i)\n", sendargs.buf,
                  sendargs.count, sendargs.datatype, sendargs.dest, handle->tag);
        /* get an additional request */
        handle->req_count++;
        /* get buffer */
        if(sendargs.tmpbuf) {
          buf1=(char*)handle->tmpbuf+(long)sendargs.buf;
        } else {
          buf1=(void *)sendargs.buf;
        }
#ifdef NBC_TIMING
        Isend_time -= MPI_Wtime();
#endif
        tmp = (MPI_Request *) realloc ((void *) handle->req_array, handle->req_count * sizeof (MPI_Request));
        if (NULL == tmp) {
          return OMPI_ERR_OUT_OF_RESOURCE;
        }

        handle->req_array = tmp;

        res = MCA_PML_CALL(isend(buf1, sendargs.count, sendargs.datatype, sendargs.dest, handle->tag,
                                 MCA_PML_BASE_SEND_STANDARD, sendargs.local?handle->comm->c_local_comm:handle->comm,
                                 handle->req_array+handle->req_count - 1));
        if (OMPI_SUCCESS != res) {
          NBC_Error ("Error in MPI_Isend(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, sendargs.count,
                     sendargs.datatype, sendargs.dest, handle->tag, (unsigned long)handle->comm, res);
          return res;
        }
#ifdef NBC_TIMING
        Isend_time += MPI_Wtime();
#endif
        break;
      case RECV:
        NBC_DEBUG(5, "  RECV (offset %li) ", offset);
        NBC_GET_BYTES(ptr,recvargs);
        NBC_DEBUG(5, "*buf: %p, count: %i, type: %p, source: %i, tag: %i)\n", recvargs.buf, recvargs.count,
                  recvargs.datatype, recvargs.source, handle->tag);
        /* get an additional request - TODO: req_count NOT thread safe */
        handle->req_count++;
        /* get buffer */
        if(recvargs.tmpbuf) {
          buf1=(char*)handle->tmpbuf+(long)recvargs.buf;
        } else {
          buf1=recvargs.buf;
        }
#ifdef NBC_TIMING
        Irecv_time -= MPI_Wtime();
#endif
        tmp = (MPI_Request *) realloc ((void *) handle->req_array, handle->req_count * sizeof (MPI_Request));
        if (NULL == tmp) {
          return OMPI_ERR_OUT_OF_RESOURCE;
        }

        handle->req_array = tmp;

        res = MCA_PML_CALL(irecv(buf1, recvargs.count, recvargs.datatype, recvargs.source, handle->tag, recvargs.local?handle->comm->c_local_comm:handle->comm,
                                 handle->req_array+handle->req_count-1));
        if (OMPI_SUCCESS != res) {
          NBC_Error("Error in MPI_Irecv(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, recvargs.count,
                    recvargs.datatype, recvargs.source, handle->tag, (unsigned long)handle->comm, res);
          return res;
        }
#ifdef NBC_TIMING
        Irecv_time += MPI_Wtime();
#endif
        break;
      case OP:
        NBC_DEBUG(5, "  OP2  (offset %li) ", offset);
        NBC_GET_BYTES(ptr,opargs);
        NBC_DEBUG(5, "*buf1: %p, buf2: %p, count: %i, type: %p)\n", opargs.buf1, opargs.buf2,
                  opargs.count, opargs.datatype);
        /* get buffers */
        if(opargs.tmpbuf1) {
          buf1=(char*)handle->tmpbuf+(long)opargs.buf1;
        } else {
          buf1=(void *)opargs.buf1;
        }
        if(opargs.tmpbuf2) {
          buf2=(char*)handle->tmpbuf+(long)opargs.buf2;
        } else {
          buf2=opargs.buf2;
        }
        ompi_op_reduce(opargs.op, buf1, buf2, opargs.count, opargs.datatype);
        break;
      case COPY:
        NBC_DEBUG(5, "  COPY   (offset %li) ", offset);
        NBC_GET_BYTES(ptr,copyargs);
        NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %p, *tgt: %lu, tgtcount: %i, tgttype: %p)\n",
                  (unsigned long) copyargs.src, copyargs.srccount, copyargs.srctype,
                  (unsigned long) copyargs.tgt, copyargs.tgtcount, copyargs.tgttype);
        /* get buffers */
        if(copyargs.tmpsrc) {
          buf1=(char*)handle->tmpbuf+(long)copyargs.src;
        } else {
          buf1=copyargs.src;
        }
        if(copyargs.tmptgt) {
          buf2=(char*)handle->tmpbuf+(long)copyargs.tgt;
        } else {
          buf2=copyargs.tgt;
        }
        res = NBC_Copy (buf1, copyargs.srccount, copyargs.srctype, buf2, copyargs.tgtcount, copyargs.tgttype,
                        handle->comm);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
          return res;
        }
        break;
      case UNPACK:
        NBC_DEBUG(5, "  UNPACK   (offset %li) ", offset);
        NBC_GET_BYTES(ptr,unpackargs);
        NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %p, *tgt: %lu\n", (unsigned long) unpackargs.inbuf,
                  unpackargs.count, unpackargs.datatype, (unsigned long) unpackargs.outbuf);
        /* get buffers */
        if(unpackargs.tmpinbuf) {
          buf1=(char*)handle->tmpbuf+(long)unpackargs.inbuf;
        } else {
          buf1=unpackargs.inbuf;
        }
        if(unpackargs.tmpoutbuf) {
          buf2=(char*)handle->tmpbuf+(long)unpackargs.outbuf;
        } else {
          buf2=unpackargs.outbuf;
        }
        res = NBC_Unpack (buf1, unpackargs.count, unpackargs.datatype, buf2, handle->comm);
        if (OMPI_SUCCESS != res) {
          NBC_Error ("NBC_Unpack() failed (code: %i)", res);
          return res;
        }

        break;
      default:
        NBC_Error ("NBC_Start_round: bad type %li at offset %li", (long)type, offset);
        return OMPI_ERROR;
    }
  }

  /* check if we can make progress - but not in the first round; skipping it lets us
   * leave the initialization faster and reach more overlap
   *
   * threaded case: calling progress in the first round can lead to a
   * deadlock if NBC_Free is called in this round :-( */
  if (handle->row_offset) {
    res = NBC_Progress(handle);
    if ((NBC_OK != res) && (NBC_CONTINUE != res)) {
      return OMPI_ERROR;
    }
  }

  return OMPI_SUCCESS;
}
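
NBC_Start_round above walks a packed byte stream: it reads the number of operations for the round, then for each operation inspects a type tag and consumes a type-specific argument record, posting the matching isend/irecv or applying the reduction, copy, or unpack inline. The sketch below illustrates that cursor-and-memcpy style of parsing on a simplified, hypothetical record layout; GET_BYTES here is a stand-in for libnbc's NBC_GET_BYTES, not its actual definition.

/* Sketch of parsing a packed operation schedule. The record layout and
 * the GET_BYTES macro are simplified assumptions, not libnbc's own. */
#include <stdio.h>
#include <string.h>

#define GET_BYTES(ptr, x)                 \
    do {                                  \
        memcpy(&(x), (ptr), sizeof(x));   \
        (ptr) += sizeof(x);               \
    } while (0)

typedef enum { OP_SEND = 0, OP_RECV = 1 } op_type_t;

typedef struct {
    op_type_t type;   /* what kind of operation this record describes */
    int       peer;   /* partner rank */
    int       count;  /* number of elements */
} op_record_t;

static void walk_round(char *ptr)
{
    int num;                  /* number of operations in this round */
    GET_BYTES(ptr, num);

    for (int i = 0; i < num; ++i) {
        op_record_t rec;
        GET_BYTES(ptr, rec);  /* consume one fixed-size record */
        printf("op %d: type=%d peer=%d count=%d\n",
               i, (int) rec.type, rec.peer, rec.count);
    }
}

int main(void)
{
    char buf[256], *p = buf;
    int num = 2;
    op_record_t a = { OP_SEND, 1, 4 };
    op_record_t b = { OP_RECV, 1, 4 };

    memcpy(p, &num, sizeof(num)); p += sizeof(num);
    memcpy(p, &a, sizeof(a));     p += sizeof(a);
    memcpy(p, &b, sizeof(b));     p += sizeof(b);

    walk_round(buf);
    return 0;
}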
int
mca_fcoll_static_file_write_all (mca_io_ompio_file_t *fh,
                                 void *buf,
                                 int count,
                                 struct ompi_datatype_t *datatype,
                                 ompi_status_public_t *status)
{
    
    
    
    size_t max_data = 0, bytes_per_cycle=0;
    struct iovec *iov=NULL, *decoded_iov=NULL;
    uint32_t iov_count=0, iov_index=0;
    int i=0,j=0,l=0, temp_index;
    int ret=OMPI_SUCCESS, cycles, local_cycles, *bytes_per_process=NULL;
    int index, *disp_index=NULL, **blocklen_per_process=NULL;
    int *iovec_count_per_process=NULL, *displs=NULL;
    size_t total_bytes_written=0;
    MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL;
    MPI_Aint bytes_to_write_in_cycle=0, global_iov_count=0, global_count=0;
    
    local_io_array *local_iov_array =NULL, *global_iov_array=NULL;
    local_io_array *file_offsets_for_agg=NULL;
    int *sorted=NULL, *sorted_file_offsets=NULL, temp_pindex, *temp_disp_index=NULL;
    char *send_buf=NULL, *global_buf=NULL;
    int iov_size=0, current_position=0, *current_index=NULL;
    int *bytes_remaining=NULL, entries_per_aggregator=0;
    ompi_datatype_t **recvtype = NULL;
    MPI_Request *send_req=NULL, *recv_req=NULL;
    /* For creating datatype of type io_array */
    int blocklen[3] = {1, 1, 1};
    int static_num_io_procs=1;
    OPAL_PTRDIFF_TYPE d[3], base;
    ompi_datatype_t *types[3];
    ompi_datatype_t *io_array_type=MPI_DATATYPE_NULL;
    /*----------------------------------------------*/
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0;
    double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0;
    double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0;
    mca_io_ompio_print_entry nentry;
#endif
    
    
#if DEBUG_ON
    MPI_Aint gc_in;
#endif
    
//  if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) {
//    fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY;
//  }
    
    /* In case the data is not contiguous in memory, decode it into an iovec */
    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
        fh->f_decode_datatype ((struct mca_io_ompio_file_t *)fh,
                               datatype,
                               count,
                               buf,
                               &max_data,
                               &decoded_iov,
                               &iov_count);
    }
    else {
        max_data = count * datatype->super.size;
    }
    
    if ( MPI_STATUS_IGNORE != status ) {
	status->_ucount = max_data;
    }
    
    fh->f_get_num_aggregators ( & static_num_io_procs );
    fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *)fh,
				static_num_io_procs,
				max_data);
    
    
    /* io_array datatype for use in communication */
    types[0] = &ompi_mpi_long.dt;
    types[1] = &ompi_mpi_long.dt;
    types[2] = &ompi_mpi_int.dt;
    
    d[0] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0];
    d[1] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].length;
    d[2] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].process_id;
    base = d[0];
    for (i=0 ; i<3 ; i++) {
        d[i] -= base;
    }
    ompi_datatype_create_struct (3,
                                 blocklen,
                                 d,
                                 types,
                                 &io_array_type);
    ompi_datatype_commit (&io_array_type);
    /* #########################################################*/
    
    
    
    ret = fh->f_generate_current_file_view((struct mca_io_ompio_file_t *)fh,
                                           max_data,
                                           &iov,
                                           &iov_size);
    if (ret != OMPI_SUCCESS){
        fprintf(stderr,"Current File View Generation Error\n");
        goto exit;
    }
    
    if (0 == iov_size){
        iov_size  = 1;
    }
    
    local_iov_array = (local_io_array *)malloc (iov_size * sizeof(local_io_array));
    if ( NULL == local_iov_array){
        fprintf(stderr,"local_iov_array allocation error\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
    
    
    for (j=0; j < iov_size; j++){
        local_iov_array[j].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)
            iov[j].iov_base;
        local_iov_array[j].length = (size_t)iov[j].iov_len;
        local_iov_array[j].process_id = fh->f_rank;
        
    }
    
    fh->f_get_bytes_per_agg ( (int *) &bytes_per_cycle);
    
    
    local_cycles = ceil((double)max_data/bytes_per_cycle);
    ret = fh->f_comm->c_coll.coll_allreduce (&local_cycles,
                                             &cycles,
                                             1,
                                             MPI_INT,
                                             MPI_MAX,
                                             fh->f_comm,
                                             fh->f_comm->c_coll.coll_allreduce_module);
    
    if (OMPI_SUCCESS != ret){
        fprintf(stderr,"local cycles allreduce!\n");
        goto exit;
    }
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        
        disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int ));
        if (NULL == bytes_per_process){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        bytes_remaining = (int *) malloc (fh->f_procs_per_group * sizeof(int));
        if (NULL == bytes_remaining){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        current_index = (int *) malloc (fh->f_procs_per_group * sizeof(int));
        if (NULL == current_index){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        displs_per_process = (MPI_Aint **)
            malloc (fh->f_procs_per_group * sizeof (MPI_Aint*));
        
        if (NULL == displs_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        for(i=0;i<fh->f_procs_per_group;i++){
            current_index[i] = 0;
            bytes_remaining[i] =0;
            blocklen_per_process[i] = NULL;
            displs_per_process[i] = NULL;
        }
    }
    
    iovec_count_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int));
    if (NULL == iovec_count_per_process){
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
    
    displs = (int *) malloc (fh->f_procs_per_group * sizeof(int));
    if (NULL == displs){
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
    
    ret = fh->f_allgather_array (&iov_size,
                                 1,
                                 MPI_INT,
                                 iovec_count_per_process,
                                 1,
                                 MPI_INT,
                                 fh->f_aggregator_index,
                                 fh->f_procs_in_group,
                                 fh->f_procs_per_group,
                                 fh->f_comm);
    
    if( OMPI_SUCCESS != ret){
        fprintf(stderr,"iov size allgatherv array!\n");
        goto exit;
    }
    
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        displs[0] = 0;
        global_iov_count = iovec_count_per_process[0];
        for (i=1 ; i<fh->f_procs_per_group ; i++) {
            global_iov_count += iovec_count_per_process[i];
            displs[i] = displs[i-1] + iovec_count_per_process[i-1];
        }
    }
    
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        global_iov_array = (local_io_array *) malloc (global_iov_count *
                                                      sizeof(local_io_array));
        if (NULL == global_iov_array){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }
    
    ret = fh->f_gatherv_array (local_iov_array,
                               iov_size,
                               io_array_type,
                               global_iov_array,
                               iovec_count_per_process,
                               displs,
                               io_array_type,
                               fh->f_aggregator_index,
                               fh->f_procs_in_group,
                               fh->f_procs_per_group,
                               fh->f_comm);
    if (OMPI_SUCCESS != ret){
        fprintf(stderr,"global_iov_array gather error!\n");
        goto exit;
    }
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        
        if ( 0 == global_iov_count){
            global_iov_count =  1;
        }
        
        sorted = (int *)malloc (global_iov_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        local_heap_sort (global_iov_array, global_iov_count, sorted);
    }
    
#if DEBUG_ON
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        for (gc_in=0; gc_in<global_iov_count; gc_in++){
            printf("%d: Offset[%ld]: %lld, Length[%ld]: %ld\n",
                   global_iov_array[gc_in].process_id,
                   gc_in, global_iov_array[gc_in].offset,
                   gc_in, global_iov_array[gc_in].length);
        }
    }
#endif
    
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_exch = MPI_Wtime();
#endif
    
    
    for (index = 0; index < cycles; index++){
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            if (NULL == recvtype){
                recvtype = (ompi_datatype_t **)
                    malloc (fh->f_procs_per_group  * sizeof(ompi_datatype_t *));
                if (NULL == recvtype) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }
            for(l=0;l<fh->f_procs_per_group;l++){
                disp_index[l] =  1;
                if (NULL != blocklen_per_process[l]){
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
                if (NULL != displs_per_process[l]){
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                if (NULL == blocklen_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for blocklen\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l]){
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }
            if (NULL != sorted_file_offsets){
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }
            
            if(NULL != file_offsets_for_agg){
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }
            
            if (NULL != memory_displacements){
                free(memory_displacements);
                memory_displacements = NULL;
            }
            
        }
        if (local_cycles > index) {
            if ((index == local_cycles-1) && (max_data % bytes_per_cycle)) {
                bytes_to_write_in_cycle = max_data % bytes_per_cycle;
            }
            else if (max_data <= bytes_per_cycle) {
                bytes_to_write_in_cycle = max_data;
            }
            else {
                bytes_to_write_in_cycle = bytes_per_cycle;
            }
        }
        else {
            bytes_to_write_in_cycle = 0;
        }
#if DEBUG_ON
        /*    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {*/
        printf ("***%d: CYCLE %d   Bytes %ld**********\n",
                fh->f_rank,
                index,
                bytes_to_write_in_cycle);
        /* }*/
#endif
        /**********************************************************
         **Gather the Data from all the processes at the writers **
         *********************************************************/
        
        /* gather from each process how many bytes each will be sending */
        fh->f_gather_array (&bytes_to_write_in_cycle,
                            1,
                            MPI_INT,
                            bytes_per_process,
                            1,
                            MPI_INT,
                            fh->f_aggregator_index,
                            fh->f_procs_in_group,
                            fh->f_procs_per_group,
                            fh->f_comm);
        
        /*
          Each aggregator needs to collect bytes_to_write_in_cycle from
          every process in its group; these contributions add up to
          bytes_per_cycle.
        */
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            for (i=0;i<fh->f_procs_per_group; i++){
/*	    printf("bytes_per_process[%d]: %d\n", i, bytes_per_process[i]);
 */
                
#if DEBUG_ON
                printf ("%d : bytes_per_process : %d\n",
                        fh->f_procs_in_group[i],
                        bytes_per_process[i]);
#endif
                
                while (bytes_per_process[i] > 0){
                    if (get_process_id(global_iov_array[sorted[current_index[i]]].process_id,
                                       fh) == i){ /* current id owns this entry!*/
                        
                        /*Add and subtract length and create
                          blocklength and displs array*/
                        if (bytes_remaining[i]){ /*Remaining bytes in the current entry of
                                                   the global offset array*/
                            if (bytes_remaining[i] <= bytes_per_process[i]){
                                blocklen_per_process[i][disp_index[i] - 1] = bytes_remaining[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);
                                
                                blocklen_per_process[i] = (int *) realloc
                                    ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *)realloc
                                    ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                bytes_per_process[i] -= bytes_remaining[i];
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                bytes_remaining[i] = 0;
                                disp_index[i] += 1;
                                /* This entry has been used up, we need to move to the
                                   next entry of this process and make current_index point there*/
                                current_index[i]  = find_next_index(i,
                                                                    current_index[i],
                                                                    fh,
                                                                    global_iov_array,
                                                                    global_iov_count,
                                                                    sorted);
                                if (current_index[i] == -1){
                                    /* No more entries left, so it's all done; exit! */
                                    break;
                                }
                                continue;
                            }
                            else{
                                blocklen_per_process[i][disp_index[i] - 1] = bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);
                                bytes_remaining[i] -= bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                        }
                        else{
                            if (bytes_per_process[i] <
                                global_iov_array[sorted[current_index[i]]].length){
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;
                                
                                bytes_remaining[i] =
                                    global_iov_array[sorted[current_index[i]]].length -
                                    bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                            else {
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].length;
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;
                                blocklen_per_process[i] =
                                    (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *)realloc
                                    ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                disp_index[i] += 1;
                                bytes_per_process[i] -=
                                    global_iov_array[sorted[current_index[i]]].length;
                                current_index[i] = find_next_index(i,
                                                                   current_index[i],
                                                                   fh,
                                                                   global_iov_array,
                                                                   global_iov_count,
                                                                   sorted);
                                if (current_index[i] == -1){
                                    break;
                                }
                            }
                        }
                    }
                    else{
                        current_index[i] = find_next_index(i,
                                                           current_index[i],
                                                           fh,
                                                           global_iov_array,
                                                           global_iov_count,
                                                           sorted);
                        if (current_index[i] == -1){
                            bytes_per_process[i] = 0; /* no more entries left
                                                         to service this request*/
                            continue;
                        }
                    }
                }
            }
            entries_per_aggregator=0;
            for (i=0;i<fh->f_procs_per_group;i++){
                for (j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0){
                        entries_per_aggregator++;
#if DEBUG_ON
                        printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
                               fh->f_procs_in_group[i],j,
                               blocklen_per_process[i][j],j,
                               displs_per_process[i][j],
                               fh->f_rank);
                        
#endif
                    }
                    
                }
            }
            
            if (entries_per_aggregator > 0){
                file_offsets_for_agg = (local_io_array *)
                    malloc(entries_per_aggregator*sizeof(local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                sorted_file_offsets = (int *)
                    malloc (entries_per_aggregator*sizeof(int));
                if (NULL == sorted_file_offsets){
                    opal_output (1, "OUT OF MEMORY\n");
                    ret =  OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                temp_index = 0;
                for (i=0;i<fh->f_procs_per_group; i++){
                    for(j=0;j<disp_index[i];j++){
                        if (blocklen_per_process[i][j] > 0){
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;
                        }
                    }
                }
            }
            else{
                continue;
            }
            local_heap_sort (file_offsets_for_agg,
                             entries_per_aggregator,
                             sorted_file_offsets);
            
            memory_displacements = (MPI_Aint *) malloc
                (entries_per_aggregator * sizeof(MPI_Aint));
            if (NULL == memory_displacements) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++){
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }
            
            temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            global_count = 0;
            for (i=0;i<entries_per_aggregator;i++){
                temp_pindex =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_pindex] < disp_index[temp_pindex])
                    temp_disp_index[temp_pindex] += 1;
                else{
                    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
                           temp_pindex, temp_disp_index[temp_pindex],
                           temp_pindex, disp_index[temp_pindex]);
                }
                global_count +=
                    file_offsets_for_agg[sorted_file_offsets[i]].length;
            }
            if (NULL != temp_disp_index){
                free(temp_disp_index);
                temp_disp_index = NULL;
            }
            
#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0; i<entries_per_aggregator;i++){
                printf("%d: OFFSET: %lld   LENGTH: %ld, Mem-offset: %ld, disp : %d\n",
                       file_offsets_for_agg[sorted_file_offsets[i]].process_id,
                       file_offsets_for_agg[sorted_file_offsets[i]].offset,
                       file_offsets_for_agg[sorted_file_offsets[i]].length,
                       memory_displacements[sorted_file_offsets[i]],
                       disp_index[ file_offsets_for_agg[sorted_file_offsets[i]].process_id]);
            }
#endif
            
#if DEBUG_ON
            printf("%d: global_count : %ld, bytes_to_write_in_cycle : %ld, procs_per_group: %d\n",
                   fh->f_rank,
                   global_count,
                   bytes_to_write_in_cycle,
                   fh->f_procs_per_group);
#endif
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_comm_time = MPI_Wtime();
#endif
            global_buf  = (char *) malloc (global_count);
            if (NULL == global_buf){
                opal_output(1, "OUT OF MEMORY");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            
            recv_req = (MPI_Request *)
                malloc (fh->f_procs_per_group * sizeof(MPI_Request));
            if (NULL == recv_req){
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            for (i=0;i<fh->f_procs_per_group; i++){
                ompi_datatype_create_hindexed(disp_index[i],
                                              blocklen_per_process[i],
                                              displs_per_process[i],
                                              MPI_BYTE,
                                              &recvtype[i]);
                ompi_datatype_commit(&recvtype[i]);
                ret = MCA_PML_CALL(irecv(global_buf,
                                         1,
                                         recvtype[i],
                                         fh->f_procs_in_group[i],
                                         123,
                                         fh->f_comm,
                                         &recv_req[i]));
                if (OMPI_SUCCESS != ret){
                    fprintf(stderr,"irecv Error!\n");
                    goto exit;
                }
            }
        }
        
        if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
            send_buf = &((char*)buf)[total_bytes_written];
        }
        else if (bytes_to_write_in_cycle) {
            /* allocate a send buffer and copy the data that needs
               to be sent into it in case the data is non-contiguous
               in memory */
            OPAL_PTRDIFF_TYPE mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;
            
            send_buf = malloc (bytes_to_write_in_cycle);
            if (NULL == send_buf) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            remaining = bytes_to_write_in_cycle;
            
            while (remaining) {
                mem_address = (OPAL_PTRDIFF_TYPE)
                    (decoded_iov[iov_index].iov_base) + current_position;
                
                if (remaining >=
                    (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *)mem_address,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                        (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                        (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *)mem_address,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }
        }
        total_bytes_written += bytes_to_write_in_cycle;
        
        send_req = (MPI_Request *) malloc (sizeof(MPI_Request));
        if (NULL == send_req){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        
        ret = MCA_PML_CALL(isend(send_buf,
                                 bytes_to_write_in_cycle,
                                 MPI_BYTE,
                                 fh->f_procs_in_group[fh->f_aggregator_index],
                                 123,
                                 MCA_PML_BASE_SEND_STANDARD,
                                 fh->f_comm,
                                 send_req));
        
        if ( OMPI_SUCCESS != ret ){
            fprintf(stderr,"isend error!\n");
            goto exit;
        }
        
        ret = ompi_request_wait (send_req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }
        
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            ret = ompi_request_wait_all (fh->f_procs_per_group,
                                         recv_req,
                                         MPI_STATUSES_IGNORE);
            if (OMPI_SUCCESS != ret){
                goto exit;
            }
            
#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){
                for (i=0 ; i<global_count/4 ; i++)
                    printf (" RECV %d \n",((int *)global_buf)[i]);
            }
#endif
        }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_comm_time = MPI_Wtime();
        comm_time += end_comm_time - start_comm_time;
#endif
        
        
        
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
                (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            fh->f_num_of_io_entries = 0;
            /*First entry for every aggregator*/
            fh->f_io_array[fh->f_num_of_io_entries].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[fh->f_num_of_io_entries].length =
                file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;
            for (i=1;i<entries_per_aggregator;i++){
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                    file_offsets_for_agg[sorted_file_offsets[i]].offset){
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else {
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }
            }
#if DEBUG_ON
            printf("*************************** %d\n", fh->f_num_of_io_entries);
            for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
                printf(" ADDRESS: %p  OFFSET: %ld   LENGTH: %ld\n",
                       fh->f_io_array[i].memory_address,
                       (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset,
                       fh->f_io_array[i].length);
            }
#endif
            
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_write_time = MPI_Wtime();
#endif
            
            if (fh->f_num_of_io_entries) {
                if ( 0 >  fh->f_fbtl->fbtl_pwritev (fh)) {
                    opal_output (1, "WRITE FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }
            
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_write_time = MPI_Wtime();
            write_time += end_write_time - start_write_time;
#endif
            
        }
        if (NULL != send_req){
            free(send_req);
            send_req = NULL;
        }
        
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            fh->f_num_of_io_entries = 0;
            if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
            for (i = 0; i < fh->f_procs_per_group; i++)
                ompi_datatype_destroy(recvtype+i);
            if (NULL != recvtype){
                free(recvtype);
                recvtype=NULL;
            }
            if (NULL != recv_req){
                free(recv_req);
                recv_req = NULL;
            }
            if (NULL != global_buf) {
                free (global_buf);
                global_buf = NULL;
            }
        }
    }
    
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_exch = MPI_Wtime();
    exch_write += end_exch - start_exch;
    nentry.time[0] = write_time;
    nentry.time[1] = comm_time;
    nentry.time[2] = exch_write;
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank)
        nentry.aggregator = 1;
    else
        nentry.aggregator = 0;
    nentry.nprocs_for_coll = static_num_io_procs;
    if (!fh->f_full_print_queue(WRITE_PRINT_QUEUE)){
	fh->f_register_print_entry(WRITE_PRINT_QUEUE,
				   nentry);
    }
#endif
    
    
    
exit:
    if (NULL != decoded_iov){
        free(decoded_iov);
        decoded_iov = NULL;
    }
    
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        
        if (NULL != local_iov_array){
            free(local_iov_array);
            local_iov_array = NULL;
        }
        for(l=0;l<fh->f_procs_per_group;l++){
            if (NULL != blocklen_per_process[l]){
                free(blocklen_per_process[l]);
                blocklen_per_process[l] = NULL;
            }
            if (NULL != displs_per_process[l]){
                free(displs_per_process[l]);
                displs_per_process[l] = NULL;
            }
        }
    }
    
    if (NULL != send_buf){
        free(send_buf);
        send_buf = NULL;
    }
    
    if (NULL != global_buf){
        free(global_buf);
        global_buf = NULL;
    }
    
    if (NULL != recvtype){
        free(recvtype);
        recvtype = NULL;
    }
    
    if (NULL != sorted_file_offsets){
        free(sorted_file_offsets);
        sorted_file_offsets = NULL;
    }
    
    if (NULL != file_offsets_for_agg){
        free(file_offsets_for_agg);
        file_offsets_for_agg = NULL;
    }
    
    if (NULL != memory_displacements){
        free(memory_displacements);
        memory_displacements = NULL;
    }
    
    if (NULL != displs_per_process){
        free(displs_per_process);
        displs_per_process = NULL;
    }
    
    if (NULL != blocklen_per_process){
        free(blocklen_per_process);
        blocklen_per_process = NULL;
    }
    
    if(NULL != current_index){
        free(current_index);
        current_index = NULL;
    }
    
    if(NULL != bytes_remaining){
        free(bytes_remaining);
        bytes_remaining = NULL;
    }
    
    if (NULL != disp_index){
        free(disp_index);
        disp_index = NULL;
    }
    
    if (NULL != sorted) {
        free(sorted);
        sorted = NULL;
    }
    
    return ret;
}
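
The aggregator in mca_fcoll_static_file_write_all builds its f_io_array by walking the entries sorted by file offset and merging any chunk whose offset directly follows the previous chunk's end, so contiguous pieces become a single write request. The sketch below isolates just that coalescing step on simplified types; io_entry_t is a hypothetical stand-in for local_io_array / mca_io_ompio_io_array_t.

/* Sketch of coalescing offset/length pairs (already sorted by offset)
 * into merged I/O entries. Types and names are illustrative. */
#include <stdio.h>
#include <stddef.h>

typedef struct {
    long   offset;   /* file offset of this chunk     */
    size_t length;   /* number of bytes in this chunk */
} io_entry_t;

/* Merge contiguous entries of 'in' (sorted by offset) into 'out';
 * returns the number of merged entries written to 'out'. */
static int coalesce_entries(const io_entry_t *in, int n, io_entry_t *out)
{
    int m = 0;

    if (n <= 0) return 0;

    out[m] = in[0];
    for (int i = 1; i < n; ++i) {
        if (out[m].offset + (long) out[m].length == in[i].offset) {
            out[m].length += in[i].length;   /* contiguous: extend entry */
        } else {
            out[++m] = in[i];                /* gap: start a new entry   */
        }
    }
    return m + 1;
}

int main(void)
{
    io_entry_t in[]  = { { 0, 100 }, { 100, 50 }, { 200, 25 } };
    io_entry_t out[3];
    int m = coalesce_entries(in, 3, out);

    for (int i = 0; i < m; ++i)
        printf("entry %d: offset=%ld length=%zu\n",
               i, out[i].offset, out[i].length);
    return 0;   /* prints two entries: (0,150) and (200,25) */
}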
Example #17
/*
 *	allreduce_inter
 *
 *	Function:	- allreduce using other MPI collectives
 *	Accepts:	- same as MPI_Allreduce()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_allreduce_inter(void *sbuf, void *rbuf, int count,
                               struct ompi_datatype_t *dtype,
                               struct ompi_op_t *op,
                               struct ompi_communicator_t *comm)
{
    int err, i;
    int rank;
    int root = 0;
    int rsize;
    ptrdiff_t lb, extent;
    char *tmpbuf = NULL, *pml_buffer = NULL;
    ompi_request_t *req[2];
    ompi_request_t **reqs = comm->c_coll_basic_data->mccb_reqs;

    rank = ompi_comm_rank(comm);
    rsize = ompi_comm_remote_size(comm);

    /* determine the result of the remote group; you cannot
     * use coll_reduce for inter-communicators, since you would
     * then need to determine an order between the two groups
     * (e.g. which group is providing the data and which one
     * enters coll_reduce providing MPI_PROC_NULL as the root
     * argument, etc.). Here, we execute the data exchange for
     * both groups simultaneously. */
    /*****************************************************************/
    if (rank == root) {
        err = ompi_ddt_get_extent(dtype, &lb, &extent);
        if (OMPI_SUCCESS != err) {
            return OMPI_ERROR;
        }

        tmpbuf = (char *) malloc(count * extent);
        if (NULL == tmpbuf) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        pml_buffer = tmpbuf - lb;

        /* Do a send-recv between the two root procs. to avoid deadlock */
        err = MCA_PML_CALL(irecv(rbuf, count, dtype, 0,
                                 MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                 &(req[0])));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = MCA_PML_CALL(isend(sbuf, count, dtype, 0,
                                 MCA_COLL_BASE_TAG_ALLREDUCE,
                                 MCA_PML_BASE_SEND_STANDARD,
                                 comm, &(req[1])));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = ompi_request_wait_all(2, req, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }


        /* Loop receiving and calling reduction function (C or Fortran). */
        for (i = 1; i < rsize; i++) {
            err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i,
                                    MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) {
                goto exit;
            }

            /* Perform the reduction */
            ompi_op_reduce(op, pml_buffer, rbuf, count, dtype);
        }
    } else {
        /* If not root, send data to the root. */
        err = MCA_PML_CALL(send(sbuf, count, dtype, root,
                                MCA_COLL_BASE_TAG_ALLREDUCE,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
    }


    /* Now one process (the root) holds the result computed from the remote
     * group's data. To distribute the data to all processes in the local group,
     * we exchange the data between the two root processes; each root then sends
     * it to every other process in the remote group. */
    /***************************************************************************/
    if (rank == root) {
        /* sendrecv between the two roots */
        err = MCA_PML_CALL(irecv(pml_buffer, count, dtype, 0,
                                 MCA_COLL_BASE_TAG_ALLREDUCE,
                                 comm, &(req[1])));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = MCA_PML_CALL(isend(rbuf, count, dtype, 0,
                                 MCA_COLL_BASE_TAG_ALLREDUCE,
                                 MCA_PML_BASE_SEND_STANDARD, comm,
                                 &(req[0])));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
        err = ompi_request_wait_all(2, req, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        /* distribute the data to the other processes in the remote group.
         * Note that we start from 1 (not from zero), since rank zero
         * already has the correct data AND we avoid a potential
         * deadlock here.
         */
        if (rsize > 1) {
            for (i = 1; i < rsize; i++) {
                err = MCA_PML_CALL(isend(pml_buffer, count, dtype, i,
                                         MCA_COLL_BASE_TAG_ALLREDUCE,
                                         MCA_PML_BASE_SEND_STANDARD, comm,
                                         &reqs[i - 1]));
                if (OMPI_SUCCESS != err) {
                    goto exit;
                }
            }

            err =
                ompi_request_wait_all(rsize - 1, reqs,
                                      MPI_STATUSES_IGNORE);
            if (OMPI_SUCCESS != err) {
                goto exit;
            }
        }
    } else {
        err = MCA_PML_CALL(recv(rbuf, count, dtype, root,
                                MCA_COLL_BASE_TAG_ALLREDUCE,
                                comm, MPI_STATUS_IGNORE));
    }

  exit:
    if (NULL != tmpbuf) {
        free(tmpbuf);
    }


    return err;
}
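
Both root-to-root exchanges above post the receive first, then the send, and complete the pair with a single wait_all, so neither root can block the other the way two blocking sends could. A minimal sketch of that pattern with public MPI calls follows; the function name and tag value are illustrative.

/* Sketch of a deadlock-free exchange between two peer roots:
 * post the irecv before the isend and wait on both requests. */
#include <mpi.h>

static int root_exchange(void *sendbuf, void *recvbuf, int count,
                         MPI_Datatype dtype, int remote_root, MPI_Comm comm)
{
    MPI_Request req[2];
    int rc;

    rc = MPI_Irecv(recvbuf, count, dtype, remote_root, 2001, comm, &req[0]);
    if (MPI_SUCCESS != rc) return rc;

    rc = MPI_Isend(sendbuf, count, dtype, remote_root, 2001, comm, &req[1]);
    if (MPI_SUCCESS != rc) return rc;

    return MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
}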
static int two_phase_exchange_data(mca_io_ompio_file_t *fh,
				   void *buf, struct iovec *offset_len,
				   int *send_size, int *start_pos, int *recv_size,
				   int *count, int *partial_send, 
				   int *recd_from_proc, int contig_access_count,
				   OMPI_MPI_OFFSET_TYPE min_st_offset,
				   OMPI_MPI_OFFSET_TYPE fd_size,
				   OMPI_MPI_OFFSET_TYPE *fd_start,
				   OMPI_MPI_OFFSET_TYPE *fd_end, 
				   Flatlist_node *flat_buf,
				   mca_io_ompio_access_array_t *others_req, 
				   int iter, size_t *buf_idx, 
				   MPI_Aint buftype_extent, int striping_unit,
				   int *aggregator_list)
{
  
  int i=0, j=0, k=0, tmp=0, nprocs_recv=0, nprocs_send=0;
  int ret = OMPI_SUCCESS;
  char **recv_buf = NULL;
  MPI_Request *requests=NULL;
  MPI_Datatype send_type;



#if TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif

  ret = fh->f_comm->c_coll.coll_alltoall (send_size,
					  1,
					  MPI_INT,
					  recv_size,
					  1,
					  MPI_INT,
					  fh->f_comm,
					  fh->f_comm->c_coll.coll_alltoall_module);

  if ( OMPI_SUCCESS != ret ){
    goto exit;
  }
  
  
#if DEBUG
  for (i=0; i<fh->f_size; i++){
    printf("%d: RS[%d]: %d\n", fh->f_rank,
	   i,
	   recv_size[i]);
  }
#endif

  
  nprocs_recv = 0;
  for (i=0; i < fh->f_size; i++) 
    if (recv_size[i]) nprocs_recv++;

  nprocs_send = 0;
  for (i=0; i< fh->f_size; i++) 
    if (send_size[i]) nprocs_send++;

  requests = (MPI_Request *)
    malloc((nprocs_send+nprocs_recv+1) *  sizeof(MPI_Request));
  if (NULL == requests){
    ret = OMPI_ERR_OUT_OF_RESOURCE;
    goto exit;
  }

  if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
    j = 0;
    for (i=0; i < fh->f_size; i++){
      if (recv_size[i]){
	ret = MCA_PML_CALL(irecv(((char *) buf)+ buf_idx[i],
				 recv_size[i],
				 MPI_BYTE,
				 i,
				 fh->f_rank+i+100*iter,
				 fh->f_comm,
				 requests+j));
	
	if ( OMPI_SUCCESS != ret ){
	  return ret;
	}
	j++;
	buf_idx[i] += recv_size[i];
      }
    }
  }
  else{
    
    recv_buf = (char **)malloc(fh->f_size * sizeof(char *));
    if (NULL == recv_buf){
      ret = OMPI_ERR_OUT_OF_RESOURCE;
      goto exit;
    }

    for (i=0; i < fh->f_size; i++)
      if(recv_size[i]) recv_buf[i] = 
			 (char *) malloc (recv_size[i] *  sizeof(char));
    j = 0;
    for(i=0; i<fh->f_size; i++)
      if (recv_size[i]) {
	ret = MCA_PML_CALL(irecv(recv_buf[i],
				 recv_size[i],
				 MPI_BYTE,
				 i,
				 fh->f_rank+i+100*iter,
				 fh->f_comm,
				 requests+j));
	j++;
	
      }
  }
    
  

  j = 0;
  for (i = 0; i< fh->f_size; i++){
    if (send_size[i]){
      if (partial_send[i]){
	k = start_pos[i] + count[i] - 1;
	tmp = others_req[i].lens[k];
	others_req[i].lens[k] = partial_send[i];
      }

      ompi_datatype_create_hindexed(count[i],
				    &(others_req[i].lens[start_pos[i]]),
				    &(others_req[i].mem_ptrs[start_pos[i]]),
				    MPI_BYTE,
				    &send_type);

      ompi_datatype_commit(&send_type);

      ret = MCA_PML_CALL(isend(MPI_BOTTOM,
			       1,
			       send_type,
			       i,
			       fh->f_rank+i+100*iter,
			       MCA_PML_BASE_SEND_STANDARD,
			       fh->f_comm,
			       requests+nprocs_recv+j));
      ompi_datatype_destroy(&send_type);
      
      if (partial_send[i]) others_req[i].lens[k] = tmp;
      j++;
    }
  }


  if (nprocs_recv) {

    ret = ompi_request_wait_all(nprocs_recv,
				requests, 
				MPI_STATUSES_IGNORE);
    
    
    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {

      two_phase_fill_user_buffer(fh, buf, flat_buf,
				 recv_buf, offset_len,
				 (unsigned *)recv_size, requests,
				 recd_from_proc, contig_access_count,
				 min_st_offset, fd_size, fd_start, fd_end,
				 buftype_extent, striping_unit, aggregator_list);
    }
  }

  ret = ompi_request_wait_all(nprocs_send,
			      requests+nprocs_recv, 
			      MPI_STATUSES_IGNORE);

  if (NULL != requests){
    free(requests);
    requests = NULL;
  }
  
  if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)){
    for (i=0; i< fh->f_size; i++){
      if (recv_size[i]){
	free(recv_buf[i]);
      }
    }
    free(recv_buf);
  }

#if TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += (end_rcomm_time - start_rcomm_time);
#endif

 exit:
    return ret;

}
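
On the send side above, each process's non-contiguous pieces are described with an hindexed datatype built from absolute memory addresses, so a single isend from MPI_BOTTOM can transfer them without packing. The sketch below shows the same technique with public MPI calls; the helper name and tag parameter are illustrative.

/* Sketch: send scattered memory regions in one message by describing
 * them with an hindexed datatype of absolute displacements. */
#include <mpi.h>

static int send_hindexed(int count, const int *lens, const MPI_Aint *addrs,
                         int dest, int tag, MPI_Comm comm, MPI_Request *req)
{
    MPI_Datatype send_type;
    int rc;

    rc = MPI_Type_create_hindexed(count, lens, addrs, MPI_BYTE, &send_type);
    if (MPI_SUCCESS != rc) return rc;

    rc = MPI_Type_commit(&send_type);
    if (MPI_SUCCESS != rc) {
        MPI_Type_free(&send_type);
        return rc;
    }

    /* displacements are absolute addresses, so the send starts at MPI_BOTTOM */
    rc = MPI_Isend(MPI_BOTTOM, 1, send_type, dest, tag, comm, req);

    /* safe to mark the type for deletion: the pending isend keeps it alive */
    MPI_Type_free(&send_type);
    return rc;
}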
Example #19
/* Arguments not used in this implementation:
 * - send_first
 */
static int ompi_comm_allreduce_intra_bridge (int *inbuf, int *outbuf, 
                                             int count, struct ompi_op_t *op, 
                                             ompi_communicator_t *comm,
                                             ompi_communicator_t *bcomm, 
                                             void* lleader, void* rleader,
                                             int send_first )
{
    int *tmpbuf=NULL;
    int local_rank;
    int i;
    int rc;
    int local_leader, remote_leader;
    
    local_leader  = (*((int*)lleader));
    remote_leader = (*((int*)rleader));

    if ( &ompi_mpi_op_sum.op != op && &ompi_mpi_op_prod.op != op &&
         &ompi_mpi_op_max.op != op && &ompi_mpi_op_min.op  != op ) {
        return MPI_ERR_OP;
    }
    
    local_rank = ompi_comm_rank ( comm );
    tmpbuf     = (int *) malloc ( count * sizeof(int));
    if ( NULL == tmpbuf ) {
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    /* Intercomm_create */
    rc = comm->c_coll.coll_allreduce ( inbuf, tmpbuf, count, MPI_INT,
                                       op, comm, comm->c_coll.coll_allreduce_module );
    if ( OMPI_SUCCESS != rc ) {
        goto exit;
    }

    if (local_rank == local_leader ) {
        MPI_Request req;
        
        rc = MCA_PML_CALL(irecv ( outbuf, count, MPI_INT, remote_leader,
                                 OMPI_COMM_ALLREDUCE_TAG, 
                                 bcomm, &req));
        if ( OMPI_SUCCESS != rc ) {
            goto exit;       
        }
        rc = MCA_PML_CALL(send (tmpbuf, count, MPI_INT, remote_leader, 
                               OMPI_COMM_ALLREDUCE_TAG,
                               MCA_PML_BASE_SEND_STANDARD,  bcomm));
        if ( OMPI_SUCCESS != rc ) {
            goto exit;
        }
        rc = ompi_request_wait_all ( 1, &req, MPI_STATUS_IGNORE);
        if ( OMPI_SUCCESS != rc ) {
            goto exit;
        }

        if ( &ompi_mpi_op_max.op == op ) {
            for ( i = 0 ; i < count; i++ ) {
                if (tmpbuf[i] > outbuf[i]) outbuf[i] = tmpbuf[i];
            }
        }
        else if ( &ompi_mpi_op_min.op == op ) {
            for ( i = 0 ; i < count; i++ ) {
                if (tmpbuf[i] < outbuf[i]) outbuf[i] = tmpbuf[i];
            }
        }
        else if ( &ompi_mpi_op_sum.op == op ) {
            for ( i = 0 ; i < count; i++ ) {
                outbuf[i] += tmpbuf[i];
            }
        }
        else if ( &ompi_mpi_op_prod.op == op ) {
            for ( i = 0 ; i < count; i++ ) {
                outbuf[i] *= tmpbuf[i];
            }
        }
        
    }
    
    rc = comm->c_coll.coll_bcast ( outbuf, count, MPI_INT, local_leader, 
                                   comm, comm->c_coll.coll_bcast_module );

 exit:
    if (NULL != tmpbuf ) {
        free (tmpbuf);
    }

    return (rc);
}
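
ompi_comm_allreduce_intra_bridge above reduces within the local communicator, lets the two leaders exchange and combine partial results over the bridge communicator, and finally broadcasts from the local leader. The sketch below restates that flow with public MPI calls for the int/MPI_SUM case only; the names and tag are illustrative, and the error handling is minimal.

/* Sketch of a leader-bridged allreduce (ints, sum only). Illustrative. */
#include <mpi.h>
#include <stdlib.h>

static int bridge_allreduce_sum(const int *inbuf, int *outbuf, int count,
                                MPI_Comm comm, MPI_Comm bcomm,
                                int local_leader, int remote_leader)
{
    int rank, rc;

    rc = MPI_Comm_rank(comm, &rank);
    if (MPI_SUCCESS != rc) return rc;

    /* step 1: combine the contributions inside the local group */
    rc = MPI_Reduce(inbuf, outbuf, count, MPI_INT, MPI_SUM, local_leader, comm);
    if (MPI_SUCCESS != rc) return rc;

    if (rank == local_leader) {
        int *tmp = (int *) malloc(count * sizeof(int));
        if (NULL == tmp) return MPI_ERR_NO_MEM;

        /* step 2: the leaders swap partial results over the bridge comm */
        rc = MPI_Sendrecv(outbuf, count, MPI_INT, remote_leader, 3001,
                          tmp,    count, MPI_INT, remote_leader, 3001,
                          bcomm, MPI_STATUS_IGNORE);
        if (MPI_SUCCESS == rc) {
            for (int i = 0; i < count; ++i) {
                outbuf[i] += tmp[i];        /* combine the two halves */
            }
        }
        free(tmp);
        if (MPI_SUCCESS != rc) return rc;
    }

    /* step 3: hand the combined result back to the local group */
    return MPI_Bcast(outbuf, count, MPI_INT, local_leader, comm);
}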
Example #20
/*
 *	gather_intra_linear_sync
 *
 *	Function:	- synchronized gather operation with a handshake on the first segment
 *	Accepts:	- same arguments as MPI_Gather(), first segment size
 *	Returns:	- MPI_SUCCESS or error code
 */
int
ompi_coll_base_gather_intra_linear_sync(const void *sbuf, int scount,
                                         struct ompi_datatype_t *sdtype,
                                         void *rbuf, int rcount,
                                         struct ompi_datatype_t *rdtype,
                                         int root,
                                         struct ompi_communicator_t *comm,
                                         mca_coll_base_module_t *module,
                                         int first_segment_size)
{
    int i, ret, line, rank, size, first_segment_count;
    ompi_request_t **reqs = NULL;
    MPI_Aint extent, lb;
    size_t typelng;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size));

    if (rank != root) {
        /* Non-root processes:
           - receive zero byte message from the root,
           - send the first segment of the data synchronously,
           - send the second segment of the data.
        */

        ompi_datatype_type_size(sdtype, &typelng);
        ompi_datatype_get_extent(sdtype, &lb, &extent);
        first_segment_count = scount;
        COLL_BASE_COMPUTED_SEGCOUNT( (size_t) first_segment_size, typelng,
                                      first_segment_count );

        ret = MCA_PML_CALL(recv(sbuf, 0, MPI_BYTE, root,
                                MCA_COLL_BASE_TAG_GATHER,
                                comm, MPI_STATUS_IGNORE));
        if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

        ret = MCA_PML_CALL(send(sbuf, first_segment_count, sdtype, root,
                                MCA_COLL_BASE_TAG_GATHER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

        ret = MCA_PML_CALL(send((char*)sbuf + extent * first_segment_count,
                                (scount - first_segment_count), sdtype,
                                root, MCA_COLL_BASE_TAG_GATHER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

    } else {

        /* Root process,
           - For every non-root node:
           - post irecv for the first segment of the message
           - send zero byte message to signal node to send the message
           - post irecv for the second segment of the message
           - wait for the first segment to complete
           - Copy local data if necessary
           - Waitall for all the second segments to complete.
        */
        char *ptmp;
        ompi_request_t *first_segment_req;
        reqs = (ompi_request_t**) calloc(size, sizeof(ompi_request_t*));
        if (NULL == reqs) { ret = -1; line = __LINE__; goto error_hndl; }

        ompi_datatype_type_size(rdtype, &typelng);
        ompi_datatype_get_extent(rdtype, &lb, &extent);
        first_segment_count = rcount;
        COLL_BASE_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng,
                                      first_segment_count );

        ptmp = (char *) rbuf;
        for (i = 0; i < size; ++i) {
            if (i == rank) {
                /* skip myself */
                reqs[i] = MPI_REQUEST_NULL;
                continue;
            }

            /* irecv for the first segment from i */
            ptmp = (char*)rbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * extent;
            ret = MCA_PML_CALL(irecv(ptmp, first_segment_count, rdtype, i,
                                     MCA_COLL_BASE_TAG_GATHER, comm,
                                     &first_segment_req));
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

            /* send sync message */
            ret = MCA_PML_CALL(send(rbuf, 0, MPI_BYTE, i,
                                    MCA_COLL_BASE_TAG_GATHER,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

            /* irecv for the second segment */
            ptmp = (char*)rbuf + ((ptrdiff_t)i * (ptrdiff_t)rcount + first_segment_count) * extent;
            ret = MCA_PML_CALL(irecv(ptmp, (rcount - first_segment_count),
                                     rdtype, i, MCA_COLL_BASE_TAG_GATHER, comm,
                                     &reqs[i]));
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

            /* wait on the first segment to complete */
            ret = ompi_request_wait(&first_segment_req, MPI_STATUS_IGNORE);
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
        }

        /* copy local data if necessary */
        if (MPI_IN_PLACE != sbuf) {
            ret = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype,
                                       (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent,
                                       rcount, rdtype);
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
        }

        /* wait all second segments to complete */
        ret = ompi_request_wait_all(size, reqs, MPI_STATUSES_IGNORE);
        if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

        free(reqs);
    }

    /* All done */

    return MPI_SUCCESS;
 error_hndl:
    if (NULL != reqs) {
        free(reqs);
    }
    OPAL_OUTPUT (( ompi_coll_base_framework.framework_output,
                   "ERROR_HNDL: node %d file %s line %d error %d\n",
                   rank, __FILE__, line, ret ));
    return ret;
}
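The synchronized-gather handshake above (zero-byte go-ahead from the root, first segment, then the remainder) can be reproduced with the public MPI API alone. A minimal sketch for contiguous MPI_BYTE payloads; sync_gather_bytes and seg are illustrative names, and error handling of the MPI calls is left to the default error handler:

#include <mpi.h>
#include <stdlib.h>
#include <string.h>

/* Root posts the first-segment irecv, sends a zero-byte go-ahead, posts the
 * second-segment irecv, waits for the first segment, and overlaps all the
 * second segments across peers. */
static int sync_gather_bytes(const char *sbuf, int scount, char *rbuf,
                             int root, int seg, MPI_Comm comm)
{
    int rank, size, i, rc;
    MPI_Request first, *second;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    if (seg > scount) seg = scount;

    if (rank != root) {
        /* wait for the go-ahead, then ship both segments */
        MPI_Recv(NULL, 0, MPI_BYTE, root, 0, comm, MPI_STATUS_IGNORE);
        MPI_Send(sbuf, seg, MPI_BYTE, root, 0, comm);
        MPI_Send(sbuf + seg, scount - seg, MPI_BYTE, root, 0, comm);
        return MPI_SUCCESS;
    }

    second = (MPI_Request *) malloc((size_t)size * sizeof(MPI_Request));
    if (NULL == second) return MPI_ERR_NO_MEM;

    for (i = 0; i < size; ++i) {
        char *dst = rbuf + (size_t)i * scount;
        if (i == rank) {                      /* local copy, no self-messaging */
            memcpy(dst, sbuf, (size_t)scount);
            second[i] = MPI_REQUEST_NULL;
            continue;
        }
        MPI_Irecv(dst, seg, MPI_BYTE, i, 0, comm, &first);
        MPI_Send(NULL, 0, MPI_BYTE, i, 0, comm);                  /* go-ahead */
        MPI_Irecv(dst + seg, scount - seg, MPI_BYTE, i, 0, comm, &second[i]);
        MPI_Wait(&first, MPI_STATUS_IGNORE);
    }

    rc = MPI_Waitall(size, second, MPI_STATUSES_IGNORE);
    free(second);
    return rc;
}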
static int
mca_coll_basic_neighbor_alltoallw_cart(const void *sbuf, const int scounts[], const MPI_Aint sdisps[],
                                       struct ompi_datatype_t * const sdtypes[], void *rbuf, const int rcounts[],
                                       const MPI_Aint rdisps[], struct ompi_datatype_t * const rdtypes[],
                                       struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
{
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t *) module;
    const mca_topo_base_comm_cart_2_1_0_t *cart = comm->c_topo->mtc.cart;
    const int rank = ompi_comm_rank (comm);
    int rc = MPI_SUCCESS, dim, i, nreqs;
    ompi_request_t **reqs;


    /* post receives first */
    for (dim = 0, i = 0, nreqs = 0, reqs = basic_module->mccb_reqs ; dim < cart->ndims ; ++dim, i += 2) {
        int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL;

        if (cart->dims[dim] > 1) {
            mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank);
        } else if (1 == cart->dims[dim] && cart->periods[dim]) {
            srank = drank = rank;
        }

        if (MPI_PROC_NULL != srank) {
            rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[i], rcounts[i], rdtypes[i], srank,
                                    MCA_COLL_BASE_TAG_ALLTOALL, comm, reqs++));
            if (OMPI_SUCCESS != rc) break;
            nreqs++;
        }

        if (MPI_PROC_NULL != drank) {
            rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[i+1], rcounts[i+1], rdtypes[i+1], drank,
                                    MCA_COLL_BASE_TAG_ALLTOALL, comm, reqs++));
            if (OMPI_SUCCESS != rc) break;
            nreqs++;
        }
    }

    if (OMPI_SUCCESS != rc) {
        /* should probably try to clean up here */
        return rc;
    }

    for (dim = 0, i = 0 ; dim < cart->ndims ; ++dim, i += 2) {
        int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL;

        if (cart->dims[dim] > 1) {
            mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank);
        } else if (1 == cart->dims[dim] && cart->periods[dim]) {
            srank = drank = rank;
        }

        if (MPI_PROC_NULL != srank) {
            /* remove cast from const when the pml layer is updated to take a const for the send buffer */
            rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[i], scounts[i], sdtypes[i], srank,
                                    MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, reqs++));
            if (OMPI_SUCCESS != rc) break;
            nreqs++;
        }

        if (MPI_PROC_NULL != drank) {
            rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[i+1], scounts[i+1], sdtypes[i+1], drank,
                                    MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, reqs++));
            if (OMPI_SUCCESS != rc) break;
            nreqs++;
        }
    }

    if (OMPI_SUCCESS != rc) {
        /* should probably try to clean up here */
        return rc;
    }

    return ompi_request_wait_all (nreqs, basic_module->mccb_reqs, MPI_STATUSES_IGNORE);
}
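The per-dimension neighbor discovery used above is also available through the public topology API. A minimal sketch of the same post-all-receives-then-all-sends pattern for equal-sized contiguous MPI_BYTE blocks; the per-neighbor displacement and datatype bookkeeping of the routine above is deliberately omitted:

#include <mpi.h>
#include <stdlib.h>

/* Exchange `count` bytes with the source and destination neighbor in every
 * Cartesian dimension; sbuf/rbuf hold 2 * ndims blocks of `count` bytes.
 * Communication with MPI_PROC_NULL (open boundary) completes immediately,
 * so no special-casing is needed. */
static int cart_neighbor_exchange(const char *sbuf, char *rbuf, int count,
                                  MPI_Comm cart_comm)
{
    int ndims, dim, nreqs = 0, rc;
    MPI_Request *reqs;

    MPI_Cartdim_get(cart_comm, &ndims);
    reqs = (MPI_Request *) malloc(4 * (size_t)ndims * sizeof(MPI_Request));
    if (NULL == reqs) return MPI_ERR_NO_MEM;

    /* post all receives first */
    for (dim = 0; dim < ndims; ++dim) {
        int srank, drank;
        MPI_Cart_shift(cart_comm, dim, 1, &srank, &drank);
        MPI_Irecv(rbuf + (2 * dim) * (size_t)count,     count, MPI_BYTE, srank,
                  0, cart_comm, &reqs[nreqs++]);
        MPI_Irecv(rbuf + (2 * dim + 1) * (size_t)count, count, MPI_BYTE, drank,
                  0, cart_comm, &reqs[nreqs++]);
    }

    /* then all sends */
    for (dim = 0; dim < ndims; ++dim) {
        int srank, drank;
        MPI_Cart_shift(cart_comm, dim, 1, &srank, &drank);
        MPI_Isend(sbuf + (2 * dim) * (size_t)count,     count, MPI_BYTE, srank,
                  0, cart_comm, &reqs[nreqs++]);
        MPI_Isend(sbuf + (2 * dim + 1) * (size_t)count, count, MPI_BYTE, drank,
                  0, cart_comm, &reqs[nreqs++]);
    }

    rc = MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE);
    free(reqs);
    return rc;
}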
static int two_phase_exchage_data(mca_io_ompio_file_t *fh,
				  void *buf,
				  char *write_buf,
				  struct iovec *offset_length,
				  int *send_size,int *start_pos,
				  int *recv_size,
				  OMPI_MPI_OFFSET_TYPE off,
				  OMPI_MPI_OFFSET_TYPE size, int *count,
				  int *partial_recv, int *sent_to_proc,
				  int contig_access_count,
				  OMPI_MPI_OFFSET_TYPE min_st_offset,
				  OMPI_MPI_OFFSET_TYPE fd_size,
				  OMPI_MPI_OFFSET_TYPE *fd_start,
				  OMPI_MPI_OFFSET_TYPE *fd_end,
				  Flatlist_node *flat_buf,
				  mca_io_ompio_access_array_t *others_req,
				  int *send_buf_idx, int *curr_to_proc,
				  int *done_to_proc, int iter,
				  size_t *buf_idx,MPI_Aint buftype_extent,
				  int striping_unit, int *aggregator_list,
				  int *hole){
  
    int *tmp_len=NULL, sum, *srt_len=NULL, nprocs_recv, nprocs_send,  k,i,j;
    int ret=OMPI_SUCCESS;
    MPI_Request *requests=NULL, *send_req=NULL;
    ompi_datatype_t **recv_types=NULL;
    OMPI_MPI_OFFSET_TYPE *srt_off=NULL;
    char **send_buf = NULL; 
    
#if TIME_BREAKDOWN
      start_comm_time = MPI_Wtime();
#endif
        
    ret = fh->f_comm->c_coll.coll_alltoall (recv_size,
					    1,
					    MPI_INT,
					    send_size,
					    1,
					    MPI_INT,
					    fh->f_comm,
					    fh->f_comm->c_coll.coll_alltoall_module);
    
    if ( OMPI_SUCCESS != ret ){
      return ret;
    }

    nprocs_recv = 0;
    for (i=0;i<fh->f_size;i++){
      if (recv_size[i]){
	nprocs_recv++;
      }
    }
    
    
    recv_types = (ompi_datatype_t **)
	malloc (( nprocs_recv + 1 ) * sizeof(ompi_datatype_t *));
    
    if ( NULL == recv_types ){
      ret = OMPI_ERR_OUT_OF_RESOURCE;
      goto exit;
    }

    tmp_len = (int *) malloc(fh->f_size*sizeof(int));
    
    if ( NULL == tmp_len ) {
      return OMPI_ERR_OUT_OF_RESOURCE;
    }

    j = 0;
    for (i=0;i<fh->f_size;i++){
	if (recv_size[i]) {
	    if (partial_recv[i]) {
		k = start_pos[i] + count[i] - 1;
		tmp_len[i] = others_req[i].lens[k];
		others_req[i].lens[k] = partial_recv[i];
	    }
	    ompi_datatype_create_hindexed(count[i], 
					  &(others_req[i].lens[start_pos[i]]),
					  &(others_req[i].mem_ptrs[start_pos[i]]), 
					  MPI_BYTE, recv_types+j);
	    ompi_datatype_commit(recv_types+j);
	    j++;
	}
    }

    sum = 0;
    for (i=0;i<fh->f_size;i++) sum += count[i];
    srt_off = (OMPI_MPI_OFFSET_TYPE *) 
      malloc((sum+1)*sizeof(OMPI_MPI_OFFSET_TYPE));
    
    if ( NULL == srt_off ){
      ret = OMPI_ERR_OUT_OF_RESOURCE;
      goto exit;
    }
    
    srt_len = (int *) malloc((sum+1)*sizeof(int));
    
    if ( NULL == srt_len ) {
      ret = OMPI_ERR_OUT_OF_RESOURCE;
      goto exit;
    }


    two_phase_heap_merge(others_req, count, srt_off, srt_len, start_pos, fh->f_size,fh->f_rank,  nprocs_recv, sum);


    for (i=0; i<fh->f_size; i++) 
        if (partial_recv[i]) {
            k = start_pos[i] + count[i] - 1;
            others_req[i].lens[k] = tmp_len[i];
        }
    
    if ( NULL != tmp_len ){
      free(tmp_len); 
    }

    *hole = 0;
    if (off != srt_off[0]){
	*hole = 1;
    }
    else{
	for (i=1;i<sum;i++){
	    if (srt_off[i] <= srt_off[0] + srt_len[0]){
		int new_len = srt_off[i] + srt_len[i] - srt_off[0];
		if(new_len > srt_len[0]) 
		    srt_len[0] = new_len;
	    }
	    else
		break;
	}
	if (i < sum || size != srt_len[0])
	    *hole = 1;
    }


    if ( NULL != srt_off ){
      free(srt_off);
    }
    if ( NULL != srt_len ){
      free(srt_len);
    }

    if (nprocs_recv){
	if (*hole){
	    if (off > 0){
		fh->f_io_array = (mca_io_ompio_io_array_t *)malloc 
		    (sizeof(mca_io_ompio_io_array_t));
		if (NULL == fh->f_io_array) {
		    opal_output(1, "OUT OF MEMORY\n");
		    return OMPI_ERR_OUT_OF_RESOURCE;
		}
		fh->f_io_array[0].offset  =(IOVBASE_TYPE *)(intptr_t) off;
		fh->f_num_of_io_entries = 1;
		fh->f_io_array[0].length = size;
		fh->f_io_array[0].memory_address = write_buf;
		if (fh->f_num_of_io_entries){
		    if (OMPI_SUCCESS != fh->f_fbtl->fbtl_preadv (fh, NULL)) {
			opal_output(1, "READ FAILED\n");
			return OMPI_ERROR;
		    }
		}
		
	    }
	    fh->f_num_of_io_entries = 0;
	    if (NULL != fh->f_io_array) {
		free (fh->f_io_array);
		fh->f_io_array = NULL;
	    }
	}
    }
    
    nprocs_send = 0;
    for (i=0; i <fh->f_size; i++) if (send_size[i]) nprocs_send++;

    #if DEBUG_ON
    printf("%d : nprocs_send : %d\n", fh->f_rank,nprocs_send);
    #endif

    requests = (MPI_Request *) 	
	malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request)); 

    if ( NULL == requests ){
      return OMPI_ERR_OUT_OF_RESOURCE;
    }
    
    j = 0;
    for (i=0; i<fh->f_size; i++) {
	if (recv_size[i]) {
	  ret = MCA_PML_CALL(irecv(MPI_BOTTOM,
				   1,
				   recv_types[j],
				   i,
				   fh->f_rank+i+100*iter,
				   fh->f_comm,
				   requests+j));

	  if ( OMPI_SUCCESS != ret ){
	    goto exit;
	  }
	  j++;
	}
    }
    send_req = requests + nprocs_recv;
    
    
    if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
	j = 0;
	for (i=0; i <fh->f_size; i++) 
	  if (send_size[i]) {
	    ret = MCA_PML_CALL(isend(((char *) buf) + buf_idx[i],
				     send_size[i],
				     MPI_BYTE,
				     i,
				     fh->f_rank+i+100*iter,
				     MCA_PML_BASE_SEND_STANDARD, 
				     fh->f_comm,
				     send_req+j));	

	    if ( OMPI_SUCCESS != ret ){
	      goto exit;
	    }
	    
	    j++;
	    buf_idx[i] += send_size[i];
	  }
    }
    else if(nprocs_send && (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY))){
      send_buf = (char **) malloc(fh->f_size*sizeof(char*));
      if ( NULL == send_buf ){
	ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
      }
      for (i=0; i < fh->f_size; i++){
	if (send_size[i]) {
	  send_buf[i] = (char *) malloc(send_size[i]);
	  
	  if ( NULL == send_buf[i] ){
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	  }
	}
      }
      
      ret = two_phase_fill_send_buffer(fh, buf,flat_buf, send_buf,
				       offset_length, send_size,
				       send_req,sent_to_proc,
				       contig_access_count, 
				       min_st_offset, fd_size,
				       fd_start, fd_end, send_buf_idx,
				       curr_to_proc, done_to_proc,
				       iter, buftype_extent, striping_unit,
				       aggregator_list);
      
      if ( OMPI_SUCCESS != ret ){
	goto exit;
      }
    }


    for (i=0; i<nprocs_recv; i++) ompi_datatype_destroy(recv_types+i);
    if (NULL != recv_types){
      free(recv_types);
      recv_types = NULL;
    }

    ret = ompi_request_wait_all (nprocs_send+nprocs_recv,
				 requests,
				 MPI_STATUSES_IGNORE);
    

    if ( NULL != requests ){
      free(requests);
    }

#if TIME_BREAKDOWN
      end_comm_time = MPI_Wtime();
      comm_time += (end_comm_time - start_comm_time);
#endif

 exit:    
    return ret;
}
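The receive side above lands incoming data directly in scattered regions of the write buffer by committing an hindexed datatype over absolute addresses and receiving at MPI_BOTTOM. A minimal sketch of that technique with the public API, assuming two illustrative regions:

#include <mpi.h>

/* Receive one message whose pieces land directly in two separate memory
 * regions, using an hindexed datatype anchored at MPI_BOTTOM. */
static int recv_into_regions(char *region_a, int len_a,
                             char *region_b, int len_b,
                             int source, int tag, MPI_Comm comm)
{
    int          lens[2] = { len_a, len_b };
    MPI_Aint     disps[2];
    MPI_Datatype scattered;
    int rc;

    MPI_Get_address(region_a, &disps[0]);   /* absolute addresses as displacements */
    MPI_Get_address(region_b, &disps[1]);

    MPI_Type_create_hindexed(2, lens, disps, MPI_BYTE, &scattered);
    MPI_Type_commit(&scattered);

    rc = MPI_Recv(MPI_BOTTOM, 1, scattered, source, tag, comm, MPI_STATUS_IGNORE);

    MPI_Type_free(&scattered);
    return rc;
}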
Example #23
static int
mca_coll_basic_alltoallw_intra_inplace(void *rbuf, int *rcounts, const int *rdisps,
                                       struct ompi_datatype_t * const *rdtypes,
                                       struct ompi_communicator_t *comm,
                                       mca_coll_base_module_t *module)
{
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
    int i, j, size, rank, err=MPI_SUCCESS, max_size;
    MPI_Request *preq;
    char *tmp_buffer;
    ptrdiff_t ext;

    /* Initialize. */

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    /* If only one process, we're done. */
    if (1 == size) {
        return MPI_SUCCESS;
    }

    /* Find the largest receive amount */
    for (i = 0, max_size = 0 ; i < size ; ++i) {
        ompi_datatype_type_extent (rdtypes[i], &ext);
        ext *= rcounts[i];

        max_size = ext > max_size ? ext : max_size;
    }

    /* Allocate a temporary buffer */
    tmp_buffer = calloc (max_size, 1);
    if (NULL == tmp_buffer) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* in-place alltoallw slow algorithm (but works) */
    for (i = 0 ; i < size ; ++i) {
        for (j = i+1 ; j < size ; ++j) {
            ompi_datatype_type_extent (rdtypes[j], &ext);

            /* Initiate all send/recv to/from others. */
            preq = basic_module->mccb_reqs;

            if (i == rank && rcounts[j] != 0) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtypes[j], rcounts[j],
                                                           tmp_buffer, (char *) rbuf + rdisps[j]);
                if (MPI_SUCCESS != err) { goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[j], rcounts[j], rdtypes[j],
                                          j, MCA_COLL_BASE_TAG_ALLTOALLW, comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }

                err = MCA_PML_CALL(isend ((void *) tmp_buffer,  rcounts[j], rdtypes[j],
                                          j, MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD,
                                          comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }
            } else if (j == rank && rcounts[i] != 0) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtypes[i], rcounts[i],
                                                           tmp_buffer, (char *) rbuf + rdisps[i]);
                if (MPI_SUCCESS != err) { goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[i], rcounts[i], rdtypes[i],
                                          i, MCA_COLL_BASE_TAG_ALLTOALLW, comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }

                err = MCA_PML_CALL(isend ((void *) tmp_buffer,  rcounts[i], rdtypes[i],
                                          i, MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD,
                                          comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }
            } else {
                continue;
            }

            /* Wait for the requests to complete */
            err = ompi_request_wait_all (2, basic_module->mccb_reqs, MPI_STATUSES_IGNORE);
            if (MPI_SUCCESS != err) { goto error_hndl; }

            /* Free the requests. */
            mca_coll_basic_free_reqs(basic_module->mccb_reqs, 2);
        }
    }

 error_hndl:
    /* Free the temporary buffer */
    free (tmp_buffer);

    /* All done */

    return err;
}
/*
 *	allreduce_inter
 *
 *	Function:	- allreduce using other MPI collectives
 *	Accepts:	- same as MPI_Allreduce()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_inter_allreduce_inter(void *sbuf, void *rbuf, int count,
                               struct ompi_datatype_t *dtype,
                               struct ompi_op_t *op,
                               struct ompi_communicator_t *comm,
                               mca_coll_base_module_t *module)
{
    int err, rank, root = 0, rsize;
    ptrdiff_t lb, extent;
    char *tmpbuf = NULL, *pml_buffer = NULL;
    ompi_request_t *req[2];

    rank = ompi_comm_rank(comm);
    rsize = ompi_comm_remote_size(comm);
    
    /* Perform the reduction locally */
    err = ompi_ddt_get_extent(dtype, &lb, &extent);
    if (OMPI_SUCCESS != err) {
	return OMPI_ERROR;
    }
    
    tmpbuf = (char *) malloc(count * extent);
    if (NULL == tmpbuf) {
	return OMPI_ERR_OUT_OF_RESOURCE;
    }
    pml_buffer = tmpbuf - lb;
   
    err = comm->c_local_comm->c_coll.coll_reduce(sbuf, pml_buffer, count,
						 dtype, op, root, 
						 comm->c_local_comm,
                                                 comm->c_local_comm->c_coll.coll_reduce_module);
    if (OMPI_SUCCESS != err) {
	goto exit;
    }

    if (rank == root) {
	/* Do a send-recv between the two root procs. to avoid deadlock */
        err = MCA_PML_CALL(irecv(rbuf, count, dtype, 0,
                                 MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                 &(req[0])));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = MCA_PML_CALL(isend(pml_buffer, count, dtype, 0,
                                 MCA_COLL_BASE_TAG_ALLREDUCE,
                                 MCA_PML_BASE_SEND_STANDARD,
                                 comm, &(req[1])));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = ompi_request_wait_all(2, req, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
    }

    /* bcast the message to all the local processes */
    err = comm->c_local_comm->c_coll.coll_bcast(rbuf, count, dtype, 
						root, comm->c_local_comm,
                                                comm->c_local_comm->c_coll.coll_bcast_module);
    if (OMPI_SUCCESS != err) {
            goto exit;
    }

  exit:
    if (NULL != tmpbuf) {
        free(tmpbuf);
    }

    return err;
}
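The composition used above (reduce inside the local group, exchange between the two local roots, broadcast inside the local group) can be written with public MPI calls. A minimal sketch for MPI_INT/MPI_SUM, assuming local_comm is the intracommunicator of the local group; per MPI semantics for intercommunicator reductions, each group ends up with the reduction of the remote group's contributions:

#include <mpi.h>

/* Allreduce over an intercommunicator built from three public MPI calls. */
static int inter_allreduce_sum_int(const int *sbuf, int *rbuf, int count,
                                   MPI_Comm intercomm, MPI_Comm local_comm)
{
    int rc, local_rank;

    MPI_Comm_rank(local_comm, &local_rank);

    /* Step 1: local reduction; rbuf is only significant at local rank 0. */
    rc = MPI_Reduce(sbuf, rbuf, count, MPI_INT, MPI_SUM, 0, local_comm);
    if (MPI_SUCCESS != rc) return rc;

    /* Step 2: on an intercommunicator, rank 0 addresses rank 0 of the remote
     * group; afterwards the local root holds the remote group's sum. */
    if (0 == local_rank) {
        rc = MPI_Sendrecv_replace(rbuf, count, MPI_INT, 0, 0, 0, 0,
                                  intercomm, MPI_STATUS_IGNORE);
        if (MPI_SUCCESS != rc) return rc;
    }

    /* Step 3: distribute the result inside the local group. */
    return MPI_Bcast(rbuf, count, MPI_INT, 0, local_comm);
}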
Example #25
static inline int NBC_Start_round(NBC_Handle *handle) {
  int num; /* number of operations */
  int i, res, ret=NBC_OK;
  char* ptr;
  NBC_Fn_type type;
  NBC_Args_send     sendargs; 
  NBC_Args_recv     recvargs; 
  NBC_Args_op         opargs; 
  NBC_Args_copy     copyargs; 
  NBC_Args_unpack unpackargs; 
  NBC_Schedule myschedule;
  void *buf1, *buf2, *buf3;

  /* get round-schedule address */
  myschedule = (NBC_Schedule*)((char*)*handle->schedule + handle->row_offset);
  ptr = (char*) myschedule;

  NBC_GET_BYTES(ptr,num);
  NBC_DEBUG(10, "start_round round at address %p : posting %i operations\n", myschedule, num);

  for (i=0; i<num; i++) {
    NBC_GET_BYTES(ptr,type);
    switch(type) {
      case SEND:
        NBC_DEBUG(5,"  SEND (offset %li) ", (long)ptr-(long)myschedule);
        NBC_GET_BYTES(ptr,sendargs);
        NBC_DEBUG(5,"*buf: %p, count: %i, type: %lu, dest: %i, tag: %i)\n", sendargs.buf, sendargs.count, (unsigned long)sendargs.datatype, sendargs.dest, handle->tag);
        /* get an additional request */
        handle->req_count++;
        /* get buffer */
        if(sendargs.tmpbuf) {
          buf1=(char*)handle->tmpbuf+(long)sendargs.buf;
        } else {
          buf1=sendargs.buf;
        }
#ifdef NBC_TIMING
        Isend_time -= MPI_Wtime();
#endif
        handle->req_array = (MPI_Request*)realloc((void*)handle->req_array, (handle->req_count)*sizeof(MPI_Request));
        NBC_CHECK_NULL(handle->req_array);
        res = MCA_PML_CALL(isend(buf1, sendargs.count, sendargs.datatype, sendargs.dest, handle->tag, MCA_PML_BASE_SEND_STANDARD, handle->comm, handle->req_array+handle->req_count-1));
        if(OMPI_SUCCESS != res) { printf("Error in MPI_Isend(%lu, %i, %lu, %i, %i, %lu) (%i)\n", (unsigned long)buf1, sendargs.count, (unsigned long)sendargs.datatype, sendargs.dest, handle->tag, (unsigned long)handle->comm, res); ret=res; goto error; }
#ifdef NBC_TIMING
        Isend_time += MPI_Wtime();
#endif
        break;
      case RECV:
        NBC_DEBUG(5, "  RECV (offset %li) ", (long)ptr-(long)myschedule);
        NBC_GET_BYTES(ptr,recvargs);
        NBC_DEBUG(5, "*buf: %p, count: %i, type: %lu, source: %i, tag: %i)\n", recvargs.buf, recvargs.count, (unsigned long)recvargs.datatype, recvargs.source, handle->tag);
        /* get an additional request - TODO: req_count NOT thread safe */
        handle->req_count++;
        /* get buffer */
        if(recvargs.tmpbuf) {
          buf1=(char*)handle->tmpbuf+(long)recvargs.buf;
        } else {
          buf1=recvargs.buf;
        }
#ifdef NBC_TIMING
        Irecv_time -= MPI_Wtime();
#endif
        handle->req_array = (MPI_Request*)realloc((void*)handle->req_array, (handle->req_count)*sizeof(MPI_Request));
        NBC_CHECK_NULL(handle->req_array);
        res = MCA_PML_CALL(irecv(buf1, recvargs.count, recvargs.datatype, recvargs.source, handle->tag, handle->comm, handle->req_array+handle->req_count-1)); 
        if(OMPI_SUCCESS != res) { printf("Error in MPI_Irecv(%lu, %i, %lu, %i, %i, %lu) (%i)\n", (unsigned long)buf1, recvargs.count, (unsigned long)recvargs.datatype, recvargs.source, handle->tag, (unsigned long)handle->comm, res); ret=res; goto error; }
#ifdef NBC_TIMING
        Irecv_time += MPI_Wtime();
#endif
        break;
      case OP:
        NBC_DEBUG(5, "  OP   (offset %li) ", (long)ptr-(long)myschedule);
        NBC_GET_BYTES(ptr,opargs);
        NBC_DEBUG(5, "*buf1: %p, buf2: %p, buf3: %p, count: %i, type: %lu)\n", opargs.buf1, opargs.buf2, opargs.buf3, opargs.count, (unsigned long)opargs.datatype);
        /* get buffers */
        if(opargs.tmpbuf1) {
          buf1=(char*)handle->tmpbuf+(long)opargs.buf1;
        } else {
          buf1=opargs.buf1;
        }
        if(opargs.tmpbuf2) {
          buf2=(char*)handle->tmpbuf+(long)opargs.buf2;
        } else {
          buf2=opargs.buf2;
        }
        if(opargs.tmpbuf3) {
          buf3=(char*)handle->tmpbuf+(long)opargs.buf3;
        } else {
          buf3=opargs.buf3;
        }
        ompi_3buff_op_reduce(opargs.op, buf1, buf2, buf3, opargs.count, opargs.datatype);
        break;
      case COPY:
        NBC_DEBUG(5, "  COPY   (offset %li) ", (long)ptr-(long)myschedule);
        NBC_GET_BYTES(ptr,copyargs);
        NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %lu, *tgt: %lu, tgtcount: %i, tgttype: %lu)\n", (unsigned long)copyargs.src, copyargs.srccount, (unsigned long)copyargs.srctype, (unsigned long)copyargs.tgt, copyargs.tgtcount, (unsigned long)copyargs.tgttype);
        /* get buffers */
        if(copyargs.tmpsrc) {
          buf1=(char*)handle->tmpbuf+(long)copyargs.src;
        } else {
          buf1=copyargs.src;
        }
        if(copyargs.tmptgt) {
          buf2=(char*)handle->tmpbuf+(long)copyargs.tgt;
        } else {
          buf2=copyargs.tgt;
        }
        res = NBC_Copy(buf1, copyargs.srccount, copyargs.srctype, buf2, copyargs.tgtcount, copyargs.tgttype, handle->comm);
        if(res != NBC_OK) { printf("NBC_Copy() failed (code: %i)\n", res); ret=res; goto error; }
        break;
      case UNPACK:
        NBC_DEBUG(5, "  UNPACK   (offset %li) ", (long)ptr-(long)myschedule);
        NBC_GET_BYTES(ptr,unpackargs);
        NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %lu, *tgt: %lu\n", (unsigned long)unpackargs.inbuf, unpackargs.count, (unsigned long)unpackargs.datatype, (unsigned long)unpackargs.outbuf);
        /* get buffers */
        if(unpackargs.tmpinbuf) {
          buf1=(char*)handle->tmpbuf+(long)unpackargs.inbuf;
        } else {
          buf1=unpackargs.inbuf;
        }
        if(unpackargs.tmpoutbuf) {
          buf2=(char*)handle->tmpbuf+(long)unpackargs.outbuf;
        } else {
          buf2=unpackargs.outbuf;
        }
        res = NBC_Unpack(buf1, unpackargs.count, unpackargs.datatype, buf2, handle->comm);
        if(res != NBC_OK) { printf("NBC_Unpack() failed (code: %i)\n", res); ret=res; goto error; }
        break;
      default:
        printf("NBC_Start_round: bad type %li at offset %li\n", (long)type, (long)ptr-(long)myschedule);
        ret=NBC_BAD_SCHED;
        goto error;
    }
  }

  /* Check whether we can make progress - but not in the first round; skipping
   * it there lets us return from initialization sooner and achieve more overlap.
   *
   * Threaded case: calling progress in the first round can lead to a
   * deadlock if NBC_Free is called in this round :-( */
  if(handle->row_offset != sizeof(int)) {
    res = NBC_Progress(handle);
    if((NBC_OK != res) && (NBC_CONTINUE != res)) { printf("Error in NBC_Progress() (%i)\n", res); ret=res; goto error; }
  }

error:
  return ret;
}
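NBC_GET_BYTES above is a cursor-style extraction of a fixed-size record from the flat, serialized schedule. A minimal sketch of the same idiom; record_t and decode_round are purely illustrative and do not reflect libnbc's actual schedule layout:

#include <string.h>

/* Copy a typed value out of a flat byte stream and advance the cursor. */
#define GET_BYTES(ptr, value)                       \
    do {                                            \
        memcpy(&(value), (ptr), sizeof(value));     \
        (ptr) += sizeof(value);                     \
    } while (0)

typedef struct { int dest; int count; } record_t;   /* illustrative record */

/* Read the operation count of a round and, if present, its first record. */
static int decode_round(char *cursor, record_t *first)
{
    int num;

    GET_BYTES(cursor, num);
    if (num > 0) {
        GET_BYTES(cursor, *first);
    }
    return num;
}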
Example #26
/*
 *	alltoall_inter
 *
 *	Function:	- MPI_Alltoall
 *	Accepts:	- same as MPI_Alltoall()
 *	Returns:	- MPI_SUCCESS or an MPI error code
 */
int
mca_coll_basic_alltoall_inter(const void *sbuf, int scount,
                              struct ompi_datatype_t *sdtype,
                              void *rbuf, int rcount,
                              struct ompi_datatype_t *rdtype,
                              struct ompi_communicator_t *comm,
                              mca_coll_base_module_t *module)
{
    int i;
    int size;
    int err;
    int nreqs;
    char *psnd;
    char *prcv;
    MPI_Aint lb;
    MPI_Aint sndinc;
    MPI_Aint rcvinc;

    ompi_request_t **req;
    ompi_request_t **sreq;
    ompi_request_t **rreq;

    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;

    /* Initialize. */

    size = ompi_comm_remote_size(comm);

    err = ompi_datatype_get_extent(sdtype, &lb, &sndinc);
    if (OMPI_SUCCESS != err) {
        return err;
    }
    sndinc *= scount;

    err = ompi_datatype_get_extent(rdtype, &lb, &rcvinc);
    if (OMPI_SUCCESS != err) {
        return err;
    }
    rcvinc *= rcount;

    /* Initiate all send/recv to/from others. */
    nreqs = size * 2;
    req = rreq = basic_module->mccb_reqs;
    sreq = rreq + size;

    prcv = (char *) rbuf;
    psnd = (char *) sbuf;

    /* Post all receives first */
    for (i = 0; i < size; i++, ++rreq) {
        err = MCA_PML_CALL(irecv(prcv + (i * rcvinc), rcount, rdtype, i,
                                 MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq));
        if (OMPI_SUCCESS != err) {
            return err;
        }
    }

    /* Now post all sends */
    for (i = 0; i < size; i++, ++sreq) {
        err = MCA_PML_CALL(isend(psnd + (i * sndinc), scount, sdtype, i,
                                 MCA_COLL_BASE_TAG_ALLTOALL,
                                 MCA_PML_BASE_SEND_STANDARD, comm, sreq));
        if (OMPI_SUCCESS != err) {
            return err;
        }
    }

    /* Wait for them all.  If there's an error, note that we don't
     * care what the error was -- just that there *was* an error.  The
     * PML will finish all requests, even if one or more of them fail.
     * i.e., by the end of this call, all the requests are free-able.
     * So free them anyway -- even if there was an error, and return
     * the error after we free everything. */
    err = ompi_request_wait_all(nreqs, req, MPI_STATUSES_IGNORE);

    /* All done */
    return err;
}
int
mca_coll_base_alltoallv_intra_basic_inplace(const void *rbuf, const int *rcounts, const int *rdisps,
                                            struct ompi_datatype_t *rdtype,
                                            struct ompi_communicator_t *comm,
                                            mca_coll_base_module_t *module)
{
    mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
    int i, j, size, rank, err=MPI_SUCCESS;
    ompi_request_t **preq, **reqs;
    char *allocated_buffer, *tmp_buffer;
    size_t max_size, rdtype_size;
    OPAL_PTRDIFF_TYPE ext, gap = 0;

    /* Initialize. */

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);
    ompi_datatype_type_size(rdtype, &rdtype_size);

    /* If only one process, we're done. */
    if (1 == size || 0 == rdtype_size) {
        return MPI_SUCCESS;
    }

    /* Find the largest receive amount */
    ompi_datatype_type_extent (rdtype, &ext);
    for (i = 0, max_size = 0 ; i < size ; ++i) {
        size_t size = opal_datatype_span(&rdtype->super, rcounts[i], &gap);
        max_size = size > max_size ? size : max_size;
    }
    /* The gap will always be the same as we are working on the same datatype */

    /* Allocate a temporary buffer */
    allocated_buffer = calloc (max_size, 1);
    if (NULL == allocated_buffer) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    tmp_buffer = allocated_buffer - gap;

    /* Initiate all send/recv to/from others. */
    reqs = preq = coll_base_comm_get_reqs(base_module->base_data, 2);
    if( NULL == reqs ) { err = OMPI_ERR_OUT_OF_RESOURCE; goto error_hndl; }

    /* in-place alltoallv slow algorithm (but works) */
    for (i = 0 ; i < size ; ++i) {
        for (j = i+1 ; j < size ; ++j) {
            preq = reqs;

            if (i == rank && rcounts[j]) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[j],
                                                           tmp_buffer, (char *) rbuf + rdisps[j] * ext);
                if (MPI_SUCCESS != err) { goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[j] * ext, rcounts[j], rdtype,
                                          j, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }

                err = MCA_PML_CALL(isend ((void *) tmp_buffer,  rcounts[j], rdtype,
                                          j, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD,
                                          comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }
            } else if (j == rank && rcounts[i]) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[i],
                                                           tmp_buffer, (char *) rbuf + rdisps[i] * ext);
                if (MPI_SUCCESS != err) { goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[i] * ext, rcounts[i], rdtype,
                                          i, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }

                err = MCA_PML_CALL(isend ((void *) tmp_buffer,  rcounts[i], rdtype,
                                          i, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD,
                                          comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }
            } else {
                continue;
            }

            /* Wait for the requests to complete */
            err = ompi_request_wait_all (2, reqs, MPI_STATUSES_IGNORE);
            if (MPI_SUCCESS != err) { goto error_hndl; }
        }
    }

 error_hndl:
    /* Free the temporary buffer */
    free (allocated_buffer);
    if( MPI_SUCCESS != err ) {
        ompi_coll_base_free_reqs(reqs, 2 );
    }

    /* All done */
    return err;
}
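When the counts and datatype are the same on both sides of a pair, the copy-to-scratch plus irecv/isend exchange above reduces to a single MPI_Sendrecv_replace per peer. A minimal sketch of the in-place pairwise pattern for blocks of `count` MPI_INT; it trades the nonblocking overlap of the routine above for simplicity:

#include <mpi.h>

/* In-place pairwise alltoall for blocks of `count` ints: every pair (i, j)
 * swaps block j on rank i with block i on rank j via MPI_Sendrecv_replace. */
static int alltoall_inplace_pairwise(int *buf, int count, MPI_Comm comm)
{
    int rank, size, i, j, rc = MPI_SUCCESS;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    for (i = 0; i < size && MPI_SUCCESS == rc; ++i) {
        for (j = i + 1; j < size && MPI_SUCCESS == rc; ++j) {
            if (rank == i) {
                rc = MPI_Sendrecv_replace(buf + (size_t)j * count, count, MPI_INT,
                                          j, 0, j, 0, comm, MPI_STATUS_IGNORE);
            } else if (rank == j) {
                rc = MPI_Sendrecv_replace(buf + (size_t)i * count, count, MPI_INT,
                                          i, 0, i, 0, comm, MPI_STATUS_IGNORE);
            }
        }
    }
    return rc;
}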
Example #28
/*
 *	allgather_inter
 *
 *	Function:	- allgather using other MPI collectives
 *	Accepts:	- same as MPI_Allgather()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_allgather_inter(const void *sbuf, int scount,
                               struct ompi_datatype_t *sdtype,
                               void *rbuf, int rcount,
                               struct ompi_datatype_t *rdtype,
                               struct ompi_communicator_t *comm,
                               mca_coll_base_module_t *module)
{
    int rank, root = 0, size, rsize, err, i, line;
    char *tmpbuf_free = NULL, *tmpbuf, *ptmp;
    ptrdiff_t rlb, rextent, incr;
    ptrdiff_t gap, span;
    ompi_request_t *req;
    ompi_request_t **reqs = NULL;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    rsize = ompi_comm_remote_size(comm);

    /* Algorithm:
     * - a gather to the root in the remote group (executed simultaneously in
     *   both groups, which is why we cannot use coll_gather).
     * - exchange the temporary results between the two roots
     * - inter-bcast (again simultaneous).
     */

    /* Step one: gather operations: */
    if (rank != root) {
        /* send your data to root */
        err = MCA_PML_CALL(send(sbuf, scount, sdtype, root,
                                MCA_COLL_BASE_TAG_ALLGATHER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
    } else {
        /* receive a msg. from all other procs. */
        err = ompi_datatype_get_extent(rdtype, &rlb, &rextent);
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }

        /* Get a requests arrays of the right size */
        reqs = ompi_coll_base_comm_get_reqs(module->base_data, rsize + 1);
        if( NULL == reqs ) { line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto exit; }

        /* Do a send-recv between the two root procs. to avoid deadlock */
        err = MCA_PML_CALL(isend(sbuf, scount, sdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER,
                                 MCA_PML_BASE_SEND_STANDARD,
                                 comm, &reqs[rsize]));
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }

        err = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                 &reqs[0]));
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }

        incr = rextent * rcount;
        ptmp = (char *) rbuf + incr;
        for (i = 1; i < rsize; ++i, ptmp += incr) {
            err = MCA_PML_CALL(irecv(ptmp, rcount, rdtype, i,
                                     MCA_COLL_BASE_TAG_ALLGATHER,
                                     comm, &reqs[i]));
            if (MPI_SUCCESS != err) { line = __LINE__; goto exit; }
        }

        err = ompi_request_wait_all(rsize + 1, reqs, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }

        /* Step 2: exchange the results between the root processes */
        span = opal_datatype_span(&sdtype->super, (int64_t)scount * (int64_t)size, &gap);
        tmpbuf_free = (char *) malloc(span);
        if (NULL == tmpbuf_free) { line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto exit; }
        tmpbuf = tmpbuf_free - gap;

        err = MCA_PML_CALL(isend(rbuf, rsize * rcount, rdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER,
                                 MCA_PML_BASE_SEND_STANDARD, comm, &req));
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }

        err = MCA_PML_CALL(recv(tmpbuf, size * scount, sdtype, 0,
                                MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }

        err = ompi_request_wait( &req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
    }


    /* Step 3: bcast the data to the remote group. This
     * happens in both groups simultaneously, thus we can
     * not use coll_bcast (this would deadlock).
     */
    if (rank != root) {
        /* post the recv */
        err = MCA_PML_CALL(recv(rbuf, rsize * rcount, rdtype, 0,
                                MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }

    } else {
        /* Send the data to every other process in the remote group
         * except rank zero, which has it already. */
        for (i = 1; i < rsize; i++) {
            err = MCA_PML_CALL(isend(tmpbuf, size * scount, sdtype, i,
                                     MCA_COLL_BASE_TAG_ALLGATHER,
                                     MCA_PML_BASE_SEND_STANDARD,
                                     comm, &reqs[i - 1]));
            if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
        }

        err = ompi_request_wait_all(rsize - 1, reqs, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
    }

  exit:
    if( MPI_SUCCESS != err ) {
        OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
                      __FILE__, line, err, rank) );
        (void)line;  // silence compiler warning
        if( NULL != reqs ) ompi_coll_base_free_reqs(reqs, rsize+1);
    }
    if (NULL != tmpbuf_free) {
        free(tmpbuf_free);
    }

    return err;
}
/*
 * alltoall_intra_linear_sync
 * 
 * Function:       Linear implementation of alltoall with limited number
 *                 of outstanding requests.
 * Accepts:        Same as MPI_Alltoall(), and the maximum number of 
 *                 outstanding requests (actual number is 2 * max, since
 *                 we count receive and send requests separately).
 * Returns:        MPI_SUCCESS or error code
 *
 * Description:    Algorithm is the following:
 *                 1) post K irecvs, K <= N
 *                 2) post K isends, K <= N
 *                 3) while not done
 *                    - wait for any request to complete
 *                    - replace that request by the new one of the same type.
 */
int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
                                               struct ompi_datatype_t *sdtype,
                                               void* rbuf, int rcount,
                                               struct ompi_datatype_t *rdtype,
                                               struct ompi_communicator_t *comm,
                                               mca_coll_base_module_t *module,
                                               int max_outstanding_reqs)
{
    int line, error, ri, si, rank, size, nreqs, nrreqs, nsreqs, total_reqs;
    char *psnd, *prcv;
    ptrdiff_t slb, sext, rlb, rext;

    ompi_request_t **reqs = NULL;

    if (MPI_IN_PLACE == sbuf) {
        return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
                                                            comm, module);
    }

    /* Initialize. */

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "ompi_coll_tuned_alltoall_intra_linear_sync rank %d", rank));

    error = ompi_datatype_get_extent(sdtype, &slb, &sext);
    if (OMPI_SUCCESS != error) {
        return error;
    }
    sext *= scount;

    error = ompi_datatype_get_extent(rdtype, &rlb, &rext);
    if (OMPI_SUCCESS != error) {
        return error;
    }
    rext *= rcount;

    /* simple optimization */

    psnd = ((char *) sbuf) + (ptrdiff_t)rank * sext;
    prcv = ((char *) rbuf) + (ptrdiff_t)rank * rext;

    error = ompi_datatype_sndrcv(psnd, scount, sdtype, prcv, rcount, rdtype);
    if (MPI_SUCCESS != error) {
        return error;
    }

    /* If only one process, we're done. */

    if (1 == size) {
        return MPI_SUCCESS;
    }

    /* Initiate send/recv to/from others. */
    total_reqs =  (((max_outstanding_reqs > (size - 1)) || 
                    (max_outstanding_reqs <= 0)) ?
                   (size - 1) : (max_outstanding_reqs));
    reqs = (ompi_request_t**) malloc( 2 * total_reqs * 
                                      sizeof(ompi_request_t*));
    if (NULL == reqs) { error = -1; line = __LINE__; goto error_hndl; }
    
    prcv = (char *) rbuf;
    psnd = (char *) sbuf;

    /* Post the first batch of irecv and isend requests  */
    for (nreqs = 0, nrreqs = 0, ri = (rank + 1) % size; nreqs < total_reqs; 
         ri = (ri + 1) % size, ++nreqs, ++nrreqs) {
        error =
            MCA_PML_CALL(irecv
                         (prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri,
                          MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[nreqs]));
        if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
    }
    for ( nsreqs = 0, si =  (rank + size - 1) % size; nreqs < 2 * total_reqs; 
          si = (si + size - 1) % size, ++nreqs, ++nsreqs) {
        error =
            MCA_PML_CALL(isend
                         (psnd + (ptrdiff_t)si * sext, scount, sdtype, si,
                          MCA_COLL_BASE_TAG_ALLTOALL,
                          MCA_PML_BASE_SEND_STANDARD, comm, &reqs[nreqs]));
        if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
    }

    /* Wait for requests to complete */
    if (nreqs == 2 * (size - 1)) {
        /* Optimization for the case when all requests have been posted  */
        error = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE);
        if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
       
    } else {
        /* As requests complete, replace them with new requests of the same type:
           - wait for any request to complete and mark that slot as
             MPI_REQUEST_NULL
           - if it was a receive request, replace it with a new irecv request
             (if any remain)
           - if it was a send request, replace it with a new isend request
             (if any remain)
        */
        int ncreqs = 0;
        while (ncreqs < 2 * (size - 1)) {
            int completed;
            error = ompi_request_wait_any(2 * total_reqs, reqs, &completed,
                                          MPI_STATUS_IGNORE);
            if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
            reqs[completed] = MPI_REQUEST_NULL;
            ncreqs++;
            if (completed < total_reqs) {
                if (nrreqs < (size - 1)) {
                    error = 
                        MCA_PML_CALL(irecv
                                     (prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri,
                                      MCA_COLL_BASE_TAG_ALLTOALL, comm, 
                                      &reqs[completed]));
                    if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
                    ++nrreqs;
                    ri = (ri + 1) % size;
                }
            } else {
                if (nsreqs < (size - 1)) {
                    error = MCA_PML_CALL(isend
                                         (psnd + (ptrdiff_t)si * sext, scount, sdtype, si,
                                          MCA_COLL_BASE_TAG_ALLTOALL,
                                          MCA_PML_BASE_SEND_STANDARD, comm,
                                          &reqs[completed]));
                    if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
                    ++nsreqs;
                    si = (si + size - 1) % size; 
                }
            }
        }
    }

    /* Free the reqs */
    free(reqs);

    /* All done */
    return MPI_SUCCESS;

 error_hndl:
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error, 
                 rank));
    if (NULL != reqs) free(reqs);
    return error;
}
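The replace-on-completion loop above keeps the number of outstanding requests bounded. A minimal sketch of the receive half of that idea with the public API: receive nblocks blocks of count MPI_INT from one source while keeping at most window irecvs in flight, assuming the sender posts the blocks in order with tag 0; recv_windowed is an illustrative name:

#include <mpi.h>
#include <stdlib.h>

/* Each completed receive is immediately replaced by one for the next block. */
static int recv_windowed(int *rbuf, int nblocks, int count, int window,
                         int source, MPI_Comm comm)
{
    int posted, completed = 0, idx, rc = MPI_SUCCESS;
    MPI_Request *reqs;

    if (window > nblocks) window = nblocks;
    if (window <= 0 || nblocks <= 0) return MPI_SUCCESS;

    reqs = (MPI_Request *) malloc((size_t)window * sizeof(MPI_Request));
    if (NULL == reqs) return MPI_ERR_NO_MEM;

    /* Fill the window. */
    for (posted = 0; posted < window; ++posted) {
        rc = MPI_Irecv(rbuf + (size_t)posted * count, count, MPI_INT, source,
                       0, comm, &reqs[posted]);
        if (MPI_SUCCESS != rc) goto out;
    }

    /* Drain: whenever a slot completes, repost it for the next block. */
    while (completed < nblocks) {
        rc = MPI_Waitany(window, reqs, &idx, MPI_STATUS_IGNORE);
        if (MPI_SUCCESS != rc) goto out;
        ++completed;
        if (posted < nblocks) {
            rc = MPI_Irecv(rbuf + (size_t)posted * count, count, MPI_INT, source,
                           0, comm, &reqs[idx]);
            if (MPI_SUCCESS != rc) goto out;
            ++posted;
        }
    }

 out:
    free(reqs);
    return rc;
}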
static int bcol_ptpcoll_barrier_recurs_knomial_extra_new(
                                bcol_function_args_t *input_args,
                                struct coll_ml_function_t *const_args)
{
    /* local variable */
    uint64_t sequence_number;
    int rc, tag, pair_comm_rank,
        completed, num_reqs = 2;

    mca_bcol_ptpcoll_module_t *ptpcoll_module =
                    (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;

    netpatterns_k_exchange_node_t *my_exchange_node =
                                   &ptpcoll_module->knomial_exchange_tree;

    ompi_communicator_t *comm =
                    ptpcoll_module->super.sbgp_partner_module->group_comm;

    int *extra_sources_array = my_exchange_node->rank_extra_sources_array;

    ompi_request_t **requests;
    ompi_free_list_item_t *item;

    mca_bcol_ptpcoll_collreq_t *collreq;

    OMPI_FREE_LIST_WAIT_MT(&ptpcoll_module->collreqs_free, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        PTPCOLL_ERROR(("Free list waiting failed."));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    collreq = (mca_bcol_ptpcoll_collreq_t *) item;
    input_args->bcol_opaque_data = (void *) collreq;

    requests = collreq->requests;

    /* TAG Calculation */
    sequence_number = input_args->sequence_num;

    /* Keep tag within the limit supported by the pml */
    tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);

    /* Mark this as a collective tag, to avoid conflict with user-level flags */
    tag = -tag;

    pair_comm_rank =
            ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources_array[0]];

    rc = MCA_PML_CALL(isend(
                NULL, 0, MPI_INT,
                pair_comm_rank, tag,
                MCA_PML_BASE_SEND_STANDARD,
                comm, &(requests[0])));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("ISend failed."));
        return rc;
    }

    rc = MCA_PML_CALL(irecv(
                NULL, 0, MPI_INT,
                pair_comm_rank, tag,
                comm, &(requests[1])));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("IRecv failed."));
        return rc;
    }

    /* Test for completion */
    completed =
        mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Test for all failed."));
        return rc;
    }

    if (!completed) {
        return BCOL_FN_STARTED;
    }

    OMPI_FREE_LIST_RETURN_MT(&ptpcoll_module->collreqs_free, (ompi_free_list_item_t *) collreq);
    return BCOL_FN_COMPLETE;
}
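For a rank outside the power-of-two (or k-nomial) core, the barrier reduces to a zero-byte handshake with its proxy, as in the extra-rank routine above. A minimal blocking sketch with the public API (the module keeps both operations nonblocking so the caller can poll for completion):

#include <mpi.h>

/* Barrier participation for a rank that sits outside the exchange core:
 * tell the proxy we have arrived, then wait for the proxy's release. */
static int extra_rank_barrier(int proxy_rank, int tag, MPI_Comm comm)
{
    int rc;

    rc = MPI_Send(NULL, 0, MPI_INT, proxy_rank, tag, comm);    /* arrival */
    if (MPI_SUCCESS != rc) {
        return rc;
    }

    /* release, sent by the proxy once the core exchange has finished */
    return MPI_Recv(NULL, 0, MPI_INT, proxy_rank, tag, comm, MPI_STATUS_IGNORE);
}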