Пример #1
0
int down_search(int rank, int parent, int me, int num_procs,
                int *num_children, opal_list_t *children, opal_bitmap_t *relatives)
{
    int i, bitmap, peer, hibit, mask, found;
    orte_routed_tree_t *child;
    opal_bitmap_t *relations;

    /* is this me? */
    if (me == rank) {
        bitmap = opal_cube_dim(num_procs);

        hibit = opal_hibit(rank, bitmap);
        --bitmap;

        for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
            peer = rank | mask;
            if (peer < num_procs) {
                child = OBJ_NEW(orte_routed_tree_t);
                child->vpid = peer;
                if (NULL != children) {
                    /* this is a direct child - add it to my list */
                    opal_list_append(children, &child->super);
                    (*num_children)++;
                    /* setup the relatives bitmap */
                    opal_bitmap_init(&child->relatives, num_procs);
                    /* point to the relatives */
                    relations = &child->relatives;
                } else {
                    /* we are recording someone's relatives - set the bit */
                    opal_bitmap_set_bit(relatives, peer);
                    /* point to this relations */
                    relations = relatives;
                }
                /* search for this child's relatives */
                down_search(0, 0, peer, num_procs, NULL, NULL, relations);
            }
        }
        return parent;
    }

    /* find the children of this rank */
    bitmap = opal_cube_dim(num_procs);

    hibit = opal_hibit(rank, bitmap);
    --bitmap;

    for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
        peer = rank | mask;
        if (peer < num_procs) {
            /* execute compute on this child */
            if (0 <= (found = down_search(peer, rank, me, num_procs, num_children, children, relatives))) {
                return found;
            }
        }
    }
    return -1;
}
Пример #2
0
/*
 *	bcast_log_intra
 *
 *	Function:	- broadcast using O(log(N)) algorithm
 *	Accepts:	- same arguments as MPI_Bcast()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_bcast_log_intra(void *buff, int count,
                               struct ompi_datatype_t *datatype, int root,
                               struct ompi_communicator_t *comm,
                               mca_coll_base_module_t *module)
{
    int i;
    int size;
    int rank;
    int vrank;
    int peer;
    int dim;
    int hibit;
    int mask;
    int err;
    int nreqs;
    ompi_request_t **preq;
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
    ompi_request_t **reqs = basic_module->mccb_reqs;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);
    vrank = (rank + size - root) % size;

    dim = comm->c_cube_dim;
    hibit = opal_hibit(vrank, dim);
    --dim;

    /* Receive data from parent in the tree. */

    if (vrank > 0) {
        peer = ((vrank & ~(1 << hibit)) + root) % size;

        err = MCA_PML_CALL(recv(buff, count, datatype, peer,
                                MCA_COLL_BASE_TAG_BCAST,
                                comm, MPI_STATUS_IGNORE));
        if (MPI_SUCCESS != err) {
            return err;
        }
    }

    /* Send data to the children. */

    err = MPI_SUCCESS;
    preq = reqs;
    nreqs = 0;
    for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) {
        peer = vrank | mask;
        if (peer < size) {
            peer = (peer + root) % size;
            ++nreqs;

            err = MCA_PML_CALL(isend_init(buff, count, datatype, peer,
                                          MCA_COLL_BASE_TAG_BCAST,
                                          MCA_PML_BASE_SEND_STANDARD,
                                          comm, preq++));
            if (MPI_SUCCESS != err) {
                mca_coll_basic_free_reqs(reqs, nreqs);
                return err;
            }
        }
    }

    /* Start and wait on all requests. */

    if (nreqs > 0) {

        /* Start your engines.  This will never return an error. */

        MCA_PML_CALL(start(nreqs, reqs));

        /* Wait for them all.  If there's an error, note that we don't
         * care what the error was -- just that there *was* an error.
         * The PML will finish all requests, even if one or more of them
         * fail.  i.e., by the end of this call, all the requests are
         * free-able.  So free them anyway -- even if there was an
         * error, and return the error after we free everything. */

        err = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE);

        /* Free the reqs */

        mca_coll_basic_free_reqs(reqs, nreqs);
    }

    /* All done */

    return err;
}
Пример #3
0
static int mca_oob_xcast_binomial_tree(orte_jobid_t job,
                                       bool process_first,
                                       orte_buffer_t* buffer,
                                       orte_gpr_trigger_cb_fn_t cbfunc)
{
    orte_std_cntr_t i;
    int rc;
    int tag = ORTE_RML_TAG_XCAST;
    int peer, size, rank, hibit, mask;
    orte_buffer_t rbuf, sbuf;
    orte_gpr_notify_message_t *msg;
    orte_process_name_t target;

    /* check to see if there is something to send - this is only true on the HNP end.
     * However, we cannot just test to see if we are the HNP since, if we are a singleton,
     * we are the HNP *and* we still need to handle both ends of the xcast
     */
    if (NULL != buffer) {        
        /* this is the HNP end, so it starts the procedure. Accordingly, it sends its
         * message to the first process in the job in the peer list, which takes it from there
         */
        OBJ_CONSTRUCT(&xcastmutex, opal_mutex_t);
        OPAL_THREAD_LOCK(&xcastmutex);

        target.cellid = ORTE_PROC_MY_NAME->cellid;
        target.jobid = job;
        target.vpid = 0;
        if (0 > (rc = mca_oob_send_packed(&target, buffer, tag, 0))) {
            ORTE_ERROR_LOG(rc);
            OPAL_THREAD_UNLOCK(&xcastmutex);
            OBJ_DESTRUCT(&xcastmutex);
            return rc;
        }

        OPAL_THREAD_UNLOCK(&xcastmutex);
        OBJ_DESTRUCT(&xcastmutex);

        return ORTE_SUCCESS;
        
    } 
    
    
    /* this process is one of the application procs - accordingly, it will
     * receive the message from its "parent" in the broadcast tree, and
     * then send it along to some set of children
     */
    
    /* compute the bitmap, if we haven't already done so */
    if (!bitmap_init) {
        bitmap_save = opal_cube_dim((int)orte_process_info.num_procs);
        bitmap_init = true;
    }
    
    xcast_bitmap = bitmap_save;
    rank = (int)(ORTE_PROC_MY_NAME->vpid);
    size = (int)orte_process_info.num_procs;
    
    hibit = opal_hibit(rank, xcast_bitmap);
    --xcast_bitmap;
    
    /* regardless of who we are, we first have to receive the message */
    OBJ_CONSTRUCT(&rbuf, orte_buffer_t);
    if (0 > (rc = mca_oob_recv_packed(ORTE_NAME_WILDCARD, &rbuf, tag))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&rbuf);
        return rc;
    }
    
    msg = OBJ_NEW(orte_gpr_notify_message_t);
    if (NULL == msg) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    i=1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(&rbuf, &msg, &i, ORTE_GPR_NOTIFY_MSG))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(msg);
        return rc;
    }
    OBJ_DESTRUCT(&rbuf);
    
    /* repack the message so we can send it on */
    OBJ_CONSTRUCT(&sbuf, orte_buffer_t);
    if (ORTE_SUCCESS != (rc = orte_dss.pack(&sbuf, &msg, 1, ORTE_GPR_NOTIFY_MSG))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&sbuf);
        return rc;
    }
    
    /* since the OOB contact info for our peers is in the STG1 message, we have to
     * process it BEFORE we can relay the message to any "children"
     */
    if (cbfunc != NULL && process_first) {
        /* process the message */
        cbfunc(msg);
    }
            
    /* send data to any children */
    target.cellid = ORTE_PROC_MY_NAME->cellid;
    target.jobid = ORTE_PROC_MY_NAME->jobid;
    
    for (i = hibit + 1, mask = 1 << i; i <= xcast_bitmap; ++i, mask <<= 1) {
        peer = rank | mask;
        if (peer < size) {
            target.vpid = (orte_vpid_t)peer;
            if (0 > (rc = mca_oob_send_packed(&target, &sbuf, tag, 0))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(msg);
                return rc;
            }
        }
    }
    OBJ_DESTRUCT(&sbuf);
    
    /* if it wasn't the STG1 message, then process it here */
    if (cbfunc != NULL && !process_first) {
        cbfunc(msg);
    }
    OBJ_RELEASE(msg);
    
    return ORTE_SUCCESS;
}
Пример #4
0
static int binomial_tree(int rank, int parent, int me, int num_procs,
                         int *nchildren, opal_list_t *childrn,
                         opal_bitmap_t *relatives, bool mine)
{
    int i, bitmap, peer, hibit, mask, found;
    orte_routed_tree_t *child;
    opal_bitmap_t *relations;

    OPAL_OUTPUT_VERBOSE((3, orte_routed_base_framework.framework_output,
                         "%s routed:binomial rank %d parent %d me %d num_procs %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         rank, parent, me, num_procs));

    /* is this me? */
    if (me == rank) {
        bitmap = opal_cube_dim(num_procs);

        hibit = opal_hibit(rank, bitmap);
        --bitmap;

        for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
            peer = rank | mask;
            if (peer < num_procs) {
                child = OBJ_NEW(orte_routed_tree_t);
                child->vpid = peer;
                OPAL_OUTPUT_VERBOSE((3, orte_routed_base_framework.framework_output,
                                     "%s routed:binomial %d found child %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     rank,
                                     ORTE_VPID_PRINT(child->vpid)));

                if (mine) {
                    /* this is a direct child - add it to my list */
                    opal_list_append(childrn, &child->super);
                    (*nchildren)++;
                    /* setup the relatives bitmap */
                    opal_bitmap_init(&child->relatives, num_procs);

                    /* point to the relatives */
                    relations = &child->relatives;
                } else {
                    /* we are recording someone's relatives - set the bit */
                    opal_bitmap_set_bit(relatives, peer);
                    /* point to this relations */
                    relations = relatives;
                }
                /* search for this child's relatives */
                binomial_tree(0, 0, peer, num_procs, nchildren, childrn, relations, false);
            }
        }
        return parent;
    }

    /* find the children of this rank */
    OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                         "%s routed:binomial find children of rank %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rank));
    bitmap = opal_cube_dim(num_procs);

    hibit = opal_hibit(rank, bitmap);
    --bitmap;

    for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
        peer = rank | mask;
        OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                             "%s routed:binomial find children checking peer %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), peer));
        if (peer < num_procs) {
            OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                                 "%s routed:binomial find children computing tree",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            /* execute compute on this child */
            if (0 <= (found = binomial_tree(peer, rank, me, num_procs, nchildren, childrn, relatives, mine))) {
                OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                                     "%s routed:binomial find children returning found value %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), found));
                return found;
            }
        }
    }
    return -1;
}
Пример #5
0
/** adapted from openmpi see,
    ~/openmpi-1.6.4/ompi/mca/coll/basic/coll_basic_bcast.c
    This does a binomial tree based broadcast.
*/
int MPI_Bcast_log( void *buff, int count, MPI_Datatype datatype, int root,
                   MPI_Comm comm )
{

    int i;
    int size;
    int rank;
    int vrank;
    int peer;
    int dim;
    int hibit;
    int mask;
    int err;
	int rcnt=0;
	MPI_Request *areqs = NULL;
    

	MPI_CHECK( err = MPI_Comm_rank( comm, &rank ) );
	MPI_CHECK( err = MPI_Comm_size( comm, &size ) );

    vrank = (rank + size - root) % size;
    
    dim = opal_cube_dim(size);
    hibit = opal_hibit(vrank, dim);
    --dim;
 
    /* malloc the maximum possible send request, this is the dimension
       minus the depth then one more the leaf, this might allocate a
       more slots than necessary if the number of nodes do not evenly
       divide into the tree.
    */
    NULL_CHECK( areqs = malloc( sizeof( MPI_Request ) * ( dim-(hibit+1)+1 ) ) );

    /* Receive data from parent in the tree. */
    
    if (vrank > 0) {
        peer = ((vrank & ~(1 << hibit)) + root) % size;
    
        //        printf("Getting ready to receive from %d at %d\n", peer, rank);
        MPI_CHECK( err = MPI_Recv( buff, count, datatype, peer, BCAST_TAG, comm, MPI_STATUS_IGNORE ) );

        if (MPI_SUCCESS != err) {
            return err;
        }
     }

    /* Send data to the children. */
   
    // just in case there are no children, return should be success
    err = MPI_SUCCESS; 
    for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) {
        peer = vrank | mask;
        if (peer < size) {
            peer = (peer + root) % size;
            MPI_CHECK( err = MPI_Isend( buff, count, datatype, peer, 
                                        BCAST_TAG, comm,
                                        &areqs[ rcnt ] ) );
            rcnt++;
            if (MPI_SUCCESS != err) {
                free(areqs);
                return err;
            }
        }
     }
    
    /* Wait on all requests. */

    if (rcnt > 0) {
        MPI_CHECK( err = MPI_Waitall( rcnt, areqs, MPI_STATUSES_IGNORE ) );
        /* Free the reqs */    
        free(areqs);
     }
    
    /* All done */
    
    return err;
}
/*
 *	barrier_intra_log
 *
 *	Function:	- barrier using O(log(N)) algorithm
 *	Accepts:	- same as MPI_Barrier()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_barrier_intra_log(struct ompi_communicator_t *comm,
                                 mca_coll_base_module_t *module)
{
    int i;
    int err;
    int peer;
    int dim;
    int hibit;
    int mask;
    int size = ompi_comm_size(comm);
    int rank = ompi_comm_rank(comm);

    /* Send null-messages up and down the tree.  Synchronization at the
     * root (rank 0). */

    dim = comm->c_cube_dim;
    hibit = opal_hibit(rank, dim);
    --dim;

    /* Receive from children. */

    for (i = dim, mask = 1 << i; i > hibit; --i, mask >>= 1) {
        peer = rank | mask;
        if (peer < size) {
            err = MCA_PML_CALL(recv(NULL, 0, MPI_BYTE, peer,
                                    MCA_COLL_BASE_TAG_BARRIER,
                                    comm, MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) {
                return err;
            }
        }
    }

    /* Send to and receive from parent. */

    if (rank > 0) {
        peer = rank & ~(1 << hibit);
        err =
            MCA_PML_CALL(send
                         (NULL, 0, MPI_BYTE, peer,
                          MCA_COLL_BASE_TAG_BARRIER,
                          MCA_PML_BASE_SEND_STANDARD, comm));
        if (MPI_SUCCESS != err) {
            return err;
        }

        err = MCA_PML_CALL(recv(NULL, 0, MPI_BYTE, peer,
                                MCA_COLL_BASE_TAG_BARRIER,
                                comm, MPI_STATUS_IGNORE));
        if (MPI_SUCCESS != err) {
            return err;
        }
    }

    /* Send to children. */

    for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) {
        peer = rank | mask;
        if (peer < size) {
            err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, peer,
                                    MCA_COLL_BASE_TAG_BARRIER,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != err) {
                return err;
            }
        }
    }

    /* All done */

    return MPI_SUCCESS;
}
int
ompi_coll_portals4_barrier_intra(struct ompi_communicator_t *comm,
                                 mca_coll_base_module_t *module)
{
    mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module;
    int ret, i, dim, hibit, mask, num_msgs;
    int size = ompi_comm_size(comm);
    int rank = ompi_comm_rank(comm);
    ptl_ct_event_t ct;
    ptl_handle_ct_t ct_h;
    ptl_handle_me_t me_h;
    ptl_me_t me;
    size_t count;
    ptl_match_bits_t match_bits;
    ptl_handle_md_t md_h;
    void *base;

    ompi_coll_portals4_get_md(0, &md_h, &base);

    count = opal_atomic_add_size_t(&portals4_module->barrier_count, 1);

    ret = PtlCTAlloc(mca_coll_portals4_component.ni_h,
                     &ct_h);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                            "%s:%d: PtlCTAlloc failed: %d\n",
                            __FILE__, __LINE__, ret);
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    COLL_PORTALS4_SET_BITS(match_bits, ompi_comm_get_cid(comm), 
                          0, COLL_PORTALS4_BARRIER, count);

    /* Build "tree" out of hypercube */
    dim = comm->c_cube_dim;
    hibit = opal_hibit(rank, dim);
    --dim;

    /* receive space */
    me.start = NULL;
    me.length = 0;
    me.ct_handle = ct_h;
    me.min_free = 0;
    me.uid = mca_coll_portals4_component.uid;
    me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE |
        PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE |
        PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_CT_OVERFLOW;
    me.match_id.phys.nid = PTL_NID_ANY;
    me.match_id.phys.pid = PTL_PID_ANY;
    me.match_bits = match_bits;
    me.ignore_bits = 0;
    ret = PtlMEAppend(mca_coll_portals4_component.ni_h,
                      mca_coll_portals4_component.pt_idx,
                      &me,
                      PTL_PRIORITY_LIST,
                      NULL,
                      &me_h);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                            "%s:%d: PtlMEAppend failed: %d\n",
                            __FILE__, __LINE__, ret);
        return OMPI_ERROR;
    }

    /* calculate number of children to receive from */
    num_msgs = ompi_coll_portals4_get_nchildren(dim + 1, hibit, rank, size);
    
    /* send to parent when children have sent to us */
    if (rank > 0) {
        int parent = rank & ~(1 << hibit);
        ret = PtlTriggeredPut(md_h,
                              0,
                              0,
                              PTL_NO_ACK_REQ,
                              ompi_coll_portals4_get_peer(comm, parent),
                              mca_coll_portals4_component.pt_idx,
                              match_bits,
                              0,
                              NULL,
                              0,
                              ct_h,
                              num_msgs);
        if (PTL_OK != ret) {
            opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                                "%s:%d: PtlTriggeredPut failed: %d\n",
                                __FILE__, __LINE__, ret);
            return OMPI_ERROR;
        }

        /* we'll need to wait for the parent response before the next set of comms */
        num_msgs++;
    }

    /* send to children when parent (or all children if root) has sent to us */
    for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) {
        int peer = rank | mask;
        if (peer < size) {
            ret = PtlTriggeredPut(md_h,
                                  0,
                                  0,
                                  PTL_NO_ACK_REQ,
                                  ompi_coll_portals4_get_peer(comm, peer),
                                  mca_coll_portals4_component.pt_idx,
                                  match_bits,
                                  0,
                                  NULL,
                                  0,
                                  ct_h,
                                  num_msgs);
            if (PTL_OK != ret) {
                opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                                    "%s:%d: PtlTriggeredPut failed: %d\n",
                                    __FILE__, __LINE__, ret);
                return OMPI_ERROR;
            }
        }
    }

    /* Wait for all incoming messages */
    ret = PtlCTWait(ct_h, num_msgs, &ct);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                            "%s:%d: PtlCTWait failed: %d\n",
                            __FILE__, __LINE__, ret);
        return OMPI_ERROR;
    }

    /* cleanup */
    ret = PtlMEUnlink(me_h);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                            "%s:%d: PtlMEUnlink failed: %d\n",
                            __FILE__, __LINE__, ret);
        return OMPI_ERROR;
    }
    ret = PtlCTFree(ct_h); 
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                            "%s:%d: PtlCTFree failed: %d\n",
                            __FILE__, __LINE__, ret);
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
/*
 The Binomial Spanning Tree algorithm.
 Outlay:
 The game scales with log2(NP) and uses 1 byte of memory.
 */
static int __algorithm_binomial_tree(struct oshmem_group_t *group,
                                     int PE_root,
                                     void *target,
                                     const void *source,
                                     size_t nlong,
                                     long *pSync)
{
    int rc = OSHMEM_SUCCESS;
    long value = SHMEM_SYNC_INIT;
    int root_id = oshmem_proc_group_find_id(group, PE_root);
    int my_id = oshmem_proc_group_find_id(group, group->my_pe);
    int peer_id = 0;
    int peer_pe = 0;
    int vrank;
    int dim = opal_cube_dim(group->proc_count);
    int hibit;
    int mask;
    int i = 0;

    SCOLL_VERBOSE(12, "[#%d] Broadcast algorithm: Tree", group->my_pe);
    SCOLL_VERBOSE(15,
                  "[#%d] pSync[0] = %ld root = #%d",
                  group->my_pe, pSync[0], PE_root);

    vrank = (my_id + group->proc_count - root_id) % group->proc_count;
    hibit = opal_hibit(vrank, dim);

    SCOLL_VERBOSE(15,
                  "[#%d] dim = %d vrank = %d hibit = %d",
                  group->my_pe, dim, vrank, hibit);

    dim--;

    pSync[0] = SHMEM_SYNC_READY;
    /* Receive data from parent in the tree. */
    if (vrank > 0) {
        value = SHMEM_SYNC_READY;

        SCOLL_VERBOSE(14, "[#%d] wait", group->my_pe);
        rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_NE, (void*)&value, SHMEM_LONG));
        while ((value = pSync[0]) < 0) {
            SCOLL_VERBOSE(14,
                          "[#%d] Broadcast size is a negative value (%li)\n",
                          group->my_pe, pSync[0]);
            MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_NE, (void*)&value, SHMEM_LONG));
        }
        if (OSHMEM_SUCCESS != rc) {
            return rc;
        }
        nlong = (size_t) pSync[0];
    }

    /* Send data to the children. */
    for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) {
        peer_id = vrank | mask;

        if (peer_id < group->proc_count) {
            /* Wait for the child to be ready to receive (pSync must have the initial value) */
            peer_id = (peer_id + root_id) % group->proc_count;
            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);

            SCOLL_VERBOSE(14,
                          "[#%d] check remote pe is ready to receive #%d",
                          group->my_pe, peer_pe);
            do {
                rc = MCA_SPML_CALL(get((void*)pSync, sizeof(long), (void*)pSync, peer_pe));
            } while ((OSHMEM_SUCCESS == rc) && (pSync[0] != SHMEM_SYNC_READY));

            SCOLL_VERBOSE(14, "[#%d] send data to #%d", group->my_pe, peer_pe);
            rc = MCA_SPML_CALL(put(target, nlong, (my_id == root_id ? (void *)source : target), peer_pe));

            MCA_SPML_CALL(fence());

            SCOLL_VERBOSE(14, "[#%d] signals to #%d", group->my_pe, peer_pe);
            value = nlong;
            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
            if (OSHMEM_SUCCESS != rc) {
                break;
            }
        }
    }

    return rc;
}