Example #1
int down_search(int rank, int parent, int me, int num_procs,
                int *num_children, opal_list_t *children, opal_bitmap_t *relatives)
{
    int i, bitmap, peer, hibit, mask, found;
    orte_routed_tree_t *child;
    opal_bitmap_t *relations;

    /* is this me? */
    if (me == rank) {
        bitmap = opal_cube_dim(num_procs);

        hibit = opal_hibit(rank, bitmap);
        --bitmap;

        for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
            peer = rank | mask;
            if (peer < num_procs) {
                child = OBJ_NEW(orte_routed_tree_t);
                child->vpid = peer;
                if (NULL != children) {
                    /* this is a direct child - add it to my list */
                    opal_list_append(children, &child->super);
                    (*num_children)++;
                    /* setup the relatives bitmap */
                    opal_bitmap_init(&child->relatives, num_procs);
                    /* point to the relatives */
                    relations = &child->relatives;
                } else {
                    /* we are recording someone's relatives - set the bit */
                    opal_bitmap_set_bit(relatives, peer);
                    /* point to these relations */
                    relations = relatives;
                }
                /* search for this child's relatives */
                down_search(0, 0, peer, num_procs, NULL, NULL, relations);
            }
        }
        return parent;
    }

    /* find the children of this rank */
    bitmap = opal_cube_dim(num_procs);

    hibit = opal_hibit(rank, bitmap);
    --bitmap;

    for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
        peer = rank | mask;
        if (peer < num_procs) {
            /* execute compute on this child */
            if (0 <= (found = down_search(peer, rank, me, num_procs, num_children, children, relatives))) {
                return found;
            }
        }
    }
    return -1;
}
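The loop shared by all of these examples enumerates binomial-tree children: each rank's children are rank | (1 << i) for every bit position i above the rank's own highest set bit, kept only if the result is a valid rank. Below is a minimal standalone sketch of that pattern; cube_dim() and hibit() are local stand-ins written for illustration (assuming the usual ceil-log2 / highest-set-bit semantics), not the OPAL implementations.

#include <stdio.h>

/* Stand-in for opal_cube_dim(): smallest d such that (1 << d) >= n. */
static int cube_dim(int n)
{
    int dim = 0, size = 1;
    while (size < n) {
        ++dim;
        size <<= 1;
    }
    return dim;
}

/* Stand-in for opal_hibit(): highest set bit of value below position start, or -1. */
static int hibit(int value, int start)
{
    int bit;
    for (bit = start - 1; bit >= 0; --bit) {
        if (value & (1 << bit)) {
            break;
        }
    }
    return bit;
}

int main(void)
{
    int num_procs = 6;
    int dim = cube_dim(num_procs);      /* 3, since 2^3 >= 6 */
    int rank, i, mask, peer;

    /* Print each rank's children in the binomial tree over num_procs ranks. */
    for (rank = 0; rank < num_procs; ++rank) {
        printf("rank %d children:", rank);
        for (i = hibit(rank, dim) + 1, mask = 1 << i; i <= dim - 1; ++i, mask <<= 1) {
            peer = rank | mask;
            if (peer < num_procs) {
                printf(" %d", peer);
            }
        }
        printf("\n");
    }
    return 0;
}

For num_procs = 6 this prints children {1, 2, 4} for rank 0, {3, 5} for rank 1, and no children for ranks 2 through 5, which is the layout the functions above build.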
Example #2
ompi_communicator_t *ompi_comm_allocate ( int local_size, int remote_size )
{
    ompi_communicator_t *new_comm;

    /* create new communicator element */
    new_comm = OBJ_NEW(ompi_communicator_t);
    new_comm->c_local_group = ompi_group_allocate ( local_size );
    if ( 0 < remote_size ) {
        new_comm->c_remote_group = ompi_group_allocate (remote_size);
        new_comm->c_flags |= OMPI_COMM_INTER;
    } else {
        /* 
         * simplifies some operations (e.g. p2p), if 
         * we can always use the remote group 
         */
        new_comm->c_remote_group = new_comm->c_local_group;
        OBJ_RETAIN(new_comm->c_remote_group);
    }

    /* fill in the inscribing hyper-cube dimensions */
    new_comm->c_cube_dim = opal_cube_dim(local_size);

    return new_comm;
}
Example #3
static int mca_oob_xcast_binomial_tree(orte_jobid_t job,
                                       bool process_first,
                                       orte_buffer_t* buffer,
                                       orte_gpr_trigger_cb_fn_t cbfunc)
{
    orte_std_cntr_t i;
    int rc;
    int tag = ORTE_RML_TAG_XCAST;
    int peer, size, rank, hibit, mask;
    orte_buffer_t rbuf, sbuf;
    orte_gpr_notify_message_t *msg;
    orte_process_name_t target;

    /* check to see if there is something to send - this is only true on the HNP end.
     * However, we cannot just test to see if we are the HNP since, if we are a singleton,
     * we are the HNP *and* we still need to handle both ends of the xcast
     */
    if (NULL != buffer) {        
        /* this is the HNP end, so it starts the procedure. Accordingly, it sends its
         * message to the first process in the job in the peer list, which takes it from there
         */
        OBJ_CONSTRUCT(&xcastmutex, opal_mutex_t);
        OPAL_THREAD_LOCK(&xcastmutex);

        target.cellid = ORTE_PROC_MY_NAME->cellid;
        target.jobid = job;
        target.vpid = 0;
        if (0 > (rc = mca_oob_send_packed(&target, buffer, tag, 0))) {
            ORTE_ERROR_LOG(rc);
            OPAL_THREAD_UNLOCK(&xcastmutex);
            OBJ_DESTRUCT(&xcastmutex);
            return rc;
        }

        OPAL_THREAD_UNLOCK(&xcastmutex);
        OBJ_DESTRUCT(&xcastmutex);

        return ORTE_SUCCESS;
        
    } 
    
    
    /* this process is one of the application procs - accordingly, it will
     * receive the message from its "parent" in the broadcast tree, and
     * then send it along to some set of children
     */
    
    /* compute the bitmap, if we haven't already done so */
    if (!bitmap_init) {
        bitmap_save = opal_cube_dim((int)orte_process_info.num_procs);
        bitmap_init = true;
    }
    
    xcast_bitmap = bitmap_save;
    rank = (int)(ORTE_PROC_MY_NAME->vpid);
    size = (int)orte_process_info.num_procs;
    
    hibit = opal_hibit(rank, xcast_bitmap);
    --xcast_bitmap;
    
    /* regardless of who we are, we first have to receive the message */
    OBJ_CONSTRUCT(&rbuf, orte_buffer_t);
    if (0 > (rc = mca_oob_recv_packed(ORTE_NAME_WILDCARD, &rbuf, tag))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&rbuf);
        return rc;
    }
    
    msg = OBJ_NEW(orte_gpr_notify_message_t);
    if (NULL == msg) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    i=1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(&rbuf, &msg, &i, ORTE_GPR_NOTIFY_MSG))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(msg);
        return rc;
    }
    OBJ_DESTRUCT(&rbuf);
    
    /* repack the message so we can send it on */
    OBJ_CONSTRUCT(&sbuf, orte_buffer_t);
    if (ORTE_SUCCESS != (rc = orte_dss.pack(&sbuf, &msg, 1, ORTE_GPR_NOTIFY_MSG))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&sbuf);
        return rc;
    }
    
    /* since the OOB contact info for our peers is in the STG1 message, we have to
     * process it BEFORE we can relay the message to any "children"
     */
    if (cbfunc != NULL && process_first) {
        /* process the message */
        cbfunc(msg);
    }
            
    /* send data to any children */
    target.cellid = ORTE_PROC_MY_NAME->cellid;
    target.jobid = ORTE_PROC_MY_NAME->jobid;
    
    for (i = hibit + 1, mask = 1 << i; i <= xcast_bitmap; ++i, mask <<= 1) {
        peer = rank | mask;
        if (peer < size) {
            target.vpid = (orte_vpid_t)peer;
            if (0 > (rc = mca_oob_send_packed(&target, &sbuf, tag, 0))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(msg);
                return rc;
            }
        }
    }
    OBJ_DESTRUCT(&sbuf);
    
    /* if it wasn't the STG1 message, then process it here */
    if (cbfunc != NULL && !process_first) {
        cbfunc(msg);
    }
    OBJ_RELEASE(msg);
    
    return ORTE_SUCCESS;
}
Example #4
static int binomial_tree(int rank, int parent, int me, int num_procs,
                         int *nchildren, opal_list_t *childrn,
                         opal_bitmap_t *relatives, bool mine)
{
    int i, bitmap, peer, hibit, mask, found;
    orte_routed_tree_t *child;
    opal_bitmap_t *relations;

    OPAL_OUTPUT_VERBOSE((3, orte_routed_base_framework.framework_output,
                         "%s routed:binomial rank %d parent %d me %d num_procs %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         rank, parent, me, num_procs));

    /* is this me? */
    if (me == rank) {
        bitmap = opal_cube_dim(num_procs);

        hibit = opal_hibit(rank, bitmap);
        --bitmap;

        for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
            peer = rank | mask;
            if (peer < num_procs) {
                child = OBJ_NEW(orte_routed_tree_t);
                child->vpid = peer;
                OPAL_OUTPUT_VERBOSE((3, orte_routed_base_framework.framework_output,
                                     "%s routed:binomial %d found child %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     rank,
                                     ORTE_VPID_PRINT(child->vpid)));

                if (mine) {
                    /* this is a direct child - add it to my list */
                    opal_list_append(childrn, &child->super);
                    (*nchildren)++;
                    /* setup the relatives bitmap */
                    opal_bitmap_init(&child->relatives, num_procs);

                    /* point to the relatives */
                    relations = &child->relatives;
                } else {
                    /* we are recording someone's relatives - set the bit */
                    opal_bitmap_set_bit(relatives, peer);
                    /* point to these relations */
                    relations = relatives;
                }
                /* search for this child's relatives */
                binomial_tree(0, 0, peer, num_procs, nchildren, childrn, relations, false);
            }
        }
        return parent;
    }

    /* find the children of this rank */
    OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                         "%s routed:binomial find children of rank %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rank));
    bitmap = opal_cube_dim(num_procs);

    hibit = opal_hibit(rank, bitmap);
    --bitmap;

    for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
        peer = rank | mask;
        OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                             "%s routed:binomial find children checking peer %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), peer));
        if (peer < num_procs) {
            OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                                 "%s routed:binomial find children computing tree",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            /* execute compute on this child */
            if (0 <= (found = binomial_tree(peer, rank, me, num_procs, nchildren, childrn, relatives, mine))) {
                OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                                     "%s routed:binomial find children returning found value %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), found));
                return found;
            }
        }
    }
    return -1;
}
Example #5
/** adapted from openmpi see,
    ~/openmpi-1.6.4/ompi/mca/coll/basic/coll_basic_bcast.c
    This does a binomial tree based broadcast.
*/
int MPI_Bcast_log( void *buff, int count, MPI_Datatype datatype, int root,
                   MPI_Comm comm )
{

    int i;
    int size;
    int rank;
    int vrank;
    int peer;
    int dim;
    int hibit;
    int mask;
    int err;
    int rcnt = 0;
    MPI_Request *areqs = NULL;

    MPI_CHECK( err = MPI_Comm_rank( comm, &rank ) );
    MPI_CHECK( err = MPI_Comm_size( comm, &size ) );

    vrank = (rank + size - root) % size;
    
    dim = opal_cube_dim(size);
    hibit = opal_hibit(vrank, dim);
    --dim;
 
    /* Allocate the maximum possible number of send requests: one per bit
       position between this node's highest set bit and the top of the tree
       (dim - hibit slots).  This may allocate more slots than necessary when
       the number of processes does not fill the tree evenly. */
    NULL_CHECK( areqs = malloc( sizeof( MPI_Request ) * ( dim-(hibit+1)+1 ) ) );

    /* Receive data from parent in the tree. */
    
    if (vrank > 0) {
        peer = ((vrank & ~(1 << hibit)) + root) % size;
    
        //        printf("Getting ready to receive from %d at %d\n", peer, rank);
        MPI_CHECK( err = MPI_Recv( buff, count, datatype, peer, BCAST_TAG, comm, MPI_STATUS_IGNORE ) );

        if (MPI_SUCCESS != err) {
            free(areqs);   /* avoid leaking the request array on the error path */
            return err;
        }
    }

    /* Send data to the children. */
   
    // just in case there are no children, return should be success
    err = MPI_SUCCESS; 
    for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) {
        peer = vrank | mask;
        if (peer < size) {
            peer = (peer + root) % size;
            MPI_CHECK( err = MPI_Isend( buff, count, datatype, peer, 
                                        BCAST_TAG, comm,
                                        &areqs[ rcnt ] ) );
            rcnt++;
            if (MPI_SUCCESS != err) {
                free(areqs);
                return err;
            }
        }
     }
    
    /* Wait on all requests. */

    if (rcnt > 0) {
        MPI_CHECK( err = MPI_Waitall( rcnt, areqs, MPI_STATUSES_IGNORE ) );
    }
    /* Free the request array (even when this rank had no children). */
    free(areqs);
    
    /* All done */
    
    return err;
}
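A possible driver for the routine above, shown only as a sketch: BCAST_TAG, MPI_CHECK, and NULL_CHECK are macros from the surrounding project and are not defined here, and the driver relies solely on the MPI_Bcast_log signature as given.

#include <mpi.h>
#include <stdio.h>

/* Provided by the surrounding project; signature as shown above. */
int MPI_Bcast_log( void *buff, int count, MPI_Datatype datatype, int root,
                   MPI_Comm comm );

int main(int argc, char **argv)
{
    int rank;
    int value = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (0 == rank) {
        value = 42;                          /* the root supplies the payload */
    }

    /* Broadcast one int from rank 0 along the binomial tree. */
    MPI_Bcast_log(&value, 1, MPI_INT, 0, MPI_COMM_WORLD);

    printf("rank %d received %d\n", rank, value);

    MPI_Finalize();
    return 0;
}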
Example #6
/*
 * Initialize comm world/self/null/parent.
 */
int ompi_comm_init(void)
{
    ompi_group_t *group;
    size_t size;

    /* Setup communicator array */
    OBJ_CONSTRUCT(&ompi_mpi_communicators, opal_pointer_array_t); 
    if( OPAL_SUCCESS != opal_pointer_array_init(&ompi_mpi_communicators, 0,
                                                OMPI_FORTRAN_HANDLE_MAX, 64) ) {
        return OMPI_ERROR;
    }

    /* Setup MPI_COMM_WORLD */
    OBJ_CONSTRUCT(&ompi_mpi_comm_world, ompi_communicator_t);
    group = OBJ_NEW(ompi_group_t);
    group->grp_proc_pointers = ompi_proc_world(&size);
    group->grp_proc_count    = (int)size;
    OMPI_GROUP_SET_INTRINSIC (group);
    OMPI_GROUP_SET_DENSE (group);
    ompi_set_group_rank(group, ompi_proc_local());
    ompi_group_increment_proc_count (group);

    ompi_mpi_comm_world.comm.c_contextid    = 0;
    ompi_mpi_comm_world.comm.c_id_start_index = 4;
    ompi_mpi_comm_world.comm.c_id_available = 4;
    ompi_mpi_comm_world.comm.c_f_to_c_index = 0;
    ompi_mpi_comm_world.comm.c_my_rank      = group->grp_my_rank;
    ompi_mpi_comm_world.comm.c_local_group  = group;
    ompi_mpi_comm_world.comm.c_remote_group = group;
    OBJ_RETAIN(ompi_mpi_comm_world.comm.c_remote_group);
    ompi_mpi_comm_world.comm.c_cube_dim     = opal_cube_dim((int)size);
    ompi_mpi_comm_world.comm.error_handler  = &ompi_mpi_errors_are_fatal.eh;
    OBJ_RETAIN( &ompi_mpi_errors_are_fatal.eh );
    OMPI_COMM_SET_PML_ADDED(&ompi_mpi_comm_world.comm);
    opal_pointer_array_set_item (&ompi_mpi_communicators, 0, &ompi_mpi_comm_world);

    MEMCHECKER (memset (ompi_mpi_comm_world.comm.c_name, 0, MPI_MAX_OBJECT_NAME));
    strncpy (ompi_mpi_comm_world.comm.c_name, "MPI_COMM_WORLD",
             strlen("MPI_COMM_WORLD")+1 );
    ompi_mpi_comm_world.comm.c_flags |= OMPI_COMM_NAMEISSET;
    ompi_mpi_comm_world.comm.c_flags |= OMPI_COMM_INTRINSIC;

    /* We have to create a hash (although it is legal to leave this
       field NULL -- the attribute accessor functions will interpret
       this as "there are no attributes cached on this object")
       because MPI_COMM_WORLD has some predefined attributes. */
    ompi_attr_hash_init(&ompi_mpi_comm_world.comm.c_keyhash);

    /* Setup MPI_COMM_SELF */
    OBJ_CONSTRUCT(&ompi_mpi_comm_self, ompi_communicator_t);
    group = OBJ_NEW(ompi_group_t);
    group->grp_proc_pointers = ompi_proc_self(&size);
    group->grp_my_rank       = 0;
    group->grp_proc_count    = (int)size;
    OMPI_GROUP_SET_INTRINSIC (group);
    OMPI_GROUP_SET_DENSE (group);
    
    ompi_mpi_comm_self.comm.c_contextid    = 1;
    ompi_mpi_comm_self.comm.c_f_to_c_index = 1;
    ompi_mpi_comm_self.comm.c_id_start_index = 20;
    ompi_mpi_comm_self.comm.c_id_available = 20;
    ompi_mpi_comm_self.comm.c_my_rank      = group->grp_my_rank;
    ompi_mpi_comm_self.comm.c_local_group  = group;
    ompi_mpi_comm_self.comm.c_remote_group = group;
    OBJ_RETAIN(ompi_mpi_comm_self.comm.c_remote_group);
    ompi_mpi_comm_self.comm.error_handler  = &ompi_mpi_errors_are_fatal.eh;
    OBJ_RETAIN( &ompi_mpi_errors_are_fatal.eh );
    OMPI_COMM_SET_PML_ADDED(&ompi_mpi_comm_self.comm);
    opal_pointer_array_set_item (&ompi_mpi_communicators, 1, &ompi_mpi_comm_self);

    MEMCHECKER (memset (ompi_mpi_comm_self.comm.c_name, 0, MPI_MAX_OBJECT_NAME));
    strncpy(ompi_mpi_comm_self.comm.c_name,"MPI_COMM_SELF",strlen("MPI_COMM_SELF")+1);
    ompi_mpi_comm_self.comm.c_flags |= OMPI_COMM_NAMEISSET;
    ompi_mpi_comm_self.comm.c_flags |= OMPI_COMM_INTRINSIC;

    /* We can set MPI_COMM_SELF's keyhash to NULL because it has no
       predefined attributes.  If a user defines an attribute on
       MPI_COMM_SELF, the keyhash will automatically be created. */
    ompi_mpi_comm_self.comm.c_keyhash = NULL;

    /* Setup MPI_COMM_NULL */
    OBJ_CONSTRUCT(&ompi_mpi_comm_null, ompi_communicator_t);
    ompi_mpi_comm_null.comm.c_local_group  = &ompi_mpi_group_null.group;
    ompi_mpi_comm_null.comm.c_remote_group = &ompi_mpi_group_null.group;
    OBJ_RETAIN(&ompi_mpi_group_null.group); 
    OBJ_RETAIN(&ompi_mpi_group_null.group);

    ompi_mpi_comm_null.comm.c_contextid    = 2;
    ompi_mpi_comm_null.comm.c_f_to_c_index = 2;
    ompi_mpi_comm_null.comm.c_my_rank      = MPI_PROC_NULL;

    ompi_mpi_comm_null.comm.error_handler  = &ompi_mpi_errors_are_fatal.eh;
    OBJ_RETAIN( &ompi_mpi_errors_are_fatal.eh );
    opal_pointer_array_set_item (&ompi_mpi_communicators, 2, &ompi_mpi_comm_null);

    MEMCHECKER (memset (ompi_mpi_comm_null.comm.c_name, 0, MPI_MAX_OBJECT_NAME));
    strncpy(ompi_mpi_comm_null.comm.c_name,"MPI_COMM_NULL",strlen("MPI_COMM_NULL")+1);
    ompi_mpi_comm_null.comm.c_flags |= OMPI_COMM_NAMEISSET;
    ompi_mpi_comm_null.comm.c_flags |= OMPI_COMM_INTRINSIC;

    /* Initialize the parent communicator to MPI_COMM_NULL */
    ompi_mpi_comm_parent = &ompi_mpi_comm_null.comm;
    OBJ_RETAIN(&ompi_mpi_comm_null);
    OBJ_RETAIN(&ompi_mpi_group_null.group);
    OBJ_RETAIN(&ompi_mpi_errors_are_fatal.eh);

    /* initialize the comm_reg stuff for multi-threaded comm_cid
       allocation */
    ompi_comm_reg_init();

    return OMPI_SUCCESS;
}
Example #7
/*
 The Binomial Spanning Tree algorithm.
 Outlay:
 The algorithm scales with log2(NP) and uses 1 byte of memory.
 */
static int __algorithm_binomial_tree(struct oshmem_group_t *group,
                                     int PE_root,
                                     void *target,
                                     const void *source,
                                     size_t nlong,
                                     long *pSync)
{
    int rc = OSHMEM_SUCCESS;
    long value = SHMEM_SYNC_INIT;
    int root_id = oshmem_proc_group_find_id(group, PE_root);
    int my_id = oshmem_proc_group_find_id(group, group->my_pe);
    int peer_id = 0;
    int peer_pe = 0;
    int vrank;
    int dim = opal_cube_dim(group->proc_count);
    int hibit;
    int mask;
    int i = 0;

    SCOLL_VERBOSE(12, "[#%d] Broadcast algorithm: Tree", group->my_pe);
    SCOLL_VERBOSE(15,
                  "[#%d] pSync[0] = %ld root = #%d",
                  group->my_pe, pSync[0], PE_root);

    vrank = (my_id + group->proc_count - root_id) % group->proc_count;
    hibit = opal_hibit(vrank, dim);

    SCOLL_VERBOSE(15,
                  "[#%d] dim = %d vrank = %d hibit = %d",
                  group->my_pe, dim, vrank, hibit);

    dim--;

    pSync[0] = SHMEM_SYNC_READY;
    /* Receive data from parent in the tree. */
    if (vrank > 0) {
        value = SHMEM_SYNC_READY;

        SCOLL_VERBOSE(14, "[#%d] wait", group->my_pe);
        rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_NE, (void*)&value, SHMEM_LONG));
        while ((value = pSync[0]) < 0) {
            SCOLL_VERBOSE(14,
                          "[#%d] Broadcast size is a negative value (%li)\n",
                          group->my_pe, pSync[0]);
            MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_NE, (void*)&value, SHMEM_LONG));
        }
        if (OSHMEM_SUCCESS != rc) {
            return rc;
        }
        nlong = (size_t) pSync[0];
    }

    /* Send data to the children. */
    for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) {
        peer_id = vrank | mask;

        if (peer_id < group->proc_count) {
            /* Wait for the child to be ready to receive (pSync must have the initial value) */
            peer_id = (peer_id + root_id) % group->proc_count;
            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);

            SCOLL_VERBOSE(14,
                          "[#%d] check remote pe is ready to receive #%d",
                          group->my_pe, peer_pe);
            do {
                rc = MCA_SPML_CALL(get((void*)pSync, sizeof(long), (void*)pSync, peer_pe));
            } while ((OSHMEM_SUCCESS == rc) && (pSync[0] != SHMEM_SYNC_READY));

            SCOLL_VERBOSE(14, "[#%d] send data to #%d", group->my_pe, peer_pe);
            rc = MCA_SPML_CALL(put(target, nlong, (my_id == root_id ? (void *)source : target), peer_pe));

            MCA_SPML_CALL(fence());

            SCOLL_VERBOSE(14, "[#%d] signals to #%d", group->my_pe, peer_pe);
            value = nlong;
            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
            if (OSHMEM_SUCCESS != rc) {
                break;
            }
        }
    }

    return rc;
}