Example #1
static int
_process_name_compare(const opal_process_name_t p1, const opal_process_name_t p2)
{
    ompi_process_name_t* o1 = (ompi_process_name_t*)&p1;
    ompi_process_name_t* o2 = (ompi_process_name_t*)&p2;
    return ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, o1, o2);
}
/* Find endpoint for specific subnet/lid/message */
static mca_btl_openib_endpoint_t* xoob_find_endpoint(ompi_process_name_t* process_name,
        uint64_t subnet_id, uint16_t lid, uint8_t message_type)
{
    size_t i;
    mca_btl_openib_proc_t *ib_proc;
    mca_btl_openib_endpoint_t *ib_endpoint = NULL;
    bool found = false;

    BTL_VERBOSE(("Searching for ep and proc with follow parameters:"
                "jobid %d, vpid %d, "
                "sid %" PRIx64 ", lid %d",
                process_name->jobid, process_name->vpid,
                subnet_id, lid));


    /* find ibproc */
    OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
    for (ib_proc = (mca_btl_openib_proc_t*)
            opal_list_get_first(&mca_btl_openib_component.ib_procs);
            ib_proc != (mca_btl_openib_proc_t*)
            opal_list_get_end(&mca_btl_openib_component.ib_procs);
            ib_proc  = (mca_btl_openib_proc_t*)opal_list_get_next(ib_proc)) {
        if (OPAL_EQUAL == ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL,
                    &ib_proc->proc_ompi->proc_name, process_name)) {
            found = true;
            break;
        }
    }
    /* we found our ib_proc, lets find endpoint now */
    if (found) {
        for (i = 0; i < ib_proc->proc_endpoint_count; i++) {
            ib_endpoint = ib_proc->proc_endpoints[i];
            /* we need to check different
             * lid for different message type */
            if (ENDPOINT_XOOB_CONNECT_RESPONSE == message_type ||
                    ENDPOINT_XOOB_CONNECT_XRC_RESPONSE == message_type) {
                /* response message */
                if (ib_endpoint->subnet_id == subnet_id &&
                        ib_endpoint->ib_addr->lid == lid) {
                    break; /* Found one */
                }
            } else {
                /* request message */
                if (ib_endpoint->subnet_id == subnet_id &&
                        ib_endpoint->endpoint_btl->lid == lid) {
                    break; /* Found one */
                }
            }
        }
        if (i == ib_proc->proc_endpoint_count) {
            /* scanned every endpoint without a match */
            ib_endpoint = NULL;
        }
        if (NULL == ib_endpoint) {
            BTL_ERROR(("can't find suitable endpoint for this peer\n"));
        }
    } else {
        BTL_ERROR(("can't find suitable endpoint for this peer\n"));
    }
    OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
    return ib_endpoint;
}
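
All of the snippets on this page revolve around the same RTE call. As a minimal sketch (not taken from the project; the helper name is made up), the equality test used by xoob_find_endpoint() can be expressed on its own, assuming the usual OMPI RTE headers are already included:

static bool xoob_names_match(ompi_process_name_t *a, ompi_process_name_t *b)
{
    /* OPAL_EQUAL means every compared field matched */
    return OPAL_EQUAL == ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, a, b);
}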
Example #3
bool mca_btl_tcp2_endpoint_accept(mca_btl_base_endpoint_t* btl_endpoint,
                                 struct sockaddr* addr, int sd)
{
    mca_btl_tcp_proc_t *endpoint_proc = btl_endpoint->endpoint_proc;
    const orte_process_name_t *this_proc = &(ompi_proc_local()->proc_name);
    int cmpval;

    if(NULL == btl_endpoint->endpoint_addr) {
        return false;
    }

    OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock);
    OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock);

    cmpval = ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL,
                                    &endpoint_proc->proc_ompi->proc_name,
                                    this_proc);
    if((btl_endpoint->endpoint_sd < 0) ||
       (btl_endpoint->endpoint_state != MCA_BTL_TCP_CONNECTED &&
        cmpval < 0)) {
        mca_btl_tcp2_endpoint_close(btl_endpoint);
        btl_endpoint->endpoint_sd = sd;
        if(mca_btl_tcp2_endpoint_send_connect_ack(btl_endpoint) != OMPI_SUCCESS) {
            mca_btl_tcp2_endpoint_close(btl_endpoint);
            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
            return false;
        }
        mca_btl_tcp_endpoint_event_init(btl_endpoint);
        /* NOT NEEDED if we remove the PERSISTENT flag when we create the
         * first recv_event.
         */
        opal_event_add(&btl_endpoint->endpoint_recv_event, 0);  /* TODO */
        mca_btl_tcp_endpoint_connected(btl_endpoint);
#if OPAL_ENABLE_DEBUG && WANT_PEER_DUMP
        mca_btl_tcp2_endpoint_dump(btl_endpoint, "accepted");
#endif
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
        return true;
    }
    OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
    OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
    return false;
}
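
The interesting part of this accept handler is the tie-break: when two peers open connections to each other at the same time, both sides must agree on which socket survives, and the process-name comparison provides that ordering. A hedged sketch of the rule (the helper name is hypothetical, not project code):

static bool tcp_accept_wins_race(ompi_process_name_t *peer_name,
                                 ompi_process_name_t *my_name)
{
    /* keep the accepted socket only if the peer's name sorts below ours */
    return ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL,
                                        peer_name, my_name) < 0;
}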
Example #4
File: proc.c  Project: IanYXXL/A1
ompi_proc_t * ompi_proc_find ( const ompi_process_name_t * name )
{
    ompi_proc_t *proc, *rproc=NULL;
    ompi_rte_cmp_bitmask_t mask;

    /* return the proc-struct which matches this jobid+process id */
    mask = OMPI_RTE_CMP_JOBID | OMPI_RTE_CMP_VPID;
    OPAL_THREAD_LOCK(&ompi_proc_lock);
    for(proc =  (ompi_proc_t*)opal_list_get_first(&ompi_proc_list);
        proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list);
        proc =  (ompi_proc_t*)opal_list_get_next(proc)) {
        if (OPAL_EQUAL == ompi_rte_compare_name_fields(mask, &proc->proc_name, name)) {
            rproc = proc;
            break;
        }
    }
    OPAL_THREAD_UNLOCK(&ompi_proc_lock);

    return rproc;
}
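
ompi_proc_find() is purely a lookup: it returns the cached ompi_proc_t for a jobid/vpid pair or NULL, and never creates a new entry. A hypothetical caller sketch (not project code):

static bool peer_is_known(const ompi_process_name_t *name)
{
    /* NULL means this peer has not been added to ompi_proc_list yet */
    return NULL != ompi_proc_find(name);
}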
Example #5
File: proc.c  Project: IanYXXL/A1
static ompi_proc_t *
ompi_proc_find_and_add(const ompi_process_name_t * name, bool* isnew)
{
    ompi_proc_t *proc, *rproc = NULL;
    ompi_rte_cmp_bitmask_t mask;
    
    /* return the proc-struct which matches this jobid+process id */
    mask = OMPI_RTE_CMP_JOBID | OMPI_RTE_CMP_VPID;
    OPAL_THREAD_LOCK(&ompi_proc_lock);
    for(proc =  (ompi_proc_t*)opal_list_get_first(&ompi_proc_list);
        proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list);
        proc =  (ompi_proc_t*)opal_list_get_next(proc)) {
        if (OPAL_EQUAL == ompi_rte_compare_name_fields(mask, &proc->proc_name, name)) {
            rproc = proc;
            *isnew = false;
            break;
        }
    }
    
    /* if we didn't find this proc in the list, create a new
     * proc_t and append it to the list
     */
    if (NULL == rproc) {
        *isnew = true;
        rproc = OBJ_NEW(ompi_proc_t);
        if (NULL != rproc) {
            opal_list_append(&ompi_proc_list, (opal_list_item_t*)rproc);
            rproc->proc_name = *name;
        }
        /* caller had better fill in the rest of the proc, or there's
         going to be pain later... */
    }
    
    OPAL_THREAD_UNLOCK(&ompi_proc_lock);
    
    return rproc;
}
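
Because ompi_proc_find_and_add() is static, only code inside proc.c can call it; the isnew flag tells that caller the returned proc was freshly allocated and still needs its fields filled in (ompi_proc_unpack() below is the real consumer of this contract). A rough sketch, with an assumed helper name and a purely illustrative default:

static ompi_proc_t *lookup_or_create(const ompi_process_name_t *name)
{
    bool isnew = false;
    ompi_proc_t *proc = ompi_proc_find_and_add(name, &isnew);

    if (NULL != proc && isnew) {
        /* illustrative only: a brand-new proc must be filled in by the caller */
        proc->proc_arch = opal_local_arch;
    }
    return proc;
}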
Example #6
int
ompi_mpi_abort(struct ompi_communicator_t* comm,
               int errcode,
               bool kill_remote_of_intercomm)
{
    int count = 0, i, ret;
    char *msg, *host, hostname[MAXHOSTNAMELEN];
    pid_t pid = 0;
    ompi_process_name_t *abort_procs;
    int32_t nabort_procs;

    /* Protection for recursive invocation */
    if (have_been_invoked) {
        return OMPI_SUCCESS;
    }
    have_been_invoked = true;

    /* If MPI is initialized, we know we have a runtime nodename, so use that.  Otherwise, call
       gethostname. */
    if (ompi_mpi_initialized) {
        host = ompi_process_info.nodename;
    } else {
        gethostname(hostname, sizeof(hostname));
        host = hostname;
    }
    pid = getpid();

    /* Should we print a stack trace?  Not aggregated because they
       might be different on all processes. */
    if (ompi_mpi_abort_print_stack) {
        char **messages;
        int len, i;

        if (OMPI_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
            for (i = 0; i < len; ++i) {
                fprintf(stderr, "[%s:%d] [%d] func:%s\n", host, (int) pid, 
                        i, messages[i]);
                fflush(stderr);
            }
            free(messages);
        } else {
            /* This will print a message if it's unable to print the
               backtrace, so we don't need an additional "else" clause
               if opal_backtrace_print() is not supported. */
            opal_backtrace_print(stderr, NULL, 1);
        }
    }

    /* Notify the debugger that we're about to abort */

    if (errcode < 0 ||
        asprintf(&msg, "[%s:%d] aborting with MPI error %s%s", 
                 host, (int) pid, ompi_mpi_errnum_get_string(errcode), 
                 ompi_mpi_abort_print_stack ? 
                 " (stack trace available on stderr)" : "") < 0) {
        msg = NULL;
    }
    ompi_debugger_notify_abort(msg);
    if (NULL != msg) {
        free(msg);
    }

    /* Should we wait for a while before aborting? */

    if (0 != ompi_mpi_abort_delay) {
        if (ompi_mpi_abort_delay < 0) {
            fprintf(stderr, "[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n",
                    host, (int) pid);
            fflush(stderr);
            while (1) { 
                sleep(5); 
            }
        } else {
            fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
                    host, (int) pid, ompi_mpi_abort_delay);
            do {
                sleep(1);
            } while (--ompi_mpi_abort_delay > 0);
        }
    }

    /* If OMPI isn't set up yet/any more, then don't even try killing
       everyone.  OMPI's initialized period covers the runtime's
       initialized period, so there is no need to check that separately
       here.  Sorry, Charlie... */

    if (!ompi_mpi_initialized || ompi_mpi_finalized) {
        fprintf(stderr, "[%s:%d] Local abort %s completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
                host, (int) pid, ompi_mpi_finalized ? 
                "after MPI_FINALIZE" : "before MPI_INIT");
        exit(errcode);
    }

    /* abort local procs in the communicator.  If the communicator is
       an intercommunicator AND the abort has explicitly requested
       that we abort the remote procs, then do that as well. */
    nabort_procs = ompi_comm_size(comm);

    if (kill_remote_of_intercomm) {
        /* ompi_comm_remote_size() returns 0 if not an intercomm, so
           this is cool */
        nabort_procs += ompi_comm_remote_size(comm);
    }

    abort_procs = (ompi_process_name_t*)malloc(sizeof(ompi_process_name_t) * nabort_procs);
    if (NULL == abort_procs) {
        /* quick clean orte and get out */
        ompi_rte_abort(errcode, "Abort unable to malloc memory to kill procs");
    }

    /* put all the local procs in the abort list */
    for (i = 0 ; i < ompi_comm_size(comm) ; ++i) {
        if (OPAL_EQUAL != ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, 
                                 &comm->c_local_group->grp_proc_pointers[i]->proc_name,
                                 OMPI_PROC_MY_NAME)) {
            assert(count <= nabort_procs);
            abort_procs[count++] = comm->c_local_group->grp_proc_pointers[i]->proc_name;
        } else {
            /* don't terminate me just yet */
            nabort_procs--;
        }
    }

    /* if requested, kill off remote procs too */
    if (kill_remote_of_intercomm) {
        for (i = 0 ; i < ompi_comm_remote_size(comm) ; ++i) {
            if (OPAL_EQUAL != ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, 
                                     &comm->c_remote_group->grp_proc_pointers[i]->proc_name,
                                     OMPI_PROC_MY_NAME)) {
                assert(count <= nabort_procs);
                abort_procs[count++] =
                    comm->c_remote_group->grp_proc_pointers[i]->proc_name;
            } else {
                /* don't terminate me just yet */
                nabort_procs--;
            }
        }
    }

    if (nabort_procs > 0) {
        /* This must be implemented for MPI_Abort() to work according to the
         * standard language for a 'high-quality' implementation.
         * It would be nifty if we could differentiate between the
         * abort scenarios:
         *      - MPI_Abort()
         *      - MPI_ERRORS_ARE_FATAL
         *      - Victim of MPI_Abort()
         */
        /*
         * Abort peers in this communicator group. Does not include self.
         */
        if( OMPI_SUCCESS != (ret = ompi_rte_abort_peers(abort_procs, nabort_procs, errcode)) ) {
            ompi_rte_abort(errcode, "Open MPI failed to abort all of the procs requested (%d).", ret);
        }
    }

    /* now that we've aborted everyone else, gracefully die. */
    ompi_rte_abort(errcode, NULL);
    
    return OMPI_SUCCESS;
}
Example #7
static int bootstrap_comm(ompi_communicator_t *comm,
                          mca_coll_sm_module_t *module)
{
    int i;
    char *shortpath, *fullpath;
    mca_coll_sm_component_t *c = &mca_coll_sm_component;
    mca_coll_sm_comm_t *data = module->sm_comm_data;
    int comm_size = ompi_comm_size(comm);
    int num_segments = c->sm_comm_num_segments;
    int num_in_use = c->sm_comm_num_in_use_flags;
    int frag_size = c->sm_fragment_size;
    int control_size = c->sm_control_size;
    ompi_process_name_t *lowest_name = NULL;
    size_t size;
    ompi_proc_t *proc;

    /* Make the rendezvous filename for this communicator's shmem data
       segment.  The CID is not guaranteed to be unique among all
       procs on this node, so also pair it with the name of the proc
       with the lowest ORTE name to form a unique filename. */
    proc = ompi_group_peer_lookup(comm->c_local_group, 0);
    lowest_name = &(proc->proc_name);
    for (i = 1; i < comm_size; ++i) {
        proc = ompi_group_peer_lookup(comm->c_local_group, i);
        if (ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, 
                                          &(proc->proc_name),
                                          lowest_name) < 0) {
            lowest_name = &(proc->proc_name);
        }
    }
    asprintf(&shortpath, "coll-sm-cid-%d-name-%s.mmap", comm->c_contextid,
             OMPI_NAME_PRINT(lowest_name));
    if (NULL == shortpath) {
        opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                            "coll:sm:enable:bootstrap comm (%d/%s): asprintf failed", 
                            comm->c_contextid, comm->c_name);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    fullpath = opal_os_path(false, ompi_process_info.job_session_dir,
                            shortpath, NULL);
    free(shortpath);
    if (NULL == fullpath) {
        opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                            "coll:sm:enable:bootstrap comm (%d/%s): opal_os_path failed", 
                            comm->c_contextid, comm->c_name);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Calculate how much space we need in the per-communicator shmem
       data segment.  There are several values to add:

       - size of the barrier data (2 of these):
           - fan-in data (num_procs * control_size)
           - fan-out data (num_procs * control_size)
       - size of the "in use" buffers:
           - num_in_use_buffers * control_size
       - size of the message fragment area (one for each segment):
           - control (num_procs * control_size)
           - fragment data (num_procs * (frag_size))

       So it's:

           barrier: 2 * control_size + 2 * control_size
           in use:  num_in_use * control_size
           control: num_segments * (num_procs * control_size * 2 +
                                    num_procs * control_size)
           message: num_segments * (num_procs * frag_size)
     */

    size = 4 * control_size +
        (num_in_use * control_size) +
        (num_segments * (comm_size * control_size * 2)) +
        (num_segments * (comm_size * frag_size));
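    /* Worked example with purely hypothetical values (not project defaults):
     * comm_size = 4, control_size = 4096, frag_size = 8192, num_segments = 8,
     * num_in_use = 2 gives
     *     4*4096 + 2*4096 + 8*(4*4096*2) + 8*(4*8192)
     *   = 16384 + 8192 + 262144 + 262144 = 548864 bytes. */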
    opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                        "coll:sm:enable:bootstrap comm (%d/%s): attaching to %" PRIsize_t " byte mmap: %s",
                        comm->c_contextid, comm->c_name, size, fullpath);
    data->sm_bootstrap_meta =
        mca_common_sm_init_group(comm->c_local_group, size, fullpath,
                                 sizeof(mca_common_sm_seg_header_t),
                                 getpagesize());
    if (NULL == data->sm_bootstrap_meta) {
        opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                            "coll:sm:enable:bootstrap comm (%d/%s): mca_common_sm_init_group failed", 
                            comm->c_contextid, comm->c_name);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* All done */
    return OMPI_SUCCESS;
}
Example #8
File: proc.c  Project: IanYXXL/A1
int
ompi_proc_unpack(opal_buffer_t* buf, 
                 int proclistsize, ompi_proc_t ***proclist,
                 bool full_info,
                 int *newproclistsize, ompi_proc_t ***newproclist)
{
    int i;
    size_t newprocs_len = 0;
    ompi_proc_t **plist=NULL, **newprocs = NULL;

    /* do not free plist *ever*, since it is used in the remote group
       structure of a communicator */
    plist = (ompi_proc_t **) calloc (proclistsize, sizeof (ompi_proc_t *));
    if ( NULL == plist ) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    /* free this on the way out */
    newprocs = (ompi_proc_t **) calloc (proclistsize, sizeof (ompi_proc_t *));
    if (NULL == newprocs) {
        free(plist);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* cycle through the array of provided procs and unpack
     * their info - as packed by ompi_proc_pack
     */
    for ( i=0; i<proclistsize; i++ ){
        int32_t count=1;
        ompi_process_name_t new_name;
        uint32_t new_arch;
        char *new_hostname;
        bool isnew = false;
        int rc;

        rc = opal_dss.unpack(buf, &new_name, &count, OMPI_NAME);
        if (rc != OPAL_SUCCESS) {
            OMPI_ERROR_LOG(rc);
            free(plist);
            free(newprocs);
            return rc;
        }
        if (!full_info) {
            rc = opal_dss.unpack(buf, &new_arch, &count, OPAL_UINT32);
            if (rc != OPAL_SUCCESS) {
                OMPI_ERROR_LOG(rc);
                free(plist);
                free(newprocs);
                return rc;
            }
            rc = opal_dss.unpack(buf, &new_hostname, &count, OPAL_STRING);
            if (rc != OPAL_SUCCESS) {
                OMPI_ERROR_LOG(rc);
                free(plist);
                free(newprocs);
                return rc;
            }
        }
        /* see if this proc is already on our ompi_proc_list */
        plist[i] = ompi_proc_find_and_add(&new_name, &isnew);
        if (isnew) {
            /* if not, then it was added, so update the values
             * in the proc_t struct with the info that was passed
             * to us
             */
            newprocs[newprocs_len++] = plist[i];

            if (full_info) {
                int32_t num_recvd_entries;
                int32_t cnt;
                int32_t j;

                /* unpack the number of entries for this proc */
                cnt = 1;
                if (OPAL_SUCCESS != (rc = opal_dss.unpack(buf, &num_recvd_entries, &cnt, OPAL_INT32))) {
                    OMPI_ERROR_LOG(rc);
                    break;
                }

                /*
                 * Extract the attribute names and values
                 */
                for (j = 0; j < num_recvd_entries; j++) {
                    opal_value_t *kv;
                    cnt = 1;
                    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buf, &kv, &cnt, OPAL_VALUE))) {
                        OMPI_ERROR_LOG(rc);
                        break;
                    }
                    /* if this is me, dump the data - we already have it in the db */
                    if (OPAL_EQUAL == ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL,
                                                                   OMPI_PROC_MY_NAME, &new_name)) {
                        OBJ_RELEASE(kv);
                    } else {
                        /* store it in the database */
                        if (OPAL_SUCCESS != (rc = opal_db.store_pointer((opal_identifier_t*)&new_name, kv))) {
                            OMPI_ERROR_LOG(rc);
                            OBJ_RELEASE(kv);
                        }
                        /* do not release the kv - the db holds that pointer */
                    }
                }
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
                rc = opal_db.fetch((opal_identifier_t*)&new_name, "OMPI_ARCH",
                                   (void**)&new_arch, OPAL_UINT32);
                if( OPAL_SUCCESS != rc ) {
                    /* no arch was stored for this proc, so assume our own */
                    new_arch = opal_local_arch;
                }
#else
                new_arch = opal_local_arch;
#endif
                if (ompi_process_info.num_procs < ompi_hostname_cutoff) {
                    /* retrieve the hostname */
                    rc = opal_db.fetch_pointer((opal_identifier_t*)&new_name, OMPI_DB_HOSTNAME,
                                               (void**)&new_hostname, OPAL_STRING);
                    if( OPAL_SUCCESS != rc ) {
                        new_hostname = NULL;
                    }
                } else {
                    /* just set the hostname to NULL for now - we'll fill it in
                     * as modex_recv's are called for procs we will talk to
                     */
                    new_hostname = NULL;
                }
            }
            /* update all the values */
            plist[i]->proc_arch = new_arch;
            /* if arch is different than mine, create a new convertor for this proc */
            if (plist[i]->proc_arch != opal_local_arch) {
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
                OBJ_RELEASE(plist[i]->proc_convertor);
                plist[i]->proc_convertor = opal_convertor_create(plist[i]->proc_arch, 0);
#else
                opal_show_help("help-mpi-runtime",
                               "heterogeneous-support-unavailable",
                               true, ompi_process_info.nodename, 
                               new_hostname == NULL ? "<hostname unavailable>" :
                               new_hostname);
                free(plist);
                free(newprocs);
                return OMPI_ERR_NOT_SUPPORTED;
#endif
            }

            if (NULL != new_hostname &&
                0 == strcmp(ompi_proc_local_proc->proc_hostname, new_hostname)) {
                plist[i]->proc_flags |= (OPAL_PROC_ON_NODE | OPAL_PROC_ON_CU | OPAL_PROC_ON_CLUSTER);
            }

            /* Save the hostname */
            plist[i]->proc_hostname = new_hostname;

        } else {
            if (full_info) {
                int32_t num_recvd_entries;
                int32_t j, cnt;

                /* discard all keys: they are already locally known */
                cnt = 1;
                if (OPAL_SUCCESS == (rc = opal_dss.unpack(buf, &num_recvd_entries, &cnt, OPAL_INT32))) {
                    for (j = 0; j < num_recvd_entries; j++) {
                        opal_value_t *kv;
                        cnt = 1;
                        if (OPAL_SUCCESS != (rc = opal_dss.unpack(buf, &kv, &cnt, OPAL_VALUE))) {
                            OMPI_ERROR_LOG(rc);
                            continue;
                        }
                        OBJ_RELEASE(kv);
                    }
                } else {
                    OMPI_ERROR_LOG(rc);
                }
            }
        }
    }

    if (NULL != newproclistsize) *newproclistsize = newprocs_len;
    if (NULL != newproclist) {
        *newproclist = newprocs;
    } else if (newprocs != NULL) {
        free(newprocs);
    }

    *proclist = plist;
    return OMPI_SUCCESS;
}
Example #9
int bcol_basesmuma_smcm_allgather_connection(
                                             mca_bcol_basesmuma_module_t *sm_bcol_module,
                                             mca_sbgp_base_module_t *module,
                                             opal_list_t *peer_list,
                                             bcol_basesmuma_smcm_proc_item_t ***back_files,
                                             ompi_communicator_t *comm,
                                             bcol_basesmuma_smcm_file_t input,
                                             char *base_fname,
                                             bool map_all)
{

    /* define local variables */

    int rc, i, fd;
    ptrdiff_t mem_offset;
    ompi_proc_t *proc_temp, *my_id;
    bcol_basesmuma_smcm_proc_item_t *temp;
    bcol_basesmuma_smcm_proc_item_t *item_ptr;
    bcol_basesmuma_smcm_proc_item_t **backing_files;
    struct file_info_t local_file;
    struct file_info_t *all_files=NULL;

    /* sanity check */
    if (strlen(input.file_name) > SM_BACKING_FILE_NAME_MAX_LEN-1) {
        opal_output (ompi_bcol_base_framework.framework_output, "backing file name too long:  %s len :: %d",
                     input.file_name, (int) strlen(input.file_name));
        return OMPI_ERR_BAD_PARAM;
    }

    backing_files = (bcol_basesmuma_smcm_proc_item_t **)
        calloc(module->group_size, sizeof(bcol_basesmuma_smcm_proc_item_t *));
    if (!backing_files) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* FIXME *back_files might have been already allocated
     * so free it in order to avoid a memory leak */
    if (NULL != *back_files) {
        free (*back_files);
    }
    *back_files = backing_files;

    my_id = ompi_proc_local();

    /* Phase One:
       gather a list of processes that will participate in the allgather - I'm
       preparing this list from the sbgp-ing module that was passed into the function */

    /* fill in local file information */
    local_file.vpid  = ((orte_process_name_t*)&my_id->super.proc_name)->vpid;
    local_file.jobid = ((orte_process_name_t*)&my_id->super.proc_name)->jobid;
    local_file.file_size=input.size;
    local_file.size_ctl_structure=input.size_ctl_structure;
    local_file.data_seg_alignment=input.data_seg_alignment;

    strcpy (local_file.file_name, input.file_name);

    /* will exchange this data type as a string of characters -
     * this routine is first called before MPI_init() completes
     * and before error handling is setup, so can't use the
     * MPI data types to send this data */
    all_files = (struct file_info_t *) calloc(module->group_size,
                                              sizeof (struct file_info_t));
    if (!all_files) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* exchange data */
    rc = comm_allgather_pml(&local_file,all_files,sizeof(struct file_info_t), MPI_CHAR,
                            sm_bcol_module->super.sbgp_partner_module->my_index,
                            sm_bcol_module->super.sbgp_partner_module->group_size,
                            sm_bcol_module->super.sbgp_partner_module->group_list,
                            sm_bcol_module->super.sbgp_partner_module->group_comm);
    if( OMPI_SUCCESS != rc ) {
        opal_output (ompi_bcol_base_framework.framework_output, "failed in comm_allgather_pml.  Error code: %d", rc);
        goto Error;
    }

    /* Phase four:
       loop through the receive buffer, unpack the data received from remote peers */

    for (i = 0; i < module->group_size; i++) {
        struct file_info_t *rem_file = all_files + i;

        /* check if this is my index or if the file is already mapped (set above). there
         * is no reason to look through the peer list again because no two members of
         * the group will have the same vpid/jobid pair. ignore this previously found
         * mapping if map_all was requested (NTH: not sure why exactly, since we re-map
         * an already mapped file) */
        if (sm_bcol_module->super.sbgp_partner_module->my_index == i) {
            continue;
        }

        proc_temp = ompi_comm_peer_lookup(comm,module->group_list[i]);

        OPAL_LIST_FOREACH(item_ptr, peer_list, bcol_basesmuma_smcm_proc_item_t) {
            /* if the vpid/jobid/filename combination already exists in the list,
               then do not map this peer's file --- because you already have */
            if (0 == ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL,
                                                  OMPI_CAST_RTE_NAME(&proc_temp->super.proc_name),
                                                  &item_ptr->peer) &&
                0 == strcmp (item_ptr->sm_file.file_name, rem_file->file_name)) {
                ++item_ptr->refcnt;
                /* record file data */
                backing_files[i] = item_ptr;
                break;
            }
        }

        if (!map_all && backing_files[i]) {
            continue;
        }

        temp = OBJ_NEW(bcol_basesmuma_smcm_proc_item_t);
        if (!temp) {
            rc = OMPI_ERR_OUT_OF_RESOURCE;
            goto Error;
        }

        temp->peer.vpid = rem_file->vpid;
        temp->peer.jobid = rem_file->jobid;

        temp->sm_file.file_name = strdup (rem_file->file_name);
        if (!temp->sm_file.file_name) {
            rc = OMPI_ERR_OUT_OF_RESOURCE;
            OBJ_RELEASE(temp);
            goto Error;
        }

        temp->sm_file.size = (size_t) rem_file->file_size;
        temp->sm_file.mpool_size = (size_t) rem_file->file_size;
        temp->sm_file.size_ctl_structure = (size_t) rem_file->size_ctl_structure;
        temp->sm_file.data_seg_alignment = (size_t) rem_file->data_seg_alignment;
        temp->refcnt = 1;

        /* Phase Five:
           If map_all == true, then we map every peer's file;
           otherwise we check whether this vpid/jobid/filename
           combination has already been mapped and, if it has,
           we do not mmap this peer's file again. */
        fd = open(temp->sm_file.file_name, O_RDWR, 0600);
        if (0 > fd) {
            opal_output (ompi_bcol_base_framework.framework_output, "SMCM Allgather failed to open sm backing file %s. errno = %d",
                         temp->sm_file.file_name, errno);
            rc = OMPI_ERROR;
            goto Error;
        }

        /* map the file */
        temp->sm_mmap = bcol_basesmuma_smcm_reg_mmap (NULL, fd, temp->sm_file.size,
                                                      temp->sm_file.size_ctl_structure,
                                                      temp->sm_file.data_seg_alignment,
                                                      temp->sm_file.file_name);
        close (fd);
        if (NULL == temp->sm_mmap) {
            opal_output (ompi_bcol_base_framework.framework_output, "mmapping failed to map remote peer's file");
            OBJ_RELEASE(temp);
            rc = OMPI_ERROR;
            goto Error;
        }

        /* compute memory offset */
        mem_offset = (ptrdiff_t) temp->sm_mmap->data_addr -
            (ptrdiff_t) temp->sm_mmap->map_seg;
        temp->sm_mmap->map_seg->seg_offset = mem_offset;
        temp->sm_mmap->map_seg->seg_size = temp->sm_file.size - mem_offset;
        /* more stuff to follow */

        /* append this peer's info, including shared memory map addr, onto the
           peer_list */

        /* record file data */
        backing_files[i] = (bcol_basesmuma_smcm_proc_item_t *) temp;

        opal_list_append(peer_list, (opal_list_item_t*) temp);
    }

    rc = OMPI_SUCCESS;

 Error:

    /* error clean-up and return */
    if (NULL != all_files) {
        free(all_files);
    }

    return rc;
}
Example #10
/*
 * For a specific module, see if this proc has matching address/modex
 * info.  If so, create an endpoint and return it.
 *
 * Implementation note: This code relies on the order of modules on a local
 * side matching the order of the modex entries that we send around, otherwise
 * both sides may not agree on a bidirectional connection.  It also assumes
 * that add_procs will be invoked on the local modules in that same order, for
 * the same reason.  If those assumptions do not hold, we will need to
 * canonicalize this match ordering somehow, probably by (jobid,vpid) pair or
 * by the interface MAC or IP address.
 */
static int match_modex(ompi_btl_usnic_module_t *module,
                       ompi_btl_usnic_proc_t *proc,
                       int *index_out)
{
    int err = OMPI_SUCCESS;
    size_t i;
    uint32_t num_modules;
    ompi_btl_usnic_graph_t *g = NULL;
    int nme;
    int *me;
    bool proc_is_left;

    if (NULL == index_out) {
        return OMPI_ERR_BAD_PARAM;
    }
    *index_out = -1;

    num_modules = mca_btl_usnic_component.num_modules;

    opal_output_verbose(20, USNIC_OUT, "btl:usnic:%s: module=%p proc=%p with dimensions %d x %d",
                        __func__, (void *)module, (void *)proc,
                        num_modules, (int)proc->proc_modex_count);

    /* We compute an interface match-up table once for each (module,proc) pair
     * and cache it in the proc.  Store per-proc instead of per-module, since
     * MPI dynamic process routines can add procs but not new modules. */
    if (NULL == proc->proc_ep_match_table) {
        proc->proc_ep_match_table = malloc(num_modules *
                                       sizeof(*proc->proc_ep_match_table));
        if (NULL == proc->proc_ep_match_table) {
            OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        /* initialize to "no matches" */
        for (i = 0; i < num_modules; ++i) {
            proc->proc_ep_match_table[i] = -1;
        }

        /* For graphs where all edges are equal (and even for some other
         * graphs), two peers making matching calculations with "mirror image"
         * graphs might not end up with the same matching.  Ensure that both
         * sides are always setting up the exact same graph by always putting
         * the process with the lower (jobid,vpid) on the "left".
         */
        proc_is_left =
            (ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL,
                                          &proc->proc_ompi->proc_name,
                                          &(ompi_proc_local()->proc_name)) < 0);

        err = create_proc_module_graph(proc, proc_is_left, &g);
        if (OMPI_SUCCESS != err) {
            goto out_free_table;
        }

        nme = 0;
        err = ompi_btl_usnic_solve_bipartite_assignment(g, &nme, &me);
        if (OMPI_SUCCESS != err) {
            OMPI_ERROR_LOG(err);
            goto out_free_graph;
        }

        edge_pairs_to_match_table(proc, proc_is_left, nme, me);

        err = ompi_btl_usnic_gr_free(g);
        if (OMPI_SUCCESS != err) {
            OMPI_ERROR_LOG(err);
            return err;
        }
    }


    if (!proc->proc_match_exists) {
        opal_output_verbose(5, USNIC_OUT, "btl:usnic:%s: unable to find any valid interface pairs for proc %s",
                            __func__, OMPI_NAME_PRINT(&proc->proc_ompi->proc_name));
        return OMPI_ERR_NOT_FOUND;
    }

    /* assuming no strange failure cases, this should always be present */
    if (NULL != proc->proc_ep_match_table && proc->proc_match_exists) {
        for (i = 0; i < num_modules; ++i) {
            if (module == mca_btl_usnic_component.usnic_active_modules[i]) {
                *index_out = proc->proc_ep_match_table[i];
                break;
            }
        }
    }

    /* If MTU does not match, throw an error */
    /* TODO with UDP, do we still want to enforce this restriction or just take
     * the min of the two MTUs?  Another choice is to disqualify this pairing
     * before running the matching algorithm on it. */
    if (*index_out >= 0 &&
        proc->proc_modex[*index_out].mtu != module->if_mtu) {
        opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
                    true,
                    ompi_process_info.nodename,
                    ibv_get_device_name(module->device),
                    module->port_num,
                    module->if_mtu,
                    (NULL == proc->proc_ompi->proc_hostname) ?
                    "unknown" : proc->proc_ompi->proc_hostname,
                    proc->proc_modex[*index_out].mtu);
        *index_out = -1;
        return OMPI_ERR_UNREACH;
    }

    return (*index_out == -1 ? OMPI_ERR_NOT_FOUND : OMPI_SUCCESS);

out_free_graph:
    ompi_btl_usnic_gr_free(g);
out_free_table:
    free(proc->proc_ep_match_table);
    proc->proc_ep_match_table = NULL;
    proc->proc_match_exists = false;
    return err;
}