Example #1
static void orte_proc_construct(orte_proc_t* proc)
{
    proc->name = *ORTE_NAME_INVALID;
    proc->pid = 0;
    proc->local_rank = ORTE_LOCAL_RANK_INVALID;
    proc->node_rank = ORTE_NODE_RANK_INVALID;
    proc->app_rank = -1;
    proc->last_errmgr_state = ORTE_PROC_STATE_UNDEF;
    proc->state = ORTE_PROC_STATE_UNDEF;
    proc->app_idx = 0;
    proc->slot_list = NULL;
    proc->node = NULL;
    proc->prior_node = NULL;
    proc->nodename = NULL;
    proc->exit_code = 0;      /* Assume we won't fail unless otherwise notified */
    proc->rml_uri = NULL;
    proc->restarts = 0;
    proc->fast_failures = 0;
    proc->last_failure.tv_sec = 0;
    proc->last_failure.tv_usec = 0;
    proc->reported = false;
    proc->beat = 0;
    OBJ_CONSTRUCT(&proc->stats, opal_ring_buffer_t);
    opal_ring_buffer_init(&proc->stats, orte_stat_history_size);
    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
#if OPAL_ENABLE_FT_CR == 1
    proc->ckpt_state = 0;
    proc->ckpt_snapshot_ref = NULL;
    proc->ckpt_snapshot_loc = NULL;
#endif
}
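This constructor only fills in default field values; what makes OBJ_NEW and OBJ_CONSTRUCT run it automatically is its registration with the OPAL class system, paired with a matching destructor. A minimal sketch of that registration (the opal_list_item_t parent and the orte_proc_destruct name are assumptions here, not shown in the example):

OBJ_CLASS_INSTANCE(orte_proc_t,          /* class being declared */
                   opal_list_item_t,     /* assumed parent class */
                   orte_proc_construct,  /* ctor run by OBJ_NEW / OBJ_CONSTRUCT */
                   orte_proc_destruct);  /* dtor run by OBJ_RELEASE / OBJ_DESTRUCT */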
Example #2
void orte_sstore_base_local_snapshot_info_destruct( orte_sstore_base_local_snapshot_info_t *snapshot)
{
    snapshot->process_name.jobid  = 0;
    snapshot->process_name.vpid   = 0;
    ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN);

    if( NULL != snapshot->crs_comp ) {
        free(snapshot->crs_comp);
        snapshot->crs_comp = NULL;
    }

    if( NULL != snapshot->compress_comp ) {
        free(snapshot->compress_comp);
        snapshot->compress_comp = NULL;
    }

    if( NULL != snapshot->compress_postfix ) {
        free(snapshot->compress_postfix);
        snapshot->compress_postfix = NULL;
    }

    if( NULL != snapshot->start_time ) {
        free(snapshot->start_time);
        snapshot->start_time = NULL;
    }

    if( NULL != snapshot->end_time ) {
        free(snapshot->end_time);
        snapshot->end_time = NULL;
    }
}
Example #3
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
{
    opal_pointer_array_t cmd;
    orte_proc_t proc;
    int rc;

    /* stop local sensors for this job */
    if (ORTE_VPID_WILDCARD == vpid) {
        orte_sensor.stop(job);
    }

    if (ORTE_JOBID_WILDCARD == job 
        && ORTE_VPID_WILDCARD == vpid) {
        if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
            ORTE_ERROR_LOG(rc);
        }
        return;
    }

    OBJ_CONSTRUCT(&cmd, opal_pointer_array_t);
    OBJ_CONSTRUCT(&proc, orte_proc_t);
    proc.name.jobid = job;
    proc.name.vpid = vpid;
    ORTE_EPOCH_SET(proc.name.epoch,orte_ess.proc_get_epoch(&(proc.name)));
    opal_pointer_array_add(&cmd, &proc);
    if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
        ORTE_ERROR_LOG(rc);
    }
    OBJ_DESTRUCT(&cmd);
    OBJ_DESTRUCT(&proc);
}
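Note the stack-object idiom above: OBJ_CONSTRUCT and OBJ_DESTRUCT run the class constructor and destructor in place, without touching the allocator. The heap-allocated counterpart, which several later examples use, follows standard OPAL reference-counting semantics; a minimal sketch:

orte_proc_t *p = OBJ_NEW(orte_proc_t);  /* allocate + construct, refcount = 1 */
/* ... use p ... */
OBJ_RELEASE(p);                         /* drop refcount; destruct + free at zero */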
Example #4
void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp)
{
    wp->name.jobid = ORTE_JOBID_INVALID;
    wp->name.vpid  = ORTE_VPID_INVALID;
    ORTE_EPOCH_SET(wp->name.epoch,ORTE_EPOCH_INVALID);

    wp->state = 0;
}
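ORTE_EPOCH_SET appears throughout these examples because the epoch field only exists when fault-tolerance support is compiled in. A plausible definition, given here purely as an assumption about the build configuration:

#if ORTE_ENABLE_EPOCH
#define ORTE_EPOCH_SET(field, value) (field) = (value)
#else
#define ORTE_EPOCH_SET(field, value)   /* compiled out */
#endif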
Example #5
static int udp_recv_buffer(orte_process_name_t *name,
                           orte_rmcast_channel_t channel,
                           orte_rmcast_tag_t tag,
                           orte_rmcast_seq_t *seq_num,
                           opal_buffer_t *buf)
{
    rmcast_base_recv_t *recvptr;
    int ret;
    orte_rmcast_channel_t chan;

    ORTE_ACQUIRE_THREAD(&ctl);

    if (!comm_enabled) {
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_ERR_COMM_DISABLED;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                         "%s rmcast:udp: recv_buffer called on multicast channel %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel));

    if (ORTE_RMCAST_GROUP_INPUT_CHANNEL == channel) {
        chan = orte_rmcast_base.my_input_channel->channel;
    } else if (ORTE_RMCAST_GROUP_OUTPUT_CHANNEL == channel) {
        chan = orte_rmcast_base.my_output_channel->channel;
    } else {
        chan = channel;
    }
    
    if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(&recvptr, chan, tag,
                                                           ORTE_RMCAST_NON_PERSISTENT,
                                                           NULL, NULL, NULL, true))) {
        ORTE_ERROR_LOG(ret);
        ORTE_RELEASE_THREAD(&ctl);
        return ret;
    }
    ORTE_RELEASE_THREAD(&ctl);
    
    recvptr->ctl.active = true;
    ORTE_ACQUIRE_THREAD(&recvptr->ctl);
    
    /* xfer the data */
    if (NULL != name) {
        /* caller requested id of sender */
        name->jobid = recvptr->name.jobid;
        name->vpid = recvptr->name.vpid;
        ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch);
    }
    *seq_num = recvptr->seq_num;
    if (ORTE_SUCCESS != (ret = opal_dss.copy_payload(buf, recvptr->buf))) {
        ORTE_ERROR_LOG(ret);
    }
    /* release the data */
    OBJ_RELEASE(recvptr);
    
    return ret;
}
Example #6
void orte_snapc_base_local_snapshot_destruct( orte_snapc_base_local_snapshot_t *snapshot)
{
    snapshot->process_name.jobid  = 0;
    snapshot->process_name.vpid   = 0;
    ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN);

    snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;

    snapshot->ss_handle  = ORTE_SSTORE_HANDLE_INVALID;
}
Example #7
static int udp_recv(orte_process_name_t *name,
                    orte_rmcast_channel_t channel,
                    orte_rmcast_tag_t tag,
                    orte_rmcast_seq_t *seq_num,
                    struct iovec **msg, int *count)
{
    rmcast_base_recv_t *recvptr;
    int ret;
    orte_rmcast_channel_t chan;

    ORTE_ACQUIRE_THREAD(&ctl);

    if (!comm_enabled) {
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_ERR_COMM_DISABLED;
    }

    if (ORTE_RMCAST_GROUP_INPUT_CHANNEL == channel) {
        chan = orte_rmcast_base.my_input_channel->channel;
    } else if (ORTE_RMCAST_GROUP_OUTPUT_CHANNEL == channel) {
        chan = orte_rmcast_base.my_output_channel->channel;
    } else {
        chan = channel;
    }
    
    if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(&recvptr, chan, tag,
                                                           ORTE_RMCAST_NON_PERSISTENT,
                                                           NULL, NULL, NULL, true))) {
        ORTE_ERROR_LOG(ret);
        ORTE_RELEASE_THREAD(&ctl);
        return ret;
    }
    ORTE_RELEASE_THREAD(&ctl);
    
    recvptr->ctl.active = true;
    ORTE_ACQUIRE_THREAD(&recvptr->ctl);
    
    /* xfer the data */
    if (NULL != name) {
        /* caller requested id of sender */
        name->jobid = recvptr->name.jobid;
        name->vpid = recvptr->name.vpid;
        ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch);
    }
    *seq_num = recvptr->seq_num;
    *msg = recvptr->iovec_array;
    *count = recvptr->iovec_count;
    
    /* carefully release the recv - null the iovec fields first so the
     * object's destructor does not free the data just handed to the caller */
    recvptr->iovec_array = NULL;
    recvptr->iovec_count = 0;
    OBJ_RELEASE(recvptr);
    
    return ORTE_SUCCESS;
}
Example #8
void orte_sstore_central_local_app_snapshot_info_construct(orte_sstore_central_local_app_snapshot_info_t *info)
{
    info->name.jobid = ORTE_JOBID_INVALID;
    info->name.vpid  = ORTE_VPID_INVALID;
    ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN);

    info->local_location = NULL;
    info->metadata_filename = NULL;
    info->crs_comp = NULL;
    info->ckpt_skipped = false;
}
Example #9
static int append_new_app_handle_info(orte_sstore_central_local_snapshot_info_t *handle_info,
                                      orte_process_name_t *name)
{
    orte_sstore_central_local_app_snapshot_info_t *app_info = NULL;

    app_info = OBJ_NEW(orte_sstore_central_local_app_snapshot_info_t);

    app_info->name.jobid = name->jobid;
    app_info->name.vpid  = name->vpid;
    ORTE_EPOCH_SET(app_info->name.epoch,name->epoch);

    opal_list_append(handle_info->app_info_handle, &(app_info->super));

    return ORTE_SUCCESS;
}
Example #10
static int slave_set_name(void)
{
    char *jobid_str, *procid_str;
    int id, rc;
    orte_jobid_t jobid;
    orte_vpid_t vpid;
    
    id = mca_base_param_register_string("orte", "ess", "jobid", NULL, NULL);
    mca_base_param_lookup_string(id, &jobid_str);
    if (NULL == jobid_str) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, jobid_str))) {
        ORTE_ERROR_LOG(rc);
        return(rc);
    }
    free(jobid_str);
    
    id = mca_base_param_register_string("orte", "ess", "vpid", NULL, NULL);
    mca_base_param_lookup_string(id, &procid_str);
    if (NULL == procid_str) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&vpid, procid_str))) {
        ORTE_ERROR_LOG(rc);
        return(rc);
    }
    free(procid_str);
    
    ORTE_PROC_MY_NAME->jobid = jobid;
    ORTE_PROC_MY_NAME->vpid = vpid;
    ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME));
    
    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                         "ess:slave set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* get the non-name common environmental variables */
    if (ORTE_SUCCESS != (rc = orte_ess_env_get())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    return ORTE_SUCCESS;
}
Example #11
static int orte_sstore_central_extract_global_metadata(orte_sstore_central_global_snapshot_info_t * handle_info,
        orte_sstore_base_global_snapshot_info_t *global_snapshot)
{
    int exit_status = ORTE_SUCCESS;
    orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;
    opal_list_item_t* item = NULL;
    int i = 0;

    /*
     * Cleanup the structure a bit, so we can refresh it below
     */
    while (NULL != (item = opal_list_remove_first(&global_snapshot->local_snapshots))) {
        OBJ_RELEASE(item);
    }

    if( NULL != global_snapshot->start_time ) {
        free( global_snapshot->start_time );
        global_snapshot->start_time = NULL;
    }

    if( NULL != global_snapshot->end_time ) {
        free( global_snapshot->end_time );
        global_snapshot->end_time = NULL;
    }

    /*
     * Create a structure for each application process
     */
    for(i = 0; i < handle_info->num_procs_total; ++i) {
        vpid_snapshot = OBJ_NEW(orte_sstore_base_local_snapshot_info_t);
        vpid_snapshot->ss_handle = handle_info->id;

        vpid_snapshot->process_name.jobid  = handle_info->jobid;
        vpid_snapshot->process_name.vpid   = i;
        ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,orte_ess.proc_get_epoch(&vpid_snapshot->process_name));

        vpid_snapshot->crs_comp     = NULL;
        global_snapshot->start_time = NULL;
        global_snapshot->end_time   = NULL;

        opal_list_append(&global_snapshot->local_snapshots, &(vpid_snapshot->super));
    }

    return exit_status;
}
Example #12
/**
 * Initialize the module
 */
static int init(void)
{
    int rc;
    
    OBJ_CONSTRUCT(&my_local_peers, opal_list_t);
    OBJ_CONSTRUCT(&barrier, orte_grpcomm_collective_t);
    OBJ_CONSTRUCT(&allgather, orte_grpcomm_collective_t);

    my_local_rank_zero_proc.jobid = ORTE_PROC_MY_NAME->jobid;
    my_local_rank_zero_proc.vpid = ORTE_VPID_INVALID;
    ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch,ORTE_EPOCH_MIN);

    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_init())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    return rc;
}
Example #13
static int hnp_pull(const orte_process_name_t* dst_name,
                    orte_iof_tag_t src_tag,
                    int fd)
{
    orte_iof_sink_t *sink;
    int flags;
    
    /* this is a local call - only stdin is supported */
    if (ORTE_IOF_STDIN != src_tag) {
        return ORTE_ERR_NOT_SUPPORTED;
    }
    
    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s iof:hnp pulling fd %d for process %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         fd, ORTE_NAME_PRINT(dst_name)));
    
    /* set the file descriptor to non-blocking - do this before we setup
     * the sink in case it fires right away
     */
    if((flags = fcntl(fd, F_GETFL, 0)) < 0) {
        opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n", 
                    __FILE__, __LINE__, errno);
    } else {
        flags |= O_NONBLOCK;
        fcntl(fd, F_SETFL, flags);
    }
    
    ORTE_IOF_SINK_DEFINE(&sink, dst_name, fd, ORTE_IOF_STDIN,
                         stdin_write_handler,
                         &mca_iof_hnp_component.sinks);
    sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid;
    sink->daemon.vpid = ORTE_PROC_MY_NAME->vpid;
    ORTE_EPOCH_SET(sink->daemon.epoch,ORTE_PROC_MY_NAME->epoch);

    return ORTE_SUCCESS;
}
Example #14
int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
{
    orte_job_map_t *map;
    orte_vpid_t vpid;
    int i, j;
    orte_node_t *node;
    orte_proc_t *proc;
    int rc;
    
    map = jdata->map;
    
    if (ORTE_MAPPING_BYSLOT & map->policy ||
        ORTE_MAPPING_BYSOCKET & map->policy ||
        ORTE_MAPPING_BYBOARD & map->policy) {
        /* assign the ranks sequentially */
        for (i=0; i < map->nodes->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
                continue;
            }
            for (j=0; j < node->procs->size; j++) {
                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                    continue;
                }
                /* ignore procs from other jobs */
                if (proc->name.jobid != jdata->jobid) {
                    continue;
                }
                if (ORTE_VPID_INVALID == proc->name.vpid) {
                    /* find the next available vpid */
                    for (vpid=0; vpid < jdata->num_procs; vpid++) {
                        if (NULL == opal_pointer_array_get_item(jdata->procs, vpid)) {
                            break;
                        }
                    }
                    proc->name.vpid = vpid;
                    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
                    ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
                    
                    /* If there is an invalid epoch here, it's because it doesn't exist yet. */
                    if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) {
                        ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
                    }
                }
                if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }                    
                }
            }
        }
        return ORTE_SUCCESS;
    }
    
    if (ORTE_MAPPING_BYNODE & map->policy) {
        /* assign the ranks round-robin across nodes */
        for (i=0; i < map->nodes->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
                continue;
            }
            for (j=0; j < node->procs->size; j++) {
                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                    continue;
                }
                /* ignore procs from other jobs */
                if (proc->name.jobid != jdata->jobid) {
                    continue;
                }
                if (ORTE_VPID_INVALID == proc->name.vpid) {
                    /* find the next available vpid */
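                    /* hypothetical illustration: with num_nodes = 3 and 7 procs,
                     * sequential filling gives node 0 the vpids 0,3,6, node 1
                     * the vpids 1,4, and node 2 the vpids 2,5 - start at the
                     * node index, stride by the node count, wrap at num_procs */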
                    vpid = i;
                    while (NULL != opal_pointer_array_get_item(jdata->procs, vpid)) {
                        vpid += map->num_nodes;
                        if (jdata->num_procs <= vpid) {
                            vpid = vpid - jdata->num_procs;
                        }
                    }
                    proc->name.vpid = vpid;
                    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
                    ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
                }
                if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }                    
                }
            }
        }
        return ORTE_SUCCESS;
    }

    return ORTE_ERR_NOT_IMPLEMENTED;
}
Example #15
int orte_rmaps_base_define_daemons(orte_job_t *jdata)
{
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_job_t *daemons;
    int i;
    int rc;

    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                         "%s rmaps:base:define_daemons",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    if (ORTE_MAPPING_USE_VM & jdata->map->policy) {
        /* nothing for us to do - all daemons are
         * defined by definition!
         */
        return ORTE_SUCCESS;
    }

    /* get the daemon job data struct */
    if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_HNP->jobid))) {
        /* bad news */
        ORTE_ERROR_LOG(ORTE_ERR_FATAL);
        return ORTE_ERR_FATAL;
    }
    
    /* initialize the #new daemons */
    map = jdata->map;
    map->num_new_daemons = 0;
    
    /* go through the nodes in the map, checking each one's daemon name
     */
    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (NULL == node->daemon) {
            /* we haven't defined one for it
             * yet, so do so now and indicate it is to be launched
             */
            proc = OBJ_NEW(orte_proc_t);
            if (NULL == proc) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            proc->name.jobid = ORTE_PROC_MY_HNP->jobid;
            if (ORTE_VPID_MAX-1 <= daemons->num_procs) {
                /* no more daemons available */
                orte_show_help("help-orte-rmaps-base.txt", "out-of-vpids", true);
                OBJ_RELEASE(proc);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            proc->name.vpid = daemons->num_procs;  /* take the next available vpid */
            ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
            proc->node = node;
            proc->nodename = node->name;
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                                 "%s rmaps:base:define_daemons add new daemon %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&proc->name)));
            /* add the daemon to the daemon job object */
            if (0 > (rc = opal_pointer_array_add(daemons->procs, (void*)proc))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            ++daemons->num_procs;
            /* point the node to the daemon */
            node->daemon = proc;
            OBJ_RETAIN(proc);  /* maintain accounting */
            /* track number of daemons to be launched */
            ++map->num_new_daemons;
            /* and their starting vpid */
            if (ORTE_VPID_INVALID == map->daemon_vpid_start) {
                map->daemon_vpid_start = proc->name.vpid;
            }
        }
        /*
         * If we are launching on a node where there used to be a daemon, but
         * it had previously failed, try to relaunch it. (Daemon Recovery) Do
         * this ONLY if there are procs mapped to that daemon!
         */
        else if (node->daemon->state > ORTE_PROC_STATE_UNTERMINATED) {
            /* If no processes are to be launched on this node, then exclude it */
            if( 0 >= node->num_procs ) {
                OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                                     "%s rmaps:base:define_daemons Skipping the Recovery of daemon %s [0x%x] Launched: %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&node->daemon->name),
                                     node->daemon->state,
                                     (node->daemon_launched ? "T" : "F")
                                     ));
                /* since this daemon exists but is not needed, then flag it
                 * as "launched" to avoid relaunching it for no reason
                 */
                node->daemon_launched = true;
                continue;
            }

            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                                 "%s rmaps:base:define_daemons RECOVERING daemon %s [0x%x] Launched: %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&node->daemon->name),
                                 node->daemon->state,
                                 (node->daemon_launched ? "T" : "F")
                                 ));

            /* flag that the daemon is no longer launched */
            node->daemon_launched = false;

            /* set the state to indicate launch is in progress */
            node->daemon->state = ORTE_PROC_STATE_RESTART;

            free(node->daemon->rml_uri);
            node->daemon->rml_uri = NULL;
            
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                                 "%s rmaps:base:define_daemons add new daemon %s (Recovering old daemon)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&node->daemon->name)));

            /* track number of daemons to be launched */
            ++map->num_new_daemons;
        }
        else {
            /* this daemon was previously defined - flag it */
            node->daemon_launched = true;
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                                 "%s rmaps:base:define_daemons existing daemon %s already launched",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&node->daemon->name)));
        }
    }

    return ORTE_SUCCESS;
}
Example #16
int orte_sstore_base_extract_global_metadata(orte_sstore_base_global_snapshot_info_t *global_snapshot)
{
    int ret, exit_status = ORTE_SUCCESS;
    FILE *metadata = NULL;
    char * token = NULL;
    char * value = NULL;
    orte_process_name_t proc;
    opal_list_item_t* item = NULL;
    orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;

    /*
     * Cleanup the structure a bit, so we can refresh it below
     */
    while (NULL != (item = opal_list_remove_first(&global_snapshot->local_snapshots))) {
        OBJ_RELEASE(item);
    }

    if( NULL != global_snapshot->start_time ) {
        free( global_snapshot->start_time );
        global_snapshot->start_time = NULL;
    }

    if( NULL != global_snapshot->end_time ) {
        free( global_snapshot->end_time );
        global_snapshot->end_time = NULL;
    }

    /*
     * Open the metadata file
     */
    if (NULL == (metadata = fopen(global_snapshot->metadata_filename, "r")) ) {
        opal_output(orte_sstore_base_output,
                    "sstore:base:extract_global_metadata() Unable to open the file (%s)\n",
                    global_snapshot->metadata_filename);
        ORTE_ERROR_LOG(ORTE_ERROR);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * Seek to the sequence number requested
     */
    if( ORTE_SUCCESS != (ret = orte_sstore_base_metadata_seek_to_seq_num(metadata, global_snapshot->seq_num))) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * Extract each token and make the records
     */
    do {
        if( ORTE_SUCCESS != orte_sstore_base_metadata_read_next_token(metadata, &token, &value) ) {
            break;
        }

        if(0 == strncmp(token, SSTORE_METADATA_GLOBAL_SNAP_SEQ_STR,  strlen(SSTORE_METADATA_GLOBAL_SNAP_SEQ_STR)) ||
           0 == strncmp(token, SSTORE_METADATA_INTERNAL_MIG_SEQ_STR, strlen(SSTORE_METADATA_INTERNAL_MIG_SEQ_STR)) ) {
            break;
        }

        if( 0 == strncmp(token, SSTORE_METADATA_INTERNAL_PROCESS_STR, strlen(SSTORE_METADATA_INTERNAL_PROCESS_STR)) ) {
            orte_util_convert_string_to_process_name(&proc, value);

            /* Not the first process, so append it to the list */
            if( NULL != vpid_snapshot) {
                opal_list_append(&global_snapshot->local_snapshots, &(vpid_snapshot->super));
            }

            vpid_snapshot = OBJ_NEW(orte_sstore_base_local_snapshot_info_t);
            vpid_snapshot->ss_handle = global_snapshot->ss_handle;

            vpid_snapshot->process_name.jobid  = proc.jobid;
            vpid_snapshot->process_name.vpid   = proc.vpid;
            ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,proc.epoch);
        }
        else if(0 == strncmp(token, SSTORE_METADATA_LOCAL_CRS_COMP_STR, strlen(SSTORE_METADATA_LOCAL_CRS_COMP_STR))) {
            vpid_snapshot->crs_comp = strdup(value);
        }
        else if(0 == strncmp(token, SSTORE_METADATA_LOCAL_COMPRESS_COMP_STR, strlen(SSTORE_METADATA_LOCAL_COMPRESS_COMP_STR))) {
            vpid_snapshot->compress_comp = strdup(value);
        }
        else if(0 == strncmp(token, SSTORE_METADATA_LOCAL_COMPRESS_POSTFIX_STR, strlen(SSTORE_METADATA_LOCAL_COMPRESS_POSTFIX_STR))) {
            vpid_snapshot->compress_postfix = strdup(value);
        }
        else if(0 == strncmp(token, SSTORE_METADATA_INTERNAL_TIME_STR, strlen(SSTORE_METADATA_INTERNAL_TIME_STR)) ) {
            if( NULL == global_snapshot->start_time) {
                global_snapshot->start_time = strdup(value);
            }
            else {
                global_snapshot->end_time   = strdup(value);
            }
        }
        else if(0 == strncmp(token, SSTORE_METADATA_GLOBAL_AMCA_PARAM_STR, strlen(SSTORE_METADATA_GLOBAL_AMCA_PARAM_STR))) {
            global_snapshot->amca_param  = strdup(value);
        }
    } while(0 == feof(metadata) );
    
    /* Append the last item */
    if( NULL != vpid_snapshot) {
        opal_list_append(&global_snapshot->local_snapshots, &(vpid_snapshot->super));
    }
    
 cleanup:
    if( NULL != metadata ) {
        fclose(metadata);
        metadata = NULL;
    }
    if( NULL != value ) {
        free(value);
        value = NULL;
    }
    if( NULL != token ) {
        free(token);
        token = NULL;
    }

    return exit_status;
}
Example #17
/*
 * Claim a slot for a specified job on a node
 */
int orte_rmaps_base_claim_slot(orte_job_t *jdata,
                               orte_node_t *current_node,
                               int32_t cpus_per_rank,
                               orte_std_cntr_t app_idx,
                               opal_list_t *nodes,
                               bool oversubscribe,
                               bool remove_from_list,
                               orte_proc_t **returnproc)
{
    orte_proc_t *proc;
    bool oversub;
    int rc;

    /* if we were given a proc, just use it */
    if (NULL != returnproc && NULL != *returnproc) {
        proc = *returnproc;
    } else {
        /* create mapped_proc object */
        proc = OBJ_NEW(orte_proc_t);
        if (NULL == proc) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        /* set the jobid */
        proc->name.jobid = jdata->jobid;
        /* flag the proc as ready for launch */
        proc->state = ORTE_PROC_STATE_INIT;
        /* we do not set the vpid here - this will be done
         * during a second phase
         */

        /* We do set the epoch here since they all start with the same value. */
        ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);

        proc->app_idx = app_idx;
        OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                             "%s rmaps:base:claim_slot: created new proc %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc->name)));
        
        /* provide returned proc, if requested */
        if (NULL != returnproc) {
            *returnproc = proc;
        }
    }

    OBJ_RETAIN(current_node);  /* maintain accounting on object */
    
    proc->node = current_node;
    proc->nodename = current_node->name;
    
    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                         "%s rmaps:base:claim_slot mapping proc in job %s to node %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid), current_node->name));
    
    /* Be sure to demarcate the slots for this proc as claimed from the node */
    current_node->slots_inuse += 1;
    
    /* see if this node is oversubscribed now */
    if (current_node->slots_inuse > current_node->slots) {
        oversub = true;
    } else {
        oversub = false;
    }
    
    /* assign the proc to the node and ensure the node is on the map */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_add_proc_to_map(jdata->map, current_node,
                                                              oversub, proc))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(proc);
        return rc;
    }
    
    /* If this node has reached its max number of allocatable slots OR it has
     * reached the soft limit AND we are in a "no oversubscribe" state, then
     * we need to return a flag telling the mapper this is the case so it
     * can move on to the next node
     */
    if ((0 != current_node->slots_max  &&
        current_node->slots_inuse >= current_node->slots_max) ||
        (!oversubscribe && current_node->slots_inuse >= current_node->slots)) {
        /* see if we are supposed to remove the node from the list - some
         * mappers want us to do so to avoid any chance of continuing to
         * add procs to it
         */
        if (NULL != nodes && remove_from_list) {
            opal_list_remove_item(nodes, (opal_list_item_t*)current_node);
            /* release it - it was retained when we started, so this
             * just ensures the instance counter is correctly updated
             */
            OBJ_RELEASE(current_node);
        }
        /* now return the proper code so the caller knows this node
         * is fully used
         */
        return ORTE_ERR_NODE_FULLY_USED;
    }

    return ORTE_SUCCESS;
}
Example #18
int orte_util_build_daemon_nidmap(char **nodes)
{
    orte_nid_t *node;
    int i, num_nodes;
    int rc;
    struct hostent *h;
    opal_buffer_t buf;
    orte_process_name_t proc;
    char *uri, *addr;
    char *proc_name;
    
    num_nodes = opal_argv_count(nodes);
    
    OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                         "%s orte:util:build:daemon:nidmap found %d nodes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_nodes));
    
    if (0 == num_nodes) {
        /* nothing to do */
        return ORTE_SUCCESS;
    }
    
    /* set the size of the nidmap storage so we minimize realloc's */
    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(&orte_nidmap, num_nodes+1))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    /* install the entry for the HNP */
    node = OBJ_NEW(orte_nid_t);
    node->name = strdup("HNP");
    node->daemon = 0;
    /* the arch defaults to our arch so that non-hetero
     * case will yield correct behavior
     */
    opal_pointer_array_set_item(&orte_nidmap, 0, node);        
    
    /* the daemon vpids will be assigned in order,
     * starting with vpid=1 for the first node in
     * the list
     */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    proc.jobid = ORTE_PROC_MY_NAME->jobid;
    for (i=0; i < num_nodes; i++) {
        node = OBJ_NEW(orte_nid_t);
        node->name = strdup(nodes[i]);
        node->daemon = i+1;
        /* the arch defaults to our arch so that non-hetero
         * case will yield correct behavior
         */
        opal_pointer_array_set_item(&orte_nidmap, node->daemon, node);        
        
        /* lookup the address of this node */
        if (NULL == (h = gethostbyname(node->name))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            return ORTE_ERR_NOT_FOUND;
        }
        addr = inet_ntoa(*(struct in_addr*)h->h_addr_list[0]);
        
        /* since we are using static ports, all my fellow daemons will be on my
         * port. Setup the contact info for each daemon in my hash tables. Note
         * that this will -not- open a port to those daemons, but will only
         * define the info necessary for opening such a port if/when I communicate
         * to them
         */
        /* construct the URI */
        proc.vpid = node->daemon;
        ORTE_EPOCH_SET(proc.epoch,ORTE_EPOCH_MIN);

        orte_util_convert_process_name_to_string(&proc_name, &proc);
        asprintf(&uri, "%s;tcp://%s:%d", proc_name, addr, (int)orte_process_info.my_port);
        OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                             "%s orte:util:build:daemon:nidmap node %s daemon %d addr %s uri %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             node->name, (int)node->daemon, addr, uri));
        opal_dss.pack(&buf, &uri, 1, OPAL_STRING);
        free(proc_name);
        free(uri);
    }
    
    /* load the hash tables */
    if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) {
        ORTE_ERROR_LOG(rc);
    }
    OBJ_DESTRUCT(&buf);

    return rc;
}
Example #19
static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
{
    int rc=ORTE_SUCCESS;
    opal_list_item_t *item;
    orte_namelist_t *nm;
    opal_buffer_t tmp_buf;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:hier entering allgather",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* have I initialized my local info? */
    if (!coll_initialized) {
        orte_process_name_t proc;
        orte_vpid_t v;
        
        /* get my local rank so I can locally cache it */
        my_local_rank = orte_ess.get_local_rank(ORTE_PROC_MY_NAME);
        
        /* if I am local_rank=0 for this node and job, then setup
         * my array of local_rank=0 peers
         */
        if (0 == my_local_rank) {
            /* we need one entry/node in this job */
            my_coll_peers = (orte_vpid_t*)malloc(orte_process_info.num_nodes * sizeof(orte_vpid_t));
            cpeers = 0;
        }
        
        /* cycle through the procs to create a list of those that are local to me */
        proc.jobid = ORTE_PROC_MY_NAME->jobid;
        for (v=0; v < orte_process_info.num_procs; v++) {
            proc.vpid = v;
            ORTE_EPOCH_SET(proc.epoch,orte_util_lookup_epoch(&proc));

            /* is this proc local_rank=0 on its node? */
            if (0 == my_local_rank && 0 == orte_ess.get_local_rank(&proc)) {
                my_coll_peers[cpeers++] = v;
            }
            /* if this is me, or this proc isn't on our node, ignore it */
            if (v == ORTE_PROC_MY_NAME->vpid ||
                !OPAL_PROC_ON_LOCAL_NODE(orte_ess.proc_get_locality(&proc))) {
                continue;
            }
            /* add this proc to our list of local peers */
            nm = OBJ_NEW(orte_namelist_t);
            nm->name.jobid = proc.jobid;
            nm->name.vpid = proc.vpid;
            ORTE_EPOCH_SET(nm->name.epoch,proc.epoch);

            opal_list_append(&my_local_peers, &nm->item);
            /* if I am not local_rank=0, is this one? */
            if (0 != my_local_rank &&
                0 == orte_ess.get_local_rank(&proc)) {
                my_local_rank_zero_proc.jobid = proc.jobid;
                my_local_rank_zero_proc.vpid = proc.vpid;
                ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch,proc.epoch);
            }
        }

        /* compute the number of local peers - note that this number
         * does not include me!!
         */
        num_local_peers = opal_list_get_size(&my_local_peers);
        
        /* flag that I have initialized things */
        coll_initialized = true;
    }

    /* if I am not local rank = 0 */
    if (0 != my_local_rank) {
        if (ORTE_VPID_INVALID == my_local_rank_zero_proc.vpid) {
            /* something is broken */
            ORTE_ERROR_LOG(ORTE_ERR_FATAL);
            return ORTE_ERR_FATAL;
        }

        /* setup the collective */
        OPAL_THREAD_LOCK(&allgather.lock);
        allgather.recvd = 0;
        /* reset the collector */
        OBJ_DESTRUCT(&allgather.results);
        OBJ_CONSTRUCT(&allgather.results, opal_buffer_t);
        OPAL_THREAD_UNLOCK(&allgather.lock);
        
        /* send our data to the local_rank=0 proc on this node */
        if (0 > (rc = orte_rml.send_buffer(&my_local_rank_zero_proc, sbuf, ORTE_RML_TAG_ALLGATHER, 0))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* now receive the final result. Be sure to do this in
         * a manner that allows us to return without being in a recv!
         */
        rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER,
                                     ORTE_RML_NON_PERSISTENT, allgather_recv, &allgather);
        if (rc != ORTE_SUCCESS) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* wait to complete - we will receive a single message
         * sent from our local_rank=0 peer
         */
        OPAL_THREAD_LOCK(&allgather.lock);
        while (allgather.recvd < 1) {
            opal_condition_wait(&allgather.cond, &allgather.lock);
        }
        /* copy payload to the caller's buffer */
        if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, &allgather.results))) {
            ORTE_ERROR_LOG(rc);
        }
        OPAL_THREAD_UNLOCK(&allgather.lock);
        
        
    } else {
        /* I am local_rank = 0 on this node! */

        /* setup the collective */
        OPAL_THREAD_LOCK(&allgather.lock);
        allgather.recvd = 0;
        /* reset the collector */
        OBJ_DESTRUCT(&allgather.results);
        OBJ_CONSTRUCT(&allgather.results, opal_buffer_t);
        /* seed with my data */
        opal_dss.copy_payload(&allgather.results, sbuf);
        OPAL_THREAD_UNLOCK(&allgather.lock);

        /* wait to receive their data. Be sure to do this in
         * a manner that allows us to return without being in a recv!
         */
        rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER,
                                     ORTE_RML_PERSISTENT, allgather_recv, &allgather);
        if (rc != ORTE_SUCCESS) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* wait to complete - we need to receive input from every
         * local peer (excluding myself)
         */
        OPAL_THREAD_LOCK(&allgather.lock);
        while (allgather.recvd < num_local_peers) {
            opal_condition_wait(&allgather.cond, &allgather.lock);
        }
        /* xfer to the tmp buf in case another allgather comes along */
        OBJ_CONSTRUCT(&tmp_buf, opal_buffer_t);
        opal_dss.copy_payload(&tmp_buf, &allgather.results);
        OPAL_THREAD_UNLOCK(&allgather.lock);
        
        /* cancel the lingering recv */
        orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER);
        
        /* take the recv'd data and use one of the base collectives
         * to exchange it with all other local_rank=0 procs in a scalable
         * manner - the exact collective will depend upon the number of
         * nodes in the job
         */
        if (ORTE_SUCCESS != (rc = orte_grpcomm_base_allgather(&tmp_buf, rbuf, num_local_peers + 1,
                                                              ORTE_PROC_MY_NAME->jobid,
                                                              cpeers, my_coll_peers))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&tmp_buf);
            return rc;
        }
        OBJ_DESTRUCT(&tmp_buf);  /* done with this */

        /* distribute the results to our local peers */
        for (item = opal_list_get_first(&my_local_peers);
             item != opal_list_get_end(&my_local_peers);
             item = opal_list_get_next(item)) {
            nm = (orte_namelist_t*)item;
            if (0 > (rc = orte_rml.send_buffer(&nm->name, rbuf, ORTE_RML_TAG_ALLGATHER, 0))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }
    }

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:hier allgather completed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    return ORTE_SUCCESS;
}
Example #20
/* For a complete description of this algorithm, please look at
 * ompi/mca/coll/tuned/coll_tuned_allgather.c
 */
static int recursivedoubling(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries,
                             orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids)
{
    orte_vpid_t rank, distance, nv;
    int32_t num_remote, total_entries, cnt;
    opal_buffer_t collection, buf;
    orte_process_name_t peer;
    int rc;
    
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:coll:recdub algo employed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* initialize */
    total_entries = num_entries;
    
    /* start by seeding the collection with our own data */
    OBJ_CONSTRUCT(&collection, opal_buffer_t);
    opal_dss.copy_payload(&collection, sendbuf);
    
    /* collective is constrained to take place within the specified jobid */
    peer.jobid = jobid;
    
    /* Communication step:
     At every step i, rank r:
     - exchanges message containing all data collected so far with rank peer = (r ^ 2^i).
     */
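    /* Illustration (np = 4): step 0 (distance 1) pairs ranks 0<->1 and 2<->3;
     * step 1 (distance 2) pairs ranks 0<->2 and 1<->3. After log2(np) steps
     * every rank holds the data of all np participants.
     */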
    /* find my position in the group of participants. This
     * value is the "rank" we will use in the algo
     */
    rank = ORTE_VPID_INVALID;
    for (nv=0; nv < np; nv++) {
        if (vpids[nv] == ORTE_PROC_MY_NAME->vpid) {
            rank = nv;
            break;
        }
    }
    
    /* check for bozo case */
    if (ORTE_VPID_INVALID == rank) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
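    /* note: the rank ^ distance peer computation assumes np is a power of two;
     * otherwise the index can fall outside vpids[] - the tuned version referenced
     * in the header comment presumably handles non-power-of-two sizes separately */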
    
    for (distance = 0x1; distance < np; distance<<=1) {
        
        /* first send my current contents */
        nv = rank ^ distance;
        peer.vpid = vpids[nv];
        ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer));

        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32);
        opal_dss.copy_payload(&buf, &collection);
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:recdub sending to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&peer)));
        if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        OBJ_DESTRUCT(&buf);
        
        /* now setup to recv from my other partner - num_recvd and bucket are
         * file-scope state shared with the orte_grpcomm_base_coll_recv callback */
        num_recvd = 0;
        OBJ_CONSTRUCT(&bucket, opal_buffer_t);
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(&peer,
                                                          ORTE_RML_TAG_DAEMON_COLLECTIVE,
                                                          ORTE_RML_NON_PERSISTENT,
                                                          orte_grpcomm_base_coll_recv,
                                                          NULL))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* and wait for it to get here */
        ORTE_PROGRESSED_WAIT(false, num_recvd, 1);
        
        /* extract the number of entries in the remote buffer */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&bucket, &num_remote, &cnt, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* add it to our running total */
        total_entries += num_remote;
        
        /* transfer the data to our collection */
        opal_dss.copy_payload(&collection, &bucket);
        
        /* cleanup */
        OBJ_DESTRUCT(&bucket);
    }
    
    /* output of a collective begins with the total number of entries */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(recvbuf, &total_entries, 1, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    /* transfer the collected data */
    opal_dss.copy_payload(recvbuf, &collection);
    
    /* cleanup */
    OBJ_DESTRUCT(&collection);
    
    return ORTE_SUCCESS;
}
Example #21
/*
 * The Two-Proc Algorithm
 *
 * One sends to zero, zero waits to recv from one
 * Zero adds its data to message, sends result back to one
 */
static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries,
                   orte_jobid_t jobid, orte_vpid_t *vpids)
{
    orte_process_name_t peer;
    int32_t num_remote, cnt;
    int rc;
    opal_buffer_t buf;
    
    peer.jobid = jobid;
    
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:coll:two-proc algo employed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    if (vpids[0] == ORTE_PROC_MY_NAME->vpid) {
        /* I send first */
        peer.vpid = vpids[1];
        ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer));

        /* setup a temp buffer so I can inform the other side as to the
         * number of entries in my buffer
         */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32);
        opal_dss.copy_payload(&buf, sendbuf);
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:two-proc sending to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&peer)));
        
        if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        OBJ_DESTRUCT(&buf);
        
        /* wait for reply */
        num_recvd = 0;
        OBJ_CONSTRUCT(&bucket, opal_buffer_t);
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                          ORTE_RML_TAG_DAEMON_COLLECTIVE,
                                                          ORTE_RML_NON_PERSISTENT,
                                                          orte_grpcomm_base_coll_recv,
                                                          NULL))) {
            ORTE_ERROR_LOG(rc);
        }
        
        ORTE_PROGRESSED_WAIT(false, num_recvd, 1);
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:two-proc got my return message",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        
    } else {
        /* if I am not the start, then I recv first */
        num_recvd = 0;
        OBJ_CONSTRUCT(&bucket, opal_buffer_t);
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                          ORTE_RML_TAG_DAEMON_COLLECTIVE,
                                                          ORTE_RML_NON_PERSISTENT,
                                                          orte_grpcomm_base_coll_recv,
                                                          NULL))) {
            ORTE_ERROR_LOG(rc);
        }
        
        ORTE_PROGRESSED_WAIT(false, num_recvd, 1);
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:two-proc got my starting message",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        
        /* send my data back */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32);
        opal_dss.copy_payload(&buf, sendbuf);
        peer.vpid = vpids[0];
        ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer));

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:two-proc sending to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&peer)));
        if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        OBJ_DESTRUCT(&buf);
    }
    
    /* extract the number of entries in the remote buffer */
    cnt = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&bucket, &num_remote, &cnt, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    /* output of a collective begins with the total number of entries */
    num_remote += num_entries;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(recvbuf, &num_remote, 1, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    /* xfer my data */
    opal_dss.copy_payload(recvbuf, sendbuf);
    /* xfer the recvd data */
    opal_dss.copy_payload(recvbuf, &bucket);
    
    /* cleanup */
    OBJ_DESTRUCT(&bucket);
    
    return ORTE_SUCCESS;
}
Example #22
void orte_grpcomm_base_daemon_collective(orte_process_name_t *sender,
                                         opal_buffer_t *data)
{
    orte_jobid_t jobid;
    orte_odls_job_t *jobdat;
    orte_routed_tree_t *child;
    orte_std_cntr_t n;
    opal_list_t daemon_tree;
    opal_list_item_t *item, *next;
    int32_t num_contributors;
    opal_buffer_t buf;
    orte_process_name_t my_parent, proc;
    orte_vpid_t daemonvpid;
    int rc;
    int32_t numc;
    orte_rml_tag_t rmltag;
    
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:base:daemon_coll: daemon collective called",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* unpack the jobid using this collective */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobid, &n, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        return;
    }
    
    /* lookup the job record for it */
    jobdat = NULL;
    for (item = opal_list_get_first(&orte_local_jobdata);
         item != opal_list_get_end(&orte_local_jobdata);
         item = opal_list_get_next(item)) {
        jobdat = (orte_odls_job_t*)item;
        
        /* is this the specified job? */
        if (jobdat->jobid == jobid) {
            break;
        }
    }
    if (NULL == jobdat) {
        /* race condition - someone sent us a collective before we could
         * parse the add_local_procs cmd. Just add the jobdat object
         * and continue
         */
        jobdat = OBJ_NEW(orte_odls_job_t);
        jobdat->jobid = jobid;
        opal_list_append(&orte_local_jobdata, &jobdat->super);
    }
    
    /* it may be possible to get here prior to having actually finished processing our
     * local launch msg due to the race condition between different nodes and when
     * they start their individual procs. Hence, we have to first ensure that we
     * -have- finished processing the launch msg, or else we won't know whether
     * or not to wait before sending this on
     */
    OPAL_THREAD_LOCK(&jobdat->lock);
    while (!jobdat->launch_msg_processed) {
        opal_condition_wait(&jobdat->cond, &jobdat->lock);
    }
    OPAL_THREAD_UNLOCK(&jobdat->lock);
    
    /* unpack the tag for this collective */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &rmltag, &n, ORTE_RML_TAG))) {
        ORTE_ERROR_LOG(rc);
        return;
    }
    
    /* unpack the number of contributors in this data bucket */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &num_contributors, &n, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return;
    }
    jobdat->num_contributors += num_contributors;
    
    /* xfer the data */
    opal_dss.copy_payload(&jobdat->collection_bucket, data);
    
    /* count the number of participants collected */
    jobdat->num_collected++;
    
    /* if we haven't already done so, figure out how many participants we
     * should be expecting
     */
    if (jobdat->num_participating < 0) {
        if (0 < jobdat->num_local_procs) {
            /* we have children, so account for our own participation */
            jobdat->num_participating = 1;
        } else {
            jobdat->num_participating = 0;
        }
        /* now see if anyone else will be sending us something */
        OBJ_CONSTRUCT(&daemon_tree, opal_list_t);
        orte_routed.get_routing_tree(&daemon_tree);
        /* unfortunately, there is no simple way to determine which of our "child"
         * daemons in the routing tree will be sending us something. All we can do
         * is brute force a search, though we attempt to keep it as short as possible
         */
        proc.jobid = jobid;
        proc.vpid = 0;
        while (proc.vpid < jobdat->num_procs && 0 < opal_list_get_size(&daemon_tree)) {
            ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&proc));

            /* get the daemon that hosts this proc */
            daemonvpid = orte_ess.proc_get_daemon(&proc);
            /* is this daemon one of our children, or at least its contribution
             * will pass through one of our children
             */
            item = opal_list_get_first(&daemon_tree);
            while (item != opal_list_get_end(&daemon_tree)) {
                next = opal_list_get_next(item);
                child = (orte_routed_tree_t*)item;
                if (child->vpid == daemonvpid || opal_bitmap_is_set_bit(&child->relatives, daemonvpid)) {
                    /* it does - add to num_participating */
                    jobdat->num_participating++;
                    /* remove this from the list so we don't double count it */
                    opal_list_remove_item(&daemon_tree, item);
                    /* done with search */
                    break;
                }
                item = next;
            }
            proc.vpid++;
        }
    }
    
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:base:daemon_coll: daemon collective for job %s from %s type %ld"
                         " num_collected %d num_participating %d num_contributors %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jobid),
                         ORTE_NAME_PRINT(sender),
                         (long)jobdat->collective_type, jobdat->num_collected,
                         jobdat->num_participating, jobdat->num_contributors));
    
    if (jobdat->num_collected == jobdat->num_participating) {
        /* if I am the HNP, go process the results */
        if (ORTE_PROC_IS_HNP) {
            goto hnp_process;
        }
        
        /* if I am not the HNP, send to my parent */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        /* pack the jobid */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jobid, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&buf);  /* don't leak the buffer on an early return */
            return;
        }
        /* pack the target tag */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &rmltag, 1, ORTE_RML_TAG))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&buf);
            return;
        }
        /* pack the number of contributors */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jobdat->num_contributors, 1, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&buf);
            return;
        }
        /* xfer the payload */
        if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&buf, &jobdat->collection_bucket))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&buf);
            return;
        }
        /* reset everything for next collective */
        jobdat->num_contributors = 0;
        jobdat->num_collected = 0;
        OBJ_DESTRUCT(&jobdat->collection_bucket);
        OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t);
        /* send it */
        my_parent.jobid = ORTE_PROC_MY_NAME->jobid;
        my_parent.vpid = orte_routed.get_routing_tree(NULL);
        ORTE_EPOCH_SET(my_parent.epoch,orte_ess.proc_get_epoch(&my_parent));

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:base:daemon_coll: daemon collective not the HNP - sending to parent %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&my_parent)));
        if (0 > (rc = orte_rml.send_buffer(&my_parent, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&buf);
            return;
        }
        OBJ_DESTRUCT(&buf);
    }
    return;
    
hnp_process:
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:base:daemon_coll: daemon collective HNP - xcasting to job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jobid)));
    /* setup a buffer to send the results back to the job members */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    
    /* add any collected data */
    numc = jobdat->num_contributors;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &numc, 1, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&buf, &jobdat->collection_bucket))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    /* reset everything for next collective */
    jobdat->num_contributors = 0;
    jobdat->num_collected = 0;
    OBJ_DESTRUCT(&jobdat->collection_bucket);
    OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t);
    /* send the buffer */
    if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(jobid, &buf, rmltag))) {
        ORTE_ERROR_LOG(rc);
    }
    
cleanup:
    OBJ_DESTRUCT(&buf);
    
    return;    
}
Exemple #23
0
int
main(int argc, char *argv[])
{
    int count;
    int msgsize;
    uint8_t *msg;
    int i, j, rc;
    orte_process_name_t peer;
    double maxpower;
    
    /*
     * Init
     */
    orte_init(&argc, &argv, ORTE_PROC_NON_MPI);

    if (argc > 1) {
        count = atoi(argv[1]);
        if (count < 0) {
            count = INT_MAX-1;
        }
    } else {
        count = MAX_COUNT;
    }
    
    peer.jobid = ORTE_PROC_MY_NAME->jobid;
    
    for (j=1; j < count+1; j++) {
        peer.vpid = (ORTE_PROC_MY_NAME->vpid + j) % orte_process_info.num_procs;
        ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer));
        
        /* rank0 starts ring */
        if (ORTE_PROC_MY_NAME->vpid == 0) {
            /* setup the initiating buffer - put random sized message in it */
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            
            maxpower = (double)(j%7);
            msgsize = (int)pow(10.0, maxpower);
            opal_output(0, "Ring %d message size %d bytes", j, msgsize);
            msg = (uint8_t*)malloc(msgsize);
            opal_dss.pack(&buf, msg, msgsize, OPAL_BYTE);
            free(msg);  /* the pack copies the bytes, so release our copy */
            
            if (0 > (rc = orte_rml.send_buffer(&peer, &buf, MY_TAG, 0))) {
                opal_output(0, "%s error sending to %s %s\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&peer), ORTE_ERROR_NAME(rc));
                exit(1);
            }
            OBJ_DESTRUCT(&buf);
            /* wait for it to come around */
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            msg_recvd = false;
            orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG,
                                    ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
            
            ORTE_PROGRESSED_WAIT(msg_recvd, 0, 1);
            opal_output(0, "%s Ring %d completed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j);
            OBJ_DESTRUCT(&buf);  /* release the returned message */
        } else {
            /* wait for msg */
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            msg_recvd = false;
            orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG,
                                    ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
            
            ORTE_PROGRESSED_WAIT(msg_recvd, 0, 1);
            /* send it along */
            if (0 > (rc = orte_rml.send_buffer(&peer, &buf, MY_TAG, 0))) {
                opal_output(0, "%s error sending to %s %s\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&peer), ORTE_ERROR_NAME(rc));
                exit(1);
            }
            OBJ_DESTRUCT(&buf);
        }
    }

    orte_finalize();

    return 0;
}
Exemple #24
0
/*****************
 * Local Functions
 *****************/
static void errmgr_autor_process_fault_app(orte_job_t *jdata,
                                           orte_process_name_t *proc,
                                           orte_proc_state_t state)
{
    errmgr_autor_wp_item_t *wp_item = NULL;
    struct timeval soon;

    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "%s errmgr:hnp(autor): process_fault() "
                         "Process fault! proc %s (0x%x)",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         state));

    if( !orte_sstore_base_is_checkpoint_available ) {
        OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                             "%s errmgr:hnp(autor): process_fault() "
                             "No checkpoints are available for this job! Cannot Automaticly Recover!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ));
        opal_show_help("help-orte-errmgr-hnp.txt", "autor_failed_to_recover_proc", true,
                       ORTE_NAME_PRINT(proc), proc->vpid);
        return;
    }

    mca_errmgr_hnp_component.ignore_current_update = true;

    /*
     * If we are already in the shutdown stage of the recovery, then just skip it
     */
    if( autor_mask_faults ) {
        OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                             "%s errmgr:hnp(autor):process_fault() "
                             "Currently recovering the job. Failure masked!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return;
    }

    /*
     * Append this process to the list to process
     */
    wp_item = OBJ_NEW(errmgr_autor_wp_item_t);
    wp_item->name.jobid = proc->jobid;
    wp_item->name.vpid = proc->vpid;
    ORTE_EPOCH_SET(wp_item->name.epoch,proc->epoch);
    wp_item->state = state;

    opal_list_append(procs_pending_recovery, &(wp_item->super));

    /*
     * Activate the timer, if it is not already setup
     */
    if( !autor_timer_active ) {
        autor_timer_active = true;

        opal_event_evtimer_set(opal_event_base, autor_timer_event, errmgr_autor_recover_processes, NULL);
        soon.tv_sec  = mca_errmgr_hnp_component.autor_recovery_delay;
        soon.tv_usec = 0;
        opal_event_evtimer_add(autor_timer_event, &soon);
    }

    return;
}
Exemple #25
0
/* Setup to read local data. If the tag is other than STDIN,
 * then this is output being pushed from one of my child processes
 * and I'll write the data out myself. If the tag is STDIN,
 * then I need to setup to read from my stdin, and send anything
 * I get to the specified dst_name. The dst_name in this case tells
 * us which procs are to get stdin - only two options are supported:
 *
 * (a) a specific name, usually vpid=0; or
 *
 * (b) all procs, specified by vpid=ORTE_VPID_WILDCARD
 *
 * The orte_plm_base_launch_apps function calls iof.push after
 * the procs are launched and tells us how to distribute stdin. This
 * ensures that the procs are started -before- we begin reading stdin
 * and attempting to send it to remote procs
 */
static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, int fd)
{
    orte_job_t *jdata;
    orte_proc_t *proc;
    orte_iof_sink_t *sink;
    orte_iof_proc_t *proct;
    opal_list_item_t *item;
    int flags;
    char *outfile;
    int fdout;
    orte_odls_job_t *jobdat=NULL;
    int np, numdigs;
    int rc;
    orte_ns_cmp_bitmask_t mask;

    /* don't do this if the dst vpid is invalid or the fd is negative! */
    if (ORTE_VPID_INVALID == dst_name->vpid || fd < 0) {
        return ORTE_SUCCESS;
    }
    
    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s iof:hnp pushing fd %d for process %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         fd, ORTE_NAME_PRINT(dst_name)));
    
    if (!(src_tag & ORTE_IOF_STDIN)) {
        /* set the file descriptor to non-blocking - do this before we setup
         * and activate the read event in case it fires right away
         */
        if((flags = fcntl(fd, F_GETFL, 0)) < 0) {
            opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n", 
                        __FILE__, __LINE__, errno);
        } else {
            flags |= O_NONBLOCK;
            fcntl(fd, F_SETFL, flags);
        }
        /* do we already have this process in our list? */
        for (item = opal_list_get_first(&mca_iof_hnp_component.procs);
             item != opal_list_get_end(&mca_iof_hnp_component.procs);
             item = opal_list_get_next(item)) {
            proct = (orte_iof_proc_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, dst_name)) {
                /* found it */
                goto SETUP;
            }
        }
        /* if we get here, then we don't yet have this proc in our list */
        proct = OBJ_NEW(orte_iof_proc_t);
        proct->name.jobid = dst_name->jobid;
        proct->name.vpid = dst_name->vpid;
        ORTE_EPOCH_SET(proct->name.epoch,dst_name->epoch);
        opal_list_append(&mca_iof_hnp_component.procs, &proct->super);
        /* see if we are to output to a file */
        if (NULL != orte_output_filename) {
            /* get the local jobdata for this proc */
            for (item = opal_list_get_first(&orte_local_jobdata);
                 item != opal_list_get_end(&orte_local_jobdata);
                 item = opal_list_get_next(item)) {
                jobdat = (orte_odls_job_t*)item;
                if (jobdat->jobid == proct->name.jobid) {
                    break;
                }
            }
            if (NULL == jobdat) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                return ORTE_ERR_NOT_FOUND;
            }
            np = jobdat->num_procs / 10;
            /* determine the number of digits required for max vpid */
            numdigs = 1;
            while (np > 0) {
                numdigs++;
                np = np / 10;
            }
            /* construct the filename */
            asprintf(&outfile, "%s.%d.%0*lu", orte_output_filename,
                     (int)ORTE_LOCAL_JOBID(proct->name.jobid),
                     numdigs, (unsigned long)proct->name.vpid);
            /* create the file */
            fdout = open(outfile, O_CREAT|O_RDWR|O_TRUNC, 0644);
            free(outfile);
            if (fdout < 0) {
                /* couldn't be opened */
                ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
                return ORTE_ERR_FILE_OPEN_FAILURE;
            }
            /* define a sink to that file descriptor */
            ORTE_IOF_SINK_DEFINE(&sink, dst_name, fdout, ORTE_IOF_STDOUTALL,
                                 orte_iof_base_write_handler,
                                 &mca_iof_hnp_component.sinks);
        }
        
    SETUP:
        /* define a read event and activate it */
        if (src_tag & ORTE_IOF_STDOUT) {
            ORTE_IOF_READ_EVENT(&proct->revstdout, dst_name, fd, ORTE_IOF_STDOUT,
                                orte_iof_hnp_read_local_handler, false);
        } else if (src_tag & ORTE_IOF_STDERR) {
            ORTE_IOF_READ_EVENT(&proct->revstderr, dst_name, fd, ORTE_IOF_STDERR,
                                orte_iof_hnp_read_local_handler, false);
        } else if (src_tag & ORTE_IOF_STDDIAG) {
            ORTE_IOF_READ_EVENT(&proct->revstddiag, dst_name, fd, ORTE_IOF_STDDIAG,
                                orte_iof_hnp_read_local_handler, false);
        }
        /* if -all- of the readevents for this proc have been defined, then
         * activate them. Otherwise, we might mistakenly conclude that the proc
         * is complete because one of the readevents fires -prior- to all of
         * them having been defined!
         */
        if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) {
            proct->revstdout->active = true;
            opal_event_add(&(proct->revstdout->ev), 0);
            proct->revstderr->active = true;
            opal_event_add(&(proct->revstderr->ev), 0);
            proct->revstddiag->active = true;
            opal_event_add(&(proct->revstddiag->ev), 0);
        }
        return ORTE_SUCCESS;
    }

    /* if we are pushing stdin, this is happening only during launch - setup
     * a target for this destination if it is going somewhere other than me
     */
    if (ORTE_VPID_WILDCARD == dst_name->vpid) {
        /* if wildcard, define a sink with that info so it gets sent out */
        ORTE_IOF_SINK_DEFINE(&sink, dst_name, -1, ORTE_IOF_STDIN,
                             stdin_write_handler,
                             &mca_iof_hnp_component.sinks);
    } else {
        /* no - lookup the proc's daemon and set that into sink */
        if (NULL == (jdata = orte_get_job_data_object(dst_name->jobid))) {
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            return ORTE_ERR_BAD_PARAM;
        }
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, dst_name->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            return ORTE_ERR_NOT_FOUND;
        }
        /* if it is me, then don't set this up - we'll get it on the pull */
        if (ORTE_PROC_MY_NAME->vpid != proc->node->daemon->name.vpid) {
            ORTE_IOF_SINK_DEFINE(&sink, dst_name, -1, ORTE_IOF_STDIN,
                                 stdin_write_handler,
                                 &mca_iof_hnp_component.sinks);
            sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid;
            sink->daemon.vpid = proc->node->daemon->name.vpid;
            ORTE_EPOCH_SET(sink->daemon.epoch,orte_ess.proc_get_epoch(&sink->daemon));
        }
    }
    
    /* now setup the read - but check to only do this once */
    if (NULL == mca_iof_hnp_component.stdinev) {
        /* Since we are the HNP, we don't want to set nonblocking on our
         * stdio stream.  If we do so, we set the file descriptor to
         * non-blocking for everyone that has that file descriptor, which
         * includes everyone else in our shell pipeline chain.  (See
         * http://lists.freebsd.org/pipermail/freebsd-hackers/2005-January/009742.html).
         * This causes things like "mpirun -np 1 big_app | cat" to lose
         * output, because cat's stdout is then ALSO non-blocking and cat
         * isn't built to deal with that case (same with almost all other
         * unix text utils). 
         */
        if (0 != fd) {
            if((flags = fcntl(fd, F_GETFL, 0)) < 0) {
                opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n", 
                            __FILE__, __LINE__, errno);
            } else {
                flags |= O_NONBLOCK;
                fcntl(fd, F_SETFL, flags);
            }            
        }
        if (isatty(fd)) {
            /* We should avoid trying to read from stdin if we
             * have a terminal, but are backgrounded.  Catch the
             * signals that are commonly used when we switch
             * between being backgrounded and not.  If the
             * file descriptor is not a tty, don't worry about it
             * and always stay connected.
             */
            opal_event_signal_set(opal_event_base, &mca_iof_hnp_component.stdinsig,
                                  SIGCONT, orte_iof_hnp_stdin_cb,
                                  NULL);
            
            /* setup a read event to read stdin, but don't activate it yet. The
             * dst_name indicates who should receive the stdin. If that recipient
             * doesn't do a corresponding pull, however, then the stdin will
             * be dropped upon receipt at the local daemon
             */
            ORTE_IOF_READ_EVENT(&mca_iof_hnp_component.stdinev,
                                dst_name, fd, ORTE_IOF_STDIN,
                                orte_iof_hnp_read_local_handler, false);
            
            /* check to see if we want the stdin read event to be
             * active - we will always at least define the event,
             * but may delay its activation
             */
            if (!(src_tag & ORTE_IOF_STDIN) || orte_iof_hnp_stdin_check(fd)) {
                mca_iof_hnp_component.stdinev->active = true;
                if (OPAL_SUCCESS != (rc = opal_event_add(&(mca_iof_hnp_component.stdinev->ev), 0))) {
                    ORTE_ERROR_LOG(rc);
                }
            }
        } else {
            /* if we are not looking at a tty, just setup a read event
             * and activate it
             */
            ORTE_IOF_READ_EVENT(&mca_iof_hnp_component.stdinev,
                                dst_name, fd, ORTE_IOF_STDIN,
                                orte_iof_hnp_read_local_handler, true);
        }
    }
    return ORTE_SUCCESS;
}
Exemple #26
0
int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
{
    orte_job_t *jdat;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_job_map_t *map;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_app_context_t *app;
    orte_std_cntr_t num_slots;
    int rc, i, n;
    bool ignored;

    /* get the daemon app if provided - may include -host or hostfile
     * info about available nodes
     */
    app = (orte_app_context_t *) opal_pointer_array_get_item(jdata->apps, 0);
    
    map = jdata->map;
    
    /* get the list of all available nodes that do not already
     * have a daemon on them
     */
    OBJ_CONSTRUCT(&node_list, opal_list_t);
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots,
                                                               app, map->policy))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&node_list);
        return rc;
    }
    /* check all other known jobs to see if they have something to
     * add to the allocation - we won't have seen these and the
     * daemon job won't have any in its app
     */
    for (i=0; i < orte_job_data->size; i++) {
        if (NULL == (jdat = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
            continue;
        }
        for (n=0; n < jdat->apps->size; n++) {
            if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdat->apps, n))) {
                continue;
            }
            if (NULL != app->hostfile) {
                /* hostfile was specified - parse it and add it to the list. The
                 * function automatically ignores duplicates
                 */
                if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&node_list,
                                                                       &ignored,
                                                                       app->hostfile))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_DESTRUCT(&node_list);
                    return rc;
                }
            }
            if (NULL != app->dash_host) {
                /* parse and add to list, ignoring duplicates */
                if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&node_list,
                                                                        &ignored,
                                                                        app->dash_host))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_DESTRUCT(&node_list);
                    return rc;
                }
            }
        }
    }

    /* add all these nodes to the map */
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        node = (orte_node_t*)item;
        /* if this is my node, ignore it - we are already here - but
         * release the list's reference before moving on */
        if (0 == strcmp(node->name, orte_process_info.nodename)) {
            OBJ_RELEASE(node);
            continue;
        }
        opal_pointer_array_add(map->nodes, (void*)node);
        ++(map->num_nodes);
        /* if this node already has a daemon, release that object
         * to maintain bookkeeping
         */
        if (NULL != node->daemon) {
            OBJ_RELEASE(node->daemon);
        }
        /* create a new daemon object for this node */
        proc = OBJ_NEW(orte_proc_t);
        if (NULL == proc) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        proc->name.jobid = ORTE_PROC_MY_HNP->jobid;
        if (ORTE_VPID_MAX-1 <= jdata->num_procs) {
            /* no more daemons available */
            orte_show_help("help-orte-rmaps-base.txt", "out-of-vpids", true);
            OBJ_RELEASE(proc);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        proc->name.vpid = jdata->num_procs;  /* take the next available vpid */
        ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
        ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
        proc->node = node;
        proc->nodename = node->name;
        OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                             "%s rmaps:base:setup_vm add new daemon %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc->name)));
        /* add the daemon to the daemon job object */
        if (0 > (rc = opal_pointer_array_add(jdata->procs, (void*)proc))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(proc);  /* don't leak the daemon object on error */
            return rc;
        }
        ++jdata->num_procs;
        /* point the node to the daemon */
        node->daemon = proc;
        OBJ_RETAIN(proc);  /* maintain accounting */
        /* track number of daemons to be launched */
        ++map->num_new_daemons;
        /* and their starting vpid */
        if (ORTE_VPID_INVALID == map->daemon_vpid_start) {
            map->daemon_vpid_start = proc->name.vpid;
        }
    }
    OBJ_DESTRUCT(&node_list);
    
    return ORTE_SUCCESS;
}
Exemple #27
0
static int rte_init(void)
{
    int ret;
    char *error = NULL;
    char **nodes = NULL, **ppnlist = NULL;
    char *envar;
    int32_t jobfam;
    int i, j, *ppn;
    orte_nid_t *node;
    orte_jmap_t *jmap;
    orte_pmap_t *pmap;
    orte_vpid_t vpid;
    bool byslot;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }
    
    /* Only application procs can use this module. Since we
     * were directly launched by someone, we need to bootstrap
     * our own global info so we can startup.
     */
    
    /* ensure that static ports were assigned - otherwise, we can't
     * work since we won't know how to talk to anyone else
     */
    if (NULL == getenv("OMPI_MCA_oob_tcp_static_ports") &&
        NULL == getenv("OMPI_MCA_oob_tcp_static_ports_v6")) {
        error = "static ports were not assigned";
        goto error;
    }

    /* declare ourselves to be standalone - i.e., not launched by orted */
    orte_standalone_operation = true;
    
    /* extract a jobid from the environment - can be totally
     * arbitrary. if one isn't provided, just fake it
     */
    if (NULL != (envar = getenv("OMPI_MCA_orte_jobid"))) {
        jobfam = strtol(envar, NULL, 10);
    } else {
        jobfam = 1;
    }
    ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(0, jobfam);
    
    /* extract a rank from the environment */
    if (NULL == (envar = getenv("OMPI_MCA_orte_rank"))) {
        error = "could not get process rank";
        goto error;
    }
    ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10);
    ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN);

    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                         "%s completed name definition",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* get the number of procs in this job */
    if (NULL == (envar = getenv("OMPI_MCA_orte_num_procs"))) {
        error = "could not get number of processes in job";
        goto error;
    }
    orte_process_info.num_procs = strtol(envar, NULL, 10);

    if (orte_process_info.max_procs < orte_process_info.num_procs) {
        orte_process_info.max_procs = orte_process_info.num_procs;
    }
    
    /* set the app_num so that MPI attributes get set correctly */
    orte_process_info.app_num = 1;

    /* get the list of nodes */
    if (NULL == (envar = getenv("OMPI_MCA_orte_nodes"))) {
        error = "could not get list of nodes";
        goto error;
    }
    /* break this down */
    nodes = opal_argv_split(envar, ',');
    orte_process_info.num_nodes = opal_argv_count(nodes);

    /* get the ppn */
    if (NULL == (envar = getenv("OMPI_MCA_orte_ppn"))) {
        error = "could not get ppn";
        goto error;
    }
    ppnlist = opal_argv_split(envar, ',');
    ppn = (int*)malloc(orte_process_info.num_nodes * sizeof(int));
    if (1 == opal_argv_count(ppnlist)) {
        /* constant ppn */
        j = strtol(ppnlist[0], NULL, 10);
        for (i=0; i < orte_process_info.num_nodes; i++) {
            ppn[i] = j;
        }
    } else {
        for (i=0; i < orte_process_info.num_nodes; i++) {
            ppn[i] = strtol(ppnlist[i], NULL, 10);
        }
    }
    opal_argv_free(ppnlist);

    /* get the mapping mode - default to byslot */
    byslot = true;
    if (NULL != (envar = getenv("OMPI_MCA_mapping")) &&
        0 == strcmp(envar, "bynode")) {
        byslot = false;
    }

    /* setup the nidmap arrays */
    if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_util_nidmap_init";
        goto error;
    }
    
    /* set the size of the nidmap storage so we minimize realloc's */
    if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&orte_nidmap, orte_process_info.num_nodes))) {
        error = "could not set pointer array size for nidmap";
        goto error;
    }
    
    /* construct the nidmap */
    for (i=0; i < orte_process_info.num_nodes; i++) {
        node = OBJ_NEW(orte_nid_t);
        if (0 == strcmp(nodes[i], orte_process_info.nodename) || opal_ifislocal(nodes[i])) {
            node->name = strdup(orte_process_info.nodename);
        } else {
            node->name = strdup(nodes[i]);
        }
        node->daemon = i;
        node->index = i;
        opal_pointer_array_set_item(&orte_nidmap, i, node);
    }
    opal_argv_free(nodes);

    /* create a job map for this job */
    jmap = OBJ_NEW(orte_jmap_t);
    jmap->job = ORTE_PROC_MY_NAME->jobid;
    opal_pointer_array_add(&orte_jobmap, jmap);
    /* update the num procs */
    jmap->num_procs = orte_process_info.num_procs;
    /* set the size of the pidmap storage so we minimize realloc's */
    if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&jmap->pmap, jmap->num_procs))) {
        ORTE_ERROR_LOG(ret);
        error = "could not set pointer array size for pidmap";
        goto error;
    }

    /* construct the pidmap */
    if (byslot) {
        vpid = 0;
        for (i=0; i < orte_process_info.num_nodes; i++) {
            node = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i);
            /* for each node, cycle through the ppn */
            for (j=0; j < ppn[i]; j++) {
                pmap = OBJ_NEW(orte_pmap_t);
                pmap->node = i;
                pmap->local_rank = j;
                pmap->node_rank = j;
                if (ORTE_SUCCESS != (ret = opal_pointer_array_set_item(&jmap->pmap, vpid, pmap))) {
                    ORTE_ERROR_LOG(ret);
                    error = "could not set pmap values";
                    goto error;
                }
                /* if this is me, then define the daemon's vpid to 
                 * be the node number
                 */
                if (vpid == ORTE_PROC_MY_NAME->vpid) {
                    ORTE_PROC_MY_DAEMON->jobid = 0;
                    ORTE_PROC_MY_DAEMON->vpid = i;
                    ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch);
                }
                OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                                     "%s node %d name %s rank %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (int) node->index, node->name, ORTE_VPID_PRINT(vpid)));
                vpid++;
            }
        }
    } else {
        /* cycle across the nodes */
        vpid = 0;
        while (vpid < orte_process_info.num_procs) {
            for (i=0; i < orte_process_info.num_nodes && vpid < orte_process_info.num_procs; i++) {
                node = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i);
                if (0 < ppn[i]) {
                    pmap = OBJ_NEW(orte_pmap_t);
                    pmap->node = i;
                    pmap->local_rank = ppn[i]-1;
                    pmap->node_rank = ppn[i]-1;
                    if (ORTE_SUCCESS != (ret = opal_pointer_array_set_item(&jmap->pmap, vpid, pmap))) {
                        ORTE_ERROR_LOG(ret);
                        error = "could not set pmap values";
                        goto error;
                    }
                    /* if this is me, then define the daemon's vpid to 
                     * be the node number
                     */
                    if (vpid == ORTE_PROC_MY_NAME->vpid) {
                        ORTE_PROC_MY_DAEMON->jobid = 0;
                        ORTE_PROC_MY_DAEMON->vpid = i;
                        ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch);
                    }
                    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                                         "%s node %d name %s rank %d",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         (int) node->index, node->name, (int)vpid));
                    vpid++;
                    --ppn[i];
                }
            }
        }
    }
    free(ppn);

    /* ensure we pick the correct critical components */
    putenv("OMPI_MCA_grpcomm=hier");
    putenv("OMPI_MCA_routed=direct");

    /* use the default procedure to finish my setup */
    if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ess_base_app_setup";
        goto error;
    }

    if (0 < opal_output_get_verbosity(orte_ess_base_output)) {
        orte_nidmap_dump();
        orte_jobmap_dump();
    }

    return ORTE_SUCCESS;

 error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    return ret;
}
Exemple #28
0
static int alps_set_name(void)
{
    int rc;
    orte_jobid_t jobid;
    char *tmp;
    orte_vpid_t vpid;

    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                         "ess:alps setting name"));

    mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid",
                                   true, false, NULL, &tmp);
    if (NULL == tmp) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, tmp))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    free(tmp);

    mca_base_param_reg_string_name("orte", "ess_vpid", "Process vpid",
                                   true, false, NULL, &tmp);
    if (NULL == tmp) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&starting_vpid,
                                                               tmp))) {
        ORTE_ERROR_LOG(rc);
        return(rc);
    }
    free(tmp);

    if (ORTE_SUCCESS != (rc = get_vpid(&vpid, starting_vpid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    ORTE_PROC_MY_NAME->jobid = jobid;
    ORTE_PROC_MY_NAME->vpid = vpid;
    ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_INVALID);
    ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,
                   orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME));

    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                         "ess:alps set name to %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* get the num procs as provided in the cmd line param */
    if (ORTE_SUCCESS != (rc = orte_ess_env_get())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    if (orte_process_info.max_procs < orte_process_info.num_procs) {
        orte_process_info.max_procs = orte_process_info.num_procs;
    }

    return ORTE_SUCCESS;
}