static void orte_proc_construct(orte_proc_t* proc)
{
    proc->name = *ORTE_NAME_INVALID;
    proc->pid = 0;
    proc->local_rank = ORTE_LOCAL_RANK_INVALID;
    proc->node_rank = ORTE_NODE_RANK_INVALID;
    proc->app_rank = -1;
    proc->last_errmgr_state = ORTE_PROC_STATE_UNDEF;
    proc->state = ORTE_PROC_STATE_UNDEF;
    proc->app_idx = 0;
    proc->slot_list = NULL;
    proc->node = NULL;
    proc->prior_node = NULL;
    proc->nodename = NULL;
    proc->exit_code = 0;  /* Assume we won't fail unless otherwise notified */
    proc->rml_uri = NULL;
    proc->restarts = 0;
    proc->fast_failures = 0;
    proc->last_failure.tv_sec = 0;
    proc->last_failure.tv_usec = 0;
    proc->reported = false;
    proc->beat = 0;
    OBJ_CONSTRUCT(&proc->stats, opal_ring_buffer_t);
    opal_ring_buffer_init(&proc->stats, orte_stat_history_size);
    ORTE_EPOCH_SET(proc->name.epoch, ORTE_EPOCH_MIN);
#if OPAL_ENABLE_FT_CR == 1
    proc->ckpt_state = 0;
    proc->ckpt_snapshot_ref = NULL;
    proc->ckpt_snapshot_loc = NULL;
#endif
}
void orte_sstore_base_local_snapshot_info_destruct(orte_sstore_base_local_snapshot_info_t *snapshot)
{
    snapshot->process_name.jobid = 0;
    snapshot->process_name.vpid = 0;
    ORTE_EPOCH_SET(snapshot->process_name.epoch, ORTE_EPOCH_MIN);

    if (NULL != snapshot->crs_comp) {
        free(snapshot->crs_comp);
        snapshot->crs_comp = NULL;
    }

    if (NULL != snapshot->compress_comp) {
        free(snapshot->compress_comp);
        snapshot->compress_comp = NULL;
    }

    if (NULL != snapshot->compress_postfix) {
        free(snapshot->compress_postfix);
        snapshot->compress_postfix = NULL;
    }

    if (NULL != snapshot->start_time) {
        free(snapshot->start_time);
        snapshot->start_time = NULL;
    }

    if (NULL != snapshot->end_time) {
        free(snapshot->end_time);
        snapshot->end_time = NULL;
    }
}
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
{
    opal_pointer_array_t cmd;
    orte_proc_t proc;
    int rc;

    /* stop local sensors for this job */
    if (ORTE_VPID_WILDCARD == vpid) {
        orte_sensor.stop(job);
    }

    if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid) {
        if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
            ORTE_ERROR_LOG(rc);
        }
        return;
    }

    OBJ_CONSTRUCT(&cmd, opal_pointer_array_t);
    OBJ_CONSTRUCT(&proc, orte_proc_t);
    proc.name.jobid = job;
    proc.name.vpid = vpid;
    ORTE_EPOCH_SET(proc.name.epoch, orte_ess.proc_get_epoch(&(proc.name)));
    opal_pointer_array_add(&cmd, &proc);
    if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
        ORTE_ERROR_LOG(rc);
    }
    OBJ_DESTRUCT(&cmd);
    OBJ_DESTRUCT(&proc);
}
void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp)
{
    wp->name.jobid = ORTE_JOBID_INVALID;
    wp->name.vpid = ORTE_VPID_INVALID;
    ORTE_EPOCH_SET(wp->name.epoch, ORTE_EPOCH_INVALID);

    wp->state = 0;
}
static int udp_recv_buffer(orte_process_name_t *name,
                           orte_rmcast_channel_t channel,
                           orte_rmcast_tag_t tag,
                           orte_rmcast_seq_t *seq_num,
                           opal_buffer_t *buf)
{
    rmcast_base_recv_t *recvptr;
    int ret;
    orte_rmcast_channel_t chan;

    ORTE_ACQUIRE_THREAD(&ctl);

    if (!comm_enabled) {
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_ERR_COMM_DISABLED;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                         "%s rmcast:udp: recv_buffer called on multicast channel %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel));

    if (ORTE_RMCAST_GROUP_INPUT_CHANNEL == channel) {
        chan = orte_rmcast_base.my_input_channel->channel;
    } else if (ORTE_RMCAST_GROUP_OUTPUT_CHANNEL == channel) {
        chan = orte_rmcast_base.my_output_channel->channel;
    } else {
        chan = channel;
    }

    if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(&recvptr, chan, tag,
                                                           ORTE_RMCAST_NON_PERSISTENT,
                                                           NULL, NULL, NULL, true))) {
        ORTE_ERROR_LOG(ret);
        ORTE_RELEASE_THREAD(&ctl);
        return ret;
    }
    ORTE_RELEASE_THREAD(&ctl);

    recvptr->ctl.active = true;
    ORTE_ACQUIRE_THREAD(&recvptr->ctl);

    /* xfer the data */
    if (NULL != name) {
        /* caller requested id of sender */
        name->jobid = recvptr->name.jobid;
        name->vpid = recvptr->name.vpid;
        ORTE_EPOCH_SET(name->epoch, recvptr->name.epoch);
    }
    *seq_num = recvptr->seq_num;
    if (ORTE_SUCCESS != (ret = opal_dss.copy_payload(buf, recvptr->buf))) {
        ORTE_ERROR_LOG(ret);
    }

    /* release the data */
    OBJ_RELEASE(recvptr);

    return ret;
}
void orte_snapc_base_local_snapshot_destruct(orte_snapc_base_local_snapshot_t *snapshot)
{
    snapshot->process_name.jobid = 0;
    snapshot->process_name.vpid = 0;
    ORTE_EPOCH_SET(snapshot->process_name.epoch, ORTE_EPOCH_MIN);

    snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;

    snapshot->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
}
static int udp_recv(orte_process_name_t *name,
                    orte_rmcast_channel_t channel,
                    orte_rmcast_tag_t tag,
                    orte_rmcast_seq_t *seq_num,
                    struct iovec **msg, int *count)
{
    rmcast_base_recv_t *recvptr;
    int ret;
    orte_rmcast_channel_t chan;

    ORTE_ACQUIRE_THREAD(&ctl);

    if (!comm_enabled) {
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_ERR_COMM_DISABLED;
    }

    if (ORTE_RMCAST_GROUP_INPUT_CHANNEL == channel) {
        chan = orte_rmcast_base.my_input_channel->channel;
    } else if (ORTE_RMCAST_GROUP_OUTPUT_CHANNEL == channel) {
        chan = orte_rmcast_base.my_output_channel->channel;
    } else {
        chan = channel;
    }

    if (ORTE_SUCCESS != (ret = orte_rmcast_base_queue_recv(&recvptr, chan, tag,
                                                           ORTE_RMCAST_NON_PERSISTENT,
                                                           NULL, NULL, NULL, true))) {
        ORTE_ERROR_LOG(ret);
        ORTE_RELEASE_THREAD(&ctl);
        return ret;
    }
    ORTE_RELEASE_THREAD(&ctl);

    recvptr->ctl.active = true;
    ORTE_ACQUIRE_THREAD(&recvptr->ctl);

    /* xfer the data */
    if (NULL != name) {
        /* caller requested id of sender */
        name->jobid = recvptr->name.jobid;
        name->vpid = recvptr->name.vpid;
        ORTE_EPOCH_SET(name->epoch, recvptr->name.epoch);
    }
    *seq_num = recvptr->seq_num;
    *msg = recvptr->iovec_array;
    *count = recvptr->iovec_count;

    /* carefully release the recv */
    recvptr->iovec_array = NULL;
    recvptr->iovec_count = 0;
    OBJ_RELEASE(recvptr);

    return ORTE_SUCCESS;
}
void orte_sstore_central_local_app_snapshot_info_construct(orte_sstore_central_local_app_snapshot_info_t *info)
{
    info->name.jobid = ORTE_JOBID_INVALID;
    info->name.vpid = ORTE_VPID_INVALID;
    ORTE_EPOCH_SET(info->name.epoch, ORTE_EPOCH_MIN);

    info->local_location = NULL;
    info->metadata_filename = NULL;
    info->crs_comp = NULL;
    info->ckpt_skipped = false;
}
static int append_new_app_handle_info(orte_sstore_central_local_snapshot_info_t *handle_info,
                                      orte_process_name_t *name)
{
    orte_sstore_central_local_app_snapshot_info_t *app_info = NULL;

    app_info = OBJ_NEW(orte_sstore_central_local_app_snapshot_info_t);

    app_info->name.jobid = name->jobid;
    app_info->name.vpid = name->vpid;
    ORTE_EPOCH_SET(app_info->name.epoch, name->epoch);

    opal_list_append(handle_info->app_info_handle, &(app_info->super));

    return ORTE_SUCCESS;
}
static int slave_set_name(void)
{
    char *jobid_str, *procid_str;
    int id, rc;
    orte_jobid_t jobid;
    orte_vpid_t vpid;

    id = mca_base_param_register_string("orte", "ess", "jobid", NULL, NULL);
    mca_base_param_lookup_string(id, &jobid_str);
    if (NULL == jobid_str) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, jobid_str))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    free(jobid_str);

    id = mca_base_param_register_string("orte", "ess", "vpid", NULL, NULL);
    mca_base_param_lookup_string(id, &procid_str);
    if (NULL == procid_str) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&vpid, procid_str))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    free(procid_str);

    ORTE_PROC_MY_NAME->jobid = jobid;
    ORTE_PROC_MY_NAME->vpid = vpid;
    ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch, orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME));

    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                         "ess:slave set name to %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* get the non-name common environmental variables */
    if (ORTE_SUCCESS != (rc = orte_ess_env_get())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    return ORTE_SUCCESS;
}
static int orte_sstore_central_extract_global_metadata(orte_sstore_central_global_snapshot_info_t *handle_info,
                                                       orte_sstore_base_global_snapshot_info_t *global_snapshot)
{
    int exit_status = ORTE_SUCCESS;
    orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;
    opal_list_item_t *item = NULL;
    int i = 0;

    /*
     * Cleanup the structure a bit, so we can refresh it below
     */
    while (NULL != (item = opal_list_remove_first(&global_snapshot->local_snapshots))) {
        OBJ_RELEASE(item);
    }

    if (NULL != global_snapshot->start_time) {
        free(global_snapshot->start_time);
        global_snapshot->start_time = NULL;
    }

    if (NULL != global_snapshot->end_time) {
        free(global_snapshot->end_time);
        global_snapshot->end_time = NULL;
    }

    /*
     * Create a structure for each application process
     */
    for (i = 0; i < handle_info->num_procs_total; ++i) {
        vpid_snapshot = OBJ_NEW(orte_sstore_base_local_snapshot_info_t);

        vpid_snapshot->ss_handle = handle_info->id;
        vpid_snapshot->process_name.jobid = handle_info->jobid;
        vpid_snapshot->process_name.vpid = i;
        ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,
                       orte_ess.proc_get_epoch(&vpid_snapshot->process_name));

        vpid_snapshot->crs_comp = NULL;
        global_snapshot->start_time = NULL;
        global_snapshot->end_time = NULL;

        opal_list_append(&global_snapshot->local_snapshots, &(vpid_snapshot->super));
    }

    return exit_status;
}
/**
 * Initialize the module
 */
static int init(void)
{
    int rc;

    OBJ_CONSTRUCT(&my_local_peers, opal_list_t);
    OBJ_CONSTRUCT(&barrier, orte_grpcomm_collective_t);
    OBJ_CONSTRUCT(&allgather, orte_grpcomm_collective_t);

    my_local_rank_zero_proc.jobid = ORTE_PROC_MY_NAME->jobid;
    my_local_rank_zero_proc.vpid = ORTE_VPID_INVALID;
    ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch, ORTE_EPOCH_MIN);

    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_init())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    return rc;
}
static int hnp_pull(const orte_process_name_t* dst_name,
                    orte_iof_tag_t src_tag,
                    int fd)
{
    orte_iof_sink_t *sink;
    int flags;

    /* this is a local call - only stdin is supported */
    if (ORTE_IOF_STDIN != src_tag) {
        return ORTE_ERR_NOT_SUPPORTED;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s iof:hnp pulling fd %d for process %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         fd, ORTE_NAME_PRINT(dst_name)));

    /* set the file descriptor to non-blocking - do this before we setup
     * the sink in case it fires right away
     */
    if ((flags = fcntl(fd, F_GETFL, 0)) < 0) {
        opal_output(orte_iof_base.iof_output,
                    "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n",
                    __FILE__, __LINE__, errno);
    } else {
        flags |= O_NONBLOCK;
        fcntl(fd, F_SETFL, flags);
    }

    ORTE_IOF_SINK_DEFINE(&sink, dst_name, fd, ORTE_IOF_STDIN,
                         stdin_write_handler, &mca_iof_hnp_component.sinks);
    sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid;
    sink->daemon.vpid = ORTE_PROC_MY_NAME->vpid;
    ORTE_EPOCH_SET(sink->daemon.epoch, ORTE_PROC_MY_NAME->epoch);

    return ORTE_SUCCESS;
}
int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
{
    orte_job_map_t *map;
    orte_vpid_t vpid;
    int i, j;
    orte_node_t *node;
    orte_proc_t *proc;
    int rc;

    map = jdata->map;

    if (ORTE_MAPPING_BYSLOT & map->policy ||
        ORTE_MAPPING_BYSOCKET & map->policy ||
        ORTE_MAPPING_BYBOARD & map->policy) {
        /* assign the ranks sequentially */
        for (i=0; i < map->nodes->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
                continue;
            }
            for (j=0; j < node->procs->size; j++) {
                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                    continue;
                }
                /* ignore procs from other jobs */
                if (proc->name.jobid != jdata->jobid) {
                    continue;
                }
                if (ORTE_VPID_INVALID == proc->name.vpid) {
                    /* find the next available vpid */
                    for (vpid=0; vpid < jdata->num_procs; vpid++) {
                        if (NULL == opal_pointer_array_get_item(jdata->procs, vpid)) {
                            break;
                        }
                    }
                    proc->name.vpid = vpid;
                    ORTE_EPOCH_SET(proc->name.epoch, ORTE_EPOCH_INVALID);
                    ORTE_EPOCH_SET(proc->name.epoch, orte_ess.proc_get_epoch(&proc->name));

                    /* If there is an invalid epoch here, it's because it doesn't exist yet. */
                    if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID, proc->name.epoch)) {
                        ORTE_EPOCH_SET(proc->name.epoch, ORTE_EPOCH_MIN);
                    }
                }
                if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
                                                                          proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                }
            }
        }
        return ORTE_SUCCESS;
    }

    if (ORTE_MAPPING_BYNODE & map->policy) {
        /* assign the ranks round-robin across nodes */
        for (i=0; i < map->nodes->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
                continue;
            }
            for (j=0; j < node->procs->size; j++) {
                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                    continue;
                }
                /* ignore procs from other jobs */
                if (proc->name.jobid != jdata->jobid) {
                    continue;
                }
                if (ORTE_VPID_INVALID == proc->name.vpid) {
                    /* find the next available vpid */
                    vpid = i;
                    while (NULL != opal_pointer_array_get_item(jdata->procs, vpid)) {
                        vpid += map->num_nodes;
                        if (jdata->num_procs <= vpid) {
                            vpid = vpid - jdata->num_procs;
                        }
                    }
                    proc->name.vpid = vpid;
                    ORTE_EPOCH_SET(proc->name.epoch, ORTE_EPOCH_INVALID);
                    ORTE_EPOCH_SET(proc->name.epoch, orte_ess.proc_get_epoch(&proc->name));
                }
                if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
                                                                          proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                }
            }
        }
        return ORTE_SUCCESS;
    }

    return ORTE_ERR_NOT_IMPLEMENTED;
}
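/*
 * Illustrative sketch (not part of the original source): the two rank
 * orderings computed by orte_rmaps_base_compute_vpids above, reduced to
 * plain arithmetic. Assumes a hypothetical layout where every node hosts
 * the same number of procs; compile separately from the surrounding file.
 */
#include <stdio.h>

int main(void)
{
    const int nnodes = 2, ppn = 3;   /* assumed: 2 nodes, 3 procs each */

    /* byslot/bysocket/byboard: fill a node before moving to the next */
    for (int node = 0; node < nnodes; node++)
        for (int slot = 0; slot < ppn; slot++)
            printf("byslot: node %d slot %d -> vpid %d\n",
                   node, slot, node * ppn + slot);

    /* bynode: round-robin the ranks across the nodes */
    for (int node = 0; node < nnodes; node++)
        for (int slot = 0; slot < ppn; slot++)
            printf("bynode: node %d slot %d -> vpid %d\n",
                   node, slot, slot * nnodes + node);
    return 0;
}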
int orte_rmaps_base_define_daemons(orte_job_t *jdata)
{
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_job_t *daemons;
    int i;
    int rc;

    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                         "%s rmaps:base:define_daemons",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    if (ORTE_MAPPING_USE_VM & jdata->map->policy) {
        /* nothing for us to do - all daemons are
         * defined by definition!
         */
        return ORTE_SUCCESS;
    }

    /* get the daemon job data struct */
    if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_HNP->jobid))) {
        /* bad news */
        ORTE_ERROR_LOG(ORTE_ERR_FATAL);
        return ORTE_ERR_FATAL;
    }

    /* initialize the #new daemons */
    map = jdata->map;
    map->num_new_daemons = 0;

    /* go through the nodes in the map, checking each one's daemon name */
    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (NULL == node->daemon) {
            /* we haven't defined one for it yet, so do so now and
             * indicate it is to be launched
             */
            proc = OBJ_NEW(orte_proc_t);
            if (NULL == proc) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            proc->name.jobid = ORTE_PROC_MY_HNP->jobid;
            if (ORTE_VPID_MAX-1 <= daemons->num_procs) {
                /* no more daemons available */
                orte_show_help("help-orte-rmaps-base.txt", "out-of-vpids", true);
                OBJ_RELEASE(proc);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            proc->name.vpid = daemons->num_procs;  /* take the next available vpid */
            ORTE_EPOCH_SET(proc->name.epoch, ORTE_EPOCH_MIN);
            proc->node = node;
            proc->nodename = node->name;
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                                 "%s rmaps:base:define_daemons add new daemon %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&proc->name)));
            /* add the daemon to the daemon job object */
            if (0 > (rc = opal_pointer_array_add(daemons->procs, (void*)proc))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            ++daemons->num_procs;
            /* point the node to the daemon */
            node->daemon = proc;
            OBJ_RETAIN(proc);  /* maintain accounting */
            /* track number of daemons to be launched */
            ++map->num_new_daemons;
            /* and their starting vpid */
            if (ORTE_VPID_INVALID == map->daemon_vpid_start) {
                map->daemon_vpid_start = proc->name.vpid;
            }
        }
        /*
         * If we are launching on a node where there used to be a daemon, but
         * it had previously failed, try to relaunch it. (Daemon Recovery) Do
         * this ONLY if there are procs mapped to that daemon!
         */
        else if (node->daemon->state > ORTE_PROC_STATE_UNTERMINATED) {
            /* If no processes are to be launched on this node, then exclude it */
            if (0 >= node->num_procs) {
                OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                                     "%s rmaps:base:define_daemons Skipping the Recovery of daemon %s [0x%x] Launched: %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&node->daemon->name),
                                     node->daemon->state,
                                     (node->daemon_launched ? "T" : "F")));
                /* since this daemon exists but is not needed, then flag it
                 * as "launched" to avoid relaunching it for no reason
                 */
                node->daemon_launched = true;
                continue;
            }

            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                                 "%s rmaps:base:define_daemons RECOVERING daemon %s [0x%x] Launched: %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&node->daemon->name),
                                 node->daemon->state,
                                 (node->daemon_launched ? "T" : "F")));

            /* flag that the daemon is no longer launched */
            node->daemon_launched = false;

            /* set the state to indicate launch is in progress */
            node->daemon->state = ORTE_PROC_STATE_RESTART;

            free(node->daemon->rml_uri);
            node->daemon->rml_uri = NULL;

            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                                 "%s rmaps:base:define_daemons add new daemon %s (Recovering old daemon)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&node->daemon->name)));

            /* track number of daemons to be launched */
            ++map->num_new_daemons;
        } else {
            /* this daemon was previously defined - flag it */
            node->daemon_launched = true;
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                                 "%s rmaps:base:define_daemons existing daemon %s already launched",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&node->daemon->name)));
        }
    }

    return ORTE_SUCCESS;
}
int orte_sstore_base_extract_global_metadata(orte_sstore_base_global_snapshot_info_t *global_snapshot)
{
    int ret, exit_status = ORTE_SUCCESS;
    FILE *metadata = NULL;
    char *token = NULL;
    char *value = NULL;
    orte_process_name_t proc;
    opal_list_item_t *item = NULL;
    orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;

    /*
     * Cleanup the structure a bit, so we can refresh it below
     */
    while (NULL != (item = opal_list_remove_first(&global_snapshot->local_snapshots))) {
        OBJ_RELEASE(item);
    }

    if (NULL != global_snapshot->start_time) {
        free(global_snapshot->start_time);
        global_snapshot->start_time = NULL;
    }

    if (NULL != global_snapshot->end_time) {
        free(global_snapshot->end_time);
        global_snapshot->end_time = NULL;
    }

    /*
     * Open the metadata file
     */
    if (NULL == (metadata = fopen(global_snapshot->metadata_filename, "r"))) {
        opal_output(orte_sstore_base_output,
                    "sstore:base:extract_global_metadata() Unable to open the file (%s)\n",
                    global_snapshot->metadata_filename);
        ORTE_ERROR_LOG(ORTE_ERROR);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * Seek to the sequence number requested
     */
    if (ORTE_SUCCESS != (ret = orte_sstore_base_metadata_seek_to_seq_num(metadata,
                                                                         global_snapshot->seq_num))) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * Extract each token and make the records
     */
    do {
        if (ORTE_SUCCESS != orte_sstore_base_metadata_read_next_token(metadata, &token, &value)) {
            break;
        }

        if (0 == strncmp(token, SSTORE_METADATA_GLOBAL_SNAP_SEQ_STR,
                         strlen(SSTORE_METADATA_GLOBAL_SNAP_SEQ_STR)) ||
            0 == strncmp(token, SSTORE_METADATA_INTERNAL_MIG_SEQ_STR,
                         strlen(SSTORE_METADATA_INTERNAL_MIG_SEQ_STR))) {
            break;
        }

        if (0 == strncmp(token, SSTORE_METADATA_INTERNAL_PROCESS_STR,
                         strlen(SSTORE_METADATA_INTERNAL_PROCESS_STR))) {
            orte_util_convert_string_to_process_name(&proc, value);

            /* Not the first process, so append it to the list */
            if (NULL != vpid_snapshot) {
                opal_list_append(&global_snapshot->local_snapshots, &(vpid_snapshot->super));
            }

            vpid_snapshot = OBJ_NEW(orte_sstore_base_local_snapshot_info_t);
            vpid_snapshot->ss_handle = global_snapshot->ss_handle;

            vpid_snapshot->process_name.jobid = proc.jobid;
            vpid_snapshot->process_name.vpid = proc.vpid;
            ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch, proc.epoch);
        } else if (0 == strncmp(token, SSTORE_METADATA_LOCAL_CRS_COMP_STR,
                                strlen(SSTORE_METADATA_LOCAL_CRS_COMP_STR))) {
            vpid_snapshot->crs_comp = strdup(value);
        } else if (0 == strncmp(token, SSTORE_METADATA_LOCAL_COMPRESS_COMP_STR,
                                strlen(SSTORE_METADATA_LOCAL_COMPRESS_COMP_STR))) {
            vpid_snapshot->compress_comp = strdup(value);
        } else if (0 == strncmp(token, SSTORE_METADATA_LOCAL_COMPRESS_POSTFIX_STR,
                                strlen(SSTORE_METADATA_LOCAL_COMPRESS_POSTFIX_STR))) {
            vpid_snapshot->compress_postfix = strdup(value);
        } else if (0 == strncmp(token, SSTORE_METADATA_INTERNAL_TIME_STR,
                                strlen(SSTORE_METADATA_INTERNAL_TIME_STR))) {
            if (NULL == global_snapshot->start_time) {
                global_snapshot->start_time = strdup(value);
            } else {
                global_snapshot->end_time = strdup(value);
            }
        } else if (0 == strncmp(token, SSTORE_METADATA_GLOBAL_AMCA_PARAM_STR,
                                strlen(SSTORE_METADATA_GLOBAL_AMCA_PARAM_STR))) {
            global_snapshot->amca_param = strdup(value);
        }
    } while (0 == feof(metadata));

    /* Append the last item */
    if (NULL != vpid_snapshot) {
        opal_list_append(&global_snapshot->local_snapshots, &(vpid_snapshot->super));
    }

 cleanup:
    if (NULL != metadata) {
        fclose(metadata);
        metadata = NULL;
    }
    if (NULL != value) {
        free(value);
        value = NULL;
    }
    if (NULL != token) {
        free(token);
        token = NULL;
    }

    return exit_status;
}
/*
 * Claim a slot for a specified job on a node
 */
int orte_rmaps_base_claim_slot(orte_job_t *jdata,
                               orte_node_t *current_node,
                               int32_t cpus_per_rank,
                               orte_std_cntr_t app_idx,
                               opal_list_t *nodes,
                               bool oversubscribe,
                               bool remove_from_list,
                               orte_proc_t **returnproc)
{
    orte_proc_t *proc;
    bool oversub;
    int rc;

    /* if we were given a proc, just use it */
    if (NULL != returnproc && NULL != *returnproc) {
        proc = *returnproc;
    } else {
        /* create mapped_proc object */
        proc = OBJ_NEW(orte_proc_t);
        if (NULL == proc) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        /* set the jobid */
        proc->name.jobid = jdata->jobid;
        /* flag the proc as ready for launch */
        proc->state = ORTE_PROC_STATE_INIT;
        /* we do not set the vpid here - this will be done
         * during a second phase
         */
        /* We do set the epoch here since they all start with the same value. */
        ORTE_EPOCH_SET(proc->name.epoch, ORTE_EPOCH_MIN);
        proc->app_idx = app_idx;

        OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                             "%s rmaps:base:claim_slot: created new proc %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc->name)));

        /* provide returned proc, if requested */
        if (NULL != returnproc) {
            *returnproc = proc;
        }
    }

    OBJ_RETAIN(current_node);  /* maintain accounting on object */

    proc->node = current_node;
    proc->nodename = current_node->name;

    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                         "%s rmaps:base:claim_slot mapping proc in job %s to node %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid), current_node->name));

    /* Be sure to demarcate the slots for this proc as claimed from the node */
    current_node->slots_inuse += 1;

    /* see if this node is oversubscribed now */
    if (current_node->slots_inuse > current_node->slots) {
        oversub = true;
    } else {
        oversub = false;
    }

    /* assign the proc to the node and ensure the node is on the map */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_add_proc_to_map(jdata->map, current_node,
                                                              oversub, proc))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(proc);
        return rc;
    }

    /* If this node has reached its max number of allocatable slots OR it has
     * reached the soft limit AND we are in a "no oversubscribe" state, then
     * we need to return a flag telling the mapper this is the case so it
     * can move on to the next node
     */
    if ((0 != current_node->slots_max &&
         current_node->slots_inuse >= current_node->slots_max) ||
        (!oversubscribe && current_node->slots_inuse >= current_node->slots)) {
        /* see if we are supposed to remove the node from the list - some
         * mappers want us to do so to avoid any chance of continuing to
         * add procs to it
         */
        if (NULL != nodes && remove_from_list) {
            opal_list_remove_item(nodes, (opal_list_item_t*)current_node);
            /* release it - it was retained when we started, so this
             * just ensures the instance counter is correctly updated
             */
            OBJ_RELEASE(current_node);
        }
        /* now return the proper code so the caller knows this node
         * is fully used
         */
        return ORTE_ERR_NODE_FULLY_USED;
    }

    return ORTE_SUCCESS;
}
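/*
 * Illustrative sketch (hypothetical numbers, not part of the original
 * source): the node-full test at the end of orte_rmaps_base_claim_slot
 * above. A node is "fully used" when it hits its hard limit (slots_max,
 * if set), or its soft limit (slots) while oversubscription is
 * disallowed. Compile separately from the surrounding file.
 */
#include <stdbool.h>
#include <stdio.h>

static bool node_fully_used(int slots_inuse, int slots,
                            int slots_max, bool oversubscribe)
{
    /* mirrors the condition used in claim_slot above */
    return (0 != slots_max && slots_inuse >= slots_max) ||
           (!oversubscribe && slots_inuse >= slots);
}

int main(void)
{
    /* soft limit reached, no oversubscribe -> full */
    printf("%d\n", node_fully_used(4, 4, 0, false));   /* prints 1 */
    /* soft limit reached, oversubscribe allowed, no hard limit -> not full */
    printf("%d\n", node_fully_used(4, 4, 0, true));    /* prints 0 */
    /* hard limit reached -> full even when oversubscribing */
    printf("%d\n", node_fully_used(8, 4, 8, true));    /* prints 1 */
    return 0;
}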
int orte_util_build_daemon_nidmap(char **nodes)
{
    orte_nid_t *node;
    int i, num_nodes;
    int rc;
    struct hostent *h;
    opal_buffer_t buf;
    orte_process_name_t proc;
    char *uri, *addr;
    char *proc_name;

    num_nodes = opal_argv_count(nodes);

    OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                         "%s orte:util:build:daemon:nidmap found %d nodes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_nodes));

    if (0 == num_nodes) {
        /* nothing to do */
        return ORTE_SUCCESS;
    }

    /* set the size of the nidmap storage so we minimize realloc's */
    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(&orte_nidmap, num_nodes+1))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* install the entry for the HNP */
    node = OBJ_NEW(orte_nid_t);
    node->name = strdup("HNP");
    node->daemon = 0;
    /* the arch defaults to our arch so that non-hetero
     * case will yield correct behavior
     */
    opal_pointer_array_set_item(&orte_nidmap, 0, node);

    /* the daemon vpids will be assigned in order,
     * starting with vpid=1 for the first node in
     * the list
     */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    proc.jobid = ORTE_PROC_MY_NAME->jobid;
    for (i=0; i < num_nodes; i++) {
        node = OBJ_NEW(orte_nid_t);
        node->name = strdup(nodes[i]);
        node->daemon = i+1;
        /* the arch defaults to our arch so that non-hetero
         * case will yield correct behavior
         */
        opal_pointer_array_set_item(&orte_nidmap, node->daemon, node);

        /* lookup the address of this node */
        if (NULL == (h = gethostbyname(node->name))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            return ORTE_ERR_NOT_FOUND;
        }
        addr = inet_ntoa(*(struct in_addr*)h->h_addr_list[0]);

        /* since we are using static ports, all my fellow daemons will be on my
         * port. Setup the contact info for each daemon in my hash tables. Note
         * that this will -not- open a port to those daemons, but will only
         * define the info necessary for opening such a port if/when I communicate
         * to them
         */

        /* construct the URI */
        proc.vpid = node->daemon;
        ORTE_EPOCH_SET(proc.epoch, ORTE_EPOCH_MIN);

        orte_util_convert_process_name_to_string(&proc_name, &proc);
        asprintf(&uri, "%s;tcp://%s:%d", proc_name, addr, (int)orte_process_info.my_port);
        OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                             "%s orte:util:build:daemon:nidmap node %s daemon %d addr %s uri %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             node->name, (int)node->daemon, addr, uri));
        opal_dss.pack(&buf, &uri, 1, OPAL_STRING);
        free(proc_name);
        free(uri);
    }

    /* load the hash tables */
    if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) {
        ORTE_ERROR_LOG(rc);
    }
    OBJ_DESTRUCT(&buf);

    return rc;
}
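/*
 * Illustrative sketch (hypothetical values, not part of the original
 * source): the contact URI assembled above has the shape
 * "<name>;tcp://<addr>:<port>". The <name> portion comes from
 * orte_util_convert_process_name_to_string(); a plain placeholder string
 * is used here, so only the URI shape should be taken literally.
 * Compile separately from the surrounding file.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    char *uri = NULL;
    /* pretend daemon vpid 3 resolved to 192.168.1.13 and we use port 5000 */
    asprintf(&uri, "%s;tcp://%s:%d", "<name-of-daemon-3>", "192.168.1.13", 5000);
    printf("%s\n", uri);   /* <name-of-daemon-3>;tcp://192.168.1.13:5000 */
    free(uri);
    return 0;
}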
static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
{
    int rc = ORTE_SUCCESS;
    opal_list_item_t *item;
    orte_namelist_t *nm;
    opal_buffer_t tmp_buf;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:hier entering allgather",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* have I initialized my local info? */
    if (!coll_initialized) {
        orte_process_name_t proc;
        orte_vpid_t v;

        /* get my local rank so I can locally cache it */
        my_local_rank = orte_ess.get_local_rank(ORTE_PROC_MY_NAME);

        /* if I am local_rank=0 for this node and job, then setup
         * my array of local_rank=0 peers
         */
        if (0 == my_local_rank) {
            /* we need one entry/node in this job */
            my_coll_peers = (orte_vpid_t*)malloc(orte_process_info.num_nodes * sizeof(orte_vpid_t));
            cpeers = 0;
        }

        /* cycle through the procs to create a list of those that are local to me */
        proc.jobid = ORTE_PROC_MY_NAME->jobid;
        for (v=0; v < orte_process_info.num_procs; v++) {
            proc.vpid = v;
            ORTE_EPOCH_SET(proc.epoch, orte_util_lookup_epoch(&proc));

            /* is this proc local_rank=0 on its node? */
            if (0 == my_local_rank && 0 == orte_ess.get_local_rank(&proc)) {
                my_coll_peers[cpeers++] = v;
            }

            /* if this is me, or this proc isn't on our node, ignore it */
            if (v == ORTE_PROC_MY_NAME->vpid ||
                !OPAL_PROC_ON_LOCAL_NODE(orte_ess.proc_get_locality(&proc))) {
                continue;
            }

            /* add this proc to our list of local peers */
            nm = OBJ_NEW(orte_namelist_t);
            nm->name.jobid = proc.jobid;
            nm->name.vpid = proc.vpid;
            ORTE_EPOCH_SET(nm->name.epoch, proc.epoch);
            opal_list_append(&my_local_peers, &nm->item);

            /* if I am not local_rank=0, is this one? */
            if (0 != my_local_rank && 0 == orte_ess.get_local_rank(&proc)) {
                my_local_rank_zero_proc.jobid = proc.jobid;
                my_local_rank_zero_proc.vpid = proc.vpid;
                ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch, proc.epoch);
            }
        }

        /* compute the number of local peers - note that this number
         * does not include me!!
         */
        num_local_peers = opal_list_get_size(&my_local_peers);

        /* flag that I have initialized things */
        coll_initialized = true;
    }

    /* if I am not local rank = 0 */
    if (0 != my_local_rank) {
        if (ORTE_VPID_INVALID == my_local_rank_zero_proc.vpid) {
            /* something is broken */
            ORTE_ERROR_LOG(ORTE_ERR_FATAL);
            return ORTE_ERR_FATAL;
        }

        /* setup the collective */
        OPAL_THREAD_LOCK(&allgather.lock);
        allgather.recvd = 0;
        /* reset the collector */
        OBJ_DESTRUCT(&allgather.results);
        OBJ_CONSTRUCT(&allgather.results, opal_buffer_t);
        OPAL_THREAD_UNLOCK(&allgather.lock);

        /* send our data to the local_rank=0 proc on this node */
        if (0 > (rc = orte_rml.send_buffer(&my_local_rank_zero_proc, sbuf, ORTE_RML_TAG_ALLGATHER, 0))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* now receive the final result. Be sure to do this in
         * a manner that allows us to return without being in a recv!
         */
        rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER,
                                     ORTE_RML_NON_PERSISTENT, allgather_recv, &allgather);
        if (rc != ORTE_SUCCESS) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* wait to complete - we will receive a single message
         * sent from our local_rank=0 peer
         */
        OPAL_THREAD_LOCK(&allgather.lock);
        while (allgather.recvd < 1) {
            opal_condition_wait(&allgather.cond, &allgather.lock);
        }
        /* copy payload to the caller's buffer */
        if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, &allgather.results))) {
            ORTE_ERROR_LOG(rc);
        }
        OPAL_THREAD_UNLOCK(&allgather.lock);
    } else {
        /* I am local_rank = 0 on this node! */

        /* setup the collective */
        OPAL_THREAD_LOCK(&allgather.lock);
        allgather.recvd = 0;
        /* reset the collector */
        OBJ_DESTRUCT(&allgather.results);
        OBJ_CONSTRUCT(&allgather.results, opal_buffer_t);
        /* seed with my data */
        opal_dss.copy_payload(&allgather.results, sbuf);
        OPAL_THREAD_UNLOCK(&allgather.lock);

        /* wait to receive their data. Be sure to do this in
         * a manner that allows us to return without being in a recv!
         */
        rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER,
                                     ORTE_RML_PERSISTENT, allgather_recv, &allgather);
        if (rc != ORTE_SUCCESS) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* wait to complete - we need to receive input from every
         * local peer (excluding myself)
         */
        OPAL_THREAD_LOCK(&allgather.lock);
        while (allgather.recvd < num_local_peers) {
            opal_condition_wait(&allgather.cond, &allgather.lock);
        }
        /* xfer to the tmp buf in case another allgather comes along */
        OBJ_CONSTRUCT(&tmp_buf, opal_buffer_t);
        opal_dss.copy_payload(&tmp_buf, &allgather.results);
        OPAL_THREAD_UNLOCK(&allgather.lock);

        /* cancel the lingering recv */
        orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER);

        /* take the recv'd data and use one of the base collectives
         * to exchange it with all other local_rank=0 procs in a scalable
         * manner - the exact collective will depend upon the number of
         * nodes in the job
         */
        if (ORTE_SUCCESS != (rc = orte_grpcomm_base_allgather(&tmp_buf, rbuf,
                                                              num_local_peers + 1,
                                                              ORTE_PROC_MY_NAME->jobid,
                                                              cpeers, my_coll_peers))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&tmp_buf);
            return rc;
        }
        OBJ_DESTRUCT(&tmp_buf);  /* done with this */

        /* distribute the results to our local peers */
        for (item = opal_list_get_first(&my_local_peers);
             item != opal_list_get_end(&my_local_peers);
             item = opal_list_get_next(item)) {
            nm = (orte_namelist_t*)item;
            if (0 > (rc = orte_rml.send_buffer(&nm->name, rbuf, ORTE_RML_TAG_ALLGATHER, 0))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }
    }

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:hier allgather completed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    return ORTE_SUCCESS;
}
/* For a complete description of this algorithm, please look at
 * ompi/mca/coll/tuned/coll_tuned_allgather.c
 */
static int recursivedoubling(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf,
                             int32_t num_entries, orte_jobid_t jobid,
                             orte_vpid_t np, orte_vpid_t *vpids)
{
    orte_vpid_t rank, distance, nv;
    int32_t num_remote, total_entries, cnt;
    opal_buffer_t collection, buf;
    orte_process_name_t peer;
    int rc;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:coll:recdub algo employed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* initialize */
    total_entries = num_entries;

    /* start by seeding the collection with our own data */
    OBJ_CONSTRUCT(&collection, opal_buffer_t);
    opal_dss.copy_payload(&collection, sendbuf);

    /* collective is constrained to take place within the specified jobid */
    peer.jobid = jobid;

    /* Communication step:
     * At every step i, rank r exchanges a message containing all data
     * collected so far with rank peer = (r ^ 2^i).
     */

    /* find my position in the group of participants. This
     * value is the "rank" we will use in the algo
     */
    rank = ORTE_VPID_INVALID;
    for (nv=0; nv < np; nv++) {
        if (vpids[nv] == ORTE_PROC_MY_NAME->vpid) {
            rank = nv;
            break;
        }
    }

    /* check for bozo case */
    if (ORTE_VPID_INVALID == rank) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }

    for (distance = 0x1; distance < np; distance <<= 1) {
        /* first send my current contents */
        nv = rank ^ distance;
        peer.vpid = vpids[nv];
        ORTE_EPOCH_SET(peer.epoch, orte_ess.proc_get_epoch(&peer));

        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32);
        opal_dss.copy_payload(&buf, &collection);
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:recdub sending to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&peer)));
        if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        OBJ_DESTRUCT(&buf);

        /* now setup to recv from my other partner */
        num_recvd = 0;
        OBJ_CONSTRUCT(&bucket, opal_buffer_t);
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(&peer,
                                                          ORTE_RML_TAG_DAEMON_COLLECTIVE,
                                                          ORTE_RML_NON_PERSISTENT,
                                                          orte_grpcomm_base_coll_recv,
                                                          NULL))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* and wait for it to get here */
        ORTE_PROGRESSED_WAIT(false, num_recvd, 1);

        /* extract the number of entries in the remote buffer */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&bucket, &num_remote, &cnt, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* add it to our running total */
        total_entries += num_remote;

        /* transfer the data to our collection */
        opal_dss.copy_payload(&collection, &bucket);

        /* cleanup */
        OBJ_DESTRUCT(&bucket);
    }

    /* output of a collective begins with the total number of entries */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(recvbuf, &total_entries, 1, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* transfer the collected data */
    opal_dss.copy_payload(recvbuf, &collection);

    /* cleanup */
    OBJ_DESTRUCT(&collection);

    return ORTE_SUCCESS;
}
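/*
 * Illustrative sketch (not part of the original source): the peer
 * schedule driven by the "rank ^ distance" loop above, for an assumed
 * four-way collective. Each rank exchanges its accumulated data with
 * rank ^ distance at every doubling step, so after log2(np) steps every
 * rank holds everyone's data. Compile separately from the surrounding file.
 */
#include <stdio.h>

int main(void)
{
    const int np = 4;   /* assumed participant count (power of two) */
    for (int distance = 0x1; distance < np; distance <<= 1)
        for (int rank = 0; rank < np; rank++)
            printf("distance %d: rank %d exchanges with rank %d\n",
                   distance, rank, rank ^ distance);
    return 0;
}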
/*
 * The Two-Proc Algorithm
 *
 * One sends to zero, zero waits to recv from one
 * Zero adds its data to message, sends result back to one
 */
static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf,
                   int32_t num_entries, orte_jobid_t jobid,
                   orte_vpid_t *vpids)
{
    orte_process_name_t peer;
    int32_t num_remote, cnt;
    int rc;
    opal_buffer_t buf;

    peer.jobid = jobid;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:coll:two-proc algo employed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    if (vpids[0] == ORTE_PROC_MY_NAME->vpid) {
        /* I send first */
        peer.vpid = vpids[1];
        ORTE_EPOCH_SET(peer.epoch, orte_ess.proc_get_epoch(&peer));

        /* setup a temp buffer so I can inform the other side as to the
         * number of entries in my buffer
         */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32);
        opal_dss.copy_payload(&buf, sendbuf);
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:two-proc sending to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&peer)));
        if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        OBJ_DESTRUCT(&buf);

        /* wait for reply */
        num_recvd = 0;
        OBJ_CONSTRUCT(&bucket, opal_buffer_t);
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                          ORTE_RML_TAG_DAEMON_COLLECTIVE,
                                                          ORTE_RML_NON_PERSISTENT,
                                                          orte_grpcomm_base_coll_recv,
                                                          NULL))) {
            ORTE_ERROR_LOG(rc);
        }
        ORTE_PROGRESSED_WAIT(false, num_recvd, 1);

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:two-proc got my return message",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    } else {
        /* if I am not the start, then I recv first */
        num_recvd = 0;
        OBJ_CONSTRUCT(&bucket, opal_buffer_t);
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                          ORTE_RML_TAG_DAEMON_COLLECTIVE,
                                                          ORTE_RML_NON_PERSISTENT,
                                                          orte_grpcomm_base_coll_recv,
                                                          NULL))) {
            ORTE_ERROR_LOG(rc);
        }
        ORTE_PROGRESSED_WAIT(false, num_recvd, 1);

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:two-proc got my starting message",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /* send my data back */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32);
        opal_dss.copy_payload(&buf, sendbuf);
        peer.vpid = vpids[0];
        ORTE_EPOCH_SET(peer.epoch, orte_ess.proc_get_epoch(&peer));
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:two-proc sending to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&peer)));
        if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        OBJ_DESTRUCT(&buf);
    }

    /* extract the number of entries in the remote buffer */
    cnt = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&bucket, &num_remote, &cnt, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* output of a collective begins with the total number of entries */
    num_remote += num_entries;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(recvbuf, &num_remote, 1, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* xfer my data */
    opal_dss.copy_payload(recvbuf, sendbuf);
    /* xfer the recvd data */
    opal_dss.copy_payload(recvbuf, &bucket);

    /* cleanup */
    OBJ_DESTRUCT(&bucket);

    return ORTE_SUCCESS;
}
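/*
 * Illustrative summary (not part of the original source) of the two-proc
 * exchange above, with each buffer prefixed by its entry count:
 *
 *   vpids[0]                                  vpids[1]
 *     send {num_entries0, data0}  --------->    recv
 *     recv                        <---------    send {num_entries1, data1}
 *
 * Each side then emits total = local + remote entry counts, followed by
 * its own payload and then the payload it received.
 */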
void orte_grpcomm_base_daemon_collective(orte_process_name_t *sender,
                                         opal_buffer_t *data)
{
    orte_jobid_t jobid;
    orte_odls_job_t *jobdat;
    orte_routed_tree_t *child;
    orte_std_cntr_t n;
    opal_list_t daemon_tree;
    opal_list_item_t *item, *next;
    int32_t num_contributors;
    opal_buffer_t buf;
    orte_process_name_t my_parent, proc;
    orte_vpid_t daemonvpid;
    int rc;
    int32_t numc;
    orte_rml_tag_t rmltag;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:base:daemon_coll: daemon collective called",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* unpack the jobid using this collective */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobid, &n, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* lookup the job record for it */
    jobdat = NULL;
    for (item = opal_list_get_first(&orte_local_jobdata);
         item != opal_list_get_end(&orte_local_jobdata);
         item = opal_list_get_next(item)) {
        jobdat = (orte_odls_job_t*)item;

        /* is this the specified job? */
        if (jobdat->jobid == jobid) {
            break;
        }
    }
    if (NULL == jobdat) {
        /* race condition - someone sent us a collective before we could
         * parse the add_local_procs cmd. Just add the jobdat object
         * and continue
         */
        jobdat = OBJ_NEW(orte_odls_job_t);
        jobdat->jobid = jobid;
        opal_list_append(&orte_local_jobdata, &jobdat->super);
    }

    /* it may be possible to get here prior to having actually finished processing our
     * local launch msg due to the race condition between different nodes and when
     * they start their individual procs. Hence, we have to first ensure that we
     * -have- finished processing the launch msg, or else we won't know whether
     * or not to wait before sending this on
     */
    OPAL_THREAD_LOCK(&jobdat->lock);
    while (!jobdat->launch_msg_processed) {
        opal_condition_wait(&jobdat->cond, &jobdat->lock);
    }
    OPAL_THREAD_UNLOCK(&jobdat->lock);

    /* unpack the tag for this collective */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &rmltag, &n, ORTE_RML_TAG))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* unpack the number of contributors in this data bucket */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &num_contributors, &n, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return;
    }
    jobdat->num_contributors += num_contributors;

    /* xfer the data */
    opal_dss.copy_payload(&jobdat->collection_bucket, data);

    /* count the number of participants collected */
    jobdat->num_collected++;

    /* if we haven't already done so, figure out how many participants we
     * should be expecting
     */
    if (jobdat->num_participating < 0) {
        if (0 < jobdat->num_local_procs) {
            /* we have children, so account for our own participation */
            jobdat->num_participating = 1;
        } else {
            jobdat->num_participating = 0;
        }

        /* now see if anyone else will be sending us something */
        OBJ_CONSTRUCT(&daemon_tree, opal_list_t);
        orte_routed.get_routing_tree(&daemon_tree);

        /* unfortunately, there is no simple way to determine which of our "child"
         * daemons in the routing tree will be sending us something. All we can do
         * is brute force a search, though we attempt to keep it as short as possible
         */
        proc.jobid = jobid;
        proc.vpid = 0;
        while (proc.vpid < jobdat->num_procs && 0 < opal_list_get_size(&daemon_tree)) {
            ORTE_EPOCH_SET(proc.epoch, orte_ess.proc_get_epoch(&proc));

            /* get the daemon that hosts this proc */
            daemonvpid = orte_ess.proc_get_daemon(&proc);

            /* is this daemon one of our children, or at least its contribution
             * will pass through one of our children
             */
            item = opal_list_get_first(&daemon_tree);
            while (item != opal_list_get_end(&daemon_tree)) {
                next = opal_list_get_next(item);
                child = (orte_routed_tree_t*)item;
                if (child->vpid == daemonvpid ||
                    opal_bitmap_is_set_bit(&child->relatives, daemonvpid)) {
                    /* it does - add to num_participating */
                    jobdat->num_participating++;
                    /* remove this from the list so we don't double count it */
                    opal_list_remove_item(&daemon_tree, item);
                    /* done with search */
                    break;
                }
                item = next;
            }
            proc.vpid++;
        }
    }

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:base:daemon_coll: daemon collective for job %s from %s type %ld"
                         " num_collected %d num_participating %d num_contributors %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jobid),
                         ORTE_NAME_PRINT(sender),
                         (long)jobdat->collective_type, jobdat->num_collected,
                         jobdat->num_participating, jobdat->num_contributors));

    if (jobdat->num_collected == jobdat->num_participating) {
        /* if I am the HNP, go process the results */
        if (ORTE_PROC_IS_HNP) {
            goto hnp_process;
        }

        /* if I am not the HNP, send to my parent */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        /* pack the jobid */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jobid, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        /* pack the target tag */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &rmltag, 1, ORTE_RML_TAG))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        /* pack the number of contributors */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jobdat->num_contributors, 1, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        /* xfer the payload */
        opal_dss.copy_payload(&buf, &jobdat->collection_bucket);
        /* reset everything for next collective */
        jobdat->num_contributors = 0;
        jobdat->num_collected = 0;
        OBJ_DESTRUCT(&jobdat->collection_bucket);
        OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t);
        /* send it */
        my_parent.jobid = ORTE_PROC_MY_NAME->jobid;
        my_parent.vpid = orte_routed.get_routing_tree(NULL);
        ORTE_EPOCH_SET(my_parent.epoch, orte_ess.proc_get_epoch(&my_parent));
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:base:daemon_coll: daemon collective not the HNP - sending to parent %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&my_parent)));
        if (0 > (rc = orte_rml.send_buffer(&my_parent, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        OBJ_DESTRUCT(&buf);
    }
    return;

 hnp_process:
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:base:daemon_coll: daemon collective HNP - xcasting to job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jobid)));

    /* setup a buffer to send the results back to the job members */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);

    /* add any collected data */
    numc = jobdat->num_contributors;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &numc, 1, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&buf, &jobdat->collection_bucket))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    /* reset everything for next collective */
    jobdat->num_contributors = 0;
    jobdat->num_collected = 0;
    OBJ_DESTRUCT(&jobdat->collection_bucket);
    OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t);
    /* send the buffer */
    if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(jobid, &buf, rmltag))) {
        ORTE_ERROR_LOG(rc);
    }

 cleanup:
    OBJ_DESTRUCT(&buf);
    return;
}
int main(int argc, char *argv[])
{
    int count;
    int msgsize;
    uint8_t *msg;
    int j, rc;
    orte_process_name_t peer;
    double maxpower;

    /*
     * Init
     */
    orte_init(&argc, &argv, ORTE_PROC_NON_MPI);

    if (argc > 1) {
        count = atoi(argv[1]);
        if (count < 0) {
            count = INT_MAX-1;
        }
    } else {
        count = MAX_COUNT;
    }

    peer.jobid = ORTE_PROC_MY_NAME->jobid;

    for (j=1; j < count+1; j++) {
        peer.vpid = (ORTE_PROC_MY_NAME->vpid + j) % orte_process_info.num_procs;
        ORTE_EPOCH_SET(peer.epoch, orte_ess.proc_get_epoch(&peer));

        /* rank0 starts ring */
        if (ORTE_PROC_MY_NAME->vpid == 0) {
            /* setup the initiating buffer - put random sized message in it */
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            maxpower = (double)(j%7);
            msgsize = (int)pow(10.0, maxpower);
            opal_output(0, "Ring %d message size %d bytes", j, msgsize);
            msg = (uint8_t*)malloc(msgsize);
            opal_dss.pack(&buf, msg, msgsize, OPAL_BYTE);
            free(msg);  /* pack copies the payload, so the scratch buffer can be freed */

            if (0 > (rc = orte_rml.send_buffer(&peer, &buf, MY_TAG, 0))) {
                opal_output(0, "%s error sending to %s %s\n",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&peer), ORTE_ERROR_NAME(rc));
                exit(1);
            }
            OBJ_DESTRUCT(&buf);

            /* wait for it to come around */
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            msg_recvd = false;
            orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG,
                                    ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
            ORTE_PROGRESSED_WAIT(msg_recvd, 0, 1);
            opal_output(0, "%s Ring %d completed",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j);
        } else {
            /* wait for msg */
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            msg_recvd = false;
            orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG,
                                    ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
            ORTE_PROGRESSED_WAIT(msg_recvd, 0, 1);

            /* send it along */
            if (0 > (rc = orte_rml.send_buffer(&peer, &buf, MY_TAG, 0))) {
                opal_output(0, "%s error sending to %s %s\n",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&peer), ORTE_ERROR_NAME(rc));
                exit(1);
            }
            OBJ_DESTRUCT(&buf);
        }
    }

    orte_finalize();
    return 0;
}
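/*
 * Illustrative sketch (not part of the original source): the ring test
 * above sizes each message as 10^(j % 7) bytes, so the sizes cycle
 * through powers of ten between 1 and 1000000. Compile separately
 * (link with -lm).
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
    for (int j = 1; j <= 8; j++)
        printf("ring %d: %d bytes\n", j, (int)pow(10.0, (double)(j % 7)));
    return 0;
}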
/*****************
 * Local Functions
 *****************/
static void errmgr_autor_process_fault_app(orte_job_t *jdata,
                                           orte_process_name_t *proc,
                                           orte_proc_state_t state)
{
    errmgr_autor_wp_item_t *wp_item = NULL;
    struct timeval soon;

    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "%s errmgr:hnp(autor): process_fault() "
                         "Process fault! proc %s (0x%x)",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         state));

    if (!orte_sstore_base_is_checkpoint_available) {
        OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                             "%s errmgr:hnp(autor): process_fault() "
                             "No checkpoints are available for this job! Cannot automatically recover!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        opal_show_help("help-orte-errmgr-hnp.txt", "autor_failed_to_recover_proc", true,
                       ORTE_NAME_PRINT(proc), proc->vpid);
        return;
    }

    mca_errmgr_hnp_component.ignore_current_update = true;

    /*
     * If we are already in the shutdown stage of the recovery, then just skip it
     */
    if (autor_mask_faults) {
        OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                             "%s errmgr:hnp(autor):process_fault() "
                             "Currently recovering the job. Failure masked!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return;
    }

    /*
     * Append this process to the list to process
     */
    wp_item = OBJ_NEW(errmgr_autor_wp_item_t);
    wp_item->name.jobid = proc->jobid;
    wp_item->name.vpid = proc->vpid;
    ORTE_EPOCH_SET(wp_item->name.epoch, proc->epoch);
    wp_item->state = state;

    opal_list_append(procs_pending_recovery, &(wp_item->super));

    /*
     * Activate the timer, if it is not already setup
     */
    if (!autor_timer_active) {
        autor_timer_active = true;

        opal_event_evtimer_set(opal_event_base, autor_timer_event,
                               errmgr_autor_recover_processes, NULL);
        soon.tv_sec = mca_errmgr_hnp_component.autor_recovery_delay;
        soon.tv_usec = 0;
        opal_event_evtimer_add(autor_timer_event, &soon);
    }

    return;
}
/* Setup to read local data. If the tag is other than STDIN,
 * then this is output being pushed from one of my child processes
 * and I'll write the data out myself. If the tag is STDIN,
 * then I need to setup to read from my stdin, and send anything
 * I get to the specified dst_name. The dst_name in this case tells
 * us which procs are to get stdin - only two options are supported:
 *
 * (a) a specific name, usually vpid=0; or
 *
 * (b) all procs, specified by vpid=ORTE_VPID_WILDCARD
 *
 * The orte_plm_base_launch_apps function calls iof.push after
 * the procs are launched and tells us how to distribute stdin. This
 * ensures that the procs are started -before- we begin reading stdin
 * and attempting to send it to remote procs
 */
static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, int fd)
{
    orte_job_t *jdata;
    orte_proc_t *proc;
    orte_iof_sink_t *sink;
    orte_iof_proc_t *proct;
    opal_list_item_t *item;
    int flags;
    char *outfile;
    int fdout;
    orte_odls_job_t *jobdat = NULL;
    int np, numdigs;
    int rc;
    orte_ns_cmp_bitmask_t mask;

    /* don't do this if the dst vpid is invalid or the fd is negative! */
    if (ORTE_VPID_INVALID == dst_name->vpid || fd < 0) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s iof:hnp pushing fd %d for process %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         fd, ORTE_NAME_PRINT(dst_name)));

    if (!(src_tag & ORTE_IOF_STDIN)) {
        /* set the file descriptor to non-blocking - do this before we setup
         * and activate the read event in case it fires right away
         */
        if ((flags = fcntl(fd, F_GETFL, 0)) < 0) {
            opal_output(orte_iof_base.iof_output,
                        "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n",
                        __FILE__, __LINE__, errno);
        } else {
            flags |= O_NONBLOCK;
            fcntl(fd, F_SETFL, flags);
        }

        /* do we already have this process in our list? */
        for (item = opal_list_get_first(&mca_iof_hnp_component.procs);
             item != opal_list_get_end(&mca_iof_hnp_component.procs);
             item = opal_list_get_next(item)) {
            proct = (orte_iof_proc_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, dst_name)) {
                /* found it */
                goto SETUP;
            }
        }

        /* if we get here, then we don't yet have this proc in our list */
        proct = OBJ_NEW(orte_iof_proc_t);
        proct->name.jobid = dst_name->jobid;
        proct->name.vpid = dst_name->vpid;
        ORTE_EPOCH_SET(proct->name.epoch, dst_name->epoch);
        opal_list_append(&mca_iof_hnp_component.procs, &proct->super);

        /* see if we are to output to a file */
        if (NULL != orte_output_filename) {
            /* get the local jobdata for this proc */
            for (item = opal_list_get_first(&orte_local_jobdata);
                 item != opal_list_get_end(&orte_local_jobdata);
                 item = opal_list_get_next(item)) {
                jobdat = (orte_odls_job_t*)item;
                if (jobdat->jobid == proct->name.jobid) {
                    break;
                }
            }
            if (NULL == jobdat) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                return ORTE_ERR_NOT_FOUND;
            }
            np = jobdat->num_procs / 10;
            /* determine the number of digits required for max vpid */
            numdigs = 1;
            while (np > 0) {
                numdigs++;
                np = np / 10;
            }
            /* construct the filename - see the standalone sketch after this
             * function for a worked example of the resulting name
             */
            asprintf(&outfile, "%s.%d.%0*lu", orte_output_filename,
                     (int)ORTE_LOCAL_JOBID(proct->name.jobid),
                     numdigs, (unsigned long)proct->name.vpid);
            /* create the file */
            fdout = open(outfile, O_CREAT|O_RDWR|O_TRUNC, 0644);
            free(outfile);
            if (fdout < 0) {
                /* couldn't be opened */
                ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
                return ORTE_ERR_FILE_OPEN_FAILURE;
            }
            /* define a sink to that file descriptor */
            ORTE_IOF_SINK_DEFINE(&sink, dst_name, fdout, ORTE_IOF_STDOUTALL,
                                 orte_iof_base_write_handler,
                                 &mca_iof_hnp_component.sinks);
        }

    SETUP:
        /* define a read event and activate it */
        if (src_tag & ORTE_IOF_STDOUT) {
            ORTE_IOF_READ_EVENT(&proct->revstdout, dst_name, fd, ORTE_IOF_STDOUT,
                                orte_iof_hnp_read_local_handler, false);
        } else if (src_tag & ORTE_IOF_STDERR) {
            ORTE_IOF_READ_EVENT(&proct->revstderr, dst_name, fd, ORTE_IOF_STDERR,
                                orte_iof_hnp_read_local_handler, false);
        } else if (src_tag & ORTE_IOF_STDDIAG) {
            ORTE_IOF_READ_EVENT(&proct->revstddiag, dst_name, fd, ORTE_IOF_STDDIAG,
                                orte_iof_hnp_read_local_handler, false);
        }
        /* if -all- of the readevents for this proc have been defined, then
         * activate them. Otherwise, we can think that the proc is complete
         * because one of the readevents fires -prior- to all of them having
         * been defined!
         */
        if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) {
            proct->revstdout->active = true;
            opal_event_add(&(proct->revstdout->ev), 0);
            proct->revstderr->active = true;
            opal_event_add(&(proct->revstderr->ev), 0);
            proct->revstddiag->active = true;
            opal_event_add(&(proct->revstddiag->ev), 0);
        }
        return ORTE_SUCCESS;
    }

    /* if we are pushing stdin, this is happening only during launch - setup
     * a target for this destination if it is going somewhere other than me
     */
    if (ORTE_VPID_WILDCARD == dst_name->vpid) {
        /* if wildcard, define a sink with that info so it gets sent out */
        ORTE_IOF_SINK_DEFINE(&sink, dst_name, -1, ORTE_IOF_STDIN,
                             stdin_write_handler, &mca_iof_hnp_component.sinks);
    } else {
        /* no - lookup the proc's daemon and set that into sink */
        if (NULL == (jdata = orte_get_job_data_object(dst_name->jobid))) {
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            return ORTE_ERR_BAD_PARAM;
        }
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, dst_name->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            return ORTE_ERR_NOT_FOUND;
        }
        /* if it is me, then don't set this up - we'll get it on the pull */
        if (ORTE_PROC_MY_NAME->vpid != proc->node->daemon->name.vpid) {
            ORTE_IOF_SINK_DEFINE(&sink, dst_name, -1, ORTE_IOF_STDIN,
                                 stdin_write_handler, &mca_iof_hnp_component.sinks);
            sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid;
            sink->daemon.vpid = proc->node->daemon->name.vpid;
            ORTE_EPOCH_SET(sink->daemon.epoch, orte_ess.proc_get_epoch(&sink->daemon));
        }
    }

    /* now setup the read - but check to only do this once */
    if (NULL == mca_iof_hnp_component.stdinev) {
        /* Since we are the HNP, we don't want to set nonblocking on our
         * stdio stream. If we do so, we set the file descriptor to
         * non-blocking for everyone that has that file descriptor, which
         * includes everyone else in our shell pipeline chain. (See
         * http://lists.freebsd.org/pipermail/freebsd-hackers/2005-January/009742.html).
         * This causes things like "mpirun -np 1 big_app | cat" to lose
         * output, because cat's stdout is then ALSO non-blocking and cat
         * isn't built to deal with that case (same with almost all other
         * unix text utils).
         */
        if (0 != fd) {
            if ((flags = fcntl(fd, F_GETFL, 0)) < 0) {
                opal_output(orte_iof_base.iof_output,
                            "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n",
                            __FILE__, __LINE__, errno);
            } else {
                flags |= O_NONBLOCK;
                fcntl(fd, F_SETFL, flags);
            }
        }

        if (isatty(fd)) {
            /* We should avoid trying to read from stdin if we
             * have a terminal, but are backgrounded. Catch the
             * signals that are commonly used when we switch
             * between being backgrounded and not. If the
             * filedescriptor is not a tty, don't worry about it
             * and always stay connected.
             */
            opal_event_signal_set(opal_event_base, &mca_iof_hnp_component.stdinsig,
                                  SIGCONT, orte_iof_hnp_stdin_cb, NULL);

            /* setup a read event to read stdin, but don't activate it yet. The
             * dst_name indicates who should receive the stdin. If that recipient
             * doesn't do a corresponding pull, however, then the stdin will
             * be dropped upon receipt at the local daemon
             */
            ORTE_IOF_READ_EVENT(&mca_iof_hnp_component.stdinev, dst_name, fd,
                                ORTE_IOF_STDIN, orte_iof_hnp_read_local_handler, false);

            /* check to see if we want the stdin read event to be
             * active - we will always at least define the event,
             * but may delay its activation
             */
            if (!(src_tag & ORTE_IOF_STDIN) || orte_iof_hnp_stdin_check(fd)) {
                mca_iof_hnp_component.stdinev->active = true;
                if (OPAL_SUCCESS != (rc = opal_event_add(&(mca_iof_hnp_component.stdinev->ev), 0))) {
                    ORTE_ERROR_LOG(rc);
                }
            }
        } else {
            /* if we are not looking at a tty, just setup a read event
             * and activate it
             */
            ORTE_IOF_READ_EVENT(&mca_iof_hnp_component.stdinev, dst_name, fd,
                                ORTE_IOF_STDIN, orte_iof_hnp_read_local_handler, true);
        }
    }
    return ORTE_SUCCESS;
}
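/*
 * Illustrative sketch (hypothetical values, not part of the original
 * source): the per-process output filename built in hnp_push above. With
 * orte_output_filename = "out", local jobid 1, and enough procs to need
 * three vpid digits, vpid 7 writes to "out.1.007". Compile separately
 * from the surrounding file.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    char *outfile = NULL;
    int numdigs = 3;   /* digits needed to zero-pad the largest vpid */
    asprintf(&outfile, "%s.%d.%0*lu", "out", 1, numdigs, (unsigned long)7);
    printf("%s\n", outfile);   /* prints "out.1.007" */
    free(outfile);
    return 0;
}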
int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
{
    orte_job_t *jdat;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_job_map_t *map;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_app_context_t *app;
    orte_std_cntr_t num_slots;
    int rc, i, n;
    bool ignored;

    /* get the daemon app if provided - may include -host or hostfile
     * info about available nodes
     */
    app = (orte_app_context_t *) opal_pointer_array_get_item(jdata->apps, 0);

    map = jdata->map;

    /* get the list of all available nodes that do not already
     * have a daemon on them
     */
    OBJ_CONSTRUCT(&node_list, opal_list_t);
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots,
                                                               app, map->policy))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&node_list);
        return rc;
    }

    /* check all other known jobs to see if they have something to
     * add to the allocation - we won't have seen these and the
     * daemon job won't have any in its app
     */
    for (i=0; i < orte_job_data->size; i++) {
        if (NULL == (jdat = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
            continue;
        }
        for (n=0; n < jdat->apps->size; n++) {
            if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdat->apps, n))) {
                continue;
            }
            if (NULL != app->hostfile) {
                /* hostfile was specified - parse it and add it to the list. The
                 * function automatically ignores duplicates
                 */
                if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&node_list,
                                                                       &ignored,
                                                                       app->hostfile))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_DESTRUCT(&node_list);
                    return rc;
                }
            }
            if (NULL != app->dash_host) {
                /* parse and add to list, ignoring duplicates */
                if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&node_list,
                                                                        &ignored,
                                                                        app->dash_host))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_DESTRUCT(&node_list);
                    return rc;
                }
            }
        }
    }

    /* add all these nodes to the map */
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        node = (orte_node_t*)item;
        /* if this is my node, ignore it - we are already here */
        if (0 == strcmp(node->name, orte_process_info.nodename)) {
            continue;
        }
        opal_pointer_array_add(map->nodes, (void*)node);
        ++(map->num_nodes);
        /* if this node already has a daemon, release that object
         * to maintain bookkeeping
         */
        if (NULL != node->daemon) {
            OBJ_RELEASE(node->daemon);
        }
        /* create a new daemon object for this node */
        proc = OBJ_NEW(orte_proc_t);
        if (NULL == proc) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        proc->name.jobid = ORTE_PROC_MY_HNP->jobid;
        if (ORTE_VPID_MAX-1 <= jdata->num_procs) {
            /* no more daemons available */
            orte_show_help("help-orte-rmaps-base.txt", "out-of-vpids", true);
            OBJ_RELEASE(proc);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        proc->name.vpid = jdata->num_procs;  /* take the next available vpid */
        ORTE_EPOCH_SET(proc->name.epoch, ORTE_EPOCH_INVALID);
        ORTE_EPOCH_SET(proc->name.epoch, orte_ess.proc_get_epoch(&proc->name));
        proc->node = node;
        proc->nodename = node->name;
        OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                             "%s rmaps:base:setup_vm add new daemon %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc->name)));
        /* add the daemon to the daemon job object */
        if (0 > (rc = opal_pointer_array_add(jdata->procs, (void*)proc))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        ++jdata->num_procs;
        /* point the node to the daemon */
        node->daemon = proc;
        OBJ_RETAIN(proc);  /* maintain accounting */
        /* track number of daemons to be launched */
        ++map->num_new_daemons;
        /* and their starting vpid */
        if (ORTE_VPID_INVALID == map->daemon_vpid_start) {
            map->daemon_vpid_start = proc->name.vpid;
        }
    }
    OBJ_DESTRUCT(&node_list);

    return ORTE_SUCCESS;
}
static int rte_init(void)
{
    int ret;
    char *error = NULL;
    char **nodes = NULL, **ppnlist = NULL;
    char *envar;
    int32_t jobfam;
    int i, j, *ppn = NULL;
    orte_nid_t *node;
    orte_jmap_t *jmap;
    orte_pmap_t *pmap;
    orte_vpid_t vpid;
    bool byslot;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }

    /* Only application procs can use this module. Since we
     * were directly launched by someone, we need to bootstrap
     * our own global info so we can startup.
     */

    /* ensure that static ports were assigned - otherwise, we can't
     * work since we won't know how to talk to anyone else
     */
    if (NULL == getenv("OMPI_MCA_oob_tcp_static_ports") &&
        NULL == getenv("OMPI_MCA_oob_tcp_static_ports_v6")) {
        error = "static ports were not assigned";
        goto error;
    }

    /* declare ourselves to be standalone - i.e., not launched by orted */
    orte_standalone_operation = true;

    /* extract a jobid from the environment - can be totally
     * arbitrary. if one isn't provided, just fake it
     */
    if (NULL != (envar = getenv("OMPI_MCA_orte_jobid"))) {
        jobfam = strtol(envar, NULL, 10);
    } else {
        jobfam = 1;
    }
    ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(0, jobfam);

    /* extract a rank from the environment */
    if (NULL == (envar = getenv("OMPI_MCA_orte_rank"))) {
        error = "could not get process rank";
        goto error;
    }
    ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10);
    ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN);

    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                         "%s completed name definition",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* get the number of procs in this job */
    if (NULL == (envar = getenv("OMPI_MCA_orte_num_procs"))) {
        error = "could not get number of processes in job";
        goto error;
    }
    orte_process_info.num_procs = strtol(envar, NULL, 10);

    if (orte_process_info.max_procs < orte_process_info.num_procs) {
        orte_process_info.max_procs = orte_process_info.num_procs;
    }

    /* set the app_num so that MPI attributes get set correctly */
    orte_process_info.app_num = 1;

    /* get the list of nodes */
    if (NULL == (envar = getenv("OMPI_MCA_orte_nodes"))) {
        error = "could not get list of nodes";
        goto error;
    }
    /* break this down */
    nodes = opal_argv_split(envar, ',');
    orte_process_info.num_nodes = opal_argv_count(nodes);

    /* get the ppn */
    if (NULL == (envar = getenv("OMPI_MCA_orte_ppn"))) {
        error = "could not get ppn";
        goto error;
    }
    ppnlist = opal_argv_split(envar, ',');
    ppn = (int*)malloc(orte_process_info.num_nodes * sizeof(int));
    if (1 == opal_argv_count(ppnlist)) {
        /* constant ppn */
        j = strtol(ppnlist[0], NULL, 10);
        for (i=0; i < orte_process_info.num_nodes; i++) {
            ppn[i] = j;
        }
    } else {
        for (i=0; i < orte_process_info.num_nodes; i++) {
            ppn[i] = strtol(ppnlist[i], NULL, 10);
        }
    }
    opal_argv_free(ppnlist);

    /* get the mapping mode - default to byslot */
    byslot = true;
    if (NULL != (envar = getenv("OMPI_MCA_mapping")) &&
        0 == strcmp(envar, "bynode")) {
        byslot = false;
    }

    /* setup the nidmap arrays */
    if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_util_nidmap_init";
        goto error;
    }

    /* set the size of the nidmap storage so we minimize realloc's */
    if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&orte_nidmap, orte_process_info.num_nodes))) {
        error = "could not set pointer array size for nidmap";
        goto error;
    }

    /* construct the nidmap */
    for (i=0; i < orte_process_info.num_nodes; i++) {
        node = OBJ_NEW(orte_nid_t);
        if (0 == strcmp(nodes[i], orte_process_info.nodename) || opal_ifislocal(nodes[i])) {
            node->name = strdup(orte_process_info.nodename);
        } else {
            node->name = strdup(nodes[i]);
        }
        node->daemon = i;
        node->index = i;
        opal_pointer_array_set_item(&orte_nidmap, i, node);
    }
    opal_argv_free(nodes);

    /* create a job map for this job */
    jmap = OBJ_NEW(orte_jmap_t);
    jmap->job = ORTE_PROC_MY_NAME->jobid;
    opal_pointer_array_add(&orte_jobmap, jmap);
    /* update the num procs */
    jmap->num_procs = orte_process_info.num_procs;
    /* set the size of the pidmap storage so we minimize realloc's */
    if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&jmap->pmap, jmap->num_procs))) {
        ORTE_ERROR_LOG(ret);
        error = "could not set pointer array size for pidmap";
        goto error;
    }

    /* construct the pidmap */
    if (byslot) {
        /* fill each node in turn before moving to the next */
        vpid = 0;
        for (i=0; i < orte_process_info.num_nodes; i++) {
            node = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i);
            /* for each node, cycle through the ppn */
            for (j=0; j < ppn[i]; j++) {
                pmap = OBJ_NEW(orte_pmap_t);
                pmap->node = i;
                pmap->local_rank = j;
                pmap->node_rank = j;
                if (ORTE_SUCCESS != (ret = opal_pointer_array_set_item(&jmap->pmap, vpid, pmap))) {
                    ORTE_ERROR_LOG(ret);
                    error = "could not set pmap values";
                    goto error;
                }
                /* if this is me, then define the daemon's vpid to
                 * be the node number
                 */
                if (vpid == ORTE_PROC_MY_NAME->vpid) {
                    ORTE_PROC_MY_DAEMON->jobid = 0;
                    ORTE_PROC_MY_DAEMON->vpid = i;
                    ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch);
                }
                OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                                     "%s node %d name %s rank %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (int) node->index, node->name,
                                     ORTE_VPID_PRINT(vpid)));
                vpid++;
            }
        }
    } else {
        /* cycle across the nodes, dealing out one rank per node per
         * sweep until every node's slots are consumed
         */
        vpid = 0;
        while (vpid < orte_process_info.num_procs) {
            for (i=0; i < orte_process_info.num_nodes && vpid < orte_process_info.num_procs; i++) {
                node = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i);
                if (0 < ppn[i]) {
                    pmap = OBJ_NEW(orte_pmap_t);
                    pmap->node = i;
                    /* note: local ranks are assigned in descending order
                     * as each node's remaining ppn is decremented
                     */
                    pmap->local_rank = ppn[i]-1;
                    pmap->node_rank = ppn[i]-1;
                    if (ORTE_SUCCESS != (ret = opal_pointer_array_set_item(&jmap->pmap, vpid, pmap))) {
                        ORTE_ERROR_LOG(ret);
                        error = "could not set pmap values";
                        goto error;
                    }
                    /* if this is me, then define the daemon's vpid to
                     * be the node number
                     */
                    if (vpid == ORTE_PROC_MY_NAME->vpid) {
                        ORTE_PROC_MY_DAEMON->jobid = 0;
                        ORTE_PROC_MY_DAEMON->vpid = i;
                        ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch);
                    }
                    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                                         "%s node %d name %s rank %d",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         (int) node->index, node->name, (int)vpid));
                    vpid++;
                    --ppn[i];
                }
            }
        }
    }
    free(ppn);
    ppn = NULL;

    /* ensure we pick the correct critical components */
    putenv("OMPI_MCA_grpcomm=hier");
    putenv("OMPI_MCA_routed=direct");

    /* use the default procedure to finish my setup */
    if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ess_base_app_setup";
        goto error;
    }

    if (0 < opal_output_get_verbosity(orte_ess_base_output)) {
        orte_nidmap_dump();
        orte_jobmap_dump();
    }

    return ORTE_SUCCESS;

error:
    /* don't leak the ppn array on any error path */
    if (NULL != ppn) {
        free(ppn);
    }
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    return ret;
}
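/*
 * Aside: a standalone, runnable sketch (plain C, no ORTE types) of the
 * two placement policies implemented above.  Given a per-node process
 * count, byslot fills each node before moving on; bynode deals ranks
 * out round-robin until every node's slots are consumed.  The node
 * count and ppn values are made up for illustration.
 */
#include <stdio.h>

#define NUM_NODES 3

int main(void)
{
    int ppn[NUM_NODES] = {2, 2, 1};
    int num_procs = 5, vpid, i;

    /* byslot: ranks 0..ppn[0]-1 land on node 0, and so on */
    vpid = 0;
    printf("byslot:\n");
    for (i = 0; i < NUM_NODES; i++) {
        int j;
        for (j = 0; j < ppn[i]; j++) {
            printf("  rank %d -> node %d (local rank %d)\n", vpid, i, j);
            vpid++;
        }
    }

    /* bynode: sweep across the nodes, decrementing each node's
     * remaining slots, until all ranks are placed
     */
    vpid = 0;
    printf("bynode:\n");
    while (vpid < num_procs) {
        for (i = 0; i < NUM_NODES && vpid < num_procs; i++) {
            if (0 < ppn[i]) {
                printf("  rank %d -> node %d\n", vpid, i);
                vpid++;
                --ppn[i];
            }
        }
    }
    return 0;
}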
static int alps_set_name(void)
{
    int rc;
    orte_jobid_t jobid;
    char *tmp;
    orte_vpid_t vpid;

    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                         "ess:alps setting name"));

    mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid",
                                   true, false, NULL, &tmp);
    if (NULL == tmp) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, tmp))) {
        ORTE_ERROR_LOG(rc);
        free(tmp);  /* don't leak the param string on error */
        return rc;
    }
    free(tmp);

    mca_base_param_reg_string_name("orte", "ess_vpid", "Process vpid",
                                   true, false, NULL, &tmp);
    if (NULL == tmp) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&starting_vpid, tmp))) {
        ORTE_ERROR_LOG(rc);
        free(tmp);  /* don't leak the param string on error */
        return rc;
    }
    free(tmp);

    if (ORTE_SUCCESS != (rc = get_vpid(&vpid, starting_vpid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    ORTE_PROC_MY_NAME->jobid = jobid;
    ORTE_PROC_MY_NAME->vpid = vpid;
    /* seed the epoch with a sentinel, then ask the ESS for the
     * current epoch of this name
     */
    ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_INVALID);
    ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,
                   orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME));

    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                         "ess:alps set name to %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* get the num procs as provided in the cmd line param */
    if (ORTE_SUCCESS != (rc = orte_ess_env_get())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    if (orte_process_info.max_procs < orte_process_info.num_procs) {
        orte_process_info.max_procs = orte_process_info.num_procs;
    }

    return ORTE_SUCCESS;
}
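/*
 * Aside: the strtol() calls in rte_init above (and the string
 * conversions here) accept whatever the environment provides without
 * checking for parse errors.  A more defensive version of the same
 * environment parsing might look like this self-contained sketch;
 * the helper name parse_rank is hypothetical.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_rank(const char *s, long *out)
{
    char *end = NULL;
    long v;

    errno = 0;
    v = strtol(s, &end, 10);
    if (ERANGE == errno || end == s || '\0' != *end || v < 0) {
        return -1;   /* not a clean non-negative integer */
    }
    *out = v;
    return 0;
}

int example_rank_parse(void)
{
    const char *envar = getenv("OMPI_MCA_orte_rank");
    long rank;

    if (NULL == envar || 0 != parse_rank(envar, &rank)) {
        fprintf(stderr, "could not get process rank\n");
        return 1;
    }
    printf("rank %ld\n", rank);
    return 0;
}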