/* Emit a framework-level verbose trace when the ALPS OOB component
 * is shut down.  No resources are held by this component at shutdown. */
static void component_shutdown(void)
{
    opal_output_verbose(2, orte_oob_base_framework.framework_output,
                        "%s ALPS SHUTDOWN",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
/*
 * Prepare the environment of an OMPI application process prior to
 * fork/exec: merge the launch environ into the app context, re-prefix
 * PATH/LD_LIBRARY_PATH when --prefix was given, and export the
 * OMPI_* / OMPI_MCA_* control variables the MPI layer expects.
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_TAKE_NEXT_OPTION if the "ompi"
 * personality does not apply to this job, or an ORTE error code.
 *
 * BUG FIX: the MPI-3 envar loop previously reassigned the "app"
 * parameter while walking jdata->apps, so every opal_setenv() call
 * after that loop updated the environment of the *last* app context
 * rather than the one we were asked to set up.  A separate cursor
 * variable (app_ctx) is now used.
 */
static int setup_fork(orte_job_t *jdata, orte_app_context_t *app)
{
    int i;
    char *param;
    bool oversubscribed;
    orte_node_t *node;
    char **envcpy, **nps, **firstranks;
    char *npstring, *firstrankstring;
    char *num_app_ctx;
    bool takeus = false;
    orte_app_context_t *app_ctx;   /* loop cursor - must NOT clobber "app" */

    opal_output_verbose(1, orte_schizo_base_framework.framework_output,
                        "%s schizo:ompi: setup_fork",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* if we were given a list of personalities, only handle this job
     * when "ompi" is among them */
    if (NULL != orte_schizo_base.personalities) {
        for (i=0; NULL != jdata->personality[i]; i++) {
            if (0 == strcmp(jdata->personality[i], "ompi")) {
                takeus = true;
                break;
            }
        }
        if (!takeus) {
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
    }

    /* see if the mapper thinks we are oversubscribed */
    oversubscribed = false;
    if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool,
                                                                  ORTE_PROC_MY_NAME->vpid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_OVERSUBSCRIBED)) {
        oversubscribed = true;
    }

    /* setup base environment: copy the current environ and merge in
     * the app context environ */
    if (NULL != app->env) {
        /* the merge allocates a fresh array - free the original
         * context->env to avoid a memory leak */
        envcpy = opal_environ_merge(orte_launch_environ, app->env);
        opal_argv_free(app->env);
    } else {
        envcpy = opal_argv_copy(orte_launch_environ);
    }
    app->env = envcpy;

    /* Special case handling for --prefix: it is possible that the user
     * also did "-x PATH" and/or "-x LD_LIBRARY_PATH", clobbering the
     * prefixing done by the prior pls.  If we find PATH or
     * LD_LIBRARY_PATH in the context env, re-prefix them. */
    param = NULL;
    orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&param, OPAL_STRING);
    for (i = 0; NULL != param && NULL != app->env && NULL != app->env[i]; ++i) {
        char *newenv;
        if (0 == strncmp("PATH=", app->env[i], 5)) {
            /* Reset PATH */
            asprintf(&newenv, "%s/bin:%s", param, app->env[i] + 5);
            opal_setenv("PATH", newenv, true, &app->env);
            free(newenv);
        } else if (0 == strncmp("LD_LIBRARY_PATH=", app->env[i], 16)) {
            /* Reset LD_LIBRARY_PATH */
            asprintf(&newenv, "%s/lib:%s", param, app->env[i] + 16);
            opal_setenv("LD_LIBRARY_PATH", newenv, true, &app->env);
            free(newenv);
        }
    }
    if (NULL != param) {
        free(param);
    }

    /* pass my contact info to the local proc so we can talk */
    opal_setenv("OMPI_MCA_orte_local_daemon_uri",
                orte_process_info.my_daemon_uri, true, &app->env);

    /* pass the hnp's contact info to the local proc in case it
     * needs it */
    if (NULL != orte_process_info.my_hnp_uri) {
        opal_setenv("OMPI_MCA_orte_hnp_uri",
                    orte_process_info.my_hnp_uri, true, &app->env);
    }

    /* setup yield schedule - do not override any user-supplied directive! */
    if (oversubscribed) {
        opal_setenv("OMPI_MCA_mpi_yield_when_idle", "1", false, &app->env);
    } else {
        opal_setenv("OMPI_MCA_mpi_yield_when_idle", "0", false, &app->env);
    }

    /* set the app_context number into the environment */
    asprintf(&param, "%ld", (long)app->idx);
    opal_setenv("OMPI_MCA_orte_app_num", param, true, &app->env);
    free(param);

    /* although the total_slots_alloc is the universe size, users would
     * appreciate an MPI-specific public envar for it.  Also required
     * by the ompi_attributes code!  (Yes, this breaks the abstraction
     * barrier to some extent - just live with it.) */
    asprintf(&param, "%ld", (long)jdata->total_slots_alloc);
    opal_setenv("OMPI_UNIVERSE_SIZE", param, true, &app->env);
    free(param);

    /* pass the number of nodes involved in this job */
    asprintf(&param, "%ld", (long)(jdata->map->num_nodes));
    opal_setenv("OMPI_MCA_orte_num_nodes", param, true, &app->env);
    free(param);

    /* pass a param telling the child what type and model of cpu we are
     * on, if we know it.  Prefer hwloc's knowledge; fall back to any
     * explicitly-given values. */
    hwloc_obj_t obj;
    char *htmp;
    if (NULL != opal_hwloc_topology) {
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        if (NULL != (htmp = (char*)hwloc_obj_get_info_by_name(obj, "CPUType")) ||
            NULL != (htmp = orte_local_cpu_type)) {
            opal_setenv("OMPI_MCA_orte_cpu_type", htmp, true, &app->env);
        }
        if (NULL != (htmp = (char*)hwloc_obj_get_info_by_name(obj, "CPUModel")) ||
            NULL != (htmp = orte_local_cpu_model)) {
            opal_setenv("OMPI_MCA_orte_cpu_model", htmp, true, &app->env);
        }
    } else {
        if (NULL != orte_local_cpu_type) {
            opal_setenv("OMPI_MCA_orte_cpu_type", orte_local_cpu_type, true, &app->env);
        }
        if (NULL != orte_local_cpu_model) {
            opal_setenv("OMPI_MCA_orte_cpu_model", orte_local_cpu_model, true, &app->env);
        }
    }

    /* get shmem's best component name so we can provide a hint to the
     * shmem framework: one place decides the component, and the rest
     * of shmem obeys that decision */
    if (NULL != (param = opal_shmem_base_best_runnable_component_name())) {
        opal_setenv("OMPI_MCA_shmem_RUNTIME_QUERY_hint", param, true, &app->env);
        free(param);
    }

    /* Set an info MCA param that tells the launched processes that any
     * binding policy was applied by us (e.g., so that MPI_INIT doesn't
     * try to bind itself) */
    opal_setenv("OMPI_MCA_orte_bound_at_launch", "1", true, &app->env);

    /* tell the ESS to avoid the singleton component - but don't
     * override anything that may have been provided elsewhere */
    opal_setenv("OMPI_MCA_ess", "^singleton", false, &app->env);

    /* ensure that the spawned process ignores direct launch
     * components, but do not override anything we were given */
    opal_setenv("OMPI_MCA_pmix", "^s1,s2,cray", false, &app->env);

    /* since we want to pass the name as separate components, make sure
     * that the "name" environmental variable is cleared! */
    opal_unsetenv("OMPI_MCA_orte_ess_name", &app->env);

    asprintf(&param, "%ld", (long)jdata->num_procs);
    opal_setenv("OMPI_MCA_orte_ess_num_procs", param, true, &app->env);
    /* num_procs is also the comm_world size - provide the MPI-specific
     * public envar as well */
    opal_setenv("OMPI_COMM_WORLD_SIZE", param, true, &app->env);
    free(param);

    /* same for the local size */
    asprintf(&param, "%ld", (long)jdata->num_local_procs);
    opal_setenv("OMPI_COMM_WORLD_LOCAL_SIZE", param, true, &app->env);
    free(param);

    /* forcibly set the local tmpdir base to match ours */
    opal_setenv("OMPI_MCA_orte_tmpdir_base",
                orte_process_info.tmpdir_base, true, &app->env);

    /* MPI-3 requires we provide some further info to the procs, so we
     * pass them as envars to avoid introducing further ORTE calls in
     * the MPI layer */
    asprintf(&num_app_ctx, "%lu", (unsigned long)jdata->num_apps);

    /* build some common envars we need to pass for MPI-3
     * compatibility.  NOTE: use a separate cursor variable here -
     * overwriting "app" would make the setenv calls below act on the
     * wrong context. */
    nps = NULL;
    firstranks = NULL;
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app_ctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        opal_argv_append_nosize(&nps, ORTE_VPID_PRINT(app_ctx->num_procs));
        opal_argv_append_nosize(&firstranks, ORTE_VPID_PRINT(app_ctx->first_rank));
    }
    npstring = opal_argv_join(nps, ' ');
    firstrankstring = opal_argv_join(firstranks, ' ');
    opal_argv_free(nps);
    opal_argv_free(firstranks);

    /* add the MPI-3 envars */
    opal_setenv("OMPI_NUM_APP_CTX", num_app_ctx, true, &app->env);
    opal_setenv("OMPI_FIRST_RANKS", firstrankstring, true, &app->env);
    opal_setenv("OMPI_APP_CTX_NUM_PROCS", npstring, true, &app->env);
    free(num_app_ctx);
    free(firstrankstring);
    free(npstring);

    return ORTE_SUCCESS;
}
/* called when a receive should be progressed */
/* Event handler driving the Portals4 MTL receive state machine.  One
 * request may see several events (PUT/PUT_OVERFLOW followed by REPLY
 * for long messages); completion_callback fires exactly once per
 * request, either on the final data event or via callback_error.
 * NOTE(review): on the ni_fail_type error paths "ret" has not been
 * assigned before the goto callback_error, so the MPI_ERROR set there
 * derives from an indeterminate value - confirm against upstream. */
static int
ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
                                ompi_mtl_portals4_base_request_t* ptl_base_request)
{
    int ret;
    ompi_mtl_portals4_recv_request_t* ptl_request =
        (ompi_mtl_portals4_recv_request_t*) ptl_base_request;
    size_t msg_length = 0;
    ptl_match_bits_t read_match_bits;

    /* as soon as we've seen any event associated with a request, it's
       started */
    ptl_request->req_started = true;

    switch (ev->type) {
    case PTL_EVENT_PUT:
        /* data landed directly in the posted receive buffer */
        OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                             "Recv %lu (0x%lx) got put event",
                             ptl_request->opcount, ev->hdr_data));

        if (ev->ni_fail_type != PTL_NI_OK) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: PTL_EVENT_PUT with ni_fail_type: %d",
                                __FILE__, __LINE__, ev->ni_fail_type);
            goto callback_error;
        }

        /* the ME was consumed by this event - drop our handle to it */
        ptl_request->me_h = PTL_INVALID_HANDLE;

        /* the sender encodes total message length into hdr_data */
        msg_length = MTL_PORTALS4_GET_LENGTH(ev->hdr_data);
        ptl_request->super.super.ompi_req->req_status.MPI_SOURCE =
            MTL_PORTALS4_GET_SOURCE(ev->match_bits);
        ptl_request->super.super.ompi_req->req_status.MPI_TAG =
            MTL_PORTALS4_GET_TAG(ev->match_bits);
        if (OPAL_UNLIKELY(msg_length > ptl_request->delivery_len)) {
            /* incoming message larger than the posted buffer */
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "truncate expected: %ld %ld",
                                msg_length, ptl_request->delivery_len);
            ptl_request->super.super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE;
        }

#if OPAL_ENABLE_DEBUG
        ptl_request->hdr_data = ev->hdr_data;
#endif

        if (!MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits) && ompi_mtl_portals4.protocol == rndv) {
            /* If it's not a short message and we're doing rndv, we
               only have the first part of the message.  Issue the get
               to pull the second part of the message. */
            if (ptl_request->is_triggered) {
                /* triggered op handles the get; no data counted yet */
                ptl_request->super.super.ompi_req->req_status._ucount = 0;
            }
            else {
                /* eager fragment already delivered; pull the remainder */
                ptl_request->super.super.ompi_req->req_status._ucount = ompi_mtl_portals4.eager_limit;
                MTL_PORTALS4_SET_READ_BITS(read_match_bits,
                                           MTL_PORTALS4_GET_CONTEXT(ev->match_bits),
                                           MTL_PORTALS4_GET_TAG(ev->match_bits));

                ret = read_msg((char*) ptl_request->delivery_ptr + ompi_mtl_portals4.eager_limit,
                               ((msg_length > ptl_request->delivery_len) ?
                                ptl_request->delivery_len : msg_length) - ompi_mtl_portals4.eager_limit,
                               ev->initiator,
                               read_match_bits,
                               ompi_mtl_portals4.eager_limit,
                               ptl_request);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                    if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr);
                    goto callback_error;
                }
            }
        } else {
            /* If we're either using the eager protocol or were a
               short message, all data has been received, so complete
               the message. */
            ret = ompi_mtl_datatype_unpack(ptl_request->convertor,
                                           ev->start,
                                           ev->mlength);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                    "%s:%d: ompi_mtl_datatype_unpack failed: %d",
                                    __FILE__, __LINE__, ret);
                ptl_request->super.super.ompi_req->req_status.MPI_ERROR = ret;
            }
            ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength;

            OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                                 "Recv %lu (0x%lx) completed, expected",
                                 ptl_request->opcount, ptl_request->hdr_data));
            ptl_request->super.super.completion_callback(&ptl_request->super.super);
        }
        break;

    case PTL_EVENT_REPLY:
        /* the get issued by read_msg() has completed */
        OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                             "Recv %lu (0x%lx) got reply event",
                             ptl_request->opcount, ptl_request->hdr_data));

        if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_OK)) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: PTL_EVENT_REPLY with ni_fail_type: %d",
                                __FILE__, __LINE__, ev->ni_fail_type);
            goto callback_error;
        }

        if (ptl_request->is_triggered) {
            /* release the counting event used by the triggered op */
            PtlCTFree(ptl_request->ct_h);
            ptl_request->ct_h = PTL_INVALID_HANDLE;
        }

        /* set the received length in the status, now that we know
           exactly how much data was sent. */
        ptl_request->super.super.ompi_req->req_status._ucount += ev->mlength;

#if OMPI_MTL_PORTALS4_FLOW_CONTROL
        /* give back the send slot consumed by this transfer */
        OPAL_THREAD_ADD32(&ompi_mtl_portals4.flowctl.send_slots, 1);
#endif

        /* make sure the data is in the right place.  Use _ucount for
           the total length because it will be set correctly for all
           three protocols.  mlength is only correct for eager, and
           delivery_len is the length of the buffer, not the length of
           the send. */
        ret = ompi_mtl_datatype_unpack(ptl_request->convertor,
                                       ptl_request->delivery_ptr,
                                       ptl_request->super.super.ompi_req->req_status._ucount);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: ompi_mtl_datatype_unpack failed: %d",
                                __FILE__, __LINE__, ret);
            ptl_request->super.super.ompi_req->req_status.MPI_ERROR = ret;
        }

        OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                             "Recv %lu (0x%lx) completed, reply",
                             ptl_request->opcount, ptl_request->hdr_data));
        ptl_request->super.super.completion_callback(&ptl_request->super.super);
        break;

    case PTL_EVENT_PUT_OVERFLOW:
        /* message matched in the overflow (unexpected) list */
        OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                             "Recv %lu (0x%lx) got put_overflow event",
                             ptl_request->opcount, ev->hdr_data));

        if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_OK)) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: PTL_EVENT_PUT_OVERFLOW with ni_fail_type: %d",
                                __FILE__, __LINE__, ev->ni_fail_type);
            goto callback_error;
        }

        ptl_request->me_h = PTL_INVALID_HANDLE;

        msg_length = MTL_PORTALS4_GET_LENGTH(ev->hdr_data);
        ptl_request->super.super.ompi_req->req_status.MPI_SOURCE =
            MTL_PORTALS4_GET_SOURCE(ev->match_bits);
        ptl_request->super.super.ompi_req->req_status.MPI_TAG =
            MTL_PORTALS4_GET_TAG(ev->match_bits);
        if (OPAL_UNLIKELY(msg_length > ptl_request->delivery_len)) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "truncate unexpected: %ld %ld %d",
                                msg_length, ptl_request->delivery_len,
                                MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits));
            ptl_request->super.super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE;
        }

#if OPAL_ENABLE_DEBUG
        ptl_request->hdr_data = ev->hdr_data;
#endif

        ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength;

        /* overflow case.  Short messages have the buffer stashed
           somewhere.  Long messages left in buffer at the source */
        if (MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits)) {
            if (ev->mlength > 0) {
                /* unpack the stashed data out of the overflow buffer */
                struct iovec iov;
                uint32_t iov_count = 1;
                size_t max_data;
                iov.iov_base = (char*) ev->start;
                iov.iov_len = ev->mlength;
                max_data = iov.iov_len;

                ret = opal_convertor_unpack(ptl_request->convertor,
                                            &iov, &iov_count,
                                            &max_data );
                if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr);
                if (OPAL_UNLIKELY(ret < 0)) {
                    opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                        "%s:%d: opal_convertor_unpack failed: %d",
                                        __FILE__, __LINE__, ret);
                    goto callback_error;
                }
            }
            /* if it's a sync, send the ack */
            if (MTL_PORTALS4_IS_SYNC_MSG(ev->hdr_data)) {
                OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                                     "Recv %lu (0x%lx) sending sync ack",
                                     ptl_request->opcount, ptl_request->hdr_data));
                ret = PtlPut(ompi_mtl_portals4.zero_md_h,
                             0,
                             0,
                             PTL_NO_ACK_REQ,
                             ev->initiator,
                             ompi_mtl_portals4.read_idx,
                             ev->hdr_data,
                             0,
                             NULL,
                             0);
                if (OPAL_UNLIKELY(PTL_OK != ret)) {
                    opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                        "%s:%d: PtlPut failed: %d",
                                        __FILE__, __LINE__, ret);
                    goto callback_error;
                }
            }

            OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                                 "Recv %lu (0x%lx) completed, unexpected short (0x%lx)",
                                 ptl_request->opcount, ptl_request->hdr_data,
                                 (long) ev->start));
            ptl_request->super.super.completion_callback(&ptl_request->super.super);

        } else {
            if (!ptl_request->is_triggered) {
                if (ev->mlength > 0) {
                    /* if rndv or triggered, copy the eager part to the right place */
                    memcpy(ptl_request->delivery_ptr, ev->start, ev->mlength);
                }

                MTL_PORTALS4_SET_READ_BITS(read_match_bits,
                                           MTL_PORTALS4_GET_CONTEXT(ev->match_bits),
                                           MTL_PORTALS4_GET_TAG(ev->match_bits));

                /* pull the remainder left at the source */
                ret = read_msg((char*) ptl_request->delivery_ptr + ev->mlength,
                               ((msg_length > ptl_request->delivery_len) ?
                                ptl_request->delivery_len : msg_length) - ev->mlength,
                               ev->initiator,
                               read_match_bits,
                               ev->mlength,
                               ptl_request);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                    if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr);
                    goto callback_error;
                }
            }
        }
        break;

    case PTL_EVENT_LINK:
        /* ME successfully linked - nothing to do */
        break;

    default:
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "Unhandled receive callback with event type %d",
                            ev->type);
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;

callback_error:
    /* record the failure in the request status and complete it */
    ptl_request->super.super.ompi_req->req_status.MPI_ERROR =
        ompi_mtl_portals4_get_error(ret);
    ptl_request->super.super.completion_callback(&ptl_request->super.super);
    return OMPI_SUCCESS;
}
static int initialize(int argc, char *argv[]) { int ret, exit_status = OPAL_SUCCESS; char * tmp_env_var = NULL; /* * Make sure to init util before parse_args * to ensure installdirs is setup properly * before calling mca_base_open(); */ if( OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) { return ret; } /* * Parse Command line arguments */ if (OPAL_SUCCESS != (ret = parse_args(argc, argv))) { exit_status = ret; goto cleanup; } /* * Setup OPAL Output handle from the verbose argument */ if( opal_restart_globals.verbose ) { opal_restart_globals.output = opal_output_open(NULL); opal_output_set_verbosity(opal_restart_globals.output, 10); } else { opal_restart_globals.output = 0; /* Default=STDOUT */ } /* * Turn off the selection of the CRS component, * we need to do that later */ (void) mca_base_var_env_name("crs_base_do_not_select", &tmp_env_var); opal_setenv(tmp_env_var, "1", /* turn off the selection */ true, &environ); free(tmp_env_var); tmp_env_var = NULL; /* * Make sure we select the proper compress component. 
*/ if( NULL != opal_restart_globals.snapshot_compress ) { (void) mca_base_var_env_name("compress", &tmp_env_var); opal_setenv(tmp_env_var, opal_restart_globals.snapshot_compress, true, &environ); free(tmp_env_var); tmp_env_var = NULL; } /* * Initialize the OPAL layer */ if (OPAL_SUCCESS != (ret = opal_init(&argc, &argv))) { exit_status = ret; goto cleanup; } /* * If the checkpoint was compressed, then decompress it before continuing */ if( NULL != opal_restart_globals.snapshot_compress ) { char * zip_dir = NULL; char * tmp_str = NULL; /* Make sure to clear the selection for the restart, * this way the user can swich compression mechanism * across restart */ (void) mca_base_var_env_name("compress", &tmp_env_var); opal_unsetenv(tmp_env_var, &environ); free(tmp_env_var); tmp_env_var = NULL; opal_asprintf(&zip_dir, "%s/%s%s", opal_restart_globals.snapshot_loc, opal_restart_globals.snapshot_ref, opal_restart_globals.snapshot_compress_postfix); if (0 > (ret = access(zip_dir, F_OK)) ) { opal_output(opal_restart_globals.output, "Error: Unable to access the file [%s]!", zip_dir); exit_status = OPAL_ERROR; goto cleanup; } opal_output_verbose(10, opal_restart_globals.output, "Decompressing (%s)", zip_dir); opal_compress.decompress(zip_dir, &tmp_str); if( NULL != zip_dir ) { free(zip_dir); zip_dir = NULL; } if( NULL != tmp_str ) { free(tmp_str); tmp_str = NULL; } } /* * If a cache directory has been suggested, see if it exists */ if( NULL != opal_restart_globals.snapshot_cache ) { if(0 == (ret = access(opal_restart_globals.snapshot_cache, F_OK)) ) { opal_output_verbose(10, opal_restart_globals.output, "Using the cached snapshot (%s) instead of (%s)", opal_restart_globals.snapshot_cache, opal_restart_globals.snapshot_loc); if( NULL != opal_restart_globals.snapshot_loc ) { free(opal_restart_globals.snapshot_loc); opal_restart_globals.snapshot_loc = NULL; } opal_restart_globals.snapshot_loc = opal_dirname(opal_restart_globals.snapshot_cache); } else { 
opal_show_help("help-opal-restart.txt", "cache_not_avail", true, opal_restart_globals.snapshot_cache, opal_restart_globals.snapshot_loc); } } /* * Mark this process as a tool */ opal_cr_is_tool = true; cleanup: return exit_status; }
static int parse_cli(int argc, int start, char **argv) { int i, j, k; bool ignore; char *no_dups[] = { "grpcomm", "odls", "rml", "routed", NULL }; bool takeus = false; opal_output_verbose(1, orte_schizo_base_framework.framework_output, "%s schizo:ompi: parse_cli", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); /* if they gave us a list of personalities, * see if we are included */ if (NULL != orte_schizo_base.personalities) { for (i=0; NULL != orte_schizo_base.personalities[i]; i++) { if (0 == strcmp(orte_schizo_base.personalities[i], "ompi")) { takeus = true; break; } } if (!takeus) { return ORTE_ERR_TAKE_NEXT_OPTION; } } else { /* attempt to auto-detect CLI options that * we recognize */ } for (i = 0; i < (argc-start); ++i) { if (0 == strcmp("-mca", argv[i]) || 0 == strcmp("--mca", argv[i]) ) { /* ignore this one */ if (0 == strcmp(argv[i+1], "mca_base_env_list")) { i += 2; continue; } /* It would be nice to avoid increasing the length * of the orted cmd line by removing any non-ORTE * params. However, this raises a problem since * there could be OPAL directives that we really * -do- want the orted to see - it's only the OMPI * related directives we could ignore. This becomes * a very complicated procedure, however, since * the OMPI mca params are not cleanly separated - so * filtering them out is nearly impossible. * * see if this is already present so we at least can * avoid growing the cmd line with duplicates */ ignore = false; if (NULL != orted_cmd_line) { for (j=0; NULL != orted_cmd_line[j]; j++) { if (0 == strcmp(argv[i+1], orted_cmd_line[j])) { /* already here - if the value is the same, * we can quitely ignore the fact that they * provide it more than once. However, some * frameworks are known to have problems if the * value is different. We don't have a good way * to know this, but we at least make a crude * attempt here to protect ourselves. 
*/ if (0 == strcmp(argv[i+2], orted_cmd_line[j+1])) { /* values are the same */ ignore = true; break; } else { /* values are different - see if this is a problem */ for (k=0; NULL != no_dups[k]; k++) { if (0 == strcmp(no_dups[k], argv[i+1])) { /* print help message * and abort as we cannot know which one is correct */ orte_show_help("help-orterun.txt", "orterun:conflicting-params", true, orte_basename, argv[i+1], argv[i+2], orted_cmd_line[j+1]); return ORTE_ERR_BAD_PARAM; } } /* this passed muster - just ignore it */ ignore = true; break; } } } } if (!ignore) { opal_argv_append_nosize(&orted_cmd_line, argv[i]); opal_argv_append_nosize(&orted_cmd_line, argv[i+1]); opal_argv_append_nosize(&orted_cmd_line, argv[i+2]); } i += 2; } } return ORTE_SUCCESS; }
/*
 * Drain pending bytes for the current receive message on this peer's
 * socket.  Returns ORTE_SUCCESS when the block is complete,
 * ORTE_ERR_RESOURCE_BUSY / ORTE_ERR_WOULD_BLOCK when the caller should
 * retry later, and ORTE_ERR_COMM_FAILURE on a hard error.
 */
static int read_bytes(mca_oob_usock_peer_t* peer)
{
    int bytes_read;

    /* keep reading until the block is complete or we cannot progress */
    while (0 < peer->recv_msg->rdbytes) {
        bytes_read = read(peer->sd, peer->recv_msg->rdptr, peer->recv_msg->rdbytes);
        if (bytes_read < 0) {
            if (EINTR == opal_socket_errno) {
                /* interrupted - just retry */
                continue;
            }
            if (EAGAIN == opal_socket_errno) {
                /* tell the caller to keep this message on active, but
                 * let the event lib cycle so other messages can
                 * progress while this socket is busy */
                return ORTE_ERR_RESOURCE_BUSY;
            }
            if (EWOULDBLOCK == opal_socket_errno) {
                /* same situation: keep the message active and let the
                 * event lib cycle */
                return ORTE_ERR_WOULD_BLOCK;
            }
            /* we hit an error and cannot progress this message -
             * report the error back to the RML and let the caller know
             * to abort this message */
            opal_output_verbose(OOB_USOCK_DEBUG_FAIL,
                                orte_oob_base_framework.framework_output,
                                "%s-%s mca_oob_usock_msg_recv: readv failed: %s (%d)",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&(peer->name)),
                                strerror(opal_socket_errno),
                                opal_socket_errno);
            return ORTE_ERR_COMM_FAILURE;
        }

        if (0 == bytes_read) {
            /* the remote peer closed the connection - report that
             * condition and let the caller know */
            opal_output_verbose(OOB_USOCK_DEBUG_FAIL,
                                orte_oob_base_framework.framework_output,
                                "%s-%s mca_oob_usock_msg_recv: peer closed connection",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&(peer->name)));
            /* stop all events */
            if (peer->recv_ev_active) {
                opal_event_del(&peer->recv_event);
                peer->recv_ev_active = false;
            }
            if (peer->timer_ev_active) {
                opal_event_del(&peer->timer_event);
                peer->timer_ev_active = false;
            }
            if (peer->send_ev_active) {
                opal_event_del(&peer->send_event);
                peer->send_ev_active = false;
            }
            if (NULL != peer->recv_msg) {
                OBJ_RELEASE(peer->recv_msg);
                peer->recv_msg = NULL;
            }
            mca_oob_usock_peer_close(peer);
            return ORTE_ERR_WOULD_BLOCK;
        }

        /* we were able to read something, so adjust counters and location */
        peer->recv_msg->rdbytes -= bytes_read;
        peer->recv_msg->rdptr += bytes_read;
    }

    /* we read the full data block */
    return ORTE_SUCCESS;
}
/*
 * Find as many components of a given type as possible.  This covers
 * statically-linked components as well as (optionally) shared-library
 * MCA components of the appropriate type found in the given directory.
 *
 * The consolidated list of available components is returned through
 * found_components.  When include_mode is set, every explicitly
 * requested component must be present or OPAL_ERR_NOT_FOUND is
 * returned after a help message.
 */
int mca_base_component_find(const char *directory, const char *type,
                            const mca_base_component_t *static_components[],
                            char **requested_component_names,
                            bool include_mode,
                            opal_list_t *found_components,
                            bool open_dso_components)
{
    int i;
    opal_list_item_t *item;
    mca_base_component_list_item_t *cli;

    /* start with the components that were statically linked in */
    OBJ_CONSTRUCT(found_components, opal_list_t);
    for (i = 0; NULL != static_components[i]; ++i) {
        if (!use_component(include_mode,
                           (const char **) requested_component_names,
                           static_components[i]->mca_component_name)) {
            continue;
        }
        cli = OBJ_NEW(mca_base_component_list_item_t);
        if (NULL == cli) {
            return OPAL_ERR_OUT_OF_RESOURCE;
        }
        cli->cli_component = static_components[i];
        opal_list_append(found_components, (opal_list_item_t *) cli);
    }

#if OMPI_WANT_LIBLTDL
    /* look for any available dynamic components in the specified
       directory, unless dlopen was disabled via MCA param */
    if (open_dso_components) {
        int param, param_disable_dlopen;
        param = mca_base_param_find("mca", NULL, "component_disable_dlopen");
        mca_base_param_lookup_int(param, &param_disable_dlopen);
        if (0 == param_disable_dlopen) {
            find_dyn_components(directory, type,
                                (const char **) requested_component_names,
                                include_mode, found_components);
        }
    } else {
        opal_output_verbose(40, 0,
                            "mca: base: component_find: dso loading for %s MCA components disabled",
                            type);
    }
#endif

    /* Ensure that *all* requested components exist.  Print a warning
       and abort if they do not. */
    for (i = 0; include_mode &&
             NULL != requested_component_names &&
             NULL != requested_component_names[i]; ++i) {
        for (item = opal_list_get_first(found_components);
             opal_list_get_end(found_components) != item;
             item = opal_list_get_next(item)) {
            cli = (mca_base_component_list_item_t *) item;
            if (0 == strcmp(requested_component_names[i],
                            cli->cli_component->mca_component_name)) {
                break;
            }
        }
        if (opal_list_get_end(found_components) == item) {
            char h[MAXHOSTNAMELEN];
            gethostname(h, sizeof(h));
            opal_show_help("help-mca-base.txt",
                           "find-available:not-valid", true,
                           h, type, requested_component_names[i]);
            return OPAL_ERR_NOT_FOUND;
        }
    }

    /* All done */
    return OPAL_SUCCESS;
}
/* Compare the local interface belonging to "module" against the remote
 * interface described by proc_modex_addr and return a connection
 * weight; larger values indicate more desirable pairings. */
static uint64_t compute_weight(
    ompi_btl_usnic_module_t *module,
    ompi_btl_usnic_addr_t *proc_modex_addr)
{
    char my_ip_string[INET_ADDRSTRLEN], peer_ip_string[INET_ADDRSTRLEN];
    uint32_t mynet, peernet;
    int err, metric;
    uint32_t min_link_speed_gbps;

    inet_ntop(AF_INET, &module->if_ipv4_addr,
              my_ip_string, sizeof(my_ip_string));
    inet_ntop(AF_INET, &proc_modex_addr->ipv4_addr,
              peer_ip_string, sizeof(peer_ip_string));

    /* CIDR-mask both addresses and compare: same subnet means the
       peer is directly reachable */
    mynet = ompi_btl_usnic_get_ipv4_subnet(module->if_ipv4_addr,
                                           module->if_cidrmask);
    peernet = ompi_btl_usnic_get_ipv4_subnet(proc_modex_addr->ipv4_addr,
                                             proc_modex_addr->cidrmask);

    opal_output_verbose(5, USNIC_OUT,
                        "btl:usnic:%s: checking my IP address/subnet (%s/%d) vs. peer (%s/%d): %s",
                        __func__, my_ip_string, module->if_cidrmask,
                        peer_ip_string, proc_modex_addr->cidrmask,
                        (mynet == peernet ? "match" : "DO NOT match"));

    if (!mca_btl_usnic_component.use_udp) {
        /* non-UDP transport: only same-subnet peers are reachable;
           any positive weight is fine for reachable ones */
        return (mynet != peernet) ? WEIGHT_UNREACHABLE : 1;
    }

    min_link_speed_gbps = MIN(module->super.btl_bandwidth,
                              proc_modex_addr->link_speed_mbps) / 1000;

    metric = 0;
    err = ompi_btl_usnic_nl_ip_rt_lookup(mca_btl_usnic_component.unlsk,
                                         module->if_ipv4_addr,
                                         proc_modex_addr->ipv4_addr,
                                         &metric);
    if (0 != err) {
        return 0; /* no connectivity */
    }

    /* Format in binary           MSB                             LSB
     * most sig. 32-bits:  00000000 0000000A BBBBBBBB 00000001
     * least sig. 32-bits: CCCCCCCC CCCCCCCC CCCCCCCC CCCCCCCC
     *
     * A = 1 iff same subnet
     * B = min link speed (in Gbps) between iface pair
     * C = metric from routing table
     *
     * That is, this prioritizes interfaces in the same subnet first,
     * followed by having the same link speed.  The extra literal "1"
     * helps prioritize over any zero-cost links that might otherwise
     * make their way into the graph; it could be eliminated if the
     * extra byte is needed.
     *
     * TODO add an MCA parameter to optionally swap the offsets of A
     * and B, thereby prioritizing link speed over same-subnet
     * reachability.
     *
     * FIXME how can we check that the metric is the same before we
     * have communication with this host?  Mismatched metrics could
     * cause the remote peer to make a different pairing decision... */
    if (min_link_speed_gbps > 0xff) {
        opal_output_verbose(20, USNIC_OUT,
                            "clamping min_link_speed_gbps=%u to 255",
                            min_link_speed_gbps);
        min_link_speed_gbps = 0xff;
    }

    return ((uint64_t)(mynet == peernet) << 48) |
           ((uint64_t)(min_link_speed_gbps & 0xff) << 40) |
           ((uint64_t)0x1 << 32) |
           (/*metric=*/0);
}
/**
 * Constructs a bipartite interface graph from all local modules and
 * the given proc's remote interfaces.  The resulting vertices always
 * have the module vertices appear before the proc vertices.  The
 * caller owns the returned graph (*g_out) and must free it.
 */
static int create_proc_module_graph(
    ompi_btl_usnic_proc_t *proc,
    bool proc_is_left,
    ompi_btl_usnic_graph_t **g_out)
{
    int err;
    int i, j;
    int u, v;
    int num_modules;
    ompi_btl_usnic_graph_t *g = NULL;

    if (NULL == g_out) {
        return OMPI_ERR_BAD_PARAM;
    }
    *g_out = NULL;

    num_modules = (int)mca_btl_usnic_component.num_modules;

    /* remote interfaces on one side, local interfaces (modules) on
       the other */
    err = ompi_btl_usnic_gr_create(NULL, NULL, &g);
    if (OMPI_SUCCESS != err) {
        OMPI_ERROR_LOG(err);
        goto out;
    }

    /* vertices for the local modules... */
    for (i = 0; i < num_modules; ++i) {
        int idx = -1;
        err = ompi_btl_usnic_gr_add_vertex(g,
                                           mca_btl_usnic_component.usnic_active_modules[i],
                                           &idx);
        if (OMPI_SUCCESS != err) {
            OMPI_ERROR_LOG(err);
            goto out_free_graph;
        }
        assert(idx == MODULE_VERTEX(i));
    }

    /* ...then vertices for the remote interfaces */
    for (i = 0; i < (int)proc->proc_modex_count; ++i) {
        int idx = -1;
        err = ompi_btl_usnic_gr_add_vertex(g, &proc->proc_modex[i], &idx);
        if (OMPI_SUCCESS != err) {
            OMPI_ERROR_LOG(err);
            goto out_free_graph;
        }
        assert(idx == (int)PROC_VERTEX(i));
    }

    /* now add edges between interfaces that can communicate */
    for (i = 0; i < num_modules; ++i) {
        for (j = 0; j < (int)proc->proc_modex_count; ++j) {
            int64_t weight, cost;

            /* assumption: compute_weight returns the same weight on
             * the remote process with these arguments (effectively)
             * transposed */
            weight = compute_weight(mca_btl_usnic_component.usnic_active_modules[i],
                                    &proc->proc_modex[j]);

            opal_output_verbose(20, USNIC_OUT,
                                "btl:usnic:%s: weight=0x%016" PRIx64 " for edge module[%d] (%p) <--> endpoint[%d] on proc %p",
                                __func__, weight, i,
                                (void *)mca_btl_usnic_component.usnic_active_modules[i],
                                j, (void *)proc);

            if (WEIGHT_UNREACHABLE == weight) {
                continue;
            }
            /* the graph code optimizes for minimum *cost*, but we have
             * been computing weights (negative costs) */
            cost = -weight;
            assert(INT64_MAX != cost);
            assert(INT64_MIN != cost);

            if (proc_is_left) {
                u = PROC_VERTEX(j);
                v = MODULE_VERTEX(i);
            } else {
                u = MODULE_VERTEX(i);
                v = PROC_VERTEX(j);
            }

            opal_output_verbose(20, USNIC_OUT,
                                "btl:usnic:%s: adding edge (%d,%d) with cost=%" PRIi64 " for edge module[%d] <--> endpoint[%d]",
                                __func__, u, v, cost, i, j);
            err = ompi_btl_usnic_gr_add_edge(g, u, v, cost,
                                             /*capacity=*/1,
                                             /*e_data=*/NULL);
            if (OMPI_SUCCESS != err) {
                OMPI_ERROR_LOG(err);
                goto out_free_graph;
            }
        }
    }

    *g_out = g;
    return OMPI_SUCCESS;

out_free_graph:
    ompi_btl_usnic_gr_free(g);
out:
    return err;
}
void orte_state_base_track_procs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata; opal_output_verbose(5, orte_state_base_output, "%s state:base:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state)); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; if (jdata->num_launched == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING); } } else if (ORTE_PROC_STATE_REGISTERED == state) { /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED); } } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* update the proc state */ pdata->state = state; /* Release only the stdin IOF file descriptor for this child, if one * was defined. 
File descriptors for the other IOF channels - stdout, * stderr, and stddiag - were released when their associated pipes * were cleared and closed due to termination of the process */ if (NULL != orte_iof.close) { orte_iof.close(proc, ORTE_IOF_STDIN); } pdata->iof_complete = true; if (pdata->waitpid_recvd) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* update the proc state */ pdata->state = state; pdata->waitpid_recvd = true; if (pdata->iof_complete) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_TERMINATED == state) { /* update the proc state */ pdata->alive = false; pdata->state = state; if (pdata->local_proc) { /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. */ orte_session_dir_finalize(proc); } /* return the allocated slot for reuse */ cleanup_node(pdata); /* track job status */ jdata->num_terminated++; if (jdata->num_terminated == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } } cleanup: OBJ_RELEASE(caddy); }
void orte_state_base_check_all_complete(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata = caddy->jdata; orte_proc_t *proc; int i; orte_std_cntr_t j; orte_job_t *job; orte_node_t *node; orte_job_map_t *map; orte_std_cntr_t index; bool one_still_alive; orte_vpid_t lowest=0; opal_output_verbose(2, orte_state_base_output, "%s state:base:check_job_complete on job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid)); if (NULL == jdata || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { /* just check to see if the daemons are complete */ OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, "%s state:base:check_job_complete - received NULL job, checking daemons", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto CHECK_DAEMONS; } else { /* mark the job as terminated, but don't override any * abnormal termination flags */ if (jdata->state < ORTE_JOB_STATE_UNTERMINATED) { jdata->state = ORTE_JOB_STATE_TERMINATED; } } /* turn off any sensor monitors on this job */ orte_sensor.stop(jdata->jobid); /* tell the IOF that the job is complete */ if (NULL != orte_iof.complete) { orte_iof.complete(jdata); } if (0 < jdata->num_non_zero_exit && !orte_abort_non_zero_exit) { if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) { /* update the exit code */ ORTE_UPDATE_EXIT_STATUS(lowest); } /* warn user */ opal_output(orte_clean_output, "-------------------------------------------------------\n" "While %s job %s terminated normally, %d %s. Further examination may be required.\n" "-------------------------------------------------------", (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child", (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid), jdata->num_non_zero_exit, (1 == jdata->num_non_zero_exit) ? "process returned\na non-zero exit code." 
: "processes returned\nnon-zero exit codes."); } OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, "%s state:base:check_job_completed declared job %s normally terminated - checking all jobs", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); /* if this job is a continuously operating one, then don't do * anything further - just return here */ if (NULL != jdata && (ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls || ORTE_JOB_CONTROL_RECOVERABLE & jdata->controls)) { goto CHECK_ALIVE; } /* if the job that is being checked is the HNP, then we are * trying to terminate the orteds. In that situation, we * do -not- check all jobs - we simply notify the HNP * that the orteds are complete. Also check special case * if jdata is NULL - we want * to definitely declare the job done if the orteds * have completed, no matter what else may be happening. * This can happen if a ctrl-c hits in the "wrong" place * while launching */ CHECK_DAEMONS: if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { if (0 == orte_routed.num_routes()) { /* orteds are done! */ OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, "%s orteds complete - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); if (NULL == jdata) { jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); } ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED); OBJ_RELEASE(caddy); return; } OBJ_RELEASE(caddy); return; } /* Release the resources used by this job. Since some errmgrs may want * to continue using resources allocated to the job as part of their * fault recovery procedure, we only do this once the job is "complete". * Note that an aborted/killed job -is- flagged as complete and will * therefore have its resources released. 
We need to do this after * we call the errmgr so that any attempt to restart the job will * avoid doing so in the exact same place as the current job */ if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) { map = jdata->map; for (index = 0; index < map->nodes->size; index++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { continue; } OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, "%s releasing procs from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); for (i = 0; i < node->procs->size; i++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } if (proc->name.jobid != jdata->jobid) { /* skip procs from another job */ continue; } node->slots_inuse--; node->num_procs--; OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, "%s releasing proc %s from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), node->name)); /* set the entry in the node array to NULL */ opal_pointer_array_set_item(node->procs, i, NULL); /* release the proc once for the map entry */ OBJ_RELEASE(proc); } /* set the node location to NULL */ opal_pointer_array_set_item(map->nodes, index, NULL); /* maintain accounting */ OBJ_RELEASE(node); /* flag that the node is no longer in a map */ node->mapped = false; } OBJ_RELEASE(map); jdata->map = NULL; } CHECK_ALIVE: /* now check to see if all jobs are done - trigger notification of this jdata * object when we find it */ one_still_alive = false; for (j=1; j < orte_job_data->size; j++) { if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) { /* since we are releasing jdata objects as we * go, we can no longer assume that the job_data * array is left justified */ continue; } /* if this is the job we are checking AND it normally terminated, * then activate the "notify_completed" state - this will release * the job state, but is provided so that the HNP main code can * take alternative actions if desired. 
If the state is killed_by_cmd, * then go ahead and release it. We cannot release it if it * abnormally terminated as mpirun needs the info so it can * report appropriately to the user * * NOTE: do not release the primary job (j=1) so we * can pretty-print completion message */ if (NULL != jdata && job->jobid == jdata->jobid) { if (jdata->state == ORTE_JOB_STATE_TERMINATED) { OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, "%s state:base:check_job_completed state is terminated - activating notify", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_NOTIFY_COMPLETED); one_still_alive = true; } else if (jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD || jdata->state == ORTE_JOB_STATE_NOTIFIED) { OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, "%s state:base:check_job_completed state is killed or notified - cleaning up", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* release this object, ensuring that the * pointer array internal accounting * is maintained! */ if (1 < j) { opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ OBJ_RELEASE(jdata); } } continue; } /* if the job is flagged to not be monitored, skip it */ if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & job->controls) { continue; } /* when checking for job termination, we must be sure to NOT check * our own job as it - rather obviously - has NOT terminated! 
*/ if (job->num_terminated < job->num_procs) { /* we have at least one job that is not done yet - we cannot * just return, though, as we need to ensure we cleanout the * job data for the job that just completed */ OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, "%s state:base:check_job_completed job %s is not terminated (%d:%d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job->jobid), job->num_terminated, job->num_procs)); one_still_alive = true; } else { OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, "%s state:base:check_job_completed job %s is terminated (%d vs %d [%s])", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job->jobid), job->num_terminated, job->num_procs, (NULL == jdata) ? "UNKNOWN" : orte_job_state_to_str(jdata->state) )); } } /* if a job is still alive, we just return */ if (one_still_alive) { OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, "%s state:base:check_job_completed at least one job is not terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); OBJ_RELEASE(caddy); return; } /* if we get here, then all jobs are done, so terminate */ OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, "%s state:base:check_job_completed all jobs terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* set the exit status to 0 - this will only happen if it * wasn't already set by an error condition */ ORTE_UPDATE_EXIT_STATUS(0); /* order daemon termination - this tells us to cleanup * our local procs as well as telling remote daemons * to die */ orte_plm.terminate_orteds(); OBJ_RELEASE(caddy); }
/* we cannot use the RML to communicate with SLURM as it doesn't
 * understand our internal protocol, so we have to do a bare-bones
 * exchange based on sockets */
/*
 * Build and transmit a dynamic allocation request for the given job over
 * the raw socket to the Slurm allocator.  Registers a job tracker and a
 * timeout event; the reply is handled asynchronously (see recv_data), so
 * on success this returns ORTE_ERR_ALLOCATION_PENDING to tell the RAS base
 * not to progress the job yet.
 */
static int dyn_allocate(orte_job_t *jdata)
{
    char *cmd_str, **cmd=NULL, *tmp, *jstring;
    char *node_list;
    orte_app_context_t *app;
    int i;
    struct timeval tv;
    local_jobtracker_t *jtrk;
    int64_t i64, *i64ptr;

    if (NULL == mca_ras_slurm_component.config_file) {
        opal_output(0, "Cannot perform dynamic allocation as no Slurm configuration file provided");
        return ORTE_ERR_NOT_FOUND;
    }

    /* track this request */
    jtrk = OBJ_NEW(local_jobtracker_t);
    jtrk->jobid = jdata->jobid;
    opal_list_append(&jobs, &jtrk->super);

    /* construct the command - note that the jdata structure contains
     * a field for the minimum number of nodes required for the job.
     * The node list can be constructed from the union of all the nodes
     * contained in the dash_host field of the app_contexts.  So you'll
     * need to do a little work to build the command.  We don't currently
     * have a field in the jdata structure for "mandatory" vs "optional"
     * allocations, so we'll have to add that someday.  Likewise, you may
     * want to provide a param to adjust the timeout value */
    /* construct the cmd string */
    opal_argv_append_nosize(&cmd, "allocate");

    /* add the jobid */
    orte_util_convert_jobid_to_string(&jstring, jdata->jobid);
    opal_asprintf(&tmp, "jobid=%s", jstring);
    opal_argv_append_nosize(&cmd, tmp);
    free(tmp);
    free(jstring);

    /* if we want the allocation for all apps in one shot,
     * then tell slurm
     *
     * RHC: we don't currently have the ability to handle
     * rolling allocations in the rest of the code base
     */
#if 0
    if (!mca_ras_slurm_component.rolling_alloc) {
        opal_argv_append_nosize(&cmd, "return=all");
    }
#else
    /* rolling allocations are not supported, so always request
     * everything in a single response */
    opal_argv_append_nosize(&cmd, "return=all");
#endif

    /* pass the timeout */
    opal_asprintf(&tmp, "timeout=%d", mca_ras_slurm_component.timeout);
    opal_argv_append_nosize(&cmd, tmp);
    free(tmp);

    /* for each app, add its allocation request info */
    i64ptr = &i64;
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* add the app id, preceded by a colon separator */
        opal_asprintf(&tmp, ": app=%d", (int)app->idx);
        opal_argv_append_nosize(&cmd, tmp);
        free(tmp);
        /* add the number of process "slots" we need */
        opal_asprintf(&tmp, "np=%d", app->num_procs);
        opal_argv_append_nosize(&cmd, tmp);
        free(tmp);
        /* if we were given a minimum number of nodes, pass it along */
        if (orte_get_attribute(&app->attributes, ORTE_APP_MIN_NODES, (void**)&i64ptr, OPAL_INT64)) {
            opal_asprintf(&tmp, "N=%ld", (long int)i64);
            opal_argv_append_nosize(&cmd, tmp);
            free(tmp);
        }
        /* add the list of nodes, if one was given, ensuring
         * that each node only appears once */
        node_list = get_node_list(app);
        if (NULL != node_list) {
            opal_asprintf(&tmp, "node_list=%s", node_list);
            opal_argv_append_nosize(&cmd, tmp);
            free(node_list);
            free(tmp);
        }
        /* add the mandatory/optional flag */
        if (orte_get_attribute(&app->attributes, ORTE_APP_MANDATORY, NULL, OPAL_BOOL)) {
            opal_argv_append_nosize(&cmd, "flag=mandatory");
        } else {
            opal_argv_append_nosize(&cmd, "flag=optional");
        }
    }

    /* assemble it into the final cmd to be sent */
    cmd_str = opal_argv_join(cmd, ' ');
    opal_argv_free(cmd);

    /* start a timer - if the response to our request doesn't appear
     * in the defined time, then we will error out as Slurm isn't
     * responding to us.  NOTE: the timer must be armed BEFORE the send
     * so a fast reply cannot race us */
    opal_event_evtimer_set(orte_event_base, &jtrk->timeout_ev, timeout, jtrk);
    tv.tv_sec = mca_ras_slurm_component.timeout * 2;
    tv.tv_usec = 0;
    opal_event_evtimer_add(&jtrk->timeout_ev, &tv);

    opal_output_verbose(2, orte_ras_base_framework.framework_output,
                        "%s slurm:dynalloc cmd_str = %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cmd_str);

    /* send the NUL-terminated command; failure is only logged since the
     * timeout event will eventually handle a dead connection */
    if (send(socket_fd, cmd_str, strlen(cmd_str)+1, 0) < 0) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
    }
    free(cmd_str);

    /* we cannot wait here for a response as we
     * are already in an event.  So return a value
     * that indicates we are waiting for an
     * allocation so the base functions know
     * that they shouldn't progress the job
     */
    return ORTE_ERR_ALLOCATION_PENDING;
}
static void recv_data(int fd, short args, void *cbdata) { bool found; int i, rc; orte_node_t *nd, *nd2; opal_list_t nds, ndtmp; opal_list_item_t *item, *itm; char recv_msg[8192]; int nbytes, idx, sjob; char **alloc, *nodelist, *tpn; local_jobtracker_t *ptr, *jtrk; local_apptracker_t *aptrk; orte_app_context_t *app; orte_jobid_t jobid; orte_job_t *jdata; char **dash_host = NULL; opal_output_verbose(2, orte_ras_base_framework.framework_output, "%s ras:slurm: dynamic allocation - data recvd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); /* read the data from the socket and put it in the * nodes field of op */ memset(recv_msg, 0, sizeof(recv_msg)); nbytes = read(fd, recv_msg, sizeof(recv_msg) - 1); opal_output_verbose(2, orte_ras_base_framework.framework_output, "%s ras:slurm: dynamic allocation msg: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg); /* check if we got something */ if (0 == nbytes || 0 == strlen(recv_msg) || strstr(recv_msg, "failure") != NULL) { /* show an error here - basically, a "nothing was available" * message */ orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, (0 == strlen(recv_msg)) ? 
"NO MSG" : recv_msg); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED); return; } /* break the message into its component parts, separated by colons */ alloc = opal_argv_split(recv_msg, ':'); /* the first section contains the ORTE jobid for this allocation */ tpn = strchr(alloc[0], '='); orte_util_convert_string_to_jobid(&jobid, tpn+1); /* get the corresponding job object */ jdata = orte_get_job_data_object(jobid); jtrk = NULL; /* find the associated tracking object */ for (item = opal_list_get_first(&jobs); item != opal_list_get_end(&jobs); item = opal_list_get_next(item)) { ptr = (local_jobtracker_t*)item; if (ptr->jobid == jobid) { jtrk = ptr; break; } } if (NULL == jtrk) { orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, "NO JOB TRACKER"); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED); opal_argv_free(alloc); return; } /* stop the timeout event */ opal_event_del(&jtrk->timeout_ev); /* cycle across all the remaining parts - each is the allocation for * an app in this job */ OBJ_CONSTRUCT(&nds, opal_list_t); OBJ_CONSTRUCT(&ndtmp, opal_list_t); idx = -1; sjob = -1; nodelist = NULL; tpn = NULL; for (i=1; NULL != alloc[i]; i++) { if (ORTE_SUCCESS != parse_alloc_msg(alloc[i], &idx, &sjob, &nodelist, &tpn)) { orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED); opal_argv_free(alloc); if (NULL != nodelist) { free(nodelist); } if (NULL != tpn) { free(tpn); } return; } if (idx < 0) { orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED); opal_argv_free(alloc); free(nodelist); free(tpn); return; } if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) { orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED); opal_argv_free(alloc); 
free(nodelist); free(tpn); return; } /* release the current dash_host as that contained the *desired* allocation */ orte_remove_attribute(&app->attributes, ORTE_APP_DASH_HOST); /* track the Slurm jobid */ if (NULL == (aptrk = (local_apptracker_t*)opal_pointer_array_get_item(&jtrk->apps, idx))) { aptrk = OBJ_NEW(local_apptracker_t); opal_pointer_array_set_item(&jtrk->apps, idx, aptrk); } aptrk->sjob = sjob; /* since the nodelist/tpn may contain regular expressions, parse them */ if (ORTE_SUCCESS != (rc = orte_ras_slurm_discover(nodelist, tpn, &ndtmp))) { ORTE_ERROR_LOG(rc); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED); opal_argv_free(alloc); free(nodelist); free(tpn); return; } /* transfer the discovered nodes to our node list, and construct * the new dash_host entry to match what was allocated */ while (NULL != (item = opal_list_remove_first(&ndtmp))) { nd = (orte_node_t*)item; opal_argv_append_nosize(&dash_host, nd->name); /* check for duplicates */ found = false; for (itm = opal_list_get_first(&nds); itm != opal_list_get_end(&nds); itm = opal_list_get_next(itm)) { nd2 = (orte_node_t*)itm; if (0 == strcmp(nd->name, nd2->name)) { found = true; nd2->slots += nd->slots; OBJ_RELEASE(item); break; } } if (!found) { /* append the new node to our list */ opal_list_append(&nds, item); } } /* cleanup */ free(nodelist); free(tpn); } /* cleanup */ opal_argv_free(alloc); OBJ_DESTRUCT(&ndtmp); if (NULL != dash_host) { tpn = opal_argv_join(dash_host, ','); for (idx=0; idx < jdata->apps->size; idx++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) { orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED); opal_argv_free(dash_host); free(tpn); return; } orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, (void*)tpn, OPAL_STRING); } opal_argv_free(dash_host); free(tpn); } if (opal_list_is_empty(&nds)) { /* if we get 
here, then we were able to contact slurm, * which means we are in an actively managed cluster. * However, slurm indicated that nothing is currently * available that meets our requirements. This is a fatal * situation - we do NOT have the option of running on * user-specified hosts as the cluster is managed. */ OBJ_DESTRUCT(&nds); orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } /* store the found nodes */ if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nds, jdata))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nds); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); return; } OBJ_DESTRUCT(&nds); /* default to no-oversubscribe-allowed for managed systems */ if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); } /* flag that the allocation is managed */ orte_managed_allocation = true; /* move the job along */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATION_COMPLETE); /* all done */ return; }
/**
 * Discover available (pre-allocated) nodes.  Allocate the
 * requested number of nodes/process slots to the job.
 *
 * Reads the Slurm environment (SLURM_JOBID, SLURM_NODELIST and the
 * tasks/cpus-per-node variables) and hands the parsed regex off to
 * orte_ras_slurm_discover().  When no allocation exists, falls back to
 * the dynamic-allocation path if that feature is enabled.
 */
static int orte_ras_slurm_allocate(orte_job_t *jdata, opal_list_t *nodes)
{
    int rc;
    int cpus_per_task;
    char *jobid_env;
    char *nodelist_env;
    char *node_regex;
    char *tpn_env;
    char *tpn_copy;
    char *cpt_env;

    /* remember the Slurm jobid for later error reporting; if it is absent
     * we are outside an allocation and can only continue when dynamic
     * allocation has been enabled */
    jobid_env = getenv("SLURM_JOBID");
    if (NULL != jobid_env) {
        orte_job_ident = strdup(jobid_env);
    } else if (!mca_ras_slurm_component.dyn_alloc_enabled) {
        /* nope - nothing we can do */
        opal_output_verbose(2, orte_ras_base_framework.framework_output,
                            "%s ras:slurm: no prior allocation and dynamic alloc disabled",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    nodelist_env = getenv("SLURM_NODELIST");
    if (NULL == nodelist_env) {
        if (mca_ras_slurm_component.dyn_alloc_enabled) {
            /* attempt to get the allocation - dyn_allocate returns
             * ORTE_ERR_ALLOCATION_PENDING when it succeeds in sending the
             * request; the base layer then waits on the libevent response */
            return dyn_allocate(jdata);
        }
        orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
                       "SLURM_NODELIST");
        return ORTE_ERR_NOT_FOUND;
    }

    node_regex = strdup(nodelist_env);
    if (NULL == node_regex) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (mca_ras_slurm_component.use_all) {
        /* oddball debug case: a tool started under Slurm gets only
         * 1 task/node, but mpirun should use the entire allocation, so we
         * fudge by reading the per-node CPU count instead */
        tpn_env = getenv("SLURM_JOB_CPUS_PER_NODE");
        if (NULL == tpn_env) {
            /* couldn't find any version - abort */
            orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
                           "SLURM_JOB_CPUS_PER_NODE");
            free(node_regex);
            return ORTE_ERR_NOT_FOUND;
        }
        tpn_copy = strdup(tpn_env);
        if (NULL == tpn_copy) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            free(node_regex);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        cpus_per_task = 1;
    } else {
        /* get the number of process slots we were assigned on each node */
        tpn_env = getenv("SLURM_TASKS_PER_NODE");
        if (NULL == tpn_env) {
            /* couldn't find any version - abort */
            orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
                           "SLURM_TASKS_PER_NODE");
            free(node_regex);
            return ORTE_ERR_NOT_FOUND;
        }
        tpn_copy = strdup(tpn_env);
        if (NULL == tpn_copy) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            free(node_regex);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        /* get the number of CPUs per task that the user provided to slurm */
        cpt_env = getenv("SLURM_CPUS_PER_TASK");
        if (NULL == cpt_env) {
            cpus_per_task = 1;
        } else {
            cpus_per_task = atoi(cpt_env);
            if (0 >= cpus_per_task) {
                opal_output(0, "ras:slurm:allocate: Got bad value from SLURM_CPUS_PER_TASK. "
                            "Variable was: %s\n", cpt_env);
                ORTE_ERROR_LOG(ORTE_ERROR);
                free(tpn_copy);
                free(node_regex);
                return ORTE_ERROR;
            }
        }
    }

    rc = orte_ras_slurm_discover(node_regex, tpn_copy, nodes);
    free(node_regex);
    free(tpn_copy);
    if (ORTE_SUCCESS != rc) {
        OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                             "%s ras:slurm:allocate: discover failed!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return rc;
    }

    /* record the number of allocated nodes */
    orte_num_allocated_nodes = opal_list_get_size(nodes);

    /* All done */
    OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                         "%s ras:slurm:allocate: success",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    return ORTE_SUCCESS;
}
/*
 * Handle an incoming accumulate request.  If the operand data arrived
 * inline with the header (hdr_msg_length > 0), apply it immediately under
 * the accumulate lock.  Otherwise set up a long-message receive: the data
 * is irecv'd into a temporary buffer and the completion callback applies
 * it later (the datatype/op references are handed off to the longreq).
 *
 * Returns OMPI_SUCCESS or the error from the op processing / irecv.
 */
int ompi_osc_pt2pt_sendreq_recv_accum(ompi_osc_pt2pt_module_t *module,
                                      ompi_osc_pt2pt_send_header_t *header,
                                      void *payload)
{
    int ret = OMPI_SUCCESS;
    struct ompi_op_t *op = ompi_osc_pt2pt_op_create(header->hdr_target_op);
    ompi_proc_t *proc = ompi_comm_peer_lookup( module->p2p_comm, header->hdr_origin );
    /* note: datatype_create advances 'payload' past the packed datatype
     * description, leaving it pointing at the user data */
    struct ompi_datatype_t *datatype =
        ompi_osc_pt2pt_datatype_create(proc, &payload);

    if (header->hdr_msg_length > 0) {
        /* short message: data arrived inline - apply it right away */
        /* lock the window for accumulates */
        OPAL_THREAD_LOCK(&module->p2p_acc_lock);

        /* copy the data from the temporary buffer into the user window */
        ret = ompi_osc_pt2pt_process_op(module, header, datatype, op, payload,
                                        header->hdr_msg_length);

        /* unlock the window for accumulates */
        OPAL_THREAD_UNLOCK(&module->p2p_acc_lock);

        /* Release datatype & op */
        OBJ_RELEASE(datatype);
        OBJ_RELEASE(op);

        OPAL_THREAD_ADD32(&(module->p2p_num_pending_in), -1);

        opal_output_verbose(50, ompi_osc_base_output,
                            "%d received accum message from %d",
                            module->p2p_comm->c_my_rank,
                            header->hdr_origin);
    } else {
        /* long message: post an irecv and defer the accumulate to the
         * completion callback */
        ompi_osc_pt2pt_longreq_t *longreq;
        ptrdiff_t lb, extent, true_lb, true_extent;
        size_t buflen;

        /* figure out how big a buffer we need */
        ompi_ddt_get_extent(datatype, &lb, &extent);
        ompi_ddt_get_true_extent(datatype, &true_lb, &true_extent);
        buflen = true_extent + (header->hdr_target_count - 1) * extent;

        /* get a longreq and fill it in; ownership of the datatype and op
         * references transfers to the longreq (released by the callback) */
        ompi_osc_pt2pt_longreq_alloc(&longreq);
        longreq->req_comp_cb = ompi_osc_pt2pt_sendreq_recv_accum_long_cb;
        longreq->req_datatype = datatype;
        longreq->req_op = op;
        longreq->req_module = module;

        /* allocate a buffer to receive into ... */
        /* NOTE(review): on this failure path the longreq, datatype, and op
         * references are leaked; a proper fix needs the longreq dealloc
         * API, which is not visible in this file - TODO confirm */
        longreq->req_comp_cbdata = malloc(buflen + sizeof(ompi_osc_pt2pt_send_header_t));
        if (NULL == longreq->req_comp_cbdata) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        /* fill in tmp header */
        memcpy(longreq->req_comp_cbdata, header,
               sizeof(ompi_osc_pt2pt_send_header_t));
        /* record the true buffer length so the callback knows how much
         * data was received */
        ((ompi_osc_pt2pt_send_header_t*) longreq->req_comp_cbdata)->hdr_msg_length = buflen;

        ret = mca_pml.pml_irecv(((char*) longreq->req_comp_cbdata) + sizeof(ompi_osc_pt2pt_send_header_t),
                                header->hdr_target_count, datatype, header->hdr_origin,
                                header->hdr_origin_tag, module->p2p_comm, &(longreq->req_pml_req));

        opal_output_verbose(50, ompi_osc_base_output,
                            "%d started long recv accum message from %d (%d)",
                            module->p2p_comm->c_my_rank,
                            header->hdr_origin,
                            header->hdr_origin_tag);

        /* put the send request in the waiting list */
        OPAL_THREAD_LOCK(&(module->p2p_lock));
        opal_list_append(&(module->p2p_long_msgs), &(longreq->super.super));
        OPAL_THREAD_UNLOCK(&(module->p2p_lock));
    }

    return ret;
}
/*
 * For a specific module, see if this proc has matching address/modex
 * info.  If so, create an endpoint and return it.
 *
 * Implementation note: This code relies on the order of modules on a local
 * side matching the order of the modex entries that we send around, otherwise
 * both sides may not agree on a bidirectional connection.  It also assumes
 * that add_procs will be invoked on the local modules in that same order, for
 * the same reason.  If those assumptions do not hold, we will need to
 * canonicalize this match ordering somehow, probably by (jobid,vpid) pair or
 * by the interface MAC or IP address.
 *
 * On success *index_out holds the index of the remote modex entry matched
 * to this module (or -1 when no match exists).  Returns OMPI_SUCCESS,
 * OMPI_ERR_NOT_FOUND when no interface pairing exists, OMPI_ERR_UNREACH on
 * MTU mismatch, or a graph/matching error code.
 */
static int match_modex(ompi_btl_usnic_module_t *module,
                       ompi_btl_usnic_proc_t *proc,
                       int *index_out)
{
    int err = OMPI_SUCCESS;
    size_t i;
    uint32_t num_modules;
    ompi_btl_usnic_graph_t *g = NULL;
    int nme;
    int *me;
    bool proc_is_left;

    if (NULL == index_out) {
        return OMPI_ERR_BAD_PARAM;
    }
    *index_out = -1;

    num_modules = mca_btl_usnic_component.num_modules;

    opal_output_verbose(20, USNIC_OUT, "btl:usnic:%s: module=%p proc=%p with dimensions %d x %d",
                        __func__, (void *)module, (void *)proc,
                        num_modules, (int)proc->proc_modex_count);

    /* We compute an interface match-up table once for each (module,proc) pair
     * and cache it in the proc.  Store per-proc instead of per-module, since
     * MPI dynamic process routines can add procs but not new modules. */
    if (NULL == proc->proc_ep_match_table) {
        proc->proc_ep_match_table = malloc(num_modules *
                                           sizeof(*proc->proc_ep_match_table));
        if (NULL == proc->proc_ep_match_table) {
            OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        /* initialize to "no matches" */
        for (i = 0; i < num_modules; ++i) {
            proc->proc_ep_match_table[i] = -1;
        }

        /* For graphs where all edges are equal (and even for some other
         * graphs), two peers making matching calculations with "mirror image"
         * graphs might not end up with the same matching.  Ensure that both
         * sides are always setting up the exact same graph by always putting
         * the process with the lower (jobid,vpid) on the "left". */
        proc_is_left =
            (ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL,
                                          &proc->proc_ompi->proc_name,
                                          &(ompi_proc_local()->proc_name)) < 0);

        err = create_proc_module_graph(proc, proc_is_left, &g);
        if (OMPI_SUCCESS != err) {
            goto out_free_table;
        }

        /* 'me' receives the matched edge list from the solver */
        nme = 0;
        err = ompi_btl_usnic_solve_bipartite_assignment(g, &nme, &me);
        if (OMPI_SUCCESS != err) {
            OMPI_ERROR_LOG(err);
            goto out_free_graph;
        }

        /* populates proc_ep_match_table (and, presumably,
         * proc_match_exists - confirm in its definition) from the edges */
        edge_pairs_to_match_table(proc, proc_is_left, nme, me);

        err = ompi_btl_usnic_gr_free(g);
        if (OMPI_SUCCESS != err) {
            OMPI_ERROR_LOG(err);
            return err;
        }
    }

    if (!proc->proc_match_exists) {
        opal_output_verbose(5, USNIC_OUT, "btl:usnic:%s: unable to find any valid interface pairs for proc %s",
                            __func__, OMPI_NAME_PRINT(&proc->proc_ompi->proc_name));
        return OMPI_ERR_NOT_FOUND;
    }

    /* assuming no strange failure cases, this should always be present */
    if (NULL != proc->proc_ep_match_table && proc->proc_match_exists) {
        /* look up the cached match for this particular module */
        for (i = 0; i < num_modules; ++i) {
            if (module == mca_btl_usnic_component.usnic_active_modules[i]) {
                *index_out = proc->proc_ep_match_table[i];
                break;
            }
        }
    }

    /* If MTU does not match, throw an error */
    /* TODO with UDP, do we still want to enforce this restriction or just take
     * the min of the two MTUs?  Another choice is to disqualify this pairing
     * before running the matching algorithm on it. */
    if (*index_out >= 0 &&
        proc->proc_modex[*index_out].mtu != module->if_mtu) {
        opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
                       true,
                       ompi_process_info.nodename,
                       ibv_get_device_name(module->device),
                       module->port_num,
                       module->if_mtu,
                       (NULL == proc->proc_ompi->proc_hostname) ?
                       "unknown" : proc->proc_ompi->proc_hostname,
                       proc->proc_modex[*index_out].mtu);
        *index_out = -1;
        return OMPI_ERR_UNREACH;
    }

    return (*index_out == -1 ? OMPI_ERR_NOT_FOUND : OMPI_SUCCESS);

out_free_graph:
    ompi_btl_usnic_gr_free(g);
out_free_table:
    /* drop the cache so a later call can retry from scratch */
    free(proc->proc_ep_match_table);
    proc->proc_ep_match_table = NULL;
    proc->proc_match_exists = false;
    return err;
}
/*
 * A file descriptor is available/ready for send. Check the state
 * of the socket and take the appropriate action.
 *
 * Event-library callback registered as the peer's send event.
 * @cbdata is the mca_oob_usock_peer_t whose socket became writable;
 * @sd and @flags are supplied by the event library and unused here.
 *
 * The message currently being transmitted is held in peer->send_msg
 * (the "on-deck" slot); any further messages wait on peer->send_queue.
 */
void mca_oob_usock_send_handler(int sd, short flags, void *cbdata)
{
    mca_oob_usock_peer_t* peer = (mca_oob_usock_peer_t*)cbdata;
    mca_oob_usock_send_t* msg = peer->send_msg;
    int rc;

    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                        "%s usock:send_handler called to send to peer %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&peer->name));
    switch (peer->state) {
    case MCA_OOB_USOCK_CONNECTING:
    case MCA_OOB_USOCK_CLOSED:
        opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                            "%s usock:send_handler %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            mca_oob_usock_state_print(peer->state));
        /* socket is writable while not yet connected - try to
         * complete the connection handshake */
        mca_oob_usock_peer_complete_connect(peer);
        /* de-activate the send event until the connection
         * handshake completes */
        if (peer->send_ev_active) {
            opal_event_del(&peer->send_event);
            peer->send_ev_active = false;
        }
        break;
    case MCA_OOB_USOCK_CONNECTED:
        opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                            "%s usock:send_handler SENDING TO %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            (NULL == peer->send_msg) ? "NULL" : ORTE_NAME_PRINT(&peer->name));
        if (NULL != msg) {
            /* if the header hasn't been completely sent, send it */
            if (!msg->hdr_sent) {
                if (ORTE_SUCCESS == (rc = send_bytes(peer))) {
                    /* header is completely sent */
                    msg->hdr_sent = true;
                    /* setup to send the data */
                    if (NULL == msg->msg) {
                        /* this was a zero-byte msg - nothing more to do */
                        OBJ_RELEASE(msg);
                        peer->send_msg = NULL;
                        goto next;
                    } else if (NULL != msg->msg->buffer) {
                        /* send the buffer data as a single block */
                        msg->sdptr = msg->msg->buffer->base_ptr;
                        msg->sdbytes = msg->msg->buffer->bytes_used;
                    } else if (NULL != msg->msg->iov) {
                        /* start with the first iovec */
                        msg->sdptr = msg->msg->iov[0].iov_base;
                        msg->sdbytes = msg->msg->iov[0].iov_len;
                        msg->iovnum = 0;
                    } else {
                        /* raw data payload */
                        msg->sdptr = msg->msg->data;
                        msg->sdbytes = msg->msg->count;
                    }
                    /* fall thru and let the send progress */
                } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                           ORTE_ERR_WOULD_BLOCK == rc) {
                    /* exit this event and let the event lib progress */
                    return;
                } else {
                    // report the error
                    opal_output(0, "%s-%s mca_oob_usock_peer_send_handler: unable to send header",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&(peer->name)));
                    opal_event_del(&peer->send_event);
                    peer->send_ev_active = false;
                    /* notify the RML of the failure, then drop the message */
                    msg->msg->status = rc;
                    ORTE_RML_SEND_COMPLETE(msg->msg);
                    OBJ_RELEASE(msg);
                    peer->send_msg = NULL;
                    goto next;
                }
            }
            /* progress the data transmission */
            if (msg->hdr_sent) {
                if (ORTE_SUCCESS == (rc = send_bytes(peer))) {
                    /* this block is complete */
                    if (NULL != msg->msg->buffer) {
                        /* we are done - notify the RML */
                        opal_output_verbose(2, orte_oob_base_framework.framework_output,
                                            "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d",
                                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                            ORTE_NAME_PRINT(&(peer->name)),
                                            msg->hdr.nbytes, peer->sd);
                        msg->msg->status = ORTE_SUCCESS;
                        ORTE_RML_SEND_COMPLETE(msg->msg);
                        OBJ_RELEASE(msg);
                        peer->send_msg = NULL;
                    } else if (NULL != msg->msg->data) {
                        /* this was a relay message - nothing more to do */
                        opal_output_verbose(2, orte_oob_base_framework.framework_output,
                                            "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d",
                                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                            ORTE_NAME_PRINT(&(peer->name)),
                                            msg->hdr.nbytes, peer->sd);
                        OBJ_RELEASE(msg);
                        peer->send_msg = NULL;
                    } else {
                        /* rotate to the next iovec */
                        msg->iovnum++;
                        if (msg->iovnum < msg->msg->count) {
                            msg->sdptr = msg->msg->iov[msg->iovnum].iov_base;
                            msg->sdbytes = msg->msg->iov[msg->iovnum].iov_len;
                            /* exit this event to give the event lib
                             * a chance to progress any other pending
                             * actions */
                            return;
                        } else {
                            /* this message is complete - notify the RML */
                            opal_output_verbose(2, orte_oob_base_framework.framework_output,
                                                "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d",
                                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                ORTE_NAME_PRINT(&(peer->name)),
                                                msg->hdr.nbytes, peer->sd);
                            msg->msg->status = ORTE_SUCCESS;
                            ORTE_RML_SEND_COMPLETE(msg->msg);
                            OBJ_RELEASE(msg);
                            peer->send_msg = NULL;
                        }
                    }
                    /* fall thru to queue the next message */
                } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                           ORTE_ERR_WOULD_BLOCK == rc) {
                    /* exit this event and let the event lib progress */
                    return;
                } else {
                    // report the error
                    opal_output(0, "%s-%s mca_oob_usock_peer_send_handler: unable to send message ON SOCKET %d",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&(peer->name)), peer->sd);
                    opal_event_del(&peer->send_event);
                    peer->send_ev_active = false;
                    msg->msg->status = rc;
                    ORTE_RML_SEND_COMPLETE(msg->msg);
                    OBJ_RELEASE(msg);
                    peer->send_msg = NULL;
                    /* a mid-message send failure is treated as fatal */
                    ORTE_FORCED_TERMINATE(1);
                    return;
                }
            }
        next:
            /* if current message completed - progress any pending sends by
             * moving the next in the queue into the "on-deck" position. Note
             * that this doesn't mean we send the message right now - we will
             * wait for another send_event to fire before doing so. This gives
             * us a chance to service any pending recvs.
             */
            peer->send_msg = (mca_oob_usock_send_t*)
                opal_list_remove_first(&peer->send_queue);
        }
        /* if nothing else to do unregister for send event notifications */
        if (NULL == peer->send_msg && peer->send_ev_active) {
            opal_event_del(&peer->send_event);
            peer->send_ev_active = false;
        }
        break;
    default:
        opal_output(0, "%s-%s mca_oob_usock_peer_send_handler: invalid connection state (%d) on socket %d",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&(peer->name)),
                    peer->state, peer->sd);
        if (peer->send_ev_active) {
            opal_event_del(&peer->send_event);
            peer->send_ev_active = false;
        }
        break;
    }
}
/*
 * Create an endpoint and claim the matched modex slot
 *
 * Matches this module against the peer proc's modex entries, allocates
 * an endpoint bound to the matched remote address, registers it on both
 * the proc's endpoint array and the module's endpoint list, and marks
 * the modex slot as claimed so it cannot be matched again.
 *
 * Returns OMPI_SUCCESS and sets *endpoint_o on success; otherwise
 * returns the match_modex error or OMPI_ERR_OUT_OF_RESOURCE.
 */
int ompi_btl_usnic_create_endpoint(ompi_btl_usnic_module_t *module,
                                   ompi_btl_usnic_proc_t *proc,
                                   ompi_btl_usnic_endpoint_t **endpoint_o)
{
    int err;
    int modex_index;
    ompi_btl_usnic_endpoint_t *endpoint;

    /* look for matching modex info */
    err = match_modex(module, proc, &modex_index);
    if (OMPI_SUCCESS != err) {
        opal_output_verbose(5, USNIC_OUT,
                            "btl:usnic:create_endpoint: did not match usnic modex info for peer %s",
                            OMPI_NAME_PRINT(&proc->proc_ompi->proc_name));
        return err;
    }

    endpoint = OBJ_NEW(ompi_btl_usnic_endpoint_t);
    if (NULL == endpoint) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Initalize the endpoint */
    endpoint->endpoint_module = module;
    /* match_modex succeeded, so the index must be a valid slot */
    assert(modex_index >= 0 && modex_index < (int)proc->proc_modex_count);
    endpoint->endpoint_remote_addr = proc->proc_modex[modex_index];

    /* Initialize endpoint sequence number info.
     * NOTE: the "- 1" values rely on wraparound arithmetic of the
     * sequence-number type; order of these assignments matters because
     * later fields are derived from earlier ones. */
    endpoint->endpoint_next_seq_to_send = module->local_addr.isn;
    endpoint->endpoint_ack_seq_rcvd = endpoint->endpoint_next_seq_to_send - 1;
    endpoint->endpoint_next_contig_seq_to_recv = endpoint->endpoint_remote_addr.isn;
    endpoint->endpoint_highest_seq_rcvd = endpoint->endpoint_next_contig_seq_to_recv - 1;
    endpoint->endpoint_rfstart = WINDOW_SIZE_MOD(endpoint->endpoint_next_contig_seq_to_recv);

    /* Defer creating the ibv_ah.  Since calling ibv_create_ah() may
       trigger ARP resolution, it's better to batch all the endpoints'
       calls to ibv_create_ah() together to get some parallelism. */
    endpoint->endpoint_remote_ah = NULL;

    /* Now claim that modex slot */
    proc->proc_modex_claimed[modex_index] = true;
    MSGDEBUG1_OUT("create_endpoint: module=%p claimed endpoint=%p on proc=%p (hash=0x%" PRIx64 ")\n",
                  (void *)module, (void *)endpoint, (void *)proc,
                  ompi_rte_hash_name(&proc->proc_ompi->proc_name));

    /* Save the endpoint on this proc's array of endpoints.
     * NOTE(review): no bounds check on proc_endpoint_count here -
     * presumably the array was sized for all modules; verify at caller. */
    proc->proc_endpoints[proc->proc_endpoint_count] = endpoint;
    endpoint->endpoint_proc_index = proc->proc_endpoint_count;
    endpoint->endpoint_proc = proc;
    ++proc->proc_endpoint_count;
    /* the endpoint holds a reference on the proc */
    OBJ_RETAIN(proc);

    /* also add endpoint to module's list of endpoints */
    opal_list_append(&(module->all_endpoints),
                     &(endpoint->endpoint_endpoint_li));

    *endpoint_o = endpoint;
    return OMPI_SUCCESS;
}
/*
 * Event-library callback for the peer's recv event: a file descriptor
 * is ready for reading.  Drives the per-peer receive state machine:
 * completes the connect handshake (CONNECT_ACK state) or reads the
 * message header and payload (CONNECTED state), then either delivers
 * the message to the local RML or re-posts it to the OOB for routing
 * to its actual destination.
 */
void mca_oob_usock_recv_handler(int sd, short flags, void *cbdata)
{
    mca_oob_usock_peer_t* peer = (mca_oob_usock_peer_t*)cbdata;
    int rc;
    orte_rml_send_t *snd;

    /* ignore events once abnormal termination has been ordered */
    if (orte_abnormal_term_ordered) {
        return;
    }

    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                        "%s:usock:recv:handler called for peer %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&peer->name));
    switch (peer->state) {
    case MCA_OOB_USOCK_CONNECT_ACK:
        if (ORTE_SUCCESS == (rc = mca_oob_usock_peer_recv_connect_ack(peer, peer->sd, NULL))) {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:usock:recv:handler starting send/recv events",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            /* we connected! Start the send/recv events */
            if (!peer->recv_ev_active) {
                opal_event_add(&peer->recv_event, 0);
                peer->recv_ev_active = true;
            }
            /* connection made - the retry timer is no longer needed */
            if (peer->timer_ev_active) {
                opal_event_del(&peer->timer_event);
                peer->timer_ev_active = false;
            }
            /* if there is a message waiting to be sent, queue it */
            if (NULL == peer->send_msg) {
                peer->send_msg = (mca_oob_usock_send_t*)opal_list_remove_first(&peer->send_queue);
            }
            if (NULL != peer->send_msg && !peer->send_ev_active) {
                opal_event_add(&peer->send_event, 0);
                peer->send_ev_active = true;
            }
            /* update our state */
            peer->state = MCA_OOB_USOCK_CONNECTED;
        } else {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s UNABLE TO COMPLETE CONNECT ACK WITH %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&peer->name));
            opal_event_del(&peer->recv_event);
            peer->recv_ev_active = false;
            /* handshake failure is treated as fatal */
            ORTE_FORCED_TERMINATE(1);
            return;
        }
        break;
    case MCA_OOB_USOCK_CONNECTED:
        opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                            "%s:usock:recv:handler CONNECTED",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        /* allocate a new message and setup for recv */
        if (NULL == peer->recv_msg) {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:usock:recv:handler allocate new recv msg",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            peer->recv_msg = OBJ_NEW(mca_oob_usock_recv_t);
            if (NULL == peer->recv_msg) {
                opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: unable to allocate recv message\n",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&(peer->name)));
                return;
            }
            /* start by reading the header */
            peer->recv_msg->rdptr = (char*)&peer->recv_msg->hdr;
            peer->recv_msg->rdbytes = sizeof(mca_oob_usock_hdr_t);
        }
        /* if the header hasn't been completely read, read it */
        if (!peer->recv_msg->hdr_recvd) {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:usock:recv:handler read hdr",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            if (ORTE_SUCCESS == (rc = read_bytes(peer))) {
                /* completed reading the header */
                peer->recv_msg->hdr_recvd = true;
                /* if this is a zero-byte message, then we are done */
                if (0 == peer->recv_msg->hdr.nbytes) {
                    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s RECVD ZERO-BYTE MESSAGE FROM %s for tag %d",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        ORTE_NAME_PRINT(&peer->name),
                                        peer->recv_msg->hdr.tag);
                    peer->recv_msg->data = NULL;  // make sure
                    peer->recv_msg->rdptr = NULL;
                    peer->recv_msg->rdbytes = 0;
                } else {
                    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s:usock:recv:handler allocate data region of size %lu",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        (unsigned long)peer->recv_msg->hdr.nbytes);
                    /* allocate the data region
                     * NOTE(review): malloc result is not checked here;
                     * a failed allocation would crash in read_bytes */
                    peer->recv_msg->data = (char*)malloc(peer->recv_msg->hdr.nbytes);
                    /* point to it */
                    peer->recv_msg->rdptr = peer->recv_msg->data;
                    peer->recv_msg->rdbytes = peer->recv_msg->hdr.nbytes;
                }
                /* fall thru and attempt to read the data */
            } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                       ORTE_ERR_WOULD_BLOCK == rc) {
                /* exit this event and let the event lib progress */
                return;
            } else {
                /* close the connection */
                opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                    "%s:usock:recv:handler error reading bytes - closing connection",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                mca_oob_usock_peer_close(peer);
                return;
            }
        }

        if (peer->recv_msg->hdr_recvd) {
            /* continue to read the data block - we start from
             * wherever we left off, which could be at the
             * beginning or somewhere in the message
             */
            if (ORTE_SUCCESS == (rc = read_bytes(peer))) {
                /* we recvd all of the message */
                opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                    "%s RECVD COMPLETE MESSAGE FROM %s OF %d BYTES FOR DEST %s TAG %d",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&peer->recv_msg->hdr.origin),
                                    (int)peer->recv_msg->hdr.nbytes,
                                    ORTE_NAME_PRINT(&peer->recv_msg->hdr.dst),
                                    peer->recv_msg->hdr.tag);
                /* am I the intended recipient? */
                if (peer->recv_msg->hdr.dst.jobid == ORTE_PROC_MY_NAME->jobid &&
                    peer->recv_msg->hdr.dst.vpid == ORTE_PROC_MY_NAME->vpid) {
                    /* yes - post it to the RML for delivery */
                    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s DELIVERING TO RML",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                    ORTE_RML_POST_MESSAGE(&peer->recv_msg->hdr.origin, peer->recv_msg->hdr.tag,
                                          peer->recv_msg->hdr.seq_num,
                                          peer->recv_msg->data,
                                          peer->recv_msg->hdr.nbytes);
                    /* NOTE(review): peer->recv_msg is released but not
                     * reset to NULL here - the next recv event would see
                     * a stale pointer; confirm against upstream fix */
                    OBJ_RELEASE(peer->recv_msg);
                } else {
                    /* no - we don't route things, so we promote this
                     * back to the OOB and let another transport move
                     * it along.  If we are a daemon and it is intended
                     * for another of our local procs, it will just come
                     * back to us and be handled then
                     */
                    snd = OBJ_NEW(orte_rml_send_t);
                    snd->dst = peer->recv_msg->hdr.dst;
                    snd->origin = peer->recv_msg->hdr.origin;
                    snd->tag = peer->recv_msg->hdr.tag;
                    snd->data = peer->recv_msg->data;
                    snd->seq_num = peer->recv_msg->hdr.seq_num;
                    snd->count = peer->recv_msg->hdr.nbytes;
                    snd->cbfunc.iov = NULL;
                    snd->cbdata = NULL;
                    /* activate the OOB send state */
                    ORTE_OOB_SEND(snd);
                    /* protect the data - ownership passed to snd */
                    peer->recv_msg->data = NULL;
                    /* cleanup */
                    OBJ_RELEASE(peer->recv_msg);
                    return;
                }
            } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                       ORTE_ERR_WOULD_BLOCK == rc) {
                /* exit this event and let the event lib progress */
                return;
            } else {
                // report the error
                opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: unable to recv message",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&(peer->name)));
                /* turn off the recv event */
                opal_event_del(&peer->recv_event);
                peer->recv_ev_active = false;
                ORTE_FORCED_TERMINATE(1);
                return;
            }
        }
        break;
    default:
        opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: invalid socket state(%d)",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&(peer->name)),
                    peer->state);
        // mca_oob_usock_peer_close(peer);
        break;
    }
}
/*
 * Select an fbtl component/module for the given file handle.
 *
 * Selection logic:
 *   1. If @preferred is non-NULL and its query returns a usable module,
 *      initialize and use it immediately.
 *   2. Otherwise query every available component (or only those named
 *      in the fbtl MCA parameter), pick the one reporting the highest
 *      priority, unquery the losers, and init() the winner.
 *
 * @param file      file handle the selected module will service
 * @param preferred optional component to try first (may be NULL)
 * @return MPI_SUCCESS / the winner's fbtl_module_init() result on
 *         success; OMPI_ERROR or OMPI_ERR_OUT_OF_RESOURCE on failure
 */
int mca_fbtl_base_file_select (struct mca_io_ompio_file_t *file,
                               mca_base_component_t *preferred)
{
    int priority;
    int best_priority;
    opal_list_item_t *item;
    mca_base_component_priority_list_item_t *selectable_item;
    char *names, **name_array;
    int num_names;
    mca_base_component_priority_list_item_t *cpli;
    mca_fbtl_base_component_t *component;
    mca_fbtl_base_component_t *best_component;
    mca_fbtl_base_module_t *module;
    opal_list_t queried;
    queried_module_t *om;
    opal_list_t *selectable;
    char *str;
    int err = MPI_SUCCESS;
    int i;
    bool was_selectable_constructed = false;

    /* Check and see if a preferred component was provided.  If it was
       provided then it should be used (if possible) */
    if (NULL != preferred) {
        str = &(preferred->mca_component_name[0]);
        opal_output_verbose(10, mca_fbtl_base_output,
                            "fbtl:base:file_select: Checking preferred component: %s",
                            str);
        /* query the component for its priority and get its module
           structure - necessary to proceed */
        component = (mca_fbtl_base_component_t *)preferred;
        module = component->fbtlm_file_query (file, &priority);
        if (NULL != module && NULL != module->fbtl_module_init) {
            file->f_fbtl = module;
            file->f_fbtl_component = preferred;
            return module->fbtl_module_init(file);
        }
        /* The preferred component is present but unable to run.  Fall
           through and select from the list of available components. */
    }

    /* Check if anything was requested by means of the name parameter */
    names = NULL;
    mca_base_param_lookup_string (mca_fbtl_base_param, &names);

    if (NULL != names && 0 < strlen(names)) {
        name_array = opal_argv_split (names, ',');
        num_names = opal_argv_count (name_array);

        opal_output_verbose(10, mca_fbtl_base_output,
                            "fbtl:base:file_Select: Checking all available module");

        /* Build the intersection of the requested names and the
           available components; barf if the intersection is empty. */
        selectable = OBJ_NEW(opal_list_t);
        was_selectable_constructed = true;

        for (item = opal_list_get_first(&mca_fbtl_base_components_available);
             item != opal_list_get_end(&mca_fbtl_base_components_available);
             item = opal_list_get_next(item)) {
            cpli = (mca_base_component_priority_list_item_t *) item;
            component = (mca_fbtl_base_component_t *) cpli->super.cli_component;
            opal_output_verbose(10, mca_fbtl_base_output,
                                "select: initialising %s component %s",
                                component->fbtlm_version.mca_type_name,
                                component->fbtlm_version.mca_component_name);
            for (i = 0; i < num_names; i++) {
                if (0 == strcmp(name_array[i],
                                component->fbtlm_version.mca_component_name)) {
                    /* Duplicate the item: the same opal_list_item_t
                       cannot live on two lists at once. */
                    selectable_item = OBJ_NEW (mca_base_component_priority_list_item_t);
                    *selectable_item = *cpli;
                    opal_list_append (selectable, (opal_list_item_t *)selectable_item);
                    break;
                }
            }
        }

        /* fix: name_array was previously leaked */
        opal_argv_free (name_array);

        /* Check for a NULL intersection between the available list and
           the list which was asked for */
        if (0 == opal_list_get_size(selectable)) {
            /* fix: flag was erroneously set to true here; the list is
               being released, so it is no longer "constructed" */
            was_selectable_constructed = false;
            OBJ_RELEASE (selectable);
            opal_output_verbose (10, mca_fbtl_base_output,
                                 "fbtl:base:file_select: preferred modules were not available");
            return OMPI_ERROR;
        }
    } else {
        /* no names requested - consider every available component */
        selectable = &mca_fbtl_base_components_available;
    }

    best_component = NULL;
    best_priority = -1;
    OBJ_CONSTRUCT(&queried, opal_list_t);

    for (item = opal_list_get_first(selectable);
         item != opal_list_get_end(selectable);
         item = opal_list_get_next(item)) {
        cpli = (mca_base_component_priority_list_item_t *) item;
        component = (mca_fbtl_base_component_t *) cpli->super.cli_component;
        opal_output_verbose(10, mca_fbtl_base_output,
                            "select: initialising %s component %s",
                            component->fbtlm_version.mca_type_name,
                            component->fbtlm_version.mca_component_name);
        /* we can call the query function only if there is one :-) */
        if (NULL == component->fbtlm_file_query) {
            opal_output_verbose(10, mca_fbtl_base_output,
                                "select: no query, ignoring the component");
        } else {
            module = component->fbtlm_file_query (file, &priority);
            if (NULL == module || NULL == module->fbtl_module_init) {
                /* query did not return any action which can be used */
                opal_output_verbose(10, mca_fbtl_base_output,
                                    "select: query returned failure");
            } else {
                opal_output_verbose(10, mca_fbtl_base_output,
                                    "select: query returned priority %d",
                                    priority);
                /* is this the best component found so far? */
                if (priority > best_priority) {
                    best_priority = priority;
                    best_component = component;
                }
                /* Remember every queried module so the losers can be
                   unqueried afterward. */
                om = OBJ_NEW(queried_module_t);
                if (NULL == om) {
                    /* fix: drain the queried list and (if we built it)
                       the selectable list instead of leaking them */
                    while (NULL != (item = opal_list_remove_first(&queried))) {
                        OBJ_RELEASE(item);
                    }
                    OBJ_DESTRUCT(&queried);
                    if (was_selectable_constructed) {
                        while (NULL != (item = opal_list_remove_first(selectable))) {
                            OBJ_RELEASE(item);
                        }
                        OBJ_RELEASE(selectable);
                    }
                    return OMPI_ERR_OUT_OF_RESOURCE;
                }
                om->om_component = component;
                om->om_module = module;
                opal_list_append(&queried, (opal_list_item_t *)om);
            }
        }
    }

    /* If the selectable list was constructed as a duplicate (and is not
       a pointer to mca_fbtl_base_components_available), empty and
       destroy it.  fix: the original loop released the items of
       mca_fbtl_base_components_available instead, destroying the shared
       available list and leaking the duplicated items. */
    if (was_selectable_constructed) {
        while (NULL != (item = opal_list_remove_first(selectable))) {
            OBJ_RELEASE (item);
        }
        OBJ_RELEASE (selectable);
        was_selectable_constructed = false;
    }

    if (NULL == best_component) {
        /* No component was able to run properly this time.
           JMS replace with show_help */
        OBJ_DESTRUCT(&queried);
        return OMPI_ERROR;
    }

    /* unquery() the components which were not selected and init() the
       component which was */
    for (item = opal_list_remove_first(&queried);
         NULL != item;
         item = opal_list_remove_first(&queried)) {
        om = (queried_module_t *) item;
        if (om->om_component == best_component) {
            /* the chosen component - initialise its module.  We don't
               return here because we still need to release the other
               queried objects. */
            file->f_fbtl = om->om_module;
            err = om->om_module->fbtl_module_init(file);
            file->f_fbtl_component = (mca_base_component_t *)best_component;
        } else {
            /* not the chosen one - unquery only if the component has
               some cleanup to do */
            if (NULL != om->om_component->fbtlm_file_unquery) {
                (void) om->om_component->fbtlm_file_unquery(file);
                opal_output_verbose(10, mca_fbtl_base_output,
                                    "select: component %s is not selected",
                                    om->om_component->fbtlm_version.mca_component_name);
            }
        }
        OBJ_RELEASE(om);
    }

    opal_output_verbose(10, mca_fbtl_base_output,
                        "select: component %s selected",
                        best_component->fbtlm_version.mca_component_name);
    OBJ_DESTRUCT(&queried);
    return err;
}
int main(int argc, char *argv[]) { int ret, exit_status = OPAL_SUCCESS; int child_pid; int prev_pid = 0; int idx; opal_crs_base_snapshot_t *snapshot = NULL; char * tmp_env_var = NULL; bool select = false; /*************** * Initialize ***************/ if (OPAL_SUCCESS != (ret = initialize(argc, argv))) { exit_status = ret; goto cleanup; } /* * Check for existence of the file, or program in the case of self */ if( OPAL_SUCCESS != (ret = check_file() )) { opal_show_help("help-opal-restart.txt", "invalid_filename", true, opal_restart_globals.snapshot_ref); exit_status = ret; goto cleanup; } /* Re-enable the selection of the CRS component, so we can choose the right one */ idx = mca_base_var_find(NULL, "crs", "base", "do_not_select"); if (0 > idx) { opal_output(opal_restart_globals.output, "MCA variable opal_crs_base_do_not_select not found\n"); exit_status = OPAL_ERROR; goto cleanup; } ret = mca_base_var_set_value(idx, &select, 0, MCA_BASE_VAR_SOURCE_DEFAULT, NULL); if (OPAL_SUCCESS != ret) { exit_status = ret; goto cleanup; } /* * Make sure we are using the correct checkpointer */ if(NULL == expected_crs_comp) { char * full_metadata_path = NULL; FILE * metadata = NULL; opal_asprintf(&full_metadata_path, "%s/%s/%s", opal_restart_globals.snapshot_loc, opal_restart_globals.snapshot_ref, opal_restart_globals.snapshot_metadata); if( NULL == (metadata = fopen(full_metadata_path, "r")) ) { opal_show_help("help-opal-restart.txt", "invalid_metadata", true, opal_restart_globals.snapshot_metadata, full_metadata_path); exit_status = OPAL_ERROR; goto cleanup; } if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(metadata, &expected_crs_comp, &prev_pid)) ) { opal_show_help("help-opal-restart.txt", "invalid_metadata", true, opal_restart_globals.snapshot_metadata, full_metadata_path); exit_status = ret; goto cleanup; } free(full_metadata_path); full_metadata_path = NULL; fclose(metadata); metadata = NULL; } opal_output_verbose(10, opal_restart_globals.output, 
"Restart Expects checkpointer: (%s)", expected_crs_comp); (void) mca_base_var_env_name("crs", &tmp_env_var); opal_setenv(tmp_env_var, expected_crs_comp, true, &environ); free(tmp_env_var); tmp_env_var = NULL; /* Select this component or don't continue. * If the selection of this component fails, then we can't * restart on this node because it doesn't have the proper checkpointer * available. */ if( OPAL_SUCCESS != (ret = opal_crs_base_open(MCA_BASE_OPEN_DEFAULT)) ) { opal_show_help("help-opal-restart.txt", "comp_select_failure", true, "crs", ret); exit_status = ret; goto cleanup; } if( OPAL_SUCCESS != (ret = opal_crs_base_select()) ) { opal_show_help("help-opal-restart.txt", "comp_select_failure", true, expected_crs_comp, ret); exit_status = ret; goto cleanup; } /* * Make sure we have selected the proper component */ if(NULL == expected_crs_comp || 0 != strncmp(expected_crs_comp, opal_crs_base_selected_component.base_version.mca_component_name, strlen(expected_crs_comp)) ) { opal_show_help("help-opal-restart.txt", "comp_select_mismatch", true, expected_crs_comp, opal_crs_base_selected_component.base_version.mca_component_name, ret); exit_status = ret; goto cleanup; } /****************************** * Restart in this process ******************************/ opal_output_verbose(10, opal_restart_globals.output, "Restarting from file (%s)\n", opal_restart_globals.snapshot_ref); snapshot = OBJ_NEW(opal_crs_base_snapshot_t); snapshot->cold_start = true; opal_asprintf(&(snapshot->snapshot_directory), "%s/%s", opal_restart_globals.snapshot_loc, opal_restart_globals.snapshot_ref); opal_asprintf(&(snapshot->metadata_filename), "%s/%s", snapshot->snapshot_directory, opal_restart_globals.snapshot_metadata); /* Since some checkpoint/restart systems don't pass along env vars to the * restarted app, we need to take care of that. * * Included here is the creation of any files or directories that need to be * created before the process is restarted. 
*/ if(OPAL_SUCCESS != (ret = post_env_vars(prev_pid, snapshot) ) ) { exit_status = ret; goto cleanup; } /* * Do the actual restart */ ret = opal_crs.crs_restart(snapshot, false, &child_pid); if (OPAL_SUCCESS != ret) { opal_show_help("help-opal-restart.txt", "restart_cmd_failure", true, opal_restart_globals.snapshot_ref, ret, opal_crs_base_selected_component.base_version.mca_component_name); exit_status = ret; goto cleanup; } /* Should never get here, since crs_restart calls exec */ /*************** * Cleanup ***************/ cleanup: if (OPAL_SUCCESS != (ret = finalize())) { return ret; } if(NULL != snapshot ) OBJ_DESTRUCT(snapshot); return exit_status; }
/** * Function for finding and opening either all MCA components, or the one * that was specifically requested via a MCA parameter. */ static int orte_rmaps_base_open(mca_base_open_flag_t flags) { int rc; /* init the globals */ OBJ_CONSTRUCT(&orte_rmaps_base.selected_modules, opal_list_t); orte_rmaps_base.slot_list = NULL; orte_rmaps_base.mapping = 0; orte_rmaps_base.ranking = 0; /* if a topology file was given, then set our topology * from it. Even though our actual topology may differ, * mpirun only needs to see the compute node topology * for mapping purposes */ if (NULL != rmaps_base_topo_file) { if (OPAL_SUCCESS != (rc = opal_hwloc_base_set_topology(rmaps_base_topo_file))) { orte_show_help("help-orte-rmaps-base.txt", "topo-file", true, rmaps_base_topo_file); return ORTE_ERR_SILENT; } } /* check for violations that has to be detected before we parse the mapping option */ if (NULL != orte_rmaps_base.ppr) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--ppr, -ppr", "--map-by ppr:<pattern>", "rmaps_base_pattern, rmaps_ppr_pattern", "rmaps_base_mapping_policy=ppr:<pattern>"); /* if the mapping policy is NULL, then we can proceed */ if (NULL == rmaps_base_mapping_policy) { asprintf(&rmaps_base_mapping_policy, "ppr:%s", orte_rmaps_base.ppr); } else { return ORTE_ERR_SILENT; } } if (1 < orte_rmaps_base.cpus_per_rank) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--cpus-per-proc, -cpus-per-proc, --cpus-per-rank, -cpus-per-rank", "--map-by <obj>:PE=N, default <obj>=NUMA", "rmaps_base_cpus_per_proc", "rmaps_base_mapping_policy=<obj>:PE=N, default <obj>=NUMA"); } if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_mapping_policy(&orte_rmaps_base.mapping, &orte_rmaps_base.device, rmaps_base_mapping_policy))) { return rc; } if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_ranking_policy(&orte_rmaps_base.ranking, orte_rmaps_base.mapping, rmaps_base_ranking_policy))) { return rc; } if (rmaps_base_bycore) { 
orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--bycore, -bycore", "--map-by core", "rmaps_base_bycore", "rmaps_base_mapping_policy=core"); /* set mapping policy to bycore - error if something else already set */ if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYCORE) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bycore", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYCORE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* set ranking policy to bycore - error if something else already set */ if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) && ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_CORE) { /* error - cannot redefine the default ranking policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking", "bycore", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking)); return ORTE_ERR_SILENT; } ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_CORE); ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN); } if (rmaps_base_byslot) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--byslot, -byslot", "--map-by slot", "rmaps_base_byslot", "rmaps_base_mapping_policy=slot"); /* set mapping policy to byslot - error if something else already set */ if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYSLOT) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "byslot", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return 
ORTE_ERR_SILENT; } ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYSLOT); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* set ranking policy to byslot - error if something else already set */ if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) && ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_SLOT) { /* error - cannot redefine the default ranking policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking", "byslot", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking)); return ORTE_ERR_SILENT; } ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_SLOT); ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN); } if (rmaps_base_bynode) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--bynode, -bynode", "--map-by node", "rmaps_base_bynode", "rmaps_base_mapping_policy=node"); /* set mapping policy to bynode - error if something else already set */ if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYNODE) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNODE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* set ranking policy to bynode - error if something else already set */ if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) && ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_NODE) { /* error - cannot redefine the default ranking policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking", "bynode", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking)); return ORTE_ERR_SILENT; } 
ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_NODE); ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN); } if (1 < orte_rmaps_base.cpus_per_rank) { /* if we were asked for multiple cpus/proc, then we have to * bind to those cpus - any other binding policy is an * error */ if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) { if (opal_hwloc_use_hwthreads_as_cpus) { if (OPAL_BIND_TO_HWTHREAD != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) && OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { orte_show_help("help-orte-rmaps-base.txt", "mismatch-binding", true, orte_rmaps_base.cpus_per_rank, "use-hwthreads-as-cpus", opal_hwloc_base_print_binding(opal_hwloc_binding_policy), "bind-to hwthread"); return ORTE_ERR_SILENT; } } else if (OPAL_BIND_TO_CORE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) && OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { orte_show_help("help-orte-rmaps-base.txt", "mismatch-binding", true, orte_rmaps_base.cpus_per_rank, "cores as cpus", opal_hwloc_base_print_binding(opal_hwloc_binding_policy), "bind-to core"); return ORTE_ERR_SILENT; } } else { if (opal_hwloc_use_hwthreads_as_cpus) { OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD); } else { OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE); } } /* we also need to ensure we are mapping to a high-enough level to have * multiple cpus beneath it - by default, we'll go to the NUMA level */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { if (ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) == ORTE_MAPPING_BYHWTHREAD || (ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) == ORTE_MAPPING_BYCORE && !opal_hwloc_use_hwthreads_as_cpus)) { orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low-init", true); return ORTE_ERR_SILENT; } } else { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s rmaps:base 
pe/rank set - setting mapping to BYNUMA", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNUMA); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); } } if (orte_rmaps_base_pernode) { /* there is no way to resolve this conflict, so if something else was * given, we have no choice but to error out */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } /* ensure we set the mapping policy to ppr */ ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* define the ppr */ orte_rmaps_base.ppr = strdup("1:node"); } if (0 < orte_rmaps_base_n_pernode) { /* there is no way to resolve this conflict, so if something else was * given, we have no choice but to error out */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } /* ensure we set the mapping policy to ppr */ ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* define the ppr */ asprintf(&orte_rmaps_base.ppr, "%d:node", orte_rmaps_base_n_pernode); } if (0 < orte_rmaps_base_n_persocket) { /* there is no way to resolve this conflict, so if something else was * given, we have no choice but to error out */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } /* ensure we set the 
mapping policy to ppr */ ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* define the ppr */ asprintf(&orte_rmaps_base.ppr, "%d:socket", orte_rmaps_base_n_persocket); } /* Should we schedule on the local node or not? */ if (rmaps_base_no_schedule_local) { orte_rmaps_base.mapping |= ORTE_MAPPING_NO_USE_LOCAL; } /* Should we oversubscribe or not? */ if (rmaps_base_no_oversubscribe) { if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && !(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "no-oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN); } /** force oversubscription permission */ if (rmaps_base_oversubscribe) { if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_UNSET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN); /* also set the overload allowed flag */ opal_hwloc_binding_policy |= OPAL_BIND_ALLOW_OVERLOAD; } /* should we display a detailed (developer-quality) version of the map after determining it? 
*/ if (rmaps_base_display_devel_map) { orte_rmaps_base.display_map = true; orte_devel_level_output = true; } /* should we display a diffable report of proc locations after determining it? */ if (rmaps_base_display_diffable_map) { orte_rmaps_base.display_map = true; orte_display_diffable_output = true; } /* Open up all available components */ rc = mca_base_framework_components_open(&orte_rmaps_base_framework, flags); /* check to see if any component indicated a problem */ if (ORTE_MAPPING_CONFLICTED & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { /* the component would have already reported the error, so * tell the rest of the chain to shut up */ return ORTE_ERR_SILENT; } /* All done */ return rc; }
static int post_env_vars(int prev_pid, opal_crs_base_snapshot_t *snapshot) { int ret, exit_status = OPAL_SUCCESS; char *command = NULL; char *proc_file = NULL; char **loc_touch = NULL; char **loc_mkdir = NULL; int argc, i; if( 0 > prev_pid ) { opal_output(opal_restart_globals.output, "Invalid PID (%d)\n", prev_pid); exit_status = OPAL_ERROR; goto cleanup; } /* * This is needed so we can pass the previous environment to the restarted * application process. */ opal_asprintf(&proc_file, "%s/%s-%d", opal_tmp_directory(), OPAL_CR_BASE_ENV_NAME, prev_pid); opal_asprintf(&command, "env | grep OMPI_ > %s", proc_file); opal_output_verbose(5, opal_restart_globals.output, "post_env_vars: Execute: <%s>", command); ret = system(command); if( 0 > ret) { exit_status = ret; goto cleanup; } /* * Any directories that need to be created */ if( NULL == (snapshot->metadata = fopen(snapshot->metadata_filename, "r")) ) { opal_show_help("help-opal-restart.txt", "invalid_metadata", true, opal_restart_globals.snapshot_metadata, snapshot->metadata_filename); exit_status = OPAL_ERROR; goto cleanup; } opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_MKDIR, &loc_mkdir); argc = opal_argv_count(loc_mkdir); for( i = 0; i < argc; ++i ) { if( NULL != command ) { free(command); command = NULL; } opal_asprintf(&command, "mkdir -p %s", loc_mkdir[i]); opal_output_verbose(5, opal_restart_globals.output, "post_env_vars: Execute: <%s>", command); ret = system(command); if( 0 > ret) { exit_status = ret; goto cleanup; } } if( 0 < argc ) { system("sync ; sync"); } /* * Any files that need to exist */ opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_TOUCH, &loc_touch); argc = opal_argv_count(loc_touch); for( i = 0; i < argc; ++i ) { if( NULL != command ) { free(command); command = NULL; } opal_asprintf(&command, "touch %s", loc_touch[i]); opal_output_verbose(5, opal_restart_globals.output, "post_env_vars: Execute: <%s>", command); ret = system(command); if( 0 > ret) { 
exit_status = ret; goto cleanup; } } if( 0 < argc ) { system("sync ; sync"); } cleanup: if( NULL != command) { free(command); command = NULL; } if( NULL != proc_file) { free(proc_file); proc_file = NULL; } if( NULL != loc_mkdir ) { opal_argv_free(loc_mkdir); loc_mkdir = NULL; } if( NULL != loc_touch ) { opal_argv_free(loc_touch); loc_touch = NULL; } if( NULL != snapshot->metadata ) { fclose(snapshot->metadata); snapshot->metadata = NULL; } return exit_status; }
/*
 * Translate a user-supplied mapping specification string (e.g. the value
 * of --map-by) into an orte_mapping_policy_t.
 *
 * policy - (OUT) receives the resulting policy; written only on success
 * device - (OUT, may be NULL) receives a strdup'd device name when the
 *          "dist" policy is selected; caller owns the string
 * inspec - (IN, may be NULL) the specification; NULL selects the default
 *          bysocket mapping with no directives
 *
 * Accepted forms: "policy", "policy:modifiers", ":modifiers" (modifiers
 * on the default policy), and "ppr:<pattern>[:modifiers]". Policy names
 * are matched case-insensitively by prefix.
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_SILENT (a help message was already
 * shown), or an error code propagated from check_modifiers().
 */
int orte_rmaps_base_set_mapping_policy(orte_mapping_policy_t *policy,
                                       char **device, char *inspec)
{
    char *ck;
    char *ptr;
    orte_mapping_policy_t tmp;
    int rc;
    size_t len;
    char *spec;
    char *pch;

    /* set defaults */
    tmp = 0;
    if (NULL != device) {
        *device = NULL;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "%s rmaps:base set policy with %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == inspec) ? "NULL" : inspec);

    if (NULL == inspec) {
        ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYSOCKET);
    } else {
        spec = strdup(inspec);  // protect the input string
        /* see if a colon was included - if so, then we have a policy + modifier */
        ck = strchr(spec, ':');
        if (NULL != ck) {
            /* if the colon is the first character of the string, then we
             * just have modifiers on the default mapping policy */
            if (ck == spec) {
                ck++;  /* step over the colon to the modifier list */
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s rmaps:base only modifiers %s provided - assuming bysocket mapping",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ck);
                ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYSOCKET);
                if (ORTE_ERR_SILENT == (rc = check_modifiers(ck, &tmp)) &&
                    ORTE_ERR_BAD_PARAM != rc) {
                    free(spec);
                    return ORTE_ERR_SILENT;
                }
                free(spec);
                goto setpolicy;
            }
            /* split the string into policy name and modifier list */
            *ck = '\0';
            ck++;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s rmaps:base policy %s modifiers %s provided",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), spec, ck);
            /* "ppr" carries a pattern (e.g. "2:node") after the colon,
             * optionally followed by another colon-delimited modifier list */
            if (0 == strncasecmp(spec, "ppr", strlen(spec))) {
                /* we have to allow additional modifiers here - e.g., specifying
                 * #pe's/proc or oversubscribe - so check for modifiers */
                if (NULL == (ptr = strrchr(ck, ':'))) {
                    /* this is an error - there had to be at least one
                     * colon to delimit the number from the object type */
                    orte_show_help("help-orte-rmaps-base.txt", "invalid-pattern", true, inspec);
                    free(spec);
                    return ORTE_ERR_SILENT;
                }
                ptr++;  // move past the colon
                /* check the remaining string for modifiers - may be none, so
                 * don't emit an error message if the modifier isn't recognized */
                if (ORTE_ERR_SILENT == (rc = check_modifiers(ptr, &tmp)) &&
                    ORTE_ERR_BAD_PARAM != rc) {
                    free(spec);
                    return ORTE_ERR_SILENT;
                }
                /* if we found a modifier, strip it off so only the
                 * ppr pattern itself remains in ck */
                if (ORTE_SUCCESS == rc) {
                    ptr--;
                    *ptr = '\0';
                }
                /* now get the pattern */
                orte_rmaps_base.ppr = strdup(ck);
                ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_PPR);
                ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_GIVEN);
                free(spec);
                goto setpolicy;
            }
            /* not ppr - apply the modifiers, then fall through to parse
             * the policy name below */
            if (ORTE_SUCCESS != (rc = check_modifiers(ck, &tmp)) &&
                ORTE_ERR_TAKE_NEXT_OPTION != rc) {
                if (ORTE_ERR_BAD_PARAM == rc) {
                    orte_show_help("help-orte-rmaps-base.txt", "unrecognized-modifier", true, inspec);
                }
                free(spec);
                return rc;
            }
        }
        /* map the (possibly abbreviated) policy name onto a policy flag */
        len = strlen(spec);
        if (0 == strncasecmp(spec, "slot", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYSLOT);
        } else if (0 == strncasecmp(spec, "node", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYNODE);
        } else if (0 == strncasecmp(spec, "seq", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_SEQ);
        } else if (0 == strncasecmp(spec, "core", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYCORE);
        } else if (0 == strncasecmp(spec, "l1cache", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYL1CACHE);
        } else if (0 == strncasecmp(spec, "l2cache", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYL2CACHE);
        } else if (0 == strncasecmp(spec, "l3cache", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYL3CACHE);
        } else if (0 == strncasecmp(spec, "socket", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYSOCKET);
        } else if (0 == strncasecmp(spec, "numa", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYNUMA);
        } else if (0 == strncasecmp(spec, "board", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYBOARD);
        } else if (0 == strncasecmp(spec, "hwthread", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYHWTHREAD);
            /* if we are mapping processes to individual hwthreads, then
             * we need to treat those hwthreads as separate cpus */
            opal_hwloc_use_hwthreads_as_cpus = true;
        } else if ( NULL != device && 0 == strncasecmp(spec, "dist", len)) {
            /* "dist" requires a device, taken from the rmaps_dist_device
             * MCA parameter; any ":<qualifier>" suffix on it is stripped */
            if (NULL != rmaps_dist_device) {
                if (NULL != (pch = strchr(rmaps_dist_device, ':'))) {
                    *pch = '\0';
                }
                *device = strdup(rmaps_dist_device);
                ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYDIST);
            } else {
                orte_show_help("help-orte-rmaps-base.txt", "device-not-specified", true);
                free(spec);
                return ORTE_ERR_SILENT;
            }
        } else {
            orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "mapping", spec);
            free(spec);
            return ORTE_ERR_SILENT;
        }
        free(spec);
        ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_GIVEN);
    }

 setpolicy:
    *policy = tmp;
    return ORTE_SUCCESS;
}
/*
 * Populate the environment handed to an OMPI application process.
 *
 * path     - optional executable search path to export as OMPI_exec_path
 * cmd_line - parsed orterun command line (checked for -x entries)
 * srcenv   - environment to harvest OMPI_* variables from
 * dstenv   - (IN/OUT) environment list being built for the app
 *
 * Sources, in order: OMPI_* vars from srcenv (non-overwriting, so the
 * cmd line keeps precedence), a tune/conf file via
 * mca_base_var_process_env_list_from_file, -x cmd-line exports, and the
 * OMPI_MCA_mca_base_env_list envar. Using -x together with
 * mca_base_env_list (unless the latter came from the conf file) is a
 * fatal conflict.
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_TAKE_NEXT_OPTION (not the "ompi"
 * personality), or ORTE_ERR_FATAL on conflicting requests.
 */
static int parse_env(char *path,
                     opal_cmd_line_t *cmd_line,
                     char **srcenv,
                     char ***dstenv)
{
    int i, j;
    char *param;
    char *value;
    char *env_set_flag;
    char **vars;
    bool takeus = false;

    opal_output_verbose(1, orte_schizo_base_framework.framework_output,
                        "%s schizo:ompi: parse_env",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    if (NULL != orte_schizo_base.personalities) {
        /* see if we are included */
        for (i=0; NULL != orte_schizo_base.personalities[i]; i++) {
            if (0 == strcmp(orte_schizo_base.personalities[i], "ompi")) {
                takeus = true;
                break;
            }
        }
        if (!takeus) {
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
    }

    for (i = 0; NULL != srcenv[i]; ++i) {
        if (0 == strncmp("OMPI_", srcenv[i], 5)) {
            /* check for duplicate in app->env - this
             * would have been placed there by the
             * cmd line processor. By convention, we
             * always let the cmd line override the
             * environment */
            param = strdup(srcenv[i]);
            value = strchr(param, '=');
            if (NULL == value) {
                /* BUG FIX: defensively skip malformed entries with no '='
                 * instead of dereferencing a NULL strchr() result */
                free(param);
                continue;
            }
            /* terminate the name, step over the equals */
            *value = '\0';
            value++;
            opal_setenv(param, value, false, dstenv);
            free(param);
        }
    }

    /* set necessary env variables for external usage from tune conf file*/
    int set_from_file = 0;
    vars = NULL;
    if (OPAL_SUCCESS == mca_base_var_process_env_list_from_file(&vars) &&
        NULL != vars) {
        for (i=0; NULL != vars[i]; i++) {
            value = strchr(vars[i], '=');
            if (NULL == value) {
                /* BUG FIX: guard against entries lacking '=' */
                continue;
            }
            /* terminate the name of the param */
            *value = '\0';
            /* step over the equals */
            value++;
            /* overwrite any prior entry */
            opal_setenv(vars[i], value, true, dstenv);
            /* save it for any comm_spawn'd apps */
            opal_setenv(vars[i], value, true, &orte_forwarded_envars);
        }
        set_from_file = 1;
        opal_argv_free(vars);
    }

    /* Did the user request to export any environment variables on the cmd line? */
    env_set_flag = getenv("OMPI_MCA_mca_base_env_list");
    if (opal_cmd_line_is_taken(cmd_line, "x")) {
        if (NULL != env_set_flag) {
            /* -x and mca_base_env_list cannot be combined */
            orte_show_help("help-orterun.txt", "orterun:conflict-env-set", false);
            return ORTE_ERR_FATAL;
        }
        j = opal_cmd_line_get_ninsts(cmd_line, "x");
        for (i = 0; i < j; ++i) {
            param = opal_cmd_line_get_param(cmd_line, "x", i, 0);
            if (NULL != (value = strchr(param, '='))) {
                /* terminate the name of the param */
                *value = '\0';
                /* step over the equals */
                value++;
                /* overwrite any prior entry */
                opal_setenv(param, value, true, dstenv);
                /* save it for any comm_spawn'd apps */
                opal_setenv(param, value, true, &orte_forwarded_envars);
            } else {
                /* bare name: forward the current value from our environment */
                value = getenv(param);
                if (NULL != value) {
                    /* overwrite any prior entry */
                    opal_setenv(param, value, true, dstenv);
                    /* save it for any comm_spawn'd apps */
                    opal_setenv(param, value, true, &orte_forwarded_envars);
                } else {
                    opal_output(0, "Warning: could not find environment variable \"%s\"\n", param);
                }
            }
        }
    } else if (NULL != env_set_flag) {
        /* if mca_base_env_list was set, check if some of env vars were set via -x from a conf file.
         * If this is the case, error out. */
        if (!set_from_file) {
            /* set necessary env variables for external usage */
            vars = NULL;
            if (OPAL_SUCCESS == mca_base_var_process_env_list(env_set_flag, &vars) &&
                NULL != vars) {
                for (i=0; NULL != vars[i]; i++) {
                    value = strchr(vars[i], '=');
                    if (NULL == value) {
                        /* BUG FIX: guard against entries lacking '=' */
                        continue;
                    }
                    /* terminate the name of the param */
                    *value = '\0';
                    /* step over the equals */
                    value++;
                    /* overwrite any prior entry */
                    opal_setenv(vars[i], value, true, dstenv);
                    /* save it for any comm_spawn'd apps */
                    opal_setenv(vars[i], value, true, &orte_forwarded_envars);
                }
                opal_argv_free(vars);
            }
        } else {
            orte_show_help("help-orterun.txt", "orterun:conflict-env-set", false);
            return ORTE_ERR_FATAL;
        }
    }

    /* If the user specified --path, store it in the user's app
     * environment via the OMPI_exec_path variable. */
    if (NULL != path) {
        asprintf(&value, "OMPI_exec_path=%s", path);
        opal_argv_append_nosize(dstenv, value);
        /* save it for any comm_spawn'd apps */
        opal_argv_append_nosize(&orte_forwarded_envars, value);
        free(value);
    }
    return ORTE_SUCCESS;
}
/*
 * Function for selecting one component from all those that are
 * available.
 *
 * State-machine event callback (hence the unused fd/args event-library
 * parameters): maps the job carried in cbdata by cycling through the
 * selected rmaps modules, then computes local ranks (and bindings when
 * hwloc is available), optionally prints the map, and activates the
 * MAP_COMPLETE job state. Errors terminate the DVM via ORTE_TERMINATE
 * rather than returning a code.
 */
void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
{
    orte_job_t *jdata;
    orte_job_map_t *map;
    int rc;
    bool did_map;
    opal_list_item_t *item;
    orte_rmaps_base_selected_module_t *mod;
    orte_job_t *parent;
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;

    /* convenience */
    jdata = caddy->jdata;

    /* NOTE: NO PROXY COMPONENT REQUIRED - REMOTE PROCS ARE NOT
     * ALLOWED TO CALL RMAPS INDEPENDENTLY. ONLY THE PLM CAN
     * DO SO, AND ALL PLM COMMANDS ARE RELAYED TO HNP
     */
    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* NOTE: CHECK FOR JDATA->MAP == NULL. IF IT IS, THEN USE
     * THE VALUES THAT WERE READ BY THE LOCAL MCA PARAMS. THE
     * PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY
     * MAPPING DIRECTIVES - OTHERWISE, THAT OBJECT WILL HAVE A
     * NULL MAP FIELD
     * LONE EXCEPTION - WE COPY DISPLAY MAP ACROSS IF THEY
     * DIDN'T SET IT
     */
    if (NULL == jdata->map) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: creating new map for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));
        /* create a map object where we will store the results */
        map = OBJ_NEW(orte_job_map_t);
        if (NULL == map) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
        /* load it with the system defaults */
        map->mapping = orte_rmaps_base.mapping;
        map->ranking = orte_rmaps_base.ranking;
#if OPAL_HAVE_HWLOC
        map->binding = opal_hwloc_binding_policy;
#endif
        if (NULL != orte_rmaps_base.ppr) {
            map->ppr = strdup(orte_rmaps_base.ppr);
        }
        map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
        map->display_map = orte_rmaps_base.display_map;
        /* assign the map object to this job */
        jdata->map = map;
    } else {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: setting mapping policies for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));
        if (!jdata->map->display_map) {
            jdata->map->display_map = orte_rmaps_base.display_map;
        }
        /* set the default mapping policy IFF it wasn't provided */
        if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, orte_rmaps_base.mapping);
        }
        if (!ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
            ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping,
                                       ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping));
        }
        /* ditto for rank and bind policies */
        if (!ORTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) {
            ORTE_SET_RANKING_POLICY(jdata->map->ranking, orte_rmaps_base.ranking);
        }
#if OPAL_HAVE_HWLOC
        if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
            jdata->map->binding = opal_hwloc_binding_policy;
        }
#endif
    }

#if OPAL_HAVE_HWLOC
    /* if we are not going to launch, then we need to set any
     * undefined topologies to match our own so the mapper
     * can operate
     */
    if (orte_do_not_launch) {
        orte_node_t *node;
        hwloc_topology_t t0;
        int i;
        /* node 0 (ourselves) always has a topology - clone it onto
         * any node that lacks one */
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        t0 = node->topology;
        for (i=1; i < orte_node_pool->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
                continue;
            }
            if (NULL == node->topology) {
                node->topology = t0;
            }
        }
    }
#endif

    /* cycle thru the available mappers until one agrees to map
     * the job
     */
    did_map = false;
    for (item = opal_list_get_first(&orte_rmaps_base.selected_modules);
         item != opal_list_get_end(&orte_rmaps_base.selected_modules);
         item = opal_list_get_next(item)) {
        mod = (orte_rmaps_base_selected_module_t*)item;
        if (ORTE_SUCCESS == (rc = mod->module->map_job(jdata))) {
            did_map = true;
            break;
        }
        /* mappers return "next option" if they didn't attempt to
         * map the job. anything else is a true error.
         */
        if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
            ORTE_ERROR_LOG(rc);
            ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
    }

    /* if we get here without doing the map, or with zero procs in
     * the map, then that's an error
     */
    if (!did_map || 0 == jdata->num_procs) {
        orte_show_help("help-orte-rmaps-base.txt", "failed-map", true);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }

    /* compute and save local ranks */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
        ORTE_ERROR_LOG(rc);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }

#if OPAL_HAVE_HWLOC
    /* compute and save bindings */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
        ORTE_ERROR_LOG(rc);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }
#endif

    /* if it is a dynamic spawn, save the bookmark on the parent's job too */
    if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
        if (NULL != (parent = orte_get_job_data_object(jdata->originator.jobid))) {
            parent->bookmark = jdata->bookmark;
        }
    }

    /* if we wanted to display the map, now is the time to do it - ignore
     * daemon job
     */
    if (jdata->map->display_map) {
        char *output;
        int i, j;
        orte_node_t *node;
        orte_proc_t *proc;
        if (orte_display_diffable_output) {
            /* intended solely to test mapping methods, this output
             * can become quite long when testing at scale. Rather
             * than enduring all the malloc/free's required to
             * create an arbitrary-length string, custom-generate
             * the output a line at a time here
             */
            /* display just the procs in a diffable format */
            opal_output(orte_clean_output, "<map>");
            fflush(stderr);
            /* loop through nodes */
            for (i=0; i < jdata->map->nodes->size; i++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
                    continue;
                }
                opal_output(orte_clean_output, "\t<host name=%s>",
                            (NULL == node->name) ? "UNKNOWN" : node->name);
                fflush(stderr);
                for (j=0; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
#if OPAL_HAVE_HWLOC
                    {
                        char locale[64];
                        if (NULL != proc->locale) {
                            hwloc_bitmap_list_snprintf(locale, 64, proc->locale->cpuset);
                        }
                        opal_output(orte_clean_output,
                                    "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu locale=%s binding=%s[%s:%u]>",
                                    ORTE_VPID_PRINT(proc->name.vpid), (long)proc->app_idx,
                                    (unsigned long)proc->local_rank,
                                    (unsigned long)proc->node_rank, locale,
                                    (NULL == proc->cpu_bitmap) ? "NULL" : proc->cpu_bitmap,
                                    opal_hwloc_base_print_level(jdata->map->bind_level), proc->bind_idx);
                    }
#else
                    opal_output(orte_clean_output,
                                "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu>",
                                ORTE_VPID_PRINT(proc->name.vpid), (long)proc->app_idx,
                                (unsigned long)proc->local_rank,
                                (unsigned long)proc->node_rank);
#endif
                    fflush(stderr);
                }
                opal_output(orte_clean_output, "\t</host>");
                fflush(stderr);
            }
#if OPAL_HAVE_HWLOC
            {
                opal_hwloc_locality_t locality;
                orte_proc_t *p0;
                /* test locality - for the first node, print the locality of each proc relative to the first one */
                node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, 0);
                p0 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, 0);
                opal_output(orte_clean_output, "\t<locality>");
                for (j=1; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    locality = opal_hwloc_base_get_relative_locality(node->topology,
                                                                     jdata->map->bind_level,
                                                                     p0->bind_idx,
                                                                     jdata->map->bind_level,
                                                                     proc->bind_idx);
                    opal_output(orte_clean_output,
                                "\t\t<bind_level=%s rank=%s bind_idx=%u rank=%s bind_idx=%u locality=%s>",
                                opal_hwloc_base_print_level(jdata->map->bind_level),
                                ORTE_VPID_PRINT(p0->name.vpid), p0->bind_idx,
                                ORTE_VPID_PRINT(proc->name.vpid), proc->bind_idx,
                                opal_hwloc_base_print_locality(locality));
                }
                opal_output(orte_clean_output, "\t</locality>\n</map>");
                fflush(stderr);
            }
#else
            opal_output(orte_clean_output, "\n</map>");
            fflush(stderr);
#endif
        } else {
            /* human-readable (or XML) rendering via the dss printer */
            opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP);
            if (orte_xml_output) {
                fprintf(orte_xml_fp, "%s\n", output);
                fflush(orte_xml_fp);
            } else {
                opal_output(orte_clean_output, "%s", output);
            }
            free(output);
        }
    }

    /* set the job state to the next position */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_COMPLETE);

    /* cleanup */
    OBJ_RELEASE(caddy);
}
/*
 * Populate the per-process (child) environment for an OMPI application
 * prior to fork/exec: jobid/vpid identifiers, MPI rank envars, PMIx
 * connection id, restart count, barrier/staging flags, and the session
 * directory (optionally chdir'ing into it).
 *
 * jdata - the job the child belongs to (personality + flags consulted)
 * child - the proc being launched (ranks and attributes consulted)
 * app   - app context whose env list is modified in place
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_TAKE_NEXT_OPTION when the job is not
 * an "ompi" personality, or an ORTE error code on failure.
 */
static int setup_child(orte_job_t *jdata, orte_proc_t *child, orte_app_context_t *app)
{
    char *param, *value;
    int rc, i;
    int32_t nrestarts=0, *nrptr;
    bool takeus = false;

    opal_output_verbose(1, orte_schizo_base_framework.framework_output,
                        "%s schizo:ompi: setup_child",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    if (NULL != orte_schizo_base.personalities) {
        /* see if we are included */
        for (i=0; NULL != jdata->personality[i]; i++) {
            if (0 == strcmp(jdata->personality[i], "ompi")) {
                takeus = true;
                break;
            }
        }
        if (!takeus) {
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
    }

    /* setup the jobid */
    if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&value, child->name.jobid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    opal_setenv("OMPI_MCA_ess_base_jobid", value, true, &app->env);
    free(value);

    /* setup the vpid */
    if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name.vpid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    opal_setenv("OMPI_MCA_ess_base_vpid", value, true, &app->env);

    /* although the vpid IS the process' rank within the job, users
     * would appreciate being given a public environmental variable
     * that also represents this value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    opal_setenv("OMPI_COMM_WORLD_RANK", value, true, &app->env);
    free(value);  /* done with this now */

    /* users would appreciate being given a public environmental variable
     * that also represents the local rank value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    if (ORTE_LOCAL_RANK_INVALID == child->local_rank) {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        return rc;
    }
    asprintf(&value, "%lu", (unsigned long) child->local_rank);
    opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env);
    free(value);

    /* users would appreciate being given a public environmental variable
     * that also represents the node rank value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    if (ORTE_NODE_RANK_INVALID == child->node_rank) {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        return rc;
    }
    asprintf(&value, "%lu", (unsigned long) child->node_rank);
    opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, &app->env);
    /* set an mca param for it too */
    opal_setenv("OMPI_MCA_orte_ess_node_rank", value, true, &app->env);
    free(value);

    /* provide the identifier for the PMIx connection - the
     * PMIx connection is made prior to setting the process
     * name itself. Although in most cases the ID and the
     * process name are the same, it isn't necessarily
     * required */
    orte_util_convert_process_name_to_string(&value, &child->name);
    opal_setenv("PMIX_ID", value, true, &app->env);
    free(value);

    nrptr = &nrestarts;
    if (orte_get_attribute(&child->attributes, ORTE_PROC_NRESTARTS, (void**)&nrptr, OPAL_INT32)) {
        /* pass the number of restarts for this proc - will be zero for
         * an initial start, but procs would like to know if they are being
         * restarted so they can take appropriate action
         */
        asprintf(&value, "%d", nrestarts);
        opal_setenv("OMPI_MCA_orte_num_restarts", value, true, &app->env);
        free(value);
    }

    /* if the proc should not barrier in orte_init, tell it */
    if (orte_get_attribute(&child->attributes, ORTE_PROC_NOBARRIER, NULL, OPAL_BOOL) ||
        0 < nrestarts) {
        opal_setenv("OMPI_MCA_orte_do_not_barrier", "1", true, &app->env);
    }

    /* if we are using staged execution, tell it */
    if (orte_staged_execution) {
        opal_setenv("OMPI_MCA_orte_staged_execution", "1", true, &app->env);
    }

    /* if the proc isn't going to forward IO, then we need to flag that
     * it has "completed" iof termination as otherwise it will never fire
     */
    if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
        ORTE_FLAG_SET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
    }

    /* construct the proc's session dir name */
    if (NULL != orte_process_info.tmpdir_base) {
        value = strdup(orte_process_info.tmpdir_base);
    } else {
        value = NULL;
    }
    param = NULL;
    if (ORTE_SUCCESS != (rc = orte_session_dir_get_name(&param, &value, NULL,
                                                        orte_process_info.nodename,
                                                        NULL, &child->name))) {
        ORTE_ERROR_LOG(rc);
        if (NULL != value) {
            free(value);
        }
        return rc;
    }
    free(value);
    /* pass an envar so the proc can find any files it had prepositioned */
    opal_setenv("OMPI_FILE_LOCATION", param, true, &app->env);

    /* if the user wanted the cwd to be the proc's session dir, then
     * switch to that location now
     */
    if (orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) {
        /* create the session dir - may not exist */
        if (OPAL_SUCCESS != (rc = opal_os_dirpath_create(param, S_IRWXU))) {
            ORTE_ERROR_LOG(rc);
            /* doesn't exist with correct permissions, and/or we can't
             * create it - either way, we are done
             */
            free(param);
            return rc;
        }
        /* change to it */
        if (0 != chdir(param)) {
            free(param);
            return ORTE_ERROR;
        }
        /* It seems that chdir doesn't
         * adjust the $PWD enviro variable when it changes the directory. This
         * can cause a user to get a different response when doing getcwd vs
         * looking at the enviro variable. To keep this consistent, we explicitly
         * ensure that the PWD enviro variable matches the CWD we moved to.
         *
         * NOTE: if a user's program does a chdir(), then $PWD will once
         * again not match getcwd! This is beyond our control - we are only
         * ensuring they start out matching.
         */
        opal_setenv("PWD", param, true, &app->env);
        /* update the initial wdir value too */
        opal_setenv("OMPI_MCA_initial_wdir", param, true, &app->env);
    }
    free(param);
    return ORTE_SUCCESS;
}
/* create the initial fragment, pack header, datatype, and payload (if size fits) and send */
/*
 * Build and send the first fragment of a one-sided (PUT/ACC/GET)
 * request: a send header, the packed datatype description, and - when
 * it fits in the eager buffer - the origin payload itself. When the
 * payload does not fit, a tag is allocated so the data can follow in
 * later fragments (hdr_msg_length == 0 signals that case).
 *
 * Returns OMPI_SUCCESS or an OMPI error code; on the error paths the
 * buffer is returned to the component free list.
 */
int
ompi_osc_pt2pt_sendreq_send(ompi_osc_pt2pt_module_t *module,
                            ompi_osc_pt2pt_sendreq_t *sendreq)
{
    int ret = OMPI_SUCCESS;
    opal_free_list_item_t *item;
    ompi_osc_pt2pt_send_header_t *header = NULL;
    ompi_osc_pt2pt_buffer_t *buffer = NULL;
    size_t written_data = 0;
    size_t needed_len = sizeof(ompi_osc_pt2pt_send_header_t);
    const void *packed_ddt;
    size_t packed_ddt_len = ompi_ddt_pack_description_length(sendreq->req_target_datatype);

    /* we always need to send the ddt */
    needed_len += packed_ddt_len;
    if (OMPI_OSC_PT2PT_GET != sendreq->req_type) {
        needed_len += sendreq->req_origin_bytes_packed;
    }

    /* Get a buffer */
    OPAL_FREE_LIST_GET(&mca_osc_pt2pt_component.p2p_c_buffers, item, ret);
    if (NULL == item) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        goto cleanup;
    }
    buffer = (ompi_osc_pt2pt_buffer_t*) item;

    /* verify at least enough space for header */
    if (mca_osc_pt2pt_component.p2p_c_eager_size < sizeof(ompi_osc_pt2pt_send_header_t)) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }

    /* setup buffer */
    buffer->cbfunc = ompi_osc_pt2pt_sendreq_send_cb;
    buffer->cbdata = (void*) sendreq;

    /* pack header */
    header = (ompi_osc_pt2pt_send_header_t*) buffer->payload;
    written_data += sizeof(ompi_osc_pt2pt_send_header_t);
    header->hdr_base.hdr_flags = 0;
    header->hdr_windx = sendreq->req_module->p2p_comm->c_contextid;
    header->hdr_origin = sendreq->req_module->p2p_comm->c_my_rank;
    header->hdr_origin_sendreq.pval = (void*) sendreq;
    header->hdr_origin_tag = 0;
    header->hdr_target_disp = sendreq->req_target_disp;
    header->hdr_target_count = sendreq->req_target_count;

    switch (sendreq->req_type) {
    case OMPI_OSC_PT2PT_PUT:
        header->hdr_base.hdr_type = OMPI_OSC_PT2PT_HDR_PUT;
#if OMPI_ENABLE_MEM_DEBUG
        header->hdr_target_op = 0;
#endif
        break;
    case OMPI_OSC_PT2PT_ACC:
        header->hdr_base.hdr_type = OMPI_OSC_PT2PT_HDR_ACC;
        /* only accumulate carries a meaningful target op */
        header->hdr_target_op = sendreq->req_op_id;
        break;
    case OMPI_OSC_PT2PT_GET:
        header->hdr_base.hdr_type = OMPI_OSC_PT2PT_HDR_GET;
#if OMPI_ENABLE_MEM_DEBUG
        header->hdr_target_op = 0;
#endif
        break;
    }

    /* Set datatype id and / or pack datatype */
    ret = ompi_ddt_get_pack_description(sendreq->req_target_datatype, &packed_ddt);
    if (OMPI_SUCCESS != ret) goto cleanup;
    memcpy((unsigned char*) buffer->payload + written_data,
           packed_ddt, packed_ddt_len);
    written_data += packed_ddt_len;

    if (OMPI_OSC_PT2PT_GET != sendreq->req_type) {
        /* if sending data and it fits, pack payload */
        if (mca_osc_pt2pt_component.p2p_c_eager_size >=
            written_data + sendreq->req_origin_bytes_packed) {
            struct iovec iov;
            uint32_t iov_count = 1;
            size_t max_data = sendreq->req_origin_bytes_packed;
            iov.iov_len = max_data;
            iov.iov_base = (IOVBASE_TYPE*)((unsigned char*) buffer->payload + written_data);
            ret = ompi_convertor_pack(&sendreq->req_origin_convertor,
                                      &iov, &iov_count, &max_data );
            if (ret < 0) {
                ret = OMPI_ERR_FATAL;
                goto cleanup;
            }
            assert(max_data == sendreq->req_origin_bytes_packed);
            written_data += max_data;
            header->hdr_msg_length = sendreq->req_origin_bytes_packed;
        } else {
            /* payload too big for eager buffer - send it separately
             * under a freshly created tag */
            header->hdr_msg_length = 0;
            header->hdr_origin_tag = create_send_tag(module);
        }
    } else {
        header->hdr_msg_length = 0;
    }
    buffer->len = written_data;

    /* convert header to network byte order when the target is
     * big-endian and we are not */
#ifdef WORDS_BIGENDIAN
    header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO;
#elif OMPI_ENABLE_HETEROGENEOUS_SUPPORT
    if (sendreq->req_target_proc->proc_arch & OMPI_ARCH_ISBIGENDIAN) {
        header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO;
        OMPI_OSC_PT2PT_SEND_HDR_HTON(*header);
    }
#endif

    /* send fragment */
    opal_output_verbose(51, ompi_osc_base_output,
                        "%d sending sendreq to %d",
                        sendreq->req_module->p2p_comm->c_my_rank,
                        sendreq->req_target_rank);
    ret = MCA_PML_CALL(isend(buffer->payload, buffer->len, MPI_BYTE,
                             sendreq->req_target_rank, -200,
                             MCA_PML_BASE_SEND_STANDARD,
                             module->p2p_comm, &buffer->request));
    /* track the outstanding control send; the buffer is recycled by the
     * completion callback */
    opal_list_append(&module->p2p_pending_control_sends, &buffer->super.super);
    goto done;

 cleanup:
    /* item is always assigned by OPAL_FREE_LIST_GET before any goto
     * can reach here */
    if (item != NULL) {
        OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_buffers, item);
    }

 done:
    return ret;
}
/*
 * Post a non-blocking receive through the Portals4 MTL.
 *
 * Builds the Portals match/ignore bits from (context id, src, tag),
 * prepares the delivery buffer via the datatype engine, optionally arms a
 * triggered read for the rendezvous path, and appends a use-once match
 * entry (ME) to the priority list.  For messages larger than the eager
 * limit, spins in the progress engine until the ME is linked so that long
 * unexpected messages are guaranteed to make progress.
 *
 * @param mtl         MTL module (used only for endpoint lookup)
 * @param comm        communicator the receive is posted on
 * @param src         source rank or MPI_ANY_SOURCE
 * @param tag         message tag or MPI_ANY_TAG
 * @param convertor   datatype convertor describing the receive buffer
 * @param mtl_request caller-allocated request, cast to the Portals4 request
 *
 * @return OMPI_SUCCESS or an OMPI error code; on error any temporary
 *         (non-contiguous) receive buffer allocated here is freed.
 */
int
ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl,
                        struct ompi_communicator_t *comm,
                        int src,
                        int tag,
                        struct opal_convertor_t *convertor,
                        mca_mtl_request_t *mtl_request)
{
    ptl_match_bits_t read_match_bits, recv_match_bits, recv_ignore_bits;
    int ret = OMPI_SUCCESS;
    ptl_process_t remote_proc;
    ompi_mtl_portals4_recv_request_t *ptl_request =
        (ompi_mtl_portals4_recv_request_t*) mtl_request;
    void *start;
    size_t length;
    bool free_after;
    ptl_me_t me;

    /* Resolve the Portals process identifier of the sender.  Wildcard
       sources match any nid/pid (or any rank in logical addressing);
       MPI_COMM_WORLD with logical addressing maps rank directly. */
    if (MPI_ANY_SOURCE == src) {
        if (ompi_mtl_portals4.use_logical) {
            remote_proc.rank = PTL_RANK_ANY;
        } else {
            remote_proc.phys.nid = PTL_NID_ANY;
            remote_proc.phys.pid = PTL_PID_ANY;
        }
    } else if ((ompi_mtl_portals4.use_logical) && (MPI_COMM_WORLD == comm)) {
        remote_proc.rank = src;
    } else {
        ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, src );
        remote_proc = *((ptl_process_t*)
                        ompi_mtl_portals4_get_endpoint (mtl, ompi_proc));
    }

    MTL_PORTALS4_SET_RECV_BITS(recv_match_bits, recv_ignore_bits,
                               comm->c_contextid, src, tag);
    MTL_PORTALS4_SET_READ_BITS(read_match_bits, comm->c_contextid, tag);

    /* Get a (possibly temporary) contiguous buffer for delivery;
       free_after tells us whether we own it. */
    ret = ompi_mtl_datatype_recv_buf(convertor, &start, &length, &free_after);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    ptl_request->super.type = portals4_req_recv;
    ptl_request->super.event_callback = ompi_mtl_portals4_recv_progress;
#if OPAL_ENABLE_DEBUG
    ptl_request->opcount =
        OPAL_THREAD_ADD64((int64_t*) &ompi_mtl_portals4.recv_opcount, 1);
    ptl_request->hdr_data = 0;
#endif
    ptl_request->buffer_ptr = (free_after) ? start : NULL;
    ptl_request->convertor = convertor;
    ptl_request->delivery_ptr = start;
    ptl_request->delivery_len = length;
    ptl_request->req_started = false;
    ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS;
    ptl_request->super.super.ompi_req->req_status._ucount = 0;

    /* Triggered reads are only usable for fully-specified, non-eager
       receives (no wildcards, rendezvous protocol, above the eager limit). */
    ptl_request->is_triggered = ((ompi_mtl_portals4.protocol == eager) ||
                                 (ompi_mtl_portals4.eager_limit >= length) ||
                                 (MPI_ANY_SOURCE == src) ||
                                 (MPI_ANY_TAG == tag)) ? false : true;
    if (ptl_request->is_triggered) {
        ret = triggered_read_msg((char*) ptl_request->delivery_ptr,
                                 ptl_request->delivery_len,
                                 remote_proc,
                                 read_match_bits,
                                 0,
                                 ptl_request);
        /* BUG FIX: the original assigned but never checked this return
           value, so a failed triggered setup fell through to PtlMEAppend
           and the call returned OMPI_SUCCESS, leaking the temporary
           buffer and masking the error. */
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            if (NULL != ptl_request->buffer_ptr) {
                free(ptl_request->buffer_ptr);
            }
            return ret;
        }
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                         "Recv %lu from %x,%x of length %ld (0x%lx, 0x%lx, 0x%lx)\n",
                         ptl_request->opcount,
                         remote_proc.phys.nid,
                         remote_proc.phys.pid,
                         (int64_t)length,
                         recv_match_bits,
                         recv_ignore_bits,
                         (unsigned long) ptl_request));

    /* Build the use-once match entry for this receive. */
    me.start = start;
    me.length = length;
    if (ptl_request->is_triggered)
        me.ct_handle = ptl_request->ct_h;
    else
        me.ct_handle = PTL_CT_NONE;
    me.min_free = 0;
    me.uid = ompi_mtl_portals4.uid;
    me.options = PTL_ME_OP_PUT | PTL_ME_USE_ONCE | PTL_ME_EVENT_UNLINK_DISABLE;
    if (ptl_request->is_triggered)
        me.options |= PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_CT_OVERFLOW;
    if (length <= ompi_mtl_portals4.eager_limit) {
        /* eager receives do not need LINK events (no spin-wait below) */
        me.options |= PTL_ME_EVENT_LINK_DISABLE;
    }
    me.match_id = remote_proc;
    me.match_bits = recv_match_bits;
    me.ignore_bits = recv_ignore_bits;

    ret = PtlMEAppend(ompi_mtl_portals4.ni_h,
                      ompi_mtl_portals4.recv_idx,
                      &me,
                      PTL_PRIORITY_LIST,
                      ptl_request,
                      &ptl_request->me_h);
    if (OPAL_UNLIKELY(PTL_OK != ret)) {
        if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr);
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: PtlMEAppend failed: %d",
                            __FILE__, __LINE__, ret);
        return ompi_mtl_portals4_get_error(ret);
    }

    /* if a long message, spin until we either have a comm event or a
       link event, guaranteeing progress for long unexpected messages. */
    if (length > ompi_mtl_portals4.eager_limit) {
        while (true != ptl_request->req_started) {
            ompi_mtl_portals4_progress();
        }
    }

    return OMPI_SUCCESS;
}
/*
 * Register a set of procs with every available BTL module and build the
 * per-proc BML endpoint structures used for message scheduling.
 *
 * Pass 1 filters out procs that already have a BML endpoint (each retained
 * proc gets exactly one add_procs call per BTL).  Pass 2 offers the new
 * procs to every BTL, recording reachability and honoring exclusivity
 * rankings.  Pass 3 computes scheduling metrics (bandwidth weights, eager
 * list by lowest latency, RDMA list, max send size).  Pass 4 verifies every
 * new proc is reachable by at least one BTL.
 *
 * @param nprocs     number of entries in procs
 * @param procs      procs to register (may include already-known procs)
 * @param reachable  caller-supplied scratch bitmap, one bit per new proc
 *
 * @return OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE on allocation failure, or
 *         OMPI_ERR_UNREACH if some proc has no usable BTL.
 *
 * NOTE(review): on the OOM error paths below, procs already retained via
 * OBJ_RETAIN are not released — the refcounts leak.  Left as-is here since
 * callers may treat OOM as fatal; confirm before changing.
 */
static int mca_bml_r2_add_procs( size_t nprocs,
                                 struct ompi_proc_t** procs,
                                 struct opal_bitmap_t* reachable )
{
    size_t p, p_index, n_new_procs = 0;
    struct mca_btl_base_endpoint_t ** btl_endpoints = NULL;
    struct ompi_proc_t** new_procs = NULL;
    int rc, ret = OMPI_SUCCESS;

    if(0 == nprocs) {
        return OMPI_SUCCESS;
    }

    /* make sure all opened BTL modules are registered first */
    if(OMPI_SUCCESS != (rc = mca_bml_r2_add_btls()) ) {
        return rc;
    }

    /* Select only the procs that don't yet have the BML proc struct. This prevent
     * us from calling btl->add_procs several times on the same destination proc.
     */
    for(p_index = 0; p_index < nprocs; p_index++) {
        struct ompi_proc_t* proc = procs[p_index];

        if(NULL != proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
            continue;  /* go to the next proc */
        }
        /* Allocate the new_procs on demand */
        if( NULL == new_procs ) {
            new_procs = (struct ompi_proc_t **)malloc(nprocs * sizeof(struct ompi_proc_t *));
            if( NULL == new_procs ) {
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
        }
        OBJ_RETAIN(proc);  /* hold a reference for the lifetime of the endpoint */
        new_procs[n_new_procs++] = proc;
    }

    if ( 0 == n_new_procs ) {
        return OMPI_SUCCESS;
    }

    /* Starting from here we only work on the unregistered procs */
    procs = new_procs;
    nprocs = n_new_procs;

    /* attempt to add all procs to each r2 */
    btl_endpoints = (struct mca_btl_base_endpoint_t **)
        malloc(nprocs * sizeof(struct mca_btl_base_endpoint_t*));
    if (NULL == btl_endpoints) {
        free(new_procs);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) {
        mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index];
        int btl_inuse = 0;

        /* if the r2 can reach the destination proc it sets the
         * corresponding bit (proc index) in the reachable bitmap
         * and can return addressing information for each proc
         * that is passed back to the r2 on data transfer calls
         */
        opal_bitmap_clear_all_bits(reachable);
        memset(btl_endpoints, 0, nprocs *sizeof(struct mca_btl_base_endpoint_t*));

        rc = btl->btl_add_procs(btl, n_new_procs, new_procs, btl_endpoints, reachable);
        if(OMPI_SUCCESS != rc) {
            /* This BTL has troubles adding the nodes. Let's continue maybe some other BTL
             * can take care of this task. */
            continue;
        }

        /* for each proc that is reachable */
        for( p = 0; p < n_new_procs; p++ ) {
            if(opal_bitmap_is_set_bit(reachable, p)) {
                ompi_proc_t *proc = new_procs[p];
                mca_bml_base_endpoint_t * bml_endpoint =
                    (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
                mca_bml_base_btl_t* bml_btl;
                size_t size;

                if(NULL == bml_endpoint) {
                    /* allocate bml specific proc data */
                    bml_endpoint = OBJ_NEW(mca_bml_base_endpoint_t);
                    if (NULL == bml_endpoint) {
                        opal_output(0, "mca_bml_r2_add_procs: unable to allocate resources");
                        free(btl_endpoints);
                        free(new_procs);
                        return OMPI_ERR_OUT_OF_RESOURCE;
                    }

                    /* preallocate space in array for max number of r2s */
                    mca_bml_base_btl_array_reserve(&bml_endpoint->btl_eager, mca_bml_r2.num_btl_modules);
                    mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send,  mca_bml_r2.num_btl_modules);
                    mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma,  mca_bml_r2.num_btl_modules);
                    /* -1 wraps to the type's maximum so the min() computed in
                       the metrics pass below starts from "unlimited" */
                    bml_endpoint->btl_max_send_size = -1;
                    bml_endpoint->btl_proc = proc;
                    proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = bml_endpoint;
                    bml_endpoint->btl_flags_or = 0;
                }

                /* dont allow an additional BTL with a lower exclusivity ranking */
                size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
                if(size > 0) {
                    bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1);
                    /* skip this btl if the exclusivity is less than the previous */
                    if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity) {
                        /* undo the add for this proc on this BTL */
                        btl->btl_del_procs(btl, 1, &proc, &btl_endpoints[p]);
                        opal_output_verbose(20, ompi_btl_base_framework.framework_output,
                                            "mca: bml: Not using %s btl to %s on node %s "
                                            "because %s btl has higher exclusivity (%d > %d)",
                                            btl->btl_component->btl_version.mca_component_name,
                                            OMPI_NAME_PRINT(&proc->proc_name), proc->proc_hostname,
                                            bml_btl->btl->btl_component->btl_version.mca_component_name,
                                            bml_btl->btl->btl_exclusivity,
                                            btl->btl_exclusivity);
                        continue;
                    }
                }
                opal_output_verbose(1, ompi_btl_base_framework.framework_output,
                                    "mca: bml: Using %s btl to %s on node %s",
                                    btl->btl_component->btl_version.mca_component_name,
                                    OMPI_NAME_PRINT(&proc->proc_name),
                                    proc->proc_hostname);

                /* cache the endpoint on the proc */
                bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send);
                bml_btl->btl = btl;
                bml_btl->btl_endpoint = btl_endpoints[p];
                bml_btl->btl_weight = 0;
                bml_btl->btl_flags = btl->btl_flags;

                /* sanitize advertised flags: drop PUT/GET if the matching
                   function pointer is missing (^= clears a bit known set) */
                if( (bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) {
                    opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for"
                                " the %s BTL without any PUT function attached. Discard the flag !",
                                bml_btl->btl->btl_component->btl_version.mca_component_name);
                    bml_btl->btl_flags ^= MCA_BTL_FLAGS_PUT;
                }
                if( (bml_btl->btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) {
                    opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for"
                                " the %s BTL without any GET function attached. Discard the flag !",
                                bml_btl->btl->btl_component->btl_version.mca_component_name);
                    bml_btl->btl_flags ^= MCA_BTL_FLAGS_GET;
                }
                if( (bml_btl->btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) {
                    /**
                     * If no protocol specified, we have 2 choices: we ignore the BTL
                     * as we don't know which protocl to use, or we suppose that all
                     * BTLs support the send protocol.
                     */
                    bml_btl->btl_flags |= MCA_BTL_FLAGS_SEND;
                }
                /**
                 * calculate the bitwise OR of the btl flags
                 */
                bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
                /* This BTL is in use, allow the progress registration */
                btl_inuse++;
            }
        }

        /* register the BTL's progress function once, only if it actually
           took at least one proc */
        if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) {
            size_t p;
            bool found = false;
            for( p = 0; p < mca_bml_r2.num_btl_progress; p++ ) {
                if(mca_bml_r2.btl_progress[p] == btl->btl_component->btl_progress) {
                    found = true;
                    break;
                }
            }
            if(found == false) {
                mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress] =
                    btl->btl_component->btl_progress;
                mca_bml_r2.num_btl_progress++;
                opal_progress_register( btl->btl_component->btl_progress );
            }
        }
    }
    free(btl_endpoints);

    /* iterate back through procs and compute metrics for registered r2s */
    for(p=0; p<n_new_procs; p++) {
        ompi_proc_t *proc = new_procs[p];
        mca_bml_base_endpoint_t* bml_endpoint =
            (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
        double total_bandwidth = 0;
        uint32_t latency = 0xffffffff;
        size_t n_index;
        size_t n_size;

        /* skip over procs w/ no btl's registered */
        if(NULL == bml_endpoint) {
            continue;
        }

        /* (1) determine the total bandwidth available across all btls
         *     note that we need to do this here, as we may already have btls configured
         * (2) determine the highest priority ranking for latency
         * (3) compute the maximum amount of bytes that can be send without any
         *     weighting. Once the left over is smaller than this number we will
         *     start using the weight to compute the correct amount.
         */
        n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);

        /* sort BTLs in descending order according to bandwidth value */
        qsort(bml_endpoint->btl_send.bml_btls, n_size,
              sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);
        bml_endpoint->btl_rdma_index = 0;
        for(n_index = 0; n_index < n_size; n_index++) {
            mca_bml_base_btl_t* bml_btl =
                mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
            mca_btl_base_module_t* btl = bml_btl->btl;
            total_bandwidth += bml_btl->btl->btl_bandwidth;
            if(btl->btl_latency < latency) {
                latency = btl->btl_latency;
            }
        }

        /* (1) set the weight of each btl as a percentage of overall bandwidth
         * (2) copy all btl instances at the highest priority ranking into the
         *     list of btls used for first fragments
         */
        for(n_index = 0; n_index < n_size; n_index++) {
            mca_bml_base_btl_t* bml_btl =
                mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
            mca_btl_base_module_t *btl = bml_btl->btl;

            /* compute weighting factor for this r2 */
            if(btl->btl_bandwidth > 0) {
                bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth);
            } else {
                bml_btl->btl_weight = (float)(1.0 / n_size);
            }

            /* check to see if this r2 is already in the array of r2s
             * used for first fragments - if not add it.
             */
            if(btl->btl_latency == latency) {
                mca_bml_base_btl_t* bml_btl_new =
                    mca_bml_base_btl_array_insert(&bml_endpoint->btl_eager);
                *bml_btl_new = *bml_btl;
            }

            /* set endpoint max send size as min of available btls */
            if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size)
                bml_endpoint->btl_max_send_size = btl->btl_max_send_size;

            /* check flags - is rdma prefered */
            if ((btl->btl_flags & (MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET)) &&
                !((proc->proc_arch != ompi_proc_local_proc->proc_arch) &&
                  (0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) {
                mca_bml_base_btl_t* bml_btl_rdma =
                    mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma);
                mca_btl_base_module_t* btl_rdma = bml_btl->btl;

                *bml_btl_rdma = *bml_btl;
                if(bml_endpoint->btl_pipeline_send_length <
                   btl_rdma->btl_rdma_pipeline_send_length) {
                    bml_endpoint->btl_pipeline_send_length =
                        btl_rdma->btl_rdma_pipeline_send_length;
                }
                if(bml_endpoint->btl_send_limit < btl_rdma->btl_min_rdma_pipeline_size) {
                    bml_endpoint->btl_send_limit = btl_rdma->btl_min_rdma_pipeline_size;
                }
            }
        }
    }

    /* see if we have a connection to everyone else */
    for(p = 0; p < n_new_procs; p++) {
        ompi_proc_t *proc = new_procs[p];

        if (NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
            ret = OMPI_ERR_UNREACH;
            if (mca_bml_r2.show_unreach_errors) {
                opal_show_help("help-mca-bml-r2.txt", "unreachable proc",
                               true,
                               OMPI_NAME_PRINT(&(ompi_proc_local_proc->proc_name)),
                               (NULL != ompi_proc_local_proc->proc_hostname ?
                                ompi_proc_local_proc->proc_hostname : "unknown!"),
                               OMPI_NAME_PRINT(&(proc->proc_name)),
                               (NULL != ompi_proc_local_proc->proc_hostname ?
                                ompi_proc_local_proc->proc_hostname : "unknown!"),
                               btl_names);
            }
            break;
        }
    }

    free(new_procs);
    return ret;
}