static void mca_coll_hierarch_dump_struct ( mca_coll_hierarch_module_t *c)
{
    int i, j;
    int rank;
    struct mca_coll_hierarch_llead_t *current=NULL;

    rank = ompi_comm_rank ( c->hier_comm );

    printf("%d: Dump of hier-struct for comm %s cid %u\n",
           rank, c->hier_comm->c_name, c->hier_comm->c_contextid);

    printf("%d: No of llead communicators: %d No of lleaders: %d\n",
           rank, opal_pointer_array_get_size ( &(c->hier_llead)),
           c->hier_num_lleaders );

    for ( i=0; i < opal_pointer_array_get_size(&(c->hier_llead)); i++ ) {
        current = (mca_coll_hierarch_llead_t*)opal_pointer_array_get_item (&(c->hier_llead), i);
        if ( current == NULL ) {
            continue;
        }
        /* label the output with the actual field names */
        printf("%d: my_lleader %d am_lleader %d\n",
               rank, current->my_lleader, current->am_lleader );
        for (j=0; j<c->hier_num_lleaders; j++ ) {
            printf("%d: lleader[%d] = %d\n", rank, j, current->lleaders[j]);
        }
    }

    return;
}
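/*
 * NOTE (editorial, not part of the original sources): nearly every routine in
 * this collection iterates an opal_pointer_array_t with the same idiom:
 * opal_pointer_array_get_size() bounds the loop and each slot is checked for
 * NULL, because get_size() reports the extent of the array rather than the
 * number of live entries, and slots may be empty or cleared.  The sketch below
 * is a self-contained toy re-implementation of that idiom; the toy_* names are
 * invented for illustration and are not the opal API.
 */
#include <stdio.h>

typedef struct {
    void **addr;   /* slot storage */
    int    size;   /* extent of the array, including empty (NULL) slots */
} toy_pointer_array_t;

static int   toy_get_size(toy_pointer_array_t *a)        { return a->size; }
static void *toy_get_item(toy_pointer_array_t *a, int i) { return (i < 0 || i >= a->size) ? NULL : a->addr[i]; }

int main(void)
{
    char *words[] = { "alpha", NULL, "gamma", NULL };  /* slots 1 and 3 are holes */
    toy_pointer_array_t a = { (void **)words, 4 };
    int i;

    /* Same pattern as the functions in this collection: bound the loop by
       get_size(), skip NULL slots. */
    for (i = 0; i < toy_get_size(&a); i++) {
        char *item = (char *)toy_get_item(&a, i);
        if (NULL == item) {
            continue;
        }
        printf("slot %d: %s\n", i, item);
    }
    return 0;
}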
static int notify_collectives(int msg)
{
    mca_coll_base_module_t *modules[NUM_COLLECTIVES];
    int i, max, highest_module = 0;

    memset(&modules, 0, sizeof(mca_coll_base_module_t*) * NUM_COLLECTIVES);

    max = opal_pointer_array_get_size(&ompi_mpi_communicators);
    for (i = 0 ; i < max ; ++i) {
        ompi_communicator_t *comm =
            (ompi_communicator_t *)opal_pointer_array_get_item(&ompi_mpi_communicators, i);
        if (NULL == comm) continue;

        SIGNAL(comm, modules, highest_module, msg, allgather);
        SIGNAL(comm, modules, highest_module, msg, allgatherv);
        SIGNAL(comm, modules, highest_module, msg, allreduce);
        SIGNAL(comm, modules, highest_module, msg, alltoall);
        SIGNAL(comm, modules, highest_module, msg, alltoallv);
        SIGNAL(comm, modules, highest_module, msg, alltoallw);
        SIGNAL(comm, modules, highest_module, msg, barrier);
        SIGNAL(comm, modules, highest_module, msg, bcast);
        SIGNAL(comm, modules, highest_module, msg, exscan);
        SIGNAL(comm, modules, highest_module, msg, gather);
        SIGNAL(comm, modules, highest_module, msg, gatherv);
        SIGNAL(comm, modules, highest_module, msg, reduce);
        SIGNAL(comm, modules, highest_module, msg, reduce_scatter);
        SIGNAL(comm, modules, highest_module, msg, scan);
        SIGNAL(comm, modules, highest_module, msg, scatter);
        SIGNAL(comm, modules, highest_module, msg, scatterv);
    }

    return OMPI_SUCCESS;
}
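/*
 * NOTE (editorial): the SIGNAL() macro used above is not defined in this
 * excerpt.  Judging from its call sites, it takes the communicator, a table of
 * already-seen module pointers, a running count, the message, and a collective
 * name, and forwards the message to each distinct collective module at most
 * once.  The toy program below only illustrates that de-duplicate-then-notify
 * pattern; the types and the TOY_SIGNAL macro are invented for illustration
 * and are not the actual Open MPI macro.
 */
#include <stdio.h>

typedef struct toy_module { const char *name; } toy_module_t;

/* Notify a module once, even if several collectives share it. */
#define TOY_SIGNAL(mod, table, count, msg)                              \
    do {                                                                \
        int _i;                                                         \
        int _seen = 0;                                                  \
        for (_i = 0; _i < (count); ++_i) {                              \
            if ((table)[_i] == (mod)) { _seen = 1; break; }             \
        }                                                               \
        if (!_seen && NULL != (mod)) {                                  \
            (table)[(count)++] = (mod);                                 \
            printf("signal %d -> module %s\n", (msg), (mod)->name);     \
        }                                                               \
    } while (0)

int main(void)
{
    toy_module_t basic = { "basic" }, tuned = { "tuned" };
    toy_module_t *table[4];
    int count = 0;

    /* Two collectives share the "tuned" module; it is only signalled once. */
    TOY_SIGNAL(&tuned, table, count, 42);   /* e.g. allreduce */
    TOY_SIGNAL(&tuned, table, count, 42);   /* e.g. bcast     */
    TOY_SIGNAL(&basic, table, count, 42);   /* e.g. barrier   */
    return 0;
}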
static int check_if_restarted(opal_pointer_array_t *procs)
{
    orte_std_cntr_t i_proc;
    orte_proc_t *proc = NULL;
    bool is_done;

    if( NULL == procs ){
        return true;
    }

    is_done = true;

    for(i_proc = 0; i_proc < opal_pointer_array_get_size(procs); ++i_proc) {
        proc = (orte_proc_t*)opal_pointer_array_get_item(procs, i_proc);
        if( NULL == proc ) {
            continue;
        }
        if( !(ORTE_PROC_STATE_RUNNING & proc->state) ) {
            is_done = false;
            break;
        }
    }

    if( !is_done ) {
        OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                             "\t Still waiting for restart: \"%s\" [0x%x] != [0x%x]\n",
                             ORTE_NAME_PRINT(&proc->name),
                             proc->state,
                             ORTE_PROC_STATE_RUNNING));
    }

    return is_done;
}
/* This routine runs through the list of communicators
   and does the disconnect for all dynamic communicators */
int ompi_dpm_base_dyn_finalize (void)
{
    int i,j=0, max=0;
    ompi_dpm_base_disconnect_obj **objs=NULL;
    ompi_communicator_t *comm=NULL;

    if ( 1 <ompi_comm_num_dyncomm ) {
        objs = (ompi_dpm_base_disconnect_obj **)malloc (ompi_comm_num_dyncomm*
                                                        sizeof(ompi_dpm_base_disconnect_obj*));
        if ( NULL == objs ) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        max = opal_pointer_array_get_size(&ompi_mpi_communicators);
        for ( i=3; i<max; i++ ) {
            comm = (ompi_communicator_t*)opal_pointer_array_get_item(&ompi_mpi_communicators,i);
            if (NULL != comm && OMPI_COMM_IS_DYNAMIC(comm)) {
                objs[j++]=ompi_dpm_base_disconnect_init(comm);
            }
        }

        /* objs holds exactly ompi_comm_num_dyncomm slots, so the number of
           dynamic communicators found must match that count. */
        if ( j != ompi_comm_num_dyncomm ) {
            free (objs);
            return OMPI_ERROR;
        }

        ompi_dpm_base_disconnect_waitall (ompi_comm_num_dyncomm, objs);
        free (objs);
    }

    return OMPI_SUCCESS;
}
static int mca_spml_base_close(void)
{
    int i, j;

    /**
     * Destruct the send and receive queues. The ompi_free_list_t destructor
     * will return the memory to the mpool, so this has to be done before the
     * mpool gets released by the SPML close function.
     */
    OBJ_DESTRUCT(&mca_spml_base_put_requests);
    OBJ_DESTRUCT(&mca_spml_base_get_requests);

    /* Free all the strings in the array */
    j = opal_pointer_array_get_size(&mca_spml_base_spml);
    for (i = 0; i < j; i++) {
        char * tmp_val;
        tmp_val = (char *) opal_pointer_array_get_item(&mca_spml_base_spml, i);
        if (NULL == tmp_val) {
            continue;
        }
        free(tmp_val);
    }
    OBJ_DESTRUCT(&mca_spml_base_spml);

    /* Close all remaining available components */
    return mca_base_framework_components_close(&oshmem_spml_base_framework, NULL);
}
/**
 * This function allows an error to map out the entire BTL. First a
 * call is made up to the PML to map out all connections from this BTL.
 * Then a message is sent to all the endpoints connected to this BTL.
 * This function is enabled by the btl_openib_port_error_failover
 * MCA parameter. If that parameter is not set, then this function
 * does not do anything.
 * @param openib_btl Pointer to BTL that had the error
 */
void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl)
{
    mca_btl_base_endpoint_t* endpoint;
    int i;

    /* Check to see that the flag is set for the entire map out. */
    if(mca_btl_openib_component.port_error_failover) {
        /* Since we are not specifying a specific connection to bring down,
         * the PML layer will map out the entire BTL for future communication. */
        char *btlname = NULL;
        asprintf(&btlname, "lid=%d:name=%s",
                 openib_btl->lid, openib_btl->device->ib_dev->name);
        openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                             NULL, btlname);
        if (NULL != btlname) free(btlname);

        /* Now send out messages to all endpoints that we are disconnecting.
         * Only do this to endpoints that are connected. Otherwise, the
         * remote side does not yet have the information on this endpoint. */
        for (i = 0; i < opal_pointer_array_get_size(openib_btl->device->endpoints); i++) {
            endpoint = (mca_btl_openib_endpoint_t*)
                opal_pointer_array_get_item(openib_btl->device->endpoints, i);
            if (NULL == endpoint) {
                continue;
            }
            if (MCA_BTL_IB_CONNECTED == endpoint->endpoint_state) {
                mca_btl_openib_endpoint_notify(endpoint,
                                               MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
                endpoint->endpoint_state = MCA_BTL_IB_FAILED;
                error_out_all_pending_frags(endpoint, &openib_btl->super, true);
            }
        }
    }
}
/**
 * This function is a debugging tool. If you notice a hang, you can
 * call this function from a debugger and see if there are any
 * messages stuck in any of the queues. If you call it with
 * errout=true, then it will error them out. Otherwise, it will
 * just print out the size of the queues with data in them.
 */
void mca_btl_openib_dump_all_internal_queues(bool errout)
{
    int i, j, num_eps;
    mca_btl_openib_module_t* btl;
    int total;
    mca_btl_base_endpoint_t* ep;
    struct mca_btl_base_module_t* module;

    for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
        btl = mca_btl_openib_component.openib_btls[i];
        module = &btl->super;
        num_eps = opal_pointer_array_get_size(btl->device->endpoints);

        /* Now, find the endpoint associated with it */
        for (j = 0; j < num_eps; j++) {
            ep = (mca_btl_openib_endpoint_t*)
                opal_pointer_array_get_item(btl->device->endpoints, j);
            if (NULL == ep) {
                continue;
            }
            total = 0;
            error_out_all_pending_frags(ep, module, errout);
        }
    }
}
static int mca_pml_base_close(void)
{
    int i, j;

    /* turn off the progress code for the pml */
    if( NULL != mca_pml.pml_progress ) {
        opal_progress_unregister(mca_pml.pml_progress);
    }

    /* Blatantly ignore the return code (what would we do to recover,
       anyway?  This module is going away, so errors don't matter anymore) */

    /**
     * Destruct the send and receive queues. The ompi_free_list_t destructor
     * will return the memory to the mpool, so this has to be done before the
     * mpool gets released by the PML close function.
     */
    OBJ_DESTRUCT(&mca_pml_base_send_requests);
    OBJ_DESTRUCT(&mca_pml_base_recv_requests);

    mca_pml.pml_progress = mca_pml_base_progress;

    /* Free all the strings in the array */
    j = opal_pointer_array_get_size(&mca_pml_base_pml);
    for (i = 0; i < j; ++i) {
        char *str;
        str = (char*) opal_pointer_array_get_item(&mca_pml_base_pml, i);
        free(str);
    }
    OBJ_DESTRUCT(&mca_pml_base_pml);

    /* Close all remaining available components */
    return mca_base_framework_components_close(&ompi_pml_base_framework, NULL);
}
int opal_dss_register(opal_dss_pack_fn_t pack_fn,
                      opal_dss_unpack_fn_t unpack_fn,
                      opal_dss_copy_fn_t copy_fn,
                      opal_dss_compare_fn_t compare_fn,
                      opal_dss_print_fn_t print_fn,
                      bool structured,
                      const char *name, opal_data_type_t *type)
{
    opal_dss_type_info_t *info, *ptr;
    int32_t i;

    /* Check for bozo cases */
    if (NULL == pack_fn || NULL == unpack_fn || NULL == copy_fn ||
        NULL == compare_fn || NULL == print_fn || NULL == name || NULL == type) {
        return OPAL_ERR_BAD_PARAM;
    }

    /* check if this entry already exists - if so, error - we do NOT allow
       multiple type registrations */
    for (i=0; i < opal_pointer_array_get_size(&opal_dss_types); i++) {
        ptr = opal_pointer_array_get_item(&opal_dss_types, i);
        if (NULL != ptr) {
            /* check if the name exists */
            if (0 == strcmp(ptr->odti_name, name)) {
                return OPAL_ERR_DATA_TYPE_REDEF;
            }
            /* check if the specified type exists */
            if (*type > 0 && ptr->odti_type == *type) {
                return OPAL_ERR_DATA_TYPE_REDEF;
            }
        }
    }

    /* if type is given (i.e., *type > 0), then just use it.
     * otherwise, it is an error */
    if (0 >= *type) {
        return OPAL_ERR_BAD_PARAM;
    }

    /* Add a new entry to the table */
    info = (opal_dss_type_info_t*) OBJ_NEW(opal_dss_type_info_t);
    if (NULL == info) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    info->odti_type = *type;
    info->odti_name = strdup(name);
    info->odti_pack_fn = pack_fn;
    info->odti_unpack_fn = unpack_fn;
    info->odti_copy_fn = copy_fn;
    info->odti_compare_fn = compare_fn;
    info->odti_print_fn = print_fn;
    info->odti_structured = structured;

    return opal_pointer_array_set_item(&opal_dss_types, *type, info);
}
/*
 * Shut down the MPI_File bookkeeping
 */
int ompi_file_finalize(void)
{
    int i, max;
    size_t num_unnamed;
    ompi_file_t *file;

    /* Release MPI_FILE_NULL.  Do this so that we don't get a bogus leak
       report on it.  Plus, it's statically allocated, so we don't want
       to call OBJ_RELEASE on it. */
    OBJ_DESTRUCT(&ompi_mpi_file_null.file);
    opal_pointer_array_set_item(&ompi_file_f_to_c_table, 0, NULL);

    /* Iterate through all the file handles and destroy them.  Note
       that this also takes care of destroying MPI_FILE_NULL. */
    max = opal_pointer_array_get_size(&ompi_file_f_to_c_table);
    for (num_unnamed = i = 0; i < max; ++i) {
        file = (ompi_file_t *)opal_pointer_array_get_item(&ompi_file_f_to_c_table, i);

        /* If the file was closed but still exists because the user told
           us to never free handles, then do an OBJ_RELEASE on it and all
           is well.  Then get the value again and see if it's actually
           been freed. */
        if (NULL != file && ompi_debug_no_free_handles &&
            0 == (file->f_flags & OMPI_FILE_ISCLOSED)) {
            OBJ_RELEASE(file);
            file = (ompi_file_t *)opal_pointer_array_get_item(&ompi_file_f_to_c_table, i);
        }

        if (NULL != file) {
            /* If the user wanted warnings about MPI object leaks, print out
               a message */
            if (ompi_debug_show_handle_leaks) {
                ++num_unnamed;
            }
            OBJ_RELEASE(file);
        }
        /* Don't bother setting each element back down to NULL; it would
           just take a lot of thread locks / unlocks and since we're
           destroying everything, it isn't worth it */
    }
    if (num_unnamed > 0) {
        opal_output(0, "WARNING: %lu unnamed MPI_File handles still allocated at MPI_FINALIZE",
                    (unsigned long)num_unnamed);
    }
    OBJ_DESTRUCT(&ompi_file_f_to_c_table);

    /* All done */
    return OMPI_SUCCESS;
}
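/*
 * NOTE (editorial): a minimal program that would leave an unnamed MPI_File
 * handle allocated at MPI_Finalize, which is the situation the warning in
 * ompi_file_finalize() reports.  Whether the warning is actually printed
 * depends on the leak-reporting setting checked above
 * (ompi_debug_show_handle_leaks); the file name used here is arbitrary.
 */
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_File fh;

    MPI_Init(&argc, &argv);

    /* Open a file ... */
    MPI_File_open(MPI_COMM_WORLD, "leak-demo.tmp",
                  MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);

    /* ... and "forget" to call MPI_File_close(&fh).  The handle is still
       allocated when MPI_Finalize runs the bookkeeping shutdown above. */
    MPI_Finalize();
    return 0;
}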
/* XRC receive QP to endpoint */
static mca_btl_openib_endpoint_t * xrc_qp2endpoint(uint32_t qp_num, mca_btl_openib_device_t *device)
{
    mca_btl_openib_endpoint_t *ep;
    int ep_i;

    for(ep_i = 0; ep_i < opal_pointer_array_get_size(device->endpoints); ep_i++) {
        ep = opal_pointer_array_get_item(device->endpoints, ep_i);
        if (NULL == ep) {
            /* the endpoint array may contain empty slots */
            continue;
        }
        if (qp_num == ep->xrc_recv_qp_num)
            return ep;
    }

    return NULL;
}
/* QP to endpoint */
static mca_btl_openib_endpoint_t * qp2endpoint(struct ibv_qp *qp, mca_btl_openib_device_t *device)
{
    mca_btl_openib_endpoint_t *ep;
    int ep_i, qp_i;

    for(ep_i = 0; ep_i < opal_pointer_array_get_size(device->endpoints); ep_i++) {
        ep = opal_pointer_array_get_item(device->endpoints, ep_i);
        if (NULL == ep) {
            /* the endpoint array may contain empty slots */
            continue;
        }
        for(qp_i = 0; qp_i < mca_btl_openib_component.num_qps; qp_i++) {
            if (qp == ep->qps[qp_i].qp->lcl_qp)
                return ep;
        }
    }

    return NULL;
}
void oshmem_proc_group_finalize_scoll(void)
{
    int max, i;
    oshmem_group_t *group;

    /* Check whether we have some left */
    max = opal_pointer_array_get_size(&oshmem_group_array);
    for (i = 0; i < max; i++) {
        group = (oshmem_group_t *) opal_pointer_array_get_item(&oshmem_group_array, i);
        if (NULL != group) {
            mca_scoll_base_group_unselect(group);
        }
    }
}
int oshmem_op_finalize(void)
{
    int max, i;
    oshmem_op_t *op;

    /* Check whether we have some left */
    max = opal_pointer_array_get_size(&oshmem_op_array);
    for (i = 0; i < max; i++) {
        op = (oshmem_op_t *) opal_pointer_array_get_item(&oshmem_op_array, i);
        if (NULL != op) {
            OBJ_RELEASE(op);
        }
    }

    OBJ_DESTRUCT(&oshmem_op_array);

    return OSHMEM_SUCCESS;
}
MPI_Win MPI_Win_f2c(MPI_Fint win)
{
    int o_index = OMPI_FINT_2_INT(win);

    if (MPI_PARAM_CHECK) {
        OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
    }

    /* Per MPI-2:4.12.4, do not invoke an error handler if we get an
       invalid fortran handle.  If we get an invalid fortran handle,
       return an invalid C handle. */
    if ( 0 > o_index || o_index >= opal_pointer_array_get_size(&ompi_mpi_windows)) {
        return NULL;
    }

    return (MPI_Win)opal_pointer_array_get_item(&ompi_mpi_windows, o_index);
}
MPI_Comm MPI_Comm_f2c(MPI_Fint comm)
{
    int o_index = OMPI_FINT_2_INT(comm);

    OPAL_CR_NOOP_PROGRESS();

    if ( MPI_PARAM_CHECK ) {
        OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
    }

    /* Per MPI-2:4.12.4, do not invoke an error handler if we get an
       invalid fortran handle.  If we get an invalid fortran handle,
       return an invalid C handle. */
    if ( 0 > o_index || o_index >= opal_pointer_array_get_size(&ompi_comm_f_to_c_table)) {
        return NULL;
    }

    return (MPI_Comm)opal_pointer_array_get_item(&ompi_comm_f_to_c_table, o_index);
}
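/*
 * NOTE (editorial): the *_f2c conversions in this collection (MPI_Comm_f2c,
 * MPI_Win_f2c, MPI_Message_f2c, MPI_Op_f2c, MPI_Info_f2c) all follow the same
 * pattern: treat the Fortran integer as an index into the corresponding
 * f-to-c pointer array and hand back an invalid C handle for an out-of-range
 * index instead of raising an error.  A small standard-MPI round trip through
 * MPI_Comm_c2f/MPI_Comm_f2c exercising the valid-handle case:
 */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Fint f_comm;
    MPI_Comm c_comm;

    MPI_Init(&argc, &argv);

    /* Convert a valid C handle to its Fortran integer and back again; the
       MPI standard guarantees the round trip yields the same communicator. */
    f_comm = MPI_Comm_c2f(MPI_COMM_WORLD);
    c_comm = MPI_Comm_f2c(f_comm);
    printf("round trip preserved MPI_COMM_WORLD: %s\n",
           (c_comm == MPI_COMM_WORLD) ? "yes" : "no");

    MPI_Finalize();
    return 0;
}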
int opal_dss_close(void)
{
    int32_t i;

    if (!opal_dss_initialized) {
        return OPAL_SUCCESS;
    }
    opal_dss_initialized = false;

    for (i = 0 ; i < opal_pointer_array_get_size(&opal_dss_types) ; ++i) {
        opal_dss_type_info_t *info =
            (opal_dss_type_info_t*)opal_pointer_array_get_item(&opal_dss_types, i);
        if (NULL != info) {
            opal_pointer_array_set_item(&opal_dss_types, i, NULL);
            OBJ_RELEASE(info);
        }
    }
    OBJ_DESTRUCT(&opal_dss_types);

    return OPAL_SUCCESS;
}
MPI_Message MPI_Message_f2c(MPI_Fint message)
{
    int message_index = OMPI_FINT_2_INT(message);

    if (MPI_PARAM_CHECK) {
        OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
    }

    /* Per MPI-2:4.12.4, do not invoke an error handler if we get an
       invalid fortran handle.  If we get an invalid fortran handle,
       return an invalid C handle. */
    if (message_index < 0 ||
        message_index >= opal_pointer_array_get_size(&ompi_message_f_to_c_table)) {
        return NULL;
    }

    return (MPI_Message)opal_pointer_array_get_item(&ompi_message_f_to_c_table, message_index);
}
OSHMEM_DECLSPEC int oshmem_proc_group_finalize(void)
{
    int max, i;
    oshmem_group_t *group;

    /* Check whether we have some left */
    max = opal_pointer_array_get_size(&oshmem_group_array);
    for (i = 0; i < max; i++) {
        group = (oshmem_group_t *) opal_pointer_array_get_item(&oshmem_group_array, i);
        if (NULL != group) {
            /* Group has not been freed before finalize */
            oshmem_proc_group_destroy(group);
        }
    }

    OBJ_DESTRUCT(&oshmem_group_array);

    return OSHMEM_SUCCESS;
}
/* Create a list of IB HCAs that have an active port */
static int iboffload_load_devices(void)
{
    int num_devs = 0, i;
    mca_bcol_iboffload_device_t *device = NULL;
    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    IBOFFLOAD_VERBOSE(10, ("Entering iboffload_load_devices"));

    /* Get list of devices */
    /*cm->ib_devs = ibv_get_device_list_compat(&num_devs);*/
    cm->ib_devs = ompi_ibv_get_device_list(&num_devs);
    if (0 == num_devs || NULL == cm->ib_devs) {
        IBOFFLOAD_ERROR(("No IB devices found"));
        /* No HCA error */
        orte_show_help("help-mpi-btl-openib.txt", "no-nics", true);
        return OMPI_ERROR;
    }

    cm->num_devs = num_devs;

    for (i = 0; i < num_devs; i++) {
        device = OBJ_NEW(mca_bcol_iboffload_device_t);
        if (NULL != device) {
            opal_pointer_array_set_item(&cm->devices, i, (void *) device);
            device->dev.ib_dev = cm->ib_devs[i];

            IBOFFLOAD_VERBOSE(10, ("Device %s with index %d was appended.\n",
                                   ibv_get_device_name(device->dev.ib_dev), i));
        }
    }

    if (0 == opal_pointer_array_get_size(&cm->devices)) {
        /* No relevant devices were found, return an error */
        IBOFFLOAD_ERROR(("No active devices found.\n"));
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
int ompi_win_finalize(void)
{
    size_t size = opal_pointer_array_get_size (&ompi_mpi_windows);

    /* start at 1 to skip win null */
    for (size_t i = 1 ; i < size ; ++i) {
        ompi_win_t *win = (ompi_win_t *) opal_pointer_array_get_item (&ompi_mpi_windows, i);
        if (NULL != win) {
            if (ompi_debug_show_handle_leaks && !ompi_win_invalid(win)) {
                opal_output(0,"WARNING: MPI_Win still allocated in MPI_Finalize\n");
                ompi_win_dump (win);
            }
            ompi_win_free (win);
        }
    }

    OBJ_DESTRUCT(&ompi_mpi_win_null.win);
    OBJ_DESTRUCT(&ompi_mpi_windows);
    OBJ_RELEASE(ompi_win_accumulate_ops);

    return OMPI_SUCCESS;
}
MPI_Op MPI_Op_f2c(MPI_Fint op_f)
{
    int op_index = OMPI_FINT_2_INT(op_f);

    /* Error checking */
    if (MPI_PARAM_CHECK) {
        OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
    }

    /* Per MPI-2:4.12.4, do not invoke an error handler if we get an
       invalid fortran handle.  If we get an invalid fortran handle,
       return an invalid C handle. */
    if (op_index < 0 ||
        op_index >= opal_pointer_array_get_size(ompi_op_f_to_c_table)) {
        return NULL;
    }

    return (MPI_Op)opal_pointer_array_get_item(ompi_op_f_to_c_table, op_index);
}
static inline int
mca_btl_ugni_handle_remote_smsg_overrun (mca_btl_ugni_module_t *btl)
{
    size_t endpoint_count;
    unsigned int ep_index;
    int count, rc;

    BTL_VERBOSE(("btl/ugni_component detected SMSG CQ overrun. "
                 "processing message backlog..."));

    /* we don't know which endpoint lost an smsg completion. clear the
       smsg remote cq and check all mailboxes */

    /* clear out remote cq */
    mca_btl_ugni_cq_clear (btl->devices, btl->smsg_remote_cq);

    endpoint_count = opal_pointer_array_get_size (&btl->endpoints);

    for (ep_index = 0, count = 0 ; ep_index < endpoint_count ; ++ep_index) {
        mca_btl_base_endpoint_t *ep;

        ep = (mca_btl_base_endpoint_t *) opal_pointer_array_get_item (&btl->endpoints, ep_index);

        if (NULL == ep || MCA_BTL_UGNI_EP_STATE_CONNECTED != ep->state) {
            continue;
        }

        /* clear out smsg mailbox */
        rc = mca_btl_ugni_smsg_process (ep);
        if (OPAL_LIKELY(rc >= 0)) {
            count += rc;
        }
    }

    return count;
}
RTE_PUBLIC int rte_stci_get_ec_locality (rte_ec_handle_t ec_handle)
{
    /*
     * Get information about ec locality.
     * So far Open MPI uses the following set of locality flags:
     *
     * #define OPAL_PROC_ON_CLUSTER 0x10
     * #define OPAL_PROC_ON_CU      0x08
     * #define OPAL_PROC_ON_NODE    0x04
     * #define OPAL_PROC_ON_BOARD   0x02
     * #define OPAL_PROC_ON_SOCKET  0x01
     * #define OPAL_PROC_NON_LOCAL  0x00
     * #define OPAL_PROC_ALL_LOCAL  0x1f
     */
    stci_agent_set_t *set;
    size_t n_agents;
    int d, i;
    STCI_Process_t *aname;

    /* FIXME: we just cover the RTE_PROC_ON_NODE case right now */

    /* get the local process group */
    stci_agent_group_get_set (stci_group_local, &set, &n_agents);

    for (i = 0; i < opal_pointer_array_get_size (set); i++) {
        aname = opal_pointer_array_get_item (set, i);
        if (aname == NULL)
            continue;
        if (!stci_process_name_compare (&aname->name,
                                        (STCI_Process_name_t *)ec_handle, &d ))
            return RTE_PROC_ON_NODE;
    }

    return RTE_PROC_NON_LOCAL;
}
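/*
 * NOTE (editorial): the locality values listed in the comment above are bit
 * flags, so a peer's locality is tested with a bitwise AND rather than
 * equality.  A small standalone illustration using the flag values quoted in
 * that comment:
 */
#include <stdio.h>

#define OPAL_PROC_ON_CLUSTER 0x10
#define OPAL_PROC_ON_CU      0x08
#define OPAL_PROC_ON_NODE    0x04
#define OPAL_PROC_ON_BOARD   0x02
#define OPAL_PROC_ON_SOCKET  0x01
#define OPAL_PROC_NON_LOCAL  0x00
#define OPAL_PROC_ALL_LOCAL  0x1f

int main(void)
{
    /* A peer sharing our node and board, but not our socket. */
    int locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU |
                   OPAL_PROC_ON_NODE | OPAL_PROC_ON_BOARD;

    printf("on same node:   %s\n", (locality & OPAL_PROC_ON_NODE)   ? "yes" : "no");
    printf("on same socket: %s\n", (locality & OPAL_PROC_ON_SOCKET) ? "yes" : "no");
    printf("fully local:    %s\n",
           (OPAL_PROC_ALL_LOCAL == (locality & OPAL_PROC_ALL_LOCAL)) ? "yes" : "no");
    return 0;
}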
/**
 * Converts the MPI_Fint info into a valid C MPI_Info handle
 *
 * @param info Integer handle to an MPI_INFO object
 * @retval C handle corresponding to MPI_INFO object
 */
MPI_Info MPI_Info_f2c(MPI_Fint info)
{
    int info_index = OMPI_FINT_2_INT(info);

    OPAL_CR_NOOP_PROGRESS();

    /* check the arguments */
    if (MPI_PARAM_CHECK) {
        OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
    }

    /* Per MPI-2:4.12.4, do not invoke an error handler if we get an
       invalid fortran handle.  If we get an invalid fortran handle,
       return an invalid C handle. */
    if (info_index < 0 ||
        info_index >= opal_pointer_array_get_size(&ompi_info_f_to_c_table)) {
        return NULL;
    }

    return (MPI_Info)opal_pointer_array_get_item(&ompi_info_f_to_c_table, info_index);
}
int orte_errmgr_base_update_app_context_for_cr_recovery(orte_job_t *jobdata,
                                                        orte_proc_t *proc,
                                                        opal_list_t *local_snapshots)
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_list_item_t *item = NULL;
    orte_std_cntr_t i_app;
    int argc = 0;
    orte_app_context_t *cur_app_context = NULL;
    orte_app_context_t *new_app_context = NULL;
    orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;
    char *reference_fmt_str = NULL;
    char *location_str = NULL;
    char *cache_location_str = NULL;
    char *ref_location_fmt_str = NULL;
    char *tmp_str = NULL;
    char *global_snapshot_ref = NULL;
    char *global_snapshot_seq = NULL;

    /*
     * Get the snapshot restart command for this process
     * JJH CLEANUP: Pass in the vpid_snapshot, so we don't have to look it up every time?
     */
    for(item  = opal_list_get_first(local_snapshots);
        item != opal_list_get_end(local_snapshots);
        item  = opal_list_get_next(item) ) {
        vpid_snapshot = (orte_sstore_base_local_snapshot_info_t*)item;

        if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                                                       &vpid_snapshot->process_name,
                                                       &proc->name) ) {
            break;
        } else {
            vpid_snapshot = NULL;
        }
    }

    if( NULL == vpid_snapshot ) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_REF_FMT,
                         &reference_fmt_str);
    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_LOC,
                         &location_str);
    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_REF_LOC_FMT,
                         &ref_location_fmt_str);
    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_REF,
                         &global_snapshot_ref);
    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_SEQ,
                         &global_snapshot_seq);

    /*
     * Find current app_context
     */
    cur_app_context = NULL;
    for(i_app = 0; i_app < opal_pointer_array_get_size(jobdata->apps); ++i_app) {
        cur_app_context = (orte_app_context_t *)opal_pointer_array_get_item(jobdata->apps, i_app);
        if( NULL == cur_app_context ) {
            continue;
        }
        if(proc->app_idx == cur_app_context->idx) {
            break;
        }
    }

    if( NULL == cur_app_context ) {
        /* 'ret' is not set on this path; report a generic error instead of
           an uninitialized value */
        ORTE_ERROR_LOG(ORTE_ERROR);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * if > 1 processes in this app context
     *   Create a new app_context
     *   Copy over attributes
     *   Add it to the job_t data structure
     *   Associate it with this process in the job
     * else
     *   Reuse this app_context
     */
    if( cur_app_context->num_procs > 1 ) {
        /* Create a new app_context */
        new_app_context = OBJ_NEW(orte_app_context_t);

        /* Copy over attributes */
        new_app_context->idx                    = cur_app_context->idx;
        new_app_context->app                    = NULL; /* strdup(cur_app_context->app); */
        new_app_context->num_procs              = 1;
        new_app_context->argv                   = NULL; /* opal_argv_copy(cur_app_context->argv); */
        new_app_context->env                    = opal_argv_copy(cur_app_context->env);
        new_app_context->cwd                    = (NULL == cur_app_context->cwd ? NULL :
                                                   strdup(cur_app_context->cwd));
        new_app_context->user_specified_cwd     = cur_app_context->user_specified_cwd;
        new_app_context->hostfile               = (NULL == cur_app_context->hostfile ? NULL :
                                                   strdup(cur_app_context->hostfile));
        new_app_context->add_hostfile           = (NULL == cur_app_context->add_hostfile ? NULL :
                                                   strdup(cur_app_context->add_hostfile));
        new_app_context->dash_host              = opal_argv_copy(cur_app_context->dash_host);
        new_app_context->prefix_dir             = (NULL == cur_app_context->prefix_dir ? NULL :
                                                   strdup(cur_app_context->prefix_dir));
        new_app_context->preload_binary         = false;
        new_app_context->preload_libs           = false;
        new_app_context->preload_files_dest_dir = NULL;
        new_app_context->preload_files_src_dir  = NULL;

        asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
        asprintf(&(new_app_context->sstore_load),
                 "%s:%s:%s:%s:%s:%s",
                 location_str,
                 global_snapshot_ref,
                 tmp_str,
                 (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
                 (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix),
                 global_snapshot_seq);

        new_app_context->used_on_node           = cur_app_context->used_on_node;

        /* Add it to the job_t data structure */
        /*current_global_jobdata->num_apps++; */
        new_app_context->idx = (jobdata->num_apps);
        proc->app_idx = new_app_context->idx;
        opal_pointer_array_add(jobdata->apps, new_app_context);
        ++(jobdata->num_apps);

        /* Remove association with the old app_context */
        --(cur_app_context->num_procs);
    } else {
        new_app_context = cur_app_context;

        /* Clean out old stuff */
        free(new_app_context->app);
        new_app_context->app = NULL;

        opal_argv_free(new_app_context->argv);
        new_app_context->argv = NULL;

        asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
        asprintf(&(new_app_context->sstore_load),
                 "%s:%s:%s:%s:%s:%s",
                 location_str,
                 global_snapshot_ref,
                 tmp_str,
                 (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
                 (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix),
                 global_snapshot_seq);
    }

    /*
     * Update the app_context with the restart information
     */
    new_app_context->app = strdup("opal-restart");
    opal_argv_append(&argc, &(new_app_context->argv), new_app_context->app);

    opal_argv_append(&argc, &(new_app_context->argv), "-l");
    opal_argv_append(&argc, &(new_app_context->argv), location_str);

    opal_argv_append(&argc, &(new_app_context->argv), "-m");
    opal_argv_append(&argc, &(new_app_context->argv), orte_sstore_base_local_metadata_filename);

    opal_argv_append(&argc, &(new_app_context->argv), "-r");
    if( NULL != tmp_str ) {
        free(tmp_str);
        tmp_str = NULL;
    }
    asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
    opal_argv_append(&argc, &(new_app_context->argv), tmp_str);

 cleanup:
    if( NULL != tmp_str) {
        free(tmp_str);
        tmp_str = NULL;
    }
    if( NULL != location_str ) {
        free(location_str);
        location_str = NULL;
    }
    if( NULL != cache_location_str ) {
        free(cache_location_str);
        cache_location_str = NULL;
    }
    if( NULL != reference_fmt_str ) {
        free(reference_fmt_str);
        reference_fmt_str = NULL;
    }
    if( NULL != ref_location_fmt_str ) {
        free(ref_location_fmt_str);
        ref_location_fmt_str = NULL;
    }

    return exit_status;
}
/**
 * This function gets called when a control message is received that
 * is one of the following types:
 *   MCA_BTL_OPENIB_CONTROL_EP_BROKEN
 *   MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR message
 * Note that we are using the working connection to send information
 * about the broken connection.  That is why we have to look at the
 * various information in the control message to figure out which
 * endpoint is broken.  It is (obviously) not the one the message was
 * received on, because we would not have received the message in that
 * case.  In the case of the BROKEN message, that means the remote
 * side is notifying us that it has brought down its half of the
 * connection.  Therefore, we need to bring our half down.  This is
 * done because it has been observed that there are cases where only
 * one side of the connection actually sees the error.  This means we
 * can be left in a state where one side believes it has two BTLs, but
 * the other side believes it only has one.  This can cause problems.
 * In the case of the EAGER_RDMA_ERROR, see elsewhere in the code what
 * we are doing.
 * @param ctl_hdr Pointer control header that was received
 */
void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t *ctl_hdr,
                                                 mca_btl_openib_endpoint_t* ep)
{
    mca_btl_openib_broken_connection_header_t *bc_hdr =
        (mca_btl_openib_broken_connection_header_t*)ctl_hdr;
    int i;
    int found = false;

    if(ep->nbo) {
        BTL_OPENIB_BROKEN_CONNECTION_HEADER_NTOH((*bc_hdr));
    }

    opal_output_verbose(30, mca_btl_openib_component.verbose_failover,
                        "IB: Control message received from %d: lid=%d,subnet=0x%" PRIx64 "",
                        bc_hdr->vpid, bc_hdr->lid, bc_hdr->subnet_id);

    /* Now we walk through all the endpoints on all the BTLs to
     * find out which one to map out. */
    for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
        mca_btl_openib_module_t* newbtl;
        int j;

        newbtl = mca_btl_openib_component.openib_btls[i];

        /* Now, find the endpoint associated with it */
        for (j = 0; j < opal_pointer_array_get_size(newbtl->device->endpoints); j++) {
            mca_btl_base_endpoint_t* newep;

            newep = (mca_btl_openib_endpoint_t*)
                opal_pointer_array_get_item(newbtl->device->endpoints, j);
            if (NULL == newep) {
                continue;
            }

            /* Now compare the LID, subnet ID, and the vpid we received
             * from the remote side and try to match it to an endpoint. */
            if ((bc_hdr->lid == newep->rem_info.rem_lid) &&
                (bc_hdr->subnet_id == newep->rem_info.rem_subnet_id) &&
                (bc_hdr->vpid == newep->endpoint_proc->proc_opal->proc_name.vpid)) {
                opal_output_verbose(30, mca_btl_openib_component.verbose_failover,
                                    "IB: Control message received from %d: "
                                    "found match: lid=%d,"
                                    "subnet=0x%" PRIx64 ",endpoint_state=%d",
                                    newep->endpoint_proc->proc_opal->proc_name.vpid,
                                    newep->rem_info.rem_lid,
                                    newep->rem_info.rem_subnet_id,
                                    newep->endpoint_state);
                found = true;

                /* At this point, we have found the endpoint.  Now decode the
                 * message type and do the appropriate action. */
                if (MCA_BTL_OPENIB_CONTROL_EP_BROKEN == ctl_hdr->type) {
                    /* Now that we found a match, check the state of the
                     * endpoint to see if it is already in a failed state.
                     * If not, then notify the upper layer and error out
                     * any pending fragments. */
                    if (MCA_BTL_IB_FAILED == newep->endpoint_state) {
                        return;
                    } else {
                        char *btlname = NULL;
                        opal_proc_t* remote_proc = NULL;

                        asprintf(&btlname, "lid=%d:name=%s",
                                 newbtl->lid, newbtl->device->ib_dev->name);

                        remote_proc = newep->endpoint_proc->proc_opal;

                        opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
                                            "IB: Control message received from %d: "
                                            "bringing down connection,lid=%d,"
                                            "subnet=0x%" PRIx64 ",endpoint_state=%d",
                                            newep->endpoint_proc->proc_opal->proc_name.vpid,
                                            newep->rem_info.rem_lid,
                                            newep->rem_info.rem_subnet_id,
                                            newep->endpoint_state);
                        newbtl->error_cb(&newbtl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                                         remote_proc, btlname);
                        if (NULL != btlname) free(btlname);

                        error_out_all_pending_frags(newep, &newbtl->super, true);

                        newep->endpoint_state = MCA_BTL_IB_FAILED;
                        return;
                    }
                } else { /* MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR message */
                    /* If we are still pointing at the location where
                     * we detected an error on the remote side, then
                     * bump the index by one. */
                    if (newep->eager_rdma_local.head == (uint16_t)bc_hdr->index) {
                        /* Adjust the local head by one just in case */
                        MCA_BTL_OPENIB_RDMA_NEXT_INDEX(newep->eager_rdma_local.head);
                        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                                            "IB: rank=%d, control message (remote=%d), "
                                            "moved local head by one (new=%d)",
                                            opal_process_name_vpid(OPAL_PROC_MY_NAME),
                                            opal_process_name_vpid(newep->endpoint_proc->proc_opal->proc_name),
                                            newep->eager_rdma_local.head);
                    } else {
                        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                                            "IB: rank=%d, control message (remote=%d), "
                                            "did not move local head by one (still=%d)",
                                            opal_process_name_vpid(OPAL_PROC_MY_NAME),
                                            opal_process_name_vpid(newep->endpoint_proc->proc_opal->proc_name),
                                            newep->eager_rdma_local.head);
                    }
                }
                break; /* since we found the endpoint */
            }
        }
    }
    if (false == found) {
        opal_output_verbose(30, mca_btl_openib_component.verbose_failover,
                            "IB: Control message: no match found");
    }
}
/**
 * This function is used to send a message to the remote side
 * indicating the endpoint is broken and telling the remote side to
 * bring its endpoint down as well.  This is needed because there are
 * cases where only one side of the connection determines that there
 * was a problem.
 * @param endpoint Pointer to endpoint with error
 * @param type Type of message to be sent, can be one of two types
 * @param index When sending RDMA error message, index is non zero
 */
static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint, uint8_t type, int index)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_module_t* newbtl = NULL;
    bool found = false;
    mca_btl_openib_broken_connection_header_t *bc_hdr;
    mca_btl_openib_send_control_frag_t* frag;
    mca_btl_base_endpoint_t* newep;
    int i, rc;
    opal_proc_t* remote_proc = endpoint->endpoint_proc->proc_opal;

    /* First, find a different BTL than this one that got the
     * error to send the message over. */
    for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
        if (mca_btl_openib_component.openib_btls[i] != openib_btl) {
            newbtl = mca_btl_openib_component.openib_btls[i];
            break;
        }
    }
    if (NULL == newbtl) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No BTL found");
        /* If we cannot find one, then just return. */
        return;
    }

    /* Now, find the endpoint associated with it.  The device
     * associated with the BTL has the list of all the endpoints. */
    for (i = 0; i < opal_pointer_array_get_size(newbtl->device->endpoints); i++) {
        newep = (mca_btl_openib_endpoint_t*)
            opal_pointer_array_get_item(newbtl->device->endpoints, i);
        if (NULL == newep) {
            continue;
        }
        if (newep->endpoint_proc->proc_opal == remote_proc) {
            found = true;
            break;
        }
    }
    if (false == found) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No endpoint found");
        /* If we cannot find a match, then just return. */
        return;
    }

    frag = alloc_control_frag(newbtl);
    if(NULL == frag) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No frag space");
        /* If no frag available, then just return. */
        return;
    }

    to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_notify_cb;
    to_base_frag(frag)->base.des_cbdata = NULL;
    to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
    to_base_frag(frag)->segment.base.seg_len =
        sizeof(mca_btl_openib_broken_connection_header_t);
    to_com_frag(frag)->endpoint = newep;

    frag->hdr->tag = MCA_BTL_TAG_IB;
    bc_hdr = (mca_btl_openib_broken_connection_header_t*)to_base_frag(frag)->segment.base.seg_addr.pval;
    bc_hdr->control.type = type;
    bc_hdr->lid = endpoint->endpoint_btl->port_info.lid;
    bc_hdr->subnet_id = endpoint->endpoint_btl->port_info.subnet_id;
    bc_hdr->vpid = opal_process_name_vpid(OPAL_PROC_MY_NAME);
    bc_hdr->index = index;

    if(newep->nbo) {
        BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON((*bc_hdr));
    }
    rc = mca_btl_openib_endpoint_send(newep, frag);
    if (OPAL_SUCCESS == rc || OPAL_ERR_RESOURCE_BUSY == rc) {
        return;
    }

    MCA_BTL_IB_FRAG_RETURN(frag);
    BTL_ERROR(("Error sending BROKEN CONNECTION buffer (%s)", strerror(errno)));

    return;
}
static void errmgr_autor_process_fault_daemon(orte_job_t *jdata,
                                              orte_process_name_t *proc,
                                              orte_proc_state_t state)
{
    orte_proc_t *loc_proc = NULL, *child_proc = NULL;
    orte_std_cntr_t i_proc;
    int32_t i;

    OPAL_OUTPUT_VERBOSE((15, mca_errmgr_hnp_component.super.output_handle,
                         "%s errmgr:hnp(autor): process_fault_daemon() "
                         "------- Daemon fault reported! proc %s (0x%x)",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         state));

    /*
     * Set the process state in the job data structure
     */
    for(i = 0; i < jdata->procs->size; ++i) {
        if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
            continue;
        }

        if( loc_proc->name.vpid != proc->vpid) {
            continue;
        }

        loc_proc->state = state;
        break;
    }

    /*
     * Remove the route to this process
     */
    orte_routed.delete_route(proc);

    /*
     * If the aborted daemon had active processes on its node, then we should
     * make sure to signal that all the children are gone.
     * (loc_proc can be NULL if the daemon was not found in the job data, so
     * guard the dereference.)
     */
    if( NULL != loc_proc && loc_proc->node->num_procs > 0 ) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                             "%s errmgr:base: stabalize_runtime() "
                             "------- Daemon lost with the following processes",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        for(i_proc = 0; i_proc < opal_pointer_array_get_size(loc_proc->node->procs); ++i_proc) {
            child_proc = (orte_proc_t*)opal_pointer_array_get_item(loc_proc->node->procs, i_proc);
            if( NULL == child_proc ) {
                continue;
            }

            OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                                 "%s errmgr:base: stabalize_runtime() "
                                 "\t %s [0x%x]",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child_proc->name),
                                 child_proc->state));

            if( child_proc->last_errmgr_state < child_proc->state ) {
                child_proc->last_errmgr_state = child_proc->state;
                orte_errmgr.update_state(child_proc->name.jobid,
                                         ORTE_JOB_STATE_COMM_FAILED,
                                         &(child_proc->name),
                                         ORTE_PROC_STATE_COMM_FAILED,
                                         0, 1);
            }
        }
    } else {
        /* This daemon had no children, so just mask the failure */
        mca_errmgr_hnp_component.ignore_current_update = true;
    }

    /*
     * Record the dead daemon
     */
    orte_errmgr_hnp_record_dead_process(proc);

    return;
}
static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_list_item_t *item = NULL;
    errmgr_autor_wp_item_t *wp_item = NULL;
    orte_std_cntr_t i_proc;
    orte_proc_t *proc = NULL;
    orte_sstore_base_global_snapshot_info_t *snapshot = NULL;
    char * tmp_str = NULL;

    autor_mask_faults = true;
    ERRMGR_AUTOR_CLEAR_TIMERS();
    ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_START);

    /*
     * Display the processes that are to be recovered
     */
    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "%s errmgr:hnp(autor):recover() "
                         "------- Display known failed processes in the job %s -------",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(current_global_jobdata->jobid)));

    opal_list_sort(procs_pending_recovery, autor_procs_sort_compare_fn);
    display_procs();

    /*
     * Find the latest checkpoint
     */
    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "%s errmgr:hnp(autor):recover() "
                         "------- Find the latest checkpoint for the job %s -------",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(current_global_jobdata->jobid)));

    snapshot = OBJ_NEW(orte_sstore_base_global_snapshot_info_t);
    if( ORTE_SUCCESS != (ret = orte_sstore.request_global_snapshot_data(&orte_sstore_handle_last_stable,
                                                                        snapshot)) ) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_SETUP);

    /*
     * Safely terminate the entire job
     */
    opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp(autor):recover() "
                        "------- Safely terminate the job %s -------",
                        ORTE_JOBID_PRINT(current_global_jobdata->jobid));

    for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) {
        proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc);
        if( NULL == proc ) {
            continue;
        }

        if( proc->state < ORTE_PROC_STATE_UNTERMINATED ) {
            proc->state = ORTE_PROC_STATE_MIGRATING;
        }

        if( current_global_jobdata->stdin_target == proc->name.vpid ) {
            orte_iof.close(&(proc->name), ORTE_IOF_STDIN);
        }
    }

    orte_plm.terminate_procs(current_global_jobdata->procs);

    /*
     * Wait for the job to terminate all processes
     */
    while(!check_if_terminated(current_global_jobdata->procs) ) {
        opal_progress();
    }

    ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_TERM);

    opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp(autor):recover() "
                        "------- Done waiting for termination of job %s -------",
                        ORTE_JOBID_PRINT(current_global_jobdata->jobid));

    current_global_jobdata->num_terminated = current_global_jobdata->num_procs;
    orte_plm_base_reset_job(current_global_jobdata);

    /*
     * Construct the app contexts to restart
     */
    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "%s errmgr:hnp(autor):recover() "
                         "------- Rebuild job %s app context -------",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(current_global_jobdata->jobid)));

    for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) {
        proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc);
        if( NULL == proc ) {
            continue;
        }

        if( ORTE_SUCCESS != (ret = orte_errmgr_base_update_app_context_for_cr_recovery(current_global_jobdata,
                                                                                       proc,
                                                                                       &(snapshot->local_snapshots))) ) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        /* use the hex format specifier for the state, as elsewhere */
        OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                             "\tAdjusted: \"%s\" [0x%x] [%s]\n",
                             ORTE_NAME_PRINT(&proc->name),
                             proc->state,
                             proc->node->name));
    }

    ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_RESETUP);

    /*
     * Spawn the restarted job
     */
    opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp(autor):recover() "
                        "------- Respawning the job %s -------",
                        ORTE_JOBID_PRINT(current_global_jobdata->jobid));
    orte_snapc_base_has_recovered = false;
    autor_mask_faults = false; /* Failures past this point are worth noting */
    orte_plm.spawn(current_global_jobdata);

    /*
     * Wait for all the processes to restart
     */
    opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp(autor):recover() "
                        "------- Waiting for restart -------");
    while(!check_if_restarted(current_global_jobdata->procs) ) {
        opal_progress();
    }

    ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_RESTART);

    /*
     * All done
     */
    while( !orte_snapc_base_has_recovered ) {
        opal_progress();
    }

    opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp(autor):recover() "
                        "------- Finished recovering job %s -------",
                        ORTE_JOBID_PRINT(current_global_jobdata->jobid));

    opal_show_help("help-orte-errmgr-hnp.txt", "autor_recovery_complete", true);

    ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_FINISH);

 cleanup:
    while(NULL != (item = opal_list_remove_first(procs_pending_recovery))) {
        wp_item = (errmgr_autor_wp_item_t*)item;
        OBJ_RELEASE(wp_item);
    }

    if( NULL != tmp_str ) {
        free(tmp_str);
        tmp_str = NULL;
    }

    ERRMGR_AUTOR_DISPLAY_ALL_TIMERS();

    autor_timer_active = false;
    autor_mask_faults  = false;

    return;
}