Beispiel #1
0
static int
pvfs2_register(void)
{
    int param;

    param = mca_base_param_find ("fs", NULL, "pvfs2_stripe_size");
    if (param >= 0)
    {
        mca_base_param_lookup_int (param, &mca_fs_pvfs2_stripe_size);
    }
    param = mca_base_param_find ("fs", NULL, "pvfs2_stripe_width");
    if (param >= 0)
    {
        mca_base_param_lookup_int (param, &mca_fs_pvfs2_stripe_width);
    }

    mca_base_param_reg_int (&mca_fs_pvfs2_component.fsm_version,
                            "priority",
                            "Priority of the pvfs2 fs component",
                            false, false, mca_fs_pvfs2_priority,
                            &mca_fs_pvfs2_priority);
    mca_base_param_reg_int (&mca_fs_pvfs2_component.fsm_version,
                            "stripe_size",
                            "stripe size of a file over pvfs2",
                            false, false, mca_fs_pvfs2_stripe_size,
                            &mca_fs_pvfs2_stripe_size);
    mca_base_param_reg_int (&mca_fs_pvfs2_component.fsm_version,
                            "stripe_width",
                            "stripe width of a file over pvfs2",
                            false, false, mca_fs_pvfs2_stripe_width,
                            &mca_fs_pvfs2_stripe_width);

    return OMPI_SUCCESS;
}
/*
 * Setup the freelist of IO requests.  This does not need to be
 * protected with a lock because it's called during MPI_INIT.
 */
int mca_io_base_request_create_freelist(void)
{
    opal_list_item_t *p;
    const mca_base_component_t *component;
    const mca_io_base_component_2_0_0_t *v200;
    size_t size = 0;
    int i, init, incr;

    /* Find the maximum additional number of bytes required by all io
       components for requests and make that the request size */

    for (p = opal_list_get_first(&mca_io_base_components_available); 
         p != opal_list_get_end(&mca_io_base_components_available); 
         p = opal_list_get_next(p)) {
        component = ((mca_base_component_priority_list_item_t *) 
                     p)->super.cli_component;

        /* Only know how to handle v2.0.0 components for now */

        if (component->mca_type_major_version == 2 &&
            component->mca_type_minor_version == 0 &&
            component->mca_type_release_version == 0) {
            v200 = (mca_io_base_component_2_0_0_t *) component;
            if (v200->io_request_bytes > size) {
                size = v200->io_request_bytes;
            }
        }
    }

    /* Construct and initialized the freelist of IO requests. */

    OBJ_CONSTRUCT(&mca_io_base_requests, ompi_free_list_t);
    mca_io_base_requests_valid = true;
    i = mca_base_param_find("io", NULL, "base_freelist_initial_size");
    mca_base_param_lookup_int(i, &init);
    i = mca_base_param_find("io", NULL, "base_freelist_increment");
    mca_base_param_lookup_int(i, &incr);

    ompi_free_list_init_new(&mca_io_base_requests,
                        sizeof(mca_io_base_request_t) + size,
                        CACHE_LINE_SIZE,
                        OBJ_CLASS(mca_io_base_request_t),
                        0,CACHE_LINE_SIZE,
                        init, -1, incr,
                        NULL);

    /* All done */

    return OMPI_SUCCESS;
}
int mca_bml_base_open(void) 
{
    /* See if we've already been here */
    if (++mca_bml_base_already_opened > 1) {
        return OMPI_SUCCESS;
    }

    if(OMPI_SUCCESS !=
       mca_base_components_open("bml", 0, mca_bml_base_static_components, 
                                &mca_bml_base_components_available, 
                                true)) {  
        return OMPI_ERROR; 
    }

#if OPAL_ENABLE_DEBUG_RELIABILITY
    do {
        int param, value;
        
        mca_base_param_register_int("bml", NULL, "error_rate_floor", "error_rate_floor", 0);
        param = mca_base_param_find("bml", NULL, "error_rate_floor");
        mca_base_param_lookup_int(param, &value);
        mca_bml_base_error_rate_floor = value;

        mca_base_param_register_int("bml", NULL, "error_rate_ceiling", "error_rate_ceiling", 0);
        param = mca_base_param_find("bml", NULL, "error_rate_ceiling");
        mca_base_param_lookup_int(param, &value);
        mca_bml_base_error_rate_ceiling = value;


        mca_base_param_register_int("bml", NULL, "srand", "srand", 1);
        param = mca_base_param_find("bml", NULL, "srand");
        mca_base_param_lookup_int(param, &value);

        /* seed random number generator */
        if(value) {
            struct timeval tv;
            gettimeofday(&tv, NULL);
            srand(getpid() * tv.tv_usec);
        }

        /* initialize count */
        if(mca_bml_base_error_rate_ceiling > 0 
           && mca_bml_base_error_rate_floor <= mca_bml_base_error_rate_ceiling) {
            mca_bml_base_error_count = (int) ((mca_bml_base_error_rate_ceiling * rand())/(RAND_MAX+1.0));
        }
    } while (0);
#endif
    return mca_btl_base_open(); 
}
static int
dynamic_register(void)
{
    int param;

    param = mca_base_param_find ("fcoll", NULL, "dynamic_priority");
    if (param >= 0)
    {
        mca_base_param_lookup_int (param, &mca_fcoll_dynamic_priority);
    }
    param = mca_base_param_find ("fcoll", NULL, "dynamic_num_io_procs");
    if (param >= 0)
    {
        mca_base_param_lookup_int (param, &mca_fcoll_dynamic_num_io_procs);
    }
    param = mca_base_param_find ("fcoll", NULL, "dynamic_constant_cbs");
    if (param >= 0)
    {
        mca_base_param_lookup_int (param, &mca_fcoll_dynamic_constant_cbs);
    }
    param = mca_base_param_find ("fcoll", NULL, "dynamic_cycle_buffer_size");
    if (param >= 0)
    {
        mca_base_param_lookup_int (param, &mca_fcoll_dynamic_cycle_buffer_size);
    }

    mca_base_param_reg_int (&mca_fcoll_dynamic_component.fcollm_version,
                            "priority",
                            "Priority of the dynamic fcoll component",
                            false, false, mca_fcoll_dynamic_priority,
                            &mca_fcoll_dynamic_priority);
    mca_base_param_reg_int (&mca_fcoll_dynamic_component.fcollm_version,
                            "num_io_procs",
                            "Number of writers in the dynamic fcoll component",
                            false, false, mca_fcoll_dynamic_num_io_procs,
                            &mca_fcoll_dynamic_num_io_procs);
    mca_base_param_reg_int (&mca_fcoll_dynamic_component.fcollm_version,
                            "constant_cbs",
                            "wether we are using constant or scaling cycle buffer size in the dynamic fcoll component",
                            false, false, mca_fcoll_dynamic_constant_cbs,
                            &mca_fcoll_dynamic_constant_cbs);
    mca_base_param_reg_int (&mca_fcoll_dynamic_component.fcollm_version,
                            "cycle_buffer_size",
                            "Cycle Buffer Size of the dynamic fcoll component",
                            false, false, mca_fcoll_dynamic_cycle_buffer_size,
                            &mca_fcoll_dynamic_cycle_buffer_size);

    return OMPI_SUCCESS;
}
Beispiel #5
0
int
orte_sds_singleton_set_name(void)
{
    int rc, id, flag;
    orte_vpid_t vpid;

    if (ORTE_SUCCESS != (rc = orte_ns.create_my_name())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    vpid = ORTE_PROC_MY_NAME->vpid;
    
    orte_process_info.num_procs = 1;
    orte_process_info.vpid_start = vpid;
    /* only set the singleton flag is we are NOT infrastructure, 
       and it has not been previously set. */
    id = mca_base_param_find("orte", NULL, "infrastructure");
    mca_base_param_lookup_int(id, &flag);
    if (!flag) {
        orte_process_info.singleton = true;
    }
    
    return ORTE_SUCCESS;
}
/* init the progress engine - called from orte_init */
int
opal_progress_init(void)
{
#if OPAL_ENABLE_DEBUG
    int param, value;
#endif

    /* reentrant issues */
#if OPAL_HAVE_THREAD_SUPPORT
    opal_atomic_init(&progress_lock, OPAL_ATOMIC_UNLOCKED);
#endif  /* OPAL_HAVE_THREAD_SUPPORT */

    /* set the event tick rate */
    opal_progress_set_event_poll_rate(10000);

#if OPAL_ENABLE_DEBUG
    param = mca_base_param_find("opal", NULL, "progress_debug");
    mca_base_param_lookup_int(param, &value);
    if (value) {
        debug_output = opal_output_open(NULL);
    }
#endif

    OPAL_OUTPUT((debug_output, "progress: initialized event flag to: %x",
                 opal_progress_event_flag));                 
    OPAL_OUTPUT((debug_output, "progress: initialized yield_when_idle to: %s",
                 call_yield == 0 ? "false" : "true"));
    OPAL_OUTPUT((debug_output, "progress: initialized num users to: %d",
                 num_event_users));
    OPAL_OUTPUT((debug_output, "progress: initialized poll rate to: %ld",
                 (long) event_progress_delta));

    return OPAL_SUCCESS;
}
Beispiel #7
0
int
ompi_common_mx_initialize(void)
{
    mx_return_t mx_return;
    struct mca_mpool_base_resources_t mpool_resources;
    int index, value;
    
    ompi_common_mx_initialize_ref_cnt++;
    
    if(ompi_common_mx_initialize_ref_cnt == 1) { 
        /* set the MX error handle to always return. This function is the
         * only MX function allowed to be called before mx_init in order
         * to make sure that if the MX is not up and running the MX
         * library does not exit the application.
         */
        mx_set_error_handler(MX_ERRORS_RETURN);
	
	/* If we have a memory manager available, and
	   mpi_leave_pinned == -1, then set mpi_leave_pinned to 1.

	   We have a memory manager if:
	   - we have both FREE and MUNMAP support
	   - we have MUNMAP support and the linux mallopt */
	value = opal_mem_hooks_support_level();
	if ((value & (OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT))
            == (OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT)) {
	  index = mca_base_param_find("mpi", NULL, "leave_pinned");
	  if (index >= 0)
            if ((mca_base_param_lookup_int(index, &value) == OPAL_SUCCESS) 
		&& (value == -1)) {
	      
	      ompi_mpi_leave_pinned = 1;
	      setenv("MX_RCACHE", "2", 1);
	      mpool_resources.regcache_clean = mx__regcache_clean;
	      ompi_common_mx_fake_mpool = 
		mca_mpool_base_module_create("fake", NULL, &mpool_resources);
	      if (!ompi_common_mx_fake_mpool) {
		ompi_mpi_leave_pinned = 0;
		setenv("MX_RCACHE", "0", 1);
		opal_output(0, "Error creating fake mpool (error %s)\n",
			    strerror(errno));
	      }
	    }
	}
	
        /* initialize the mx library */
        mx_return = mx_init(); 
        
        if(MX_SUCCESS != mx_return) {
            opal_output(0,
                        "Error in mx_init (error %s)\n",
                        mx_strerror(mx_return));
            return OMPI_ERR_NOT_AVAILABLE;
        }
        
    } 
    return OMPI_SUCCESS;
}
int opal_carto_auto_detect_component_query(mca_base_module_t **module, int *priority)
{
    int param;

    param = mca_base_param_find("carto", "auto_detect", "priority");
    mca_base_param_lookup_int(param, priority);

    *module = (mca_base_module_t *)&loc_module;

    return OPAL_SUCCESS;
}
int opal_paffinity_solaris_component_query(mca_base_module_t **module, int *priority)
{
    int param;

    param = mca_base_param_find("paffinity", "solaris", "priority");
    mca_base_param_lookup_int(param, priority);

    *module = (mca_base_module_t *)&loc_module;

    return OPAL_SUCCESS;
}
Beispiel #10
0
/**
 * Run a user-level debugger
 */
void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
                       int argc, char *argv[])
{
    int i, id;
    char **new_argv = NULL;
    char *value, **lines;

    /* Get the orte_base_debug MCA parameter and search for a debugger
       that can run */
    
    id = mca_base_param_find("orte", NULL, "base_user_debugger");
    if (id < 0) {
        opal_show_help("help-orterun.txt", "debugger-mca-param-not-found", 
                       true);
        exit(1);
    }
    value = NULL;
    mca_base_param_lookup_string(id, &value);
    if (NULL == value) {
        opal_show_help("help-orterun.txt", "debugger-orte_base_user_debugger-empty",
                       true);
        exit(1);
    }

    /* Look through all the values in the MCA param */

    lines = opal_argv_split(value, ':');
    free(value);
    for (i = 0; NULL != lines[i]; ++i) {
        if (ORTE_SUCCESS == process(lines[i], basename, cmd_line, argc, argv, 
                                    &new_argv)) {
            break;
        }
    }

    /* If we didn't find one, abort */

    if (NULL == lines[i]) {
        opal_show_help("help-orterun.txt", "debugger-not-found", true);
        exit(1);
    }
    opal_argv_free(lines);

    /* We found one */

    execvp(new_argv[0], new_argv);
    value = opal_argv_join(new_argv, ' ');
    opal_show_help("help-orterun.txt", "debugger-exec-failed",
                   true, basename, value, new_argv[0]);
    free(value);
    opal_argv_free(new_argv);
    exit(1);
}
static int lookup_set(char *a, char *b, char *c, int default_val,
                      char *token, int *argc, char ***argv)
{
    int id, rc;
    
    id = mca_base_param_find(a, b, c);
    if (id < 0) {
        id = mca_base_param_register_int(a, b, c, NULL, default_val);
    }
    mca_base_param_lookup_int(id, &rc);
    if (rc) {
        opal_argv_append(argc, argv, token);
    }
    
    return ORTE_SUCCESS;
}
Beispiel #12
0
/*
 * Function to find as many components of a given type as possible.  This
 * includes statically-linked in components as well as opening up a
 * directory and looking for shared-library MCA components of the
 * appropriate type (load them if available).
 *
 * Return one consolidated array of (mca_base_component_t*) pointing to all
 * available components.
 */
int mca_base_component_find(const char *directory, const char *type, 
                         const mca_base_component_t *static_components[], 
                            opal_list_t *found_components,
                            bool open_dso_components)
{
  int i;
  mca_base_component_list_item_t *cli;

  /* Find all the components that were statically linked in */

  OBJ_CONSTRUCT(found_components, opal_list_t);
  for (i = 0; NULL != static_components[i]; ++i) {
    cli = OBJ_NEW(mca_base_component_list_item_t);
    if (NULL == cli) {
      return OPAL_ERR_OUT_OF_RESOURCE;
    }
    cli->cli_component = static_components[i];
    opal_list_append(found_components, (opal_list_item_t *) cli);
  }

#if OMPI_WANT_LIBLTDL
  /* Find any available dynamic components in the specified directory */
  if (open_dso_components) {
      int param, param_disable_dlopen;
      param = mca_base_param_find("mca", NULL, "component_disable_dlopen");
      mca_base_param_lookup_int(param, &param_disable_dlopen);

      if (0 == param_disable_dlopen) {
          find_dyn_components(directory, type, NULL, found_components);
      }
  } else {
    opal_output_verbose(40, 0, 
                        "mca: base: component_find: dso loading for %s MCA components disabled", 
                        type);
  }
#endif

  /* All done */

  return OPAL_SUCCESS;
}
Beispiel #13
0
/*
 * Function for finding and opening either all MCA components, or the one
 * that was specifically requested via a MCA parameter.
 */
int mca_fcache_base_open(void)
{
    /* Open an output stream for this framework */

    mca_fcache_base_output = opal_output_open(NULL);

     /* Open up all available components */

    if (OMPI_SUCCESS != 
        mca_base_components_open("fcache", mca_fcache_base_output,
                                 mca_fcache_base_static_components, 
                                 &mca_fcache_base_components_opened, true)) {
        return OMPI_ERROR;
    }
    mca_fcache_base_components_opened_valid = true;

    /* Find the index of the MCA "fcache" param for selection */
    
    mca_fcache_base_param = mca_base_param_find("fcache", "base", NULL);

    return OMPI_SUCCESS;
}
/*
 * Traverse the entire list of found components (a list of
 * mca_base_component_t instances).  If the requested_component_names
 * array is empty, or the name of each component in the list of found
 * components is in the requested_components_array, try to open it.
 * If it opens, add it to the components_available list.
 */
static int open_components(const char *type_name, int output_id,
                           opal_list_t *src, opal_list_t *dest)
{
    int ret;
    opal_list_item_t *item;
    const mca_base_component_t *component;
    mca_base_component_list_item_t *cli;
    bool called_open;
    bool opened, registered;

    /* Announce */

    opal_output_verbose(10, output_id,
                        "mca: base: components_open: opening %s components",
                        type_name);

    /* Traverse the list of found components */

    OBJ_CONSTRUCT(dest, opal_list_t);
    for (item = opal_list_get_first(src);
            opal_list_get_end(src) != item;
            item = opal_list_get_next(item)) {
        cli = (mca_base_component_list_item_t *) item;
        component = cli->cli_component;

        registered = opened = called_open = false;
        opal_output_verbose(10, output_id,
                            "mca: base: components_open: found loaded component %s",
                            component->mca_component_name);

        /* Call the component's MCA parameter registration function */
        if (NULL == component->mca_register_component_params) {
            registered = true;
            opal_output_verbose(10, output_id,
                                "mca: base: components_open: "
                                "component %s has no register function",
                                component->mca_component_name);
        } else {
            ret = component->mca_register_component_params();
            if (MCA_SUCCESS == ret) {
                registered = true;
                opal_output_verbose(10, output_id,
                                    "mca: base: components_open: "
                                    "component %s register function successful",
                                    component->mca_component_name);
            } else if (OPAL_ERR_NOT_AVAILABLE != ret) {
                /* If the component returns OPAL_ERR_NOT_AVAILABLE,
                   it's a cue to "silently ignore me" -- it's not a
                   failure, it's just a way for the component to say
                   "nope!".

                   Otherwise, however, display an error.  We may end
                   up displaying this twice, but it may go to separate
                   streams.  So better to be redundant than to not
                   display the error in the stream where it was
                   expected. */

                if (show_errors) {
                    opal_output(0, "mca: base: components_open: "
                                "component %s / %s register function failed",
                                component->mca_type_name,
                                component->mca_component_name);
                }
                opal_output_verbose(10, output_id,
                                    "mca: base: components_open: "
                                    "component %s register function failed",
                                    component->mca_component_name);
            }
        }

        if (NULL == component->mca_open_component) {
            opened = true;
            opal_output_verbose(10, output_id,
                                "mca: base: components_open: "
                                "component %s has no open function",
                                component->mca_component_name);
        } else {
            called_open = true;
            ret = component->mca_open_component();
            if (MCA_SUCCESS == ret) {
                opened = true;
                opal_output_verbose(10, output_id,
                                    "mca: base: components_open: "
                                    "component %s open function successful",
                                    component->mca_component_name);
            } else if (OPAL_ERR_NOT_AVAILABLE != ret) {
                /* If the component returns OPAL_ERR_NOT_AVAILABLE,
                   it's a cue to "silently ignore me" -- it's not a
                   failure, it's just a way for the component to say
                   "nope!".

                   Otherwise, however, display an error.  We may end
                   up displaying this twice, but it may go to separate
                   streams.  So better to be redundant than to not
                   display the error in the stream where it was
                   expected. */

                if (show_errors) {
                    opal_output(0, "mca: base: components_open: "
                                "component %s / %s open function failed",
                                component->mca_type_name,
                                component->mca_component_name);
                }
                opal_output_verbose(10, output_id,
                                    "mca: base: components_open: "
                                    "component %s open function failed",
                                    component->mca_component_name);
            }
        }

        /* If it didn't open, close it out and get rid of it */

        if (!opened) {
            char *name;
            if (called_open) {
                if (NULL != component->mca_close_component) {
                    component->mca_close_component();
                }
                opal_output_verbose(10, output_id,
                                    "mca: base: components_open: component %s closed",
                                    component->mca_component_name);
                called_open = false;
            }
            name = strdup(component->mca_component_name);
            mca_base_component_repository_release(component);
            opal_output_verbose(10, output_id,
                                "mca: base: components_open: component %s unloaded",
                                name);
            free(name);
        }

        /* If it did open, register its "priority" MCA parameter (if
           it doesn't already have one) and save it in the
           opened_components list */

        else {
            if (OPAL_ERROR == mca_base_param_find(type_name,
                                                  component->mca_component_name,
                                                  "priority")) {
                mca_base_param_register_int(type_name,
                                            component->mca_component_name,
                                            "priority", NULL, 0);
            }

            cli = OBJ_NEW(mca_base_component_list_item_t);
            if (NULL == cli) {
                return OPAL_ERR_OUT_OF_RESOURCE;
            }
            cli->cli_component = component;
            opal_list_append(dest, (opal_list_item_t *) cli);
        }
    }

    /* All done */

    return OPAL_SUCCESS;
}
Beispiel #15
0
/**
 * Run a user-level debugger
 */
void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
                       int argc, char *argv[], int num_procs)
{
    int i, id;
    char **new_argv = NULL;
    char *value, **lines, *env_name;

    /* Get the orte_base_debug MCA parameter and search for a debugger
       that can run */
    
    id = mca_base_param_find("orte", NULL, "base_user_debugger");
    if (id < 0) {
        orte_show_help("help-orterun.txt", "debugger-mca-param-not-found", 
                       true);
        exit(1);
    }
    value = NULL;
    mca_base_param_lookup_string(id, &value);
    if (NULL == value) {
        orte_show_help("help-orterun.txt", "debugger-orte_base_user_debugger-empty",
                       true);
        exit(1);
    }

    /* Look through all the values in the MCA param */

    lines = opal_argv_split(value, ':');
    free(value);
    for (i = 0; NULL != lines[i]; ++i) {
        if (ORTE_SUCCESS == process(lines[i], basename, cmd_line, argc, argv, 
                                    &new_argv, num_procs)) {
            break;
        }
    }

    /* If we didn't find one, abort */

    if (NULL == lines[i]) {
        orte_show_help("help-orterun.txt", "debugger-not-found", true);
        exit(1);
    }
    opal_argv_free(lines);

    /* We found one */
    
    /* cleanup the MPIR arrays in case the debugger doesn't set them */
    memset((char*)MPIR_executable_path, 0, MPIR_MAX_PATH_LENGTH);
    memset((char*)MPIR_server_arguments, 0, MPIR_MAX_ARG_LENGTH);
    
    /* Set an MCA param so that everyone knows that they are being
       launched under a debugger; not all debuggers are consistent
       about setting MPIR_being_debugged in both the launcher and the
       MPI processes */
    env_name = mca_base_param_environ_variable("orte", 
                                               "in_parallel_debugger", NULL);
    if (NULL != env_name) {
        opal_setenv(env_name, "1", true, &environ);
        free(env_name);
    }

    /* Launch the debugger */
    execvp(new_argv[0], new_argv);
    value = opal_argv_join(new_argv, ' ');
    orte_show_help("help-orterun.txt", "debugger-exec-failed",
                   true, basename, value, new_argv[0]);
    free(value);
    opal_argv_free(new_argv);
    exit(1);
}
Beispiel #16
0
/*
 * Open a component, chasing down its dependencies first, if possible.
 */
static int open_component(component_file_item_t *target_file, 
                       opal_list_t *found_components)
{
  int show_errors, param;
  lt_dlhandle component_handle;
  mca_base_component_t *component_struct;
  char *struct_name, *err;
  opal_list_t dependencies;
  opal_list_item_t *cur;
  mca_base_component_list_item_t *mitem;
  dependency_item_t *ditem;
  size_t len;

  opal_output_verbose(40, 0, "mca: base: component_find: examining dyanmic %s MCA component \"%s\"",
                     target_file->type, target_file->name, NULL);
  opal_output_verbose(40, 0, "mca: base: component_find: %s", target_file->filename, NULL);
  param = mca_base_param_find("mca", NULL, "component_show_load_errors");
  mca_base_param_lookup_int(param, &show_errors);

  /* Was this component already loaded (e.g., via dependency)? */

  if (LOADED == target_file->status) {
    opal_output_verbose(40, 0, "mca: base: component_find: already loaded (ignored)", NULL);
    return OPAL_SUCCESS;
  }

  /* Ensure that this component is not already loaded (should only happen
     if it was statically loaded).  It's an error if it's already
     loaded because we're evaluating this file -- not this component.
     Hence, returning OPAL_ERR_PARAM indicates that the *file* failed
     to load, not the component. */

  for (cur = opal_list_get_first(found_components); 
       opal_list_get_end(found_components) != cur;
       cur = opal_list_get_next(cur)) {
    mitem = (mca_base_component_list_item_t *) cur;
    if (0 == strcmp(mitem->cli_component->mca_type_name, target_file->type) &&
        0 == strcmp(mitem->cli_component->mca_component_name, target_file->name)) {
      opal_output_verbose(40, 0, "mca: base: component_find: already loaded (ignored)", NULL);
      target_file->status = FAILED_TO_LOAD;
      return OPAL_ERR_BAD_PARAM;
    }
  }

  /* Look at see if this component has any dependencies.  If so, load
     them.  If we can't load them, then this component must also fail to
     load. */

  OBJ_CONSTRUCT(&dependencies, opal_list_t);
  if (0 != check_ompi_info(target_file, &dependencies, found_components)) {
    target_file->status = FAILED_TO_LOAD;
    free_dependency_list(&dependencies);
    return OPAL_ERR_OUT_OF_RESOURCE;
  }

  /* Now try to load the component */

  component_handle = lt_dlopenext(target_file->filename);
  if (NULL == component_handle) {
    err = strdup(lt_dlerror());
    if (0 != show_errors) {
        opal_output(0, "mca: base: component_find: unable to open %s %s: %s (ignored)", 
                    target_file->type, target_file->name, err);
    }
    opal_output_verbose(40, 0, "mca: base: component_find: unable to open %s: %s (ignored)", 
                        target_file->filename, err, NULL);
    free(err);
    target_file->status = FAILED_TO_LOAD;
    free_dependency_list(&dependencies);
    return OPAL_ERR_BAD_PARAM;
  }

  /* Successfully opened the component; now find the public struct.
     Malloc out enough space for it. */

  len = strlen(target_file->type) + strlen(target_file->name) + 32;
  struct_name = (char*)malloc(len);
  if (NULL == struct_name) {
    lt_dlclose(component_handle);
    target_file->status = FAILED_TO_LOAD;
    free_dependency_list(&dependencies);
    return OPAL_ERR_OUT_OF_RESOURCE;
  }
  snprintf(struct_name, len, "mca_%s_%s_component", target_file->type,
           target_file->name);

  mitem = OBJ_NEW(mca_base_component_list_item_t);
  if (NULL == mitem) {
    free(struct_name);
    lt_dlclose(component_handle);
    target_file->status = FAILED_TO_LOAD;
    free_dependency_list(&dependencies);
    return OPAL_ERR_OUT_OF_RESOURCE;
  }

  component_struct = (mca_base_component_t*)lt_dlsym(component_handle, struct_name);
  if (NULL == component_struct) {
    if (0 != show_errors) {
        opal_output(0, "mca: base: component_find: \"%s\" does not appear to be a valid "
                       "%s MCA dynamic component (ignored)", 
                       target_file->basename, target_file->type, NULL);
    }
    opal_output_verbose(40, 0, "mca: base: component_find: \"%s\" does not appear to be a valid "
                       "%s MCA dynamic component (ignored)", 
                       target_file->basename, target_file->type, NULL);
    free(mitem);
    free(struct_name);
    lt_dlclose(component_handle);
    target_file->status = FAILED_TO_LOAD;
    free_dependency_list(&dependencies);
    return OPAL_ERR_BAD_PARAM;
  }

  /* We found the public struct.  Save it, and register this component to
     be closed later. */

  mitem->cli_component = component_struct;
  opal_list_append(found_components, (opal_list_item_t *) mitem);
  mca_base_component_repository_retain(target_file->type, component_handle, 
                                    component_struct);

  /* Now that that's all done, link all the dependencies in to this
     component's repository entry */

  for (cur = opal_list_remove_first(&dependencies);
       NULL != cur;
       cur = opal_list_remove_first(&dependencies)) {
    ditem = (dependency_item_t *) cur;
    mca_base_component_repository_link(target_file->type,
                                       target_file->name,
                                       ditem->di_component_file_item->type,
                                       ditem->di_component_file_item->name);
    OBJ_RELEASE(ditem);
  }
  OBJ_DESTRUCT(&dependencies);

  opal_output_verbose(40, 0, "mca: base: component_find: opened dynamic %s MCA component \"%s\"",
                     target_file->type, target_file->name, NULL);
  target_file->status = LOADED;
    
  /* All done */
    
  free(struct_name);
  return OPAL_SUCCESS;
}
Beispiel #17
0
static int open_component(void)
{
    int param;

    param = mca_base_param_find ("io", NULL, "ompio_cycle_buffer_size");
    if (param >= 0) {
        mca_base_param_lookup_int (param, &mca_io_ompio_cycle_buffer_size);
    }
    param = mca_base_param_find ("io", NULL, "ompio_bytes_per_agg");
    if (param >= 0) {
        mca_base_param_lookup_int (param, &mca_io_ompio_bytes_per_agg);
    }

    priority_param = 
        mca_base_param_reg_int(&mca_io_ompio_component.io_version, 
                               "priority",
                               "Priority of the io ompio component",
                               false, false, priority_param, NULL);
    delete_priority_param = 
        mca_base_param_reg_int(&mca_io_ompio_component.io_version,
                               "delete_priority", 
                               "Delete priority of the io ompio component",
                               false, false, delete_priority_param, NULL);

    mca_base_param_reg_string(&mca_io_ompio_component.io_version,
                              "version", 
                              "Version of OMPIO",
                              false, true, NULL, NULL);

    mca_base_param_reg_int (&mca_io_ompio_component.io_version,
                            "cycle_buffer_size",
                            "Cycle Buffer Size of individual reads/writes",
                            false, false, mca_io_ompio_cycle_buffer_size,
                            &mca_io_ompio_cycle_buffer_size);

    mca_base_param_reg_int (&mca_io_ompio_component.io_version,
                            "bytes_per_agg",
                            "Bytes per aggregator process for automatic selection",
                            false, false, mca_io_ompio_bytes_per_agg,
                            &mca_io_ompio_bytes_per_agg);

    /*
    mca_base_param_reg_string(&mca_io_ompio_component.io_version,
                              "user_configure_params", 
                              "User-specified command line parameters passed to OMPIO's configure script",
                              false, true, 
                              MCA_io_ompio_USER_CONFIGURE_FLAGS, NULL);
    mca_base_param_reg_string(&mca_io_ompio_component.io_version,
                              "complete_configure_params", 
                              "Complete set of command line parameters passed to OMPIO's configure script",
                              false, true, 
                              MCA_io_ompio_COMPLETE_CONFIGURE_FLAGS, NULL);
    */
    /* Create the mutex */
    OBJ_CONSTRUCT(&mca_io_ompio_mutex, opal_mutex_t);

    /* Create the list of pending requests */

    OBJ_CONSTRUCT(&mca_io_ompio_pending_requests, opal_list_t);

    return OMPI_SUCCESS;
}
/*
 * Function to find as many components of a given type as possible.  This
 * includes statically-linked in components as well as opening up a
 * directory and looking for shared-library MCA components of the
 * appropriate type (load them if available).
 *
 * Return one consolidated array of (mca_base_component_t*) pointing to all
 * available components.
 */
int mca_base_component_find(const char *directory, const char *type, 
                            const mca_base_component_t *static_components[], 
                            char **requested_component_names,
                            bool include_mode,
                            opal_list_t *found_components,
                            bool open_dso_components)
{
    int i;
    opal_list_item_t *item;
    mca_base_component_list_item_t *cli;

    /* Find all the components that were statically linked in */
    OBJ_CONSTRUCT(found_components, opal_list_t);
    for (i = 0; NULL != static_components[i]; ++i) {
        if ( use_component(include_mode,
                           (const char**)requested_component_names,
                           static_components[i]->mca_component_name) ) {
            cli = OBJ_NEW(mca_base_component_list_item_t);
            if (NULL == cli) {
                return OPAL_ERR_OUT_OF_RESOURCE;
            }
            cli->cli_component = static_components[i];
            opal_list_append(found_components, (opal_list_item_t *) cli);
        }
    }

#if OMPI_WANT_LIBLTDL
    /* Find any available dynamic components in the specified directory */
    if (open_dso_components) {
        int param, param_disable_dlopen;
        param = mca_base_param_find("mca", NULL, "component_disable_dlopen");
        mca_base_param_lookup_int(param, &param_disable_dlopen);

        if (0 == param_disable_dlopen) {
            find_dyn_components(directory, type,
                                (const char**)requested_component_names,
                                include_mode, found_components); 
        }
    } else {
        opal_output_verbose(40, 0, 
                            "mca: base: component_find: dso loading for %s MCA components disabled", 
                            type);
    }
#endif

    /* Ensure that *all* requested components exist.  Print a warning
       and abort if they do not. */
    for (i = 0; include_mode && NULL != requested_component_names && 
             NULL != requested_component_names[i]; ++i) {
        for (item = opal_list_get_first(found_components);
             opal_list_get_end(found_components) != item; 
             item = opal_list_get_next(item)) {
            cli = (mca_base_component_list_item_t*) item;
            if (0 == strcmp(requested_component_names[i], 
                            cli->cli_component->mca_component_name)) {
                break;
            }
        }

        if (opal_list_get_end(found_components) == item) {
            char h[MAXHOSTNAMELEN];
            gethostname(h, sizeof(h));
            opal_show_help("help-mca-base.txt", 
                           "find-available:not-valid", true,
                           h, type, requested_component_names[i]);
            return OPAL_ERR_NOT_FOUND;
        }
    }

    /* All done */

    return OPAL_SUCCESS;
}
Beispiel #19
0
int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
{
    int ret;
    ompi_proc_t** procs;
    size_t nprocs;
    char *error = NULL;
    bool timing = false;
    int param, value;
    struct timeval ompistart, ompistop;
    char *event_val = NULL;
    bool orte_setup = false;
    orte_grpcomm_collective_t *coll;
    char *cmd=NULL, *av=NULL;

    /* bitflag of the thread level support provided. To be used
     * for the modex in order to work in heterogeneous environments. */
    uint8_t threadlevel_bf; 

    /* Indicate that we have *started* MPI_INIT*.  MPI_FINALIZE has
       something sorta similar in a static local variable in
       ompi_mpi_finalize(). */
    ompi_mpi_init_started = true;

    /* Setup enough to check get/set MCA params */

    if (OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv))) {
        error = "ompi_mpi_init: opal_init_util failed";
        goto error;
    }

    if (OPAL_SUCCESS != (ret = opal_arch_set_fortran_logical_size(sizeof(ompi_fortran_logical_t)))) {
        error = "ompi_mpi_init: opal_arch_set_fortran_logical_size failed";
        goto error;
    }

    /* _After_ opal_init_util() but _before_ orte_init(), we need to
       set an MCA param that tells libevent that it's ok to use any
       mechanism in libevent that is available on this platform (e.g.,
       epoll and friends).  Per opal/event/event.s, we default to
       select/poll -- but we know that MPI processes won't be using
       pty's with the event engine, so it's ok to relax this
       constraint and let any fd-monitoring mechanism be used. */
    ret = mca_base_param_reg_string_name("opal", "event_include",
                                         "Internal orted MCA param: tell opal_init() to use a specific mechanism in libevent",
                                         false, false, "all", &event_val);
    if (ret >= 0) {
        /* We have to explicitly "set" the MCA param value here
           because libevent initialization will re-register the MCA
           param and therefore override the default. Setting the value
           here puts the desired value ("all") in different storage
           that is not overwritten if/when the MCA param is
           re-registered. This is unless the user has specified a different
           value for this MCA parameter. Make sure we check to see if the
           default is specified before forcing "all" in case that is not what
           the user desires. Note that we do *NOT* set this value as an
           environment variable, just so that it won't be inherited by
           any spawned processes and potentially cause unintented
           side-effects with launching ORTE tools... */
        if (0 == strcmp("all", event_val)) {
            mca_base_param_set_string(ret, "all");
        }
    }

    if( NULL != event_val ) {
        free(event_val);
        event_val = NULL;
    }

    /* check to see if we want timing information */
    param = mca_base_param_reg_int_name("ompi", "timing",
                                        "Request that critical timing loops be measured",
                                        false, false, 0, &value);
    if (value != 0) {
        timing = true;
        gettimeofday(&ompistart, NULL);
    }
    
    /* if we were not externally started, then we need to setup
     * some envars so the MPI_INFO_ENV can get the cmd name
     * and argv (but only if the user supplied a non-NULL argv!), and
     * the requested thread level
     */
    if (NULL == getenv("OMPI_COMMAND") && NULL != argv && NULL != argv[0]) {
        asprintf(&cmd, "OMPI_COMMAND=%s", argv[0]);
        putenv(cmd);
    }
    if (NULL == getenv("OMPI_ARGV") && 1 < argc) {
        char *tmp;
        tmp = opal_argv_join(&argv[1], ' ');
        asprintf(&av, "OMPI_ARGV=%s", tmp);
        free(tmp);
        putenv(av);
    }

    /* Setup ORTE - note that we are an MPI process  */
    if (ORTE_SUCCESS != (ret = orte_init(NULL, NULL, ORTE_PROC_MPI))) {
        error = "ompi_mpi_init: orte_init failed";
        goto error;
    }
    orte_setup = true;
    
    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init [%ld]: time from start to completion of orte_init %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }

#if OPAL_HAVE_HWLOC
    /* if hwloc is available but didn't get setup for some
     * reason, do so now
     */
    if (NULL == opal_hwloc_topology) {
        if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
            error = "Topology init";
            goto error;
        }
    }
#endif

    /* Register errhandler callback with orte errmgr */
    if (NULL != orte_errmgr.set_fault_callback) {
        orte_errmgr.set_fault_callback(ompi_errhandler_runtime_callback);
    }

    /* Figure out the final MPI thread levels.  If we were not
       compiled for support for MPI threads, then don't allow
       MPI_THREAD_MULTIPLE.  Set this stuff up here early in the
       process so that other components can make decisions based on
       this value. */
    /**
     * These values are monotonic; MPI_THREAD_SINGLE < MPI_THREAD_FUNNELED
     *                             < MPI_THREAD_SERIALIZED < MPI_THREAD_MULTIPLE.
     * If possible, the call will return provided = required. Failing this,
     * the call will return the least supported level such that
     * provided > required. Finally, if the user requirement cannot be
     * satisfied, then the call will return in provided the highest
     * supported level.
     */
    ompi_mpi_thread_requested = requested;

    if (OMPI_ENABLE_THREAD_MULTIPLE == 1) {
        ompi_mpi_thread_provided = *provided = requested;
        ompi_mpi_main_thread = opal_thread_get_self();
    } else {
        if (MPI_THREAD_MULTIPLE == requested) {
            ompi_mpi_thread_provided = *provided = MPI_THREAD_SERIALIZED;
        } else {
            ompi_mpi_thread_provided = *provided = requested;
        }
        ompi_mpi_main_thread = (OPAL_ENABLE_MULTI_THREADS ? opal_thread_get_self() : NULL);
    }

    ompi_mpi_thread_multiple = (ompi_mpi_thread_provided == 
                                MPI_THREAD_MULTIPLE);

    /* determine the bitflag belonging to the threadlevel_support provided */
    memset ( &threadlevel_bf, 0, sizeof(uint8_t));
    OMPI_THREADLEVEL_SET_BITFLAG ( ompi_mpi_thread_provided, threadlevel_bf );

    /* add this bitflag to the modex */
    if ( OMPI_SUCCESS != (ret = ompi_modex_send_string("MPI_THREAD_LEVEL", &threadlevel_bf, sizeof(uint8_t)))) {
        error = "ompi_mpi_init: modex send thread level";
        goto error;
    }

    /* Once we've joined the RTE, see if any MCA parameters were
       passed to the MPI level */

    if (OMPI_SUCCESS != (ret = ompi_mpi_register_params())) {
        error = "mca_mpi_register_params() failed";
        goto error;
    }

    /* initialize datatypes. This step should be done early as it will
     * create the local convertor and local arch used in the proc
     * init.
     */
    if (OMPI_SUCCESS != (ret = ompi_datatype_init())) {
        error = "ompi_datatype_init() failed";
        goto error;
    }

    /* Initialize OMPI procs */
    if (OMPI_SUCCESS != (ret = ompi_proc_init())) {
        error = "mca_proc_init() failed";
        goto error;
    }

    /* Initialize the op framework. This has to be done *after*
       ddt_init, but befor mca_coll_base_open, since some collective
       modules (e.g., the hierarchical coll component) may need ops in
       their query function. */
    if (OMPI_SUCCESS != (ret = ompi_op_base_open())) {
        error = "ompi_op_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != 
        (ret = ompi_op_base_find_available(OMPI_ENABLE_PROGRESS_THREADS,
                                           OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "ompi_op_base_find_available() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_op_init())) {
        error = "ompi_op_init() failed";
        goto error;
    }

    /* Open up MPI-related MCA components */

    if (OMPI_SUCCESS != (ret = mca_allocator_base_open())) {
        error = "mca_allocator_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_rcache_base_open())) {
        error = "mca_rcache_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_mpool_base_open())) {
        error = "mca_mpool_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_pml_base_open())) {
        error = "mca_pml_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_coll_base_open())) {
        error = "mca_coll_base_open() failed";
        goto error;
    }

    if (OMPI_SUCCESS != (ret = ompi_osc_base_open())) {
        error = "ompi_osc_base_open() failed";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    if (OMPI_SUCCESS != (ret = ompi_crcp_base_open())) {
        error = "ompi_crcp_base_open() failed";
        goto error;
    }
#endif

    /* In order to reduce the common case for MPI apps (where they
       don't use MPI-2 IO or MPI-1 topology functions), the io and
       topo frameworks are initialized lazily, at the first use of
       relevant functions (e.g., MPI_FILE_*, MPI_CART_*, MPI_GRAPH_*),
       so they are not opened here. */

    /* Select which MPI components to use */

    if (OMPI_SUCCESS != 
        (ret = mca_mpool_base_init(OMPI_ENABLE_PROGRESS_THREADS,
                                   OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "mca_mpool_base_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_pml_base_select(OMPI_ENABLE_PROGRESS_THREADS,
                                   OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "mca_pml_base_select() failed";
        goto error;
    }

    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from completion of orte_init to modex %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }
    
    /* exchange connection info - this function also acts as a barrier
     * as it will not return until the exchange is complete
     */
    coll = OBJ_NEW(orte_grpcomm_collective_t);
    coll->id = orte_process_info.peer_modex;
    if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(coll))) {
        error = "orte_grpcomm_modex failed";
        goto error;
    }
    /* wait for modex to complete - this may be moved anywhere in mpi_init
     * so long as it occurs prior to calling a function that needs
     * the modex info!
     */
    while (coll->active) {
        opal_progress();  /* block in progress pending events */
    }
    OBJ_RELEASE(coll);

    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time to execute modex %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }

    /* select buffered send allocator component to be used */
    ret=mca_pml_base_bsend_init(OMPI_ENABLE_THREAD_MULTIPLE);
    if( OMPI_SUCCESS != ret ) {
        error = "mca_pml_base_bsend_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_coll_base_find_available(OMPI_ENABLE_PROGRESS_THREADS,
                                            OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "mca_coll_base_find_available() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = ompi_osc_base_find_available(OMPI_ENABLE_PROGRESS_THREADS,
                                            OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "ompi_osc_base_find_available() failed";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    if (OMPI_SUCCESS != (ret = ompi_crcp_base_select() ) ) {
        error = "ompi_crcp_base_select() failed";
        goto error;
    }
#endif

    /* io and topo components are not selected here -- see comment
       above about the io and topo frameworks being loaded lazily */

    /* Initialize each MPI handle subsystem */
    /* initialize requests */
    if (OMPI_SUCCESS != (ret = ompi_request_init())) {
        error = "ompi_request_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != (ret = ompi_message_init())) {
        error = "ompi_message_init() failed";
        goto error;
    }

    /* initialize info */
    if (OMPI_SUCCESS != (ret = ompi_info_init())) {
        error = "ompi_info_init() failed";
        goto error;
    }

    /* initialize error handlers */
    if (OMPI_SUCCESS != (ret = ompi_errhandler_init())) {
        error = "ompi_errhandler_init() failed";
        goto error;
    }

    /* initialize error codes */
    if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_init())) {
        error = "ompi_mpi_errcode_init() failed";
        goto error;
    }
    
    /* initialize internal error codes */
    if (OMPI_SUCCESS != (ret = ompi_errcode_intern_init())) {
        error = "ompi_errcode_intern_init() failed";
        goto error;
    }
     
    /* initialize groups  */
    if (OMPI_SUCCESS != (ret = ompi_group_init())) {
        error = "ompi_group_init() failed";
        goto error;
    }

    /* initialize communicators */
    if (OMPI_SUCCESS != (ret = ompi_comm_init())) {
        error = "ompi_comm_init() failed";
        goto error;
    }

    /* initialize file handles */
    if (OMPI_SUCCESS != (ret = ompi_file_init())) {
        error = "ompi_file_init() failed";
        goto error;
    }

    /* initialize windows */
    if (OMPI_SUCCESS != (ret = ompi_win_init())) {
        error = "ompi_win_init() failed";
        goto error;
    }

    /* initialize attribute meta-data structure for comm/win/dtype */
    if (OMPI_SUCCESS != (ret = ompi_attr_init())) {
        error = "ompi_attr_init() failed";
        goto error;
    }

    /* identify the architectures of remote procs and setup
     * their datatype convertors, if required
     */
    if (OMPI_SUCCESS != (ret = ompi_proc_complete_init())) {
        error = "ompi_proc_complete_init failed";
        goto error;
    }

    /* If thread support was enabled, then setup OPAL to allow for
       them. */
    if ((OMPI_ENABLE_PROGRESS_THREADS == 1) ||
        (*provided != MPI_THREAD_SINGLE)) {
        opal_set_using_threads(true);
    }

    /* start PML/BTL's */
    ret = MCA_PML_CALL(enable(true));
    if( OMPI_SUCCESS != ret ) {
        error = "PML control failed";
        goto error;
    }

    /* add all ompi_proc_t's to PML */
    if (NULL == (procs = ompi_proc_world(&nprocs))) {
        error = "ompi_proc_world() failed";
        goto error;
    }
    ret = MCA_PML_CALL(add_procs(procs, nprocs));
    free(procs);
    /* If we got "unreachable", then print a specific error message.
       Otherwise, if we got some other failure, fall through to print
       a generic message. */
    if (OMPI_ERR_UNREACH == ret) {
        orte_show_help("help-mpi-runtime",
                       "mpi_init:startup:pml-add-procs-fail", true);
        error = NULL;
        goto error;
    } else if (OMPI_SUCCESS != ret) {
        error = "PML add procs failed";
        goto error;
    }

    MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm));
    MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));

    /*
     * Dump all MCA parameters if requested
     */
    if (ompi_mpi_show_mca_params) {
        ompi_show_all_mca_params(ompi_mpi_comm_world.comm.c_my_rank, 
                                 nprocs, 
                                 orte_process_info.nodename);
    }

    /* Do we need to wait for a debugger? */
    ompi_wait_for_debugger();
    
    /* check for timing request - get stop time and report elapsed
       time if so, then start the clock again */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from modex to first barrier %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }
    
    /* wait for everyone to reach this point */
    coll = OBJ_NEW(orte_grpcomm_collective_t);
    coll->id = orte_process_info.peer_init_barrier;
    if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll))) {
        error = "orte_grpcomm_barrier failed";
        goto error;
    }
    /* wait for barrier to complete */
    while (coll->active) {
        opal_progress();  /* block in progress pending events */
    }
    OBJ_RELEASE(coll);

    /* check for timing request - get stop time and report elapsed
       time if so, then start the clock again */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time to execute barrier %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }

#if OMPI_ENABLE_PROGRESS_THREADS == 0
    /* Start setting up the event engine for MPI operations.  Don't
       block in the event library, so that communications don't take
       forever between procs in the dynamic code.  This will increase
       CPU utilization for the remainder of MPI_INIT when we are
       blocking on ORTE-level events, but may greatly reduce non-TCP
       latency. */
    opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK);
#endif
    
    /* wire up the mpi interface, if requested.  Do this after the
       non-block switch for non-TCP performance.  Do before the
       polling change as anyone with a complex wire-up is going to be
       using the oob. */
    if (OMPI_SUCCESS != (ret = ompi_init_preconnect_mpi())) {
        error = "ompi_mpi_do_preconnect_all() failed";
        goto error;
    }

    /* Setup the publish/subscribe (PUBSUB) framework */
    if (OMPI_SUCCESS != (ret = ompi_pubsub_base_open())) {
        error = "ompi_pubsub_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_pubsub_base_select())) {
        error = "ompi_pubsub_base_select() failed";
        goto error;
    }
    
    /* Setup the dynamic process management (DPM) framework */
    if (OMPI_SUCCESS != (ret = ompi_dpm_base_open())) {
        error = "ompi_dpm_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_dpm_base_select())) {
        error = "ompi_dpm_base_select() failed";
        goto error;
    }


    /* Determine the overall threadlevel support of all processes 
       in MPI_COMM_WORLD. This has to be done before calling 
       coll_base_comm_select, since some of the collective components
       e.g. hierarch, might create subcommunicators. The threadlevel
       requested by all processes is required in order to know
       which cid allocation algorithm can be used. */
    if ( OMPI_SUCCESS != 
	 ( ret = ompi_comm_cid_init ())) {
	error = "ompi_mpi_init: ompi_comm_cid_init failed";
	goto error;
    }

    /* Init coll for the comms. This has to be after dpm_base_select, 
       (since dpm.mark_dyncomm is not set in the communicator creation
       function else), but before dpm.dyncom_init, since this function
       might require collective for the CID allocation. */
    if (OMPI_SUCCESS !=
        (ret = mca_coll_base_comm_select(MPI_COMM_WORLD))) {
        error = "mca_coll_base_comm_select(MPI_COMM_WORLD) failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_coll_base_comm_select(MPI_COMM_SELF))) {
        error = "mca_coll_base_comm_select(MPI_COMM_SELF) failed";
        goto error;
    }


    
    /* Check whether we have been spawned or not.  We introduce that
       at the very end, since we need collectives, datatypes, ptls
       etc. up and running here.... */
    if (OMPI_SUCCESS != (ret = ompi_dpm.dyn_init())) {
        error = "ompi_comm_dyn_init() failed";
        goto error;
    }

    /*
     * Startup the Checkpoint/Restart Mech.
     * Note: Always do this so tools don't hang when
     * in a non-checkpointable build
     */
    if (OMPI_SUCCESS != (ret = ompi_cr_init())) {
        error = "ompi_cr_init";
        goto error;
    }

    /* Undo OPAL calling opal_progress_event_users_increment() during 
       opal_init, to get better latency when not using TCP.  Do 
       this *after* dyn_init, as dyn init uses lots of ORTE 
       communication and we don't want to hinder the performance of 
       that code. */ 
    opal_progress_event_users_decrement(); 

    /* see if yield_when_idle was specified - if so, use it */
    param = mca_base_param_find("mpi", NULL, "yield_when_idle");
    mca_base_param_lookup_int(param, &value);
    if (value < 0) {
        /* if no info is provided, just default to conservative */
        opal_progress_set_yield_when_idle(true);
    } else {
        /* info was provided, so set idle accordingly */
        opal_progress_set_yield_when_idle(value == 0 ? false : true);
    }
    
    param = mca_base_param_find("mpi", NULL, "event_tick_rate");
    mca_base_param_lookup_int(param, &value);
    /* negative value means use default - just don't do anything */
    if (value >= 0) {
        opal_progress_set_event_poll_rate(value);
    }

    /* At this point, we are fully configured and in MPI mode.  Any
       communication calls here will work exactly like they would in
       the user's code.  Setup the connections between procs and warm
       them up with simple sends, if requested */

    if (OMPI_SUCCESS != (ret = ompi_mpiext_init())) {
        error = "ompi_mpiext_init";
        goto error;
    }

    /* Fall through */
 error:
    if (ret != OMPI_SUCCESS) {
        /* Only print a message if one was not already printed */
        if (NULL != error) {
            const char *err_msg = opal_strerror(ret);
            /* If ORTE was not setup yet, don't use orte_show_help */
            if (orte_setup) {
                orte_show_help("help-mpi-runtime",
                               "mpi_init:startup:internal-failure", true,
                               "MPI_INIT", "MPI_INIT", error, err_msg, ret);
            } else {
                opal_show_help("help-mpi-runtime",
                               "mpi_init:startup:internal-failure", true,
                               "MPI_INIT", "MPI_INIT", error, err_msg, ret);
            }
        }
        return ret;
    }

    /* Initialize the registered datarep list to be empty */
    OBJ_CONSTRUCT(&ompi_registered_datareps, opal_list_t);

    /* Initialize the arrays used to store the F90 types returned by the
     *  MPI_Type_create_f90_XXX functions.
     */
    OBJ_CONSTRUCT( &ompi_mpi_f90_integer_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_integer_hashtable, 16 /* why not? */);

    OBJ_CONSTRUCT( &ompi_mpi_f90_real_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_real_hashtable, FLT_MAX_10_EXP);

    OBJ_CONSTRUCT( &ompi_mpi_f90_complex_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP);

    /* All done.  Wasn't that simple? */

    ompi_mpi_initialized = true;

    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from barrier to complete mpi_init %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
    }

    return MPI_SUCCESS;
}
int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
{
    int ret;
    ompi_proc_t** procs;
    size_t nprocs;
    char *error = NULL;
    bool timing = false;
    int param, value;
    struct timeval ompistart, ompistop;
    char *event_val = NULL;
    opal_paffinity_base_cpu_set_t mask;
    bool proc_bound;
#if 0
    /* see comment below about sched_yield */
    int num_processors;
#endif
    bool orte_setup = false;
    bool paffinity_enabled = false;

    /* Setup enough to check get/set MCA params */

    if (ORTE_SUCCESS != (ret = opal_init_util())) {
        error = "ompi_mpi_init: opal_init_util failed";
        goto error;
    }

    /* _After_ opal_init_util() but _before_ orte_init(), we need to
       set an MCA param that tells libevent that it's ok to use any
       mechanism in libevent that is available on this platform (e.g.,
       epoll and friends).  Per opal/event/event.s, we default to
       select/poll -- but we know that MPI processes won't be using
       pty's with the event engine, so it's ok to relax this
       constraint and let any fd-monitoring mechanism be used. */
    ret = mca_base_param_reg_string_name("opal", "event_include",
                                         "Internal orted MCA param: tell opal_init() to use a specific mechanism in libevent",
                                         false, false, "all", &event_val);
    if (ret >= 0) {
        /* We have to explicitly "set" the MCA param value here
           because libevent initialization will re-register the MCA
           param and therefore override the default. Setting the value
           here puts the desired value ("all") in different storage
           that is not overwritten if/when the MCA param is
           re-registered. This is unless the user has specified a different
           value for this MCA parameter. Make sure we check to see if the
           default is specified before forcing "all" in case that is not what
           the user desires. Note that we do *NOT* set this value as an
           environment variable, just so that it won't be inherited by
           any spawned processes and potentially cause unintented
           side-effects with launching ORTE tools... */
        if (0 == strcmp("all", event_val)) {
            mca_base_param_set_string(ret, "all");
        }
    }

    if( NULL != event_val ) {
        free(event_val);
        event_val = NULL;
    }

    /* check to see if we want timing information */
    param = mca_base_param_reg_int_name("ompi", "timing",
                                        "Request that critical timing loops be measured",
                                        false, false, 0, &value);
    if (value != 0) {
        timing = true;
        gettimeofday(&ompistart, NULL);
    }
    
    /* Setup ORTE - note that we are not a tool  */
    
    if (ORTE_SUCCESS != (ret = orte_init(ORTE_NON_TOOL))) {
        error = "ompi_mpi_init: orte_init failed";
        goto error;
    }
    orte_setup = true;
    
    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init [%ld]: time from start to completion of orte_init %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }

    /* Figure out the final MPI thread levels.  If we were not
       compiled for support for MPI threads, then don't allow
       MPI_THREAD_MULTIPLE.  Set this stuff up here early in the
       process so that other components can make decisions based on
       this value. */

    ompi_mpi_thread_requested = requested;
    if (OMPI_HAVE_THREAD_SUPPORT == 0) {
        ompi_mpi_thread_provided = *provided = MPI_THREAD_SINGLE;
        ompi_mpi_main_thread = NULL;
    } else if (OMPI_ENABLE_MPI_THREADS == 1) {
        ompi_mpi_thread_provided = *provided = requested;
        ompi_mpi_main_thread = opal_thread_get_self();
    } else {
        if (MPI_THREAD_MULTIPLE == requested) {
            ompi_mpi_thread_provided = *provided = MPI_THREAD_SERIALIZED;
        } else {
            ompi_mpi_thread_provided = *provided = requested;
        }
        ompi_mpi_main_thread = opal_thread_get_self();
    }

    ompi_mpi_thread_multiple = (ompi_mpi_thread_provided == 
                                MPI_THREAD_MULTIPLE);

    /* Once we've joined the RTE, see if any MCA parameters were
       passed to the MPI level */

    if (OMPI_SUCCESS != (ret = ompi_mpi_register_params())) {
        error = "mca_mpi_register_params() failed";
        goto error;
    }

    /* if it hasn't already been done, setup process affinity. 
     * First check to see if a slot list was
     * specified.  If so, use it.  If no slot list was specified,
     * that's not an error -- just fall through and try the next
     * paffinity scheme.
     */
    ret = opal_paffinity_base_get(&mask);
    if (OPAL_SUCCESS == ret) {
        /* paffinity is supported - check for binding */
        OPAL_PAFFINITY_PROCESS_IS_BOUND(mask, &proc_bound);
        if (proc_bound) {
            /* someone external set it - indicate it is set
             * so that we know
             */
            paffinity_enabled = true;
        } else {
            /* the system is capable of doing processor affinity, but it
             * has not yet been set - see if a slot_list was given
             */
            if (NULL != opal_paffinity_base_slot_list) {
                /* It's an error if multiple paffinity schemes were specified */
                if (opal_paffinity_alone) {
                    ret = OMPI_ERR_BAD_PARAM;
                    error = "Multiple processor affinity schemes specified (can only specify one)";
                    goto error;
                }
                ret = opal_paffinity_base_slot_list_set((long)ORTE_PROC_MY_NAME->vpid, opal_paffinity_base_slot_list);
                if (OPAL_ERR_NOT_FOUND != ret) {
                    error = "opal_paffinity_base_slot_list_set() returned an error";
                    goto error;
                }
                paffinity_enabled = true;
            } else if (opal_paffinity_alone) {
                /* no slot_list, but they asked for paffinity */
                int phys_cpu;
                orte_node_rank_t nrank;
                if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME))) {
                    error = "Could not get node rank - cannot set processor affinity";
                    goto error;
                }
                OPAL_PAFFINITY_CPU_ZERO(mask);
                phys_cpu = opal_paffinity_base_get_physical_processor_id(nrank);
                if (0 > phys_cpu) {
                    error = "Could not get physical processor id - cannot set processor affinity";
                    goto error;
                }
                OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
                ret = opal_paffinity_base_set(mask);
                if (OPAL_SUCCESS != ret) {
                    error = "Setting processor affinity failed";
                    goto error;
                }
                paffinity_enabled = true;
            }
        }
    }
    
    /* If we were able to set processor affinity, try setting up
     memory affinity */
    if (!opal_maffinity_setup && paffinity_enabled) {
        if (OPAL_SUCCESS == opal_maffinity_base_open() &&
            OPAL_SUCCESS == opal_maffinity_base_select()) {
            opal_maffinity_setup = true;
        }
    }
    
    /* initialize datatypes. This step should be done early as it will
     * create the local convertor and local arch used in the proc
     * init.
     */
    if (OMPI_SUCCESS != (ret = ompi_ddt_init())) {
        error = "ompi_ddt_init() failed";
        goto error;
    }

    /* Initialize OMPI procs */
    if (OMPI_SUCCESS != (ret = ompi_proc_init())) {
        error = "mca_proc_init() failed";
        goto error;
    }

    /* initialize ops. This has to be done *after* ddt_init, but
       befor mca_coll_base_open, since come collective modules
       (e.g. the hierarchical) need them in the query function
    */
    if (OMPI_SUCCESS != (ret = ompi_op_init())) {
        error = "ompi_op_init() failed";
        goto error;
    }


    /* Open up MPI-related MCA components */

    if (OMPI_SUCCESS != (ret = mca_allocator_base_open())) {
        error = "mca_allocator_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_rcache_base_open())) {
        error = "mca_rcache_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_mpool_base_open())) {
        error = "mca_mpool_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_pml_base_open())) {
        error = "mca_pml_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_coll_base_open())) {
        error = "mca_coll_base_open() failed";
        goto error;
    }

    if (OMPI_SUCCESS != (ret = ompi_osc_base_open())) {
        error = "ompi_osc_base_open() failed";
        goto error;
    }

#if OPAL_ENABLE_FT == 1
    if (OMPI_SUCCESS != (ret = ompi_crcp_base_open())) {
        error = "ompi_crcp_base_open() failed";
        goto error;
    }
#endif

    /* In order to reduce the common case for MPI apps (where they
       don't use MPI-2 IO or MPI-1 topology functions), the io and
       topo frameworks are initialized lazily, at the first use of
       relevant functions (e.g., MPI_FILE_*, MPI_CART_*, MPI_GRAPH_*),
       so they are not opened here. */

    /* Select which MPI components to use */

    if (OMPI_SUCCESS != 
        (ret = mca_mpool_base_init(OMPI_ENABLE_PROGRESS_THREADS,
                                   OMPI_ENABLE_MPI_THREADS))) {
        error = "mca_mpool_base_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_pml_base_select(OMPI_ENABLE_PROGRESS_THREADS,
                                   OMPI_ENABLE_MPI_THREADS))) {
        error = "mca_pml_base_select() failed";
        goto error;
    }

    /* select buffered send allocator component to be used */
    ret=mca_pml_base_bsend_init(OMPI_ENABLE_MPI_THREADS);
    if( OMPI_SUCCESS != ret ) {
        error = "mca_pml_base_bsend_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_coll_base_find_available(OMPI_ENABLE_PROGRESS_THREADS,
                                            OMPI_ENABLE_MPI_THREADS))) {
        error = "mca_coll_base_find_available() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = ompi_osc_base_find_available(OMPI_ENABLE_PROGRESS_THREADS,
                                           OMPI_ENABLE_MPI_THREADS))) {
        error = "ompi_osc_base_find_available() failed";
        goto error;
    }

#if OPAL_ENABLE_FT == 1
    if (OMPI_SUCCESS != (ret = ompi_crcp_base_select() ) ) {
        error = "ompi_crcp_base_select() failed";
        goto error;
    }
#endif

    /* io and topo components are not selected here -- see comment
       above about the io and topo frameworks being loaded lazily */

    /* Initialize each MPI handle subsystem */
    /* initialize requests */
    if (OMPI_SUCCESS != (ret = ompi_request_init())) {
        error = "ompi_request_init() failed";
        goto error;
    }

    /* initialize info */
    if (OMPI_SUCCESS != (ret = ompi_info_init())) {
        error = "ompi_info_init() failed";
        goto error;
    }

    /* initialize error handlers */
    if (OMPI_SUCCESS != (ret = ompi_errhandler_init())) {
        error = "ompi_errhandler_init() failed";
        goto error;
    }

    /* initialize error codes */
    if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_init())) {
        error = "ompi_mpi_errcode_init() failed";
        goto error;
    }
    
    /* initialize internal error codes */
    if (OMPI_SUCCESS != (ret = ompi_errcode_intern_init())) {
        error = "ompi_errcode_intern_init() failed";
        goto error;
    }
     
    /* initialize groups  */
    if (OMPI_SUCCESS != (ret = ompi_group_init())) {
        error = "ompi_group_init() failed";
        goto error;
    }

    /* initialize communicators */
    if (OMPI_SUCCESS != (ret = ompi_comm_init())) {
        error = "ompi_comm_init() failed";
        goto error;
    }

    /* initialize file handles */
    if (OMPI_SUCCESS != (ret = ompi_file_init())) {
        error = "ompi_file_init() failed";
        goto error;
    }

    /* initialize windows */
    if (OMPI_SUCCESS != (ret = ompi_win_init())) {
        error = "ompi_win_init() failed";
        goto error;
    }

    /* initialize attribute meta-data structure for comm/win/dtype */
    if (OMPI_SUCCESS != (ret = ompi_attr_init())) {
        error = "ompi_attr_init() failed";
        goto error;
    }

    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from completion of orte_init to modex %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }
    
    /* exchange connection info - this function also acts as a barrier
     * as it will not return until the exchange is complete
     */
    if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(NULL))) {
        error = "orte_grpcomm_modex failed";
        goto error;
    }

    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time to execute modex %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }
    
    /* identify the architectures of remote procs and setup
     * their datatype convertors, if required
     */
    if (OMPI_SUCCESS != (ret = ompi_proc_set_arch())) {
        error = "ompi_proc_set_arch failed";
        goto error;
    }

    /* If thread support was enabled, then setup OPAL to allow for
       them. */
    if ((OMPI_ENABLE_PROGRESS_THREADS == 1) ||
        (*provided != MPI_THREAD_SINGLE)) {
        opal_set_using_threads(true);
    }

    /* start PML/BTL's */
    ret = MCA_PML_CALL(enable(true));
    if( OMPI_SUCCESS != ret ) {
        error = "PML control failed";
        goto error;
    }

    /* add all ompi_proc_t's to PML */
    if (NULL == (procs = ompi_proc_world(&nprocs))) {
        error = "ompi_proc_world() failed";
        goto error;
    }
    ret = MCA_PML_CALL(add_procs(procs, nprocs));
    free(procs);
    if( OMPI_SUCCESS != ret ) {
        error = "PML add procs failed";
        goto error;
    }

    MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm));
    MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));


    /*
     * Dump all MCA parameters if requested
     */
    if (ompi_mpi_show_mca_params) {
       ompi_show_all_mca_params(ompi_mpi_comm_world.comm.c_my_rank, 
                                nprocs, 
                                orte_process_info.nodename);
    }

    /* Do we need to wait for a debugger? */
    ompi_wait_for_debugger();
    
    /* check for timing request - get stop time and report elapsed
     time if so, then start the clock again */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from modex thru complete oob wireup %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }
    
    /* wait for everyone to reach this point */
    if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
        error = "orte_grpcomm_barrier failed";
        goto error;
    }
    
    /* check for timing request - get stop time and report elapsed
       time if so, then start the clock again */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time to execute barrier %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }

#if OMPI_ENABLE_PROGRESS_THREADS == 0
    /* Start setting up the event engine for MPI operations.  Don't
       block in the event library, so that communications don't take
       forever between procs in the dynamic code.  This will increase
       CPU utilization for the remainder of MPI_INIT when we are
       blocking on ORTE-level events, but may greatly reduce non-TCP
       latency. */
    opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK);
#endif
    
    /* wire up the mpi interface, if requested.  Do this after the
       non-block switch for non-TCP performance.  Do before the
       polling change as anyone with a complex wire-up is going to be
       using the oob. */
    if (OMPI_SUCCESS != (ret = ompi_init_preconnect_mpi())) {
        error = "ompi_mpi_do_preconnect_all() failed";
        goto error;
    }

    /* Setup the publish/subscribe (PUBSUB) framework */
    if (OMPI_SUCCESS != (ret = ompi_pubsub_base_open())) {
        error = "ompi_pubsub_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_pubsub_base_select())) {
        error = "ompi_pubsub_base_select() failed";
        goto error;
    }
    
    /* Setup the dynamic process management (DPM) framework */
    if (OMPI_SUCCESS != (ret = ompi_dpm_base_open())) {
        error = "ompi_dpm_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_dpm_base_select())) {
        error = "ompi_dpm_base_select() failed";
        goto error;
    }

    /* Init coll for the comms. This has to be after dpm_base_select, 
       (since dpm.mark_dyncomm is not set in the communicator creation
       function else), but before dpm.dyncom_init, since this function
       might require collective for the CID allocation. */
    if (OMPI_SUCCESS !=
        (ret = mca_coll_base_comm_select(MPI_COMM_WORLD))) {
        error = "mca_coll_base_comm_select(MPI_COMM_WORLD) failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_coll_base_comm_select(MPI_COMM_SELF))) {
        error = "mca_coll_base_comm_select(MPI_COMM_SELF) failed";
        goto error;
    }


    
    /* Check whether we have been spawned or not.  We introduce that
       at the very end, since we need collectives, datatypes, ptls
       etc. up and running here.... */
    if (OMPI_SUCCESS != (ret = ompi_dpm.dyn_init())) {
        error = "ompi_comm_dyn_init() failed";
        goto error;
    }

    /*
     * Startup the Checkpoint/Restart Mech.
     * Note: Always do this so tools don't hang when
     * in a non-checkpointable build
     */
    if (OMPI_SUCCESS != (ret = ompi_cr_init())) {
        error = "ompi_cr_init";
        goto error;
    }

    /* Undo OPAL calling opal_progress_event_users_increment() during 
       opal_init, to get better latency when not using TCP.  Do 
       this *after* dyn_init, as dyn init uses lots of ORTE 
       communication and we don't want to hinder the performance of 
       that code. */ 
    opal_progress_event_users_decrement(); 

    /* see if yield_when_idle was specified - if so, use it */
    param = mca_base_param_find("mpi", NULL, "yield_when_idle");
    mca_base_param_lookup_int(param, &value);
    if (value < 0) {
        /* if no info is provided, just default to conservative */
        opal_progress_set_yield_when_idle(true);
    } else {
        /* info was provided, so set idle accordingly */
        opal_progress_set_yield_when_idle(value == 0 ? false : true);
    }
    
    param = mca_base_param_find("mpi", NULL, "event_tick_rate");
    mca_base_param_lookup_int(param, &value);
    /* negative value means use default - just don't do anything */
    if (value >= 0) {
        opal_progress_set_event_poll_rate(value);
    }

    /* At this point, we are fully configured and in MPI mode.  Any
       communication calls here will work exactly like they would in
       the user's code.  Setup the connections between procs and warm
       them up with simple sends, if requested */

 error:
    if (ret != OMPI_SUCCESS) {
        const char *err_msg = opal_strerror(ret);
        /* If ORTE was not setup yet, don't use orte_show_help */
        if (orte_setup) {
            orte_show_help("help-mpi-runtime",
                           "mpi_init:startup:internal-failure", true,
                           "MPI_INIT", "MPI_INIT", error, err_msg, ret);
        } else {
            opal_show_help("help-mpi-runtime",
                           "mpi_init:startup:internal-failure", true,
                           "MPI_INIT", "MPI_INIT", error, err_msg, ret);
        }
        return ret;
    }

    /* Initialize the registered datarep list to be empty */
    OBJ_CONSTRUCT(&ompi_registered_datareps, opal_list_t);

    /* Initialize the arrays used to store the F90 types returned by the
     *  MPI_Type_create_f90_XXX functions.
     */
    OBJ_CONSTRUCT( &ompi_mpi_f90_integer_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_integer_hashtable, 16 /* why not? */);

    OBJ_CONSTRUCT( &ompi_mpi_f90_real_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_real_hashtable, FLT_MAX_10_EXP);

    OBJ_CONSTRUCT( &ompi_mpi_f90_complex_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP);
    
    /* All done.  Wasn't that simple? */

    ompi_mpi_initialized = true;

    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from barrier p to complete mpi_init %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
    }

    return MPI_SUCCESS;
}
Beispiel #21
0
int orcm_init_util(void)
{
    int ret, i;
    char *error;
    char *destdir, *tmp, *mcp, *new_mcp;

    /* Setup OPAL */
    if( ORTE_SUCCESS != (ret = opal_init(NULL, NULL)) ) {
        error = "opal_init_util";
        goto error;
    }
    /* register handler for errnum -> string conversion */
    opal_error_register("OPENRCM", ORCM_ERR_BASE, ORCM_ERR_MAX, orcm_err2str);
    /* register where the OPENRCM show_help files are located */
    if (NULL != (destdir = getenv("ORCM_DESTDIR"))) {
        asprintf(&tmp, "%s%s", destdir, ORCM_PKGHELPDIR);
    } else {
        tmp = strdup(ORCM_PKGHELPDIR);
    }
    if (ORTE_SUCCESS != (ret = opal_show_help_add_dir(tmp))) {
        error = "register show_help_dir";
        goto error;
    }
    free(tmp);
    
    /* Add ORCM's component directory into the
       mca_base_param_component_path */
    i = mca_base_param_find("mca", NULL, "component_path");
    if (i < 0) {
        ret = ORCM_ERR_NOT_FOUND;
        error = "Could not find mca_component_path";
        goto error;
    }
    mca_base_param_lookup_string(i, &mcp);
    if (NULL == mcp) {
        ret = ORCM_ERR_NOT_FOUND;
        error = "Could not find mca_component_path";
        goto error;
    }
    if (NULL != destdir) {
        asprintf(&new_mcp, "%s%s:%s", destdir, ORCM_PKGLIBDIR, mcp);
    } else {
        asprintf(&new_mcp, "%s:%s", ORCM_PKGLIBDIR, mcp);
    }
    mca_base_param_set_string(i, new_mcp);
    free(new_mcp);
    free(mcp);

    orcm_util_initialized = true;
    
    return ORCM_SUCCESS;
    
error:
    if (ORCM_ERR_SILENT != ret) {
        orte_show_help("help-openrcm-runtime.txt",
                       "orcm_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }
    
    return ret;
}
Beispiel #22
0
int mca_bml_r2_ft_event(int state)
{
    static bool first_continue_pass = false;
    ompi_proc_t** procs = NULL;
    size_t num_procs;
    size_t btl_idx;
    int ret, p;
    int loc_state;
    int param_type = -1;
    char *param_list = NULL;

    if(OPAL_CRS_CHECKPOINT == state) {
        /* Do nothing for now */
    }
    else if(OPAL_CRS_CONTINUE == state) {
        first_continue_pass = !first_continue_pass;

        /* Since nothing in Checkpoint, we are fine here (unless required by BTL) */
        if( ompi_cr_continue_like_restart && !first_continue_pass) {
            procs = ompi_proc_all(&num_procs);
            if(NULL == procs) {
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
        }
    }
    else if(OPAL_CRS_RESTART_PRE == state ) {
        /* Nothing here */
    }
    else if(OPAL_CRS_RESTART == state ) {
        procs = ompi_proc_all(&num_procs);
        if(NULL == procs) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    }
    else if(OPAL_CRS_TERM == state ) {
        ;
    }
    else {
        ;
    }

    /* Never call the ft_event functions attached to the BTLs on the second
     * pass of RESTART since on the first pass they were unloaded and therefore
     * no longer exist.
     */
    if( OPAL_CRS_RESTART != state ) {
        if( OPAL_CRS_CONTINUE == state && !first_continue_pass ) {
            ;
        } else {
            /* Since we only ever call into the BTLs once during the first restart
             * pass, just lie to them on this pass for a bit of local clarity.
             */
            if( OPAL_CRS_RESTART_PRE == state ) {
                loc_state = OPAL_CRS_RESTART;
            } else {
                loc_state = state;
            }

            /*
             * Call ft_event in:
             * - BTL modules
             * - MPool modules
             *
             * These should be cleaning out stale state, and memory references in 
             * preparation for being shut down.
             */
            for(btl_idx = 0; btl_idx < mca_bml_r2.num_btl_modules; btl_idx++) {
                /*
                 * Notify Mpool
                 */
                if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool &&
                    NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event ) {
                    opal_output_verbose(10, ompi_cr_output,
                                        "bml:r2: ft_event: Notify the %s MPool.\n",
                                        (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_component->mpool_version.mca_component_name);
                    if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event(loc_state) ) ) {
                        continue;
                    }
                }

                /*
                 * Notify BTL
                 */
                if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event) {
                    opal_output_verbose(10, ompi_cr_output,
                                        "bml:r2: ft_event: Notify the %s BTL.\n",
                                        (mca_bml_r2.btl_modules[btl_idx])->btl_component->btl_version.mca_component_name);
                    if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event(loc_state) ) ) {
                        continue;
                    }
                }
            }
        } /* OPAL_CRS_CONTINUE == state && !first_continue_pass */
    }
    
    if(OPAL_CRS_CHECKPOINT == state) {
        ;
    }
    else if(OPAL_CRS_CONTINUE == state) {
        /* Matches OPAL_CRS_RESTART_PRE */
        if( ompi_cr_continue_like_restart && first_continue_pass) {
            if( OMPI_SUCCESS != (ret = mca_bml_r2_finalize()) ) {
                opal_output(0, "bml:r2: ft_event(Restart): Failed to finalize BML framework\n");
                return ret;
            }
        }
        /* Matches OPAL_CRS_RESTART */
        else if( ompi_cr_continue_like_restart && !first_continue_pass ) {
            /*
             * Barrier to make all processes have been successfully restarted before
             * we try to remove some restart only files.
             */
            if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
                opal_output(0, "bml:r2: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret);
                return ret;
            }

            opal_output_verbose(10, ompi_cr_output,
                                "bml:r2: ft_event(Restart): Cleanup restart files\n");
            opal_crs_base_cleanup_flush();

            /*
             * Re-open the BTL framework to get the full list of components.
             */
            if( OMPI_SUCCESS != (ret = mca_btl_base_open()) ) {
                opal_output(0, "bml:r2: ft_event(Restart): Failed to open BTL framework\n");
                return ret;
            }

            /*
             * Re-select the BTL components/modules
             * This will cause the BTL components to discover the available
             * network options on this machine, and post proper modex informaiton.
             */
            if( OMPI_SUCCESS != (ret = mca_btl_base_select(OMPI_ENABLE_PROGRESS_THREADS,
                                                           OMPI_ENABLE_MPI_THREADS) ) ) {
                opal_output(0, "bml:r2: ft_event(Restart): Failed to select in BTL framework\n");
                return ret;
            }

            /*
             * Clear some structures so we can properly repopulate them
             */
            mca_bml_r2.btls_added = false;

            for(p = 0; p < (int)num_procs; ++p) {
                if( NULL != procs[p]->proc_bml) {
                    OBJ_RELEASE(procs[p]->proc_bml);
                    procs[p]->proc_bml = NULL;
                }

                OBJ_RELEASE(procs[p]);
            }

            if( NULL != procs ) {
                free(procs);
                procs = NULL;
            }
        }
    }
    else if(OPAL_CRS_RESTART_PRE == state ) {
        opal_output_verbose(10, ompi_cr_output,
                            "bml:r2: ft_event(Restart): Finalize BML\n");

        /*
         * Finalize the BML
         * - Flush progress functions
         * - Flush module references
         * - mca_btl_base_close()
         *   Need to do this because we may have BTL components that were
         *   unloaded in the first selection that may be available now.
         *   Conversely we may have BTL components loaded now that
         *   are not available now.
         */
        if( OMPI_SUCCESS != (ret = mca_bml_r2_finalize()) ) {
            opal_output(0, "bml:r2: ft_event(Restart): Failed to finalize BML framework\n");
            return ret;
        }
    }
    else if(OPAL_CRS_RESTART == state  ) {

        /*
         * Barrier to make all processes have been successfully restarted before
         * we try to remove some restart only files.
         */
        if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
            opal_output(0, "bml:r2: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret);
            return ret;
        }

        opal_output_verbose(10, ompi_cr_output,
                            "bml:r2: ft_event(Restart): Cleanup restart files\n");
        opal_crs_base_cleanup_flush();

        /*
         * Re-open the BTL framework to get the full list of components.
         * - but first clear the MCA value that was there
         */
        param_type = mca_base_param_find("btl", NULL, NULL);
        mca_base_param_lookup_string(param_type, &param_list);
        opal_output_verbose(11, ompi_cr_output,
                            "Restart (Previous BTL MCA): <%s>\n", param_list);
        if( NULL != param_list ) {
            free(param_list);
            param_list = NULL;
        }

        /* Deregister the old value, and refresh the file cache to grab any updates */
        mca_base_param_deregister(param_type);
        mca_base_param_recache_files(false);

        if( OMPI_SUCCESS != (ret = mca_btl_base_open()) ) {
            opal_output(0, "bml:r2: ft_event(Restart): Failed to open BTL framework\n");
            return ret;
        }

        param_type = mca_base_param_find("btl", NULL, NULL);
        mca_base_param_lookup_string(param_type, &param_list);
        opal_output_verbose(11, ompi_cr_output,
                            "Restart (New BTL MCA): <%s>\n", param_list);
        if( NULL != param_list ) {
            free(param_list);
            param_list = NULL;
        }

        /*
         * Re-select the BTL components/modules
         * This will cause the BTL components to discover the available
         * network options on this machine, and post proper modex informaiton.
         */
        if( OMPI_SUCCESS != (ret = mca_btl_base_select(OMPI_ENABLE_PROGRESS_THREADS,
                                                       OMPI_ENABLE_MPI_THREADS) ) ) {
            opal_output(0, "bml:r2: ft_event(Restart): Failed to select in BTL framework\n");
            return ret;
        }

        /*
         * Clear some structures so we can properly repopulate them
         */
        mca_bml_r2.btls_added = false;

        for(p = 0; p < (int)num_procs; ++p) {
            if( NULL != procs[p]->proc_bml) {
                OBJ_RELEASE(procs[p]->proc_bml);
                procs[p]->proc_bml = NULL;
            }

            OBJ_RELEASE(procs[p]);
        }

        if( NULL != procs ) {
            free(procs);
            procs = NULL;
        }
    }
    else if(OPAL_CRS_TERM == state ) {
        ;
    }
    else {
        ;
    }
    
    return OMPI_SUCCESS;
}
Beispiel #23
0
static int orte_rds_hostfile_query(orte_jobid_t job)
{
    opal_list_t existing;
    opal_list_t updates, rds_updates;
    opal_list_item_t *item;
    orte_rds_cell_desc_t *rds_item;
    orte_rds_cell_attr_t *new_attr;
    orte_ras_node_t *ras_item;
    int rc;

    if (orte_rds_hostfile_queried) {
        /* if we have already been queried, then
         * our info is on the registry, so just
         * return. Note that this restriction
         * may eventually be lifted - ideally, 
         * we might check to see if this is a
         * new file name and go ahead with the
         * query if so.
         */
        return ORTE_SUCCESS;
    }
    orte_rds_hostfile_queried = true;
    
    OBJ_CONSTRUCT(&existing, opal_list_t);
    OBJ_CONSTRUCT(&updates, opal_list_t);
    OBJ_CONSTRUCT(&rds_updates, opal_list_t);
    rc = orte_ras_base_node_query(&existing);
    if(ORTE_SUCCESS != rc) {
        goto cleanup;
    }

    rc = mca_base_param_find("rds", "hostfile", "path");
    mca_base_param_lookup_string(rc, &mca_rds_hostfile_component.path);

    rc = orte_rds_hostfile_parse(mca_rds_hostfile_component.path, &existing, &updates);
    if (ORTE_ERR_NOT_FOUND == rc) {
        if(mca_rds_hostfile_component.default_hostfile) {
            rc = ORTE_SUCCESS;
        } else {
            opal_show_help("help-rds-hostfile.txt", "rds:no-hostfile",
                           true,
                           mca_rds_hostfile_component.path);
        }
        goto cleanup;
    } else if (ORTE_SUCCESS != rc) {
        goto cleanup;
    }

    if ( !opal_list_is_empty(&updates) ) {

        /* Convert RAS update list to RDS update list */
        for ( ras_item  = (orte_ras_node_t*)opal_list_get_first(&updates);
              ras_item != (orte_ras_node_t*)opal_list_get_end(&updates);
              ras_item  = (orte_ras_node_t*)opal_list_get_next(ras_item)) {

            rds_item = OBJ_NEW(orte_rds_cell_desc_t);
            if (NULL == rds_item) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }

            rds_item->site  = strdup("Hostfile");
            rds_item->name  = strdup(ras_item->node_name);
            if (need_cellid) {
#if 0 /* JJH Repair when cellid's are fixed */
                /* Create a new cellid for this hostfile */
                rc = orte_ns.create_cellid(&local_cellid, rds_item->site, rds_item->name);
                if (ORTE_SUCCESS != rc) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
#endif
                local_cellid = 0;
                need_cellid = false;
            }

            rds_item->cellid      = local_cellid;
            ras_item->node_cellid = local_cellid;

            new_attr = OBJ_NEW(orte_rds_cell_attr_t);
            if (NULL == new_attr) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            new_attr->keyval.key          = strdup(ORTE_RDS_NAME);
            new_attr->keyval.value = OBJ_NEW(orte_data_value_t);
            if (NULL == new_attr->keyval.value) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            new_attr->keyval.value->type   = ORTE_STRING;
            new_attr->keyval.value->data   = strdup(ras_item->node_name);
            opal_list_append(&(rds_item->attributes), &new_attr->super);

            new_attr = OBJ_NEW(orte_rds_cell_attr_t);
            if (NULL == new_attr) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            new_attr->keyval.key          = strdup(ORTE_CELLID_KEY);
            new_attr->keyval.value = OBJ_NEW(orte_data_value_t);
            if (NULL == new_attr->keyval.value) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            new_attr->keyval.value->type   = ORTE_CELLID;
            if (ORTE_SUCCESS != (rc = orte_dss.copy(&(new_attr->keyval.value->data), &(rds_item->cellid), ORTE_CELLID))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            opal_list_append(&(rds_item->attributes), &new_attr->super);

            opal_list_append(&rds_updates, &rds_item->super);
        }

        /* Insert the new node into the RDS */
        rc = orte_rds.store_resource(&rds_updates);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }

        /* Then the RAS, since we can assume that any
         * resources listed in the hostfile have been
         * already allocated for our use.
         */
        rc = orte_ras_base_node_insert(&updates);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }
        
        /* and now, indicate that ORTE should override any oversubscribed conditions
         * based on local hardware limits since the user (a) might not have
         * provided us any info on the #slots for a node, and (b) the user
         * might have been wrong! If we don't check the number of local physical
         * processors, then we could be too aggressive on our sched_yield setting
         * and cause performance problems.
         */
        rc = orte_ras_base_set_oversubscribe_override(job);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }
    }

cleanup:
    if (NULL != mca_rds_hostfile_component.path) {
        free(mca_rds_hostfile_component.path);
        mca_rds_hostfile_component.path = NULL;
    }

    while(NULL != (item = opal_list_remove_first(&existing))) {
        OBJ_RELEASE(item);
    }

    while(NULL != (item = opal_list_remove_first(&updates))) {
        OBJ_RELEASE(item);
    }

    while (NULL != (rds_item = (orte_rds_cell_desc_t*)opal_list_remove_first(&rds_updates))) {
        while (NULL != (new_attr = (orte_rds_cell_attr_t*)opal_list_remove_first(&(rds_item->attributes)))) {
            OBJ_RELEASE(new_attr);
        }
        OBJ_RELEASE(rds_item);
    }

    OBJ_DESTRUCT(&existing);
    OBJ_DESTRUCT(&updates);
    OBJ_DESTRUCT(&rds_updates);

    return rc;
}
static int tuned_open(void)
{
    int rc;

#if OMPI_ENABLE_DEBUG
    {
        int param;

        param = mca_base_param_find("coll", NULL, "base_verbose");
        if (param >= 0) {
            int verbose;
            mca_base_param_lookup_int(param, &verbose);
            if (verbose > 0) {
                ompi_coll_tuned_stream = opal_output_open(NULL);
            }
        }
    }
#endif  /* OMPI_ENABLE_DEBUG */

    /* Use a low priority, but allow other components to be lower */    
    mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
                           "priority",
                           "Priority of the tuned coll component",
                           false, false, ompi_coll_tuned_priority,
                           &ompi_coll_tuned_priority);

    /* parameter for pre-allocated memory requests etc */
    mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
                           "pre_allocate_memory_comm_size_limit",
                           "Size of communicator were we stop pre-allocating memory for the fixed internal buffer used for message requests etc that is hung off the communicator data segment. I.e. if you have a 100'000 nodes you might not want to pre-allocate 200'000 request handle slots per communicator instance!",
                           false, false, ompi_coll_tuned_preallocate_memory_comm_size_limit,
                           &ompi_coll_tuned_preallocate_memory_comm_size_limit);
    
    /* some initial guesses at topology parameters */
    mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
                           "init_tree_fanout",
                           "Inital fanout used in the tree topologies for each communicator. This is only an initial guess, if a tuned collective needs a different fanout for an operation, it build it dynamically. This parameter is only for the first guess and might save a little time",
                           false, false, ompi_coll_tuned_init_tree_fanout,
                           &ompi_coll_tuned_init_tree_fanout);

    mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
                           "init_chain_fanout",
                           "Inital fanout used in the chain (fanout followed by pipeline) topologies for each communicator. This is only an initial guess, if a tuned collective needs a different fanout for an operation, it build it dynamically. This parameter is only for the first guess and might save a little time",
                           false, false, ompi_coll_tuned_init_chain_fanout,
                           &ompi_coll_tuned_init_chain_fanout);

    /* now check that the user hasn't overrode any of the decision functions if dynamic rules are enabled */
    /* the user can redo this before every comm dup/create if they like */
    /* this is useful for benchmarking and user knows best tuning */
    /* as this is the component we only lookup the indicies of the mca params */
    /* the actual values are looked up during comm create via module init */
   
    /* intra functions first */
    /* if dynamic rules allowed then look up dynamic rules config filename, else we leave it an empty filename (NULL) */
    /* by default DISABLE dynamic rules and instead use fixed [if based] rules */
    mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
                           "use_dynamic_rules",
                           "Switch used to decide if we use static (compiled/if statements) or dynamic (built at runtime) decision function rules",
                           false, false, ompi_coll_tuned_use_dynamic_rules,
                           &ompi_coll_tuned_use_dynamic_rules);

    if (ompi_coll_tuned_use_dynamic_rules) {
        mca_base_param_reg_string(&mca_coll_tuned_component.super.collm_version,
                                  "dynamic_rules_filename",
                                  "Filename of configuration file that contains the dynamic (@runtime) decision function rules",
                                  false, false, ompi_coll_tuned_dynamic_rules_filename,
                                  &ompi_coll_tuned_dynamic_rules_filename);
        if( ompi_coll_tuned_dynamic_rules_filename ) {
            OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:component_open Reading collective rules file [%s]", 
                         ompi_coll_tuned_dynamic_rules_filename));
            rc = ompi_coll_tuned_read_rules_config_file( ompi_coll_tuned_dynamic_rules_filename,
                                                         &(mca_coll_tuned_component.all_base_rules), COLLCOUNT);
            if( rc >= 0 ) {
                OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_open Read %d valid rules\n", rc));
            } else {
                OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_open Reading collective rules file failed\n"));
                mca_coll_tuned_component.all_base_rules = NULL;
            }
        }
        ompi_coll_tuned_allreduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLREDUCE]);
        ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALL]);
        ompi_coll_tuned_allgather_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLGATHER]);
        ompi_coll_tuned_allgatherv_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLGATHERV]);
        ompi_coll_tuned_alltoallv_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALLV]);
        ompi_coll_tuned_barrier_intra_check_forced_init(&ompi_coll_tuned_forced_params[BARRIER]);
        ompi_coll_tuned_bcast_intra_check_forced_init(&ompi_coll_tuned_forced_params[BCAST]);
        ompi_coll_tuned_reduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[REDUCE]);
        ompi_coll_tuned_reduce_scatter_intra_check_forced_init(&ompi_coll_tuned_forced_params[REDUCESCATTER]);
        ompi_coll_tuned_gather_intra_check_forced_init(&ompi_coll_tuned_forced_params[GATHER]);
        ompi_coll_tuned_scatter_intra_check_forced_init(&ompi_coll_tuned_forced_params[SCATTER]);
    }

    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:component_open: done!"));

    return OMPI_SUCCESS;
}
/**
 * Function for finding and opening either all MCA components, or the
 * one that was specifically requested via a MCA parameter.
 */
int mca_base_components_open(const char *type_name, int output_id,
                             const mca_base_component_t **static_components,
                             opal_list_t *components_available,
                             bool open_dso_components)
{
    int ret, param;
    opal_list_item_t *item;
    opal_list_t components_found;
    char **requested_component_names;
    int param_verbose = -1;
    int param_type = -1;
    int verbose_level;
    char *str;
    bool include_mode;
#if (OPAL_ENABLE_FT == 1) && (OPAL_ENABLE_FT_CR == 1)
    opal_list_item_t *next;
    uint32_t open_only_flags = MCA_BASE_METADATA_PARAM_NONE;
    const mca_base_component_t *component;
#endif

    /* Register MCA parameters */
    /* Check to see if it exists first */
    if( 0 > (param_type = mca_base_param_find(type_name, NULL, NULL) ) ) {
        asprintf(&str, "Default selection set of components for the %s framework (<none>"
                 " means use all components that can be found)", type_name);
        param_type =
            mca_base_param_reg_string_name(type_name, NULL, str,
                                           false, false, NULL, NULL);
        free(str);
    }

    param = mca_base_param_find("mca", NULL, "component_show_load_errors");
    mca_base_param_lookup_int(param, &ret);
    show_errors = OPAL_INT_TO_BOOL(ret);

    /* Setup verbosity for this MCA type */
    asprintf(&str, "Verbosity level for the %s framework (0 = no verbosity)", type_name);
    param_verbose =
        mca_base_param_reg_int_name(type_name, "base_verbose",
                                    str, false, false, 0, NULL);
    free(str);
    mca_base_param_lookup_int(param_verbose, &verbose_level);
    if (output_id != 0) {
        opal_output_set_verbosity(output_id, verbose_level);
    }
    opal_output_verbose(10, output_id,
                        "mca: base: components_open: Looking for %s components",
                        type_name);

    ret = parse_requested(param_type, &include_mode, &requested_component_names);
    if( OPAL_SUCCESS != ret ) {
        return ret;
    }

    /* Find and load requested components */
    if (OPAL_SUCCESS != (ret =
                             mca_base_component_find(NULL, type_name, static_components,
                                     requested_component_names, include_mode,
                                     &components_found, open_dso_components)) ) {
        return ret;
    }

#if (OPAL_ENABLE_FT == 1) && (OPAL_ENABLE_FT_CR == 1)
    {
        int param_id = -1;
        int param_val = 0;
        /*
         * Extract supported mca parameters for selection contraints
         * Supported Options:
         *   - mca_base_component_distill_checkpoint_ready = Checkpoint Ready
         */
        param_id = mca_base_param_reg_int_name("mca", "base_component_distill_checkpoint_ready",
                                               "Distill only those components that are Checkpoint Ready",
                                               false, false,
                                               0, &param_val);
        if( 0 != param_val ) { /* Select Checkpoint Ready */
            open_only_flags |= MCA_BASE_METADATA_PARAM_CHECKPOINT;
        }
    }
#endif  /* (OPAL_ENABLE_FT == 1) && (OPAL_ENABLE_FT_CR == 1) */

    /*
     * Pre-process the list with parameter constraints
     * e.g., If requested to select only CR enabled components
     *       then only make available those components.
     *
     * JJH Note: Currently checkpoint/restart is the only user of this
     *           functionality. If other component constraint options are
     *           added, then this logic can be used for all contraint
     *           options.
     */
#if (OPAL_ENABLE_FT == 1) && (OPAL_ENABLE_FT_CR == 1)
    if( !(MCA_BASE_METADATA_PARAM_NONE & open_only_flags) ) {
        if( MCA_BASE_METADATA_PARAM_CHECKPOINT & open_only_flags) {
            opal_output_verbose(10, output_id,
                                "mca: base: components_open: "
                                "including only %s components that are checkpoint enabled", type_name);
        }

        /*
         * Check all the components to make sure they adhere to the user
         * expressed requirements.
         */
        for(item  = opal_list_get_first(&components_found);
                item != opal_list_get_end(&components_found);
                item  = next ) {
            mca_base_open_only_dummy_component_t *dummy;
            mca_base_component_list_item_t *cli = (mca_base_component_list_item_t *) item;
            dummy = (mca_base_open_only_dummy_component_t*) cli->cli_component;
            component = cli->cli_component;

            next = opal_list_get_next(item);

            /*
             * If the user asked for a checkpoint enabled run
             * then only load checkpoint enabled components.
             */
            if( MCA_BASE_METADATA_PARAM_CHECKPOINT & open_only_flags) {
                if( MCA_BASE_METADATA_PARAM_CHECKPOINT & dummy->data.param_field) {
                    opal_output_verbose(10, output_id,
                                        "mca: base: components_open: "
                                        "(%s) Component %s is Checkpointable",
                                        type_name,
                                        dummy->version.mca_component_name);
                }
                else {
                    opal_output_verbose(10, output_id,
                                        "mca: base: components_open: "
                                        "(%s) Component %s is *NOT* Checkpointable - Disabled",
                                        type_name,
                                        dummy->version.mca_component_name);
                    opal_list_remove_item(&components_found, item);
                    /* Make sure to release the component since we are not
                     * opening it */
                    mca_base_component_repository_release(component);
                }
            }
        }
    }
#endif  /* (OPAL_ENABLE_FT == 1) && (OPAL_ENABLE_FT_CR == 1) */

    /* Open all remaining components */
    ret = open_components(type_name, output_id,
                          &components_found, components_available);

    /* Free resources */
    for (item = opal_list_remove_first(&components_found); NULL != item;
            item = opal_list_remove_first(&components_found)) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&components_found);

    if (NULL != requested_component_names) {
        opal_argv_free(requested_component_names);
    }

    /* All done */
    return ret;
}
Beispiel #26
0
/* turn on MPI optimizations */
int
opal_progress_mpi_enable(void)
{
    int param, value;

    /* call sched yield when oversubscribed. */
    param = mca_base_param_find("mpi", NULL, "yield_when_idle");
    mca_base_param_lookup_int(param, &value);

    if (value < 0) {
        /* this should never happen set to 1 if it somehow does */
        call_yield = 1;
    } else {
        call_yield = value;
    }

    /* set the event tick rate */
    param = mca_base_param_find("mpi", NULL, "event_tick_rate");
    mca_base_param_lookup_int(param, &value);

    if (value < 0) {
        /* user didn't specify - default tick rate */
        event_progress_delta = opal_progress_default_tick_rate;
    } else if (value == 0) {
#if OPAL_PROGRESS_USE_TIMERS
        /* user specified as never tick - tick once per minute */
        event_progress_delta = 60 * 1000000;
#else
        /* user specified as never tick - don't count often */
        event_progress_delta = INT_MAX;
#endif
    } else {
#if OPAL_PROGRESS_USE_TIMERS
        event_progress_delta = value;
#else
        /* subtract one so that we can do post-fix subtraction
           in the inner loop and go faster */
        event_progress_delta = value - 1;
#endif
    }
#if OPAL_PROGRESS_USE_TIMERS && !OPAL_TIMER_USEC_NATIVE
    /*  going to use cycles for counter.  Adjust specified usec into cycles */
    event_progress_delta = event_progress_delta * opal_timer_base_get_freq() / 1000000;
#endif

#if OPAL_PROGRESS_USE_TIMERS
#if OPAL_TIMER_USEC_NATIVE
    event_progress_last_time = opal_timer_base_get_usec();
#else
    event_progress_last_time = opal_timer_base_get_cycles();
#endif
#else
    /* it's possible that an init function bumped up our tick rate.
     * If so, set the event_progress counter to 0.  Otherwise, set it to
     * the reset value */
    event_progress_counter = (event_num_mpi_users > 0) ? 
        0 : event_progress_delta;
#endif

    return OPAL_SUCCESS;
}