static int check_compare(hwloc_const_bitmap_t set1, hwloc_const_bitmap_t set2)
{
  int res = hwloc_bitmap_compare_inclusion(set1, set2);

  if (hwloc_bitmap_iszero(set1)) {
    if (hwloc_bitmap_iszero(set2)) {
      assert(res == HWLOC_BITMAP_EQUAL);
    } else {
      assert(res == HWLOC_BITMAP_INCLUDED);
    }
  } else if (hwloc_bitmap_iszero(set2)) {
    assert(res == HWLOC_BITMAP_CONTAINS);

  } else if (hwloc_bitmap_isequal(set1, set2)) {
    assert(res == HWLOC_BITMAP_EQUAL);
  } else if (hwloc_bitmap_isincluded(set1, set2)) {
    assert(res == HWLOC_BITMAP_INCLUDED);
  } else if (hwloc_bitmap_isincluded(set2, set1)) {
    assert(res == HWLOC_BITMAP_CONTAINS);
  } else if (hwloc_bitmap_intersects(set1, set2)) {
    assert(res == HWLOC_BITMAP_INTERSECTS);
  } else {
    assert(res == HWLOC_BITMAP_DIFFERENT);
  }

  return res;
}
Ejemplo n.º 2
0
static int
hwloc_solaris_get_sth_cpubind(hwloc_topology_t topology, idtype_t idtype, id_t id, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
{
  processorid_t binding;
  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
  int n;
  int i;

  if (depth < 0) {
    errno = ENOSYS;
    return -1;
  }

  /* first check if processor_bind() was used to bind to a single processor rather than to an lgroup */
  if ( processor_bind(idtype, id, PBIND_QUERY, &binding) == 0 && binding != PBIND_NONE ) {
    hwloc_bitmap_only(hwloc_set, binding);
    return 0;
  }

  /* if not, check lgroups */
  hwloc_bitmap_zero(hwloc_set);
  n = hwloc_get_nbobjs_by_depth(topology, depth);
  for (i = 0; i < n; i++) {
    hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, i);
    lgrp_affinity_t aff = lgrp_affinity_get(idtype, id, obj->os_index);

    if (aff == LGRP_AFF_STRONG)
      hwloc_bitmap_or(hwloc_set, hwloc_set, obj->cpuset);
  }

  if (hwloc_bitmap_iszero(hwloc_set))
    hwloc_bitmap_copy(hwloc_set, hwloc_topology_get_complete_cpuset(topology));

  return 0;
}
Ejemplo n.º 3
0
static int
hwloc_solaris_get_sth_membind(hwloc_topology_t topology, idtype_t idtype, id_t id, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
{
  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
  int n;
  int i;

  if (depth < 0) {
    errno = ENOSYS;
    return -1;
  }

  hwloc_bitmap_zero(nodeset);
  n = hwloc_get_nbobjs_by_depth(topology, depth);

  for (i = 0; i < n; i++) {
    hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, i);
    lgrp_affinity_t aff = lgrp_affinity_get(idtype, id, obj->os_index);

    if (aff == LGRP_AFF_STRONG)
      hwloc_bitmap_set(nodeset, obj->os_index);
  }

  if (hwloc_bitmap_iszero(nodeset))
    hwloc_bitmap_copy(nodeset, hwloc_topology_get_complete_nodeset(topology));

  *policy = HWLOC_MEMBIND_BIND;
  return 0;
}
Ejemplo n.º 4
0
int main(void)
{
  hwloc_bitmap_t orig, expected;

  orig = hwloc_bitmap_alloc();
  expected = hwloc_bitmap_alloc();

  /* empty set gives empty set */
  hwloc_bitmap_singlify(orig);
  assert(hwloc_bitmap_iszero(orig));

  /* full set gives first bit only */
  hwloc_bitmap_fill(orig);
  hwloc_bitmap_singlify(orig);
  hwloc_bitmap_zero(expected);
  hwloc_bitmap_set(expected, 0);
  assert(hwloc_bitmap_isequal(orig, expected));
  assert(!hwloc_bitmap_compare(orig, expected));

  /* actual non-trivial set */
  hwloc_bitmap_zero(orig);
  hwloc_bitmap_set(orig, 45);
  hwloc_bitmap_set(orig, 46);
  hwloc_bitmap_set(orig, 517);
  hwloc_bitmap_singlify(orig);
  hwloc_bitmap_zero(expected);
  hwloc_bitmap_set(expected, 45);
  assert(hwloc_bitmap_isequal(orig, expected));
  assert(!hwloc_bitmap_compare(orig, expected));

  hwloc_bitmap_free(orig);
  hwloc_bitmap_free(expected);

  return 0;
}
Ejemplo n.º 5
0
static hwloc_const_bitmap_t
hwloc_fix_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t set)
{
  hwloc_const_bitmap_t topology_set = hwloc_topology_get_topology_cpuset(topology);
  hwloc_const_bitmap_t complete_set = hwloc_topology_get_complete_cpuset(topology);

  if (!topology_set) {
    /* The topology is composed of several systems, the cpuset is ambiguous. */
    errno = EXDEV;
    return NULL;
  }

  if (hwloc_bitmap_iszero(set)) {
    errno = EINVAL;
    return NULL;
  }

  if (!hwloc_bitmap_isincluded(set, complete_set)) {
    errno = EINVAL;
    return NULL;
  }

  if (hwloc_bitmap_isincluded(topology_set, set))
    set = complete_set;

  return set;
}
Ejemplo n.º 6
0
static hwloc_obj_t insert_task(hwloc_topology_t topology, hwloc_cpuset_t cpuset, const char * name)
{
  hwloc_obj_t group, obj;

  hwloc_bitmap_and(cpuset, cpuset, hwloc_topology_get_topology_cpuset(topology));
  if (hwloc_bitmap_iszero(cpuset))
    return NULL;

  /* try to insert a group at exact position */
  group = hwloc_topology_alloc_group_object(topology);
  if (!group)
    return NULL;
  group->cpuset = hwloc_bitmap_dup(cpuset);
  group = hwloc_topology_insert_group_object(topology, group);
  if (!group) {
    /* try to insert in a larger parent */
    char *s;
    hwloc_bitmap_asprintf(&s, cpuset);
    group = hwloc_get_obj_covering_cpuset(topology, cpuset);
    fprintf(stderr, "Inserting process `%s' below parent larger than cpuset %s\n", name, s);
    free(s);
  }
  obj = hwloc_topology_insert_misc_object(topology, group, name);
  if (!obj)
    fprintf(stderr, "Failed to insert process `%s'\n", name);
  else
    obj->subtype = strdup("Process");

  return obj;
}
Ejemplo n.º 7
0
static hwloc_obj_t insert_task(hwloc_topology_t topology, hwloc_cpuset_t cpuset, const char * name)
{
  hwloc_obj_t obj;

  hwloc_bitmap_and(cpuset, cpuset, hwloc_topology_get_topology_cpuset(topology));
  if (hwloc_bitmap_iszero(cpuset))
    return NULL;

  /* try to insert at exact position */
  obj = hwloc_topology_insert_misc_object_by_cpuset(topology, cpuset, name);
  if (!obj) {
    /* try to insert in a larger parent */
    char *s;
    hwloc_bitmap_asprintf(&s, cpuset);
    obj = hwloc_get_obj_covering_cpuset(topology, cpuset);
    if (obj) {
      obj = hwloc_topology_insert_misc_object_by_parent(topology, obj, name);
      fprintf(stderr, "Inserted process `%s' below parent larger than cpuset %s\n", name, s);
    } else {
      fprintf(stderr, "Failed to insert process `%s' with cpuset %s\n", name, s);
    }
    free(s);
  } else {
    hwloc_obj_add_info(obj, "Type", "Process");
  }

  return obj;
}
Ejemplo n.º 8
0
static hwloc_const_nodeset_t
hwloc_fix_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset)
{
  hwloc_const_bitmap_t topology_nodeset = hwloc_topology_get_topology_nodeset(topology);
  hwloc_const_bitmap_t complete_nodeset = hwloc_topology_get_complete_nodeset(topology);

  if (!hwloc_topology_get_topology_cpuset(topology)) {
    /* The topology is composed of several systems, the nodeset is thus
     * ambiguous. */
    errno = EXDEV;
    return NULL;
  }

  if (!complete_nodeset) {
    /* There is no NUMA node */
    errno = ENODEV;
    return NULL;
  }

  if (hwloc_bitmap_iszero(nodeset)) {
    errno = EINVAL;
    return NULL;
  }

  if (!hwloc_bitmap_isincluded(nodeset, complete_nodeset)) {
    errno = EINVAL;
    return NULL;
  }

  if (hwloc_bitmap_isincluded(topology_nodeset, nodeset))
    return complete_nodeset;

  return nodeset;
}
Ejemplo n.º 9
0
int main(void)
{
  hwloc_topology_t topology;
  hwloc_bitmap_t set, set2, nocpunomemnodeset, nocpubutmemnodeset, nomembutcpunodeset, nomembutcpucpuset;
  hwloc_obj_t node;
  struct bitmask *bitmask, *bitmask2;
  unsigned long mask;
  unsigned long maxnode;
  int i;

  if (numa_available() < 0)
    /* libnuma has inconsistent behavior when the kernel isn't NUMA-aware.
     * don't try to check everything precisely.
     */
    exit(77);

  hwloc_topology_init(&topology);
  hwloc_topology_load(topology);

  /* convert full stuff between cpuset and libnuma */
  set = hwloc_bitmap_alloc();
  nocpunomemnodeset = hwloc_bitmap_alloc();
  nocpubutmemnodeset = hwloc_bitmap_alloc();
  nomembutcpunodeset = hwloc_bitmap_alloc();
  nomembutcpucpuset = hwloc_bitmap_alloc();
  /* gather all nodes if any, or the whole system if no nodes */
  if (hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NUMANODE)) {
    node = NULL;
    while ((node = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, node)) != NULL) {
      hwloc_bitmap_or(set, set, node->cpuset);
      if (hwloc_bitmap_iszero(node->cpuset)) {
	if (node->memory.local_memory)
	  hwloc_bitmap_set(nocpubutmemnodeset, node->os_index);
	else
	  hwloc_bitmap_set(nocpunomemnodeset, node->os_index);
      } else if (!node->memory.local_memory) {
	hwloc_bitmap_set(nomembutcpunodeset, node->os_index);
	hwloc_bitmap_or(nomembutcpucpuset, nomembutcpucpuset, node->cpuset);
      }
    }
  } else {
    hwloc_bitmap_or(set, set, hwloc_topology_get_complete_cpuset(topology));
  }

  set2 = hwloc_bitmap_alloc();
  hwloc_cpuset_from_linux_libnuma_bitmask(topology, set2, numa_all_nodes_ptr);
  /* numa_all_nodes_ptr doesn't contain NODES with CPU but no memory */
  hwloc_bitmap_or(set2, set2, nomembutcpucpuset);
  assert(hwloc_bitmap_isequal(set, set2));
  hwloc_bitmap_free(set2);

  bitmask = hwloc_cpuset_to_linux_libnuma_bitmask(topology, set);
  /* numa_all_nodes_ptr contains NODES with no CPU but with memory */
  hwloc_bitmap_foreach_begin(i, nocpubutmemnodeset) { numa_bitmask_setbit(bitmask, i); } hwloc_bitmap_foreach_end();
Ejemplo n.º 10
0
static int
hwloc__get_largest_objs_inside_cpuset (struct hwloc_obj *current, hwloc_const_bitmap_t set,
				       struct hwloc_obj ***res, int *max)
{
  int gotten = 0;
  unsigned i;

  /* the caller must ensure this */
  if (*max <= 0)
    return 0;

  if (hwloc_bitmap_isequal(current->cpuset, set)) {
    **res = current;
    (*res)++;
    (*max)--;
    return 1;
  }

  for (i=0; i<current->arity; i++) {
    hwloc_bitmap_t subset = hwloc_bitmap_dup(set);
    int ret;

    /* split out the cpuset part corresponding to this child and see if there's anything to do */
    if (current->children[i]->cpuset) {
      hwloc_bitmap_and(subset, subset, current->children[i]->cpuset);
      if (hwloc_bitmap_iszero(subset)) {
        hwloc_bitmap_free(subset);
        continue;
      }
    }

    ret = hwloc__get_largest_objs_inside_cpuset (current->children[i], subset, res, max);
    gotten += ret;
    hwloc_bitmap_free(subset);

    /* if no more room to store remaining objects, return what we got so far */
    if (!*max)
      break;
  }

  return gotten;
}
Ejemplo n.º 11
0
static int
hwloc_fix_membind_cpuset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_const_cpuset_t cpuset)
{
  hwloc_const_bitmap_t topology_set = hwloc_topology_get_topology_cpuset(topology);
  hwloc_const_bitmap_t complete_set = hwloc_topology_get_complete_cpuset(topology);
  hwloc_const_bitmap_t complete_nodeset = hwloc_topology_get_complete_nodeset(topology);

  if (!topology_set) {
    /* The topology is composed of several systems, the cpuset is thus
     * ambiguous. */
    errno = EXDEV;
    return -1;
  }

  if (!complete_nodeset) {
    /* There is no NUMA node */
    errno = ENODEV;
    return -1;
  }

  if (hwloc_bitmap_iszero(cpuset)) {
    errno = EINVAL;
    return -1;
  }

  if (!hwloc_bitmap_isincluded(cpuset, complete_set)) {
    errno = EINVAL;
    return -1;
  }

  if (hwloc_bitmap_isincluded(topology_set, cpuset)) {
    hwloc_bitmap_copy(nodeset, complete_nodeset);
    return 0;
  }

  hwloc_cpuset_to_nodeset(topology, cpuset, nodeset);
  return 0;
}
Ejemplo n.º 12
0
static void print_task(hwloc_topology_t topology,
                       long pid_number, const char *name, hwloc_bitmap_t cpuset,
                       char *pidoutput,
                       int thread)
{
    printf("%s%ld\t", thread ? " " : "", pid_number);

    if (show_cpuset) {
        char *cpuset_str = NULL;
        hwloc_bitmap_asprintf(&cpuset_str, cpuset);
        printf("%s", cpuset_str);
        free(cpuset_str);
    } else {
        hwloc_bitmap_t remaining = hwloc_bitmap_dup(cpuset);
        int first = 1;
        while (!hwloc_bitmap_iszero(remaining)) {
            char type[64];
            unsigned idx;
            hwloc_obj_t obj = hwloc_get_first_largest_obj_inside_cpuset(topology, remaining);
            /* don't show a cache if there's something equivalent and nicer */
            while (hwloc_obj_type_is_cache(obj->type) && obj->arity == 1)
                obj = obj->first_child;
            hwloc_obj_type_snprintf(type, sizeof(type), obj, 1);
            idx = logical ? obj->logical_index : obj->os_index;
            if (idx == (unsigned) -1)
                printf("%s%s", first ? "" : " ", type);
            else
                printf("%s%s:%u", first ? "" : " ", type, idx);
            hwloc_bitmap_andnot(remaining, remaining, obj->cpuset);
            first = 0;
        }
        hwloc_bitmap_free(remaining);
    }

    printf("\t\t%s%s%s\n", name, pidoutput ? "\t" : "", pidoutput ? pidoutput : "");
}
Ejemplo n.º 13
0
int orte_daemon(int argc, char *argv[])
{
    int ret = 0;
    opal_cmd_line_t *cmd_line = NULL;
    char *rml_uri;
    int i;
    opal_buffer_t *buffer;
    char hostname[100];
#if OPAL_ENABLE_FT_CR == 1
    char *tmp_env_var = NULL;
#endif
    
    /* initialize the globals */
    memset(&orted_globals, 0, sizeof(orted_globals));
    /* initialize the singleton died pipe to an illegal value so we can detect it was set */
    orted_globals.singleton_died_pipe = -1;
    /* init the failure orted vpid to an invalid value */
    orted_globals.fail = ORTE_VPID_INVALID;
    
    /* setup to check common command line options that just report and die */
    cmd_line = OBJ_NEW(opal_cmd_line_t);
    if (OPAL_SUCCESS != opal_cmd_line_create(cmd_line, orte_cmd_line_opts)) {
        OBJ_RELEASE(cmd_line);
        exit(1);
    }
    mca_base_cmd_line_setup(cmd_line);
    if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false,
                                                   argc, argv))) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        fprintf(stderr, "Usage: %s [OPTION]...\n%s\n", argv[0], args);
        free(args);
        OBJ_RELEASE(cmd_line);
        return ret;
    }
    
    /*
     * Since this process can now handle MCA/GMCA parameters, make sure to
     * process them.
     */
    mca_base_cmd_line_process_args(cmd_line, &environ, &environ);
    
    /* Ensure that enough of OPAL is setup for us to be able to run */
    /*
     * NOTE: (JJH)
     *  We need to allow 'mca_base_cmd_line_process_args()' to process command
     *  line arguments *before* calling opal_init_util() since the command
     *  line could contain MCA parameters that affect the way opal_init_util()
     *  functions. AMCA parameters are one such option normally received on the
     *  command line that affect the way opal_init_util() behaves.
     *  It is "safe" to call mca_base_cmd_line_process_args() before 
     *  opal_init_util() since mca_base_cmd_line_process_args() does *not*
     *  depend upon opal_init_util() functionality.
     */
    if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
        fprintf(stderr, "OPAL failed to initialize -- orted aborting\n");
        exit(1);
    }

    /* save the environment for launch purposes. This MUST be
     * done so that we can pass it to any local procs we
     * spawn - otherwise, those local procs won't see any
     * non-MCA envars that were set in the enviro when the
     * orted was executed - e.g., by .csh
     */
    orte_launch_environ = opal_argv_copy(environ);
    
    /* purge any ess flag set in the environ when we were launched */
    opal_unsetenv(OPAL_MCA_PREFIX"ess", &orte_launch_environ);
    
    /* if orte_daemon_debug is set, let someone know we are alive right
     * away just in case we have a problem along the way
     */
    if (orted_globals.debug) {
        gethostname(hostname, 100);
        fprintf(stderr, "Daemon was launched on %s - beginning to initialize\n", hostname);
    }
    
    /* check for help request */
    if (orted_globals.help) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        orte_show_help("help-orted.txt", "orted:usage", false,
                       argv[0], args);
        free(args);
        return 1;
    }
#if defined(HAVE_SETSID)
    /* see if we were directed to separate from current session */
    if (orted_globals.set_sid) {
        setsid();
    }
#endif
    /* see if they want us to spin until they can connect a debugger to us */
    i=0;
    while (orted_spin_flag) {
        i++;
        if (1000 < i) i=0;        
    }

#if OPAL_ENABLE_FT_CR == 1
    /* Mark as a tool program */
    (void) mca_base_var_env_name ("opal_cr_is_tool", &tmp_env_var);
    opal_setenv(tmp_env_var,
                "1",
                true, &environ);
    free(tmp_env_var);
#endif

    /* if mapreduce set, flag it */
    if (orted_globals.mapreduce) {
        orte_map_reduce = true;
    }

    /* detach from controlling terminal
     * otherwise, remain attached so output can get to us
     */
    if(!orte_debug_flag &&
       !orte_debug_daemons_flag &&
       orted_globals.daemonize) {
        opal_daemon_init(NULL);
    }
    
    /* Set the flag telling OpenRTE that I am NOT a
     * singleton, but am "infrastructure" - prevents setting
     * up incorrect infrastructure that only a singleton would
     * require.
     */
    if (orted_globals.hnp) {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    } else {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_DAEMON))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    }
    /* finalize the OPAL utils. As they are opened again from orte_init->opal_init
     * we continue to have a reference count on them. So we have to finalize them twice...
     */
    opal_finalize_util();

#if OPAL_HAVE_HWLOC
    /* bind ourselves if so directed */
    if (NULL != orte_daemon_cores) {
        char **cores=NULL, tmp[128];
        hwloc_obj_t pu;
        hwloc_cpuset_t ours, pucpus, res;
        int core;

        /* could be a collection of comma-delimited ranges, so
         * use our handy utility to parse it
         */
        orte_util_parse_range_options(orte_daemon_cores, &cores);
        if (NULL != cores) {
            ours = hwloc_bitmap_alloc();
            hwloc_bitmap_zero(ours);
            pucpus = hwloc_bitmap_alloc();
            res = hwloc_bitmap_alloc();
            for (i=0; NULL != cores[i]; i++) {
                core = strtoul(cores[i], NULL, 10);
                if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
                    /* turn off the show help forwarding as we won't
                     * be able to cycle the event library to send
                     */
                    orte_show_help_finalize();
                    /* the message will now come out locally */
                    orte_show_help("help-orted.txt", "orted:cannot-bind",
                                   true, orte_process_info.nodename,
                                   orte_daemon_cores);
                    ret = ORTE_ERR_NOT_SUPPORTED;
                    goto DONE;
                }
                hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset);
                hwloc_bitmap_or(res, ours, pucpus);
                hwloc_bitmap_copy(ours, res);
            }
            /* if the result is all zeros, then don't bind */
            if (!hwloc_bitmap_iszero(ours)) {
                (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0);
                if (opal_hwloc_report_bindings) {
                    opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours);
                    opal_output(0, "Daemon %s is bound to cores %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
                }
            }
            /* cleanup */
            hwloc_bitmap_free(ours);
            hwloc_bitmap_free(pucpus);
            hwloc_bitmap_free(res);
            opal_argv_free(cores);
        }
    }
#endif

    if ((int)ORTE_VPID_INVALID != orted_globals.fail) {
        orted_globals.abort=false;
        /* some vpid was ordered to fail. The value can be positive
         * or negative, depending upon the desired method for failure,
         * so need to check both here
         */
        if (0 > orted_globals.fail) {
            orted_globals.fail = -1*orted_globals.fail;
            orted_globals.abort = true;
        }
        /* are we the specified vpid? */
        if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) {
            /* if the user specified we delay, then setup a timer
             * and have it kill us
             */
            if (0 < orted_globals.fail_delay) {
                ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_callback, ORTE_SYS_PRI);
                
            } else {
                opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            orted_globals.abort ? "abort" : "abnormal termination");

                /* do -not- call finalize as this will send a message to the HNP
                 * indicating clean termination! Instead, just forcibly cleanup
                 * the local session_dir tree and exit
                 */
                orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
                
                /* if we were ordered to abort, do so */
                if (orted_globals.abort) {
                    abort();
                }
                
                /* otherwise, return with non-zero status */
                ret = ORTE_ERROR_DEFAULT_EXIT_CODE;
                goto DONE;
            }
        }
    }

    /* insert our contact info into our process_info struct so we
     * have it for later use and set the local daemon field to our name
     */
    orte_process_info.my_daemon_uri = orte_rml.get_contact_info();
    ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid;
    ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid;
    
    /* if I am also the hnp, then update that contact info field too */
    if (ORTE_PROC_IS_HNP) {
        orte_process_info.my_hnp_uri = orte_rml.get_contact_info();
        ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
        ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
    }
    
    /* setup the primary daemon command receive function */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                            ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);
    
    /* output a message indicating we are alive, our name, and our pid
     * for debugging purposes
     */
    if (orte_debug_daemons_flag) {
        fprintf(stderr, "Daemon %s checking in as pid %ld on host %s\n",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)orte_process_info.pid,
                orte_process_info.nodename);
    }

    /* We actually do *not* want the orted to voluntarily yield() the
       processor more than necessary.  The orted already blocks when
       it is doing nothing, so it doesn't use any more CPU cycles than
       it should; but when it *is* doing something, we do not want it
       to be unnecessarily delayed because it voluntarily yielded the
       processor in the middle of its work.

       For example: when a message arrives at the orted, we want the
       OS to wake up the orted in a timely fashion (which most OS's
       seem good about doing) and then we want the orted to process
       the message as fast as possible.  If the orted yields and lets
       aggressive MPI applications get the processor back, it may be a
       long time before the OS schedules the orted to run again
       (particularly if there is no IO event to wake it up).  Hence,
       routed OOB messages (for example) may be significantly delayed
       before being delivered to MPI processes, which can be
       problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
       require OOB messages for wireup, etc.). */
    opal_progress_set_yield_when_idle(false);

    /* Change the default behavior of libevent such that we want to
       continually block rather than blocking for the default timeout
       and then looping around the progress engine again.  There
       should be nothing in the orted that cannot block in libevent
       until "something" happens (i.e., there's no need to keep
       cycling through progress because the only things that should
       happen will happen in libevent).  This is a minor optimization,
       but what the heck... :-) */
    opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);

    /* if requested, report my uri to the indicated pipe */
    if (orted_globals.uri_pipe > 0) {
        orte_job_t *jdata;
        orte_proc_t *proc;
        orte_node_t *node;
        orte_app_context_t *app;
        char *tmp, *nptr, *sysinfo;
        int32_t ljob;

        /* setup the singleton's job */
        jdata = OBJ_NEW(orte_job_t);
        orte_plm_base_create_jobid(jdata);
        ljob = ORTE_LOCAL_JOBID(jdata->jobid);
        opal_pointer_array_set_item(orte_job_data, ljob, jdata);

        /* must create a map for it (even though it has no
         * info in it) so that the job info will be picked
         * up in subsequent pidmaps or other daemons won't
         * know how to route
         */
        jdata->map = OBJ_NEW(orte_job_map_t);

        /* setup an app_context for the singleton */
        app = OBJ_NEW(orte_app_context_t);
        app->app = strdup("singleton");
        app->num_procs = 1;
        opal_pointer_array_add(jdata->apps, app);
        
        /* setup a proc object for the singleton - since we
         * -must- be the HNP, and therefore we stored our
         * node on the global node pool, and since the singleton
         * -must- be on the same node as us, indicate that
         */
        proc = OBJ_NEW(orte_proc_t);
        proc->name.jobid = jdata->jobid;
        proc->name.vpid = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE);
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        /* obviously, it is on my node */
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        proc->node = node;
        OBJ_RETAIN(node);  /* keep accounting straight */
        opal_pointer_array_add(jdata->procs, proc);
        jdata->num_procs = 1;
        /* and it obviously is on the node */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(node->procs, proc);
        node->num_procs++;
        /* and obviously it is one of my local procs */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(orte_local_children, proc);
        jdata->num_local_procs = 1;
        /* set the trivial */
        proc->local_rank = 0;
        proc->node_rank = 0;
        proc->app_rank = 0;
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL);

        /* create a string that contains our uri + sysinfo + PMIx server URI */
        orte_util_convert_sysinfo_to_string(&sysinfo, orte_local_cpu_type, orte_local_cpu_model);
        asprintf(&tmp, "%s[%s]%s", orte_process_info.my_daemon_uri, sysinfo, pmix_server_uri);
	free(sysinfo);

        /* pass that info to the singleton */
        write(orted_globals.uri_pipe, tmp, strlen(tmp)+1); /* need to add 1 to get the NULL */

        /* cleanup */
        free(tmp);

        /* since a singleton spawned us, we need to harvest
         * any MCA params from the local environment so
         * we can pass them along to any subsequent daemons
         * we may start as the result of a comm_spawn
         */
        for (i=0; NULL != environ[i]; i++) {
            if (0 == strncmp(environ[i], OPAL_MCA_PREFIX, 9)) {
                /* make a copy to manipulate */
                tmp = strdup(environ[i]);
                /* find the equal sign */
                nptr = strchr(tmp, '=');
                *nptr = '\0';
                nptr++;
                /* add the mca param to the orted cmd line */
                opal_argv_append_nosize(&orted_cmd_line, "-"OPAL_MCA_CMD_LINE_ID);
                opal_argv_append_nosize(&orted_cmd_line, &tmp[9]);
                opal_argv_append_nosize(&orted_cmd_line, nptr);
                free(tmp);
            }
        }
    }

    /* if we were given a pipe to monitor for singleton termination, set that up */
    if (orted_globals.singleton_died_pipe > 0) {
        /* register shutdown handler */
        pipe_handler = (opal_event_t*)malloc(sizeof(opal_event_t));
        opal_event_set(orte_event_base, pipe_handler,
                       orted_globals.singleton_died_pipe,
                       OPAL_EV_READ,
                       pipe_closed,
                       pipe_handler);
        opal_event_add(pipe_handler, NULL);
    }

    /* If I have a parent, then save his contact info so
     * any messages we send can flow thru him.
     */

    orte_parent_uri = NULL;
    (void) mca_base_var_register ("orte", "orte", NULL, "parent_uri",
                                  "URI for the parent if tree launch is enabled.",
                                  MCA_BASE_VAR_TYPE_STRING, NULL, 0,
                                  MCA_BASE_VAR_FLAG_INTERNAL,
                                  OPAL_INFO_LVL_9,
                                  MCA_BASE_VAR_SCOPE_CONSTANT,
                                  &orte_parent_uri);
    if (NULL != orte_parent_uri) {
        orte_process_name_t parent;

        /* set the contact info into the hash table */
        orte_rml.set_contact_info(orte_parent_uri);
        ret = orte_rml_base_parse_uris(orte_parent_uri, &parent, NULL);
        if (ORTE_SUCCESS != ret) {
            ORTE_ERROR_LOG(ret);
            free (orte_parent_uri);
            orte_parent_uri = NULL;
            goto DONE;
        }

        /* don't need this value anymore */
        free(orte_parent_uri);
        orte_parent_uri = NULL;

        /* tell the routed module that we have a path
         * back to the HNP
         */
        if (ORTE_SUCCESS != (ret = orte_routed.update_route(ORTE_PROC_MY_HNP, &parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
        /* set the lifeline to point to our parent so that we
         * can handle the situation if that lifeline goes away
         */
        if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(&parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
    }

    /* if we are not the HNP...the only time we will be an HNP
     * is if we are launched by a singleton to provide support
     * for it
     */
    if (!ORTE_PROC_IS_HNP) {
        /* send the information to the orted report-back point - this function
         * will process the data, but also counts the number of
         * orteds that reported back so the launch procedure can continue.
         * We need to do this at the last possible second as the HNP
         * can turn right around and begin issuing orders to us
         */

        buffer = OBJ_NEW(opal_buffer_t);
        /* insert our name for rollup purposes */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }
        /* for now, always include our contact info, even if we are using
         * static ports. Eventually, this will be removed
         */
        rml_uri = orte_rml.get_contact_info();
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &rml_uri, 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }

        /* include our node name */
        opal_dss.pack(buffer, &orte_process_info.nodename, 1, OPAL_STRING);

        /* if requested, include any non-loopback aliases for this node */
        if (orte_retain_aliases) {
            char **aliases=NULL;
            uint8_t naliases, ni;
            char hostname[ORTE_MAX_HOSTNAME_SIZE];

            /* if we stripped the prefix or removed the fqdn,
             * include full hostname as an alias
             */
            gethostname(hostname, ORTE_MAX_HOSTNAME_SIZE);
            if (strlen(orte_process_info.nodename) < strlen(hostname)) {
                opal_argv_append_nosize(&aliases, hostname);
            }
            opal_ifgetaliases(&aliases);
            naliases = opal_argv_count(aliases);
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &naliases, 1, OPAL_UINT8))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(buffer);
                goto DONE;
            }
            for (ni=0; ni < naliases; ni++) {
                if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &aliases[ni], 1, OPAL_STRING))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buffer);
                    goto DONE;
                }
            }
            opal_argv_free(aliases);
        }

#if OPAL_HAVE_HWLOC
        {
            char *coprocessors;
            /* add the local topology */
            if (NULL != opal_hwloc_topology &&
                (1 == ORTE_PROC_MY_NAME->vpid || orte_hetero_nodes)) {
                if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
                    ORTE_ERROR_LOG(ret);
                }
            }
            /* detect and add any coprocessors */
            coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology);
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &coprocessors, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(ret);
            }
            /* see if I am on a coprocessor */
            coprocessors = opal_hwloc_base_check_on_coprocessor();
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &coprocessors, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(ret);
            }
        }
#endif

        /* send to the HNP's callback - will be routed if routes are available */
        if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer,
                                               ORTE_RML_TAG_ORTED_CALLBACK,
                                               orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }
    }

    /* if we are tree-spawning, then we need to capture the MCA params
     * from our cmd line so we can pass them along to the daemons we spawn -
     * otherwise, only the first layer of daemons will ever see them
     */
    if (orted_globals.tree_spawn) {
        int j, k;
        bool ignore;
        char *no_keep[] = {
            "orte_hnp_uri",
            "orte_ess_jobid",
            "orte_ess_vpid",
            "orte_ess_num_procs",
            "orte_parent_uri",
            "mca_base_env_list",
            NULL
        };
        for (i=0; i < argc; i++) {
            if (0 == strcmp("-"OPAL_MCA_CMD_LINE_ID,  argv[i]) ||
                0 == strcmp("--"OPAL_MCA_CMD_LINE_ID, argv[i]) ) {
                ignore = false;
                /* see if this is something we cannot pass along */
                for (k=0; NULL != no_keep[k]; k++) {
                    if (0 == strcmp(no_keep[k], argv[i+1])) {
                        ignore = true;
                        break;
                    }
                }
                if (!ignore) {
                    /* see if this is already present so we at least can
                     * avoid growing the cmd line with duplicates
                     */
                    if (NULL != orted_cmd_line) {
                        for (j=0; NULL != orted_cmd_line[j]; j++) {
                            if (0 == strcmp(argv[i+1], orted_cmd_line[j])) {
                                /* already here - ignore it */
                                ignore = true;
                                break;
                            }
                        }
                    }
                    if (!ignore) {
                        opal_argv_append_nosize(&orted_cmd_line, argv[i]);
                        opal_argv_append_nosize(&orted_cmd_line, argv[i+1]);
                        opal_argv_append_nosize(&orted_cmd_line, argv[i+2]);
                    }
                }
                i += 2;
            }
        }
    }
            
    if (orte_debug_daemons_flag) {
        opal_output(0, "%s orted: up and running - waiting for commands!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
    }
    ret = ORTE_SUCCESS;

    /* loop the event lib until an exit event is detected */
    while (orte_event_base_active) {
        opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
    }

    /* ensure all local procs are dead */
    orte_odls.kill_local_procs(NULL);

 DONE:
    /* update the exit status, in case it wasn't done */
    ORTE_UPDATE_EXIT_STATUS(ret);

    /* cleanup and leave */
    orte_finalize();

    if (orte_debug_flag) {
        fprintf(stderr, "exiting with status %d\n", orte_exit_status);
    }
    exit(orte_exit_status);
}
Ejemplo n.º 14
0
int main(int argc, char *argv[])
{
  hwloc_topology_t topology;
  int loaded = 0;
  unsigned depth;
  hwloc_bitmap_t cpubind_set, membind_set;
  int got_cpubind = 0, got_membind = 0;
  int working_on_cpubind = 1; /* membind if 0 */
  int get_binding = 0;
  int get_last_cpu_location = 0;
  unsigned long flags = 0;
  int force = 0;
  int single = 0;
  int verbose = 0;
  int logical = 1;
  int taskset = 0;
  int cpubind_flags = 0;
  hwloc_membind_policy_t membind_policy = HWLOC_MEMBIND_BIND;
  int membind_flags = 0;
  int opt;
  int ret;
  int pid_number = -1;
  hwloc_pid_t pid = 0; /* only valid when pid_number > 0, but gcc-4.8 still reports uninitialized warnings */
  char *callname;

  cpubind_set = hwloc_bitmap_alloc();
  membind_set = hwloc_bitmap_alloc();

  /* don't load now, in case some options change the config before the topology is actually used */
#define LOADED() (loaded)
#define ENSURE_LOADED() do { \
  if (!loaded) { \
    hwloc_topology_init(&topology); \
    hwloc_topology_set_all_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_ALL); \
    hwloc_topology_set_flags(topology, flags); \
    hwloc_topology_load(topology); \
    depth = hwloc_topology_get_depth(topology); \
    loaded = 1; \
  } \
} while (0)

  callname = argv[0];
  /* skip argv[0], handle options */
  argv++;
  argc--;

  while (argc >= 1) {
    if (!strcmp(argv[0], "--")) {
      argc--;
      argv++;
      break;
    }

    opt = 0;

    if (*argv[0] == '-') {
      if (!strcmp(argv[0], "-v") || !strcmp(argv[0], "--verbose")) {
	verbose++;
	goto next;
      }
      if (!strcmp(argv[0], "-q") || !strcmp(argv[0], "--quiet")) {
	verbose--;
	goto next;
      }
      if (!strcmp(argv[0], "--help")) {
        usage("hwloc-bind", stdout);
	return EXIT_SUCCESS;
      }
      if (!strcmp(argv[0], "--single")) {
	single = 1;
	goto next;
      }
      if (!strcmp(argv[0], "-f") || !strcmp(argv[0], "--force")) {
	force = 1;
	goto next;
      }
      if (!strcmp(argv[0], "--strict")) {
	cpubind_flags |= HWLOC_CPUBIND_STRICT;
	membind_flags |= HWLOC_MEMBIND_STRICT;
	goto next;
      }
      if (!strcmp(argv[0], "--pid")) {
        if (argc < 2) {
          usage ("hwloc-bind", stderr);
          exit(EXIT_FAILURE);
        }
        pid_number = atoi(argv[1]);
        opt = 1;
        goto next;
      }
      if (!strcmp (argv[0], "--version")) {
	printf("%s %s\n", callname, HWLOC_VERSION);
	exit(EXIT_SUCCESS);
      }
      if (!strcmp(argv[0], "-l") || !strcmp(argv[0], "--logical")) {
        logical = 1;
        goto next;
      }
      if (!strcmp(argv[0], "-p") || !strcmp(argv[0], "--physical")) {
        logical = 0;
        goto next;
      }
      if (!strcmp(argv[0], "--taskset")) {
        taskset = 1;
        goto next;
      }
      if (!strcmp (argv[0], "-e") || !strncmp (argv[0], "--get-last-cpu-location", 10)) {
	get_last_cpu_location = 1;
	goto next;
      }
      if (!strcmp (argv[0], "--get")) {
	get_binding = 1;
	goto next;
      }
      if (!strcmp (argv[0], "--cpubind")) {
	working_on_cpubind = 1;
	goto next;
      }
      if (!strcmp (argv[0], "--membind")) {
	working_on_cpubind = 0;
	goto next;
      }
      if (!strcmp (argv[0], "--mempolicy")) {
	if (!strncmp(argv[1], "default", 2))
	  membind_policy = HWLOC_MEMBIND_DEFAULT;
	else if (!strncmp(argv[1], "firsttouch", 2))
	  membind_policy = HWLOC_MEMBIND_FIRSTTOUCH;
	else if (!strncmp(argv[1], "bind", 2))
	  membind_policy = HWLOC_MEMBIND_BIND;
	else if (!strncmp(argv[1], "interleave", 2))
	  membind_policy = HWLOC_MEMBIND_INTERLEAVE;
	else if (!strncmp(argv[1], "nexttouch", 2))
	  membind_policy = HWLOC_MEMBIND_NEXTTOUCH;
	else {
	  fprintf(stderr, "Unrecognized memory binding policy %s\n", argv[1]);
          usage ("hwloc-bind", stderr);
          exit(EXIT_FAILURE);
	}
	opt = 1;
	goto next;
      }
      if (!strcmp (argv[0], "--whole-system")) {
	if (loaded) {
	  fprintf(stderr, "Input option %s disallowed after options using the topology\n", argv[0]);
	  exit(EXIT_FAILURE);
	}
	flags |= HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM;
	goto next;
      }
      if (!strcmp (argv[0], "--restrict")) {
	hwloc_bitmap_t restrictset;
	int err;
	if (argc < 2) {
	  usage (callname, stdout);
	  exit(EXIT_FAILURE);
	}
	restrictset = hwloc_bitmap_alloc();
	hwloc_bitmap_sscanf(restrictset, argv[1]);
	ENSURE_LOADED();
	err = hwloc_topology_restrict (topology, restrictset, 0);
	if (err) {
	  perror("Restricting the topology");
	  /* fallthrough */
	}
	hwloc_bitmap_free(restrictset);
	argc--;
	argv++;
	goto next;
      }

      fprintf (stderr, "Unrecognized option: %s\n", argv[0]);
      usage("hwloc-bind", stderr);
      return EXIT_FAILURE;
    }

    ENSURE_LOADED();
    ret = hwloc_calc_process_arg(topology, depth, argv[0], logical,
				 working_on_cpubind ? cpubind_set : membind_set,
				 verbose);
    if (ret < 0) {
      if (verbose > 0)
	fprintf(stderr, "assuming the command starts at %s\n", argv[0]);
      break;
    }
    if (working_on_cpubind)
      got_cpubind = 1;
    else
      got_membind = 1;

  next:
    argc -= opt+1;
    argv += opt+1;
  }

  ENSURE_LOADED();

  if (pid_number > 0) {
    pid = hwloc_pid_from_number(pid_number, !(get_binding || get_last_cpu_location));
    /* no need to set_pid()
     * the doc just says we're operating on pid, not that we're retrieving the topo/cpuset as seen from inside pid
     */
  }

  if (get_last_cpu_location && !working_on_cpubind) {
    fprintf(stderr, "Options --membind and --get-last-cpu-location cannot be combined.\n");
    return EXIT_FAILURE;
  }
  if ((get_binding || get_last_cpu_location) && (got_cpubind || got_membind)) {
    /* doesn't work because get_binding/get_last_cpu_location overwrites cpubind_set */
    fprintf(stderr, "Cannot display and set binding at the same time.\n");
    return EXIT_FAILURE;
  }

  if (get_binding || get_last_cpu_location) {
    char *s;
    const char *policystr = NULL;
    int err;
    if (working_on_cpubind) {
      if (get_last_cpu_location) {
	if (pid_number > 0)
	  err = hwloc_get_proc_last_cpu_location(topology, pid, cpubind_set, 0);
	else
	  err = hwloc_get_last_cpu_location(topology, cpubind_set, 0);
      } else {
	if (pid_number > 0)
	  err = hwloc_get_proc_cpubind(topology, pid, cpubind_set, 0);
	else
	  err = hwloc_get_cpubind(topology, cpubind_set, 0);
      }
      if (err) {
	const char *errmsg = strerror(errno);
	if (pid_number > 0)
	  fprintf(stderr, "hwloc_get_proc_%s %d failed (errno %d %s)\n", get_last_cpu_location ? "last_cpu_location" : "cpubind", pid_number, errno, errmsg);
	else
	  fprintf(stderr, "hwloc_get_%s failed (errno %d %s)\n", get_last_cpu_location ? "last_cpu_location" : "cpubind", errno, errmsg);
	return EXIT_FAILURE;
      }
      if (taskset)
	hwloc_bitmap_taskset_asprintf(&s, cpubind_set);
      else
	hwloc_bitmap_asprintf(&s, cpubind_set);
    } else {
      hwloc_membind_policy_t policy;
      if (pid_number > 0)
	err = hwloc_get_proc_membind(topology, pid, membind_set, &policy, 0);
      else
	err = hwloc_get_membind(topology, membind_set, &policy, 0);
      if (err) {
	const char *errmsg = strerror(errno);
        if (pid_number > 0)
          fprintf(stderr, "hwloc_get_proc_membind %d failed (errno %d %s)\n", pid_number, errno, errmsg);
        else
	  fprintf(stderr, "hwloc_get_membind failed (errno %d %s)\n", errno, errmsg);
	return EXIT_FAILURE;
      }
      if (taskset)
	hwloc_bitmap_taskset_asprintf(&s, membind_set);
      else
	hwloc_bitmap_asprintf(&s, membind_set);
      switch (policy) {
      case HWLOC_MEMBIND_DEFAULT: policystr = "default"; break;
      case HWLOC_MEMBIND_FIRSTTOUCH: policystr = "firsttouch"; break;
      case HWLOC_MEMBIND_BIND: policystr = "bind"; break;
      case HWLOC_MEMBIND_INTERLEAVE: policystr = "interleave"; break;
      case HWLOC_MEMBIND_NEXTTOUCH: policystr = "nexttouch"; break;
      default: fprintf(stderr, "unknown memory policy %d\n", policy); assert(0); break;
      }
    }
    if (policystr)
      printf("%s (%s)\n", s, policystr);
    else
      printf("%s\n", s);
    free(s);
  }

  if (got_membind) {
    if (hwloc_bitmap_iszero(membind_set)) {
      if (verbose >= 0)
	fprintf(stderr, "cannot membind to empty set\n");
      if (!force)
	goto failed_binding;
    }
    if (verbose > 0) {
      char *s;
      hwloc_bitmap_asprintf(&s, membind_set);
      fprintf(stderr, "binding on memory set %s\n", s);
      free(s);
    }
    if (single)
      hwloc_bitmap_singlify(membind_set);
    if (pid_number > 0)
      ret = hwloc_set_proc_membind(topology, pid, membind_set, membind_policy, membind_flags);
    else
      ret = hwloc_set_membind(topology, membind_set, membind_policy, membind_flags);
    if (ret && verbose >= 0) {
      int bind_errno = errno;
      const char *errmsg = strerror(bind_errno);
      char *s;
      hwloc_bitmap_asprintf(&s, membind_set);
      if (pid_number > 0)
        fprintf(stderr, "hwloc_set_proc_membind %s %d failed (errno %d %s)\n", s, pid_number, bind_errno, errmsg);
      else
        fprintf(stderr, "hwloc_set_membind %s failed (errno %d %s)\n", s, bind_errno, errmsg);
      free(s);
    }
    if (ret && !force)
      goto failed_binding;
  }

  if (got_cpubind) {
    if (hwloc_bitmap_iszero(cpubind_set)) {
      if (verbose >= 0)
	fprintf(stderr, "cannot cpubind to empty set\n");
      if (!force)
	goto failed_binding;
    }
    if (verbose > 0) {
      char *s;
      hwloc_bitmap_asprintf(&s, cpubind_set);
      fprintf(stderr, "binding on cpu set %s\n", s);
      free(s);
    }
    if (single)
      hwloc_bitmap_singlify(cpubind_set);
    if (pid_number > 0)
      ret = hwloc_set_proc_cpubind(topology, pid, cpubind_set, cpubind_flags);
    else
      ret = hwloc_set_cpubind(topology, cpubind_set, cpubind_flags);
    if (ret && verbose >= 0) {
      int bind_errno = errno;
      const char *errmsg = strerror(bind_errno);
      char *s;
      hwloc_bitmap_asprintf(&s, cpubind_set);
      if (pid_number > 0)
        fprintf(stderr, "hwloc_set_proc_cpubind %s %d failed (errno %d %s)\n", s, pid_number, bind_errno, errmsg);
      else
        fprintf(stderr, "hwloc_set_cpubind %s failed (errno %d %s)\n", s, bind_errno, errmsg);
      free(s);
    }
    if (ret && !force)
      goto failed_binding;
  }

  hwloc_bitmap_free(cpubind_set);
  hwloc_bitmap_free(membind_set);

  hwloc_topology_destroy(topology);

  if (pid_number > 0)
    return EXIT_SUCCESS;

  if (0 == argc) {
    if (get_binding || get_last_cpu_location)
      return EXIT_SUCCESS;
    fprintf(stderr, "%s: nothing to do!\n", callname);
    return EXIT_FAILURE;
  }

  /* FIXME: check whether Windows execvp() passes INHERIT_PARENT_AFFINITY to CreateProcess()
   * because we need to propagate processor group affinity. However process-wide affinity
   * isn't supported with processor groups so far.
   */
  ret = execvp(argv[0], argv);
  if (ret) {
      fprintf(stderr, "%s: Failed to launch executable \"%s\"\n",
              callname, argv[0]);
      perror("execvp");
  }
  return EXIT_FAILURE;


failed_binding:
  hwloc_bitmap_free(cpubind_set);
  hwloc_bitmap_free(membind_set);
  hwloc_topology_destroy(topology);
  return EXIT_FAILURE;
}
Ejemplo n.º 15
0
Archivo: gpu.c Proyecto: DawidvC/chapel
int main(void)
{
    hwloc_topology_t topology;
    hwloc_obj_t obj;
    unsigned n, i;
    int devid, platformid;
    const char *dev;

    /* Allocate, initialize and load topology object. */
    hwloc_topology_init(&topology);
    hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_IO_DEVICES);
    hwloc_topology_load(topology);

    /* Find CUDA devices through the corresponding OS devices */
    n = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_OS_DEVICE);

    for (i = 0; i < n ; i++) {
      const char *s;
      obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_OS_DEVICE, i);
      printf("%s:\n", obj->name);

      s = hwloc_obj_get_info_by_name(obj, "Backend");

      if (s && !strcmp(s, "CUDA")) {
        /* This is a CUDA device */
        assert(!strncmp(obj->name, "cuda", 4));
        devid = atoi(obj->name + 4);
        printf("CUDA device %d\n", devid);

        s = hwloc_obj_get_info_by_name(obj, "GPUModel");
        if (s)
          printf("Model: %s\n", s);

        s = hwloc_obj_get_info_by_name(obj, "CUDAGlobalMemorySize");
        if (s)
          printf("Memory: %s\n", s);

        s = hwloc_obj_get_info_by_name(obj, "CUDAMultiProcessors");
        if (s)
        {
          int mp = atoi(s);
          s = hwloc_obj_get_info_by_name(obj, "CUDACoresPerMP");
          if (s) {
            int mp_cores = atoi(s);
            printf("Cores: %d\n", mp * mp_cores);
          }
        }
      }

      if (s && !strcmp(s, "OpenCL")) {
        /* This is an OpenCL device */
        assert(!strncmp(obj->name, "opencl", 6));
        platformid = atoi(obj->name + 6);
        printf("OpenCL platform %d\n", platformid);
        dev = strchr(obj->name + 6, 'd');
        devid = atoi(dev + 1);
        printf("OpenCL device %d\n", devid);

        s = hwloc_obj_get_info_by_name(obj, "GPUModel");
        if (s)
          printf("Model: %s\n", s);

        s = hwloc_obj_get_info_by_name(obj, "OpenCLGlobalMemorySize");
        if (s)
          printf("Memory: %s\n", s);
      }

      /* One can also use helpers from hwloc/cuda.h, hwloc/cudart.h,
       * hwloc/opencl.h */


      /* Find out cpuset this is connected to */
      while (obj && (!obj->cpuset || hwloc_bitmap_iszero(obj->cpuset)))
        obj = obj->parent;

      if (obj) {
        char *cpuset_string;
        char name[16];
        hwloc_obj_type_snprintf(name, sizeof(name), obj, 0);
        hwloc_bitmap_asprintf(&cpuset_string, obj->cpuset);
        printf("Location: %s P#%d\n", name, obj->os_index);
        printf("Cpuset: %s\n", cpuset_string);
      }
      printf("\n");
    }

    /* Destroy topology object. */
    hwloc_topology_destroy(topology);

    return 0;
}
Ejemplo n.º 16
0
static int orcmd_init(void)
{
    int ret = ORTE_ERROR;
    char *error = NULL;
    opal_buffer_t buf, *clusterbuf, *uribuf;
    orte_job_t *jdata;
    orte_node_t *node;
    orte_proc_t *proc;
    opal_list_t config;
    orcm_scheduler_t *scheduler;
    orcm_node_t *mynode=NULL;
    int32_t n;

    if (initialized) {
        return ORCM_SUCCESS;
    }
    initialized = true;

    /* Initialize the ORTE data type support */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_std_prolog";
        goto error;
    }

    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
                                                       1,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       1))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }

    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }

    /* create a job tracker for the daemons */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = 0;
    ORTE_PROC_MY_NAME->jobid = 0;
    opal_pointer_array_set_item(orte_job_data, 0, jdata);

    /* read the site configuration */
    OBJ_CONSTRUCT(&config, opal_list_t);
    if (ORCM_SUCCESS != (ret = orcm_cfgi.read_config(&config))) {
        error = "getting config";
        goto error;
    }

    /* define the cluster and collect contact info for all
     * aggregators - we'll need to know how to talk to any
     * of them in case of failures
     */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    if (ORCM_SUCCESS != (ret = orcm_cfgi.define_system(&config,
                                                       &mynode,
                                                       &orte_process_info.num_procs,
                                                       &buf))) {
        OBJ_DESTRUCT(&buf);
        error = "define system";
        goto error;
    }

    /* if my name didn't get set, then we didn't find our node
     * in the config - report it and die
     */
    if (NULL == mynode) {
        orte_show_help("help-ess-orcm.txt", "node-not-found", true,
                       orcm_cfgi_base.config_file,
                       orte_process_info.nodename);
        OBJ_DESTRUCT(&buf);
        return ORTE_ERR_SILENT;
    }

    /* define a node and proc object for ourselves as some parts
     * of ORTE and ORCM require it */
    if (NULL == (node = OBJ_NEW(orte_node_t))) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        error = "out of memory";
        goto error;
    }
    node->name = strdup(orte_process_info.nodename);
    opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
    if (NULL == (proc = OBJ_NEW(orte_proc_t))) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        error = "out of memory";
        goto error;
    }
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    OBJ_RETAIN(proc);
    node->daemon = proc;
    OBJ_RETAIN(node);
    proc->node = node;
    opal_pointer_array_set_item(jdata->procs, ORTE_PROC_MY_NAME->vpid, proc);

    /* For now, we only support a single scheduler daemon in the system.
     * This *may* change someday in the future */
    scheduler = (orcm_scheduler_t*)opal_list_get_first(orcm_schedulers);

    /* If we are in test mode, then we don't *require* that a scheduler
     * be defined in the system - otherwise, we do */
    if (NULL == scheduler) {
        if (mca_sst_orcmd_component.scheduler_reqd) {
            error = "no scheduler found";
            ret = ORTE_ERR_NOT_FOUND;
            goto error;
        }
    } else {
        ORTE_PROC_MY_SCHEDULER->jobid = scheduler->controller.daemon.jobid;
        ORTE_PROC_MY_SCHEDULER->vpid = scheduler->controller.daemon.vpid;
    }

    /* register the ORTE-level params at this time now that the
     * config has had a chance to push things into the environ
     */
    if (ORTE_SUCCESS != (ret = orte_register_params())) {
        OBJ_DESTRUCT(&buf);
        error = "orte_register_params";
        goto error;
    }

    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /* Set signal handlers to catch kill signals so we can properly clean up
     * after ourselves.
     */
    setup_sighandler(SIGTERM, &term_handler, shutdown_signal);
    setup_sighandler(SIGINT, &int_handler, shutdown_signal);

    /** setup callbacks for signals we should ignore */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback);
    signals_set = true;

#if OPAL_HAVE_HWLOC
    {
        hwloc_obj_t obj;
        unsigned i, j;

        /* get the local topology */
        if (NULL == opal_hwloc_topology) {
            if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
                OBJ_DESTRUCT(&buf);
                error = "topology discovery";
                goto error;
            }
        }

        /* remove the hostname from the topology. Unfortunately, hwloc
         * decided to add the source hostname to the "topology", thus
         * rendering it unusable as a pure topological description. So
         * we remove that information here.
         */
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        for (i=0; i < obj->infos_count; i++) {
            if (NULL == obj->infos[i].name ||
                NULL == obj->infos[i].value) {
                continue;
            }
            if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
                free(obj->infos[i].name);
                free(obj->infos[i].value);
                /* left justify the array */
                for (j=i; j < obj->infos_count-1; j++) {
                    obj->infos[j] = obj->infos[j+1];
                }
                obj->infos[obj->infos_count-1].name = NULL;
                obj->infos[obj->infos_count-1].value = NULL;
                obj->infos_count--;
                break;
            }
        }

        if (15 < opal_output_get_verbosity(orcm_sst_base_framework.framework_output)) {
            opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
        }

        /* if we were asked to bind to specific core(s), do so now */
        if (NULL != orte_daemon_cores) {
            char **cores=NULL, tmp[128];
            hwloc_obj_t pu;
            hwloc_cpuset_t ours, pucpus, res;
            int core;

            /* could be a collection of comma-delimited ranges, so
             * use our handy utility to parse it
             */
            orte_util_parse_range_options(orte_daemon_cores, &cores);
            if (NULL != cores) {
                ours = hwloc_bitmap_alloc();
                hwloc_bitmap_zero(ours);
                pucpus = hwloc_bitmap_alloc();
                res = hwloc_bitmap_alloc();
                for (i=0; NULL != cores[i]; i++) {
                    core = strtoul(cores[i], NULL, 10);
                    if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
                        orte_show_help("help-orted.txt", "orted:cannot-bind",
                                       true, orte_process_info.nodename,
                                       orte_daemon_cores);
                        ret = ORTE_ERR_NOT_SUPPORTED;
                        OBJ_DESTRUCT(&buf);
                        error = "cannot bind";
                        goto error;
                    }
                    hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset);
                    hwloc_bitmap_or(res, ours, pucpus);
                    hwloc_bitmap_copy(ours, res);
                }
                /* if the result is all zeros, then don't bind */
                if (!hwloc_bitmap_iszero(ours)) {
                    (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0);
                    if (opal_hwloc_report_bindings) {
                        opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours);
                        opal_output(0, "Daemon %s is bound to cores %s",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
                    }
                }
                /* cleanup */
                hwloc_bitmap_free(ours);
                hwloc_bitmap_free(pucpus);
                hwloc_bitmap_free(res);
                opal_argv_free(cores);
            }
        }
    }
#endif

    /* open and select the pstat framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "opal_pstat_base_select";
        goto error;
    }

    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_state_base_select";
        goto error;
    }

    /* open the notifier */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_notifier_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_notifier_base_open";
        goto error;
    }

    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_errmgr_base_open";
        goto error;
    }

    /* Setup the communication infrastructure */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_select";
        goto error;
    }
    if (!opal_list_get_size(&orte_oob_base.actives)) {
        ret = ORTE_ERROR;
        error = "orte_oob: Found 0 active transports";
        goto error;
    }

    /* Runtime Messaging Layer */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_select";
        goto error;
    }

    /* select the notifier*/
    if (ORTE_SUCCESS != (ret = orte_notifier_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_notifier_base_select";
        goto error;
    }

    /* select the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_errmgr_base_select";
        goto error;
    }

    /* Routed system */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_routed_base_select";
        goto error;
    }

    /* database */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_db_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orcm_db_base_open";
        goto error;
    }
    /* always restrict daemons to local database components */
    if (ORTE_SUCCESS != (ret = orcm_db_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orcm_db_base_select";
        goto error;
    }

    /* datastore - ensure we don't pickup the pmi component, but
     * don't override anything set by user
     */
    if (NULL == getenv(OPAL_MCA_PREFIX"dstore")) {
        putenv(OPAL_MCA_PREFIX"dstore=^pmi");
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_dstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_dstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_select";
        goto error;
    }
    /* create the handle */
    if (0 > (opal_dstore_internal = opal_dstore.open("INTERNAL", NULL, NULL))) {
        error = "opal dstore internal";
        ret = ORTE_ERR_FATAL;
        goto error;
    }

    /* extract the cluster description and setup the routed info - the orcm routed component
     * will know what to do. */
    n = 1;
    if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &clusterbuf, &n, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "extract cluster buf";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, clusterbuf))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        OBJ_RELEASE(clusterbuf);
        error = "orte_routed.init_routes";
        goto error;
    }
    OBJ_RELEASE(clusterbuf);

    /* extract the uri buffer and load the hash tables */
    n = 1;
    if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &uribuf, &n, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "extract uri buffer";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_update_contact_info(uribuf))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        OBJ_RELEASE(uribuf);
        error = "load hash tables";
        goto error;
    }
    OBJ_DESTRUCT(&buf);
    OBJ_RELEASE(uribuf);

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }

    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }

    /* enable communication with the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }

    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }

    /*
     * Initalize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't some user level tools may hang.
     */
    opal_cr_set_enabled(false);
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }

    /* setup the ANALYTICS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_analytics_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_analytics_base_open";
        goto error;
    }

    /* setup the EVGEN framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_evgen_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_evgen_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_evgen_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_evgen_select";
        goto error;
    }

    /* setup the SENSOR framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_sensor_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_sensor_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_sensor_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_sensor_select";
        goto error;
    }
    /* start the local sensors */
    orcm_sensor.start(ORTE_PROC_MY_NAME->jobid);

    /* setup the PWRMGMT framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_pwrmgmt_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_pwrmgmt_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_pwrmgmt_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_pwrmgmt_select";
        goto error;
    }

    /* setup the DFS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_select";
        goto error;
    }

    /* open and setup the DIAG framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_diag_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_diag_base_open";
        goto error;
    }
    if (ORCM_SUCCESS != (ret = orcm_diag_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_diag_select";
        goto error;
    }

    return ORTE_SUCCESS;
    
 error:
    orte_show_help("help-orcm-runtime.txt",
                   "orcm_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    return ORTE_ERR_SILENT;
}
Ejemplo n.º 17
0
int main(int argc, char *argv[])
{
    const struct hwloc_topology_support *support;
    hwloc_topology_t topology;
    hwloc_const_bitmap_t topocpuset;
    hwloc_bitmap_t cpuset;
    unsigned long flags = 0;
    DIR *dir;
    struct dirent *dirent;
    int show_all = 0;
    int show_threads = 0;
    int get_last_cpu_location = 0;
    char *callname;
    char *pidcmd = NULL;
    int err;
    int opt;

    callname = strrchr(argv[0], '/');
    if (!callname)
        callname = argv[0];
    else
        callname++;
    /* skip argv[0], handle options */
    argc--;
    argv++;

    hwloc_utils_check_api_version(callname);

    while (argc >= 1) {
        opt = 0;
        if (!strcmp(argv[0], "-a"))
            show_all = 1;
        else if (!strcmp(argv[0], "-l") || !strcmp(argv[0], "--logical")) {
            logical = 1;
        } else if (!strcmp(argv[0], "-p") || !strcmp(argv[0], "--physical")) {
            logical = 0;
        } else if (!strcmp(argv[0], "-c") || !strcmp(argv[0], "--cpuset")) {
            show_cpuset = 1;
        } else if (!strcmp(argv[0], "-e") || !strncmp(argv[0], "--get-last-cpu-location", 10)) {
            get_last_cpu_location = 1;
        } else if (!strcmp(argv[0], "-t") || !strcmp(argv[0], "--threads")) {
#ifdef HWLOC_LINUX_SYS
            show_threads = 1;
#else
            fprintf (stderr, "Listing threads is currently only supported on Linux\n");
#endif
        } else if (!strcmp (argv[0], "--whole-system")) {
            flags |= HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM;
        } else if (!strcmp (argv[0], "--pid-cmd")) {
            if (argc < 2) {
                usage(callname, stdout);
                exit(EXIT_FAILURE);
            }
            pidcmd = argv[1];
            opt = 1;
        } else {
            fprintf (stderr, "Unrecognized option: %s\n", argv[0]);
            usage (callname, stderr);
            exit(EXIT_FAILURE);
        }
        argc -= opt+1;
        argv += opt+1;
    }

    err = hwloc_topology_init(&topology);
    if (err)
        goto out;

    hwloc_topology_set_flags(topology, flags);

    err = hwloc_topology_load(topology);
    if (err)
        goto out_with_topology;

    support = hwloc_topology_get_support(topology);

    if (get_last_cpu_location) {
        if (!support->cpubind->get_proc_last_cpu_location)
            goto out_with_topology;
    } else {
        if (!support->cpubind->get_proc_cpubind)
            goto out_with_topology;
    }

    topocpuset = hwloc_topology_get_topology_cpuset(topology);

    dir  = opendir("/proc");
    if (!dir)
        goto out_with_topology;

    cpuset = hwloc_bitmap_alloc();
    if (!cpuset)
        goto out_with_dir;

    while ((dirent = readdir(dir))) {
        long pid_number;
        hwloc_pid_t pid;
        char pidoutput[1024];
        char *end;
        char name[64] = "";
        /* management of threads */
        unsigned boundthreads = 0, i;
        long *tids = NULL; /* NULL if process is not threaded */
        hwloc_bitmap_t *tidcpusets = NULL;

        pid_number = strtol(dirent->d_name, &end, 10);
        if (*end)
            /* Not a number */
            continue;

        pid = hwloc_pid_from_number(pid_number, 0);

#ifdef HWLOC_LINUX_SYS
        {
            unsigned pathlen = 6 + strlen(dirent->d_name) + 1 + 7 + 1;
            char *path;
            int file;
            ssize_t n;

            path = malloc(pathlen);
            snprintf(path, pathlen, "/proc/%s/cmdline", dirent->d_name);
            file = open(path, O_RDONLY);
            free(path);

            if (file >= 0) {
                n = read(file, name, sizeof(name) - 1);
                close(file);

                if (n <= 0)
                    /* Ignore kernel threads and errors */
                    continue;

                name[n] = 0;
            }
        }
#endif /* HWLOC_LINUX_SYS */

        if (show_threads) {
#ifdef HWLOC_LINUX_SYS
            /* check if some threads must be displayed */
            unsigned pathlen = 6 + strlen(dirent->d_name) + 1 + 4 + 1;
            char *path;
            DIR *taskdir;

            path = malloc(pathlen);
            snprintf(path, pathlen, "/proc/%s/task", dirent->d_name);
            taskdir = opendir(path);
            if (taskdir) {
                struct dirent *taskdirent;
                long tid;
                unsigned n = 0;
                /* count threads */
                while ((taskdirent = readdir(taskdir))) {
                    tid = strtol(taskdirent->d_name, &end, 10);
                    if (*end)
                        /* Not a number */
                        continue;
                    n++;
                }
                if (n > 1) {
                    /* if there's more than one thread, see if some are bound */
                    tids = malloc(n * sizeof(*tids));
                    tidcpusets = calloc(n+1, sizeof(*tidcpusets));
                    if (tids && tidcpusets) {
                        /* reread the directory but gather info now */
                        rewinddir(taskdir);
                        i = 0;
                        while ((taskdirent = readdir(taskdir))) {
                            tid = strtol(taskdirent->d_name, &end, 10);
                            if (*end)
                                /* Not a number */
                                continue;
                            if (get_last_cpu_location) {
                                if (hwloc_linux_get_tid_last_cpu_location(topology, tid, cpuset))
                                    continue;
                            } else {
                                if (hwloc_linux_get_tid_cpubind(topology, tid, cpuset))
                                    continue;
                            }
                            hwloc_bitmap_and(cpuset, cpuset, topocpuset);
                            tids[i] = tid;
                            tidcpusets[i] = hwloc_bitmap_dup(cpuset);
                            i++;
                            if (hwloc_bitmap_iszero(cpuset))
                                continue;
                            if (hwloc_bitmap_isequal(cpuset, topocpuset) && !show_all)
                                continue;
                            boundthreads++;
                        }
                    } else {
                        /* failed to alloc, behave as if there were no threads */
                        free(tids);
                        tids = NULL;
                        free(tidcpusets);
                        tidcpusets = NULL;
                    }
                }
                closedir(taskdir);
            }
#endif /* HWLOC_LINUX_SYS */
        }

        if (get_last_cpu_location) {
            if (hwloc_get_proc_last_cpu_location(topology, pid, cpuset, 0))
                continue;
        } else {
            if (hwloc_get_proc_cpubind(topology, pid, cpuset, 0))
                continue;
        }

        hwloc_bitmap_and(cpuset, cpuset, topocpuset);
        if (hwloc_bitmap_iszero(cpuset))
            continue;

        /* don't print anything if the process isn't bound and if no threads are bound and if not showing all */
        if (hwloc_bitmap_isequal(cpuset, topocpuset) && (!tids || !boundthreads) && !show_all)
            continue;

        pidoutput[0] = '\0';
        if (pidcmd) {
            char *cmd;
            FILE *file;
            cmd = malloc(strlen(pidcmd)+1+5+2+1);
            sprintf(cmd, "%s %u", pidcmd, pid);
            file = popen(cmd, "r");
            if (file) {
                if (fgets(pidoutput, sizeof(pidoutput), file)) {
                    end = strchr(pidoutput, '\n');
                    if (end)
                        *end = '\0';
                }
                pclose(file);
            }
            free(cmd);
        }

        /* print the process */
        print_task(topology, pid_number, name, cpuset, pidoutput[0] == '\0' ? NULL : pidoutput, 0);
        if (tids)
            /* print each tid we found (it's tidcpuset isn't NULL anymore) */
            for(i=0; tidcpusets[i] != NULL; i++) {
                print_task(topology, tids[i], "", tidcpusets[i], NULL, 1);
                hwloc_bitmap_free(tidcpusets[i]);
            }

        /* free threads stuff */
        free(tidcpusets);
        free(tids);
    }

    err = 0;
    hwloc_bitmap_free(cpuset);

out_with_dir:
    closedir(dir);
out_with_topology:
    hwloc_topology_destroy(topology);
out:
    return err;
}
Ejemplo n.º 18
0
static void create_hwloc_cpusets() {
#ifdef USE_HWLOC
    int i;

    int err = hwloc_topology_init(&topology);
    assert(err == 0);

    err = hwloc_topology_load(topology);
    assert(err == 0);

    hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
    assert(cpuset);

    err = hwloc_get_cpubind(topology, cpuset, HWLOC_CPUBIND_PROCESS);
    assert(err == 0);
    const int available_pus = hwloc_bitmap_weight(cpuset);
    const int last_set_index = hwloc_bitmap_last(cpuset);
    const int num_workers = hc_context->nworkers;

    hclib_affinity_t selected_affinity = HCLIB_AFFINITY_STRIDED;
    const char *user_selected_affinity = getenv("HCLIB_AFFINITY");
    if (user_selected_affinity) {
        if (strcmp(user_selected_affinity, "strided") == 0) {
            selected_affinity = HCLIB_AFFINITY_STRIDED;
        } else if (strcmp(user_selected_affinity, "chunked") == 0) {
            selected_affinity = HCLIB_AFFINITY_CHUNKED;
        } else {
            fprintf(stderr, "Unsupported thread affinity \"%s\" specified with "
                    "HCLIB_AFFINITY.\n", user_selected_affinity);
            exit(1);
        }
    }

    thread_cpusets = (hwloc_bitmap_t *)malloc(hc_context->nworkers *
            sizeof(*thread_cpusets));
    assert(thread_cpusets);

    for (i = 0; i < hc_context->nworkers; i++) {
        thread_cpusets[i] = hwloc_bitmap_alloc();
        assert(thread_cpusets[i]);
    }

    switch (selected_affinity) {
        case (HCLIB_AFFINITY_STRIDED): {
            if (available_pus < num_workers) {
                fprintf(stderr, "ERROR Available PUs (%d) was less than number "
                        "of workers (%d), don't currently support "
                        "oversubscription with strided thread pinning\n",
                        available_pus, num_workers);
                exit(1);
            }

            int count = 0;
            int index = 0;
            while (index <= last_set_index) {
                if (hwloc_bitmap_isset(cpuset, index)) {
                    hwloc_bitmap_set(thread_cpusets[count % num_workers],
                            index);
                    count++;
                }
                index++;
            }
            break;
        }
        case (HCLIB_AFFINITY_CHUNKED): {
            const int chunk_size = (available_pus + num_workers - 1) /
                    num_workers;
            int count = 0;
            int index = 0;
            while (index <= last_set_index) {
                if (hwloc_bitmap_isset(cpuset, index)) {
                    hwloc_bitmap_set(thread_cpusets[count / chunk_size], index);
                    count++;
                }
                index++;
            }
            break;
        }
        default:
            assert(false);
    }

    hwloc_bitmap_t nodeset = hwloc_bitmap_alloc();
    hwloc_bitmap_t other_nodeset = hwloc_bitmap_alloc();
    assert(nodeset && other_nodeset);

    /*
     * Here, we look for contiguous ranges of worker threads that share any NUMA
     * nodes with us. In theory, this should be more hierarchical but isn't yet.
     * This is also super inefficient... O(T^2) where T is the number of
     * workers.
     */
    bool revert_to_naive_stealing = false;
    for (i = 0; i < hc_context->nworkers; i++) {
        // Get the NUMA nodes for this CPU set
        hwloc_cpuset_to_nodeset(topology, thread_cpusets[i], nodeset);

        int base = -1;
        int limit = -1;
        int j;
        for (j = 0; j < hc_context->nworkers; j++) {
            hwloc_cpuset_to_nodeset(topology, thread_cpusets[j], other_nodeset);
            // Take the intersection, see if there is any overlap
            hwloc_bitmap_and(other_nodeset, nodeset, other_nodeset);

            if (base < 0) {
                // Haven't found a contiguous chunk of workers yet.
                if (!hwloc_bitmap_iszero(other_nodeset)) {
                    base = j;
                }
            } else {
                /*
                 * Have a contiguous chunk of workers, either still inside it or
                 * after it.
                 */
                if (limit < 0) {
                    // Inside the contiguous chunk of workers
                    if (hwloc_bitmap_iszero(other_nodeset)) {
                        // Found the end
                        limit = j;
                    }
                } else {
                    // After the contiguous chunk of workers
                    if (!hwloc_bitmap_iszero(other_nodeset)) {
                        // No contiguous chunk to find, just do something naive.
                        revert_to_naive_stealing = true;
                        break;
                    }
                }
            }
        }

        if (revert_to_naive_stealing) {
            fprintf(stderr, "WARNING: Using naive work-stealing patterns.\n");
            base = 0;
            limit = hc_context->nworkers;
        } else {
            assert(base >= 0);
            if (limit < 0) {
                limit = hc_context->nworkers;
            }
        }

        hc_context->workers[i]->base_intra_socket_workers = base;
        hc_context->workers[i]->limit_intra_socket_workers = limit;

#ifdef VERBOSE
        char *nbuf;
        hwloc_bitmap_asprintf(&nbuf, nodeset);

        char *buffer;
        hwloc_bitmap_asprintf(&buffer, thread_cpusets[i]);
        fprintf(stderr, "Worker %d has access to %d PUs (%s), %d NUMA nodes "
                "(%s). Shared NUMA nodes with [%d, %d).\n", i,
                hwloc_bitmap_weight(thread_cpusets[i]), buffer,
                hwloc_bitmap_weight(nodeset), nbuf, base, limit);
        free(buffer);
#endif
    }

#endif
}
Ejemplo n.º 19
0
int main(void)
{
  hwloc_topology_t topology;
#ifdef HWLOC_HAVE_CPU_SET
  unsigned depth;
  hwloc_bitmap_t hwlocset;
  cpu_set_t schedset;
  hwloc_obj_t obj;
  int err;
#endif /* HWLOC_HAVE_CPU_SET */

  hwloc_topology_init(&topology);
  hwloc_topology_load(topology);

#ifdef HWLOC_HAVE_CPU_SET

  depth = hwloc_topology_get_depth(topology);

  hwlocset = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset(topology));
  hwloc_cpuset_to_glibc_sched_affinity(topology, hwlocset, &schedset, sizeof(schedset));
#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
  err = sched_setaffinity(0, sizeof(schedset));
#else
  err = sched_setaffinity(0, sizeof(schedset), &schedset);
#endif
  assert(!err);
  hwloc_bitmap_free(hwlocset);

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
  err = sched_getaffinity(0, sizeof(schedset));
#else
  err = sched_getaffinity(0, sizeof(schedset), &schedset);
#endif
  assert(!err);
  hwlocset = hwloc_bitmap_alloc();
  hwloc_cpuset_from_glibc_sched_affinity(topology, hwlocset, &schedset, sizeof(schedset));
  assert(hwloc_bitmap_isincluded(hwlocset, hwloc_topology_get_complete_cpuset(topology)));
  hwloc_bitmap_andnot(hwlocset, hwlocset, hwloc_topology_get_online_cpuset(topology));
  hwloc_bitmap_andnot(hwlocset, hwlocset, hwloc_topology_get_allowed_cpuset(topology));
  assert(hwloc_bitmap_iszero(hwlocset));
  hwloc_bitmap_free(hwlocset);

  obj = hwloc_get_obj_by_depth(topology, depth-1, hwloc_get_nbobjs_by_depth(topology, depth-1) - 1);
  assert(obj);
  assert(obj->type == HWLOC_OBJ_PU);

  hwlocset = hwloc_bitmap_dup(obj->cpuset);
  hwloc_cpuset_to_glibc_sched_affinity(topology, hwlocset, &schedset, sizeof(schedset));
#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
  err = sched_setaffinity(0, sizeof(schedset));
#else
  err = sched_setaffinity(0, sizeof(schedset), &schedset);
#endif
  assert(!err);
  hwloc_bitmap_free(hwlocset);

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
  err = sched_getaffinity(0, sizeof(schedset));
#else
  err = sched_getaffinity(0, sizeof(schedset), &schedset);
#endif
  assert(!err);
  hwlocset = hwloc_bitmap_alloc();
  hwloc_cpuset_from_glibc_sched_affinity(topology, hwlocset, &schedset, sizeof(schedset));
  assert(hwloc_bitmap_isequal(hwlocset, obj->cpuset));
  hwloc_bitmap_free(hwlocset);

#endif /* HWLOC_HAVE_CPU_SET */

  hwloc_topology_destroy(topology);
  return 0;
}
Ejemplo n.º 20
0
int orte_daemon(int argc, char *argv[])
{
    int ret = 0;
    opal_cmd_line_t *cmd_line = NULL;
    int i;
    opal_buffer_t *buffer;
    char hostname[OPAL_MAXHOSTNAMELEN];
#if OPAL_ENABLE_FT_CR == 1
    char *tmp_env_var = NULL;
#endif

    /* initialize the globals */
    memset(&orted_globals, 0, sizeof(orted_globals));
    /* initialize the singleton died pipe to an illegal value so we can detect it was set */
    orted_globals.singleton_died_pipe = -1;
    bucket = OBJ_NEW(opal_buffer_t);

    /* setup to check common command line options that just report and die */
    cmd_line = OBJ_NEW(opal_cmd_line_t);
    if (OPAL_SUCCESS != opal_cmd_line_create(cmd_line, orte_cmd_line_opts)) {
        OBJ_RELEASE(cmd_line);
        exit(1);
    }
    mca_base_cmd_line_setup(cmd_line);
    if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false, false,
                                                   argc, argv))) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        fprintf(stderr, "Usage: %s [OPTION]...\n%s\n", argv[0], args);
        free(args);
        OBJ_RELEASE(cmd_line);
        return ret;
    }

    /*
     * Since this process can now handle MCA/GMCA parameters, make sure to
     * process them.
     */
    mca_base_cmd_line_process_args(cmd_line, &environ, &environ);

    /* Ensure that enough of OPAL is setup for us to be able to run */
    /*
     * NOTE: (JJH)
     *  We need to allow 'mca_base_cmd_line_process_args()' to process command
     *  line arguments *before* calling opal_init_util() since the command
     *  line could contain MCA parameters that affect the way opal_init_util()
     *  functions. AMCA parameters are one such option normally received on the
     *  command line that affect the way opal_init_util() behaves.
     *  It is "safe" to call mca_base_cmd_line_process_args() before
     *  opal_init_util() since mca_base_cmd_line_process_args() does *not*
     *  depend upon opal_init_util() functionality.
     */
    if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
        fprintf(stderr, "OPAL failed to initialize -- orted aborting\n");
        exit(1);
    }

    /* save the environment for launch purposes. This MUST be
     * done so that we can pass it to any local procs we
     * spawn - otherwise, those local procs won't see any
     * non-MCA envars that were set in the enviro when the
     * orted was executed - e.g., by .csh
     */
    orte_launch_environ = opal_argv_copy(environ);

    /* purge any ess/pmix flags set in the environ when we were launched */
    opal_unsetenv(OPAL_MCA_PREFIX"ess", &orte_launch_environ);
    opal_unsetenv(OPAL_MCA_PREFIX"pmix", &orte_launch_environ);

    /* if orte_daemon_debug is set, let someone know we are alive right
     * away just in case we have a problem along the way
     */
    if (orted_globals.debug) {
        gethostname(hostname, sizeof(hostname));
        fprintf(stderr, "Daemon was launched on %s - beginning to initialize\n", hostname);
    }

    /* check for help request */
    if (orted_globals.help) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        orte_show_help("help-orted.txt", "orted:usage", false,
                       argv[0], args);
        free(args);
        return 1;
    }
#if defined(HAVE_SETSID)
    /* see if we were directed to separate from current session */
    if (orted_globals.set_sid) {
        setsid();
    }
#endif
    /* see if they want us to spin until they can connect a debugger to us */
    i=0;
    while (orted_spin_flag) {
        i++;
        if (1000 < i) i=0;
    }

#if OPAL_ENABLE_FT_CR == 1
    /* Mark as a tool program */
    (void) mca_base_var_env_name ("opal_cr_is_tool", &tmp_env_var);
    opal_setenv(tmp_env_var,
                "1",
                true, &environ);
    free(tmp_env_var);
#endif

    /* detach from controlling terminal
     * otherwise, remain attached so output can get to us
     */
    if(!orte_debug_flag &&
       !orte_debug_daemons_flag &&
       orted_globals.daemonize) {
        opal_daemon_init(NULL);
    }

    /* Set the flag telling OpenRTE that I am NOT a
     * singleton, but am "infrastructure" - prevents setting
     * up incorrect infrastructure that only a singleton would
     * require.
     */
    if (orted_globals.hnp) {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    } else {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_DAEMON))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    }

    /* finalize the OPAL utils. As they are opened again from orte_init->opal_init
     * we continue to have a reference count on them. So we have to finalize them twice...
     */
    opal_finalize_util();

    /* bind ourselves if so directed */
    if (NULL != orte_daemon_cores) {
        char **cores=NULL, tmp[128];
        hwloc_obj_t pu;
        hwloc_cpuset_t ours, res;
        int core;

        /* could be a collection of comma-delimited ranges, so
         * use our handy utility to parse it
         */
        orte_util_parse_range_options(orte_daemon_cores, &cores);
        if (NULL != cores) {
            ours = hwloc_bitmap_alloc();
            hwloc_bitmap_zero(ours);
            res = hwloc_bitmap_alloc();
            for (i=0; NULL != cores[i]; i++) {
                core = strtoul(cores[i], NULL, 10);
                if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
                    /* turn off the show help forwarding as we won't
                     * be able to cycle the event library to send
                     */
                    orte_show_help_finalize();
                    /* the message will now come out locally */
                    orte_show_help("help-orted.txt", "orted:cannot-bind",
                                   true, orte_process_info.nodename,
                                   orte_daemon_cores);
                    ret = ORTE_ERR_NOT_SUPPORTED;
                    hwloc_bitmap_free(ours);
                    hwloc_bitmap_free(res);
                    goto DONE;
                }
                hwloc_bitmap_or(res, ours, pu->cpuset);
                hwloc_bitmap_copy(ours, res);
            }
            /* if the result is all zeros, then don't bind */
            if (!hwloc_bitmap_iszero(ours)) {
                (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0);
                if (opal_hwloc_report_bindings) {
                    opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours);
                    opal_output(0, "Daemon %s is bound to cores %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
                }
            }
            /* cleanup */
            hwloc_bitmap_free(ours);
            hwloc_bitmap_free(res);
            opal_argv_free(cores);
        }
    }

    if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
        orted_globals.abort=false;
        /* some vpid was ordered to fail. The value can be positive
         * or negative, depending upon the desired method for failure,
         * so need to check both here
         */
        if (0 > orted_debug_failure) {
            orted_debug_failure = -1*orted_debug_failure;
            orted_globals.abort = true;
        }
        /* are we the specified vpid? */
        if ((int)ORTE_PROC_MY_NAME->vpid == orted_debug_failure) {
            /* if the user specified we delay, then setup a timer
             * and have it kill us
             */
            if (0 < orted_debug_failure_delay) {
                ORTE_TIMER_EVENT(orted_debug_failure_delay, 0, shutdown_callback, ORTE_SYS_PRI);

            } else {
                opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            orted_globals.abort ? "abort" : "abnormal termination");

                /* do -not- call finalize as this will send a message to the HNP
                 * indicating clean termination! Instead, just forcibly cleanup
                 * the local session_dir tree and exit
                 */
                orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

                /* if we were ordered to abort, do so */
                if (orted_globals.abort) {
                    abort();
                }

                /* otherwise, return with non-zero status */
                ret = ORTE_ERROR_DEFAULT_EXIT_CODE;
                goto DONE;
            }
        }
    }

    /* insert our contact info into our process_info struct so we
     * have it for later use and set the local daemon field to our name
     */
    orte_oob_base_get_addr(&orte_process_info.my_daemon_uri);
    if (NULL == orte_process_info.my_daemon_uri) {
        /* no way to communicate */
        ret = ORTE_ERROR;
        goto DONE;
    }
    ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid;
    ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid;

    /* if I am also the hnp, then update that contact info field too */
    if (ORTE_PROC_IS_HNP) {
        orte_process_info.my_hnp_uri = strdup(orte_process_info.my_daemon_uri);
        ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
        ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
    }

    /* setup the primary daemon command receive function */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                            ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);

    /* output a message indicating we are alive, our name, and our pid
     * for debugging purposes
     */
    if (orte_debug_daemons_flag) {
        fprintf(stderr, "Daemon %s checking in as pid %ld on host %s\n",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)orte_process_info.pid,
                orte_process_info.nodename);
    }

    /* We actually do *not* want the orted to voluntarily yield() the
       processor more than necessary.  The orted already blocks when
       it is doing nothing, so it doesn't use any more CPU cycles than
       it should; but when it *is* doing something, we do not want it
       to be unnecessarily delayed because it voluntarily yielded the
       processor in the middle of its work.

       For example: when a message arrives at the orted, we want the
       OS to wake up the orted in a timely fashion (which most OS's
       seem good about doing) and then we want the orted to process
       the message as fast as possible.  If the orted yields and lets
       aggressive MPI applications get the processor back, it may be a
       long time before the OS schedules the orted to run again
       (particularly if there is no IO event to wake it up).  Hence,
       routed OOB messages (for example) may be significantly delayed
       before being delivered to MPI processes, which can be
       problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
       require OOB messages for wireup, etc.). */
    opal_progress_set_yield_when_idle(false);

    /* Change the default behavior of libevent such that we want to
       continually block rather than blocking for the default timeout
       and then looping around the progress engine again.  There
       should be nothing in the orted that cannot block in libevent
       until "something" happens (i.e., there's no need to keep
       cycling through progress because the only things that should
       happen will happen in libevent).  This is a minor optimization,
       but what the heck... :-) */
    opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);

    /* if requested, report my uri to the indicated pipe */
    if (orted_globals.uri_pipe > 0) {
        orte_job_t *jdata;
        orte_proc_t *proc;
        orte_node_t *node;
        orte_app_context_t *app;
        char *tmp, *nptr, *sysinfo;
        char **singenv=NULL, *string_key, *env_str;

        /* setup the singleton's job */
        jdata = OBJ_NEW(orte_job_t);
        /* default to ompi for now */
        opal_argv_append_nosize(&jdata->personality, "ompi");
        orte_plm_base_create_jobid(jdata);
        opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);

        /* must create a map for it (even though it has no
         * info in it) so that the job info will be picked
         * up in subsequent pidmaps or other daemons won't
         * know how to route
         */
        jdata->map = OBJ_NEW(orte_job_map_t);

        /* setup an app_context for the singleton */
        app = OBJ_NEW(orte_app_context_t);
        app->app = strdup("singleton");
        app->num_procs = 1;
        opal_pointer_array_add(jdata->apps, app);
        jdata->num_apps = 1;

        /* setup a proc object for the singleton - since we
         * -must- be the HNP, and therefore we stored our
         * node on the global node pool, and since the singleton
         * -must- be on the same node as us, indicate that
         */
        proc = OBJ_NEW(orte_proc_t);
        proc->name.jobid = jdata->jobid;
        proc->name.vpid = 0;
        proc->parent = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE);
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        /* obviously, it is on my node */
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        proc->node = node;
        OBJ_RETAIN(node);  /* keep accounting straight */
        opal_pointer_array_add(jdata->procs, proc);
        jdata->num_procs = 1;
        /* add the node to the job map */
        OBJ_RETAIN(node);
        opal_pointer_array_add(jdata->map->nodes, node);
        jdata->map->num_nodes++;
        /* and it obviously is on the node */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(node->procs, proc);
        node->num_procs++;
        /* and obviously it is one of my local procs */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(orte_local_children, proc);
        jdata->num_local_procs = 1;
        /* set the trivial */
        proc->local_rank = 0;
        proc->node_rank = 0;
        proc->app_rank = 0;
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL);

        /* set the ORTE_JOB_TRANSPORT_KEY from the environment */
        orte_pre_condition_transports(jdata, NULL);

        /* register the singleton's nspace with our PMIx server */
        if (ORTE_SUCCESS != (ret = orte_pmix_server_register_nspace(jdata, false))) {
          ORTE_ERROR_LOG(ret);
          goto DONE;
        }
        /* use setup fork to create the envars needed by the singleton */
        if (OPAL_SUCCESS != (ret = opal_pmix.server_setup_fork(&proc->name, &singenv))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }

        /* append the transport key to the envars needed by the singleton */
        if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_TRANSPORT_KEY, (void**)&string_key, OPAL_STRING) || NULL == string_key) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            goto DONE;
        }
        asprintf(&env_str, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key);
        opal_argv_append_nosize(&singenv, env_str);
        free(env_str);

        nptr = opal_argv_join(singenv, '*');
        opal_argv_free(singenv);

        /* create a string that contains our uri + sysinfo + PMIx server URI envars */
        orte_util_convert_sysinfo_to_string(&sysinfo, orte_local_cpu_type, orte_local_cpu_model);
        asprintf(&tmp, "%s[%s]%s", orte_process_info.my_daemon_uri, sysinfo, nptr);
        free(sysinfo);
        free(nptr);

        /* pass that info to the singleton */
        if (OPAL_SUCCESS != (ret = opal_fd_write(orted_globals.uri_pipe, strlen(tmp)+1, tmp))) { ; /* need to add 1 to get the NULL */
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }

        /* cleanup */
        free(tmp);
        close(orted_globals.uri_pipe);

        /* since a singleton spawned us, we need to harvest
         * any MCA params from the local environment so
         * we can pass them along to any subsequent daemons
         * we may start as the result of a comm_spawn
         */
        for (i=0; NULL != environ[i]; i++) {
            if (0 == strncmp(environ[i], OPAL_MCA_PREFIX, 9)) {
                /* make a copy to manipulate */
                tmp = strdup(environ[i]);
                /* find the equal sign */
                nptr = strchr(tmp, '=');
                *nptr = '\0';
                nptr++;
                /* add the mca param to the orted cmd line */
                opal_argv_append_nosize(&orted_cmd_line, "-"OPAL_MCA_CMD_LINE_ID);
                opal_argv_append_nosize(&orted_cmd_line, &tmp[9]);
                opal_argv_append_nosize(&orted_cmd_line, nptr);
                free(tmp);
            }
        }
    }

    /* if we were given a pipe to monitor for singleton termination, set that up */
    if (orted_globals.singleton_died_pipe > 0) {
        /* register shutdown handler */
        pipe_handler = (opal_event_t*)malloc(sizeof(opal_event_t));
        opal_event_set(orte_event_base, pipe_handler,
                       orted_globals.singleton_died_pipe,
                       OPAL_EV_READ,
                       pipe_closed,
                       pipe_handler);
        opal_event_add(pipe_handler, NULL);
    }

    /* If I have a parent, then save his contact info so
     * any messages we send can flow thru him.
     */
    orte_parent_uri = NULL;
    (void) mca_base_var_register ("orte", "orte", NULL, "parent_uri",
                                  "URI for the parent if tree launch is enabled.",
                                  MCA_BASE_VAR_TYPE_STRING, NULL, 0,
                                  MCA_BASE_VAR_FLAG_INTERNAL,
                                  OPAL_INFO_LVL_9,
                                  MCA_BASE_VAR_SCOPE_CONSTANT,
                                  &orte_parent_uri);
    if (NULL != orte_parent_uri) {
        orte_process_name_t parent;
        opal_value_t val;

        /* set the contact info into our local database */
        ret = orte_rml_base_parse_uris(orte_parent_uri, &parent, NULL);
        if (ORTE_SUCCESS != ret) {
            ORTE_ERROR_LOG(ret);
            free (orte_parent_uri);
            orte_parent_uri = NULL;
            goto DONE;
        }
        OBJ_CONSTRUCT(&val, opal_value_t);
        val.key = OPAL_PMIX_PROC_URI;
        val.type = OPAL_STRING;
        val.data.string = orte_parent_uri;
        if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&parent, &val))) {
            ORTE_ERROR_LOG(ret);
            OBJ_DESTRUCT(&val);
            goto DONE;
        }
        val.key = NULL;
        val.data.string = NULL;
        OBJ_DESTRUCT(&val);

        /* don't need this value anymore */
        free(orte_parent_uri);
        orte_parent_uri = NULL;

        /* tell the routed module that we have a path
         * back to the HNP
         */
        if (ORTE_SUCCESS != (ret = orte_routed.update_route(NULL, ORTE_PROC_MY_HNP, &parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
        /* set the lifeline to point to our parent so that we
         * can handle the situation if that lifeline goes away
         */
        if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(NULL, &parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
    }

    /* if we are not the HNP...the only time we will be an HNP
     * is if we are launched by a singleton to provide support
     * for it
     */
    if (!ORTE_PROC_IS_HNP) {
        orte_process_name_t target;
        target.jobid = ORTE_PROC_MY_NAME->jobid;

        if (orte_fwd_mpirun_port || orte_static_ports) {
            /* setup the rollup callback */
            orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK,
                                    ORTE_RML_PERSISTENT, rollup, NULL);
            target.vpid = ORTE_PROC_MY_NAME->vpid;
            /* since we will be waiting for any children to send us
             * their rollup info before sending to our parent, save
             * a little time in the launch phase by "warming up" the
             * connection to our parent while we wait for our children */
            buffer = OBJ_NEW(opal_buffer_t);  // zero-byte message
            if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
                                                   ORTE_PROC_MY_PARENT, buffer,
                                                   ORTE_RML_TAG_WARMUP_CONNECTION,
                                                   orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(buffer);
                goto DONE;
            }
        } else {
            target.vpid = 0;
        }

        /* send the information to the orted report-back point - this function
         * will process the data, but also counts the number of
         * orteds that reported back so the launch procedure can continue.
         * We need to do this at the last possible second as the HNP
         * can turn right around and begin issuing orders to us
         */

        buffer = OBJ_NEW(opal_buffer_t);
        /* insert our name for rollup purposes */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }

        /* get any connection info we may have pushed */
        {
            opal_value_t *val = NULL, *kv;
            opal_list_t *modex;
            int32_t flag;

            if (OPAL_SUCCESS != (ret = opal_pmix.get(ORTE_PROC_MY_NAME, NULL, NULL, &val)) || NULL == val) {
                /* just pack a marker indicating we don't have any to share */
                flag = 0;
                if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buffer);
                    goto DONE;
                }
            } else {
                /* the data is returned as a list of key-value pairs in the opal_value_t */
                if (OPAL_PTR == val->type) {
                    modex = (opal_list_t*)val->data.ptr;
                    flag = (int32_t)opal_list_get_size(modex);
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(buffer);
                        goto DONE;
                    }
                    OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
                        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &kv, 1, OPAL_VALUE))) {
                            ORTE_ERROR_LOG(ret);
                            OBJ_RELEASE(buffer);
                            goto DONE;
                        }
                    }
                    OPAL_LIST_RELEASE(modex);
                } else {
                    opal_output(0, "VAL KEY: %s", (NULL == val->key) ? "NULL" : val->key);
                    /* single value */
                    flag = 1;
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(buffer);
                        goto DONE;
                    }
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &val, 1, OPAL_VALUE))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(buffer);
                        goto DONE;
                    }
                }
                OBJ_RELEASE(val);
            }
Ejemplo n.º 21
0
int main(int argc, char *argv[])
{
  hwloc_topology_t topology;
  unsigned depth;
  hwloc_bitmap_t cpubind_set, membind_set;
  int got_cpubind = 0, got_membind = 0;
  int working_on_cpubind = 1; /* membind if 0 */
  int get_binding = 0;
  int get_last_cpu_location = 0;
  unsigned long flags = HWLOC_TOPOLOGY_FLAG_WHOLE_IO|HWLOC_TOPOLOGY_FLAG_ICACHES;
  int force = 0;
  int single = 0;
  int verbose = 0;
  int logical = 1;
  int taskset = 0;
  int cpubind_flags = 0;
  hwloc_membind_policy_t membind_policy = HWLOC_MEMBIND_BIND;
  int membind_flags = 0;
  int opt;
  int ret;
  int pid_number = 0;
  hwloc_pid_t pid;
  char *callname;

  cpubind_set = hwloc_bitmap_alloc();
  membind_set = hwloc_bitmap_alloc();

  hwloc_topology_init(&topology);
  hwloc_topology_set_flags(topology, flags);
  hwloc_topology_load(topology);
  depth = hwloc_topology_get_depth(topology);

  callname = argv[0];
  /* skip argv[0], handle options */
  argv++;
  argc--;

  while (argc >= 1) {
    if (!strcmp(argv[0], "--")) {
      argc--;
      argv++;
      break;
    }

    opt = 0;

    if (*argv[0] == '-') {
      if (!strcmp(argv[0], "-v") || !strcmp(argv[0], "--verbose")) {
	verbose++;
	goto next;
      }
      else if (!strcmp(argv[0], "-q") || !strcmp(argv[0], "--quiet")) {
	verbose--;
	goto next;
      }
      else if (!strcmp(argv[0], "--help")) {
        usage("hwloc-bind", stdout);
	return EXIT_SUCCESS;
      }
      else if (!strcmp(argv[0], "--single")) {
	single = 1;
	goto next;
      }
      else if (!strcmp(argv[0], "-f") || !strcmp(argv[0], "--force")) {
	force = 1;
	goto next;
      }
      else if (!strcmp(argv[0], "--strict")) {
	cpubind_flags |= HWLOC_CPUBIND_STRICT;
	membind_flags |= HWLOC_MEMBIND_STRICT;
	goto next;
      }
      else if (!strcmp(argv[0], "--pid")) {
        if (argc < 2) {
          usage ("hwloc-bind", stderr);
          exit(EXIT_FAILURE);
        }
        pid_number = atoi(argv[1]);
        opt = 1;
        goto next;
      }
      else if (!strcmp (argv[0], "--version")) {
          printf("%s %s\n", callname, VERSION);
          exit(EXIT_SUCCESS);
      }
      if (!strcmp(argv[0], "-l") || !strcmp(argv[0], "--logical")) {
        logical = 1;
        goto next;
      }
      if (!strcmp(argv[0], "-p") || !strcmp(argv[0], "--physical")) {
        logical = 0;
        goto next;
      }
      if (!strcmp(argv[0], "--taskset")) {
        taskset = 1;
        goto next;
      }
      else if (!strcmp (argv[0], "-e") || !strncmp (argv[0], "--get-last-cpu-location", 10)) {
	get_last_cpu_location = 1;
	goto next;
      }
      else if (!strcmp (argv[0], "--get")) {
	get_binding = 1;
	goto next;
      }
      else if (!strcmp (argv[0], "--cpubind")) {
	  working_on_cpubind = 1;
	  goto next;
      }
      else if (!strcmp (argv[0], "--membind")) {
	  working_on_cpubind = 0;
	  goto next;
      }
      else if (!strcmp (argv[0], "--mempolicy")) {
	if (!strncmp(argv[1], "default", 2))
	  membind_policy = HWLOC_MEMBIND_DEFAULT;
	else if (!strncmp(argv[1], "firsttouch", 2))
	  membind_policy = HWLOC_MEMBIND_FIRSTTOUCH;
	else if (!strncmp(argv[1], "bind", 2))
	  membind_policy = HWLOC_MEMBIND_BIND;
	else if (!strncmp(argv[1], "interleave", 2))
	  membind_policy = HWLOC_MEMBIND_INTERLEAVE;
	else if (!strncmp(argv[1], "replicate", 2))
	  membind_policy = HWLOC_MEMBIND_REPLICATE;
	else if (!strncmp(argv[1], "nexttouch", 2))
	  membind_policy = HWLOC_MEMBIND_NEXTTOUCH;
	else {
	  fprintf(stderr, "Unrecognized memory binding policy %s\n", argv[1]);
          usage ("hwloc-bind", stderr);
          exit(EXIT_FAILURE);
	}
	opt = 1;
	goto next;
      }
      else if (!strcmp (argv[0], "--whole-system")) {
	flags |= HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM;
	hwloc_topology_destroy(topology);
	hwloc_topology_init(&topology);
	hwloc_topology_set_flags(topology, flags);
	hwloc_topology_load(topology);
	depth = hwloc_topology_get_depth(topology);
	goto next;
      }
      else if (!strcmp (argv[0], "--restrict")) {
	hwloc_bitmap_t restrictset;
	int err;
	if (argc < 2) {
	  usage (callname, stdout);
	  exit(EXIT_FAILURE);
	}
	restrictset = hwloc_bitmap_alloc();
	hwloc_bitmap_sscanf(restrictset, argv[1]);
	err = hwloc_topology_restrict (topology, restrictset, 0);
	if (err) {
	  perror("Restricting the topology");
	  /* fallthrough */
	}
	hwloc_bitmap_free(restrictset);
	argc--;
	argv++;
	goto next;
      }

      fprintf (stderr, "Unrecognized option: %s\n", argv[0]);
      usage("hwloc-bind", stderr);
      return EXIT_FAILURE;
    }

    ret = hwloc_calc_process_arg(topology, depth, argv[0], logical,
				 working_on_cpubind ? cpubind_set : membind_set,
				 verbose);
    if (ret < 0) {
      if (verbose > 0)
	fprintf(stderr, "assuming the command starts at %s\n", argv[0]);
      break;
    }
    if (working_on_cpubind)
      got_cpubind = 1;
    else
      got_membind = 1;

  next:
    argc -= opt+1;
    argv += opt+1;
  }

  pid = hwloc_pid_from_number(pid_number, !(get_binding || get_last_cpu_location));

  if (get_binding || get_last_cpu_location) {
    char *s;
    const char *policystr = NULL;
    int err;
    if (working_on_cpubind) {
      if (get_last_cpu_location) {
	if (pid_number)
	  err = hwloc_get_proc_last_cpu_location(topology, pid, cpubind_set, 0);
	else
	  err = hwloc_get_last_cpu_location(topology, cpubind_set, 0);
      } else {
	if (pid_number)
	  err = hwloc_get_proc_cpubind(topology, pid, cpubind_set, 0);
	else
	  err = hwloc_get_cpubind(topology, cpubind_set, 0);
      }
      if (err) {
	const char *errmsg = strerror(errno);
	if (pid_number)
	  fprintf(stderr, "hwloc_get_proc_%s %d failed (errno %d %s)\n", get_last_cpu_location ? "last_cpu_location" : "cpubind", pid_number, errno, errmsg);
	else
	  fprintf(stderr, "hwloc_get_%s failed (errno %d %s)\n", get_last_cpu_location ? "last_cpu_location" : "cpubind", errno, errmsg);
	return EXIT_FAILURE;
      }
      if (taskset)
	hwloc_bitmap_taskset_asprintf(&s, cpubind_set);
      else
	hwloc_bitmap_asprintf(&s, cpubind_set);
    } else {
      hwloc_membind_policy_t policy;
      if (pid_number)
	err = hwloc_get_proc_membind(topology, pid, membind_set, &policy, 0);
      else
	err = hwloc_get_membind(topology, membind_set, &policy, 0);
      if (err) {
	const char *errmsg = strerror(errno);
        if (pid_number)
          fprintf(stderr, "hwloc_get_proc_membind %d failed (errno %d %s)\n", pid_number, errno, errmsg);
        else
	  fprintf(stderr, "hwloc_get_membind failed (errno %d %s)\n", errno, errmsg);
	return EXIT_FAILURE;
      }
      if (taskset)
	hwloc_bitmap_taskset_asprintf(&s, membind_set);
      else
	hwloc_bitmap_asprintf(&s, membind_set);
      switch (policy) {
      case HWLOC_MEMBIND_DEFAULT: policystr = "default"; break;
      case HWLOC_MEMBIND_FIRSTTOUCH: policystr = "firsttouch"; break;
      case HWLOC_MEMBIND_BIND: policystr = "bind"; break;
      case HWLOC_MEMBIND_INTERLEAVE: policystr = "interleave"; break;
      case HWLOC_MEMBIND_REPLICATE: policystr = "replicate"; break;
      case HWLOC_MEMBIND_NEXTTOUCH: policystr = "nexttouch"; break;
      default: fprintf(stderr, "unknown memory policy %d\n", policy); assert(0); break;
      }
    }
    if (policystr)
      printf("%s (%s)\n", s, policystr);
    else
      printf("%s\n", s);
    free(s);
    return EXIT_SUCCESS;
  }

  if (got_membind) {
    if (hwloc_bitmap_iszero(membind_set)) {
      if (verbose >= 0)
	fprintf(stderr, "cannot membind to empty set\n");
      if (!force)
	goto failed_binding;
    }
    if (verbose > 0) {
      char *s;
      hwloc_bitmap_asprintf(&s, membind_set);
      fprintf(stderr, "binding on memory set %s\n", s);
      free(s);
    }
    if (single)
      hwloc_bitmap_singlify(membind_set);
    if (pid_number)
      ret = hwloc_set_proc_membind(topology, pid, membind_set, membind_policy, membind_flags);
    else
      ret = hwloc_set_membind(topology, membind_set, membind_policy, membind_flags);
    if (ret && verbose >= 0) {
      int bind_errno = errno;
      const char *errmsg = strerror(bind_errno);
      char *s;
      hwloc_bitmap_asprintf(&s, membind_set);
      if (pid_number)
        fprintf(stderr, "hwloc_set_proc_membind %s %d failed (errno %d %s)\n", s, pid_number, bind_errno, errmsg);
      else
        fprintf(stderr, "hwloc_set_membind %s failed (errno %d %s)\n", s, bind_errno, errmsg);
      free(s);
    }
    if (ret && !force)
      goto failed_binding;
  }

  if (got_cpubind) {
    if (hwloc_bitmap_iszero(cpubind_set)) {
      if (verbose >= 0)
	fprintf(stderr, "cannot cpubind to empty set\n");
      if (!force)
	goto failed_binding;
    }
    if (verbose > 0) {
      char *s;
      hwloc_bitmap_asprintf(&s, cpubind_set);
      fprintf(stderr, "binding on cpu set %s\n", s);
      free(s);
    }
    if (single)
      hwloc_bitmap_singlify(cpubind_set);
    if (pid_number)
      ret = hwloc_set_proc_cpubind(topology, pid, cpubind_set, cpubind_flags);
    else
      ret = hwloc_set_cpubind(topology, cpubind_set, cpubind_flags);
    if (ret && verbose >= 0) {
      int bind_errno = errno;
      const char *errmsg = strerror(bind_errno);
      char *s;
      hwloc_bitmap_asprintf(&s, cpubind_set);
      if (pid_number)
        fprintf(stderr, "hwloc_set_proc_cpubind %s %d failed (errno %d %s)\n", s, pid_number, bind_errno, errmsg);
      else
        fprintf(stderr, "hwloc_set_cpubind %s failed (errno %d %s)\n", s, bind_errno, errmsg);
      free(s);
    }
    if (ret && !force)
      goto failed_binding;
  }

  hwloc_bitmap_free(cpubind_set);
  hwloc_bitmap_free(membind_set);

  hwloc_topology_destroy(topology);

  if (pid_number)
    return EXIT_SUCCESS;

  if (0 == argc) {
    fprintf(stderr, "%s: nothing to do!\n", callname);
    return EXIT_FAILURE;
  }

  ret = execvp(argv[0], argv);
  if (ret) {
      fprintf(stderr, "%s: Failed to launch executable \"%s\"\n", 
              callname, argv[0]);
      perror("execvp");
  }
  return EXIT_FAILURE;


failed_binding:
  hwloc_bitmap_free(cpubind_set);
  hwloc_bitmap_free(membind_set);
  hwloc_topology_destroy(topology);
  return EXIT_FAILURE;
}
int main(void)
{
    hwloc_topology_t topology;
    unsigned depth;
    hwloc_obj_t objs[OBJ_MAX];
    hwloc_obj_t obj;
    hwloc_bitmap_t set;
    int ret;

    hwloc_topology_init(&topology);
    hwloc_topology_set_synthetic(topology, SYNTHETIC_TOPOLOGY_DESCRIPTION);
    hwloc_topology_load(topology);
    depth = hwloc_topology_get_depth(topology);

    /* just get the system object */
    obj = hwloc_get_root_obj(topology);
    ret = hwloc_get_largest_objs_inside_cpuset(topology, obj->cpuset, objs, 1);
    assert(ret == 1);
    assert(objs[0] == obj);
    objs[0] = hwloc_get_first_largest_obj_inside_cpuset(topology, obj->cpuset);
    assert(objs[0] == obj);

    /* just get the very last object */
    obj = hwloc_get_obj_by_depth(topology, depth-1, hwloc_get_nbobjs_by_depth(topology, depth-1)-1);
    ret = hwloc_get_largest_objs_inside_cpuset(topology, obj->cpuset, objs, 1);
    assert(ret == 1);
    assert(objs[0] == obj);

    /* try an empty one */
    set = hwloc_bitmap_alloc();
    ret = hwloc_get_largest_objs_inside_cpuset(topology, set, objs, 1);
    assert(ret == 0);
    objs[0] = hwloc_get_first_largest_obj_inside_cpuset(topology, set);
    assert(objs[0] == NULL);
    hwloc_bitmap_free(set);

    /* try an impossible one */
    set = hwloc_bitmap_alloc();
    hwloc_bitmap_sscanf(set, GIVEN_TOOLARGE_CPUSET_STRING);
    ret = hwloc_get_largest_objs_inside_cpuset(topology, set, objs, 1);
    assert(ret == -1);
    objs[0] = hwloc_get_first_largest_obj_inside_cpuset(topology, set);
    assert(objs[0] == NULL);
    hwloc_bitmap_free(set);

    /* try a harder one with 1 obj instead of 2 needed */
    set = hwloc_bitmap_alloc();
    hwloc_bitmap_sscanf(set, GIVEN_LARGESPLIT_CPUSET_STRING);
    ret = hwloc_get_largest_objs_inside_cpuset(topology, set, objs, 1);
    assert(ret == 1);
    assert(objs[0] == hwloc_get_obj_by_depth(topology, depth-1, 0));
    objs[0] = hwloc_get_first_largest_obj_inside_cpuset(topology, set);
    assert(objs[0] == hwloc_get_obj_by_depth(topology, depth-1, 0));
    /* try a harder one with lots of objs instead of 2 needed */
    ret = hwloc_get_largest_objs_inside_cpuset(topology, set, objs, 2);
    assert(ret == 2);
    assert(objs[0] == hwloc_get_obj_by_depth(topology, depth-1, 0));
    assert(objs[1] == hwloc_get_obj_by_depth(topology, depth-1, hwloc_get_nbobjs_by_depth(topology, depth-1)-1));
    objs[0] = hwloc_get_first_largest_obj_inside_cpuset(topology, set);
    hwloc_bitmap_andnot(set, set, objs[0]->cpuset);
    objs[1] = hwloc_get_first_largest_obj_inside_cpuset(topology, set);
    hwloc_bitmap_andnot(set, set, objs[1]->cpuset);
    objs[2] = hwloc_get_first_largest_obj_inside_cpuset(topology, set);
    assert(objs[0] == hwloc_get_obj_by_depth(topology, depth-1, 0));
    assert(objs[1] == hwloc_get_obj_by_depth(topology, depth-1, hwloc_get_nbobjs_by_depth(topology, depth-1)-1));
    assert(objs[2] == NULL);
    assert(hwloc_bitmap_iszero(set));
    hwloc_bitmap_free(set);

    /* try a very hard one */
    set = hwloc_bitmap_alloc();
    hwloc_bitmap_sscanf(set, GIVEN_HARD_CPUSET_STRING);
    ret = hwloc_get_largest_objs_inside_cpuset(topology, set, objs, OBJ_MAX);
    assert(objs[0] == hwloc_get_obj_by_depth(topology, 5, 29));
    assert(objs[1] == hwloc_get_obj_by_depth(topology, 3, 5));
    assert(objs[2] == hwloc_get_obj_by_depth(topology, 3, 6));
    assert(objs[3] == hwloc_get_obj_by_depth(topology, 3, 7));
    assert(objs[4] == hwloc_get_obj_by_depth(topology, 2, 2));
    assert(objs[5] == hwloc_get_obj_by_depth(topology, 4, 36));
    assert(objs[6] == hwloc_get_obj_by_depth(topology, 5, 74));
    hwloc_bitmap_free(set);

    hwloc_topology_destroy(topology);

    return EXIT_SUCCESS;
}