Beispiel #1
0
int main(void)
{
  hwloc_topology_t topology;
  hwloc_bitmap_t set, set2, nocpunomemnodeset, nocpubutmemnodeset, nomembutcpunodeset, nomembutcpucpuset;
  hwloc_obj_t node;
  struct bitmask *bitmask, *bitmask2;
  unsigned long mask;
  unsigned long maxnode;
  int i;

  if (numa_available() < 0)
    /* libnuma has inconsistent behavior when the kernel isn't NUMA-aware.
     * don't try to check everything precisely.
     */
    exit(77);

  hwloc_topology_init(&topology);
  hwloc_topology_load(topology);

  /* convert full stuff between cpuset and libnuma */
  set = hwloc_bitmap_alloc();
  nocpunomemnodeset = hwloc_bitmap_alloc();
  nocpubutmemnodeset = hwloc_bitmap_alloc();
  nomembutcpunodeset = hwloc_bitmap_alloc();
  nomembutcpucpuset = hwloc_bitmap_alloc();
  /* gather all nodes if any, or the whole system if no nodes */
  if (hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NUMANODE)) {
    node = NULL;
    while ((node = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, node)) != NULL) {
      hwloc_bitmap_or(set, set, node->cpuset);
      if (hwloc_bitmap_iszero(node->cpuset)) {
	if (node->memory.local_memory)
	  hwloc_bitmap_set(nocpubutmemnodeset, node->os_index);
	else
	  hwloc_bitmap_set(nocpunomemnodeset, node->os_index);
      } else if (!node->memory.local_memory) {
	hwloc_bitmap_set(nomembutcpunodeset, node->os_index);
	hwloc_bitmap_or(nomembutcpucpuset, nomembutcpucpuset, node->cpuset);
      }
    }
  } else {
    hwloc_bitmap_or(set, set, hwloc_topology_get_complete_cpuset(topology));
  }

  set2 = hwloc_bitmap_alloc();
  hwloc_cpuset_from_linux_libnuma_bitmask(topology, set2, numa_all_nodes_ptr);
  /* numa_all_nodes_ptr doesn't contain NODES with CPU but no memory */
  hwloc_bitmap_or(set2, set2, nomembutcpucpuset);
  assert(hwloc_bitmap_isequal(set, set2));
  hwloc_bitmap_free(set2);

  bitmask = hwloc_cpuset_to_linux_libnuma_bitmask(topology, set);
  /* numa_all_nodes_ptr contains NODES with no CPU but with memory */
  hwloc_bitmap_foreach_begin(i, nocpubutmemnodeset) { numa_bitmask_setbit(bitmask, i); } hwloc_bitmap_foreach_end();
Beispiel #2
0
static int
hwloc_solaris_get_sth_cpubind(hwloc_topology_t topology, idtype_t idtype, id_t id, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
{
  processorid_t binding;
  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
  int n;
  int i;

  if (depth < 0) {
    errno = ENOSYS;
    return -1;
  }

  /* first check if processor_bind() was used to bind to a single processor rather than to an lgroup */
  if ( processor_bind(idtype, id, PBIND_QUERY, &binding) == 0 && binding != PBIND_NONE ) {
    hwloc_bitmap_only(hwloc_set, binding);
    return 0;
  }

  /* if not, check lgroups */
  hwloc_bitmap_zero(hwloc_set);
  n = hwloc_get_nbobjs_by_depth(topology, depth);
  for (i = 0; i < n; i++) {
    hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, i);
    lgrp_affinity_t aff = lgrp_affinity_get(idtype, id, obj->os_index);

    if (aff == LGRP_AFF_STRONG)
      hwloc_bitmap_or(hwloc_set, hwloc_set, obj->cpuset);
  }

  if (hwloc_bitmap_iszero(hwloc_set))
    hwloc_bitmap_copy(hwloc_set, hwloc_topology_get_complete_cpuset(topology));

  return 0;
}
Beispiel #3
0
static void _add_hwloc_cpuset(
	hwloc_obj_type_t hwtype, hwloc_obj_type_t req_hwtype,
	hwloc_obj_t obj, uint32_t taskid,  int bind_verbose,
	hwloc_bitmap_t cpuset)
{
	struct hwloc_obj *pobj;

	/* if requested binding overlaps the granularity */
	/* use the ancestor cpuset instead of the object one */
	if (hwloc_compare_types(hwtype, req_hwtype) > 0) {

		/* Get the parent object of req_hwtype or the */
		/* one just above if not found (meaning of >0)*/
		/* (useful for ldoms binding with !NUMA nodes)*/
		pobj = obj->parent;
		while (pobj != NULL &&
			hwloc_compare_types(pobj->type, req_hwtype) > 0)
			pobj = pobj->parent;

		if (pobj != NULL) {
			if (bind_verbose)
				info("task/cgroup: task[%u] higher level %s "
				     "found", taskid,
				     hwloc_obj_type_string(pobj->type));
			hwloc_bitmap_or(cpuset, cpuset, pobj->allowed_cpuset);
		} else {
			/* should not be executed */
			if (bind_verbose)
				info("task/cgroup: task[%u] no higher level "
				     "found", taskid);
			hwloc_bitmap_or(cpuset, cpuset, obj->allowed_cpuset);
		}

	} else {
		hwloc_bitmap_or(cpuset, cpuset, obj->allowed_cpuset);
	}
}
int main(void)
{
  hwloc_topology_t topology;
  hwloc_obj_t obj, cache;
  hwloc_bitmap_t set;

  hwloc_topology_init(&topology);
  hwloc_topology_set_synthetic(topology, SYNTHETIC_TOPOLOGY_DESCRIPTION);
  hwloc_topology_load(topology);

  /* check the cache above a given cpu */
#define CPUINDEX 180
  set = hwloc_bitmap_alloc();
  obj = hwloc_get_obj_by_depth(topology, 5, CPUINDEX);
  assert(obj);
  hwloc_bitmap_or(set, set, obj->cpuset);
  cache = hwloc_get_cache_covering_cpuset(topology, set);
  assert(cache);
  assert(cache->type == HWLOC_OBJ_CACHE);
  assert(cache->logical_index == CPUINDEX/2/3);
  assert(hwloc_obj_is_in_subtree(topology, obj, cache));
  hwloc_bitmap_free(set);

  /* check the cache above two nearby cpus */
#define CPUINDEX1 180
#define CPUINDEX2 183
  set = hwloc_bitmap_alloc();
  obj = hwloc_get_obj_by_depth(topology, 5, CPUINDEX1);
  assert(obj);
  hwloc_bitmap_or(set, set, obj->cpuset);
  obj = hwloc_get_obj_by_depth(topology, 5, CPUINDEX2);
  assert(obj);
  hwloc_bitmap_or(set, set, obj->cpuset);
  cache = hwloc_get_cache_covering_cpuset(topology, set);
  assert(cache);
  assert(cache->type == HWLOC_OBJ_CACHE);
  assert(cache->logical_index == CPUINDEX1/2/3);
  assert(cache->logical_index == CPUINDEX2/2/3);
  assert(hwloc_obj_is_in_subtree(topology, obj, cache));
  hwloc_bitmap_free(set);

  /* check no cache above two distant cpus */
#undef CPUINDEX1
#define CPUINDEX1 300
  set = hwloc_bitmap_alloc();
  obj = hwloc_get_obj_by_depth(topology, 5, CPUINDEX1);
  assert(obj);
  hwloc_bitmap_or(set, set, obj->cpuset);
  obj = hwloc_get_obj_by_depth(topology, 5, CPUINDEX2);
  assert(obj);
  hwloc_bitmap_or(set, set, obj->cpuset);
  cache = hwloc_get_cache_covering_cpuset(topology, set);
  assert(!cache);
  hwloc_bitmap_free(set);

  /* check no cache above higher level */
  set = hwloc_bitmap_alloc();
  obj = hwloc_get_obj_by_depth(topology, 2, 0);
  assert(obj);
  hwloc_bitmap_or(set, set, obj->cpuset);
  cache = hwloc_get_cache_covering_cpuset(topology, set);
  assert(!cache);
  hwloc_bitmap_free(set);

  hwloc_topology_destroy(topology);

  return EXIT_SUCCESS;
}
Beispiel #5
0
int main(void)
{
  hwloc_bitmap_t set;
  hwloc_obj_t obj;
  char *str = NULL;

  hwloc_topology_init(&topology);
  hwloc_topology_load(topology);

  support = hwloc_topology_get_support(topology);

  obj = hwloc_get_root_obj(topology);
  set = hwloc_bitmap_dup(obj->cpuset);

  while (hwloc_bitmap_isequal(obj->cpuset, set)) {
    if (!obj->arity)
      break;
    obj = obj->children[0];
  }

  hwloc_bitmap_asprintf(&str, set);
  printf("system set is %s\n", str);
  free(str);

  test(set, 0);
  printf("now strict\n");
  test(set, HWLOC_CPUBIND_STRICT);

  hwloc_bitmap_free(set);
  set = hwloc_bitmap_dup(obj->cpuset);
  hwloc_bitmap_asprintf(&str, set);
  printf("obj set is %s\n", str);
  free(str);

  test(set, 0);
  printf("now strict\n");
  test(set, HWLOC_CPUBIND_STRICT);

  hwloc_bitmap_singlify(set);
  hwloc_bitmap_asprintf(&str, set);
  printf("singlified to %s\n", str);
  free(str);

  test(set, 0);
  printf("now strict\n");
  test(set, HWLOC_CPUBIND_STRICT);
  hwloc_bitmap_free(set);

  printf("\n\nmemory tests\n\n");
  printf("complete node set\n");
  set = hwloc_bitmap_dup(hwloc_get_root_obj(topology)->cpuset);
  hwloc_bitmap_asprintf(&str, set);
  printf("i.e. cpuset %s\n", str);
  free(str);
  testmem3(set);
  hwloc_bitmap_free(set);

  obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NODE, 0);
  if (obj) {
    set = hwloc_bitmap_dup(obj->cpuset);
    hwloc_bitmap_asprintf(&str, set);
    printf("cpuset set is %s\n", str);
    free(str);

    testmem3(set);

    obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NODE, 1);
    if (obj) {
      hwloc_bitmap_or(set, set, obj->cpuset);
      hwloc_bitmap_asprintf(&str, set);
      printf("cpuset set is %s\n", str);
      free(str);

      testmem3(set);
    }
    hwloc_bitmap_free(set);
  }

  hwloc_topology_destroy(topology);
  return 0;
}
Beispiel #6
0
int orte_daemon(int argc, char *argv[])
{
    int ret = 0;
    opal_cmd_line_t *cmd_line = NULL;
    int i;
    opal_buffer_t *buffer;
    char hostname[OPAL_MAXHOSTNAMELEN];
#if OPAL_ENABLE_FT_CR == 1
    char *tmp_env_var = NULL;
#endif

    /* initialize the globals */
    memset(&orted_globals, 0, sizeof(orted_globals));
    /* initialize the singleton died pipe to an illegal value so we can detect it was set */
    orted_globals.singleton_died_pipe = -1;
    bucket = OBJ_NEW(opal_buffer_t);

    /* setup to check common command line options that just report and die */
    cmd_line = OBJ_NEW(opal_cmd_line_t);
    if (OPAL_SUCCESS != opal_cmd_line_create(cmd_line, orte_cmd_line_opts)) {
        OBJ_RELEASE(cmd_line);
        exit(1);
    }
    mca_base_cmd_line_setup(cmd_line);
    if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false, false,
                                                   argc, argv))) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        fprintf(stderr, "Usage: %s [OPTION]...\n%s\n", argv[0], args);
        free(args);
        OBJ_RELEASE(cmd_line);
        return ret;
    }

    /*
     * Since this process can now handle MCA/GMCA parameters, make sure to
     * process them.
     */
    mca_base_cmd_line_process_args(cmd_line, &environ, &environ);

    /* Ensure that enough of OPAL is setup for us to be able to run */
    /*
     * NOTE: (JJH)
     *  We need to allow 'mca_base_cmd_line_process_args()' to process command
     *  line arguments *before* calling opal_init_util() since the command
     *  line could contain MCA parameters that affect the way opal_init_util()
     *  functions. AMCA parameters are one such option normally received on the
     *  command line that affect the way opal_init_util() behaves.
     *  It is "safe" to call mca_base_cmd_line_process_args() before
     *  opal_init_util() since mca_base_cmd_line_process_args() does *not*
     *  depend upon opal_init_util() functionality.
     */
    if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
        fprintf(stderr, "OPAL failed to initialize -- orted aborting\n");
        exit(1);
    }

    /* save the environment for launch purposes. This MUST be
     * done so that we can pass it to any local procs we
     * spawn - otherwise, those local procs won't see any
     * non-MCA envars that were set in the enviro when the
     * orted was executed - e.g., by .csh
     */
    orte_launch_environ = opal_argv_copy(environ);

    /* purge any ess/pmix flags set in the environ when we were launched */
    opal_unsetenv(OPAL_MCA_PREFIX"ess", &orte_launch_environ);
    opal_unsetenv(OPAL_MCA_PREFIX"pmix", &orte_launch_environ);

    /* if orte_daemon_debug is set, let someone know we are alive right
     * away just in case we have a problem along the way
     */
    if (orted_globals.debug) {
        gethostname(hostname, sizeof(hostname));
        fprintf(stderr, "Daemon was launched on %s - beginning to initialize\n", hostname);
    }

    /* check for help request */
    if (orted_globals.help) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        orte_show_help("help-orted.txt", "orted:usage", false,
                       argv[0], args);
        free(args);
        return 1;
    }
#if defined(HAVE_SETSID)
    /* see if we were directed to separate from current session */
    if (orted_globals.set_sid) {
        setsid();
    }
#endif
    /* see if they want us to spin until they can connect a debugger to us */
    i=0;
    while (orted_spin_flag) {
        i++;
        if (1000 < i) i=0;
    }

#if OPAL_ENABLE_FT_CR == 1
    /* Mark as a tool program */
    (void) mca_base_var_env_name ("opal_cr_is_tool", &tmp_env_var);
    opal_setenv(tmp_env_var,
                "1",
                true, &environ);
    free(tmp_env_var);
#endif

    /* detach from controlling terminal
     * otherwise, remain attached so output can get to us
     */
    if(!orte_debug_flag &&
       !orte_debug_daemons_flag &&
       orted_globals.daemonize) {
        opal_daemon_init(NULL);
    }

    /* Set the flag telling OpenRTE that I am NOT a
     * singleton, but am "infrastructure" - prevents setting
     * up incorrect infrastructure that only a singleton would
     * require.
     */
    if (orted_globals.hnp) {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    } else {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_DAEMON))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    }

    /* finalize the OPAL utils. As they are opened again from orte_init->opal_init
     * we continue to have a reference count on them. So we have to finalize them twice...
     */
    opal_finalize_util();

    /* bind ourselves if so directed */
    if (NULL != orte_daemon_cores) {
        char **cores=NULL, tmp[128];
        hwloc_obj_t pu;
        hwloc_cpuset_t ours, res;
        int core;

        /* could be a collection of comma-delimited ranges, so
         * use our handy utility to parse it
         */
        orte_util_parse_range_options(orte_daemon_cores, &cores);
        if (NULL != cores) {
            ours = hwloc_bitmap_alloc();
            hwloc_bitmap_zero(ours);
            res = hwloc_bitmap_alloc();
            for (i=0; NULL != cores[i]; i++) {
                core = strtoul(cores[i], NULL, 10);
                if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
                    /* turn off the show help forwarding as we won't
                     * be able to cycle the event library to send
                     */
                    orte_show_help_finalize();
                    /* the message will now come out locally */
                    orte_show_help("help-orted.txt", "orted:cannot-bind",
                                   true, orte_process_info.nodename,
                                   orte_daemon_cores);
                    ret = ORTE_ERR_NOT_SUPPORTED;
                    hwloc_bitmap_free(ours);
                    hwloc_bitmap_free(res);
                    goto DONE;
                }
                hwloc_bitmap_or(res, ours, pu->cpuset);
                hwloc_bitmap_copy(ours, res);
            }
            /* if the result is all zeros, then don't bind */
            if (!hwloc_bitmap_iszero(ours)) {
                (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0);
                if (opal_hwloc_report_bindings) {
                    opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours);
                    opal_output(0, "Daemon %s is bound to cores %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
                }
            }
            /* cleanup */
            hwloc_bitmap_free(ours);
            hwloc_bitmap_free(res);
            opal_argv_free(cores);
        }
    }

    if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
        orted_globals.abort=false;
        /* some vpid was ordered to fail. The value can be positive
         * or negative, depending upon the desired method for failure,
         * so need to check both here
         */
        if (0 > orted_debug_failure) {
            orted_debug_failure = -1*orted_debug_failure;
            orted_globals.abort = true;
        }
        /* are we the specified vpid? */
        if ((int)ORTE_PROC_MY_NAME->vpid == orted_debug_failure) {
            /* if the user specified we delay, then setup a timer
             * and have it kill us
             */
            if (0 < orted_debug_failure_delay) {
                ORTE_TIMER_EVENT(orted_debug_failure_delay, 0, shutdown_callback, ORTE_SYS_PRI);

            } else {
                opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            orted_globals.abort ? "abort" : "abnormal termination");

                /* do -not- call finalize as this will send a message to the HNP
                 * indicating clean termination! Instead, just forcibly cleanup
                 * the local session_dir tree and exit
                 */
                orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

                /* if we were ordered to abort, do so */
                if (orted_globals.abort) {
                    abort();
                }

                /* otherwise, return with non-zero status */
                ret = ORTE_ERROR_DEFAULT_EXIT_CODE;
                goto DONE;
            }
        }
    }

    /* insert our contact info into our process_info struct so we
     * have it for later use and set the local daemon field to our name
     */
    orte_oob_base_get_addr(&orte_process_info.my_daemon_uri);
    if (NULL == orte_process_info.my_daemon_uri) {
        /* no way to communicate */
        ret = ORTE_ERROR;
        goto DONE;
    }
    ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid;
    ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid;

    /* if I am also the hnp, then update that contact info field too */
    if (ORTE_PROC_IS_HNP) {
        orte_process_info.my_hnp_uri = strdup(orte_process_info.my_daemon_uri);
        ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
        ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
    }

    /* setup the primary daemon command receive function */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                            ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);

    /* output a message indicating we are alive, our name, and our pid
     * for debugging purposes
     */
    if (orte_debug_daemons_flag) {
        fprintf(stderr, "Daemon %s checking in as pid %ld on host %s\n",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)orte_process_info.pid,
                orte_process_info.nodename);
    }

    /* We actually do *not* want the orted to voluntarily yield() the
       processor more than necessary.  The orted already blocks when
       it is doing nothing, so it doesn't use any more CPU cycles than
       it should; but when it *is* doing something, we do not want it
       to be unnecessarily delayed because it voluntarily yielded the
       processor in the middle of its work.

       For example: when a message arrives at the orted, we want the
       OS to wake up the orted in a timely fashion (which most OS's
       seem good about doing) and then we want the orted to process
       the message as fast as possible.  If the orted yields and lets
       aggressive MPI applications get the processor back, it may be a
       long time before the OS schedules the orted to run again
       (particularly if there is no IO event to wake it up).  Hence,
       routed OOB messages (for example) may be significantly delayed
       before being delivered to MPI processes, which can be
       problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
       require OOB messages for wireup, etc.). */
    opal_progress_set_yield_when_idle(false);

    /* Change the default behavior of libevent such that we want to
       continually block rather than blocking for the default timeout
       and then looping around the progress engine again.  There
       should be nothing in the orted that cannot block in libevent
       until "something" happens (i.e., there's no need to keep
       cycling through progress because the only things that should
       happen will happen in libevent).  This is a minor optimization,
       but what the heck... :-) */
    opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);

    /* if requested, report my uri to the indicated pipe */
    if (orted_globals.uri_pipe > 0) {
        orte_job_t *jdata;
        orte_proc_t *proc;
        orte_node_t *node;
        orte_app_context_t *app;
        char *tmp, *nptr, *sysinfo;
        char **singenv=NULL, *string_key, *env_str;

        /* setup the singleton's job */
        jdata = OBJ_NEW(orte_job_t);
        /* default to ompi for now */
        opal_argv_append_nosize(&jdata->personality, "ompi");
        orte_plm_base_create_jobid(jdata);
        opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);

        /* must create a map for it (even though it has no
         * info in it) so that the job info will be picked
         * up in subsequent pidmaps or other daemons won't
         * know how to route
         */
        jdata->map = OBJ_NEW(orte_job_map_t);

        /* setup an app_context for the singleton */
        app = OBJ_NEW(orte_app_context_t);
        app->app = strdup("singleton");
        app->num_procs = 1;
        opal_pointer_array_add(jdata->apps, app);
        jdata->num_apps = 1;

        /* setup a proc object for the singleton - since we
         * -must- be the HNP, and therefore we stored our
         * node on the global node pool, and since the singleton
         * -must- be on the same node as us, indicate that
         */
        proc = OBJ_NEW(orte_proc_t);
        proc->name.jobid = jdata->jobid;
        proc->name.vpid = 0;
        proc->parent = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE);
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        /* obviously, it is on my node */
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        proc->node = node;
        OBJ_RETAIN(node);  /* keep accounting straight */
        opal_pointer_array_add(jdata->procs, proc);
        jdata->num_procs = 1;
        /* add the node to the job map */
        OBJ_RETAIN(node);
        opal_pointer_array_add(jdata->map->nodes, node);
        jdata->map->num_nodes++;
        /* and it obviously is on the node */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(node->procs, proc);
        node->num_procs++;
        /* and obviously it is one of my local procs */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(orte_local_children, proc);
        jdata->num_local_procs = 1;
        /* set the trivial */
        proc->local_rank = 0;
        proc->node_rank = 0;
        proc->app_rank = 0;
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL);

        /* set the ORTE_JOB_TRANSPORT_KEY from the environment */
        orte_pre_condition_transports(jdata, NULL);

        /* register the singleton's nspace with our PMIx server */
        if (ORTE_SUCCESS != (ret = orte_pmix_server_register_nspace(jdata, false))) {
          ORTE_ERROR_LOG(ret);
          goto DONE;
        }
        /* use setup fork to create the envars needed by the singleton */
        if (OPAL_SUCCESS != (ret = opal_pmix.server_setup_fork(&proc->name, &singenv))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }

        /* append the transport key to the envars needed by the singleton */
        if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_TRANSPORT_KEY, (void**)&string_key, OPAL_STRING) || NULL == string_key) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            goto DONE;
        }
        asprintf(&env_str, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key);
        opal_argv_append_nosize(&singenv, env_str);
        free(env_str);

        nptr = opal_argv_join(singenv, '*');
        opal_argv_free(singenv);

        /* create a string that contains our uri + sysinfo + PMIx server URI envars */
        orte_util_convert_sysinfo_to_string(&sysinfo, orte_local_cpu_type, orte_local_cpu_model);
        asprintf(&tmp, "%s[%s]%s", orte_process_info.my_daemon_uri, sysinfo, nptr);
        free(sysinfo);
        free(nptr);

        /* pass that info to the singleton */
        if (OPAL_SUCCESS != (ret = opal_fd_write(orted_globals.uri_pipe, strlen(tmp)+1, tmp))) { ; /* need to add 1 to get the NULL */
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }

        /* cleanup */
        free(tmp);
        close(orted_globals.uri_pipe);

        /* since a singleton spawned us, we need to harvest
         * any MCA params from the local environment so
         * we can pass them along to any subsequent daemons
         * we may start as the result of a comm_spawn
         */
        for (i=0; NULL != environ[i]; i++) {
            if (0 == strncmp(environ[i], OPAL_MCA_PREFIX, 9)) {
                /* make a copy to manipulate */
                tmp = strdup(environ[i]);
                /* find the equal sign */
                nptr = strchr(tmp, '=');
                *nptr = '\0';
                nptr++;
                /* add the mca param to the orted cmd line */
                opal_argv_append_nosize(&orted_cmd_line, "-"OPAL_MCA_CMD_LINE_ID);
                opal_argv_append_nosize(&orted_cmd_line, &tmp[9]);
                opal_argv_append_nosize(&orted_cmd_line, nptr);
                free(tmp);
            }
        }
    }

    /* if we were given a pipe to monitor for singleton termination, set that up */
    if (orted_globals.singleton_died_pipe > 0) {
        /* register shutdown handler */
        pipe_handler = (opal_event_t*)malloc(sizeof(opal_event_t));
        opal_event_set(orte_event_base, pipe_handler,
                       orted_globals.singleton_died_pipe,
                       OPAL_EV_READ,
                       pipe_closed,
                       pipe_handler);
        opal_event_add(pipe_handler, NULL);
    }

    /* If I have a parent, then save his contact info so
     * any messages we send can flow thru him.
     */
    orte_parent_uri = NULL;
    (void) mca_base_var_register ("orte", "orte", NULL, "parent_uri",
                                  "URI for the parent if tree launch is enabled.",
                                  MCA_BASE_VAR_TYPE_STRING, NULL, 0,
                                  MCA_BASE_VAR_FLAG_INTERNAL,
                                  OPAL_INFO_LVL_9,
                                  MCA_BASE_VAR_SCOPE_CONSTANT,
                                  &orte_parent_uri);
    if (NULL != orte_parent_uri) {
        orte_process_name_t parent;
        opal_value_t val;

        /* set the contact info into our local database */
        ret = orte_rml_base_parse_uris(orte_parent_uri, &parent, NULL);
        if (ORTE_SUCCESS != ret) {
            ORTE_ERROR_LOG(ret);
            free (orte_parent_uri);
            orte_parent_uri = NULL;
            goto DONE;
        }
        OBJ_CONSTRUCT(&val, opal_value_t);
        val.key = OPAL_PMIX_PROC_URI;
        val.type = OPAL_STRING;
        val.data.string = orte_parent_uri;
        if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&parent, &val))) {
            ORTE_ERROR_LOG(ret);
            OBJ_DESTRUCT(&val);
            goto DONE;
        }
        val.key = NULL;
        val.data.string = NULL;
        OBJ_DESTRUCT(&val);

        /* don't need this value anymore */
        free(orte_parent_uri);
        orte_parent_uri = NULL;

        /* tell the routed module that we have a path
         * back to the HNP
         */
        if (ORTE_SUCCESS != (ret = orte_routed.update_route(NULL, ORTE_PROC_MY_HNP, &parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
        /* set the lifeline to point to our parent so that we
         * can handle the situation if that lifeline goes away
         */
        if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(NULL, &parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
    }

    /* if we are not the HNP...the only time we will be an HNP
     * is if we are launched by a singleton to provide support
     * for it
     */
    if (!ORTE_PROC_IS_HNP) {
        orte_process_name_t target;
        target.jobid = ORTE_PROC_MY_NAME->jobid;

        if (orte_fwd_mpirun_port || orte_static_ports) {
            /* setup the rollup callback */
            orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK,
                                    ORTE_RML_PERSISTENT, rollup, NULL);
            target.vpid = ORTE_PROC_MY_NAME->vpid;
            /* since we will be waiting for any children to send us
             * their rollup info before sending to our parent, save
             * a little time in the launch phase by "warming up" the
             * connection to our parent while we wait for our children */
            buffer = OBJ_NEW(opal_buffer_t);  // zero-byte message
            if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
                                                   ORTE_PROC_MY_PARENT, buffer,
                                                   ORTE_RML_TAG_WARMUP_CONNECTION,
                                                   orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(buffer);
                goto DONE;
            }
        } else {
            target.vpid = 0;
        }

        /* send the information to the orted report-back point - this function
         * will process the data, but also counts the number of
         * orteds that reported back so the launch procedure can continue.
         * We need to do this at the last possible second as the HNP
         * can turn right around and begin issuing orders to us
         */

        buffer = OBJ_NEW(opal_buffer_t);
        /* insert our name for rollup purposes */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }

        /* get any connection info we may have pushed */
        {
            opal_value_t *val = NULL, *kv;
            opal_list_t *modex;
            int32_t flag;

            if (OPAL_SUCCESS != (ret = opal_pmix.get(ORTE_PROC_MY_NAME, NULL, NULL, &val)) || NULL == val) {
                /* just pack a marker indicating we don't have any to share */
                flag = 0;
                if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buffer);
                    goto DONE;
                }
            } else {
                /* the data is returned as a list of key-value pairs in the opal_value_t */
                if (OPAL_PTR == val->type) {
                    modex = (opal_list_t*)val->data.ptr;
                    flag = (int32_t)opal_list_get_size(modex);
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(buffer);
                        goto DONE;
                    }
                    OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
                        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &kv, 1, OPAL_VALUE))) {
                            ORTE_ERROR_LOG(ret);
                            OBJ_RELEASE(buffer);
                            goto DONE;
                        }
                    }
                    OPAL_LIST_RELEASE(modex);
                } else {
                    opal_output(0, "VAL KEY: %s", (NULL == val->key) ? "NULL" : val->key);
                    /* single value */
                    flag = 1;
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(buffer);
                        goto DONE;
                    }
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &val, 1, OPAL_VALUE))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(buffer);
                        goto DONE;
                    }
                }
                OBJ_RELEASE(val);
            }
Beispiel #7
0
/*
 * Recursively build objects whose cpu start at first_cpu
 * - level gives where to look in the type, arity and id arrays
 * - the id array is used as a variable to get unique IDs for a given level.
 * - generated memory should be added to *memory_kB.
 * - generated cpus should be added to parent_cpuset.
 * - next cpu number to be used should be returned.
 */
static unsigned
hwloc__look_synthetic(struct hwloc_topology *topology,
    int level, unsigned first_cpu,
    hwloc_bitmap_t parent_cpuset)
{
  hwloc_obj_t obj;
  unsigned i;
  hwloc_obj_type_t type = topology->backend_params.synthetic.type[level];

  /* pre-hooks */
  switch (type) {
    case HWLOC_OBJ_MISC:
      break;
    case HWLOC_OBJ_GROUP:
      break;
    case HWLOC_OBJ_SYSTEM:
    case HWLOC_OBJ_BRIDGE:
    case HWLOC_OBJ_PCI_DEVICE:
    case HWLOC_OBJ_OS_DEVICE:
      /* Shouldn't happen.  */
      abort();
      break;
    case HWLOC_OBJ_MACHINE:
      break;
    case HWLOC_OBJ_NODE:
      break;
    case HWLOC_OBJ_SOCKET:
      break;
    case HWLOC_OBJ_CACHE:
      break;
    case HWLOC_OBJ_CORE:
      break;
    case HWLOC_OBJ_PU:
      break;
    case HWLOC_OBJ_TYPE_MAX:
      /* Should never happen */
      assert(0);
      break;
  }

  obj = hwloc_alloc_setup_object(type, topology->backend_params.synthetic.id[level]++);
  obj->cpuset = hwloc_bitmap_alloc();

  if (!topology->backend_params.synthetic.arity[level]) {
    hwloc_bitmap_set(obj->cpuset, first_cpu++);
  } else {
    for (i = 0; i < topology->backend_params.synthetic.arity[level]; i++)
      first_cpu = hwloc__look_synthetic(topology, level + 1, first_cpu, obj->cpuset);
  }

  if (type == HWLOC_OBJ_NODE) {
    obj->nodeset = hwloc_bitmap_alloc();
    hwloc_bitmap_set(obj->nodeset, obj->os_index);
  }

  hwloc_bitmap_or(parent_cpuset, parent_cpuset, obj->cpuset);

  /* post-hooks */
  switch (type) {
    case HWLOC_OBJ_MISC:
      break;
    case HWLOC_OBJ_GROUP:
      obj->attr->group.depth = topology->backend_params.synthetic.depth[level];
      break;
    case HWLOC_OBJ_SYSTEM:
    case HWLOC_OBJ_BRIDGE:
    case HWLOC_OBJ_PCI_DEVICE:
    case HWLOC_OBJ_OS_DEVICE:
      abort();
      break;
    case HWLOC_OBJ_MACHINE:
      break;
    case HWLOC_OBJ_NODE:
      /* 1GB in memory nodes, 256k 4k-pages.  */
      obj->memory.local_memory = 1024*1024*1024;
      obj->memory.page_types_len = 1;
      obj->memory.page_types = malloc(sizeof(*obj->memory.page_types));
      memset(obj->memory.page_types, 0, sizeof(*obj->memory.page_types));
      obj->memory.page_types[0].size = 4096;
      obj->memory.page_types[0].count = 256*1024;
      break;
    case HWLOC_OBJ_SOCKET:
      break;
    case HWLOC_OBJ_CACHE:
      obj->attr->cache.depth = topology->backend_params.synthetic.depth[level];
      obj->attr->cache.linesize = 64;
      if (obj->attr->cache.depth == 1)
	/* 32Kb in L1 */
	obj->attr->cache.size = 32*1024;
      else
	/* *4 at each level, starting from 1MB for L2 */
	obj->attr->cache.size = 256*1024 << (2*obj->attr->cache.depth);
      break;
    case HWLOC_OBJ_CORE:
      break;
    case HWLOC_OBJ_PU:
      break;
    case HWLOC_OBJ_TYPE_MAX:
      /* Should never happen */
      assert(0);
      break;
  }

  hwloc_insert_object_by_cpuset(topology, obj);

  return first_cpu;
}
/****** shepherd_binding/binding_set_striding() *************************************
*  NAME
*     binding_set_striding() -- Binds current process to cores.
*
*  SYNOPSIS
*     bool binding_set_striding(int first_socket, int first_core, int
*     number_of_cores, int offset, int stepsize)
*
*  FUNCTION
*     Performs a core binding for the calling process according to the 
*     'striding' strategy. The first core used is specified by first_socket
*     (beginning with 0) and first_core (beginning with 0). If first_core is 
*     greater than available cores on first_socket, the next socket is examined 
*     and first_core is reduced by the skipped cores. If the first_core could 
*     not be found on system (because it was to high) no binding will be done.
*     
*     If the first core was choosen the next one is defined by the step size 'n' 
*     which is incremented to the first core found. If the socket has not the 
*     core (because it was the last core of the socket for example) the next 
*     socket is examined.
*
*     If the system is out of cores and there are still some cores to select 
*     (because of the number_of_cores parameter) no core binding will be performed.
*    
*  INPUTS
*     int first_socket    - first socket to begin with  
*     int first_core      - first core to start with  
*     int number_of_cores - total number of cores to be used
*     int offset          - core offset for first core (increments first core used) 
*     int stepsize        - step size
*     int type            - type of binding (set or env or pe)
*
*  RESULT
*     bool - Returns true if the binding was performed, otherwise false.
*
*  NOTES
*     MT-NOTE: binding_set_striding() is MT safe 
*
*******************************************************************************/
static bool
binding_set_striding(int first_socket, int first_core, int number_of_cores,
                          int offset, int stepsize, const binding_type_t type)
{
   /* n := take every n-th core */ 
   bool bound = false;

#if HAVE_HWLOC
   dstring error = DSTRING_INIT;

   if (has_core_binding() == true) {

      sge_dstring_free(&error);

         /* bitmask for processors to turn on and off */
         hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();

         /* when library offers architecture: 
            - get virtual processor ids in the following manner:
              * on socket "first_socket" choose core number "first_core + offset"
              * then add n: if core is not available go to next socket
              * ...
         */
         if (has_topology_information()) {
            /* number of cores set in processor binding mask */
            int cores_set = 0;
            /* next socket to use */
            int next_socket = first_socket;
            /* next core to use */
            int next_core = first_core + offset;
            /* maximal number of sockets on this system */
            int max_number_of_sockets = get_number_of_sockets();
            hwloc_obj_t core;
            
            /* check if we are already out of range */
            if (next_socket >= max_number_of_sockets) {
               shepherd_trace("binding_set_striding: already out of sockets");
               hwloc_bitmap_free(cpuset);
               return false;
            }   

            while (get_number_of_cores(next_socket) <= next_core) {
               /* move on to next socket - could be that we have to deal only with cores 
                  instead of <socket><core> tuples */
               next_core -= get_number_of_cores(next_socket);
               next_socket++;
               if (next_socket >= max_number_of_sockets) {
                  /* we are out of sockets - we do nothing */
                  shepherd_trace("binding_set_striding: first core: out of sockets");
                  hwloc_bitmap_free(cpuset);
                  return false;
               }
            }  
            core = hwloc_get_obj_below_by_type(sge_hwloc_topology,
                                               HWLOC_OBJ_SOCKET, next_socket,
                                               HWLOC_OBJ_CORE, next_core);
            hwloc_bitmap_or(cpuset, cpuset, core->cpuset);
            
            /* collect the rest of the processor ids */ 
            for (cores_set = 1; cores_set < number_of_cores; cores_set++) {
               /* calculate next_core number */ 
               next_core += stepsize;
               
               /* check if we are already out of range */
               if (next_socket >= max_number_of_sockets) {
                  shepherd_trace("binding_set_striding: out of sockets");
                  hwloc_bitmap_free(cpuset);
                  return false;
               }   

               while (get_number_of_cores(next_socket) <= next_core) {
                  /* move on to next socket - could be that we have to deal only with cores 
                     instead of <socket><core> tuples */
                  next_core -= get_number_of_cores(next_socket);
                  next_socket++;
                  if (next_socket >= max_number_of_sockets) {
                     /* we are out of sockets - we do nothing */
                     shepherd_trace("binding_set_striding: out of sockets!");
                     hwloc_bitmap_free(cpuset);
                     return false;
                  }
                  core = hwloc_get_obj_below_by_type(sge_hwloc_topology,
                                                     HWLOC_OBJ_SOCKET,
                                                     next_socket,
                                                     HWLOC_OBJ_CORE,
                                                     next_core);
                  hwloc_bitmap_or(cpuset, cpuset, core->cpuset);
               }    
                
            } /* collecting processor ids */
           
            if (type == BINDING_TYPE_PE) {
            
               /* rankfile is created: do nothing */

            } else if (type == BINDING_TYPE_ENV) {

               /* set the environment variable */
               if (create_binding_env(cpuset) == true) {
                  shepherd_trace("binding_set_striding: SGE_BINDING env var created");
               } else {
                  shepherd_trace("binding_set_striding: problems while creating SGE_BINDING env");
               }

            } else {
               
               /* bind process to mask */ 
               if (bind_process_to_mask(cpuset) == true) {
                  /* there was an error while binding */ 
                  bound = true;
               }
            }
         
            hwloc_bitmap_free(cpuset);
            
         } else {
            /* setting bitmask without topology information which could 
               not be right? */
            shepherd_trace("binding_set_striding: bitmask without topology information");
            return false;
         }

   } else {
      /* has no core binding feature */
      sge_dstring_free(&error);
      
      return false;
   }
   
#endif  /* HAVE_HWLOC */
   return bound;
}
/*
 * Recursively build objects whose cpu start at first_cpu
 * - level gives where to look in the type, arity and id arrays
 * - the id array is used as a variable to get unique IDs for a given level.
 * - generated memory should be added to *memory_kB.
 * - generated cpus should be added to parent_cpuset.
 * - next cpu number to be used should be returned.
 */
static void
hwloc__look_synthetic(struct hwloc_topology *topology,
		      struct hwloc_synthetic_backend_data_s *data,
		      int level,
		      hwloc_bitmap_t parent_cpuset)
{
  hwloc_obj_t obj;
  unsigned i;
  struct hwloc_synthetic_level_data_s *curlevel = &data->level[level];
  hwloc_obj_type_t type = curlevel->type;
  unsigned os_index;

  /* pre-hooks */
  switch (type) {
    case HWLOC_OBJ_GROUP:
      break;
    case HWLOC_OBJ_MACHINE:
      break;
    case HWLOC_OBJ_NUMANODE:
      break;
    case HWLOC_OBJ_PACKAGE:
      break;
    case HWLOC_OBJ_CACHE:
      break;
    case HWLOC_OBJ_CORE:
      break;
    case HWLOC_OBJ_PU:
      break;
    case HWLOC_OBJ_SYSTEM:
    case HWLOC_OBJ_BRIDGE:
    case HWLOC_OBJ_PCI_DEVICE:
    case HWLOC_OBJ_OS_DEVICE:
    case HWLOC_OBJ_MISC:
    case HWLOC_OBJ_TYPE_MAX:
      /* Should never happen */
      assert(0);
      break;
  }

  os_index = curlevel->next_os_index++;
  if (curlevel->index_array)
    os_index = curlevel->index_array[os_index];
  obj = hwloc_alloc_setup_object(type, os_index);
  obj->cpuset = hwloc_bitmap_alloc();

  if (!curlevel->arity) {
    hwloc_bitmap_set(obj->cpuset, os_index);
  } else {
    for (i = 0; i < curlevel->arity; i++)
      hwloc__look_synthetic(topology, data, level + 1, obj->cpuset);
  }

  if (type == HWLOC_OBJ_NUMANODE) {
    obj->nodeset = hwloc_bitmap_alloc();
    hwloc_bitmap_set(obj->nodeset, os_index);
  }

  hwloc_bitmap_or(parent_cpuset, parent_cpuset, obj->cpuset);

  hwloc_synthetic__post_look_hooks(curlevel, obj);

  hwloc_insert_object_by_cpuset(topology, obj);
}
static int bind_downwards(orte_job_t *jdata,
                          hwloc_obj_type_t target,
                          unsigned cache_level)
{
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    hwloc_obj_t trg_obj, nxt_obj;
    hwloc_cpuset_t cpus;
    unsigned int ncpus;
    struct hwloc_topology_support *support;
    opal_hwloc_obj_data_t *data;
    int total_cpus;
    hwloc_cpuset_t totalcpuset;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind downward for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;
    totalcpuset = hwloc_bitmap_alloc();

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
                hwloc_bitmap_free(totalcpuset);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
                    hwloc_bitmap_free(totalcpuset);
                    return ORTE_ERR_SILENT;
                }
            }
        }

        /* clear the topology of any prior usage numbers */
        opal_hwloc_base_clear_usage(node->topology);

        /* cycle thru the procs */
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            /* ignore procs that have already been bound - should
             * never happen, but safer
             */
            if (NULL != proc->cpu_bitmap) {
                continue;
            }
            /* we don't know if the target is a direct child of this locale,
             * or if it is some depth below it, so we have to conduct a bit
             * of a search. Let hwloc find the min usage one for us.
             */
            trg_obj = opal_hwloc_base_find_min_bound_target_under_obj(node->topology,
                                                                      proc->locale,
                                                                      target, cache_level);
            if (NULL == trg_obj) {
                /* there aren't any such targets under this object */
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                hwloc_bitmap_free(totalcpuset);
                return ORTE_ERR_SILENT;
            }
            /* start with a clean slate */
            hwloc_bitmap_zero(totalcpuset);
            total_cpus = 0;
            nxt_obj = trg_obj;
            do {
                if (NULL == nxt_obj) {
                    /* could not find enough cpus to meet request */
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                    hwloc_bitmap_free(totalcpuset);
                    return ORTE_ERR_SILENT;
                }
                trg_obj = nxt_obj;
                /* get the number of cpus under this location */
                ncpus = opal_hwloc_base_get_npus(node->topology, trg_obj);
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s GOT %d CPUS",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ncpus);
                /* track the number bound */
                if (NULL == (data = (opal_hwloc_obj_data_t*)trg_obj->userdata)) {
                    data = OBJ_NEW(opal_hwloc_obj_data_t);
                    trg_obj->userdata = data;
                }
                data->num_bound++;
                /* error out if adding a proc would cause overload and that wasn't allowed */
                if (ncpus < data->num_bound &&
                    !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                                   opal_hwloc_base_print_binding(map->binding), node->name,
                                   data->num_bound, ncpus);
                    hwloc_bitmap_free(totalcpuset);
                    return ORTE_ERR_SILENT;
                }
                /* bind the proc here */
                cpus = opal_hwloc_base_get_available_cpus(node->topology, trg_obj);
                hwloc_bitmap_or(totalcpuset, totalcpuset, cpus);
                total_cpus += ncpus;
                /* move to the next location, in case we need it */
                nxt_obj = trg_obj->next_cousin;
            } while (total_cpus < orte_rmaps_base.cpus_per_rank);
            hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, totalcpuset);
            if (4 < opal_output_get_verbosity(orte_rmaps_base_framework.framework_output)) {
                char tmp1[1024], tmp2[1024];
                opal_hwloc_base_cset2str(tmp1, sizeof(tmp1), totalcpuset);
                opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), totalcpuset);
                opal_output(orte_rmaps_base_framework.framework_output,
                            "%s BOUND PROC %s[%s] TO %s: %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&proc->name), node->name,
                            tmp1, tmp2);
            }
        }
    }
    hwloc_bitmap_free(totalcpuset);

    return ORTE_SUCCESS;
}
Beispiel #11
0
static HYD_status handle_bitmap_binding(const char *binding, const char *mapping)
{
    int i, j, k, bind_count, map_count, cache_depth = 0, bind_depth = 0, map_depth = 0;
    int total_map_objs, total_bind_objs, num_pus_in_map_domain, num_pus_in_bind_domain,
        total_map_domains;
    hwloc_obj_t map_obj, bind_obj, *start_pu;
    hwloc_cpuset_t *map_domains;
    char *bind_str, *map_str;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    /* split out the count fields */
    status = split_count_field(binding, &bind_str, &bind_count);
    HYDU_ERR_POP(status, "error splitting count field\n");

    status = split_count_field(mapping, &map_str, &map_count);
    HYDU_ERR_POP(status, "error splitting count field\n");


    /* get the binding object */
    if (!strcmp(bind_str, "board"))
        bind_depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_MACHINE);
    else if (!strcmp(bind_str, "numa"))
        bind_depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_NODE);
    else if (!strcmp(bind_str, "socket"))
        bind_depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_SOCKET);
    else if (!strcmp(bind_str, "core"))
        bind_depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_CORE);
    else if (!strcmp(bind_str, "hwthread"))
        bind_depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_PU);
    else {
        /* check if it's in the l*cache format */
        cache_depth = parse_cache_string(bind_str);
        if (!cache_depth) {
            HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
                                "unrecognized binding string \"%s\"\n", binding);
        }
        bind_depth = hwloc_get_cache_type_depth(topology, cache_depth, -1);
    }

    /* get the mapping */
    if (!strcmp(map_str, "board"))
        map_depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_MACHINE);
    else if (!strcmp(map_str, "numa"))
        map_depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_NODE);
    else if (!strcmp(map_str, "socket"))
        map_depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_SOCKET);
    else if (!strcmp(map_str, "core"))
        map_depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_CORE);
    else if (!strcmp(map_str, "hwthread"))
        map_depth = hwloc_get_type_or_above_depth(topology, HWLOC_OBJ_PU);
    else {
        cache_depth = parse_cache_string(map_str);
        if (!cache_depth) {
            HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
                                "unrecognized mapping string \"%s\"\n", mapping);
        }
        map_depth = hwloc_get_cache_type_depth(topology, cache_depth, -1);
    }

    /*
     * Process Affinity Algorithm:
     *
     * The code below works in 3 stages. The end result is an array of all the possible
     * binding bitmaps for a system, based on the options specified.
     *
     * 1. Define all possible mapping "domains" in a system. A mapping domain is a group
     *    of hardware elements found by traversing the topology. Each traversal skips the
     *    number of elements the user specified in the mapping string. The traversal ends
     *    when the next mapping domain == the first mapping domain. Note that if the
     *    mapping string defines a domain that is larger than the system size, we exit
     *    with an error.
     *
     * 2. Define the number of possible binding domains within a mapping domain. This
     *    process is similar to step 1, in that we traverse the mapping domain finding
     *    all possible bind combinations, stopping when a duplicate of the first binding
     *    is reached. If a binding is larger (in # of PUs) than the mapping domain,
     *    the number of possible bindings for that domain is 1. In this stage, we also
     *    locate the first PU in each mapping domain for use later during binding.
     *
     * 3. Create the binding bitmaps. We allocate an array of bitmaps and fill them in
     *    with all possible bindings. The starting PU in each mapping domain is advanced
     *    if and when we wrap around to the beginning of the mapping domains. This ensures
     *    that we do not repeat.
     *
     */

    /* calculate the number of map domains */
    total_map_objs = hwloc_get_nbobjs_by_depth(topology, map_depth);
    num_pus_in_map_domain = (HYDT_topo_hwloc_info.total_num_pus / total_map_objs) * map_count;
    HYDU_ERR_CHKANDJUMP(status, num_pus_in_map_domain > HYDT_topo_hwloc_info.total_num_pus,
                        HYD_INTERNAL_ERROR, "mapping option \"%s\" larger than total system size\n",
                        mapping);

    /* The number of total_map_domains should be large enough to
     * contain all contiguous map object collections of length
     * map_count.  For example, if the map object is "socket" and the
     * map_count is 3, on a system with 4 sockets, the following map
     * domains should be included: (0,1,2), (3,0,1), (2,3,0), (1,2,3).
     * We do this by finding how many times we need to replicate the
     * list of the map objects so that an integral number of map
     * domains can map to them.  In the above case, the list of map
     * objects is replicated 3 times. */
    for (i = 1; (i * total_map_objs) % map_count; i++);
    total_map_domains = (i * total_map_objs) / map_count;

    /* initialize the map domains */
    HYDU_MALLOC_OR_JUMP(map_domains, hwloc_bitmap_t *, total_map_domains * sizeof(hwloc_bitmap_t),
                        status);
    HYDU_MALLOC_OR_JUMP(start_pu, hwloc_obj_t *, total_map_domains * sizeof(hwloc_obj_t), status);

    /* For each map domain, find the next map object (first map object
     * for the first map domain) and add the following "map_count"
     * number of contiguous map objects, wrapping to the first one if
     * needed, to the map domain.  Store the first PU in the first map
     * object of the map domain as "start_pu".  This is needed later
     * for the actual binding. */
    map_obj = NULL;
    for (i = 0; i < total_map_domains; i++) {
        map_domains[i] = hwloc_bitmap_alloc();
        hwloc_bitmap_zero(map_domains[i]);

        for (j = 0; j < map_count; j++) {
            map_obj = hwloc_get_next_obj_by_depth(topology, map_depth, map_obj);
            /* map_obj will be NULL if it reaches the end. call again to wrap around */
            if (!map_obj)
                map_obj = hwloc_get_next_obj_by_depth(topology, map_depth, map_obj);

            if (j == 0)
                start_pu[i] =
                    hwloc_get_obj_inside_cpuset_by_type(topology, map_obj->cpuset, HWLOC_OBJ_PU, 0);

            hwloc_bitmap_or(map_domains[i], map_domains[i], map_obj->cpuset);
        }
    }


    /* Find the possible binding domains is similar to that of map
     * domains.  But if a binding domain is larger (in # of PUs) than
     * the mapping domain, the number of possible bindings for that
     * domain is 1. */

    /* calculate the number of possible bindings and allocate bitmaps for them */
    total_bind_objs = hwloc_get_nbobjs_by_depth(topology, bind_depth);
    num_pus_in_bind_domain = (HYDT_topo_hwloc_info.total_num_pus / total_bind_objs) * bind_count;

    if (num_pus_in_bind_domain < num_pus_in_map_domain) {
        for (i = 1; (i * num_pus_in_map_domain) % num_pus_in_bind_domain; i++);
        HYDT_topo_hwloc_info.num_bitmaps =
            (i * num_pus_in_map_domain * total_map_domains) / num_pus_in_bind_domain;
    }
    else {
        HYDT_topo_hwloc_info.num_bitmaps = total_map_domains;
    }

    /* initialize bitmaps */
    HYDU_MALLOC_OR_JUMP(HYDT_topo_hwloc_info.bitmap, hwloc_bitmap_t *,
                        HYDT_topo_hwloc_info.num_bitmaps * sizeof(hwloc_bitmap_t), status);

    for (i = 0; i < HYDT_topo_hwloc_info.num_bitmaps; i++) {
        HYDT_topo_hwloc_info.bitmap[i] = hwloc_bitmap_alloc();
        hwloc_bitmap_zero(HYDT_topo_hwloc_info.bitmap[i]);
    }

    /* do bindings */
    i = 0;
    while (i < HYDT_topo_hwloc_info.num_bitmaps) {
        for (j = 0; j < total_map_domains; j++) {
            bind_obj = hwloc_get_ancestor_obj_by_depth(topology, bind_depth, start_pu[j]);

            for (k = 0; k < bind_count; k++) {
                hwloc_bitmap_or(HYDT_topo_hwloc_info.bitmap[i], HYDT_topo_hwloc_info.bitmap[i],
                                bind_obj->cpuset);

                /* if the binding is smaller than the mapping domain, wrap around inside that domain */
                if (num_pus_in_bind_domain < num_pus_in_map_domain) {
                    bind_obj =
                        hwloc_get_next_obj_inside_cpuset_by_depth(topology, map_domains[j],
                                                                  bind_depth, bind_obj);
                    if (!bind_obj)
                        bind_obj =
                            hwloc_get_next_obj_inside_cpuset_by_depth(topology, map_domains[j],
                                                                      bind_depth, bind_obj);
                }
                else {
                    bind_obj = hwloc_get_next_obj_by_depth(topology, bind_depth, bind_obj);
                    if (!bind_obj)
                        bind_obj = hwloc_get_next_obj_by_depth(topology, bind_depth, bind_obj);
                }

            }
            i++;

            /* advance the starting position for this map domain, if needed */
            if (num_pus_in_bind_domain < num_pus_in_map_domain) {
                for (k = 0; k < num_pus_in_bind_domain; k++) {
                    start_pu[j] = hwloc_get_next_obj_inside_cpuset_by_type(topology, map_domains[j],
                                                                           HWLOC_OBJ_PU,
                                                                           start_pu[j]);
                    if (!start_pu[j])
                        start_pu[j] =
                            hwloc_get_next_obj_inside_cpuset_by_type(topology, map_domains[j],
                                                                     HWLOC_OBJ_PU, start_pu[j]);
                }
            }
        }
    }

    /* free temporary memory */
    MPL_free(map_domains);
    MPL_free(start_pu);

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
Beispiel #12
0
static int
hwloc_look_windows(struct hwloc_backend *backend)
{
  struct hwloc_topology *topology = backend->topology;
  hwloc_bitmap_t groups_pu_set = NULL;
  SYSTEM_INFO SystemInfo;
  DWORD length;

  if (topology->levels[0][0]->cpuset)
    /* somebody discovered things */
    return -1;

  hwloc_alloc_obj_cpusets(topology->levels[0][0]);

  GetSystemInfo(&SystemInfo);

  if (!GetLogicalProcessorInformationExProc && GetLogicalProcessorInformationProc) {
      PSYSTEM_LOGICAL_PROCESSOR_INFORMATION procInfo, tmpprocInfo;
      unsigned id;
      unsigned i;
      struct hwloc_obj *obj;
      hwloc_obj_type_t type;

      length = 0;
      procInfo = NULL;

      while (1) {
        if (GetLogicalProcessorInformationProc(procInfo, &length))
          break;
        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
          return -1;
        tmpprocInfo = realloc(procInfo, length);
        if (!tmpprocInfo) {
          free(procInfo);
          goto out;
        }
        procInfo = tmpprocInfo;
      }

      assert(!length || procInfo);

      for (i = 0; i < length / sizeof(*procInfo); i++) {

        /* Ignore unknown caches */
        if (procInfo->Relationship == RelationCache
                && procInfo->Cache.Type != CacheUnified
                && procInfo->Cache.Type != CacheData
                && procInfo->Cache.Type != CacheInstruction)
          continue;

        id = -1;
        switch (procInfo[i].Relationship) {
          case RelationNumaNode:
            type = HWLOC_OBJ_NUMANODE;
            id = procInfo[i].NumaNode.NodeNumber;
            break;
          case RelationProcessorPackage:
            type = HWLOC_OBJ_PACKAGE;
            break;
          case RelationCache:
            type = (procInfo[i].Cache.Type == CacheInstruction ? HWLOC_OBJ_L1ICACHE : HWLOC_OBJ_L1CACHE) + procInfo[i].Cache.Level - 1;
            break;
          case RelationProcessorCore:
            type = HWLOC_OBJ_CORE;
            break;
          case RelationGroup:
          default:
            type = HWLOC_OBJ_GROUP;
            break;
        }

        if (!hwloc_filter_check_keep_object_type(topology, type))
          continue;

        obj = hwloc_alloc_setup_object(topology, type, id);
        obj->cpuset = hwloc_bitmap_alloc();
        hwloc_debug("%s#%u mask %lx\n", hwloc_type_name(type), id, procInfo[i].ProcessorMask);
        /* ProcessorMask is a ULONG_PTR */
        hwloc_bitmap_set_ith_ULONG_PTR(obj->cpuset, 0, procInfo[i].ProcessorMask);
        hwloc_debug_2args_bitmap("%s#%u bitmap %s\n", hwloc_type_name(type), id, obj->cpuset);

        switch (type) {
          case HWLOC_OBJ_NUMANODE:
            {
              ULONGLONG avail;
              obj->nodeset = hwloc_bitmap_alloc();
              hwloc_bitmap_set(obj->nodeset, id);
              if ((GetNumaAvailableMemoryNodeExProc && GetNumaAvailableMemoryNodeExProc(id, &avail))
               || (GetNumaAvailableMemoryNodeProc && GetNumaAvailableMemoryNodeProc(id, &avail)))
                obj->memory.local_memory = avail;
              obj->memory.page_types_len = 2;
              obj->memory.page_types = malloc(2 * sizeof(*obj->memory.page_types));
              memset(obj->memory.page_types, 0, 2 * sizeof(*obj->memory.page_types));
              obj->memory.page_types_len = 1;
              obj->memory.page_types[0].size = SystemInfo.dwPageSize;
#if HAVE_DECL__SC_LARGE_PAGESIZE
              obj->memory.page_types_len++;
              obj->memory.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
#endif
              break;
            }
          case HWLOC_OBJ_L1CACHE:
          case HWLOC_OBJ_L2CACHE:
          case HWLOC_OBJ_L3CACHE:
          case HWLOC_OBJ_L4CACHE:
          case HWLOC_OBJ_L5CACHE:
          case HWLOC_OBJ_L1ICACHE:
          case HWLOC_OBJ_L2ICACHE:
          case HWLOC_OBJ_L3ICACHE:
            obj->attr->cache.size = procInfo[i].Cache.Size;
            obj->attr->cache.associativity = procInfo[i].Cache.Associativity == CACHE_FULLY_ASSOCIATIVE ? -1 : procInfo[i].Cache.Associativity ;
            obj->attr->cache.linesize = procInfo[i].Cache.LineSize;
            obj->attr->cache.depth = procInfo[i].Cache.Level;
            switch (procInfo->Cache.Type) {
              case CacheUnified:
                obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
                break;
              case CacheData:
                obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
                break;
              case CacheInstruction:
                obj->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
                break;
              default:
                hwloc_free_unlinked_object(obj);
                continue;
            }
            break;
          case HWLOC_OBJ_GROUP:
            obj->attr->group.kind = procInfo[i].Relationship == RelationGroup ? HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP : HWLOC_GROUP_KIND_WINDOWS_RELATIONSHIP_UNKNOWN;
            break;
          default:
            break;
        }
        hwloc_insert_object_by_cpuset(topology, obj);
      }

      free(procInfo);
  }

  if (GetLogicalProcessorInformationExProc) {
      PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX procInfoTotal, tmpprocInfoTotal, procInfo;
      unsigned id;
      struct hwloc_obj *obj;
      hwloc_obj_type_t type;

      length = 0;
      procInfoTotal = NULL;

      while (1) {
        if (GetLogicalProcessorInformationExProc(RelationAll, procInfoTotal, &length))
          break;
        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
          return -1;
        tmpprocInfoTotal = realloc(procInfoTotal, length);
        if (!tmpprocInfoTotal) {
          free(procInfoTotal);
          goto out;
        }
        procInfoTotal = tmpprocInfoTotal;
      }

      for (procInfo = procInfoTotal;
           (void*) procInfo < (void*) ((uintptr_t) procInfoTotal + length);
           procInfo = (void*) ((uintptr_t) procInfo + procInfo->Size)) {
        unsigned num, i;
        GROUP_AFFINITY *GroupMask;

        /* Ignore unknown caches */
        if (procInfo->Relationship == RelationCache
                && procInfo->Cache.Type != CacheUnified
                && procInfo->Cache.Type != CacheData
                && procInfo->Cache.Type != CacheInstruction)
          continue;

        id = -1;
        switch (procInfo->Relationship) {
          case RelationNumaNode:
            type = HWLOC_OBJ_NUMANODE;
            num = 1;
            GroupMask = &procInfo->NumaNode.GroupMask;
            id = procInfo->NumaNode.NodeNumber;
            break;
          case RelationProcessorPackage:
            type = HWLOC_OBJ_PACKAGE;
            num = procInfo->Processor.GroupCount;
            GroupMask = procInfo->Processor.GroupMask;
            break;
          case RelationCache:
            type = (procInfo->Cache.Type == CacheInstruction ? HWLOC_OBJ_L1ICACHE : HWLOC_OBJ_L1CACHE) + procInfo->Cache.Level - 1;
            num = 1;
            GroupMask = &procInfo->Cache.GroupMask;
            break;
          case RelationProcessorCore:
            type = HWLOC_OBJ_CORE;
            num = procInfo->Processor.GroupCount;
            GroupMask = procInfo->Processor.GroupMask;
            break;
          case RelationGroup:
            /* So strange an interface... */
            for (id = 0; id < procInfo->Group.ActiveGroupCount; id++) {
              KAFFINITY mask;
              hwloc_bitmap_t set;

              set = hwloc_bitmap_alloc();
              mask = procInfo->Group.GroupInfo[id].ActiveProcessorMask;
              hwloc_debug("group %u %d cpus mask %lx\n", id,
                          procInfo->Group.GroupInfo[id].ActiveProcessorCount, mask);
              /* KAFFINITY is ULONG_PTR */
              hwloc_bitmap_set_ith_ULONG_PTR(set, id, mask);
              /* FIXME: what if running 32bits on a 64bits windows with 64-processor groups?
               * ULONG_PTR is 32bits, so half the group is invisible?
               * maybe scale id to id*8/sizeof(ULONG_PTR) so that groups are 64-PU aligned?
               */
              hwloc_debug_2args_bitmap("group %u %d bitmap %s\n", id, procInfo->Group.GroupInfo[id].ActiveProcessorCount, set);

              /* save the set of PUs so that we can create them at the end */
              if (!groups_pu_set)
                groups_pu_set = hwloc_bitmap_alloc();
              hwloc_bitmap_or(groups_pu_set, groups_pu_set, set);

              if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP)) {
                obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, id);
                obj->cpuset = set;
                obj->attr->group.kind = HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP;
                hwloc_insert_object_by_cpuset(topology, obj);
              } else
                hwloc_bitmap_free(set);
            }
            continue;
          default:
            /* Don't know how to get the mask.  */
            hwloc_debug("unknown relation %d\n", procInfo->Relationship);
            continue;
        }

        if (!hwloc_filter_check_keep_object_type(topology, type))
          continue;

        obj = hwloc_alloc_setup_object(topology, type, id);
        obj->cpuset = hwloc_bitmap_alloc();
        for (i = 0; i < num; i++) {
          hwloc_debug("%s#%u %d: mask %d:%lx\n", hwloc_type_name(type), id, i, GroupMask[i].Group, GroupMask[i].Mask);
          /* GROUP_AFFINITY.Mask is KAFFINITY, which is ULONG_PTR */
          hwloc_bitmap_set_ith_ULONG_PTR(obj->cpuset, GroupMask[i].Group, GroupMask[i].Mask);
          /* FIXME: scale id to id*8/sizeof(ULONG_PTR) as above? */
        }
        hwloc_debug_2args_bitmap("%s#%u bitmap %s\n", hwloc_type_name(type), id, obj->cpuset);
        switch (type) {
          case HWLOC_OBJ_NUMANODE:
            {
              ULONGLONG avail;
              obj->nodeset = hwloc_bitmap_alloc();
              hwloc_bitmap_set(obj->nodeset, id);
              if ((GetNumaAvailableMemoryNodeExProc && GetNumaAvailableMemoryNodeExProc(id, &avail))
               || (GetNumaAvailableMemoryNodeProc && GetNumaAvailableMemoryNodeProc(id, &avail)))
                obj->memory.local_memory = avail;
              obj->memory.page_types = malloc(2 * sizeof(*obj->memory.page_types));
              memset(obj->memory.page_types, 0, 2 * sizeof(*obj->memory.page_types));
              obj->memory.page_types_len = 1;
              obj->memory.page_types[0].size = SystemInfo.dwPageSize;
#if HAVE_DECL__SC_LARGE_PAGESIZE
              obj->memory.page_types_len++;
              obj->memory.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
#endif
              break;
            }
          case HWLOC_OBJ_L1CACHE:
          case HWLOC_OBJ_L2CACHE:
          case HWLOC_OBJ_L3CACHE:
          case HWLOC_OBJ_L4CACHE:
          case HWLOC_OBJ_L5CACHE:
          case HWLOC_OBJ_L1ICACHE:
          case HWLOC_OBJ_L2ICACHE:
          case HWLOC_OBJ_L3ICACHE:
            obj->attr->cache.size = procInfo->Cache.CacheSize;
            obj->attr->cache.associativity = procInfo->Cache.Associativity == CACHE_FULLY_ASSOCIATIVE ? -1 : procInfo->Cache.Associativity ;
            obj->attr->cache.linesize = procInfo->Cache.LineSize;
            obj->attr->cache.depth = procInfo->Cache.Level;
            switch (procInfo->Cache.Type) {
              case CacheUnified:
                obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
                break;
              case CacheData:
                obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
                break;
              case CacheInstruction:
                obj->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
                break;
              default:
                hwloc_free_unlinked_object(obj);
                continue;
            }
            break;
          default:
            break;
        }
        hwloc_insert_object_by_cpuset(topology, obj);
      }
      free(procInfoTotal);
  }

  if (groups_pu_set) {
    /* the system supports multiple Groups.
     * PU indexes may be discontiguous, especially if Groups contain less than 64 procs.
     */
    hwloc_obj_t obj;
    unsigned idx;
    hwloc_bitmap_foreach_begin(idx, groups_pu_set) {
      obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PU, idx);
      obj->cpuset = hwloc_bitmap_alloc();
      hwloc_bitmap_only(obj->cpuset, idx);
      hwloc_debug_1arg_bitmap("cpu %u has cpuset %s\n",
                              idx, obj->cpuset);
      hwloc_insert_object_by_cpuset(topology, obj);
    } hwloc_bitmap_foreach_end();
    hwloc_bitmap_free(groups_pu_set);
  } else {
Beispiel #13
0
static int orcmd_init(void)
{
    int ret = ORTE_ERROR;
    char *error = NULL;
    opal_buffer_t buf, *clusterbuf, *uribuf;
    orte_job_t *jdata;
    orte_node_t *node;
    orte_proc_t *proc;
    opal_list_t config;
    orcm_scheduler_t *scheduler;
    orcm_node_t *mynode=NULL;
    int32_t n;

    if (initialized) {
        return ORCM_SUCCESS;
    }
    initialized = true;

    /* Initialize the ORTE data type support */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_std_prolog";
        goto error;
    }

    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
                                                       1,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       1))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }

    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }

    /* create a job tracker for the daemons */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = 0;
    ORTE_PROC_MY_NAME->jobid = 0;
    opal_pointer_array_set_item(orte_job_data, 0, jdata);

    /* read the site configuration */
    OBJ_CONSTRUCT(&config, opal_list_t);
    if (ORCM_SUCCESS != (ret = orcm_cfgi.read_config(&config))) {
        error = "getting config";
        goto error;
    }

    /* define the cluster and collect contact info for all
     * aggregators - we'll need to know how to talk to any
     * of them in case of failures
     */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    if (ORCM_SUCCESS != (ret = orcm_cfgi.define_system(&config,
                                                       &mynode,
                                                       &orte_process_info.num_procs,
                                                       &buf))) {
        OBJ_DESTRUCT(&buf);
        error = "define system";
        goto error;
    }

    /* if my name didn't get set, then we didn't find our node
     * in the config - report it and die
     */
    if (NULL == mynode) {
        orte_show_help("help-ess-orcm.txt", "node-not-found", true,
                       orcm_cfgi_base.config_file,
                       orte_process_info.nodename);
        OBJ_DESTRUCT(&buf);
        return ORTE_ERR_SILENT;
    }

    /* define a node and proc object for ourselves as some parts
     * of ORTE and ORCM require it */
    if (NULL == (node = OBJ_NEW(orte_node_t))) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        error = "out of memory";
        goto error;
    }
    node->name = strdup(orte_process_info.nodename);
    opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
    if (NULL == (proc = OBJ_NEW(orte_proc_t))) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        error = "out of memory";
        goto error;
    }
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    OBJ_RETAIN(proc);
    node->daemon = proc;
    OBJ_RETAIN(node);
    proc->node = node;
    opal_pointer_array_set_item(jdata->procs, ORTE_PROC_MY_NAME->vpid, proc);

    /* For now, we only support a single scheduler daemon in the system.
     * This *may* change someday in the future */
    scheduler = (orcm_scheduler_t*)opal_list_get_first(orcm_schedulers);

    /* If we are in test mode, then we don't *require* that a scheduler
     * be defined in the system - otherwise, we do */
    if (NULL == scheduler) {
        if (mca_sst_orcmd_component.scheduler_reqd) {
            error = "no scheduler found";
            ret = ORTE_ERR_NOT_FOUND;
            goto error;
        }
    } else {
        ORTE_PROC_MY_SCHEDULER->jobid = scheduler->controller.daemon.jobid;
        ORTE_PROC_MY_SCHEDULER->vpid = scheduler->controller.daemon.vpid;
    }

    /* register the ORTE-level params at this time now that the
     * config has had a chance to push things into the environ
     */
    if (ORTE_SUCCESS != (ret = orte_register_params())) {
        OBJ_DESTRUCT(&buf);
        error = "orte_register_params";
        goto error;
    }

    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /* Set signal handlers to catch kill signals so we can properly clean up
     * after ourselves.
     */
    setup_sighandler(SIGTERM, &term_handler, shutdown_signal);
    setup_sighandler(SIGINT, &int_handler, shutdown_signal);

    /** setup callbacks for signals we should ignore */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback);
    signals_set = true;

#if OPAL_HAVE_HWLOC
    {
        hwloc_obj_t obj;
        unsigned i, j;

        /* get the local topology */
        if (NULL == opal_hwloc_topology) {
            if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
                OBJ_DESTRUCT(&buf);
                error = "topology discovery";
                goto error;
            }
        }

        /* remove the hostname from the topology. Unfortunately, hwloc
         * decided to add the source hostname to the "topology", thus
         * rendering it unusable as a pure topological description. So
         * we remove that information here.
         */
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        for (i=0; i < obj->infos_count; i++) {
            if (NULL == obj->infos[i].name ||
                NULL == obj->infos[i].value) {
                continue;
            }
            if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
                free(obj->infos[i].name);
                free(obj->infos[i].value);
                /* left justify the array */
                for (j=i; j < obj->infos_count-1; j++) {
                    obj->infos[j] = obj->infos[j+1];
                }
                obj->infos[obj->infos_count-1].name = NULL;
                obj->infos[obj->infos_count-1].value = NULL;
                obj->infos_count--;
                break;
            }
        }

        if (15 < opal_output_get_verbosity(orcm_sst_base_framework.framework_output)) {
            opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
        }

        /* if we were asked to bind to specific core(s), do so now */
        if (NULL != orte_daemon_cores) {
            char **cores=NULL, tmp[128];
            hwloc_obj_t pu;
            hwloc_cpuset_t ours, pucpus, res;
            int core;

            /* could be a collection of comma-delimited ranges, so
             * use our handy utility to parse it
             */
            orte_util_parse_range_options(orte_daemon_cores, &cores);
            if (NULL != cores) {
                ours = hwloc_bitmap_alloc();
                hwloc_bitmap_zero(ours);
                pucpus = hwloc_bitmap_alloc();
                res = hwloc_bitmap_alloc();
                for (i=0; NULL != cores[i]; i++) {
                    core = strtoul(cores[i], NULL, 10);
                    if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
                        orte_show_help("help-orted.txt", "orted:cannot-bind",
                                       true, orte_process_info.nodename,
                                       orte_daemon_cores);
                        ret = ORTE_ERR_NOT_SUPPORTED;
                        OBJ_DESTRUCT(&buf);
                        error = "cannot bind";
                        goto error;
                    }
                    hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset);
                    hwloc_bitmap_or(res, ours, pucpus);
                    hwloc_bitmap_copy(ours, res);
                }
                /* if the result is all zeros, then don't bind */
                if (!hwloc_bitmap_iszero(ours)) {
                    (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0);
                    if (opal_hwloc_report_bindings) {
                        opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours);
                        opal_output(0, "Daemon %s is bound to cores %s",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
                    }
                }
                /* cleanup */
                hwloc_bitmap_free(ours);
                hwloc_bitmap_free(pucpus);
                hwloc_bitmap_free(res);
                opal_argv_free(cores);
            }
        }
    }
#endif

    /* open and select the pstat framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "opal_pstat_base_select";
        goto error;
    }

    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_state_base_select";
        goto error;
    }

    /* open the notifier */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_notifier_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_notifier_base_open";
        goto error;
    }

    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_errmgr_base_open";
        goto error;
    }

    /* Setup the communication infrastructure */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_select";
        goto error;
    }
    if (!opal_list_get_size(&orte_oob_base.actives)) {
        ret = ORTE_ERROR;
        error = "orte_oob: Found 0 active transports";
        goto error;
    }

    /* Runtime Messaging Layer */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_select";
        goto error;
    }

    /* select the notifier*/
    if (ORTE_SUCCESS != (ret = orte_notifier_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_notifier_base_select";
        goto error;
    }

    /* select the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_errmgr_base_select";
        goto error;
    }

    /* Routed system */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_routed_base_select";
        goto error;
    }

    /* database */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_db_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orcm_db_base_open";
        goto error;
    }
    /* always restrict daemons to local database components */
    if (ORTE_SUCCESS != (ret = orcm_db_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orcm_db_base_select";
        goto error;
    }

    /* datastore - ensure we don't pickup the pmi component, but
     * don't override anything set by user
     */
    if (NULL == getenv(OPAL_MCA_PREFIX"dstore")) {
        putenv(OPAL_MCA_PREFIX"dstore=^pmi");
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_dstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_dstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_select";
        goto error;
    }
    /* create the handle */
    if (0 > (opal_dstore_internal = opal_dstore.open("INTERNAL", NULL, NULL))) {
        error = "opal dstore internal";
        ret = ORTE_ERR_FATAL;
        goto error;
    }

    /* extract the cluster description and setup the routed info - the orcm routed component
     * will know what to do. */
    n = 1;
    if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &clusterbuf, &n, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "extract cluster buf";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, clusterbuf))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        OBJ_RELEASE(clusterbuf);
        error = "orte_routed.init_routes";
        goto error;
    }
    OBJ_RELEASE(clusterbuf);

    /* extract the uri buffer and load the hash tables */
    n = 1;
    if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &uribuf, &n, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "extract uri buffer";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_update_contact_info(uribuf))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        OBJ_RELEASE(uribuf);
        error = "load hash tables";
        goto error;
    }
    OBJ_DESTRUCT(&buf);
    OBJ_RELEASE(uribuf);

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }

    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }

    /* enable communication with the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }

    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }

    /*
     * Initalize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't some user level tools may hang.
     */
    opal_cr_set_enabled(false);
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }

    /* setup the ANALYTICS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_analytics_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_analytics_base_open";
        goto error;
    }

    /* setup the EVGEN framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_evgen_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_evgen_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_evgen_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_evgen_select";
        goto error;
    }

    /* setup the SENSOR framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_sensor_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_sensor_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_sensor_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_sensor_select";
        goto error;
    }
    /* start the local sensors */
    orcm_sensor.start(ORTE_PROC_MY_NAME->jobid);

    /* setup the PWRMGMT framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_pwrmgmt_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_pwrmgmt_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_pwrmgmt_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_pwrmgmt_select";
        goto error;
    }

    /* setup the DFS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_select";
        goto error;
    }

    /* open and setup the DIAG framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_diag_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_diag_base_open";
        goto error;
    }
    if (ORCM_SUCCESS != (ret = orcm_diag_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_diag_select";
        goto error;
    }

    return ORTE_SUCCESS;
    
 error:
    orte_show_help("help-orcm-runtime.txt",
                   "orcm_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    return ORTE_ERR_SILENT;
}
int main(void)
{
  const struct hwloc_topology_support *support;
  char *buffer;
  hwloc_topology_t topology;
  hwloc_bitmap_t set = hwloc_bitmap_alloc();
  hwloc_bitmap_t total = hwloc_bitmap_alloc();
  hwloc_obj_t node;
  char *s;
  int err;

  err = hwloc_topology_init(&topology);
  assert(!err);
  err = hwloc_topology_load(topology);
  assert(!err);

  support = hwloc_topology_get_support(topology);
  if (!support->membind->get_area_memlocation)
    goto out;

  buffer = hwloc_alloc(topology, LEN);
  assert(buffer);
  printf("buffer %p length %u\n", buffer, LEN);

  err = hwloc_get_area_memlocation(topology, buffer, LEN, set, HWLOC_MEMBIND_BYNODESET);
  if (err < 0 && errno == ENOSYS) {
    fprintf(stderr, "hwloc_get_area_memlocation() failed with ENOSYS, aborting\n");
    goto out_with_buffer;
  }
  assert(!err);
  hwloc_bitmap_asprintf(&s, set);
  printf("address %p length %u allocated in nodeset %s\n", buffer, LEN, s);
  free(s);
  hwloc_bitmap_copy(total, set);

  node = NULL;
 next1:
  node = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, node);
  if (!node)
    goto out_with_buffer;
  if (!node->memory.local_memory)
    goto next1;
  printf("binding to 1st node and touching 1st quarter\n");
  err = hwloc_set_area_membind(topology, buffer, LEN, node->nodeset, HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_BYNODESET);
  if (err < 0 && errno == ENOSYS) {
    fprintf(stderr, "hwloc_set_area_membind() failed with ENOSYS, aborting\n");
    goto out_with_buffer;
  }
  assert(!err);

  memset(buffer, 0, LEN/4);
  err = hwloc_get_area_memlocation(topology, buffer, 1, set, HWLOC_MEMBIND_BYNODESET);
  assert(!err);
  hwloc_bitmap_asprintf(&s, set);
  printf("address %p length %u allocated in nodeset %s\n", buffer, LEN/4, s);
  free(s);
  hwloc_bitmap_or(total, total, set);

 next2:
  node = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, node);
  if (!node)
    goto out_with_nomorenodes;
  if (!node->memory.local_memory)
    goto next2;
  printf("binding to 2nd node and touching 2nd quarter\n");
  err = hwloc_set_area_membind(topology, buffer, LEN, node->nodeset, HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_BYNODESET);
  assert(!err);

  memset(buffer+LEN/4, 0, LEN/4);
  err = hwloc_get_area_memlocation(topology, buffer+LEN/4, LEN/4, set, HWLOC_MEMBIND_BYNODESET);
  assert(!err);
  hwloc_bitmap_asprintf(&s, set);
  printf("address %p length %u allocated in nodeset %s\n", buffer+LEN/4, LEN/4, s);
  free(s);
  hwloc_bitmap_or(total, total, set);

 next3:
  node = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, node);
  if (!node)
    goto out_with_nomorenodes;
  if (!node->memory.local_memory)
    goto next3;
  printf("binding to 3rd node and touching 3rd quarter\n");
  err = hwloc_set_area_membind(topology, buffer, LEN, node->nodeset, HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_BYNODESET);
  assert(!err);

  memset(buffer+LEN/2, 0, LEN/4);
  err = hwloc_get_area_memlocation(topology, buffer+LEN/2, LEN/4, set, HWLOC_MEMBIND_BYNODESET);
  assert(!err);
  hwloc_bitmap_asprintf(&s, set);
  printf("address %p length %u allocated in nodeset %s\n", buffer+LEN/2, LEN/4, s);
  free(s);
  hwloc_bitmap_or(total, total, set);

 next4:
  node = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, node);
  if (!node)
    goto out_with_nomorenodes;
  if (!node->memory.local_memory)
    goto next4;
  printf("binding to 4th node and touching 4th quarter\n");
  err = hwloc_set_area_membind(topology, buffer, LEN, node->nodeset, HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_BYNODESET);
  assert(!err);

  memset(buffer+3*LEN/4, 0, LEN/4);
  err = hwloc_get_area_memlocation(topology, buffer+3*LEN/4, LEN/4, set, HWLOC_MEMBIND_BYNODESET);
  assert(!err);
  hwloc_bitmap_asprintf(&s, set);
  printf("address %p length %u allocated in nodeset %s\n", buffer+3*LEN/4, LEN/4, s);
  free(s);
  hwloc_bitmap_or(total, total, set);

 out_with_nomorenodes:
  err = hwloc_get_area_memlocation(topology, buffer, LEN, set, HWLOC_MEMBIND_BYNODESET);
  assert(!err);
  hwloc_bitmap_asprintf(&s, set);
  printf("address %p length %u located on %s\n", buffer, LEN, s);
  free(s);
  assert(hwloc_bitmap_isincluded(total, set));

 out_with_buffer:
  hwloc_free(topology, buffer, LEN);

 out:
  hwloc_topology_destroy(topology);
  hwloc_bitmap_free(set);
  hwloc_bitmap_free(total);
  return 0;
}
/****** shepherd_binding/binding_explicit() *****************************************
*  NAME
*     binding_explicit() -- Binds current process to specified CPU cores. 
*
*  SYNOPSIS
*     bool binding_explicit(int* list_of_cores, int camount, int* 
*     list_of_sockets, int samount) 
*
*  FUNCTION
*     Binds the current process to the cores specified by a <socket>,<core>
*     tuple. The tuple is given by a list of sockets and a list of cores. 
*     The elements on the same position of these lists are reflecting 
*     a tuple. Therefore the length of the lists must be the same.
*
*  INPUTS
*     int* list_of_sockets - List of sockets in the same order as list of cores. 
*     int samount          - Length of the list of sockets. 
*     int* list_of_cores   - List of cores in the same order as list of sockets. 
*     int camount          - Length of the list of cores. 
*     int type             - Type of binding ( set | env | pe ).
*
*  RESULT
*     bool - true when the current process was bound as specified with the
*            input parameter
*
*  NOTES
*     MT-NOTE: binding_explicit() is not MT safe 
*
*******************************************************************************/
static bool binding_explicit(const int* list_of_sockets, const int samount, 
   const int* list_of_cores, const int camount, const binding_type_t type)
{
   /* return value: successful bound or not */ 
   bool bound = false;

#if HAVE_HWLOC
   /* check if we have exactly the same number of sockets as cores */
   if (camount != samount) {
      shepherd_trace("binding_explicit: bug: number of sockets != number of cores");
      return false;
   }

   if (list_of_sockets == NULL || list_of_cores == NULL) {
      shepherd_trace("binding_explicit: wrong input values");
   }   
   
   if (has_core_binding() == true) {
      
      if (has_topology_information()) {
         /* bitmask for processors to turn on and off */
         hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();

         /* processor id counter */
         int pr_id_ctr;

         /* Fetch for each socket,core tuple the processor id. 
            If this is not possible for one do nothing and return false. */ 

         /* go through all socket,core tuples and get the processor id */
         for (pr_id_ctr = 0; pr_id_ctr < camount; pr_id_ctr++) { 
            hwloc_obj_t core =
              hwloc_get_obj_below_by_type(sge_hwloc_topology,
                                          HWLOC_OBJ_SOCKET,
                                          list_of_sockets[pr_id_ctr],
                                          HWLOC_OBJ_CORE,
                                          list_of_cores[pr_id_ctr]);
            if (!core) {
               hwloc_bitmap_free(cpuset);
               return false;
            }
            hwloc_bitmap_or(cpuset, cpuset, core->cpuset);
         }

         if (type == BINDING_TYPE_PE) {
            
            /* rankfile is created */

         } else if (type == BINDING_TYPE_ENV) {
            /* set the environment variable */
            if (create_binding_env(cpuset) == true) {
               shepherd_trace("binding_explicit: SGE_BINDING env var created");
            } else {
               shepherd_trace("binding_explicit: problems while creating SGE_BINDING env");
            }
         } else {
            /* do the core binding for the current process with the mask */
            if (bind_process_to_mask(cpuset) == true) {
               /* there was an error while binding */ 
               bound = true;
            } else {
               /* couldn't be bound return false */
               shepherd_trace("binding_explicit: bind_process_to_mask was not successful");
            }   
         }

         hwloc_bitmap_free(cpuset);
         /* Fixme:  Maybe free topology at this stage, but it probably
            doesn't use a significant amount of space.  */
      } else {
         /* has no topology information */
         shepherd_trace("binding_explicit: no topology information");
      }  

   } else {
      /* has no core binding ability */
      shepherd_trace("binding_explicit: host does not support core binding");
   }   
#endif  /* HAVE_HWLOC */
   return bound;
}
Beispiel #16
0
int orte_daemon(int argc, char *argv[])
{
    int ret = 0;
    opal_cmd_line_t *cmd_line = NULL;
    char *rml_uri;
    int i;
    opal_buffer_t *buffer;
    char hostname[100];
#if OPAL_ENABLE_FT_CR == 1
    char *tmp_env_var = NULL;
#endif
    
    /* initialize the globals */
    memset(&orted_globals, 0, sizeof(orted_globals));
    /* initialize the singleton died pipe to an illegal value so we can detect it was set */
    orted_globals.singleton_died_pipe = -1;
    /* init the failure orted vpid to an invalid value */
    orted_globals.fail = ORTE_VPID_INVALID;
    
    /* setup to check common command line options that just report and die */
    cmd_line = OBJ_NEW(opal_cmd_line_t);
    if (OPAL_SUCCESS != opal_cmd_line_create(cmd_line, orte_cmd_line_opts)) {
        OBJ_RELEASE(cmd_line);
        exit(1);
    }
    mca_base_cmd_line_setup(cmd_line);
    if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false,
                                                   argc, argv))) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        fprintf(stderr, "Usage: %s [OPTION]...\n%s\n", argv[0], args);
        free(args);
        OBJ_RELEASE(cmd_line);
        return ret;
    }
    
    /*
     * Since this process can now handle MCA/GMCA parameters, make sure to
     * process them.
     */
    mca_base_cmd_line_process_args(cmd_line, &environ, &environ);
    
    /* Ensure that enough of OPAL is setup for us to be able to run */
    /*
     * NOTE: (JJH)
     *  We need to allow 'mca_base_cmd_line_process_args()' to process command
     *  line arguments *before* calling opal_init_util() since the command
     *  line could contain MCA parameters that affect the way opal_init_util()
     *  functions. AMCA parameters are one such option normally received on the
     *  command line that affect the way opal_init_util() behaves.
     *  It is "safe" to call mca_base_cmd_line_process_args() before 
     *  opal_init_util() since mca_base_cmd_line_process_args() does *not*
     *  depend upon opal_init_util() functionality.
     */
    if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
        fprintf(stderr, "OPAL failed to initialize -- orted aborting\n");
        exit(1);
    }

    /* save the environment for launch purposes. This MUST be
     * done so that we can pass it to any local procs we
     * spawn - otherwise, those local procs won't see any
     * non-MCA envars that were set in the enviro when the
     * orted was executed - e.g., by .csh
     */
    orte_launch_environ = opal_argv_copy(environ);
    
    /* purge any ess flag set in the environ when we were launched */
    opal_unsetenv(OPAL_MCA_PREFIX"ess", &orte_launch_environ);
    
    /* if orte_daemon_debug is set, let someone know we are alive right
     * away just in case we have a problem along the way
     */
    if (orted_globals.debug) {
        gethostname(hostname, 100);
        fprintf(stderr, "Daemon was launched on %s - beginning to initialize\n", hostname);
    }
    
    /* check for help request */
    if (orted_globals.help) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        orte_show_help("help-orted.txt", "orted:usage", false,
                       argv[0], args);
        free(args);
        return 1;
    }
#if defined(HAVE_SETSID)
    /* see if we were directed to separate from current session */
    if (orted_globals.set_sid) {
        setsid();
    }
#endif
    /* see if they want us to spin until they can connect a debugger to us */
    i=0;
    while (orted_spin_flag) {
        i++;
        if (1000 < i) i=0;        
    }

#if OPAL_ENABLE_FT_CR == 1
    /* Mark as a tool program */
    (void) mca_base_var_env_name ("opal_cr_is_tool", &tmp_env_var);
    opal_setenv(tmp_env_var,
                "1",
                true, &environ);
    free(tmp_env_var);
#endif

    /* if mapreduce set, flag it */
    if (orted_globals.mapreduce) {
        orte_map_reduce = true;
    }

    /* detach from controlling terminal
     * otherwise, remain attached so output can get to us
     */
    if(!orte_debug_flag &&
       !orte_debug_daemons_flag &&
       orted_globals.daemonize) {
        opal_daemon_init(NULL);
    }
    
    /* Set the flag telling OpenRTE that I am NOT a
     * singleton, but am "infrastructure" - prevents setting
     * up incorrect infrastructure that only a singleton would
     * require.
     */
    if (orted_globals.hnp) {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    } else {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_DAEMON))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    }
    /* finalize the OPAL utils. As they are opened again from orte_init->opal_init
     * we continue to have a reference count on them. So we have to finalize them twice...
     */
    opal_finalize_util();

#if OPAL_HAVE_HWLOC
    /* bind ourselves if so directed */
    if (NULL != orte_daemon_cores) {
        char **cores=NULL, tmp[128];
        hwloc_obj_t pu;
        hwloc_cpuset_t ours, pucpus, res;
        int core;

        /* could be a collection of comma-delimited ranges, so
         * use our handy utility to parse it
         */
        orte_util_parse_range_options(orte_daemon_cores, &cores);
        if (NULL != cores) {
            ours = hwloc_bitmap_alloc();
            hwloc_bitmap_zero(ours);
            pucpus = hwloc_bitmap_alloc();
            res = hwloc_bitmap_alloc();
            for (i=0; NULL != cores[i]; i++) {
                core = strtoul(cores[i], NULL, 10);
                if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
                    /* turn off the show help forwarding as we won't
                     * be able to cycle the event library to send
                     */
                    orte_show_help_finalize();
                    /* the message will now come out locally */
                    orte_show_help("help-orted.txt", "orted:cannot-bind",
                                   true, orte_process_info.nodename,
                                   orte_daemon_cores);
                    ret = ORTE_ERR_NOT_SUPPORTED;
                    goto DONE;
                }
                hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset);
                hwloc_bitmap_or(res, ours, pucpus);
                hwloc_bitmap_copy(ours, res);
            }
            /* if the result is all zeros, then don't bind */
            if (!hwloc_bitmap_iszero(ours)) {
                (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0);
                if (opal_hwloc_report_bindings) {
                    opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours);
                    opal_output(0, "Daemon %s is bound to cores %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
                }
            }
            /* cleanup */
            hwloc_bitmap_free(ours);
            hwloc_bitmap_free(pucpus);
            hwloc_bitmap_free(res);
            opal_argv_free(cores);
        }
    }
#endif

    if ((int)ORTE_VPID_INVALID != orted_globals.fail) {
        orted_globals.abort=false;
        /* some vpid was ordered to fail. The value can be positive
         * or negative, depending upon the desired method for failure,
         * so need to check both here
         */
        if (0 > orted_globals.fail) {
            orted_globals.fail = -1*orted_globals.fail;
            orted_globals.abort = true;
        }
        /* are we the specified vpid? */
        if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) {
            /* if the user specified we delay, then setup a timer
             * and have it kill us
             */
            if (0 < orted_globals.fail_delay) {
                ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_callback, ORTE_SYS_PRI);
                
            } else {
                opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            orted_globals.abort ? "abort" : "abnormal termination");

                /* do -not- call finalize as this will send a message to the HNP
                 * indicating clean termination! Instead, just forcibly cleanup
                 * the local session_dir tree and exit
                 */
                orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
                
                /* if we were ordered to abort, do so */
                if (orted_globals.abort) {
                    abort();
                }
                
                /* otherwise, return with non-zero status */
                ret = ORTE_ERROR_DEFAULT_EXIT_CODE;
                goto DONE;
            }
        }
    }

    /* insert our contact info into our process_info struct so we
     * have it for later use and set the local daemon field to our name
     */
    orte_process_info.my_daemon_uri = orte_rml.get_contact_info();
    ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid;
    ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid;
    
    /* if I am also the hnp, then update that contact info field too */
    if (ORTE_PROC_IS_HNP) {
        orte_process_info.my_hnp_uri = orte_rml.get_contact_info();
        ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
        ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
    }
    
    /* setup the primary daemon command receive function */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                            ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);
    
    /* output a message indicating we are alive, our name, and our pid
     * for debugging purposes
     */
    if (orte_debug_daemons_flag) {
        fprintf(stderr, "Daemon %s checking in as pid %ld on host %s\n",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)orte_process_info.pid,
                orte_process_info.nodename);
    }

    /* We actually do *not* want the orted to voluntarily yield() the
       processor more than necessary.  The orted already blocks when
       it is doing nothing, so it doesn't use any more CPU cycles than
       it should; but when it *is* doing something, we do not want it
       to be unnecessarily delayed because it voluntarily yielded the
       processor in the middle of its work.

       For example: when a message arrives at the orted, we want the
       OS to wake up the orted in a timely fashion (which most OS's
       seem good about doing) and then we want the orted to process
       the message as fast as possible.  If the orted yields and lets
       aggressive MPI applications get the processor back, it may be a
       long time before the OS schedules the orted to run again
       (particularly if there is no IO event to wake it up).  Hence,
       routed OOB messages (for example) may be significantly delayed
       before being delivered to MPI processes, which can be
       problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
       require OOB messages for wireup, etc.). */
    opal_progress_set_yield_when_idle(false);

    /* Change the default behavior of libevent such that we want to
       continually block rather than blocking for the default timeout
       and then looping around the progress engine again.  There
       should be nothing in the orted that cannot block in libevent
       until "something" happens (i.e., there's no need to keep
       cycling through progress because the only things that should
       happen will happen in libevent).  This is a minor optimization,
       but what the heck... :-) */
    opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);

    /* if requested, report my uri to the indicated pipe */
    if (orted_globals.uri_pipe > 0) {
        orte_job_t *jdata;
        orte_proc_t *proc;
        orte_node_t *node;
        orte_app_context_t *app;
        char *tmp, *nptr, *sysinfo;
        int32_t ljob;

        /* setup the singleton's job */
        jdata = OBJ_NEW(orte_job_t);
        orte_plm_base_create_jobid(jdata);
        ljob = ORTE_LOCAL_JOBID(jdata->jobid);
        opal_pointer_array_set_item(orte_job_data, ljob, jdata);

        /* must create a map for it (even though it has no
         * info in it) so that the job info will be picked
         * up in subsequent pidmaps or other daemons won't
         * know how to route
         */
        jdata->map = OBJ_NEW(orte_job_map_t);

        /* setup an app_context for the singleton */
        app = OBJ_NEW(orte_app_context_t);
        app->app = strdup("singleton");
        app->num_procs = 1;
        opal_pointer_array_add(jdata->apps, app);
        
        /* setup a proc object for the singleton - since we
         * -must- be the HNP, and therefore we stored our
         * node on the global node pool, and since the singleton
         * -must- be on the same node as us, indicate that
         */
        proc = OBJ_NEW(orte_proc_t);
        proc->name.jobid = jdata->jobid;
        proc->name.vpid = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE);
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        /* obviously, it is on my node */
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        proc->node = node;
        OBJ_RETAIN(node);  /* keep accounting straight */
        opal_pointer_array_add(jdata->procs, proc);
        jdata->num_procs = 1;
        /* and it obviously is on the node */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(node->procs, proc);
        node->num_procs++;
        /* and obviously it is one of my local procs */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(orte_local_children, proc);
        jdata->num_local_procs = 1;
        /* set the trivial */
        proc->local_rank = 0;
        proc->node_rank = 0;
        proc->app_rank = 0;
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL);

        /* create a string that contains our uri + sysinfo + PMIx server URI */
        orte_util_convert_sysinfo_to_string(&sysinfo, orte_local_cpu_type, orte_local_cpu_model);
        asprintf(&tmp, "%s[%s]%s", orte_process_info.my_daemon_uri, sysinfo, pmix_server_uri);
	free(sysinfo);

        /* pass that info to the singleton */
        write(orted_globals.uri_pipe, tmp, strlen(tmp)+1); /* need to add 1 to get the NULL */

        /* cleanup */
        free(tmp);

        /* since a singleton spawned us, we need to harvest
         * any MCA params from the local environment so
         * we can pass them along to any subsequent daemons
         * we may start as the result of a comm_spawn
         */
        for (i=0; NULL != environ[i]; i++) {
            if (0 == strncmp(environ[i], OPAL_MCA_PREFIX, 9)) {
                /* make a copy to manipulate */
                tmp = strdup(environ[i]);
                /* find the equal sign */
                nptr = strchr(tmp, '=');
                *nptr = '\0';
                nptr++;
                /* add the mca param to the orted cmd line */
                opal_argv_append_nosize(&orted_cmd_line, "-"OPAL_MCA_CMD_LINE_ID);
                opal_argv_append_nosize(&orted_cmd_line, &tmp[9]);
                opal_argv_append_nosize(&orted_cmd_line, nptr);
                free(tmp);
            }
        }
    }

    /* if we were given a pipe to monitor for singleton termination, set that up */
    if (orted_globals.singleton_died_pipe > 0) {
        /* register shutdown handler */
        pipe_handler = (opal_event_t*)malloc(sizeof(opal_event_t));
        opal_event_set(orte_event_base, pipe_handler,
                       orted_globals.singleton_died_pipe,
                       OPAL_EV_READ,
                       pipe_closed,
                       pipe_handler);
        opal_event_add(pipe_handler, NULL);
    }

    /* If I have a parent, then save his contact info so
     * any messages we send can flow thru him.
     */

    orte_parent_uri = NULL;
    (void) mca_base_var_register ("orte", "orte", NULL, "parent_uri",
                                  "URI for the parent if tree launch is enabled.",
                                  MCA_BASE_VAR_TYPE_STRING, NULL, 0,
                                  MCA_BASE_VAR_FLAG_INTERNAL,
                                  OPAL_INFO_LVL_9,
                                  MCA_BASE_VAR_SCOPE_CONSTANT,
                                  &orte_parent_uri);
    if (NULL != orte_parent_uri) {
        orte_process_name_t parent;

        /* set the contact info into the hash table */
        orte_rml.set_contact_info(orte_parent_uri);
        ret = orte_rml_base_parse_uris(orte_parent_uri, &parent, NULL);
        if (ORTE_SUCCESS != ret) {
            ORTE_ERROR_LOG(ret);
            free (orte_parent_uri);
            orte_parent_uri = NULL;
            goto DONE;
        }

        /* don't need this value anymore */
        free(orte_parent_uri);
        orte_parent_uri = NULL;

        /* tell the routed module that we have a path
         * back to the HNP
         */
        if (ORTE_SUCCESS != (ret = orte_routed.update_route(ORTE_PROC_MY_HNP, &parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
        /* set the lifeline to point to our parent so that we
         * can handle the situation if that lifeline goes away
         */
        if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(&parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
    }

    /* if we are not the HNP...the only time we will be an HNP
     * is if we are launched by a singleton to provide support
     * for it
     */
    if (!ORTE_PROC_IS_HNP) {
        /* send the information to the orted report-back point - this function
         * will process the data, but also counts the number of
         * orteds that reported back so the launch procedure can continue.
         * We need to do this at the last possible second as the HNP
         * can turn right around and begin issuing orders to us
         */

        buffer = OBJ_NEW(opal_buffer_t);
        /* insert our name for rollup purposes */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }
        /* for now, always include our contact info, even if we are using
         * static ports. Eventually, this will be removed
         */
        rml_uri = orte_rml.get_contact_info();
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &rml_uri, 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }

        /* include our node name */
        opal_dss.pack(buffer, &orte_process_info.nodename, 1, OPAL_STRING);

        /* if requested, include any non-loopback aliases for this node */
        if (orte_retain_aliases) {
            char **aliases=NULL;
            uint8_t naliases, ni;
            char hostname[ORTE_MAX_HOSTNAME_SIZE];

            /* if we stripped the prefix or removed the fqdn,
             * include full hostname as an alias
             */
            gethostname(hostname, ORTE_MAX_HOSTNAME_SIZE);
            if (strlen(orte_process_info.nodename) < strlen(hostname)) {
                opal_argv_append_nosize(&aliases, hostname);
            }
            opal_ifgetaliases(&aliases);
            naliases = opal_argv_count(aliases);
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &naliases, 1, OPAL_UINT8))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(buffer);
                goto DONE;
            }
            for (ni=0; ni < naliases; ni++) {
                if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &aliases[ni], 1, OPAL_STRING))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buffer);
                    goto DONE;
                }
            }
            opal_argv_free(aliases);
        }

#if OPAL_HAVE_HWLOC
        {
            char *coprocessors;
            /* add the local topology */
            if (NULL != opal_hwloc_topology &&
                (1 == ORTE_PROC_MY_NAME->vpid || orte_hetero_nodes)) {
                if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
                    ORTE_ERROR_LOG(ret);
                }
            }
            /* detect and add any coprocessors */
            coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology);
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &coprocessors, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(ret);
            }
            /* see if I am on a coprocessor */
            coprocessors = opal_hwloc_base_check_on_coprocessor();
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &coprocessors, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(ret);
            }
        }
#endif

        /* send to the HNP's callback - will be routed if routes are available */
        if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer,
                                               ORTE_RML_TAG_ORTED_CALLBACK,
                                               orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }
    }

    /* if we are tree-spawning, then we need to capture the MCA params
     * from our cmd line so we can pass them along to the daemons we spawn -
     * otherwise, only the first layer of daemons will ever see them
     */
    if (orted_globals.tree_spawn) {
        int j, k;
        bool ignore;
        char *no_keep[] = {
            "orte_hnp_uri",
            "orte_ess_jobid",
            "orte_ess_vpid",
            "orte_ess_num_procs",
            "orte_parent_uri",
            "mca_base_env_list",
            NULL
        };
        for (i=0; i < argc; i++) {
            if (0 == strcmp("-"OPAL_MCA_CMD_LINE_ID,  argv[i]) ||
                0 == strcmp("--"OPAL_MCA_CMD_LINE_ID, argv[i]) ) {
                ignore = false;
                /* see if this is something we cannot pass along */
                for (k=0; NULL != no_keep[k]; k++) {
                    if (0 == strcmp(no_keep[k], argv[i+1])) {
                        ignore = true;
                        break;
                    }
                }
                if (!ignore) {
                    /* see if this is already present so we at least can
                     * avoid growing the cmd line with duplicates
                     */
                    if (NULL != orted_cmd_line) {
                        for (j=0; NULL != orted_cmd_line[j]; j++) {
                            if (0 == strcmp(argv[i+1], orted_cmd_line[j])) {
                                /* already here - ignore it */
                                ignore = true;
                                break;
                            }
                        }
                    }
                    if (!ignore) {
                        opal_argv_append_nosize(&orted_cmd_line, argv[i]);
                        opal_argv_append_nosize(&orted_cmd_line, argv[i+1]);
                        opal_argv_append_nosize(&orted_cmd_line, argv[i+2]);
                    }
                }
                i += 2;
            }
        }
    }
            
    if (orte_debug_daemons_flag) {
        opal_output(0, "%s orted: up and running - waiting for commands!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
    }
    ret = ORTE_SUCCESS;

    /* loop the event lib until an exit event is detected */
    while (orte_event_base_active) {
        opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
    }

    /* ensure all local procs are dead */
    orte_odls.kill_local_procs(NULL);

 DONE:
    /* update the exit status, in case it wasn't done */
    ORTE_UPDATE_EXIT_STATUS(ret);

    /* cleanup and leave */
    orte_finalize();

    if (orte_debug_flag) {
        fprintf(stderr, "exiting with status %d\n", orte_exit_status);
    }
    exit(orte_exit_status);
}
Beispiel #17
0
/* user to have to play with the cgroup hierarchy to modify it */
extern int task_cgroup_cpuset_set_task_affinity(slurmd_job_t *job)
{
	int fstatus = SLURM_ERROR;

#ifndef HAVE_HWLOC

	error("task/cgroup: plugin not compiled with hwloc support, "
	      "skipping affinity.");
	return fstatus;

#else
	uint32_t i;
	uint32_t nldoms;
	uint32_t nsockets;
	uint32_t ncores;
	uint32_t npus;
	uint32_t nobj;

	uint32_t pfirst,plast;
	uint32_t taskid = job->envtp->localid;
	uint32_t jntasks = job->node_tasks;
	uint32_t jnpus = jntasks * job->cpus_per_task;
	pid_t    pid = job->envtp->task_pid;

	cpu_bind_type_t bind_type;
	int verbose;

	hwloc_topology_t topology;
#if HWLOC_API_VERSION <= 0x00010000
	hwloc_cpuset_t cpuset,ct;
#else
	hwloc_bitmap_t cpuset,ct;
#endif
	hwloc_obj_t obj;
	struct hwloc_obj *pobj;
	hwloc_obj_type_t hwtype;
	hwloc_obj_type_t req_hwtype;
	int hwdepth;

	size_t tssize;
	cpu_set_t ts;

	bind_type = job->cpu_bind_type ;
	if (conf->task_plugin_param & CPU_BIND_VERBOSE ||
	    bind_type & CPU_BIND_VERBOSE)
		verbose = 1 ;

	if (bind_type & CPU_BIND_NONE) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting no affinity",
			     taskid);
		return 0;
	} else if (bind_type & CPU_BIND_TO_THREADS) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting "
			     "thread level binding",taskid);
		req_hwtype = HWLOC_OBJ_PU;
	} else if (bind_type & CPU_BIND_TO_CORES) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting "
			     "core level binding",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	} else if (bind_type & CPU_BIND_TO_SOCKETS) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting "
			     "socket level binding",taskid);
		req_hwtype = HWLOC_OBJ_SOCKET;
	} else if (bind_type & CPU_BIND_TO_LDOMS) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting "
			     "ldom level binding",taskid);
		req_hwtype = HWLOC_OBJ_NODE;
	} else {
		if (verbose)
			info("task/cgroup: task[%u] using core level binding"
			     " by default",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	}

	/* Allocate and initialize hwloc objects */
	hwloc_topology_init(&topology);
#if HWLOC_API_VERSION <= 0x00010000
	cpuset = hwloc_cpuset_alloc() ;
#else
	cpuset = hwloc_bitmap_alloc() ;
#endif

	/*
	 * Perform the topology detection. It will only get allowed PUs.
	 * Detect in the same time the granularity to use for binding.
	 * The granularity can be relaxed from threads to cores if enough
	 * cores are available as with hyperthread support, ntasks-per-core
	 * param can let us have access to more threads per core for each
	 * task
	 * Revert back to machine granularity if no finer-grained granularity
	 * matching the request is found. This will result in no affinity
	 * applied.
	 * The detected granularity will be used to find where to best place
	 * the task, then the cpu_bind option will be used to relax the
	 * affinity constraint and use more PUs. (i.e. use a core granularity
	 * to dispatch the tasks across the sockets and then provide access
	 * to each task to the cores of its socket.)
	 */
	hwloc_topology_load(topology);
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						   HWLOC_OBJ_PU);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_CORE);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_SOCKET);
	nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_NODE);
	hwtype = HWLOC_OBJ_MACHINE;
	nobj = 1;
	if (npus >= jnpus || bind_type & CPU_BIND_TO_THREADS) {
		hwtype = HWLOC_OBJ_PU;
		nobj = npus;
	}
	if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) {
		hwtype = HWLOC_OBJ_CORE;
		nobj = ncores;
	}
	if (nsockets >= jntasks &&
	     bind_type & CPU_BIND_TO_SOCKETS) {
		hwtype = HWLOC_OBJ_SOCKET;
		nobj = nsockets;
	}
	/*
	 * HWLOC returns all the NUMA nodes available regardless of the
	 * number of underlying sockets available (regardless of the allowed
	 * resources). So there is no guarantee that each ldom will be populated
	 * with usable sockets. So add a simple check that at least ensure that
	 * we have as many sockets as ldoms before moving to ldoms granularity
	 */
	if (nldoms >= jntasks &&
	     nsockets >= nldoms &&
	     bind_type & CPU_BIND_TO_LDOMS) {
		hwtype = HWLOC_OBJ_NODE;
		nobj = nldoms;
	}

	/*
	 * Perform a block binding on the detected object respecting the
	 * granularity.
	 * If not enough objects to do the job, revert to no affinity mode
	 */
	if (hwloc_compare_types(hwtype,HWLOC_OBJ_MACHINE) == 0) {

		info("task/cgroup: task[%u] disabling affinity because of %s "
		     "granularity",taskid,hwloc_obj_type_string(hwtype));

	} else if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0 &&
		    jnpus > nobj) {

		info("task/cgroup: task[%u] not enough %s objects, disabling "
		     "affinity",taskid,hwloc_obj_type_string(hwtype));

	} else {

		if (verbose) {
			info("task/cgroup: task[%u] using %s granularity",
			     taskid,hwloc_obj_type_string(hwtype));
		}
		if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0) {
			/* cores or threads granularity */
			pfirst = taskid *  job->cpus_per_task ;
			plast = pfirst + job->cpus_per_task - 1;
		} else {
			/* sockets or ldoms granularity */
			pfirst = taskid;
			plast = pfirst;
		}

		hwdepth = hwloc_get_type_depth(topology,hwtype);
		for (i = pfirst; i <= plast && i < nobj ; i++) {
			obj = hwloc_get_obj_by_depth(topology,hwdepth,(int)i);

			/* if requested binding overlap the granularity */
			/* use the ancestor cpuset instead of the object one */
			if (hwloc_compare_types(hwtype,req_hwtype) > 0) {

				/* Get the parent object of req_hwtype or the */
				/* one just above if not found (meaning of >0)*/
				/* (useful for ldoms binding with !NUMA nodes)*/
				pobj = obj->parent;
				while (pobj != NULL &&
					hwloc_compare_types(pobj->type,
							    req_hwtype) > 0)
					pobj = pobj->parent;

				if (pobj != NULL) {
					if (verbose)
						info("task/cgroup: task[%u] "
						     "higher level %s found",
						     taskid,
						     hwloc_obj_type_string(
							     pobj->type));
#if HWLOC_API_VERSION <= 0x00010000
					ct = hwloc_cpuset_dup(pobj->
							      allowed_cpuset);
					hwloc_cpuset_or(cpuset,cpuset,ct);
					hwloc_cpuset_free(ct);
#else
					ct = hwloc_bitmap_dup(pobj->
							      allowed_cpuset);
					hwloc_bitmap_or(cpuset,cpuset,ct);
					hwloc_bitmap_free(ct);
#endif
				} else {
					/* should not be executed */
					if (verbose)
						info("task/cgroup: task[%u] "
						     "no higher level found",
						     taskid);
#if HWLOC_API_VERSION <= 0x00010000
					ct = hwloc_cpuset_dup(obj->
							      allowed_cpuset);
					hwloc_cpuset_or(cpuset,cpuset,ct);
					hwloc_cpuset_free(ct);
#else
					ct = hwloc_bitmap_dup(obj->
							      allowed_cpuset);
					hwloc_bitmap_or(cpuset,cpuset,ct);
					hwloc_bitmap_free(ct);
#endif
				}

			} else {
#if HWLOC_API_VERSION <= 0x00010000
				ct = hwloc_cpuset_dup(obj->allowed_cpuset);
				hwloc_cpuset_or(cpuset,cpuset,ct);
				hwloc_cpuset_free(ct);
#else
				ct = hwloc_bitmap_dup(obj->allowed_cpuset);
				hwloc_bitmap_or(cpuset,cpuset,ct);
				hwloc_bitmap_free(ct);
#endif
			}
		}

		char *str;
#if HWLOC_API_VERSION <= 0x00010000
		hwloc_cpuset_asprintf(&str,cpuset);
#else
		hwloc_bitmap_asprintf(&str,cpuset);
#endif
		tssize = sizeof(cpu_set_t);
		if (hwloc_cpuset_to_glibc_sched_affinity(topology,cpuset,
							  &ts,tssize) == 0) {
			fstatus = SLURM_SUCCESS;
			if (sched_setaffinity(pid,tssize,&ts)) {
				error("task/cgroup: task[%u] unable to set "
				      "taskset '%s'",taskid,str);
				fstatus = SLURM_ERROR;
			} else if (verbose) {
				info("task/cgroup: task[%u] taskset '%s' is set"
				     ,taskid,str);
			}
		} else {
			error("task/cgroup: task[%u] unable to build "
			      "taskset '%s'",taskid,str);
			fstatus = SLURM_ERROR;
		}
		free(str);

	}

	/* Destroy hwloc objects */
#if HWLOC_API_VERSION <= 0x00010000
	hwloc_cpuset_free(cpuset);
#else
	hwloc_bitmap_free(cpuset);
#endif
	hwloc_topology_destroy(topology);

	return fstatus;
#endif

}
Beispiel #18
0
static int
hwloc_solaris_set_sth_cpubind(hwloc_topology_t topology, idtype_t idtype, id_t id, hwloc_const_bitmap_t hwloc_set, int flags)
{
  unsigned target_cpu;

  /* The resulting binding is always strict */

  if (hwloc_bitmap_isequal(hwloc_set, hwloc_topology_get_complete_cpuset(topology))) {
    if (processor_bind(idtype, id, PBIND_NONE, NULL) != 0)
      return -1;
#ifdef HAVE_LIBLGRP
    if (!(flags & HWLOC_CPUBIND_NOMEMBIND)) {
      int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
      if (depth >= 0) {
	int n = hwloc_get_nbobjs_by_depth(topology, depth);
	int i;

	for (i = 0; i < n; i++) {
	  hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, i);
	  lgrp_affinity_set(idtype, id, obj->os_index, LGRP_AFF_NONE);
	}
      }
    }
#endif /* HAVE_LIBLGRP */
    return 0;
  }

#ifdef HAVE_LIBLGRP
  if (!(flags & HWLOC_CPUBIND_NOMEMBIND)) {
    int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
    if (depth >= 0) {
      int n = hwloc_get_nbobjs_by_depth(topology, depth);
      int i;
      int ok;
      hwloc_bitmap_t target = hwloc_bitmap_alloc();

      for (i = 0; i < n; i++) {
	hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, i);
        if (hwloc_bitmap_isincluded(obj->cpuset, hwloc_set))
          hwloc_bitmap_or(target, target, obj->cpuset);
      }

      ok = hwloc_bitmap_isequal(target, hwloc_set);
      hwloc_bitmap_free(target);

      if (ok) {
        /* Ok, managed to achieve hwloc_set by just combining NUMA nodes */

        for (i = 0; i < n; i++) {
          hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, i);

          if (hwloc_bitmap_isincluded(obj->cpuset, hwloc_set)) {
            lgrp_affinity_set(idtype, id, obj->os_index, LGRP_AFF_STRONG);
          } else {
            if (flags & HWLOC_CPUBIND_STRICT)
              lgrp_affinity_set(idtype, id, obj->os_index, LGRP_AFF_NONE);
            else
              lgrp_affinity_set(idtype, id, obj->os_index, LGRP_AFF_WEAK);
          }
        }

        return 0;
      }
    }
  }
#endif /* HAVE_LIBLGRP */

  if (hwloc_bitmap_weight(hwloc_set) != 1) {
    errno = EXDEV;
    return -1;
  }

  target_cpu = hwloc_bitmap_first(hwloc_set);

  if (processor_bind(idtype, id,
		     (processorid_t) (target_cpu), NULL) != 0)
    return -1;

  return 0;
}
static int bind_downwards(orte_job_t *jdata,
                          orte_node_t *node,
                          hwloc_obj_type_t target,
                          unsigned cache_level)
{
    int j;
    orte_job_map_t *map;
    orte_proc_t *proc;
    hwloc_obj_t trg_obj, nxt_obj;
    hwloc_cpuset_t cpus;
    unsigned int ncpus;
    opal_hwloc_obj_data_t *data;
    int total_cpus;
    hwloc_cpuset_t totalcpuset;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind downward for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;
    totalcpuset = hwloc_bitmap_alloc();

    /* cycle thru the procs */
    for (j=0; j < node->procs->size; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
            continue;
        }
        /* ignore procs from other jobs */
        if (proc->name.jobid != jdata->jobid) {
            continue;
        }
        /* ignore procs that have already been bound - should
         * never happen, but safer
         */
        if (NULL != proc->cpu_bitmap) {
            continue;
        }
        /* we don't know if the target is a direct child of this locale,
         * or if it is some depth below it, so we have to conduct a bit
         * of a search. Let hwloc find the min usage one for us.
         */
        trg_obj = opal_hwloc_base_find_min_bound_target_under_obj(node->topology,
                                                                  proc->locale,
                                                                  target, cache_level);
        if (NULL == trg_obj) {
            /* there aren't any such targets under this object */
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
            hwloc_bitmap_free(totalcpuset);
            return ORTE_ERR_SILENT;
        }
        /* record the location */
        proc->bind_location = trg_obj;
        /* start with a clean slate */
        hwloc_bitmap_zero(totalcpuset);
        total_cpus = 0;
        nxt_obj = trg_obj;
        do {
            if (NULL == nxt_obj) {
                /* could not find enough cpus to meet request */
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                hwloc_bitmap_free(totalcpuset);
                return ORTE_ERR_SILENT;
            }
            trg_obj = nxt_obj;
            /* get the number of cpus under this location */
            ncpus = opal_hwloc_base_get_npus(node->topology, trg_obj);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s GOT %d CPUS",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ncpus);
            /* track the number bound */
            if (NULL == (data = (opal_hwloc_obj_data_t*)trg_obj->userdata)) {
                data = OBJ_NEW(opal_hwloc_obj_data_t);
                trg_obj->userdata = data;
            }
            data->num_bound++;
            /* error out if adding a proc would cause overload and that wasn't allowed,
             * and it wasn't a default binding policy (i.e., the user requested it)
             */
            if (ncpus < data->num_bound &&
                !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                                   opal_hwloc_base_print_binding(map->binding), node->name,
                                   data->num_bound, ncpus);
                    hwloc_bitmap_free(totalcpuset);
                    return ORTE_ERR_SILENT;
                } else {
                    /* if this is the default binding policy, then just don't
                     * bind this proc
                     */
                    data->num_bound--;  // maintain count
                    /* show the proc as not bound */
                    proc->bind_location = NULL;
                    hwloc_bitmap_zero(totalcpuset);
                    break;
                }
            }
            /* bind the proc here */
            cpus = opal_hwloc_base_get_available_cpus(node->topology, trg_obj);
            hwloc_bitmap_or(totalcpuset, totalcpuset, cpus);
            /* track total #cpus */
            total_cpus += ncpus;
            /* move to the next location, in case we need it */
            nxt_obj = trg_obj->next_cousin;
        } while (total_cpus < orte_rmaps_base.cpus_per_rank);
        hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, totalcpuset);
        if (4 < opal_output_get_verbosity(orte_rmaps_base_framework.framework_output)) {
            char tmp1[1024], tmp2[1024];
            if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2str(tmp1, sizeof(tmp1),
                                                               node->topology, totalcpuset)) {
                opal_output(orte_rmaps_base_framework.framework_output,
                            "%s PROC %s ON %s IS NOT BOUND",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&proc->name), node->name);
            } else {
                opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), node->topology, totalcpuset);
                opal_output(orte_rmaps_base_framework.framework_output,
                            "%s BOUND PROC %s[%s] TO %s: %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&proc->name), node->name,
                            tmp1, tmp2);
            }
        }
    }
    hwloc_bitmap_free(totalcpuset);
    
    return ORTE_SUCCESS;
}
/****** shepherd_binding/binding_set_linear() ***************************************
*  NAME
*     binding_set_linear() -- Bind current process linear to chunk of cores.
*
*  SYNOPSIS
*     bool binding_set_linear(int first_socket, int first_core, int 
*     number_of_cores, int offset)
*
*  FUNCTION
*     Binds current process (shepherd) to a set of cores. All processes 
*     started by the current process inherit the core binding.
*     
*     The core binding is done in a linear manner, that means that 
*     the process is bound to 'number_of_cores' cores using one core
*     after another starting at socket 'first_socket' (usually 0) and 
*     core = 'first_core' (usually 0) + 'offset'. If the core number 
*     is higher than the number of cores which are provided by socket 
*     'first_socket' then the next socket is taken (the core number 
*      defines how many cores are skiped).
*
*  INPUTS
*     int first_socket    - The first socket (starting at 0) to bind to. 
*     int first_core      - The first core to bind. 
*     int number_of_cores - The number_of_cores of cores to bind to.
*     int offset          - The user specified core number offset. 
*     binding_type_t type - The type of binding ONLY FOR EXECD ( set | env | pe )
*                           
*  RESULT
*     bool - true if binding for current process was done, false if not
*
*  NOTES
*     MT-NOTE: binding_set_linear() is not MT safe 
*
*******************************************************************************/
static bool binding_set_linear(int first_socket, int first_core,
               int number_of_cores, int offset, const binding_type_t type)
{

#if HAVE_HWLOC
   /* sets bitmask in a linear manner        */ 
   /* first core is on exclusive host 0      */ 
   /* first core could be set from scheduler */ 
   /* offset is the first core to start with (makes sense only with
      exclusive host) */
   dstring error = DSTRING_INIT;

   if (has_core_binding() == true) {
      /* bitmask for processors to turn on and off */
      hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
         
      if (has_topology_information()) {
         /* number of cores set in processor binding mask */
         int cores_set;
         /* next socket to use */
         int next_socket = first_socket;
         /* the number of cores of the next socket */
         int socket_number_of_cores;
         /* next core to use */
         int next_core = first_core + offset;
         /* maximal number of sockets on this system */
         int max_number_of_sockets = get_number_of_sockets();
         hwloc_obj_t this_core;

         /* strategy: go to the first_socket and the first_core + offset and 
            fill up socket and go to the next one. */ 
               
         /* TODO maybe better to search for using a core exclusively? */
            
         while (get_number_of_cores(next_socket) <= next_core) {
            /* TODO which kind of warning when first socket does not
               offer this? */
            /* move on to next socket - could be that we have to deal
               only with cores instead of <socket><core> tuples */
            next_core -= get_number_of_cores(next_socket);
            next_socket++;
            if (next_socket >= max_number_of_sockets) {
               /* we are out of sockets - we do nothing */
               hwloc_bitmap_free(cpuset);
               return false;
            }
         }  
         this_core =
            hwloc_get_obj_below_by_type(sge_hwloc_topology,
                                        HWLOC_OBJ_SOCKET, next_socket,
                                        HWLOC_OBJ_CORE, next_core);
         hwloc_bitmap_or(cpuset, cpuset, this_core->cpuset);

         /* collect the other processor ids with the strategy */
         for (cores_set = 1; cores_set < number_of_cores; cores_set++) {
            next_core++;
            /* jump to next socket when it is needed */
            /* maybe the next socket could offer 0 cores (I can't see when, 
               but just to be sure) */
            while ((socket_number_of_cores = get_number_of_cores(next_socket))
                        <= next_core) {
               next_socket++;
               next_core = next_core - socket_number_of_cores;
               if (next_socket >= max_number_of_sockets) {
                  /* we are out of sockets - we do nothing */
                  hwloc_bitmap_free(cpuset);
                  return false;
               }
            }
            this_core =
               hwloc_get_obj_below_by_type(sge_hwloc_topology,
                                           HWLOC_OBJ_SOCKET, next_socket,
                                           HWLOC_OBJ_CORE, next_core);
            hwloc_bitmap_or(cpuset, cpuset, this_core->cpuset);
         }

         /* check what to do with the processor ids (set, env or pe) */
         if (type == BINDING_TYPE_PE) {
               
            /* is done outside */

         } else if (type == BINDING_TYPE_ENV) {
               
            /* set the environment variable                    */
            /* this does not show up in "environment" file !!! */
            if (create_binding_env(cpuset) == true) {
               shepherd_trace("binding_set_linear: SGE_BINDING env var created");
            } else {
               shepherd_trace("binding_set_linear: problems while creating SGE_BINDING env");
            }
             
         } else {

             /* bind SET process to mask */ 
            if (bind_process_to_mask(cpuset) == false) {
               /* there was an error while binding */ 
               hwloc_bitmap_free(cpuset);
               return false;
            }
         }

         hwloc_bitmap_free(cpuset);

      } else {
            
         /* TODO DG strategy without topology information but with 
            working library? */
         shepherd_trace("binding_set_linear: no information about topology");
         return false;
      }
         

   } else {

      shepherd_trace("binding_set_linear: binding not supported: %s",
                     sge_dstring_get_string(&error));

      sge_dstring_free(&error);
   }
#endif  /* HAVE_HWLOC */
   return true;
}