Code Example #1
File: rmaps_ppr.c  Project: bureddy/ompi-release
/* recursively climb the topology, pruning procs beyond that allowed
 * by the given ppr
 */
static void prune(orte_jobid_t jobid,
                  orte_app_idx_t app_idx,
                  orte_node_t *node,
                  opal_hwloc_level_t *level,
                  orte_vpid_t *nmapped)
{
    hwloc_obj_t obj, top;
    unsigned int i, nobjs;
    hwloc_obj_type_t lvl;
    unsigned cache_level = 0, k;
    int nprocs;
    hwloc_cpuset_t avail, cpus, childcpus;
    int n, limit, nmax, nunder, idx, idxmax = 0;
    orte_proc_t *proc, *pptr, *procmax;
    opal_hwloc_level_t ll;
    char dang[64];
    hwloc_obj_t locale;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: pruning level %d",
                        *level);

    /* convenience */
    ll = *level;

    /* convenience */
    lvl = opal_hwloc_levels[ll];
    limit = ppr[ll];

    if (0 == limit) {
        /* no limit at this level, so move up if necessary */
        if (0 == ll) {
            /* done */
            return;
        }
        --(*level);
        prune(jobid, app_idx, node, level, nmapped);
        return;
    }

    /* handle the darn cache thing again */
    if (OPAL_HWLOC_L3CACHE_LEVEL == ll) {
        cache_level = 3;
    } else if (OPAL_HWLOC_L2CACHE_LEVEL == ll) {
        cache_level = 2;
    } else if (OPAL_HWLOC_L1CACHE_LEVEL == ll) {
        cache_level = 1;
    }

    /* get the number of resources at this level on this node */
    nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                               lvl, cache_level,
                                               OPAL_HWLOC_AVAILABLE);

    /* for each resource, compute the number of procs sitting
     * underneath it and check against the limit
     */
    for (i=0; i < nobjs; i++) {
        obj = opal_hwloc_base_get_obj_by_type(node->topology,
                                              lvl, cache_level,
                                              i, OPAL_HWLOC_AVAILABLE);
        /* get the available cpuset */
        avail = opal_hwloc_base_get_available_cpus(node->topology, obj);

        /* look at the intersection of this object's cpuset and that
         * of each proc in the job/app - if they intersect, then count this proc
         * against the limit
         */
        nprocs = 0;
        for (n=0; n < node->procs->size; n++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                continue;
            }
            if (proc->name.jobid != jobid ||
                proc->app_idx != app_idx) {
                continue;
            }
            locale = NULL;
            if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                return;
            }
            cpus = opal_hwloc_base_get_available_cpus(node->topology, locale);
            if (hwloc_bitmap_intersects(avail, cpus)) {
                nprocs++;
            }
        }
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: found %d procs limit %d",
                            nprocs, limit);

        /* check against the limit */
        while (limit < nprocs) {
            /* need to remove procs - do this in a semi-intelligent
             * manner to provide a little load balancing by cycling
             * across the objects beneath this one, removing procs
             * in a round-robin fashion until the limit is satisfied
             *
             * NOTE: I'm sure someone more knowledgeable with hwloc
             * will come up with a more efficient way to do this, so
             * consider this is a starting point
             */

            /* find the first level that has more than
             * one child beneath it - if all levels
             * have only one child, then return this
             * object
             */
            top = find_split(node->topology, obj);
            hwloc_obj_type_snprintf(dang, 64, top, 1);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:ppr: SPLIT AT LEVEL %s", dang);

            /* cycle across the children of this object */
            nmax = 0;
            procmax = NULL;
            idx = 0;
            /* find the child with the most procs underneath it */
            for (k=0; k < top->arity && limit < nprocs; k++) {
                /* get this object's available cpuset */
                childcpus = opal_hwloc_base_get_available_cpus(node->topology, top->children[k]);
                nunder = 0;
                pptr = NULL;
                for (n=0; n < node->procs->size; n++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                        continue;
                    }
                    if (proc->name.jobid != jobid ||
                        proc->app_idx != app_idx) {
                        continue;
                    }
                    locale = NULL;
                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                        return;
                    }
                    cpus = opal_hwloc_base_get_available_cpus(node->topology, locale);
                    if (hwloc_bitmap_intersects(childcpus, cpus)) {
                        nunder++;
                        if (NULL == pptr) {
                            /* save the location of the first proc under this object */
                            pptr = proc;
                            idx = n;
                        }
                    }
                }
                if (nmax < nunder) {
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:ppr: PROCS UNDER CHILD %d %d MAX %d",
                                        k, nunder, nmax);
                    nmax = nunder;
                    procmax = pptr;
                    idxmax = idx;
                }
            }
            if (NULL == procmax) {
                /* can't find anything to remove - error out */
                goto error;
            }
            /* remove it */
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:ppr: removing proc at posn %d",
                                idxmax);
            opal_pointer_array_set_item(node->procs, idxmax, NULL);
            node->num_procs--;
            node->slots_inuse--;
            if (node->slots_inuse < 0) {
                node->slots_inuse = 0;
            }
            nprocs--;
            *nmapped -= 1;
            OBJ_RELEASE(procmax);
        }
    }
    /* finished with this level - move up if necessary */
    if (0 == ll) {
        return;
    }
    --(*level);
    prune(jobid, app_idx, node, level, nmapped);
    return;

 error:
    opal_output(0, "INFINITE LOOP");
}
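
Below is a minimal standalone sketch (hypothetical, not project code) of the counting idiom prune() relies on: for each object at a given level, count how many cpusets intersect it before enforcing the ppr limit. It assumes the hwloc 1.x API used in these snippets and substitutes plain hwloc calls for the ORTE/OPAL wrappers; each stand-in "proc" is simply a PU cpuset so the program is self-contained (link with -lhwloc).

/* Standalone sketch: for each core, count how many stand-in "proc" cpusets
 * intersect it - the same hwloc_bitmap_intersects() test prune() applies
 * before removing procs beyond the limit.  Assumes hwloc 1.x. */
#include <hwloc.h>
#include <stdio.h>

int main(void)
{
    hwloc_topology_t topo;
    int i, n, ncores, npus;

    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);

    ncores = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE);
    npus   = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PU);

    for (i = 0; i < ncores; i++) {
        hwloc_obj_t core = hwloc_get_obj_by_type(topo, HWLOC_OBJ_CORE, i);
        int nprocs = 0;
        for (n = 0; n < npus; n++) {
            hwloc_obj_t pu = hwloc_get_obj_by_type(topo, HWLOC_OBJ_PU, n);
            /* count this stand-in proc if its cpuset overlaps the object */
            if (hwloc_bitmap_intersects(core->cpuset, pu->cpuset)) {
                nprocs++;
            }
        }
        printf("core %d: %d procs underneath\n", i, nprocs);
    }

    hwloc_topology_destroy(topo);
    return 0;
}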
Code Example #2
hwloc::hwloc()
{
  s_core_topology   = std::pair<unsigned,unsigned>(0,0);
  s_core_capacity   = 0 ;
  s_hwloc_topology  = 0 ;
  s_hwloc_location  = 0 ;
  s_process_binding = 0 ;

  for ( unsigned i = 0 ; i < MAX_CORE ; ++i ) s_core[i] = 0 ;

  hwloc_topology_init( & s_hwloc_topology );
  hwloc_topology_load( s_hwloc_topology );

  s_hwloc_location  = hwloc_bitmap_alloc();
  s_process_binding = hwloc_bitmap_alloc();

  hwloc_get_cpubind( s_hwloc_topology , s_process_binding ,  HWLOC_CPUBIND_PROCESS );

  // Choose a hwloc object type for the NUMA level, which may not exist.

  hwloc_obj_type_t root_type = HWLOC_OBJ_TYPE_MAX ;

  {
    // Object types to search, in order.
    static const hwloc_obj_type_t candidate_root_type[] =
      { HWLOC_OBJ_NODE     /* NUMA region     */
      , HWLOC_OBJ_SOCKET   /* hardware socket */
      , HWLOC_OBJ_MACHINE  /* local machine   */
      };

    enum { CANDIDATE_ROOT_TYPE_COUNT =
             sizeof(candidate_root_type) / sizeof(hwloc_obj_type_t) };

    for ( int k = 0 ; k < CANDIDATE_ROOT_TYPE_COUNT && HWLOC_OBJ_TYPE_MAX == root_type ; ++k ) {
      if ( 0 < hwloc_get_nbobjs_by_type( s_hwloc_topology , candidate_root_type[k] ) ) {
        root_type = candidate_root_type[k] ;
      }
    }
  }

  // Determine which of these 'root' types are available to this process.
  // The process may have been bound (e.g., by MPI) to a subset of these root types.
  // Determine the current location of the master (calling) process.

  hwloc_bitmap_t proc_cpuset_location = hwloc_bitmap_alloc();

  hwloc_get_last_cpu_location( s_hwloc_topology , proc_cpuset_location , HWLOC_CPUBIND_THREAD );

  const unsigned max_root = hwloc_get_nbobjs_by_type( s_hwloc_topology , root_type );

  unsigned root_base     = max_root ;
  unsigned root_count    = 0 ;
  unsigned core_per_root = 0 ;
  unsigned pu_per_core   = 0 ;
  bool     symmetric     = true ;

  for ( unsigned i = 0 ; i < max_root ; ++i ) {

    const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , i );

    if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) {

      ++root_count ;

      // Remember which root (NUMA) object the master thread is running on.
      // This will be logical NUMA rank #0 for this process.

      if ( hwloc_bitmap_intersects( proc_cpuset_location, root->allowed_cpuset ) ) {
        root_base = i ;
      }

      // Count available cores:

      const unsigned max_core =
        hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
                                                root->allowed_cpuset ,
                                                HWLOC_OBJ_CORE );

      unsigned core_count = 0 ;

      for ( unsigned j = 0 ; j < max_core ; ++j ) {

        const hwloc_obj_t core =
          hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology ,
                                               root->allowed_cpuset ,
                                               HWLOC_OBJ_CORE , j );

        // If process' cpuset intersects core's cpuset then process can access this core.
        // Must use intersection instead of inclusion because the Intel-Phi
        // MPI may bind the process to only one of the core's hyperthreads.
        //
        // Assumption: if the process can access any hyperthread of the core
        // then it has ownership of the entire core.
        // This assumes that it would be performance-detrimental
        // to spawn more than one MPI process per core and use nested threading.

        if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {

          ++core_count ;

          const unsigned pu_count =
            hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
                                                    core->allowed_cpuset ,
                                                    HWLOC_OBJ_PU );

          if ( pu_per_core == 0 ) pu_per_core = pu_count ;

          // Enforce symmetry by taking the minimum:

          pu_per_core = std::min( pu_per_core , pu_count );

          if ( pu_count != pu_per_core ) symmetric = false ;
        }
      }

      if ( 0 == core_per_root ) core_per_root = core_count ;

      // Enforce symmetry by taking the minimum:

      core_per_root = std::min( core_per_root , core_count );

      if ( core_count != core_per_root ) symmetric = false ;
    }
  }

  s_core_topology.first  = root_count ;
  s_core_topology.second = core_per_root ;
  s_core_capacity        = pu_per_core ;

  // Fill the 's_core' array for fast mapping from a core coordinate to the
  // hwloc cpuset object required for thread location querying and binding.

  for ( unsigned i = 0 ; i < max_root ; ++i ) {

    const unsigned root_rank = ( i + root_base ) % max_root ;

    const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , root_rank );

    if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) {

      const unsigned max_core =
        hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
                                                root->allowed_cpuset ,
                                                HWLOC_OBJ_CORE );

      unsigned core_count = 0 ;

      for ( unsigned j = 0 ; j < max_core && core_count < core_per_root ; ++j ) {

        const hwloc_obj_t core =
          hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology ,
                                               root->allowed_cpuset ,
                                               HWLOC_OBJ_CORE , j );

        if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {

          s_core[ core_count + core_per_root * i ] = core->allowed_cpuset ;

          ++core_count ;
        }
      }
    }
  }

  hwloc_bitmap_free( proc_cpuset_location );

  if ( ! symmetric ) {
    std::cout << "KokkosArray::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
              << std::endl ;
  }
}
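
The constructor above reduces to a simple pattern: load the topology, fetch the process binding, and count the cores whose cpusets intersect that binding. A hedged sketch of just that step, assuming the hwloc 1.x API (obj->cpuset, HWLOC_OBJ_CORE) and none of the KokkosArray state:

/* Sketch: count the cores this process can actually use by intersecting its
 * binding with each core's cpuset - intersection, not inclusion, so a core
 * reachable through only one hyperthread still counts. */
#include <hwloc.h>
#include <stdio.h>

int main(void)
{
    hwloc_topology_t topo;
    hwloc_cpuset_t binding = hwloc_bitmap_alloc();
    int i, ncores, usable = 0;

    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);
    hwloc_get_cpubind(topo, binding, HWLOC_CPUBIND_PROCESS);

    ncores = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE);
    for (i = 0; i < ncores; i++) {
        hwloc_obj_t core = hwloc_get_obj_by_type(topo, HWLOC_OBJ_CORE, i);
        if (hwloc_bitmap_intersects(binding, core->cpuset)) {
            ++usable;
        }
    }
    printf("%d of %d cores usable by this process\n", usable, ncores);

    hwloc_bitmap_free(binding);
    hwloc_topology_destroy(topo);
    return 0;
}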
Code Example #3
static int rank_span(orte_job_t *jdata,
                     orte_app_context_t *app,
                     opal_list_t *nodes,
                     hwloc_obj_type_t target,
                     unsigned cache_level)
{
    hwloc_obj_t obj;
    int num_objs, i, j, rc;
    orte_vpid_t num_ranked=0;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_vpid_t vpid;
    int cnt;
    opal_list_item_t *item;
    hwloc_obj_t locale;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:rank_span: for job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* if the ranking is spanned, then we perform the
     * ranking as if it was one big node - i.e., we
     * rank one proc on each object, step to the next object
     * moving across all the nodes, then wrap around to the
     * first object on the first node.
     *
     *        Node 0                Node 1
     *    Obj 0     Obj 1       Obj 0     Obj 1
     *     0 4       1 5         2 6       3 7
     *     8 12      9 13       10 14     11 15
     */

    /* In the interest of getting this committed in finite time,
     * just loop across the nodes and objects until all procs
     * are mapped
     */

    vpid = jdata->num_procs;
    cnt = 0;
    while (cnt < app->num_procs) {
        for (item = opal_list_get_first(nodes);
             item != opal_list_get_end(nodes);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*)item;
            /* get the number of objects - only consider those we can actually use */
            num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
                                                          cache_level, OPAL_HWLOC_AVAILABLE);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:rank_span: found %d objects on node %s with %d procs",
                                num_objs, node->name, (int)node->num_procs);
            if (0 == num_objs) {
                return ORTE_ERR_NOT_SUPPORTED;
            }

            /* for each object */
            for (i=0; i < num_objs && cnt < app->num_procs; i++) {
                obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
                                                      cache_level, i, OPAL_HWLOC_AVAILABLE);

                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "mca:rmaps:rank_span: working object %d", i);

                /* cycle thru the procs on this node */
                for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    /* ignore procs from other jobs */
                    if (proc->name.jobid != jdata->jobid) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_span skipping proc %s - from another job, num_ranked %d",
                                            ORTE_NAME_PRINT(&proc->name), num_ranked);
                        continue;
                    }
                    /* ignore procs that are already assigned */
                    if (ORTE_VPID_INVALID != proc->name.vpid) {
                        continue;
                    }
                    /* ignore procs from other apps */
                    if (proc->app_idx != app->idx) {
                        continue;
                    }
                    /* protect against bozo case */
                    locale = NULL;
                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                        ORTE_ERROR_LOG(ORTE_ERROR);
                        return ORTE_ERROR;
                    }
                    /* ignore procs not on this object */
                    if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_span: proc at position %d is not on object %d",
                                            j, i);
                        continue;
                    }
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:rank_span: assigning vpid %s", ORTE_VPID_PRINT(vpid));
                    proc->name.vpid = vpid++;
                    if (0 == cnt) {
                        app->first_rank = proc->name.vpid;
                    }
                    cnt++;

                    /* insert the proc into the jdata array - no harm if already there */
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                    /* track where the highest vpid landed - this is our
                     * new bookmark
                     */
                    jdata->bookmark = node;
                    /* move to next object */
                    break;
                }
            }
        }
    }

    return ORTE_SUCCESS;
}
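
The ASCII table in rank_span()'s comment can be reproduced with plain arithmetic. The following is only a toy illustration of the span ordering (one rank per object, stepping across all nodes, then wrapping), with made-up node/object counts and no ORTE calls:

/* Toy illustration of the "span" ordering: ranks step object-by-object
 * across all nodes, then wrap around for the next pass. */
#include <stdio.h>

int main(void)
{
    const int nnodes = 2, objs_per_node = 2, procs_per_obj = 4;
    int rank = 0, pass, node, obj;

    for (pass = 0; pass < procs_per_obj; pass++) {      /* wrap-around passes */
        for (node = 0; node < nnodes; node++) {
            for (obj = 0; obj < objs_per_node; obj++) {
                printf("rank %2d -> node %d, object %d\n", rank++, node, obj);
            }
        }
    }
    return 0;
}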
Code Example #4
static int rank_by(orte_job_t *jdata,
                   orte_app_context_t *app,
                   opal_list_t *nodes,
                   hwloc_obj_type_t target,
                   unsigned cache_level)
{
    hwloc_obj_t obj;
    int num_objs, i, j, rc;
    orte_vpid_t num_ranked=0;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_vpid_t vpid;
    int cnt;
    opal_pointer_array_t objs;
    bool all_done;
    opal_list_item_t *item;
    hwloc_obj_t locale;

    if (ORTE_RANKING_SPAN & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
        return rank_span(jdata, app, nodes, target, cache_level);
    } else if (ORTE_RANKING_FILL & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
        return rank_fill(jdata, app, nodes, target, cache_level);
    }

    /* if ranking is not spanned or filled, then we
     * default to assign ranks sequentially across
     * target objects within a node until that node
     * is fully ranked, and then move on to the next
     * node
     *
     *        Node 0                Node 1
     *    Obj 0     Obj 1       Obj 0     Obj 1
     *     0 2       1 3         8 10      9 11
     *     4 6       5 7        12 14     13 15
     */

    /* setup the pointer array */
    OBJ_CONSTRUCT(&objs, opal_pointer_array_t);
    opal_pointer_array_init(&objs, 2, INT_MAX, 2);

    vpid = jdata->num_procs;
    cnt = 0;
    for (item = opal_list_get_first(nodes);
         item != opal_list_get_end(nodes);
         item = opal_list_get_next(item)) {
        node = (orte_node_t*)item;
        /* get the number of objects - only consider those we can actually use */
        num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
                                                      cache_level, OPAL_HWLOC_AVAILABLE);
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rank_by: found %d objects on node %s with %d procs",
                            num_objs, node->name, (int)node->num_procs);
        if (0 == num_objs) {
            return ORTE_ERR_NOT_SUPPORTED;
        }
        /* collect all the objects */
        for (i=0; i < num_objs; i++) {
            obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
                                                  cache_level, i, OPAL_HWLOC_AVAILABLE);
            opal_pointer_array_set_item(&objs, i, obj);
        }

        /* cycle across the objects, assigning a proc to each one,
         * until all procs have been assigned - unfortunately, since
         * more than this job may be mapped onto a node, the number
         * of procs on the node can't be used to tell us when we
         * are done. Instead, we have to just keep going until all
         * procs are ranked - which means we have to make one extra
         * pass thru the loop
         *
         * Perhaps someday someone will come up with a more efficient
         * algorithm, but this works for now.
         */
        all_done = false;
        while (!all_done && cnt < app->num_procs) {
            all_done = true;
            /* cycle across the objects */
            for (i=0; i < num_objs && cnt < app->num_procs; i++) {
                obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i);

                /* find the next proc on this object */
                for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    /* ignore procs from other jobs */
                    if (proc->name.jobid != jdata->jobid) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d",
                                            ORTE_NAME_PRINT(&proc->name), num_ranked);
                        continue;
                    }
                    /* ignore procs that are already ranked */
                    if (ORTE_VPID_INVALID != proc->name.vpid) {
                        continue;
                    }
                    /* ignore procs from other apps */
                    if (proc->app_idx != app->idx) {
                        continue;
                    }
                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                        continue;
                    }
                    /* ignore procs on other objects */
                    if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_by: proc at position %d is not on object %d",
                                            j, i);
                        continue;
                    }
                    proc->name.vpid = vpid++;
                    if (0 == cnt) {
                        app->first_rank = proc->name.vpid;
                    }
                    cnt++;
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:rank_by: assigned rank %s", ORTE_VPID_PRINT(proc->name.vpid));
                    /* insert the proc into the jdata array - no harm if already there */
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                    /* flag that one was mapped */
                    all_done = false;
                    /* track where the highest vpid landed - this is our
                     * new bookmark
                     */
                    jdata->bookmark = node;
                    /* move to next object */
                    break;
                }
            }
        }
    }

    /* cleanup */
    OBJ_DESTRUCT(&objs);

    return ORTE_SUCCESS;
}
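
For comparison, a toy illustration of the default (non-span, non-fill) ordering described in rank_by()'s comment: each node is fully ranked, cycling across its objects, before moving to the next node. Again made-up counts, no ORTE calls:

/* Toy illustration of the default ordering: exhaust a node (round-robin
 * across its objects) before moving on to the next node. */
#include <stdio.h>

int main(void)
{
    const int nnodes = 2, objs_per_node = 2, procs_per_node = 8;
    int rank = 0, node, p;

    for (node = 0; node < nnodes; node++) {
        for (p = 0; p < procs_per_node; p++) {
            printf("rank %2d -> node %d, object %d\n",
                   rank++, node, p % objs_per_node);
        }
    }
    return 0;
}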
Code Example #5
File: btl_sm.c  Project: rppendya/ompi
static int
sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
                       int32_t my_smp_rank,
                       int n)
{
    size_t length, length_payload;
    sm_fifo_t *my_fifos;
    int my_mem_node, num_mem_nodes, i, rc;
    mca_mpool_base_resources_t *res = NULL;
    mca_btl_sm_component_t* m = &mca_btl_sm_component;

    /* Assume we don't have hwloc support and fill in dummy info */
    mca_btl_sm_component.mem_node = my_mem_node = 0;
    mca_btl_sm_component.num_mem_nodes = num_mem_nodes = 1;

    /* If we have hwloc support, then get accurate information */
    if (NULL != opal_hwloc_topology) {
        i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
                                               HWLOC_OBJ_NODE, 0,
                                               OPAL_HWLOC_AVAILABLE);

        /* If we find >0 NUMA nodes, then investigate further */
        if (i > 0) {
            int numa=0, w;
            unsigned n_bound=0;
            hwloc_cpuset_t avail;
            hwloc_obj_t obj;

            /* JMS This tells me how many numa nodes are *available*,
               but it's not how many are being used *by this job*.
               Note that this is the value we've previously used (from
               the previous carto-based implementation), but it really
               should be improved to be how many NUMA nodes are being
               used *in this job*. */
            mca_btl_sm_component.num_mem_nodes = num_mem_nodes = i;

            /* if we are not bound, then there is nothing further to do */
            if (NULL != opal_process_info.cpuset) {
                /* count the number of NUMA nodes to which we are bound */
                for (w=0; w < i; w++) {
                    if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
                                       HWLOC_OBJ_NODE, 0, w,
                                       OPAL_HWLOC_AVAILABLE))) {
                        continue;
                    }
                    /* get that NUMA node's available cpus */
                    avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                    /* see if we intersect */
                    if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) {
                        n_bound++;
                        numa = w;
                    }
                }
                /* if we are located on more than one NUMA, or we didn't find
                 * a NUMA we are on, then not much we can do
                 */
                if (1 == n_bound) {
                    mca_btl_sm_component.mem_node = my_mem_node = numa;
                } else {
                    mca_btl_sm_component.mem_node = my_mem_node = -1;
                }
            }
        }
    }

    if (NULL == (res = calloc(1, sizeof(*res)))) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* lookup shared memory pool */
    mca_btl_sm_component.sm_mpools =
        (mca_mpool_base_module_t **)calloc(num_mem_nodes,
                                           sizeof(mca_mpool_base_module_t *));

    /* Disable memory binding, because each MPI process will claim pages in the
     * mpool for their local NUMA node */
    res->mem_node = -1;

    if (OPAL_SUCCESS != (rc = setup_mpool_base_resources(m, res))) {
        free(res);
        return rc;
    }
    /* now that res is fully populated, create the thing */
    mca_btl_sm_component.sm_mpools[0] =
        mca_mpool_base_module_create(mca_btl_sm_component.sm_mpool_name,
                                     sm_btl, res);
    /* Sanity check to ensure that we found it */
    if (NULL == mca_btl_sm_component.sm_mpools[0]) {
        free(res);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    mca_btl_sm_component.sm_mpool = mca_btl_sm_component.sm_mpools[0];

    mca_btl_sm_component.sm_mpool_base =
        mca_btl_sm_component.sm_mpools[0]->mpool_base(mca_btl_sm_component.sm_mpools[0]);

    /* create a list of peers */
    mca_btl_sm_component.sm_peers = (struct mca_btl_base_endpoint_t**)
                                    calloc(n, sizeof(struct mca_btl_base_endpoint_t*));
    if (NULL == mca_btl_sm_component.sm_peers) {
        free(res);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* remember that node rank zero is already attached */
    if (0 != my_smp_rank) {
        if (OPAL_SUCCESS != (rc = sm_segment_attach(m))) {
            free(res);
            return rc;
        }
    }

    /* it is now safe to free the mpool resources */
    free(res);

    /* check to make sure number of local procs is within the
     * specified limits */
    if(mca_btl_sm_component.sm_max_procs > 0 &&
            mca_btl_sm_component.num_smp_procs + n >
            mca_btl_sm_component.sm_max_procs) {
        return OPAL_ERROR;
    }

    mca_btl_sm_component.shm_fifo = (volatile sm_fifo_t **)mca_btl_sm_component.sm_seg->module_data_addr;
    mca_btl_sm_component.shm_bases = (char**)(mca_btl_sm_component.shm_fifo + n);
    mca_btl_sm_component.shm_mem_nodes = (uint16_t*)(mca_btl_sm_component.shm_bases + n);

    /* set the base of the shared memory segment */
    mca_btl_sm_component.shm_bases[mca_btl_sm_component.my_smp_rank] =
        (char*)mca_btl_sm_component.sm_mpool_base;
    mca_btl_sm_component.shm_mem_nodes[mca_btl_sm_component.my_smp_rank] =
        (uint16_t)my_mem_node;

    /* initialize the array of fifo's "owned" by this process */
    if(NULL == (my_fifos = (sm_fifo_t*)mpool_calloc(FIFO_MAP_NUM(n), sizeof(sm_fifo_t))))
        return OPAL_ERR_OUT_OF_RESOURCE;

    mca_btl_sm_component.shm_fifo[mca_btl_sm_component.my_smp_rank] = my_fifos;

    /* cache the pointer to the 2d fifo array.  These addresses
     * are valid in the current process space */
    mca_btl_sm_component.fifo = (sm_fifo_t**)malloc(sizeof(sm_fifo_t*) * n);

    if(NULL == mca_btl_sm_component.fifo)
        return OPAL_ERR_OUT_OF_RESOURCE;

    mca_btl_sm_component.fifo[mca_btl_sm_component.my_smp_rank] = my_fifos;

    mca_btl_sm_component.mem_nodes = (uint16_t *) malloc(sizeof(uint16_t) * n);
    if(NULL == mca_btl_sm_component.mem_nodes)
        return OPAL_ERR_OUT_OF_RESOURCE;

    /* initialize fragment descriptor free lists */

    /* allocation will be for the fragment descriptor and payload buffer */
    length = sizeof(mca_btl_sm_frag1_t);
    length_payload =
        sizeof(mca_btl_sm_hdr_t) + mca_btl_sm_component.eager_limit;
    i = opal_free_list_init (&mca_btl_sm_component.sm_frags_eager, length,
                             opal_cache_line_size, OBJ_CLASS(mca_btl_sm_frag1_t),
                             length_payload, opal_cache_line_size,
                             mca_btl_sm_component.sm_free_list_num,
                             mca_btl_sm_component.sm_free_list_max,
                             mca_btl_sm_component.sm_free_list_inc,
                             mca_btl_sm_component.sm_mpool, 0, NULL, NULL, NULL);
    if ( OPAL_SUCCESS != i )
        return i;

    length = sizeof(mca_btl_sm_frag2_t);
    length_payload =
        sizeof(mca_btl_sm_hdr_t) + mca_btl_sm_component.max_frag_size;
    i = opal_free_list_init (&mca_btl_sm_component.sm_frags_max, length,
                             opal_cache_line_size, OBJ_CLASS(mca_btl_sm_frag2_t),
                             length_payload, opal_cache_line_size,
                             mca_btl_sm_component.sm_free_list_num,
                             mca_btl_sm_component.sm_free_list_max,
                             mca_btl_sm_component.sm_free_list_inc,
                             mca_btl_sm_component.sm_mpool, 0, NULL, NULL, NULL);
    if ( OPAL_SUCCESS != i )
        return i;

    i = opal_free_list_init (&mca_btl_sm_component.sm_frags_user,
                             sizeof(mca_btl_sm_user_t),
                             opal_cache_line_size, OBJ_CLASS(mca_btl_sm_user_t),
                             sizeof(mca_btl_sm_hdr_t), opal_cache_line_size,
                             mca_btl_sm_component.sm_free_list_num,
                             mca_btl_sm_component.sm_free_list_max,
                             mca_btl_sm_component.sm_free_list_inc,
                             mca_btl_sm_component.sm_mpool, 0, NULL, NULL, NULL);
    if ( OPAL_SUCCESS != i )
        return i;

    mca_btl_sm_component.num_outstanding_frags = 0;

    mca_btl_sm_component.num_pending_sends = 0;
    i = opal_free_list_init(&mca_btl_sm_component.pending_send_fl,
                            sizeof(btl_sm_pending_send_item_t), 8,
                            OBJ_CLASS(opal_free_list_item_t),
                            0, 0, 16, -1, 32, NULL, 0, NULL, NULL,
                            NULL);
    if ( OPAL_SUCCESS != i )
        return i;

    /* set flag indicating btl has been inited */
    sm_btl->btl_inited = true;

    return OPAL_SUCCESS;
}
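
The NUMA-detection step at the top of this function boils down to: intersect the process binding with each NUMA node's cpuset, and keep the node index only if exactly one intersects. A standalone sketch of that logic, assuming the hwloc 1.x API (HWLOC_OBJ_NODE, hwloc_get_cpubind) and none of the OMPI wrappers:

/* Sketch: count the NUMA nodes whose cpuset intersects this process's
 * binding; as in the BTL, the result is only usable when exactly one
 * NUMA node matches. */
#include <hwloc.h>
#include <stdio.h>

int main(void)
{
    hwloc_topology_t topo;
    hwloc_cpuset_t binding = hwloc_bitmap_alloc();
    int w, nnuma, n_bound = 0, my_numa = -1;

    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);
    hwloc_get_cpubind(topo, binding, HWLOC_CPUBIND_PROCESS);

    nnuma = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_NODE);  /* NUMA nodes, hwloc 1.x name */
    for (w = 0; w < nnuma; w++) {
        hwloc_obj_t obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_NODE, w);
        if (hwloc_bitmap_intersects(obj->cpuset, binding)) {
            n_bound++;
            my_numa = w;
        }
    }
    printf("bound to %d NUMA node(s), mem_node = %d\n",
           n_bound, (1 == n_bound) ? my_numa : -1);

    hwloc_bitmap_free(binding);
    hwloc_topology_destroy(topo);
    return 0;
}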
Code Example #6
static int rank_fill(orte_job_t *jdata,
                     hwloc_obj_type_t target,
                     unsigned cache_level)
{
    orte_app_context_t *app;
    hwloc_obj_t obj;
    int num_objs, i, j, m, n, rc;
    orte_vpid_t num_ranked=0;
    orte_node_t *node;
    orte_proc_t *proc, *pptr;
    orte_vpid_t vpid;
    int cnt;
    hwloc_obj_t locale;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:rank_fill: for job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* if the ranking is fill, then we rank all the procs
     * within a given object before moving on to the next
     *
     *        Node 0                Node 1
     *    Obj 0     Obj 1       Obj 0     Obj 1
     *     0 1       4 5         8 9      12 13
     *     2 3       6 7        10 11     14 15
     */

    vpid = 0;
    for (n=0; n < jdata->apps->size; n++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) {
            continue;
        }

        cnt = 0;
        for (m=0; m < jdata->map->nodes->size; m++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) {
                continue;
            }
            /* get the number of objects - only consider those we can actually use */
            num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target,
                                                          cache_level, OPAL_HWLOC_AVAILABLE);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:rank_fill: found %d objects on node %s with %d procs",
                                num_objs, node->name, (int)node->num_procs);
            if (0 == num_objs) {
                return ORTE_ERR_NOT_SUPPORTED;
            }

            /* for each object */
            for (i=0; i < num_objs && cnt < app->num_procs; i++) {
                obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target,
                                                      cache_level, i, OPAL_HWLOC_AVAILABLE);

                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "mca:rmaps:rank_fill: working object %d", i);

                /* cycle thru the procs on this node */
                for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    /* ignore procs from other jobs */
                    if (proc->name.jobid != jdata->jobid) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_fill skipping proc %s - from another job, num_ranked %d",
                                            ORTE_NAME_PRINT(&proc->name), num_ranked);
                        continue;
                    }
                    /* ignore procs that are already assigned */
                    if (ORTE_VPID_INVALID != proc->name.vpid) {
                        continue;
                    }
                    /* ignore procs from other apps */
                    if (proc->app_idx != app->idx) {
                        continue;
                    }
                     /* protect against bozo case */
                    locale = NULL;
                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                        ORTE_ERROR_LOG(ORTE_ERROR);
                        return ORTE_ERROR;
                    }
                    /* ignore procs not on this object */
                    if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_fill: proc at position %d is not on object %d",
                                            j, i);
                        continue;
                    }
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:rank_fill: assigning vpid %s", ORTE_VPID_PRINT(vpid));
                    proc->name.vpid = vpid++;
                    if (0 == cnt) {
                        app->first_rank = proc->name.vpid;
                    }
                    cnt++;

                    /* insert the proc into the jdata array */
                    if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) {
                        OBJ_RELEASE(pptr);
                    }
                    OBJ_RETAIN(proc);
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                    /* track where the highest vpid landed - this is our
                     * new bookmark
                     */
                    jdata->bookmark = node;
                }
            }
        }
    }

    return ORTE_SUCCESS;
}
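
Finally, a toy illustration of the "fill" ordering from rank_fill()'s comment: all procs under one object are ranked before moving to the next object, and a node is exhausted before moving to the next node. Made-up counts, no ORTE calls:

/* Toy illustration of the "fill" ordering: rank every proc under an object,
 * then advance to the next object, then to the next node. */
#include <stdio.h>

int main(void)
{
    const int nnodes = 2, objs_per_node = 2, procs_per_obj = 4;
    int rank = 0, node, obj, p;

    for (node = 0; node < nnodes; node++) {
        for (obj = 0; obj < objs_per_node; obj++) {
            for (p = 0; p < procs_per_obj; p++) {
                printf("rank %2d -> node %d, object %d\n", rank++, node, obj);
            }
        }
    }
    return 0;
}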