Ejemplo n.º 1
0
void chpl_comm_ofi_oob_allgather(const void* mine, void* all, size_t size) {
  DBG_PRINTF(DBG_OOB, "OOB allGather: %zd", size);

  //
  // PMI doesn't provide an ordered allGather, so we build one here
  // by concatenating the node index and the payload and using that
  // to scatter the unordered PMI_Allgather() results.
  //
  typedef struct {
    int nodeID;
    uint64_t info[];
  } gather_t;

  const size_t g_size = offsetof(gather_t, info) + size;
  gather_t* g_mine;
  CHK_SYS_CALLOC_SZ(g_mine, 1, g_size);
  g_mine->nodeID = chpl_nodeID;
  memcpy(&g_mine->info, mine, size);

  gather_t* g_all;
  CHK_SYS_CALLOC_SZ(g_all, chpl_numNodes, g_size);

  PMI_CHK(PMI_Allgather(g_mine, g_all, g_size));

  for (int g_i = 0; g_i < chpl_numNodes; g_i++) {
    char* g_pa = (char*) g_all + g_i * g_size;
    int i;
    memcpy(&i, g_pa + offsetof(gather_t, nodeID), sizeof(i));
    char* p_a = (char*) all + i * size;
    memcpy(p_a, g_pa + offsetof(gather_t, info), size);
  }

  sys_free(g_all);
  sys_free(g_mine);
}
Ejemplo n.º 2
0
static int cray_fence(opal_list_t *procs, int collect_data)
{
    int rc, cnt;
    int32_t i;
    int *all_lens = NULL;
    opal_value_t *kp, kvn;
    opal_buffer_t *send_buffer = NULL;
    opal_buffer_t *buf = NULL;
    void *sbuf_ptr;
    char *cptr, *rcv_buff = NULL;
    opal_process_name_t id;
    typedef struct {
        uint32_t pmix_rank;
        opal_process_name_t name;
        int32_t nbytes;
    } bytes_and_rank_t;
    int32_t rcv_nbytes_tot;
    bytes_and_rank_t s_bytes_and_rank;
    bytes_and_rank_t *r_bytes_and_ranks = NULL;
    opal_hwloc_locality_t locality;
    opal_list_t vals;
    char *cpuset = NULL;
    opal_process_name_t pname;

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray executing fence cache_global %p cache_local %p",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                        (void *)mca_pmix_cray_component.cache_global,
                        (void *)mca_pmix_cray_component.cache_local);

    /* get the modex data from each local process and set the
     * localities to avoid having the MPI layer fetch data
     * for every process in the job */
    pname.jobid = OPAL_PROC_MY_NAME.jobid;

    /*
     * "unload" the cache_local/cache_global buffers, first copy
     * it so we can continue to use the local buffers if further
     * calls to put can be made
     */

    send_buffer = OBJ_NEW(opal_buffer_t);
    if (NULL == send_buffer) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    opal_dss.copy_payload(send_buffer, mca_pmix_cray_component.cache_global);
    opal_dss.unload(send_buffer, &sbuf_ptr, &s_bytes_and_rank.nbytes);
    s_bytes_and_rank.pmix_rank = pmix_rank;
    s_bytes_and_rank.name = OPAL_PROC_MY_NAME;

    r_bytes_and_ranks = (bytes_and_rank_t *)malloc(pmix_size * sizeof(bytes_and_rank_t));
    if (NULL == r_bytes_and_ranks) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto fn_exit;
    }

    /*
     * gather up all the buffer sizes and rank order.
     * doing this step below since the cray pmi PMI_Allgather doesn't deliver
     * the gathered data necessarily in PMI rank order, although the order stays
     * the same for the duration of a job - assuming no node failures.
     */

    if (PMI_SUCCESS != (rc = PMI_Allgather(&s_bytes_and_rank,r_bytes_and_ranks,sizeof(bytes_and_rank_t)))) {
        OPAL_PMI_ERROR(rc,"PMI_Allgather");
        rc = OPAL_ERR_COMM_FAILURE;
        goto fn_exit;
    }


    for (rcv_nbytes_tot=0,i=0; i < pmix_size; i++) {
        rcv_nbytes_tot += r_bytes_and_ranks[i].nbytes;
    }

    opal_output_verbose(20, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray total number of bytes to receive %d",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), rcv_nbytes_tot);

    rcv_buff = (char *) malloc(rcv_nbytes_tot * sizeof(char));
    if (NULL == rcv_buff) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto fn_exit;
    }

    all_lens = (int *)malloc(sizeof(int) * pmix_size);
    if (NULL == all_lens) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto fn_exit;
    }
    for (i=0; i< pmix_size; i++) {
        all_lens[r_bytes_and_ranks[i].pmix_rank] = r_bytes_and_ranks[i].nbytes;
    }

    if (PMI_SUCCESS != (rc = PMI_Allgatherv(sbuf_ptr,s_bytes_and_rank.nbytes,rcv_buff,all_lens))) {
        OPAL_PMI_ERROR(rc,"PMI_Allgatherv");
        rc = OPAL_ERR_COMM_FAILURE;
        goto fn_exit;
    }

    OBJ_RELEASE(send_buffer);
    send_buffer  = NULL;

    buf = OBJ_NEW(opal_buffer_t);
    if (buf == NULL) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto fn_exit;
    }

    for (cptr = rcv_buff, i=0; i < pmix_size; i++) {

        id = r_bytes_and_ranks[i].name;

        buf->base_ptr = NULL;  /* TODO: ugh */
        if (OPAL_SUCCESS != (rc = opal_dss.load(buf, (void *)cptr, r_bytes_and_ranks[i].nbytes))) {
            OPAL_PMI_ERROR(rc,"pmix:cray opal_dss.load failed");
            goto fn_exit;
        }

        /* unpack and stuff in to the dstore */

        cnt = 1;
        while (OPAL_SUCCESS == (rc = opal_dss.unpack(buf, &kp, &cnt, OPAL_VALUE))) {
            opal_output_verbose(20, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray unpacked kp with key %s type(%d) for id  %s",
                         OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), kp->key, kp->type, OPAL_NAME_PRINT(id));
            if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&id, kp))) {
                OPAL_ERROR_LOG(rc);
                goto fn_exit;
            }
             OBJ_RELEASE(kp);
             cnt = 1;
        }

        cptr += r_bytes_and_ranks[i].nbytes;

    }

    buf->base_ptr = NULL;  /* TODO: ugh */
    OBJ_RELEASE(buf);

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray kvs_fence complete",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

#if OPAL_HAVE_HWLOC
    /* fetch my cpuset */
    OBJ_CONSTRUCT(&vals, opal_list_t);
    if (OPAL_SUCCESS == (rc = opal_pmix_base_fetch(&pmix_pname,
                                                   OPAL_PMIX_CPUSET, &vals))) {
        kp = (opal_value_t*)opal_list_get_first(&vals);
        cpuset = strdup(kp->data.string);
    } else {
        cpuset = NULL;
    }
    OPAL_LIST_DESTRUCT(&vals);
#endif

    /* we only need to set locality for each local rank as "not found"
     * equates to "non-local" */
    for (i=0; i < pmix_nlranks; i++) {
        id.vpid = pmix_lranks[i];
        id.jobid = pmix_jobid;
        opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                "%s checking out if %s is local to me",
                                OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                                OPAL_NAME_PRINT(id));
        /* fetch cpuset for this vpid */
#if OPAL_HAVE_HWLOC
        OBJ_CONSTRUCT(&vals, opal_list_t);
        if (OPAL_SUCCESS != (rc = opal_pmix_base_fetch(&id,
                                                    OPAL_PMIX_CPUSET, &vals))) {
            opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                "%s cpuset for local proc %s not found",
                                OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                                OPAL_NAME_PRINT(id));
            OPAL_LIST_DESTRUCT(&vals);
            /* even though the cpuset wasn't found, we at least know it is
             * on the same node with us */
            locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
        } else {
            kp = (opal_value_t*)opal_list_get_first(&vals);
            if (NULL == kp->data.string) {
                /* if we share a node, but we don't know anything more, then
                 * mark us as on the node as this is all we know
                 */
                locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
            } else {
                /* determine relative location on our node */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 cpuset,
                                                                 kp->data.string);
            }
            OPAL_LIST_DESTRUCT(&vals);
        }
#else
        /* all we know is we share a node */
        locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
#endif
        OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
                             "%s pmix:cray proc %s locality %s",
                             OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                             OPAL_NAME_PRINT(id),
                             opal_hwloc_base_print_locality(locality)));

        OBJ_CONSTRUCT(&kvn, opal_value_t);
        kvn.key = strdup(OPAL_PMIX_LOCALITY);
        kvn.type = OPAL_UINT16;
        kvn.data.uint16 = locality;
        opal_pmix_base_store(&pname, &kvn);
        OBJ_DESTRUCT(&kvn);
    }

fn_exit:
#if OPAL_HAVE_HWLOC
    if (NULL != cpuset) {
        free(cpuset);
    }
#endif
    if (all_lens != NULL) {
        free(all_lens);
    }
    if (rcv_buff != NULL) {
        free(rcv_buff);
    }
    if (r_bytes_and_ranks != NULL) {
        free(r_bytes_and_ranks);
    }
    return rc;
}
Ejemplo n.º 3
0
    counter++;
#endif
#else
    PMI_Barrier();
#endif
}

#if HAVE_PMI_ALLGATHER
static gasnet_node_t *gasnetc_pmi_allgather_order = NULL;
GASNETI_INLINE(gasnetc_pmi_allgather_init)
void gasnetc_pmi_allgather_init(void) {
    /* perform (just once) an Allgather of node number to establish the order */
    if_pf (!gasnetc_pmi_allgather_order) {
        int rc;
        gasnetc_pmi_allgather_order = gasneti_malloc(gasneti_nodes * sizeof(gasnet_node_t));
        rc = PMI_Allgather(&gasneti_mynode, gasnetc_pmi_allgather_order, sizeof(gasnet_node_t));
        gasneti_assert(PMI_SUCCESS == rc);
    }
}
#endif

/* gasneti_bootstrapExchange
 */
void gasneti_bootstrapExchange_pmi(void *src, size_t len, void *dest) {
#if HAVE_PMI_ALLGATHER
    uint8_t *unsorted = gasneti_malloc(len * gasneti_nodes); /* TODO: use alloca()? */
    gasnet_node_t i;
    int rc;

    /* Allgather the callers data to a temporary array */
    gasnetc_pmi_allgather_init();