Exemplo n.º 1
0
int
ompi_mtl_portals4_add_procs(struct mca_mtl_base_module_t *mtl,
                            size_t nprocs,
                            struct ompi_proc_t** procs)
{
    int ret, me;
    size_t i;
    bool new_found = false;

    /* Get the list of ptl_process_id_t from the runtime and copy into structure */
    for (i = 0 ; i < nprocs ; ++i) {
        ptl_process_t *modex_id;
        size_t size;

        if( procs[i] == ompi_proc_local_proc ) {
            me = i;
        }

        if (procs[i]->super.proc_arch != ompi_proc_local()->super.proc_arch) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "Portals 4 MTL does not support heterogeneous operations.");
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "Proc %s architecture %x, mine %x.",
                                OMPI_NAME_PRINT(&procs[i]->super.proc_name), 
                                procs[i]->super.proc_arch, ompi_proc_local()->super.proc_arch);
            return OMPI_ERR_NOT_SUPPORTED;
        }

        OPAL_MODEX_RECV(ret, &mca_mtl_portals4_component.mtl_version,
                        &procs[i]->super, (char**)&modex_id, &size);
        if (OMPI_SUCCESS != ret) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: ompi_modex_recv failed: %d\n",
                                __FILE__, __LINE__, ret);
            return ret;
        } else if (sizeof(ptl_process_t) != size) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: ompi_modex_recv failed: %d\n",
                                __FILE__, __LINE__, ret);
            return OMPI_ERR_BAD_PARAM;
        }

        if (NULL == procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4]) {
            ptl_process_t *peer_id;
            peer_id = malloc(sizeof(ptl_process_t));
            if (NULL == peer_id) {
                opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                    "%s:%d: malloc failed: %d\n",
                                    __FILE__, __LINE__, ret);
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
            *peer_id = *modex_id;
            procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4] = peer_id;

            new_found = true;
        } else {
            ptl_process_t *proc = (ptl_process_t*) procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4];
            if (proc->phys.nid != modex_id->phys.nid ||
                proc->phys.pid != modex_id->phys.pid) {
                opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                    "%s:%d: existing peer and modex peer don't match\n",
                                    __FILE__, __LINE__);
                return OMPI_ERROR;
            }
        }
    }

#if OMPI_MTL_PORTALS4_FLOW_CONTROL
    if (new_found) {
        ret = ompi_mtl_portals4_flowctl_add_procs(me, nprocs, procs);
        if (OMPI_SUCCESS != ret) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: flowctl_add_procs failed: %d\n",
                                __FILE__, __LINE__, ret);
            return ret;
        }
    }
#endif

    return OMPI_SUCCESS;
}
Exemplo n.º 2
0
int
ompi_mtl_portals4_add_procs(struct mca_mtl_base_module_t *mtl,
                            size_t nprocs,
                            struct ompi_proc_t** procs)
{
    int ret, me;
    size_t i;
    bool new_found = false;
    ptl_process_t *maptable;

    if (ompi_mtl_portals4.use_logical) {
        maptable = malloc(sizeof(ptl_process_t) * nprocs);
        if (NULL == maptable) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: malloc failed\n",
                                __FILE__, __LINE__);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    }

    /* Get the list of ptl_process_id_t from the runtime and copy into structure */
    for (i = 0 ; i < nprocs ; ++i) {
        ptl_process_t *modex_id;
        size_t size;

        if( procs[i] == ompi_proc_local_proc ) {
            me = i;
        }

        if (procs[i]->super.proc_arch != ompi_proc_local()->super.proc_arch) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "Portals 4 MTL does not support heterogeneous operations.");
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "Proc %s architecture %x, mine %x.",
                                OMPI_NAME_PRINT(&procs[i]->super.proc_name),
                                procs[i]->super.proc_arch, ompi_proc_local()->super.proc_arch);
            return OMPI_ERR_NOT_SUPPORTED;
        }

        OPAL_MODEX_RECV(ret, &mca_mtl_portals4_component.mtl_version,
                        &procs[i]->super.proc_name, (uint8_t**)&modex_id, &size);
        if (OMPI_SUCCESS != ret) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: ompi_modex_recv failed: %d\n",
                                __FILE__, __LINE__, ret);
            return ret;
        } else if (sizeof(ptl_process_t) != size) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: ompi_modex_recv failed: %d\n",
                                __FILE__, __LINE__, ret);
            return OMPI_ERR_BAD_PARAM;
        }

        if (NULL == procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4]) {
            ptl_process_t *peer_id;
            peer_id = malloc(sizeof(ptl_process_t));
            if (NULL == peer_id) {
                opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                    "%s:%d: malloc failed: %d\n",
                                    __FILE__, __LINE__, ret);
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
            if (ompi_mtl_portals4.use_logical) {
                peer_id->rank = i;
                maptable[i].phys.pid = modex_id->phys.pid;
                maptable[i].phys.nid = modex_id->phys.nid;
                opal_output_verbose(50, ompi_mtl_base_framework.framework_output,
                    "logical: global rank=%d pid=%d nid=%d\n",
                    (int)i, maptable[i].phys.pid, maptable[i].phys.nid);
            } else {
                *peer_id = *modex_id;
            }

            procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4] = peer_id;

            new_found = true;
        } else {
            ptl_process_t *proc = (ptl_process_t*) procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4];
            if (ompi_mtl_portals4.use_logical) {
                if ((size_t)proc->rank != i) {
                    opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                    "%s:%d: existing peer and rank don't match\n",
                                    __FILE__, __LINE__);
                    return OMPI_ERROR;
                }
                maptable[i].phys.pid = modex_id->phys.pid;
                maptable[i].phys.nid = modex_id->phys.nid;
            }
            else if (proc->phys.nid != modex_id->phys.nid ||
                     proc->phys.pid != modex_id->phys.pid) {
                opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                    "%s:%d: existing peer and modex peer don't match\n",
                                    __FILE__, __LINE__);
                return OMPI_ERROR;
            }
        }
    }

    if (ompi_mtl_portals4.use_logical) {
        ret = PtlSetMap(ompi_mtl_portals4.ni_h, nprocs, maptable);
        if (OMPI_SUCCESS != ret) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: logical mapping failed: %d\n",
                                __FILE__, __LINE__, ret);
            return ret;
        }
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "logical mapping OK\n");
        free(maptable);
    }

    portals4_init_interface();

    /* activate progress callback */
    ret = opal_progress_register(ompi_mtl_portals4_progress);
    if (OMPI_SUCCESS != ret) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: opal_progress_register failed: %d\n",
                            __FILE__, __LINE__, ret);
        return ret;
    }

#if OMPI_MTL_PORTALS4_FLOW_CONTROL
    if (new_found) {
        ret = ompi_mtl_portals4_flowctl_add_procs(me, nprocs, procs);
        if (OMPI_SUCCESS != ret) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: flowctl_add_procs failed: %d\n",
                                __FILE__, __LINE__, ret);
            return ret;
        }
    }
#endif

    return OMPI_SUCCESS;
}