Exemple #1
0
FORT_DLL_SPEC void FORT_CALL mpi_cart_sub_ ( MPI_Fint *v1, MPI_Fint v2[], MPI_Fint *v3, MPI_Fint *ierr ){
    int _ctsize;
    int *l2=0;
    {int _topotype;
    PMPI_Topo_test( (MPI_Comm)*v1, &_topotype );
    if (_topotype != MPI_CART) {
        _ctsize = 0;
    }
    else 
        PMPI_Cartdim_get( (MPI_Comm)*v1, &_ctsize );
    }

    if (_ctsize) {int li;
     l2 = (int *)MPL_malloc(_ctsize * sizeof(int), MPL_MEM_OTHER);
     for (li=0; li<_ctsize; li++) {
        l2[li] = MPII_FROM_FLOG(v2[li]);
     }
    }
    *ierr = MPI_Cart_sub( (MPI_Comm)(*v1), l2, (MPI_Comm *)(v3) );
    if (l2) { MPL_free( l2 ); }
}
Exemple #2
0
static HYD_status fn_job_getid(int fd, char *args[])
{
    struct HYD_string_stash stash;
    char *cmd, *thrid;
    struct HYD_pmcd_token *tokens = NULL;
    int token_count;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count);
    HYDU_ERR_POP(status, "unable to convert args to tokens\n");

    thrid = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "thrid");

    HYD_STRING_STASH_INIT(stash);
    HYD_STRING_STASH(stash, MPL_strdup("cmd=job-getid-response;"), status);
    if (thrid) {
        HYD_STRING_STASH(stash, MPL_strdup("thrid="), status);
        HYD_STRING_STASH(stash, MPL_strdup(thrid), status);
        HYD_STRING_STASH(stash, MPL_strdup(";"), status);
    }
    HYD_STRING_STASH(stash, MPL_strdup("jobid="), status);
    HYD_STRING_STASH(stash, MPL_strdup(HYD_pmcd_pmip.local.kvs->kvsname), status);
    HYD_STRING_STASH(stash, MPL_strdup(";rc=0;"), status);

    HYD_STRING_SPIT(stash, cmd, status);

    send_cmd_downstream(fd, cmd);
    MPL_free(cmd);

  fn_exit:
    if (tokens)
        HYD_pmcd_pmi_free_tokens(tokens, token_count);
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
Exemple #3
0
int MPIR_Info_set_impl(MPID_Info *info_ptr, const char *key, const char *value)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Info *curr_ptr, *prev_ptr;
    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_INFO_SET_IMPL);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_INFO_SET_IMPL);

    prev_ptr = info_ptr;
    curr_ptr = info_ptr->next;

    while (curr_ptr) {
        if (!strncmp(curr_ptr->key, key, MPI_MAX_INFO_KEY)) {
            /* Key already present; replace value */
            MPL_free(curr_ptr->value);
            curr_ptr->value = MPL_strdup(value);
            break;
        }
        prev_ptr = curr_ptr;
        curr_ptr = curr_ptr->next;
    }

    if (!curr_ptr) {
        /* Key not present, insert value */
        mpi_errno = MPIU_Info_alloc(&curr_ptr);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        /*printf( "Inserting new elm %x at %x\n", curr_ptr->id, prev_ptr->id );*/
        prev_ptr->next   = curr_ptr;
        curr_ptr->key    = MPL_strdup(key);
        curr_ptr->value  = MPL_strdup(value);
    }

fn_exit:
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_INFO_SET_IMPL);
    return mpi_errno;

fn_fail:
    goto fn_exit;
}
Exemple #4
0
static HYD_status control_port_fn(char *arg, char ***argv)
{
    char *port = NULL;
    HYD_status status = HYD_SUCCESS;

    HYDU_ERR_CHKANDJUMP(status, HYD_pmcd_pmip.upstream.server_name, HYD_INTERNAL_ERROR,
                        "duplicate control port setting\n");

    port = MPL_strdup(**argv);
    HYD_pmcd_pmip.upstream.server_name = MPL_strdup(strtok(port, ":"));
    HYD_pmcd_pmip.upstream.server_port = atoi(strtok(NULL, ":"));

    (*argv)++;

  fn_exit:
    if (port)
        MPL_free(port);
    return status;

  fn_fail:
    goto fn_exit;
}
Exemple #5
0
RLOG_Struct* RLOG_InitLog(int rank, int size)
{
    RLOG_Struct* pRLOG;

    pRLOG = (RLOG_Struct*)MPL_malloc(sizeof(RLOG_Struct));
    if (pRLOG == NULL)
	return NULL;

    pRLOG->nRank = rank;
    pRLOG->nSize = size;
    pRLOG->nRecursion = 0;
    pRLOG->nCurEventId = RLOG_FIRST_EVENT_ID;
    pRLOG->dFirstTimestamp = 0.0;
    MPL_snprintf(pRLOG->pszFileName, 256, "log%d.irlog", rank);

    pRLOG->pOutput = NULL;
    pRLOG->pOutput = IRLOG_CreateOutputStruct(pRLOG->pszFileName);
    if (pRLOG->pOutput == NULL)
    {
	MPL_error_printf("RLOG Error: unable to allocate an output structure.\n");
	MPL_free(pRLOG);
	return NULL;
    }

    RLOG_EnableLogging(pRLOG);

    /* save the parts of the header and event that do not change */
    pRLOG->DiskEvent.event = RLOG_GetNextEventID(pRLOG);
    pRLOG->DiskEvent.rank = rank;
    pRLOG->DiskHeader.type = RLOG_EVENT_TYPE;
    pRLOG->DiskHeader.length = sizeof(RLOG_HEADER) + sizeof(RLOG_EVENT);
    /* put the description of the state in the log file */
    RLOG_DescribeState(pRLOG, pRLOG->DiskEvent.event, "RLOG_DISK", "255   0   0");

    RLOG_DisableLogging(pRLOG);

    return pRLOG;
}
HYD_status HYDT_bscd_slurm_query_node_list(struct HYD_node **node_list)
{
    char *list, *dummy, *task_list;
    int nnodes;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    if (MPL_env2str("SLURM_NODELIST", (const char **) &list) == 0) {
        *node_list = NULL;
        goto fn_exit;
    }

    if (MPL_env2str("SLURM_NNODES", (const char **) &dummy) == 0) {
        *node_list = NULL;
        goto fn_exit;
    }
    nnodes = atoi(dummy);

    if (MPL_env2str("SLURM_TASKS_PER_NODE", (const char **) &task_list) == 0) {
        *node_list = NULL;
        goto fn_exit;
    }

    status = extract_tasks_per_node(nnodes, task_list);
    HYDU_ERR_POP(status, "unable to extract the number of tasks per node\n");

    list_to_nodes(list);
    *node_list = global_node_list;

  fn_exit:
    MPL_free(tasks_per_node);
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
Exemple #7
0
int MPL_args_deserialize(int len, const void *serialized_buf, int *argc, char ***argv)
{
    const char *buf = serialized_buf;
    int nargs;
    int *arg_lengths;
    char **targv;
    int i;

    nargs = *((int *) buf);
    buf += sizeof(int);

    targv = (char **) MPL_malloc(nargs * sizeof(char *), MPL_MEM_STRINGS);
    arg_lengths = (int *) MPL_malloc(nargs * sizeof(int), MPL_MEM_STRINGS);

    assert(targv && arg_lengths);

    for (i = 0; i < nargs; i++) {
        arg_lengths[i] = (*((int *) buf));
        buf += sizeof(int);

        /* allocate an extra end-of-string character for each string */
        targv[i] = (char *) MPL_malloc(arg_lengths[i] + 1, MPL_MEM_STRINGS);
        assert(targv[i]);
    }

    for (i = 0; i < nargs; i++) {
        memcpy(targv[i], buf, arg_lengths[i]);
        targv[i][arg_lengths[i]] = 0;   /* append an end of string character */
        buf += arg_lengths[i];
    }

    *argc = nargs;
    *argv = targv;

    MPL_free(arg_lengths);

    return 0;
}
Exemple #8
0
static HYD_status control_port_fn(char *arg, char ***argv)
{
    char *port = NULL, *name;
    HYD_status status = HYD_SUCCESS;

    HYDU_ERR_CHKANDJUMP(status, HYD_pmcd_pmip.upstream.server_name, HYD_INTERNAL_ERROR,
                        "duplicate control port setting\n");

    port = MPL_strdup(**argv);
    HYDU_ERR_CHKANDJUMP(status, NULL == port, HYD_INTERNAL_ERROR, "port not provided\n");
    name = strtok(port, ":");
    HYD_pmcd_pmip.upstream.server_name = name ? MPL_strdup(name) : NULL;
    HYD_pmcd_pmip.upstream.server_port = strtol(strtok(NULL, ":"), NULL, 10);

    (*argv)++;

  fn_exit:
    MPL_free(port);
    return status;

  fn_fail:
    goto fn_exit;
}
Exemple #9
0
int MPID_nem_ptl_poll_finalize(void)
{
    int mpi_errno = MPI_SUCCESS;
    int i;
    int ret;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_PTL_POLL_FINALIZE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_PTL_POLL_FINALIZE);
    
    for (i = 0; i < NUM_OVERFLOW_ME; ++i) {
        if (overflow_me_handle[i] != PTL_INVALID_HANDLE) {
            ret = PtlMEUnlink(overflow_me_handle[i]);
            MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmeunlink", "**ptlmeunlink %s", MPID_nem_ptl_strerror(ret));
        }
        MPL_free(overflow_buf[i]);
    }
    
 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_PTL_POLL_FINALIZE);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
Exemple #10
0
char *HYDU_getcwd(void)
{
    char *cwdval, *retval = NULL;
    const char *pwdval;
    HYD_status status = HYD_SUCCESS;
#if defined HAVE_STAT
    struct stat spwd, scwd;
#endif /* HAVE_STAT */

    if (MPL_env2str("PWD", &pwdval) == 0)
        pwdval = NULL;
    HYDU_MALLOC_OR_JUMP(cwdval, char *, HYDRA_MAX_PATH, status);
    if (getcwd(cwdval, HYDRA_MAX_PATH) == NULL)
        HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
                            "allocated space is too small for absolute path\n");

#if defined HAVE_STAT
    if (pwdval && stat(pwdval, &spwd) != -1 && stat(cwdval, &scwd) != -1 &&
            spwd.st_dev == scwd.st_dev && spwd.st_ino == scwd.st_ino) {
        /* PWD and getcwd() match; use the PWD value */
        retval = MPL_strdup(pwdval);
        MPL_free(cwdval);
    }
    else
#endif /* HAVE_STAT */
    {
        /* PWD and getcwd() don't match; use the getcwd value and hope
         * for the best. */
        retval = cwdval;
    }

fn_exit:
    return retval;

fn_fail:
    goto fn_exit;
}
Exemple #11
0
int MPID_NS_Unpublish(MPID_NS_Handle handle, const MPIR_Info * info_ptr, const char service_name[])
{
    char filename[MAXPATHLEN];
    int err;
    int i;

    /* Remove the file corresponding to the service name */
    /* Determine file and directory name.  The file name is from
     * the service name */
    MPL_strncpy(filename, handle->dirname, MAXPATHLEN);
    MPL_strnapp(filename, service_name, MAXPATHLEN);

    /* Find the filename from the list of published files */
    for (i = 0; i < handle->nactive; i++) {
        if (handle->filenames[i] && strcmp(filename, handle->filenames[i]) == 0) {
            /* unlink the file only if we find it */
            unlink(filename);
            MPL_free(handle->filenames[i]);
            handle->filenames[i] = 0;
            break;
        }
    }

    if (i == handle->nactive) {
        /* --BEGIN ERROR HANDLING-- */
        /* Error: this name was not found */
        err = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __func__, __LINE__,
                                   MPI_ERR_SERVICE, "**namepubnotpub",
                                   "**namepubnotpub %s", service_name);
        return err;
        /* --END ERROR HANDLING-- */
    }

    /* Later, we can reduce the number of active and compress the list */

    return 0;
}
Exemple #12
0
HYD_status HYDT_topo_finalize(void)
{
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    /* Finalize the topology library requested by the user */
#if defined HAVE_HWLOC
    if (!strcmp(HYDT_topo_info.topolib, "hwloc")) {
        status = HYDT_topo_hwloc_finalize();
        HYDU_ERR_POP(status, "unable to finalize hwloc\n");
    }
#endif /* HAVE_HWLOC */

    if (HYDT_topo_info.topolib)
        MPL_free(HYDT_topo_info.topolib);

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
Exemple #13
0
void AppendFile(FILE *fout, FILE *fin)
{
    int total;
    int num_read, num_written;
    char *buffer, *buf;

    buffer = (char*)MPL_malloc(sizeof(char) * BUFFER_SIZE);

    total = ftell(fin);
    fseek(fin, 0L, SEEK_SET);

    while (total)
    {
	num_read = fread(buffer, 1, min(BUFFER_SIZE, total), fin);
	if (num_read == 0)
	{
	    MPL_error_printf("failed to read from input file\n");
	    return;
	}
	total -= num_read;
	buf = buffer;
	while (num_read)
	{
	    num_written = fwrite(buf, 1, num_read, fout);
	    if (num_written == 0)
	    {
		MPL_error_printf("failed to write to output file\n");
		return;
	    }
	    num_read -= num_written;
	    buf += num_written;
	}
    }

    MPL_free(buffer);
}
Exemple #14
0
int MPIR_T_pvar_handle_free_impl(MPI_T_pvar_session session, MPI_T_pvar_handle *handle)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_T_pvar_handle_t *hnd = *handle;

    MPL_DL_DELETE(session->hlist, hnd);

    /* Unlink handle from pvar if it is a watermark */
    if (MPIR_T_pvar_is_watermark(hnd)) {
        MPIR_T_pvar_watermark_t *mark = (MPIR_T_pvar_watermark_t *)hnd->addr;
        if (MPIR_T_pvar_is_first(hnd)) {
            mark->first_used = 0;
            mark->first_started = 0;
        } else {
            MPIU_Assert(mark->hlist);
            if (mark->hlist == hnd) {
                /* hnd happens to be the head */
                mark->hlist = hnd->next2;
                if (mark->hlist != NULL)
                    mark->hlist->prev2 = mark->hlist;
            } else {
                hnd->prev2->next2 = hnd->next2;
                if (hnd->next2 != NULL)
                    hnd->next2->prev2 = hnd->prev2;
            }
        }
    }

    MPL_free(hnd);
    *handle = MPI_T_PVAR_HANDLE_NULL;

fn_exit:
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
void MPIDU_Datatype_free_contents(MPIDU_Datatype *dtp)
{
    int i, struct_sz = sizeof(MPIDU_Datatype_contents);
    int align_sz = 8, epsilon;
    MPIDU_Datatype *old_dtp;
    MPI_Datatype *array_of_types;

    if ((epsilon = struct_sz % align_sz)) {
	struct_sz += align_sz - epsilon;
    }

    /* note: relies on types being first after structure */
    array_of_types = (MPI_Datatype *) ((char *)dtp->contents + struct_sz);

    for (i=0; i < dtp->contents->nr_types; i++) {
	if (HANDLE_GET_KIND(array_of_types[i]) != HANDLE_KIND_BUILTIN) {
	    MPIDU_Datatype_get_ptr(array_of_types[i], old_dtp);
	    MPIDU_Datatype_release(old_dtp);
	}
    }

    MPL_free(dtp->contents);
    dtp->contents = NULL;
}
Exemple #16
0
int MPIDI_PG_Destroy(MPIDI_PG_t * pg)
{
    MPIDI_PG_t * pg_prev;
    MPIDI_PG_t * pg_cur;
    int i;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_PG_DESTROY);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_PG_DESTROY);

    MPIR_Assert(MPIR_Object_get_ref(pg) == 0);

    pg_prev = NULL;
    pg_cur = MPIDI_PG_list;
    while(pg_cur != NULL)
    {
	if (pg_cur == pg)
	{
	    if (MPIDI_PG_iterator_next == pg)
	    { 
		MPIDI_PG_iterator_next = MPIDI_PG_iterator_next->next;
	    }

            if (pg_prev == NULL)
                MPIDI_PG_list = pg->next; 
            else
                pg_prev->next = pg->next;

            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_DISCONNECT, VERBOSE, (MPL_DBG_FDEST, "destroying pg=%p pg->id=%s", pg, (char *)pg->id));

            for (i = 0; i < pg->size; ++i) {
                /* FIXME it would be good if we could make this assertion.
                   Unfortunately, either:
                   1) We're not being disciplined and some caller of this
                      function doesn't bother to manage all the refcounts
                      because he thinks he knows better.  Annoying, but not
                      strictly a bug.
		      (wdg - actually, that is a bug - managing the ref
		      counts IS required and missing one is a bug.)
                   2) There is a real bug lurking out there somewhere and we
                      just haven't hit it in the tests yet.  */
                /*MPIR_Assert(MPIR_Object_get_ref(pg->vct[i]) == 0);*/

                MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_DISCONNECT, VERBOSE, (MPL_DBG_FDEST, "about to free pg->vct=%p which contains vc=%p", pg->vct, &pg->vct[i]));

                /* This used to be handled in MPIDI_VCRT_Release, but that was
                   not the right place to do this.  The VC should only be freed
                   when the PG that it belongs to is freed, not just when the
                   VC's refcount drops to zero. [goodell@ 2008-06-13] */
		/* In that case, the fact that the VC is in the PG should
		   increment the ref count - reflecting the fact that the
		   use in the PG constitutes a reference-count-incrementing
		   use.  Alternately, if the PG is able to recreate a VC, 
		   and can thus free unused (or idle) VCs, it should be allowed
		   to do so.  [wdg 2008-08-31] */
                mpi_errno = MPIDI_CH3_VC_Destroy(&(pg->vct[i]));
                if (mpi_errno) { MPIR_ERR_POP(mpi_errno); }
            }

	    MPIDI_PG_Destroy_fn(pg);
	    MPL_free(pg->vct);
	    if (pg->connData) {
		if (pg->freeConnInfo) {
		    (*pg->freeConnInfo)( pg );
		}
		else {
		    MPL_free(pg->connData);
		}
	    }
	    mpi_errno = MPIDI_CH3_PG_Destroy(pg);
	    MPL_free(pg);

	    goto fn_exit;
	}

	pg_prev = pg_cur;
	pg_cur = pg_cur->next;
    }

    /* PG not found if we got here */
    MPIR_ERR_SET1(mpi_errno,MPI_ERR_OTHER,
		  "**dev|pg_not_found", "**dev|pg_not_found %p", pg);

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_PG_DESTROY);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
Exemple #17
0
/* XXX DJG FIXME at some point this should poll, much like the newtcp module.
   But then we have that whole pollfd array to manage, which we don't really
   need until this proof-of-concept proves itself. */
int MPID_nem_lmt_vmsplice_progress(void)
{
    int mpi_errno = MPI_SUCCESS;
    struct lmt_vmsplice_node *prev = NULL;
    struct lmt_vmsplice_node *free_me = NULL;
    struct lmt_vmsplice_node *cur = outstanding_head;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_LMT_VMSPLICE_PROGRESS);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_LMT_VMSPLICE_PROGRESS);
    
    while (cur) {
        int complete = 0;

        switch (MPIDI_Request_get_type(cur->req)) {
            case MPIDI_REQUEST_TYPE_RECV:
                mpi_errno = do_readv(cur->req, cur->pipe_fd, cur->req->dev.iov,
                                     &cur->req->dev.iov_offset,
                                     &cur->req->dev.iov_count, &complete);
                /* FIXME: set the error status of the req and complete it, rather than POP */
                if (mpi_errno) MPIR_ERR_POP(mpi_errno);
                break;
            case MPIDI_REQUEST_TYPE_SEND:
                mpi_errno = do_vmsplice(cur->req, cur->pipe_fd, cur->req->dev.iov,
                                        &cur->req->dev.iov_offset,
                                        &cur->req->dev.iov_count, &complete);
                /* FIXME: set the error status of the req and complete it, rather than POP */
                if (mpi_errno) MPIR_ERR_POP(mpi_errno);
                break;
            default:
                MPIR_ERR_INTERNALANDJUMP(mpi_errno, "unexpected request type");
                break;
        }

        if (complete) {
            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete");

            /* remove the node from the list */
            if (cur == outstanding_head) {
                outstanding_head = cur->next;
                prev = NULL;
                free_me = cur;
                cur = cur->next;
            }
            else {
                prev->next = cur->next;
                prev = cur;
                free_me = cur;
                cur = cur->next;
            }
            if (free_me) MPL_free(free_me);
            --MPID_nem_local_lmt_pending;
        }

        if (!cur) break; /* we might have made cur NULL above */

        prev = cur;
        cur = cur->next;
    }

fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_LMT_VMSPLICE_PROGRESS);
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
Exemple #18
0
int MPIDO_Gatherv(const void *sendbuf, 
                  int sendcount, 
                  MPI_Datatype sendtype,
                  void *recvbuf, 
                  const int *recvcounts, 
                  const int *displs, 
                  MPI_Datatype recvtype,
                  int root, 
                  MPID_Comm * comm_ptr, 
                  int *mpierrno)

{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requries support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
   TRACE_ERR("Entering MPIDO_Gatherv\n");
   int i;
   int contig ATTRIBUTE((unused)), rsize ATTRIBUTE((unused)), ssize ATTRIBUTE((unused));
   int pamidt = 1;
   MPID_Datatype *dt_ptr = NULL;
   MPI_Aint send_true_lb, recv_true_lb;
   char *sbuf, *rbuf;
   pami_type_t stype, rtype;
   int tmp;
   volatile unsigned gatherv_active = 1;
   const int rank = comm_ptr->rank;
   const int size = comm_ptr->local_size;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
    const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   const int selected_type = mpid->user_selected_type[PAMI_XFER_GATHERV_INT];

   /* Check for native PAMI types and MPI_IN_PLACE on sendbuf */
   /* MPI_IN_PLACE is a nonlocal decision. We will need a preallreduce if we ever have
    * multiple "good" gathervs that work on different counts for example */
   if((sendbuf != MPI_IN_PLACE) && (MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp) != MPI_SUCCESS))
      pamidt = 0;
   if(MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp) != MPI_SUCCESS)
      pamidt = 0;

   if(pamidt == 0 || selected_type == MPID_COLL_USE_MPICH)
   {
      if(unlikely(verbose))
         fprintf(stderr,"Using MPICH gatherv algorithm\n");
      TRACE_ERR("GATHERV using MPICH\n");
      MPIDI_Update_last_algorithm(comm_ptr, "GATHERV_MPICH");
#if CUDA_AWARE_SUPPORT
    if(MPIDI_Process.cuda_aware_support_on)
    {
       MPI_Aint sdt_extent,rdt_extent;
       MPID_Datatype_get_extent_macro(sendtype, sdt_extent);
       MPID_Datatype_get_extent_macro(recvtype, rdt_extent);
       char *scbuf = NULL;
       char *rcbuf = NULL;
       int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
       int is_recv_dev_buf = (rank == root) ? MPIDI_cuda_is_device_buf(recvbuf) : 0;
       if(is_send_dev_buf)
       {
         scbuf = MPL_malloc(sdt_extent * sendcount);
         cudaError_t cudaerr = CudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost);
         if (cudaSuccess != cudaerr)
           fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
       }
       else
         scbuf = sendbuf;
       size_t rtotal_buf;
       if(is_recv_dev_buf)
       {
         //Since displs can be non-continous, we need to calculate max buffer size 
         int highest_displs = displs[size - 1];
         int highest_recvcount = recvcounts[size - 1];
         for(i = 0; i < size; i++)
         {
           if(displs[i]+recvcounts[i] > highest_displs+highest_recvcount)
           {
             highest_displs = displs[i];
             highest_recvcount = recvcounts[i];
           }
         }
         rtotal_buf = (highest_displs+highest_recvcount)*rdt_extent;
         rcbuf = MPL_malloc(rtotal_buf);
         if(sendbuf == MPI_IN_PLACE)
         {
           cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rtotal_buf, cudaMemcpyDeviceToHost);
           if (cudaSuccess != cudaerr)
             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
         }
         else
           memset(rcbuf, 0, rtotal_buf);
       }
       else
         rcbuf = recvbuf;
       int cuda_res =  MPIR_Gatherv(scbuf, sendcount, sendtype, rcbuf, recvcounts, displs, recvtype, root, comm_ptr, mpierrno);
       if(is_send_dev_buf)MPL_free(scbuf);
       if(is_recv_dev_buf)
         {
           cudaError_t cudaerr = CudaMemcpy(recvbuf, rcbuf, rtotal_buf, cudaMemcpyHostToDevice);
           if (cudaSuccess != cudaerr)
             fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
           MPL_free(rcbuf);
         }
       return cuda_res;
    }
    else
#endif
      return MPIR_Gatherv(sendbuf, sendcount, sendtype,
               recvbuf, recvcounts, displs, recvtype,
               root, comm_ptr, mpierrno);
   }

   MPIDI_Datatype_get_info(1, recvtype, contig, rsize, dt_ptr, recv_true_lb);
   rbuf = (char *)recvbuf + recv_true_lb;
   sbuf = (void *) sendbuf;

   pami_xfer_t gatherv;

   gatherv.cb_done = cb_gatherv;
   gatherv.cookie = (void *)&gatherv_active;
   gatherv.cmd.xfer_gatherv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
   gatherv.cmd.xfer_gatherv_int.rcvbuf = rbuf;
   gatherv.cmd.xfer_gatherv_int.rtype = rtype;
   gatherv.cmd.xfer_gatherv_int.rtypecounts = (int *) recvcounts;
   gatherv.cmd.xfer_gatherv_int.rdispls = (int *) displs;

   gatherv.cmd.xfer_gatherv_int.sndbuf = NULL;
   gatherv.cmd.xfer_gatherv_int.stype = stype;
   gatherv.cmd.xfer_gatherv_int.stypecount = sendcount;

   if(rank == root)
   {
      if(sendbuf == MPI_IN_PLACE) 
      {
         if(unlikely(verbose))
            fprintf(stderr,"gatherv MPI_IN_PLACE buffering\n");
         sbuf = PAMI_IN_PLACE;
         gatherv.cmd.xfer_gatherv_int.stype = rtype;
         gatherv.cmd.xfer_gatherv_int.stypecount = recvcounts[rank];
      }
      else
      {
         MPIDI_Datatype_get_info(1, sendtype, contig, ssize, dt_ptr, send_true_lb);
         sbuf = (char *)sbuf + send_true_lb;
      }
   }
   gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;

   pami_algorithm_t my_gatherv;
   const pami_metadata_t *my_md = (pami_metadata_t *)NULL;
   int queryreq = 0;

   if(selected_type == MPID_COLL_OPTIMIZED)
   {
      TRACE_ERR("Optimized gatherv %s was selected\n",
         mpid->opt_protocol_md[PAMI_XFER_GATHERV_INT][0].name);
      my_gatherv = mpid->opt_protocol[PAMI_XFER_GATHERV_INT][0];
      my_md = &mpid->opt_protocol_md[PAMI_XFER_GATHERV_INT][0];
      queryreq = mpid->must_query[PAMI_XFER_GATHERV_INT][0];
   }
   else
   {
      TRACE_ERR("Optimized gatherv %s was set by user\n",
         mpid->user_metadata[PAMI_XFER_GATHERV_INT].name);
         my_gatherv = mpid->user_selected[PAMI_XFER_GATHERV_INT];
         my_md = &mpid->user_metadata[PAMI_XFER_GATHERV_INT];
         queryreq = selected_type;
   }

   gatherv.algorithm = my_gatherv;


   if(unlikely(queryreq == MPID_COLL_ALWAYS_QUERY || 
               queryreq == MPID_COLL_CHECK_FN_REQUIRED))
   {
      metadata_result_t result = {0};
      TRACE_ERR("querying gatherv protocol %s, type was %d\n", 
         my_md->name, queryreq);
      if(my_md->check_fn == NULL)
      {
         /* process metadata bits */
         if((!my_md->check_correct.values.inplace) && (sendbuf == MPI_IN_PLACE))
            result.check.unspecified = 1;
/* Can't check ranges like this.  Non-local.  Comment out for now.
         if(my_md->check_correct.values.rangeminmax)
         {
            MPI_Aint data_true_lb;
            MPID_Datatype *data_ptr;
            int data_size, data_contig;
            MPIDI_Datatype_get_info(sendcount, sendtype, data_contig, data_size, data_ptr, data_true_lb); 
            if((my_md->range_lo <= data_size) &&
               (my_md->range_hi >= data_size))
               ; 
            else
            {
               result.check.range = 1;
               if(unlikely(verbose))
               {   
                  fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n",
                          data_size,
                          my_md->range_lo,
                          my_md->range_hi,
                          my_md->name);
               }
            }
         }
 */
      }
      else /* calling the check fn is sufficient */
         result = my_md->check_fn(&gatherv);
      TRACE_ERR("bitmask: %#X\n", result.bitmask);
      result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */
      if(result.bitmask)
      {
         if(unlikely(verbose))
            fprintf(stderr,"Query failed for %s. Using MPICH gatherv.\n", my_md->name);
         MPIDI_Update_last_algorithm(comm_ptr, "GATHERV_MPICH");
         return MPIR_Gatherv(sendbuf, sendcount, sendtype,
                             recvbuf, recvcounts, displs, recvtype,
                             root, comm_ptr, mpierrno);
      }
      if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests))) 
      { 
         comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
         int tmpmpierrno;   
         if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", my_md->name);
         MPIDO_Barrier(comm_ptr, &tmpmpierrno);
      }
   }
   
   MPIDI_Update_last_algorithm(comm_ptr, my_md->name);

   if(unlikely(verbose))
   {
      unsigned long long int threadID;
      MPL_thread_id_t tid;
      MPL_thread_self(&tid);
      threadID = (unsigned long long int)tid;
      fprintf(stderr,"<%llx> Using protocol %s for gatherv on %u\n", 
              threadID,
              my_md->name,
              (unsigned) comm_ptr->context_id);
   }

   MPIDI_Post_coll_t gatherv_post;
   MPIDI_Context_post(MPIDI_Context[0], &gatherv_post.state,
                      MPIDI_Pami_post_wrapper, (void *)&gatherv);
   
   TRACE_ERR("Waiting on active %d\n", gatherv_active);
   MPID_PROGRESS_WAIT_WHILE(gatherv_active);

   TRACE_ERR("Leaving MPIDO_Gatherv\n");
   return 0;
}
Exemple #19
0
int MPIDO_Gatherv_simple(const void *sendbuf, 
                  int sendcount, 
                  MPI_Datatype sendtype,
                  void *recvbuf, 
                  const int *recvcounts, 
                  const int *displs, 
                  MPI_Datatype recvtype,
                  int root, 
                  MPID_Comm * comm_ptr, 
                  int *mpierrno)

{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requries support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
   TRACE_ERR("Entering MPIDO_Gatherv_optimized\n");
   int snd_contig = 1, rcv_contig = 1;
   void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
   void *sbuf = NULL, *rbuf = NULL;
   int  *rcounts = NULL;
   int  *rdispls = NULL;
   int send_size = 0;
   int recv_size = 0;
   int rcvlen    = 0;
  int totalrecvcount  = 0;
   pami_type_t rtype = PAMI_TYPE_NULL;
   MPID_Segment segment;
   MPID_Datatype *data_ptr = NULL;
   int send_true_lb, recv_true_lb = 0;
   int i, tmp;
   volatile unsigned gatherv_active = 1;
   const int rank = comm_ptr->rank;
   const int size = comm_ptr->local_size;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
    const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif

   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
  int recvok=PAMI_SUCCESS, recvcontinuous=0;

   if(sendbuf != MPI_IN_PLACE)
   {
     MPIDI_Datatype_get_info(sendcount, sendtype, snd_contig,
                            send_size, data_ptr, send_true_lb);
    if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
    {
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_GATHERV_INT, 64, advisor_algorithms, 1);
      if(num_algorithms)
      {
        if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
        {
          return MPIR_Gatherv(sendbuf, sendcount, sendtype,
                              recvbuf, recvcounts, displs, recvtype,
                              root, comm_ptr, mpierrno);
        }
        else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
        {
          comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
          int tmpmpierrno;
          if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
          MPIDO_Barrier(comm_ptr, &tmpmpierrno);
        }
      }
    }

    sbuf = (char *)sendbuf + send_true_lb;
    if(!snd_contig)
    {
      snd_noncontig_buff = MPL_malloc(send_size);
      sbuf = snd_noncontig_buff;
      if(snd_noncontig_buff == NULL)
      {
        MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                   "Fatal:  Cannot allocate pack buffer");
      }
      DLOOP_Offset last = send_size;
      MPID_Segment_init(sendbuf, sendcount, sendtype, &segment, 0);
      MPID_Segment_pack(&segment, 0, &last, snd_noncontig_buff);
    }
  }
  else
  {
    MPIDI_Datatype_get_info(1, recvtype, rcv_contig,
                            rcvlen, data_ptr, recv_true_lb);
    if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
    {
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_GATHERV_INT, 64, advisor_algorithms, 1);
      if(num_algorithms)
      {
        if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
        {
          return MPIR_Gatherv(sendbuf, sendcount, sendtype,
                              recvbuf, recvcounts, displs, recvtype,
                              root, comm_ptr, mpierrno);
        }
        else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
        {
          comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
          int tmpmpierrno;
          if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
          MPIDO_Barrier(comm_ptr, &tmpmpierrno);
        }
      }
    }
  }

   pami_xfer_t gatherv;
   rbuf = (char *)recvbuf + recv_true_lb;
   rcounts = (int*)recvcounts;
   rdispls = (int*)displs;
   if(rank == root)
   {
    if((recvok = MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp)) != MPI_SUCCESS)
      {
        MPIDI_Datatype_get_info(1, recvtype, rcv_contig,
                                rcvlen, data_ptr, recv_true_lb);
      totalrecvcount = recvcounts[0];
      recvcontinuous = displs[0] == 0? 1 : 0 ;
          rcounts = (int*)MPL_malloc(size);
          rdispls = (int*)MPL_malloc(size);
      rdispls[0] = 0;
      rcounts[0] = rcvlen * recvcounts[0];
      for(i = 1; i < size; i++)
      {
        rdispls[i]= rcvlen * totalrecvcount;
        totalrecvcount += recvcounts[i];
        if(displs[i] != (displs[i-1] + recvcounts[i-1]))
          recvcontinuous = 0;
            rcounts[i] = rcvlen * recvcounts[i];
          }
      recv_size = rcvlen * totalrecvcount;

          rcv_noncontig_buff = MPL_malloc(recv_size);
          rbuf = rcv_noncontig_buff;
          rtype = PAMI_TYPE_BYTE;
          if(rcv_noncontig_buff == NULL)
          {
             MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                "Fatal:  Cannot allocate pack buffer");
          }
      if(sendbuf == MPI_IN_PLACE)
      {
        size_t extent;
        MPID_Datatype_get_extent_macro(recvtype,extent);
        MPIR_Localcopy(recvbuf + displs[rank]*extent, recvcounts[rank], recvtype,
                     rcv_noncontig_buff + rdispls[rank], rcounts[rank],MPI_CHAR);
      }
    }
    if(sendbuf == MPI_IN_PLACE)
    {
      gatherv.cmd.xfer_gatherv_int.sndbuf = PAMI_IN_PLACE;
    }
    else
    {
      gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;
    }
    gatherv.cmd.xfer_gatherv_int.stype = PAMI_TYPE_BYTE;/* stype is ignored when sndbuf == PAMI_IN_PLACE */
    gatherv.cmd.xfer_gatherv_int.stypecount = send_size;

  }
  else
  {
    gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;
    gatherv.cmd.xfer_gatherv_int.stype = PAMI_TYPE_BYTE;
    gatherv.cmd.xfer_gatherv_int.stypecount = send_size;     
  }


  gatherv.cb_done = cb_gatherv;
  gatherv.cookie = (void *)&gatherv_active;
  gatherv.cmd.xfer_gatherv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
  gatherv.cmd.xfer_gatherv_int.rcvbuf = rbuf;
  gatherv.cmd.xfer_gatherv_int.rtype = rtype;
  gatherv.cmd.xfer_gatherv_int.rtypecounts = (int *) rcounts;
  gatherv.cmd.xfer_gatherv_int.rdispls = (int *) rdispls;


  const pami_metadata_t *my_gatherv_md;

  gatherv.algorithm = mpid->coll_algorithm[PAMI_XFER_GATHERV_INT][0][0];
  my_gatherv_md = &mpid->coll_metadata[PAMI_XFER_GATHERV_INT][0][0];

  MPIDI_Update_last_algorithm(comm_ptr, my_gatherv_md->name);

  MPIDI_Post_coll_t gatherv_post;
  TRACE_ERR("%s gatherv\n", MPIDI_Process.context_post.active>0?"Posting":"Invoking");
  MPIDI_Context_post(MPIDI_Context[0], &gatherv_post.state,
                     MPIDI_Pami_post_wrapper, (void *)&gatherv);
  TRACE_ERR("Gatherv %s\n", MPIDI_Process.context_post.active>0?"posted":"invoked");

  TRACE_ERR("Waiting on active %d\n", gatherv_active);
  MPID_PROGRESS_WAIT_WHILE(gatherv_active);

  if(!rcv_contig || recvok != PAMI_SUCCESS)
  {
    if(recvcontinuous)
   {
      MPIR_Localcopy(rcv_noncontig_buff, recv_size, MPI_CHAR,
                     recvbuf,   totalrecvcount,     recvtype);
    }
    else
    {
      size_t extent;
      MPID_Datatype_get_extent_macro(recvtype,extent);
      for(i=0; i<size; ++i)
      {
        char* scbuf = (char*)rcv_noncontig_buff+ rdispls[i];
        char* rcbuf = (char*)recvbuf + displs[i]*extent;
        MPIR_Localcopy(scbuf, rcounts[i], MPI_CHAR,
                       rcbuf, recvcounts[i], recvtype);
        TRACE_ERR("Pack recv src  extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                  (size_t)extent, (size_t)i,(size_t)precvdispls[i],(size_t)i,(size_t)precvcounts[i],(size_t)precvdispls[i], *(int*)scbuf);
        TRACE_ERR("Pack recv dest extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                  (size_t)extent, (size_t)i,(size_t)displs[i],(size_t)i,(size_t)recvcounts[i],(size_t)displs[i], *(int*)rcbuf);
      }

    }
      MPL_free(rcv_noncontig_buff);
      if(rank == root)
      {
         MPL_free(rcounts);
         MPL_free(rdispls);
      }
   }
   if(!snd_contig)  MPL_free(snd_noncontig_buff);


   TRACE_ERR("Leaving MPIDO_Gatherv_optimized\n");
   return MPI_SUCCESS;
}
Exemple #20
0
int MPID_Comm_spawn_multiple(int count, char *commands[], char **argvs[], const int maxprocs[],
                             MPIR_Info * info_ptrs[], int root, MPIR_Comm * comm_ptr,
                             MPIR_Comm ** intercomm, int errcodes[])
{
    char port_name[MPI_MAX_PORT_NAME];
    int *info_keyval_sizes = 0, i, mpi_errno = MPI_SUCCESS;
    PMI_keyval_t **info_keyval_vectors = 0, preput_keyval_vector;
    int *pmi_errcodes = 0, pmi_errno = 0;
    int total_num_processes, should_accept = 1;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_COMM_SPAWN_MULTIPLE);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_COMM_SPAWN_MULTIPLE);

    memset(port_name, 0, sizeof(port_name));

    if (comm_ptr->rank == root) {
        total_num_processes = 0;

        for (i = 0; i < count; i++)
            total_num_processes += maxprocs[i];

        pmi_errcodes = (int *) MPL_malloc(sizeof(int) * total_num_processes, MPL_MEM_BUFFER);
        MPIR_ERR_CHKANDJUMP(!pmi_errcodes, mpi_errno, MPI_ERR_OTHER, "**nomem");

        for (i = 0; i < total_num_processes; i++)
            pmi_errcodes[i] = 0;

        mpi_errno = MPID_Open_port(NULL, port_name);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        info_keyval_sizes = (int *) MPL_malloc(count * sizeof(int), MPL_MEM_BUFFER);
        MPIR_ERR_CHKANDJUMP(!info_keyval_sizes, mpi_errno, MPI_ERR_OTHER, "**nomem");
        info_keyval_vectors =
            (PMI_keyval_t **) MPL_malloc(count * sizeof(PMI_keyval_t *), MPL_MEM_BUFFER);
        MPIR_ERR_CHKANDJUMP(!info_keyval_vectors, mpi_errno, MPI_ERR_OTHER, "**nomem");

        if (!info_ptrs)
            for (i = 0; i < count; i++) {
                info_keyval_vectors[i] = 0;
                info_keyval_sizes[i] = 0;
        } else
            for (i = 0; i < count; i++) {
                mpi_errno = mpi_to_pmi_keyvals(info_ptrs[i], &info_keyval_vectors[i],
                                               &info_keyval_sizes[i]);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
            }

        preput_keyval_vector.key = MPIDI_PARENT_PORT_KVSKEY;
        preput_keyval_vector.val = port_name;
        pmi_errno = PMI_Spawn_multiple(count, (const char **)
                                       commands,
                                       (const char ***) argvs,
                                       maxprocs, info_keyval_sizes, (const PMI_keyval_t **)
                                       info_keyval_vectors, 1, &preput_keyval_vector, pmi_errcodes);

        if (pmi_errno != PMI_SUCCESS)
            MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER,
                                 "**pmi_spawn_multiple", "**pmi_spawn_multiple %d", pmi_errno);

        if (errcodes != MPI_ERRCODES_IGNORE) {
            for (i = 0; i < total_num_processes; i++) {
                errcodes[i] = pmi_errcodes[0];
                should_accept = should_accept && errcodes[i];
            }

            should_accept = !should_accept;
        }
    }

    if (errcodes != MPI_ERRCODES_IGNORE) {
        MPIR_Errflag_t errflag = MPIR_ERR_NONE;
        mpi_errno = MPIR_Bcast(&should_accept, 1, MPI_INT, root, comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Bcast(&pmi_errno, 1, MPI_INT, root, comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Bcast(&total_num_processes, 1, MPI_INT, root, comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Bcast(errcodes, total_num_processes, MPI_INT, root, comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
    }

    if (should_accept) {
        mpi_errno = MPID_Comm_accept(port_name, NULL, root, comm_ptr, intercomm);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
    } else {
        if ((pmi_errno == PMI_SUCCESS) && (errcodes[0] != 0)) {
            mpi_errno = MPIR_Comm_create(intercomm);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
        }
    }

    if (comm_ptr->rank == root) {
        mpi_errno = MPID_Close_port(port_name);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
    }

  fn_exit:

    if (info_keyval_vectors) {
        free_pmi_keyvals(info_keyval_vectors, count, info_keyval_sizes);
        MPL_free(info_keyval_vectors);
    }

    MPL_free(info_keyval_sizes);
    MPL_free(pmi_errcodes);

    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_COMM_SPAWN_MULTIPLE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
Exemple #21
0
/* MPIR_Find_local  -- from the list of processes in comm,
 * builds a list of local processes, i.e., processes on this same node.
 *
 * Note that this will not work correctly for spawned or attached
 * processes.
 *
 *  OUT:
 *    local_size_p      - number of processes on this node.
 *    local_rank_p      - rank of this processes among local processes.
 *    local_ranks_p     - (*local_ranks_p)[i]     = the rank in comm
 *                        of the process with local rank i.
 *                        This is of size (*local_size_p).
 *    intranode_table_p - (*intranode_table_p)[i] = the rank in
 *    (optional)          *local_ranks_p of rank i in comm or -1 if not
 *                        applicable.  It is of size comm->remote_size.
 *                        No return if NULL is specified.
 */
int MPIR_Find_local(MPIR_Comm * comm, int *local_size_p, int *local_rank_p,
                    int **local_ranks_p, int **intranode_table_p)
{
    int mpi_errno = MPI_SUCCESS;
    int i, local_size, local_rank;
    int *local_ranks = NULL, *intranode_table = NULL;
    int node_id = -1, my_node_id = -1;

    MPIR_CHKPMEM_DECL(2);

    /* local_ranks will be realloc'ed later to the appropriate size (currently unknown) */
    /* FIXME: realloc doesn't guarantee that the allocated area will be
     * shrunk - so using realloc is not an appropriate strategy. */
    MPIR_CHKPMEM_MALLOC(local_ranks, int *, sizeof(int) * comm->remote_size, mpi_errno,
                        "local_ranks", MPL_MEM_COMM);
    MPIR_CHKPMEM_MALLOC(intranode_table, int *, sizeof(int) * comm->remote_size, mpi_errno,
                        "intranode_table", MPL_MEM_COMM);

    for (i = 0; i < comm->remote_size; ++i)
        intranode_table[i] = -1;

    mpi_errno = MPID_Get_node_id(comm, comm->rank, &my_node_id);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_Assert(my_node_id >= 0);

    local_size = 0;
    local_rank = -1;

    /* Scan through the list of processes in comm. */
    for (i = 0; i < comm->remote_size; ++i) {
        mpi_errno = MPID_Get_node_id(comm, i, &node_id);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* The upper level can catch this non-fatal error and should be
         * able to recover gracefully. */
        MPIR_ERR_CHKANDJUMP(node_id < 0, mpi_errno, MPI_ERR_OTHER, "**dynamic_node_ids");

        /* build list of local processes */
        if (node_id == my_node_id) {
            if (i == comm->rank)
                local_rank = local_size;

            intranode_table[i] = local_size;
            local_ranks[local_size] = i;
            ++local_size;
        }
    }

#ifdef ENABLE_DEBUG
    printf("------------------------------------------------------------------------\n");
    printf("[%d]comm = %p\n", comm->rank, comm);
    printf("[%d]comm->size = %d\n", comm->rank, comm->remote_size);
    printf("[%d]comm->rank = %d\n", comm->rank, comm->rank);
    printf("[%d]local_size = %d\n", comm->rank, local_size);
    printf("[%d]local_rank = %d\n", comm->rank, local_rank);
    printf("[%d]local_ranks = %p\n", comm->rank, local_ranks);
    for (i = 0; i < local_size; ++i)
        printf("[%d]  local_ranks[%d] = %d\n", comm->rank, i, local_ranks[i]);
    printf("[%d]intranode_table = %p\n", comm->rank, intranode_table);
    for (i = 0; i < comm->remote_size; ++i)
        printf("[%d]  intranode_table[%d] = %d\n", comm->rank, i, intranode_table[i]);
#endif

    MPIR_CHKPMEM_COMMIT();

    *local_size_p = local_size;
    *local_rank_p = local_rank;

    *local_ranks_p = MPL_realloc(local_ranks, sizeof(int) * local_size, MPL_MEM_COMM);
    MPIR_ERR_CHKANDJUMP(*local_ranks_p == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem2");

    if (intranode_table_p)
        *intranode_table_p = intranode_table;   /* no need to realloc */
    else
        MPL_free(intranode_table);      /* free internally if caller passes NULL */

  fn_exit:
    return mpi_errno;
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}
Exemple #22
0
static
int InitPscomConnections(pscom_socket_t *socket) {
	char key[50];
	unsigned long guard_pmi_key = MAGIC_PMI_KEY;
	int i;
	int mpi_errno = MPI_SUCCESS;

	int pg_rank = MPIDI_Process.my_pg_rank;
	int pg_size = MPIDI_Process.my_pg_size;
	char *pg_id = MPIDI_Process.pg_id_name;
	char *listen_socket;
	char **psp_port = NULL;

	/* Distribute my contact information */
	snprintf(key, sizeof(key), "pscom%d", pg_rank);

	listen_socket = MPL_strdup(pscom_listen_socket_ondemand_str(socket));
	PMICALL(PMI_KVS_Put(pg_id, key, listen_socket));

#define IPSCOM_VERSION "pscom_v5.0"
	i_version_set(pg_id, pg_rank, IPSCOM_VERSION);

	PMICALL(PMI_KVS_Commit(pg_id));

	PMICALL(PMI_Barrier());

	i_version_check(pg_id, pg_rank, IPSCOM_VERSION);

	init_grank_port_mapping();

	/* Get portlist */
	psp_port = MPL_malloc(pg_size * sizeof(*psp_port), MPL_MEM_OBJECT);
	assert(psp_port);

	for (i = 0; i < pg_size; i++) {
		char val[100];
		unsigned long guard_pmi_value = MAGIC_PMI_VALUE;

		if (i != pg_rank) {
			snprintf(key, sizeof(key), "pscom%d", i);
			checked_PMI_KVS_Get(pg_id, key, val, sizeof(val));
			/* simple_pmi.c has a bug.(fixed in mpich2-1.0.5)
			   Test for the bugfix: */
			assert(guard_pmi_value == MAGIC_PMI_VALUE);
			assert(guard_pmi_key == MAGIC_PMI_KEY);
		} else {
			/* myself: Dont use PMI_KVS_Get, because this fail
			   in the case of no pm (SINGLETON_INIT_BUT_NO_PM) */
			strcpy(val, listen_socket);
		}

		psp_port[i] = MPL_strdup(val);
	}

	/* Create all connections */
	for (i = 0; i < pg_size; i++) {
		pscom_connection_t *con;
		pscom_err_t rc;
		const char *dest;

		dest = psp_port[i];

		con = pscom_open_connection(socket);
		rc = pscom_connect_socket_str(con, dest);

		if (rc != PSCOM_SUCCESS) {
			PRINTERROR("Connecting %s to %s (rank %d to %d) failed : %s",
				   listen_socket, dest, pg_rank, i, pscom_err_str(rc));
			goto fn_fail;
		}

		grank2con_set(i, con);
	}

	pscom_stop_listen(socket);
 fn_exit:
	if (psp_port) {
		for (i = 0; i < pg_size; i++) {
			MPL_free(psp_port[i]);
			psp_port[i] = NULL;
		}
		MPL_free(psp_port);
	}

	MPL_free(listen_socket);
	return mpi_errno;
	/* --- */
 fn_fail:
	mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL,
					 "InitPscomConnections", __LINE__, MPI_ERR_OTHER, "**connfailed", 0);
	goto fn_exit;
}
Exemple #23
0
int MPID_Win_free(MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int in_use;
    MPIR_Comm *comm_ptr;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_WIN_FREE);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_WIN_FREE);

    MPIR_ERR_CHKANDJUMP(((*win_ptr)->states.access_state != MPIDI_RMA_NONE &&
                         (*win_ptr)->states.access_state != MPIDI_RMA_FENCE_ISSUED &&
                         (*win_ptr)->states.access_state != MPIDI_RMA_FENCE_GRANTED) ||
                        ((*win_ptr)->states.exposure_state != MPIDI_RMA_NONE),
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    /* 1. Here we must wait until all passive locks are released on this target,
     * because for some UNLOCK messages, we do not send ACK back to origin,
     * we must wait until lock is released so that we can free window.
     * 2. We also need to wait until AT completion counter being zero, because
     * this counter is increment everytime we meet a GET-like operation, it is
     * possible that when target entering Win_free, passive epoch is not finished
     * yet and there are still GETs doing on this target.
     * 3. We also need to wait until lock queue becomes empty. It is possible
     * that some lock requests is still waiting in the queue when target is
     * entering Win_free. */
    while ((*win_ptr)->current_lock_type != MPID_LOCK_NONE ||
           (*win_ptr)->at_completion_counter != 0 ||
           (*win_ptr)->target_lock_queue_head != NULL ||
           (*win_ptr)->current_target_lock_data_bytes != 0 || (*win_ptr)->sync_request_cnt != 0) {
        mpi_errno = wait_progress_engine();
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);
    }

    mpi_errno = MPID_Barrier((*win_ptr)->comm_ptr, &errflag);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    /* Free window resources in lower layer. */
    if (MPIDI_CH3U_Win_hooks.win_free != NULL) {
        mpi_errno = MPIDI_CH3U_Win_hooks.win_free(win_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);
    }

    /* dequeue window from the global list */
    MPIR_Assert((*win_ptr)->active == FALSE);
    DL_DELETE(MPIDI_RMA_Win_inactive_list_head, (*win_ptr));

    if (MPIDI_RMA_Win_inactive_list_head == NULL && MPIDI_RMA_Win_active_list_head == NULL) {
        /* this is the last window, de-register RMA progress hook */
        mpi_errno = MPID_Progress_deregister_hook(MPIDI_CH3I_RMA_Progress_hook_id);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
    }

    comm_ptr = (*win_ptr)->comm_ptr;
    mpi_errno = MPIR_Comm_free_impl(comm_ptr);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    if ((*win_ptr)->basic_info_table != NULL)
        MPL_free((*win_ptr)->basic_info_table);
    MPL_free((*win_ptr)->op_pool_start);
    MPL_free((*win_ptr)->target_pool_start);
    MPL_free((*win_ptr)->slots);
    MPL_free((*win_ptr)->target_lock_entry_pool_start);

    MPIR_Assert((*win_ptr)->current_target_lock_data_bytes == 0);

    /* Free the attached buffer for windows created with MPI_Win_allocate() */
    if ((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE ||
        (*win_ptr)->create_flavor == MPI_WIN_FLAVOR_SHARED) {
        if ((*win_ptr)->shm_allocated == FALSE && (*win_ptr)->size > 0) {
            MPL_free((*win_ptr)->base);
        }
    }

    MPIR_Object_release_ref(*win_ptr, &in_use);
    /* MPI windows don't have reference count semantics, so this should always be true */
    MPIR_Assert(!in_use);
    MPIR_Handle_obj_free(&MPIR_Win_mem, *win_ptr);

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_WIN_FREE);
    return mpi_errno;

  fn_fail:
    goto fn_exit;
}
Exemple #24
0
/* *slen is the length of the string, including the null terminator.  So if the
   resulting string is |foo\0bar\0|, then *slen == 8. */
static int connToStringKVS( char **buf_p, int *slen, MPIDI_PG_t *pg )
{
    char *string = 0;
    char *pg_idStr = (char *)pg->id;      /* In the PMI/KVS space,
					     the pg id is a string */
    char   buf[MPIDI_MAX_KVS_VALUE_LEN];
    int    i, j, rc, mpi_errno = MPI_SUCCESS, len;
    size_t vallen, curSlen;

    /* Make an initial allocation of a string with an estimate of the
       needed space */
    len = 0;
    curSlen = 10 + pg->size * 128;
    string = (char *)MPL_malloc( curSlen );

    /* Start with the id of the pg */
    while (*pg_idStr && len < curSlen) 
	string[len++] = *pg_idStr++;
    string[len++] = 0;
    
    /* Add the size of the pg */
    MPL_snprintf( &string[len], curSlen - len, "%d", pg->size );
    while (string[len]) len++;
    len++;

    for (i=0; i<pg->size; i++) {
	rc = getConnInfoKVS( i, buf, MPIDI_MAX_KVS_VALUE_LEN, pg );
	if (rc) {
	    MPL_internal_error_printf( 
		    "Panic: getConnInfoKVS failed for %s (rc=%d)\n", 
		    (char *)pg->id, rc );
	}
#ifndef USE_PERSISTENT_SHARED_MEMORY
	/* FIXME: This is a hack to avoid including shared-memory 
	   queue names in the business card that may be used
	   by processes that were not part of the same COMM_WORLD. 
	   To fix this, the shared memory channels should look at the
	   returned connection info and decide whether to use 
	   sockets or shared memory by determining whether the
	   process is in the same MPI_COMM_WORLD. */
	/* FIXME: The more general problem is that the connection information
	   needs to include some information on the range of validity (e.g.,
	   all processes, same comm world, particular ranks), and that
	   representation needs to be scalable */
/*	printf( "Adding key %s value %s\n", key, val ); */
	{
	char *p = strstr( buf, "$shm_host" );
	if (p) p[1] = 0;
	/*	    printf( "(fixed) Adding key %s value %s\n", key, val ); */
	}
#endif
	/* Add the information to the output buffer */
	vallen = strlen(buf);
	/* Check that this will fix in the remaining space */
	if (len + vallen + 1 >= curSlen) {
	    char *nstring = 0;
            curSlen += (pg->size - i) * (vallen + 1 );
	    nstring = MPL_realloc( string, curSlen );
	    if (!nstring) {
		MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**nomem");
	    }
	    string = nstring;
	}
	/* Append to string */
	for (j=0; j<vallen+1; j++) {
	    string[len++] = buf[j];
	}
    }

    MPIR_Assert(len <= curSlen);

    *buf_p = string;
    *slen  = len;
 fn_exit:
    return mpi_errno;
 fn_fail:
    if (string) MPL_free(string);
    goto fn_exit;
}
Exemple #25
0
/**
 * \brief MPID buffer copy
 *
 * Implements non-contiguous buffers correctly.
 *
 * \param[in]  sbuf       The address of the input buffer
 * \param[in]  scount     The number of elements in that buffer
 * \param[in]  sdt        The datatype of those elements
 * \param[out] smpi_errno Returns errors
 * \param[in]  rbuf       The address of the output buffer
 * \param[out] rcount     The number of elements in that buffer
 * \param[in]  rdt        The datatype of those elements
 * \param[out] rsz        The size of the ouput data
 * \param[out] rmpi_errno Returns errors
 */
void MPIDI_Buffer_copy(
    const void * const sbuf, MPI_Aint scount, MPI_Datatype sdt,                       int * smpi_errno,
          void * const rbuf, MPI_Aint rcount, MPI_Datatype rdt, MPIDI_msg_sz_t * rsz, int * rmpi_errno)
{
    int sdt_contig;
    int rdt_contig;
    MPI_Aint sdt_true_lb, rdt_true_lb;
    MPIDI_msg_sz_t sdata_sz;
    MPIDI_msg_sz_t rdata_sz;
    MPID_Datatype * sdt_ptr;
    MPID_Datatype * rdt_ptr;

    MPI_Aint  sdt_extent;
    MPI_Aint  rdt_extent;

    *smpi_errno = MPI_SUCCESS;
    *rmpi_errno = MPI_SUCCESS;

    /* printf("bufcopy: src count=%d dt=%d\n", scount, sdt); */
    /* printf("bufcopy: dst count=%d dt=%d\n", rcount, rdt); */

    MPIDI_Datatype_get_info(scount, sdt, sdt_contig, sdata_sz, sdt_ptr, sdt_true_lb);
    MPIDI_Datatype_get_info(rcount, rdt, rdt_contig, rdata_sz, rdt_ptr, rdt_true_lb);

    /* --BEGIN ERROR HANDLING-- */
    if (sdata_sz > rdata_sz)
    {
        *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TRUNCATE, "**truncate", "**truncate %d %d", sdata_sz, rdata_sz );
        sdata_sz = rdata_sz;
    }
    /* --END ERROR HANDLING-- */

    if (sdata_sz == 0)
    {
        *rsz = 0;
        goto fn_exit;
    }

    if (sdt_contig && rdt_contig)
    {
#if CUDA_AWARE_SUPPORT
      if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rbuf))
      {
        cudaError_t cudaerr = CudaMemcpy(rbuf + rdt_true_lb, sbuf + sdt_true_lb, sdata_sz, cudaMemcpyHostToDevice);
      }
      else
#endif
        memcpy((char*)rbuf + rdt_true_lb, (const char *)sbuf + sdt_true_lb, sdata_sz);
        *rsz = sdata_sz;
    }
    else if (sdt_contig)
    {
#if CUDA_AWARE_SUPPORT
      // This will need to be done in two steps:
      // 1 - Allocate a temp buffer which is the same size as user buffer and unpack in it.
      // 2 - Copy unpacked data into user buffer from temp buffer.
      if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rbuf))
      {
        MPID_Datatype_get_extent_macro(rdt, rdt_extent);
        char *buf =  MPL_malloc(rdt_extent * rcount);
        memset(buf, 0, rdt_extent * rcount);        
        MPID_Segment seg;
        DLOOP_Offset last;

        MPID_Segment_init(buf, rcount, rdt, &seg, 0);
        last = sdata_sz;
        MPID_Segment_unpack(&seg, 0, &last, (char*)sbuf + sdt_true_lb);
        /* --BEGIN ERROR HANDLING-- */
        if (last != sdata_sz)
        {
            *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0);
        }
        /* --END ERROR HANDLING-- */

       *rsz = last;

        
        cudaError_t cudaerr = CudaMemcpy(rbuf + rdt_true_lb, buf, rdt_extent * rcount, cudaMemcpyHostToDevice);

        MPL_free(buf);

        goto fn_exit;

      }
#endif

        MPID_Segment seg;
        DLOOP_Offset last;

        MPID_Segment_init(rbuf, rcount, rdt, &seg, 0);
        last = sdata_sz;
        MPID_Segment_unpack(&seg, 0, &last, (char*)sbuf + sdt_true_lb);
        /* --BEGIN ERROR HANDLING-- */
        if (last != sdata_sz)
        {
            *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0);
        }
        /* --END ERROR HANDLING-- */

        *rsz = last;
    }
    else if (rdt_contig)
    {
        MPID_Segment seg;
        DLOOP_Offset last;

        MPID_Segment_init(sbuf, scount, sdt, &seg, 0);
        last = sdata_sz;
        MPID_Segment_pack(&seg, 0, &last, (char*)rbuf + rdt_true_lb);
        /* --BEGIN ERROR HANDLING-- */
        if (last != sdata_sz)
        {
            *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0);
        }
        /* --END ERROR HANDLING-- */

        *rsz = last;
    }
    else
    {
        char * buf;
        MPIDI_msg_sz_t buf_off;
        MPID_Segment sseg;
        MPIDI_msg_sz_t sfirst;
        MPID_Segment rseg;
        MPIDI_msg_sz_t rfirst;

        buf = MPL_malloc(MPIDI_COPY_BUFFER_SZ);
        /* --BEGIN ERROR HANDLING-- */
        if (buf == NULL)
        {
            *smpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, __FUNCTION__, __LINE__, MPI_ERR_OTHER, "**nomem", 0);
            *rmpi_errno = *smpi_errno;
            *rsz = 0;
            goto fn_exit;
        }
        /* --END ERROR HANDLING-- */

        MPID_Segment_init(sbuf, scount, sdt, &sseg, 0);
        MPID_Segment_init(rbuf, rcount, rdt, &rseg, 0);

        sfirst = 0;
        rfirst = 0;
        buf_off = 0;

        for(;;)
        {
            DLOOP_Offset last;
            char * buf_end;

            if (sdata_sz - sfirst > MPIDI_COPY_BUFFER_SZ - buf_off)
            {
                last = sfirst + (MPIDI_COPY_BUFFER_SZ - buf_off);
            }
            else
            {
                last = sdata_sz;
            }

            MPID_Segment_pack(&sseg, sfirst, &last, buf + buf_off);
            /* --BEGIN ERROR HANDLING-- */
            MPID_assert(last > sfirst);
            /* --END ERROR HANDLING-- */

            buf_end = buf + buf_off + (last - sfirst);
            sfirst = last;

            MPID_Segment_unpack(&rseg, rfirst, &last, buf);
            /* --BEGIN ERROR HANDLING-- */
            MPID_assert(last > rfirst);
            /* --END ERROR HANDLING-- */

            rfirst = last;

            if (rfirst == sdata_sz)
            {
                /* successful completion */
                break;
            }

            /* --BEGIN ERROR HANDLING-- */
            if (sfirst == sdata_sz)
            {
                /* datatype mismatch -- remaining bytes could not be unpacked */
                *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0);
                break;
            }
            /* --END ERROR HANDLING-- */

            buf_off = sfirst - rfirst;
            if (buf_off > 0)
            {
                memmove(buf, buf_end - buf_off, buf_off);
            }
        }

        *rsz = rfirst;
        MPL_free(buf);
    }

  fn_exit:
    return;
}
int MPIDI_Comm_spawn_multiple(int count, char **commands,
                                  char ***argvs, const int *maxprocs,
                                  MPIR_Info **info_ptrs, int root,
                                  MPIR_Comm *comm_ptr, MPIR_Comm
                                  **intercomm, int *errcodes) 
{
    char port_name[MPI_MAX_PORT_NAME];
    int *info_keyval_sizes=0, i, mpi_errno=MPI_SUCCESS;
    PMI_keyval_t **info_keyval_vectors=0, preput_keyval_vector;
    int *pmi_errcodes = 0, pmi_errno;
    int total_num_processes, should_accept = 1;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_COMM_SPAWN_MULTIPLE);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_COMM_SPAWN_MULTIPLE);


    if (comm_ptr->rank == root) {
	/* create an array for the pmi error codes */
	total_num_processes = 0;
	for (i=0; i<count; i++) {
	    total_num_processes += maxprocs[i];
	}
	pmi_errcodes = (int*)MPL_malloc(sizeof(int) * total_num_processes);
	if (pmi_errcodes == NULL) {
	    MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**nomem");
	}

	/* initialize them to 0 */
	for (i=0; i<total_num_processes; i++)
	    pmi_errcodes[i] = 0;

	/* Open a port for the spawned processes to connect to */
	/* FIXME: info may be needed for port name */
        mpi_errno = MPID_Open_port(NULL, port_name);
	/* --BEGIN ERROR HANDLING-- */
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
	/* --END ERROR HANDLING-- */

	/* Spawn the processes */
#ifdef USE_PMI2_API
        MPIR_Assert(count > 0);
        {
            int *argcs = MPL_malloc(count*sizeof(int));
            struct MPIR_Info preput;
            struct MPIR_Info *preput_p[1] = { &preput };

            MPIR_Assert(argcs);
            /*
            info_keyval_sizes = MPL_malloc(count * sizeof(int));
            */

            /* FIXME cheating on constness */
            preput.key = (char *)PARENT_PORT_KVSKEY;
            preput.value = port_name;
            preput.next = NULL;

            /* compute argcs array */
            for (i = 0; i < count; ++i) {
                argcs[i] = 0;
                if (argvs != NULL && argvs[i] != NULL) {
                    while (argvs[i][argcs[i]]) {
                        ++argcs[i];
                    }
                }

                /* a fib for now */
                /*
                info_keyval_sizes[i] = 0;
                */
            }
            /* XXX DJG don't need this, PMI API is thread-safe? */
            /*MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_PMI_MUTEX);*/
            /* release the global CS for spawn PMI calls */
            MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
            pmi_errno = PMI2_Job_Spawn(count, (const char **)commands,
                                       argcs, (const char ***)argvs,
                                       maxprocs,
                                       info_keyval_sizes, (const MPIR_Info **)info_ptrs,
                                       1, (const struct MPIR_Info **)preput_p,
                                       NULL, 0,
                                       /*jobId, jobIdSize,*/ /* XXX DJG job stuff? */
                                       pmi_errcodes);
            MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
            /*MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_PMI_MUTEX);*/
            MPL_free(argcs);
            if (pmi_errno != PMI2_SUCCESS) {
                MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER,
                     "**pmi_spawn_multiple", "**pmi_spawn_multiple %d", pmi_errno);
            }
        }
#else
        /* FIXME: This is *really* awkward.  We should either
           Fix on MPI-style info data structures for PMI (avoid unnecessary
           duplication) or add an MPIU_Info_getall(...) that creates
           the necessary arrays of key/value pairs */

        /* convert the infos into PMI keyvals */
        info_keyval_sizes   = (int *) MPL_malloc(count * sizeof(int));
        info_keyval_vectors = 
            (PMI_keyval_t**) MPL_malloc(count * sizeof(PMI_keyval_t*));
        if (!info_keyval_sizes || !info_keyval_vectors) { 
            MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**nomem");
        }

        if (!info_ptrs) {
            for (i=0; i<count; i++) {
                info_keyval_vectors[i] = 0;
                info_keyval_sizes[i]   = 0;
            }
        }
        else {
            for (i=0; i<count; i++) {
                mpi_errno = mpi_to_pmi_keyvals( info_ptrs[i], 
                                                &info_keyval_vectors[i],
                                                &info_keyval_sizes[i] );
                if (mpi_errno) { MPIR_ERR_POP(mpi_errno); }
            }
        }

        preput_keyval_vector.key = PARENT_PORT_KVSKEY;
        preput_keyval_vector.val = port_name;


        MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_PMI_MUTEX);
        pmi_errno = PMI_Spawn_multiple(count, (const char **)
                                       commands, 
                                       (const char ***) argvs,
                                       maxprocs, info_keyval_sizes,
                                       (const PMI_keyval_t **)
                                       info_keyval_vectors, 1, 
                                       &preput_keyval_vector,
                                       pmi_errcodes);
	MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_PMI_MUTEX);
        if (pmi_errno != PMI_SUCCESS) {
	    MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER,
		 "**pmi_spawn_multiple", "**pmi_spawn_multiple %d", pmi_errno);
        }
#endif

	if (errcodes != MPI_ERRCODES_IGNORE) {
	    for (i=0; i<total_num_processes; i++) {
		/* FIXME: translate the pmi error codes here */
		errcodes[i] = pmi_errcodes[i];
                /* We want to accept if any of the spawns succeeded.
                   Alternatively, this is the same as we want to NOT accept if
                   all of them failed.  should_accept = NAND(e_0, ..., e_n)
                   Remember, success equals false (0). */
                should_accept = should_accept && errcodes[i];
	    }
            should_accept = !should_accept; /* the `N' in NAND */
	}
    }

    if (errcodes != MPI_ERRCODES_IGNORE) {
        MPIR_Errflag_t errflag = MPIR_ERR_NONE;
        mpi_errno = MPIR_Bcast_impl(&should_accept, 1, MPI_INT, root, comm_ptr, &errflag);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Bcast_impl(&total_num_processes, 1, MPI_INT, root, comm_ptr, &errflag);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        
        mpi_errno = MPIR_Bcast_impl(errcodes, total_num_processes, MPI_INT, root, comm_ptr, &errflag);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
    }

    if (should_accept) {
        mpi_errno = MPID_Comm_accept(port_name, NULL, root, comm_ptr, intercomm); 
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    }
    else {
        MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**pmi_spawn_multiple");
    }

    if (comm_ptr->rank == root) {
	/* Close the port opened for the spawned processes to connect to */
	mpi_errno = MPID_Close_port(port_name);
	/* --BEGIN ERROR HANDLING-- */
	if (mpi_errno != MPI_SUCCESS)
	{
	    MPIR_ERR_POP(mpi_errno);
	}
	/* --END ERROR HANDLING-- */
    }

 fn_exit:
    if (info_keyval_vectors) {
	free_pmi_keyvals(info_keyval_vectors, count, info_keyval_sizes);
	MPL_free(info_keyval_sizes);
	MPL_free(info_keyval_vectors);
    }
    if (pmi_errcodes) {
	MPL_free(pmi_errcodes);
    }
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_COMM_SPAWN_MULTIPLE);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
Exemple #27
0
static
int InitPortConnections(pscom_socket_t *socket) {
	char key[50];
	unsigned long guard_pmi_key = MAGIC_PMI_KEY;
	int i;
	int mpi_errno = MPI_SUCCESS;

	int pg_rank = MPIDI_Process.my_pg_rank;
	int pg_size = MPIDI_Process.my_pg_size;
	char *pg_id = MPIDI_Process.pg_id_name;
	char *listen_socket;
	char **psp_port = NULL;

	/* Distribute my contact information */
	snprintf(key, sizeof(key), "psp%d", pg_rank);

	listen_socket = MPL_strdup(pscom_listen_socket_str(socket));
	PMICALL(PMI_KVS_Put(pg_id, key, listen_socket));

#define INIT_VERSION "ps_v5.0"
	i_version_set(pg_id, pg_rank, INIT_VERSION);
	PMICALL(PMI_KVS_Commit(pg_id));

	PMICALL(PMI_Barrier());

	i_version_check(pg_id, pg_rank, INIT_VERSION);

	init_grank_port_mapping();

	/* Get portlist */
	psp_port = MPL_malloc(pg_size * sizeof(*psp_port), MPL_MEM_OBJECT);
	assert(psp_port);

	for (i = 0; i < pg_size; i++) {
		char val[100];
		unsigned long guard_pmi_value = MAGIC_PMI_VALUE;

		if (i != pg_rank) {
			snprintf(key, sizeof(key), "psp%d", i);
			checked_PMI_KVS_Get(pg_id, key, val, sizeof(val));
			/* simple_pmi.c has a bug.(fixed in mpich2-1.0.5)
			   Test for the bugfix: */
			assert(guard_pmi_value == MAGIC_PMI_VALUE);
			assert(guard_pmi_key == MAGIC_PMI_KEY);
		} else {
			/* myself: Dont use PMI_KVS_Get, because this fail
			   in the case of no pm (SINGLETON_INIT_BUT_NO_PM) */
			strcpy(val, listen_socket);
		}

		psp_port[i] = MPL_strdup(val);
	}

	/* connect ranks pg_rank..(pg_rank + pg_size/2) */
	for (i = 0; i <= pg_size / 2; i++) {
		int dest = (pg_rank + i) % pg_size;
		int src = (pg_rank + pg_size - i) % pg_size;

		if (!i || (pg_rank / i) % 2) {
			/* connect, accept */
			if (do_connect(socket, pg_rank, dest, psp_port[dest])) goto fn_fail;
			if (!i || src != dest) {
				do_wait(pg_rank, src);
			}
		} else {
			/* accept, connect */
			do_wait(pg_rank, src);
			if (src != dest) {
				if (do_connect(socket, pg_rank, dest, psp_port[dest])) goto fn_fail;
			}
		}

	}

	/* Wait for all connections: (already done?) */
	for (i = 0; i < pg_size; i++) {
		while (!grank2con_get(i)) {
			pscom_wait_any();
		}
	}

	/* ToDo: */
	pscom_stop_listen(socket);

 fn_exit:
	if (psp_port) {
		for (i = 0; i < pg_size; i++) {
			MPL_free(psp_port[i]);
			psp_port[i] = NULL;
		}
		MPL_free(psp_port);
	}

	MPL_free(listen_socket);
	return mpi_errno;
	/* --- */
 fn_fail:
	mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL,
					 "InitPortConnections", __LINE__, MPI_ERR_OTHER, "**connfailed", 0);
	goto fn_exit;
}
Exemple #28
0
HYD_status proxy_process_pmi_cb(int fd, HYD_dmx_event_t events, void *userp)
{
    int recvd, closed, count;
    char pmi_stash[PMI_STASH_SIZE] = { 0 };
    int i;
    char *tmp_cmd;
    char *pmi_cmd;
    const char *delim = NULL;
    char *args[MAX_PMI_ARGS];
    struct proxy_pmi_handle *h;
    struct proxy_kv_hash *pmi_args = NULL;
    struct proxy_kv_hash *hash, *tmp;
    HYD_status status = HYD_SUCCESS;

    HYD_FUNC_ENTER();

    /* read enough to see if it is a "cmd=" or "mcmd=" */
    count = 0;
    status =
        HYD_sock_read(fd, pmi_stash, strlen("mcmd="), &recvd, &closed,
                      HYD_SOCK_COMM_TYPE__BLOCKING);
    HYD_ERR_POP(status, "error reading PMI cmd\n");
    count += recvd;

    if (closed) {
        status = HYD_dmx_deregister_fd(fd);
        close(fd);
        goto fn_exit;
    }

    /* see if more data is available */
    status =
        HYD_sock_read(fd, pmi_stash + count, PMI_STASH_SIZE - count, &recvd, &closed,
                      HYD_SOCK_COMM_TYPE__NONBLOCKING);
    HYD_ERR_POP(status, "error reading PMI cmd\n");
    HYD_ASSERT(!closed, status);
    count += recvd;

    /* keep reading till we have at least one full PMI command (in
     * PMI-2 we can get multiple commands before we respond to the
     * first command) */
    while (1) {
        /* parse the string to see if it is a PMI-1 or a PMI-2
         * command */
        if (!strncmp(pmi_stash, "cmd=", strlen("cmd="))) {
            if (pmi_stash[count - 1] != '\n') {
                /* if we reached the end of the buffer we read and did
                 * not yet get a full comamnd, wait till we get at
                 * least one more byte */
                status =
                    HYD_sock_read(fd, pmi_stash + count, 1, &recvd, &closed,
                                  HYD_SOCK_COMM_TYPE__BLOCKING);
                HYD_ERR_POP(status, "error reading PMI cmd\n");
                HYD_ASSERT(!closed, status);
                count += recvd;

                /* see if more data is available */
                status =
                    HYD_sock_read(fd, pmi_stash + count, PMI_STASH_SIZE - count, &recvd, &closed,
                                  HYD_SOCK_COMM_TYPE__NONBLOCKING);
                HYD_ERR_POP(status, "error reading PMI cmd\n");
                HYD_ASSERT(!closed, status);
                count += recvd;

                continue;
            }

            /* overwrite the newline character */
            pmi_stash[count - 1] = 0;
            delim = " ";
        } else if (!strncmp(pmi_stash, "mcmd=", strlen("mcmd="))) {
            if (count < strlen("endcmd") ||
                strncmp(&pmi_stash[count - strlen("endcmd")], "endcmd", strlen("endcmd"))) {
                /* if we reached the end of the buffer we read and did
                 * not yet get a full comamnd, wait till we get at
                 * least one more byte */
                status =
                    HYD_sock_read(fd, pmi_stash + count, 1, &recvd, &closed,
                                  HYD_SOCK_COMM_TYPE__BLOCKING);
                HYD_ERR_POP(status, "error reading PMI cmd\n");
                HYD_ASSERT(!closed, status);
                count += recvd;

                /* see if more data is available */
                status =
                    HYD_sock_read(fd, pmi_stash + count, PMI_STASH_SIZE - count, &recvd, &closed,
                                  HYD_SOCK_COMM_TYPE__NONBLOCKING);
                HYD_ERR_POP(status, "error reading PMI cmd\n");
                HYD_ASSERT(!closed, status);
                count += recvd;

                continue;
            }

            /* remove the last "endcmd" and the newline before that */
            pmi_stash[count - strlen("endcmd") - 1] = 0;
            delim = "\n";
        }

        if (proxy_params.debug)
            HYD_PRINT(stdout, "pmi cmd from fd %d: %s\n", fd, pmi_stash);

        tmp_cmd = strtok(pmi_stash, delim);
        for (i = 0;; i++) {
            args[i] = strtok(NULL, delim);
            if (args[i] == NULL)
                break;
        }

        pmi_cmd = strtok(tmp_cmd, "=");
        pmi_cmd = strtok(NULL, "=");

        /* split out the args into tokens */
        for (i = 0; args[i]; i++) {
            HYD_MALLOC(hash, struct proxy_kv_hash *, sizeof(struct proxy_kv_hash), status);
            hash->key = MPL_strdup(strtok(args[i], "="));
            hash->val = MPL_strdup(strtok(NULL, "="));
            HASH_ADD_STR(pmi_args, key, hash, MPL_MEM_PM);
        }

        /* try to find the handler and run it */
        h = proxy_pmi_handlers;
        while (h->handler) {
            if (!strcmp(pmi_cmd, h->cmd)) {
                status = h->handler(fd, pmi_args);
                HYD_ERR_POP(status, "PMI handler returned error\n");
                break;
            }
            h++;
        }

        if (h->handler == NULL) {
            /* we don't understand the command; forward it upstream */
            HYD_ERR_SETANDJUMP(status, HYD_ERR_INTERNAL, "unknown PMI command: %s\n", pmi_cmd);
        }

        HASH_ITER(hh, pmi_args, hash, tmp) {
            HASH_DEL(pmi_args, hash);
            MPL_free(hash->key);
            MPL_free(hash->val);
            MPL_free(hash);
        }

        break;
    }
Exemple #29
0
int MPID_nem_mxm_vc_init(MPIDI_VC_t * vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
    MPID_nem_mxm_vc_area *vc_area = VC_BASE(vc);

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MXM_VC_INIT);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MXM_VC_INIT);

    /* local connection is used for any source communication */
    MPIR_Assert(MPID_nem_mem_region.rank != vc->lpid);
    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CHANNEL, VERBOSE,
                     (MPL_DBG_FDEST,
                      "[%i]=== connecting  to  %i  \n", MPID_nem_mem_region.rank, vc->lpid));
    {
        char *business_card;
        int val_max_sz;
#ifdef USE_PMI2_API
        val_max_sz = PMI2_MAX_VALLEN;
#else
        mpi_errno = PMI_KVS_Get_value_length_max(&val_max_sz);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
#endif

        business_card = (char *) MPL_malloc(val_max_sz, MPL_MEM_ADDRESS);
        mpi_errno = vc->pg->getConnInfo(vc->pg_rank, business_card, val_max_sz, vc->pg);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        vc_area->ctx = vc;
        vc_area->mxm_ep = &_mxm_obj.endpoint[vc->pg_rank];
        mpi_errno = _mxm_connect(&_mxm_obj.endpoint[vc->pg_rank], business_card, vc_area);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        MPL_free(business_card);
    }

    MPIDI_CHANGE_VC_STATE(vc, ACTIVE);

    vc_area->pending_sends = 0;

    /* Use default rendezvous functions */
    vc->eager_max_msg_sz = _mxm_eager_threshold();
    vc->ready_eager_max_msg_sz = vc->eager_max_msg_sz;
    vc->rndvSend_fn = MPID_nem_lmt_RndvSend;
    vc->rndvRecv_fn = MPID_nem_lmt_RndvRecv;

    vc->sendNoncontig_fn = MPID_nem_mxm_SendNoncontig;
    vc->comm_ops = &comm_ops;

    vc_ch->iStartContigMsg = MPID_nem_mxm_iStartContigMsg;
    vc_ch->iSendContig = MPID_nem_mxm_iSendContig;

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MXM_VC_INIT);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
Exemple #30
0
/* MPIR_Find_external -- from the list of processes in comm,
 * builds a list of external processes, i.e., one process from each node.
 * You can think of this as the root or master process for each node.
 *
 * Note that this will not work correctly for spawned or attached
 * processes.
 *
 *  OUT:
 *    external_size_p   - number of external processes
 *    external_rank_p   - rank of this process among the external
 *                        processes, or -1 if this process is not external
 *    external_ranks_p  - (*external_ranks_p)[i]   = the rank in comm
 *                        of the process with external rank i.
 *                        This is of size (*external_size_p)
 *    internode_table_p - (*internode_table_p)[i]  = the rank in
 *    (optional)          *external_ranks_p of the root of the node
 *                        containing rank i in comm.  It is of size
 *                        comm->remote_size. No return if NULL is specified.
 */
int MPIR_Find_external(MPIR_Comm * comm, int *external_size_p, int *external_rank_p,
                       int **external_ranks_p, int **internode_table_p)
{
    int mpi_errno = MPI_SUCCESS;
    int *nodes;
    int i, external_size, external_rank;
    int *external_ranks, *internode_table;
    int max_node_id, node_id;

    MPIR_CHKLMEM_DECL(1);
    MPIR_CHKPMEM_DECL(2);

    /* Scan through the list of processes in comm and add one
     * process from each node to the list of "external" processes.  We
     * add the first process we find from each node.  nodes[] is an
     * array where we keep track of whether we have already added that
     * node to the list. */

    /* external_ranks will be realloc'ed later to the appropriate size (currently unknown) */
    /* FIXME: realloc doesn't guarantee that the allocated area will be
     * shrunk - so using realloc is not an appropriate strategy. */
    MPIR_CHKPMEM_MALLOC(external_ranks, int *, sizeof(int) * comm->remote_size, mpi_errno,
                        "external_ranks", MPL_MEM_COMM);
    MPIR_CHKPMEM_MALLOC(internode_table, int *, sizeof(int) * comm->remote_size, mpi_errno,
                        "internode_table", MPL_MEM_COMM);

    mpi_errno = MPID_Get_max_node_id(comm, &max_node_id);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_Assert(max_node_id >= 0);
    MPIR_CHKLMEM_MALLOC(nodes, int *, sizeof(int) * (max_node_id + 1), mpi_errno, "nodes",
                        MPL_MEM_COMM);

    /* nodes maps node_id to rank in external_ranks of leader for that node */
    for (i = 0; i < (max_node_id + 1); ++i)
        nodes[i] = -1;

    external_size = 0;
    external_rank = -1;

    for (i = 0; i < comm->remote_size; ++i) {
        mpi_errno = MPID_Get_node_id(comm, i, &node_id);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* The upper level can catch this non-fatal error and should be
         * able to recover gracefully. */
        MPIR_ERR_CHKANDJUMP(node_id < 0, mpi_errno, MPI_ERR_OTHER, "**dynamic_node_ids");

        MPIR_Assert(node_id <= max_node_id);

        /* build list of external processes */
        if (nodes[node_id] == -1) {
            if (i == comm->rank)
                external_rank = external_size;
            nodes[node_id] = external_size;
            external_ranks[external_size] = i;
            ++external_size;
        }

        /* build the map from rank in comm to rank in external_ranks */
        internode_table[i] = nodes[node_id];
    }

#ifdef ENABLE_DEBUG
    printf("------------------------------------------------------------------------\n");
    printf("[%d]comm = %p\n", comm->rank, comm);
    printf("[%d]comm->size = %d\n", comm->rank, comm->remote_size);
    printf("[%d]comm->rank = %d\n", comm->rank, comm->rank);
    printf("[%d]external_size = %d\n", comm->rank, external_size);
    printf("[%d]external_rank = %d\n", comm->rank, external_rank);
    printf("[%d]external_ranks = %p\n", comm->rank, external_ranks);
    for (i = 0; i < external_size; ++i)
        printf("[%d]  external_ranks[%d] = %d\n", comm->rank, i, external_ranks[i]);
    printf("[%d]internode_table = %p\n", comm->rank, internode_table);
    for (i = 0; i < comm->remote_size; ++i)
        printf("[%d]  internode_table[%d] = %d\n", comm->rank, i, internode_table[i]);
    printf("[%d]nodes = %p\n", comm->rank, nodes);
    for (i = 0; i < (max_node_id + 1); ++i)
        printf("[%d]  nodes[%d] = %d\n", comm->rank, i, nodes[i]);
#endif

    MPIR_CHKPMEM_COMMIT();

    *external_size_p = external_size;
    *external_rank_p = external_rank;
    *external_ranks_p = MPL_realloc(external_ranks, sizeof(int) * external_size, MPL_MEM_COMM);
    MPIR_ERR_CHKANDJUMP(*external_ranks_p == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem2");

    if (internode_table_p)
        *internode_table_p = internode_table;   /* no need to realloc */
    else
        MPL_free(internode_table);      /* free internally if caller passes NULL */

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    return mpi_errno;
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}