Example #1
/* initialize Cartesian topology info in comm. If ndims==0, dims and periods
   are not referenced */
static void tMPI_Cart_init(tMPI_Comm *comm_cart, int ndims, int *dims, 
                           int *periods)
{
    int newrank=-1;
    int i;

    if (*comm_cart)
    {
        tMPI_Comm_rank(*comm_cart, &newrank);
    }

    if (newrank==0)
    {
        (*comm_cart)->cart=(struct cart_topol*)tMPI_Malloc(
                                            sizeof(struct cart_topol));
        (*comm_cart)->cart->dims=(int*)tMPI_Malloc(ndims*sizeof(int));
        (*comm_cart)->cart->periods=(int*)tMPI_Malloc(ndims*sizeof(int));
        (*comm_cart)->cart->ndims=ndims;
        for(i=0;i<ndims;i++)
        {
            (*comm_cart)->cart->dims[i]=dims[i];
            (*comm_cart)->cart->periods[i]=periods[i];
        }
    }

    /* and we add a barrier to make sure the cart object is seen by 
       every thread that is part of the new communicator */
    if (*comm_cart)
    {
        tMPI_Barrier_wait( &( (*comm_cart)->barrier) );
    }
}
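
For context, a minimal call-site sketch (not from this listing) of how a tMPI_Cart_create-style wrapper might drive this initializer; the 2x2 periodic grid and the example_ helper name are illustrative assumptions:

/* illustrative sketch: every member thread makes this call, since
   tMPI_Cart_init ends in a barrier on the new communicator */
static void example_cart_init_call(tMPI_Comm comm_cart)
{
    int dims[2]    = { 2, 2 };  /* hypothetical 2x2 process grid */
    int periods[2] = { 1, 1 };  /* periodic in both dimensions */

    tMPI_Cart_init(&comm_cart, 2, dims, periods);
}
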
Example #2

int tMPI_Thread_create_aff(tMPI_Thread_t *thread,
                           void *(*start_routine)(void *), void *arg)
{
    DWORD thread_id;
    struct tMPI_Thread_starter_param *prm;

    tMPI_Init_initers();
    tMPI_Set_main_thread_affinity();

    /* a small memory leak to be sure that it doesn't get deallocated 
       once this function ends, before the newly created thread uses it. */
    prm=(struct tMPI_Thread_starter_param*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_starter_param));
    prm->start_routine= start_routine;
    prm->param=arg;

    if(thread==NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Invalid thread pointer.");
        return EINVAL;
    }

    *thread=(struct tMPI_Thread*)tMPI_Malloc(sizeof(struct tMPI_Thread)*1);

    if( g_ulHighestNumaNodeNumber != 0 )
    {
        /* if running on a NUMA system, use the group and NUMA aware thread 
           creation logic */
        (*thread)->th = tMPI_Thread_create_NUMA(NULL,
                                                0,
                                                tMPI_Win32_thread_starter,
                                                prm,
                                                0, 
                                                &thread_id);
    } else {
        /* TODO: for now, non-NUMA systems don't set thread affinity. */
        (*thread)->th = CreateThread(NULL,
                                     0,
                                     tMPI_Win32_thread_starter,
                                     prm,
                                     0, 
                                     &thread_id);
    }

    if((*thread)->th==NULL)
    {
        tMPI_Free(*thread);
        tMPI_Fatal_error(TMPI_FARGS,"Failed to create thread, error code=%d",
                         GetLastError());
        return -1;
    }

    /* inherit the thread priority from the parent thread. */
    /* TODO: is there value in setting this, vs. just allowing it to default
       from the process?  currently, this limits the effectiveness of changing
       the priority in e.g. Task Manager. */
    SetThreadPriority(((*thread)->th), GetThreadPriority(GetCurrentThread()));

    return 0;
}
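
A minimal usage sketch, assuming the usual thread_mpi join API (tMPI_Thread_join); the worker function and helper names are placeholders:

static void *example_worker(void *arg)
{
    /* ... thread body ... */
    return NULL;
}

static void example_create_aff(void)
{
    tMPI_Thread_t th;

    if (tMPI_Thread_create_aff(&th, example_worker, NULL) != 0)
    {
        /* creation failed; tMPI_Thread_create_aff already reported it */
        return;
    }
    tMPI_Thread_join(th, NULL);
}
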
Example #3
int tMPI_Cart_sub(tMPI_Comm comm, int *remain_dims, tMPI_Comm *newcomm)
{
    int myrank;
    int ndims=0;
    int *dims=NULL;
    int *periods=NULL;
    int *oldcoords=NULL;
    int i;
    int ndims_notused=1;
    int color_notused=0;

#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Cart_sub(%p, %p, %p)", comm, remain_dims, newcomm);
#endif
    tMPI_Comm_rank(comm, &myrank);
    if ( comm->cart )
    {
        oldcoords=(int*)tMPI_Malloc(sizeof(int)*comm->cart->ndims);
        dims=(int*)tMPI_Malloc(sizeof(int)*comm->cart->ndims);
        periods=(int*)tMPI_Malloc(sizeof(int)*comm->cart->ndims);

        /* get old coordinates */
        tMPI_Cart_coords(comm, myrank, comm->cart->ndims, oldcoords);

        for(i=0;i<comm->cart->ndims;i++)
        {
            if (remain_dims[i])
            {
                /* for the remaining dimensions, copy dimensionality data */
                dims[ndims]=comm->cart->dims[i];
                periods[ndims]=comm->cart->periods[i];
                ndims++;
            }
            else
            {
                /* base the color on the unused coordinates; we keep an
                   ndims_notused index multiplier. */
                color_notused += oldcoords[i]*ndims_notused;
                ndims_notused *= comm->cart->dims[i];
            }
        }
    }

    /* key=myrank, because we want the order to remain the same */
    tMPI_Comm_split(comm, color_notused, myrank, newcomm);
    tMPI_Cart_init(newcomm, ndims, dims, periods);

    if (oldcoords)
        free(oldcoords);
    if (dims)
        free(dims);
    if (periods)
        free(periods);

    return TMPI_SUCCESS;
}
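
A usage sketch, assuming a previously created 2D Cartesian communicator; cart2d and the helper name are illustrative:

static void example_cart_sub(tMPI_Comm cart2d)
{
    tMPI_Comm rowcomm;
    int       remain_dims[2] = { 0, 1 }; /* drop dimension 0, keep dimension 1 */

    /* threads that share a coordinate in the dropped dimension land in the
       same sub-communicator, keeping their original rank order (key=myrank) */
    tMPI_Cart_sub(cart2d, remain_dims, &rowcomm);
}
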
Example #4
void tMPI_Coll_envt_init(struct coll_env_thread *met, int N)
{
    tMPI_Atomic_set(&(met->current_sync), 0);
    tMPI_Atomic_set(&(met->n_remaining), 0);
    met->buf=(void**)tMPI_Malloc(sizeof(void*)*N);
    met->bufsize=(size_t*)tMPI_Malloc(sizeof(size_t)*N);
    met->read_data=(tmpi_bool*)tMPI_Malloc(sizeof(tmpi_bool)*N);
#ifdef USE_COLLECTIVE_COPY_BUFFER
    met->cpbuf=(tMPI_Atomic_ptr_t*)tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*N);
    met->cb=NULL;
    met->using_cb=FALSE;
#endif
    tMPI_Event_init( &(met->send_ev) );
    tMPI_Event_init( &(met->recv_ev) );
}
Example #5
int tMPI_Thread_barrier_init(tMPI_Thread_barrier_t *barrier, int n)
{
    if(barrier==NULL)
    {
        return EINVAL;
    }

    barrier->barrierp=(struct tMPI_Thread_barrier*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_barrier)*1);

#if 0
    /* use this once Vista is the oldest supported Windows version: */
    InitializeCriticalSection(&(barrier->barrierp->cs));
    InitializeConditionVariable(&(barrier->barrierp->cv));
#else
    tMPI_Thread_mutex_init(&(barrier->barrierp->cs));
    tMPI_Thread_cond_init(&(barrier->barrierp->cv));
#endif

    barrier->threshold = n;
    barrier->count     = n;
    barrier->cycle     = 0;

    return 0;
}
Example #6
static int tMPI_Thread_barrier_init_once(tMPI_Thread_barrier_t *barrier)
{
    int ret=0;

    /* we're relying on the memory barrier semantics of mutex_lock/unlock
       for the check preceding this function call to have worked */
    pthread_mutex_lock( &(barrier_init) );    
    if(barrier->barrierp==NULL)
    {
        barrier->barrierp=(struct tMPI_Thread_barrier*)
                  tMPI_Malloc(sizeof(struct tMPI_Thread_barrier)*1);
        ret = pthread_mutex_init(&(barrier->barrierp->mutex),NULL);

        if(ret!=0)
        {
            tMPI_Fatal_error(TMPI_FARGS,"Error initializing POSIX mutex. rc=%d",
                             ret);
            pthread_mutex_unlock( &(barrier_init) );
            return ret;
        }

        ret = pthread_cond_init(&(barrier->barrierp->cv),NULL);

        if(ret!=0)
        {
            tMPI_Fatal_error(TMPI_FARGS,
                             "Error initializing POSIX condition variable. rc=%d",
                             ret);
            pthread_mutex_unlock( &(barrier_init) );
            return ret;
        }
    }
    pthread_mutex_unlock( &(barrier_init) );    
    return ret;
}
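
The comment about mutex memory-barrier semantics refers to a call-site pattern along these lines (a sketch; the helper name is hypothetical): the fast-path NULL check runs unlocked, and the function above re-checks under barrier_init:

static int example_barrier_ensure_init(tMPI_Thread_barrier_t *barrier)
{
    /* unlocked fast path; tMPI_Thread_barrier_init_once repeats the check
       while holding barrier_init, so at most one thread initializes */
    if (barrier->barrierp == NULL)
    {
        return tMPI_Thread_barrier_init_once(barrier);
    }
    return 0;
}
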
Example #7
int tMPI_Thread_key_create(tMPI_Thread_key_t *key, void (*destructor)(void *))
{
    if(key==NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Invalid key pointer.");
        return EINVAL;
    }


    /* TODO: make list of destructors for thread-local storage */
    key->key=(struct tMPI_Thread_key*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_key)*1);
 
    (key)->key->wkey=TlsAlloc();

    if ( (key)->key->wkey == TLS_OUT_OF_INDEXES ) 
    {
        tMPI_Fatal_error(TMPI_FARGS,
                         "Failed to create thread key, error code=%d.",
                         GetLastError());
        return -1;
    }

    return 0;
}
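
A minimal usage sketch of the key API, assuming the companion tMPI_Thread_setspecific/tMPI_Thread_getspecific calls (used elsewhere in this listing); names are illustrative:

static tMPI_Thread_key_t example_key;

static void example_key_usage(void)
{
    int *p;

    tMPI_Thread_key_create(&example_key, NULL); /* no destructor here */

    p  = (int*)tMPI_Malloc(sizeof(int));
    *p = 42;
    tMPI_Thread_setspecific(example_key, p);

    /* each thread gets back the value it stored */
    p = (int*)tMPI_Thread_getspecific(example_key);
}
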
Example #8
int tMPI_Type_contiguous(int count, tMPI_Datatype oldtype,
                         tMPI_Datatype *newtype)
{
    struct tmpi_datatype_ *ntp;

#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Type_contiguous(%d, %p, %p)", count, oldtype,
                     newtype);
#endif
    ntp               = (struct tmpi_datatype_*)tMPI_Malloc(sizeof(struct tmpi_datatype_));
    ntp->size         = count*oldtype->size;
    ntp->op_functions = NULL;

    /* establish components */
    ntp->N_comp = 1;
    ntp->comps  = (struct tmpi_datatype_component*)tMPI_Malloc(
                sizeof(struct tmpi_datatype_component)*1);
    ntp->comps[0].type  = oldtype;
    ntp->comps[0].count = 1;
    ntp->committed      = FALSE;

    /* now add it to the list.  */
    tMPI_Spinlock_lock(&(tmpi_global->datatype_lock));
    /* check whether there's space */
    if (tmpi_global->N_usertypes + 1 >= tmpi_global->Nalloc_usertypes)
    {
        /* make space */
        tmpi_global->Nalloc_usertypes = Nthreads*(tmpi_global->N_usertypes) + 1;
        tmpi_global->usertypes        = (struct tmpi_datatype_**)
            tMPI_Realloc(tmpi_global->usertypes,
                         (sizeof(struct tmpi_datatype_ *)*
                          tmpi_global->Nalloc_usertypes)
                         );

    }
    /* add to the list */
    tmpi_global->usertypes[tmpi_global->N_usertypes] = ntp;
    tmpi_global->N_usertypes++;
    *newtype = ntp;
    tMPI_Spinlock_unlock(&(tmpi_global->datatype_lock));

    return TMPI_SUCCESS;
}
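
A usage sketch, assuming the standard TMPI_INT built-in type and tMPI_Type_commit (which would flip the committed flag that starts out FALSE above):

static void example_type_contiguous(void)
{
    tMPI_Datatype triple;

    /* a datatype covering three contiguous ints */
    tMPI_Type_contiguous(3, TMPI_INT, &triple);
    tMPI_Type_commit(&triple);
}
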
Example #9
int tMPI_Thread_mutex_init(tMPI_Thread_mutex_t *mtx) 
{
    if(mtx==NULL)
    {
        return EINVAL;
    }

    mtx->mutex=(struct tMPI_Mutex*)tMPI_Malloc(sizeof(struct tMPI_Mutex)*1);
    InitializeCriticalSection(&(mtx->mutex->cs));

    return 0;
}
Example #10
void tMPI_Coll_env_init(struct coll_env *cev, int N)
{
    int i;

    cev->met=(struct coll_env_thread*)tMPI_Malloc(
                        sizeof(struct coll_env_thread)*N);
    cev->N=N;
    tMPI_Atomic_set(&(cev->coll.current_sync), 0);
    tMPI_Atomic_set(&(cev->coll.n_remaining), 0);
    for(i=0;i<N;i++)
    {
        tMPI_Coll_envt_init(&(cev->met[i]), N);
    }
}
Example #11
void tMPI_Coll_sync_init(struct coll_sync *csync, int N)
{
    int i;

    csync->synct=0;
    csync->syncs=0;
    csync->N=N;

    csync->events=(tMPI_Event*)tMPI_Malloc(sizeof(tMPI_Event)*N);
    for(i=0;i<N;i++)
    {
        tMPI_Event_init( &(csync->events[i]) );
    }
}
Example #12
static int tMPI_Thread_mutex_init_once(tMPI_Thread_mutex_t *mtx)
{
    int ret=0;

    /* we're relying on the memory barrier semantics of mutex_lock/unlock
       for the check preceding this function call to have worked */
    pthread_mutex_lock( &(mutex_init) );    
    if(mtx->mutex==NULL)
    {
        mtx->mutex=(struct tMPI_Mutex*)tMPI_Malloc(sizeof(struct tMPI_Mutex)*1);
        ret=pthread_mutex_init( &(mtx->mutex->mtx), NULL);
    }
    pthread_mutex_unlock( &(mutex_init) );    
    return ret;
}
Example #13
static int tMPI_Thread_cond_init_once(tMPI_Thread_cond_t *cond)
{
    int ret=0;

    /* we're relying on the memory barrier semantics of mutex_lock/unlock
       for the check preceding this function call to have worked */
    pthread_mutex_lock( &(cond_init) );    
    if(cond->condp==NULL)
    {
        cond->condp=(struct tMPI_Thread_cond*)
                  tMPI_Malloc(sizeof(struct tMPI_Thread_cond)*1);
        ret=pthread_cond_init( &(cond->condp->cond), NULL);
    }
    pthread_mutex_unlock( &(cond_init) );    
    return ret;
}
Example #14
static void tMPI_Thread_init(struct tmpi_thread *th)
{
    int N_envelopes=(Nthreads+1)*N_EV_ALLOC;  
    int N_send_envelopes=N_EV_ALLOC;  
    int N_reqs=(Nthreads+1)*N_EV_ALLOC;  
    int i;

    /* we set our thread id, as a thread-specific piece of global data. */
    tMPI_Thread_setspecific(id_key, th);

    /* allocate comm.self */
    th->self_comm=tMPI_Comm_alloc(TMPI_COMM_WORLD, 1);
    th->self_comm->grp.peers[0]=th;

    /* allocate envelopes */
    tMPI_Free_env_list_init( &(th->envelopes), N_envelopes );
    /* recv list */
    tMPI_Recv_env_list_init( &(th->evr));
    /* send lists */
    th->evs=(struct send_envelope_list*)tMPI_Malloc(
                        sizeof(struct send_envelope_list)*Nthreads);
    for(i=0;i<Nthreads;i++)
    {
        tMPI_Send_env_list_init( &(th->evs[i]), N_send_envelopes);
    }

    tMPI_Atomic_set( &(th->ev_outgoing_received), 0);

    tMPI_Event_init( &(th->p2p_event) );

    /* allocate requests */
    tMPI_Req_list_init(&(th->rql), N_reqs);

#ifdef USE_COLLECTIVE_COPY_BUFFER
    /* allocate copy_buffer list */
    tMPI_Copy_buffer_list_init(&(th->cbl_multi), (Nthreads+1)*(N_COLL_ENV+1),
                               Nthreads*COPY_BUFFER_SIZE);
#endif

#ifdef TMPI_PROFILE
    tMPI_Profile_init(&(th->profile));
#endif
    /* now wait for all other threads to come on line, before we
       start the MPI program */
    tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );
}
Example #15
void tMPI_Copy_buffer_list_init(struct copy_buffer_list *cbl, int Nbufs,
                                size_t size)
{
    int i;

    cbl->size=size;
    cbl->cb_alloc=(struct copy_buffer*)
                  tMPI_Malloc(sizeof(struct copy_buffer)*Nbufs);
    cbl->cb=cbl->cb_alloc; /* the first one */
    cbl->Nbufs = Nbufs;
    for(i=0;i<Nbufs;i++)
    {
        tMPI_Copy_buffer_init( &(cbl->cb_alloc[i]), size );
        if (i<Nbufs-1)
            cbl->cb_alloc[i].next=&(cbl->cb_alloc[i+1]);
        else
            cbl->cb_alloc[i].next=NULL;
    }
}
Example #16
int tMPI_Thread_cond_init(tMPI_Thread_cond_t *cond) 
{
    int ret;
    
    if(cond==NULL)
    {
        return EINVAL;
    }
   
    cond->condp=(struct tMPI_Thread_cond*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_cond)*1);
    ret = pthread_cond_init(&(cond->condp->cond), NULL);
    
    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Error initializing POSIX condition variable. rc=%d",ret);
        fflush(stderr);
    }
    tMPI_Atomic_set(&(cond->initialized),1);
    return ret;
}
Example #17
int tMPI_Thread_mutex_init(tMPI_Thread_mutex_t *mtx) 
{
    int ret;
  
    if (mtx == NULL)
    {
        return EINVAL;
    }

    mtx->mutex=(struct tMPI_Mutex*)tMPI_Malloc(sizeof(struct tMPI_Mutex)*1);
    ret = pthread_mutex_init(&(mtx->mutex->mtx),NULL);
    
    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Error initializing POSIX mutex. rc=%d");
        /* Use system memory allocation routines */
        return ret;
    }

    tMPI_Atomic_set(&(mtx->initialized), 1);
    return 0;
}
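
A minimal lock/unlock sketch, assuming the usual companion calls (tMPI_Thread_mutex_lock/unlock); the counter and helper names are illustrative:

static tMPI_Thread_mutex_t example_mtx;
static int                 example_counter;

static void example_mutex_usage(void)
{
    tMPI_Thread_mutex_init(&example_mtx); /* call once, before contention */

    tMPI_Thread_mutex_lock(&example_mtx);
    example_counter++;                    /* critical section */
    tMPI_Thread_mutex_unlock(&example_mtx);
}
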
Example #18
int tMPI_Thread_cond_init(tMPI_Thread_cond_t *cond) 
{
    if(cond==NULL)
    {
        return EINVAL;
    }

    cond->condp=(struct tMPI_Thread_cond*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_cond)*1);
#if 0
    /* use this code once Vista is the minimum version required */
    InitializeConditionVariable( &(cond->cv) );
#else
    cond->condp->Nwaiters=0;
    InitializeCriticalSection(&(cond->condp->wtr_lock));
    cond->condp->Nrelease=0;
    cond->condp->cycle=0;
    /* a manual reset, unsignalled event */
    cond->condp->ev = CreateEvent(NULL, TRUE, FALSE, NULL); 
#endif
    return 0;
}
Example #19
int tMPI_Thread_barrier_init(tMPI_Thread_barrier_t *barrier, int n)
{
    int ret;
    /*tMPI_Thread_pthread_barrier_t *p;*/
    
    if(barrier==NULL)
    {
        return EINVAL;
    }
    
    barrier->barrierp=(struct tMPI_Thread_barrier*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_barrier)*1);
    ret = pthread_mutex_init(&(barrier->barrierp->mutex),NULL);
        
    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Error initializing POSIX mutex. rc=%d",
                         ret);
        return ret;
    }
    
    ret = pthread_cond_init(&(barrier->barrierp->cv),NULL);
    
    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,
                         "Error initializing POSIX condition variable. rc=%d",
                         ret);
        return ret;
    }
        
    barrier->threshold = n;
    barrier->count     = n;
    barrier->cycle     = 0;

    tMPI_Atomic_set(&(barrier->initialized), 1);
    return 0;
}
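
A usage sketch: one thread initializes the barrier for n participants before the workers start, and every worker then blocks in tMPI_Thread_barrier_wait until all n have arrived (names are illustrative):

static tMPI_Thread_barrier_t example_barrier;

static void example_barrier_setup(int nthreads)
{
    /* called once, before the worker threads start */
    tMPI_Thread_barrier_init(&example_barrier, nthreads);
}

static void example_barrier_use(void)
{
    /* each worker blocks here until 'threshold' threads have arrived */
    tMPI_Thread_barrier_wait(&example_barrier);
}
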
Example #20
int tMPI_Thread_key_create(tMPI_Thread_key_t *key, void (*destructor)(void *))
{
    int ret;

    if(key==NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Invalid key pointer.");
        return EINVAL;
    }


    key->key=(struct tMPI_Thread_key*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_key)*1);
    ret = pthread_key_create(&((key)->key->pkey), destructor);
    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Failed to create thread key, rc=%d.",ret);
        fflush(stderr);
        return -1;
    }

    tMPI_Atomic_set(&(key->initialized), 1);
    return 0;
}
Example #21
int tMPI_Comm_alloc(tMPI_Comm *newcomm, tMPI_Comm parent, int N)
{
    struct tmpi_comm_ *retc;
    int                i;
    int                ret;

    retc = (struct tmpi_comm_*)tMPI_Malloc(sizeof(struct tmpi_comm_));
    if (retc == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }

    retc->grp.peers = (struct tmpi_thread**)tMPI_Malloc(
                sizeof(struct tmpi_thread*)*Nthreads);
    if (retc->grp.peers == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }
    retc->grp.N = N;

    ret = tMPI_Thread_mutex_init( &(retc->comm_create_lock) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    ret = tMPI_Thread_cond_init( &(retc->comm_create_prep) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    ret = tMPI_Thread_cond_init( &(retc->comm_create_finish) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }

    retc->split    = NULL;
    retc->new_comm = NULL;
    /* we have no topology to start out with */
    retc->cart = NULL;
    /*retc->graph=NULL;*/

    /* we start counting at 0 */
    tMPI_Atomic_set( &(retc->destroy_counter), 0);

    /* initialize the main barrier */
    tMPI_Barrier_init(&(retc->barrier), N);

    /* the reduce barriers */
    {
        /* First calculate the number of reduce barriers */
        int Niter = 0; /* the iteration number */
        int Nred  = N; /* the number of reduce barriers for this iteration */
        while (Nred > 1)
        {
            /* Nred is now Nred/2 + a remainder term, because a solitary
               process at the end of the list must still be accounted for */
            Nred   = Nred/2 + Nred%2;
            Niter += 1;
        }

        retc->N_reduce_iter = Niter;
        /* allocate the list */
        retc->reduce_barrier = (tMPI_Barrier_t**)
            tMPI_Malloc(sizeof(tMPI_Barrier_t*)*(Niter+1));
        if (retc->reduce_barrier == NULL)
        {
            return TMPI_ERR_NO_MEM;
        }
        retc->N_reduce = (int*)tMPI_Malloc(sizeof(int)*(Niter+1));
        if (retc->N_reduce == NULL)
        {
            return TMPI_ERR_NO_MEM;
        }

        /* we re-set Nred to N */
        Nred = N;
        for (i = 0; i < Niter; i++)
        {
            int j;

            Nred              = Nred/2 + Nred%2;
            retc->N_reduce[i] = Nred;
            /* allocate the sub-list */
            retc->reduce_barrier[i] = (tMPI_Barrier_t*)
                tMPI_Malloc(sizeof(tMPI_Barrier_t)*(Nred));
            if (retc->reduce_barrier[i] == NULL)
            {
                return TMPI_ERR_NO_MEM;
            }
            for (j = 0; j < Nred; j++)
            {
                tMPI_Barrier_init(&(retc->reduce_barrier[i][j]), 2);
            }
        }
    }

    /* the reduce buffers */
    retc->reduce_sendbuf = (tMPI_Atomic_ptr_t*)
        tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*Nthreads);
    if (retc->reduce_sendbuf == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }
    retc->reduce_recvbuf = (tMPI_Atomic_ptr_t*)
        tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*Nthreads);
    if (retc->reduce_recvbuf == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }

    if (parent)
    {
        retc->erh = parent->erh;
    }
    else
    {
        retc->erh = TMPI_ERRORS_ARE_FATAL;
    }

    /* coll_env objects */
    retc->cev = (struct coll_env*)tMPI_Malloc(sizeof(struct coll_env)*
                                              N_COLL_ENV);
    if (retc->cev == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }

    for (i = 0; i < N_COLL_ENV; i++)
    {
        ret = tMPI_Coll_env_init( &(retc->cev[i]), N);
        if (ret != TMPI_SUCCESS)
        {
            return ret;
        }
    }
    /* multi_sync objects */
    retc->csync = (struct coll_sync*)tMPI_Malloc(sizeof(struct coll_sync)*N);
    if (retc->csync == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }

    for (i = 0; i < N; i++)
    {
        ret = tMPI_Coll_sync_init( &(retc->csync[i]), N);
        if (ret != TMPI_SUCCESS)
        {
            return ret;
        }
    }

    ret = tMPI_Thread_mutex_lock( &(tmpi_global->comm_link_lock) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    /* we insert ourselves in the circular list, after TMPI_COMM_WORLD */
    if (TMPI_COMM_WORLD)
    {
        retc->next = TMPI_COMM_WORLD;
        retc->prev = TMPI_COMM_WORLD->prev;

        TMPI_COMM_WORLD->prev->next = retc;
        TMPI_COMM_WORLD->prev       = retc;
    }
    else
    {
        retc->prev = retc->next = retc;
    }
    ret = tMPI_Thread_mutex_unlock( &(tmpi_global->comm_link_lock) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    *newcomm = retc;
    return TMPI_SUCCESS;
}
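
The reduce-barrier loop above computes ceil(log2(N)) iterations: the thread count is halved each round, rounding the solitary tail up. A standalone sketch of the same recurrence for N = 5:

#include <stdio.h>

/* same recurrence as in tMPI_Comm_alloc: halve, rounding the tail up */
static int reduce_iterations(int N)
{
    int Niter = 0;
    while (N > 1)
    {
        N = N/2 + N%2;   /* 5 -> 3 -> 2 -> 1 */
        Niter++;
    }
    return Niter;
}

int main(void)
{
    printf("%d\n", reduce_iterations(5)); /* prints 3 */
    return 0;
}
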
Example #22
void tMPI_Start_threads(tmpi_bool main_returns, int N, int *argc, char ***argv, 
                        void (*start_fn)(void*), void *start_arg,
                        int (*start_fn_main)(int, char**))
{
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Start_threads(%d, %p, %p, %p, %p)", N, argc,
                       argv, start_fn, start_arg);
#endif
    if (N>0) 
    {
        int i;
        int set_affinity=FALSE;

        tmpi_finalized=FALSE;
        Nthreads=N;

        /* allocate global data */
        tmpi_global=(struct tmpi_global*)
                        tMPI_Malloc(sizeof(struct tmpi_global));
        tMPI_Global_init(tmpi_global, N);

        /* allocate world and thread data */
        threads=(struct tmpi_thread*)tMPI_Malloc(sizeof(struct tmpi_thread)*N);
        TMPI_COMM_WORLD=tMPI_Comm_alloc(NULL, N);
        TMPI_GROUP_EMPTY=tMPI_Group_alloc();

        if (tMPI_Thread_key_create(&id_key, NULL))
        {
            tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
        }
        for(i=0;i<N;i++)
        {
            TMPI_COMM_WORLD->grp.peers[i]=&(threads[i]);

            /* copy argc, argv */
            if (argc && argv)
            {
                int j;
                threads[i].argc=*argc;
                threads[i].argv=(char**)tMPI_Malloc(threads[i].argc*
                                                   sizeof(char*));
                for(j=0;j<threads[i].argc;j++)
                {
#if ! (defined( _WIN32 ) || defined( _WIN64 ) )
                    threads[i].argv[j]=strdup( (*argv)[j] );
#else
                    threads[i].argv[j]=_strdup( (*argv)[j] );
#endif
                }
            }
            else
            {
                threads[i].argc=0;
                threads[i].argv=NULL;
            }
            threads[i].start_fn=start_fn;
            threads[i].start_fn_main=start_fn_main;
            threads[i].start_arg=start_arg;
        }

        /* now check whether to set affinity */
#ifdef TMPI_THREAD_AFFINITY
        {
            int nhw=tMPI_Thread_get_hw_number();
            if ((nhw > 1) && (nhw == N))
            {
                set_affinity=TRUE;
            }
        }
#endif

        for(i=1;i<N;i++) /* zero is the main thread */
        {
            int ret;

            if (set_affinity)
            {
                ret=tMPI_Thread_create_aff(&(threads[i].thread_id), 
                                           tMPI_Thread_starter,
                                           (void*)&(threads[i]) ) ;
            }
            else
            {
                ret=tMPI_Thread_create(&(threads[i].thread_id), 
                                       tMPI_Thread_starter,
                                       (void*)&(threads[i]) ) ;
            }

            if(ret)
            {
                tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
            }
        }
        /* the main thread now also runs start_fn if we don't want
           it to return */
        if (!main_returns)
            tMPI_Thread_starter((void*)&(threads[0]));
        else
            tMPI_Thread_init(&(threads[0]));
    }
}
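
A minimal call sketch, assuming the printf-based hello function below is the user's start_fn; with main_returns set to FALSE the main thread runs it too instead of returning immediately (helper names are illustrative):

#include <stdio.h>

static void example_hello(void *arg)
{
    int rank;
    tMPI_Comm_rank(TMPI_COMM_WORLD, &rank);
    printf("hello from thread %d\n", rank);
}

static void example_start(void)
{
    /* start 4 threads running example_hello; no argc/argv forwarding and
       no separate main-thread entry point in this sketch */
    tMPI_Start_threads(FALSE, 4, NULL, NULL, example_hello, NULL, NULL);
}
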
Example #23
/* initialize a copy buffer */
void tMPI_Copy_buffer_init(struct copy_buffer *cb, size_t size)
{
    cb->buf=tMPI_Malloc(size);
    cb->size=size;
}
Example #24

/*  returns 0 on success.
    Success is returned if the system is non-NUMA, OR the system doesn't 
    support appropriate NUMA APIs, OR the system is NUMA and we successfully 
    initialized support.
    
    returns -1 on error.
    This can happen if an API returned an error, a memory allocation failed, or 
    we failed to initialize affinity mapping information.
*/
int tMPI_Init_NUMA(void)
{
    /* module handle to kernel32.dll -- we already reference it, so it's already loaded */
    HMODULE hModKernel32 = NULL;                    
    /* 0-based NUMA node count -- does not imply all nodes have available (eg: hot-plug) processors */
    ULONG ulHighestNumaNodeNumber;                  
    /* total number of processors available per affinity masks */
    DWORD dwTotalProcessors = 0;                    
    ULONG i = 0;

    /* calling thread PROCESSOR_NUMBER */
    PROCESSOR_NUMBER CurrentProcessorNumber;      
    /* calling thread GROUP_AFFINITY */
    GROUP_AFFINITY CurrentThreadGroupAffinity; 
    /* calling thread NUMA node */
    USHORT CurrentNumaNodeNumber;

    WORD wActiveGroupCount;
    WORD GroupIndex;

    /* array of processor information structures */
    MPI_NUMA_PROCESSOR_INFO *pMPI_ProcessorInfo = NULL; 

    /* assume an error condition */
    int iRet = -1;

    hModKernel32 = GetModuleHandleA("kernel32.dll");

    if( hModKernel32 == NULL )
    {
        return 0;
    }

    /* obtain addresses of relevant NUMA functions, most of which are
       Windows 7 / Windows Server 2008 R2-only functions. This is done
       using GetProcAddress so that the binary can still run on older
       Windows versions.
    */

    func_GetNumaHighestNodeNumber = (func_GetNumaHighestNodeNumber_t) GetProcAddress( hModKernel32, "GetNumaHighestNodeNumber" );

    if( func_GetNumaHighestNodeNumber == NULL )
    {
        return 0;
    }

    /* determine if we're on a NUMA system and if so, determine the number of 
       (potential) nodes */

    if(!func_GetNumaHighestNodeNumber( &ulHighestNumaNodeNumber ))
    {
        return -1;
    }

    if( ulHighestNumaNodeNumber == 0 )
    {
        /* system is not NUMA */
        return 0;
    }

    func_SetThreadGroupAffinity = (func_SetThreadGroupAffinity_t)GetProcAddress( hModKernel32, "SetThreadGroupAffinity" );
    func_SetThreadIdealProcessorEx = (func_SetThreadIdealProcessorEx_t)GetProcAddress( hModKernel32, "SetThreadIdealProcessorEx" );
    func_CreateRemoteThreadEx = (func_CreateRemoteThreadEx_t)GetProcAddress( hModKernel32, "CreateRemoteThreadEx" );
    func_GetNumaNodeProcessorMaskEx = (func_GetNumaNodeProcessorMaskEx_t)GetProcAddress( hModKernel32, "GetNumaNodeProcessorMaskEx" );
    func_GetNumaProcessorNodeEx = (func_GetNumaProcessorNodeEx_t)GetProcAddress( hModKernel32, "GetNumaProcessorNodeEx" );
    func_GetCurrentProcessorNumberEx = (func_GetCurrentProcessorNumberEx_t)GetProcAddress( hModKernel32, "GetCurrentProcessorNumberEx" );
    func_GetActiveProcessorCount = (func_GetActiveProcessorCount_t)GetProcAddress( hModKernel32, "GetActiveProcessorCount" );
    func_GetActiveProcessorGroupCount = (func_GetActiveProcessorGroupCount_t)GetProcAddress( hModKernel32, "GetActiveProcessorGroupCount" );
    func_InitializeProcThreadAttributeList = (func_InitializeProcThreadAttributeList_t)GetProcAddress( hModKernel32, "InitializeProcThreadAttributeList" );
    func_UpdateProcThreadAttribute = (func_UpdateProcThreadAttribute_t)GetProcAddress( hModKernel32, "UpdateProcThreadAttribute" );
    func_DeleteProcThreadAttributeList = (func_DeleteProcThreadAttributeList_t)GetProcAddress( hModKernel32, "DeleteProcThreadAttributeList" );

    if( (func_SetThreadGroupAffinity == NULL) ||
        (func_SetThreadIdealProcessorEx == NULL) ||
        (func_CreateRemoteThreadEx == NULL) ||
        (func_GetNumaNodeProcessorMaskEx == NULL) ||
        (func_GetNumaProcessorNodeEx == NULL) ||
        (func_GetCurrentProcessorNumberEx == NULL) ||
        (func_GetActiveProcessorCount == NULL) ||
        (func_GetActiveProcessorGroupCount == NULL) ||
        (func_InitializeProcThreadAttributeList == NULL) ||
        (func_UpdateProcThreadAttribute == NULL) ||
        (func_DeleteProcThreadAttributeList == NULL) )
    {
        /* if any addresses couldn't be located, assume NUMA functionality 
           isn't supported */
        return 0;
    }

    /* count the active processors across the groups */

    func_GetCurrentProcessorNumberEx(&CurrentProcessorNumber);

    wActiveGroupCount = func_GetActiveProcessorGroupCount();
    
    dwTotalProcessors = func_GetActiveProcessorCount( ALL_PROCESSOR_GROUPS );

#if !(defined(WIN64) || defined(_WIN64))
    /* WOW64 doesn't allow setting the affinity correctly beyond 32
       processors -- the KAFFINITY mask is only 32 bits wide.
       This check is only here for completeness -- large systems should be
       running 64-bit Gromacs code, where the processor quantity is not
       constrained.
       By returning early here (leaving NUMA support disabled), the 32-bit
       WOW64 client falls back to normal CreateThread(), which can schedule
       up to 64 un-affinitized threads.
    */

    if( dwTotalProcessors > 32 )
    {
        return 0;
    }
#endif

    /* allocate array of processor info blocks */

    pMPI_ProcessorInfo = tMPI_Malloc( sizeof(MPI_NUMA_PROCESSOR_INFO) * 
                                      dwTotalProcessors );
    if(pMPI_ProcessorInfo == NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"tMPI_Malloc failed for processor information");
        goto cleanup;
    }

    /* zero fill to cover reserved must-be-zero fields */
    memset(pMPI_ProcessorInfo, 0, sizeof(MPI_NUMA_PROCESSOR_INFO) * dwTotalProcessors);

    /* loop through each processor group, and for each group, capture the 
       processor numbers and NUMA node information. */

    for(GroupIndex = 0 ; GroupIndex < wActiveGroupCount ; GroupIndex++)
    {
        DWORD dwGroupProcessorCount;
        BYTE ProcessorIndex;

        dwGroupProcessorCount = func_GetActiveProcessorCount( GroupIndex );

        for(ProcessorIndex = 0 ; ProcessorIndex < dwGroupProcessorCount ; 
            ProcessorIndex++)
        {
            PROCESSOR_NUMBER *pProcessorNumber = &(pMPI_ProcessorInfo[i].ProcessorNumber);
            GROUP_AFFINITY *pGroupAffinity = &(pMPI_ProcessorInfo[i].GroupAffinity);
            USHORT *pNodeNumber = &(pMPI_ProcessorInfo[i].NumaNodeNumber);

            pProcessorNumber->Group = GroupIndex;
            pProcessorNumber->Number = ProcessorIndex;

            /* save an index to the processor array entry for the current
               processor; this is used so that subsequent threads can be
               created in a round-robin fashion, starting at the next array
               entry
            */

            if( (CurrentProcessorNumber.Group == pProcessorNumber->Group ) &&
                (CurrentProcessorNumber.Number == pProcessorNumber->Number) )
            {
                /* set global: current thread index into processor array */
                g_ulThreadIndex = i;
            }

            /* capture the node number and group affinity associated with
               this processor entry; any failure here is assumed to be
               catastrophic and disables the group & NUMA aware thread
               support
            */

            if(!func_GetNumaProcessorNodeEx(pProcessorNumber, pNodeNumber))
            {
                tMPI_Fatal_error(TMPI_FARGS,
                                 "Processor enumeration, GetNumaProcessorNodeEx failed, error code=%d",
                                 GetLastError());
                goto cleanup;
            }

            if(!func_GetNumaNodeProcessorMaskEx(*pNodeNumber, pGroupAffinity))
            {
                tMPI_Fatal_error(TMPI_FARGS,
                                 "Processor enumeration, GetNumaNodeProcessorMaskEx failed, error code=%d",
                                 GetLastError());
                goto cleanup;
            }

            /* future enhancement: construct GroupAffinity (single) processor 
               mask within NUMA node for this processor entry */

            /* increment processor array index */
            i++;

            /* sanity check, should never happen */

            if(i > dwTotalProcessors)
            {
                tMPI_Fatal_error(TMPI_FARGS,"Processor enumeration exceeds allocated memory!");
                goto cleanup;
            }
        }
    }

#if 0
    /* set the NUMA node affinity for the current thread
       failures to set the current thread affinity are ignored, 
       as a fringe case can arise on >32 processor systems with a 32bit 
       build/code.
    */
    func_SetThreadIdealProcessorEx(GetCurrentThread(), 
                                   &CurrentProcessorNumber, 
                                   NULL);

    if(func_GetNumaProcessorNodeEx(&CurrentProcessorNumber, 
                                   &CurrentNumaNodeNumber))
    {
        /* for the NUMA node number associated with the current processor 
           number, get the group affinity mask */
        if(func_GetNumaNodeProcessorMaskEx(CurrentNumaNodeNumber, 
                                           &CurrentThreadGroupAffinity))
        {
            /* set the current thread affinity to prevent it from running on 
               other NUMA nodes */
            func_SetThreadGroupAffinity(GetCurrentThread(), 
                                        &CurrentThreadGroupAffinity, 
                                        NULL);
        }
    }
#endif
 
    /* capture number of processors, highest NUMA node number, and processor 
       array */
    g_ulTotalProcessors = dwTotalProcessors;
    g_ulHighestNumaNodeNumber = ulHighestNumaNodeNumber;
    g_MPI_ProcessorInfo = pMPI_ProcessorInfo;

    iRet = 0;

#if 0   
    // TODO: debug DISCARD                        
    printf("primary thread tid=%lu group=%lu mask=0x%I64x group=%lu number=%lu ulThreadIndex=%lu\n",
        GetCurrentThreadId(),
        CurrentThreadGroupAffinity.Group,
        (ULONGLONG)CurrentThreadGroupAffinity.Mask,
        (ULONG)CurrentProcessorNumber.Group,
        (ULONG)CurrentProcessorNumber.Number,
        g_ulThreadIndex);
#endif

cleanup:

    if( iRet != 0 )
    {
        if( pMPI_ProcessorInfo )
        {
            tMPI_Free( pMPI_ProcessorInfo );
        }
    }

    return iRet;
}
Example #25

HANDLE tMPI_Thread_create_NUMA(LPSECURITY_ATTRIBUTES lpThreadAttributes,
                               SIZE_T dwStackSize,
                               LPTHREAD_START_ROUTINE lpStartAddress,
                               LPVOID lpParameter,
                               DWORD dwCreationFlags,
                               LPDWORD lpThreadId)
{
    LPPROC_THREAD_ATTRIBUTE_LIST pAttributeList = NULL;
    HANDLE hThread = NULL;
    SIZE_T cbAttributeList = 0;
    GROUP_AFFINITY GroupAffinity;
    PROCESSOR_NUMBER IdealProcessorNumber;
    ULONG CurrentProcessorIndex;

    /* for each thread created, round-robin through the set of valid 
       processors and affinity masks.
       the assumption is that callers of tMPI_Thread_create_NUMA are creating 
       threads that saturate a given processor.
       for cases where threads are being created that rarely do work, standard 
       thread creation (eg: CreateThread) should be invoked instead.
    */

    CurrentProcessorIndex = (ULONG)InterlockedIncrement((volatile LONG *)&g_ulThreadIndex);
    CurrentProcessorIndex = CurrentProcessorIndex % g_ulTotalProcessors;

    /* group, mask. */

    memcpy(&GroupAffinity, 
           &(g_MPI_ProcessorInfo[CurrentProcessorIndex].GroupAffinity), 
           sizeof(GROUP_AFFINITY));

    /* group, processor number */
    
    memcpy(&IdealProcessorNumber, 
           &(g_MPI_ProcessorInfo[CurrentProcessorIndex].ProcessorNumber), 
           sizeof(PROCESSOR_NUMBER)); 

    /* determine size of allocation for AttributeList */

    if(!func_InitializeProcThreadAttributeList(pAttributeList,
                                               2,
                                               0,
                                               &cbAttributeList))
    {
        DWORD dwLastError = GetLastError();
        if( dwLastError != ERROR_INSUFFICIENT_BUFFER )
        {
            tMPI_Fatal_error(TMPI_FARGS,
                             "InitializeProcThreadAttributeList, error code=%d",
                             dwLastError);
            goto cleanup;
        }
    }

    pAttributeList = (LPPROC_THREAD_ATTRIBUTE_LIST)tMPI_Malloc( cbAttributeList );
    if( pAttributeList == NULL )
    {
        tMPI_Fatal_error(TMPI_FARGS,"Failed to allocate pAttributeList");
        goto cleanup;
    }

    memset( pAttributeList, 0, cbAttributeList );

    if(!func_InitializeProcThreadAttributeList(pAttributeList,
                                               2,
                                               0,
                                               &cbAttributeList))
    {
        tMPI_Fatal_error(TMPI_FARGS,
                         "InitializeProcThreadAttributeList, error code=%d",
                         GetLastError());
        goto cleanup;
    }

    if(!func_UpdateProcThreadAttribute(pAttributeList,
                                       0,
                                       PROC_THREAD_ATTRIBUTE_GROUP_AFFINITY,
                                       &GroupAffinity,
                                       sizeof(GroupAffinity),
                                       NULL,
                                       NULL))
    {
        tMPI_Fatal_error(TMPI_FARGS,"UpdateProcThreadAttribute, error code=%d",
                         GetLastError());
        goto cleanup;
    }

    if(!func_UpdateProcThreadAttribute(pAttributeList,
                                       0,
                                       PROC_THREAD_ATTRIBUTE_IDEAL_PROCESSOR,
                                       &IdealProcessorNumber,
                                       sizeof(IdealProcessorNumber),
                                       NULL,
                                       NULL))
    {
        tMPI_Fatal_error(TMPI_FARGS,"UpdateProcThreadAttribute, error code=%d",
                         GetLastError());
        goto cleanup;
    }


    hThread = func_CreateRemoteThreadEx( GetCurrentProcess(),
                                         lpThreadAttributes,
                                         dwStackSize,
                                         lpStartAddress,
                                         lpParameter,
                                         dwCreationFlags,
                                         pAttributeList,
                                         lpThreadId);
            
    func_DeleteProcThreadAttributeList( pAttributeList );

#if 0   
	// TODO: debug only or DISCARD
    if( hThread )
    {
        PROCESSOR_NUMBER ProcNumber;
        USHORT NodeNumber;

        GetThreadIdealProcessorEx(hThread, &ProcNumber);
        GetNumaProcessorNodeEx(&ProcNumber, &NodeNumber);

        printf("started thread tid=%lu group=%lu mask=0x%I64x number=%lu numanode=%lu\n",
            *lpThreadId,
            GroupAffinity.Group,
            (ULONGLONG)GroupAffinity.Mask,
            ProcNumber.Number,
            NodeNumber
            );
    }
#endif

cleanup:
    
    if( pAttributeList )
    {
        tMPI_Free( pAttributeList );
    }

    return hThread;
}
Example #26
/* this is the main comm creation function. All other functions that create
    comms use this*/
int tMPI_Comm_split(tMPI_Comm comm, int color, int key, tMPI_Comm *newcomm)
{
    int                 i, j;
    int                 N = tMPI_Comm_N(comm);
    volatile tMPI_Comm *newcomm_list;
    volatile int        colors[MAX_PREALLOC_THREADS]; /* array with the colors
                                                         of each thread */
    volatile int        keys[MAX_PREALLOC_THREADS];   /* same for keys (only one of
                                                         the threads actually supplies
                                                         these arrays to the comm
                                                         structure) */
    tmpi_bool          i_am_first = FALSE;
    int                myrank     = tMPI_Comm_seek_rank(comm, tMPI_Get_current());
    struct tmpi_split *spl;
    int                ret;

#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Comm_split(%p, %d, %d, %p)", comm, color, key,
                     newcomm);
#endif
    if (!comm)
    {
        *newcomm = NULL;
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }

    ret = tMPI_Thread_mutex_lock(&(comm->comm_create_lock));
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    /* first get the colors */
    if (!comm->new_comm)
    {
        /* I am apparently first */
        comm->split    = (struct tmpi_split*)tMPI_Malloc(sizeof(struct tmpi_split));
        comm->new_comm = (tMPI_Comm*)tMPI_Malloc(N*sizeof(tMPI_Comm));
        if (N <= MAX_PREALLOC_THREADS)
        {
            comm->split->colors = colors;
            comm->split->keys   = keys;
        }
        else
        {
            comm->split->colors = (int*)tMPI_Malloc(N*sizeof(int));
            comm->split->keys   = (int*)tMPI_Malloc(N*sizeof(int));
        }
        comm->split->Ncol_init  = tMPI_Comm_N(comm);
        comm->split->can_finish = FALSE;
        i_am_first              = TRUE;
        /* the main communicator contains a list the size of grp.N */
    }
    newcomm_list        = comm->new_comm; /* we copy it to the local stacks because
                                             we can later erase comm->new_comm safely */
    spl                 = comm->split;    /* we do the same for spl */
    spl->colors[myrank] = color;
    spl->keys[myrank]   = key;
    spl->Ncol_init--;

    if (spl->Ncol_init == 0)
    {
        ret = tMPI_Thread_cond_signal(&(comm->comm_create_prep));
        if (ret != 0)
        {
            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
        }
    }

    if (!i_am_first)
    {
        /* all other threads can just wait until the creator thread is
           finished */
        while (!spl->can_finish)
        {
            ret = tMPI_Thread_cond_wait(&(comm->comm_create_finish),
                                        &(comm->comm_create_lock) );
            if (ret != 0)
            {
                return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
            }
        }
    }
    else
    {
        int        Ncomms = 0;
        int        comm_color_[MAX_PREALLOC_THREADS];
        int        comm_N_[MAX_PREALLOC_THREADS];
        int       *comm_color = comm_color_; /* there can't be more comms than N */
        int       *comm_N     = comm_N_;     /* the number of procs in a group */

        int       *comm_groups;              /* the groups */
        tMPI_Comm *comms;                    /* the communicators */

        /* wait for the colors to be done */
        /*if (N>1)*/
        while (spl->Ncol_init > 0)
        {
            ret = tMPI_Thread_cond_wait(&(comm->comm_create_prep),
                                        &(comm->comm_create_lock));
            if (ret != 0)
            {
                return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
            }
        }

        /* reset the state so that a new comm creating function can run */
        spl->Ncol_destroy = N;
        comm->new_comm    = 0;
        comm->split       = 0;

        comm_groups = (int*)tMPI_Malloc(N*N*sizeof(int));
        if (N > MAX_PREALLOC_THREADS)
        {
            comm_color = (int*)tMPI_Malloc(N*sizeof(int));
            comm_N     = (int*)tMPI_Malloc(N*sizeof(int));
        }

        /* count colors, allocate and split up communicators */
        tMPI_Split_colors(N, (int*)spl->colors,
                          (int*)spl->keys,
                          &Ncomms,
                          comm_N, comm_color, comm_groups);


        /* allocate a bunch of communicators */
        comms = (tMPI_Comm*)tMPI_Malloc(Ncomms*sizeof(tMPI_Comm));
        for (i = 0; i < Ncomms; i++)
        {
            ret = tMPI_Comm_alloc(&(comms[i]), comm, comm_N[i]);
            if (ret != TMPI_SUCCESS)
            {
                return ret;
            }
        }

        /* now distribute the comms */
        for (i = 0; i < Ncomms; i++)
        {
            comms[i]->grp.N = comm_N[i];
            for (j = 0; j < comm_N[i]; j++)
            {
                comms[i]->grp.peers[j] =
                    comm->grp.peers[comm_groups[i*comm->grp.N + j]];
            }
        }
        /* and put them into the newcomm_list */
        for (i = 0; i < N; i++)
        {
            newcomm_list[i] = TMPI_COMM_NULL;
            for (j = 0; j < Ncomms; j++)
            {
                if (spl->colors[i] == comm_color[j])
                {
                    newcomm_list[i] = comms[j];
                    break;
                }
            }
        }

#ifdef TMPI_DEBUG
        /* output */
        for (i = 0; i < Ncomms; i++)
        {
            printf("Group %d (color %d) has %d members: ",
                   i, comm_color[i], comm_N[i]);
            for (j = 0; j < comm_N[i]; j++)
            {
                printf(" %d ", comm_groups[comm->grp.N*i + j]);
            }

            printf(" rank: ");
            for (j = 0; j < comm_N[i]; j++)
            {
                printf(" %d ", spl->keys[comm_groups[N*i + j]]);
            }
            printf(" color: ");
            for (j = 0; j < comm_N[i]; j++)
            {
                printf(" %d ", spl->colors[comm_groups[N*i + j]]);
            }
            printf("\n");
        }
#endif
        if (N > MAX_PREALLOC_THREADS)
        {
            free((int*)spl->colors);
            free((int*)spl->keys);
            free(comm_color);
            free(comm_N);
        }
        free(comm_groups);
        free(comms);
        spl->can_finish = TRUE;

        /* tell the waiting threads that there's a comm ready */
        ret = tMPI_Thread_cond_broadcast(&(comm->comm_create_finish));
        if (ret != 0)
        {
            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
        }
    }
    /* here the individual threads get their comm object */
    *newcomm = newcomm_list[myrank];

    /* free when we have assigned them all, so we can reuse the object */
    spl->Ncol_destroy--;
    if (spl->Ncol_destroy == 0)
    {
        free((void*)newcomm_list);
        free(spl);
    }

    ret = tMPI_Thread_mutex_unlock(&(comm->comm_create_lock));
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }

    return TMPI_SUCCESS;
}
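
A classic usage sketch: splitting TMPI_COMM_WORLD by rank parity, with the old rank as the key so relative ordering is preserved (helper name is illustrative):

static void example_split_by_parity(void)
{
    tMPI_Comm halfcomm;
    int       rank;

    tMPI_Comm_rank(TMPI_COMM_WORLD, &rank);

    /* even and odd ranks land in separate communicators */
    tMPI_Comm_split(TMPI_COMM_WORLD, rank % 2, rank, &halfcomm);
}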