Code Example #1
int tMPI_Thread_create_aff(tMPI_Thread_t *thread, 
                           void *(*start_routine)(void *),
                           void *arg)
{
    int ret;

    /* set the calling thread's affinity mask */
    if (tMPI_Atomic_get(&main_thread_aff_set) == 0)
    {
#ifdef HAVE_PTHREAD_SETAFFINITY
        cpu_set_t set;
#endif
        /* this can be a spinlock because the chances of collision are low. */
        tMPI_Spinlock_lock( &main_thread_aff_lock );
        tMPI_Atomic_set( &aff_thread_number, 0);
#ifdef HAVE_PTHREAD_SETAFFINITY
        CPU_ZERO(&set);
        CPU_SET(0, &set);
        pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
        /*fprintf(stderr, "Setting affinity.\n");*/
#endif
        tMPI_Atomic_set( &main_thread_aff_set, 1);
        tMPI_Spinlock_unlock( &main_thread_aff_lock );
    }


    if(thread==NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Invalid thread pointer.");
        return EINVAL;
    }

    *thread=(struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
    if (*thread == NULL)
    {
        /* bail out if the allocation failed */
        return ENOMEM;
    }
    ret=pthread_create(&((*thread)->th),NULL,start_routine,arg);

    if(ret!=0)
    {
        /* Cannot use tMPI_error() since messages use threads for locking */
        tMPI_Fatal_error(TMPI_FARGS,"Failed to create POSIX thread, rc=%d",ret);
        return -1;
    }
    else
    {
#ifdef HAVE_PTHREAD_SETAFFINITY
        int n;
        cpu_set_t set;

        n=tMPI_Atomic_add_return(&aff_thread_number, 1);
        CPU_ZERO(&set);
        CPU_SET(n, &set);
        return pthread_setaffinity_np((*thread)->th, sizeof(set), &set);
#else
        return 0;
#endif
    }
}
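
A minimal call-site sketch for the function above (not taken from any of the listed projects; the header path and the worker routine are assumptions for illustration):

#include <stdio.h>
#include "thread_mpi/threads.h"   /* assumed header exposing tMPI_Thread_* */

/* hypothetical start routine */
static void *worker(void *arg)
{
    printf("worker received %d\n", *(int*)arg);
    return NULL;
}

int main(void)
{
    tMPI_Thread_t th;
    int           value = 42;

    if (tMPI_Thread_create_aff(&th, worker, &value) != 0)
    {
        return 1;
    }
    /* tMPI_Thread_join mirrors pthread_join */
    return tMPI_Thread_join(th, NULL);
}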
Code Example #2
File: collective.c Project: TTarenzi/MMCG-HAdResS
void tMPI_Coll_env_init(struct coll_env *cev, int N)
{
    int i;

    cev->met=(struct coll_env_thread*)tMPI_Malloc(
                        sizeof(struct coll_env_thread)*N);
    cev->N=N;
    tMPI_Atomic_set(&(cev->coll.current_sync), 0);
    tMPI_Atomic_set(&(cev->coll.n_remaining), 0);
    for(i=0;i<N;i++)
    {
        tMPI_Coll_envt_init(&(cev->met[i]), N);
    }
}
Code Example #3
File: collective.c Project: TTarenzi/MMCG-HAdResS
void tMPI_Coll_envt_init(struct coll_env_thread *met, int N)
{
    tMPI_Atomic_set(&(met->current_sync), 0);
    tMPI_Atomic_set(&(met->n_remaining), 0);
    met->buf=(void**)tMPI_Malloc(sizeof(void*)*N);
    met->bufsize=(size_t*)tMPI_Malloc(sizeof(size_t)*N);
    met->read_data=(tmpi_bool*)tMPI_Malloc(sizeof(tmpi_bool)*N);
#ifdef USE_COLLECTIVE_COPY_BUFFER
    met->cpbuf=(tMPI_Atomic_ptr_t*)tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*N);
    met->cb=NULL;
    met->using_cb=FALSE;
#endif
    tMPI_Event_init( &(met->send_ev) );
    tMPI_Event_init( &(met->recv_ev) );
}
Code Example #4
File: winthreads.c Project: chenleo/gromacs453pf
static void tMPI_Init_initers(void)
{
    int state;
    /* we can pre-check because it's atomic */
    if (tMPI_Atomic_get(&init_inited) == 0)
    {
        /* this can be a spinlock because the chances of collision are low. */
        tMPI_Spinlock_lock( &init_init );

        state=tMPI_Atomic_get(&init_inited);
        tMPI_Atomic_memory_barrier_acq();
        if (state == 0)
        {
            InitializeCriticalSection(&mutex_init);
            InitializeCriticalSection(&once_init);
            InitializeCriticalSection(&cond_init);
            InitializeCriticalSection(&barrier_init);

            tMPI_Atomic_memory_barrier_rel();
            tMPI_Atomic_set(&init_inited, 1);
        }

        tMPI_Spinlock_unlock( &init_init );
    }
}
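
This is the classic double-checked initialization pattern: a cheap atomic pre-check, a spinlock, a re-check under the lock, and a release barrier before the flag is published, pairing with the acquire barrier on the reading side. A stand-alone sketch of the same pattern using C11 atomics instead of tMPI's own primitives:

#include <stdatomic.h>

static atomic_int  inited;                       /* zero-initialized */
static atomic_flag init_lock = ATOMIC_FLAG_INIT; /* acts as the spinlock */
static int         expensive_resource;

static void init_once(void)
{
    /* cheap pre-check: most callers never touch the lock */
    if (atomic_load_explicit(&inited, memory_order_acquire) == 0)
    {
        while (atomic_flag_test_and_set(&init_lock))
        {
            /* spin */
        }
        if (atomic_load_explicit(&inited, memory_order_relaxed) == 0)
        {
            expensive_resource = 42;  /* the one-time initialization */
            /* release: publish the resource before the flag */
            atomic_store_explicit(&inited, 1, memory_order_release);
        }
        atomic_flag_clear(&init_lock);
    }
}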
Code Example #5
File: winthreads.c Project: chenleo/gromacs453pf
int tMPI_Thread_once(tMPI_Thread_once_t *once_control, 
                     void (*init_routine)(void))
{
#if 0
    /* use this once Vista is the minimum required version */
    BOOL bStatus;
    bStatus = InitOnceExecuteOnce(once_control, InitHandleWrapperFunction, 
                                  init_routine, NULL);

    if (!bStatus)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Failed to run thread_once routine");
        return -1;
    }
#else
    /* really ugly hack - and it's slow... */
    tMPI_Init_initers();
    EnterCriticalSection(&once_init);
    if (tMPI_Atomic_get(&(once_control->once)) == 0)
    {
        (*init_routine)();
        tMPI_Atomic_set(&(once_control->once), 1);
    }
    LeaveCriticalSection(&once_init);
#endif
    return 0;
}
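
Typical usage (a sketch, assuming the thread_mpi/threads.h header and its TMPI_THREAD_ONCE_INIT initializer; do_setup is a hypothetical routine):

#include "thread_mpi/threads.h"

static tMPI_Thread_once_t setup_once = TMPI_THREAD_ONCE_INIT;

static void do_setup(void)
{
    /* runs exactly once, no matter how many threads reach run() */
}

static void run(void)
{
    tMPI_Thread_once(&setup_once, do_setup);
    /* ... per-thread work that relies on the setup ... */
}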
Code Example #6
File: tmpi_init.c Project: TTarenzi/MMCG-HAdResS
static void tMPI_Thread_init(struct tmpi_thread *th)
{
    int N_envelopes=(Nthreads+1)*N_EV_ALLOC;  
    int N_send_envelopes=N_EV_ALLOC;  
    int N_reqs=(Nthreads+1)*N_EV_ALLOC;  
    int i;

    /* we set our thread id, as a thread-specific piece of global data. */
    tMPI_Thread_setspecific(id_key, th);

    /* allocate comm.self */
    th->self_comm=tMPI_Comm_alloc(TMPI_COMM_WORLD, 1);
    th->self_comm->grp.peers[0]=th;

    /* allocate envelopes */
    tMPI_Free_env_list_init( &(th->envelopes), N_envelopes );
    /* recv list */
    tMPI_Recv_env_list_init( &(th->evr));
    /* send lists */
    th->evs=(struct send_envelope_list*)tMPI_Malloc(
                        sizeof(struct send_envelope_list)*Nthreads);
    for(i=0;i<Nthreads;i++)
    {
        tMPI_Send_env_list_init( &(th->evs[i]), N_send_envelopes);
    }

    tMPI_Atomic_set( &(th->ev_outgoing_received), 0);

    tMPI_Event_init( &(th->p2p_event) );

    /* allocate requests */
    tMPI_Req_list_init(&(th->rql), N_reqs);

#ifdef USE_COLLECTIVE_COPY_BUFFER
    /* allocate copy_buffer list */
    tMPI_Copy_buffer_list_init(&(th->cbl_multi), (Nthreads+1)*(N_COLL_ENV+1),
                               Nthreads*COPY_BUFFER_SIZE);
#endif

#ifdef TMPI_PROFILE
    tMPI_Profile_init(&(th->profile));
#endif
    /* now wait for all other threads to come on line, before we
       start the MPI program */
    tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );
}
Code Example #7
File: winthreads.c Project: yupinov/gromacs
static int tMPI_Init_initers(void)
{
    int state;
    int ret = 0;

    /* we can pre-check because it's atomic */
    if (tMPI_Atomic_get(&init_inited) == 0)
    {
        /* this can be a spinlock because the chances of collision are low. */
        tMPI_Spinlock_lock( &init_init );

        state = tMPI_Atomic_get(&init_inited);
        tMPI_Atomic_memory_barrier_acq();
        if (state == 0)
        {
            InitializeCriticalSection(&mutex_init);
            InitializeCriticalSection(&once_init);
            InitializeCriticalSection(&cond_init);
            InitializeCriticalSection(&barrier_init);
            InitializeCriticalSection(&thread_id_list_lock);

            ret = tMPI_Init_NUMA();
            if (ret != 0)
            {
                goto err;
            }


            ret = tMPI_Thread_id_list_init();
            if (ret != 0)
            {
                goto err;
            }

            tMPI_Atomic_memory_barrier_rel();
            tMPI_Atomic_set(&init_inited, 1);
        }

        tMPI_Spinlock_unlock( &init_init );
    }
    return ret;
err:
    tMPI_Spinlock_unlock( &init_init );
    return ret;
}
Code Example #8
File: pthreads.c Project: alexholehouse/gromacs
int tMPI_Thread_cond_init(tMPI_Thread_cond_t *cond) 
{
    int ret;
    
    if(cond==NULL)
    {
        return EINVAL;
    }
   
    cond->condp=(struct tMPI_Thread_cond*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_cond)*1);
    ret = pthread_cond_init(&(cond->condp->cond), NULL);
    
    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Error initializing POSIX condition variable. rc=%d",ret);
        fflush(stderr);
    }
    tMPI_Atomic_set(&(cond->initialized),1);
    return ret;
}
Code Example #9
File: pthreads.c Project: alexholehouse/gromacs
int tMPI_Thread_once(tMPI_Thread_once_t *once_control,
                     void (*init_routine)(void))
{
    int ret;
    if (!once_control || !init_routine)
    {
        return EINVAL;
    }

    /* really ugly hack - and it's slow... */
    if ( (ret=pthread_mutex_lock( &once_init )) )
        return ret;
    if (tMPI_Atomic_get(&(once_control->once)) == 0)
    {
        (*init_routine)();
        tMPI_Atomic_set(&(once_control->once), 1);
    }
    pthread_mutex_unlock( &once_init );

    return 0;
}
Code Example #10
File: pthreads.c Project: alexholehouse/gromacs
int tMPI_Thread_mutex_init(tMPI_Thread_mutex_t *mtx) 
{
    int ret;
  
    if (mtx == NULL)
    {
        return EINVAL;
    }

    mtx->mutex=(struct tMPI_Mutex*)tMPI_Malloc(sizeof(struct tMPI_Mutex)*1);
    ret = pthread_mutex_init(&(mtx->mutex->mtx),NULL);
    
    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Error initializing POSIX mutex. rc=%d");
        /* Use system memory allocation routines */
        return ret;
    }

    tMPI_Atomic_set(&(mtx->initialized), 1);
    return 0;
}
Code Example #11
File: pthreads.c Project: alexholehouse/gromacs
int tMPI_Thread_barrier_init(tMPI_Thread_barrier_t *barrier, int n)
{
    int ret;
    /*tMPI_Thread_pthread_barrier_t *p;*/
    
    if(barrier==NULL)
    {
        return EINVAL;
    }
    
    barrier->barrierp=(struct tMPI_Thread_barrier*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_barrier)*1);
    ret = pthread_mutex_init(&(barrier->barrierp->mutex),NULL);
        
    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Error initializing POSIX mutex. rc=%d",
                         ret);
        return ret;
    }
    
    ret = pthread_cond_init(&(barrier->barrierp->cv),NULL);
    
    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,
                         "Error initializing POSIX condition variable. rc=%d",
                         ret);
        return ret;
    }
        
    barrier->threshold = n;
    barrier->count     = n;
    barrier->cycle     = 0;

    tMPI_Atomic_set(&(barrier->initialized), 1);
    return 0;
}
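
A sketch of the barrier in use (assumed header path; phase_worker is hypothetical): all n threads must reach tMPI_Thread_barrier_wait() before any of them proceeds.

#include "thread_mpi/threads.h"

#define NTHREADS 4
static tMPI_Thread_barrier_t barrier;

/* hypothetical worker: nobody starts phase 2 until all finish phase 1 */
static void *phase_worker(void *arg)
{
    /* ... phase 1 work ... */
    tMPI_Thread_barrier_wait(&barrier);
    /* ... phase 2 work ... */
    return NULL;
}

int main(void)
{
    tMPI_Thread_t th[NTHREADS];
    int           i;

    tMPI_Thread_barrier_init(&barrier, NTHREADS);
    for (i = 0; i < NTHREADS; i++)
    {
        tMPI_Thread_create(&th[i], phase_worker, NULL);
    }
    for (i = 0; i < NTHREADS; i++)
    {
        tMPI_Thread_join(th[i], NULL);
    }
    return 0;
}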
Code Example #12
File: pthreads.c Project: alexholehouse/gromacs
int tMPI_Thread_key_create(tMPI_Thread_key_t *key, void (*destructor)(void *))
{
    int ret;

    if(key==NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Invalid key pointer.");
        return EINVAL;
    }


    key->key=(struct tMPI_Thread_key*)tMPI_Malloc(
                        sizeof(struct tMPI_Thread_key)*1);
    ret = pthread_key_create(&((key)->key->pkey), destructor);
    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Failed to create thread key, rc=%d.",ret);
        fflush(stderr);
        return -1;
    }

    tMPI_Atomic_set(&(key->initialized), 1);
    return 0;
}
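
A sketch of a key in use for thread-specific data (assumed header; rank_key and the helpers are hypothetical). The destructor passed at creation frees each thread's value when that thread exits:

#include <stdlib.h>
#include "thread_mpi/threads.h"

static tMPI_Thread_key_t rank_key;

static void keys_init(void)
{
    tMPI_Thread_key_create(&rank_key, free);  /* free() destroys per-thread values */
}

static void set_my_rank(int rank)
{
    int *p = (int*)malloc(sizeof(int));
    *p = rank;
    tMPI_Thread_setspecific(rank_key, p);     /* visible only to this thread */
}

static int get_my_rank(void)
{
    return *(int*)tMPI_Thread_getspecific(rank_key);
}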
Code Example #13
File: comm.c Project: BradleyDickson/fABMACS
int tMPI_Comm_alloc(tMPI_Comm *newcomm, tMPI_Comm parent, int N)
{
    struct tmpi_comm_ *retc;
    int                i;
    int                ret;

    retc = (struct tmpi_comm_*)tMPI_Malloc(sizeof(struct tmpi_comm_));
    if (retc == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }

    retc->grp.peers = (struct tmpi_thread**)tMPI_Malloc(
                sizeof(struct tmpi_thread*)*Nthreads);
    if (retc->grp.peers == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }
    retc->grp.N = N;

    ret = tMPI_Thread_mutex_init( &(retc->comm_create_lock) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    ret = tMPI_Thread_cond_init( &(retc->comm_create_prep) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    ret = tMPI_Thread_cond_init( &(retc->comm_create_finish) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }

    retc->split    = NULL;
    retc->new_comm = NULL;
    /* we have no topology to start out with */
    retc->cart = NULL;
    /*retc->graph=NULL;*/

    /* we start counting at 0 */
    tMPI_Atomic_set( &(retc->destroy_counter), 0);

    /* initialize the main barrier */
    tMPI_Barrier_init(&(retc->barrier), N);

    /* the reduce barriers */
    {
        /* First calculate the number of reduce barriers */
        int Niter = 0; /* the iteration number */
        int Nred  = N; /* the number of reduce barriers for this iteration */
        while (Nred > 1)
        {
            /* Nred is now Nred/2 plus a remainder, because a solitary
               process at the end of the list must still be accounted for */
            Nred   = Nred/2 + Nred%2;
            Niter += 1;
        }

        retc->N_reduce_iter = Niter;
        /* allocate the list */
        retc->reduce_barrier = (tMPI_Barrier_t**)
            tMPI_Malloc(sizeof(tMPI_Barrier_t*)*(Niter+1));
        if (retc->reduce_barrier == NULL)
        {
            return TMPI_ERR_NO_MEM;
        }
        retc->N_reduce = (int*)tMPI_Malloc(sizeof(int)*(Niter+1));
        if (retc->N_reduce == NULL)
        {
            return TMPI_ERR_NO_MEM;
        }

        /* we re-set Nred to N */
        Nred = N;
        for (i = 0; i < Niter; i++)
        {
            int j;

            Nred              = Nred/2 + Nred%2;
            retc->N_reduce[i] = Nred;
            /* allocate the sub-list */
            retc->reduce_barrier[i] = (tMPI_Barrier_t*)
                tMPI_Malloc(sizeof(tMPI_Barrier_t)*(Nred));
            if (retc->reduce_barrier[i] == NULL)
            {
                return TMPI_ERR_NO_MEM;
            }
            for (j = 0; j < Nred; j++)
            {
                tMPI_Barrier_init(&(retc->reduce_barrier[i][j]), 2);
            }
        }
    }

    /* the reduce buffers */
    retc->reduce_sendbuf = (tMPI_Atomic_ptr_t*)
        tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*Nthreads);
    if (retc->reduce_sendbuf == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }
    retc->reduce_recvbuf = (tMPI_Atomic_ptr_t*)
        tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*Nthreads);
    if (retc->reduce_recvbuf == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }

    if (parent)
    {
        retc->erh = parent->erh;
    }
    else
    {
        retc->erh = TMPI_ERRORS_ARE_FATAL;
    }

    /* coll_env objects */
    retc->cev = (struct coll_env*)tMPI_Malloc(sizeof(struct coll_env)*
                                              N_COLL_ENV);
    if (retc->cev == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }

    for (i = 0; i < N_COLL_ENV; i++)
    {
        ret = tMPI_Coll_env_init( &(retc->cev[i]), N);
        if (ret != TMPI_SUCCESS)
        {
            return ret;
        }
    }
    /* multi_sync objects */
    retc->csync = (struct coll_sync*)tMPI_Malloc(sizeof(struct coll_sync)*N);
    if (retc->csync == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }

    for (i = 0; i < N; i++)
    {
        ret = tMPI_Coll_sync_init( &(retc->csync[i]), N);
        if (ret != TMPI_SUCCESS)
        {
            return ret;
        }
    }

    ret = tMPI_Thread_mutex_lock( &(tmpi_global->comm_link_lock) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    /* we insert ourselves in the circular list, after TMPI_COMM_WORLD */
    if (TMPI_COMM_WORLD)
    {
        retc->next = TMPI_COMM_WORLD;
        retc->prev = TMPI_COMM_WORLD->prev;

        TMPI_COMM_WORLD->prev->next = retc;
        TMPI_COMM_WORLD->prev       = retc;
    }
    else
    {
        retc->prev = retc->next = retc;
    }
    ret = tMPI_Thread_mutex_unlock( &(tmpi_global->comm_link_lock) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    *newcomm = retc;
    return TMPI_SUCCESS;
}
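
The loop that sizes the reduce tree computes ceil(log2(N)): each iteration halves the number of participants, rounding up for a solitary leftover. For N = 5 the Nred sequence is 3, 2, 1, so three levels of two-party barriers are allocated. A stand-alone sketch of that computation:

#include <stdio.h>

int main(void)
{
    int N     = 5;  /* e.g. a 5-thread communicator */
    int Niter = 0;
    int Nred  = N;

    while (Nred > 1)
    {
        Nred = Nred/2 + Nred%2;  /* 5 -> 3 -> 2 -> 1 */
        Niter++;
    }
    printf("N=%d needs %d reduce-barrier levels\n", N, Niter);  /* prints 3 */
    return 0;
}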
Code Example #14
/* Set the main thread's affinity */
static int tMPI_Set_main_thread_affinity(void)
{
    /* calling thread PROCESSOR_NUMBER */
    PROCESSOR_NUMBER CurrentProcessorNumber;      
    /* calling thread GROUP_AFFINITY */
    GROUP_AFFINITY CurrentThreadGroupAffinity; 
    /* calling thread NUMA node */
    USHORT CurrentNumaNodeNumber;


    /* we can pre-check because it's atomic */
    if (tMPI_Atomic_get(&main_thread_aff_set) == 0)
    {
        /* this can be a spinlock because the chances of collision are low. */
        tMPI_Spinlock_lock( &main_thread_aff_lock );
        if( g_ulHighestNumaNodeNumber != 0 )
        {
            func_GetCurrentProcessorNumberEx(&CurrentProcessorNumber);


            /* set the NUMA node affinity for the current thread
               failures to set the current thread affinity are ignored, 
               as a fringe case can arise on >32 processor systems with a 32bit 
               build/code.
               */
            func_SetThreadIdealProcessorEx(GetCurrentThread(), 
                                           &CurrentProcessorNumber, 
                                           NULL);

            if(func_GetNumaProcessorNodeEx(&CurrentProcessorNumber, 
                                           &CurrentNumaNodeNumber))
            {
                /* for the NUMA node number associated with the current processor 
                   number, get the group affinity mask */
                if(func_GetNumaNodeProcessorMaskEx(CurrentNumaNodeNumber, 
                                                   &CurrentThreadGroupAffinity))
                {
                    /* set the current thread affinity to prevent it from running on 
                       other NUMA nodes */
                    func_SetThreadGroupAffinity(GetCurrentThread(), 
                                                &CurrentThreadGroupAffinity, 
                                                NULL);
                }
            }
        }
        else
        {
            /* No NUMA. For now, we just do a similar thing. */
            if ( (func_GetCurrentProcessorNumberEx != NULL)  &&
                 (func_SetThreadIdealProcessorEx))
            {
                func_GetCurrentProcessorNumberEx(&CurrentProcessorNumber);
                func_SetThreadIdealProcessorEx(GetCurrentThread(), 
                                               &CurrentProcessorNumber, 
                                               NULL);
            }
        }
        tMPI_Atomic_set( &main_thread_aff_set, 1);
        tMPI_Spinlock_unlock( &main_thread_aff_lock );
    }
    return 0;
}
Code Example #15
File: collective.c Project: TTarenzi/MMCG-HAdResS
void tMPI_Post_multi(struct coll_env *cev, int myrank, int index, 
                     int tag, tMPI_Datatype datatype, size_t bufsize, 
                     void *buf, int n_remaining, int synct, int dest)
{
    int i;
#ifdef USE_COLLECTIVE_COPY_BUFFER
    /* decide based on the number of waiting threads */
    tmpi_bool using_cb=(bufsize < (size_t)(n_remaining*COPY_BUFFER_SIZE));

    cev->met[myrank].using_cb=using_cb;
    if (using_cb)
    {
        /* we set it to NULL initially */
        /*cev->met[myrank].cpbuf[index]=NULL;*/
        tMPI_Atomic_ptr_set(&(cev->met[myrank].cpbuf[index]), NULL);

        tMPI_Atomic_set(&(cev->met[myrank].buf_readcount), 0);
    }
#endif
    cev->met[myrank].tag=tag;
    cev->met[myrank].datatype=datatype;
    cev->met[myrank].buf[index]=buf;
    cev->met[myrank].bufsize[index]=bufsize;
    tMPI_Atomic_set(&(cev->met[myrank].n_remaining), n_remaining);
    tMPI_Atomic_memory_barrier_rel();
    tMPI_Atomic_set(&(cev->met[myrank].current_sync), synct);

    /* publish availability. */
    if (dest<0)
    {
        for(i=0;i<cev->N;i++)
        {
            if (i != myrank)
                tMPI_Event_signal( &(cev->met[i].recv_ev) );
        }
    }
    else
    {
        tMPI_Event_signal( &(cev->met[dest].recv_ev) );
    }

#ifdef USE_COLLECTIVE_COPY_BUFFER
    /* because we've published availability, we can start copying --
       possibly in parallel with the receiver */
    if (using_cb)
    {
        struct tmpi_thread *cur=tMPI_Get_current();
         /* copy the buffer locally. First allocate */
        cev->met[myrank].cb=tMPI_Copy_buffer_list_get( &(cur->cbl_multi) );
        if (cev->met[myrank].cb->size < bufsize)
        {
            fprintf(stderr, "ERROR: cb size too small\n");
            exit(1);
        }
        /* copy to the new buf */
        memcpy(cev->met[myrank].cb->buf, buf, bufsize);

        /* post the new buf */
        tMPI_Atomic_memory_barrier_rel();
        /*cev->met[myrank].cpbuf[index]=cev->met[myrank].cb->buf;*/
        tMPI_Atomic_ptr_set(&(cev->met[myrank].cpbuf[index]), 
                            cev->met[myrank].cb->buf);
    }
#endif
}
Code Example #16
File: alltoall.c Project: BradleyDickson/fABMACS
int tMPI_Alltoall(void* sendbuf, int sendcount, tMPI_Datatype sendtype,
                  void* recvbuf, int recvcount, tMPI_Datatype recvtype,
                  tMPI_Comm comm)
{
    int                 synct;
    struct coll_env    *cev;
    int                 myrank;
    int                 ret = TMPI_SUCCESS;
    int                 i;
    size_t              sendsize = sendtype->size*sendcount;
    size_t              recvsize = recvtype->size*recvcount;
    int                 n_remaining;
    struct tmpi_thread *cur = tMPI_Get_current();

#ifdef TMPI_PROFILE
    tMPI_Profile_count_start(cur);
#endif
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Alltoall(%p, %d, %p, %p, %d, %p, %p)",
                     sendbuf, sendcount, sendtype,
                     recvbuf, recvcount, recvtype, comm);
#endif

    if (!comm)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }
    if (!sendbuf || !recvbuf) /* don't do pointer arithmetic on a NULL ptr */
    {
        return tMPI_Error(comm, TMPI_ERR_BUF);
    }

    myrank = tMPI_Comm_seek_rank(comm, cur);

    /* we increase our counter, and determine which coll_env we get */
    cev = tMPI_Get_cev(comm, myrank, &synct);

    /* post our pointers */
    /* we set up multiple posts, so no Post_multi */
    cev->met[myrank].tag      = TMPI_ALLTOALL_TAG;
    cev->met[myrank].datatype = sendtype;
    tMPI_Atomic_set( &(cev->met[myrank].n_remaining), cev->N-1 );
    for (i = 0; i < comm->grp.N; i++)
    {
        cev->met[myrank].bufsize[i]   = sendsize;
        cev->met[myrank].buf[i]       = (char*)sendbuf+sendsize*i;
        cev->met[myrank].read_data[i] = FALSE;
    }
    tMPI_Atomic_memory_barrier_rel();
    tMPI_Atomic_set(&(cev->met[myrank].current_sync), synct);

    /* post availability */
    for (i = 0; i < cev->N; i++)
    {
        if (i != myrank)
        {
            tMPI_Event_signal( &(cev->met[i].recv_ev) );
        }
    }

    /* we don't do the copy buffer thing here because it's pointless:
       the processes have to synchronize anyway, because they all
       send and receive. */

    /* do root transfer */
    tMPI_Coll_root_xfer(comm, sendtype, recvtype,
                        sendsize, recvsize,
                        (char*)sendbuf+sendsize*myrank,
                        (char*)recvbuf+recvsize*myrank, &ret);
    cev->met[myrank].read_data[myrank] = TRUE;
    /* and poll data availability */
    n_remaining = cev->N-1;
    while (n_remaining > 0)
    {
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
        tMPI_Profile_wait_start(cur);
#endif
        tMPI_Event_wait( &(cev->met[myrank]).recv_ev );
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
        tMPI_Profile_wait_stop(cur, TMPIWAIT_Coll_recv);
#endif
        for (i = 0; i < cev->N; i++)
        {
            if ((!cev->met[myrank].read_data[i]) &&
                (tMPI_Atomic_get(&(cev->met[i].current_sync)) == synct))
            {
                tMPI_Event_process( &(cev->met[myrank]).recv_ev, 1);
                tMPI_Mult_recv(comm, cev, i, myrank, TMPI_ALLTOALL_TAG,
                               recvtype, recvsize, (char*)recvbuf+recvsize*i,
                               &ret);
                if (ret != TMPI_SUCCESS)
                {
                    return ret;
                }
                cev->met[myrank].read_data[i] = TRUE;
                n_remaining--;
            }
        }
    }


    /* and wait until everybody is done copying our data */
    tMPI_Wait_for_others(cev, myrank);

#ifdef TMPI_PROFILE
    tMPI_Profile_count_stop(cur, TMPIFN_Alltoall);
#endif
    return ret;
}
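
A minimal call-site sketch (assuming the thread_mpi/tmpi.h header and an already-initialized communicator; exchange is a hypothetical helper). Every rank contributes one int per peer and receives one int from each:

#include <stdlib.h>
#include "thread_mpi/tmpi.h"   /* assumed header exposing tMPI_Alltoall */

static void exchange(tMPI_Comm comm)
{
    int  rank, N, i;
    int *sendbuf, *recvbuf;

    tMPI_Comm_rank(comm, &rank);
    tMPI_Comm_size(comm, &N);
    sendbuf = (int*)malloc(sizeof(int)*N);
    recvbuf = (int*)malloc(sizeof(int)*N);
    for (i = 0; i < N; i++)
    {
        sendbuf[i] = rank*1000 + i;  /* payload destined for rank i */
    }
    tMPI_Alltoall(sendbuf, 1, TMPI_INT, recvbuf, 1, TMPI_INT, comm);
    /* now recvbuf[i] == i*1000 + rank */
    free(sendbuf);
    free(recvbuf);
}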