static void tMPI_Init_initers(void)
{
    int state;

    /* we can pre-check because it's atomic */
    if (tMPI_Atomic_get(&init_inited) == 0)
    {
        /* this can be a spinlock because the chances of collision are low. */
        tMPI_Spinlock_lock( &init_init );

        state = tMPI_Atomic_get(&init_inited);
        tMPI_Atomic_memory_barrier_acq();
        if (state == 0)
        {
            InitializeCriticalSection(&mutex_init);
            InitializeCriticalSection(&once_init);
            InitializeCriticalSection(&cond_init);
            InitializeCriticalSection(&barrier_init);

            tMPI_Atomic_memory_barrier_rel();
            tMPI_Atomic_set(&init_inited, 1);
        }

        tMPI_Spinlock_unlock( &init_init );
    }
}
/* once */
int tMPI_Once(tMPI_Comm comm, void (*function)(void*), void *param,
              int *was_first)
{
    int myrank;
    int ret = TMPI_SUCCESS;
    struct coll_sync *csync;
    struct coll_env *cev;
    int syncs;

    if (!comm)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }
    myrank = tMPI_Comm_seek_rank(comm, tMPI_Get_current());

    /* we increase our counter, and determine which coll_env we get */
    csync = &(comm->csync[myrank]);
    csync->syncs++;
    cev = &(comm->cev[csync->syncs % N_COLL_ENV]);

    /* now do a compare-and-swap on the current_sync */
    syncs = tMPI_Atomic_get( &(cev->coll.current_sync) );
    if ((csync->syncs - syncs > 0) && /* check if sync was an earlier number.
                                         If it is a later number, we can't
                                         have been the first to arrive here. */
        tMPI_Atomic_cas(&(cev->coll.current_sync), syncs, csync->syncs))
    {
        /* we're the first! */
        function(param);
        if (was_first)
        {
            *was_first = TRUE;
        }
    }
    return ret;
}
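/* Hypothetical usage sketch, not part of the library source: every rank of the
   communicator calls tMPI_Once() with the same function and argument; only the
   first rank to arrive actually runs it, the others return immediately without
   waiting for it to finish. The example_* names below are made up for
   illustration. */
static void example_init_shared(void *param)
{
    int *shared_counter = (int*)param;

    *shared_counter = 0;
}

static void example_once_caller(tMPI_Comm comm, int *shared_counter)
{
    int was_first = 0; /* only set to TRUE on the rank that ran the function */

    tMPI_Once(comm, example_init_shared, shared_counter, &was_first);
    if (was_first)
    {
        /* this rank executed example_init_shared() */
    }
}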
int tMPI_Thread_once(tMPI_Thread_once_t *once_control,
                     void (*init_routine)(void))
{
#if 0
    /* use once Vista is minimum required version */
    BOOL bStatus;
    bStatus = InitOnceExecuteOnce(once_control, InitHandleWrapperFunction,
                                  init_routine, NULL);

    if (!bStatus)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Failed to run thread_once routine");
        return -1;
    }
#else
    /* really ugly hack - and it's slow... */
    tMPI_Init_initers();
    EnterCriticalSection(&once_init);
    if (tMPI_Atomic_get(&(once_control->once)) == 0)
    {
        (*init_routine)();
        tMPI_Atomic_set(&(once_control->once), 1);
    }
    LeaveCriticalSection(&once_init);
#endif
    return 0;
}
int tMPI_Thread_cond_broadcast(tMPI_Thread_cond_t *cond)
{
    /* check whether the condition is initialized */
    if (tMPI_Atomic_get( &(cond->initialized) ) == 0)
    {
        tMPI_Thread_cond_init_once(cond);
    }
    /* The condition variable is now guaranteed to be valid. */
#if 0
    /* use this code once Vista is the minimum version required */
    WakeAllConditionVariable( &(cond->cv) );
#else
    EnterCriticalSection(&(cond->condp->wtr_lock));
    /* check whether there are any waiters */
    if (cond->condp->Nwaiters > 0)
    {
        cond->condp->Nrelease = cond->condp->Nwaiters;
        cond->condp->cycle++;
        /* actually release the waiting threads */
        if (!SetEvent(cond->condp->ev))
        {
            tMPI_Fatal_error(TMPI_FARGS, "Failed SetEvent, error code=%d",
                             GetLastError());
            return -1;
        }
    }
    LeaveCriticalSection(&(cond->condp->wtr_lock));
#endif
    return 0;
}
static int tMPI_Init_initers(void)
{
    int state;
    int ret = 0;

    /* we can pre-check because it's atomic */
    if (tMPI_Atomic_get(&init_inited) == 0)
    {
        /* this can be a spinlock because the chances of collision are low. */
        tMPI_Spinlock_lock( &init_init );

        state = tMPI_Atomic_get(&init_inited);
        tMPI_Atomic_memory_barrier_acq();
        if (state == 0)
        {
            InitializeCriticalSection(&mutex_init);
            InitializeCriticalSection(&once_init);
            InitializeCriticalSection(&cond_init);
            InitializeCriticalSection(&barrier_init);
            InitializeCriticalSection(&thread_id_list_lock);

            ret = tMPI_Init_NUMA();
            if (ret != 0)
            {
                goto err;
            }

            ret = tMPI_Thread_id_list_init();
            if (ret != 0)
            {
                goto err;
            }

            tMPI_Atomic_memory_barrier_rel();
            tMPI_Atomic_set(&init_inited, 1);
        }

        tMPI_Spinlock_unlock( &init_init );
    }
    return ret;
err:
    tMPI_Spinlock_unlock( &init_init );
    return ret;
}
int tMPI_Thread_create_aff(tMPI_Thread_t *thread,
                           void *(*start_routine)(void *), void *arg)
{
    int ret;

    /* set the calling thread's affinity mask */
    if (tMPI_Atomic_get(&main_thread_aff_set) == 0)
    {
#ifdef HAVE_PTHREAD_SETAFFINITY
        cpu_set_t set;
#endif
        /* this can be a spinlock because the chances of collision are low. */
        tMPI_Spinlock_lock( &main_thread_aff_lock );
        tMPI_Atomic_set( &aff_thread_number, 0);
#ifdef HAVE_PTHREAD_SETAFFINITY
        CPU_ZERO(&set);
        CPU_SET(0, &set);
        pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
        /*fprintf(stderr, "Setting affinity.\n");*/
#endif
        tMPI_Atomic_set( &main_thread_aff_set, 1);
        tMPI_Spinlock_unlock( &main_thread_aff_lock );
    }

    if (thread == NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Invalid thread pointer.");
        return EINVAL;
    }

    *thread = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
    if (*thread == NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Failed to allocate thread structure.");
        return ENOMEM;
    }
    ret = pthread_create(&((*thread)->th), NULL, start_routine, arg);

    if (ret != 0)
    {
        /* Cannot use tMPI_error() since messages use threads for locking */
        tMPI_Fatal_error(TMPI_FARGS, "Failed to create POSIX thread, rc=%d",
                         ret);
        /* Use system memory allocation routines */
        return -1;
    }
    else
    {
#ifdef HAVE_PTHREAD_SETAFFINITY
        int n;
        cpu_set_t set;

        n = tMPI_Atomic_add_return(&aff_thread_number, 1);
        CPU_ZERO(&set);
        CPU_SET(n, &set);
        return pthread_setaffinity_np((*thread)->th, sizeof(set), &set);
#else
        return 0;
#endif
    }
}
int tMPI_Thread_mutex_lock(tMPI_Thread_mutex_t *mtx)
{
    /* check whether the mutex is initialized */
    if (tMPI_Atomic_get( &(mtx->initialized) ) == 0)
    {
        tMPI_Thread_mutex_init_once(mtx);
    }
    /* The mutex is now guaranteed to be valid. */
    EnterCriticalSection( &(mtx->mutex->cs) );
    return 0;
}
int tMPI_Thread_cond_broadcast(tMPI_Thread_cond_t *cond)
{
    int ret;

    /* check whether the condition is initialized */
    if (tMPI_Atomic_get( &(cond->initialized) ) == 0)
    {
        tMPI_Thread_cond_init_once(cond);
    }

    ret = pthread_cond_broadcast( &(cond->condp->cond) );

    return ret;
}
int tMPI_Thread_cond_wait(tMPI_Thread_cond_t *cond, tMPI_Thread_mutex_t *mtx)
{
    int ret;

    /* check whether the condition is initialized */
    if (tMPI_Atomic_get( &(cond->initialized) ) == 0)
    {
        tMPI_Thread_cond_init_once(cond);
    }
    /* the mutex must have been initialized because it should be locked here */

    ret = pthread_cond_wait( &(cond->condp->cond), &(mtx->mutex->mtx) );

    return ret;
}
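/* Hypothetical usage sketch, not part of the library source: the usual
   "wait under a mutex, re-check the predicate" pattern these wrappers support.
   The example_* objects are made up; TMPI_THREAD_MUTEX_INITIALIZER and
   TMPI_THREAD_COND_INITIALIZER are assumed to be the static initializers that
   trigger the lazy init_once paths shown above. */
static tMPI_Thread_mutex_t example_flag_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
static tMPI_Thread_cond_t  example_flag_cond  = TMPI_THREAD_COND_INITIALIZER;
static int                 example_flag       = 0;

static void example_wait_for_flag(void)
{
    tMPI_Thread_mutex_lock(&example_flag_mutex);
    while (!example_flag) /* loop guards against spurious wakeups */
    {
        tMPI_Thread_cond_wait(&example_flag_cond, &example_flag_mutex);
    }
    tMPI_Thread_mutex_unlock(&example_flag_mutex);
}

static void example_set_flag(void)
{
    tMPI_Thread_mutex_lock(&example_flag_mutex);
    example_flag = 1;
    tMPI_Thread_cond_broadcast(&example_flag_cond);
    tMPI_Thread_mutex_unlock(&example_flag_mutex);
}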
int tMPI_Thread_mutex_unlock(tMPI_Thread_mutex_t *mtx)
{
    int ret;

    /* check whether the mutex is initialized */
    if (tMPI_Atomic_get( &(mtx->initialized) ) == 0)
    {
        ret = tMPI_Thread_mutex_init_once(mtx);
        if (ret)
        {
            return ret;
        }
    }

    ret = pthread_mutex_unlock(&(mtx->mutex->mtx));

    return ret;
}
int tMPI_Thread_barrier_wait(tMPI_Thread_barrier_t *barrier)
{
    int cycle;
    int rc;

    /* check whether the barrier is initialized */
    if (tMPI_Atomic_get( &(barrier->initialized) ) == 0)
    {
        tMPI_Thread_barrier_init_once(barrier);
    }

    rc = pthread_mutex_lock(&barrier->barrierp->mutex);
    if (rc != 0)
    {
        return EBUSY;
    }

    cycle = barrier->cycle;

    /* Decrement the count atomically and check if it is zero.
     * This will only be true for the last thread calling us.
     */
    if (--barrier->count <= 0)
    {
        barrier->cycle = !barrier->cycle;
        barrier->count = barrier->threshold;
        rc = pthread_cond_broadcast(&barrier->barrierp->cv);

        if (rc == 0)
        {
            /* the last thread to arrive reports -1 so that exactly one
               caller can be distinguished from the others */
            rc = -1;
        }
    }
    else
    {
        while (cycle == barrier->cycle)
        {
            rc = pthread_cond_wait(&barrier->barrierp->cv,
                                   &barrier->barrierp->mutex);
            if (rc != 0)
            {
                break;
            }
        }
    }

    pthread_mutex_unlock(&barrier->barrierp->mutex);
    return rc;
}
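/* Hypothetical usage sketch, not part of the library source: N worker threads
   separate two phases of work with a barrier. tMPI_Thread_barrier_init() is
   assumed to take (barrier, count) like its pthreads counterpart; exactly one
   caller (the last to arrive) gets the non-zero (-1) return value. */
static tMPI_Thread_barrier_t example_barrier; /* initialize once before use:
                                                 tMPI_Thread_barrier_init(&example_barrier, nthreads); */

static void example_two_phase_worker(void)
{
    /* ... phase 1 work ... */
    if (tMPI_Thread_barrier_wait(&example_barrier) != 0)
    {
        /* this thread was the last one to reach the barrier */
    }
    /* ... phase 2 work; no thread starts it before all have finished phase 1 ... */
}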
void tMPI_Wait_for_others(struct coll_env *cev, int myrank)
{
#if defined(TMPI_PROFILE)
    struct tmpi_thread *cur = tMPI_Get_current();
    tMPI_Profile_wait_start(cur);
#endif

#ifdef USE_COLLECTIVE_COPY_BUFFER
    if (! (cev->met[myrank].using_cb) )
#endif
    {
        /* wait until everybody else is done copying the buffer */
        tMPI_Event_wait( &(cev->met[myrank].send_ev));
        tMPI_Event_process( &(cev->met[myrank].send_ev), 1);
    }
#ifdef USE_COLLECTIVE_COPY_BUFFER
    else
    {
        /* wait until everybody else is done copying the original buffer.
           We use fetch_add because we want to be sure of coherency.
           This wait is bound to be very short (otherwise it wouldn't
           be double-buffering) so we always spin here. */
        /*tMPI_Atomic_memory_barrier_rel();*/
#if 0
        while (!tMPI_Atomic_cas( &(cev->met[myrank].buf_readcount), 0,
                                 -100000))
#endif
#if 0
        while (tMPI_Atomic_fetch_add( &(cev->met[myrank].buf_readcount), 0)
               != 0)
#endif
#if 1
        while (tMPI_Atomic_get( &(cev->met[myrank].buf_readcount) ) > 0)
#endif
        {
        }
        tMPI_Atomic_memory_barrier_acq();
    }
#endif
#if defined(TMPI_PROFILE)
    tMPI_Profile_wait_stop(cur, TMPIWAIT_Coll_send);
#endif
}
int tMPI_Thread_once(tMPI_Thread_once_t *once_control,
                     void (*init_routine)(void))
{
    int ret;

    if (!once_control || !init_routine)
    {
        return EINVAL;
    }

    /* really ugly hack - and it's slow... */
    if ( (ret = pthread_mutex_lock( &once_init )) )
    {
        return ret;
    }
    if (tMPI_Atomic_get(&(once_control->once)) == 0)
    {
        (*init_routine)();
        tMPI_Atomic_set(&(once_control->once), 1);
    }
    pthread_mutex_unlock( &once_init );

    return 0;
}
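/* Hypothetical usage sketch, not part of the library source: a once-control
   guards one-time initialization that several threads may race to trigger.
   TMPI_THREAD_ONCE_INIT is assumed to be the static initializer from the
   threads header; the example_* names are made up. */
static tMPI_Thread_once_t  example_once = TMPI_THREAD_ONCE_INIT;
static tMPI_Thread_mutex_t example_shared_mutex;

static void example_init_shared_mutex(void)
{
    tMPI_Thread_mutex_init(&example_shared_mutex); /* runs exactly once */
}

static void example_locked_work(void)
{
    tMPI_Thread_once(&example_once, example_init_shared_mutex);

    tMPI_Thread_mutex_lock(&example_shared_mutex);
    /* ... critical section ... */
    tMPI_Thread_mutex_unlock(&example_shared_mutex);
}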
int tMPI_Alltoall(void* sendbuf, int sendcount, tMPI_Datatype sendtype,
                  void* recvbuf, int recvcount, tMPI_Datatype recvtype,
                  tMPI_Comm comm)
{
    int synct;
    struct coll_env *cev;
    int myrank;
    int ret = TMPI_SUCCESS;
    int i;
    size_t sendsize = sendtype->size*sendcount;
    size_t recvsize = recvtype->size*recvcount;
    int n_remaining;
    struct tmpi_thread *cur = tMPI_Get_current();

#ifdef TMPI_PROFILE
    tMPI_Profile_count_start(cur);
#endif
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Alltoall(%p, %d, %p, %p, %d, %p, %p)",
                     sendbuf, sendcount, sendtype,
                     recvbuf, recvcount, recvtype, comm);
#endif

    if (!comm)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }
    if (!sendbuf || !recvbuf) /* don't do pointer arithmetic on a NULL ptr */
    {
        return tMPI_Error(comm, TMPI_ERR_BUF);
    }

    myrank = tMPI_Comm_seek_rank(comm, cur);

    /* we increase our counter, and determine which coll_env we get */
    cev = tMPI_Get_cev(comm, myrank, &synct);

    /* post our pointers */
    /* we set up multiple posts, so no Post_multi */
    cev->met[myrank].tag = TMPI_ALLTOALL_TAG;
    cev->met[myrank].datatype = sendtype;
    tMPI_Atomic_set( &(cev->met[myrank].n_remaining), cev->N-1 );
    for (i = 0; i < comm->grp.N; i++)
    {
        cev->met[myrank].bufsize[i]   = sendsize;
        cev->met[myrank].buf[i]       = (char*)sendbuf+sendsize*i;
        cev->met[myrank].read_data[i] = FALSE;
    }
    tMPI_Atomic_memory_barrier_rel();
    tMPI_Atomic_set(&(cev->met[myrank].current_sync), synct);

    /* post availability */
    for (i = 0; i < cev->N; i++)
    {
        if (i != myrank)
        {
            tMPI_Event_signal( &(cev->met[i].recv_ev) );
        }
    }

    /* we don't do the copy buffer thing here because it's pointless:
       the processes have to synchronize anyway, because they all
       send and receive. */

    /* do root transfer */
    tMPI_Coll_root_xfer(comm, sendtype, recvtype,
                        sendsize, recvsize,
                        (char*)sendbuf+sendsize*myrank,
                        (char*)recvbuf+recvsize*myrank, &ret);
    cev->met[myrank].read_data[myrank] = TRUE;

    /* and poll data availability */
    n_remaining = cev->N-1;
    while (n_remaining > 0)
    {
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
        tMPI_Profile_wait_start(cur);
#endif
        tMPI_Event_wait( &(cev->met[myrank]).recv_ev );
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
        tMPI_Profile_wait_stop(cur, TMPIWAIT_Coll_recv);
#endif
        for (i = 0; i < cev->N; i++)
        {
            if ((!cev->met[myrank].read_data[i]) &&
                (tMPI_Atomic_get(&(cev->met[i].current_sync)) == synct))
            {
                tMPI_Event_process( &(cev->met[myrank]).recv_ev, 1);
                tMPI_Mult_recv(comm, cev, i, myrank, TMPI_ALLTOALL_TAG,
                               recvtype, recvsize, (char*)recvbuf+recvsize*i,
                               &ret);
                if (ret != TMPI_SUCCESS)
                {
                    return ret;
                }
                cev->met[myrank].read_data[i] = TRUE;
                n_remaining--;
            }
        }
    }

    /* and wait until everybody is done copying our data */
    tMPI_Wait_for_others(cev, myrank);

#ifdef TMPI_PROFILE
    tMPI_Profile_count_stop(cur, TMPIFN_Alltoall);
#endif
    return ret;
}
int tMPI_Thread_barrier_wait(tMPI_Thread_barrier_t *barrier)
{
    int cycle;
    BOOL rc  = FALSE;
    int  ret = 0;
    /*tMPI_Thread_pthread_barrier_t *p;*/

    /* check whether the barrier is initialized */
    if (tMPI_Atomic_get( &(barrier->initialized) ) == 0)
    {
        tMPI_Thread_barrier_init_once(barrier, barrier->threshold);
    }

#if 0
    EnterCriticalSection( &(barrier->barrierp->cs) );
#else
    tMPI_Thread_mutex_lock( &(barrier->barrierp->cs) );
#endif

    cycle = barrier->cycle;

    /* Decrement the count atomically and check if it is zero.
     * This will only be true for the last thread calling us.
     */
    if (--(barrier->count) <= 0)
    {
        barrier->cycle = !barrier->cycle;
        barrier->count = barrier->threshold;
#if 0
        WakeAllConditionVariable( &(barrier->barrierp->cv) );
#else
        tMPI_Thread_cond_broadcast( &(barrier->barrierp->cv) );
#endif
    }
    else
    {
        while (cycle == barrier->cycle)
        {
#if 0
            rc = SleepConditionVariableCS(&(barrier->barrierp->cv),
                                          &(barrier->barrierp->cs),
                                          INFINITE);
            if (!rc)
            {
                ret = -1;
                break;
            }
#else
            rc = tMPI_Thread_cond_wait(&barrier->barrierp->cv,
                                       &barrier->barrierp->cs);
            if (rc != 0)
            {
                break;
            }
#endif
        }
    }
#if 0
    LeaveCriticalSection( &(barrier->barrierp->cs) );
#else
    tMPI_Thread_mutex_unlock( &(barrier->barrierp->cs) );
#endif
    return ret;
}
int tMPI_Gather(void* sendbuf, int sendcount, tMPI_Datatype sendtype,
                void* recvbuf, int recvcount, tMPI_Datatype recvtype,
                int root, tMPI_Comm comm)
{
    int synct;
    struct coll_env *cev;
    int myrank;
    int ret = TMPI_SUCCESS;
    struct tmpi_thread *cur = tMPI_Get_current();

#ifdef TMPI_PROFILE
    tMPI_Profile_count_start(cur);
#endif
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Gather(%p, %d, %p, %p, %d, %p, %d, %p)",
                     sendbuf, sendcount, sendtype,
                     recvbuf, recvcount, recvtype, root, comm);
#endif

    if (!comm)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }
    myrank = tMPI_Comm_seek_rank(comm, cur);

    /* we increase our counter, and determine which coll_env we get */
    cev = tMPI_Get_cev(comm, myrank, &synct);

    if (myrank == root)
    {
        int i;
        int n_remaining = comm->grp.N-1;

        /* do root transfer */
        if (sendbuf != TMPI_IN_PLACE)
        {
            tMPI_Coll_root_xfer(comm, sendtype, recvtype,
                                sendtype->size*sendcount,
                                recvtype->size*recvcount,
                                sendbuf,
                                (char*)recvbuf+myrank*recvcount*recvtype->size,
                                &ret);
        }
        for (i = 0; i < comm->grp.N; i++)
        {
            cev->met[myrank].read_data[i] = FALSE;
        }
        cev->met[myrank].read_data[myrank] = TRUE;

        /* wait for data availability as long as there are xfers to be done */
        while (n_remaining > 0)
        {
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
            tMPI_Profile_wait_start(cur);
#endif
            tMPI_Event_wait( &(cev->met[myrank]).recv_ev );
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
            tMPI_Profile_wait_stop(cur, TMPIWAIT_Coll_recv);
#endif
            /* now check all of them */
            for (i = 0; i < comm->grp.N; i++)
            {
                if (!cev->met[myrank].read_data[i] &&
                    (tMPI_Atomic_get(&(cev->met[i].current_sync)) == synct))
                {
                    tMPI_Mult_recv(comm, cev, i, 0, TMPI_GATHER_TAG, recvtype,
                                   recvcount*recvtype->size,
                                   (char*)recvbuf+i*recvcount*recvtype->size,
                                   &ret);
                    tMPI_Event_process( &(cev->met[myrank]).recv_ev, 1);
                    if (ret != TMPI_SUCCESS)
                    {
                        return ret;
                    }
                    cev->met[myrank].read_data[i] = TRUE;
                    n_remaining--;
                }
            }
        }
    }
    else
    {
        if (!sendbuf) /* don't do pointer arithmetic on a NULL ptr */
        {
            return tMPI_Error(comm, TMPI_ERR_BUF);
        }

        /* first set up the data just to root. */
        ret = tMPI_Post_multi(cev, myrank, 0, TMPI_GATHER_TAG, sendtype,
                              sendcount*sendtype->size, sendbuf, 1, synct,
                              root);
        if (ret != TMPI_SUCCESS)
        {
            return ret;
        }
        /* and wait until root is done copying */
        tMPI_Wait_for_others(cev, myrank);
    }
#ifdef TMPI_PROFILE
    tMPI_Profile_count_stop(cur, TMPIFN_Gather);
#endif
    return ret;
}
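/* Hypothetical usage sketch, not part of the library source: each rank sends
   one int, rank 0 receives one int per rank. TMPI_INT, tMPI_Comm_size() and
   tMPI_Comm_rank() are assumed to be the thread_mpi counterparts of the usual
   MPI names; the fixed-size buffer is just for the sketch. */
static void example_gather(tMPI_Comm comm)
{
    int nranks, myrank;
    int sendval;
    int recvvals[64]; /* assumes at most 64 ranks in this sketch */

    tMPI_Comm_size(comm, &nranks);
    tMPI_Comm_rank(comm, &myrank);
    sendval = myrank;

    tMPI_Gather(&sendval, 1, TMPI_INT, recvvals, 1, TMPI_INT, 0, comm);

    if (myrank == 0)
    {
        /* recvvals[i] now holds the value sent by rank i */
    }
}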
int tMPI_Thread_cond_wait(tMPI_Thread_cond_t *cond, tMPI_Thread_mutex_t *mtx)
{
    BOOL wait_done   = FALSE;
    BOOL last_waiter = FALSE;
    int my_cycle;

    /* check whether the condition is initialized */
    if (tMPI_Atomic_get( &(cond->initialized) ) == 0)
    {
        tMPI_Thread_cond_init_once(cond);
    }
    /* the mutex must have been initialized because it should be locked here */

#if 0
    /* use this code once Vista is the minimum version required */
    ret = SleepConditionVariableCS(&(cond->cv), &(mtx->cs), INFINITE);
    if (!ret)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Failed wait for condition, error code=%d",
                         GetLastError());
        return -1;
    }
#else
    /* serially increase waiter count */
    EnterCriticalSection(&(cond->condp->wtr_lock));
    cond->condp->Nwaiters++;
    my_cycle = cond->condp->cycle;
    LeaveCriticalSection(&(cond->condp->wtr_lock));

    /* now it's safe to release the mutex from the fn call */
    LeaveCriticalSection(&(mtx->mutex->cs));

    /* Loop a wait until we found out we've waited for the right event.
       Note that this loop is potentially a busy-wait loop in bad
       circumstances (higher priority threads, for example). */
    do
    {
        /* do the actual waiting */
        if (WaitForSingleObject( cond->condp->ev, INFINITE ) == WAIT_FAILED)
        {
            tMPI_Fatal_error(TMPI_FARGS, "Failed event wait, error code=%d",
                             GetLastError());
            return -1;
        }

        /* serially check whether we got the right event. */
        EnterCriticalSection(&(cond->condp->wtr_lock));
        wait_done = (cond->condp->Nrelease > 0) &&
                    (cond->condp->cycle != my_cycle);
        LeaveCriticalSection(&(cond->condp->wtr_lock));
    }
    while (!wait_done);

    /* We obtain the mutex from the function call */
    EnterCriticalSection(&(mtx->mutex->cs));

    /* we serially decrease the waiter count and release count */
    EnterCriticalSection(&(cond->condp->wtr_lock));
    cond->condp->Nwaiters--;
    cond->condp->Nrelease--;
    last_waiter = (cond->condp->Nrelease == 0);
    LeaveCriticalSection(&(cond->condp->wtr_lock));

    /* manually release the event if everybody's done with it */
    if (last_waiter)
    {
        if (!ResetEvent( cond->condp->ev ))
        {
            tMPI_Fatal_error(TMPI_FARGS, "Failed event reset, error code=%d",
                             GetLastError());
            return -1;
        }
    }
#endif
    return 0;
}
/* Set the main thread's affinity */
static int tMPI_Set_main_thread_affinity(void)
{
    /* calling thread PROCESSOR_NUMBER */
    PROCESSOR_NUMBER CurrentProcessorNumber;
    /* calling thread GROUP_AFFINITY */
    GROUP_AFFINITY CurrentThreadGroupAffinity;
    /* calling thread NUMA node */
    USHORT CurrentNumaNodeNumber;

    /* we can pre-check because it's atomic */
    if (tMPI_Atomic_get(&main_thread_aff_set) == 0)
    {
        /* this can be a spinlock because the chances of collision are low. */
        tMPI_Spinlock_lock( &main_thread_aff_lock );
        if ( g_ulHighestNumaNodeNumber != 0 )
        {
            func_GetCurrentProcessorNumberEx(&CurrentProcessorNumber);

            /* set the NUMA node affinity for the current thread.
               Failures to set the current thread affinity are ignored,
               as a fringe case can arise on >32 processor systems with
               a 32bit build/code. */
            func_SetThreadIdealProcessorEx(GetCurrentThread(),
                                           &CurrentProcessorNumber,
                                           NULL);

            if (func_GetNumaProcessorNodeEx(&CurrentProcessorNumber,
                                            &CurrentNumaNodeNumber))
            {
                /* for the NUMA node number associated with the current
                   processor number, get the group affinity mask */
                if (func_GetNumaNodeProcessorMaskEx(CurrentNumaNodeNumber,
                                                    &CurrentThreadGroupAffinity))
                {
                    /* set the current thread affinity to prevent it from
                       running on other NUMA nodes */
                    func_SetThreadGroupAffinity(GetCurrentThread(),
                                                &CurrentThreadGroupAffinity,
                                                NULL);
                }
            }
        }
        else
        {
            /* No NUMA. For now, we just do a similar thing. */
            if ( (func_GetCurrentProcessorNumberEx != NULL) &&
                 (func_SetThreadIdealProcessorEx) )
            {
                func_GetCurrentProcessorNumberEx(&CurrentProcessorNumber);
                func_SetThreadIdealProcessorEx(GetCurrentThread(),
                                               &CurrentProcessorNumber,
                                               NULL);
            }
        }
        tMPI_Atomic_set( &main_thread_aff_set, 1);
        tMPI_Spinlock_unlock( &main_thread_aff_lock );
    }
    return 0;
}
void* tMPI_Once_wait(tMPI_Comm comm, void* (*function)(void*), void *param,
                     int *was_first)
{
    int myrank;
    struct coll_sync *csync;
    struct coll_env *cev;
    int syncs;
    void *ret;

    if (!comm)
    {
        tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
        return NULL;
    }
    myrank = tMPI_Comm_seek_rank(comm, tMPI_Get_current());

    /* we increase our counter, and determine which coll_env we get */
    csync = &(comm->csync[myrank]);
    csync->syncs++;
    cev = &(comm->cev[csync->syncs % N_COLL_ENV]);

    /* now do a compare-and-swap on the current_sync */
    syncs = tMPI_Atomic_get( &(cev->coll.current_sync) );
    tMPI_Atomic_memory_barrier_acq();
    if ((csync->syncs - syncs > 0) && /* check if sync was an earlier number.
                                         If it is a later number, we can't
                                         have been the first to arrive here.
                                         Calculating the difference instead
                                         of comparing directly avoids ABA
                                         problems. */
        tMPI_Atomic_cas(&(cev->coll.current_sync), syncs, csync->syncs))
    {
        /* we're the first! */
        ret = function(param);
        if (was_first)
        {
            *was_first = TRUE;
        }

        /* broadcast the output data */
        cev->coll.res = ret;

        tMPI_Atomic_memory_barrier_rel();
        /* signal that we're done */
        tMPI_Atomic_fetch_add(&(cev->coll.current_sync), 1);
        /* we need to keep being in sync */
        csync->syncs++;
    }
    else
    {
        /* we need to wait until the current_sync gets increased again */
        csync->syncs++;
        do
        {
            /*tMPI_Atomic_memory_barrier();*/
            syncs = tMPI_Atomic_get( &(cev->coll.current_sync) );
        }
        while (csync->syncs - syncs > 0); /* difference again due to ABA
                                             problems */
        tMPI_Atomic_memory_barrier_acq();
        ret = cev->coll.res;
    }
    return ret;
}
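/* Hypothetical usage sketch, not part of the library source: unlike tMPI_Once(),
   every rank blocks in tMPI_Once_wait() until the first arriver has run the
   function, and all ranks receive its return value. The example_* helpers are
   made up for illustration. */
static void* example_build_table(void *param)
{
    /* build some shared read-only data structure; here we just pass the
       input through */
    return param;
}

static void example_once_wait_caller(tMPI_Comm comm, void *input)
{
    int   was_first = 0;
    void *table;

    table = tMPI_Once_wait(comm, example_build_table, input, &was_first);
    /* every rank gets the same 'table' pointer; was_first is TRUE on exactly
       one rank */
    (void)table;
}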