static void tMPI_Init_initers(void) { int state; /* we can pre-check because it's atomic */ if (tMPI_Atomic_get(&init_inited) == 0) { /* this can be a spinlock because the chances of collision are low. */ tMPI_Spinlock_lock( &init_init ); state=tMPI_Atomic_get(&init_inited); tMPI_Atomic_memory_barrier_acq(); if (state == 0) { InitializeCriticalSection(&mutex_init); InitializeCriticalSection(&once_init); InitializeCriticalSection(&cond_init); InitializeCriticalSection(&barrier_init); tMPI_Atomic_memory_barrier_rel(); tMPI_Atomic_set(&init_inited, 1); } tMPI_Spinlock_unlock( &init_init ); } }
/* One-time initialization of the library's global critical sections and
   subsystems (Windows implementation), using double-checked locking.

   Returns 0 on success, or the non-zero error code returned by a failing
   sub-initializer (NUMA setup or thread-id-list setup). */
static int tMPI_Init_initers(void)
{
    int state;
    int ret = 0;
    /* we can pre-check because it's atomic */
    if (tMPI_Atomic_get(&init_inited) == 0)
    {
        /* this can be a spinlock because the chances of collision are low. */
        tMPI_Spinlock_lock( &init_init );
        state = tMPI_Atomic_get(&init_inited);
        tMPI_Atomic_memory_barrier_acq();
        if (state == 0)
        {
            InitializeCriticalSection(&mutex_init);
            InitializeCriticalSection(&once_init);
            InitializeCriticalSection(&cond_init);
            InitializeCriticalSection(&barrier_init);
            InitializeCriticalSection(&thread_id_list_lock);

            ret = tMPI_Init_NUMA();
            if (ret != 0)
            {
                goto err;
            }

            ret = tMPI_Thread_id_list_init();
            if (ret != 0)
            {
                goto err;
            }

            /* make the critical-section writes visible before publishing
               the 'initialized' flag to lock-free pre-checkers */
            tMPI_Atomic_memory_barrier_rel();
            tMPI_Atomic_set(&init_inited, 1);
        }
        tMPI_Spinlock_unlock( &init_init );
    }
    return ret;
err:
    /* NOTE(review): on this path init_inited stays 0, so a subsequent call
       will run InitializeCriticalSection again on the already-initialized
       sections — confirm the sub-initializers cannot fail after partial
       init, or add DeleteCriticalSection cleanup before returning. */
    tMPI_Spinlock_unlock( &init_init );
    return ret;
}
/* Run 'function(param)' exactly once among the threads of 'comm' that reach
   this synchronization point, and return its result to all of them.

   The first thread to arrive (decided by an atomic compare-and-swap on the
   collective environment's sync counter) executes the function and publishes
   the result; all others spin until the counter is bumped past their own
   sync number, then read the published result.

   comm      - communicator (NULL raises TMPI_ERR_COMM and returns NULL)
   function  - the function to run exactly once
   param     - argument passed to 'function'
   was_first - if non-NULL, set to TRUE in the thread that ran the function
               (left untouched in the others)
   Returns the value produced by 'function' (as published by the winner). */
void* tMPI_Once_wait(tMPI_Comm comm, void* (*function)(void*), void *param,
                     int *was_first)
{
    int myrank;
    struct coll_sync *csync;
    struct coll_env *cev;
    int syncs;
    void *ret;

    if (!comm)
    {
        tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
        return NULL;
    }
    myrank=tMPI_Comm_seek_rank(comm, tMPI_Get_current());

    /* we increase our counter, and determine which coll_env we get */
    csync=&(comm->csync[myrank]);
    csync->syncs++;
    cev=&(comm->cev[csync->syncs % N_COLL_ENV]);

    /* now do a compare-and-swap on the current_syncc */
    syncs=tMPI_Atomic_get( &(cev->coll.current_sync));
    tMPI_Atomic_memory_barrier_acq();
    if ((csync->syncs - syncs > 0) && /* check if sync was an earlier number.
                                         If it is a later number, we can't
                                         have been the first to arrive here.
                                         Calculating the difference instead
                                         of comparing directly avoids ABA
                                         problems. */
        tMPI_Atomic_cas(&(cev->coll.current_sync), syncs, csync->syncs))
    {
        /* we're the first! */
        ret=function(param);
        if (was_first)
            *was_first=TRUE;

        /* broadcast the output data */
        cev->coll.res=ret;

        /* release barrier: the result must be visible before the waiters
           see the bumped counter */
        tMPI_Atomic_memory_barrier_rel();
        /* signal that we're done */
        tMPI_Atomic_fetch_add(&(cev->coll.current_sync), 1);
        /* we need to keep being in sync */
        csync->syncs++;
    }
    else
    {
        /* we need to wait until the current_syncc gets increased again */
        csync->syncs++;
        do
        {
            /*tMPI_Atomic_memory_barrier();*/
            syncs=tMPI_Atomic_get( &(cev->coll.current_sync) );
        } while (csync->syncs - syncs > 0); /* difference again due to ABA
                                               problems */
        /* acquire barrier pairs with the winner's release above, so the
           published result is safe to read */
        tMPI_Atomic_memory_barrier_acq();
        ret=cev->coll.res;
    }
    return ret;
}
/* Post a send buffer into this thread's collective envelope and signal the
   receiver(s) that data is available.

   cev         - the collective environment to post into
   myrank      - the posting thread's rank (selects the envelope slot)
   index       - buffer slot index within the envelope
   tag         - collective tag receivers will check against
   datatype    - datatype receivers will check against
   bufsize     - number of bytes in 'buf'
   buf         - the data to publish (read directly or via copy buffer)
   n_remaining - number of receivers that must read before we're done
   synct       - sync counter value that marks this post as current
   dest        - receiving rank, or <0 to signal all ranks except myrank

   The release barrier before setting current_sync makes all envelope fields
   visible to receivers before the post becomes observable. */
void tMPI_Post_multi(struct coll_env *cev, int myrank, int index,
                     int tag, tMPI_Datatype datatype, size_t bufsize,
                     void *buf, int n_remaining, int synct, int dest)
{
    int i;
#ifdef USE_COLLECTIVE_COPY_BUFFER
    /* decide based on the number of waiting threads */
    tmpi_bool using_cb=(bufsize < (size_t)(n_remaining*COPY_BUFFER_SIZE));

    cev->met[myrank].using_cb=using_cb;
    if (using_cb)
    {
        /* we set it to NULL initially */
        /*cev->met[myrank].cpbuf[index]=NULL;*/
        tMPI_Atomic_ptr_set(&(cev->met[myrank].cpbuf[index]), NULL);

        tMPI_Atomic_set(&(cev->met[myrank].buf_readcount), 0);
    }
#endif
    cev->met[myrank].tag=tag;
    cev->met[myrank].datatype=datatype;
    cev->met[myrank].buf[index]=buf;
    cev->met[myrank].bufsize[index]=bufsize;
    tMPI_Atomic_set(&(cev->met[myrank].n_remaining), n_remaining);
    /* release barrier: envelope contents must be visible before the
       current_sync publication below */
    tMPI_Atomic_memory_barrier_rel();
    tMPI_Atomic_set(&(cev->met[myrank].current_sync), synct);

    /* publish availability. */
    if (dest<0)
    {
        for(i=0;i<cev->N;i++)
        {
            if (i != myrank)
                tMPI_Event_signal( &(cev->met[i].recv_ev) );
        }
    }
    else
    {
        tMPI_Event_signal( &(cev->met[dest].recv_ev) );
    }

#ifdef USE_COLLECTIVE_COPY_BUFFER
    /* because we've published availability, we can start copying --
       possibly in parallel with the receiver */
    if (using_cb)
    {
        struct tmpi_thread *cur=tMPI_Get_current();
        /* copy the buffer locally. First allocate */
        cev->met[myrank].cb=tMPI_Copy_buffer_list_get( &(cur->cbl_multi) );
        if (cev->met[myrank].cb->size < bufsize)
        {
            fprintf(stderr, "ERROR: cb size too small\n");
            exit(1);
        }
        /* copy to the new buf */
        memcpy(cev->met[myrank].cb->buf, buf, bufsize);

        /* post the new buf; the release barrier orders the memcpy before
           the pointer publication */
        tMPI_Atomic_memory_barrier_rel();
        /*cev->met[myrank].cpbuf[index]=cev->met[myrank].cb->buf;*/
        tMPI_Atomic_ptr_set(&(cev->met[myrank].cpbuf[index]),
                            cev->met[myrank].cb->buf);
    }
#endif
}
/* Receive one posted buffer from rank 'rank' within a collective operation.

   Checks the posted tag/datatype against what we expect, picks the source
   buffer (either the sender's real buffer or, when the copy-buffer
   optimization is active, the sender's copy buffer), copies the data into
   'recvbuf', and decrements the sender's n_remaining counter, signaling the
   sender when we were the last reader.

   comm         - communicator (for error reporting)
   cev          - collective environment the sender posted into
   rank         - the sending rank
   index        - buffer slot index within the sender's envelope
   expected_tag - collective tag we expect the sender to have posted
   recvtype     - datatype we expect
   recvsize     - capacity of 'recvbuf' in bytes
   recvbuf      - destination buffer
   ret          - out: set to a tMPI error code on mismatch/overflow/overlap

   NOTE(review): on tag/datatype mismatch *ret is set but the function does
   NOT return, so the copy below still runs — confirm this fall-through is
   intended. */
void tMPI_Mult_recv(tMPI_Comm comm, struct coll_env *cev, int rank,
                    int index, int expected_tag, tMPI_Datatype recvtype,
                    size_t recvsize, void *recvbuf, int *ret)
{
    size_t sendsize=cev->met[rank].bufsize[index];

    /* check tags, types */
    if ((cev->met[rank].datatype != recvtype ) ||
        (cev->met[rank].tag != expected_tag))
    {
        *ret=tMPI_Error(comm, TMPI_ERR_MULTI_MISMATCH);
    }

    if (sendsize) /* we allow NULL ptrs if there's nothing to xmit */
    {
        void *srcbuf;
#ifdef USE_COLLECTIVE_COPY_BUFFER
        tmpi_bool decrease_ctr=FALSE;
#endif

        if ( sendsize > recvsize )
        {
            *ret=tMPI_Error(comm, TMPI_ERR_XFER_BUFSIZE);
            return;
        }

        /* NOTE(review): this compares the 'buf' field itself with recvbuf;
           if 'buf' is an array of per-slot pointers this likely should be
           buf[index] == recvbuf — verify against the struct definition. */
        if ( cev->met[rank].buf == recvbuf )
        {
            *ret=tMPI_Error(TMPI_COMM_WORLD,TMPI_ERR_XFER_BUF_OVERLAP);
            return;
        }

        /* get source buffer */
#ifdef USE_COLLECTIVE_COPY_BUFFER
        if ( !(cev->met[rank].using_cb))
#endif
        {
            srcbuf=cev->met[rank].buf[index];
        }
#ifdef USE_COLLECTIVE_COPY_BUFFER
        else
        {
            srcbuf=tMPI_Atomic_ptr_get(&(cev->met[rank].cpbuf[index]));
            tMPI_Atomic_memory_barrier_acq();

            if(!srcbuf)
            { /* there was (as of yet) no copied buffer */
                void *try_again_srcbuf;
                /* we need to try checking the pointer again after we
                   increase the read counter, signaling that one more thread
                   is reading. */
                tMPI_Atomic_add_return(&(cev->met[rank].buf_readcount), 1);
                /* a full memory barrier */
                tMPI_Atomic_memory_barrier();
                try_again_srcbuf=tMPI_Atomic_ptr_get(
                                          &(cev->met[rank].cpbuf[index]));
                if (!try_again_srcbuf)
                {
                    /* apparently the copied buffer is not ready yet. We
                       just use the real source buffer. We have already
                       indicated we're reading from the regular buf. */
                    srcbuf=cev->met[rank].buf[index];
                    decrease_ctr=TRUE;
                }
                else
                {
                    /* We tried again, and this time there was a copied
                       buffer. We use that, and indicate that we're not
                       reading from the regular buf. This case should be
                       pretty rare. */
                    tMPI_Atomic_fetch_add(&(cev->met[rank].buf_readcount),-1);
                    tMPI_Atomic_memory_barrier_acq();
                    srcbuf=try_again_srcbuf;
                }
            }

#ifdef TMPI_PROFILE
            if (srcbuf)
                tMPI_Profile_count_buffered_coll_xfer(tMPI_Get_current());
#endif
        }
#endif
        /* copy data */
        memcpy((char*)recvbuf, srcbuf, sendsize);
#ifdef TMPI_PROFILE
        tMPI_Profile_count_coll_xfer(tMPI_Get_current());
#endif

#ifdef USE_COLLECTIVE_COPY_BUFFER
        if (decrease_ctr)
        {
            /* we decrement the read count; potentially releasing the
               buffer. The release barrier orders our memcpy before the
               count becoming visible to the owner. */
            tMPI_Atomic_memory_barrier_rel();
            tMPI_Atomic_fetch_add( &(cev->met[rank].buf_readcount), -1);
        }
#endif
    }
    /* signal one thread ready */
    {
        int reta;
        /* release barrier: our read of the data must complete before the
           sender sees the decremented counter */
        tMPI_Atomic_memory_barrier_rel();
        reta=tMPI_Atomic_add_return( &(cev->met[rank].n_remaining), -1);
        if (reta <= 0)
        {
            tMPI_Event_signal( &(cev->met[rank].send_ev) );
        }
    }
}
/* All-to-all collective: every rank sends 'sendcount' elements to every
   other rank and receives 'recvcount' elements from each.

   Each rank posts per-destination slices of its own sendbuf into its
   collective envelope (one buffer slot per destination rank), signals all
   other ranks, then polls for each other rank's post and copies its slice
   into the matching offset of recvbuf. Finishes by waiting until every
   other rank has read our data.

   Returns TMPI_SUCCESS, or a tMPI error code on NULL comm/buffers or on a
   transfer failure.

   NOTE(review): on a tMPI_Mult_recv failure this returns without calling
   tMPI_Wait_for_others, i.e. without waiting for readers of our own posted
   data — confirm the error path may skip that synchronization. */
int tMPI_Alltoall(void* sendbuf, int sendcount, tMPI_Datatype sendtype,
                  void* recvbuf, int recvcount, tMPI_Datatype recvtype,
                  tMPI_Comm comm)
{
    int synct;
    struct coll_env *cev;
    int myrank;
    int ret = TMPI_SUCCESS;
    int i;
    size_t sendsize = sendtype->size*sendcount;
    size_t recvsize = recvtype->size*recvcount;
    int n_remaining;
    struct tmpi_thread *cur = tMPI_Get_current();

#ifdef TMPI_PROFILE
    tMPI_Profile_count_start(cur);
#endif
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Alltoall(%p, %d, %p, %p, %d, %p, %p)",
                     sendbuf, sendcount, sendtype,
                     recvbuf, recvcount, recvtype, comm);
#endif

    if (!comm)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }
    if (!sendbuf || !recvbuf) /* don't do pointer arithmetic on a NULL ptr */
    {
        return tMPI_Error(comm, TMPI_ERR_BUF);
    }

    myrank = tMPI_Comm_seek_rank(comm, cur);

    /* we increase our counter, and determine which coll_env we get */
    cev = tMPI_Get_cev(comm, myrank, &synct);

    /* post our pointers */
    /* we set up multiple posts, so no Post_multi */
    cev->met[myrank].tag = TMPI_ALLTOALL_TAG;
    cev->met[myrank].datatype = sendtype;
    tMPI_Atomic_set( &(cev->met[myrank].n_remaining), cev->N-1 );
    for (i = 0; i < comm->grp.N; i++)
    {
        /* each destination rank gets its own slice of sendbuf */
        cev->met[myrank].bufsize[i]   = sendsize;
        cev->met[myrank].buf[i]       = (char*)sendbuf+sendsize*i;
        cev->met[myrank].read_data[i] = FALSE;
    }
    /* release barrier: envelope contents must be visible before the
       current_sync publication below */
    tMPI_Atomic_memory_barrier_rel();
    tMPI_Atomic_set(&(cev->met[myrank].current_sync), synct);

    /* post availability */
    for (i = 0; i < cev->N; i++)
    {
        if (i != myrank)
        {
            tMPI_Event_signal( &(cev->met[i].recv_ev) );
        }
    }

    /* we don't do the copy buffer thing here because it's pointless:
       the processes have to synchronize anyway, because they all
       send and receive. */

    /* do root transfer (copy our own slice directly) */
    tMPI_Coll_root_xfer(comm, sendtype, recvtype,
                        sendsize, recvsize,
                        (char*)sendbuf+sendsize*myrank,
                        (char*)recvbuf+recvsize*myrank, &ret);
    cev->met[myrank].read_data[myrank] = TRUE;
    /* and poll data availability */
    n_remaining = cev->N-1;
    while (n_remaining > 0)
    {
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
        tMPI_Profile_wait_start(cur);
#endif
        tMPI_Event_wait( &(cev->met[myrank]).recv_ev );
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
        tMPI_Profile_wait_stop(cur, TMPIWAIT_Coll_recv);
#endif
        for (i = 0; i < cev->N; i++)
        {
            /* receive from any rank whose post has become current and
               whose data we haven't read yet */
            if ((!cev->met[myrank].read_data[i]) &&
                (tMPI_Atomic_get(&(cev->met[i].current_sync)) == synct))
            {
                tMPI_Event_process( &(cev->met[myrank]).recv_ev, 1);
                tMPI_Mult_recv(comm, cev, i, myrank, TMPI_ALLTOALL_TAG,
                               recvtype, recvsize,
                               (char*)recvbuf+recvsize*i, &ret);
                if (ret != TMPI_SUCCESS)
                {
                    return ret;
                }
                cev->met[myrank].read_data[i] = TRUE;
                n_remaining--;
            }
        }
    }

    /* and wait until everybody is done copying our data */
    tMPI_Wait_for_others(cev, myrank);

#ifdef TMPI_PROFILE
    tMPI_Profile_count_stop(cur, TMPIFN_Alltoall);
#endif
    return ret;
}