int tMPI_Cart_get(tMPI_Comm comm, int maxdims, int *dims, int *periods,
                  int *coords)
{
    int i;
    int myrank = tMPI_Comm_seek_rank(comm, tMPI_Get_current());

#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Cart_get(%p, %d, %p, %p, %p)", comm, maxdims,
                     dims, periods, coords);
#endif
    if (!comm)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }
    if (!comm->cart || comm->cart->ndims == 0)
    {
        return TMPI_SUCCESS;
    }

    tMPI_Cart_coords(comm, myrank, maxdims, coords);

    for (i = 0; i < comm->cart->ndims; i++)
    {
        if (i >= maxdims)
        {
            return tMPI_Error(comm, TMPI_ERR_DIMS);
        }
        dims[i]    = comm->cart->dims[i];
        periods[i] = comm->cart->periods[i];
    }

    return TMPI_SUCCESS;
}
int tMPI_Cart_coords(tMPI_Comm comm, int rank, int maxdims, int *coords)
{
    int i;
    int rank_left = rank;

#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Cart_coords(%p, %d, %d, %p)", comm, rank, maxdims,
                     coords);
#endif
    if (!comm)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }
    if (!comm->cart || comm->cart->ndims == 0)
    {
        return TMPI_SUCCESS;
    }
    if (maxdims < comm->cart->ndims)
    {
        return tMPI_Error(comm, TMPI_ERR_DIMS);
    }

    /* again, row-major ordering */
    for (i = comm->cart->ndims-1; i >= 0; i--)
    {
        coords[i]  = rank_left % comm->cart->dims[i];
        rank_left /= comm->cart->dims[i];
    }

    return TMPI_SUCCESS;
}
int tMPI_Cart_create(tMPI_Comm comm_old, int ndims, int *dims, int *periods,
                     int reorder, tMPI_Comm *comm_cart)
{
    int myrank = tMPI_Comm_seek_rank(comm_old, tMPI_Get_current());
    int key    = myrank;
    int color  = 0;
    int Ntot   = 1;
    int i;

#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Cart_create(%p, %d, %p, %p, %d, %p)", comm_old,
                     ndims, dims, periods, reorder, comm_cart);
#endif
    if (!comm_old)
    {
        /* comm_old is NULL here, so report against TMPI_COMM_WORLD */
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }
    /* calculate the total number of procs in cartesian comm */
    for (i = 0; i < ndims; i++)
    {
        Ntot *= dims[i];
    }
    /* refuse to create if there aren't enough procs */
    if (comm_old->grp.N < Ntot)
    {
        *comm_cart = TMPI_COMM_NULL;
        return tMPI_Error(comm_old, TMPI_ERR_CART_CREATE_NPROCS);
    }

    if (key >= Ntot)
    {
        key = TMPI_UNDEFINED;
    }

    if (reorder)
    {
        tMPI_Cart_map(comm_old, ndims, dims, periods, &key);
    }

    if (key == TMPI_UNDEFINED)
    {
        color = TMPI_UNDEFINED;
    }

    tMPI_Comm_split(comm_old, color, key, comm_cart);

    tMPI_Cart_init(comm_cart, ndims, dims, periods);

    return TMPI_SUCCESS;
}
int tMPI_Barrier(tMPI_Comm comm)
{
#ifdef TMPI_PROFILE
    struct tmpi_thread *cur = tMPI_Get_current();
    tMPI_Profile_count_start(cur);
#endif
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Barrier(%p)", comm);
#endif

    if (!comm)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }

    if (comm->grp.N > 1)
    {
#if defined(TMPI_PROFILE)
        tMPI_Profile_wait_start(cur);
#endif
        tMPI_Barrier_wait( &(comm->barrier) );
#if defined(TMPI_PROFILE)
        tMPI_Profile_wait_stop(cur, TMPIWAIT_Barrier);
#endif
    }

#ifdef TMPI_PROFILE
    tMPI_Profile_count_stop(cur, TMPIFN_Barrier);
#endif
    return TMPI_SUCCESS;
}
/* once */
int tMPI_Once(tMPI_Comm comm, void (*function)(void*), void *param,
              int *was_first)
{
    int               myrank;
    int               ret = TMPI_SUCCESS;
    struct coll_sync *csync;
    struct coll_env  *cev;
    int               syncs;

    if (!comm)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }
    myrank = tMPI_Comm_seek_rank(comm, tMPI_Get_current());

    /* we increase our counter, and determine which coll_env we get */
    csync = &(comm->csync[myrank]);
    csync->syncs++;
    cev = &(comm->cev[csync->syncs % N_COLL_ENV]);

    /* now do a compare-and-swap on the current_sync counter */
    syncs = tMPI_Atomic_get( &(cev->coll.current_sync) );
    if ((csync->syncs - syncs > 0) && /* check if sync was an earlier number.
                                         If it is a later number, we can't
                                         have been the first to arrive here. */
        tMPI_Atomic_cas(&(cev->coll.current_sync), syncs, csync->syncs))
    {
        /* we're the first! */
        function(param);
        if (was_first)
        {
            *was_first = TRUE;
        }
    }
    return ret;
}
void *tMPI_Realloc(void *p, size_t size)
{
    void *ret = (void*)realloc(p, size);
    if (!ret)
    {
        tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_MALLOC);
    }
    return ret;
}
void *tMPI_Malloc(size_t size)
{
    void *ret = (void*)malloc(size);
    if (!ret)
    {
        tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_MALLOC);
    }
    return ret;
}
struct tmpi_thread *tMPI_Get_thread(tMPI_Comm comm, int rank)
{
    /* check destination; rank == grp.N would index out of bounds */
    if ( (rank < 0) || (rank >= comm->grp.N) )
    {
        tMPI_Error(comm, TMPI_ERR_GROUP_RANK);
        return NULL;
    }
    return comm->grp.peers[rank];
}
int tMPI_Cart_rank(tMPI_Comm comm, int *coords, int *rank)
{
    int i, mul = 1, ret = 0;

#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Cart_rank(%p, %p, %p)", comm, coords, rank);
#endif
    if (!comm)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }
    if (!comm->cart || comm->cart->ndims == 0)
    {
        return TMPI_SUCCESS;
    }

    /* because of row-major ordering, we count the dimensions down */
    for (i = comm->cart->ndims-1; i >= 0; i--)
    {
        int rcoord = coords[i];
        if (comm->cart->periods[i])
        {
            /* apply periodic boundary conditions */
            rcoord = rcoord % comm->cart->dims[i];
            if (rcoord < 0)
            {
                rcoord += comm->cart->dims[i];
            }
        }
        else
        {
            if (rcoord < 0 || rcoord >= comm->cart->dims[i])
            {
                return tMPI_Error(comm, TMPI_ERR_DIMS);
            }
        }
        ret += mul*rcoord;
        mul *= comm->cart->dims[i];
    }
    *rank = ret;
    return TMPI_SUCCESS;
}
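/* Illustrative usage sketch (not part of thread_mpi): ties together the
   cartesian functions above. It assumes this file's existing includes plus
   the public thread_mpi header are in scope, and that TMPI_COMM_WORLD has at
   least 2*3 = 6 threads; example_cart_usage is a hypothetical helper, not a
   library function. With dims = {2,3} and row-major ordering, rank 5 maps to
   coordinates (1,2) and back. */
static void example_cart_usage(void)
{
    int       dims[2]    = {2, 3};  /* a 2x3 grid, row-major */
    int       periods[2] = {1, 0};  /* periodic in the first dimension only */
    int       coords[2];
    int       rank_back;
    tMPI_Comm comm_cart;

    if (tMPI_Cart_create(TMPI_COMM_WORLD, 2, dims, periods, 0,
                         &comm_cart) != TMPI_SUCCESS)
    {
        return;
    }
    /* rank 5 in a 2x3 row-major grid sits at coordinates (1,2) ... */
    tMPI_Cart_coords(comm_cart, 5, 2, coords);
    /* ... and tMPI_Cart_rank() inverts the mapping back to 5 */
    tMPI_Cart_rank(comm_cart, coords, &rank_back);
    printf("rank 5 -> (%d,%d) -> rank %d\n", coords[0], coords[1], rank_back);

    tMPI_Comm_free(&comm_cart);
}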
int tMPI_Get_count(tMPI_Status *status, tMPI_Datatype datatype, int *count)
{
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Get_count(%p, %p, %p)", status, datatype, count);
#endif
    if (!status)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_STATUS);
    }
    *count = (int)(status->transferred/datatype->size);
    return TMPI_SUCCESS;
}
int tMPI_Cart_map(tMPI_Comm comm, int ndims, int *dims, int *periods,
                  int *newrank)
{
    /* this function doesn't actually do anything beyond returning the
       current rank (or TMPI_UNDEFINED if it doesn't fit in the new
       topology) */
    int myrank = tMPI_Comm_seek_rank(comm, tMPI_Get_current());
    int Ntot   = 1;
    int i;

#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Cart_map(%p, %d, %p, %p, %p)", comm, ndims, dims,
                     periods, newrank);
#endif
    if (!comm)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }
    if (!periods)
    {
        return tMPI_Error(comm, TMPI_ERR_DIMS);
    }

    /* calculate the total number of procs in cartesian comm */
    for (i = 0; i < ndims; i++)
    {
        Ntot *= dims[i];
    }

    if (myrank >= Ntot)
    {
        *newrank = TMPI_UNDEFINED;
    }
    else
    {
        *newrank = myrank;
    }

    return TMPI_SUCCESS;
}
void tMPI_Coll_root_xfer(tMPI_Comm comm,
                         tMPI_Datatype sendtype, tMPI_Datatype recvtype,
                         size_t sendsize, size_t recvsize,
                         void *sendbuf, void *recvbuf, int *ret)
{
    /* do root transfer */
    if (recvsize < sendsize)
    {
        *ret = tMPI_Error(comm, TMPI_ERR_XFER_BUFSIZE);
        return;
    }
    if (recvtype != sendtype)
    {
        *ret = tMPI_Error(comm, TMPI_ERR_MULTI_MISMATCH);
        return;
    }
    if ( sendbuf == recvbuf )
    {
        *ret = tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_XFER_BUF_OVERLAP);
        return;
    }

    memcpy(recvbuf, sendbuf, sendsize);
}
int tMPI_Cartdim_get(tMPI_Comm comm, int *ndims)
{
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Cartdim_get(%p, %p)", comm, ndims);
#endif
    if (!comm)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }
    if (!comm->cart || comm->cart->ndims == 0)
    {
        return TMPI_SUCCESS;
    }
    *ndims = comm->cart->ndims;
    return TMPI_SUCCESS;
}
/* topology functions */
int tMPI_Topo_test(tMPI_Comm comm, int *status)
{
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Topo_test(%p, %p)", comm, status);
#endif
    if (!comm)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }

    if (comm->cart)
    {
        *status = TMPI_CART;
    }
    /*else if (comm->graph) status=MPI_GRAPH;*/
    else
    {
        *status = TMPI_UNDEFINED;
    }

    return TMPI_SUCCESS;
}
int tMPI_Gather(void* sendbuf, int sendcount, tMPI_Datatype sendtype,
                void* recvbuf, int recvcount, tMPI_Datatype recvtype,
                int root, tMPI_Comm comm)
{
    int                 synct;
    struct coll_env    *cev;
    int                 myrank;
    int                 ret = TMPI_SUCCESS;
    struct tmpi_thread *cur = tMPI_Get_current();

#ifdef TMPI_PROFILE
    tMPI_Profile_count_start(cur);
#endif
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Gather(%p, %d, %p, %p, %d, %p, %d, %p)",
                     sendbuf, sendcount, sendtype,
                     recvbuf, recvcount, recvtype, root, comm);
#endif

    if (!comm)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }
    myrank = tMPI_Comm_seek_rank(comm, cur);

    /* we increase our counter, and determine which coll_env we get */
    cev = tMPI_Get_cev(comm, myrank, &synct);

    if (myrank == root)
    {
        int i;
        int n_remaining = comm->grp.N-1;
        /* do root transfer */
        if (sendbuf != TMPI_IN_PLACE)
        {
            tMPI_Coll_root_xfer(comm, sendtype, recvtype,
                                sendtype->size*sendcount,
                                recvtype->size*recvcount,
                                sendbuf,
                                (char*)recvbuf+myrank*recvcount*recvtype->size,
                                &ret);
        }
        for (i = 0; i < comm->grp.N; i++)
        {
            cev->met[myrank].read_data[i] = FALSE;
        }
        cev->met[myrank].read_data[myrank] = TRUE;

        /* wait for data availability as long as there are xfers to be done */
        while (n_remaining > 0)
        {
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
            tMPI_Profile_wait_start(cur);
#endif
            tMPI_Event_wait( &(cev->met[myrank]).recv_ev );
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
            tMPI_Profile_wait_stop(cur, TMPIWAIT_Coll_recv);
#endif
            /* now check all of them */
            for (i = 0; i < comm->grp.N; i++)
            {
                if (!cev->met[myrank].read_data[i] &&
                    (tMPI_Atomic_get(&(cev->met[i].current_sync)) == synct))
                {
                    tMPI_Mult_recv(comm, cev, i, 0, TMPI_GATHER_TAG, recvtype,
                                   recvcount*recvtype->size,
                                   (char*)recvbuf+i*recvcount*recvtype->size,
                                   &ret);
                    tMPI_Event_process( &(cev->met[myrank]).recv_ev, 1);
                    if (ret != TMPI_SUCCESS)
                    {
                        return ret;
                    }
                    cev->met[myrank].read_data[i] = TRUE;
                    n_remaining--;
                }
            }
        }
    }
    else
    {
        if (!sendbuf) /* don't do pointer arithmetic on a NULL ptr */
        {
            return tMPI_Error(comm, TMPI_ERR_BUF);
        }

        /* first set up the data just to root. */
        ret = tMPI_Post_multi(cev, myrank, 0, TMPI_GATHER_TAG, sendtype,
                              sendcount*sendtype->size, sendbuf, 1, synct,
                              root);
        if (ret != TMPI_SUCCESS)
        {
            return ret;
        }
        /* and wait until root is done copying */
        tMPI_Wait_for_others(cev, myrank);
    }
#ifdef TMPI_PROFILE
    tMPI_Profile_count_stop(cur, TMPIFN_Gather);
#endif
    return ret;
}
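/* Illustrative usage sketch (not part of thread_mpi): each thread contributes
   one int and the root gathers them in rank order. It is written against the
   helpers used in this file; TMPI_INT is assumed to be the usual thread_mpi
   integer datatype handle, and example_gather_usage is hypothetical. */
static int example_gather_usage(tMPI_Comm comm, int root)
{
    int  myrank = tMPI_Comm_seek_rank(comm, tMPI_Get_current());
    int  N      = tMPI_Comm_N(comm);
    int  myval  = myrank*myrank;  /* arbitrary per-thread payload */
    int *all    = NULL;
    int  ret;

    if (myrank == root)
    {
        /* only the root needs a receive buffer of N*recvcount elements */
        all = (int*)tMPI_Malloc(N*sizeof(int));
    }
    ret = tMPI_Gather(&myval, 1, TMPI_INT, all, 1, TMPI_INT, root, comm);
    /* on success, all[i] at the root now holds rank i's value */
    free(all); /* free(NULL) is a no-op for the non-root threads */
    return ret;
}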
void tMPI_Mult_recv(tMPI_Comm comm, struct coll_env *cev, int rank,
                    int index, int expected_tag, tMPI_Datatype recvtype,
                    size_t recvsize, void *recvbuf, int *ret)
{
    size_t sendsize = cev->met[rank].bufsize[index];

    /* check tags, types */
    if ((cev->met[rank].datatype != recvtype ) ||
        (cev->met[rank].tag != expected_tag))
    {
        *ret = tMPI_Error(comm, TMPI_ERR_MULTI_MISMATCH);
    }

    if (sendsize) /* we allow NULL ptrs if there's nothing to xmit */
    {
        void *srcbuf;
#ifdef USE_COLLECTIVE_COPY_BUFFER
        tmpi_bool decrease_ctr = FALSE;
#endif

        if ( sendsize > recvsize )
        {
            *ret = tMPI_Error(comm, TMPI_ERR_XFER_BUFSIZE);
            return;
        }

        if ( cev->met[rank].buf == recvbuf )
        {
            *ret = tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_XFER_BUF_OVERLAP);
            return;
        }

        /* get source buffer */
#ifdef USE_COLLECTIVE_COPY_BUFFER
        if ( !(cev->met[rank].using_cb))
#endif
        {
            srcbuf = cev->met[rank].buf[index];
        }
#ifdef USE_COLLECTIVE_COPY_BUFFER
        else
        {
            srcbuf = tMPI_Atomic_ptr_get(&(cev->met[rank].cpbuf[index]));
            tMPI_Atomic_memory_barrier_acq();

            if (!srcbuf)
            {
                /* there was (as of yet) no copied buffer */
                void *try_again_srcbuf;
                /* we need to try checking the pointer again after we
                   increase the read counter, signaling that one more thread
                   is reading. */
                tMPI_Atomic_add_return(&(cev->met[rank].buf_readcount), 1);
                /* a full memory barrier */
                tMPI_Atomic_memory_barrier();
                try_again_srcbuf = tMPI_Atomic_ptr_get(
                                       &(cev->met[rank].cpbuf[index]));
                if (!try_again_srcbuf)
                {
                    /* apparently the copied buffer is not ready yet. We
                       just use the real source buffer. We have already
                       indicated we're reading from the regular buf. */
                    srcbuf       = cev->met[rank].buf[index];
                    decrease_ctr = TRUE;
                }
                else
                {
                    /* We tried again, and this time there was a copied
                       buffer. We use that, and indicate that we're not
                       reading from the regular buf. This case should be
                       pretty rare. */
                    tMPI_Atomic_fetch_add(&(cev->met[rank].buf_readcount), -1);
                    tMPI_Atomic_memory_barrier_acq();
                    srcbuf = try_again_srcbuf;
                }
            }

#ifdef TMPI_PROFILE
            if (srcbuf)
            {
                tMPI_Profile_count_buffered_coll_xfer(tMPI_Get_current());
            }
#endif
        }
#endif
        /* copy data */
        memcpy((char*)recvbuf, srcbuf, sendsize);
#ifdef TMPI_PROFILE
        tMPI_Profile_count_coll_xfer(tMPI_Get_current());
#endif

#ifdef USE_COLLECTIVE_COPY_BUFFER
        if (decrease_ctr)
        {
            /* we decrement the read count; potentially releasing the
               buffer. */
            tMPI_Atomic_memory_barrier_rel();
            tMPI_Atomic_fetch_add( &(cev->met[rank].buf_readcount), -1);
        }
#endif
    }
    /* signal one thread ready */
    {
        int reta;
        tMPI_Atomic_memory_barrier_rel();
        reta = tMPI_Atomic_add_return( &(cev->met[rank].n_remaining), -1);
        if (reta <= 0)
        {
            tMPI_Event_signal( &(cev->met[rank].send_ev) );
        }
    }
}
/* this is the main comm creation function. All other functions that create
   comms use this */
int tMPI_Comm_split(tMPI_Comm comm, int color, int key, tMPI_Comm *newcomm)
{
    int                 i, j;
    int                 N = tMPI_Comm_N(comm);
    volatile tMPI_Comm *newcomm_list;
    volatile int        colors[MAX_PREALLOC_THREADS]; /* array with the colors
                                                         of each thread */
    volatile int        keys[MAX_PREALLOC_THREADS];   /* same for keys (only
                                                         one of the threads
                                                         actually supplies
                                                         these arrays to the
                                                         comm structure) */
    tmpi_bool           i_am_first = FALSE;
    int                 myrank     = tMPI_Comm_seek_rank(comm,
                                                         tMPI_Get_current());
    struct tmpi_split  *spl;
    int                 ret;

#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Comm_split(%p, %d, %d, %p)", comm, color, key,
                     newcomm);
#endif
    if (!comm)
    {
        *newcomm = NULL;
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }

    ret = tMPI_Thread_mutex_lock(&(comm->comm_create_lock));
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    /* first get the colors */
    if (!comm->new_comm)
    {
        /* i am apparently first */
        comm->split    = (struct tmpi_split*)
            tMPI_Malloc(sizeof(struct tmpi_split));
        comm->new_comm = (tMPI_Comm*)tMPI_Malloc(N*sizeof(tMPI_Comm));
        if (N <= MAX_PREALLOC_THREADS)
        {
            comm->split->colors = colors;
            comm->split->keys   = keys;
        }
        else
        {
            comm->split->colors = (int*)tMPI_Malloc(N*sizeof(int));
            comm->split->keys   = (int*)tMPI_Malloc(N*sizeof(int));
        }
        comm->split->Ncol_init  = tMPI_Comm_N(comm);
        comm->split->can_finish = FALSE;
        i_am_first              = TRUE;
        /* the main communicator contains a list the size of grp.N */
    }
    newcomm_list = comm->new_comm; /* we copy it to the local stack because
                                      we can later erase comm->new_comm
                                      safely */
    spl = comm->split;             /* we do the same for spl */
    spl->colors[myrank] = color;
    spl->keys[myrank]   = key;
    spl->Ncol_init--;

    if (spl->Ncol_init == 0)
    {
        ret = tMPI_Thread_cond_signal(&(comm->comm_create_prep));
        if (ret != 0)
        {
            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
        }
    }

    if (!i_am_first)
    {
        /* all other threads can just wait until the creator thread is
           finished */
        while (!spl->can_finish)
        {
            ret = tMPI_Thread_cond_wait(&(comm->comm_create_finish),
                                        &(comm->comm_create_lock) );
            if (ret != 0)
            {
                return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
            }
        }
    }
    else
    {
        int        Ncomms = 0;
        int        comm_color_[MAX_PREALLOC_THREADS];
        int        comm_N_[MAX_PREALLOC_THREADS];
        int       *comm_color = comm_color_; /* there can't be more comms
                                                than N */
        int       *comm_N     = comm_N_;     /* the number of procs in a
                                                group */
        int       *comm_groups;              /* the groups */
        tMPI_Comm *comms;                    /* the communicators */

        /* wait for the colors to be done */
        /*if (N>1)*/
        while (spl->Ncol_init > 0)
        {
            ret = tMPI_Thread_cond_wait(&(comm->comm_create_prep),
                                        &(comm->comm_create_lock));
            if (ret != 0)
            {
                return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
            }
        }

        /* reset the state so that a new comm creating function can run */
        spl->Ncol_destroy = N;
        comm->new_comm    = 0;
        comm->split       = 0;

        comm_groups = (int*)tMPI_Malloc(N*N*sizeof(int));
        if (N > MAX_PREALLOC_THREADS)
        {
            comm_color = (int*)tMPI_Malloc(N*sizeof(int));
            comm_N     = (int*)tMPI_Malloc(N*sizeof(int));
        }

        /* count colors, allocate and split up communicators */
        tMPI_Split_colors(N, (int*)spl->colors, (int*)spl->keys,
                          &Ncomms, comm_N, comm_color, comm_groups);

        /* allocate a bunch of communicators */
        comms = (tMPI_Comm*)tMPI_Malloc(Ncomms*sizeof(tMPI_Comm));
        for (i = 0; i < Ncomms; i++)
        {
            ret = tMPI_Comm_alloc(&(comms[i]), comm, comm_N[i]);
            if (ret != TMPI_SUCCESS)
            {
                return ret;
            }
        }

        /* now distribute the comms */
        for (i = 0; i < Ncomms; i++)
        {
            comms[i]->grp.N = comm_N[i];
            for (j = 0; j < comm_N[i]; j++)
            {
                comms[i]->grp.peers[j] =
                    comm->grp.peers[comm_groups[i*comm->grp.N + j]];
            }
        }
        /* and put them into the newcomm_list */
        for (i = 0; i < N; i++)
        {
            newcomm_list[i] = TMPI_COMM_NULL;
            for (j = 0; j < Ncomms; j++)
            {
                if (spl->colors[i] == comm_color[j])
                {
                    newcomm_list[i] = comms[j];
                    break;
                }
            }
        }

#ifdef TMPI_DEBUG
        /* output */
        for (i = 0; i < Ncomms; i++)
        {
            printf("Group %d (color %d) has %d members: ",
                   i, comm_color[i], comm_N[i]);
            for (j = 0; j < comm_N[i]; j++)
            {
                printf(" %d ", comm_groups[comm->grp.N*i + j]);
            }

            printf(" rank: ");
            for (j = 0; j < comm_N[i]; j++)
            {
                printf(" %d ", spl->keys[comm_groups[N*i + j]]);
            }
            printf(" color: ");
            for (j = 0; j < comm_N[i]; j++)
            {
                printf(" %d ", spl->colors[comm_groups[N*i + j]]);
            }
            printf("\n");
        }
#endif
        if (N > MAX_PREALLOC_THREADS)
        {
            free((int*)spl->colors);
            free((int*)spl->keys);
            free(comm_color);
            free(comm_N);
        }
        free(comm_groups);
        free(comms);
        spl->can_finish = TRUE;

        /* tell the waiting threads that there's a comm ready */
        ret = tMPI_Thread_cond_broadcast(&(comm->comm_create_finish));
        if (ret != 0)
        {
            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
        }
    }
    /* here the individual threads get their comm object */
    *newcomm = newcomm_list[myrank];

    /* free when we have assigned them all, so we can reuse the object */
    spl->Ncol_destroy--;
    if (spl->Ncol_destroy == 0)
    {
        free((void*)newcomm_list);
        free(spl);
    }

    ret = tMPI_Thread_mutex_unlock(&(comm->comm_create_lock));
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }

    return TMPI_SUCCESS;
}
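/* Illustrative usage sketch (not part of thread_mpi): splits a communicator
   into an "even" and an "odd" sub-communicator, ordered by the original
   rank. It uses only functions defined in this file; example_split_usage is
   a hypothetical helper. */
static int example_split_usage(tMPI_Comm comm)
{
    int       myrank = tMPI_Comm_seek_rank(comm, tMPI_Get_current());
    int       color  = myrank % 2; /* threads with equal color end up
                                      together */
    int       key    = myrank;     /* ordering within the new communicator */
    tMPI_Comm newcomm;
    int       ret;

    ret = tMPI_Comm_split(comm, color, key, &newcomm);
    if (ret != TMPI_SUCCESS)
    {
        return ret;
    }
    /* ... use newcomm for collectives within the even/odd subgroup ... */
    return tMPI_Comm_free(&newcomm);
}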
int tMPI_Comm_destroy(tMPI_Comm comm, tmpi_bool do_link_lock)
{
    int i;
    int ret;

    free(comm->grp.peers);
    for (i = 0; i < comm->N_reduce_iter; i++)
    {
        free(comm->reduce_barrier[i]);
    }
    free(comm->reduce_barrier);
    free(comm->N_reduce);

    for (i = 0; i < N_COLL_ENV; i++)
    {
        tMPI_Coll_env_destroy( &(comm->cev[i]) );
    }
    for (i = 0; i < comm->grp.N; i++)
    {
        tMPI_Coll_sync_destroy( &(comm->csync[i]) );
    }
    free(comm->cev);
    free(comm->csync);

    ret = tMPI_Thread_mutex_destroy( &(comm->comm_create_lock) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    ret = tMPI_Thread_cond_destroy( &(comm->comm_create_prep) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    ret = tMPI_Thread_cond_destroy( &(comm->comm_create_finish) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }

    free((void*)comm->reduce_sendbuf);
    free((void*)comm->reduce_recvbuf);

    if (comm->cart)
    {
        tMPI_Cart_destroy( comm->cart );
        free(comm->cart);
    }

    /* remove ourselves from the circular list */
    if (do_link_lock)
    {
        ret = tMPI_Thread_mutex_lock( &(tmpi_global->comm_link_lock) );
        if (ret != 0)
        {
            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
        }
    }
    if (comm->next)
    {
        comm->next->prev = comm->prev;
    }
    if (comm->prev)
    {
        comm->prev->next = comm->next;
    }
    free(comm);
    if (do_link_lock)
    {
        ret = tMPI_Thread_mutex_unlock( &(tmpi_global->comm_link_lock) );
        if (ret != 0)
        {
            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
        }
    }
    return TMPI_SUCCESS;
}
void* tMPI_Once_wait(tMPI_Comm comm, void* (*function)(void*), void *param,
                     int *was_first)
{
    int               myrank;
    struct coll_sync *csync;
    struct coll_env  *cev;
    int               syncs;
    void             *ret;

    if (!comm)
    {
        tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
        return NULL;
    }
    myrank = tMPI_Comm_seek_rank(comm, tMPI_Get_current());

    /* we increase our counter, and determine which coll_env we get */
    csync = &(comm->csync[myrank]);
    csync->syncs++;
    cev = &(comm->cev[csync->syncs % N_COLL_ENV]);

    /* now do a compare-and-swap on the current_sync counter */
    syncs = tMPI_Atomic_get( &(cev->coll.current_sync) );
    tMPI_Atomic_memory_barrier_acq();
    if ((csync->syncs - syncs > 0) && /* check if sync was an earlier number.
                                         If it is a later number, we can't
                                         have been the first to arrive here.
                                         Calculating the difference instead
                                         of comparing directly avoids ABA
                                         problems. */
        tMPI_Atomic_cas(&(cev->coll.current_sync), syncs, csync->syncs))
    {
        /* we're the first! */
        ret = function(param);
        if (was_first)
        {
            *was_first = TRUE;
        }

        /* broadcast the output data */
        cev->coll.res = ret;

        tMPI_Atomic_memory_barrier_rel();
        /* signal that we're done */
        tMPI_Atomic_fetch_add(&(cev->coll.current_sync), 1);
        /* we need to keep being in sync */
        csync->syncs++;
    }
    else
    {
        /* we need to wait until the current_sync counter gets increased
           again */
        csync->syncs++;
        do
        {
            /*tMPI_Atomic_memory_barrier();*/
            syncs = tMPI_Atomic_get( &(cev->coll.current_sync) );
        }
        while (csync->syncs - syncs > 0); /* difference again due to ABA
                                             problems */
        tMPI_Atomic_memory_barrier_acq();
        ret = cev->coll.res;
    }
    return ret;
}
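/* Illustrative usage sketch (not part of thread_mpi): lazily creates one
   shared lookup table per communicator. Exactly one thread runs the
   allocator; every thread returns the same pointer. Both alloc_table and
   example_once_wait_usage are hypothetical helpers introduced only for this
   sketch. */
static void *alloc_table(void *param)
{
    size_t  n   = *(size_t*)param;
    double *tab = (double*)tMPI_Malloc(n*sizeof(double));
    size_t  i;

    for (i = 0; i < n; i++)
    {
        tab[i] = (double)i; /* fill with some shared, read-only data */
    }
    return tab;
}

static double *example_once_wait_usage(tMPI_Comm comm)
{
    size_t n         = 1024;
    int    was_first = FALSE;

    /* all threads of comm get the pointer produced by the single call */
    return (double*)tMPI_Once_wait(comm, alloc_table, &n, &was_first);
}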
int tMPI_Comm_alloc(tMPI_Comm *newcomm, tMPI_Comm parent, int N)
{
    struct tmpi_comm_ *retc;
    int                i;
    int                ret;

    retc = (struct tmpi_comm_*)tMPI_Malloc(sizeof(struct tmpi_comm_));
    if (retc == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }

    retc->grp.peers = (struct tmpi_thread**)tMPI_Malloc(
            sizeof(struct tmpi_thread*)*Nthreads);
    if (retc->grp.peers == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }
    retc->grp.N = N;

    ret = tMPI_Thread_mutex_init( &(retc->comm_create_lock) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    ret = tMPI_Thread_cond_init( &(retc->comm_create_prep) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    ret = tMPI_Thread_cond_init( &(retc->comm_create_finish) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }

    retc->split    = NULL;
    retc->new_comm = NULL;
    /* we have no topology to start out with */
    retc->cart = NULL;
    /*retc->graph=NULL;*/

    /* we start counting at 0 */
    tMPI_Atomic_set( &(retc->destroy_counter), 0);

    /* initialize the main barrier */
    tMPI_Barrier_init(&(retc->barrier), N);

    /* the reduce barriers */
    {
        /* First calculate the number of reduce barriers */
        int Niter = 0; /* the iteration number */
        int Nred  = N; /* the number of reduce barriers for this iteration */
        while (Nred > 1)
        {
            /* Nred becomes Nred/2 + a rest term, because a solitary
               process at the end of the list must still be accounted for */
            Nred   = Nred/2 + Nred%2;
            Niter += 1;
        }

        retc->N_reduce_iter = Niter;
        /* allocate the list */
        retc->reduce_barrier = (tMPI_Barrier_t**)
            tMPI_Malloc(sizeof(tMPI_Barrier_t*)*(Niter+1));
        if (retc->reduce_barrier == NULL)
        {
            return TMPI_ERR_NO_MEM;
        }
        retc->N_reduce = (int*)tMPI_Malloc(sizeof(int)*(Niter+1));
        if (retc->N_reduce == NULL)
        {
            return TMPI_ERR_NO_MEM;
        }

        /* we re-set Nred to N */
        Nred = N;
        for (i = 0; i < Niter; i++)
        {
            int j;

            Nred              = Nred/2 + Nred%2;
            retc->N_reduce[i] = Nred;
            /* allocate the sub-list */
            retc->reduce_barrier[i] = (tMPI_Barrier_t*)
                tMPI_Malloc(sizeof(tMPI_Barrier_t)*(Nred));
            if (retc->reduce_barrier[i] == NULL)
            {
                return TMPI_ERR_NO_MEM;
            }
            for (j = 0; j < Nred; j++)
            {
                tMPI_Barrier_init(&(retc->reduce_barrier[i][j]), 2);
            }
        }
    }

    /* the reduce buffers */
    retc->reduce_sendbuf = (tMPI_Atomic_ptr_t*)
        tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*Nthreads);
    if (retc->reduce_sendbuf == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }
    retc->reduce_recvbuf = (tMPI_Atomic_ptr_t*)
        tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*Nthreads);
    if (retc->reduce_recvbuf == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }

    if (parent)
    {
        retc->erh = parent->erh;
    }
    else
    {
        retc->erh = TMPI_ERRORS_ARE_FATAL;
    }

    /* coll_env objects */
    retc->cev = (struct coll_env*)tMPI_Malloc(sizeof(struct coll_env)*
                                              N_COLL_ENV);
    if (retc->cev == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }
    for (i = 0; i < N_COLL_ENV; i++)
    {
        ret = tMPI_Coll_env_init( &(retc->cev[i]), N);
        if (ret != TMPI_SUCCESS)
        {
            return ret;
        }
    }
    /* multi_sync objects */
    retc->csync = (struct coll_sync*)tMPI_Malloc(sizeof(struct coll_sync)*N);
    if (retc->csync == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }
    for (i = 0; i < N; i++)
    {
        ret = tMPI_Coll_sync_init( &(retc->csync[i]), N);
        if (ret != TMPI_SUCCESS)
        {
            return ret;
        }
    }

    ret = tMPI_Thread_mutex_lock( &(tmpi_global->comm_link_lock) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    /* we insert ourselves in the circular list, after TMPI_COMM_WORLD */
    if (TMPI_COMM_WORLD)
    {
        retc->next = TMPI_COMM_WORLD;
        retc->prev = TMPI_COMM_WORLD->prev;

        TMPI_COMM_WORLD->prev->next = retc;
        TMPI_COMM_WORLD->prev       = retc;
    }
    else
    {
        retc->prev = retc->next = retc;
    }
    ret = tMPI_Thread_mutex_unlock( &(tmpi_global->comm_link_lock) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }

    *newcomm = retc;
    return TMPI_SUCCESS;
}
void tMPI_Start_threads(tmpi_bool main_returns, int N, int *argc, char ***argv,
                        void (*start_fn)(void*), void *start_arg,
                        int (*start_fn_main)(int, char**))
{
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Start_threads(%d, %p, %p, %p, %p)", N, argc,
                     argv, start_fn, start_arg);
#endif
    if (N > 0)
    {
        int i;
        int set_affinity = FALSE;

        tmpi_finalized = FALSE;
        Nthreads       = N;

        /* allocate global data */
        tmpi_global = (struct tmpi_global*)
            tMPI_Malloc(sizeof(struct tmpi_global));
        tMPI_Global_init(tmpi_global, N);

        /* allocate world and thread data */
        threads = (struct tmpi_thread*)
            tMPI_Malloc(sizeof(struct tmpi_thread)*N);
        tMPI_Comm_alloc(&TMPI_COMM_WORLD, NULL, N);
        TMPI_GROUP_EMPTY = tMPI_Group_alloc();

        if (tMPI_Thread_key_create(&id_key, NULL))
        {
            tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
        }
        for (i = 0; i < N; i++)
        {
            TMPI_COMM_WORLD->grp.peers[i] = &(threads[i]);

            /* copy argc, argv */
            if (argc && argv)
            {
                int j;
                threads[i].argc = *argc;
                threads[i].argv = (char**)tMPI_Malloc(threads[i].argc*
                                                      sizeof(char*));
                for (j = 0; j < threads[i].argc; j++)
                {
#if !(defined( _WIN32 ) || defined( _WIN64 ))
                    threads[i].argv[j] = strdup( (*argv)[j] );
#else
                    threads[i].argv[j] = _strdup( (*argv)[j] );
#endif
                }
            }
            else
            {
                threads[i].argc = 0;
                threads[i].argv = NULL;
            }
            threads[i].start_fn      = start_fn;
            threads[i].start_fn_main = start_fn_main;
            threads[i].start_arg     = start_arg;
        }

        /* now check whether to set affinity */
#ifdef TMPI_THREAD_AFFINITY
        {
            int nhw = tMPI_Thread_get_hw_number();
            if ((nhw > 1) && (nhw == N))
            {
                set_affinity = TRUE;
            }
        }
#endif

        for (i = 1; i < N; i++) /* zero is the main thread */
        {
            int ret;

            if (set_affinity)
            {
                ret = tMPI_Thread_create_aff(&(threads[i].thread_id),
                                             tMPI_Thread_starter,
                                             (void*)&(threads[i]) );
            }
            else
            {
                ret = tMPI_Thread_create(&(threads[i].thread_id),
                                         tMPI_Thread_starter,
                                         (void*)&(threads[i]) );
            }
            if (ret)
            {
                tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
            }
        }
        /* the main thread now also runs start_fn if we don't want it to
           return */
        if (!main_returns)
        {
            tMPI_Thread_starter((void*)&(threads[0]));
        }
        else
        {
            tMPI_Thread_init(&(threads[0]));
        }
    }
}
int tMPI_Comm_free(tMPI_Comm *comm)
{
    int size;
    int sum;
    int ret;
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Comm_free(%p)", comm);
#endif
#ifndef TMPI_STRICT
    if (!*comm)
    {
        return TMPI_SUCCESS;
    }

    if ((*comm)->grp.N > 1)
    {
        /* we remove ourselves from the comm. */
        int myrank = tMPI_Comm_seek_rank(*comm, tMPI_Get_current());

        ret = tMPI_Thread_mutex_lock(&((*comm)->comm_create_lock));
        if (ret != 0)
        {
            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
        }
        (*comm)->grp.peers[myrank] = (*comm)->grp.peers[(*comm)->grp.N-1];
        (*comm)->grp.N--;
        ret = tMPI_Thread_mutex_unlock(&((*comm)->comm_create_lock));
        if (ret != 0)
        {
            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
        }
    }
    else
    {
        /* we're the last one so we can safely destroy it */
        ret = tMPI_Comm_destroy(*comm, TRUE);
        if (ret != 0)
        {
            return ret;
        }
    }
#else
    /* This is correct if programs actually treat Comm_free as a collective
       call */
    if (!*comm)
    {
        return TMPI_SUCCESS;
    }

    size = (*comm)->grp.N;

    /* we add 1 to the destroy counter and actually deallocate if the
       counter reaches N. */
    sum = tMPI_Atomic_fetch_add( &((*comm)->destroy_counter), 1) + 1;
    /* this is a collective call on a shared data structure, so only one
       process (the last one in this case) should do anything */
    if (sum == size)
    {
        ret = tMPI_Comm_destroy(*comm, TRUE);
        if (ret != 0)
        {
            return ret;
        }
    }
#endif
    return TMPI_SUCCESS;
}
int tMPI_Finalize(void)
{
    int i;

#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Finalize()");
#endif
#ifdef TMPI_DEBUG
    printf("%5d: tMPI_Finalize called\n", tMPI_This_threadnr());
    fflush(stdout);
#endif

#ifdef TMPI_PROFILE
    {
        struct tmpi_thread *cur = tMPI_Get_current();

        tMPI_Profile_stop( &(cur->profile) );
        tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );

        if (tMPI_Is_master())
        {
            tMPI_Profiles_summarize(Nthreads, threads);
        }
    }
#endif
    tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );

    if (tMPI_Is_master())
    {
        /* we just wait for all threads to finish; the order isn't very
           relevant, as all threads should arrive at their endpoints soon. */
        for (i = 1; i < Nthreads; i++)
        {
            if (tMPI_Thread_join(threads[i].thread_id, NULL))
            {
                tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_FINALIZE);
            }
            tMPI_Thread_destroy(&(threads[i]));
        }
        /* at this point, we are the only thread left, so we can destroy
           the global structures with impunity. */
        tMPI_Thread_destroy(&(threads[0]));
        free(threads);

        tMPI_Thread_key_delete(id_key);
        /* de-allocate all the comm structures; as the only remaining thread
           we don't need to take the comm link lock. */
        {
            tMPI_Comm cur = TMPI_COMM_WORLD->next;
            while (cur && (cur != TMPI_COMM_WORLD) )
            {
                tMPI_Comm next = cur->next;
                tMPI_Comm_destroy(cur, FALSE);
                cur = next;
            }
            tMPI_Comm_destroy(TMPI_COMM_WORLD, FALSE);
        }

        tMPI_Group_free(&TMPI_GROUP_EMPTY);
        threads          = 0;
        TMPI_COMM_WORLD  = NULL;
        TMPI_GROUP_EMPTY = NULL;
        Nthreads         = 0;

        /* deallocate the 'global' structure */
        tMPI_Global_destroy(tmpi_global);
        free(tmpi_global);

        tmpi_finalized = TRUE;
    }
    else
    {
        tMPI_Thread_exit(0);
    }
    return TMPI_SUCCESS;
}
int tMPI_Scan(void* sendbuf, void* recvbuf, int count, tMPI_Datatype datatype,
              tMPI_Op op, tMPI_Comm comm)
{
    struct tmpi_thread *cur    = tMPI_Get_current();
    int                 myrank = tMPI_Comm_seek_rank(comm, cur);
    int                 N      = tMPI_Comm_N(comm);
    int                 prev   = myrank - 1; /* my previous neighbor */
    int                 next   = myrank + 1; /* my next neighbor */

#ifdef TMPI_PROFILE
    tMPI_Profile_count_start(cur);
#endif
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Scan(%p, %p, %d, %p, %p, %p)",
                     sendbuf, recvbuf, count, datatype, op, comm);
#endif

    if (count == 0)
    {
        return TMPI_SUCCESS;
    }
    if (!recvbuf)
    {
        return tMPI_Error(comm, TMPI_ERR_BUF);
    }
    if (sendbuf == TMPI_IN_PLACE)
    {
        sendbuf = recvbuf;
    }

    /* we set our send and recv buffers */
    tMPI_Atomic_ptr_set(&(comm->reduce_sendbuf[myrank]), sendbuf);
    tMPI_Atomic_ptr_set(&(comm->reduce_recvbuf[myrank]), recvbuf);

    /* now wait for the previous rank to finish */
    if (myrank > 0)
    {
        void *a, *b;
        int   ret;

#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
        tMPI_Profile_wait_start(cur);
#endif
        /* wait for the previous neighbor's data to be ready */
        tMPI_Event_wait( &(comm->csync[myrank].events[prev]) );
        tMPI_Event_process( &(comm->csync[myrank].events[prev]), 1);
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
        tMPI_Profile_wait_stop(cur, TMPIWAIT_Reduce);
#endif
#ifdef TMPI_DEBUG
        printf("%d: scanning with %d\n", myrank, prev);
        fflush(stdout);
#endif
        /* now do the reduction */
        if (prev > 0)
        {
            a = (void*)tMPI_Atomic_ptr_get(&(comm->reduce_recvbuf[prev]));
        }
        else
        {
            a = (void*)tMPI_Atomic_ptr_get(&(comm->reduce_sendbuf[prev]));
        }
        b = sendbuf;

        if ((ret = tMPI_Reduce_run_op(recvbuf, a, b, datatype,
                                      count, op, comm)) != TMPI_SUCCESS)
        {
            return ret;
        }

        /* signal to my previous neighbor that I'm done with the data */
        tMPI_Event_signal( &(comm->csync[prev].events[prev]) );
    }
    else
    {
        if (sendbuf != recvbuf)
        {
            /* copy the data if this is rank 0, and not MPI_IN_PLACE */
            memcpy(recvbuf, sendbuf, count*datatype->size);
        }
    }

    if (myrank < N-1)
    {
        /* signal to my next neighbor that I have the data */
        tMPI_Event_signal( &(comm->csync[next].events[myrank]) );
        /* and wait for my next neighbor to finish */
        tMPI_Event_wait( &(comm->csync[myrank].events[myrank]) );
        tMPI_Event_process( &(comm->csync[myrank].events[myrank]), 1);
    }

#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
    tMPI_Profile_wait_start(cur);
#endif
    /*tMPI_Barrier_wait( &(comm->barrier));*/
#if defined(TMPI_PROFILE)
    /*tMPI_Profile_wait_stop(cur, TMPIWAIT_Reduce);*/
    tMPI_Profile_count_stop(cur, TMPIFN_Scan);
#endif
    return TMPI_SUCCESS;
}
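/* Illustrative usage sketch (not part of thread_mpi): an inclusive prefix
   sum over one int per thread, so rank r receives the sum of the values
   contributed by ranks 0..r. TMPI_INT and TMPI_SUM are assumed to be the
   usual thread_mpi datatype and reduction-op handles; example_scan_usage is
   a hypothetical helper. */
static int example_scan_usage(tMPI_Comm comm)
{
    int myrank = tMPI_Comm_seek_rank(comm, tMPI_Get_current());
    int myval  = myrank + 1; /* 1, 2, 3, ... */
    int prefix = 0;

    /* after the call, prefix == 1 + 2 + ... + (myrank+1) */
    return tMPI_Scan(&myval, &prefix, 1, TMPI_INT, TMPI_SUM, comm);
}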
int tMPI_Alltoall(void* sendbuf, int sendcount, tMPI_Datatype sendtype,
                  void* recvbuf, int recvcount, tMPI_Datatype recvtype,
                  tMPI_Comm comm)
{
    int                 synct;
    struct coll_env    *cev;
    int                 myrank;
    int                 ret = TMPI_SUCCESS;
    int                 i;
    size_t              sendsize = sendtype->size*sendcount;
    size_t              recvsize = recvtype->size*recvcount;
    int                 n_remaining;
    struct tmpi_thread *cur = tMPI_Get_current();

#ifdef TMPI_PROFILE
    tMPI_Profile_count_start(cur);
#endif
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Alltoall(%p, %d, %p, %p, %d, %p, %p)",
                     sendbuf, sendcount, sendtype,
                     recvbuf, recvcount, recvtype, comm);
#endif

    if (!comm)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }
    if (!sendbuf || !recvbuf) /* don't do pointer arithmetic on a NULL ptr */
    {
        return tMPI_Error(comm, TMPI_ERR_BUF);
    }

    myrank = tMPI_Comm_seek_rank(comm, cur);

    /* we increase our counter, and determine which coll_env we get */
    cev = tMPI_Get_cev(comm, myrank, &synct);

    /* post our pointers */
    /* we set up multiple posts, so no Post_multi */
    cev->met[myrank].tag      = TMPI_ALLTOALL_TAG;
    cev->met[myrank].datatype = sendtype;
    tMPI_Atomic_set( &(cev->met[myrank].n_remaining), cev->N-1 );
    for (i = 0; i < comm->grp.N; i++)
    {
        cev->met[myrank].bufsize[i]   = sendsize;
        cev->met[myrank].buf[i]       = (char*)sendbuf+sendsize*i;
        cev->met[myrank].read_data[i] = FALSE;
    }
    tMPI_Atomic_memory_barrier_rel();
    tMPI_Atomic_set(&(cev->met[myrank].current_sync), synct);

    /* post availability */
    for (i = 0; i < cev->N; i++)
    {
        if (i != myrank)
        {
            tMPI_Event_signal( &(cev->met[i].recv_ev) );
        }
    }

    /* we don't do the copy buffer thing here because it's pointless:
       the processes have to synchronize anyway, because they all
       send and receive. */

    /* do root transfer */
    tMPI_Coll_root_xfer(comm, sendtype, recvtype,
                        sendsize, recvsize,
                        (char*)sendbuf+sendsize*myrank,
                        (char*)recvbuf+recvsize*myrank, &ret);
    cev->met[myrank].read_data[myrank] = TRUE;

    /* and poll data availability */
    n_remaining = cev->N-1;
    while (n_remaining > 0)
    {
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
        tMPI_Profile_wait_start(cur);
#endif
        tMPI_Event_wait( &(cev->met[myrank]).recv_ev );
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
        tMPI_Profile_wait_stop(cur, TMPIWAIT_Coll_recv);
#endif
        for (i = 0; i < cev->N; i++)
        {
            if ((!cev->met[myrank].read_data[i]) &&
                (tMPI_Atomic_get(&(cev->met[i].current_sync)) == synct))
            {
                tMPI_Event_process( &(cev->met[myrank]).recv_ev, 1);
                tMPI_Mult_recv(comm, cev, i, myrank, TMPI_ALLTOALL_TAG,
                               recvtype, recvsize,
                               (char*)recvbuf+recvsize*i, &ret);
                if (ret != TMPI_SUCCESS)
                {
                    return ret;
                }
                cev->met[myrank].read_data[i] = TRUE;
                n_remaining--;
            }
        }
    }

    /* and wait until everybody is done copying our data */
    tMPI_Wait_for_others(cev, myrank);

#ifdef TMPI_PROFILE
    tMPI_Profile_count_stop(cur, TMPIFN_Alltoall);
#endif
    return ret;
}
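/* Illustrative usage sketch (not part of thread_mpi): every thread sends one
   distinct int to every other thread. Element i of the send buffer goes to
   rank i, and element i of the receive buffer comes from rank i. TMPI_INT is
   assumed to be the usual thread_mpi integer datatype handle;
   example_alltoall_usage is a hypothetical helper. */
static int example_alltoall_usage(tMPI_Comm comm)
{
    int  myrank = tMPI_Comm_seek_rank(comm, tMPI_Get_current());
    int  N      = tMPI_Comm_N(comm);
    int *sendv  = (int*)tMPI_Malloc(N*sizeof(int));
    int *recvv  = (int*)tMPI_Malloc(N*sizeof(int));
    int  i, ret;

    for (i = 0; i < N; i++)
    {
        sendv[i] = myrank*N + i; /* a value tagged with source and target */
    }
    ret = tMPI_Alltoall(sendv, 1, TMPI_INT, recvv, 1, TMPI_INT, comm);
    /* on success, recvv[i] == i*N + myrank for every i */
    free(sendv);
    free(recvv);
    return ret;
}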