/* Inclusive prefix reduction (MPI_Scan): on return, rank i's recvbuf holds
   op(sendbuf_0, ..., sendbuf_i).

   Implemented as a linear chain: each rank waits for its previous
   neighbor's partial result, combines it with its own send buffer, then
   signals its next neighbor that its own partial result is ready.

   Fix vs. original: the TMPI_DEBUG printf passed an undeclared variable
   'iteration' as a third argument to a two-conversion format string
   (a compile error whenever TMPI_DEBUG is defined, and a printf
   format/argument mismatch besides). The stray argument is removed. */
int tMPI_Scan(void* sendbuf, void* recvbuf, int count, tMPI_Datatype datatype,
              tMPI_Op op, tMPI_Comm comm)
{
    struct tmpi_thread *cur=tMPI_Get_current();
    int myrank=tMPI_Comm_seek_rank(comm, cur);
    int N=tMPI_Comm_N(comm);
    int prev=myrank - 1; /* my previous neighbor */
    int next=myrank + 1; /* my next neighbor */

#ifdef TMPI_PROFILE
    tMPI_Profile_count_start(cur);
#endif
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Scan(%p, %p, %d, %p, %p, %p)",
                     sendbuf, recvbuf, count, datatype, op, comm);
#endif
    /* nothing to do for empty payloads */
    if (count==0)
        return TMPI_SUCCESS;
    if (!recvbuf)
    {
        return tMPI_Error(comm, TMPI_ERR_BUF);
    }
    if (sendbuf==TMPI_IN_PLACE)
    {
        sendbuf=recvbuf;
    }

    /* we set our send and recv buffers so neighbors can find them */
    tMPI_Atomic_ptr_set(&(comm->reduce_sendbuf[myrank]),sendbuf);
    tMPI_Atomic_ptr_set(&(comm->reduce_recvbuf[myrank]),recvbuf);

    /* now wait for the previous rank to finish */
    if (myrank > 0)
    {
        void *a, *b;
        int ret;

#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
        tMPI_Profile_wait_start(cur);
#endif
        /* wait for the previous neighbor's data to be ready */
        tMPI_Event_wait( &(comm->csync[myrank].events[prev]) );
        tMPI_Event_process( &(comm->csync[myrank].events[prev]), 1);
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
        tMPI_Profile_wait_stop(cur, TMPIWAIT_Reduce);
#endif
#ifdef TMPI_DEBUG
        printf("%d: scanning with %d \n", myrank, prev);
        fflush(stdout);
#endif
        /* now do the reduction: combine the previous rank's partial
           result with our own contribution. Rank 0 never produces a
           partial result, so when prev==0 we read its send buffer
           directly instead of its recv buffer. */
        if (prev > 0)
        {
            a = (void*)tMPI_Atomic_ptr_get(&(comm->reduce_recvbuf[prev]));
        }
        else
        {
            a = (void*)tMPI_Atomic_ptr_get(&(comm->reduce_sendbuf[prev]));
        }
        b = sendbuf;

        if ((ret=tMPI_Reduce_run_op(recvbuf, a, b, datatype,
                                    count, op, comm)) != TMPI_SUCCESS)
        {
            return ret;
        }

        /* signal to my previous neighbor that I'm done with the data */
        tMPI_Event_signal( &(comm->csync[prev].events[prev]) );
    }
    else
    {
        if (sendbuf != recvbuf)
        {
            /* copy the data if this is rank 0, and not MPI_IN_PLACE */
            memcpy(recvbuf, sendbuf, count*datatype->size);
        }
    }

    if (myrank < N-1)
    {
        /* signal to my next neighbor that I have the data */
        tMPI_Event_signal( &(comm->csync[next].events[myrank]) );
        /* and wait for my next neighbor to finish reading it */
        tMPI_Event_wait( &(comm->csync[myrank].events[myrank]) );
        tMPI_Event_process( &(comm->csync[myrank].events[myrank]), 1);
    }

#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
    tMPI_Profile_wait_start(cur);
#endif
    /*tMPI_Barrier_wait( &(comm->barrier));*/
#if defined(TMPI_PROFILE)
    /*tMPI_Profile_wait_stop(cur, TMPIWAIT_Reduce);*/
    tMPI_Profile_count_stop(cur, TMPIFN_Scan);
#endif
    return TMPI_SUCCESS;
}
/* Receive one posted buffer from rank 'rank' out of a multi-destination
   post: validates tag/type/size, copies the data into recvbuf (reading
   from the sender's copy buffer when one has been published), and
   decrements the sender's n_remaining counter, signaling the sender's
   send_ev when this was the last outstanding reader.

   Errors are reported through *ret (caller-owned); on error the sender's
   counter is NOT decremented, matching the other error paths here.

   Fix vs. original: on a tag/datatype mismatch the original set *ret but
   fell through and copied the data anyway; it now returns immediately,
   consistent with the bufsize and overlap error paths below. */
void tMPI_Mult_recv(tMPI_Comm comm, struct coll_env *cev, int rank,
                    int index, int expected_tag, tMPI_Datatype recvtype,
                    size_t recvsize, void *recvbuf, int *ret)
{
    size_t sendsize=cev->met[rank].bufsize[index];

    /* check tags, types */
    if ((cev->met[rank].datatype != recvtype ) ||
        (cev->met[rank].tag != expected_tag))
    {
        *ret=tMPI_Error(comm, TMPI_ERR_MULTI_MISMATCH);
        return;
    }

    if (sendsize) /* we allow NULL ptrs if there's nothing to xmit */
    {
        void *srcbuf;
#ifdef USE_COLLECTIVE_COPY_BUFFER
        tmpi_bool decrease_ctr=FALSE;
#endif

        if ( sendsize > recvsize )
        {
            *ret=tMPI_Error(comm, TMPI_ERR_XFER_BUFSIZE);
            return;
        }

        /* NOTE(review): this compares the buf array/pointer member itself,
           not buf[index], against recvbuf — possibly intended to be
           cev->met[rank].buf[index] == recvbuf; confirm against the
           coll_env definition before changing. */
        if ( cev->met[rank].buf == recvbuf )
        {
            *ret=tMPI_Error(TMPI_COMM_WORLD,TMPI_ERR_XFER_BUF_OVERLAP);
            return;
        }

        /* get source buffer */
#ifdef USE_COLLECTIVE_COPY_BUFFER
        if ( !(cev->met[rank].using_cb))
#endif
        {
            srcbuf=cev->met[rank].buf[index];
        }
#ifdef USE_COLLECTIVE_COPY_BUFFER
        else
        {
            srcbuf=tMPI_Atomic_ptr_get(&(cev->met[rank].cpbuf[index]));
            tMPI_Atomic_memory_barrier_acq();

            if(!srcbuf)
            { /* there was (as of yet) no copied buffer */
                void *try_again_srcbuf;
                /* we need to try checking the pointer again after we
                   increase the read counter, signaling that one more
                   thread is reading. */
                tMPI_Atomic_add_return(&(cev->met[rank].buf_readcount), 1);
                /* a full memory barrier */
                tMPI_Atomic_memory_barrier();
                try_again_srcbuf=tMPI_Atomic_ptr_get(
                                        &(cev->met[rank].cpbuf[index]));
                if (!try_again_srcbuf)
                {
                    /* apparently the copied buffer is not ready yet. We
                       just use the real source buffer. We have already
                       indicated we're reading from the regular buf. */
                    srcbuf=cev->met[rank].buf[index];
                    decrease_ctr=TRUE;
                }
                else
                {
                    /* We tried again, and this time there was a copied
                       buffer. We use that, and indicate that we're not
                       reading from the regular buf. This case should be
                       pretty rare. */
                    tMPI_Atomic_fetch_add(&(cev->met[rank].buf_readcount),-1);
                    tMPI_Atomic_memory_barrier_acq();
                    srcbuf=try_again_srcbuf;
                }
            }
#ifdef TMPI_PROFILE
            if (srcbuf)
                tMPI_Profile_count_buffered_coll_xfer(tMPI_Get_current());
#endif
        }
#endif
        /* copy data */
        memcpy((char*)recvbuf, srcbuf, sendsize);
#ifdef TMPI_PROFILE
        tMPI_Profile_count_coll_xfer(tMPI_Get_current());
#endif

#ifdef USE_COLLECTIVE_COPY_BUFFER
        if (decrease_ctr)
        {
            /* we decrement the read count; potentially releasing the
               buffer. */
            tMPI_Atomic_memory_barrier_rel();
            tMPI_Atomic_fetch_add( &(cev->met[rank].buf_readcount), -1);
        }
#endif
    }
    /* signal one thread ready */
    {
        int reta;
        tMPI_Atomic_memory_barrier_rel();
        reta=tMPI_Atomic_add_return( &(cev->met[rank].n_remaining), -1);
        if (reta <= 0)
        {
            tMPI_Event_signal( &(cev->met[rank].send_ev) );
        }
    }
}
/* Post a buffer for a collective transfer to one destination (dest >= 0)
   or to all other ranks (dest < 0).

   Publication protocol: the metadata fields and n_remaining are written
   first, a release barrier is issued, and only then is current_sync set
   to 'synct' — receivers poll current_sync, so the barrier ensures they
   never observe the new sync count with stale metadata. When the copy
   buffer is in use, availability is signaled BEFORE the local copy is
   made, so receivers may start reading the original buf while the copy
   proceeds; the copied buffer is published afterwards via an atomic
   pointer store behind another release barrier. */
void tMPI_Post_multi(struct coll_env *cev, int myrank, int index, int tag,
                     tMPI_Datatype datatype, size_t bufsize, void *buf,
                     int n_remaining, int synct, int dest)
{
    int i;
#ifdef USE_COLLECTIVE_COPY_BUFFER
    /* decide based on the number of waiting threads whether copying into
       a local buffer is worth it (small payload, many readers) */
    tmpi_bool using_cb=(bufsize < (size_t)(n_remaining*COPY_BUFFER_SIZE));

    cev->met[myrank].using_cb=using_cb;
    if (using_cb)
    {
        /* we set it to NULL initially; receivers seeing NULL fall back
           to the regular buf */
        /*cev->met[myrank].cpbuf[index]=NULL;*/
        tMPI_Atomic_ptr_set(&(cev->met[myrank].cpbuf[index]), NULL);
        tMPI_Atomic_set(&(cev->met[myrank].buf_readcount), 0);
    }
#endif
    /* fill in the transfer metadata before publishing */
    cev->met[myrank].tag=tag;
    cev->met[myrank].datatype=datatype;
    cev->met[myrank].buf[index]=buf;
    cev->met[myrank].bufsize[index]=bufsize;
    tMPI_Atomic_set(&(cev->met[myrank].n_remaining), n_remaining);
    /* release barrier: metadata must be visible before current_sync */
    tMPI_Atomic_memory_barrier_rel();
    tMPI_Atomic_set(&(cev->met[myrank].current_sync), synct);

    /* publish availability. */
    if (dest<0)
    {
        /* broadcast-style: wake every other rank */
        for(i=0;i<cev->N;i++)
        {
            if (i != myrank)
                tMPI_Event_signal( &(cev->met[i].recv_ev) );
        }
    }
    else
    {
        tMPI_Event_signal( &(cev->met[dest].recv_ev) );
    }

#ifdef USE_COLLECTIVE_COPY_BUFFER
    /* because we've published availability, we can start copying --
       possibly in parallel with the receiver */
    if (using_cb)
    {
        struct tmpi_thread *cur=tMPI_Get_current();
        /* copy the buffer locally. First allocate */
        cev->met[myrank].cb=tMPI_Copy_buffer_list_get( &(cur->cbl_multi) );
        if (cev->met[myrank].cb->size < bufsize)
        {
            /* fatal: the pre-sized copy buffers cannot hold this payload */
            fprintf(stderr, "ERROR: cb size too small\n");
            exit(1);
        }
        /* copy to the new buf */
        memcpy(cev->met[myrank].cb->buf, buf, bufsize);
        /* post the new buf: release barrier so the copied contents are
           visible before the pointer swap */
        tMPI_Atomic_memory_barrier_rel();
        /*cev->met[myrank].cpbuf[index]=cev->met[myrank].cb->buf;*/
        tMPI_Atomic_ptr_set(&(cev->met[myrank].cpbuf[index]),
                            cev->met[myrank].cb->buf);
    }
#endif
}
/* All-to-all exchange: rank i sends the j-th sendcount-sized chunk of
   sendbuf to rank j, and receives rank j's i-th chunk into the j-th slot
   of recvbuf.

   Unlike the single-buffer collectives this posts N per-destination
   pointers itself (no tMPI_Post_multi), then polls the other ranks'
   posts and pulls each one with tMPI_Mult_recv as it becomes available. */
int tMPI_Alltoall(void* sendbuf, int sendcount, tMPI_Datatype sendtype,
                  void* recvbuf, int recvcount, tMPI_Datatype recvtype,
                  tMPI_Comm comm)
{
    int synct;
    struct coll_env *cev;
    int myrank;
    int ret = TMPI_SUCCESS;
    int i;
    size_t sendsize = sendtype->size*sendcount;  /* bytes per outgoing chunk */
    size_t recvsize = recvtype->size*recvcount;  /* bytes per incoming chunk */
    int n_remaining;
    struct tmpi_thread *cur = tMPI_Get_current();

#ifdef TMPI_PROFILE
    tMPI_Profile_count_start(cur);
#endif
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Alltoall(%p, %d, %p, %p, %d, %p, %p)",
                     sendbuf, sendcount, sendtype,
                     recvbuf, recvcount, recvtype, comm);
#endif
    if (!comm)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }
    if (!sendbuf || !recvbuf) /* don't do pointer arithmetic on a NULL ptr */
    {
        return tMPI_Error(comm, TMPI_ERR_BUF);
    }

    myrank = tMPI_Comm_seek_rank(comm, cur);

    /* we increase our counter, and determine which coll_env we get */
    cev = tMPI_Get_cev(comm, myrank, &synct);

    /* post our pointers */
    /* we set up multiple posts, so no Post_multi */
    cev->met[myrank].tag = TMPI_ALLTOALL_TAG;
    cev->met[myrank].datatype = sendtype;
    tMPI_Atomic_set( &(cev->met[myrank].n_remaining), cev->N-1 );
    for (i = 0; i < comm->grp.N; i++)
    {
        /* one chunk of sendbuf per destination rank */
        cev->met[myrank].bufsize[i] = sendsize;
        cev->met[myrank].buf[i] = (char*)sendbuf+sendsize*i;
        cev->met[myrank].read_data[i] = FALSE;
    }
    /* release barrier before publishing the sync count, as in
       tMPI_Post_multi: receivers poll current_sync */
    tMPI_Atomic_memory_barrier_rel();
    tMPI_Atomic_set(&(cev->met[myrank].current_sync), synct);

    /* post availability */
    for (i = 0; i < cev->N; i++)
    {
        if (i != myrank)
        {
            tMPI_Event_signal( &(cev->met[i].recv_ev) );
        }
    }

    /* we don't do the copy buffer thing here because it's pointless:
       the processes have to synchronize anyway, because they all
       send and receive. */

    /* do root transfer: our own chunk goes straight from sendbuf to
       recvbuf */
    tMPI_Coll_root_xfer(comm, sendtype, recvtype,
                        sendsize, recvsize,
                        (char*)sendbuf+sendsize*myrank,
                        (char*)recvbuf+recvsize*myrank, &ret);
    cev->met[myrank].read_data[myrank] = TRUE;

    /* and poll data availability from the other N-1 ranks */
    n_remaining = cev->N-1;
    while (n_remaining > 0)
    {
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
        tMPI_Profile_wait_start(cur);
#endif
        tMPI_Event_wait( &(cev->met[myrank]).recv_ev );
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
        tMPI_Profile_wait_stop(cur, TMPIWAIT_Coll_recv);
#endif
        for (i = 0; i < cev->N; i++)
        {
            /* only take data we haven't read yet, and only once the
               sender has published this sync round */
            if ((!cev->met[myrank].read_data[i]) &&
                (tMPI_Atomic_get(&(cev->met[i].current_sync)) == synct))
            {
                tMPI_Event_process( &(cev->met[myrank]).recv_ev, 1);
                tMPI_Mult_recv(comm, cev, i, myrank, TMPI_ALLTOALL_TAG,
                               recvtype, recvsize,
                               (char*)recvbuf+recvsize*i, &ret);
                if (ret != TMPI_SUCCESS)
                {
                    /* NOTE(review): this early return skips
                       tMPI_Wait_for_others; senders may still be
                       waiting on us in the error case — confirm this is
                       the intended error semantics */
                    return ret;
                }
                cev->met[myrank].read_data[i] = TRUE;
                n_remaining--;
            }
        }
    }

    /* and wait until everybody is done copying our data */
    tMPI_Wait_for_others(cev, myrank);

#ifdef TMPI_PROFILE
    tMPI_Profile_count_stop(cur, TMPIFN_Alltoall);
#endif
    return ret;
}