int tMPI_Thread_create_aff(tMPI_Thread_t *thread,
                           void *(*start_routine)(void *),
                           void *arg)
{
    int ret;

    /* set the calling thread's affinity mask */
    if (tMPI_Atomic_get(&main_thread_aff_set) == 0)
    {
#ifdef HAVE_PTHREAD_SETAFFINITY
        cpu_set_t set;
#endif
        /* this can be a spinlock because the chances of collision are low. */
        tMPI_Spinlock_lock( &main_thread_aff_lock );
        tMPI_Atomic_set( &aff_thread_number, 0);
#ifdef HAVE_PTHREAD_SETAFFINITY
        CPU_ZERO(&set);
        CPU_SET(0, &set);
        pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
        /*fprintf(stderr, "Setting affinity.\n");*/
#endif
        tMPI_Atomic_set( &main_thread_aff_set, 1);
        tMPI_Spinlock_unlock( &main_thread_aff_lock );
    }

    if (thread == NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Invalid thread pointer.");
        return EINVAL;
    }

    *thread = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
    ret = pthread_create(&((*thread)->th), NULL, start_routine, arg);

    if (ret != 0)
    {
        /* Cannot use tMPI_error() since messages use threads for locking */
        tMPI_Fatal_error(TMPI_FARGS, "Failed to create POSIX thread, rc=%d",
                         ret);
        /* Use system memory allocation routines */
        return -1;
    }
    else
    {
#ifdef HAVE_PTHREAD_SETAFFINITY
        /* pin the new thread to the next processor, round-robin by counter */
        int       n;
        cpu_set_t set;

        n = tMPI_Atomic_add_return(&aff_thread_number, 1);
        CPU_ZERO(&set);
        CPU_SET(n, &set);
        return pthread_setaffinity_np((*thread)->th, sizeof(set), &set);
#else
        return 0;
#endif
    }
}
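/* Illustrative sketch (not part of the library): how a caller might start an
   affinity-pinned worker with tMPI_Thread_create_aff. The worker function
   example_worker and its argument are hypothetical; a return value of 0
   means success, following the convention of the function above. */
#if 0
static void *example_worker(void *arg)
{
    /* per-thread work on *arg goes here */
    return NULL;
}

static int example_spawn_pinned(tMPI_Thread_t *thr, int *data)
{
    /* the new thread is pinned to the next processor in round-robin order */
    return tMPI_Thread_create_aff(thr, example_worker, data);
}
#endif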
void tMPI_Coll_env_init(struct coll_env *cev, int N)
{
    int i;

    cev->met = (struct coll_env_thread*)tMPI_Malloc(
                        sizeof(struct coll_env_thread)*N);
    cev->N = N;
    tMPI_Atomic_set(&(cev->coll.current_sync), 0);
    tMPI_Atomic_set(&(cev->coll.n_remaining), 0);
    for (i = 0; i < N; i++)
    {
        tMPI_Coll_envt_init(&(cev->met[i]), N);
    }
}
void tMPI_Coll_envt_init(struct coll_env_thread *met, int N)
{
    tMPI_Atomic_set(&(met->current_sync), 0);
    tMPI_Atomic_set(&(met->n_remaining), 0);
    met->buf       = (void**)tMPI_Malloc(sizeof(void*)*N);
    met->bufsize   = (size_t*)tMPI_Malloc(sizeof(size_t)*N);
    met->read_data = (tmpi_bool*)tMPI_Malloc(sizeof(tmpi_bool)*N);
#ifdef USE_COLLECTIVE_COPY_BUFFER
    met->cpbuf     = (tMPI_Atomic_ptr_t*)tMPI_Malloc(
                            sizeof(tMPI_Atomic_ptr_t)*N);
    met->cb        = NULL;
    met->using_cb  = FALSE;
#endif
    tMPI_Event_init( &(met->send_ev) );
    tMPI_Event_init( &(met->recv_ev) );
}
static void tMPI_Init_initers(void)
{
    int state;

    /* we can pre-check because it's atomic */
    if (tMPI_Atomic_get(&init_inited) == 0)
    {
        /* this can be a spinlock because the chances of collision are low. */
        tMPI_Spinlock_lock( &init_init );

        state = tMPI_Atomic_get(&init_inited);
        tMPI_Atomic_memory_barrier_acq();
        if (state == 0)
        {
            InitializeCriticalSection(&mutex_init);
            InitializeCriticalSection(&once_init);
            InitializeCriticalSection(&cond_init);
            InitializeCriticalSection(&barrier_init);

            tMPI_Atomic_memory_barrier_rel();
            tMPI_Atomic_set(&init_inited, 1);
        }

        tMPI_Spinlock_unlock( &init_init );
    }
}
int tMPI_Thread_once(tMPI_Thread_once_t *once_control,
                     void (*init_routine)(void))
{
#if 0
    /* use once Vista is minimum required version */
    BOOL bStatus;
    bStatus = InitOnceExecuteOnce(once_control, InitHandleWrapperFunction,
                                  init_routine, NULL);

    if (!bStatus)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Failed to run thread_once routine");
        return -1;
    }
#else
    /* really ugly hack - and it's slow... */
    tMPI_Init_initers();
    EnterCriticalSection(&once_init);
    if (tMPI_Atomic_get(&(once_control->once)) == 0)
    {
        (*init_routine)();
        tMPI_Atomic_set(&(once_control->once), 1);
    }
    LeaveCriticalSection(&once_init);
#endif
    return 0;
}
static void tMPI_Thread_init(struct tmpi_thread *th)
{
    int N_envelopes      = (Nthreads+1)*N_EV_ALLOC;
    int N_send_envelopes = N_EV_ALLOC;
    int N_reqs           = (Nthreads+1)*N_EV_ALLOC;
    int i;

    /* we set our thread id, as a thread-specific piece of global data. */
    tMPI_Thread_setspecific(id_key, th);

    /* allocate comm.self */
    th->self_comm = tMPI_Comm_alloc(TMPI_COMM_WORLD, 1);
    th->self_comm->grp.peers[0] = th;

    /* allocate envelopes */
    tMPI_Free_env_list_init( &(th->envelopes), N_envelopes );
    /* recv list */
    tMPI_Recv_env_list_init( &(th->evr));
    /* send lists */
    th->evs = (struct send_envelope_list*)tMPI_Malloc(
                        sizeof(struct send_envelope_list)*Nthreads);
    for (i = 0; i < Nthreads; i++)
    {
        tMPI_Send_env_list_init( &(th->evs[i]), N_send_envelopes);
    }

    tMPI_Atomic_set( &(th->ev_outgoing_received), 0);
    tMPI_Event_init( &(th->p2p_event) );

    /* allocate requests */
    tMPI_Req_list_init(&(th->rql), N_reqs);

#ifdef USE_COLLECTIVE_COPY_BUFFER
    /* allocate copy_buffer list */
    tMPI_Copy_buffer_list_init(&(th->cbl_multi), (Nthreads+1)*(N_COLL_ENV+1),
                               Nthreads*COPY_BUFFER_SIZE);
#endif

#ifdef TMPI_PROFILE
    tMPI_Profile_init(&(th->profile));
#endif
    /* now wait for all other threads to come on line, before we start
       the MPI program */
    tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );
}
static int tMPI_Init_initers(void)
{
    int state;
    int ret = 0;

    /* we can pre-check because it's atomic */
    if (tMPI_Atomic_get(&init_inited) == 0)
    {
        /* this can be a spinlock because the chances of collision are low. */
        tMPI_Spinlock_lock( &init_init );

        state = tMPI_Atomic_get(&init_inited);
        tMPI_Atomic_memory_barrier_acq();
        if (state == 0)
        {
            InitializeCriticalSection(&mutex_init);
            InitializeCriticalSection(&once_init);
            InitializeCriticalSection(&cond_init);
            InitializeCriticalSection(&barrier_init);
            InitializeCriticalSection(&thread_id_list_lock);

            ret = tMPI_Init_NUMA();
            if (ret != 0)
            {
                goto err;
            }

            ret = tMPI_Thread_id_list_init();
            if (ret != 0)
            {
                goto err;
            }

            tMPI_Atomic_memory_barrier_rel();
            tMPI_Atomic_set(&init_inited, 1);
        }

        tMPI_Spinlock_unlock( &init_init );
    }
    return ret;
err:
    tMPI_Spinlock_unlock( &init_init );
    return ret;
}
int tMPI_Thread_cond_init(tMPI_Thread_cond_t *cond)
{
    int ret;

    if (cond == NULL)
    {
        return EINVAL;
    }

    cond->condp = (struct tMPI_Thread_cond*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_cond)*1);
    ret = pthread_cond_init(&(cond->condp->cond), NULL);

    if (ret != 0)
    {
        tMPI_Fatal_error(TMPI_FARGS,
                         "Error initializing POSIX condition variable. rc=%d",
                         ret);
        fflush(stderr);
    }
    tMPI_Atomic_set(&(cond->initialized), 1);
    return ret;
}
int tMPI_Thread_once(tMPI_Thread_once_t *once_control,
                     void (*init_routine)(void))
{
    int ret;

    if (!once_control || !init_routine)
    {
        return EINVAL;
    }

    /* really ugly hack - and it's slow... */
    if ( (ret = pthread_mutex_lock( &once_init )) )
    {
        return ret;
    }
    if (tMPI_Atomic_get(&(once_control->once)) == 0)
    {
        (*init_routine)();
        tMPI_Atomic_set(&(once_control->once), 1);
    }
    pthread_mutex_unlock( &once_init );
    return 0;
}
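/* Illustrative sketch (not part of the library): typical once-initialization
   of shared state from multiple threads. The initializer and the static once
   control below are hypothetical, and the TMPI_THREAD_ONCE_INIT initializer
   is assumed to exist analogously to PTHREAD_ONCE_INIT. */
#if 0
static tMPI_Thread_once_t example_once = TMPI_THREAD_ONCE_INIT;

static void example_init_globals(void)
{
    /* runs exactly once, no matter how many threads reach example_use() */
}

static void example_use(void)
{
    tMPI_Thread_once(&example_once, example_init_globals);
    /* shared state is now initialized for every caller */
}
#endif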
int tMPI_Thread_mutex_init(tMPI_Thread_mutex_t *mtx)
{
    int ret;

    if (mtx == NULL)
    {
        return EINVAL;
    }

    mtx->mutex = (struct tMPI_Mutex*)tMPI_Malloc(sizeof(struct tMPI_Mutex)*1);
    ret = pthread_mutex_init(&(mtx->mutex->mtx), NULL);

    if (ret != 0)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Error initializing POSIX mutex. rc=%d",
                         ret);
        /* Use system memory allocation routines */
        return ret;
    }

    tMPI_Atomic_set(&(mtx->initialized), 1);
    return 0;
}
int tMPI_Thread_barrier_init(tMPI_Thread_barrier_t *barrier, int n)
{
    int ret;
    /*tMPI_Thread_pthread_barrier_t *p;*/

    if (barrier == NULL)
    {
        return EINVAL;
    }

    barrier->barrierp = (struct tMPI_Thread_barrier*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_barrier)*1);
    ret = pthread_mutex_init(&(barrier->barrierp->mutex), NULL);

    if (ret != 0)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Error initializing POSIX mutex. rc=%d",
                         ret);
        return ret;
    }

    ret = pthread_cond_init(&(barrier->barrierp->cv), NULL);

    if (ret != 0)
    {
        tMPI_Fatal_error(TMPI_FARGS,
                         "Error initializing POSIX condition variable. rc=%d",
                         ret);
        return ret;
    }

    barrier->threshold = n;
    barrier->count     = n;
    barrier->cycle     = 0;

    tMPI_Atomic_set(&(barrier->initialized), 1);
    return 0;
}
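/* Illustrative sketch (not part of the library): a barrier shared by a fixed
   number of worker threads. The thread count and calling pattern are
   hypothetical; tMPI_Thread_barrier_wait is the companion call already used
   elsewhere in this code (see tMPI_Thread_init). */
#if 0
static tMPI_Thread_barrier_t example_barrier;

static void example_setup(int nthreads)
{
    tMPI_Thread_barrier_init(&example_barrier, nthreads);
}

static void example_worker_step(void)
{
    /* ... per-thread work ... */
    tMPI_Thread_barrier_wait(&example_barrier); /* block until all nthreads
                                                   have reached this point */
}
#endif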
int tMPI_Thread_key_create(tMPI_Thread_key_t *key, void (*destructor)(void *))
{
    int ret;

    if (key == NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Invalid key pointer.");
        return EINVAL;
    }

    key->key = (struct tMPI_Thread_key*)tMPI_Malloc(
                        sizeof(struct tMPI_Thread_key)*1);
    ret = pthread_key_create(&((key)->key->pkey), destructor);
    if (ret != 0)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Failed to create thread key, rc=%d.",
                         ret);
        fflush(stderr);
        return -1;
    }

    tMPI_Atomic_set(&(key->initialized), 1);
    return 0;
}
int tMPI_Comm_alloc(tMPI_Comm *newcomm, tMPI_Comm parent, int N)
{
    struct tmpi_comm_ *retc;
    int                i;
    int                ret;

    retc = (struct tmpi_comm_*)tMPI_Malloc(sizeof(struct tmpi_comm_));
    if (retc == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }

    retc->grp.peers = (struct tmpi_thread**)tMPI_Malloc(
                sizeof(struct tmpi_thread*)*Nthreads);
    if (retc->grp.peers == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }
    retc->grp.N = N;

    ret = tMPI_Thread_mutex_init( &(retc->comm_create_lock) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    ret = tMPI_Thread_cond_init( &(retc->comm_create_prep) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    ret = tMPI_Thread_cond_init( &(retc->comm_create_finish) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }

    retc->split    = NULL;
    retc->new_comm = NULL;
    /* we have no topology to start out with */
    retc->cart = NULL;
    /*retc->graph=NULL;*/

    /* we start counting at 0 */
    tMPI_Atomic_set( &(retc->destroy_counter), 0);

    /* initialize the main barrier */
    tMPI_Barrier_init(&(retc->barrier), N);

    /* the reduce barriers */
    {
        /* First calculate the number of reduce barriers */
        int Niter = 0; /* the iteration number */
        int Nred  = N; /* the number of reduce barriers for this iteration */
        while (Nred > 1)
        {
            /* Nred becomes Nred/2 plus a rest term, because a solitary
               process at the end of the list must still be accounted for */
            Nred   = Nred/2 + Nred%2;
            Niter += 1;
        }

        retc->N_reduce_iter = Niter;
        /* allocate the list */
        retc->reduce_barrier = (tMPI_Barrier_t**)
                  tMPI_Malloc(sizeof(tMPI_Barrier_t*)*(Niter+1));
        if (retc->reduce_barrier == NULL)
        {
            return TMPI_ERR_NO_MEM;
        }
        retc->N_reduce = (int*)tMPI_Malloc(sizeof(int)*(Niter+1));
        if (retc->N_reduce == NULL)
        {
            return TMPI_ERR_NO_MEM;
        }

        /* we re-set Nred to N */
        Nred = N;
        for (i = 0; i < Niter; i++)
        {
            int j;

            Nred              = Nred/2 + Nred%2;
            retc->N_reduce[i] = Nred;
            /* allocate the sub-list */
            retc->reduce_barrier[i] = (tMPI_Barrier_t*)
                      tMPI_Malloc(sizeof(tMPI_Barrier_t)*(Nred));
            if (retc->reduce_barrier[i] == NULL)
            {
                return TMPI_ERR_NO_MEM;
            }
            for (j = 0; j < Nred; j++)
            {
                tMPI_Barrier_init(&(retc->reduce_barrier[i][j]), 2);
            }
        }
    }

    /* the reduce buffers */
    retc->reduce_sendbuf = (tMPI_Atomic_ptr_t*)
              tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*Nthreads);
    if (retc->reduce_sendbuf == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }
    retc->reduce_recvbuf = (tMPI_Atomic_ptr_t*)
              tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*Nthreads);
    if (retc->reduce_recvbuf == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }

    if (parent)
    {
        retc->erh = parent->erh;
    }
    else
    {
        retc->erh = TMPI_ERRORS_ARE_FATAL;
    }

    /* coll_env objects */
    retc->cev = (struct coll_env*)tMPI_Malloc(sizeof(struct coll_env)*
                                              N_COLL_ENV);
    if (retc->cev == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }
    for (i = 0; i < N_COLL_ENV; i++)
    {
        ret = tMPI_Coll_env_init( &(retc->cev[i]), N);
        if (ret != TMPI_SUCCESS)
        {
            return ret;
        }
    }
    /* multi_sync objects */
    retc->csync = (struct coll_sync*)tMPI_Malloc(sizeof(struct coll_sync)*N);
    if (retc->csync == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }
    for (i = 0; i < N; i++)
    {
        ret = tMPI_Coll_sync_init( &(retc->csync[i]), N);
        if (ret != TMPI_SUCCESS)
        {
            return ret;
        }
    }

    ret = tMPI_Thread_mutex_lock( &(tmpi_global->comm_link_lock) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    /* we insert ourselves in the circular list, after TMPI_COMM_WORLD */
    if (TMPI_COMM_WORLD)
    {
        retc->next = TMPI_COMM_WORLD;
        retc->prev = TMPI_COMM_WORLD->prev;

        TMPI_COMM_WORLD->prev->next = retc;
        TMPI_COMM_WORLD->prev       = retc;
    }
    else
    {
        retc->prev = retc->next = retc;
    }
    ret = tMPI_Thread_mutex_unlock( &(tmpi_global->comm_link_lock) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    *newcomm = retc;
    return TMPI_SUCCESS;
}
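/* Worked example for the reduce-barrier setup in tMPI_Comm_alloc (comment
   only): for N=5 the while loop yields Nred = 3, 2, 1, so N_reduce_iter = 3
   and N_reduce[] = { 3, 2, 1 }. In other words, the number of iterations is
   the ceiling of log2(N), and each iteration pairs threads through
   2-participant barriers, with a solitary thread carried over when the count
   is odd. */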
/* Set the main thread's affinity */
static int tMPI_Set_main_thread_affinity(void)
{
    /* calling thread PROCESSOR_NUMBER */
    PROCESSOR_NUMBER CurrentProcessorNumber;
    /* calling thread GROUP_AFFINITY */
    GROUP_AFFINITY CurrentThreadGroupAffinity;
    /* calling thread NUMA node */
    USHORT CurrentNumaNodeNumber;

    /* we can pre-check because it's atomic */
    if (tMPI_Atomic_get(&main_thread_aff_set) == 0)
    {
        /* this can be a spinlock because the chances of collision are low. */
        tMPI_Spinlock_lock( &main_thread_aff_lock );
        if (g_ulHighestNumaNodeNumber != 0)
        {
            func_GetCurrentProcessorNumberEx(&CurrentProcessorNumber);

            /* set the NUMA node affinity for the current thread.
               Failures to set the current thread affinity are ignored,
               as a fringe case can arise on >32 processor systems with
               a 32-bit build/code. */
            func_SetThreadIdealProcessorEx(GetCurrentThread(),
                                           &CurrentProcessorNumber,
                                           NULL);

            if (func_GetNumaProcessorNodeEx(&CurrentProcessorNumber,
                                            &CurrentNumaNodeNumber))
            {
                /* for the NUMA node number associated with the current
                   processor number, get the group affinity mask */
                if (func_GetNumaNodeProcessorMaskEx(CurrentNumaNodeNumber,
                                                    &CurrentThreadGroupAffinity))
                {
                    /* set the current thread affinity to prevent it from
                       running on other NUMA nodes */
                    func_SetThreadGroupAffinity(GetCurrentThread(),
                                                &CurrentThreadGroupAffinity,
                                                NULL);
                }
            }
        }
        else
        {
            /* No NUMA. For now, we just do a similar thing. */
            if ( (func_GetCurrentProcessorNumberEx != NULL) &&
                 (func_SetThreadIdealProcessorEx))
            {
                func_GetCurrentProcessorNumberEx(&CurrentProcessorNumber);
                func_SetThreadIdealProcessorEx(GetCurrentThread(),
                                               &CurrentProcessorNumber,
                                               NULL);
            }
        }
        tMPI_Atomic_set( &main_thread_aff_set, 1);
        tMPI_Spinlock_unlock( &main_thread_aff_lock );
    }
    return 0;
}
void tMPI_Post_multi(struct coll_env *cev, int myrank, int index, int tag,
                     tMPI_Datatype datatype, size_t bufsize, void *buf,
                     int n_remaining, int synct, int dest)
{
    int i;
#ifdef USE_COLLECTIVE_COPY_BUFFER
    /* decide based on the number of waiting threads */
    tmpi_bool using_cb = (bufsize < (size_t)(n_remaining*COPY_BUFFER_SIZE));

    cev->met[myrank].using_cb = using_cb;
    if (using_cb)
    {
        /* we set it to NULL initially */
        /*cev->met[myrank].cpbuf[index]=NULL;*/
        tMPI_Atomic_ptr_set(&(cev->met[myrank].cpbuf[index]), NULL);

        tMPI_Atomic_set(&(cev->met[myrank].buf_readcount), 0);
    }
#endif
    cev->met[myrank].tag            = tag;
    cev->met[myrank].datatype       = datatype;
    cev->met[myrank].buf[index]     = buf;
    cev->met[myrank].bufsize[index] = bufsize;
    tMPI_Atomic_set(&(cev->met[myrank].n_remaining), n_remaining);
    tMPI_Atomic_memory_barrier_rel();
    tMPI_Atomic_set(&(cev->met[myrank].current_sync), synct);

    /* publish availability. */
    if (dest < 0)
    {
        for (i = 0; i < cev->N; i++)
        {
            if (i != myrank)
            {
                tMPI_Event_signal( &(cev->met[i].recv_ev) );
            }
        }
    }
    else
    {
        tMPI_Event_signal( &(cev->met[dest].recv_ev) );
    }

#ifdef USE_COLLECTIVE_COPY_BUFFER
    /* because we've published availability, we can start copying --
       possibly in parallel with the receiver */
    if (using_cb)
    {
        struct tmpi_thread *cur = tMPI_Get_current();

        /* copy the buffer locally. First allocate */
        cev->met[myrank].cb = tMPI_Copy_buffer_list_get( &(cur->cbl_multi) );
        if (cev->met[myrank].cb->size < bufsize)
        {
            fprintf(stderr, "ERROR: cb size too small\n");
            exit(1);
        }
        /* copy to the new buf */
        memcpy(cev->met[myrank].cb->buf, buf, bufsize);

        /* post the new buf */
        tMPI_Atomic_memory_barrier_rel();
        /*cev->met[myrank].cpbuf[index]=cev->met[myrank].cb->buf;*/
        tMPI_Atomic_ptr_set(&(cev->met[myrank].cpbuf[index]),
                            cev->met[myrank].cb->buf);
    }
#endif
}
int tMPI_Alltoall(void* sendbuf, int sendcount, tMPI_Datatype sendtype,
                  void* recvbuf, int recvcount, tMPI_Datatype recvtype,
                  tMPI_Comm comm)
{
    int                 synct;
    struct coll_env    *cev;
    int                 myrank;
    int                 ret = TMPI_SUCCESS;
    int                 i;
    size_t              sendsize = sendtype->size*sendcount;
    size_t              recvsize = recvtype->size*recvcount;
    int                 n_remaining;
    struct tmpi_thread *cur = tMPI_Get_current();

#ifdef TMPI_PROFILE
    tMPI_Profile_count_start(cur);
#endif
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Alltoall(%p, %d, %p, %p, %d, %p, %p)",
                     sendbuf, sendcount, sendtype,
                     recvbuf, recvcount, recvtype, comm);
#endif

    if (!comm)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }
    if (!sendbuf || !recvbuf) /* don't do pointer arithmetic on a NULL ptr */
    {
        return tMPI_Error(comm, TMPI_ERR_BUF);
    }

    myrank = tMPI_Comm_seek_rank(comm, cur);

    /* we increase our counter, and determine which coll_env we get */
    cev = tMPI_Get_cev(comm, myrank, &synct);

    /* post our pointers */
    /* we set up multiple posts, so no Post_multi */
    cev->met[myrank].tag      = TMPI_ALLTOALL_TAG;
    cev->met[myrank].datatype = sendtype;
    tMPI_Atomic_set( &(cev->met[myrank].n_remaining), cev->N-1 );
    for (i = 0; i < comm->grp.N; i++)
    {
        cev->met[myrank].bufsize[i]   = sendsize;
        cev->met[myrank].buf[i]       = (char*)sendbuf+sendsize*i;
        cev->met[myrank].read_data[i] = FALSE;
    }
    tMPI_Atomic_memory_barrier_rel();
    tMPI_Atomic_set(&(cev->met[myrank].current_sync), synct);

    /* post availability */
    for (i = 0; i < cev->N; i++)
    {
        if (i != myrank)
        {
            tMPI_Event_signal( &(cev->met[i].recv_ev) );
        }
    }

    /* we don't do the copy buffer thing here because it's pointless:
       the processes have to synchronize anyway, because they all
       send and receive. */

    /* do root transfer */
    tMPI_Coll_root_xfer(comm, sendtype, recvtype, sendsize, recvsize,
                        (char*)sendbuf+sendsize*myrank,
                        (char*)recvbuf+recvsize*myrank, &ret);
    cev->met[myrank].read_data[myrank] = TRUE;

    /* and poll data availability */
    n_remaining = cev->N-1;
    while (n_remaining > 0)
    {
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
        tMPI_Profile_wait_start(cur);
#endif
        tMPI_Event_wait( &(cev->met[myrank]).recv_ev );
#if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
        tMPI_Profile_wait_stop(cur, TMPIWAIT_Coll_recv);
#endif
        for (i = 0; i < cev->N; i++)
        {
            if ((!cev->met[myrank].read_data[i]) &&
                (tMPI_Atomic_get(&(cev->met[i].current_sync)) == synct))
            {
                tMPI_Event_process( &(cev->met[myrank]).recv_ev, 1);
                tMPI_Mult_recv(comm, cev, i, myrank, TMPI_ALLTOALL_TAG,
                               recvtype, recvsize,
                               (char*)recvbuf+recvsize*i, &ret);
                if (ret != TMPI_SUCCESS)
                {
                    return ret;
                }
                cev->met[myrank].read_data[i] = TRUE;
                n_remaining--;
            }
        }
    }

    /* and wait until everybody is done copying our data */
    tMPI_Wait_for_others(cev, myrank);

#ifdef TMPI_PROFILE
    tMPI_Profile_count_stop(cur, TMPIFN_Alltoall);
#endif
    return ret;
}
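/* Illustrative sketch (not part of the library): a minimal all-to-all
   exchange in which every rank sends one element to every other rank.
   The TMPI_INT datatype, buffer sizes, and helper function are assumptions
   made for illustration; N is taken to be the communicator size. */
#if 0
static void example_alltoall(tMPI_Comm comm, int N)
{
    int *sendbuf = (int*)tMPI_Malloc(sizeof(int)*N);
    int *recvbuf = (int*)tMPI_Malloc(sizeof(int)*N);
    int  i;

    for (i = 0; i < N; i++)
    {
        sendbuf[i] = i; /* element i is destined for rank i */
    }
    /* after this call, recvbuf[j] holds the element that rank j sent us */
    tMPI_Alltoall(sendbuf, 1, TMPI_INT, recvbuf, 1, TMPI_INT, comm);
}
#endif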