/* initialize Cartesian topology info in comm. If ndims==0, dims and
   periods are not referenced */
static void tMPI_Cart_init(tMPI_Comm *comm_cart, int ndims, int *dims,
                           int *periods)
{
    int newrank = -1;
    int i;

    if (*comm_cart)
    {
        tMPI_Comm_rank(*comm_cart, &newrank);
    }

    if (newrank == 0)
    {
        (*comm_cart)->cart = (struct cart_topol*)tMPI_Malloc(
                                        sizeof(struct cart_topol));
        (*comm_cart)->cart->dims    = (int*)tMPI_Malloc(ndims*sizeof(int));
        (*comm_cart)->cart->periods = (int*)tMPI_Malloc(ndims*sizeof(int));
        (*comm_cart)->cart->ndims   = ndims;
        for (i = 0; i < ndims; i++)
        {
            (*comm_cart)->cart->dims[i]    = dims[i];
            (*comm_cart)->cart->periods[i] = periods[i];
        }
    }

    /* and we add a barrier to make sure the cart object is seen by every
       thread that is part of the new communicator */
    if (*comm_cart)
    {
        tMPI_Barrier_wait( &( (*comm_cart)->barrier) );
    }
}
int tMPI_Thread_create_aff(tMPI_Thread_t *thread,
                           void *(*start_routine)(void *), void *arg)
{
    DWORD thread_id;
    struct tMPI_Thread_starter_param *prm;

    tMPI_Init_initers();
    tMPI_Set_main_thread_affinity();

    /* a small memory leak to be sure that it doesn't get deallocated
       once this function ends, before the newly created thread uses it. */
    prm = (struct tMPI_Thread_starter_param*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_starter_param));
    prm->start_routine = start_routine;
    prm->param         = arg;

    if (thread == NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Invalid thread pointer.");
        return EINVAL;
    }
    *thread = (struct tMPI_Thread*)tMPI_Malloc(sizeof(struct tMPI_Thread)*1);

    if (g_ulHighestNumaNodeNumber != 0)
    {
        /* if running on a NUMA system, use the group and NUMA aware thread
           creation logic */
        (*thread)->th = tMPI_Thread_create_NUMA(NULL,
                                                0,
                                                tMPI_Win32_thread_starter,
                                                prm,
                                                0,
                                                &thread_id);
    }
    else
    {
        /* TODO: for now, non-NUMA systems don't set thread affinity. */
        (*thread)->th = CreateThread(NULL,
                                     0,
                                     tMPI_Win32_thread_starter,
                                     prm,
                                     0,
                                     &thread_id);
    }
    if ((*thread)->th == NULL)
    {
        tMPI_Free(*thread);
        tMPI_Fatal_error(TMPI_FARGS, "Failed to create thread, error code=%d",
                         GetLastError());
        return -1;
    }

    /* inherit the thread priority from the parent thread. */
    /* TODO: is there value in setting this, vs. just allowing it to default
       from the process? currently, this limits the effectiveness of changing
       the priority in eg: TaskManager. */
    SetThreadPriority(((*thread)->th), GetThreadPriority(GetCurrentThread()));

    return 0;
}
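/* Illustrative usage sketch (not part of the library source): how a caller
   might start and join a thread with the affinity-aware creator above. The
   include path and tMPI_Thread_join()'s pthread-style signature are
   assumptions based on thread_mpi's public threads API. */
#if 0
#include "thread_mpi/threads.h"

static void *worker(void *arg)
{
    int *value = (int*)arg;
    *value    += 1;            /* trivial piece of work */
    return NULL;
}

static int example_create_aff(void)
{
    tMPI_Thread_t th;
    int           data = 41;

    if (tMPI_Thread_create_aff(&th, worker, &data) != 0)
    {
        return -1;
    }
    /* wait for the worker and discard its return value */
    tMPI_Thread_join(th, NULL);
    return data;               /* 42 if the worker ran */
}
#endif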
int tMPI_Cart_sub(tMPI_Comm comm, int *remain_dims, tMPI_Comm *newcomm)
{
    int  myrank;
    int  ndims         = 0;
    int *dims          = NULL;
    int *periods       = NULL;
    int *oldcoords     = NULL;
    int  i;
    int  ndims_notused = 1;
    int  color_notused = 0;

#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Cart_sub(%p, %p, %p)", comm, remain_dims, newcomm);
#endif

    tMPI_Comm_rank(comm, &myrank);
    if (comm->cart)
    {
        oldcoords = (int*)tMPI_Malloc(sizeof(int)*comm->cart->ndims);
        dims      = (int*)tMPI_Malloc(sizeof(int)*comm->cart->ndims);
        periods   = (int*)tMPI_Malloc(sizeof(int)*comm->cart->ndims);

        /* get old coordinates */
        tMPI_Cart_coords(comm, myrank, comm->cart->ndims, oldcoords);

        for (i = 0; i < comm->cart->ndims; i++)
        {
            if (remain_dims[i])
            {
                /* for the remaining dimensions, copy dimensionality data */
                dims[ndims]    = comm->cart->dims[i];
                periods[ndims] = comm->cart->periods[i];
                ndims++;
            }
            else
            {
                /* base color on not used coordinates. We keep a
                   ndims_notused index multiplier. */
                color_notused += oldcoords[i]*ndims_notused;
                ndims_notused *= comm->cart->dims[i];
            }
        }
    }

    /* key=myrank, because we want the order to remain the same */
    tMPI_Comm_split(comm, color_notused, myrank, newcomm);
    tMPI_Cart_init(newcomm, ndims, dims, periods);

    if (oldcoords)
    {
        free(oldcoords);
    }
    if (dims)
    {
        free(dims);
    }
    if (periods)
    {
        free(periods);
    }

    return TMPI_SUCCESS;
}
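/* Illustrative usage sketch (not part of the library source): splitting a
   2x3 Cartesian communicator into per-row sub-communicators. The include
   path and the MPI-style tMPI_Cart_create() signature are assumptions based
   on thread_mpi's public API. */
#if 0
#include "thread_mpi/tmpi.h"

static void example_cart_sub(void)
{
    int       dims[2]    = {2, 3};
    int       periods[2] = {0, 0};
    int       remain[2]  = {0, 1};  /* keep dimension 1: one comm per row */
    tMPI_Comm cart, row;

    tMPI_Cart_create(TMPI_COMM_WORLD, 2, dims, periods, 0, &cart);
    /* ranks sharing the coordinate in dimension 0 end up in the same comm */
    tMPI_Cart_sub(cart, remain, &row);
}
#endif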
void tMPI_Coll_envt_init(struct coll_env_thread *met, int N)
{
    tMPI_Atomic_set(&(met->current_sync), 0);
    tMPI_Atomic_set(&(met->n_remaining), 0);
    met->buf       = (void**)tMPI_Malloc(sizeof(void*)*N);
    met->bufsize   = (size_t*)tMPI_Malloc(sizeof(size_t)*N);
    met->read_data = (tmpi_bool*)tMPI_Malloc(sizeof(tmpi_bool)*N);

#ifdef USE_COLLECTIVE_COPY_BUFFER
    met->cpbuf    = (tMPI_Atomic_ptr_t*)tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*N);
    met->cb       = NULL;
    met->using_cb = FALSE;
#endif

    tMPI_Event_init( &(met->send_ev) );
    tMPI_Event_init( &(met->recv_ev) );
}
int tMPI_Thread_barrier_init(tMPI_Thread_barrier_t *barrier, int n)
{
    if (barrier == NULL)
    {
        return EINVAL;
    }

    barrier->barrierp = (struct tMPI_Thread_barrier*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_barrier)*1);

#if 0
    /* use this once Vista is the oldest supported windows version: */
    InitializeCriticalSection(&(barrier->barrierp->cs));
    InitializeConditionVariable(&(barrier->barrierp->cv));
#else
    tMPI_Thread_mutex_init(&(barrier->barrierp->cs));
    tMPI_Thread_cond_init(&(barrier->barrierp->cv));
#endif

    barrier->threshold = n;
    barrier->count     = n;
    barrier->cycle     = 0;

    return 0;
}
static int tMPI_Thread_barrier_init_once(tMPI_Thread_barrier_t *barrier)
{
    int ret = 0;

    /* we're relying on the memory barrier semantics of mutex_lock/unlock
       for the check preceding this function call to have worked */
    pthread_mutex_lock( &(barrier_init) );
    if (barrier->barrierp == NULL)
    {
        barrier->barrierp = (struct tMPI_Thread_barrier*)
                  tMPI_Malloc(sizeof(struct tMPI_Thread_barrier)*1);
        ret = pthread_mutex_init(&(barrier->barrierp->mutex), NULL);
        if (ret != 0)
        {
            tMPI_Fatal_error(TMPI_FARGS,
                             "Error initializing POSIX mutex. rc=%d", ret);
            return ret;
        }

        ret = pthread_cond_init(&(barrier->barrierp->cv), NULL);
        if (ret != 0)
        {
            tMPI_Fatal_error(TMPI_FARGS,
                             "Error initializing POSIX condition variable. rc=%d",
                             ret);
            return ret;
        }
    }
    pthread_mutex_unlock( &(barrier_init) );
    return ret;
}
int tMPI_Thread_key_create(tMPI_Thread_key_t *key, void (*destructor)(void *))
{
    if (key == NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Invalid key pointer.");
        return EINVAL;
    }

    /* TODO: make list of destructors for thread-local storage */
    key->key = (struct tMPI_Thread_key*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_key)*1);

    (key)->key->wkey = TlsAlloc();

    if ((key)->key->wkey == TLS_OUT_OF_INDEXES)
    {
        tMPI_Fatal_error(TMPI_FARGS,
                         "Failed to create thread key, error code=%d.",
                         GetLastError());
        return -1;
    }

    return 0;
}
int tMPI_Type_contiguous(int count, tMPI_Datatype oldtype,
                         tMPI_Datatype *newtype)
{
    struct tmpi_datatype_ *ntp;

#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Type_contiguous(%d, %p, %p)", count, oldtype,
                     newtype);
#endif

    ntp               = (struct tmpi_datatype_*)tMPI_Malloc(
                                sizeof(struct tmpi_datatype_));
    ntp->size         = count*oldtype->size;
    ntp->op_functions = NULL;

    /* establish components */
    ntp->N_comp = 1;
    ntp->comps  = (struct tmpi_datatype_component*)tMPI_Malloc(
                          sizeof(struct tmpi_datatype_component)*1);
    ntp->comps[0].type  = oldtype;
    ntp->comps[0].count = 1;
    ntp->committed      = FALSE;

    /* now add it to the list. */
    tMPI_Spinlock_lock(&(tmpi_global->datatype_lock));
    /* check whether there's space */
    if (tmpi_global->N_usertypes + 1 >= tmpi_global->Nalloc_usertypes)
    {
        /* make space */
        tmpi_global->Nalloc_usertypes = Nthreads*(tmpi_global->N_usertypes) + 1;
        tmpi_global->usertypes        = (struct tmpi_datatype_**)
                  tMPI_Realloc(tmpi_global->usertypes,
                               (sizeof(struct tmpi_datatype_ *)*
                                tmpi_global->Nalloc_usertypes));
    }
    /* add to the list */
    tmpi_global->usertypes[tmpi_global->N_usertypes] = ntp;
    tmpi_global->N_usertypes++;
    *newtype = ntp;
    tMPI_Spinlock_unlock(&(tmpi_global->datatype_lock));

    return TMPI_SUCCESS;
}
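/* Illustrative usage sketch (not part of the library source): building a
   contiguous datatype of three doubles and committing it before use. The
   include path, TMPI_DOUBLE and the MPI-style tMPI_Type_commit() are
   assumptions based on thread_mpi's public API. */
#if 0
#include "thread_mpi/tmpi.h"

static void example_type_contiguous(void)
{
    tMPI_Datatype vec3;

    /* one "vec3" element covers 3 consecutive doubles */
    tMPI_Type_contiguous(3, TMPI_DOUBLE, &vec3);
    tMPI_Type_commit(&vec3);

    /* vec3 can now be passed as the datatype argument of point-to-point
       and collective calls */
}
#endif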
int tMPI_Thread_mutex_init(tMPI_Thread_mutex_t *mtx)
{
    if (mtx == NULL)
    {
        return EINVAL;
    }

    mtx->mutex = (struct tMPI_Mutex*)tMPI_Malloc(sizeof(struct tMPI_Mutex)*1);
    InitializeCriticalSection(&(mtx->mutex->cs));

    return 0;
}
int tMPI_Coll_env_init(struct coll_env *cev, int N)
{
    int i;

    cev->met = (struct coll_env_thread*)tMPI_Malloc(
                       sizeof(struct coll_env_thread)*N);
    cev->N = N;
    tMPI_Atomic_set(&(cev->coll.current_sync), 0);
    tMPI_Atomic_set(&(cev->coll.n_remaining), 0);
    for (i = 0; i < N; i++)
    {
        tMPI_Coll_envt_init(&(cev->met[i]), N);
    }

    return TMPI_SUCCESS;
}
int tMPI_Coll_sync_init(struct coll_sync *csync, int N)
{
    int i;

    csync->synct = 0;
    csync->syncs = 0;
    csync->N     = N;

    csync->events = (tMPI_Event*)tMPI_Malloc(sizeof(tMPI_Event)*N);
    for (i = 0; i < N; i++)
    {
        tMPI_Event_init( &(csync->events[i]) );
    }

    return TMPI_SUCCESS;
}
static int tMPI_Thread_mutex_init_once(tMPI_Thread_mutex_t *mtx)
{
    int ret = 0;

    /* we're relying on the memory barrier semantics of mutex_lock/unlock
       for the check preceding this function call to have worked */
    pthread_mutex_lock( &(mutex_init) );
    if (mtx->mutex == NULL)
    {
        mtx->mutex = (struct tMPI_Mutex*)tMPI_Malloc(sizeof(struct tMPI_Mutex)*1);
        ret        = pthread_mutex_init( &(mtx->mutex->mtx), NULL);
    }
    pthread_mutex_unlock( &(mutex_init) );

    return ret;
}
static int tMPI_Thread_cond_init_once(tMPI_Thread_cond_t *cond)
{
    int ret = 0;

    /* we're relying on the memory barrier semantics of mutex_lock/unlock
       for the check preceding this function call to have worked */
    pthread_mutex_lock( &(cond_init) );
    if (cond->condp == NULL)
    {
        cond->condp = (struct tMPI_Thread_cond*)
                  tMPI_Malloc(sizeof(struct tMPI_Thread_cond)*1);
        ret = pthread_cond_init( &(cond->condp->cond), NULL);
    }
    pthread_mutex_unlock( &(cond_init) );

    return ret;
}
static void tMPI_Thread_init(struct tmpi_thread *th)
{
    int N_envelopes      = (Nthreads+1)*N_EV_ALLOC;
    int N_send_envelopes = N_EV_ALLOC;
    int N_reqs           = (Nthreads+1)*N_EV_ALLOC;
    int i;

    /* we set our thread id, as a thread-specific piece of global data. */
    tMPI_Thread_setspecific(id_key, th);

    /* allocate comm.self */
    tMPI_Comm_alloc(&(th->self_comm), TMPI_COMM_WORLD, 1);
    th->self_comm->grp.peers[0] = th;

    /* allocate envelopes */
    tMPI_Free_env_list_init( &(th->envelopes), N_envelopes );
    /* recv list */
    tMPI_Recv_env_list_init( &(th->evr));
    /* send lists */
    th->evs = (struct send_envelope_list*)tMPI_Malloc(
                      sizeof(struct send_envelope_list)*Nthreads);
    for (i = 0; i < Nthreads; i++)
    {
        tMPI_Send_env_list_init( &(th->evs[i]), N_send_envelopes);
    }

    tMPI_Atomic_set( &(th->ev_outgoing_received), 0);

    tMPI_Event_init( &(th->p2p_event) );

    /* allocate requests */
    tMPI_Req_list_init(&(th->rql), N_reqs);

#ifdef USE_COLLECTIVE_COPY_BUFFER
    /* allocate copy_buffer list */
    tMPI_Copy_buffer_list_init(&(th->cbl_multi), (Nthreads+1)*(N_COLL_ENV+1),
                               Nthreads*COPY_BUFFER_SIZE);
#endif

#ifdef TMPI_PROFILE
    tMPI_Profile_init(&(th->profile));
#endif

    /* now wait for all other threads to come on line, before we start the
       MPI program */
    tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );
}
void tMPI_Copy_buffer_list_init(struct copy_buffer_list *cbl, int Nbufs,
                                size_t size)
{
    int i;

    cbl->size     = size;
    cbl->cb_alloc = (struct copy_buffer*)
              tMPI_Malloc(sizeof(struct copy_buffer)*Nbufs);
    cbl->cb       = cbl->cb_alloc; /* the first one */
    cbl->Nbufs    = Nbufs;
    for (i = 0; i < Nbufs; i++)
    {
        tMPI_Copy_buffer_init( &(cbl->cb_alloc[i]), size );
        if (i < Nbufs-1)
        {
            cbl->cb_alloc[i].next = &(cbl->cb_alloc[i+1]);
        }
        else
        {
            cbl->cb_alloc[i].next = NULL;
        }
    }
}
int tMPI_Thread_cond_init(tMPI_Thread_cond_t *cond)
{
    int ret;

    if (cond == NULL)
    {
        return EINVAL;
    }

    cond->condp = (struct tMPI_Thread_cond*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_cond)*1);
    ret = pthread_cond_init(&(cond->condp->cond), NULL);

    if (ret != 0)
    {
        tMPI_Fatal_error(TMPI_FARGS,
                         "Error initializing POSIX condition variable. rc=%d",
                         ret);
        fflush(stderr);
    }
    tMPI_Atomic_set(&(cond->initialized), 1);

    return ret;
}
int tMPI_Thread_mutex_init(tMPI_Thread_mutex_t *mtx)
{
    int ret;

    if (mtx == NULL)
    {
        return EINVAL;
    }

    mtx->mutex = (struct tMPI_Mutex*)tMPI_Malloc(sizeof(struct tMPI_Mutex)*1);
    ret        = pthread_mutex_init(&(mtx->mutex->mtx), NULL);
    if (ret != 0)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Error initializing POSIX mutex. rc=%d",
                         ret);
        /* Use system memory allocation routines */
        return ret;
    }

    tMPI_Atomic_set(&(mtx->initialized), 1);

    return 0;
}
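/* Illustrative usage sketch (not part of the library source): protecting a
   shared counter with a statically initialized tMPI mutex. The include path
   and the TMPI_THREAD_MUTEX_INITIALIZER macro are assumptions based on
   thread_mpi's public threads API. */
#if 0
#include "thread_mpi/threads.h"

static tMPI_Thread_mutex_t counter_lock = TMPI_THREAD_MUTEX_INITIALIZER;
static int                 counter      = 0;

static void example_increment(void)
{
    /* for a statically initialized mutex, the lazy one-time initialization
       shown in tMPI_Thread_mutex_init_once() above runs on first use */
    tMPI_Thread_mutex_lock(&counter_lock);
    counter++;
    tMPI_Thread_mutex_unlock(&counter_lock);
}
#endif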
int tMPI_Thread_cond_init(tMPI_Thread_cond_t *cond)
{
    if (cond == NULL)
    {
        return EINVAL;
    }

    cond->condp = (struct tMPI_Thread_cond*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_cond)*1);

#if 0
    /* use this code once Vista is the minimum version required */
    InitializeConditionVariable( &(cond->cv) );
#else
    cond->condp->Nwaiters = 0;
    InitializeCriticalSection(&(cond->condp->wtr_lock));
    cond->condp->Nrelease = 0;
    cond->condp->cycle    = 0;

    /* a manual reset, unsignalled event */
    cond->condp->ev = CreateEvent(NULL, TRUE, FALSE, NULL);
#endif

    return 0;
}
int tMPI_Thread_barrier_init(tMPI_Thread_barrier_t *barrier, int n)
{
    int ret;
    /*tMPI_Thread_pthread_barrier_t *p;*/

    if (barrier == NULL)
    {
        return EINVAL;
    }

    barrier->barrierp = (struct tMPI_Thread_barrier*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_barrier)*1);
    ret = pthread_mutex_init(&(barrier->barrierp->mutex), NULL);
    if (ret != 0)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Error initializing POSIX mutex. rc=%d",
                         ret);
        return ret;
    }

    ret = pthread_cond_init(&(barrier->barrierp->cv), NULL);
    if (ret != 0)
    {
        tMPI_Fatal_error(TMPI_FARGS,
                         "Error initializing POSIX condition variable. rc=%d",
                         ret);
        return ret;
    }

    barrier->threshold = n;
    barrier->count     = n;
    barrier->cycle     = 0;

    tMPI_Atomic_set(&(barrier->initialized), 1);
    return 0;
}
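/* Illustrative usage sketch (not part of the library source): a barrier
   shared by a fixed number of worker threads. The include path and
   tMPI_Thread_barrier_wait() are assumptions based on thread_mpi's public
   threads API. */
#if 0
#include "thread_mpi/threads.h"

#define N_WORKERS 4

static tMPI_Thread_barrier_t start_barrier;

static void *barrier_worker(void *arg)
{
    /* every worker blocks here until all N_WORKERS have arrived */
    tMPI_Thread_barrier_wait(&start_barrier);
    /* ... work that requires everyone to have passed the barrier ... */
    return NULL;
}

static void example_barrier_setup(void)
{
    tMPI_Thread_barrier_init(&start_barrier, N_WORKERS);
    /* N_WORKERS threads running barrier_worker() would be created here */
}
#endif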
int tMPI_Thread_key_create(tMPI_Thread_key_t *key, void (*destructor)(void *))
{
    int ret;

    if (key == NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Invalid key pointer.");
        return EINVAL;
    }

    key->key = (struct tMPI_Thread_key*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_key)*1);
    ret = pthread_key_create(&((key)->key->pkey), destructor);
    if (ret != 0)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Failed to create thread key, rc=%d.",
                         ret);
        fflush(stderr);
        return -1;
    }

    tMPI_Atomic_set(&(key->initialized), 1);
    return 0;
}
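/* Illustrative usage sketch (not part of the library source): thread-local
   storage with a tMPI key, mirroring how the library itself stores the
   per-thread id in tMPI_Thread_init() above. The include path and
   tMPI_Thread_getspecific() are assumptions based on thread_mpi's public
   threads API; note that the Windows backend above lists destructor support
   as a TODO, so the destructor is only honoured by the pthreads backend. */
#if 0
#include "thread_mpi/threads.h"
#include <stdlib.h>

static tMPI_Thread_key_t scratch_key;

static void example_key_setup(void)
{
    /* register free() as the per-thread destructor */
    tMPI_Thread_key_create(&scratch_key, free);
}

static void *example_get_scratch(size_t size)
{
    void *buf = tMPI_Thread_getspecific(scratch_key);
    if (buf == NULL)
    {
        buf = malloc(size);
        tMPI_Thread_setspecific(scratch_key, buf);
    }
    return buf;
}
#endif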
int tMPI_Comm_alloc(tMPI_Comm *newcomm, tMPI_Comm parent, int N)
{
    struct tmpi_comm_ *retc;
    int                i;
    int                ret;

    retc = (struct tmpi_comm_*)tMPI_Malloc(sizeof(struct tmpi_comm_));
    if (retc == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }

    retc->grp.peers = (struct tmpi_thread**)tMPI_Malloc(
                              sizeof(struct tmpi_thread*)*Nthreads);
    if (retc->grp.peers == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }
    retc->grp.N = N;

    ret = tMPI_Thread_mutex_init( &(retc->comm_create_lock) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    ret = tMPI_Thread_cond_init( &(retc->comm_create_prep) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    ret = tMPI_Thread_cond_init( &(retc->comm_create_finish) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }

    retc->split    = NULL;
    retc->new_comm = NULL;
    /* we have no topology to start out with */
    retc->cart = NULL;
    /*retc->graph=NULL;*/

    /* we start counting at 0 */
    tMPI_Atomic_set( &(retc->destroy_counter), 0);

    /* initialize the main barrier */
    tMPI_Barrier_init(&(retc->barrier), N);

    /* the reduce barriers */
    {
        /* First calculate the number of reduce barriers */
        int Niter = 0; /* the iteration number */
        int Nred  = N; /* the number of reduce barriers for this iteration */
        while (Nred > 1)
        {
            /* Nred becomes Nred/2 + a rest term, because a solitary
               process at the end of the list must still be accounted for */
            Nred   = Nred/2 + Nred%2;
            Niter += 1;
        }

        retc->N_reduce_iter = Niter;
        /* allocate the list */
        retc->reduce_barrier = (tMPI_Barrier_t**)
                  tMPI_Malloc(sizeof(tMPI_Barrier_t*)*(Niter+1));
        if (retc->reduce_barrier == NULL)
        {
            return TMPI_ERR_NO_MEM;
        }
        retc->N_reduce = (int*)tMPI_Malloc(sizeof(int)*(Niter+1));
        if (retc->N_reduce == NULL)
        {
            return TMPI_ERR_NO_MEM;
        }

        /* we re-set Nred to N */
        Nred = N;
        for (i = 0; i < Niter; i++)
        {
            int j;

            Nred              = Nred/2 + Nred%2;
            retc->N_reduce[i] = Nred;
            /* allocate the sub-list */
            retc->reduce_barrier[i] = (tMPI_Barrier_t*)
                      tMPI_Malloc(sizeof(tMPI_Barrier_t)*(Nred));
            if (retc->reduce_barrier[i] == NULL)
            {
                return TMPI_ERR_NO_MEM;
            }
            for (j = 0; j < Nred; j++)
            {
                tMPI_Barrier_init(&(retc->reduce_barrier[i][j]), 2);
            }
        }
    }

    /* the reduce buffers */
    retc->reduce_sendbuf = (tMPI_Atomic_ptr_t*)
              tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*Nthreads);
    if (retc->reduce_sendbuf == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }
    retc->reduce_recvbuf = (tMPI_Atomic_ptr_t*)
              tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*Nthreads);
    if (retc->reduce_recvbuf == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }

    if (parent)
    {
        retc->erh = parent->erh;
    }
    else
    {
        retc->erh = TMPI_ERRORS_ARE_FATAL;
    }

    /* coll_env objects */
    retc->cev = (struct coll_env*)tMPI_Malloc(sizeof(struct coll_env)*
                                              N_COLL_ENV);
    if (retc->cev == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }
    for (i = 0; i < N_COLL_ENV; i++)
    {
        ret = tMPI_Coll_env_init( &(retc->cev[i]), N);
        if (ret != TMPI_SUCCESS)
        {
            return ret;
        }
    }
    /* multi_sync objects */
    retc->csync = (struct coll_sync*)tMPI_Malloc(sizeof(struct coll_sync)*N);
    if (retc->csync == NULL)
    {
        return TMPI_ERR_NO_MEM;
    }
    for (i = 0; i < N; i++)
    {
        ret = tMPI_Coll_sync_init( &(retc->csync[i]), N);
        if (ret != TMPI_SUCCESS)
        {
            return ret;
        }
    }

    ret = tMPI_Thread_mutex_lock( &(tmpi_global->comm_link_lock) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    /* we insert ourselves in the circular list, after TMPI_COMM_WORLD */
    if (TMPI_COMM_WORLD)
    {
        retc->next = TMPI_COMM_WORLD;
        retc->prev = TMPI_COMM_WORLD->prev;

        TMPI_COMM_WORLD->prev->next = retc;
        TMPI_COMM_WORLD->prev       = retc;
    }
    else
    {
        retc->prev = retc->next = retc;
    }
    ret = tMPI_Thread_mutex_unlock( &(tmpi_global->comm_link_lock) );
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    *newcomm = retc;
    return TMPI_SUCCESS;
}
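/* Worked example (not part of the library source): the reduce-barrier loop
   above sizes a binary reduction tree by repeatedly pairing threads and
   rounding the odd one up, so for N=6 the per-iteration barrier counts are
   3, 2, 1 and N_reduce_iter is 3. A minimal standalone check of that
   arithmetic: */
#if 0
#include <stdio.h>

static void example_reduce_iterations(int N)
{
    int Niter = 0;
    int Nred  = N;

    while (Nred > 1)
    {
        Nred = Nred/2 + Nred%2;  /* pairs, plus one barrier for a leftover thread */
        printf("iteration %d: %d pairwise barriers\n", Niter, Nred);
        Niter++;
    }
    printf("N=%d needs %d reduce iterations\n", N, Niter);  /* N=6 -> 3 */
}
#endif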
void tMPI_Start_threads(tmpi_bool main_returns, int N, int *argc, char ***argv,
                        void (*start_fn)(void*), void *start_arg,
                        int (*start_fn_main)(int, char**))
{
#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Start_threads(%d, %p, %p, %p, %p)", N, argc,
                     argv, start_fn, start_arg);
#endif
    if (N > 0)
    {
        int i;
        int set_affinity = FALSE;

        tmpi_finalized = FALSE;
        Nthreads       = N;

        /* allocate global data */
        tmpi_global = (struct tmpi_global*)
                  tMPI_Malloc(sizeof(struct tmpi_global));
        tMPI_Global_init(tmpi_global, N);

        /* allocate world and thread data */
        threads = (struct tmpi_thread*)tMPI_Malloc(sizeof(struct tmpi_thread)*N);
        tMPI_Comm_alloc(&TMPI_COMM_WORLD, NULL, N);
        TMPI_GROUP_EMPTY = tMPI_Group_alloc();

        if (tMPI_Thread_key_create(&id_key, NULL))
        {
            tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
        }

        for (i = 0; i < N; i++)
        {
            TMPI_COMM_WORLD->grp.peers[i] = &(threads[i]);

            /* copy argc, argv */
            if (argc && argv)
            {
                int j;
                threads[i].argc = *argc;
                threads[i].argv = (char**)tMPI_Malloc(threads[i].argc*
                                                      sizeof(char*));
                for (j = 0; j < threads[i].argc; j++)
                {
#if !(defined( _WIN32 ) || defined( _WIN64 ))
                    threads[i].argv[j] = strdup( (*argv)[j] );
#else
                    threads[i].argv[j] = _strdup( (*argv)[j] );
#endif
                }
            }
            else
            {
                threads[i].argc = 0;
                threads[i].argv = NULL;
            }
            threads[i].start_fn      = start_fn;
            threads[i].start_fn_main = start_fn_main;
            threads[i].start_arg     = start_arg;
        }

        /* now check whether to set affinity */
#ifdef TMPI_THREAD_AFFINITY
        {
            int nhw = tMPI_Thread_get_hw_number();
            if ((nhw > 1) && (nhw == N))
            {
                set_affinity = TRUE;
            }
        }
#endif

        for (i = 1; i < N; i++) /* zero is the main thread */
        {
            int ret;

            if (set_affinity)
            {
                ret = tMPI_Thread_create_aff(&(threads[i].thread_id),
                                             tMPI_Thread_starter,
                                             (void*)&(threads[i]));
            }
            else
            {
                ret = tMPI_Thread_create(&(threads[i].thread_id),
                                         tMPI_Thread_starter,
                                         (void*)&(threads[i]));
            }
            if (ret)
            {
                tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
            }
        }
        /* the main thread now also runs start_fn if we don't want it to
           return */
        if (!main_returns)
        {
            tMPI_Thread_starter((void*)&(threads[0]));
        }
        else
        {
            tMPI_Thread_init(&(threads[0]));
        }
    }
}
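/* Illustrative usage sketch (not part of the library source): what a direct
   call into tMPI_Start_threads looks like. In practice this function is
   reached through the public tMPI_Init*() entry points; the start function
   below is a made-up example. */
#if 0
static void my_task(void *arg)
{
    int rank;

    tMPI_Comm_rank(TMPI_COMM_WORLD, &rank);
    /* ... per-thread work ... */
}

static void example_start(int *argc, char ***argv)
{
    /* main_returns=TRUE: the calling thread is set up as rank 0 and
       returns to the caller instead of running my_task itself */
    tMPI_Start_threads(TRUE, 4, argc, argv, my_task, NULL, NULL);
}
#endif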
/* initialize a copy buffer */
void tMPI_Copy_buffer_init(struct copy_buffer *cb, size_t size)
{
    cb->buf  = tMPI_Malloc(size);
    cb->size = size;
}
/* returns 0 on success.
   Success is returned if the system is non-NUMA, OR the system doesn't
   support appropriate NUMA APIs, OR the system is NUMA and we successfully
   initialized support.

   returns -1 on error.
   This can happen if an API returned an error, a memory allocation failed,
   or we failed to initialize affinity mapping information. */
int tMPI_Init_NUMA(void)
{
    /* module handle to kernel32.dll -- we already reference it, so it's
       already loaded */
    HMODULE hModKernel32 = NULL;
    /* 0-based NUMA node count -- does not imply all nodes have available
       (eg: hot-plug) processors */
    ULONG ulHighestNumaNodeNumber;
    /* total number of processors available per affinity masks */
    DWORD dwTotalProcessors = 0;
    ULONG i                 = 0;
    /* calling thread PROCESSOR_NUMBER */
    PROCESSOR_NUMBER CurrentProcessorNumber;
    /* calling thread GROUP_AFFINITY */
    GROUP_AFFINITY CurrentThreadGroupAffinity;
    /* calling thread NUMA node */
    USHORT CurrentNumaNodeNumber;

    WORD wActiveGroupCount;
    WORD GroupIndex;

    /* array of processor information structures */
    MPI_NUMA_PROCESSOR_INFO *pMPI_ProcessorInfo = NULL;

    /* assume an error condition */
    int iRet = -1;

    hModKernel32 = GetModuleHandleA("kernel32.dll");

    if (hModKernel32 == NULL)
    {
        return 0;
    }

    /* obtain addresses of relevant NUMA functions, most of which are
       Windows 7 / Windows Server 2008R2 only functions.  This is done using
       GetProcAddress to enable the binary to run on older Windows versions. */
    func_GetNumaHighestNodeNumber = (func_GetNumaHighestNodeNumber_t)
              GetProcAddress( hModKernel32, "GetNumaHighestNodeNumber" );

    if (func_GetNumaHighestNodeNumber == NULL)
    {
        return 0;
    }

    /* determine if we're on a NUMA system and if so, determine the number
       of (potential) nodes */
    if (!func_GetNumaHighestNodeNumber( &ulHighestNumaNodeNumber ))
    {
        return -1;
    }

    if (ulHighestNumaNodeNumber == 0)
    {
        /* system is not NUMA */
        return 0;
    }

    func_SetThreadGroupAffinity = (func_SetThreadGroupAffinity_t)
              GetProcAddress( hModKernel32, "SetThreadGroupAffinity" );
    func_SetThreadIdealProcessorEx = (func_SetThreadIdealProcessorEx_t)
              GetProcAddress( hModKernel32, "SetThreadIdealProcessorEx" );
    func_CreateRemoteThreadEx = (func_CreateRemoteThreadEx_t)
              GetProcAddress( hModKernel32, "CreateRemoteThreadEx" );
    func_GetNumaNodeProcessorMaskEx = (func_GetNumaNodeProcessorMaskEx_t)
              GetProcAddress( hModKernel32, "GetNumaNodeProcessorMaskEx" );
    func_GetNumaProcessorNodeEx = (func_GetNumaProcessorNodeEx_t)
              GetProcAddress( hModKernel32, "GetNumaProcessorNodeEx" );
    func_GetCurrentProcessorNumberEx = (func_GetCurrentProcessorNumberEx_t)
              GetProcAddress( hModKernel32, "GetCurrentProcessorNumberEx" );
    func_GetActiveProcessorCount = (func_GetActiveProcessorCount_t)
              GetProcAddress( hModKernel32, "GetActiveProcessorCount" );
    func_GetActiveProcessorGroupCount = (func_GetActiveProcessorGroupCount_t)
              GetProcAddress( hModKernel32, "GetActiveProcessorGroupCount" );
    func_InitializeProcThreadAttributeList = (func_InitializeProcThreadAttributeList_t)
              GetProcAddress( hModKernel32, "InitializeProcThreadAttributeList" );
    func_UpdateProcThreadAttribute = (func_UpdateProcThreadAttribute_t)
              GetProcAddress( hModKernel32, "UpdateProcThreadAttribute" );
    func_DeleteProcThreadAttributeList = (func_DeleteProcThreadAttributeList_t)
              GetProcAddress( hModKernel32, "DeleteProcThreadAttributeList" );

    if ((func_SetThreadGroupAffinity == NULL) ||
        (func_SetThreadIdealProcessorEx == NULL) ||
        (func_CreateRemoteThreadEx == NULL) ||
        (func_GetNumaNodeProcessorMaskEx == NULL) ||
        (func_GetNumaProcessorNodeEx == NULL) ||
        (func_GetCurrentProcessorNumberEx == NULL) ||
        (func_GetActiveProcessorCount == NULL) ||
        (func_GetActiveProcessorGroupCount == NULL) ||
        (func_InitializeProcThreadAttributeList == NULL) ||
        (func_UpdateProcThreadAttribute == NULL) ||
        (func_DeleteProcThreadAttributeList == NULL))
    {
        /* if any addresses couldn't be located, assume NUMA functionality
           isn't supported */
        return 0;
    }

    /* count the active processors across the groups */
    func_GetCurrentProcessorNumberEx(&CurrentProcessorNumber);
    wActiveGroupCount = func_GetActiveProcessorGroupCount();
    dwTotalProcessors = func_GetActiveProcessorCount( ALL_PROCESSOR_GROUPS );

#if !((defined WIN64 || defined _WIN64))
    /* WOW64 doesn't allow setting the affinity correctly beyond 32
       processors -- the KAFFINITY mask is only 32 bits wide.  This check is
       only here for completeness -- large systems should be running 64bit
       Gromacs code, where the processor quantity is not constrained.  By
       failing here, the WOW64 32bit client will use normal CreateThread(),
       which can schedule up to 64 un-affinitized threads. */
    if (dwTotalProcessors > 32)
    {
        return 0;
    }
#endif

    /* allocate array of processor info blocks */
    pMPI_ProcessorInfo = tMPI_Malloc( sizeof(MPI_NUMA_PROCESSOR_INFO) *
                                      dwTotalProcessors );
    if (pMPI_ProcessorInfo == NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,
                         "tMPI_Malloc failed for processor information");
        goto cleanup;
    }

    /* zero fill to cover reserved must-be-zero fields */
    memset(pMPI_ProcessorInfo, 0, sizeof(MPI_NUMA_PROCESSOR_INFO) *
           dwTotalProcessors);

    /* loop through each processor group, and for each group, capture the
       processor numbers and NUMA node information. */
    for (GroupIndex = 0; GroupIndex < wActiveGroupCount; GroupIndex++)
    {
        DWORD dwGroupProcessorCount;
        BYTE  ProcessorIndex;

        dwGroupProcessorCount = func_GetActiveProcessorCount( GroupIndex );

        for (ProcessorIndex = 0; ProcessorIndex < dwGroupProcessorCount;
             ProcessorIndex++)
        {
            PROCESSOR_NUMBER *pProcessorNumber =
                &(pMPI_ProcessorInfo[i].ProcessorNumber);
            GROUP_AFFINITY *pGroupAffinity =
                &(pMPI_ProcessorInfo[i].GroupAffinity);
            USHORT *pNodeNumber = &(pMPI_ProcessorInfo[i].NumaNodeNumber);

            pProcessorNumber->Group  = GroupIndex;
            pProcessorNumber->Number = ProcessorIndex;

            /* save an index to the processor array entry for the current
               processor.  This is used to enable subsequent threads to be
               created in a round robin fashion starting at the next array
               entry. */
            if ((CurrentProcessorNumber.Group == pProcessorNumber->Group) &&
                (CurrentProcessorNumber.Number == pProcessorNumber->Number))
            {
                /* set global: current thread index into processor array */
                g_ulThreadIndex = i;
            }

            /* capture the node number and group affinity associated with
               this processor entry.  Any failures here are assumed to be
               catastrophic and disable the group & NUMA aware thread
               support. */
            if (!func_GetNumaProcessorNodeEx(pProcessorNumber, pNodeNumber))
            {
                tMPI_Fatal_error(TMPI_FARGS,
                                 "Processor enumeration, GetNumaProcessorNodeEx failed, error code=%d",
                                 GetLastError());
                goto cleanup;
            }

            if (!func_GetNumaNodeProcessorMaskEx(*pNodeNumber, pGroupAffinity))
            {
                tMPI_Fatal_error(TMPI_FARGS,
                                 "Processor enumeration, GetNumaNodeProcessorMaskEx failed, error code=%d",
                                 GetLastError());
                goto cleanup;
            }

            /* future enhancement: construct GroupAffinity (single) processor
               mask within NUMA node for this processor entry */

            /* increment processor array index */
            i++;

            /* sanity check, should never happen */
            if (i > dwTotalProcessors)
            {
                tMPI_Fatal_error(TMPI_FARGS,
                                 "Processor enumeration exceeds allocated memory!");
                goto cleanup;
            }
        }
    }

#if 0
    /* set the NUMA node affinity for the current thread.  Failures to set
       the current thread affinity are ignored, as a fringe case can arise
       on >32 processor systems with a 32bit build/code. */
    func_SetThreadIdealProcessorEx(GetCurrentThread(),
                                   &CurrentProcessorNumber,
                                   NULL);

    if (func_GetNumaProcessorNodeEx(&CurrentProcessorNumber,
                                    &CurrentNumaNodeNumber))
    {
        /* for the NUMA node number associated with the current processor
           number, get the group affinity mask */
        if (func_GetNumaNodeProcessorMaskEx(CurrentNumaNodeNumber,
                                            &CurrentThreadGroupAffinity))
        {
            /* set the current thread affinity to prevent it from running
               on other NUMA nodes */
            func_SetThreadGroupAffinity(GetCurrentThread(),
                                        &CurrentThreadGroupAffinity,
                                        NULL);
        }
    }
#endif

    /* capture number of processors, highest NUMA node number, and
       processor array */
    g_ulTotalProcessors       = dwTotalProcessors;
    g_ulHighestNumaNodeNumber = ulHighestNumaNodeNumber;
    g_MPI_ProcessorInfo       = pMPI_ProcessorInfo;

    iRet = 0;

#if 0   /* TODO: debug DISCARD */
    printf("primary thread tid=%lu group=%lu mask=0x%I64x group=%lu number=%lu ulThreadIndex=%lu\n",
           GetCurrentThreadId(),
           CurrentThreadGroupAffinity.Group,
           (ULONGLONG)CurrentThreadGroupAffinity.Mask,
           (ULONG)CurrentProcessorNumber.Group,
           (ULONG)CurrentProcessorNumber.Number,
           g_ulThreadIndex);
#endif

cleanup:

    if (iRet != 0)
    {
        if (pMPI_ProcessorInfo)
        {
            tMPI_Free( pMPI_ProcessorInfo );
        }
    }

    return iRet;
}
HANDLE tMPI_Thread_create_NUMA(LPSECURITY_ATTRIBUTES lpThreadAttributes,
                               SIZE_T dwStackSize,
                               LPTHREAD_START_ROUTINE lpStartAddress,
                               LPVOID lpParameter,
                               DWORD dwCreationFlags,
                               LPDWORD lpThreadId)
{
    LPPROC_THREAD_ATTRIBUTE_LIST pAttributeList = NULL;
    HANDLE           hThread         = NULL;
    SIZE_T           cbAttributeList = 0;
    GROUP_AFFINITY   GroupAffinity;
    PROCESSOR_NUMBER IdealProcessorNumber;
    ULONG            CurrentProcessorIndex;

    /* for each thread created, round-robin through the set of valid
       processors and affinity masks.  The assumption is that callers of
       tMPI_Thread_create_NUMA are creating threads that saturate a given
       processor.  For cases where threads are being created that rarely do
       work, standard thread creation (eg: CreateThread) should be invoked
       instead. */
    CurrentProcessorIndex = (ULONG)InterlockedIncrement((volatile LONG *)
                                                        &g_ulThreadIndex);
    CurrentProcessorIndex = CurrentProcessorIndex % g_ulTotalProcessors;

    /* group, mask. */
    memcpy(&GroupAffinity,
           &(g_MPI_ProcessorInfo[CurrentProcessorIndex].GroupAffinity),
           sizeof(GROUP_AFFINITY));

    /* group, processor number */
    memcpy(&IdealProcessorNumber,
           &(g_MPI_ProcessorInfo[CurrentProcessorIndex].ProcessorNumber),
           sizeof(PROCESSOR_NUMBER));

    /* determine size of allocation for AttributeList */
    if (!func_InitializeProcThreadAttributeList(pAttributeList,
                                                2,
                                                0,
                                                &cbAttributeList))
    {
        DWORD dwLastError = GetLastError();
        if (dwLastError != ERROR_INSUFFICIENT_BUFFER)
        {
            tMPI_Fatal_error(TMPI_FARGS,
                             "InitializeProcThreadAttributeList, error code=%d",
                             dwLastError);
            goto cleanup;
        }
    }

    pAttributeList = (LPPROC_THREAD_ATTRIBUTE_LIST)tMPI_Malloc( cbAttributeList );
    if (pAttributeList == NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Failed to allocate pAttributeList");
        goto cleanup;
    }

    memset( pAttributeList, 0, cbAttributeList );

    if (!func_InitializeProcThreadAttributeList(pAttributeList,
                                                2,
                                                0,
                                                &cbAttributeList))
    {
        tMPI_Fatal_error(TMPI_FARGS,
                         "InitializeProcThreadAttributeList, error code=%d",
                         GetLastError());
        goto cleanup;
    }

    if (!func_UpdateProcThreadAttribute(pAttributeList,
                                        0,
                                        PROC_THREAD_ATTRIBUTE_GROUP_AFFINITY,
                                        &GroupAffinity,
                                        sizeof(GroupAffinity),
                                        NULL,
                                        NULL))
    {
        tMPI_Fatal_error(TMPI_FARGS, "UpdateProcThreadAttribute, error code=%d",
                         GetLastError());
        goto cleanup;
    }

    if (!func_UpdateProcThreadAttribute(pAttributeList,
                                        0,
                                        PROC_THREAD_ATTRIBUTE_IDEAL_PROCESSOR,
                                        &IdealProcessorNumber,
                                        sizeof(IdealProcessorNumber),
                                        NULL,
                                        NULL))
    {
        tMPI_Fatal_error(TMPI_FARGS, "UpdateProcThreadAttribute, error code=%d",
                         GetLastError());
        goto cleanup;
    }

    hThread = func_CreateRemoteThreadEx( GetCurrentProcess(),
                                         lpThreadAttributes,
                                         dwStackSize,
                                         lpStartAddress,
                                         lpParameter,
                                         dwCreationFlags,
                                         pAttributeList,
                                         lpThreadId);

    func_DeleteProcThreadAttributeList( pAttributeList );

#if 0   /* TODO: debug only or DISCARD */
    if (hThread)
    {
        PROCESSOR_NUMBER ProcNumber;
        USHORT           NodeNumber;

        GetThreadIdealProcessorEx(hThread, &ProcNumber);
        GetNumaProcessorNodeEx(&ProcNumber, &NodeNumber);

        printf("started thread tid=%lu group=%lu mask=0x%I64x number=%lu numanode=%lu\n",
               *lpThreadId,
               GroupAffinity.Group,
               (ULONGLONG)GroupAffinity.Mask,
               ProcNumber.Number,
               NodeNumber);
    }
#endif

cleanup:

    if (pAttributeList)
    {
        tMPI_Free( pAttributeList );
    }

    return hThread;
}
/* this is the main comm creation function. All other functions that create
   comms use this. */
int tMPI_Comm_split(tMPI_Comm comm, int color, int key, tMPI_Comm *newcomm)
{
    int                 i, j;
    int                 N = tMPI_Comm_N(comm);
    volatile tMPI_Comm *newcomm_list;
    volatile int        colors[MAX_PREALLOC_THREADS]; /* array with the colors
                                                         of each thread */
    volatile int        keys[MAX_PREALLOC_THREADS];   /* same for keys (only
                                                         one of the threads
                                                         actually supplies
                                                         these arrays to the
                                                         comm structure) */
    tmpi_bool           i_am_first = FALSE;
    int                 myrank     = tMPI_Comm_seek_rank(comm,
                                                         tMPI_Get_current());
    struct tmpi_split  *spl;
    int                 ret;

#ifdef TMPI_TRACE
    tMPI_Trace_print("tMPI_Comm_split(%p, %d, %d, %p)", comm, color, key,
                     newcomm);
#endif
    if (!comm)
    {
        *newcomm = NULL;
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
    }

    ret = tMPI_Thread_mutex_lock(&(comm->comm_create_lock));
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }
    /* first get the colors */
    if (!comm->new_comm)
    {
        /* I am apparently first */
        comm->split    = (struct tmpi_split*)tMPI_Malloc(sizeof(struct tmpi_split));
        comm->new_comm = (tMPI_Comm*)tMPI_Malloc(N*sizeof(tMPI_Comm));
        if (N <= MAX_PREALLOC_THREADS)
        {
            comm->split->colors = colors;
            comm->split->keys   = keys;
        }
        else
        {
            comm->split->colors = (int*)tMPI_Malloc(N*sizeof(int));
            comm->split->keys   = (int*)tMPI_Malloc(N*sizeof(int));
        }
        comm->split->Ncol_init  = tMPI_Comm_N(comm);
        comm->split->can_finish = FALSE;
        i_am_first              = TRUE;
        /* the main communicator contains a list the size of grp.N */
    }
    newcomm_list = comm->new_comm; /* we copy it to the local stack because
                                      we can later erase comm->new_comm
                                      safely */
    spl = comm->split;             /* we do the same for spl */
    spl->colors[myrank] = color;
    spl->keys[myrank]   = key;
    spl->Ncol_init--;

    if (spl->Ncol_init == 0)
    {
        ret = tMPI_Thread_cond_signal(&(comm->comm_create_prep));
        if (ret != 0)
        {
            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
        }
    }

    if (!i_am_first)
    {
        /* all other threads can just wait until the creator thread is
           finished */
        while (!spl->can_finish)
        {
            ret = tMPI_Thread_cond_wait(&(comm->comm_create_finish),
                                        &(comm->comm_create_lock));
            if (ret != 0)
            {
                return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
            }
        }
    }
    else
    {
        int        Ncomms = 0;
        int        comm_color_[MAX_PREALLOC_THREADS];
        int        comm_N_[MAX_PREALLOC_THREADS];
        int       *comm_color = comm_color_; /* there can't be more comms than N */
        int       *comm_N     = comm_N_;     /* the number of procs in a group */
        int       *comm_groups;              /* the groups */
        tMPI_Comm *comms;                    /* the communicators */

        /* wait for the colors to be done */
        /*if (N>1)*/
        while (spl->Ncol_init > 0)
        {
            ret = tMPI_Thread_cond_wait(&(comm->comm_create_prep),
                                        &(comm->comm_create_lock));
            if (ret != 0)
            {
                return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
            }
        }

        /* reset the state so that a new comm creating function can run */
        spl->Ncol_destroy = N;
        comm->new_comm    = 0;
        comm->split       = 0;

        comm_groups = (int*)tMPI_Malloc(N*N*sizeof(int));
        if (N > MAX_PREALLOC_THREADS)
        {
            comm_color = (int*)tMPI_Malloc(N*sizeof(int));
            comm_N     = (int*)tMPI_Malloc(N*sizeof(int));
        }

        /* count colors, allocate and split up communicators */
        tMPI_Split_colors(N, (int*)spl->colors,
                          (int*)spl->keys,
                          &Ncomms,
                          comm_N, comm_color, comm_groups);

        /* allocate a bunch of communicators */
        comms = (tMPI_Comm*)tMPI_Malloc(Ncomms*sizeof(tMPI_Comm));
        for (i = 0; i < Ncomms; i++)
        {
            ret = tMPI_Comm_alloc(&(comms[i]), comm, comm_N[i]);
            if (ret != TMPI_SUCCESS)
            {
                return ret;
            }
        }

        /* now distribute the comms */
        for (i = 0; i < Ncomms; i++)
        {
            comms[i]->grp.N = comm_N[i];
            for (j = 0; j < comm_N[i]; j++)
            {
                comms[i]->grp.peers[j] =
                    comm->grp.peers[comm_groups[i*comm->grp.N + j]];
            }
        }
        /* and put them into the newcomm_list */
        for (i = 0; i < N; i++)
        {
            newcomm_list[i] = TMPI_COMM_NULL;
            for (j = 0; j < Ncomms; j++)
            {
                if (spl->colors[i] == comm_color[j])
                {
                    newcomm_list[i] = comms[j];
                    break;
                }
            }
        }

#ifdef TMPI_DEBUG
        /* output */
        for (i = 0; i < Ncomms; i++)
        {
            printf("Group %d (color %d) has %d members: ",
                   i, comm_color[i], comm_N[i]);
            for (j = 0; j < comm_N[i]; j++)
            {
                printf(" %d ", comm_groups[comm->grp.N*i + j]);
            }

            printf(" rank: ");
            for (j = 0; j < comm_N[i]; j++)
            {
                printf(" %d ", spl->keys[comm_groups[N*i + j]]);
            }
            printf(" color: ");
            for (j = 0; j < comm_N[i]; j++)
            {
                printf(" %d ", spl->colors[comm_groups[N*i + j]]);
            }
            printf("\n");
        }
#endif

        if (N > MAX_PREALLOC_THREADS)
        {
            free((int*)spl->colors);
            free((int*)spl->keys);
            free(comm_color);
            free(comm_N);
        }
        free(comm_groups);
        free(comms);
        spl->can_finish = TRUE;

        /* tell the waiting threads that there's a comm ready */
        ret = tMPI_Thread_cond_broadcast(&(comm->comm_create_finish));
        if (ret != 0)
        {
            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
        }
    }

    /* here the individual threads get their comm object */
    *newcomm = newcomm_list[myrank];

    /* free when we have assigned them all, so we can reuse the object */
    spl->Ncol_destroy--;
    if (spl->Ncol_destroy == 0)
    {
        free((void*)newcomm_list);
        free(spl);
    }

    ret = tMPI_Thread_mutex_unlock(&(comm->comm_create_lock));
    if (ret != 0)
    {
        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
    }

    return TMPI_SUCCESS;
}
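/* Illustrative usage sketch (not part of the library source): splitting
   TMPI_COMM_WORLD into one communicator of even ranks and one of odd ranks,
   ordered by the original rank. The include path and tMPI_Comm_size() are
   assumptions based on thread_mpi's public MPI-style API. */
#if 0
#include "thread_mpi/tmpi.h"

static void example_split_even_odd(void)
{
    int       rank, size;
    tMPI_Comm half;

    tMPI_Comm_rank(TMPI_COMM_WORLD, &rank);
    tMPI_Comm_size(TMPI_COMM_WORLD, &size);

    /* color selects the sub-communicator; key=rank keeps the old order */
    tMPI_Comm_split(TMPI_COMM_WORLD, rank % 2, rank, &half);
}
#endif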