Example #1
0
int tMPI_Thread_create(tMPI_Thread_t *thread, void *(*start_routine)(void *),
                       void *arg)
{
    int ret;

    if(thread==NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Invalid thread pointer.");
        return EINVAL;
    }

    *thread=(struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
    if(*thread==NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Failed to allocate thread memory.");
        return ENOMEM;
    }
    ret=pthread_create(&((*thread)->th),NULL,start_routine,arg);

    if(ret!=0)
    {
        /* Cannot use tMPI_error() since messages use threads for locking;
           note that pthread_create returns the error code directly and
           does not set errno. */
        tMPI_Fatal_error(TMPI_FARGS,"Failed to create POSIX thread:%s, rc=%d",
                         strerror(ret), ret);
        free(*thread);
        return -1;
    }

    return 0;
}
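A minimal caller-side sketch of the create/join API shown in this listing (tMPI_Thread_join appears in Examples #5 and #13). The worker function and its argument are purely illustrative, and the thread_mpi headers are assumed to be included; note that on the Win32 join path the returned pointer is not propagated (see the TODO in Example #5).

/* Hypothetical worker, for illustration only. */
static void *example_worker(void *arg)
{
    int *value=(int*)arg;
    (*value)++;                 /* trivial per-thread work */
    return arg;                 /* available through tMPI_Thread_join */
}

static int example_create_and_join(void)
{
    tMPI_Thread_t  thread;
    void          *retval;
    int            data=41;

    if (tMPI_Thread_create(&thread, example_worker, &data) != 0)
    {
        return -1;              /* creation failed */
    }
    if (tMPI_Thread_join(thread, &retval) != 0)
    {
        return -1;              /* join failed */
    }
    return 0;                   /* data is now 42 */
}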
Example #2
0
static int tMPI_Thread_barrier_init_once(tMPI_Thread_barrier_t *barrier)
{
    int ret=0;

    /* we're relying on the memory barrier semantics of mutex_lock/unlock
       for the check preceding this function call to have worked */
    pthread_mutex_lock( &(barrier_init) );    
    if(barrier->barrierp==NULL)
    {
        barrier->barrierp=(struct tMPI_Thread_barrier*)
                  tMPI_Malloc(sizeof(struct tMPI_Thread_barrier)*1);
        ret = pthread_mutex_init(&(barrier->barrierp->mutex),NULL);

        if(ret!=0)
        {
            tMPI_Fatal_error(TMPI_FARGS,"Error initializing POSIX mutex. rc=%d",
                             ret);
            pthread_mutex_unlock( &(barrier_init) );
            return ret;
        }

        ret = pthread_cond_init(&(barrier->barrierp->cv),NULL);

        if(ret!=0)
        {
            tMPI_Fatal_error(TMPI_FARGS,
                             "Error initializing POSIX condition variable. rc=%d",
                             ret);
            pthread_mutex_unlock( &(barrier_init) );
            return ret;
        }
    }
    pthread_mutex_unlock( &(barrier_init) );    
    return ret;
}
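For context, a sketch of the kind of unlocked fast-path check that the memory-barrier comment above refers to; it mirrors the pattern visible in the cond_broadcast example later in this listing. The enclosing function is hypothetical, and the barrier's initialized flag is taken from Example #11.

/* Hypothetical caller: cheap check first, locked re-check inside
   tMPI_Thread_barrier_init_once(). */
static void example_barrier_enter(tMPI_Thread_barrier_t *barrier)
{
    if (tMPI_Atomic_get( &(barrier->initialized) ) == 0)
    {
        tMPI_Thread_barrier_init_once(barrier);
    }
    /* ... the actual barrier wait would follow here ... */
}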
int tMPI_Thread_create_aff(tMPI_Thread_t *thread,
                           void *(*start_routine)(void *), void *arg)
{
    DWORD thread_id;
    struct tMPI_Thread_starter_param *prm;

    tMPI_Init_initers();
    tMPI_Set_main_thread_affinity();

    if(thread==NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Invalid thread pointer.");
        return EINVAL;
    }

    /* a small memory leak to be sure that it doesn't get deallocated 
       once this function ends, before the newly created thread uses it. */
    prm=(struct tMPI_Thread_starter_param*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_starter_param));
    prm->start_routine= start_routine;
    prm->param=arg;

    *thread=(struct tMPI_Thread*)tMPI_Malloc(sizeof(struct tMPI_Thread)*1);

    if( g_ulHighestNumaNodeNumber != 0 )
    {
        /* if running on a NUMA system, use the group and NUMA aware thread 
           creation logic */
        (*thread)->th = tMPI_Thread_create_NUMA(NULL,
                                                0,
                                                tMPI_Win32_thread_starter,
                                                prm,
                                                0, 
                                                &thread_id);
    } else {
        /* TODO: for now, non-NUMA systems don't set thread affinity. */
        (*thread)->th = CreateThread(NULL,
                                     0,
                                     tMPI_Win32_thread_starter,
                                     prm,
                                     0, 
                                     &thread_id);
    }

    if((*thread)->th==NULL)
    {
        tMPI_Free(*thread);
        tMPI_Fatal_error(TMPI_FARGS,"Failed to create thread, error code=%d",
                         GetLastError());
        return -1;
    }

    /* inherit the thread priority from the parent thread. */
    /* TODO: is there value in setting this, vs. just allowing it to default 
       from the process?  Currently, this limits the effectiveness of changing 
       the priority in e.g. Task Manager. */
    SetThreadPriority(((*thread)->th), GetThreadPriority(GetCurrentThread()));

    return 0;
}
Example #4
0
int tMPI_Thread_key_create(tMPI_Thread_key_t *key, void (*destructor)(void *))
{
    if(key==NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Invalid key pointer.");
        return EINVAL;
    }


    /* TODO: make list of destructors for thread-local storage */
    key->key=(struct tMPI_Thread_key*)tMPI_Malloc(sizeof(struct 
                                                         tMPI_Thread_key)*1);
 
    (key)->key->wkey=TlsAlloc();

    if ( (key)->key->wkey == TLS_OUT_OF_INDEXES ) 
    {
        tMPI_Fatal_error(TMPI_FARGS,
                         "Failed to create thread key, error code=%d.",
                         GetLastError());
        tMPI_Free(key->key);
        return -1;
    }

    return 0;
}
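A sketch of how such a key is typically used for per-thread storage. The tMPI_Thread_setspecific/tMPI_Thread_getspecific companions are assumed to exist with pthread-like signatures; they are not shown in this listing, and on the Win32 side they would presumably map onto TlsSetValue/TlsGetValue with the wkey index allocated above.

/* Illustrative only; the set/getspecific calls are assumed APIs. */
static tMPI_Thread_key_t example_key;  /* created with tMPI_Thread_key_create */

static void example_store_thread_data(void *data)
{
    tMPI_Thread_setspecific(example_key, data);   /* assumed API */
}

static void *example_load_thread_data(void)
{
    return tMPI_Thread_getspecific(example_key);  /* assumed API */
}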
Example #5
0
int tMPI_Thread_join(tMPI_Thread_t thread, void **value_ptr)
{
    DWORD ret,retval;

    ret = WaitForSingleObject(thread->th, INFINITE);

    if (ret != 0)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Failed to join thread. error code=%d",
                         GetLastError());
        return -1;
    }

    if (value_ptr)
    {
        if (!GetExitCodeThread(thread->th, &retval))
        {
            /* TODO: somehow assign value_ptr */
            tMPI_Fatal_error(TMPI_FARGS,
                             "Failed to get thread exit code: error=%d",
                             GetLastError());
            return -1;
        }
    }
    CloseHandle(thread->th);
    free(thread);

    return 0;
}
int tMPI_Thread_create_aff(tMPI_Thread_t *thread, 
                           void *(*start_routine)(void *),
                           void *arg)
{
    int ret;

    /* set the calling thread's affinity mask */
    if (tMPI_Atomic_get(&main_thread_aff_set) == 0)
    {
#ifdef HAVE_PTHREAD_SETAFFINITY
        cpu_set_t set;
#endif
        /* this can be a spinlock because the chances of collision are low. */
        tMPI_Spinlock_lock( &main_thread_aff_lock );
        tMPI_Atomic_set( &aff_thread_number, 0);
#ifdef HAVE_PTHREAD_SETAFFINITY
        CPU_ZERO(&set);
        CPU_SET(0, &set);
        pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
        /*fprintf(stderr, "Setting affinity.\n");*/
#endif
        tMPI_Atomic_set( &main_thread_aff_set, 1);
        tMPI_Spinlock_unlock( &main_thread_aff_lock );
    }


    if(thread==NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Invalid thread pointer.");
        return EINVAL;
    }

    *thread=(struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
    if(*thread==NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Failed to allocate thread memory.");
        return ENOMEM;
    }
    ret=pthread_create(&((*thread)->th),NULL,start_routine,arg);

    if(ret!=0)
    {
        /* Cannot use tMPI_error() since messages use threads for locking */
        tMPI_Fatal_error(TMPI_FARGS,"Failed to create POSIX thread, rc=%d",ret);
        free(*thread);
        return -1;
    }
    else
    {
#ifdef HAVE_PTHREAD_SETAFFINITY
        int n;
        cpu_set_t set;

        n=tMPI_Atomic_add_return(&aff_thread_number, 1);
        CPU_ZERO(&set);
        CPU_SET(n, &set);
        return pthread_setaffinity_np((*thread)->th, sizeof(set), &set);
#else
        return 0;
#endif
    }
}
Example #7
0
int tMPI_Thread_create_aff(tMPI_Thread_t *thread, 
                           void *(*start_routine)(void *),
                           void *arg)
{
    int ret;

#ifdef TMPI_SET_AFFINITY
    /* set the calling thread's affinity mask */
    pthread_mutex_lock( &(aff_init) );
    if (aff_thread_number==0)
    {
        tMPI_Set_affinity(aff_thread_number++);
    }
    pthread_mutex_unlock( &(aff_init) );
#endif

    if(thread==NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Invalid thread pointer.");
        return EINVAL;
    }

    *thread=(struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
    if(*thread==NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Failed to allocate thread memory.");
        return ENOMEM;
    }
    ret=pthread_create(&((*thread)->th),NULL,start_routine,arg);

    if(ret!=0)
    {
        /* Cannot use tMPI_error() since messages use threads for locking;
           pthread_create returns the error code directly, errno is not set */
        tMPI_Fatal_error(TMPI_FARGS,"Failed to create POSIX thread:%s, rc=%d",
                         strerror(ret), ret);
        free(*thread);
        return -1;
    }
    else
    {
#ifdef TMPI_SET_AFFINITY
        /* now set the affinity of the new thread */
        pthread_mutex_lock( &(aff_init) );
        /* failure is non-fatal, so we don't check the result */
        (void)tMPI_Set_affinity(aff_thread_number++);
        pthread_mutex_unlock( &(aff_init) );
        return 0;
#else
        return 0;
#endif
    }
}
Example #8
0
int tMPI_Thread_once(tMPI_Thread_once_t *once_control, 
                     void (*init_routine)(void))
{
#if 0
    /* use once Vista is minimum required version */
    BOOL bStatus;
    bStatus = InitOnceExecuteOnce(once_control, InitHandleWrapperFunction, 
                                  init_routine, NULL);

    if (!bStatus)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Failed to run thread_once routine");
        return -1;
    }
#else
    /* really ugly hack - and it's slow... */
    tMPI_Init_initers();
    EnterCriticalSection(&once_init);
    if (tMPI_Atomic_get(&(once_control->once)) == 0)
    {
        (*init_routine)();
        tMPI_Atomic_set(&(once_control->once), 1);
    }
    LeaveCriticalSection(&once_init);
#endif
    return 0;
}
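A sketch of the calling pattern for this function. The init routine is hypothetical, and the static initializer TMPI_THREAD_ONCE_INIT is assumed to exist analogously to PTHREAD_ONCE_INIT.

/* Illustrative only; TMPI_THREAD_ONCE_INIT is an assumed initializer. */
static tMPI_Thread_once_t example_once  = TMPI_THREAD_ONCE_INIT;
static int                example_ready = 0;

static void example_init_routine(void)
{
    example_ready = 1;          /* runs exactly once across all threads */
}

static void example_ensure_init(void)
{
    tMPI_Thread_once(&example_once, example_init_routine);
}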
Example #9
0
int tMPI_Thread_cond_broadcast(tMPI_Thread_cond_t *cond)
{
    /* check whether the condition is initialized */
    if (tMPI_Atomic_get( &(cond->initialized)  ) == 0)
    {
        tMPI_Thread_cond_init_once(cond);
    }
    /* The condition variable is now guaranteed to be valid. */
#if 0
    /* use this code once Vista is the minimum version required */
    WakeAllConditionVariable( &(cond->cv) );
#else
    EnterCriticalSection(&(cond->condp->wtr_lock));
    /* check whether there are any waiters */
    if (cond->condp->Nwaiters > 0)
    {
        cond->condp->Nrelease=cond->condp->Nwaiters;
        cond->condp->cycle++;
        if (!SetEvent(cond->condp->ev)) /* actually release the 
                                           waiting threads */
        {
            tMPI_Fatal_error(TMPI_FARGS,"Failed SetEvent, error code=%d",
                             GetLastError());
            return -1;
        }
    }
    LeaveCriticalSection(&(cond->condp->wtr_lock));
#endif
    return 0;
}
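A sketch of the usual signalling pattern around this call: update the shared predicate while holding the mutex, then broadcast. The mutex lock/unlock calls are assumed companion APIs, the flag is hypothetical, and the matching wait loop is sketched after the cond_wait example later in this listing.

/* Illustrative only: wake all waiters after updating a shared flag. */
static tMPI_Thread_mutex_t example_mtx;    /* assumed initialized elsewhere */
static tMPI_Thread_cond_t  example_cond;   /* assumed initialized elsewhere */
static int                 example_flag = 0;

static void example_signal_all(void)
{
    tMPI_Thread_mutex_lock(&example_mtx);    /* assumed companion API */
    example_flag = 1;                        /* change the predicate first */
    tMPI_Thread_cond_broadcast(&example_cond);
    tMPI_Thread_mutex_unlock(&example_mtx);  /* assumed companion API */
}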
Example #10
0
int tMPI_Thread_cancel(tMPI_Thread_t thread)
{
    if (!TerminateThread( thread->th, -1) )
    {
        tMPI_Fatal_error(TMPI_FARGS,"Failed thread_cancel, error code=%d",
                         GetLastError());
        return -1;
    }
    return 0;
}
Example #11
0
int tMPI_Thread_barrier_init(tMPI_Thread_barrier_t *barrier, int n)
{
    int ret;
    /*tMPI_Thread_pthread_barrier_t *p;*/
    
    if(barrier==NULL)
    {
        return EINVAL;
    }
    
    barrier->barrierp=(struct tMPI_Thread_barrier*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_barrier)*1);
    ret = pthread_mutex_init(&(barrier->barrierp->mutex),NULL);
        
    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Error initializing POSIX mutex. rc=%d",
                         ret);
        return ret;
    }
    
    ret = pthread_cond_init(&(barrier->barrierp->cv),NULL);
    
    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,
                         "Error initializing POSIX condition variable. rc=%d",
                         ret);
        return ret;
    }
        
    barrier->threshold = n;
    barrier->count     = n;
    barrier->cycle     = 0;

    tMPI_Atomic_set(&(barrier->initialized), 1);
    return 0;
}
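A sketch of how the initialized barrier is used: each of the n participating threads calls the wait function and all are released together once the count is reached. tMPI_Thread_barrier_wait is an assumed companion call not shown in this listing.

/* Illustrative only: worker synchronizing on a barrier that was set up
   with tMPI_Thread_barrier_init(&example_barrier, n). */
static tMPI_Thread_barrier_t example_barrier;

static void *example_barrier_worker(void *arg)
{
    /* ... per-thread work for phase 1 ... */
    tMPI_Thread_barrier_wait(&example_barrier);   /* assumed API */
    /* ... phase 2 starts only after all n threads have arrived ... */
    return arg;
}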
Example #12
0
int tMPI_Thread_create(tMPI_Thread_t *thread,
                       void *(*start_routine)(void *), void *arg)
{
    DWORD thread_id;
    struct tMPI_Thread_starter_param *prm;

    tMPI_Init_initers();

    if(thread==NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Invalid thread pointer.");
        return EINVAL;
    }

    /* a small memory leak to be sure that it doesn't get deallocated 
       once this function ends */
    prm=(struct tMPI_Thread_starter_param*)
              malloc(sizeof(struct tMPI_Thread_starter_param));
    prm->start_routine= start_routine;
    prm->param=arg;

    *thread=(struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);

    (*thread)->th = CreateThread(NULL, 0, tMPI_Win32_thread_starter, prm, 0, 
                                 &thread_id);

    if((*thread)->th==NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Failed to create thread, error code=%d",
                         GetLastError());
        free(*thread);
        return -1;
    }
    /* inherit the thread priority from the parent thread. */
    SetThreadPriority(((*thread)->th), GetThreadPriority(GetCurrentThread()));

    return 0;
}
Example #13
0
int tMPI_Thread_join(tMPI_Thread_t thread, void **value_ptr)
{
    int ret;
    pthread_t th=thread->th;

    free(thread);
    
    ret = pthread_join( th, value_ptr );

    if(ret != 0 )
    {
        tMPI_Fatal_error(TMPI_FARGS,"Failed to join POSIX thread. rc=%d",ret);
    }
    return ret;
}
Example #14
0
int tMPI_Thread_key_delete(tMPI_Thread_key_t key)
{
    int ret;

    ret=pthread_key_delete((key.key->pkey));
    free(key.key);

    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Failed to delete thread key, rc=%d.",ret);
        fflush(stderr);
    }
    
    return ret;
}
Example #15
0
int tMPI_Thread_key_create(tMPI_Thread_key_t *key, void (*destructor)(void *))
{
    int ret;

    if(key==NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Invalid key pointer.");
        return EINVAL;
    }


    key->key=(struct tMPI_Thread_key*)tMPI_Malloc(sizeof(struct 
                                                         tMPI_Thread_key)*1);
    ret = pthread_key_create(&((key)->key->pkey), destructor);
    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Failed to create thread key, rc=%d.",ret);
        fflush(stderr);
        return -1;
    }

    tMPI_Atomic_set(&(key->initialized), 1);
    return 0;
}
Example #16
0
int tMPI_Thread_mutex_destroy(tMPI_Thread_mutex_t *mtx) 
{
    int ret;

    if(mtx == NULL)
    {
        return EINVAL;
    }
    
    ret = pthread_mutex_destroy( &(mtx->mutex->mtx) );
    free(mtx->mutex);
    
    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Error destroying POSIX mutex. rc=%d",ret);
        /* Use system memory allocation routines */
    }
    return ret;
}
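A sketch of the full mutex lifecycle around the init/destroy pair in this listing (tMPI_Thread_mutex_init appears in Example #19). The lock/unlock calls are assumed companion APIs and the protected counter is hypothetical.

/* Illustrative only: init, lock/unlock around a critical section, destroy. */
static int example_mutex_lifecycle(void)
{
    tMPI_Thread_mutex_t mtx;
    int                 counter = 0;

    if (tMPI_Thread_mutex_init(&mtx) != 0)
    {
        return -1;
    }
    tMPI_Thread_mutex_lock(&mtx);      /* assumed companion API */
    counter++;                         /* critical section */
    tMPI_Thread_mutex_unlock(&mtx);    /* assumed companion API */
    return tMPI_Thread_mutex_destroy(&mtx);
}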
Example #17
0
int tMPI_Thread_cond_destroy(tMPI_Thread_cond_t *cond) 
{
    int ret;
    
    if(cond == NULL)
    {
        return EINVAL;
    }
    
    ret = pthread_cond_destroy(&(cond->condp->cond));
    free(cond->condp);
   
    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,
                         "Error destroying POSIX condition variable. rc=%d",
                         ret);
        fflush(stderr);
    }
    return ret;
}
Example #18
0
int tMPI_Thread_cond_init(tMPI_Thread_cond_t *cond) 
{
    int ret;
    
    if(cond==NULL)
    {
        return EINVAL;
    }
   
    cond->condp=(struct tMPI_Thread_cond*)
              tMPI_Malloc(sizeof(struct tMPI_Thread_cond)*1);
    ret = pthread_cond_init(&(cond->condp->cond), NULL);
    
    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,
                         "Error initializing POSIX condition variable. rc=%d",
                         ret);
        fflush(stderr);
    }
    tMPI_Atomic_set(&(cond->initialized),1);
    return ret;
}
Example #19
0
int tMPI_Thread_mutex_init(tMPI_Thread_mutex_t *mtx) 
{
    int ret;
  
    if (mtx == NULL)
    {
        return EINVAL;
    }

    mtx->mutex=(struct tMPI_Mutex*)tMPI_Malloc(sizeof(struct tMPI_Mutex)*1);
    ret = pthread_mutex_init(&(mtx->mutex->mtx),NULL);
    
    if(ret!=0)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Error initializing POSIX mutex. rc=%d",
                         ret);
        /* Use system memory allocation routines */
        return ret;
    }

    tMPI_Atomic_set(&(mtx->initialized), 1);
    return 0;
}
/*  returns 0 on success.
    Success is returned if the system is non-NUMA, OR the system doesn't 
    support appropriate NUMA APIs, OR the system is NUMA and we successfully 
    initialized support.
    
    returns -1 on error.
    This can happen if an API returned an error, a memory allocation failed, or 
    we failed to initialize affinity mapping information.
*/
int tMPI_Init_NUMA(void)
{
    /* module handle to kernel32.dll -- we already reference it, so it's already loaded */
    HMODULE hModKernel32 = NULL;                    
    /* highest 0-based NUMA node number -- does not imply all nodes have available (e.g. hot-plug) processors */
    ULONG ulHighestNumaNodeNumber;                  
    /* total number of processors available per affinity masks */
    DWORD dwTotalProcessors = 0;                    
    ULONG i = 0;

    /* calling thread PROCESSOR_NUMBER */
    PROCESSOR_NUMBER CurrentProcessorNumber;      
    /* calling thread GROUP_AFFINITY */
    GROUP_AFFINITY CurrentThreadGroupAffinity; 
    /* calling thread NUMA node */
    USHORT CurrentNumaNodeNumber;

    WORD wActiveGroupCount;
    WORD GroupIndex;

    /* array of processor information structures */
    MPI_NUMA_PROCESSOR_INFO *pMPI_ProcessorInfo = NULL; 

    /* assume an error condition */
    int iRet = -1;

    hModKernel32 = GetModuleHandleA("kernel32.dll");

    if( hModKernel32 == NULL )
    {
        return 0;
    }

    /* obtain addresses of relevant NUMA functions, most of which are 
       Windows 7 / Windows Server 2008R2 only functions
       this is done using GetProcAddress to enable the binary to run on older 
       Windows versions.
    */

    func_GetNumaHighestNodeNumber = (func_GetNumaHighestNodeNumber_t) GetProcAddress( hModKernel32, "GetNumaHighestNodeNumber" );

    if( func_GetNumaHighestNodeNumber == NULL )
    {
        return 0;
    }

    /* determine if we're on a NUMA system and if so, determine the number of 
       (potential) nodes */

    if(!func_GetNumaHighestNodeNumber( &ulHighestNumaNodeNumber ))
    {
        return -1;
    }

    if( ulHighestNumaNodeNumber == 0 )
    {
        /* system is not NUMA */
        return 0;
    }

    func_SetThreadGroupAffinity = (func_SetThreadGroupAffinity_t)GetProcAddress( hModKernel32, "SetThreadGroupAffinity" );
    func_SetThreadIdealProcessorEx = (func_SetThreadIdealProcessorEx_t)GetProcAddress( hModKernel32, "SetThreadIdealProcessorEx" );
    func_CreateRemoteThreadEx = (func_CreateRemoteThreadEx_t)GetProcAddress( hModKernel32, "CreateRemoteThreadEx" );
    func_GetNumaNodeProcessorMaskEx = (func_GetNumaNodeProcessorMaskEx_t)GetProcAddress( hModKernel32, "GetNumaNodeProcessorMaskEx" );
    func_GetNumaProcessorNodeEx = (func_GetNumaProcessorNodeEx_t)GetProcAddress( hModKernel32, "GetNumaProcessorNodeEx" );
    func_GetCurrentProcessorNumberEx = (func_GetCurrentProcessorNumberEx_t)GetProcAddress( hModKernel32, "GetCurrentProcessorNumberEx" );
    func_GetActiveProcessorCount = (func_GetActiveProcessorCount_t)GetProcAddress( hModKernel32, "GetActiveProcessorCount" );
    func_GetActiveProcessorGroupCount = (func_GetActiveProcessorGroupCount_t)GetProcAddress( hModKernel32, "GetActiveProcessorGroupCount" );
    func_InitializeProcThreadAttributeList = (func_InitializeProcThreadAttributeList_t)GetProcAddress( hModKernel32, "InitializeProcThreadAttributeList" );
    func_UpdateProcThreadAttribute = (func_UpdateProcThreadAttribute_t)GetProcAddress( hModKernel32, "UpdateProcThreadAttribute" );
    func_DeleteProcThreadAttributeList = (func_DeleteProcThreadAttributeList_t)GetProcAddress( hModKernel32, "DeleteProcThreadAttributeList" );

    if( (func_SetThreadGroupAffinity == NULL) ||
        (func_SetThreadIdealProcessorEx == NULL) ||
        (func_CreateRemoteThreadEx == NULL) ||
        (func_GetNumaNodeProcessorMaskEx == NULL) ||
        (func_GetNumaProcessorNodeEx == NULL) ||
        (func_GetCurrentProcessorNumberEx == NULL) ||
        (func_GetActiveProcessorCount == NULL) ||
        (func_GetActiveProcessorGroupCount == NULL) ||
        (func_InitializeProcThreadAttributeList == NULL) ||
        (func_UpdateProcThreadAttribute == NULL) ||
        (func_DeleteProcThreadAttributeList == NULL) )
    {
        /* if any addresses couldn't be located, assume NUMA functionality 
           isn't supported */
        return 0;
    }

    /* count the active processors across the groups */

    func_GetCurrentProcessorNumberEx(&CurrentProcessorNumber);

    wActiveGroupCount = func_GetActiveProcessorGroupCount();
    
    dwTotalProcessors = func_GetActiveProcessorCount( ALL_PROCESSOR_GROUPS );

#if !((defined WIN64 || defined _WIN64))
    /* WOW64 doesn't allow setting the affinity correctly beyond 32 
       processors -- the KAFFINITY mask is only 32 bits wide
       This check is only here for completeness -- large systems should be 
       running 64bit Gromacs code, where the processor quantity is not 
       constrained.
       By failing here, the WOW64 32bit client will use normal CreateThread(), 
       which can schedule up to 64 un-affinitized threads
    */

    if( dwTotalProcessors > 32 )
    {
        return 0;
    }
#endif

    /* allocate array of processor info blocks */

    pMPI_ProcessorInfo = tMPI_Malloc( sizeof(MPI_NUMA_PROCESSOR_INFO) * 
                                      dwTotalProcessors );
    if(pMPI_ProcessorInfo == NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS,"tMPI_Malloc failed for processor information");
        goto cleanup;
    }

    /* zero fill to cover reserved must-be-zero fields */
    memset(pMPI_ProcessorInfo, 0, sizeof(MPI_NUMA_PROCESSOR_INFO) * dwTotalProcessors);

    /* loop through each processor group, and for each group, capture the 
       processor numbers and NUMA node information. */

    for(GroupIndex = 0 ; GroupIndex < wActiveGroupCount ; GroupIndex++)
    {
        DWORD dwGroupProcessorCount;
        BYTE ProcessorIndex;

        dwGroupProcessorCount = func_GetActiveProcessorCount( GroupIndex );

        for(ProcessorIndex = 0 ; ProcessorIndex < dwGroupProcessorCount ; 
            ProcessorIndex++)
        {
            PROCESSOR_NUMBER *pProcessorNumber = &(pMPI_ProcessorInfo[i].ProcessorNumber);
            GROUP_AFFINITY *pGroupAffinity = &(pMPI_ProcessorInfo[i].GroupAffinity);
            USHORT *pNodeNumber = &(pMPI_ProcessorInfo[i].NumaNodeNumber);

            pProcessorNumber->Group = GroupIndex;
            pProcessorNumber->Number = ProcessorIndex;

            /* save an index to the processor array entry for the current processor
               this is used to enable subsequent threads to be created in a round 
               robin fashion starting at the next array entry
            */

            if( (CurrentProcessorNumber.Group == pProcessorNumber->Group ) &&
                (CurrentProcessorNumber.Number == pProcessorNumber->Number) )
            {
                /* set global: current thread index into processor array */
                g_ulThreadIndex = i;
            }

            /* capture the node number and group affinity associated with processor entry
               any failures here are assumed to be catastrophic and disable 
               the group & NUMA aware thread support
            */

            if(!func_GetNumaProcessorNodeEx(pProcessorNumber, pNodeNumber))
            {
                tMPI_Fatal_error(TMPI_FARGS,
                                 "Processor enumeration, GetNumaProcessorNodeEx failed, error code=%d",
                                 GetLastError());
                goto cleanup;
            }

            if(!func_GetNumaNodeProcessorMaskEx(*pNodeNumber, pGroupAffinity))
            {
                tMPI_Fatal_error(TMPI_FARGS,
                                 "Processor enumeration, GetNumaNodeProcessorMaskEx failed, error code=%d",
                                 GetLastError());
                goto cleanup;
            }

            /* future enhancement: construct GroupAffinity (single) processor 
               mask within NUMA node for this processor entry */

            /* increment processor array index */
            i++;

            /* sanity check, should never happen */

            if(i > dwTotalProcessors)
            {
                tMPI_Fatal_error(TMPI_FARGS,"Processor enumeration exceeds allocated memory!");
                goto cleanup;
            }
        }
    }

#if 0
    /* set the NUMA node affinity for the current thread
       failures to set the current thread affinity are ignored, 
       as a fringe case can arise on >32 processor systems with a 32bit 
       build/code.
    */
    func_SetThreadIdealProcessorEx(GetCurrentThread(), 
                                   &CurrentProcessorNumber, 
                                   NULL);

    if(func_GetNumaProcessorNodeEx(&CurrentProcessorNumber, 
                                   &CurrentNumaNodeNumber))
    {
        /* for the NUMA node number associated with the current processor 
           number, get the group affinity mask */
        if(func_GetNumaNodeProcessorMaskEx(CurrentNumaNodeNumber, 
                                           &CurrentThreadGroupAffinity))
        {
            /* set the current thread affinity to prevent it from running on 
               other NUMA nodes */
            func_SetThreadGroupAffinity(GetCurrentThread(), 
                                        &CurrentThreadGroupAffinity, 
                                        NULL);
        }
    }
#endif
 
    /* capture number of processors, highest NUMA node number, and processor 
       array */
    g_ulTotalProcessors = dwTotalProcessors;
    g_ulHighestNumaNodeNumber = ulHighestNumaNodeNumber;
    g_MPI_ProcessorInfo = pMPI_ProcessorInfo;

    iRet = 0 ;

#if 0   
    /* TODO: debug DISCARD */
    printf("primary thread tid=%lu group=%lu mask=0x%I64x group=%lu number=%lu ulThreadIndex=%lu\n",
        GetCurrentThreadId(),
        CurrentThreadGroupAffinity.Group,
        (ULONGLONG)CurrentThreadGroupAffinity.Mask,
        (ULONG)CurrentProcessorNumber.Group,
        (ULONG)CurrentProcessorNumber.Number,
        g_ulThreadIndex);
#endif

cleanup:

    if( iRet != 0 )
    {
        if( pMPI_ProcessorInfo )
        {
            tMPI_Free( pMPI_ProcessorInfo );
        }
    }

    return iRet;
}
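The doc comment and the GetProcAddress block above describe a run-time binding pattern: resolve the NUMA-related entry points dynamically so the same binary still loads on Windows versions that lack them. A stand-alone sketch of that pattern, using only the documented kernel32 export GetNumaHighestNodeNumber:

/* Illustrative only: detect NUMA support without a load-time dependency
   on newer kernel32 exports. Returns 1 on a NUMA system, 0 otherwise. */
static int example_numa_available(void)
{
    typedef BOOL (WINAPI *GetNumaHighestNodeNumber_t)(PULONG);
    HMODULE                    hKernel32;
    GetNumaHighestNodeNumber_t pfn;
    ULONG                      highest = 0;

    hKernel32 = GetModuleHandleA("kernel32.dll");
    if (hKernel32 == NULL)
    {
        return 0;
    }
    pfn = (GetNumaHighestNodeNumber_t)
          GetProcAddress(hKernel32, "GetNumaHighestNodeNumber");
    if (pfn == NULL || !pfn(&highest))
    {
        return 0;      /* export missing or call failed: treat as non-NUMA */
    }
    return (highest != 0);
}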
Example #21
0
int tMPI_Thread_cond_wait(tMPI_Thread_cond_t *cond, tMPI_Thread_mutex_t *mtx)
{
    BOOL wait_done=FALSE;
    BOOL last_waiter=FALSE;
    int my_cycle;

    /* check whether the condition is initialized */
    if (tMPI_Atomic_get( &(cond->initialized)  ) == 0)
    {
        tMPI_Thread_cond_init_once(cond);
    }
    /* the mutex must have been initialized because it should be locked here */

#if 0
    /* use this code once Vista is the minimum version required */
    BOOL ret;

    ret=SleepConditionVariableCS (&(cond->cv), &(mtx->cs), INFINITE);

    if (!ret)
    {
        tMPI_Fatal_error(TMPI_FARGS,"Failed wait for condition, error code=%d",
                         GetLastError());
        return -1;
    }
#else
    /* serially increase waiter count */
    EnterCriticalSection(&(cond->condp->wtr_lock));
    cond->condp->Nwaiters++;
    my_cycle = cond->condp->cycle;
    LeaveCriticalSection(&(cond->condp->wtr_lock));

    /* now it's safe to release the mutex from the fn call */
    LeaveCriticalSection(&(mtx->mutex->cs));

    /* Loop a wait until we found out we've waited for the right event.
       Note that this loop is potentially a busy-wait loop in bad
       circumstances (higher priority threads, for example). */
    do
    {
        /* do the actual waiting */
        if (WaitForSingleObject( cond->condp->ev, INFINITE )== WAIT_FAILED)
        {
            tMPI_Fatal_error(TMPI_FARGS,"Failed event wait, error code=%d",
                             GetLastError());
            return -1;
        }

        /* serially check whether we got the right event.  */
        EnterCriticalSection(&(cond->condp->wtr_lock));
        wait_done = (cond->condp->Nrelease > 0) && 
                    (cond->condp->cycle!=my_cycle);
        LeaveCriticalSection(&(cond->condp->wtr_lock));
    }
    while(!wait_done);

    /* We obtain the mutex from the function call */
    EnterCriticalSection(&(mtx->mutex->cs));

    /* we serially decrease the waiter count and release count */
    EnterCriticalSection(&(cond->condp->wtr_lock));
    cond->condp->Nwaiters--;
    cond->condp->Nrelease--;
    last_waiter=(cond->condp->Nrelease==0);
    LeaveCriticalSection(&(cond->condp->wtr_lock));

    /* manually release the event if everybody's done with it */
    if (last_waiter)
    {
        if (!ResetEvent( cond->condp->ev ))
        {
            tMPI_Fatal_error(TMPI_FARGS,"Failed event reset, error code=%d",
                             GetLastError());
            return -1;
        }
    }
#endif

    return 0;
}
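The waiting side of the pattern sketched after the cond_broadcast example, using the hypothetical example_mtx/example_cond/example_flag declared there: the predicate is re-checked in a loop while holding the mutex, because the wait can return before the predicate is actually true.

/* Illustrative only: counterpart to example_signal_all() above. */
static void example_wait_for_flag(void)
{
    tMPI_Thread_mutex_lock(&example_mtx);      /* assumed companion API */
    while (example_flag == 0)
    {
        /* releases the mutex while waiting and re-acquires it on return */
        tMPI_Thread_cond_wait(&example_cond, &example_mtx);
    }
    tMPI_Thread_mutex_unlock(&example_mtx);    /* assumed companion API */
}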
HANDLE tMPI_Thread_create_NUMA(LPSECURITY_ATTRIBUTES lpThreadAttributes,
                               SIZE_T dwStackSize,
                               LPTHREAD_START_ROUTINE lpStartAddress,
                               LPVOID lpParameter,
                               DWORD dwCreationFlags,
                               LPDWORD lpThreadId)
{
    LPPROC_THREAD_ATTRIBUTE_LIST pAttributeList = NULL;
    HANDLE hThread = NULL;
    SIZE_T cbAttributeList = 0;
    GROUP_AFFINITY GroupAffinity;
    PROCESSOR_NUMBER IdealProcessorNumber;
    ULONG CurrentProcessorIndex;

    /* for each thread created, round-robin through the set of valid 
       processors and affinity masks.
       the assumption is that callers of tMPI_Thread_create_NUMA are creating 
       threads that saturate a given processor.
       for cases where threads are being created that rarely do work, standard 
       thread creation (eg: CreateThread) should be invoked instead.
    */

    CurrentProcessorIndex = (ULONG)InterlockedIncrement((volatile LONG *)&g_ulThreadIndex);
    CurrentProcessorIndex = CurrentProcessorIndex % g_ulTotalProcessors;

    /* group, mask. */

    memcpy(&GroupAffinity, 
           &(g_MPI_ProcessorInfo[CurrentProcessorIndex].GroupAffinity), 
           sizeof(GROUP_AFFINITY));

    /* group, processor number */
    
    memcpy(&IdealProcessorNumber, 
           &(g_MPI_ProcessorInfo[CurrentProcessorIndex].ProcessorNumber), 
           sizeof(PROCESSOR_NUMBER)); 

    /* determine size of allocation for AttributeList */

    if(!func_InitializeProcThreadAttributeList(pAttributeList,
                                               2,
                                               0,
                                               &cbAttributeList))
    {
        DWORD dwLastError = GetLastError();
        if( dwLastError != ERROR_INSUFFICIENT_BUFFER )
        {
            tMPI_Fatal_error(TMPI_FARGS,
                             "InitializeProcThreadAttributeList, error code=%d",
                             dwLastError);
            goto cleanup;
        }
    }

    pAttributeList = (LPPROC_THREAD_ATTRIBUTE_LIST)tMPI_Malloc( cbAttributeList );
    if( pAttributeList == NULL )
    {
        tMPI_Fatal_error(TMPI_FARGS,"Failed to allocate pAttributeList");
        goto cleanup;
    }

    memset( pAttributeList, 0, cbAttributeList );

    if(!func_InitializeProcThreadAttributeList(pAttributeList,
                                               2,
                                               0,
                                               &cbAttributeList))
    {
        tMPI_Fatal_error(TMPI_FARGS,
                         "InitializeProcThreadAttributeList, error code=%d",
                         GetLastError());
        goto cleanup;
    }

    if(!func_UpdateProcThreadAttribute(pAttributeList,
                                       0,
                                       PROC_THREAD_ATTRIBUTE_GROUP_AFFINITY,
                                       &GroupAffinity,
                                       sizeof(GroupAffinity),
                                       NULL,
                                       NULL))
    {
        tMPI_Fatal_error(TMPI_FARGS,"UpdateProcThreadAttribute, error code=%d",
                         GetLastError());
        goto cleanup;
    }

    if(!func_UpdateProcThreadAttribute(pAttributeList,
                                       0,
                                       PROC_THREAD_ATTRIBUTE_IDEAL_PROCESSOR,
                                       &IdealProcessorNumber,
                                       sizeof(IdealProcessorNumber),
                                       NULL,
                                       NULL))
    {
        tMPI_Fatal_error(TMPI_FARGS,"UpdateProcThreadAttribute, error code=%d",
                         GetLastError());
        goto cleanup;
    }


    hThread = func_CreateRemoteThreadEx( GetCurrentProcess(),
                                         lpThreadAttributes,
                                         dwStackSize,
                                         lpStartAddress,
                                         lpParameter,
                                         dwCreationFlags,
                                         pAttributeList,
                                         lpThreadId);
            
    func_DeleteProcThreadAttributeList( pAttributeList );

#if 0   
    /* TODO: debug only or DISCARD */
    if( hThread )
    {
        PROCESSOR_NUMBER ProcNumber;
        USHORT NodeNumber;

        GetThreadIdealProcessorEx(hThread, &ProcNumber);
        GetNumaProcessorNodeEx(&ProcNumber, &NodeNumber);

        printf("started thread tid=%lu group=%lu mask=0x%I64x number=%lu numanode=%lu\n",
            *lpThreadId,
            GroupAffinity.Group,
            (ULONGLONG)GroupAffinity.Mask,
            ProcNumber.Number,
            NodeNumber
            );
    }
#endif

cleanup:
    
    if( pAttributeList )
    {
        tMPI_Free( pAttributeList );
    }

    return hThread;
}