Example #1
    WaitForSingleObject( shptr->h, INFINITE );
}

void WINAPI release_binsem_using_event( SRWLOCK* h_ )
{
    srwl_or_handle* shptr = (srwl_or_handle*) h_;
    SetEvent( shptr->h );
}

static void (WINAPI *__TBB_init_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&init_binsem_using_event;
static void (WINAPI *__TBB_acquire_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&acquire_binsem_using_event;
static void (WINAPI *__TBB_release_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&release_binsem_using_event;

//! Table describing how to link the handlers.
static const dynamic_link_descriptor SRWLLinkTable[] = {
    DLD(InitializeSRWLock,       __TBB_init_binsem),
    DLD(AcquireSRWLockExclusive, __TBB_acquire_binsem),
    DLD(ReleaseSRWLockExclusive, __TBB_release_binsem)
};

inline void init_concmon_module()
{
    __TBB_ASSERT( (uintptr_t)__TBB_init_binsem==(uintptr_t)&init_binsem_using_event, NULL );
    if( dynamic_link( "Kernel32.dll", SRWLLinkTable, sizeof(SRWLLinkTable)/sizeof(dynamic_link_descriptor) ) ) {
        __TBB_ASSERT( (uintptr_t)__TBB_init_binsem!=(uintptr_t)&init_binsem_using_event, NULL );
        __TBB_ASSERT( (uintptr_t)__TBB_acquire_binsem!=(uintptr_t)&acquire_binsem_using_event, NULL );
        __TBB_ASSERT( (uintptr_t)__TBB_release_binsem!=(uintptr_t)&release_binsem_using_event, NULL );
    }
}
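
// A minimal usage sketch (an illustration, not part of the original source): once
// init_concmon_module() has run, callers go through the three function pointers and
// get either the native SRWLock API (when Kernel32 provides it) or the event-based
// fallback defined above. The helper name below is hypothetical.
inline void example_binsem_roundtrip( SRWLOCK& rep ) {
    __TBB_init_binsem( &rep );     // InitializeSRWLock, or init_binsem_using_event
    __TBB_acquire_binsem( &rep );  // AcquireSRWLockExclusive, or a wait on the event
    __TBB_release_binsem( &rep );  // ReleaseSRWLockExclusive, or SetEvent
}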

binary_semaphore::binary_semaphore() {
Example #2
namespace internal {

#if __linux__ || __FreeBSD_version >= 701000

static void set_affinity_mask( size_t maskSize, const basic_mask_t* threadMask ) {
#if __linux__
    if( sched_setaffinity( 0, maskSize, threadMask ) )
#else /* FreeBSD */
    if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#endif
        runtime_warning( "setaffinity syscall failed" );
}

static void get_affinity_mask( size_t maskSize, basic_mask_t* threadMask ) {
#if __linux__
    if( sched_getaffinity( 0, maskSize, threadMask ) )
#else /* FreeBSD */
    if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#endif
        runtime_warning( "getaffinity syscall failed" );
}

static basic_mask_t* process_mask;
static int num_masks;
struct process_mask_cleanup_helper {
    ~process_mask_cleanup_helper() {
        if( process_mask ) {
            delete [] process_mask;
        }
     }
};
static process_mask_cleanup_helper process_mask_cleanup;

#define curMaskSize (sizeof(basic_mask_t) * num_masks)
affinity_helper::~affinity_helper() {
    if( threadMask ) {
        if( is_changed ) {
            set_affinity_mask( curMaskSize, threadMask );
        }
        delete [] threadMask;
    }
}
void affinity_helper::protect_affinity_mask() {
    if( threadMask == NULL && num_masks && process_mask ) {
        threadMask = new basic_mask_t [num_masks];
        memset( threadMask, 0, curMaskSize );
        get_affinity_mask( curMaskSize, threadMask );
        is_changed = memcmp( process_mask, threadMask, curMaskSize );
        if( is_changed ) {
            set_affinity_mask( curMaskSize, process_mask );
        }
    }
}
#undef curMaskSize
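
// A hypothetical usage sketch (not from the original source; the function name is
// illustrative): a caller that needs to run under the process-wide affinity mask for
// a while can rely on the RAII behavior above: the destructor restores the captured
// thread mask if protect_affinity_mask() had to change it.
static void example_run_with_process_mask() {
    affinity_helper guard;
    guard.protect_affinity_mask();
    // ... work that assumes the full process affinity mask ...
}   // leaving this scope restores the original thread mask if it was changed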

static atomic<do_once_state> hardware_concurrency_info;

static int theNumProcs;

static void initialize_hardware_concurrency_info () {
    int err;
    int availableProcs = 0;
    int numMasks = 1;
#if __linux__
#if __TBB_MAIN_THREAD_AFFINITY_BROKEN
    int maxProcs = INT_MAX; // To check the entire mask.
    int pid = 0; // Get the mask of the calling thread.
#else
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    int pid = getpid();
#endif
    cpu_set_t *processMask;
    const size_t BasicMaskSize =  sizeof(cpu_set_t);
    for (;;) {
        int curMaskSize = BasicMaskSize * numMasks;
        processMask = new cpu_set_t[numMasks];
        memset( processMask, 0, curMaskSize );
        err = sched_getaffinity( pid, curMaskSize, processMask );
        if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
            break;
        delete[] processMask;
        numMasks <<= 1;
    }
#else /* FreeBSD >= 7.1 */
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    cpuset_t *processMask;
    const size_t BasicMaskSize = sizeof(cpuset_t);
    for (;;) {
        int curMaskSize = BasicMaskSize * numMasks;
        processMask = new cpuset_t[numMasks];
        memset( processMask, 0, curMaskSize );
        // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
#if __TBB_MAIN_THREAD_AFFINITY_BROKEN
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, curMaskSize, processMask );
#else
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
#endif
        if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
            break;
        delete[] processMask;
        numMasks <<= 1;
    }
#endif /* FreeBSD >= 7.1 */
    if ( !err ) {
        for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
            for ( size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
                if ( CPU_ISSET( i, processMask + m ) )
                    ++availableProcs;
            }
        }
        num_masks = numMasks;
        process_mask = processMask;
    }
    else {
        availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
        delete[] processMask;
    }
    theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail safety strap
    __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), NULL );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theNumProcs;
}

#elif defined(_SC_NPROCESSORS_ONLN)

int AvailableHwConcurrency() {
    int n = sysconf(_SC_NPROCESSORS_ONLN);
    return (n > 0) ? n : 1;
}

#elif _WIN32||_WIN64

static atomic<do_once_state> hardware_concurrency_info;

static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff;

// Statically allocate an array for processor group information.
// Windows 7 supports a maximum of 4 groups, but let's look ahead a little.
static const WORD MaxProcessorGroups = 64;

struct ProcessorGroupInfo {
    DWORD_PTR   mask;                   ///< Affinity mask covering the whole group
    int         numProcs;               ///< Number of processors in the group
    int         numProcsRunningTotal;   ///< Subtotal of processors in this and preceding groups

    //! Total number of processor groups in the system
    static int NumGroups; 

    //! Index of the group with a slot reserved for the first master thread
    /** In the context of multiple processor group support, the current implementation
        defines "the first master thread" as the first thread to invoke
        AvailableHwConcurrency().

        TODO:   Implement a dynamic scheme that remaps workers depending on the pending
                master threads' affinity. **/
    static int HoleIndex;
};

int ProcessorGroupInfo::NumGroups = 1;
int ProcessorGroupInfo::HoleIndex = 0;


ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups];

struct TBB_GROUP_AFFINITY {
    DWORD_PTR Mask;
    WORD   Group;
    WORD   Reserved[3];
};

static DWORD (WINAPI *TBB_GetMaximumProcessorCount)( WORD groupIndex ) = NULL;
static WORD (WINAPI *TBB_GetMaximumProcessorGroupCount)() = NULL;
static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread, 
                        const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff );
static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* );

static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = {
      DLD(GetMaximumProcessorCount, TBB_GetMaximumProcessorCount)
    , DLD(GetMaximumProcessorGroupCount, TBB_GetMaximumProcessorGroupCount)
    , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity)
    , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity)
};

static void initialize_hardware_concurrency_info () {
    dynamic_link( GetModuleHandleA( "Kernel32.dll" ), ProcessorGroupsApiLinkTable,
                  sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) );
    SYSTEM_INFO si;
    GetSystemInfo(&si);
    DWORD_PTR pam, sam, m = 1;
    GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam );
    int nproc = 0;
    for ( size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) {
        if ( pam & m )
            ++nproc;
    }
    __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, NULL );
    if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetMaximumProcessorCount ) {
        // The process does not have restricting affinity mask and multiple processor groups are possible
        ProcessorGroupInfo::NumGroups = (int)TBB_GetMaximumProcessorGroupCount();
        __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, NULL );
        // Fail safety bootstrap. Release versions will limit available concurrency
        // level, while debug ones would assert.
        if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups )
            ProcessorGroupInfo::NumGroups = MaxProcessorGroups;
        if ( ProcessorGroupInfo::NumGroups > 1 ) {
            TBB_GROUP_AFFINITY ga;
            if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) )
                ProcessorGroupInfo::HoleIndex = ga.Group;
            int nprocs = 0;
            for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) {
                ProcessorGroupInfo  &pgi = theProcessorGroups[i];
                pgi.numProcs = (int)TBB_GetMaximumProcessorCount(i);
                __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, NULL );
                pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1;
                pgi.numProcsRunningTotal = nprocs += pgi.numProcs;
            }
            __TBB_ASSERT( nprocs == (int)TBB_GetMaximumProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), NULL );
            return;
        }
    }
    // Either the process has a restricting affinity mask or only a single processor group is present
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc;

    PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups);
    if (ProcessorGroupInfo::NumGroups>1)
        for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i)
            PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs);
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
}

int NumberOfProcessorGroups() {
    __TBB_ASSERT( hardware_concurrency_info == initialization_complete, "NumberOfProcessorGroups is used before AvailableHwConcurrency" );
    return ProcessorGroupInfo::NumGroups;
}

// Offset for the slot reserved for the first master thread
#define HoleAdjusted(procIdx, grpIdx) (procIdx + (holeIdx <= grpIdx))

int FindProcessorGroupIndex ( int procIdx ) {
    // In case of oversubscription spread extra workers in a round robin manner
    int holeIdx;
    const int numProcs = theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
    if ( procIdx >= numProcs - 1 ) {
        holeIdx = INT_MAX;
        procIdx = (procIdx - numProcs + 1) % numProcs;
    }
    else
        holeIdx = ProcessorGroupInfo::HoleIndex;
    __TBB_ASSERT( hardware_concurrency_info == initialization_complete, "FindProcessorGroupIndex is used before AvailableHwConcurrency" );
    // Approximate the likely group index assuming all groups are of the same size
    int i = procIdx / theProcessorGroups[0].numProcs;
    // Make sure the approximation is a valid group index
    if (i >= ProcessorGroupInfo::NumGroups) i = ProcessorGroupInfo::NumGroups-1;
    // Now adjust the approximation up or down
    if ( theProcessorGroups[i].numProcsRunningTotal > HoleAdjusted(procIdx, i) ) {
        while ( theProcessorGroups[i].numProcsRunningTotal - theProcessorGroups[i].numProcs > HoleAdjusted(procIdx, i) ) {
            __TBB_ASSERT( i > 0, NULL );
            --i;
        }
    }
    else {
        do {
            ++i;
        } while ( theProcessorGroups[i].numProcsRunningTotal <= HoleAdjusted(procIdx, i) );
    }
    __TBB_ASSERT( i < ProcessorGroupInfo::NumGroups, NULL );
    return i;
}
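
// Worked example (an illustration added for clarity, not part of the original source):
// with two processor groups of 4 logical processors each and HoleIndex == 0, worker
// indices 0..2 map to group 0 (one slot stays reserved for the first master thread),
// indices 3..6 map to group 1, and indices >= 7 count as oversubscription and are
// spread round-robin starting again from group 0.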

void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) {
    __TBB_ASSERT( hardware_concurrency_info == initialization_complete, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" );
    if ( !TBB_SetThreadGroupAffinity )
        return;
    TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} };
    TBB_SetThreadGroupAffinity( hThread, &ga, NULL );
}

#else
    #error AvailableHwConcurrency is not implemented for this OS
#endif /* OS */

} // namespace internal
Example #3
    }
    LeaveCriticalSection( &cv_event->mutex );
    CloseHandle( my_event );
}

void WINAPI destroy_condvar_noop( CONDITION_VARIABLE* /*cv*/ ) { /*no op*/ }

static void (WINAPI *__TBB_init_condvar)( PCONDITION_VARIABLE ) = (void (WINAPI *)(PCONDITION_VARIABLE))&init_condvar_using_event;
static BOOL (WINAPI *__TBB_condvar_wait)( PCONDITION_VARIABLE, LPCRITICAL_SECTION, DWORD ) = (BOOL (WINAPI *)(PCONDITION_VARIABLE,LPCRITICAL_SECTION, DWORD))&sleep_condition_variable_cs_using_event;
static void (WINAPI *__TBB_condvar_notify_one)( PCONDITION_VARIABLE ) = (void (WINAPI *)(PCONDITION_VARIABLE))&wake_condition_variable_using_event;
static void (WINAPI *__TBB_condvar_notify_all)( PCONDITION_VARIABLE ) = (void (WINAPI *)(PCONDITION_VARIABLE))&wake_all_condition_variable_using_event;
static void (WINAPI *__TBB_destroy_condvar)( PCONDITION_VARIABLE ) = (void (WINAPI *)(PCONDITION_VARIABLE))&destroy_condvar_using_event;

//! Table describing how to link the handlers.
static const dynamic_link_descriptor CondVarLinkTable[] = {
    DLD(InitializeConditionVariable, __TBB_init_condvar),
    DLD(SleepConditionVariableCS,    __TBB_condvar_wait),
    DLD(WakeConditionVariable,       __TBB_condvar_notify_one),
    DLD(WakeAllConditionVariable,    __TBB_condvar_notify_all)
};

void init_condvar_module()
{
    __TBB_ASSERT( (uintptr_t)__TBB_init_condvar==(uintptr_t)&init_condvar_using_event, NULL );
    if( dynamic_link( "Kernel32.dll", CondVarLinkTable, 4 ) )
        __TBB_destroy_condvar = (void (WINAPI *)(PCONDITION_VARIABLE))&destroy_condvar_noop;
}
#endif /* _WIN32||_WIN64 */

} // namespace internal
namespace internal {

//------------------------------------------------------------------------
// governor
//------------------------------------------------------------------------

#if __TBB_SURVIVE_THREAD_SWITCH

#if _WIN32
#define CILKLIB_NAME "cilkrts20.dll"
#else
#define CILKLIB_NAME "libcilkrts.so"
#endif

//! Handler for interoperation with the cilkrts library.
static __cilk_tbb_retcode (*watch_stack_handler)(struct __cilk_tbb_unwatch_thunk* u,
                                                 struct __cilk_tbb_stack_op_thunk o);

#if __TBB_WEAK_SYMBOLS
    #pragma weak __cilkrts_watch_stack
#endif

//! Table describing how to link the handlers.
static const dynamic_link_descriptor CilkLinkTable[] = {
    DLD(__cilkrts_watch_stack, watch_stack_handler)
};

static atomic<do_once_state> cilkrts_load_state;

bool initialize_cilk_interop() {
    // Pinning can fail. This is a normal situation, and means that the current
    // thread does not use Cilk and consequently does not need interop.
    return dynamic_link( CILKLIB_NAME, CilkLinkTable, 1 );
}
#endif /* __TBB_SURVIVE_THREAD_SWITCH */

namespace rml {
    tbb_server* make_private_server( tbb_client& client );
}

void governor::acquire_resources () {
#if USE_PTHREAD
    int status = theTLS.create(auto_terminate);
#else
    int status = theTLS.create();
#endif
    if( status )
        handle_perror(status, "TBB failed to initialize TLS storage\n");

    ::rml::factory::status_type res = theRMLServerFactory.open(); 
    UsePrivateRML = res != ::rml::factory::st_success;
}

void governor::release_resources () {
    theRMLServerFactory.close();
#if TBB_USE_ASSERT
    if( __TBB_InitOnce::initialization_done() && theTLS.get() ) 
        runtime_warning( "TBB is unloaded while tbb::task_scheduler_init object is alive?" );
#endif
    int status = theTLS.destroy();
    if( status )
        handle_perror(status, "TBB failed to destroy TLS storage");
    dynamic_unlink_all();
}

rml::tbb_server* governor::create_rml_server ( rml::tbb_client& client ) {
    rml::tbb_server* server = NULL;
    if( !UsePrivateRML ) {
        ::rml::factory::status_type status = theRMLServerFactory.make_server( server, client );
        if( status != ::rml::factory::st_success ) {
            UsePrivateRML = true;
            runtime_warning( "rml::tbb_factorymake_server failed with status %x, falling back on private rml", status );
        }
    }
    if ( !server ) {
        __TBB_ASSERT( UsePrivateRML, NULL );
        server = rml::make_private_server( client );
    }
    __TBB_ASSERT( server, "Failed to create RML server" );
    return server;
}

void governor::sign_on(generic_scheduler* s) {
    __TBB_ASSERT( !s->my_registered, NULL );  
    s->my_registered = true;
    theTLS.set(s);
#if __TBB_SURVIVE_THREAD_SWITCH
    __cilk_tbb_stack_op_thunk o;
    o.routine = &stack_op_handler;
    o.data = s;
    if( watch_stack_handler ) {
        if( (*watch_stack_handler)(&s->my_cilk_unwatch_thunk, o) ) {
            // Failed to register with Cilk, make sure we are clean
            s->my_cilk_unwatch_thunk.routine = NULL;
        }
#if TBB_USE_ASSERT
        else
            s->my_cilk_state = generic_scheduler::cs_running;
#endif /* TBB_USE_ASSERT */
    }
#endif /* __TBB_SURVIVE_THREAD_SWITCH */
}

void governor::sign_off(generic_scheduler* s) {
    if( s->my_registered ) {
        __TBB_ASSERT( theTLS.get()==s || (!s->is_worker() && !theTLS.get()), "attempt to unregister a wrong scheduler instance" );
        theTLS.set(NULL);
        s->my_registered = false;
#if __TBB_SURVIVE_THREAD_SWITCH
        __cilk_tbb_unwatch_thunk &ut = s->my_cilk_unwatch_thunk;
        if ( ut.routine )
           (*ut.routine)(ut.data);
#endif /* __TBB_SURVIVE_THREAD_SWITCH */
    }
}

generic_scheduler* governor::init_scheduler( unsigned num_threads, stack_size_type stack_size, bool auto_init ) {
    if( !__TBB_InitOnce::initialization_done() )
        DoOneTimeInitializations();
    generic_scheduler* s = theTLS.get();
    if( s ) {
        s->my_ref_count += 1;
        return s;
    }
#if __TBB_SURVIVE_THREAD_SWITCH
    atomic_do_once( &initialize_cilk_interop, cilkrts_load_state );
#endif /* __TBB_SURVIVE_THREAD_SWITCH */
    if( (int)num_threads == task_scheduler_init::automatic )
        num_threads = default_num_threads();
    s = generic_scheduler::create_master( 
            market::create_arena( num_threads - 1, stack_size ? stack_size : ThreadStackSize ) );
    __TBB_ASSERT(s, "Somehow a local scheduler creation for a master thread failed");
    s->my_auto_initialized = auto_init;
    return s;
}

void governor::terminate_scheduler( generic_scheduler* s ) {
    __TBB_ASSERT( s == theTLS.get(), "Attempt to terminate non-local scheduler instance" );
    if( !--(s->my_ref_count) )
        s->cleanup_master();
}

void governor::auto_terminate(void* arg){
    generic_scheduler* s = static_cast<generic_scheduler*>(arg);
    if( s && s->my_auto_initialized ) {
        if( !--(s->my_ref_count) ) {
            if ( !theTLS.get() && !s->local_task_pool_empty() ) {
                // This thread's TLS slot is already cleared. But in order to execute
                // remaining tasks cleanup_master() will need TLS correctly set.
                // So we temporarily restore its value.
                theTLS.set(s);
                s->cleanup_master();
                theTLS.set(NULL);
            }
            else
                s->cleanup_master();
        }
    }
}

void governor::print_version_info () {
    if ( UsePrivateRML )
        PrintExtraVersionInfo( "RML", "private" );
    else {
        PrintExtraVersionInfo( "RML", "shared" );
        theRMLServerFactory.call_with_server_info( PrintRMLVersionInfo, (void*)"" );
    }
#if __TBB_SURVIVE_THREAD_SWITCH
    if( watch_stack_handler )
        PrintExtraVersionInfo( "CILK", CILKLIB_NAME );
#endif /* __TBB_SURVIVE_THREAD_SWITCH */
}

#if __TBB_SURVIVE_THREAD_SWITCH
__cilk_tbb_retcode governor::stack_op_handler( __cilk_tbb_stack_op op, void* data ) {
    __TBB_ASSERT(data,NULL);
    generic_scheduler* s = static_cast<generic_scheduler*>(data);
#if TBB_USE_ASSERT
    void* current = theTLS.get();
#if _WIN32||_WIN64
    unsigned thread_id = GetCurrentThreadId();
#else
    unsigned thread_id = unsigned(pthread_self());
#endif

#endif /* TBB_USE_ASSERT */
    switch( op ) {
        default:
            __TBB_ASSERT( 0, "invalid op" );
        case CILK_TBB_STACK_ADOPT: {
            __TBB_ASSERT( !current && s->my_cilk_state==generic_scheduler::cs_limbo || 
                          current==s && s->my_cilk_state==generic_scheduler::cs_running, "invalid adoption" );
#if TBB_USE_ASSERT
            if( current==s ) 
                runtime_warning( "redundant adoption of %p by thread %x\n", s, thread_id );
            s->my_cilk_state = generic_scheduler::cs_running;
#endif /* TBB_USE_ASSERT */
            theTLS.set(s);
            break;
        }
        case CILK_TBB_STACK_ORPHAN: {
            __TBB_ASSERT( current==s && s->my_cilk_state==generic_scheduler::cs_running, "invalid orphaning" ); 
#if TBB_USE_ASSERT
            s->my_cilk_state = generic_scheduler::cs_limbo;
#endif /* TBB_USE_ASSERT */
            theTLS.set(NULL);
            break;
        }
        case CILK_TBB_STACK_RELEASE: {
            __TBB_ASSERT( !current && s->my_cilk_state==generic_scheduler::cs_limbo || 
                          current==s && s->my_cilk_state==generic_scheduler::cs_running, "invalid release" );
#if TBB_USE_ASSERT
            s->my_cilk_state = generic_scheduler::cs_freed;
#endif /* TBB_USE_ASSERT */
            s->my_cilk_unwatch_thunk.routine = NULL;
            auto_terminate( s );
        } 
    }
    return 0;
}
#endif /* __TBB_SURVIVE_THREAD_SWITCH */

} // namespace internal
Example #5
namespace internal {

#if __TBB_USE_OS_AFFINITY_SYSCALL

#if __linux__
// Handlers for interoperation with libiomp
static int (*libiomp_try_restoring_original_mask)();
// Table for mapping to libiomp entry points
static const dynamic_link_descriptor iompLinkTable[] = {
    { "kmp_set_thread_affinity_mask_initial", (pointer_to_handler*)(void*)(&libiomp_try_restoring_original_mask) }
};
#endif

static void set_thread_affinity_mask( size_t maskSize, const basic_mask_t* threadMask ) {
#if __linux__
    if( sched_setaffinity( 0, maskSize, threadMask ) )
#else /* FreeBSD */
    if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#endif
        runtime_warning( "setaffinity syscall failed" );
}

static void get_thread_affinity_mask( size_t maskSize, basic_mask_t* threadMask ) {
#if __linux__
    if( sched_getaffinity( 0, maskSize, threadMask ) )
#else /* FreeBSD */
    if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#endif
        runtime_warning( "getaffinity syscall failed" );
}

static basic_mask_t* process_mask;
static int num_masks;

void destroy_process_mask() {
    if( process_mask ) {
        delete [] process_mask;
    }
}

#define curMaskSize (sizeof(basic_mask_t) * num_masks)
affinity_helper::~affinity_helper() {
    if( threadMask ) {
        if( is_changed ) {
            set_thread_affinity_mask( curMaskSize, threadMask );
        }
        delete [] threadMask;
    }
}
void affinity_helper::protect_affinity_mask( bool restore_process_mask ) {
    if( threadMask == NULL && num_masks ) { // TODO: assert num_masks validity?
        threadMask = new basic_mask_t [num_masks];
        memset( threadMask, 0, curMaskSize );
        get_thread_affinity_mask( curMaskSize, threadMask );
        if( restore_process_mask ) {
            __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" );
            is_changed = memcmp( process_mask, threadMask, curMaskSize );
            if( is_changed )
                set_thread_affinity_mask( curMaskSize, process_mask );
        } else {
            // Assume that the mask will be changed by the caller.
            is_changed = 1;
        }
    }
}
void affinity_helper::dismiss() {
    if( threadMask ) {
        delete [] threadMask;
        threadMask = NULL;
    }
    is_changed = 0;
}
#undef curMaskSize

static atomic<do_once_state> hardware_concurrency_info;

static int theNumProcs;

static void initialize_hardware_concurrency_info () {
    int err;
    int availableProcs = 0;
    int numMasks = 1;
#if __linux__
#if __TBB_MAIN_THREAD_AFFINITY_BROKEN
    int maxProcs = INT_MAX; // To check the entire mask.
    int pid = 0; // Get the mask of the calling thread.
#else
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    int pid = getpid();
#endif
#else /* FreeBSD >= 7.1 */
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
#endif
    basic_mask_t* processMask;
    const size_t BasicMaskSize =  sizeof(basic_mask_t);
    for (;;) {
        const int curMaskSize = BasicMaskSize * numMasks;
        processMask = new basic_mask_t[numMasks];
        memset( processMask, 0, curMaskSize );
#if __linux__
        err = sched_getaffinity( pid, curMaskSize, processMask );
        if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
            break;
#else /* FreeBSD >= 7.1 */
        // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
#if __TBB_MAIN_THREAD_AFFINITY_BROKEN
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, curMaskSize, processMask );
#else
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
#endif
        if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
            break;
#endif /* FreeBSD >= 7.1 */
        delete[] processMask;
        numMasks <<= 1;
    }
    if ( !err ) {
        // We have found the mask size and captured the process affinity mask into processMask.
        num_masks = numMasks; // do here because it's needed for affinity_helper to work
#if __linux__
        // For better coexistence with libiomp, which might have changed the mask already,
        // check for its presence and ask it to restore the mask.
        dynamic_link_handle libhandle;
        if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) {
            // We have found the symbol provided by libiomp5 for restoring original thread affinity.
            affinity_helper affhelp;
            affhelp.protect_affinity_mask( /*restore_process_mask=*/false );
            if ( libiomp_try_restoring_original_mask()==0 ) {
                // Now we have the right mask to capture, restored by libiomp.
                const int curMaskSize = BasicMaskSize * numMasks;
                memset( processMask, 0, curMaskSize );
                get_thread_affinity_mask( curMaskSize, processMask );
            } else
                affhelp.dismiss();  // thread mask has not changed
            dynamic_unlink( libhandle );
            // Destructor of affinity_helper restores the thread mask (unless dismissed).
        }
#endif
        for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
            for ( size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
                if ( CPU_ISSET( i, processMask + m ) )
                    ++availableProcs;
            }
        }
        process_mask = processMask;
    }
    else {
        // Failed to get the process affinity mask; assume the whole machine can be used.
        availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
        delete[] processMask;
    }
    theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail safety strap
    __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), NULL );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theNumProcs;
}

/* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */
#elif __ANDROID__

// Work-around for Android that reads the correct number of available CPUs since system calls are unreliable.
// Format of "present" file is: ([<int>-<int>|<int>],)+
int AvailableHwConcurrency() {
    FILE *fp = fopen("/sys/devices/system/cpu/present", "r");
    if (fp == NULL) return 1;
    int num_args, lower, upper, num_cpus=0;
    while ((num_args = fscanf(fp, "%d-%d", &lower, &upper)) != EOF) {
        switch(num_args) {
            case 2: num_cpus += upper - lower + 1; break;
            case 1: num_cpus += 1; break;
        }
        fscanf(fp, ",");
    }
    fclose(fp);
    return (num_cpus > 0) ? num_cpus : 1;
}
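
// Worked example of the "present" format above (an illustration, not from the original
// source): the contents "0-3,6" parse as the range 0-3 (4 CPUs) plus the single CPU 6,
// so AvailableHwConcurrency() returns 5.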

#elif defined(_SC_NPROCESSORS_ONLN)

int AvailableHwConcurrency() {
    int n = sysconf(_SC_NPROCESSORS_ONLN);
    return (n > 0) ? n : 1;
}

#elif _WIN32||_WIN64

static atomic<do_once_state> hardware_concurrency_info;

static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff;

// Statically allocate an array for processor group information.
// Windows 7 supports a maximum of 4 groups, but let's look ahead a little.
static const WORD MaxProcessorGroups = 64;

struct ProcessorGroupInfo {
    DWORD_PTR   mask;                   ///< Affinity mask covering the whole group
    int         numProcs;               ///< Number of processors in the group
    int         numProcsRunningTotal;   ///< Subtotal of processors in this and preceding groups

    //! Total number of processor groups in the system
    static int NumGroups;

    //! Index of the group with a slot reserved for the first master thread
    /** In the context of multiple processor group support, the current implementation
        defines "the first master thread" as the first thread to invoke
        AvailableHwConcurrency().

        TODO:   Implement a dynamic scheme that remaps workers depending on the pending
                master threads' affinity. **/
    static int HoleIndex;
};

int ProcessorGroupInfo::NumGroups = 1;
int ProcessorGroupInfo::HoleIndex = 0;

ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups];

struct TBB_GROUP_AFFINITY {
    DWORD_PTR Mask;
    WORD   Group;
    WORD   Reserved[3];
};

static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = NULL;
static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = NULL;
static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread,
                        const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff );
static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* );

static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = {
      DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount)
    , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount)
    , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity)
    , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity)
};

static void initialize_hardware_concurrency_info () {
#if __TBB_WIN8UI_SUPPORT
    // Processor group information is unavailable for these applications,
    // so set up the processor count for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency();
#else /* __TBB_WIN8UI_SUPPORT */
    dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable,
                  sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) );
    SYSTEM_INFO si;
    GetNativeSystemInfo(&si);
    DWORD_PTR pam, sam, m = 1;
    GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam );
    int nproc = 0;
    for ( size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) {
        if ( pam & m )
            ++nproc;
    }
    __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, NULL );
    // By default, set up the processor count for a single processor group
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc;
    // Set up processor groups if the process affinity mask is not restricted and more than one processor group is present
    if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) {
        // The process does not have restricting affinity mask and multiple processor groups are possible
        ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount();
        __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, NULL );
        // Fail safety bootstrap. Release versions will limit available concurrency
        // level, while debug ones would assert.
        if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups )
            ProcessorGroupInfo::NumGroups = MaxProcessorGroups;
        if ( ProcessorGroupInfo::NumGroups > 1 ) {
            TBB_GROUP_AFFINITY ga;
            if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) )
                ProcessorGroupInfo::HoleIndex = ga.Group;
            int nprocs = 0;
            for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) {
                ProcessorGroupInfo  &pgi = theProcessorGroups[i];
                pgi.numProcs = (int)TBB_GetActiveProcessorCount(i);
                __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, NULL );
                pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1;
                pgi.numProcsRunningTotal = nprocs += pgi.numProcs;
            }
            __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), NULL );
        }
    }
#endif /* __TBB_WIN8UI_SUPPORT */

    PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups);
    if (ProcessorGroupInfo::NumGroups>1)
        for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i)
            PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs);
}

int NumberOfProcessorGroups() {
    __TBB_ASSERT( hardware_concurrency_info == initialization_complete, "NumberOfProcessorGroups is used before AvailableHwConcurrency" );
    return ProcessorGroupInfo::NumGroups;
}

// Offset for the slot reserved for the first master thread
#define HoleAdjusted(procIdx, grpIdx) (procIdx + (holeIdx <= grpIdx))

int FindProcessorGroupIndex ( int procIdx ) {
    // In case of oversubscription spread extra workers in a round robin manner
    int holeIdx;
    const int numProcs = theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
    if ( procIdx >= numProcs - 1 ) {
        holeIdx = INT_MAX;
        procIdx = (procIdx - numProcs + 1) % numProcs;
    }
    else
        holeIdx = ProcessorGroupInfo::HoleIndex;
    __TBB_ASSERT( hardware_concurrency_info == initialization_complete, "FindProcessorGroupIndex is used before AvailableHwConcurrency" );
    // Approximate the likely group index assuming all groups are of the same size
    int i = procIdx / theProcessorGroups[0].numProcs;
    // Make sure the approximation is a valid group index
    if (i >= ProcessorGroupInfo::NumGroups) i = ProcessorGroupInfo::NumGroups-1;
    // Now adjust the approximation up or down
    if ( theProcessorGroups[i].numProcsRunningTotal > HoleAdjusted(procIdx, i) ) {
        while ( theProcessorGroups[i].numProcsRunningTotal - theProcessorGroups[i].numProcs > HoleAdjusted(procIdx, i) ) {
            __TBB_ASSERT( i > 0, NULL );
            --i;
        }
    }
    else {
        do {
            ++i;
        } while ( theProcessorGroups[i].numProcsRunningTotal <= HoleAdjusted(procIdx, i) );
    }
    __TBB_ASSERT( i < ProcessorGroupInfo::NumGroups, NULL );
    return i;
}

void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) {
    __TBB_ASSERT( hardware_concurrency_info == initialization_complete, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" );
    if ( !TBB_SetThreadGroupAffinity )
        return;
    TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} };
    TBB_SetThreadGroupAffinity( hThread, &ga, NULL );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
}

/* End of _WIN32||_WIN64 implementation */
#else
    #error AvailableHwConcurrency is not implemented for this OS
#endif

} // namespace internal
Example #6
namespace internal {

//------------------------------------------------------------------------
// governor
//------------------------------------------------------------------------

#if __TBB_SURVIVE_THREAD_SWITCH
// Support for interoperability with Intel(R) Cilk(tm) Plus.

#if _WIN32
#define CILKLIB_NAME "cilkrts20.dll"
#else
#define CILKLIB_NAME "libcilkrts.so"
#endif

//! Handler for interoperation with cilkrts library.
static __cilk_tbb_retcode (*watch_stack_handler)(struct __cilk_tbb_unwatch_thunk* u,
                                                 struct __cilk_tbb_stack_op_thunk o);

#if __TBB_WEAK_SYMBOLS
    #pragma weak __cilkrts_watch_stack
#endif

//! Table describing how to link the handlers.
static const dynamic_link_descriptor CilkLinkTable[] = {
    DLD(__cilkrts_watch_stack, watch_stack_handler)
};

static atomic<do_once_state> cilkrts_load_state;

bool initialize_cilk_interop() {
    // Pinning can fail. This is a normal situation, and means that the current
    // thread does not use cilkrts and consequently does not need interop.
    return dynamic_link( CILKLIB_NAME, CilkLinkTable, 1 );
}
#endif /* __TBB_SURVIVE_THREAD_SWITCH */

namespace rml {
    tbb_server* make_private_server( tbb_client& client );
}

void governor::acquire_resources () {
#if USE_PTHREAD
    int status = theTLS.create(auto_terminate);
#else
    int status = theTLS.create();
#endif
    if( status )
        handle_perror(status, "TBB failed to initialize task scheduler TLS\n");
}

void governor::release_resources () {
    theRMLServerFactory.close();
#if TBB_USE_ASSERT
    if( __TBB_InitOnce::initialization_done() && theTLS.get() ) 
        runtime_warning( "TBB is unloaded while tbb::task_scheduler_init object is alive?" );
#endif
    int status = theTLS.destroy();
    if( status )
        handle_perror(status, "TBB failed to destroy task scheduler TLS");
    dynamic_unlink_all();
}

rml::tbb_server* governor::create_rml_server ( rml::tbb_client& client ) {
    rml::tbb_server* server = NULL;
    if( !UsePrivateRML ) {
        ::rml::factory::status_type status = theRMLServerFactory.make_server( server, client );
        if( status != ::rml::factory::st_success ) {
            UsePrivateRML = true;
            runtime_warning( "rml::tbb_factory::make_server failed with status %x, falling back on private rml", status );
        }
    }
    if ( !server ) {
        __TBB_ASSERT( UsePrivateRML, NULL );
        server = rml::make_private_server( client );
    }
    __TBB_ASSERT( server, "Failed to create RML server" );
    return server;
}

void governor::sign_on(generic_scheduler* s) {
    __TBB_ASSERT( !theTLS.get(), NULL );
    theTLS.set(s);
#if __TBB_SURVIVE_THREAD_SWITCH
    if( watch_stack_handler ) {
        __cilk_tbb_stack_op_thunk o;
        o.routine = &stack_op_handler;
        o.data = s;
        if( (*watch_stack_handler)(&s->my_cilk_unwatch_thunk, o) ) {
            // Failed to register with cilkrts, make sure we are clean
            s->my_cilk_unwatch_thunk.routine = NULL;
        }
#if TBB_USE_ASSERT
        else
            s->my_cilk_state = generic_scheduler::cs_running;
#endif /* TBB_USE_ASSERT */
    }
#endif /* __TBB_SURVIVE_THREAD_SWITCH */
}

void governor::sign_off(generic_scheduler* s) {
    suppress_unused_warning(s);
    __TBB_ASSERT( theTLS.get()==s, "attempt to unregister a wrong scheduler instance" );
    theTLS.set(NULL);
#if __TBB_SURVIVE_THREAD_SWITCH
    __cilk_tbb_unwatch_thunk &ut = s->my_cilk_unwatch_thunk;
    if ( ut.routine )
       (*ut.routine)(ut.data);
#endif /* __TBB_SURVIVE_THREAD_SWITCH */
}

void governor::setBlockingTerminate(const task_scheduler_init *tsi) {
    __TBB_ASSERT(!IsBlockingTermiantionInProgress, "It's impossible to create task_scheduler_init while blocking termination is in progress.");
    if (BlockingTSI)
        throw_exception(eid_blocking_sch_init);
    BlockingTSI = tsi;
}

generic_scheduler* governor::init_scheduler( unsigned num_threads, stack_size_type stack_size, bool auto_init ) {
    if( !__TBB_InitOnce::initialization_done() )
        DoOneTimeInitializations();
    generic_scheduler* s = theTLS.get();
    if( s ) {
        s->my_ref_count += 1;
        return s;
    }
#if __TBB_SURVIVE_THREAD_SWITCH
    atomic_do_once( &initialize_cilk_interop, cilkrts_load_state );
#endif /* __TBB_SURVIVE_THREAD_SWITCH */
    if( (int)num_threads == task_scheduler_init::automatic )
        num_threads = default_num_threads();
    s = generic_scheduler::create_master( 
            market::create_arena( num_threads - 1, stack_size ? stack_size : ThreadStackSize ) );
    __TBB_ASSERT(s, "Somehow a local scheduler creation for a master thread failed");
    s->my_auto_initialized = auto_init;
    return s;
}

void governor::terminate_scheduler( generic_scheduler* s, const task_scheduler_init* tsi_ptr ) {
    __TBB_ASSERT( s == theTLS.get(), "Attempt to terminate non-local scheduler instance" );
    if (--(s->my_ref_count)) {
        if (BlockingTSI && BlockingTSI==tsi_ptr) {
            // can't throw exception, because this is on dtor's call chain
            fprintf(stderr, "Attempt to terminate nested scheduler in blocking mode\n");
            exit(1);
        }
    } else {
#if TBB_USE_ASSERT
        if (BlockingTSI) {
            __TBB_ASSERT( BlockingTSI == tsi_ptr, "For blocking termination, the last terminate_scheduler call must be blocking." );
            IsBlockingTermiantionInProgress = true;
        }
#endif
        s->cleanup_master();
        BlockingTSI = NULL;
#if TBB_USE_ASSERT
        IsBlockingTermiantionInProgress = false;
#endif
    }
}

void governor::auto_terminate(void* arg){
    generic_scheduler* s = static_cast<generic_scheduler*>(arg);
    if( s && s->my_auto_initialized ) {
        if( !--(s->my_ref_count) ) {
            __TBB_ASSERT( !BlockingTSI, "Blocking auto-terminate is not supported." );
            // If the TLS slot is already cleared by OS or underlying concurrency
            // runtime, restore its value.
            if ( !theTLS.get() )
                theTLS.set(s);
            else __TBB_ASSERT( s == theTLS.get(), NULL );
            s->cleanup_master();
            __TBB_ASSERT( !theTLS.get(), "cleanup_master has not cleared its TLS slot" );
        }
    }
}

void governor::print_version_info () {
    if ( UsePrivateRML )
        PrintExtraVersionInfo( "RML", "private" );
    else {
        PrintExtraVersionInfo( "RML", "shared" );
        theRMLServerFactory.call_with_server_info( PrintRMLVersionInfo, (void*)"" );
    }
#if __TBB_SURVIVE_THREAD_SWITCH
    if( watch_stack_handler )
        PrintExtraVersionInfo( "CILK", CILKLIB_NAME );
#endif /* __TBB_SURVIVE_THREAD_SWITCH */
}

void governor::initialize_rml_factory () {
    ::rml::factory::status_type res = theRMLServerFactory.open(); 
    UsePrivateRML = res != ::rml::factory::st_success;
}

#if __TBB_SURVIVE_THREAD_SWITCH
__cilk_tbb_retcode governor::stack_op_handler( __cilk_tbb_stack_op op, void* data ) {
    __TBB_ASSERT(data,NULL);
    generic_scheduler* s = static_cast<generic_scheduler*>(data);
#if TBB_USE_ASSERT
    void* current = theTLS.get();
#if _WIN32||_WIN64
    uintptr_t thread_id = GetCurrentThreadId();
#else
    uintptr_t thread_id = uintptr_t(pthread_self());
#endif

#endif /* TBB_USE_ASSERT */
    switch( op ) {
        default:
            __TBB_ASSERT( 0, "invalid op" );
        case CILK_TBB_STACK_ADOPT: {
            __TBB_ASSERT( !current && s->my_cilk_state==generic_scheduler::cs_limbo || 
                          current==s && s->my_cilk_state==generic_scheduler::cs_running, "invalid adoption" );
#if TBB_USE_ASSERT
            if( current==s ) 
                runtime_warning( "redundant adoption of %p by thread %p\n", s, (void*)thread_id );
            s->my_cilk_state = generic_scheduler::cs_running;
#endif /* TBB_USE_ASSERT */
            theTLS.set(s);
            break;
        }
        case CILK_TBB_STACK_ORPHAN: {
            __TBB_ASSERT( current==s && s->my_cilk_state==generic_scheduler::cs_running, "invalid orphaning" ); 
#if TBB_USE_ASSERT
            s->my_cilk_state = generic_scheduler::cs_limbo;
#endif /* TBB_USE_ASSERT */
            theTLS.set(NULL);
            break;
        }
        case CILK_TBB_STACK_RELEASE: {
            __TBB_ASSERT( !current && s->my_cilk_state==generic_scheduler::cs_limbo || 
                          current==s && s->my_cilk_state==generic_scheduler::cs_running, "invalid release" );
#if TBB_USE_ASSERT
            s->my_cilk_state = generic_scheduler::cs_freed;
#endif /* TBB_USE_ASSERT */
            s->my_cilk_unwatch_thunk.routine = NULL;
            auto_terminate( s );
        } 
    }
    return 0;
}
#endif /* __TBB_SURVIVE_THREAD_SWITCH */

} // namespace internal
Example #7
namespace internal {

#if DO_ITT_NOTIFY

//! Table describing the __itt_notify handlers.
static const DynamicLinkDescriptor ITT_HandlerTable[] = {
    DLD( __itt_notify_sync_prepare, ITT_Handler_sync_prepare),
    DLD( __itt_notify_sync_acquired, ITT_Handler_sync_acquired),
    DLD( __itt_notify_sync_releasing, ITT_Handler_sync_releasing),
    DLD( __itt_notify_sync_cancel, ITT_Handler_sync_cancel),
# if _WIN32||_WIN64
    DLD( __itt_thr_name_setW, ITT_Handler_thr_name_set),
    DLD( __itt_thread_set_nameW, ITT_Handler_thread_set_name),
# else
    DLD( __itt_thr_name_set, ITT_Handler_thr_name_set),
    DLD( __itt_thread_set_name, ITT_Handler_thread_set_name),
# endif /* _WIN32 || _WIN64 */

#if __TBB_NAMING_API_SUPPORT
# if _WIN32||_WIN64
    DLD( __itt_sync_createW, ITT_Handler_sync_create),
    DLD( __itt_sync_renameW, ITT_Handler_sync_rename)
# else /* !WIN */
    DLD( __itt_sync_create, ITT_Handler_sync_create),
    DLD( __itt_sync_rename, ITT_Handler_sync_rename)
# endif /* !WIN */
#endif /* __TBB_NAMING_API_SUPPORT */
};

static const int ITT_HandlerTable_size = 
    sizeof(ITT_HandlerTable)/sizeof(DynamicLinkDescriptor);

// LIBITTNOTIFY_NAME is the name of the ITT notification library 
# if _WIN32||_WIN64
#  define LIBITTNOTIFY_NAME "libittnotify.dll"
# elif __linux__
#  define LIBITTNOTIFY_NAME "libittnotify.so"
# else
#  error Intel(R) Threading Tools not provided for this OS
# endif

//! Performs tools support initialization.
/** Is called by DoOneTimeInitializations and ITT_DoOneTimeInitialization in 
    a protected (one-time) manner. Not to be invoked directly. **/
bool InitializeITT() {
    bool result = false;
    // Check if we are running under a performance or correctness tool
    bool t_checker = GetBoolEnvironmentVariable("KMP_FOR_TCHECK");
    bool t_profiler = GetBoolEnvironmentVariable("KMP_FOR_TPROFILE");
    __TBB_ASSERT( !(t_checker && t_profiler), NULL );
    if ( t_checker || t_profiler ) {
        // Yes, we are in the tool mode. Try to load libittnotify library.
        result = FillDynamicLinks( LIBITTNOTIFY_NAME, ITT_HandlerTable, ITT_HandlerTable_size, 4 );
    }
    if (result){
        if ( t_checker ) {
            current_tool = ITC;
        } else if ( t_profiler ) {
            current_tool = ITP;
        }
    } else {
        // Clear away the proxy (dummy) handlers
        for (int i = 0; i < ITT_HandlerTable_size; i++)
            *ITT_HandlerTable[i].handler = NULL;
        current_tool = NONE;
    }
    PrintExtraVersionInfo( "ITT", result?"yes":"no" );
    return result;
}

#if !__TBB_NAMING_API_SUPPORT
    #define ITT_DoOneTimeInitialization DoOneTimeInitializations
#endif

//! Performs one-time initialization of tools interoperability mechanisms.
/** Defined in task.cpp. Makes a protected do-once call to InitializeITT(). **/
void ITT_DoOneTimeInitialization();
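
// A plausible shape for that task.cpp definition (an assumption added for illustration
// only; the real definition may differ), reusing the atomic_do_once/do_once_state
// helpers that appear in other parts of this source:
//
//     static atomic<do_once_state> itt_initialization_state;
//     void ITT_DoOneTimeInitialization() {
//         atomic_do_once( &InitializeITT, itt_initialization_state );
//     }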

/** The following dummy_xxx functions are proxies that correspond to the tool notification
    APIs and are used to initialize the corresponding pointers to the tool notifications
    (ITT_Handler_xxx). When the first call to an ITT_Handler_xxx takes place before
    the whole library initialization (done by DoOneTimeInitializations) has happened,
    the proxy handler performs the initialization of the tools support. After this,
    each ITT_Handler_xxx is set to either the tool notification pointer or NULL. **/
void dummy_sync_prepare( volatile void* ptr ) {
    ITT_DoOneTimeInitialization();
    __TBB_ASSERT( ITT_Handler_sync_prepare!=&dummy_sync_prepare, NULL );
    if (ITT_Handler_sync_prepare)
        (*ITT_Handler_sync_prepare) (ptr);
}

void dummy_sync_acquired( volatile void* ptr ) {
    ITT_DoOneTimeInitialization();
    __TBB_ASSERT( ITT_Handler_sync_acquired!=&dummy_sync_acquired, NULL );
    if (ITT_Handler_sync_acquired)
        (*ITT_Handler_sync_acquired) (ptr);
}

void dummy_sync_releasing( volatile void* ptr ) {
    ITT_DoOneTimeInitialization();
    __TBB_ASSERT( ITT_Handler_sync_releasing!=&dummy_sync_releasing, NULL );
    if (ITT_Handler_sync_releasing)
        (*ITT_Handler_sync_releasing) (ptr);
}

void dummy_sync_cancel( volatile void* ptr ) {
    ITT_DoOneTimeInitialization();
    __TBB_ASSERT( ITT_Handler_sync_cancel!=&dummy_sync_cancel, NULL );
    if (ITT_Handler_sync_cancel)
        (*ITT_Handler_sync_cancel) (ptr);
}

int dummy_thr_name_set( const tchar* str, int number ) {
    ITT_DoOneTimeInitialization();
    __TBB_ASSERT( ITT_Handler_thr_name_set!=&dummy_thr_name_set, NULL );
    if (ITT_Handler_thr_name_set)
        return (*ITT_Handler_thr_name_set) (str, number);
    return -1;
}

void dummy_thread_set_name( const tchar* name ) {
    ITT_DoOneTimeInitialization();
    __TBB_ASSERT( ITT_Handler_thread_set_name!=&dummy_thread_set_name, NULL );
    if (ITT_Handler_thread_set_name)
        (*ITT_Handler_thread_set_name)( name );
}

#if __TBB_NAMING_API_SUPPORT
void dummy_sync_create( void* obj, const tchar* objname, const tchar* objtype, int /*attribute*/ ) {
    ITT_DoOneTimeInitialization();
    __TBB_ASSERT( ITT_Handler_sync_create!=&dummy_sync_create, NULL );
    ITT_SYNC_CREATE( obj, objtype, objname );
}

void dummy_sync_rename( void* obj, const tchar* new_name ) {
    ITT_DoOneTimeInitialization();
    __TBB_ASSERT( ITT_Handler_sync_rename!=&dummy_sync_rename, NULL );
    ITT_SYNC_RENAME(obj, new_name);
}

void itt_set_sync_name_v3( void *obj, const tchar* name) {
    __TBB_ASSERT( ITT_Handler_sync_rename!=&dummy_sync_rename, NULL );
    ITT_SYNC_RENAME(obj, name);
}
#endif /* __TBB_NAMING_API_SUPPORT */

//! Leading padding before the area where tool notification handlers are placed.
/** Prevents the cache lines where the handler pointers are stored from thrashing.
    Defined as extern to prevent the compiler from placing the padding arrays separately
    from the handler pointers (which are declared as extern).
    Declared separately from the definition to get rid of compiler warnings. **/
extern char __ITT_Handler_leading_padding[NFS_MaxLineSize];

//! Trailing padding after the area where tool notification handlers are placed.
extern char __ITT_Handler_trailing_padding[NFS_MaxLineSize];

char __ITT_Handler_leading_padding[NFS_MaxLineSize] = {0};
PointerToITT_Handler ITT_Handler_sync_prepare = &dummy_sync_prepare;
PointerToITT_Handler ITT_Handler_sync_acquired = &dummy_sync_acquired;
PointerToITT_Handler ITT_Handler_sync_releasing = &dummy_sync_releasing;
PointerToITT_Handler ITT_Handler_sync_cancel = &dummy_sync_cancel;
PointerToITT_thr_name_set ITT_Handler_thr_name_set = &dummy_thr_name_set;
PointerToITT_thread_set_name ITT_Handler_thread_set_name = &dummy_thread_set_name;
#if __TBB_NAMING_API_SUPPORT
PointerToITT_sync_create ITT_Handler_sync_create = &dummy_sync_create;
PointerToITT_sync_rename ITT_Handler_sync_rename = &dummy_sync_rename;
#endif /* __TBB_NAMING_API_SUPPORT */
char __ITT_Handler_trailing_padding[NFS_MaxLineSize] = {0};

target_tool current_tool = TO_BE_INITIALIZED;

#endif /* DO_ITT_NOTIFY */

void itt_store_pointer_with_release_v3( void* dst, void* src ) {
    ITT_NOTIFY(sync_releasing, dst);
    __TBB_store_with_release(*static_cast<void**>(dst),src);
}

void* itt_load_pointer_with_acquire_v3( const void* src ) {
    void* result = __TBB_load_with_acquire(*static_cast<void*const*>(src));
    ITT_NOTIFY(sync_acquired, const_cast<void*>(src));
    return result;
}

void* itt_load_pointer_v3( const void* src ) {
    void* result = *static_cast<void*const*>(src);
    return result;
}

} // namespace internal