WaitForSingleObject( shptr->h, INFINITE );
}
void WINAPI release_binsem_using_event( SRWLOCK* h_ ) {
    srwl_or_handle* shptr = (srwl_or_handle*) h_;
    SetEvent( shptr->h );
}

static void (WINAPI *__TBB_init_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&init_binsem_using_event;
static void (WINAPI *__TBB_acquire_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&acquire_binsem_using_event;
static void (WINAPI *__TBB_release_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&release_binsem_using_event;

//! Table describing how to link the handlers.
static const dynamic_link_descriptor SRWLLinkTable[] = {
    DLD(InitializeSRWLock, __TBB_init_binsem),
    DLD(AcquireSRWLockExclusive, __TBB_acquire_binsem),
    DLD(ReleaseSRWLockExclusive, __TBB_release_binsem)
};

inline void init_concmon_module() {
    __TBB_ASSERT( (uintptr_t)__TBB_init_binsem==(uintptr_t)&init_binsem_using_event, NULL );
    if( dynamic_link( "Kernel32.dll", SRWLLinkTable, sizeof(SRWLLinkTable)/sizeof(dynamic_link_descriptor) ) ) {
        __TBB_ASSERT( (uintptr_t)__TBB_init_binsem!=(uintptr_t)&init_binsem_using_event, NULL );
        __TBB_ASSERT( (uintptr_t)__TBB_acquire_binsem!=(uintptr_t)&acquire_binsem_using_event, NULL );
        __TBB_ASSERT( (uintptr_t)__TBB_release_binsem!=(uintptr_t)&release_binsem_using_event, NULL );
    }
}

binary_semaphore::binary_semaphore() {
namespace internal { #if __linux__ || __FreeBSD_version >= 701000 static void set_affinity_mask( size_t maskSize, const basic_mask_t* threadMask ) { #if __linux__ if( sched_setaffinity( 0, maskSize, threadMask ) ) #else /* FreeBSD */ if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) ) #endif runtime_warning( "setaffinity syscall failed" ); } static void get_affinity_mask( size_t maskSize, basic_mask_t* threadMask ) { #if __linux__ if( sched_getaffinity( 0, maskSize, threadMask ) ) #else /* FreeBSD */ if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) ) #endif runtime_warning( "getaffinity syscall failed" ); } static basic_mask_t* process_mask; static int num_masks; struct process_mask_cleanup_helper { ~process_mask_cleanup_helper() { if( process_mask ) { delete [] process_mask; } } }; static process_mask_cleanup_helper process_mask_cleanup; #define curMaskSize sizeof(basic_mask_t) * num_masks affinity_helper::~affinity_helper() { if( threadMask ) { if( is_changed ) { set_affinity_mask( curMaskSize, threadMask ); } delete [] threadMask; } } void affinity_helper::protect_affinity_mask() { if( threadMask == NULL && num_masks && process_mask ) { threadMask = new basic_mask_t [num_masks]; memset( threadMask, 0, curMaskSize ); get_affinity_mask( curMaskSize, threadMask ); is_changed = memcmp( process_mask, threadMask, curMaskSize ); if( is_changed ) { set_affinity_mask( curMaskSize, process_mask ); } } } #undef curMaskSize static atomic<do_once_state> hardware_concurrency_info; static int theNumProcs; static void initialize_hardware_concurrency_info () { int err; int availableProcs = 0; int numMasks = 1; #if __linux__ #if __TBB_MAIN_THREAD_AFFINITY_BROKEN int maxProcs = INT_MAX; // To check the entire mask. int pid = 0; // Get the mask of the calling thread. #else int maxProcs = sysconf(_SC_NPROCESSORS_ONLN); int pid = getpid(); #endif cpu_set_t *processMask; const size_t BasicMaskSize = sizeof(cpu_set_t); for (;;) { int curMaskSize = BasicMaskSize * numMasks; processMask = new cpu_set_t[numMasks]; memset( processMask, 0, curMaskSize ); err = sched_getaffinity( pid, curMaskSize, processMask ); if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 ) break; delete[] processMask; numMasks <<= 1; } #else /* FreeBSD >= 7.1 */ int maxProcs = sysconf(_SC_NPROCESSORS_ONLN); cpuset_t *processMask; const size_t BasicMaskSize = sizeof(cpuset_t); for (;;) { int curMaskSize = BasicMaskSize * numMasks; processMask = new cpuset_t[numMasks]; memset( processMask, 0, curMaskSize ); // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask #if __TBB_MAIN_THREAD_AFFINITY_BROKEN err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, curMaskSize, processMask ); #else err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask ); #endif if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 ) break; delete[] processMask; numMasks <<= 1; } #endif /* FreeBSD >= 7.1 */ if ( !err ) { for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) { for ( size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) { if ( CPU_ISSET( i, processMask + m ) ) ++availableProcs; } } num_masks = numMasks; process_mask = processMask; } else { availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs; delete[] processMask; } theNumProcs = availableProcs > 0 ? 
availableProcs : 1; // Fail safety strap __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), NULL ); } int AvailableHwConcurrency() { atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info ); return theNumProcs; } #elif defined(_SC_NPROCESSORS_ONLN) int AvailableHwConcurrency() { int n = sysconf(_SC_NPROCESSORS_ONLN); return (n > 0) ? n : 1; } #elif _WIN32||_WIN64 static atomic<do_once_state> hardware_concurrency_info; static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff; // Statically allocate an array for processor group information. // Windows 7 supports maximum 4 groups, but let's look ahead a little. static const WORD MaxProcessorGroups = 64; struct ProcessorGroupInfo { DWORD_PTR mask; ///< Affinity mask covering the whole group int numProcs; ///< Number of processors in the group int numProcsRunningTotal; ///< Subtotal of processors in this and preceding groups //! Total number of processor groups in the system static int NumGroups; //! Index of the group with a slot reserved for the first master thread /** In the context of multiple processor groups support current implementation defines "the first master thread" as the first thread to invoke AvailableHwConcurrency(). TODO: Implement a dynamic scheme remapping workers depending on the pending master threads affinity. **/ static int HoleIndex; }; int ProcessorGroupInfo::NumGroups = 1; int ProcessorGroupInfo::HoleIndex = 0; ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups]; struct TBB_GROUP_AFFINITY { DWORD_PTR Mask; WORD Group; WORD Reserved[3]; }; static DWORD (WINAPI *TBB_GetMaximumProcessorCount)( WORD groupIndex ) = NULL; static WORD (WINAPI *TBB_GetMaximumProcessorGroupCount)() = NULL; static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread, const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff ); static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* ); static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = { DLD(GetMaximumProcessorCount, TBB_GetMaximumProcessorCount) , DLD(GetMaximumProcessorGroupCount, TBB_GetMaximumProcessorGroupCount) , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity) , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity) }; static void initialize_hardware_concurrency_info () { dynamic_link( GetModuleHandleA( "Kernel32.dll" ), ProcessorGroupsApiLinkTable, sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) ); SYSTEM_INFO si; GetSystemInfo(&si); DWORD_PTR pam, sam, m = 1; GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam ); int nproc = 0; for ( size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) { if ( pam & m ) ++nproc; } __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, NULL ); if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetMaximumProcessorCount ) { // The process does not have restricting affinity mask and multiple processor groups are possible ProcessorGroupInfo::NumGroups = (int)TBB_GetMaximumProcessorGroupCount(); __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, NULL ); // Fail safety bootstrap. Release versions will limit available concurrency // level, while debug ones would assert. 
if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups ) ProcessorGroupInfo::NumGroups = MaxProcessorGroups; if ( ProcessorGroupInfo::NumGroups > 1 ) { TBB_GROUP_AFFINITY ga; if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) ) ProcessorGroupInfo::HoleIndex = ga.Group; int nprocs = 0; for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) { ProcessorGroupInfo &pgi = theProcessorGroups[i]; pgi.numProcs = (int)TBB_GetMaximumProcessorCount(i); __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, NULL ); pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1; pgi.numProcsRunningTotal = nprocs += pgi.numProcs; } __TBB_ASSERT( nprocs == (int)TBB_GetMaximumProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), NULL ); return; } } // Either the process has restricting affinity mask or only a single processor groups is present theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc; PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups); if (ProcessorGroupInfo::NumGroups>1) for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i) PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs); } int AvailableHwConcurrency() { atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info ); return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal; } int NumberOfProcessorGroups() { __TBB_ASSERT( hardware_concurrency_info == initialization_complete, "NumberOfProcessorGroups is used before AvailableHwConcurrency" ); return ProcessorGroupInfo::NumGroups; } // Offset for the slot reserved for the first master thread #define HoleAdjusted(procIdx, grpIdx) (procIdx + (holeIdx <= grpIdx)) int FindProcessorGroupIndex ( int procIdx ) { // In case of oversubscription spread extra workers in a round robin manner int holeIdx; const int numProcs = theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal; if ( procIdx >= numProcs - 1 ) { holeIdx = INT_MAX; procIdx = (procIdx - numProcs + 1) % numProcs; } else holeIdx = ProcessorGroupInfo::HoleIndex; __TBB_ASSERT( hardware_concurrency_info == initialization_complete, "FindProcessorGroupIndex is used before AvailableHwConcurrency" ); // Approximate the likely group index assuming all groups are of the same size int i = procIdx / theProcessorGroups[0].numProcs; // Make sure the approximation is a valid group index if (i >= ProcessorGroupInfo::NumGroups) i = ProcessorGroupInfo::NumGroups-1; // Now adjust the approximation up or down if ( theProcessorGroups[i].numProcsRunningTotal > HoleAdjusted(procIdx, i) ) { while ( theProcessorGroups[i].numProcsRunningTotal - theProcessorGroups[i].numProcs > HoleAdjusted(procIdx, i) ) { __TBB_ASSERT( i > 0, NULL ); --i; } } else { do { ++i; } while ( theProcessorGroups[i].numProcsRunningTotal <= HoleAdjusted(procIdx, i) ); } __TBB_ASSERT( i < ProcessorGroupInfo::NumGroups, NULL ); return i; } void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) { __TBB_ASSERT( hardware_concurrency_info == initialization_complete, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" ); if ( !TBB_SetThreadGroupAffinity ) return; TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} }; TBB_SetThreadGroupAffinity( hThread, &ga, NULL ); } #else #error AvailableHwConcurrency is not implemented in this OS #endif /* OS */ } // namespace internal
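// Illustrative sketch (not part of the original sources): the processor-group bookkeeping
// above keeps per-group sizes plus running totals, and AvailableHwConcurrency() returns the
// last running total. For oversubscribed worker indices, FindProcessorGroupIndex() wraps the
// index round-robin and stops reserving the master's "hole". The group sizes and worker index
// below are made up; the block is compiled out and only illustrates that arithmetic.
#if 0
#include <cstdio>

int main() {
    const int NumGroups = 2;
    int numProcs[NumGroups] = { 4, 4 };                  // e.g. two groups of 4 logical CPUs
    int numProcsRunningTotal[NumGroups];
    int total = 0;
    for ( int g = 0; g < NumGroups; ++g )
        numProcsRunningTotal[g] = total += numProcs[g];  // prefix sums: 4, 8
    int hwConcurrency = numProcsRunningTotal[NumGroups - 1]; // what AvailableHwConcurrency() reports: 8
    // Oversubscription wrap-around used by FindProcessorGroupIndex(): indices at or beyond
    // hwConcurrency - 1 are remapped round-robin and the hole is ignored (holeIdx = INT_MAX).
    int procIdx = 9;                                     // hypothetical worker index
    if ( procIdx >= hwConcurrency - 1 )
        procIdx = ( procIdx - hwConcurrency + 1 ) % hwConcurrency;  // 9 -> 2
    std::printf( "concurrency=%d remapped procIdx=%d\n", hwConcurrency, procIdx );
    return 0;
}
#endif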
} LeaveCriticalSection( &cv_event->mutex ); CloseHandle( my_event ); } void WINAPI destroy_condvar_noop( CONDITION_VARIABLE* /*cv*/ ) { /*no op*/ } static void (WINAPI *__TBB_init_condvar)( PCONDITION_VARIABLE ) = (void (WINAPI *)(PCONDITION_VARIABLE))&init_condvar_using_event; static BOOL (WINAPI *__TBB_condvar_wait)( PCONDITION_VARIABLE, LPCRITICAL_SECTION, DWORD ) = (BOOL (WINAPI *)(PCONDITION_VARIABLE,LPCRITICAL_SECTION, DWORD))&sleep_condition_variable_cs_using_event; static void (WINAPI *__TBB_condvar_notify_one)( PCONDITION_VARIABLE ) = (void (WINAPI *)(PCONDITION_VARIABLE))&wake_condition_variable_using_event; static void (WINAPI *__TBB_condvar_notify_all)( PCONDITION_VARIABLE ) = (void (WINAPI *)(PCONDITION_VARIABLE))&wake_all_condition_variable_using_event; static void (WINAPI *__TBB_destroy_condvar)( PCONDITION_VARIABLE ) = (void (WINAPI *)(PCONDITION_VARIABLE))&destroy_condvar_using_event; //! Table describing how to link the handlers. static const dynamic_link_descriptor CondVarLinkTable[] = { DLD(InitializeConditionVariable, __TBB_init_condvar), DLD(SleepConditionVariableCS, __TBB_condvar_wait), DLD(WakeConditionVariable, __TBB_condvar_notify_one), DLD(WakeAllConditionVariable, __TBB_condvar_notify_all) }; void init_condvar_module() { __TBB_ASSERT( (uintptr_t)__TBB_init_condvar==(uintptr_t)&init_condvar_using_event, NULL ); if( dynamic_link( "Kernel32.dll", CondVarLinkTable, 4 ) ) __TBB_destroy_condvar = (void (WINAPI *)(PCONDITION_VARIABLE))&destroy_condvar_noop; } #endif /* _WIN32||_WIN64 */ } // namespace internal
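// Illustrative sketch (not part of the original sources): the fallback pattern behind
// CondVarLinkTable above -- each entry point starts out as an event-based emulation behind a
// function pointer, and a dynamic-link step swaps in the native Kernel32 routine when it
// exists (Vista and later). The sketch uses plain GetProcAddress instead of
// tbb::internal::dynamic_link to stay self-contained, and its emulation body is only a
// stand-in; the block is compiled out.
#if 0
#include <windows.h>

typedef void (WINAPI *condvar_notify_t)( PCONDITION_VARIABLE );

static void WINAPI notify_one_using_event( PCONDITION_VARIABLE /*cv*/ ) {
    // stand-in for wake_condition_variable_using_event: signal a per-waiter event here
}

static condvar_notify_t notify_one = &notify_one_using_event;   // default: emulation

static void link_native_condvar() {
    HMODULE k32 = GetModuleHandleA( "Kernel32.dll" );
    FARPROC p = k32 ? GetProcAddress( k32, "WakeConditionVariable" ) : NULL;
    if ( p )
        notify_one = (condvar_notify_t)p;   // native condition variables are available
    // otherwise keep the event-based emulation (pre-Vista kernels)
}
#endif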
namespace internal {

//------------------------------------------------------------------------
// governor
//------------------------------------------------------------------------

#if __TBB_SURVIVE_THREAD_SWITCH

#if _WIN32
#define CILKLIB_NAME "cilkrts20.dll"
#else
#define CILKLIB_NAME "libcilkrts.so"
#endif

//! Handler for interoperation with the cilkrts library.
static __cilk_tbb_retcode (*watch_stack_handler)(struct __cilk_tbb_unwatch_thunk* u,
                                                 struct __cilk_tbb_stack_op_thunk o);

#if __TBB_WEAK_SYMBOLS
    #pragma weak __cilkrts_watch_stack
#endif

//! Table describing how to link the handlers.
static const dynamic_link_descriptor CilkLinkTable[] = {
    DLD(__cilkrts_watch_stack, watch_stack_handler)
};

static atomic<do_once_state> cilkrts_load_state;

bool initialize_cilk_interop() {
    // Pinning can fail. This is a normal situation, and means that the current
    // thread does not use Cilk and consequently does not need interop.
    return dynamic_link( CILKLIB_NAME, CilkLinkTable, 1 );
}
#endif /* __TBB_SURVIVE_THREAD_SWITCH */

namespace rml {
    tbb_server* make_private_server( tbb_client& client );
}

void governor::acquire_resources () {
#if USE_PTHREAD
    int status = theTLS.create(auto_terminate);
#else
    int status = theTLS.create();
#endif
    if( status )
        handle_perror(status, "TBB failed to initialize TLS storage\n");

    ::rml::factory::status_type res = theRMLServerFactory.open();
    UsePrivateRML = res != ::rml::factory::st_success;
}

void governor::release_resources () {
    theRMLServerFactory.close();
#if TBB_USE_ASSERT
    if( __TBB_InitOnce::initialization_done() && theTLS.get() )
        runtime_warning( "TBB is unloaded while tbb::task_scheduler_init object is alive?" );
#endif
    int status = theTLS.destroy();
    if( status )
        handle_perror(status, "TBB failed to destroy TLS storage");
    dynamic_unlink_all();
}

rml::tbb_server* governor::create_rml_server ( rml::tbb_client& client ) {
    rml::tbb_server* server = NULL;
    if( !UsePrivateRML ) {
        ::rml::factory::status_type status = theRMLServerFactory.make_server( server, client );
        if( status != ::rml::factory::st_success ) {
            UsePrivateRML = true;
            runtime_warning( "rml::tbb_factory::make_server failed with status %x, falling back on private rml", status );
        }
    }
    if ( !server ) {
        __TBB_ASSERT( UsePrivateRML, NULL );
        server = rml::make_private_server( client );
    }
    __TBB_ASSERT( server, "Failed to create RML server" );
    return server;
}

void governor::sign_on(generic_scheduler* s) {
    __TBB_ASSERT( !s->my_registered, NULL );
    s->my_registered = true;
    theTLS.set(s);
#if __TBB_SURVIVE_THREAD_SWITCH
    __cilk_tbb_stack_op_thunk o;
    o.routine = &stack_op_handler;
    o.data = s;
    if( watch_stack_handler ) {
        if( (*watch_stack_handler)(&s->my_cilk_unwatch_thunk, o) ) {
            // Failed to register with Cilk, make sure we are clean
            s->my_cilk_unwatch_thunk.routine = NULL;
        }
#if TBB_USE_ASSERT
        else
            s->my_cilk_state = generic_scheduler::cs_running;
#endif /* TBB_USE_ASSERT */
    }
#endif /* __TBB_SURVIVE_THREAD_SWITCH */
}

void governor::sign_off(generic_scheduler* s) {
    if( s->my_registered ) {
        __TBB_ASSERT( theTLS.get()==s || (!s->is_worker() && !theTLS.get()), "attempt to unregister a wrong scheduler instance" );
        theTLS.set(NULL);
        s->my_registered = false;
#if __TBB_SURVIVE_THREAD_SWITCH
        __cilk_tbb_unwatch_thunk &ut = s->my_cilk_unwatch_thunk;
        if ( ut.routine )
            (*ut.routine)(ut.data);
#endif /* __TBB_SURVIVE_THREAD_SWITCH */
    }
}

generic_scheduler* governor::init_scheduler( unsigned num_threads, stack_size_type stack_size, bool auto_init ) {
    if( !__TBB_InitOnce::initialization_done() )
        DoOneTimeInitializations();
    generic_scheduler* s =
theTLS.get(); if( s ) { s->my_ref_count += 1; return s; } #if __TBB_SURVIVE_THREAD_SWITCH atomic_do_once( &initialize_cilk_interop, cilkrts_load_state ); #endif /* __TBB_SURVIVE_THREAD_SWITCH */ if( (int)num_threads == task_scheduler_init::automatic ) num_threads = default_num_threads(); s = generic_scheduler::create_master( market::create_arena( num_threads - 1, stack_size ? stack_size : ThreadStackSize ) ); __TBB_ASSERT(s, "Somehow a local scheduler creation for a master thread failed"); s->my_auto_initialized = auto_init; return s; } void governor::terminate_scheduler( generic_scheduler* s ) { __TBB_ASSERT( s == theTLS.get(), "Attempt to terminate non-local scheduler instance" ); if( !--(s->my_ref_count) ) s->cleanup_master(); } void governor::auto_terminate(void* arg){ generic_scheduler* s = static_cast<generic_scheduler*>(arg); if( s && s->my_auto_initialized ) { if( !--(s->my_ref_count) ) { if ( !theTLS.get() && !s->local_task_pool_empty() ) { // This thread's TLS slot is already cleared. But in order to execute // remaining tasks cleanup_master() will need TLS correctly set. // So we temporarily restore its value. theTLS.set(s); s->cleanup_master(); theTLS.set(NULL); } else s->cleanup_master(); } } } void governor::print_version_info () { if ( UsePrivateRML ) PrintExtraVersionInfo( "RML", "private" ); else { PrintExtraVersionInfo( "RML", "shared" ); theRMLServerFactory.call_with_server_info( PrintRMLVersionInfo, (void*)"" ); } #if __TBB_SURVIVE_THREAD_SWITCH if( watch_stack_handler ) PrintExtraVersionInfo( "CILK", CILKLIB_NAME ); #endif /* __TBB_SURVIVE_THREAD_SWITCH */ } #if __TBB_SURVIVE_THREAD_SWITCH __cilk_tbb_retcode governor::stack_op_handler( __cilk_tbb_stack_op op, void* data ) { __TBB_ASSERT(data,NULL); generic_scheduler* s = static_cast<generic_scheduler*>(data); #if TBB_USE_ASSERT void* current = theTLS.get(); #if _WIN32||_WIN64 unsigned thread_id = GetCurrentThreadId(); #else unsigned thread_id = unsigned(pthread_self()); #endif #endif /* TBB_USE_ASSERT */ switch( op ) { default: __TBB_ASSERT( 0, "invalid op" ); case CILK_TBB_STACK_ADOPT: { __TBB_ASSERT( !current && s->my_cilk_state==generic_scheduler::cs_limbo || current==s && s->my_cilk_state==generic_scheduler::cs_running, "invalid adoption" ); #if TBB_USE_ASSERT if( current==s ) runtime_warning( "redundant adoption of %p by thread %x\n", s, thread_id ); s->my_cilk_state = generic_scheduler::cs_running; #endif /* TBB_USE_ASSERT */ theTLS.set(s); break; } case CILK_TBB_STACK_ORPHAN: { __TBB_ASSERT( current==s && s->my_cilk_state==generic_scheduler::cs_running, "invalid orphaning" ); #if TBB_USE_ASSERT s->my_cilk_state = generic_scheduler::cs_limbo; #endif /* TBB_USE_ASSERT */ theTLS.set(NULL); break; } case CILK_TBB_STACK_RELEASE: { __TBB_ASSERT( !current && s->my_cilk_state==generic_scheduler::cs_limbo || current==s && s->my_cilk_state==generic_scheduler::cs_running, "invalid release" ); #if TBB_USE_ASSERT s->my_cilk_state = generic_scheduler::cs_freed; #endif /* TBB_USE_ASSERT */ s->my_cilk_unwatch_thunk.routine = NULL; auto_terminate( s ); } } return 0; } #endif /* __TBB_SURVIVE_THREAD_SWITCH */ } // namespace internal
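// Illustrative sketch (not part of the original sources): the per-thread reference counting
// that init_scheduler()/terminate_scheduler()/auto_terminate() above implement. The stub type
// and functions below are hypothetical stand-ins for generic_scheduler and theTLS; they model
// only the counting and the TLS hand-off, not real scheduler construction. Compiled out.
#if 0
#include <cassert>
#include <cstddef>

struct scheduler_stub { int ref_count; bool auto_initialized; };

static __thread scheduler_stub* tls_scheduler = NULL;   // per-thread slot, like theTLS

static scheduler_stub* init_scheduler_stub( bool auto_init ) {
    if ( tls_scheduler ) {                  // nested task_scheduler_init: just bump the count
        ++tls_scheduler->ref_count;
        return tls_scheduler;
    }
    scheduler_stub* s = new scheduler_stub; // first initialization on this thread
    s->ref_count = 1;
    s->auto_initialized = auto_init;
    tls_scheduler = s;
    return s;
}

static void terminate_scheduler_stub( scheduler_stub* s ) {
    assert( s == tls_scheduler );
    if ( --s->ref_count == 0 ) {            // the last initializer on this thread went away
        tls_scheduler = NULL;               // cleanup_master() clears the slot in TBB
        delete s;
    }
}
#endif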
namespace internal { #if __TBB_USE_OS_AFFINITY_SYSCALL #if __linux__ // Handlers for interoperation with libiomp static int (*libiomp_try_restoring_original_mask)(); // Table for mapping to libiomp entry points static const dynamic_link_descriptor iompLinkTable[] = { { "kmp_set_thread_affinity_mask_initial", (pointer_to_handler*)(void*)(&libiomp_try_restoring_original_mask) } }; #endif static void set_thread_affinity_mask( size_t maskSize, const basic_mask_t* threadMask ) { #if __linux__ if( sched_setaffinity( 0, maskSize, threadMask ) ) #else /* FreeBSD */ if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) ) #endif runtime_warning( "setaffinity syscall failed" ); } static void get_thread_affinity_mask( size_t maskSize, basic_mask_t* threadMask ) { #if __linux__ if( sched_getaffinity( 0, maskSize, threadMask ) ) #else /* FreeBSD */ if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) ) #endif runtime_warning( "getaffinity syscall failed" ); } static basic_mask_t* process_mask; static int num_masks; void destroy_process_mask() { if( process_mask ) { delete [] process_mask; } } #define curMaskSize sizeof(basic_mask_t) * num_masks affinity_helper::~affinity_helper() { if( threadMask ) { if( is_changed ) { set_thread_affinity_mask( curMaskSize, threadMask ); } delete [] threadMask; } } void affinity_helper::protect_affinity_mask( bool restore_process_mask ) { if( threadMask == NULL && num_masks ) { // TODO: assert num_masks validity? threadMask = new basic_mask_t [num_masks]; memset( threadMask, 0, curMaskSize ); get_thread_affinity_mask( curMaskSize, threadMask ); if( restore_process_mask ) { __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" ); is_changed = memcmp( process_mask, threadMask, curMaskSize ); if( is_changed ) set_thread_affinity_mask( curMaskSize, process_mask ); } else { // Assume that the mask will be changed by the caller. is_changed = 1; } } } void affinity_helper::dismiss() { if( threadMask ) { delete [] threadMask; threadMask = NULL; } is_changed = 0; } #undef curMaskSize static atomic<do_once_state> hardware_concurrency_info; static int theNumProcs; static void initialize_hardware_concurrency_info () { int err; int availableProcs = 0; int numMasks = 1; #if __linux__ #if __TBB_MAIN_THREAD_AFFINITY_BROKEN int maxProcs = INT_MAX; // To check the entire mask. int pid = 0; // Get the mask of the calling thread. 
#else
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    int pid = getpid();
#endif
#else /* FreeBSD >= 7.1 */
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
#endif
    basic_mask_t* processMask;
    const size_t BasicMaskSize = sizeof(basic_mask_t);
    for (;;) {
        const int curMaskSize = BasicMaskSize * numMasks;
        processMask = new basic_mask_t[numMasks];
        memset( processMask, 0, curMaskSize );
#if __linux__
        err = sched_getaffinity( pid, curMaskSize, processMask );
        if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
            break;
#else /* FreeBSD >= 7.1 */
        // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
#if __TBB_MAIN_THREAD_AFFINITY_BROKEN
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, curMaskSize, processMask );
#else
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
#endif
        if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
            break;
#endif /* FreeBSD >= 7.1 */
        delete[] processMask;
        numMasks <<= 1;
    }
    if ( !err ) {
        // We have found the mask size and captured the process affinity mask into processMask.
        num_masks = numMasks; // do here because it's needed for affinity_helper to work
#if __linux__
        // For better coexistence with libiomp which might have changed the mask already,
        // check for its presence and ask it to restore the mask.
        dynamic_link_handle libhandle;
        if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) {
            // We have found the symbol provided by libiomp5 for restoring original thread affinity.
            affinity_helper affhelp;
            affhelp.protect_affinity_mask( /*restore_process_mask=*/false );
            if ( libiomp_try_restoring_original_mask()==0 ) {
                // Now we have the right mask to capture, restored by libiomp.
                const int curMaskSize = BasicMaskSize * numMasks;
                memset( processMask, 0, curMaskSize );
                get_thread_affinity_mask( curMaskSize, processMask );
            } else
                affhelp.dismiss();  // thread mask has not changed
            dynamic_unlink( libhandle );
            // Destructor of affinity_helper restores the thread mask (unless dismissed).
        }
#endif
        for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
            for ( size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
                if ( CPU_ISSET( i, processMask + m ) )
                    ++availableProcs;
            }
        }
        process_mask = processMask;
    }
    else {
        // Failed to get the process affinity mask; assume the whole machine can be used.
        availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
        delete[] processMask;
    }
    theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail safety strap
    __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), NULL );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theNumProcs;
}
/* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */

#elif __ANDROID__
// Work-around for Android: read the number of available CPUs from sysfs, since the affinity
// system calls are unreliable there.
// Format of "present" file is: ([<int>-<int>|<int>],)+
// (A standalone illustrative parser for this format is sketched after this file chunk.)
int AvailableHwConcurrency() {
    FILE *fp = fopen("/sys/devices/system/cpu/present", "r");
    if (fp == NULL) return 1;
    int num_args, lower, upper, num_cpus=0;
    while ((num_args = fscanf(fp, "%u-%u", &lower, &upper)) != EOF) {
        switch(num_args) {
            case 2: num_cpus += upper - lower + 1; break;
            case 1: num_cpus += 1; break;
        }
        fscanf(fp, ",");
    }
    fclose(fp);
    return (num_cpus > 0) ? num_cpus : 1;
}

#elif defined(_SC_NPROCESSORS_ONLN)
int AvailableHwConcurrency() {
    int n = sysconf(_SC_NPROCESSORS_ONLN);
    return (n > 0) ?
n : 1; } #elif _WIN32||_WIN64 static atomic<do_once_state> hardware_concurrency_info; static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff; // Statically allocate an array for processor group information. // Windows 7 supports maximum 4 groups, but let's look ahead a little. static const WORD MaxProcessorGroups = 64; struct ProcessorGroupInfo { DWORD_PTR mask; ///< Affinity mask covering the whole group int numProcs; ///< Number of processors in the group int numProcsRunningTotal; ///< Subtotal of processors in this and preceding groups //! Total number of processor groups in the system static int NumGroups; //! Index of the group with a slot reserved for the first master thread /** In the context of multiple processor groups support current implementation defines "the first master thread" as the first thread to invoke AvailableHwConcurrency(). TODO: Implement a dynamic scheme remapping workers depending on the pending master threads affinity. **/ static int HoleIndex; }; int ProcessorGroupInfo::NumGroups = 1; int ProcessorGroupInfo::HoleIndex = 0; ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups]; struct TBB_GROUP_AFFINITY { DWORD_PTR Mask; WORD Group; WORD Reserved[3]; }; static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = NULL; static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = NULL; static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread, const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff ); static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* ); static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = { DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount) , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount) , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity) , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity) }; static void initialize_hardware_concurrency_info () { #if __TBB_WIN8UI_SUPPORT // For these applications processor groups info is unavailable // Setting up a number of processors for one processor group theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency(); #else /* __TBB_WIN8UI_SUPPORT */ dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable, sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) ); SYSTEM_INFO si; GetNativeSystemInfo(&si); DWORD_PTR pam, sam, m = 1; GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam ); int nproc = 0; for ( size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) { if ( pam & m ) ++nproc; } __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, NULL ); // By default setting up a number of processors for one processor group theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc; // Setting up processor groups in case the process does not restrict affinity mask and more than one processor group is present if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) { // The process does not have restricting affinity mask and multiple processor groups are possible ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount(); __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, NULL ); // Fail safety bootstrap. Release versions will limit available concurrency // level, while debug ones would assert. 
if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups ) ProcessorGroupInfo::NumGroups = MaxProcessorGroups; if ( ProcessorGroupInfo::NumGroups > 1 ) { TBB_GROUP_AFFINITY ga; if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) ) ProcessorGroupInfo::HoleIndex = ga.Group; int nprocs = 0; for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) { ProcessorGroupInfo &pgi = theProcessorGroups[i]; pgi.numProcs = (int)TBB_GetActiveProcessorCount(i); __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, NULL ); pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1; pgi.numProcsRunningTotal = nprocs += pgi.numProcs; } __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), NULL ); } } #endif /* __TBB_WIN8UI_SUPPORT */ PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups); if (ProcessorGroupInfo::NumGroups>1) for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i) PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs); } int NumberOfProcessorGroups() { __TBB_ASSERT( hardware_concurrency_info == initialization_complete, "NumberOfProcessorGroups is used before AvailableHwConcurrency" ); return ProcessorGroupInfo::NumGroups; } // Offset for the slot reserved for the first master thread #define HoleAdjusted(procIdx, grpIdx) (procIdx + (holeIdx <= grpIdx)) int FindProcessorGroupIndex ( int procIdx ) { // In case of oversubscription spread extra workers in a round robin manner int holeIdx; const int numProcs = theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal; if ( procIdx >= numProcs - 1 ) { holeIdx = INT_MAX; procIdx = (procIdx - numProcs + 1) % numProcs; } else holeIdx = ProcessorGroupInfo::HoleIndex; __TBB_ASSERT( hardware_concurrency_info == initialization_complete, "FindProcessorGroupIndex is used before AvailableHwConcurrency" ); // Approximate the likely group index assuming all groups are of the same size int i = procIdx / theProcessorGroups[0].numProcs; // Make sure the approximation is a valid group index if (i >= ProcessorGroupInfo::NumGroups) i = ProcessorGroupInfo::NumGroups-1; // Now adjust the approximation up or down if ( theProcessorGroups[i].numProcsRunningTotal > HoleAdjusted(procIdx, i) ) { while ( theProcessorGroups[i].numProcsRunningTotal - theProcessorGroups[i].numProcs > HoleAdjusted(procIdx, i) ) { __TBB_ASSERT( i > 0, NULL ); --i; } } else { do { ++i; } while ( theProcessorGroups[i].numProcsRunningTotal <= HoleAdjusted(procIdx, i) ); } __TBB_ASSERT( i < ProcessorGroupInfo::NumGroups, NULL ); return i; } void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) { __TBB_ASSERT( hardware_concurrency_info == initialization_complete, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" ); if ( !TBB_SetThreadGroupAffinity ) return; TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} }; TBB_SetThreadGroupAffinity( hThread, &ga, NULL ); } int AvailableHwConcurrency() { atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info ); return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal; } /* End of _WIN32||_WIN64 implementation */ #else #error AvailableHwConcurrency is not implemented for this OS #endif } // namespace internal
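// Illustrative sketch (not part of the original sources): a standalone version of the Android
// "present"-file parser referenced above, for the ([<int>-<int>|<int>],)+ format such as
// "0-3,6,8-9". The function name and the path parameter are hypothetical. Compiled out.
#if 0
#include <stdio.h>

static int count_cpus_in_present_file( const char* path ) {
    FILE* fp = fopen( path, "r" );
    if ( !fp ) return 1;                                   // fail-safe: report one CPU
    int num_cpus = 0, lower, upper;
    for (;;) {
        int n = fscanf( fp, "%d-%d", &lower, &upper );
        if ( n == 2 )      num_cpus += upper - lower + 1;  // a range, e.g. "0-3"
        else if ( n == 1 ) num_cpus += 1;                  // a single id, e.g. "6"
        else break;                                        // EOF or malformed input
        if ( fscanf( fp, "," ) == EOF ) break;             // consume the separator
    }
    fclose( fp );
    return num_cpus > 0 ? num_cpus : 1;
}
#endif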
namespace internal { //------------------------------------------------------------------------ // governor //------------------------------------------------------------------------ #if __TBB_SURVIVE_THREAD_SWITCH // Support for interoperability with Intel(R) Cilk(tm) Plus. #if _WIN32 #define CILKLIB_NAME "cilkrts20.dll" #else #define CILKLIB_NAME "libcilkrts.so" #endif //! Handler for interoperation with cilkrts library. static __cilk_tbb_retcode (*watch_stack_handler)(struct __cilk_tbb_unwatch_thunk* u, struct __cilk_tbb_stack_op_thunk o); #if __TBB_WEAK_SYMBOLS #pragma weak __cilkrts_watch_stack #endif //! Table describing how to link the handlers. static const dynamic_link_descriptor CilkLinkTable[] = { DLD(__cilkrts_watch_stack, watch_stack_handler) }; static atomic<do_once_state> cilkrts_load_state; bool initialize_cilk_interop() { // Pinning can fail. This is a normal situation, and means that the current // thread does not use cilkrts and consequently does not need interop. return dynamic_link( CILKLIB_NAME, CilkLinkTable, 1 ); } #endif /* __TBB_SURVIVE_THREAD_SWITCH */ namespace rml { tbb_server* make_private_server( tbb_client& client ); } void governor::acquire_resources () { #if USE_PTHREAD int status = theTLS.create(auto_terminate); #else int status = theTLS.create(); #endif if( status ) handle_perror(status, "TBB failed to initialize task scheduler TLS\n"); } void governor::release_resources () { theRMLServerFactory.close(); #if TBB_USE_ASSERT if( __TBB_InitOnce::initialization_done() && theTLS.get() ) runtime_warning( "TBB is unloaded while tbb::task_scheduler_init object is alive?" ); #endif int status = theTLS.destroy(); if( status ) handle_perror(status, "TBB failed to destroy task scheduler TLS"); dynamic_unlink_all(); } rml::tbb_server* governor::create_rml_server ( rml::tbb_client& client ) { rml::tbb_server* server = NULL; if( !UsePrivateRML ) { ::rml::factory::status_type status = theRMLServerFactory.make_server( server, client ); if( status != ::rml::factory::st_success ) { UsePrivateRML = true; runtime_warning( "rml::tbb_factory::make_server failed with status %x, falling back on private rml", status ); } } if ( !server ) { __TBB_ASSERT( UsePrivateRML, NULL ); server = rml::make_private_server( client ); } __TBB_ASSERT( server, "Failed to create RML server" ); return server; } void governor::sign_on(generic_scheduler* s) { __TBB_ASSERT( !theTLS.get(), NULL ); theTLS.set(s); #if __TBB_SURVIVE_THREAD_SWITCH if( watch_stack_handler ) { __cilk_tbb_stack_op_thunk o; o.routine = &stack_op_handler; o.data = s; if( (*watch_stack_handler)(&s->my_cilk_unwatch_thunk, o) ) { // Failed to register with cilkrts, make sure we are clean s->my_cilk_unwatch_thunk.routine = NULL; } #if TBB_USE_ASSERT else s->my_cilk_state = generic_scheduler::cs_running; #endif /* TBB_USE_ASSERT */ } #endif /* __TBB_SURVIVE_THREAD_SWITCH */ } void governor::sign_off(generic_scheduler* s) { suppress_unused_warning(s); __TBB_ASSERT( theTLS.get()==s, "attempt to unregister a wrong scheduler instance" ); theTLS.set(NULL); #if __TBB_SURVIVE_THREAD_SWITCH __cilk_tbb_unwatch_thunk &ut = s->my_cilk_unwatch_thunk; if ( ut.routine ) (*ut.routine)(ut.data); #endif /* __TBB_SURVIVE_THREAD_SWITCH */ } void governor::setBlockingTerminate(const task_scheduler_init *tsi) { __TBB_ASSERT(!IsBlockingTermiantionInProgress, "It's impossible to create task_scheduler_init while blocking termination is in progress."); if (BlockingTSI) throw_exception(eid_blocking_sch_init); BlockingTSI = tsi; } generic_scheduler* 
governor::init_scheduler( unsigned num_threads, stack_size_type stack_size, bool auto_init ) {
    if( !__TBB_InitOnce::initialization_done() )
        DoOneTimeInitializations();
    generic_scheduler* s = theTLS.get();
    if( s ) {
        s->my_ref_count += 1;
        return s;
    }
#if __TBB_SURVIVE_THREAD_SWITCH
    atomic_do_once( &initialize_cilk_interop, cilkrts_load_state );
#endif /* __TBB_SURVIVE_THREAD_SWITCH */
    if( (int)num_threads == task_scheduler_init::automatic )
        num_threads = default_num_threads();
    s = generic_scheduler::create_master(
            market::create_arena( num_threads - 1, stack_size ? stack_size : ThreadStackSize ) );
    __TBB_ASSERT(s, "Somehow a local scheduler creation for a master thread failed");
    s->my_auto_initialized = auto_init;
    return s;
}

void governor::terminate_scheduler( generic_scheduler* s, const task_scheduler_init* tsi_ptr ) {
    __TBB_ASSERT( s == theTLS.get(), "Attempt to terminate non-local scheduler instance" );
    if (--(s->my_ref_count)) {
        if (BlockingTSI && BlockingTSI==tsi_ptr) {
            // can't throw exception, because this is on dtor's call chain
            fprintf(stderr, "Attempt to terminate nested scheduler in blocking mode\n");
            exit(1);
        }
    } else {
#if TBB_USE_ASSERT
        if (BlockingTSI) {
            __TBB_ASSERT( BlockingTSI == tsi_ptr, "For blocking termination the last terminate_scheduler must be the blocking one." );
            IsBlockingTermiantionInProgress = true;
        }
#endif
        s->cleanup_master();
        BlockingTSI = NULL;
#if TBB_USE_ASSERT
        IsBlockingTermiantionInProgress = false;
#endif
    }
}

void governor::auto_terminate(void* arg){
    generic_scheduler* s = static_cast<generic_scheduler*>(arg);
    if( s && s->my_auto_initialized ) {
        if( !--(s->my_ref_count) ) {
            __TBB_ASSERT( !BlockingTSI, "Blocking auto-terminate is not supported." );
            // If the TLS slot is already cleared by OS or underlying concurrency
            // runtime, restore its value.
if ( !theTLS.get() ) theTLS.set(s); else __TBB_ASSERT( s == theTLS.get(), NULL ); s->cleanup_master(); __TBB_ASSERT( !theTLS.get(), "cleanup_master has not cleared its TLS slot" ); } } } void governor::print_version_info () { if ( UsePrivateRML ) PrintExtraVersionInfo( "RML", "private" ); else { PrintExtraVersionInfo( "RML", "shared" ); theRMLServerFactory.call_with_server_info( PrintRMLVersionInfo, (void*)"" ); } #if __TBB_SURVIVE_THREAD_SWITCH if( watch_stack_handler ) PrintExtraVersionInfo( "CILK", CILKLIB_NAME ); #endif /* __TBB_SURVIVE_THREAD_SWITCH */ } void governor::initialize_rml_factory () { ::rml::factory::status_type res = theRMLServerFactory.open(); UsePrivateRML = res != ::rml::factory::st_success; } #if __TBB_SURVIVE_THREAD_SWITCH __cilk_tbb_retcode governor::stack_op_handler( __cilk_tbb_stack_op op, void* data ) { __TBB_ASSERT(data,NULL); generic_scheduler* s = static_cast<generic_scheduler*>(data); #if TBB_USE_ASSERT void* current = theTLS.get(); #if _WIN32||_WIN64 uintptr_t thread_id = GetCurrentThreadId(); #else uintptr_t thread_id = uintptr_t(pthread_self()); #endif #endif /* TBB_USE_ASSERT */ switch( op ) { default: __TBB_ASSERT( 0, "invalid op" ); case CILK_TBB_STACK_ADOPT: { __TBB_ASSERT( !current && s->my_cilk_state==generic_scheduler::cs_limbo || current==s && s->my_cilk_state==generic_scheduler::cs_running, "invalid adoption" ); #if TBB_USE_ASSERT if( current==s ) runtime_warning( "redundant adoption of %p by thread %p\n", s, (void*)thread_id ); s->my_cilk_state = generic_scheduler::cs_running; #endif /* TBB_USE_ASSERT */ theTLS.set(s); break; } case CILK_TBB_STACK_ORPHAN: { __TBB_ASSERT( current==s && s->my_cilk_state==generic_scheduler::cs_running, "invalid orphaning" ); #if TBB_USE_ASSERT s->my_cilk_state = generic_scheduler::cs_limbo; #endif /* TBB_USE_ASSERT */ theTLS.set(NULL); break; } case CILK_TBB_STACK_RELEASE: { __TBB_ASSERT( !current && s->my_cilk_state==generic_scheduler::cs_limbo || current==s && s->my_cilk_state==generic_scheduler::cs_running, "invalid release" ); #if TBB_USE_ASSERT s->my_cilk_state = generic_scheduler::cs_freed; #endif /* TBB_USE_ASSERT */ s->my_cilk_unwatch_thunk.routine = NULL; auto_terminate( s ); } } return 0; } #endif /* __TBB_SURVIVE_THREAD_SWITCH */ } // namespace internal
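// Illustrative sketch (not part of the original sources): the thread/stack ownership
// transitions that the assertions in stack_op_handler() above encode. The enum values mirror
// the cs_* states of generic_scheduler; the helper below is hypothetical and models only the
// transitions, not the TLS or unwatch-thunk bookkeeping. Compiled out.
#if 0
#include <cassert>

enum cilk_interop_state { cis_running, cis_limbo, cis_freed };

static cilk_interop_state next_state( cilk_interop_state s, int op /*0=ADOPT, 1=ORPHAN, 2=RELEASE*/ ) {
    switch ( op ) {
    case 0: // CILK_TBB_STACK_ADOPT: a thread (re)binds the scheduler to its TLS slot
        assert( s == cis_limbo || s == cis_running );
        return cis_running;
    case 1: // CILK_TBB_STACK_ORPHAN: the stack is abandoned and the TLS slot is cleared
        assert( s == cis_running );
        return cis_limbo;
    default: // CILK_TBB_STACK_RELEASE: cilkrts is done; the scheduler may auto-terminate
        assert( s == cis_limbo || s == cis_running );
        return cis_freed;
    }
}
#endif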
namespace internal { #if DO_ITT_NOTIFY //! Table describing the __itt_notify handlers. static const DynamicLinkDescriptor ITT_HandlerTable[] = { DLD( __itt_notify_sync_prepare, ITT_Handler_sync_prepare), DLD( __itt_notify_sync_acquired, ITT_Handler_sync_acquired), DLD( __itt_notify_sync_releasing, ITT_Handler_sync_releasing), DLD( __itt_notify_sync_cancel, ITT_Handler_sync_cancel), # if _WIN32||_WIN64 DLD( __itt_thr_name_setW, ITT_Handler_thr_name_set), DLD( __itt_thread_set_nameW, ITT_Handler_thread_set_name), # else DLD( __itt_thr_name_set, ITT_Handler_thr_name_set), DLD( __itt_thread_set_name, ITT_Handler_thread_set_name), # endif /* _WIN32 || _WIN64 */ #if __TBB_NAMING_API_SUPPORT # if _WIN32||_WIN64 DLD( __itt_sync_createW, ITT_Handler_sync_create), DLD( __itt_sync_renameW, ITT_Handler_sync_rename) # else /* !WIN */ DLD( __itt_sync_create, ITT_Handler_sync_create), DLD( __itt_sync_rename, ITT_Handler_sync_rename) # endif /* !WIN */ #endif /* __TBB_NAMING_API_SUPPORT */ }; static const int ITT_HandlerTable_size = sizeof(ITT_HandlerTable)/sizeof(DynamicLinkDescriptor); // LIBITTNOTIFY_NAME is the name of the ITT notification library # if _WIN32||_WIN64 # define LIBITTNOTIFY_NAME "libittnotify.dll" # elif __linux__ # define LIBITTNOTIFY_NAME "libittnotify.so" # else # error Intel(R) Threading Tools not provided for this OS # endif //! Performs tools support initialization. /** Is called by DoOneTimeInitializations and ITT_DoOneTimeInitialization in a protected (one-time) manner. Not to be invoked directly. **/ bool InitializeITT() { bool result = false; // Check if we are running under a performance or correctness tool bool t_checker = GetBoolEnvironmentVariable("KMP_FOR_TCHECK"); bool t_profiler = GetBoolEnvironmentVariable("KMP_FOR_TPROFILE"); __TBB_ASSERT(!(t_checker&&t_profiler), NULL); if ( t_checker || t_profiler ) { // Yes, we are in the tool mode. Try to load libittnotify library. result = FillDynamicLinks( LIBITTNOTIFY_NAME, ITT_HandlerTable, ITT_HandlerTable_size, 4 ); } if (result){ if ( t_checker ) { current_tool = ITC; } else if ( t_profiler ) { current_tool = ITP; } } else { // Clear away the proxy (dummy) handlers for (int i = 0; i < ITT_HandlerTable_size; i++) *ITT_HandlerTable[i].handler = NULL; current_tool = NONE; } PrintExtraVersionInfo( "ITT", result?"yes":"no" ); return result; } #if !__TBB_NAMING_API_SUPPORT #define ITT_DoOneTimeInitialization DoOneTimeInitializations #endif //! Performs one-time initialization of tools interoperability mechanisms. /** Defined in task.cpp. Makes a protected do-once call to InitializeITT(). **/ void ITT_DoOneTimeInitialization(); /** The following dummy_xxx functions are proxies that correspond to tool notification APIs and are used to initialize corresponding pointers to the tool notifications (ITT_Handler_xxx). When the first call to ITT_Handler_xxx takes place before the whole library initialization (done by DoOneTimeInitializations) happened, the proxy handler performs initialization of the tools support. After this ITT_Handler_xxx will be set to either tool notification pointer or NULL. 
**/ void dummy_sync_prepare( volatile void* ptr ) { ITT_DoOneTimeInitialization(); __TBB_ASSERT( ITT_Handler_sync_prepare!=&dummy_sync_prepare, NULL ); if (ITT_Handler_sync_prepare) (*ITT_Handler_sync_prepare) (ptr); } void dummy_sync_acquired( volatile void* ptr ) { ITT_DoOneTimeInitialization(); __TBB_ASSERT( ITT_Handler_sync_acquired!=&dummy_sync_acquired, NULL ); if (ITT_Handler_sync_acquired) (*ITT_Handler_sync_acquired) (ptr); } void dummy_sync_releasing( volatile void* ptr ) { ITT_DoOneTimeInitialization(); __TBB_ASSERT( ITT_Handler_sync_releasing!=&dummy_sync_releasing, NULL ); if (ITT_Handler_sync_releasing) (*ITT_Handler_sync_releasing) (ptr); } void dummy_sync_cancel( volatile void* ptr ) { ITT_DoOneTimeInitialization(); __TBB_ASSERT( ITT_Handler_sync_cancel!=&dummy_sync_cancel, NULL ); if (ITT_Handler_sync_cancel) (*ITT_Handler_sync_cancel) (ptr); } int dummy_thr_name_set( const tchar* str, int number ) { ITT_DoOneTimeInitialization(); __TBB_ASSERT( ITT_Handler_thr_name_set!=&dummy_thr_name_set, NULL ); if (ITT_Handler_thr_name_set) return (*ITT_Handler_thr_name_set) (str, number); return -1; } void dummy_thread_set_name( const tchar* name ) { ITT_DoOneTimeInitialization(); __TBB_ASSERT( ITT_Handler_thread_set_name!=&dummy_thread_set_name, NULL ); if (ITT_Handler_thread_set_name) (*ITT_Handler_thread_set_name)( name ); } #if __TBB_NAMING_API_SUPPORT void dummy_sync_create( void* obj, const tchar* objname, const tchar* objtype, int /*attribute*/ ) { ITT_DoOneTimeInitialization(); __TBB_ASSERT( ITT_Handler_sync_create!=&dummy_sync_create, NULL ); ITT_SYNC_CREATE( obj, objtype, objname ); } void dummy_sync_rename( void* obj, const tchar* new_name ) { ITT_DoOneTimeInitialization(); __TBB_ASSERT( ITT_Handler_sync_rename!=&dummy_sync_rename, NULL ); ITT_SYNC_RENAME(obj, new_name); } void itt_set_sync_name_v3( void *obj, const tchar* name) { __TBB_ASSERT( ITT_Handler_sync_rename!=&dummy_sync_rename, NULL ); ITT_SYNC_RENAME(obj, name); } #endif /* __TBB_NAMING_API_SUPPORT */ //! Leading padding before the area where tool notification handlers are placed. /** Prevents cache lines where the handler pointers are stored from thrashing. Defined as extern to prevent compiler from placing the padding arrays separately from the handler pointers (which are declared as extern). Declared separately from definition to get rid of compiler warnings. **/ extern char __ITT_Handler_leading_padding[NFS_MaxLineSize]; //! Trailing padding after the area where tool notification handlers are placed. 
extern char __ITT_Handler_trailing_padding[NFS_MaxLineSize]; char __ITT_Handler_leading_padding[NFS_MaxLineSize] = {0}; PointerToITT_Handler ITT_Handler_sync_prepare = &dummy_sync_prepare; PointerToITT_Handler ITT_Handler_sync_acquired = &dummy_sync_acquired; PointerToITT_Handler ITT_Handler_sync_releasing = &dummy_sync_releasing; PointerToITT_Handler ITT_Handler_sync_cancel = &dummy_sync_cancel; PointerToITT_thr_name_set ITT_Handler_thr_name_set = &dummy_thr_name_set; PointerToITT_thread_set_name ITT_Handler_thread_set_name = &dummy_thread_set_name; #if __TBB_NAMING_API_SUPPORT PointerToITT_sync_create ITT_Handler_sync_create = &dummy_sync_create; PointerToITT_sync_rename ITT_Handler_sync_rename = &dummy_sync_rename; #endif /* __TBB_NAMING_API_SUPPORT */ char __ITT_Handler_trailing_padding[NFS_MaxLineSize] = {0}; target_tool current_tool = TO_BE_INITIALIZED; #endif /* DO_ITT_NOTIFY */ void itt_store_pointer_with_release_v3( void* dst, void* src ) { ITT_NOTIFY(sync_releasing, dst); __TBB_store_with_release(*static_cast<void**>(dst),src); } void* itt_load_pointer_with_acquire_v3( const void* src ) { void* result = __TBB_load_with_acquire(*static_cast<void*const*>(src)); ITT_NOTIFY(sync_acquired, const_cast<void*>(src)); return result; } void* itt_load_pointer_v3( const void* src ) { void* result = *static_cast<void*const*>(src); return result; } } // namespace internal
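// Illustrative sketch (not part of the original sources): the self-replacing proxy pattern
// used by the dummy_xxx handlers above -- a handler pointer starts at a proxy that performs
// one-time initialization and then either forwards to the real handler or becomes NULL, so
// the extra indirection is paid only on the first call. Names are hypothetical; the real code
// does the swap under a protected do-once and links the targets from libittnotify. Compiled out.
#if 0
#include <cstdio>

typedef void (*notify_fn_t)( volatile void* );

static void real_notify( volatile void* p ) { std::printf( "notified %p\n", (void*)p ); }
static void proxy_notify( volatile void* p );

// The handler pointer initially targets the proxy.
static notify_fn_t notify_handler = &proxy_notify;

static void one_time_init() {
    bool tool_present = true;                       // stand-in for the dynamic-link probe
    notify_handler = tool_present ? &real_notify : (notify_fn_t)NULL;
}

static void proxy_notify( volatile void* p ) {
    one_time_init();                                // guarded by a do-once in the real code
    if ( notify_handler )
        (*notify_handler)( p );
}
#endif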