// Builds an empty tbb::concurrent_queue<T> and passes it, together with the
// thread count, to TestNegativeQueueBody<T>, which is run through
// NativeParallelFor with nthread as its first argument.
// NOTE(review): T is not declared in the visible lines (a template header is
// presumably missing) and the closing brace is not visible in this copy.
void TestNegativeQueue( int nthread ) {
    tbb::concurrent_queue<T> queue;
    NativeParallelFor( nthread, TestNegativeQueueBody<T>(queue,nthread) );
Example #2
// Stores the requested injection mode in the global g_sim, then runs FibBody
// through NativeParallelFor with P_outer as its first argument.
// NOTE(review): the closing brace is not visible in this copy.
void RunCilkOnlyConcurrently ( tbb_sched_injection_mode_t sim ) {
    g_sim = sim;
    NativeParallelFor( P_outer, FibBody() );
// Push/pop stress test for tbb::concurrent_queue<Foo>.
// Each trial sets the queue capacity, runs Body through NativeParallelFor while
// timing it, then compares the accumulated Sum[] against a closed-form expected
// value and checks that FooConstructed equals FooDestroyed. Trials repeat while
// `success` is false.
//   prefill  - number of iterations of the prefill loop and of the drain loop
//   capacity - value passed to queue.set_capacity()
//   nthread  - thread count; asserted to be positive
// NOTE(review): several lines appear to be missing from this copy (closing
// braces, the statement guarded by the capacity check, the push/pop calls and a
// preprocessor #else) -- compare with the upstream test before relying on it.
void TestPushPop( int prefill, ptrdiff_t capacity, int nthread ) {
    ASSERT( nthread>0, "nthread must be positive" );
    // NOTE(review): the statement guarded by this check is not visible here; as
    // written the condition would govern the declaration on the next line.
    if( prefill+1>=capacity )
    bool success = false;
    // Zero the three PopKind counters before the first trial.
    for( int k=0; k<3; ++k )
        PopKind[k] = 0;
    // Keep running trials until one leaves `success` set to true.
    for( int trial=0; !success; ++trial ) {
        FooConstructed = 0;
        FooDestroyed = 0;
        Body body(nthread);
        tbb::concurrent_queue<Foo> queue;
        queue.set_capacity( capacity );
        body.queue = &queue;
        // Prefill phase: items carry thread_id==nthread and serials 1..prefill.
        // NOTE(review): no push call is visible in this loop -- confirm upstream.
        for( int i=0; i<prefill; ++i ) {
            Foo f;
            f.thread_id = nthread;
            f.serial = 1+i;
            ASSERT( queue.size()==i+1, NULL );
            ASSERT( !queue.empty(), NULL );
        // Time the parallel phase.
        tbb::tick_count t0 = tbb::tick_count::now();
        NativeParallelFor( nthread, body );
        tbb::tick_count t1 = tbb::tick_count::now();
        double timing = (t1-t0).seconds();
        // The per-operation figure divides by 2*M*nthread operations.
        if( Verbose )
            printf("prefill=%d capacity=%d time = %g = %g nsec/operation\n", prefill, int(capacity), timing, timing/(2*M*nthread)*1.E9);
        // Add up the per-thread partial sums.
        int sum = 0;
        for( int k=0; k<nthread; ++k )
            sum += Sum[k];
        // (M-1)*M/2 is 0+1+...+(M-1), counted once per thread; the second term is
        // the same series over the prefilled items (serial-1 runs 0..prefill-1).
        int expected = nthread*((M-1)*M/2) + ((prefill-1)*prefill)/2;
        // Drain check for the prefilled items, i counting down from prefill-1 to 0.
        // NOTE(review): no pop call is visible in this loop -- confirm upstream.
        for( int i=prefill; --i>=0; ) {
            ASSERT( !queue.empty(), NULL );
            Foo f;
            ASSERT( queue.size()==i, NULL );
            sum += f.serial-1;
        ASSERT( queue.empty(), NULL );
        ASSERT( queue.size()==0, NULL );
        if( sum!=expected )
            printf("sum=%d expected=%d\n",sum,expected);
        // Every Foo constructed during the trial must also have been destroyed.
        ASSERT( FooConstructed==FooDestroyed, NULL );

        // Assume the trial is acceptable unless the check below says otherwise.
        success = true;
        if( nthread>1 && prefill==0 ) {
            // Check that pop_if_present got sufficient exercise
            // PopKind[0] is reported as "failed" and PopKind[1] as "succeeded" below.
            for( int k=0; k<2; ++k ) {
#if (_WIN32||_WIN64)
                // The TBB library on Windows seems to have a tough time generating
                // the desired interleavings for pop_if_present, so the code tries longer, and settles
                // for fewer desired interleavings.
                const int max_trial = 100;
                const int min_requirement = 20;
                // NOTE(review): the two declarations below repeat the names above inside
                // the same preprocessor branch; an #else line looks to be missing here.
                const int min_requirement = 100;
                const int max_trial = 20;
#endif /* _WIN32||_WIN64 */
                if( PopKind[k]<min_requirement ) {
                    // Out of retries: report the shortfall instead of looping again.
                    if( trial>=max_trial ) {
                        // NOTE(review): as written only the first printf is governed by
                        // Verbose; an `else` before the second one may be missing.
                        if( Verbose )
                            printf("Warning: %d threads had only %ld pop_if_present operations %s after %d trials (expected at least %d). "
                                    "This problem may merely be unlucky scheduling. "
                                    "Investigate only if it happens repeatedly.\n",
                                    nthread, long(PopKind[k]), k==0?"failed":"succeeded", max_trial, min_requirement);
                            printf("Warning: the number of %s pop_if_present operations is less than expected for %d threads. Investigate if it happens repeatedly.\n",
                                   k==0?"failed":"succeeded", nthread );
                    } else {
                        // Not enough exercise yet and retries remain: run another trial.
                        success = false;
Example #4
// Memory-pool test that runs several scenarios in sequence:
//   - creates pools (pool1, pool2, fixedPool, mallocPool) with pool_create_v1;
//   - repeatedly allocates and frees large objects from mallocPool;
//   - allocates small and large objects from fixedPool and frees some of them;
//   - runs StressLOCacheWork on mallocPool for thread counts MaxThread..MinThread;
//   - asserts that allocatedBackRefCount() is unchanged across the above;
//   - checks LargeObjectCache getUsedSize()/getLOCSize() accounting;
//   - feeds memory blocks into a LocalLOCImpl<2, 20> instance.
// NOTE(review): closing braces, a block-comment terminator and possibly pool
// teardown calls are not visible in this copy -- compare with upstream.
void TestPools() {
    // Policy initially built from the getMem/putMem callbacks.
    rml::MemPoolPolicy pol(getMem, putMem);
    size_t beforeNumBackRef, afterNumBackRef;

    rml::MemoryPool *pool1;
    rml::MemoryPool *pool2;
    pool_create_v1(0, &pol, &pool1);
    pool_create_v1(0, &pol, &pool2);

    // Baseline for the backreference leak check made further down.
    scalable_allocation_command(TBBMALLOC_CLEAN_ALL_BUFFERS, NULL);
    beforeNumBackRef = allocatedBackRefCount();
    rml::MemoryPool *fixedPool;

    pool_create_v1(0, &pol, &fixedPool);
    // Re-point the same policy object at the malloc-based callbacks, with
    // granularity 8, before creating mallocPool.
    pol.pAlloc = getMallocMem;
    pol.pFree = putMallocMem;
    pol.granularity = 8;
    rml::MemoryPool *mallocPool;

    pool_create_v1(0, &pol, &mallocPool);
/* check that large object cache (LOC) returns correct size for cached objects
   passBackendSz Byte objects are cached in LOC, but bypassed the backend, so
   memory is requested directly from the allocation callback.
   nextPassBackendSz Byte objects must fit to another LOC bin,
   so that their allocation/releasing leads to cache cleanup.
   All this is expected to lead to releasing of the passBackendSz Byte object
   from LOC during LOC cleanup, and putMallocMem checks that the returned size
   is correct.
   NOTE(review): the terminator of this block comment is not visible in this
   copy; as written the comment would run on into the code below.
    const size_t passBackendSz = Backend::maxBinned_HugePage+1,
        anotherLOCBinSz = minLargeObjectSize+1;
    for (int i=0; i<10; i++) { // run long enough to be cached
        void *p = pool_malloc(mallocPool, passBackendSz);
        ASSERT(p, "Memory was not allocated");
        pool_free(mallocPool, p);
    // run long enough that the passBackendSz allocation is cleaned from the cache
    // and returned back to putMallocMem for size checking
    for (int i=0; i<1000; i++) {
        void *p = pool_malloc(mallocPool, anotherLOCBinSz);
        ASSERT(p, "Memory was not allocated");
        pool_free(mallocPool, p);

    // Allocate and touch a mix of small and large objects from fixedPool.
    void *smallObj =  pool_malloc(fixedPool, 10);
    ASSERT(smallObj, "Memory was not allocated");
    memset(smallObj, 1, 10);
    void *ptr = pool_malloc(fixedPool, 1024);
    ASSERT(ptr, "Memory was not allocated");
    memset(ptr, 1, 1024);
    void *largeObj = pool_malloc(fixedPool, minLargeObjectSize);
    ASSERT(largeObj, "Memory was not allocated");
    memset(largeObj, 1, minLargeObjectSize);
    ptr = pool_malloc(fixedPool, minLargeObjectSize);
    ASSERT(ptr, "Memory was not allocated");
    // NOTE(review): the fill value here is minLargeObjectSize rather than 1 as in
    // the calls above; memset converts it to unsigned char -- confirm intent.
    memset(ptr, minLargeObjectSize, minLargeObjectSize);
    pool_malloc(fixedPool, 10*minLargeObjectSize); // no leak for unsuccessful allocations
    pool_free(fixedPool, smallObj);
    pool_free(fixedPool, largeObj);

    // provoke large object cache cleanup and hope no leaks occur
    for( int p=MaxThread; p>=MinThread; --p )
        NativeParallelFor( p, StressLOCacheWork(mallocPool) );

    // Compare against the baseline taken before fixedPool was created.
    scalable_allocation_command(TBBMALLOC_CLEAN_ALL_BUFFERS, NULL);
    afterNumBackRef = allocatedBackRefCount();
    ASSERT(beforeNumBackRef==afterNumBackRef, "backreference leak detected");

        // test usedSize/cachedSize and LOC bitmask correctness
        void *p[5];
        pool_create_v1(0, &pol, &mallocPool);
        const LargeObjectCache *loc = &((rml::internal::MemoryPool*)mallocPool)->extMemPool.loc;
        // p[3] stays allocated while the loop below allocates and frees p[0]/p[1].
        p[3] = pool_malloc(mallocPool, minLargeObjectSize+2*LargeObjectCache::largeBlockCacheStep);
        for (int i=0; i<10; i++) {
            p[0] = pool_malloc(mallocPool, minLargeObjectSize);
            p[1] = pool_malloc(mallocPool, minLargeObjectSize+LargeObjectCache::largeBlockCacheStep);
            pool_free(mallocPool, p[0]);
            pool_free(mallocPool, p[1]);
        ASSERT(loc->getUsedSize(), NULL);
        pool_free(mallocPool, p[3]);
        ASSERT(loc->getLOCSize() < 3*(minLargeObjectSize+LargeObjectCache::largeBlockCacheStep), NULL);
        const size_t maxLocalLOCSize = LocalLOCImpl<3,30>::getMaxSize();
        ASSERT(loc->getUsedSize() <= maxLocalLOCSize, NULL);
        // Allocate three objects whose sizes grow by one cache step each.
        for (int i=0; i<3; i++)
            p[i] = pool_malloc(mallocPool, minLargeObjectSize+i*LargeObjectCache::largeBlockCacheStep);
        size_t currUser = loc->getUsedSize();
        ASSERT(!loc->getLOCSize() && currUser >= 3*(minLargeObjectSize+LargeObjectCache::largeBlockCacheStep), NULL);
        // One more, larger allocation must raise the used size by at least its own size.
        p[4] = pool_malloc(mallocPool, minLargeObjectSize+3*LargeObjectCache::largeBlockCacheStep);
        ASSERT(loc->getUsedSize() - currUser >= minLargeObjectSize+3*LargeObjectCache::largeBlockCacheStep, NULL);
        pool_free(mallocPool, p[4]);
        ASSERT(loc->getUsedSize() <= currUser+maxLocalLOCSize, NULL);
        // NOTE(review): p[0..2] are not visibly released before this assertion;
        // the lines doing so are presumably among those missing from this copy.
        ASSERT(!loc->getLOCSize() && !loc->getUsedSize(), NULL);
    // To test LOC we need bigger lists than released by current LocalLOC
    //   in production code. Create special LocalLOC.
        LocalLOCImpl<2, 20> lLOC;
        pool_create_v1(0, &pol, &mallocPool);
        rml::internal::ExtMemoryPool *mPool = &((rml::internal::MemoryPool*)mallocPool)->extMemPool;
        const LargeObjectCache *loc = &((rml::internal::MemoryPool*)mallocPool)->extMemPool.loc;
        // For each of 22 sizes, allocate two objects and hand the memory block
        // referenced by each object's LargeObjectHdr to lLOC.put().
        for (int i=0; i<22; i++) {
            void *o = pool_malloc(mallocPool, minLargeObjectSize+i*LargeObjectCache::largeBlockCacheStep);
            bool ret = lLOC.put(((LargeObjectHdr*)o - 1)->memoryBlock, mPool);
            ASSERT(ret, NULL);

            o = pool_malloc(mallocPool, minLargeObjectSize+i*LargeObjectCache::largeBlockCacheStep);
            ret = lLOC.put(((LargeObjectHdr*)o - 1)->memoryBlock, mPool);
            ASSERT(ret, NULL);
        ASSERT(!loc->getUsedSize(), NULL);

// test releasing memory from pthread key destructor
// Runs TestThread(1) through NativeParallelFor four times in sequence, each
// time passing 1 as the first argument.
// NOTE(review): the closing brace is not visible in this copy.
void TestKeyDtor() {
    for (int i=0; i<4; i++)
        NativeParallelFor( 1, TestThread(1) );
Example #6
// Regression test for task_group_context (see the REMARK text): constructs a
// TestTGContext body and runs it through NativeParallelFor with 1 as the first
// argument.
// NOTE(review): the closing brace is not visible in this copy.
void TestTGContextOnNewThread() {
    REMARK("Testing a regression for a bug with task_group_context\n");
    TestTGContext body;
    NativeParallelFor(1, body);
Example #7
// Creates a flow graph and a SpinBarrier sized for nThreads, wraps both in an
// AddRemoveBody, and runs that body through NativeParallelFor with nThreads as
// the first argument.
// NOTE(review): the closing brace is not visible in this copy.
void test_parallel(int nThreads) {
    tbb::flow::graph g;
    Harness::SpinBarrier barrier(nThreads);
    AddRemoveBody body(nThreads, barrier, g);
    NativeParallelFor(nThreads, body);
Example #8
// Runs MasterBody<NodeType>(idx, opts) through NativeParallelFor with 2 as the
// first argument, after asserting idx < NumTests and logging the configuration.
// The REMARK call pre-increments the global g_CurConfig as a side effect.
// NOTE(review): NodeType is not declared in the visible lines (a template
// header is presumably missing) and the closing brace is not visible.
void RunPrioritySwitchBetweenTwoMasters ( int idx, uintptr_t opts ) {
    ASSERT( idx < NumTests, NULL );
    REMARK( "Config %d: idx=%i, opts=%u\r", ++g_CurConfig, idx, (unsigned)opts );
    NativeParallelFor ( 2, MasterBody<NodeType>(idx, opts) );
Example #9
// Logs which priority levels Low and High stand for, then runs
// PeriodicActivitiesBody through NativeParallelFor with 2 as the first argument.
// NOTE(review): Low and High are not declared in the visible lines, and the
// closing brace is not visible in this copy.
void TestPeriodicConcurrentActivities () {
    REMARK( "TestPeriodicConcurrentActivities: %s / %s \n", Low == tbb::priority_low ? "Low" : "Normal", High == tbb::priority_normal ? "Normal" : "High" );
    NativeParallelFor ( 2, PeriodicActivitiesBody() );
// Exercises tbb::flow::multifunction_node at every concurrency level from 1 to
// `concurrency`, using copies of a node built from `body`.
// For each level it resets the harness executor counters, builds the node,
// copies it into a two-element vector, and for each copy runs every combination
// of 1..MAX_NODES receivers and 1..MAX_NODES senders, asserting that each sender
// was asked for exactly N items and is attached to the node under test.
// NOTE(review): InputType, OutputTuple and Body are not declared in the visible
// lines (a template header is presumably missing); closing braces and the
// receiver validation calls also appear to have been dropped from this copy.
void buffered_levels( size_t concurrency, Body body ) {
    typedef typename tbb::flow::tuple_element<0,OutputTuple>::type OutputType;
    // Do for lc = 1 to concurrency level
    for ( size_t lc = 1; lc <= concurrency; ++lc ) { 
        tbb::flow::graph g;

        // Set the execute_counter back to zero in the harness
        harness_graph_multifunction_executor<InputType, OutputTuple,tbb::spin_mutex>::execute_count = 0;
        // Set the max allowed executors to lc.  There is a check in the functor to make sure this is never exceeded.
        harness_graph_multifunction_executor<InputType, OutputTuple,tbb::spin_mutex>::max_executors = lc;

        // Create the function_node with the appropriate concurrency level, and use default buffering
        tbb::flow::multifunction_node< InputType, OutputTuple > exe_node( g, lc, body );
        //Create a vector of identical exe_nodes
        std::vector< tbb::flow::multifunction_node< InputType, OutputTuple > > exe_vec(2, exe_node);

        // exercise each of the copied nodes
        for (size_t node_idx=0; node_idx<exe_vec.size(); ++node_idx) {
            for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) {
                // Create num_receivers counting receivers and connect the exe_vec[node_idx] to them.
                harness_mapped_receiver<OutputType> *receivers = new harness_mapped_receiver<OutputType>[num_receivers];
                for (size_t r = 0; r < num_receivers; ++r ) {
                    tbb::flow::make_edge( tbb::flow::output_port<0>(exe_vec[node_idx]), receivers[r] );

                // Do the test with varying numbers of senders
                harness_counting_sender<InputType> *senders = NULL;
                for (size_t num_senders = 1; num_senders <= MAX_NODES; ++num_senders ) {
                    // Create num_senders senders, set their message limit each to N, and connect them to the exe_vec[node_idx]
                    senders = new harness_counting_sender<InputType>[num_senders];
                    for (size_t s = 0; s < num_senders; ++s ) {
                        senders[s].my_limit = N;
                        tbb::flow::make_edge( senders[s], exe_vec[node_idx] );

                    // Initialize the receivers so they know how many senders and messages to check for
                    for (size_t r = 0; r < num_receivers; ++r ) {
                         receivers[r].initialize_map( N, num_senders ); 

                    // Do the test
                    NativeParallelFor( (int)num_senders, parallel_put_until_limit<InputType>(senders) );

                    // confirm that each sender was requested from N times
                    for (size_t s = 0; s < num_senders; ++s ) {
                        size_t n = senders[s].my_received;
                        ASSERT( n == N, NULL ); 
                        ASSERT( senders[s].my_receiver == &exe_vec[node_idx], NULL );
                    // validate the receivers
                    // NOTE(review): the body of this loop is not visible in this copy.
                    for (size_t r = 0; r < num_receivers; ++r ) {
                    // The sender array is allocated afresh for each num_senders value.
                    delete [] senders;
                // Detach the receivers again before the final put on this node.
                for (size_t r = 0; r < num_receivers; ++r ) {
                    tbb::flow::remove_edge( tbb::flow::output_port<0>(exe_vec[node_idx]), receivers[r] );
                // The node must still accept an item with no receivers attached.
                ASSERT( exe_vec[node_idx].try_put( InputType() ) == true, NULL );
                for (size_t r = 0; r < num_receivers; ++r ) {
                    // since it's detached, nothing should have changed
                    // NOTE(review): the check this comment describes is not visible here.
                delete [] receivers;
// Test driver. Sets MinThread and MaxThread to 1, scans argv for the "-s"
// switch, calls ParseCommandLine, sets __tbb_test_errno where the checks below
// allow it, and then runs RoundRobin through NativeParallelFor for each thread
// count from MaxThread down to MinThread with a freshly allocated SpinBarrier.
// Returns 0.
// NOTE(review): this copy is missing lines (closing braces, the statements
// guarded by several `if`s, a block-comment terminator, matching #endif lines),
// and argC/argV are not declared in the visible code -- compare with upstream.
int main(int argc, char* argv[]) {
    MaxThread = MinThread = 1;

    // check if we were called to test standard behavior
    for (int i=1; i< argc; i++) {
        // NOTE(review): the statement guarded by this "-s" check is not visible.
        if (strcmp((char*)*(argv+i),"-s")==0)

    ParseCommandLine( argC, argV );
#if __linux__
    /* According to man pthreads 
       "NPTL threads do not share resource limits (fixed in kernel 2.6.10)".
       Use per-threads limits for affected systems.
       NOTE(review): the terminator of this block comment is not visible in this
       copy; as written the comment extends to the end of the Mac OS X note below.
    if ( LinuxKernelVersion() < 2*1000000 + 6*1000 + 10)
        perProcessLimits = false;
#if __APPLE__
    /* Skip due to lack of memory limit enforcing under Mac OS X. */
//for linux and dynamic runtime errno is used to check allocator functions
//check if library compiled with /MD(d) and we can use errno
#if _MSC_VER 
#if defined(_MT) && defined(_DLL) //check errno if test itself compiled with /MD(d) only
    #pragma comment(lib, "version.lib")
    char*  version_info_block = NULL;
    int version_info_block_size; 
    LPVOID comments_block = NULL;
    UINT comments_block_size;
// Pick the allocator DLL name that matches the build flavour.
#ifdef _DEBUG
#define __TBBMALLOCDLL "tbbmalloc_debug.dll"
#else  //_DEBUG
#define __TBBMALLOCDLL "tbbmalloc.dll"
#endif //_DEBUG
    // Read the DLL's version resource and look for "/MD" in its Comments string.
    version_info_block_size = GetFileVersionInfoSize( __TBBMALLOCDLL, (LPDWORD)&version_info_block_size );
    // NOTE(review): the closing parenthesis of this condition is not visible in
    // this copy.
    if( version_info_block_size 
        && ((version_info_block = (char*)malloc(version_info_block_size)) != NULL)
        && GetFileVersionInfo(  __TBBMALLOCDLL, NULL, version_info_block_size, version_info_block )
        && VerQueryValue( version_info_block, "\\StringFileInfo\\000004b0\\Comments", &comments_block, &comments_block_size )
        && strstr( (char*)comments_block, "/MD" )
            __tbb_test_errno = true;
     // Release the version buffer if it was allocated.
     if( version_info_block ) free( version_info_block );
#endif // defined(_MT) && defined(_DLL)
#else  // _MSC_VER
    // Builds without _MSC_VER always enable the errno checks.
    __tbb_test_errno = true;
#endif // _MSC_VER

    // Run the test for each thread count, highest first.
    for( int p=MaxThread; p>=MinThread; --p ) {
        REMARK("testing with %d threads\n", p );
        Harness::SpinBarrier *barrier = new Harness::SpinBarrier(p);
        NativeParallelFor( p, RoundRobin(p, barrier, Verbose) );
        delete barrier;
    // NOTE(review): the statement guarded by this check is not visible; as
    // written it would govern the return below.
    if( !error_occurred ) 
    return 0;
// Variant of the buffered multifunction_node test that uses an inc_functor body
// and afterwards checks the execution counts via tbb::flow::copy_body.
// For each concurrency level 1..concurrency it seeds both the functor's
// local_execute_count and global_execute_count with Offset, runs every
// combination of 1..MAX_NODES receivers and senders, and finally asserts that
// the global count and the copied body's local count both equal expected_count.
// NOTE(review): InputType and OutputTuple are not declared in the visible lines
// (a template header is presumably missing); closing braces and the receiver
// validation calls also appear to have been dropped from this copy.
void buffered_levels_with_copy( size_t concurrency ) {
    typedef typename tbb::flow::tuple_element<0,OutputTuple>::type OutputType;
    // Do for lc = 1 to concurrency level
    for ( size_t lc = 1; lc <= concurrency; ++lc ) { 
        tbb::flow::graph g;

        // Start both counters from the same Offset so they can be compared later.
        inc_functor cf;
        cf.local_execute_count = Offset;
        global_execute_count = Offset;
        tbb::flow::multifunction_node< InputType, OutputTuple > exe_node( g, lc, cf );

        for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) {
           // Attach num_receivers receivers to output port 0 of the node.
           harness_mapped_receiver<OutputType> *receivers = new harness_mapped_receiver<OutputType>[num_receivers];
           for (size_t r = 0; r < num_receivers; ++r ) {
               tbb::flow::make_edge( tbb::flow::output_port<0>(exe_node), receivers[r] );

            harness_counting_sender<InputType> *senders = NULL;
            for (size_t num_senders = 1; num_senders <= MAX_NODES; ++num_senders ) {
                // Create num_senders senders, each limited to N, feeding the node.
                senders = new harness_counting_sender<InputType>[num_senders];
                for (size_t s = 0; s < num_senders; ++s ) {
                    senders[s].my_limit = N;
                    tbb::flow::make_edge( senders[s], exe_node );

                // Tell each receiver how many messages and senders to expect.
                for (size_t r = 0; r < num_receivers; ++r ) {
                    receivers[r].initialize_map( N, num_senders ); 

                NativeParallelFor( (int)num_senders, parallel_put_until_limit<InputType>(senders) );

                // Each sender must have been asked for exactly N items by this node.
                for (size_t s = 0; s < num_senders; ++s ) {
                    size_t n = senders[s].my_received;
                    ASSERT( n == N, NULL ); 
                    ASSERT( senders[s].my_receiver == &exe_node, NULL );
                // NOTE(review): the body of this loop is not visible in this copy.
                for (size_t r = 0; r < num_receivers; ++r ) {
                delete [] senders;
            // Detach the receivers, then confirm the node still accepts an item.
            for (size_t r = 0; r < num_receivers; ++r ) {
                tbb::flow::remove_edge( tbb::flow::output_port<0>(exe_node), receivers[r] );
            ASSERT( exe_node.try_put( InputType() ) == true, NULL );
            // NOTE(review): the body of this loop is not visible in this copy.
            for (size_t r = 0; r < num_receivers; ++r ) {
            delete [] receivers;

        // validate that the local body matches the global execute_count and both are correct
        inc_functor body_copy = tbb::flow::copy_body<inc_functor>( exe_node );
        // Presumably: for each of the MAX_NODES receiver counts, N items per sender
        // summed over 1..MAX_NODES senders (N*MAX_NODES*(MAX_NODES+1)/2), plus the
        // one extra try_put per receiver count, plus the initial Offset -- verify.
        const size_t expected_count = N/2 * MAX_NODES * MAX_NODES * ( MAX_NODES + 1 ) + MAX_NODES + Offset; 
        size_t global_count = global_execute_count;
        size_t inc_count = body_copy.local_execute_count;
        ASSERT( global_count == expected_count && global_count == inc_count, NULL ); 
// Parallel test of tbb::queue_node<T>. Phases, in order:
//   1. parallel_puts into q, then a serial drain of num_threads*N items checked
//      with check_item; every next_value[tid] must end at T(N);
//   2. parallel_puts into q followed by parallel_gets from q;
//   3. parallel_put_get on q;
//   4. q -> q2 -> q3 chained with register_successor, parallel_puts into q and
//      parallel_gets from q3.
// After each phase a try_get on the queue(s) must fail and leave j untouched.
// Always returns 0.
// NOTE(review): T is not declared in the visible lines (a template header is
// presumably missing); closing braces and scope-opening braces appear to have
// been dropped from this copy, and next_value is not visibly released.
int test_parallel(int num_threads) {
    tbb::graph g;
    tbb::queue_node<T> q(g);
    tbb::queue_node<T> q2(g);
    tbb::queue_node<T> q3(g);
    // Sentinel used to detect whether a failed try_get modified its argument.
    T bogus_value(-1);
    T j = bogus_value;

    NativeParallelFor( num_threads, parallel_puts<T>(q) );

    // One expected-next-value slot per thread, all starting at T(0).
    T *next_value = new T[num_threads];
    for (int tid = 0; tid < num_threads; ++tid) next_value[tid] = T(0);

    // Drain everything serially, validating each item as it comes out.
    for (int i = 0; i < num_threads * N; ++i ) {
        spin_try_get( q, j );
        check_item( next_value, j );
        j = bogus_value;
    for (int tid = 0; tid < num_threads; ++tid)  {
        ASSERT( next_value[tid] == T(N), NULL );

    // The queue must now be empty, and a failed get must not touch j.
    j = bogus_value;
    ASSERT( q.try_get( j ) == false, NULL );
    ASSERT( j == bogus_value, NULL );

    NativeParallelFor( num_threads, parallel_puts<T>(q) );

        touches< T > t( num_threads );
        NativeParallelFor( num_threads, parallel_gets<T>(q, t) );
        ASSERT( t.validate_touches(), NULL );
    j = bogus_value;
    ASSERT( q.try_get( j ) == false, NULL );
    ASSERT( j == bogus_value, NULL );

        touches< T > t2( num_threads );
        NativeParallelFor( num_threads, parallel_put_get<T>(q, t2) );
        ASSERT( t2.validate_touches(), NULL );
    j = bogus_value;
    ASSERT( q.try_get( j ) == false, NULL );
    ASSERT( j == bogus_value, NULL );

    // Chain the three queues: q feeds q2, which feeds q3.
    ASSERT( q.register_successor( q2 ) == true, NULL );
    ASSERT( q2.register_successor( q3 ) == true, NULL );

    // Items put into q are fetched from q3 at the end of the chain.
    NativeParallelFor( num_threads, parallel_puts<T>(q) );
        touches< T > t3( num_threads );
        NativeParallelFor( num_threads, parallel_gets<T>(q3, t3) );
        ASSERT( t3.validate_touches(), NULL );
    // None of the three queues may hold anything afterwards.
    j = bogus_value;
    ASSERT( q.try_get( j ) == false, NULL );
    ASSERT( q2.try_get( j ) == false, NULL );
    ASSERT( q3.try_get( j ) == false, NULL );
    ASSERT( j == bogus_value, NULL );

    return 0;
Example #14
// Tests non-stack-bound task group (the group that is allocated by one thread and destroyed by the other)
// Runs SharedGroupBody(2, VagabondGroup) through NativeParallelFor with 2 as
// the first argument.
// NOTE(review): the closing brace is not visible in this copy.
void TestVagabondGroup () {
    NativeParallelFor( 2, SharedGroupBody(2, VagabondGroup) );
Example #15
// Runs SharedGroupBody(g_MaxConcurrency, ParallelWait) through
// NativeParallelFor with g_MaxConcurrency as the first argument.
// NOTE(review): the closing brace is not visible in this copy.
void TestParallelWait () {
    NativeParallelFor( g_MaxConcurrency, SharedGroupBody(g_MaxConcurrency, ParallelWait) );
Example #16
// Runs SharedGroupBody(g_MaxConcurrency) -- constructed without a second
// argument, unlike the sibling tests -- through NativeParallelFor with
// g_MaxConcurrency as the first argument.
// NOTE(review): the closing brace is not visible in this copy.
void TestParallelSpawn () {
    NativeParallelFor( g_MaxConcurrency, SharedGroupBody(g_MaxConcurrency) );