Example #1
void unlimited_concurrency( Body body ) {

    for (int p = 1; p < 2*MaxThread; ++p) {
        tbb::flow::graph g;
        tbb::flow::function_node< InputType, OutputType, tbb::flow::rejecting > exe_node( g, tbb::flow::unlimited, body );

        for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) {

            std::vector< harness_counting_receiver<OutputType> > receivers(num_receivers, harness_counting_receiver<OutputType>(g));
            harness_graph_executor<InputType, OutputType>::execute_count = 0;

            for (size_t r = 0; r < num_receivers; ++r ) {
                tbb::flow::make_edge( exe_node, receivers[r] );
            }

            NativeParallelFor( p, parallel_puts<InputType>(exe_node) );
            g.wait_for_all();

            // 2) the nodes will receive puts from multiple predecessors simultaneously,
            size_t ec = harness_graph_executor<InputType, OutputType>::execute_count;
            ASSERT( (int)ec == p*N, NULL );
            for (size_t r = 0; r < num_receivers; ++r ) {
                size_t c = receivers[r].my_count;
                // 3) the nodes will send to multiple successors.
                ASSERT( (int)c == p*N, NULL );
            }
            for (size_t r = 0; r < num_receivers; ++r ) {
                tbb::flow::remove_edge( exe_node, receivers[r] );
            }
        }
    }
}
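The harness helpers used by these tests are not shown in the excerpt. Below is a minimal, hypothetical sketch of what the parallel_puts functor might look like; it assumes NativeParallelFor invokes the functor once per native-thread index, so that p threads each push N messages and the unlimited node's body runs p*N times, as the assertions expect.

// Hypothetical sketch (names and details are assumptions, not the actual harness).
template< typename InputType >
struct parallel_puts {
    tbb::flow::receiver<InputType> &my_receiver;
    parallel_puts( tbb::flow::receiver<InputType> &r ) : my_receiver(r) {}
    void operator()( int ) const {
        for ( int i = 0; i < N; ++i ) {
            // with unlimited concurrency every put should be accepted
            my_receiver.try_put( InputType() );
        }
    }
};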
Example #2
void continue_nodes( Body body ) {

    for (int p = 1; p < 2*MaxThread; ++p) {
        tbb::graph g;
        tbb::executable_node< OutputType > exe_node( g, body );
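        // Register the node as its own predecessor N times: each firing then
        // requires N continue_msg signals (the cast only serves to obtain a
        // sender< continue_msg >& to pass to register_predecessor).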
        for (size_t i = 0; i < N; ++i) {
           exe_node.register_predecessor( *reinterpret_cast< tbb::sender< tbb::continue_msg > * >(&exe_node) );
        }

        for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) { 
            harness_counting_receiver<OutputType> *receivers = new harness_counting_receiver<OutputType>[num_receivers];
            harness_graph_executor<tbb::continue_msg, OutputType>::execute_count = 0;

            for (size_t r = 0; r < num_receivers; ++r ) {
                ASSERT( exe_node.register_successor( receivers[r] ), NULL );
            }

            NativeParallelFor( p, parallel_puts<tbb::continue_msg>(exe_node) );
            g.wait_for_all(); 

            // 2) the nodes will receive puts from multiple predecessors simultaneously,
            size_t ec = harness_graph_executor<tbb::continue_msg, OutputType>::execute_count;
            ASSERT( (int)ec == p, NULL ); 
            for (size_t r = 0; r < num_receivers; ++r ) {
                size_t c = receivers[r].my_count;
                // 3) the nodes will send to multiple successors.
                ASSERT( (int)c == p, NULL );
            }
        }
    }
}
void continue_nodes( Body body ) {
    for (int p = 1; p < 2*MaxThread; ++p) {
        tbb::flow::graph g;
        tbb::flow::continue_node< OutputType > exe_node( g, body );
        run_continue_nodes( p, g, exe_node);
        exe_node.try_put(tbb::flow::continue_msg());
        tbb::flow::continue_node< OutputType > exe_node_copy( exe_node );
        run_continue_nodes( p, g, exe_node_copy);
    }
}
void continue_nodes_with_copy( ) {

    for (int p = 1; p < 2*MaxThread; ++p) {
        tbb::flow::graph g;
        inc_functor<OutputType> cf;
        cf.local_execute_count = Offset;
        global_execute_count = Offset;

        tbb::flow::continue_node< OutputType > exe_node( g, cf );
        fake_continue_sender fake_sender;
        for (size_t i = 0; i < N; ++i) {
           exe_node.register_predecessor( fake_sender );
        }

        for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) {
            harness_counting_receiver<OutputType> *receivers = new harness_counting_receiver<OutputType>[num_receivers];

            for (size_t r = 0; r < num_receivers; ++r ) {
                tbb::flow::make_edge( exe_node, receivers[r] );
            }

            NativeParallelFor( p, parallel_puts<tbb::flow::continue_msg>(exe_node) );
            g.wait_for_all();

            // 2) the nodes will receive puts from multiple predecessors simultaneously,
            for (size_t r = 0; r < num_receivers; ++r ) {
                size_t c = receivers[r].my_count;
                // 3) the nodes will send to multiple successors.
                ASSERT( (int)c == p, NULL );
            }
        }

        // validate that the local body matches the global execute_count and both are correct
        inc_functor<OutputType> body_copy = tbb::flow::copy_body< inc_functor<OutputType> >( exe_node );
        const size_t expected_count = p*MAX_NODES + Offset;
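        // Each of the MAX_NODES num_receivers iterations fires the node p times
        // (p threads each deliver the N continue_msg signals it waits for), so
        // the body runs p*MAX_NODES times on top of the initial Offset.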
        size_t global_count = global_execute_count;
        size_t inc_count = body_copy.local_execute_count;
        ASSERT( global_count == expected_count && global_count == inc_count, NULL );

    }
}
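For reference, a minimal sketch of the counting body and global counter assumed by continue_nodes_with_copy is shown below. The names and the use of tbb::atomic are assumptions; the point is that the node keeps its own copy of the body, so the copy retrieved with copy_body() should have seen every execution and should match the global counter.

// Hypothetical sketch of the counting body (not the actual harness code).
static tbb::atomic<size_t> global_execute_count;

template< typename OutputType >
struct inc_functor {
    size_t local_execute_count;
    inc_functor() : local_execute_count(0) {}
    OutputType operator()( tbb::flow::continue_msg ) {
        ++local_execute_count;      // per-body-copy count checked via copy_body()
        ++global_execute_count;     // process-wide count checked against it
        return OutputType();
    }
};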
Example #5
void buffered_levels( size_t concurrency, Body body ) {

   // Do for lc = 1 to concurrency level
   for ( size_t lc = 1; lc <= concurrency; ++lc ) {
   tbb::flow::graph g;

   // Set the execute_counter back to zero in the harness
   harness_graph_executor<InputType, OutputType>::execute_count = 0;
   // Set the number of current executors to zero.
   harness_graph_executor<InputType, OutputType>::current_executors = 0;
   // Set the max allowed executors to lc.  There is a check in the functor to make sure this is never exceeded.
   harness_graph_executor<InputType, OutputType>::max_executors = lc;

   // Create the function_node with the appropriate concurrency level, and use default buffering
   tbb::flow::function_node< InputType, OutputType > exe_node( g, lc, body );
   tbb::flow::function_node<InputType, InputType> pass_thru( g, tbb::flow::unlimited, pass_through<InputType>());

   // Create a vector of identical exe_nodes and pass_thrus
   std::vector< tbb::flow::function_node< InputType, OutputType > > exe_vec(2, exe_node);
   std::vector< tbb::flow::function_node< InputType, InputType > > pass_thru_vec(2, pass_thru);
   // Attach each pass_thru to its corresponding exe_node
   for (size_t node_idx=0; node_idx<exe_vec.size(); ++node_idx) {
       tbb::flow::make_edge(pass_thru_vec[node_idx], exe_vec[node_idx]);
   }

   // TODO: why is the test executed serially for the node pairs instead of concurrently?
   for (size_t node_idx=0; node_idx<exe_vec.size(); ++node_idx) {
   // For num_receivers = 1 to MAX_NODES
   for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) {
        // Create num_receivers counting receivers and connect the exe_vec[node_idx] to them.
        std::vector< harness_mapped_receiver<OutputType>* > receivers(num_receivers);
        for (size_t i = 0; i < num_receivers; i++) {
            receivers[i] = new harness_mapped_receiver<OutputType>(g);
        }

        for (size_t r = 0; r < num_receivers; ++r ) {
            tbb::flow::make_edge( exe_vec[node_idx], *receivers[r] );
        }

        // Do the test with varying numbers of senders
        harness_counting_sender<InputType> *senders = NULL;
        for (size_t num_senders = 1; num_senders <= MAX_NODES; ++num_senders ) {
            // Create num_senders senders, set their message limit to N each, and connect them to pass_thru_vec[node_idx]
            senders = new harness_counting_sender<InputType>[num_senders];
            for (size_t s = 0; s < num_senders; ++s ) {
               senders[s].my_limit = N;
               senders[s].register_successor(pass_thru_vec[node_idx] );
            }

            // Initialize the receivers so they know how many senders and messages to check for
            for (size_t r = 0; r < num_receivers; ++r ) {
                 receivers[r]->initialize_map( N, num_senders );
            }

            // Do the test
            NativeParallelFor( (int)num_senders, parallel_put_until_limit<InputType>(senders) );
            g.wait_for_all();

            // confirm that each sender was requested from N times
            for (size_t s = 0; s < num_senders; ++s ) {
                size_t n = senders[s].my_received;
                ASSERT( n == N, NULL );
                ASSERT( senders[s].my_receiver == &pass_thru_vec[node_idx], NULL );
            }
            // validate the receivers
            for (size_t r = 0; r < num_receivers; ++r ) {
                receivers[r]->validate();
            }
            delete [] senders;
        }
        for (size_t r = 0; r < num_receivers; ++r ) {
            tbb::flow::remove_edge( exe_vec[node_idx], *receivers[r] );
        }
        ASSERT( exe_vec[node_idx].try_put( InputType() ) == true, NULL );
        g.wait_for_all();
        for (size_t r = 0; r < num_receivers; ++r ) {
            // since it's detached, nothing should have changed
            receivers[r]->validate();
        }

        for (size_t i = 0; i < num_receivers; i++) {
            delete receivers[i];
        }

    } // for num_receivers
    } // for node_idx
    } // for concurrency level lc
}
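Example #5 also relies on a pass_through body for the pass_thru nodes. A plausible minimal sketch, assuming the body simply forwards its input unchanged:

// Hypothetical sketch of the pass_through body used by the pass_thru nodes.
template< typename T >
struct pass_through {
    T operator()( const T &t ) const { return t; }
};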
Example #6
void concurrency_levels( size_t concurrency, Body body ) {

   for ( size_t lc = 1; lc <= concurrency; ++lc ) {
       tbb::flow::graph g;

       // Set the execute_counter back to zero in the harness
       harness_graph_executor<InputType, OutputType>::execute_count = 0;
       // Set the number of current executors to zero.
       harness_graph_executor<InputType, OutputType>::current_executors = 0;
       // Set the max allowed executors to lc. There is a check in the functor to make sure this is never exceeded.
       harness_graph_executor<InputType, OutputType>::max_executors = lc;

       typedef tbb::flow::function_node< InputType, OutputType, tbb::flow::rejecting > fnode_type;
       fnode_type exe_node( g, lc, body );

       for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) {

            std::vector< harness_counting_receiver<OutputType> > receivers(num_receivers, harness_counting_receiver<OutputType>(g));

#if TBB_PREVIEW_FLOW_GRAPH_FEATURES
            ASSERT(exe_node.successor_count() == 0, NULL);
            ASSERT(exe_node.predecessor_count() == 0, NULL);
#endif

            for (size_t r = 0; r < num_receivers; ++r ) {
                tbb::flow::make_edge( exe_node, receivers[r] );
            }
#if TBB_PREVIEW_FLOW_GRAPH_FEATURES
            ASSERT(exe_node.successor_count() == num_receivers, NULL);
            typename fnode_type::successor_list_type my_succs;
            exe_node.copy_successors(my_succs);
            ASSERT(my_succs.size() == num_receivers, NULL);
            typename fnode_type::predecessor_list_type my_preds;
            exe_node.copy_predecessors(my_preds);
            ASSERT(my_preds.size() == 0, NULL);
#endif

            harness_counting_sender<InputType> *senders = NULL;

            for (size_t num_senders = 1; num_senders <= MAX_NODES; ++num_senders ) {
                senders = new harness_counting_sender<InputType>[num_senders];
                {
                    // Exclusively lock m to prevent exe_node from finishing
                    tbb::spin_rw_mutex::scoped_lock l( harness_graph_executor<InputType, OutputType>::template mutex_holder<tbb::spin_rw_mutex>::mutex );

                    // put to lc level, it will accept and then block at m
                    for ( size_t c = 0 ; c < lc ; ++c ) {
                        ASSERT( exe_node.try_put( InputType() ) == true, NULL );
                    }
                    // it only accepts to lc level
                    ASSERT( exe_node.try_put( InputType() ) == false, NULL );

                    for (size_t s = 0; s < num_senders; ++s ) {
                       // register a sender
                       senders[s].my_limit = N;
                       exe_node.register_predecessor( senders[s] );
                    }

                } // release lock at end of scope, setting the exe node free to continue
                // wait for graph to settle down
                g.wait_for_all();

                // confirm that each sender was requested from N times
                for (size_t s = 0; s < num_senders; ++s ) {
                    size_t n = senders[s].my_received;
                    ASSERT( n == N, NULL );
                    ASSERT( senders[s].my_receiver == &exe_node, NULL );
                }
                // confirm that each receiver got N * num_senders + the initial lc puts
                for (size_t r = 0; r < num_receivers; ++r ) {
                    size_t n = receivers[r].my_count;
                    ASSERT( n == num_senders*N+lc, NULL );
                    receivers[r].my_count = 0;
                }
                delete [] senders;
            }
            for (size_t r = 0; r < num_receivers; ++r ) {
                tbb::flow::remove_edge( exe_node, receivers[r] );
            }
            ASSERT( exe_node.try_put( InputType() ) == true, NULL );
            g.wait_for_all();
            for (size_t r = 0; r < num_receivers; ++r ) {
                ASSERT( int(receivers[r].my_count) == 0, NULL );
            }
        }
    }
}
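The test above only works if the node body can be made to block on demand: harness_graph_executor (not shown) presumably takes a reader lock on the shared spin_rw_mutex inside the body, so while the test holds the writer lock, up to lc invocations enter the node and stall, and the (lc+1)-th try_put to the rejecting node must fail. A hypothetical sketch of such a body, with placeholder names:

// Hypothetical sketch of a blocking body (an assumption, not the real harness).
template< typename InputType, typename OutputType >
struct blocking_body {
    tbb::spin_rw_mutex &my_mutex;               // held for writing by the test
    tbb::atomic<size_t> &my_current_executors;  // bodies currently in flight
    size_t my_max_executors;                    // == lc for the current round
    blocking_body( tbb::spin_rw_mutex &m, tbb::atomic<size_t> &c, size_t mx )
        : my_mutex(m), my_current_executors(c), my_max_executors(mx) {}
    OutputType operator()( const InputType & ) {
        size_t n = ++my_current_executors;
        ASSERT( n <= my_max_executors, NULL );   // concurrency limit respected
        // blocks until the test releases its exclusive (writer) lock
        tbb::spin_rw_mutex::scoped_lock l( my_mutex, /*write=*/false );
        --my_current_executors;
        return OutputType();
    }
};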
Example #7
void buffered_levels_with_copy( size_t concurrency ) {

    // Do for lc = 1 to concurrency level
    for ( size_t lc = 1; lc <= concurrency; ++lc ) {
        tbb::flow::graph g;

        inc_functor cf;
        cf.local_execute_count = Offset;
        global_execute_count = Offset;

        tbb::flow::function_node< InputType, OutputType > exe_node( g, lc, cf );

        for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) {

            std::vector< harness_mapped_receiver<OutputType>* > receivers(num_receivers);
            for (size_t i = 0; i < num_receivers; i++) {
                receivers[i] = new harness_mapped_receiver<OutputType>(g);
            }

            for (size_t r = 0; r < num_receivers; ++r ) {
                tbb::flow::make_edge( exe_node, *receivers[r] );
            }

            harness_counting_sender<InputType> *senders = NULL;
            for (size_t num_senders = 1; num_senders <= MAX_NODES; ++num_senders ) {
                senders = new harness_counting_sender<InputType>[num_senders];
                for (size_t s = 0; s < num_senders; ++s ) {
                    senders[s].my_limit = N;
                    tbb::flow::make_edge( senders[s], exe_node );
                }

                for (size_t r = 0; r < num_receivers; ++r ) {
                    receivers[r]->initialize_map( N, num_senders );
                }

                NativeParallelFor( (int)num_senders, parallel_put_until_limit<InputType>(senders) );
                g.wait_for_all();

                for (size_t s = 0; s < num_senders; ++s ) {
                    size_t n = senders[s].my_received;
                    ASSERT( n == N, NULL );
                    ASSERT( senders[s].my_receiver == &exe_node, NULL );
                }
                for (size_t r = 0; r < num_receivers; ++r ) {
                    receivers[r]->validate();
                }
                delete [] senders;
            }
            for (size_t r = 0; r < num_receivers; ++r ) {
                tbb::flow::remove_edge( exe_node, *receivers[r] );
            }
            ASSERT( exe_node.try_put( InputType() ) == true, NULL );
            g.wait_for_all();
            for (size_t r = 0; r < num_receivers; ++r ) {
                receivers[r]->validate();
            }

            for (size_t i = 0; i < num_receivers; i++) {
                delete receivers[i];
            }
        }

        // validate that the local body matches the global execute_count and both are correct
        inc_functor body_copy = tbb::flow::copy_body<inc_functor>( exe_node );
        const size_t expected_count = N/2 * MAX_NODES * MAX_NODES * ( MAX_NODES + 1 ) + MAX_NODES + Offset;
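        // The expected count follows from the loops above: for each of the
        // MAX_NODES num_receivers iterations, num_senders runs from 1 to
        // MAX_NODES and every sender delivers N messages, i.e.
        // N * (1 + 2 + ... + MAX_NODES) = N/2 * MAX_NODES * (MAX_NODES + 1)
        // executions per iteration, or N/2 * MAX_NODES^2 * (MAX_NODES + 1) in
        // total; the single try_put after each iteration adds MAX_NODES more,
        // and the body starts from Offset.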
        size_t global_count = global_execute_count;
        size_t inc_count = body_copy.local_execute_count;
        ASSERT( global_count == expected_count && global_count == inc_count, NULL );
        g.reset(tbb::flow::rf_reset_bodies);
        body_copy = tbb::flow::copy_body<inc_functor>( exe_node );
        inc_count = body_copy.local_execute_count;
        ASSERT( Offset == inc_count, "reset(rf_reset_bodies) did not reset functor" );
    }
}
void buffered_levels( size_t concurrency, Body body ) {
    typedef typename std::tuple_element<0,OutputTuple>::type OutputType;
    // Do for lc = 1 to concurrency level
    for ( size_t lc = 1; lc <= concurrency; ++lc ) { 
        tbb::flow::graph g;

        // Set the execute_counter back to zero in the harness
        harness_graph_multifunction_executor<InputType, OutputTuple,tbb::spin_mutex>::execute_count = 0;
        // Set the max allowed executors to lc.  There is a check in the functor to make sure this is never exceeded.
        harness_graph_multifunction_executor<InputType, OutputTuple,tbb::spin_mutex>::max_executors = lc;

        // Create the multifunction_node with the appropriate concurrency level, and use default buffering
        tbb::flow::multifunction_node< InputType, OutputTuple > exe_node( g, lc, body );
   
        //Create a vector of identical exe_nodes
        std::vector< tbb::flow::multifunction_node< InputType, OutputTuple > > exe_vec(2, exe_node);

        // exercise each of the copied nodes
        for (size_t node_idx=0; node_idx<exe_vec.size(); ++node_idx) {
            for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) {
                // Create num_receivers counting receivers and connect the exe_vec[node_idx] to them.
                harness_mapped_receiver<OutputType> *receivers = new harness_mapped_receiver<OutputType>[num_receivers];
                for (size_t r = 0; r < num_receivers; ++r ) {
                    tbb::flow::make_edge( tbb::flow::output_port<0>(exe_vec[node_idx]), receivers[r] );
                }

                // Do the test with varying numbers of senders
                harness_counting_sender<InputType> *senders = NULL;
                for (size_t num_senders = 1; num_senders <= MAX_NODES; ++num_senders ) {
                    // Create num_senders senders, set their message limit to N each, and connect them to exe_vec[node_idx]
                    senders = new harness_counting_sender<InputType>[num_senders];
                    for (size_t s = 0; s < num_senders; ++s ) {
                        senders[s].my_limit = N;
                        tbb::flow::make_edge( senders[s], exe_vec[node_idx] );
                    }

                    // Initialize the receivers so they know how many senders and messages to check for
                    for (size_t r = 0; r < num_receivers; ++r ) {
                         receivers[r].initialize_map( N, num_senders ); 
                    }

                    // Do the test
                    NativeParallelFor( (int)num_senders, parallel_put_until_limit<InputType>(senders) );
                    g.wait_for_all();

                    // confirm that each sender was requested from N times
                    for (size_t s = 0; s < num_senders; ++s ) {
                        size_t n = senders[s].my_received;
                        ASSERT( n == N, NULL ); 
                        ASSERT( senders[s].my_receiver == &exe_vec[node_idx], NULL );
                    }
                    // validate the receivers
                    for (size_t r = 0; r < num_receivers; ++r ) {
                        receivers[r].validate();
                    }
                    delete [] senders;
                }
                for (size_t r = 0; r < num_receivers; ++r ) {
                    tbb::flow::remove_edge( tbb::flow::output_port<0>(exe_vec[node_idx]), receivers[r] );
                }
                ASSERT( exe_vec[node_idx].try_put( InputType() ) == true, NULL );
                g.wait_for_all();
                for (size_t r = 0; r < num_receivers; ++r ) {
                    // since it's detached, nothing should have changed
                    receivers[r].validate();
                }
                delete [] receivers;
            }
        } 
    }
}
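The multifunction_node tests read their results from tbb::flow::output_port<0>(exe_node), which only produces anything if the body explicitly puts to port 0. A minimal, hypothetical sketch of such a body (the name is a placeholder, and it assumes the first output type is constructible from InputType, as in these int-based tests):

// Hypothetical sketch of a multifunction body that forwards input via port 0.
template< typename InputType, typename OutputTuple >
struct mf_forward_port0 {
    typedef tbb::flow::multifunction_node<InputType, OutputTuple> node_type;
    typedef typename node_type::output_ports_type ports_type;
    void operator()( const InputType &i, ports_type &ports ) {
        typedef typename std::tuple_element<0, OutputTuple>::type port0_type;
        std::get<0>( ports ).try_put( port0_type(i) );
    }
};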
void unlimited_concurrency( Body body ) {
    typedef typename std::tuple_element<0,OutputTuple>::type OutputType;

    for (int p = 1; p < 2*MaxThread; ++p) {
        tbb::flow::graph g;
        tbb::flow::multifunction_node< InputType, OutputTuple, tbb::flow::rejecting > exe_node( g, tbb::flow::unlimited, body );

        for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) {
            harness_counting_receiver<OutputType> *receivers = new harness_counting_receiver<OutputType>[num_receivers];
            harness_graph_multifunction_executor<InputType, OutputTuple>::execute_count = 0;

            for (size_t r = 0; r < num_receivers; ++r ) {
                tbb::flow::make_edge( tbb::flow::output_port<0>(exe_node), receivers[r] );
            }

            NativeParallelFor( p, parallel_puts<InputType>(exe_node) );
            g.wait_for_all(); 

            // 2) the nodes will receive puts from multiple predecessors simultaneously,
            size_t ec = harness_graph_multifunction_executor<InputType, OutputTuple>::execute_count;
            ASSERT( (int)ec == p*N, NULL ); 
            for (size_t r = 0; r < num_receivers; ++r ) {
                size_t c = receivers[r].my_count;
                // 3) the nodes will send to multiple successors.
                ASSERT( (int)c == p*N, NULL );
            }
        }
    }
}
void concurrency_levels( size_t concurrency, Body body ) {
    typedef typename std::tuple_element<0,OutputTuple>::type OutputType;
    for ( size_t lc = 1; lc <= concurrency; ++lc ) { 
        tbb::flow::graph g;
        harness_graph_multifunction_executor<InputType, OutputTuple, tbb::spin_mutex>::execute_count = 0;

        tbb::flow::multifunction_node< InputType, OutputTuple, tbb::flow::rejecting > exe_node( g, lc, body );

        for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) {

            harness_counting_receiver<OutputType> *receivers = new harness_counting_receiver<OutputType>[num_receivers];

            for (size_t r = 0; r < num_receivers; ++r ) {
                tbb::flow::make_edge( tbb::flow::output_port<0>(exe_node), receivers[r] );
            }

            harness_counting_sender<InputType> *senders = NULL;
    
            for (size_t num_senders = 1; num_senders <= MAX_NODES; ++num_senders ) {
                {
                    // lock m to prevent exe_node from finishing
                    tbb::spin_mutex::scoped_lock l( harness_graph_multifunction_executor< InputType, OutputTuple, tbb::spin_mutex >::mutex );
    
                    // put to lc level, it will accept and then block at m
                    for ( size_t c = 0 ; c < lc ; ++c ) {
                        ASSERT( exe_node.try_put( InputType() ) == true, NULL );
                    }
                    // it only accepts to lc level
                    ASSERT( exe_node.try_put( InputType() ) == false, NULL );
    
                    senders = new harness_counting_sender<InputType>[num_senders];
                    for (size_t s = 0; s < num_senders; ++s ) {
                       // register a sender
                       senders[s].my_limit = N;
                       exe_node.register_predecessor( senders[s] );
                    }
    
                } // release lock at end of scope, setting the exe node free to continue
                // wait for graph to settle down
                g.wait_for_all();
    
                // confirm that each sender was requested from N times 
                for (size_t s = 0; s < num_senders; ++s ) {
                    size_t n = senders[s].my_received;
                    ASSERT( n == N, NULL ); 
                    ASSERT( senders[s].my_receiver == &exe_node, NULL );
                }
                // confirm that each receiver got N * num_senders + the initial lc puts
                for (size_t r = 0; r < num_receivers; ++r ) {
                    size_t n = receivers[r].my_count;
                    ASSERT( n == num_senders*N+lc, NULL );
                    receivers[r].my_count = 0;
                }
                delete [] senders;
            }
            for (size_t r = 0; r < num_receivers; ++r ) {
                tbb::flow::remove_edge( tbb::flow::output_port<0>(exe_node), receivers[r] );
            }
            ASSERT( exe_node.try_put( InputType() ) == true, NULL );
            g.wait_for_all();
            for (size_t r = 0; r < num_receivers; ++r ) {
                ASSERT( int(receivers[r].my_count) == 0, NULL );
            }
            delete [] receivers;
        }
    }
}
void buffered_levels_with_copy( size_t concurrency ) {
    typedef typename std::tuple_element<0,OutputTuple>::type OutputType;
    // Do for lc = 1 to concurrency level
    for ( size_t lc = 1; lc <= concurrency; ++lc ) { 
        tbb::flow::graph g;

        inc_functor cf;
        cf.local_execute_count = Offset;
        global_execute_count = Offset;
       
        tbb::flow::multifunction_node< InputType, OutputTuple > exe_node( g, lc, cf );

        for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) {
            harness_mapped_receiver<OutputType> *receivers = new harness_mapped_receiver<OutputType>[num_receivers];
            for (size_t r = 0; r < num_receivers; ++r ) {
                tbb::flow::make_edge( tbb::flow::output_port<0>(exe_node), receivers[r] );
            }

            harness_counting_sender<InputType> *senders = NULL;
            for (size_t num_senders = 1; num_senders <= MAX_NODES; ++num_senders ) {
                senders = new harness_counting_sender<InputType>[num_senders];
                for (size_t s = 0; s < num_senders; ++s ) {
                    senders[s].my_limit = N;
                    tbb::flow::make_edge( senders[s], exe_node );
                }

                for (size_t r = 0; r < num_receivers; ++r ) {
                    receivers[r].initialize_map( N, num_senders ); 
                }

                NativeParallelFor( (int)num_senders, parallel_put_until_limit<InputType>(senders) );
                g.wait_for_all();

                for (size_t s = 0; s < num_senders; ++s ) {
                    size_t n = senders[s].my_received;
                    ASSERT( n == N, NULL ); 
                    ASSERT( senders[s].my_receiver == &exe_node, NULL );
                }
                for (size_t r = 0; r < num_receivers; ++r ) {
                    receivers[r].validate();
                }
                delete [] senders;
            }
            for (size_t r = 0; r < num_receivers; ++r ) {
                tbb::flow::remove_edge( tbb::flow::output_port<0>(exe_node), receivers[r] );
            }
            ASSERT( exe_node.try_put( InputType() ) == true, NULL );
            g.wait_for_all();
            for (size_t r = 0; r < num_receivers; ++r ) {
                receivers[r].validate();
            }
            delete [] receivers;
        }

        // validate that the local body matches the global execute_count and both are correct
        inc_functor body_copy = tbb::flow::copy_body<inc_functor>( exe_node );
        const size_t expected_count = N/2 * MAX_NODES * MAX_NODES * ( MAX_NODES + 1 ) + MAX_NODES + Offset; 
        size_t global_count = global_execute_count;
        size_t inc_count = body_copy.local_execute_count;
        ASSERT( global_count == expected_count && global_count == inc_count, NULL ); 
    }
}