static test_pal_thread_return_t TEST_PAL_CALLING_CONVENTION thread_simple_enqueuer( void *util_thread_starter_thread_state )
{
  lfds700_pal_uint_t
    loop;

  struct lfds700_misc_prng_state
    ps;

  struct test_state
    *ts;

  struct util_thread_starter_thread_state
    *tsts;

  // TRD : load barrier; makes inits performed on other logical cores visible to this thread
  LFDS700_MISC_MAKE_VALID_ON_CURRENT_LOGICAL_CORE_INITS_COMPLETED_BEFORE_NOW_ON_ANY_OTHER_LOGICAL_CORE;

  assert( util_thread_starter_thread_state != NULL );

  tsts = (struct util_thread_starter_thread_state *) util_thread_starter_thread_state;
  ts = (struct test_state *) tsts->thread_user_state;

  lfds700_misc_prng_init( &ps );

  ts->te_array = util_aligned_malloc( sizeof(struct test_element) * ts->number_elements, LFDS700_PAL_ATOMIC_ISOLATION_IN_BYTES );

  for( loop = 0 ; loop < ts->number_elements ; loop++ )
  {
    (ts->te_array+loop)->thread_number = ts->thread_number;
    (ts->te_array+loop)->counter = loop;
  }

  // TRD : rendezvous with the thread starter, so all test threads begin work simultaneously
  util_thread_starter_ready_and_wait( tsts );

  for( loop = 0 ; loop < ts->number_elements ; loop++ )
  {
    LFDS700_QUEUE_SET_VALUE_IN_ELEMENT( (ts->te_array+loop)->qe, ts->te_array+loop );
    lfds700_queue_enqueue( ts->qs, &(ts->te_array+loop)->qe, &ps );
  }

  LFDS700_MISC_BARRIER_STORE;

  lfds700_misc_force_store();

  return( (test_pal_thread_return_t) EXIT_SUCCESS );
}
static test_pal_thread_return_t TEST_PAL_CALLING_CONVENTION thread_enqueuer_with_malloc_and_dequeuer_with_free( void *util_thread_starter_thread_state )
{
  lfds700_pal_uint_t
    loop,
    time_loop = 0;

  struct lfds700_misc_prng_state
    ps;

  struct lfds700_queue_element
    *qe;

  struct test_state
    *ts;

  struct util_thread_starter_thread_state
    *tsts;

  time_t
    current_time,
    start_time;

  LFDS700_MISC_MAKE_VALID_ON_CURRENT_LOGICAL_CORE_INITS_COMPLETED_BEFORE_NOW_ON_ANY_OTHER_LOGICAL_CORE;

  assert( util_thread_starter_thread_state != NULL );

  tsts = (struct util_thread_starter_thread_state *) util_thread_starter_thread_state;
  ts = (struct test_state *) tsts->thread_user_state;

  lfds700_misc_prng_init( &ps );

  util_thread_starter_ready_and_wait( tsts );

  current_time = start_time = time( NULL );

  while( current_time < start_time + TEST_DURATION_IN_SECONDS )
  {
    for( loop = 0 ; loop < 1000 ; loop++ )
    {
      qe = util_aligned_malloc( sizeof(struct lfds700_queue_element), LFDS700_PAL_ATOMIC_ISOLATION_IN_BYTES );
      lfds700_queue_enqueue( ts->qs, qe, &ps );
    }

    for( loop = 0 ; loop < 1000 ; loop++ )
    {
      lfds700_queue_dequeue( ts->qs, &qe, &ps );
      util_aligned_free( qe );
    }

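    // TRD : the loop body mallocs and frees, so fewer iterations pass between clock checks (hence REDUCED_TIME_LOOP_COUNT rather than the usual TIME_LOOP_COUNT)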
    if( time_loop++ == REDUCED_TIME_LOOP_COUNT )
    {
      time_loop = 0;
      time( &current_time );
    }
  }

  LFDS700_MISC_BARRIER_STORE;

  lfds700_misc_force_store();

  return( (test_pal_thread_return_t) EXIT_SUCCESS );
}
void test_lfds700_queue_rapid_enqueuing_and_dequeuing( struct lfds700_list_asu_state *list_of_logical_processors, lfds700_pal_uint_t memory_in_megabytes )
{
  enum lfds700_misc_validity
    dvs = LFDS700_MISC_VALIDITY_VALID;

  lfds700_pal_uint_t
    loop,
    number_elements_with_dummy_element,
    number_elements_without_dummy_element,
    number_logical_processors,
    *per_thread_counters;

  struct lfds700_list_asu_element
    *lasue;

  struct lfds700_misc_prng_state
    ps;

  struct lfds700_queue_element
    *qe;

  struct lfds700_misc_validation_info
    vi;

  struct lfds700_queue_state
    qs;

  struct test_pal_logical_processor
    *lp;

  struct util_thread_starter_state
    *tts;

  struct test_element
    *te_array,
    *te;

  struct test_state
    *ts;

  test_pal_thread_state_t
    *thread_handles;

  assert( list_of_logical_processors != NULL );
  // TRD : memory_in_megabytes can be any value in its range

  /* TRD : we create a single queue, capped at 10,000 elements per logical processor
           we don't want too many elements, so we ensure plenty of element re-use
           each thread simply loops dequeuing and enqueuing
           where the user data indicates thread number and an incrementing counter
           verification is that the counter increments on a per-thread basis
  */
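
  /* TRD : for reference, struct test_element is presumably defined along these lines
           (a sketch inferred from usage in this file; the real definition lives in the test internals header)

             struct test_element
             {
               struct lfds700_queue_element
                 qe;

               lfds700_pal_uint_t
                 thread_number,
                 counter;
             };
  */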

  internal_display_test_name( "Rapid enqueuing and dequeuing (%d seconds)", TEST_DURATION_IN_SECONDS );

  lfds700_list_asu_query( list_of_logical_processors, LFDS700_LIST_ASU_QUERY_GET_POTENTIALLY_INACCURATE_COUNT, NULL, (void **) &number_logical_processors );

  lfds700_misc_prng_init( &ps );

  number_elements_with_dummy_element = ( memory_in_megabytes * ONE_MEGABYTE_IN_BYTES ) / sizeof(struct test_element);

  // TRD : cap the element count so there is plenty of element re-use (10,000 elements per logical processor, plus the dummy)
  if( number_elements_with_dummy_element > (10000 * number_logical_processors) + 1 )
    number_elements_with_dummy_element = (10000 * number_logical_processors) + 1;

  number_elements_without_dummy_element = number_elements_with_dummy_element - 1;

  vi.min_elements = number_elements_without_dummy_element;
  vi.max_elements = number_elements_without_dummy_element;

  te_array = util_aligned_malloc( sizeof(struct test_element) * number_elements_with_dummy_element, LFDS700_PAL_ATOMIC_ISOLATION_IN_BYTES );

  // TRD : the final element of the array serves as the queue's dummy element
  lfds700_queue_init_valid_on_current_logical_core( &qs, &(te_array+number_elements_without_dummy_element)->qe, &ps, NULL );

  // TRD : we assume the test will iterate at least once (or we'll have a false negative)
  for( loop = 0 ; loop < number_elements_without_dummy_element ; loop++ )
  {
    (te_array+loop)->thread_number = loop;
    (te_array+loop)->counter = 0;
    LFDS700_QUEUE_SET_VALUE_IN_ELEMENT( (te_array+loop)->qe, te_array+loop );
    lfds700_queue_enqueue( &qs, &(te_array+loop)->qe, &ps );
  }

  ts = util_malloc_wrapper( sizeof(struct test_state) * number_logical_processors );

  for( loop = 0 ; loop < number_logical_processors ; loop++ )
  {
    (ts+loop)->qs = &qs;
    (ts+loop)->thread_number = loop;
    (ts+loop)->counter = 0;
  }

  thread_handles = util_malloc_wrapper( sizeof(test_pal_thread_state_t) * number_logical_processors );

  util_thread_starter_new( &tts, number_logical_processors );

  LFDS700_MISC_BARRIER_STORE;

  lfds700_misc_force_store();

  loop = 0;
  lasue = NULL;

  // TRD : iterate over the logical processor list, starting one thread on each LP
  while( LFDS700_LIST_ASU_GET_START_AND_THEN_NEXT(*list_of_logical_processors, lasue) )
  {
    lp = LFDS700_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue );
    util_thread_starter_start( tts, &thread_handles[loop], loop, lp, thread_rapid_enqueuer_and_dequeuer, ts+loop );
    loop++;
  }

  util_thread_starter_run( tts );

  for( loop = 0 ; loop < number_logical_processors ; loop++ )
    test_pal_thread_wait( thread_handles[loop] );

  util_thread_starter_delete( tts );

  free( thread_handles );

  LFDS700_MISC_BARRIER_LOAD;

  lfds700_queue_query( &qs, LFDS700_QUEUE_QUERY_SINGLETHREADED_VALIDATE, &vi, &dvs );

  // TRD : now check results
  per_thread_counters = util_malloc_wrapper( sizeof(lfds700_pal_uint_t) * number_logical_processors );

  for( loop = 0 ; loop < number_logical_processors ; loop++ )
    *(per_thread_counters+loop) = 0;

  /* TRD : dequeue every element; for each thread, counters must appear in increasing order with no gaps
           the first counter seen for a given thread establishes that thread's baseline
  */

  while( dvs == LFDS700_MISC_VALIDITY_VALID and lfds700_queue_dequeue(&qs, &qe, &ps) )
  {
    te = LFDS700_QUEUE_GET_VALUE_FROM_ELEMENT( *qe );

    if( te->thread_number >= number_logical_processors )
    {
      dvs = LFDS700_MISC_VALIDITY_INVALID_TEST_DATA;
      break;
    }

    if( per_thread_counters[te->thread_number] == 0 )
      per_thread_counters[te->thread_number] = te->counter;

    if( te->counter > per_thread_counters[te->thread_number] )
      dvs = LFDS700_MISC_VALIDITY_INVALID_MISSING_ELEMENTS;

    if( te->counter < per_thread_counters[te->thread_number] )
      dvs = LFDS700_MISC_VALIDITY_INVALID_ADDITIONAL_ELEMENTS;

    if( te->counter == per_thread_counters[te->thread_number] )
      per_thread_counters[te->thread_number]++;
  }

  free( per_thread_counters );

  lfds700_queue_cleanup( &qs, NULL );

  util_aligned_free( te_array );

  free( ts );

  internal_display_test_result( 1, "queue", dvs );

  return;
}
static test_pal_thread_return_t TEST_PAL_CALLING_CONVENTION thread_rapid_enqueuer_and_dequeuer( void *util_thread_starter_thread_state )
{
  lfds700_pal_uint_t
    time_loop = 0;

  struct lfds700_misc_prng_state
    ps;

  struct lfds700_queue_element
    *qe;

  struct test_element
    *te;

  struct test_state
    *ts;

  struct util_thread_starter_thread_state
    *tsts;

  time_t
    current_time,
    start_time;

  LFDS700_MISC_MAKE_VALID_ON_CURRENT_LOGICAL_CORE_INITS_COMPLETED_BEFORE_NOW_ON_ANY_OTHER_LOGICAL_CORE;

  assert( util_thread_starter_thread_state != NULL );

  tsts = (struct util_thread_starter_thread_state *) util_thread_starter_thread_state;
  ts = (struct test_state *) tsts->thread_user_state;

  lfds700_misc_prng_init( &ps );

  util_thread_starter_ready_and_wait( tsts );

  current_time = start_time = time( NULL );

  while( current_time < start_time + TEST_DURATION_IN_SECONDS )
  {
    lfds700_queue_dequeue( ts->qs, &qe, &ps );
    te = LFDS700_QUEUE_GET_VALUE_FROM_ELEMENT( *qe );

    te->thread_number = ts->thread_number;
    te->counter = ts->counter++;

    LFDS700_QUEUE_SET_VALUE_IN_ELEMENT( *qe, te );
    lfds700_queue_enqueue( ts->qs, qe, &ps );

    // TRD : reading the clock is expensive, so we only call time() every TIME_LOOP_COUNT iterations
    if( time_loop++ == TIME_LOOP_COUNT )
    {
      time_loop = 0;
      time( &current_time );
    }
  }

  LFDS700_MISC_BARRIER_STORE;

  lfds700_misc_force_store();

  return( (test_pal_thread_return_t) EXIT_SUCCESS );
}
void test_lfds700_queue_dequeuing( struct lfds700_list_asu_state *list_of_logical_processors, lfds700_pal_uint_t memory_in_megabytes )
{
  enum lfds700_misc_validity
    dvs = LFDS700_MISC_VALIDITY_VALID;

  lfds700_pal_uint_t
    loop,
    number_elements_with_dummy_element,
    number_elements_without_dummy_element,
    number_logical_processors;

  struct lfds700_list_asu_element
    *lasue;

  struct lfds700_misc_prng_state
    ps;

  struct lfds700_queue_state
    qs;

  struct lfds700_misc_validation_info
    vi = { 0, 0 };

  struct test_pal_logical_processor
    *lp;

  struct util_thread_starter_state
    *tts;

  struct test_element
    *te_array;

  struct test_state
    *ts;

  test_pal_thread_state_t
    *thread_handles;

  assert( list_of_logical_processors != NULL );
  // TRD : memory_in_megabytes can be any value in its range

  /* TRD : create a queue and fill it with as many elements as the given memory allows

           use a single thread to enqueue every element
           each element's user data is an incrementing counter

           then run one thread per CPU
           where each thread busy-works, dequeuing

           when an element is dequeued, we check (on a per-thread basis) that the
           value dequeued is greater than the value previously dequeued

           note we have no variation in the test for CAS+GC vs DWCAS
           this is because all we do is dequeue
           what we actually want to stress test is the queue
           not CAS
           so it's better to let the dequeue run as fast as possible
  */
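
  /* TRD : thread_simple_dequeuer is not shown in this excerpt
           a minimal sketch of the per-thread check it presumably performs
           (local names here are illustrative, not from the real source)

             while( lfds700_queue_dequeue(ts->qs, &qe, &ps) )
             {
               value = (lfds700_pal_uint_t) LFDS700_QUEUE_GET_VALUE_FROM_ELEMENT( *qe );

               if( seen_first_element == 1 and value <= previous_value )
                 ts->error_flag = RAISED;

               previous_value = value;
               seen_first_element = 1;
             }
  */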

  internal_display_test_name( "Dequeuing" );

  lfds700_list_asu_query( list_of_logical_processors, LFDS700_LIST_ASU_QUERY_GET_POTENTIALLY_INACCURATE_COUNT, NULL, (void **) &number_logical_processors );

  lfds700_misc_prng_init( &ps );

  number_elements_with_dummy_element = ( memory_in_megabytes * ONE_MEGABYTE_IN_BYTES ) / sizeof(struct test_element);
  number_elements_without_dummy_element = number_elements_with_dummy_element - 1;

  te_array = util_aligned_malloc( sizeof(struct test_element) * number_elements_with_dummy_element, LFDS700_PAL_ATOMIC_ISOLATION_IN_BYTES );

  lfds700_queue_init_valid_on_current_logical_core( &qs, &(te_array+number_elements_without_dummy_element)->qe, &ps, NULL );

  for( loop = 0 ; loop < number_elements_without_dummy_element ; loop++ )
  {
    LFDS700_QUEUE_SET_VALUE_IN_ELEMENT( (te_array+loop)->qe, loop );
    lfds700_queue_enqueue( &qs, &(te_array+loop)->qe, &ps );
  }

  ts = util_malloc_wrapper( sizeof(struct test_state) * number_logical_processors );

  for( loop = 0 ; loop < number_logical_processors ; loop++ )
  {
    (ts+loop)->qs = &qs;
    (ts+loop)->error_flag = LOWERED;
  }

  thread_handles = util_malloc_wrapper( sizeof(test_pal_thread_state_t) * number_logical_processors );

  util_thread_starter_new( &tts, number_logical_processors );

  LFDS700_MISC_BARRIER_STORE;

  lfds700_misc_force_store();

  loop = 0;
  lasue = NULL;

  while( LFDS700_LIST_ASU_GET_START_AND_THEN_NEXT(*list_of_logical_processors, lasue) )
  {
    lp = LFDS700_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue );
    util_thread_starter_start( tts, &thread_handles[loop], loop, lp, thread_simple_dequeuer, ts+loop );
    loop++;
  }

  util_thread_starter_run( tts );

  for( loop = 0 ; loop < number_logical_processors ; loop++ )
    test_pal_thread_wait( thread_handles[loop] );

  util_thread_starter_delete( tts );

  free( thread_handles );

  LFDS700_MISC_BARRIER_LOAD;

  // TRD : check queue is empty
  lfds700_queue_query( &qs, LFDS700_QUEUE_QUERY_SINGLETHREADED_VALIDATE, &vi, &dvs );

  // TRD : check for raised error flags
  for( loop = 0 ; loop < number_logical_processors ; loop++ )
    if( (ts+loop)->error_flag == RAISED )
      dvs = LFDS700_MISC_VALIDITY_INVALID_TEST_DATA;

  free( ts );

  util_aligned_free( te_array );

  lfds700_queue_cleanup( &qs, NULL );

  internal_display_test_result( 1, "queue", dvs );

  return;
}
libshared_pal_thread_return_t LIBSHARED_PAL_THREAD_CALLING_CONVENTION libbenchmark_benchmark_queue_umm_liblfds700_lockfree_enqueue1_dequeue1_thread( void *libbenchmark_threadset_per_thread_state )
{
  int long long unsigned
    current_time = 0,
    end_time,
    time_units_per_second;

  struct lfds700_misc_prng_state
    ps;

  lfds710_pal_uint_t
    operation_count = 0,
    time_loop = 0;

  struct lfds700_queue_element
    *qe;

  struct lfds700_queue_state
    *qs;

  struct libbenchmark_benchmark_queue_umm_liblfds700_lockfree_enqueue1_dequeue1_overall_benchmark_state
    *obs;

  struct libbenchmark_benchmark_queue_umm_liblfds700_lockfree_enqueue1_dequeue1_per_thread_benchmark_state
    *ptbs;

  struct libbenchmark_threadset_per_thread_state
    *pts;

  LFDS710_MISC_BARRIER_LOAD;

  LFDS710_PAL_ASSERT( libbenchmark_threadset_per_thread_state != NULL );

  pts = (struct libbenchmark_threadset_per_thread_state *) libbenchmark_threadset_per_thread_state;

  ptbs = LIBBENCHMARK_THREADSET_PER_THREAD_STATE_GET_USERS_PER_THREAD_STATE( *pts );
  obs = LIBBENCHMARK_THREADSET_PER_THREAD_STATE_GET_USERS_OVERALL_STATE( *pts );
  qs = obs->qs;

  lfds700_misc_prng_init( &ps );

  LIBBENCHMARK_PAL_TIME_UNITS_PER_SECOND( &time_units_per_second );

  libbenchmark_threadset_thread_ready_and_wait( pts );

  LIBBENCHMARK_PAL_GET_HIGHRES_TIME( &current_time );

  end_time = current_time + time_units_per_second * libbenchmark_globals_benchmark_duration_in_seconds;

  while( current_time < end_time )
  {
    // TRD : the benchmarked operation pair; dequeue an element and immediately enqueue it again
    lfds700_queue_dequeue( qs, &qe, &ps );
    lfds700_queue_enqueue( qs, qe, &ps );
    operation_count++;

    if( time_loop++ == TIME_LOOP_COUNT )
    {
      time_loop = 0;
      LIBBENCHMARK_PAL_GET_HIGHRES_TIME( &current_time );
    }
  }

  ptbs->operation_count = operation_count;

  LFDS710_MISC_BARRIER_STORE;

  lfds710_misc_force_store();

  return LIBSHARED_PAL_THREAD_RETURN_CAST(RETURN_SUCCESS);
}
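/* TRD : for reference, the overall and per-thread benchmark states used above and below are
         presumably defined along these lines (a sketch inferred from usage; the real
         definitions live in the benchmark's header)

           struct libbenchmark_benchmark_queue_umm_liblfds700_lockfree_enqueue1_dequeue1_overall_benchmark_state
           {
             struct lfds700_queue_state
               *qs;
           };

           struct libbenchmark_benchmark_queue_umm_liblfds700_lockfree_enqueue1_dequeue1_per_thread_benchmark_state
           {
             lfds710_pal_uint_t
               operation_count;
           };
*/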
void libbenchmark_benchmark_queue_umm_liblfds700_lockfree_enqueue1_dequeue1_init( struct libbenchmark_topology_state *ts,
                                                                                  struct lfds710_list_aso_state *logical_processor_set,
                                                                                  struct libshared_memory_state *ms,
                                                                                  enum libbenchmark_topology_numa_mode numa_mode,
                                                                                  struct libbenchmark_threadset_state *tsets )
{
  struct lfds700_misc_prng_state
    ps;

  lfds710_pal_uint_t
    loop,
    number_logical_processors,
    number_logical_processors_in_numa_node,
    largest_number_logical_processors_in_numa_node = 0;

  struct lfds710_list_asu_element
    *lasue = NULL,
    *lasue_lp = NULL;

  struct libbenchmark_benchmark_queue_umm_liblfds700_lockfree_enqueue1_dequeue1_overall_benchmark_state
    *obs;

  struct libbenchmark_benchmark_queue_umm_liblfds700_lockfree_enqueue1_dequeue1_per_thread_benchmark_state
    *ptbs;

  struct lfds700_queue_element
    *qe;

  struct lfds700_queue_state
    *qs = NULL;

  struct libbenchmark_threadset_per_numa_state
    *pns,
    *largest_pns = NULL;

  struct libbenchmark_threadset_per_thread_state
    *pts;

  struct libbenchmark_topology_node_state
    *numa_node_for_lp;

  LFDS710_PAL_ASSERT( ts != NULL );
  LFDS710_PAL_ASSERT( logical_processor_set != NULL );
  LFDS710_PAL_ASSERT( ms != NULL );
  // TRD : numa_mode can be any value in its range
  LFDS710_PAL_ASSERT( tsets != NULL );

  lfds700_misc_prng_init( &ps );

  obs = libshared_memory_alloc_from_most_free_space_node( ms, sizeof(struct libbenchmark_benchmark_queue_umm_liblfds700_lockfree_enqueue1_dequeue1_overall_benchmark_state), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );

  libbenchmark_threadset_init( tsets, ts, logical_processor_set, ms, libbenchmark_benchmark_queue_umm_liblfds700_lockfree_enqueue1_dequeue1_thread, NULL );

  switch( numa_mode )
  {
    case LIBBENCHMARK_TOPOLOGY_NUMA_MODE_SMP:
      qs = libshared_memory_alloc_from_most_free_space_node( ms, sizeof(struct lfds700_queue_state), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
      lfds710_list_aso_query( logical_processor_set, LFDS710_LIST_ASO_QUERY_GET_POTENTIALLY_INACCURATE_COUNT, NULL, (void *) &number_logical_processors );
      qe = libshared_memory_alloc_from_most_free_space_node( ms, sizeof(struct lfds700_queue_element) * (number_logical_processors+1), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
      // TRD : qe[0] becomes the queue's dummy element; the remaining elements are enqueued below
      lfds700_queue_init_valid_on_current_logical_core( qs, &qe[0], &ps, NULL );
      for( loop = 1 ; loop < (number_logical_processors+1) ; loop++ )
        lfds700_queue_enqueue( qs, &qe[loop], &ps );
      // TRD : now the per-thread states
      while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_thread_states,lasue) )
      {
        pts = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue );
        ptbs = libshared_memory_alloc_from_most_free_space_node( ms, sizeof(struct libbenchmark_benchmark_queue_umm_liblfds700_lockfree_enqueue1_dequeue1_per_thread_benchmark_state), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
        pts->users_per_thread_state = ptbs;
      }
    break;

    case LIBBENCHMARK_TOPOLOGY_NUMA_MODE_NUMA:
      // TRD : find the NUMA node with the most LPs in the thread set; the queue_umm state is allocated from that node
      while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_numa_states,lasue) )
      {
        pns = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue );

        lasue_lp = NULL;
        number_logical_processors_in_numa_node = 0;

        while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_thread_states,lasue_lp) )
        {
          pts = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue_lp );

          libbenchmark_topology_query( ts, LIBBENCHMARK_TOPOLOGY_QUERY_GET_NUMA_NODE_FOR_LOGICAL_PROCESSOR, pts->tns_lp, &numa_node_for_lp );

          if( LIBBENCHMARK_TOPOLOGY_NODE_GET_NUMA_ID(*numa_node_for_lp) == pns->numa_node_id )
            number_logical_processors_in_numa_node++;
        }

        if( number_logical_processors_in_numa_node > largest_number_logical_processors_in_numa_node )
        {
          largest_number_logical_processors_in_numa_node = number_logical_processors_in_numa_node;
          largest_pns = pns;
        }
      }

      qs = libshared_memory_alloc_from_specific_node( ms, largest_pns->numa_node_id, sizeof(struct lfds700_queue_state), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
      qe = libshared_memory_alloc_from_specific_node( ms, largest_pns->numa_node_id, sizeof(struct lfds700_queue_element), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
      lfds700_queue_init_valid_on_current_logical_core( qs, qe, &ps, NULL );

      /* TRD : for each NUMA node, alloc one element per thread in that NUMA node (from the current thread set)
               the dummy element comes from the same node as the queue_umm state and has already been allocated
      */

      lasue = NULL;

      while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_numa_states,lasue) )
      {
        pns = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue );

        /* TRD : for each NUMA node, figure out how many LPs in the current set are in that NUMA node
                 and then allocate the correct number of elements from this NUMA node (1 per LP)
        */

        lasue_lp = NULL;
        number_logical_processors_in_numa_node = 0;

        while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_thread_states,lasue_lp) )
        {
          pts = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue_lp );

          libbenchmark_topology_query( ts, LIBBENCHMARK_TOPOLOGY_QUERY_GET_NUMA_NODE_FOR_LOGICAL_PROCESSOR, pts->tns_lp, &numa_node_for_lp );

          if( LIBBENCHMARK_TOPOLOGY_NODE_GET_NUMA_ID(*numa_node_for_lp) == pns->numa_node_id )
            number_logical_processors_in_numa_node++;
        }

        qe = libshared_memory_alloc_from_specific_node( ms, pns->numa_node_id, sizeof(struct lfds700_queue_element) * number_logical_processors_in_numa_node, LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
        for( loop = 0 ; loop < number_logical_processors_in_numa_node ; loop++ )
          lfds700_queue_enqueue( qs, &qe[loop], &ps );
      }

      // TRD : now the per-thread states

      lasue = NULL;

      while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_thread_states,lasue) )
      {
        pts = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue );
        ptbs = libshared_memory_alloc_from_specific_node( ms, largest_pns->numa_node_id, sizeof(struct libbenchmark_benchmark_queue_umm_liblfds700_lockfree_enqueue1_dequeue1_per_thread_benchmark_state), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
        pts->users_per_thread_state = ptbs;
      }
    break;

    case LIBBENCHMARK_TOPOLOGY_NUMA_MODE_NUMA_BUT_NOT_USED:
      // TRD : find the NUMA node with the most LPs in the thread set; the queue_umm state is allocated from that node
      while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_numa_states,lasue) )
      {
        pns = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue );

        lasue_lp = NULL;
        number_logical_processors_in_numa_node = 0;

        while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_thread_states,lasue_lp) )
        {
          pts = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue_lp );

          libbenchmark_topology_query( ts, LIBBENCHMARK_TOPOLOGY_QUERY_GET_NUMA_NODE_FOR_LOGICAL_PROCESSOR, pts->tns_lp, &numa_node_for_lp );

          if( LIBBENCHMARK_TOPOLOGY_NODE_GET_NUMA_ID(*numa_node_for_lp) == pns->numa_node_id )
            number_logical_processors_in_numa_node++;
        }

        if( number_logical_processors_in_numa_node > largest_number_logical_processors_in_numa_node )
        {
          largest_number_logical_processors_in_numa_node = number_logical_processors_in_numa_node;
          largest_pns = pns;
        }
      }

      qs = libshared_memory_alloc_from_specific_node( ms, largest_pns->numa_node_id, sizeof(struct lfds700_queue_state), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
      qe = libshared_memory_alloc_from_specific_node( ms, largest_pns->numa_node_id, sizeof(struct lfds700_queue_element), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
      lfds700_queue_init_valid_on_current_logical_core( qs, qe, &ps, NULL );

      /* TRD : for each NUMA node, alloc one element per thread in that NUMA node (from the current thread set)
               the dummy element comes from the same node as the queue_umm state and has already been allocated
      */

      lasue = NULL;

      while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_numa_states,lasue) )
      {
        pns = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue );

        /* TRD : for each NUMA node, figure out how many LPs in the current set are in that NUMA node
                 and then allocate the correct number of elements from this NUMA node (1 per LP)
        */

        lasue_lp = NULL;
        number_logical_processors_in_numa_node = 0;

        while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_thread_states,lasue_lp) )
        {
          pts = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue_lp );

          libbenchmark_topology_query( ts, LIBBENCHMARK_TOPOLOGY_QUERY_GET_NUMA_NODE_FOR_LOGICAL_PROCESSOR, pts->tns_lp, &numa_node_for_lp );

          if( LIBBENCHMARK_TOPOLOGY_NODE_GET_NUMA_ID(*numa_node_for_lp) == pns->numa_node_id )
            number_logical_processors_in_numa_node++;
        }

        // TRD : everything allocates from the queue_umm state NUMA node
        qe = libshared_memory_alloc_from_specific_node( ms, largest_pns->numa_node_id, sizeof(struct lfds700_queue_element) * number_logical_processors_in_numa_node, LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
        for( loop = 0 ; loop < number_logical_processors_in_numa_node ; loop++ )
          lfds700_queue_enqueue( qs, &qe[loop], &ps );
      }

      // TRD : now the per-thread states

      lasue = NULL;

      while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_thread_states,lasue) )
      {
        pts = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue );
        ptbs = libshared_memory_alloc_from_specific_node( ms, largest_pns->numa_node_id, sizeof(struct libbenchmark_benchmark_queue_umm_liblfds700_lockfree_enqueue1_dequeue1_per_thread_benchmark_state), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
        pts->users_per_thread_state = ptbs;
      }
    break;
  }

  obs->qs = qs;
  tsets->users_threadset_state = obs;

  return;
}