Beispiel #1
0
/* Wake `thread`, which is either idle or blocked waiting for work.
 *
 * An atomic compare-and-exchange attempts to move thread->sync from
 * PMP_SYNC_IDLE to PMP_SYNC_UNBLOCKED. The value it returns is treated as
 * the state that was observed:
 *   - PMP_SYNC_BLOCKED: the state is reset to PMP_SYNC_IDLE and SIGPMP is
 *     delivered to the thread (via pthread_kill, kill or tkill depending
 *     on the build configuration). Failure to deliver is fatal.
 *   - anything else: no signal is sent.
 * Finding PMP_SYNC_UNBLOCKED already set is asserted never to happen.
 */
static inline void __pmp_thread_wake (pmp_thread_t *thread)
{
  const int32_t observed = __pmp_atomic_cmpxchg32(&thread->sync,
                                                  PMP_SYNC_IDLE,
                                                  PMP_SYNC_UNBLOCKED);
  assert(observed != PMP_SYNC_UNBLOCKED);

  /* Fast path: the thread was not blocked, so no signal is needed. */
  if (observed != PMP_SYNC_BLOCKED) {
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d is woken\n",
                thread->global_id);
    return;
  }

  __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d is being signaled\n",
              thread->global_id);

  /* Reset the state before the signal is sent. */
  thread->sync = PMP_SYNC_IDLE;

  assert(thread->tid != -1);

  __pmp_sample(PMP_PROFILE_THREAD_RESCHEDULE);

#if defined(PMP_USE_PTHREAD_SIGNALS)
  if (pthread_kill(thread->pthread_id, SIGPMP) != 0) {
    __pmp_fatal("unable to wake thread using pthread_kill\n");
  }
#elif defined(PMP_NO_NPTL)
  if (kill(thread->tid, SIGPMP) != 0) {
    __pmp_fatal("unable to wake thread using kill\n");
  }
#else
  if (tkill(thread->tid, SIGPMP) != 0) {
    __pmp_fatal("unable to wake thread using tkill\n");
  }
#endif
}
Beispiel #2
0
/* Initialise the runtime's thread record for the main thread.
 *
 * Pops an id from the manager's id stack (asserted to be 0), records the
 * calling thread's pthread id and tid, sets local_id to 0, fills in any
 * configured TLS shortcuts, blocks the signals in mask_block_sigpmp,
 * stores the record under the manager's pthread key and finally binds the
 * thread. Failure to set the signal mask or the thread-specific data is
 * fatal.
 */
void __pmp_thread_create_main (void)
{
  pmp_thread_t *main_thread = __pmp_get_main_thread();
  int global_id = __pmp_idstack_pop(&__pmp_manager.idstack);

  assert(global_id == 0);

  main_thread->pthread_id = pthread_self();
#if defined(PMP_NO_NPTL)
  main_thread->tid = getpid();
#else
  main_thread->tid = gettid();
#endif
  main_thread->local_id = 0;

#if !defined(PMP_NO_TLS)
#if defined(PMP_TLS_THREAD)
  __pmp_tls_current_thread = main_thread;
#endif
#if defined(PMP_TLS_LOCAL_ID)
  __pmp_tls_current_local_id = 0;
#endif
#if defined(PMP_TLS_GLOBAL_ID)
  __pmp_tls_current_global_id = main_thread->global_id;
#endif
#endif

  /* Both configurations share the same failure handling. */
#if defined(PMP_USE_PTHREAD_SIGNALS)
  if (pthread_sigmask(SIG_BLOCK, &__pmp_manager.mask_block_sigpmp,
                      NULL) != 0)
#else
  if (sigprocmask(SIG_BLOCK, &__pmp_manager.mask_block_sigpmp, NULL) != 0)
#endif
  {
    __pmp_fatal("unable to set thread-specific sigmask\n");
  }

  if (pthread_setspecific(__pmp_manager.thread_key,
                          (void *) main_thread) != 0) {
    __pmp_fatal("unable to set thread-specific data\n");
  }

  /* early master bind */
  __pmp_thread_bind(main_thread);

  __pmp_debug(PMP_DEBUG_THREAD, "created main thread global_id %d\n",
              main_thread->global_id);

  __pmp_debug(PMP_DEBUG_THREAD,
              "__pmp_thread_create_main: tid=%d, pthread_id=0x%08x "
              "global_id=%d, local_id=%d\n",
              (int) main_thread->tid, (int) main_thread->pthread_id,
              (int) main_thread->global_id, (int) main_thread->local_id);
}
Beispiel #3
0
/* Record a profiling sample for the scheduling kind `sched`.
 *
 * Only active when PMP_PROFILE is defined; otherwise the function body is
 * empty. With profiling enabled, an unrecognised `sched` value is reported
 * through __pmp_fatal.
 */
static inline void __pmp_scheduler_sample (int sched)
{
#ifdef PMP_PROFILE
  switch (sched) {
    case PMP_SCHED_STATIC:
      __pmp_sample(PMP_PROFILE_SCHED_STATIC);
      break;

    case PMP_SCHED_STATIC_EVEN:
      __pmp_sample(PMP_PROFILE_SCHED_STATIC_EVEN);
      break;

    case PMP_SCHED_DYNAMIC:
      __pmp_sample(PMP_PROFILE_SCHED_DYNAMIC);
      break;

    case PMP_SCHED_GUIDED:
      __pmp_sample(PMP_PROFILE_SCHED_GUIDED);
      break;

    case PMP_SCHED_ORDERED_STATIC:
      __pmp_sample(PMP_PROFILE_SCHED_ORDERED_STATIC);
      break;

    case PMP_SCHED_ORDERED_STATIC_EVEN:
      __pmp_sample(PMP_PROFILE_SCHED_ORDERED_STATIC_EVEN);
      break;

    case PMP_SCHED_ORDERED_DYNAMIC:
      __pmp_sample(PMP_PROFILE_SCHED_ORDERED_DYNAMIC);
      break;

    case PMP_SCHED_ORDERED_GUIDED:
      __pmp_sample(PMP_PROFILE_SCHED_ORDERED_GUIDED);
      break;

    default:
      __pmp_fatal("unknown dynamic scheduling type %d\n", sched);
      break;
  }
#endif
}
Beispiel #4
0
/* Hand the calling thread its next chunk of the loop attached to it.
 *
 * global_id  id used to look up the thread record via __pmp_get_thread.
 * lowerp     out: lower bound of the assigned chunk.
 * upperp     out: upper bound of the assigned chunk.
 * incp       out: loop increment.
 *
 * The out parameters are written only when a chunk is assigned.
 *
 * Returns 1 when a chunk was assigned (thread->ticket_number is set to the
 * chunk's lower bound and the thread's iteration counter is advanced), or
 * 0 when this thread has no further iterations, in which case
 * __pmp_loop_free(thread) has already been called.
 *
 * The schedule kind is read from loop->sched; an unknown kind is reported
 * through __pmp_fatal.
 */
static inline int __pmp_schedule_next (int global_id, int64_t *lowerp,
                                       int64_t *upperp, int64_t *incp)
{
  pmp_thread_t *thread = __pmp_get_thread(global_id);
  int team_size = __pmp_get_team_size(thread->team);
  int64_t iteration = thread->iteration;
  pmp_local_id_t local_id = thread->local_id;
  pmp_loop_t *loop = thread->loop;

  assert(loop != NULL);
  assert(local_id < team_size);

  if (team_size == 1) {
    /* Single-thread team: the first call hands out the whole loop and the
     * second call ends it, regardless of the schedule kind. */
    if (iteration == 0) {
      *lowerp = loop->lower;
      *upperp = loop->upper;
      *incp = loop->inc;
      thread->ticket_number = loop->lower;
      thread->iteration = 1;
      __pmp_loop_analyser(thread, loop->sched, global_id, local_id,
                          loop->lower, loop->upper,
                          *lowerp, *upperp, *incp, 0, 0);
      return 1;
    }
    else {
      assert(iteration == 1);
      __pmp_loop_free(thread);
      return 0;
    }
  }
  else {
    int     sched = loop->sched;
    int64_t lower = loop->lower;
    int64_t upper = loop->upper;
    int64_t inc   = loop->inc;
    int64_t chunk = loop->chunk;
    switch (sched) {
      case PMP_SCHED_STATIC:
      case PMP_SCHED_ORDERED_STATIC: {
        /* NOTE: setting a small value of chunk causes (unnecessary) iteration
         * through this code. If the chunk is ignored, the code degenerates
         * into the static even case (which is the default). */
        /* size is the trip count; size_per_thread is that count divided by
         * the team size (rounded up) and then scaled by inc, so it is a
         * distance in induction-variable units. The thread's slice is
         * walked in pieces of `chunk` iterations, one piece per call. */
        /* NOTE(review): LOOPMIN is defined elsewhere; it presumably selects
         * whichever bound is reached first in the direction of inc. */
        int64_t size = (upper - lower) / inc + 1;
        int64_t size_per_thread = ((size - 1) / team_size + 1) * inc;
        int64_t thread_lower = lower + (local_id * size_per_thread);
        int64_t thread_upper = thread_lower + size_per_thread - inc;
        int64_t this_lower = thread_lower + (iteration * chunk * inc);
        int64_t this_upper = this_lower + (chunk - 1) * inc;
        thread_upper = LOOPMIN(inc, thread_upper, upper);
        this_upper = LOOPMIN(inc, this_upper, thread_upper);
        if ((inc >= 0) ? (this_lower > thread_upper) : 
                         (this_lower < thread_upper)) {
          __pmp_loop_free(thread);
          return 0;
	}
        else {
          *incp = inc;
          *lowerp = this_lower;
          *upperp = this_upper;
          thread->ticket_number = this_lower;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id, lower, upper,
                              *lowerp, *upperp, *incp, 0, 0);
          return 1;
	}
        /* NOT REACHED */
        break;
      }
      case PMP_SCHED_STATIC_EVEN:
      case PMP_SCHED_ORDERED_STATIC_EVEN: {
        /* Each thread gets at most one chunk: it is computed on the first
         * call and the second call ends the loop for this thread. */
        if (iteration == 0) {
          int64_t size = (upper - lower) / inc + 1;
          int64_t thread_lower;
          int64_t thread_upper;
          if (!__pmp_get_param()->static_fair) {
            /* Equal-sized slices of ceil(size / team_size) iterations. */
            int64_t size_per_thread = ((size - 1) / team_size + 1) * inc;
            thread_lower = lower + (local_id * size_per_thread);
            thread_upper = thread_lower + size_per_thread - inc;
	  }
          else {
            /* "Fair" split: every thread gets floor(size / team_size)
             * iterations and the first `remainder` threads get one extra.
             * This local `chunk` shadows the outer one. */
            int64_t chunk = size / team_size;
            int64_t remainder = size - (chunk * team_size);
            int64_t index = MIN(local_id, remainder) * (chunk + 1);
            if (local_id > remainder) {
              index += (local_id - remainder) * chunk;
            }
            thread_lower = lower + (index * inc);
            chunk += (local_id < remainder);
            thread_upper = thread_lower + (chunk - 1) * inc;
	  }
          thread_upper = LOOPMIN(inc, thread_upper, upper);
          if ((inc >= 0) ? (thread_lower > thread_upper) : 
                           (thread_lower < thread_upper)) {
            __pmp_loop_free(thread);
            return 0;
	  }
	  else {
            *incp = inc;
            *lowerp = thread_lower;
            *upperp = thread_upper;
            thread->ticket_number = thread_lower;
            thread->iteration++;
            __pmp_loop_analyser(thread, sched, global_id, local_id,
                                lower, upper, *lowerp, *upperp, 
                                *incp, 0, 0);
            return 1;
	  }
	}
        else {
          assert(iteration == 1);
          __pmp_loop_free(thread);
	  return 0;
	}
        /* NOT REACHED */
        break;
      }
      case PMP_SCHED_DYNAMIC:
      case PMP_SCHED_ORDERED_DYNAMIC: {
        /* Claim the next `chunk` iterations by atomically advancing the
         * shared loop->current by stride; the value obtained from the
         * atomic add is used as this chunk's lower bound. */
        int64_t stride = inc * chunk;
#if __WORDSIZE == 64
        int64_t current = __pmp_atomic_xadd64(&loop->current, stride);
#else
        /* TODO: the atomic xadd64 is a problem for 32-bit compilation */
        /*       the workaround below is just to do a 32-bit atomic add */
        /* NOTE(review): this touches only 32 bits of the 64-bit
         * loop->current and truncates stride to 32 bits. */
        int64_t current;
        current = (int64_t) __pmp_atomic_xadd32((int32_t *) &loop->current,
                                                (int32_t) stride);
#endif
        if ((inc >= 0) ? (current > upper) : (current < upper)) {
          __pmp_loop_free(thread);
          return 0;
	}
	else {
          *incp = inc;
          *lowerp = current;
          *upperp = *lowerp + stride - inc;
          *upperp = LOOPMIN(inc, upper, *upperp);
          thread->ticket_number = current;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id, lower, upper,
                              *lowerp, *upperp, *incp, 0, 0);
          return 1;
	}
        /* NOT REACHED */
        break;
      }
      case PMP_SCHED_GUIDED:
      case PMP_SCHED_ORDERED_GUIDED: {
        /* NOTE: guided scheduling uses a heuristic to choose a good
         * chunk size to divide up the remaining iterations amongst
         * the team (subject to a minimum). An exact implementation of 
         * this would require a lock on the loop data. However, the
         * heuristic can be approximated using (possibly) stale values 
         * and this should be good enough. The value of "remaining"
         * is monotonically decreasing. The worst that could happen
         * is that an update to loop->chunk is lost slightly unbalancing
         * the distribution. The most important point is that loop->current
         * is maintained atomically. */
        /* UPDATE: if cmpxchg64 is available then this is used to protect
         * the update of loop->chunk. This is fairly cunning, and makes
         * the chunk update more accurate in this case! */
        /* NOTE(review): both "remaining" estimates are differences of
         * induction-variable values and are not divided by inc, so they
         * look like iteration counts only when inc == 1 -- confirm whether
         * that is intended for the heuristic. */
        int64_t min_chunk = loop->min_chunk;
        int64_t remaining = upper - loop->current + 1;		 /* estimate */
        int64_t my_chunk = MAX(min_chunk, MIN(chunk, remaining));/* estimate */
        int64_t stride = inc * my_chunk;
#if __WORDSIZE == 64
        int64_t current = __pmp_atomic_xadd64(&loop->current, stride);
#else
        /* TODO: the atomic xadd64 is a problem for 32-bit compilation */
        /*       the workaround below is just to do a 32-bit atomic add */
        int64_t current = __pmp_atomic_xadd32((int32_t *) &loop->current,
                                              (int32_t) stride);
#endif
        assert(stride != 0);
#ifdef SUPER_DEBUG
        /* NOTE(review): the arguments after global_id are int64_t but the
         * format uses %d, and the first argument is a string where other
         * calls pass a PMP_DEBUG_* value -- verify before enabling. */
        if (Enabled_Libomp_Loop_Debug)
          __pmp_debug("LOOPS_DEBUG", "__pmp_schedule_next: global_id=%d, "
                      "remaining=%d, my_chunk=%d, stride=%d, current=%d\n",
                      global_id, remaining, my_chunk, stride, current);
#endif

        if ((inc >= 0) ? (current > upper) : (current < upper)) {
          __pmp_loop_free(thread);
          return 0;
	}
	else {
          /* Recompute the shared chunk size for later callers:
           * ceil(remaining / (team_size * guided_chunk_divisor)), clamped
           * to the range [min_chunk, guided_chunk_max] with min_chunk
           * applied last. */
          pmp_param_t *param = __pmp_get_param();
          int64_t my_upper = LOOPMIN(inc, upper, current + stride - inc);
          int64_t new_chunk;
          int64_t divisor;
          remaining = upper - my_upper;				/* estimate */
          divisor = team_size * param->guided_chunk_divisor;
          new_chunk = (remaining + divisor - 1) / divisor;
          new_chunk = MIN(param->guided_chunk_max, new_chunk);
          new_chunk = MAX(min_chunk, new_chunk);
#if __WORDSIZE == 64
          /* Install new_chunk only if loop->chunk still holds the value
           * read at the top of this call; the result is ignored. */
          (void) __pmp_atomic_cmpxchg64(&loop->chunk, chunk, new_chunk);
#else
          loop->chunk = new_chunk;				/* estimate */
#endif
          *incp = inc;
          *lowerp = current;
          *upperp = my_upper;
          thread->ticket_number = current;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id, lower, upper,
                              *lowerp, *upperp, *incp, 0, 0);
          return 1;
	}
        /* NOT REACHED */
        break;
      }
      default: {
        __pmp_fatal("unknown dynamic scheduling type %d\n", sched);
        break;
      }
    }
    /* NOT REACHED */
    /* Every recognised case returns above; this is reached only if
     * __pmp_fatal returns in the default case. */
    assert(0);
    __pmp_loop_free(thread);
    return 0;
  }
  /* NOT REACHED */
}
Beispiel #5
0
/* Compute the calling thread's share of a statically scheduled loop.
 *
 * global_id  id used to look up the calling thread's record.
 * sched      PMP_SCHED_STATIC or PMP_SCHED_STATIC_EVEN. In a team of more
 *            than one thread any other value is reported via __pmp_fatal.
 * lowerp     in: lower bound of the whole loop; out: lower bound of this
 *            thread's first chunk (left untouched for a team of one).
 * upperp     in: upper bound of the whole loop; out: upper bound of this
 *            thread's first chunk (left untouched for a team of one).
 * stridep    out: distance from one of this thread's chunks to its next.
 * inc        loop increment; used as a divisor for static-even, so it
 *            must be non-zero.
 * chunk      requested chunk size. Only meaningful for PMP_SCHED_STATIC;
 *            for static-even scheduling it is undefined on entry and is
 *            recomputed here.
 *
 * When the thread's share lies entirely outside the loop bounds, the
 * bounds returned describe a loop that does not iterate.
 */
static inline void __pmp_static_init (int global_id, int sched,
                                      int64_t *lowerp, int64_t *upperp, 
                                      int64_t *stridep,
                                      int64_t inc, int64_t chunk)
{
  pmp_thread_t *thread = __pmp_get_thread(global_id);
  int team_size = __pmp_get_team_size(thread->team);
  int64_t loop_lower = *lowerp;
  int64_t loop_upper = *upperp;
  int64_t lower;
  int64_t upper;
  
  assert(team_size > 0);

  /* Validate chunk only where it carries a caller-supplied value. For
   * static-even scheduling chunk is undefined on entry, so testing it
   * there could emit a spurious warning; that path overwrites chunk
   * before using it. */
  if (sched == PMP_SCHED_STATIC && chunk <= 0) {
    if (thread->global_id == 0)
      __pmp_warning("Chunk size is non-positive, set to default '1'\n");
    chunk = 1;
  }

  if (team_size == 1) {
    /* A single thread runs the whole loop: the bounds stay as passed in
     * and the stride steps past the end of the loop. */
    *stridep = (inc > 0) ? (loop_upper - loop_lower + 1) : 
                           (loop_upper - loop_lower - 1);
  }
  else {
    pmp_local_id_t local_id = thread->local_id;
    int64_t stride;
    switch (sched) {
      case PMP_SCHED_STATIC_EVEN: {
        int64_t size = (loop_upper - loop_lower) / inc + 1;
        assert(size >= 0);
        if (!__pmp_get_param()->static_fair) {
          /* The size is divided by the team_size and rounded up to give
           * the chunk size. Chunks of this size are assigned to threads
           * in increased local_id order. If the division was not exact
           * then the last thread will have fewer iterations, and possibly
           * none at all. */
          chunk = (size + team_size - 1) / team_size;
          lower = loop_lower + (local_id * chunk * inc);
        }
        else {
          /* The size is divided by the team_size and rounded down to 
           * give the chunk. Each thread will have at least this many
           * iterations. If the division was not exact then the remainder
           * iterations are scheduled across the threads in increasing
           * thread order. Note that the difference between the minimum
           * and maximum number of iterations assigned to the threads
           * across the team is at most 1. The maximum number of iterations
           * assigned to a thread (the worst case path through the schedule)
           * is the same as for default behavior. */
          int64_t remainder;
          int64_t index;
          chunk = size / team_size;
          remainder = size - (chunk * team_size);
          index = MIN(local_id, remainder) * (chunk + 1);
          if (local_id > remainder) {
            index += (local_id - remainder) * chunk;
          }
          lower = loop_lower + (index * inc);
          chunk += (local_id < remainder);
        }

        if ((inc >= 0) ? (lower <= loop_upper) : (lower >= loop_upper)) {
          upper = LOOPMIN(inc, lower + (chunk - 1) * inc, loop_upper);
          /* One chunk per thread: the stride spans the whole loop. */
          stride = size * inc;
        }
        else {
          /* If the entire set of iterations falls out of the loop bounds
           * then arrange for a non-iterating loop which will not trigger
           * the LASTPRIVATE check made by the compiler. This means that
           * the final value of the loop induction variable must not exceed
           * the loop upper bound. */
          lower = loop_lower - inc;
          upper = lower - inc;
          stride = inc;
        }
        __pmp_loop_analyser(thread, sched, global_id, local_id,
                            loop_lower, loop_upper,
                            lower, upper, inc, chunk, stride);
        break;
      }
      case PMP_SCHED_STATIC: {
        /* Chunks of `chunk` iterations are dealt out to the threads in
         * local_id order, so a thread's successive chunks are team_size
         * chunks apart. */
        stride = chunk * inc;
        lower = loop_lower + (local_id * stride);
        if ((inc >= 0) ? (lower <= loop_upper) : (lower >= loop_upper)) {
          upper = LOOPMIN(inc, lower + (chunk - 1) * inc, loop_upper);
          stride *= team_size;
        }
        else {
          /* If the entire set of iterations falls out of the loop bounds
           * then arrange for a non-iterating loop which will not trigger
           * the LASTPRIVATE check made by the compiler. This means that
           * the final value of the loop induction variable must not exceed
           * the loop upper bound. */
          lower = loop_lower - inc;
          upper = lower - inc;
          stride = inc;
        }
        __pmp_loop_analyser(thread, sched, global_id, local_id,
                            loop_lower, loop_upper,
                            lower, upper, inc, chunk, stride);
        break;
      }
      default: {
        __pmp_fatal("unknown static scheduling type %d\n", sched);
        /* Give the outputs defined values in case __pmp_fatal returns. */
        stride = 0;
        lower = loop_lower;
        upper = loop_upper;
        break;
      }
    }
    *lowerp = lower;
    *upperp = upper;
    *stridep = stride;
  }
  __pmp_scheduler_sample(sched);
}
Beispiel #6
0
/* Make sure the threadprivate copy for `global_id` exists.
 *
 * thdprv     address of the pointer to the per-variable table of copies;
 *            the table is created on first use and stored back here.
 * size       size in bytes of the threadprivate data.
 * data       the original, statically initialised data block.
 * global_id  index of the calling thread's slot in the table.
 *
 * When the runtime is disabled the table has a single slot that points
 * straight at `data`. Otherwise the table has PMP_MAX_THREADS slots and
 * each slot, on first use, receives a freshly allocated copy of `data`.
 * Allocation failures are reported through __pmp_fatal.
 */
void __ompc_get_thdprv (void ***thdprv, int64_t size,
                        void *data, int global_id)
{
  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_get_thdprv: thdprv=%p, size=%ld, "
              "data=%p, global_id=%d\n", thdprv, (long) size, data, global_id);
  __pmp_sample(PMP_PROFILE_OMPC_GET_THDPRV);

  if (__pmp_get_param()->disabled) {
    /* Reuse a table installed by an earlier call rather than allocating
     * a new one on every call, which would leak the previous table. */
    void **t = *thdprv;
    if (t == NULL) {
      t = (void **) calloc(1, sizeof(void *));
      if (t == NULL) {
        __pmp_fatal("failed to allocate thread private data\n");
      }
      *thdprv = t;
    }
    t[0] = data;
  }
  else {
    void **t = *thdprv;
    if (t == NULL) {
      /* TODO: can I reduce the size of this array? Note that it is indexed
       * by global_id and global_id's can be arbitrarily assigned to threads
       * in general, so this may be difficult. */
      void *t_new;
      void *t_cur;
      t = (void **) calloc(PMP_MAX_THREADS, sizeof(void *));
      if (t == NULL) {
        __pmp_fatal("failed to allocate thread private data\n");
      }
      t_new = (void *) t;
      t_cur = __pmp_atomic_cmpxchgptr((volatile voidptr_t *) thdprv, 
                                      NULL, t_new);
      if (t_cur != NULL) {
        /* This thread lost the race and another thread has already
         * installed a thdprv array. Simply back out this allocation
         * and use *thdprv. */
        free(t);
        t = (void **) t_cur;
      }
    }
    /* NOTE: global_id is not range-checked here; it must be below
     * PMP_MAX_THREADS, the length of the table allocated above. */
    if (t[global_id] == NULL) {
      /* The OpenMP 2.5 standard says:
       *
       * "Each copy of a threadprivate object is initialized once, in the manner
       * specified by the program, but at an unspecified point in the program
       * prior to the first reference to that copy."
       *
       * Since the initial values live in the statically allocated block of
       * memory passed to our "data" argument, the master thread needs to use
       * a dynamically allocated block, just as the additional threads do, so
       * that if it changes its copies of the variables before the program
       * enters the first parallel region, those changes have no effect on the
       * copies in the additional threads. Observation shows that the code
       * generator calls __ompc_get_thdprv from the serial portion of the
       * program, for the master thread, before it changes any values.
       *
       * Note the copying is done without synchronization, which is safe only
       * because we're copying statically initialized and subsequently
       * unchanged values: copying from the main thread would require a
       * barrier.
       */
      t[global_id] = (void *) malloc(size);
      if (t[global_id] == NULL) {
        __pmp_fatal("failed to allocate thread private data\n");
      }
      memcpy(t[global_id], data, size);
    }
  }
}
Beispiel #7
0
/* Start the OS thread that backs the runtime thread record `thread`.
 *
 * Records the calling thread as the creator, optionally maps an
 * inaccessible guard region of param->thread_guard_size bytes, starts a
 * pthread running __pmp_thread_run with `thread` as its argument, and
 * increments the manager's waiting_threads counter.
 *
 * A failed guard mapping only produces a warning. A failed pthread_create
 * is fatal, preceded by diagnostic warnings when more threads than the
 * initial team size have been allocated.
 */
void __pmp_thread_create (pmp_thread_t *thread)
{
  pmp_thread_t *creator = __pmp_get_current_thread();
  pthread_t pthread_id;
  int result;
  pmp_param_t *param = __pmp_get_param();

  thread->creator = creator;

  if (param->thread_guard_size > 0) {
    void *guard;
    /* NOTE: this lock is to give a better chance of the guard page 
     * allocation to immediately follow the pthread stack allocation. */
    __pmp_lock(thread->global_id, &__pmp_manager.pthread_create_lock);

    /* NOTE: it seems that mmap tends to allocate in an upwards direction
       so allocate the guard page first. */
    /* Anonymous mappings are requested with fd -1: some systems reject
     * any other descriptor when MAP_ANONYMOUS/MAP_ANON is given. */
    guard = mmap(NULL, param->thread_guard_size, PROT_NONE,
#if defined(BUILD_OS_DARWIN)
                 MAP_PRIVATE | MAP_ANON,
#else /* defined(BUILD_OS_DARWIN) */
                 MAP_PRIVATE | MAP_ANONYMOUS,
#endif /* defined(BUILD_OS_DARWIN) */
                 -1, 0);
    if (guard == MAP_FAILED) {
      __pmp_warning("unable to allocate a guard page of %ld bytes\n",
                    (long) param->thread_guard_size);
    }
    else {
      __pmp_debug(PMP_DEBUG_THREAD, "guard page allocated at address %p\n",
                  guard);
      thread->guard_page = guard;
    }
  }

  if ((result = pthread_create(&pthread_id, &__pmp_manager.pthread_attr,
                               __pmp_thread_run, thread)) != 0) {
    if (__pmp_manager.allocated_threads > param->initial_team_size) {
      __pmp_warning(
        "pthread_create failed when trying to allocate thread %d\n",
        __pmp_manager.allocated_threads);
      __pmp_warning(
        "note this is more than the initial number of threads (%d)\n",
        param->initial_team_size);
      /* On a 32-bit build, hint at stack size when the combined thread
       * stacks exceed 1 GiB. */
#if defined(BUILD_OS_DARWIN)
      if (sizeof(long) == 4)
#else /* defined(BUILD_OS_DARWIN) */
      if (__WORDSIZE == 32)
#endif /* defined(BUILD_OS_DARWIN) */
      {
        int64_t total_stack = ((int64_t) param->thread_stack_size) *
                              ((int64_t) __pmp_manager.allocated_threads);
        if (total_stack > 0x40000000LL) {
          __pmp_warning(
            "the failure may be due to excessive thread stack size\n");
          __pmp_warning(
            "try using a smaller setting for PSC_OMP_STACK_SIZE\n");
        }
      }
    }
    __pmp_fatal("unable to create thread (result code %d)\n", result);
  }

  /* Release the lock taken around the guard mapping and thread creation. */
  if (param->thread_guard_size > 0) {
    __pmp_unlock(thread->global_id, &__pmp_manager.pthread_create_lock);
  }

  __pmp_atomic_xadd32(&__pmp_manager.waiting_threads, 1);

  __pmp_debug(PMP_DEBUG_THREAD, "created thread global_id %d\n", 
              thread->global_id);
}
Beispiel #8
0
/* Start routine for runtime worker threads.
 *
 * arg is the pmp_thread_t record of the thread being started.
 *
 * The thread records its own pthread id and tid, fills in any configured
 * TLS shortcuts, blocks the signals in mask_block_sigpmp, stores its
 * record under the manager's pthread key, and then waits for its first
 * assignment. After that it loops: do the assigned work, join the team,
 * mark itself idle, wait for more work, and rebind.
 *
 * The loop has no exit, so the trailing debug message and `return NULL`
 * are currently unreachable.
 */
static void *__pmp_thread_run (void *arg)
{
  pmp_thread_t *thread = (pmp_thread_t *) arg;
  pmp_team_t *team;

  thread->pthread_id = pthread_self();
#ifdef PMP_NO_NPTL
  thread->tid = getpid();
#else
  thread->tid = gettid();
#endif

#ifndef PMP_NO_TLS
#ifdef PMP_TLS_THREAD
  __pmp_tls_current_thread = thread;
#endif
#ifdef PMP_TLS_LOCAL_ID
  __pmp_tls_current_local_id = thread->local_id;
#endif
#ifdef PMP_TLS_GLOBAL_ID
  __pmp_tls_current_global_id = thread->global_id;
#endif
#endif

#ifdef PMP_USE_PTHREAD_SIGNALS
  if (pthread_sigmask(SIG_BLOCK, &__pmp_manager.mask_block_sigpmp, 
                      NULL) != 0) {
    __pmp_fatal("unable to set thread-specific sigmask\n");
  }
#else
  if (sigprocmask(SIG_BLOCK, &__pmp_manager.mask_block_sigpmp, NULL) != 0) {
    __pmp_fatal("unable to set thread-specific sigmask\n");
  }
#endif

  if (pthread_setspecific(__pmp_manager.thread_key, (void *) thread) != 0) {
    __pmp_fatal("unable to set thread-specific data\n");
  }

  __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d is running\n",
              thread->global_id);

  /* Note that there is no synchronization between the creating thread and
   * the created thread until here. This is the point where the created
   * thread is assigned to do some work. The reason that this is sufficient
   * is because the __pmp_thread_wait/wake mechanism is "protected" by
   * the thread->sync value which has been pre-initialized. If the creator
   * gets to the wake first, then it will just swap in PMP_SYNC_UNBLOCKED
   * and its work is done. If it gets to the wake second, then the created
   * thread must have got there first and this guarantees that the other
   * thread fields will already be initialized by the created thread.
   *
   * With nested forking, there is the possibility that the creator thread
   * will be usurped by another forking thread (there is no lock between
   * creation of a thread and that thread being assigned to do work). This
   * works for the same reason as described above.
   */

  __pmp_thread_wait(thread);		/* wait for first assignment */

#ifdef PMP_NO_NPTL
  __pmp_shared_catch_segv(thread);	/* set up shared segv handler */
#else
  __pmp_catch_segv();			/* set up thread's segv handler */
#endif

  __pmp_thread_bind(thread);		/* bind to the assigned local_id */

  /* One pass per work assignment. */
  while (1) {

    __pmp_debug(PMP_DEBUG_THREAD,
                "__pmp_thread_run: thread tid=%d, pthread_id=0x%08x "
                "global_id=%d, local_id=%d\n",
                (int) thread->tid, (int) thread->pthread_id,
                (int) thread->global_id, (int) thread->local_id);

    /* The team is captured here, before the work runs, and this captured
     * value is what is passed to the join below. */
    team = thread->team;
    assert(team != NULL);
#ifndef PMP_NO_TLS
#ifdef PMP_TLS_LOCAL_ID
    __pmp_tls_current_local_id = thread->local_id;
#endif
#ifdef PMP_TLS_TEAM
    __pmp_tls_current_team = team;
#endif
#endif
    __pmp_memory_fence();
    __pmp_thread_work(thread);		/* do the work */
    __pmp_thread_worker_join(team);	/* wake up team master */
    __pmp_memory_fence();
    __pmp_thread_idle(thread);		/* thread is now idle */
    __pmp_thread_wait(thread);		/* wait for more work */
    __pmp_thread_bind(thread);		/* update binding */
  }

  /* Currently unreachable */

  __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d is exiting\n",
              thread->global_id);

  return NULL;
}