Ejemplo n.º 1
0
/*
 * Reserve threads for a team of up to nthreads members (master included).
 *
 * The shared counter __pmp_manager.waiting_threads is decremented by the
 * number of workers still wanted, and the value handed back by the xadd is
 * treated as the number of idle threads that were on offer. If that was
 * not enough, the shortfall is added back to the counter and the manager
 * is asked to create more threads; the loop then tries again unless the
 * manager reports 0.
 *
 * Before returning, __ompc_cur_numthreads is increased by the number of
 * workers obtained (the master is not counted there).
 *
 * Returns the size of the team actually obtained, counting the master.
 */
int __pmp_thread_acquire (int nthreads)
{
  int acquired = 1;	/* start at 1: the master is always a member */

  /* NOTE - a single pass through this loop is the expected case */

  while (acquired < nthreads) {
    int wanted = nthreads - acquired;
    int available = __pmp_atomic_xadd32(&__pmp_manager.waiting_threads,
                                        -wanted);
    int shortfall;

    if (available >= wanted) {
      /* enough idle threads were on offer to satisfy the whole request */
      acquired += wanted;
      break;
    }

    /* take what was there and hand the unsatisfied part back */
    acquired += available;
    shortfall = wanted - available;
    __pmp_atomic_xadd32(&__pmp_manager.waiting_threads, shortfall);

    /* stop trying once the manager reports that it created nothing */
    if (__pmp_manager_create_more_threads(shortfall) == 0) {
      break;
    }
  }

  __pmp_atomic_add32(&__ompc_cur_numthreads, acquired - 1);

  __pmp_debug(PMP_DEBUG_THREAD, "acquired %d out of %d threads\n",
              acquired, nthreads);

  return acquired;
}
Ejemplo n.º 2
0
/*
 * Team-wide barrier for the calling thread.
 *
 * Each arriving thread atomically decrements team->barrier_count. Every
 * arrival except the last blocks in __pmp_thread_wait(); the last arrival
 * resets the counter to the team size and wakes every other member.
 *
 * NOTE: OMP barriers in serial code are removed by the compiler, so that
 * case needs no fast path here. Single-thread teams are uncommon, but the
 * team size has to be queried anyway (the original note says a
 * (team != NULL) check is currently required - presumably done inside
 * __pmp_get_team_size), so teams of size <= 1 simply return. The case
 * that matters is the n-way barrier with n > 1.
 */
static inline void __pmp_thread_barrier (pmp_thread_t *thread)
{
  pmp_team_t *team = thread->team;
  int nmembers = __pmp_get_team_size(team);
  int32_t arrivals_left;
  pmp_local_id_t self;
  int idx;

  if (nmembers <= 1) {
    return;
  }

  arrivals_left = __pmp_atomic_xadd32(&team->barrier_count, -1);
  assert(arrivals_left > 0);
  __pmp_debug(PMP_DEBUG_THREAD, "thread hits barrier with count of %d\n",
              (int) arrivals_left);

  if (arrivals_left > 1) {
    /* not the last one in: sleep until the last arrival wakes us */
    __pmp_thread_wait(thread);
    return;
  }

  /* Last arrival: re-arm the counter for the next barrier first, then
   * release everybody else. */
  self = thread->local_id;
  team->barrier_count = nmembers;
  for (idx = 0; idx < nmembers; idx++) {
    if (idx != self) {
      __pmp_thread_wake(team->members[idx]);
    }
  }
}
Ejemplo n.º 3
0
/*
 * Return every worker of a team to the idle pool.
 *
 * Under __pmp_manager.idlock, members[team_size-1] down to members[1] are
 * detached from the team (local_id set to -1, team set to NULL) and their
 * global ids are pushed onto __pmp_manager.idstack. members[0] is not
 * touched. Afterwards __ompc_cur_numthreads is decreased and
 * __pmp_manager.waiting_threads increased by the number of workers.
 *
 * team   - team whose workers are being released.
 * master - master thread; its global_id is passed to the lock/unlock
 *          calls, and no released member may be this thread.
 */
static void __pmp_thread_release (pmp_team_t *team, pmp_thread_t *master)
{
  pmp_idstack_t *free_ids = &__pmp_manager.idstack;
  int worker_count = team->team_size - 1;
  int slot;

  __pmp_lock(master->global_id, &__pmp_manager.idlock);

  /* walk the worker slots from the highest index down to slot 1 */
  slot = worker_count;
  while (slot >= 1) {
    pmp_thread_t *worker = team->members[slot];
    pmp_local_id_t prev_local_id;

    assert(worker != master);

    /* remember the old id only for the debug trace below */
    prev_local_id = worker->local_id;
    worker->local_id = -1;
    worker->team = NULL;

    __pmp_idstack_push(free_ids, worker->global_id);

    __pmp_debug(PMP_DEBUG_THREAD,
                "released thread global_id %d from local_id %d "
                "of team at %p\n",
                worker->global_id, prev_local_id, team);

    slot--;
  }

  __pmp_unlock(master->global_id, &__pmp_manager.idlock);

  __pmp_atomic_add32(&__ompc_cur_numthreads, -worker_count);

  __pmp_atomic_xadd32(&__pmp_manager.waiting_threads, worker_count);
}
Ejemplo n.º 4
0
/*
 * Master side of the join at the end of a parallel region.
 *
 * The master first polls team->working_threads for up to thread_spin
 * rounds, yielding between polls. If the counter is seen at 1 (only the
 * master left) it is cleared to 0 and the function returns without any
 * further synchronization. Otherwise the master atomically decrements the
 * counter and, if other threads were still counted, blocks in
 * __pmp_thread_wait().
 */
static inline void __pmp_thread_master_join (pmp_thread_t *master)
{
  pmp_team_t *team = master->team;
  int spin_limit = __pmp_get_param()->thread_spin;
  int spins = 0;
  int32_t outstanding;

  /* NOTE: this short user-level spin tries to make the master arrive just
   *       after the final worker; when that works out, the far more
   *       expensive thread synchronization below is skipped. */
  while (spins < spin_limit) {
    if (team->working_threads == 1) {
      team->working_threads = 0;
      return;
    }
    __pmp_yield();
    spins++;
  }

  outstanding = __pmp_atomic_xadd32(&team->working_threads, -1);
  __pmp_debug(PMP_DEBUG_THREAD, "master thread joins with count of %d\n",
              (int) outstanding);
  assert(outstanding >= 1);

  /* anything above 1 means workers are still counted: wait for them */
  if (outstanding > 1) {
    __pmp_thread_wait(master);
  }
}
Ejemplo n.º 5
0
/*
 * Worker side of the join at the end of a parallel region.
 *
 * Atomically decrements team->working_threads. When the value obtained
 * from the xadd is exactly 1, team->members[0] is woken; in every other
 * case the worker just returns.
 */
static inline void __pmp_thread_worker_join (pmp_team_t *team)
{
  int32_t before = __pmp_atomic_xadd32(&team->working_threads, -1);

  __pmp_debug(PMP_DEBUG_THREAD, "worker thread joins with count of %d\n",
              (int) before);
  assert(before >= 1);

  if (before != 1) {
    return;
  }

  __pmp_thread_wake(team->members[0]);
}
Ejemplo n.º 6
0
static inline int __pmp_schedule_next (int global_id, int64_t *lowerp,
                                       int64_t *upperp, int64_t *incp)
{
  pmp_thread_t *thread = __pmp_get_thread(global_id);
  int team_size = __pmp_get_team_size(thread->team);
  int64_t iteration = thread->iteration;
  pmp_local_id_t local_id = thread->local_id;
  pmp_loop_t *loop = thread->loop;

  assert(loop != NULL);
  assert(local_id < team_size);

  if (team_size == 1) {
    if (iteration == 0) {
      *lowerp = loop->lower;
      *upperp = loop->upper;
      *incp = loop->inc;
      thread->ticket_number = loop->lower;
      thread->iteration = 1;
      __pmp_loop_analyser(thread, loop->sched, global_id, local_id,
                          loop->lower, loop->upper,
                          *lowerp, *upperp, *incp, 0, 0);
      return 1;
    }
    else {
      assert(iteration == 1);
      __pmp_loop_free(thread);
      return 0;
    }
  }
  else {
    int     sched = loop->sched;
    int64_t lower = loop->lower;
    int64_t upper = loop->upper;
    int64_t inc   = loop->inc;
    int64_t chunk = loop->chunk;
    switch (sched) {
      case PMP_SCHED_STATIC:
      case PMP_SCHED_ORDERED_STATIC: {
        /* NOTE: setting a small value of chunk causes (unnecessary) iteration
         * through this code. If the chunk is ignored, the code degenerates
         * into the static even case (which is the default). */
        int64_t size = (upper - lower) / inc + 1;
        int64_t size_per_thread = ((size - 1) / team_size + 1) * inc;
        int64_t thread_lower = lower + (local_id * size_per_thread);
        int64_t thread_upper = thread_lower + size_per_thread - inc;
        int64_t this_lower = thread_lower + (iteration * chunk * inc);
        int64_t this_upper = this_lower + (chunk - 1) * inc;
        thread_upper = LOOPMIN(inc, thread_upper, upper);
        this_upper = LOOPMIN(inc, this_upper, thread_upper);
        if ((inc >= 0) ? (this_lower > thread_upper) : 
                         (this_lower < thread_upper)) {
          __pmp_loop_free(thread);
          return 0;
	}
        else {
          *incp = inc;
          *lowerp = this_lower;
          *upperp = this_upper;
          thread->ticket_number = this_lower;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id, lower, upper,
                              *lowerp, *upperp, *incp, 0, 0);
          return 1;
	}
        /* NOT REACHED */
        break;
      }
      case PMP_SCHED_STATIC_EVEN:
      case PMP_SCHED_ORDERED_STATIC_EVEN: {
        if (iteration == 0) {
          int64_t size = (upper - lower) / inc + 1;
          int64_t thread_lower;
          int64_t thread_upper;
          if (!__pmp_get_param()->static_fair) {
            int64_t size_per_thread = ((size - 1) / team_size + 1) * inc;
            thread_lower = lower + (local_id * size_per_thread);
            thread_upper = thread_lower + size_per_thread - inc;
	  }
          else {
            int64_t chunk = size / team_size;
            int64_t remainder = size - (chunk * team_size);
            int64_t index = MIN(local_id, remainder) * (chunk + 1);
            if (local_id > remainder) {
              index += (local_id - remainder) * chunk;
            }
            thread_lower = lower + (index * inc);
            chunk += (local_id < remainder);
            thread_upper = thread_lower + (chunk - 1) * inc;
	  }
          thread_upper = LOOPMIN(inc, thread_upper, upper);
          if ((inc >= 0) ? (thread_lower > thread_upper) : 
                           (thread_lower < thread_upper)) {
            __pmp_loop_free(thread);
            return 0;
	  }
	  else {
            *incp = inc;
            *lowerp = thread_lower;
            *upperp = thread_upper;
            thread->ticket_number = thread_lower;
            thread->iteration++;
            __pmp_loop_analyser(thread, sched, global_id, local_id,
                                lower, upper, *lowerp, *upperp, 
                                *incp, 0, 0);
            return 1;
	  }
	}
        else {
          assert(iteration == 1);
          __pmp_loop_free(thread);
	  return 0;
	}
        /* NOT REACHED */
        break;
      }
      case PMP_SCHED_DYNAMIC:
      case PMP_SCHED_ORDERED_DYNAMIC: {
        int64_t stride = inc * chunk;
#if __WORDSIZE == 64
        int64_t current = __pmp_atomic_xadd64(&loop->current, stride);
#else
        /* TODO: the atomic xadd64 is a problem for 32-bit compilation */
        /*       the workaround below is just to do a 32-bit atomic add */
        int64_t current;
        current = (int64_t) __pmp_atomic_xadd32((int32_t *) &loop->current,
                                                (int32_t) stride);
#endif
        if ((inc >= 0) ? (current > upper) : (current < upper)) {
          __pmp_loop_free(thread);
          return 0;
	}
	else {
          *incp = inc;
          *lowerp = current;
          *upperp = *lowerp + stride - inc;
          *upperp = LOOPMIN(inc, upper, *upperp);
          thread->ticket_number = current;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id, lower, upper,
                              *lowerp, *upperp, *incp, 0, 0);
          return 1;
	}
        /* NOT REACHED */
        break;
      }
      case PMP_SCHED_GUIDED:
      case PMP_SCHED_ORDERED_GUIDED: {
        /* NOTE: guided scheduling uses a heuristic to choose a good
         * chunk size to divide up the remaining iterations amongst
         * the team (subject to a minimum). An exact implementation of 
         * this would require a lock on the loop data. However, the
         * heuristic can be approximated using (possibly) stale values 
         * and this should be good enough. The value of "remaining"
         * is monotonically decreasing. The worst that could happen
         * is that an update to loop->chunk is lost slightly unbalancing
         * the distribution. The most important point is that loop->current
         * is maintained atomically. */
        /* UPDATE: if cmpxchg64 is available then this is used to protect
         * the update of loop->chunk. This is fairly cunning, and makes
         * the chunk update more accurate in this case! */
        int64_t min_chunk = loop->min_chunk;
        int64_t remaining = upper - loop->current + 1;		 /* estimate */
        int64_t my_chunk = MAX(min_chunk, MIN(chunk, remaining));/* estimate */
        int64_t stride = inc * my_chunk;
#if __WORDSIZE == 64
        int64_t current = __pmp_atomic_xadd64(&loop->current, stride);
#else
        /* TODO: the atomic xadd64 is a problem for 32-bit compilation */
        /*       the workaround below is just to do a 32-bit atomic add */
        int64_t current = __pmp_atomic_xadd32((int32_t *) &loop->current,
                                              (int32_t) stride);
#endif
        assert(stride != 0);
#ifdef SUPER_DEBUG
        if (Enabled_Libomp_Loop_Debug)
          __pmp_debug("LOOPS_DEBUG", "__pmp_schedule_next: global_id=%d, "
                      "remaining=%d, my_chunk=%d, stride=%d, current=%d\n",
                      global_id, remaining, my_chunk, stride, current);
#endif

        if ((inc >= 0) ? (current > upper) : (current < upper)) {
          __pmp_loop_free(thread);
          return 0;
	}
	else {
          pmp_param_t *param = __pmp_get_param();
          int64_t my_upper = LOOPMIN(inc, upper, current + stride - inc);
          int64_t new_chunk;
          int64_t divisor;
          remaining = upper - my_upper;				/* estimate */
          divisor = team_size * param->guided_chunk_divisor;
          new_chunk = (remaining + divisor - 1) / divisor;
          new_chunk = MIN(param->guided_chunk_max, new_chunk);
          new_chunk = MAX(min_chunk, new_chunk);
#if __WORDSIZE == 64
          (void) __pmp_atomic_cmpxchg64(&loop->chunk, chunk, new_chunk);
#else
          loop->chunk = new_chunk;				/* estimate */
#endif
          *incp = inc;
          *lowerp = current;
          *upperp = my_upper;
          thread->ticket_number = current;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id, lower, upper,
                              *lowerp, *upperp, *incp, 0, 0);
          return 1;
	}
        /* NOT REACHED */
        break;
      }
      default: {
        __pmp_fatal("unknown dynamic scheduling type %d\n", sched);
        break;
      }
    }
    /* NOT REACHED */
    assert(0);
    __pmp_loop_free(thread);
    return 0;
  }
  /* NOT REACHED */
}
Ejemplo n.º 7
0
/*
 * Create the pthread that backs the given thread record.
 *
 * thread - record for the new thread; its creator field is set to the
 *          calling thread, and it is passed as the argument to
 *          __pmp_thread_run in the new pthread.
 *
 * When param->thread_guard_size is positive, a PROT_NONE anonymous mapping
 * of that size is requested before the pthread is created and, on success,
 * recorded in thread->guard_page. A failed mapping only produces a warning.
 * __pmp_manager.pthread_create_lock is held from before the mapping until
 * after pthread_create() in that configuration.
 *
 * If pthread_create() fails, diagnostics are printed and __pmp_fatal() is
 * called (presumably it does not return). On success
 * __pmp_manager.waiting_threads is incremented by one.
 */
void __pmp_thread_create (pmp_thread_t *thread)
{
  pmp_thread_t *creator = __pmp_get_current_thread();
  pthread_t pthread_id;
  int result;
  pmp_param_t *param = __pmp_get_param();

  thread->creator = creator;

  if (param->thread_guard_size > 0) {
    void *guard;
    /* NOTE: this lock is to give a better chance of the guard page
     * allocation to immediately follow the pthread stack allocation. */
    __pmp_lock(thread->global_id, &__pmp_manager.pthread_create_lock);

    /* NOTE: it seems that mmap tends to allocate in an upwards direction
       so allocate the guard page first. */
    /* An anonymous mapping has no backing file: pass -1 as the descriptor,
     * which some implementations require with MAP_ANONYMOUS / MAP_ANON. */
    guard = mmap(NULL, param->thread_guard_size, PROT_NONE,
#if defined(BUILD_OS_DARWIN)
                 MAP_PRIVATE | MAP_ANON,
#else /* defined(BUILD_OS_DARWIN) */
                 MAP_PRIVATE | MAP_ANONYMOUS,
#endif /* defined(BUILD_OS_DARWIN) */
                 -1, 0);
    if (guard == MAP_FAILED) {
      /* not fatal: the thread is still created, just without a guard */
      __pmp_warning("unable to allocate a guard page of %ld bytes\n",
                    (long) param->thread_guard_size);
    }
    else {
      __pmp_debug(PMP_DEBUG_THREAD, "guard page allocated at address %p\n",
                  guard);
      thread->guard_page = guard;
    }
  }

  if ((result = pthread_create(&pthread_id, &__pmp_manager.pthread_attr,
                               __pmp_thread_run, thread)) != 0) {
    /* Extra hints are only printed when the failure happens beyond the
     * initial team size. */
    if (__pmp_manager.allocated_threads > param->initial_team_size) {
      __pmp_warning(
        "pthread_create failed when trying to allocate thread %d\n",
        __pmp_manager.allocated_threads);
      __pmp_warning(
        "note this is more than the initial number of threads (%d)\n",
        param->initial_team_size);
      /* On 32-bit builds, point at the stack size when the combined
       * thread stacks exceed 1GB (0x40000000 bytes). */
#if defined(BUILD_OS_DARWIN)
      if (sizeof(long) == 4)
#else /* defined(BUILD_OS_DARWIN) */
      if (__WORDSIZE == 32)
#endif /* defined(BUILD_OS_DARWIN) */
      {
        int64_t total_stack = ((int64_t) param->thread_stack_size) *
                              ((int64_t) __pmp_manager.allocated_threads);
        if (total_stack > 0x40000000LL) {
          __pmp_warning(
            "the failure may be due to excessive thread stack size\n");
          __pmp_warning(
            "try using a smaller setting for PSC_OMP_STACK_SIZE\n");
        }
      }
    }
    __pmp_fatal("unable to create thread (result code %d)\n", result);
  }

  /* release the lock taken above, under the same condition */
  if (param->thread_guard_size > 0) {
    __pmp_unlock(thread->global_id, &__pmp_manager.pthread_create_lock);
  }

  /* the freshly created thread is counted as waiting */
  __pmp_atomic_xadd32(&__pmp_manager.waiting_threads, 1);

  __pmp_debug(PMP_DEBUG_THREAD, "created thread global_id %d\n",
              thread->global_id);
}