Example 1
static inline void __pmp_thread_master_join (pmp_thread_t *master)
{
  pmp_team_t *team = master->team;
  int32_t count;
  int thread_spin = __pmp_get_param()->thread_spin;
  int i;

  /* NOTE: insert a small spin loop here to try to arrange for the master
   *       to arrive just after the last worker thread. If this happens
   *       then we avoid a much more expensive thread synchronization. */

  for (i = 0; i < thread_spin; i++) {
    /* USER LEVEL SPIN LOOP */
    if (team->working_threads == 1) {
      team->working_threads = 0;
      return;
    }
    __pmp_yield();
  }

  count = __pmp_atomic_xadd32(&team->working_threads, -1);
  __pmp_debug(PMP_DEBUG_THREAD, "master thread joins with count of %d\n", 
              (int) count);
  assert(count >= 1);
  if (count > 1) {
    __pmp_thread_wait(master);
  }
}
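
The join above relies on the convention that team->working_threads counts the master plus the still-running workers, so the value returned by the atomic decrement tells the master whether anyone is left. The sketch below restates that protocol in isolation; it is not pmp's code, it substitutes a POSIX semaphore for the library's signal-based __pmp_thread_wait, and it omits the spin phase.

#include <stdatomic.h>
#include <stdint.h>
#include <semaphore.h>

/* Illustrative join counter: workers + master, decremented once by each.
 * The semaphore stands in for the library's signal-based wait/wake. */
typedef struct {
  _Atomic int32_t working_threads;
  sem_t master_wake;
} team_sketch_t;

static void team_sketch_init(team_sketch_t *t, int workers)
{
  atomic_init(&t->working_threads, workers + 1);
  sem_init(&t->master_wake, 0, 0);
}

/* Worker side: the thread whose decrement sees the old value 1 is the last
 * participant overall and must wake a blocked master. */
static void worker_finish(team_sketch_t *t)
{
  if (atomic_fetch_add(&t->working_threads, -1) == 1) {
    sem_post(&t->master_wake);
  }
}

/* Master side: mirrors __pmp_thread_master_join without the spin loop. */
static void master_join(team_sketch_t *t)
{
  if (atomic_fetch_add(&t->working_threads, -1) > 1) {
    sem_wait(&t->master_wake);   /* workers still running: block */
  }
}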
Example 2
void __ompc_copyin_thdprv (int n, ...)
{
  pmp_global_id_t global_id;

  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_copyin_thdprv: n=%d\n", n);
  __pmp_sample(PMP_PROFILE_OMPC_COPYIN_THDPRV);

  if (__pmp_get_param()->disabled) {
    return;
  }

  global_id = __pmp_get_current_global_id();

  va_list ap;
  va_start(ap, n);
  /* NOTE: the variadic arguments are (dst, src, size) triples, and n counts
   * the individual arguments, so it is decremented by 3 per copied variable. */
  while (n > 0) {
    void *dst = va_arg(ap, void*);
    void *src = va_arg(ap, void*);
    int size = va_arg(ap, int);
    if (dst != src) {
      __pmp_debug(PMP_DEBUG_THREAD, "__ompc_copyin_thdprv: global_id=%d "
                  "dst: %p, src: %p, size: %d\n", global_id, dst, src, size);
      memcpy(dst, src, size);
    }
    n -= 3;
  }
  va_end(ap);
}
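
Since the variadic arguments arrive as (dst, src, size) triples with n counting the individual arguments, a call site for two threadprivate variables passes six arguments. The fragment below is a hypothetical, hand-written illustration of that convention; real call sites are emitted by the compiler, and the variable names here are invented.

#include <stddef.h>

extern void __ompc_copyin_thdprv(int n, ...);

static int    a_master, a_copy;          /* threadprivate scalar */
static double b_master[16], b_copy[16];  /* threadprivate array  */

static void copyin_two_vars(void)
{
  /* two variables => 2 * 3 = 6 variadic arguments */
  __ompc_copyin_thdprv(6,
                       (void *) &a_copy, (void *) &a_master,
                       (int) sizeof(a_copy),
                       (void *) b_copy, (void *) b_master,
                       (int) sizeof(b_copy));
}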
Example 3
static inline void __pmp_thread_wait (pmp_thread_t *thread)
{
  int32_t sync;
  int thread_spin;
  int i;

  if (thread->sync == PMP_SYNC_UNBLOCKED) {
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d does not block (1)\n",
                thread->global_id);
    thread->sync = PMP_SYNC_IDLE;
    return;
  }

  thread_spin = __pmp_get_param()->thread_spin;

  for (i = 0; i < thread_spin; i++) {
    /* USER LEVEL SPIN LOOP */
    if (thread->sync == PMP_SYNC_UNBLOCKED) {
      __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d does not block (2)\n",
                  thread->global_id);
      thread->sync = PMP_SYNC_IDLE;
      return;
    }
    __pmp_yield();
  }

  sync = __pmp_atomic_cmpxchg32(&thread->sync, PMP_SYNC_IDLE,
                                PMP_SYNC_BLOCKED);

  if (sync == PMP_SYNC_IDLE) {
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d is waiting\n",
                thread->global_id);

    __pmp_sample(PMP_PROFILE_THREAD_DESCHEDULE);

#ifdef PMP_USE_PTHREAD_SIGNALS
    {
      int sig;
      do {
        sigwait(&__pmp_manager.mask_block_sigpmp, &sig);
      } while (sig != SIGPMP);
    }
#else
    sigsuspend(&__pmp_manager.mask_unblock_sigpmp);
    /* NOTE: it is unfortunate that sigsuspend does not tell us which
     *       signal has been raised. */
#endif
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d is awake\n",
                thread->global_id);
  }
  else {
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d does not block (3)\n",
                thread->global_id);
    thread->sync = PMP_SYNC_IDLE;
  }
}
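
The compare-and-swap on thread->sync is what prevents a lost wakeup: the waiter only deschedules if it can atomically move the state from idle to blocked, and a waker that got there first leaves an unblocked marker behind for the waiter to consume. The following self-contained sketch shows the same handshake with C11 atomics and a POSIX semaphore in place of the library's signal machinery; the state names echo pmp's, but the code is illustrative only and handles a single wake per wait round.

#include <stdatomic.h>
#include <semaphore.h>
#include <sched.h>

enum { S_IDLE, S_BLOCKED, S_UNBLOCKED };

typedef struct {
  _Atomic int state;   /* starts at S_IDLE */
  sem_t sem;           /* stands in for sigwait/sigsuspend */
} waiter_t;

/* Waker: if the waiter already blocked, post; otherwise leave S_UNBLOCKED
 * behind so the waiter skips blocking entirely. */
static void wake(waiter_t *w)
{
  if (atomic_exchange(&w->state, S_UNBLOCKED) == S_BLOCKED) {
    sem_post(&w->sem);
  }
}

/* Waiter: spin briefly, then try to move S_IDLE -> S_BLOCKED; if the CAS
 * fails the wake already happened, so just reset and return. */
static void wait_for_wake(waiter_t *w, int spin)
{
  for (int i = 0; i < spin; i++) {
    if (atomic_load(&w->state) == S_UNBLOCKED) {
      atomic_store(&w->state, S_IDLE);
      return;
    }
    sched_yield();
  }
  int expected = S_IDLE;
  if (atomic_compare_exchange_strong(&w->state, &expected, S_BLOCKED)) {
    sem_wait(&w->sem);             /* really deschedule */
  }
  atomic_store(&w->state, S_IDLE); /* ready for the next round */
}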
Example 4
int __ompc_can_fork (void)
{
  int team_size = __pmp_get_new_team_size();
  int has_forked = (__pmp_get_main_thread()->nesting_depth > 0);
  pmp_param_t *param = __pmp_get_param();
  int serial_outline = param->serial_outline;
  int disabled = param->disabled;
  int can_fork = (team_size > 1 || has_forked || serial_outline) && !disabled;
  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_can_fork returns %d\n", can_fork);
  __pmp_sample(PMP_PROFILE_OMPC_CAN_FORK);
  return can_fork;
}
Example 5
static inline void __pmp_scheduler_init (int global_id, int sched,
                                         int64_t lower, int64_t upper,
                                         int64_t inc, int64_t chunk)
{
  /* NOTE: chunk parameter is undefined/unused for static even scheduling */

  pmp_thread_t *thread = __pmp_get_thread(global_id);
  pmp_param_t *param = __pmp_get_param();
  int64_t min_chunk = MAX(1, chunk);

  if (sched == PMP_SCHED_RUNTIME || sched == PMP_SCHED_ORDERED_RUNTIME) {
    int old = sched;
    sched = param->runtime_schedule;
    chunk = param->runtime_chunk;
    if (old == PMP_SCHED_ORDERED_RUNTIME) {
      sched += PMP_SCHED_ORDERED_OFFSET;
    } 
  }

  if (sched == PMP_SCHED_GUIDED || sched == PMP_SCHED_ORDERED_GUIDED) {
    /* The initial chunk is loop trip count spread over the number of 
     * threads (the division is rounded up) */
    int team_size = __pmp_get_team_size(thread->team);
    int64_t divisor = team_size * param->guided_chunk_divisor;
    chunk = (upper - lower + divisor) / divisor;
    chunk = MIN(param->guided_chunk_max, chunk);
    chunk = MAX(min_chunk, chunk);
  }

  if (chunk <= 0) {
    if (thread->global_id == 0)
      __pmp_warning("Chunk size is non-positive, set to default '1'\n");
    chunk = 1;
  }

  __pmp_scheduler_sample(sched);

  assert(inc != 0 && chunk != 0 && min_chunk != 0);

  __pmp_loop_alloc(thread, sched, lower, upper, inc, chunk, min_chunk);

  thread->iteration = 0;
}
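
For concreteness, here is the guided start-up arithmetic pulled out into a standalone helper with a worked example; the parameter values in main (team of 4, guided_chunk_divisor 2, guided_chunk_max 512, minimum chunk 1) are illustrative assumptions, not the library's defaults.

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Mirrors the PMP_SCHED_GUIDED branch of __pmp_scheduler_init. */
static int64_t guided_initial_chunk(int64_t lower, int64_t upper,
                                    int team_size, int64_t divisor_param,
                                    int64_t chunk_max, int64_t min_chunk)
{
  int64_t divisor = team_size * divisor_param;
  int64_t chunk = (upper - lower + divisor) / divisor;  /* round up */
  chunk = MIN(chunk_max, chunk);
  return MAX(min_chunk, chunk);
}

int main(void)
{
  /* 1000 iterations (0..999), 4 threads, divisor 2, cap 512, floor 1:
   * divisor = 8, so the initial chunk is (999 + 8) / 8 = 125. */
  printf("%lld\n", (long long) guided_initial_chunk(0, 999, 4, 2, 512, 1));
  return 0;
}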
Example 6
static void __pmp_thread_bind (pmp_thread_t *thread)
{
  /* TODO : use dynamic information to bind threads appropriately */

  pmp_param_t *param = __pmp_get_param();
  if (param->enable_affinity) {
    int cpu;
    int index = param->global_affinity ? thread->global_id : thread->local_id;
    assert(index < PMP_MAX_THREADS);
    cpu = param->thread_to_cpu_map[index];
    assert(cpu < param->machine_num_cpus);
    if (thread->cpu != cpu) {
      static bool __pmp_enable_affinity_warning = true;
      int e;
      if (__pmp_manager.params != NULL) {
        thread->param = &__pmp_manager.params[cpu];
      }
      else {
        thread->param = &__pmp_param;
      }
      e = __pmp_set_affinity(cpu);
      __pmp_debug(PMP_DEBUG_THREAD, "__pmp_thread_bind: global_id=%d, "
                  "local_id=%d, CPU=%d, param=%p\n",
                  thread->global_id, thread->local_id, cpu, thread->param);
      if (e != 0 && __pmp_enable_affinity_warning) {
        __pmp_warning("failed to set affinity\n");
        __pmp_warning("maybe the kernel does not support "
                      "affinity system calls\n");
        __pmp_enable_affinity_warning = false;
      }
      thread->cpu = cpu;
    }

    /* TODO: give the thread an opportunity to move to its bound CPU
     * before continuing? Currently just do a __pmp_yield(). It is not
     * clear if this is necessary or sufficient. */
    __pmp_yield();
  }
}
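
__pmp_set_affinity itself is not shown here; on Linux a helper like it typically wraps sched_setaffinity on the calling thread, roughly as below. This is a plausible sketch, not the library's implementation, and other platforms need different system calls.

#define _GNU_SOURCE
#include <sched.h>
#include <errno.h>

/* Pin the calling thread to one CPU; returns 0 on success, errno otherwise.
 * (pid 0 means "the calling thread" for sched_setaffinity.) */
static int bind_self_to_cpu(int cpu)
{
  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(cpu, &set);
  return sched_setaffinity(0, sizeof(set), &set) == 0 ? 0 : errno;
}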
Example 7
static inline void __pmp_loop_analyser (pmp_thread_t *thread, int sched,
                                        pmp_global_id_t global_id,
                                        pmp_local_id_t local_id,
                                        int64_t loop_lower, int64_t loop_upper,
                                        int64_t my_lower, int64_t my_upper,
                                        int64_t inc, int64_t chunk, 
                                        int64_t stride)
{
  pmp_loop_t *loop;
  bool allocated;

#ifdef SUPER_DEBUG
  if (Enabled_Libomp_Loop_Debug)
    __pmp_debug("LOOPS_DEBUG", "__pmp_loop_analyser: sched=%d, global_id=%d, "
                "local_id=%d, loop_lower=%" PRId64 ", loop_upper=%" PRId64 ", "
                "my_lower=%" PRId64 ", my_upper=%" PRId64 ", inc=%" PRId64 ", chunk=%" PRId64 ", "
                "stride=%" PRId64 "\n",
                sched, global_id, local_id, loop_lower, loop_upper,
                my_lower, my_upper, inc, chunk, stride);
#endif
  if (!__pmp_profile.enabled && !__pmp_get_param()->check) {
    return;
  }

  /* NOTE: set chunk=0 and stride=0 for a non-strided loop. They will 
   * then be auto-sized to use the inner loop for the required iterations
   * from my_lower to my_upper (inclusive). The outer loop will run
   * only once. */
  if (chunk == 0 && stride == 0) {
    chunk = (my_upper - my_lower) / inc + 1;
  }

  assert(inc != 0 && chunk != 0);
  assert((inc > 0 && stride >= 0) || (inc < 0 && stride <= 0));

  allocated = false;

#ifdef PMP_CHECK
  if (thread->loop == NULL) {
    /* For statically scheduled loops, allocate a loop to hold check data */
    __pmp_loop_alloc(thread, sched, loop_lower, loop_upper, inc, chunk, chunk);
    allocated = true;
  }
#endif

  /* NOTE: filter out cases where the loop contains no iterations */
  if ((inc >= 0) ? (my_lower <= my_upper) : (my_lower >= my_upper)) {

    assert((inc >= 0) ? (loop_lower <= my_lower && 
                         my_lower <= my_upper && 
                         my_upper <= loop_upper)
                      : (loop_upper <= my_upper &&
                         my_upper <= my_lower &&
                         my_lower <= loop_lower));

    loop = thread->loop;

#if (defined PMP_PROFILE) || (defined PMP_CHECK)
    if (inc >= 0) {
      int64_t count = 0;
      int64_t outer = my_lower;
      while (outer <= loop_upper) {
        int64_t inner = outer;
        int64_t i;
        for (i = 0; i < chunk && inner <= my_upper; i++) {
          __pmp_loop_check(loop, inner);
          count++;
          inner += inc;
        }
        if (inner > loop_upper) {
          __pmp_last_check(loop);
        }
        if (stride == 0) {
          break;
        }
        else {
          outer += stride;
          my_upper = MIN(my_upper + stride, loop_upper);
        }
      }
      __pmp_profile_iterations(global_id, count);
    }
    else {
      int64_t count = 0;
      int64_t outer = my_lower;
      while (outer >= loop_upper) {
        int64_t inner = outer;
        int64_t i;
        for (i = 0; i < chunk && inner >= my_upper; i++) {
          __pmp_loop_check(loop, inner);
          count++;
          inner += inc;
        }
        if (inner < loop_upper) {
          __pmp_last_check(loop);
        }
        if (stride == 0) {
          break;
        }
        else {
          outer += stride;
          my_upper = MAX(my_upper + stride, loop_upper);
        }
      }
      __pmp_profile_iterations(global_id, count);
    }
#endif
  }

#ifdef PMP_CHECK
  if (allocated) {
    /* For statically scheduled loops, deallocate the loop */
    __pmp_loop_free(thread);
  }
#endif
}
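
The chunk/stride walk above mirrors the loop nest that a chunked static schedule executes. As a self-contained cross-check, the helper below counts the iterations such a nest performs for one thread; the values in main (thread 0 of a 4-thread PMP_SCHED_STATIC schedule over 0..99 with chunk 5, hence stride 20) are an illustrative assumption.

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Positive-increment case only: outer loop over chunks spaced `stride`
 * apart, inner loop of up to `chunk` iterations clipped to my_upper. */
static int64_t count_strided(int64_t my_lower, int64_t my_upper,
                             int64_t loop_upper, int64_t inc,
                             int64_t chunk, int64_t stride)
{
  int64_t count = 0;
  for (int64_t outer = my_lower; outer <= loop_upper; outer += stride) {
    int64_t inner = outer;
    for (int64_t i = 0; i < chunk && inner <= my_upper; i++, inner += inc) {
      count++;
    }
    if (stride == 0) {
      break;                       /* non-strided: single pass */
    }
    my_upper = MIN(my_upper + stride, loop_upper);
  }
  return count;
}

int main(void)
{
  /* chunks start at 0, 20, 40, 60, 80, giving 25 iterations in total */
  printf("%lld\n", (long long) count_strided(0, 4, 99, 1, 5, 20));
  return 0;
}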
Example 8
static inline int __pmp_schedule_next (int global_id, int64_t *lowerp,
                                       int64_t *upperp, int64_t *incp)
{
  pmp_thread_t *thread = __pmp_get_thread(global_id);
  int team_size = __pmp_get_team_size(thread->team);
  int64_t iteration = thread->iteration;
  pmp_local_id_t local_id = thread->local_id;
  pmp_loop_t *loop = thread->loop;

  assert(loop != NULL);
  assert(local_id < team_size);

  if (team_size == 1) {
    if (iteration == 0) {
      *lowerp = loop->lower;
      *upperp = loop->upper;
      *incp = loop->inc;
      thread->ticket_number = loop->lower;
      thread->iteration = 1;
      __pmp_loop_analyser(thread, loop->sched, global_id, local_id,
                          loop->lower, loop->upper,
                          *lowerp, *upperp, *incp, 0, 0);
      return 1;
    }
    else {
      assert(iteration == 1);
      __pmp_loop_free(thread);
      return 0;
    }
  }
  else {
    int     sched = loop->sched;
    int64_t lower = loop->lower;
    int64_t upper = loop->upper;
    int64_t inc   = loop->inc;
    int64_t chunk = loop->chunk;
    switch (sched) {
      case PMP_SCHED_STATIC:
      case PMP_SCHED_ORDERED_STATIC: {
        /* NOTE: setting a small value of chunk causes (unnecessary) iteration
         * through this code. If the chunk is ignored, the code degenerates
         * into the static even case (which is the default). */
        int64_t size = (upper - lower) / inc + 1;
        int64_t size_per_thread = ((size - 1) / team_size + 1) * inc;
        int64_t thread_lower = lower + (local_id * size_per_thread);
        int64_t thread_upper = thread_lower + size_per_thread - inc;
        int64_t this_lower = thread_lower + (iteration * chunk * inc);
        int64_t this_upper = this_lower + (chunk - 1) * inc;
        thread_upper = LOOPMIN(inc, thread_upper, upper);
        this_upper = LOOPMIN(inc, this_upper, thread_upper);
        if ((inc >= 0) ? (this_lower > thread_upper) : 
                         (this_lower < thread_upper)) {
          __pmp_loop_free(thread);
          return 0;
        }
        else {
          *incp = inc;
          *lowerp = this_lower;
          *upperp = this_upper;
          thread->ticket_number = this_lower;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id, lower, upper,
                              *lowerp, *upperp, *incp, 0, 0);
          return 1;
        }
        /* NOT REACHED */
        break;
      }
      case PMP_SCHED_STATIC_EVEN:
      case PMP_SCHED_ORDERED_STATIC_EVEN: {
        if (iteration == 0) {
          int64_t size = (upper - lower) / inc + 1;
          int64_t thread_lower;
          int64_t thread_upper;
          if (!__pmp_get_param()->static_fair) {
            int64_t size_per_thread = ((size - 1) / team_size + 1) * inc;
            thread_lower = lower + (local_id * size_per_thread);
            thread_upper = thread_lower + size_per_thread - inc;
          }
          else {
            int64_t chunk = size / team_size;
            int64_t remainder = size - (chunk * team_size);
            int64_t index = MIN(local_id, remainder) * (chunk + 1);
            if (local_id > remainder) {
              index += (local_id - remainder) * chunk;
            }
            thread_lower = lower + (index * inc);
            chunk += (local_id < remainder);
            thread_upper = thread_lower + (chunk - 1) * inc;
          }
          thread_upper = LOOPMIN(inc, thread_upper, upper);
          if ((inc >= 0) ? (thread_lower > thread_upper) : 
                           (thread_lower < thread_upper)) {
            __pmp_loop_free(thread);
            return 0;
          }
          else {
            *incp = inc;
            *lowerp = thread_lower;
            *upperp = thread_upper;
            thread->ticket_number = thread_lower;
            thread->iteration++;
            __pmp_loop_analyser(thread, sched, global_id, local_id,
                                lower, upper, *lowerp, *upperp, 
                                *incp, 0, 0);
            return 1;
          }
        }
        else {
          assert(iteration == 1);
          __pmp_loop_free(thread);
          return 0;
        }
        /* NOT REACHED */
        break;
      }
      case PMP_SCHED_DYNAMIC:
      case PMP_SCHED_ORDERED_DYNAMIC: {
        int64_t stride = inc * chunk;
#if __WORDSIZE == 64
        int64_t current = __pmp_atomic_xadd64(&loop->current, stride);
#else
        /* TODO: the atomic xadd64 is a problem for 32-bit compilation */
        /*       the workaround below is just to do a 32-bit atomic add */
        int64_t current;
        current = (int64_t) __pmp_atomic_xadd32((int32_t *) &loop->current,
                                                (int32_t) stride);
#endif
        if ((inc >= 0) ? (current > upper) : (current < upper)) {
          __pmp_loop_free(thread);
          return 0;
        }
        else {
          *incp = inc;
          *lowerp = current;
          *upperp = *lowerp + stride - inc;
          *upperp = LOOPMIN(inc, upper, *upperp);
          thread->ticket_number = current;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id, lower, upper,
                              *lowerp, *upperp, *incp, 0, 0);
          return 1;
        }
        /* NOT REACHED */
        break;
      }
      case PMP_SCHED_GUIDED:
      case PMP_SCHED_ORDERED_GUIDED: {
        /* NOTE: guided scheduling uses a heuristic to choose a good
         * chunk size to divide up the remaining iterations amongst
         * the team (subject to a minimum). An exact implementation of 
         * this would require a lock on the loop data. However, the
         * heuristic can be approximated using (possibly) stale values 
         * and this should be good enough. The value of "remaining"
         * is monotonically decreasing. The worst that could happen
         * is that an update to loop->chunk is lost slightly unbalancing
         * the distribution. The most important point is that loop->current
         * is maintained atomically. */
        /* UPDATE: if cmpxchg64 is available then this is used to protect
         * the update of loop->chunk. This is fairly cunning, and makes
         * the chunk update more accurate in this case! */
        int64_t min_chunk = loop->min_chunk;
        int64_t remaining = upper - loop->current + 1;		 /* estimate */
        int64_t my_chunk = MAX(min_chunk, MIN(chunk, remaining));/* estimate */
        int64_t stride = inc * my_chunk;
#if __WORDSIZE == 64
        int64_t current = __pmp_atomic_xadd64(&loop->current, stride);
#else
        /* TODO: the atomic xadd64 is a problem for 32-bit compilation */
        /*       the workaround below is just to do a 32-bit atomic add */
        int64_t current = __pmp_atomic_xadd32((int32_t *) &loop->current,
                                              (int32_t) stride);
#endif
        assert(stride != 0);
#ifdef SUPER_DEBUG
        if (Enabled_Libomp_Loop_Debug)
          __pmp_debug("LOOPS_DEBUG", "__pmp_schedule_next: global_id=%d, "
                      "remaining=%d, my_chunk=%d, stride=%d, current=%d\n",
                      global_id, remaining, my_chunk, stride, current);
#endif

        if ((inc >= 0) ? (current > upper) : (current < upper)) {
          __pmp_loop_free(thread);
          return 0;
        }
        else {
          pmp_param_t *param = __pmp_get_param();
          int64_t my_upper = LOOPMIN(inc, upper, current + stride - inc);
          int64_t new_chunk;
          int64_t divisor;
          remaining = upper - my_upper;				/* estimate */
          divisor = team_size * param->guided_chunk_divisor;
          new_chunk = (remaining + divisor - 1) / divisor;
          new_chunk = MIN(param->guided_chunk_max, new_chunk);
          new_chunk = MAX(min_chunk, new_chunk);
#if __WORDSIZE == 64
          (void) __pmp_atomic_cmpxchg64(&loop->chunk, chunk, new_chunk);
#else
          loop->chunk = new_chunk;				/* estimate */
#endif
          *incp = inc;
          *lowerp = current;
          *upperp = my_upper;
          thread->ticket_number = current;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id, lower, upper,
                              *lowerp, *upperp, *incp, 0, 0);
          return 1;
        }
        /* NOT REACHED */
        break;
      }
      default: {
        __pmp_fatal("unknown dynamic scheduling type %d\n", sched);
        break;
      }
    }
    /* NOT REACHED */
    assert(0);
    __pmp_loop_free(thread);
    return 0;
  }
  /* NOT REACHED */
}
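
The dynamic case is the simplest to see in isolation: each caller claims the next chunk by atomically adding the stride onto a shared cursor, and the returned (pre-add) value is the chunk's lower bound. A minimal restatement with C11 atomics, positive increments only, is sketched below; with lower 0, upper 99, inc 1 and chunk 4, successive successful calls hand out [0,3], [4,7], ..., [96,99].

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct {
  _Atomic int64_t current;   /* next unclaimed iteration, starts at lower */
  int64_t upper;             /* inclusive upper bound */
  int64_t inc;               /* assumed positive in this sketch */
  int64_t chunk;
} dyn_loop_t;

/* Claim the next chunk; returns false when the loop is exhausted. */
static bool claim_next(dyn_loop_t *loop, int64_t *lo, int64_t *hi)
{
  int64_t stride = loop->inc * loop->chunk;
  int64_t current = atomic_fetch_add(&loop->current, stride);
  if (current > loop->upper) {
    return false;
  }
  *lo = current;
  *hi = current + stride - loop->inc;
  if (*hi > loop->upper) {
    *hi = loop->upper;               /* clip the final chunk */
  }
  return true;
}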
Example 9
static inline void __pmp_static_init (int global_id, int sched,
                                      int64_t *lowerp, int64_t *upperp, 
                                      int64_t *stridep,
                                      int64_t inc, int64_t chunk)
{
  /* NOTE: chunk parameter is undefined/unused for static even scheduling */

  pmp_thread_t *thread = __pmp_get_thread(global_id);
  int team_size = __pmp_get_team_size(thread->team);
  int64_t loop_lower = *lowerp;
  int64_t loop_upper = *upperp;
  int64_t lower;
  int64_t upper;
  
  assert(team_size > 0);

  if (chunk <= 0) {
    if (thread->global_id == 0)
      __pmp_warning("Chunk size is non-positive, set to default '1'\n");
    chunk = 1;
  }

  if (team_size == 1) {
    *stridep = (inc > 0) ? (loop_upper - loop_lower + 1) : 
                           (loop_upper - loop_lower - 1);
  }
  else {
    pmp_local_id_t local_id = thread->local_id;
    int64_t stride;
    switch (sched) {
      case PMP_SCHED_STATIC_EVEN: {
        int64_t size = (loop_upper - loop_lower) / inc + 1;
        assert(size >= 0);
        if (!__pmp_get_param()->static_fair) {
          /* The size is divided by the team_size and rounded up to give
           * the chunk size. Chunks of this size are assigned to threads
           * in increased local_id order. If the division was not exact
           * then the last thread will have fewer iterations, and possibly
           * none at all. */
          chunk = (size + team_size - 1) / team_size;
          lower = loop_lower + (local_id * chunk * inc);
        }
        else {
          /* The size is divided by the team_size and rounded down to 
           * give the chunk. Each thread will have at least this many
           * iterations. If the division was not exact then the remainder
           * iterations are scheduled across the threads in increasing
           * thread order. Note that the difference between the minimum
           * and maximum number of iterations assigned to the threads
           * across the team is at most 1. The maximum number of iterations
           * assigned to a thread (the worst case path through the schedule)
           * is the same as for default behavior. */
          int64_t remainder;
          int64_t index;
          chunk = size / team_size;
          remainder = size - (chunk * team_size);
          index = MIN(local_id, remainder) * (chunk + 1);
          if (local_id > remainder) {
            index += (local_id - remainder) * chunk;
          }
          lower = loop_lower + (index * inc);
          chunk += (local_id < remainder);
        }

        if ((inc >= 0) ? (lower <= loop_upper) : (lower >= loop_upper)) {
          upper = LOOPMIN(inc, lower + (chunk - 1) * inc, loop_upper);
          stride = size * inc;
        }
        else {
          /* If the entire set of iterations falls out of the loop bounds
           * then arrange for a non-iterating loop which will not trigger
           * the LASTPRIVATE check made by the compiler. This means that
           * the final value of the loop induction variable must not exceed
           * the loop upper bound. */
          lower = loop_lower - inc;
          upper = lower - inc;
          stride = inc;
        }
        __pmp_loop_analyser(thread, sched, global_id, local_id,
                            loop_lower, loop_upper,
                            lower, upper, inc, chunk, stride);
        break;
      }
      case PMP_SCHED_STATIC: {
        stride = chunk * inc;
        lower = loop_lower + (local_id * stride);
        if ((inc >= 0) ? (lower <= loop_upper) : (lower >= loop_upper)) {
          upper = LOOPMIN(inc, lower + (chunk - 1) * inc, loop_upper);
          stride *= team_size;
        }
        else {
          /* If the entire set of iterations falls out of the loop bounds
           * then arrange for a non-iterating loop which will not trigger
           * the LASTPRIVATE check made by the compiler. This means that
           * the final value of the loop induction variable must not exceed
           * the loop upper bound. */
          lower = loop_lower - inc;
          upper = lower - inc;
          stride = inc;
        }
        __pmp_loop_analyser(thread, sched, global_id, local_id,
                            loop_lower, loop_upper,
                            lower, upper, inc, chunk, stride);
        break;
      }
      default: {
        __pmp_fatal("unknown static scheduling type %d\n", sched);
        stride = 0;
        lower = loop_lower;
        upper = loop_upper;
      }
    }
    *lowerp = lower;
    *upperp = upper;
    *stridep = stride;
  }
  __pmp_scheduler_sample(sched);
}
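
The static_fair arithmetic is easiest to trust with a worked example. The standalone program below replays the partition for a unit-increment loop; with 10 iterations over 4 threads it prints [0..2], [3..5], [6..7] and [8..9], i.e. the per-thread loads differ by at most one iteration.

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Replays the static_fair branch of __pmp_static_init for inc == 1. */
static void fair_partition(int64_t size, int team_size)
{
  for (int local_id = 0; local_id < team_size; local_id++) {
    int64_t chunk = size / team_size;
    int64_t remainder = size - (chunk * team_size);
    int64_t index = MIN(local_id, remainder) * (chunk + 1);
    if (local_id > remainder) {
      index += (local_id - remainder) * chunk;
    }
    chunk += (local_id < remainder);
    printf("thread %d: iterations [%lld..%lld]\n", local_id,
           (long long) index, (long long) (index + chunk - 1));
  }
}

int main(void)
{
  fair_partition(10, 4);
  return 0;
}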
Example 10
void __ompc_get_thdprv (void ***thdprv, int64_t size,
                        void *data, int global_id)
{
  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_get_thdprv: thdprv=%p, size=%ld, "
              "data=%p, global_id=%d\n", thdprv, (long) size, data, global_id);
  __pmp_sample(PMP_PROFILE_OMPC_GET_THDPRV);

  if (__pmp_get_param()->disabled) {
    void **t = (void **) calloc (1, sizeof(void *));
    if (t == NULL) {
      __pmp_fatal("failed to allocate thread private data\n");
    }
    t[0] = data;
    *thdprv = t;
  }
  else {
    void **t = *thdprv;
    if (t == NULL) {
      /* TODO: can I reduce the size of this array? Note that it is indexed
       * by global_id and global_ids can be arbitrarily assigned to threads
       * in general, so this may be difficult. */
      void *t_new;
      void *t_cur;
      t = (void **) calloc(PMP_MAX_THREADS, sizeof(void *));
      if (t == NULL) {
        __pmp_fatal("failed to allocate thread private data\n");
      }
      t_new = (void *) t;
      t_cur = __pmp_atomic_cmpxchgptr((volatile voidptr_t *) thdprv, 
                                      NULL, t_new);
      if (t_cur != NULL) {
        /* This thread lost the race and another thread has already
         * installed a thdprv array. Simply back out this allocation
         * and use *thdprv. */
        free(t);
        t = (void **) t_cur;
      }
    }
    if (t[global_id] == NULL) {
      /* The OpenMP 2.5 standard says:
       *
       * "Each copy of a threadprivate object is initialized once, in the manner
       * specified by the program, but at an unspecified point in the program
       * prior to the first reference to that copy."
       *
       * Since the initial values live in the statically allocated block of
       * memory passed to our "data" argument, the master thread needs to use
       * a dynamically allocated block, just as the additional threads do, so
       * that if it changes its copies of the variables before the program
       * enters the first parallel region, those changes have no effect on the
       * copies in the additional threads. Observation shows that the code
       * generator calls __ompc_get_thdprv from the serial portion of the
       * program, for the master thread, before it changes any values.
       *
       * Note the copying is done without synchronization, which is safe only
       * because we're copying statically initialized and subsequently
       * unchanged values: copying from the main thread would require a
       * barrier.
       */
      t[global_id] = (void *) malloc(size);
      if (t[global_id] == NULL) {
        __pmp_fatal("failed to allocate thread private data");
      }
      memcpy(t[global_id], data, size);
    }
  }
}
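
The cmpxchgptr dance above is the classic lock-free lazy-install pattern: every racing thread allocates, exactly one compare-exchange wins, and the losers free their copy and adopt the winner's pointer. Restated with C11 atomics (illustrative only, with error handling reduced to a NULL return):

#include <stdatomic.h>
#include <stdlib.h>

static void **get_or_install(_Atomic(void *) *slot, size_t nslots)
{
  void **t = atomic_load(slot);
  if (t == NULL) {
    void **fresh = calloc(nslots, sizeof(void *));
    void *expected = NULL;
    if (fresh == NULL) {
      return NULL;                 /* allocation failed */
    }
    if (atomic_compare_exchange_strong(slot, &expected, (void *) fresh)) {
      t = fresh;                   /* this thread won the race */
    }
    else {
      free(fresh);                 /* someone else installed first */
      t = expected;                /* the CAS wrote the current value here */
    }
  }
  return t;
}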
Example 11
void __pmp_thread_create (pmp_thread_t *thread)
{
  pmp_thread_t *creator = __pmp_get_current_thread();
  pthread_t pthread_id;
  int result;
  pmp_param_t *param = __pmp_get_param();

  thread->creator = creator;

  if (param->thread_guard_size > 0) {
    void *guard;
    /* NOTE: this lock gives a better chance that the guard page
     * allocation immediately follows the pthread stack allocation. */
    __pmp_lock(thread->global_id, &__pmp_manager.pthread_create_lock);

    /* NOTE: it seems that mmap tends to allocate in an upwards direction
       so allocate the guard page first. */
    guard = mmap(0, param->thread_guard_size, PROT_NONE,
#if defined(BUILD_OS_DARWIN)
                 MAP_PRIVATE | MAP_ANON,
#else /* defined(BUILD_OS_DARWIN) */
                 MAP_PRIVATE | MAP_ANONYMOUS,
#endif /* defined(BUILD_OS_DARWIN) */
                 0, 0);
    if (guard == MAP_FAILED) {
      __pmp_warning("unable to allocate a guard page of %ld bytes\n",
                    (long) param->thread_guard_size);
    }
    else {
      __pmp_debug(PMP_DEBUG_THREAD, "guard page allocated at address %p\n",
                  guard);
      thread->guard_page = guard;
    }
  }

  if ((result = pthread_create(&pthread_id, &__pmp_manager.pthread_attr,
                               __pmp_thread_run, thread)) != 0) {
    if (__pmp_manager.allocated_threads > param->initial_team_size) {
      __pmp_warning(
        "pthread_create failed when trying to allocate thread %d\n",
        __pmp_manager.allocated_threads);
      __pmp_warning(
        "note this is more than the initial number of threads (%d)\n",
        param->initial_team_size);
#if defined(BUILD_OS_DARWIN)
      if (sizeof(long) == 4)
#else /* defined(BUILD_OS_DARWIN) */
      if (__WORDSIZE == 32)
#endif /* defined(BUILD_OS_DARWIN) */
      {
        int64_t total_stack = ((int64_t) param->thread_stack_size) *
                              ((int64_t) __pmp_manager.allocated_threads);
        if (total_stack > 0x40000000LL) {
          __pmp_warning(
            "the failure may be due to excessive thread stack size\n");
          __pmp_warning(
            "try using a smaller setting for PSC_OMP_STACK_SIZE\n");
        }
      }
    }
    __pmp_fatal("unable to create thread (result code %d)\n", result);
  }

  if (param->thread_guard_size > 0) {
    __pmp_unlock(thread->global_id, &__pmp_manager.pthread_create_lock);
  }

  __pmp_atomic_xadd32(&__pmp_manager.waiting_threads, 1);

  __pmp_debug(PMP_DEBUG_THREAD, "created thread global_id %d\n", 
              thread->global_id);
}