Example #1
void __ompc_ordered (int global_id)
{
  pmp_thread_t *thread = __pmp_get_thread(global_id);

  if (__pmp_get_team_size(thread->team) > 1) {
    pmp_loop_t *loop = thread->loop;
    int64_t ticket_number = thread->ticket_number;
    int64_t now_serving;

#ifdef SUPER_DEBUG
    if (Enabled_Libomp_Call_Debug)
      __pmp_debug("CALLS_DEBUG", "__ompc_ordered: global_id=%d\n", global_id);
#endif
    __pmp_sample(PMP_PROFILE_OMPC_ORDERED);

    if (loop == NULL || loop->sched <= PMP_SCHED_ORDERED_OFFSET) {
      __pmp_warning("ordered directives must be used inside ordered "
                    "OpenMP loops\n");
      return;
    }

    assert(loop != NULL);

    now_serving = loop->now_serving;

    if (now_serving != ticket_number) {
      if ((loop->inc >= 0) ? (now_serving > ticket_number) :
                             (now_serving < ticket_number)) {
        __pmp_warning("ordered OpenMP loop may result in program deadlock\n");
        __pmp_warning("maybe due to multiple ordered directives "
                      "in a loop iteration\n");
      }
      while (loop->now_serving != ticket_number) {
        /* USER LEVEL SPIN LOOP */
        __pmp_yield();
      }
    }

#ifdef SUPER_DEBUG
    if (Enabled_Libomp_Loop_Debug)
      __pmp_debug("LOOPS_DEBUG", "__ompc_ordered: now serving global_id=%d "
                  " ticket_number=%" PRId64 "\n", global_id, ticket_number);
#endif
  }

  __pmp_memory_fence();
}
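
A minimal user-level loop that would exercise this entry point (illustrative only; the lowering of the ordered construct into __ompc_ordered/__ompc_end_ordered calls is done by the compiler, and the exact calling convention is an assumption here):

/* Each iteration's ordered region runs in iteration order, which is what the
 * ticket/now_serving hand-off above enforces. */
#include <stdio.h>

void ordered_demo(int n)
{
  int i;
  #pragma omp parallel for ordered schedule(dynamic, 1)
  for (i = 0; i < n; i++) {
    /* unordered work may proceed concurrently here */
    #pragma omp ordered
    {
      /* roughly: __ompc_ordered(gtid); ... __ompc_end_ordered(gtid); */
      printf("iteration %d\n", i);
    }
  }
}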
Example #2
static void __pmp_thread_bind (pmp_thread_t *thread)
{
  /* TODO : use dynamic information to bind threads appropriately */

  pmp_param_t *param = __pmp_get_param();
  if (param->enable_affinity) {
    int cpu;
    int index = param->global_affinity ? thread->global_id : thread->local_id;
    assert(index < PMP_MAX_THREADS);
    cpu = param->thread_to_cpu_map[index];
    assert(cpu < param->machine_num_cpus);
    if (thread->cpu != cpu) {
      static bool __pmp_enable_affinity_warning = true;
      int e;
      if (__pmp_manager.params != NULL) {
        thread->param = &__pmp_manager.params[cpu];
      }
      else {
        thread->param = &__pmp_param;
      }
      e = __pmp_set_affinity(cpu);
      __pmp_debug(PMP_DEBUG_THREAD, "__pmp_thread_bind: global_id=%d, "
                  "local_id=%d, CPU=%d, param=%p\n",
                  thread->global_id, thread->local_id, cpu, thread->param);
      if (e != 0 && __pmp_enable_affinity_warning) {
        __pmp_warning("failed to set affinity\n");
        __pmp_warning("maybe the kernel does not support "
                      "affinity system calls\n");
        __pmp_enable_affinity_warning = false;
      }
      thread->cpu = cpu;
    }

    /* TODO: give the thread an opportunity to move to its bound CPU
     * before continuing? Currently just do a __pmp_yield(). It is not
     * clear if this is necessary or sufficient. */
    __pmp_yield();
  }
}
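
__pmp_set_affinity() is called above but not shown. A sketch of one plausible Linux implementation (an assumption, not the runtime's actual code), binding the calling thread to a single CPU and returning 0 on success, which matches how the return value e is tested above:

#define _GNU_SOURCE
#include <sched.h>

/* hypothetical stand-in for __pmp_set_affinity() */
static int example_set_affinity(int cpu)
{
  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(cpu, &set);
  return sched_setaffinity(0 /* calling thread */, sizeof(set), &set) ? -1 : 0;
}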
Example #3
static inline void __pmp_scheduler_init (int global_id, int sched,
                                         int64_t lower, int64_t upper,
                                         int64_t inc, int64_t chunk)
{
  /* NOTE: chunk parameter is undefined/unused for static even scheduling */

  pmp_thread_t *thread = __pmp_get_thread(global_id);
  pmp_param_t *param = __pmp_get_param();
  int64_t min_chunk = MAX(1, chunk);

  if (sched == PMP_SCHED_RUNTIME || sched == PMP_SCHED_ORDERED_RUNTIME) {
    int old = sched;
    sched = param->runtime_schedule;
    chunk = param->runtime_chunk;
    if (old == PMP_SCHED_ORDERED_RUNTIME) {
      sched += PMP_SCHED_ORDERED_OFFSET;
    } 
  }

  if (sched == PMP_SCHED_GUIDED || sched == PMP_SCHED_ORDERED_GUIDED) {
    /* The initial chunk is loop trip count spread over the number of 
     * threads (the division is rounded up) */
    int team_size = __pmp_get_team_size(thread->team);
    int64_t divisor = team_size * param->guided_chunk_divisor;
    chunk = (upper - lower + divisor) / divisor;
    chunk = MIN(param->guided_chunk_max, chunk);
    chunk = MAX(min_chunk, chunk);
  }

  if (chunk <= 0) {
    if (thread->global_id == 0)
      __pmp_warning("Chunk size is non-positive, set to default '1'\n");
    chunk = 1;
  }

  __pmp_scheduler_sample(sched);

  assert(inc != 0 && chunk != 0 && min_chunk != 0);

  __pmp_loop_alloc(thread, sched, lower, upper, inc, chunk, min_chunk);

  thread->iteration = 0;
}
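
The guided-schedule branch above computes the initial chunk as the loop trip count divided by team_size * guided_chunk_divisor, rounded up, then clamped between min_chunk and guided_chunk_max. A standalone restatement of that arithmetic (the MIN/MAX macros and the demo parameters are assumptions, not taken from the runtime):

#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

static int64_t guided_initial_chunk(int64_t lower, int64_t upper, int team_size,
                                    int64_t divisor_per_thread,
                                    int64_t chunk_max, int64_t min_chunk)
{
  int64_t divisor = (int64_t) team_size * divisor_per_thread;
  /* (upper - lower + divisor) / divisor == ceil(trip_count / divisor)
   * for an inc == 1 loop whose trip count is upper - lower + 1 */
  int64_t chunk = (upper - lower + divisor) / divisor;
  chunk = MIN(chunk_max, chunk);
  chunk = MAX(min_chunk, chunk);
  return chunk;
}

int main(void)
{
  /* 1000 iterations, 4 threads, divisor 2, max 64, min 1: ceil(1000/8) = 125, clamped to 64 */
  printf("%lld\n", (long long) guided_initial_chunk(0, 999, 4, 2, 64, 1));
  return 0;
}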
Example #4
void __ompc_end_ordered (int global_id)
{
  pmp_thread_t *thread = __pmp_get_thread(global_id);

  __pmp_memory_fence();

  if (__pmp_get_team_size(thread->team) > 1) {
    pmp_loop_t *loop = thread->loop;
    int64_t ticket_number = thread->ticket_number;

#ifdef SUPER_DEBUG
    if (Enabled_Libomp_Call_Debug)
      __pmp_debug("CALLS_DEBUG", "__ompc_end_ordered: global_id=%d\n",
                  global_id);
#endif
    __pmp_sample(PMP_PROFILE_OMPC_END_ORDERED);

    if (loop == NULL || loop->sched <= PMP_SCHED_ORDERED_OFFSET) {
      if (thread->global_id == 0)
        __pmp_warning("ordered directives must be used inside ordered "
                      "OpenMP loops\n");
      return;
    }

    assert(loop != NULL);
    assert(loop->now_serving == ticket_number);

#ifdef SUPER_DEBUG
    if (Enabled_Libomp_Loop_Debug)
      __pmp_debug("LOOPS_DEBUG", "__ompc_ordered: stop serving global_id=%d "
                  " ticket_number=%" PRId64 "\n", global_id, ticket_number);
#endif

    loop->now_serving += loop->inc;
    thread->ticket_number = ticket_number + loop->inc;
  }
}
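
Together with Example #1 this is essentially a ticket lock keyed by loop iteration: __ompc_ordered spins until now_serving reaches the thread's ticket, and __ompc_end_ordered passes the turn to the next iteration. For comparison, a minimal generic ticket lock built on C11 atomics (an illustration of the technique, not code from the runtime; here tickets come from a fetch-and-add rather than from the loop schedule):

#include <stdatomic.h>
#include <sched.h>

typedef struct {
  atomic_long next_ticket;   /* taken by a thread that wants to enter */
  atomic_long now_serving;   /* advanced by the holder on exit */
} ticket_lock_t;             /* initialise both fields to 0 */

static void ticket_lock(ticket_lock_t *l)
{
  long my_ticket = atomic_fetch_add(&l->next_ticket, 1);
  while (atomic_load(&l->now_serving) != my_ticket)
    sched_yield();           /* user-level spin, as in the runtime above */
}

static void ticket_unlock(ticket_lock_t *l)
{
  atomic_fetch_add(&l->now_serving, 1);
}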
Example #5
void __pmp_catch_segv (void)
{
  #define ALT_STACK_SIZE 8192
  void *new_stack = malloc(ALT_STACK_SIZE);
  stack_t ss = {
    .ss_sp = new_stack,
    .ss_flags = 0,
    .ss_size = ALT_STACK_SIZE
  };
  struct sigaction sa;
 
  memset(&sa, 0, sizeof(sa));
  sa.sa_sigaction = __pmp_segv;
  sa.sa_flags = SA_RESETHAND | SA_ONSTACK | SA_SIGINFO | SA_NODEFER;
 
  if (sigaltstack(&ss, NULL) == -1) {
    perror("sigaltstack");
    exit(1);
  }
 
  if (sigaction(SIGSEGV, &sa, NULL) == -1) {
    perror("sigaction");
    exit(1);
  }
}
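
The __pmp_segv handler installed above is not shown. Because SA_SIGINFO is set, it must use the three-argument form; a minimal sketch (the body is an assumption -- presumably the real handler reports a likely stack overflow and terminates):

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static void example_segv_handler(int sig, siginfo_t *info, void *context)
{
  /* async-signal-safety is relaxed here for brevity */
  fprintf(stderr, "SIGSEGV at address %p (possible stack overflow?)\n",
          info->si_addr);
  (void) sig;
  (void) context;
  abort();
}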

#define STACK_MIN 65536
 
#define dprint(...) \
        do { \
                if (verbose) { \
                        fprintf(stderr, __VA_ARGS__); \
                } \
        } while (0)

int __pmp_get_stack_size_limit(const char *new_limit, int64_t *max_stack_ptr,
                               int nthreads)
{
  struct rlimit rl;
  int64_t ncpus;
  int64_t phys_mem;
  int64_t max_stack;
  int verbose = getenv("PSC_STACK_VERBOSE") != NULL;

  *max_stack_ptr = 0;
 
  if (getrlimit(RLIMIT_STACK, &rl) == -1) {
    __pmp_warning("could not calculate your stack size limit\n");
    return -1;
  }
 
  if (rl.rlim_cur == RLIM_INFINITY) {
    dprint("No stack size limits currently in place\n");
  } else {
    dprint("Stack size limits: %ld current, %ld maximum\n",
      (long) rl.rlim_cur, (long) rl.rlim_max);
  }
 
#if defined(BUILD_OS_DARWIN)
  phys_mem = get_sysctl_int("hw.memsize");
  ncpus = get_sysctl_int(SYSCTL_NPROCESSORS_ONLN);
#elif defined(__NetBSD__)
  phys_mem = get_sysctl_int("hw.physmem64");
  ncpus = sysconf(_SC_NPROCESSORS_ONLN);
#else /* defined(BUILD_OS_DARWIN) */
  phys_mem = (int64_t)sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGE_SIZE);
  ncpus = sysconf(_SC_NPROCESSORS_ONLN);
#endif /* defined(BUILD_OS_DARWIN) */
  nthreads = (nthreads > 0) ? nthreads : ncpus;
 
 
  dprint("Physical memory: %"PRId64" bytes\n", phys_mem);
  dprint("Number of CPUs: %"PRId64"\n", ncpus);
  dprint("Default number of threads per team: %d\n", nthreads);
 
  max_stack = phys_mem - 64LL * 1048576;
 
  if (max_stack > ULONG_MAX) {
    max_stack = ULONG_MAX;
  }
 
  if (phys_mem > 512LL * 1048576) {
    max_stack -= 128LL * 1048576 * ncpus;
  } else {
    max_stack -= (phys_mem >> 3) * ncpus;
  }

  if (nthreads > 1) {
    /* share max_stack over nthreads for OpenMP programs */
    max_stack /= nthreads;
  }

  dprint("Automatic maximum stack size limit: %"PRId64" (%"PRId64"%% of RAM)\n",
    max_stack, (max_stack * 100) / phys_mem);

  if (new_limit && *new_limit) {
    double max;
    char *end;
 
    max = strtod(new_limit, &end);
 
    if (errno == ERANGE) {
      __pmp_warning("your requested stack size limit of "
                    "\"%s\" is not well-formed\n", new_limit);
      return -1;
    }
 
    switch (tolower((unsigned char)*end)) {
    case 'k':
      max *= 1024;
      break;
    case 'm':
      max *= 1048576;
      break;
    case 'g':
      max *= 1073741824;
      break;
    case '%':
      max = phys_mem * max / 100;
      break;
    default: break;
    }
 
    if (*end && strcasecmp(end + 1, "/cpu") == 0) {
      max *= ncpus;
    }
 
    if (isinf(max)) {
      max_stack = RLIM_INFINITY;
    }
    else if (max < 0) {
      max_stack = phys_mem + max;
    } else {
      max_stack = max;
    }
 
    if (max_stack != RLIM_INFINITY && max_stack < STACK_MIN) {
      __pmp_warning("bad maximum stack size "
                    "limit of %"PRId64" (specified as \"%s\")\n",
        max_stack, new_limit);
      return -1;
    }
 
    if (max_stack > ULONG_MAX) {
      __pmp_warning("treating requested stack "
                    "size limit of %"PRId64" as no limit\n",
        max_stack);
      max_stack = RLIM_INFINITY;
    }
 
    if (max_stack == RLIM_INFINITY) {
      dprint("You have asked for no stack size limit\n");
    } else {
      dprint("You have asked for a stack size limit of "
        "%"PRId64" (%"PRId64"%% of RAM)\n",
        max_stack, (max_stack * 100) / phys_mem);
    }
 
    if (max_stack > phys_mem) {
      __pmp_warning("your requested stack "
                    "size limit is %"PRId64"%% of physical memory\n",
                    (max_stack * 100) / phys_mem);
    }
  }
  else if (rl.rlim_cur == RLIM_INFINITY || rl.rlim_cur > max_stack) {
    dprint("Will not automatically reduce stack size limit\n");
  }
 
  if (max_stack > rl.rlim_max)
    max_stack = rl.rlim_max;

  *max_stack_ptr = max_stack;

  return 0;
}
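
A possible call site for the parser above (the limit string and the way the result is used are assumptions based only on the signature shown): "256m" means 256 MiB, "80%" means 80% of physical memory, a "/cpu" suffix scales the value by the number of CPUs, and passing nthreads as 0 falls back to the CPU count.

#include <stdint.h>
#include <stdio.h>

int __pmp_get_stack_size_limit(const char *new_limit, int64_t *max_stack_ptr,
                               int nthreads);   /* defined above */

void stack_limit_example(void)
{
  int64_t max_stack = 0;
  if (__pmp_get_stack_size_limit("256m", &max_stack, 0) == 0) {
    printf("stack size limit: %lld bytes\n", (long long) max_stack);
  }
}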
Example #6
static inline void __pmp_static_init (int global_id, int sched,
                                      int64_t *lowerp, int64_t *upperp, 
                                      int64_t *stridep,
                                      int64_t inc, int64_t chunk)
{
  /* NOTE: chunk parameter is undefined/unused for static even scheduling */

  pmp_thread_t *thread = __pmp_get_thread(global_id);
  int team_size = __pmp_get_team_size(thread->team);
  int64_t loop_lower = *lowerp;
  int64_t loop_upper = *upperp;
  int64_t lower;
  int64_t upper;
  
  assert(team_size > 0);

  if (chunk <= 0) {
    if (thread->global_id == 0)
      __pmp_warning("Chunk size is non-positive, set to default '1'\n");
    chunk = 1;
  }

  if (team_size == 1) {
    *stridep = (inc > 0) ? (loop_upper - loop_lower + 1) : 
                           (loop_upper - loop_lower - 1);
  }
  else {
    pmp_local_id_t local_id = thread->local_id;
    int64_t stride;
    switch (sched) {
      case PMP_SCHED_STATIC_EVEN: {
        int64_t size = (loop_upper - loop_lower) / inc + 1;
        assert(size >= 0);
        if (!__pmp_get_param()->static_fair) {
          /* The size is divided by the team_size and rounded up to give
           * the chunk size. Chunks of this size are assigned to threads
           * in increased local_id order. If the division was not exact
           * then the last thread will have fewer iterations, and possibly
           * none at all. */
          chunk = (size + team_size - 1) / team_size;
          lower = loop_lower + (local_id * chunk * inc);
        }
        else {
          /* The size is divided by the team_size and rounded down to 
           * give the chunk. Each thread will have at least this many
           * iterations. If the division was not exact then the remainder
           * iterations are scheduled across the threads in increasing
           * thread order. Note that the difference between the minimum
           * and maximum number of iterations assigned to the threads
           * across the team is at most 1. The maximum number of iterations
           * assigned to a thread (the worst case path through the schedule)
           * is the same as for default behavior. */
          int64_t remainder;
          int64_t index;
          chunk = size / team_size;
          remainder = size - (chunk * team_size);
          index = MIN(local_id, remainder) * (chunk + 1);
          if (local_id > remainder) {
            index += (local_id - remainder) * chunk;
          }
          lower = loop_lower + (index * inc);
          chunk += (local_id < remainder);
        }

        if ((inc >= 0) ? (lower <= loop_upper) : (lower >= loop_upper)) {
          upper = LOOPMIN(inc, lower + (chunk - 1) * inc, loop_upper);
          stride = size * inc;
        }
        else {
          /* If the entire set of iterations falls out of the loop bounds
           * then arrange for a non-iterating loop which will not trigger
           * the LASTPRIVATE check made by the compiler. This means that
           * the final value of the loop induction variable must not exceed
           * the loop upper bound. */
          lower = loop_lower - inc;
          upper = lower - inc;
          stride = inc;
        }
        __pmp_loop_analyser(thread, sched, global_id, local_id,
                            loop_lower, loop_upper,
                            lower, upper, inc, chunk, stride);
        break;
      }
      case PMP_SCHED_STATIC: {
        stride = chunk * inc;
        lower = loop_lower + (local_id * stride);
        if ((inc >= 0) ? (lower <= loop_upper) : (lower >= loop_upper)) {
          upper = LOOPMIN(inc, lower + (chunk - 1) * inc, loop_upper);
          stride *= team_size;
        }
        else {
          /* If the entire set of iterations falls out of the loop bounds
           * then arrange for a non-iterating loop which will not trigger
           * the LASTPRIVATE check made by the compiler. This means that
           * the final value of the loop induction variable must not exceed
           * the loop upper bound. */
          lower = loop_lower - inc;
          upper = lower - inc;
          stride = inc;
        }
        __pmp_loop_analyser(thread, sched, global_id, local_id,
                            loop_lower, loop_upper,
                            lower, upper, inc, chunk, stride);
        break;
      }
      default: {
        __pmp_fatal("unknown static scheduling type %d\n", sched);
        stride = 0;
        lower = loop_lower;
        upper = loop_upper;
      }
    }
    *lowerp = lower;
    *upperp = upper;
    *stridep = stride;
  }
  __pmp_scheduler_sample(sched);
}
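
A worked restatement of the "static_fair" split in the PMP_SCHED_STATIC_EVEN branch above, extracted as a standalone helper (the helper name and demo values are illustrative, not part of the runtime):

#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* For `size` iterations over `team_size` threads, compute the first iteration
 * index and the iteration count owned by thread `local_id`. */
static void fair_split(int64_t size, int team_size, int local_id,
                       int64_t *first_index, int64_t *count)
{
  int64_t chunk = size / team_size;              /* rounded down */
  int64_t remainder = size - chunk * team_size;  /* leftover iterations */
  int64_t index = MIN(local_id, remainder) * (chunk + 1);
  if (local_id > remainder)
    index += (local_id - remainder) * chunk;
  *first_index = index;
  *count = chunk + (local_id < remainder);       /* first `remainder` threads get one extra */
}

int main(void)
{
  /* 10 iterations over 4 threads -> starts 0,3,6,8 with counts 3,3,2,2 */
  int id;
  for (id = 0; id < 4; id++) {
    int64_t first, count;
    fair_split(10, 4, id, &first, &count);
    printf("thread %d: first=%lld count=%lld\n",
           id, (long long) first, (long long) count);
  }
  return 0;
}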
Example #7
void __pmp_thread_create (pmp_thread_t *thread)
{
  pmp_thread_t *creator = __pmp_get_current_thread();
  pthread_t pthread_id;
  int result;
  pmp_param_t *param = __pmp_get_param();

  thread->creator = creator;

  if (param->thread_guard_size > 0) {
    void *guard;
    /* NOTE: this lock is to give a better chance of the guard page 
     * allocation to immediately follow the pthread stack allocation. */
    __pmp_lock(thread->global_id, &__pmp_manager.pthread_create_lock);

    /* NOTE: it seems that mmap tends to allocate in an upwards direction
       so allocate the guard page first. */
    guard = mmap(0, param->thread_guard_size, PROT_NONE,
#if defined(BUILD_OS_DARWIN)
                 MAP_PRIVATE | MAP_ANON,
#else /* defined(BUILD_OS_DARWIN) */
                 MAP_PRIVATE | MAP_ANONYMOUS,
#endif /* defined(BUILD_OS_DARWIN) */
                 0, 0);
    if (guard == MAP_FAILED) {
      __pmp_warning("unable to allocate a guard page of %ld bytes\n",
                    (long) param->thread_guard_size);
    }
    else {
      __pmp_debug(PMP_DEBUG_THREAD, "guard page allocated at address %p\n",
                  guard);
      thread->guard_page = guard;
    }
  }

  if ((result = pthread_create(&pthread_id, &__pmp_manager.pthread_attr,
                               __pmp_thread_run, thread)) != 0) {
    if (__pmp_manager.allocated_threads > param->initial_team_size) {
      __pmp_warning(
        "pthread_create failed when trying to allocate thread %d\n",
        __pmp_manager.allocated_threads);
      __pmp_warning(
        "note this is more than the initial number of threads (%d)\n",
        param->initial_team_size);
#if defined(BUILD_OS_DARWIN)
      if (sizeof(long) == 4)
#else /* defined(BUILD_OS_DARWIN) */
      if (__WORDSIZE == 32)
#endif /* defined(BUILD_OS_DARWIN) */
      {
        int64_t total_stack = ((int64_t) param->thread_stack_size) *
                              ((int64_t) __pmp_manager.allocated_threads);
        if (total_stack > 0x40000000LL) {
          __pmp_warning(
            "the failure may be due to excessive thread stack size\n");
          __pmp_warning(
            "try using a smaller setting for PSC_OMP_STACK_SIZE\n");
        }
      }
    }
    __pmp_fatal("unable to create thread (result code %d)\n", result);
  }

  if (param->thread_guard_size > 0) {
    __pmp_unlock(thread->global_id, &__pmp_manager.pthread_create_lock);
  }

  __pmp_atomic_xadd32(&__pmp_manager.waiting_threads, 1);

  __pmp_debug(PMP_DEBUG_THREAD, "created thread global_id %d\n", 
              thread->global_id);
}
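
The stack size mentioned in the warnings above is carried by __pmp_manager.pthread_attr, which is initialised elsewhere. A sketch of the usual way such an attribute is prepared (an assumption, not the runtime's actual initialisation code):

#include <pthread.h>

static int init_thread_attr(pthread_attr_t *attr, size_t stack_size)
{
  int rc = pthread_attr_init(attr);
  if (rc != 0)
    return rc;
  /* stack_size would come from a setting such as PSC_OMP_STACK_SIZE */
  return pthread_attr_setstacksize(attr, stack_size);
}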