Пример #1
0
int __ompc_master (int global_id)
{
  int master = (__pmp_get_thread(global_id)->local_id == 0);
  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_master global_id=%d returns %d\n",
              global_id, master);
  __pmp_sample(PMP_PROFILE_OMPC_MASTER);
  return master;
}
Пример #2
0
int __ompc_single (int global_id)
{
  int result = __pmp_thread_single(__pmp_get_thread(global_id));
  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_single global_id=%d returns %d\n", 
              global_id, result);
  __pmp_sample(PMP_PROFILE_OMPC_SINGLE);
  return result;
}
Пример #3
0
void __ompc_ordered (int global_id)
{
  pmp_thread_t *thread = __pmp_get_thread(global_id);

  if (__pmp_get_team_size(thread->team) > 1) {
    pmp_loop_t *loop = thread->loop;
    int64_t ticket_number = thread->ticket_number;
    int64_t now_serving;

#ifdef SUPER_DEBUG
    if (Enabled_Libomp_Call_Debug)
      __pmp_debug("CALLS_DEBUG", "__ompc_ordered: global_id=%d\n", global_id);
#endif
    __pmp_sample(PMP_PROFILE_OMPC_ORDERED);

    if (loop == NULL || loop->sched <= PMP_SCHED_ORDERED_OFFSET) {
      __pmp_warning("ordered directives must be used inside ordered "
                    "OpenMP loops\n");
      return;
    }

    assert(loop != NULL);

    now_serving = loop->now_serving;

    if (now_serving != ticket_number) {
      if ((loop->inc >= 0) ? (now_serving > ticket_number) :
                             (now_serving < ticket_number)) {
        __pmp_warning("ordered OpenMP loop may result in program deadlock\n");
        __pmp_warning("maybe due to multiple ordered directives "
                      "in a loop iteration\n");
      }
      while (loop->now_serving != ticket_number) {
        /* USER LEVEL SPIN LOOP */
        __pmp_yield();
      }
    }

#ifdef SUPER_DEBUG
    if (Enabled_Libomp_Loop_Debug)
      __pmp_debug("LOOPS_DEBUG", "__ompc_ordered: now serving global_id=%d "
                  " ticket_number=%" PRId64 "\n", global_id, ticket_number);
#endif
  }

  __pmp_memory_fence();
}
Пример #4
0
static inline void __pmp_scheduler_init (int global_id, int sched,
                                         int64_t lower, int64_t upper,
                                         int64_t inc, int64_t chunk)
{
  /* NOTE: chunk parameter is undefined/unused for static even scheduling */

  pmp_thread_t *thread = __pmp_get_thread(global_id);
  pmp_param_t *param = __pmp_get_param();
  int64_t min_chunk = MAX(1, chunk);

  if (sched == PMP_SCHED_RUNTIME || sched == PMP_SCHED_ORDERED_RUNTIME) {
    int old = sched;
    sched = param->runtime_schedule;
    chunk = param->runtime_chunk;
    if (old == PMP_SCHED_ORDERED_RUNTIME) {
      sched += PMP_SCHED_ORDERED_OFFSET;
    } 
  }

  if (sched == PMP_SCHED_GUIDED || sched == PMP_SCHED_ORDERED_GUIDED) {
    /* The initial chunk is loop trip count spread over the number of 
     * threads (the division is rounded up) */
    int team_size = __pmp_get_team_size(thread->team);
    int64_t divisor = team_size * param->guided_chunk_divisor;
    chunk = (upper - lower + divisor) / divisor;
    chunk = MIN(param->guided_chunk_max, chunk);
    chunk = MAX(min_chunk, chunk);
  }

  if (chunk <= 0) {
    if (thread->global_id == 0)
      __pmp_warning("Chunk size is non-positive, set to default '1'\n");
    chunk = 1;
  }

  __pmp_scheduler_sample(sched);

  assert(inc != 0 && chunk != 0 && min_chunk != 0);

  __pmp_loop_alloc(thread, sched, lower, upper, inc, chunk, min_chunk);

  thread->iteration = 0;
}
Пример #5
0
void __ompc_end_ordered (int global_id)
{
  pmp_thread_t *thread = __pmp_get_thread(global_id);

  __pmp_memory_fence();

  if (__pmp_get_team_size(thread->team) > 1) {
    pmp_loop_t *loop = thread->loop;
    int64_t ticket_number = thread->ticket_number;

#ifdef SUPER_DEBUG
    if (Enabled_Libomp_Call_Debug)
      __pmp_debug("CALLS_DEBUG", "__ompc_end_ordered: global_id=%d\n",
                  global_id);
#endif
    __pmp_sample(PMP_PROFILE_OMPC_END_ORDERED);

    if (loop == NULL || loop->sched <= PMP_SCHED_ORDERED_OFFSET) {
      if (thread->global_id == 0)
        __pmp_warning("ordered directives must be used inside ordered "
                      "OpenMP loops\n");
      return;
    }

    assert(loop != NULL);
    assert(loop->now_serving == ticket_number);

#ifdef SUPER_DEBUG
    if (Enabled_Libomp_Loop_Debug)
      __pmp_debug("LOOPS_DEBUG", "__ompc_ordered: stop serving global_id=%d "
                  " ticket_number=%" PRId64 "\n", global_id, ticket_number);
#endif

    loop->now_serving += loop->inc;
    thread->ticket_number = ticket_number + loop->inc;
  }
}
Пример #6
0
static inline int __pmp_schedule_next (int global_id, int64_t *lowerp,
                                       int64_t *upperp, int64_t *incp)
{
  pmp_thread_t *thread = __pmp_get_thread(global_id);
  int team_size = __pmp_get_team_size(thread->team);
  int64_t iteration = thread->iteration;
  pmp_local_id_t local_id = thread->local_id;
  pmp_loop_t *loop = thread->loop;

  assert(loop != NULL);
  assert(local_id < team_size);

  if (team_size == 1) {
    if (iteration == 0) {
      *lowerp = loop->lower;
      *upperp = loop->upper;
      *incp = loop->inc;
      thread->ticket_number = loop->lower;
      thread->iteration = 1;
      __pmp_loop_analyser(thread, loop->sched, global_id, local_id,
                          loop->lower, loop->upper,
                          *lowerp, *upperp, *incp, 0, 0);
      return 1;
    }
    else {
      assert(iteration == 1);
      __pmp_loop_free(thread);
      return 0;
    }
  }
  else {
    int     sched = loop->sched;
    int64_t lower = loop->lower;
    int64_t upper = loop->upper;
    int64_t inc   = loop->inc;
    int64_t chunk = loop->chunk;
    switch (sched) {
      case PMP_SCHED_STATIC:
      case PMP_SCHED_ORDERED_STATIC: {
        /* NOTE: setting a small value of chunk causes (unnecessary) iteration
         * through this code. If the chunk is ignored, the code degenerates
         * into the static even case (which is the default). */
        int64_t size = (upper - lower) / inc + 1;
        int64_t size_per_thread = ((size - 1) / team_size + 1) * inc;
        int64_t thread_lower = lower + (local_id * size_per_thread);
        int64_t thread_upper = thread_lower + size_per_thread - inc;
        int64_t this_lower = thread_lower + (iteration * chunk * inc);
        int64_t this_upper = this_lower + (chunk - 1) * inc;
        thread_upper = LOOPMIN(inc, thread_upper, upper);
        this_upper = LOOPMIN(inc, this_upper, thread_upper);
        if ((inc >= 0) ? (this_lower > thread_upper) : 
                         (this_lower < thread_upper)) {
          __pmp_loop_free(thread);
          return 0;
	}
        else {
          *incp = inc;
          *lowerp = this_lower;
          *upperp = this_upper;
          thread->ticket_number = this_lower;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id, lower, upper,
                              *lowerp, *upperp, *incp, 0, 0);
          return 1;
	}
        /* NOT REACHED */
        break;
      }
      case PMP_SCHED_STATIC_EVEN:
      case PMP_SCHED_ORDERED_STATIC_EVEN: {
        if (iteration == 0) {
          int64_t size = (upper - lower) / inc + 1;
          int64_t thread_lower;
          int64_t thread_upper;
          if (!__pmp_get_param()->static_fair) {
            int64_t size_per_thread = ((size - 1) / team_size + 1) * inc;
            thread_lower = lower + (local_id * size_per_thread);
            thread_upper = thread_lower + size_per_thread - inc;
	  }
          else {
            int64_t chunk = size / team_size;
            int64_t remainder = size - (chunk * team_size);
            int64_t index = MIN(local_id, remainder) * (chunk + 1);
            if (local_id > remainder) {
              index += (local_id - remainder) * chunk;
            }
            thread_lower = lower + (index * inc);
            chunk += (local_id < remainder);
            thread_upper = thread_lower + (chunk - 1) * inc;
	  }
          thread_upper = LOOPMIN(inc, thread_upper, upper);
          if ((inc >= 0) ? (thread_lower > thread_upper) : 
                           (thread_lower < thread_upper)) {
            __pmp_loop_free(thread);
            return 0;
	  }
	  else {
            *incp = inc;
            *lowerp = thread_lower;
            *upperp = thread_upper;
            thread->ticket_number = thread_lower;
            thread->iteration++;
            __pmp_loop_analyser(thread, sched, global_id, local_id,
                                lower, upper, *lowerp, *upperp, 
                                *incp, 0, 0);
            return 1;
	  }
	}
        else {
          assert(iteration == 1);
          __pmp_loop_free(thread);
	  return 0;
	}
        /* NOT REACHED */
        break;
      }
      case PMP_SCHED_DYNAMIC:
      case PMP_SCHED_ORDERED_DYNAMIC: {
        int64_t stride = inc * chunk;
#if __WORDSIZE == 64
        int64_t current = __pmp_atomic_xadd64(&loop->current, stride);
#else
        /* TODO: the atomic xadd64 is a problem for 32-bit compilation */
        /*       the workaround below is just to do a 32-bit atomic add */
        int64_t current;
        current = (int64_t) __pmp_atomic_xadd32((int32_t *) &loop->current,
                                                (int32_t) stride);
#endif
        if ((inc >= 0) ? (current > upper) : (current < upper)) {
          __pmp_loop_free(thread);
          return 0;
	}
	else {
          *incp = inc;
          *lowerp = current;
          *upperp = *lowerp + stride - inc;
          *upperp = LOOPMIN(inc, upper, *upperp);
          thread->ticket_number = current;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id, lower, upper,
                              *lowerp, *upperp, *incp, 0, 0);
          return 1;
	}
        /* NOT REACHED */
        break;
      }
      case PMP_SCHED_GUIDED:
      case PMP_SCHED_ORDERED_GUIDED: {
        /* NOTE: guided scheduling uses a heuristic to choose a good
         * chunk size to divide up the remaining iterations amongst
         * the team (subject to a minimum). An exact implementation of 
         * this would require a lock on the loop data. However, the
         * heuristic can be approximated using (possibly) stale values 
         * and this should be good enough. The value of "remaining"
         * is monotonically decreasing. The worst that could happen
         * is that an update to loop->chunk is lost slightly unbalancing
         * the distribution. The most important point is that loop->current
         * is maintained atomically. */
        /* UPDATE: if cmpxchg64 is available then this is used to protect
         * the update of loop->chunk. This is fairly cunning, and makes
         * the chunk update more accurate in this case! */
        int64_t min_chunk = loop->min_chunk;
        int64_t remaining = upper - loop->current + 1;		 /* estimate */
        int64_t my_chunk = MAX(min_chunk, MIN(chunk, remaining));/* estimate */
        int64_t stride = inc * my_chunk;
#if __WORDSIZE == 64
        int64_t current = __pmp_atomic_xadd64(&loop->current, stride);
#else
        /* TODO: the atomic xadd64 is a problem for 32-bit compilation */
        /*       the workaround below is just to do a 32-bit atomic add */
        int64_t current = __pmp_atomic_xadd32((int32_t *) &loop->current,
                                              (int32_t) stride);
#endif
        assert(stride != 0);
#ifdef SUPER_DEBUG
        if (Enabled_Libomp_Loop_Debug)
          __pmp_debug("LOOPS_DEBUG", "__pmp_schedule_next: global_id=%d, "
                      "remaining=%d, my_chunk=%d, stride=%d, current=%d\n",
                      global_id, remaining, my_chunk, stride, current);
#endif

        if ((inc >= 0) ? (current > upper) : (current < upper)) {
          __pmp_loop_free(thread);
          return 0;
	}
	else {
          pmp_param_t *param = __pmp_get_param();
          int64_t my_upper = LOOPMIN(inc, upper, current + stride - inc);
          int64_t new_chunk;
          int64_t divisor;
          remaining = upper - my_upper;				/* estimate */
          divisor = team_size * param->guided_chunk_divisor;
          new_chunk = (remaining + divisor - 1) / divisor;
          new_chunk = MIN(param->guided_chunk_max, new_chunk);
          new_chunk = MAX(min_chunk, new_chunk);
#if __WORDSIZE == 64
          (void) __pmp_atomic_cmpxchg64(&loop->chunk, chunk, new_chunk);
#else
          loop->chunk = new_chunk;				/* estimate */
#endif
          *incp = inc;
          *lowerp = current;
          *upperp = my_upper;
          thread->ticket_number = current;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id, lower, upper,
                              *lowerp, *upperp, *incp, 0, 0);
          return 1;
	}
        /* NOT REACHED */
        break;
      }
      default: {
        __pmp_fatal("unknown dynamic scheduling type %d\n", sched);
        break;
      }
    }
    /* NOT REACHED */
    assert(0);
    __pmp_loop_free(thread);
    return 0;
  }
  /* NOT REACHED */
}
Пример #7
0
static inline void __pmp_static_init (int global_id, int sched,
                                      int64_t *lowerp, int64_t *upperp, 
                                      int64_t *stridep,
                                      int64_t inc, int64_t chunk)
{
  /* NOTE: chunk parameter is undefined/unused for static even scheduling */

  pmp_thread_t *thread = __pmp_get_thread(global_id);
  int team_size = __pmp_get_team_size(thread->team);
  int64_t loop_lower = *lowerp;
  int64_t loop_upper = *upperp;
  int64_t lower;
  int64_t upper;
  
  assert(team_size > 0);

  if (chunk <= 0) {
    if (thread->global_id == 0)
      __pmp_warning("Chunk size is non-positive, set to default '1'\n");
    chunk = 1;
  }

  if (team_size == 1) {
    *stridep = (inc > 0) ? (loop_upper - loop_lower + 1) : 
                           (loop_upper - loop_lower - 1);
  }
  else {
    pmp_local_id_t local_id = thread->local_id;
    int64_t stride;
    switch (sched) {
      case PMP_SCHED_STATIC_EVEN: {
        int64_t size = (loop_upper - loop_lower) / inc + 1;
        assert(size >= 0);
        if (!__pmp_get_param()->static_fair) {
          /* The size is divided by the team_size and rounded up to give
           * the chunk size. Chunks of this size are assigned to threads
           * in increased local_id order. If the division was not exact
           * then the last thread will have fewer iterations, and possibly
           * none at all. */
          chunk = (size + team_size - 1) / team_size;
          lower = loop_lower + (local_id * chunk * inc);
	}
        else {
          /* The size is divided by the team_size and rounded down to 
           * give the chunk. Each thread will have at least this many
           * iterations. If the division was not exact then the remainder
           * iterations are scheduled across the threads in increasing
           * thread order. Note that the difference between the minimum
           * and maximum number of iterations assigned to the threads
           * across the team is at most 1. The maximum number of iterations
           * assigned to a thread (the worst case path through the schedule)
           * is the same as for default behavior. */
          int64_t remainder;
          int64_t index;
          chunk = size / team_size;
          remainder = size - (chunk * team_size);
          index = MIN(local_id, remainder) * (chunk + 1);
          if (local_id > remainder) {
            index += (local_id - remainder) * chunk;
	  }
          lower = loop_lower + (index * inc);
          chunk += (local_id < remainder);
	}

        if ((inc >= 0) ? (lower <= loop_upper) : (lower >= loop_upper)) {
          upper = LOOPMIN(inc, lower + (chunk - 1) * inc, loop_upper);
          stride = size * inc;
        }
        else {
          /* If the entire set of iterations falls out of the loop bounds
           * then arrange for a non-iterating loop which will not trigger
           * the LASTPRIVATE check made by the compiler. This means that
           * the final value of the loop induction variable must not exceed
           * the loop upper bound. */
          lower = loop_lower - inc;
          upper = lower - inc;
          stride = inc;
        }
        __pmp_loop_analyser(thread, sched, global_id, local_id,
                            loop_lower, loop_upper,
                            lower, upper, inc, chunk, stride);
        break;
      }
      case PMP_SCHED_STATIC: {
        stride = chunk * inc;
        lower = loop_lower + (local_id * stride);
        if ((inc >= 0) ? (lower <= loop_upper) : (lower >= loop_upper)) {
          upper = LOOPMIN(inc, lower + (chunk - 1) * inc, loop_upper);
          stride *= team_size;
        }
        else {
          /* If the entire set of iterations falls out of the loop bounds
           * then arrange for a non-iterating loop which will not trigger
           * the LASTPRIVATE check made by the compiler. This means that
           * the final value of the loop induction variable must not exceed
           * the loop upper bound. */
          lower = loop_lower - inc;
          upper = lower - inc;
          stride = inc;
        }
        __pmp_loop_analyser(thread, sched, global_id, local_id,
                            loop_lower, loop_upper,
                            lower, upper, inc, chunk, stride);
        break;
      }
      default: {
        __pmp_fatal("unknown static scheduling type %d\n", sched);
        stride = 0;
        lower = loop_lower;
        upper = loop_upper;
      }
    }
    *lowerp = lower;
    *upperp = upper;
    *stridep = stride;
  }
  __pmp_scheduler_sample(sched);
}