/* Allocate and initialise a new task that will run FUNC.

   PARENT    task that spawns the new one; stored in _parent.  It is
             dereferenced when NDEPS is non-zero, so it must be non-NULL
             in that case.  The statistics code tolerates PARENT == NULL.
   FUNC      task body, stored in the tsk field.
   ARG       argument block that is copied into the new task's own
             argument area (the address returned by getTaskArgs).
   CPYFN     when non-zero, called as cpyfn (destination, ARG) to perform
             the copy; otherwise ARGSIZE bytes are copied with memcpy.
   ARGSIZE   size of the argument block in bytes.
   ARG_ALIGN forwarded to allocTask together with ARGSIZE and NDEPS.
   NDEPS     number of dependencies; recorded in both _dependencies and
             _udependencies.

   Returns the new task after handing it to trackTask.  */
static inline blysk_task* blysk__create_task(blysk_task* parent, void (*func)(void *), void *arg, void (*cpyfn) (void *, void *), u32 argsize, u32 arg_align, int nDeps) {
    blysk_task *new_task = allocTask(nDeps, argsize, arg_align);

    /* The parent's dependency manager is created lazily, the first time
       one of its children is spawned with dependencies.  */
    if (nDeps != 0 && parent->__dep_manager == NULL)
        blysk__DEP_init(parent);

    new_task->tsk = func;
    new_task->_dependencies = nDeps;
    new_task->_udependencies = (Counter) {(unsigned)nDeps};

    if (expectFalse(cpyfn != 0)) {
        cpyfn(getTaskArgs(new_task), arg);
    } else if (argsize != 0) {
        /* Skip the copy for empty argument blocks: ARG may legitimately be
           NULL then, and memcpy with a NULL pointer is undefined even for
           a length of zero.  */
        memcpy(getTaskArgs(new_task), arg, argsize);
    }

    new_task->_parent = parent;
    new_task->_children = (Counter) {0};
    new_task->__dep_manager = NULL;
    new_task->FPN[0] = NULL;
    new_task->FPN[1] = NULL;
    new_task->Type = UNIT;
    trackTask(new_task);

    /* TODO: clean this stuff up.  */
#if defined(__STAT_TASK)
    new_task->stat__task = fetchAndAddCounter(&uniqueTask_ids, 1, RELAXED);
    new_task->stat__parent = parent != NULL ? parent->stat__task : 0;
    new_task->stat__create_instant = rdtsc();
    /* Apply the same NULL guard as stat__parent above: a task without a
       parent is recorded as child number 0.  */
    new_task->stat__child_number = parent != NULL ? parent->stat__num_children++ : 0;
    new_task->stat__cpu_id_create = blysk__THREAD_get_rid();
    new_task->stat__num_children = 0;

    /* Set the remaining statistics to their defaults.  */
    new_task->stat__cpu_id = -1;
    new_task->stat__exec_cycles = 0;
    new_task->stat__creation_cycles = 0;
    new_task->stat__overhead_cycles = 0;
    new_task->stat__queue_size = 0;
    new_task->stat__exec_end_instant = -1;
    new_task->stat__cpu_id_release = -1;
    new_task->stat__release_instant = -1;
    new_task->stat__dependency_resolution_time = -1;
    new_task->stat__joins_at = -1;
    new_task->stat__joins_counter = 0;
#endif

    return new_task;
}
/* Create a task that runs FN.

   FN         task body.
   DATA       argument block handed to FN (directly, or through a copy).
   CPYFN      when non-NULL, called as cpyfn (copy, DATA) to build the
              task's private argument block; otherwise the deferred path
              uses memcpy and the undeferred path passes DATA itself.
   ARG_SIZE   size in bytes of the argument block.
   ARG_ALIGN  alignment used for the private copy; the rounding below
              masks with (ARG_ALIGN - 1), so it presumably has to be a
              power of two -- TODO confirm with callers.
   IF_CLAUSE  when false the task is run immediately by the caller.
   FLAGS      bit mask.  This function tests bit 0 (value 1; cleared under
              HAVE_BROKEN_POSIX_SEMAPHORES, where untied tasks must be
              tied), bit 1 (value 2; becomes final_task) and bit 3
              (value 8; DEPEND is consulted).
   DEPEND     only read when bit 3 of FLAGS is set: depend[0] is the
              number of entries, depend[1] the number of leading
              out/inout entries, depend[2 ...] the addresses.

   The task is either executed right away on the calling thread
   (undeferred path) or heap-allocated and put on the team queue
   (deferred path).  */
void
GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
           long arg_size, long arg_align, bool if_clause, unsigned flags,
           void **depend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

#ifdef HAVE_BROKEN_POSIX_SEMAPHORES
  /* If pthread_mutex_* is used for omp_*lock*, then each task must be
     tied to one thread all the time.  This means UNTIED tasks must be
     tied and if CPYFN is non-NULL IF(0) must be forced, as CPYFN might
     be running on different thread than FN.  */
  if (cpyfn)
    if_clause = false;
  if (flags & 1)
    flags &= ~1;
#endif

  /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
  if (team
      && (gomp_team_barrier_cancelled (&team->barrier)
          || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
    return;

  /* Run the task immediately when the if clause is false, there is no
     team, the current task is final, or team->task_count already exceeds
     64 per team thread.  */
  if (!if_clause || team == NULL
      || (thr->task && thr->task->final_task)
      || team->task_count > 64 * team->nthreads)
    {
      /* The undeferred task lives on this stack frame.  */
      struct gomp_task task;

      /* If there are depend clauses and earlier deferred sibling tasks
         with depend clauses, check if there isn't a dependency.  If there
         is, we need to wait for them.  There is no need to handle
         depend clauses for non-deferred tasks other than this, because
         the parent task is suspended until the child task finishes and thus
         it can't start further child tasks.  */
      if ((flags & 8) && thr->task && thr->task->depend_hash)
        gomp_task_maybe_wait_for_dependencies (depend);

      gomp_init_task (&task, thr->task, gomp_icv (false));
      task.kind = GOMP_TASK_IFFALSE;
      /* Final-ness is inherited from the current task or requested
         through bit 1 of FLAGS.  */
      task.final_task = (thr->task && thr->task->final_task) || (flags & 2);
      if (thr->task)
        {
          task.in_tied_task = thr->task->in_tied_task;
          task.taskgroup = thr->task->taskgroup;
        }
      /* Make the new task current while FN (and CPYFN) run.  */
      thr->task = &task;
      if (__builtin_expect (cpyfn != NULL, 0))
        {
          /* Over-allocate by ARG_ALIGN - 1 bytes so the start of the copy
             can be rounded up to the requested alignment.  */
          char buf[arg_size + arg_align - 1];
          char *arg = (char *) (((uintptr_t) buf + arg_align - 1)
                                & ~(uintptr_t) (arg_align - 1));
          cpyfn (arg, data);
          fn (arg);
        }
      else
        fn (data);
      /* Access to "children" is normally done inside a task_lock
         mutex region, but the only way this particular task.children
         can be set is if this thread's task work function (fn)
         creates children.  So since the setter is *this* thread, we
         need no barriers here when testing for non-NULL.  We can have
         task.children set by the current thread then changed by a
         child thread, but seeing a stale non-NULL value is not a
         problem.  Once past the task_lock acquisition, this thread
         will see the real value of task.children.  */
      if (task.children != NULL)
        {
          gomp_mutex_lock (&team->task_lock);
          gomp_clear_parent (task.children);
          gomp_mutex_unlock (&team->task_lock);
        }
      gomp_end_task ();
    }
  else
    {
      struct gomp_task *task;
      struct gomp_task *parent = thr->task;
      struct gomp_taskgroup *taskgroup = parent->taskgroup;
      char *arg;
      bool do_wake;
      size_t depend_size = 0;

      /* A single allocation holds the task structure, then the depend
         entries, then the (aligned) argument block.  */
      if (flags & 8)
        depend_size = ((uintptr_t) depend[0]
                       * sizeof (struct gomp_task_depend_entry));
      task = gomp_malloc (sizeof (*task) + depend_size
                          + arg_size + arg_align - 1);
      arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1)
                      & ~(uintptr_t) (arg_align - 1));
      gomp_init_task (task, parent, gomp_icv (false));
      task->kind = GOMP_TASK_IFFALSE;
      task->in_tied_task = parent->in_tied_task;
      task->taskgroup = taskgroup;
      /* The new task is made current only for the duration of the
         argument copy; the parent is restored right afterwards.  */
      thr->task = task;
      if (cpyfn)
        {
          cpyfn (arg, data);
          task->copy_ctors_done = true;
        }
      else
        memcpy (arg, data, arg_size);
      thr->task = parent;
      task->kind = GOMP_TASK_WAITING;
      task->fn = fn;
      task->fn_data = arg;
      /* Bit 1 of FLAGS shifted down to a 0/1 value.  */
      task->final_task = (flags & 2) >> 1;
      gomp_mutex_lock (&team->task_lock);
      /* If parallel or taskgroup has been cancelled, don't start new
         tasks.  */
      /* The task is dropped here only when CPYFN has not run
         (copy_ctors_done is false).  */
      if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier)
                             || (taskgroup && taskgroup->cancelled))
                            && !task->copy_ctors_done, 0))
        {
          gomp_mutex_unlock (&team->task_lock);
          gomp_finish_task (task);
          free (task);
          return;
        }
      if (taskgroup)
        taskgroup->num_children++;
      if (depend_size)
        {
          size_t ndepend = (uintptr_t) depend[0];
          size_t nout = (uintptr_t) depend[1];
          size_t i;
          hash_entry_type ent;

          task->depend_count = ndepend;
          task->num_dependees = 0;
          /* The parent's hash table is created on first use, sized to at
             least 12 and otherwise twice the number of entries.  */
          if (parent->depend_hash == NULL)
            parent->depend_hash
              = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12);
          for (i = 0; i < ndepend; i++)
            {
              task->depend[i].addr = depend[2 + i];
              task->depend[i].next = NULL;
              task->depend[i].prev = NULL;
              task->depend[i].task = task;
              /* Entries at index NOUT and above are "in" dependences.  */
              task->depend[i].is_in = i >= nout;
              task->depend[i].redundant = false;
              task->depend[i].redundant_out = false;

              hash_entry_type *slot
                = htab_find_slot (&parent->depend_hash, &task->depend[i],
                                  INSERT);
              /* OUT: last non-"in" entry seen in the chain; LAST: last
                 entry visited before a redundant_out entry or the end.  */
              hash_entry_type out = NULL, last = NULL;
              if (*slot)
                {
                  /* If multiple depends on the same task are the
                     same, all but the first one are redundant.
                     As inout/out come first, if any of them is
                     inout/out, it will win, which is the right
                     semantics.  */
                  if ((*slot)->task == task)
                    {
                      task->depend[i].redundant = true;
                      continue;
                    }
                  for (ent = *slot; ent; ent = ent->next)
                    {
                      if (ent->redundant_out)
                        break;

                      last = ent;

                      /* depend(in:...) doesn't depend on earlier
                         depend(in:...).  */
                      if (i >= nout && ent->is_in)
                        continue;

                      if (!ent->is_in)
                        out = ent;

                      struct gomp_task *tsk = ent->task;
                      if (tsk->dependers == NULL)
                        {
                          /* First depender of TSK: start the vector with
                             room for 6 elements.  */
                          tsk->dependers
                            = gomp_malloc (sizeof (struct gomp_dependers_vec)
                                           + 6 * sizeof (struct gomp_task *));
                          tsk->dependers->n_elem = 1;
                          tsk->dependers->allocated = 6;
                          tsk->dependers->elem[0] = task;
                          task->num_dependees++;
                          continue;
                        }
                      /* We already have some other dependency on tsk
                         from earlier depend clause.  */
                      else if (tsk->dependers->n_elem
                               && (tsk->dependers->elem[tsk->dependers->n_elem
                                                        - 1]
                                   == task))
                        continue;
                      else if (tsk->dependers->n_elem
                               == tsk->dependers->allocated)
                        {
                          /* Vector is full: grow it to 2 * allocated + 2
                             elements.  */
                          tsk->dependers->allocated
                            = tsk->dependers->allocated * 2 + 2;
                          tsk->dependers
                            = gomp_realloc (tsk->dependers,
                                            sizeof (struct gomp_dependers_vec)
                                            + (tsk->dependers->allocated
                                               * sizeof (struct gomp_task *)));
                        }
                      tsk->dependers->elem[tsk->dependers->n_elem++] = task;
                      task->num_dependees++;
                    }
                  /* Link the new entry in front of the existing chain.  */
                  task->depend[i].next = *slot;
                  (*slot)->prev = &task->depend[i];
                }
              *slot = &task->depend[i];

              /* There is no need to store more than one depend({,in}out:)
                 task per address in the hash table chain for the purpose
                 of creation of deferred tasks, because each out
                 depends on all earlier outs, thus it is enough to record
                 just the last depend({,in}out:).  For depend(in:), we need
                 to keep all of the previous ones not terminated yet, because
                 a later depend({,in}out:) might need to depend on all of
                 them.  So, if the new task's clause is depend({,in}out:),
                 we know there is at most one other depend({,in}out:) clause
                 in the list (out).  For non-deferred tasks we want to see
                 all outs, so they are moved to the end of the chain,
                 after first redundant_out entry all following entries
                 should be redundant_out.  */
              if (!task->depend[i].is_in && out)
                {
                  if (out != last)
                    {
                      /* Unlink OUT and re-insert it right after LAST.  */
                      out->next->prev = out->prev;
                      out->prev->next = out->next;
                      out->next = last->next;
                      out->prev = last;
                      last->next = out;
                      if (out->next)
                        out->next->prev = out;
                    }
                  out->redundant_out = true;
                }
            }
          /* The task still waits on other tasks: it is not linked into
             any of the queues below.  */
          if (task->num_dependees)
            {
              gomp_mutex_unlock (&team->task_lock);
              return;
            }
        }
      /* Link the task into the parent's circular children list and make
         it the list head.  */
      if (parent->children)
        {
          task->next_child = parent->children;
          task->prev_child = parent->children->prev_child;
          task->next_child->prev_child = task;
          task->prev_child->next_child = task;
        }
      else
        {
          task->next_child = task;
          task->prev_child = task;
        }
      parent->children = task;
      /* Same for the taskgroup's circular list, when there is one.  */
      if (taskgroup)
        {
          if (taskgroup->children)
            {
              task->next_taskgroup = taskgroup->children;
              task->prev_taskgroup = taskgroup->children->prev_taskgroup;
              task->next_taskgroup->prev_taskgroup = task;
              task->prev_taskgroup->next_taskgroup = task;
            }
          else
            {
              task->next_taskgroup = task;
              task->prev_taskgroup = task;
            }
          taskgroup->children = task;
        }
      /* In the team queue the head pointer is only set when the queue
         was empty; otherwise the task is linked just before the current
         head and the head is left unchanged.  */
      if (team->task_queue)
        {
          task->next_queue = team->task_queue;
          task->prev_queue = team->task_queue->prev_queue;
          task->next_queue->prev_queue = task;
          task->prev_queue->next_queue = task;
        }
      else
        {
          task->next_queue = task;
          task->prev_queue = task;
          team->task_queue = task;
        }
      ++team->task_count;
      ++team->task_queued_count;
      gomp_team_barrier_set_task_pending (&team->barrier);
      /* Decide under the lock whether to wake a thread; the wake itself
         is issued after the lock has been released.  */
      do_wake = team->task_running_count + !parent->in_tied_task
                < team->nthreads;
      gomp_mutex_unlock (&team->task_lock);
      if (do_wake)
        gomp_team_barrier_wake (&team->barrier, 1);
    }
}
/* Create a task that runs FN.

   FN         task body.
   DATA       argument block handed to FN (directly, or through a copy).
   CPYFN      when non-NULL, called as cpyfn (copy, DATA) to build the
              task's private argument block; otherwise the deferred path
              uses memcpy and the undeferred path passes DATA itself.
   ARG_SIZE   size in bytes of the argument block.
   ARG_ALIGN  alignment used for the private copy; the rounding below
              masks with (ARG_ALIGN - 1), so it presumably has to be a
              power of two -- TODO confirm with callers.
   IF_CLAUSE  when false the task is run immediately by the caller.
   FLAGS      mask of GOMP_TASK_FLAG_* bits; UNTIED, PRIORITY, DEPEND and
              FINAL are the ones examined here.
   DEPEND     dependence array; only used when GOMP_TASK_FLAG_DEPEND is
              set.  depend[0] is read as the number of entries.
   PRIORITY   reset to 0 unless GOMP_TASK_FLAG_PRIORITY is set, and
              otherwise unused for now (see the FIXME below).

   The task is either executed right away on the calling thread
   (undeferred path) or heap-allocated and put on the team queue
   (deferred path).  */
void
GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
           long arg_size, long arg_align, bool if_clause, unsigned flags,
           void **depend, int priority)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

#ifdef HAVE_BROKEN_POSIX_SEMAPHORES
  /* If pthread_mutex_* is used for omp_*lock*, then each task must be
     tied to one thread all the time.  This means UNTIED tasks must be
     tied and if CPYFN is non-NULL IF(0) must be forced, as CPYFN might
     be running on different thread than FN.  */
  if (cpyfn)
    if_clause = false;
  flags &= ~GOMP_TASK_FLAG_UNTIED;
#endif

  /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
  if (team
      && (gomp_team_barrier_cancelled (&team->barrier)
          || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
    return;

  if ((flags & GOMP_TASK_FLAG_PRIORITY) == 0)
    priority = 0;
  /* FIXME, use priority.  */
  (void) priority;

  /* Run the task immediately when the if clause is false, there is no
     team, the current task is final, or team->task_count already exceeds
     64 per team thread.  */
  if (!if_clause || team == NULL
      || (thr->task && thr->task->final_task)
      || team->task_count > 64 * team->nthreads)
    {
      /* The undeferred task lives on this stack frame.  */
      struct gomp_task task;

      /* If there are depend clauses and earlier deferred sibling tasks
         with depend clauses, check if there isn't a dependency.  If there
         is, we need to wait for them.  There is no need to handle
         depend clauses for non-deferred tasks other than this, because
         the parent task is suspended until the child task finishes and thus
         it can't start further child tasks.  */
      if ((flags & GOMP_TASK_FLAG_DEPEND)
          && thr->task && thr->task->depend_hash)
        gomp_task_maybe_wait_for_dependencies (depend);

      gomp_init_task (&task, thr->task, gomp_icv (false));
      task.kind = GOMP_TASK_UNDEFERRED;
      /* Final-ness is inherited from the current task or requested
         through GOMP_TASK_FLAG_FINAL.  */
      task.final_task = (thr->task && thr->task->final_task)
                        || (flags & GOMP_TASK_FLAG_FINAL);
      if (thr->task)
        {
          task.in_tied_task = thr->task->in_tied_task;
          task.taskgroup = thr->task->taskgroup;
        }
      /* Make the new task current while FN (and CPYFN) run.  */
      thr->task = &task;
      if (__builtin_expect (cpyfn != NULL, 0))
        {
          /* Over-allocate by ARG_ALIGN - 1 bytes so the start of the copy
             can be rounded up to the requested alignment.  */
          char buf[arg_size + arg_align - 1];
          char *arg = (char *) (((uintptr_t) buf + arg_align - 1)
                                & ~(uintptr_t) (arg_align - 1));
          cpyfn (arg, data);
          fn (arg);
        }
      else
        fn (data);
      /* Access to "children" is normally done inside a task_lock
         mutex region, but the only way this particular task.children
         can be set is if this thread's task work function (fn)
         creates children.  So since the setter is *this* thread, we
         need no barriers here when testing for non-NULL.  We can have
         task.children set by the current thread then changed by a
         child thread, but seeing a stale non-NULL value is not a
         problem.  Once past the task_lock acquisition, this thread
         will see the real value of task.children.  */
      if (task.children != NULL)
        {
          gomp_mutex_lock (&team->task_lock);
          gomp_clear_parent (task.children);
          gomp_mutex_unlock (&team->task_lock);
        }
      gomp_end_task ();
    }
  else
    {
      struct gomp_task *task;
      struct gomp_task *parent = thr->task;
      struct gomp_taskgroup *taskgroup = parent->taskgroup;
      char *arg;
      bool do_wake;
      size_t depend_size = 0;

      /* A single allocation holds the task structure, then the depend
         entries, then the (aligned) argument block.  */
      if (flags & GOMP_TASK_FLAG_DEPEND)
        depend_size = ((uintptr_t) depend[0]
                       * sizeof (struct gomp_task_depend_entry));
      task = gomp_malloc (sizeof (*task) + depend_size
                          + arg_size + arg_align - 1);
      arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1)
                      & ~(uintptr_t) (arg_align - 1));
      gomp_init_task (task, parent, gomp_icv (false));
      task->kind = GOMP_TASK_UNDEFERRED;
      task->in_tied_task = parent->in_tied_task;
      task->taskgroup = taskgroup;
      /* The new task is made current only for the duration of the
         argument copy; the parent is restored right afterwards.  */
      thr->task = task;
      if (cpyfn)
        {
          cpyfn (arg, data);
          task->copy_ctors_done = true;
        }
      else
        memcpy (arg, data, arg_size);
      thr->task = parent;
      task->kind = GOMP_TASK_WAITING;
      task->fn = fn;
      task->fn_data = arg;
      /* NOTE(review): the shift by 1 only yields 0/1 if
         GOMP_TASK_FLAG_FINAL is bit 1 -- confirm against its definition.  */
      task->final_task = (flags & GOMP_TASK_FLAG_FINAL) >> 1;
      gomp_mutex_lock (&team->task_lock);
      /* If parallel or taskgroup has been cancelled, don't start new
         tasks.  */
      /* The task is dropped here only when CPYFN has not run
         (copy_ctors_done is false).  */
      if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier)
                             || (taskgroup && taskgroup->cancelled))
                            && !task->copy_ctors_done, 0))
        {
          gomp_mutex_unlock (&team->task_lock);
          gomp_finish_task (task);
          free (task);
          return;
        }
      if (taskgroup)
        taskgroup->num_children++;
      if (depend_size)
        {
          gomp_task_handle_depend (task, parent, depend);
          /* The task still waits on other tasks: it is not linked into
             any of the queues below.  */
          if (task->num_dependees)
            {
              gomp_mutex_unlock (&team->task_lock);
              return;
            }
        }
      /* Link the task into the parent's circular children list and make
         it the list head.  */
      if (parent->children)
        {
          task->next_child = parent->children;
          task->prev_child = parent->children->prev_child;
          task->next_child->prev_child = task;
          task->prev_child->next_child = task;
        }
      else
        {
          task->next_child = task;
          task->prev_child = task;
        }
      parent->children = task;
      if (taskgroup)
        {
          /* If applicable, place task into its taskgroup.  */
          if (taskgroup->children)
            {
              task->next_taskgroup = taskgroup->children;
              task->prev_taskgroup = taskgroup->children->prev_taskgroup;
              task->next_taskgroup->prev_taskgroup = task;
              task->prev_taskgroup->next_taskgroup = task;
            }
          else
            {
              task->next_taskgroup = task;
              task->prev_taskgroup = task;
            }
          taskgroup->children = task;
        }
      /* In the team queue the head pointer is only set when the queue
         was empty; otherwise the task is linked just before the current
         head and the head is left unchanged.  */
      if (team->task_queue)
        {
          task->next_queue = team->task_queue;
          task->prev_queue = team->task_queue->prev_queue;
          task->next_queue->prev_queue = task;
          task->prev_queue->next_queue = task;
        }
      else
        {
          task->next_queue = task;
          task->prev_queue = task;
          team->task_queue = task;
        }
      ++team->task_count;
      ++team->task_queued_count;
      gomp_team_barrier_set_task_pending (&team->barrier);
      /* Decide under the lock whether to wake a thread; the wake itself
         is issued after the lock has been released.  */
      do_wake = team->task_running_count + !parent->in_tied_task
                < team->nthreads;
      gomp_mutex_unlock (&team->task_lock);
      if (do_wake)
        gomp_team_barrier_wake (&team->barrier, 1);
    }
}
/* Split the iterations from START towards END in steps of STEP into
   chunks and run FN once per chunk, each time as its own task.

   FN         task body; called with an argument block whose first two
              TYPE slots hold the chunk's start and end values.
   DATA       template argument block.  It is copied per task (with CPYFN
              or memcpy); in the undeferred path without CPYFN its first
              two TYPE slots are overwritten in place before each call.
   CPYFN      optional copy function, called as cpyfn (copy, DATA).
   ARG_SIZE   size in bytes of the argument block.
   ARG_ALIGN  alignment for the copies; the rounding below masks with
              (ARG_ALIGN - 1), so it presumably has to be a power of
              two -- TODO confirm with callers.
   FLAGS      mask of GOMP_TASK_FLAG_* bits; IF, UNTIED, UP, GRAINSIZE,
              NOGROUP and FINAL are the ones examined here.
   NUM_TASKS  requested number of tasks, or the grain size when
              GOMP_TASK_FLAG_GRAINSIZE is set.  Zero (without GRAINSIZE)
              means one task per team thread, or 1 without a team.
   PRIORITY   task priority, clamped to gomp_max_task_priority_var.
   START, END, STEP  iteration bounds and stride.

   TYPE, UTYPE and TYPE_is_long are supplied by whatever includes this
   code; they are not defined in the visible source.

   Unless GOMP_TASK_FLAG_NOGROUP is set, the work is bracketed by
   GOMP_taskgroup_start / GOMP_taskgroup_end.  */
void
GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
               long arg_size, long arg_align, unsigned flags,
               unsigned long num_tasks, int priority,
               TYPE start, TYPE end, TYPE step)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

#ifdef HAVE_BROKEN_POSIX_SEMAPHORES
  /* If pthread_mutex_* is used for omp_*lock*, then each task must be
     tied to one thread all the time.  This means UNTIED tasks must be
     tied and if CPYFN is non-NULL IF(0) must be forced, as CPYFN might
     be running on different thread than FN.  */
  if (cpyfn)
    flags &= ~GOMP_TASK_FLAG_IF;
  flags &= ~GOMP_TASK_FLAG_UNTIED;
#endif

  /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
  if (team && gomp_team_barrier_cancelled (&team->barrier))
    return;

  /* Compute N, the number of iterations (division rounded up), and
     return early when the range is empty.  */
#ifdef TYPE_is_long
  TYPE s = step;
  if (step > 0)
    {
      if (start >= end)
        return;
      s--;
    }
  else
    {
      if (start <= end)
        return;
      s++;
    }
  UTYPE n = (end - start + s) / step;
#else
  UTYPE n;
  /* In this variant the direction comes from GOMP_TASK_FLAG_UP rather
     than from the sign of STEP.  */
  if (flags & GOMP_TASK_FLAG_UP)
    {
      if (start >= end)
        return;
      n = (end - start + step - 1) / step;
    }
  else
    {
      if (start <= end)
        return;
      n = (start - end - step - 1) / -step;
    }
#endif

  /* TASK_STEP is the distance START advances per task.  When the
     iterations do not divide evenly, TASK_STEP starts one STEP larger and
     is reduced by STEP after the task with index NFIRST.  NFIRST starts
     out as N.  */
  TYPE task_step = step;
  unsigned long nfirst = n;
  if (flags & GOMP_TASK_FLAG_GRAINSIZE)
    {
      /* With GRAINSIZE the NUM_TASKS argument carries the grain size;
         the task count is derived from it.  */
      unsigned long grainsize = num_tasks;
#ifdef TYPE_is_long
      num_tasks = n / grainsize;
#else
      UTYPE ndiv = n / grainsize;
      num_tasks = ndiv;
      /* The quotient did not fit in unsigned long: saturate.  */
      if (num_tasks != ndiv)
        num_tasks = ~0UL;
#endif
      if (num_tasks <= 1)
        {
          /* A single task covers the whole range.  */
          num_tasks = 1;
          task_step = end - start;
        }
      else if (num_tasks >= grainsize
#ifndef TYPE_is_long
               && num_tasks != ~0UL
#endif
              )
        {
          UTYPE mul = num_tasks * grainsize;
          task_step = (TYPE) grainsize * step;
          if (mul != n)
            {
              task_step += step;
              nfirst = n - mul - 1;
            }
        }
      else
        {
          UTYPE div = n / num_tasks;
          UTYPE mod = n % num_tasks;
          task_step = (TYPE) div * step;
          if (mod)
            {
              task_step += step;
              nfirst = mod - 1;
            }
        }
    }
  else
    {
      if (num_tasks == 0)
        num_tasks = team ? team->nthreads : 1;
      /* Never more tasks than iterations; in that case TASK_STEP stays
         at STEP.  */
      if (num_tasks >= n)
        num_tasks = n;
      else
        {
          UTYPE div = n / num_tasks;
          UTYPE mod = n % num_tasks;
          task_step = (TYPE) div * step;
          if (mod)
            {
              task_step += step;
              nfirst = mod - 1;
            }
        }
    }

  if (flags & GOMP_TASK_FLAG_NOGROUP)
    {
      /* No taskgroup of our own: only check whether the enclosing
         taskgroup has been cancelled.  */
      if (thr->task && thr->task->taskgroup && thr->task->taskgroup->cancelled)
        return;
    }
  else
    ialias_call (GOMP_taskgroup_start) ();

  if (priority > gomp_max_task_priority_var)
    priority = gomp_max_task_priority_var;

  /* Run all tasks immediately on this thread when the IF flag is clear,
     there is no team, the current task is final, or adding NUM_TASKS
     would push team->task_count above 64 per team thread.  */
  if ((flags & GOMP_TASK_FLAG_IF) == 0 || team == NULL
      || (thr->task && thr->task->final_task)
      || team->task_count + num_tasks > 64 * team->nthreads)
    {
      unsigned long i;
      if (__builtin_expect (cpyfn != NULL, 0))
        {
          /* With a copy function, all task structures and all argument
             copies are set up first, and the tasks are run in a second
             loop.  */
          struct gomp_task task[num_tasks];
          struct gomp_task *parent = thr->task;
          /* Round ARG_SIZE up to a multiple of ARG_ALIGN so consecutive
             copies in BUF stay aligned.  */
          arg_size = (arg_size + arg_align - 1) & ~(arg_align - 1);
          char buf[num_tasks * arg_size + arg_align - 1];
          char *arg = (char *) (((uintptr_t) buf + arg_align - 1)
                                & ~(uintptr_t) (arg_align - 1));
          char *orig_arg = arg;
          for (i = 0; i < num_tasks; i++)
            {
              gomp_init_task (&task[i], parent, gomp_icv (false));
              task[i].priority = priority;
              task[i].kind = GOMP_TASK_UNDEFERRED;
              task[i].final_task = (thr->task && thr->task->final_task)
                                   || (flags & GOMP_TASK_FLAG_FINAL);
              if (thr->task)
                {
                  task[i].in_tied_task = thr->task->in_tied_task;
                  task[i].taskgroup = thr->task->taskgroup;
                }
              thr->task = &task[i];
              cpyfn (arg, data);
              arg += arg_size;
            }
          arg = orig_arg;
          for (i = 0; i < num_tasks; i++)
            {
              thr->task = &task[i];
              /* Store this chunk's start and end in the first two TYPE
                 slots of the argument block.  */
              ((TYPE *)arg)[0] = start;
              start += task_step;
              ((TYPE *)arg)[1] = start;
              if (i == nfirst)
                task_step -= step;
              fn (arg);
              arg += arg_size;
              if (!priority_queue_empty_p (&task[i].children_queue,
                                           MEMMODEL_RELAXED))
                {
                  gomp_mutex_lock (&team->task_lock);
                  gomp_clear_parent (&task[i].children_queue);
                  gomp_mutex_unlock (&team->task_lock);
                }
              gomp_end_task ();
            }
        }
      else
        for (i = 0; i < num_tasks; i++)
          {
            struct gomp_task task;

            gomp_init_task (&task, thr->task, gomp_icv (false));
            task.priority = priority;
            task.kind = GOMP_TASK_UNDEFERRED;
            task.final_task = (thr->task && thr->task->final_task)
                              || (flags & GOMP_TASK_FLAG_FINAL);
            if (thr->task)
              {
                task.in_tied_task = thr->task->in_tied_task;
                task.taskgroup = thr->task->taskgroup;
              }
            thr->task = &task;
            /* No copy here: the chunk bounds are written straight into
               DATA before each call.  */
            ((TYPE *)data)[0] = start;
            start += task_step;
            ((TYPE *)data)[1] = start;
            if (i == nfirst)
              task_step -= step;
            fn (data);
            if (!priority_queue_empty_p (&task.children_queue,
                                         MEMMODEL_RELAXED))
              {
                gomp_mutex_lock (&team->task_lock);
                gomp_clear_parent (&task.children_queue);
                gomp_mutex_unlock (&team->task_lock);
              }
            gomp_end_task ();
          }
    }
  else
    {
      /* Deferred path: build every task first (outside the lock), then
         enqueue them all under team->task_lock.  */
      struct gomp_task *tasks[num_tasks];
      struct gomp_task *parent = thr->task;
      struct gomp_taskgroup *taskgroup = parent->taskgroup;
      char *arg;
      int do_wake;
      unsigned long i;

      for (i = 0; i < num_tasks; i++)
        {
          /* One allocation holds the task structure followed by its
             aligned argument block.  */
          struct gomp_task *task
            = gomp_malloc (sizeof (*task) + arg_size + arg_align - 1);
          tasks[i] = task;
          arg = (char *) (((uintptr_t) (task + 1) + arg_align - 1)
                          & ~(uintptr_t) (arg_align - 1));
          gomp_init_task (task, parent, gomp_icv (false));
          task->priority = priority;
          task->kind = GOMP_TASK_UNDEFERRED;
          task->in_tied_task = parent->in_tied_task;
          task->taskgroup = taskgroup;
          /* The new task is current only while its argument block is
             being built; the parent is restored below.  */
          thr->task = task;
          if (cpyfn)
            {
              cpyfn (arg, data);
              task->copy_ctors_done = true;
            }
          else
            memcpy (arg, data, arg_size);
          ((TYPE *)arg)[0] = start;
          start += task_step;
          ((TYPE *)arg)[1] = start;
          if (i == nfirst)
            task_step -= step;
          thr->task = parent;
          task->kind = GOMP_TASK_WAITING;
          task->fn = fn;
          task->fn_data = arg;
          /* NOTE(review): the shift by 1 only yields 0/1 if
             GOMP_TASK_FLAG_FINAL is bit 1 -- confirm against its
             definition.  */
          task->final_task = (flags & GOMP_TASK_FLAG_FINAL) >> 1;
        }
      gomp_mutex_lock (&team->task_lock);
      /* If parallel or taskgroup has been cancelled, don't start new
         tasks.  */
      /* The tasks are dropped here only when no CPYFN was given.  */
      if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier)
                             || (taskgroup && taskgroup->cancelled))
                            && cpyfn == NULL, 0))
        {
          gomp_mutex_unlock (&team->task_lock);
          for (i = 0; i < num_tasks; i++)
            {
              gomp_finish_task (tasks[i]);
              free (tasks[i]);
            }
          if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0)
            ialias_call (GOMP_taskgroup_end) ();
          return;
        }
      if (taskgroup)
        taskgroup->num_children += num_tasks;
      for (i = 0; i < num_tasks; i++)
        {
          struct gomp_task *task = tasks[i];
          /* Each task goes into the parent's children queue, the
             taskgroup queue (if any) and the team queue.  */
          priority_queue_insert (PQ_CHILDREN, &parent->children_queue,
                                 task, priority,
                                 PRIORITY_INSERT_BEGIN,
                                 /*last_parent_depends_on=*/false,
                                 task->parent_depends_on);
          if (taskgroup)
            priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
                                   task, priority, PRIORITY_INSERT_BEGIN,
                                   /*last_parent_depends_on=*/false,
                                   task->parent_depends_on);
          priority_queue_insert (PQ_TEAM, &team->task_queue, task, priority,
                                 PRIORITY_INSERT_END,
                                 /*last_parent_depends_on=*/false,
                                 task->parent_depends_on);
          ++team->task_count;
          ++team->task_queued_count;
        }
      gomp_team_barrier_set_task_pending (&team->barrier);
      /* DO_WAKE is the number of threads to wake: the team threads not
         accounted for by task_running_count (and by this thread when the
         parent is not in a tied task), capped at NUM_TASKS.  */
      if (team->task_running_count + !parent->in_tied_task
          < team->nthreads)
        {
          do_wake = team->nthreads - team->task_running_count
                    - !parent->in_tied_task;
          if ((unsigned long) do_wake > num_tasks)
            do_wake = num_tasks;
        }
      else
        do_wake = 0;
      gomp_mutex_unlock (&team->task_lock);
      /* The wake is issued after the lock has been released.  */
      if (do_wake)
        gomp_team_barrier_wake (&team->barrier, do_wake);
    }
  if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0)
    ialias_call (GOMP_taskgroup_end) ();
}