/* Fold OpenCL per-event profiling statistics (cycle counts and power
 * consumption) into the profiling_info of the currently executing task.
 * Each counter is only available when the OpenCL implementation defines the
 * corresponding vendor-specific CL_PROFILING_* query, hence the #ifdef
 * blocks.  Always returns 0.
 *
 * NOTE(review): starpu_task_get_current() is dereferenced without a NULL
 * check — presumably this is only ever called from within a codelet where a
 * current task exists; confirm against callers. */
int starpu_opencl_collect_stats(cl_event event STARPU_ATTRIBUTE_UNUSED)
{
#if defined(CL_PROFILING_CLOCK_CYCLE_COUNT)||defined(CL_PROFILING_STALL_CYCLE_COUNT)||defined(CL_PROFILING_POWER_CONSUMED)
	struct starpu_task *task = starpu_task_get_current();
	struct starpu_profiling_task_info *info = task->profiling_info;
#endif

#ifdef CL_PROFILING_CLOCK_CYCLE_COUNT
	/* Accumulate the number of cycles spent executing on the device. */
	if (starpu_profiling_status_get() && info)
	{
		cl_int err;
		unsigned int clock_cycle_count;
		size_t size;
		err = clGetEventProfilingInfo(event, CL_PROFILING_CLOCK_CYCLE_COUNT, sizeof(clock_cycle_count), &clock_cycle_count, &size);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
		STARPU_ASSERT(size == sizeof(clock_cycle_count));
		info->used_cycles += clock_cycle_count;
	}
#endif

#ifdef CL_PROFILING_STALL_CYCLE_COUNT
	/* Accumulate the number of cycles the device spent stalled. */
	if (starpu_profiling_status_get() && info)
	{
		cl_int err;
		unsigned int stall_cycle_count;
		size_t size;
		err = clGetEventProfilingInfo(event, CL_PROFILING_STALL_CYCLE_COUNT, sizeof(stall_cycle_count), &stall_cycle_count, &size);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
		STARPU_ASSERT(size == sizeof(stall_cycle_count));
		info->stall_cycles += stall_cycle_count;
	}
#endif

#ifdef CL_PROFILING_POWER_CONSUMED
	/* Power is also collected when the codelet's power model is
	 * benchmarking, even if profiling is globally disabled. */
	if (info && (starpu_profiling_status_get() || (task->cl && task->cl->power_model && task->cl->power_model->benchmarking)))
	{
		cl_int err;
		double power_consumed;
		size_t size;
		err = clGetEventProfilingInfo(event, CL_PROFILING_POWER_CONSUMED, sizeof(power_consumed), &power_consumed, &size);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
		STARPU_ASSERT(size == sizeof(power_consumed));
		info->power_consumed += power_consumed;
	}
#endif

	return 0;
}
/* Print a per-worker statistics summary on stderr when the
 * STARPU_WORKER_STATS environment variable is set to a non-zero value.
 * With profiling enabled, this includes per-worker timing, cycle counts and
 * power consumption, plus an estimate of idle consumption when
 * STARPU_IDLE_POWER (in Watt) is set; otherwise only task counts are shown.
 *
 * Fix: the cycle counters are 64-bit (see
 * _starpu_worker_update_profiling_info_executing), but were printed with
 * "%lu", which only matches uint64_t on LP64 platforms — a printf
 * format/argument mismatch is undefined behavior.  They are now cast to
 * unsigned long long and printed with "%llu". */
void starpu_profiling_worker_helper_display_summary(void)
{
	const char *stats;
	double sum_consumed = 0.;
	int profiling = starpu_profiling_status_get();
	double overall_time = 0;
	int workerid;
	int worker_cnt = starpu_worker_get_count();

	/* Display nothing unless explicitly requested. */
	if (!((stats = getenv("STARPU_WORKER_STATS")) && atoi(stats)))
		return;

	fprintf(stderr, "\nWorker statistics:\n");
	fprintf(stderr, "******************\n");

	for (workerid = 0; workerid < worker_cnt; workerid++)
	{
		struct starpu_profiling_worker_info info;
		starpu_profiling_worker_get_info(workerid, &info);
		char name[64];
		starpu_worker_get_name(workerid, name, sizeof(name));

		if (profiling)
		{
			double total_time = starpu_timing_timespec_to_us(&info.total_time) / 1000.;
			double executing_time = starpu_timing_timespec_to_us(&info.executing_time) / 1000.;
			double sleeping_time = starpu_timing_timespec_to_us(&info.sleeping_time) / 1000.;
			/* Track the longest per-worker total time as the overall
			 * runtime, used below for the idle-power estimate. */
			if (total_time > overall_time)
				overall_time = total_time;

			fprintf(stderr, "%-32s\n", name);
			fprintf(stderr, "\t%d task(s)\n\ttotal: %.2lf ms executing: %.2lf ms sleeping: %.2lf ms overhead %.2lf ms\n", info.executed_tasks, total_time, executing_time, sleeping_time, total_time - executing_time - sleeping_time);
			if (info.used_cycles || info.stall_cycles)
				/* Cast to unsigned long long: the counters are 64-bit
				 * and "%lu" would be wrong on ILP32/LLP64 targets. */
				fprintf(stderr, "\t%llu Mcy %llu Mcy stall\n", (unsigned long long)(info.used_cycles/1000000), (unsigned long long)(info.stall_cycles/1000000));
			if (info.power_consumed)
				fprintf(stderr, "\t%f J consumed\n", info.power_consumed);
		}
		else
		{
			fprintf(stderr, "\t%-32s\t%d task(s)\n", name, info.executed_tasks);
		}

		sum_consumed += info.power_consumed;
	}

	if (profiling)
	{
		const char *strval_idle_power = getenv("STARPU_IDLE_POWER");
		if (strval_idle_power)
		{
			double idle_power = atof(strval_idle_power); /* Watt */
			double idle_consumption = idle_power * overall_time / 1000.; /* J */

			fprintf(stderr, "Idle consumption: %.2lf J\n", idle_consumption);
			sum_consumed += idle_consumption;
		}
	}

	if (profiling && sum_consumed)
		fprintf(stderr, "Total consumption: %.2lf J\n", sum_consumed);
}
/* Record the timestamp at which worker <workerid> started executing a task.
 * No-op when profiling is disabled.  The per-worker mutex protects the
 * shared bookkeeping arrays. */
void _starpu_worker_register_executing_start_date(int workerid, struct timespec *executing_start)
{
	if (!starpu_profiling_status_get())
		return;

	STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
	worker_registered_executing_start[workerid] = 1;
	executing_start_date[workerid] = *executing_start;
	STARPU_PTHREAD_MUTEX_UNLOCK(&worker_info_mutex[workerid]);
}
/* Stamp the end of the push phase into the task's profiling info.  Does
 * nothing when profiling is disabled or when the task carries no
 * profiling_info structure (e.g. created before profiling was enabled). */
void _starpu_profiling_set_task_push_end_time(struct starpu_task *task)
{
	if (starpu_profiling_status_get())
	{
		struct starpu_profiling_task_info *info = task->profiling_info;
		if (info)
			_starpu_clock_gettime(&info->push_end_time);
	}
}
void _starpu_worker_restart_sleeping(int workerid) { if (starpu_profiling_status_get()) { struct timespec sleep_start_time; _starpu_clock_gettime(&sleep_start_time); STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]); worker_registered_sleeping_start[workerid] = 1; memcpy(&sleeping_start_date[workerid], &sleep_start_time, sizeof(struct timespec)); STARPU_PTHREAD_MUTEX_UNLOCK(&worker_info_mutex[workerid]); } }
/* Allocate a zero-initialized profiling_info structure for <task> when one
 * will be needed: either profiling is globally enabled, or the codelet has a
 * power model that is benchmarking / being calibrated (in which case room
 * for the power consumption is required even without profiling).
 * Returns NULL when no profiling info is needed; aborts on allocation
 * failure.  The caller owns the returned memory. */
struct starpu_profiling_task_info *_starpu_allocate_profiling_info_if_needed(struct starpu_task *task)
{
	int needed = starpu_profiling_status_get();
	if (!needed && task->cl && task->cl->power_model)
		needed = task->cl->power_model->benchmarking || _starpu_get_calibrate_flag();

	if (!needed)
		return NULL;

	struct starpu_profiling_task_info *info = calloc(1, sizeof(*info));
	STARPU_ASSERT(info);
	return info;
}
int starpu_profiling_worker_get_info(int workerid, struct starpu_profiling_worker_info *info) { if (!starpu_profiling_status_get()) { /* Not thread safe, shouldn't be too much a problem */ info->executed_tasks = worker_info[workerid].executed_tasks; } STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]); if (info) { /* The total time is computed in a lazy fashion */ struct timespec now; _starpu_clock_gettime(&now); /* In case some worker is currently sleeping, we take into * account the time spent since it registered. */ if (worker_registered_sleeping_start[workerid]) { struct timespec sleeping_time; starpu_timespec_sub(&now, &sleeping_start_date[workerid], &sleeping_time); starpu_timespec_accumulate(&worker_info[workerid].sleeping_time, &sleeping_time); } if (worker_registered_executing_start[workerid]) { struct timespec executing_time; starpu_timespec_sub(&now, &executing_start_date[workerid], &executing_time); starpu_timespec_accumulate(&worker_info[workerid].executing_time, &executing_time); } /* total_time = now - start_time */ starpu_timespec_sub(&now, &worker_info[workerid].start_time, &worker_info[workerid].total_time); memcpy(info, &worker_info[workerid], sizeof(struct starpu_profiling_worker_info)); } _starpu_worker_reset_profiling_info_with_lock(workerid); STARPU_PTHREAD_MUTEX_UNLOCK(&worker_info_mutex[workerid]); return 0; }
void _starpu_worker_update_profiling_info_executing(int workerid, struct timespec *executing_time, int executed_tasks, uint64_t used_cycles, uint64_t stall_cycles, double power_consumed) { if (starpu_profiling_status_get()) { STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]); if (executing_time) starpu_timespec_accumulate(&worker_info[workerid].executing_time, executing_time); worker_info[workerid].used_cycles += used_cycles; worker_info[workerid].stall_cycles += stall_cycles; worker_info[workerid].power_consumed += power_consumed; worker_info[workerid].executed_tasks += executed_tasks; STARPU_PTHREAD_MUTEX_UNLOCK(&worker_info_mutex[workerid]); } else /* Not thread safe, shouldn't be too much a problem */ worker_info[workerid].executed_tasks += executed_tasks; }
/* Workers may block when there is no work to do at all. We assume that the * mutex is hold when that function is called. */ void _starpu_block_worker(int workerid, pthread_cond_t *cond, pthread_mutex_t *mutex) { struct timespec start_time, end_time; STARPU_TRACE_WORKER_SLEEP_START _starpu_worker_set_status(workerid, STATUS_SLEEPING); starpu_clock_gettime(&start_time); _starpu_worker_register_sleeping_start_date(workerid, &start_time); PTHREAD_COND_WAIT(cond, mutex); _starpu_worker_set_status(workerid, STATUS_UNKNOWN); STARPU_TRACE_WORKER_SLEEP_END starpu_clock_gettime(&end_time); int profiling = starpu_profiling_status_get(); if (profiling) { struct timespec sleeping_time; starpu_timespec_sub(&end_time, &start_time, &sleeping_time); _starpu_worker_update_profiling_info_sleeping(workerid, &start_time, &end_time); } }
/* Pop the next task for <worker>.  Tries the worker's local queue first,
 * then the scheduling policies of the contexts the worker belongs to.
 * Returns NULL when no task is available.  For tasks using multiformat
 * handles, the required conversion tasks are created and submitted first,
 * and the original task is re-queued locally (the `goto pick` loop).
 *
 * NOTE(review): the caller appears to hold worker->sched_mutex — it is
 * unlocked/relocked around conversion-task submission below; confirm
 * against callers. */
struct starpu_task *_starpu_pop_task(struct _starpu_worker *worker)
{
	struct starpu_task *task;
	int worker_id;
	unsigned node;

	/* We can't tell in advance which task will be picked up, so we measure
	 * a timestamp, and will attribute it afterwards to the task. */
	int profiling = starpu_profiling_status_get();
	struct timespec pop_start_time;
	if (profiling)
		_starpu_clock_gettime(&pop_start_time);

pick:
	/* perhaps there is some local task to be executed first */
	task = _starpu_pop_local_task(worker);

	/* get tasks from the stacks of the strategy */
	if(!task)
	{
		struct _starpu_sched_ctx *sched_ctx ;
#ifndef STARPU_NON_BLOCKING_DRIVERS
		/* With blocking drivers, keep polling the contexts until a task
		 * shows up, but give up once every context has been visited
		 * twice without success (been_here tracking below). */
		int been_here[STARPU_NMAX_SCHED_CTXS];
		int i;
		for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
			been_here[i] = 0;

		while(!task)
#endif
		{
			/* Select which scheduling context to pop from. */
			if(worker->nsched_ctxs == 1)
				sched_ctx = _starpu_get_initial_sched_ctx();
			else
			{
				while(1)
				{
					sched_ctx = _get_next_sched_ctx_to_pop_into(worker);

					/* Skip contexts this worker was removed from
					 * when it shares their task lists. */
					if(worker->removed_from_ctx[sched_ctx->id] == 1 && worker->shares_tasks_lists[sched_ctx->id] == 1)
					{
						_starpu_worker_gets_out_of_ctx(sched_ctx->id, worker);
						worker->removed_from_ctx[sched_ctx->id] = 0;
						sched_ctx = NULL;
					}
					else
						break;
				}
			}

			/* Ask the context's scheduling policy for a task. */
			if(sched_ctx && sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
			{
				if (sched_ctx->sched_policy && sched_ctx->sched_policy->pop_task)
				{
					task = sched_ctx->sched_policy->pop_task(sched_ctx->id);
					_starpu_pop_task_end(task);
				}
			}

			if(!task)
			{
				/* it doesn't matter if it shares tasks list or not in the scheduler, if it does not have any task to pop just get it out of here */
				/* however if it shares a task list it will be removed as soon as he finishes this job (in handle_job_termination) */
				if(worker->removed_from_ctx[sched_ctx->id])
				{
					_starpu_worker_gets_out_of_ctx(sched_ctx->id, worker);
					worker->removed_from_ctx[sched_ctx->id] = 0;
				}
#ifdef STARPU_USE_SC_HYPERVISOR
				/* Report an idle cycle to the hypervisor so it can
				 * rebalance resources between contexts. */
				if(worker->pop_ctx_priority)
				{
					struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx->perf_counters;
					if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_idle_cycle && _starpu_sched_ctx_allow_hypervisor(sched_ctx->id))
					{
//						_STARPU_TRACE_HYPERVISOR_BEGIN();
						perf_counters->notify_idle_cycle(sched_ctx->id, worker->workerid, 1.0);
//						_STARPU_TRACE_HYPERVISOR_END();
					}
				}
#endif //STARPU_USE_SC_HYPERVISOR
#ifndef STARPU_NON_BLOCKING_DRIVERS
				/* Stop retrying once this context has already been
				 * visited without success. */
				if(been_here[sched_ctx->id] || worker->nsched_ctxs == 1)
					break;

				been_here[sched_ctx->id] = 1;
#endif
			}
		}
	}

	if (!task)
	{
		/* Nothing to do: start (or continue) the idle-time measurement. */
		idle_start[worker->workerid] = starpu_timing_now();
		return NULL;
	}

	/* A task was found: close the current idle period, if any. */
	if(idle_start[worker->workerid] != 0.0)
	{
		double idle_end = starpu_timing_now();
		idle[worker->workerid] += (idle_end - idle_start[worker->workerid]);
		idle_start[worker->workerid] = 0.0;
	}

#ifdef STARPU_USE_SC_HYPERVISOR
	/* Tell the hypervisor a task was popped for this context. */
	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
	struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx->perf_counters;

	if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_poped_task && _starpu_sched_ctx_allow_hypervisor(sched_ctx->id))
	{
//		_STARPU_TRACE_HYPERVISOR_BEGIN();
		perf_counters->notify_poped_task(task->sched_ctx, worker->workerid);
//		_STARPU_TRACE_HYPERVISOR_END();
	}
#endif //STARPU_USE_SC_HYPERVISOR

	/* Make sure we do not bother with all the multiformat-specific code if
	 * it is not necessary. */
	if (!_starpu_task_uses_multiformat_handles(task))
		goto profiling;

	/* This is either a conversion task, or a regular task for which the
	 * conversion tasks have already been created and submitted */
	if (task->mf_skip)
		goto profiling;

	/*
	 * This worker may not be able to execute this task. In this case, we
	 * should return the task anyway. It will be pushed back almost immediatly.
	 * This way, we avoid computing and executing the conversions tasks.
	 * Here, we do not care about what implementation is used.
	 */
	worker_id = starpu_worker_get_id();
	if (!starpu_worker_can_execute_task_first_impl(worker_id, task, NULL))
		return task;

	node = starpu_worker_get_memory_node(worker_id);

	/*
	 * We do have a task that uses multiformat handles. Let's create the
	 * required conversion tasks.
	 */
	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
	unsigned i;
	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
	for (i = 0; i < nbuffers; i++)
	{
		struct starpu_task *conversion_task;
		starpu_data_handle_t handle;

		handle = STARPU_TASK_GET_HANDLE(task, i);

		if (!_starpu_handle_needs_conversion_task(handle, node))
			continue;

		conversion_task = _starpu_create_conversion_task(handle, node);
		conversion_task->mf_skip = 1;
		conversion_task->execute_on_a_specific_worker = 1;
		conversion_task->workerid = worker_id;
		/*
		 * Next tasks will need to know where these handles have gone.
		 */
		handle->mf_node = node;
		_starpu_task_submit_conversion_task(conversion_task, worker_id);
	}

	task->mf_skip = 1;
	/* Re-queue the original task locally so it runs after its conversion
	 * tasks, then go pick the next task. */
	starpu_task_list_push_back(&worker->local_tasks, task);
	STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
	goto pick;

profiling:
	if (profiling)
	{
		struct starpu_profiling_task_info *profiling_info;
		profiling_info = task->profiling_info;

		/* The task may have been created before profiling was enabled,
		 * so we check if the profiling_info structure is available
		 * even though we already tested if profiling is enabled. */
		if (profiling_info)
		{
			memcpy(&profiling_info->pop_start_time, &pop_start_time, sizeof(struct timespec));
			_starpu_clock_gettime(&profiling_info->pop_end_time);
		}
	}

	if(task->prologue_callback_pop_func)
		task->prologue_callback_pop_func(task->prologue_callback_pop_arg);

	return task;
}