void starpu_profiling_worker_helper_display_summary(void)
{
	const char *stats;
	double sum_consumed = 0.;
	int profiling = starpu_profiling_status_get();
	double overall_time = 0;
	int workerid;
	int worker_cnt = starpu_worker_get_count();

	if (!((stats = getenv("STARPU_WORKER_STATS")) && atoi(stats))) return;

	fprintf(stderr, "\nWorker statistics:\n");
	fprintf(stderr, "******************\n");

	for (workerid = 0; workerid < worker_cnt; workerid++)
	{
		struct starpu_profiling_worker_info info;
		starpu_profiling_worker_get_info(workerid, &info);
		char name[64];

		starpu_worker_get_name(workerid, name, sizeof(name));

		if (profiling)
		{
			double total_time = starpu_timing_timespec_to_us(&info.total_time) / 1000.;
			double executing_time = starpu_timing_timespec_to_us(&info.executing_time) / 1000.;
			double sleeping_time = starpu_timing_timespec_to_us(&info.sleeping_time) / 1000.;
			if (total_time > overall_time)
				overall_time = total_time;

			fprintf(stderr, "%-32s\n", name);
			fprintf(stderr, "\t%d task(s)\n\ttotal: %.2lf ms executing: %.2lf ms sleeping: %.2lf ms overhead %.2lf ms\n",
				info.executed_tasks,
				total_time, executing_time, sleeping_time,
				total_time - executing_time - sleeping_time);
			if (info.used_cycles || info.stall_cycles)
				fprintf(stderr, "\t%lu Mcy %lu Mcy stall\n",
					info.used_cycles/1000000, info.stall_cycles/1000000);
			if (info.power_consumed)
				fprintf(stderr, "\t%f J consumed\n", info.power_consumed);
		}
		else
		{
			fprintf(stderr, "\t%-32s\t%d task(s)\n", name, info.executed_tasks);
		}

		sum_consumed += info.power_consumed;
	}

	if (profiling)
	{
		const char *strval_idle_power = getenv("STARPU_IDLE_POWER");
		if (strval_idle_power)
		{
			double idle_power = atof(strval_idle_power); /* Watt */
			double idle_consumption = idle_power * overall_time / 1000.; /* J */

			fprintf(stderr, "Idle consumption: %.2lf J\n", idle_consumption);
			sum_consumed += idle_consumption;
		}
	}
	if (profiling && sum_consumed)
		fprintf(stderr, "Total consumption: %.2lf J\n", sum_consumed);
}
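/* Usage sketch (an illustration, not part of the file above): the summary is
 * printed at starpu_shutdown() only when STARPU_WORKER_STATS is set; the
 * per-worker timing lines additionally need profiling to be enabled, e.g.
 * through the STARPU_PROFILING environment variable. */
#include <stdlib.h>
#include <starpu.h>

int main(void)
{
	setenv("STARPU_WORKER_STATS", "1", 1);	/* ask for the worker summary */
	setenv("STARPU_PROFILING", "1", 1);	/* collect per-worker timings */
	if (starpu_init(NULL) != 0)
		return 77;
	/* ... submit some tasks ... */
	starpu_shutdown();			/* triggers the display above */
	return 0;
}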
int main(int argc, char **argv)
{
	starpu_init(NULL);

	starpu_data_malloc_pinned_if_possible((void **)&v, VECTORSIZE*sizeof(unsigned));
	starpu_vector_data_register(&v_handle, 0, (uintptr_t)v, VECTORSIZE, sizeof(unsigned));

	unsigned nworker = starpu_worker_get_count();

	cnt = nworker*N;

	unsigned iter, worker;
	for (iter = 0; iter < N; iter++)
	{
		for (worker = 0; worker < nworker; worker++)
		{
			/* synchronous prefetch */
			unsigned node = starpu_worker_get_memory_node(worker);
			starpu_data_prefetch_on_node(v_handle, node, 0);

			/* execute a task */
			struct starpu_task *task = starpu_task_create();
			task->cl = &cl;
			task->buffers[0].handle = v_handle;
			task->buffers[0].mode = select_random_mode();
			task->callback_func = callback;
			task->callback_arg = NULL;
			task->synchronous = 1;

			/* starpu_task_submit takes the task alone */
			int ret = starpu_task_submit(task);
			if (ret == -ENODEV)
				goto enodev;
		}
	}

	pthread_mutex_lock(&mutex);
	/* wait in a loop to be robust against spurious wakeups */
	while (!finished)
		pthread_cond_wait(&cond, &mutex);
	pthread_mutex_unlock(&mutex);

	starpu_shutdown();

	return 0;

enodev:
	fprintf(stderr, "WARNING: No one can execute this task\n");
	/* yes, we do not perform the computation but we did detect that no one
	 * could perform the kernel, so this is not an error from StarPU */
	return 0;
}
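/* Hypothetical sketch of the globals and helpers the test above relies on
 * (the real test defines its own versions elsewhere in the file): a counter
 * protected by a mutex/cond pair, and a helper picking a random access mode.
 * The type name enum starpu_data_access_mode is the current spelling; older
 * releases matching the task->buffers[] API called it starpu_access_mode. */
#include <pthread.h>
#include <stdlib.h>
#include <starpu.h>

#define VECTORSIZE 1024
#define N 16

static unsigned *v;
static starpu_data_handle_t v_handle;
static unsigned cnt, finished;
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

static enum starpu_data_access_mode select_random_mode(void)
{
	switch (rand() % 3)
	{
	case 0: return STARPU_R;
	case 1: return STARPU_W;
	default: return STARPU_RW;
	}
}

static void callback(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&mutex);
	if (--cnt == 0)		/* last task: wake the main thread up */
	{
		finished = 1;
		pthread_cond_signal(&cond);
	}
	pthread_mutex_unlock(&mutex);
}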
static void initialize_prio_center_policy(unsigned sched_ctx_id)
{
	struct starpu_sched_tree *t;
	struct starpu_sched_component *eager_component;

	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_LIST);

	t = starpu_sched_tree_create(sched_ctx_id);
	t->root = starpu_sched_component_prio_create(t, NULL);
	eager_component = starpu_sched_component_eager_create(t, NULL);

	starpu_sched_component_connect(t->root, eager_component);

	unsigned i;
	for (i = 0; i < starpu_worker_get_count() + starpu_combined_worker_get_count(); i++)
		starpu_sched_component_connect(eager_component,
					       starpu_sched_component_worker_get(sched_ctx_id, i));

	starpu_sched_tree_update_workers(t);
	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)t);
}
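/* Hedged sketch of how such an initializer is typically exposed as a
 * scheduling policy; the generic starpu_sched_tree_* callbacks below are the
 * usual companions in StarPU's modular-scheduler interface, and the policy
 * name chosen here is an assumption. */
struct starpu_sched_policy prio_center_policy =
{
	.init_sched = initialize_prio_center_policy,
	.deinit_sched = starpu_sched_tree_deinitialize,
	.add_workers = starpu_sched_tree_add_workers,
	.remove_workers = starpu_sched_tree_remove_workers,
	.push_task = starpu_sched_tree_push_task,
	.pop_task = starpu_sched_tree_pop_task,
	.policy_name = "prio-center",
	.policy_description = "prio component above an eager component",
};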
void hard_coded_handle_idle_cycle(unsigned sched_ctx, int worker)
{
	unsigned criteria = sc_hypervisor_get_resize_criteria();
	if (criteria != SC_NOTHING) // && criteria == SC_SPEED)
	{
		int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
		if (ret != EBUSY)
		{
//			if (sc_hypervisor_criteria_fulfilled(sched_ctx, worker))
//			if (sc_hypervisor_check_speed_gap_btw_ctxs(NULL, -1, NULL, -1))
			if (sc_hypervisor_check_idle(sched_ctx, worker))
			{
				if (hard_coded_worker_belong_to_other_sched_ctx(sched_ctx, worker))
					sc_hypervisor_remove_workers_from_sched_ctx(&worker, 1, sched_ctx, 1);
				else
				{
//					sc_hypervisor_policy_resize_to_unknown_receiver(sched_ctx, 0);
					unsigned *sched_ctxs = sc_hypervisor_get_sched_ctxs();
					(void)sched_ctxs; /* only used by the commented-out LP placement call */
					int ns = sc_hypervisor_get_nsched_ctxs();
					int nworkers = (int)starpu_worker_get_count();
					struct types_of_workers *tw = sc_hypervisor_get_types_of_workers(NULL, nworkers);
					int nw = tw->nw;

					/* hard-coded distribution of workers over the contexts */
					double w_in_s[ns][nw];
					w_in_s[0][0] = 1;
					w_in_s[0][1] = 3;
					w_in_s[1][0] = 8;
					w_in_s[1][1] = 0;

//					sc_hypervisor_lp_place_resources_in_ctx(ns, nw, w_in_s, sched_ctxs, NULL, 1, tw);
					sc_hypervisor_lp_distribute_floating_no_resources_in_ctxs(sc_hypervisor_get_sched_ctxs(), ns, tw->nw, w_in_s, NULL, nworkers, tw);
				}
			}
			starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
		}
	}
}
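/* Hedged usage sketch: a hand-coded handler like the one above is wired into
 * the hypervisor through its policy structure. The field names and the
 * sc_hypervisor_init()/starpu_sched_ctx_set_perf_counters() pairing below are
 * assumptions based on the sc_hypervisor API, not taken from the file above. */
struct sc_hypervisor_policy hard_coded_policy =
{
	.name = "hard_coded",
	.custom = 1,
	.handle_idle_cycle = hard_coded_handle_idle_cycle,
};

static void start_hard_coded_hypervisor(unsigned sched_ctx)
{
	/* starting the hypervisor hands its performance counters to each
	 * scheduling context that should be monitored */
	void *counters = sc_hypervisor_init(&hard_coded_policy);
	starpu_sched_ctx_set_perf_counters(sched_ctx, counters);
}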
/* the four kernel types are reported with the same layout, so a single
 * helper factors the four identical display loops */
static void display_stat_count(const char *label, unsigned *count_per_worker, unsigned count_total, unsigned nworkers)
{
	unsigned worker;

	fprintf(stderr, "\t%s\n", label);
	for (worker = 0; worker < nworkers; worker++)
	{
		if (count_total_per_worker[worker])
		{
			char name[32];
			starpu_worker_get_name(worker, name, 32);
			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name,
				count_per_worker[worker], count_total,
				(100.0*count_per_worker[worker])/count_total);
		}
	}
}

void display_stat_heat(void)
{
	unsigned nworkers = starpu_worker_get_count();
	fprintf(stderr, "STATS : \n");

	unsigned worker;
	for (worker = 0; worker < nworkers; worker++)
	{
		count_total_per_worker[worker] = count_11_per_worker[worker]
		                               + count_12_per_worker[worker]
		                               + count_21_per_worker[worker]
		                               + count_22_per_worker[worker];

		count_11_total += count_11_per_worker[worker];
		count_12_total += count_12_per_worker[worker];
		count_21_total += count_21_per_worker[worker];
		count_22_total += count_22_per_worker[worker];
	}

	display_stat_count("11 (diagonal block LU)", count_11_per_worker, count_11_total, nworkers);
	display_stat_count("12 (TRSM)", count_12_per_worker, count_12_total, nworkers);
	display_stat_count("21 (TRSM)", count_21_per_worker, count_21_total, nworkers);
	display_stat_count("22 (SGEMM)", count_22_per_worker, count_22_total, nworkers);
}
void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gathering_node)
{
	unsigned child;
	unsigned worker;
	unsigned nworkers = starpu_worker_get_count();
	unsigned node;
	unsigned sizes[root_handle->nchildren];

	_STARPU_TRACE_START_UNPARTITION(root_handle, gathering_node);
	_starpu_spin_lock(&root_handle->header_lock);

	STARPU_ASSERT_MSG(root_handle->nchildren != 0, "data %p is not partitioned, can not unpartition it", root_handle);

	/* first take all the children lock (in order !) */
	for (child = 0; child < root_handle->nchildren; child++)
	{
		starpu_data_handle_t child_handle = starpu_data_get_child(root_handle, child);

		/* make sure the intermediate children are unpartitioned as well */
		if (child_handle->nchildren > 0)
			starpu_data_unpartition(child_handle, gathering_node);

		/* If this is a multiformat handle, we must convert the data now */
#ifdef STARPU_DEVEL
#warning TODO: _starpu_fetch_data_on_node should be doing it
#endif
		if (_starpu_data_is_multiformat_handle(child_handle) &&
			starpu_node_get_kind(child_handle->mf_node) != STARPU_CPU_RAM)
		{
			struct starpu_codelet cl =
			{
				.where = STARPU_CPU,
				.cpu_funcs = { _starpu_empty_codelet_function },
				.modes = { STARPU_RW },
				.nbuffers = 1
			};
			struct starpu_task *task = starpu_task_create();
			task->name = "convert_data";

			STARPU_TASK_SET_HANDLE(task, child_handle, 0);
			task->cl = &cl;
			task->synchronous = 1;
			if (_starpu_task_submit_internally(task) != 0)
				_STARPU_ERROR("Could not submit the conversion task while unpartitioning\n");
		}

		int ret;
		/* for now we pretend that the RAM is almost unlimited and that gathering
		 * data should be possible from the node that does the unpartitioning ... we
		 * don't want to have the programmer deal with memory shortage at that time,
		 * really */
		/* Acquire the child data on the gathering node. This will trigger collapsing any reduction */
		ret = starpu_data_acquire_on_node(child_handle, gathering_node, STARPU_RW);
		STARPU_ASSERT(ret == 0);
		starpu_data_release_on_node(child_handle, gathering_node);

		_starpu_spin_lock(&child_handle->header_lock);
		child_handle->busy_waiting = 1;
		_starpu_spin_unlock(&child_handle->header_lock);

		/* Wait for all requests to finish (notably WT requests) */
		STARPU_PTHREAD_MUTEX_LOCK(&child_handle->busy_mutex);
		while (1)
		{
			/* Here helgrind would shout that this is an unprotected access,
			 * but this is actually fine: all threads who do busy_count--
			 * are supposed to call _starpu_data_check_not_busy, which will
			 * wake us up through the busy_mutex/busy_cond. */
			if (!child_handle->busy_count)
				break;
			/* This is woken by _starpu_data_check_not_busy, always called
			 * after decrementing busy_count */
			STARPU_PTHREAD_COND_WAIT(&child_handle->busy_cond, &child_handle->busy_mutex);
		}
		STARPU_PTHREAD_MUTEX_UNLOCK(&child_handle->busy_mutex);

		_starpu_spin_lock(&child_handle->header_lock);

		sizes[child] = _starpu_data_get_size(child_handle);

		_starpu_data_unregister_ram_pointer(child_handle);

		for (worker = 0; worker < nworkers; worker++)
		{
			struct _starpu_data_replicate *local = &child_handle->per_worker[worker];
			STARPU_ASSERT(local->state == STARPU_INVALID);
			if (local->allocated && local->automatically_allocated)
				_starpu_request_mem_chunk_removal(child_handle, local, starpu_worker_get_memory_node(worker), sizes[child]);
		}

		_starpu_memory_stats_free(child_handle);
	}
	/* (snippet truncated here: in the full source the function goes on to
	 * merge the children's per-node states back into root_handle and to
	 * release the header lock) */
}
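/* Hedged usage sketch (user-level counterpart of the gathering above): once a
 * handle is unpartitioned, the gathered content can be inspected through a
 * plain acquire/release pair; data[] stands for the buffer that was
 * registered with the handle. */
static void read_back(starpu_data_handle_t handle, unsigned *data, unsigned n)
{
	starpu_data_unpartition(handle, STARPU_MAIN_RAM);
	starpu_data_acquire(handle, STARPU_R);	/* wait for the gathered copy */
	unsigned i;
	for (i = 0; i < n; i++)
		fprintf(stderr, "%u ", data[i]);
	starpu_data_release(handle);
}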
static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_data_handle_t *childrenp, unsigned nparts, struct starpu_data_filter *f, int inherit_state)
{
	unsigned i;
	unsigned node;

	/* first take care to properly lock the data header */
	_starpu_spin_lock(&initial_handle->header_lock);

	initial_handle->nplans++;

	STARPU_ASSERT_MSG(nparts > 0, "Partitioning data %p in 0 piece does not make sense", initial_handle);

	/* allocate the children */
	if (inherit_state)
	{
		initial_handle->children = (struct _starpu_data_state *) calloc(nparts, sizeof(struct _starpu_data_state));
		STARPU_ASSERT(initial_handle->children);

		/* this handle now has children */
		initial_handle->nchildren = nparts;
	}

	unsigned nworkers = starpu_worker_get_count();

	for (node = 0; node < STARPU_MAXNODES; node++)
	{
		if (initial_handle->per_node[node].state != STARPU_INVALID)
			break;
	}
	if (node == STARPU_MAXNODES)
	{
		/* This is lazy allocation, allocate it now in main RAM, so as
		 * to have somewhere to gather pieces later */
		/* FIXME: mark as unevictable! */
		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[STARPU_MAIN_RAM], 0);
#ifdef STARPU_DEVEL
#warning we should reclaim memory if allocation failed
#endif
		STARPU_ASSERT(!ret);
	}

	for (i = 0; i < nparts; i++)
	{
		starpu_data_handle_t child;

		if (inherit_state)
			child = &initial_handle->children[i];
		else
			child = childrenp[i];
		STARPU_ASSERT(child);

		struct starpu_data_interface_ops *ops;

		/* each child may have its own interface type */
		/* what's this child's interface ? */
		if (f->get_child_ops)
			ops = f->get_child_ops(f, i);
		else
			ops = initial_handle->ops;

		_starpu_data_handle_init(child, ops, initial_handle->mf_node);

		child->nchildren = 0;
		child->nplans = 0;
		child->switch_cl = NULL;
		child->partitioned = 0;
		child->readonly = 0;
		child->mpi_data = initial_handle->mpi_data;
		child->root_handle = initial_handle->root_handle;
		child->father_handle = initial_handle;
		child->sibling_index = i;
		child->depth = initial_handle->depth + 1;

		child->is_not_important = initial_handle->is_not_important;
		child->wt_mask = initial_handle->wt_mask;
		child->home_node = initial_handle->home_node;
		child->is_readonly = initial_handle->is_readonly;

		/* initialize the chunk lock */
		_starpu_data_requester_list_init(&child->req_list);
		_starpu_data_requester_list_init(&child->reduction_req_list);
		child->reduction_tmp_handles = NULL;
		child->write_invalidation_req = NULL;
		child->refcnt = 0;
		child->unlocking_reqs = 0;
		child->busy_count = 0;
		child->busy_waiting = 0;
		STARPU_PTHREAD_MUTEX_INIT(&child->busy_mutex, NULL);
		STARPU_PTHREAD_COND_INIT(&child->busy_cond, NULL);
		child->reduction_refcnt = 0;
		_starpu_spin_init(&child->header_lock);

		child->sequential_consistency = initial_handle->sequential_consistency;

		STARPU_PTHREAD_MUTEX_INIT(&child->sequential_consistency_mutex, NULL);
		child->last_submitted_mode = STARPU_R;
		child->last_sync_task = NULL;
		child->last_submitted_accessors.task = NULL;
		child->last_submitted_accessors.next = &child->last_submitted_accessors;
		child->last_submitted_accessors.prev = &child->last_submitted_accessors;
		child->post_sync_tasks = NULL;
		/* Tell helgrind that the race in _starpu_unlock_post_sync_tasks is fine */
		STARPU_HG_DISABLE_CHECKING(child->post_sync_tasks_cnt);
		child->post_sync_tasks_cnt = 0;

		/* The methods used for reduction are propagated to the
		 * children. */
		child->redux_cl = initial_handle->redux_cl;
		child->init_cl = initial_handle->init_cl;

#ifdef STARPU_USE_FXT
		child->last_submitted_ghost_sync_id_is_valid = 0;
		child->last_submitted_ghost_sync_id = 0;
		child->last_submitted_ghost_accessors_id = NULL;
#endif

		if (_starpu_global_arbiter)
			/* Just for testing purpose */
			starpu_data_assign_arbiter(child, _starpu_global_arbiter);
		else
			child->arbiter = NULL;
		_starpu_data_requester_list_init(&child->arbitered_req_list);

		for (node = 0; node < STARPU_MAXNODES; node++)
		{
			struct _starpu_data_replicate *initial_replicate;
			struct _starpu_data_replicate *child_replicate;

			initial_replicate = &initial_handle->per_node[node];
			child_replicate = &child->per_node[node];

			if (inherit_state)
				child_replicate->state = initial_replicate->state;
			else
				child_replicate->state = STARPU_INVALID;
			if (inherit_state || !initial_replicate->automatically_allocated)
				child_replicate->allocated = initial_replicate->allocated;
			else
				child_replicate->allocated = 0;
			/* Do not allow memory reclaiming within the child for parent bits */
			child_replicate->automatically_allocated = 0;
			child_replicate->refcnt = 0;
			child_replicate->memory_node = node;
			child_replicate->relaxed_coherency = 0;
			if (inherit_state)
				child_replicate->initialized = initial_replicate->initialized;
			else
				child_replicate->initialized = 0;

			/* update the interface */
			void *initial_interface = starpu_data_get_interface_on_node(initial_handle, node);
			void *child_interface = starpu_data_get_interface_on_node(child, node);

			STARPU_ASSERT_MSG(!(!inherit_state && child_replicate->automatically_allocated && child_replicate->allocated), "partition planning is currently not supported when handle has some automatically allocated buffers");
			f->filter_func(initial_interface, child_interface, f, i, nparts);
		}

		unsigned worker;
		for (worker = 0; worker < nworkers; worker++)
		{
			struct _starpu_data_replicate *child_replicate;
			child_replicate = &child->per_worker[worker];

			child_replicate->state = STARPU_INVALID;
			child_replicate->allocated = 0;
			child_replicate->automatically_allocated = 0;
			child_replicate->refcnt = 0;
			child_replicate->memory_node = starpu_worker_get_memory_node(worker);
			child_replicate->requested = 0;

			for (node = 0; node < STARPU_MAXNODES; node++)
			{
				child_replicate->request[node] = NULL;
			}

			child_replicate->relaxed_coherency = 1;
			child_replicate->initialized = 0;

			/* duplicate the content of the interface on node 0 */
			memcpy(child_replicate->data_interface, child->per_node[0].data_interface, child->ops->interface_size);
		}

		/* We compute the size and the footprint of the child once and
		 * store it in the handle */
		child->footprint = _starpu_compute_data_footprint(child);

		void *ptr;
		ptr = starpu_data_handle_to_pointer(child, STARPU_MAIN_RAM);
		if (ptr != NULL)
			_starpu_data_register_ram_pointer(child, ptr);
	}

	/* now release the header lock */
	_starpu_spin_unlock(&initial_handle->header_lock);
}
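/* Hedged usage sketch: the childrenp/inherit_state==0 path above is what
 * backs StarPU's asynchronous partition-planning API, which is typically
 * driven as below; PARTS and the vector handle are assumptions for the
 * illustration. */
#define PARTS 4

static void plan_and_use(starpu_data_handle_t handle)
{
	starpu_data_handle_t children[PARTS];
	struct starpu_data_filter f =
	{
		.filter_func = starpu_vector_filter_block,
		.nchildren = PARTS,
	};

	starpu_data_partition_plan(handle, &f, children);	/* children coexist with the parent */
	starpu_data_partition_submit(handle, PARTS, children);
	/* ... submit tasks working on children[i] ... */
	starpu_data_unpartition_submit(handle, PARTS, children, STARPU_MAIN_RAM);
	starpu_data_partition_clean(handle, PARTS, children);
}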
int main(int argc, char **argv)
{
	unsigned *foo;
	starpu_data_handle_t handle;
	int ret;
	unsigned n, i, size;

	ret = starpu_initialize(NULL, &argc, &argv);
	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

#ifdef STARPU_USE_OPENCL
	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/scal_opencl.cl", &opencl_program, NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
#endif

	n = starpu_worker_get_count();
	if (n == 1)
	{
		starpu_shutdown();
		return STARPU_TEST_SKIPPED;
	}

	size = 10 * n;

	foo = (unsigned *) calloc(size, sizeof(*foo));
	for (i = 0; i < size; i++)
		foo[i] = i;

	starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)foo, size, sizeof(*foo));

	/* Broadcast the data to force in-place partitioning */
	for (i = 0; i < n; i++)
		starpu_data_prefetch_on_node(handle, starpu_worker_get_memory_node(i), 0);

	struct starpu_data_filter f =
	{
		.filter_func = starpu_vector_filter_block,
		.nchildren = n,
	};

	starpu_data_partition(handle, &f);

	for (i = 0; i < f.nchildren; i++)
	{
		struct starpu_task *task = starpu_task_create();

		task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
		task->cl = &scal_codelet;
		task->execute_on_a_specific_worker = 1;
		task->workerid = i;

		ret = starpu_task_submit(task);
		if (ret == -ENODEV) goto enodev;
		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
	}

	ret = starpu_task_wait_for_all();
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");

	starpu_data_unpartition(handle, STARPU_MAIN_RAM);
	starpu_data_unregister(handle);
	starpu_shutdown();

	ret = EXIT_SUCCESS;
	for (i = 0; i < size; i++)
	{
		if (foo[i] != i*2)
		{
			FPRINTF(stderr, "value %u is %u instead of %u\n", i, foo[i], 2*i);
			ret = EXIT_FAILURE;
		}
	}
	return ret;

enodev:
	starpu_data_unregister(handle);
	fprintf(stderr, "WARNING: No one can execute this task\n");
	/* yes, we do not perform the computation but we did detect that no one
	 * could perform the kernel, so this is not an error from StarPU */
	starpu_shutdown();
	return STARPU_TEST_SKIPPED;
}
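/* Hypothetical sketch of the scal codelet the test above references (the
 * real test also provides CUDA/OpenCL variants): double each element of the
 * vector sub-block it is given. */
void scal_func_cpu(void *buffers[], void *cl_arg)
{
	(void)cl_arg;
	struct starpu_vector_interface *vector = (struct starpu_vector_interface *) buffers[0];
	unsigned n = STARPU_VECTOR_GET_NX(vector);
	unsigned *val = (unsigned *) STARPU_VECTOR_GET_PTR(vector);
	unsigned i;

	for (i = 0; i < n; i++)
		val[i] *= 2;
}

static struct starpu_codelet scal_codelet =
{
	.cpu_funcs = { scal_func_cpu },
	.nbuffers = 1,
	.modes = { STARPU_RW },
};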
/* Enqueue a task into the list of tasks explicitly attached to a worker. In
 * case workerid identifies a combined worker, a task will be enqueued into
 * each worker of the combination. */
static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int workerid)
{
	int nbasic_workers = (int)starpu_worker_get_count();

	/* Is this a basic worker or a combined worker ? */
	int is_basic_worker = (workerid < nbasic_workers);

	unsigned memory_node;
	struct _starpu_worker *worker = NULL;
	struct _starpu_combined_worker *combined_worker = NULL;

	if (is_basic_worker)
	{
		worker = _starpu_get_worker_struct(workerid);
		memory_node = worker->memory_node;
	}
	else
	{
		combined_worker = _starpu_get_combined_worker_struct(workerid);
		memory_node = combined_worker->memory_node;
	}

	if (use_prefetch)
		starpu_prefetch_task_input_on_node(task, memory_node);

	if (is_basic_worker)
		_starpu_push_task_on_specific_worker_notify_sched(task, worker, workerid, workerid);
	else
	{
		/* Notify all workers of the combined worker */
		int worker_size = combined_worker->worker_size;
		int *combined_workerid = combined_worker->combined_workerid;

		int j;
		for (j = 0; j < worker_size; j++)
		{
			int subworkerid = combined_workerid[j];
			_starpu_push_task_on_specific_worker_notify_sched(task, _starpu_get_worker_struct(subworkerid), subworkerid, workerid);
		}
	}

#ifdef STARPU_USE_SC_HYPERVISOR
	starpu_sched_ctx_call_pushed_task_cb(workerid, task->sched_ctx);
#endif //STARPU_USE_SC_HYPERVISOR

	unsigned i;
	if (is_basic_worker)
	{
		unsigned node = starpu_worker_get_memory_node(workerid);
		if (_starpu_task_uses_multiformat_handles(task))
		{
			unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
			for (i = 0; i < nbuffers; i++)
			{
				struct starpu_task *conversion_task;
				starpu_data_handle_t handle;

				handle = STARPU_TASK_GET_HANDLE(task, i);
				if (!_starpu_handle_needs_conversion_task(handle, node))
					continue;

				conversion_task = _starpu_create_conversion_task(handle, node);
				conversion_task->mf_skip = 1;
				conversion_task->execute_on_a_specific_worker = 1;
				conversion_task->workerid = workerid;
				_starpu_task_submit_conversion_task(conversion_task, workerid);
				//_STARPU_DEBUG("Pushing a conversion task\n");
			}

			for (i = 0; i < nbuffers; i++)
			{
				starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
				handle->mf_node = node;
			}
		}

//		if (task->sched_ctx != _starpu_get_initial_sched_ctx()->id)
		if (task->priority > 0)
			return _starpu_push_local_task(worker, task, 1);
		else
			return _starpu_push_local_task(worker, task, 0);
	}
	else
	{
		/* This is a combined worker so we create task aliases */
		int worker_size = combined_worker->worker_size;
		int *combined_workerid = combined_worker->combined_workerid;

		int ret = 0;

		struct _starpu_job *job = _starpu_get_job_associated_to_task(task);
		job->task_size = worker_size;
		job->combined_workerid = workerid;
		job->active_task_alias_count = 0;

		STARPU_PTHREAD_BARRIER_INIT(&job->before_work_barrier, NULL, worker_size);
		STARPU_PTHREAD_BARRIER_INIT(&job->after_work_barrier, NULL, worker_size);
		job->after_work_busy_barrier = worker_size;

		/* Note: we have to call that early, or else the task may have
		 * disappeared already */
		starpu_push_task_end(task);

		int j;
		for (j = 0; j < worker_size; j++)
		{
			struct starpu_task *alias = starpu_task_dup(task);
			alias->destroy = 1;

			worker = _starpu_get_worker_struct(combined_workerid[j]);
			ret |= _starpu_push_local_task(worker, alias, 0);
		}

		return ret;
	}
}
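/* Hedged usage sketch: the internal push above is what ultimately serves
 * user-level worker pinning, set on the task before submission; "some_cl"
 * and the handle are hypothetical stand-ins. */
static void submit_pinned_task(struct starpu_codelet *some_cl, starpu_data_handle_t h)
{
	struct starpu_task *task = starpu_task_create();
	task->cl = some_cl;
	task->handles[0] = h;
	task->execute_on_a_specific_worker = 1;	/* bypass the scheduler */
	task->workerid = 0;			/* run on worker 0 */
	task->priority = STARPU_MAX_PRIO;	/* front of the worker's local queue */
	starpu_task_submit(task);
}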