static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers) { struct _starpu_machine_config *config = _starpu_get_machine_config(); struct _starpu_machine_topology *topology = &config->topology; int synthesize_arity = starpu_get_env_number("STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER"); int min = starpu_get_env_number("STARPU_MIN_WORKERSIZE"); if (min < 2) min = 2; int max = starpu_get_env_number("STARPU_MAX_WORKERSIZE"); if (max == -1) max = INT_MAX; if (synthesize_arity == -1) synthesize_arity = 2; /* First, mark nodes which contain CPU workers, simply by setting their userdata field */ int i; for (i = 0; i < nworkers; i++) { struct _starpu_worker *worker = _starpu_get_worker_struct(workerids[i]); if (worker->perf_arch.devices[0].type == STARPU_CPU_WORKER && worker->perf_arch.devices[0].ncores == 1) { hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->pu_depth, worker->bindid); obj = obj->parent; while (obj) { obj->userdata = (void*) -1; obj = obj->parent; } } } find_and_assign_combinations(hwloc_get_root_obj(topology->hwtopology), min, max, synthesize_arity); }
static void find_and_assign_combinations(hwloc_obj_t obj, unsigned min, unsigned max, unsigned synthesize_arity) { char name[64]; unsigned i, n, nworkers; int cpu_workers[STARPU_NMAXWORKERS]; struct _starpu_machine_config *config = _starpu_get_machine_config(); struct _starpu_machine_topology *topology = &config->topology; hwloc_obj_snprintf(name, sizeof(name), topology->hwtopology, obj, "#", 0); _STARPU_DEBUG("Looking at %s\n", name); for (n = 0, i = 0; i < obj->arity; i++) if (obj->children[i]->userdata) /* it has a CPU worker */ n++; if (n == 1) { /* If there is only one child, we go to the next level right away */ find_and_assign_combinations(obj->children[0], min, max, synthesize_arity); return; } /* Add this object */ nworkers = 0; find_workers(obj, cpu_workers, &nworkers); if (nworkers >= min && nworkers <= max) { _STARPU_DEBUG("Adding it\n"); unsigned sched_ctx_id = starpu_sched_ctx_get_context(); if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS) sched_ctx_id = 0; struct starpu_worker_collection* workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id); int newworkerid = starpu_combined_worker_assign_workerid(nworkers, cpu_workers); STARPU_ASSERT(newworkerid >= 0); workers->add(workers,newworkerid); } /* Add artificial intermediate objects recursively */ synthesize_intermediate_workers(obj->children, min, max, obj->arity, n, synthesize_arity); /* And recurse */ for (i = 0; i < obj->arity; i++) if (obj->children[i]->userdata == (void*) -1) find_and_assign_combinations(obj->children[i], min, max, synthesize_arity); }
void _starpu_sched_find_worker_combinations(int *workerids, int nworkers) { struct _starpu_machine_config *config = _starpu_get_machine_config(); if (config->conf->single_combined_worker > 0) combine_all_cpu_workers(workerids, nworkers); else { #ifdef STARPU_HAVE_HWLOC find_and_assign_combinations_with_hwloc(workerids, nworkers); #else find_and_assign_combinations_without_hwloc(workerids, nworkers); #endif } }
unsigned starpu_worker_get_memory_node(unsigned workerid) { struct _starpu_machine_config *config = _starpu_get_machine_config(); /* This workerid may either be a basic worker or a combined worker */ unsigned nworkers = config->topology.nworkers; if (workerid < config->topology.nworkers) return config->workers[workerid].memory_node; /* We have a combined worker */ unsigned ncombinedworkers = config->topology.ncombinedworkers; STARPU_ASSERT_MSG(workerid < ncombinedworkers + nworkers, "Bad workerid %u, maximum %u", workerid, ncombinedworkers + nworkers); return config->combined_workers[workerid - nworkers].memory_node; }
static void find_and_assign_combinations_without_hwloc(int *workerids, int nworkers) { int i; unsigned sched_ctx_id = starpu_sched_ctx_get_context(); if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS) sched_ctx_id = 0; int min, max; #ifdef STARPU_USE_MIC unsigned j; int mic_min, mic_max; #endif struct starpu_worker_collection* workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id); /* We put the id of all CPU workers in this array */ int cpu_workers[STARPU_NMAXWORKERS]; unsigned ncpus = 0; #ifdef STARPU_USE_MIC unsigned nb_mics = _starpu_get_machine_config()->topology.nmicdevices; unsigned * nmics_table; int * mic_id; int ** mic_workers; mic_id = malloc(sizeof(int)*nb_mics); nmics_table = malloc(sizeof(unsigned)*nb_mics); mic_workers = malloc(sizeof(int*)*nb_mics); for(j=0; j<nb_mics; j++) { mic_id[j] = -1; nmics_table[j] = 0; mic_workers[j] = malloc(sizeof(int)*STARPU_NMAXWORKERS); } #endif /* STARPU_USE_MIC */ struct _starpu_worker *worker; for (i = 0; i < nworkers; i++) { worker = _starpu_get_worker_struct(workerids[i]); if (worker->arch == STARPU_CPU_WORKER) cpu_workers[ncpus++] = i; #ifdef STARPU_USE_MIC else if(worker->arch == STARPU_MIC_WORKER) { for(j=0; mic_id[j] != worker->devid && mic_id[j] != -1 && j<nb_mics; j++); if(j<nb_mics) { if(mic_id[j] == -1) { mic_id[j] = worker->devid; } mic_workers[j][nmics_table[j]++] = i; } } #endif /* STARPU_USE_MIC */ } min = starpu_get_env_number("STARPU_MIN_WORKERSIZE"); if (min < 2) min = 2; max = starpu_get_env_number("STARPU_MAX_WORKERSIZE"); if (max == -1 || max > (int) ncpus) max = ncpus; assign_combinations_without_hwloc(workers,cpu_workers,ncpus,min,max); #ifdef STARPU_USE_MIC mic_min = starpu_get_env_number("STARPU_MIN_WORKERSIZE"); if (mic_min < 2) mic_min = 2; for(j=0; j<nb_mics; j++) { mic_max = starpu_get_env_number("STARPU_MAX_WORKERSIZE"); if (mic_max == -1 || mic_max > (int) nmics_table[j]) mic_max = nmics_table[j]; assign_combinations_without_hwloc(workers,mic_workers[j],nmics_table[j],mic_min,mic_max); free(mic_workers[j]); } free(mic_id); free(nmics_table); free(mic_workers); #endif /* STARPU_USE_MIC */ }
int _starpu_push_task_to_workers(struct starpu_task *task) { struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx); unsigned nworkers = 0; _STARPU_TRACE_JOB_PUSH(task, task->priority > 0); /* if the contexts still does not have workers put the task back to its place in the empty ctx list */ if(!sched_ctx->is_initial_sched) { /*if there are workers in the ctx that are not able to execute tasks we consider the ctx empty */ nworkers = _starpu_nworkers_able_to_execute_task(task, sched_ctx); if (nworkers == 0) { STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->empty_ctx_mutex); starpu_task_list_push_back(&sched_ctx->empty_ctx_tasks, task); STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->empty_ctx_mutex); #ifdef STARPU_USE_SC_HYPERVISOR if(sched_ctx != NULL && sched_ctx->id != 0 && sched_ctx->perf_counters != NULL && sched_ctx->perf_counters->notify_empty_ctx) { _STARPU_TRACE_HYPERVISOR_BEGIN(); sched_ctx->perf_counters->notify_empty_ctx(sched_ctx->id, task); _STARPU_TRACE_HYPERVISOR_END(); } #endif return -EAGAIN; } } _starpu_profiling_set_task_push_start_time(task); int ret = 0; if (STARPU_UNLIKELY(task->execute_on_a_specific_worker)) { unsigned node = starpu_worker_get_memory_node(task->workerid); if (starpu_get_prefetch_flag()) starpu_prefetch_task_input_on_node(task, node); ret = _starpu_push_task_on_specific_worker(task, task->workerid); } else { struct _starpu_machine_config *config = _starpu_get_machine_config(); /* When a task can only be executed on a given arch and we have * only one memory node for that arch, we can systematically * prefetch before the scheduling decision. */ if (starpu_get_prefetch_flag()) { if (task->cl->where == STARPU_CPU && config->cpus_nodeid >= 0) starpu_prefetch_task_input_on_node(task, config->cpus_nodeid); else if (task->cl->where == STARPU_CUDA && config->cuda_nodeid >= 0) starpu_prefetch_task_input_on_node(task, config->cuda_nodeid); else if (task->cl->where == STARPU_OPENCL && config->opencl_nodeid >= 0) starpu_prefetch_task_input_on_node(task, config->opencl_nodeid); else if (task->cl->where == STARPU_MIC && config->mic_nodeid >= 0) starpu_prefetch_task_input_on_node(task, config->mic_nodeid); else if (task->cl->where == STARPU_SCC && config->scc_nodeid >= 0) starpu_prefetch_task_input_on_node(task, config->scc_nodeid); } if(!sched_ctx->sched_policy) { /* Note: we have to call that early, or else the task may have * disappeared already */ starpu_push_task_end(task); if(!sched_ctx->awake_workers) ret = _starpu_push_task_on_specific_worker(task, sched_ctx->main_master); else { struct starpu_worker_collection *workers = sched_ctx->workers; struct _starpu_job *job = _starpu_get_job_associated_to_task(task); job->task_size = workers->nworkers; job->combined_workerid = -1; // workerid; its a ctx not combined worker job->active_task_alias_count = 0; STARPU_PTHREAD_BARRIER_INIT(&job->before_work_barrier, NULL, workers->nworkers); STARPU_PTHREAD_BARRIER_INIT(&job->after_work_barrier, NULL, workers->nworkers); job->after_work_busy_barrier = workers->nworkers; unsigned workerid; struct starpu_sched_ctx_iterator it; if(workers->init_iterator) workers->init_iterator(workers, &it); while(workers->has_next(workers, &it)) { workerid = workers->get_next(workers, &it); struct starpu_task *alias = starpu_task_dup(task); alias->destroy = 1; ret |= _starpu_push_task_on_specific_worker(alias, workerid); } } } else { STARPU_ASSERT(sched_ctx->sched_policy->push_task); /* check out if there are any workers in the context */ starpu_pthread_rwlock_t *changing_ctx_mutex = _starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx->id); STARPU_PTHREAD_RWLOCK_RDLOCK(changing_ctx_mutex); nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id); if (nworkers == 0) ret = -1; else { _STARPU_TRACE_WORKER_SCHEDULING_PUSH; ret = sched_ctx->sched_policy->push_task(task); _STARPU_TRACE_WORKER_SCHEDULING_POP; } STARPU_PTHREAD_RWLOCK_UNLOCK(changing_ctx_mutex); } if(ret == -1) { fprintf(stderr, "repush task \n"); _STARPU_TRACE_JOB_POP(task, task->priority > 0); ret = _starpu_push_task_to_workers(task); } } /* Note: from here, the task might have been destroyed already! */ _STARPU_LOG_OUT(); return ret; }