/**
 * Pick the worker that should receive the next task.
 *
 * While performed_total is still below calibration_value, the decision is
 * delegated to select_worker_round_robin(). Afterwards every worker of the
 * scheduling context is scored with overload_metric() and the worker with
 * the strictly smallest score is returned; on ties the first one
 * encountered is kept. If no worker scores below FLT_MAX (for instance
 * because the collection is empty), worker 0 is returned.
 */
static unsigned select_worker_overload(unsigned sched_ctx_id)
{
	/* Not enough history gathered yet: do not try to be clever. */
	if (performed_total < calibration_value)
		return select_worker_round_robin(sched_ctx_id);

	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
	struct starpu_sched_ctx_iterator it;

	unsigned chosen = 0;
	float lowest = FLT_MAX;

	for (workers->init_iterator(workers, &it); workers->has_next(workers, &it); )
	{
		unsigned candidate = workers->get_next(workers, &it);
		float score = overload_metric(sched_ctx_id, candidate);

		/* Strict comparison so that the earliest candidate wins ties. */
		if (score < lowest)
		{
			lowest = score;
			chosen = candidate;
		}
	}

	return chosen;
}
/**
 * Push a task onto the deque of a worker of the task's scheduling context.
 *
 * The scheduling mutex of every worker of the context is taken while the
 * job is appended, then each worker is signalled (unless
 * STARPU_NON_BLOCKING_DRIVERS is defined) and its mutex released.
 *
 * The target deque is the one of the calling worker; when the caller is
 * not a worker (starpu_worker_get_id() returns -1), select_worker() picks
 * the target instead.
 *
 * Always returns 0.
 */
static
int ws_push_task(struct starpu_task *task)
{
	unsigned sched_ctx_id = task->sched_ctx;
	struct _starpu_work_stealing_data *ws = (struct _starpu_work_stealing_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);

	struct _starpu_deque_jobq *deque_queue;
	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
	int workerid = starpu_worker_get_id();

	unsigned worker = 0;
	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
	struct starpu_sched_ctx_iterator it;

	workers->init_iterator(workers, &it);
	/* !! It is a pity to lock everything! */
	while(workers->has_next(workers, &it))
	{
		worker = workers->get_next(workers, &it);
		starpu_pthread_mutex_t *sched_mutex;
		starpu_pthread_cond_t *sched_cond;
		starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
		STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
	}

	/* If the current thread is not a worker but
	 * the main thread (-1), we find the better one to
	 * put task on its queue */
	if (workerid == -1)
		workerid = select_worker(sched_ctx_id);

	deque_queue = ws->queue_array[workerid];

#ifdef HAVE_AYUDAME_H
	if (AYU_event)
	{
		intptr_t id = workerid;
		AYU_event(AYU_ADDTASKTOQUEUE, j->job_id, &id);
	}
#endif
	_starpu_job_list_push_back(&deque_queue->jobq, j);
	deque_queue->njobs++;
	starpu_push_task_end(task);

	/* The locking loop above consumed the iterator: restart it, otherwise
	 * the loop below would not visit any worker and every mutex taken
	 * above would stay locked. */
	workers->init_iterator(workers, &it);
	while(workers->has_next(workers, &it))
	{
		worker = workers->get_next(workers, &it);
		starpu_pthread_mutex_t *sched_mutex;
		starpu_pthread_cond_t *sched_cond;
		starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
#ifndef STARPU_NON_BLOCKING_DRIVERS
		/* Wake the worker in case it is waiting for a task. */
		STARPU_PTHREAD_COND_SIGNAL(sched_cond);
#endif
		STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
	}

	return 0;
}
/**
 * Create artificial intermediate combined workers when an object has more
 * populated children than synthesize_arity.
 *
 * children         array of child objects to group
 * min, max         inclusive bounds on the size of a combined worker
 * arity            number of entries of children to scan
 * n                number of children carrying workers (callers in this
 *                  file pass the count of children with non-NULL userdata)
 * synthesize_arity maximum number of populated children tolerated before
 *                  intermediate groups are synthesized
 *
 * Children whose userdata is non-NULL are gathered into chunks of
 * ceil(n / synthesize_arity) entries. For each chunk whose worker count
 * lies within [min, max], a combined worker is assigned and added to the
 * worker collection of the current scheduling context (context 0 when
 * starpu_sched_ctx_get_context() returns STARPU_NMAX_SCHED_CTXS). The
 * function then recurses on the chunk.
 */
static void synthesize_intermediate_workers(hwloc_obj_t *children, unsigned min, unsigned max, unsigned arity, unsigned n, unsigned synthesize_arity)
{
	unsigned nworkers, i, j;
	/* Chunk size is n / synthesize_arity rounded up. */
	unsigned chunk_size = (n + synthesize_arity-1) / synthesize_arity;
	unsigned chunk_start;
	int cpu_workers[STARPU_NMAXWORKERS];
	int ret;

	if (n <= synthesize_arity)
		/* Not too many children, do not synthesize */
		return;

	_STARPU_DEBUG("%u children > %u, synthesizing intermediate combined workers of size %u\n", n, synthesize_arity, chunk_size);

	/* From here on, n and j both count the populated children of the
	 * chunk being built; nworkers counts the workers found_workers()
	 * collected for that chunk. */
	n = 0;
	j = 0;
	nworkers = 0;
	chunk_start = 0;
	for (i = 0 ; i < arity; i++)
	{
		if (children[i]->userdata)
		{
			n++;
			_STARPU_DEBUG("child %u\n", i);
			/* Appends the workers under this child to cpu_workers
			 * and advances nworkers accordingly. */
			find_workers(children[i], cpu_workers, &nworkers);
			j++;
		}
		/* Completed a chunk, or last bit (but not if it's just 1 subobject) */
		if (j == chunk_size || (i == arity-1 && j > 1))
		{
			if (nworkers >= min && nworkers <= max)
			{
				unsigned sched_ctx_id  = starpu_sched_ctx_get_context();
				/* Fall back to context 0 when no context is set. */
				if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
					sched_ctx_id = 0;
				struct starpu_worker_collection* workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);

				_STARPU_DEBUG("Adding it\n");
				ret = starpu_combined_worker_assign_workerid(nworkers, cpu_workers);
				STARPU_ASSERT(ret >= 0);
				workers->add(workers,ret);
			}
			/* Recurse there */
			/* NOTE(review): the arity passed is i - chunk_start, which
			 * covers children[chunk_start] .. children[i-1] and thus
			 * leaves out children[i] even though it was counted in n.
			 * Confirm whether i - chunk_start + 1 was intended. */
			synthesize_intermediate_workers(children+chunk_start, min, max, i - chunk_start, n, synthesize_arity);
			/* And restart another one */
			n = 0;
			j = 0;
			nworkers = 0;
			chunk_start = i+1;
		}
	}
}
/**
 * Walk the hwloc topology below obj and register combined workers.
 *
 * obj              topology object to examine
 * min, max         inclusive bounds on the size of a combined worker
 * synthesize_arity forwarded to synthesize_intermediate_workers()
 *
 * If exactly one child of obj has non-NULL userdata, the function descends
 * straight into that child. Otherwise, when the number of workers found
 * under obj lies within [min, max], a combined worker is assigned and added
 * to the worker collection of the current scheduling context (context 0
 * when starpu_sched_ctx_get_context() returns STARPU_NMAX_SCHED_CTXS).
 * Intermediate groups are then synthesized, and the function recurses into
 * every child whose userdata is (void*) -1.
 */
static void find_and_assign_combinations(hwloc_obj_t obj, unsigned min, unsigned max, unsigned synthesize_arity)
{
	char name[64];
	unsigned i, n, nworkers;
	/* Index of the first child with non-NULL userdata (valid when n > 0). */
	unsigned first_populated = 0;
	int cpu_workers[STARPU_NMAXWORKERS];

	struct _starpu_machine_config *config = _starpu_get_machine_config();
	struct _starpu_machine_topology *topology = &config->topology;

	hwloc_obj_snprintf(name, sizeof(name), topology->hwtopology, obj, "#", 0);
	_STARPU_DEBUG("Looking at %s\n", name);

	for (n = 0, i = 0; i < obj->arity; i++)
	{
		if (obj->children[i]->userdata)
		{
			/* it has a CPU worker */
			if (n == 0)
				first_populated = i;
			n++;
		}
	}

	if (n == 1)
	{
		/* Only one child carries workers: go to the next level right
		 * away, through that child. It is not necessarily children[0],
		 * so descend into the one that was actually found. */
		find_and_assign_combinations(obj->children[first_populated], min, max, synthesize_arity);
		return;
	}

	/* Add this object */
	nworkers = 0;
	find_workers(obj, cpu_workers, &nworkers);

	if (nworkers >= min && nworkers <= max)
	{
		_STARPU_DEBUG("Adding it\n");
		unsigned sched_ctx_id  = starpu_sched_ctx_get_context();
		/* Fall back to context 0 when no context is set. */
		if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
			sched_ctx_id = 0;

		struct starpu_worker_collection* workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);

		int newworkerid = starpu_combined_worker_assign_workerid(nworkers, cpu_workers);
		STARPU_ASSERT(newworkerid >= 0);
		workers->add(workers,newworkerid);
	}

	/* Add artificial intermediate objects recursively */
	synthesize_intermediate_workers(obj->children, min, max, obj->arity, n, synthesize_arity);

	/* And recurse */
	for (i = 0; i < obj->arity; i++)
		if (obj->children[i]->userdata == (void*) -1)
			find_and_assign_combinations(obj->children[i], min, max, synthesize_arity);
}
static int push_task_dummy(struct starpu_task *task)
{
	unsigned sched_ctx_id = task->sched_ctx;
	struct dummy_sched_data *data = (struct dummy_sched_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);

	/* NB: In this simplistic strategy, we assume that the context in which
	   we push task has at least one worker*/


	/* lock all workers when pushing tasks on a list where all
	   of them would pop for tasks */
        starpu_pthread_mutex_lock(&data->policy_mutex);

	starpu_task_list_push_front(&data->sched_list, task);

	starpu_push_task_end(task);
	starpu_pthread_mutex_unlock(&data->policy_mutex);


        /*if there are no tasks block */
        /* wake people waiting for a task */
        unsigned worker = 0;
	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);

        struct starpu_sched_ctx_iterator it;

	workers->init_iterator(workers, &it);
	while(workers->has_next(workers, &it))
        {
                worker = workers->get_next(workers, &it);
		starpu_pthread_mutex_t *sched_mutex;
                starpu_pthread_cond_t *sched_cond;
                starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
		starpu_pthread_mutex_lock(sched_mutex);
                starpu_pthread_cond_signal(sched_cond);
                starpu_pthread_mutex_unlock(sched_mutex);
        }

	return 0;
}
/**
 * Register one combined worker for every size between the configured
 * minimum and maximum, built from the CPU workers listed in workerids.
 *
 * workerids  worker identifiers to consider
 * nworkers   number of entries in workerids
 *
 * The minimum size comes from STARPU_MIN_WORKERSIZE and is raised to 1 when
 * lower. The maximum size comes from STARPU_MAX_WORKERSIZE and is replaced
 * by the number of CPU workers when it is -1 or exceeds that number. Each
 * new combined worker is added to the worker collection of the current
 * scheduling context (context 0 when starpu_sched_ctx_get_context() returns
 * STARPU_NMAX_SCHED_CTXS).
 */
static void combine_all_cpu_workers(int *workerids, int nworkers)
{
	unsigned ctx_id = starpu_sched_ctx_get_context();
	if (ctx_id == STARPU_NMAX_SCHED_CTXS)
		ctx_id = 0;
	struct starpu_worker_collection *collection = starpu_sched_ctx_get_worker_collection(ctx_id);

	/* Keep only the identifiers of CPU workers. */
	int cpu_ids[STARPU_NMAXWORKERS];
	int cpu_count = 0;
	int idx;
	for (idx = 0; idx < nworkers; idx++)
	{
		struct _starpu_worker *candidate = _starpu_get_worker_struct(workerids[idx]);

		if (candidate->arch != STARPU_CPU_WORKER)
			continue;

		cpu_ids[cpu_count] = workerids[idx];
		cpu_count++;
	}

	/* Size bounds, taken from the environment and then clamped. */
	int smallest = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
	if (smallest < 1)
		smallest = 1;
	int largest = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
	if (largest == -1 || largest > cpu_count)
		largest = cpu_count;

	int size = smallest;
	while (size <= largest)
	{
		int combined_id = starpu_combined_worker_assign_workerid(size, cpu_ids);
		STARPU_ASSERT(combined_id >= 0);
		collection->add(collection, combined_id);
		size++;
	}
}
/**
 * Build combined workers without topology information.
 *
 * workerids  worker identifiers to consider
 * nworkers   number of entries in workerids
 *
 * CPU workers are gathered in one array and, when STARPU_USE_MIC is
 * defined, MIC workers are gathered per device. Each group is then handed
 * to assign_combinations_without_hwloc() together with size bounds read
 * from STARPU_MIN_WORKERSIZE (raised to 2 when lower) and
 * STARPU_MAX_WORKERSIZE (replaced by the group size when it is -1 or
 * larger than the group). Combined workers go to the worker collection of
 * the current scheduling context (context 0 when
 * starpu_sched_ctx_get_context() returns STARPU_NMAX_SCHED_CTXS).
 */
static void find_and_assign_combinations_without_hwloc(int *workerids, int nworkers)
{
	int i;
	unsigned sched_ctx_id  = starpu_sched_ctx_get_context();
	if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
		sched_ctx_id = 0;
	int min, max;
#ifdef STARPU_USE_MIC
	unsigned j;
	int mic_min, mic_max;
#endif

	struct starpu_worker_collection* workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);

	/* We put the id of all CPU workers in this array */
	int cpu_workers[STARPU_NMAXWORKERS];
	unsigned ncpus = 0;
#ifdef STARPU_USE_MIC
	/* Per-device bookkeeping: mic_id[k] is the device id stored in slot k
	 * (-1 while the slot is free), nmics_table[k] the number of workers
	 * recorded for it and mic_workers[k] their identifiers. */
	/* NOTE(review): the results of these allocations are not checked. */
	unsigned nb_mics = _starpu_get_machine_config()->topology.nmicdevices;
	unsigned * nmics_table;
	int * mic_id;
	int ** mic_workers;
	mic_id = malloc(sizeof(int)*nb_mics);
	nmics_table = malloc(sizeof(unsigned)*nb_mics);
	mic_workers = malloc(sizeof(int*)*nb_mics);
	for(j=0; j<nb_mics; j++)
	{
		mic_id[j] = -1;
		nmics_table[j] = 0;
		mic_workers[j] = malloc(sizeof(int)*STARPU_NMAXWORKERS);
	}
#endif /* STARPU_USE_MIC */

	struct _starpu_worker *worker;
	for (i = 0; i < nworkers; i++)
	{
		worker = _starpu_get_worker_struct(workerids[i]);
		if (worker->arch == STARPU_CPU_WORKER)
			/* Record the worker identifier itself, not its position
			 * in workerids: the two differ whenever workerids is
			 * not the identity mapping. */
			cpu_workers[ncpus++] = workerids[i];
#ifdef STARPU_USE_MIC
		else if(worker->arch == STARPU_MIC_WORKER)
		{
			/* Look for the slot of this device, or the first free
			 * slot. The bound is tested first so that mic_id[j] is
			 * never read past the end of the array. */
			for(j=0; j<nb_mics && mic_id[j] != worker->devid && mic_id[j] != -1; j++)
				;
			if(j<nb_mics)
			{
				if(mic_id[j] == -1)
				{
					mic_id[j] = worker->devid;
				}
				mic_workers[j][nmics_table[j]++] = workerids[i];
			}
		}
#endif /* STARPU_USE_MIC */

	}


	min = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
	if (min < 2)
		min = 2;
	max = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
	if (max == -1 || max > (int) ncpus)
		max = ncpus;

	assign_combinations_without_hwloc(workers,cpu_workers,ncpus,min,max);
#ifdef STARPU_USE_MIC
	mic_min = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
	if (mic_min < 2)
		mic_min = 2;
	for(j=0; j<nb_mics; j++)
	{
		/* The upper bound is clamped to the size of each device group. */
		mic_max = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
		if (mic_max == -1 || mic_max > (int) nmics_table[j])
			mic_max = nmics_table[j];
		assign_combinations_without_hwloc(workers,mic_workers[j],nmics_table[j],mic_min,mic_max);
		free(mic_workers[j]);
	}
	free(mic_id);
	free(nmics_table);
	free(mic_workers);
#endif /* STARPU_USE_MIC */
}