Example #1
static void _handle_pending_node_data_requests(uint32_t src_node, unsigned force)
{
//	_STARPU_DEBUG("_starpu_handle_pending_node_data_requests ...\n");

	PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);

	/* for all entries of the list */
	starpu_data_request_list_t local_list = data_requests_pending[src_node];
	data_requests_pending[src_node] = starpu_data_request_list_new();

	PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);

	while (!starpu_data_request_list_empty(local_list))
	{
		starpu_data_request_t r;
		r = starpu_data_request_list_pop_back(local_list);

		if (r->src_handle != r->dst_handle) {
			_starpu_spin_lock(&r->src_handle->header_lock);
			_starpu_spin_lock(&r->dst_handle->header_lock);
		}
		else
			_starpu_spin_lock(&r->src_handle->header_lock);
	
		_starpu_spin_lock(&r->lock);
	
		/* wait until the transfer is terminated */
		if (force)
		{
			_starpu_driver_wait_request_completion(r, src_node);
			starpu_handle_data_request_completion(r);
		}
		else {
			if (_starpu_driver_test_request_completion(r, src_node))
			{
				
				starpu_handle_data_request_completion(r);
			}
			else {
				_starpu_spin_unlock(&r->lock);
				if (r->src_handle != r->dst_handle) {
					_starpu_spin_unlock(&r->src_handle->header_lock);
					_starpu_spin_unlock(&r->dst_handle->header_lock);
				}
				else
					_starpu_spin_unlock(&r->src_handle->header_lock);

				/* wake the requesting worker up */
				PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
				starpu_data_request_list_push_front(data_requests_pending[src_node], r);
				PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
			}
		}
	}

	starpu_data_request_list_delete(local_list);
}
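
The paired locking of the source and destination handle headers above (lock both only when they differ) recurs verbatim in examples #4 and #14 below. A minimal sketch of how that pairing could be factored into helpers, assuming only the r->src_handle and r->dst_handle fields shown in these examples; the helper names are made up for illustration:

/* Hypothetical helpers: take/release the header locks of both handles involved
 * in a request, without locking the same handle twice when src == dst. */
static void lock_request_handles(starpu_data_request_t r)
{
	_starpu_spin_lock(&r->src_handle->header_lock);
	if (r->src_handle != r->dst_handle)
		_starpu_spin_lock(&r->dst_handle->header_lock);
}

static void unlock_request_handles(starpu_data_request_t r)
{
	if (r->src_handle != r->dst_handle)
		_starpu_spin_unlock(&r->dst_handle->header_lock);
	_starpu_spin_unlock(&r->src_handle->header_lock);
}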
Example #2
/* Returns whether the completion was already terminated, and caller should
 * thus immediately proceed. */
int _starpu_add_successor_to_cg_list(struct _starpu_cg_list *successors, struct _starpu_cg *cg)
{
	int ret;
	STARPU_ASSERT(cg);

	_starpu_spin_lock(&successors->lock);
	ret = successors->terminated;

	/* where should that cg be put in the array? */
	unsigned index = successors->nsuccs++;

#ifdef STARPU_DYNAMIC_DEPS_SIZE
	if (index >= successors->succ_list_size)
	{
		/* the successor list is too small */
		if (successors->succ_list_size > 0)
			successors->succ_list_size *= 2;
		else
			successors->succ_list_size = 4;

		successors->succ = (struct _starpu_cg **) realloc(successors->succ,
			successors->succ_list_size*sizeof(struct _starpu_cg *));
	}
#else
	STARPU_ASSERT(index < STARPU_NMAXDEPS);
#endif
	successors->succ[index] = cg;
	_starpu_spin_unlock(&successors->lock);

	return ret;
}
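
Per the comment above, the return value tells the caller whether the termination already happened, so the dependency it just recorded is already fulfilled. A hedged sketch of a hypothetical caller acting on that (the wrapper function is made up; _starpu_notify_cg is the notification routine used in example #7):

/* Hypothetical caller: register cg as a successor, and if the completion group
 * list had already terminated, notify cg immediately instead of waiting. */
static void add_succ_or_notify(struct _starpu_cg_list *successors, struct _starpu_cg *cg)
{
	if (_starpu_add_successor_to_cg_list(successors, cg))
		/* already terminated: the dependency is already satisfied */
		_starpu_notify_cg(cg);
}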
Example #3
static void _starpu_tag_free(void *_tag)
{
	struct _starpu_tag *tag = (struct _starpu_tag *) _tag;

	if (tag)
	{
		_starpu_spin_lock(&tag->lock);

		unsigned nsuccs = tag->tag_successors.nsuccs;
		unsigned succ;

		for (succ = 0; succ < nsuccs; succ++)
		{
			struct _starpu_cg *cg = tag->tag_successors.succ[succ];

			unsigned ntags = STARPU_ATOMIC_ADD(&cg->ntags, -1);
			unsigned remaining STARPU_ATTRIBUTE_UNUSED = STARPU_ATOMIC_ADD(&cg->remaining, -1);

			if (!ntags && (cg->cg_type == STARPU_CG_TAG))
				/* Last tag this cg depends on, cg becomes unreferenced */
				free(cg);
		}

#ifdef STARPU_DYNAMIC_DEPS_SIZE
		free(tag->tag_successors.succ);
#endif

		_starpu_spin_unlock(&tag->lock);
		_starpu_spin_destroy(&tag->lock);

		free(tag);
	}
}
Example #4
/* TODO : accounting to see how much time was spent working for other people ... */
static int starpu_handle_data_request(starpu_data_request_t r, unsigned may_alloc)
{
	if (r->src_handle != r->dst_handle) {
		_starpu_spin_lock(&r->src_handle->header_lock);
		_starpu_spin_lock(&r->dst_handle->header_lock);
	}
	else
		_starpu_spin_lock(&r->src_handle->header_lock);

	_starpu_spin_lock(&r->lock);

	if (r->mode & STARPU_R)
	{
		STARPU_ASSERT(r->src_handle->per_node[r->src_node].allocated);
		STARPU_ASSERT(r->src_handle->per_node[r->src_node].refcnt);
	}

	/* perform the transfer */
	/* the header of the data must be locked by the worker that submitted the request */
	r->retval = _starpu_driver_copy_data_1_to_1(r->src_handle, r->src_node, r->dst_handle, r->dst_node, !(r->mode & STARPU_R), r, may_alloc);

	if (r->retval == ENOMEM)
	{
		_starpu_spin_unlock(&r->lock);
		if (r->src_handle != r->dst_handle) {
			_starpu_spin_unlock(&r->src_handle->header_lock);
			_starpu_spin_unlock(&r->dst_handle->header_lock);
		}
		else
			_starpu_spin_unlock(&r->src_handle->header_lock);

		return ENOMEM;
	}

	if (r->retval == EAGAIN)
	{
		_starpu_spin_unlock(&r->lock);
		if (r->src_handle != r->dst_handle) {
			_starpu_spin_unlock(&r->src_handle->header_lock);
			_starpu_spin_unlock(&r->dst_handle->header_lock);
		}
		else
			_starpu_spin_unlock(&r->src_handle->header_lock);

		/* the request is pending and we put it in the corresponding queue  */
		PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node]);
		starpu_data_request_list_push_front(data_requests_pending[r->handling_node], r);
		PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[r->handling_node]);

		return EAGAIN;
	}

	/* the request has been handled */
	starpu_handle_data_request_completion(r);

	return 0;
}
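
The function reports three outcomes: 0 (transfer done and completion handled), ENOMEM (allocation failed, the caller should retry later) and EAGAIN (transfer posted asynchronously and already queued on data_requests_pending). A sketch of a hypothetical draining loop written against that contract, reusing the list primitives from example #1:

/* Hypothetical caller: pop queued requests and re-queue only the ones that
 * could not be served because memory allocation failed. */
static void drain_data_requests(starpu_data_request_list_t list, unsigned may_alloc,
				starpu_data_request_list_t retry_list)
{
	while (!starpu_data_request_list_empty(list))
	{
		starpu_data_request_t r = starpu_data_request_list_pop_back(list);

		switch (starpu_handle_data_request(r, may_alloc))
		{
		case ENOMEM:
			/* no memory yet: keep the request for a later attempt */
			starpu_data_request_list_push_front(retry_list, r);
			break;
		case EAGAIN:
			/* r is already on data_requests_pending[r->handling_node] */
			break;
		default:
			/* 0: completion was already handled, nothing left to do */
			break;
		}
	}
}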
Example #5
int _starpu_wait_data_request_completion(starpu_data_request_t r, unsigned may_alloc)
{
	int retval;
	int do_delete = 0;

	uint32_t local_node = _starpu_get_local_memory_node();

	do {
		_starpu_spin_lock(&r->lock);

		if (r->completed)
			break;

		_starpu_spin_unlock(&r->lock);

#ifndef STARPU_NON_BLOCKING_DRIVERS
		_starpu_wake_all_blocked_workers_on_node(r->handling_node);
#endif

		_starpu_datawizard_progress(local_node, may_alloc);

	} while (1);


	retval = r->retval;
	if (retval)
		_STARPU_DISP("REQUEST %p COMPLETED (retval %d) !\n", r, r->retval);
		

	r->refcnt--;

	/* if nobody is waiting on that request, we can get rid of it */
	if (r->refcnt == 0)
		do_delete = 1;

	_starpu_spin_unlock(&r->lock);
	
	if (do_delete)
		starpu_data_request_destroy(r);
	
	return retval;
}
Example #6
/* The data must be released by calling starpu_data_release later on */
int starpu_data_acquire_cb(starpu_data_handle handle,
		starpu_access_mode mode, void (*callback)(void *), void *arg)
{
	STARPU_ASSERT(handle);

	struct user_interaction_wrapper *wrapper = malloc(sizeof(struct user_interaction_wrapper));
	STARPU_ASSERT(wrapper);

	wrapper->handle = handle;
	wrapper->mode = mode;
	wrapper->callback = callback;
	wrapper->callback_arg = arg;
	PTHREAD_COND_INIT(&wrapper->cond, NULL);
	PTHREAD_MUTEX_INIT(&wrapper->lock, NULL);
	wrapper->finished = 0;

// TODO: instead of having the is_prefetch argument, _starpu_fetch_data should consider two flags: async and detached
	_starpu_spin_lock(&handle->header_lock);
	handle->per_node[0].refcnt++;
	_starpu_spin_unlock(&handle->header_lock);

	PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
	int sequential_consistency = handle->sequential_consistency;
	if (sequential_consistency)
	{
		wrapper->pre_sync_task = starpu_task_create();
		wrapper->pre_sync_task->callback_func = starpu_data_acquire_cb_pre_sync_callback;
		wrapper->pre_sync_task->callback_arg = wrapper;

		wrapper->post_sync_task = starpu_task_create();

#ifdef STARPU_USE_FXT
		starpu_job_t job = _starpu_get_job_associated_to_task(wrapper->pre_sync_task);
		job->model_name = "acquire_cb_pre";
		job = _starpu_get_job_associated_to_task(wrapper->post_sync_task);
		job->model_name = "acquire_cb_post";
#endif

		_starpu_detect_implicit_data_deps_with_handle(wrapper->pre_sync_task, wrapper->post_sync_task, handle, mode);
		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);

		/* TODO detect if this is superfluous */
		int ret = starpu_task_submit(wrapper->pre_sync_task, NULL);
		STARPU_ASSERT(!ret);
	}
	else {
		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);

		starpu_data_acquire_cb_pre_sync_callback(wrapper);
	}

	return 0;
}
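
This is the asynchronous counterpart of a blocking acquire: the callback runs once the data may safely be accessed from the application and, as the comment above says, starpu_data_release must be called afterwards. A small application-side usage sketch, assuming the handle was registered elsewhere:

static void data_ready_callback(void *arg)
{
	starpu_data_handle handle = (starpu_data_handle) arg;

	/* the application may now read/modify the user-level buffer */

	starpu_data_release(handle);
}

static void acquire_example(starpu_data_handle handle)
{
	/* ask for RW access; data_ready_callback runs once it is granted */
	int ret = starpu_data_acquire_cb(handle, STARPU_RW, data_ready_callback, handle);
	STARPU_ASSERT(!ret);
}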
Example #7
/* Caller just has to promise that the list will not disappear.
 * _starpu_notify_cg_list protects the list itself.
 * No job lock should be held, since we might want to immediately call the callback of an empty task.
 */
void _starpu_notify_cg_list(struct _starpu_cg_list *successors)
{
	unsigned succ;

	_starpu_spin_lock(&successors->lock);
	/* Note: some thread might be concurrently adding other items */
	for (succ = 0; succ < successors->nsuccs; succ++)
	{
		struct _starpu_cg *cg = successors->succ[succ];
		STARPU_ASSERT(cg);
		unsigned cg_type = cg->cg_type;

		if (cg_type == STARPU_CG_APPS)
		{
			/* Remove the temporary ref to the cg */
			memmove(&successors->succ[succ], &successors->succ[succ+1], (successors->nsuccs-(succ+1)) * sizeof(successors->succ[succ]));
			succ--;
			successors->nsuccs--;
		}
		_starpu_spin_unlock(&successors->lock);

		struct _starpu_tag *cgtag = NULL;

		if (cg_type == STARPU_CG_TAG)
		{
			cgtag = cg->succ.tag;
			STARPU_ASSERT(cgtag);
			_starpu_spin_lock(&cgtag->lock);
		}

		_starpu_notify_cg(cg);

		if (cg_type == STARPU_CG_TAG)
			_starpu_spin_unlock(&cgtag->lock);

		_starpu_spin_lock(&successors->lock);
	}
	successors->terminated = 1;
	_starpu_spin_unlock(&successors->lock);
}
Example #8
void starpu_data_set_sequential_consistency_flag(starpu_data_handle handle, unsigned flag)
{
	_starpu_spin_lock(&handle->header_lock);

	unsigned child;
	for (child = 0; child < handle->nchildren; child++)
	{
		/* make sure that the flags are applied to the children as well */
		struct starpu_data_state_t *child_handle = &handle->children[child];
		if (child_handle->nchildren > 0)
			starpu_data_set_sequential_consistency_flag(child_handle, flag);
	}

	PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
	handle->sequential_consistency = flag;
	PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);

	_starpu_spin_unlock(&handle->header_lock);
}
Example #9
File: memalloc.c  Project: alucas/StarPU
static void unlock_all_subtree(starpu_data_handle handle)
{
	if (handle->nchildren == 0)
	{
		/* this is a leaf */	
		_starpu_spin_unlock(&handle->header_lock);
	}
	else {
		/* unlock all the children sub-trees.
		 * Note that this is done in the reverse order of
		 * lock_all_subtree so that we avoid deadlock */
		unsigned i;
		for (i = 0; i < handle->nchildren; i++)
		{
			unsigned child = handle->nchildren - 1 - i;
			unlock_all_subtree(&handle->children[child]);
		}
	}
}
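
This is the release side; the matching acquisition presumably walks the tree in forward order, which is what makes the reverse order above deadlock-safe. A hedged sketch of what that counterpart could look like (the real memalloc.c version may additionally make progress while waiting for the lock):

/* Sketch of the forward-order counterpart assumed by the comment above. */
static void lock_all_subtree(starpu_data_handle handle)
{
	if (handle->nchildren == 0)
	{
		/* this is a leaf */
		_starpu_spin_lock(&handle->header_lock);
	}
	else {
		/* lock all the children sub-trees, in declaration order */
		unsigned child;
		for (child = 0; child < handle->nchildren; child++)
			lock_all_subtree(&handle->children[child]);
	}
}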
Example #10
/* handle->lock should already be taken !  */
starpu_data_request_t _starpu_create_data_request(starpu_data_handle src_handle, uint32_t src_node, starpu_data_handle dst_handle, uint32_t dst_node, uint32_t handling_node, starpu_access_mode mode, unsigned is_prefetch)
{
	starpu_data_request_t r = starpu_data_request_new();

	_starpu_spin_init(&r->lock);

	r->event = _starpu_event_create();
	r->src_handle = src_handle;
	r->dst_handle = dst_handle;
	r->src_node = src_node;
	r->dst_node = dst_node;
	r->mode = mode;

	r->handling_node = handling_node;

	r->completed = 0;
	r->retval = -1;

	r->next_req_count = 0;

	r->callbacks = NULL;

	r->is_a_prefetch_request = is_prefetch;

	/* associate that request with the handle so that further similar
	 * requests will reuse that one  */

	_starpu_spin_lock(&r->lock);

	dst_handle->per_node[dst_node].request = r;

	dst_handle->per_node[dst_node].refcnt++;

	if (mode & STARPU_R)
		src_handle->per_node[src_node].refcnt++;

	r->refcnt = 1;

	_starpu_spin_unlock(&r->lock);

	return r;
}
Example #11
static void _prefetch_data_on_node(void *arg)
{
	struct user_interaction_wrapper *wrapper = arg;
	int ret;

	ret = _starpu_fetch_data_on_node(wrapper->handle, wrapper->node, STARPU_R, wrapper->async, NULL, NULL);
	STARPU_ASSERT(!ret);

	PTHREAD_MUTEX_LOCK(&wrapper->lock);
	wrapper->finished = 1;
	PTHREAD_COND_SIGNAL(&wrapper->cond);
	PTHREAD_MUTEX_UNLOCK(&wrapper->lock);

	if (!wrapper->async)
	{
		_starpu_spin_lock(&wrapper->handle->header_lock);
		_starpu_notify_data_dependencies(wrapper->handle);
		_starpu_spin_unlock(&wrapper->handle->header_lock);
	}

}
struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t handle,
						   enum starpu_node_kind node_kind)
{
	struct starpu_task *conversion_task;

#if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_USE_MIC) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
	struct starpu_multiformat_interface *format_interface;
#endif

	conversion_task = starpu_task_create();
	conversion_task->name = "conversion_task";
	conversion_task->synchronous = 0;
	STARPU_TASK_SET_HANDLE(conversion_task, handle, 0);

#if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_USE_MIC) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
	/* The node does not really matter here */
	format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
#endif

	_starpu_spin_lock(&handle->header_lock);
	handle->refcnt++;
	handle->busy_count++;
	_starpu_spin_unlock(&handle->header_lock);

	switch(node_kind)
	{
	case STARPU_CPU_RAM:
	case STARPU_SCC_RAM:
	case STARPU_SCC_SHM:
		switch (starpu_node_get_kind(handle->mf_node))
		{
		case STARPU_CPU_RAM:
		case STARPU_SCC_RAM:
		case STARPU_SCC_SHM:
			STARPU_ABORT();
#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
		case STARPU_CUDA_RAM:
		{
			struct starpu_multiformat_data_interface_ops *mf_ops;
			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
			conversion_task->cl = mf_ops->cuda_to_cpu_cl;
			break;
		}
#endif
#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
		case STARPU_OPENCL_RAM:
		{
			struct starpu_multiformat_data_interface_ops *mf_ops;
			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
			conversion_task->cl = mf_ops->opencl_to_cpu_cl;
			break;
		}
#endif
#ifdef STARPU_USE_MIC
		case STARPU_MIC_RAM:
		{
			struct starpu_multiformat_data_interface_ops *mf_ops;
			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
			conversion_task->cl = mf_ops->mic_to_cpu_cl;
			break;
		}
#endif
		default:
			_STARPU_ERROR("Oops : %u\n", handle->mf_node);
		}
		break;
#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
	case STARPU_CUDA_RAM:
		{
			struct starpu_multiformat_data_interface_ops *mf_ops;
			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
			conversion_task->cl = mf_ops->cpu_to_cuda_cl;
			break;
		}
#endif
#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
	case STARPU_OPENCL_RAM:
	{
		struct starpu_multiformat_data_interface_ops *mf_ops;
		mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
		conversion_task->cl = mf_ops->cpu_to_opencl_cl;
		break;
	}
#endif
#ifdef STARPU_USE_MIC
	case STARPU_MIC_RAM:
	{
		struct starpu_multiformat_data_interface_ops *mf_ops;
		mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
		conversion_task->cl = mf_ops->cpu_to_mic_cl;
		break;
	}
#endif
	default:
		STARPU_ABORT();
	}

	STARPU_TASK_SET_MODE(conversion_task, STARPU_RW, 0);
	return conversion_task;
}
Example #13
int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle handle, unsigned node, unsigned async, starpu_access_mode mode)
{
	STARPU_ASSERT(handle);

	/* it is forbidden to call this function from a callback or a codelet */
	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
		return -EDEADLK;

	struct user_interaction_wrapper wrapper =
	{
		.handle = handle,
		.node = node,
		.async = async,
		.cond = PTHREAD_COND_INITIALIZER,
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.finished = 0
	};

	if (!_starpu_attempt_to_submit_data_request_from_apps(handle, mode, _prefetch_data_on_node, &wrapper))
	{
		/* we can immediately proceed */
		_starpu_fetch_data_on_node(handle, node, mode, async, NULL, NULL);

		/* remove the "lock"/reference */
		if (!async)
		{
			_starpu_spin_lock(&handle->header_lock);
			_starpu_notify_data_dependencies(handle);
			_starpu_spin_unlock(&handle->header_lock);
		}
	}
	else {
		PTHREAD_MUTEX_LOCK(&wrapper.lock);
		while (!wrapper.finished)
			PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
		PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
	}

	return 0;
}

int starpu_data_prefetch_on_node(starpu_data_handle handle, unsigned node, unsigned async)
{
	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R);
}

/*
 *	It is possible to specify that a piece of data can be discarded without
 *	impacting the application.
 */
void starpu_data_advise_as_important(starpu_data_handle handle, unsigned is_important)
{
	_starpu_spin_lock(&handle->header_lock);

	/* first take all the children lock (in order !) */
	unsigned child;
	for (child = 0; child < handle->nchildren; child++)
	{
		/* make sure the intermediate children are advised as well */
		struct starpu_data_state_t *child_handle = &handle->children[child];
		if (child_handle->nchildren > 0)
			starpu_data_advise_as_important(child_handle, is_important);
	}

	handle->is_not_important = !is_important;

	/* now the parent may be used again so we release the lock */
	_starpu_spin_unlock(&handle->header_lock);

}
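
Both entry points above are public API. A short usage sketch, assuming handle was registered beforehand and that memory node 1 exists (e.g. a GPU node) in the current configuration:

static void prefetch_example(starpu_data_handle handle)
{
	/* mark the data as unimportant: it becomes a better eviction candidate
	 * when a memory node runs short of memory */
	starpu_data_advise_as_important(handle, 0);

	/* asynchronously warm up the replicate of the data on memory node 1 */
	starpu_data_prefetch_on_node(handle, 1, 1);
}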
Example #14
static void starpu_handle_data_request_completion(starpu_data_request_t r)
{
	unsigned do_delete = 0;

	uint32_t src_node = r->src_node;
	uint32_t dst_node = r->dst_node;

	_starpu_update_data_state(r->dst_handle, dst_node, r->mode);

#ifdef STARPU_USE_FXT
	size_t size = _starpu_data_get_size(r->src_handle);
	STARPU_TRACE_END_DRIVER_COPY(src_node, dst_node, size, r->com_id);
#endif

	unsigned chained_req;
	for (chained_req = 0; chained_req < r->next_req_count; chained_req++)
	{
		_starpu_post_data_request(r->next_req[chained_req], r->next_req[chained_req]->handling_node);
	}

	r->completed = 1;
	
	r->dst_handle->per_node[dst_node].refcnt--;

	if (r->mode & STARPU_R)
		r->src_handle->per_node[src_node].refcnt--;

	r->refcnt--;

	/* if nobody is waiting on that request, we can get rid of it */
	if (r->refcnt == 0)
		do_delete = 1;
	
	r->retval = 0;

	/* In case there are one or multiple callbacks, we execute them now. */
	struct callback_list *callbacks = r->callbacks;
	
	_starpu_spin_unlock(&r->lock);

	if (r->src_handle != r->dst_handle) {
		_starpu_spin_unlock(&r->src_handle->header_lock);
		_starpu_spin_unlock(&r->dst_handle->header_lock);
	}
	else
		_starpu_spin_unlock(&r->src_handle->header_lock);

	if (do_delete)
		starpu_data_request_destroy(r);


	/* We do the callbacks once the lock is released so that they can do
	 * blocking operations with the handle (eg. release it) */
	while (callbacks)
	{
		callbacks->callback_func(callbacks->callback_arg);

		struct callback_list *next = callbacks->next;
		free(callbacks);
		callbacks = next;
	}
}
/* This is called when a task is finished with a piece of data
 * (or on starpu_data_release)
 *
 * The header lock must already be taken by the caller.
 * This may free the handle if it was lazily unregistered (1 is returned in
 * that case). The handle pointer thus becomes invalid for the caller.
 */
int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
{
	_starpu_spin_checklocked(&handle->header_lock);
	/* A data access has finished so we remove a reference. */
	STARPU_ASSERT(handle->refcnt > 0);
	handle->refcnt--;
	STARPU_ASSERT(handle->busy_count > 0);
	handle->busy_count--;
	if (_starpu_data_check_not_busy(handle))
		/* Handle was destroyed, nothing left to do.  */
		return 1;

	if (handle->arbiter)
	{
		unsigned refcnt = handle->refcnt;
		STARPU_ASSERT(_starpu_data_requester_list_empty(&handle->req_list));
		STARPU_ASSERT(_starpu_data_requester_list_empty(&handle->reduction_req_list));
		_starpu_spin_unlock(&handle->header_lock);
		/* _starpu_notify_arbitered_dependencies will handle its own locking */
		if (!refcnt)
			_starpu_notify_arbitered_dependencies(handle);
		/* We have already unlocked */
		return 1;
	}
	STARPU_ASSERT(_starpu_data_requester_list_empty(&handle->arbitered_req_list));

	/* In case there is a pending reduction and this is the last
	 * requester, we may go back to a "normal" coherency model. */
	if (handle->reduction_refcnt > 0)
	{
		//fprintf(stderr, "NOTIFY REDUCTION TASK RED REFCNT %d\n", handle->reduction_refcnt);
		handle->reduction_refcnt--;
		if (handle->reduction_refcnt == 0)
			_starpu_data_end_reduction_mode_terminate(handle);
	}

	struct _starpu_data_requester *r;
	while ((r = may_unlock_data_req_list_head(handle)))
	{
		/* STARPU_RW accesses are treated as STARPU_W */
		enum starpu_data_access_mode r_mode = r->mode;
		if (r_mode == STARPU_RW)
			r_mode = STARPU_W;

		int put_in_list = 1;
		if ((handle->reduction_refcnt == 0) && (handle->current_mode == STARPU_REDUX) && (r_mode != STARPU_REDUX))
		{
			_starpu_data_end_reduction_mode(handle);

			/* Since we need to perform a mode change, we freeze
			 * the request if needed. */
			put_in_list = (handle->reduction_refcnt > 0);
		}
		else
		{
			put_in_list = 0;
		}

		if (put_in_list)
		{
			/* We need to put the request back because we must
			 * perform a reduction before. */
			_starpu_data_requester_list_push_front(&handle->req_list, r);
		}
		else
		{
			/* The data is now attributed to that request so we put a
			 * reference on it. */
			handle->refcnt++;
			handle->busy_count++;

			enum starpu_data_access_mode previous_mode = handle->current_mode;
			handle->current_mode = r_mode;

			/* In case we enter reduction mode, we invalidate all per
			 * worker replicates. Note that the "per_node" replicates are
			 * kept intact because we'll reduce a valid copy of the
			 * "per-node replicate" with the per-worker replicates. */
			if ((r_mode == STARPU_REDUX) && (previous_mode != STARPU_REDUX))
				_starpu_data_start_reduction_mode(handle);

			_starpu_spin_unlock(&handle->header_lock);

			if (r->is_requested_by_codelet)
			{
				if (!unlock_one_requester(r))
					_starpu_push_task(r->j);
			}
			else
			{
				STARPU_ASSERT(r->ready_data_callback);

				/* execute the callback associated with the data requester */
				r->ready_data_callback(r->argcb);
			}

			_starpu_data_requester_delete(r);

			_starpu_spin_lock(&handle->header_lock);
			STARPU_ASSERT(handle->busy_count > 0);
			handle->busy_count--;
			if (_starpu_data_check_not_busy(handle))
				return 1;
		}
	}

	return 0;
}
/* No lock is held, this acquires and releases the handle header lock */
static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_codelet,
						       starpu_data_handle_t handle, enum starpu_data_access_mode mode,
						       void (*callback)(void *), void *argcb,
						       struct _starpu_job *j, unsigned buffer_index)
{
	if (handle->arbiter)
		return _starpu_attempt_to_submit_arbitered_data_request(request_from_codelet, handle, mode, callback, argcb, j, buffer_index);

	if (mode == STARPU_RW)
		mode = STARPU_W;

	/* Take the lock protecting the header. We try to do some progression
	 * in case this is called from a worker, otherwise we just wait for the
	 * lock to be available. */
	if (request_from_codelet)
	{
		int cpt = 0;
		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
		{
			cpt++;
			_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
		}
		if (cpt == STARPU_SPIN_MAXTRY)
			_starpu_spin_lock(&handle->header_lock);
	}
	else
	{
		_starpu_spin_lock(&handle->header_lock);
	}

	/* If we have a request that is not part of the reduction while a
	 * reduction is pending, we put it at the end of the normal list;
	 * reduction requests use the reduction_req_list instead */
	unsigned pending_reduction = (handle->reduction_refcnt > 0);
	unsigned frozen = 0;

	/* If we are currently performing a reduction, we freeze any request
	 * that is not explicitly a reduction task. */
	unsigned is_a_reduction_task = (request_from_codelet && j->reduction_task);

	if (pending_reduction && !is_a_reduction_task)
		frozen = 1;

	/* If there is currently nobody accessing the piece of data, or if this
	 * is not another writer and it is the same type of access as the
	 * current one, we can proceed. */
	unsigned put_in_list = 1;

	enum starpu_data_access_mode previous_mode = handle->current_mode;

	if (!frozen && ((handle->refcnt == 0) || (!(mode == STARPU_W) && (handle->current_mode == mode))))
	{
		/* Detect whether this is the end of a reduction phase */
			/* We don't want to start multiple reductions of the
			 * same handle at the same time ! */

		if ((handle->reduction_refcnt == 0) && (previous_mode == STARPU_REDUX) && (mode != STARPU_REDUX))
		{
			_starpu_data_end_reduction_mode(handle);

			/* Since we need to perform a mode change, we freeze
			 * the request if needed. */
			put_in_list = (handle->reduction_refcnt > 0);
		}
		else
		{
			put_in_list = 0;
		}
	}

	if (put_in_list)
	{
		/* there cannot be multiple writers or a new writer
		 * while the data is in read mode */

		handle->busy_count++;
		/* enqueue the request */
		struct _starpu_data_requester *r = _starpu_data_requester_new();
		r->mode = mode;
		r->is_requested_by_codelet = request_from_codelet;
		r->j = j;
		r->buffer_index = buffer_index;
		r->ready_data_callback = callback;
		r->argcb = argcb;

		/* We put the requester in a specific list if this is a reduction task */
		struct _starpu_data_requester_list *req_list =
			is_a_reduction_task?&handle->reduction_req_list:&handle->req_list;

		_starpu_data_requester_list_push_back(req_list, r);

		/* failed */
		put_in_list = 1;
	}
	else
	{
		handle->refcnt++;
		handle->busy_count++;

		/* Do not write to handle->current_mode if it is already
		 * R. This avoids a spurious warning from helgrind when
		 * the following happens:
		 * acquire(R) in thread A
		 * acquire(R) in thread B
		 * release_data_on_node() in thread A
		 * helgrind would shout that the latter reads current_mode
		 * unsafely.
		 *
		 * This actually basically explains to helgrind that it is a
		 * shared R acquisition.
		 */
		if (mode != STARPU_R || handle->current_mode != mode)
			handle->current_mode = mode;

		if ((mode == STARPU_REDUX) && (previous_mode != STARPU_REDUX))
			_starpu_data_start_reduction_mode(handle);

		/* success */
		put_in_list = 0;
	}

	_starpu_spin_unlock(&handle->header_lock);
	return put_in_list;

}
Example #17
void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gathering_node)
{
	unsigned child;
	unsigned worker;
	unsigned nworkers = starpu_worker_get_count();
	unsigned node;
	unsigned sizes[root_handle->nchildren];

	_STARPU_TRACE_START_UNPARTITION(root_handle, gathering_node);
	_starpu_spin_lock(&root_handle->header_lock);

	STARPU_ASSERT_MSG(root_handle->nchildren != 0, "data %p is not partitioned, can not unpartition it", root_handle);

	/* first take all the children lock (in order !) */
	for (child = 0; child < root_handle->nchildren; child++)
	{
		starpu_data_handle_t child_handle = starpu_data_get_child(root_handle, child);

		/* make sure the intermediate children are unpartitioned as well */
		if (child_handle->nchildren > 0)
			starpu_data_unpartition(child_handle, gathering_node);

		/* If this is a multiformat handle, we must convert the data now */
#ifdef STARPU_DEVEL
#warning TODO: _starpu_fetch_data_on_node should be doing it
#endif
		if (_starpu_data_is_multiformat_handle(child_handle) &&
			starpu_node_get_kind(child_handle->mf_node) != STARPU_CPU_RAM)
		{
			struct starpu_codelet cl =
			{
				.where = STARPU_CPU,
				.cpu_funcs = { _starpu_empty_codelet_function },
				.modes = { STARPU_RW },
				.nbuffers = 1
			};
			struct starpu_task *task = starpu_task_create();
			task->name = "convert_data";

			STARPU_TASK_SET_HANDLE(task, child_handle, 0);
			task->cl = &cl;
			task->synchronous = 1;
			if (_starpu_task_submit_internally(task) != 0)
				_STARPU_ERROR("Could not submit the conversion task while unpartitionning\n");
		}

		int ret;
		/* for now we pretend that the RAM is almost unlimited and that gathering
		 * data should be possible from the node that does the unpartitioning ... we
		 * don't want to have the programmer deal with memory shortage at that time,
		 * really */
		/* Acquire the child data on the gathering node. This will trigger collapsing any reduction */
		ret = starpu_data_acquire_on_node(child_handle, gathering_node, STARPU_RW);
		STARPU_ASSERT(ret == 0);
		starpu_data_release_on_node(child_handle, gathering_node);

		_starpu_spin_lock(&child_handle->header_lock);
		child_handle->busy_waiting = 1;
		_starpu_spin_unlock(&child_handle->header_lock);

		/* Wait for all requests to finish (notably WT requests) */
		STARPU_PTHREAD_MUTEX_LOCK(&child_handle->busy_mutex);
		while (1)
		{
			/* Here helgrind would shout that this is an unprotected access,
			 * but this is actually fine: all threads who do busy_count--
			 * are supposed to call _starpu_data_check_not_busy, which will
			 * wake us up through the busy_mutex/busy_cond. */
			if (!child_handle->busy_count)
				break;
			/* This is woken by _starpu_data_check_not_busy, always called
			 * after decrementing busy_count */
			STARPU_PTHREAD_COND_WAIT(&child_handle->busy_cond, &child_handle->busy_mutex);
		}
		STARPU_PTHREAD_MUTEX_UNLOCK(&child_handle->busy_mutex);

		_starpu_spin_lock(&child_handle->header_lock);

		sizes[child] = _starpu_data_get_size(child_handle);

		_starpu_data_unregister_ram_pointer(child_handle);

		for (worker = 0; worker < nworkers; worker++)
		{
			struct _starpu_data_replicate *local = &child_handle->per_worker[worker];
			STARPU_ASSERT(local->state == STARPU_INVALID);
			if (local->allocated && local->automatically_allocated)
				_starpu_request_mem_chunk_removal(child_handle, local, starpu_worker_get_memory_node(worker), sizes[child]);
		}

		_starpu_memory_stats_free(child_handle);
	}
Example #18
static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_data_handle_t *childrenp, unsigned nparts, struct starpu_data_filter *f, int inherit_state)
{
	unsigned i;
	unsigned node;

	/* first take care to properly lock the data header */
	_starpu_spin_lock(&initial_handle->header_lock);

	initial_handle->nplans++;

	STARPU_ASSERT_MSG(nparts > 0, "Partitioning data %p in 0 piece does not make sense", initial_handle);

	/* allocate the children */
	if (inherit_state)
	{
		initial_handle->children = (struct _starpu_data_state *) calloc(nparts, sizeof(struct _starpu_data_state));
		STARPU_ASSERT(initial_handle->children);

		/* this handle now has children */
		initial_handle->nchildren = nparts;
	}

	unsigned nworkers = starpu_worker_get_count();

	for (node = 0; node < STARPU_MAXNODES; node++)
	{
		if (initial_handle->per_node[node].state != STARPU_INVALID)
			break;
	}
	if (node == STARPU_MAXNODES)
	{
		/* This is lazy allocation, allocate it now in main RAM, so as
		 * to have somewhere to gather pieces later */
		/* FIXME: mark as unevictable! */
		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[STARPU_MAIN_RAM], 0);
#ifdef STARPU_DEVEL
#warning we should reclaim memory if allocation failed
#endif
		STARPU_ASSERT(!ret);
	}

	for (i = 0; i < nparts; i++)
	{
		starpu_data_handle_t child;

		if (inherit_state)
			child = &initial_handle->children[i];
		else
			child = childrenp[i];
		STARPU_ASSERT(child);

		struct starpu_data_interface_ops *ops;

		/* each child may have its own interface type */
		/* what's this child's interface ? */
		if (f->get_child_ops)
			ops = f->get_child_ops(f, i);
		else
			ops = initial_handle->ops;

		_starpu_data_handle_init(child, ops, initial_handle->mf_node);

		child->nchildren = 0;
		child->nplans = 0;
		child->switch_cl = NULL;
		child->partitioned = 0;
		child->readonly = 0;
		child->mpi_data = initial_handle->mpi_data;
		child->root_handle = initial_handle->root_handle;
		child->father_handle = initial_handle;
		child->sibling_index = i;
		child->depth = initial_handle->depth + 1;

		child->is_not_important = initial_handle->is_not_important;
		child->wt_mask = initial_handle->wt_mask;
		child->home_node = initial_handle->home_node;
		child->is_readonly = initial_handle->is_readonly;

		/* initialize the chunk lock */
		_starpu_data_requester_list_init(&child->req_list);
		_starpu_data_requester_list_init(&child->reduction_req_list);
		child->reduction_tmp_handles = NULL;
		child->write_invalidation_req = NULL;
		child->refcnt = 0;
		child->unlocking_reqs = 0;
		child->busy_count = 0;
		child->busy_waiting = 0;
		STARPU_PTHREAD_MUTEX_INIT(&child->busy_mutex, NULL);
		STARPU_PTHREAD_COND_INIT(&child->busy_cond, NULL);
		child->reduction_refcnt = 0;
		_starpu_spin_init(&child->header_lock);

		child->sequential_consistency = initial_handle->sequential_consistency;

		STARPU_PTHREAD_MUTEX_INIT(&child->sequential_consistency_mutex, NULL);
		child->last_submitted_mode = STARPU_R;
		child->last_sync_task = NULL;
		child->last_submitted_accessors.task = NULL;
		child->last_submitted_accessors.next = &child->last_submitted_accessors;
		child->last_submitted_accessors.prev = &child->last_submitted_accessors;
		child->post_sync_tasks = NULL;
		/* Tell helgrind that the race in _starpu_unlock_post_sync_tasks is fine */
		STARPU_HG_DISABLE_CHECKING(child->post_sync_tasks_cnt);
		child->post_sync_tasks_cnt = 0;

		/* The methods used for reduction are propagated to the
		 * children. */
		child->redux_cl = initial_handle->redux_cl;
		child->init_cl = initial_handle->init_cl;

#ifdef STARPU_USE_FXT
		child->last_submitted_ghost_sync_id_is_valid = 0;
		child->last_submitted_ghost_sync_id = 0;
		child->last_submitted_ghost_accessors_id = NULL;
#endif

		if (_starpu_global_arbiter)
			/* Just for testing purpose */
			starpu_data_assign_arbiter(child, _starpu_global_arbiter);
		else
			child->arbiter = NULL;
		_starpu_data_requester_list_init(&child->arbitered_req_list);

		for (node = 0; node < STARPU_MAXNODES; node++)
		{
			struct _starpu_data_replicate *initial_replicate;
			struct _starpu_data_replicate *child_replicate;

			initial_replicate = &initial_handle->per_node[node];
			child_replicate = &child->per_node[node];

			if (inherit_state)
				child_replicate->state = initial_replicate->state;
			else
				child_replicate->state = STARPU_INVALID;
			if (inherit_state || !initial_replicate->automatically_allocated)
				child_replicate->allocated = initial_replicate->allocated;
			else
				child_replicate->allocated = 0;
			/* Do not allow memory reclaiming within the child for parent bits */
			child_replicate->automatically_allocated = 0;
			child_replicate->refcnt = 0;
			child_replicate->memory_node = node;
			child_replicate->relaxed_coherency = 0;
			if (inherit_state)
				child_replicate->initialized = initial_replicate->initialized;
			else
				child_replicate->initialized = 0;

			/* update the interface */
			void *initial_interface = starpu_data_get_interface_on_node(initial_handle, node);
			void *child_interface = starpu_data_get_interface_on_node(child, node);

			STARPU_ASSERT_MSG(!(!inherit_state && child_replicate->automatically_allocated && child_replicate->allocated), "partition planning is currently not supported when handle has some automatically allocated buffers");
			f->filter_func(initial_interface, child_interface, f, i, nparts);
		}

		unsigned worker;
		for (worker = 0; worker < nworkers; worker++)
		{
			struct _starpu_data_replicate *child_replicate;
			child_replicate = &child->per_worker[worker];

			child_replicate->state = STARPU_INVALID;
			child_replicate->allocated = 0;
			child_replicate->automatically_allocated = 0;
			child_replicate->refcnt = 0;
			child_replicate->memory_node = starpu_worker_get_memory_node(worker);
			child_replicate->requested = 0;

			for (node = 0; node < STARPU_MAXNODES; node++)
			{
				child_replicate->request[node] = NULL;
			}

			child_replicate->relaxed_coherency = 1;
			child_replicate->initialized = 0;

			/* duplicate  the content of the interface on node 0 */
			memcpy(child_replicate->data_interface, child->per_node[0].data_interface, child->ops->interface_size);
		}

		/* We compute the size and the footprint of the child once and
		 * store it in the handle */
		child->footprint = _starpu_compute_data_footprint(child);

		void *ptr;
		ptr = starpu_data_handle_to_pointer(child, STARPU_MAIN_RAM);
		if (ptr != NULL)
			_starpu_data_register_ram_pointer(child, ptr);
	}
	/* now release the header lock */
	_starpu_spin_unlock(&initial_handle->header_lock);
}
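
The internal function above backs the public partitioning interface. A minimal usage sketch with the public API of recent StarPU releases, assuming a registered vector handle; NPARTS and vector_handle are placeholders:

/* Split a registered vector handle into NPARTS contiguous children, then
 * gather it back into main memory. */
static void partition_example(starpu_data_handle_t vector_handle)
{
	struct starpu_data_filter f =
	{
		.filter_func = starpu_vector_filter_block,
		.nchildren = NPARTS
	};

	starpu_data_partition(vector_handle, &f);

	/* children are then accessed with starpu_data_get_sub_data(vector_handle, 1, i) */

	starpu_data_unpartition(vector_handle, STARPU_MAIN_RAM);
}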