static void _handle_pending_node_data_requests(uint32_t src_node, unsigned force)
{
//	_STARPU_DEBUG("_starpu_handle_pending_node_data_requests ...\n");

	PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);

	/* for all entries of the list */
	starpu_data_request_list_t local_list = data_requests_pending[src_node];
	data_requests_pending[src_node] = starpu_data_request_list_new();

	PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);

	while (!starpu_data_request_list_empty(local_list))
	{
		starpu_data_request_t r;
		r = starpu_data_request_list_pop_back(local_list);

		if (r->src_handle != r->dst_handle)
		{
			_starpu_spin_lock(&r->src_handle->header_lock);
			_starpu_spin_lock(&r->dst_handle->header_lock);
		}
		else
			_starpu_spin_lock(&r->src_handle->header_lock);

		_starpu_spin_lock(&r->lock);

		/* wait until the transfer is terminated */
		if (force)
		{
			_starpu_driver_wait_request_completion(r, src_node);
			starpu_handle_data_request_completion(r);
		}
		else
		{
			if (_starpu_driver_test_request_completion(r, src_node))
			{
				starpu_handle_data_request_completion(r);
			}
			else
			{
				_starpu_spin_unlock(&r->lock);
				if (r->src_handle != r->dst_handle)
				{
					_starpu_spin_unlock(&r->src_handle->header_lock);
					_starpu_spin_unlock(&r->dst_handle->header_lock);
				}
				else
					_starpu_spin_unlock(&r->src_handle->header_lock);

				/* the transfer is not terminated yet, put the request back in the pending list */
				PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
				starpu_data_request_list_push_front(data_requests_pending[src_node], r);
				PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
			}
		}
	}

	starpu_data_request_list_delete(local_list);
}
/* Returns whether the completion was already terminated, and caller should
 * thus immediately proceed. */
int _starpu_add_successor_to_cg_list(struct _starpu_cg_list *successors, struct _starpu_cg *cg)
{
	int ret;
	STARPU_ASSERT(cg);

	_starpu_spin_lock(&successors->lock);
	ret = successors->terminated;

	/* where should that cg be put in the array ? */
	unsigned index = successors->nsuccs++;

#ifdef STARPU_DYNAMIC_DEPS_SIZE
	if (index >= successors->succ_list_size)
	{
		/* the successor list is too small */
		if (successors->succ_list_size > 0)
			successors->succ_list_size *= 2;
		else
			successors->succ_list_size = 4;

		successors->succ = (struct _starpu_cg **) realloc(successors->succ,
			successors->succ_list_size*sizeof(struct _starpu_cg *));
	}
#else
	STARPU_ASSERT(index < STARPU_NMAXDEPS);
#endif
	successors->succ[index] = cg;
	_starpu_spin_unlock(&successors->lock);

	return ret;
}
static void _starpu_tag_free(void *_tag)
{
	struct _starpu_tag *tag = (struct _starpu_tag *) _tag;

	if (tag)
	{
		_starpu_spin_lock(&tag->lock);

		unsigned nsuccs = tag->tag_successors.nsuccs;
		unsigned succ;

		for (succ = 0; succ < nsuccs; succ++)
		{
			struct _starpu_cg *cg = tag->tag_successors.succ[succ];

			unsigned ntags = STARPU_ATOMIC_ADD(&cg->ntags, -1);
			unsigned remaining STARPU_ATTRIBUTE_UNUSED = STARPU_ATOMIC_ADD(&cg->remaining, -1);

			if (!ntags && (cg->cg_type == STARPU_CG_TAG))
				/* Last tag this cg depends on, cg becomes unreferenced */
				free(cg);
		}

#ifdef STARPU_DYNAMIC_DEPS_SIZE
		free(tag->tag_successors.succ);
#endif

		_starpu_spin_unlock(&tag->lock);
		_starpu_spin_destroy(&tag->lock);

		free(tag);
	}
}
/* TODO: accounting to see how much time was spent working for other people ... */
static int starpu_handle_data_request(starpu_data_request_t r, unsigned may_alloc)
{
	if (r->src_handle != r->dst_handle)
	{
		_starpu_spin_lock(&r->src_handle->header_lock);
		_starpu_spin_lock(&r->dst_handle->header_lock);
	}
	else
		_starpu_spin_lock(&r->src_handle->header_lock);

	_starpu_spin_lock(&r->lock);

	if (r->mode & STARPU_R)
	{
		STARPU_ASSERT(r->src_handle->per_node[r->src_node].allocated);
		STARPU_ASSERT(r->src_handle->per_node[r->src_node].refcnt);
	}

	/* perform the transfer */
	/* the header of the data must be locked by the worker that submitted the request */
	r->retval = _starpu_driver_copy_data_1_to_1(r->src_handle, r->src_node,
						    r->dst_handle, r->dst_node,
						    !(r->mode & STARPU_R), r, may_alloc);

	if (r->retval == ENOMEM)
	{
		_starpu_spin_unlock(&r->lock);
		if (r->src_handle != r->dst_handle)
		{
			_starpu_spin_unlock(&r->src_handle->header_lock);
			_starpu_spin_unlock(&r->dst_handle->header_lock);
		}
		else
			_starpu_spin_unlock(&r->src_handle->header_lock);

		return ENOMEM;
	}

	if (r->retval == EAGAIN)
	{
		_starpu_spin_unlock(&r->lock);
		if (r->src_handle != r->dst_handle)
		{
			_starpu_spin_unlock(&r->src_handle->header_lock);
			_starpu_spin_unlock(&r->dst_handle->header_lock);
		}
		else
			_starpu_spin_unlock(&r->src_handle->header_lock);

		/* the request is pending and we put it in the corresponding queue */
		PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node]);
		starpu_data_request_list_push_front(data_requests_pending[r->handling_node], r);
		PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[r->handling_node]);

		return EAGAIN;
	}

	/* the request has been handled */
	starpu_handle_data_request_completion(r);

	return 0;
}
int _starpu_wait_data_request_completion(starpu_data_request_t r, unsigned may_alloc)
{
	int retval;
	int do_delete = 0;

	uint32_t local_node = _starpu_get_local_memory_node();

	do
	{
		_starpu_spin_lock(&r->lock);

		if (r->completed)
			break;

		_starpu_spin_unlock(&r->lock);

#ifndef STARPU_NON_BLOCKING_DRIVERS
		_starpu_wake_all_blocked_workers_on_node(r->handling_node);
#endif

		_starpu_datawizard_progress(local_node, may_alloc);
	} while (1);

	retval = r->retval;
	if (retval)
		_STARPU_DISP("REQUEST %p COMPLETED (retval %d) !\n", r, r->retval);

	r->refcnt--;

	/* if nobody is waiting on that request, we can get rid of it */
	if (r->refcnt == 0)
		do_delete = 1;

	_starpu_spin_unlock(&r->lock);

	if (do_delete)
		starpu_data_request_destroy(r);

	return retval;
}
/* The data must be released by calling starpu_data_release later on */
int starpu_data_acquire_cb(starpu_data_handle handle,
			   starpu_access_mode mode, void (*callback)(void *), void *arg)
{
	STARPU_ASSERT(handle);

	struct user_interaction_wrapper *wrapper = malloc(sizeof(struct user_interaction_wrapper));
	STARPU_ASSERT(wrapper);

	wrapper->handle = handle;
	wrapper->mode = mode;
	wrapper->callback = callback;
	wrapper->callback_arg = arg;
	PTHREAD_COND_INIT(&wrapper->cond, NULL);
	PTHREAD_MUTEX_INIT(&wrapper->lock, NULL);
	wrapper->finished = 0;

	//TODO: instead of having the is_prefetch argument, _starpu_fetch_data should consider two flags: async and detached
	_starpu_spin_lock(&handle->header_lock);
	handle->per_node[0].refcnt++;
	_starpu_spin_unlock(&handle->header_lock);

	PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
	int sequential_consistency = handle->sequential_consistency;
	if (sequential_consistency)
	{
		wrapper->pre_sync_task = starpu_task_create();
		wrapper->pre_sync_task->callback_func = starpu_data_acquire_cb_pre_sync_callback;
		wrapper->pre_sync_task->callback_arg = wrapper;

		wrapper->post_sync_task = starpu_task_create();

#ifdef STARPU_USE_FXT
		starpu_job_t job = _starpu_get_job_associated_to_task(wrapper->pre_sync_task);
		job->model_name = "acquire_cb_pre";
		job = _starpu_get_job_associated_to_task(wrapper->post_sync_task);
		job->model_name = "acquire_cb_post";
#endif

		_starpu_detect_implicit_data_deps_with_handle(wrapper->pre_sync_task, wrapper->post_sync_task, handle, mode);
		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);

		/* TODO detect if this is superfluous */
		int ret = starpu_task_submit(wrapper->pre_sync_task, NULL);
		STARPU_ASSERT(!ret);
	}
	else
	{
		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);

		starpu_data_acquire_cb_pre_sync_callback(wrapper);
	}

	return 0;
}
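/*
 * Illustrative usage sketch (not part of the original source): how an
 * application would typically pair starpu_data_acquire_cb with
 * starpu_data_release, as required by the comment above. The vector
 * registration and the callback below are assumptions for the example.
 */
#if 0 /* example only */
#include <starpu.h>

static float v[16];
static starpu_data_handle v_handle;

/* Runs once the acquisition is granted: the buffer is then valid in main
 * memory and may be accessed directly until it is released. */
static void peek_cb(void *arg)
{
	starpu_data_handle handle = arg;
	v[0] = 42.0f;
	/* Hand the data back to StarPU, as required by the contract above. */
	starpu_data_release(handle);
}

static void acquire_example(void)
{
	/* Register a 16-element vector living in main memory (node 0). */
	starpu_vector_data_register(&v_handle, 0, (uintptr_t)v, 16, sizeof(v[0]));

	/* Non-blocking: returns immediately, peek_cb runs when the data is ready. */
	starpu_data_acquire_cb(v_handle, STARPU_RW, peek_cb, v_handle);
}
#endif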
/* Caller just has to promise that the list will not disappear.
 * _starpu_notify_cg_list protects the list itself.
 * No job lock should be held, since we might want to immediately call the callback of an empty task. */
void _starpu_notify_cg_list(struct _starpu_cg_list *successors)
{
	unsigned succ;

	_starpu_spin_lock(&successors->lock);
	/* Note: some thread might be concurrently adding other items */
	for (succ = 0; succ < successors->nsuccs; succ++)
	{
		struct _starpu_cg *cg = successors->succ[succ];
		STARPU_ASSERT(cg);

		unsigned cg_type = cg->cg_type;

		if (cg_type == STARPU_CG_APPS)
		{
			/* Remove the temporary ref to the cg */
			memmove(&successors->succ[succ], &successors->succ[succ+1],
				(successors->nsuccs-(succ+1)) * sizeof(successors->succ[succ]));
			succ--;
			successors->nsuccs--;
		}
		_starpu_spin_unlock(&successors->lock);

		struct _starpu_tag *cgtag = NULL;

		if (cg_type == STARPU_CG_TAG)
		{
			cgtag = cg->succ.tag;
			STARPU_ASSERT(cgtag);
			_starpu_spin_lock(&cgtag->lock);
		}

		_starpu_notify_cg(cg);

		if (cg_type == STARPU_CG_TAG)
			_starpu_spin_unlock(&cgtag->lock);

		_starpu_spin_lock(&successors->lock);
	}

	successors->terminated = 1;

	_starpu_spin_unlock(&successors->lock);
}
void starpu_data_set_sequential_consistency_flag(starpu_data_handle handle, unsigned flag)
{
	_starpu_spin_lock(&handle->header_lock);

	unsigned child;
	for (child = 0; child < handle->nchildren; child++)
	{
		/* make sure that the flags are applied to the children as well */
		struct starpu_data_state_t *child_handle = &handle->children[child];
		if (child_handle->nchildren > 0)
			starpu_data_set_sequential_consistency_flag(child_handle, flag);
	}

	PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
	handle->sequential_consistency = flag;
	PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);

	_starpu_spin_unlock(&handle->header_lock);
}
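/*
 * Illustrative sketch (not part of the original source): an application
 * would typically clear the flag right after registering a handle, before
 * submitting tasks that access it. The helper name is an assumption.
 */
#if 0 /* example only */
static void sequential_consistency_example(starpu_data_handle handle)
{
	/* Disable implicit dependencies for this handle only; tasks accessing
	 * it must then be ordered explicitly (tags, task dependencies). */
	starpu_data_set_sequential_consistency_flag(handle, 0);
}
#endif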
static void unlock_all_subtree(starpu_data_handle handle)
{
	if (handle->nchildren == 0)
	{
		/* this is a leaf */
		_starpu_spin_unlock(&handle->header_lock);
	}
	else
	{
		/* unlock all sub-subtrees children
		 * Note that this is done in the reverse order of the
		 * lock_all_subtree so that we avoid deadlock */
		unsigned i;
		for (i = 0; i < handle->nchildren; i++)
		{
			unsigned child = handle->nchildren - 1 - i;
			unlock_all_subtree(&handle->children[child]);
		}
	}
}
/* handle->lock should already be taken ! */
starpu_data_request_t _starpu_create_data_request(starpu_data_handle src_handle, uint32_t src_node,
						  starpu_data_handle dst_handle, uint32_t dst_node,
						  uint32_t handling_node, starpu_access_mode mode,
						  unsigned is_prefetch)
{
	starpu_data_request_t r = starpu_data_request_new();

	_starpu_spin_init(&r->lock);

	r->event = _starpu_event_create();

	r->src_handle = src_handle;
	r->dst_handle = dst_handle;

	r->src_node = src_node;
	r->dst_node = dst_node;
	r->mode = mode;
	r->handling_node = handling_node;

	r->completed = 0;
	r->retval = -1;

	r->next_req_count = 0;
	r->callbacks = NULL;

	r->is_a_prefetch_request = is_prefetch;

	/* associate that request with the handle so that further similar
	 * requests will reuse that one */
	_starpu_spin_lock(&r->lock);

	dst_handle->per_node[dst_node].request = r;
	dst_handle->per_node[dst_node].refcnt++;

	if (mode & STARPU_R)
		src_handle->per_node[src_node].refcnt++;

	r->refcnt = 1;

	_starpu_spin_unlock(&r->lock);

	return r;
}
static void _prefetch_data_on_node(void *arg)
{
	struct user_interaction_wrapper *wrapper = arg;
	int ret;

	ret = _starpu_fetch_data_on_node(wrapper->handle, wrapper->node, STARPU_R, wrapper->async, NULL, NULL);
	STARPU_ASSERT(!ret);

	PTHREAD_MUTEX_LOCK(&wrapper->lock);
	wrapper->finished = 1;
	PTHREAD_COND_SIGNAL(&wrapper->cond);
	PTHREAD_MUTEX_UNLOCK(&wrapper->lock);

	if (!wrapper->async)
	{
		_starpu_spin_lock(&wrapper->handle->header_lock);
		_starpu_notify_data_dependencies(wrapper->handle);
		_starpu_spin_unlock(&wrapper->handle->header_lock);
	}
}
struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t handle,
							    enum starpu_node_kind node_kind)
{
	struct starpu_task *conversion_task;

#if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_USE_MIC) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
	struct starpu_multiformat_interface *format_interface;
#endif

	conversion_task = starpu_task_create();
	conversion_task->name = "conversion_task";
	conversion_task->synchronous = 0;
	STARPU_TASK_SET_HANDLE(conversion_task, handle, 0);

#if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_USE_MIC) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
	/* The node does not really matter here */
	format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
#endif

	_starpu_spin_lock(&handle->header_lock);
	handle->refcnt++;
	handle->busy_count++;
	_starpu_spin_unlock(&handle->header_lock);

	switch(node_kind)
	{
	case STARPU_CPU_RAM:
	case STARPU_SCC_RAM:
	case STARPU_SCC_SHM:
		switch (starpu_node_get_kind(handle->mf_node))
		{
		case STARPU_CPU_RAM:
		case STARPU_SCC_RAM:
		case STARPU_SCC_SHM:
			STARPU_ABORT();
#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
		case STARPU_CUDA_RAM:
		{
			struct starpu_multiformat_data_interface_ops *mf_ops;
			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
			conversion_task->cl = mf_ops->cuda_to_cpu_cl;
			break;
		}
#endif
#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
		case STARPU_OPENCL_RAM:
		{
			struct starpu_multiformat_data_interface_ops *mf_ops;
			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
			conversion_task->cl = mf_ops->opencl_to_cpu_cl;
			break;
		}
#endif
#ifdef STARPU_USE_MIC
		case STARPU_MIC_RAM:
		{
			struct starpu_multiformat_data_interface_ops *mf_ops;
			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
			conversion_task->cl = mf_ops->mic_to_cpu_cl;
			break;
		}
#endif
		default:
			_STARPU_ERROR("Oops : %u\n", handle->mf_node);
		}
		break;
#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
	case STARPU_CUDA_RAM:
	{
		struct starpu_multiformat_data_interface_ops *mf_ops;
		mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
		conversion_task->cl = mf_ops->cpu_to_cuda_cl;
		break;
	}
#endif
#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
	case STARPU_OPENCL_RAM:
	{
		struct starpu_multiformat_data_interface_ops *mf_ops;
		mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
		conversion_task->cl = mf_ops->cpu_to_opencl_cl;
		break;
	}
#endif
#ifdef STARPU_USE_MIC
	case STARPU_MIC_RAM:
	{
		struct starpu_multiformat_data_interface_ops *mf_ops;
		mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
		conversion_task->cl = mf_ops->cpu_to_mic_cl;
		break;
	}
#endif
	default:
		STARPU_ABORT();
	}

	STARPU_TASK_SET_MODE(conversion_task, STARPU_RW, 0);

	return conversion_task;
}
int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle handle, unsigned node, unsigned async, starpu_access_mode mode)
{
	STARPU_ASSERT(handle);

	/* it is forbidden to call this function from a callback or a codelet */
	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
		return -EDEADLK;

	struct user_interaction_wrapper wrapper =
	{
		.handle = handle,
		.node = node,
		.async = async,
		.cond = PTHREAD_COND_INITIALIZER,
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.finished = 0
	};

	if (!_starpu_attempt_to_submit_data_request_from_apps(handle, mode, _prefetch_data_on_node, &wrapper))
	{
		/* we can immediately proceed */
		_starpu_fetch_data_on_node(handle, node, mode, async, NULL, NULL);

		/* remove the "lock"/reference */
		if (!async)
		{
			_starpu_spin_lock(&handle->header_lock);
			_starpu_notify_data_dependencies(handle);
			_starpu_spin_unlock(&handle->header_lock);
		}
	}
	else
	{
		PTHREAD_MUTEX_LOCK(&wrapper.lock);
		while (!wrapper.finished)
			PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
		PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
	}

	return 0;
}

int starpu_data_prefetch_on_node(starpu_data_handle handle, unsigned node, unsigned async)
{
	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R);
}

/*
 * It is possible to specify that a piece of data can be discarded without
 * impacting the application.
 */
void starpu_data_advise_as_important(starpu_data_handle handle, unsigned is_important)
{
	_starpu_spin_lock(&handle->header_lock);

	/* first take all the children lock (in order !) */
	unsigned child;
	for (child = 0; child < handle->nchildren; child++)
	{
		/* make sure the intermediate children are advised as well */
		struct starpu_data_state_t *child_handle = &handle->children[child];
		if (child_handle->nchildren > 0)
			starpu_data_advise_as_important(child_handle, is_important);
	}

	handle->is_not_important = !is_important;

	/* now the parent may be used again so we release the lock */
	_starpu_spin_unlock(&handle->header_lock);
}
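/*
 * Illustrative sketch (not part of the original source): asynchronously
 * prefetching a handle to the memory node of a given worker, so that a task
 * later scheduled there finds the transfer already in flight. The helper
 * name and its parameters are assumptions for the example.
 */
#if 0 /* example only */
static void prefetch_example(starpu_data_handle handle, unsigned workerid)
{
	/* Translate the worker into its memory node (e.g. a GPU's memory). */
	unsigned node = starpu_worker_get_memory_node(workerid);

	/* async = 1: post the transfer and return immediately. */
	starpu_data_prefetch_on_node(handle, node, 1);

	/* Mark the data as unimportant: as documented above, it may then be
	 * discarded without impacting the application. */
	starpu_data_advise_as_important(handle, 0);
}
#endif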
static void starpu_handle_data_request_completion(starpu_data_request_t r)
{
	unsigned do_delete = 0;

	uint32_t src_node = r->src_node;
	uint32_t dst_node = r->dst_node;

	_starpu_update_data_state(r->dst_handle, dst_node, r->mode);

#ifdef STARPU_USE_FXT
	size_t size = _starpu_data_get_size(r->src_handle);
	STARPU_TRACE_END_DRIVER_COPY(src_node, dst_node, size, r->com_id);
#endif

	unsigned chained_req;
	for (chained_req = 0; chained_req < r->next_req_count; chained_req++)
	{
		_starpu_post_data_request(r->next_req[chained_req], r->next_req[chained_req]->handling_node);
	}

	r->completed = 1;

	r->dst_handle->per_node[dst_node].refcnt--;

	if (r->mode & STARPU_R)
		r->src_handle->per_node[src_node].refcnt--;

	r->refcnt--;

	/* if nobody is waiting on that request, we can get rid of it */
	if (r->refcnt == 0)
		do_delete = 1;

	r->retval = 0;

	/* In case there are one or more callbacks, we execute them now. */
	struct callback_list *callbacks = r->callbacks;

	_starpu_spin_unlock(&r->lock);
	if (r->src_handle != r->dst_handle)
	{
		_starpu_spin_unlock(&r->src_handle->header_lock);
		_starpu_spin_unlock(&r->dst_handle->header_lock);
	}
	else
		_starpu_spin_unlock(&r->src_handle->header_lock);

	if (do_delete)
		starpu_data_request_destroy(r);

	/* We do the callbacks once the lock is released so that they can do
	 * blocking operations with the handle (eg. release it) */
	while (callbacks)
	{
		callbacks->callback_func(callbacks->callback_arg);

		struct callback_list *next = callbacks->next;
		free(callbacks);
		callbacks = next;
	}
}
/* This is called when a task is finished with a piece of data
 * (or on starpu_data_release)
 *
 * The header lock must already be taken by the caller.
 * This may free the handle if it was lazily unregistered (1 is returned in
 * that case). The handle pointer thus becomes invalid for the caller.
 */
int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
{
	_starpu_spin_checklocked(&handle->header_lock);

	/* A data access has finished so we remove a reference. */
	STARPU_ASSERT(handle->refcnt > 0);
	handle->refcnt--;
	STARPU_ASSERT(handle->busy_count > 0);
	handle->busy_count--;
	if (_starpu_data_check_not_busy(handle))
		/* Handle was destroyed, nothing left to do. */
		return 1;

	if (handle->arbiter)
	{
		unsigned refcnt = handle->refcnt;
		STARPU_ASSERT(_starpu_data_requester_list_empty(&handle->req_list));
		STARPU_ASSERT(_starpu_data_requester_list_empty(&handle->reduction_req_list));
		_starpu_spin_unlock(&handle->header_lock);
		/* _starpu_notify_arbitered_dependencies will handle its own locking */
		if (!refcnt)
			_starpu_notify_arbitered_dependencies(handle);
		/* We have already unlocked */
		return 1;
	}

	STARPU_ASSERT(_starpu_data_requester_list_empty(&handle->arbitered_req_list));

	/* In case there is a pending reduction, and that this is the last
	 * requester, we may go back to a "normal" coherency model. */
	if (handle->reduction_refcnt > 0)
	{
		//fprintf(stderr, "NOTIFY REDUCTION TASK RED REFCNT %d\n", handle->reduction_refcnt);
		handle->reduction_refcnt--;
		if (handle->reduction_refcnt == 0)
			_starpu_data_end_reduction_mode_terminate(handle);
	}

	struct _starpu_data_requester *r;
	while ((r = may_unlock_data_req_list_head(handle)))
	{
		/* STARPU_RW accesses are treated as STARPU_W */
		enum starpu_data_access_mode r_mode = r->mode;
		if (r_mode == STARPU_RW)
			r_mode = STARPU_W;

		int put_in_list = 1;
		if ((handle->reduction_refcnt == 0) && (handle->current_mode == STARPU_REDUX) && (r_mode != STARPU_REDUX))
		{
			_starpu_data_end_reduction_mode(handle);

			/* Since we need to perform a mode change, we freeze
			 * the request if needed. */
			put_in_list = (handle->reduction_refcnt > 0);
		}
		else
		{
			put_in_list = 0;
		}

		if (put_in_list)
		{
			/* We need to put the request back because we must
			 * perform a reduction before. */
			_starpu_data_requester_list_push_front(&handle->req_list, r);
		}
		else
		{
			/* The data is now attributed to that request so we put a
			 * reference on it. */
			handle->refcnt++;
			handle->busy_count++;

			enum starpu_data_access_mode previous_mode = handle->current_mode;
			handle->current_mode = r_mode;

			/* In case we enter in a reduction mode, we invalidate all per
			 * worker replicates. Note that the "per_node" replicates are
			 * kept intact because we'll reduce a valid copy of the
			 * "per-node replicate" with the per-worker replicates. */
			if ((r_mode == STARPU_REDUX) && (previous_mode != STARPU_REDUX))
				_starpu_data_start_reduction_mode(handle);

			_starpu_spin_unlock(&handle->header_lock);

			if (r->is_requested_by_codelet)
			{
				if (!unlock_one_requester(r))
					_starpu_push_task(r->j);
			}
			else
			{
				STARPU_ASSERT(r->ready_data_callback);

				/* execute the callback associated with the data requester */
				r->ready_data_callback(r->argcb);
			}

			_starpu_data_requester_delete(r);

			_starpu_spin_lock(&handle->header_lock);
			STARPU_ASSERT(handle->busy_count > 0);
			handle->busy_count--;
			if (_starpu_data_check_not_busy(handle))
				return 1;
		}
	}

	return 0;
}
/* No lock is held, this acquires and releases the handle header lock */
static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_codelet,
							starpu_data_handle_t handle, enum starpu_data_access_mode mode,
							void (*callback)(void *), void *argcb,
							struct _starpu_job *j, unsigned buffer_index)
{
	if (handle->arbiter)
		return _starpu_attempt_to_submit_arbitered_data_request(request_from_codelet, handle, mode, callback, argcb, j, buffer_index);

	if (mode == STARPU_RW)
		mode = STARPU_W;

	/* Take the lock protecting the header. We try to do some progression
	 * in case this is called from a worker, otherwise we just wait for the
	 * lock to be available. */
	if (request_from_codelet)
	{
		int cpt = 0;
		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
		{
			cpt++;
			_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
		}
		if (cpt == STARPU_SPIN_MAXTRY)
			_starpu_spin_lock(&handle->header_lock);
	}
	else
	{
		_starpu_spin_lock(&handle->header_lock);
	}

	/* If we have a request that is not used for the reduction, and that a
	 * reduction is pending, we put it at the end of normal list, and we
	 * use the reduction_req_list instead */
	unsigned pending_reduction = (handle->reduction_refcnt > 0);
	unsigned frozen = 0;

	/* If we are currently performing a reduction, we freeze any request
	 * that is not explicitly a reduction task. */
	unsigned is_a_reduction_task = (request_from_codelet && j->reduction_task);

	if (pending_reduction && !is_a_reduction_task)
		frozen = 1;

	/* If there is currently nobody accessing the piece of data, or it's
	 * not another writer and if this is the same type of access as the
	 * current one, we can proceed. */
	unsigned put_in_list = 1;

	enum starpu_data_access_mode previous_mode = handle->current_mode;

	if (!frozen && ((handle->refcnt == 0) || (!(mode == STARPU_W) && (handle->current_mode == mode))))
	{
		/* Detect whether this is the end of a reduction phase */
		/* We don't want to start multiple reductions of the
		 * same handle at the same time ! */
		if ((handle->reduction_refcnt == 0) && (previous_mode == STARPU_REDUX) && (mode != STARPU_REDUX))
		{
			_starpu_data_end_reduction_mode(handle);

			/* Since we need to perform a mode change, we freeze
			 * the request if needed. */
			put_in_list = (handle->reduction_refcnt > 0);
		}
		else
		{
			put_in_list = 0;
		}
	}

	if (put_in_list)
	{
		/* there cannot be multiple writers or a new writer
		 * while the data is in read mode */

		handle->busy_count++;
		/* enqueue the request */
		struct _starpu_data_requester *r = _starpu_data_requester_new();
		r->mode = mode;
		r->is_requested_by_codelet = request_from_codelet;
		r->j = j;
		r->buffer_index = buffer_index;
		r->ready_data_callback = callback;
		r->argcb = argcb;

		/* We put the requester in a specific list if this is a reduction task */
		struct _starpu_data_requester_list *req_list =
			is_a_reduction_task ? &handle->reduction_req_list : &handle->req_list;

		_starpu_data_requester_list_push_back(req_list, r);

		/* failed */
		put_in_list = 1;
	}
	else
	{
		handle->refcnt++;
		handle->busy_count++;

		/* Do not write to handle->current_mode if it is already
		 * R. This avoids a spurious warning from helgrind when
		 * the following happens:
		 * acquire(R) in thread A
		 * acquire(R) in thread B
		 * release_data_on_node() in thread A
		 * helgrind would shout that the latter reads current_mode
		 * unsafely.
		 *
		 * This actually basically explains helgrind that it is a
		 * shared R acquisition.
		 */
		if (mode != STARPU_R || handle->current_mode != mode)
			handle->current_mode = mode;

		if ((mode == STARPU_REDUX) && (previous_mode != STARPU_REDUX))
			_starpu_data_start_reduction_mode(handle);

		/* success */
		put_in_list = 0;
	}

	_starpu_spin_unlock(&handle->header_lock);
	return put_in_list;
}
void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gathering_node)
{
	unsigned child;
	unsigned worker;
	unsigned nworkers = starpu_worker_get_count();
	unsigned node;
	unsigned sizes[root_handle->nchildren];

	_STARPU_TRACE_START_UNPARTITION(root_handle, gathering_node);
	_starpu_spin_lock(&root_handle->header_lock);

	STARPU_ASSERT_MSG(root_handle->nchildren != 0, "data %p is not partitioned, can not unpartition it", root_handle);

	/* first take all the children lock (in order !) */
	for (child = 0; child < root_handle->nchildren; child++)
	{
		starpu_data_handle_t child_handle = starpu_data_get_child(root_handle, child);

		/* make sure the intermediate children are unpartitioned as well */
		if (child_handle->nchildren > 0)
			starpu_data_unpartition(child_handle, gathering_node);

		/* If this is a multiformat handle, we must convert the data now */
#ifdef STARPU_DEVEL
#warning TODO: _starpu_fetch_data_on_node should be doing it
#endif
		if (_starpu_data_is_multiformat_handle(child_handle) &&
			starpu_node_get_kind(child_handle->mf_node) != STARPU_CPU_RAM)
		{
			struct starpu_codelet cl =
			{
				.where = STARPU_CPU,
				.cpu_funcs = { _starpu_empty_codelet_function },
				.modes = { STARPU_RW },
				.nbuffers = 1
			};
			struct starpu_task *task = starpu_task_create();
			task->name = "convert_data";

			STARPU_TASK_SET_HANDLE(task, child_handle, 0);
			task->cl = &cl;
			task->synchronous = 1;
			if (_starpu_task_submit_internally(task) != 0)
				_STARPU_ERROR("Could not submit the conversion task while unpartitioning\n");
		}

		int ret;
		/* for now we pretend that the RAM is almost unlimited and that gathering
		 * data should be possible from the node that does the unpartitioning ... we
		 * don't want to have the programmer deal with memory shortage at that time,
		 * really */
		/* Acquire the child data on the gathering node. This will trigger collapsing any reduction */
		ret = starpu_data_acquire_on_node(child_handle, gathering_node, STARPU_RW);
		STARPU_ASSERT(ret == 0);
		starpu_data_release_on_node(child_handle, gathering_node);

		_starpu_spin_lock(&child_handle->header_lock);
		child_handle->busy_waiting = 1;
		_starpu_spin_unlock(&child_handle->header_lock);

		/* Wait for all requests to finish (notably WT requests) */
		STARPU_PTHREAD_MUTEX_LOCK(&child_handle->busy_mutex);
		while (1)
		{
			/* Here helgrind would shout that this is an unprotected access,
			 * but this is actually fine: all threads who do busy_count--
			 * are supposed to call _starpu_data_check_not_busy, which will
			 * wake us up through the busy_mutex/busy_cond. */
			if (!child_handle->busy_count)
				break;
			/* This is woken by _starpu_data_check_not_busy, always called
			 * after decrementing busy_count */
			STARPU_PTHREAD_COND_WAIT(&child_handle->busy_cond, &child_handle->busy_mutex);
		}
		STARPU_PTHREAD_MUTEX_UNLOCK(&child_handle->busy_mutex);

		_starpu_spin_lock(&child_handle->header_lock);

		sizes[child] = _starpu_data_get_size(child_handle);

		_starpu_data_unregister_ram_pointer(child_handle);

		for (worker = 0; worker < nworkers; worker++)
		{
			struct _starpu_data_replicate *local = &child_handle->per_worker[worker];
			STARPU_ASSERT(local->state == STARPU_INVALID);
			if (local->allocated && local->automatically_allocated)
				_starpu_request_mem_chunk_removal(child_handle, local, starpu_worker_get_memory_node(worker), sizes[child]);
		}

		_starpu_memory_stats_free(child_handle);
	}
static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_data_handle_t *childrenp,
				   unsigned nparts, struct starpu_data_filter *f, int inherit_state)
{
	unsigned i;
	unsigned node;

	/* first take care to properly lock the data header */
	_starpu_spin_lock(&initial_handle->header_lock);

	initial_handle->nplans++;

	STARPU_ASSERT_MSG(nparts > 0, "Partitioning data %p in 0 piece does not make sense", initial_handle);

	/* allocate the children */
	if (inherit_state)
	{
		initial_handle->children = (struct _starpu_data_state *) calloc(nparts, sizeof(struct _starpu_data_state));
		STARPU_ASSERT(initial_handle->children);

		/* this handle now has children */
		initial_handle->nchildren = nparts;
	}

	unsigned nworkers = starpu_worker_get_count();

	for (node = 0; node < STARPU_MAXNODES; node++)
	{
		if (initial_handle->per_node[node].state != STARPU_INVALID)
			break;
	}
	if (node == STARPU_MAXNODES)
	{
		/* This is lazy allocation, allocate it now in main RAM, so as
		 * to have somewhere to gather pieces later */
		/* FIXME: mark as unevictable! */
		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[STARPU_MAIN_RAM], 0);
#ifdef STARPU_DEVEL
#warning we should reclaim memory if allocation failed
#endif
		STARPU_ASSERT(!ret);
	}

	for (i = 0; i < nparts; i++)
	{
		starpu_data_handle_t child;

		if (inherit_state)
			child = &initial_handle->children[i];
		else
			child = childrenp[i];
		STARPU_ASSERT(child);

		struct starpu_data_interface_ops *ops;

		/* each child may have its own interface type */
		/* what's this child's interface ? */
		if (f->get_child_ops)
			ops = f->get_child_ops(f, i);
		else
			ops = initial_handle->ops;

		_starpu_data_handle_init(child, ops, initial_handle->mf_node);

		child->nchildren = 0;
		child->nplans = 0;
		child->switch_cl = NULL;
		child->partitioned = 0;
		child->readonly = 0;
		child->mpi_data = initial_handle->mpi_data;
		child->root_handle = initial_handle->root_handle;
		child->father_handle = initial_handle;
		child->sibling_index = i;
		child->depth = initial_handle->depth + 1;

		child->is_not_important = initial_handle->is_not_important;
		child->wt_mask = initial_handle->wt_mask;
		child->home_node = initial_handle->home_node;
		child->is_readonly = initial_handle->is_readonly;

		/* initialize the chunk lock */
		_starpu_data_requester_list_init(&child->req_list);
		_starpu_data_requester_list_init(&child->reduction_req_list);
		child->reduction_tmp_handles = NULL;
		child->write_invalidation_req = NULL;
		child->refcnt = 0;
		child->unlocking_reqs = 0;
		child->busy_count = 0;
		child->busy_waiting = 0;
		STARPU_PTHREAD_MUTEX_INIT(&child->busy_mutex, NULL);
		STARPU_PTHREAD_COND_INIT(&child->busy_cond, NULL);
		child->reduction_refcnt = 0;
		_starpu_spin_init(&child->header_lock);

		child->sequential_consistency = initial_handle->sequential_consistency;

		STARPU_PTHREAD_MUTEX_INIT(&child->sequential_consistency_mutex, NULL);
		child->last_submitted_mode = STARPU_R;
		child->last_sync_task = NULL;
		child->last_submitted_accessors.task = NULL;
		child->last_submitted_accessors.next = &child->last_submitted_accessors;
		child->last_submitted_accessors.prev = &child->last_submitted_accessors;
		child->post_sync_tasks = NULL;
		/* Tell helgrind that the race in _starpu_unlock_post_sync_tasks is fine */
		STARPU_HG_DISABLE_CHECKING(child->post_sync_tasks_cnt);
		child->post_sync_tasks_cnt = 0;

		/* The methods used for reduction are propagated to the
		 * children. */
		child->redux_cl = initial_handle->redux_cl;
		child->init_cl = initial_handle->init_cl;

#ifdef STARPU_USE_FXT
		child->last_submitted_ghost_sync_id_is_valid = 0;
		child->last_submitted_ghost_sync_id = 0;
		child->last_submitted_ghost_accessors_id = NULL;
#endif

		if (_starpu_global_arbiter)
			/* Just for testing purpose */
			starpu_data_assign_arbiter(child, _starpu_global_arbiter);
		else
			child->arbiter = NULL;
		_starpu_data_requester_list_init(&child->arbitered_req_list);

		for (node = 0; node < STARPU_MAXNODES; node++)
		{
			struct _starpu_data_replicate *initial_replicate;
			struct _starpu_data_replicate *child_replicate;

			initial_replicate = &initial_handle->per_node[node];
			child_replicate = &child->per_node[node];

			if (inherit_state)
				child_replicate->state = initial_replicate->state;
			else
				child_replicate->state = STARPU_INVALID;
			if (inherit_state || !initial_replicate->automatically_allocated)
				child_replicate->allocated = initial_replicate->allocated;
			else
				child_replicate->allocated = 0;
			/* Do not allow memory reclaiming within the child for parent bits */
			child_replicate->automatically_allocated = 0;
			child_replicate->refcnt = 0;
			child_replicate->memory_node = node;
			child_replicate->relaxed_coherency = 0;
			if (inherit_state)
				child_replicate->initialized = initial_replicate->initialized;
			else
				child_replicate->initialized = 0;

			/* update the interface */
			void *initial_interface = starpu_data_get_interface_on_node(initial_handle, node);
			void *child_interface = starpu_data_get_interface_on_node(child, node);

			STARPU_ASSERT_MSG(!(!inherit_state && child_replicate->automatically_allocated && child_replicate->allocated), "partition planning is currently not supported when handle has some automatically allocated buffers");
			f->filter_func(initial_interface, child_interface, f, i, nparts);
		}

		unsigned worker;
		for (worker = 0; worker < nworkers; worker++)
		{
			struct _starpu_data_replicate *child_replicate;
			child_replicate = &child->per_worker[worker];

			child_replicate->state = STARPU_INVALID;
			child_replicate->allocated = 0;
			child_replicate->automatically_allocated = 0;
			child_replicate->refcnt = 0;
			child_replicate->memory_node = starpu_worker_get_memory_node(worker);
			child_replicate->requested = 0;

			for (node = 0; node < STARPU_MAXNODES; node++)
			{
				child_replicate->request[node] = NULL;
			}

			child_replicate->relaxed_coherency = 1;
			child_replicate->initialized = 0;

			/* duplicate the content of the interface on node 0 */
			memcpy(child_replicate->data_interface, child->per_node[0].data_interface, child->ops->interface_size);
		}

		/* We compute the size and the footprint of the child once and
		 * store it in the handle */
		child->footprint = _starpu_compute_data_footprint(child);

		void *ptr;
		ptr = starpu_data_handle_to_pointer(child, STARPU_MAIN_RAM);
		if (ptr != NULL)
			_starpu_data_register_ram_pointer(child, ptr);
	}

	/* now release the header lock */
	_starpu_spin_unlock(&initial_handle->header_lock);
}
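/*
 * Illustrative sketch (not part of the original source): how an application
 * typically drives the partitioning code above through the public API. The
 * vector handle, the block filter and the child count are assumptions for
 * the example.
 */
#if 0 /* example only */
static void partition_example(starpu_data_handle_t vector_handle)
{
	/* Split the vector into 4 contiguous blocks. */
	struct starpu_data_filter f =
	{
		.filter_func = starpu_vector_filter_block,
		.nchildren = 4
	};
	starpu_data_partition(vector_handle, &f);

	/* ... submit tasks on starpu_data_get_sub_data(vector_handle, 1, i) ... */

	/* Gather the pieces back into the root handle, in main memory. */
	starpu_data_unpartition(vector_handle, STARPU_MAIN_RAM);
}
#endif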