/*
 * Initialise the variable interface of `handle` on every memory node from
 * the description passed at registration time.
 *
 * handle:         data handle whose per-node interfaces are filled in
 * home_node:      node whose interface receives the registered
 *                 ptr / dev_handle / offset; all other nodes get zeros
 * data_interface: a struct starpu_variable_interface describing the data
 */
static void register_variable_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
{
	struct starpu_variable_interface *registered = (struct starpu_variable_interface *) data_interface;
	unsigned n;

	for (n = 0; n < STARPU_MAXNODES; n++)
	{
		struct starpu_variable_interface *replica =
			(struct starpu_variable_interface *) starpu_data_get_interface_on_node(handle, n);
		int is_home = (n == home_node);

		/* Location fields are only meaningful on the home node. */
		replica->ptr = is_home ? registered->ptr : 0;
		replica->dev_handle = is_home ? registered->dev_handle : 0;
		replica->offset = is_home ? registered->offset : 0;

		/* Interface id and element size are the same on every node. */
		replica->id = registered->id;
		replica->elemsize = registered->elemsize;
	}
}
/*
 * Initialise the block interface of `handle` on every memory node from the
 * description passed at registration time.
 *
 * handle:    data handle whose per-node interfaces are filled in
 * home_node: node whose interface receives the registered ptr, dev_handle,
 *            offset, ldy and ldz; all other nodes get zeros for those fields
 * interface: a starpu_block_interface_t describing the data
 */
static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *interface)
{
	starpu_block_interface_t *registered = interface;
	unsigned n;

	for (n = 0; n < STARPU_MAXNODES; n++)
	{
		starpu_block_interface_t *replica = starpu_data_get_interface_on_node(handle, n);
		int is_home = (n == home_node);

		/* Location and leading dimensions only exist on the home node. */
		replica->ptr = is_home ? registered->ptr : 0;
		replica->dev_handle = is_home ? registered->dev_handle : 0;
		replica->offset = is_home ? registered->offset : 0;
		replica->ldy = is_home ? registered->ldy : 0;
		replica->ldz = is_home ? registered->ldz : 0;

		/* Extents and element size are copied to every node. */
		replica->nx = registered->nx;
		replica->ny = registered->ny;
		replica->nz = registered->nz;
		replica->elemsize = registered->elemsize;
	}
}
/*
 * Return the size of the variable attached to `handle`, read from the
 * elemsize field of its interface on STARPU_MAIN_RAM.
 */
static size_t variable_interface_get_size(starpu_data_handle_t handle)
{
	struct starpu_variable_interface *ram_interface;

	ram_interface = (struct starpu_variable_interface *)
		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);

	return ram_interface->elemsize;
}
/*
 * Return the nx field of the complex interface of `handle`, as stored on
 * STARPU_MAIN_RAM.
 */
int starpu_complex_get_nx(starpu_data_handle_t handle)
{
	struct starpu_complex_interface *ram_interface;

	ram_interface = (struct starpu_complex_interface *)
		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);

	return ram_interface->nx;
}
/*
 * Return the `imaginary` pointer of the complex interface of `handle`, as
 * stored on STARPU_MAIN_RAM.
 */
double *starpu_complex_get_imaginary(starpu_data_handle_t handle)
{
	struct starpu_complex_interface *ram_interface;

	ram_interface = (struct starpu_complex_interface *)
		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);

	return ram_interface->imaginary;
}
/*
 * Initialise the CSR interface of `handle` on every memory node from the
 * description passed at registration time.
 *
 * handle:    data handle whose per-node interfaces are filled in
 * home_node: node that actually holds the registered arrays
 * interface: a starpu_csr_interface_t describing the matrix
 *
 * The three array fields (nzval, colind, rowptr) refer to memory on the
 * home node, so they are only recorded there. Every other node starts with
 * null array fields. Previously rowptr was copied to all nodes, leaving
 * non-home replicas with a pointer into the home node's memory.
 */
static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, void *interface)
{
	starpu_csr_interface_t *csr_interface = interface;
	unsigned node;

	for (node = 0; node < STARPU_MAXNODES; node++)
	{
		starpu_csr_interface_t *local_interface = starpu_data_get_interface_on_node(handle, node);

		if (node == home_node)
		{
			local_interface->nzval = csr_interface->nzval;
			local_interface->colind = csr_interface->colind;
			local_interface->rowptr = csr_interface->rowptr;
		}
		else
		{
			local_interface->nzval = 0;
			local_interface->colind = NULL;
			local_interface->rowptr = NULL;
		}

		/* Shape information is node-independent. */
		local_interface->nnz = csr_interface->nnz;
		local_interface->nrow = csr_interface->nrow;
		local_interface->firstentry = csr_interface->firstentry;
		local_interface->elemsize = csr_interface->elemsize;
	}
}
/*
 * Make sure `handle` has a buffer on `dst_node`.
 *
 * Returns 0 when a buffer was already allocated or has just been allocated.
 * Returns ENOMEM when allocation is needed but `may_alloc` is zero, or when
 * _starpu_allocate_interface() reports a size of zero.
 *
 * On a fresh allocation the buffer is recorded through register_mem_chunk()
 * and the per-node `allocated` / `automatically_allocated` flags are set.
 */
int _starpu_allocate_memory_on_node(starpu_data_handle handle, uint32_t dst_node, unsigned may_alloc)
{
	STARPU_ASSERT(handle);

	/* A buffer is already allocated on the node */
	if (handle->per_node[dst_node].allocated)
		return 0;

	if (!may_alloc)
		return ENOMEM;

	void *node_interface = starpu_data_get_interface_on_node(handle, dst_node);
	size_t allocated_memory = _starpu_allocate_interface(handle, node_interface, dst_node);

	/* perhaps we could really not handle that capacity misses */
	if (allocated_memory == 0)
		return ENOMEM;

	/* From here on the size is known to be non-zero. */
	register_mem_chunk(handle, dst_node, allocated_memory, 1);

	handle->per_node[dst_node].allocated = 1;
	handle->per_node[dst_node].automatically_allocated = 1;

	return 0;
}
/*
 * Return the ptr field of the variable interface of `handle` on the node
 * given by _starpu_memory_node_get_local_key().
 *
 * Asserts that the handle is allocated on that node.
 */
uintptr_t starpu_variable_get_local_ptr(starpu_data_handle_t handle)
{
	unsigned local_node = _starpu_memory_node_get_local_key();

	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, local_node));

	void *local_interface = starpu_data_get_interface_on_node(handle, local_node);

	return STARPU_VARIABLE_GET_PTR(local_interface);
}
/*
 * Copy a packed variable from `ptr` into the buffer of `handle` on `node`.
 *
 * handle: destination data handle; asserted to be allocated on `node`
 * node:   memory node whose interface supplies the destination pointer
 * ptr:    packed source bytes
 * count:  number of packed bytes; asserted to equal the variable's elemsize
 *
 * Always returns 0.
 */
static int unpack_variable_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
{
	struct starpu_variable_interface *dst;

	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));

	dst = (struct starpu_variable_interface *) starpu_data_get_interface_on_node(handle, node);

	STARPU_ASSERT(count == dst->elemsize);

	memcpy((void *) dst->ptr, ptr, dst->elemsize);

	return 0;
}
/*
 * Unpack a complex vector from the contiguous buffer `ptr` into the
 * `real` and `imaginary` arrays of `handle` on `node`.
 *
 * The packed layout is nx doubles of real parts followed by nx doubles of
 * imaginary parts; `count` is asserted to match that total size.
 *
 * Always returns 0.
 */
static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
{
	const char *packed = ptr;

	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));

	struct starpu_complex_interface *dst = (struct starpu_complex_interface *)
		starpu_data_get_interface_on_node(handle, node);

	STARPU_ASSERT(count == 2 * dst->nx * sizeof(double));

	/* Size in bytes of one of the two halves of the packed buffer. */
	size_t half = dst->nx * sizeof(double);

	memcpy(dst->real, packed, half);
	memcpy(dst->imaginary, packed + half, half);

	return 0;
}
/*
 * Initialise the variable interface of `handle` on every memory node.
 *
 * handle:    data handle whose per-node interfaces are filled in
 * home_node: node whose interface receives the registered pointer; the
 *            pointer is zero on every other node
 * interface: the registered variable interface, read through the
 *            STARPU_VARIABLE_GET_PTR / STARPU_VARIABLE_GET_ELEMSIZE macros
 */
static void register_variable_handle(starpu_data_handle handle, uint32_t home_node, void *interface)
{
	unsigned n;

	for (n = 0; n < STARPU_MAXNODES; n++)
	{
		starpu_variable_interface_t *replica = starpu_data_get_interface_on_node(handle, n);

		replica->ptr = (n == home_node) ? STARPU_VARIABLE_GET_PTR(interface) : 0;

		/* The element size is the same on every node. */
		replica->elemsize = STARPU_VARIABLE_GET_ELEMSIZE(interface);
	}
}
/*
 * Pack the complex vector of `handle` on `node` into one contiguous buffer.
 *
 * handle: source data handle; asserted to be allocated on `node`
 * node:   memory node whose interface supplies the source arrays
 * ptr:    if non-NULL, receives a buffer obtained from starpu_malloc_flags()
 *         holding the nx real parts followed by the nx imaginary parts
 * count:  receives the packed size, as given by complex_get_size()
 *
 * When `ptr` is NULL only `*count` is written. Always returns 0.
 */
static int complex_pack_data(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count)
{
	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));

	struct starpu_complex_interface *src = (struct starpu_complex_interface *)
		starpu_data_get_interface_on_node(handle, node);

	*count = complex_get_size(handle);

	/* Size-only query: nothing to copy. */
	if (ptr == NULL)
		return 0;

	char *packed;
	size_t half = src->nx * sizeof(double);

	/* NOTE(review): the result of starpu_malloc_flags() is not checked. */
	starpu_malloc_flags((void**) &packed, *count, 0);
	*ptr = packed;

	memcpy(packed, src->real, half);
	memcpy(packed + half, src->imaginary, half);

	return 0;
}
/*
 * Record a freshly allocated buffer of `handle` on `dst_node` as a memory
 * chunk and push it at the front of that node's chunk list.
 *
 * handle:                  data handle owning the buffer
 * dst_node:                node on which the buffer lives
 * size:                    size passed on to _starpu_memchunk_init()
 * automatically_allocated: flag passed on to _starpu_memchunk_init()
 *
 * The list is modified while holding the node's rwlock in write mode; lock
 * and unlock failures trip an assertion.
 */
static void register_mem_chunk(starpu_data_handle handle, uint32_t dst_node, size_t size, unsigned automatically_allocated)
{
	/* the interface was already filled by ops->allocate_data_on_node */
	void *node_interface = starpu_data_get_interface_on_node(handle, dst_node);
	size_t interface_size = handle->ops->interface_size;

	/* Put this memchunk in the list of memchunk in use */
	starpu_mem_chunk_t chunk = _starpu_memchunk_init(handle, size, node_interface,
							 interface_size, automatically_allocated);

	int rc = pthread_rwlock_wrlock(&mc_rwlock[dst_node]);
	STARPU_ASSERT(!rc);

	starpu_mem_chunk_list_push_front(mc_list[dst_node], chunk);

	rc = pthread_rwlock_unlock(&mc_rwlock[dst_node]);
	STARPU_ASSERT(!rc);
}
/*
 * Initialise the complex interface of `handle` on every memory node from
 * the description passed at registration time.
 *
 * handle:         data handle whose per-node interfaces are filled in
 * home_node:      node whose interface receives the registered `real` and
 *                 `imaginary` pointers; they are zero on every other node
 * data_interface: a struct starpu_complex_interface describing the data
 */
static void complex_register_data_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
{
	struct starpu_complex_interface *registered = (struct starpu_complex_interface *) data_interface;
	unsigned n;

	for (n = 0; n < STARPU_MAXNODES; n++)
	{
		struct starpu_complex_interface *replica = (struct starpu_complex_interface *)
			starpu_data_get_interface_on_node(handle, n);
		int is_home = (n == home_node);

		/* The element count is known on every node. */
		replica->nx = registered->nx;

		replica->real = is_home ? registered->real : 0;
		replica->imaginary = is_home ? registered->imaginary : 0;
	}
}
/*
 * Return the number of bytes needed to hold the complex vector of `handle`:
 * nx doubles for the real parts plus nx doubles for the imaginary parts,
 * with nx read from the interface on STARPU_MAIN_RAM.
 */
static size_t complex_get_size(starpu_data_handle_t handle)
{
	struct starpu_complex_interface *ram_interface = (struct starpu_complex_interface *)
		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);

	return ram_interface->nx * 2 * sizeof(double);
}
/*
 * Return, as a void pointer, the ptr field of the variable interface of
 * `handle` on `node`. Asserts that the handle is allocated on that node.
 */
static void *variable_handle_to_pointer(starpu_data_handle_t handle, unsigned node)
{
	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));

	void *node_interface = starpu_data_get_interface_on_node(handle, node);

	return (void*) STARPU_VARIABLE_GET_PTR(node_interface);
}
/*
 * Return the element size of the variable attached to `handle`, read from
 * its interface on STARPU_MAIN_RAM.
 */
size_t starpu_variable_get_elemsize(starpu_data_handle_t handle)
{
	void *ram_interface = starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);

	return STARPU_VARIABLE_GET_ELEMSIZE(ram_interface);
}
starpu_node_kind src_kind = _starpu_get_node_kind(src_node); starpu_node_kind dst_kind = _starpu_get_node_kind(dst_node); STARPU_ASSERT(src_handle->per_node[src_node].refcnt); STARPU_ASSERT(dst_handle->per_node[dst_node].refcnt); STARPU_ASSERT(src_handle->per_node[src_node].allocated); STARPU_ASSERT(dst_handle->per_node[dst_node].allocated); #ifdef STARPU_USE_CUDA cudaError_t cures; cudaStream_t *stream; #endif void *src_interface = starpu_data_get_interface_on_node(src_handle, src_node); void *dst_interface = starpu_data_get_interface_on_node(dst_handle, dst_node); switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind)) { case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM): /* STARPU_CPU_RAM -> STARPU_CPU_RAM */ STARPU_ASSERT(copy_methods->ram_to_ram); copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node); break; #ifdef STARPU_USE_CUDA case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM): /* CUBLAS_RAM -> STARPU_CPU_RAM */ /* only the proper CUBLAS thread can initiate this ! */ if (_starpu_get_local_memory_node() == src_node) { /* only the proper CUBLAS thread can initiate this directly ! */ STARPU_ASSERT(copy_methods->cuda_to_ram);
/*
 * Create (but do not submit) a task that converts the multiformat data
 * `handle` for use on a node of kind `node_kind`.
 *
 * The task is named "conversion_task", is asynchronous, and accesses
 * `handle` as its buffer 0 in STARPU_RW mode. Its codelet is taken from the
 * multiformat ops of the handle:
 *   - if `node_kind` is a CPU-like kind (CPU_RAM, SCC_RAM, SCC_SHM), the
 *     "<device>_to_cpu" codelet matching the kind of handle->mf_node;
 *   - otherwise the "cpu_to_<device>" codelet matching `node_kind`.
 *
 * The handle's refcnt and busy_count are incremented under its header lock
 * before the codelet is chosen.
 *
 * Combinations that are not handled (CPU-like to CPU-like, or a device kind
 * whose support is not compiled in) abort or report an error.
 *
 * Returns the newly created task.
 */
struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t handle, enum starpu_node_kind node_kind)
{
	struct starpu_task *conversion_task;

	/* format_interface is only needed when some device backend is built. */
#if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_USE_MIC) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
	struct starpu_multiformat_interface *format_interface;
#endif

	conversion_task = starpu_task_create();
	conversion_task->name = "conversion_task";
	conversion_task->synchronous = 0;
	STARPU_TASK_SET_HANDLE(conversion_task, handle, 0);

#if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_USE_MIC) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
	/* The node does not really matter here */
	format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
#endif

	/* Account for the new user of the handle while holding its header lock. */
	_starpu_spin_lock(&handle->header_lock);
	handle->refcnt++;
	handle->busy_count++;
	_starpu_spin_unlock(&handle->header_lock);

	switch(node_kind)
	{
	/* Target is CPU-like: convert from wherever the data currently is. */
	case STARPU_CPU_RAM:
	case STARPU_SCC_RAM:
	case STARPU_SCC_SHM:
		switch (starpu_node_get_kind(handle->mf_node))
		{
		/* Source is CPU-like as well: no conversion codelet exists. */
		case STARPU_CPU_RAM:
		case STARPU_SCC_RAM:
		case STARPU_SCC_SHM:
			STARPU_ABORT();
#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
		case STARPU_CUDA_RAM:
		{
			struct starpu_multiformat_data_interface_ops *mf_ops;
			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
			conversion_task->cl = mf_ops->cuda_to_cpu_cl;
			break;
		}
#endif
#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
		case STARPU_OPENCL_RAM:
		{
			struct starpu_multiformat_data_interface_ops *mf_ops;
			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
			conversion_task->cl = mf_ops->opencl_to_cpu_cl;
			break;
		}
#endif
#ifdef STARPU_USE_MIC
		case STARPU_MIC_RAM:
		{
			struct starpu_multiformat_data_interface_ops *mf_ops;
			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
			conversion_task->cl = mf_ops->mic_to_cpu_cl;
			break;
		}
#endif
		default:
			_STARPU_ERROR("Oops : %u\n", handle->mf_node);
		}
		break;
	/* Target is a device: convert from the CPU format. */
#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
	case STARPU_CUDA_RAM:
	{
		struct starpu_multiformat_data_interface_ops *mf_ops;
		mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
		conversion_task->cl = mf_ops->cpu_to_cuda_cl;
		break;
	}
#endif
#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
	case STARPU_OPENCL_RAM:
	{
		struct starpu_multiformat_data_interface_ops *mf_ops;
		mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
		conversion_task->cl = mf_ops->cpu_to_opencl_cl;
		break;
	}
#endif
#ifdef STARPU_USE_MIC
	case STARPU_MIC_RAM:
	{
		struct starpu_multiformat_data_interface_ops *mf_ops;
		mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
		conversion_task->cl = mf_ops->cpu_to_mic_cl;
		break;
	}
#endif
	default:
		STARPU_ABORT();
	}

	/* The conversion reads and rewrites the data in place. */
	STARPU_TASK_SET_MODE(conversion_task, STARPU_RW, 0);
	return conversion_task;
}
/* declare a new data with the variable interface */ void starpu_variable_data_register(starpu_data_handle_t *handleptr, unsigned home_node, uintptr_t ptr, size_t elemsize) { struct starpu_variable_interface variable = { .id = STARPU_VARIABLE_INTERFACE_ID, .ptr = ptr, .dev_handle = ptr, .offset = 0, .elemsize = elemsize }; #ifdef STARPU_USE_SCC _starpu_scc_set_offset_in_shared_memory((void*)variable.ptr, (void**)&(variable.dev_handle), &(variable.offset)); #endif starpu_data_register(handleptr, home_node, &variable, &starpu_interface_variable_ops); } void starpu_variable_ptr_register(starpu_data_handle_t handle, unsigned node, uintptr_t ptr, uintptr_t dev_handle, size_t offset) { struct starpu_variable_interface *variable_interface = starpu_data_get_interface_on_node(handle, node); starpu_data_ptr_register(handle, node); variable_interface->ptr = ptr; variable_interface->dev_handle = dev_handle; variable_interface->offset = offset; } static uint32_t footprint_variable_interface_crc32(starpu_data_handle_t handle) { return starpu_hash_crc32c_be(starpu_variable_get_elemsize(handle), 0); } static int variable_compare(void *data_interface_a, void *data_interface_b) { struct starpu_variable_interface *variable_a = (struct starpu_variable_interface *) data_interface_a; struct starpu_variable_interface *variable_b = (struct starpu_variable_interface *) data_interface_b; /* Two variables are considered compatible if they have the same size */ return (variable_a->elemsize == variable_b->elemsize); } static void display_variable_interface(starpu_data_handle_t handle, FILE *f) { struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM); fprintf(f, "%ld\t", (long)variable_interface->elemsize); } static int pack_variable_handle(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count) { STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node)); struct 
starpu_variable_interface *variable_interface = (struct starpu_variable_interface *) starpu_data_get_interface_on_node(handle, node); *count = variable_interface->elemsize; if (ptr != NULL) { starpu_malloc_flags(ptr, *count, 0); memcpy(*ptr, (void*)variable_interface->ptr, variable_interface->elemsize); } return 0; }
/*
 * Split `initial_handle` into `nparts` child handles using filter `f`.
 *
 * initial_handle: handle being partitioned; its header lock is held for the
 *                 whole operation and its nplans counter is incremented
 * childrenp:      caller-provided child handles, used only when
 *                 `inherit_state` is zero
 * nparts:         number of children; must be > 0
 * f:              filter; f->get_child_ops (optional) selects each child's
 *                 interface ops, f->filter_func fills each child's
 *                 per-node interface from the parent's
 * inherit_state:  when non-zero, the children are allocated here and stored
 *                 in initial_handle->children, and each child's per-node
 *                 state / allocated / initialized fields are copied from the
 *                 parent; when zero, the children start STARPU_INVALID and
 *                 uninitialized
 *
 * Each child is fully initialised: bookkeeping fields, locks and request
 * lists, per-node replicates, per-worker replicates, footprint, and (when
 * it has a pointer in main RAM) its RAM-pointer registration.
 */
static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_data_handle_t *childrenp, unsigned nparts, struct starpu_data_filter *f, int inherit_state)
{
	unsigned i;
	unsigned node;

	/* first take care to properly lock the data header */
	_starpu_spin_lock(&initial_handle->header_lock);

	initial_handle->nplans++;

	STARPU_ASSERT_MSG(nparts > 0, "Partitioning data %p in 0 piece does not make sense", initial_handle);

	/* allocate the children */
	if (inherit_state)
	{
		initial_handle->children = (struct _starpu_data_state *) calloc(nparts, sizeof(struct _starpu_data_state));
		STARPU_ASSERT(initial_handle->children);

		/* this handle now has children */
		initial_handle->nchildren = nparts;
	}

	unsigned nworkers = starpu_worker_get_count();

	/* Look for any node on which the parent currently has a valid copy. */
	for (node = 0; node < STARPU_MAXNODES; node++)
	{
		if (initial_handle->per_node[node].state != STARPU_INVALID)
			break;
	}
	if (node == STARPU_MAXNODES)
	{
		/* This is lazy allocation, allocate it now in main RAM, so as
		 * to have somewhere to gather pieces later */
		/* FIXME: mark as unevictable! */
		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[STARPU_MAIN_RAM], 0);
#ifdef STARPU_DEVEL
#warning we should reclaim memory if allocation failed
#endif
		STARPU_ASSERT(!ret);
	}

	for (i = 0; i < nparts; i++)
	{
		starpu_data_handle_t child;

		/* Children either live in the parent's array or come from the caller. */
		if (inherit_state)
			child = &initial_handle->children[i];
		else
			child = childrenp[i];
		STARPU_ASSERT(child);

		struct starpu_data_interface_ops *ops;

		/* each child may have his own interface type */
		/* what's this child's interface ? */
		if (f->get_child_ops)
			ops = f->get_child_ops(f, i);
		else
			ops = initial_handle->ops;

		_starpu_data_handle_init(child, ops, initial_handle->mf_node);

		/* Position of the child in the partitioning tree. */
		child->nchildren = 0;
		child->nplans = 0;
		child->switch_cl = NULL;
		child->partitioned = 0;
		child->readonly = 0;
		child->mpi_data = initial_handle->mpi_data;
		child->root_handle = initial_handle->root_handle;
		child->father_handle = initial_handle;
		child->sibling_index = i;
		child->depth = initial_handle->depth + 1;

		/* Properties inherited unchanged from the parent. */
		child->is_not_important = initial_handle->is_not_important;
		child->wt_mask = initial_handle->wt_mask;
		child->home_node = initial_handle->home_node;
		child->is_readonly = initial_handle->is_readonly;

		/* initialize the request lists, counters and locks of the child */
		_starpu_data_requester_list_init(&child->req_list);
		_starpu_data_requester_list_init(&child->reduction_req_list);
		child->reduction_tmp_handles = NULL;
		child->write_invalidation_req = NULL;
		child->refcnt = 0;
		child->unlocking_reqs = 0;
		child->busy_count = 0;
		child->busy_waiting = 0;
		STARPU_PTHREAD_MUTEX_INIT(&child->busy_mutex, NULL);
		STARPU_PTHREAD_COND_INIT(&child->busy_cond, NULL);
		child->reduction_refcnt = 0;
		_starpu_spin_init(&child->header_lock);

		child->sequential_consistency = initial_handle->sequential_consistency;

		STARPU_PTHREAD_MUTEX_INIT(&child->sequential_consistency_mutex, NULL);
		child->last_submitted_mode = STARPU_R;
		child->last_sync_task = NULL;
		/* Empty accessor list: the sentinel points to itself. */
		child->last_submitted_accessors.task = NULL;
		child->last_submitted_accessors.next = &child->last_submitted_accessors;
		child->last_submitted_accessors.prev = &child->last_submitted_accessors;
		child->post_sync_tasks = NULL;
		/* Tell helgrind that the race in _starpu_unlock_post_sync_tasks is fine */
		STARPU_HG_DISABLE_CHECKING(child->post_sync_tasks_cnt);
		child->post_sync_tasks_cnt = 0;

		/* The methods used for reduction are propagated to the
		 * children. */
		child->redux_cl = initial_handle->redux_cl;
		child->init_cl = initial_handle->init_cl;

#ifdef STARPU_USE_FXT
		child->last_submitted_ghost_sync_id_is_valid = 0;
		child->last_submitted_ghost_sync_id = 0;
		child->last_submitted_ghost_accessors_id = NULL;
#endif

		if (_starpu_global_arbiter)
			/* Just for testing purpose */
			starpu_data_assign_arbiter(child, _starpu_global_arbiter);
		else
			child->arbiter = NULL;
		_starpu_data_requester_list_init(&child->arbitered_req_list);

		/* Set up the child's replicate on every memory node. */
		for (node = 0; node < STARPU_MAXNODES; node++)
		{
			struct _starpu_data_replicate *initial_replicate;
			struct _starpu_data_replicate *child_replicate;

			initial_replicate = &initial_handle->per_node[node];
			child_replicate = &child->per_node[node];

			if (inherit_state)
				child_replicate->state = initial_replicate->state;
			else
				child_replicate->state = STARPU_INVALID;
			/* Without state inheritance, a buffer the parent allocated
			 * automatically is not reported as allocated in the child. */
			if (inherit_state || !initial_replicate->automatically_allocated)
				child_replicate->allocated = initial_replicate->allocated;
			else
				child_replicate->allocated = 0;
			/* Do not allow memory reclaiming within the child for parent bits */
			child_replicate->automatically_allocated = 0;
			child_replicate->refcnt = 0;
			child_replicate->memory_node = node;
			child_replicate->relaxed_coherency = 0;
			if (inherit_state)
				child_replicate->initialized = initial_replicate->initialized;
			else
				child_replicate->initialized = 0;

			/* update the interface */
			void *initial_interface = starpu_data_get_interface_on_node(initial_handle, node);
			void *child_interface = starpu_data_get_interface_on_node(child, node);

			/* NOTE(review): automatically_allocated was set to 0 a few
			 * lines above, so this condition looks like it can never
			 * fail as written -- confirm the intended check. */
			STARPU_ASSERT_MSG(!(!inherit_state && child_replicate->automatically_allocated && child_replicate->allocated), "partition planning is currently not supported when handle has some automatically allocated buffers");
			f->filter_func(initial_interface, child_interface, f, i, nparts);
		}

		/* Set up the child's per-worker replicates. */
		unsigned worker;
		for (worker = 0; worker < nworkers; worker++)
		{
			struct _starpu_data_replicate *child_replicate;
			child_replicate = &child->per_worker[worker];

			child_replicate->state = STARPU_INVALID;
			child_replicate->allocated = 0;
			child_replicate->automatically_allocated = 0;
			child_replicate->refcnt = 0;
			child_replicate->memory_node = starpu_worker_get_memory_node(worker);
			child_replicate->requested = 0;

			for (node = 0; node < STARPU_MAXNODES; node++)
			{
				child_replicate->request[node] = NULL;
			}

			child_replicate->relaxed_coherency = 1;
			child_replicate->initialized = 0;

			/* duplicate the content of the interface on node 0 */
			memcpy(child_replicate->data_interface, child->per_node[0].data_interface, child->ops->interface_size);
		}

		/* We compute the size and the footprint of the child once and
		 * store it in the handle */
		child->footprint = _starpu_compute_data_footprint(child);

		/* Register the child's main-RAM pointer, if it has one. */
		void *ptr;
		ptr = starpu_data_handle_to_pointer(child, STARPU_MAIN_RAM);
		if (ptr != NULL)
			_starpu_data_register_ram_pointer(child, ptr);
	}
	/* now release the header lock of the parent */
	_starpu_spin_unlock(&initial_handle->header_lock);
}