void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel)
{
#ifdef STARPU_SIMGRID
	STARPU_PTHREAD_MUTEX_LOCK(&async_channel->event.mutex);
	while (!async_channel->event.finished)
		STARPU_PTHREAD_COND_WAIT(&async_channel->event.cond, &async_channel->event.mutex);
	STARPU_PTHREAD_MUTEX_UNLOCK(&async_channel->event.mutex);
#else /* !SIMGRID */
	enum starpu_node_kind kind = async_channel->type;
#ifdef STARPU_USE_CUDA
	cudaEvent_t event;
	cudaError_t cures;
#endif

	switch (kind)
	{
#ifdef STARPU_USE_CUDA
	case STARPU_CUDA_RAM:
		event = async_channel->event.cuda_event;

		cures = cudaEventSynchronize(event);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);

		cures = cudaEventDestroy(event);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);
		break;
#endif
#ifdef STARPU_USE_OPENCL
	case STARPU_OPENCL_RAM:
	{
		cl_int err;
		if (async_channel->event.opencl_event == NULL)
			STARPU_ABORT();

		err = clWaitForEvents(1, &async_channel->event.opencl_event);
		if (STARPU_UNLIKELY(err != CL_SUCCESS))
			STARPU_OPENCL_REPORT_ERROR(err);

		err = clReleaseEvent(async_channel->event.opencl_event);
		if (STARPU_UNLIKELY(err != CL_SUCCESS))
			STARPU_OPENCL_REPORT_ERROR(err);
		break;
	}
#endif
#ifdef STARPU_USE_MIC
	case STARPU_MIC_RAM:
		_starpu_mic_wait_request_completion(&(async_channel->event.mic_event));
		break;
#endif
	case STARPU_DISK_RAM:
		starpu_disk_wait_request(async_channel);
		break;
	case STARPU_CPU_RAM:
	default:
		STARPU_ABORT();
	}
#endif /* !SIMGRID */
}
void *_starpu_htbl_search_32(struct starpu_htbl32_node_s *htbl, uint32_t key)
{
	unsigned currentbit;
	unsigned keysize = 32;

	starpu_htbl32_node_t *current_htbl = htbl;

	/* 000000000001111 with STARPU_HTBL32_NODE_SIZE 1's */
	uint32_t mask = (1 << STARPU_HTBL32_NODE_SIZE) - 1;

	for (currentbit = 0; currentbit < keysize; currentbit += STARPU_HTBL32_NODE_SIZE)
	{
		// printf("search : current bit = %d \n", currentbit);
		if (STARPU_UNLIKELY(current_htbl == NULL))
			return NULL;

		/* 0000000000001111
		 *             | currentbit
		 * 0000111100000000 = offloaded_mask
		 *     | last_currentbit
		 */

		unsigned last_currentbit = keysize - (currentbit + STARPU_HTBL32_NODE_SIZE);
		uint32_t offloaded_mask = mask << last_currentbit;
		unsigned current_index = (key & offloaded_mask) >> last_currentbit;

		current_htbl = current_htbl->children[current_index];
	}

	return current_htbl;
}
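/* A self-contained sketch of the index computation above, assuming a node
 * size of 16 bits (the actual value of STARPU_HTBL32_NODE_SIZE is an
 * assumption here): the 32-bit key is consumed most-significant chunk first,
 * each chunk selecting one child at the corresponding trie level. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const unsigned node_size = 16; /* hypothetical STARPU_HTBL32_NODE_SIZE */
	uint32_t key = 0xCAFEBABE;
	uint32_t mask = (1u << node_size) - 1;
	unsigned bit;

	for (bit = 0; bit < 32; bit += node_size)
	{
		unsigned last = 32 - (bit + node_size);
		unsigned index = (key & (mask << last)) >> last;
		/* prints "level 0 -> child index 0xcafe", then "level 1 -> 0xbabe" */
		printf("level %u -> child index 0x%04x\n", bit / node_size, index);
	}
	return 0;
}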
/* the generic interface that calls the proper underlying implementation */
int _starpu_push_task(starpu_job_t j, unsigned job_is_already_locked)
{
	struct starpu_task *task = j->task;

	task->status = STARPU_TASK_READY;

	/* in case there is no codelet associated to the task (that is, a
	 * control task), we directly execute its callback and enforce the
	 * corresponding dependencies */
	if (task->cl == NULL)
	{
		_starpu_handle_job_termination(j, job_is_already_locked);
		return 0;
	}

	if (STARPU_UNLIKELY(task->execute_on_a_specific_worker))
	{
		unsigned workerid = task->workerid;
		struct starpu_worker_s *worker = _starpu_get_worker_struct(workerid);

		if (use_prefetch)
		{
			uint32_t memory_node = starpu_worker_get_memory_node(workerid);
			_starpu_prefetch_task_input_on_node(task, memory_node);
		}

		return _starpu_push_local_task(worker, j);
	}
	else
	{
		STARPU_ASSERT(policy.push_task);
		return policy.push_task(task);
	}
}
unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel)
{
#ifdef STARPU_SIMGRID
	unsigned ret;
	STARPU_PTHREAD_MUTEX_LOCK(&async_channel->event.mutex);
	ret = async_channel->event.finished;
	STARPU_PTHREAD_MUTEX_UNLOCK(&async_channel->event.mutex);
	return ret;
#else /* !SIMGRID */
	enum starpu_node_kind kind = async_channel->type;
	unsigned success = 0;
#ifdef STARPU_USE_CUDA
	cudaEvent_t event;
#endif

	switch (kind)
	{
#ifdef STARPU_USE_CUDA
	case STARPU_CUDA_RAM:
		event = async_channel->event.cuda_event;

		cudaError_t cures = cudaEventQuery(event);
		success = (cures == cudaSuccess);

		if (success)
			cudaEventDestroy(event);
		else if (cures != cudaErrorNotReady)
			STARPU_CUDA_REPORT_ERROR(cures);
		break;
#endif
#ifdef STARPU_USE_OPENCL
	case STARPU_OPENCL_RAM:
	{
		cl_int event_status;
		cl_event opencl_event = async_channel->event.opencl_event;

		if (opencl_event == NULL)
			STARPU_ABORT();

		cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS,
					    sizeof(event_status), &event_status, NULL);
		if (STARPU_UNLIKELY(err != CL_SUCCESS))
			STARPU_OPENCL_REPORT_ERROR(err);
		if (event_status < 0)
			STARPU_OPENCL_REPORT_ERROR(event_status);
		success = (event_status == CL_COMPLETE);
		break;
	}
#endif
#ifdef STARPU_USE_MIC
	case STARPU_MIC_RAM:
		success = _starpu_mic_request_is_complete(&(async_channel->event.mic_event));
		break;
#endif
	case STARPU_DISK_RAM:
		success = starpu_disk_test_request(async_channel);
		break;
	case STARPU_CPU_RAM:
	default:
		STARPU_ABORT();
	}

	return success;
#endif /* !SIMGRID */
}
double _starpu_history_based_job_expected_length(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j)
{
	double exp;
	struct starpu_per_arch_perfmodel_t *per_arch_model;
	struct starpu_history_entry_t *entry;
	struct starpu_htbl32_node_s *history;

	load_history_based_model(model, 1);

	if (STARPU_UNLIKELY(!j->footprint_is_computed))
		_starpu_compute_buffers_footprint(j);

	uint32_t key = j->footprint;

	per_arch_model = &model->per_arch[arch];

	history = per_arch_model->history;
	if (!history)
		return -1.0;

	PTHREAD_RWLOCK_RDLOCK(&model->model_rwlock);
	entry = _starpu_htbl_search_32(history, key);
	PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);

	exp = entry ? entry->mean : -1.0;

	return exp;
}
int starpu_opencl_set_kernel_args(cl_int *error, cl_kernel *kernel, ...)
{
	int i;
	va_list ap;

	va_start(ap, kernel);

	for (i = 0; ; i++)
	{
		int size = va_arg(ap, int);
		if (size == 0)
			break;

		cl_mem *ptr = va_arg(ap, cl_mem *);
		int err = clSetKernelArg(*kernel, i, size, ptr);
		if (STARPU_UNLIKELY(err != CL_SUCCESS))
		{
			*error = err;
			break;
		}
	}

	va_end(ap);
	return i;
}
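/* A minimal usage sketch (not taken from the surrounding code): the variadic
 * argument list is a sequence of (size, pointer) pairs terminated by a size
 * of 0, and the return value is the number of arguments actually set.
 * "kernel", "input" and "output" are hypothetical objects created
 * beforehand; the casts to int match the va_arg(ap, int) above. */
static void set_args_sketch(cl_kernel kernel, cl_mem input, cl_mem output)
{
	cl_int err;
	int nargs = starpu_opencl_set_kernel_args(&err, &kernel,
						  (int)sizeof(input), &input,
						  (int)sizeof(output), &output,
						  0);
	if (nargs != 2)
		STARPU_OPENCL_REPORT_ERROR(err);
}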
cl_int starpu_opencl_release_kernel(cl_kernel kernel)
{
	cl_int err;

	err = clReleaseKernel(kernel);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	return CL_SUCCESS;
}
int starpu_event_wait_all(int num_events, starpu_event *events)
{
	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
		return -EDEADLK;

	int i;
	for (i = 0; i < num_events; i++)
		starpu_event_wait(events[i]);

	return 0;
}
int _starpu_opencl_init_context(int devid)
{
#ifdef STARPU_SIMGRID
	int j;
	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
	{
		task_finished[devid][j] = 0;
		STARPU_PTHREAD_MUTEX_INIT(&task_mutex[devid][j], NULL);
		STARPU_PTHREAD_COND_INIT(&task_cond[devid][j], NULL);
	}
#else /* !STARPU_SIMGRID */
	cl_int err;
	cl_uint uint;

	STARPU_PTHREAD_MUTEX_LOCK(&big_lock);

	_STARPU_DEBUG("Initialising context for dev %d\n", devid);

	// Create a compute context
	err = 0;
	contexts[devid] = clCreateContext(NULL, 1, &devices[devid], NULL, NULL, &err);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clGetDeviceInfo(devices[devid], CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(uint), &uint, NULL);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);
	starpu_malloc_set_align(uint / 8);

	// Create execution queue for the given device
	queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	// Create transfer queue for the given device
	cl_command_queue_properties props;
	err = clGetDeviceInfo(devices[devid], CL_DEVICE_QUEUE_PROPERTIES, sizeof(props), &props, NULL);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);
	props &= ~CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;

	in_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	out_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	peer_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	alloc_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
#endif /* !STARPU_SIMGRID */
	return 0;
}
unsigned _starpu_memory_node_get_local_key(void)
{
	unsigned *memory_node;
	memory_node = (unsigned *) STARPU_PTHREAD_GETSPECIFIC(memory_node_key);

	/* in case this is called by the programmer, we assume the RAM node
	 * is the appropriate memory node ... XXX */
	if (STARPU_UNLIKELY(!memory_node))
		return STARPU_MAIN_RAM;

	return *memory_node;
}
static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
{
	starpu_ssize_t limit;
	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
	char name[30];

#ifdef STARPU_SIMGRID
	totalGlobalMem = _starpu_simgrid_get_memsize("OpenCL", devid);
#elif defined(STARPU_USE_OPENCL)
	/* Request the size of the current device's memory */
	cl_int err;
	cl_ulong size;
	err = clGetDeviceInfo(devices[devid], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size), &size, NULL);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);
	totalGlobalMem = size;
#endif

	limit = starpu_get_env_number("STARPU_LIMIT_OPENCL_MEM");
	if (limit == -1)
	{
		sprintf(name, "STARPU_LIMIT_OPENCL_%u_MEM", devid);
		limit = starpu_get_env_number(name);
	}
#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
	if (limit == -1)
	{
		/* Use 90% of the available memory by default. */
		limit = totalGlobalMem / (1024*1024) * 0.9;
	}
#endif

	global_mem[devid] = limit * 1024*1024;

#ifdef STARPU_USE_OPENCL
	/* How much memory to waste ? */
	to_waste = totalGlobalMem - global_mem[devid];
#endif

	_STARPU_DEBUG("OpenCL device %d: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
		      devid, (long)to_waste/(1024*1024), (long)limit,
		      (long)totalGlobalMem/(1024*1024),
		      (long)(totalGlobalMem - to_waste)/(1024*1024));
}
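/* A usage note derived from the code above (the exact semantics beyond what
 * the code shows are an assumption): the limit is read from environment
 * variables, in MB, either globally or per device, e.g.
 *
 *   STARPU_LIMIT_OPENCL_MEM=512 ./my_app        -- all OpenCL devices
 *   STARPU_LIMIT_OPENCL_0_MEM=256 ./my_app      -- device 0 only
 *
 * When neither variable is set, 90% of the device's global memory is used. */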
int starpu_event_wait(starpu_event event)
{
	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
		return -EDEADLK;

	/* We can avoid mutex locking if event is already complete */
	if (!event->complete)
	{
		_starpu_event_lock(event);

		event->cond_wait_count += 1;
		while (!event->complete)
			pthread_cond_wait(&event->cond, &event->mutex);
		event->cond_wait_count -= 1;

		_starpu_event_unlock(event);
	}

	return 0;
}
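/* A minimal usage sketch, mirroring the commented-out pattern further below
 * in starpu_data_acquire(): submit a task ("task" is a hypothetical,
 * already-built task) with an output event, block until the event is
 * signaled, then release it. */
static void wait_for_task_sketch(struct starpu_task *task)
{
	starpu_event event;
	int ret = starpu_task_submit(task, &event);
	STARPU_ASSERT(!ret);
	starpu_event_wait(event);
	starpu_event_release(event);
}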
int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs)
{
	unsigned int dev;
	unsigned int nb_devices;

	if (!starpu_opencl_worker_get_count())
		return 0;

	nb_devices = _starpu_opencl_get_device_count();
	// Iterate over each device
	for (dev = 0; dev < nb_devices; dev++)
	{
		if (opencl_programs->programs[dev])
		{
			cl_int err;
			err = clReleaseProgram(opencl_programs->programs[dev]);
			if (STARPU_UNLIKELY(err != CL_SUCCESS))
				STARPU_OPENCL_REPORT_ERROR(err);
		}
	}
	return 0;
}
static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j)
{
	/* FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */

	struct starpu_task *task = create_task(TAG22(k, i, j));

	task->cl = &cl22;

	/* which sub-data is manipulated ? */
	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);

	if (!noprio && (i == k+1) && (j == k+1))
	{
		task->priority = STARPU_MAX_PRIO;
	}

	/* enforce dependencies ... */
	if (k > 0)
	{
		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
	}
	else
	{
		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
	}

	int n = starpu_matrix_get_nx(task->handles[0]);
	task->flops = FLOPS_SGEMM(n, n, n);

	int ret = starpu_task_submit(task);
	if (STARPU_UNLIKELY(ret == -ENODEV))
	{
		FPRINTF(stderr, "No worker may execute this task\n");
		exit(0);
	}
}
static int _starpu_opencl_get_binary_name(char *binary_file_name, size_t maxlen, const char *source_file_name, int dev, cl_device_id device)
{
	char binary_directory[1024];
	char *p;
	cl_int err;
	cl_uint vendor_id;

	_starpu_opencl_create_binary_directory(binary_directory, 1024);

	p = strrchr(source_file_name, '/');
	snprintf(binary_file_name, maxlen, "%s/%s", binary_directory, p ? p : source_file_name);

	p = strstr(binary_file_name, ".cl");
	if (p == NULL)
		p = binary_file_name + strlen(binary_file_name);

	err = clGetDeviceInfo(device, CL_DEVICE_VENDOR_ID, sizeof(vendor_id), &vendor_id, NULL);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	sprintf(p, ".%s.vendor_id_%d_device_id_%d", _starpu_opencl_get_device_type_as_string(dev), (int)vendor_id, dev);

	return CL_SUCCESS;
}
cl_int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, struct starpu_opencl_program *opencl_programs, const char *kernel_name, int devid)
{
	cl_int err;
	cl_device_id device;
	cl_program program;

	starpu_opencl_get_device(devid, &device);
	starpu_opencl_get_queue(devid, queue);

	program = opencl_programs->programs[devid];
	if (!program)
	{
		_STARPU_DISP("Program not available for device <%d>\n", devid);
		return CL_INVALID_PROGRAM;
	}

	// Create the compute kernel in the program we wish to run
	*kernel = clCreateKernel(program, kernel_name, &err);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	return CL_SUCCESS;
}
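/* A minimal usage sketch inside an OpenCL codelet implementation ("programs"
 * is a hypothetical struct starpu_opencl_program loaded earlier, and
 * "vector_scal" a hypothetical kernel name): load the kernel for the
 * worker's device, use it, then release it. */
static void opencl_codelet_sketch(void *buffers[], void *args)
{
	(void) buffers; (void) args;
	cl_kernel kernel;
	cl_command_queue queue;
	int devid = starpu_worker_get_devid(starpu_worker_get_id());

	cl_int err = starpu_opencl_load_kernel(&kernel, &queue, &programs, "vector_scal", devid);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	/* ... set the kernel arguments and enqueue the kernel on "queue" ... */

	starpu_opencl_release_kernel(kernel);
}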
int _starpu_opencl_deinit_context(int devid)
{
#ifdef STARPU_SIMGRID
	int j;
	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
	{
		task_finished[devid][j] = 0;
		STARPU_PTHREAD_MUTEX_DESTROY(&task_mutex[devid][j]);
		STARPU_PTHREAD_COND_DESTROY(&task_cond[devid][j]);
	}
#else /* !STARPU_SIMGRID */
	cl_int err;

	STARPU_PTHREAD_MUTEX_LOCK(&big_lock);

	_STARPU_DEBUG("De-initialising context for dev %d\n", devid);

	err = clReleaseContext(contexts[devid]);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clReleaseCommandQueue(queues[devid]);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clReleaseCommandQueue(in_transfer_queues[devid]);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clReleaseCommandQueue(out_transfer_queues[devid]);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clReleaseCommandQueue(peer_transfer_queues[devid]);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clReleaseCommandQueue(alloc_queues[devid]);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	contexts[devid] = NULL;

	STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
#endif
	return 0;
}
int _starpu_push_task_to_workers(struct starpu_task *task)
{
	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
	unsigned nworkers = 0;

	_STARPU_TRACE_JOB_PUSH(task, task->priority > 0);

	/* if the context still does not have workers, put the task back
	 * to its place in the empty ctx list */
	if (!sched_ctx->is_initial_sched)
	{
		/* if the workers in the ctx are not able to execute tasks,
		 * we consider the ctx empty */
		nworkers = _starpu_nworkers_able_to_execute_task(task, sched_ctx);

		if (nworkers == 0)
		{
			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->empty_ctx_mutex);
			starpu_task_list_push_back(&sched_ctx->empty_ctx_tasks, task);
			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->empty_ctx_mutex);
#ifdef STARPU_USE_SC_HYPERVISOR
			if (sched_ctx->id != 0 && sched_ctx->perf_counters != NULL
			    && sched_ctx->perf_counters->notify_empty_ctx)
			{
				_STARPU_TRACE_HYPERVISOR_BEGIN();
				sched_ctx->perf_counters->notify_empty_ctx(sched_ctx->id, task);
				_STARPU_TRACE_HYPERVISOR_END();
			}
#endif
			return -EAGAIN;
		}
	}

	_starpu_profiling_set_task_push_start_time(task);

	int ret = 0;
	if (STARPU_UNLIKELY(task->execute_on_a_specific_worker))
	{
		unsigned node = starpu_worker_get_memory_node(task->workerid);
		if (starpu_get_prefetch_flag())
			starpu_prefetch_task_input_on_node(task, node);

		ret = _starpu_push_task_on_specific_worker(task, task->workerid);
	}
	else
	{
		struct _starpu_machine_config *config = _starpu_get_machine_config();

		/* When a task can only be executed on a given arch and we have
		 * only one memory node for that arch, we can systematically
		 * prefetch before the scheduling decision. */
		if (starpu_get_prefetch_flag())
		{
			if (task->cl->where == STARPU_CPU && config->cpus_nodeid >= 0)
				starpu_prefetch_task_input_on_node(task, config->cpus_nodeid);
			else if (task->cl->where == STARPU_CUDA && config->cuda_nodeid >= 0)
				starpu_prefetch_task_input_on_node(task, config->cuda_nodeid);
			else if (task->cl->where == STARPU_OPENCL && config->opencl_nodeid >= 0)
				starpu_prefetch_task_input_on_node(task, config->opencl_nodeid);
			else if (task->cl->where == STARPU_MIC && config->mic_nodeid >= 0)
				starpu_prefetch_task_input_on_node(task, config->mic_nodeid);
			else if (task->cl->where == STARPU_SCC && config->scc_nodeid >= 0)
				starpu_prefetch_task_input_on_node(task, config->scc_nodeid);
		}

		if (!sched_ctx->sched_policy)
		{
			/* Note: we have to call this early, or else the task may
			 * have disappeared already */
			starpu_push_task_end(task);

			if (!sched_ctx->awake_workers)
				ret = _starpu_push_task_on_specific_worker(task, sched_ctx->main_master);
			else
			{
				struct starpu_worker_collection *workers = sched_ctx->workers;

				struct _starpu_job *job = _starpu_get_job_associated_to_task(task);
				job->task_size = workers->nworkers;
				job->combined_workerid = -1; /* it is a ctx, not a combined worker */
				job->active_task_alias_count = 0;

				STARPU_PTHREAD_BARRIER_INIT(&job->before_work_barrier, NULL, workers->nworkers);
				STARPU_PTHREAD_BARRIER_INIT(&job->after_work_barrier, NULL, workers->nworkers);
				job->after_work_busy_barrier = workers->nworkers;

				unsigned workerid;
				struct starpu_sched_ctx_iterator it;
				if (workers->init_iterator)
					workers->init_iterator(workers, &it);

				while (workers->has_next(workers, &it))
				{
					workerid = workers->get_next(workers, &it);
					struct starpu_task *alias = starpu_task_dup(task);
					alias->destroy = 1;
					ret |= _starpu_push_task_on_specific_worker(alias, workerid);
				}
			}
		}
		else
		{
			STARPU_ASSERT(sched_ctx->sched_policy->push_task);

			/* check whether there are any workers in the context */
			starpu_pthread_rwlock_t *changing_ctx_mutex = _starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx->id);
			STARPU_PTHREAD_RWLOCK_RDLOCK(changing_ctx_mutex);
			nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);
			if (nworkers == 0)
				ret = -1;
			else
			{
				_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
				ret = sched_ctx->sched_policy->push_task(task);
				_STARPU_TRACE_WORKER_SCHEDULING_POP;
			}
			STARPU_PTHREAD_RWLOCK_UNLOCK(changing_ctx_mutex);
		}

		if (ret == -1)
		{
			fprintf(stderr, "repush task\n");
			_STARPU_TRACE_JOB_POP(task, task->priority > 0);
			ret = _starpu_push_task_to_workers(task);
		}
	}

	/* Note: from here, the task might have been destroyed already! */
	_STARPU_LOG_OUT();
	return ret;
}
static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned reclevel)
{
	int ret;

	/* create a new codelet */
	struct starpu_task *entry_task = NULL;

	/* create all the DAG nodes */
	unsigned i, j, k;

	starpu_data_handle_t dataA;

	/* monitor and partition the A matrix into blocks :
	 * one block is now determined by 2 unsigned (i,j) */
	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));

	starpu_data_set_sequential_consistency_flag(dataA, 0);

	struct starpu_data_filter f =
	{
		.filter_func = starpu_matrix_filter_vertical_block,
		.nchildren = nblocks
	};

	struct starpu_data_filter f2 =
	{
		.filter_func = starpu_matrix_filter_block,
		.nchildren = nblocks
	};

	starpu_data_map_filters(dataA, 2, &f, &f2);

	for (k = 0; k < nbigblocks; k++)
	{
		struct starpu_task *task = create_task_11(dataA, k, reclevel);

		/* we defer the launch of the first task */
		if (k == 0)
		{
			entry_task = task;
		}
		else
		{
			ret = starpu_task_submit(task);
			if (ret == -ENODEV) return 77;
			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
		}

		for (j = k+1; j < nblocks; j++)
		{
			ret = create_task_21(dataA, k, j, reclevel);
			if (ret == -ENODEV) return 77;

			for (i = k+1; i < nblocks; i++)
			{
				if (i <= j)
				{
					ret = create_task_22(dataA, k, i, j, reclevel);
					if (ret == -ENODEV) return 77;
				}
			}
		}
	}

	/* schedule the codelet */
	ret = starpu_task_submit(entry_task);
	if (STARPU_UNLIKELY(ret == -ENODEV))
	{
		FPRINTF(stderr, "No worker may execute this task\n");
		return 77;
	}

	if (nblocks == nbigblocks)
	{
		/* stall the application until the end of computations */
		starpu_tag_wait(TAG11_AUX(nblocks-1, reclevel));
		starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
		starpu_data_unregister(dataA);
		return 0;
	}
	else
	{
		STARPU_ASSERT(reclevel == 0);

		unsigned ndeps_tags = (nblocks - nbigblocks)*(nblocks - nbigblocks);

		starpu_tag_t *tag_array = malloc(ndeps_tags*sizeof(starpu_tag_t));
		STARPU_ASSERT(tag_array);

		unsigned ind = 0;
		for (i = nbigblocks; i < nblocks; i++)
			for (j = nbigblocks; j < nblocks; j++)
			{
				if (i <= j)
					tag_array[ind++] = TAG22_AUX(nbigblocks - 1, i, j, reclevel);
			}

		starpu_tag_wait_array(ind, tag_array);

		free(tag_array);

		starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
		starpu_data_unregister(dataA);

		float *newmatA = &matA[nbigblocks*(size/nblocks)*(ld+1)];

		return cholesky_grain_rec(newmatA, size/nblocks*(nblocks - nbigblocks), ld,
					  (nblocks - nbigblocks)*2, (nblocks - nbigblocks)*2, reclevel+1);
	}
}
static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
{
	double start;
	double end;

	struct starpu_task *entry_task = NULL;

	/* create all the DAG nodes */
	unsigned i, j, k;

	start = starpu_timing_now();

	for (k = 0; k < nblocks; k++)
	{
		struct starpu_task *task = create_task_11(dataA, k);

		/* we defer the launch of the first task */
		if (k == 0)
		{
			entry_task = task;
		}
		else
		{
			int ret = starpu_task_submit(task);
			if (STARPU_UNLIKELY(ret == -ENODEV))
			{
				FPRINTF(stderr, "No worker may execute this task\n");
				exit(0);
			}
		}

		for (j = k+1; j < nblocks; j++)
		{
			create_task_21(dataA, k, j);

			for (i = k+1; i < nblocks; i++)
			{
				if (i <= j)
					create_task_22(dataA, k, i, j);
			}
		}
	}

	/* schedule the codelet */
	int ret = starpu_task_submit(entry_task);
	if (STARPU_UNLIKELY(ret == -ENODEV))
	{
		FPRINTF(stderr, "No worker may execute this task\n");
		exit(0);
	}

	/* stall the application until the end of computations */
	starpu_tag_wait(TAG11(nblocks-1));

	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);

	end = starpu_timing_now();

	double timing = end - start;

	unsigned n = starpu_matrix_get_nx(dataA);

	double flop = (1.0f*n*n*n)/3.0f;

	PRINTF("# size\tms\tGFlops\n");
	PRINTF("%u\t%.0f\t%.1f\n", n, timing/1000, (flop/timing/1000.0f));
}
int main(int argc, char **argv)
{
	unsigned i;
	float foo;
	starpu_data_handle float_array_handle;
	starpu_codelet cl;

	starpu_init(NULL);

	if (argc == 2)
		niter = atoi(argv[1]);

	foo = 0.0f;

	starpu_variable_data_register(&float_array_handle, 0 /* home node */, (uintptr_t)&foo, sizeof(float));

#ifdef STARPU_USE_OPENCL
	starpu_opencl_load_opencl_from_file("examples/basic_examples/variable_kernels_opencl_codelet.cl", &opencl_code);
#endif

	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
	cl.cpu_func = cpu_codelet;
#ifdef STARPU_USE_CUDA
	cl.cuda_func = cuda_codelet;
#endif
#ifdef STARPU_USE_OPENCL
	cl.opencl_func = opencl_codelet;
#endif
	cl.nbuffers = 1;
	cl.model = NULL;

	for (i = 0; i < niter; i++)
	{
		struct starpu_task *task = starpu_task_create();
		int ret;

		task->cl = &cl;
		task->callback_func = NULL;

		task->buffers[0].handle = float_array_handle;
		task->buffers[0].mode = STARPU_RW;

		ret = starpu_task_submit(task, NULL);
		if (STARPU_UNLIKELY(ret == -ENODEV))
		{
			fprintf(stderr, "No worker may execute this task\n");
			exit(0);
		}
	}

	starpu_task_wait_for_all();

	/* update the array in RAM */
	starpu_data_acquire(float_array_handle, STARPU_R);

	fprintf(stderr, "variable -> %f\n", foo);

	starpu_data_release(float_array_handle);

	starpu_shutdown();

	return 0;
}
int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle handle, unsigned node, unsigned async, starpu_access_mode mode)
{
	STARPU_ASSERT(handle);

	/* it is forbidden to call this function from a callback or a codelet */
	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
		return -EDEADLK;

	struct user_interaction_wrapper wrapper =
	{
		.handle = handle,
		.node = node,
		.async = async,
		.cond = PTHREAD_COND_INITIALIZER,
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.finished = 0
	};

	if (!_starpu_attempt_to_submit_data_request_from_apps(handle, mode, _prefetch_data_on_node, &wrapper))
	{
		/* we can immediately proceed */
		_starpu_fetch_data_on_node(handle, node, mode, async, NULL, NULL);

		/* remove the "lock"/reference */
		if (!async)
		{
			_starpu_spin_lock(&handle->header_lock);
			_starpu_notify_data_dependencies(handle);
			_starpu_spin_unlock(&handle->header_lock);
		}
	}
	else
	{
		PTHREAD_MUTEX_LOCK(&wrapper.lock);
		while (!wrapper.finished)
			PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
		PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
	}

	return 0;
}

int starpu_data_prefetch_on_node(starpu_data_handle handle, unsigned node, unsigned async)
{
	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R);
}

/*
 * It is possible to specify that a piece of data can be discarded without
 * impacting the application.
 */
void starpu_data_advise_as_important(starpu_data_handle handle, unsigned is_important)
{
	_starpu_spin_lock(&handle->header_lock);

	/* first take all the children locks (in order !) */
	unsigned child;
	for (child = 0; child < handle->nchildren; child++)
	{
		/* make sure the intermediate children are advised as well */
		struct starpu_data_state_t *child_handle = &handle->children[child];
		if (child_handle->nchildren > 0)
			starpu_data_advise_as_important(child_handle, is_important);
	}

	handle->is_not_important = !is_important;

	/* now the parent may be used again so we release the lock */
	_starpu_spin_unlock(&handle->header_lock);
}
/* The data must be released by calling starpu_data_release later on */
int starpu_data_acquire(starpu_data_handle handle, starpu_access_mode mode)
{
	STARPU_ASSERT(handle);

	/* it is forbidden to call this function from a callback or a codelet */
	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
		return -EDEADLK;

	struct user_interaction_wrapper wrapper =
	{
		.handle = handle,
		.mode = mode,
		.node = 0, // unused
		.cond = PTHREAD_COND_INITIALIZER,
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.finished = 0
	};

//	_STARPU_DEBUG("TAKE sequential_consistency_mutex starpu_data_acquire\n");
	PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
	int sequential_consistency = handle->sequential_consistency;
	if (sequential_consistency)
	{
		wrapper.pre_sync_task = starpu_task_create();
		wrapper.post_sync_task = starpu_task_create();

#ifdef STARPU_USE_FXT
		starpu_job_t job = _starpu_get_job_associated_to_task(wrapper.pre_sync_task);
		job->model_name = "acquire_pre";
		job = _starpu_get_job_associated_to_task(wrapper.post_sync_task);
		job->model_name = "acquire_post";
#endif

		_starpu_detect_implicit_data_deps_with_handle(wrapper.pre_sync_task, wrapper.post_sync_task, handle, mode);
		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);

		/* TODO detect if this is superfluous */
		wrapper.pre_sync_task->synchronous = 1;
		int ret = starpu_task_submit(wrapper.pre_sync_task, NULL);
		STARPU_ASSERT(!ret);
		/* starpu_event event;
		int ret = starpu_task_submit(wrapper.pre_sync_task, &event);
		STARPU_ASSERT(!ret);
		starpu_event_wait(event);
		starpu_event_release(event); */
	}
	else
	{
		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
	}

	/* we try to get the data; if we do not succeed immediately, we set a
	 * callback function that will be executed automatically when the data is
	 * available again, otherwise we fetch the data directly */
	if (!_starpu_attempt_to_submit_data_request_from_apps(handle, mode, _starpu_data_acquire_continuation, &wrapper))
	{
		/* no one has locked this data yet, so we proceed immediately */
		int ret = _starpu_fetch_data_on_node(handle, 0, mode, 0, NULL, NULL);
		STARPU_ASSERT(!ret);
	}
	else
	{
		PTHREAD_MUTEX_LOCK(&wrapper.lock);
		while (!wrapper.finished)
			PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
		PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
	}

	/* At that moment, the caller holds a reference to the piece of data.
	 * We enqueue the "post" sync task in the list associated to the handle
	 * so that it is submitted by the starpu_data_release function. */
	_starpu_add_post_sync_tasks(wrapper.post_sync_task, handle);

	return 0;
}

/* This function must be called after starpu_data_acquire so that the
 * application releases the data */
void starpu_data_release(starpu_data_handle handle)
{
	STARPU_ASSERT(handle);

	/* The application can now release the rw-lock */
	_starpu_release_data_on_node(handle, 0, 0);

	/* In case there are some implicit dependencies, unlock the "post sync" tasks */
	_starpu_unlock_post_sync_tasks(handle);
}
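/* A minimal usage sketch of the acquire/release pair ("vector_handle",
 * "vector" and "n" are hypothetical): take the data in STARPU_RW mode so the
 * application can modify it directly, then hand it back to StarPU. */
static void scale_vector_sketch(starpu_data_handle vector_handle, float *vector, unsigned n)
{
	unsigned i;

	starpu_data_acquire(vector_handle, STARPU_RW);
	for (i = 0; i < n; i++)
		vector[i] *= 2.0f;
	starpu_data_release(vector_handle);
}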
static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
				    struct _starpu_data_replicate *src_replicate,
				    struct _starpu_data_replicate *dst_replicate,
				    struct _starpu_data_request *req)
{
	unsigned src_node = src_replicate->memory_node;
	unsigned dst_node = dst_replicate->memory_node;

	STARPU_ASSERT(src_replicate->refcnt);
	STARPU_ASSERT(dst_replicate->refcnt);

	STARPU_ASSERT(src_replicate->allocated);
	STARPU_ASSERT(dst_replicate->allocated);

	_starpu_comm_amounts_inc(src_node, dst_node, handle->ops->get_size(handle));

#ifdef STARPU_SIMGRID
	return _starpu_simgrid_transfer(handle->ops->get_size(handle), src_node, dst_node, req);
#else /* !SIMGRID */

	int ret = 0;

	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;

	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);

#ifdef STARPU_USE_CUDA
	cudaError_t cures;
	cudaStream_t stream;
#endif

	void *src_interface = src_replicate->data_interface;
	void *dst_interface = dst_replicate->data_interface;

#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
	{
		unsigned devid;
		if ((src_kind == STARPU_CUDA_RAM) && (dst_kind == STARPU_CUDA_RAM))
		{
			/* GPU-GPU transfer, issue it from the device we are supposed to drive */
			int worker = starpu_worker_get_id();
			devid = starpu_worker_get_devid(worker);
		}
		else
		{
			unsigned node = (dst_kind == STARPU_CUDA_RAM) ? dst_node : src_node;
			devid = _starpu_memory_node_get_devid(node);
		}
		starpu_cuda_set_device(devid);
	}
#endif

	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind, dst_kind))
	{
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM, STARPU_CPU_RAM):
		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
		if (copy_methods->ram_to_ram)
			copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
		break;
#ifdef STARPU_USE_CUDA
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM, STARPU_CPU_RAM):
		/* only the proper CUBLAS thread can initiate this directly ! */
#if !defined(HAVE_CUDA_MEMCPY_PEER)
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
#endif
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
		    !(copy_methods->cuda_to_ram_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->cuda_to_ram || copy_methods->any_to_any);
			if (copy_methods->cuda_to_ram)
				copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_local_out_transfer_stream();
			if (copy_methods->cuda_to_ram_async)
				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM, STARPU_CUDA_RAM):
		/* STARPU_CPU_RAM -> CUBLAS_RAM */
		/* only the proper CUBLAS thread can initiate this ! */
#if !defined(HAVE_CUDA_MEMCPY_PEER)
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
#endif
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
		    !(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->ram_to_cuda || copy_methods->any_to_any);
			if (copy_methods->ram_to_cuda)
				copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_local_in_transfer_stream();
			if (copy_methods->ram_to_cuda_async)
				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM, STARPU_CUDA_RAM):
		/* CUDA - CUDA transfer */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
		    !(copy_methods->cuda_to_cuda_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->cuda_to_cuda)
				copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_peer_transfer_stream(src_node, dst_node);
			if (copy_methods->cuda_to_cuda_async)
				ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
#endif
#ifdef STARPU_USE_OPENCL
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM, STARPU_CPU_RAM):
		/* OpenCL -> RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
		    !(copy_methods->opencl_to_ram_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->opencl_to_ram || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->opencl_to_ram)
				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->opencl_to_ram_async)
				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM, STARPU_OPENCL_RAM):
		/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
		    !(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->ram_to_opencl || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->ram_to_opencl)
				copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->ram_to_opencl_async)
				ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM, STARPU_OPENCL_RAM):
		/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node || _starpu_memory_node_get_local_key() == src_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
		    !(copy_methods->opencl_to_opencl_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->opencl_to_opencl || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->opencl_to_opencl)
				copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->opencl_to_opencl_async)
				ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
#endif
#ifdef STARPU_USE_MIC
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM, STARPU_MIC_RAM):
		/* RAM -> MIC */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
		    !(copy_methods->ram_to_mic_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->ram_to_mic || copy_methods->any_to_any);
			if (copy_methods->ram_to_mic)
				copy_methods->ram_to_mic(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_MIC_RAM;
			if (copy_methods->ram_to_mic_async)
				ret = copy_methods->ram_to_mic_async(src_interface, src_node, dst_interface, dst_node);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
			_starpu_mic_init_event(&(req->async_channel.event.mic_event), dst_node);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_MIC_RAM, STARPU_CPU_RAM):
		/* MIC -> RAM */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
		    !(copy_methods->mic_to_ram_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->mic_to_ram || copy_methods->any_to_any);
			if (copy_methods->mic_to_ram)
				copy_methods->mic_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_MIC_RAM;
			if (copy_methods->mic_to_ram_async)
				ret = copy_methods->mic_to_ram_async(src_interface, src_node, dst_interface, dst_node);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
			_starpu_mic_init_event(&(req->async_channel.event.mic_event), src_node);
		}
		break;
#endif
#ifdef STARPU_USE_SCC
	/* SCC RAM associated to the master process is considered as
	 * the main memory node. */
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM, STARPU_SCC_RAM):
		/* master private SCC RAM -> slave private SCC RAM */
		if (copy_methods->scc_src_to_sink)
			copy_methods->scc_src_to_sink(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM, STARPU_CPU_RAM):
		/* slave private SCC RAM -> master private SCC RAM */
		if (copy_methods->scc_sink_to_src)
			copy_methods->scc_sink_to_src(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM, STARPU_SCC_RAM):
		/* slave private SCC RAM -> slave private SCC RAM */
		if (copy_methods->scc_sink_to_sink)
			copy_methods->scc_sink_to_sink(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
#endif
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM, STARPU_DISK_RAM):
		if (copy_methods->any_to_any)
			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node,
						       req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
		else
		{
			void *obj = starpu_data_handle_to_pointer(handle, dst_node);
			void *ptr = NULL;
			starpu_ssize_t size = 0;

			handle->ops->pack_data(handle, src_node, &ptr, &size);
			ret = _starpu_disk_full_write(src_node, dst_node, obj, ptr, size, &req->async_channel);
			if (ret == 0)
				/* write is already finished, ptr was allocated in pack_data */
				free(ptr);

			/* For now, asynchronous is not supported */
			STARPU_ASSERT(ret == 0);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM, STARPU_CPU_RAM):
		if (copy_methods->any_to_any)
			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node,
						       req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
		else
		{
			void *obj = starpu_data_handle_to_pointer(handle, src_node);
			void *ptr = NULL;
			size_t size = 0;

			ret = _starpu_disk_full_read(src_node, dst_node, obj, &ptr, &size, &req->async_channel);
			if (ret == 0)
			{
				/* read is already finished, we can already unpack */
				handle->ops->unpack_data(handle, dst_node, ptr, size);
				/* ptr is allocated in full_read */
				free(ptr);
			}

			/* For now, asynchronous is not supported */
			STARPU_ASSERT(ret == 0);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM, STARPU_DISK_RAM):
		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
		break;
	default:
		STARPU_ABORT();
		break;
	}

	return ret;
#endif /* !SIMGRID */
}
static int _starpu_opencl_compile_or_load_opencl_from_string(const char *opencl_program_source, const char *build_options,
							     struct starpu_opencl_program *opencl_programs, const char *source_file_name)
{
	unsigned int dev;
	unsigned int nb_devices;

	nb_devices = _starpu_opencl_get_device_count();
	// Iterate over each device
	for (dev = 0; dev < nb_devices; dev++)
	{
		cl_device_id device;
		cl_context context;
		cl_program program;
		cl_int err;

		if (opencl_programs)
			opencl_programs->programs[dev] = NULL;

		starpu_opencl_get_device(dev, &device);
		starpu_opencl_get_context(dev, &context);
		if (context == NULL)
		{
			_STARPU_DEBUG("[%u] is not a valid OpenCL context\n", dev);
			continue;
		}

		// Create the compute program from the source buffer
		program = clCreateProgramWithSource(context, 1, (const char **) &opencl_program_source, NULL, &err);
		if (!program || err != CL_SUCCESS)
		{
			_STARPU_DISP("Error: Failed to load program source with options %s!\n", build_options);
			return EXIT_FAILURE;
		}

		// Build the program executable
		err = clBuildProgram(program, 1, &device, build_options, NULL, NULL);

		// Get the status
		{
			cl_build_status status;
			size_t len;
			static char buffer[4096] = "";

			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
			if (len > 2)
				_STARPU_DISP("Compilation output\n%s\n", buffer);

			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS, sizeof(status), &status, NULL);
			if (err != CL_SUCCESS || status != CL_BUILD_SUCCESS)
			{
				_STARPU_DISP("Error: Failed to build program executable!\n");
				_STARPU_DISP("clBuildProgram: %d - clGetProgramBuildInfo: %d\n", err, status);
				return EXIT_FAILURE;
			}
		}

		// Store program
		if (opencl_programs)
			opencl_programs->programs[dev] = program;
		else
		{
			char binary_file_name[1024];
			char *binary;
			size_t binary_len;
			FILE *fh;

			err = _starpu_opencl_get_binary_name(binary_file_name, 1024, source_file_name, dev, device);
			if (STARPU_UNLIKELY(err != CL_SUCCESS))
				STARPU_OPENCL_REPORT_ERROR(err);

			err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_len, NULL);
			if (STARPU_UNLIKELY(err != CL_SUCCESS))
				STARPU_OPENCL_REPORT_ERROR(err);
			binary = malloc(binary_len);

			err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(binary), &binary, NULL);
			if (STARPU_UNLIKELY(err != CL_SUCCESS))
				STARPU_OPENCL_REPORT_ERROR(err);

			fh = fopen(binary_file_name, "w");
			if (fh == NULL)
			{
				_STARPU_DISP("Error: Failed to open file <%s>\n", binary_file_name);
				perror("fopen");
				return EXIT_FAILURE;
			}
			fwrite(binary, binary_len, 1, fh);
			fclose(fh);
			free(binary);
			_STARPU_DEBUG("File <%s> created\n", binary_file_name);
		}
	}
	return EXIT_SUCCESS;
}
int starpu_opencl_load_binary_opencl(const char *kernel_id, struct starpu_opencl_program *opencl_programs)
{
	unsigned int dev;
	unsigned int nb_devices;

	nb_devices = _starpu_opencl_get_device_count();
	// Iterate over each device
	for (dev = 0; dev < nb_devices; dev++)
	{
		cl_device_id device;
		cl_context context;
		cl_program program;
		cl_int err;

		char *binary;
		char binary_file_name[1024];
		size_t length;
		cl_int binary_status;

		opencl_programs->programs[dev] = NULL;

		starpu_opencl_get_device(dev, &device);
		starpu_opencl_get_context(dev, &context);
		if (context == NULL)
		{
			_STARPU_DEBUG("[%u] is not a valid OpenCL context\n", dev);
			continue;
		}

		// Load the binary buffer
		err = _starpu_opencl_get_binary_name(binary_file_name, 1024, kernel_id, dev, device);
		if (STARPU_UNLIKELY(err != CL_SUCCESS))
			STARPU_OPENCL_REPORT_ERROR(err);
		binary = _starpu_opencl_load_program_binary(binary_file_name, &length);

		// Create the compute program from the binary buffer
		program = clCreateProgramWithBinary(context, 1, &device, &length,
						    (const unsigned char **) &binary, &binary_status, &err);
		if (!program || err != CL_SUCCESS)
		{
			_STARPU_DISP("Error: Failed to load program binary!\n");
			return EXIT_FAILURE;
		}

		// Build the program executable
		err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);

		// Get the status
		{
			cl_build_status status;
			size_t len;
			static char buffer[4096] = "";

			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
			if (len > 2)
				_STARPU_DISP("Compilation output\n%s\n", buffer);

			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS, sizeof(status), &status, NULL);
			if (err != CL_SUCCESS || status != CL_BUILD_SUCCESS)
			{
				_STARPU_DISP("Error: Failed to build program executable!\n");
				_STARPU_DISP("clBuildProgram: %d - clGetProgramBuildInfo: %d\n", err, status);
				return EXIT_FAILURE;
			}
		}

		// Store program
		opencl_programs->programs[dev] = program;
	}
	return 0;
}
int main(int argc, char **argv)
{
	int ret;
	unsigned part;
	double timing;
	double start, end;
	unsigned row, pos;
	unsigned ind;

	/* CSR matrix description */
	float *nzval;
	uint32_t nnz;
	uint32_t *colind;
	uint32_t *rowptr;

	/* Input and Output vectors */
	float *vector_in_ptr;
	float *vector_out_ptr;

	/*
	 *	Parse command-line arguments
	 */
	parse_args(argc, argv);

	/*
	 *	Launch StarPU
	 */
	ret = starpu_init(NULL);
	if (ret == -ENODEV)
		return 77;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	/*
	 *	Create a 3-band sparse matrix as input example
	 */
	nnz = 3*size-2;
	starpu_malloc((void **)&nzval, nnz*sizeof(float));
	starpu_malloc((void **)&colind, nnz*sizeof(uint32_t));
	starpu_malloc((void **)&rowptr, (size+1)*sizeof(uint32_t));
	assert(nzval && colind && rowptr);

	/* fill the matrix */
	for (row = 0, pos = 0; row < size; row++)
	{
		rowptr[row] = pos;

		if (row > 0)
		{
			nzval[pos] = 1.0f;
			colind[pos] = row-1;
			pos++;
		}

		nzval[pos] = 5.0f;
		colind[pos] = row;
		pos++;

		if (row < size - 1)
		{
			nzval[pos] = 1.0f;
			colind[pos] = row+1;
			pos++;
		}
	}

	STARPU_ASSERT(pos == nnz);

	rowptr[size] = nnz;

	/* initiate the 2 vectors */
	starpu_malloc((void **)&vector_in_ptr, size*sizeof(float));
	starpu_malloc((void **)&vector_out_ptr, size*sizeof(float));
	assert(vector_in_ptr && vector_out_ptr);

	/* fill them */
	for (ind = 0; ind < size; ind++)
	{
		vector_in_ptr[ind] = 2.0f;
		vector_out_ptr[ind] = 0.0f;
	}

	/*
	 *	Register the CSR matrix and the 2 vectors
	 */
	starpu_csr_data_register(&sparse_matrix, STARPU_MAIN_RAM, nnz, size, (uintptr_t)nzval, colind, rowptr, 0, sizeof(float));
	starpu_vector_data_register(&vector_in, STARPU_MAIN_RAM, (uintptr_t)vector_in_ptr, size, sizeof(float));
	starpu_vector_data_register(&vector_out, STARPU_MAIN_RAM, (uintptr_t)vector_out_ptr, size, sizeof(float));

	/*
	 *	Partition the CSR matrix and the output vector
	 */
	csr_f.nchildren = nblocks;
	vector_f.nchildren = nblocks;
	starpu_data_partition(sparse_matrix, &csr_f);
	starpu_data_partition(vector_out, &vector_f);

	/*
	 *	If we use OpenCL, we need to compile the SpMV kernel
	 */
#ifdef STARPU_USE_OPENCL
	compile_spmv_opencl_kernel();
#endif

	start = starpu_timing_now();

	/*
	 *	Create and submit StarPU tasks
	 */
	for (part = 0; part < nblocks; part++)
	{
		struct starpu_task *task = starpu_task_create();
		task->cl = &spmv_cl;

		task->handles[0] = starpu_data_get_sub_data(sparse_matrix, 1, part);
		task->handles[1] = vector_in;
		task->handles[2] = starpu_data_get_sub_data(vector_out, 1, part);

		ret = starpu_task_submit(task);
		if (STARPU_UNLIKELY(ret == -ENODEV))
		{
			FPRINTF(stderr, "No worker may execute this task\n");
			exit(0);
		}
	}

	starpu_task_wait_for_all();
	end = starpu_timing_now();

	/*
	 *	Unpartition the CSR matrix and the output vector
	 */
	starpu_data_unpartition(sparse_matrix, STARPU_MAIN_RAM);
	starpu_data_unpartition(vector_out, STARPU_MAIN_RAM);

	/*
	 *	Unregister data
	 */
	starpu_data_unregister(sparse_matrix);
	starpu_data_unregister(vector_in);
	starpu_data_unregister(vector_out);

	/*
	 *	Display the result
	 */
	for (row = 0; row < STARPU_MIN(size, 16); row++)
	{
		FPRINTF(stdout, "%2.2f\t%2.2f\n", vector_in_ptr[row], vector_out_ptr[row]);
	}

	starpu_free(nzval);
	starpu_free(colind);
	starpu_free(rowptr);
	starpu_free(vector_in_ptr);
	starpu_free(vector_out_ptr);

	/*
	 *	Stop StarPU
	 */
	starpu_shutdown();

	timing = end - start;
	FPRINTF(stderr, "Computation took (in ms)\n");
	FPRINTF(stdout, "%2.2f\n", timing/1000);

	return 0;
}