/* CUBLAS wrapper for the U11 update: run the shared kernel with the CUDA
 * implementation (arch 1), then account one U11 task to this worker. */
void dw_cublas_codelet_update_u11(void *descr[], void *_args)
{
	dw_common_codelet_update_u11(descr, 1, _args);

	int worker = starpu_worker_get_id();
	count_11_per_worker[worker]++;
}
/* Host-side launcher for the OpenCL "shadow" kernel on block bz.
 * Fix: the eight clSetKernelArg return codes were all dropped — accumulate
 * and check them; also remove the unused cl_event local (no completion
 * event is requested, the caller is responsible for queue synchronization). */
void opencl_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, int i)
{
#if 0
	size_t dim[] = {nx, ny, nz};
#else
	size_t dim[] = {nx, ny, 1};
#endif
	int devid, id;
	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	cl_kernel kernel;
	cl_command_queue cq;
	starpu_opencl_load_kernel(&kernel, &cq, &program, "shadow", devid);

	/* Accumulate the error codes so a failing argument is not silently
	 * ignored (CL_SUCCESS is 0, so OR-ing preserves any failure). */
	cl_int err;
	err  = clSetKernelArg(kernel, 0, sizeof(bz), &bz);
	err |= clSetKernelArg(kernel, 1, sizeof(ptr), &ptr);
	err |= clSetKernelArg(kernel, 2, sizeof(nx), &nx);
	err |= clSetKernelArg(kernel, 3, sizeof(ny), &ny);
	err |= clSetKernelArg(kernel, 4, sizeof(nz), &nz);
	err |= clSetKernelArg(kernel, 5, sizeof(ldy), &ldy);
	err |= clSetKernelArg(kernel, 6, sizeof(ldz), &ldz);
	err |= clSetKernelArg(kernel, 7, sizeof(i), &i);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clEnqueueNDRangeKernel(cq, kernel, 3, NULL, dim, NULL, 0, NULL, NULL);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);
}
/* Per-task completion callback: credit this task's flop count and
 * load/store count to the worker that ran it. */
void callback_func(void *arg)
{
	int worker = starpu_worker_get_id();

	flop_per_worker[worker] += BLAS3_FLOP(conf.m, conf.n, conf.k);
	ls_per_worker[worker] += BLAS3_LS(conf.m, conf.n, conf.k);
}
/* CPU wrapper for the U21 update: run the shared kernel with the CPU
 * implementation (arch 0), then account one U21 task to this worker. */
void dw_cpu_codelet_update_u21(void *descr[], void *_args)
{
	dw_common_codelet_update_u21(descr, 0, _args);

	int worker = starpu_worker_get_id();
	count_21_per_worker[worker]++;
}
void opencl_codelet(void *descr[], void *_args) { cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[0]); cl_kernel kernel; cl_command_queue queue; cl_event event; int id, devid, err; id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "incrementer", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(val), &val); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=4; size_t local, s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local=global; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); starpu_opencl_release_kernel(kernel); } }
/* OpenCL codelet launching the "incC" kernel on the vector of descr[0]
 * with a fixed 100-work-item range (one work-group).
 * Fixes: remove the unused cl_event local, and declare err as cl_int to
 * match the OpenCL API return type. */
void opencl_codelet_incC(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
{
	STARPU_SKIP_IF_VALGRIND;

	cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[0]);
	cl_kernel kernel;
	cl_command_queue queue;
	int id, devid;
	cl_int err;

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "incC", devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global = 100;
		size_t local = 100;
		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}
	starpu_opencl_release_kernel(kernel);
}
static int ws_push_task(struct starpu_task *task) { unsigned sched_ctx_id = task->sched_ctx; struct _starpu_work_stealing_data *ws = (struct _starpu_work_stealing_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id); struct _starpu_deque_jobq *deque_queue; struct _starpu_job *j = _starpu_get_job_associated_to_task(task); int workerid = starpu_worker_get_id(); unsigned worker = 0; struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id); struct starpu_sched_ctx_iterator it; workers->init_iterator(workers, &it); /* !! C'est ballot de tout locker! */ while(workers->has_next(workers, &it)) { worker = workers->get_next(workers, &it); starpu_pthread_mutex_t *sched_mutex; starpu_pthread_cond_t *sched_cond; starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond); STARPU_PTHREAD_MUTEX_LOCK(sched_mutex); } /* If the current thread is not a worker but * the main thread (-1), we find the better one to * put task on its queue */ if (workerid == -1) workerid = select_worker(sched_ctx_id); deque_queue = ws->queue_array[workerid]; #ifdef HAVE_AYUDAME_H if (AYU_event) { intptr_t id = workerid; AYU_event(AYU_ADDTASKTOQUEUE, j->job_id, &id); } #endif _starpu_job_list_push_back(&deque_queue->jobq, j); deque_queue->njobs++; starpu_push_task_end(task); while(workers->has_next(workers, &it)) { worker = workers->get_next(workers, &it); starpu_pthread_mutex_t *sched_mutex; starpu_pthread_cond_t *sched_cond; starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond); #ifndef STARPU_NON_BLOCKING_DRIVERS STARPU_PTHREAD_COND_SIGNAL(sched_cond); #endif STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex); } return 0; }
/* Body of an OpenMP master region: report the worker id and thread id of
 * the thread elected master. */
void master_g(void *arg)
{
	(void) arg;

	pthread_t tid = pthread_self();
	int worker_id = starpu_worker_get_id();

	printf("[tid %p] task thread = %d -- master\n", (void *)tid, worker_id);
}
/* Body of an explicit OpenMP task: print which worker thread runs task
 * "g[i]", where i is the task number smuggled through the args pointer. */
void task_region_g(void *buffers[], void *args)
{
	(void) buffers;

	int i = (int)(intptr_t) args;
	pthread_t tid = pthread_self();
	int worker_id = starpu_worker_get_id();

	printf("[tid %p] task thread = %d: explicit task \"g[%d]\"\n", (void *)tid, worker_id, i);
}
/* StarPU task body creating the OpenCL kernel object for one device.
 * `data` is the SOCL kernel wrapper; each worker fills the cl_kernels slot
 * matching its own device range. */
static void soclCreateKernel_task(void *data)
{
	struct _cl_kernel *k = (struct _cl_kernel *)data;
	int range = starpu_worker_get_range();
	cl_int err;

	if (k->program->cl_programs[range] == NULL)
	{
		/* Program was not built for this device: nothing to create;
		 * record success so the caller does not see a stale error. */
		k->errcodes[range] = CL_SUCCESS;
		DEBUG_MSG("[Device %d] Kernel creation skipped: program has not been built for this device.\n", starpu_worker_get_id());
		return;
	}

	DEBUG_MSG("[Device %d] Creating kernel...\n", starpu_worker_get_id());
	k->cl_kernels[range] = clCreateKernel(k->program->cl_programs[range], k->kernel_name, &err);
	if (err != CL_SUCCESS)
	{
		k->errcodes[range] = err;
		ERROR_STOP("[Device %d] Unable to create kernel. Error %d. Aborting.\n", starpu_worker_get_id(), err);
		return;
	}

	/* One worker creates argument structures: the compare-and-swap on
	 * num_args (0 -> sentinel 666) elects exactly one winner among the
	 * workers running this task concurrently. */
	if (__sync_bool_compare_and_swap(&k->num_args, 0, 666))
	{
		unsigned int i;
		cl_uint num_args;

		err = clGetKernelInfo(k->cl_kernels[range], CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL);
		if (err != CL_SUCCESS)
		{
			DEBUG_CL("clGetKernelInfo", err);
			ERROR_STOP("Unable to get kernel argument count. Aborting.\n");
		}
		/* Overwrites the 666 sentinel with the real argument count. */
		k->num_args = num_args;
		DEBUG_MSG("Kernel has %d arguments\n", num_args);

		/* NOTE(review): these malloc results are not checked before the
		 * initialization loop below — confirm whether OOM is handled
		 * elsewhere in this code base. */
		k->arg_size = (size_t*)malloc(sizeof(size_t) * num_args);
		k->arg_value = (void**)malloc(sizeof(void*) * num_args);
		k->arg_type = (enum kernel_arg_type*)malloc(sizeof(enum kernel_arg_type) * num_args);

		/* Settings default type to NULL */
		for (i=0; i<num_args; i++)
		{
			k->arg_value[i] = NULL;
			k->arg_type[i] = Null;
		}
	}
}
/* Body of the parallel region: each implicit task prints an entry banner,
 * takes part in four consecutive master regions (only the elected master
 * actually runs master_g), then prints an exit banner. */
void parallel_region_f(void *buffers[], void *args)
{
	(void) buffers;
	(void) args;

	pthread_t tid = pthread_self();
	int worker_id = starpu_worker_get_id();
	int m;

	printf("[tid %p] task thread = %d -- parallel -->\n", (void *)tid, worker_id);
	for (m = 0; m < 4; m++)
		starpu_omp_master(master_g, NULL);
	printf("[tid %p] task thread = %d -- parallel <--\n", (void *)tid, worker_id);
}
void axpy_opencl(void *buffers[], void *_args) { TYPE *alpha = _args; int id, devid; cl_int err; cl_kernel kernel; cl_command_queue queue; cl_event event; unsigned n = STARPU_VECTOR_GET_NX(buffers[0]); cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]); cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]); id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_axpy_opencl", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(x), &x); err|= clSetKernelArg(kernel, 1, sizeof(y), &y); err|= clSetKernelArg(kernel, 2, sizeof(n), &n); err|= clSetKernelArg(kernel, 3, sizeof(*alpha), alpha); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=n; size_t local; size_t s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local=global; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } starpu_opencl_release_kernel(kernel); }
static void common_data_cpy_func(void *descr[], void *cl_arg) { unsigned interface_id = *(unsigned *)cl_arg; const struct starpu_data_interface_ops *interface_ops = _starpu_data_interface_get_ops(interface_id); const struct starpu_data_copy_methods *copy_methods = interface_ops->copy_methods; int workerid = starpu_worker_get_id(); enum starpu_worker_archtype type = starpu_worker_get_type(workerid); unsigned memory_node = starpu_worker_get_memory_node(workerid); void *dst_interface = descr[0]; void *src_interface = descr[1]; switch (type) { case STARPU_CPU_WORKER: if (copy_methods->ram_to_ram) { copy_methods->ram_to_ram(src_interface, memory_node, dst_interface, memory_node); return; } break; case STARPU_CUDA_WORKER: if (copy_methods->cuda_to_cuda) { copy_methods->cuda_to_cuda(src_interface, memory_node, dst_interface, memory_node); return; } break; case STARPU_OPENCL_WORKER: if (copy_methods->opencl_to_opencl) { copy_methods->opencl_to_opencl(src_interface, memory_node, dst_interface, memory_node); return; } break; default: /* unknown architecture */ STARPU_ABORT(); } STARPU_ASSERT(copy_methods->any_to_any); copy_methods->any_to_any(src_interface, memory_node, dst_interface, memory_node, NULL); }
void redux_opencl_func(void *buffers[], void *args) { int id, devid; cl_int err; cl_kernel kernel; cl_command_queue queue; cl_event event; cl_mem dota = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]); cl_mem dotb = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[1]); id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &_opencl_program, "_redux_opencl", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(dota), &dota); err|= clSetKernelArg(kernel, 1, sizeof(dotb), &dotb); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=1; size_t local; size_t s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local=global; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } starpu_opencl_release_kernel(kernel); }
/* Pull a task from the first parent component able to provide one, then
 * select the best implementation of that task for the calling worker. */
static struct starpu_task * best_implementation_pull_task(struct starpu_sched_component * component)
{
	struct starpu_task * task = NULL;
	int p;

	for (p = 0; p < component->nparents && !task; p++)
	{
		struct starpu_sched_component *parent = component->parents[p];
		if (parent != NULL)
			task = parent->pull_task(parent);
	}

	if (task)
		/* this worker can execute this task as it was returned by a pop */
		(void)find_best_impl(component->tree->sched_ctx_id, task, starpu_worker_get_id());

	return task;
}
void opencl_codelet(void *descr[], void *_args) { cl_kernel kernel; cl_command_queue queue; int id, devid, err, n; float *block = (float *)STARPU_BLOCK_GET_PTR(descr[0]); int nx = (int)STARPU_BLOCK_GET_NX(descr[0]); int ny = (int)STARPU_BLOCK_GET_NY(descr[0]); int nz = (int)STARPU_BLOCK_GET_NZ(descr[0]); unsigned ldy = STARPU_BLOCK_GET_LDY(descr[0]); unsigned ldz = STARPU_BLOCK_GET_LDZ(descr[0]); float *multiplier = (float *)_args; id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "block", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = 0; n=0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &block); err = clSetKernelArg(kernel, 1, sizeof(int), &nx); err = clSetKernelArg(kernel, 2, sizeof(int), &ny); err = clSetKernelArg(kernel, 3, sizeof(int), &nz); err = clSetKernelArg(kernel, 4, sizeof(ldy), &ldy); err = clSetKernelArg(kernel, 5, sizeof(ldz), &ldz); err = clSetKernelArg(kernel, 6, sizeof(float), multiplier); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=nx*ny*nz; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } clFinish(queue); starpu_opencl_release_kernel(kernel); }
/* OpenCL codelet scaling a 3D block by *multiplier. Argument-setting
 * errors are checked by the CHECK_CL_SET_KERNEL_ARG helper macro; the
 * completion event is collected for StarPU's OpenCL statistics. */
void opencl_codelet(void *descr[], void *_args)
{
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;
	int id, devid, err;
	cl_mem block = (cl_mem)STARPU_BLOCK_GET_DEV_HANDLE(descr[0]);
	int nx = (int)STARPU_BLOCK_GET_NX(descr[0]);
	int ny = (int)STARPU_BLOCK_GET_NY(descr[0]);
	int nz = (int)STARPU_BLOCK_GET_NZ(descr[0]);
	int ldy = (int)STARPU_BLOCK_GET_LDY(descr[0]);
	int ldz = (int) STARPU_BLOCK_GET_LDZ(descr[0]);
	float *multiplier = (float *)_args;

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "block", devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	CHECK_CL_SET_KERNEL_ARG(kernel, 0, sizeof(block), &block);
	CHECK_CL_SET_KERNEL_ARG(kernel, 1, sizeof(nx), &nx);
	CHECK_CL_SET_KERNEL_ARG(kernel, 2, sizeof(ny), &ny);
	CHECK_CL_SET_KERNEL_ARG(kernel, 3, sizeof(nz), &nz);
	CHECK_CL_SET_KERNEL_ARG(kernel, 4, sizeof(ldy), &ldy);
	CHECK_CL_SET_KERNEL_ARG(kernel, 5, sizeof(ldz), &ldz);
	CHECK_CL_SET_KERNEL_ARG(kernel, 6, sizeof(*multiplier), multiplier);

	{
		/* One work-item per block element; the implementation chooses
		 * the work-group size (local size left NULL). */
		size_t global=nx*ny*nz;
		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);
		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);
	starpu_opencl_release_kernel(kernel);
}
void parallel_region_f(void *buffers[], void *args) { (void) buffers; (void) args; int worker_id; pthread_t tid; struct starpu_omp_task_region_attr attr; int i = 0; tid = pthread_self(); worker_id = starpu_worker_get_id(); printf("[tid %p] task thread = %d: implicit task \"f\"\n", (void *)tid, worker_id); starpu_omp_taskgroup_inline_begin(); taskgroup_f((void *)&i); starpu_omp_taskgroup_inline_end(); printf("[tid %p] task thread = %d: implicit task \"f\": taskgroup\n", (void *)tid, worker_id); starpu_omp_taskgroup_inline_begin(); taskgroup_f((void *)&i); starpu_omp_taskgroup_inline_end(); printf("[tid %p] task thread = %d: implicit task \"f\": taskgroup\n", (void *)tid, worker_id); memset(&attr, 0, sizeof(attr)); attr.cl.cpu_funcs[0] = task_region_g; attr.cl.where = STARPU_CPU; attr.cl_arg_size = sizeof(void *); attr.cl_arg_free = 0; attr.if_clause = 1; attr.final_clause = 0; attr.untied_clause = 1; attr.mergeable_clause = 0; attr.cl_arg = (void *)(intptr_t)i++; starpu_omp_task_region(&attr); attr.cl_arg = (void *)(intptr_t)i++; starpu_omp_task_region(&attr); }
void vector_scal_opencl(void *buffers[], void *_args) { float *factor = _args; int id, devid, err; cl_kernel kernel; cl_command_queue queue; cl_event event; /* length of the vector */ unsigned n = STARPU_VECTOR_GET_NX(buffers[0]); /* OpenCL copy of the vector pointer */ cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]); id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &programs, "vector_mult_opencl", devid); /* Name of the codelet defined above */ if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(val), &val); err |= clSetKernelArg(kernel, 1, sizeof(n), &n); err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=1; size_t local=1; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } clFinish(queue); starpu_opencl_collect_stats(event); clReleaseEvent(event); starpu_opencl_release_kernel(kernel); }
/* StarPU task body executing a SOCL NDRange-kernel command on the current
 * OpenCL worker: set the kernel arguments, enqueue the kernel on the
 * worker's queue, and wait for its completion. */
void soclEnqueueNDRangeKernel_task(void *descr[], void *args)
{
	command_ndrange_kernel cmd = (command_ndrange_kernel)args;
	cl_command_queue cq;
	int wid;
	cl_int err;

	cl_event ev = command_event_get(cmd);
	/* Record the execution start timestamp in the command's event. */
	ev->prof_start = _socl_nanotime();
	gc_entity_release(ev);

	wid = starpu_worker_get_id();
	starpu_opencl_get_queue(wid, &cq);

	DEBUG_MSG("[worker %d] [kernel %d] Executing kernel...\n", wid, cmd->kernel->id);

	int range = starpu_worker_get_range();

	/* Set arguments */
	{
		unsigned int i;
		int buf = 0; /* index of the next buffer argument in descr[] */
		for (i=0; i<cmd->num_args; i++)
		{
			switch (cmd->arg_types[i])
			{
			case Null:
				/* Size-only argument (e.g. __local): no data pointer. */
				err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], NULL);
				break;
			case Buffer:
			{
				/* Buffer argument: device-side cl_mem comes from descr[]. */
				cl_mem mem;
				mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[buf]);
				err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], &mem);
				buf++;
			}
			break;
			case Immediate:
				/* Immediate argument: the value was captured in cmd->args[i]. */
				err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], cmd->args[i]);
				break;
			}
			/* NOTE(review): if arg_types[i] matched none of the cases, err
			 * would be read uninitialized here — presumably the enum only
			 * has these three values; confirm. */
			if (err != CL_SUCCESS)
			{
				DEBUG_CL("clSetKernelArg", err);
				DEBUG_ERROR("Aborting\n");
			}
		}
	}

	/* Calling Kernel */
	cl_event event;
	err = clEnqueueNDRangeKernel(cq, cmd->kernel->cl_kernels[range], cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size, 0, NULL, &event);

	if (err != CL_SUCCESS)
	{
		/* Dump the full launch geometry to ease debugging. */
		ERROR_MSG("Worker[%d] Unable to Enqueue kernel (error %d)\n", wid, err);
		DEBUG_CL("clEnqueueNDRangeKernel", err);
		DEBUG_MSG("Workdim %d, global_work_offset %p, global_work_size %p, local_work_size %p\n", cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size);
		DEBUG_MSG("Global work size: %ld %ld %ld\n", cmd->global_work_size[0], (cmd->work_dim > 1 ? cmd->global_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->global_work_size[2] : 1));
		if (cmd->local_work_size != NULL)
			DEBUG_MSG("Local work size: %ld %ld %ld\n", cmd->local_work_size[0], (cmd->work_dim > 1 ? cmd->local_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->local_work_size[2] : 1));
	}
	else
	{
		/* Waiting for kernel to terminate */
		clWaitForEvents(1, &event);
		clReleaseEvent(event);
	}
}
/* Pop the next task for the given worker: first its local queue, then the
 * scheduling policies of the contexts it belongs to. Handles idle-time
 * accounting, hypervisor notifications, multiformat conversion-task
 * creation, and profiling timestamps. Returns NULL when no task is
 * available. */
struct starpu_task *_starpu_pop_task(struct _starpu_worker *worker)
{
	struct starpu_task *task;
	int worker_id;
	unsigned node;

	/* We can't tell in advance which task will be picked up, so we measure
	 * a timestamp, and will attribute it afterwards to the task. */
	int profiling = starpu_profiling_status_get();
	struct timespec pop_start_time;
	if (profiling)
		_starpu_clock_gettime(&pop_start_time);

pick:
	/* perhaps there is some local task to be executed first */
	task = _starpu_pop_local_task(worker);

	/* get tasks from the stacks of the strategy */
	if(!task)
	{
		struct _starpu_sched_ctx *sched_ctx ;
#ifndef STARPU_NON_BLOCKING_DRIVERS
		/* With blocking drivers, keep looping until each context has
		 * been polled once; been_here[] marks contexts already tried. */
		int been_here[STARPU_NMAX_SCHED_CTXS];
		int i;
		for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
			been_here[i] = 0;

		while(!task)
#endif
		{
			if(worker->nsched_ctxs == 1)
				sched_ctx = _starpu_get_initial_sched_ctx();
			else
			{
				/* Pick the next context to pop from, leaving contexts
				 * this worker was removed from on the way. */
				while(1)
				{
					sched_ctx = _get_next_sched_ctx_to_pop_into(worker);

					if(worker->removed_from_ctx[sched_ctx->id] == 1 && worker->shares_tasks_lists[sched_ctx->id] == 1)
					{
						_starpu_worker_gets_out_of_ctx(sched_ctx->id, worker);
						worker->removed_from_ctx[sched_ctx->id] = 0;
						sched_ctx = NULL;
					}
					else
						break;
				}
			}

			if(sched_ctx && sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
			{
				if (sched_ctx->sched_policy && sched_ctx->sched_policy->pop_task)
				{
					task = sched_ctx->sched_policy->pop_task(sched_ctx->id);
					_starpu_pop_task_end(task);
				}
			}

			if(!task)
			{
				/* it doesn't matter if it shares tasks list or not in the scheduler,
				   if it does not have any task to pop just get it out of here */
				/* however if it shares a task list it will be removed as soon as he
				   finishes this job (in handle_job_termination) */
				if(worker->removed_from_ctx[sched_ctx->id])
				{
					_starpu_worker_gets_out_of_ctx(sched_ctx->id, worker);
					worker->removed_from_ctx[sched_ctx->id] = 0;
				}
#ifdef STARPU_USE_SC_HYPERVISOR
				/* Tell the hypervisor this worker spun an idle cycle. */
				if(worker->pop_ctx_priority)
				{
					struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx->perf_counters;
					if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_idle_cycle && _starpu_sched_ctx_allow_hypervisor(sched_ctx->id))
					{
//						_STARPU_TRACE_HYPERVISOR_BEGIN();
						perf_counters->notify_idle_cycle(sched_ctx->id, worker->workerid, 1.0);
//						_STARPU_TRACE_HYPERVISOR_END();
					}
				}
#endif //STARPU_USE_SC_HYPERVISOR

#ifndef STARPU_NON_BLOCKING_DRIVERS
				/* Stop once every context has been polled once. */
				if(been_here[sched_ctx->id] || worker->nsched_ctxs == 1)
					break;

				been_here[sched_ctx->id] = 1;
#endif
			}
		}
	}

	if (!task)
	{
		/* Going idle: remember when, for idle-time accounting. */
		idle_start[worker->workerid] = starpu_timing_now();
		return NULL;
	}

	if(idle_start[worker->workerid] != 0.0)
	{
		/* Leaving idle: accumulate the elapsed idle period. */
		double idle_end = starpu_timing_now();
		idle[worker->workerid] += (idle_end - idle_start[worker->workerid]);
		idle_start[worker->workerid] = 0.0;
	}

#ifdef STARPU_USE_SC_HYPERVISOR
	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
	struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx->perf_counters;

	if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_poped_task && _starpu_sched_ctx_allow_hypervisor(sched_ctx->id))
	{
//		_STARPU_TRACE_HYPERVISOR_BEGIN();
		perf_counters->notify_poped_task(task->sched_ctx, worker->workerid);
//		_STARPU_TRACE_HYPERVISOR_END();
	}
#endif //STARPU_USE_SC_HYPERVISOR

	/* Make sure we do not bother with all the multiformat-specific code if
	 * it is not necessary. */
	if (!_starpu_task_uses_multiformat_handles(task))
		goto profiling;

	/* This is either a conversion task, or a regular task for which the
	 * conversion tasks have already been created and submitted */
	if (task->mf_skip)
		goto profiling;

	/*
	 * This worker may not be able to execute this task. In this case, we
	 * should return the task anyway. It will be pushed back almost immediatly.
	 * This way, we avoid computing and executing the conversions tasks.
	 * Here, we do not care about what implementation is used.
	 */
	worker_id = starpu_worker_get_id();
	if (!starpu_worker_can_execute_task_first_impl(worker_id, task, NULL))
		return task;

	node = starpu_worker_get_memory_node(worker_id);

	/*
	 * We do have a task that uses multiformat handles. Let's create the
	 * required conversion tasks.
	 */
	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
	unsigned i;
	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
	for (i = 0; i < nbuffers; i++)
	{
		struct starpu_task *conversion_task;
		starpu_data_handle_t handle;

		handle = STARPU_TASK_GET_HANDLE(task, i);
		if (!_starpu_handle_needs_conversion_task(handle, node))
			continue;
		conversion_task = _starpu_create_conversion_task(handle, node);
		conversion_task->mf_skip = 1;
		conversion_task->execute_on_a_specific_worker = 1;
		conversion_task->workerid = worker_id;
		/*
		 * Next tasks will need to know where these handles have gone.
		 */
		handle->mf_node = node;
		_starpu_task_submit_conversion_task(conversion_task, worker_id);
	}

	/* Requeue the task locally and pop again: the conversion tasks must
	 * run before it. */
	task->mf_skip = 1;
	starpu_task_list_push_back(&worker->local_tasks, task);
	STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
	goto pick;

profiling:
	if (profiling)
	{
		struct starpu_profiling_task_info *profiling_info;
		profiling_info = task->profiling_info;

		/* The task may have been created before profiling was enabled,
		 * so we check if the profiling_info structure is available
		 * even though we already tested if profiling is enabled. */
		if (profiling_info)
		{
			memcpy(&profiling_info->pop_start_time, &pop_start_time, sizeof(struct timespec));
			_starpu_clock_gettime(&profiling_info->pop_end_time);
		}
	}

	if(task->prologue_callback_pop_func)
		task->prologue_callback_pop_func(task->prologue_callback_pop_arg);

	return task;
}
/* OpenCL test codelet: run the "variable_opencl" kernel against the
 * variable in buffers[0] and read the device-side failure flag back into
 * variable_config.copy_failed. The OpenCL program is loaded and unloaded
 * on every invocation (test code, not a performance path). */
void test_variable_opencl_func(void *buffers[], void *args)
{
	STARPU_SKIP_IF_VALGRIND;

	int id, devid, ret;
	int factor = *(int *) args;
	cl_int err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");

	cl_mem val = (cl_mem)STARPU_VARIABLE_GET_PTR(buffers[0]);

	cl_context context;
	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);
	starpu_opencl_get_context(devid, &context);

	/* Device buffer seeded with the current copy_failed flag; the kernel
	 * writes its verdict into it. */
	cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(int), &variable_config.copy_failed, &err);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "variable_opencl", devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);
	err = clSetKernelArg(kernel, 1, sizeof(fail), &fail);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);
	err = clSetKernelArg(kernel, 2, sizeof(factor), &factor);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global = 1;
		size_t local;
		size_t s;
		cl_device_id device;

		starpu_opencl_get_device(devid, &device);

		/* Clamp the work-group size to the device limit for this kernel. */
		err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
		if (local > global)
			local = global;

		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}

	/* Blocking read of the failure flag back to the host. */
	err = clEnqueueReadBuffer(queue, fail, CL_TRUE, 0, sizeof(int), &variable_config.copy_failed, 0, NULL, NULL);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

	starpu_opencl_release_kernel(kernel);
	ret = starpu_opencl_unload_opencl(&opencl_program);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
	return;
}
void multiformat_scal_opencl_func(void *buffers[], void *args) { (void) args; int id, devid; cl_int err; cl_kernel kernel; cl_command_queue queue; cl_event event; unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]); cl_mem val = (cl_mem)STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]); id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "multiformat_opencl", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(val), &val); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(n), &n); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=n; size_t local; size_t s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local = global; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } clFinish(queue); starpu_opencl_collect_stats(event); clReleaseEvent(event); starpu_opencl_release_kernel(kernel); }
void cpu_to_opencl_opencl_func(void *buffers[], void *args) { (void) args; int id, devid; cl_int err; cl_kernel kernel; cl_command_queue queue; cl_event event; unsigned n = CUSTOM_GET_NX(buffers[0]); n*=2; struct point *aop; aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]); id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &_opencl_conversion_program, "custom_opencl_conversion", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); void *x = CUSTOM_GET_OPENCL_X_PTR(buffers[0]); if (starpu_opencl_set_kernel_args(&err, &kernel, sizeof(aop), &aop, sizeof(x), &x, sizeof(n), &n, 0) != 3) { STARPU_OPENCL_REPORT_ERROR(err); assert(0); } { size_t global=n; size_t local; size_t s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local = global; err = clEnqueueNDRangeKernel( queue, kernel, 1, /* work_dim */ NULL, /* global_work_offset */ &global, /* global_work_size */ &local, /* local_work_size */ 0, /* num_events_in_wait_list */ NULL, /* event_wait_list */ NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } starpu_opencl_release_kernel(kernel); }
/*
 * CPU -> OpenCL conversion codelet (multiformat interface).
 *
 * Copies the n-element CPU representation of buffers[0] into its OpenCL
 * representation by running the "cpu_to_opencl_opencl" kernel on the
 * device driven by the current worker.  args is unused.
 */
void cpu_to_opencl_opencl_func(void *buffers[], void *args)
{
	STARPU_SKIP_IF_VALGRIND;

	(void) args;
	int id, devid, ret;
	cl_int err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
	/* NOTE(review): the CPU-side pointer is cast to cl_mem and handed to
	 * the kernel as argument 0 — presumably the CPU representation is
	 * itself backed by a cl buffer on this node; confirm against the
	 * multiformat interface definition. */
	cl_mem src = (cl_mem) STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
	cl_mem dst = (cl_mem) STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]);

	/* Map the current worker to the OpenCL device it drives */
	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	/* The conversion program is (re)loaded from file on every invocation
	 * and unloaded again at the end of this function. */
	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_conversion_program, NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");

	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_conversion_program, "cpu_to_opencl_opencl", devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(src), &src);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 1, sizeof(dst), &dst);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 2, sizeof(n), &n);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=n;
		size_t local;
		size_t s;
		cl_device_id device;

		starpu_opencl_get_device(devid, &device);

		/* Clamp the work-group size to what the kernel supports */
		err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
		if (local > global)
			local = global;

		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}

	/* Wait for completion so the profiling data attached to the event is
	 * valid before collecting it. */
	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

	starpu_opencl_release_kernel(kernel);

	ret = starpu_opencl_unload_opencl(&opencl_conversion_program);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
}
/*
 * cl_update (CUDA version)
 *
 * Performs K stencil iterations on one z-block: pulls the neighbours' top
 * and bottom boundary buffers into the block, then ping-pongs between the
 * two block copies (descr[0]/descr[1]).  Also feeds the per-tick
 * who_runs_what trace used for load-balancing statistics.
 */
static void update_func_cuda(void *descr[], void *arg)
{
	struct block_description *block = arg;
	int workerid = starpu_worker_get_id();

	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
	if (block->bz == 0)
		fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid);
	else
		DEBUG( "!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid);
#ifdef STARPU_USE_MPI
	int rank = 0;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	DEBUG( "!!! RANK %d !!!\n", rank);
#endif
	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");

	unsigned block_size_z = get_block_size(block->bz);
	unsigned i;
	update_per_worker[workerid]++;

	/* Record which worker ran this block during each elapsed tick; a -1
	 * entry marks a tick during which nobody ran it. */
	struct timeval tv, tv2, diff, delta = {.tv_sec = 0, .tv_usec = get_ticks()*1000};
	gettimeofday(&tv, NULL);
	timersub(&tv, &start, &tv2);
	timersub(&tv2, &last_tick[block->bz], &diff);
	while (timercmp(&diff, &delta, >=))
	{
		timeradd(&last_tick[block->bz], &delta, &last_tick[block->bz]);
		timersub(&tv2, &last_tick[block->bz], &diff);
		if (who_runs_what_index[block->bz] < who_runs_what_len)
			who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = -1;
	}

	if (who_runs_what_index[block->bz] < who_runs_what_len)
		who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);

	/*
	 *	Load neighbours' boundaries : TOP
	 */

	/* The offset along the z axis is (block_size_z + K) */
	load_subblock_from_buffer_cuda(descr[0], descr[2], block_size_z+K);
	load_subblock_from_buffer_cuda(descr[1], descr[3], block_size_z+K);

	/*
	 *	Load neighbours' boundaries : BOTTOM
	 */
	load_subblock_from_buffer_cuda(descr[0], descr[4], 0);
	load_subblock_from_buffer_cuda(descr[1], descr[5], 0);

	/*
	 *	Stencils ... do the actual work here :) TODO
	 */

	for (i=1; i<=K; i++)
	{
		/* Alternate between the two copies of the block */
		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;

		/* Shadow data */
		cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);

		/* And perform actual computation */
#ifdef LIFE
		cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
#else
		cudaMemcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice);
#endif /* LIFE */
	}

	cudaError_t cures;
	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
		STARPU_CUDA_REPORT_ERROR(cures);
}
#endif /* STARPU_USE_CUDA */

/*
 * cl_update (CPU version)
 *
 * Same algorithm as update_func_cuda, with the shadow step expanded inline
 * via shadow.h and plain memcpy/life_update for the computation.
 */
static void update_func_cpu(void *descr[], void *arg)
{
	struct block_description *block = arg;
	int workerid = starpu_worker_get_id();

	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
	if (block->bz == 0)
		fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
	else
		DEBUG( "!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
#ifdef STARPU_USE_MPI
	int rank = 0;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	DEBUG( "!!! RANK %d !!!\n", rank);
#endif
	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");

	unsigned block_size_z = get_block_size(block->bz);
	unsigned i;
	update_per_worker[workerid]++;

	/* Same per-tick who_runs_what bookkeeping as the CUDA version */
	struct timeval tv, tv2, diff, delta = {.tv_sec = 0, .tv_usec = get_ticks() * 1000};
	gettimeofday(&tv, NULL);
	timersub(&tv, &start, &tv2);
	timersub(&tv2, &last_tick[block->bz], &diff);
	while (timercmp(&diff, &delta, >=))
	{
		timeradd(&last_tick[block->bz], &delta, &last_tick[block->bz]);
		timersub(&tv2, &last_tick[block->bz], &diff);
		if (who_runs_what_index[block->bz] < who_runs_what_len)
			who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = -1;
	}

	if (who_runs_what_index[block->bz] < who_runs_what_len)
		who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);

	/*
	 *	Load neighbours' boundaries : TOP
	 */

	/* The offset along the z axis is (block_size_z + K) */
	load_subblock_from_buffer_cpu(descr[0], descr[2], block_size_z+K);
	load_subblock_from_buffer_cpu(descr[1], descr[3], block_size_z+K);

	/*
	 *	Load neighbours' boundaries : BOTTOM
	 */
	load_subblock_from_buffer_cpu(descr[0], descr[4], 0);
	load_subblock_from_buffer_cpu(descr[1], descr[5], 0);

	/*
	 *	Stencils ... do the actual work here :) TODO
	 */

	for (i=1; i<=K; i++)
	{
		/* Alternate between the two copies of the block */
		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;

		/* Shadow data: these locals are the inputs shadow.h expects */
		unsigned ldy = oldb->ldy, ldz = oldb->ldz;
		unsigned nx = oldb->nx, ny = oldb->ny, nz = oldb->nz;
		unsigned x, y, z;
		unsigned stepx = 1;
		unsigned stepy = 1;
		unsigned stepz = 1;
		unsigned idx = 0;
		unsigned idy = 0;
		unsigned idz = 0;
		TYPE *ptr = old;

# include "shadow.h"

		/* And perform actual computation */
#ifdef LIFE
		life_update(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
#else
		memcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new));
#endif /* LIFE */
	}
}

/* Performance model and codelet structure */
static struct starpu_perfmodel_t cl_update_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "cl_update"
};

starpu_codelet cl_update =
{
	.where =
#ifdef STARPU_USE_CUDA
		STARPU_CUDA|
#endif
		STARPU_CPU,
	.cpu_func = update_func_cpu,
#ifdef STARPU_USE_CUDA
	.cuda_func = update_func_cuda,
#endif
	.model = &cl_update_model,
	.nbuffers = 6
};

/*
 * Save the block internal boundaries to give them to our neighbours.
 */

/* CPU version: copy K planes starting at z index firstz into the boundary buffer */
static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block, starpu_block_interface_t *boundary, unsigned firstz)
{
	/* Sanity checks */
	STARPU_ASSERT(block->nx == boundary->nx);
	STARPU_ASSERT(block->ny == boundary->ny);
	STARPU_ASSERT(boundary->nz == K);

	/* NB: this is not fully garanteed ... but it's *very* likely and that
	 * makes our life much simpler */
	STARPU_ASSERT(block->ldy == boundary->ldy);
	STARPU_ASSERT(block->ldz == boundary->ldz);

	/* We do a contiguous memory transfer */
	size_t boundary_size = K*block->ldz*block->elemsize;

	unsigned offset = firstz*block->ldz;
	TYPE *block_data = (TYPE *)block->ptr;
	TYPE *boundary_data = (TYPE *)boundary->ptr;
	memcpy(boundary_data, &block_data[offset], boundary_size);
}

/* CUDA version: same contiguous transfer, device-to-device */
#ifdef STARPU_USE_CUDA
static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block, starpu_block_interface_t *boundary, unsigned firstz)
{
	/* Sanity checks */
	STARPU_ASSERT(block->nx == boundary->nx);
	STARPU_ASSERT(block->ny == boundary->ny);
	STARPU_ASSERT(boundary->nz == K);

	/* NB: this is not fully garanteed ... but it's *very* likely and that
	 * makes our life much simpler */
	STARPU_ASSERT(block->ldy == boundary->ldy);
	STARPU_ASSERT(block->ldz == boundary->ldz);

	/* We do a contiguous memory transfer */
	size_t boundary_size = K*block->ldz*block->elemsize;

	unsigned offset = firstz*block->ldz;
	TYPE *block_data = (TYPE *)block->ptr;
	TYPE *boundary_data = (TYPE *)boundary->ptr;
	cudaMemcpy(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice);
}
#endif /* STARPU_USE_CUDA */

/* Record how many top/bottom saves each worker performed */
unsigned top_per_worker[STARPU_NMAXWORKERS];
unsigned bottom_per_worker[STARPU_NMAXWORKERS];

/* top save, CPU version */
static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
{
	struct block_description *block = arg;
	int workerid = starpu_worker_get_id();
	top_per_worker[workerid]++;

	/* NOTE(review): this top save logs "Bottom" while the CUDA top save
	 * logs "Top" — the messages look swapped; confirm intent before
	 * changing the string. */
	DEBUG( "DO SAVE Bottom block %d\n", block->bz);

	/* The offset along the z axis is (block_size_z + K)- K */
	unsigned block_size_z = get_block_size(block->bz);

	load_subblock_into_buffer_cpu(descr[0], descr[2], block_size_z);
	load_subblock_into_buffer_cpu(descr[1], descr[3], block_size_z);
}

/* bottom save, CPU version */
static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg)
{
	struct block_description *block = arg;
	int workerid = starpu_worker_get_id();
	bottom_per_worker[workerid]++;

	/* NOTE(review): logs "Top" in the bottom save — see matching note in
	 * dummy_func_top_cpu. */
	DEBUG( "DO SAVE Top block %d\n", block->bz);

	load_subblock_into_buffer_cpu(descr[0], descr[2], K);
	load_subblock_into_buffer_cpu(descr[1], descr[3], K);
}

/* top save, CUDA version */
#ifdef STARPU_USE_CUDA
static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg)
{
	struct block_description *block = arg;
	int workerid = starpu_worker_get_id();
	top_per_worker[workerid]++;

	DEBUG( "DO SAVE Top block %d\n", block->bz);

	/* The offset along the z axis is (block_size_z + K)- K */
	unsigned block_size_z = get_block_size(block->bz);

	load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z);
	load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z);
	cudaThreadSynchronize();
}

/* bottom save, CUDA version */
static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *arg)
{
	struct block_description *block = arg;
	int workerid = starpu_worker_get_id();
	bottom_per_worker[workerid]++;

	DEBUG( "DO SAVE Bottom block %d on CUDA\n", block->bz);

	load_subblock_into_buffer_cuda(descr[0], descr[2], K);
	load_subblock_into_buffer_cuda(descr[1], descr[3], K);
	cudaThreadSynchronize();
}
#endif /* STARPU_USE_CUDA */

/* Performance models and codelet for save */
static struct starpu_perfmodel_t save_cl_bottom_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "save_cl_bottom"
};

static struct starpu_perfmodel_t save_cl_top_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "save_cl_top"
};

starpu_codelet save_cl_bottom =
{
	.where =
#ifdef STARPU_USE_CUDA
		STARPU_CUDA|
#endif
		STARPU_CPU,
	.cpu_func = dummy_func_bottom_cpu,
#ifdef STARPU_USE_CUDA
	.cuda_func = dummy_func_bottom_cuda,
#endif
	.model = &save_cl_bottom_model,
	.nbuffers = 4
};

starpu_codelet save_cl_top =
{
	.where =
#ifdef STARPU_USE_CUDA
		STARPU_CUDA|
#endif
		STARPU_CPU,
	.cpu_func = dummy_func_top_cpu,
#ifdef STARPU_USE_CUDA
	.cuda_func = dummy_func_top_cuda,
#endif
	.model = &save_cl_top_model,
	.nbuffers = 4
};
/*
 * Transfer one data replicate from src_replicate to dst_replicate,
 * dispatching on the (source kind, destination kind) pair of memory nodes.
 *
 * If req is non-NULL and asynchronous copies are enabled, the driver-specific
 * asynchronous copy method is preferred and the completion event is stored in
 * req->async_channel; otherwise the synchronous method (or the generic
 * any_to_any fallback) is used.  Returns 0 on synchronous completion, or the
 * async copy method's return value.
 */
static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
				    struct _starpu_data_replicate *src_replicate,
				    struct _starpu_data_replicate *dst_replicate,
				    struct _starpu_data_request *req)
{
	unsigned src_node = src_replicate->memory_node;
	unsigned dst_node = dst_replicate->memory_node;

	/* Both replicates must be pinned and materialized by the caller */
	STARPU_ASSERT(src_replicate->refcnt);
	STARPU_ASSERT(dst_replicate->refcnt);

	STARPU_ASSERT(src_replicate->allocated);
	STARPU_ASSERT(dst_replicate->allocated);

	_starpu_comm_amounts_inc(src_node, dst_node, handle->ops->get_size(handle));

#ifdef STARPU_SIMGRID
	/* In simulation mode only the transfer size matters */
	return _starpu_simgrid_transfer(handle->ops->get_size(handle), src_node, dst_node, req);
#else /* !SIMGRID */

	int ret = 0;

	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;

	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);

#ifdef STARPU_USE_CUDA
	cudaError_t cures;
	cudaStream_t stream;
#endif

	void *src_interface = src_replicate->data_interface;
	void *dst_interface = dst_replicate->data_interface;

#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
	{
		unsigned devid;
		if ((src_kind == STARPU_CUDA_RAM) && (dst_kind == STARPU_CUDA_RAM))
		{
			/* GPU-GPU transfer, issue it from the device we are supposed to drive */
			int worker = starpu_worker_get_id();
			devid = starpu_worker_get_devid(worker);
		}
		else
		{
			unsigned node = (dst_kind == STARPU_CUDA_RAM)?dst_node:src_node;
			devid = _starpu_memory_node_get_devid(node);
		}
		starpu_cuda_set_device(devid);
	}
#endif

	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
	{
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
		if (copy_methods->ram_to_ram)
			copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
		break;
#ifdef STARPU_USE_CUDA
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
		/* only the proper CUBLAS thread can initiate this directly ! */
#if !defined(HAVE_CUDA_MEMCPY_PEER)
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
#endif
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
				!(copy_methods->cuda_to_ram_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->cuda_to_ram || copy_methods->any_to_any);
			if (copy_methods->cuda_to_ram)
				copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			/* Asynchronous path: record a CUDA event on the
			 * outgoing transfer stream so completion can be polled */
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_local_out_transfer_stream();
			if (copy_methods->cuda_to_ram_async)
				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
		/* STARPU_CPU_RAM -> CUBLAS_RAM */
		/* only the proper CUBLAS thread can initiate this ! */
#if !defined(HAVE_CUDA_MEMCPY_PEER)
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
#endif
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
				!(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->ram_to_cuda || copy_methods->any_to_any);
			if (copy_methods->ram_to_cuda)
				copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			/* Asynchronous path on the incoming transfer stream */
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_local_in_transfer_stream();
			if (copy_methods->ram_to_cuda_async)
				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
		/* CUDA - CUDA transfer */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
				!(copy_methods->cuda_to_cuda_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->cuda_to_cuda)
				copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			/* Asynchronous path on the peer transfer stream */
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_peer_transfer_stream(src_node, dst_node);
			if (copy_methods->cuda_to_cuda_async)
				ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
#endif
#ifdef STARPU_USE_OPENCL
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
		/* OpenCL -> RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
				!(copy_methods->opencl_to_ram_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->opencl_to_ram || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->opencl_to_ram)
				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			/* Asynchronous path: the OpenCL event is filled in by
			 * the copy method itself */
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->opencl_to_ram_async)
				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
		/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
				!(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->ram_to_opencl || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->ram_to_opencl)
				copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->ram_to_opencl_async)
				ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
		/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node || _starpu_memory_node_get_local_key() == src_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
				!(copy_methods->opencl_to_opencl_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->opencl_to_opencl || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->opencl_to_opencl)
				copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->opencl_to_opencl_async)
				ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
#endif
#ifdef STARPU_USE_MIC
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MIC_RAM):
		/* RAM -> MIC */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
				!(copy_methods->ram_to_mic_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->ram_to_mic || copy_methods->any_to_any);
			if (copy_methods->ram_to_mic)
				copy_methods->ram_to_mic(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_MIC_RAM;
			if (copy_methods->ram_to_mic_async)
				ret = copy_methods->ram_to_mic_async(src_interface, src_node, dst_interface, dst_node);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
			_starpu_mic_init_event(&(req->async_channel.event.mic_event), dst_node);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_MIC_RAM,STARPU_CPU_RAM):
		/* MIC -> RAM */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
				!(copy_methods->mic_to_ram_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->mic_to_ram || copy_methods->any_to_any);
			if (copy_methods->mic_to_ram)
				copy_methods->mic_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_MIC_RAM;
			if (copy_methods->mic_to_ram_async)
				ret = copy_methods->mic_to_ram_async(src_interface, src_node, dst_interface, dst_node);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
			_starpu_mic_init_event(&(req->async_channel.event.mic_event), src_node);
		}
		break;
#endif
#ifdef STARPU_USE_SCC
	/* SCC RAM associated to the master process is considered as
	 * the main memory node. */
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_SCC_RAM):
		/* master private SCC RAM -> slave private SCC RAM */
		if (copy_methods->scc_src_to_sink)
			copy_methods->scc_src_to_sink(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_CPU_RAM):
		/* slave private SCC RAM -> master private SCC RAM */
		if (copy_methods->scc_sink_to_src)
			copy_methods->scc_sink_to_src(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_SCC_RAM):
		/* slave private SCC RAM -> slave private SCC RAM */
		if (copy_methods->scc_sink_to_sink)
			copy_methods->scc_sink_to_sink(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
#endif
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_DISK_RAM):
		if(copy_methods->any_to_any)
			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ?  &req->async_channel : NULL);
		else
		{
			/* No any_to_any method: serialize the handle and write
			 * it to disk ourselves.
			 * NOTE(review): this branch passes &req->async_channel
			 * while req may be NULL (the !req case falls through to
			 * here) — verify callers never reach this path without
			 * a request. */
			void *obj = starpu_data_handle_to_pointer(handle, dst_node);
			void * ptr = NULL;
			starpu_ssize_t size = 0;
			handle->ops->pack_data(handle, src_node, &ptr, &size);
			ret = _starpu_disk_full_write(src_node, dst_node, obj, ptr, size, &req->async_channel);
			if (ret == 0)
				/* write is already finished, ptr was allocated in pack_data */
				free(ptr);

			/* For now, asynchronous is not supported */
			STARPU_ASSERT(ret == 0);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM,STARPU_CPU_RAM):
		if(copy_methods->any_to_any)
			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ?  &req->async_channel : NULL);
		else
		{
			/* No any_to_any method: read from disk and deserialize.
			 * NOTE(review): same possibly-NULL req concern as the
			 * RAM -> DISK branch above. */
			void *obj = starpu_data_handle_to_pointer(handle, src_node);
			void * ptr = NULL;
			size_t size = 0;
			ret = _starpu_disk_full_read(src_node, dst_node, obj, &ptr, &size, &req->async_channel);
			if (ret == 0)
			{
				/* read is already finished, we can already unpack */
				handle->ops->unpack_data(handle, dst_node, ptr, size);
				/* ptr is allocated in full_read */
				free(ptr);
			}

			/* For now, asynchronous is not supported */
			STARPU_ASSERT(ret == 0);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM,STARPU_DISK_RAM):
		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
		break;
	default:
		STARPU_ABORT();
		break;
	}

	return ret;
#endif /* !SIMGRID */
}
/* Note: this is not scalable work stealing, use lws instead */
/*
 * Pop a task for the calling worker: first try the worker's own deque, then
 * steal from a victim, then retry the local deque (tasks may have been
 * pushed while our own lock was released).
 *
 * Locking protocol: the caller holds the worker's sched mutex on entry and
 * expects it held on return.  We release it before taking the victim's
 * mutex so the two locks are never held at once (avoids deadlock when two
 * workers steal from each other).
 */
static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
{
	struct _starpu_work_stealing_data *ws = (struct _starpu_work_stealing_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);

	struct starpu_task *task;
	struct _starpu_deque_jobq *q;

	int workerid = starpu_worker_get_id();
	STARPU_ASSERT(workerid != -1);

	q = ws->queue_array[workerid];

	/* Fast path: pop from our own deque */
	task = _starpu_deque_pop_task(q, workerid);
	if (task)
	{
		/* there was a local task */
		ws->performed_total++;
		q->nprocessed++;
		q->njobs--;
		return task;
	}

	starpu_pthread_mutex_t *worker_sched_mutex;
	starpu_pthread_cond_t *worker_sched_cond;
	starpu_worker_get_sched_condition(workerid, &worker_sched_mutex, &worker_sched_cond);

	/* Note: Releasing this mutex before taking the victim mutex, to avoid interlock*/
	STARPU_PTHREAD_MUTEX_UNLOCK(worker_sched_mutex);

	/* we need to steal someone's job */
	unsigned victim = select_victim(sched_ctx_id);

	starpu_pthread_mutex_t *victim_sched_mutex;
	starpu_pthread_cond_t *victim_sched_cond;
	starpu_worker_get_sched_condition(victim, &victim_sched_mutex, &victim_sched_cond);
	STARPU_PTHREAD_MUTEX_LOCK(victim_sched_mutex);

	struct _starpu_deque_jobq *victimq = ws->queue_array[victim];

	task = _starpu_deque_pop_task(victimq, workerid);
	if (task)
	{
		_STARPU_TRACE_WORK_STEALING(q, workerid);
		ws->performed_total++;

		/* Beware : we have to increase the number of processed tasks of
		 * the stealer, not the victim ! */
		q->nprocessed++;
		victimq->njobs--;
	}

	/* Restore the invariant: victim unlocked, our own mutex re-taken */
	STARPU_PTHREAD_MUTEX_UNLOCK(victim_sched_mutex);
	STARPU_PTHREAD_MUTEX_LOCK(worker_sched_mutex);

	/* While our lock was dropped, a task may have been pushed to our own
	 * queue: retry a local pop before returning empty-handed. */
	if(!task)
	{
		task = _starpu_deque_pop_task(q, workerid);
		if (task)
		{
			/* there was a local task */
			ws->performed_total++;
			q->nprocessed++;
			q->njobs--;
			return task;
		}
	}

	return task;
}
*/ #include <starpu.h> #include <starpu_opencl.h> #include <CL/cl.h> extern struct starpu_opencl_program opencl_code; void opencl_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args) { unsigned *val = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]); cl_kernel kernel; cl_command_queue queue; int id, devid, err; id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "incA", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=100; size_t local=100; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); }