void opencl_codelet(void *descr[], void *_args) { cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[0]); cl_kernel kernel; cl_command_queue queue; cl_event event; int id, devid, err; id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "incrementer", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(val), &val); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=4; size_t local, s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local=global; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); starpu_opencl_release_kernel(kernel); } }
/*
 * OpenCL codelet: launches the "incC" kernel of opencl_code on the vector
 * given as descr[0], using 100 work items in a single work-group of 100.
 *
 * descr: codelet buffers; the device handle of descr[0] is kernel argument 0.
 * _args: unused.
 */
void opencl_codelet_incC(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
{
	STARPU_SKIP_IF_VALGRIND;

	cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[0]);
	cl_kernel kernel;
	cl_command_queue queue;
	size_t work_items = 100;
	size_t group_size = 100;
	int devid;
	int err;

	/* Device driven by the worker currently executing this codelet */
	devid = starpu_worker_get_devid(starpu_worker_get_id());

	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "incC", devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_items, &group_size, 0, NULL, NULL);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	starpu_opencl_release_kernel(kernel);
}
/*
 * Enqueue the "shadow" kernel of `program` on the OpenCL device driven by
 * the calling worker.
 *
 * All eight parameters are forwarded unchanged, in order, as kernel
 * arguments 0..7. `ptr` is passed by value with sizeof(ptr).
 * NOTE(review): presumably `ptr` is the device-side buffer handle for this
 * worker -- confirm against the caller.
 *
 * The kernel is launched over a 3-D range of nx * ny * 1 work items with the
 * work-group size left to the OpenCL implementation.
 *
 * Every failing call is handed to STARPU_OPENCL_REPORT_ERROR.
 */
void opencl_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, int i)
{
#if 0
	size_t dim[] = {nx, ny, nz};
#else
	size_t dim[] = {nx, ny, 1};
#endif

	int devid, id;
	cl_kernel kernel;
	cl_command_queue cq;
	cl_int err;

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &cq, &program, "shadow", devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	/* Each argument is checked on its own so that the reported code is
	 * the one of the call that actually failed. */
	err = clSetKernelArg(kernel, 0, sizeof(bz), &bz);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);
	err = clSetKernelArg(kernel, 1, sizeof(ptr), &ptr);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);
	err = clSetKernelArg(kernel, 2, sizeof(nx), &nx);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);
	err = clSetKernelArg(kernel, 3, sizeof(ny), &ny);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);
	err = clSetKernelArg(kernel, 4, sizeof(nz), &nz);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);
	err = clSetKernelArg(kernel, 5, sizeof(ldy), &ldy);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);
	err = clSetKernelArg(kernel, 6, sizeof(ldz), &ldz);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);
	err = clSetKernelArg(kernel, 7, sizeof(i), &i);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clEnqueueNDRangeKernel(cq, kernel, 3, NULL, dim, NULL, 0, NULL, NULL);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	/* The kernel object obtained above was previously never released */
	starpu_opencl_release_kernel(kernel);
}
void axpy_opencl(void *buffers[], void *_args) { TYPE *alpha = _args; int id, devid; cl_int err; cl_kernel kernel; cl_command_queue queue; cl_event event; unsigned n = STARPU_VECTOR_GET_NX(buffers[0]); cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]); cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]); id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_axpy_opencl", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(x), &x); err|= clSetKernelArg(kernel, 1, sizeof(y), &y); err|= clSetKernelArg(kernel, 2, sizeof(n), &n); err|= clSetKernelArg(kernel, 3, sizeof(*alpha), alpha); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=n; size_t local; size_t s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local=global; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } starpu_opencl_release_kernel(kernel); }
void redux_opencl_func(void *buffers[], void *args) { int id, devid; cl_int err; cl_kernel kernel; cl_command_queue queue; cl_event event; cl_mem dota = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]); cl_mem dotb = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[1]); id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &_opencl_program, "_redux_opencl", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(dota), &dota); err|= clSetKernelArg(kernel, 1, sizeof(dotb), &dotb); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=1; size_t local; size_t s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local=global; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } starpu_opencl_release_kernel(kernel); }
void opencl_codelet(void *descr[], void *_args) { cl_kernel kernel; cl_command_queue queue; cl_event event; int id, devid, err; cl_mem block = (cl_mem)STARPU_BLOCK_GET_DEV_HANDLE(descr[0]); int nx = (int)STARPU_BLOCK_GET_NX(descr[0]); int ny = (int)STARPU_BLOCK_GET_NY(descr[0]); int nz = (int)STARPU_BLOCK_GET_NZ(descr[0]); int ldy = (int)STARPU_BLOCK_GET_LDY(descr[0]); int ldz = (int) STARPU_BLOCK_GET_LDZ(descr[0]); float *multiplier = (float *)_args; id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "block", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); CHECK_CL_SET_KERNEL_ARG(kernel, 0, sizeof(block), &block); CHECK_CL_SET_KERNEL_ARG(kernel, 1, sizeof(nx), &nx); CHECK_CL_SET_KERNEL_ARG(kernel, 2, sizeof(ny), &ny); CHECK_CL_SET_KERNEL_ARG(kernel, 3, sizeof(nz), &nz); CHECK_CL_SET_KERNEL_ARG(kernel, 4, sizeof(ldy), &ldy); CHECK_CL_SET_KERNEL_ARG(kernel, 5, sizeof(ldz), &ldz); CHECK_CL_SET_KERNEL_ARG(kernel, 6, sizeof(*multiplier), multiplier); { size_t global=nx*ny*nz; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } clFinish(queue); starpu_opencl_collect_stats(event); clReleaseEvent(event); starpu_opencl_release_kernel(kernel); }
void opencl_codelet(void *descr[], void *_args) { cl_kernel kernel; cl_command_queue queue; int id, devid, err, n; float *block = (float *)STARPU_BLOCK_GET_PTR(descr[0]); int nx = (int)STARPU_BLOCK_GET_NX(descr[0]); int ny = (int)STARPU_BLOCK_GET_NY(descr[0]); int nz = (int)STARPU_BLOCK_GET_NZ(descr[0]); unsigned ldy = STARPU_BLOCK_GET_LDY(descr[0]); unsigned ldz = STARPU_BLOCK_GET_LDZ(descr[0]); float *multiplier = (float *)_args; id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "block", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = 0; n=0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &block); err = clSetKernelArg(kernel, 1, sizeof(int), &nx); err = clSetKernelArg(kernel, 2, sizeof(int), &ny); err = clSetKernelArg(kernel, 3, sizeof(int), &nz); err = clSetKernelArg(kernel, 4, sizeof(ldy), &ldy); err = clSetKernelArg(kernel, 5, sizeof(ldz), &ldz); err = clSetKernelArg(kernel, 6, sizeof(float), multiplier); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=nx*ny*nz; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } clFinish(queue); starpu_opencl_release_kernel(kernel); }
void vector_scal_opencl(void *buffers[], void *_args) { float *factor = _args; int id, devid, err; cl_kernel kernel; cl_command_queue queue; cl_event event; /* length of the vector */ unsigned n = STARPU_VECTOR_GET_NX(buffers[0]); /* OpenCL copy of the vector pointer */ cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]); id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &programs, "vector_mult_opencl", devid); /* Name of the codelet defined above */ if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(val), &val); err |= clSetKernelArg(kernel, 1, sizeof(n), &n); err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=1; size_t local=1; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } clFinish(queue); starpu_opencl_collect_stats(event); clReleaseEvent(event); starpu_opencl_release_kernel(kernel); }
void cpu_to_opencl_opencl_func(void *buffers[], void *args) { (void) args; int id, devid; cl_int err; cl_kernel kernel; cl_command_queue queue; cl_event event; unsigned n = CUSTOM_GET_NX(buffers[0]); n*=2; struct point *aop; aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]); id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &_opencl_conversion_program, "custom_opencl_conversion", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); void *x = CUSTOM_GET_OPENCL_X_PTR(buffers[0]); if (starpu_opencl_set_kernel_args(&err, &kernel, sizeof(aop), &aop, sizeof(x), &x, sizeof(n), &n, 0) != 3) { STARPU_OPENCL_REPORT_ERROR(err); assert(0); } { size_t global=n; size_t local; size_t s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local = global; err = clEnqueueNDRangeKernel( queue, kernel, 1, /* work_dim */ NULL, /* global_work_offset */ &global, /* global_work_size */ &local, /* local_work_size */ 0, /* num_events_in_wait_list */ NULL, /* event_wait_list */ NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } starpu_opencl_release_kernel(kernel); }
#include <starpu.h> #include <starpu_opencl.h> #include <CL/cl.h> extern struct starpu_opencl_program opencl_code; void opencl_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args) { unsigned *val = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]); cl_kernel kernel; cl_command_queue queue; int id, devid, err; id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "incA", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=100; size_t local=100; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); }
/*
 * Copy the content of `handle` from the replicate `src_replicate` to the
 * replicate `dst_replicate`, choosing the copy method according to the
 * kinds of the source and destination memory nodes.
 *
 * handle:        data handle whose ops provide get_size, copy_methods and
 *                pack_data/unpack_data.
 * src_replicate: source replicate; must be allocated and referenced.
 * dst_replicate: destination replicate; must be allocated and referenced.
 * req:           data request the transfer belongs to, or NULL. With a NULL
 *                request, or when asynchronous copies are disabled or no
 *                asynchronous method is available, the synchronous copy
 *                method is used.
 *
 * Returns 0 unless an asynchronous method, any_to_any or a disk helper was
 * used and returned something else; in that case its value is returned
 * as-is. Under STARPU_SIMGRID the result of _starpu_simgrid_transfer() is
 * returned instead. An unsupported pair of node kinds hits STARPU_ABORT().
 */
static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
				    struct _starpu_data_replicate *src_replicate,
				    struct _starpu_data_replicate *dst_replicate,
				    struct _starpu_data_request *req)
{
	unsigned src_node = src_replicate->memory_node;
	unsigned dst_node = dst_replicate->memory_node;

	STARPU_ASSERT(src_replicate->refcnt);
	STARPU_ASSERT(dst_replicate->refcnt);

	STARPU_ASSERT(src_replicate->allocated);
	STARPU_ASSERT(dst_replicate->allocated);

	_starpu_comm_amounts_inc(src_node, dst_node, handle->ops->get_size(handle));

#ifdef STARPU_SIMGRID
	return _starpu_simgrid_transfer(handle->ops->get_size(handle), src_node, dst_node, req);
#else /* !SIMGRID */

	int ret = 0;

	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;

	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);

#ifdef STARPU_USE_CUDA
	cudaError_t cures;
	cudaStream_t stream;
#endif

	void *src_interface = src_replicate->data_interface;
	void *dst_interface = dst_replicate->data_interface;

#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
	{
		unsigned devid;
		if ((src_kind == STARPU_CUDA_RAM) && (dst_kind == STARPU_CUDA_RAM))
		{
			/* GPU-GPU transfer, issue it from the device we are supposed to drive */
			int worker = starpu_worker_get_id();
			devid = starpu_worker_get_devid(worker);
		}
		else
		{
			unsigned node = (dst_kind == STARPU_CUDA_RAM)?dst_node:src_node;
			devid = _starpu_memory_node_get_devid(node);
		}
		starpu_cuda_set_device(devid);
	}
#endif

	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
	{
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
		if (copy_methods->ram_to_ram)
			copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
		break;
#ifdef STARPU_USE_CUDA
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
		/* only the proper CUBLAS thread can initiate this directly ! */
#if !defined(HAVE_CUDA_MEMCPY_PEER)
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
#endif
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
				!(copy_methods->cuda_to_ram_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->cuda_to_ram || copy_methods->any_to_any);
			if (copy_methods->cuda_to_ram)
				copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_local_out_transfer_stream();
			if (copy_methods->cuda_to_ram_async)
				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			/* The event is recorded on the stream after the copy was submitted */
			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
		/* STARPU_CPU_RAM -> CUBLAS_RAM */
		/* only the proper CUBLAS thread can initiate this ! */
#if !defined(HAVE_CUDA_MEMCPY_PEER)
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
#endif
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
				!(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->ram_to_cuda || copy_methods->any_to_any);
			if (copy_methods->ram_to_cuda)
				copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_local_in_transfer_stream();
			if (copy_methods->ram_to_cuda_async)
				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
		/* CUDA - CUDA transfer */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
				!(copy_methods->cuda_to_cuda_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->cuda_to_cuda)
				copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_peer_transfer_stream(src_node, dst_node);
			if (copy_methods->cuda_to_cuda_async)
				ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
#endif
#ifdef STARPU_USE_OPENCL
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
		/* OpenCL -> RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
				!(copy_methods->opencl_to_ram_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->opencl_to_ram || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->opencl_to_ram)
				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->opencl_to_ram_async)
				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
		/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
				!(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->ram_to_opencl || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->ram_to_opencl)
				copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->ram_to_opencl_async)
				ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
		/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node || _starpu_memory_node_get_local_key() == src_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
				!(copy_methods->opencl_to_opencl_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->opencl_to_opencl || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->opencl_to_opencl)
				copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->opencl_to_opencl_async)
				ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
#endif
#ifdef STARPU_USE_MIC
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MIC_RAM):
		/* RAM -> MIC */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
				!(copy_methods->ram_to_mic_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->ram_to_mic || copy_methods->any_to_any);
			if (copy_methods->ram_to_mic)
				copy_methods->ram_to_mic(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_MIC_RAM;
			if (copy_methods->ram_to_mic_async)
				ret = copy_methods->ram_to_mic_async(src_interface, src_node, dst_interface, dst_node);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
			_starpu_mic_init_event(&(req->async_channel.event.mic_event), dst_node);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_MIC_RAM,STARPU_CPU_RAM):
		/* MIC -> RAM */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
				!(copy_methods->mic_to_ram_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->mic_to_ram || copy_methods->any_to_any);
			if (copy_methods->mic_to_ram)
				copy_methods->mic_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_MIC_RAM;
			if (copy_methods->mic_to_ram_async)
				ret = copy_methods->mic_to_ram_async(src_interface, src_node, dst_interface, dst_node);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
			_starpu_mic_init_event(&(req->async_channel.event.mic_event), src_node);
		}
		break;
#endif
#ifdef STARPU_USE_SCC
	/* SCC RAM associated to the master process is considered as
	 * the main memory node. */
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_SCC_RAM):
		/* master private SCC RAM -> slave private SCC RAM */
		if (copy_methods->scc_src_to_sink)
			copy_methods->scc_src_to_sink(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_CPU_RAM):
		/* slave private SCC RAM -> master private SCC RAM */
		if (copy_methods->scc_sink_to_src)
			copy_methods->scc_sink_to_src(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_SCC_RAM):
		/* slave private SCC RAM -> slave private SCC RAM */
		if (copy_methods->scc_sink_to_sink)
			copy_methods->scc_sink_to_sink(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
#endif

	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_DISK_RAM):
		if(copy_methods->any_to_any)
			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
		else
		{
			void *obj = starpu_data_handle_to_pointer(handle, dst_node);
			void * ptr = NULL;
			starpu_ssize_t size = 0;
			handle->ops->pack_data(handle, src_node, &ptr, &size);
			/* req may be NULL here: only form the channel address
			 * when there is a request, as the any_to_any path
			 * above does. */
			ret = _starpu_disk_full_write(src_node, dst_node, obj, ptr, size, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
			if (ret == 0)
				/* write is already finished, ptr was allocated in pack_data */
				free(ptr);

			/* For now, asynchronous is not supported */
			STARPU_ASSERT(ret == 0);
		}
		break;

	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM,STARPU_CPU_RAM):
		if(copy_methods->any_to_any)
			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
		else
		{
			void *obj = starpu_data_handle_to_pointer(handle, src_node);
			void * ptr = NULL;
			size_t size = 0;
			/* req may be NULL here: only form the channel address
			 * when there is a request, as the any_to_any path
			 * above does. */
			ret = _starpu_disk_full_read(src_node, dst_node, obj, &ptr, &size, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
			if (ret == 0)
			{
				/* read is already finished, we can already unpack */
				handle->ops->unpack_data(handle, dst_node, ptr, size);
				/* ptr is allocated in full_read */
				free(ptr);
			}

			/* For now, asynchronous is not supported */
			STARPU_ASSERT(ret == 0);
		}
		break;

	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM,STARPU_DISK_RAM):
		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
		break;

	default:
		STARPU_ABORT();
		break;
	}

	return ret;
#endif /* !SIMGRID */
}
void cpu_to_opencl_opencl_func(void *buffers[], void *args) { STARPU_SKIP_IF_VALGRIND; (void) args; int id, devid, ret; cl_int err; cl_kernel kernel; cl_command_queue queue; cl_event event; unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]); cl_mem src = (cl_mem) STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]); cl_mem dst = (cl_mem) STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]); id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_conversion_program, NULL); STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file"); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_conversion_program, "cpu_to_opencl_opencl", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(src), &src); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(dst), &dst); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 2, sizeof(n), &n); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=n; size_t local; size_t s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local = global; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } clFinish(queue); starpu_opencl_collect_stats(event); clReleaseEvent(event); starpu_opencl_release_kernel(kernel); ret = starpu_opencl_unload_opencl(&opencl_conversion_program); STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl"); }
void multiformat_scal_opencl_func(void *buffers[], void *args) { (void) args; int id, devid; cl_int err; cl_kernel kernel; cl_command_queue queue; cl_event event; unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]); cl_mem val = (cl_mem)STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]); id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "multiformat_opencl", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(val), &val); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(n), &n); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=n; size_t local; size_t s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local = global; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } clFinish(queue); starpu_opencl_collect_stats(event); clReleaseEvent(event); starpu_opencl_release_kernel(kernel); }
void test_variable_opencl_func(void *buffers[], void *args) { STARPU_SKIP_IF_VALGRIND; int id, devid, ret; int factor = *(int *) args; cl_int err; cl_kernel kernel; cl_command_queue queue; cl_event event; ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL); STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file"); cl_mem val = (cl_mem)STARPU_VARIABLE_GET_PTR(buffers[0]); cl_context context; id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); starpu_opencl_get_context(devid, &context); cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(int), &variable_config.copy_failed, &err); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "variable_opencl", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(val), &val); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(fail), &fail); if (err) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 2, sizeof(factor), &factor); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global = 1; size_t local; size_t s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local = global; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } err = clEnqueueReadBuffer(queue, fail, CL_TRUE, 0, sizeof(int), &variable_config.copy_failed, 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); clFinish(queue); starpu_opencl_collect_stats(event); clReleaseEvent(event); starpu_opencl_release_kernel(kernel); ret = starpu_opencl_unload_opencl(&opencl_program); STARPU_CHECK_RETURN_VALUE(ret, 
"starpu_opencl_unload_opencl"); return; }