void WebCLEvent::setCallback(unsigned commandExecCallbackType, WebCLCallback* callback, ExceptionState& es) { if (isReleased()) { es.throwWebCLException(WebCLException::INVALID_EVENT, WebCLException::invalidEventMessage); return; } if (commandExecCallbackType != CL_COMPLETE) { es.throwWebCLException(WebCLException::INVALID_VALUE, WebCLException::invalidValueMessage); return; } ASSERT(callback); if (m_callbacks.size()) { m_callbacks.append(adoptRef(callback)); return; } m_callbacks.clear(); m_callbacks.append(adoptRef(callback)); WebCLEventHolder* holder = new WebCLEventHolder; holder->event = createWeakPtr(); holder->type = commandExecCallbackType; cl_int err = clSetEventCallback(m_clEvent, commandExecCallbackType, &callbackProxy, holder); if (err != CL_SUCCESS) WebCLException::throwException(err, es); }
/* Writes the contents of a given dataset into a given cl_mem device memory buffer * * @env: Struct containing device/context/queue variables. * @mem_struct: Struct containing cl_mem buffer and the number of entries it can hold. * @dataset: Pointer to an integer array of data to be read, same length as buffer. */ void loadIntArrayIntoDevice( const RubiCLEnvironment env, const RubiCLMemoryBuffer mem_struct, int* dataset ) { if (DEBUG) printf("loadIntArrayIntoDevice\n"); cl_event write_event; cl_int ret = clEnqueueWriteBuffer( env.queue, // Command queue mem_struct.buffer, // Memory buffer CL_FALSE, // Blocking write? (set to nonblocking) 0, // Offset in buffer to write to mem_struct.buffer_entries * sizeof(int), // Input data size dataset, // Input data 0, // Number of preceding actions NULL, // List of preceding actions &write_event // Event object destination ); if (ret != CL_SUCCESS) printf("clEnqueueWriteBuffer %s\n", oclErrorString(ret)); clSetEventCallback( write_event, // Event to monitor CL_COMPLETE, // Status to fire on &releaseMemoryCallback, // Callback to trigger dataset // Data to pass to callback ); }
rcl_status cl_write_buffer(struct client_state* state, buffer_t buffer, void* ptr, uint64_t offset, uint32_t size) { struct buffer_state* buffer_state; cl_int retval; cl_event event; buffer--; if (!vector_valid_idx(&state->buffers, buffer)) { log_print(log_error, "Buffer %" PRIu32 " not found", buffer); return RCL_INVALID_BUFFER; } log_print(log_notice, "Writing to buffer, size: %" PRIu32 ", offset: %" \ PRIu64, size, offset); buffer_state = *vector_element(&state->buffers, buffer, struct buffer_state*); retval = clEnqueueWriteBuffer(state->command_queue, buffer_state->id, CL_FALSE, offset, size, ptr, 0, NULL, &event); if (retval) return opencl_error(retval); if (buffer_state->original) memcpy(buffer_state->original + offset, ptr, size); retval = clSetEventCallback(event, CL_COMPLETE, cl_free_write_buffer, ptr); if (retval) { clFinish(state->command_queue); free(ptr); } return RCL_OK; }
cl_int clEnqueueNDRangeKernel (cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) { static struct work_size_s work_sizes; struct ld_kernel_s *ldKernel = find_kernel_entry(kernel); int i; cl_int errcode; if (num_events_in_wait_list) { clCheck(clWaitForEvents(num_events_in_wait_list, event_wait_list)); } assert(ldKernel); for (i = 0; i < work_dim; i++) { work_sizes.local[i] = local_work_size[i]; work_sizes.global[i] = global_work_size[i]/work_sizes.local[i]; } #if ENABLE_KERNEL_PROFILING == 1 static cl_event kern_event; if (!event) { event = &kern_event; // scope of the event is limited to this function. } #endif kernel_executed_event(ldKernel, &work_sizes, work_dim); errcode = real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event); #if ENABLE_KERNEL_PROFILING == 1 clCheck(errcode); clRetainEvent(*event); clSetEventCallback(*event, CL_COMPLETE, kernel_profiler_cb, ldKernel); #endif #if FORCE_FINISH_KERNEL real_clFinish(command_queue); #endif kernel_finished_event(ldKernel, &work_sizes, work_dim); return errcode; }
/// Registers a function to be called when the event status changes to /// \p status (by default CL_COMPLETE). The callback is passed the OpenCL /// event object, the event status, and a pointer to arbitrary user data. /// /// \see_opencl_ref{clSetEventCallback} /// /// \opencl_version_warning{1,1} void set_callback(void (BOOST_COMPUTE_CL_CALLBACK *callback)( cl_event event, cl_int status, void *user_data ), cl_int status = CL_COMPLETE, void *user_data = 0) { cl_int ret = clSetEventCallback(m_event, status, callback, user_data); if(ret != CL_SUCCESS){ BOOST_THROW_EXCEPTION(opencl_error(ret)); } }
cl_int EventWrapper::setEventCallback (cl_int aCommandExecCallbackType, void (*aCallback)(cl_event, cl_int, void*), void* aUserData) { #if CL_WRAPPER_CL_VERSION_SUPPORT >= 110 return clSetEventCallback (mWrapped, aCommandExecCallbackType, aCallback, aUserData); #else // CL_WRAPPER_CL_VERSION_SUPPORT >= 110 (void)aCommandExecCallbackType; (void)aCallback; (void)aUserData; D_LOG (LOG_LEVEL_ERROR, "CLWrapper support for OpenCL 1.1 API was not enabled at build time."); return CL_INVALID_VALUE; #endif }
static int RegisterCallback(lua_State *L) { cl_event event = *traits::CheckObject(L, 1); cl_int cmdType = static_cast<cl_int>(luaL_checknumber(L, 2)); lua_State *thread = traits::CreateCallbackThread(L, 3); if (thread == NULL) { return 0; } cl_int err = clSetEventCallback(event, cmdType, Callback, thread); CheckCLError(L, err, "Failed registering event callback: %s."); lua_pushvalue(L, 1); traits::RegisterCallback(L); return 0; }
void Event::callback( CommandExecutionCallbackType type, void (*pfn_notify) (cl_event event, cl_int status, void * user_data), void * user_data ) { static const auto error_map = error::ErrorMap{ {ErrorCode::invalid_event, "the given event is invalid."}, {ErrorCode::invalid_value, "the given callback is null; OR the given callback type is invalid."} }; error::handle<EventException>( clSetEventCallback( m_id, static_cast<std::underlying_type<CommandExecutionCallbackType>::type>(type), pfn_notify, user_data ), error_map ); }
void WebCL::waitForEventsImpl(const Vector<RefPtr<WebCLEvent>>& events, WebCLCallback* callback) { Vector<cl_event> clEvents; Vector<WeakPtr<WebCLObject>> webEvents; WebCLHolder* holder = new WebCLHolder; holder->webcl = m_weakFactory.createWeakPtr(); for (auto event : events) { clEvents.append(event->getEvent()); webEvents.append(event->createWeakPtr()); } if (!callback) { clWaitForEvents(clEvents.size(), clEvents.data()); } else { m_callbackRegisterQueue.append(std::make_pair(webEvents, adoptRef(callback))); for (auto clEvent : clEvents) clSetEventCallback(clEvent, CL_COMPLETE, &callbackProxy, holder); } }
void WebCLEvent::setCallback(unsigned long commandExecCallbackType, PassOwnPtr<WebCLCallback> callback, ExceptionState& es) { cl_int err = 0; if (m_cl_Event == NULL) { es.throwWebCLException( WebCLException::INVALID_EVENT, WebCLException::invalidEventMessage); return; } if (commandExecCallbackType != WebCL::COMPLETE) { es.throwWebCLException( WebCLException::INVALID_VALUE, WebCLException::invalidValueMessage); return; } ASSERT(callback); if (CallbackDataVector* vector = callbackRegisterQueue().get(this)) { vector->append(std::make_pair(commandExecCallbackType, callback)); return; } OwnPtr<CallbackDataVector> vector = adoptPtr(new CallbackDataVector()); vector->append(std::make_pair(commandExecCallbackType, callback)); callbackRegisterQueue().set(this, vector.release()); err = WebCL::SUCCESS; pfnEventNotify callbackProxyPtr = &callbackProxy; err = clSetEventCallback(m_cl_Event, commandExecCallbackType, callbackProxyPtr, this); if (err != CL_SUCCESS) { WebCLException::throwException(err, es); } return; }
void AutoExposure::update(cl_command_queue queue, cl_mem image) { assert(_initialized); // If auto-exposure is disabled, don't do anything if(!_autoExposure) return; // Increse/decrease exposure from the difference between the current luma // average with an expected 0.5 frame average. _exposure *= 1.0f + qBound(-_adjustSpeed, 0.5f - _exposureData.meteringAverage, _adjustSpeed); _updateCounter++; if(_updateCounter % _updatePeriod) return; _updateCounter= 0; int ai= 0; clKernelArg(_downKernel, ai++, image); clKernelArg(_downKernel, ai++, _lumaImage); if(!clLaunchKernelEvent(_downKernel, queue, _lumaSize, "AE/Downsample")) return; // Download image data cl_event& downloadEvent= analytics.clEvent("AE/Download"); size_t origin[3]= { 0,0,0 }; size_t region[3]= { (size_t)_lumaSize.width(), (size_t)_lumaSize.height(), 1 }; cl_int error= clEnqueueReadImage(queue, _lumaImage, CL_FALSE, origin, region, _lumaSize.width(),0,_lumaData,0,0, &downloadEvent); if(clCheckError(error, "clEnqueueReadImage")) return; error= clSetEventCallback(downloadEvent, CL_COMPLETE, exposureCallback, (void*)this); clCheckError(error, "clSetEventCallback"); // When the download is done, exposureCallback will be called }
static void opencl_task_start( MTAPI_IN mtapi_task_hndl_t task, MTAPI_OUT mtapi_status_t* status) { mtapi_status_t local_status = MTAPI_ERR_UNKNOWN; cl_int err; if (embb_mtapi_node_is_initialized()) { embb_mtapi_node_t * node = embb_mtapi_node_get_instance(); if (embb_mtapi_task_pool_is_handle_valid(node->task_pool, task)) { embb_mtapi_task_t * local_task = embb_mtapi_task_pool_get_storage_for_handle(node->task_pool, task); if (embb_mtapi_action_pool_is_handle_valid( node->action_pool, local_task->action)) { embb_mtapi_action_t * local_action = embb_mtapi_action_pool_get_storage_for_handle( node->action_pool, local_task->action); embb_mtapi_opencl_plugin_t * plugin = &embb_mtapi_opencl_plugin; embb_mtapi_opencl_action_t * opencl_action = (embb_mtapi_opencl_action_t*)local_action->plugin_data; embb_mtapi_opencl_task_t * opencl_task = (embb_mtapi_opencl_task_t*)embb_alloc( sizeof(embb_mtapi_opencl_task_t)); size_t elements = local_task->result_size / opencl_action->element_size; size_t global_work_size; if (0 == elements) elements = 1; global_work_size = round_up(opencl_action->local_work_size, elements); opencl_task->task = task; opencl_task->arguments_size = (int)local_task->arguments_size; if (0 < local_task->arguments_size) { opencl_task->arguments = clCreateBuffer(plugin->context, CL_MEM_READ_ONLY, local_task->arguments_size, NULL, &err); } else { opencl_task->arguments = NULL; } opencl_task->result_buffer_size = (int)local_task->result_size; if (0 < local_task->result_size) { opencl_task->result_buffer = clCreateBuffer(plugin->context, CL_MEM_WRITE_ONLY, local_task->result_size, NULL, &err); } else { opencl_task->result_buffer = NULL; } err = clSetKernelArg(opencl_action->kernel, 0, sizeof(cl_mem), (const void*)&opencl_task->arguments); err |= clSetKernelArg(opencl_action->kernel, 1, sizeof(cl_int), (const void*)&opencl_task->arguments_size); err |= clSetKernelArg(opencl_action->kernel, 2, sizeof(cl_mem), (const void*)&opencl_task->result_buffer); err |= clSetKernelArg(opencl_action->kernel, 3, sizeof(cl_int), (const void*)&opencl_task->result_buffer_size); err |= clEnqueueWriteBuffer(plugin->command_queue, opencl_task->arguments, CL_FALSE, 0, (size_t)opencl_task->arguments_size, local_task->arguments, 0, NULL, NULL); if (CL_SUCCESS == err) { embb_mtapi_task_set_state(local_task, MTAPI_TASK_RUNNING); err |= clEnqueueNDRangeKernel(plugin->command_queue, opencl_action->kernel, 1, NULL, &global_work_size, &opencl_action->local_work_size, 0, NULL, NULL); err |= clEnqueueReadBuffer(plugin->command_queue, opencl_task->result_buffer, CL_FALSE, 0, (size_t)opencl_task->result_buffer_size, local_task->result_buffer, 0, NULL, &opencl_task->kernel_finish_event); err |= clSetEventCallback(opencl_task->kernel_finish_event, CL_COMPLETE, opencl_task_complete, opencl_task); } err |= clFlush(plugin->command_queue); if (CL_SUCCESS != err) { embb_mtapi_task_set_state(local_task, MTAPI_TASK_ERROR); local_status = MTAPI_ERR_ACTION_FAILED; } else { local_status = MTAPI_SUCCESS; } } } } mtapi_status_set(status, local_status); }
void enqueue() { // Errors in this function can not be handled by opencl_err.hpp // because they require non-standard error handling CAF_LOG_TRACE("command::enqueue()"); this->ref(); // reference held by the OpenCL comand queue cl_event event_k; auto data_or_nullptr = [](const dim_vec& vec) { return vec.empty() ? nullptr : vec.data(); }; // OpenCL expects cl_uint (unsigned int), hence the cast cl_int err = clEnqueueNDRangeKernel( queue_.get(), actor_facade_->kernel_.get(), static_cast<cl_uint>(actor_facade_->config_.dimensions().size()), data_or_nullptr(actor_facade_->config_.offsets()), data_or_nullptr(actor_facade_->config_.dimensions()), data_or_nullptr(actor_facade_->config_.local_dimensions()), static_cast<cl_uint>(mem_in_events_.size()), (mem_in_events_.empty() ? nullptr : mem_in_events_.data()), &event_k ); if (err != CL_SUCCESS) { CAF_LOGMF(CAF_ERROR, "clEnqueueNDRangeKernel: " << get_opencl_error(err)); clReleaseEvent(event_k); this->deref(); return; } else { enqueue_read_buffers(event_k, detail::get_indices(result_buffers_)); cl_event marker; #if defined(__APPLE__) err = clEnqueueMarkerWithWaitList( queue_.get(), static_cast<cl_uint>(mem_out_events_.size()), mem_out_events_.data(), &marker ); #else err = clEnqueueMarker(queue_.get(), &marker); #endif if (err != CL_SUCCESS) { CAF_LOGMF(CAF_ERROR, "clSetEventCallback: " << get_opencl_error(err)); clReleaseEvent(marker); clReleaseEvent(event_k); this->deref(); // callback is not set return; } err = clSetEventCallback(marker, CL_COMPLETE, [](cl_event, cl_int, void* data) { auto cmd = reinterpret_cast<command*>(data); cmd->handle_results(); cmd->deref(); }, this); if (err != CL_SUCCESS) { CAF_LOGMF(CAF_ERROR, "clSetEventCallback: " << get_opencl_error(err)); clReleaseEvent(marker); clReleaseEvent(event_k); this->deref(); // callback is not set return; } err = clFlush(queue_.get()); if (err != CL_SUCCESS) { CAF_LOGMF(CAF_ERROR, "clFlush: " << get_opencl_error(err)); } mem_out_events_.push_back(std::move(event_k)); mem_out_events_.push_back(std::move(marker)); } }
int main() { cl_int error_code = CL_SUCCESS; try { // find Intel platform cl_uint num_platforms = 0; error_code = clGetPlatformIDs(0, nullptr, &num_platforms); HANDLE_CL_ERROR(clGetPlatformIDs) std::unique_ptr<cl_platform_id[]> platform_ids( new cl_platform_id[static_cast<const std::size_t>(num_platforms)]); error_code = clGetPlatformIDs(num_platforms, platform_ids.get(), nullptr); HANDLE_CL_ERROR(clGetPlatformIDs) cl_platform_id platform = nullptr; for (std::size_t i = 0; i != static_cast<const std::size_t>(num_platforms); ++i) { std::size_t platform_name_size = 0; error_code = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, 0, nullptr, &platform_name_size); HANDLE_CL_ERROR(clGetPlatformInfo) std::unique_ptr<char[]> platform_name(new char[platform_name_size]); error_code = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, platform_name_size, platform_name.get(), nullptr); HANDLE_CL_ERROR(clGetPlatformInfo) if (std::strcmp(beignet_platform_name, platform_name.get()) == 0) { platform = platform_ids[i]; std::cout << "Platform: " << platform_name.get() << std::endl; break; } } if (platform == nullptr) { throw std::runtime_error(std::string("Couldn't find platform with name: ") + beignet_platform_name); } // find Intel GPU cl_device_id device = nullptr; error_code = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, nullptr); HANDLE_CL_ERROR(clGetDeviceIDs) std::size_t device_name_size = 0; error_code = clGetDeviceInfo(device, CL_DEVICE_NAME, 0, nullptr, &device_name_size); HANDLE_CL_ERROR(clGetDeviceInfo) std::unique_ptr<char[]> device_name(new char[device_name_size]); error_code = clGetDeviceInfo(device, CL_DEVICE_NAME, device_name_size, device_name.get(), nullptr); HANDLE_CL_ERROR(clGetDeviceInfo) std::cout << "Device: " << device_name.get() << std::endl; // create OpenCL context, command queue, program and kernel const auto context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &error_code); HANDLE_CL_ERROR(clCreateContext) const auto command_queue = clCreateCommandQueue(context, device, 0, &error_code); HANDLE_CL_ERROR(clCreateCommandQueue) const char *source_strings[1]; source_strings[0] = kernel_source; const std::size_t source_size = std::strlen(kernel_source); const auto program = clCreateProgramWithSource(context, 1, source_strings, &source_size, &error_code); HANDLE_CL_ERROR(clCreateProgramWithSource) error_code = clBuildProgram(program, 1, &device, "", nullptr, nullptr); HANDLE_CL_ERROR(clBuildProgram) const auto kernel = clCreateKernel(program, "print_hello", &error_code); HANDLE_CL_ERROR(clCreateKernel) // enqueue kernel and set event completion handler cl_event event; std::size_t global_work_size = 1; error_code = clEnqueueNDRangeKernel(command_queue, kernel, 1, nullptr, &global_work_size, nullptr, 0, nullptr, &event); HANDLE_CL_ERROR(clEnqueueNDRangeKernel) error_code = clSetEventCallback(event, CL_COMPLETE, [](cl_event, cl_int, void *) { std::cout << "OpenCL callback" << std::endl; // Notify the waiting thread that the kernel is completed { std::lock_guard<std::mutex> cond_lock(cond_mutex); kernel_complete = true; } cond_var.notify_one(); }, nullptr); HANDLE_CL_ERROR(clSetEventCallback) error_code = clFlush(command_queue); HANDLE_CL_ERROR(clFlush) // simulate work std::this_thread::sleep_for(std::chrono::seconds(1)); // do work, dependent on kernel completion { std::unique_lock<std::mutex> cond_lock(cond_mutex); while (!kernel_complete) { if (cond_var.wait_for(cond_lock, std::chrono::seconds(5)) == std::cv_status::timeout) { std::cout << "WARNING: A 5 second timeout has been reached on the condition variable.\n" " This may be a deadlock." << std::endl; } } } // When using Beignet, this will never be called as a deadlock will occur. std::cout << "Doing work, dependent on the kernel's completion" << std::endl; } catch (const std::exception &e) { std::cout << "Error: " << e.what() << std::endl; } catch (...) { std::cout << "Unknown error" << std::endl; } }
cl_int clEnqueueFillBuffer(cl_command_queue command_queue, cl_mem buffer, const void *pattern, size_t pattern_size, size_t offset, size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) { static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; static cl_program last_program = NULL; static cl_context last_context = NULL; cl_kernel kernel; cl_program program; cl_context context; cl_device_id device; size_t gworksz; size_t lworksz; char kernel_name[80]; cl_int rc; union { cl_char v_char; cl_short v_short; cl_int v_int; cl_long v_long; } pattern_value; cl_uint pattern_nums; switch (pattern_size) { case sizeof(cl_char): pattern_value.v_char = *((cl_char *)pattern); break; case sizeof(cl_short): pattern_value.v_short = *((cl_short *)pattern); break; case sizeof(cl_int): pattern_value.v_int = *((cl_int *)pattern); break; case sizeof(cl_long): pattern_value.v_long = *((cl_long *)pattern); break; default: /* * pattern_size was not support one, even though OpenCL 1.2 * spec says 16, 32, 64 or 128 bytes patterns are supported. */ return CL_INVALID_VALUE; } /* ensure alignment */ if (offset % pattern_size != 0) return CL_INVALID_VALUE; if (size % pattern_size != 0) return CL_INVALID_VALUE; /* fetch context and device_id associated with this command queue */ rc = clGetCommandQueueInfo(command_queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &context, NULL); if (rc != CL_SUCCESS) return rc; pthread_mutex_lock(&lock); if (last_program && last_context == context) { rc = clRetainProgram(last_program); if (rc != CL_SUCCESS) goto out_unlock; program = last_program; } else { char source[10240]; const char *prog_source[1]; size_t prog_length[1]; cl_uint num_devices; cl_device_id *device_ids; static struct { const char *type_name; size_t type_size; } pattern_types[] = { { "char", sizeof(cl_char) }, { "short", sizeof(cl_short) }, { "int", sizeof(cl_int) }, { "long", sizeof(cl_long) }, }; size_t i, ofs; /* fetch properties of cl_context */ rc = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL); if (rc != CL_SUCCESS) goto out_unlock; Assert(num_devices > 0); device_ids = calloc(num_devices, sizeof(cl_device_id)); if (!device_ids) { rc = CL_OUT_OF_HOST_MEMORY; goto out_unlock; } rc = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(cl_device_id) * num_devices, device_ids, NULL); if (rc != CL_SUCCESS) { free(device_ids); goto out_unlock; } /* release the previous program */ if (last_program) { rc = clReleaseProgram(last_program); Assert(rc == CL_SUCCESS); last_program = NULL; last_context = NULL; } /* create a program object */ for (i=0, ofs=0; i < lengthof(pattern_types); i++) { ofs += snprintf( source + ofs, sizeof(source) - ofs, "__kernel void\n" "pgstromEnqueueFillBuffer_%zu(__global %s *buffer,\n" " %s value, uint nums)\n" "{\n" " if (get_global_id(0) >= nums)\n" " return;\n" " buffer[get_global_id(0)] = value;\n" "}\n", pattern_types[i].type_size, pattern_types[i].type_name, pattern_types[i].type_name); } prog_source[0] = source; prog_length[0] = ofs; program = clCreateProgramWithSource(context, 1, prog_source, prog_length, &rc); if (rc != CL_SUCCESS) { free(device_ids); goto out_unlock; } /* build this program object */ rc = clBuildProgram(program, num_devices, device_ids, NULL, NULL, NULL); free(device_ids); if (rc != CL_SUCCESS) { clReleaseProgram(program); goto out_unlock; } /* acquire the program object */ rc = clRetainProgram(program); if (rc != CL_SUCCESS) { clReleaseProgram(program); goto out_unlock; } last_program = program; last_context = context; } pthread_mutex_unlock(&lock); Assert(program != NULL); /* fetch a device id of this command queue */ rc = clGetCommandQueueInfo(command_queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL); if (rc != CL_SUCCESS) goto out_release_program; /* fetch a kernel object to be called */ snprintf(kernel_name, sizeof(kernel_name), "pgstromEnqueueFillBuffer_%zu", pattern_size); kernel = clCreateKernel(program, kernel_name, &rc); if (rc != CL_SUCCESS) goto out_release_program; /* 1st arg: __global <typename> *buffer */ rc = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer); if (rc != CL_SUCCESS) goto out_release_kernel; /* 2nd arg: <typename> value */ rc = clSetKernelArg(kernel, 1, pattern_size, &pattern_value); if (rc != CL_SUCCESS) goto out_release_kernel; /* 3rd arg: size_t nums */ pattern_nums = (offset + size) / pattern_size; rc = clSetKernelArg(kernel, 2, sizeof(cl_uint), &pattern_nums); if (rc != CL_SUCCESS) goto out_release_kernel; /* calculate optimal workgroup size */ rc = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &lworksz, NULL); Assert((lworksz & (lworksz - 1)) == 0); gworksz = ((size / pattern_size + lworksz - 1) / lworksz) * lworksz; /* enqueue a kernel, instead of clEnqueueFillBuffer */ offset /= pattern_size; rc = clEnqueueNDRangeKernel(command_queue, kernel, 1, &offset, &gworksz, &lworksz, num_events_in_wait_list, event_wait_list, event); if (rc != CL_SUCCESS) goto out_release_kernel; rc = clSetEventCallback(*event, CL_COMPLETE, pgstromEnqueueFillBufferCleanup, kernel); if (rc != CL_SUCCESS) { clWaitForEvents(1, event); goto out_release_kernel; } return CL_SUCCESS; out_unlock: pthread_mutex_unlock(&lock); return rc; out_release_kernel: clReleaseKernel(kernel); out_release_program: clReleaseProgram(program); return rc; }
void Event::setCallback(EEventStatus status, EventCallback cb) { _callback = std::move(cb); clSetEventCallback(_id, cl_int(status), &detail::callback_priv, &_callback); }
void enqueue () { CPPA_LOG_TRACE("command::enqueue()"); this->ref(); // reference held by the OpenCL comand queue cl_int err{0}; cl_event event_k; auto data_or_nullptr = [](const dim_vec& vec) { return vec.empty() ? nullptr : vec.data(); }; err = clEnqueueNDRangeKernel(m_queue.get(), m_actor_facade->m_kernel.get(), m_actor_facade->m_global_dimensions.size(), data_or_nullptr(m_actor_facade->m_global_offsets), data_or_nullptr(m_actor_facade->m_global_dimensions), data_or_nullptr(m_actor_facade->m_local_dimensions), m_events.size(), (m_events.empty() ? nullptr : m_events.data()), &event_k); if (err != CL_SUCCESS) { CPPA_LOGMF(CPPA_ERROR, "clEnqueueNDRangeKernel: " << get_opencl_error(err)); this->deref(); // or can anything actually happen? return; } else { cl_event event_r; err = clEnqueueReadBuffer(m_queue.get(), m_arguments.back().get(), CL_FALSE, 0, sizeof(typename R::value_type) * m_result_size, m_result.data(), 1, &event_k, &event_r); if (err != CL_SUCCESS) { throw std::runtime_error("clEnqueueReadBuffer: " + get_opencl_error(err)); this->deref(); // failed to enqueue command return; } err = clSetEventCallback(event_r, CL_COMPLETE, [](cl_event, cl_int, void* data) { auto cmd = reinterpret_cast<command*>(data); cmd->handle_results(); cmd->deref(); }, this); if (err != CL_SUCCESS) { CPPA_LOGMF(CPPA_ERROR, "clSetEventCallback: " << get_opencl_error(err)); this->deref(); // callback is not set return; } err = clFlush(m_queue.get()); if (err != CL_SUCCESS) { CPPA_LOGMF(CPPA_ERROR, "clFlush: " << get_opencl_error(err)); } m_events.push_back(std::move(event_k)); m_events.push_back(std::move(event_r)); } }
int main() { cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem objA = NULL; cl_mem objB = NULL; cl_mem objC = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; cl_event event1; int i, j; float *A; float *B; float *C; A = (float *)malloc(4*4*sizeof(float)); B = (float *)malloc(4*4*sizeof(float)); C = (float *)malloc(4*4*sizeof(float)); /* Initialize input data */ for (i=0; i<4; i++) { for (j=0; j<4; j++) { A[i*4+j] = i*4+j+1; B[i*4+j] = j*4+i+1; } } /* Get Platform/Device Information*/ ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); /* Create OpenCL Context */ context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); /* Create command queue */ command_queue = clCreateCommandQueue(context, device_id, 0, &ret); /* Create Buffer Object */ objA = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret); objB = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret); objC = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret); /* * Creating an user event * As a user event is created, its execution status is set to be CL_SUBMITTED * and we tag the event to a callback so when event reaches CL_COMPLETE, it will * execute postProcess */ event1 = clCreateUserEvent(context, &ret); clSetEventCallback(event1, CL_COMPLETE, &postProcess, "Looks like its done."); /* Copy input data to the memory buffer */ ret = clEnqueueWriteBuffer(command_queue, objA, CL_TRUE, 0, 4*4*sizeof(float), A, 0, NULL, NULL ); printf("A has been written\n"); /* The next command will wait for event1 according to its status*/ ret = clEnqueueWriteBuffer(command_queue, objB, CL_TRUE, 0, 4*4*sizeof(float), B, 1, &event1, NULL); printf("B has been written\n"); /* Tell event1 to complete */ clSetUserEventStatus(event1, CL_COMPLETE); const char *file_names[] = {"sample_kernel.cl"}; const int NUMBER_OF_FILES = 1; char* buffer[NUMBER_OF_FILES]; size_t sizes[NUMBER_OF_FILES]; loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes); /* Create kernel program from source file*/ program = clCreateProgramWithSource(context, 1, (const char **)buffer, sizes, &ret); ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); /* Create data parallel OpenCL kernel */ kernel = clCreateKernel(program, "sample", &ret); /* Set OpenCL kernel arguments */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&objA); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&objB); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&objC); size_t global_item_size = 4; size_t local_item_size = 1; /* Execute OpenCL kernel as data parallel */ ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); /* Transfer result to host */ ret = clEnqueueReadBuffer(command_queue, objC, CL_TRUE, 0, 4*4*sizeof(float), C, 0, NULL, NULL); /* Display Results */ for (i=0; i<4; i++) { for (j=0; j<4; j++) { printf("%7.2f ", C[i*4+j]); } printf("\n"); } /* Finalization */ ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(objA); ret = clReleaseMemObject(objB); ret = clReleaseMemObject(objC); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); free(A); free(B); free(C); return 0; }
/* PRUint32 run (in PRUint32 rank, [array, size_is (rank)] in PRUint32 shape, [array, size_is (rank), optional] in PRUint32 tile); */ NS_IMETHODIMP dpoCKernel::Run(PRUint32 rank, PRUint32 *shape, PRUint32 *tile, PRUint32 *_retval) { cl_int err_code; cl_event runEvent, readEvent, writeEvent; size_t *global_work_size; size_t *local_work_size; const int zero = 0; DEBUG_LOG_STATUS("Run", "preparing execution of kernel"); if (sizeof(size_t) == sizeof(PRUint32)) { global_work_size = (size_t *) shape; } else { global_work_size = (size_t *) nsMemory::Alloc(rank * sizeof(size_t)); if (global_work_size == NULL) { DEBUG_LOG_STATUS("Run", "allocation of global_work_size failed"); return NS_ERROR_OUT_OF_MEMORY; } for (int cnt = 0; cnt < rank; cnt++) { global_work_size[cnt] = shape[cnt]; } } #ifdef USE_LOCAL_WORKSIZE if (tile == NULL) { local_work_size = NULL; } else { if ((sizeof(size_t) == sizeof(PRUint32))) { local_work_size = (size_t *) tile; } else { local_work_size = (size_t *) nsMemory::Alloc(rank * sizeof(size_t)); if (local_work_size == NULL) { DEBUG_LOG_STATUS("Run", "allocation of local_work_size failed"); return NS_ERROR_OUT_OF_MEMORY; } for (int cnt = 0; cnt < rank; cnt++) { local_work_size[cnt] = (size_t) tile[cnt]; } } } #else /* USE_LOCAL_WORKSIZE */ local_work_size = NULL; #endif /* USE_LOCAL_WORKSIZE */ DEBUG_LOG_STATUS("Run", "setting failure code to 0"); err_code = clEnqueueWriteBuffer(cmdQueue, failureMem, CL_FALSE, 0, sizeof(int), &zero, 0, NULL, &writeEvent); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } DEBUG_LOG_STATUS("Run", "enqueing execution of kernel"); #ifdef WINDOWS_ROUNDTRIP dpoCContext::RecordBeginOfRoundTrip(parent); #endif /* WINDOWS_ROUNDTRIP */ err_code = clEnqueueNDRangeKernel(cmdQueue, kernel, rank, NULL, global_work_size, NULL, 1, &writeEvent, &runEvent); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } DEBUG_LOG_STATUS("Run", "reading failure code"); err_code = clEnqueueReadBuffer(cmdQueue, failureMem, CL_FALSE, 0, sizeof(int), _retval, 1, &runEvent, &readEvent); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } DEBUG_LOG_STATUS("Run", "waiting for execution to finish"); // For now we always wait for the run to complete. // In the long run, we may want to interleave this with JS execution and only sync on result read. err_code = clWaitForEvents( 1, &readEvent); DEBUG_LOG_STATUS("Run", "first event fired"); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } #ifdef WINDOWS_ROUNDTRIP dpoCContext::RecordEndOfRoundTrip(parent); #endif /* WINDOWS_ROUNDTRIP */ #ifdef CLPROFILE #ifdef CLPROFILE_ASYNC err_code = clSetEventCallback( readEvent, CL_COMPLETE, &dpoCContext::CollectTimings, parent); DEBUG_LOG_STATUS("Run", "second event fired"); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } #else /* CLPROFILE_ASYNC */ dpoCContext::CollectTimings(readEvent,CL_COMPLETE,parent); #endif /* CLPROFILE_ASYNC */ #endif /* CLPROFILE */ DEBUG_LOG_STATUS("Run", "execution completed successfully, start cleanup"); if (global_work_size != (size_t *) shape) { nsMemory::Free(global_work_size); } #ifdef USE_LOCAL_WORKSIZE if (local_work_size != (size_t *) tile) { nsMemory::Free(local_work_size); } #endif /* USE_LOCAL_WORKSIZE */ err_code = clReleaseEvent(readEvent); err_code = clReleaseEvent(runEvent); err_code = clReleaseEvent(writeEvent); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } DEBUG_LOG_STATUS("Run", "cleanup complete"); return NS_OK; }
/* .External */ SEXP ocl_call(SEXP args) { struct arg_chain *float_args = 0; ocl_call_context_t *occ; int on, an = 0, ftype = FT_DOUBLE, ftsize, ftres, async; SEXP ker = CADR(args), olen, arg, res, octx, dimVec; cl_kernel kernel = getKernel(ker); cl_context context; cl_command_queue commands; cl_device_id device_id = getDeviceID(getAttrib(ker, Rf_install("device"))); cl_mem output; cl_int err; size_t wdims[3] = {0, 0, 0}; int wdim = 1; if (clGetKernelInfo(kernel, CL_KERNEL_CONTEXT, sizeof(context), &context, NULL) != CL_SUCCESS || !context) Rf_error("cannot obtain kernel context via clGetKernelInfo"); args = CDDR(args); res = Rf_getAttrib(ker, install("precision")); if (TYPEOF(res) == STRSXP && LENGTH(res) == 1 && CHAR(STRING_ELT(res, 0))[0] != 'd') ftype = FT_SINGLE; ftsize = (ftype == FT_DOUBLE) ? sizeof(double) : sizeof(float); olen = CAR(args); /* size */ args = CDR(args); on = Rf_asInteger(olen); if (on < 0) Rf_error("invalid output length"); ftres = (Rf_asInteger(CAR(args)) == 1) ? 1 : 0; /* native.result */ if (ftype != FT_SINGLE) ftres = 0; args = CDR(args); async = (Rf_asInteger(CAR(args)) == 1) ? 0 : 1; /* wait */ args = CDR(args); dimVec = coerceVector(CAR(args), INTSXP); /* dim */ wdim = LENGTH(dimVec); if (wdim > 3) Rf_error("OpenCL standard only supports up to three work item dimensions - use index vectors for higher dimensions"); if (wdim) { int i; /* we don't use memcpy in case int and size_t are different */ for (i = 0; i < wdim; i++) wdims[i] = INTEGER(dimVec)[i]; } if (wdim < 1 || wdims[0] < 1 || (wdim > 1 && wdims[1] < 1) || (wdim > 2 && wdims[2] < 1)) Rf_error("invalid dimensions - muse be a numeric vector with positive values"); args = CDR(args); occ = (ocl_call_context_t*) calloc(1, sizeof(ocl_call_context_t)); if (!occ) Rf_error("unable to allocate ocl_call context"); octx = PROTECT(R_MakeExternalPtr(occ, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(octx, ocl_call_context_fin, TRUE); occ->output = output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, ftsize * on, NULL, &err); if (!output) Rf_error("failed to create output buffer of %d elements via clCreateBuffer (%d)", on, err); if (clSetKernelArg(kernel, an++, sizeof(cl_mem), &output) != CL_SUCCESS) Rf_error("failed to set first kernel argument as output in clSetKernelArg"); if (clSetKernelArg(kernel, an++, sizeof(on), &on) != CL_SUCCESS) Rf_error("failed to set second kernel argument as output length in clSetKernelArg"); occ->commands = commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) ocl_err("clCreateCommandQueue"); if (ftype == FT_SINGLE) /* need conversions, create floats buffer */ occ->float_args = float_args = arg_alloc(0, 32); while ((arg = CAR(args)) != R_NilValue) { int n, ndiv = 1; void *ptr; size_t al; switch (TYPEOF(arg)) { case REALSXP: if (ftype == FT_SINGLE) { int i; float *f; double *d = REAL(arg); n = LENGTH(arg); f = (float*) malloc(sizeof(float) * n); if (!f) Rf_error("unable to allocate temporary single-precision memory for conversion from a double-precision argument vector of length %d", n); for (i = 0; i < n; i++) f[i] = d[i]; ptr = f; al = sizeof(float); arg_add(float_args, ptr); } else { ptr = REAL(arg); al = sizeof(double); } break; case INTSXP: ptr = INTEGER(arg); al = sizeof(int); break; case LGLSXP: ptr = LOGICAL(arg); al = sizeof(int); break; case RAWSXP: if (inherits(arg, "clFloat")) { ptr = RAW(arg); ndiv = al = sizeof(float); break; } default: Rf_error("only numeric or logical kernel arguments are supported"); /* no-ops but needed to make the compiler happy */ ptr = 0; al = 0; } n = LENGTH(arg); if (ndiv != 1) n /= ndiv; if (n == 1) {/* scalar */ if (clSetKernelArg(kernel, an++, al, ptr) != CL_SUCCESS) Rf_error("Failed to set scalar kernel argument %d (size=%d)", an, al); } else { cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, al * n, ptr, &err); if (!input) Rf_error("Unable to create buffer (%d elements, %d bytes each) for vector argument %d (oclError %d)", n, al, an, err); if (!occ->mem_objects) occ->mem_objects = arg_alloc(0, 32); arg_add(occ->mem_objects, input); #if 0 /* we used this before CL_MEM_USE_HOST_PTR */ if (clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, al * n, ptr, 0, NULL, NULL) != CL_SUCCESS) Rf_error("Failed to transfer data (%d elements) for vector argument %d", n, an); #endif if (clSetKernelArg(kernel, an++, sizeof(cl_mem), &input) != CL_SUCCESS) Rf_error("Failed to set vector kernel argument %d (size=%d, length=%d)", an, al, n); /* clReleaseMemObject(input); */ } args = CDR(args); } if (clEnqueueNDRangeKernel(commands, kernel, wdim, NULL, wdims, NULL, 0, NULL, async ? &occ->event : NULL) != CL_SUCCESS) Rf_error("Error during kernel execution"); if (async) { /* asynchronous call -> get out and return the context */ #if USE_OCL_COMPLETE_CALLBACK clSetEventCallback(occ->event, CL_COMPLETE, ocl_complete_callback, occ); #endif clFlush(commands); /* the specs don't guarantee execution unless clFlush is called */ occ->ftres = ftres; occ->ftype = ftype; occ->on = on; Rf_setAttrib(octx, R_ClassSymbol, mkString("clCallContext")); UNPROTECT(1); return octx; } clFinish(commands); occ->finished = 1; /* we can release input memory objects now */ if (occ->mem_objects) { arg_free(occ->mem_objects, (afin_t) clReleaseMemObject); occ->mem_objects = 0; } if (float_args) { arg_free(float_args, 0); float_args = occ->float_args = 0; } res = ftres ? Rf_allocVector(RAWSXP, on * sizeof(float)) : Rf_allocVector(REALSXP, on); if (ftype == FT_SINGLE) { if (ftres) { if ((err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * on, RAW(res), 0, NULL, NULL )) != CL_SUCCESS) Rf_error("Unable to transfer result vector (%d float elements, oclError %d)", on, err); PROTECT(res); Rf_setAttrib(res, R_ClassSymbol, mkString("clFloat")); UNPROTECT(1); } else { /* float - need a temporary buffer */ float *fr = (float*) malloc(sizeof(float) * on); double *r = REAL(res); int i; if (!fr) Rf_error("unable to allocate memory for temporary single-precision output buffer"); occ->float_out = fr; if ((err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * on, fr, 0, NULL, NULL )) != CL_SUCCESS) Rf_error("Unable to transfer result vector (%d float elements, oclError %d)", on, err); for (i = 0; i < on; i++) r[i] = fr[i]; } } else if ((err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(double) * on, REAL(res), 0, NULL, NULL )) != CL_SUCCESS) Rf_error("Unable to transfer result vector (%d double elements, oclError %d)", on, err); ocl_call_context_fin(octx); UNPROTECT(1); return res; }
int main() { /* OpenCL data structures */ cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_int err; /* Data and events */ char *kernel_msg; float data[4096]; cl_mem data_buffer; cl_event kernel_event, read_event; /* Create a device and context */ device = create_device(); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build the program and create a kernel */ program = build_program(context, device, PROGRAM_FILE); kernel = clCreateKernel(program, KERNEL_FUNC, &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; /* Create a write-only buffer to hold the output data */ data_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(data), NULL, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; /* Create kernel argument */ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_buffer); if(err < 0) { perror("Couldn't set a kernel argument"); exit(1); }; /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Enqueue kernel */ err = clEnqueueTask(queue, kernel, 0, NULL, &kernel_event); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } /* Read the buffer */ err = clEnqueueReadBuffer(queue, data_buffer, CL_FALSE, 0, sizeof(data), &data, 0, NULL, &read_event); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } /* Set event handling routines */ kernel_msg = "The kernel finished successfully.\n\0"; err = clSetEventCallback(kernel_event, CL_COMPLETE, &kernel_complete, kernel_msg); if(err < 0) { perror("Couldn't set callback for event"); exit(1); } clSetEventCallback(read_event, CL_COMPLETE, &read_complete, data); /* Deallocate resources */ clReleaseMemObject(data_buffer); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }