Esempio n. 1
0
void WebCLEvent::setCallback(unsigned commandExecCallbackType, WebCLCallback* callback, ExceptionState& es)
{
    if (isReleased()) {
        es.throwWebCLException(WebCLException::INVALID_EVENT, WebCLException::invalidEventMessage);
        return;
    }

    if (commandExecCallbackType != CL_COMPLETE) {
        es.throwWebCLException(WebCLException::INVALID_VALUE, WebCLException::invalidValueMessage);
        return;
    }

    ASSERT(callback);
    if (m_callbacks.size()) {
        m_callbacks.append(adoptRef(callback));
        return;
    }

    m_callbacks.clear();
    m_callbacks.append(adoptRef(callback));
    WebCLEventHolder* holder = new WebCLEventHolder;
    holder->event = createWeakPtr();
    holder->type = commandExecCallbackType;
    cl_int err = clSetEventCallback(m_clEvent, commandExecCallbackType, &callbackProxy, holder);

    if (err != CL_SUCCESS)
        WebCLException::throwException(err, es);
}
Esempio n. 2
0
/* Writes the contents of a given dataset into a given cl_mem device memory buffer
 *
 * @env: Struct containing device/context/queue variables.
 * @mem_struct: Struct containing cl_mem buffer and the number of entries it can hold.
 * @dataset: Pointer to an integer array of data to be read, same length as buffer. */
void loadIntArrayIntoDevice(
  const RubiCLEnvironment env,
  const RubiCLMemoryBuffer mem_struct,
  int* dataset
) {
    if (DEBUG) printf("loadIntArrayIntoDevice\n");
  cl_event write_event;
  cl_int ret = clEnqueueWriteBuffer(
    env.queue,                                 // Command queue
    mem_struct.buffer,                         // Memory buffer
    CL_FALSE,                                  // Blocking write? (set to nonblocking)
    0,                                         // Offset in buffer to write to
    mem_struct.buffer_entries * sizeof(int),   // Input data size
    dataset,                                   // Input data
    0,                                         // Number of preceding actions
    NULL,                                      // List of preceding actions
    &write_event                               // Event object destination
  );

  if (ret != CL_SUCCESS) printf("clEnqueueWriteBuffer %s\n", oclErrorString(ret));

  clSetEventCallback(
    write_event,            // Event to monitor
    CL_COMPLETE,            // Status to fire on
    &releaseMemoryCallback, // Callback to trigger
    dataset                 // Data to pass to callback
  );

}
Esempio n. 3
0
rcl_status cl_write_buffer(struct client_state* state, buffer_t buffer,
	void* ptr, uint64_t offset, uint32_t size)
{
	struct buffer_state* buffer_state;
	cl_int retval;
	cl_event event;

	buffer--;
	if (!vector_valid_idx(&state->buffers, buffer)) {
		log_print(log_error, "Buffer %" PRIu32 " not found", buffer);
		return RCL_INVALID_BUFFER;
	}

	log_print(log_notice, "Writing to buffer, size: %" PRIu32 ", offset: %"	\
		PRIu64, size, offset);

	buffer_state = *vector_element(&state->buffers, buffer,
		struct buffer_state*);
	retval = clEnqueueWriteBuffer(state->command_queue, buffer_state->id,
		CL_FALSE, offset, size, ptr, 0, NULL, &event);

	if (retval)
		return opencl_error(retval);

	if (buffer_state->original)
		memcpy(buffer_state->original + offset, ptr, size);

	retval = clSetEventCallback(event, CL_COMPLETE, cl_free_write_buffer, ptr);
	if (retval) {
		clFinish(state->command_queue);
		free(ptr);
	}

	return RCL_OK;
}
Esempio n. 4
0
cl_int clEnqueueNDRangeKernel (cl_command_queue command_queue,
                               cl_kernel kernel,
                               cl_uint work_dim,
                               const size_t *global_work_offset,
                               const size_t *global_work_size,
                               const size_t *local_work_size,
                               cl_uint num_events_in_wait_list,
                               const cl_event *event_wait_list,
                               cl_event *event)
{
  static struct work_size_s work_sizes;
  
  struct ld_kernel_s *ldKernel = find_kernel_entry(kernel);
  int i;
  cl_int errcode;

  if (num_events_in_wait_list) {
    clCheck(clWaitForEvents(num_events_in_wait_list, event_wait_list));
  }
  
  assert(ldKernel);
  for (i = 0; i < work_dim; i++) {
    work_sizes.local[i] = local_work_size[i];
    work_sizes.global[i] = global_work_size[i]/work_sizes.local[i];
  }

#if ENABLE_KERNEL_PROFILING == 1
  static cl_event kern_event;
  
  if (!event) {
    event = &kern_event; // scope of the event is limited to this function.
  }
#endif
  
  kernel_executed_event(ldKernel, &work_sizes, work_dim);
  
  errcode = real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim,
                                        global_work_offset, global_work_size,
                                        local_work_size, num_events_in_wait_list,
                                        event_wait_list, event);
#if ENABLE_KERNEL_PROFILING == 1
  clCheck(errcode);
  
  clRetainEvent(*event);
  clSetEventCallback(*event, CL_COMPLETE, kernel_profiler_cb, ldKernel);
#endif

#if FORCE_FINISH_KERNEL
  real_clFinish(command_queue);
#endif
  
  kernel_finished_event(ldKernel, &work_sizes, work_dim);
  
  return errcode;
}
Esempio n. 5
0
 /// Registers a function to be called when the event status changes to
 /// \p status (by default CL_COMPLETE). The callback is passed the OpenCL
 /// event object, the event status, and a pointer to arbitrary user data.
 ///
 /// \see_opencl_ref{clSetEventCallback}
 ///
 /// \opencl_version_warning{1,1}
 void set_callback(void (BOOST_COMPUTE_CL_CALLBACK *callback)(
                       cl_event event, cl_int status, void *user_data
                   ),
                   cl_int status = CL_COMPLETE,
                   void *user_data = 0)
 {
     cl_int ret = clSetEventCallback(m_event, status, callback, user_data);
     if(ret != CL_SUCCESS){
         BOOST_THROW_EXCEPTION(opencl_error(ret));
     }
 }
Esempio n. 6
0
cl_int EventWrapper::setEventCallback (cl_int aCommandExecCallbackType,
                                       void (*aCallback)(cl_event, cl_int, void*),
                                       void* aUserData) {
#if CL_WRAPPER_CL_VERSION_SUPPORT >= 110
    return clSetEventCallback (mWrapped, aCommandExecCallbackType,
                               aCallback, aUserData);
#else // CL_WRAPPER_CL_VERSION_SUPPORT >= 110
    (void)aCommandExecCallbackType; (void)aCallback; (void)aUserData;
    D_LOG (LOG_LEVEL_ERROR, "CLWrapper support for OpenCL 1.1 API was not enabled at build time.");
    return CL_INVALID_VALUE;
#endif
}
Esempio n. 7
0
 static int RegisterCallback(lua_State *L) {
     cl_event event = *traits::CheckObject(L, 1);
     cl_int cmdType = static_cast<cl_int>(luaL_checknumber(L, 2));
     lua_State *thread = traits::CreateCallbackThread(L, 3);
     if (thread == NULL) {
         return 0;
     }
     cl_int err = clSetEventCallback(event, cmdType, Callback, thread);
     CheckCLError(L, err, "Failed registering event callback: %s.");
     lua_pushvalue(L, 1);
     traits::RegisterCallback(L);
     return 0;
 }
Esempio n. 8
0
	void Event::callback(
		CommandExecutionCallbackType type,
		void (*pfn_notify) (cl_event event, cl_int status, void * user_data),
		void * user_data
	) {
		static const auto error_map = error::ErrorMap{
			{ErrorCode::invalid_event, "the given event is invalid."},
			{ErrorCode::invalid_value, "the given callback is null; OR the given callback type is invalid."}
		};
		error::handle<EventException>(
			clSetEventCallback(
				m_id,
				static_cast<std::underlying_type<CommandExecutionCallbackType>::type>(type),
				pfn_notify,
				user_data
			),
			error_map
		);
	}
Esempio n. 9
0
void WebCL::waitForEventsImpl(const Vector<RefPtr<WebCLEvent>>& events, WebCLCallback* callback)
{
    Vector<cl_event> clEvents;
    Vector<WeakPtr<WebCLObject>> webEvents;
    WebCLHolder* holder = new WebCLHolder;
    holder->webcl = m_weakFactory.createWeakPtr();

    for (auto event : events) {
        clEvents.append(event->getEvent());
        webEvents.append(event->createWeakPtr());
    }

    if (!callback) {
        clWaitForEvents(clEvents.size(), clEvents.data());
    } else {
        m_callbackRegisterQueue.append(std::make_pair(webEvents, adoptRef(callback)));
        for (auto clEvent : clEvents)
            clSetEventCallback(clEvent, CL_COMPLETE, &callbackProxy, holder);
    }
}
Esempio n. 10
0
void WebCLEvent::setCallback(unsigned long commandExecCallbackType, 
                             PassOwnPtr<WebCLCallback> callback, 
                             ExceptionState& es) {
    cl_int err = 0;

    if (m_cl_Event == NULL) {
        es.throwWebCLException(
                WebCLException::INVALID_EVENT,
                WebCLException::invalidEventMessage);
        return;
    }

    if (commandExecCallbackType != WebCL::COMPLETE) {
        es.throwWebCLException(
                WebCLException::INVALID_VALUE,
                WebCLException::invalidValueMessage);
        return;
    }

    ASSERT(callback);

    if (CallbackDataVector* vector = callbackRegisterQueue().get(this)) {
        vector->append(std::make_pair(commandExecCallbackType, callback));
        return;
    }

    OwnPtr<CallbackDataVector> vector = adoptPtr(new CallbackDataVector());
    vector->append(std::make_pair(commandExecCallbackType, callback));
    callbackRegisterQueue().set(this, vector.release());
    err = WebCL::SUCCESS;
    pfnEventNotify callbackProxyPtr = &callbackProxy;
    err = clSetEventCallback(m_cl_Event, commandExecCallbackType, 
                             callbackProxyPtr, this);

    if (err != CL_SUCCESS) {
        WebCLException::throwException(err, es);
    } 
    return;
}
Esempio n. 11
0
void AutoExposure::update(cl_command_queue queue, cl_mem image)
{
    assert(_initialized);

    // If auto-exposure is disabled, don't do anything
    if(!_autoExposure)
        return;

    // Increse/decrease exposure from the difference between the current luma
    // average with an expected 0.5 frame average.
    _exposure *= 1.0f + qBound(-_adjustSpeed, 0.5f - _exposureData.meteringAverage, _adjustSpeed);

    _updateCounter++;
    if(_updateCounter % _updatePeriod)
        return;
    _updateCounter= 0;

    int ai= 0;
    clKernelArg(_downKernel, ai++, image);
    clKernelArg(_downKernel, ai++, _lumaImage);
    if(!clLaunchKernelEvent(_downKernel, queue, _lumaSize, "AE/Downsample"))
        return;

    // Download image data
    cl_event& downloadEvent= analytics.clEvent("AE/Download");

    size_t origin[3]= { 0,0,0 };
    size_t region[3]= { (size_t)_lumaSize.width(), (size_t)_lumaSize.height(), 1 };
    cl_int error= clEnqueueReadImage(queue, _lumaImage, CL_FALSE, origin, region,
                                     _lumaSize.width(),0,_lumaData,0,0, &downloadEvent);
    if(clCheckError(error, "clEnqueueReadImage"))
        return;

    error= clSetEventCallback(downloadEvent, CL_COMPLETE, exposureCallback, (void*)this);
    clCheckError(error, "clSetEventCallback");

    // When the download is done, exposureCallback will be called
}
Esempio n. 12
0
static void opencl_task_start(
  MTAPI_IN mtapi_task_hndl_t task,
  MTAPI_OUT mtapi_status_t* status) {
  mtapi_status_t local_status = MTAPI_ERR_UNKNOWN;
  cl_int err;

  if (embb_mtapi_node_is_initialized()) {
    embb_mtapi_node_t * node = embb_mtapi_node_get_instance();

    if (embb_mtapi_task_pool_is_handle_valid(node->task_pool, task)) {
      embb_mtapi_task_t * local_task =
        embb_mtapi_task_pool_get_storage_for_handle(node->task_pool, task);

      if (embb_mtapi_action_pool_is_handle_valid(
        node->action_pool, local_task->action)) {
        embb_mtapi_action_t * local_action =
          embb_mtapi_action_pool_get_storage_for_handle(
          node->action_pool, local_task->action);

        embb_mtapi_opencl_plugin_t * plugin = &embb_mtapi_opencl_plugin;
        embb_mtapi_opencl_action_t * opencl_action =
          (embb_mtapi_opencl_action_t*)local_action->plugin_data;
        embb_mtapi_opencl_task_t * opencl_task =
          (embb_mtapi_opencl_task_t*)embb_alloc(
            sizeof(embb_mtapi_opencl_task_t));

        size_t elements = local_task->result_size /
          opencl_action->element_size;
        size_t global_work_size;

        if (0 == elements)
          elements = 1;
        global_work_size =
          round_up(opencl_action->local_work_size, elements);

        opencl_task->task = task;

        opencl_task->arguments_size = (int)local_task->arguments_size;
        if (0 < local_task->arguments_size) {
          opencl_task->arguments = clCreateBuffer(plugin->context,
            CL_MEM_READ_ONLY, local_task->arguments_size, NULL, &err);
        } else {
          opencl_task->arguments = NULL;
        }
        opencl_task->result_buffer_size = (int)local_task->result_size;
        if (0 < local_task->result_size) {
          opencl_task->result_buffer = clCreateBuffer(plugin->context,
            CL_MEM_WRITE_ONLY, local_task->result_size, NULL, &err);
        } else {
          opencl_task->result_buffer = NULL;
        }

        err = clSetKernelArg(opencl_action->kernel, 0, sizeof(cl_mem),
          (const void*)&opencl_task->arguments);
        err |= clSetKernelArg(opencl_action->kernel, 1, sizeof(cl_int),
          (const void*)&opencl_task->arguments_size);

        err |= clSetKernelArg(opencl_action->kernel, 2, sizeof(cl_mem),
          (const void*)&opencl_task->result_buffer);
        err |= clSetKernelArg(opencl_action->kernel, 3, sizeof(cl_int),
          (const void*)&opencl_task->result_buffer_size);

        err |= clEnqueueWriteBuffer(plugin->command_queue,
          opencl_task->arguments, CL_FALSE, 0,
          (size_t)opencl_task->arguments_size, local_task->arguments,
          0, NULL, NULL);

        if (CL_SUCCESS == err) {
          embb_mtapi_task_set_state(local_task, MTAPI_TASK_RUNNING);

          err |= clEnqueueNDRangeKernel(plugin->command_queue,
            opencl_action->kernel, 1, NULL,
            &global_work_size, &opencl_action->local_work_size, 0, NULL, NULL);
          err |= clEnqueueReadBuffer(plugin->command_queue,
            opencl_task->result_buffer, CL_FALSE, 0,
            (size_t)opencl_task->result_buffer_size, local_task->result_buffer,
            0, NULL, &opencl_task->kernel_finish_event);
          err |= clSetEventCallback(opencl_task->kernel_finish_event,
            CL_COMPLETE, opencl_task_complete, opencl_task);
        }

        err |= clFlush(plugin->command_queue);
        if (CL_SUCCESS != err) {
          embb_mtapi_task_set_state(local_task, MTAPI_TASK_ERROR);
          local_status = MTAPI_ERR_ACTION_FAILED;
        } else {
          local_status = MTAPI_SUCCESS;
        }
      }
    }
  }

  mtapi_status_set(status, local_status);
}
Esempio n. 13
0
  void enqueue() {
    // Errors in this function can not be handled by opencl_err.hpp
    // because they require non-standard error handling
    CAF_LOG_TRACE("command::enqueue()");
    this->ref(); // reference held by the OpenCL comand queue
    cl_event event_k;
    auto data_or_nullptr = [](const dim_vec& vec) {
      return vec.empty() ? nullptr : vec.data();
    };
    // OpenCL expects cl_uint (unsigned int), hence the cast
    cl_int err = clEnqueueNDRangeKernel(
      queue_.get(), actor_facade_->kernel_.get(),
      static_cast<cl_uint>(actor_facade_->config_.dimensions().size()),
      data_or_nullptr(actor_facade_->config_.offsets()),
      data_or_nullptr(actor_facade_->config_.dimensions()),
      data_or_nullptr(actor_facade_->config_.local_dimensions()),
      static_cast<cl_uint>(mem_in_events_.size()),
      (mem_in_events_.empty() ? nullptr : mem_in_events_.data()), &event_k
    );
    if (err != CL_SUCCESS) {
      CAF_LOGMF(CAF_ERROR, "clEnqueueNDRangeKernel: " << get_opencl_error(err));
      clReleaseEvent(event_k);
      this->deref();
      return;
    } else {
      enqueue_read_buffers(event_k, detail::get_indices(result_buffers_));
      cl_event marker;
#if defined(__APPLE__)
      err = clEnqueueMarkerWithWaitList(
        queue_.get(),
        static_cast<cl_uint>(mem_out_events_.size()),
        mem_out_events_.data(), &marker
      );
#else
      err = clEnqueueMarker(queue_.get(), &marker);
#endif
      if (err != CL_SUCCESS) {
        CAF_LOGMF(CAF_ERROR, "clSetEventCallback: " << get_opencl_error(err));
        clReleaseEvent(marker);
        clReleaseEvent(event_k);
        this->deref(); // callback is not set
        return;
      }
      err = clSetEventCallback(marker, CL_COMPLETE,
                               [](cl_event, cl_int, void* data) {
                                 auto cmd = reinterpret_cast<command*>(data);
                                 cmd->handle_results();
                                 cmd->deref();
                               },
                               this);
      if (err != CL_SUCCESS) {
        CAF_LOGMF(CAF_ERROR, "clSetEventCallback: " << get_opencl_error(err));
        clReleaseEvent(marker);
        clReleaseEvent(event_k);
        this->deref(); // callback is not set
        return;
      }
      err = clFlush(queue_.get());
      if (err != CL_SUCCESS) {
        CAF_LOGMF(CAF_ERROR, "clFlush: " << get_opencl_error(err));
      }
      mem_out_events_.push_back(std::move(event_k));
      mem_out_events_.push_back(std::move(marker));
    }
  }
int main() {
    cl_int error_code = CL_SUCCESS;

    try {
        // find Intel platform
        cl_uint num_platforms = 0;
        error_code = clGetPlatformIDs(0, nullptr, &num_platforms);
        HANDLE_CL_ERROR(clGetPlatformIDs)
        std::unique_ptr<cl_platform_id[]> platform_ids(
                new cl_platform_id[static_cast<const std::size_t>(num_platforms)]);
        error_code = clGetPlatformIDs(num_platforms, platform_ids.get(), nullptr);
        HANDLE_CL_ERROR(clGetPlatformIDs)
        cl_platform_id platform = nullptr;
        for (std::size_t i = 0; i != static_cast<const std::size_t>(num_platforms); ++i) {
            std::size_t platform_name_size = 0;
            error_code = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, 0, nullptr, &platform_name_size);
            HANDLE_CL_ERROR(clGetPlatformInfo)
            std::unique_ptr<char[]> platform_name(new char[platform_name_size]);
            error_code = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME,
                                           platform_name_size, platform_name.get(), nullptr);
            HANDLE_CL_ERROR(clGetPlatformInfo)
            if (std::strcmp(beignet_platform_name, platform_name.get()) == 0) {
                platform = platform_ids[i];
                std::cout << "Platform: " << platform_name.get() << std::endl;
                break;
            }
        }
        if (platform == nullptr) {
            throw std::runtime_error(std::string("Couldn't find platform with name: ") + beignet_platform_name);
        }

        // find Intel GPU
        cl_device_id device = nullptr;
        error_code = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, nullptr);
        HANDLE_CL_ERROR(clGetDeviceIDs)
        std::size_t device_name_size = 0;
        error_code = clGetDeviceInfo(device, CL_DEVICE_NAME, 0, nullptr, &device_name_size);
        HANDLE_CL_ERROR(clGetDeviceInfo)
        std::unique_ptr<char[]> device_name(new char[device_name_size]);
        error_code = clGetDeviceInfo(device, CL_DEVICE_NAME, device_name_size, device_name.get(), nullptr);
        HANDLE_CL_ERROR(clGetDeviceInfo)
        std::cout << "Device: " << device_name.get() << std::endl;

        // create OpenCL context, command queue, program and kernel
        const auto context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &error_code);
        HANDLE_CL_ERROR(clCreateContext)
        const auto command_queue = clCreateCommandQueue(context, device, 0, &error_code);
        HANDLE_CL_ERROR(clCreateCommandQueue)

        const char *source_strings[1];
        source_strings[0] = kernel_source;
        const std::size_t source_size = std::strlen(kernel_source);
        const auto program = clCreateProgramWithSource(context, 1, source_strings, &source_size, &error_code);
        HANDLE_CL_ERROR(clCreateProgramWithSource)
        error_code = clBuildProgram(program, 1, &device, "", nullptr, nullptr);
        HANDLE_CL_ERROR(clBuildProgram)
        const auto kernel = clCreateKernel(program, "print_hello", &error_code);
        HANDLE_CL_ERROR(clCreateKernel)

        // enqueue kernel and set event completion handler
        cl_event event;
        std::size_t global_work_size = 1;
        error_code = clEnqueueNDRangeKernel(command_queue, kernel, 1, nullptr, &global_work_size, nullptr,
                                            0, nullptr, &event);
        HANDLE_CL_ERROR(clEnqueueNDRangeKernel)

        error_code = clSetEventCallback(event, CL_COMPLETE, [](cl_event, cl_int, void *) {
            std::cout << "OpenCL callback" << std::endl;
            // Notify the waiting thread that the kernel is completed
            {
                std::lock_guard<std::mutex> cond_lock(cond_mutex);
                kernel_complete = true;
            }
            cond_var.notify_one();
        }, nullptr);
        HANDLE_CL_ERROR(clSetEventCallback)

        error_code = clFlush(command_queue);
        HANDLE_CL_ERROR(clFlush)

        // simulate work
        std::this_thread::sleep_for(std::chrono::seconds(1));

        // do work, dependent on kernel completion
        {
            std::unique_lock<std::mutex> cond_lock(cond_mutex);
            while (!kernel_complete) {
                if (cond_var.wait_for(cond_lock, std::chrono::seconds(5)) == std::cv_status::timeout) {
                    std::cout << "WARNING: A 5 second timeout has been reached on the condition variable.\n"
                            "         This may be a deadlock." << std::endl;
                }
            }
        }
        // When using Beignet, this will never be called as a deadlock will occur.
        std::cout << "Doing work, dependent on the kernel's completion" << std::endl;
    } catch (const std::exception &e) {
        std::cout << "Error: " << e.what() << std::endl;
    } catch (...) {
        std::cout << "Unknown error" << std::endl;
    }
}
Esempio n. 15
0
cl_int
clEnqueueFillBuffer(cl_command_queue command_queue,
					cl_mem buffer,
					const void *pattern,
					size_t pattern_size,
					size_t offset,
					size_t size,
					cl_uint num_events_in_wait_list,
					const cl_event *event_wait_list,
					cl_event *event)
{
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static cl_program	last_program = NULL;
	static cl_context	last_context = NULL;
	cl_kernel			kernel;
	cl_program			program;
	cl_context			context;
	cl_device_id		device;
	size_t				gworksz;
	size_t				lworksz;
	char				kernel_name[80];
	cl_int				rc;
	union {
		cl_char			v_char;
		cl_short		v_short;
		cl_int			v_int;
		cl_long			v_long;
	} pattern_value;
	cl_uint				pattern_nums;

	switch (pattern_size)
	{
		case sizeof(cl_char):
			pattern_value.v_char = *((cl_char *)pattern);
			break;
		case sizeof(cl_short):
			pattern_value.v_short = *((cl_short *)pattern);
			break;
		case sizeof(cl_int):
			pattern_value.v_int = *((cl_int *)pattern);
			break;
		case sizeof(cl_long):
			pattern_value.v_long = *((cl_long *)pattern);
			break;
		default:
			/*
			 * pattern_size was not support one, even though OpenCL 1.2
			 * spec says 16, 32, 64 or 128 bytes patterns are supported.
			 */
			return CL_INVALID_VALUE;
	}

	/* ensure alignment */
	if (offset % pattern_size != 0)
		return CL_INVALID_VALUE;
	if (size % pattern_size != 0)
		return CL_INVALID_VALUE;

	/* fetch context and device_id associated with this command queue */
	rc = clGetCommandQueueInfo(command_queue,
							   CL_QUEUE_CONTEXT,
							   sizeof(cl_context),
							   &context,
							   NULL);
	if (rc != CL_SUCCESS)
		return rc;

	pthread_mutex_lock(&lock);
	if (last_program && last_context == context)
	{
		rc = clRetainProgram(last_program);
		if (rc != CL_SUCCESS)
			goto out_unlock;
		program = last_program;
	}
	else
	{
		char			source[10240];
		const char	   *prog_source[1];
		size_t			prog_length[1];
		cl_uint			num_devices;
		cl_device_id   *device_ids;
		static struct {
			const char *type_name;
			size_t		type_size;
		} pattern_types[] = {
			{ "char",  sizeof(cl_char) },
			{ "short", sizeof(cl_short) },
			{ "int",   sizeof(cl_int) },
			{ "long",  sizeof(cl_long) },
		};
		size_t		i, ofs;

		/* fetch properties of cl_context */
		rc = clGetContextInfo(context,
							  CL_CONTEXT_NUM_DEVICES,
							  sizeof(cl_uint),
							  &num_devices,
							  NULL);
		if (rc != CL_SUCCESS)
			goto out_unlock;
		Assert(num_devices > 0);

		device_ids = calloc(num_devices, sizeof(cl_device_id));
		if (!device_ids)
		{
			rc = CL_OUT_OF_HOST_MEMORY;
			goto out_unlock;
		}
		rc = clGetContextInfo(context,
							  CL_CONTEXT_DEVICES,
							  sizeof(cl_device_id) * num_devices,
							  device_ids,
							  NULL);
		if (rc != CL_SUCCESS)
		{
			free(device_ids);
			goto out_unlock;
		}

		/* release the previous program */
		if (last_program)
		{
			rc = clReleaseProgram(last_program);
			Assert(rc == CL_SUCCESS);
			last_program = NULL;
			last_context = NULL;
		}

		/* create a program object */
		for (i=0, ofs=0; i < lengthof(pattern_types); i++)
		{
			ofs += snprintf(
				source + ofs, sizeof(source) - ofs,
				"__kernel void\n"
				"pgstromEnqueueFillBuffer_%zu(__global %s *buffer,\n"
				"                             %s value, uint nums)\n"
				"{\n"
				"  if (get_global_id(0) >= nums)\n"
				"    return;\n"
				"  buffer[get_global_id(0)] = value;\n"
				"}\n",
				pattern_types[i].type_size,
				pattern_types[i].type_name,
				pattern_types[i].type_name);
		}
		prog_source[0] = source;
		prog_length[0] = ofs;
		program = clCreateProgramWithSource(context,
											1,
											prog_source,
											prog_length,
											&rc);
		if (rc != CL_SUCCESS)
		{
			free(device_ids);
			goto out_unlock;
		}

		/* build this program object */
		rc = clBuildProgram(program,
							num_devices,
							device_ids,
							NULL,
							NULL,
							NULL);
		free(device_ids);
		if (rc != CL_SUCCESS)
		{
			clReleaseProgram(program);
			goto out_unlock;
		}

		/* acquire the program object */
		rc = clRetainProgram(program);
		if (rc != CL_SUCCESS)
		{
			clReleaseProgram(program);
			goto out_unlock;
		}
		last_program = program;
		last_context = context;
	}
	pthread_mutex_unlock(&lock);
	Assert(program != NULL);

	/* fetch a device id of this command queue */
	rc = clGetCommandQueueInfo(command_queue,
							   CL_QUEUE_DEVICE,
							   sizeof(cl_device_id),
							   &device,
							   NULL);
	if (rc != CL_SUCCESS)
		goto out_release_program;

	/* fetch a kernel object to be called */
	snprintf(kernel_name, sizeof(kernel_name),
			 "pgstromEnqueueFillBuffer_%zu", pattern_size);
	kernel = clCreateKernel(program,
							kernel_name,
							&rc);
	if (rc != CL_SUCCESS)
		goto out_release_program;

	/* 1st arg: __global <typename> *buffer */
	rc = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);
	if (rc != CL_SUCCESS)
		goto out_release_kernel;

	/* 2nd arg: <typename> value */
	rc = clSetKernelArg(kernel, 1, pattern_size, &pattern_value);
	if (rc != CL_SUCCESS)
		goto out_release_kernel;

	/* 3rd arg: size_t nums */
	pattern_nums = (offset + size) / pattern_size;
	rc = clSetKernelArg(kernel, 2, sizeof(cl_uint), &pattern_nums);
	if (rc != CL_SUCCESS)
		goto out_release_kernel;

	/* calculate optimal workgroup size */
	rc = clGetKernelWorkGroupInfo(kernel,
								  device,
								  CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
								  sizeof(size_t),
								  &lworksz,
								  NULL);
	Assert((lworksz & (lworksz - 1)) == 0);
	gworksz = ((size / pattern_size + lworksz - 1) / lworksz) * lworksz;

	/* enqueue a kernel, instead of clEnqueueFillBuffer */
	offset /= pattern_size;
	rc = clEnqueueNDRangeKernel(command_queue,
								kernel,
								1,
								&offset,
								&gworksz,
								&lworksz,
								num_events_in_wait_list,
								event_wait_list,
								event);
	if (rc != CL_SUCCESS)
		goto out_release_kernel;

	rc = clSetEventCallback(*event,
							CL_COMPLETE,
							pgstromEnqueueFillBufferCleanup,
							kernel);
	if (rc != CL_SUCCESS)
	{
		clWaitForEvents(1, event);
		goto out_release_kernel;
	}
	return CL_SUCCESS;

out_unlock:
	pthread_mutex_unlock(&lock);
	return rc;

out_release_kernel:
	clReleaseKernel(kernel);
out_release_program:
	clReleaseProgram(program);
	return rc;
}
Esempio n. 16
0
File: Event.cpp Progetto: k0zmo/clw
 void Event::setCallback(EEventStatus status, EventCallback cb)
 {
     _callback = std::move(cb);
     clSetEventCallback(_id, cl_int(status), 
         &detail::callback_priv, &_callback);
 }
Esempio n. 17
0
    void enqueue () {
        CPPA_LOG_TRACE("command::enqueue()");
        this->ref(); // reference held by the OpenCL comand queue
        cl_int err{0};
        cl_event event_k;
        auto data_or_nullptr = [](const dim_vec& vec) {
            return vec.empty() ? nullptr : vec.data();
        };
        err = clEnqueueNDRangeKernel(m_queue.get(),
                                     m_actor_facade->m_kernel.get(),
                                     m_actor_facade->m_global_dimensions.size(),
                                     data_or_nullptr(m_actor_facade->m_global_offsets),
                                     data_or_nullptr(m_actor_facade->m_global_dimensions),
                                     data_or_nullptr(m_actor_facade->m_local_dimensions),
                                     m_events.size(),
                                     (m_events.empty() ? nullptr : m_events.data()),
                                     &event_k);
        if (err != CL_SUCCESS) {
            CPPA_LOGMF(CPPA_ERROR, "clEnqueueNDRangeKernel: " 
                                   << get_opencl_error(err));
            this->deref(); // or can anything actually happen?
            return;
        }
        else {
            cl_event event_r;
            err = clEnqueueReadBuffer(m_queue.get(),
                                      m_arguments.back().get(),
                                      CL_FALSE,
                                      0,
                                      sizeof(typename R::value_type) * m_result_size,
                                      m_result.data(),
                                      1,
                                      &event_k,
                                      &event_r);
            if (err != CL_SUCCESS) {
                throw std::runtime_error("clEnqueueReadBuffer: "
                                         + get_opencl_error(err));
                this->deref(); // failed to enqueue command
                return;
            }
            err = clSetEventCallback(event_r,
                                     CL_COMPLETE,
                                     [](cl_event, cl_int, void* data) {
                                         auto cmd = reinterpret_cast<command*>(data);
                                         cmd->handle_results();
                                         cmd->deref();
                                     },
                                     this);
            if (err != CL_SUCCESS) {
                CPPA_LOGMF(CPPA_ERROR, "clSetEventCallback: "
                                       << get_opencl_error(err));
                this->deref(); // callback is not set
                return;
            }

            err = clFlush(m_queue.get());
            if (err != CL_SUCCESS) {
                CPPA_LOGMF(CPPA_ERROR, "clFlush: " << get_opencl_error(err));
            }
            m_events.push_back(std::move(event_k));
            m_events.push_back(std::move(event_r));
        }
    }
Esempio n. 18
0
int main()
{
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_mem objA = NULL;
    cl_mem objB = NULL;
    cl_mem objC = NULL;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret;
 
    cl_event event1;
 
    int i, j;
    float *A;
    float *B;
    float *C;
 
    A = (float *)malloc(4*4*sizeof(float));
    B = (float *)malloc(4*4*sizeof(float));
    C = (float *)malloc(4*4*sizeof(float));
 
    /* Initialize input data */
    for (i=0; i<4; i++) {
        for (j=0; j<4; j++) {
            A[i*4+j] = i*4+j+1;
            B[i*4+j] = j*4+i+1;
        }
    }
 
    /* Get Platform/Device Information*/
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
 
    /* Create OpenCL Context */
    context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
 
    /* Create command queue */
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
 
    /* Create Buffer Object */
    objA = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
    objB = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
    objC = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
 
    /*
     * Creating an user event
     * As a user event is created, its execution status is set to be CL_SUBMITTED
     * and we tag the event to a callback so when event reaches CL_COMPLETE, it will 
     * execute postProcess
     */ 
    event1 = clCreateUserEvent(context, &ret);
    clSetEventCallback(event1, CL_COMPLETE, &postProcess, "Looks like its done.");

    /* Copy input data to the memory buffer */
 
    ret = clEnqueueWriteBuffer(command_queue, objA, CL_TRUE, 0, 4*4*sizeof(float), A, 0, NULL, NULL );
    printf("A has been written\n");
 
    /* The next command will wait for event1 according to its status*/
    ret = clEnqueueWriteBuffer(command_queue, objB, CL_TRUE, 0, 4*4*sizeof(float), B, 1, &event1, NULL);
    printf("B has been written\n");

    /* Tell event1 to complete */
    clSetUserEventStatus(event1, CL_COMPLETE);
	const char *file_names[] = {"sample_kernel.cl"}; 
	const int NUMBER_OF_FILES = 1;
	char* buffer[NUMBER_OF_FILES];
	size_t sizes[NUMBER_OF_FILES];
	loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes);
	
    /* Create kernel program from source file*/
    program = clCreateProgramWithSource(context, 1, (const char **)buffer, sizes, &ret);
    ret     = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
 
    /* Create data parallel OpenCL kernel */
    kernel = clCreateKernel(program, "sample", &ret);
 
    /* Set OpenCL kernel arguments */
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&objA);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&objB);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&objC);
 
    size_t global_item_size = 4;
    size_t local_item_size = 1;
 
    /* Execute OpenCL kernel as data parallel */
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
                                 &global_item_size, &local_item_size, 0, NULL, NULL);
 
    /* Transfer result to host */
    ret = clEnqueueReadBuffer(command_queue, objC, CL_TRUE, 0, 4*4*sizeof(float), C, 0, NULL, NULL);
 
    /* Display Results */
    for (i=0; i<4; i++) {
        for (j=0; j<4; j++) {
            printf("%7.2f ", C[i*4+j]);
        }
        printf("\n");
    }
 
 
    /* Finalization */
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(objA);
    ret = clReleaseMemObject(objB);
    ret = clReleaseMemObject(objC);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
 
    free(A);
    free(B);
    free(C);
 
    return 0;
}
Esempio n. 19
0
/* PRUint32 run (in PRUint32 rank, [array, size_is (rank)] in PRUint32 shape, [array, size_is (rank), optional] in PRUint32 tile); */
NS_IMETHODIMP dpoCKernel::Run(PRUint32 rank, PRUint32 *shape, PRUint32 *tile, PRUint32 *_retval)
{
    cl_int err_code;
    cl_event runEvent, readEvent, writeEvent;
    size_t *global_work_size;
    size_t *local_work_size;
    const int zero = 0;

    DEBUG_LOG_STATUS("Run", "preparing execution of kernel");

    if (sizeof(size_t) == sizeof(PRUint32)) {
        global_work_size = (size_t *) shape;
    } else {
        global_work_size = (size_t *) nsMemory::Alloc(rank * sizeof(size_t));
        if (global_work_size == NULL) {
            DEBUG_LOG_STATUS("Run", "allocation of global_work_size failed");
            return NS_ERROR_OUT_OF_MEMORY;
        }
        for (int cnt = 0; cnt < rank; cnt++) {
            global_work_size[cnt] = shape[cnt];
        }
    }

#ifdef USE_LOCAL_WORKSIZE
    if (tile == NULL) {
        local_work_size = NULL;
    } else {
        if ((sizeof(size_t) == sizeof(PRUint32))) {
            local_work_size = (size_t *) tile;
        } else {
            local_work_size = (size_t *) nsMemory::Alloc(rank * sizeof(size_t));
            if (local_work_size == NULL) {
                DEBUG_LOG_STATUS("Run", "allocation of local_work_size failed");
                return NS_ERROR_OUT_OF_MEMORY;
            }
            for (int cnt = 0; cnt < rank; cnt++) {
                local_work_size[cnt] = (size_t) tile[cnt];
            }
        }
    }
#else /* USE_LOCAL_WORKSIZE */
    local_work_size = NULL;
#endif /* USE_LOCAL_WORKSIZE */

    DEBUG_LOG_STATUS("Run", "setting failure code to 0");

    err_code = clEnqueueWriteBuffer(cmdQueue, failureMem, CL_FALSE, 0, sizeof(int), &zero, 0, NULL, &writeEvent);
    if (err_code != CL_SUCCESS) {
        DEBUG_LOG_ERROR("Run", err_code);
        return NS_ERROR_ABORT;
    }

    DEBUG_LOG_STATUS("Run", "enqueing execution of kernel");

#ifdef WINDOWS_ROUNDTRIP
    dpoCContext::RecordBeginOfRoundTrip(parent);
#endif /* WINDOWS_ROUNDTRIP */

    err_code = clEnqueueNDRangeKernel(cmdQueue, kernel, rank, NULL, global_work_size, NULL, 1, &writeEvent, &runEvent);
    if (err_code != CL_SUCCESS) {
        DEBUG_LOG_ERROR("Run", err_code);
        return NS_ERROR_ABORT;
    }

    DEBUG_LOG_STATUS("Run", "reading failure code");

    err_code = clEnqueueReadBuffer(cmdQueue, failureMem, CL_FALSE, 0, sizeof(int), _retval, 1, &runEvent, &readEvent);
    if (err_code != CL_SUCCESS) {
        DEBUG_LOG_ERROR("Run", err_code);
        return NS_ERROR_ABORT;
    }

    DEBUG_LOG_STATUS("Run", "waiting for execution to finish");

    // For now we always wait for the run to complete.
    // In the long run, we may want to interleave this with JS execution and only sync on result read.
    err_code = clWaitForEvents( 1, &readEvent);

    DEBUG_LOG_STATUS("Run", "first event fired");

    if (err_code != CL_SUCCESS) {
        DEBUG_LOG_ERROR("Run", err_code);
        return NS_ERROR_ABORT;
    }
#ifdef WINDOWS_ROUNDTRIP
    dpoCContext::RecordEndOfRoundTrip(parent);
#endif /* WINDOWS_ROUNDTRIP */

#ifdef CLPROFILE
#ifdef CLPROFILE_ASYNC
    err_code = clSetEventCallback( readEvent, CL_COMPLETE, &dpoCContext::CollectTimings, parent);

    DEBUG_LOG_STATUS("Run", "second event fired");
    if (err_code != CL_SUCCESS) {
        DEBUG_LOG_ERROR("Run", err_code);
        return NS_ERROR_ABORT;
    }
#else /* CLPROFILE_ASYNC */
    dpoCContext::CollectTimings(readEvent,CL_COMPLETE,parent);
#endif /* CLPROFILE_ASYNC */
#endif /* CLPROFILE */

    DEBUG_LOG_STATUS("Run", "execution completed successfully, start cleanup");

    if (global_work_size != (size_t *) shape) {
        nsMemory::Free(global_work_size);
    }
#ifdef USE_LOCAL_WORKSIZE
    if (local_work_size != (size_t *) tile) {
        nsMemory::Free(local_work_size);
    }
#endif /* USE_LOCAL_WORKSIZE */

    err_code = clReleaseEvent(readEvent);
    err_code = clReleaseEvent(runEvent);
    err_code = clReleaseEvent(writeEvent);

    if (err_code != CL_SUCCESS) {
        DEBUG_LOG_ERROR("Run", err_code);
        return NS_ERROR_ABORT;
    }

    DEBUG_LOG_STATUS("Run", "cleanup complete");

    return NS_OK;
}
Esempio n. 20
0
File: ocl.c Progetto: cran/OpenCL
/* .External */
SEXP ocl_call(SEXP args) {
    struct arg_chain *float_args = 0;
    ocl_call_context_t *occ;
    int on, an = 0, ftype = FT_DOUBLE, ftsize, ftres, async;
    SEXP ker = CADR(args), olen, arg, res, octx, dimVec;
    cl_kernel kernel = getKernel(ker);
    cl_context context;
    cl_command_queue commands;
    cl_device_id device_id = getDeviceID(getAttrib(ker, Rf_install("device")));
    cl_mem output;
    cl_int err;
    size_t wdims[3] = {0, 0, 0};
    int wdim = 1;

    if (clGetKernelInfo(kernel, CL_KERNEL_CONTEXT, sizeof(context), &context, NULL) != CL_SUCCESS || !context)
	Rf_error("cannot obtain kernel context via clGetKernelInfo");
    args = CDDR(args);
    res = Rf_getAttrib(ker, install("precision"));
    if (TYPEOF(res) == STRSXP && LENGTH(res) == 1 && CHAR(STRING_ELT(res, 0))[0] != 'd')
	ftype = FT_SINGLE;
    ftsize = (ftype == FT_DOUBLE) ? sizeof(double) : sizeof(float);
    olen = CAR(args);  /* size */
    args = CDR(args);
    on = Rf_asInteger(olen);
    if (on < 0)
	Rf_error("invalid output length");
    ftres = (Rf_asInteger(CAR(args)) == 1) ? 1 : 0;  /* native.result */
    if (ftype != FT_SINGLE) ftres = 0;
    args = CDR(args);
    async = (Rf_asInteger(CAR(args)) == 1) ? 0 : 1;  /* wait */
    args = CDR(args);
    dimVec = coerceVector(CAR(args), INTSXP);  /* dim */
    wdim = LENGTH(dimVec);
    if (wdim > 3)
	Rf_error("OpenCL standard only supports up to three work item dimensions - use index vectors for higher dimensions");
    if (wdim) {
	int i; /* we don't use memcpy in case int and size_t are different */
	for (i = 0; i < wdim; i++)
	    wdims[i] = INTEGER(dimVec)[i];
    }
    if (wdim < 1 || wdims[0] < 1 || (wdim > 1 && wdims[1] < 1) || (wdim > 2 && wdims[2] < 1))
	Rf_error("invalid dimensions - muse be a numeric vector with positive values");

    args = CDR(args);
    occ = (ocl_call_context_t*) calloc(1, sizeof(ocl_call_context_t));
    if (!occ) Rf_error("unable to allocate ocl_call context");
    octx = PROTECT(R_MakeExternalPtr(occ, R_NilValue, R_NilValue));
    R_RegisterCFinalizerEx(octx, ocl_call_context_fin, TRUE);

    occ->output = output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, ftsize * on, NULL, &err);
    if (!output)
	Rf_error("failed to create output buffer of %d elements via clCreateBuffer (%d)", on, err);
    if (clSetKernelArg(kernel, an++, sizeof(cl_mem), &output) != CL_SUCCESS)
	Rf_error("failed to set first kernel argument as output in clSetKernelArg");
    if (clSetKernelArg(kernel, an++, sizeof(on), &on) != CL_SUCCESS)
	Rf_error("failed to set second kernel argument as output length in clSetKernelArg");
    occ->commands = commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
	ocl_err("clCreateCommandQueue");
    if (ftype == FT_SINGLE) /* need conversions, create floats buffer */
	occ->float_args = float_args = arg_alloc(0, 32);
    while ((arg = CAR(args)) != R_NilValue) {
	int n, ndiv = 1;
	void *ptr;
	size_t al;
	
	switch (TYPEOF(arg)) {
	case REALSXP:
	    if (ftype == FT_SINGLE) {
		int i;
		float *f;
		double *d = REAL(arg);
		n = LENGTH(arg);
		f = (float*) malloc(sizeof(float) * n);
		if (!f)
		    Rf_error("unable to allocate temporary single-precision memory for conversion from a double-precision argument vector of length %d", n);
		for (i = 0; i < n; i++) f[i] = d[i];
		ptr = f;
		al = sizeof(float);
		arg_add(float_args, ptr);
	    } else {
		ptr = REAL(arg);
		al = sizeof(double);
	    }
	    break;
	case INTSXP:
	    ptr = INTEGER(arg);
	    al = sizeof(int);
	    break;
	case LGLSXP:
	    ptr = LOGICAL(arg);
	    al = sizeof(int);
	    break;
	case RAWSXP:
	    if (inherits(arg, "clFloat")) {
		ptr = RAW(arg);
		ndiv = al = sizeof(float);
		break;
	    }
	default:
	    Rf_error("only numeric or logical kernel arguments are supported");
	    /* no-ops but needed to make the compiler happy */
	    ptr = 0;
	    al = 0;
	}
	n = LENGTH(arg);
	if (ndiv != 1) n /= ndiv;
	if (n == 1) {/* scalar */
	    if (clSetKernelArg(kernel, an++, al, ptr) != CL_SUCCESS)
		Rf_error("Failed to set scalar kernel argument %d (size=%d)", an, al);
	} else {
	    cl_mem input = clCreateBuffer(context,  CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,  al * n, ptr, &err);
	    if (!input)
		Rf_error("Unable to create buffer (%d elements, %d bytes each) for vector argument %d (oclError %d)", n, al, an, err);
	    if (!occ->mem_objects)
		occ->mem_objects = arg_alloc(0, 32);
	    arg_add(occ->mem_objects, input);
#if 0 /* we used this before CL_MEM_USE_HOST_PTR */
	    if (clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, al * n, ptr, 0, NULL, NULL) != CL_SUCCESS)
		Rf_error("Failed to transfer data (%d elements) for vector argument %d", n, an);
#endif
	    if (clSetKernelArg(kernel, an++, sizeof(cl_mem), &input) != CL_SUCCESS)
		Rf_error("Failed to set vector kernel argument %d (size=%d, length=%d)", an, al, n);
	    /* clReleaseMemObject(input); */
	}
	args = CDR(args);
    }

    if (clEnqueueNDRangeKernel(commands, kernel, wdim, NULL, wdims, NULL, 0, NULL, async ? &occ->event : NULL) != CL_SUCCESS)
	Rf_error("Error during kernel execution");

    if (async) { /* asynchronous call -> get out and return the context */
#if USE_OCL_COMPLETE_CALLBACK
	clSetEventCallback(occ->event, CL_COMPLETE, ocl_complete_callback, occ);
#endif
	clFlush(commands); /* the specs don't guarantee execution unless clFlush is called */
	occ->ftres = ftres;
	occ->ftype = ftype;
	occ->on = on;
	Rf_setAttrib(octx, R_ClassSymbol, mkString("clCallContext"));
	UNPROTECT(1);
	return octx;
    }

    clFinish(commands);
    occ->finished = 1;

    /* we can release input memory objects now */
    if (occ->mem_objects) {
      arg_free(occ->mem_objects, (afin_t) clReleaseMemObject);
      occ->mem_objects = 0;
    }
    if (float_args) {
      arg_free(float_args, 0);
      float_args = occ->float_args = 0;
    }

    res = ftres ? Rf_allocVector(RAWSXP, on * sizeof(float)) : Rf_allocVector(REALSXP, on);
    if (ftype == FT_SINGLE) {
	if (ftres) {
	  if ((err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * on, RAW(res), 0, NULL, NULL )) != CL_SUCCESS)
		Rf_error("Unable to transfer result vector (%d float elements, oclError %d)", on, err);
	    PROTECT(res);
	    Rf_setAttrib(res, R_ClassSymbol, mkString("clFloat"));
	    UNPROTECT(1);
	} else {
	    /* float - need a temporary buffer */
	    float *fr = (float*) malloc(sizeof(float) * on);
	    double *r = REAL(res);
	    int i;
	    if (!fr)
		Rf_error("unable to allocate memory for temporary single-precision output buffer");
	    occ->float_out = fr;
	    if ((err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * on, fr, 0, NULL, NULL )) != CL_SUCCESS)
		Rf_error("Unable to transfer result vector (%d float elements, oclError %d)", on, err);
	    for (i = 0; i < on; i++)
		r[i] = fr[i];
	}
    } else if ((err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(double) * on, REAL(res), 0, NULL, NULL )) != CL_SUCCESS)
	Rf_error("Unable to transfer result vector (%d double elements, oclError %d)", on, err);

    ocl_call_context_fin(octx);
    UNPROTECT(1);
    return res;
}
Esempio n. 21
0
int main() {

   /* OpenCL data structures */
   cl_device_id device;
   cl_context context;
   cl_command_queue queue;
   cl_program program;
   cl_kernel kernel;
   cl_int err;

   /* Data and events */
   char *kernel_msg;
   float data[4096];
   cl_mem data_buffer;
   cl_event kernel_event, read_event;   
   
   /* Create a device and context */
   device = create_device();
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't create a context");
      exit(1);   
   }     

   /* Build the program and create a kernel */
   program = build_program(context, device, PROGRAM_FILE);
   kernel = clCreateKernel(program, KERNEL_FUNC, &err);
   if(err < 0) {
      perror("Couldn't create a kernel");
      exit(1);   
   };

   /* Create a write-only buffer to hold the output data */
   data_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
         sizeof(data), NULL, &err);
   if(err < 0) {
      perror("Couldn't create a buffer");
      exit(1);   
   };         

   /* Create kernel argument */
   err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_buffer);
   if(err < 0) {
      perror("Couldn't set a kernel argument");
      exit(1);   
   };

   /* Create a command queue */
   queue = clCreateCommandQueue(context, device, 0, &err);
   if(err < 0) {
      perror("Couldn't create a command queue");
      exit(1);   
   };

   /* Enqueue kernel */
   err = clEnqueueTask(queue, kernel, 0, NULL, &kernel_event);
   if(err < 0) {
      perror("Couldn't enqueue the kernel");
      exit(1);   
   }

   /* Read the buffer */
   err = clEnqueueReadBuffer(queue, data_buffer, CL_FALSE, 0, 
      sizeof(data), &data, 0, NULL, &read_event);
   if(err < 0) {
      perror("Couldn't read the buffer");
      exit(1);   
   }
 
   /* Set event handling routines */
   kernel_msg = "The kernel finished successfully.\n\0";
   err = clSetEventCallback(kernel_event, CL_COMPLETE, 
         &kernel_complete, kernel_msg);
   if(err < 0) {
      perror("Couldn't set callback for event");
      exit(1);   
   }
   clSetEventCallback(read_event, CL_COMPLETE, &read_complete, data);

   /* Deallocate resources */
   clReleaseMemObject(data_buffer);
   clReleaseKernel(kernel);
   clReleaseCommandQueue(queue);
   clReleaseProgram(program);
   clReleaseContext(context);
   return 0;
}