C++ (Cpp) clCreateUserEvent 예제들

예제 #1

0

파일 보기

파일: event.hpp 프로젝트: STEllAR-GROUP/oclm

     operator cl_event() const
     {
         cl_int err = CL_SUCCESS;
         if(e_.size() == 1)
         {
             return e_[0];
         }
         if(!e_.empty())
         {
             cl_context ctx = get_info<command_queue::context_info_type>(command_queue());
             cl_event e = clCreateUserEvent(ctx, &err);
             OCLM_THROW_IF_EXCEPTION(err, "clCreateUserEvents");
 #ifdef CL_VERSION_1_2
             err = clEnqueueMarkerWithWaitList(command_queue(),
                 static_cast<cl_uint>(e_.size()), &e_[0], &e);
             OCLM_THROW_IF_EXCEPTION(err, "clEnqueueMarkerWithWaitList");
 #else
             get();
             err = clSetUserEventStatus(e, CL_COMPLETE);
 #endif
             return e;
         }
         else
             return cl_event();
     }

예제 #2

0

파일 보기

파일: pclu.c 프로젝트: NatTuck/cakemark

void 
pclu_call_kernel(pclu_program* pgm, cl_kernel kernel, pclu_range range)
{
    int errcode;

#define NO_CL_EVENTS 1

#ifdef NO_CL_EVENTS
    cl_event kernel_done = 0;
#else
    cl_event kernel_done = clCreateUserEvent(pgm->pclu->context, &errcode);
    pclu_check_call("clCreateUserEvent", errcode);
#endif

    size_t* local_size = 0;
    if (range.local[0] > 0)
        local_size = &(range.local[0]);

    errcode = clEnqueueNDRangeKernel(pgm->pclu->queue, kernel, range.nd, 0, 
				     range.global, local_size, 0, 0, &kernel_done);
    pclu_check_call("clEnqueueNDRangeKernel", errcode);

#ifndef NO_CL_EVENTS
    pclu_check_call("clWaitForEvents", clWaitForEvents(1, &kernel_done));
#endif

    pclu_check_call("clFinish", clFinish(pgm->pclu->queue));

    //pclu_check_call("clReleaseKernel", clReleaseKernel(kern));
}

예제 #3

0

파일 보기

파일: acc_opencl_event.c 프로젝트: rzk1/cp2k-mcgill

int acc_event_create (void** event_p){
  // debug info
  if (verbose_print){
    fprintf(stdout, "\n ... EVENT CREATION ... \n");
    fprintf(stdout, " ---> Entering: acc_event_create.\n");
  }

  // local event object pointer
  *event_p = malloc(sizeof(cl_event));
  cl_event *clevent = (cl_event *) *event_p;

  // get a device event object
  *clevent = clCreateUserEvent((*acc_opencl_my_device).ctx, &cl_error);
  if (acc_opencl_error_check(cl_error, __LINE__))
    return -1;

  // debug info
  if (verbose_print){
    fprintf(stdout, " ---> Leaving: acc_event_create.\n");
  }

  cl_error = clSetUserEventStatus(*clevent, CL_COMPLETE);

  // assign return value
  return 0;
}

예제 #4

0

파일 보기

파일: ocl_event.cpp 프로젝트: FeBrex/OpenCL-Wrapper

/*! \brief Instantiates a user Event.
  *
  * You can add this event to track a command within a command Queue by adding this
  * Event into the EventList of the command. This
  * Event is created using the provided Context.
  */
ocl::Event::Event(ocl::Context& ctxt) : _id(0), _ctxt(&ctxt)
{
    TRUE_ASSERT(this->_ctxt != 0, "Context not valid");

    cl_int err;
    _id =  clCreateUserEvent (_ctxt->id(), &err);
    OPENCL_SAFE_CALL(err);
    TRUE_ASSERT(_id != 0, "Could not create user event");
}

예제 #5

0

파일 보기

파일: ocl_event.cpp 프로젝트: azuredsky/OpenCL-Wrapper-1

/*! \brief Instantiates a user Event.
  *
  * You can add this event to track a command within a command Queue by adding this
  * Event into the EventList of the command. This
  * Event is created using the provided Context.
  */
ocl::Event::Event(ocl::Context& ctxt) : _id(0), _ctxt(&ctxt)
{
	if(this->_ctxt == nullptr) throw std::runtime_error("no active context");

    cl_int err;
    _id =  clCreateUserEvent (_ctxt->id(), &err);
    OPENCL_SAFE_CALL(err);
	if(this->_id == nullptr) throw std::runtime_error("could not create user event");
}

예제 #6

0

파일 보기

파일: event.cpp 프로젝트: Robbepop/cppCL

	Event::Event(Context const& context) :
		Object{}
	{
		static const auto error_map = error::ErrorMap{
			{ErrorCode::invalid_context, "the given context is invalid."}
		};
		auto error = cl_int{CL_INVALID_VALUE};
		auto new_id = clCreateUserEvent(context.id(), &error);
		if (error::handle<EventException>(error, error_map)) m_id = new_id;
	}

예제 #7

0

파일 보기

파일: Event.cpp 프로젝트: ArtisticCoding/micropolis

void CL::UserEvent::begin(const CL::Event& dependencies)
{
    cl_int status;
    
    _event = clCreateUserEvent(_device.get_context(), &status);
    OPENCL_ASSERT(status);

    _id = _device.insert_user_event(_name, _event, dependencies);
    _active = true;
}

예제 #8

0

파일 보기

파일: test_user_event.c 프로젝트: clementleger/pocl

int main()
{
  cl_int err;
  cl_event user_evt = NULL;
  int i;

  // An user event can be set to either complete or a negative value, indicating error;
  // additionally, no objects involved in a command that waits on the user event should
  // be released before the event status is set; however, it should be possible to release
  // everything even if the status is set to something which is NOT CL_COMPLETE. So
  // try both CL_COMPLETE and a negative value
  cl_int status[] = {CL_INVALID_EVENT, CL_COMPLETE };

  // We also query for profiling info of the event, which according to the standard
  // should return CL_PROFILING_INFO_NOT_AVAILABLE
  cl_ulong queued, submitted, started, endtime;

  for (i = 0; i < ARRAY_SIZE(status); ++i) {
	  cl_context context;
	  cl_command_queue queue;
	  cl_device_id device;

	  CHECK_CL_ERROR(poclu_get_any_device(&context, &device, &queue));
	  TEST_ASSERT( context );
	  TEST_ASSERT( device );
	  TEST_ASSERT( queue );

	  user_evt = clCreateUserEvent(context, &err);
	  CHECK_OPENCL_ERROR_IN("clCreateUserEvent");
	  TEST_ASSERT( user_evt );

	  CHECK_CL_ERROR(clSetUserEventStatus(user_evt, status[i]));

	  err = clGetEventProfilingInfo(user_evt, CL_PROFILING_COMMAND_QUEUED,
		  sizeof(queued), &queued, NULL);
	  TEST_ASSERT(err == CL_PROFILING_INFO_NOT_AVAILABLE);
	  err = clGetEventProfilingInfo(user_evt, CL_PROFILING_COMMAND_SUBMIT,
		  sizeof(submitted), &submitted, NULL);
	  TEST_ASSERT(err == CL_PROFILING_INFO_NOT_AVAILABLE);
	  err = clGetEventProfilingInfo(user_evt, CL_PROFILING_COMMAND_START,
		  sizeof(started), &started, NULL);
	  TEST_ASSERT(err == CL_PROFILING_INFO_NOT_AVAILABLE);
	  err = clGetEventProfilingInfo(user_evt, CL_PROFILING_COMMAND_END,
		  sizeof(endtime), &endtime, NULL);
	  TEST_ASSERT(err == CL_PROFILING_INFO_NOT_AVAILABLE);

	  CHECK_CL_ERROR(clReleaseEvent(user_evt));
	  CHECK_CL_ERROR(clReleaseCommandQueue(queue));
	  CHECK_CL_ERROR(clReleaseContext(context));
  }

  return EXIT_SUCCESS;

}

예제 #9

0

파일 보기

파일: ocl_event.cpp 프로젝트: FeBrex/OpenCL-Wrapper

/*! \brief Instantiates a user Event.
  *
  * You can add this event to track a command within a command Queue by adding this
  * Event into the EventList of the command.
  * If there is an active Platform and an active Context
  * this Event is created. Otherwise do not forget
  * to provide a Context and to create this Event.
  */
ocl::Event::Event() : _id(0), _ctxt(0)
{
    if(ocl::Platform::hasActivePlatform() && ocl::Platform::activePlatform()->hasActiveContext()){
        _ctxt = ocl::Platform::activePlatform()->activeContext();
        TRUE_ASSERT(_ctxt != 0, "No active context");
        cl_int err;
        _id =  clCreateUserEvent (_ctxt->id(), &err);
        OPENCL_SAFE_CALL(err);
        TRUE_ASSERT(_id != 0, "Could not create user event");
    }
}

예제 #10

0

파일 보기

파일: WebCLUserEvent.cpp 프로젝트: smishenk/chromium-crosswalk

PassRefPtr<WebCLUserEvent> WebCLUserEvent::create(PassRefPtr<WebCLContext> context, ExceptionState& es)
{
    cl_int userEventError = 0;
    cl_event userEvent = clCreateUserEvent(context->getContext(), &userEventError);
    if (userEventError != CL_SUCCESS) {
        WebCLException::throwException(userEventError, es);
        return nullptr;
    }

    return adoptRef(new WebCLUserEvent(userEvent, context));
}

예제 #11

0

파일 보기

파일: qclcontext.cpp 프로젝트: k0zmo/qt-opencl

/*!
    Creates a user event.  Returns null if user events are not
    supported.

    User events are a feature of OpenCL 1.1 which allows an application
    to insert a marker into the command queue.  Commands that depend
    upon the marker will not be executed until the application triggers
    the user event with QCLUserEvent::setFinished().
*/
QCLUserEvent QCLContext::createUserEvent()
{
#ifdef QT_OPENCL_1_1
    Q_D(QCLContext);
    cl_int error = CL_INVALID_CONTEXT;
    cl_event event = clCreateUserEvent(d->id, &error);
    reportError("QCLContext::createUserEvent:", error);
    return QCLUserEvent(event, true);
#else
    return QCLUserEvent();
#endif
}

예제 #12

0

파일 보기

파일: ocl_event.cpp 프로젝트: azuredsky/OpenCL-Wrapper-1

/*! \brief Instantiates a user Event.
  *
  * You can add this event to track a command within a command Queue by adding this
  * Event into the EventList of the command.
  * If there is an active Platform and an active Context
  * this Event is created. Otherwise do not forget
  * to provide a Context and to create this Event.
  */
ocl::Event::Event() : _id(0), _ctxt(0)
{
	if(!ocl::Platform::hasActivePlatform() || !ocl::Platform::activePlatform()->hasActiveContext()) return;

	_ctxt = ocl::Platform::activePlatform()->activeContext();

	if(this->_ctxt == nullptr) throw std::runtime_error("no active context");
	cl_int err;
	_id =  clCreateUserEvent (_ctxt->id(), &err);
	OPENCL_SAFE_CALL(err);
	if(this->_id == nullptr) throw std::runtime_error("could not create user event");

}

예제 #13

0

파일 보기

파일: milkyway_cl_util.c 프로젝트: LocutusOfBorg/milkywayathome_client

cl_event mwCreateEvent(CLInfo* ci)
{
    cl_int err;
    cl_event ev;

    ev = clCreateUserEvent(ci->clctx, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to create custom event");
        return NULL;
    }

    return ev;
}

예제 #14

0

파일 보기

파일: register_event.hpp 프로젝트: hapoo/hpxcl

static void register_event(hpx::opencl::device cldevice,
                           const hpx::naming::id_type & event_id)
{

    boost::shared_ptr<hpx::opencl::server::device>
    parent_device = hpx::get_ptr<hpx::opencl::server::device>
                        (cldevice.get_gid()).get();

    // create a fake event
    cl_int err;
    cl_event event_cl = clCreateUserEvent (
            parent_device->get_context(),
            &err);
    cl_ensure(err, "clEnqueueWriteBuffer()");
    err = clSetUserEventStatus(event_cl, CL_COMPLETE);
    cl_ensure(err, "clSetUserEventStatus()");

    parent_device->register_event(event_id, event_cl);
}

예제 #15

0

파일 보기

파일: pclu.c 프로젝트: NatTuck/pocl-0.7x

void
pclu_call_kernel(pclu_program* pgm, const char* name, pclu_range range, size_t argc, ...)
{
    cl_int errcode;
    cl_kernel kern = clCreateKernel(pgm->program, name, &errcode);
    pclu_check_call("clCreateKernel", errcode);

    va_list ap;
    va_start(ap, argc);

    for (cl_uint ii = 0; ii < argc; ++ii) {
        size_t size = va_arg(ap, size_t);	
        void*  arg  = va_arg(ap, void*);
        pclu_check_call("clSetKernelArg", clSetKernelArg(kern, ii, size, arg));
    }

    va_end(ap);

#define NO_CL_EVENTS 1

#ifdef NO_CL_EVENTS
    cl_event kernel_done = 0;
#else
    cl_event kernel_done = clCreateUserEvent(pgm->pclu->context, &errcode);
    pclu_check_call("clCreateUserEvent", errcode);
#endif

    errcode = clEnqueueNDRangeKernel(pgm->pclu->queue, kern, range.nd, 0, 
				     range.global, 0, 0, 0, &kernel_done);
    pclu_check_call("clEnqueueNDRangeKernel", errcode);

#ifndef NO_CL_EVENTS
    pclu_check_call("clWaitForEvents", clWaitForEvents(1, &kernel_done));
#endif

    pclu_check_call("clReleaseKernel", clReleaseKernel(kern));
}

예제 #16

0

파일 보기

파일: ccl.c 프로젝트: shibatch/cclopencl

int main(int argc, char **argv) {
  int i;

  if (argc < 2) {
    fprintf(stderr, "Usage : %s <image file name> [<device number>]\nThe program will threshold the image, apply CCL,\nand output the result to output.png.\n", argv[0]);
    fprintf(stderr, "\nAvailable OpenCL Devices :\n");
    simpleGetDevice(-1);
    exit(-1);
  }

  //

  IplImage *img = 0;
  img = cvLoadImage(argv[1], CV_LOAD_IMAGE_COLOR);
  if( !img ) abortf("Could not load %s\n", argv[1]);

  if (img->nChannels != 3) abortf("nChannels != 3\n");

  int iw = img->width, ih = img->height;
  uint8_t *data = (uint8_t *)img->imageData;

  //

  cl_int *bufPix   = (cl_int *)calloc(iw * ih,   sizeof(cl_int));
  cl_int *bufLabel = (cl_int *)calloc(iw * ih,   sizeof(cl_int));
  cl_int *bufFlags = (cl_int *)calloc(MAXPASS+1, sizeof(cl_int));

  {
    int x, y;
    for(y=0;y<ih;y++) {
      for(x=0;x<iw;x++) {
	bufPix[y * iw + x] = data[y * img->widthStep + x * 3 + 1] > 127 ? 1 : 0;
      }
    }
  }

  //

  int did = 0;
  if (argc >= 3) did = atoi(argv[2]);

  cl_device_id device = simpleGetDevice(did);
  cl_context context = simpleCreateContext(device);

  cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL);

  char *source = readFileAsStr("ccl.cl");
  cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source, 0, NULL);

  cl_int ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
  if (ret != CL_SUCCESS) {
    fprintf(stderr, "Could not build program : %d\n", ret);
    if (ret == CL_BUILD_PROGRAM_FAILURE) fprintf(stderr, "CL_BUILD_PROGRAM_FAILURE\n");
    if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 10000, strbuf, NULL) == CL_SUCCESS) {
      fprintf(stderr, "Build log follows\n");
      fprintf(stderr, "%s\n", strbuf);
    }
    exit(-1);
  }

  cl_kernel kernel_prepare = clCreateKernel(program, "labelxPreprocess_int_int", NULL);
  cl_kernel kernel_propagate = clCreateKernel(program, "label8xMain_int_int", NULL);

  // By specifying CL_MEM_COPY_HOST_PTR, device buffers are cleared.
  cl_mem memPix   = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, iw * ih * sizeof(cl_int), bufPix, NULL);
  cl_mem memLabel = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, iw * ih * sizeof(cl_int), bufLabel, NULL);
  cl_mem memFlags = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, (MAXPASS+1) * sizeof(cl_int), bufFlags, NULL);

  size_t work_size[2] = {(size_t)((iw + 31) & ~31), (size_t)((ih + 31) & ~31)};

  cl_event events[MAXPASS+1];
  for(i=0;i<=MAXPASS;i++) {
    events[i] = clCreateUserEvent(context, NULL);
  }

  //

  clSetKernelArg(kernel_prepare, 0, sizeof(cl_mem), (void *) &memLabel);
  clSetKernelArg(kernel_prepare, 1, sizeof(cl_mem), (void *) &memPix);
  clSetKernelArg(kernel_prepare, 2, sizeof(cl_mem), (void *) &memFlags);
  i = MAXPASS; clSetKernelArg(kernel_prepare, 3, sizeof(cl_int), (void *) &i);
  i = 0; clSetKernelArg(kernel_prepare, 4, sizeof(cl_int), (void *) &i);
  clSetKernelArg(kernel_prepare, 5, sizeof(cl_int), (int *) &iw);
  clSetKernelArg(kernel_prepare, 6, sizeof(cl_int), (int *) &ih);

  clEnqueueNDRangeKernel(queue, kernel_prepare, 2, NULL, work_size, NULL, 0, NULL, &events[0]);

  for(i=1;i<=MAXPASS;i++) {
    clSetKernelArg(kernel_propagate, 0, sizeof(cl_mem), (void *) &memLabel);
    clSetKernelArg(kernel_propagate, 1, sizeof(cl_mem), (void *) &memPix);
    clSetKernelArg(kernel_propagate, 2, sizeof(cl_mem), (void *) &memFlags);
    clSetKernelArg(kernel_propagate, 3, sizeof(cl_int), (void *) &i);
    clSetKernelArg(kernel_propagate, 4, sizeof(cl_int), (int *) &iw);
    clSetKernelArg(kernel_propagate, 5, sizeof(cl_int), (int *) &ih);

    clEnqueueNDRangeKernel(queue, kernel_propagate, 2, NULL, work_size, NULL, 0, NULL, &events[i]);
  }

  clEnqueueReadBuffer(queue, memLabel, CL_TRUE, 0, iw * ih * sizeof(cl_int), bufLabel, 0, NULL, NULL);
  clEnqueueReadBuffer(queue, memFlags, CL_TRUE, 0, (MAXPASS+1) * sizeof(cl_int), bufFlags, 0, NULL, NULL);

  clFinish(queue);

  long long int total = 0;
  for(i=0;i<=MAXPASS;i++) {
    cl_ulong tstart, tend;
    clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &tstart, NULL);
    clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_END  , sizeof(cl_ulong), &tend  , NULL);
    clReleaseEvent(events[i]);

    printf("pass %2d : %10lld nano sec\n", i, (long long int)(tend - tstart));
    total += tend - tstart;
  }

  printf("total   : %10lld nano sec\n", total);

  clReleaseMemObject(memFlags);
  clReleaseMemObject(memLabel);
  clReleaseMemObject(memPix);
  clReleaseKernel(kernel_propagate);
  clReleaseKernel(kernel_prepare);
  clReleaseProgram(program);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);

  //

  {
    int x, y;
    for(y=0;y<ih;y++) {
      for(x=0;x<iw;x++) {
	int rgb = bufLabel[y * iw + x] == -1 ? 0 : (bufLabel[y * iw + x]  * 1103515245 + 12345);
	//int rgb = bufLabel[y * iw + x] == -1 ? 0 : (bufLabel[y * iw + x]);
	data[y * img->widthStep + x * 3 + 0] = rgb & 0xff; rgb >>= 8;
	data[y * img->widthStep + x * 3 + 1] = rgb & 0xff; rgb >>= 8;
	data[y * img->widthStep + x * 3 + 2] = rgb & 0xff; rgb >>= 8;
      }
    }
  }

  int params[3] = { CV_IMWRITE_PNG_COMPRESSION, 9, 0 };

  cvSaveImage("output.png", img, params);

  free(bufFlags);
  free(bufLabel);
  free(bufPix);

  exit(0);
}

예제 #17

0

파일 보기

파일: events.c 프로젝트: gujunli/OpenCL-Benchmark

int main()
{
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_mem objA = NULL;
    cl_mem objB = NULL;
    cl_mem objC = NULL;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret;
 
    cl_event event1;
 
    int i, j;
    float *A;
    float *B;
    float *C;
 
    A = (float *)malloc(4*4*sizeof(float));
    B = (float *)malloc(4*4*sizeof(float));
    C = (float *)malloc(4*4*sizeof(float));
 
    /* Initialize input data */
    for (i=0; i<4; i++) {
        for (j=0; j<4; j++) {
            A[i*4+j] = i*4+j+1;
            B[i*4+j] = j*4+i+1;
        }
    }
 
    /* Get Platform/Device Information*/
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
 
    /* Create OpenCL Context */
    context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
 
    /* Create command queue */
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
 
    /* Create Buffer Object */
    objA = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
    objB = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
    objC = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
 
    /*
     * Creating an user event
     * As a user event is created, its execution status is set to be CL_SUBMITTED
     * and we tag the event to a callback so when event reaches CL_COMPLETE, it will 
     * execute postProcess
     */ 
    event1 = clCreateUserEvent(context, &ret);
    clSetEventCallback(event1, CL_COMPLETE, &postProcess, "Looks like its done.");

    /* Copy input data to the memory buffer */
 
    ret = clEnqueueWriteBuffer(command_queue, objA, CL_TRUE, 0, 4*4*sizeof(float), A, 0, NULL, NULL );
    printf("A has been written\n");
 
    /* The next command will wait for event1 according to its status*/
    ret = clEnqueueWriteBuffer(command_queue, objB, CL_TRUE, 0, 4*4*sizeof(float), B, 1, &event1, NULL);
    printf("B has been written\n");

    /* Tell event1 to complete */
    clSetUserEventStatus(event1, CL_COMPLETE);
	const char *file_names[] = {"sample_kernel.cl"}; 
	const int NUMBER_OF_FILES = 1;
	char* buffer[NUMBER_OF_FILES];
	size_t sizes[NUMBER_OF_FILES];
	loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes);
	
    /* Create kernel program from source file*/
    program = clCreateProgramWithSource(context, 1, (const char **)buffer, sizes, &ret);
    ret     = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
 
    /* Create data parallel OpenCL kernel */
    kernel = clCreateKernel(program, "sample", &ret);
 
    /* Set OpenCL kernel arguments */
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&objA);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&objB);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&objC);
 
    size_t global_item_size = 4;
    size_t local_item_size = 1;
 
    /* Execute OpenCL kernel as data parallel */
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
                                 &global_item_size, &local_item_size, 0, NULL, NULL);
 
    /* Transfer result to host */
    ret = clEnqueueReadBuffer(command_queue, objC, CL_TRUE, 0, 4*4*sizeof(float), C, 0, NULL, NULL);
 
    /* Display Results */
    for (i=0; i<4; i++) {
        for (j=0; j<4; j++) {
            printf("%7.2f ", C[i*4+j]);
        }
        printf("\n");
    }
 
 
    /* Finalization */
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(objA);
    ret = clReleaseMemObject(objB);
    ret = clReleaseMemObject(objC);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
 
    free(A);
    free(B);
    free(C);
 
    return 0;
}

예제 #18

0

파일 보기

파일: testdtfield_cl.c 프로젝트: gdolle/SCHNAPS

int TestDtfield_CL(void){
  bool test = true;

  if(!cldevice_is_acceptable(nplatform_cl, ndevice_cl)) {
    printf("OpenCL device not acceptable.\n");
    return true;
  }

  field f;
  
  // 2D meshes:
  // test/disque2d.msh
  // test/testdisque2d.msh
  // test/testmacromesh.msh
  // test/unit-cube.msh

  char *mshname =  "test/disque2d.msh";
  
  ReadMacroMesh(&(f.macromesh), mshname);
  Detect2DMacroMesh(&f.macromesh);
  BuildConnectivity(&f.macromesh);

#if 1
  // 2D version
  assert(f.macromesh.is2d);

  f.model.cfl = 0.05;
  f.model.m = 1;
  m = f.model.m;


  f.model.NumFlux = TransNumFlux2d;
  f.model.BoundaryFlux = TransBoundaryFlux2d;
  f.model.InitData = TransInitData2d;
  f.model.ImposedData = TransImposedData2d;
  f.varindex = GenericVarindex;

  f.interp.interp_param[0] = f.model.m;
  f.interp.interp_param[1] = 2; // x direction degree
  f.interp.interp_param[2] = 2; // y direction degree
  f.interp.interp_param[3] = 0; // z direction degree
  f.interp.interp_param[4] = 4; // x direction refinement
  f.interp.interp_param[5] = 4; // y direction refinement
  f.interp.interp_param[6] = 1; // z direction refinement

#else
  // 3D version
  f.model.cfl = 0.05;
  f.model.m = 1;
  f.model.NumFlux = TransNumFlux;
  f.model.BoundaryFlux = TestTransBoundaryFlux;
  f.model.InitData = TestTransInitData;
  f.model.ImposedData = TestTransImposedData;
  f.varindex = GenericVarindex;

  f.interp.interp_param[0] = f.model.m;
  f.interp.interp_param[1] = 2; // x direction degree
  f.interp.interp_param[2] = 2; // y direction degree
  f.interp.interp_param[3] = 2; // z direction degree
  f.interp.interp_param[4] = 3; // x direction refinement
  f.interp.interp_param[5] = 3; // y direction refinement
  f.interp.interp_param[6] = 3; // z direction refinement
#endif

  set_global_m(f.model.m);
  set_source_CL(&f, "OneSource");
  Initfield(&f);
  
  cl_event clv_dtfield = clCreateUserEvent(f.cli.context, NULL);
  
  dtfield_CL(&f, &f.wn_cl, 0, NULL, &clv_dtfield);
  clWaitForEvents(1, &clv_dtfield);
  CopyfieldtoCPU(&f);

  // Displayfield(&f);
  show_cl_timing(&f);

  real *saveptr = f.dtwn;
  f.dtwn = calloc(f.wsize, sizeof(real));

  f.model.Source = OneSource;
  dtfield(&f, f.wn, f.dtwn);
 
  real maxerr = 0;
  for(int i = 0; i < f.wsize; i++) {
    real error = f.dtwn[i] - saveptr[i];
    //printf("error= \t%f\t%f\t%f\n", error, f.dtwn[i], saveptr[i]);
    maxerr = fmax(fabs(error), maxerr);
  }
  printf("max error: %f\n", maxerr);

  test = (maxerr < 1e-8);

  return test;
}

예제 #19

0

파일 보기

파일: ljmd-cl.c 프로젝트: ajasja/ljmd-opencl

/* main */
int main(int argc, char **argv) 
{
  /*OpenCL variables */
  cl_device_id device;
  cl_device_type device_type; /*to test if we are on cpu or gpu*/
  cl_context context;
  cl_command_queue cmdQueue;

  /* The event variables are created only when needed */
#ifdef _UNBLOCK
  cl_uint  num_events = 3;
  cl_event event[num_events];
#endif

  FPTYPE * buffers[3];
  cl_mdsys_t cl_sys;
  cl_int status;

  int nprint, i, nthreads = 0;
  char restfile[BLEN], trajfile[BLEN], ergfile[BLEN], line[BLEN];
  FILE *fp,*traj,*erg;
  mdsys_t sys;


/* Start profiling */

#ifdef __PROFILING
  
  double t1, t2;

  t1 = second();

#endif

  /* handling the command line arguments */
  switch (argc) {
      case 2: /* only the cpu/gpu argument was passed, setting default nthreads */
	      if( !strcmp( argv[1], "cpu" ) ) nthreads = 16;
	      else nthreads = 1024;
	      break;
      case 3: /* both the device type (cpu/gpu) and the number of threads were passed */
	      nthreads = strtol(argv[2],NULL,10);
	      if( nthreads<0 ) {
		      fprintf( stderr, "\n. The number of threads must be more than 1.\n");
		      PrintUsageAndExit();
	      }
	      break;
      default:
	      PrintUsageAndExit();
	      break;
  }
  
  /* Initialize the OpenCL environment */
  if( InitOpenCLEnvironment( argv[1], &device, &context, &cmdQueue ) != CL_SUCCESS ){
    fprintf( stderr, "Program Error! OpenCL Environment was not initialized correctly.\n" );
    return 4;
  }

  /* The event initialization is performed only when needed */
#ifdef _UNBLOCK
  /* initialize the cl_event handler variables */
  for( i = 0; i < num_events; ++i) {
	  event[i] = clCreateUserEvent( context, NULL );
	  clSetUserEventStatus( event[i], CL_COMPLETE );
  }
#endif

  /* read input file */
  if(get_me_a_line(stdin,line)) return 1;
  sys.natoms=atoi(line);
  if(get_me_a_line(stdin,line)) return 1;
  sys.mass=atof(line);
  if(get_me_a_line(stdin,line)) return 1;
  sys.epsilon=atof(line);
  if(get_me_a_line(stdin,line)) return 1;
  sys.sigma=atof(line);
  if(get_me_a_line(stdin,line)) return 1;
  sys.rcut=atof(line);
  if(get_me_a_line(stdin,line)) return 1;
  sys.box=atof(line);
  if(get_me_a_line(stdin,restfile)) return 1;
  if(get_me_a_line(stdin,trajfile)) return 1;
  if(get_me_a_line(stdin,ergfile)) return 1;
  if(get_me_a_line(stdin,line)) return 1;
  sys.nsteps=atoi(line);
  if(get_me_a_line(stdin,line)) return 1;
  sys.dt=atof(line);
  if(get_me_a_line(stdin,line)) return 1;
  nprint=atoi(line);
  

  
  /* allocate memory */
  cl_sys.natoms = sys.natoms;
  cl_sys.rx = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  cl_sys.ry = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  cl_sys.rz = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  cl_sys.vx = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  cl_sys.vy = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  cl_sys.vz = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  cl_sys.fx = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  cl_sys.fy = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  cl_sys.fz = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  
  buffers[0] = (FPTYPE *) malloc( 2 * cl_sys.natoms * sizeof(FPTYPE) );
  buffers[1] = (FPTYPE *) malloc( 2 * cl_sys.natoms * sizeof(FPTYPE) );
  buffers[2] = (FPTYPE *) malloc( 2 * cl_sys.natoms * sizeof(FPTYPE) );
  
  /* read restart */
  fp = fopen( restfile, "r" );
  if( fp ) {
    for( i = 0; i < 2 * cl_sys.natoms; ++i ){
#ifdef _USE_FLOAT
      fscanf( fp, "%f%f%f", buffers[0] + i, buffers[1] + i, buffers[2] + i);
#else
      fscanf( fp, "%lf%lf%lf", buffers[0] + i, buffers[1] + i, buffers[2] + i);
#endif
    }
    
    status = clEnqueueWriteBuffer( cmdQueue, cl_sys.rx, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[0], 0, NULL, NULL ); 
    status |= clEnqueueWriteBuffer( cmdQueue, cl_sys.ry, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[1], 0, NULL, NULL ); 
    status |= clEnqueueWriteBuffer( cmdQueue, cl_sys.rz, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[2], 0, NULL, NULL ); 
    
    status |= clEnqueueWriteBuffer( cmdQueue, cl_sys.vx, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[0] + cl_sys.natoms, 0, NULL, NULL ); 
    status |= clEnqueueWriteBuffer( cmdQueue, cl_sys.vy, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[1] + cl_sys.natoms, 0, NULL, NULL ); 
    status |= clEnqueueWriteBuffer( cmdQueue, cl_sys.vz, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[2] + cl_sys.natoms, 0, NULL, NULL ); 
    
    fclose(fp);

  } else {
    perror("cannot read restart file");
    return 3;
  }
  
  /* initialize forces and energies.*/
  sys.nfi=0;
  
  size_t globalWorkSize[1];
  globalWorkSize[0] = nthreads;
  
  const char * sourcecode =
  #include <opencl_kernels_as_string.h>
  ;

  cl_program program = clCreateProgramWithSource( context, 1, (const char **) &sourcecode, NULL, &status );
  
  status |= clBuildProgram( program, 0, NULL, kernelflags, NULL, NULL );
  
#ifdef __DEBUG
  size_t log_size;
  char log [200000]; 
  clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_LOG, sizeof(log), log, &log_size );
  fprintf( stderr, "\nLog: \n\n %s", log ); 
#endif
  
  cl_kernel kernel_force = clCreateKernel( program, "opencl_force", &status );
  cl_kernel kernel_ekin = clCreateKernel( program, "opencl_ekin", &status );
  cl_kernel kernel_verlet_first = clCreateKernel( program, "opencl_verlet_first", &status );
  cl_kernel kernel_verlet_second = clCreateKernel( program, "opencl_verlet_second", &status );
  cl_kernel kernel_azzero = clCreateKernel( program, "opencl_azzero", &status );
  
  FPTYPE * tmp_epot;
  cl_mem epot_buffer;
  tmp_epot = (FPTYPE *) malloc( nthreads * sizeof(FPTYPE) );
  epot_buffer = clCreateBuffer( context, CL_MEM_READ_WRITE, nthreads * sizeof(FPTYPE), NULL, &status );
  
  /* precompute some constants */
  FPTYPE c12 = 4.0 * sys.epsilon * pow( sys.sigma, 12.0);
  FPTYPE c6  = 4.0 * sys.epsilon * pow( sys.sigma, 6.0);
  FPTYPE rcsq = sys.rcut * sys.rcut;
  FPTYPE boxby2 = HALF * sys.box;  
  FPTYPE dtmf = HALF * sys.dt / mvsq2e / sys.mass;
  sys.epot = ZERO;
  sys.ekin = ZERO;

  /* Azzero force buffer */
  status = clSetMultKernelArgs( kernel_azzero, 0, 4, KArg(cl_sys.fx), KArg(cl_sys.fy), KArg(cl_sys.fz), KArg(cl_sys.natoms));

  status = clEnqueueNDRangeKernel( cmdQueue, kernel_azzero, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL );

  status |= clSetMultKernelArgs( kernel_force, 0, 13,
	KArg(cl_sys.fx),
	KArg(cl_sys.fy),
	KArg(cl_sys.fz),
	KArg(cl_sys.rx),
	KArg(cl_sys.ry),
	KArg(cl_sys.rz),
	KArg(cl_sys.natoms),
	KArg(epot_buffer),
	KArg(c12),
	KArg(c6),
	KArg(rcsq),
	KArg(boxby2),
	KArg(sys.box));
  
  status = clEnqueueNDRangeKernel( cmdQueue, kernel_force, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL );
  
  status |= clEnqueueReadBuffer( cmdQueue, epot_buffer, CL_TRUE, 0, nthreads * sizeof(FPTYPE), tmp_epot, 0, NULL, NULL );     
  
  for( i = 0; i < nthreads; i++) sys.epot += tmp_epot[i];
  
  FPTYPE * tmp_ekin;
  cl_mem ekin_buffer;
  tmp_ekin = (FPTYPE *) malloc( nthreads * sizeof(FPTYPE) );
  ekin_buffer = clCreateBuffer( context, CL_MEM_READ_WRITE, nthreads * sizeof(FPTYPE), NULL, &status );
  
  status |= clSetMultKernelArgs( kernel_ekin, 0, 5, KArg(cl_sys.vx), KArg(cl_sys.vy), KArg(cl_sys.vz),
    KArg(cl_sys.natoms), KArg(ekin_buffer));
  
  status = clEnqueueNDRangeKernel( cmdQueue, kernel_ekin, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL );
    
  status |= clEnqueueReadBuffer( cmdQueue, ekin_buffer, CL_TRUE, 0, nthreads * sizeof(FPTYPE), tmp_ekin, 0, NULL, NULL );     

  for( i = 0; i < nthreads; i++) sys.ekin += tmp_ekin[i];
  sys.ekin *= HALF * mvsq2e * sys.mass;
  sys.temp  = TWO * sys.ekin / ( THREE * sys.natoms - THREE ) / kboltz;

  erg=fopen(ergfile,"w");
  traj=fopen(trajfile,"w");

  printf("Starting simulation with %d atoms for %d steps.\n",sys.natoms, sys.nsteps);
  printf("     NFI            TEMP            EKIN                 EPOT              ETOT\n");
  
  /* download data on host */
  status = clEnqueueReadBuffer( cmdQueue, cl_sys.rx, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[0], 0, NULL, NULL ); 
  status |= clEnqueueReadBuffer( cmdQueue, cl_sys.ry, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[1], 0, NULL, NULL ); 
  status |= clEnqueueReadBuffer( cmdQueue, cl_sys.rz, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[2], 0, NULL, NULL ); 
  
  sys.rx = buffers[0];
  sys.ry = buffers[1];
  sys.rz = buffers[2];
  
  output(&sys, erg, traj);

  /**************************************************/
  /* main MD loop */
  for(sys.nfi=1; sys.nfi <= sys.nsteps; ++sys.nfi) {

    /* propagate system and recompute energies */
    /* 2) verlet_first   */
    status |= clSetMultKernelArgs( kernel_verlet_first, 0, 12,
      KArg(cl_sys.fx),
      KArg(cl_sys.fy),
      KArg(cl_sys.fz),
      KArg(cl_sys.rx),
      KArg(cl_sys.ry),
      KArg(cl_sys.rz),
      KArg(cl_sys.vx),
      KArg(cl_sys.vy),
      KArg(cl_sys.vz),
      KArg(cl_sys.natoms),
      KArg(sys.dt),
      KArg(dtmf));
    CheckSuccess(status, 2);

    /* When the data transfer is non blocking, this kernel has to wait the completion of part 8 (event[2]) */
#ifdef _UNBLOCK
    status = clEnqueueNDRangeKernel( cmdQueue, kernel_verlet_first, 1, NULL, globalWorkSize, NULL, 1, &event[2], NULL );
#else
    status = clEnqueueNDRangeKernel( cmdQueue, kernel_verlet_first, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL );
#endif

    /* 6) download position@device to position@host */
    if ((sys.nfi % nprint) == nprint-1) {

    /* In non blocking mode (CL_FALSE) this data transfer raises events[i] */
#ifdef _UNBLOCK
	status  = clEnqueueReadBuffer( cmdQueue, cl_sys.rx, CL_FALSE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[0], 0, NULL, &event[2] );
	status |= clEnqueueReadBuffer( cmdQueue, cl_sys.ry, CL_FALSE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[1], 0, NULL, &event[1] );
	status |= clEnqueueReadBuffer( cmdQueue, cl_sys.rz, CL_FALSE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[2], 0, NULL, &event[0] );
#else
	status  = clEnqueueReadBuffer( cmdQueue, cl_sys.rx, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[0], 0, NULL, NULL );
	status |= clEnqueueReadBuffer( cmdQueue, cl_sys.ry, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[1], 0, NULL, NULL );
	status |= clEnqueueReadBuffer( cmdQueue, cl_sys.rz, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[2], 0, NULL, NULL );
#endif
	CheckSuccess(status, 6);
    }

    /* 3) force */
    status |= clSetMultKernelArgs( kernel_force, 0, 13,
      KArg(cl_sys.fx),
      KArg(cl_sys.fy),
      KArg(cl_sys.fz),
      KArg(cl_sys.rx),
      KArg(cl_sys.ry),
      KArg(cl_sys.rz),
      KArg(cl_sys.natoms),
      KArg(epot_buffer),
      KArg(c12),
      KArg(c6),
      KArg(rcsq),
      KArg(boxby2),
      KArg(sys.box));

    CheckSuccess(status, 3);
    status = clEnqueueNDRangeKernel( cmdQueue, kernel_force, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL );

    /* 7) download E_pot[i]@device and perform reduction to E_pot@host */
    if ((sys.nfi % nprint) == nprint-1) {

    /* In non blocking mode (CL_FALSE) this data transfer kernel raises an event[1] */
#ifdef _UNBLOCK
	status |= clEnqueueReadBuffer( cmdQueue, epot_buffer, CL_FALSE, 0, nthreads * sizeof(FPTYPE), tmp_epot, 0, NULL, &event[1] );
#else
	status |= clEnqueueReadBuffer( cmdQueue, epot_buffer, CL_TRUE, 0, nthreads * sizeof(FPTYPE), tmp_epot, 0, NULL, NULL );
#endif
	CheckSuccess(status, 7);
    }

    /* 4) verlet_second */
    status |= clSetMultKernelArgs( kernel_verlet_second, 0, 9,
      KArg(cl_sys.fx),
      KArg(cl_sys.fy),
      KArg(cl_sys.fz),
      KArg(cl_sys.vx),
      KArg(cl_sys.vy),
      KArg(cl_sys.vz),
      KArg(cl_sys.natoms),
      KArg(sys.dt),
      KArg(dtmf));

    CheckSuccess(status, 4);
    status = clEnqueueNDRangeKernel( cmdQueue, kernel_verlet_second, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL );

    if ((sys.nfi % nprint) == nprint-1) {

	/* 5) ekin */
	status |= clSetMultKernelArgs( kernel_ekin, 0, 5, KArg(cl_sys.vx), KArg(cl_sys.vy), KArg(cl_sys.vz),
			KArg(cl_sys.natoms), KArg(ekin_buffer));
	CheckSuccess(status, 5);
	status = clEnqueueNDRangeKernel( cmdQueue, kernel_ekin, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL );


	/* 8) download E_kin[i]@device and perform reduction to E_kin@host */
	/* In non blocking mode (CL_FALSE) this data transfer kernel raises an event[2] */
#ifdef _UNBLOCK
	status |= clEnqueueReadBuffer( cmdQueue, ekin_buffer, CL_FALSE, 0, nthreads * sizeof(FPTYPE), tmp_ekin, 0, NULL, &event[2] );
#else
	status |= clEnqueueReadBuffer( cmdQueue, ekin_buffer, CL_TRUE, 0, nthreads * sizeof(FPTYPE), tmp_ekin, 0, NULL, NULL );
#endif
	CheckSuccess(status, 8);
    }

    /* 1) write output every nprint steps */
    if ((sys.nfi % nprint) == 0) {

    /* Calling a synchronization function (only when in non blocking mode) that will wait until all the
     * events[i], related to the data transfers, to be completed */
#ifdef _UNBLOCK
        clWaitForEvents(3, event);
#endif
	sys.rx = buffers[0];
	sys.ry = buffers[1];
	sys.rz = buffers[2];

	/* initialize the sys.epot@host and sys.ekin@host variables to ZERO */
	sys.epot = ZERO;
	sys.ekin = ZERO;

	/* reduction on the tmp_Exxx[i] buffers downloaded from the device
	 * during parts 7 and 8 of the previous MD loop iteration */
	for( i = 0; i < nthreads; i++) {
		sys.epot += tmp_epot[i];
		sys.ekin += tmp_ekin[i];
	}

	/* multiplying the kinetic energy by prefactors */
	sys.ekin *= HALF * mvsq2e * sys.mass;
	sys.temp  = TWO * sys.ekin / ( THREE * sys.natoms - THREE ) / kboltz;

	/* writing output files (positions, energies and temperature) */
	output(&sys, erg, traj);
    }

  }
  /**************************************************/

/* End profiling */

#ifdef __PROFILING

t2 = second();

fprintf( stdout, "\n\nTime of execution = %.3g (seconds)\n", (t2 - t1) );

#endif






  /* clean up: close files, free memory */
  printf("Simulation Done.\n");
  fclose(erg);
  fclose(traj);

  free(buffers[0]);
  free(buffers[1]);
  free(buffers[2]);

  return 0;
}

예제 #20

0

파일 보기

파일: test_commandqueue.cpp 프로젝트: jakebolewski/clover

END_TEST

START_TEST (test_misc_events)
{
    cl_platform_id platform = 0;
    cl_device_id device;
    cl_context ctx;
    cl_command_queue queue;
    cl_int result;
    cl_event uevent1, uevent2, marker1, marker2;

    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
    fail_if(
        result != CL_SUCCESS,
        "unable to get the default device"
    );

    ctx = clCreateContext(0, 1, &device, 0, 0, &result);
    fail_if(
        result != CL_SUCCESS || ctx == 0,
        "unable to create a valid context"
    );

    queue = clCreateCommandQueue(ctx, device, 0, &result);
    fail_if(
        result != CL_SUCCESS || queue == 0,
        "cannot create a command queue"
    );

    /*
     * This test will build a command queue blocked by an user event. The events
     * will be in this order :
     *
     * -: UserEvent1
     * 0: WaitForEvents1 (wait=UserEvent1)
     * 1: Marker1
     * -: UserEvent2
     * 2: WaitForEvents2 (wait=UserEvent2)
     * 3: Barrier
     * 4: Marker2 (to check the barrier worked)
     *
     * When the command queue is built, we :
     *  - Check that Marker1 is Queued (WaitForEvents waits)
     *  - Set UserEvent1 to Complete
     *  - Check that Marker1 is Complete (WaitForEvents stopped to wait)
     *  - Check that Marker2 is Queued (Barrier is there)
     *  - Set UserEvent2 to Complete
     *  - Check that Marker2 is Complete (no more barrier)
     */
    uevent1 = clCreateUserEvent(ctx, &result);
    fail_if(
        result != CL_SUCCESS,
        "unable to create UserEvent1"
    );

    uevent2 = clCreateUserEvent(ctx, &result);
    fail_if(
        result != CL_SUCCESS,
        "unable to create UserEvent2"
    );

    result = clEnqueueWaitForEvents(queue, 1, &uevent1);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue WaitForEvents(UserEvent1)"
    );

    result = clEnqueueMarker(queue, &marker1);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue Marker1"
    );

    result = clEnqueueWaitForEvents(queue, 1, &uevent2);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue WaitForEvents(UserEvent2)"
    );

    result = clEnqueueBarrier(queue);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue Barrier"
    );

    result = clEnqueueMarker(queue, &marker2);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue Marker2"
    );

    // Now the checks
    cl_int status;

    result = clGetEventInfo(marker1, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &status, 0);
    fail_if(
        result != CL_SUCCESS || status != CL_QUEUED,
        "Marker1 must be Queued"
    );

    result = clSetUserEventStatus(uevent1, CL_COMPLETE);
    fail_if(
        result != CL_SUCCESS,
        "unable to set UserEvent1 to Complete"
    );

    result = clGetEventInfo(marker1, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &status, 0);
    fail_if(
        result != CL_SUCCESS || status != CL_COMPLETE,
        "Marker1 must be Complete"
    );

    result = clGetEventInfo(marker2, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &status, 0);
    fail_if(
        result != CL_SUCCESS || status != CL_QUEUED,
        "Marker2 must be Queued"
    );

    result = clSetUserEventStatus(uevent2, CL_COMPLETE);
    fail_if(
        result != CL_SUCCESS,
        "unable to set UserEvent2 to Complete"
    );

    result = clGetEventInfo(marker2, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &status, 0);
    fail_if(
        result != CL_SUCCESS || status != CL_COMPLETE,
        "Marker2 must be Complete"
    );

    clFinish(queue);

    clReleaseEvent(uevent1);
    clReleaseEvent(uevent2);
    clReleaseEvent(marker1);
    clReleaseEvent(marker2);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
}

예제 #21

0

파일 보기

파일: enqueue-fill-image.c 프로젝트: Jul13t/piglit

enum piglit_result
piglit_cl_test(const int argc,
               const char **argv,
               const struct piglit_cl_api_test_config* config,
               const struct piglit_cl_api_test_env* env)
{
#if defined(CL_VERSION_1_2)
	enum piglit_result result = PIGLIT_PASS;
	cl_int err;

#define IMG_WIDTH 4
#define IMG_HEIGHT 4
#define IMG_DATA_SIZE 4
#define IMG_BUFFER_SIZE IMG_WIDTH * IMG_HEIGHT * IMG_DATA_SIZE

	unsigned char img_buf[IMG_BUFFER_SIZE] = {0};
	unsigned char dst_buf[IMG_BUFFER_SIZE] = {0};
	unsigned char exp_buf[IMG_BUFFER_SIZE] = {0};
	int pattern[4] = {129, 33, 77, 255};
	size_t origin[3] = {0, 0, 0};
	size_t region[3] = {2, 2, 1};
	size_t tmp;
	cl_event event;
	cl_mem image;
	cl_image_format img_format;
	cl_image_desc img_desc = {0};
	cl_command_queue queue = env->context->command_queues[0];
	int i;

	cl_bool *image_support =
		piglit_cl_get_device_info(env->context->device_ids[0],
		                          CL_DEVICE_IMAGE_SUPPORT);

	if (!*image_support) {
		fprintf(stderr, "No image support\n");
		free(image_support);
		return PIGLIT_SKIP;
	}

	img_format.image_channel_order = CL_RGBA;
	img_format.image_channel_data_type = CL_UNSIGNED_INT8;
	img_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
	img_desc.image_width = IMG_WIDTH;
	img_desc.image_height = IMG_HEIGHT;
	img_desc.buffer = NULL;

/*** Normal usage ***/
	image = clCreateImage(env->context->cl_ctx,
	                      CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
	                      &img_format, &img_desc, &img_buf, &err);

	if(!piglit_cl_check_error(err, CL_SUCCESS)) {
		fprintf(stderr, "Failed (error code: %s): Creating an image\n",
		        piglit_cl_get_error_name(err));
		return PIGLIT_FAIL;
	}

	if (!test(queue, image, pattern, origin, region,
	          0, NULL, NULL,
	          CL_SUCCESS, &result, "Enqueuing the image to be filled")) {
		return PIGLIT_FAIL;
	}

	region[0] = IMG_WIDTH;
	region[1] = IMG_HEIGHT;
	err = clEnqueueReadImage(queue, image, 1, origin, region, 0, 0,
	                         dst_buf, 0, NULL, NULL);
	if(!piglit_cl_check_error(err, CL_SUCCESS)) {
		fprintf(stderr, "Failed (error code: %s): Reading image\n",
		        piglit_cl_get_error_name(err));
		return PIGLIT_FAIL;
	}

	/*
	 * fill the host buffer with the pattern
	 * for exemple : pattern == 1234
	 *
	 * 12341234abcdabcd
	 * 12341234abcdabcd
	 * abcdabcdabcdabcd
	 * abcdabcdabcdabcd
	 */
	exp_buf[0] = pattern[0];
	exp_buf[1] = pattern[1];
	exp_buf[2] = pattern[2];
	exp_buf[3] = pattern[3];
	memcpy(exp_buf + (IMG_DATA_SIZE * 1), exp_buf, IMG_DATA_SIZE);
	memcpy(exp_buf + (IMG_DATA_SIZE * 4), exp_buf, IMG_DATA_SIZE);
	memcpy(exp_buf + (IMG_DATA_SIZE * 5), exp_buf, IMG_DATA_SIZE);

	for (i = 0; i < sizeof(dst_buf) / sizeof(dst_buf[0]); ++i) {
		if (!piglit_cl_probe_integer(dst_buf[i], exp_buf[i], 0)) {
			fprintf(stderr, "Error at %d: got %d, expected %d\n",
			        i, dst_buf[i], exp_buf[i]);
			return PIGLIT_FAIL;
		}
	}

/*** Errors ***/

	/*
	 * CL_INVALID_COMMAND_QUEUE if command_queue is not a valid command-queue.
	 */
	test(NULL, image, pattern, origin, region,
		  0, NULL, NULL,
		  CL_INVALID_COMMAND_QUEUE, &result,
		  "CL_INVALID_COMMAND_QUEUE if command_queue is not a valid command-queue");

	/*
	 * CL_INVALID_CONTEXT if the context associated with command_queue and
	 * image are not the same or if the context associated with command_queue
	 * and events in event_wait_list are not the same.
	 */
	{
		piglit_cl_context context;
		cl_int err;
		context = piglit_cl_create_context(env->platform_id,
		                                   env->context->device_ids, 1);
		if (context) {
			event = clCreateUserEvent(context->cl_ctx, &err);
			if (err == CL_SUCCESS) {
				err = clSetUserEventStatus(event, CL_COMPLETE);
				if (err == CL_SUCCESS) {
					test(context->command_queues[0], image, pattern, origin, region,
					     0, NULL, NULL,
					     CL_INVALID_CONTEXT, &result,
					     "CL_INVALID_CONTEXT if the context associated with command_queue and image are not the same");

					test(queue, image, pattern, origin, region,
					     1, &event, NULL,
					     CL_INVALID_CONTEXT, &result,
					     "CL_INVALID_CONTEXT if the context associated with command_queue and events in event_wait_list are not the same");
				} else {
					fprintf(stderr, "Could not set event status.\n");
					piglit_merge_result(&result, PIGLIT_WARN);
				}
				clReleaseEvent(event);
			} else {
				fprintf(stderr, "Could not create user event.\n");
				piglit_merge_result(&result, PIGLIT_WARN);
			}

			piglit_cl_release_context(context);
		} else {
			fprintf(stderr, "Could not test triggering CL_INVALID_CONTEXT.\n");
			piglit_merge_result(&result, PIGLIT_WARN);
		}
	}

	/*
	 * CL_INVALID_MEM_OBJECT if image is not a valid buffer object.
	 */
	test(queue, NULL, pattern, origin, region,
	     0, NULL, NULL,
	     CL_INVALID_MEM_OBJECT, &result,
	     "CL_INVALID_MEM_OBJECT if image is not a valid buffer object");

	/*
	 * CL_INVALID_VALUE if fill_color is NULL.
	 */
	test(queue, image, NULL, origin, region,
	     0, NULL, NULL,
	     CL_INVALID_VALUE, &result,
	     "CL_INVALID_VALUE if fill_color is NULL");

	/*
	 * CL_INVALID_VALUE if the region being written specified by origin and
	 * region is out of bounds or if ptr is a NULL value.
	 */
	tmp = origin[0];
	origin[0] = IMG_WIDTH + 1;
	test(queue, image, pattern, origin, region,
	     0, NULL, NULL,
	     CL_INVALID_VALUE, &result,
	     "CL_INVALID_VALUE if the region being written specified by origin and region is out of bounds (origin)");
	origin[0] = tmp;

	tmp = region[0];
	region[0] = IMG_WIDTH + 1;
	test(queue, image, pattern, origin, region,
	     0, NULL, NULL,
	     CL_INVALID_VALUE, &result,
	     "CL_INVALID_VALUE if the region being written specified by origin and region is out of bounds (region)");
	region[0] = tmp;

	test(queue, image, pattern, NULL, region,
	     0, NULL, NULL,
	     CL_INVALID_VALUE, &result,
	     "CL_INVALID_VALUE if ptr is a NULL value (origin)");

	test(queue, image, pattern, origin, NULL,
	     0, NULL, NULL,
	     CL_INVALID_VALUE, &result,
	     "CL_INVALID_VALUE if ptr is a NULL value (region)");

	/*
	 * CL_INVALID_VALUE if values in origin and region do not follow rules
	 * described in the argument description for origin and region.
	 */
	tmp = origin[2];
	origin[2] = 1;
	test(queue, image, pattern, origin, region,
	     0, NULL, NULL,
	     CL_INVALID_VALUE, &result,
	     "CL_INVALID_VALUE if values in origin do not follow rules described in the argument description for origin");
	origin[2] = tmp;

	tmp = region[2];
	region[2] = 0;
	test(queue, image, pattern, origin, region,
		  0, NULL, NULL,
		CL_INVALID_VALUE, &result,
		"CL_INVALID_VALUE if values in region do not follow rules described in the argument description for region");
	region[2] = tmp;

	/*
	 * CL_INVALID_EVENT_WAIT_LIST if event_wait_list is NULL and
	 * num_events_in_wait_list > 0, or event_wait_list is not NULL and
	 * num_events_in_wait_list is 0, or if event objects in event_wait_list
	 * are not valid events.
	 */
	event = NULL;
	test(queue, image, pattern, origin, region,
	     1, NULL, NULL,
	     CL_INVALID_EVENT_WAIT_LIST, &result,
	     "CL_INVALID_EVENT_WAIT_LIST if event_wait_list is NULL and num_events_in_wait_list > 0");

	test(queue, image, pattern, origin, region,
	     0, &event, NULL,
	     CL_INVALID_EVENT_WAIT_LIST, &result,
	     "CL_INVALID_EVENT_WAIT_LIST if event_wait_list is not NULL and num_events_in_wait_list is 0");

	test(queue, image, pattern, origin, region,
	     1, &event, NULL,
	     CL_INVALID_EVENT_WAIT_LIST, &result,
	     "CL_INVALID_EVENT_WAIT_LIST if event objects in event_wait_list are not valid events");

	/*
	 * CL_INVALID_IMAGE_SIZE if image dimensions (image width, height, specified
	 * or compute row and/or slice pitch) for image are not supported by device
	 * associated with queue.
	 */
	/* This is a per device test, clCreateImage would have failed before */

	/*
	 * CL_INVALID_IMAGE_FORMAT if image format (image channel order and data type)
	 * for image are not supported by device associated with queue.
	 */
	/* This is a per device test, clCreateImage would have failed before */

	free(image_support);
	clReleaseMemObject(image);
	return result;
#else
	return PIGLIT_SKIP;
#endif
}

예제 #22

0

파일 보기

파일: device.cpp 프로젝트: markadamo/tempest

void Device::setup(unsigned int minScan, unsigned int maxScan) {
    int err;
    this->minScan = minScan;
    this->maxScan = maxScan;
    
    // Determine Configuration
    this->candidateBufferSize = this->reduce_scores_multiple;
    this->reduce_scores_size = this->reduce_scores_multiple;

    size_t hostMem = sizeof(mObj) * Tempest::data.iNumSpectra * Tempest::params.numInternalPSMs
      + sizeof(eObj) * Tempest::data.iNumSpectra
      + sizeof(cl_mem) * Tempest::data.iNumSpectra
      + sizeof(std::vector<int>) + sizeof(int)*Tempest::data.host_iPeakBins.size()
      + sizeof(std::vector<float>) + sizeof(float)*Tempest::data.host_fPeakInts.size()
      + sizeof(int)*Tempest::data.iNumSpectra
      + sizeof(long)*Tempest::data.iNumSpectra;
    for (int candidateBufferSize=this->reduce_scores_multiple; hostMem + candidateBufferSize*Tempest::data.iNumSpectra*sizeof(cObj) < Tempest::config.maxHostMem; candidateBufferSize += this->reduce_scores_multiple) {
        for (int reduceScoresSize = 1;
             reduceScoresSize <= candidateBufferSize
                 && reduceScoresSize <= this->reduce_scores_size_max
                 && reduceScoresSize*(sizeof(int) + sizeof(float)) + this->reduce_scores_size_local <= this->lLocalMemSize;
             reduceScoresSize *= 2) {
            if (reduceScoresSize%(this->reduce_scores_multiple) == 0 && candidateBufferSize%reduceScoresSize == 0)
                if (candidateBufferSize * reduceScoresSize > this->candidateBufferSize * this->reduce_scores_size) {
                    this->candidateBufferSize = candidateBufferSize;
                    this->reduce_scores_size = reduceScoresSize;
                }	    
        }
    }
    if (Tempest::config.profile) {
        printf("cl_build: local_work_size=%ld\n", this->build_size);
        printf("cl_transform: local_work_size=%ld\n", this->transform_size);
        printf("cl_score: local_work_size=%ld\n", this->score_size);
        printf("candidate buffer size=%ld\n", this->candidateBufferSize);
        printf("cl_reduce_scores: local_work_size=%ld\n", this->reduce_scores_size);
    }

    for (int i=minScan+deviceInd; i<maxScan; i+=Tempest::config.iDevices.size()) {
        eObj* e = Tempest::data.eScans[i];
        e->candidateBuffer = (cObj*)malloc(this->candidateBufferSize * sizeof(cObj));
        e->candidateBufferSize = this->candidateBufferSize;
        e->clEventSent = clCreateUserEvent(clContext, NULL);
        clSetUserEventStatus(e->clEventSent, 0);
        e->device = this;
    }

    // peaks
    size_t size_iPeakBins = Tempest::data.lNumMS2Peaks * sizeof(cl_int);
    size_t size_fPeakInts = Tempest::data.lNumMS2Peaks * sizeof(cl_float);
    cl_iPeakBins = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, size_iPeakBins, &(Tempest::data.host_iPeakBins[0]), &err);
    Tempest::check_cl_error(__FILE__, __LINE__, err, "Unable to allocate device memory for peak bins.");
    cl_fPeakInts = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, size_fPeakInts, &(Tempest::data.host_fPeakInts[0]), &err);
    Tempest::check_cl_error(__FILE__, __LINE__, err, "Unable to allocate device memory for peak intensities.");
    
    // cleanup host
    //std::vector<int>().swap(Tempest::data.host_iPeakBins);
    //std::vector<float>().swap(Tempest::data.host_fPeakInts);

    //cudaMalloc((void**) &cl_fSpectra, Tempest::data.iNumMS2Bins * sizeof(float));
    //cl_fSpectra = clCreateBuffer(clContext, CL_MEM_READ_WRITE, Tempest::data.iNumMS2Bins * sizeof(float), NULL, &err);
    float * init_fSpectra = (float *) calloc(Tempest::data.iNumMS2Bins, sizeof(float));
    size_t size_init_fSpectra = Tempest::data.iNumMS2Bins * sizeof(cl_float);
    cl_init_fSpectra = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, size_init_fSpectra, init_fSpectra, &err);
    free(init_fSpectra);
    
    // candidate and results
    mObj * init_mPSMs = (mObj *) calloc(Tempest::data.iNumSpectra * Tempest::params.numInternalPSMs, sizeof(mObj));
    // for (int i=0; i<Tempest::data.iNumSpectra * Tempest::params.numInternalPSMs; i++)
    //     init_mPSMs[i].fScore = MIN_SCORE;
    //float * init_fNextScores = (float *) calloc(Tempest::data.iNumSpectra, sizeof(float));
    size_t size_cCandidates = sizeof(cObj) * this->candidateBufferSize;
    size_t size_fScores = sizeof(cl_float)  * this->candidateBufferSize;
    size_t size_mPSMs = sizeof(mObj)  * Tempest::data.iNumSpectra * Tempest::params.numInternalPSMs;
    //size_t size_fNextScores = sizeof(float) * Tempest::data.iNumSpectra;
    cl_cCandidates = clCreateBuffer(clContext, CL_MEM_READ_ONLY, size_cCandidates, NULL, &err);
    cl_fScores = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size_fScores, NULL, &err);  
    cl_mPSMs = clCreateBuffer(clContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size_mPSMs , init_mPSMs, &err);
    //cl_fNextScores = clCreateBuffer(clContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size_fNextScores, init_fNextScores, &err);
    //MEA: need to block free until previous clCreateBuffer commands complete?
    free(init_mPSMs);
    //free(init_fNextScores);
    Tempest::check_cl_error(__FILE__, __LINE__, err, "Unable to allocate device memory for candidates and results.");

    //determine how many spectra can be kept in device memory at a time
    size_t availMemSpectra = lGlobalMemSize
        - size_iPeakBins
        - size_fPeakInts
        - size_init_fSpectra
        - size_cCandidates
        - size_fScores
        - size_mPSMs;
    if (availMemSpectra > Tempest::config.maxDeviceMem)
        availMemSpectra = Tempest::config.maxDeviceMem;
    long maxCachedSpectra = availMemSpectra / (Tempest::data.iNumMS2Bins*sizeof(cl_float));
    if (maxCachedSpectra > (long)ceil(float(Tempest::data.iNumSpectra)/Tempest::devices.size()))
        maxCachedSpectra = (long)ceil(float(Tempest::data.iNumSpectra)/Tempest::devices.size());
    if (maxCachedSpectra <= 0)
        maxCachedSpectra = 1;
    
    printf(" » (%d:%d) Allocating %.2f MB of device memory for %ld cached %s.\n", platformID, deviceID, (float)maxCachedSpectra*Tempest::data.iNumMS2Bins*sizeof(cl_float)/MB, maxCachedSpectra, maxCachedSpectra==1 ? "spectrum" : "spectra");
    for (int i=0; i<maxCachedSpectra; i++) {
        cl_mem newBuffer = clCreateBuffer(clContext, CL_MEM_READ_WRITE, Tempest::data.iNumMS2Bins*sizeof(cl_float), NULL, &err);
        Tempest::check_cl_error(__FILE__, __LINE__, err, "Unable to allocate spectrum memory on device.");
        unusedBuffers.push(newBuffer);
    }
    
    setup_constant_memory();
    
    //initialize profiling variables
    scoreEvent = clCreateUserEvent(clContext, NULL);
    reduceEvent = clCreateUserEvent(clContext, NULL);
    buildEvent = clCreateUserEvent(clContext, NULL);
    memsetEvent = clCreateUserEvent(clContext, NULL);
    transformEvent = clCreateUserEvent(clContext, NULL);
    totalScoreTime = 0;
    totalReduceTime = 0;
    totalBuildTime = 0;
    totalTransformTime = 0;
    totalMemsetTime = 0;
    totalSendTime = 0;
    buildLaunches = 0;
    scoreKernelLaunches = 0;
    lastBuildIndex = -1;

}

예제 #23

0

파일 보기

파일: mat_mul_opencl_binary.c 프로젝트: Sangil-Lee/RefCode

void mat_mul_opencl_binary(float *M_A, float *M_B, float *M_C,
                       size_t ROW_A, size_t COL_A, size_t COL_B) {
  cl_platform_id   *platform;
  cl_device_type   dev_type;
  cl_device_id     dev;
  cl_context       context;
  cl_command_queue cmd_queue;
  cl_program       program;
  cl_kernel        kernel;
  cl_mem           mem_A, mem_B, mem_C;
  cl_event         ev_kernel, ev_bp;
  cl_int           err;
  cl_uint          num_platforms;
  cl_uint          num_dev = 0;
  int i;

  // Platform
  err = clGetPlatformIDs(0, NULL, &num_platforms);
  CHECK_ERROR(err);
  if (num_platforms == 0) {
    fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__);
    exit(EXIT_FAILURE);
  }
  printf("Number of platforms: %u\n", num_platforms);
  platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  err = clGetPlatformIDs(num_platforms, platform, NULL);
  CHECK_ERROR(err);

  // Device
  dev_type = get_device_type();
  for (i = 0; i < num_platforms; i++) {
    err = clGetDeviceIDs(platform[i], dev_type, 1, &dev, &num_dev);
    if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err);
    if (num_dev == 1) break;
  }
  if (num_dev < 1) {
    fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__);
    exit(EXIT_FAILURE);
  }
  print_device_name(dev);
  free(platform);

  // Context
  context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err);
  CHECK_ERROR(err);

  // Create a program.
  char *source_code = get_source_code("./kernel_2d.cl");
  program = clCreateProgramWithSource(context,
                                      1, (const char **)&source_code,
                                      NULL, &err);
  free(source_code);
  CHECK_ERROR(err);

  // Callback data for clBuildProgram
  ev_bp = clCreateUserEvent(context, &err);
  CHECK_ERROR(err);
  bp_data_t bp_data;
  bp_data.dev    = dev;
  bp_data.event  = &ev_bp;

  // Build the program.
  char build_opts[200];
  sprintf(build_opts, "-DROW_A=%lu -DCOL_A=%lu -DCOL_B=%lu",
          ROW_A, COL_A, COL_B);
  err = clBuildProgram(program, 1, &dev, build_opts,
                       build_program_callback, &bp_data);
  CHECK_ERROR(err);

  // Command queue
  cmd_queue = clCreateCommandQueue(context, dev,
                                   CL_QUEUE_PROFILING_ENABLE,
                                   &err);
  CHECK_ERROR(err);

  // Buffers
  mem_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                         sizeof(float) * ROW_A * COL_A,
                         M_A, &err);
  CHECK_ERROR(err);

  mem_B = clCreateBuffer(context, CL_MEM_READ_ONLY, 
                         sizeof(float) * COL_A * COL_B,
                         NULL, &err);
  CHECK_ERROR(err);
  err = clEnqueueWriteBuffer(cmd_queue,
                             mem_B,
                             CL_FALSE, 0,
                             sizeof(float) * COL_A * COL_B,
                             M_B,
                             0, NULL, NULL);
  CHECK_ERROR(err)

  mem_C = clCreateBuffer(context, CL_MEM_READ_WRITE, 
                         sizeof(float) * ROW_A * COL_B,
                         NULL, &err);
  CHECK_ERROR(err);

  // Index space (gws) and work-group size (lws)
  size_t lws[2] = {16, 16};
  size_t gws[2];
  gws[1] = (size_t)ceil((double)ROW_A / lws[1]) * lws[1];
  gws[0] = (size_t)ceil((double)COL_B / lws[0]) * lws[0];

  // Wait for the kernel creation.
  clWaitForEvents(1, bp_data.event);

  // Kernel
  kernel = clCreateKernel(program, "mat_mul", &err);
  CHECK_ERROR(err);

  // Set the arguments.
  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_A);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_B);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_C);
  CHECK_ERROR(err);

  // Enqueue the kernel.
  err = clEnqueueNDRangeKernel(cmd_queue,
                               kernel,
                               2, NULL,
                               gws, lws,
                               0, NULL,
                               &ev_kernel);
  CHECK_ERROR(err);

  // Read the result.
  err = clEnqueueReadBuffer(cmd_queue,
                            mem_C,
                            CL_TRUE, 0,
                            sizeof(float) * ROW_A * COL_B, 
                            M_C,
                            0, NULL, NULL);
  CHECK_ERROR(err);

  // Read the profiling info.
  cl_ulong start_time, end_time;
  err = clGetEventProfilingInfo(ev_kernel, CL_PROFILING_COMMAND_START, 
                                sizeof(cl_ulong), &start_time, NULL);
  CHECK_ERROR(err);
  err = clGetEventProfilingInfo(ev_kernel, CL_PROFILING_COMMAND_END, 
                                sizeof(cl_ulong), &end_time, NULL);
  CHECK_ERROR(err);
  printf("Kernel time : %lf sec\n", (double)(end_time - start_time) / 10e9);

  // Release
  clReleaseEvent(ev_kernel);
  clReleaseEvent(ev_bp);
  clReleaseMemObject(mem_A);
  clReleaseMemObject(mem_B);
  clReleaseMemObject(mem_C);
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(cmd_queue);
  clReleaseContext(context);
}