Пример #1
0
int acc_event_create (void** event_p){
  // debug info
  if (verbose_print){
    fprintf(stdout, "\n ... EVENT CREATION ... \n");
    fprintf(stdout, " ---> Entering: acc_event_create.\n");
  }

  // local event object pointer
  *event_p = malloc(sizeof(cl_event));
  cl_event *clevent = (cl_event *) *event_p;

  // get a device event object
  *clevent = clCreateUserEvent((*acc_opencl_my_device).ctx, &cl_error);
  if (acc_opencl_error_check(cl_error, __LINE__))
    return -1;

  // debug info
  if (verbose_print){
    fprintf(stdout, " ---> Leaving: acc_event_create.\n");
  }

  cl_error = clSetUserEventStatus(*clevent, CL_COMPLETE);

  // assign return value
  return 0;
}
Пример #2
0
     operator cl_event() const
     {
         cl_int err = CL_SUCCESS;
         if(e_.size() == 1)
         {
             return e_[0];
         }
         if(!e_.empty())
         {
             cl_context ctx = get_info<command_queue::context_info_type>(command_queue());
             cl_event e = clCreateUserEvent(ctx, &err);
             OCLM_THROW_IF_EXCEPTION(err, "clCreateUserEvents");
 #ifdef CL_VERSION_1_2
             err = clEnqueueMarkerWithWaitList(command_queue(),
                 static_cast<cl_uint>(e_.size()), &e_[0], &e);
             OCLM_THROW_IF_EXCEPTION(err, "clEnqueueMarkerWithWaitList");
 #else
             get();
             err = clSetUserEventStatus(e, CL_COMPLETE);
 #endif
             return e;
         }
         else
             return cl_event();
     }
void WebCLUserEvent::setStatus(cl_int executionStatus, ExceptionState& es)
{
    ASSERT(isUserEvent());
    if (isReleased()) {
        es.throwWebCLException(WebCLException::INVALID_EVENT, WebCLException::invalidEventMessage);
        return;
    }

    if (!(executionStatus < 0 || executionStatus == CL_COMPLETE)) {
        es.throwWebCLException(WebCLException::INVALID_VALUE, WebCLException::invalidValueMessage);
        return;
    }

    if (m_eventStatusSituation == StatusSet) {
        es.throwWebCLException(WebCLException::INVALID_OPERATION, WebCLException::invalidOperationMessage);
        return;
    }

    m_eventStatusSituation = StatusSet;
    m_executionStatus = executionStatus;

    cl_int err = clSetUserEventStatus(m_clEvent, executionStatus);
    if (err != CL_SUCCESS)
        WebCLException::throwException(err, es);
}
cl_int mwFinishEvent(cl_event ev)
{
    cl_int err;

    err = clSetUserEventStatus(ev, CL_COMPLETE);
    if (err != CL_SUCCESS)
        mwPerrorCL(err, "Failed to mark custom event as completed");

    return err;
}
Пример #5
0
int main()
{
  cl_int err;
  cl_event user_evt = NULL;
  int i;

  // An user event can be set to either complete or a negative value, indicating error;
  // additionally, no objects involved in a command that waits on the user event should
  // be released before the event status is set; however, it should be possible to release
  // everything even if the status is set to something which is NOT CL_COMPLETE. So
  // try both CL_COMPLETE and a negative value
  cl_int status[] = {CL_INVALID_EVENT, CL_COMPLETE };

  // We also query for profiling info of the event, which according to the standard
  // should return CL_PROFILING_INFO_NOT_AVAILABLE
  cl_ulong queued, submitted, started, endtime;

  for (i = 0; i < ARRAY_SIZE(status); ++i) {
	  cl_context context;
	  cl_command_queue queue;
	  cl_device_id device;

	  CHECK_CL_ERROR(poclu_get_any_device(&context, &device, &queue));
	  TEST_ASSERT( context );
	  TEST_ASSERT( device );
	  TEST_ASSERT( queue );

	  user_evt = clCreateUserEvent(context, &err);
	  CHECK_OPENCL_ERROR_IN("clCreateUserEvent");
	  TEST_ASSERT( user_evt );

	  CHECK_CL_ERROR(clSetUserEventStatus(user_evt, status[i]));

	  err = clGetEventProfilingInfo(user_evt, CL_PROFILING_COMMAND_QUEUED,
		  sizeof(queued), &queued, NULL);
	  TEST_ASSERT(err == CL_PROFILING_INFO_NOT_AVAILABLE);
	  err = clGetEventProfilingInfo(user_evt, CL_PROFILING_COMMAND_SUBMIT,
		  sizeof(submitted), &submitted, NULL);
	  TEST_ASSERT(err == CL_PROFILING_INFO_NOT_AVAILABLE);
	  err = clGetEventProfilingInfo(user_evt, CL_PROFILING_COMMAND_START,
		  sizeof(started), &started, NULL);
	  TEST_ASSERT(err == CL_PROFILING_INFO_NOT_AVAILABLE);
	  err = clGetEventProfilingInfo(user_evt, CL_PROFILING_COMMAND_END,
		  sizeof(endtime), &endtime, NULL);
	  TEST_ASSERT(err == CL_PROFILING_INFO_NOT_AVAILABLE);

	  CHECK_CL_ERROR(clReleaseEvent(user_evt));
	  CHECK_CL_ERROR(clReleaseCommandQueue(queue));
	  CHECK_CL_ERROR(clReleaseContext(context));
  }

  return EXIT_SUCCESS;

}
Пример #6
0
	void Event::setStatus(CommandExecutionStatus status) {
		static const auto error_map = error::ErrorMap{
			{ErrorCode::invalid_event, "the given user event was invalid."},
			{ErrorCode::invalid_value, "the given execution status is invalid."},
			{ErrorCode::invalid_operation, "the execution status has already been set for this event."}
		};
		error::handle<EventException>(
			clSetUserEventStatus(
				m_id, static_cast<std::underlying_type<CommandExecutionStatus>::type>(status)
			),
			error_map
		);
	}
Пример #7
0
    void UserEvent::setStatus(EEventStatus status)
    {
#if defined(HAVE_OPENCL_1_1)
        cl_int error = CL_SUCCESS;
        if(!_id || ((error = 
                clSetUserEventStatus(_id, cl_int(status))) != CL_SUCCESS))
        {
            detail::reportError("UserEvent::setStatus(): ", error);
        }
#else
        (void)status;
#endif
    }
Пример #8
0
cl_int EventWrapper::setUserEventStatus (cl_int aExecutionStatus) {
#if CL_WRAPPER_CL_VERSION_SUPPORT >= 110
    D_METHOD_START;
    cl_int err = clSetUserEventStatus (mWrapped, aExecutionStatus);
    if (err != CL_SUCCESS) {
        D_LOG (LOG_LEVEL_ERROR, " clSetUserEventStatus failed. (error %d)", err);
    }
    return err;
#else // CL_WRAPPER_CL_VERSION_SUPPORT >= 110
    (void)aExecutionStatus;
    D_LOG (LOG_LEVEL_ERROR, "CLWrapper support for OpenCL 1.1 API was not enabled at build time.");
    return CL_INVALID_VALUE;
#endif
}
Пример #9
0
void CL::UserEvent::end()
{
    cl_int status;
    
    if (!_active) return;

    status = clSetUserEventStatus(_event, CL_COMPLETE);
    OPENCL_ASSERT(status);

    _device.end_user_event(_id);
    
    _event = 0;
    _id = -1;
    _active = false;
}
Пример #10
0
static void register_event(hpx::opencl::device cldevice,
                           const hpx::naming::id_type & event_id)
{

    boost::shared_ptr<hpx::opencl::server::device>
    parent_device = hpx::get_ptr<hpx::opencl::server::device>
                        (cldevice.get_gid()).get();

    // create a fake event
    cl_int err;
    cl_event event_cl = clCreateUserEvent (
            parent_device->get_context(),
            &err);
    cl_ensure(err, "clEnqueueWriteBuffer()");
    err = clSetUserEventStatus(event_cl, CL_COMPLETE);
    cl_ensure(err, "clSetUserEventStatus()");

    parent_device->register_event(event_id, event_cl);
}
Пример #11
0
static void build_program_callback(cl_program program, void *user_data) {
  cl_int err;
  cl_build_status build_status;
  bp_data_t *bp_data = (bp_data_t *)user_data;

  // Check the build status.
  err = clGetProgramBuildInfo(program, bp_data->dev,
                              CL_PROGRAM_BUILD_STATUS,
                              sizeof(cl_build_status),
                              &build_status, NULL);
  CHECK_ERROR(err);
  if (build_status != CL_BUILD_SUCCESS) {
    print_build_log(program, bp_data->dev);
    exit(EXIT_FAILURE);
  }

  // Set the event status
  err = clSetUserEventStatus(*(bp_data->event), CL_COMPLETE);
  CHECK_ERROR(err);
}
Пример #12
0
void Device::setup(unsigned int minScan, unsigned int maxScan) {
    int err;
    this->minScan = minScan;
    this->maxScan = maxScan;
    
    // Determine Configuration
    this->candidateBufferSize = this->reduce_scores_multiple;
    this->reduce_scores_size = this->reduce_scores_multiple;

    size_t hostMem = sizeof(mObj) * Tempest::data.iNumSpectra * Tempest::params.numInternalPSMs
      + sizeof(eObj) * Tempest::data.iNumSpectra
      + sizeof(cl_mem) * Tempest::data.iNumSpectra
      + sizeof(std::vector<int>) + sizeof(int)*Tempest::data.host_iPeakBins.size()
      + sizeof(std::vector<float>) + sizeof(float)*Tempest::data.host_fPeakInts.size()
      + sizeof(int)*Tempest::data.iNumSpectra
      + sizeof(long)*Tempest::data.iNumSpectra;
    for (int candidateBufferSize=this->reduce_scores_multiple; hostMem + candidateBufferSize*Tempest::data.iNumSpectra*sizeof(cObj) < Tempest::config.maxHostMem; candidateBufferSize += this->reduce_scores_multiple) {
        for (int reduceScoresSize = 1;
             reduceScoresSize <= candidateBufferSize
                 && reduceScoresSize <= this->reduce_scores_size_max
                 && reduceScoresSize*(sizeof(int) + sizeof(float)) + this->reduce_scores_size_local <= this->lLocalMemSize;
             reduceScoresSize *= 2) {
            if (reduceScoresSize%(this->reduce_scores_multiple) == 0 && candidateBufferSize%reduceScoresSize == 0)
                if (candidateBufferSize * reduceScoresSize > this->candidateBufferSize * this->reduce_scores_size) {
                    this->candidateBufferSize = candidateBufferSize;
                    this->reduce_scores_size = reduceScoresSize;
                }	    
        }
    }
    if (Tempest::config.profile) {
        printf("cl_build: local_work_size=%ld\n", this->build_size);
        printf("cl_transform: local_work_size=%ld\n", this->transform_size);
        printf("cl_score: local_work_size=%ld\n", this->score_size);
        printf("candidate buffer size=%ld\n", this->candidateBufferSize);
        printf("cl_reduce_scores: local_work_size=%ld\n", this->reduce_scores_size);
    }

    for (int i=minScan+deviceInd; i<maxScan; i+=Tempest::config.iDevices.size()) {
        eObj* e = Tempest::data.eScans[i];
        e->candidateBuffer = (cObj*)malloc(this->candidateBufferSize * sizeof(cObj));
        e->candidateBufferSize = this->candidateBufferSize;
        e->clEventSent = clCreateUserEvent(clContext, NULL);
        clSetUserEventStatus(e->clEventSent, 0);
        e->device = this;
    }

    // peaks
    size_t size_iPeakBins = Tempest::data.lNumMS2Peaks * sizeof(cl_int);
    size_t size_fPeakInts = Tempest::data.lNumMS2Peaks * sizeof(cl_float);
    cl_iPeakBins = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, size_iPeakBins, &(Tempest::data.host_iPeakBins[0]), &err);
    Tempest::check_cl_error(__FILE__, __LINE__, err, "Unable to allocate device memory for peak bins.");
    cl_fPeakInts = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, size_fPeakInts, &(Tempest::data.host_fPeakInts[0]), &err);
    Tempest::check_cl_error(__FILE__, __LINE__, err, "Unable to allocate device memory for peak intensities.");
    
    // cleanup host
    //std::vector<int>().swap(Tempest::data.host_iPeakBins);
    //std::vector<float>().swap(Tempest::data.host_fPeakInts);

    //cudaMalloc((void**) &cl_fSpectra, Tempest::data.iNumMS2Bins * sizeof(float));
    //cl_fSpectra = clCreateBuffer(clContext, CL_MEM_READ_WRITE, Tempest::data.iNumMS2Bins * sizeof(float), NULL, &err);
    float * init_fSpectra = (float *) calloc(Tempest::data.iNumMS2Bins, sizeof(float));
    size_t size_init_fSpectra = Tempest::data.iNumMS2Bins * sizeof(cl_float);
    cl_init_fSpectra = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, size_init_fSpectra, init_fSpectra, &err);
    free(init_fSpectra);
    
    // candidate and results
    mObj * init_mPSMs = (mObj *) calloc(Tempest::data.iNumSpectra * Tempest::params.numInternalPSMs, sizeof(mObj));
    // for (int i=0; i<Tempest::data.iNumSpectra * Tempest::params.numInternalPSMs; i++)
    //     init_mPSMs[i].fScore = MIN_SCORE;
    //float * init_fNextScores = (float *) calloc(Tempest::data.iNumSpectra, sizeof(float));
    size_t size_cCandidates = sizeof(cObj) * this->candidateBufferSize;
    size_t size_fScores = sizeof(cl_float)  * this->candidateBufferSize;
    size_t size_mPSMs = sizeof(mObj)  * Tempest::data.iNumSpectra * Tempest::params.numInternalPSMs;
    //size_t size_fNextScores = sizeof(float) * Tempest::data.iNumSpectra;
    cl_cCandidates = clCreateBuffer(clContext, CL_MEM_READ_ONLY, size_cCandidates, NULL, &err);
    cl_fScores = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size_fScores, NULL, &err);  
    cl_mPSMs = clCreateBuffer(clContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size_mPSMs , init_mPSMs, &err);
    //cl_fNextScores = clCreateBuffer(clContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size_fNextScores, init_fNextScores, &err);
    //MEA: need to block free until previous clCreateBuffer commands complete?
    free(init_mPSMs);
    //free(init_fNextScores);
    Tempest::check_cl_error(__FILE__, __LINE__, err, "Unable to allocate device memory for candidates and results.");

    //determine how many spectra can be kept in device memory at a time
    size_t availMemSpectra = lGlobalMemSize
        - size_iPeakBins
        - size_fPeakInts
        - size_init_fSpectra
        - size_cCandidates
        - size_fScores
        - size_mPSMs;
    if (availMemSpectra > Tempest::config.maxDeviceMem)
        availMemSpectra = Tempest::config.maxDeviceMem;
    long maxCachedSpectra = availMemSpectra / (Tempest::data.iNumMS2Bins*sizeof(cl_float));
    if (maxCachedSpectra > (long)ceil(float(Tempest::data.iNumSpectra)/Tempest::devices.size()))
        maxCachedSpectra = (long)ceil(float(Tempest::data.iNumSpectra)/Tempest::devices.size());
    if (maxCachedSpectra <= 0)
        maxCachedSpectra = 1;
    
    printf(" » (%d:%d) Allocating %.2f MB of device memory for %ld cached %s.\n", platformID, deviceID, (float)maxCachedSpectra*Tempest::data.iNumMS2Bins*sizeof(cl_float)/MB, maxCachedSpectra, maxCachedSpectra==1 ? "spectrum" : "spectra");
    for (int i=0; i<maxCachedSpectra; i++) {
        cl_mem newBuffer = clCreateBuffer(clContext, CL_MEM_READ_WRITE, Tempest::data.iNumMS2Bins*sizeof(cl_float), NULL, &err);
        Tempest::check_cl_error(__FILE__, __LINE__, err, "Unable to allocate spectrum memory on device.");
        unusedBuffers.push(newBuffer);
    }
    
    setup_constant_memory();
    
    //initialize profiling variables
    scoreEvent = clCreateUserEvent(clContext, NULL);
    reduceEvent = clCreateUserEvent(clContext, NULL);
    buildEvent = clCreateUserEvent(clContext, NULL);
    memsetEvent = clCreateUserEvent(clContext, NULL);
    transformEvent = clCreateUserEvent(clContext, NULL);
    totalScoreTime = 0;
    totalReduceTime = 0;
    totalBuildTime = 0;
    totalTransformTime = 0;
    totalMemsetTime = 0;
    totalSendTime = 0;
    buildLaunches = 0;
    scoreKernelLaunches = 0;
    lastBuildIndex = -1;

}
Пример #13
0
/*! \brief Forces commands within a command Queue associated with this Event to wait until
  * the commmands to be completed.
  */
void ocl::Event::waitUntilCompleted() const
{
	OPENCL_SAFE_CALL( clSetUserEventStatus (this->id(), CL_COMPLETE) );
}
Пример #14
0
END_TEST

START_TEST (test_misc_events)
{
    cl_platform_id platform = 0;
    cl_device_id device;
    cl_context ctx;
    cl_command_queue queue;
    cl_int result;
    cl_event uevent1, uevent2, marker1, marker2;

    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
    fail_if(
        result != CL_SUCCESS,
        "unable to get the default device"
    );

    ctx = clCreateContext(0, 1, &device, 0, 0, &result);
    fail_if(
        result != CL_SUCCESS || ctx == 0,
        "unable to create a valid context"
    );

    queue = clCreateCommandQueue(ctx, device, 0, &result);
    fail_if(
        result != CL_SUCCESS || queue == 0,
        "cannot create a command queue"
    );

    /*
     * This test will build a command queue blocked by an user event. The events
     * will be in this order :
     *
     * -: UserEvent1
     * 0: WaitForEvents1 (wait=UserEvent1)
     * 1: Marker1
     * -: UserEvent2
     * 2: WaitForEvents2 (wait=UserEvent2)
     * 3: Barrier
     * 4: Marker2 (to check the barrier worked)
     *
     * When the command queue is built, we :
     *  - Check that Marker1 is Queued (WaitForEvents waits)
     *  - Set UserEvent1 to Complete
     *  - Check that Marker1 is Complete (WaitForEvents stopped to wait)
     *  - Check that Marker2 is Queued (Barrier is there)
     *  - Set UserEvent2 to Complete
     *  - Check that Marker2 is Complete (no more barrier)
     */
    uevent1 = clCreateUserEvent(ctx, &result);
    fail_if(
        result != CL_SUCCESS,
        "unable to create UserEvent1"
    );

    uevent2 = clCreateUserEvent(ctx, &result);
    fail_if(
        result != CL_SUCCESS,
        "unable to create UserEvent2"
    );

    result = clEnqueueWaitForEvents(queue, 1, &uevent1);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue WaitForEvents(UserEvent1)"
    );

    result = clEnqueueMarker(queue, &marker1);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue Marker1"
    );

    result = clEnqueueWaitForEvents(queue, 1, &uevent2);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue WaitForEvents(UserEvent2)"
    );

    result = clEnqueueBarrier(queue);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue Barrier"
    );

    result = clEnqueueMarker(queue, &marker2);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue Marker2"
    );

    // Now the checks
    cl_int status;

    result = clGetEventInfo(marker1, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &status, 0);
    fail_if(
        result != CL_SUCCESS || status != CL_QUEUED,
        "Marker1 must be Queued"
    );

    result = clSetUserEventStatus(uevent1, CL_COMPLETE);
    fail_if(
        result != CL_SUCCESS,
        "unable to set UserEvent1 to Complete"
    );

    result = clGetEventInfo(marker1, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &status, 0);
    fail_if(
        result != CL_SUCCESS || status != CL_COMPLETE,
        "Marker1 must be Complete"
    );

    result = clGetEventInfo(marker2, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &status, 0);
    fail_if(
        result != CL_SUCCESS || status != CL_QUEUED,
        "Marker2 must be Queued"
    );

    result = clSetUserEventStatus(uevent2, CL_COMPLETE);
    fail_if(
        result != CL_SUCCESS,
        "unable to set UserEvent2 to Complete"
    );

    result = clGetEventInfo(marker2, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &status, 0);
    fail_if(
        result != CL_SUCCESS || status != CL_COMPLETE,
        "Marker2 must be Complete"
    );

    clFinish(queue);

    clReleaseEvent(uevent1);
    clReleaseEvent(uevent2);
    clReleaseEvent(marker1);
    clReleaseEvent(marker2);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
}
Пример #15
0
int main()
{
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_mem objA = NULL;
    cl_mem objB = NULL;
    cl_mem objC = NULL;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret;
 
    cl_event event1;
 
    int i, j;
    float *A;
    float *B;
    float *C;
 
    A = (float *)malloc(4*4*sizeof(float));
    B = (float *)malloc(4*4*sizeof(float));
    C = (float *)malloc(4*4*sizeof(float));
 
    /* Initialize input data */
    for (i=0; i<4; i++) {
        for (j=0; j<4; j++) {
            A[i*4+j] = i*4+j+1;
            B[i*4+j] = j*4+i+1;
        }
    }
 
    /* Get Platform/Device Information*/
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
 
    /* Create OpenCL Context */
    context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
 
    /* Create command queue */
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
 
    /* Create Buffer Object */
    objA = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
    objB = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
    objC = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
 
    /*
     * Creating an user event
     * As a user event is created, its execution status is set to be CL_SUBMITTED
     * and we tag the event to a callback so when event reaches CL_COMPLETE, it will 
     * execute postProcess
     */ 
    event1 = clCreateUserEvent(context, &ret);
    clSetEventCallback(event1, CL_COMPLETE, &postProcess, "Looks like its done.");

    /* Copy input data to the memory buffer */
 
    ret = clEnqueueWriteBuffer(command_queue, objA, CL_TRUE, 0, 4*4*sizeof(float), A, 0, NULL, NULL );
    printf("A has been written\n");
 
    /* The next command will wait for event1 according to its status*/
    ret = clEnqueueWriteBuffer(command_queue, objB, CL_TRUE, 0, 4*4*sizeof(float), B, 1, &event1, NULL);
    printf("B has been written\n");

    /* Tell event1 to complete */
    clSetUserEventStatus(event1, CL_COMPLETE);
	const char *file_names[] = {"sample_kernel.cl"}; 
	const int NUMBER_OF_FILES = 1;
	char* buffer[NUMBER_OF_FILES];
	size_t sizes[NUMBER_OF_FILES];
	loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes);
	
    /* Create kernel program from source file*/
    program = clCreateProgramWithSource(context, 1, (const char **)buffer, sizes, &ret);
    ret     = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
 
    /* Create data parallel OpenCL kernel */
    kernel = clCreateKernel(program, "sample", &ret);
 
    /* Set OpenCL kernel arguments */
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&objA);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&objB);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&objC);
 
    size_t global_item_size = 4;
    size_t local_item_size = 1;
 
    /* Execute OpenCL kernel as data parallel */
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
                                 &global_item_size, &local_item_size, 0, NULL, NULL);
 
    /* Transfer result to host */
    ret = clEnqueueReadBuffer(command_queue, objC, CL_TRUE, 0, 4*4*sizeof(float), C, 0, NULL, NULL);
 
    /* Display Results */
    for (i=0; i<4; i++) {
        for (j=0; j<4; j++) {
            printf("%7.2f ", C[i*4+j]);
        }
        printf("\n");
    }
 
 
    /* Finalization */
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(objA);
    ret = clReleaseMemObject(objB);
    ret = clReleaseMemObject(objC);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
 
    free(A);
    free(B);
    free(C);
 
    return 0;
}
Пример #16
0
enum piglit_result
piglit_cl_test(const int argc,
               const char **argv,
               const struct piglit_cl_api_test_config* config,
               const struct piglit_cl_api_test_env* env)
{
#if defined(CL_VERSION_1_2)
	enum piglit_result result = PIGLIT_PASS;
	cl_int err;

#define IMG_WIDTH 4
#define IMG_HEIGHT 4
#define IMG_DATA_SIZE 4
#define IMG_BUFFER_SIZE IMG_WIDTH * IMG_HEIGHT * IMG_DATA_SIZE

	unsigned char img_buf[IMG_BUFFER_SIZE] = {0};
	unsigned char dst_buf[IMG_BUFFER_SIZE] = {0};
	unsigned char exp_buf[IMG_BUFFER_SIZE] = {0};
	int pattern[4] = {129, 33, 77, 255};
	size_t origin[3] = {0, 0, 0};
	size_t region[3] = {2, 2, 1};
	size_t tmp;
	cl_event event;
	cl_mem image;
	cl_image_format img_format;
	cl_image_desc img_desc = {0};
	cl_command_queue queue = env->context->command_queues[0];
	int i;

	cl_bool *image_support =
		piglit_cl_get_device_info(env->context->device_ids[0],
		                          CL_DEVICE_IMAGE_SUPPORT);

	if (!*image_support) {
		fprintf(stderr, "No image support\n");
		free(image_support);
		return PIGLIT_SKIP;
	}

	img_format.image_channel_order = CL_RGBA;
	img_format.image_channel_data_type = CL_UNSIGNED_INT8;
	img_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
	img_desc.image_width = IMG_WIDTH;
	img_desc.image_height = IMG_HEIGHT;
	img_desc.buffer = NULL;

/*** Normal usage ***/
	image = clCreateImage(env->context->cl_ctx,
	                      CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
	                      &img_format, &img_desc, &img_buf, &err);

	if(!piglit_cl_check_error(err, CL_SUCCESS)) {
		fprintf(stderr, "Failed (error code: %s): Creating an image\n",
		        piglit_cl_get_error_name(err));
		return PIGLIT_FAIL;
	}

	if (!test(queue, image, pattern, origin, region,
	          0, NULL, NULL,
	          CL_SUCCESS, &result, "Enqueuing the image to be filled")) {
		return PIGLIT_FAIL;
	}

	region[0] = IMG_WIDTH;
	region[1] = IMG_HEIGHT;
	err = clEnqueueReadImage(queue, image, 1, origin, region, 0, 0,
	                         dst_buf, 0, NULL, NULL);
	if(!piglit_cl_check_error(err, CL_SUCCESS)) {
		fprintf(stderr, "Failed (error code: %s): Reading image\n",
		        piglit_cl_get_error_name(err));
		return PIGLIT_FAIL;
	}

	/*
	 * fill the host buffer with the pattern
	 * for exemple : pattern == 1234
	 *
	 * 12341234abcdabcd
	 * 12341234abcdabcd
	 * abcdabcdabcdabcd
	 * abcdabcdabcdabcd
	 */
	exp_buf[0] = pattern[0];
	exp_buf[1] = pattern[1];
	exp_buf[2] = pattern[2];
	exp_buf[3] = pattern[3];
	memcpy(exp_buf + (IMG_DATA_SIZE * 1), exp_buf, IMG_DATA_SIZE);
	memcpy(exp_buf + (IMG_DATA_SIZE * 4), exp_buf, IMG_DATA_SIZE);
	memcpy(exp_buf + (IMG_DATA_SIZE * 5), exp_buf, IMG_DATA_SIZE);

	for (i = 0; i < sizeof(dst_buf) / sizeof(dst_buf[0]); ++i) {
		if (!piglit_cl_probe_integer(dst_buf[i], exp_buf[i], 0)) {
			fprintf(stderr, "Error at %d: got %d, expected %d\n",
			        i, dst_buf[i], exp_buf[i]);
			return PIGLIT_FAIL;
		}
	}

/*** Errors ***/

	/*
	 * CL_INVALID_COMMAND_QUEUE if command_queue is not a valid command-queue.
	 */
	test(NULL, image, pattern, origin, region,
		  0, NULL, NULL,
		  CL_INVALID_COMMAND_QUEUE, &result,
		  "CL_INVALID_COMMAND_QUEUE if command_queue is not a valid command-queue");

	/*
	 * CL_INVALID_CONTEXT if the context associated with command_queue and
	 * image are not the same or if the context associated with command_queue
	 * and events in event_wait_list are not the same.
	 */
	{
		piglit_cl_context context;
		cl_int err;
		context = piglit_cl_create_context(env->platform_id,
		                                   env->context->device_ids, 1);
		if (context) {
			event = clCreateUserEvent(context->cl_ctx, &err);
			if (err == CL_SUCCESS) {
				err = clSetUserEventStatus(event, CL_COMPLETE);
				if (err == CL_SUCCESS) {
					test(context->command_queues[0], image, pattern, origin, region,
					     0, NULL, NULL,
					     CL_INVALID_CONTEXT, &result,
					     "CL_INVALID_CONTEXT if the context associated with command_queue and image are not the same");

					test(queue, image, pattern, origin, region,
					     1, &event, NULL,
					     CL_INVALID_CONTEXT, &result,
					     "CL_INVALID_CONTEXT if the context associated with command_queue and events in event_wait_list are not the same");
				} else {
					fprintf(stderr, "Could not set event status.\n");
					piglit_merge_result(&result, PIGLIT_WARN);
				}
				clReleaseEvent(event);
			} else {
				fprintf(stderr, "Could not create user event.\n");
				piglit_merge_result(&result, PIGLIT_WARN);
			}

			piglit_cl_release_context(context);
		} else {
			fprintf(stderr, "Could not test triggering CL_INVALID_CONTEXT.\n");
			piglit_merge_result(&result, PIGLIT_WARN);
		}
	}

	/*
	 * CL_INVALID_MEM_OBJECT if image is not a valid buffer object.
	 */
	test(queue, NULL, pattern, origin, region,
	     0, NULL, NULL,
	     CL_INVALID_MEM_OBJECT, &result,
	     "CL_INVALID_MEM_OBJECT if image is not a valid buffer object");

	/*
	 * CL_INVALID_VALUE if fill_color is NULL.
	 */
	test(queue, image, NULL, origin, region,
	     0, NULL, NULL,
	     CL_INVALID_VALUE, &result,
	     "CL_INVALID_VALUE if fill_color is NULL");

	/*
	 * CL_INVALID_VALUE if the region being written specified by origin and
	 * region is out of bounds or if ptr is a NULL value.
	 */
	tmp = origin[0];
	origin[0] = IMG_WIDTH + 1;
	test(queue, image, pattern, origin, region,
	     0, NULL, NULL,
	     CL_INVALID_VALUE, &result,
	     "CL_INVALID_VALUE if the region being written specified by origin and region is out of bounds (origin)");
	origin[0] = tmp;

	tmp = region[0];
	region[0] = IMG_WIDTH + 1;
	test(queue, image, pattern, origin, region,
	     0, NULL, NULL,
	     CL_INVALID_VALUE, &result,
	     "CL_INVALID_VALUE if the region being written specified by origin and region is out of bounds (region)");
	region[0] = tmp;

	test(queue, image, pattern, NULL, region,
	     0, NULL, NULL,
	     CL_INVALID_VALUE, &result,
	     "CL_INVALID_VALUE if ptr is a NULL value (origin)");

	test(queue, image, pattern, origin, NULL,
	     0, NULL, NULL,
	     CL_INVALID_VALUE, &result,
	     "CL_INVALID_VALUE if ptr is a NULL value (region)");

	/*
	 * CL_INVALID_VALUE if values in origin and region do not follow rules
	 * described in the argument description for origin and region.
	 */
	tmp = origin[2];
	origin[2] = 1;
	test(queue, image, pattern, origin, region,
	     0, NULL, NULL,
	     CL_INVALID_VALUE, &result,
	     "CL_INVALID_VALUE if values in origin do not follow rules described in the argument description for origin");
	origin[2] = tmp;

	tmp = region[2];
	region[2] = 0;
	test(queue, image, pattern, origin, region,
		  0, NULL, NULL,
		CL_INVALID_VALUE, &result,
		"CL_INVALID_VALUE if values in region do not follow rules described in the argument description for region");
	region[2] = tmp;

	/*
	 * CL_INVALID_EVENT_WAIT_LIST if event_wait_list is NULL and
	 * num_events_in_wait_list > 0, or event_wait_list is not NULL and
	 * num_events_in_wait_list is 0, or if event objects in event_wait_list
	 * are not valid events.
	 */
	event = NULL;
	test(queue, image, pattern, origin, region,
	     1, NULL, NULL,
	     CL_INVALID_EVENT_WAIT_LIST, &result,
	     "CL_INVALID_EVENT_WAIT_LIST if event_wait_list is NULL and num_events_in_wait_list > 0");

	test(queue, image, pattern, origin, region,
	     0, &event, NULL,
	     CL_INVALID_EVENT_WAIT_LIST, &result,
	     "CL_INVALID_EVENT_WAIT_LIST if event_wait_list is not NULL and num_events_in_wait_list is 0");

	test(queue, image, pattern, origin, region,
	     1, &event, NULL,
	     CL_INVALID_EVENT_WAIT_LIST, &result,
	     "CL_INVALID_EVENT_WAIT_LIST if event objects in event_wait_list are not valid events");

	/*
	 * CL_INVALID_IMAGE_SIZE if image dimensions (image width, height, specified
	 * or compute row and/or slice pitch) for image are not supported by device
	 * associated with queue.
	 */
	/* This is a per device test, clCreateImage would have failed before */

	/*
	 * CL_INVALID_IMAGE_FORMAT if image format (image channel order and data type)
	 * for image are not supported by device associated with queue.
	 */
	/* This is a per device test, clCreateImage would have failed before */

	free(image_support);
	clReleaseMemObject(image);
	return result;
#else
	return PIGLIT_SKIP;
#endif
}
Пример #17
0
/* main */
int main(int argc, char **argv) 
{
  /*OpenCL variables */
  cl_device_id device;
  cl_device_type device_type; /*to test if we are on cpu or gpu*/
  cl_context context;
  cl_command_queue cmdQueue;

  /* The event variables are created only when needed */
#ifdef _UNBLOCK
  cl_uint  num_events = 3;
  cl_event event[num_events];
#endif

  FPTYPE * buffers[3];
  cl_mdsys_t cl_sys;
  cl_int status;

  int nprint, i, nthreads = 0;
  char restfile[BLEN], trajfile[BLEN], ergfile[BLEN], line[BLEN];
  FILE *fp,*traj,*erg;
  mdsys_t sys;


/* Start profiling */

#ifdef __PROFILING
  
  double t1, t2;

  t1 = second();

#endif

  /* handling the command line arguments */
  switch (argc) {
      case 2: /* only the cpu/gpu argument was passed, setting default nthreads */
	      if( !strcmp( argv[1], "cpu" ) ) nthreads = 16;
	      else nthreads = 1024;
	      break;
      case 3: /* both the device type (cpu/gpu) and the number of threads were passed */
	      nthreads = strtol(argv[2],NULL,10);
	      if( nthreads<0 ) {
		      fprintf( stderr, "\n. The number of threads must be more than 1.\n");
		      PrintUsageAndExit();
	      }
	      break;
      default:
	      PrintUsageAndExit();
	      break;
  }
  
  /* Initialize the OpenCL environment */
  if( InitOpenCLEnvironment( argv[1], &device, &context, &cmdQueue ) != CL_SUCCESS ){
    fprintf( stderr, "Program Error! OpenCL Environment was not initialized correctly.\n" );
    return 4;
  }

  /* The event initialization is performed only when needed */
#ifdef _UNBLOCK
  /* initialize the cl_event handler variables */
  for( i = 0; i < num_events; ++i) {
	  event[i] = clCreateUserEvent( context, NULL );
	  clSetUserEventStatus( event[i], CL_COMPLETE );
  }
#endif

  /* read input file */
  if(get_me_a_line(stdin,line)) return 1;
  sys.natoms=atoi(line);
  if(get_me_a_line(stdin,line)) return 1;
  sys.mass=atof(line);
  if(get_me_a_line(stdin,line)) return 1;
  sys.epsilon=atof(line);
  if(get_me_a_line(stdin,line)) return 1;
  sys.sigma=atof(line);
  if(get_me_a_line(stdin,line)) return 1;
  sys.rcut=atof(line);
  if(get_me_a_line(stdin,line)) return 1;
  sys.box=atof(line);
  if(get_me_a_line(stdin,restfile)) return 1;
  if(get_me_a_line(stdin,trajfile)) return 1;
  if(get_me_a_line(stdin,ergfile)) return 1;
  if(get_me_a_line(stdin,line)) return 1;
  sys.nsteps=atoi(line);
  if(get_me_a_line(stdin,line)) return 1;
  sys.dt=atof(line);
  if(get_me_a_line(stdin,line)) return 1;
  nprint=atoi(line);
  

  
  /* allocate memory */
  cl_sys.natoms = sys.natoms;
  cl_sys.rx = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  cl_sys.ry = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  cl_sys.rz = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  cl_sys.vx = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  cl_sys.vy = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  cl_sys.vz = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  cl_sys.fx = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  cl_sys.fy = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  cl_sys.fz = clCreateBuffer( context, CL_MEM_READ_WRITE, cl_sys.natoms * sizeof(FPTYPE), NULL, &status );
  
  buffers[0] = (FPTYPE *) malloc( 2 * cl_sys.natoms * sizeof(FPTYPE) );
  buffers[1] = (FPTYPE *) malloc( 2 * cl_sys.natoms * sizeof(FPTYPE) );
  buffers[2] = (FPTYPE *) malloc( 2 * cl_sys.natoms * sizeof(FPTYPE) );
  
  /* read restart */
  fp = fopen( restfile, "r" );
  if( fp ) {
    for( i = 0; i < 2 * cl_sys.natoms; ++i ){
#ifdef _USE_FLOAT
      fscanf( fp, "%f%f%f", buffers[0] + i, buffers[1] + i, buffers[2] + i);
#else
      fscanf( fp, "%lf%lf%lf", buffers[0] + i, buffers[1] + i, buffers[2] + i);
#endif
    }
    
    status = clEnqueueWriteBuffer( cmdQueue, cl_sys.rx, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[0], 0, NULL, NULL ); 
    status |= clEnqueueWriteBuffer( cmdQueue, cl_sys.ry, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[1], 0, NULL, NULL ); 
    status |= clEnqueueWriteBuffer( cmdQueue, cl_sys.rz, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[2], 0, NULL, NULL ); 
    
    status |= clEnqueueWriteBuffer( cmdQueue, cl_sys.vx, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[0] + cl_sys.natoms, 0, NULL, NULL ); 
    status |= clEnqueueWriteBuffer( cmdQueue, cl_sys.vy, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[1] + cl_sys.natoms, 0, NULL, NULL ); 
    status |= clEnqueueWriteBuffer( cmdQueue, cl_sys.vz, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[2] + cl_sys.natoms, 0, NULL, NULL ); 
    
    fclose(fp);

  } else {
    perror("cannot read restart file");
    return 3;
  }
  
  /* initialize forces and energies.*/
  sys.nfi=0;
  
  size_t globalWorkSize[1];
  globalWorkSize[0] = nthreads;
  
  const char * sourcecode =
  #include <opencl_kernels_as_string.h>
  ;

  cl_program program = clCreateProgramWithSource( context, 1, (const char **) &sourcecode, NULL, &status );
  
  status |= clBuildProgram( program, 0, NULL, kernelflags, NULL, NULL );
  
#ifdef __DEBUG
  size_t log_size;
  char log [200000]; 
  clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_LOG, sizeof(log), log, &log_size );
  fprintf( stderr, "\nLog: \n\n %s", log ); 
#endif
  
  cl_kernel kernel_force = clCreateKernel( program, "opencl_force", &status );
  cl_kernel kernel_ekin = clCreateKernel( program, "opencl_ekin", &status );
  cl_kernel kernel_verlet_first = clCreateKernel( program, "opencl_verlet_first", &status );
  cl_kernel kernel_verlet_second = clCreateKernel( program, "opencl_verlet_second", &status );
  cl_kernel kernel_azzero = clCreateKernel( program, "opencl_azzero", &status );
  
  FPTYPE * tmp_epot;
  cl_mem epot_buffer;
  tmp_epot = (FPTYPE *) malloc( nthreads * sizeof(FPTYPE) );
  epot_buffer = clCreateBuffer( context, CL_MEM_READ_WRITE, nthreads * sizeof(FPTYPE), NULL, &status );
  
  /* precompute some constants */
  FPTYPE c12 = 4.0 * sys.epsilon * pow( sys.sigma, 12.0);
  FPTYPE c6  = 4.0 * sys.epsilon * pow( sys.sigma, 6.0);
  FPTYPE rcsq = sys.rcut * sys.rcut;
  FPTYPE boxby2 = HALF * sys.box;  
  FPTYPE dtmf = HALF * sys.dt / mvsq2e / sys.mass;
  sys.epot = ZERO;
  sys.ekin = ZERO;

  /* Azzero force buffer */
  status = clSetMultKernelArgs( kernel_azzero, 0, 4, KArg(cl_sys.fx), KArg(cl_sys.fy), KArg(cl_sys.fz), KArg(cl_sys.natoms));

  status = clEnqueueNDRangeKernel( cmdQueue, kernel_azzero, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL );

  status |= clSetMultKernelArgs( kernel_force, 0, 13,
	KArg(cl_sys.fx),
	KArg(cl_sys.fy),
	KArg(cl_sys.fz),
	KArg(cl_sys.rx),
	KArg(cl_sys.ry),
	KArg(cl_sys.rz),
	KArg(cl_sys.natoms),
	KArg(epot_buffer),
	KArg(c12),
	KArg(c6),
	KArg(rcsq),
	KArg(boxby2),
	KArg(sys.box));
  
  status = clEnqueueNDRangeKernel( cmdQueue, kernel_force, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL );
  
  status |= clEnqueueReadBuffer( cmdQueue, epot_buffer, CL_TRUE, 0, nthreads * sizeof(FPTYPE), tmp_epot, 0, NULL, NULL );     
  
  for( i = 0; i < nthreads; i++) sys.epot += tmp_epot[i];
  
  FPTYPE * tmp_ekin;
  cl_mem ekin_buffer;
  tmp_ekin = (FPTYPE *) malloc( nthreads * sizeof(FPTYPE) );
  ekin_buffer = clCreateBuffer( context, CL_MEM_READ_WRITE, nthreads * sizeof(FPTYPE), NULL, &status );
  
  status |= clSetMultKernelArgs( kernel_ekin, 0, 5, KArg(cl_sys.vx), KArg(cl_sys.vy), KArg(cl_sys.vz),
    KArg(cl_sys.natoms), KArg(ekin_buffer));
  
  status = clEnqueueNDRangeKernel( cmdQueue, kernel_ekin, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL );
    
  status |= clEnqueueReadBuffer( cmdQueue, ekin_buffer, CL_TRUE, 0, nthreads * sizeof(FPTYPE), tmp_ekin, 0, NULL, NULL );     

  for( i = 0; i < nthreads; i++) sys.ekin += tmp_ekin[i];
  sys.ekin *= HALF * mvsq2e * sys.mass;
  sys.temp  = TWO * sys.ekin / ( THREE * sys.natoms - THREE ) / kboltz;

  erg=fopen(ergfile,"w");
  traj=fopen(trajfile,"w");

  printf("Starting simulation with %d atoms for %d steps.\n",sys.natoms, sys.nsteps);
  printf("     NFI            TEMP            EKIN                 EPOT              ETOT\n");
  
  /* download data on host */
  status = clEnqueueReadBuffer( cmdQueue, cl_sys.rx, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[0], 0, NULL, NULL ); 
  status |= clEnqueueReadBuffer( cmdQueue, cl_sys.ry, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[1], 0, NULL, NULL ); 
  status |= clEnqueueReadBuffer( cmdQueue, cl_sys.rz, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[2], 0, NULL, NULL ); 
  
  sys.rx = buffers[0];
  sys.ry = buffers[1];
  sys.rz = buffers[2];
  
  output(&sys, erg, traj);

  /**************************************************/
  /* main MD loop */
  for(sys.nfi=1; sys.nfi <= sys.nsteps; ++sys.nfi) {

    /* propagate system and recompute energies */
    /* 2) verlet_first   */
    status |= clSetMultKernelArgs( kernel_verlet_first, 0, 12,
      KArg(cl_sys.fx),
      KArg(cl_sys.fy),
      KArg(cl_sys.fz),
      KArg(cl_sys.rx),
      KArg(cl_sys.ry),
      KArg(cl_sys.rz),
      KArg(cl_sys.vx),
      KArg(cl_sys.vy),
      KArg(cl_sys.vz),
      KArg(cl_sys.natoms),
      KArg(sys.dt),
      KArg(dtmf));
    CheckSuccess(status, 2);

    /* When the data transfer is non blocking, this kernel has to wait the completion of part 8 (event[2]) */
#ifdef _UNBLOCK
    status = clEnqueueNDRangeKernel( cmdQueue, kernel_verlet_first, 1, NULL, globalWorkSize, NULL, 1, &event[2], NULL );
#else
    status = clEnqueueNDRangeKernel( cmdQueue, kernel_verlet_first, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL );
#endif

    /* 6) download position@device to position@host */
    if ((sys.nfi % nprint) == nprint-1) {

    /* In non blocking mode (CL_FALSE) this data transfer raises events[i] */
#ifdef _UNBLOCK
	status  = clEnqueueReadBuffer( cmdQueue, cl_sys.rx, CL_FALSE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[0], 0, NULL, &event[2] );
	status |= clEnqueueReadBuffer( cmdQueue, cl_sys.ry, CL_FALSE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[1], 0, NULL, &event[1] );
	status |= clEnqueueReadBuffer( cmdQueue, cl_sys.rz, CL_FALSE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[2], 0, NULL, &event[0] );
#else
	status  = clEnqueueReadBuffer( cmdQueue, cl_sys.rx, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[0], 0, NULL, NULL );
	status |= clEnqueueReadBuffer( cmdQueue, cl_sys.ry, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[1], 0, NULL, NULL );
	status |= clEnqueueReadBuffer( cmdQueue, cl_sys.rz, CL_TRUE, 0, cl_sys.natoms * sizeof(FPTYPE), buffers[2], 0, NULL, NULL );
#endif
	CheckSuccess(status, 6);
    }

    /* 3) force */
    status |= clSetMultKernelArgs( kernel_force, 0, 13,
      KArg(cl_sys.fx),
      KArg(cl_sys.fy),
      KArg(cl_sys.fz),
      KArg(cl_sys.rx),
      KArg(cl_sys.ry),
      KArg(cl_sys.rz),
      KArg(cl_sys.natoms),
      KArg(epot_buffer),
      KArg(c12),
      KArg(c6),
      KArg(rcsq),
      KArg(boxby2),
      KArg(sys.box));

    CheckSuccess(status, 3);
    status = clEnqueueNDRangeKernel( cmdQueue, kernel_force, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL );

    /* 7) download E_pot[i]@device and perform reduction to E_pot@host */
    if ((sys.nfi % nprint) == nprint-1) {

    /* In non blocking mode (CL_FALSE) this data transfer kernel raises an event[1] */
#ifdef _UNBLOCK
	status |= clEnqueueReadBuffer( cmdQueue, epot_buffer, CL_FALSE, 0, nthreads * sizeof(FPTYPE), tmp_epot, 0, NULL, &event[1] );
#else
	status |= clEnqueueReadBuffer( cmdQueue, epot_buffer, CL_TRUE, 0, nthreads * sizeof(FPTYPE), tmp_epot, 0, NULL, NULL );
#endif
	CheckSuccess(status, 7);
    }

    /* 4) verlet_second */
    status |= clSetMultKernelArgs( kernel_verlet_second, 0, 9,
      KArg(cl_sys.fx),
      KArg(cl_sys.fy),
      KArg(cl_sys.fz),
      KArg(cl_sys.vx),
      KArg(cl_sys.vy),
      KArg(cl_sys.vz),
      KArg(cl_sys.natoms),
      KArg(sys.dt),
      KArg(dtmf));

    CheckSuccess(status, 4);
    status = clEnqueueNDRangeKernel( cmdQueue, kernel_verlet_second, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL );

    if ((sys.nfi % nprint) == nprint-1) {

	/* 5) ekin */
	status |= clSetMultKernelArgs( kernel_ekin, 0, 5, KArg(cl_sys.vx), KArg(cl_sys.vy), KArg(cl_sys.vz),
			KArg(cl_sys.natoms), KArg(ekin_buffer));
	CheckSuccess(status, 5);
	status = clEnqueueNDRangeKernel( cmdQueue, kernel_ekin, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL );


	/* 8) download E_kin[i]@device and perform reduction to E_kin@host */
	/* In non blocking mode (CL_FALSE) this data transfer kernel raises an event[2] */
#ifdef _UNBLOCK
	status |= clEnqueueReadBuffer( cmdQueue, ekin_buffer, CL_FALSE, 0, nthreads * sizeof(FPTYPE), tmp_ekin, 0, NULL, &event[2] );
#else
	status |= clEnqueueReadBuffer( cmdQueue, ekin_buffer, CL_TRUE, 0, nthreads * sizeof(FPTYPE), tmp_ekin, 0, NULL, NULL );
#endif
	CheckSuccess(status, 8);
    }

    /* 1) write output every nprint steps */
    if ((sys.nfi % nprint) == 0) {

    /* Calling a synchronization function (only when in non blocking mode) that will wait until all the
     * events[i], related to the data transfers, to be completed */
#ifdef _UNBLOCK
        clWaitForEvents(3, event);
#endif
	sys.rx = buffers[0];
	sys.ry = buffers[1];
	sys.rz = buffers[2];

	/* initialize the sys.epot@host and sys.ekin@host variables to ZERO */
	sys.epot = ZERO;
	sys.ekin = ZERO;

	/* reduction on the tmp_Exxx[i] buffers downloaded from the device
	 * during parts 7 and 8 of the previous MD loop iteration */
	for( i = 0; i < nthreads; i++) {
		sys.epot += tmp_epot[i];
		sys.ekin += tmp_ekin[i];
	}

	/* multiplying the kinetic energy by prefactors */
	sys.ekin *= HALF * mvsq2e * sys.mass;
	sys.temp  = TWO * sys.ekin / ( THREE * sys.natoms - THREE ) / kboltz;

	/* writing output files (positions, energies and temperature) */
	output(&sys, erg, traj);
    }

  }
  /**************************************************/

/* End profiling */

#ifdef __PROFILING

t2 = second();

fprintf( stdout, "\n\nTime of execution = %.3g (seconds)\n", (t2 - t1) );

#endif






  /* clean up: close files, free memory */
  printf("Simulation Done.\n");
  fclose(erg);
  fclose(traj);

  free(buffers[0]);
  free(buffers[1]);
  free(buffers[2]);

  return 0;
}