Example #1
double
oclLaunchKernel(cl_kernel k, cl_command_queue q, int nbobj, int nbthread, const char *fname, const int line)
{
    cl_int err = 0;
    dim3 gws, lws;
    cl_event event;
    double elapsk;
    int maxThreads = 0;
    cl_uint one = 1;
    cl_device_id dId = oclGetDeviceOfCQueue(q);
    size_t prefsz = 32;

    maxThreads = oclGetMaxWorkSize(k, dId);
    maxThreads = MIN(maxThreads, nbthread);

    // Get the proper size for the hardware
    err = clGetKernelWorkGroupInfo(k, dId, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(prefsz), &prefsz, NULL);
    oclCheckErr(err, "clGetKernelWorkGroupInfo CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE");

    // make sure we have the proper multiple: the AMD 7970 crashes if this is not met.
    maxThreads = oclMultiple(maxThreads, prefsz);
    // printf("1D %d \n", maxThreads);

    oclMkNDrange(nbobj, maxThreads, NDR_1D, gws, lws);
    // printf("Launch: %ld G:%ld %ld %ld L:%ld %ld %ld\n", nbobj, gws[0], gws[1], gws[2], lws[0], lws[1], lws[2]);

    err = clEnqueueNDRangeKernel(q, k, NDR_1D, NULL, gws, lws, 0, NULL, &event);
    oclCheckErrF(err, "clEnqueueNDRangeKernel", fname, line);

    err = clWaitForEvents(one, &event);
    oclCheckErrF(err, "clWaitForEvents", fname, line);

    elapsk = oclChronoElaps(event);

    err = clReleaseEvent(event);
    oclCheckErrF(err, "clReleaseEvent", fname, line);

    return elapsk;
}
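The rounding above serves two purposes: CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE is a performance hint for choosing the work-group size, and OpenCL 1.x additionally requires the global size to be a multiple of the local size when an explicit local size is passed to clEnqueueNDRangeKernel. A minimal sketch of a round-up helper (an assumption about what oclMultiple does; oclMultiple and oclMkNDrange are project-specific helpers not shown here):

/* Round n up to the nearest multiple of m (m > 0). */
static size_t round_up_multiple(size_t n, size_t m)
{
    return ((n + m - 1) / m) * m;
}

/* e.g. a 1-D global size covering nbobj work-items with groups of lws threads:
   size_t gws = round_up_multiple(nbobj, lws); */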
Example #2
 // Describes how to run the CLBlast routine
 static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
   #ifdef OPENCL_API
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Hbmv(args.layout, args.triangle,
                        args.n, args.kl, args.alpha,
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
   #elif CUDA_API
     auto status = Hbmv(args.layout, args.triangle,
                        args.n, args.kl, args.alpha,
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        queue.GetContext()(), queue.GetDevice()());
     cuStreamSynchronize(queue());
   #endif
   return status;
 }
Example #3
PerformanceAnalyser::TimelineEntry PerformanceAnalyser::analyzeEvent(cl_event &event) {
    // Wait for event information to be ready
    clWaitForEvents(1, &event);
    TimelineEntry entry;

    cl_ulong time;
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &time, NULL);
    entry.start_time = (double) time / 1000000000.0;

    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &time, NULL);
    entry.end_time = (double) time / 1000000000.0;

    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &time, NULL);
    double exec_start = ((double) time / 1000000000.0);
    entry.execution_time = entry.end_time - exec_start;
    entry.api_overhead = exec_start - entry.start_time;
    entry.total_time = entry.end_time - entry.start_time;

    entry.cpu_time = (getTime()-m_time)-entry.total_time;

    return entry;
}
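Note that clGetEventProfilingInfo only returns valid timestamps when the command queue that produced the event was created with profiling enabled; a minimal sketch of that setup (ctx and dev are placeholders, not from the code above):

cl_int err;
cl_command_queue q = clCreateCommandQueue(ctx, dev, CL_QUEUE_PROFILING_ENABLE, &err);
/* With OpenCL 2.0 headers, clCreateCommandQueueWithProperties with a
   CL_QUEUE_PROPERTIES entry of CL_QUEUE_PROFILING_ENABLE is the equivalent. */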
Example #4
void read_value(){

    int err;
    cl_event readevent;
    err = clEnqueueReadBuffer(commands, d_output, CL_TRUE, 0,
                              REC_N * sizeof(cl_int),
                              h_output, 0, NULL, &readevent);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        printf("Test failed\n");
        exit(1);
    }

    clWaitForEvents(1, &readevent);

    printf("\n[host] outputs:\n");
    for (int i = 0; i < REC_N; ++i) {
        printf("%d ", h_output[i]);
    }
    printf("\n");
}
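Because the read above is enqueued with CL_TRUE (blocking), it has already completed when clEnqueueReadBuffer returns, so the following clWaitForEvents is redundant. The explicit wait only becomes necessary in the non-blocking variant, sketched here with the event also released:

cl_event readevent;
err = clEnqueueReadBuffer(commands, d_output, CL_FALSE, 0,
                          REC_N * sizeof(cl_int), h_output, 0, NULL, &readevent);
clWaitForEvents(1, &readevent);   /* h_output is valid only after this returns */
clReleaseEvent(readevent);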
Example #5
void mat_mul_cl_clblas(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    cl_event event;
    size_t mat_sizeof;

    mat_sizeof = n * n * sizeof(F);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL);
    clblasSgemm(
        clblasRowMajor,
        clblasNoTrans,
        clblasNoTrans,

        n,
        n,
        n,
        1.0,

        cache->buf_a,
        0,
        n,

        cache->buf_b,
        0,
        n,

        0.0,
        cache->buf_c,
        0,
        n,

        1,
        &(cache->common.command_queue),
        0,
        NULL,
        &event
    );
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
    clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
}
Example #6
/* Wait for an event then release it */
cl_int mwWaitReleaseEvent(cl_event* ev)
{
    cl_int err;

    assert(ev);

    err = clWaitForEvents(1, ev);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to wait for event");
        return err;
    }

    err = clReleaseEvent(*ev);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to release event");
        return err;
    }

    return CL_SUCCESS;
}
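A hypothetical usage sketch for the helper above (queue, buf, size and host_ptr are placeholders, not part of the original code):

cl_event ev;
cl_int err = clEnqueueReadBuffer(queue, buf, CL_FALSE, 0, size, host_ptr, 0, NULL, &ev);
if (err == CL_SUCCESS)
    err = mwWaitReleaseEvent(&ev);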
Example #7
/*!
    Copies the contents of this buffer, starting at \a offset to
    \a rect within \a dest.  Returns true if the copy was successful;
    false otherwise.

    This function will block until the request finishes.
    The request is executed on the active command queue for context().

    \sa copyToAsync()
*/
bool QCLBuffer::copyTo
(size_t offset, const QCLImage2D &dest, const QRect &rect)
{
    const size_t dst_origin[3] = {static_cast<size_t>(rect.x()),
                                  static_cast<size_t>(rect.y()), 0
                                 };
    const size_t region[3] = {static_cast<size_t>(rect.width()),
                              static_cast<size_t>(rect.height()), 1
                             };
    cl_event event;
    cl_int error = clEnqueueCopyBufferToImage
                   (context()->activeQueue(), memoryId(), dest.memoryId(),
                    offset, dst_origin, region, 0, 0, &event);
    context()->reportError("QCLBuffer::copyTo(QCLImage2D):", error);
    if (error == CL_SUCCESS) {
        clWaitForEvents(1, &event);
        clReleaseEvent(event);
        return true;
    } else {
        return false;
    }
}
Example #8
void testScanImpl(int rLen)
{
	int _CPU_GPU=0;
	cl_event eventList[2];
	int index=0;
	cl_kernel Kernel; 
	int CPU_GPU;
	double burden;	
	int result=0;
	int memSize=sizeof(int)*rLen;
	int outSize=sizeof(int)*rLen;
	void *Rin;
	HOST_MALLOC(Rin, memSize);
	generateRandInt((int*)Rin, rLen,rLen,0);
	void *Rout;
	HOST_MALLOC(Rout, outSize);
	cl_mem d_Rin;
	CL_MALLOC(&d_Rin, memSize);
	cl_mem d_Rout;
	CL_MALLOC(&d_Rout, outSize);
	cl_writebuffer(d_Rin, Rin, memSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
	ScanPara *SP;
	SP=(ScanPara*)malloc(sizeof(ScanPara));
	initScan(rLen,SP);
	scanImpl(d_Rin,rLen,d_Rout,&index,eventList,&Kernel,&CPU_GPU,&burden,SP,_CPU_GPU);	
	cl_readbuffer(Rout, d_Rout, outSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
	clWaitForEvents(1,&eventList[(index-1)%2]);
	closeScan(SP);
	deschedule(CPU_GPU,burden);
	//validateScan( (int*)Rin, rLen, (int*)Rout );
	HOST_FREE(Rin);
	HOST_FREE(Rout);
	CL_FREE(d_Rin);
	CL_FREE(d_Rout);
	clReleaseKernel(Kernel);  
	clReleaseEvent(eventList[0]);
	clReleaseEvent(eventList[1]);
}
Example #9
int acc_event_synchronize (void* event){
  // debug info
  if (verbose_print){
    fprintf(stdout, "\n ... EVENT SYNCHRONIZATION ... \n");
    fprintf(stdout, " ---> Entering: acc_event_synchronize.\n");
  }

  // local event and queue pointers
  cl_event *clevent = (cl_event *) event;

  // wait for an event ( !!! need to share the same ctx !!! )
  cl_error = clWaitForEvents((cl_uint) 1, clevent);
  if (acc_opencl_error_check(cl_error, __LINE__))
    return -1;

  // debug info
  if (verbose_print){
    fprintf(stdout, " ---> Leaving: acc_event_synchronize.\n");
  }

  // assign return value
  return 0;
}
Example #10
void
pclu_call_kernel(pclu_program* pgm, const char* name, pclu_range range, size_t argc, ...)
{
    cl_int errcode;
    cl_kernel kern = clCreateKernel(pgm->program, name, &errcode);
    pclu_check_call("clCreateKernel", errcode);

    va_list ap;
    va_start(ap, argc);

    for (cl_uint ii = 0; ii < argc; ++ii) {
        size_t size = va_arg(ap, size_t);	
        void*  arg  = va_arg(ap, void*);
        pclu_check_call("clSetKernelArg", clSetKernelArg(kern, ii, size, arg));
    }

    va_end(ap);

#define NO_CL_EVENTS 1

#ifdef NO_CL_EVENTS
    cl_event kernel_done = 0;
#else
    cl_event kernel_done = clCreateUserEvent(pgm->pclu->context, &errcode);
    pclu_check_call("clCreateUserEvent", errcode);
#endif
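    /* Added note: even with NO_CL_EVENTS defined, &kernel_done is still passed
       to clEnqueueNDRangeKernel below, so the runtime returns an event object
       that is never released; passing NULL as the event argument instead means
       no event is returned that would need releasing. Likewise, the user event
       created in the #else branch is immediately overwritten by the enqueue. */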

    errcode = clEnqueueNDRangeKernel(pgm->pclu->queue, kern, range.nd, 0, 
				     range.global, 0, 0, 0, &kernel_done);
    pclu_check_call("clEnqueueNDRangeKernel", errcode);

#ifndef NO_CL_EVENTS
    pclu_check_call("clWaitForEvents", clWaitForEvents(1, &kernel_done));
#endif

    pclu_check_call("clReleaseKernel", clReleaseKernel(kern));
}
Example #11
extern "C" int CL_GroupBy(Record * h_Rin, int rLen, Record* h_Rout, int** h_startPos,
                          int numThread, int numBlock , int _CPU_GPU)
{
    cl_mem d_Rin;
    cl_mem d_Rout;
    cl_mem d_startPos;
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    cl_event eventList[2];
    int index=0;
    cl_kernel Kernel;
    int CPU_GPU;
    double burden;
    int memSize = sizeof(Record)*rLen;


    CL_MALLOC( &d_Rin, memSize );
    CL_MALLOC(&d_Rout, memSize );
    cl_writebuffer( d_Rin, h_Rin, memSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
    int numGroup = 0;


    numGroup= groupByImpl(d_Rin, rLen, d_Rout, &d_startPos, numThread, numBlock,&index,eventList,&Kernel,&CPU_GPU,&burden,_CPU_GPU);
    (*h_startPos) = (int*)malloc( sizeof(int)*numGroup );

    cl_readbuffer( *h_startPos, d_startPos, sizeof(int)*numGroup,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
    cl_readbuffer( h_Rout, d_Rout, sizeof(Record)*rLen,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
    clWaitForEvents(1,&eventList[(index-1)%2]);
    deschedule(CPU_GPU,burden);
    CL_FREE( d_Rin );
    CL_FREE( d_Rout );
    CL_FREE( d_startPos );
    clReleaseKernel(Kernel);
    clReleaseEvent(eventList[0]);
    clReleaseEvent(eventList[1]);
    printf("CL_GroupBy\n");
    return numGroup;
}
Example #12
void deathray::SingleFrameExecute() {	
	cl_uint wait_list_length = 0;
	cl_event wait_list[3];
	result status;

	if (temporal_radius_Y_ == 0 && h_Y_ > 0.f) {
		status = g_SingleFrame_Y.CopyTo(srcpY_);
		if (status != FILTER_OK) env_->ThrowError("Deathray: Copy Y to device status=%d and OpenCL status=%d", status, g_last_cl_error);
	}
	if (temporal_radius_UV_ == 0 && h_UV_ > 0.f) {
		status = g_SingleFrame_U.CopyTo(srcpU_);
		if (status != FILTER_OK) env_->ThrowError("Deathray: Copy U to device status=%d and OpenCL status=%d", status, g_last_cl_error);
		status = g_SingleFrame_V.CopyTo(srcpV_);
		if (status != FILTER_OK) env_->ThrowError("Deathray: Copy V to device status=%d and OpenCL status=%d", status, g_last_cl_error);
	}

	if (temporal_radius_Y_ == 0 && h_Y_ > 0.f) {
		status = g_SingleFrame_Y.Execute();
		if (status != FILTER_OK) env_->ThrowError("Deathray: Execute Y kernel status=%d and OpenCL status=%d", status, g_last_cl_error);
		status = g_SingleFrame_Y.CopyFrom(dstpY_, wait_list);
		if (status != FILTER_OK) env_->ThrowError("Deathray: Copy Y to host status=%d and OpenCL status=%d", status, g_last_cl_error);
		++wait_list_length;
	}

	if (temporal_radius_UV_ == 0 && h_UV_ > 0.f) {
		status = g_SingleFrame_U.Execute();
		if (status != FILTER_OK) env_->ThrowError("Deathray: Execute U kernel status=%d and OpenCL status=%d", status, g_last_cl_error);
		status = g_SingleFrame_U.CopyFrom(dstpU_, wait_list + wait_list_length++);
		if (status != FILTER_OK) env_->ThrowError("Deathray: Copy U to host status=%d and OpenCL status=%d", status, g_last_cl_error);
		status = g_SingleFrame_V.Execute();
		if (status != FILTER_OK) env_->ThrowError("Deathray: Execute V kernel status=%d and OpenCL status=%d", status, g_last_cl_error);
		status = g_SingleFrame_V.CopyFrom(dstpV_, wait_list + wait_list_length++);
		if (status != FILTER_OK) env_->ThrowError("Deathray: Copy V to host status=%d and OpenCL status=%d", status, g_last_cl_error);
	}

	clWaitForEvents(wait_list_length, wait_list);
}
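One thing the function above leaves out: the events stored in wait_list are waited on but never released, so each call can leak up to three event objects. A small follow-up sketch of the cleanup that would normally accompany the wait (not part of the original source):

	for (cl_uint i = 0; i < wait_list_length; ++i)
		clReleaseEvent(wait_list[i]);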
Example #13
    void copyhostptr_roundtrip_func()
    {
        timer.Start(timer_id);
        //set up buffer
        cl_int err;
        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
                                         buffer_.offA_) * sizeof(T),
                                        buffer_.a_, &err);

        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                         buffer_.offB_) * sizeof(T),
                                        buffer_.b_, &err);
        //call func
        xTrsm_Function(false);
        //read gpu buffer
        err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
                                  buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
                                  sizeof(T),
                                  buffer_.b_, 0, NULL, &event_);
        clWaitForEvents(1, &event_);
        timer.Stop(timer_id);
    }
Example #14
void write_to_buffer(eObj* e, cObj cCandidate) {
    Tempest::data.lNumPSMs += 1;
    if (e->iNumBufferedCandidates == 0) {
        clWaitForEvents(1, &(e->clEventSent));
        if (Tempest::config.profile) {
          cl_ulong start;
          cl_ulong end;
          int err;
          err = clGetEventProfilingInfo(e->clEventSent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
          err |= clGetEventProfilingInfo(e->clEventSent, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &end,   NULL);
          if (err == 0)
            e->device->totalSendTime += (end-start);
        }
        clReleaseEvent(e->clEventSent);
    }
    e->candidateBuffer[e->iNumBufferedCandidates] = cCandidate;
    //memcpy(e->candidateBuffer+e->iNumBufferedCandidates, &cCandidate, sizeof(cObj));
    e->iNumCandidates++;
    e->iNumBufferedCandidates++;
    if (e->iNumBufferedCandidates == e->candidateBufferSize) {
        //printf("%d\t%d\n", gpu_info.iNumScoringKernels, iBin);
        e->device->scoreCandidates(e);
    }
}
Example #15
cl_int Dsyrk_internal(
  cl_env *env, double *a, double *c, double alpha, double beta,
  clblasTranspose transA, clblasUplo uplo, int ar, int ac, int n, int size_a, int size_c)
{
  CHECK(clblasSetup());
  cl_event events[NEVENTS];
  int nevent = 0;
  cl_mem mem_a = create_mem(env, a, size_a, CL_MEM_READ_ONLY, &(events[nevent++]));
  cl_mem mem_c;
  if (beta != 0) mem_c = create_mem(env, c, size_c, CL_MEM_READ_WRITE, &(events[nevent++]));
  else mem_c = create_mem(env, NULL, size_c, CL_MEM_READ_WRITE, NULL);
  
  int k = transA == clblasNoTrans ? ar : ac;
  cl_int err = clblasDsyrk(clblasColumnMajor, uplo, transA, 
    n, k, alpha, mem_a, 0, ac, beta, mem_c, 0, n,
    1, &(env->queues[0]), nevent, events, &(events[nevent]));
  CHECK(err);
  events[nevent+1] = *read_mem(env, mem_c, c, size_c, 1, &(events[nevent]));
  CHECK(clWaitForEvents(1, &(events[nevent+1])));
  CHECK(clReleaseMemObject(mem_a));
  CHECK(clReleaseMemObject(mem_c));
  clblasTeardown();
  return CL_SUCCESS;
}
Example #16
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX, bufY;
    cl_event event = NULL;
    int ret = 0;
    int lenX = 1 + (N-1)*abs(incx);
    int lenY = 1 + (N-1)*abs(incy);

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err);
    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL);

	printResult();

    /* Call clblas function. */
    err = clblasSrot(N, bufX, 0, incx, bufY, 0, incy, C, S, 1, &queue, 0, NULL, &event);
//	printf("here\n");
    if (err != CL_SUCCESS) {
        printf("clblasSrot() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)),
                                    Y, 0, NULL, NULL);
        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)),
                                    X, 0, NULL, NULL);

        /* At this point you will get the result of SROT placed in vector Y. */
        printResult();
    }
	//printf("here\n");

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufY);
    clReleaseMemObject(bufX);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
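A detail the sample above omits: the event returned by clblasSrot is waited on but never released. In a long-running program the matching cleanup, placed after the read-backs, would simply be:

        clReleaseEvent(event);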
Example #17
struct tableNode * groupBy(struct groupByNode * gb, struct clContext * context, struct statistic * pp){

    struct timespec start,end;
    clock_gettime(CLOCK_REALTIME,&start);

    cl_event ndrEvt;
    cl_ulong startTime,endTime;

    struct tableNode * res = NULL;
    long gpuTupleNum;
    int gpuGbColNum;
    cl_mem gpuGbIndex;
    cl_mem gpuGbType, gpuGbSize;

    cl_mem gpuGbKey;
    cl_mem gpuContent;

    int gbCount;                // the number of groups
    int gbConstant = 0;         // whether group by constant

    cl_int error = 0;

    res = (struct tableNode *) malloc(sizeof(struct tableNode));
    CHECK_POINTER(res);
    res->tupleSize = gb->tupleSize;
    res->totalAttr = gb->outputAttrNum;
    res->attrType = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->attrType);
    res->attrSize = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->attrSize);
    res->attrTotalSize = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->attrTotalSize);
    res->dataPos = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->dataPos);
    res->dataFormat = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->dataFormat);
    res->content = (char **) malloc(sizeof(char *) * res->totalAttr);
    CHECK_POINTER(res->content);

    for(int i=0;i<res->totalAttr;i++){
        res->attrType[i] = gb->attrType[i];
        res->attrSize[i] = gb->attrSize[i];
        res->dataFormat[i] = UNCOMPRESSED;
    }
    
    gpuTupleNum = gb->table->tupleNum;
    gpuGbColNum = gb->groupByColNum;

    if(gpuGbColNum == 1 && gb->groupByIndex[0] == -1){

        gbConstant = 1;
    }

    size_t localSize = 128;
    size_t globalSize = 1024*128;

    int blockNum = gb->table->tupleNum / localSize + 1; 

    if(blockNum < 1024)
        globalSize = blockNum * 128;


    cl_mem gpu_hashNum;
    cl_mem gpu_psum;
    cl_mem gpuGbCount;

    long * cpuOffset = (long *)malloc(sizeof(long) * gb->table->totalAttr);
    CHECK_POINTER(cpuOffset);
    long offset = 0;
    long totalSize = 0;

    for(int i=0;i<gb->table->totalAttr;i++){

        int attrSize = gb->table->attrSize[i];
        int size = attrSize * gb->table->tupleNum;

        cpuOffset[i] = offset;

        /*align each column*/

        if(size % 4 !=0){
            size += 4 - (size%4);
        }

        offset += size;
        totalSize += size;
    }

    gpuContent = clCreateBuffer(context->context,CL_MEM_READ_ONLY, totalSize,NULL,&error);

    for(int i=0;i<gb->table->totalAttr;i++){

        int attrSize = gb->table->attrSize[i];
        int size = attrSize * gb->table->tupleNum;

        if(gb->table->dataPos[i]==MEM){
            error = clEnqueueWriteBuffer(context->queue, gpuContent, CL_TRUE, cpuOffset[i], size, gb->table->content[i],0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
            clWaitForEvents(1, &ndrEvt);
            clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
            clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
            pp->pcie += 1e-6 * (endTime - startTime);
#endif
        }else
            error = clEnqueueCopyBuffer(context->queue,(cl_mem)gb->table->content[i],gpuContent,0, cpuOffset[i],size,0,0,0);

    }

    cl_mem gpuOffset = clCreateBuffer(context->context,CL_MEM_READ_ONLY, sizeof(long)*gb->table->totalAttr,NULL,&error);
    clEnqueueWriteBuffer(context->queue,gpuOffset,CL_TRUE,0,sizeof(long)*gb->table->totalAttr,cpuOffset,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    if(gbConstant != 1){

        gpuGbType = clCreateBuffer(context->context,CL_MEM_READ_ONLY,sizeof(int)*gb->groupByColNum,NULL,&error);
        clEnqueueWriteBuffer(context->queue,gpuGbType,CL_TRUE,0,sizeof(int)*gb->groupByColNum,gb->groupByType,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        gpuGbSize = clCreateBuffer(context->context,CL_MEM_READ_ONLY,sizeof(int)*gb->groupByColNum,NULL,&error);
        clEnqueueWriteBuffer(context->queue,gpuGbSize,CL_TRUE,0,sizeof(int)*gb->groupByColNum,gb->groupBySize,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        gpuGbKey = clCreateBuffer(context->context,CL_MEM_READ_WRITE,sizeof(int)*gb->table->tupleNum,NULL,&error);

        gpuGbIndex = clCreateBuffer(context->context,CL_MEM_READ_ONLY, sizeof(int)*gb->groupByColNum,NULL,&error);
        clEnqueueWriteBuffer(context->queue,gpuGbIndex,CL_TRUE,0,sizeof(int)*gb->groupByColNum,gb->groupByIndex,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        gpu_hashNum = clCreateBuffer(context->context,CL_MEM_READ_WRITE, sizeof(int)*HSIZE,NULL,&error);

        context->kernel = clCreateKernel(context->program,"cl_memset_int",0);

        int tmp = HSIZE;
        clSetKernelArg(context->kernel,0,sizeof(cl_mem), (void*)&gpu_hashNum);
        clSetKernelArg(context->kernel,1,sizeof(int), (void*)&tmp);

        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif

        context->kernel = clCreateKernel(context->program, "build_groupby_key",0);
        clSetKernelArg(context->kernel,0,sizeof(cl_mem),(void *)&gpuContent);
        clSetKernelArg(context->kernel,1,sizeof(cl_mem),(void *)&gpuOffset);
        clSetKernelArg(context->kernel,2,sizeof(int),(void *)&gpuGbColNum);
        clSetKernelArg(context->kernel,3,sizeof(cl_mem),(void *)&gpuGbIndex);
        clSetKernelArg(context->kernel,4,sizeof(cl_mem),(void *)&gpuGbType);
        clSetKernelArg(context->kernel,5,sizeof(cl_mem),(void *)&gpuGbSize);
        clSetKernelArg(context->kernel,6,sizeof(long),(void *)&gpuTupleNum);
        clSetKernelArg(context->kernel,7,sizeof(cl_mem),(void *)&gpuGbKey);
        clSetKernelArg(context->kernel,8,sizeof(cl_mem),(void *)&gpu_hashNum);

        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif

        clReleaseMemObject(gpuGbType);
        clReleaseMemObject(gpuGbSize);
        clReleaseMemObject(gpuGbIndex);

        gbCount = 1;

        tmp = 0;
        gpuGbCount = clCreateBuffer(context->context,CL_MEM_READ_WRITE, sizeof(int),NULL,&error);
        clEnqueueWriteBuffer(context->queue,gpuGbCount,CL_TRUE,0,sizeof(int),&tmp,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        int hsize = HSIZE;
        context->kernel = clCreateKernel(context->program, "count_group_num",0);
        clSetKernelArg(context->kernel,0,sizeof(cl_mem),(void *)&gpu_hashNum);
        clSetKernelArg(context->kernel,1,sizeof(int),(void *)&hsize);
        clSetKernelArg(context->kernel,2,sizeof(cl_mem),(void *)&gpuGbCount);
        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif

        clEnqueueReadBuffer(context->queue, gpuGbCount, CL_TRUE, 0, sizeof(int), &gbCount,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        gpu_psum = clCreateBuffer(context->context,CL_MEM_READ_WRITE, sizeof(int)*HSIZE,NULL,&error);

        scanImpl(gpu_hashNum,HSIZE,gpu_psum,context,pp);

        clReleaseMemObject(gpuGbCount);
        clReleaseMemObject(gpu_hashNum);
    }

    if(gbConstant == 1)
        res->tupleNum = 1;
    else
        res->tupleNum = gbCount;

    printf("groupBy num %ld\n",res->tupleNum);

    gpuGbType = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*res->totalAttr, NULL, &error);
    clEnqueueWriteBuffer(context->queue,gpuGbType,CL_TRUE,0,sizeof(int)*res->totalAttr,res->attrType,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    gpuGbSize = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*res->totalAttr, NULL, &error);
    clEnqueueWriteBuffer(context->queue,gpuGbSize,CL_TRUE,0,sizeof(int)*res->totalAttr,res->attrSize,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    /*
     * @gpuGbExp is the mathExp in each groupBy expression
     * @mathexp stores the math exp for the group expression that has two operands
     * The reason that we need two variables instead of one is that OpenCL doesn't support pointer to pointer
     * */

    cl_mem gpuGbExp = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(struct mathExp)*res->totalAttr, NULL, &error);
    cl_mem mathexp = clCreateBuffer(context->context, CL_MEM_READ_ONLY, 2*sizeof(struct mathExp)*res->totalAttr, NULL, &error);

    struct mathExp tmpExp[2];
    int * cpuFunc = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(cpuFunc);

    offset = 0;
    for(int i=0;i<res->totalAttr;i++){

        error = clEnqueueWriteBuffer(context->queue, gpuGbExp, CL_TRUE, offset, sizeof(struct mathExp), &(gb->gbExp[i].exp),0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        offset += sizeof(struct mathExp);

        cpuFunc[i] = gb->gbExp[i].func;

        if(gb->gbExp[i].exp.opNum == 2){
            struct mathExp * tmpMath = (struct mathExp *) (gb->gbExp[i].exp.exp);
            tmpExp[0].op = tmpMath[0].op;
            tmpExp[0].opNum = tmpMath[0].opNum;
            tmpExp[0].opType = tmpMath[0].opType;
            tmpExp[0].opValue = tmpMath[0].opValue;

            tmpExp[1].op = tmpMath[1].op;
            tmpExp[1].opNum = tmpMath[1].opNum;
            tmpExp[1].opType = tmpMath[1].opType;
            tmpExp[1].opValue = tmpMath[1].opValue;
            clEnqueueWriteBuffer(context->queue, mathexp, CL_TRUE, 2*i*sizeof(struct mathExp),2*sizeof(struct mathExp),tmpExp,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
            clWaitForEvents(1, &ndrEvt);
            clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
            clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
            pp->pcie += 1e-6 * (endTime - startTime);
#endif
        }
    }

    cl_mem gpuFunc = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*res->totalAttr, NULL, &error);
    clEnqueueWriteBuffer(context->queue,gpuFunc,CL_TRUE,0,sizeof(int)*res->totalAttr,cpuFunc,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    long *resOffset = (long *)malloc(sizeof(long)*res->totalAttr);
    CHECK_POINTER(resOffset);
    
    offset = 0;
    totalSize = 0;
    for(int i=0;i<res->totalAttr;i++){
        
        /*
         * align the output of each column on the boundary of 4
         */

        int size = res->attrSize[i] * res->tupleNum;
        if(size %4 != 0){
            size += 4- (size %4);
        }

        resOffset[i] = offset;
        offset += size; 
        totalSize += size;
    }

    cl_mem gpuResult = clCreateBuffer(context->context,CL_MEM_READ_WRITE, totalSize, NULL, &error);
    cl_mem gpuResOffset = clCreateBuffer(context->context, CL_MEM_READ_ONLY,sizeof(long)*res->totalAttr, NULL,&error);
    clEnqueueWriteBuffer(context->queue,gpuResOffset,CL_TRUE,0,sizeof(long)*res->totalAttr,resOffset,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    gpuGbColNum = res->totalAttr;

    if(gbConstant !=1){
        context->kernel = clCreateKernel(context->program,"agg_cal",0);
        clSetKernelArg(context->kernel,0,sizeof(cl_mem), (void*)&gpuContent);
        clSetKernelArg(context->kernel,1,sizeof(cl_mem), (void*)&gpuOffset);
        clSetKernelArg(context->kernel,2,sizeof(int), (void*)&gpuGbColNum);
        clSetKernelArg(context->kernel,3,sizeof(cl_mem), (void*)&gpuGbExp);
        clSetKernelArg(context->kernel,4,sizeof(cl_mem), (void*)&mathexp);
        clSetKernelArg(context->kernel,5,sizeof(cl_mem), (void*)&gpuGbType);
        clSetKernelArg(context->kernel,6,sizeof(cl_mem), (void*)&gpuGbSize);
        clSetKernelArg(context->kernel,7,sizeof(long), (void*)&gpuTupleNum);
        clSetKernelArg(context->kernel,8,sizeof(cl_mem), (void*)&gpuGbKey);
        clSetKernelArg(context->kernel,9,sizeof(cl_mem), (void*)&gpu_psum);
        clSetKernelArg(context->kernel,10,sizeof(cl_mem), (void*)&gpuResult);
        clSetKernelArg(context->kernel,11,sizeof(cl_mem), (void*)&gpuResOffset);
        clSetKernelArg(context->kernel,12,sizeof(cl_mem), (void*)&gpuFunc);

        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif
        
        clReleaseMemObject(gpuGbKey);
        clReleaseMemObject(gpu_psum);
    }else{
        context->kernel = clCreateKernel(context->program,"agg_cal_cons",0);
        clSetKernelArg(context->kernel,0,sizeof(cl_mem), (void*)&gpuContent);
        clSetKernelArg(context->kernel,1,sizeof(cl_mem), (void*)&gpuOffset);
        clSetKernelArg(context->kernel,2,sizeof(int), (void*)&gpuGbColNum);
        clSetKernelArg(context->kernel,3,sizeof(cl_mem), (void*)&gpuGbExp);
        clSetKernelArg(context->kernel,4,sizeof(cl_mem), (void*)&mathexp);
        clSetKernelArg(context->kernel,5,sizeof(cl_mem), (void*)&gpuGbType);
        clSetKernelArg(context->kernel,6,sizeof(cl_mem), (void*)&gpuGbSize);
        clSetKernelArg(context->kernel,7,sizeof(long), (void*)&gpuTupleNum);
        clSetKernelArg(context->kernel,8,sizeof(cl_mem), (void*)&gpuResult);
        clSetKernelArg(context->kernel,9,sizeof(cl_mem), (void*)&gpuResOffset);
        clSetKernelArg(context->kernel,10,sizeof(cl_mem), (void*)&gpuFunc);

        globalSize = localSize * 4;
        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif
    }

    for(int i=0; i<res->totalAttr;i++){
        res->content[i] = (char *)clCreateBuffer(context->context,CL_MEM_READ_WRITE, res->attrSize[i]*res->tupleNum, NULL, &error); 
        res->dataPos[i] = GPU;
        res->attrTotalSize[i] = res->tupleNum * res->attrSize[i];
        clEnqueueCopyBuffer(context->queue, gpuResult, (cl_mem)res->content[i], resOffset[i],0, res->attrSize[i] * res->tupleNum, 0,0,0);
    }

    free(resOffset);
    free(cpuOffset);

    clFinish(context->queue);
    clReleaseMemObject(gpuContent);
    clReleaseMemObject(gpuResult);
    clReleaseMemObject(gpuOffset);
    clReleaseMemObject(gpuResOffset);
    clReleaseMemObject(gpuGbExp);
    clReleaseMemObject(gpuFunc);

    clock_gettime(CLOCK_REALTIME,&end);
    double timeE = (end.tv_sec - start.tv_sec)* BILLION + end.tv_nsec - start.tv_nsec;
    printf("GroupBy Time: %lf\n", timeE/(1000*1000));

    return res;
}
Example #18
int main(int argc, char **argv) {
  cl_int status;

  const char *platform_name = "NVIDIA";

  if (!find_platform(platform_name, &platform)) {
    fprintf(stderr,"Error: Platform \"%s\" not found\n", platform_name);
    print_platforms();
    teardown(-1);
  }

  status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
  checkError (status, "Error: could not query devices");

  context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
  checkError(status, "could not create context");

  const char name[] = KERNELDIR "/reduce.cl";

  unsigned char *source;
  size_t size;
  if (!load_file(name, &source, &size)) {
    teardown(-1);
  }

  program = clCreateProgramWithSource(context, 1, (const char **) &source, &size, &status);
  checkError(status, "Error: failed to create program %s: ", name);

  status = clBuildProgram(program, 1, &device, "-I.", NULL, NULL);

  if (status != CL_SUCCESS) {
    print_build_log(program, device);
    checkError(status, "Error: failed to create build %s: ", name);
  }

  free(source);

  print_device_info(device, 0);

  queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
  checkError(status, "could not create command queue");

  cl_ulong start, end;
  cl_event event;

  size_t width  = 1024+1024;
  size_t buf_size = width*sizeof(cl_float);

  kernel = clCreateKernel(program, "reduce", &status);
  checkError(status, "could not create kernel");

  size_t work_size = width;
  size_t local_size = 64;
  size_t local_buf_size = local_size * sizeof(cl_float);
  size_t groups = width / local_size;
  size_t res_buf_size = groups * sizeof(cl_float);

  float *data_in  = malloc(buf_size);
  float *data_out = malloc(res_buf_size);
  if (!data_in || !data_out) {
    fprintf(stderr,"\nError: malloc failed\n");
    teardown(-1);
  }

  for (unsigned int i = 0; i < width; ++i) {
    data_in[i] = (float) (i % 16);
  }

  buffer_in = clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &status);
  checkError(status, "Error: could not create buffer_in");

  buffer_out = clCreateBuffer(context, CL_MEM_READ_WRITE, res_buf_size, NULL, &status);
  checkError(status, "Error: could not create buffer_out");

  status = clEnqueueWriteBuffer(queue, buffer_in, CL_FALSE, 0, buf_size, data_in, 0, NULL, NULL);
  checkError(status, "Error: could not copy data into device");

  // execute kernel
  int arg = 0;
  status  = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &buffer_in);
  status  = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &buffer_out);
  status  = clSetKernelArg(kernel, arg++, local_buf_size, NULL);
  cl_int width_arg = (cl_int) width;  /* the kernel argument is a 32-bit int; width is size_t */
  status  = clSetKernelArg(kernel, arg++, sizeof(cl_int), &width_arg);
  checkError(status, "Error: could not set args");

  status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_size, &local_size, 0, NULL, &event);
  checkError(status, "Error: could not enqueue kernel");

  status = clWaitForEvents(1, &event);
  checkError(status, "Error: could not wait for event");

  status  = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
  checkError(status, "Error: could not get start profile information");

  status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
  checkError(status, "Error: could not get end profile information");

  status = clReleaseEvent(event);
  checkError(status, "Error: could not release event");

  // read results back
  status = clEnqueueReadBuffer(queue, buffer_out, CL_TRUE, 0, res_buf_size, data_out, 0, NULL, NULL);
  checkError(status, "Error: could not copy data into device");

  status  = clFinish(queue);
  checkError(status, "Error: could not finish successfully");

  float clsum = 0;
  for (unsigned int i = 0; i < groups; ++i) {
    clsum += data_out[i];
  }

#ifdef DEBUG
  for (int i = 0; i < groups; ++i) {
    printf("%.0f ", data_out[i]);
  }
#endif

  double elapsed = (end - start) * 1e-9f;
  printf("time: %f\n", elapsed);

  float sum = 0;
  for (unsigned int i = 0; i < width; ++i) {
    sum += data_in[i];
  }

  if (sum != clsum)
    fprintf(stderr, "Compare failed: %f != %f\n", clsum, sum);


  free(data_in);
  free(data_out);
  teardown(0);
}
Example #19
int 
NBody::runCLKernels()
{
    cl_int status;
    cl_event events[1];

    /* 
    * Enqueue a kernel run call.
    */
    size_t globalThreads[] = {numBodies};
    size_t localThreads[] = {groupSize};

    if(localThreads[0] > maxWorkItemSizes[0] ||
       localThreads[0] > maxWorkGroupSize)
    {
        std::cout << "Unsupported: Device"
            "does not support requested number of work items.";
        return SDK_FAILURE;
    }

    status = clEnqueueNDRangeKernel(
        commandQueue,
        kernel,
        1,
        NULL,
        globalThreads,
        localThreads,
        0,
        NULL,
        NULL);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clEnqueueNDRangeKernel failed."))
    {
        return SDK_FAILURE;
    }

    status = clFinish(commandQueue);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clFinish failed."))
    {
        return SDK_FAILURE;
    }

    /* Copy data from new to old */
    status = clEnqueueCopyBuffer(commandQueue,
                                 newPos,
                                 currPos,
                                 0,
                                 0,
                                 sizeof(cl_float4) * numBodies,
                                 0,
                                 0,
                                 0);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clEnqueueCopyBuffer failed.(newPos->oldPos)"))
    {
        return SDK_FAILURE;
    }

    status = clEnqueueCopyBuffer(commandQueue,
                                 newVel,
                                 currVel,
                                 0,
                                 0,
                                 sizeof(cl_float4) * numBodies,
                                 0,
                                 0,
                                 0);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clEnqueueCopyBuffer failed.(newVel->oldVels)"))
    {
        return SDK_FAILURE;
    }

    status = clFinish(commandQueue);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clFinish failed."))
    {
        return SDK_FAILURE;
    }

    /* Enqueue readBuffer*/
    status = clEnqueueReadBuffer(
        commandQueue,
        currPos,
        CL_TRUE,
        0,
        numBodies* sizeof(cl_float4),
        pos,
        0,
        NULL,
        &events[0]);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clEnqueueReadBuffer failed."))
        return SDK_FAILURE;

    /* Wait for the read buffer to finish execution */
    status = clWaitForEvents(1, &events[0]);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clWaitForEvents failed."))
        return SDK_FAILURE;

    clReleaseEvent(events[0]);

    return SDK_SUCCESS;
}
Example #20
ErrorStatus crossprod_clblas(cl_device_id device, void *inMatrix, void *outMatrix, int nrow, int ncol, bool use_float)
{
    std::stringstream result;
    
    float *input_matrix_f = (float *)inMatrix;
    
    float *output_matrix_f = (float *)outMatrix;
    
    double *input_matrix_d = (double *)inMatrix;
    
    double *output_matrix_d = (double *)outMatrix;
    
    if (debug) {
        result << "crossprod_clblas( " << (use_float ? "FLOAT" : "DOUBLE") <<
        ", nrow = " << nrow << ", ncol = " << ncol << ")" << std::endl << std::endl;
    }
    
    cl_int err = CL_SUCCESS;

    clblasStatus status = clblasSetup();
    if (status != CL_SUCCESS) {
        if (debug) {
            result << "clblasSetup: " << clblasErrorToString(status) << std::endl;
        }
        
        err = CL_INVALID_OPERATION;
    }

    // get first platform
    cl_platform_id platform = NULL;
    if (err == CL_SUCCESS) {
        err = clGetPlatformIDs(1, &platform, NULL);
    }
    
    if (debug && err == CL_SUCCESS) {
        result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl;
        result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl;
    }
    
    // context
    cl_context context = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateContext:" << std::endl;
        }
        
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    }
    
    // queue
    cl_command_queue queue = NULL;
    if (err == CL_SUCCESS) {
#ifdef CL_VERSION_2_0
        if (debug) {
            result << "clCreateCommandQueueWithProperties:" << std::endl;
        }
        
        queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
        
#else
        if (debug) {
            result << "clCreateCommandQueue:" << std::endl;
        }
        
        queue = clCreateCommandQueue(context, device, 0, &err);
#endif
    }
    
    // buffers
    cl_mem cl_input_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrix:" << std::endl;
        }
        
        if (use_float) {
            cl_input_matrix = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                             nrow * ncol * sizeof(float), input_matrix_f, &err);
            
        } else {
            cl_input_matrix = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                             nrow * ncol * sizeof(double), input_matrix_d, &err);
        }
    }
    
    cl_mem cl_output_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_output_vector:" << std::endl;
        }

        if (use_float) {
            cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                              ncol * ncol * sizeof(float), output_matrix_f, &err);
            
        } else {
            cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                              ncol * ncol * sizeof(double), output_matrix_d, &err);
        }

    }
    
    // ++++++++++++
    const clblasOrder order = clblasColumnMajor;
    const clblasTranspose transA = clblasTrans;
    
    const size_t lda = nrow;
    const size_t ldc = ncol;
    
    const cl_float alpha = 1.0;
    
    clblasUplo uplo = clblasUpper;
    
    cl_event event = NULL;
    
    if (err == CL_SUCCESS) {
        if (use_float) {
            if (debug) {
                result << "clblasSsyrk:" << std::endl;
            }
            
            status = clblasSsyrk(order, uplo, transA, ncol, nrow, alpha, cl_input_matrix, 0, lda, 0.0,
                                 cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event);
            
            if (status != CL_SUCCESS) {
                if (debug) {
                    result << "clblasSsyrk error: " << clblasErrorToString(status) << std::endl;
                }

                err = status;
            }

        } else {
            if (debug) {
                result << "clblasDsyrk:" << std::endl;
            }
            
            status = clblasDsyrk(order, uplo, transA, ncol, nrow, alpha, cl_input_matrix, 0, lda, 0.0,
                                 cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event);
            
            if (status != CL_SUCCESS) {
                if (debug) {
                    result << "clblasDgemm error:" << clblasErrorToString(status) << std::endl;
                }
                
                err = status;
            }
        }
    }
    
    if (err == CL_SUCCESS) {
        /* Wait for calculations to be finished. */
        if (debug) {
            result << "clWaitForEvents:" << std::endl;
        }
        err = clWaitForEvents(1, &event);
    }
    
    // retrieve result
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "Retrieve result:" << std::endl;
        }
        
        if (use_float) {
            clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, ncol * ncol * sizeof(float), output_matrix_f, 0, NULL, NULL);
            symmetrizeSquare_f(output_matrix_f, ncol);
            
        } else {
            clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, ncol * ncol * sizeof(double), output_matrix_d, 0, NULL, NULL);
            symmetrizeSquare_d(output_matrix_d, ncol);
        }
    }
    
    std::string err_str = clErrorToString(err);
    result << std::endl << err_str << std::endl;
    
    // cleanup
    clReleaseMemObject(cl_output_matrix);
    cl_output_matrix = NULL;
    
    clReleaseMemObject(cl_input_matrix);
    cl_input_matrix = NULL;
    
    clReleaseCommandQueue(queue);
    queue = NULL;
    
    clReleaseContext(context);
    context = NULL;
    
    if (debug) {
        CERR << result.str();
    }
    
    ErrorStatus errorStatus = { err, status };
    
//    return status != CL_SUCCESS ? clblasErrorToString(status) : clErrorToString(err);
    return errorStatus;
}
Example #21
void reduceFirstPass( cl_mem d_Rin, int rLen, int numThread, int numMaxBlock, int OPERATOR,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,tempResult *tR, int _CPU_GPU)
{
	int* info = (int*)malloc( sizeof(int)*2 );
	//get the information of partition	
	//return bool: if is multiple of maxNumThread
	//if yes, info[0]: number of blocks, info[1] = maxNumThread
	//if no, info[0]: number of blocks except of the last block, info[1]: number of thread in the last block
	bool isMul = howPartition( rLen, numThread, info );

	//scan the isP2 blocks	
	unsigned int numBlock = info[0];
	unsigned int numElementsPerBlock = 0;
	unsigned int extraSpace = 0;
	unsigned int sharedMemSize = 0;

	cl_mem d_temp; //for coalesced access
	CL_MALLOC( &d_temp, sizeof(int)*rLen );
	cl_mem t_temp=NULL;//!!!!!!!!!!
	CL_MALLOC( &t_temp, sizeof(int)*rLen );
	
	
	unsigned int base = 0;
	unsigned int offset = 0;
	cl_mem d_data;

	if( numBlock > 0 )
	{
		int numChunk = ceil( (float)numBlock/numMaxBlock );

		for( int chunkIdx = 0; chunkIdx < numChunk; chunkIdx++ )
		{
			base = chunkIdx*numElementsPerBlock*numMaxBlock;
			offset = chunkIdx*numMaxBlock;
			int subNumBlock = (chunkIdx == (numChunk - 1))?( numBlock - chunkIdx*numMaxBlock ):(numMaxBlock);

			numElementsPerBlock = numThread*2;
			extraSpace = numElementsPerBlock/NUM_BANKS;
			sharedMemSize = sizeof(int)*( numElementsPerBlock + extraSpace );
  			perscanFirstPass_kernel_int(t_temp, d_temp, tR->d_scanBlockSums[0], d_Rin, numElementsPerBlock, true, base, offset, OPERATOR,subNumBlock, numThread, sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU );
			clWaitForEvents(1,&eventList[(*index-1)%2]); 
		}
	
	}
	clWaitForEvents(1,&eventList[(*index-1)%2]); 
	//scan the single not isP2 block
	if( (!isMul) || (numBlock == 0) )
	{
		base = numElementsPerBlock*info[0];
		offset = info[0];
		unsigned int remainer = rLen - numElementsPerBlock*info[0];

		numThread = info[1];//update the numThread
		
		//if only one elements
		if( numThread == 0 )
		{
			copyLastElement_kernel_int(tR->d_scanBlockSums[0], d_Rin, base, offset,1, 1,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
			clWaitForEvents(1,&eventList[(*index-1)%2]); 
		}
		else
		{
			numBlock = 1;
			numElementsPerBlock = numThread*2;
			extraSpace = numElementsPerBlock/NUM_BANKS;
			sharedMemSize = sizeof(int)*( numElementsPerBlock + extraSpace );
			
			if( isPowerOfTwo( remainer ) )
			{

				perscanFirstPass_kernel_int(t_temp, d_temp, tR->d_scanBlockSums[0], d_Rin, remainer, true, base, offset, OPERATOR ,numBlock, numThread, sharedMemSize,rLen, index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU );
				clWaitForEvents(1,&eventList[(*index-1)%2]); 
			}
			else
			{
				perscanFirstPass_kernel_int(t_temp,d_temp, tR->d_scanBlockSums[0], d_Rin, remainer, false, base, offset, OPERATOR ,numBlock, numThread, sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU );
				clWaitForEvents(1,&eventList[(*index-1)%2]); 
			}	
		}
	
	}
	clWaitForEvents(1,&eventList[(*index-1)%2]);
	CL_FREE( d_temp );
	CL_FREE( t_temp );
	free( info );
}
Example #22
int reduceBlockSums( cl_mem d_Rout, int maxNumThread, int OPERATOR, int rLen,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,tempResult *tR, int _CPU_GPU)
{
	int* info = (int*)malloc( sizeof(int)*2 );
	cl_mem temp=NULL;
	CL_MALLOC(&temp, sizeof(int)*rLen );
	//get the information of partition	
	//return bool: if is multiple of maxNumThread
	//if yes, info[0]: number of blocks, info[1] = maxNumThread
	//if no, info[0]: number of blocks except of the last block, info[1]: number of thread in the last block
	for( int level = 0; level < ( tR->d_numLevelsAllocated - 1 ); level++ )
	{
		bool isMul = howPartition( tR->levelSize[level], maxNumThread, info );

		unsigned int numBlock = info[0];
		unsigned int numElementsPerBlock = 0;
		unsigned int extraSpace = 0;
		unsigned int sharedMemSize = 0;
		
		//scan the isP2 blocks
		if( numBlock > 0 )
		{
			numElementsPerBlock = maxNumThread*2;
			extraSpace = numElementsPerBlock/NUM_BANKS;
			sharedMemSize = sizeof(int)*( numElementsPerBlock + extraSpace );			
			perscan_kernel_int(temp, tR->d_scanBlockSums[level + 1], tR->d_scanBlockSums[level], numElementsPerBlock, true, 0, 0, OPERATOR, numBlock, maxNumThread,sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
			clWaitForEvents(1,&eventList[(*index-1)%2]); 
		}
		clWaitForEvents(1,&eventList[(*index-1)%2]); 
		//scan the single not isP2 block
		if( (!isMul) || (numBlock == 0) )
		{
			unsigned int base = numElementsPerBlock*info[0];
			unsigned int offset = info[0];
			unsigned int remainer = tR->levelSize[level] - numElementsPerBlock*info[0];

			int numThread = info[1];//update the numThread
			clWaitForEvents(1,&eventList[(*index-1)%2]); 
			//only one number in the last block
			if( numThread == 0 )
			{				
				cl_copyBuffer((tR->d_scanBlockSums[level+1]), offset,
					tR->d_scanBlockSums[level], base, sizeof(int), index,eventList,Flag_CPU_GPU,burden,_CPU_GPU); 
			
			}
			else
			{
				numBlock = 1;
				numElementsPerBlock = numThread*2;
				extraSpace = numElementsPerBlock/NUM_BANKS;
				sharedMemSize = sizeof(int)*( numElementsPerBlock + extraSpace );
				
				if( isPowerOfTwo( remainer ) )
				{
					perscan_kernel_int(temp, tR->d_scanBlockSums[level + 1],tR->d_scanBlockSums[level], remainer, true, base, offset, OPERATOR,numBlock, numThread, sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); 
					clWaitForEvents(1,&eventList[(*index-1)%2]); 
				}
				else
				{
					
					perscan_kernel_int(temp,tR->d_scanBlockSums[level + 1], tR->d_scanBlockSums[level], remainer, false, base, offset, OPERATOR,numBlock, numThread, sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
					clWaitForEvents(1,&eventList[(*index-1)%2]); 
				}
			}
			
		}
	}
	clWaitForEvents(1,&eventList[(*index-1)%2]); 
	getResult_kernel_init(tR->d_scanBlockSums[tR->d_numLevelsAllocated - 1], d_Rout, rLen, OPERATOR,1,1,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
	clWaitForEvents(1,&eventList[(*index-1)%2]); 
	CL_FREE(temp);
	free( info );
	return 1;
}
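
The helpers howPartition() and isPowerOfTwo() used by reduceBlockSums() are defined outside this excerpt. howPartition()'s contract is described by the comment above it; isPowerOfTwo() is unambiguous, and a minimal sketch of it (an assumption, not the original code, relying on bool being available as in the surrounding code) would be:

// Sketch only: true when n is a positive power of two.
bool isPowerOfTwo( int n )
{
	return ( n > 0 ) && ( ( n & ( n - 1 ) ) == 0 );
}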
int main()
{
//	unsigned long start_time = time_ms();

	// init matrix
	memset(u, 0, N*N*sizeof(VALUE));

	printf("Jacobi with  N=%d, L_SZ=%d, IT=%d\n", N, L_SZ, IT);
	printf("Kernel file name: %s\n", KERNEL_FILE_NAME);

	// init F
	for(int i=0; i<N; i++)
		for(int j=0; j<N; j++)
			f[i][j] = init_func(i, j);

	VALUE factor = pow((VALUE)1/N, 2);

	// ocl initialization
	cl_context context;
	cl_command_queue command_queue;
	cl_device_id device_id = cluInitDevice(CL_DEVICE, &context, &command_queue);

	// create memory buffers
	cl_int err;
	cl_mem matrix_U = clCreateBuffer(context, CL_MEM_READ_WRITE, N * N * sizeof(VALUE), NULL, &err);
	CLU_ERRCHECK(err, "Failed to create buffer for matrix U");
	cl_mem matrix_F = clCreateBuffer(context, CL_MEM_READ_ONLY, N * N * sizeof(VALUE), NULL, &err);
	CLU_ERRCHECK(err, "Failed to create buffer for matrix F");
	cl_mem matrix_TMP = clCreateBuffer(context, CL_MEM_READ_WRITE, N * N * sizeof(VALUE), NULL, &err);
	CLU_ERRCHECK(err, "Failed to create buffer for matrix TMP");

	// used for profiling info
	cl_event ev_write_TMP;
	cl_event ev_write_U;
	cl_event ev_write_F;
	cl_event ev_kernel;
	cl_event ev_read_TMP;

	double write_total, read_total, kernel_total;
	write_total = read_total = kernel_total = 0.0;

	// create kernel from source
	char tmp[1024];
	sprintf(tmp, "-DN=%i -DVALUE=%s", N, EXPAND_AND_QUOTE(VALUE));
	cl_program program = cluBuildProgramFromFile(context, device_id, KERNEL_FILE_NAME, tmp);
	cl_kernel kernel = clCreateKernel(program, "jacobi", &err);
	CLU_ERRCHECK(err, "Failed to create matrix_mul kernel from program");

	/* ---------------------------- main part ----------------------------------- */

	// the target (TMP) matrix must also be initialized with zero values
	err = clEnqueueWriteBuffer(command_queue, matrix_TMP, CL_TRUE, 0, N * N * sizeof(VALUE), u, 0, NULL, &ev_write_TMP);
	CLU_ERRCHECK(err, "Failed to write matrix TMP to device");
	// write f to device
	err = clEnqueueWriteBuffer(command_queue, matrix_F, CL_FALSE, 0, N * N * sizeof(VALUE), f, 0, NULL, &ev_write_F);
	CLU_ERRCHECK(err, "Failed to write matrix F to device");

	// write matrix u to device
	err = clEnqueueWriteBuffer(command_queue, matrix_U, CL_FALSE, 0, N * N * sizeof(VALUE), u, 0, NULL, &ev_write_U);
	CLU_ERRCHECK(err, "Failed to write matrix U to device");

	// define global work size
	size_t g_work_size[2] = {N, N};
	size_t l_work_size[2] = {L_SZ, L_SZ};

	cl_mem buffer_u;
	cl_mem buffer_tmp;

	for (int i = 0; i < IT; ++i) {
		// swap U and TMP arguments based on iteration counter
		if(i % 2 == 0) {
			buffer_u = matrix_U;
			buffer_tmp = matrix_TMP;
		} else {
			buffer_u = matrix_TMP;
			buffer_tmp = matrix_U;
		}
		// compute memory block dimensions
		int block_dim = (L_SZ + 2) * (L_SZ + 2);
		// set kernel arguments
		cluSetKernelArguments(kernel, 5,
							  sizeof(cl_mem), (void *)&buffer_u,
							  sizeof(cl_mem), (void *)&matrix_F,
							  sizeof(cl_mem), (void *)&buffer_tmp,
							  // local memory buffer
							  block_dim * sizeof(VALUE), NULL,
							  sizeof(VALUE), (void *)&factor);

		// execute kernel
		err = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, g_work_size, l_work_size, 0, NULL, &ev_kernel);
		CLU_ERRCHECK(err, "Failed to enqueue 2D kernel");
		// wait until execution completes
		clWaitForEvents(1, &ev_kernel);
		// add profiling information
		kernel_total += getDurationMS(ev_kernel);
		// release the per-iteration event so one event per iteration is not leaked
		clReleaseEvent(ev_kernel);
	}

	// copy results back to host
	err = clEnqueueReadBuffer(command_queue, buffer_tmp, CL_TRUE, 0, N * N * sizeof(VALUE), u, 0, NULL, &ev_read_TMP);
	CLU_ERRCHECK(err, "Failed reading back result");

	// compute profiling information
	write_total += getDurationMS(ev_write_TMP);
	write_total += getDurationMS(ev_write_U);
	write_total += getDurationMS(ev_write_F);
	read_total += getDurationMS(ev_read_TMP);


	/* ---------------------------- evaluate results ---------------------------------- */
	// print result
	printf("OCL Device: %s\n", cluGetDeviceDescription(device_id, CL_DEVICE));
//	printf("Verification: %4s\n", (success) ? "OK" : "ERR");

	printf("Write total:      %9.4f ms\n", write_total);
	printf("Read total:       %9.4f ms\n", read_total);
	printf("Kernel execution: %9.4f ms\n", kernel_total);
	printf("Time total:       %9.4f ms\n\n", write_total + read_total + kernel_total);
#ifdef DEBUG
	print_result(u);
#endif



	/* ---------------------------- finalization ------------------------------------- */

	err = clFinish(command_queue);
	err |= clReleaseKernel(kernel);
	err |= clReleaseProgram(program);
	err |= clReleaseMemObject(matrix_U);
	err |= clReleaseMemObject(matrix_F);
	err |= clReleaseMemObject(matrix_TMP);
	err |= clReleaseCommandQueue(command_queue);
	err |= clReleaseContext(context);
	CLU_ERRCHECK(err, "Failed during ocl cleanup");

	return EXIT_SUCCESS;
}
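
getDurationMS() is not shown in this excerpt. A minimal sketch of such a helper, assuming the command queue was created with CL_QUEUE_PROFILING_ENABLE (the actual cluInitDevice/getDurationMS may differ), could be:

// Sketch only: device execution time of a completed event, in milliseconds.
static double getDurationMS(cl_event event)
{
	cl_ulong start = 0, end = 0;
	clWaitForEvents(1, &event);
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);
	return (double)(end - start) * 1.0e-6; // nanoseconds -> milliseconds
}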
void wait(int index,cl_event *eventList){
	printf("index of %d Going to wait!\n",index);
	cl_int err=clWaitForEvents(1,&eventList[(index-1)%2]);
	printf("index of %d Finish wait! err is %d\n,",index,err);
}
Exemple #25
template <typename ElemType> nano_time_t
TrsvPerformanceTest<ElemType>::clblasPerfSingle(void)
{
    nano_time_t time;
    cl_event event;
    cl_int status;
    cl_command_queue queue = base_->commandQueues()[0];
    size_t lenX = 1 + ((params_.N-1) * abs(params_.incx)) + params_.offBX;

    status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0,
                                  lenX * sizeof(ElemType), backX_, 0, NULL, &event);
    if (status != CL_SUCCESS) {
        cerr << "Vector X buffer object enqueuing error, status = " <<
                 status << endl;

        return NANOTIME_ERR;
    }

    status = clWaitForEvents(1, &event);
    if (status != CL_SUCCESS) {
        cout << "Wait on event failed, status = " <<
                status << endl;

        return NANOTIME_ERR;
    }

    clReleaseEvent(event);
    event = NULL;

    DataType type;
    type = (typeid(ElemType) == typeid(float))        ? TYPE_FLOAT :
           (typeid(ElemType) == typeid(double))       ? TYPE_DOUBLE :
           (typeid(ElemType) == typeid(FloatComplex)) ? TYPE_COMPLEX_FLOAT :
                                                        TYPE_COMPLEX_DOUBLE;

	time = getCurrentTime();
#define TIMING
#ifdef TIMING
	clFinish( queue);

	int iter = 20;
	for ( int i = 1; i <= iter; i++)
	{
#endif
    status = (cl_int)clMath::clblas::trsv(type, params_.order, params_.uplo,
        params_.transA, params_.diag, params_.N, mobjA_, params_.offa, params_.lda,
        mobjX_, params_.offBX, params_.incx, 1, &queue, 0, NULL, &event);

    if (status != CL_SUCCESS) {
        cerr << "The CLBLAS TRSV function failed, status = " <<
                status << endl;

        return NANOTIME_ERR;
    }

#ifdef TIMING
	} // iter loop
	clFinish( queue);
    time = getCurrentTime() - time;
	time /= iter;
#else

    status = flushAll(1, &queue);
    if (status != CL_SUCCESS) {
        cerr << "clFlush() failed, status = " << status << endl;
        return NANOTIME_ERR;
    }

    status = waitForSuccessfulFinish(1, &queue, &event);
    if (status == CL_SUCCESS) {
        time = getCurrentTime() - time;
    }
    else {
        cerr << "Waiting for completion of commands to the queue failed, "
                "status = " << status << endl;
        time = NANOTIME_ERR;
    }
#endif
    return time;
}
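
getCurrentTime() and waitForSuccessfulFinish() belong to the clBLAS test harness and are not shown here. A plausible stand-in for getCurrentTime(), assuming nano_time_t is a 64-bit unsigned integer type, might be:

#include <time.h>

// Sketch only: monotonic wall-clock timestamp in nanoseconds.
static nano_time_t getCurrentTime(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (nano_time_t)ts.tv_sec * 1000000000ULL + (nano_time_t)ts.tv_nsec;
}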
Exemple #26
int solve_ocl(problem* problem) {

	// -------------- create list of sub-problems ---------------------

	// get upper boundary for number of sub-problems
	int max_problems = powl(problem->size, OMP_CUT);

	// create array of activation records
	call_record* sub_problems = malloc(sizeof(call_record) * max_problems);
	call_record* pos = sub_problems;

	// create store for partial solutions
	int* block = malloc(sizeof(int) * max_problems * problem->size);
	int used = 0;

	// fill up sub-problem array
	volatile int best = 1<<30;
	int partial[problem->size];
	initActivationRecord(problem, partial, 0, 0, 0, &best, &pos, block, &used, OMP_CUT);

	unsigned num_sub_problems = pos - sub_problems;
	printf("Sub-problem list filled %d/%d\n", num_sub_problems, max_problems);


	// -------------- process list using GPU ---------------------

	// ---- process in parallel ----
	cl_int error;	

	// get platform
	cl_platform_id platform;
	{
		cl_uint numPlatforms;
		handleErrorCode(clGetPlatformIDs(0,0, &numPlatforms));
		INFO("Found %d platform(s)!", numPlatforms);

		cl_platform_id ids[numPlatforms];
		handleErrorCode(clGetPlatformIDs(numPlatforms,ids, &numPlatforms));

		platform = ids[0];
	}

	// get devices
	cl_device_id device;
	{
		cl_uint numDevices;
		handleErrorCode(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, 0, &numDevices));
		cl_device_id ids[numDevices];
		handleErrorCode(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, ids, &numDevices));
		device = ids[0];

		size_t size;
		clGetDeviceInfo(device, CL_DEVICE_VENDOR, 0, 0, &size);
		char name[size];
		clGetDeviceInfo(device, CL_DEVICE_VENDOR, size*sizeof(char), name, 0); 
		printf("Using Device %s\n", name);
	}

	// create context
	INFO("Creating context ..");
	cl_context context = clCreateContext(0, 1, &device, 0, 0, &error);
	handleErrorCode(error);

	// create queue
	INFO("Creating command queue ..");
	cl_command_queue queue = clCreateCommandQueue(context, device, 0, &error);
	handleErrorCode(error);

	// create program
	INFO("Building program ..");
	cl_program program;
	{
		const char* code = loadFile("qap_array.cl");
		INFO("Kernel Code:\n%s\n", code);

		size_t code_length = strlen(code);
		program = clCreateProgramWithSource(context,1, &code, &code_length, &error);
		handleErrorCode(error);

		// build program
		error = clBuildProgram(program, 1, &device, 0, 0, 0);

		if (error == CL_BUILD_PROGRAM_FAILURE) {
			printf("Program built failed!\n");

			// obtain additional build error information
			size_t logSize = 2048;
			char log[logSize];
			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, &logSize);

			printf("Log:\n%s\n", log);
				
			exit(1);
		} else {
			handleErrorCode(error);
		}
	}

	// get kernel
	INFO("Building kernel ..");
	cl_kernel kernel = clCreateKernel(program, "qap", &error);
	handleErrorCode(error);

	// create buffers
	INFO("Creating Buffers ..");
	cl_mem mA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*problem->size*problem->size, problem->A->data, &error); handleErrorCode(error);
	cl_mem mB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*problem->size*problem->size, problem->B->data, &error); handleErrorCode(error);
	cl_mem vC = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(call_record)*num_sub_problems, sub_problems, &error); handleErrorCode(error);
	cl_mem vP = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*problem->size*num_sub_problems, block, &error); handleErrorCode(error);
	cl_mem sR = clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR, sizeof(int), (void*)&best, &error); handleErrorCode(error);

	// set up arguments
	INFO("Setting up Arguments ..");
	handleErrorCode(clSetKernelArg(kernel, 0, sizeof(int), &problem->size));
	handleErrorCode(clSetKernelArg(kernel, 1, sizeof(mA), &mA));
	handleErrorCode(clSetKernelArg(kernel, 2, sizeof(mB), &mB));
	handleErrorCode(clSetKernelArg(kernel, 3, sizeof(vC), &vC));
	handleErrorCode(clSetKernelArg(kernel, 4, sizeof(vP), &vP));
	handleErrorCode(clSetKernelArg(kernel, 5, sizeof(sR), &sR));
	handleErrorCode(clSetKernelArg(kernel, 6, sizeof(num_sub_problems), &num_sub_problems));

	// run kernel
	INFO("Running kernel ..");
	size_t local_work_size = 64;
	size_t global_work_offset = 0;
	size_t global_work_size = (num_sub_problems + (local_work_size - 1))/local_work_size*local_work_size;
	cl_event kernel_done;
	double start = getTime();
	handleErrorCode(clEnqueueNDRangeKernel(queue, kernel, 1, &global_work_offset, &global_work_size, &local_work_size, 0, 0, &kernel_done));

	handleErrorCode(clWaitForEvents(1,&kernel_done));
	double time = getTime() - start;
	printf("OpenCL Computation Time: %lf sec\n", time);

	// enqueue read operation
	cl_event read_done;
	int res = 0;
	handleErrorCode(clEnqueueReadBuffer(queue, sR, CL_TRUE, 0, sizeof(int), &res, 1, &kernel_done, &read_done));

	// wait for completion
	handleErrorCode(clWaitForEvents(1,&read_done));

	// cleanup (also release the buffers created above)
	handleErrorCode(clFinish(queue));
	handleErrorCode(clReleaseMemObject(mA));
	handleErrorCode(clReleaseMemObject(mB));
	handleErrorCode(clReleaseMemObject(vC));
	handleErrorCode(clReleaseMemObject(vP));
	handleErrorCode(clReleaseMemObject(sR));
	handleErrorCode(clReleaseKernel(kernel));
	handleErrorCode(clReleaseProgram(program));
	handleErrorCode(clReleaseCommandQueue(queue));
	handleErrorCode(clReleaseContext(context));

	// -------------- free resources and be done ---------------------

	// free temporary arrays
	free(sub_problems);
	free(block);

	// return result
	return res;

/*

	// ---- process in parallel ----
	cl_int error;	

	// get platform
	cl_platform_id platform;
	{
		cl_uint numPlatforms;
		handleErrorCode(clGetPlatformIDs(0,0, &numPlatforms));
		INFO("Found %d platform(s)!", numPlatforms);

		cl_platform_id ids[numPlatforms];
		handleErrorCode(clGetPlatformIDs(numPlatforms,ids, &numPlatforms));

		platform = ids[0];
	}

	// get devices
	cl_device_id device;
	{
		cl_uint numDevices;
		handleErrorCode(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, 0, &numDevices));
		cl_device_id ids[numDevices];
		handleErrorCode(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, ids, &numDevices));
		device = ids[0];
	}

	// create context
	INFO("Creating context ..");
	cl_context context = clCreateContext(0, 1, &device, 0, 0, &error);
	handleErrorCode(error);

	// create queue
	INFO("Creating command queue ..");
	cl_command_queue queue = clCreateCommandQueue(context, device, 0, &error);
	handleErrorCode(error);

	// create program
	INFO("Building program ..");
	cl_program program;
	{
		const char* code = loadFile("fib.cl");
		INFO("Kernel Code:\n%s\n", code);

		size_t code_length = strlen(code);
		program = clCreateProgramWithSource(context,1, &code, &code_length, &error);
		handleErrorCode(error);

		// build program
		error = clBuildProgram(program, 1, &device, 0, 0, 0);

		if (error == CL_BUILD_PROGRAM_FAILURE) {
			printf("Program built failed!\n");

			// obtain additional build error information
			size_t logSize = 2048;
			char log[logSize];
			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, &logSize);

			printf("Log:\n%s\n", log);
				
			exit(1);
		} else {
			handleErrorCode(error);
		}
	}

	// get kernel
	INFO("Building kernel ..");
	cl_kernel kernel = clCreateKernel(program, "fib", &error);
	handleErrorCode(error);

	// create buffers
	INFO("Creating Buffers ..");
	cl_mem vecA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*size, record, &error); handleErrorCode(error);
	cl_mem vecR = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int)*size, 0, &error); handleErrorCode(error);

	// set up arguments
	INFO("Setting up Arguments ..");
	handleErrorCode(clSetKernelArg(kernel, 0, sizeof(vecA), &vecA));
	handleErrorCode(clSetKernelArg(kernel, 1, sizeof(vecR), &vecR));
	handleErrorCode(clSetKernelArg(kernel, 2, sizeof(size), &size));

	// run kernel
	INFO("Running kernel ..");
	size_t local_work_size = 64;
	size_t global_work_offset = 0;
	size_t global_work_size = size;
	cl_event kernel_done;
	double start = getTime();
	handleErrorCode(clEnqueueNDRangeKernel(queue, kernel, 1, &global_work_offset, &global_work_size, &local_work_size, 0, 0, &kernel_done));

	handleErrorCode(clWaitForEvents(1,&kernel_done));
	double time = getTime() - start;
	printf("OpenCL Computation Time: %lf sec\n", time);

	// enqueue read operation
	cl_event read_done;
	int* res = malloc(sizeof(int)*size);
	handleErrorCode(clEnqueueReadBuffer(queue, vecR, true, 0, sizeof(int)*size, res, 1, &kernel_done, &read_done));

	// wait for completion
	handleErrorCode(clWaitForEvents(1,&read_done));

	// cleanup
	handleErrorCode(clFinish(queue));
	handleErrorCode(clReleaseKernel(kernel));
	handleErrorCode(clReleaseProgram(program));
	handleErrorCode(clReleaseCommandQueue(queue));
	handleErrorCode(clReleaseContext(context));

	// aggregate results	TODO: move reduction to OpenCL kernel
	int sum = 0;
	#pragma omp parallel for reduction(+:sum)
	for(int i=0; i<size; i++) {
		sum += res[i];
	}

	// release arrays
	free(record);
	free(res);

	// done
	return sum;
*/
}
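
loadFile() and handleErrorCode() are helpers defined outside this excerpt. Hedged sketches of what they presumably do (read a kernel source file into a NUL-terminated string, and abort on any non-success OpenCL status), assuming the OpenCL headers are already included:

#include <stdio.h>
#include <stdlib.h>

// Sketch only: read an entire file into a heap-allocated, NUL-terminated string.
static const char* loadFile(const char* path) {
	FILE* f = fopen(path, "rb");
	if (!f) { perror(path); exit(1); }
	fseek(f, 0, SEEK_END);
	long size = ftell(f);
	fseek(f, 0, SEEK_SET);
	char* code = malloc(size + 1);
	if (!code || fread(code, 1, size, f) != (size_t)size) { perror(path); exit(1); }
	code[size] = '\0';
	fclose(f);
	return code;
}

// Sketch only: abort on any OpenCL error code.
static void handleErrorCode(cl_int error) {
	if (error != CL_SUCCESS) {
		fprintf(stderr, "OpenCL error: %d\n", (int)error);
		exit(1);
	}
}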
Exemple #27
void matrixMulGPU(cl_uint ciDeviceCount, cl_mem h_A, float* h_B_data, unsigned int mem_size_B, float* h_C )
{
    cl_mem d_A[MAX_GPU_COUNT];
    cl_mem d_C[MAX_GPU_COUNT];
    cl_mem d_B[MAX_GPU_COUNT];

    cl_event GPUDone[MAX_GPU_COUNT];
    cl_event GPUExecution[MAX_GPU_COUNT];

    // Start the computation on each available GPU
    
    // Create buffers for each GPU
    // Each GPU will compute sizePerGPU rows of the result
    int sizePerGPU = uiHA / ciDeviceCount;

    int workOffset[MAX_GPU_COUNT];
    int workSize[MAX_GPU_COUNT];

    workOffset[0] = 0;
    for(unsigned int i=0; i < ciDeviceCount; ++i) 
    {
        // Input buffer
        workSize[i] = (i != (ciDeviceCount - 1)) ? sizePerGPU : (uiHA - workOffset[i]);        

        d_A[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float) * uiWA, NULL,NULL);

        // Copy only assigned rows from host to device
        clEnqueueCopyBuffer(commandQueue[i], h_A, d_A[i], workOffset[i] * sizeof(float) * uiWA, 
                            0, workSize[i] * sizeof(float) * uiWA, 0, NULL, NULL);        
        
        // create an OpenCL buffer on the device that will be initialized from
        // host memory on first use on the device
        d_B[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                mem_size_B, h_B_data, NULL);

        // Output buffer
        d_C[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY,  workSize[i] * uiWC * sizeof(float), NULL,NULL);
              
        // set the args values
        clSetKernelArg(multiplicationKernel[i], 0, sizeof(cl_mem), (void *) &d_C[i]);
        clSetKernelArg(multiplicationKernel[i], 1, sizeof(cl_mem), (void *) &d_A[i]);
        clSetKernelArg(multiplicationKernel[i], 2, sizeof(cl_mem), (void *) &d_B[i]);
        clSetKernelArg(multiplicationKernel[i], 3, sizeof(float) * BLOCK_SIZE *BLOCK_SIZE, 0 );
        clSetKernelArg(multiplicationKernel[i], 4, sizeof(float) * BLOCK_SIZE *BLOCK_SIZE, 0 );
        clSetKernelArg(multiplicationKernel[i], 5, sizeof(cl_int), (void *) &uiWA);
        clSetKernelArg(multiplicationKernel[i], 6, sizeof(cl_int), (void *) &uiWB);

        if(i+1 < ciDeviceCount)
            workOffset[i + 1] = workOffset[i] + workSize[i];
    }
    
    // Execute Multiplication on all GPUs in parallel
    size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
    size_t globalWorkSize[] = {shrRoundUp(BLOCK_SIZE, uiWC), shrRoundUp(BLOCK_SIZE, workSize[0])};
    
    // Launch kernels on devices
#ifdef GPU_PROFILING
	
	int nIter = 30;

    for (int j = -1; j < nIter; j++) 
    {
        // Sync all queues to host and start timer first time through loop
        if(j == 0){
            for(unsigned int i = 0; i < ciDeviceCount; i++) 
            {
                clFinish(commandQueue[i]);
            }

            shrDeltaT(0);
        }
#endif
        for(unsigned int i = 0; i < ciDeviceCount; i++) 
        {
			// Multiplication - non-blocking execution:  launch and push to device(s)
			globalWorkSize[1] = shrRoundUp(BLOCK_SIZE, workSize[i]);
			clEnqueueNDRangeKernel(commandQueue[i], multiplicationKernel[i], 2, 0, globalWorkSize, localWorkSize,
				                   0, NULL, &GPUExecution[i]);
            clFlush(commandQueue[i]);
		}

#ifdef GPU_PROFILING
    }
#endif

    // sync all queues to host
	for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {
		clFinish(commandQueue[i]);
	}

#ifdef GPU_PROFILING

    // stop and log timer 
    double dSeconds = shrDeltaT(0)/(double)nIter;
    double dNumOps = 2.0 * (double)uiWA * (double)uiHA * (double)uiWB;
    double gflops = 1.0e-9 * dNumOps/dSeconds;
    shrLogEx(LOGBOTH | MASTER, 0, "oclMatrixMul, Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Workgroup = %u\n", 
            gflops, dSeconds, dNumOps, ciDeviceCount, localWorkSize[0] * localWorkSize[1]);

    // Print kernel timing per GPU
    shrLog("\n");
    for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {    
        shrLog("  Kernel execution time on GPU %d \t: %.5f s\n", i, executionTime(GPUExecution[i]));
    }
    shrLog("\n");
#endif

    for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {    
        // Non-blocking copy of result from device to host
        clEnqueueReadBuffer(commandQueue[i], d_C[i], CL_FALSE, 0, uiWC * sizeof(float) * workSize[i], 
                            h_C + workOffset[i] * uiWC, 0, NULL, &GPUDone[i]);
    }

	// CPU sync with GPU
    clWaitForEvents(ciDeviceCount, GPUDone);


    // Release mem and event objects    
    for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {
        clReleaseMemObject(d_A[i]);
        clReleaseMemObject(d_C[i]);
        clReleaseMemObject(d_B[i]);
	    clReleaseEvent(GPUExecution[i]);
	    clReleaseEvent(GPUDone[i]);
    }
}
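
shrRoundUp() and executionTime() appear to come from the NVIDIA OpenCL SDK sample utilities and are not part of this excerpt. Sketches of what they compute (executionTime assumes the command queues were created with profiling enabled; the real helpers may differ):

// Sketch only: round global_size up to the next multiple of group_size.
static size_t shrRoundUp(size_t group_size, size_t global_size)
{
    size_t r = global_size % group_size;
    return (r == 0) ? global_size : global_size + group_size - r;
}

// Sketch only: kernel execution time of a profiled event, in seconds.
static double executionTime(cl_event event)
{
    cl_ulong start = 0, end = 0;
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);
    return (double)(end - start) * 1.0e-9;
}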
Exemple #28
bool SkDifferentPixelsMetric::diff(SkBitmap* baseline, SkBitmap* test, bool computeMask,
                                   Result* result) const {
    double startTime = get_seconds();

    if (!fIsGood) {
        return false;
    }

    // If we never end up running the kernel, include some safe defaults in the result.
    result->poiCount = 0;

    // Ensure the images are comparable
    if (baseline->width() != test->width() || baseline->height() != test->height() ||
        baseline->width() <= 0 || baseline->height() <= 0 ||
        baseline->config() != test->config()) {
        return false;
    }

    cl_mem baselineImage;
    cl_mem testImage;
    cl_mem resultsBuffer;

    // Upload images to the CL device
    if (!this->makeImage2D(baseline, &baselineImage) || !this->makeImage2D(test, &testImage)) {
        SkDebugf("creation of openCL images failed");
        return false;
    }

    // A small hack that makes calculating percentage difference easier later on.
    result->result = 1.0 / ((double)baseline->width() * baseline->height());

    // Make a buffer to store results into. It must be initialized with pointers to memory.
    static const int kZero = 0;
    // We know OpenCL won't write to it because we use CL_MEM_COPY_HOST_PTR
    resultsBuffer = clCreateBuffer(fContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                   sizeof(int), (int*)&kZero, NULL);

    // Set all kernel arguments
    cl_int setArgErr = clSetKernelArg(fKernel, 0, sizeof(cl_mem), &baselineImage);
    setArgErr       |= clSetKernelArg(fKernel, 1, sizeof(cl_mem), &testImage);
    setArgErr       |= clSetKernelArg(fKernel, 2, sizeof(cl_mem), &resultsBuffer);
    if (CL_SUCCESS != setArgErr) {
        SkDebugf("Set arg failed: %s\n", cl_error_to_string(setArgErr));
        return false;
    }

    // Queue this diff on the CL device
    cl_event event;
    const size_t workSize[] = { baseline->width(), baseline->height() };
    cl_int enqueueErr;
    enqueueErr = clEnqueueNDRangeKernel(fCommandQueue, fKernel, 2, NULL, workSize,
                                        NULL, 0, NULL, &event);
    if (CL_SUCCESS != enqueueErr) {
        SkDebugf("Enqueue failed: %s\n", cl_error_to_string(enqueueErr));
        return false;
    }

    // This makes things totally synchronous; proper asynchronous queueing of diffs is not implemented yet
    clWaitForEvents(1, &event);

    // Immediately read back the results
    clEnqueueReadBuffer(fCommandQueue, resultsBuffer, CL_TRUE, 0,
                        sizeof(int), &result->poiCount, 0, NULL, NULL);
    result->result *= (double)result->poiCount;
    result->result = (1.0 - result->result);

    // Release all the buffers created
    clReleaseMemObject(resultsBuffer);
    clReleaseMemObject(baselineImage);
    clReleaseMemObject(testImage);

    result->timeElapsed = get_seconds() - startTime;
    return true;
}
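
cl_error_to_string() is defined elsewhere in this tool; a hedged sketch mapping a few common status codes to readable names (the real helper presumably covers many more codes):

// Sketch only: translate a handful of OpenCL status codes into strings.
static const char* cl_error_to_string(cl_int err) {
    switch (err) {
        case CL_SUCCESS:                       return "CL_SUCCESS";
        case CL_INVALID_KERNEL_ARGS:           return "CL_INVALID_KERNEL_ARGS";
        case CL_INVALID_WORK_GROUP_SIZE:       return "CL_INVALID_WORK_GROUP_SIZE";
        case CL_OUT_OF_RESOURCES:              return "CL_OUT_OF_RESOURCES";
        case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
        default:                               return "unknown OpenCL error";
    }
}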
Exemple #29
int main(int argc, char **argv)
{
	cl_platform_id platforms[100];
	cl_uint platforms_n = 0;
	CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n));

	printf("=== %d OpenCL platform(s) found: ===\n", platforms_n);
	for (int i=0; i<platforms_n; i++)
	{
		char buffer[10240];
		printf("  -- %d --\n", i);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL));
		printf("  PROFILE = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL));
		printf("  VERSION = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL));
		printf("  NAME = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL));
		printf("  VENDOR = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL));
		printf("  EXTENSIONS = %s\n", buffer);
	}

	if (platforms_n == 0)
		return 1;

	cl_device_id devices[100];
	cl_uint devices_n = 0;
	// CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n));
	CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 100, devices, &devices_n));

	printf("=== %d OpenCL device(s) found on platform:\n", platforms_n);
	for (int i=0; i<devices_n; i++)
	{
		char buffer[10240];
		cl_uint buf_uint;
		cl_ulong buf_ulong;
		printf("  -- %d --\n", i);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_NAME = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_VENDOR = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_VERSION = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL));
		printf("  DRIVER_VERSION = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL));
		printf("  DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL));
		printf("  DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL));
		printf("  DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong);
	}

	if (devices_n == 0)
		return 1;

	cl_context context;
	context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices, &pfn_notify, NULL, &_err));

	const char *program_source[] = {
		"__kernel void simple_demo(__global int *src, __global int *dst, int factor)\n",
		"{\n",
		"	int i = get_global_id(0);\n",
		"	dst[i] = src[i] * factor;\n",
		"}\n"
	};

	cl_program program;
	program = CL_CHECK_ERR(clCreateProgramWithSource(context, sizeof(program_source)/sizeof(*program_source), program_source, NULL, &_err));
	if (clBuildProgram(program, 1, devices, "", NULL, NULL) != CL_SUCCESS) {
		char buffer[10240];
		clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
		fprintf(stderr, "CL Compilation failed:\n%s", buffer);
		abort();
	}
	CL_CHECK(clUnloadCompiler());

	cl_mem input_buffer;
	input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*NUM_DATA, NULL, &_err));

	cl_mem output_buffer;
	output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int)*NUM_DATA, NULL, &_err));

	int factor = 2;

	cl_kernel kernel;
	kernel = CL_CHECK_ERR(clCreateKernel(program, "simple_demo", &_err));
	CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer));
	CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer));
	CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor));

	cl_command_queue queue;
	queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[0], 0, &_err));

	for (int i=0; i<NUM_DATA; i++) {
		CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &i, 0, NULL, NULL));
	}

	cl_event kernel_completion;
	size_t global_work_size[1] = { NUM_DATA };
	CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion));
	CL_CHECK(clWaitForEvents(1, &kernel_completion));
	CL_CHECK(clReleaseEvent(kernel_completion));

	printf("Result:");
	for (int i=0; i<NUM_DATA; i++) {
		int data;
		CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &data, 0, NULL, NULL));
		printf(" %d", data);
	}
	printf("\n");

	CL_CHECK(clReleaseMemObject(input_buffer));
	CL_CHECK(clReleaseMemObject(output_buffer));

	CL_CHECK(clReleaseKernel(kernel));
	CL_CHECK(clReleaseProgram(program));
	CL_CHECK(clReleaseContext(context));

	return 0;
}
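
CL_CHECK, CL_CHECK_ERR and pfn_notify are defined above this excerpt. Judging by their use (CL_CHECK_ERR wraps calls whose error output argument is &_err), plausible definitions, assuming GCC statement expressions are acceptable, look roughly like:

// Sketch only: abort if an OpenCL call does not return CL_SUCCESS.
#define CL_CHECK(_expr)                                                        \
	do {                                                                       \
		cl_int _err = _expr;                                                   \
		if (_err != CL_SUCCESS) {                                              \
			fprintf(stderr, "OpenCL error %d at '%s'\n", (int)_err, #_expr);   \
			abort();                                                           \
		}                                                                      \
	} while (0)

// Sketch only: evaluate an expression that reports its status through &_err,
// abort on failure, and otherwise yield the expression's value.
#define CL_CHECK_ERR(_expr)                                                    \
	({                                                                         \
		cl_int _err = CL_INVALID_VALUE;                                        \
		__typeof__(_expr) _ret = _expr;                                        \
		if (_err != CL_SUCCESS) {                                              \
			fprintf(stderr, "OpenCL error %d at '%s'\n", (int)_err, #_expr);   \
			abort();                                                           \
		}                                                                      \
		_ret;                                                                  \
	})

// Sketch only: context error callback passed to clCreateContext.
static void pfn_notify(const char *errinfo, const void *private_info,
                       size_t cb, void *user_data)
{
	fprintf(stderr, "OpenCL context error: %s\n", errinfo);
}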
Exemple #30
int main(int argc, char *argv[])
{
#define EXTRAROOM 1024
	char type_def[] = "-DTYPE=floatXX";
	char* type_ptr= type_def + sizeof(type_def) - 3;
	cl_uint vec_width = 1;

	// selected platform and device number
	cl_uint pn = 0, dn = 0;

	// OpenCL error
	cl_int error;

	// generic iterator
	cl_uint i;

	// set platform/device num from command line
	if (argc > 1)
		pn = atoi(argv[1]);
	if (argc > 2)
		dn = atoi(argv[2]);
	if (argc > 3) {
		vec_width = atoi(argv[3]);
		// this should only be 2, 4, 8, 16
		// if the user passes bogus data, it's their problem.
		if (vec_width > 1)
			strncpy(type_ptr, argv[3], 2);
		if (vec_width == 3)
			vec_width++;
	} else {
		*type_ptr = '\0';
	}


	error = clGetPlatformIDs(0, NULL, &np);
	CHECK_ERROR("getting amount of platform IDs");
	printf("%u platforms found\n", np);
	if (pn >= np) {
		fprintf(stderr, "there is no platform #%u\n" , pn);
		exit(1);
	}
	// only allocate for IDs up to the intended one
	platform = calloc(pn+1,sizeof(*platform));
	// if allocation failed, next call will bomb. rely on this
	error = clGetPlatformIDs(pn+1, platform, NULL);
	CHECK_ERROR("getting platform IDs");

	// choose platform
	p = platform[pn];

	error = clGetPlatformInfo(p, CL_PLATFORM_NAME, BUFSZ, strbuf, NULL);
	CHECK_ERROR("getting platform name");
	printf("using platform %u: %s\n", pn, strbuf);

	error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &nd);
	CHECK_ERROR("getting amount of device IDs");
	printf("%u devices found\n", nd);
	if (dn >= nd) {
		fprintf(stderr, "there is no device #%u\n", dn);
		exit(1);
	}
	// only allocate for IDs up to the intended one
	device = calloc(dn+1,sizeof(*device));
	// if allocation failed, next call will bomb. rely on this
	error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, dn+1, device, NULL);
	CHECK_ERROR("getting device IDs");

	// choose device
	d = device[dn];
	error = clGetDeviceInfo(d, CL_DEVICE_NAME, BUFSZ, strbuf, NULL);
	CHECK_ERROR("getting device name");
	printf("using device %u: %s\n", dn, strbuf);

	error = clGetDeviceInfo(d, CL_DEVICE_GLOBAL_MEM_SIZE,
			sizeof(gmem), &gmem, NULL);
	CHECK_ERROR("getting device global memory size");
	error = clGetDeviceInfo(d, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
			sizeof(alloc_max), &alloc_max, NULL);
	CHECK_ERROR("getting device max memory allocation size");

	// create context
	ctx_prop[1] = (cl_context_properties)p;
	ctx = clCreateContext(ctx_prop, 1, &d, NULL, NULL, &error);
	CHECK_ERROR("creating context");

	// create queue
	q = clCreateCommandQueue(ctx, d, CL_QUEUE_PROFILING_ENABLE, &error);
	CHECK_ERROR("creating queue");

	// create program
	pg = clCreateProgramWithSource(ctx, sizeof(src)/sizeof(*src), src, NULL, &error);
	CHECK_ERROR("creating program");

	// build program
	printf("OpenCL program build options: %s\n", type_def);
	error = clBuildProgram(pg, 1, &d, type_def, NULL, NULL);
#if 1
	if (error == CL_BUILD_PROGRAM_FAILURE) {
		error = clGetProgramBuildInfo(pg, d, CL_PROGRAM_BUILD_LOG,
			BUFSZ, strbuf, NULL);
		CHECK_ERROR("get program build info");
		printf("=== BUILD LOG ===\n%s\n=========\n", strbuf);
	}
#endif
	CHECK_ERROR("building program");

	// get kernels
	k_set = clCreateKernel(pg, "set", &error);
	CHECK_ERROR("creating kernel set");

	k_add = clCreateKernel(pg, "add", &error);
	CHECK_ERROR("creating kernel add");


	error = clGetKernelWorkGroupInfo(k_add, d, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
			sizeof(wgm), &wgm, NULL);
	CHECK_ERROR("getting preferred workgroup size multiple");

	// we allocate two buffers
	nbuf = 2;

	// reduce buffer allocation size to ensure we can fit all buffer
	// into the device memory
	if (alloc_max > gmem/nbuf)
		buf_size = gmem/nbuf;
	else
		buf_size = alloc_max;

	// number of elements that fit in the given buf_size
	nels = buf_size/sizeof(cl_float)/vec_width;
	// set the buffer size to match exactly what we need
	buf_size = nels*sizeof(cl_float)*vec_width;

	gws = ROUND_MUL(nels, wgm);

	printf("will use %zu workitems to process %u elements of type %s\n",
			gws, nels, type_def + 7);

#define MB (1024*1024.0)

	printf("will try allocating %u buffers of %gMB each\n", nbuf, buf_size/MB);

	buf = calloc(nbuf, sizeof(cl_mem));

	if (!buf) {
		fprintf(stderr, "could not prepare support for %u buffers\n", nbuf);
		exit(1);
	}

	// we try multiple configurations: no HOST_PTR flags, USE_HOST_PTR and ALLOC_HOST_PTR
	const cl_mem_flags buf_flags[] = {
		CL_MEM_READ_WRITE,
		CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE,
		CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
		CL_MEM_READ_WRITE,
	};

	const size_t nturns = sizeof(buf_flags)/sizeof(*buf_flags);
	const size_t nloops = 5; // number of loops for each turn, for stats
	const size_t median = nloops/2; // location of median value after sorting
	const size_t gmem_bytes_rw = 2*buf_size;

	const char * const flag_names[] = {
		"(none)", "USE_HOST_PTR", "ALLOC_HOST_PTR", "(none)"
	};

	double runtimes[nturns][3][nloops]; /* set, add, map */
	memset(runtimes, 0, nturns*sizeof(*runtimes));

	hbuf = calloc(nbuf, sizeof(*hbuf));
	if (!hbuf) {
		fputs("couldn't allocate host buffer array\n", stderr);
		exit(1);
	}

	for (size_t turn = 0; turn < sizeof(buf_flags)/sizeof(*buf_flags); ++turn) {
		for (i = 0; i < nbuf; ++i) {
			if (buf_flags[turn] & CL_MEM_USE_HOST_PTR) {
				hbuf[i] = calloc(buf_size, 1);
				if (!hbuf[i]) {
					fputs("couldn't allocate host buffer array\n", stderr);
					exit(1);
				}
			}
			buf[i] = clCreateBuffer(ctx, buf_flags[turn], buf_size,
				hbuf[i], &error);
			CHECK_ERROR("allocating buffer");
			printf("buffer %u allocated\n", i);
		}

		for (size_t loop = 0; loop < nloops; ++loop) {
			clSetKernelArg(k_set, 0, sizeof(buf[0]), buf);
			clSetKernelArg(k_set, 1, sizeof(buf[1]), buf + 1);
			clSetKernelArg(k_set, 2, sizeof(nels), &nels);
			error = clEnqueueNDRangeKernel(q, k_set, 1, NULL, &gws, NULL,
					0, NULL, &set_event);
			CHECK_ERROR("enqueueing kernel set");

			clSetKernelArg(k_add, 0, sizeof(buf[0]), buf);
			clSetKernelArg(k_add, 1, sizeof(buf[1]), buf + 1);
			clSetKernelArg(k_add, 2, sizeof(nels), &nels);
			error = clEnqueueNDRangeKernel(q, k_add, 1, NULL, &gws, NULL,
					1, &set_event, &add_event);
			CHECK_ERROR("enqueueing kernel add");

			float *hmap = clEnqueueMapBuffer(q, buf[0], CL_TRUE,
				CL_MAP_READ, 0, buf_size, 1, &add_event, &map_event, &error);
			CHECK_ERROR("map");

			error = clWaitForEvents(1, &map_event);
			CHECK_ERROR("map event");

			printf("Turn %zu, loop %zu: %s\n", turn, loop, flag_names[turn]);
			runtimes[turn][0][loop] = event_perf(set_event, gmem_bytes_rw, "set");
			runtimes[turn][1][loop] = event_perf(add_event, gmem_bytes_rw, "add");
			runtimes[turn][2][loop] = event_perf(map_event, buf_size, "map");

			clEnqueueUnmapMemObject(q, buf[0], hmap, 0, NULL, NULL);

			clFinish(q);

			// release the events
			clReleaseEvent(set_event);
			clReleaseEvent(add_event);
			clReleaseEvent(map_event);
		}

		// release the buffers
		for (i = 0; i < nbuf; ++i) {
			if (buf_flags[turn] & CL_MEM_USE_HOST_PTR) {
				free(hbuf[i]);
				hbuf[i] = NULL;
			}
			clReleaseMemObject(buf[i]);
		}

	}

	puts("Summary/stats:");

	for (size_t turn = 0; turn < nturns; ++turn) {
		double avg[3] = {0};

		/* I'm lazy, so sort with qsort and then compute average,
		 * otherwise we could just compute min, max, avg and median together */
		qsort(runtimes[turn][0], nloops, sizeof(double), compare_double);
		qsort(runtimes[turn][1], nloops, sizeof(double), compare_double);
		qsort(runtimes[turn][2], nloops, sizeof(double), compare_double);
		for (size_t loop = 0; loop < nloops; ++loop) {
			avg[0] += runtimes[turn][0][loop];
			avg[1] += runtimes[turn][1][loop];
			avg[2] += runtimes[turn][2][loop];
		}
		avg[0] /= nloops;
		avg[1] /= nloops;
		avg[2] /= nloops;

		printf("Turn %zu: %s\n", turn, flag_names[turn]);
		printf("set\ttime (ms): best: %8g, median: %8g, worst: %8g, avg: %8g\n",
			runtimes[turn][0][0],
			runtimes[turn][0][median],
			runtimes[turn][0][nloops - 1],
			avg[0]);
		printf("\tBW (GB/s): best: %8g, median: %8g, worst: %8g, avg: %8g\n",
			gmem_bytes_rw/runtimes[turn][0][0]*1.0e-6,
			gmem_bytes_rw/runtimes[turn][0][median]*1.0e-6,
			gmem_bytes_rw/runtimes[turn][0][nloops - 1]*1.0e-6,
			gmem_bytes_rw/avg[0]*1.0e-6);
		printf("add\ttime (ms): best: %8g, median: %8g worst: %8g, avg: %8g\n",
			runtimes[turn][1][0],
			runtimes[turn][1][median],
			runtimes[turn][1][nloops - 1],
			avg[1]);
		printf("\tBW (GB/s): best: %8g, median: %8g, worst: %8g, avg: %8g\n",
			gmem_bytes_rw/runtimes[turn][1][0]*1.0e-6,
			gmem_bytes_rw/runtimes[turn][1][median]*1.0e-6,
			gmem_bytes_rw/runtimes[turn][1][nloops - 1]*1.0e-6,
			gmem_bytes_rw/avg[1]*1.0e-6);
		printf("map\ttime (ms): best: %8g, median: %8g, worst: %8g, avg: %8g\n",
			runtimes[turn][2][0],
			runtimes[turn][2][median],
			runtimes[turn][2][nloops - 1],
			avg[2]);
		printf("\tBW (GB/s): best: %8g, median: %8g, worst: %8g, avg: %8g\n",
			buf_size/runtimes[turn][2][0]*1.0e-6,
			buf_size/runtimes[turn][2][median]*1.0e-6,
			buf_size/runtimes[turn][2][nloops - 1]*1.0e-6,
			buf_size/avg[2]*1.0e-6);

	}


	return 0;
}
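
event_perf(), compare_double() and ROUND_MUL() are defined above this excerpt. Based on how they are used here (event_perf returns a runtime in milliseconds that the summary later converts into GB/s, compare_double is a qsort comparator for doubles, and ROUND_MUL rounds a count up to a multiple), hedged sketches could look like:

// Sketch only: round x up to the next multiple of base.
#define ROUND_MUL(x, base) ((((x) + (base) - 1)/(base))*(base))

// Sketch only: qsort comparator for ascending doubles.
static int compare_double(const void *a, const void *b)
{
	double da = *(const double*)a, db = *(const double*)b;
	return (da > db) - (da < db);
}

// Sketch only: runtime of a profiled event in milliseconds, logging the
// implied bandwidth for nbytes of traffic.
static double event_perf(cl_event evt, size_t nbytes, const char *name)
{
	cl_ulong start = 0, end = 0;
	clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
	clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);
	double ms = (double)(end - start)*1.0e-6;
	printf("%s\ttime (ms): %8g, BW (GB/s): %8g\n", name, ms, nbytes/ms*1.0e-6);
	return ms;
}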