/*
 * Launch kernel `k` on command queue `q` over `nbobj` work-items, capping the
 * work-group size at `nbthread`, then block until completion.
 * Returns the elapsed kernel time as reported by oclChronoElaps().
 * `fname`/`line` identify the call site for error reporting via oclCheckErrF.
 */
double oclLaunchKernel(cl_kernel k, cl_command_queue q, int nbobj, int nbthread, const char *fname, const int line) {
    cl_int err = 0;
    dim3 gws, lws;
    cl_event event;
    double elapsk;
    int maxThreads = 0;
    cl_uint one = 1;
    cl_device_id dId = oclGetDeviceOfCQueue(q);
    size_t prefsz = 32;

    /* Work-group size = min(kernel/device limit, caller's request). */
    maxThreads = oclGetMaxWorkSize(k, dId);
    maxThreads = MIN(maxThreads, nbthread);

    // Get the proper size for the hardware
    err = clGetKernelWorkGroupInfo(k, dId, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(prefsz), &prefsz, NULL);
    oclCheckErr(err, "clGetKernelWorkGroupInfo CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE");

    // make sure we have the proper multiple: AMD 7970 crashes if this is not met.
    maxThreads = oclMultiple(maxThreads, prefsz);
    // printf("1D %d \n", maxThreads);

    /* Build the 1D global/local NDRange from the object count and group size. */
    oclMkNDrange(nbobj, maxThreads, NDR_1D, gws, lws);
    // printf("Launch: %ld G:%ld %ld %ld L:%ld %ld %ld\n", nbobj, gws[0], gws[1], gws[2], lws[0], lws[1], lws[2]);

    err = clEnqueueNDRangeKernel(q, k, NDR_1D, NULL, gws, lws, 0, NULL, &event);
    oclCheckErrF(err, "clEnqueueNDRangeKernel", fname, line);

    /* Block until the kernel finishes so its timing can be read from the event. */
    err = clWaitForEvents(one, &event);
    oclCheckErrF(err, "clWaitForEvents", fname, line);

    elapsk = oclChronoElaps(event);

    err = clReleaseEvent(event);
    oclCheckErrF(err, "clReleaseEvent", fname, line);

    return elapsk;
}
// Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; auto status = Hbmv(args.layout, args.triangle, args.n, args.kl, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), args.x_offset, args.x_inc, args.beta, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } #elif CUDA_API auto status = Hbmv(args.layout, args.triangle, args.n, args.kl, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), args.x_offset, args.x_inc, args.beta, buffers.y_vec(), args.y_offset, args.y_inc, queue.GetContext()(), queue.GetDevice()()); cuStreamSynchronize(queue()); #endif return status; }
/*
 * Build a TimelineEntry from a profiled OpenCL event.
 * Blocks until the event completes, then splits its lifetime into:
 *   start_time      - when the command was queued (seconds)
 *   end_time        - when execution finished (seconds)
 *   execution_time  - device execution span (START..END)
 *   api_overhead    - queueing delay (QUEUED..START)
 *   total_time      - QUEUED..END
 *   cpu_time        - host wall time since m_time not covered by the event
 * Requires the queue to have been created with CL_QUEUE_PROFILING_ENABLE.
 */
PerformanceAnalyser::TimelineEntry PerformanceAnalyser::analyzeEvent(cl_event &event) {
    // Wait for event information to be ready
    clWaitForEvents(1, &event);

    TimelineEntry entry;
    cl_ulong time;

    // Profiling counters are nanoseconds; convert each to seconds.
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &time, NULL);
    entry.start_time = (double) time / 1000000000.0;

    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &time, NULL);
    entry.end_time = (double) time / 1000000000.0;

    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &time, NULL);
    double exec_start = ((double) time / 1000000000.0);

    entry.execution_time = entry.end_time - exec_start;
    entry.api_overhead = exec_start - entry.start_time;
    entry.total_time = entry.end_time - entry.start_time;
    // Host time elapsed since m_time, minus what the event itself accounts for.
    entry.cpu_time = (getTime()-m_time)-entry.total_time;

    return entry;
}
/*
 * Read REC_N cl_ints back from the device buffer d_output into h_output and
 * print them.  Exits the process if the read cannot be enqueued.
 * Uses the file-scope globals: commands, d_output, h_output.
 */
void read_value(){
    int err;
    cl_event readevent;

    /* Blocking read (CL_TRUE): the data is in h_output when this returns. */
    err = clEnqueueReadBuffer(commands, d_output, CL_TRUE, 0, REC_N * sizeof(cl_int), h_output, 0, NULL, &readevent);
    if (err != CL_SUCCESS) {
        printf("Error: Failed to read output array! %d\n", err);
        printf("Test failed\n");
        exit(1);
    }
    clWaitForEvents(1, &readevent);
    /* BUG FIX: the completion event was previously leaked; release it. */
    clReleaseEvent(readevent);

    printf("\n[host] outputs:\n");
    for (int i = 0; i < REC_N; ++i) {
        printf("%d ", h_output[i]);
    }
    printf("\n");
}
/*
 * C = A * B for n x n row-major matrices of element type F, computed on the
 * device with clBLAS SGEMM.  The device buffers live in `cache`; A and B are
 * uploaded with blocking writes and C is fetched with a blocking read, so the
 * result is available in C when this returns.
 */
void mat_mul_cl_clblas(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    cl_event event;
    size_t mat_sizeof;

    mat_sizeof = n * n * sizeof(F);

    /* Blocking writes: the host arrays may be reused as soon as these return. */
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL);

    clblasSgemm(
        clblasRowMajor, clblasNoTrans, clblasNoTrans,
        n, n, n,
        1.0, cache->buf_a, 0, n,
        cache->buf_b, 0, n,
        0.0, cache->buf_c, 0, n,
        1, &(cache->common.command_queue),
        0, NULL, &event
    );
    clWaitForEvents(1, &event);
    /* BUG FIX: the SGEMM completion event was previously leaked. */
    clReleaseEvent(event);

    clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
}
/* Block until *ev has completed, then release it.
   Returns CL_SUCCESS, or the first OpenCL error encountered
   (which is also reported through mwPerrorCL). */
cl_int mwWaitReleaseEvent(cl_event* ev)
{
    cl_int rc;

    assert(ev);

    rc = clWaitForEvents(1, ev);
    if (rc != CL_SUCCESS)
    {
        mwPerrorCL(rc, "Failed to wait for event");
    }
    else
    {
        rc = clReleaseEvent(*ev);
        if (rc != CL_SUCCESS)
            mwPerrorCL(rc, "Failed to release event");
    }

    return rc;
}
/*!
    Copies the contents of this buffer, starting at \a offset to
    \a rect within \a dest.  Returns true if the copy was successful;
    false otherwise.

    This function will block until the request finishes.
    The request is executed on the active command queue for context().

    \sa copyToAsync()
*/
bool QCLBuffer::copyTo
    (size_t offset, const QCLImage2D &dest, const QRect &rect)
{
    // OpenCL expects the destination origin/region as size_t[3] triples;
    // the third component is fixed (0 origin, depth 1) for a 2D image.
    const size_t dst_origin[3] = {static_cast<size_t>(rect.x()),
                                  static_cast<size_t>(rect.y()), 0 };
    const size_t region[3] = {static_cast<size_t>(rect.width()),
                              static_cast<size_t>(rect.height()), 1 };
    cl_event event;
    cl_int error = clEnqueueCopyBufferToImage
        (context()->activeQueue(), memoryId(), dest.memoryId(),
         offset, dst_origin, region, 0, 0, &event);
    context()->reportError("QCLBuffer::copyTo(QCLImage2D):", error);
    if (error == CL_SUCCESS) {
        // Block until the copy completes, then drop our event reference.
        clWaitForEvents(1, &event);
        clReleaseEvent(event);
        return true;
    } else {
        return false;
    }
}
/*
 * Self-test for scanImpl: generate rLen random ints, prefix-scan them on the
 * device, and copy the result back.  Uses the project's double-buffered
 * event-list protocol (`eventList[2]` indexed by `index % 2`).
 */
void testScanImpl(int rLen)
{
	int _CPU_GPU=0;
	cl_event eventList[2];
	int index=0;
	cl_kernel Kernel;
	int CPU_GPU;
	double burden;
	int result=0;
	int memSize=sizeof(int)*rLen;
	int outSize=sizeof(int)*rLen;

	/* Host-side staging buffers. */
	void *Rin;
	HOST_MALLOC(Rin, memSize);
	generateRandInt((int*)Rin, rLen,rLen,0);
	void *Rout;
	HOST_MALLOC(Rout, outSize);

	/* Device-side input/output buffers. */
	cl_mem d_Rin;
	CL_MALLOC(&d_Rin, memSize);
	cl_mem d_Rout;
	CL_MALLOC(&d_Rout, outSize);

	cl_writebuffer(d_Rin, Rin, memSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);

	ScanPara *SP;
	SP=(ScanPara*)malloc(sizeof(ScanPara));
	initScan(rLen,SP);
	scanImpl(d_Rin,rLen,d_Rout,&index,eventList,&Kernel,&CPU_GPU,&burden,SP,_CPU_GPU);
	cl_readbuffer(Rout, d_Rout, outSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);

	/* Wait on the most recently recorded event before tearing down. */
	clWaitForEvents(1,&eventList[(index-1)%2]);
	closeScan(SP);
	deschedule(CPU_GPU,burden);
	//validateScan( (int*)Rin, rLen, (int*)Rout );

	HOST_FREE(Rin);
	HOST_FREE(Rout);
	CL_FREE(d_Rin);
	CL_FREE(d_Rout);
	clReleaseKernel(Kernel);
	clReleaseEvent(eventList[0]);
	clReleaseEvent(eventList[1]);
}
int acc_event_synchronize (void* event){ // debug info if (verbose_print){ fprintf(stdout, "\n ... EVENT SYNCHRONIZATION ... \n"); fprintf(stdout, " ---> Entering: acc_event_synchronize.\n"); } // local event and queue pointers cl_event *clevent = (cl_event *) event; // wait for an event ( !!! need to share the same ctx !!! ) cl_error = clWaitForEvents((cl_uint) 1, clevent); if (acc_opencl_error_check(cl_error, __LINE__)) return -1; // debug info if (verbose_print){ fprintf(stdout, " ---> Leaving: acc_event_synchronize.\n"); } // assign return value return 0; }
/*
 * Create kernel `name` from pgm, bind `argc` (size_t size, void* ptr) vararg
 * pairs as its arguments, and enqueue it over `range`.
 * With NO_CL_EVENTS defined the kernel is enqueued fire-and-forget; otherwise
 * the call blocks until the kernel completes.
 */
void pclu_call_kernel(pclu_program* pgm, const char* name, pclu_range range, size_t argc, ...)
{
    cl_int errcode;
    cl_kernel kern = clCreateKernel(pgm->program, name, &errcode);
    pclu_check_call("clCreateKernel", errcode);

    va_list ap;
    va_start(ap, argc);
    for (cl_uint ii = 0; ii < argc; ++ii) {
        size_t size = va_arg(ap, size_t);
        void*  arg  = va_arg(ap, void*);
        pclu_check_call("clSetKernelArg", clSetKernelArg(kern, ii, size, arg));
    }
    va_end(ap);

#define NO_CL_EVENTS 1
#ifdef NO_CL_EVENTS
    /* BUG FIX: previously a cl_event was still requested from
     * clEnqueueNDRangeKernel in this configuration and then leaked.
     * Pass NULL so no event object is created at all. */
    errcode = clEnqueueNDRangeKernel(pgm->pclu->queue, kern, range.nd, 0,
                                     range.global, 0, 0, 0, NULL);
    pclu_check_call("clEnqueueNDRangeKernel", errcode);
#else
    /* BUG FIX: the enqueue's out-parameter overwrites whatever handle is in
     * kernel_done, so creating a user event first only leaked it; just take
     * the enqueue-produced event, wait on it, and release it. */
    cl_event kernel_done = 0;
    errcode = clEnqueueNDRangeKernel(pgm->pclu->queue, kern, range.nd, 0,
                                     range.global, 0, 0, 0, &kernel_done);
    pclu_check_call("clEnqueueNDRangeKernel", errcode);

    pclu_check_call("clWaitForEvents", clWaitForEvents(1, &kernel_done));
    pclu_check_call("clReleaseEvent", clReleaseEvent(kernel_done));
#endif

    pclu_check_call("clReleaseKernel", clReleaseKernel(kern));
}
/*
 * Group the rLen records of h_Rin on the device.
 * Outputs: grouped records in h_Rout, and the start index of each group in
 * *h_startPos (allocated here, owned by the caller).
 * Returns the number of groups.  Uses the project's double-buffered event
 * list (`eventList[2]` indexed by `index % 2`) for transfer/kernel ordering.
 */
extern "C" int CL_GroupBy(Record * h_Rin, int rLen, Record* h_Rout, int** h_startPos, int numThread, int numBlock , int _CPU_GPU)
{
	cl_mem d_Rin;
	cl_mem d_Rout;
	cl_mem d_startPos;
	/////////////////////////////////////////////////////////////////////////////////////////////////////////////
	cl_event eventList[2];
	int index=0;
	cl_kernel Kernel;
	int CPU_GPU;
	double burden;
	int memSize = sizeof(Record)*rLen;

	/* Upload the input records. */
	CL_MALLOC( &d_Rin, memSize );
	CL_MALLOC(&d_Rout, memSize );
	cl_writebuffer( d_Rin, h_Rin, memSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);

	/* Run the group-by; d_startPos is allocated by groupByImpl. */
	int numGroup = 0;
	numGroup= groupByImpl(d_Rin, rLen, d_Rout, &d_startPos, numThread, numBlock,&index,eventList,&Kernel,&CPU_GPU,&burden,_CPU_GPU);

	/* Copy results back: one start offset per group plus the grouped rows. */
	(*h_startPos) = (int*)malloc( sizeof(int)*numGroup );
	cl_readbuffer( *h_startPos, d_startPos, sizeof(int)*numGroup,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
	cl_readbuffer( h_Rout, d_Rout, sizeof(Record)*rLen,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);

	/* Wait on the most recently recorded event before tearing down. */
	clWaitForEvents(1,&eventList[(index-1)%2]);
	deschedule(CPU_GPU,burden);

	CL_FREE( d_Rin );
	CL_FREE( d_Rout );
	CL_FREE( d_startPos );
	clReleaseKernel(Kernel);
	clReleaseEvent(eventList[0]);
	clReleaseEvent(eventList[1]);
	printf("CL_GroupBy\n");
	return numGroup;
}
void deathray::SingleFrameExecute() { cl_uint wait_list_length = 0; cl_event wait_list[3]; result status; if (temporal_radius_Y_ == 0 && h_Y_ > 0.f) { status = g_SingleFrame_Y.CopyTo(srcpY_); if (status != FILTER_OK) env_->ThrowError("Deathray: Copy Y to device status=%d and OpenCL status=%d", status, g_last_cl_error); } if (temporal_radius_UV_ == 0 && h_UV_ > 0.f) { status = g_SingleFrame_U.CopyTo(srcpU_); if (status != FILTER_OK) env_->ThrowError("Deathray: Copy U to device status=%d and OpenCL status=%d", status, g_last_cl_error); status = g_SingleFrame_V.CopyTo(srcpV_); if (status != FILTER_OK) env_->ThrowError("Deathray: Copy V to device status=%d and OpenCL status=%d", status, g_last_cl_error); } if (temporal_radius_Y_ == 0 && h_Y_ > 0.f) { status = g_SingleFrame_Y.Execute(); if (status != FILTER_OK) env_->ThrowError("Deathray: Execute Y kernel status=%d and OpenCL status=%d", status, g_last_cl_error); status = g_SingleFrame_Y.CopyFrom(dstpY_, wait_list); if (status != FILTER_OK) env_->ThrowError("Deathray: Copy Y to host status=%d and OpenCL status=%d", status, g_last_cl_error); ++wait_list_length; } if (temporal_radius_UV_ == 0 && h_UV_ > 0.f) { g_SingleFrame_U.Execute(); if (status != FILTER_OK) env_->ThrowError("Deathray: Execute U kernel status=%d and OpenCL status=%d", status, g_last_cl_error); g_SingleFrame_U.CopyFrom(dstpU_, wait_list + wait_list_length++); if (status != FILTER_OK) env_->ThrowError("Deathray: Copy U to host status=%d and OpenCL status=%d", status, g_last_cl_error); g_SingleFrame_V.Execute(); if (status != FILTER_OK) env_->ThrowError("Deathray: Execute V kernel status=%d and OpenCL status=%d", status, g_last_cl_error); g_SingleFrame_V.CopyFrom(dstpV_, wait_list + wait_list_length++); if (status != FILTER_OK) env_->ThrowError("Deathray: Copy V to host status=%d and OpenCL status=%d", status, g_last_cl_error); } clWaitForEvents(wait_list_length, wait_list); }
/*
 * Round-trip benchmark variant using CL_MEM_COPY_HOST_PTR: the timed interval
 * covers buffer creation (which copies A and B to the device), the TRSM under
 * test, and the blocking read of B back into host memory.
 */
void copyhostptr_roundtrip_func()
{
    timer.Start(timer_id);

    //set up buffer
    // CL_MEM_COPY_HOST_PTR makes clCreateBuffer itself perform the host->device copy.
    cl_int err;
    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                    (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T),
                                    buffer_.a_, &err);
    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                    (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T),
                                    buffer_.b_, &err);

    //call func
    xTrsm_Function(false);

    //read gpu buffer
    // Blocking read of B (the in/out operand of TRSM), skipping the offB_ prefix.
    err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
                              buffer_.offB_ * sizeof(T),
                              buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(T),
                              buffer_.b_, 0, NULL, &event_);
    clWaitForEvents(1, &event_);

    timer.Stop(timer_id);
}
/*
 * Append one candidate to e's host-side staging buffer; when the buffer is
 * full, hand it to the device for scoring.
 * The buffer is reused across batches: before overwriting slot 0 we must wait
 * for the previous asynchronous send (e->clEventSent) to finish, optionally
 * accumulating its duration into the device's profiling counters.
 */
void write_to_buffer(eObj* e, cObj cCandidate) {
    Tempest::data.lNumPSMs += 1;

    // Starting a fresh batch: ensure the previous async transfer out of this
    // buffer has completed before its slots are recycled.
    if (e->iNumBufferedCandidates == 0) {
        clWaitForEvents(1, &(e->clEventSent));
        if (Tempest::config.profile) {
            cl_ulong start;
            cl_ulong end;
            int err;
            err = clGetEventProfilingInfo(e->clEventSent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
            err |= clGetEventProfilingInfo(e->clEventSent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
            // Only add the sample when both profiling queries succeeded.
            if (err == 0)
                e->device->totalSendTime += (end-start);
        }
        clReleaseEvent(e->clEventSent);
    }

    e->candidateBuffer[e->iNumBufferedCandidates] = cCandidate;
    //memcpy(e->candidateBuffer+e->iNumBufferedCandidates, &cCandidate, sizeof(cObj));
    e->iNumCandidates++;
    e->iNumBufferedCandidates++;

    // Flush a full buffer to the device for scoring.
    if (e->iNumBufferedCandidates == e->candidateBufferSize) {
        //printf("%d\t%d\n", gpu_info.iNumScoringKernels, iBin);
        e->device->scoreCandidates(e);
    }
}
/*
 * Device-side DSYRK: C = alpha * op(A) * op(A)^T + beta * C (column-major).
 * Uploads A (and C when beta != 0), runs clblasDsyrk chained on the upload
 * events, reads C back, and waits for the read to finish.
 * Returns CL_SUCCESS, or propagates the first error via CHECK().
 */
cl_int Dsyrk_internal(
  cl_env *env, double *a, double *c, double alpha, double beta,
  clblasTranspose transA, clblasUplo uplo, int ar, int ac, int n, int size_a, int size_c)
{
    CHECK(clblasSetup());
    cl_event events[NEVENTS];
    int nevent = 0;

    /* Upload A; upload C only when beta != 0 (otherwise its prior contents
       do not matter and the buffer is left uninitialized). */
    cl_mem mem_a = create_mem(env, a, size_a, CL_MEM_READ_ONLY, &(events[nevent++]));
    cl_mem mem_c;
    if (beta != 0)
        mem_c = create_mem(env, c, size_c, CL_MEM_READ_WRITE, &(events[nevent++]));
    else
        mem_c = create_mem(env, NULL, size_c, CL_MEM_READ_WRITE, NULL);

    /* The inner dimension depends on whether A is transposed. */
    int k = transA == clblasNoTrans ? ar : ac;
    cl_int err = clblasDsyrk(clblasColumnMajor, uplo, transA, n, k, alpha,
        mem_a, 0, ac, beta, mem_c, 0, n, 1, &(env->queues[0]),
        nevent, events, &(events[nevent]));
    CHECK(err);

    /* Read C back once the SYRK event has completed. */
    events[nevent+1] = *read_mem(env, mem_c, c, size_c, 1, &(events[nevent]));
    CHECK(clWaitForEvents(1, &(events[nevent+1])));

    /* BUG FIX: the upload, SYRK, and read-back events were all leaked;
       release every event recorded above (indices 0 .. nevent+1). */
    for (int i = 0; i < nevent + 2; i++)
        CHECK(clReleaseEvent(events[i]));

    CHECK(clReleaseMemObject(mem_a));
    CHECK(clReleaseMemObject(mem_c));
    clblasTeardown();
    return CL_SUCCESS;
}
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); printResult(); /* Call clblas function. */ err = clblasSrot(N, bufX, 0, incx, bufY, 0, incy, C, S, 1, &queue, 0, NULL, &event); // printf("here\n"); if (err != CL_SUCCESS) { printf("clblasSrot() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. 
*/ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); /* At this point you will get the result of SROT placed in vector Y. */ printResult(); } //printf("here\n"); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
/*
 * groupBy: execute a GROUP BY with per-group aggregation on the GPU.
 * Pipeline: upload the input columns into one packed buffer, hash the
 * group-by key columns to count distinct groups, prefix-scan the hash
 * histogram into per-group output positions, then run the aggregation kernel
 * ("agg_cal", or "agg_cal_cons" when grouping by a constant) and split the
 * packed result buffer into per-column device buffers.
 * PCIe and kernel times are accumulated into `pp` when OPENCL_PROFILE is set.
 * Returns a newly allocated tableNode whose content lives on the GPU.
 */
struct tableNode * groupBy(struct groupByNode * gb, struct clContext * context, struct statistic * pp){

    struct timespec start,end;
    clock_gettime(CLOCK_REALTIME,&start);

    cl_event ndrEvt;
    cl_ulong startTime,endTime;

    struct tableNode * res = NULL;
    long gpuTupleNum;
    int gpuGbColNum;
    cl_mem gpuGbIndex;
    cl_mem gpuGbType, gpuGbSize;

    cl_mem gpuGbKey;
    cl_mem gpuContent;

    int gbCount;                 // the number of groups
    int gbConstant = 0;          // whether group by constant

    cl_int error = 0;

    /* Allocate and initialize the result table descriptor. */
    res = (struct tableNode *) malloc(sizeof(struct tableNode));
    CHECK_POINTER(res);
    res->tupleSize = gb->tupleSize;
    res->totalAttr = gb->outputAttrNum;
    res->attrType = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->attrType);
    res->attrSize = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->attrSize);
    res->attrTotalSize = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->attrTotalSize);
    res->dataPos = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->dataPos);
    res->dataFormat = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->dataFormat);
    res->content = (char **) malloc(sizeof(char **) * res->totalAttr);
    CHECK_POINTER(res->content);

    for(int i=0;i<res->totalAttr;i++){
        res->attrType[i] = gb->attrType[i];
        res->attrSize[i] = gb->attrSize[i];
        res->dataFormat[i] = UNCOMPRESSED;
    }

    gpuTupleNum = gb->table->tupleNum;
    gpuGbColNum = gb->groupByColNum;

    /* A single group-by column with index -1 means "group by a constant":
       the whole input collapses into one group. */
    if(gpuGbColNum == 1 && gb->groupByIndex[0] == -1){
        gbConstant = 1;
    }

    size_t localSize = 128;
    size_t globalSize = 1024*128;
    int blockNum = gb->table->tupleNum / localSize + 1;
    if(blockNum < 1024)
        globalSize = blockNum * 128;

    cl_mem gpu_hashNum;
    cl_mem gpu_psum;
    cl_mem gpuGbCount;

    /* Compute a packed, per-column 4-byte-aligned layout for the input. */
    long * cpuOffset = (long *)malloc(sizeof(long) * gb->table->totalAttr);
    CHECK_POINTER(cpuOffset);
    long offset = 0;
    long totalSize = 0;

    for(int i=0;i<gb->table->totalAttr;i++){
        int attrSize = gb->table->attrSize[i];
        int size = attrSize * gb->table->tupleNum;
        cpuOffset[i] = offset;

        /*align each column*/
        if(size % 4 !=0){
            size += 4 - (size%4);
        }

        offset += size;
        totalSize += size;
    }

    /* Upload every input column into one packed device buffer; columns already
       on the GPU are copied device-to-device instead. */
    gpuContent = clCreateBuffer(context->context,CL_MEM_READ_ONLY, totalSize,NULL,&error);

    for(int i=0;i<gb->table->totalAttr;i++){
        int attrSize = gb->table->attrSize[i];
        int size = attrSize * gb->table->tupleNum;
        if(gb->table->dataPos[i]==MEM){
            error = clEnqueueWriteBuffer(context->queue, gpuContent, CL_TRUE, cpuOffset[i], size, gb->table->content[i],0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
            clWaitForEvents(1, &ndrEvt);
            clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
            clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
            pp->pcie += 1e-6 * (endTime - startTime);
#endif
        }else
            error = clEnqueueCopyBuffer(context->queue,(cl_mem)gb->table->content[i],gpuContent,0, cpuOffset[i],size,0,0,0);
    }

    /* Upload the per-column offsets so kernels can locate each column. */
    cl_mem gpuOffset = clCreateBuffer(context->context,CL_MEM_READ_ONLY, sizeof(long)*gb->table->totalAttr,NULL,&error);
    clEnqueueWriteBuffer(context->queue,gpuOffset,CL_TRUE,0,sizeof(long)*gb->table->totalAttr,cpuOffset,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    /* Non-constant grouping: build a hash key per tuple and count groups. */
    if(gbConstant != 1){
        /* Types/sizes/indices of the group-by columns. */
        gpuGbType = clCreateBuffer(context->context,CL_MEM_READ_ONLY,sizeof(int)*gb->groupByColNum,NULL,&error);
        clEnqueueWriteBuffer(context->queue,gpuGbType,CL_TRUE,0,sizeof(int)*gb->groupByColNum,gb->groupByType,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        gpuGbSize = clCreateBuffer(context->context,CL_MEM_READ_ONLY,sizeof(int)*gb->groupByColNum,NULL,&error);
        clEnqueueWriteBuffer(context->queue,gpuGbSize,CL_TRUE,0,sizeof(int)*gb->groupByColNum,gb->groupBySize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        /* One hash key per input tuple. */
        gpuGbKey = clCreateBuffer(context->context,CL_MEM_READ_WRITE,sizeof(int)*gb->table->tupleNum,NULL,&error);

        gpuGbIndex = clCreateBuffer(context->context,CL_MEM_READ_ONLY, sizeof(int)*gb->groupByColNum,NULL,&error);
        clEnqueueWriteBuffer(context->queue,gpuGbIndex,CL_TRUE,0,sizeof(int)*gb->groupByColNum,gb->groupByIndex,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        /* Zero the hash-bucket histogram with the memset kernel. */
        gpu_hashNum = clCreateBuffer(context->context,CL_MEM_READ_WRITE, sizeof(int)*HSIZE,NULL,&error);

        context->kernel = clCreateKernel(context->program,"cl_memset_int",0);
        int tmp = HSIZE;
        clSetKernelArg(context->kernel,0,sizeof(cl_mem), (void*)&gpu_hashNum);
        clSetKernelArg(context->kernel,1,sizeof(int), (void*)&tmp);
        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif

        /* Hash the group-by columns into gpuGbKey and bump bucket counters. */
        context->kernel = clCreateKernel(context->program, "build_groupby_key",0);
        clSetKernelArg(context->kernel,0,sizeof(cl_mem),(void *)&gpuContent);
        clSetKernelArg(context->kernel,1,sizeof(cl_mem),(void *)&gpuOffset);
        clSetKernelArg(context->kernel,2,sizeof(int),(void *)&gpuGbColNum);
        clSetKernelArg(context->kernel,3,sizeof(cl_mem),(void *)&gpuGbIndex);
        clSetKernelArg(context->kernel,4,sizeof(cl_mem),(void *)&gpuGbType);
        clSetKernelArg(context->kernel,5,sizeof(cl_mem),(void *)&gpuGbSize);
        clSetKernelArg(context->kernel,6,sizeof(long),(void *)&gpuTupleNum);
        clSetKernelArg(context->kernel,7,sizeof(cl_mem),(void *)&gpuGbKey);
        clSetKernelArg(context->kernel,8,sizeof(cl_mem),(void *)&gpu_hashNum);
        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif

        clReleaseMemObject(gpuGbType);
        clReleaseMemObject(gpuGbSize);
        clReleaseMemObject(gpuGbIndex);

        /* Count the number of non-empty hash buckets (= number of groups). */
        gbCount = 1;
        tmp = 0;

        gpuGbCount = clCreateBuffer(context->context,CL_MEM_READ_WRITE, sizeof(int),NULL,&error);
        clEnqueueWriteBuffer(context->queue,gpuGbCount,CL_TRUE,0,sizeof(int),&tmp,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        int hsize = HSIZE;
        context->kernel = clCreateKernel(context->program, "count_group_num",0);
        clSetKernelArg(context->kernel,0,sizeof(cl_mem),(void *)&gpu_hashNum);
        clSetKernelArg(context->kernel,1,sizeof(int),(void *)&hsize);
        clSetKernelArg(context->kernel,2,sizeof(cl_mem),(void *)&gpuGbCount);
        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif

        clEnqueueReadBuffer(context->queue, gpuGbCount, CL_TRUE, 0, sizeof(int), &gbCount,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        /* Prefix-scan the histogram into per-bucket output positions. */
        gpu_psum = clCreateBuffer(context->context,CL_MEM_READ_WRITE, sizeof(int)*HSIZE,NULL,&error);
        scanImpl(gpu_hashNum,HSIZE,gpu_psum,context,pp);

        clReleaseMemObject(gpuGbCount);
        clReleaseMemObject(gpu_hashNum);
    }

    if(gbConstant == 1)
        res->tupleNum = 1;
    else
        res->tupleNum = gbCount;

    printf("groupBy num %ld\n",res->tupleNum);

    /* Upload the output-column types/sizes for the aggregation kernel. */
    gpuGbType = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*res->totalAttr, NULL, &error);
    clEnqueueWriteBuffer(context->queue,gpuGbType,CL_TRUE,0,sizeof(int)*res->totalAttr,res->attrType,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    gpuGbSize = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*res->totalAttr, NULL, &error);
    clEnqueueWriteBuffer(context->queue,gpuGbSize,CL_TRUE,0,sizeof(int)*res->totalAttr,res->attrSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    /*
     * @gpuGbExp is the mathExp in each groupBy expression
     * @mathexp stores the math exp for the group expression that has two operands
     * The reason that we need two variables instead of one is that OpenCL doesn't support pointer to pointer
     * */
    cl_mem gpuGbExp = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(struct mathExp)*res->totalAttr, NULL, &error);
    cl_mem mathexp = clCreateBuffer(context->context, CL_MEM_READ_ONLY, 2*sizeof(struct mathExp)*res->totalAttr, NULL, &error);

    struct mathExp tmpExp[2];
    int * cpuFunc = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(cpuFunc);

    offset = 0;
    for(int i=0;i<res->totalAttr;i++){
        /* Top-level expression of output column i. */
        error = clEnqueueWriteBuffer(context->queue, gpuGbExp, CL_TRUE, offset, sizeof(struct mathExp), &(gb->gbExp[i].exp),0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif
        offset += sizeof(struct mathExp);

        cpuFunc[i] = gb->gbExp[i].func;

        /* Binary expressions: flatten the two operand sub-expressions into
           the side buffer (slots 2*i and 2*i+1). */
        if(gb->gbExp[i].exp.opNum == 2){
            struct mathExp * tmpMath = (struct mathExp *) (gb->gbExp[i].exp.exp);
            tmpExp[0].op = tmpMath[0].op;
            tmpExp[0].opNum = tmpMath[0].opNum;
            tmpExp[0].opType = tmpMath[0].opType;
            tmpExp[0].opValue = tmpMath[0].opValue;

            tmpExp[1].op = tmpMath[1].op;
            tmpExp[1].opNum = tmpMath[1].opNum;
            tmpExp[1].opType = tmpMath[1].opType;
            tmpExp[1].opValue = tmpMath[1].opValue;
            clEnqueueWriteBuffer(context->queue, mathexp, CL_TRUE, 2*i*sizeof(struct mathExp),2*sizeof(struct mathExp),tmpExp,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
            clWaitForEvents(1, &ndrEvt);
            clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
            clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
            pp->pcie += 1e-6 * (endTime - startTime);
#endif
        }
    }

    /* Aggregation function id per output column (SUM/AVG/...). */
    cl_mem gpuFunc = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*res->totalAttr, NULL, &error);
    clEnqueueWriteBuffer(context->queue,gpuFunc,CL_TRUE,0,sizeof(int)*res->totalAttr,cpuFunc,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    /* Packed, 4-byte-aligned layout for the output columns. */
    long *resOffset = (long *)malloc(sizeof(long)*res->totalAttr);
    CHECK_POINTER(resOffset);

    offset = 0;
    totalSize = 0;
    for(int i=0;i<res->totalAttr;i++){
        /*
         * align the output of each column on the boundary of 4
         */
        int size = res->attrSize[i] * res->tupleNum;
        if(size %4 != 0){
            size += 4- (size %4);
        }
        resOffset[i] = offset;
        offset += size;
        totalSize += size;
    }

    cl_mem gpuResult = clCreateBuffer(context->context,CL_MEM_READ_WRITE, totalSize, NULL, &error);
    cl_mem gpuResOffset = clCreateBuffer(context->context, CL_MEM_READ_ONLY,sizeof(long)*res->totalAttr, NULL,&error);
    clEnqueueWriteBuffer(context->queue,gpuResOffset,CL_TRUE,0,sizeof(long)*res->totalAttr,resOffset,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    gpuGbColNum = res->totalAttr;

    /* Run the aggregation: scatter via the prefix-sum for real grouping, or
       the single-group constant variant. */
    if(gbConstant !=1){
        context->kernel = clCreateKernel(context->program,"agg_cal",0);
        clSetKernelArg(context->kernel,0,sizeof(cl_mem), (void*)&gpuContent);
        clSetKernelArg(context->kernel,1,sizeof(cl_mem), (void*)&gpuOffset);
        clSetKernelArg(context->kernel,2,sizeof(int), (void*)&gpuGbColNum);
        clSetKernelArg(context->kernel,3,sizeof(cl_mem), (void*)&gpuGbExp);
        clSetKernelArg(context->kernel,4,sizeof(cl_mem), (void*)&mathexp);
        clSetKernelArg(context->kernel,5,sizeof(cl_mem), (void*)&gpuGbType);
        clSetKernelArg(context->kernel,6,sizeof(cl_mem), (void*)&gpuGbSize);
        clSetKernelArg(context->kernel,7,sizeof(long), (void*)&gpuTupleNum);
        clSetKernelArg(context->kernel,8,sizeof(cl_mem), (void*)&gpuGbKey);
        clSetKernelArg(context->kernel,9,sizeof(cl_mem), (void*)&gpu_psum);
        clSetKernelArg(context->kernel,10,sizeof(cl_mem), (void*)&gpuResult);
        clSetKernelArg(context->kernel,11,sizeof(cl_mem), (void*)&gpuResOffset);
        clSetKernelArg(context->kernel,12,sizeof(cl_mem), (void*)&gpuFunc);
        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif
        clReleaseMemObject(gpuGbKey);
        clReleaseMemObject(gpu_psum);
    }else{
        context->kernel = clCreateKernel(context->program,"agg_cal_cons",0);
        clSetKernelArg(context->kernel,0,sizeof(cl_mem), (void*)&gpuContent);
        clSetKernelArg(context->kernel,1,sizeof(cl_mem), (void*)&gpuOffset);
        clSetKernelArg(context->kernel,2,sizeof(int), (void*)&gpuGbColNum);
        clSetKernelArg(context->kernel,3,sizeof(cl_mem), (void*)&gpuGbExp);
        clSetKernelArg(context->kernel,4,sizeof(cl_mem), (void*)&mathexp);
        clSetKernelArg(context->kernel,5,sizeof(cl_mem), (void*)&gpuGbType);
        clSetKernelArg(context->kernel,6,sizeof(cl_mem), (void*)&gpuGbSize);
        clSetKernelArg(context->kernel,7,sizeof(long), (void*)&gpuTupleNum);
        clSetKernelArg(context->kernel,8,sizeof(cl_mem), (void*)&gpuResult);
        clSetKernelArg(context->kernel,9,sizeof(cl_mem), (void*)&gpuResOffset);
        clSetKernelArg(context->kernel,10,sizeof(cl_mem), (void*)&gpuFunc);

        /* Single group: a small fixed launch is enough. */
        globalSize = localSize * 4;
        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif
    }

    /* Split the packed result buffer into one device buffer per column;
       res->content[] stores cl_mem handles cast to char* (dataPos = GPU). */
    for(int i=0; i<res->totalAttr;i++){
        res->content[i] = (char *)clCreateBuffer(context->context,CL_MEM_READ_WRITE, res->attrSize[i]*res->tupleNum, NULL, &error);
        res->dataPos[i] = GPU;
        res->attrTotalSize[i] = res->tupleNum * res->attrSize[i];
        clEnqueueCopyBuffer(context->queue, gpuResult, (cl_mem)res->content[i], resOffset[i],0, res->attrSize[i] * res->tupleNum, 0,0,0);
    }

    free(resOffset);
    free(cpuOffset);

    clFinish(context->queue);
    clReleaseMemObject(gpuContent);
    clReleaseMemObject(gpuResult);
    clReleaseMemObject(gpuOffset);
    clReleaseMemObject(gpuResOffset);
    clReleaseMemObject(gpuGbExp);
    clReleaseMemObject(gpuFunc);

    clock_gettime(CLOCK_REALTIME,&end);
    double timeE = (end.tv_sec - start.tv_sec)* BILLION + end.tv_nsec - start.tv_nsec;
    printf("GroupBy Time: %lf\n", timeE/(1000*1000));

    return res;
}
int main(int argc, char **argv) { cl_int status; const char *platform_name = "NVIDIA"; if (!find_platform(platform_name, &platform)) { fprintf(stderr,"Error: Platform \"%s\" not found\n", platform_name); print_platforms(); teardown(-1); } status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); checkError (status, "Error: could not query devices"); context = clCreateContext(NULL, 1, &device, NULL, NULL, &status); checkError(status, "could not create context"); const char name[] = KERNELDIR "/reduce.cl"; unsigned char *source; size_t size; if (!load_file(name, &source, &size)) { teardown(-1); } program = clCreateProgramWithSource(context, 1, (const char **) &source, &size, &status); checkError(status, "Error: failed to create program %s: ", name); status = clBuildProgram(program, 1, &device, "-I.", NULL, NULL); if (status != CL_SUCCESS) { print_build_log(program, device); checkError(status, "Error: failed to create build %s: ", name); } free(source); print_device_info(device, 0); queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status); checkError(status, "could not create command queue"); cl_ulong start, end; cl_event event; size_t width = 1024+1024; size_t buf_size = width*sizeof(cl_float); kernel = clCreateKernel(program, "reduce", &status); checkError(status, "could not create kernel"); size_t work_size = width; size_t local_size = 64; size_t local_buf_size = local_size * sizeof(cl_float); size_t groups = width / local_size; size_t res_buf_size = groups * sizeof(cl_float); float *data_in = malloc(buf_size); float *data_out = malloc(res_buf_size); if (!data_in || !data_out) { fprintf(stderr,"\nError: malloc failed\n"); teardown(-1); } for (unsigned int i = 0; i < width; ++i) { data_in[i] = (float) (i % 16); } buffer_in = clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &status); checkError(status, "Error: could not create buffer_in"); buffer_out = clCreateBuffer(context, CL_MEM_READ_WRITE, res_buf_size, NULL, 
&status); checkError(status, "Error: could not create buffer_out"); status = clEnqueueWriteBuffer(queue, buffer_in, CL_FALSE, 0, buf_size, data_in, 0, NULL, NULL); checkError(status, "Error: could not copy data into device"); // execute kernel int arg = 0; status = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &buffer_in); status = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &buffer_out); status = clSetKernelArg(kernel, arg++, local_buf_size, NULL); status = clSetKernelArg(kernel, arg++, sizeof(cl_int), &width); checkError(status, "Error: could not set args"); status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_size, &local_size, 0, NULL, &event); checkError(status, "Error: could not enqueue kernel"); status = clWaitForEvents(1, &event); checkError(status, "Error: could not wait for event"); status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); checkError(status, "Error: could not get start profile information"); status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); checkError(status, "Error: could not get end profile information"); status = clReleaseEvent(event); checkError(status, "Error: could not release event"); // read results back status = clEnqueueReadBuffer(queue, buffer_out, CL_TRUE, 0, res_buf_size, data_out, 0, NULL, NULL); checkError(status, "Error: could not copy data into device"); status = clFinish(queue); checkError(status, "Error: could not finish successfully"); float clsum = 0; for (unsigned int i = 0; i < groups; ++i) { clsum += data_out[i]; } #ifdef DEBUG for (int i = 0; i < groups; ++i) { printf("%.0f ", data_out[i]); } #endif double elapsed = (end - start) * 1e-9f; printf("time: %f\n", elapsed); float sum = 0; for (unsigned int i = 0; i < width; ++i) { sum += data_in[i]; } if (sum != clsum) fprintf(stderr, "Compare failed: %f != %f\n", clsum, sum); free(data_in); free(data_out); teardown(0); }
// Runs one simulation step on the device: integrate all bodies, promote the
// freshly computed positions/velocities to the "current" buffers, then read
// the current positions back to host memory for display/verification.
// Returns SDK_SUCCESS or SDK_FAILURE.
int NBody::runCLKernels()
{
    cl_int status;
    cl_event events[1];

    /*
     * Enqueue a kernel run call.
     * One work-item per body; the work-group size comes from 'groupSize'.
     */
    size_t globalThreads[] = {numBodies};
    size_t localThreads[] = {groupSize};

    // Reject work-group sizes the device cannot execute.
    // NOTE(review): the two adjacent string literals below concatenate with no
    // space ("...Devicedoes not support...") -- message typo, behavior unchanged.
    if(localThreads[0] > maxWorkItemSizes[0] ||
       localThreads[0] > maxWorkGroupSize)
    {
        std::cout << "Unsupported: Device"
            "does not support requested number of work items.";
        return SDK_FAILURE;
    }

    status = clEnqueueNDRangeKernel(
                 commandQueue,
                 kernel,
                 1,
                 NULL,
                 globalThreads,
                 localThreads,
                 0,
                 NULL,
                 NULL);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clEnqueueNDRangeKernel failed."))
    {
        return SDK_FAILURE;
    }

    // Ensure the integration kernel is done before the copies below.
    status = clFinish(commandQueue);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clFinish failed."))
    {
        return SDK_FAILURE;
    }

    /* Copy data from new to old */
    // The kernel wrote the updated state into newPos/newVel; make it the
    // "current" state for the next iteration.
    status = clEnqueueCopyBuffer(commandQueue,
                                 newPos,
                                 currPos,
                                 0,
                                 0,
                                 sizeof(cl_float4) * numBodies,
                                 0,
                                 0,
                                 0);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clEnqueueCopyBuffer failed.(newPos->oldPos)"))
    {
        return SDK_FAILURE;
    }

    status = clEnqueueCopyBuffer(commandQueue,
                                 newVel,
                                 currVel,
                                 0,
                                 0,
                                 sizeof(cl_float4) * numBodies,
                                 0,
                                 0,
                                 0);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clEnqueueCopyBuffer failed.(newVel->oldVels)"))
    {
        return SDK_FAILURE;
    }

    // Make sure the copies have completed before reading positions back.
    status = clFinish(commandQueue);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clFinish failed."))
    {
        return SDK_FAILURE;
    }

    /* Enqueue readBuffer*/
    // Blocking read of the current positions into host array 'pos'.
    status = clEnqueueReadBuffer(
                 commandQueue,
                 currPos,
                 CL_TRUE,
                 0,
                 numBodies* sizeof(cl_float4),
                 pos,
                 0,
                 NULL,
                 &events[0]);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clEnqueueReadBuffer failed."))
        return SDK_FAILURE;

    /* Wait for the read buffer to finish execution */
    status = clWaitForEvents(1, &events[0]);
    if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clWaitForEvents failed."))
        return SDK_FAILURE;

    clReleaseEvent(events[0]);

    return SDK_SUCCESS;
}
// Computes the cross-product t(X) %*% X of an nrow x ncol column-major matrix
// on the device via clBLAS SYRK. Only one triangle is computed by SYRK; the
// host-side symmetrizeSquare_* call mirrors it afterwards.
//
// inMatrix/outMatrix are interpreted as float or double arrays depending on
// use_float. Returns an ErrorStatus carrying both the OpenCL error code and
// the clBLAS status.
ErrorStatus crossprod_clblas(cl_device_id device, void *inMatrix, void *outMatrix,
                             int nrow, int ncol, bool use_float)
{
    std::stringstream result;

    float *input_matrix_f = (float *)inMatrix;
    float *output_matrix_f = (float *)outMatrix;
    double *input_matrix_d = (double *)inMatrix;
    double *output_matrix_d = (double *)outMatrix;

    if (debug) {
        result << "crossprod_clblas( " << (use_float ? "FLOAT" : "DOUBLE")
               << ", nrow = " << nrow << ", ncol = " << ncol << ")"
               << std::endl << std::endl;
    }

    cl_int err = CL_SUCCESS;
    clblasStatus status = clblasSetup();
    if (status != CL_SUCCESS) {
        if (debug) {
            result << "clblasSetup: " << clblasErrorToString(status) << std::endl;
        }
        err = CL_INVALID_OPERATION;
    }

    // get first platform
    cl_platform_id platform = NULL;
    if (err == CL_SUCCESS) {
        err = clGetPlatformIDs(1, &platform, NULL);
    }
    if (debug && err == CL_SUCCESS) {
        result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl;
        result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl;
    }

    // context
    cl_context context = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateContext:" << std::endl;
        }
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    }

    // queue
    cl_command_queue queue = NULL;
    if (err == CL_SUCCESS) {
#ifdef CL_VERSION_2_0
        if (debug) {
            result << "clCreateCommandQueueWithProperties:" << std::endl;
        }
        queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
#else
        if (debug) {
            result << "clCreateCommandQueue:" << std::endl;
        }
        queue = clCreateCommandQueue(context, device, 0, &err);
#endif
    }

    // buffers: input is copied from host memory at creation time
    cl_mem cl_input_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrix:" << std::endl;
        }
        if (use_float) {
            cl_input_matrix = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                             nrow * ncol * sizeof(float),
                                             input_matrix_f, &err);
        } else {
            cl_input_matrix = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                             nrow * ncol * sizeof(double),
                                             input_matrix_d, &err);
        }
    }

    cl_mem cl_output_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_output_vector:" << std::endl;
        }
        if (use_float) {
            cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                              ncol * ncol * sizeof(float),
                                              output_matrix_f, &err);
        } else {
            cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                              ncol * ncol * sizeof(double),
                                              output_matrix_d, &err);
        }
    }

    // C := alpha * t(A) %*% A + beta * C, upper triangle only.
    const clblasOrder order = clblasColumnMajor;
    const clblasTranspose transA = clblasTrans;
    const size_t lda = nrow;
    const size_t ldc = ncol;
    const cl_float alpha = 1.0;
    clblasUplo uplo = clblasUpper;
    cl_event event = NULL;

    if (err == CL_SUCCESS) {
        if (use_float) {
            if (debug) {
                result << "clblasSsyrk:" << std::endl;
            }
            status = clblasSsyrk(order, uplo, transA, ncol, nrow, alpha,
                                 cl_input_matrix, 0, lda, 0.0,
                                 cl_output_matrix, 0, ldc,
                                 1, &queue, 0, NULL, &event);
            if (status != CL_SUCCESS) {
                // BUG FIX: a failed Ssyrk was previously only logged (and only
                // when debug was on) but never propagated, so the caller saw
                // CL_SUCCESS after a failed computation.  Also the message
                // wrongly said "clblasSgemm".
                if (debug) {
                    result << "clblasSsyrk error:" << clblasErrorToString(status) << std::endl;
                }
                err = (cl_int)status;
            }
        } else {
            if (debug) {
                result << "clblasDsyrk:" << std::endl;
            }
            status = clblasDsyrk(order, uplo, transA, ncol, nrow, alpha,
                                 cl_input_matrix, 0, lda, 0.0,
                                 cl_output_matrix, 0, ldc,
                                 1, &queue, 0, NULL, &event);
            if (status != CL_SUCCESS) {
                // Message fixed: this branch runs Dsyrk, not Dgemm.
                if (debug) {
                    result << "clblasDsyrk error:" << clblasErrorToString(status) << std::endl;
                }
                err = (cl_int)status;
            }
        }
    }

    if (err == CL_SUCCESS) {
        /* Wait for calculations to be finished. */
        if (debug) {
            result << "clWaitForEvents:" << std::endl;
        }
        err = clWaitForEvents(1, &event);
    }

    // retrieve result (blocking read), then mirror the computed triangle
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "Retrieve result:" << std::endl;
        }
        if (use_float) {
            clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0,
                                ncol * ncol * sizeof(float),
                                output_matrix_f, 0, NULL, NULL);
            symmetrizeSquare_f(output_matrix_f, ncol);
        } else {
            clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0,
                                ncol * ncol * sizeof(double),
                                output_matrix_d, 0, NULL, NULL);
            symmetrizeSquare_d(output_matrix_d, ncol);
        }
    }

    std::string err_str = clErrorToString(err);
    result << std::endl << err_str << std::endl;

    // cleanup
    // BUG FIX: the SYRK event was previously leaked.
    if (event != NULL) {
        clReleaseEvent(event);
        event = NULL;
    }
    clReleaseMemObject(cl_output_matrix);
    cl_output_matrix = NULL;
    clReleaseMemObject(cl_input_matrix);
    cl_input_matrix = NULL;
    clReleaseCommandQueue(queue);
    queue = NULL;
    clReleaseContext(context);
    context = NULL;

    if (debug) {
        CERR << result.str();
    }

    ErrorStatus errorStatus = { err, status };
    return errorStatus;
}
void reduceFirstPass( cl_mem d_Rin, int rLen, int numThread, int numMaxBlock, int OPERATOR,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,tempResult *tR, int _CPU_GPU) { int* info = (int*)malloc( sizeof(int)*2 ); //get the information of partition //return bool: if is multiple of maxNumThread //if yes, info[0]: number of blocks, info[1] = maxNumThread //if no, info[0]: number of blocks except of the last block, info[1]: number of thread in the last block bool isMul = howPartition( rLen, numThread, info ); //scan the isP2 blocks unsigned int numBlock = info[0]; unsigned int numElementsPerBlock = 0; unsigned int extraSpace = 0; unsigned int sharedMemSize = 0; cl_mem d_temp; //for coalsed CL_MALLOC( &d_temp, sizeof(int)*rLen ); cl_mem t_temp=NULL;//!!!!!!!!!! CL_MALLOC( &t_temp, sizeof(int)*rLen ); unsigned int base = 0; unsigned int offset = 0; cl_mem d_data; if( numBlock > 0 ) { int numChunk = ceil( (float)numBlock/numMaxBlock ); for( int chunkIdx = 0; chunkIdx < numChunk; chunkIdx++ ) { base = chunkIdx*numElementsPerBlock*numMaxBlock; offset = chunkIdx*numMaxBlock; int subNumBlock = (chunkIdx == (numChunk - 1))?( numBlock - chunkIdx*numMaxBlock ):(numMaxBlock); numElementsPerBlock = numThread*2; extraSpace = numElementsPerBlock/NUM_BANKS; sharedMemSize = sizeof(int)*( numElementsPerBlock + extraSpace ); perscanFirstPass_kernel_int(t_temp, d_temp, tR->d_scanBlockSums[0], d_Rin, numElementsPerBlock, true, base, offset, OPERATOR,subNumBlock, numThread, sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU ); clWaitForEvents(1,&eventList[(*index-1)%2]); } } clWaitForEvents(1,&eventList[(*index-1)%2]); //scan the single not isP2 block if( (!isMul) || (numBlock == 0) ) { base = numElementsPerBlock*info[0]; offset = info[0]; unsigned int remainer = rLen - numElementsPerBlock*info[0]; numThread = info[1];//update the numThread //if only one elements if( numThread == 0 ) { 
copyLastElement_kernel_int(tR->d_scanBlockSums[0], d_Rin, base, offset,1, 1,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); clWaitForEvents(1,&eventList[(*index-1)%2]); } else { numBlock = 1; numElementsPerBlock = numThread*2; extraSpace = numElementsPerBlock/NUM_BANKS; sharedMemSize = sizeof(int)*( numElementsPerBlock + extraSpace ); if( isPowerOfTwo( remainer ) ) { perscanFirstPass_kernel_int(t_temp, d_temp, tR->d_scanBlockSums[0], d_Rin, remainer, true, base, offset, OPERATOR ,numBlock, numThread, sharedMemSize,rLen, index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU ); clWaitForEvents(1,&eventList[(*index-1)%2]); } else { perscanFirstPass_kernel_int(t_temp,d_temp, tR->d_scanBlockSums[0], d_Rin, remainer, false, base, offset, OPERATOR ,numBlock, numThread, sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU ); clWaitForEvents(1,&eventList[(*index-1)%2]); } } } clWaitForEvents(1,&eventList[(*index-1)%2]); CL_FREE( d_temp ); CL_FREE( t_temp ); }
int reduceBlockSums( cl_mem d_Rout, int maxNumThread, int OPERATOR, int rLen,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,tempResult *tR, int _CPU_GPU) { int* info = (int*)malloc( sizeof(int)*2 ); cl_mem temp=NULL; CL_MALLOC(&temp, sizeof(int)*rLen ); //get the information of partition //return bool: if is multiple of maxNumThread //if yes, info[0]: number of blocks, info[1] = maxNumThread //if no, info[0]: number of blocks except of the last block, info[1]: number of thread in the last block for( int level = 0; level < ( tR->d_numLevelsAllocated - 1 ); level++ ) { bool isMul = howPartition( tR->levelSize[level], maxNumThread, info ); unsigned int numBlock = info[0]; unsigned int numElementsPerBlock = 0; unsigned int extraSpace = 0; unsigned int sharedMemSize = 0; //scan the isP2 blocks if( numBlock > 0 ) { numElementsPerBlock = maxNumThread*2; extraSpace = numElementsPerBlock/NUM_BANKS; sharedMemSize = sizeof(int)*( numElementsPerBlock + extraSpace ); perscan_kernel_int(temp, tR->d_scanBlockSums[level + 1], tR->d_scanBlockSums[level], numElementsPerBlock, true, 0, 0, OPERATOR, numBlock, maxNumThread,sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); clWaitForEvents(1,&eventList[(*index-1)%2]); } clWaitForEvents(1,&eventList[(*index-1)%2]); //scan the single not isP2 block if( (!isMul) || (numBlock == 0) ) { unsigned int base = numElementsPerBlock*info[0]; unsigned int offset = info[0]; unsigned int remainer = tR->levelSize[level] - numElementsPerBlock*info[0]; int numThread = info[1];//update the numThread clWaitForEvents(1,&eventList[(*index-1)%2]); //only one number in the last block if( numThread == 0 ) { cl_copyBuffer((tR->d_scanBlockSums[level+1]), offset, tR->d_scanBlockSums[level], base, sizeof(int), index,eventList,Flag_CPU_GPU,burden,_CPU_GPU); } else { numBlock = 1; numElementsPerBlock = numThread*2; extraSpace = numElementsPerBlock/NUM_BANKS; sharedMemSize = sizeof(int)*( numElementsPerBlock + 
extraSpace ); if( isPowerOfTwo( remainer ) ) { perscan_kernel_int(temp, tR->d_scanBlockSums[level + 1],tR->d_scanBlockSums[level], remainer, true, base, offset, OPERATOR,numBlock, numThread, sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); clWaitForEvents(1,&eventList[(*index-1)%2]); } else { perscan_kernel_int(temp,tR->d_scanBlockSums[level + 1], tR->d_scanBlockSums[level], remainer, false, base, offset, OPERATOR,numBlock, numThread, sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); clWaitForEvents(1,&eventList[(*index-1)%2]); } } } } clWaitForEvents(1,&eventList[(*index-1)%2]); getResult_kernel_init(tR->d_scanBlockSums[tR->d_numLevelsAllocated - 1], d_Rout, rLen, OPERATOR,1,1,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); clWaitForEvents(1,&eventList[(*index-1)%2]); CL_FREE(temp); return 1; }
int main() { // unsigned long start_time = time_ms(); // init matrix memset(u, 0, N*N*sizeof(VALUE)); printf("Jacobi with N=%d, L_SZ=%d, IT=%d\n", N, L_SZ, IT); printf("Kernel file name: %s\n", KERNEL_FILE_NAME); // init F for(int i=0; i<N; i++) for(int j=0; j<N; j++) f[i][j] = init_func(i, j); VALUE factor = pow((VALUE)1/N, 2); // ocl initialization cl_context context; cl_command_queue command_queue; cl_device_id device_id = cluInitDevice(CL_DEVICE, &context, &command_queue); // create memory buffers cl_int err; cl_mem matrix_U = clCreateBuffer(context, CL_MEM_READ_WRITE, N * N * sizeof(VALUE), NULL, &err); cl_mem matrix_F = clCreateBuffer(context, CL_MEM_READ_ONLY, N * N * sizeof(VALUE), NULL, &err); cl_mem matrix_TMP = clCreateBuffer(context, CL_MEM_READ_WRITE, N * N * sizeof(VALUE), NULL, &err); CLU_ERRCHECK(err, "Failed to create buffer for matrix"); // used for profiling info cl_event ev_write_U; cl_event ev_write_F; cl_event ev_kernel; cl_event ev_read_TMP; double write_total, read_total, kernel_total; write_total = read_total = kernel_total = 0.0; // create kernel from source char tmp[1024]; sprintf(tmp, "-DN=%i -DVALUE=%s", N, EXPAND_AND_QUOTE(VALUE)); cl_program program = cluBuildProgramFromFile(context, device_id, KERNEL_FILE_NAME, tmp); cl_kernel kernel = clCreateKernel(program, "jacobi", &err); CLU_ERRCHECK(err, "Failed to create matrix_mul kernel from program"); /* ---------------------------- main part ----------------------------------- */ // also initialize target matrix with zero values!!! 
err = clEnqueueWriteBuffer(command_queue, matrix_TMP, CL_TRUE, 0, N * N * sizeof(VALUE), u, 0, NULL, &ev_write_U); CLU_ERRCHECK(err, "Failed to write matrix to device"); // write f to device err = clEnqueueWriteBuffer(command_queue, matrix_F, CL_FALSE, 0, N * N * sizeof(VALUE), f, 0, NULL, &ev_write_F); CLU_ERRCHECK(err, "Failed to write matrix F to device"); // write matrix u to device err = clEnqueueWriteBuffer(command_queue, matrix_U, CL_FALSE, 0, N * N * sizeof(VALUE), u, 0, NULL, &ev_write_U); CLU_ERRCHECK(err, "Failed to write matrix U to device"); // define global work size size_t g_work_size[2] = {N, N}; size_t l_work_size[2] = {L_SZ, L_SZ}; cl_mem buffer_u; cl_mem buffer_tmp; for (int i = 0; i < IT; ++i) { // swap U and TMP arguments based on iteration counter if(i % 2 == 0) { buffer_u = matrix_U; buffer_tmp = matrix_TMP; } else { buffer_u = matrix_TMP; buffer_tmp = matrix_U; } // compute memory block dimensions int block_dim = (L_SZ + 2) * (L_SZ + 2); // set kernel arguments cluSetKernelArguments(kernel, 5, sizeof(cl_mem), (void *)&buffer_u, sizeof(cl_mem), (void *)&matrix_F, sizeof(cl_mem), (void *)&buffer_tmp, // local memory buffer block_dim * sizeof(VALUE), NULL, sizeof(VALUE), (void *)&factor); // execute kernel err = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, g_work_size, l_work_size, 0, NULL, &ev_kernel); CLU_ERRCHECK(err, "Failed to enqueue 2D kernel"); // wait until execution completes clWaitForEvents(1, &ev_kernel); // add profiling information kernel_total += getDurationMS(ev_kernel); } // copy results back to host err = clEnqueueReadBuffer(command_queue, buffer_tmp, CL_TRUE, 0, N * N * sizeof(VALUE), u, 0, NULL, &ev_read_TMP); CLU_ERRCHECK(err, "Failed reading back result"); // compute profiling information write_total += getDurationMS(ev_write_U); write_total += getDurationMS(ev_write_F); read_total += getDurationMS(ev_read_TMP); /* ---------------------------- evaluate results ---------------------------------- */ // print result 
printf("OCL Device: %s\n", cluGetDeviceDescription(device_id, CL_DEVICE)); // printf("Verification: %4s\n", (success) ? "OK" : "ERR"); printf("Write total: %9.4f ms\n", write_total); printf("Read total: %9.4f ms\n", read_total); printf("Kernel execution: %9.4f ms\n", kernel_total); printf("Time total: %9.4f ms\n\n", write_total + read_total + kernel_total); #ifdef DEBUG print_result(u); #endif /* ---------------------------- finalization ------------------------------------- */ err = clFinish(command_queue); err |= clReleaseKernel(kernel); err |= clReleaseProgram(program); err |= clReleaseMemObject(matrix_U); err |= clReleaseMemObject(matrix_F); err |= clReleaseMemObject(matrix_TMP); err |= clReleaseCommandQueue(command_queue); err |= clReleaseContext(context); CLU_ERRCHECK(err, "Failed during ocl cleanup"); return EXIT_SUCCESS; }
/*
 * Blocks until the event for the given submission index has completed,
 * logging before and after.  Events live in a two-slot ring buffer, hence
 * the (index-1)%2 addressing (presumably double-buffered command
 * submission -- confirm against the callers that fill eventList).
 */
void wait(int index,cl_event *eventList){
    printf("index of %d Going to wait!\n",index);
    cl_int err=clWaitForEvents(1,&eventList[(index-1)%2]);
    /* BUG FIX: the format string ended in "\n," which printed a stray
     * comma at the start of the next output line. */
    printf("index of %d Finish wait! err is %d\n",index,err);
}
template <typename ElemType> nano_time_t TrsvPerformanceTest<ElemType>::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; size_t lenX = 1 + ((params_.N-1) * abs(params_.incx)) + params_.offBX; status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0, lenX * sizeof(ElemType), backX_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector X buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; DataType type; type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; time = getCurrentTime(); #define TIMING #ifdef TIMING clFinish( queue); int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::trsv(type, params_.order, params_.uplo, params_.transA, params_.diag, params_.N, mobjA_, params_.offa, params_.lda, mobjX_, params_.offBX, params_.incx, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS TRSV function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; }
/*
 * Solves a QAP instance on the GPU: enumerates sub-problems on the host up to
 * depth OMP_CUT, uploads them (plus the two distance/flow matrices) to the
 * device, runs the "qap" kernel with one work-item per sub-problem, and reads
 * back the best objective value found.
 *
 * Returns the best (minimal) objective value.
 */
int solve_ocl(problem* problem) {
    // -------------- create list of sub-problems ---------------------

    // get upper boundary for number of sub-problems
    int max_problems = powl(problem->size, OMP_CUT);

    // create array of activation records
    // NOTE(review): malloc results are not checked here; on failure
    // initActivationRecord would dereference NULL.
    call_record* sub_problems = malloc(sizeof(call_record) * max_problems);
    call_record* pos = sub_problems;

    // create store for partial solutions
    int* block = malloc(sizeof(int) * max_problems * problem->size);
    int used = 0;

    // fill up sub-problem array
    volatile int best = 1<<30;                 // running upper bound, shared with the kernel
    int partial[problem->size];
    initActivationRecord(problem, partial, 0, 0, 0, &best, &pos, block, &used, OMP_CUT);

    unsigned num_sub_problems = pos - sub_problems;
    printf("Sub-problem list filled %d/%d\n", num_sub_problems, max_problems);

    // -------------- process list using GPU ---------------------

    cl_int error;

    // get platform (first one reported)
    cl_platform_id platform;
    {
        cl_uint numPlatforms;
        handleErrorCode(clGetPlatformIDs(0,0, &numPlatforms));
        INFO("Found %d platform(s)!", numPlatforms);
        cl_platform_id ids[numPlatforms];
        handleErrorCode(clGetPlatformIDs(numPlatforms,ids, &numPlatforms));
        platform = ids[0];
    }

    // get devices (first one reported)
    cl_device_id device;
    {
        cl_uint numDevices;
        handleErrorCode(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, 0, &numDevices));
        cl_device_id ids[numDevices];
        handleErrorCode(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, ids, &numDevices));
        device = ids[0];

        size_t size;
        clGetDeviceInfo(device, CL_DEVICE_VENDOR, 0, 0, &size);
        char name[size];
        clGetDeviceInfo(device, CL_DEVICE_VENDOR, size*sizeof(char), name, 0);
        printf("Using Device %s\n", name);
    }

    // create context
    INFO("Creating context ..");
    cl_context context = clCreateContext(0, 1, &device, 0, 0, &error);
    handleErrorCode(error);

    // create queue
    INFO("Creating command queue ..");
    cl_command_queue queue = clCreateCommandQueue(context, device, 0, &error);
    handleErrorCode(error);

    // create program
    INFO("Building program ..");
    cl_program program;
    {
        // NOTE(review): the string returned by loadFile() is never freed --
        // if loadFile heap-allocates (likely), this leaks; confirm against
        // its definition before adding a free() here.
        const char* code = loadFile("qap_array.cl");
        INFO("Kernel Code:\n%s\n", code);
        size_t code_length = strlen(code);
        program = clCreateProgramWithSource(context,1, &code, &code_length, &error);
        handleErrorCode(error);

        // build program
        error = clBuildProgram(program, 1, &device, 0, 0, 0);
        if (error == CL_BUILD_PROGRAM_FAILURE) {
            printf("Program built failed!\n");
            // obtain additional build error information
            size_t logSize = 2048;
            char log[logSize];
            clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, &logSize);
            printf("Log:\n%s\n", log);
            exit(1);
        } else {
            handleErrorCode(error);
        }
    }

    // get kernel
    INFO("Building kernel ..");
    cl_kernel kernel = clCreateKernel(program, "qap", &error);
    handleErrorCode(error);

    // create buffers (inputs copied from host memory at creation time)
    INFO("Creating Buffers ..");
    cl_mem mA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*problem->size*problem->size, problem->A->data, &error);
    handleErrorCode(error);
    cl_mem mB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*problem->size*problem->size, problem->B->data, &error);
    handleErrorCode(error);
    cl_mem vC = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(call_record)*num_sub_problems, sub_problems, &error);
    handleErrorCode(error);
    cl_mem vP = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*problem->size*num_sub_problems, block, &error);
    handleErrorCode(error);
    cl_mem sR = clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR, sizeof(int), (void*)&best, &error);
    handleErrorCode(error);

    // set up arguments
    INFO("Setting up Arguments ..");
    handleErrorCode(clSetKernelArg(kernel, 0, sizeof(int), &problem->size));
    handleErrorCode(clSetKernelArg(kernel, 1, sizeof(mA), &mA));
    handleErrorCode(clSetKernelArg(kernel, 2, sizeof(mB), &mB));
    handleErrorCode(clSetKernelArg(kernel, 3, sizeof(vC), &vC));
    handleErrorCode(clSetKernelArg(kernel, 4, sizeof(vP), &vP));
    handleErrorCode(clSetKernelArg(kernel, 5, sizeof(sR), &sR));
    handleErrorCode(clSetKernelArg(kernel, 6, sizeof(num_sub_problems), &num_sub_problems));

    // run kernel
    INFO("Running kernel ..");
    size_t local_work_size = 64;
    size_t global_work_offset = 0;
    // round the global size up to a multiple of the work-group size
    size_t global_work_size = (num_sub_problems + (local_work_size - 1))/local_work_size*local_work_size;
    cl_event kernel_done;
    double start = getTime();
    handleErrorCode(clEnqueueNDRangeKernel(queue, kernel, 1, &global_work_offset, &global_work_size, &local_work_size, 0, 0, &kernel_done));
    handleErrorCode(clWaitForEvents(1,&kernel_done));
    double time = getTime() - start;
    printf("OpenCL Computation Time: %lf sec\n", time);

    // enqueue read operation (blocking; CL_TRUE instead of bare 'true')
    cl_event read_done;
    int res = 0;
    handleErrorCode(clEnqueueReadBuffer(queue, sR, CL_TRUE, 0, sizeof(int), &res, 1, &kernel_done, &read_done));

    // wait for completion
    handleErrorCode(clWaitForEvents(1,&read_done));

    // cleanup
    // BUG FIX: the two events and all five buffers were previously leaked.
    // (A stale commented-out copy of the generic fib driver that followed the
    // return has also been removed.)
    handleErrorCode(clFinish(queue));
    handleErrorCode(clReleaseEvent(kernel_done));
    handleErrorCode(clReleaseEvent(read_done));
    handleErrorCode(clReleaseMemObject(mA));
    handleErrorCode(clReleaseMemObject(mB));
    handleErrorCode(clReleaseMemObject(vC));
    handleErrorCode(clReleaseMemObject(vP));
    handleErrorCode(clReleaseMemObject(sR));
    handleErrorCode(clReleaseKernel(kernel));
    handleErrorCode(clReleaseProgram(program));
    handleErrorCode(clReleaseCommandQueue(queue));
    handleErrorCode(clReleaseContext(context));

    // -------------- free resources and be done ---------------------

    // free temporary arrays
    free(sub_problems);
    free(block);

    // return result
    return res;
}
//////////////////////////////////////////////////////////////////////////////
//! Run matrix multiplication C = A * B across all available GPUs, splitting
//! the rows of the result matrix between the devices.
//!
//! @param ciDeviceCount  number of devices/command queues to use
//! @param h_A            cl_mem holding the full A matrix
//! @param h_B_data       host pointer with the full B matrix data
//! @param mem_size_B     size in bytes of the B matrix
//! @param h_C            host output array receiving the result matrix C
//!
//! Relies on the file-level globals cxGPUContext, commandQueue[],
//! multiplicationKernel[], uiHA/uiWA/uiWB/uiWC and BLOCK_SIZE.
//////////////////////////////////////////////////////////////////////////////
void matrixMulGPU(cl_uint ciDeviceCount, cl_mem h_A, float* h_B_data, unsigned int mem_size_B, float* h_C )
{
    cl_mem d_A[MAX_GPU_COUNT];
    cl_mem d_C[MAX_GPU_COUNT];
    cl_mem d_B[MAX_GPU_COUNT];

    cl_event GPUDone[MAX_GPU_COUNT];
    cl_event GPUExecution[MAX_GPU_COUNT];

    // Start the computation on each available GPU

    // Create buffers for each GPU
    // Each GPU will compute sizePerGPU rows of the result
    int sizePerGPU = uiHA / ciDeviceCount;

    int workOffset[MAX_GPU_COUNT];
    int workSize[MAX_GPU_COUNT];

    workOffset[0] = 0;
    for(unsigned int i=0; i < ciDeviceCount; ++i)
    {
        // The last device picks up the remainder rows when uiHA does not
        // divide evenly by the device count.
        workSize[i] = (i != (ciDeviceCount - 1)) ? sizePerGPU : (uiHA - workOffset[i]);

        // Input buffer
        d_A[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float) * uiWA, NULL,NULL);

        // Copy only assigned rows from host to device
        clEnqueueCopyBuffer(commandQueue[i], h_A, d_A[i], workOffset[i] * sizeof(float) * uiWA,
                            0, workSize[i] * sizeof(float) * uiWA, 0, NULL, NULL);

        // create OpenCL buffer on device that will be initiatlize from the host memory on first use
        // on device
        d_B[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                mem_size_B, h_B_data, NULL);

        // Output buffer
        d_C[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, workSize[i] * uiWC * sizeof(float), NULL,NULL);

        // set the args values
        clSetKernelArg(multiplicationKernel[i], 0, sizeof(cl_mem), (void *) &d_C[i]);
        clSetKernelArg(multiplicationKernel[i], 1, sizeof(cl_mem), (void *) &d_A[i]);
        clSetKernelArg(multiplicationKernel[i], 2, sizeof(cl_mem), (void *) &d_B[i]);
        clSetKernelArg(multiplicationKernel[i], 3, sizeof(float) * BLOCK_SIZE *BLOCK_SIZE, 0 );
        clSetKernelArg(multiplicationKernel[i], 4, sizeof(float) * BLOCK_SIZE *BLOCK_SIZE, 0 );
        clSetKernelArg(multiplicationKernel[i], 5, sizeof(cl_int), (void *) &uiWA);
        clSetKernelArg(multiplicationKernel[i], 6, sizeof(cl_int), (void *) &uiWB);

        if(i+1 < ciDeviceCount)
            workOffset[i + 1] = workOffset[i] + workSize[i];
    }

    // Execute Multiplication on all GPUs in parallel
    size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
    size_t globalWorkSize[] = {shrRoundUp(BLOCK_SIZE, uiWC), shrRoundUp(BLOCK_SIZE, workSize[0])};

    // Launch kernels on devices
#ifdef GPU_PROFILING
    // One warmup pass (j == -1) followed by nIter timed passes.
    int nIter = 30;
    for (int j = -1; j < nIter; j++)
    {
        // Sync all queues to host and start timer first time through loop
        if(j == 0){
            for(unsigned int i = 0; i < ciDeviceCount; i++)
            {
                clFinish(commandQueue[i]);
            }
            shrDeltaT(0);
        }
#endif
        for(unsigned int i = 0; i < ciDeviceCount; i++)
        {
            // Multiplication - non-blocking execution:  launch and push to device(s)
            globalWorkSize[1] = shrRoundUp(BLOCK_SIZE, workSize[i]);
#ifdef GPU_PROFILING
            // BUGFIX: every pass after the first overwrites GPUExecution[i];
            // release the previous pass' event so it is not leaked
            // (previously leaked ciDeviceCount * nIter events per call).
            if (j > -1)
            {
                clReleaseEvent(GPUExecution[i]);
            }
#endif
            clEnqueueNDRangeKernel(commandQueue[i], multiplicationKernel[i], 2, 0, globalWorkSize, localWorkSize,
                                   0, NULL, &GPUExecution[i]);
            clFlush(commandQueue[i]);
        }
#ifdef GPU_PROFILING
    }
#endif

    // sync all queues to host
    for(unsigned int i = 0; i < ciDeviceCount; i++)
    {
        clFinish(commandQueue[i]);
    }

#ifdef GPU_PROFILING
    // stop and log timer
    double dSeconds = shrDeltaT(0)/(double)nIter;
    double dNumOps = 2.0 * (double)uiWA * (double)uiHA * (double)uiWB;
    double gflops = 1.0e-9 * dNumOps/dSeconds;
    shrLogEx(LOGBOTH | MASTER, 0, "oclMatrixMul, Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Workgroup = %u\n",
            gflops, dSeconds, dNumOps, ciDeviceCount, localWorkSize[0] * localWorkSize[1]);

    // Print kernel timing per GPU
    shrLog("\n");
    for(unsigned int i = 0; i < ciDeviceCount; i++)
    {
        shrLog(" Kernel execution time on GPU %d \t: %.5f s\n", i, executionTime(GPUExecution[i]));
    }
    shrLog("\n");
#endif

    for(unsigned int i = 0; i < ciDeviceCount; i++)
    {
        // Non-blocking copy of result from device to host
        clEnqueueReadBuffer(commandQueue[i], d_C[i], CL_FALSE, 0, uiWC * sizeof(float) * workSize[i],
                            h_C + workOffset[i] * uiWC, 0, NULL, &GPUDone[i]);
    }

    // CPU sync with GPU
    clWaitForEvents(ciDeviceCount, GPUDone);

    // Release mem and event objects
    for(unsigned int i = 0; i < ciDeviceCount; i++)
    {
        clReleaseMemObject(d_A[i]);
        clReleaseMemObject(d_C[i]);
        clReleaseMemObject(d_B[i]);
        clReleaseEvent(GPUExecution[i]);
        clReleaseEvent(GPUDone[i]);
    }
}
bool SkDifferentPixelsMetric::diff(SkBitmap* baseline, SkBitmap* test, bool computeMask, Result* result) const { double startTime = get_seconds(); if (!fIsGood) { return false; } // If we never end up running the kernel, include some safe defaults in the result. result->poiCount = 0; // Ensure the images are comparable if (baseline->width() != test->width() || baseline->height() != test->height() || baseline->width() <= 0 || baseline->height() <= 0 || baseline->config() != test->config()) { return false; } cl_mem baselineImage; cl_mem testImage; cl_mem resultsBuffer; // Upload images to the CL device if (!this->makeImage2D(baseline, &baselineImage) || !this->makeImage2D(test, &testImage)) { SkDebugf("creation of openCL images failed"); return false; } // A small hack that makes calculating percentage difference easier later on. result->result = 1.0 / ((double)baseline->width() * baseline->height()); // Make a buffer to store results into. It must be initialized with pointers to memory. static const int kZero = 0; // We know OpenCL won't write to it because we use CL_MEM_COPY_HOST_PTR resultsBuffer = clCreateBuffer(fContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(int), (int*)&kZero, NULL); // Set all kernel arguments cl_int setArgErr = clSetKernelArg(fKernel, 0, sizeof(cl_mem), &baselineImage); setArgErr |= clSetKernelArg(fKernel, 1, sizeof(cl_mem), &testImage); setArgErr |= clSetKernelArg(fKernel, 2, sizeof(cl_mem), &resultsBuffer); if (CL_SUCCESS != setArgErr) { SkDebugf("Set arg failed: %s\n", cl_error_to_string(setArgErr)); return false; } // Queue this diff on the CL device cl_event event; const size_t workSize[] = { baseline->width(), baseline->height() }; cl_int enqueueErr; enqueueErr = clEnqueueNDRangeKernel(fCommandQueue, fKernel, 2, NULL, workSize, NULL, 0, NULL, &event); if (CL_SUCCESS != enqueueErr) { SkDebugf("Enqueue failed: %s\n", cl_error_to_string(enqueueErr)); return false; } // This makes things totally synchronous. 
Actual queue is not ready yet clWaitForEvents(1, &event); // Immediate read back the results clEnqueueReadBuffer(fCommandQueue, resultsBuffer, CL_TRUE, 0, sizeof(int), &result->poiCount, 0, NULL, NULL); result->result *= (double)result->poiCount; result->result = (1.0 - result->result); // Release all the buffers created clReleaseMemObject(resultsBuffer); clReleaseMemObject(baselineImage); clReleaseMemObject(testImage); result->timeElapsed = get_seconds() - startTime; return true; }
int main(int argc, char **argv) { cl_platform_id platforms[100]; cl_uint platforms_n = 0; CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n)); printf("=== %d OpenCL platform(s) found: ===\n", platforms_n); for (int i=0; i<platforms_n; i++) { char buffer[10240]; printf(" -- %d --\n", i); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL)); printf(" PROFILE = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL)); printf(" VERSION = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL)); printf(" NAME = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL)); printf(" VENDOR = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL)); printf(" EXTENSIONS = %s\n", buffer); } if (platforms_n == 0) return 1; cl_device_id devices[100]; cl_uint devices_n = 0; // CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n)); CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 100, devices, &devices_n)); printf("=== %d OpenCL device(s) found on platform:\n", platforms_n); for (int i=0; i<devices_n; i++) { char buffer[10240]; cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- %d --\n", i); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf(" DEVICE_NAME = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VENDOR = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL)); printf(" DRIVER_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned 
int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL)); printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); } if (devices_n == 0) return 1; cl_context context; context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices, &pfn_notify, NULL, &_err)); const char *program_source[] = { "__kernel void simple_demo(__global int *src, __global int *dst, int factor)\n", "{\n", " int i = get_global_id(0);\n", " dst[i] = src[i] * factor;\n", "}\n" }; cl_program program; program = CL_CHECK_ERR(clCreateProgramWithSource(context, sizeof(program_source)/sizeof(*program_source), program_source, NULL, &_err)); if (clBuildProgram(program, 1, devices, "", NULL, NULL) != CL_SUCCESS) { char buffer[10240]; clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL); fprintf(stderr, "CL Compilation failed:\n%s", buffer); abort(); } CL_CHECK(clUnloadCompiler()); cl_mem input_buffer; input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*NUM_DATA, NULL, &_err)); cl_mem output_buffer; output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int)*NUM_DATA, NULL, &_err)); int factor = 2; cl_kernel kernel; kernel = CL_CHECK_ERR(clCreateKernel(program, "simple_demo", &_err)); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor)); cl_command_queue queue; queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[0], 0, &_err)); for (int i=0; i<NUM_DATA; i++) { CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &i, 0, NULL, NULL)); } cl_event kernel_completion; size_t 
global_work_size[1] = { NUM_DATA }; CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion)); CL_CHECK(clWaitForEvents(1, &kernel_completion)); CL_CHECK(clReleaseEvent(kernel_completion)); printf("Result:"); for (int i=0; i<NUM_DATA; i++) { int data; CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &data, 0, NULL, NULL)); printf(" %d", data); } printf("\n"); CL_CHECK(clReleaseMemObject(input_buffer)); CL_CHECK(clReleaseMemObject(output_buffer)); CL_CHECK(clReleaseKernel(kernel)); CL_CHECK(clReleaseProgram(program)); CL_CHECK(clReleaseContext(context)); return 0; }
/*
 * Host-to-device buffer benchmark: builds "set"/"add" kernels with a
 * -DTYPE=floatN build option, then times set/add/map over several buffer
 * allocation flag configurations (none, USE_HOST_PTR, ALLOC_HOST_PTR) and
 * prints per-turn time and bandwidth statistics.
 *
 * Usage: prog [platform# [device# [vector-width]]]
 *
 * Relies on file-level globals declared elsewhere: np, nd, platform, device,
 * p, d, strbuf, gmem, alloc_max, ctx_prop, ctx, q, pg, src, k_set, k_add,
 * wgm, nbuf, buf_size, nels, gws, buf, hbuf, set_event, add_event, map_event,
 * plus helpers CHECK_ERROR, ROUND_MUL, event_perf, compare_double, BUFSZ.
 */
int main(int argc, char *argv[]) {
#define EXTRAROOM 1024
	/* type_ptr points at the "XX" suffix of "-DTYPE=floatXX", which is
	 * overwritten with the requested vector width (or truncated off). */
	char type_def[] = "-DTYPE=floatXX";
	char* type_ptr= type_def + sizeof(type_def) - 3;
	cl_uint vec_width = 1;

	// selected platform and device number
	cl_uint pn = 0, dn = 0;

	// OpenCL error
	cl_int error;

	// generic iterator
	cl_uint i;

	// set platform/device num from command line
	if (argc > 1)
		pn = atoi(argv[1]);
	if (argc > 2)
		dn = atoi(argv[2]);
	if (argc > 3) {
		vec_width = atoi(argv[3]);
		// this should only be 2, 4, 8, 16
		// if the user passes bogus data, it's their problem.
		// NOTE(review): when the user explicitly passes "1" (or 0), neither
		// branch terminates the string, so the build option stays
		// "-DTYPE=floatXX" — likely a bug; confirm intended behavior.
		if (vec_width > 1)
			strncpy(type_ptr, argv[3], 2);
		if (vec_width == 3)
			vec_width++;
	} else {
		*type_ptr = '\0';
	}

	error = clGetPlatformIDs(0, NULL, &np);
	CHECK_ERROR("getting amount of platform IDs");
	printf("%u platforms found\n", np);
	if (pn >= np) {
		fprintf(stderr, "there is no platform #%u\n" , pn);
		exit(1);
	}
	// only allocate for IDs up to the intended one
	platform = calloc(pn+1,sizeof(*platform));
	// if allocation failed, next call will bomb. rely on this
	error = clGetPlatformIDs(pn+1, platform, NULL);
	CHECK_ERROR("getting platform IDs");
	// choose platform
	p = platform[pn];
	error = clGetPlatformInfo(p, CL_PLATFORM_NAME, BUFSZ, strbuf, NULL);
	CHECK_ERROR("getting platform name");
	printf("using platform %u: %s\n", pn, strbuf);

	error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &nd);
	CHECK_ERROR("getting amount of device IDs");
	printf("%u devices found\n", nd);
	if (dn >= nd) {
		fprintf(stderr, "there is no device #%u\n", dn);
		exit(1);
	}
	// only allocate for IDs up to the intended one
	device = calloc(dn+1,sizeof(*device));
	// if allocation failed, next call will bomb. rely on this
	error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, dn+1, device, NULL);
	CHECK_ERROR("getting device IDs");
	// choose device
	d = device[dn];
	error = clGetDeviceInfo(d, CL_DEVICE_NAME, BUFSZ, strbuf, NULL);
	CHECK_ERROR("getting device name");
	printf("using device %u: %s\n", dn, strbuf);

	// Query memory limits used below to size the benchmark buffers.
	error = clGetDeviceInfo(d, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(gmem), &gmem, NULL);
	CHECK_ERROR("getting device global memory size");
	error = clGetDeviceInfo(d, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(alloc_max), &alloc_max, NULL);
	CHECK_ERROR("getting device max memory allocation size");

	// create context
	ctx_prop[1] = (cl_context_properties)p;
	ctx = clCreateContext(ctx_prop, 1, &d, NULL, NULL, &error);
	CHECK_ERROR("creating context");
	// create queue (profiling enabled: event_perf reads event timestamps)
	q = clCreateCommandQueue(ctx, d, CL_QUEUE_PROFILING_ENABLE, &error);
	CHECK_ERROR("creating queue");
	// create program
	pg = clCreateProgramWithSource(ctx, sizeof(src)/sizeof(*src), src, NULL, &error);
	CHECK_ERROR("creating program");
	// build program
	printf("OpenCL program build options: %s\n", type_def);
	error = clBuildProgram(pg, 1, &d, type_def, NULL, NULL);
#if 1
	if (error == CL_BUILD_PROGRAM_FAILURE) {
		// NOTE(review): this overwrites `error` with the (likely successful)
		// clGetProgramBuildInfo status, so the CHECK_ERROR("building
		// program") below will NOT catch the build failure itself.
		error = clGetProgramBuildInfo(pg, d, CL_PROGRAM_BUILD_LOG, BUFSZ, strbuf, NULL);
		CHECK_ERROR("get program build info");
		printf("=== BUILD LOG ===\n%s\n=========\n", strbuf);
	}
#endif
	CHECK_ERROR("building program");

	// get kernels
	k_set = clCreateKernel(pg, "set", &error);
	CHECK_ERROR("creating kernel set");
	k_add = clCreateKernel(pg, "add", &error);
	CHECK_ERROR("creating kernel add");
	error = clGetKernelWorkGroupInfo(k_add, d, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(wgm), &wgm, NULL);
	CHECK_ERROR("getting preferred workgroup size multiple");

	// we allocate two buffers
	nbuf = 2;
	// reduce buffer allocation size to ensure we can fit all buffer
	// into the device memory
	if (alloc_max > gmem/nbuf)
		buf_size = gmem/nbuf;
	else
		buf_size = alloc_max;
	// number of elements that fit in the given buf_size
	nels = buf_size/sizeof(cl_float)/vec_width;
	// set the buffer size to match exactly what we need
	buf_size = nels*sizeof(cl_float)*vec_width;
	// global work size: rounded up to the preferred workgroup multiple
	gws = ROUND_MUL(nels, wgm);
	printf("will use %zu workitems to process %u elements of type %s\n", gws, nels, type_def + 7);
#define MB (1024*1024.0)
	printf("will try allocating %u buffers of %gMB each\n", nbuf, buf_size/MB);
	buf = calloc(nbuf, sizeof(cl_mem));
	if (!buf) {
		fprintf(stderr, "could not prepare support for %u buffers\n", nbuf);
		exit(1);
	}

	// we try multiple configurations: no HOST_PTR flags, USE_HOST_PTR and ALLOC_HOST_PTR
	// (a plain READ_WRITE turn is repeated at the end as a control run)
	const cl_mem_flags buf_flags[] = {
		CL_MEM_READ_WRITE,
		CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE,
		CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
		CL_MEM_READ_WRITE,
	};
	const size_t nturns = sizeof(buf_flags)/sizeof(*buf_flags);
	const size_t nloops = 5; // number of loops for each turn, for stats
	const size_t median = nloops/2; // location of median value after sorting
	// "set" and "add" each touch both buffers once → 2*buf_size bytes moved
	const size_t gmem_bytes_rw = 2*buf_size;
	const char * const flag_names[] = {
		"(none)",
		"USE_HOST_PTR",
		"ALLOC_HOST_PTR",
		"(none)"
	};
	double runtimes[nturns][3][nloops]; /* set, add, map */
	memset(runtimes, 0, nturns*sizeof(*runtimes));

	hbuf = calloc(nbuf, sizeof(*hbuf));
	if (!hbuf) {
		fputs("couldn't allocate host buffer array\n", stderr);
		exit(1);
	}

	for (size_t turn = 0; turn < sizeof(buf_flags)/sizeof(*buf_flags); ++turn) {
		// (Re)create the device buffers with this turn's flags; USE_HOST_PTR
		// turns also need a backing host allocation.
		for (i = 0; i < nbuf; ++i) {
			if (buf_flags[turn] & CL_MEM_USE_HOST_PTR) {
				hbuf[i] = calloc(buf_size, 1);
				if (!hbuf[i]) {
					fputs("couldn't allocate host buffer array\n", stderr);
					exit(1);
				}
			}
			buf[i] = clCreateBuffer(ctx, buf_flags[turn], buf_size, hbuf[i], &error);
			CHECK_ERROR("allocating buffer");
			printf("buffer %u allocated\n", i);
		}
		for (size_t loop = 0; loop < nloops; ++loop) {
			// set → add (waits on set) → blocking map of buf[0] (waits on add)
			clSetKernelArg(k_set, 0, sizeof(buf[0]), buf);
			clSetKernelArg(k_set, 1, sizeof(buf[1]), buf + 1);
			clSetKernelArg(k_set, 2, sizeof(nels), &nels);
			error = clEnqueueNDRangeKernel(q, k_set, 1, NULL, &gws, NULL, 0, NULL, &set_event);
			CHECK_ERROR("enqueueing kernel set");
			clSetKernelArg(k_add, 0, sizeof(buf[0]), buf);
			clSetKernelArg(k_add, 1, sizeof(buf[1]), buf + 1);
			clSetKernelArg(k_add, 2, sizeof(nels), &nels);
			error = clEnqueueNDRangeKernel(q, k_add, 1, NULL, &gws, NULL, 1, &set_event, &add_event);
			CHECK_ERROR("enqueueing kernel add");
			float *hmap = clEnqueueMapBuffer(q, buf[0], CL_TRUE, CL_MAP_READ, 0, buf_size, 1, &add_event, &map_event, &error);
			CHECK_ERROR("map");
			error = clWaitForEvents(1, &map_event);
			CHECK_ERROR("map event");
			printf("Turn %zu, loop %zu: %s\n", turn, loop, flag_names[turn]);
			// event_perf presumably returns the elapsed time from the event's
			// profiling info (used as a divisor for bandwidth below).
			runtimes[turn][0][loop] = event_perf(set_event, gmem_bytes_rw, "set");
			runtimes[turn][1][loop] = event_perf(add_event, gmem_bytes_rw, "add");
			runtimes[turn][2][loop] = event_perf(map_event, buf_size, "map");
			clEnqueueUnmapMemObject(q, buf[0], hmap, 0, NULL, NULL);
			clFinish(q);
			// release the events
			clReleaseEvent(set_event);
			clReleaseEvent(add_event);
			clReleaseEvent(map_event);
		}
		// release the buffers
		for (i = 0; i < nbuf; ++i) {
			if (buf_flags[turn] & CL_MEM_USE_HOST_PTR) {
				free(hbuf[i]);
				hbuf[i] = NULL;
			}
			clReleaseMemObject(buf[i]);
		}
	}

	puts("Summary/stats:");
	for (size_t turn = 0; turn < nturns; ++turn) {
		double avg[3] = {0};
		/* I'm lazy, so sort with qsort and then compute average,
		 * otherwise we could just compute min, max, avg and median together */
		qsort(runtimes[turn][0], nloops, sizeof(double), compare_double);
		qsort(runtimes[turn][1], nloops, sizeof(double), compare_double);
		qsort(runtimes[turn][2], nloops, sizeof(double), compare_double);
		for (size_t loop = 0; loop < nloops; ++loop) {
			avg[0] += runtimes[turn][0][loop];
			avg[1] += runtimes[turn][1][loop];
			avg[2] += runtimes[turn][2][loop];
		}
		avg[0] /= nloops;
		avg[1] /= nloops;
		avg[2] /= nloops;
		printf("Turn %zu: %s\n", turn, flag_names[turn]);
		printf("set\ttime (ms): best: %8g, median: %8g, worst: %8g, avg: %8g\n",
				runtimes[turn][0][0], runtimes[turn][0][median],
				runtimes[turn][0][nloops - 1], avg[0]);
		printf("\tBW (GB/s): best: %8g, median: %8g, worst: %8g, avg: %8g\n",
				gmem_bytes_rw/runtimes[turn][0][0]*1.0e-6,
				gmem_bytes_rw/runtimes[turn][0][median]*1.0e-6,
				gmem_bytes_rw/runtimes[turn][0][nloops - 1]*1.0e-6,
				gmem_bytes_rw/avg[0]*1.0e-6);
		// NOTE(review): the "add" format string is missing a comma after the
		// median field ("median: %8g worst:") — cosmetic output bug.
		printf("add\ttime (ms): best: %8g, median: %8g worst: %8g, avg: %8g\n",
				runtimes[turn][1][0], runtimes[turn][1][median],
				runtimes[turn][1][nloops - 1], avg[1]);
		printf("\tBW (GB/s): best: %8g, median: %8g, worst: %8g, avg: %8g\n",
				gmem_bytes_rw/runtimes[turn][1][0]*1.0e-6,
				gmem_bytes_rw/runtimes[turn][1][median]*1.0e-6,
				gmem_bytes_rw/runtimes[turn][1][nloops - 1]*1.0e-6,
				gmem_bytes_rw/avg[1]*1.0e-6);
		printf("map\ttime (ms): best: %8g, median: %8g, worst: %8g, avg: %8g\n",
				runtimes[turn][2][0], runtimes[turn][2][median],
				runtimes[turn][2][nloops - 1], avg[2]);
		printf("\tBW (GB/s): best: %8g, median: %8g, worst: %8g, avg: %8g\n",
				buf_size/runtimes[turn][2][0]*1.0e-6,
				buf_size/runtimes[turn][2][median]*1.0e-6,
				buf_size/runtimes[turn][2][nloops - 1]*1.0e-6,
				buf_size/avg[2]*1.0e-6);
	}
	return 0;
}