int NBody::runCLKernels()
{
    cl_int status;
    cl_event events[1];

    /*
     * Enqueue a kernel run call.
     * Note: the global size must be a multiple of the local size (256),
     * so this assumes numBodies is a multiple of 256.
     */
    size_t globalThreads[] = {numBodies};
    size_t localThreads[] = {256};

    /* Capture each status before asserting, so the OpenCL calls are not
       compiled away when NDEBUG disables assert(). */
    status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
                                    globalThreads, localThreads, 0, NULL, NULL);
    assert(status == CL_SUCCESS);

    status = clFinish(commandQueue);
    assert(status == CL_SUCCESS);

    /* Copy the newly computed positions and velocities over the current ones. */
    status = clEnqueueCopyBuffer(commandQueue, newPos, currPos, 0, 0,
                                 sizeof(cl_float4) * numBodies, 0, 0, 0);
    assert(status == CL_SUCCESS);

    status = clEnqueueCopyBuffer(commandQueue, newVel, currVel, 0, 0,
                                 sizeof(cl_float4) * numBodies, 0, 0, 0);
    assert(status == CL_SUCCESS);

    status = clFinish(commandQueue);
    assert(status == CL_SUCCESS);

    /* Enqueue readBuffer */
    status = clEnqueueReadBuffer(commandQueue, currPos, CL_TRUE, 0,
                                 numBodies * sizeof(cl_float4), pos,
                                 0, NULL, &events[0]);
    assert(status == CL_SUCCESS);

    /* Wait for the read buffer to finish execution */
    status = clWaitForEvents(1, &events[0]);
    assert(status == CL_SUCCESS);

    clReleaseEvent(events[0]);
    return 0;
}
void cl_copyBuffer(cl_mem dest, int destOffset, cl_mem src, int srcOffset, size_t size,
                   int *index, cl_event *eventList, int *Flag_CPU_GPU, double *burden, int _CPU_GPU)
{
    int preFlag = (*Flag_CPU_GPU);
    double preBurden = (*burden);
    int CPU_GPU = cl_copyBufferscheduler(size, Flag_CPU_GPU, burden, _CPU_GPU);
    cl_int ciErr1;
    (*Flag_CPU_GPU) = CPU_GPU;

    /* Two-slot event ring: each copy after the first waits on the event
       recorded by the previous command (index alternates modulo 2). */
    if (*index != 0)
    {
        ciErr1 = clEnqueueCopyBuffer(CommandQueue[CPU_GPU], src, dest, srcOffset, destOffset, size,
                                     1, &eventList[((*index) - 1) % 2], &eventList[(*index) % 2]);
        deschedule(preFlag, preBurden);
    }
    else
    {
        ciErr1 = clEnqueueCopyBuffer(CommandQueue[CPU_GPU], src, dest, srcOffset, destOffset, size,
                                     0, NULL, &eventList[*index]);
    }
    (*index)++;
    //clEnqueueWriteBuffer(CommandQueue[CPU_GPU], to, CL_FALSE, 0, size, from, 0, NULL, NULL);

    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error %d in cl_copyBuffer, Line %u in file %s !!!\n\n", ciErr1, __LINE__, __FILE__);
        cl_clean(EXIT_FAILURE);
    }
    clFlush(CommandQueue[CPU_GPU]);
}
/** @brief Resize the matrix.
 *
 * @param new_size1    New number of rows
 * @param new_size2    New number of columns
 * @param preserve     If true, the old values are preserved. (The flag is
 *                     currently not evaluated; the row pointer array is
 *                     always carried over where possible.)
 */
void resize(unsigned int new_size1, unsigned int new_size2, bool preserve = true)
{
    if (new_size1 > 0 && new_size2 > 0)
    {
        if (new_size1 > _rows) //enlarge buffer
        {
            if (_rows == 0)
            {
                _row_buffer = viennacl::ocl::device().createMemory(CL_MEM_READ_WRITE, sizeof(unsigned int)*(new_size1 + 1));
                _col_buffer = viennacl::ocl::device().createMemory(CL_MEM_READ_WRITE, sizeof(unsigned int)*(new_size1 + 1));
                _elements   = viennacl::ocl::device().createMemory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*(new_size1 + 1));

                //set new memory to zero:
                std::vector<unsigned int> coord_temp(new_size1 + 1);
                std::vector<SCALARTYPE> temp(new_size1 + 1);
                cl_int err;
                err = clEnqueueWriteBuffer(viennacl::ocl::device().queue().get(), _row_buffer.get(), CL_TRUE, 0, sizeof(unsigned int)*coord_temp.size(), &(coord_temp[0]), 0, NULL, NULL);
                CL_ERR_CHECK(err);
                err = clEnqueueWriteBuffer(viennacl::ocl::device().queue().get(), _col_buffer.get(), CL_TRUE, 0, sizeof(unsigned int)*coord_temp.size(), &(coord_temp[0]), 0, NULL, NULL);
                CL_ERR_CHECK(err);
                err = clEnqueueWriteBuffer(viennacl::ocl::device().queue().get(), _elements.get(), CL_TRUE, 0, sizeof(SCALARTYPE)*temp.size(), &(temp[0]), 0, NULL, NULL);
                CL_ERR_CHECK(err);
            }
            else //enlarge only row array, because no entries are added to the matrix
            {
                viennacl::ocl::handle<cl_mem> _row_buffer_old = _row_buffer;
                _row_buffer = viennacl::ocl::device().createMemory(CL_MEM_READ_WRITE, sizeof(unsigned int)*(new_size1 + 1));
                cl_int err = clEnqueueCopyBuffer(viennacl::ocl::device().queue().get(), _row_buffer_old.get(), _row_buffer.get(), 0, 0, sizeof(unsigned int)*(_rows + 1), 0, NULL, NULL);
                CL_ERR_CHECK(err);

                //set new memory to zero:
                std::vector<SCALARTYPE> temp(new_size1 - _rows + 1);
                err = clEnqueueWriteBuffer(viennacl::ocl::device().queue().get(), _elements.get(), CL_TRUE, sizeof(SCALARTYPE)*(_rows + 1), sizeof(SCALARTYPE)*temp.size(), &(temp[0]), 0, NULL, NULL);
                CL_ERR_CHECK(err);
            }
        }
        else if (new_size1 < _rows) //reduce buffer
        {
            viennacl::ocl::handle<cl_mem> _row_buffer_old = _row_buffer;
            _row_buffer = viennacl::ocl::device().createMemory(CL_MEM_READ_WRITE, sizeof(unsigned int)*(new_size1 + 1));
            cl_int err = clEnqueueCopyBuffer(viennacl::ocl::device().queue().get(), _row_buffer_old.get(), _row_buffer.get(), 0, 0, sizeof(unsigned int)*(new_size1 + 1), 0, NULL, NULL);
            CL_ERR_CHECK(err);

            //TODO: discard entries in the matrix that are beyond the allowed sizes
        }

        _rows = new_size1;
        _cols = new_size2;
    }
}
/// Enqueues a command to copy data from \p src_buffer to
/// \p dst_buffer.
///
/// \see_opencl_ref{clEnqueueCopyBuffer}
///
/// \see copy()
event enqueue_copy_buffer(const buffer &src_buffer,
                          const buffer &dst_buffer,
                          size_t src_offset,
                          size_t dst_offset,
                          size_t size,
                          const wait_list &events = wait_list())
{
    BOOST_ASSERT(m_queue != 0);
    BOOST_ASSERT(src_offset + size <= src_buffer.size());
    BOOST_ASSERT(dst_offset + size <= dst_buffer.size());
    BOOST_ASSERT(src_buffer.get_context() == this->get_context());
    BOOST_ASSERT(dst_buffer.get_context() == this->get_context());

    event event_;

    cl_int ret = clEnqueueCopyBuffer(
        m_queue,
        src_buffer.get(),
        dst_buffer.get(),
        src_offset,
        dst_offset,
        size,
        events.size(),
        events.get_event_ptr(),
        &event_.get()
    );

    if(ret != CL_SUCCESS){
        BOOST_THROW_EXCEPTION(opencl_error(ret));
    }

    return event_;
}
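// Minimal usage sketch for the member function above. 'queue', 'src' and
// 'dst' are assumptions: a boost::compute::command_queue and two buffers of
// at least 64 bytes created in the same context; only the call pattern from
// the snippet is illustrated.
#include <boost/compute/core.hpp>

void copy_first_64_bytes(boost::compute::command_queue &queue,
                         const boost::compute::buffer &src,
                         const boost::compute::buffer &dst)
{
    boost::compute::event e = queue.enqueue_copy_buffer(src, dst,
                                                        0 /*src_offset*/,
                                                        0 /*dst_offset*/,
                                                        64 /*bytes*/);
    e.wait(); // block until the copy has completed
}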
void copy_int_buffer(sotl_device_t *dev, cl_mem *dst_buf, cl_mem *src_buf,
                     const unsigned nb_elems)
{
    int k = KERNEL_COPY_BUFFER;
    cl_int err = CL_SUCCESS;

    if (dev->type == CL_DEVICE_TYPE_GPU) {
        /* Using clEnqueueCopyBuffer() instead of our own copy buffer kernel
         * improves performance on GPU devices like NVIDIA. */
        err = clEnqueueCopyBuffer(dev->queue, *src_buf, *dst_buf, 0, 0,
                                  nb_elems * sizeof(int), 0, NULL,
                                  prof_event_ptr(dev, k));
        check(err, "Failed to copy buffer using clEnqueueCopyBuffer().\n");
    } else {
        /* Using our own copy buffer kernel is better on other devices like
         * Intel Xeon (Phi) which seems to have a bad implementation of
         * clEnqueueCopyBuffer(). */
        err |= clSetKernelArg(dev->kernel[k], 0, sizeof(cl_mem), dst_buf);
        err |= clSetKernelArg(dev->kernel[k], 1, sizeof(cl_mem), src_buf);
        err |= clSetKernelArg(dev->kernel[k], 2, sizeof(nb_elems), &nb_elems);
        check(err, "Failed to set kernel arguments: %s.\n", kernel_name(k));

        size_t local  = MIN(dev->tile_size, dev->max_workgroup_size);
        size_t global = ROUND(nb_elems);

        err = clEnqueueNDRangeKernel(dev->queue, dev->kernel[k], 1, NULL,
                                     &global, &local, 0, NULL,
                                     prof_event_ptr(dev, k));
        check(err, "Failed to exec kernel: %s.\n", kernel_name(k));
    }
}
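/* The source of the fallback copy kernel is not part of this excerpt, so the
 * following is only a sketch of what KERNEL_COPY_BUFFER could look like. The
 * bounds check matters because the global size is rounded up by
 * ROUND(nb_elems) above. */
__kernel void copy_buffer(__global int *dst,
                          __global const int *src,
                          const unsigned nb_elems)
{
    size_t i = get_global_id(0);
    if (i < nb_elems)   /* guard the rounded-up tail */
        dst[i] = src[i];
}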
void CLWrapper::copyTo( CLWrapper *target, int srcOffset, int dstOffset, int count )
{
    if( !onDevice ) {
        throw std::runtime_error("Must have called copyToDevice() or createOnDevice() before calling copyTo(CLWrapper*)");
    }
    if( !target->onDevice ) {
        throw std::runtime_error("Must have called copyToDevice() or createOnDevice() on target before calling copyTo(target)");
    }
    if( srcOffset + count > N ) {
        throw std::runtime_error("copyTo: not enough source elements, given offset " + easycl::toString(srcOffset) + " and count " + easycl::toString(count));
    }
    if( dstOffset + count > target->N ) {
        throw std::runtime_error("copyTo: not enough destination elements, given offset " + easycl::toString(dstOffset) + " and count " + easycl::toString(count));
    }
    if( getElementSize() != target->getElementSize() ) {
        throw std::runtime_error("copyTo: element size mismatch between source and target CLWrapper objects");
    }

    // can assume that we have our data on the device now, because of the checks
    // just made; we will also assume that the destination CLWrapper* is valid
    cl_event event = NULL;
    cl_int err = clEnqueueCopyBuffer( *(cl->queue), devicearray, target->devicearray,
                                      srcOffset * getElementSize(), dstOffset * getElementSize(),
                                      count * getElementSize(), 0, NULL, &event );
    if (err != CL_SUCCESS) {
        throw std::runtime_error("copyTo failed with " + easycl::toString( err ) );
    } else {
        /* Wait for calculations to be finished. */
        // err = clWaitForEvents(1, &event);
    }
    // releasing the event only drops our reference; the enqueued copy still runs
    clReleaseEvent(event);
    target->markDeviceDirty();
}
void test_copy_buf(size_t sz, size_t src_off, size_t dst_off, size_t cb)
{
    unsigned int i;

    OCL_MAP_BUFFER(0);
    for (i = 0; i < sz; i++) {
        ((char*)buf_data[0])[i] = (rand() & 63);
    }
    OCL_UNMAP_BUFFER(0);

    if (src_off + cb > sz || dst_off + cb > sz) {
        /* Expect Error. */
        OCL_ASSERT(clEnqueueCopyBuffer(queue, buf[0], buf[1],
                                       src_off, dst_off, cb*sizeof(char),
                                       0, NULL, NULL));
        return;
    }

    OCL_ASSERT(!clEnqueueCopyBuffer(queue, buf[0], buf[1],
                                    src_off, dst_off, cb*sizeof(char),
                                    0, NULL, NULL));

    OCL_MAP_BUFFER(0);
    OCL_MAP_BUFFER(1);

#if 0
    printf("\n########### Src buffer: \n");
    for (i = 0; i < cb; ++i)
        printf(" %2.2u", ((unsigned char*)buf_data[0])[i + src_off]);
    printf("\n########### dst buffer: \n");
    for (i = 0; i < cb; ++i)
        printf(" %2.2u", ((unsigned char*)buf_data[1])[i + dst_off]);
#endif

    // Check results
    for (i = 0; i < cb; ++i) {
        if (((char*)buf_data[0])[i + src_off] != ((char*)buf_data[1])[i + dst_off]) {
            printf("different index is %d\n", i);
            OCL_ASSERT(0);
        }
    }

    OCL_UNMAP_BUFFER(0);
    OCL_UNMAP_BUFFER(1);
}
// dst: Destination memory address
// dst_offset: The offset where to begin copying data to dst. If dst is a host
//             buffer, the offset is always 0
// src: Source memory address
// src_offset: The offset where to begin copying data from src. If src is a host
//             buffer, the offset is always 0
// count: Size in bytes to copy
// kind: Type of transfer
int gpuMemcpy(void* dst, size_t dst_offset, void* src, size_t src_offset,
              size_t count, gpuMemcpyKind kind)
{
    switch (kind) {
    case gpuMemcpyHostToHost:
        memcpy(dst, src, count);
        break;
    case gpuMemcpyDeviceToHost: {
        cl_int err = clEnqueueReadBuffer(Concurrency::getAllocator().getQueue(),
                                         static_cast<cl_mem>(src), CL_TRUE,
                                         src_offset, count, dst, 0, NULL, NULL);
        if (err != CL_SUCCESS) {
            printf("Read error = %d\n", err);
            exit(1);
        }
        break;
    }
    case gpuMemcpyHostToDevice: {
        cl_int err = clEnqueueWriteBuffer(Concurrency::getAllocator().getQueue(),
                                          static_cast<cl_mem>(dst), CL_TRUE,
                                          dst_offset, count, src, 0, NULL, NULL);
        if (err != CL_SUCCESS) {
            printf("Write error = %d\n", err);
            exit(1);
        }
        break;
    }
    case gpuMemcpyDeviceToDevice: {
        cl_event event;
        cl_int err = clEnqueueCopyBuffer(Concurrency::getAllocator().getQueue(),
                                         static_cast<cl_mem>(src), static_cast<cl_mem>(dst),
                                         src_offset, dst_offset, count, 0, NULL, &event);
        if (err != CL_SUCCESS) {
            printf("Copy error = %d\n", err);
            exit(1);
        }
        clWaitForEvents(1, &event);
        break;
    }
    case gpuMemcpyDefault:
        break;
    }
    return 0;
}
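// Usage sketch for gpuMemcpy() above. 'ctx' is an assumption: the cl_context
// backing Concurrency::getAllocator(); the buffer names are illustrative only.
void duplicate_device_buffer(cl_context ctx)
{
    cl_int err;
    cl_mem src = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 1024, NULL, &err);
    cl_mem dst = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 1024, NULL, &err);

    // device-to-device path: enqueues clEnqueueCopyBuffer and waits on its event
    gpuMemcpy(dst, 0, src, 0, 1024, gpuMemcpyDeviceToDevice);

    clReleaseMemObject(src);
    clReleaseMemObject(dst);
}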
cl_int WINAPI wine_clEnqueueCopyBuffer(cl_command_queue command_queue,
                                       cl_mem src_buffer, cl_mem dst_buffer,
                                       size_t src_offset, size_t dst_offset, size_t cb,
                                       cl_uint num_events_in_wait_list,
                                       const cl_event * event_wait_list, cl_event * event)
{
    cl_int ret;
    TRACE("\n");
    ret = clEnqueueCopyBuffer(command_queue, src_buffer, dst_buffer,
                              src_offset, dst_offset, cb,
                              num_events_in_wait_list, event_wait_list, event);
    return ret;
}
/** @brief Allocate memory for the supplied number of nonzeros in the matrix. Old values are preserved. */
void reserve(unsigned int new_nonzeros)
{
    if (new_nonzeros > _nonzeros)
    {
        viennacl::ocl::handle<cl_mem> _col_buffer_old = _col_buffer;
        viennacl::ocl::handle<cl_mem> _elements_old = _elements;
        _col_buffer = viennacl::ocl::device().createMemory(CL_MEM_READ_WRITE, sizeof(unsigned int) * new_nonzeros);
        _elements   = viennacl::ocl::device().createMemory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE) * new_nonzeros);

        cl_int err;
        err = clEnqueueCopyBuffer(viennacl::ocl::device().queue().get(), _col_buffer_old.get(), _col_buffer.get(), 0, 0, sizeof(unsigned int)*_nonzeros, 0, NULL, NULL);
        CL_ERR_CHECK(err);
        err = clEnqueueCopyBuffer(viennacl::ocl::device().queue().get(), _elements_old.get(), _elements.get(), 0, 0, sizeof(SCALARTYPE)*_nonzeros, 0, NULL, NULL);
        CL_ERR_CHECK(err);

        _nonzeros = new_nonzeros;
    }
}
ClEvent ClCopyBuffer::enqueue(Error& error, const ClCommandQueue& queue,
                              cl_uint nEvents, const cl_event * events)
{
    cl_event e;
    cl_int status = clEnqueueCopyBuffer(queue.get(), s.get(), d.get(),
                                        srcOffsetBytes, dstOffsetBytes, sizeBytes,
                                        nEvents, events, &e);
    clCheckStatus(error, status, "clEnqueueCopyBuffer");
    return ClEvent(e);
}
cl_mem mwDuplicateBuffer(CLInfo* ci, cl_mem buf)
{
    cl_mem bufCopy;
    size_t size;
    cl_mem_flags flags;
    cl_int err;
    cl_event ev;

    if (!buf)
    {
        return NULL;
    }

    err = clGetMemObjectInfo(buf, CL_MEM_FLAGS, sizeof(flags), &flags, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to get memory flags for buffer duplication");
        return NULL;
    }

    err = clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(size), &size, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to get memory size for buffer duplication");
        return NULL;
    }

    /* We may have initialized that one from a host pointer, but not this one,
       so clear the flag (clearing, not XOR-toggling, avoids setting
       CL_MEM_COPY_HOST_PTR with a NULL host pointer) */
    flags &= ~CL_MEM_COPY_HOST_PTR;

    /* Create a copy of the same size */
    bufCopy = clCreateBuffer(ci->clctx, flags, size, NULL, &err);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to create copy buffer of size "ZU, size);
        return NULL;
    }

    err = clEnqueueCopyBuffer(ci->queue, buf, bufCopy, 0, 0, size, 0, NULL, &ev);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to enqueue buffer copy of size "ZU, size);
        clReleaseMemObject(bufCopy);
        return NULL;
    }

    err = mwWaitReleaseEvent(&ev);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Failed to wait for buffer copy");
        clReleaseMemObject(bufCopy);
        return NULL;
    }

    return bufCopy;
}
void f0(int arg1, int* arg2, int arg2c, int** out4)
{
    int mem3c;
    mem3c = arg2c;
    cl_mem mem3 = clCreateBuffer(context, CL_MEM_READ_WRITE, (mem3c * sizeof(int)), NULL, NULL);
    clEnqueueWriteBuffer(command_queue, mem3, CL_TRUE, 0, (mem3c * sizeof(int)), arg2, 0, NULL, NULL);

    int mem5c;
    mem5c = mem3c;
    cl_mem mem5 = clCreateBuffer(context, CL_MEM_READ_WRITE, (mem5c * sizeof(int)), NULL, NULL);
    clEnqueueCopyBuffer(command_queue, mem3, mem5, 0, 0, (mem5c * sizeof(int)), 0, NULL, NULL);

    for (int o = 0; o < arg1; o++) {
        int mem7;
        mem7 = (~(4294967295 << (o + 1)));
        int mem8;
        mem8 = (o + 1);
        int mem9c;
        mem9c = mem5c;
        cl_mem mem9 = clCreateBuffer(context, CL_MEM_READ_WRITE, (mem9c * sizeof(int)), NULL, NULL);

        clSetKernelArg(k10, 0, sizeof(cl_mem), &mem5);
        clSetKernelArg(k10, 1, sizeof(int), &mem7);
        clSetKernelArg(k10, 2, sizeof(cl_mem), &mem9);
        clSetKernelArg(k10, 3, sizeof(int), &o);
        /* note: this launch assumes mem5c is a multiple of the
           work-group size (1024) */
        size_t global_item_size = mem5c;
        size_t local_item_size = 1024;
        clEnqueueNDRangeKernel(command_queue, k10, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
        clEnqueueCopyBuffer(command_queue, mem9, mem5, 0, 0, (mem5c * sizeof(int)), 0, NULL, NULL);

        for (int v = 0; v < o; v++) {
            int mem14;
            mem14 = (mem8 - (v + 2));
            int mem15;
            mem15 = (1 << mem14);
            clSetKernelArg(k16, 0, sizeof(cl_mem), &mem5);
            clSetKernelArg(k16, 1, sizeof(int), &mem15);
            clSetKernelArg(k16, 2, sizeof(int), &mem14);
            global_item_size = mem5c;
            local_item_size = 1024;
            clEnqueueNDRangeKernel(command_queue, k16, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
        }
        clReleaseMemObject(mem9);
    }

    clEnqueueReadBuffer(command_queue, mem5, CL_TRUE, 0, (mem5c * sizeof(int)), (*out4), 0, NULL, NULL);
    clReleaseMemObject(mem5);
    clReleaseMemObject(mem3);
}
void cl_copyBuffer(cl_mem dest, cl_mem src, size_t size, int CPU_GPU)
{
    cl_int ciErr1;
    ciErr1 = clEnqueueCopyBuffer(CommandQueue[CPU_GPU], src, dest, 0, 0, size, 0, NULL, NULL);
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error %d in cl_copyBuffer, Line %u in file %s !!!\n\n", ciErr1, __LINE__, __FILE__);
        cl_clean(EXIT_FAILURE);
    }
    clFinish(CommandQueue[CPU_GPU]);
}
///////////////////////////////////////////////////////////////////////////////
// test the bandwidth of a device to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
double testDeviceToDeviceTransfer(unsigned int memSize)
{
    double elapsedTimeInSec = 0.0;
    double bandwidthInMBs = 0.0;
    unsigned char* h_idata = NULL;
    cl_int ciErrNum = CL_SUCCESS;

    //allocate host memory
    h_idata = (unsigned char *)malloc(memSize);

    //initialize the memory
    for (unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
    {
        h_idata[i] = (unsigned char)(i & 0xff);
    }

    // allocate device input and output memory and initialize the device input memory
    cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, memSize, NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, memSize, NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, d_idata, CL_TRUE, 0, memSize, h_idata, 0, NULL, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Sync queue to host, start timer 0, and copy data from one GPU buffer to another GPU buffer
    clFinish(cqCommandQueue);
    shrDeltaT(0);
    for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
    {
        ciErrNum = clEnqueueCopyBuffer(cqCommandQueue, d_idata, d_odata, 0, 0, memSize, 0, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);
    }

    // Sync with GPU
    clFinish(cqCommandQueue);

    //get the elapsed time in seconds
    elapsedTimeInSec = shrDeltaT(0);

    // Calculate bandwidth in MB/s
    // This is for kernels that read and write GMEM simultaneously
    // Obtained Throughput for unidirectional block copies will be 1/2 of this #
    bandwidthInMBs = 2.0 * ((double)memSize * (double)MEMCOPY_ITERATIONS)/(elapsedTimeInSec * (double)(1 << 20));

    //clean up memory on host and device
    free(h_idata);
    clReleaseMemObject(d_idata);
    clReleaseMemObject(d_odata);

    return bandwidthInMBs;
}
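/* Worked example of the bandwidth formula above: a 32 MiB buffer copied
 * MEMCOPY_ITERATIONS = 100 times in 0.5 s gives
 *     2.0 * (32 * 2^20 bytes * 100) / (0.5 s * 2^20) = 12800 MB/s,
 * i.e. 6400 MB/s of unidirectional copy throughput, since each copy both
 * reads and writes global memory. */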
bool swap_kernel_buffers(cl_command_queue *cmd_queue, unsigned int cqc,
                         size_t *offy, size_t width, size_t height,
                         cl_mem src, cl_mem dst)
{
    cl_int err;

    printf("Swapping kernel buffers on device\n");
    err = clEnqueueCopyBuffer(cmd_queue[0], src, dst, 0, 0,
                              (width / 8) * height, 0, NULL, NULL);
    if (err) {
        printf("clEnqueueCopyBuffer failed! (Error: %d)\n", err);
        return false;
    }

    return true;
}
void vector_copy(Vector * src, Vector * dest)
{
    /* cl_int clEnqueueCopyBuffer(cl_command_queue command_queue,
                                  cl_mem src_buffer,
                                  cl_mem dst_buffer,
                                  size_t src_offset,
                                  size_t dst_offset,
                                  size_t cb,
                                  cl_uint num_events_in_wait_list,
                                  const cl_event *event_wait_list,
                                  cl_event *event) */
    clEnqueueCopyBuffer(queue, src->gpu_vals, dest->gpu_vals, 0, 0,
                        sizeof(TYPE)*src->length, 0, NULL, NULL);
    src->locality = 1;
}
/*!
    Requests that the \a size bytes at \a offset in this buffer
    be copied to \a destOffset in the buffer \a dest.  Returns an
    event object that can be used to wait for the request to finish.

    The request will not start until all of the events in \a after
    have been signaled as finished.  The request is executed on
    the active command queue for context().

    \sa copyTo()
*/
QCLEvent QCLBuffer::copyToAsync
    (size_t offset, size_t size, const QCLBuffer &dest, size_t destOffset,
     const QCLEventList &after)
{
    cl_event event;
    cl_int error = clEnqueueCopyBuffer
        (context()->activeQueue(), memoryId(), dest.memoryId(),
         offset, destOffset, size, after.size(), after.eventData(), &event);
    context()->reportError("QCLBuffer::copyToAsync:", error);
    if (error != CL_SUCCESS)
        return QCLEvent();
    else
        return QCLEvent(event);
}
/*!
    Copies the \a size bytes at \a offset in this buffer
    to \a destOffset in the buffer \a dest.  Returns true if
    the copy was successful; false otherwise.  This function
    will block until the request finishes.

    The request is executed on the active command queue for context().

    \sa copyToAsync()
*/
bool QCLBuffer::copyTo
    (size_t offset, size_t size, const QCLBuffer &dest, size_t destOffset)
{
    cl_event event;
    cl_int error = clEnqueueCopyBuffer
        (context()->activeQueue(), memoryId(), dest.memoryId(),
         offset, destOffset, size, 0, 0, &event);
    context()->reportError("QCLBuffer::copyTo(QCLBuffer):", error);
    if (error == CL_SUCCESS) {
        clWaitForEvents(1, &event);
        clReleaseEvent(event);
        return true;
    } else {
        return false;
    }
}
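// Usage sketch for the two QCLBuffer copy paths above. Assumptions: 'src' and
// 'dst' are QCLBuffer objects of at least 256 bytes from the same QCLContext,
// and QCLEvent::waitForFinished() is available (as in the QtOpenCL module).
void copy_both_ways(QCLBuffer &src, QCLBuffer &dst)
{
    // Blocking path: returns once the bytes have landed in 'dst'.
    bool ok = src.copyTo(0, 256, dst, 0);

    // Non-blocking path: enqueue, keep working, then wait on the event.
    QCLEvent e = src.copyToAsync(0, 256, dst, 0, QCLEventList());
    e.waitForFinished();
    (void)ok;
}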
/** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' in the OpenCL context to memory starting at address 'dst_buffer + dst_offset' in the same OpenCL context.
 *
 *  @param src_buffer     A smart pointer to the begin of an allocated OpenCL buffer
 *  @param dst_buffer     A smart pointer to the begin of an allocated OpenCL buffer
 *  @param src_offset     Offset of the first byte to be read from the address given by 'src_buffer' (in bytes)
 *  @param dst_offset     Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
 *  @param bytes_to_copy  Number of bytes to be copied
 */
inline void memory_copy(viennacl::ocl::handle<cl_mem> const & src_buffer,
                        viennacl::ocl::handle<cl_mem> & dst_buffer,
                        vcl_size_t src_offset,
                        vcl_size_t dst_offset,
                        vcl_size_t bytes_to_copy)
{
    assert( &src_buffer.context() == &dst_buffer.context()
            && bool("Transfer between memory buffers in different contexts not supported yet!"));

    viennacl::ocl::context & memory_context = const_cast<viennacl::ocl::context &>(src_buffer.context());
    cl_int err = clEnqueueCopyBuffer(memory_context.get_queue().handle().get(),
                                     src_buffer.get(),
                                     dst_buffer.get(),
                                     src_offset,
                                     dst_offset,
                                     bytes_to_copy,
                                     0, NULL, NULL);  //events
    VIENNACL_ERR_CHECK(err);
}
void NuiKinfuOpenCLFeedbackFrame::CopyColors(cl_mem colorsCL)
{
    cl_int err = CL_SUCCESS;
    cl_command_queue queue = NuiOpenCLGlobal::instance().clQueue();
    err = clEnqueueCopyBuffer(
        queue,
        colorsCL,
        m_colorsCL,
        0,
        0,
        m_nWidth * m_nHeight * sizeof(BGRQUAD),
        0,
        NULL,
        NULL
    );
    NUI_CHECK_CL_ERR(err);
}
void fft_1D(cl_mem a, cl_mem b, cl_mem c, int N, cl_kernel init, cl_kernel knl,
            cl_command_queue queue, int direction, int offset_line)
{
    //handle complex-to-complex fft, actual size = 2 * N
    //size_t ldim[] = { 128 };
    //size_t gdim[] = { (N /ldim[0])/2};
    int Ns = 1;
    int y = 0;
    SET_7_KERNEL_ARGS(init, a, b, N, Ns, direction, offset_line, y);
    size_t ldim[] = { 1 };
    size_t gdim[] = { N/4 };
    CALL_CL_GUARDED(clEnqueueNDRangeKernel,
                    (queue, init, 1, NULL, gdim, ldim, 0, NULL, NULL));

    for (Ns = 4; Ns < N; Ns <<= 2)
    {
        SET_6_KERNEL_ARGS(knl, b, c, N, Ns, direction, offset_line);
        size_t ldim[] = { 1 };
        size_t gdim[] = { N/4 };
        CALL_CL_GUARDED(clEnqueueNDRangeKernel,
                        (queue, knl, 1, NULL, gdim, ldim, 0, NULL, NULL));
        /* ping-pong: copy this stage's output line from c back into b
           for the next radix-4 pass */
        clEnqueueCopyBuffer(queue, c, b,
                            offset_line*N*2*sizeof(float),
                            offset_line*N*2*sizeof(float),
                            sizeof(float)*N*2, 0, NULL, NULL);
        //VecCopy(c,b,N,offset_line,vec_copy,queue);
    }
}
// Copy a buffer
void cl_copyBufferToBuffer(cl_mem dst, cl_mem src, size_t size)
{
    static int eventCnt = 0;

    cl_event *eventPtr = NULL;
    cl_event event;
    if (eventsEnabled) {
        eventPtr = &event;
    }

    cl_int status;
    status = clEnqueueCopyBuffer(commandQueue, src, dst, 0, 0, size,
                                 0, NULL, eventPtr);
    cl_errChk(status, "Copying buffer", true);

    if (eventsEnabled) {
        char* eventStr = catStringWithInt("copyBuffer", eventCnt++);
        events->newIOEvent(*eventPtr, eventStr);
    }
}
const bool BufferObject::copyToBuffer(
    const kvs::cl::CommandQueue& queue,
    const kvs::cl::BufferObject& dst,
    const size_t size,
    const size_t dst_offset,
    const size_t self_offset )
{
    if ( !m_is_created )
    {
        kvsMessageError( "Buffer is not created yet." );
        return( false );
    }

    const cl_int result = clEnqueueCopyBuffer(
        queue.queue(), m_memory, dst.memory(),
        self_offset, dst_offset, size, 0, NULL, NULL );
    if ( result != CL_SUCCESS )
    {
        kvsMessageError( "OpenCL; %s.", kvs::cl::ErrorString( result ) );
        return( false );
    }

    return( true );
}
void Network::step()
{
    if (!built)
        build();
    if (built) {
        LayerMap::iterator it;
        Layer* l;

        //update the external layers
        for (it = layers.begin(); it != layers.end(); it++) {
            l = it->second;
            if (l->isExternal()) {
                //update layer and add outputs to native outputs array
                ExternalLayer* el = (ExternalLayer*) l;
                el->update();
                UnitMap::iterator uit;
                UnitId unitId;
                for (uit = el->units.begin(); uit != el->units.end(); uit++) {
                    unitId = uit->first;  // take the id from the map entry
                    output[unitId] = el->getOutput(unitId);
                }
            }
        }

        //run the kernel
        cl_int err;
        size_t workSize = (size_t) numNonExUnits;
        prepareKernel();
        err = clEnqueueNDRangeKernel(clDevice.commandQueue, *clKernel, 1, NULL,
                                     &workSize, NULL, 0, NULL, NULL);
        err = clFinish(clDevice.commandQueue);
        if (err != CL_SUCCESS) {
            printf("Failed to execute kernel!\n");
        }

        //copy the new output to the previous output
        clEnqueueCopyBuffer(clDevice.commandQueue, clNewOutput, clPrevOutput,
                            0, 0, sizeof(float)*units.size(), 0, NULL, NULL);
        err = clFinish(clDevice.commandQueue);
    }
}
//--------------------------------------------------------------------------------------
// Name: SetUseOpenCL()
// Desc: Set whether to use OpenCL or CPU simulation
//--------------------------------------------------------------------------------------
VOID CClothSimCL::SetUseOpenCL( bool bUseOpenCL )
{
    // If we are switching from CPU to GPU, copy the
    // current position to the previous position so
    // the simulation does not jump around
    if( bUseOpenCL == true && m_bUseOpenCL == false )
    {
        AcquireReleaseVBOs( true );

        // Copy the current position to previous position to initialize the simulation
        cl_int errNum = 0;
        errNum = clEnqueueCopyBuffer( m_commandQueue, m_vboMem[CUR_POSITION], m_vboMem[PREV_POSITION],
                                      0, 0, sizeof(cl_float4) * m_uiNumVerts, 0, NULL, NULL );
        if( errNum != CL_SUCCESS )
        {
            FrmLogMessage( "Error copying positions VBO.\n" );
            return;
        }

        AcquireReleaseVBOs( false );
    }

    m_bUseOpenCL = bUseOpenCL;
}
void matrixMulGPU(cl_uint ciDeviceCount, cl_mem h_A, float* h_B_data,
                  unsigned int mem_size_B, float* h_C)
{
    cl_mem d_A[MAX_GPU_COUNT];
    cl_mem d_C[MAX_GPU_COUNT];
    cl_mem d_B[MAX_GPU_COUNT];

    cl_event GPUDone[MAX_GPU_COUNT];
    cl_event GPUExecution[MAX_GPU_COUNT];

    // Start the computation on each available GPU
    // Create buffers for each GPU
    // Each GPU will compute sizePerGPU rows of the result
    int sizePerGPU = uiHA / ciDeviceCount;

    int workOffset[MAX_GPU_COUNT];
    int workSize[MAX_GPU_COUNT];

    workOffset[0] = 0;
    for(unsigned int i = 0; i < ciDeviceCount; ++i)
    {
        // Input buffer
        workSize[i] = (i != (ciDeviceCount - 1)) ? sizePerGPU : (uiHA - workOffset[i]);

        d_A[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float) * uiWA, NULL, NULL);

        // Copy only assigned rows from host to device
        clEnqueueCopyBuffer(commandQueue[i], h_A, d_A[i], workOffset[i] * sizeof(float) * uiWA,
                            0, workSize[i] * sizeof(float) * uiWA, 0, NULL, NULL);

        // create OpenCL buffer on device that will be initialized from the host memory on first use
        // on device
        d_B[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                mem_size_B, h_B_data, NULL);

        // Output buffer
        d_C[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, workSize[i] * uiWC * sizeof(float), NULL, NULL);

        // set the args values
        clSetKernelArg(multiplicationKernel[i], 0, sizeof(cl_mem), (void *) &d_C[i]);
        clSetKernelArg(multiplicationKernel[i], 1, sizeof(cl_mem), (void *) &d_A[i]);
        clSetKernelArg(multiplicationKernel[i], 2, sizeof(cl_mem), (void *) &d_B[i]);
        clSetKernelArg(multiplicationKernel[i], 3, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, 0);
        clSetKernelArg(multiplicationKernel[i], 4, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, 0);
        clSetKernelArg(multiplicationKernel[i], 5, sizeof(cl_int), (void *) &uiWA);
        clSetKernelArg(multiplicationKernel[i], 6, sizeof(cl_int), (void *) &uiWB);

        if(i+1 < ciDeviceCount)
            workOffset[i + 1] = workOffset[i] + workSize[i];
    }

    // Execute Multiplication on all GPUs in parallel
    size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
    size_t globalWorkSize[] = {shrRoundUp(BLOCK_SIZE, uiWC), shrRoundUp(BLOCK_SIZE, workSize[0])};

    // Launch kernels on devices
#ifdef GPU_PROFILING
    int nIter = 30;
    for (int j = -1; j < nIter; j++)
    {
        // Sync all queues to host and start timer first time through loop
        if(j == 0){
            for(unsigned int i = 0; i < ciDeviceCount; i++)
            {
                clFinish(commandQueue[i]);
            }
            shrDeltaT(0);
        }
#endif
        for(unsigned int i = 0; i < ciDeviceCount; i++)
        {
            // Multiplication - non-blocking execution: launch and push to device(s)
            globalWorkSize[1] = shrRoundUp(BLOCK_SIZE, workSize[i]);
            clEnqueueNDRangeKernel(commandQueue[i], multiplicationKernel[i], 2, 0,
                                   globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]);
            clFlush(commandQueue[i]);
        }
#ifdef GPU_PROFILING
    }
#endif

    // sync all queues to host
    for(unsigned int i = 0; i < ciDeviceCount; i++)
    {
        clFinish(commandQueue[i]);
    }

#ifdef GPU_PROFILING
    // stop and log timer
    double dSeconds = shrDeltaT(0)/(double)nIter;
    double dNumOps = 2.0 * (double)uiWA * (double)uiHA * (double)uiWB;
    double gflops = 1.0e-9 * dNumOps/dSeconds;
    shrLogEx(LOGBOTH | MASTER, 0, "oclMatrixMul, Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Workgroup = %u\n",
             gflops, dSeconds, dNumOps, ciDeviceCount, localWorkSize[0] * localWorkSize[1]);

    // Print kernel timing per GPU
    shrLog("\n");
    for(unsigned int i = 0; i < ciDeviceCount; i++)
    {
        shrLog("  Kernel execution time on GPU %d \t: %.5f s\n", i, executionTime(GPUExecution[i]));
    }
    shrLog("\n");
#endif

    for(unsigned int i = 0; i < ciDeviceCount; i++)
    {
        // Non-blocking copy of result from device to host
        clEnqueueReadBuffer(commandQueue[i], d_C[i], CL_FALSE, 0, uiWC * sizeof(float) * workSize[i],
                            h_C + workOffset[i] * uiWC, 0, NULL, &GPUDone[i]);
    }

    // CPU sync with GPU
    clWaitForEvents(ciDeviceCount, GPUDone);

    // Release mem and event objects
    for(unsigned int i = 0; i < ciDeviceCount; i++)
    {
        clReleaseMemObject(d_A[i]);
        clReleaseMemObject(d_C[i]);
        clReleaseMemObject(d_B[i]);
        clReleaseEvent(GPUExecution[i]);
        clReleaseEvent(GPUDone[i]);
    }
}
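/* shrRoundUp() comes from the NVIDIA OpenCL SDK's shared utilities and is not
 * reproduced in this excerpt; a function with the behavior relied on above
 * (round a work size up to the next multiple of the work-group size) would
 * look like this sketch: */
static size_t roundUpToMultiple(size_t group_size, size_t global_size)
{
    size_t remainder = global_size % group_size;
    return (remainder == 0) ? global_size
                            : global_size + group_size - remainder;
}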
struct tableNode * groupBy(struct groupByNode * gb, struct clContext * context, struct statistic * pp){

    struct timespec start, end;
    clock_gettime(CLOCK_REALTIME, &start);

    cl_event ndrEvt;
    cl_ulong startTime, endTime;

    struct tableNode * res = NULL;
    long gpuTupleNum;
    int gpuGbColNum;
    cl_mem gpuGbIndex;
    cl_mem gpuGbType, gpuGbSize;

    cl_mem gpuGbKey;
    cl_mem gpuContent;

    int gbCount;            // the number of groups
    int gbConstant = 0;     // whether group by constant

    cl_int error = 0;

    res = (struct tableNode *) malloc(sizeof(struct tableNode));
    CHECK_POINTER(res);
    res->tupleSize = gb->tupleSize;
    res->totalAttr = gb->outputAttrNum;
    res->attrType = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->attrType);
    res->attrSize = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->attrSize);
    res->attrTotalSize = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->attrTotalSize);
    res->dataPos = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->dataPos);
    res->dataFormat = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->dataFormat);
    res->content = (char **) malloc(sizeof(char **) * res->totalAttr);
    CHECK_POINTER(res->content);

    for(int i=0;i<res->totalAttr;i++){
        res->attrType[i] = gb->attrType[i];
        res->attrSize[i] = gb->attrSize[i];
        res->dataFormat[i] = UNCOMPRESSED;
    }

    gpuTupleNum = gb->table->tupleNum;
    gpuGbColNum = gb->groupByColNum;

    if(gpuGbColNum == 1 && gb->groupByIndex[0] == -1){
        gbConstant = 1;
    }

    size_t localSize = 128;
    size_t globalSize = 1024*128;

    int blockNum = gb->table->tupleNum / localSize + 1;

    if(blockNum < 1024)
        globalSize = blockNum * 128;

    cl_mem gpu_hashNum;
    cl_mem gpu_psum;
    cl_mem gpuGbCount;

    long * cpuOffset = (long *)malloc(sizeof(long) * gb->table->totalAttr);
    CHECK_POINTER(cpuOffset);
    long offset = 0;
    long totalSize = 0;

    for(int i=0;i<gb->table->totalAttr;i++){
        int attrSize = gb->table->attrSize[i];
        int size = attrSize * gb->table->tupleNum;
        cpuOffset[i] = offset;

        /*align each column*/
        if(size % 4 != 0){
            size += 4 - (size%4);
        }
        offset += size;
        totalSize += size;
    }

    gpuContent = clCreateBuffer(context->context, CL_MEM_READ_ONLY, totalSize, NULL, &error);

    for(int i=0;i<gb->table->totalAttr;i++){
        int attrSize = gb->table->attrSize[i];
        int size = attrSize * gb->table->tupleNum;
        if(gb->table->dataPos[i] == MEM){
            error = clEnqueueWriteBuffer(context->queue, gpuContent, CL_TRUE, cpuOffset[i], size, gb->table->content[i], 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
            clWaitForEvents(1, &ndrEvt);
            clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
            clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
            pp->pcie += 1e-6 * (endTime - startTime);
#endif
        }else
            error = clEnqueueCopyBuffer(context->queue, (cl_mem)gb->table->content[i], gpuContent, 0, cpuOffset[i], size, 0, 0, 0);
    }

    cl_mem gpuOffset = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(long)*gb->table->totalAttr, NULL, &error);
    clEnqueueWriteBuffer(context->queue, gpuOffset, CL_TRUE, 0, sizeof(long)*gb->table->totalAttr, cpuOffset, 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
    clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    if(gbConstant != 1){

        gpuGbType = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*gb->groupByColNum, NULL, &error);
        clEnqueueWriteBuffer(context->queue, gpuGbType, CL_TRUE, 0, sizeof(int)*gb->groupByColNum, gb->groupByType, 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif
        gpuGbSize = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*gb->groupByColNum, NULL, &error);
        clEnqueueWriteBuffer(context->queue, gpuGbSize, CL_TRUE, 0, sizeof(int)*gb->groupByColNum, gb->groupBySize, 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        gpuGbKey = clCreateBuffer(context->context, CL_MEM_READ_WRITE, sizeof(int)*gb->table->tupleNum, NULL, &error);

        gpuGbIndex = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*gb->groupByColNum, NULL, &error);
        clEnqueueWriteBuffer(context->queue, gpuGbIndex, CL_TRUE, 0, sizeof(int)*gb->groupByColNum, gb->groupByIndex, 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        gpu_hashNum = clCreateBuffer(context->context, CL_MEM_READ_WRITE, sizeof(int)*HSIZE, NULL, &error);

        context->kernel = clCreateKernel(context->program, "cl_memset_int", 0);

        int tmp = HSIZE;
        clSetKernelArg(context->kernel, 0, sizeof(cl_mem), (void*)&gpu_hashNum);
        clSetKernelArg(context->kernel, 1, sizeof(int), (void*)&tmp);

        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize, &localSize, 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif

        context->kernel = clCreateKernel(context->program, "build_groupby_key", 0);
        clSetKernelArg(context->kernel, 0, sizeof(cl_mem), (void *)&gpuContent);
        clSetKernelArg(context->kernel, 1, sizeof(cl_mem), (void *)&gpuOffset);
        clSetKernelArg(context->kernel, 2, sizeof(int), (void *)&gpuGbColNum);
        clSetKernelArg(context->kernel, 3, sizeof(cl_mem), (void *)&gpuGbIndex);
        clSetKernelArg(context->kernel, 4, sizeof(cl_mem), (void *)&gpuGbType);
        clSetKernelArg(context->kernel, 5, sizeof(cl_mem), (void *)&gpuGbSize);
        clSetKernelArg(context->kernel, 6, sizeof(long), (void *)&gpuTupleNum);
        clSetKernelArg(context->kernel, 7, sizeof(cl_mem), (void *)&gpuGbKey);
        clSetKernelArg(context->kernel, 8, sizeof(cl_mem), (void *)&gpu_hashNum);

        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize, &localSize, 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif

        clReleaseMemObject(gpuGbType);
        clReleaseMemObject(gpuGbSize);
        clReleaseMemObject(gpuGbIndex);

        gbCount = 1;
        tmp = 0;

        gpuGbCount = clCreateBuffer(context->context, CL_MEM_READ_WRITE, sizeof(int), NULL, &error);
        clEnqueueWriteBuffer(context->queue, gpuGbCount, CL_TRUE, 0, sizeof(int), &tmp, 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        int hsize = HSIZE;
        context->kernel = clCreateKernel(context->program, "count_group_num", 0);
        clSetKernelArg(context->kernel, 0, sizeof(cl_mem), (void *)&gpu_hashNum);
        clSetKernelArg(context->kernel, 1, sizeof(int), (void *)&hsize);
        clSetKernelArg(context->kernel, 2, sizeof(cl_mem), (void *)&gpuGbCount);
        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize, &localSize, 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif

        clEnqueueReadBuffer(context->queue, gpuGbCount, CL_TRUE, 0, sizeof(int), &gbCount, 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        gpu_psum = clCreateBuffer(context->context, CL_MEM_READ_WRITE, sizeof(int)*HSIZE, NULL, &error);

        scanImpl(gpu_hashNum, HSIZE, gpu_psum, context, pp);

        clReleaseMemObject(gpuGbCount);
        clReleaseMemObject(gpu_hashNum);
    }

    if(gbConstant == 1)
        res->tupleNum = 1;
    else
        res->tupleNum = gbCount;

    printf("groupBy num %ld\n", res->tupleNum);

    gpuGbType = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*res->totalAttr, NULL, &error);
    clEnqueueWriteBuffer(context->queue, gpuGbType, CL_TRUE, 0, sizeof(int)*res->totalAttr, res->attrType, 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
    clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif
    gpuGbSize = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*res->totalAttr, NULL, &error);
    clEnqueueWriteBuffer(context->queue, gpuGbSize, CL_TRUE, 0, sizeof(int)*res->totalAttr, res->attrSize, 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
    clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    /*
     * @gpuGbExp is the mathExp in each groupBy expression
     * @mathexp stores the math exp for the group expression that has two operands
     * The reason that we need two variables instead of one is that OpenCL doesn't support pointer to pointer
     */

    cl_mem gpuGbExp = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(struct mathExp)*res->totalAttr, NULL, &error);
    cl_mem mathexp = clCreateBuffer(context->context, CL_MEM_READ_ONLY, 2*sizeof(struct mathExp)*res->totalAttr, NULL, &error);

    struct mathExp tmpExp[2];
    int * cpuFunc = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(cpuFunc);

    offset = 0;
    for(int i=0;i<res->totalAttr;i++){

        error = clEnqueueWriteBuffer(context->queue, gpuGbExp, CL_TRUE, offset, sizeof(struct mathExp), &(gb->gbExp[i].exp), 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif
        offset += sizeof(struct mathExp);

        cpuFunc[i] = gb->gbExp[i].func;

        if(gb->gbExp[i].exp.opNum == 2){
            struct mathExp * tmpMath = (struct mathExp *) (gb->gbExp[i].exp.exp);
            tmpExp[0].op = tmpMath[0].op;
            tmpExp[0].opNum = tmpMath[0].opNum;
            tmpExp[0].opType = tmpMath[0].opType;
            tmpExp[0].opValue = tmpMath[0].opValue;

            tmpExp[1].op = tmpMath[1].op;
            tmpExp[1].opNum = tmpMath[1].opNum;
            tmpExp[1].opType = tmpMath[1].opType;
            tmpExp[1].opValue = tmpMath[1].opValue;
            clEnqueueWriteBuffer(context->queue, mathexp, CL_TRUE, 2*i*sizeof(struct mathExp), 2*sizeof(struct mathExp), tmpExp, 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
            clWaitForEvents(1, &ndrEvt);
            clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
            clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
            pp->pcie += 1e-6 * (endTime - startTime);
#endif
        }
    }

    cl_mem gpuFunc = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*res->totalAttr, NULL, &error);
    clEnqueueWriteBuffer(context->queue, gpuFunc, CL_TRUE, 0, sizeof(int)*res->totalAttr, cpuFunc, 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
    clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    long *resOffset = (long *)malloc(sizeof(long)*res->totalAttr);
    CHECK_POINTER(resOffset);

    offset = 0;
    totalSize = 0;
    for(int i=0;i<res->totalAttr;i++){
        /*
         * align the output of each column on the boundary of 4
         */
        int size = res->attrSize[i] * res->tupleNum;
        if(size % 4 != 0){
            size += 4 - (size % 4);
        }
        resOffset[i] = offset;
        offset += size;
        totalSize += size;
    }

    cl_mem gpuResult = clCreateBuffer(context->context, CL_MEM_READ_WRITE, totalSize, NULL, &error);
    cl_mem gpuResOffset = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(long)*res->totalAttr, NULL, &error);
    clEnqueueWriteBuffer(context->queue, gpuResOffset, CL_TRUE, 0, sizeof(long)*res->totalAttr, resOffset, 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
    clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    gpuGbColNum = res->totalAttr;

    if(gbConstant != 1){
        context->kernel = clCreateKernel(context->program, "agg_cal", 0);
        clSetKernelArg(context->kernel, 0, sizeof(cl_mem), (void*)&gpuContent);
        clSetKernelArg(context->kernel, 1, sizeof(cl_mem), (void*)&gpuOffset);
        clSetKernelArg(context->kernel, 2, sizeof(int), (void*)&gpuGbColNum);
        clSetKernelArg(context->kernel, 3, sizeof(cl_mem), (void*)&gpuGbExp);
        clSetKernelArg(context->kernel, 4, sizeof(cl_mem), (void*)&mathexp);
        clSetKernelArg(context->kernel, 5, sizeof(cl_mem), (void*)&gpuGbType);
        clSetKernelArg(context->kernel, 6, sizeof(cl_mem), (void*)&gpuGbSize);
        clSetKernelArg(context->kernel, 7, sizeof(long), (void*)&gpuTupleNum);
        clSetKernelArg(context->kernel, 8, sizeof(cl_mem), (void*)&gpuGbKey);
        clSetKernelArg(context->kernel, 9, sizeof(cl_mem), (void*)&gpu_psum);
        clSetKernelArg(context->kernel, 10, sizeof(cl_mem), (void*)&gpuResult);
        clSetKernelArg(context->kernel, 11, sizeof(cl_mem), (void*)&gpuResOffset);
        clSetKernelArg(context->kernel, 12, sizeof(cl_mem), (void*)&gpuFunc);

        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize, &localSize, 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif
        clReleaseMemObject(gpuGbKey);
        clReleaseMemObject(gpu_psum);
    }else{
        context->kernel = clCreateKernel(context->program, "agg_cal_cons", 0);
        clSetKernelArg(context->kernel, 0, sizeof(cl_mem), (void*)&gpuContent);
        clSetKernelArg(context->kernel, 1, sizeof(cl_mem), (void*)&gpuOffset);
        clSetKernelArg(context->kernel, 2, sizeof(int), (void*)&gpuGbColNum);
        clSetKernelArg(context->kernel, 3, sizeof(cl_mem), (void*)&gpuGbExp);
        clSetKernelArg(context->kernel, 4, sizeof(cl_mem), (void*)&mathexp);
        clSetKernelArg(context->kernel, 5, sizeof(cl_mem), (void*)&gpuGbType);
        clSetKernelArg(context->kernel, 6, sizeof(cl_mem), (void*)&gpuGbSize);
        clSetKernelArg(context->kernel, 7, sizeof(long), (void*)&gpuTupleNum);
        clSetKernelArg(context->kernel, 8, sizeof(cl_mem), (void*)&gpuResult);
        clSetKernelArg(context->kernel, 9, sizeof(cl_mem), (void*)&gpuResOffset);
        clSetKernelArg(context->kernel, 10, sizeof(cl_mem), (void*)&gpuFunc);

        globalSize = localSize * 4;

        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize, &localSize, 0, 0, &ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0);
        clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif
    }

    for(int i=0; i<res->totalAttr; i++){
        res->content[i] = (char *)clCreateBuffer(context->context, CL_MEM_READ_WRITE, res->attrSize[i]*res->tupleNum, NULL, &error);
        res->dataPos[i] = GPU;
        res->attrTotalSize[i] = res->tupleNum * res->attrSize[i];
        clEnqueueCopyBuffer(context->queue, gpuResult, (cl_mem)res->content[i], resOffset[i], 0, res->attrSize[i] * res->tupleNum, 0, 0, 0);
    }

    free(resOffset);
    free(cpuOffset);

    clFinish(context->queue);
    clReleaseMemObject(gpuContent);
    clReleaseMemObject(gpuResult);
    clReleaseMemObject(gpuOffset);
    clReleaseMemObject(gpuResOffset);
    clReleaseMemObject(gpuGbExp);
    clReleaseMemObject(gpuFunc);

    clock_gettime(CLOCK_REALTIME, &end);
    double timeE = (end.tv_sec - start.tv_sec)* BILLION + end.tv_nsec - start.tv_nsec;
    printf("GroupBy Time: %lf\n", timeE/(1000*1000));

    return res;
}
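/* The profiling boilerplate above repeats after every enqueue. A small helper
 * (a sketch, not part of the original source) would keep the hot path
 * readable while doing exactly what the repeated blocks do: */
static double eventMillis(cl_event ev)
{
    cl_ulong t0 = 0, t1 = 0;
    clWaitForEvents(1, &ev);
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &t0, 0);
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &t1, 0);
    return 1e-6 * (double)(t1 - t0);   /* nanoseconds to milliseconds */
}
/* e.g. pp->pcie += eventMillis(ndrEvt); after a write, or
 *      pp->kernel += eventMillis(ndrEvt); after a kernel launch. */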
void Device::scoreCandidates(eObj *e) {
    //e->iNumBufferedCandidates = 0;
    //return;
    //MEA: static?
    static cObj* p;
    //static size_t iNumBlocks;
    static size_t stGlobalDim;
    static size_t globalTransDim = Tempest::mround(Tempest::data.iNumMS2Bins, this->transform_size);
    static float fElapsedTime;
    long lSpectrumOffset = e->lIndex*Tempest::data.iNumMS2Bins;
    long lScratchOffset = (long)Tempest::data.iCrossCorrelationWidth;
    long lNoOffset = 0;
    int err;
    cl_ulong start;
    cl_ulong end;

    err = clEnqueueWriteBuffer(clCommandQueue, cl_cCandidates, CL_FALSE, 0,
                               sizeof(cObj) * e->iNumBufferedCandidates,
                               e->candidateBuffer, 0, NULL, &(e->clEventSent));
    Tempest::check_cl_error(__FILE__, __LINE__, err, "Unable to copy candidate data from host to GPU");

    stGlobalDim = Tempest::mround(Tempest::data.host_iPeakCounts[e->lIndex], this->build_size);

    cl_mem spectrumBuffer;
    std::map<long,cl_mem>::iterator s2bElem = spectrum2buffer.find(e->lIndex);
    if (s2bElem == spectrum2buffer.end()) { //spectrum not cached
        if (!unusedBuffers.empty()) {
            spectrumBuffer = unusedBuffers.top();
            unusedBuffers.pop();
        }
        else {
            spectrumBuffer = spectrum2buffer.begin()->second;
            spectrum2buffer.erase(spectrum2buffer.begin());
        }
        spectrum2buffer[e->lIndex] = spectrumBuffer;

        //initialize buffer
        err = clEnqueueCopyBuffer(clCommandQueue, cl_init_fSpectra, spectrumBuffer, 0, 0,
                                  Tempest::data.iNumMS2Bins*sizeof(cl_float), 0, NULL,
                                  Tempest::config.profile ? &memsetEvent : NULL);
        //Tempest::check_cl_error(__FILE__, __LINE__, err, "Unable to clear spectrum memory");
        if (err != 0) {
            //memory cap reached. Stop filling new buffers.
            unusedBuffers = std::stack<cl_mem>();
            spectrumBuffer = spectrum2buffer.begin()->second;
            spectrum2buffer.erase(spectrum2buffer.begin());
            spectrum2buffer[e->lIndex] = spectrumBuffer;
            err = clEnqueueCopyBuffer(clCommandQueue, cl_init_fSpectra, spectrumBuffer, 0, 0,
                                      Tempest::data.iNumMS2Bins*sizeof(cl_float), 0, NULL,
                                      Tempest::config.profile ? &memsetEvent : NULL);
            Tempest::check_cl_error(__FILE__, __LINE__, err, "Unable to clear spectrum memory");
        }
        if (Tempest::config.profile) {
            clFinish(clCommandQueue);
            clGetEventProfilingInfo(memsetEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
            clGetEventProfilingInfo(memsetEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
            totalMemsetTime += (end-start);
            clReleaseEvent(memsetEvent);
        }

        // build
        err  = clSetKernelArg(__cl_build, 0, sizeof(cl_mem), &spectrumBuffer);
        err |= clSetKernelArg(__cl_build, 1, sizeof(int), &(Tempest::data.host_iPeakCounts[e->lIndex]));
        err |= clSetKernelArg(__cl_build, 4, sizeof(long), &(Tempest::data.host_lPeakIndices[e->lIndex]));
        err |= clEnqueueNDRangeKernel(clCommandQueue, __cl_build, 1, NULL, &stGlobalDim, &(this->build_size), 0, NULL,
                                      Tempest::config.profile ? &buildEvent : NULL);
        Tempest::check_cl_error(__FILE__, __LINE__, err, "Could not build spectrum (cl_build kernel)");
        if (Tempest::config.profile) {
            clFinish(clCommandQueue);
            clGetEventProfilingInfo(buildEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
            clGetEventProfilingInfo(buildEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
            totalBuildTime += (end-start);
            buildLaunches += 1;
            clReleaseEvent(buildEvent);
        }

        // transform
        if (Tempest::params.xcorrTransformWidth) {
            //size_t localDim = CROSS_CORRELATION_WINDOW * 2;
            //size_t globalDim = localDim * Tempest::data.iNumMS2Bins;
            size_t globalDim = Tempest::mround(Tempest::data.iNumMS2Bins, this->transform_size);
            err  = clSetKernelArg(__cl_transform, 0, sizeof(cl_mem), &spectrumBuffer);
            err |= clEnqueueNDRangeKernel(clCommandQueue, __cl_transform, 1, NULL, &globalDim, &(this->transform_size), 0, NULL,
                                          Tempest::config.profile ? &transformEvent : NULL);
            Tempest::check_cl_error(__FILE__, __LINE__, err, "Could not transform spectrum (cl_transform kernel)");
            if (Tempest::config.profile) {
                clFinish(clCommandQueue);
                clGetEventProfilingInfo(transformEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
                clGetEventProfilingInfo(transformEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
                totalTransformTime += (end-start);
                clReleaseEvent(transformEvent);
            }
        }
    }
    else {
        //move spectrum entry to end of map by reinserting
        spectrumBuffer = s2bElem->second;
        spectrum2buffer.erase(s2bElem);
        spectrum2buffer[e->lIndex] = spectrumBuffer;
    }

    // score
    err  = clSetKernelArg(__cl_score, 0, sizeof(int), &(e->iPrecursorCharge));
    err |= clSetKernelArg(__cl_score, 1, sizeof(int), &(e->iNumBufferedCandidates));
    err |= clSetKernelArg(__cl_score, 4, sizeof(cl_mem), &spectrumBuffer);
    err |= clSetKernelArg(__cl_score, 5, sizeof(long), &lNoOffset);
    err |= clEnqueueNDRangeKernel(clCommandQueue, __cl_score, 1, NULL, &(this->candidateBufferSize), &(this->score_size), 0, NULL,
                                  Tempest::config.profile ? &scoreEvent : NULL);
    Tempest::check_cl_error(__FILE__, __LINE__, err, "Could not score candidates (cl_score kernel)");
    if (Tempest::config.profile) {
        clFinish(clCommandQueue);
        clGetEventProfilingInfo(scoreEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
        clGetEventProfilingInfo(scoreEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
        totalScoreTime += (end-start);
        clReleaseEvent(scoreEvent);
        scoreKernelLaunches++;
    }

    // Process Scores
    // TODO what if buffer size is less than 512?
    long lPSMsOffset = e->lIndex * Tempest::params.numInternalPSMs;
    err |= clSetKernelArg(__cl_reduce_scores, 4, sizeof(long), &lPSMsOffset);
    if (Tempest::config.parallelReduce)
        err |= clEnqueueNDRangeKernel(clCommandQueue, __cl_reduce_scores, 1, NULL, &(this->reduce_scores_size), &(this->reduce_scores_size), 0, NULL,
                                      Tempest::config.profile ? &reduceEvent : NULL);
    else
        err |= clEnqueueTask(clCommandQueue, __cl_reduce_scores, 0, NULL,
                             Tempest::config.profile ? &reduceEvent : NULL);
    Tempest::check_cl_error(__FILE__, __LINE__, err, "Could not process scores (cl_reduce_scores kernel)");
    if (Tempest::config.profile) {
        clFinish(clCommandQueue);
        clGetEventProfilingInfo(reduceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
        clGetEventProfilingInfo(reduceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
        totalReduceTime += (end-start);
        clReleaseEvent(reduceEvent);
    }

    // reset buffer
    e->iNumBufferedCandidates = 0;
}
int NBody::runCLKernels()
{
    cl_int status;
    cl_event events[1];

    /*
     * Enqueue a kernel run call.
     */
    size_t globalThreads[] = {numBodies};
    size_t localThreads[] = {groupSize};

    if(localThreads[0] > maxWorkItemSizes[0] ||
       localThreads[0] > maxWorkGroupSize)
    {
        std::cout << "Unsupported: Device "
                     "does not support requested number of work items.";
        return SDK_FAILURE;
    }

    status = clEnqueueNDRangeKernel(
        commandQueue,
        kernel,
        1,
        NULL,
        globalThreads,
        localThreads,
        0,
        NULL,
        NULL);
    if(!sampleCommon->checkVal(status, CL_SUCCESS, "clEnqueueNDRangeKernel failed."))
    {
        return SDK_FAILURE;
    }

    status = clFinish(commandQueue);
    if(!sampleCommon->checkVal(status, CL_SUCCESS, "clFinish failed."))
    {
        return SDK_FAILURE;
    }

    /* Copy data from new to old */
    status = clEnqueueCopyBuffer(commandQueue,
                                 newPos,
                                 currPos,
                                 0,
                                 0,
                                 sizeof(cl_float4) * numBodies,
                                 0, 0, 0);
    if(!sampleCommon->checkVal(status, CL_SUCCESS, "clEnqueueCopyBuffer failed. (newPos->currPos)"))
    {
        return SDK_FAILURE;
    }

    status = clEnqueueCopyBuffer(commandQueue,
                                 newVel,
                                 currVel,
                                 0,
                                 0,
                                 sizeof(cl_float4) * numBodies,
                                 0, 0, 0);
    if(!sampleCommon->checkVal(status, CL_SUCCESS, "clEnqueueCopyBuffer failed. (newVel->currVel)"))
    {
        return SDK_FAILURE;
    }

    status = clFinish(commandQueue);
    if(!sampleCommon->checkVal(status, CL_SUCCESS, "clFinish failed."))
    {
        return SDK_FAILURE;
    }

    /* Enqueue readBuffer */
    status = clEnqueueReadBuffer(
        commandQueue,
        currPos,
        CL_TRUE,
        0,
        numBodies * sizeof(cl_float4),
        pos,
        0,
        NULL,
        &events[0]);
    if(!sampleCommon->checkVal(status, CL_SUCCESS, "clEnqueueReadBuffer failed."))
        return SDK_FAILURE;

    /* Wait for the read buffer to finish execution */
    status = clWaitForEvents(1, &events[0]);
    if(!sampleCommon->checkVal(status, CL_SUCCESS, "clWaitForEvents failed."))
        return SDK_FAILURE;

    clReleaseEvent(events[0]);

    return SDK_SUCCESS;
}