cl_int MemoryObjectWrapper::createSubBuffer (cl_mem_flags aFlags, RegionWrapper const& aRegion, MemoryObjectWrapper** aResultOut) { #if CL_WRAPPER_CL_VERSION_SUPPORT >= 110 D_METHOD_START; cl_int err = CL_SUCCESS; VALIDATE_ARG_POINTER (aResultOut, &err, err); cl_buffer_region region; region.origin = aRegion.origin; region.size = aRegion.size; cl_mem mem = clCreateSubBuffer (mWrapped, aFlags, CL_BUFFER_CREATE_TYPE_REGION, (void const*)®ion, &err); if (err != CL_SUCCESS || !mem) D_LOG (LOG_LEVEL_ERROR, "clCreateSubBuffer failed. (error %d)", err); // NOTE: clCreateSubBuffer can return an existing handle. *aResultOut = MemoryObjectWrapper::getNewOrExisting (mem); return err; #else // CL_WRAPPER_CL_VERSION_SUPPORT >= 110 (void)aFlags; (void)aRegion; (void)aResultOut; D_LOG (LOG_LEVEL_ERROR, "CLWrapper support for OpenCL 1.1 API was not enabled at build time."); return CL_INVALID_VALUE; #endif }
/* * call-seq: * Memory.create_subbuffer(buffer, , [10, 100]) * */ static VALUE rcl_mem_create_subbuffer(VALUE mod, VALUE buffer, VALUE flags, VALUE region) { EXPECT_RCL_TYPE(buffer, Memory); EXPECT_FIXNUM(flags); EXPECT_ARRAY(region); long i = RARRAY_LEN(region); if (i != 2) { rb_raise(rb_eArgError, "Expected the parameter region has 2 items, got (%ld).", i); } VALUE sz = rb_ary_entry(region, 0); EXTRACT_SIZE(sz, origin); sz = rb_ary_entry(region, 1); EXTRACT_SIZE(sz, offset); cl_buffer_region br; br.origin = origin; br.size = offset; cl_mem buf = MemoryPtr(buffer); cl_mem_flags mf = FIX2INT(flags); cl_int res = CL_SUCCESS; cl_mem subbuf = clCreateSubBuffer(buf, mf, CL_BUFFER_CREATE_TYPE_REGION, &br, &res); CHECK_AND_RAISE(res); return RMemory(subbuf); }
/*! Creates a new buffer that refers to the \a size bytes, starting at \a offset within this buffer. The data in the new buffer will be accessed according to \a access. Sub-buffers are an OpenCL 1.1 feature. On OpenCL 1.0, this function will return a null buffer. \sa parentBuffer(), offset() */ QCLBuffer QCLBuffer::createSubBuffer (size_t offset, size_t size, QCLMemoryObject::Access access) { #ifdef QT_OPENCL_1_1 cl_int error; cl_buffer_region region; region.origin = offset; region.size = size; cl_mem mem = clCreateSubBuffer (memoryId(), cl_mem_flags(access), CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); context()->reportError("QCLBuffer::createSubBuffer:", error); return QCLBuffer(context(), mem); #else Q_UNUSED(offset); Q_UNUSED(size); Q_UNUSED(access); return QCLBuffer(); #endif }
boost::shared_ptr<DeviceMatrixCL> makeDeviceMatrixCL(DeviceMatrixCL3D& src, const int slice) { const int height = src.dim_y; const int width = src.dim_x; DeviceMatrixCL* mat = new DeviceMatrixCL(); mat->width = width; mat->height = height; //size_t mem_size = width * height size_t buffer_region[2] = {src.pitch_t * slice, src.pitch_t}; cl_int err; mat->dataMatrix = clCreateSubBuffer(src.dataMatrix, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, buffer_region, &err); mat->pitch = src.pitch_y; return boost::shared_ptr<DeviceMatrixCL>(mat, deleteDeviceMatrixCL); }
void ConvolutionLayerSpatial<Dtype>::setBufferKernelArg( const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top, viennacl::ocl::kernel *kernel, const cl_uint &argIdx, viennacl::ocl::context *ctx, cl_mem buffer, size_t offset, size_t size, bool readOnly, bool preserved) { if (offset == 0) { kernel->arg(argIdx, WrapHandle((cl_mem) buffer, ctx)); return; } if (preserved && subBufferMap.find(std::make_tuple(buffer, offset, size)) != subBufferMap.end()) { kernel->arg(argIdx, WrapHandle(subBufferMap.find (std::make_tuple(buffer, offset, size))->second, ctx)); return; } cl_buffer_region region; region.origin = offset * sizeof(Dtype); region.size = size * sizeof(Dtype); cl_mem_flags memFlags = readOnly ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE; cl_int error; cl_mem sub_buffer = clCreateSubBuffer(buffer, memFlags, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); CHECK_EQ(error, CL_SUCCESS) << "Failed to create sub buffer." << std::endl; kernel->arg(argIdx, WrapHandle(sub_buffer, ctx)); if (preserved) subBufferMap.insert(std::make_pair(std::make_tuple(buffer, offset, size), sub_buffer)); else tmpSubBuffers.push_back(sub_buffer); }
static void _ocl_offset_data(QSP_ARG_DECL Data_Obj *dp, index_t offset) { #ifndef USE_OPENCL_SUBREGION /* The original code used subBuffers, but overlapping subregions * don't work... * So instead we use a common memory buffer, but keep track * of the starting offset (in elements). This offset has * to be passed to the kernels. */ //fprintf(stderr,"ocl_offset_data: obj %s, offset = %d\n",OBJ_NAME(dp),offset); //fprintf(stderr,"\tparent obj %s, parent offset = %d\n",OBJ_NAME(OBJ_PARENT(dp)), //OBJ_OFFSET(OBJ_PARENT(dp))); if( IS_COMPLEX(dp) ){ assert( (offset & 1) == 0 ); offset /= 2; //fprintf(stderr,"Adjusted offset (%d) for complex object %s\n",offset,OBJ_NAME(dp)); } else if( IS_QUAT(dp) ){ assert( (offset & 3) == 0 ); offset /= 4; } SET_OBJ_DATA_PTR(dp,OBJ_DATA_PTR(OBJ_PARENT(dp))); SET_OBJ_OFFSET( dp, OBJ_OFFSET(OBJ_PARENT(dp)) + offset ); #else // USE_OPENCL_SUBREGION cl_mem buf; cl_mem parent_buf; cl_buffer_region reg; cl_int status; int extra_offset; parent_buf = find_parent_buf(OBJ_PARENT(dp),&extra_offset); assert( parent_buf != NULL ); reg.origin = (offset+extra_offset) * ELEMENT_SIZE(dp); // No - the region has to be big enough for all of the elements. // The safest thing is to include everything from the start // of the subregion to the end of the parent. Note that this // cannot handle negative increments!? // reg.size = OBJ_N_MACH_ELTS(dp) * ELEMENT_SIZE(dp); // p p p p p p p // p p c c c p p // p p p p p p p // p p c c c p p reg.size = OBJ_SEQ_INC(dp)*(OBJ_SEQS(dp)-1) + OBJ_FRM_INC(dp)*(OBJ_FRAMES(dp)-1) + OBJ_ROW_INC(dp)*(OBJ_ROWS(dp)-1) + OBJ_PXL_INC(dp)*(OBJ_COLS(dp)-1) + OBJ_COMP_INC(dp)*(OBJ_COMPS(dp)-1) + 1; reg.size *= ELEMENT_SIZE(dp); //fprintf(stderr,"requesting subregion of %ld bytes at offset %ld\n", //reg.size,reg.origin); buf = clCreateSubBuffer ( parent_buf, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®, &status); if( status != CL_SUCCESS ){ report_ocl_error(status, "clCreateSubBuffer"); SET_OBJ_DATA_PTR(dp,OBJ_DATA_PTR(OBJ_PARENT(dp))); } else { SET_OBJ_DATA_PTR(dp,buf); } // BUG - Because this object doesn't "own" the data, the sub-buffer // won't be released when the object is destroyed, a possible memory // leak... // We need to add a special case, or make data releasing a // platform-specific function... #endif // USE_OPENCL_SUBREGION }
enum piglit_result piglit_cl_test(const int argc, const char **argv, const struct piglit_cl_custom_test_config *config, const struct piglit_cl_custom_test_env *env) { piglit_cl_context piglit_cl_context = NULL; cl_command_queue queue = NULL; cl_mem buffer = NULL, sub_buffer = NULL; cl_program program = NULL; cl_kernel kernel = NULL; unsigned i; size_t global_size = 1, local_size = 1; cl_buffer_region region = {PAD_SIZE, SUB_BUFFER_SIZE }; cl_int err; char *sub_data = malloc(BUFFER_SIZE); char *padding = malloc(PAD_SIZE); char data_byte = (char)DATA_BYTE; char pad_byte = 0xcd; char *out_data = malloc(BUFFER_SIZE); assert(SUB_BUFFER_SIZE % 4 == 0); memset(sub_data, data_byte, SUB_BUFFER_SIZE); memset(padding, pad_byte, PAD_SIZE); piglit_cl_context = piglit_cl_create_context(env->platform_id, &env->device_id, 1); queue = piglit_cl_context->command_queues[0]; buffer = piglit_cl_create_buffer(piglit_cl_context, CL_MEM_READ_WRITE, BUFFER_SIZE); sub_buffer = clCreateSubBuffer(buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); if (err != CL_SUCCESS) { fprintf(stderr, "clCreateSubBuffer() failed."); return PIGLIT_FAIL; } clEnqueueWriteBuffer(queue, buffer, CL_FALSE, 0, PAD_SIZE, padding, 0, NULL, NULL); clEnqueueWriteBuffer(queue, buffer, CL_FALSE, BUFFER_SIZE - PAD_SIZE, PAD_SIZE, padding, 0, NULL, NULL); clFinish(queue); program = piglit_cl_build_program_with_source(piglit_cl_context, 1, &source, ""); kernel = piglit_cl_create_kernel(program, "test"); if (!piglit_cl_set_kernel_arg(kernel, 0, sizeof(cl_mem), &sub_buffer)) { return PIGLIT_FAIL; } if (!piglit_cl_enqueue_ND_range_kernel(queue, kernel, 1, &global_size, &local_size)) { return PIGLIT_FAIL; } clFinish(queue); clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, BUFFER_SIZE, out_data, 0, NULL, NULL); for (i = 0; i < PAD_SIZE; i++) { if (!piglit_cl_probe_integer(out_data[i], pad_byte, 0)) { fprintf(stderr, "Failed at offset %u\n", i); return PIGLIT_FAIL; } } for (i = BUFFER_SIZE - PAD_SIZE; i < BUFFER_SIZE; i++) { if (!piglit_cl_probe_integer(out_data[i], pad_byte, 0)) { fprintf(stderr, "Failed at offset %u\n", i); return PIGLIT_FAIL; } } for (i = PAD_SIZE; i < BUFFER_SIZE - PAD_SIZE; i++) { if (!piglit_cl_probe_integer(out_data[i], data_byte, 0)) { fprintf(stderr, "Failed at offset %u\n", i); return PIGLIT_FAIL; } } return PIGLIT_PASS; }
/** * \related cl_Mem_Object_t * * This function allocates memory for child Memory Object with generic data * container & sets function pointers * * @param[in] self pointer to existing Memory Object, in which function pointer * 'Make_Child' is defined to point on this function. * @param[in] mem_flags OpenCL memory flags, which will be used for OpenCL * sub-buffer creation * @param[in] buffer_create_type information about sub-buffer creation type * @param[in] buffer_create_info structure with sub-buffer origin & size * values. * * @return pointer to allocated structure in case of success, * \ref VOID_MEM_OBJ_PTR otherwise. In case of error it sets error code, which * is available via 'error' structure. * * @warning always use 'Destroy' function pointer to free memory, allocated * by this function. */ static scow_Mem_Object* Buffer_Make_Sub_Buffer(scow_Mem_Object *self, cl_mem_flags flags, cl_buffer_create_type buffer_create_type, const void *buffer_create_info) { OCL_CHECK_EXISTENCE(self, VOID_MEM_OBJ_PTR); cl_int ret; scow_Mem_Object* child; /* Sub-buffer creation is defined only for Buffer, which isn't someone's * child. */ if (self->obj_mem_type != BUFFER) { ret = INVALID_ARG_TYPE; self->error->Set_Last_Code(self->error, ret); OCL_DIE_ON_ERROR(ret, CL_SUCCESS, NULL, VOID_MEM_OBJ_PTR); } if (self->obj_paternity != PARENT_OBJECT) { ret = WRONG_PARENT_OBJECT; self->error->Set_Last_Code(self->error, ret); OCL_DIE_ON_ERROR(ret, CL_SUCCESS, NULL, VOID_MEM_OBJ_PTR); } child = (scow_Mem_Object*) calloc(1, sizeof(*child)); OCL_CHECK_EXISTENCE(child, VOID_MEM_OBJ_PTR); child->obj_mem_type = BUFFER; child->obj_paternity = CHILD_OBJECT; child->mem_flags = flags; child->parent_thread = self->parent_thread; child->error = Make_Error(); child->timer = Make_Timer(VOID_KERNEL_PTR); child->Get_Mem_Obj = Mem_Object_Get_Mem_Obj; child->Destroy = Mem_Object_Destroy; child->Swap = Mem_Object_Swap; child->Unmap = Mem_Object_Unmap; child->Map = Buffer_Map; child->Write = Buffer_Send_To_Device; child->Read = Buffer_Get_From_Device; child->Copy = Buffer_Copy; child->Erase = Buffer_Erase; child->Sync = Mem_Object_Sync; child->Get_Height = Buffer_Get_Height; child->Get_Width = Buffer_Get_Width; child->Get_Row_Pitch = Buffer_Get_Row_Pitch; child->Make_Child = Buffer_Make_Sub_Buffer; child->cl_mem_object = clCreateSubBuffer(self->cl_mem_object, flags, buffer_create_type, buffer_create_info, &ret); OCL_DIE_ON_ERROR(ret, CL_SUCCESS, child->Destroy(child), VOID_MEM_OBJ_PTR); child->size = ((cl_buffer_region*) buffer_create_info)->size; child->origin = ((cl_buffer_region*) buffer_create_info)->origin; if (self->host_ptr) { child->host_ptr = (unsigned char*)self->host_ptr + child->origin; } return child; }
// main() for simple buffer and sub-buffer example // int main(int argc, char** argv) { cl_int errNum; cl_uint numPlatforms; cl_uint numDevices; cl_platform_id * platformIDs; cl_device_id * deviceIDs; cl_context context; cl_program program; std::vector<cl_kernel> kernels; std::vector<cl_command_queue> queues; std::vector<cl_mem> buffers; int * inputOutput; std::cout << "Simple buffer and sub-buffer Example" << std::endl; // First, select an OpenCL platform to run on. errNum = clGetPlatformIDs(0, NULL, &numPlatforms); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); platformIDs = (cl_platform_id *)alloca(sizeof(cl_platform_id) * numPlatforms); std::cout << "Number of platforms: \t" << numPlatforms << std::endl; errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); std::ifstream srcFile("simple.cl"); checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading simple.cl"); std::string srcProg( std::istreambuf_iterator<char>(srcFile), (std::istreambuf_iterator<char>())); const char * src = srcProg.c_str(); size_t length = srcProg.length(); deviceIDs = NULL; DisplayPlatformInfo( platformIDs[PLATFORM_INDEX], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR"); errNum = clGetDeviceIDs( platformIDs[PLATFORM_INDEX], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND){ checkErr(errNum, "clGetDeviceIDs"); } deviceIDs = (cl_device_id *)alloca( sizeof(cl_device_id) * numDevices); errNum = clGetDeviceIDs( platformIDs[PLATFORM_INDEX], CL_DEVICE_TYPE_ALL, numDevices, &deviceIDs[0], NULL); checkErr(errNum, "clGetDeviceIDs"); cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformIDs[PLATFORM_INDEX], 0 }; context = clCreateContext( contextProperties, numDevices, deviceIDs, NULL, NULL, &errNum); checkErr(errNum, "clCreateContext"); // Create program from source program = clCreateProgramWithSource( context, 1, &src, &length, &errNum); checkErr(errNum, "clCreateProgramWithSource"); // Build program errNum = clBuildProgram( program, numDevices, deviceIDs, "-I.", NULL, NULL); if (errNum != CL_SUCCESS){ // Determine the reason for the error char buildLog[16384]; clGetProgramBuildInfo( program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL); std::cerr << "Error in OpenCL C source: " << std::endl; std::cerr << buildLog; checkErr(errNum, "clBuildProgram"); } // create buffers and sub-buffers inputOutput = new int[NUM_BUFFER_ELEMENTS * numDevices]; for (unsigned int i = 0; i < NUM_BUFFER_ELEMENTS * numDevices; i++) { inputOutput[i] = i; } // create a single buffer to cover all the input data cl_mem buffer = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices, NULL, &errNum); checkErr(errNum, "clCreateBuffer"); buffers.push_back(buffer); // now for all devices other than the first create a sub-buffer for (unsigned int i = 1; i < numDevices; i++) { cl_buffer_region region = { NUM_BUFFER_ELEMENTS * i * sizeof(int), NUM_BUFFER_ELEMENTS * sizeof(int) }; buffer = clCreateSubBuffer( buffers[0], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &errNum); checkErr(errNum, "clCreateSubBuffer"); buffers.push_back(buffer); } // Create command queues for (int i = 0; i < numDevices; i++) { InfoDevice<cl_device_type>::display(deviceIDs[i], CL_DEVICE_TYPE, "CL_DEVICE_TYPE"); cl_command_queue queue = clCreateCommandQueue( context, deviceIDs[i], 0, &errNum); checkErr(errNum, "clCreateCommandQueue"); queues.push_back(queue); cl_kernel kernel = clCreateKernel( program, "square", &errNum); checkErr(errNum, "clCreateKernel(square)"); errNum = clSetKernelArg( kernel, 0, sizeof(cl_mem), (void *)&buffers[i]); checkErr(errNum, "clSetKernelArg(square)"); kernels.push_back(kernel); // Write input data clEnqueueWriteBuffer( queues[0], buffers[0], CL_TRUE, 0, sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices, (void*)inputOutput, 0, NULL, NULL); std::vector<cl_event> events; // call kernel for each device for (int i = 0; i < queues.size(); i++) { cl_event event; size_t gWI = NUM_BUFFER_ELEMENTS; errNum = clEnqueueNDRangeKernel( queues[i], kernels[i], 1, NULL, (const size_t*)&gWI, (const size_t*)NULL, 0, 0, &event); events.push_back(event); } // Technically don't need this as we are doing a blocking read // with in-order queue. clWaitForEvents(events.size(), events.data()); // Read back computed data clEnqueueReadBuffer( queues[0], buffers[0], CL_TRUE, 0, sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices, (void*)inputOutput, 0, NULL, NULL); // Display output in rows for (unsigned i = 0; i < numDevices; i++) { for (unsigned elems = i * NUM_BUFFER_ELEMENTS; elems < ((i+1) * NUM_BUFFER_ELEMENTS); elems++) { std::cout << " " << inputOutput[elems]; } std::cout << std::endl; } std::cout << "Program completed successfully" << std::endl; return 0; } }