コード例 #1
0
ファイル: simple.cpp プロジェクト: BeauJoh/SimpleImageLoad
// main() for simple buffer and sub-buffer example
//
int main(int argc, char** argv)
{
    
    std::cout << "Simple Image Processing Example" << std::endl;
    
    
    // First, select an OpenCL platform to run on.
    errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
    checkErr(
             (errNum != CL_SUCCESS) ?
             errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS),
             "clGetPlatformIDs");
    platformIDs = (cl_platform_id *)alloca(sizeof(cl_platform_id) * numPlatforms);
    std::cout << "Number of platforms: \t" << numPlatforms << std::endl;
    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
    checkErr(
             (errNum != CL_SUCCESS) ?
             errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS),
             "clGetPlatformIDs");
    std::ifstream srcFile("gaussian_filter.cl");
    
    checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading simple.cl");
    
    std::string srcProg(
                        std::istreambuf_iterator<char>(srcFile),
                        (std::istreambuf_iterator<char>()));
    const char * src = srcProg.c_str();
    size_t length = srcProg.length();
    deviceIDs = NULL;
    DisplayPlatformInfo(
                        platformIDs[PLATFORM_INDEX],
                        CL_PLATFORM_VENDOR,
                        "CL_PLATFORM_VENDOR");
    errNum = clGetDeviceIDs(
                            platformIDs[PLATFORM_INDEX],
                            CL_DEVICE_TYPE_ALL,
                            0,
                            NULL,
                            &numDevices);
    if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND){
        checkErr(errNum, "clGetDeviceIDs");
    }
    
    deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices);
    errNum = clGetDeviceIDs(
                            platformIDs[PLATFORM_INDEX],
                            CL_DEVICE_TYPE_ALL,
                            numDevices,
                            &deviceIDs[0],
                            NULL);
    checkErr(errNum, "clGetDeviceIDs");
    
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platformIDs[PLATFORM_INDEX],
        0
    };
    
    context = clCreateContext(
                              contextProperties,
                              numDevices,
                              deviceIDs,
                              NULL,
                              NULL,
                              &errNum);
    
    checkErr(errNum, "clCreateContext");
    // Create program from source
    program = clCreateProgramWithSource(
                                        context,
                                        1,
                                        &src,
                                        &length,
                                        &errNum);
    checkErr(errNum, "clCreateProgramWithSource");
    
    // Build program
    errNum = clBuildProgram(
                            program,
                            numDevices,
                            deviceIDs,
                            "-I.",
                            NULL,
                            NULL);

    if (errNum != CL_SUCCESS){
        // Determine the reason for the error
        char buildLog[16384];
        clGetProgramBuildInfo(
                              program,
                              deviceIDs[0],
                              CL_PROGRAM_BUILD_LOG,
                              sizeof(buildLog),
                              buildLog,
                              NULL);
        std::cerr << "Error in OpenCL C source: " << std::endl;
        std::cerr << buildLog;
        checkErr(errNum, "clBuildProgram");
    }
    
    // Create a command commands
	//
	if(!(commands = clCreateCommandQueue(context, deviceIDs[0], 0, &errNum))) {
        std::cout << "Failed to create a command commands!" << std::endl;
        cleanKill(EXIT_FAILURE);
    }
    
    cl_kernel kernel = clCreateKernel(program, "gaussian_filter", &errNum);
    checkErr(errNum, "clCreateKernel(gaussian_filter)");

    if(!doesGPUSupportImageObjects){
        cleanKill(EXIT_FAILURE);
    }
    
    inputImage = LoadImage(context, (char*)"rgba.png", width, height);
        
    cl_image_format format; 
    format.image_channel_order = CL_RGBA; 
    format.image_channel_data_type = CL_UNORM_INT8;
    
    outputImage = clCreateImage2D(context, 
                             CL_MEM_WRITE_ONLY, 
                             &format, 
                             width, 
                             height,
                             0, 
                             NULL, 
                             &errNum);
    
    if(there_was_an_error(errNum)){
        std::cout << "Output Image Buffer creation error!" << std::endl;
        cleanKill(EXIT_FAILURE);
    }    
    
	if (!inputImage || !outputImage ){
        std::cout << "Failed to allocate device memory!" << std::endl;
        cleanKill(EXIT_FAILURE);
	}
    
    char *buffer = new char [width * height * 4];
    size_t origin[3] = { 0, 0, 0 };
    size_t region[3] = { width, height, 1};

    sampler = clCreateSampler(context,
                              CL_FALSE, // Non-normalized coordinates 
                              CL_ADDRESS_CLAMP_TO_EDGE, 
                              CL_FILTER_NEAREST, 
                              &errNum);
    
    if(there_was_an_error(errNum)){
        std::cout << "Error creating CL sampler object." << std::endl;
        cleanKill(EXIT_FAILURE);
    }
    
    // Set the kernel arguments
    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler);
    errNum |= clSetKernelArg(kernel, 3, sizeof(cl_int), &width);
    errNum |= clSetKernelArg(kernel, 4, sizeof(cl_int), &height);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error setting kernel arguments." << std::endl;
        std::cerr << print_cl_errstring(errNum) << std::endl;
        cleanKill(EXIT_FAILURE);
    }
    
    //errNum = clGetKernelWorkGroupInfo(kernel, deviceIDs, CL_KERNEL_WORK_GROUP_SIZE, sizeof(unsigned short)* height*width*4, &local, NULL);
    
//	if (errNum != CL_SUCCESS)
//	{
//        cout << print_cl_errstring(err) << endl;
//        if(err == CL_INVALID_VALUE){
//            cout << "if param_name is not valid, or if size in bytes specified by param_value_size "
//            << "is less than the size of return type as described in the table above and "
//            << "param_value is not NULL." << endl;
//        }
//		cout << "Error: Failed to retrieve kernel work group info!" << err << endl;
//		cleanKill(EXIT_FAILURE);
//	}
    
    std::cout << "Max work group size is " << CL_DEVICE_MAX_WORK_GROUP_SIZE << std::endl;
    std::cout << "Max work item size is " << CL_DEVICE_MAX_WORK_ITEM_SIZES << std::endl;
    
    size_t localWorkSize[2];
    size_t globalWorkSize[2];
    
    localWorkSize[0] = 1;
    localWorkSize[1] = localWorkSize[0];
    globalWorkSize[0] = width*height;
    globalWorkSize[1] = globalWorkSize[0];
    
    //CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and number of work-items specified by global_work_size is not evenly divisable by size of work-group given by local_work_size
    
    
    
        //size_t globalWorkSize[2] =  { RoundUp(localWorkSize[0], width), RoundUp(localWorkSize[1], height)};

//    size_t globalWorkSize[1] = {sizeof(unsigned short)* height * width};
//	size_t localWorkSize[1] = {64};
    
    // Queue the kernel up for execution
    errNum = clEnqueueNDRangeKernel(commands, kernel, 2, NULL,
                                    globalWorkSize, localWorkSize,
                                    0, NULL, NULL);
    
    if (errNum != CL_SUCCESS){
        std::cerr << "Error queuing kernel for execution." << std::endl;
        std::cerr << print_cl_errstring(errNum) << std::endl;
        cleanKill(EXIT_FAILURE);
    }
    
    // Wait for the command commands to get serviced before reading back results
	//
	clFinish(commands);
    
    // Read back computed data
    errNum = clEnqueueReadImage(commands, outputImage,
                                     CL_TRUE, origin, region, 0, 0, buffer, 0, NULL, NULL);
    
    SaveImage((char*)"outRGBA.png", (char*)buffer, width, height);

    std::cout << "Program completed successfully" << std::endl;        
    return 0;     
}
コード例 #2
0
void WriteBufferOperation::executeOpenCLRegion(OpenCLDevice *device, rcti *rect, unsigned int chunkNumber,
                                               MemoryBuffer **inputMemoryBuffers, MemoryBuffer *outputBuffer)
{
	float *outputFloatBuffer = outputBuffer->getBuffer();
	cl_int error;
	/*
	 * 1. create cl_mem from outputbuffer
	 * 2. call NodeOperation (input) executeOpenCLChunk(.....)
	 * 3. schedule readback from opencl to main device (outputbuffer)
	 * 4. schedule native callback
	 *
	 * note: list of cl_mem will be filled by 2, and needs to be cleaned up by 4
	 */
	// STEP 1
	const unsigned int outputBufferWidth = outputBuffer->getWidth();
	const unsigned int outputBufferHeight = outputBuffer->getHeight();

	const cl_image_format imageFormat = {
		CL_RGBA,
		CL_FLOAT
	};

	cl_mem clOutputBuffer = clCreateImage2D(device->getContext(), CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &imageFormat, outputBufferWidth, outputBufferHeight, 0, outputFloatBuffer, &error);
	if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
	
	// STEP 2
	list<cl_mem> *clMemToCleanUp = new list<cl_mem>();
	clMemToCleanUp->push_back(clOutputBuffer);
	list<cl_kernel> *clKernelsToCleanUp = new list<cl_kernel>();

	this->m_input->executeOpenCL(device, outputBuffer, clOutputBuffer, inputMemoryBuffers, clMemToCleanUp, clKernelsToCleanUp);

	// STEP 3

	size_t origin[3] = {0, 0, 0};
	size_t region[3] = {outputBufferWidth, outputBufferHeight, 1};

//	clFlush(queue);
//	clFinish(queue);

	error = clEnqueueBarrier(device->getQueue());
	if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
	error = clEnqueueReadImage(device->getQueue(), clOutputBuffer, CL_TRUE, origin, region, 0, 0, outputFloatBuffer, 0, NULL, NULL);
	if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
	
	this->getMemoryProxy()->getBuffer()->copyContentFrom(outputBuffer);

	// STEP 4
	while (!clMemToCleanUp->empty()) {
		cl_mem mem = clMemToCleanUp->front();
		error = clReleaseMemObject(mem);
		if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
		clMemToCleanUp->pop_front();
	}

	while (!clKernelsToCleanUp->empty()) {
		cl_kernel kernel = clKernelsToCleanUp->front();
		error = clReleaseKernel(kernel);
		if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
		clKernelsToCleanUp->pop_front();
	}
	delete clKernelsToCleanUp;
}
コード例 #3
0
ファイル: main.cpp プロジェクト: datakun/SourceCodeReading
/*-------------------------------------------------------------------------
 * ソフトフォーカス処理をおこなう(OpenCLを使った実装, イメージオブジェクトを使う)
 */
static void
softfocusWithOpenCLImage(unsigned char* srcData,
                         unsigned char* dstData,
                         const unsigned int width,
                         const unsigned int height)
{
    ClHelper clHelper;
    clHelper.preloadProgram("../../calc.cl");  // カーネルプログラムの読み込み

    cl_context       context = clHelper.getContext();      // コンテキストの取得
    cl_command_queue queue   = clHelper.getCommandQueue(); // コマンドキューの取得
    cl_program       program = clHelper.getProgram();      // プログラムの取得
    if (program == (cl_program)0) {
        throw MyError("program bug, program object is not loaded", __FUNCTION__);
    }

    cl_int status;

    // イメージフォーマットの定義
    cl_image_format format;
    format.image_channel_data_type = CL_UNORM_INT8; // 0-255を[0-1]に正規化
    format.image_channel_order     = CL_BGRA;       // BGRAの順に並んでいる

    // 元のイメージデータを保持するイメージオブジェクトの作成
    cl_mem src_mem = clCreateImage2D(context,
                                     CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                     &format,
                                     width, height,
                                     0,
                                     srcData,
                                     &status);
    if (status != CL_SUCCESS) {
        ClHelper::printError(status);
        throw MyError("failed to create image memory object 1", __FUNCTION__);
    }

    // 結果を保持するイメージオブジェクトの作成
    cl_mem dst_mem = clCreateImage2D(context,
                                     CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                                     &format,
                                     width, height,
                                     0,
                                     dstData,
                                     &status);
    if (status != CL_SUCCESS) {
        ClHelper::printError(status);
        throw MyError("failed to create image memory object 2", __FUNCTION__);
    }

    // カーネルの作成
    cl_kernel kernel;
    kernel = clCreateKernel(program, "softFocus", &status);
    if (kernel == (cl_kernel)0) {
        ClHelper::printError(status);
        throw MyError("failed to create kernel", __FUNCTION__);
    }

    // カーネル関数引数のセット
    status += clSetKernelArg(kernel, 0, sizeof(src_mem), (void *)&src_mem);
    status += clSetKernelArg(kernel, 1, sizeof(dst_mem), (void *)&dst_mem);
    if (status != 0) {
        printf("clSetKernelArg failed\n");
        throw MyError("failed to set kernel arguments.", __FUNCTION__);
    }

    // カーネル実行のリクエスト
    cl_uint work_dim = 2; // x, y
    size_t global_work_size[] = {width - 2, height - 2};

    status = clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL,
                                    global_work_size, 0, 0, NULL, NULL);
    if (status != CL_SUCCESS) {
        ClHelper::printError(status);
        throw MyError("clEnqueueNDRangeKernel failed.", __FUNCTION__);
    }

    // 画像データの取得
    const size_t origin[] = {0, 0, 0};
    const size_t region[] = {width, height, 1};
    status = clEnqueueReadImage(queue,
                                dst_mem,
                                CL_TRUE,
                                origin,
                                region,
                                0, 0,
                                dstData,
                                NULL, 0, NULL);

    if (status != CL_SUCCESS) {
        ClHelper::printError(status);
        throw MyError("failed to read image buffer", __FUNCTION__);
    }
                                
    // リソースの解放
    clReleaseMemObject(dst_mem);
    clReleaseMemObject(src_mem);
}
コード例 #4
0
/*! \brief Calls an OpenCL kernel from OpenVX Graph.
 * Steps:
 * \arg Find the target
 * \arg Get the vxcl context
 * \arg Find the kernel (to get cl kernel information)
 * \arg for each input parameter that is an object, enqueue write
 * \arg wait for finish
 * \arg for each parameter, SetKernelArg
 * \arg call kernel
 * \arg wait for finish
 * \arg for each output parameter that is an object, enqueue read
 * \arg wait for finish
 * \note This implementation will attempt to use the External API as much as possible,
 * but will cast to internal representation when needed (due to lack of API or
 * need for secret information). This is not an optimal OpenCL invocation.
 */
vx_status vxclCallOpenCLKernel(vx_node node, const vx_reference *parameters, vx_uint32 num)
{
    vx_status status = VX_FAILURE;
    vx_context context = node->base.context;
    vx_target target = (vx_target_t *)&node->base.context->targets[node->affinity];
    vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration);
    vx_uint32 pidx, pln, didx, plidx, argidx;
    cl_int err = 0;
    size_t off_dim[3] = {0,0,0};
    size_t work_dim[3];
    //size_t local_dim[3];
    cl_event writeEvents[VX_INT_MAX_PARAMS];
    cl_event readEvents[VX_INT_MAX_PARAMS];
    cl_int we = 0, re = 0;

    vxSemWait(&target->base.lock);

    // determine which platform to use
    plidx = 0;

    // determine which device to use
    didx = 0;

    /* for each input/bi data object, enqueue it and set the kernel parameters */
    for (argidx = 0, pidx = 0; pidx < num; pidx++)
    {
        vx_reference ref = node->parameters[pidx];
        vx_enum dir = node->kernel->signature.directions[pidx];
        vx_enum type = node->kernel->signature.types[pidx];
        vx_memory_t *memory = NULL;

        switch (type)
        {
            case VX_TYPE_ARRAY:
                memory = &((vx_array)ref)->memory;
                break;
            case VX_TYPE_CONVOLUTION:
                memory = &((vx_convolution)ref)->base.memory;
                break;
            case VX_TYPE_DISTRIBUTION:
                memory = &((vx_distribution)ref)->memory;
                break;
            case VX_TYPE_IMAGE:
                memory = &((vx_image)ref)->memory;
                break;
            case VX_TYPE_LUT:
                memory = &((vx_lut_t*)ref)->memory;
                break;
            case VX_TYPE_MATRIX:
                memory = &((vx_matrix)ref)->memory;
                break;
            //case VX_TYPE_PYRAMID:
            //    break;
            case VX_TYPE_REMAP:
                memory = &((vx_remap)ref)->memory;
                break;
            //case VX_TYPE_SCALAR:
            //case VX_TYPE_THRESHOLD:
            //    break;
        }
        if (memory) {
            for (pln = 0; pln < memory->nptrs; pln++) {
                if (memory->cl_type == CL_MEM_OBJECT_BUFFER) {
                    if (type == VX_TYPE_IMAGE) {

                        /* set the work dimensions */
                        work_dim[0] = memory->dims[pln][VX_DIM_X];
                        work_dim[1] = memory->dims[pln][VX_DIM_Y];

                        // width, height, stride_x, stride_y
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->dims[pln][VX_DIM_X]);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->dims[pln][VX_DIM_Y]);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]);
                        VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 4 parameters\n");
                    } else if (type == VX_TYPE_ARRAY || type == VX_TYPE_LUT) {
                        vx_array arr = (vx_array)ref;
                        // sizeof item, active count, capacity
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->item_size);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->num_items); // this is output?
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->capacity);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &arr->memory.strides[VX_DIM_X]);
                        VX_PRINT(VX_ZONE_INFO, "Setting vx_buffer as Buffer with 4 parameters\n");
                    } else if (type == VX_TYPE_MATRIX) {
                        vx_matrix mat = (vx_matrix)ref;
                        // columns, rows
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&mat->columns);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&mat->rows);
                        VX_PRINT(VX_ZONE_INFO, "Setting vx_matrix as Buffer with 2 parameters\n");
                    } else if (type == VX_TYPE_DISTRIBUTION) {
                        vx_distribution dist = (vx_distribution)ref;
                        // num, range, offset, winsize
                        vx_uint32 range = dist->memory.dims[0][VX_DIM_X] * dist->window_x;
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->memory.dims[VX_DIM_X]);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&range);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->offset_x);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->window_x);
                    } else if (type == VX_TYPE_CONVOLUTION) {
                        vx_convolution conv = (vx_convolution)ref;
                        // columns, rows, scale
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->base.columns);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->base.rows);
                        err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->scale);
                    }
                    err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(cl_mem), &memory->hdls[pln]);
                    CL_ERROR_MSG(err, "clSetKernelArg");
                    if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL)
                    {
                        err = clEnqueueWriteBuffer(context->queues[plidx][didx],
                                                   memory->hdls[pln],
                                                   CL_TRUE,
                                                   0,
                                                   vxComputeMemorySize(memory, pln),
                                                   memory->ptrs[pln],
                                                   0,
                                                   NULL,
                                                   &ref->event);
                    }
                } else if (memory->cl_type == CL_MEM_OBJECT_IMAGE2D) {
                    vx_rectangle_t rect = {0};
                    vx_image image = (vx_image)ref;
                    vxGetValidRegionImage(image, &rect);
                    size_t origin[3] = {rect.start_x, rect.start_y, 0};
                    size_t region[3] = {rect.end_x-rect.start_x, rect.end_y-rect.start_y, 1};
                    /* set the work dimensions */
                    work_dim[0] = rect.end_x-rect.start_x;
                    work_dim[1] = rect.end_y-rect.start_y;
                    VX_PRINT(VX_ZONE_INFO, "Setting vx_image as image2d_t wd={%zu,%zu} arg:%u\n",work_dim[0], work_dim[1], argidx);
                    err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(cl_mem), &memory->hdls[pln]);
                    CL_ERROR_MSG(err, "clSetKernelArg");
                    if (err != CL_SUCCESS) {
                        VX_PRINT(VX_ZONE_ERROR, "Error Calling Kernel %s, parameter %u\n", node->kernel->name, pidx);
                    }
                    if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL)
                    {
                        err = clEnqueueWriteImage(context->queues[plidx][didx],
                                                  memory->hdls[pln],
                                                  CL_TRUE,
                                                  origin, region,
                                                  memory->strides[pln][VX_DIM_Y],
                                                  0,
                                                  memory->ptrs[pln],
                                                  0, NULL,
                                                  &ref->event);
                        CL_ERROR_MSG(err, "clEnqueueWriteImage");
                    }
                }
            }
        } else {
            if (type == VX_TYPE_SCALAR) {
                vx_value_t value; // largest platform atomic
                vx_size size = 0ul;
                vx_scalar sc = (vx_scalar)ref;
                vx_enum stype = VX_TYPE_INVALID;
                vxReadScalarValue(sc, &value);
                vxQueryScalar(sc, VX_SCALAR_ATTRIBUTE_TYPE, &stype, sizeof(stype));
                size = vxSizeOfType(stype);
                err = clSetKernelArg(vxclk->kernels[plidx], argidx++, size, &value);
            }
            else if (type == VX_TYPE_THRESHOLD) {
                vx_enum ttype = 0;
                vx_threshold th = (vx_threshold)ref;
                vxQueryThreshold(th, VX_THRESHOLD_ATTRIBUTE_TYPE, &ttype, sizeof(ttype));
                if (ttype == VX_THRESHOLD_TYPE_BINARY) {
                    err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->value);
                } else if (ttype == VX_THRESHOLD_TYPE_RANGE) {
                    err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->lower);
                    err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->upper);
                }
            }
        }
    }
    we = 0;
    for (pidx = 0; pidx < num; pidx++) {
        vx_reference ref = node->parameters[pidx];
        vx_enum dir = node->kernel->signature.directions[pidx];
        if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) {
            memcpy(&writeEvents[we++],&ref->event, sizeof(cl_event));
        }
    }
    //local_dim[0] = 1;
    //local_dim[1] = 1;
    err = clEnqueueNDRangeKernel(context->queues[plidx][didx],
                                 vxclk->kernels[plidx],
                                 2,
                                 off_dim,
                                 work_dim,
                                 NULL,
                                 we, writeEvents, &node->base.event);

    CL_ERROR_MSG(err, "clEnqueueNDRangeKernel");
    /* enqueue a read on all output data */
    for (pidx = 0; pidx < num; pidx++)
    {
        vx_reference ref = node->parameters[pidx];
        vx_enum dir = node->kernel->signature.directions[pidx];
        vx_enum type = node->kernel->signature.types[pidx];

        if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL)
        {
            vx_memory_t *memory = NULL;

            switch (type)
            {
                case VX_TYPE_ARRAY:
                    memory = &((vx_array)ref)->memory;
                    break;
                case VX_TYPE_CONVOLUTION:
                    memory = &((vx_convolution)ref)->base.memory;
                    break;
                case VX_TYPE_DISTRIBUTION:
                    memory = &((vx_distribution)ref)->memory;
                    break;
                case VX_TYPE_IMAGE:
                    memory = &((vx_image)ref)->memory;
                    break;
                case VX_TYPE_LUT:
                    memory = &((vx_lut_t*)ref)->memory;
                    break;
                case VX_TYPE_MATRIX:
                    memory = &((vx_matrix)ref)->memory;
                    break;
                //case VX_TYPE_PYRAMID:
                //    break;
                case VX_TYPE_REMAP:
                    memory = &((vx_remap)ref)->memory;
                    break;
                //case VX_TYPE_SCALAR:
                //case VX_TYPE_THRESHOLD:
                //    break;
            }
            if (memory) {
                for (pln = 0; pln < memory->nptrs; pln++) {
                    if (memory->cl_type == CL_MEM_OBJECT_BUFFER) {
                        err = clEnqueueReadBuffer(context->queues[plidx][didx],
                            memory->hdls[pln],
                            CL_TRUE, 0, vxComputeMemorySize(memory, pln),
                            memory->ptrs[pln],
                            1, &node->base.event, &ref->event);
                        CL_ERROR_MSG(err, "clEnqueueReadBuffer");
                    } else if (memory->cl_type == CL_MEM_OBJECT_IMAGE2D) {
                        vx_rectangle_t rect = {0};
                        vx_image image = (vx_image)ref;
                        vxGetValidRegionImage(image, &rect);
                        size_t origin[3] = {rect.start_x, rect.start_y, 0};
                        size_t region[3] = {rect.end_x-rect.start_x, rect.end_y-rect.start_y, 1};
                        /* set the work dimensions */
                        work_dim[0] = rect.end_x-rect.start_x;
                        work_dim[1] = rect.end_y-rect.start_y;
                        err = clEnqueueReadImage(context->queues[plidx][didx],
                                                  memory->hdls[pln],
                                                  CL_TRUE,
                                                  origin, region,
                                                  memory->strides[pln][VX_DIM_Y],
                                                  0,
                                                  memory->ptrs[pln],
                                                  1, &node->base.event,
                                                  &ref->event);
                        CL_ERROR_MSG(err, "clEnqueueReadImage");
                        VX_PRINT(VX_ZONE_INFO, "Reading Image wd={%zu,%zu}\n", work_dim[0], work_dim[1]);
                    }
                }
            }
        }
    }
    re = 0;
    for (pidx = 0; pidx < num; pidx++) {
        vx_reference ref = node->parameters[pidx];
        vx_enum dir = node->kernel->signature.directions[pidx];
        if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) {
            memcpy(&readEvents[re++],&ref->event, sizeof(cl_event));
        }
    }
    err = clFlush(context->queues[plidx][didx]);
    CL_ERROR_MSG(err, "Flush");
    VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n");
    clWaitForEvents(re, readEvents);
    if (err == CL_SUCCESS)
        status = VX_SUCCESS;
//exit:
    VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status);
    vxSemPost(&target->base.lock);
    return status;
}
コード例 #5
0
int main()
{
	int i,j,k;
	// nb of operations:
	const int dsize = 512;
	int nthreads = 1;
	int nbOfAverages = 1e2;
	int opsMAC = 2; // operations per MAC
	cl_short4 *in, *out;
	cl_half *ck;
	double tops; //total ops

#define NQUEUES 1
	cl_int err;
	cl_platform_id platform = 0;
	cl_device_id device = 0;
	cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
	cl_context ctx = 0;
	cl_command_queue queues[NQUEUES];
	cl_mem bufin, bufck, bufout;
	cl_event event = NULL;
	cl_program program;
	cl_kernel kernel;
	size_t global[2], local[2];
	size_t param[5];
	char version[300];
  
	// allocate matrices
	
	in = (cl_short4 *) calloc(dsize*dsize, sizeof(*in));
	out = (cl_short4 *) calloc(dsize*dsize, sizeof(*out));
	ck = (cl_half *) calloc(9*9, sizeof(*ck));
	in[0].x = 0x3c00;
	in[1].x = 0x4000;
	in[dsize].x = 0x4100;
	ck[0] = 0x3c00;
	ck[1] = 0x4000;
	ck[9] = 0x3000;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs( 1, &platform, NULL );
    err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
    for(i = 0; i < NQUEUES; i++)
    	queues[i] = clCreateCommandQueue( ctx, device, 0, &err );

	// Print some info about the system
	clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(version), version, NULL);
	printf("CL_DEVICE_VERSION=%s\n", version);
	clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(version), version, NULL);
	printf("CL_DRIVER_VERSION=%s\n", version);
	program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err);
	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(param[0]), param, NULL);
	printf("CL_DEVICE_LOCAL_MEM_SIZE=%d\n", (int)param[0]);
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(param[0]), param, NULL);
	printf("CL_DEVICE_MAX_WORK_GROUP_SIZE=%d\n", (int)param[0]);
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(param[0]), param, NULL);
	printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS=%d\n", (int)param[0]);
	j = param[0];
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(param[0])*j, param, NULL);
	printf("CL_DEVICE_MAX_WORK_ITEM_SIZES=");
	for(i = 0; i < j; i++)
		printf("%d ", (int)param[i]);
	printf("\n");
        clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(param[0]), param, NULL);
        printf("CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE=%d\n", (int)param[0]);
		
		
	program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err);
	if(!program)
	{
		printf("Error creating program\n");
		return -1;
	}
	err = clBuildProgram(program, 0, 0, 0, 0, 0);
	if(err != CL_SUCCESS)
	{
		char buffer[20000];
		size_t len;
		
		clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		puts(buffer);
		return -1;
	}
	kernel = clCreateKernel(program, "conv9x9", &err);
	if(!kernel || err != CL_SUCCESS)
	{
		printf("Error creating kernel\n");
		return -1;
	}

    /* Prepare OpenCL memory objects and place matrices inside them. */
	cl_image_format fmt = {CL_RGBA, CL_HALF_FLOAT};
	cl_int rc;
	bufin = clCreateImage2D(ctx, CL_MEM_READ_ONLY, &fmt, dsize, dsize, 0, 0, &rc);
	bufout = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, dsize, dsize, 0, 0, &rc);
    bufck = clCreateBuffer( ctx, CL_MEM_READ_ONLY, 9 * 9 * sizeof(*ck),
                          NULL, &err );

	size_t origin[3] = {0,0,0};
	size_t region[3] = {dsize, dsize, 1};
    err = clEnqueueWriteImage(queues[0], bufin, CL_TRUE, origin, region, dsize * sizeof(*in), 0, in, 0, NULL, NULL );
    err = clEnqueueWriteBuffer( queues[0], bufck, CL_TRUE, 0, 9 * 9 * sizeof( *ck ), ck, 0, NULL, NULL );
	clSetKernelArg(kernel, 0, sizeof(int), &dsize);
	clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufin);
	clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufck);
	clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufout);
	local[0] = 8;
	local[1] = 8;
	global[0] = global[1] = dsize-32;
    usleep(100000);

	struct timeval start,end;
	gettimeofday(&start, NULL);

	for (k=0; k<nthreads; k++) {
		//printf("Hello from thread %d, nthreads %d\n", omp_get_thread_num(), omp_get_num_threads());
		for(i=0;i<nbOfAverages;i++) {
		// do the 2D convolution
			err = clEnqueueNDRangeKernel(queues[0], kernel, 2, NULL, global, local, 0, NULL, NULL);
			if(err != CL_SUCCESS)
			{
				printf("clEnqueueNDRangeKernel error %d\n", err);
				return -1;
			}
		}
	}

	clFinish(queues[0]);
	gettimeofday(&end, NULL);
	double t = ((double) (end.tv_sec - start.tv_sec))
	+ ((double) (end.tv_usec - start.tv_usec)) / 1e6; //reports time in [s] - verified!

    /* Wait for calculations to be finished. */

    /* Fetch results of calculations from GPU memory. */
    err = clEnqueueReadImage(queues[0], bufout, CL_TRUE, origin, region, dsize * sizeof(*out), 0, out, 0, NULL, NULL );
	clFinish(queues[0]);
	
	printf("%x %x %x %x\n", out[0].x, out[1].x, out[dsize].x, out[dsize+1].x);

    /* Release OpenCL memory objects. */
    clReleaseMemObject( bufin );
    clReleaseMemObject( bufck );
    clReleaseMemObject( bufout );

    /* Release OpenCL working objects. */
    for(i = 0; i < NQUEUES; i++)
    	clReleaseCommandQueue( queues[i] );
    clReleaseContext( ctx );
	
	// report performance:
	tops = 4 * nthreads * opsMAC * (dsize-32)*(dsize-32)*9*9; // total ops
	printf("Total M ops = %.0lf, # of threads = %d", nbOfAverages*tops*1e-6, nthreads);
	printf("\nTime in s: %lf:", t);
	printf("\nTest performance [G OP/s] %lf:", tops*nbOfAverages/t*1e-9);
	printf("\n");
	return(0);
}
コード例 #6
0
ファイル: cl_image.c プロジェクト: ddantas/opencl_demos
void clInvert3D(CL* cl, VglImage* img){
	cl_int err;
    cl_image_desc desc    = getDesc(img->shape[0], img->shape[1], 3, img->shape[2]);
    cl_image_desc descOut = getDesc(img->shape[0], img->shape[1], 3, img->shape[2]);
    cl_image_format src;
    cl_image_format out;
    switch(img->nChannels){
		case 1:
			src.image_channel_order = CL_A;
			out.image_channel_order = CL_A;
			break;
		case 3:
			rgb2rgba(NULL, img);
			src.image_channel_order = CL_RGBA;
			out.image_channel_order = CL_RGBA;
			break;
		case 4:
			src.image_channel_order = CL_RGBA;
			out.image_channel_order = CL_RGBA;
			break;
		default:
			printf("Numero de canais não suportado\n");
			exit;
	}
    src.image_channel_data_type = CL_UNORM_INT8;
    out.image_channel_data_type = CL_UNORM_INT8;
       
    cl_mem src_mem = 
    clCreateImage(cl->context, CL_MEM_READ_ONLY, &src, &desc, NULL, &err);
    printf("IMAGE STATUS SOURCE\t"); cl_error(err);
    
    cl_mem out_mem = 
	clCreateImage(cl->context, CL_MEM_WRITE_ONLY, &out, &descOut, NULL, &err);
	printf("IMAGE STATUS OUT\t"); cl_error(err);
    
    clGetMemObjectInfo(src_mem, CL_MEM_TYPE, sizeof(cl_int), &err, NULL);
    if(err == CL_MEM_OBJECT_IMAGE3D)
		printf("IMAGE TYPE:\t\tCL_MEM_OBJECT_IMAGE3D\n");
	
	size_t *src_origin=(size_t*)malloc(sizeof(size_t)*3);
	src_origin[0] = 0;
	src_origin[1] = 0;
	src_origin[2] = 0;
	
	size_t *src_region=(size_t*)malloc(sizeof(size_t)*3);
	src_region[0] = img->shape[0];
	src_region[1] = img->shape[1];
	src_region[2] = img->shape[2];
		
	err = clEnqueueWriteImage(cl->queue, src_mem, CL_TRUE,
	src_origin, src_region, 0, 0, (void*)img->ndarray, 0, 0, NULL);
	printf("ENQUEUE IMAGE STATUS "); cl_error(err);
	
	cl_program program;
	cl_kernel kernel;
	
	const char* k  = "./CLdemos/CL/Invert3D_RGBA.cl";
	const char* k2 = "./CLdemos/CL/Invert3D_A.cl";
	char** fonte;
	if(img->nChannels==1)
		fonte = (char**)getKernelPtr(k2);
	if(img->nChannels==4)
		fonte = (char**)getKernelPtr(k);
	
	program = clCreateProgramWithSource(cl->context, 1, (const char**)fonte, NULL, &err);
	printf("CREATE PROGRAM STATUS: "); cl_error(err);
	clBuildProgram(program, 0, NULL, NULL, NULL, &err);
	printf("BUILD PROGRAM STATUS: "); cl_error(err);
	kernel = clCreateKernel(program, "invert", &err);
	printf("KERNEL STATUS "); cl_error(err);
	
	
	err = clSetKernelArg( kernel, 0, sizeof( cl_mem ), (void *) &src_mem);
	printf("SET 1 KERNEL ARG "); cl_error(err);
	err = clSetKernelArg( kernel, 1, sizeof( cl_mem ), (void *) &out_mem);
	printf("SET 2 KERNEL ARG "); cl_error(err);

	size_t worksize[] = { img->shape[0], img->shape[1], img->shape[2]};
	err = clEnqueueNDRangeKernel(cl->queue, kernel, 2, NULL, worksize,
	0, 0, 0, 0);
	printf("ENQUEUE ND KERNEL STATUS "); cl_error(err);
	
	clFinish(cl->queue);
	
	char* auxout = (char*)malloc(img->shape[0]*img->shape[1]*img->shape[2]*img->nChannels);
	err = clEnqueueReadImage(cl->queue, out_mem, CL_TRUE, 
	src_origin, src_region, 0, 0, auxout, 0, NULL, NULL);
	printf("READ NEW IMAGE STATUS "); cl_error(err);
	
	for(int i=0; i<(img->shape[0]*img->nChannels*img->shape[1]*img->shape[2]); i++)
        img->ndarray[i] = auxout[i];
        
    free(auxout);    
    clReleaseKernel(kernel);
    clReleaseProgram(program);
}
コード例 #7
0
END_TEST

START_TEST (test_read_write_image)
{
    cl_device_id device;
    cl_context ctx;
    cl_command_queue queue;
    cl_mem image2d, part2d;
    cl_int result;

    cl_platform_id platform      = 0;
    cl_uint        num_platforms = 0;
    clGetPlatformIDs(1, &platform, &num_platforms);

    unsigned char image2d_data_24bpp[3*3*4] = {
        255, 0, 0, 0,       0, 255, 0, 0,       128, 128, 128, 0,
        0, 0, 255, 0,       255, 255, 0, 0,     0, 128, 0, 0,
        255, 128, 0, 0,     128, 0, 255, 0,     0, 0, 0, 0
    };

    unsigned char image2d_part_24bpp[2*2*4] = {
        255, 0, 0, 0,       0, 255, 0, 0,
        0, 0, 255, 0,       255, 255, 0, 0
    };

    unsigned char image2d_buffer[3*3*4];
    unsigned char image2d_part[2*2*4];

    cl_image_format fmt;

    fmt.image_channel_data_type = CL_UNORM_INT8;
    fmt.image_channel_order = CL_RGBA;

    size_t origin[3] = {0, 0, 0};
    size_t region[3] = {3, 3, 1};

    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
    fail_if(
        result != CL_SUCCESS,
        "unable to get the default device"
    );

    ctx = clCreateContext(0, 1, &device, 0, 0, &result);
    fail_if(
        result != CL_SUCCESS || ctx == 0,
        "unable to create a valid context"
    );

    queue = clCreateCommandQueue(ctx, device, 0, &result);
    fail_if(
        result != CL_SUCCESS || queue == 0,
        "cannot create a command queue"
    );

    image2d = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt,
                              3, 3, 0, image2d_buffer, &result);
    fail_if(
        result != CL_SUCCESS || image2d == 0,
        "cannot create a valid 3x3 image2D"
    );

    part2d = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt,
                             2, 2, 0, image2d_part, &result);
    fail_if(
        result != CL_SUCCESS || image2d == 0,
        "cannot create a valid 2x2 image2D"
    );

    // Write data in buffer
    result = clEnqueueWriteImage(queue, image2d, 1, origin, region, 0, 0,
                                 image2d_data_24bpp, 0, 0, 0);
    fail_if(
        result != CL_SUCCESS,
        "cannot enqueue a blocking write image event"
    );

    // Read it back
    region[0] = 2;
    region[1] = 2;

    result = clEnqueueReadImage(queue, image2d, 1, origin, region, 0, 0,
                                image2d_part, 0, 0, 0);
    fail_if(
        result != CL_SUCCESS,
        "cannot enqueue a blocking read image event"
    );

    // Compare
 #if 0 // images not supported
    fail_if(
        std::memcmp(image2d_part, image2d_part_24bpp, sizeof(image2d_part)) != 0,
        "reading and writing images doesn't produce the correct result"
    );
#endif

    // Read it back using a buffer
    cl_event event;
    std::memset(image2d_part, 0, sizeof(image2d_part));

    result = clEnqueueCopyImage(queue, image2d, part2d, origin, origin,
                                region, 0, 0, &event);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue a copy image event"
    );

    result = clWaitForEvents(1, &event);
    fail_if(
        result != CL_SUCCESS,
        "unable to wait for events"
    );

    // Compare
#if 0 // images not supported
    fail_if(
        std::memcmp(image2d_part, image2d_part_24bpp, sizeof(image2d_part)) != 0,
        "copying images doesn't produce the correct result"
    );
#endif

    clReleaseEvent(event);
    clReleaseMemObject(part2d);
    clReleaseMemObject(image2d);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
}