示例#1
0
template <typename T> cl_int DWTKernel<T>::run(T* in, int sizeX, int sizeY, int levels){

	if (!in)
		return CL_INVALID_VALUE;

	cl_int error_code;
	cl_context context  = NULL;

    // Obtain the OpenCL context from the command-queue properties
	error_code = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &context, NULL);
	if (CL_SUCCESS != error_code)
	{
		LogError("Error: clGetCommandQueueInfo (CL_QUEUE_CONTEXT) returned %s.\n", TranslateOpenCLError(error_code));
		return error_code;
	}

	// allocate memory on device
	srcMem = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeX * sizeY * sizeof(T), in, &error_code);
    if (CL_SUCCESS != error_code)
    {
        LogError("Error: clCreateBuffer (in) returned %s.\n", TranslateOpenCLError(error_code));
        return error_code;
    }

	dstMem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeX * sizeY * sizeof(T), NULL, &error_code);
    if (CL_SUCCESS != error_code)
    {
        LogError("Error: clCreateBuffer (out) returned %s.\n", TranslateOpenCLError(error_code));
        return error_code;
    }
	ownsMemory = true;
	run(srcMem, dstMem, sizeX, sizeY, levels);
	return CL_SUCCESS;
}
示例#2
0
/*
 * Set kernel arguments
 */
cl_uint SetKernelArguments(ocl_args_d_t *ocl)
{
    cl_int err = CL_SUCCESS;

    err = clSetKernelArg(ocl->kernel, 0, sizeof(cl_mem), (void *)&ocl->srcA);
    if (CL_SUCCESS != err)
    {
        LogError("error: Failed to set argument srcA, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl->kernel, 1, sizeof(cl_mem), (void *)&ocl->srcB);
    if (CL_SUCCESS != err)
    {
        LogError("Error: Failed to set argument srcB, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl->kernel, 2, sizeof(cl_mem), (void *)&ocl->dstMem);
    if (CL_SUCCESS != err)
    {
        LogError("Error: Failed to set argument dstMem, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    return err;
}
示例#3
0
/*
 * Execute the kernel
 */
cl_uint ExecuteAddKernel(ocl_args_d_t *ocl, cl_uint width, cl_uint height)
{
    cl_int err = CL_SUCCESS;

    // Define global iteration space for clEnqueueNDRangeKernel.
    size_t globalWorkSize[2] = { width, height };


    // execute kernel
    err = clEnqueueNDRangeKernel(ocl->commandQueue, ocl->kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
    if (CL_SUCCESS != err)
    {
        LogError("Error: Failed to run kernel, return %s\n", TranslateOpenCLError(err));
        return err;
    }

    // Wait until the queued kernel is completed by the device
    err = clFinish(ocl->commandQueue);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clFinish return %s\n", TranslateOpenCLError(err));
        return err;
    }

    return CL_SUCCESS;
}
示例#4
0
/*
 * Create and build OpenCL program from its source code
 */
int CreateAndBuildProgram(ocl_args_d_t *ocl)
{
    cl_int err = CL_SUCCESS;

    // Upload the OpenCL C source code from the input file to source
    // The size of the C program is returned in sourceSize
    char* source = NULL;
    size_t src_size = 0;
    err = ReadSourceFromFile("Template.cl", &source, &src_size);
    if (CL_SUCCESS != err)
    {
        LogError("Error: ReadSourceFromFile returned %s.\n", TranslateOpenCLError(err));
        goto Finish;
    }

    // And now after you obtained a regular C string call clCreateProgramWithSource to create OpenCL program object.
    ocl->program = clCreateProgramWithSource(ocl->context, 1, (const char**)&source, &src_size, &err);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clCreateProgramWithSource returned %s.\n", TranslateOpenCLError(err));
        goto Finish;
    }

    // Build the program
    // During creation a program is not built. You need to explicitly call build function.
    // Here you just use create-build sequence,
    // but there are also other possibilities when program consist of several parts,
    // some of which are libraries, and you may want to consider using clCompileProgram and clLinkProgram as
    // alternatives.
    err = clBuildProgram(ocl->program, 1, &ocl->device, "", NULL, NULL);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));

        // In case of error print the build log to the standard output
        // First check the size of the log
        // Then allocate the memory and obtain the log from the program
        if (err == CL_BUILD_PROGRAM_FAILURE)
        {
            size_t log_size = 0;
            clGetProgramBuildInfo(ocl->program, ocl->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

            std::vector<char> build_log(log_size);
            clGetProgramBuildInfo(ocl->program, ocl->device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL);

            LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]);
        }
    }

Finish:
    if (source)
    {
        delete[] source;
        source = NULL;
    }

    return err;
}
示例#5
0
cl_uint CreateBufferArguments()
{

    cl_int err = CL_SUCCESS;

    // Create new OpenCL buffer objects
    // As these buffer are used only for read by the kernel, you are recommended to create it with flag CL_MEM_READ_ONLY.
    // Always set minimal read/write flags for buffers, it may lead to better performance because it allows runtime
    // to better organize data copying.
    // You use CL_MEM_COPY_HOST_PTR here, because the buffers should be populated with bytes at inputA and inputB.

    ocl.Lights = clCreateBuffer(ocl.context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(RectangleLight) * masterSet.LightCount, masterSet.m_rectLight, &err);
    if (CL_SUCCESS != err)
    {
        printf("Error: clCreateBuffer for Lights returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    ocl.LightCount = masterSet.LightCount;

    ocl.Shapes = clCreateBuffer(ocl.context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(Plane) * masterSet.PlaneCount, masterSet.m_plane, &err);
    if (CL_SUCCESS != err)
    {
        printf("Error: clCreateBuffer for Shapes returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    ocl.ShapeCount = masterSet.PlaneCount;

    ocl.sampleCount = SampleCount;

    ocl.cam = clCreateBuffer(ocl.context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(Camera), &cam, &err);
    if (CL_SUCCESS != err)
    {
        printf("Error: clCreateBuffer for cam returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    ocl.width = Width;
    ocl.height = Height;
    /*
    ocl.Pixels = clCreateBuffer(ocl.context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_uint) * WorkAmount, NULL, &err);
    if (CL_SUCCESS != err)
    {
        printf("Error: clCreateBuffer for Pixels returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    ocl.Seeds = clCreateBuffer(ocl.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint) * WorkAmount * 2, Seeds, &err);
    if (CL_SUCCESS != err)
    {
        printf("Error: clCreateBuffer for Seeds returned %s\n", TranslateOpenCLError(err));
        return err;
    }
    */
    return CL_SUCCESS;
}
示例#6
0
template <typename T> cl_int DWTKernel<T>::setWindowKernelArgs(int WIN_SX, int WIN_SY) {
	cl_int error_code = clSetKernelArg(myKernel, 0, sizeof(int), &WIN_SX);
	if (CL_SUCCESS != error_code)
	{
		LogError("Error: clSetKernelArg returned %s.\n", TranslateOpenCLError(error_code));
		return error_code;
	}
	error_code = clSetKernelArg(myKernel, 1, sizeof(int), &WIN_SY);
	if (CL_SUCCESS != error_code)
	{
		LogError("Error: clSetKernelArg returned %s.\n", TranslateOpenCLError(error_code));
		return error_code;
	}
	return CL_SUCCESS;
}
示例#7
0
文件: OCLDWT.cpp 项目: knopkem/roger
template<typename T> tDeviceRC OCLDWT<T>::setKernelArgs(OCLKernel* myKernel,unsigned int width, unsigned int height,unsigned int steps, unsigned int level, unsigned int levels) {
    numKernelArgs = 0;
    cl_kernel targetKernel = myKernel->getKernel();
    cl_int error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(cl_mem),  memoryManager->getDwtIn(level));
    if (DeviceSuccess != error_code)
    {
        LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code));
        return error_code;
    }

    error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(cl_mem), (level < levels-1) ?
                                memoryManager->getDwtIn(level+1) :  memoryManager->getDWTOut() );
    if (DeviceSuccess != error_code)
    {
        LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code));
        return error_code;
    }

    error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(cl_mem), memoryManager->getDWTOut());
    if (DeviceSuccess != error_code)
    {
        LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code));
        return error_code;
    }

    error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(width), &width);
    if (DeviceSuccess != error_code)
    {
        LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code));
        return error_code;
    }
    error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(height), &height);
    if (DeviceSuccess != error_code)
    {
        LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code));
        return error_code;
    }

    error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(steps), &steps);
    if (DeviceSuccess != error_code)
    {
        LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code));
        return error_code;
    }
    error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(level), &level);
    if (DeviceSuccess != error_code)
    {
        LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code));
        return error_code;
    }
    error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(levels), &levels);
    if (DeviceSuccess != error_code)
    {
        LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code));
        return error_code;
    }
    return DeviceSuccess;
}
示例#8
0
int OCLDeviceManager::init(eDeviceType type) {
    bool isCpu = type == CPU;
    ocl_args_d_t** oclArgs;
    if (isCpu) {
        if (ocl_cpu)
            return 0;
        ocl_cpu = new ocl_args_d_t();
        oclArgs = &ocl_cpu;

    } else {

        if (ocl_gpu)
            return 0;
        ocl_gpu = new ocl_args_d_t();
        oclArgs = &ocl_gpu;

    }
    data_args_d_t args;
    args.preferGpu = !isCpu;
    args.preferCpu = isCpu;
    args.vendorName = NULL;
    int error_code;
    error_code = InitOpenCL(*oclArgs, &args);
    if (CL_SUCCESS != error_code)
    {
        LogError("InitOpenCL returned %s.", TranslateOpenCLError(error_code));
        delete *oclArgs;
        *oclArgs = NULL;;
    }
    return error_code;

}
示例#9
0
template <typename T> cl_int DWTKernel<T>::setImageSizeKernelArgs(int sx, int sy) {

	cl_int error_code = clSetKernelArg(myKernel, 5, sizeof(int), &sx);
	if (CL_SUCCESS != error_code)
	{
		LogError("Error: clSetKernelArg returned %s.\n", TranslateOpenCLError(error_code));
		return error_code;
	}
	error_code = clSetKernelArg(myKernel, 6, sizeof(int), &sy);
	if (CL_SUCCESS != error_code)
	{
		LogError("Error: clSetKernelArg returned %s.\n", TranslateOpenCLError(error_code));
		return error_code;
	}
	return CL_SUCCESS;
}
示例#10
0
template <typename T> tDeviceRC DWTKernel<T>::copyLLBandToSrc(int LLSizeX, int LLSizeY){
	  // copy forward or reverse transformed LL band from output back into the input
	size_t bufferOffset[] = { 0, 0, 0};
	cl_int err = CL_SUCCESS;

	// The region size must be given in bytes
	size_t region[] = {LLSizeX * sizeof(T), LLSizeY, 1 };
			
	err = clEnqueueCopyBufferRect ( queue, 	//copy command will be queued
				    dstMem,		
					srcMem,		
					bufferOffset,	//offset associated with src_buffer
					bufferOffset,     //offset associated with src_buffer
					region,		//(width, height, depth) in bytes of the 2D or 3D rectangle being copied
					region[0],   //length of each row in bytes
					0, //length of each 2D slice in bytes 
					region[0] ,   //length of each row in bytes
					0, //length of each 2D slice in bytes
					0,
					NULL,
					NULL);
	if (CL_SUCCESS != err)
	{
		LogError("Error: clEnqueueCopyBufferRect (srcMem) returned %s.\n", TranslateOpenCLError(err));
	}
	return err;

}
示例#11
0
cl_uint CreateAndBuildProgram()
{
    cl_int err = CL_SUCCESS;

    // Upload the OpenCL C source code from the input file to source
    // The size of the C program is returned in sourceSize
    char* source = NULL;
    size_t src_size = 0;
    err = ReadSourceFromFile("ray_algorithm.cl", &source, &src_size);
    if (CL_SUCCESS != err)
    {
        printf("Error: ReadSourceFromFile returned %s.\n", TranslateOpenCLError(err));
        goto Finish;
    }

    // And now after you obtained a regular C string call clCreateProgramWithSource to create OpenCL program object.
    ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err);
    if (CL_SUCCESS != err)
    {
        printf("Error: clCreateProgramWithSource returned %s.\n", TranslateOpenCLError(err));
        goto Finish;
    }

    // Build the program
    // During creation a program is not built. You need to explicitly call build function.
    // Here you just use create-build sequence,
    // but there are also other possibilities when program consist of several parts,
    // some of which are libraries, and you may want to consider using clCompileProgram and clLinkProgram as
    // alternatives.
    err = clBuildProgram(ocl.program, 2, ocl.device, "", NULL, NULL);
    if (CL_SUCCESS != err)
    {
        printf("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
    }

Finish:
    if (source)
    {
        delete[] source;
        source = NULL;
    }

    return err;
}
示例#12
0
/*
 * "Read" the result buffer (mapping the buffer to the host memory address)
 */
bool ReadAndVerify(ocl_args_d_t *ocl, cl_uint width, cl_uint height, cl_int *inputA, cl_int *inputB)
{
    cl_int err = CL_SUCCESS;
    bool result = true;

    // Enqueue a command to map the buffer object (ocl->dstMem) into the host address space and returns a pointer to it
    // The map operation is blocking
    cl_int *resultPtr = (cl_int *)clEnqueueMapBuffer(ocl->commandQueue, ocl->dstMem, true, CL_MAP_READ, 0, sizeof(cl_uint) * width * height, 0, NULL, NULL, &err);

    if (CL_SUCCESS != err)
    {
        LogError("Error: clEnqueueMapBuffer returned %s\n", TranslateOpenCLError(err));
        return false;
    }

    // Call clFinish to guarantee that output region is updated
    err = clFinish(ocl->commandQueue);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clFinish returned %s\n", TranslateOpenCLError(err));
    }

    // We mapped dstMem to resultPtr, so resultPtr is ready and includes the kernel output !!!
    // Verify the results
    unsigned int size = width * height;
    for (unsigned int k = 0; k < size; ++k)
    {
        if (resultPtr[k] != inputA[k] + inputB[k])
        {
            LogError("Verification failed at %d: (%d + %d = %d)\n", k, inputA[k], inputB[k], resultPtr[k]);
            result = false;
        }
    }

    // Unmapped the output buffer before releasing it
    err = clEnqueueUnmapMemObject(ocl->commandQueue, ocl->dstMem, resultPtr, 0, NULL, NULL);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clEnqueueUnmapMemObject returned %s\n", TranslateOpenCLError(err));
    }

    return result;
}
示例#13
0
template <typename T> T* DWTKernel<T>::mapOutputBufferToHost(){
		
	cl_int error_code = CL_SUCCESS;
	void* hostPtr = clEnqueueMapBuffer(queue, dstMem, true, CL_MAP_READ, 0, dimX * dimY * sizeof(T), 0, NULL, NULL, &error_code);
    if (CL_SUCCESS != error_code)
    {
        LogError("Error: clEnqueueMapBuffer return %s.\n", TranslateOpenCLError(error_code));
    }
	return (T*)hostPtr;
	

}
示例#14
0
template <typename T> cl_int DWTKernel<T>::run(cl_mem in, cl_mem out, int sizeX, int sizeY, int levels){
	srcMem = in;
	dstMem = out;
	dimX = sizeX;
	dimY = sizeY;

	cl_int error_code = clSetKernelArg(myKernel, 3, sizeof(cl_mem), &srcMem);
	if (CL_SUCCESS != error_code)
	{
		LogError("Error: clSetKernelArg returned %s.\n", TranslateOpenCLError(error_code));
		return error_code;
	}
	error_code = clSetKernelArg(myKernel, 4, sizeof(cl_mem), &dstMem);
	if (CL_SUCCESS != error_code)
	{
		LogError("Error: clSetKernelArg returned %s.\n", TranslateOpenCLError(error_code));
		return error_code;
	}
	dwt(sizeX, sizeY, levels);
	return CL_SUCCESS;
}
示例#15
0
/*
 * Create OpenCL buffers from host memory
 * These buffers will be used later by the OpenCL kernel
 */
int CreateBufferArguments(ocl_args_d_t *ocl, cl_int* inputA, cl_int* inputB, cl_int* outputC, cl_uint arrayWidth, cl_uint arrayHeight)
{
    cl_int err = CL_SUCCESS;

    // Create new OpenCL buffer objects
    // As these buffer are used only for read by the kernel, you are recommended to create it with flag CL_MEM_READ_ONLY.
    // Always set minimal read/write flags for buffers, it may lead to better performance because it allows runtime
    // to better organize data copying.
    // You use CL_MEM_COPY_HOST_PTR here, because the buffers should be populated with bytes at inputA and inputB.

    ocl->srcA = clCreateBuffer(ocl->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_uint) * arrayWidth * arrayHeight, inputA, &err);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clCreateBuffer for srcA returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    ocl->srcB = clCreateBuffer(ocl->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_uint) * arrayWidth * arrayHeight, inputB, &err);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clCreateBuffer for srcB returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    // If the output buffer is created directly on top of output buffer using CL_MEM_USE_HOST_PTR,
    // then, depending on the OpenCL runtime implementation and hardware capabilities, 
    // it may save you not necessary data copying.
    // As it is known that output buffer will be write only, you explicitly declare it using CL_MEM_WRITE_ONLY.
    ocl->dstMem = clCreateBuffer(ocl->context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_uint) * arrayWidth * arrayHeight, outputC, &err);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clCreateBuffer for dstMem returned %s\n", TranslateOpenCLError(err));
        return err;
    }


    return CL_SUCCESS;
}
示例#16
0
template <typename T> DWTKernel<T>::~DWTKernel(void)
{
	cl_int error_code = CL_SUCCESS;
	if (ownsMemory) {

		// free memory on device
		if (srcMem) {
			error_code = clReleaseMemObject(srcMem);
			if (CL_SUCCESS != error_code)
			{
				LogError("Error: clReleaseMemObject (input) returned %s.\n", TranslateOpenCLError(error_code));
			}
		}

		if (dstMem) {
			error_code = clReleaseMemObject(dstMem);
			if (CL_SUCCESS != error_code)
			{
				LogError("Error: clReleaseMemObject (output) returned %s.\n", TranslateOpenCLError(error_code));
			}
		}
	}
}
示例#17
0
  /// Only computes optimal number of sliding window steps, 
  /// number of threadblocks and then lanches the 5/3 FDWT kernel.
  /// @param WIN_SX  width of sliding window
  /// @param WIN_SY  height of sliding window
  /// @param in       input image
  /// @param out      output buffer
  /// @param sx       width of the input image 
  /// @param sy       height of the input image
template <typename T>  void DWTKernel<T>::enqueue (int WIN_SX, int WIN_SY, const int sx, const int sy) {

	if (setWindowKernelArgs(WIN_SX, WIN_SY) != CL_SUCCESS)
	  return;

	cl_int error_code = setImageSizeKernelArgs(sx, sy);
	if (CL_SUCCESS != error_code)
	{
		LogError("Error: setImageSizeKernelArgs returned %s.\n", TranslateOpenCLError(error_code));
		return;
	}

	// allocate local data 
    size_t localMemSize =  calcTransformDataBufferSize(WIN_SX,WIN_SY) * sizeof(T);   

	 // Dynamically allocate local memory (allocated per workgroup)
	error_code = clSetKernelArg(myKernel, 2, localMemSize, NULL);
	if (CL_SUCCESS != error_code)
	{
		LogError("Error: clSetKernelArg returned %s.\n", TranslateOpenCLError(error_code));
		return;
	}

	// compute optimal number of steps of each sliding window
    const int steps = divRndUp(sy, 15 * WIN_SY);
	 error_code = clSetKernelArg(myKernel, 7, sizeof(T), &steps);
	if (CL_SUCCESS != error_code)
	{
		LogError("Error: clSetKernelArg returned %s.\n", TranslateOpenCLError(error_code));
		return;
	}

    size_t global_work_size[3] = {divRndUp(sx, WIN_SX) * WIN_SX, divRndUp(sy, WIN_SY * steps),1};
	size_t local_work_size[3] = {WIN_SX,1,1};

	DeviceKernel::enqueue(2,global_work_size, local_work_size);
  }
示例#18
0
文件: OCLDWT.cpp 项目: knopkem/roger
/**
A note about resolution levels: For a transform with N resolution levels, resolution levels run from 0 up to N-1.
**/
template<typename T> tDeviceRC OCLDWT<T>::setKernelArgsQuant(OCLKernel* myKernel, float quantLL, float quantLH, float quantHH) {

    cl_kernel targetKernel = myKernel->getKernel();
    cl_int error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(quantLL), &quantLL);
    if (DeviceSuccess != error_code)
    {
        LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code));
        return error_code;
    }
    error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(quantLH), &quantLH);
    if (DeviceSuccess != error_code)
    {
        LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code));
        return error_code;
    }
    error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(quantHH), &quantHH);
    if (DeviceSuccess != error_code)
    {
        LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code));
        return error_code;
    }

    return DeviceSuccess;
}
示例#19
0
cl_uint SetKernelArguments()
{
    cl_int err = CL_SUCCESS;

    err = clSetKernelArg(ocl.kernel, 0, sizeof(cl_mem), (void *)&ocl.Lights);
    if (CL_SUCCESS != err)
    {
        printf("error: Failed to set argument Lights, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 1, sizeof(cl_uint), (void *)&ocl.LightCount);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument LightCount, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 2, sizeof(cl_mem), (void *)&ocl.Shapes);
    if (CL_SUCCESS != err)
    {
        printf("error: Failed to set argument Shapes, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 3, sizeof(cl_uint), (void *)&ocl.ShapeCount);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 4, sizeof(cl_uint), (void *)&ocl.sampleCount);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 5, sizeof(cl_uint), (void *)&ocl.width);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 6, sizeof(cl_uint), (void *)&ocl.height);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    err = clSetKernelArg(ocl.kernel, 7, sizeof(cl_mem), (void *)&ocl.cam);
    if (CL_SUCCESS != err)
    {
        printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err));
        return err;
    }

    return err;
}
示例#20
0
/*
 * destructor - called only once
 * Release all OpenCL objects
 * This is a regular sequence of calls to deallocate all created OpenCL resources in bootstrapOpenCL.
 *
 * You may want to call these deallocation procedures in the middle of your application execution
 * (not at the end) if you don't further need OpenCL runtime.
 * You may want to do that in order to free some memory, for example,
 * or recreate OpenCL objects with different parameters.
 *
 */
ocl_args_d_t::~ocl_args_d_t()
{
    cl_int err = CL_SUCCESS;

    if (kernel)
    {
        err = clReleaseKernel(kernel);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clReleaseKernel returned '%s'.\n", TranslateOpenCLError(err));
        }
    }
    if (program)
    {
        err = clReleaseProgram(program);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clReleaseProgram returned '%s'.\n", TranslateOpenCLError(err));
        }
    }
    if (srcA)
    {
        err = clReleaseMemObject(srcA);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err));
        }
    }
    if (srcB)
    {
        err = clReleaseMemObject(srcB);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err));
        }
    }
    if (dstMem)
    {
        err = clReleaseMemObject(dstMem);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err));
        }
    }
    if (commandQueue)
    {
        err = clReleaseCommandQueue(commandQueue);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clReleaseCommandQueue returned '%s'.\n", TranslateOpenCLError(err));
        }
    }
    if (device)
    {
        err = clReleaseDevice(device);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clReleaseDevice returned '%s'.\n", TranslateOpenCLError(err));
        }
    }
    if (context)
    {
        err = clReleaseContext(context);
        if (CL_SUCCESS != err)
        {
            LogError("Error: clReleaseContext returned '%s'.\n", TranslateOpenCLError(err));
        }
    }

    /*
     * Note there is no procedure to deallocate platform
     * because it was not created at the startup,
     * but just queried from OpenCL runtime.
     */
}
示例#21
0
/*
 * Check whether an OpenCL platform is the required platform
 * (based on the platform's name)
 */
bool CheckPreferredPlatformMatch(cl_platform_id platform, const char* preferredPlatform)
{
    size_t stringLength = 0;
    cl_int err = CL_SUCCESS;
    bool match = false;

    // In order to read the platform's name, we first read the platform's name string length (param_value is NULL).
    // The value returned in stringLength
    err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, NULL, &stringLength);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_NAME length returned '%s'.\n", TranslateOpenCLError(err));
        return false;
    }

    // Now, that we know the platform's name string length, we can allocate enough space before read it
    std::vector<char> platformName(stringLength);

    // Read the platform's name string
    // The read value returned in platformName
    err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, stringLength, &platformName[0], NULL);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetplatform_ids() to get CL_PLATFORM_NAME returned %s.\n", TranslateOpenCLError(err));
        return false;
    }

    // Now check if the platform's name is the required one
    if (strstr(&platformName[0], preferredPlatform) != 0)
    {
        // The checked platform is the one we're looking for
        match = true;
    }

    return match;
}
示例#22
0
/*
 * This function picks/creates necessary OpenCL objects which are needed.
 * The objects are:
 * OpenCL platform, device, context, and command queue.
 *
 * All these steps are needed to be performed once in a regular OpenCL application.
 * This happens before actual compute kernels calls are performed.
 *
 * For convenience, in this application you store all those basic OpenCL objects in structure ocl_args_d_t,
 * so this function populates fields of this structure, which is passed as parameter ocl.
 * Please, consider reviewing the fields before going further.
 * The structure definition is right in the beginning of this file.
 */
int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType)
{
    // The following variable stores return codes for all OpenCL calls.
    cl_int err = CL_SUCCESS;

    // Query for all available OpenCL platforms on the system
    // Here you enumerate all platforms and pick one which name has preferredPlatform as a sub-string
    cl_platform_id platformId = FindOpenCLPlatform("Intel", deviceType);
    if (NULL == platformId)
    {
        LogError("Error: Failed to find OpenCL platform.\n");
        return CL_INVALID_VALUE;
    }

    // Create context with device of specified type.
    // Required device type is passed as function argument deviceType.
    // So you may use this function to create context for any CPU or GPU OpenCL device.
    // The creation is synchronized (pfn_notify is NULL) and NULL user_data
    cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformId, 0 };
    ocl->context = clCreateContextFromType(contextProperties, deviceType, NULL, NULL, &err);
    if ((CL_SUCCESS != err) || (NULL == ocl->context))
    {
        LogError("Couldn't create a context, clCreateContextFromType() returned '%s'.\n", TranslateOpenCLError(err));
        return err;
    }

    // Query for OpenCL device which was used for context creation
    err = clGetContextInfo(ocl->context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &ocl->device, NULL);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetContextInfo() to get list of devices returned %s.\n", TranslateOpenCLError(err));
        return err;
    }

    // Read the OpenCL platform's version and the device OpenCL and OpenCL C versions
    GetPlatformAndDeviceVersion(platformId, ocl);

    // Create command queue.
    // OpenCL kernels are enqueued for execution to a particular device through special objects called command queues.
    // Command queue guarantees some ordering between calls and other OpenCL commands.
    // Here you create a simple in-order OpenCL command queue that doesn't allow execution of two kernels in parallel on a target device.
#ifdef CL_VERSION_2_0
    if (OPENCL_VERSION_2_0 == ocl->deviceVersion)
    {
        const cl_command_queue_properties properties[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 };
        ocl->commandQueue = clCreateCommandQueueWithProperties(ocl->context, ocl->device, properties, &err);
    }
    else {
        // default behavior: OpenCL 1.2
        cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
        ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err);
    }
#else
    // default behavior: OpenCL 1.2
    cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
    ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err);
#endif
    if (CL_SUCCESS != err)
    {
        LogError("Error: clCreateCommandQueue() returned %s.\n", TranslateOpenCLError(err));
        return err;
    }

    return CL_SUCCESS;
}
示例#23
0
/*
 * This function read the OpenCL platdorm and device versions
 * (using clGetxxxInfo API) and stores it in the ocl structure.
 * Later it will enable us to support both OpenCL 1.2 and 2.0 platforms and devices
 * in the same program.
 */
int GetPlatformAndDeviceVersion(cl_platform_id platformId, ocl_args_d_t *ocl)
{
    cl_int err = CL_SUCCESS;

    // Read the platform's version string length (param_value is NULL).
    // The value returned in stringLength
    size_t stringLength = 0;
    err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, 0, NULL, &stringLength);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_VERSION length returned '%s'.\n", TranslateOpenCLError(err));
        return err;
    }

    // Now, that we know the platform's version string length, we can allocate enough space before read it
    std::vector<char> platformVersion(stringLength);

    // Read the platform's version string
    // The read value returned in platformVersion
    err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, stringLength, &platformVersion[0], NULL);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetplatform_ids() to get CL_PLATFORM_VERSION returned %s.\n", TranslateOpenCLError(err));
        return err;
    }

    if (strstr(&platformVersion[0], "OpenCL 2.0") != NULL)
    {
        ocl->platformVersion = OPENCL_VERSION_2_0;
    }

    // Read the device's version string length (param_value is NULL).
    err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, 0, NULL, &stringLength);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION length returned '%s'.\n", TranslateOpenCLError(err));
        return err;
    }

    // Now, that we know the device's version string length, we can allocate enough space before read it
    std::vector<char> deviceVersion(stringLength);

    // Read the device's version string
    // The read value returned in deviceVersion
    err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, stringLength, &deviceVersion[0], NULL);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION returned %s.\n", TranslateOpenCLError(err));
        return err;
    }

    if (strstr(&deviceVersion[0], "OpenCL 2.0") != NULL)
    {
        ocl->deviceVersion = OPENCL_VERSION_2_0;
    }

    // Read the device's OpenCL C version string length (param_value is NULL).
    err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &stringLength);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION length returned '%s'.\n", TranslateOpenCLError(err));
        return err;
    }

    // Now, that we know the device's OpenCL C version string length, we can allocate enough space before read it
    std::vector<char> compilerVersion(stringLength);

    // Read the device's OpenCL C version string
    // The read value returned in compilerVersion
    err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, stringLength, &compilerVersion[0], NULL);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION returned %s.\n", TranslateOpenCLError(err));
        return err;
    }

    else if (strstr(&compilerVersion[0], "OpenCL C 2.0") != NULL)
    {
        ocl->compilerVersion = OPENCL_VERSION_2_0;
    }

    return err;
}
示例#24
0
/*
 * Find and return the preferred OpenCL platform
 * In case that preferredPlatform is NULL, the ID of the first discovered platform will be returned
 */
cl_platform_id FindOpenCLPlatform(const char* preferredPlatform, cl_device_type deviceType)
{
    cl_uint numPlatforms = 0;
    cl_int err = CL_SUCCESS;

    // Get (in numPlatforms) the number of OpenCL platforms available
    // No platform ID will be return, since platforms is NULL
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetplatform_ids() to get num platforms returned %s.\n", TranslateOpenCLError(err));
        return NULL;
    }
    LogInfo("Number of available platforms: %u\n", numPlatforms);

    if (0 == numPlatforms)
    {
        LogError("Error: No platforms found!\n");
        return NULL;
    }

    std::vector<cl_platform_id> platforms(numPlatforms);

    // Now, obtains a list of numPlatforms OpenCL platforms available
    // The list of platforms available will be returned in platforms
    err = clGetPlatformIDs(numPlatforms, &platforms[0], NULL);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetplatform_ids() to get platforms returned %s.\n", TranslateOpenCLError(err));
        return NULL;
    }

    // Check if one of the available platform matches the preferred requirements
    for (cl_uint i = 0; i < numPlatforms; i++)
    {
        bool match = true;
        cl_uint numDevices = 0;

        // If the preferredPlatform is not NULL then check if platforms[i] is the required one
        // Otherwise, continue the check with platforms[i]
        if ((NULL != preferredPlatform) && (strlen(preferredPlatform) > 0))
        {
            // In case we're looking for a specific platform
            match = CheckPreferredPlatformMatch(platforms[i], preferredPlatform);
        }

        // match is true if the platform's name is the required one or don't care (NULL)
        if (match)
        {
            // Obtains the number of deviceType devices available on platform
            // When the function failed we expect numDevices to be zero.
            // We ignore the function return value since a non-zero error code
            // could happen if this platform doesn't support the specified device type.
            err = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &numDevices);
            if (CL_SUCCESS != err)
            {
                LogError("clGetDeviceIDs() returned %s.\n", TranslateOpenCLError(err));
            }

            if (0 != numDevices)
            {
                // There is at list one device that answer the requirements
                return platforms[i];
            }
        }
    }

    return NULL;
}
示例#25
0
/*
 * main execution routine
 * Basically it consists of three parts:
 *   - generating the inputs
 *   - running OpenCL kernel
 *   - reading results of processing
 */
int _tmain(int argc, TCHAR* argv[])
{
    cl_int err;
    ocl_args_d_t ocl;
    cl_device_type deviceType = CL_DEVICE_TYPE_GPU;

    LARGE_INTEGER perfFrequency;
    LARGE_INTEGER performanceCountNDRangeStart;
    LARGE_INTEGER performanceCountNDRangeStop;

    cl_uint arrayWidth = 1024;
    cl_uint arrayHeight = 1024;

    //initialize Open CL objects (context, queue, etc.)
    if (CL_SUCCESS != SetupOpenCL(&ocl, deviceType))
    {
        return -1;
    }

    // allocate working buffers. 
    // the buffer should be aligned with 4K page and size should fit 64-byte cached line
    cl_uint optimizedSize = ((sizeof(cl_int) * arrayWidth * arrayHeight - 1) / 64 + 1) * 64;
    cl_int* inputA = (cl_int*)_aligned_malloc(optimizedSize, 4096);
    cl_int* inputB = (cl_int*)_aligned_malloc(optimizedSize, 4096);
    cl_int* outputC = (cl_int*)_aligned_malloc(optimizedSize, 4096);
    if (NULL == inputA || NULL == inputB || NULL == outputC)
    {
        LogError("Error: _aligned_malloc failed to allocate buffers.\n");
        return -1;
    }

    //random input
    generateInput(inputA, arrayWidth, arrayHeight);
    generateInput(inputB, arrayWidth, arrayHeight);

    // Create OpenCL buffers from host memory
    // These buffers will be used later by the OpenCL kernel
    if (CL_SUCCESS != CreateBufferArguments(&ocl, inputA, inputB, outputC, arrayWidth, arrayHeight))
    {
        return -1;
    }

    // Create and build the OpenCL program
    if (CL_SUCCESS != CreateAndBuildProgram(&ocl))
    {
        return -1;
    }

    // Program consists of kernels.
    // Each kernel can be called (enqueued) from the host part of OpenCL application.
    // To call the kernel, you need to create it from existing program.
    ocl.kernel = clCreateKernel(ocl.program, "Add", &err);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clCreateKernel returned %s\n", TranslateOpenCLError(err));
        return -1;
    }

    // Passing arguments into OpenCL kernel.
    if (CL_SUCCESS != SetKernelArguments(&ocl))
    {
        return -1;
    }

    // Regularly you wish to use OpenCL in your application to achieve greater performance results
    // that are hard to achieve in other ways.
    // To understand those performance benefits you may want to measure time your application spent in OpenCL kernel execution.
    // The recommended way to obtain this time is to measure interval between two moments:
    //   - just before clEnqueueNDRangeKernel is called, and
    //   - just after clFinish is called
    // clFinish is necessary to measure entire time spending in the kernel, measuring just clEnqueueNDRangeKernel is not enough,
    // because this call doesn't guarantees that kernel is finished.
    // clEnqueueNDRangeKernel is just enqueue new command in OpenCL command queue and doesn't wait until it ends.
    // clFinish waits until all commands in command queue are finished, that suits your need to measure time.
    bool queueProfilingEnable = true;
    if (queueProfilingEnable)
        QueryPerformanceCounter(&performanceCountNDRangeStart);
    // Execute (enqueue) the kernel
    if (CL_SUCCESS != ExecuteAddKernel(&ocl, arrayWidth, arrayHeight))
    {
        return -1;
    }
    if (queueProfilingEnable)
        QueryPerformanceCounter(&performanceCountNDRangeStop);

    // The last part of this function: getting processed results back.
    // use map-unmap sequence to update original memory area with output buffer.
    ReadAndVerify(&ocl, arrayWidth, arrayHeight, inputA, inputB);

    // retrieve performance counter frequency
    if (queueProfilingEnable)
    {
        QueryPerformanceFrequency(&perfFrequency);
        LogInfo("NDRange performance counter time %f ms.\n",
            1000.0f*(float)(performanceCountNDRangeStop.QuadPart - performanceCountNDRangeStart.QuadPart) / (float)perfFrequency.QuadPart);
    }

    _aligned_free(inputA);
    _aligned_free(inputB);
    _aligned_free(outputC);

#if defined(_DEBUG)
    getchar();
#endif

    return 0;
}
示例#26
0
void imgdiff(size_t N, size_t width, size_t height, double* diff_matrix, unsigned char* images) 
{

	//// we need to fill in ////
	cl_platform_id *platform;
	cl_device_type dev_type = CL_DEVICE_TYPE_GPU;
	cl_device_id *devs;
	cl_context context;
	cl_command_queue *cmd_queues;
	cl_program program;
	cl_kernel *kernels;
	cl_uint num_platforms;
	cl_uint num_devs;

	cl_mem* m_image1;
	cl_mem* m_image2;
	cl_mem* m_result;

	cl_event* ev_kernels;

	int err = CL_SUCCESS;

	int i, j, k;
	
	// modify version
	err = clGetPlatformIDs(0, NULL, &num_platforms);
	if(err != CL_SUCCESS)
	{
		printf("Error: platform error\n");
		return 0;
	}

	if(num_platforms == 0)
	{
		printf("Error: platform no count\n");
		return 0;
	}

	platform = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms);
	err = clGetPlatformIDs(num_platforms, platform, NULL);
	if(err != CL_SUCCESS)
	{
		printf("Error: clGetPlatformIDs error\n");
		return 0;
	}

	for(i = 0; i<num_platforms; i++)
	{
		err = clGetDeviceIDs(platform[i], dev_type, 0, NULL, &num_devs);
		if(err != CL_SUCCESS)
		{
			printf("Error: clGetDevice\n");
			return 0;
		}
		if(num_devs >= 1)
		{
			devs = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devs);

			clGetDeviceIDs(platform[i], dev_type, num_devs, devs, NULL);
			break;
		}
	}

	context = clCreateContext(NULL, num_devs, devs, NULL, NULL, &err);
	if(err != CL_SUCCESS)
	{
		printf("Error: clCreateContext error\n");
		return 0;
	}

	char* source = NULL;
	size_t src_size = 0;
	err = ReadSourceFromFile("./imgdiff_cal.cl", &source, &src_size);
	if (CL_SUCCESS != err)
	{
		printf("Error: ReadSourceFromFile returned %s.\n", err);
		free(source);
		return 0;
	}

	program = clCreateProgramWithSource(context, 1, (const char**)&source, &src_size, &err);
	if(err != CL_SUCCESS)
	{
		printf("Error: clCreateProgram error\n");
		return 0;
	}

	free(source);
	printf("Create Program Success\n");

#if DBG
	// Measure clBuildProgram -@henry added
	gettimeofday(&start_m, NULL );
#endif
	err = clBuildProgram(program, num_devs, devs, "", NULL, NULL);
#if DBG
	gettimeofday(&end_m, NULL );

	double time = (end_m.tv_usec - start_m.tv_usec)*1e-6 + (end_m.tv_sec - start_m.tv_sec);
	printf("[Debug] Elapsed Time of clBuildProgram() : %lf s\n",time); 
#endif
	if(err != CL_SUCCESS)
	{
		printf("Error: clBuildProgram\n");
		return 0;
	}

	printf("Build Program Success\n");

	kernels = (cl_kernel*)malloc(sizeof(cl_kernel)*num_devs);
	for(i = 0; i<num_devs; i++)
	{
		kernels[i] = clCreateKernel(program, "imgdiff_cal", NULL);
	}


	printf("Create Kernel Success\n");

	cmd_queues = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_devs);
	for(i=0; i<num_devs; i++)
	{
		cmd_queues[i] = clCreateCommandQueue(context, devs[i], 0, &err);
		if(err != CL_SUCCESS)
		{
			printf("Error: clCreateCommandQueue error\n");
			return 0;
		}

	}

	printf("Create commandQueue Success\n");
	int LOCAL_WIDTH = 16;
	int LOCAL_HEIGHT = 16;


	int WORK_WIDTH = ceil((double)width / LOCAL_WIDTH)*LOCAL_WIDTH;
	int WORK_HEIGHT = ceil((double)height/LOCAL_HEIGHT) * LOCAL_HEIGHT;
	int WORK_AMOUNT = width * height;
	int WORK_GROUP_COUNT = ceil(((double)WORK_WIDTH * WORK_HEIGHT) / (LOCAL_WIDTH * LOCAL_HEIGHT));
	
	int WORK_GROUP_WIDTH = width;
	int WORK_GROUP_HEIGHT = height;

	int SAMPLE_COUNT = 16;
	int WORK_COUNT[num_devs];
	double tmp_result_data[WORK_GROUP_COUNT*SAMPLE_COUNT];

	printf("WORK_WIDTH %d\tWORK_HEIGHT %d\t WORK_AMOUNT %d\t WORK_GROUP_COUNT %d\n", 
			WORK_WIDTH, WORK_HEIGHT, WORK_AMOUNT, WORK_GROUP_COUNT);

	m_image1 = (cl_mem*)malloc(sizeof(cl_mem)* num_devs);
	m_image2 = (cl_mem*)malloc(sizeof(cl_mem)* num_devs);
	
	m_result = (cl_mem*)malloc(sizeof(cl_mem)* num_devs);


	for(i=0; i<num_devs; i++)
	{
		m_image1[i] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned char) * WORK_AMOUNT * 3, NULL, NULL);
		m_image2[i] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned char) * WORK_AMOUNT*SAMPLE_COUNT * 3, NULL, NULL);
			
		m_result[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(double) * WORK_GROUP_COUNT * SAMPLE_COUNT, NULL, NULL);
		clSetKernelArg(kernels[i], 0, sizeof(cl_mem), (void*)&m_image1[i]);
		clSetKernelArg(kernels[i], 1, sizeof(cl_mem), (void*)&m_image2[i]);
		clSetKernelArg(kernels[i], 2, sizeof(cl_mem), (void*)&m_result[i]);
		clSetKernelArg(kernels[i], 3, sizeof(cl_int), &WORK_GROUP_WIDTH);
		clSetKernelArg(kernels[i], 4, sizeof(cl_int), &WORK_GROUP_HEIGHT);
	}

	ev_kernels  = (cl_event*)malloc(sizeof(cl_event)*num_devs);

	int row, col;

	row = 0;
	col = 1;

	for(row = 0; row < N; row++)
	{
		if( (N-row-1) < (SAMPLE_COUNT*4) && SAMPLE_COUNT > 1)
			SAMPLE_COUNT = SAMPLE_COUNT / 2;
		
		int remain_count = N - (row + 1);

		for(i=0; i<num_devs; i++)
		{

			clEnqueueWriteBuffer(cmd_queues[i], m_image1[i], CL_FALSE, 0, 
					sizeof(unsigned char) * WORK_AMOUNT * 3, (void*)(images + 
					(row * width*height)*3), 0, NULL, NULL);
		}
			
		diff_matrix[row*N + row] = 0;
		col = row + 1;
		while( col< N)
		{
			size_t lws[2] = { LOCAL_WIDTH, LOCAL_HEIGHT };
			size_t gws[2] = { WORK_WIDTH, WORK_HEIGHT};
			
			for(i=0; i<num_devs; i++)
			{
				if((remain_count - SAMPLE_COUNT) < 0)
				{
					WORK_COUNT[i] = remain_count;
					remain_count = 0;
				}
				else
				{
					WORK_COUNT[i] = SAMPLE_COUNT;
					remain_count = remain_count - SAMPLE_COUNT;
				}
				
				if(WORK_COUNT[i] != 0)
				{
					
					clSetKernelArg(kernels[i], 5, sizeof(cl_int), &WORK_COUNT[i]);
					
					int offset = 0;
					for(j=0; j<i; j++)
						offset += WORK_COUNT[j];


					err = clEnqueueWriteBuffer(cmd_queues[i], m_image2[i], CL_FALSE, 0, 
							sizeof(unsigned char)*WORK_AMOUNT*WORK_COUNT[i]*3, 
							(void*)(images +((col * width*height) + (WORK_AMOUNT * 
										offset))*3), 0, NULL, NULL);

				}


			}

			for( i=0; i < num_devs; i++ )
			{
				if(WORK_COUNT[i] != 0)
				{
					err = clEnqueueNDRangeKernel(cmd_queues[i], kernels[i], 2, NULL, gws, lws, 0, NULL, NULL);
					if(err != CL_SUCCESS)
					{
						printf("Error: clEnqueueNDRangeKernel %d error\n", i);
						printf("%s\n", TranslateOpenCLError(err));
						return 0;
					}
				}
			}
			double tmp_sum = 0;
			i = 0;
			for( i = num_devs -1; i >= 0; i-- )
			{
				
				if(WORK_COUNT[i] != 0)
				{
					err = clEnqueueReadBuffer( cmd_queues[i], m_result[i], CL_TRUE, 0, 
						sizeof(double) * WORK_GROUP_COUNT * WORK_COUNT[i], 
						tmp_result_data, 0, NULL, NULL); 
					if(err != CL_SUCCESS)
					{
						printf("Error: clEnqueueReadBuffer%d error\n", i);
						return 0;
					}
					//printf("receive......");

					for(j = 0; j<WORK_COUNT[i]; j++)
					{	
						tmp_sum = 0;
						for(k = 0; k<WORK_GROUP_COUNT; k++)
						{
							tmp_sum += tmp_result_data[k + j*WORK_GROUP_COUNT];
							//printf("%lf\t", tmp_result_data[k+j*WORK_GROUP_COUNT]);

						}
						//printf("%lf %lf\n", tmp_sum, tmp_result_data[j*WORK_GROUP_COUNT]);
						
						int offset = 0;
						for(k=0; k<i; k++)
							offset += WORK_COUNT[k];
						diff_matrix[row*N+col+j+offset] = diff_matrix[(col+j+offset)*N+row] = tmp_sum;
					}

				}
				
			}
			
			for( i = 0; i < num_devs; i++ )
			{
				col += WORK_COUNT[i];
			} 
		}
	}

}