예제 #1
0
파일: pe001.c 프로젝트: nikhiljangam/TCR
int main() {
    cl_int err;
    cl_kernel kernel;
    cl_program program;
    cl_mem buffer;
    cl_ulong ans;
    int i;
    char *source;
    size_t localws[3];
    size_t globalws[3];

    initopencl();
    source=getsource("pe001.cl");
    /* create a program object and load source code into it */
    program=clCreateProgramWithSource(context,1,(const char **)&source,NULL,&err);
    if(CL_SUCCESS!=err) clerror("error creating program",err);
    /* builds a program associated with a program object */
    if(CL_SUCCESS!=(err=clBuildProgram(program,1,devices+DEVICEID,NULL,NULL,NULL)))
        clerror("error building program",err);
    /* create kernel object from built program */
    kernel=clCreateKernel(program,"pe001",&err);
    if(CL_SUCCESS!=err) clerror("error creating kernel",err);
    free(source);

    /* create memory buffer: one cl_ulong for each kernel, a total of MAX=N/KERNEL */
    buffer=clCreateBuffer(context,CL_MEM_READ_WRITE,MAX*sizeof(cl_ulong),NULL,&err);
    if(CL_SUCCESS!=err) clerror("error creating buffer",err);

    /* set up kernel arguments */
    if(CL_SUCCESS!=(err=clSetKernelArg(kernel,0,sizeof(cl_mem),&buffer))) clerror("error setting kernel argument 0",err);

    globalws[0]=MAX;
    localws[0]=LOCAL;

    /* run kernel */
    if(CL_SUCCESS!=(err=clEnqueueNDRangeKernel(queue,kernel,1,NULL,globalws,localws,0,NULL,NULL)))
        clerror("error running kernel",err);
    /* wait until kernel has finished */
    if(CL_SUCCESS!=(err=clFinish(queue))) clerror("error waiting for queue",err);

    /* copy to host memory */
    if(CL_SUCCESS!=(err=clEnqueueReadBuffer(queue,buffer,CL_TRUE,0,MAX*sizeof(cl_ulong),a,0,NULL,NULL)))
        clerror("error copying result to host",err);
    if(CL_SUCCESS!=(err=clFinish(queue))) clerror("error waiting for queue",err);

    /* assemble final answer */
    ans=0;
    for(i=0; i<ACTUAL/KERNEL; i++) ans+=a[i];
    printf("ans: "LL"\n",ans);

    /* TODO deallocate buffer */

    clReleaseKernel(kernel);
    clReleaseProgram(program);
    shutdownopencl();
    return 0;
}
예제 #2
0
generic_info
memory_object::get_info(cl_uint param_name) const
{
    switch ((cl_mem_info)param_name) {
    case CL_MEM_TYPE:
        return pyopencl_get_int_info(cl_mem_object_type, MemObject,
                                     PYOPENCL_CL_CASTABLE_THIS, param_name);
    case CL_MEM_FLAGS:
        return pyopencl_get_int_info(cl_mem_flags, MemObject,
                                     PYOPENCL_CL_CASTABLE_THIS, param_name);
    case CL_MEM_SIZE:
        return pyopencl_get_int_info(size_t, MemObject, PYOPENCL_CL_CASTABLE_THIS, param_name);
    case CL_MEM_HOST_PTR:
        throw clerror("MemoryObject.get_info", CL_INVALID_VALUE,
                      "Use MemoryObject.get_host_array to get "
                      "host pointer.");
    case CL_MEM_MAP_COUNT:
    case CL_MEM_REFERENCE_COUNT:
        return pyopencl_get_int_info(cl_uint, MemObject,
                                     PYOPENCL_CL_CASTABLE_THIS, param_name);
    case CL_MEM_CONTEXT:
        return pyopencl_get_opaque_info(context, MemObject, PYOPENCL_CL_CASTABLE_THIS, param_name);

#if PYOPENCL_CL_VERSION >= 0x1010
        // TODO
        //       case CL_MEM_ASSOCIATED_MEMOBJECT:
        //      {
        //        cl_mem param_value;
        //        PYOPENCL_CALL_GUARDED(clGetMemObjectInfo, (this, param_name, sizeof(param_value), &param_value, 0));
        //        if (param_value == 0)
        //          {
        //            // no associated memory object? no problem.
        //            return py::object();
        //          }

        //        return create_mem_object_wrapper(param_value);
        //      }
    case CL_MEM_OFFSET:
        return pyopencl_get_int_info(size_t, MemObject, PYOPENCL_CL_CASTABLE_THIS, param_name);
#endif
#if PYOPENCL_CL_VERSION >= 0x2000
    case CL_MEM_USES_SVM_POINTER:
        return pyopencl_get_int_info(cl_bool, MemObject, PYOPENCL_CL_CASTABLE_THIS, param_name);
#endif

    default:
        throw clerror("MemoryObject.get_info", CL_INVALID_VALUE);
    }
}
예제 #3
0
cl_context_properties
get_apple_cgl_share_group()
{
#ifdef __APPLE__
    #ifdef HAVE_GL
        CGLContextObj kCGLContext = CGLGetCurrentContext();
        CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext);

        return (cl_context_properties)kCGLShareGroup;
    #else
        throw clerror("get_apple_cgl_share_group unavailable: "
            "GL interop not compiled",
            CL_INVALID_VALUE);
    #endif
#else
    throw clerror("get_apple_cgl_share_group unavailable: non-Apple platform",
        CL_INVALID_VALUE);
#endif /* __APPLE__ */
}
예제 #4
0
generic_info
gl_texture::get_gl_texture_info(cl_gl_texture_info param_name) const
{
    switch (param_name) {
    case CL_GL_TEXTURE_TARGET:
        return pyopencl_get_int_info(GLenum, GLTexture, PYOPENCL_CL_CASTABLE_THIS, param_name);
    case CL_GL_MIPMAP_LEVEL:
        return pyopencl_get_int_info(GLint, GLTexture, PYOPENCL_CL_CASTABLE_THIS, param_name);
    default:
        throw clerror("MemoryObject.get_gl_texture_info", CL_INVALID_VALUE);
    }
}
예제 #5
0
generic_info
program::get_info(cl_uint param) const
{
    switch ((cl_program_info)param) {
    case CL_PROGRAM_CONTEXT:
        return pyopencl_get_opaque_info(context, Program, this, param);
    case CL_PROGRAM_REFERENCE_COUNT:
    case CL_PROGRAM_NUM_DEVICES:
        return pyopencl_get_int_info(cl_uint, Program, this, param);
    case CL_PROGRAM_DEVICES:
        return pyopencl_get_opaque_array_info(device, Program, this, param);
    case CL_PROGRAM_SOURCE:
        return pyopencl_get_str_info(Program, this, param);
    case CL_PROGRAM_BINARY_SIZES:
        return pyopencl_get_array_info(size_t, Program, this, param);
    case CL_PROGRAM_BINARIES: {
        auto sizes = pyopencl_get_vec_info(size_t, Program, this,
                                           CL_PROGRAM_BINARY_SIZES);
        pyopencl_buf<char*> result_ptrs(sizes.len());
        for (size_t i  = 0;i < sizes.len();i++) {
            result_ptrs[i] = (char*)malloc(sizes[i]);
        }
        try {
            pyopencl_call_guarded(clGetProgramInfo, this, CL_PROGRAM_BINARIES,
                                  sizes.len() * sizeof(char*),
                                  result_ptrs.get(), nullptr);
        } catch (...) {
            for (size_t i  = 0;i < sizes.len();i++) {
                free(result_ptrs[i]);
            }
        }
        pyopencl_buf<generic_info> gis(sizes.len());
        for (size_t i  = 0;i < sizes.len();i++) {
            gis[i].value = result_ptrs[i];
            gis[i].dontfree = 0;
            gis[i].opaque_class = CLASS_NONE;
            gis[i].type =  _copy_str(std::string("char[") +
                                     tostring(sizes[i]) + "]");
        }
        return pyopencl_convert_array_info(generic_info, gis);
    }

#if PYOPENCL_CL_VERSION >= 0x1020
    case CL_PROGRAM_NUM_KERNELS:
        return pyopencl_get_int_info(size_t, Program, this, param);
    case CL_PROGRAM_KERNEL_NAMES:
        return pyopencl_get_str_info(Program, this, param);
#endif
    default:
        throw clerror("Program.get_info", CL_INVALID_VALUE);
    }
}
예제 #6
0
generic_info
platform::get_info(cl_uint param_name) const
{
    switch ((cl_platform_info)param_name) {
    case CL_PLATFORM_PROFILE:
    case CL_PLATFORM_VERSION:
    case CL_PLATFORM_NAME:
    case CL_PLATFORM_VENDOR:
#if !(defined(CL_PLATFORM_NVIDIA) && CL_PLATFORM_NVIDIA == 0x3001)
    case CL_PLATFORM_EXTENSIONS:
#endif
        return pyopencl_get_str_info(Platform, this, param_name);
    default:
        throw clerror("Platform.get_info", CL_INVALID_VALUE);
    }
}
예제 #7
0
PYOPENCL_USE_RESULT generic_info
kernel::get_arg_info(cl_uint idx, cl_kernel_arg_info param) const
{
    switch (param) {
    case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
        return pyopencl_get_int_info(cl_kernel_arg_address_qualifier,
                                     KernelArg, this, idx, param);
    case CL_KERNEL_ARG_ACCESS_QUALIFIER:
        return pyopencl_get_int_info(cl_kernel_arg_access_qualifier,
                                     KernelArg, this, idx, param);
    case CL_KERNEL_ARG_TYPE_NAME:
    case CL_KERNEL_ARG_NAME:
        return pyopencl_get_str_info(KernelArg, this, idx, param);
    default:
        throw clerror("Kernel.get_arg_info", CL_INVALID_VALUE);
    }
}
예제 #8
0
error*
svm_alloc(
    clobj_t _ctx, cl_mem_flags flags, size_t size, cl_uint alignment,
    void **result)
{
#if PYOPENCL_CL_VERSION >= 0x2000
    auto ctx = static_cast<context*>(_ctx);
    return c_handle_retry_mem_error([&] {
            *result = clSVMAlloc(ctx->data(), flags, size, alignment);
            if (!*result)
                throw clerror("clSVMalloc", CL_INVALID_VALUE,
                    "(allocation failure, unspecified reason)");
        });
#else
    PYOPENCL_UNSUPPORTED_BEFORE(clSVMAlloc, "CL 2.0")
#endif
}
예제 #9
0
error*
memory_object__get_host_array(clobj_t _obj, void **hostptr, size_t *size)
{
    auto obj = static_cast<memory_object*>(_obj);
    return c_handle_error([&] {
            cl_mem_flags flags;
            pyopencl_call_guarded(clGetMemObjectInfo, obj, CL_MEM_FLAGS,
                                  size_arg(flags), nullptr);
            if (!(flags & CL_MEM_USE_HOST_PTR))
                throw clerror("MemoryObject.get_host_array", CL_INVALID_VALUE,
                              "Only MemoryObject with USE_HOST_PTR "
                              "is supported.");
            pyopencl_call_guarded(clGetMemObjectInfo, obj, CL_MEM_HOST_PTR,
                                  size_arg(*hostptr), nullptr);
            pyopencl_call_guarded(clGetMemObjectInfo, obj, CL_MEM_SIZE,
                                  size_arg(*size), nullptr);
        });
}
예제 #10
0
PYOPENCL_USE_RESULT static gl_texture*
create_from_gl_texture(const context *ctx, cl_mem_flags flags,
                       GLenum texture_target, GLint miplevel,
                       GLuint texture, unsigned dims)
{
    if (dims == 2) {
        cl_mem mem = pyopencl_call_guarded(clCreateFromGLTexture2D,
                                           ctx, flags, texture_target,
                                           miplevel, texture);
        return pyopencl_convert_obj(gl_texture, clReleaseMemObject, mem);
    } else if (dims == 3) {
        cl_mem mem = pyopencl_call_guarded(clCreateFromGLTexture3D,
                                           ctx, flags, texture_target,
                                           miplevel, texture);
        return pyopencl_convert_obj(gl_texture, clReleaseMemObject, mem);
    } else {
        throw clerror("Image", CL_INVALID_VALUE, "invalid dimension");
    }
}
예제 #11
0
generic_info
kernel::get_info(cl_uint param) const
{
    switch ((cl_kernel_info)param) {
    case CL_KERNEL_FUNCTION_NAME:
        return pyopencl_get_str_info(Kernel, PYOPENCL_CL_CASTABLE_THIS, param);
    case CL_KERNEL_NUM_ARGS:
    case CL_KERNEL_REFERENCE_COUNT:
        return pyopencl_get_int_info(cl_uint, Kernel, PYOPENCL_CL_CASTABLE_THIS, param);
    case CL_KERNEL_CONTEXT:
        return pyopencl_get_opaque_info(context, Kernel, PYOPENCL_CL_CASTABLE_THIS, param);
    case CL_KERNEL_PROGRAM:
        return pyopencl_get_opaque_info(program, Kernel, PYOPENCL_CL_CASTABLE_THIS, param);
#if PYOPENCL_CL_VERSION >= 0x1020
    case CL_KERNEL_ATTRIBUTES:
        return pyopencl_get_str_info(Kernel, PYOPENCL_CL_CASTABLE_THIS, param);
#endif
    default:
        throw clerror("Kernel.get_info", CL_INVALID_VALUE);
    }
}
예제 #12
0
파일: context.cpp 프로젝트: AI42/pyopencl
void
context::get_version(cl_context ctx, int *major, int *minor)
{
    cl_device_id s_buff[16];
    size_t size;
    pyopencl_buf<cl_device_id> d_buff(0);
    cl_device_id *devs = s_buff;
    pyopencl_call_guarded(clGetContextInfo, ctx, CL_CONTEXT_DEVICES,
                          0, nullptr, buf_arg(size));
    if (PYOPENCL_UNLIKELY(!size)) {
        throw clerror("Context.get_version", CL_INVALID_VALUE,
                      "Cannot get devices from context.");
    }
    if (PYOPENCL_UNLIKELY(size > sizeof(s_buff))) {
        d_buff.resize(size / sizeof(cl_device_id));
        devs = d_buff.get();
    }
    pyopencl_call_guarded(clGetContextInfo, ctx, CL_CONTEXT_DEVICES,
                          size_arg(devs, size), buf_arg(size));
    device::get_version(devs[0], major, minor);
}
예제 #13
0
generic_info
image::get_image_info(cl_image_info param) const
{
    switch (param) {
    case CL_IMAGE_FORMAT:
        return pyopencl_get_int_info(cl_image_format, Image, this, param);
    case CL_IMAGE_ELEMENT_SIZE:
    case CL_IMAGE_ROW_PITCH:
    case CL_IMAGE_SLICE_PITCH:
    case CL_IMAGE_WIDTH:
    case CL_IMAGE_HEIGHT:
    case CL_IMAGE_DEPTH:
#if PYOPENCL_CL_VERSION >= 0x1020
    case CL_IMAGE_ARRAY_SIZE:
#endif
        return pyopencl_get_int_info(size_t, Image, this, param);

#if PYOPENCL_CL_VERSION >= 0x1020
        // TODO:
        //    case CL_IMAGE_BUFFER:
        //      {
        //        cl_mem param_value;
        //        PYOPENCL_CALL_GUARDED(clGetImageInfo, (this, param, sizeof(param_value), &param_value, 0));
        //        if (param_value == 0)
        //               {
        //                 // no associated memory object? no problem.
        //                 return py::object();
        //               }
        //        return create_mem_object_wrapper(param_value);
        //      }
    case CL_IMAGE_NUM_MIP_LEVELS:
    case CL_IMAGE_NUM_SAMPLES:
        return pyopencl_get_int_info(cl_uint, Image, this, param);
#endif
    default:
        throw clerror("Image.get_image_info", CL_INVALID_VALUE);
    }
}
예제 #14
0
generic_info
kernel::get_work_group_info(cl_kernel_work_group_info param,
                            const device *dev) const
{
    switch (param) {
#if PYOPENCL_CL_VERSION >= 0x1010
    case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
#endif
    case CL_KERNEL_WORK_GROUP_SIZE:
        return pyopencl_get_int_info(size_t, KernelWorkGroup, PYOPENCL_CL_CASTABLE_THIS, dev, param);
    case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
        return pyopencl_get_array_info(size_t, KernelWorkGroup,
                                       PYOPENCL_CL_CASTABLE_THIS, dev, param);
    case CL_KERNEL_LOCAL_MEM_SIZE:
#if PYOPENCL_CL_VERSION >= 0x1010
    case CL_KERNEL_PRIVATE_MEM_SIZE:
#endif
        return pyopencl_get_int_info(cl_ulong, KernelWorkGroup,
                                     PYOPENCL_CL_CASTABLE_THIS, dev, param);
    default:
        throw clerror("Kernel.get_work_group_info", CL_INVALID_VALUE);
    }
}
예제 #15
0
void
platform::get_version(cl_platform_id plat, int *major, int *minor)
{
    char s_buff[128];
    size_t size;
    pyopencl_buf<char> d_buff(0);
    char *name = s_buff;
    pyopencl_call_guarded(clGetPlatformInfo, plat, CL_PLATFORM_VERSION,
                          0, nullptr, buf_arg(size));
    if (PYOPENCL_UNLIKELY(size > sizeof(s_buff))) {
        d_buff.resize(size);
        name = d_buff.get();
    }
    pyopencl_call_guarded(clGetPlatformInfo, plat, CL_PLATFORM_VERSION,
                          size_arg(name, size), buf_arg(size));
    std::cmatch ver_match;
    if (!std::regex_match(name, ver_match, ver_regex)) {
        throw clerror("Platform.get_version", CL_INVALID_VALUE,
                      "platform returned non-conformant "
                      "platform version string");
    }
    *major = atoi(name + ver_match.position(1));
    *minor = atoi(name + ver_match.position(2));
}
예제 #16
0
generic_info
program::get_build_info(const device *dev, cl_program_build_info param) const
{
    switch (param) {
    case CL_PROGRAM_BUILD_STATUS:
        return pyopencl_get_int_info(cl_build_status, ProgramBuild,
                                     this, dev, param);
    case CL_PROGRAM_BUILD_OPTIONS:
    case CL_PROGRAM_BUILD_LOG:
        return pyopencl_get_str_info(ProgramBuild, this, dev, param);
#if PYOPENCL_CL_VERSION >= 0x1020
    case CL_PROGRAM_BINARY_TYPE:
        return pyopencl_get_int_info(cl_program_binary_type, ProgramBuild,
                                     this, dev, param);
#endif
#if PYOPENCL_CL_VERSION >= 0x2000
    case CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE:
        return pyopencl_get_int_info(size_t, ProgramBuild,
                                     this, dev, param);
#endif
    default:
        throw clerror("Program.get_build_info", CL_INVALID_VALUE);
    }
}
예제 #17
0
파일: pe184.c 프로젝트: manish05/TCR
/* create context and create command queue, use platform and device as
   indicated by PLATFORMID and DEVICEID */
void initopencl() {
	cl_int err;
	/* get platform ids */
	if(CL_SUCCESS!=(err=clGetPlatformIDs(0,NULL,&numplatforms)))
		clerror("couldn't get number of platforms",err);
	platforms=malloc(numplatforms*sizeof(cl_platform_id));
	if(CL_SUCCESS!=(err=clGetPlatformIDs(numplatforms,platforms,NULL)))
		clerror("couldn't get platforms",err);
	/* get device */
	if(CL_SUCCESS!=(err=clGetDeviceIDs(platforms[PLATFORMID],CL_DEVICE_TYPE_ALL,0,NULL,&numdevices)))
		clerror("couldn't get number of devices",err);
	devices=malloc(numdevices*sizeof(cl_device_id));
	if(CL_SUCCESS!=(err=clGetDeviceIDs(platforms[PLATFORMID],CL_DEVICE_TYPE_ALL,numdevices,devices,NULL)))
		clerror("couldn't get devices",err);
	/* create context */
	context=clCreateContext(NULL,1,devices+DEVICEID,NULL,NULL,&err);
	if(CL_SUCCESS!=err) clerror("couldn't get context",err);
	/* create command queue */
	queue=clCreateCommandQueue(context,devices[DEVICEID],0,&err);
	if(CL_SUCCESS!=err) clerror("couldn't create command queue",err);
}
예제 #18
0
파일: context.cpp 프로젝트: AI42/pyopencl
generic_info
context::get_info(cl_uint param_name) const
{
    switch ((cl_context_info)param_name) {
    case CL_CONTEXT_REFERENCE_COUNT:
        return pyopencl_get_int_info(cl_uint, Context,
                                     PYOPENCL_CL_CASTABLE_THIS, param_name);
    case CL_CONTEXT_DEVICES:
        return pyopencl_get_opaque_array_info(device, Context,
                                              PYOPENCL_CL_CASTABLE_THIS, param_name);
    case CL_CONTEXT_PROPERTIES: {
        auto result = pyopencl_get_vec_info(
            cl_context_properties, Context, PYOPENCL_CL_CASTABLE_THIS, param_name);
        pyopencl_buf<generic_info> py_result(result.len() / 2);
        size_t i = 0;
        for (;i < py_result.len();i++) {
            cl_context_properties key = result[i * 2];
            if (key == 0)
                break;
            cl_context_properties value = result[i * 2 + 1];
            generic_info &info = py_result[i];
            info.dontfree = 0;
            info.opaque_class = CLASS_NONE;
            switch (key) {
            case CL_CONTEXT_PLATFORM:
                info.opaque_class = CLASS_PLATFORM;
                info.type = "void *";
                info.value = new platform(
                    reinterpret_cast<cl_platform_id>(value));
                break;

#if defined(PYOPENCL_GL_SHARING_VERSION) && (PYOPENCL_GL_SHARING_VERSION >= 1)
#if defined(__APPLE__) && defined(HAVE_GL)
            case CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE:
#else
            case CL_GL_CONTEXT_KHR:
            case CL_EGL_DISPLAY_KHR:
            case CL_GLX_DISPLAY_KHR:
            case CL_WGL_HDC_KHR:
            case CL_CGL_SHAREGROUP_KHR:
#endif
                info.type = "intptr_t *";
                info.value = (void*)value;
                // we do not own this object
                info.dontfree = 1;
                break;

#endif
            default:
                throw clerror("Context.get_info", CL_INVALID_VALUE,
                              "unknown context_property key encountered");
            }
        }
        py_result.resize(i);
        return pyopencl_convert_array_info(generic_info, py_result);
    }

#if PYOPENCL_CL_VERSION >= 0x1010
    case CL_CONTEXT_NUM_DEVICES:
        return pyopencl_get_int_info(cl_uint, Context,
                                     PYOPENCL_CL_CASTABLE_THIS, param_name);
#endif

    default:
        throw clerror("Context.get_info", CL_INVALID_VALUE);
    }
}
예제 #19
0
파일: pe184.c 프로젝트: manish05/TCR
int main() {
	cl_int err;
	cl_kernel kernel;
	cl_program program;
	cl_mem buffer;
	cl_ulong ans,cur;
	int i,x3,y3;
	char *source;
	size_t localws[3];
	size_t globalws[3];

	initopencl();
	source=getsource("pe184.cl");
	/* create a program object and load source code into it */
	program=clCreateProgramWithSource(context,1,(const char **)&source,NULL,&err);
	if(CL_SUCCESS!=err) clerror("error creating program",err);
	/* builds a program associated with a program object */
	if(CL_SUCCESS!=(err=clBuildProgram(program,1,devices+DEVICEID,NULL,NULL,NULL)))
		clerror("error building program",err);
	/* create kernel object from built program */
	kernel=clCreateKernel(program,"pe184",&err);
	if(CL_SUCCESS!=err) clerror("error creating kernel",err);
	free(source);

	/* create memory buffer: one cl_ulong for each kernel, a total of MAX=N/KERNEL */
	buffer=clCreateBuffer(context,CL_MEM_READ_WRITE,N2*N2*sizeof(cl_uint),NULL,&err);
	if(CL_SUCCESS!=err) clerror("error creating buffer",err);

	globalws[0]=N2;
	globalws[1]=N2;
	globalws[2]=1;

	localws[0]=N2;
	localws[1]=1;
	localws[2]=1;

	ans=0;
	for(x3=0;x3<N;x3++) {
		for(y3=0;y3<N;y3++) if(x3*x3+y3*y3<N*N) {
			/* set up kernel arguments */
			if(CL_SUCCESS!=(err=clSetKernelArg(kernel,0,sizeof(cl_mem),&buffer))) clerror("error setting kernel argument 0",err);
			if(CL_SUCCESS!=(err=clSetKernelArg(kernel,1,sizeof(cl_int),&x3))) clerror("error setting kernel argument 1",err);
			if(CL_SUCCESS!=(err=clSetKernelArg(kernel,2,sizeof(cl_int),&y3))) clerror("error setting kernel argument 2",err);

			/* run kernel */
			if(CL_SUCCESS!=(err=clEnqueueNDRangeKernel(queue,kernel,3,NULL,globalws,localws,0,NULL,NULL)))
				clerror("error running kernel",err);
			/* wait until kernel has finished */
			if(CL_SUCCESS!=(err=clFinish(queue))) clerror("error waiting for queue",err);

			/* copy to host memory */
			if(CL_SUCCESS!=(err=clEnqueueReadBuffer(queue,buffer,CL_TRUE,0,N2*N2*sizeof(cl_uint),a,0,NULL,NULL)))
				clerror("error copying result to host",err);
			if(CL_SUCCESS!=(err=clFinish(queue))) clerror("error waiting for queue",err);

			/* assemble final answer */
			for(cur=i=0;i<N2*N2;i++) cur+=a[i];
			if(x3) cur*=2;
			if(y3) cur*=2;
			ans+=cur;
		}
		printf("done x3=%d/%d\n",x3+1,N);
	}
	printf("answer: "LL"\n",ans/6);

	clReleaseMemObject(buffer);
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	shutdownopencl();
	return 0;
}