Exemple #1
0
    /// Enqueues a command to migrate \p mem_objects.
    ///
    /// \see_opencl_ref{clEnqueueMigrateMemObjects}
    ///
    /// \opencl_version_warning{1,2}
    event enqueue_migrate_memory_objects(uint_ num_mem_objects,
                                         const cl_mem *mem_objects,
                                         cl_mem_migration_flags flags,
                                         const wait_list &events = wait_list())
    {
        BOOST_ASSERT(m_queue != 0);

        event event_;

        cl_int ret = clEnqueueMigrateMemObjects(
            m_queue,
            num_mem_objects,
            mem_objects,
            flags,
            events.size(),
            events.get_event_ptr(),
            &event_.get()
        );

        if(ret != CL_SUCCESS){
            BOOST_THROW_EXCEPTION(opencl_error(ret));
        }

        return event_;
    }
JNIEXPORT jint JNICALL Java_org_lwjgl_opencl_CL12_nclEnqueueMigrateMemObjects(JNIEnv *env, jclass clazz, jlong command_queue, jint num_mem_objects, jlong mem_objects, jlong flags, jint num_events_in_wait_list, jlong event_wait_list, jlong event, jlong function_pointer) {
	const cl_mem *mem_objects_address = (const cl_mem *)(intptr_t)mem_objects;
	const cl_event *event_wait_list_address = (const cl_event *)(intptr_t)event_wait_list;
	cl_event *event_address = (cl_event *)(intptr_t)event;
	clEnqueueMigrateMemObjectsPROC clEnqueueMigrateMemObjects = (clEnqueueMigrateMemObjectsPROC)((intptr_t)function_pointer);
	cl_int __result = clEnqueueMigrateMemObjects((cl_command_queue)(intptr_t)command_queue, num_mem_objects, mem_objects_address, flags, num_events_in_wait_list, event_wait_list_address, event_address);
	return __result;
}
int
DeviceFission::runCLALLKerenls()
{
    cl_int status;
	cl_event writeEvent;
	cl_event migrateEvents; 
	cl_event rangeEvent[2];

    // Set global and local work items
    size_t globalThreads[] = {half_length};
    size_t localThreads[] = {groupSize};

	// Enqueue write Buffer to the first sub device queue
	status = clEnqueueWriteBuffer(subCmdQueue[0],
								  InBuf,
								  CL_FALSE,
								  0,
								  half_length* sizeof(cl_int),
								  input,
								  0,
								  NULL,
								  &writeEvent);
	CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed");

	cl_uint rangeEventNum = 0;

	if (cpu2cpu)
	{
		status = clEnqueueMigrateMemObjects(subCmdQueue[1],
											1,
											&InBuf,
											CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED ,
											1,
											&writeEvent,
											&migrateEvents);
		CHECK_OPENCL_ERROR(status, "clEnqueueMigrateMemObjects failed.");

		rangeEvent[0] = writeEvent;
		rangeEvent[1] = migrateEvents;
		rangeEventNum++;
	}
	else if(cpu2gpu)
	{
		status = clEnqueueMigrateMemObjects(gpuCmdQueue,
											1,
											&InBuf,
											CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED ,
											1,
											&writeEvent,
											&migrateEvents);
		CHECK_OPENCL_ERROR(status, "clEnqueueMigrateMemObjects failed.");

		rangeEvent[0] = rangeEvent[1] = migrateEvents;
		rangeEventNum++;
	}
	else
	{
		rangeEvent[0] = rangeEvent[1] = writeEvent;
		rangeEventNum++;
	}

    for(cl_uint i = 0; i < numSubDevices; ++i)
    {
		// Set subOutBuf as second argument 
		status = clSetKernelArg(subKernel[i], 1, sizeof(cl_mem), (void*)&subOutBuf[i]);
		CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (subOutBuf)");

		// Set InBuf as first argument
		status = clSetKernelArg(subKernel[i], 0, sizeof(cl_mem),(void*)&InBuf);
		CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (InBuf)");	

        // Enqueue kernel
		status = clEnqueueNDRangeKernel(subCmdQueue[i],
										subKernel[i],
										1,
										NULL,
										globalThreads,
										localThreads,
										rangeEventNum,
										&rangeEvent[i],
										NULL);
		CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.(subCmdQueue)");

		// Enqueue readBuffer
		status = clEnqueueReadBuffer(subCmdQueue[i],
									 subOutBuf[i],
									 CL_FALSE,
									 0,
									 half_length * sizeof(cl_int),
									 subOutput + half_length * i,
									 0,
									 NULL,
									 NULL);
		CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed. (subCmdQueue)");

		// Set gpuOutBuf as second argument 
		status = clSetKernelArg(gpuKernel[i], 1, sizeof(cl_mem), (void*)&gpuOutBuf[i]);
		CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (gpuOutBuf)");

		// Set InBuf as first argument 
		status = clSetKernelArg(gpuKernel[i], 0, sizeof(cl_mem),(void*)&InBuf);
		CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (InBuf)");

		// Enqueue kernel to gpuCmdQueue
		status = clEnqueueNDRangeKernel(gpuCmdQueue,
										gpuKernel[i],
										1,
										NULL,
										globalThreads,
										localThreads,
										0,
										NULL,
										NULL);
		CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.(gpuCmdQueue)");

		// Enqueue readBuffer to gpuCmdQueue
		status = clEnqueueReadBuffer(gpuCmdQueue,
									 gpuOutBuf[i],
									 CL_FALSE,
					 				 0,
									 half_length * sizeof(cl_int),
									 gpuOutput + half_length * i,
									 0,
									 NULL,
									 NULL);
		CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed. (gpuCmdQueue)");
	}

	// Flush all queues together
	status = clFlush(gpuCmdQueue);
	CHECK_OPENCL_ERROR(status, "clFlush failed. (gpuCmdQueue)");

	for(cl_uint i = 0; i < numSubDevices; ++i)
	{
		status = clFlush(subCmdQueue[i]);
		CHECK_OPENCL_ERROR(status, "clFlush failed. (subCmdQueue)");
	}
	
	// Finish all queues
	status = clFinish(subCmdQueue[0]);
	CHECK_OPENCL_ERROR(status, "clFinish failed. (subCmdQueue[0])");

	status = clFinish(subCmdQueue[1]);
	CHECK_OPENCL_ERROR(status, "clFinish failed. (subCmdQueue[1])");

	status = clFinish(gpuCmdQueue);
	CHECK_OPENCL_ERROR(status, "clFinish failed. (gpuCmdQueue)");

	status = clReleaseEvent(writeEvent);
	CHECK_OPENCL_ERROR(status, "clReleaseEvent failed. (writeEvent)");

	if (cpu2gpu || cpu2cpu)
	{
		status = clReleaseEvent(migrateEvents);
		CHECK_OPENCL_ERROR(status, "clReleaseEvent failed. (migrateEvents)");
	}
    return SDK_SUCCESS;
}
Exemple #4
0
int main(int argc, char *argv[])
{
    // selected platform and device number
    cl_uint pn = 0, dn = 0;

    // OpenCL error
    cl_int error;

    // generic iterator
    cl_uint i;

    // major/minor version of the platform OpenCL version
    cl_uint ocl_major, ocl_minor;

    // set platform/device num from command line
    if (argc > 1)
        pn = atoi(argv[1]);
    if (argc > 2)
        dn = atoi(argv[2]);

    error = clGetPlatformIDs(0, NULL, &np);
    CHECK_ERROR("getting amount of platform IDs");
    printf("%u platforms found\n", np);
    if (pn >= np) {
        fprintf(stderr, "there is no platform #%u\n" , pn);
        exit(1);
    }
    // only allocate for IDs up to the intended one
    platform = calloc(pn+1,sizeof(*platform));
    // if allocation failed, next call will bomb. rely on this
    error = clGetPlatformIDs(pn+1, platform, NULL);
    CHECK_ERROR("getting platform IDs");

    // choose platform
    p = platform[pn];

    error = clGetPlatformInfo(p, CL_PLATFORM_NAME, BUFSZ, strbuf, NULL);
    CHECK_ERROR("getting platform name");
    printf("using platform %u: %s\n", pn, strbuf);

    error = clGetPlatformInfo(p, CL_PLATFORM_VERSION, BUFSZ, strbuf, NULL);
    CHECK_ERROR("getting platform version");
    // we need 1.2 at least
    i = sscanf(strbuf, "OpenCL %u.%u ", &ocl_major, &ocl_minor);
    if (i != 2) {
        fprintf(stderr, "%s:%u: unable to determine platform OpenCL version\n",
                __func__, __LINE__);
        exit(1);
    }
    if (ocl_major == 1 && ocl_minor < 2) {
        fprintf(stderr, "%s:%u: Platform version %s is not at least 1.2\n",
                __func__, __LINE__, strbuf);
        exit(1);
    }

    error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &nd);
    CHECK_ERROR("getting amount of device IDs");
    printf("%u devices found\n", nd);
    if (dn >= nd) {
        fprintf(stderr, "there is no device #%u\n", dn);
        exit(1);
    }
    // only allocate for IDs up to the intended one
    device = calloc(dn+1,sizeof(*device));
    // if allocation failed, next call will bomb. rely on this
    error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, dn+1, device, NULL);
    CHECK_ERROR("getting device IDs");

    // choose device
    d = device[dn];
    error = clGetDeviceInfo(d, CL_DEVICE_NAME, BUFSZ, strbuf, NULL);
    CHECK_ERROR("getting device name");
    printf("using device %u: %s\n", dn, strbuf);

    error = clGetDeviceInfo(d, CL_DEVICE_GLOBAL_MEM_SIZE,
                            sizeof(gmem), &gmem, NULL);
    CHECK_ERROR("getting device global memory size");
    error = clGetDeviceInfo(d, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                            sizeof(alloc_max), &alloc_max, NULL);
    CHECK_ERROR("getting device max memory allocation size");

    // create context
    ctx_prop[1] = (cl_context_properties)p;
    ctx = clCreateContext(ctx_prop, 1, &d, NULL, NULL, &error);
    CHECK_ERROR("creating context");

    // create queue
    q = clCreateCommandQueue(ctx, d, CL_QUEUE_PROFILING_ENABLE, &error);
    CHECK_ERROR("creating queue");

    // create program
    pg = clCreateProgramWithSource(ctx, sizeof(src)/sizeof(*src), src, NULL, &error);
    CHECK_ERROR("creating program");

    // build program
    error = clBuildProgram(pg, 1, &d, NULL, NULL, NULL);
    CHECK_ERROR("building program");

    // get kernel
    k = clCreateKernel(pg, "add", &error);
    CHECK_ERROR("creating kernel");

    error = clGetKernelWorkGroupInfo(k, d, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
                                     sizeof(wgm), &wgm, NULL);
    CHECK_ERROR("getting preferred workgroup size multiple");

    // number of elements on which kernel will be launched. it's ok if we don't
    // cover every byte of the buffers
    nels = alloc_max/sizeof(cl_float);

    gws = ROUND_MUL(nels, wgm);

    printf("will use %zu workitems grouped by %zu to process %u elements\n",
           gws, wgm, nels);

    // we will try and allocate at least one buffer more than needed to fill
    // the device memory, and no less than 3 anyway
    nbuf = gmem/alloc_max + 1;
    if (nbuf < 3)
        nbuf = 3;

#define MB (1024*1024.0)

    printf("will try allocating %u buffers of %gMB each to overcommit %gMB\n",
           nbuf, alloc_max/MB, gmem/MB);

    buf = calloc(nbuf, sizeof(cl_mem));

    if (!buf) {
        fprintf(stderr, "could not prepare support for %u buffers\n", nbuf);
        exit(1);
    }

    for (i = 0; i < nbuf; ++i) {
        buf[i] = clCreateBuffer(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, alloc_max,
                                NULL, &error);
        CHECK_ERROR("allocating buffer");
        printf("buffer %u allocated\n", i);
    }

    // memset the first buffer
    hbuf = clEnqueueMapBuffer(q, buf[0], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                              0, alloc_max, 0, NULL, NULL, &error);
    CHECK_ERROR("mapping buffer 0");
    memset(hbuf, 0, alloc_max);
    error = clEnqueueUnmapMemObject(q, buf[0], hbuf, 0, NULL, NULL);
    CHECK_ERROR("unmapping buffer 0");
    hbuf = NULL;

    // use the buffers
    for (i = 1; i < nbuf; ++i) {
        printf("testing buffer %u\n", i);

        // for each buffer, we do a setup on CPU and then use it as second
        // argument for the kernel
        hbuf = clEnqueueMapBuffer(q, buf[i], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                                  0, alloc_max, 0, NULL, NULL, &error);
        CHECK_ERROR("mapping buffer");
        for (e = 0; e < nels; ++e)
            hbuf[e] = i;
        error = clEnqueueUnmapMemObject(q, buf[i], hbuf, 0, NULL, NULL);
        CHECK_ERROR("unmapping buffer");
        hbuf = NULL;

        // migrate previous buffer out of the GPU
        if (i > 1) {
            error = clEnqueueMigrateMemObjects(q, 1, buf + i-1,
                                               CL_MIGRATE_MEM_OBJECT_HOST | CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED,
                                               0, NULL, NULL);
            CHECK_ERROR("migrating previous buffer to host");
        }
        // make sure all pending actions are completed
        error =	clFinish(q);
        CHECK_ERROR("settling down");

        clSetKernelArg(k, 0, sizeof(buf[0]), buf);
        clSetKernelArg(k, 1, sizeof(buf[i]), buf + i);
        clSetKernelArg(k, 2, sizeof(nels), &nels);
        error = clEnqueueNDRangeKernel(q, k, 1, NULL, &gws, &wgm,
                                       0, NULL, &krn_evt);
        CHECK_ERROR("enqueueing kernel");

        expected = i*(i+1)/2.0f;
        hbuf = clEnqueueMapBuffer(q, buf[0], CL_TRUE, CL_MAP_READ,
                                  0, alloc_max, 1, &krn_evt, NULL, &error);
        CHECK_ERROR("mapping buffer 0");
        for (e = 0; e < nels; ++e)
            if (hbuf[e] != expected) {
                fprintf(stderr, "mismatch @ %u: %g instead of %g\n",
                        e, hbuf[e], expected);
                exit(1);
            }
        error = clEnqueueUnmapMemObject(q, buf[0], hbuf, 0, NULL, NULL);
        CHECK_ERROR("unmapping buffer 0");
        hbuf = NULL;
        clReleaseEvent(krn_evt); // free up the kernel event
    }

    for (i = 1; i <= nbuf; ++i) {
        clReleaseMemObject(buf[nbuf - i]);
        printf("buffer %u freed\n", nbuf  - i);
    }

    return 0;
}