/// Enqueues a command to migrate \p mem_objects. /// /// \see_opencl_ref{clEnqueueMigrateMemObjects} /// /// \opencl_version_warning{1,2} event enqueue_migrate_memory_objects(uint_ num_mem_objects, const cl_mem *mem_objects, cl_mem_migration_flags flags, const wait_list &events = wait_list()) { BOOST_ASSERT(m_queue != 0); event event_; cl_int ret = clEnqueueMigrateMemObjects( m_queue, num_mem_objects, mem_objects, flags, events.size(), events.get_event_ptr(), &event_.get() ); if(ret != CL_SUCCESS){ BOOST_THROW_EXCEPTION(opencl_error(ret)); } return event_; }
JNIEXPORT jint JNICALL Java_org_lwjgl_opencl_CL12_nclEnqueueMigrateMemObjects(JNIEnv *env, jclass clazz, jlong command_queue, jint num_mem_objects, jlong mem_objects, jlong flags, jint num_events_in_wait_list, jlong event_wait_list, jlong event, jlong function_pointer) { const cl_mem *mem_objects_address = (const cl_mem *)(intptr_t)mem_objects; const cl_event *event_wait_list_address = (const cl_event *)(intptr_t)event_wait_list; cl_event *event_address = (cl_event *)(intptr_t)event; clEnqueueMigrateMemObjectsPROC clEnqueueMigrateMemObjects = (clEnqueueMigrateMemObjectsPROC)((intptr_t)function_pointer); cl_int __result = clEnqueueMigrateMemObjects((cl_command_queue)(intptr_t)command_queue, num_mem_objects, mem_objects_address, flags, num_events_in_wait_list, event_wait_list_address, event_address); return __result; }
int DeviceFission::runCLALLKerenls() { cl_int status; cl_event writeEvent; cl_event migrateEvents; cl_event rangeEvent[2]; // Set global and local work items size_t globalThreads[] = {half_length}; size_t localThreads[] = {groupSize}; // Enqueue write Buffer to the first sub device queue status = clEnqueueWriteBuffer(subCmdQueue[0], InBuf, CL_FALSE, 0, half_length* sizeof(cl_int), input, 0, NULL, &writeEvent); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed"); cl_uint rangeEventNum = 0; if (cpu2cpu) { status = clEnqueueMigrateMemObjects(subCmdQueue[1], 1, &InBuf, CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED , 1, &writeEvent, &migrateEvents); CHECK_OPENCL_ERROR(status, "clEnqueueMigrateMemObjects failed."); rangeEvent[0] = writeEvent; rangeEvent[1] = migrateEvents; rangeEventNum++; } else if(cpu2gpu) { status = clEnqueueMigrateMemObjects(gpuCmdQueue, 1, &InBuf, CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED , 1, &writeEvent, &migrateEvents); CHECK_OPENCL_ERROR(status, "clEnqueueMigrateMemObjects failed."); rangeEvent[0] = rangeEvent[1] = migrateEvents; rangeEventNum++; } else { rangeEvent[0] = rangeEvent[1] = writeEvent; rangeEventNum++; } for(cl_uint i = 0; i < numSubDevices; ++i) { // Set subOutBuf as second argument status = clSetKernelArg(subKernel[i], 1, sizeof(cl_mem), (void*)&subOutBuf[i]); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (subOutBuf)"); // Set InBuf as first argument status = clSetKernelArg(subKernel[i], 0, sizeof(cl_mem),(void*)&InBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (InBuf)"); // Enqueue kernel status = clEnqueueNDRangeKernel(subCmdQueue[i], subKernel[i], 1, NULL, globalThreads, localThreads, rangeEventNum, &rangeEvent[i], NULL); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.(subCmdQueue)"); // Enqueue readBuffer status = clEnqueueReadBuffer(subCmdQueue[i], subOutBuf[i], CL_FALSE, 0, half_length * sizeof(cl_int), subOutput + half_length * i, 0, NULL, NULL); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed. (subCmdQueue)"); // Set gpuOutBuf as second argument status = clSetKernelArg(gpuKernel[i], 1, sizeof(cl_mem), (void*)&gpuOutBuf[i]); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (gpuOutBuf)"); // Set InBuf as first argument status = clSetKernelArg(gpuKernel[i], 0, sizeof(cl_mem),(void*)&InBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (InBuf)"); // Enqueue kernel to gpuCmdQueue status = clEnqueueNDRangeKernel(gpuCmdQueue, gpuKernel[i], 1, NULL, globalThreads, localThreads, 0, NULL, NULL); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.(gpuCmdQueue)"); // Enqueue readBuffer to gpuCmdQueue status = clEnqueueReadBuffer(gpuCmdQueue, gpuOutBuf[i], CL_FALSE, 0, half_length * sizeof(cl_int), gpuOutput + half_length * i, 0, NULL, NULL); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed. (gpuCmdQueue)"); } // Flush all queues together status = clFlush(gpuCmdQueue); CHECK_OPENCL_ERROR(status, "clFlush failed. (gpuCmdQueue)"); for(cl_uint i = 0; i < numSubDevices; ++i) { status = clFlush(subCmdQueue[i]); CHECK_OPENCL_ERROR(status, "clFlush failed. (subCmdQueue)"); } // Finish all queues status = clFinish(subCmdQueue[0]); CHECK_OPENCL_ERROR(status, "clFinish failed. (subCmdQueue[0])"); status = clFinish(subCmdQueue[1]); CHECK_OPENCL_ERROR(status, "clFinish failed. (subCmdQueue[1])"); status = clFinish(gpuCmdQueue); CHECK_OPENCL_ERROR(status, "clFinish failed. (gpuCmdQueue)"); status = clReleaseEvent(writeEvent); CHECK_OPENCL_ERROR(status, "clReleaseEvent failed. (writeEvent)"); if (cpu2gpu || cpu2cpu) { status = clReleaseEvent(migrateEvents); CHECK_OPENCL_ERROR(status, "clReleaseEvent failed. (migrateEvents)"); } return SDK_SUCCESS; }
int main(int argc, char *argv[]) { // selected platform and device number cl_uint pn = 0, dn = 0; // OpenCL error cl_int error; // generic iterator cl_uint i; // major/minor version of the platform OpenCL version cl_uint ocl_major, ocl_minor; // set platform/device num from command line if (argc > 1) pn = atoi(argv[1]); if (argc > 2) dn = atoi(argv[2]); error = clGetPlatformIDs(0, NULL, &np); CHECK_ERROR("getting amount of platform IDs"); printf("%u platforms found\n", np); if (pn >= np) { fprintf(stderr, "there is no platform #%u\n" , pn); exit(1); } // only allocate for IDs up to the intended one platform = calloc(pn+1,sizeof(*platform)); // if allocation failed, next call will bomb. rely on this error = clGetPlatformIDs(pn+1, platform, NULL); CHECK_ERROR("getting platform IDs"); // choose platform p = platform[pn]; error = clGetPlatformInfo(p, CL_PLATFORM_NAME, BUFSZ, strbuf, NULL); CHECK_ERROR("getting platform name"); printf("using platform %u: %s\n", pn, strbuf); error = clGetPlatformInfo(p, CL_PLATFORM_VERSION, BUFSZ, strbuf, NULL); CHECK_ERROR("getting platform version"); // we need 1.2 at least i = sscanf(strbuf, "OpenCL %u.%u ", &ocl_major, &ocl_minor); if (i != 2) { fprintf(stderr, "%s:%u: unable to determine platform OpenCL version\n", __func__, __LINE__); exit(1); } if (ocl_major == 1 && ocl_minor < 2) { fprintf(stderr, "%s:%u: Platform version %s is not at least 1.2\n", __func__, __LINE__, strbuf); exit(1); } error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &nd); CHECK_ERROR("getting amount of device IDs"); printf("%u devices found\n", nd); if (dn >= nd) { fprintf(stderr, "there is no device #%u\n", dn); exit(1); } // only allocate for IDs up to the intended one device = calloc(dn+1,sizeof(*device)); // if allocation failed, next call will bomb. rely on this error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, dn+1, device, NULL); CHECK_ERROR("getting device IDs"); // choose device d = device[dn]; error = clGetDeviceInfo(d, CL_DEVICE_NAME, BUFSZ, strbuf, NULL); CHECK_ERROR("getting device name"); printf("using device %u: %s\n", dn, strbuf); error = clGetDeviceInfo(d, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(gmem), &gmem, NULL); CHECK_ERROR("getting device global memory size"); error = clGetDeviceInfo(d, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(alloc_max), &alloc_max, NULL); CHECK_ERROR("getting device max memory allocation size"); // create context ctx_prop[1] = (cl_context_properties)p; ctx = clCreateContext(ctx_prop, 1, &d, NULL, NULL, &error); CHECK_ERROR("creating context"); // create queue q = clCreateCommandQueue(ctx, d, CL_QUEUE_PROFILING_ENABLE, &error); CHECK_ERROR("creating queue"); // create program pg = clCreateProgramWithSource(ctx, sizeof(src)/sizeof(*src), src, NULL, &error); CHECK_ERROR("creating program"); // build program error = clBuildProgram(pg, 1, &d, NULL, NULL, NULL); CHECK_ERROR("building program"); // get kernel k = clCreateKernel(pg, "add", &error); CHECK_ERROR("creating kernel"); error = clGetKernelWorkGroupInfo(k, d, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(wgm), &wgm, NULL); CHECK_ERROR("getting preferred workgroup size multiple"); // number of elements on which kernel will be launched. it's ok if we don't // cover every byte of the buffers nels = alloc_max/sizeof(cl_float); gws = ROUND_MUL(nels, wgm); printf("will use %zu workitems grouped by %zu to process %u elements\n", gws, wgm, nels); // we will try and allocate at least one buffer more than needed to fill // the device memory, and no less than 3 anyway nbuf = gmem/alloc_max + 1; if (nbuf < 3) nbuf = 3; #define MB (1024*1024.0) printf("will try allocating %u buffers of %gMB each to overcommit %gMB\n", nbuf, alloc_max/MB, gmem/MB); buf = calloc(nbuf, sizeof(cl_mem)); if (!buf) { fprintf(stderr, "could not prepare support for %u buffers\n", nbuf); exit(1); } for (i = 0; i < nbuf; ++i) { buf[i] = clCreateBuffer(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, alloc_max, NULL, &error); CHECK_ERROR("allocating buffer"); printf("buffer %u allocated\n", i); } // memset the first buffer hbuf = clEnqueueMapBuffer(q, buf[0], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, alloc_max, 0, NULL, NULL, &error); CHECK_ERROR("mapping buffer 0"); memset(hbuf, 0, alloc_max); error = clEnqueueUnmapMemObject(q, buf[0], hbuf, 0, NULL, NULL); CHECK_ERROR("unmapping buffer 0"); hbuf = NULL; // use the buffers for (i = 1; i < nbuf; ++i) { printf("testing buffer %u\n", i); // for each buffer, we do a setup on CPU and then use it as second // argument for the kernel hbuf = clEnqueueMapBuffer(q, buf[i], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, alloc_max, 0, NULL, NULL, &error); CHECK_ERROR("mapping buffer"); for (e = 0; e < nels; ++e) hbuf[e] = i; error = clEnqueueUnmapMemObject(q, buf[i], hbuf, 0, NULL, NULL); CHECK_ERROR("unmapping buffer"); hbuf = NULL; // migrate previous buffer out of the GPU if (i > 1) { error = clEnqueueMigrateMemObjects(q, 1, buf + i-1, CL_MIGRATE_MEM_OBJECT_HOST | CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED, 0, NULL, NULL); CHECK_ERROR("migrating previous buffer to host"); } // make sure all pending actions are completed error = clFinish(q); CHECK_ERROR("settling down"); clSetKernelArg(k, 0, sizeof(buf[0]), buf); clSetKernelArg(k, 1, sizeof(buf[i]), buf + i); clSetKernelArg(k, 2, sizeof(nels), &nels); error = clEnqueueNDRangeKernel(q, k, 1, NULL, &gws, &wgm, 0, NULL, &krn_evt); CHECK_ERROR("enqueueing kernel"); expected = i*(i+1)/2.0f; hbuf = clEnqueueMapBuffer(q, buf[0], CL_TRUE, CL_MAP_READ, 0, alloc_max, 1, &krn_evt, NULL, &error); CHECK_ERROR("mapping buffer 0"); for (e = 0; e < nels; ++e) if (hbuf[e] != expected) { fprintf(stderr, "mismatch @ %u: %g instead of %g\n", e, hbuf[e], expected); exit(1); } error = clEnqueueUnmapMemObject(q, buf[0], hbuf, 0, NULL, NULL); CHECK_ERROR("unmapping buffer 0"); hbuf = NULL; clReleaseEvent(krn_evt); // free up the kernel event } for (i = 1; i <= nbuf; ++i) { clReleaseMemObject(buf[nbuf - i]); printf("buffer %u freed\n", nbuf - i); } return 0; }