unsigned initExecutionBluesteins(const unsigned size, const unsigned m) { allocateHostMemoryBluesteins(size, m); if (deviceCount) { printf("Initializing device(s).." ); // create the OpenCL context on available GPU devices init_cl_context(CL_DEVICE_TYPE_GPU); const cl_uint ciDeviceCount = getDeviceCount(); if (!ciDeviceCount) { printf("No opencl specific devices!\n"); return 0; } printf("Creating Command Queue...\n"); // create a command queue on device 1 for (unsigned i = 0; i < deviceCount; ++i) { createCommandQueue(i); } } return 1; }
/**
 * Constructs the wrapper for the requested device type.
 *
 * The device type is stored first; the setup steps below then run in a fixed
 * order (platform, device, context, command queue) and the constructor ends
 * by calling printOpenCLInfo().
 *
 * @param device_type  OpenCL device type kept in _device_type.
 */
ClWrapper::ClWrapper(cl_device_type device_type)
    : _device_type(device_type)
{
    // Setup sequence -- presumably each step relies on the previous one, so
    // the order must not be changed.
    createPlatform();
    createDevice();
    createContext();
    createCommandQueue();

    // Report what was set up.
    printOpenCLInfo();
}
int main(int argc, char** argv) { xdl::IPXdevLCore core = xdl::createCore(); core->plug(xdl::XdevLPluginName("XdevLComputeDeviceCL"), xdl::XdevLVersion(0,1,0)); auto computeDevice = xdl::createModule<xdl::IPXdevLComputeDevice>(core, xdl::XdevLModuleName("XdevLComputeDevice"), xdl::XdevLID("MyComputeDevice")); // We need a context. auto context = computeDevice->createContext(); // We need a command queue to run commands. auto commandQueue = context->createCommandQueue(); auto program = context->createProgram(); auto inBuffer = context->createBuffer(xdl::XDEVL_COMPUTE_BUFFER_READ_ONLY, sizeof(float) * 10); auto outBuffer = context->createBuffer(xdl::XDEVL_COMPUTE_BUFFER_WRITE_ONLY, sizeof(float) * 10); // // Load and build the kernel. // auto kernel = program->compileFromFile(xdl::XdevLFileName("compute_device_demo.cl"), xdl::XdevLString("calculate_sqrt")); for(int a = 0; a < 100; a++) { kernel->setArgumentBuffer(0, inBuffer); kernel->setArgumentBuffer(1, outBuffer); kernel->setArgumentFloat(2, 2); inBuffer->upload(commandQueue.get(), sizeof(float) * 10, (xdl::xdl_uint8*)data); xdl::XdevLComputeExecuteParameter para(commandQueue.get(), kernel.get(), {32}); program->execute(para); // std::cout << "Before: " << std::endl; // for(auto item : data) { // std::cout << item << " : "; // } // std::cout << std::endl; outBuffer->download(commandQueue.get(), sizeof(float) * 10, (xdl::xdl_uint8*)data); std::cout << "After: " << std::endl; for(auto item : data) { std::cout << item << " : "; } std::cout << std::endl; } xdl::destroyCore(core); }
float sgemmMain(int rowa,int cola,int colb) { cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; cl_kernel kernel = 0; const unsigned int numberOfMemoryObjects = 3; cl_mem memoryObjectsa = 0; cl_mem memoryObjectsb = 0; cl_mem memoryObjectsc = 0; cl_int errorNumber; cl_uint clrowa = rowa; cl_uint clcola = cola; cl_uint clcolb = colb; int err; err = createContext(&context); LOGD("create context"); err = createCommandQueue(context, &commandQueue, &device); err = createProgram(context, device, "/mnt/sdcard/kernel/sgemm.cl", &program); kernel = clCreateKernel(program, "sgemm", &errorNumber); LOGD("createKernel code %d",errorNumber); LOGD("start computing"); float alpha = 1; float beta = 0.1; /* Create the matrices. */ size_t matrixSizea = rowa * cola; size_t matrixSizeb = cola * colb; size_t matrixSizec = rowa * colb; /* As all the matrices have the same size, the buffer size is common. */ size_t bufferSizea = matrixSizea * sizeof(float); size_t bufferSizeb = matrixSizeb * sizeof(float); size_t bufferSizec = matrixSizec * sizeof(float); /* Create buffers for the matrices used in the kernel. 
*/ int createMemoryObjectsSuccess = 0; memoryObjectsa = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSizea, NULL, &errorNumber); createMemoryObjectsSuccess &= errorNumber; memoryObjectsb = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSizeb, NULL, &errorNumber); createMemoryObjectsSuccess &= errorNumber; memoryObjectsc = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bufferSizec, NULL, &errorNumber); createMemoryObjectsSuccess &= errorNumber; LOGD("create memory err %d",createMemoryObjectsSuccess); int mapMemoryObjectsSuccess = 0; cl_float* matrixA = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsa, CL_TRUE, CL_MAP_WRITE, 0, bufferSizea, 0, NULL, NULL, &errorNumber); mapMemoryObjectsSuccess &= errorNumber; cl_float* matrixB = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsb, CL_TRUE, CL_MAP_WRITE, 0, bufferSizeb, 0, NULL, NULL, &errorNumber); mapMemoryObjectsSuccess &= errorNumber; cl_float* matrixC = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsc, CL_TRUE, CL_MAP_WRITE, 0, bufferSizec, 0, NULL, NULL, &errorNumber); mapMemoryObjectsSuccess &= errorNumber; LOGD("map memory err %d",mapMemoryObjectsSuccess); sgemmInitialize(rowa,cola,colb, matrixA, matrixB, matrixC); LOGD("data initial finish"); int unmapMemoryObjectsSuccess = 0; errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsa, matrixA, 0, NULL, NULL); LOGD("memory code %d",errorNumber); unmapMemoryObjectsSuccess &= errorNumber; errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsb, matrixB, 0, NULL, NULL); LOGD("memory code %d",errorNumber); unmapMemoryObjectsSuccess &= errorNumber; errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsc, matrixC, 0, NULL, NULL); LOGD("memory code %d",errorNumber); unmapMemoryObjectsSuccess &= errorNumber; LOGD("unmap memory err %d",unmapMemoryObjectsSuccess); int setKernelArgumentsSuccess = 0; errorNumber = clSetKernelArg(kernel, 0, 
sizeof(cl_mem), &memoryObjectsa); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjectsb); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjectsc); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 3, sizeof(cl_uint), &clrowa); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 4, sizeof(cl_uint), &clcola); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 5, sizeof(cl_uint), &clcolb); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 6, sizeof(cl_float), &alpha); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 7, sizeof(cl_float), &beta); setKernelArgumentsSuccess &= errorNumber; LOGD("setKernel err %d",setKernelArgumentsSuccess); LOGD("start running kernel"); clock_t start_t,end_t; float cost_time; start_t = clock(); cl_event event = 0; size_t globalWorksize[2] = {rowa, colb}; errorNumber = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorksize, NULL, 0, NULL, &event); //LOGD("Enqueue err code %d",errorNumber); errorNumber = clFinish(commandQueue); end_t = clock(); cost_time = (float)(end_t-start_t)/CLOCKS_PER_SEC*1000; LOGD("Finish err code %d",errorNumber); float time; time = printProfilingInfo(event); LOGT("using CPU clock: %f ms",cost_time); LOGT("using GPU clock: %f ms",time); clReleaseEvent(event); matrixC = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsc, CL_TRUE, CL_MAP_READ, 0, bufferSizec, 0, NULL, NULL, &errorNumber); clEnqueueUnmapMemObject(commandQueue, memoryObjectsc, matrixC, 0, NULL, NULL); LOGD("read out matrixC finish"); LOGD("matrixC value C(0,0): %f",matrixC[0]); cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjectsa, memoryObjectsb,memoryObjectsc,numberOfMemoryObjects); LOGD("RUNNING finsh"); return time; }
// Convenience overload taking only the exception state: forwards to the
// three-argument createCommandQueue() with a null device and properties 0.
// NOTE(review): how the three-argument overload treats a null device is not
// visible here -- confirm before relying on a particular default.
PassRefPtr<WebCLCommandQueue> WebCLContext::createCommandQueue(ExceptionState& es) { return createCommandQueue(nullptr, 0, es); }
// Convenience overload for a caller-supplied device: forwards to the
// three-argument createCommandQueue() with |device| and properties 0.
// |es| is passed through unchanged.
PassRefPtr<WebCLCommandQueue> WebCLContext::createCommandQueue(WebCLDevice* device, ExceptionState& es) { return createCommandQueue(device, 0, es); }
// Convenience overload for caller-supplied properties: forwards to the
// three-argument createCommandQueue() with a null device and |properties|.
// |es| is passed through unchanged.
PassRefPtr<WebCLCommandQueue> WebCLContext::createCommandQueue(int properties, ExceptionState& es) { return createCommandQueue(nullptr, properties, es); }