int main(int argc, char ** argv) { // Load image SIPL::Image<float> * image = new SIPL::Image<float>("images/sunset.jpg"); // Create OpenCL context Context context = createCLContextFromArguments(argc, argv); // Compile OpenCL code Program program = buildProgramFromSource(context, "gaussian_blur.cl"); // Select device and create a command queue for it VECTOR_CLASS<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); CommandQueue queue = CommandQueue(context, devices[0]); // Create an OpenCL Image / texture and transfer data to the device Image2D clImage = Image2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ImageFormat(CL_R, CL_FLOAT), image->getWidth(), image->getHeight(), 0, (void*)image->getData()); // Create a buffer for the result Buffer clResult = Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float)*image->getWidth()*image->getHeight()); // Create Gaussian mask int maskSize; float * mask = createBlurMask(10.0f, &maskSize); // Create buffer for mask and transfer it to the device Buffer clMask = Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)*(maskSize*2+1)*(maskSize*2+1), mask); // Run Gaussian kernel Kernel gaussianBlur = Kernel(program, "gaussian_blur"); gaussianBlur.setArg(0, clImage); gaussianBlur.setArg(1, clMask); gaussianBlur.setArg(2, clResult); gaussianBlur.setArg(3, maskSize); queue.enqueueNDRangeKernel( gaussianBlur, NullRange, NDRange(image->getWidth(), image->getHeight()), NullRange ); // Transfer image back to host float* data = new float[image->getWidth()*image->getHeight()]; queue.enqueueReadBuffer(clResult, CL_TRUE, 0, sizeof(float)*image->getWidth()*image->getHeight(), data); image->setData(data); // Save image to disk image->save("images/result.jpg", "jpeg"); image->display(); }
int main(int argc, char * argv[]) { const std::string hw("Hello WorldCL\n"); char * outH = new char [hw.length()+1]; ImageCL i; i.create_cl_context(); Buffer out_buffer; int opts = CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR; i.create_cl_kernel_buffer(out_buffer, opts, hw.length()+1); // device handles vector<Device> devices; devices = i.get_devices_list(); // load kernel code Kernel k = i.load_cl_kernel_file("image.cl", "hello", out_buffer); CommandQueue q = i.run_kernel(k, devices[0], hw.length()+1); // blocks until finished q.enqueueReadBuffer(out_buffer, CL_TRUE, 0, hw.length()+1, (void*)&hw); std::cout << hw; return 0; }
int main(int argc, char **argv) { srand((unsigned)time(NULL)); Kernel kernel; CommandQueue queue; Context context; { std::vector<Platform> platformList; Platform::get(&platformList); clog << "Platform number is: " << platformList.size() << endl; std::string platformVendor; platformList[0].getInfo((cl_platform_info)CL_PLATFORM_VENDOR, &platformVendor); clog << "Platform is by: " << platformVendor << "\n"; cl_context_properties cprops[] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platformList[0](), 0 }; context = Context(GET_TARGET_PLATFORM, cprops); std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); queue = CommandQueue(context, devices[0]); std::string sourceCode = "#include \"es.cl\"\n"; Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1)); Program program = Program(context, source); try { program.build(devices, "-I."); } catch (Error &) { std::string errors; program.getBuildInfo(devices[0], CL_PROGRAM_BUILD_LOG, &errors); std::cerr << "Build log: " << endl << errors << endl; return 1; } kernel = Kernel(program, "es"); } individual *individuals = new individual[LAMBDA]; for (int i = 0; i < LAMBDA; i++) { for (int j = 0; j < DIM; ++j) { individuals[i].x[j] = (rand()/((float)RAND_MAX)) * (XMAX-XMIN) + XMIN; individuals[i].s[j] = (XMAX-XMIN) / 6.f; } for (int j = 0; j < DIM_A; ++j) { individuals[i].a[j] = (rand()/((float)RAND_MAX)) * (2*PI) - PI; } individuals[i].fitness = 0; } float gbest = std::numeric_limits<float>::infinity(), xbest[DIM]; Buffer esBuffer = Buffer(context, 0, INDIVIDUALS_SIZE); Event ev; queue.enqueueMapBuffer(esBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, INDIVIDUALS_SIZE); for (int i = 0; i < 1000; i++) { queue.enqueueWriteBuffer(esBuffer, CL_TRUE, 0, INDIVIDUALS_SIZE, individuals); kernel.setArg(1, (cl_ulong)rand()); kernel.setArg(0, esBuffer); queue.enqueueNDRangeKernel(kernel, NullRange, NDRange(LAMBDA), NDRange(1), NULL, &ev); ev.wait(); queue.enqueueReadBuffer(esBuffer, CL_TRUE, 0, INDIVIDUALS_SIZE, individuals); std::sort(individuals, individuals + LAMBDA, individual_comp); individual mean = get_mean(individuals); for (int j = 0; j < LAMBDA; ++j) { individuals[j] = mean; } } gbest = individuals[0].fitness; for (int i = 0; i < DIM; ++i) xbest[i] = individuals[0].x[i]; clog << "Best value " << gbest << " found at ("; for (int i = 0; i < DIM; ++i) clog << xbest[i] << (i == DIM-1 ? ")" : ", "); clog << "\n"; clog << "Our computation estemates it: f(" << xbest[0] << ", ..., " << xbest[DIM-1] << ") = " << es_f(xbest) << endl; delete[] individuals; return 0; }
/****************************************************************************** * main ******************************************************************************/ int main(int argc, char *argv[]) { /*------------------------------------------------------------------------- * Catch ctrl-c so we ensure that we call dtors and the dsp is reset properly *------------------------------------------------------------------------*/ signal(SIGABRT, exit); signal(SIGTERM, exit); struct timespec tp_start, tp_end; try { /*------------------------------------------------------------------------ * One time OpenCL Setup *-----------------------------------------------------------------------*/ Context context(CL_DEVICE_TYPE_ALL); std::vector<Device> devices(context.getInfo<CL_CONTEXT_DEVICES>()); CommandQueue *QcpuIO = NULL; CommandQueue *QcpuOO = NULL; CommandQueue *QdspOO = NULL; std::vector<Device> dspDevices; for (int d = 0; d < devices.size(); d++) { cl_device_type type; devices[d].getInfo(CL_DEVICE_TYPE, &type); if (type == CL_DEVICE_TYPE_CPU) { QcpuIO = new CommandQueue(context, devices[d], PROFILE); QcpuOO = new CommandQueue(context, devices[d], PROFILE|OOOEXEC); } else if (type == CL_DEVICE_TYPE_ACCELERATOR) { QdspOO = new CommandQueue(context, devices[d], PROFILE|OOOEXEC); dspDevices.push_back(devices[d]); } } if (QcpuIO == NULL) { std::cout << "CPU devices are not fully supported in the current" << std::endl << "OpenCL implementation (native kernel support only)." << std::endl << "As a result, CPU devices are not enabled by" << std::endl << "default. This example uses OpenCL CPU native" << std::endl << "kernels and can be run with the CPU device enabled." << std::endl << "To enable a CPU device define the environment variable" << std::endl << "'TI_OCL_CPU_DEVICE_ENABLE' before running the example." << std::endl; exit(-1); } assert(QdspOO != NULL); Program::Sources source (1, std::make_pair(kernStr, strlen(kernStr))); Program program(Program(context, source)); program.build(dspDevices); Kernel K(program, "compute"); /*------------------------------------------------------------------------ * Define a Buffer for each possible in flight task *-----------------------------------------------------------------------*/ std::vector<BufUP> bufs; for (int i = 0; i < inflight; ++i) bufs.push_back(BufUP(new Buffer(context, CL_MEM_READ_WRITE, size))); /*------------------------------------------------------------------------ * Define a 3-D vector of OpenCL Events. 1st dim is for the number of * in flight tasks, the second dim is for the processing stages of a single * task. The 3rd dim is an artifact of the c++ binding for event wait * lists. All enqueue API's take a wait list which is a vector<Event>*, and * they take an Event*. All events in the wait list vector must complete, * before this event will execute. The single event argument is for the * event that will be set as a result of this enqueue. *-----------------------------------------------------------------------*/ vecVecVecEv evt(inflight, vecVecEv(STAGES, vecEv(1))); /*------------------------------------------------------------------------ * Enqueue a dummy DSP kernel call to force the OpenCL lazy execution * to go ahead and compile the kernel and load it. This will prevent the * compile and load times from skewing the reported numbers. This is not * needed by the algorithm and is purely a tactic to get consistent numbers * from the the running of the bulk of this algorithm *-----------------------------------------------------------------------*/ K.setArg(0, *bufs[0]); K.setArg(1, 0); QdspOO->enqueueTask(K); K.setArg(1, elements); clock_gettime(CLOCK_MONOTONIC, &tp_start); /*------------------------------------------------------------------------ * Iterate for as many tasks as there are *-----------------------------------------------------------------------*/ for (int i = 0; i < tasks; ++i) { /*--------------------------------------------------------------------- * Variables to ensure that this iteration is using the correct circular * resources: i.e. buffers and arrays. *--------------------------------------------------------------------*/ int circIdx = i % inflight; Buffer &buf(*bufs[circIdx]); int *ary(arys [circIdx]); Event nullEv; /*--------------------------------------------------------------------- * Native kernels are only passed a single pointer, so define a structure * that contains the actual arguments, populate that and then create * a C++ binding native argument class that has the pointer and a size. *--------------------------------------------------------------------*/ arguments_t proArgs = { ary, elements, i, i }; arguments_t conArgs = { ary, elements, i+1, i }; native_arg_t proNargs(&proArgs, sizeof(proArgs)); native_arg_t conNargs(&conArgs, sizeof(conArgs)); K.setArg(0, buf); /*--------------------------------------------------------------------- * Since we are reusing N sets of buffers in this loop, we need to make * sure than iteration I does not start until after iteration I-N * completes. Iterations < N can start immediately. *--------------------------------------------------------------------*/ vecEv *start_waits = (i < inflight) ? 0 : &evt[circIdx][CNS]; evt[circIdx][PRD][0] = nullEv; evt[circIdx][WRT][0] = nullEv; evt[circIdx][CMP][0] = nullEv; evt[circIdx][RD ][0] = nullEv; QcpuOO->enqueueNativeKernel(cpu_produce, proNargs, 0, 0, start_waits, &evt[circIdx][PRD][0]); evt[circIdx][CNS][0] = nullEv; QdspOO->enqueueWriteBuffer (buf, CL_FALSE, 0, size, ary, &evt[circIdx][PRD], &evt[circIdx][WRT][0]); QdspOO->enqueueTask (K, &evt[circIdx][WRT], &evt[circIdx][CMP][0]); QdspOO->enqueueReadBuffer (buf, CL_FALSE, 0, size, ary, &evt[circIdx][CMP], &evt[circIdx][RD ][0]); QcpuIO->enqueueNativeKernel(cpu_consume, conNargs, 0, 0, &evt[circIdx][RD ], &evt[circIdx][CNS][0]); } /*------------------------------------------------------------------------ * Only need to wait for the CPU In Order queue to finish, since all all * other enqueue events must finish before the CPU IO queue can finish *-----------------------------------------------------------------------*/ QcpuIO->finish(); delete QcpuIO; delete QcpuOO; delete QdspOO; clock_gettime(CLOCK_MONOTONIC, &tp_end); double elapsed = clock_diff (&tp_start, &tp_end); printf("Elapsed : %8.6f secs\n", elapsed); /*------------------------------------------------------------------------ * After the running is complete, report timing for each step *-----------------------------------------------------------------------*/ #if PROFILE cl_ulong ref; evt[0][0][0].getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &ref); for (int i = 0; i < inflight; ++i) { for (int s = 0; s < STAGES; ++s) ocl_relative_times(evt[i][s][0], stage_names[s], ref); cout << endl; } #endif } catch (Error err) { cerr << "ERROR: " << err.what() << "(" << ocl_decode_error(err.err()) << ")" << endl; incorrect_results = true; } if (incorrect_results) return -1; }