/****************************************************************************** * main ******************************************************************************/ int main(int argc, char *argv[]) { /*------------------------------------------------------------------------- * Catch ctrl-c so we ensure that we call dtors and the dsp is reset properly *------------------------------------------------------------------------*/ signal(SIGABRT, exit); signal(SIGTERM, exit); struct timespec tp_start, tp_end; try { /*------------------------------------------------------------------------ * One time OpenCL Setup *-----------------------------------------------------------------------*/ Context context(CL_DEVICE_TYPE_ALL); std::vector<Device> devices(context.getInfo<CL_CONTEXT_DEVICES>()); CommandQueue *QcpuIO = NULL; CommandQueue *QcpuOO = NULL; CommandQueue *QdspOO = NULL; std::vector<Device> dspDevices; for (int d = 0; d < devices.size(); d++) { cl_device_type type; devices[d].getInfo(CL_DEVICE_TYPE, &type); if (type == CL_DEVICE_TYPE_CPU) { QcpuIO = new CommandQueue(context, devices[d], PROFILE); QcpuOO = new CommandQueue(context, devices[d], PROFILE|OOOEXEC); } else if (type == CL_DEVICE_TYPE_ACCELERATOR) { QdspOO = new CommandQueue(context, devices[d], PROFILE|OOOEXEC); dspDevices.push_back(devices[d]); } } if (QcpuIO == NULL) { std::cout << "CPU devices are not fully supported in the current" << std::endl << "OpenCL implementation (native kernel support only)." << std::endl << "As a result, CPU devices are not enabled by " << std::endl << "default. This example uses OpenCL CPU native" << std::endl << "kernels and can be run with the CPU device enabled." << std::endl << "To enable a CPU device define the environment variable" << std::endl << "'TI_OCL_CPU_DEVICE_ENABLE' before running the example." << std::endl; exit(-1); } assert(QdspOO != NULL); Program::Sources source (1, std::make_pair(kernStr, strlen(kernStr))); Program program(Program(context, source)); program.build(dspDevices); Kernel K(program, "compute"); K.setArg(1, elements); /*------------------------------------------------------------------------ * Define a Buffer for each possible in flight task *-----------------------------------------------------------------------*/ std::vector<BufUP> bufs; for (int i = 0; i < inflight; ++i) bufs.push_back(BufUP(new Buffer(context, CL_MEM_READ_WRITE, size))); /*------------------------------------------------------------------------ * Define a 3-D vector of OpenCL Events. 1st dim is for the number of * in flight tasks, the second dim is for the processing stages of a single * task. The 3rd dim is an artifact of the c++ binding for event wait * lists. All enqueue API's take a wait list which is a vector<Event>*, and * they take an Event*. All events in the wait list vector must complete, * before this event will execute. The single event argument is for the * event that will be set as a result of this enqueue. *-----------------------------------------------------------------------*/ vecVecVecEv evt(inflight, vecVecEv(STAGES, vecEv(1))); clock_gettime(CLOCK_MONOTONIC, &tp_start); /*------------------------------------------------------------------------ * Iterate for as many tasks as there are *-----------------------------------------------------------------------*/ for (int i = 0; i < tasks; ++i) { /*--------------------------------------------------------------------- * Variables to ensure that this iteration is using the correct circular * resources: i.e. buffers and arrays. *--------------------------------------------------------------------*/ int circIdx = i % inflight; Buffer &buf(*bufs[circIdx]); int *ary(arys [circIdx]); Event nullEv; K.setArg(0, buf); /*--------------------------------------------------------------------- * Since we are reusing N sets of buffers in this loop, we need to make * sure than iteration I does not start until after iteration I-N * completes. Iterations < N can start immediately. *--------------------------------------------------------------------*/ int eIdx = circIdx; vecEv *start_waits = (i < inflight) ? 0 : &evt[eIdx][RUM]; evt[circIdx][WMP][0] = nullEv; evt[circIdx][PRD][0] = nullEv; evt[circIdx][WUM][0] = nullEv; evt[circIdx][CMP][0] = nullEv; evt[circIdx][RMP][0] = nullEv; evt[circIdx][CNS][0] = nullEv; int *p = (int*)QdspOO->enqueueMapBuffer(buf, CL_FALSE, CL_MAP_WRITE, 0, size, start_waits, &evt[eIdx][WMP][0]); evt[circIdx][RUM][0] = nullEv; /*--------------------------------------------------------------------- * Native kernels are only passed a single pointer, so define a structure * that contains the actual arguments, populate that and then create * a C++ binding native argument class that has the pointer and a size. *--------------------------------------------------------------------*/ arguments_t proArgs = { p, elements, i, i }; native_arg_t proNargs(&proArgs, sizeof(proArgs)); QcpuOO->enqueueNativeKernel(cpu_produce, proNargs, 0, 0, &evt[eIdx][WMP], &evt[eIdx][PRD][0]); QdspOO->enqueueUnmapMemObject(buf, p, &evt[eIdx][PRD], &evt[eIdx][WUM][0]); QdspOO->enqueueTask(K, &evt[eIdx][WUM], &evt[eIdx][CMP][0]); p = (int*)QdspOO->enqueueMapBuffer(buf, CL_FALSE, CL_MAP_READ, 0, size, &evt[eIdx][CMP], &evt[eIdx][RMP][0]); arguments_t conArgs = { p, elements, i+1, i }; native_arg_t conNargs(&conArgs, sizeof(conArgs)); QcpuIO->enqueueNativeKernel (cpu_consume, conNargs, 0, 0, &evt[eIdx][RMP], &evt[eIdx][CNS][0]); QdspOO->enqueueUnmapMemObject (buf, p, &evt[eIdx][CNS], &evt[eIdx][RUM][0]); } /*------------------------------------------------------------------------ * Only need to wait for the CPU In Order queue to finish, since all all * other enqueue events must finish before the CPU IO queue can finish *-----------------------------------------------------------------------*/ // QcpuIO.finish(); QdspOO->finish(); delete QcpuIO; delete QcpuOO; delete QdspOO; clock_gettime(CLOCK_MONOTONIC, &tp_end); double elapsed = clock_diff (&tp_start, &tp_end); printf("Elapsed : %8.4f secs\n", elapsed); /*------------------------------------------------------------------------ * After the running is complete, report timing for each step *-----------------------------------------------------------------------*/ #if PROFILE cl_ulong ref; evt[0][0][0].getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &ref); for (int i = 0; i < tasks; ++i) { for (int s = 0; s < STAGES; ++s) ocl_relative_times(evt[i][s][0], stage_names[s], ref); cout << endl; } #endif } catch (Error err) { cerr << "ERROR: " << err.what() << "(" << ocl_decode_error(err.err()) << ")" << endl; incorrect_results = true; } if (incorrect_results) return -1; }
/** * Prepare and execute the OpenCL Kernel */ void executeCL(void) { Event wait; try { // Stage stageExecuteCL(); // Execute cl_int err = m_queue.enqueueNDRangeKernel(m_kernel, NullRange, m_global, m_local, NULL, &wait); clPrintErr(err, "Execute Error -> ", stdout); wait.wait(); m_queue.finish(); // Collate collateExecuteCL(); } catch(Error error) { std::cout << std::endl << error.what() << "(" << error.err() << ")" << std::endl; fail("OpenCL Error"); } }