int main(int argc, char ** argv) {
    // Load image
    SIPL::Image<float> * image = new SIPL::Image<float>("images/sunset.jpg");

    // Create OpenCL context
    Context context = createCLContextFromArguments(argc, argv);

    // Compile OpenCL code
    Program program = buildProgramFromSource(context, "gaussian_blur.cl");

    // Select device and create a command queue for it
    VECTOR_CLASS<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
    CommandQueue queue = CommandQueue(context, devices[0]);

    // Create an OpenCL Image / texture and transfer data to the device
    Image2D clImage = Image2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ImageFormat(CL_R, CL_FLOAT), image->getWidth(), image->getHeight(), 0, (void*)image->getData());

    // Create a buffer for the result
    Buffer clResult = Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float)*image->getWidth()*image->getHeight());

    // Create Gaussian mask
    int maskSize;
    float * mask = createBlurMask(10.0f, &maskSize);

    // Create buffer for mask and transfer it to the device
    Buffer clMask = Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)*(maskSize*2+1)*(maskSize*2+1), mask);

    // Run Gaussian kernel
    Kernel gaussianBlur = Kernel(program, "gaussian_blur");
    gaussianBlur.setArg(0, clImage);
    gaussianBlur.setArg(1, clMask);
    gaussianBlur.setArg(2, clResult);
    gaussianBlur.setArg(3, maskSize);

    queue.enqueueNDRangeKernel(
        gaussianBlur,
        NullRange,
        NDRange(image->getWidth(), image->getHeight()),
        NullRange
    );

    // Transfer image back to host
    float* data = new float[image->getWidth()*image->getHeight()];
    queue.enqueueReadBuffer(clResult, CL_TRUE, 0, sizeof(float)*image->getWidth()*image->getHeight(), data);
    image->setData(data);

    // Save image to disk
    image->save("images/result.jpg", "jpeg");
    image->display();
}
Exemple #2
0
/**
 * Runs the "hello" kernel from image.cl on the first device and prints the
 * text the kernel wrote into its output buffer.
 *
 * @param argc  unused
 * @param argv  unused
 * @return 0 once the output has been printed
 */
int main(int argc, char * argv[])
{
  const std::string hw("Hello WorldCL\n");

  // Bytes exchanged with the device: the reference text plus its NUL terminator.
  const size_t out_size = hw.length()+1;

  // Host-side destination for the kernel output. The read-back must never
  // target `hw` itself: `&hw` is the address of the std::string object, so
  // writing there overwrites the string's internals (undefined behaviour).
  std::string out_host(out_size, '\0');

  ImageCL i;
  i.create_cl_context();
  Buffer out_buffer;
  // NOTE(review): CL_MEM_USE_HOST_PTR normally requires a host pointer, but
  // create_cl_kernel_buffer receives none here - confirm against ImageCL.
  int opts = CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR;
  i.create_cl_kernel_buffer(out_buffer, opts, out_size);

  // device handles
  vector<Device> devices;
  devices = i.get_devices_list();

  // load kernel code
  Kernel k = i.load_cl_kernel_file("image.cl", "hello", out_buffer);
  CommandQueue q = i.run_kernel(k, devices[0], out_size); // blocks until finished

  // Blocking read of the device buffer into the host-side string storage.
  q.enqueueReadBuffer(out_buffer, CL_TRUE, 0, out_size, &out_host[0]);

  // Print up to the terminator written by the kernel.
  std::cout << out_host.c_str();
  return 0;
}
Exemple #3
0
int main(int argc, char **argv)
{
	srand((unsigned)time(NULL));

	Kernel kernel;
	CommandQueue queue;
	Context context;

	{
		std::vector<Platform> platformList;
		Platform::get(&platformList);

		clog << "Platform number is: " << platformList.size() << endl;

		std::string platformVendor;
		platformList[0].getInfo((cl_platform_info)CL_PLATFORM_VENDOR, &platformVendor);
		clog << "Platform is by: " << platformVendor << "\n";

		cl_context_properties cprops[] = {
			CL_CONTEXT_PLATFORM, (cl_context_properties) platformList[0](),
			0
		};
		context = Context(GET_TARGET_PLATFORM, cprops);

		std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
		queue = CommandQueue(context, devices[0]);

		std::string sourceCode = "#include \"es.cl\"\n";
		Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
		Program program = Program(context, source);

		try
		{
			program.build(devices, "-I.");
		}
		catch (Error &)
		{
			std::string errors;
			program.getBuildInfo(devices[0], CL_PROGRAM_BUILD_LOG, &errors);
			std::cerr << "Build log: " << endl << errors << endl;
			return 1;
		}

		kernel = Kernel(program, "es");
	}

	individual *individuals = new individual[LAMBDA];
	for (int i = 0; i < LAMBDA; i++)
	{
		for (int j = 0; j < DIM; ++j)
		{
			individuals[i].x[j] = (rand()/((float)RAND_MAX)) * (XMAX-XMIN) + XMIN;
			individuals[i].s[j] = (XMAX-XMIN) / 6.f;
		}
		for (int j = 0; j < DIM_A; ++j)
		{
			individuals[i].a[j] = (rand()/((float)RAND_MAX)) * (2*PI) - PI;
		}
		
		individuals[i].fitness = 0;
	}

	float gbest = std::numeric_limits<float>::infinity(), xbest[DIM];
	
	Buffer esBuffer = Buffer(context, 0, INDIVIDUALS_SIZE);
	Event ev;
	queue.enqueueMapBuffer(esBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, INDIVIDUALS_SIZE);
	
	for (int i = 0; i < 1000; i++)
	{
		queue.enqueueWriteBuffer(esBuffer, CL_TRUE, 0, INDIVIDUALS_SIZE, individuals);
		kernel.setArg(1, (cl_ulong)rand());
		kernel.setArg(0, esBuffer);
		queue.enqueueNDRangeKernel(kernel, NullRange, NDRange(LAMBDA), NDRange(1), NULL, &ev);
		ev.wait();
		queue.enqueueReadBuffer(esBuffer, CL_TRUE, 0, INDIVIDUALS_SIZE, individuals);
		
		std::sort(individuals, individuals + LAMBDA, individual_comp);
		individual mean = get_mean(individuals);
		for (int j = 0; j < LAMBDA; ++j)
		{
			individuals[j] = mean;
		}
	}
	gbest = individuals[0].fitness;
	for (int i = 0; i < DIM; ++i) xbest[i] = individuals[0].x[i];
	clog << "Best value " << gbest << " found at (";
	for (int i = 0; i < DIM; ++i) clog << xbest[i] << (i == DIM-1 ? ")" : ", ");
	clog << "\n";
	clog << "Our computation estemates it: f(" << xbest[0] << ", ..., " << xbest[DIM-1] << ") = " << es_f(xbest) << endl;

	delete[] individuals;

	return 0;
}
Exemple #4
0
/******************************************************************************
* main
******************************************************************************/
int main(int argc, char *argv[])
{
   /*-------------------------------------------------------------------------
   * Catch ctrl-c so we ensure that we call dtors and the dsp is reset properly
   *------------------------------------------------------------------------*/
   signal(SIGABRT, exit);
   signal(SIGTERM, exit);

   struct timespec tp_start, tp_end;

   try 
   {
     /*------------------------------------------------------------------------
     * One time OpenCL Setup
     *-----------------------------------------------------------------------*/
     Context             context(CL_DEVICE_TYPE_ALL); 
     std::vector<Device> devices(context.getInfo<CL_CONTEXT_DEVICES>());

     CommandQueue        *QcpuIO = NULL;
     CommandQueue        *QcpuOO = NULL;
     CommandQueue        *QdspOO = NULL;

     std::vector<Device> dspDevices;
     for (int d = 0; d < devices.size(); d++)
     {
	cl_device_type type;
	devices[d].getInfo(CL_DEVICE_TYPE, &type);

	if (type == CL_DEVICE_TYPE_CPU)
	{
	   QcpuIO = new CommandQueue(context, devices[d], PROFILE);
	   QcpuOO = new CommandQueue(context, devices[d], PROFILE|OOOEXEC);
	}
	else if (type == CL_DEVICE_TYPE_ACCELERATOR)
        {
	   QdspOO  = new CommandQueue(context, devices[d], PROFILE|OOOEXEC);
           dspDevices.push_back(devices[d]);
        }
     }

     if (QcpuIO == NULL)
     {
	std::cout << 
	"CPU devices are not fully supported in the current" << std::endl <<
	"OpenCL implementation (native kernel support only)." << std::endl << 
	"As a result, CPU devices are not enabled by" << std::endl <<
	"default.  This example uses OpenCL CPU native" << std::endl <<
	"kernels and can be run with the CPU device enabled." << std::endl << 
        "To enable a CPU device define the environment variable" << std::endl <<
        "'TI_OCL_CPU_DEVICE_ENABLE' before running the example." << std::endl;
	 exit(-1);
     }

     assert(QdspOO != NULL);

     Program::Sources    source (1, std::make_pair(kernStr, strlen(kernStr)));
     Program             program(Program(context, source));

     program.build(dspDevices);
     Kernel K(program, "compute");

     /*------------------------------------------------------------------------
     * Define a Buffer for each possible in flight task
     *-----------------------------------------------------------------------*/
     std::vector<BufUP> bufs;
     for (int i = 0; i < inflight; ++i) 
         bufs.push_back(BufUP(new Buffer(context, CL_MEM_READ_WRITE, size)));

     /*------------------------------------------------------------------------
     * Define a 3-D vector of OpenCL Events.  1st dim is for the number of 
     * in flight tasks, the second dim is for the processing stages of a single
     * task.  The 3rd dim is an artifact of the c++ binding for event wait 
     * lists.  All enqueue API's take a wait list which is a vector<Event>*, and
     * they take an Event*.  All events in the wait list vector must complete,
     * before this event will execute.  The single event argument is for the 
     * event that will be set as a result of this enqueue. 
     *-----------------------------------------------------------------------*/
     vecVecVecEv evt(inflight, vecVecEv(STAGES, vecEv(1)));

     /*------------------------------------------------------------------------
     * Enqueue a dummy DSP kernel call to force the OpenCL lazy execution
     * to go ahead and compile the kernel and load it.  This will prevent the 
     * compile and load times from skewing the reported numbers.  This is not 
     * needed by the algorithm and is purely a tactic to get consistent numbers
     * from the the running of the bulk of this algorithm
     *-----------------------------------------------------------------------*/
     K.setArg(0, *bufs[0]);
     K.setArg(1, 0);
     QdspOO->enqueueTask(K);

     K.setArg(1, elements);
     clock_gettime(CLOCK_MONOTONIC, &tp_start);

     /*------------------------------------------------------------------------
     * Iterate for as many tasks as there are
     *-----------------------------------------------------------------------*/
     for (int i = 0; i < tasks; ++i)
     {
        /*---------------------------------------------------------------------
        * Variables to ensure that this iteration is using the correct circular
        * resources: i.e. buffers and arrays.
        *--------------------------------------------------------------------*/
        int     circIdx = i % inflight;
        Buffer &buf(*bufs[circIdx]);
        int    *ary(arys [circIdx]);
        Event  nullEv;

        /*---------------------------------------------------------------------
        * Native kernels are only passed a single pointer, so define a structure
        * that contains the actual arguments, populate that and then create 
        * a C++ binding native argument class that has the pointer and a size.
        *--------------------------------------------------------------------*/
        arguments_t proArgs = { ary, elements, i,   i };
        arguments_t conArgs = { ary, elements, i+1, i };
        native_arg_t proNargs(&proArgs, sizeof(proArgs));
        native_arg_t conNargs(&conArgs, sizeof(conArgs));

        K.setArg(0, buf);

        /*---------------------------------------------------------------------
        * Since we are reusing N sets of buffers in this loop, we need to make
        * sure than iteration I does not start until after iteration I-N 
        * completes. Iterations < N can start immediately.
        *--------------------------------------------------------------------*/
        vecEv *start_waits = (i < inflight) ? 0 : &evt[circIdx][CNS];

        evt[circIdx][PRD][0] = nullEv;
        evt[circIdx][WRT][0] = nullEv;
        evt[circIdx][CMP][0] = nullEv;
        evt[circIdx][RD ][0] = nullEv;

        QcpuOO->enqueueNativeKernel(cpu_produce, proNargs, 0, 0,  
                start_waits,        &evt[circIdx][PRD][0]);

        evt[circIdx][CNS][0] = nullEv;

        QdspOO->enqueueWriteBuffer (buf, CL_FALSE, 0, size, ary,  
                &evt[circIdx][PRD], &evt[circIdx][WRT][0]);
        QdspOO->enqueueTask        (K,                            
                &evt[circIdx][WRT], &evt[circIdx][CMP][0]);
        QdspOO->enqueueReadBuffer  (buf, CL_FALSE, 0, size, ary,  
                &evt[circIdx][CMP], &evt[circIdx][RD ][0]);
        QcpuIO->enqueueNativeKernel(cpu_consume, conNargs, 0, 0,  
                &evt[circIdx][RD ], &evt[circIdx][CNS][0]);
     }

     /*------------------------------------------------------------------------
     * Only need to wait for the CPU In Order queue to finish, since all all
     * other enqueue events must finish before the CPU IO queue can finish
     *-----------------------------------------------------------------------*/
     QcpuIO->finish();

     delete QcpuIO;
     delete QcpuOO;
     delete QdspOO;

     clock_gettime(CLOCK_MONOTONIC, &tp_end);
     double elapsed = clock_diff (&tp_start, &tp_end);
     printf("Elapsed : %8.6f secs\n", elapsed);

     /*------------------------------------------------------------------------
     * After the running is complete, report timing for each step
     *-----------------------------------------------------------------------*/
#if PROFILE 
     cl_ulong ref;
     evt[0][0][0].getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &ref);

     for (int i = 0; i < inflight; ++i)
     {
          for (int s = 0; s < STAGES; ++s)
              ocl_relative_times(evt[i][s][0], stage_names[s], ref);
          cout << endl;
     }
#endif
   }

   catch (Error err)
   {
       cerr << "ERROR: " << err.what() << "("
            << ocl_decode_error(err.err()) << ")"
            << endl;
       incorrect_results = true;
   }

   if (incorrect_results) return -1;
}