// Advances `world` by `n` steps of size `dt`, running each step as the OpenCL
// kernel "kernel_xy" loaded from step_world_v3_kernel.cl.
//
// Every step performs a full host/device round trip: the current state is
// uploaded, the kernel is launched over a w-by-h range, and the result is read
// back and swapped into world.state. world.t is incremented by dt once per
// step.
//
// Environment variables (each parsed with atoi, defaulting to index 0):
//   HPCE_SELECT_PLATFORM - index into the list of OpenCL platforms
//   HPCE_SELECT_DEVICE   - index into the chosen platform's device list
//
// Throws std::runtime_error when no platform or no device is found. An index
// outside the list makes vector::at throw. If the program build throws, the
// build log of every device is written to stderr and the exception is
// rethrown.
void StepWorldV3OpenCL(world_t &world, float dt, unsigned n)
{
  // ---- Platform discovery and selection ----
  std::vector<cl::Platform> foundPlatforms;
  cl::Platform::get(&foundPlatforms);
  if (foundPlatforms.empty())
    throw std::runtime_error("No OpenCL platforms found.");

  std::cerr << "Found " << foundPlatforms.size() << " platforms\n";
  for (unsigned p = 0; p < foundPlatforms.size(); ++p) {
    const std::string vendorName =
        foundPlatforms[p].getInfo<CL_PLATFORM_VENDOR>();
    std::cerr << "  Platform " << p << " : " << vendorName << "\n";
  }

  const char *platformEnv = getenv("HPCE_SELECT_PLATFORM");
  int platformIndex = platformEnv ? atoi(platformEnv) : 0;
  std::cerr << "Choosing platform " << platformIndex << "\n";
  cl::Platform chosenPlatform = foundPlatforms.at(platformIndex);

  // ---- Device discovery and selection ----
  std::vector<cl::Device> foundDevices;
  chosenPlatform.getDevices(CL_DEVICE_TYPE_ALL, &foundDevices);
  if (foundDevices.empty()) {
    throw std::runtime_error("No opencl devices found.\n");
  }

  std::cerr << "Found " << foundDevices.size() << " devices\n";
  for (unsigned d = 0; d < foundDevices.size(); ++d) {
    const std::string deviceName = foundDevices[d].getInfo<CL_DEVICE_NAME>();
    std::cerr << "  Device " << d << " : " << deviceName << "\n";
  }

  const char *deviceEnv = getenv("HPCE_SELECT_DEVICE");
  int deviceIndex = deviceEnv ? atoi(deviceEnv) : 0;
  std::cerr << "Choosing device " << deviceIndex << "\n";
  cl::Device chosenDevice = foundDevices.at(deviceIndex);

  // ---- Context and program ----
  // The context (and the build below) covers every device of the platform,
  // not only the one the command queue will use.
  cl::Context clContext(foundDevices);

  std::string kernelText = LoadSource("step_world_v3_kernel.cl");

  // One (pointer, length) entry; the length includes the terminating NUL.
  cl::Program::Sources programSources;
  programSources.push_back(
      std::make_pair(kernelText.c_str(), kernelText.size() + 1));

  cl::Program clProgram(clContext, programSources);
  try {
    clProgram.build(foundDevices);
  }
  catch (...) {
    // Dump the per-device build logs before passing the failure on.
    for (unsigned d = 0; d < foundDevices.size(); ++d) {
      std::cerr << "Log for device " << foundDevices[d].getInfo<CL_DEVICE_NAME>()
                << ":\n\n";
      std::cerr << clProgram.getBuildInfo<CL_PROGRAM_BUILD_LOG>(foundDevices[d])
                << "\n\n";
    }
    throw;
  }

  // ---- Device buffers: 4 bytes per cell for each of the three arrays ----
  size_t bytesPerArray = 4 * world.w * world.h;
  cl::Buffer devProperties(clContext, CL_MEM_READ_ONLY, bytesPerArray);
  cl::Buffer devState(clContext, CL_MEM_READ_ONLY, bytesPerArray);
  cl::Buffer devResult(clContext, CL_MEM_WRITE_ONLY, bytesPerArray);

  cl::Kernel stepKernel(clProgram, "kernel_xy");

  // Coefficients handed to the kernel: `spread` is alpha scaled by the time
  // step, `retain` is what is left after a quarter of `spread` is removed.
  float spread = world.alpha * dt;
  float retain = 1.0f - spread / 4.0f;

  stepKernel.setArg(0, retain);
  stepKernel.setArg(1, spread);
  stepKernel.setArg(2, devState);
  stepKernel.setArg(3, devProperties);
  stepKernel.setArg(4, devResult);

  cl::CommandQueue cmdQueue(clContext, chosenDevice);

  // The properties are uploaded once, with a blocking write, before stepping.
  cmdQueue.enqueueWriteBuffer(devProperties, CL_TRUE, 0, bytesPerArray,
                              &world.properties[0]);

  unsigned width = world.w, height = world.h;
  // Host-side landing area for each step's result.
  std::vector<float> hostScratch(width * height);

  cl::NDRange launchOffset(0, 0);           // iteration starts at x=0, y=0
  cl::NDRange launchGlobal(width, height);  // one work-item per cell
  cl::NDRange launchLocal = cl::NullRange;  // local size left unspecified

  for (unsigned step = 0; step < n; ++step) {
    // 1. Non-blocking upload of the current state.
    cl::Event stateUploaded;
    cmdQueue.enqueueWriteBuffer(devState, CL_FALSE, 0, bytesPerArray,
                                &world.state[0], NULL, &stateUploaded);

    // 2. Kernel launch, gated on the upload.
    std::vector<cl::Event> waitForUpload(1, stateUploaded);
    cl::Event kernelFinished;
    cmdQueue.enqueueNDRangeKernel(stepKernel, launchOffset, launchGlobal,
                                  launchLocal, &waitForUpload,
                                  &kernelFinished);

    // 3. Blocking read-back, gated on the kernel.
    std::vector<cl::Event> waitForKernel(1, kernelFinished);
    cmdQueue.enqueueReadBuffer(devResult, CL_TRUE, 0, bytesPerArray,
                               &hostScratch[0], &waitForKernel);

    // 4. Exchange the vectors: world.state now holds the new values and the
    //    scratch vector is reused as the read target of the next step.
    std::swap(world.state, hostScratch);
    world.t += dt;  // the world has moved forwards in time
  }
}
// Advances `world` by `n` steps of size `dt` using the OpenCL kernel
// "kernel_xy" from step_world_v3_kernel.cl, keeping the state on the device
// for the whole run.
//
// The state is uploaded once before the loop and downloaded once after it.
// Between steps the two device buffer handles are swapped, so the output of
// one step becomes the input of the next without any host transfer.
// world.t is incremented by dt once per step.
//
// Environment variables (each parsed with atoi, defaulting to index 0):
//   HPCE_SELECT_PLATFORM - index into the list of OpenCL platforms
//   HPCE_SELECT_DEVICE   - index into the chosen platform's device list
//
// Throws std::runtime_error when no platform or no device is found. An index
// outside the list makes vector::at throw. Diagnostic output on stderr
// (platform/device listings, build logs on failure) is only produced when
// DEBUG is defined.
void StepWorldV4DoubleBuffered(world_t &world, float dt, unsigned n)
{
    // Get platforms
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    if (platforms.size() == 0)
        throw std::runtime_error("No OpenCL platforms found.");

#ifdef DEBUG
    std::cerr << "Found " << platforms.size() << " platforms\n";
    for (unsigned i = 0; i < platforms.size(); i++)
    {
        // Query platform i itself so each line reports its own vendor.
        std::string vendor = platforms[i].getInfo<CL_PLATFORM_VENDOR>();
        std::cerr << "  Platform " << i << " : " << vendor << "\n";
    }
#endif

    // Select platform (environment override, default 0)
    int selectedPlatform = 0;
    if (const char *platformEnv = getenv("HPCE_SELECT_PLATFORM"))
    {
        selectedPlatform = atoi(platformEnv);
    }

#ifdef DEBUG
    std::cerr << "Choosing platform " << selectedPlatform << "\n";
#endif

    cl::Platform platform = platforms.at(selectedPlatform);

    // Get devices
    std::vector<cl::Device> devices;
    platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
    if (devices.size() == 0)
    {
        throw std::runtime_error("No opencl devices found.\n");
    }

#ifdef DEBUG
    std::cerr << "Found " << devices.size() << " devices\n";
    for (unsigned i = 0; i < devices.size(); i++)
    {
        std::string name = devices[i].getInfo<CL_DEVICE_NAME>();
        std::cerr << "  Device " << i << " : " << name << "\n";
    }
#endif

    // Select device (environment override, default 0)
    int selectedDevice = 0;
    if (const char *deviceEnv = getenv("HPCE_SELECT_DEVICE"))
    {
        selectedDevice = atoi(deviceEnv);
    }
#ifdef DEBUG
    std::cerr << "Choosing device " << selectedDevice << "\n";
#endif

    cl::Device device = devices.at(selectedDevice);

    // Create context over every device of the platform
    cl::Context context(devices);

    // Load kernel source text
    std::string kernelSource = LoadSource("step_world_v3_kernel.cl");

    // A vector of (data,length) pairs; the length includes the terminating NUL
    cl::Program::Sources sources;
    sources.push_back(std::make_pair(kernelSource.c_str(), kernelSource.size() + 1));

    // Create program from context
    cl::Program program(context, sources);

#ifdef DEBUG
    try
    {
        program.build(devices);
    }
    catch (...)
    {
        // Dump the per-device build logs before passing the failure on.
        for (unsigned i = 0; i < devices.size(); i++)
        {
            std::cerr << "Log for device " << devices[i].getInfo<CL_DEVICE_NAME>() << ":\n\n";
            std::cerr << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[i]) << "\n\n";
        }
        throw;
    }
#else
    program.build(devices);
#endif

    // 4 bytes per cell. Widen to size_t before multiplying so the product is
    // not evaluated (and possibly wrapped) in a narrower integer type.
    const size_t cbBuffer = static_cast<size_t>(4) * world.w * world.h;
    cl::Buffer buffProperties(context, CL_MEM_READ_ONLY, cbBuffer);
    // Both state buffers are read-write because they alternate between being
    // the kernel's input and its output.
    cl::Buffer buffState(context, CL_MEM_READ_WRITE, cbBuffer);
    cl::Buffer buffBuffer(context, CL_MEM_READ_WRITE, cbBuffer);

    cl::Kernel kernel(program, "kernel_xy");

    unsigned w = world.w, h = world.h;

    float outer = world.alpha * dt; // We spread alpha to other cells per time
    float inner = 1 - outer / 4;    // Anything that doesn't spread stays

    // Arguments 0, 1 and 3 never change; 2 and 4 are rebound every step.
    kernel.setArg(0, inner);
    kernel.setArg(1, outer);
    kernel.setArg(3, buffProperties);

    cl::CommandQueue queue(context, device);

    // One-off blocking uploads of the properties and the initial state.
    queue.enqueueWriteBuffer(buffProperties, CL_TRUE, 0, cbBuffer, &world.properties[0]);
    queue.enqueueWriteBuffer(buffState, CL_TRUE, 0, cbBuffer, &world.state[0]);

    cl::NDRange offset(0, 0);              // Always start iterations at x=0, y=0
    cl::NDRange globalSize(w, h);          // Global size must match the original loops
    cl::NDRange localSize = cl::NullRange; // We don't care about local size

    for (unsigned t = 0; t < n; t++)
    {
        // Bind the current input (arg 2) and output (arg 4) buffers.
        kernel.setArg(2, buffState);
        kernel.setArg(4, buffBuffer);

        queue.enqueueNDRangeKernel(kernel, offset, globalSize, localSize);
        // Keep this step's kernel ordered before whatever is enqueued next.
        // NOTE(review): the queue is created with default properties, so this
        // barrier may be redundant - confirm before removing.
        queue.enqueueBarrier();

        // Swap the device buffer handles: the buffer just written becomes the
        // input of the next step. No data is copied.
        std::swap(buffState, buffBuffer);

        world.t += dt; // We have moved the world forwards in time
    }

    // After the final swap buffState refers to the most recent result (or the
    // untouched initial state when n == 0); copy it back with a blocking read.
    queue.enqueueReadBuffer(buffState, CL_TRUE, 0, cbBuffer, &world.state[0]);
}