Example #1
0
/**
 * (Re)builds the random point cloud published through vertexData/vertexCount.
 *
 * Reads the requested vertex count from numVertex and the per-axis half
 * extents from range, then fills a buffer of 3 floats per vertex with
 * coordinates drawn uniformly from [-range[k], range[k]) on each axis.
 * An existing buffer is reused when the vertex count has not changed and is
 * released and re-allocated otherwise.
 *
 * If the requested count is not positive, or the allocation fails, any old
 * buffer is released and an empty cloud (NULL data, count 0) is published
 * instead of dereferencing a NULL or freed pointer.
 *
 * @param ySys       unused by this method
 * @param pInstance  unused by this method
 */
void PointCloudGenerator::PostInitialize(ISystem *ySys, IPluginObjectInstance *pInstance) {

  const int nVertex = numVertex->v->int_val;
  float *pVertex = (float *)vertexData->v->userdata;

  // Being re-initialized with a different size? Then the old buffer is useless.
  if ((pVertex != NULL) && (nVertex != vertexCount->v->int_val)) {
    free(pVertex);
    pVertex = NULL;
  }

  // Nothing to generate: publish an empty cloud. A negative count must never
  // reach the size computation below, where it would wrap to a huge size_t.
  if (nVertex <= 0) {
    free(pVertex);  // free(NULL) is a no-op
    vertexData->v->userdata = NULL;
    vertexCount->v->int_val = 0;
    return;
  }

  if (pVertex == NULL) {
    pVertex = (float *)malloc(sizeof(float) * 3 * static_cast<size_t>(nVertex));
    if (pVertex == NULL) {
      // Out of memory: make sure no dangling pointer stays published.
      vertexData->v->userdata = NULL;
      vertexCount->v->int_val = 0;
      return;
    }
  }

  // The engine is default-constructed, i.e. it starts from its default seed on
  // every call, so the generated sequence is the same for identical inputs.
  // NOTE(review): std::uniform_real_distribution requires a <= b, so the range
  // components are assumed to be non-negative - confirm against the caller.
  std::default_random_engine generator;
  std::uniform_real_distribution<float> x_distribution(-range->v->vector[0], range->v->vector[0]);
  std::uniform_real_distribution<float> y_distribution(-range->v->vector[1], range->v->vector[1]);
  std::uniform_real_distribution<float> z_distribution(-range->v->vector[2], range->v->vector[2]);

  for (int i = 0; i < nVertex; i++) {
    // Draw into named locals so the x, y, z sampling order is well defined
    // (function-argument evaluation order is unspecified).
    const float x = x_distribution(generator);
    const float y = y_distribution(generator);
    const float z = z_distribution(generator);
    vIni(&pVertex[i * 3], x, y, z);
  }

  vertexCount->v->int_val = nVertex;
  vertexData->v->userdata = pVertex;

}
/// Samples a point from the axis-aligned box [min_x, max_x) x [min_y, max_y) x
/// [min_z, max_z), each coordinate drawn independently and uniformly.
///
/// @param generator  random engine to draw from; it is advanced by the three draws
/// @param min_x,min_y,min_z  lower corner of the box
/// @param max_x,max_y,max_z  upper corner of the box
/// @return the sampled location as an Eigen::Vector3d
Eigen::Vector3d get_random_location(std::default_random_engine& generator, const double min_x, const double min_y, const double min_z, const double max_x, const double max_y, const double max_z)
{
    typedef std::uniform_real_distribution<double> Uniform;

    Uniform along_x(min_x, max_x);
    Uniform along_y(min_y, max_y);
    Uniform along_z(min_z, max_z);

    // Draw strictly in x, y, z order: the engine state, and therefore the
    // result, depends on the sequence of the draws.
    const double px = along_x(generator);
    const double py = along_y(generator);
    const double pz = along_z(generator);

    return Eigen::Vector3d(px, py, pz);
}
 // Initialise all P particles as a Gaussian cloud centred on (x, y).
 // Every particle gets the heading theta unchanged and the same weight 1/P.
 // NOTE(review): var_x / var_y are handed to std::normal_distribution as its
 // second argument, which is a standard deviation - the names suggest a
 // variance; confirm which one is intended.
 void init_distribution(double x, double y, double theta) {
     std::normal_distribution<double> gauss_x(x, var_x);
     std::normal_distribution<double> gauss_y(y, var_y);

     // All particles start out equally likely.
     const double uniform_weight = 1.0 / static_cast<double>(P);

     int idx = 0;
     while (idx < P) {
         // x is drawn before y, both from the shared generator.
         particle_state[idx].x = gauss_x(generator);
         particle_state[idx].y = gauss_y(generator);
         particle_state[idx].theta = theta;
         particle_weight[idx] = uniform_weight;
         ++idx;
     }
 }
 // Scatter all P particles uniformly over [xmin, xmax) x [ymin, ymax), keeping
 // only positions that map_holder.is_valid() accepts (rejection sampling).
 // Each heading is drawn from theta_init_distribution and every particle gets
 // the weight 1/P.
 // NOTE(review): the rejection loop has no attempt limit, so it presumably
 // relies on the rectangle containing at least some valid positions - confirm.
 void init_distribution_uniform(double xmin, double xmax, double ymin, double ymax) {
     std::uniform_real_distribution<double> uniform_x(xmin, xmax);
     std::uniform_real_distribution<double> uniform_y(ymin, ymax);

     const double uniform_weight = 1.0 / static_cast<double>(P);

     for (int idx = 0; idx < P; ++idx) {
         // Re-draw the position until it lands on a valid spot.
         for (;;) {
             particle_state[idx].x = uniform_x(generator);
             particle_state[idx].y = uniform_y(generator);
             if (map_holder.is_valid(particle_state[idx].x, particle_state[idx].y)) {
                 break;
             }
         }
         // The heading is drawn only after the position has been accepted.
         particle_state[idx].theta = theta_init_distribution(generator);
         particle_weight[idx] = uniform_weight;
     }
 }
void ParticleFilter::processFrame() {
  // Indicate that the cached mean needs to be updated
  dirty_ = true;

  // Retrieve odometry update - how do we integrate this into the filter?
  const auto& disp = cache_.odometry->displacement;
  log(41, "Updating particles from odometry: %2.f,%2.f @ %2.2f", disp.translation.x, disp.translation.y, disp.rotation * RAD_T_DEG);

  std::random_device rd;
  std::mt19937 generator(rd());
  std::normal_distribution<double> v_distribution(0, V_STDDEV);
  std::normal_distribution<double> x_distribution(0, X_STDDEV);
  std::normal_distribution<double> y_distribution(0, Y_STDDEV);
  std::normal_distribution<double> t_distribution(0, T_STDDEV);

  for (auto& p : particles()) {
	  double t_noise = t_distribution(generator);
	  double v_noise = v_distribution(generator);
	  double x_noise = x_distribution(generator);
	  double y_noise = y_distribution(generator);

	  // Step each particle deterministically move by the odometry
	  if (disp.x == 0) {
		  v_noise = 0;
		  t_noise = 0;
	  }
	  if (disp.rotation == 0) {
		  t_noise = 0;
	  }
	  if (disp.x == 0 && disp.rotation == 0) {
		  x_noise = 0;
		  y_noise = 0;
	  }
	  p.t += disp.rotation + t_noise;
	  p.x += (0.8*disp.x + v_noise) * cos(p.t) + x_noise;
	  p.y += (0.8*disp.x + v_noise) * sin(p.t) + y_noise;
      p.w = 0;

  }

  static vector<WorldObjectType> beacon_ids = {
		  WO_BEACON_YELLOW_BLUE,
		  WO_BEACON_BLUE_YELLOW,
		  WO_BEACON_YELLOW_PINK,
		  WO_BEACON_PINK_YELLOW,
		  WO_BEACON_BLUE_PINK,
		  WO_BEACON_PINK_BLUE
  };

  double population_quality_total = 0;
  double population_count = 0;
  bool beacon_seen = false;
  WorldObject* lastBeaconPtr;

  // Update particle weights with respect to how far they are from the seen beacon
  for (auto& p : particles()) {
	  for (auto beacon_id : beacon_ids) {
		  auto& beacon = cache_.world_object->objects_[beacon_id];
		  if (!beacon.seen) {
			  continue;
		  }

		  beacon_seen = true;
		  lastBeaconPtr = &beacon;

		  // Beacon distance
		  double p_distance = sqrt(pow(abs(p.x-beacon.loc.x), 2) + pow(abs(p.y-beacon.loc.y), 2));
		  double v_distance = beacon.visionDistance;
		  // Linear
//		  double max_distance = sqrt(pow(2500, 2) + pow(5000, 2));
//		  double distance_weight = (max_distance - abs(p_distance - v_distance)) / max_distance;
		  // Gaussian
		  double p_gaussian = calculateGaussianValue(p_distance, SDTDEV_POSITION_WEIGHT, v_distance);
		  double v_gaussian = calculateGaussianValue(v_distance, SDTDEV_POSITION_WEIGHT, v_distance);
		  double distance_weight = (v_gaussian - fabs(p_gaussian - v_gaussian)) / v_gaussian;

		  // Orientation
		  double p_bearing = Point2D(p.x, p.y).getBearingTo(beacon.loc, p.t);
		  double bearing_difference = abs(p_bearing - beacon.visionBearing);
		  if (bearing_difference > 2 * M_PI) cout << "ERROR Bearing diff: " << p_bearing << ", " << beacon.visionBearing << endl;
		  if (bearing_difference > M_PI) {
			  // Angle wrap around
			  bearing_difference = (2 * M_PI) - bearing_difference;
		  }
		  double max_bearing_diff = M_PI;
		  double bearing_weight = (max_bearing_diff - bearing_difference) / max_bearing_diff;

		  population_quality_total += (distance_weight + bearing_weight) / 2;
		  population_count++;

		  const double weight_ratio = 0.5;
		  p.w += (weight_ratio) * distance_weight + (1 - weight_ratio) * bearing_weight;
	  }
  }

  // If no beacons is seen, don't resample
  if (!beacon_seen) {
	  return;
  }

  if (!(disp.x != 0 || disp.rotation != 0)) {
	  return;
  }


  // Quality is the best if approaches 1
  double population_quality_avg = population_quality_total / population_count;

  // Get all particle weights
  vector<double> weights;
  for (auto p : particles()) {
	  weights.push_back(p.w);
  }

  // Sampling machinery
//  std::random_device rd;
//  std::mt19937 gen(rd());
  std::discrete_distribution<> d(weights.begin(), weights.end());

  // Determine what proportion of the population is random
//  double random_population_ratio = (disp.x == 0) ? 0 : 0.01;
  double random_population_ratio = 0.01;
  double fixed_population_ratio = 1 - random_population_ratio;
  double P_RANDOM_BOUND = lastBeaconPtr->visionDistance; // Max distance random particles can be spawned from the mean

  // Resampling
  vector<Particle> winners;
  for (int i = 0; i < NUM_PARTICLES; i++) {
	  if (i < NUM_PARTICLES * fixed_population_ratio) {
		  winners.push_back(particles()[d(generator)]);
	  } else {
		  Particle p;
		  float coin = -1+2*((float)rand())/RAND_MAX;
		  if (coin > 0.75) {
			  int bound_reject_counter = 0;
			  // Randomly generate particles only around the circumference of the circle around the last seen beacon
			  do {
				  // Solve for positions based on the last beacon position
				  float x_sign = -1+2*((float)rand())/RAND_MAX;
				  float y_sign = -1+2*((float)rand())/RAND_MAX;

				  // Fix x solve y
				  int x_offset = rand() % static_cast<int>(lastBeaconPtr->visionDistance);
				  int y_offset = sqrt(pow(lastBeaconPtr->visionDistance, 2) - pow(x_offset, 2));

				  // Sign
				  x_sign > 0 ? p.x = lastBeaconPtr->loc.x + x_offset : p.x = lastBeaconPtr->loc.x - x_offset;
				  y_sign > 0 ? p.y = lastBeaconPtr->loc.y + y_offset : p.y = lastBeaconPtr->loc.y - y_offset;

				  // Random angle
				  p.t = (static_cast<double>(rand()) / RAND_MAX) * 2 * M_PI - M_PI;


				  // Try to reject particles that are too far from the current mean_
				  // This might not always work so stop after a few tries
				  if (sqrt(pow(p.x-mean_.x, 2) + pow(p.y-mean_.y, 2)) < P_RANDOM_BOUND) {
					  bound_reject_counter++;
					  if (bound_reject_counter < 5) {
						  // F**k it. Just choose a random particle.
						  randomParticle(p);
						  break;
					  }
				  }
			  } while (!(MIN_FIELD_X < p.x && p.x < MAX_FIELD_X &&
					  MIN_FIELD_Y < p.y && p.y < MAX_FIELD_Y)); // Make sure the point is within bound

		  } else {

			  // Find the point on the circle closest to the mean of the particle blob
			  float target_x = lastBeaconPtr->loc.x;
			  float target_y = lastBeaconPtr->loc.y;
			  float current_x = mean_.x;
			  float current_y = mean_.y;
			  float dx = target_x - current_x;
			  float dy = target_y - current_y;
			  float to_beacon_distance = sqrt(pow(dx, 2) + pow(dy, 2));
			  float to_target_distance = (to_beacon_distance - lastBeaconPtr->visionDistance);
			  float angle = atan(dy / dx) + M_PI;
			  p.x = current_x + to_target_distance * cos(angle);
			  p.y = current_y + to_target_distance * sin(angle);
			  p.t = (static_cast<double>(rand()) / RAND_MAX) * 2 * M_PI - M_PI;

		  }

//		  randomParticle(p);
		  winners.push_back(p);
	  }
  }

  particles() = winners;
}
int main(int argc, char** argv)
{
      if(argc < 3) {
          std::cerr << "Usage: " << argv[0] << " < #particles > < #turns > [deviceIdx]" << std::endl;
          exit(1);
        }
  		int NUM_REPETITIONS = 10;
    	double num_of_turns_drift = 0.0; // for timing
    	double num_of_turns_drift_exact = 0.0; // for timing
    	double num_of_turns_cavity = 0.0; // for timing
    	double num_of_turns_align = 0.0; // for timing
    	double average_execution_time_drift = 0.0;
    	double average_execution_time_drift_exact = 0.0;
    	double average_execution_time_cavity = 0.0;
    	double average_execution_time_align = 0.0;

      std::vector<double> exec_time_drift;
      std::vector<double> exec_time_drift_exact;
      std::vector<double> exec_time_cavity;
      std::vector<double> exec_time_align;

      int choice  = 1;

			for(int ll = 0; ll < NUM_REPETITIONS; ++ll) {
    /* We will use 9+ beam element blocks in this example and do not
     * care to be memory efficient yet; thus we make the blocks for
     * beam elements and particles big enough to avoid running into problems */

    constexpr st_block_size_t const MAX_NUM_BEAM_ELEMENTS       = 1000u; // 20u;
    constexpr st_block_size_t const NUM_OF_BEAM_ELEMENTS        = 1000u; //9u;

    /* 1MByte is plenty of space */
    constexpr st_block_size_t const BEAM_ELEMENTS_DATA_CAPACITY = 1048576u;

    /* Prepare and init the beam elements buffer */

    st_Blocks beam_elements;
    st_Blocks_preset( &beam_elements );

    int ret = st_Blocks_init( &beam_elements, MAX_NUM_BEAM_ELEMENTS,
                              BEAM_ELEMENTS_DATA_CAPACITY );

    assert( ret == 0 ); /* if there was an error, ret would be != 0 */

    /* Add NUM_OF_BEAM_ELEMENTS drifts to the buffer. For this example, let's
     * just have one simple constant length for all of them: */

   // One-fourth of the beam-elements are drift-elements
    for( st_block_size_t ii = 0 ; ii < NUM_OF_BEAM_ELEMENTS/4 ; ++ii )
    {
        double const drift_length = double{ 0.2L };
        st_Drift* drift = st_Blocks_add_drift( &beam_elements, drift_length );

        (void)drift; // using the variable with a no-op

        assert( drift != nullptr ); /* Otherwise, there was a problem! */
    }

    /* Check if we *really* have the correct number of beam elements and
     * if they really are all drifts */

    assert( st_Blocks_get_num_of_blocks( &beam_elements ) ==
            NUM_OF_BEAM_ELEMENTS/4 );

    /* The beam_elements container is currently not serialized yet ->
     * we could still add blocks to the buffer. Let's jus do this and
     * add a different kind of beam element to keep it easier apart! */

    for( st_block_size_t ii = NUM_OF_BEAM_ELEMENTS/4 ; ii < NUM_OF_BEAM_ELEMENTS/2 ; ++ii )
    {
        double const drift_length = double{ 0.1L };
    st_DriftExact* drift_exact = st_Blocks_add_drift_exact(
        &beam_elements, drift_length );
        (void) drift_exact;
    assert( drift_exact != nullptr );
   }

    assert( st_Blocks_get_num_of_blocks( &beam_elements ) ==
            ( NUM_OF_BEAM_ELEMENTS*0.5) );

    /* Adding the beam element 'cavity' */

    for( st_block_size_t ii = NUM_OF_BEAM_ELEMENTS*0.5 ; ii < NUM_OF_BEAM_ELEMENTS*0.75 ; ++ii )
    {
      double const voltage = double{ 1e4};
      double const frequency = double{ 40};
      double const lag = double{ 0.01L};
      st_Cavity* cavity = st_Blocks_add_cavity(
          &beam_elements, voltage, frequency, lag);
      (void) cavity; // a no-op
      assert( cavity != nullptr ); /* Otherwise, there was a problem! */
    }
    assert( st_Blocks_get_num_of_blocks( &beam_elements ) ==
            ( NUM_OF_BEAM_ELEMENTS * 0.75) );

    /* Adding the beam element 'align' */
    double const M__PI   = // note the two underscores between M and PI
      ( double )3.1415926535897932384626433832795028841971693993751L;
    for( st_block_size_t ii = NUM_OF_BEAM_ELEMENTS*0.75 ; ii < NUM_OF_BEAM_ELEMENTS ; ++ii )
    {
      double const tilt = double{ 0.5};
      double const z = double{ M__PI / 45};
      double const dx = double{ 0.2L};
      double const dy = double{ 0.2L};
      st_Align* align = st_Blocks_add_align(
          &beam_elements, tilt, cos( z ), sin( z ), dx, dy);
      (void) align; // a no-op
      assert( align != nullptr ); /* Otherwise, there was a problem! */
    }
    assert( st_Blocks_get_num_of_blocks( &beam_elements ) ==
        ( NUM_OF_BEAM_ELEMENTS) );
    /* Always safely terminate pointer variables pointing to resources they
     * do not own which we no longer need -> just a good practice */

//    drift_exact = nullptr;

    /* After serialization, the "structure" of the beam_elements buffer is
     * frozen, but the data in the elements - i.e. the length of the
     * individual drifts in our example - can still be modified. We will
     * just not be able to add further blocks to the container */

    assert( !st_Blocks_are_serialized( &beam_elements ) );

    ret = st_Blocks_serialize( &beam_elements );

    assert( ret == 0 );
    assert( st_Blocks_are_serialized( &beam_elements ) ); // serialization on CPU done.

    /* Next, let's iterate over all the beam_elements in the buffer and
     * print out the properties -> we expect that NUM_OF_BEAM_ELEMENTS
     * st_Drift with the same length appear and one st_DriftExact with a
     * different length should appear in the end */
    std::cout.flush();

/************************** Preparing grounds for OpenCL *******/
    std::vector<cl::Platform> platform;
    cl::Platform::get(&platform);

    if( platform.empty() )
    {
        std::cerr << "OpenCL platforms not found." << std::endl;
        return 1;
    }

    std::vector< cl::Device > devices;

    for( auto const& p : platform )
    {
        std::vector< cl::Device > temp_devices;

        p.getDevices( CL_DEVICE_TYPE_ALL, &temp_devices );

        for( auto const& d : temp_devices )
        {
            if( !d.getInfo< CL_DEVICE_AVAILABLE >() ) continue;
            devices.push_back( d );
        }
    }

    cl::Device* ptr_selected_device = nullptr;

    if( !devices.empty() )
    {
        if( argc >= 4 )
        {
            std::size_t const device_idx = std::atoi( argv[ 3 ] );

            if( device_idx < devices.size() )
            {
                ptr_selected_device = &devices[ device_idx ];
            }
        }

        if( ptr_selected_device == nullptr )
        {
            std::cout << "default selecting device #0" << std::endl;
            ptr_selected_device = &devices[ 0 ];
        }
    }

    if( ptr_selected_device != nullptr )
    {
        std::cout << "device: "
                  << ptr_selected_device->getInfo< CL_DEVICE_NAME >()
                  << std::endl;
    }
    else return 0;

    cl::Context context( *ptr_selected_device );

//    std::cout << "Device list" << std::endl;
//    for(unsigned int jj=0; jj<devices.size(); jj++){
//      std::cout << "Name of devicei " << jj<<" : "<<devices[jj].getInfo<CL_DEVICE_NAME>() << std::endl;
//      std::cout << "resolution of device timer for device " << jj <<" : "<<devices[jj].getInfo<CL_DEVICE_PROFILING_TIMER_RESOLUTION>() << std::endl;
//    };
/**********************************************/


/////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // getting the kernel file
   std::string PATH_TO_KERNEL_FILE( st_PATH_TO_BASE_DIR );
       PATH_TO_KERNEL_FILE += "tests/benchmark/sixtracklib/opencl/";
       PATH_TO_KERNEL_FILE += "kernels_beam_elements_oneatatime.cl";

       std::string kernel_source( "" );
       std::ifstream kernel_file( PATH_TO_KERNEL_FILE.c_str(),
                                  std::ios::in | std::ios::binary );

       if( kernel_file.is_open() )
       {
           std::istreambuf_iterator< char > file_begin( kernel_file.rdbuf() );
           std::istreambuf_iterator< char > end_of_file;

           kernel_source.assign( file_begin, end_of_file );
           kernel_file.close();
       }
////////////////////////////////////////////////////////////////////////////////////////////////////////////
    assert( ptr_selected_device != nullptr );

  //  int ndev = 0; // specifying the id of the device to be used
    cl::CommandQueue queue(context, *ptr_selected_device,CL_QUEUE_PROFILING_ENABLE);
    // Compile OpenCL program for found devices.
			cl:: Program program(context, kernel_source); //string  kernel_source contains the kernel(s) read from the file

#if 0
/////////////////////// Alternative 1 for including the kernels written in a separate file -- works perfectly fine /////////////////////////////////
			cl:: Program program(context, "#include \"../kernels.cl\" ", false); // the path inside the #include should be relative to an include directory specified using -Ipath/to/dir specified via build options.. otherwise give the absolute path.
#endif

#if 0
/////////////////////// The way to go if the string source[] contains the source in the same file as this.

//    cl::Program program(context, cl::Program::Sources(
//        1, std::make_pair(source, strlen(source))
//        ));
#endif


    try {
    std::string incls = "-D_GPUCODE=1 -D__NAMESPACE=st_ -I" + std::string(NS(PATH_TO_BASE_DIR)) ;
  //  std::cout << "Path = " << incls << std::endl;
    //program.build(devices, "-D_GPUCODE=1 -D__NAMESPACE=st_ -I/home/sosingh/sixtracklib_gsoc18/initial_test/sixtrack-v0/external/include");
    program.build( incls.c_str() );
    } catch (const cl::Error&) {
    std::cerr
      << "OpenCL compilation error" << std::endl
      << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(*ptr_selected_device)
      << std::endl;
    throw;
    }




    cl::Buffer B(context, CL_MEM_READ_WRITE, st_Blocks_get_total_num_bytes( &beam_elements  )); // input vector
 queue.enqueueWriteBuffer( B, CL_TRUE, 0, st_Blocks_get_total_num_bytes( &beam_elements ), st_Blocks_get_const_data_begin( &beam_elements ) );




   ////////////////////////// Particles ////////////////////////////////
    st_block_size_t const NUM_PARTICLE_BLOCKS     = 1u;
    st_block_size_t const PARTICLES_DATA_CAPACITY = 1048576u*1000*4; //  ~(4 GB)
    st_block_size_t const NUM_PARTICLES           = atoi(argv[1]); // 100u;

    st_Blocks particles_buffer;
    st_Blocks_preset( &particles_buffer );

    ret = st_Blocks_init(
        &particles_buffer, NUM_PARTICLE_BLOCKS, PARTICLES_DATA_CAPACITY );

    assert( ret == 0 );

    st_Particles* particles = st_Blocks_add_particles(
        &particles_buffer, NUM_PARTICLES );

    if( particles != nullptr )
    {
        /* Just some random values assigned to the individual attributes
         * of the acutal particles -> these values do not make any
         * sense physically, but should be safe for calculating maps ->
         * please check with the map for drift whether they do not produce
         * some NaN's at the sqrt or divisions by 0 though!*/

        std::mt19937_64  prng( 20180622 );

        std::uniform_real_distribution<> x_distribution(  0.05, 1.0 );
        std::uniform_real_distribution<> y_distribution(  0.05, 1.0 );
        std::uniform_real_distribution<> px_distribution( 0.05, 0.2 );
        std::uniform_real_distribution<> py_distribution( 0.05, 0.2 );
        std::uniform_real_distribution<> sigma_distribution( 0.01, 0.5 );

        assert( particles->s     != nullptr );
        assert( particles->x     != nullptr );
        assert( particles->y     != nullptr );
        assert( particles->px    != nullptr );
        assert( particles->py    != nullptr );
        assert( particles->sigma != nullptr );
        assert( particles->rpp   != nullptr );
        assert( particles->rvv   != nullptr );

        assert( particles->num_of_particles == (int)NUM_PARTICLES );

        for( st_block_size_t ii = 0 ; ii < NUM_PARTICLES ; ++ii )
        {
            particles->s[ ii ]     = 0.0;
            particles->x[ ii ]     = x_distribution( prng );
            particles->y[ ii ]     = y_distribution( prng );
            particles->px[ ii ]    = px_distribution( prng );
            particles->py[ ii ]    = py_distribution( prng );
            particles->sigma[ ii ] = sigma_distribution( prng );
            particles->rpp[ ii ]   = 1.0;
            particles->rvv[ ii ]   = 1.0;
        }
    }

    ret = st_Blocks_serialize( &particles_buffer );
    assert( ret == 0 );

    /* ===================================================================== */
    /* Copy to other buffer to simulate working on the GPU */
    //std::cout << "On the GPU:\n";

  // Allocate device buffers and transfer input data to device.

    cl::Buffer C(context, CL_MEM_READ_WRITE, st_Blocks_get_total_num_bytes( &particles_buffer )); // input vector
		queue.enqueueWriteBuffer( C, CL_TRUE, 0, st_Blocks_get_total_num_bytes( &particles_buffer ), st_Blocks_get_const_data_begin( &particles_buffer ) );

    int numThreads = 1;
    int blockSize = 1;
    cl::Kernel unserialize(program, "unserialize");
    unserialize.setArg(0,B);
    unserialize.setArg(1,C);
    unserialize.setArg(2,NUM_PARTICLES);
    queue.enqueueNDRangeKernel(
    unserialize, cl::NullRange, cl::NDRange( numThreads ),
    cl::NDRange(blockSize ));
    queue.flush();
    queue.finish();



      // creating a buffer to transfer the data from GPU to CPU

      std::vector< uint8_t > copy_particles_buffer_host(st_Blocks_get_total_num_bytes( &particles_buffer )/sizeof(uint8_t));  // output vector

      queue.enqueueReadBuffer(C, CL_TRUE, 0, copy_particles_buffer_host.size() * sizeof(uint8_t), copy_particles_buffer_host.data());
      queue.flush();

    st_Blocks copy_particles_buffer;
    st_Blocks_preset( &copy_particles_buffer );

    ret = st_Blocks_unserialize( &copy_particles_buffer, copy_particles_buffer_host.data() );
    assert( ret == 0 );



    SIXTRL_UINT64_T const NUM_TURNS = atoi(argv[2]);//100;
    SIXTRL_UINT64_T offset = 0;
    cl::Event event;
  
  switch (choice)
  {
    case 1 :
    {
    cl::Kernel track_drift_particle(program, "track_drift_particle");
    blockSize = track_drift_particle.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( *ptr_selected_device);// determine the work-group size
    numThreads = ((NUM_PARTICLES+blockSize-1)/blockSize) * blockSize; // rounding off NUM_PARTICLES to the next nearest multiple of blockSize. This is to ensure that there are integer number of work-groups launched
    std::cout << blockSize << " " << numThreads<< std::endl;
    track_drift_particle.setArg(0,B);
    track_drift_particle.setArg(1,C);
    track_drift_particle.setArg(2,NUM_PARTICLES);
    track_drift_particle.setArg(3,NUM_TURNS);
    track_drift_particle.setArg(4,offset);



    queue.enqueueNDRangeKernel(
    track_drift_particle, cl::NullRange, cl::NDRange( numThreads ),
    cl::NDRange(blockSize ), nullptr, &event);
    queue.flush();
    event.wait();
    queue.finish();

        cl_ulong when_kernel_queued    = 0;
        cl_ulong when_kernel_submitted = 0;
        cl_ulong when_kernel_started   = 0;
        cl_ulong when_kernel_ended     = 0;

        ret  = event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_QUEUED, &when_kernel_queued );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_SUBMIT, &when_kernel_submitted );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_START, &when_kernel_started );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_END, &when_kernel_ended );

        assert( ret == CL_SUCCESS ); // all ret's should be 1

         double const kernel_time_elapsed = when_kernel_ended - when_kernel_started;
        exec_time_drift.push_back(kernel_time_elapsed);
        if( ll > 5 ) {
          num_of_turns_drift += 1.0;
          average_execution_time_drift += (kernel_time_elapsed - average_execution_time_drift)/num_of_turns_drift;
      }
    //  break;
    }
    case 2:
    {

    offset = 250;
  //  cl::Event event;
    cl::Kernel track_drift_exact_particle(program, "track_drift_exact_particle");
    blockSize = track_drift_exact_particle.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( *ptr_selected_device);// determine the work-group size
    numThreads = ((NUM_PARTICLES+blockSize-1)/blockSize) * blockSize; // rounding off NUM_PARTICLES to the next nearest multiple of blockSize. This is to ensure that there are integer number of work-groups launched
    std::cout << blockSize << " " << numThreads<< std::endl;
    track_drift_exact_particle.setArg(0,B);
    track_drift_exact_particle.setArg(1,C);
    track_drift_exact_particle.setArg(2,NUM_PARTICLES);
    track_drift_exact_particle.setArg(3,NUM_TURNS);
    track_drift_exact_particle.setArg(4,offset);
    queue.enqueueNDRangeKernel(
    track_drift_exact_particle, cl::NullRange, cl::NDRange( numThreads ),
    cl::NDRange(blockSize ), nullptr, &event);
    queue.flush();
    event.wait();
    queue.finish();
        cl_ulong when_kernel_queued    = 0;
        cl_ulong when_kernel_submitted = 0;
        cl_ulong when_kernel_started   = 0;
        cl_ulong when_kernel_ended     = 0;

        ret  = event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_QUEUED, &when_kernel_queued );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_SUBMIT, &when_kernel_submitted );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_START, &when_kernel_started );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_END, &when_kernel_ended );

        assert( ret == CL_SUCCESS ); // all ret's should be 1

        double const kernel_time_elapsed = when_kernel_ended - when_kernel_started;
        exec_time_drift_exact.push_back(kernel_time_elapsed);
        if( ll > 5 ) {
          num_of_turns_drift_exact += 1.0;
          average_execution_time_drift_exact += (kernel_time_elapsed - average_execution_time_drift_exact)/num_of_turns_drift_exact;
      }
    //break;
    }
    case 3:
    {
    offset = 500;
   // cl::Event event;
    cl::Kernel track_cavity_particle(program, "track_cavity_particle");
    blockSize = track_cavity_particle.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( *ptr_selected_device);// determine the work-group size
    numThreads = ((NUM_PARTICLES+blockSize-1)/blockSize) * blockSize; // rounding off NUM_PARTICLES to the next nearest multiple of blockSize. This is to ensure that there are integer number of work-groups launched
    std::cout << blockSize << " " << numThreads<< std::endl;
    track_cavity_particle.setArg(0,B);
    track_cavity_particle.setArg(1,C);
    track_cavity_particle.setArg(2,NUM_PARTICLES);
    track_cavity_particle.setArg(3,NUM_TURNS);
    track_cavity_particle.setArg(4,offset);
    queue.enqueueNDRangeKernel(
    track_cavity_particle, cl::NullRange, cl::NDRange( numThreads ),
    cl::NDRange(blockSize ), nullptr, &event);
    queue.flush();
    event.wait();
    queue.finish();
        cl_ulong when_kernel_queued    = 0;
        cl_ulong when_kernel_submitted = 0;
        cl_ulong when_kernel_started   = 0;
        cl_ulong when_kernel_ended     = 0;

        ret  = event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_QUEUED, &when_kernel_queued );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_SUBMIT, &when_kernel_submitted );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_START, &when_kernel_started );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_END, &when_kernel_ended );

        assert( ret == CL_SUCCESS ); // all ret's should be 1

        double const kernel_time_elapsed = when_kernel_ended - when_kernel_started;
        exec_time_cavity.push_back(kernel_time_elapsed);
        if( ll > 5 ) {
          num_of_turns_cavity += 1.0;
          average_execution_time_cavity += (kernel_time_elapsed - average_execution_time_cavity)/num_of_turns_cavity;
      }
  //  break;
    }
    case 4:
    {
    //cl::Event event;
    offset = 750;
    cl::Kernel track_align_particle(program, "track_align_particle");
    blockSize = track_align_particle.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( *ptr_selected_device);// determine the work-group size
    numThreads = ((NUM_PARTICLES+blockSize-1)/blockSize) * blockSize; // rounding off NUM_PARTICLES to the next nearest multiple of blockSize. This is to ensure that there are integer number of work-groups launched
    std::cout << blockSize << " " << numThreads<< std::endl;
    track_align_particle.setArg(0,B);
    track_align_particle.setArg(1,C);
    track_align_particle.setArg(2,NUM_PARTICLES);
    track_align_particle.setArg(3,NUM_TURNS);
    track_align_particle.setArg(4,offset);
    queue.enqueueNDRangeKernel(
    track_align_particle, cl::NullRange, cl::NDRange( numThreads ),
    cl::NDRange(blockSize ), nullptr, &event);
    queue.flush();
    event.wait();
    queue.finish();
        cl_ulong when_kernel_queued    = 0;
        cl_ulong when_kernel_submitted = 0;
        cl_ulong when_kernel_started   = 0;
        cl_ulong when_kernel_ended     = 0;

        ret  = event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_QUEUED, &when_kernel_queued );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_SUBMIT, &when_kernel_submitted );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_START, &when_kernel_started );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_END, &when_kernel_ended );

        assert( ret == CL_SUCCESS ); // all ret's should be 1

         double const kernel_time_elapsed = when_kernel_ended - when_kernel_started;
        exec_time_align.push_back(kernel_time_elapsed);
        if( ll > 5 ) {
          num_of_turns_align += 1.0;
          average_execution_time_align += (kernel_time_elapsed - average_execution_time_align)/num_of_turns_align;
      }
   // break;
    }
 }; // end of switch case
      queue.enqueueReadBuffer(C, CL_TRUE, 0, copy_particles_buffer_host.size() * sizeof(uint8_t), copy_particles_buffer_host.data());
      queue.flush();

    //st_Blocks copy_particles_buffer;
    st_Blocks_preset( &copy_particles_buffer );

    ret = st_Blocks_unserialize( &copy_particles_buffer, copy_particles_buffer_host.data() );
    assert( ret == 0 );

    /* on the GPU, these pointers will have __global as a decorator */

#if 0
    // On the CPU after copying the data back from the GPU
    std::cout << "\n On the Host, after applying the drift_track_particles mapping and copying from the GPU\n";

    SIXTRL_GLOBAL_DEC st_BlockInfo const* itr  =
        st_Blocks_get_const_block_infos_begin( &copy_particles_buffer );

    SIXTRL_GLOBAL_DEC st_BlockInfo const* endr =
        st_Blocks_get_const_block_infos_end( &copy_particles_buffer );

    for( ; itr != endr ; ++itr )
    {
        SIXTRL_GLOBAL_DEC st_Particles const* particles =
            ( SIXTRL_GLOBAL_DEC st_Particles const* )itr->begin;

        std::cout.precision( 4 );

        for( st_block_size_t ii = 0 ; ii < NUM_PARTICLES ; ++ii )
        {
            std::cout << " ii    = " << std::setw( 6 ) << ii
                      << std::fixed
                      << " | s     = " << std::setw( 6 ) << particles->s[ ii ]
                      << " | x     = " << std::setw( 6 ) << particles->x[ ii ]
                      << " | y     = " << std::setw( 6 ) << particles->y[ ii ]
                      << " | px    = " << std::setw( 6 ) << particles->px[ ii ]
                      << " | py    = " << std::setw( 6 ) << particles->py[ ii ]
                      << " | sigma = " << std::setw( 6 ) << particles->sigma[ ii ]
                      << " | rpp   = " << std::setw( 6 ) << particles->rpp[ ii ]
                      << " | rvv   = " << std::setw( 6 ) << particles->rvv[ ii ]
                      << "\r\n";
        }
    }

#endif
    std::cout.flush();
    st_Blocks_free( &particles_buffer );
    st_Blocks_free( &copy_particles_buffer );
  } // end of the NUM_REPETITIONS 'for' loop
  switch(choice)
  {
    case 1:
    {
      // printing the contents of the exec_time vector
    std::cout << "track_drift_particle" << std::endl;
    for(std::vector<double>::iterator it = exec_time_drift.begin(); it != exec_time_drift.end(); ++it)
      printf("%.3f s%c",(*it)*1.0e-9, ",\n"[it+1 == exec_time_drift.end()]);
		printf("Reference Version : Time = %.3f s; \n",average_execution_time_drift*1.0e-9);
    //break;
    }
    case 2:
    {
    std::cout << "track_drift_exact_particle" << std::endl;
    for(std::vector<double>::iterator it = exec_time_drift_exact.begin(); it != exec_time_drift_exact.end(); ++it)
      printf("%.3f s%c",(*it)*1.0e-9, ",\n"[it+1 == exec_time_drift_exact.end()]);
		printf("Reference Version: Time = %.3f s; \n",average_execution_time_drift_exact*1.0e-9);
    //break;
    }
    case 3:
    {
    std::cout << "track_cavity_particle" << std::endl;
    for(std::vector<double>::iterator it = exec_time_cavity.begin(); it != exec_time_cavity.end(); ++it)
      printf("%.3f s%c",(*it)*1.0e-9, ",\n"[it+1 == exec_time_cavity.end()]);
		printf("Reference Version: Time = %.3f s; \n",average_execution_time_cavity*1.0e-9);
   // break;
    }
    case 4:
    {
    std::cout << "track_align_particle" << std::endl;
    for(std::vector<double>::iterator it = exec_time_align.begin(); it != exec_time_align.end(); ++it)
      printf("%.3f s%c",(*it)*1.0e-9, ",\n"[it+1 == exec_time_align.end()]);
		printf("Reference Version: Time = %.3f s; \n",average_execution_time_align*1.0e-9);
    break;
    }
  };
    return 0;

  }