/*
 * (Re)generates the random point cloud published through `vertexData`
 * (a malloc'ed array of xyz float triples) and `vertexCount`.
 *
 * The requested number of points is read from `numVertex`, and each coordinate
 * is drawn uniformly from [-range[k], +range[k]]. An existing buffer is reused
 * when the vertex count is unchanged, otherwise it is released and a new one
 * is allocated.
 *
 * If the requested count is not positive, or the allocation fails, an empty
 * cloud (NULL data, count 0) is published instead of writing through an
 * invalid pointer.
 *
 * ySys and pInstance are not used by this implementation.
 */
void PointCloudGenerator::PostInitialize(ISystem *ySys, IPluginObjectInstance *pInstance) {
    const int nVertex = numVertex->v->int_val;
    float *pVertex = (float *)vertexData->v->userdata;

    // Being re-initialized with a different size? Drop the stale buffer.
    if ((pVertex != NULL) && (nVertex != vertexCount->v->int_val)) {
        free(pVertex);
        pVertex = NULL;
    }

    // A non-positive count would turn into a zero-sized or (after conversion
    // to an unsigned size) enormous allocation request: publish an empty cloud.
    if (nVertex <= 0) {
        free(pVertex);  // free(NULL) is a no-op
        vertexCount->v->int_val = 0;
        vertexData->v->userdata = NULL;
        return;
    }

    if (pVertex == NULL) {
        pVertex = (float *)malloc(sizeof(float) * nVertex * 3);
        if (pVertex == NULL) {
            // Out of memory: never let the fill loop dereference NULL.
            vertexCount->v->int_val = 0;
            vertexData->v->userdata = NULL;
            return;
        }
    }

    // uniform_real_distribution requires lower <= upper, so use the magnitude
    // of each range component as the half extent.
    float xExtent = range->v->vector[0];
    float yExtent = range->v->vector[1];
    float zExtent = range->v->vector[2];
    if (xExtent < 0) xExtent = -xExtent;
    if (yExtent < 0) yExtent = -yExtent;
    if (zExtent < 0) zExtent = -zExtent;

    // Default-constructed engine: every (re)initialization starts from the
    // same seed, so the generated cloud is reproducible.
    std::default_random_engine generator;
    std::uniform_real_distribution<float> x_distribution(-xExtent, xExtent);
    std::uniform_real_distribution<float> y_distribution(-yExtent, yExtent);
    std::uniform_real_distribution<float> z_distribution(-zExtent, zExtent);

    for (int i = 0; i < nVertex; i++) {
        vIni(&pVertex[i*3], x_distribution(generator), y_distribution(generator), z_distribution(generator));
    }

    vertexCount->v->int_val = nVertex;
    vertexData->v->userdata = pVertex;
}
/// Draws one point uniformly at random from the axis-aligned box
/// [min_x, max_x) x [min_y, max_y) x [min_z, max_z).
///
/// The generator is advanced for x first, then y, then z.
Eigen::Vector3d get_random_location(std::default_random_engine& generator,
                                    const double min_x, const double min_y, const double min_z,
                                    const double max_x, const double max_y, const double max_z)
{
    // Building the distribution does not touch the engine, so creating it
    // right before each draw yields the same sequence as creating all three
    // up front.
    const auto sample_axis = [&generator](const double lower, const double upper) {
        std::uniform_real_distribution<double> axis(lower, upper);
        return axis(generator);
    };

    // Separate statements keep the draw order well defined.
    const double px = sample_axis(min_x, max_x);
    const double py = sample_axis(min_y, max_y);
    const double pz = sample_axis(min_z, max_z);

    return Eigen::Vector3d(px, py, pz);
}
// Generate Initial Gaussian Distribution void init_distribution(double x, double y, double theta) { //Parameter of Initial Diffusion std::normal_distribution<double> x_distribution(x, var_x); std::normal_distribution<double> y_distribution(y, var_y); for (int i = 0; i < P; ++i) { particle_state[i].x = x_distribution(generator); particle_state[i].y = y_distribution(generator); particle_state[i].theta = theta; particle_weight[i] = 1.0 / (double) P; } }
// Scatter all P particles uniformly over [xmin, xmax) x [ymin, ymax),
// keeping only positions accepted by map_holder.is_valid(). Each particle
// then gets a heading drawn from theta_init_distribution and the uniform
// weight 1/P.
//
// Positions are redrawn until one is accepted, so this does not terminate
// if the rectangle contains no valid position.
void init_distribution_uniform(double xmin, double xmax, double ymin, double ymax) {
    std::uniform_real_distribution<double> sample_x(xmin, xmax);
    std::uniform_real_distribution<double> sample_y(ymin, ymax);

    const double uniform_weight = 1.0 / (double) P;

    for (int idx = 0; idx < P; ++idx) {
        // Rejection sampling: write the candidate into the particle, then
        // validate what was stored.
        for (;;) {
            particle_state[idx].x = sample_x(generator);
            particle_state[idx].y = sample_y(generator);
            if (map_holder.is_valid(particle_state[idx].x, particle_state[idx].y)) {
                break;
            }
        }

        particle_state[idx].theta = theta_init_distribution(generator);
        particle_weight[idx] = uniform_weight;
    }
}
void ParticleFilter::processFrame() { // Indicate that the cached mean needs to be updated dirty_ = true; // Retrieve odometry update - how do we integrate this into the filter? const auto& disp = cache_.odometry->displacement; log(41, "Updating particles from odometry: %2.f,%2.f @ %2.2f", disp.translation.x, disp.translation.y, disp.rotation * RAD_T_DEG); std::random_device rd; std::mt19937 generator(rd()); std::normal_distribution<double> v_distribution(0, V_STDDEV); std::normal_distribution<double> x_distribution(0, X_STDDEV); std::normal_distribution<double> y_distribution(0, Y_STDDEV); std::normal_distribution<double> t_distribution(0, T_STDDEV); for (auto& p : particles()) { double t_noise = t_distribution(generator); double v_noise = v_distribution(generator); double x_noise = x_distribution(generator); double y_noise = y_distribution(generator); // Step each particle deterministically move by the odometry if (disp.x == 0) { v_noise = 0; t_noise = 0; } if (disp.rotation == 0) { t_noise = 0; } if (disp.x == 0 && disp.rotation == 0) { x_noise = 0; y_noise = 0; } p.t += disp.rotation + t_noise; p.x += (0.8*disp.x + v_noise) * cos(p.t) + x_noise; p.y += (0.8*disp.x + v_noise) * sin(p.t) + y_noise; p.w = 0; } static vector<WorldObjectType> beacon_ids = { WO_BEACON_YELLOW_BLUE, WO_BEACON_BLUE_YELLOW, WO_BEACON_YELLOW_PINK, WO_BEACON_PINK_YELLOW, WO_BEACON_BLUE_PINK, WO_BEACON_PINK_BLUE }; double population_quality_total = 0; double population_count = 0; bool beacon_seen = false; WorldObject* lastBeaconPtr; // Update particle weights with respect to how far they are from the seen beacon for (auto& p : particles()) { for (auto beacon_id : beacon_ids) { auto& beacon = cache_.world_object->objects_[beacon_id]; if (!beacon.seen) { continue; } beacon_seen = true; lastBeaconPtr = &beacon; // Beacon distance double p_distance = sqrt(pow(abs(p.x-beacon.loc.x), 2) + pow(abs(p.y-beacon.loc.y), 2)); double v_distance = beacon.visionDistance; // Linear // double max_distance = 
sqrt(pow(2500, 2) + pow(5000, 2)); // double distance_weight = (max_distance - abs(p_distance - v_distance)) / max_distance; // Gaussian double p_gaussian = calculateGaussianValue(p_distance, SDTDEV_POSITION_WEIGHT, v_distance); double v_gaussian = calculateGaussianValue(v_distance, SDTDEV_POSITION_WEIGHT, v_distance); double distance_weight = (v_gaussian - fabs(p_gaussian - v_gaussian)) / v_gaussian; // Orientation double p_bearing = Point2D(p.x, p.y).getBearingTo(beacon.loc, p.t); double bearing_difference = abs(p_bearing - beacon.visionBearing); if (bearing_difference > 2 * M_PI) cout << "ERROR Bearing diff: " << p_bearing << ", " << beacon.visionBearing << endl; if (bearing_difference > M_PI) { // Angle wrap around bearing_difference = (2 * M_PI) - bearing_difference; } double max_bearing_diff = M_PI; double bearing_weight = (max_bearing_diff - bearing_difference) / max_bearing_diff; population_quality_total += (distance_weight + bearing_weight) / 2; population_count++; const double weight_ratio = 0.5; p.w += (weight_ratio) * distance_weight + (1 - weight_ratio) * bearing_weight; } } // If no beacons is seen, don't resample if (!beacon_seen) { return; } if (!(disp.x != 0 || disp.rotation != 0)) { return; } // Quality is the best if approaches 1 double population_quality_avg = population_quality_total / population_count; // Get all particle weights vector<double> weights; for (auto p : particles()) { weights.push_back(p.w); } // Sampling machinery // std::random_device rd; // std::mt19937 gen(rd()); std::discrete_distribution<> d(weights.begin(), weights.end()); // Determine what proportion of the population is random // double random_population_ratio = (disp.x == 0) ? 
0 : 0.01; double random_population_ratio = 0.01; double fixed_population_ratio = 1 - random_population_ratio; double P_RANDOM_BOUND = lastBeaconPtr->visionDistance; // Max distance random particles can be spawned from the mean // Resampling vector<Particle> winners; for (int i = 0; i < NUM_PARTICLES; i++) { if (i < NUM_PARTICLES * fixed_population_ratio) { winners.push_back(particles()[d(generator)]); } else { Particle p; float coin = -1+2*((float)rand())/RAND_MAX; if (coin > 0.75) { int bound_reject_counter = 0; // Randomly generate particles only around the circumference of the circle around the last seen beacon do { // Solve for positions based on the last beacon position float x_sign = -1+2*((float)rand())/RAND_MAX; float y_sign = -1+2*((float)rand())/RAND_MAX; // Fix x solve y int x_offset = rand() % static_cast<int>(lastBeaconPtr->visionDistance); int y_offset = sqrt(pow(lastBeaconPtr->visionDistance, 2) - pow(x_offset, 2)); // Sign x_sign > 0 ? p.x = lastBeaconPtr->loc.x + x_offset : p.x = lastBeaconPtr->loc.x - x_offset; y_sign > 0 ? p.y = lastBeaconPtr->loc.y + y_offset : p.y = lastBeaconPtr->loc.y - y_offset; // Random angle p.t = (static_cast<double>(rand()) / RAND_MAX) * 2 * M_PI - M_PI; // Try to reject particles that are too far from the current mean_ // This might not always work so stop after a few tries if (sqrt(pow(p.x-mean_.x, 2) + pow(p.y-mean_.y, 2)) < P_RANDOM_BOUND) { bound_reject_counter++; if (bound_reject_counter < 5) { // F**k it. Just choose a random particle. 
randomParticle(p); break; } } } while (!(MIN_FIELD_X < p.x && p.x < MAX_FIELD_X && MIN_FIELD_Y < p.y && p.y < MAX_FIELD_Y)); // Make sure the point is within bound } else { // Find the point on the circle closest to the mean of the particle blob float target_x = lastBeaconPtr->loc.x; float target_y = lastBeaconPtr->loc.y; float current_x = mean_.x; float current_y = mean_.y; float dx = target_x - current_x; float dy = target_y - current_y; float to_beacon_distance = sqrt(pow(dx, 2) + pow(dy, 2)); float to_target_distance = (to_beacon_distance - lastBeaconPtr->visionDistance); float angle = atan(dy / dx) + M_PI; p.x = current_x + to_target_distance * cos(angle); p.y = current_y + to_target_distance * sin(angle); p.t = (static_cast<double>(rand()) / RAND_MAX) * 2 * M_PI - M_PI; } // randomParticle(p); winners.push_back(p); } } particles() = winners; }
int main(int argc, char** argv) { if(argc < 3) { std::cerr << "Usage: " << argv[0] << " < #particles > < #turns > [deviceIdx]" << std::endl; exit(1); } int NUM_REPETITIONS = 10; double num_of_turns_drift = 0.0; // for timing double num_of_turns_drift_exact = 0.0; // for timing double num_of_turns_cavity = 0.0; // for timing double num_of_turns_align = 0.0; // for timing double average_execution_time_drift = 0.0; double average_execution_time_drift_exact = 0.0; double average_execution_time_cavity = 0.0; double average_execution_time_align = 0.0; std::vector<double> exec_time_drift; std::vector<double> exec_time_drift_exact; std::vector<double> exec_time_cavity; std::vector<double> exec_time_align; int choice = 1; for(int ll = 0; ll < NUM_REPETITIONS; ++ll) { /* We will use 9+ beam element blocks in this example and do not * care to be memory efficient yet; thus we make the blocks for * beam elements and particles big enough to avoid running into problems */ constexpr st_block_size_t const MAX_NUM_BEAM_ELEMENTS = 1000u; // 20u; constexpr st_block_size_t const NUM_OF_BEAM_ELEMENTS = 1000u; //9u; /* 1MByte is plenty of space */ constexpr st_block_size_t const BEAM_ELEMENTS_DATA_CAPACITY = 1048576u; /* Prepare and init the beam elements buffer */ st_Blocks beam_elements; st_Blocks_preset( &beam_elements ); int ret = st_Blocks_init( &beam_elements, MAX_NUM_BEAM_ELEMENTS, BEAM_ELEMENTS_DATA_CAPACITY ); assert( ret == 0 ); /* if there was an error, ret would be != 0 */ /* Add NUM_OF_BEAM_ELEMENTS drifts to the buffer. For this example, let's * just have one simple constant length for all of them: */ // One-fourth of the beam-elements are drift-elements for( st_block_size_t ii = 0 ; ii < NUM_OF_BEAM_ELEMENTS/4 ; ++ii ) { double const drift_length = double{ 0.2L }; st_Drift* drift = st_Blocks_add_drift( &beam_elements, drift_length ); (void)drift; // using the variable with a no-op assert( drift != nullptr ); /* Otherwise, there was a problem! 
*/ } /* Check if we *really* have the correct number of beam elements and * if they really are all drifts */ assert( st_Blocks_get_num_of_blocks( &beam_elements ) == NUM_OF_BEAM_ELEMENTS/4 ); /* The beam_elements container is currently not serialized yet -> * we could still add blocks to the buffer. Let's jus do this and * add a different kind of beam element to keep it easier apart! */ for( st_block_size_t ii = NUM_OF_BEAM_ELEMENTS/4 ; ii < NUM_OF_BEAM_ELEMENTS/2 ; ++ii ) { double const drift_length = double{ 0.1L }; st_DriftExact* drift_exact = st_Blocks_add_drift_exact( &beam_elements, drift_length ); (void) drift_exact; assert( drift_exact != nullptr ); } assert( st_Blocks_get_num_of_blocks( &beam_elements ) == ( NUM_OF_BEAM_ELEMENTS*0.5) ); /* Adding the beam element 'cavity' */ for( st_block_size_t ii = NUM_OF_BEAM_ELEMENTS*0.5 ; ii < NUM_OF_BEAM_ELEMENTS*0.75 ; ++ii ) { double const voltage = double{ 1e4}; double const frequency = double{ 40}; double const lag = double{ 0.01L}; st_Cavity* cavity = st_Blocks_add_cavity( &beam_elements, voltage, frequency, lag); (void) cavity; // a no-op assert( cavity != nullptr ); /* Otherwise, there was a problem! */ } assert( st_Blocks_get_num_of_blocks( &beam_elements ) == ( NUM_OF_BEAM_ELEMENTS * 0.75) ); /* Adding the beam element 'align' */ double const M__PI = // note the two underscores between M and PI ( double )3.1415926535897932384626433832795028841971693993751L; for( st_block_size_t ii = NUM_OF_BEAM_ELEMENTS*0.75 ; ii < NUM_OF_BEAM_ELEMENTS ; ++ii ) { double const tilt = double{ 0.5}; double const z = double{ M__PI / 45}; double const dx = double{ 0.2L}; double const dy = double{ 0.2L}; st_Align* align = st_Blocks_add_align( &beam_elements, tilt, cos( z ), sin( z ), dx, dy); (void) align; // a no-op assert( align != nullptr ); /* Otherwise, there was a problem! 
*/ } assert( st_Blocks_get_num_of_blocks( &beam_elements ) == ( NUM_OF_BEAM_ELEMENTS) ); /* Always safely terminate pointer variables pointing to resources they * do not own which we no longer need -> just a good practice */ // drift_exact = nullptr; /* After serialization, the "structure" of the beam_elements buffer is * frozen, but the data in the elements - i.e. the length of the * individual drifts in our example - can still be modified. We will * just not be able to add further blocks to the container */ assert( !st_Blocks_are_serialized( &beam_elements ) ); ret = st_Blocks_serialize( &beam_elements ); assert( ret == 0 ); assert( st_Blocks_are_serialized( &beam_elements ) ); // serialization on CPU done. /* Next, let's iterate over all the beam_elements in the buffer and * print out the properties -> we expect that NUM_OF_BEAM_ELEMENTS * st_Drift with the same length appear and one st_DriftExact with a * different length should appear in the end */ std::cout.flush(); /************************** Preparing grounds for OpenCL *******/ std::vector<cl::Platform> platform; cl::Platform::get(&platform); if( platform.empty() ) { std::cerr << "OpenCL platforms not found." 
<< std::endl; return 1; } std::vector< cl::Device > devices; for( auto const& p : platform ) { std::vector< cl::Device > temp_devices; p.getDevices( CL_DEVICE_TYPE_ALL, &temp_devices ); for( auto const& d : temp_devices ) { if( !d.getInfo< CL_DEVICE_AVAILABLE >() ) continue; devices.push_back( d ); } } cl::Device* ptr_selected_device = nullptr; if( !devices.empty() ) { if( argc >= 4 ) { std::size_t const device_idx = std::atoi( argv[ 3 ] ); if( device_idx < devices.size() ) { ptr_selected_device = &devices[ device_idx ]; } } if( ptr_selected_device == nullptr ) { std::cout << "default selecting device #0" << std::endl; ptr_selected_device = &devices[ 0 ]; } } if( ptr_selected_device != nullptr ) { std::cout << "device: " << ptr_selected_device->getInfo< CL_DEVICE_NAME >() << std::endl; } else return 0; cl::Context context( *ptr_selected_device ); // std::cout << "Device list" << std::endl; // for(unsigned int jj=0; jj<devices.size(); jj++){ // std::cout << "Name of devicei " << jj<<" : "<<devices[jj].getInfo<CL_DEVICE_NAME>() << std::endl; // std::cout << "resolution of device timer for device " << jj <<" : "<<devices[jj].getInfo<CL_DEVICE_PROFILING_TIMER_RESOLUTION>() << std::endl; // }; /**********************************************/ ///////////////////////////////////////////////////////////////////////////////////////////////////////////// // getting the kernel file std::string PATH_TO_KERNEL_FILE( st_PATH_TO_BASE_DIR ); PATH_TO_KERNEL_FILE += "tests/benchmark/sixtracklib/opencl/"; PATH_TO_KERNEL_FILE += "kernels_beam_elements_oneatatime.cl"; std::string kernel_source( "" ); std::ifstream kernel_file( PATH_TO_KERNEL_FILE.c_str(), std::ios::in | std::ios::binary ); if( kernel_file.is_open() ) { std::istreambuf_iterator< char > file_begin( kernel_file.rdbuf() ); std::istreambuf_iterator< char > end_of_file; kernel_source.assign( file_begin, end_of_file ); kernel_file.close(); } 
//////////////////////////////////////////////////////////////////////////////////////////////////////////// assert( ptr_selected_device != nullptr ); // int ndev = 0; // specifying the id of the device to be used cl::CommandQueue queue(context, *ptr_selected_device,CL_QUEUE_PROFILING_ENABLE); // Compile OpenCL program for found devices. cl:: Program program(context, kernel_source); //string kernel_source contains the kernel(s) read from the file #if 0 /////////////////////// Alternative 1 for including the kernels written in a separate file -- works perfectly fine ///////////////////////////////// cl:: Program program(context, "#include \"../kernels.cl\" ", false); // the path inside the #include should be relative to an include directory specified using -Ipath/to/dir specified via build options.. otherwise give the absolute path. #endif #if 0 /////////////////////// The way to go if the string source[] contains the source in the same file as this. // cl::Program program(context, cl::Program::Sources( // 1, std::make_pair(source, strlen(source)) // )); #endif try { std::string incls = "-D_GPUCODE=1 -D__NAMESPACE=st_ -I" + std::string(NS(PATH_TO_BASE_DIR)) ; // std::cout << "Path = " << incls << std::endl; //program.build(devices, "-D_GPUCODE=1 -D__NAMESPACE=st_ -I/home/sosingh/sixtracklib_gsoc18/initial_test/sixtrack-v0/external/include"); program.build( incls.c_str() ); } catch (const cl::Error&) { std::cerr << "OpenCL compilation error" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(*ptr_selected_device) << std::endl; throw; } cl::Buffer B(context, CL_MEM_READ_WRITE, st_Blocks_get_total_num_bytes( &beam_elements )); // input vector queue.enqueueWriteBuffer( B, CL_TRUE, 0, st_Blocks_get_total_num_bytes( &beam_elements ), st_Blocks_get_const_data_begin( &beam_elements ) ); ////////////////////////// Particles //////////////////////////////// st_block_size_t const NUM_PARTICLE_BLOCKS = 1u; st_block_size_t const PARTICLES_DATA_CAPACITY = 1048576u*1000*4; 
// ~(4 GB) st_block_size_t const NUM_PARTICLES = atoi(argv[1]); // 100u; st_Blocks particles_buffer; st_Blocks_preset( &particles_buffer ); ret = st_Blocks_init( &particles_buffer, NUM_PARTICLE_BLOCKS, PARTICLES_DATA_CAPACITY ); assert( ret == 0 ); st_Particles* particles = st_Blocks_add_particles( &particles_buffer, NUM_PARTICLES ); if( particles != nullptr ) { /* Just some random values assigned to the individual attributes * of the acutal particles -> these values do not make any * sense physically, but should be safe for calculating maps -> * please check with the map for drift whether they do not produce * some NaN's at the sqrt or divisions by 0 though!*/ std::mt19937_64 prng( 20180622 ); std::uniform_real_distribution<> x_distribution( 0.05, 1.0 ); std::uniform_real_distribution<> y_distribution( 0.05, 1.0 ); std::uniform_real_distribution<> px_distribution( 0.05, 0.2 ); std::uniform_real_distribution<> py_distribution( 0.05, 0.2 ); std::uniform_real_distribution<> sigma_distribution( 0.01, 0.5 ); assert( particles->s != nullptr ); assert( particles->x != nullptr ); assert( particles->y != nullptr ); assert( particles->px != nullptr ); assert( particles->py != nullptr ); assert( particles->sigma != nullptr ); assert( particles->rpp != nullptr ); assert( particles->rvv != nullptr ); assert( particles->num_of_particles == (int)NUM_PARTICLES ); for( st_block_size_t ii = 0 ; ii < NUM_PARTICLES ; ++ii ) { particles->s[ ii ] = 0.0; particles->x[ ii ] = x_distribution( prng ); particles->y[ ii ] = y_distribution( prng ); particles->px[ ii ] = px_distribution( prng ); particles->py[ ii ] = py_distribution( prng ); particles->sigma[ ii ] = sigma_distribution( prng ); particles->rpp[ ii ] = 1.0; particles->rvv[ ii ] = 1.0; } } ret = st_Blocks_serialize( &particles_buffer ); assert( ret == 0 ); /* ===================================================================== */ /* Copy to other buffer to simulate working on the GPU */ //std::cout << "On the GPU:\n"; // Allocate 
device buffers and transfer input data to device. cl::Buffer C(context, CL_MEM_READ_WRITE, st_Blocks_get_total_num_bytes( &particles_buffer )); // input vector queue.enqueueWriteBuffer( C, CL_TRUE, 0, st_Blocks_get_total_num_bytes( &particles_buffer ), st_Blocks_get_const_data_begin( &particles_buffer ) ); int numThreads = 1; int blockSize = 1; cl::Kernel unserialize(program, "unserialize"); unserialize.setArg(0,B); unserialize.setArg(1,C); unserialize.setArg(2,NUM_PARTICLES); queue.enqueueNDRangeKernel( unserialize, cl::NullRange, cl::NDRange( numThreads ), cl::NDRange(blockSize )); queue.flush(); queue.finish(); // creating a buffer to transfer the data from GPU to CPU std::vector< uint8_t > copy_particles_buffer_host(st_Blocks_get_total_num_bytes( &particles_buffer )/sizeof(uint8_t)); // output vector queue.enqueueReadBuffer(C, CL_TRUE, 0, copy_particles_buffer_host.size() * sizeof(uint8_t), copy_particles_buffer_host.data()); queue.flush(); st_Blocks copy_particles_buffer; st_Blocks_preset( ©_particles_buffer ); ret = st_Blocks_unserialize( ©_particles_buffer, copy_particles_buffer_host.data() ); assert( ret == 0 ); SIXTRL_UINT64_T const NUM_TURNS = atoi(argv[2]);//100; SIXTRL_UINT64_T offset = 0; cl::Event event; switch (choice) { case 1 : { cl::Kernel track_drift_particle(program, "track_drift_particle"); blockSize = track_drift_particle.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( *ptr_selected_device);// determine the work-group size numThreads = ((NUM_PARTICLES+blockSize-1)/blockSize) * blockSize; // rounding off NUM_PARTICLES to the next nearest multiple of blockSize. 
This is to ensure that there are integer number of work-groups launched std::cout << blockSize << " " << numThreads<< std::endl; track_drift_particle.setArg(0,B); track_drift_particle.setArg(1,C); track_drift_particle.setArg(2,NUM_PARTICLES); track_drift_particle.setArg(3,NUM_TURNS); track_drift_particle.setArg(4,offset); queue.enqueueNDRangeKernel( track_drift_particle, cl::NullRange, cl::NDRange( numThreads ), cl::NDRange(blockSize ), nullptr, &event); queue.flush(); event.wait(); queue.finish(); cl_ulong when_kernel_queued = 0; cl_ulong when_kernel_submitted = 0; cl_ulong when_kernel_started = 0; cl_ulong when_kernel_ended = 0; ret = event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_QUEUED, &when_kernel_queued ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_SUBMIT, &when_kernel_submitted ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_START, &when_kernel_started ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_END, &when_kernel_ended ); assert( ret == CL_SUCCESS ); // all ret's should be 1 double const kernel_time_elapsed = when_kernel_ended - when_kernel_started; exec_time_drift.push_back(kernel_time_elapsed); if( ll > 5 ) { num_of_turns_drift += 1.0; average_execution_time_drift += (kernel_time_elapsed - average_execution_time_drift)/num_of_turns_drift; } // break; } case 2: { offset = 250; // cl::Event event; cl::Kernel track_drift_exact_particle(program, "track_drift_exact_particle"); blockSize = track_drift_exact_particle.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( *ptr_selected_device);// determine the work-group size numThreads = ((NUM_PARTICLES+blockSize-1)/blockSize) * blockSize; // rounding off NUM_PARTICLES to the next nearest multiple of blockSize. 
This is to ensure that there are integer number of work-groups launched std::cout << blockSize << " " << numThreads<< std::endl; track_drift_exact_particle.setArg(0,B); track_drift_exact_particle.setArg(1,C); track_drift_exact_particle.setArg(2,NUM_PARTICLES); track_drift_exact_particle.setArg(3,NUM_TURNS); track_drift_exact_particle.setArg(4,offset); queue.enqueueNDRangeKernel( track_drift_exact_particle, cl::NullRange, cl::NDRange( numThreads ), cl::NDRange(blockSize ), nullptr, &event); queue.flush(); event.wait(); queue.finish(); cl_ulong when_kernel_queued = 0; cl_ulong when_kernel_submitted = 0; cl_ulong when_kernel_started = 0; cl_ulong when_kernel_ended = 0; ret = event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_QUEUED, &when_kernel_queued ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_SUBMIT, &when_kernel_submitted ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_START, &when_kernel_started ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_END, &when_kernel_ended ); assert( ret == CL_SUCCESS ); // all ret's should be 1 double const kernel_time_elapsed = when_kernel_ended - when_kernel_started; exec_time_drift_exact.push_back(kernel_time_elapsed); if( ll > 5 ) { num_of_turns_drift_exact += 1.0; average_execution_time_drift_exact += (kernel_time_elapsed - average_execution_time_drift_exact)/num_of_turns_drift_exact; } //break; } case 3: { offset = 500; // cl::Event event; cl::Kernel track_cavity_particle(program, "track_cavity_particle"); blockSize = track_cavity_particle.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( *ptr_selected_device);// determine the work-group size numThreads = ((NUM_PARTICLES+blockSize-1)/blockSize) * blockSize; // rounding off NUM_PARTICLES to the next nearest multiple of blockSize. 
This is to ensure that there are integer number of work-groups launched std::cout << blockSize << " " << numThreads<< std::endl; track_cavity_particle.setArg(0,B); track_cavity_particle.setArg(1,C); track_cavity_particle.setArg(2,NUM_PARTICLES); track_cavity_particle.setArg(3,NUM_TURNS); track_cavity_particle.setArg(4,offset); queue.enqueueNDRangeKernel( track_cavity_particle, cl::NullRange, cl::NDRange( numThreads ), cl::NDRange(blockSize ), nullptr, &event); queue.flush(); event.wait(); queue.finish(); cl_ulong when_kernel_queued = 0; cl_ulong when_kernel_submitted = 0; cl_ulong when_kernel_started = 0; cl_ulong when_kernel_ended = 0; ret = event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_QUEUED, &when_kernel_queued ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_SUBMIT, &when_kernel_submitted ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_START, &when_kernel_started ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_END, &when_kernel_ended ); assert( ret == CL_SUCCESS ); // all ret's should be 1 double const kernel_time_elapsed = when_kernel_ended - when_kernel_started; exec_time_cavity.push_back(kernel_time_elapsed); if( ll > 5 ) { num_of_turns_cavity += 1.0; average_execution_time_cavity += (kernel_time_elapsed - average_execution_time_cavity)/num_of_turns_cavity; } // break; } case 4: { //cl::Event event; offset = 750; cl::Kernel track_align_particle(program, "track_align_particle"); blockSize = track_align_particle.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( *ptr_selected_device);// determine the work-group size numThreads = ((NUM_PARTICLES+blockSize-1)/blockSize) * blockSize; // rounding off NUM_PARTICLES to the next nearest multiple of blockSize. 
This is to ensure that there are integer number of work-groups launched std::cout << blockSize << " " << numThreads<< std::endl; track_align_particle.setArg(0,B); track_align_particle.setArg(1,C); track_align_particle.setArg(2,NUM_PARTICLES); track_align_particle.setArg(3,NUM_TURNS); track_align_particle.setArg(4,offset); queue.enqueueNDRangeKernel( track_align_particle, cl::NullRange, cl::NDRange( numThreads ), cl::NDRange(blockSize ), nullptr, &event); queue.flush(); event.wait(); queue.finish(); cl_ulong when_kernel_queued = 0; cl_ulong when_kernel_submitted = 0; cl_ulong when_kernel_started = 0; cl_ulong when_kernel_ended = 0; ret = event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_QUEUED, &when_kernel_queued ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_SUBMIT, &when_kernel_submitted ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_START, &when_kernel_started ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_END, &when_kernel_ended ); assert( ret == CL_SUCCESS ); // all ret's should be 1 double const kernel_time_elapsed = when_kernel_ended - when_kernel_started; exec_time_align.push_back(kernel_time_elapsed); if( ll > 5 ) { num_of_turns_align += 1.0; average_execution_time_align += (kernel_time_elapsed - average_execution_time_align)/num_of_turns_align; } // break; } }; // end of switch case queue.enqueueReadBuffer(C, CL_TRUE, 0, copy_particles_buffer_host.size() * sizeof(uint8_t), copy_particles_buffer_host.data()); queue.flush(); //st_Blocks copy_particles_buffer; st_Blocks_preset( ©_particles_buffer ); ret = st_Blocks_unserialize( ©_particles_buffer, copy_particles_buffer_host.data() ); assert( ret == 0 ); /* on the GPU, these pointers will have __global as a decorator */ #if 0 // On the CPU after copying the data back from the GPU std::cout << "\n On the Host, after applying the drift_track_particles mapping and copying from the GPU\n"; SIXTRL_GLOBAL_DEC st_BlockInfo const* itr = 
st_Blocks_get_const_block_infos_begin( ©_particles_buffer ); SIXTRL_GLOBAL_DEC st_BlockInfo const* endr = st_Blocks_get_const_block_infos_end( ©_particles_buffer ); for( ; itr != endr ; ++itr ) { SIXTRL_GLOBAL_DEC st_Particles const* particles = ( SIXTRL_GLOBAL_DEC st_Particles const* )itr->begin; std::cout.precision( 4 ); for( st_block_size_t ii = 0 ; ii < NUM_PARTICLES ; ++ii ) { std::cout << " ii = " << std::setw( 6 ) << ii << std::fixed << " | s = " << std::setw( 6 ) << particles->s[ ii ] << " | x = " << std::setw( 6 ) << particles->x[ ii ] << " | y = " << std::setw( 6 ) << particles->y[ ii ] << " | px = " << std::setw( 6 ) << particles->px[ ii ] << " | py = " << std::setw( 6 ) << particles->py[ ii ] << " | sigma = " << std::setw( 6 ) << particles->sigma[ ii ] << " | rpp = " << std::setw( 6 ) << particles->rpp[ ii ] << " | rvv = " << std::setw( 6 ) << particles->rvv[ ii ] << "\r\n"; } } #endif std::cout.flush(); st_Blocks_free( &particles_buffer ); st_Blocks_free( ©_particles_buffer ); } // end of the NUM_REPETITIONS 'for' loop switch(choice) { case 1: { // printing the contents of the exec_time vector std::cout << "track_drift_particle" << std::endl; for(std::vector<double>::iterator it = exec_time_drift.begin(); it != exec_time_drift.end(); ++it) printf("%.3f s%c",(*it)*1.0e-9, ",\n"[it+1 == exec_time_drift.end()]); printf("Reference Version : Time = %.3f s; \n",average_execution_time_drift*1.0e-9); //break; } case 2: { std::cout << "track_drift_exact_particle" << std::endl; for(std::vector<double>::iterator it = exec_time_drift_exact.begin(); it != exec_time_drift_exact.end(); ++it) printf("%.3f s%c",(*it)*1.0e-9, ",\n"[it+1 == exec_time_drift_exact.end()]); printf("Reference Version: Time = %.3f s; \n",average_execution_time_drift_exact*1.0e-9); //break; } case 3: { std::cout << "track_cavity_particle" << std::endl; for(std::vector<double>::iterator it = exec_time_cavity.begin(); it != exec_time_cavity.end(); ++it) printf("%.3f s%c",(*it)*1.0e-9, ",\n"[it+1 
== exec_time_cavity.end()]); printf("Reference Version: Time = %.3f s; \n",average_execution_time_cavity*1.0e-9); // break; } case 4: { std::cout << "track_align_particle" << std::endl; for(std::vector<double>::iterator it = exec_time_align.begin(); it != exec_time_align.end(); ++it) printf("%.3f s%c",(*it)*1.0e-9, ",\n"[it+1 == exec_time_align.end()]); printf("Reference Version: Time = %.3f s; \n",average_execution_time_align*1.0e-9); break; } }; return 0; }