/**
 * Reads an OpenCL kernel source file, creates a program from it and builds
 * it for this helper's `device` inside this helper's `context`.
 *
 * The file is looked up at `<executable path> + file_name` first and, when
 * that cannot be opened, at `<executable path> + "../" + file_name`.
 *
 * @param file_name  Kernel file name, appended to the executable path.
 * @return The program handle. Every failure is reported through FATAL
 *         (presumably FATAL does not return -- TODO confirm).
 */
cl_program CLHelper::CreateProgram ( const char* file_name ) {
  cl_int error = 0;
  cl_program program = 0;

  LOGDEBUG << "Compiling " << file_name;

  std::string binary_path;
  System::GetExecutablePath ( binary_path );

  // Search in binary path first
  std::string full_path = binary_path + std::string ( file_name );

  // If kernel cannot be found, go up one folder (Xcode, Visual Studio and
  // other multi-target build setups)
  if ( !std::ifstream ( full_path, std::ios::in ).good() ) {
    full_path = binary_path + "../" + std::string ( file_name );
  }

  std::ifstream kernel_file ( full_path, std::ios::in );

  if ( !kernel_file.good() ) {
    FATAL ( "Cannot open kernel: " << full_path );
  }

  std::ostringstream oss;
  oss << kernel_file.rdbuf();
  std::string kernel_content = oss.str();
  const char* kernel_content_char = kernel_content.c_str();

  // Pass &error so that the reason for a failure is not lost.
  program = clCreateProgramWithSource ( context, 1,
            ( const char** ) &kernel_content_char, NULL, &error );

  if ( program == NULL ) {
    FATAL ( "Cannot create kernel: " << file_name
            << " (OpenCL error " << error << ")" );
  }

  error = clBuildProgram ( program, 1, &device, NULL, NULL, NULL );

  if ( error != CL_SUCCESS ) {
    // Ask for the log length first: a fixed-size buffer is left untouched
    // (i.e. uninitialised) when the query fails, e.g. because the log does
    // not fit.
    std::string build_log;
    size_t log_size = 0;
    bool have_log =
      clGetProgramBuildInfo ( program, device, CL_PROGRAM_BUILD_LOG, 0, NULL,
                              &log_size ) == CL_SUCCESS && log_size > 0;

    if ( have_log ) {
      build_log.resize ( log_size );
      have_log =
        clGetProgramBuildInfo ( program, device, CL_PROGRAM_BUILD_LOG,
                                log_size, &build_log[0], NULL ) == CL_SUCCESS;
    }

    // c_str() stops at the terminating NUL that the log query writes.
    LOGERROR << "Error compiling kernel " << file_name << ":\n"
             << ( have_log ? build_log.c_str() : "<build log unavailable>" );
    FATAL ( "Compilation failed, exiting..." );
  }

  return program;
}
/**
 * Creates and builds an OpenCL program from a source file.
 *
 * @param context    Context the program is created in.
 * @param device     Device whose build log is printed when the build fails.
 *                   The build itself is requested with an empty device list.
 * @param file_name  Path of the kernel source file.
 * @return The built program, or NULL when the file cannot be opened, the
 *         program cannot be created or the build fails. Diagnostics are
 *         written to std::cerr; a program whose build failed is released
 *         before returning.
 */
cl_program CreateProgram(cl_context context, cl_device_id device, const char *file_name)
{
    std::ifstream kernel_file(file_name, std::ios::in);
    if (!kernel_file.is_open())
    {
        std::cerr << "Failed to open file for reading: " << file_name << std::endl;
        return NULL;
    }

    std::ostringstream oss;
    oss << kernel_file.rdbuf();
    std::string source_stand_string = oss.str();
    const char *source_string = source_stand_string.c_str();

    cl_int error_number = CL_SUCCESS;
    cl_program program = clCreateProgramWithSource(context, 1,
                                                   (const char**)&source_string,
                                                   NULL, &error_number);
    if (program == NULL)
    {
        std::cerr << "Failed to create CL program from source." << std::endl;
        return NULL;
    }

    error_number = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (error_number != CL_SUCCESS)
    {
        // Query the log length first; a fixed-size buffer stays uninitialised
        // when the query fails (e.g. because the log does not fit).
        std::string build_log;
        size_t log_size = 0;
        bool have_log =
            clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                                  0, NULL, &log_size) == CL_SUCCESS &&
            log_size > 0;
        if (have_log)
        {
            build_log.resize(log_size);
            have_log =
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                                      log_size, &build_log[0], NULL) == CL_SUCCESS;
        }

        std::cerr << "Error in kernel: " << std::endl;
        // c_str() stops at the terminating NUL written by the log query.
        std::cerr << (have_log ? build_log.c_str() : "<build log unavailable>");

        clReleaseProgram(program);
        return NULL;
    }

    return program;
}
int main() { using size_t = ::st_buffer_size_t; using num_particles_t = ::st_particle_num_elements_t; using buffer_t = ::st_Buffer; using object_t = ::st_Object; using particles_t = ::st_Particles; using timing_result_t = sixtrack::benchmarks::TimingResult; size_t const NUM_TURNS = size_t{ 20u }; /* ===================================================================== */ /* ==== Prepare Host Buffers */ double begin_time = ::st_Time_get_seconds_since_epoch(); std::vector< size_t > num_particles_list = { 20000u }; std::sort( num_particles_list.begin(), num_particles_list.end() ); /* --------------------------------------------------------------------- */ buffer_t* lhc_beam_elements = ::st_Buffer_new_from_file( ::st_PATH_TO_TEST_LHC_BEAM_ELEMENTS_DATA_NO_BEAM_BEAM ); object_t const* be_begin = ::st_Buffer_get_const_objects_begin( lhc_beam_elements ); object_t const* be_end = ::st_Buffer_get_const_objects_end( lhc_beam_elements ); /* --------------------------------------------------------------------- */ buffer_t* lhc_particles_buffer = ::st_Buffer_new_from_file( st_PATH_TO_TEST_LHC_PARTICLES_DATA_T1_P2_NO_BEAM_BEAM ); particles_t const* lhc_particles = ( particles_t const* )( uintptr_t )::st_Object_get_begin_addr( ::st_Buffer_get_const_objects_begin( lhc_particles_buffer ) ); size_t const lhc_num_particles = ::st_Particles_get_num_of_particles( lhc_particles ); /* --------------------------------------------------------------------- */ size_t const max_num_particles = num_particles_list.back(); size_t const requ_num_slots = ::st_Particles_get_required_num_slots( lhc_particles_buffer, max_num_particles ); size_t const requ_num_dataptrs = ::st_Particles_get_required_num_dataptrs( lhc_particles_buffer, max_num_particles ); size_t const req_particles_buffer_size = ::st_Buffer_calculate_required_buffer_length( lhc_particles_buffer, max_num_particles, requ_num_slots, requ_num_dataptrs, size_t{ 0 } ); buffer_t* particles_buffer = ::st_Buffer_new( req_particles_buffer_size 
); /* --------------------------------------------------------------------- */ double now = ::st_Time_get_seconds_since_epoch(); double const time_setup_host_buffers = ( now >= begin_time ) ? ( now - begin_time ) : double{ 0.0 }; /* ===================================================================== */ /* ==== Prepare OpenCL Environment Buffers */ begin_time = ::st_Time_get_seconds_since_epoch(); std::vector< cl::Platform > platforms; cl::Platform::get( &platforms ); std::vector< cl::Device > devices; for( auto const& p : platforms ) { std::vector< cl::Device > temp_devices; p.getDevices( CL_DEVICE_TYPE_ALL, &temp_devices ); for( auto const& d : temp_devices ) { if( !d.getInfo< CL_DEVICE_AVAILABLE >() ) continue; devices.push_back( d ); } } now = ::st_Time_get_seconds_since_epoch(); double const time_get_platforms = ( now >= begin_time ) ? now - begin_time : double{ 0 }; begin_time = ::st_Time_get_seconds_since_epoch(); if( !devices.empty() ) { std::ostringstream a2str( "" ); std::string const PATH_TO_BASE_DIR = ::st_PATH_TO_BASE_DIR; a2str << " -D_GPUCODE=1" << " -D__NAMESPACE=st_" << " -DSIXTRL_DATAPTR_DEC=__global" << " -DSIXTRL_BUFFER_DATAPTR_DEC=__global" << " -DSIXTRL_BUFFER_OBJ_ARGPTR_DEC=__global" << " -DISXTRL_BUFFER_OBJ_DATAPTR_DEC=__global" << " -DSIXTRL_PARTICLE_ARGPTR_DEC=__global" << " -DSIXTRL_PARTICLE_DATAPTR_DEC=__global" << " -DSIXTRL_BE_ARGPTR_DEC=__global" << " -DSIXTRL_BE_DATAPTR_DEC=__global" << " -I" << PATH_TO_BASE_DIR; std::string const REMAP_COMPILE_OPTIONS = a2str.str(); /* ----------------------------------------------------------------- */ std::string path_to_source = PATH_TO_BASE_DIR; path_to_source += "sixtracklib/opencl/impl/track_particles_kernel.cl"; std::ifstream kernel_file( path_to_source, std::ios::in ); std::string const REMAP_PROGRAM_SOURCE_CODE( ( std::istreambuf_iterator< char >( kernel_file ) ), std::istreambuf_iterator< char >() ); kernel_file.close(); path_to_source = PATH_TO_BASE_DIR; path_to_source += 
"sixtracklib/opencl/impl/"; path_to_source += "track_particles_priv_particles_optimized_kernel.cl"; kernel_file.open( path_to_source, std::ios::in ); std::string const TRACKING_PRORGRAM_SOURCE_CODE( ( std::istreambuf_iterator< char >( kernel_file ) ), std::istreambuf_iterator< char >() ); a2str.str( "" ); a2str << " -D_GPUCODE=1" << " -D__NAMESPACE=st_" << " -DSIXTRL_DATAPTR_DEC=__global" << " -DSIXTRL_BUFFER_DATAPTR_DEC=__global" << " -DSIXTRL_BUFFER_OBJ_ARGPTR_DEC=__global" << " -DISXTRL_BUFFER_OBJ_DATAPTR_DEC=__global" << " -DSIXTRL_PARTICLE_ARGPTR_DEC=__private" << " -DSIXTRL_PARTICLE_DATAPTR_DEC=__private" << " -DSIXTRL_BE_ARGPTR_DEC=__global" << " -DSIXTRL_BE_DATAPTR_DEC=__global" << " -I" << PATH_TO_BASE_DIR; std::string const TRACKING_COMPILE_OPTIONS = a2str.str(); /* ----------------------------------------------------------------- */ for( auto& device : devices ) { cl::Platform platform( device.getInfo< CL_DEVICE_PLATFORM >() ); std::cout << "--------------------------------------------------" << "----------------------------------------------\r\n" << "INFO :: Perform test for device : " << device.getInfo< CL_DEVICE_NAME >() << "\r\n" << "INFO :: Platform : " << platform.getInfo< CL_PLATFORM_NAME >() << "\r\n" << "INFO :: Platform Vendor : " << platform.getInfo< CL_PLATFORM_VENDOR >() << "\r\n" << "INFO :: Device Type : "; auto const device_type = device.getInfo< CL_DEVICE_TYPE >(); switch( device_type ) { case CL_DEVICE_TYPE_CPU: { std::cout << "CPU"; break; } case CL_DEVICE_TYPE_GPU: { std::cout << "GPU"; break; } case CL_DEVICE_TYPE_ACCELERATOR: { std::cout << "Accelerator"; break; } case CL_DEVICE_TYPE_CUSTOM: { std::cout << "Custom"; break; } default: { std::cout << "Unknown"; } }; size_t const device_max_compute_units = device.getInfo< CL_DEVICE_MAX_COMPUTE_UNITS >(); std::cout << "\r\n" << "INFO :: Max work-group size : " << device.getInfo< CL_DEVICE_MAX_WORK_GROUP_SIZE >() << "\r\n" << "INFO :: Max num compute units : " << device_max_compute_units 
<< "\r\n"; /* ------------------------------------------------------------- */ cl_int cl_ret = CL_SUCCESS; cl::Context context( device ); cl::CommandQueue queue( context, device, CL_QUEUE_PROFILING_ENABLE ); cl::Program remap_program( context, REMAP_PROGRAM_SOURCE_CODE ); cl::Program tracking_program( context, TRACKING_PRORGRAM_SOURCE_CODE ); try { cl_ret = remap_program.build( REMAP_COMPILE_OPTIONS.c_str() ); } catch( cl::Error const& e ) { std::cerr << "ERROR :: remap_program :: " << "OpenCL Compilation Error -> Stopping Unit-Test \r\n" << remap_program.getBuildInfo< CL_PROGRAM_BUILD_LOG >( device ) << "\r\n" << std::endl; cl_ret = CL_FALSE; throw; } try { cl_ret = tracking_program.build( TRACKING_COMPILE_OPTIONS.c_str() ); } catch( cl::Error const& e ) { std::cerr << "ERROR :: tracking_program :: " << "OpenCL Compilation Error -> Stopping Unit-Test \r\n" << tracking_program.getBuildInfo< CL_PROGRAM_BUILD_LOG >( device ) << "\r\n" << std::endl; cl_ret = CL_FALSE; throw; } /* ------------------------------------------------------------- */ cl::Kernel remapping_kernel; try { remapping_kernel = cl::Kernel( remap_program, "st_Remap_particles_beam_elements_buffers_opencl" ); } catch( cl::Error const& e ) { std::cout << "kernel remap_kernel :: " << "line = " << __LINE__ << " :: " << "ERROR : " << e.what() << "\r\n" << e.err() << std::endl; cl_ret = CL_FALSE; throw; } size_t remap_work_group_size = remapping_kernel.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( device ); size_t const remap_work_group_size_prefered_multiple = remapping_kernel.getWorkGroupInfo< CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE >( device ); size_t remap_num_threads = remap_work_group_size_prefered_multiple; size_t remap_group_size = remap_work_group_size_prefered_multiple; /* ------------------------------------------------------------- */ cl::Kernel tracking_kernel; try { tracking_kernel = cl::Kernel( tracking_program, "st_Track_particles_beam_elements_priv_particles_optimized_opencl" ); } 
catch( cl::Error const& e ) { std::cout << "kernel tracking_kernel :: " << "line = " << __LINE__ << " :: ERROR : " << e.what() << std::endl << e.err() << std::endl; cl_ret = CL_FALSE; throw; } size_t track_work_group_size = tracking_kernel.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( device ); size_t const track_work_group_size_prefered_multiple = tracking_kernel.getWorkGroupInfo< CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE >( device ); now = ::st_Time_get_seconds_since_epoch(); double const time_cl_program_compile = ( now >= begin_time ) ? ( now - begin_time ) : double{ 0.0 }; /* ============================================================= */ for( auto const NUM_PARTICLES : num_particles_list ) { size_t tracking_num_threads = size_t{ 0 }; size_t tracking_group_size = track_work_group_size; tracking_num_threads = NUM_PARTICLES / track_work_group_size; tracking_num_threads *= track_work_group_size; if( tracking_num_threads < NUM_PARTICLES ) { tracking_num_threads += track_work_group_size; } std::cout << "INFO :: num_particles : " << NUM_PARTICLES << "\r\n" << "INFO :: remap kernel wg size : " << remap_work_group_size << "\r\n" << "INFO :: remap kernel wg size multi : " << remap_work_group_size_prefered_multiple << "\r\n" << "INFO :: remap kernel launch with : " << remap_num_threads << " threads \r\n" << "INFO :: remap_kernel local size : " << remap_group_size << " threads \r\n\r\n" << "INFO :: num_turns : " << NUM_TURNS << "\r\n" << "INFO :: tracking kernel wg size : " << track_work_group_size << "\r\n" << "INFO :: tracking kernel wg size multi : " << track_work_group_size_prefered_multiple << "\r\n" << "INFO :: tracking kernel launch with : " << tracking_num_threads << " threads\r\n" << "INFO :: tracking kernel local size : " << tracking_group_size << " threads\r\n" << std::endl; begin_time = ::st_Time_get_seconds_since_epoch(); int success = ::st_Buffer_reset( particles_buffer ); SIXTRL_ASSERT( success == 0 ); particles_t* particles = ::st_Particles_new( 
particles_buffer, NUM_PARTICLES ); for( size_t ii = size_t{ 0 } ; ii < NUM_PARTICLES ; ++ii ) { size_t jj = ii % lhc_num_particles; ::st_Particles_copy_single( particles, ii, lhc_particles, jj ); } now = ::st_Time_get_seconds_since_epoch(); double const time_setup_particle_buffer = ( now >= begin_time ) ? ( now - begin_time ) : double{ 0.0 }; /* ========================================================= */ std::vector< cl::Event > write_xfer_events( 3u, cl::Event{} ); cl_ulong write_xfer_when_queued[] = { 0, 0, 0 }; cl_ulong write_xfer_when_submitted[] = { 0, 0, 0 }; cl_ulong write_xfer_when_started[] = { 0, 0, 0 }; cl_ulong write_xfer_when_ended[] = { 0, 0, 0 }; begin_time = ::st_Time_get_seconds_since_epoch(); int32_t success_flag = int32_t{ 0 }; cl::Buffer cl_particles( context, CL_MEM_READ_WRITE, ::st_Buffer_get_size( lhc_particles_buffer ) ); cl::Buffer cl_beam_elements( context, CL_MEM_READ_WRITE, ::st_Buffer_get_size( lhc_beam_elements ) ); cl::Buffer cl_success_flag( context, CL_MEM_READ_WRITE, sizeof( success_flag ) ); try { cl_ret = queue.enqueueWriteBuffer( cl_particles, CL_TRUE, 0, ::st_Buffer_get_size( particles_buffer ), ::st_Buffer_get_const_data_begin( particles_buffer ), nullptr, &write_xfer_events[ 0 ] ); cl_ret |= write_xfer_events[ 0 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_QUEUED, &write_xfer_when_queued[ 0 ] ); cl_ret |= write_xfer_events[ 0 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_SUBMIT, &write_xfer_when_submitted[ 0 ] ); cl_ret |= write_xfer_events[ 0 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_START, &write_xfer_when_started[ 0 ] ); cl_ret |= write_xfer_events[ 0 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_END, &write_xfer_when_ended[ 0 ] ); } catch( cl::Error const& e ) { std::cout << "enqueueWriteBuffer( particles_buffer ) :: " << "line = " << __LINE__ << " :: ERROR : " << e.what() << std::endl << e.err() << std::endl; cl_ret = CL_FALSE; throw; } SIXTRL_ASSERT( cl_ret == CL_SUCCESS ); try { cl_ret 
= queue.enqueueWriteBuffer( cl_beam_elements, CL_TRUE, 0, ::st_Buffer_get_size( lhc_beam_elements ), ::st_Buffer_get_const_data_begin( lhc_beam_elements ), nullptr, &write_xfer_events[ 1 ] ); cl_ret |= write_xfer_events[ 1 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_QUEUED, &write_xfer_when_queued[ 1 ] ); cl_ret |= write_xfer_events[ 1 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_SUBMIT, &write_xfer_when_submitted[ 1 ] ); cl_ret |= write_xfer_events[ 1 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_START, &write_xfer_when_started[ 1 ] ); cl_ret |= write_xfer_events[ 1 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_END, &write_xfer_when_ended[ 1 ] ); } catch( cl::Error const& e ) { std::cout << "enqueueWriteBuffer( beam_elements ) :: " << "line = " << __LINE__ << " :: ERROR : " << e.what() << std::endl << e.err() << std::endl; cl_ret = CL_FALSE; throw; } success_flag = int32_t{ 0 }; try { cl_ret = queue.enqueueWriteBuffer( cl_success_flag, CL_TRUE, 0, sizeof( success_flag ), &success_flag, nullptr, &write_xfer_events[ 2 ] ); cl_ret |= write_xfer_events[ 2 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_QUEUED, &write_xfer_when_queued[ 2 ] ); cl_ret |= write_xfer_events[ 2 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_SUBMIT, &write_xfer_when_submitted[ 2 ] ); cl_ret |= write_xfer_events[ 2 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_START, &write_xfer_when_started[ 2 ] ); cl_ret |= write_xfer_events[ 2 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_END, &write_xfer_when_ended[ 2 ] ); } catch( cl::Error const& e ) { std::cout << "enqueueWriteBuffer( success_flag ) :: " << "line = " << __LINE__ << " :: ERROR : " << e.what() << std::endl << e.err() << std::endl; cl_ret = CL_FALSE; throw; } now = ::st_Time_get_seconds_since_epoch(); double const time_write_xfer = ( now >= begin_time ) ? 
( now - begin_time ) : double{ 0 }; /* ======================================================== */ cl::Event run_remap_kernel_event; cl_ulong run_remap_kernel_when_queued = cl_ulong{ 0 }; cl_ulong run_remap_kernel_when_submitted = cl_ulong{ 0 }; cl_ulong run_remap_kernel_when_started = cl_ulong{ 0 }; cl_ulong run_remap_kernel_when_ended = cl_ulong{ 0 }; begin_time = ::st_Time_get_seconds_since_epoch(); remapping_kernel.setArg( 0, cl_particles ); remapping_kernel.setArg( 1, cl_beam_elements ); remapping_kernel.setArg( 2, cl_success_flag ); try { cl_ret = queue.enqueueNDRangeKernel( remapping_kernel, cl::NullRange, cl::NDRange( remap_num_threads ), cl::NDRange( remap_group_size ), nullptr, &run_remap_kernel_event ); } catch( cl::Error const& e ) { std::cout << "enqueueNDRangeKernel( remapping_kernel ) :: " << "line = " << __LINE__ << " :: ERROR : " << e.what() << std::endl << e.err() << std::endl; cl_ret = CL_FALSE; throw; } SIXTRL_ASSERT( cl_ret == CL_SUCCESS ); queue.flush(); run_remap_kernel_event.wait(); cl_ret = run_remap_kernel_event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_QUEUED, &run_remap_kernel_when_queued ); cl_ret |= run_remap_kernel_event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_SUBMIT, &run_remap_kernel_when_submitted ); cl_ret |= run_remap_kernel_event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_START, &run_remap_kernel_when_started ); cl_ret |= run_remap_kernel_event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_END, &run_remap_kernel_when_ended ); now = ::st_Time_get_seconds_since_epoch(); double const time_run_remapping_kernel = ( now >= begin_time ) ? 
( now - begin_time ) : double{ 0 }; /* ========================================================= */ cl::Event xfer_after_remap_events; cl_ulong xfer_after_remap_when_queued = cl_ulong{ 0 }; cl_ulong xfer_after_remap_when_submitted = cl_ulong{ 0 }; cl_ulong xfer_after_remap_when_started = cl_ulong{ 0 }; cl_ulong xfer_after_remap_when_ended = cl_ulong{ 0 }; begin_time = ::st_Time_get_seconds_since_epoch(); try { cl_ret = queue.enqueueReadBuffer( cl_success_flag, CL_TRUE, 0, sizeof( success_flag ), &success_flag, nullptr, &xfer_after_remap_events ); cl_ret = xfer_after_remap_events.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_QUEUED, &xfer_after_remap_when_queued ); cl_ret |= xfer_after_remap_events.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_SUBMIT, &xfer_after_remap_when_submitted ); cl_ret |= xfer_after_remap_events.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_START, &xfer_after_remap_when_started ); cl_ret |= xfer_after_remap_events.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_END, &xfer_after_remap_when_ended ); } catch( cl::Error const& e ) { std::cout << "enqueueReadBuffer( success_flag ) :: " << "line = " << __LINE__ << " :: ERROR : " << e.what() << std::endl << e.err() << std::endl; cl_ret = CL_FALSE; throw; } now = ::st_Time_get_seconds_since_epoch(); SIXTRL_ASSERT( cl_ret == CL_SUCCESS ); SIXTRL_ASSERT( success_flag == int32_t{ 0 } ); double const time_xfer_after_remap = ( now >= begin_time ) ? 
( now - begin_time ) : double{ 0 }; /* ========================================================== */ uint64_t const turns = NUM_TURNS; cl::Event run_tracking_kernel_event; cl_ulong run_tracking_kernel_when_queued = cl_ulong{ 0 }; cl_ulong run_tracking_kernel_when_submitted = cl_ulong{ 0 }; cl_ulong run_tracking_kernel_when_started = cl_ulong{ 0 }; cl_ulong run_tracking_kernel_when_ended = cl_ulong{ 0 }; begin_time = ::st_Time_get_seconds_since_epoch(); tracking_kernel.setArg( 0, cl_particles ); tracking_kernel.setArg( 1, cl_beam_elements ); tracking_kernel.setArg( 2, turns ); tracking_kernel.setArg( 3, cl_success_flag ); try { cl_ret = queue.enqueueNDRangeKernel( tracking_kernel, cl::NullRange, cl::NDRange( tracking_num_threads ), cl::NDRange( tracking_group_size ), nullptr, &run_tracking_kernel_event ); } catch( cl::Error const& e ) { std::cout << "enqueueNDRangeKernel( remapping_kernel ) :: " << "line = " << __LINE__ << " :: ERROR : " << e.what() << std::endl << e.err() << std::endl; cl_ret = CL_FALSE; throw; } cl_ret = queue.flush(); run_tracking_kernel_event.wait(); cl_ret |= run_tracking_kernel_event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_QUEUED, &run_tracking_kernel_when_queued ); cl_ret |= run_tracking_kernel_event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_SUBMIT, &run_tracking_kernel_when_submitted ); cl_ret |= run_tracking_kernel_event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_START, &run_tracking_kernel_when_started ); cl_ret |= run_tracking_kernel_event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_END, &run_tracking_kernel_when_ended ); now = ::st_Time_get_seconds_since_epoch(); double const time_run_tracking_kernel = ( now >= begin_time ) ? 
( now - begin_time ) : double{ 0 }; double const time_tracking_until_submitted = static_cast< double >( run_tracking_kernel_when_submitted - run_tracking_kernel_when_queued ) * 1e-9; double const time_tracking_until_start = static_cast< double >( run_tracking_kernel_when_started - run_tracking_kernel_when_submitted ) * 1e-9; double const time_tracking_device_execution = static_cast< double >( run_tracking_kernel_when_ended - run_tracking_kernel_when_started ) * 1e-9; /* ========================================================== */ std::vector< cl::Event > xfer_after_tracking_events( 2u, cl::Event{} ); cl_ulong xfer_after_tracking_when_queued[] = { 0, 0 }; cl_ulong xfer_after_tracking_when_submitted[] = { 0, 0 }; cl_ulong xfer_after_tracking_when_started[] = { 0, 0 }; cl_ulong xfer_after_tracking_when_ended[] = { 0, 0 }; begin_time = ::st_Time_get_seconds_since_epoch(); try { cl_ret = queue.enqueueReadBuffer( cl_particles, CL_TRUE, 0, ::st_Buffer_get_size( particles_buffer ), ::st_Buffer_get_data_begin( particles_buffer ), nullptr, &xfer_after_tracking_events[ 0 ] ); cl_ret |= xfer_after_tracking_events[ 0 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_QUEUED, &xfer_after_tracking_when_queued[ 0 ] ); cl_ret |= xfer_after_tracking_events[ 0 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_SUBMIT, &xfer_after_tracking_when_submitted[ 0 ] ); cl_ret |= xfer_after_tracking_events[ 0 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_START, &xfer_after_tracking_when_started[ 0 ] ); cl_ret |= xfer_after_tracking_events[ 0 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_END, &xfer_after_tracking_when_ended[ 0 ] ); } catch( cl::Error const& e ) { std::cout << "enqueueReadBuffer( success_flag ) :: " << "line = " << __LINE__ << " :: ERROR : " << e.what() << std::endl << e.err() << std::endl; cl_ret = CL_FALSE; throw; } SIXTRL_ASSERT( cl_ret == CL_SUCCESS ); try { cl_ret = queue.enqueueReadBuffer( cl_success_flag, CL_TRUE, 0, sizeof( success_flag ), 
&success_flag, nullptr, &xfer_after_tracking_events[ 1 ] ); cl_ret |= xfer_after_tracking_events[ 1 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_QUEUED, &xfer_after_tracking_when_queued[ 1 ] ); cl_ret |= xfer_after_tracking_events[ 1 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_SUBMIT, &xfer_after_tracking_when_submitted[ 1 ] ); cl_ret |= xfer_after_tracking_events[ 1 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_START, &xfer_after_tracking_when_started[ 1 ] ); cl_ret |= xfer_after_tracking_events[ 1 ].getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_END, &xfer_after_tracking_when_ended[ 1 ] ); } catch( cl::Error const& e ) { std::cout << "enqueueReadBuffer( success_flag ) :: " << "line = " << __LINE__ << " :: ERROR : " << e.what() << std::endl << e.err() << std::endl; cl_ret = CL_FALSE; throw; } now = ::st_Time_get_seconds_since_epoch(); SIXTRL_ASSERT( cl_ret == CL_SUCCESS ); SIXTRL_ASSERT( success_flag == int32_t{ 0 } ); double const time_xfer_after_tracking = ( now >= begin_time ) ? 
( now - begin_time ) : double{ 0 }; /* ======================================================== */ a2str.str( "" ); double time_run_tracking_normalized = time_run_tracking_kernel / static_cast< double >( NUM_TURNS * NUM_PARTICLES ); if( time_run_tracking_normalized >= 0.1 ) { a2str << "sec"; } std::cout << std::endl << "Reslts: \r\n" << "------------------------------------------------" << "------------------------------------------------" << "--------------------------------------------\r\n" << " :: Tracking time : " << std::setw( 20 ) << std::fixed << time_run_tracking_kernel << " [sec] \r\n" << " :: Tracking time/particle/turn : "; if( time_run_tracking_normalized >= 200e-3 ) { std::cout << std::setw( 20 ) << std::fixed << time_run_tracking_normalized << "[sec]\r\n"; } else if( time_run_tracking_normalized >= 200e-6 ) { std::cout << std::setw( 20 ) << std::fixed << time_run_tracking_normalized * 1e3 << "[millisec]\r\n"; } else { std::cout << std::setw( 20 ) << std::fixed << time_run_tracking_normalized * 1e6 << "[usec]\r\n"; } std::cout << " :: device_run_time : " << std::setw( 20 ) << std::fixed << time_tracking_device_execution << "\r\n" << " :: device overhead : " << std::setw( 20 ) << std::fixed << time_tracking_until_start << " + " << time_tracking_until_submitted << "\r\n" << "------------------------------------------------" << "------------------------------------------------" << "--------------------------------------------\r\n" << "\r\n" << std::endl; } int success = ::st_Buffer_remap( particles_buffer ); SIXTRL_ASSERT( success == 0 ); ::st_Buffer_reset( particles_buffer ); } } ::st_Buffer_delete( lhc_particles_buffer ); ::st_Buffer_delete( lhc_beam_elements ); ::st_Buffer_delete( particles_buffer ); return 0; }
static void customcl_setup( std::string cl_program = "blocking-2-v4", std::string arithmetic = "float") { err = 0; // build options for opencl. std::string cl_build_options = "-DT=" + arithmetic + " -DT4=" + arithmetic + "4" + " -DT8=" + arithmetic + "8" + " -DT16=" + arithmetic + "16" + " " + (arithmetic == "double" ? " -DSAMPLE_NEEDS_DOUBLE" : "") + " " + (arithmetic == "half" ? " -DSAMPLE_NEEDS_HALF" : ""); // clkernel name. std::string clkernel_path = "clkernel/"; caffe::cl_program = cl_program; if(cl_program == "blocking-2-v4") { clkernel_path += "gemm-blocking-2x2-vload4.cl"; }else if(cl_program == "blocking-4-v4") { clkernel_path += "gemm-blocking-4x4-vload4.cl"; }else if(cl_program == "noblock-v8") { clkernel_path += "gemm-noblock-vload8.cl"; } std::ifstream kernel_file(clkernel_path); std::string kernel_str((std::istreambuf_iterator<char>(kernel_file)), std::istreambuf_iterator<char>()); viennacl::ocl::current_context().build_options( "-DT=" + arithmetic + " -DT4=" + arithmetic + "4" + " -DT8=" + arithmetic + "8" + " -DT16=" + arithmetic + "16" + " " + (arithmetic == "double" ? " -DSAMPLE_NEEDS_DOUBLE" : "") + " " + (arithmetic == "half" ? 
" -DSAMPLE_NEEDS_HALF" : "")); cl_prog = viennacl::ocl::get_context(0).add_program( kernel_str, "gemm_program"); gemm_exec = cl_prog.get_kernel("gemm"); transpose_exec = cl_prog.get_kernel("transpose"); copy_exec = cl_prog.get_kernel("copy"); transpose_ptr = (void*)clCreateBuffer( viennacl::ocl::current_context().handle().get(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, TRANSPOSE_BUFFER_DIM * TRANSPOSE_BUFFER_DIM * 8, host_trans_buffer, &err ); SAMPLE_CHECK_ERRORS(err); copy_ptr = (void*)clCreateBuffer( viennacl::ocl::current_context().handle().get(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, TRANSPOSE_BUFFER_DIM * TRANSPOSE_BUFFER_DIM * 8, host_copy_buffer, &err ); SAMPLE_CHECK_ERRORS(err); result_ptr = (void*)clCreateBuffer( viennacl::ocl::current_context().handle().get(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, TRANSPOSE_BUFFER_DIM * TRANSPOSE_BUFFER_DIM * 8, host_result_buffer, &err ); SAMPLE_CHECK_ERRORS(err); }
int main(int argc, char** argv) { if(argc < 3) { std::cerr << "Usage: " << argv[0] << " < #particles > < #turns > [deviceIdx]" << std::endl; exit(1); } int NUM_REPETITIONS = 10; double num_of_turns_drift = 0.0; // for timing double num_of_turns_drift_exact = 0.0; // for timing double num_of_turns_cavity = 0.0; // for timing double num_of_turns_align = 0.0; // for timing double average_execution_time_drift = 0.0; double average_execution_time_drift_exact = 0.0; double average_execution_time_cavity = 0.0; double average_execution_time_align = 0.0; std::vector<double> exec_time_drift; std::vector<double> exec_time_drift_exact; std::vector<double> exec_time_cavity; std::vector<double> exec_time_align; int choice = 1; for(int ll = 0; ll < NUM_REPETITIONS; ++ll) { /* We will use 9+ beam element blocks in this example and do not * care to be memory efficient yet; thus we make the blocks for * beam elements and particles big enough to avoid running into problems */ constexpr st_block_size_t const MAX_NUM_BEAM_ELEMENTS = 1000u; // 20u; constexpr st_block_size_t const NUM_OF_BEAM_ELEMENTS = 1000u; //9u; /* 1MByte is plenty of space */ constexpr st_block_size_t const BEAM_ELEMENTS_DATA_CAPACITY = 1048576u; /* Prepare and init the beam elements buffer */ st_Blocks beam_elements; st_Blocks_preset( &beam_elements ); int ret = st_Blocks_init( &beam_elements, MAX_NUM_BEAM_ELEMENTS, BEAM_ELEMENTS_DATA_CAPACITY ); assert( ret == 0 ); /* if there was an error, ret would be != 0 */ /* Add NUM_OF_BEAM_ELEMENTS drifts to the buffer. For this example, let's * just have one simple constant length for all of them: */ // One-fourth of the beam-elements are drift-elements for( st_block_size_t ii = 0 ; ii < NUM_OF_BEAM_ELEMENTS/4 ; ++ii ) { double const drift_length = double{ 0.2L }; st_Drift* drift = st_Blocks_add_drift( &beam_elements, drift_length ); (void)drift; // using the variable with a no-op assert( drift != nullptr ); /* Otherwise, there was a problem! 
*/ } /* Check if we *really* have the correct number of beam elements and * if they really are all drifts */ assert( st_Blocks_get_num_of_blocks( &beam_elements ) == NUM_OF_BEAM_ELEMENTS/4 ); /* The beam_elements container is currently not serialized yet -> * we could still add blocks to the buffer. Let's jus do this and * add a different kind of beam element to keep it easier apart! */ for( st_block_size_t ii = NUM_OF_BEAM_ELEMENTS/4 ; ii < NUM_OF_BEAM_ELEMENTS/2 ; ++ii ) { double const drift_length = double{ 0.1L }; st_DriftExact* drift_exact = st_Blocks_add_drift_exact( &beam_elements, drift_length ); (void) drift_exact; assert( drift_exact != nullptr ); } assert( st_Blocks_get_num_of_blocks( &beam_elements ) == ( NUM_OF_BEAM_ELEMENTS*0.5) ); /* Adding the beam element 'cavity' */ for( st_block_size_t ii = NUM_OF_BEAM_ELEMENTS*0.5 ; ii < NUM_OF_BEAM_ELEMENTS*0.75 ; ++ii ) { double const voltage = double{ 1e4}; double const frequency = double{ 40}; double const lag = double{ 0.01L}; st_Cavity* cavity = st_Blocks_add_cavity( &beam_elements, voltage, frequency, lag); (void) cavity; // a no-op assert( cavity != nullptr ); /* Otherwise, there was a problem! */ } assert( st_Blocks_get_num_of_blocks( &beam_elements ) == ( NUM_OF_BEAM_ELEMENTS * 0.75) ); /* Adding the beam element 'align' */ double const M__PI = // note the two underscores between M and PI ( double )3.1415926535897932384626433832795028841971693993751L; for( st_block_size_t ii = NUM_OF_BEAM_ELEMENTS*0.75 ; ii < NUM_OF_BEAM_ELEMENTS ; ++ii ) { double const tilt = double{ 0.5}; double const z = double{ M__PI / 45}; double const dx = double{ 0.2L}; double const dy = double{ 0.2L}; st_Align* align = st_Blocks_add_align( &beam_elements, tilt, cos( z ), sin( z ), dx, dy); (void) align; // a no-op assert( align != nullptr ); /* Otherwise, there was a problem! 
*/ } assert( st_Blocks_get_num_of_blocks( &beam_elements ) == ( NUM_OF_BEAM_ELEMENTS) ); /* Always safely terminate pointer variables pointing to resources they * do not own which we no longer need -> just a good practice */ // drift_exact = nullptr; /* After serialization, the "structure" of the beam_elements buffer is * frozen, but the data in the elements - i.e. the length of the * individual drifts in our example - can still be modified. We will * just not be able to add further blocks to the container */ assert( !st_Blocks_are_serialized( &beam_elements ) ); ret = st_Blocks_serialize( &beam_elements ); assert( ret == 0 ); assert( st_Blocks_are_serialized( &beam_elements ) ); // serialization on CPU done. /* Next, let's iterate over all the beam_elements in the buffer and * print out the properties -> we expect that NUM_OF_BEAM_ELEMENTS * st_Drift with the same length appear and one st_DriftExact with a * different length should appear in the end */ std::cout.flush(); /************************** Preparing grounds for OpenCL *******/ std::vector<cl::Platform> platform; cl::Platform::get(&platform); if( platform.empty() ) { std::cerr << "OpenCL platforms not found." 
<< std::endl; return 1; } std::vector< cl::Device > devices; for( auto const& p : platform ) { std::vector< cl::Device > temp_devices; p.getDevices( CL_DEVICE_TYPE_ALL, &temp_devices ); for( auto const& d : temp_devices ) { if( !d.getInfo< CL_DEVICE_AVAILABLE >() ) continue; devices.push_back( d ); } } cl::Device* ptr_selected_device = nullptr; if( !devices.empty() ) { if( argc >= 4 ) { std::size_t const device_idx = std::atoi( argv[ 3 ] ); if( device_idx < devices.size() ) { ptr_selected_device = &devices[ device_idx ]; } } if( ptr_selected_device == nullptr ) { std::cout << "default selecting device #0" << std::endl; ptr_selected_device = &devices[ 0 ]; } } if( ptr_selected_device != nullptr ) { std::cout << "device: " << ptr_selected_device->getInfo< CL_DEVICE_NAME >() << std::endl; } else return 0; cl::Context context( *ptr_selected_device ); // std::cout << "Device list" << std::endl; // for(unsigned int jj=0; jj<devices.size(); jj++){ // std::cout << "Name of devicei " << jj<<" : "<<devices[jj].getInfo<CL_DEVICE_NAME>() << std::endl; // std::cout << "resolution of device timer for device " << jj <<" : "<<devices[jj].getInfo<CL_DEVICE_PROFILING_TIMER_RESOLUTION>() << std::endl; // }; /**********************************************/ ///////////////////////////////////////////////////////////////////////////////////////////////////////////// // getting the kernel file std::string PATH_TO_KERNEL_FILE( st_PATH_TO_BASE_DIR ); PATH_TO_KERNEL_FILE += "tests/benchmark/sixtracklib/opencl/"; PATH_TO_KERNEL_FILE += "kernels_beam_elements_oneatatime.cl"; std::string kernel_source( "" ); std::ifstream kernel_file( PATH_TO_KERNEL_FILE.c_str(), std::ios::in | std::ios::binary ); if( kernel_file.is_open() ) { std::istreambuf_iterator< char > file_begin( kernel_file.rdbuf() ); std::istreambuf_iterator< char > end_of_file; kernel_source.assign( file_begin, end_of_file ); kernel_file.close(); } 
//////////////////////////////////////////////////////////////////////////////////////////////////////////// assert( ptr_selected_device != nullptr ); // int ndev = 0; // specifying the id of the device to be used cl::CommandQueue queue(context, *ptr_selected_device,CL_QUEUE_PROFILING_ENABLE); // Compile OpenCL program for found devices. cl:: Program program(context, kernel_source); //string kernel_source contains the kernel(s) read from the file #if 0 /////////////////////// Alternative 1 for including the kernels written in a separate file -- works perfectly fine ///////////////////////////////// cl:: Program program(context, "#include \"../kernels.cl\" ", false); // the path inside the #include should be relative to an include directory specified using -Ipath/to/dir specified via build options.. otherwise give the absolute path. #endif #if 0 /////////////////////// The way to go if the string source[] contains the source in the same file as this. // cl::Program program(context, cl::Program::Sources( // 1, std::make_pair(source, strlen(source)) // )); #endif try { std::string incls = "-D_GPUCODE=1 -D__NAMESPACE=st_ -I" + std::string(NS(PATH_TO_BASE_DIR)) ; // std::cout << "Path = " << incls << std::endl; //program.build(devices, "-D_GPUCODE=1 -D__NAMESPACE=st_ -I/home/sosingh/sixtracklib_gsoc18/initial_test/sixtrack-v0/external/include"); program.build( incls.c_str() ); } catch (const cl::Error&) { std::cerr << "OpenCL compilation error" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(*ptr_selected_device) << std::endl; throw; } cl::Buffer B(context, CL_MEM_READ_WRITE, st_Blocks_get_total_num_bytes( &beam_elements )); // input vector queue.enqueueWriteBuffer( B, CL_TRUE, 0, st_Blocks_get_total_num_bytes( &beam_elements ), st_Blocks_get_const_data_begin( &beam_elements ) ); ////////////////////////// Particles //////////////////////////////// st_block_size_t const NUM_PARTICLE_BLOCKS = 1u; st_block_size_t const PARTICLES_DATA_CAPACITY = 1048576u*1000*4; 
// ~(4 GB) st_block_size_t const NUM_PARTICLES = atoi(argv[1]); // 100u; st_Blocks particles_buffer; st_Blocks_preset( &particles_buffer ); ret = st_Blocks_init( &particles_buffer, NUM_PARTICLE_BLOCKS, PARTICLES_DATA_CAPACITY ); assert( ret == 0 ); st_Particles* particles = st_Blocks_add_particles( &particles_buffer, NUM_PARTICLES ); if( particles != nullptr ) { /* Just some random values assigned to the individual attributes * of the acutal particles -> these values do not make any * sense physically, but should be safe for calculating maps -> * please check with the map for drift whether they do not produce * some NaN's at the sqrt or divisions by 0 though!*/ std::mt19937_64 prng( 20180622 ); std::uniform_real_distribution<> x_distribution( 0.05, 1.0 ); std::uniform_real_distribution<> y_distribution( 0.05, 1.0 ); std::uniform_real_distribution<> px_distribution( 0.05, 0.2 ); std::uniform_real_distribution<> py_distribution( 0.05, 0.2 ); std::uniform_real_distribution<> sigma_distribution( 0.01, 0.5 ); assert( particles->s != nullptr ); assert( particles->x != nullptr ); assert( particles->y != nullptr ); assert( particles->px != nullptr ); assert( particles->py != nullptr ); assert( particles->sigma != nullptr ); assert( particles->rpp != nullptr ); assert( particles->rvv != nullptr ); assert( particles->num_of_particles == (int)NUM_PARTICLES ); for( st_block_size_t ii = 0 ; ii < NUM_PARTICLES ; ++ii ) { particles->s[ ii ] = 0.0; particles->x[ ii ] = x_distribution( prng ); particles->y[ ii ] = y_distribution( prng ); particles->px[ ii ] = px_distribution( prng ); particles->py[ ii ] = py_distribution( prng ); particles->sigma[ ii ] = sigma_distribution( prng ); particles->rpp[ ii ] = 1.0; particles->rvv[ ii ] = 1.0; } } ret = st_Blocks_serialize( &particles_buffer ); assert( ret == 0 ); /* ===================================================================== */ /* Copy to other buffer to simulate working on the GPU */ //std::cout << "On the GPU:\n"; // Allocate 
device buffers and transfer input data to device. cl::Buffer C(context, CL_MEM_READ_WRITE, st_Blocks_get_total_num_bytes( &particles_buffer )); // input vector queue.enqueueWriteBuffer( C, CL_TRUE, 0, st_Blocks_get_total_num_bytes( &particles_buffer ), st_Blocks_get_const_data_begin( &particles_buffer ) ); int numThreads = 1; int blockSize = 1; cl::Kernel unserialize(program, "unserialize"); unserialize.setArg(0,B); unserialize.setArg(1,C); unserialize.setArg(2,NUM_PARTICLES); queue.enqueueNDRangeKernel( unserialize, cl::NullRange, cl::NDRange( numThreads ), cl::NDRange(blockSize )); queue.flush(); queue.finish(); // creating a buffer to transfer the data from GPU to CPU std::vector< uint8_t > copy_particles_buffer_host(st_Blocks_get_total_num_bytes( &particles_buffer )/sizeof(uint8_t)); // output vector queue.enqueueReadBuffer(C, CL_TRUE, 0, copy_particles_buffer_host.size() * sizeof(uint8_t), copy_particles_buffer_host.data()); queue.flush(); st_Blocks copy_particles_buffer; st_Blocks_preset( ©_particles_buffer ); ret = st_Blocks_unserialize( ©_particles_buffer, copy_particles_buffer_host.data() ); assert( ret == 0 ); SIXTRL_UINT64_T const NUM_TURNS = atoi(argv[2]);//100; SIXTRL_UINT64_T offset = 0; cl::Event event; switch (choice) { case 1 : { cl::Kernel track_drift_particle(program, "track_drift_particle"); blockSize = track_drift_particle.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( *ptr_selected_device);// determine the work-group size numThreads = ((NUM_PARTICLES+blockSize-1)/blockSize) * blockSize; // rounding off NUM_PARTICLES to the next nearest multiple of blockSize. 
This is to ensure that there are integer number of work-groups launched std::cout << blockSize << " " << numThreads<< std::endl; track_drift_particle.setArg(0,B); track_drift_particle.setArg(1,C); track_drift_particle.setArg(2,NUM_PARTICLES); track_drift_particle.setArg(3,NUM_TURNS); track_drift_particle.setArg(4,offset); queue.enqueueNDRangeKernel( track_drift_particle, cl::NullRange, cl::NDRange( numThreads ), cl::NDRange(blockSize ), nullptr, &event); queue.flush(); event.wait(); queue.finish(); cl_ulong when_kernel_queued = 0; cl_ulong when_kernel_submitted = 0; cl_ulong when_kernel_started = 0; cl_ulong when_kernel_ended = 0; ret = event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_QUEUED, &when_kernel_queued ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_SUBMIT, &when_kernel_submitted ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_START, &when_kernel_started ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_END, &when_kernel_ended ); assert( ret == CL_SUCCESS ); // all ret's should be 1 double const kernel_time_elapsed = when_kernel_ended - when_kernel_started; exec_time_drift.push_back(kernel_time_elapsed); if( ll > 5 ) { num_of_turns_drift += 1.0; average_execution_time_drift += (kernel_time_elapsed - average_execution_time_drift)/num_of_turns_drift; } // break; } case 2: { offset = 250; // cl::Event event; cl::Kernel track_drift_exact_particle(program, "track_drift_exact_particle"); blockSize = track_drift_exact_particle.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( *ptr_selected_device);// determine the work-group size numThreads = ((NUM_PARTICLES+blockSize-1)/blockSize) * blockSize; // rounding off NUM_PARTICLES to the next nearest multiple of blockSize. 
This is to ensure that there are integer number of work-groups launched std::cout << blockSize << " " << numThreads<< std::endl; track_drift_exact_particle.setArg(0,B); track_drift_exact_particle.setArg(1,C); track_drift_exact_particle.setArg(2,NUM_PARTICLES); track_drift_exact_particle.setArg(3,NUM_TURNS); track_drift_exact_particle.setArg(4,offset); queue.enqueueNDRangeKernel( track_drift_exact_particle, cl::NullRange, cl::NDRange( numThreads ), cl::NDRange(blockSize ), nullptr, &event); queue.flush(); event.wait(); queue.finish(); cl_ulong when_kernel_queued = 0; cl_ulong when_kernel_submitted = 0; cl_ulong when_kernel_started = 0; cl_ulong when_kernel_ended = 0; ret = event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_QUEUED, &when_kernel_queued ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_SUBMIT, &when_kernel_submitted ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_START, &when_kernel_started ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_END, &when_kernel_ended ); assert( ret == CL_SUCCESS ); // all ret's should be 1 double const kernel_time_elapsed = when_kernel_ended - when_kernel_started; exec_time_drift_exact.push_back(kernel_time_elapsed); if( ll > 5 ) { num_of_turns_drift_exact += 1.0; average_execution_time_drift_exact += (kernel_time_elapsed - average_execution_time_drift_exact)/num_of_turns_drift_exact; } //break; } case 3: { offset = 500; // cl::Event event; cl::Kernel track_cavity_particle(program, "track_cavity_particle"); blockSize = track_cavity_particle.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( *ptr_selected_device);// determine the work-group size numThreads = ((NUM_PARTICLES+blockSize-1)/blockSize) * blockSize; // rounding off NUM_PARTICLES to the next nearest multiple of blockSize. 
This is to ensure that there are integer number of work-groups launched std::cout << blockSize << " " << numThreads<< std::endl; track_cavity_particle.setArg(0,B); track_cavity_particle.setArg(1,C); track_cavity_particle.setArg(2,NUM_PARTICLES); track_cavity_particle.setArg(3,NUM_TURNS); track_cavity_particle.setArg(4,offset); queue.enqueueNDRangeKernel( track_cavity_particle, cl::NullRange, cl::NDRange( numThreads ), cl::NDRange(blockSize ), nullptr, &event); queue.flush(); event.wait(); queue.finish(); cl_ulong when_kernel_queued = 0; cl_ulong when_kernel_submitted = 0; cl_ulong when_kernel_started = 0; cl_ulong when_kernel_ended = 0; ret = event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_QUEUED, &when_kernel_queued ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_SUBMIT, &when_kernel_submitted ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_START, &when_kernel_started ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_END, &when_kernel_ended ); assert( ret == CL_SUCCESS ); // all ret's should be 1 double const kernel_time_elapsed = when_kernel_ended - when_kernel_started; exec_time_cavity.push_back(kernel_time_elapsed); if( ll > 5 ) { num_of_turns_cavity += 1.0; average_execution_time_cavity += (kernel_time_elapsed - average_execution_time_cavity)/num_of_turns_cavity; } // break; } case 4: { //cl::Event event; offset = 750; cl::Kernel track_align_particle(program, "track_align_particle"); blockSize = track_align_particle.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( *ptr_selected_device);// determine the work-group size numThreads = ((NUM_PARTICLES+blockSize-1)/blockSize) * blockSize; // rounding off NUM_PARTICLES to the next nearest multiple of blockSize. 
This is to ensure that there are integer number of work-groups launched std::cout << blockSize << " " << numThreads<< std::endl; track_align_particle.setArg(0,B); track_align_particle.setArg(1,C); track_align_particle.setArg(2,NUM_PARTICLES); track_align_particle.setArg(3,NUM_TURNS); track_align_particle.setArg(4,offset); queue.enqueueNDRangeKernel( track_align_particle, cl::NullRange, cl::NDRange( numThreads ), cl::NDRange(blockSize ), nullptr, &event); queue.flush(); event.wait(); queue.finish(); cl_ulong when_kernel_queued = 0; cl_ulong when_kernel_submitted = 0; cl_ulong when_kernel_started = 0; cl_ulong when_kernel_ended = 0; ret = event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_QUEUED, &when_kernel_queued ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_SUBMIT, &when_kernel_submitted ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_START, &when_kernel_started ); ret |= event.getProfilingInfo< cl_ulong >( CL_PROFILING_COMMAND_END, &when_kernel_ended ); assert( ret == CL_SUCCESS ); // all ret's should be 1 double const kernel_time_elapsed = when_kernel_ended - when_kernel_started; exec_time_align.push_back(kernel_time_elapsed); if( ll > 5 ) { num_of_turns_align += 1.0; average_execution_time_align += (kernel_time_elapsed - average_execution_time_align)/num_of_turns_align; } // break; } }; // end of switch case queue.enqueueReadBuffer(C, CL_TRUE, 0, copy_particles_buffer_host.size() * sizeof(uint8_t), copy_particles_buffer_host.data()); queue.flush(); //st_Blocks copy_particles_buffer; st_Blocks_preset( ©_particles_buffer ); ret = st_Blocks_unserialize( ©_particles_buffer, copy_particles_buffer_host.data() ); assert( ret == 0 ); /* on the GPU, these pointers will have __global as a decorator */ #if 0 // On the CPU after copying the data back from the GPU std::cout << "\n On the Host, after applying the drift_track_particles mapping and copying from the GPU\n"; SIXTRL_GLOBAL_DEC st_BlockInfo const* itr = 
st_Blocks_get_const_block_infos_begin( ©_particles_buffer ); SIXTRL_GLOBAL_DEC st_BlockInfo const* endr = st_Blocks_get_const_block_infos_end( ©_particles_buffer ); for( ; itr != endr ; ++itr ) { SIXTRL_GLOBAL_DEC st_Particles const* particles = ( SIXTRL_GLOBAL_DEC st_Particles const* )itr->begin; std::cout.precision( 4 ); for( st_block_size_t ii = 0 ; ii < NUM_PARTICLES ; ++ii ) { std::cout << " ii = " << std::setw( 6 ) << ii << std::fixed << " | s = " << std::setw( 6 ) << particles->s[ ii ] << " | x = " << std::setw( 6 ) << particles->x[ ii ] << " | y = " << std::setw( 6 ) << particles->y[ ii ] << " | px = " << std::setw( 6 ) << particles->px[ ii ] << " | py = " << std::setw( 6 ) << particles->py[ ii ] << " | sigma = " << std::setw( 6 ) << particles->sigma[ ii ] << " | rpp = " << std::setw( 6 ) << particles->rpp[ ii ] << " | rvv = " << std::setw( 6 ) << particles->rvv[ ii ] << "\r\n"; } } #endif std::cout.flush(); st_Blocks_free( &particles_buffer ); st_Blocks_free( ©_particles_buffer ); } // end of the NUM_REPETITIONS 'for' loop switch(choice) { case 1: { // printing the contents of the exec_time vector std::cout << "track_drift_particle" << std::endl; for(std::vector<double>::iterator it = exec_time_drift.begin(); it != exec_time_drift.end(); ++it) printf("%.3f s%c",(*it)*1.0e-9, ",\n"[it+1 == exec_time_drift.end()]); printf("Reference Version : Time = %.3f s; \n",average_execution_time_drift*1.0e-9); //break; } case 2: { std::cout << "track_drift_exact_particle" << std::endl; for(std::vector<double>::iterator it = exec_time_drift_exact.begin(); it != exec_time_drift_exact.end(); ++it) printf("%.3f s%c",(*it)*1.0e-9, ",\n"[it+1 == exec_time_drift_exact.end()]); printf("Reference Version: Time = %.3f s; \n",average_execution_time_drift_exact*1.0e-9); //break; } case 3: { std::cout << "track_cavity_particle" << std::endl; for(std::vector<double>::iterator it = exec_time_cavity.begin(); it != exec_time_cavity.end(); ++it) printf("%.3f s%c",(*it)*1.0e-9, ",\n"[it+1 
== exec_time_cavity.end()]); printf("Reference Version: Time = %.3f s; \n",average_execution_time_cavity*1.0e-9); // break; } case 4: { std::cout << "track_align_particle" << std::endl; for(std::vector<double>::iterator it = exec_time_align.begin(); it != exec_time_align.end(); ++it) printf("%.3f s%c",(*it)*1.0e-9, ",\n"[it+1 == exec_time_align.end()]); printf("Reference Version: Time = %.3f s; \n",average_execution_time_align*1.0e-9); break; } }; return 0; }
void createKernelFile(const char * dirname) { //Step 1: Open kernel file std::string header_name(dirname); std::ofstream kernel_file(("@PROJECT_BINARY_DIR@/viennacl/linalg/kernels/" + header_name + "_kernels.h").c_str()); //Step 2: Write kernel header file preamble std::string dirname_uppercase(dirname); std::transform(dirname_uppercase.begin(), dirname_uppercase.end(), dirname_uppercase.begin(), toupper); kernel_file << "#ifndef VIENNACL_" << dirname_uppercase << "_KERNELS_HPP_" << std::endl; kernel_file << "#define VIENNACL_" << dirname_uppercase << "_KERNELS_HPP_" << std::endl; kernel_file << "#include \"viennacl/tools/tools.hpp\"" << std::endl; kernel_file << "#include \"viennacl/ocl/kernel.hpp\"" << std::endl; kernel_file << "#include \"viennacl/ocl/platform.hpp\"" << std::endl; kernel_file << "#include \"viennacl/ocl/utils.hpp\"" << std::endl; kernel_file << "#include \"viennacl/linalg/kernels/" << dirname << "_source.h\"" << std::endl; kernel_file << std::endl; kernel_file << "//Automatically generated file from aux-directory, do not edit manually!" << std::endl; kernel_file << "/** @file " << header_name << "_kernels.h" << std::endl; kernel_file << " * @brief OpenCL kernel file, generated automatically from scripts in auxiliary/. 
*/" << std::endl; kernel_file << "namespace viennacl" << std::endl; kernel_file << "{" << std::endl; kernel_file << " namespace linalg" << std::endl; kernel_file << " {" << std::endl; kernel_file << " namespace kernels" << std::endl; kernel_file << " {" << std::endl; //Step 3: Write class information: kernel_file << " template<class TYPE, unsigned int alignment>" << std::endl; kernel_file << " struct " << dirname << ";" << std::endl << std::endl; //Step 4: Write single precision kernels std::string dir(dirname); kernel_file << std::endl << " /////////////// single precision kernels //////////////// " << std::endl; fs::path filepath = fs::system_complete( fs::path( dir ) ); if ( fs::is_directory( filepath ) ) { //std::cout << "\nIn directory: " << filepath.directory_string() << std::endl; fs::directory_iterator end_iter; //write and register single precision sources: for ( fs::directory_iterator alignment_itr( filepath ); alignment_itr != end_iter; ++alignment_itr ) { if (fs::is_directory( alignment_itr->path() )) { #ifdef USE_OLD_BOOST_FILESYSTEM_VERSION std::string subfolder = alignment_itr->path().filename(); #else std::string subfolder = alignment_itr->path().filename().string(); #endif if( subfolder.find("align") == std::string::npos ) continue; writeKernelInit(kernel_file, dirname, subfolder, true); } //if is_directory } //for alignment_iterator kernel_file << std::endl; } //if is_directory else std::cerr << "Cannot access directory " << dirname << std::endl; //Step 5: Write double precision kernels kernel_file << std::endl << " /////////////// double precision kernels //////////////// " << std::endl; filepath = fs::system_complete( fs::path( dir ) ); if ( fs::is_directory( filepath ) ) { //std::cout << "\nIn directory: " << filepath.directory_string() << std::endl; fs::directory_iterator end_iter; //write and register single precision sources: for ( fs::directory_iterator alignment_itr( filepath ); alignment_itr != end_iter; ++alignment_itr ) { if 
(fs::is_directory( alignment_itr->path() )) { #ifdef USE_OLD_BOOST_FILESYSTEM_VERSION std::string subfolder = alignment_itr->path().filename(); #else std::string subfolder = alignment_itr->path().filename().string(); #endif if( subfolder.find("align") == std::string::npos ) continue; writeKernelInit(kernel_file, dirname, subfolder, false); } //if is_directory } //for alignment_iterator kernel_file << std::endl; } //if is_directory else std::cerr << "Cannot access directory " << dirname << std::endl; //Final Step: Write file tail: kernel_file << " } //namespace kernels" << std::endl; kernel_file << " } //namespace linalg" << std::endl; kernel_file << "} //namespace viennacl" << std::endl; kernel_file << "#endif" << std::endl; kernel_file << std::endl; kernel_file.close(); }