コード例 #1
0
ファイル: Init.cpp プロジェクト: cnndabbler/cn24
/**
 * @brief Reads an OpenCL kernel source file and builds it for `device`.
 *
 * The file is first looked up at the executable path reported by
 * System::GetExecutablePath() with `file_name` appended; if that cannot be
 * opened, the parent folder ("../") is tried instead.
 *
 * @param file_name Kernel source file name, appended to the executable path.
 * @return The built program handle. Every failure (file cannot be opened,
 *         program cannot be created, build fails) is reported via FATAL.
 */
cl_program CLHelper::CreateProgram ( const char* file_name ) {
  cl_int error = CL_SUCCESS;
  cl_program program = 0;

  LOGDEBUG << "Compiling " << file_name;

  std::string binary_path;
  System::GetExecutablePath(binary_path);

  // Search in binary path first
  std::string full_path = binary_path + std::string(file_name);

  // If kernel cannot be found, go up one folder (Xcode, Visual Studio and
  // other multi-target build setups)
  if ( !std::ifstream(full_path, std::ios::in).good()) {
    full_path = binary_path + "../" + std::string(file_name);
  }

  std::ifstream kernel_file ( full_path, std::ios::in );

  if ( !kernel_file.good() ) {
    FATAL ( "Cannot open kernel: " << full_path );
  }

  // Slurp the whole file; the string must outlive the create call below
  // because only a pointer to its buffer is handed to OpenCL.
  std::ostringstream oss;
  oss << kernel_file.rdbuf();

  const std::string kernel_content = oss.str();
  const char* kernel_content_char = kernel_content.c_str();

  // Pass &error so the reason for a failed creation is not lost.
  program = clCreateProgramWithSource ( context, 1, &kernel_content_char, NULL, &error );

  if ( program == NULL ) {
    FATAL ( "Cannot create kernel: " << file_name << " (OpenCL error " << error << ")" );
  }

  error = clBuildProgram ( program, 1, &device, NULL, NULL, NULL );

  if ( error != CL_SUCCESS ) {
    // Ask for the log size first so the log is neither truncated nor read
    // from an uninitialised buffer when the query itself fails.
    size_t log_size = 0;
    cl_int log_error = clGetProgramBuildInfo ( program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size );

    std::string build_log;

    if ( log_error == CL_SUCCESS && log_size > 0 ) {
      build_log.assign ( log_size, '\0' );
      log_error = clGetProgramBuildInfo ( program, device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL );

      if ( log_error != CL_SUCCESS ) {
        build_log.clear();
      }
    }

    // c_str() stops at the first NUL, dropping the terminator OpenCL writes.
    LOGERROR << "Error compiling kernel " << file_name << ":\n" << build_log.c_str();

    // Release the handle before bailing out.
    // NOTE(review): FATAL is defined elsewhere; if it throws rather than
    // terminating, this avoids leaking the program object - confirm.
    clReleaseProgram ( program );
    FATAL ( "Compilation failed, exiting..." );
  }

  return program;
}
コード例 #2
0
ファイル: mpiOCL.cpp プロジェクト: liubingjun/mpiopencl
/**
 * @brief Creates and builds an OpenCL program from a kernel source file.
 *
 * The program is built with an empty device list (num_devices = 0), while
 * the build log on failure is queried for `device` only.
 *
 * @param context   OpenCL context the program is created in.
 * @param device    Device whose build log is printed when the build fails.
 * @param file_name Path of the kernel source file to read.
 * @return The built program, or NULL if the file cannot be opened, the
 *         program cannot be created, or the build fails. Diagnostics are
 *         written to std::cerr; on build failure the program is released.
 */
cl_program CreateProgram(cl_context context, cl_device_id device, const char *file_name)
{
    cl_int error_number = CL_SUCCESS;
    cl_program program = NULL;

    // A null path cannot be opened and must not reach std::ifstream.
    if (file_name == NULL)
    {
        std::cerr << "Failed to open file for reading: (null)" << std::endl;
        return NULL;
    }

    std::ifstream kernel_file(file_name, std::ios::in);
    if (!kernel_file.is_open())
    {
        std::cerr << "Failed to open file for reading: " << file_name << std::endl;
        return NULL;
    }

    // Read the whole file; the string must stay alive while OpenCL copies
    // the source through the raw pointer below.
    std::ostringstream oss;
    oss << kernel_file.rdbuf();

    const std::string source_stand_string = oss.str();
    const char *source_string = source_stand_string.c_str();

    // Capture the error code instead of discarding it.
    program = clCreateProgramWithSource(context,
                                        1,
                                        &source_string,
                                        NULL,
                                        &error_number);
    if (program == NULL)
    {
        std::cerr << "Failed to create CL program from source."
                  << " OpenCL error code: " << error_number << std::endl;
        return NULL;
    }

    error_number = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (error_number != CL_SUCCESS)
    {
        // Query the required size first so the log is never truncated and
        // never printed from an uninitialised buffer if the query fails.
        size_t log_size = 0;
        cl_int log_error = clGetProgramBuildInfo(program,
                                                 device,
                                                 CL_PROGRAM_BUILD_LOG,
                                                 0,
                                                 NULL,
                                                 &log_size);

        std::string build_log;
        if (log_error == CL_SUCCESS && log_size > 0)
        {
            build_log.assign(log_size, '\0');
            log_error = clGetProgramBuildInfo(program,
                                              device,
                                              CL_PROGRAM_BUILD_LOG,
                                              log_size,
                                              &build_log[0],
                                              NULL);
            if (log_error != CL_SUCCESS)
            {
                build_log.clear();
            }
        }

        std::cerr << "Error in kernel: " << std::endl;
        // c_str() stops at the first NUL, dropping the terminator OpenCL writes.
        std::cerr << build_log.c_str() << std::endl;
        clReleaseProgram(program);
        return NULL;
    }
    return program;
}
int main()
{
    using size_t                = ::st_buffer_size_t;
    using num_particles_t       = ::st_particle_num_elements_t;
    using buffer_t              = ::st_Buffer;
    using object_t              = ::st_Object;
    using particles_t           = ::st_Particles;

    using timing_result_t       = sixtrack::benchmarks::TimingResult;
    size_t const NUM_TURNS      = size_t{ 20u };

    /* ===================================================================== */
    /* ==== Prepare Host Buffers                                             */

    double begin_time = ::st_Time_get_seconds_since_epoch();

    std::vector< size_t > num_particles_list =
    {
        20000u
    };

    std::sort( num_particles_list.begin(), num_particles_list.end() );

    /* --------------------------------------------------------------------- */

    buffer_t* lhc_beam_elements = ::st_Buffer_new_from_file(
        ::st_PATH_TO_TEST_LHC_BEAM_ELEMENTS_DATA_NO_BEAM_BEAM );

    object_t const* be_begin =
        ::st_Buffer_get_const_objects_begin( lhc_beam_elements );

    object_t const* be_end   =
        ::st_Buffer_get_const_objects_end( lhc_beam_elements );

    /* --------------------------------------------------------------------- */

    buffer_t* lhc_particles_buffer = ::st_Buffer_new_from_file(
        st_PATH_TO_TEST_LHC_PARTICLES_DATA_T1_P2_NO_BEAM_BEAM );

    particles_t const* lhc_particles = ( particles_t const* )( uintptr_t
        )::st_Object_get_begin_addr( ::st_Buffer_get_const_objects_begin(
            lhc_particles_buffer ) );

    size_t const lhc_num_particles =
        ::st_Particles_get_num_of_particles( lhc_particles );

    /* --------------------------------------------------------------------- */

    size_t const max_num_particles = num_particles_list.back();

    size_t const requ_num_slots = ::st_Particles_get_required_num_slots(
        lhc_particles_buffer, max_num_particles );

    size_t const requ_num_dataptrs = ::st_Particles_get_required_num_dataptrs(
        lhc_particles_buffer, max_num_particles );

    size_t const req_particles_buffer_size =
        ::st_Buffer_calculate_required_buffer_length( lhc_particles_buffer,
            max_num_particles, requ_num_slots, requ_num_dataptrs, size_t{ 0 } );

    buffer_t* particles_buffer = ::st_Buffer_new( req_particles_buffer_size );

    /* --------------------------------------------------------------------- */

    double now = ::st_Time_get_seconds_since_epoch();

    double const time_setup_host_buffers =
        ( now >= begin_time ) ? ( now - begin_time ) : double{ 0.0 };

    /* ===================================================================== */
    /* ==== Prepare OpenCL Environment Buffers                                             */

    begin_time = ::st_Time_get_seconds_since_epoch();

    std::vector< cl::Platform > platforms;
    cl::Platform::get( &platforms );

    std::vector< cl::Device > devices;

    for( auto const& p : platforms )
    {
        std::vector< cl::Device > temp_devices;

        p.getDevices( CL_DEVICE_TYPE_ALL, &temp_devices );

        for( auto const& d : temp_devices )
        {
            if( !d.getInfo< CL_DEVICE_AVAILABLE >() ) continue;

            devices.push_back( d );
        }
    }

    now =  ::st_Time_get_seconds_since_epoch();

    double const time_get_platforms =
        ( now >= begin_time ) ? now - begin_time : double{ 0 };

    begin_time = ::st_Time_get_seconds_since_epoch();

    if( !devices.empty() )
    {
        std::ostringstream a2str( "" );
        std::string const PATH_TO_BASE_DIR = ::st_PATH_TO_BASE_DIR;

        a2str << " -D_GPUCODE=1"
              << " -D__NAMESPACE=st_"
              << " -DSIXTRL_DATAPTR_DEC=__global"
              << " -DSIXTRL_BUFFER_DATAPTR_DEC=__global"
              << " -DSIXTRL_BUFFER_OBJ_ARGPTR_DEC=__global"
              << " -DISXTRL_BUFFER_OBJ_DATAPTR_DEC=__global"
              << " -DSIXTRL_PARTICLE_ARGPTR_DEC=__global"
              << " -DSIXTRL_PARTICLE_DATAPTR_DEC=__global"
              << " -DSIXTRL_BE_ARGPTR_DEC=__global"
              << " -DSIXTRL_BE_DATAPTR_DEC=__global"
              << " -I" << PATH_TO_BASE_DIR;

        std::string const REMAP_COMPILE_OPTIONS = a2str.str();

        /* ----------------------------------------------------------------- */

        std::string path_to_source = PATH_TO_BASE_DIR;
        path_to_source += "sixtracklib/opencl/impl/track_particles_kernel.cl";

        std::ifstream kernel_file( path_to_source, std::ios::in );

        std::string const REMAP_PROGRAM_SOURCE_CODE(
            ( std::istreambuf_iterator< char >( kernel_file ) ),
              std::istreambuf_iterator< char >() );

        kernel_file.close();

        path_to_source  = PATH_TO_BASE_DIR;
        path_to_source += "sixtracklib/opencl/impl/";
        path_to_source += "track_particles_priv_particles_optimized_kernel.cl";

        kernel_file.open( path_to_source, std::ios::in );

        std::string const TRACKING_PRORGRAM_SOURCE_CODE(
            ( std::istreambuf_iterator< char >( kernel_file ) ),
              std::istreambuf_iterator< char >() );

        a2str.str( "" );

        a2str << " -D_GPUCODE=1"
              << " -D__NAMESPACE=st_"
              << " -DSIXTRL_DATAPTR_DEC=__global"
              << " -DSIXTRL_BUFFER_DATAPTR_DEC=__global"
              << " -DSIXTRL_BUFFER_OBJ_ARGPTR_DEC=__global"
              << " -DISXTRL_BUFFER_OBJ_DATAPTR_DEC=__global"
              << " -DSIXTRL_PARTICLE_ARGPTR_DEC=__private"
              << " -DSIXTRL_PARTICLE_DATAPTR_DEC=__private"
              << " -DSIXTRL_BE_ARGPTR_DEC=__global"
              << " -DSIXTRL_BE_DATAPTR_DEC=__global"
              << " -I" << PATH_TO_BASE_DIR;

        std::string const TRACKING_COMPILE_OPTIONS = a2str.str();

        /* ----------------------------------------------------------------- */

        for( auto& device : devices )
        {
            cl::Platform platform( device.getInfo< CL_DEVICE_PLATFORM >() );

            std::cout << "--------------------------------------------------"
                      << "----------------------------------------------\r\n"
                      << "INFO  :: Perform test for device       : "
                      << device.getInfo< CL_DEVICE_NAME >() << "\r\n"
                      << "INFO  :: Platform                      : "
                      << platform.getInfo< CL_PLATFORM_NAME >() << "\r\n"
                      << "INFO  :: Platform Vendor               : "
                      << platform.getInfo< CL_PLATFORM_VENDOR >() << "\r\n"
                      << "INFO  :: Device Type                   : ";

            auto const device_type = device.getInfo< CL_DEVICE_TYPE >();

            switch( device_type )
            {
                case CL_DEVICE_TYPE_CPU:
                {
                    std::cout << "CPU";
                    break;
                }

                case CL_DEVICE_TYPE_GPU:
                {
                    std::cout << "GPU";
                    break;
                }

                case CL_DEVICE_TYPE_ACCELERATOR:
                {
                    std::cout << "Accelerator";
                    break;
                }

                case CL_DEVICE_TYPE_CUSTOM:
                {
                    std::cout << "Custom";
                    break;
                }

                default:
                {
                    std::cout << "Unknown";
                }
            };

            size_t const device_max_compute_units =
                device.getInfo< CL_DEVICE_MAX_COMPUTE_UNITS >();

            std::cout << "\r\n"
                      << "INFO  :: Max work-group size           : "
                      << device.getInfo< CL_DEVICE_MAX_WORK_GROUP_SIZE >()
                      << "\r\n"
                      << "INFO  :: Max num compute units         : "
                      << device_max_compute_units << "\r\n";

            /* ------------------------------------------------------------- */

            cl_int cl_ret = CL_SUCCESS;

            cl::Context context( device );
            cl::CommandQueue queue( context, device, CL_QUEUE_PROFILING_ENABLE );
            cl::Program remap_program( context, REMAP_PROGRAM_SOURCE_CODE );
            cl::Program tracking_program( context, TRACKING_PRORGRAM_SOURCE_CODE );

            try
            {
                cl_ret = remap_program.build( REMAP_COMPILE_OPTIONS.c_str() );
            }
            catch( cl::Error const& e )
            {
                std::cerr
                      << "ERROR :: remap_program :: "
                      << "OpenCL Compilation Error -> Stopping Unit-Test \r\n"
                      << remap_program.getBuildInfo< CL_PROGRAM_BUILD_LOG >( device )
                      << "\r\n"
                      << std::endl;

                cl_ret = CL_FALSE;
                throw;
            }

            try
            {
                cl_ret = tracking_program.build( TRACKING_COMPILE_OPTIONS.c_str() );
            }
            catch( cl::Error const& e )
            {
                std::cerr
                      << "ERROR :: tracking_program :: "
                      << "OpenCL Compilation Error -> Stopping Unit-Test \r\n"
                      << tracking_program.getBuildInfo< CL_PROGRAM_BUILD_LOG >( device )
                      << "\r\n"
                      << std::endl;

                cl_ret = CL_FALSE;
                throw;
            }

            /* ------------------------------------------------------------- */

             cl::Kernel remapping_kernel;

            try
            {
                remapping_kernel =
                    cl::Kernel( remap_program, "st_Remap_particles_beam_elements_buffers_opencl" );
            }
            catch( cl::Error const& e )
            {
                std::cout << "kernel remap_kernel :: "
                          << "line  = " << __LINE__ << " :: "
                          << "ERROR : " << e.what() << "\r\n"
                          << e.err() << std::endl;

                cl_ret = CL_FALSE;
                throw;
            }

            size_t remap_work_group_size = remapping_kernel.getWorkGroupInfo<
                CL_KERNEL_WORK_GROUP_SIZE >( device );

            size_t const remap_work_group_size_prefered_multiple =
                remapping_kernel.getWorkGroupInfo<
                    CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE >( device );

            size_t remap_num_threads = remap_work_group_size_prefered_multiple;
            size_t remap_group_size  = remap_work_group_size_prefered_multiple;

            /* ------------------------------------------------------------- */

            cl::Kernel tracking_kernel;

            try
            {
                tracking_kernel = cl::Kernel( tracking_program,
                    "st_Track_particles_beam_elements_priv_particles_optimized_opencl" );
            }
            catch( cl::Error const& e )
            {
                std::cout << "kernel tracking_kernel :: "
                          << "line = " << __LINE__
                          << " :: ERROR : " << e.what() << std::endl
                          << e.err() << std::endl;
                cl_ret = CL_FALSE;
                throw;
            }

            size_t track_work_group_size = tracking_kernel.getWorkGroupInfo<
                CL_KERNEL_WORK_GROUP_SIZE >( device );

            size_t const track_work_group_size_prefered_multiple =
                tracking_kernel.getWorkGroupInfo<
                    CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE >( device );

            now = ::st_Time_get_seconds_since_epoch();

            double const time_cl_program_compile = ( now >= begin_time )
                ? ( now - begin_time ) : double{ 0.0 };

            /* ============================================================= */

            for( auto const NUM_PARTICLES : num_particles_list )
            {
                size_t tracking_num_threads = size_t{ 0 };
                size_t tracking_group_size  = track_work_group_size;

                tracking_num_threads  = NUM_PARTICLES / track_work_group_size;
                tracking_num_threads *= track_work_group_size;

                if( tracking_num_threads < NUM_PARTICLES )
                {
                    tracking_num_threads += track_work_group_size;
                }

                std::cout << "INFO  :: num_particles                 : "
                          << NUM_PARTICLES << "\r\n"
                          << "INFO  :: remap kernel wg size          : "
                          << remap_work_group_size << "\r\n"
                          << "INFO  :: remap kernel wg size multi    : "
                          << remap_work_group_size_prefered_multiple << "\r\n"
                          << "INFO  :: remap kernel launch with      : "
                          << remap_num_threads << " threads \r\n"
                          << "INFO  :: remap_kernel local size       : "
                          << remap_group_size << " threads \r\n\r\n"
                          << "INFO  :: num_turns                     : "
                          << NUM_TURNS     << "\r\n"
                          << "INFO  :: tracking kernel wg size       : "
                          << track_work_group_size << "\r\n"
                          << "INFO  :: tracking kernel wg size multi : "
                          << track_work_group_size_prefered_multiple << "\r\n"
                          << "INFO  :: tracking kernel launch with   : "
                          << tracking_num_threads << " threads\r\n"
                          << "INFO  :: tracking kernel local size    : "
                          << tracking_group_size  << " threads\r\n"
                          << std::endl;

                begin_time = ::st_Time_get_seconds_since_epoch();

                int success = ::st_Buffer_reset( particles_buffer );
                SIXTRL_ASSERT( success == 0 );

                particles_t* particles = ::st_Particles_new(
                    particles_buffer, NUM_PARTICLES );

                for( size_t ii = size_t{ 0 } ; ii < NUM_PARTICLES ; ++ii )
                {
                    size_t jj = ii % lhc_num_particles;
                    ::st_Particles_copy_single( particles, ii, lhc_particles, jj );
                }

                now = ::st_Time_get_seconds_since_epoch();

                double const time_setup_particle_buffer = ( now >= begin_time )
                    ? ( now - begin_time ) : double{ 0.0 };

                /* ========================================================= */

                std::vector< cl::Event > write_xfer_events( 3u, cl::Event{} );

                cl_ulong write_xfer_when_queued[]    = { 0, 0, 0 };
                cl_ulong write_xfer_when_submitted[] = { 0, 0, 0 };
                cl_ulong write_xfer_when_started[]   = { 0, 0, 0 };
                cl_ulong write_xfer_when_ended[]     = { 0, 0, 0 };

                begin_time = ::st_Time_get_seconds_since_epoch();

                int32_t success_flag = int32_t{ 0 };

                cl::Buffer cl_particles( context, CL_MEM_READ_WRITE,
                    ::st_Buffer_get_size( lhc_particles_buffer ) );

                cl::Buffer cl_beam_elements( context, CL_MEM_READ_WRITE,
                    ::st_Buffer_get_size( lhc_beam_elements ) );

                cl::Buffer cl_success_flag( context, CL_MEM_READ_WRITE,
                    sizeof( success_flag ) );

                try
                {
                    cl_ret = queue.enqueueWriteBuffer( cl_particles, CL_TRUE, 0,
                        ::st_Buffer_get_size( particles_buffer ),
                        ::st_Buffer_get_const_data_begin( particles_buffer ),
                        nullptr, &write_xfer_events[ 0 ] );

                    cl_ret |= write_xfer_events[ 0 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_QUEUED, &write_xfer_when_queued[ 0 ] );

                    cl_ret |= write_xfer_events[ 0 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_SUBMIT, &write_xfer_when_submitted[ 0 ] );

                    cl_ret |= write_xfer_events[ 0 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_START, &write_xfer_when_started[ 0 ] );

                    cl_ret |= write_xfer_events[ 0 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_END, &write_xfer_when_ended[ 0 ] );
                }
                catch( cl::Error const& e )
                {
                    std::cout << "enqueueWriteBuffer( particles_buffer ) :: "
                            << "line = " << __LINE__
                            << " :: ERROR : " << e.what() << std::endl
                            << e.err() << std::endl;

                    cl_ret = CL_FALSE;
                    throw;
                }

                SIXTRL_ASSERT( cl_ret == CL_SUCCESS );

                try
                {
                    cl_ret = queue.enqueueWriteBuffer( cl_beam_elements, CL_TRUE, 0,
                        ::st_Buffer_get_size( lhc_beam_elements ),
                        ::st_Buffer_get_const_data_begin( lhc_beam_elements ),
                        nullptr, &write_xfer_events[ 1 ] );

                    cl_ret |= write_xfer_events[ 1 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_QUEUED, &write_xfer_when_queued[ 1 ] );

                    cl_ret |= write_xfer_events[ 1 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_SUBMIT, &write_xfer_when_submitted[ 1 ] );

                    cl_ret |= write_xfer_events[ 1 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_START, &write_xfer_when_started[ 1 ] );

                    cl_ret |= write_xfer_events[ 1 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_END, &write_xfer_when_ended[ 1 ] );
                }
                catch( cl::Error const& e )
                {
                    std::cout << "enqueueWriteBuffer( beam_elements ) :: "
                            << "line = " << __LINE__
                            << " :: ERROR : " << e.what() << std::endl
                            << e.err() << std::endl;

                    cl_ret = CL_FALSE;
                    throw;
                }

                success_flag = int32_t{ 0 };

                try
                {
                    cl_ret = queue.enqueueWriteBuffer( cl_success_flag, CL_TRUE, 0,
                        sizeof( success_flag ), &success_flag,
                        nullptr, &write_xfer_events[ 2 ] );

                    cl_ret |= write_xfer_events[ 2 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_QUEUED, &write_xfer_when_queued[ 2 ] );

                    cl_ret |= write_xfer_events[ 2 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_SUBMIT, &write_xfer_when_submitted[ 2 ] );

                    cl_ret |= write_xfer_events[ 2 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_START, &write_xfer_when_started[ 2 ] );

                    cl_ret |= write_xfer_events[ 2 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_END, &write_xfer_when_ended[ 2 ] );
                }
                catch( cl::Error const& e )
                {
                    std::cout << "enqueueWriteBuffer( success_flag ) :: "
                            << "line = " << __LINE__
                            << " :: ERROR : " << e.what() << std::endl
                            << e.err() << std::endl;

                    cl_ret = CL_FALSE;
                    throw;
                }

                now = ::st_Time_get_seconds_since_epoch();

                double const time_write_xfer = ( now >= begin_time )
                    ? ( now - begin_time ) : double{ 0 };

                /*  ======================================================== */

                cl::Event run_remap_kernel_event;

                cl_ulong run_remap_kernel_when_queued    = cl_ulong{ 0 };
                cl_ulong run_remap_kernel_when_submitted = cl_ulong{ 0 };
                cl_ulong run_remap_kernel_when_started   = cl_ulong{ 0 };
                cl_ulong run_remap_kernel_when_ended     = cl_ulong{ 0 };

                begin_time = ::st_Time_get_seconds_since_epoch();

                remapping_kernel.setArg( 0, cl_particles );
                remapping_kernel.setArg( 1, cl_beam_elements );
                remapping_kernel.setArg( 2, cl_success_flag );

                try
                {
                    cl_ret = queue.enqueueNDRangeKernel( remapping_kernel,
                        cl::NullRange, cl::NDRange( remap_num_threads ),
                        cl::NDRange( remap_group_size ), nullptr,
                        &run_remap_kernel_event );
                }
                catch( cl::Error const& e )
                {
                    std::cout << "enqueueNDRangeKernel( remapping_kernel ) :: "
                            << "line = " << __LINE__
                            << " :: ERROR : " << e.what() << std::endl
                            << e.err() << std::endl;

                    cl_ret = CL_FALSE;
                    throw;
                }

                SIXTRL_ASSERT( cl_ret == CL_SUCCESS );

                queue.flush();
                run_remap_kernel_event.wait();

                cl_ret = run_remap_kernel_event.getProfilingInfo< cl_ulong >(
                    CL_PROFILING_COMMAND_QUEUED, &run_remap_kernel_when_queued );

                cl_ret |= run_remap_kernel_event.getProfilingInfo< cl_ulong >(
                    CL_PROFILING_COMMAND_SUBMIT, &run_remap_kernel_when_submitted );

                cl_ret |= run_remap_kernel_event.getProfilingInfo< cl_ulong >(
                    CL_PROFILING_COMMAND_START, &run_remap_kernel_when_started );

                cl_ret |= run_remap_kernel_event.getProfilingInfo< cl_ulong >(
                    CL_PROFILING_COMMAND_END, &run_remap_kernel_when_ended );

                now = ::st_Time_get_seconds_since_epoch();

                double const time_run_remapping_kernel = ( now >= begin_time )
                    ? ( now - begin_time ) : double{ 0 };

                /* ========================================================= */

                cl::Event xfer_after_remap_events;

                cl_ulong xfer_after_remap_when_queued    = cl_ulong{ 0 };
                cl_ulong xfer_after_remap_when_submitted = cl_ulong{ 0 };
                cl_ulong xfer_after_remap_when_started   = cl_ulong{ 0 };
                cl_ulong xfer_after_remap_when_ended     = cl_ulong{ 0 };

                begin_time = ::st_Time_get_seconds_since_epoch();

                try
                {
                    cl_ret = queue.enqueueReadBuffer( cl_success_flag, CL_TRUE, 0,
                        sizeof( success_flag ), &success_flag,
                        nullptr, &xfer_after_remap_events );


                    cl_ret = xfer_after_remap_events.getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_QUEUED, &xfer_after_remap_when_queued );

                    cl_ret |= xfer_after_remap_events.getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_SUBMIT, &xfer_after_remap_when_submitted );

                    cl_ret |= xfer_after_remap_events.getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_START, &xfer_after_remap_when_started );

                    cl_ret |= xfer_after_remap_events.getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_END, &xfer_after_remap_when_ended );
                }
                catch( cl::Error const& e )
                {
                    std::cout << "enqueueReadBuffer( success_flag ) :: "
                            << "line = " << __LINE__
                            << " :: ERROR : " << e.what() << std::endl
                            << e.err() << std::endl;

                    cl_ret = CL_FALSE;
                    throw;
                }

                now = ::st_Time_get_seconds_since_epoch();

                SIXTRL_ASSERT( cl_ret == CL_SUCCESS );
                SIXTRL_ASSERT( success_flag == int32_t{ 0 } );

                double const time_xfer_after_remap = ( now >= begin_time )
                    ? ( now - begin_time ) : double{ 0 };

                /* ========================================================== */

                uint64_t const turns = NUM_TURNS;

                cl::Event run_tracking_kernel_event;

                cl_ulong run_tracking_kernel_when_queued    = cl_ulong{ 0 };
                cl_ulong run_tracking_kernel_when_submitted = cl_ulong{ 0 };
                cl_ulong run_tracking_kernel_when_started   = cl_ulong{ 0 };
                cl_ulong run_tracking_kernel_when_ended     = cl_ulong{ 0 };

                begin_time = ::st_Time_get_seconds_since_epoch();

                tracking_kernel.setArg( 0, cl_particles );
                tracking_kernel.setArg( 1, cl_beam_elements );
                tracking_kernel.setArg( 2, turns );
                tracking_kernel.setArg( 3, cl_success_flag );

                try
                {
                    cl_ret = queue.enqueueNDRangeKernel( tracking_kernel,
                        cl::NullRange, cl::NDRange( tracking_num_threads ),
                        cl::NDRange( tracking_group_size ), nullptr,
                        &run_tracking_kernel_event );
                }
                catch( cl::Error const& e )
                {
                    std::cout << "enqueueNDRangeKernel( remapping_kernel ) :: "
                              << "line = " << __LINE__
                              << " :: ERROR : " << e.what() << std::endl
                              << e.err() << std::endl;

                    cl_ret = CL_FALSE;
                    throw;
                }

                cl_ret = queue.flush();
                run_tracking_kernel_event.wait();

                cl_ret |= run_tracking_kernel_event.getProfilingInfo< cl_ulong >(
                    CL_PROFILING_COMMAND_QUEUED, &run_tracking_kernel_when_queued );

                cl_ret |= run_tracking_kernel_event.getProfilingInfo< cl_ulong >(
                    CL_PROFILING_COMMAND_SUBMIT, &run_tracking_kernel_when_submitted );

                cl_ret |= run_tracking_kernel_event.getProfilingInfo< cl_ulong >(
                    CL_PROFILING_COMMAND_START, &run_tracking_kernel_when_started );

                cl_ret |= run_tracking_kernel_event.getProfilingInfo< cl_ulong >(
                    CL_PROFILING_COMMAND_END, &run_tracking_kernel_when_ended );

                now = ::st_Time_get_seconds_since_epoch();

                double const time_run_tracking_kernel = ( now >= begin_time )
                    ? ( now - begin_time ) : double{ 0 };

                double const time_tracking_until_submitted =
                    static_cast< double >( run_tracking_kernel_when_submitted -
                                           run_tracking_kernel_when_queued ) * 1e-9;

                double const time_tracking_until_start =
                    static_cast< double >( run_tracking_kernel_when_started -
                                           run_tracking_kernel_when_submitted ) * 1e-9;

                double const time_tracking_device_execution =
                    static_cast< double >( run_tracking_kernel_when_ended -
                                           run_tracking_kernel_when_started ) * 1e-9;

                /* ========================================================== */

                std::vector< cl::Event >
                    xfer_after_tracking_events( 2u, cl::Event{} );

                cl_ulong xfer_after_tracking_when_queued[]    = { 0, 0 };
                cl_ulong xfer_after_tracking_when_submitted[] = { 0, 0 };
                cl_ulong xfer_after_tracking_when_started[]   = { 0, 0 };
                cl_ulong xfer_after_tracking_when_ended[]     = { 0, 0 };

                begin_time = ::st_Time_get_seconds_since_epoch();

                try
                {
                    cl_ret = queue.enqueueReadBuffer( cl_particles, CL_TRUE, 0,
                        ::st_Buffer_get_size( particles_buffer ),
                        ::st_Buffer_get_data_begin( particles_buffer ),
                        nullptr, &xfer_after_tracking_events[ 0 ] );


                    cl_ret |= xfer_after_tracking_events[ 0 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_QUEUED, &xfer_after_tracking_when_queued[ 0 ] );

                    cl_ret |= xfer_after_tracking_events[ 0 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_SUBMIT, &xfer_after_tracking_when_submitted[ 0 ] );

                    cl_ret |= xfer_after_tracking_events[ 0 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_START, &xfer_after_tracking_when_started[ 0 ] );

                    cl_ret |= xfer_after_tracking_events[ 0 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_END, &xfer_after_tracking_when_ended[ 0 ] );
                }
                catch( cl::Error const& e )
                {
                    std::cout << "enqueueReadBuffer( success_flag ) :: "
                            << "line = " << __LINE__
                            << " :: ERROR : " << e.what() << std::endl
                            << e.err() << std::endl;

                    cl_ret = CL_FALSE;
                    throw;
                }

                SIXTRL_ASSERT( cl_ret == CL_SUCCESS );

                try
                {
                    cl_ret = queue.enqueueReadBuffer( cl_success_flag, CL_TRUE, 0,
                        sizeof( success_flag ), &success_flag,
                        nullptr, &xfer_after_tracking_events[ 1 ] );

                    cl_ret |= xfer_after_tracking_events[ 1 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_QUEUED, &xfer_after_tracking_when_queued[ 1 ] );

                    cl_ret |= xfer_after_tracking_events[ 1 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_SUBMIT, &xfer_after_tracking_when_submitted[ 1 ] );

                    cl_ret |= xfer_after_tracking_events[ 1 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_START, &xfer_after_tracking_when_started[ 1 ] );

                    cl_ret |= xfer_after_tracking_events[ 1 ].getProfilingInfo< cl_ulong >(
                        CL_PROFILING_COMMAND_END, &xfer_after_tracking_when_ended[ 1 ] );
                }
                catch( cl::Error const& e )
                {
                    std::cout << "enqueueReadBuffer( success_flag ) :: "
                            << "line = " << __LINE__
                            << " :: ERROR : " << e.what() << std::endl
                            << e.err() << std::endl;

                    cl_ret = CL_FALSE;
                    throw;
                }

                now = ::st_Time_get_seconds_since_epoch();

                SIXTRL_ASSERT( cl_ret == CL_SUCCESS );
                SIXTRL_ASSERT( success_flag == int32_t{ 0 } );

                double const time_xfer_after_tracking = ( now >= begin_time )
                    ? ( now - begin_time ) : double{ 0 };

                /* ======================================================== */

                a2str.str( "" );

                double time_run_tracking_normalized =
                    time_run_tracking_kernel / static_cast< double >(
                        NUM_TURNS * NUM_PARTICLES );

                if( time_run_tracking_normalized >= 0.1 )
                {
                    a2str << "sec";
                }


                std::cout << std::endl
                          << "Reslts: \r\n"
                          << "------------------------------------------------"
                          << "------------------------------------------------"
                          << "--------------------------------------------\r\n"
                          << "      :: Tracking time                 : "
                          << std::setw( 20 ) << std::fixed
                          << time_run_tracking_kernel << " [sec] \r\n"
                          << "      :: Tracking time/particle/turn   : ";


                if( time_run_tracking_normalized >= 200e-3 )
                {
                    std::cout << std::setw( 20 ) << std::fixed
                              << time_run_tracking_normalized << "[sec]\r\n";
                }
                else if( time_run_tracking_normalized >= 200e-6 )
                {
                    std::cout << std::setw( 20 ) << std::fixed
                              << time_run_tracking_normalized * 1e3 << "[millisec]\r\n";
                }
                else
                {
                    std::cout << std::setw( 20 ) << std::fixed
                              << time_run_tracking_normalized * 1e6 << "[usec]\r\n";
                }

                std::cout << "      :: device_run_time               : "
                          << std::setw( 20 ) << std::fixed
                          << time_tracking_device_execution << "\r\n"
                          << "      :: device overhead               : "
                          << std::setw( 20 ) << std::fixed
                          << time_tracking_until_start << " + "
                          << time_tracking_until_submitted << "\r\n"
                          << "------------------------------------------------"
                          << "------------------------------------------------"
                          << "--------------------------------------------\r\n"
                          << "\r\n"
                          << std::endl;
            }

            int success = ::st_Buffer_remap( particles_buffer );
            SIXTRL_ASSERT( success == 0 );

            ::st_Buffer_reset( particles_buffer );
        }
    }

    ::st_Buffer_delete( lhc_particles_buffer );
    ::st_Buffer_delete( lhc_beam_elements );
    ::st_Buffer_delete( particles_buffer );

    return 0;
}
コード例 #4
0
/**
 * Sets up the custom OpenCL GEMM path.
 *
 * Steps performed:
 *   1. resets the shared `err` status,
 *   2. assembles the compiler defines (T, T4, T8, T16 and the optional
 *      SAMPLE_NEEDS_DOUBLE / SAMPLE_NEEDS_HALF switches) from `arithmetic`,
 *   3. records the chosen variant in `caffe::cl_program` and maps it to a
 *      kernel file below "clkernel/",
 *   4. reads that file, registers it as "gemm_program" and fetches the
 *      "gemm", "transpose" and "copy" kernels,
 *   5. creates the transpose / copy / result buffers on top of the
 *      corresponding host arrays (CL_MEM_USE_HOST_PTR).
 *
 * @param cl_program  name of the GEMM variant; "blocking-2-v4",
 *                    "blocking-4-v4" and "noblock-v8" are recognised.
 * @param arithmetic  scalar type name substituted for T in the kernels.
 */
static void customcl_setup(
    std::string cl_program = "blocking-2-v4", 
    std::string arithmetic = "float") {

    err = 0;

    // Build options for OpenCL. Assembled exactly once and handed to the
    // context further down.
    std::string cl_build_options =
        "-DT=" + arithmetic +
        " -DT4=" + arithmetic + "4" + 
        " -DT8=" + arithmetic + "8" + 
        " -DT16=" + arithmetic + "16" + 
        " " + (arithmetic == "double" ? " -DSAMPLE_NEEDS_DOUBLE" : "") + 
        " " + (arithmetic == "half" ? " -DSAMPLE_NEEDS_HALF" : "");

    // Kernel file for the requested variant.
    // NOTE(review): an unrecognised variant name leaves the path at
    // "clkernel/" (a directory); no dedicated error is raised for that here.
    std::string clkernel_path = "clkernel/";
    
    caffe::cl_program = cl_program;
    if(cl_program == "blocking-2-v4") {
      clkernel_path += "gemm-blocking-2x2-vload4.cl";
    }else if(cl_program == "blocking-4-v4") {
      clkernel_path += "gemm-blocking-4x4-vload4.cl";
    }else if(cl_program == "noblock-v8") {
      clkernel_path += "gemm-noblock-vload8.cl";
    }

    // Read the whole kernel source into memory.
    // NOTE(review): the stream state is not checked; if the file cannot be
    // opened, kernel_str is empty and the failure only surfaces in the calls
    // below - consider reporting it here.
    std::ifstream kernel_file(clkernel_path);
    std::string kernel_str((std::istreambuf_iterator<char>(kernel_file)),
                           std::istreambuf_iterator<char>());

    viennacl::ocl::current_context().build_options(cl_build_options);

    cl_prog = viennacl::ocl::get_context(0).add_program(
        kernel_str, "gemm_program");

    gemm_exec = cl_prog.get_kernel("gemm");
    transpose_exec = cl_prog.get_kernel("transpose"); 
    copy_exec = cl_prog.get_kernel("copy"); 

    // All three scratch buffers share the same size.
    // NOTE(review): the factor 8 looks like the byte size of the widest
    // supported scalar - confirm.
    const auto buffer_bytes = TRANSPOSE_BUFFER_DIM * TRANSPOSE_BUFFER_DIM * 8;

    transpose_ptr = (void*)clCreateBuffer(
        viennacl::ocl::current_context().handle().get(),
        CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
        buffer_bytes,
        host_trans_buffer,
        &err
    );
    SAMPLE_CHECK_ERRORS(err);

    copy_ptr = (void*)clCreateBuffer(
        viennacl::ocl::current_context().handle().get(),
        CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
        buffer_bytes,
        host_copy_buffer,
        &err
    );
    SAMPLE_CHECK_ERRORS(err);

    result_ptr = (void*)clCreateBuffer(
        viennacl::ocl::current_context().handle().get(),
        CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
        buffer_bytes,
        host_result_buffer,
        &err
    );
    SAMPLE_CHECK_ERRORS(err);
}
コード例 #5
0
int main(int argc, char** argv)
{
      if(argc < 3) {
          std::cerr << "Usage: " << argv[0] << " < #particles > < #turns > [deviceIdx]" << std::endl;
          exit(1);
        }
  		int NUM_REPETITIONS = 10;
    	double num_of_turns_drift = 0.0; // for timing
    	double num_of_turns_drift_exact = 0.0; // for timing
    	double num_of_turns_cavity = 0.0; // for timing
    	double num_of_turns_align = 0.0; // for timing
    	double average_execution_time_drift = 0.0;
    	double average_execution_time_drift_exact = 0.0;
    	double average_execution_time_cavity = 0.0;
    	double average_execution_time_align = 0.0;

      std::vector<double> exec_time_drift;
      std::vector<double> exec_time_drift_exact;
      std::vector<double> exec_time_cavity;
      std::vector<double> exec_time_align;

      int choice  = 1;

			for(int ll = 0; ll < NUM_REPETITIONS; ++ll) {
    /* We will use 9+ beam element blocks in this example and do not
     * care to be memory efficient yet; thus we make the blocks for
     * beam elements and particles big enough to avoid running into problems */

    constexpr st_block_size_t const MAX_NUM_BEAM_ELEMENTS       = 1000u; // 20u;
    constexpr st_block_size_t const NUM_OF_BEAM_ELEMENTS        = 1000u; //9u;

    /* 1MByte is plenty of space */
    constexpr st_block_size_t const BEAM_ELEMENTS_DATA_CAPACITY = 1048576u;

    /* Prepare and init the beam elements buffer */

    st_Blocks beam_elements;
    st_Blocks_preset( &beam_elements );

    int ret = st_Blocks_init( &beam_elements, MAX_NUM_BEAM_ELEMENTS,
                              BEAM_ELEMENTS_DATA_CAPACITY );

    assert( ret == 0 ); /* if there was an error, ret would be != 0 */

    /* Add NUM_OF_BEAM_ELEMENTS drifts to the buffer. For this example, let's
     * just have one simple constant length for all of them: */

   // One-fourth of the beam-elements are drift-elements
    for( st_block_size_t ii = 0 ; ii < NUM_OF_BEAM_ELEMENTS/4 ; ++ii )
    {
        double const drift_length = double{ 0.2L };
        st_Drift* drift = st_Blocks_add_drift( &beam_elements, drift_length );

        (void)drift; // using the variable with a no-op

        assert( drift != nullptr ); /* Otherwise, there was a problem! */
    }

    /* Check if we *really* have the correct number of beam elements and
     * if they really are all drifts */

    assert( st_Blocks_get_num_of_blocks( &beam_elements ) ==
            NUM_OF_BEAM_ELEMENTS/4 );

    /* The beam_elements container is currently not serialized yet ->
     * we could still add blocks to the buffer. Let's jus do this and
     * add a different kind of beam element to keep it easier apart! */

    for( st_block_size_t ii = NUM_OF_BEAM_ELEMENTS/4 ; ii < NUM_OF_BEAM_ELEMENTS/2 ; ++ii )
    {
        double const drift_length = double{ 0.1L };
    st_DriftExact* drift_exact = st_Blocks_add_drift_exact(
        &beam_elements, drift_length );
        (void) drift_exact;
    assert( drift_exact != nullptr );
   }

    assert( st_Blocks_get_num_of_blocks( &beam_elements ) ==
            ( NUM_OF_BEAM_ELEMENTS*0.5) );

    /* Adding the beam element 'cavity' */

    for( st_block_size_t ii = NUM_OF_BEAM_ELEMENTS*0.5 ; ii < NUM_OF_BEAM_ELEMENTS*0.75 ; ++ii )
    {
      double const voltage = double{ 1e4};
      double const frequency = double{ 40};
      double const lag = double{ 0.01L};
      st_Cavity* cavity = st_Blocks_add_cavity(
          &beam_elements, voltage, frequency, lag);
      (void) cavity; // a no-op
      assert( cavity != nullptr ); /* Otherwise, there was a problem! */
    }
    assert( st_Blocks_get_num_of_blocks( &beam_elements ) ==
            ( NUM_OF_BEAM_ELEMENTS * 0.75) );

    /* Adding the beam element 'align' */
    double const M__PI   = // note the two underscores between M and PI
      ( double )3.1415926535897932384626433832795028841971693993751L;
    for( st_block_size_t ii = NUM_OF_BEAM_ELEMENTS*0.75 ; ii < NUM_OF_BEAM_ELEMENTS ; ++ii )
    {
      double const tilt = double{ 0.5};
      double const z = double{ M__PI / 45};
      double const dx = double{ 0.2L};
      double const dy = double{ 0.2L};
      st_Align* align = st_Blocks_add_align(
          &beam_elements, tilt, cos( z ), sin( z ), dx, dy);
      (void) align; // a no-op
      assert( align != nullptr ); /* Otherwise, there was a problem! */
    }
    assert( st_Blocks_get_num_of_blocks( &beam_elements ) ==
        ( NUM_OF_BEAM_ELEMENTS) );
    /* Always safely terminate pointer variables pointing to resources they
     * do not own which we no longer need -> just a good practice */

//    drift_exact = nullptr;

    /* After serialization, the "structure" of the beam_elements buffer is
     * frozen, but the data in the elements - i.e. the length of the
     * individual drifts in our example - can still be modified. We will
     * just not be able to add further blocks to the container */

    assert( !st_Blocks_are_serialized( &beam_elements ) );

    ret = st_Blocks_serialize( &beam_elements );

    assert( ret == 0 );
    assert( st_Blocks_are_serialized( &beam_elements ) ); // serialization on CPU done.

    /* Next, let's iterate over all the beam_elements in the buffer and
     * print out the properties -> we expect that NUM_OF_BEAM_ELEMENTS
     * st_Drift with the same length appear and one st_DriftExact with a
     * different length should appear in the end */
    std::cout.flush();

/************************** Preparing grounds for OpenCL *******/
    std::vector<cl::Platform> platform;
    cl::Platform::get(&platform);

    if( platform.empty() )
    {
        std::cerr << "OpenCL platforms not found." << std::endl;
        return 1;
    }

    std::vector< cl::Device > devices;

    for( auto const& p : platform )
    {
        std::vector< cl::Device > temp_devices;

        p.getDevices( CL_DEVICE_TYPE_ALL, &temp_devices );

        for( auto const& d : temp_devices )
        {
            if( !d.getInfo< CL_DEVICE_AVAILABLE >() ) continue;
            devices.push_back( d );
        }
    }

    cl::Device* ptr_selected_device = nullptr;

    if( !devices.empty() )
    {
        if( argc >= 4 )
        {
            std::size_t const device_idx = std::atoi( argv[ 3 ] );

            if( device_idx < devices.size() )
            {
                ptr_selected_device = &devices[ device_idx ];
            }
        }

        if( ptr_selected_device == nullptr )
        {
            std::cout << "default selecting device #0" << std::endl;
            ptr_selected_device = &devices[ 0 ];
        }
    }

    if( ptr_selected_device != nullptr )
    {
        std::cout << "device: "
                  << ptr_selected_device->getInfo< CL_DEVICE_NAME >()
                  << std::endl;
    }
    else return 0;

    cl::Context context( *ptr_selected_device );

//    std::cout << "Device list" << std::endl;
//    for(unsigned int jj=0; jj<devices.size(); jj++){
//      std::cout << "Name of devicei " << jj<<" : "<<devices[jj].getInfo<CL_DEVICE_NAME>() << std::endl;
//      std::cout << "resolution of device timer for device " << jj <<" : "<<devices[jj].getInfo<CL_DEVICE_PROFILING_TIMER_RESOLUTION>() << std::endl;
//    };
/**********************************************/


/////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // getting the kernel file
   std::string PATH_TO_KERNEL_FILE( st_PATH_TO_BASE_DIR );
       PATH_TO_KERNEL_FILE += "tests/benchmark/sixtracklib/opencl/";
       PATH_TO_KERNEL_FILE += "kernels_beam_elements_oneatatime.cl";

       std::string kernel_source( "" );
       std::ifstream kernel_file( PATH_TO_KERNEL_FILE.c_str(),
                                  std::ios::in | std::ios::binary );

       if( kernel_file.is_open() )
       {
           std::istreambuf_iterator< char > file_begin( kernel_file.rdbuf() );
           std::istreambuf_iterator< char > end_of_file;

           kernel_source.assign( file_begin, end_of_file );
           kernel_file.close();
       }
////////////////////////////////////////////////////////////////////////////////////////////////////////////
    assert( ptr_selected_device != nullptr );

  //  int ndev = 0; // specifying the id of the device to be used
    cl::CommandQueue queue(context, *ptr_selected_device,CL_QUEUE_PROFILING_ENABLE);
    // Compile OpenCL program for found devices.
			cl:: Program program(context, kernel_source); //string  kernel_source contains the kernel(s) read from the file

#if 0
/////////////////////// Alternative 1 for including the kernels written in a separate file -- works perfectly fine /////////////////////////////////
			cl:: Program program(context, "#include \"../kernels.cl\" ", false); // the path inside the #include should be relative to an include directory specified using -Ipath/to/dir specified via build options.. otherwise give the absolute path.
#endif

#if 0
/////////////////////// The way to go if the string source[] contains the source in the same file as this.

//    cl::Program program(context, cl::Program::Sources(
//        1, std::make_pair(source, strlen(source))
//        ));
#endif


    try {
    std::string incls = "-D_GPUCODE=1 -D__NAMESPACE=st_ -I" + std::string(NS(PATH_TO_BASE_DIR)) ;
  //  std::cout << "Path = " << incls << std::endl;
    //program.build(devices, "-D_GPUCODE=1 -D__NAMESPACE=st_ -I/home/sosingh/sixtracklib_gsoc18/initial_test/sixtrack-v0/external/include");
    program.build( incls.c_str() );
    } catch (const cl::Error&) {
    std::cerr
      << "OpenCL compilation error" << std::endl
      << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(*ptr_selected_device)
      << std::endl;
    throw;
    }




    cl::Buffer B(context, CL_MEM_READ_WRITE, st_Blocks_get_total_num_bytes( &beam_elements  )); // input vector
 queue.enqueueWriteBuffer( B, CL_TRUE, 0, st_Blocks_get_total_num_bytes( &beam_elements ), st_Blocks_get_const_data_begin( &beam_elements ) );




   ////////////////////////// Particles ////////////////////////////////
    st_block_size_t const NUM_PARTICLE_BLOCKS     = 1u;
    st_block_size_t const PARTICLES_DATA_CAPACITY = 1048576u*1000*4; //  ~(4 GB)
    st_block_size_t const NUM_PARTICLES           = atoi(argv[1]); // 100u;

    st_Blocks particles_buffer;
    st_Blocks_preset( &particles_buffer );

    ret = st_Blocks_init(
        &particles_buffer, NUM_PARTICLE_BLOCKS, PARTICLES_DATA_CAPACITY );

    assert( ret == 0 );

    st_Particles* particles = st_Blocks_add_particles(
        &particles_buffer, NUM_PARTICLES );

    if( particles != nullptr )
    {
        /* Just some random values assigned to the individual attributes
         * of the acutal particles -> these values do not make any
         * sense physically, but should be safe for calculating maps ->
         * please check with the map for drift whether they do not produce
         * some NaN's at the sqrt or divisions by 0 though!*/

        std::mt19937_64  prng( 20180622 );

        std::uniform_real_distribution<> x_distribution(  0.05, 1.0 );
        std::uniform_real_distribution<> y_distribution(  0.05, 1.0 );
        std::uniform_real_distribution<> px_distribution( 0.05, 0.2 );
        std::uniform_real_distribution<> py_distribution( 0.05, 0.2 );
        std::uniform_real_distribution<> sigma_distribution( 0.01, 0.5 );

        assert( particles->s     != nullptr );
        assert( particles->x     != nullptr );
        assert( particles->y     != nullptr );
        assert( particles->px    != nullptr );
        assert( particles->py    != nullptr );
        assert( particles->sigma != nullptr );
        assert( particles->rpp   != nullptr );
        assert( particles->rvv   != nullptr );

        assert( particles->num_of_particles == (int)NUM_PARTICLES );

        for( st_block_size_t ii = 0 ; ii < NUM_PARTICLES ; ++ii )
        {
            particles->s[ ii ]     = 0.0;
            particles->x[ ii ]     = x_distribution( prng );
            particles->y[ ii ]     = y_distribution( prng );
            particles->px[ ii ]    = px_distribution( prng );
            particles->py[ ii ]    = py_distribution( prng );
            particles->sigma[ ii ] = sigma_distribution( prng );
            particles->rpp[ ii ]   = 1.0;
            particles->rvv[ ii ]   = 1.0;
        }
    }

    ret = st_Blocks_serialize( &particles_buffer );
    assert( ret == 0 );

    /* ===================================================================== */
    /* Copy to other buffer to simulate working on the GPU */
    //std::cout << "On the GPU:\n";

  // Allocate device buffers and transfer input data to device.

    cl::Buffer C(context, CL_MEM_READ_WRITE, st_Blocks_get_total_num_bytes( &particles_buffer )); // input vector
		queue.enqueueWriteBuffer( C, CL_TRUE, 0, st_Blocks_get_total_num_bytes( &particles_buffer ), st_Blocks_get_const_data_begin( &particles_buffer ) );

    int numThreads = 1;
    int blockSize = 1;
    cl::Kernel unserialize(program, "unserialize");
    unserialize.setArg(0,B);
    unserialize.setArg(1,C);
    unserialize.setArg(2,NUM_PARTICLES);
    queue.enqueueNDRangeKernel(
    unserialize, cl::NullRange, cl::NDRange( numThreads ),
    cl::NDRange(blockSize ));
    queue.flush();
    queue.finish();



      // creating a buffer to transfer the data from GPU to CPU

      std::vector< uint8_t > copy_particles_buffer_host(st_Blocks_get_total_num_bytes( &particles_buffer )/sizeof(uint8_t));  // output vector

      queue.enqueueReadBuffer(C, CL_TRUE, 0, copy_particles_buffer_host.size() * sizeof(uint8_t), copy_particles_buffer_host.data());
      queue.flush();

    st_Blocks copy_particles_buffer;
    st_Blocks_preset( &copy_particles_buffer );

    ret = st_Blocks_unserialize( &copy_particles_buffer, copy_particles_buffer_host.data() );
    assert( ret == 0 );



    SIXTRL_UINT64_T const NUM_TURNS = atoi(argv[2]);//100;
    SIXTRL_UINT64_T offset = 0;
    cl::Event event;
  
  switch (choice)
  {
    case 1 :
    {
    cl::Kernel track_drift_particle(program, "track_drift_particle");
    blockSize = track_drift_particle.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( *ptr_selected_device);// determine the work-group size
    numThreads = ((NUM_PARTICLES+blockSize-1)/blockSize) * blockSize; // rounding off NUM_PARTICLES to the next nearest multiple of blockSize. This is to ensure that there are integer number of work-groups launched
    std::cout << blockSize << " " << numThreads<< std::endl;
    track_drift_particle.setArg(0,B);
    track_drift_particle.setArg(1,C);
    track_drift_particle.setArg(2,NUM_PARTICLES);
    track_drift_particle.setArg(3,NUM_TURNS);
    track_drift_particle.setArg(4,offset);



    queue.enqueueNDRangeKernel(
    track_drift_particle, cl::NullRange, cl::NDRange( numThreads ),
    cl::NDRange(blockSize ), nullptr, &event);
    queue.flush();
    event.wait();
    queue.finish();

        cl_ulong when_kernel_queued    = 0;
        cl_ulong when_kernel_submitted = 0;
        cl_ulong when_kernel_started   = 0;
        cl_ulong when_kernel_ended     = 0;

        ret  = event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_QUEUED, &when_kernel_queued );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_SUBMIT, &when_kernel_submitted );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_START, &when_kernel_started );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_END, &when_kernel_ended );

        assert( ret == CL_SUCCESS ); // all ret's should be 1

         double const kernel_time_elapsed = when_kernel_ended - when_kernel_started;
        exec_time_drift.push_back(kernel_time_elapsed);
        if( ll > 5 ) {
          num_of_turns_drift += 1.0;
          average_execution_time_drift += (kernel_time_elapsed - average_execution_time_drift)/num_of_turns_drift;
      }
    //  break;
    }
    case 2:
    {

    offset = 250;
  //  cl::Event event;
    cl::Kernel track_drift_exact_particle(program, "track_drift_exact_particle");
    blockSize = track_drift_exact_particle.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( *ptr_selected_device);// determine the work-group size
    numThreads = ((NUM_PARTICLES+blockSize-1)/blockSize) * blockSize; // rounding off NUM_PARTICLES to the next nearest multiple of blockSize. This is to ensure that there are integer number of work-groups launched
    std::cout << blockSize << " " << numThreads<< std::endl;
    track_drift_exact_particle.setArg(0,B);
    track_drift_exact_particle.setArg(1,C);
    track_drift_exact_particle.setArg(2,NUM_PARTICLES);
    track_drift_exact_particle.setArg(3,NUM_TURNS);
    track_drift_exact_particle.setArg(4,offset);
    queue.enqueueNDRangeKernel(
    track_drift_exact_particle, cl::NullRange, cl::NDRange( numThreads ),
    cl::NDRange(blockSize ), nullptr, &event);
    queue.flush();
    event.wait();
    queue.finish();
        cl_ulong when_kernel_queued    = 0;
        cl_ulong when_kernel_submitted = 0;
        cl_ulong when_kernel_started   = 0;
        cl_ulong when_kernel_ended     = 0;

        ret  = event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_QUEUED, &when_kernel_queued );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_SUBMIT, &when_kernel_submitted );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_START, &when_kernel_started );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_END, &when_kernel_ended );

        assert( ret == CL_SUCCESS ); // all ret's should be 1

        double const kernel_time_elapsed = when_kernel_ended - when_kernel_started;
        exec_time_drift_exact.push_back(kernel_time_elapsed);
        if( ll > 5 ) {
          num_of_turns_drift_exact += 1.0;
          average_execution_time_drift_exact += (kernel_time_elapsed - average_execution_time_drift_exact)/num_of_turns_drift_exact;
      }
    //break;
    }
    case 3:
    {
    offset = 500;
   // cl::Event event;
    cl::Kernel track_cavity_particle(program, "track_cavity_particle");
    blockSize = track_cavity_particle.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( *ptr_selected_device);// determine the work-group size
    numThreads = ((NUM_PARTICLES+blockSize-1)/blockSize) * blockSize; // rounding off NUM_PARTICLES to the next nearest multiple of blockSize. This is to ensure that there are integer number of work-groups launched
    std::cout << blockSize << " " << numThreads<< std::endl;
    track_cavity_particle.setArg(0,B);
    track_cavity_particle.setArg(1,C);
    track_cavity_particle.setArg(2,NUM_PARTICLES);
    track_cavity_particle.setArg(3,NUM_TURNS);
    track_cavity_particle.setArg(4,offset);
    queue.enqueueNDRangeKernel(
    track_cavity_particle, cl::NullRange, cl::NDRange( numThreads ),
    cl::NDRange(blockSize ), nullptr, &event);
    queue.flush();
    event.wait();
    queue.finish();
        cl_ulong when_kernel_queued    = 0;
        cl_ulong when_kernel_submitted = 0;
        cl_ulong when_kernel_started   = 0;
        cl_ulong when_kernel_ended     = 0;

        ret  = event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_QUEUED, &when_kernel_queued );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_SUBMIT, &when_kernel_submitted );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_START, &when_kernel_started );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_END, &when_kernel_ended );

        assert( ret == CL_SUCCESS ); // all ret's should be 1

        double const kernel_time_elapsed = when_kernel_ended - when_kernel_started;
        exec_time_cavity.push_back(kernel_time_elapsed);
        if( ll > 5 ) {
          num_of_turns_cavity += 1.0;
          average_execution_time_cavity += (kernel_time_elapsed - average_execution_time_cavity)/num_of_turns_cavity;
      }
  //  break;
    }
    case 4:
    {
    //cl::Event event;
    offset = 750;
    cl::Kernel track_align_particle(program, "track_align_particle");
    blockSize = track_align_particle.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >( *ptr_selected_device);// determine the work-group size
    numThreads = ((NUM_PARTICLES+blockSize-1)/blockSize) * blockSize; // rounding off NUM_PARTICLES to the next nearest multiple of blockSize. This is to ensure that there are integer number of work-groups launched
    std::cout << blockSize << " " << numThreads<< std::endl;
    track_align_particle.setArg(0,B);
    track_align_particle.setArg(1,C);
    track_align_particle.setArg(2,NUM_PARTICLES);
    track_align_particle.setArg(3,NUM_TURNS);
    track_align_particle.setArg(4,offset);
    queue.enqueueNDRangeKernel(
    track_align_particle, cl::NullRange, cl::NDRange( numThreads ),
    cl::NDRange(blockSize ), nullptr, &event);
    queue.flush();
    event.wait();
    queue.finish();
        cl_ulong when_kernel_queued    = 0;
        cl_ulong when_kernel_submitted = 0;
        cl_ulong when_kernel_started   = 0;
        cl_ulong when_kernel_ended     = 0;

        ret  = event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_QUEUED, &when_kernel_queued );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_SUBMIT, &when_kernel_submitted );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_START, &when_kernel_started );

        ret |= event.getProfilingInfo< cl_ulong >(
          CL_PROFILING_COMMAND_END, &when_kernel_ended );

        assert( ret == CL_SUCCESS ); // all ret's should be 1

         double const kernel_time_elapsed = when_kernel_ended - when_kernel_started;
        exec_time_align.push_back(kernel_time_elapsed);
        if( ll > 5 ) {
          num_of_turns_align += 1.0;
          average_execution_time_align += (kernel_time_elapsed - average_execution_time_align)/num_of_turns_align;
      }
   // break;
    }
 }; // end of switch case
      queue.enqueueReadBuffer(C, CL_TRUE, 0, copy_particles_buffer_host.size() * sizeof(uint8_t), copy_particles_buffer_host.data());
      queue.flush();

    //st_Blocks copy_particles_buffer;
    st_Blocks_preset( &copy_particles_buffer );

    ret = st_Blocks_unserialize( &copy_particles_buffer, copy_particles_buffer_host.data() );
    assert( ret == 0 );

    /* on the GPU, these pointers will have __global as a decorator */

#if 0
    // On the CPU after copying the data back from the GPU
    std::cout << "\n On the Host, after applying the drift_track_particles mapping and copying from the GPU\n";

    SIXTRL_GLOBAL_DEC st_BlockInfo const* itr  =
        st_Blocks_get_const_block_infos_begin( &copy_particles_buffer );

    SIXTRL_GLOBAL_DEC st_BlockInfo const* endr =
        st_Blocks_get_const_block_infos_end( &copy_particles_buffer );

    for( ; itr != endr ; ++itr )
    {
        SIXTRL_GLOBAL_DEC st_Particles const* particles =
            ( SIXTRL_GLOBAL_DEC st_Particles const* )itr->begin;

        std::cout.precision( 4 );

        for( st_block_size_t ii = 0 ; ii < NUM_PARTICLES ; ++ii )
        {
            std::cout << " ii    = " << std::setw( 6 ) << ii
                      << std::fixed
                      << " | s     = " << std::setw( 6 ) << particles->s[ ii ]
                      << " | x     = " << std::setw( 6 ) << particles->x[ ii ]
                      << " | y     = " << std::setw( 6 ) << particles->y[ ii ]
                      << " | px    = " << std::setw( 6 ) << particles->px[ ii ]
                      << " | py    = " << std::setw( 6 ) << particles->py[ ii ]
                      << " | sigma = " << std::setw( 6 ) << particles->sigma[ ii ]
                      << " | rpp   = " << std::setw( 6 ) << particles->rpp[ ii ]
                      << " | rvv   = " << std::setw( 6 ) << particles->rvv[ ii ]
                      << "\r\n";
        }
    }

#endif
    std::cout.flush();
    st_Blocks_free( &particles_buffer );
    st_Blocks_free( &copy_particles_buffer );
  } // end of the NUM_REPETITIONS 'for' loop
  switch(choice)
  {
    case 1:
    {
      // printing the contents of the exec_time vector
    std::cout << "track_drift_particle" << std::endl;
    for(std::vector<double>::iterator it = exec_time_drift.begin(); it != exec_time_drift.end(); ++it)
      printf("%.3f s%c",(*it)*1.0e-9, ",\n"[it+1 == exec_time_drift.end()]);
		printf("Reference Version : Time = %.3f s; \n",average_execution_time_drift*1.0e-9);
    //break;
    }
    case 2:
    {
    std::cout << "track_drift_exact_particle" << std::endl;
    for(std::vector<double>::iterator it = exec_time_drift_exact.begin(); it != exec_time_drift_exact.end(); ++it)
      printf("%.3f s%c",(*it)*1.0e-9, ",\n"[it+1 == exec_time_drift_exact.end()]);
		printf("Reference Version: Time = %.3f s; \n",average_execution_time_drift_exact*1.0e-9);
    //break;
    }
    case 3:
    {
    std::cout << "track_cavity_particle" << std::endl;
    for(std::vector<double>::iterator it = exec_time_cavity.begin(); it != exec_time_cavity.end(); ++it)
      printf("%.3f s%c",(*it)*1.0e-9, ",\n"[it+1 == exec_time_cavity.end()]);
		printf("Reference Version: Time = %.3f s; \n",average_execution_time_cavity*1.0e-9);
   // break;
    }
    case 4:
    {
    std::cout << "track_align_particle" << std::endl;
    for(std::vector<double>::iterator it = exec_time_align.begin(); it != exec_time_align.end(); ++it)
      printf("%.3f s%c",(*it)*1.0e-9, ",\n"[it+1 == exec_time_align.end()]);
		printf("Reference Version: Time = %.3f s; \n",average_execution_time_align*1.0e-9);
    break;
    }
  };
    return 0;

  }
コード例 #6
0
ファイル: converter.cpp プロジェクト: bollig/viennacl-dev
/** Upper-cases one character for the include-guard macro name.
 *
 *  The character is converted to unsigned char before the call because
 *  passing a negative value (other than EOF) to toupper() is undefined.
 */
static char kernelGuardToUpper(char c)
{
    return static_cast<char>(toupper(static_cast<unsigned char>(c)));
}

/** Emits the kernel initialisation code for one precision.
 *
 *  Iterates over the entries of the directory @p dirname and calls
 *  writeKernelInit() for every sub-directory whose name contains "align",
 *  followed by one blank line in @p kernel_file. If @p dirname is not a
 *  directory, a message is printed to std::cerr and nothing is written.
 *
 *  @param kernel_file       Stream of the header being generated.
 *  @param dirname           Directory to scan; also forwarded to writeKernelInit().
 *  @param single_precision  Forwarded unchanged as the last argument of
 *                           writeKernelInit() (true for the single precision
 *                           section, false for the double precision section).
 */
static void writeKernelInitsForPrecision(std::ofstream & kernel_file,
                                         const char * dirname,
                                         bool single_precision)
{
    std::string dir(dirname);
    fs::path filepath = fs::system_complete( fs::path( dir ) );
    if ( !fs::is_directory( filepath ) )
    {
        std::cerr << "Cannot access directory " << dirname << std::endl;
        return;
    }

    fs::directory_iterator end_iter;
    for ( fs::directory_iterator alignment_itr( filepath );
          alignment_itr != end_iter;
          ++alignment_itr )
    {
        // Only sub-directories are candidates; plain files are ignored.
        if ( !fs::is_directory( alignment_itr->path() ) )
            continue;

#ifdef USE_OLD_BOOST_FILESYSTEM_VERSION
        std::string subfolder = alignment_itr->path().filename();
#else
        std::string subfolder = alignment_itr->path().filename().string();
#endif
        // Skip sub-directories that are not alignment folders.
        if ( subfolder.find("align") == std::string::npos )
            continue;

        writeKernelInit(kernel_file, dirname, subfolder, single_precision);
    }
    kernel_file << std::endl;
}

/** Generates the header "<dirname>_kernels.h" for one kernel directory.
 *
 *  The file is created below "@PROJECT_BINARY_DIR@/viennacl/linalg/kernels/"
 *  and consists of an include guard, the required includes, the opening of
 *  the viennacl::linalg::kernels namespaces, a forward declaration of the
 *  struct template named after @p dirname, the single and double precision
 *  sections produced via writeKernelInit(), and the closing namespaces.
 *
 *  If the output file cannot be opened, an error is printed to std::cerr and
 *  the function returns without generating anything.
 *
 *  @param dirname  Name of the kernel directory; used as the directory to scan,
 *                  as the struct name, and as the stem of the generated file.
 */
void createKernelFile(const char * dirname)
{
    //Step 1: Open kernel file
    std::string header_name(dirname);
    std::string kernel_file_path = "@PROJECT_BINARY_DIR@/viennacl/linalg/kernels/" + header_name + "_kernels.h";
    std::ofstream kernel_file(kernel_file_path.c_str());
    if ( !kernel_file )
    {
        // Without this check every write below would be silently discarded.
        std::cerr << "Cannot open kernel header file for writing: " << kernel_file_path << std::endl;
        return;
    }

    //Step 2: Write kernel header file preamble
    std::string dirname_uppercase(dirname);
    std::transform(dirname_uppercase.begin(), dirname_uppercase.end(), dirname_uppercase.begin(), kernelGuardToUpper);
    kernel_file << "#ifndef VIENNACL_" << dirname_uppercase << "_KERNELS_HPP_" << std::endl;
    kernel_file << "#define VIENNACL_" << dirname_uppercase << "_KERNELS_HPP_" << std::endl;
    kernel_file << "#include \"viennacl/tools/tools.hpp\"" << std::endl;
    kernel_file << "#include \"viennacl/ocl/kernel.hpp\"" << std::endl;
    kernel_file << "#include \"viennacl/ocl/platform.hpp\"" << std::endl;
    kernel_file << "#include \"viennacl/ocl/utils.hpp\"" << std::endl;
    kernel_file << "#include \"viennacl/linalg/kernels/" << dirname << "_source.h\"" << std::endl;
    kernel_file << std::endl;
    kernel_file << "//Automatically generated file from aux-directory, do not edit manually!" << std::endl;
    kernel_file << "/** @file " << header_name << "_kernels.h" << std::endl;
    kernel_file << " *  @brief OpenCL kernel file, generated automatically from scripts in auxiliary/. */" << std::endl;
    kernel_file << "namespace viennacl" << std::endl;
    kernel_file << "{" << std::endl;
    kernel_file << " namespace linalg" << std::endl;
    kernel_file << " {" << std::endl;
    kernel_file << "  namespace kernels" << std::endl;
    kernel_file << "  {" << std::endl;

    //Step 3: Write class information:
    kernel_file << "   template<class TYPE, unsigned int alignment>" << std::endl;
    kernel_file << "   struct " << dirname << ";" << std::endl << std::endl;

    //Step 4: Write single precision kernels
    kernel_file << std::endl << "    /////////////// single precision kernels //////////////// " << std::endl;
    writeKernelInitsForPrecision(kernel_file, dirname, true);

    //Step 5: Write double precision kernels
    kernel_file << std::endl << "    /////////////// double precision kernels //////////////// " << std::endl;
    writeKernelInitsForPrecision(kernel_file, dirname, false);

    //Final Step: Write file tail:
    kernel_file << "  }  //namespace kernels" << std::endl;
    kernel_file << " }  //namespace linalg" << std::endl;
    kernel_file << "}  //namespace viennacl" << std::endl;
    kernel_file << "#endif" << std::endl;
    kernel_file << std::endl;
    kernel_file.close();
}