Esempio n. 1
0
size_t test_global_to_local_ids(unsigned num_ids, unsigned capacity, unsigned num_find_iterations)
{

  typedef Device execution_space;
  typedef typename execution_space::size_type size_type;

  typedef Kokkos::View<uint32_t*,execution_space> local_id_view;
  typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view;

  double elasped_time = 0;
  Kokkos::Timer timer;

  local_id_view local_2_global("local_ids", num_ids);
  global_id_view global_2_local(capacity);

  int shiftw = 15;

  //create
  elasped_time = timer.seconds();
  std::cout << std::setw(shiftw) <<  "allocate: " <<  elasped_time << std::endl;
  timer.reset();

  // generate unique ids
  {
    generate_ids<Device> gen(local_2_global);
  }

  // generate
  elasped_time = timer.seconds();
  std::cout << std::setw(shiftw) << "generate: " <<  elasped_time << std::endl;
  timer.reset();

  {
    fill_map<Device> fill(global_2_local, local_2_global);
  }

  // fill
  elasped_time = timer.seconds();
  std::cout << std::setw(shiftw) << "fill: " <<  elasped_time << std::endl;
  timer.reset();


  size_t num_errors = global_2_local.failed_insert();

  if (num_errors == 0u) {
    for (unsigned i=0; i<num_find_iterations; ++i)
    {
      find_test<Device> find(global_2_local, local_2_global,num_errors);
    }

    // find
    elasped_time = timer.seconds();
    std::cout << std::setw(shiftw) << "lookup: " <<  elasped_time << std::endl;
  }
  else {
    std::cout << "    !!! Fill Failed !!!" << std::endl;
  }

  return num_errors;
}
Esempio n. 2
0
int main(int narg, char* arg[]) {
  Kokkos::initialize(narg,arg);

  int size = 1000000;

  // Create Views
  idx_type idx("Idx",size,64);
  view_type dest("Dest",size);
  view_type src("Src",size);

  srand(134231);

  Kokkos::fence();

  // When using UVM Cuda views can be accessed on the Host directly
  for(int i=0; i<size; i++) {
    for(int j=0; j<int(idx.dimension_1()); j++)
      idx(i,j) = (size + i + (rand()%500 - 250))%size;
  }

  Kokkos::fence();
  // Run on the device
  // This will cause a sync of idx to the device since it was modified on the host
  Kokkos::Timer timer;
  Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
  Kokkos::fence();
  double sec1_dev = timer.seconds();

  // No data transfer will happen now, since nothing is accessed on the host
  timer.reset();
  Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
  Kokkos::fence();
  double sec2_dev = timer.seconds();

  // Run on the host
  // This will cause a sync back to the host of dest which was changed on the device
  // Compare runtime here with the dual_view example: dest will be copied back in 4k blocks
  // when they are accessed the first time during the parallel_for. Due to the latency of a memcpy
  // this gives lower effective bandwidth when doing a manual copy via dual views
  timer.reset();
  Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
  Kokkos::fence();
  double sec1_host = timer.seconds();

  // No data transfers will happen now
  timer.reset();
  Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
  Kokkos::fence();
  double sec2_host = timer.seconds();



  printf("Device Time with Sync: %e without Sync: %e \n",sec1_dev,sec2_dev);
  printf("Host   Time with Sync: %e without Sync: %e \n",sec1_host,sec2_host);

  Kokkos::finalize();
}
Esempio n. 3
0
int main(int argc, char* args[]) {
  if (argc != 3){
	printf("Please pass two integers on the command line\n");
  }
  else {

  // Initialize Kokkos
  Kokkos::initialize(argc,args);
  int size = atoi(args[1]);
  int samples = atoi(args[2]);

  // Create two random number generator pools one for 64bit states and one for 1024 bit states
  // Both take an 64 bit unsigned integer seed to initialize a Random_XorShift64 generator which
  // is used to fill the generators of the pool.
  Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857);
  Kokkos::Random_XorShift1024_Pool<> rand_pool1024(5374857);
  Kokkos::DualView<uint64_t*> vals("Vals",size*samples);

  // Run some performance comparisons
  Kokkos::Timer timer;
  Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift64_Pool<> >(vals.d_view,rand_pool64,samples));
  Kokkos::fence();

  timer.reset();
  Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift64_Pool<> >(vals.d_view,rand_pool64,samples));
  Kokkos::fence();
  double time_64 = timer.seconds();

  Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift1024_Pool<> >(vals.d_view,rand_pool1024,samples));
  Kokkos::fence();

  timer.reset();
  Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift1024_Pool<> >(vals.d_view,rand_pool1024,samples));
  Kokkos::fence();
  double time_1024 = timer.seconds();

  printf("#Time XorShift64*:   %e %e\n",time_64,1.0e-9*samples*size/time_64 );
  printf("#Time XorShift1024*: %e %e\n",time_1024,1.0e-9*samples*size/time_1024 );

  Kokkos::deep_copy(vals.h_view,vals.d_view);

  Kokkos::finalize();
  }
  return 0;
}
Esempio n. 4
0
extern "C" void kokkosp_init_library(const int loadSeq,
  const uint64_t interfaceVer,
  const uint32_t devInfoCount,
  void* deviceInfo) {

  num_spaces = 0;
  for(int i=0; i<16; i++)
    space_size[i] = 0;
  
  timer.reset();
}
Esempio n. 5
0
void test_dynrankview_op_perf( const int par_size )
{

  typedef DeviceType execution_space;
  typedef typename execution_space::size_type size_type;
  const size_type dim2 = 900;
  const size_type dim3 = 300;

  double elapsed_time_view = 0;
  double elapsed_time_compview = 0;
  double elapsed_time_strideview = 0;
  double elapsed_time_view_rank7 = 0;
  double elapsed_time_drview = 0;
  double elapsed_time_compdrview = 0;
  Kokkos::Timer timer;
  {
    Kokkos::View<double***,DeviceType> testview("testview",par_size,dim2,dim3);
    typedef InitViewFunctor<DeviceType> FunctorType;

    timer.reset();
    Kokkos::RangePolicy<DeviceType> policy(0,par_size);
    Kokkos::parallel_for( policy , FunctorType(testview) );
    DeviceType::fence();
    elapsed_time_view = timer.seconds();
    std::cout << " View time (init only): " << elapsed_time_view << std::endl;


    timer.reset();
    Kokkos::View<double*,DeviceType> sumview("sumview",par_size);
    Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testview, sumview) );
    DeviceType::fence();
    elapsed_time_compview = timer.seconds();
    std::cout << " View sum computation time: " << elapsed_time_view << std::endl;


    Kokkos::View<double***,Kokkos::LayoutStride, DeviceType> teststrideview = Kokkos::subview(testview, Kokkos::ALL, Kokkos::ALL,Kokkos::ALL);
    typedef InitStrideViewFunctor<DeviceType> FunctorStrideType;

    timer.reset();
    Kokkos::parallel_for( policy , FunctorStrideType(teststrideview) );
    DeviceType::fence();
    elapsed_time_strideview = timer.seconds();
    std::cout << " Strided View time (init only): " << elapsed_time_strideview << std::endl;
  }
  {
    Kokkos::View<double*******,DeviceType> testview("testview",par_size,dim2,dim3,1,1,1,1);
    typedef InitViewRank7Functor<DeviceType> FunctorType;

    timer.reset();
    Kokkos::RangePolicy<DeviceType> policy(0,par_size);
    Kokkos::parallel_for( policy , FunctorType(testview) );
    DeviceType::fence();
    elapsed_time_view_rank7 = timer.seconds();
    std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 << std::endl;
  }
  {
    Kokkos::DynRankView<double,DeviceType> testdrview("testdrview",par_size,dim2,dim3);
    typedef InitDynRankViewFunctor<DeviceType> FunctorType;

    timer.reset();
    Kokkos::RangePolicy<DeviceType> policy(0,par_size);
    Kokkos::parallel_for( policy , FunctorType(testdrview) );
    DeviceType::fence();
    elapsed_time_drview = timer.seconds();
    std::cout << " DynRankView time (init only): " << elapsed_time_drview << std::endl;

    timer.reset();
    Kokkos::DynRankView<double,DeviceType> sumview("sumview",par_size);
    Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testdrview, sumview) );
    DeviceType::fence();
    elapsed_time_compdrview = timer.seconds();
    std::cout << " DynRankView sum computation time: " << elapsed_time_compdrview << std::endl;

  }

  std::cout << " Ratio of View to DynRankView time: " << elapsed_time_view / elapsed_time_drview << std::endl; //expect < 1
  std::cout << " Ratio of View to DynRankView sum computation time: " << elapsed_time_compview / elapsed_time_compdrview << std::endl; //expect < 1
  std::cout << " Ratio of View to View Rank7  time: " << elapsed_time_view / elapsed_time_view_rank7 << std::endl; //expect < 1
  std::cout << " Ratio of StrideView to DynRankView time: " << elapsed_time_strideview / elapsed_time_drview << std::endl; //expect < 1
  std::cout << " Ratio of DynRankView to View Rank7  time: " << elapsed_time_drview / elapsed_time_view_rank7 << std::endl; //expect ?

  timer.reset();

} //end test_dynrankview
Esempio n. 6
0
void sort_array( const size_t array_length /* length of spans of array to sort */
               , const size_t total_length /* total length of array */
               , const int print = 1 )
{
  typedef Device execution_space ;
  typedef Kokkos::View<int*,Device>  device_array_type ;

#if defined( KOKKOS_HAVE_CUDA )

  typedef typename
    Kokkos::Impl::if_c< Kokkos::Impl::is_same< Device , Kokkos::Cuda >::value
                      , Kokkos::View<int*,Kokkos::Cuda::array_layout,Kokkos::CudaHostPinnedSpace>
                      , typename device_array_type::HostMirror
                      >::type  host_array_type ;

#else

  typedef typename device_array_type::HostMirror  host_array_type ;

#endif

  Kokkos::Timer timer;

  const device_array_type  work_array("work_array" , array_length );
  const host_array_type    host_array("host_array" , total_length );

  std::cout << "sort_array length( " << total_length << " )"
            << " in chunks( " << array_length << " )"
            << std::endl ;

  double sec = timer.seconds();
  std::cout << "declaring Views took "
            << sec << " seconds" << std::endl;
  timer.reset();

  for ( size_t i = 0 ; i < total_length ; ++i ) {
    host_array(i) = ( lrand48() * total_length ) >> 31 ;
  }

  sec = timer.seconds();
  std::cout << "initializing " << total_length << " elements on host took "
            << sec << " seconds" << std::endl;
  timer.reset();

  double sec_copy_in  = 0 ;
  double sec_sort     = 0 ;
  double sec_copy_out = 0 ;
  double sec_error    = 0 ;
  size_t error_count  = 0 ;

  for ( size_t begin = 0 ; begin < total_length ; begin += array_length ) {

    const size_t end = begin + array_length < total_length
                     ? begin + array_length : total_length ;

    const std::pair<size_t,size_t> host_range(begin,end);

    const host_array_type host_subarray = Kokkos::subview( host_array , host_range );

    timer.reset();

    Kokkos::deep_copy( work_array , host_subarray );

    sec_copy_in += timer.seconds(); timer.reset();

    SortView< execution_space >( work_array , 0 , end - begin );

    sec_sort += timer.seconds(); timer.reset();

    Kokkos::deep_copy( host_subarray , work_array );

    sec_copy_out += timer.seconds(); timer.reset();

    for ( size_t i = begin + 1 ; i < end ; ++i ) {
      if ( host_array(i) < host_array(i-1) ) ++error_count ;
    }

    sec_error += timer.seconds(); timer.reset();
  }

  std::cout << "copy to   device " << sec_copy_in  << " seconds" << std::endl
            << "sort on   device " << sec_sort     << " seconds" << std::endl
            << "copy from device " << sec_copy_out << " seconds" << std::endl
            << "errors " << error_count << " took " << sec_error << " seconds" << std::endl
            ;
}