Esempio n. 1
0
size_t test_global_to_local_ids(unsigned num_ids, unsigned capacity, unsigned num_find_iterations)
{

  typedef Device execution_space;
  typedef typename execution_space::size_type size_type;

  typedef Kokkos::View<uint32_t*,execution_space> local_id_view;
  typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view;

  double elasped_time = 0;
  Kokkos::Timer timer;

  local_id_view local_2_global("local_ids", num_ids);
  global_id_view global_2_local(capacity);

  int shiftw = 15;

  //create
  elasped_time = timer.seconds();
  std::cout << std::setw(shiftw) <<  "allocate: " <<  elasped_time << std::endl;
  timer.reset();

  // generate unique ids
  {
    generate_ids<Device> gen(local_2_global);
  }

  // generate
  elasped_time = timer.seconds();
  std::cout << std::setw(shiftw) << "generate: " <<  elasped_time << std::endl;
  timer.reset();

  {
    fill_map<Device> fill(global_2_local, local_2_global);
  }

  // fill
  elasped_time = timer.seconds();
  std::cout << std::setw(shiftw) << "fill: " <<  elasped_time << std::endl;
  timer.reset();


  size_t num_errors = global_2_local.failed_insert();

  if (num_errors == 0u) {
    for (unsigned i=0; i<num_find_iterations; ++i)
    {
      find_test<Device> find(global_2_local, local_2_global,num_errors);
    }

    // find
    elasped_time = timer.seconds();
    std::cout << std::setw(shiftw) << "lookup: " <<  elasped_time << std::endl;
  }
  else {
    std::cout << "    !!! Fill Failed !!!" << std::endl;
  }

  return num_errors;
}
Esempio n. 2
0
// Times the localsum kernel twice on the device and twice on the host over
// the same Views, printing the first ("with Sync") and second ("without
// Sync") run time for each execution space.
int main(int narg, char* arg[]) {
  Kokkos::initialize(narg,arg);

  // Every View lives inside this scope so that it is destroyed before
  // Kokkos::finalize() runs; Kokkos objects must not outlive finalize().
  {
    int size = 1000000;

    // Create Views
    idx_type idx("Idx",size,64);
    view_type dest("Dest",size);
    view_type src("Src",size);

    srand(134231);

    Kokkos::fence();

    // When using UVM Cuda views can be accessed on the Host directly
    for(int i=0; i<size; i++) {
      for(int j=0; j<int(idx.dimension_1()); j++)
        idx(i,j) = (size + i + (rand()%500 - 250))%size;
    }

    Kokkos::fence();
    // Run on the device
    // This will cause a sync of idx to the device since it was modified on the host
    Kokkos::Timer timer;
    Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
    Kokkos::fence();
    double sec1_dev = timer.seconds();

    // No data transfer will happen now, since nothing is accessed on the host
    timer.reset();
    Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
    Kokkos::fence();
    double sec2_dev = timer.seconds();

    // Run on the host
    // This will cause a sync back to the host of dest which was changed on the device
    // Compare runtime here with the dual_view example: dest will be copied back in 4k blocks
    // when they are accessed the first time during the parallel_for. Due to the latency of a memcpy
    // this gives lower effective bandwidth when doing a manual copy via dual views
    timer.reset();
    Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
    Kokkos::fence();
    double sec1_host = timer.seconds();

    // No data transfers will happen now
    timer.reset();
    Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
    Kokkos::fence();
    double sec2_host = timer.seconds();

    printf("Device Time with Sync: %e without Sync: %e \n",sec1_dev,sec2_dev);
    printf("Host   Time with Sync: %e without Sync: %e \n",sec1_host,sec2_host);
  }

  Kokkos::finalize();
}
Esempio n. 3
0
// Compares the throughput of the XorShift64 and XorShift1024 generator pools.
// Expects two integers on the command line: args[1] is the parallel range
// ("size") and args[2] the number of samples per work item; the result View
// holds size*samples values. Prints a usage hint when the argument count is
// wrong. Always returns 0.
int main(int argc, char* args[]) {
  if (argc != 3){
	printf("Please pass two integers on the command line\n");
  }
  else {

  // Initialize Kokkos
  Kokkos::initialize(argc,args);
  int size = atoi(args[1]);
  int samples = atoi(args[2]);

  // The pools and the DualView live in this inner scope so that they are
  // destroyed before Kokkos::finalize(); Kokkos objects must not outlive it.
  {
  // Create two random number generator pools one for 64bit states and one for 1024 bit states
  // Both take an 64 bit unsigned integer seed to initialize a Random_XorShift64 generator which
  // is used to fill the generators of the pool.
  Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857);
  Kokkos::Random_XorShift1024_Pool<> rand_pool1024(5374857);
  Kokkos::DualView<uint64_t*> vals("Vals",size*samples);

  // Run some performance comparisons
  // Each generator is launched twice; only the second launch is timed.
  Kokkos::Timer timer;
  Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift64_Pool<> >(vals.d_view,rand_pool64,samples));
  Kokkos::fence();

  timer.reset();
  Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift64_Pool<> >(vals.d_view,rand_pool64,samples));
  Kokkos::fence();
  double time_64 = timer.seconds();

  Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift1024_Pool<> >(vals.d_view,rand_pool1024,samples));
  Kokkos::fence();

  timer.reset();
  Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift1024_Pool<> >(vals.d_view,rand_pool1024,samples));
  Kokkos::fence();
  double time_1024 = timer.seconds();

  // Second column is 1e-9 * (samples*size) / seconds.
  printf("#Time XorShift64*:   %e %e\n",time_64,1.0e-9*samples*size/time_64 );
  printf("#Time XorShift1024*: %e %e\n",time_1024,1.0e-9*samples*size/time_1024 );

  Kokkos::deep_copy(vals.h_view,vals.d_view);
  }

  Kokkos::finalize();
  }
  return 0;
}
Esempio n. 4
0
// Profiling-library initialization hook: clears the count of known memory
// spaces, zeroes the first 16 per-space byte counters and restarts the shared
// timer. None of the hook's arguments are used.
extern "C" void kokkosp_init_library(const int loadSeq,
  const uint64_t interfaceVer,
  const uint32_t devInfoCount,
  void* deviceInfo) {

  num_spaces = 0;

  for(int slot = 0; slot != 16; ++slot) {
    space_size[slot] = 0;
  }

  timer.reset();
}
// Fills a DualView with pseudo-random values in [0, TEAM_SIZE), runs the
// find_2_tuples team kernel over it in chunks and prints the kernel time, the
// resulting TEAM_SIZE x TEAM_SIZE histogram and the sum of its entries next
// to chunk_size*nchunks.
int main(int narg, char* args[]) {
  Kokkos::initialize(narg,args);

  // The DualViews live in this inner scope so that they are destroyed before
  // Kokkos::finalize(); Kokkos objects must not outlive it.
  {
    int chunk_size = 1024;
    int nchunks = 100000; //1024*1024;
    Kokkos::DualView<int*> data("data",nchunks*chunk_size+1);

    srand(1231093);

    for(int i = 0; i < (int) data.dimension_0(); i++) {
      data.h_view(i) = rand()%TEAM_SIZE;
    }
    // Mark the host copy as modified and push it to the device.
    data.modify<Host>();
    data.sync<Device>();

    Kokkos::DualView<int**> histogram("histogram",TEAM_SIZE,TEAM_SIZE);


    Kokkos::Timer timer;
    // threads/team is automatically limited to maximum supported by the device.
    Kokkos::parallel_for( team_policy( nchunks , TEAM_SIZE )
                        , find_2_tuples(chunk_size,data,histogram) );
    Kokkos::fence();
    double time = timer.seconds();

    histogram.sync<Host>();

    printf("Time: %f \n\n",time);
    int sum = 0;
    for(int k=0; k<TEAM_SIZE; k++) {
      for(int l=0; l<TEAM_SIZE; l++) {
        printf("%i ",histogram.h_view(k,l));
        sum += histogram.h_view(k,l);
      }
      printf("\n");
    }
    printf("Result: %i %i\n",sum,chunk_size*nchunks);
  }

  Kokkos::finalize();
}
Esempio n. 6
0
extern "C" void kokkosp_allocate_data(const SpaceHandle space, const char* label, const void* const ptr, const uint64_t size) {
  std::lock_guard<std::mutex> lock(m);
 
  double time = timer.seconds();
  
  int space_i = num_spaces;
  for(int s = 0; s<num_spaces; s++)
    if(strcmp(space_name[s],space.name)==0)
      space_i = s;

  if(space_i == num_spaces) {
    strncpy(space_name[num_spaces],space.name,64);
    num_spaces++;
  }
  space_size[space_i] += size;
  space_size_track[space_i].push_back(std::make_tuple(time,space_size[space_i],max_mem_usage()));
}
Esempio n. 7
0
// Compares initialization and sum-computation timings of a rank-3 View, a
// LayoutStride subview of it, a rank-7 View and a DynRankView, all with
// extents (par_size, 900, 300). Each measurement runs one parallel_for over
// [0, par_size) followed by a device fence. The individual times and several
// ratios between them are printed to std::cout.
//
//   par_size - leading extent of every view and length of the parallel range
void test_dynrankview_op_perf( const int par_size )
{

  typedef DeviceType execution_space;
  typedef typename execution_space::size_type size_type;
  const size_type dim2 = 900;
  const size_type dim3 = 300;

  double elapsed_time_view = 0;
  double elapsed_time_compview = 0;
  double elapsed_time_strideview = 0;
  double elapsed_time_view_rank7 = 0;
  double elapsed_time_drview = 0;
  double elapsed_time_compdrview = 0;
  Kokkos::Timer timer;
  {
    Kokkos::View<double***,DeviceType> testview("testview",par_size,dim2,dim3);
    typedef InitViewFunctor<DeviceType> FunctorType;

    timer.reset();
    Kokkos::RangePolicy<DeviceType> policy(0,par_size);
    Kokkos::parallel_for( policy , FunctorType(testview) );
    DeviceType::fence();
    elapsed_time_view = timer.seconds();
    std::cout << " View time (init only): " << elapsed_time_view << std::endl;


    // The timed region includes the allocation of sumview.
    timer.reset();
    Kokkos::View<double*,DeviceType> sumview("sumview",par_size);
    Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testview, sumview) );
    DeviceType::fence();
    elapsed_time_compview = timer.seconds();
    // Report the sum-computation time just measured (the init time was
    // printed here by mistake before).
    std::cout << " View sum computation time: " << elapsed_time_compview << std::endl;


    Kokkos::View<double***,Kokkos::LayoutStride, DeviceType> teststrideview = Kokkos::subview(testview, Kokkos::ALL, Kokkos::ALL,Kokkos::ALL);
    typedef InitStrideViewFunctor<DeviceType> FunctorStrideType;

    timer.reset();
    Kokkos::parallel_for( policy , FunctorStrideType(teststrideview) );
    DeviceType::fence();
    elapsed_time_strideview = timer.seconds();
    std::cout << " Strided View time (init only): " << elapsed_time_strideview << std::endl;
  }
  {
    // Same data volume as the rank-3 case: the four trailing extents are 1.
    Kokkos::View<double*******,DeviceType> testview("testview",par_size,dim2,dim3,1,1,1,1);
    typedef InitViewRank7Functor<DeviceType> FunctorType;

    timer.reset();
    Kokkos::RangePolicy<DeviceType> policy(0,par_size);
    Kokkos::parallel_for( policy , FunctorType(testview) );
    DeviceType::fence();
    elapsed_time_view_rank7 = timer.seconds();
    std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 << std::endl;
  }
  {
    Kokkos::DynRankView<double,DeviceType> testdrview("testdrview",par_size,dim2,dim3);
    typedef InitDynRankViewFunctor<DeviceType> FunctorType;

    timer.reset();
    Kokkos::RangePolicy<DeviceType> policy(0,par_size);
    Kokkos::parallel_for( policy , FunctorType(testdrview) );
    DeviceType::fence();
    elapsed_time_drview = timer.seconds();
    std::cout << " DynRankView time (init only): " << elapsed_time_drview << std::endl;

    // The timed region includes the allocation of sumview.
    timer.reset();
    Kokkos::DynRankView<double,DeviceType> sumview("sumview",par_size);
    Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testdrview, sumview) );
    DeviceType::fence();
    elapsed_time_compdrview = timer.seconds();
    std::cout << " DynRankView sum computation time: " << elapsed_time_compdrview << std::endl;

  }

  std::cout << " Ratio of View to DynRankView time: " << elapsed_time_view / elapsed_time_drview << std::endl; //expect < 1
  std::cout << " Ratio of View to DynRankView sum computation time: " << elapsed_time_compview / elapsed_time_compdrview << std::endl; //expect < 1
  std::cout << " Ratio of View to View Rank7  time: " << elapsed_time_view / elapsed_time_view_rank7 << std::endl; //expect < 1
  std::cout << " Ratio of StrideView to DynRankView time: " << elapsed_time_strideview / elapsed_time_drview << std::endl; //expect < 1
  std::cout << " Ratio of DynRankView to View Rank7  time: " << elapsed_time_drview / elapsed_time_view_rank7 << std::endl; //expect ?

} //end test_dynrankview
Esempio n. 8
0
void sort_array( const size_t array_length /* length of spans of array to sort */
               , const size_t total_length /* total length of array */
               , const int print = 1 )
{
  typedef Device execution_space ;
  typedef Kokkos::View<int*,Device>  device_array_type ;

#if defined( KOKKOS_HAVE_CUDA )

  typedef typename
    Kokkos::Impl::if_c< Kokkos::Impl::is_same< Device , Kokkos::Cuda >::value
                      , Kokkos::View<int*,Kokkos::Cuda::array_layout,Kokkos::CudaHostPinnedSpace>
                      , typename device_array_type::HostMirror
                      >::type  host_array_type ;

#else

  typedef typename device_array_type::HostMirror  host_array_type ;

#endif

  Kokkos::Timer timer;

  const device_array_type  work_array("work_array" , array_length );
  const host_array_type    host_array("host_array" , total_length );

  std::cout << "sort_array length( " << total_length << " )"
            << " in chunks( " << array_length << " )"
            << std::endl ;

  double sec = timer.seconds();
  std::cout << "declaring Views took "
            << sec << " seconds" << std::endl;
  timer.reset();

  for ( size_t i = 0 ; i < total_length ; ++i ) {
    host_array(i) = ( lrand48() * total_length ) >> 31 ;
  }

  sec = timer.seconds();
  std::cout << "initializing " << total_length << " elements on host took "
            << sec << " seconds" << std::endl;
  timer.reset();

  double sec_copy_in  = 0 ;
  double sec_sort     = 0 ;
  double sec_copy_out = 0 ;
  double sec_error    = 0 ;
  size_t error_count  = 0 ;

  for ( size_t begin = 0 ; begin < total_length ; begin += array_length ) {

    const size_t end = begin + array_length < total_length
                     ? begin + array_length : total_length ;

    const std::pair<size_t,size_t> host_range(begin,end);

    const host_array_type host_subarray = Kokkos::subview( host_array , host_range );

    timer.reset();

    Kokkos::deep_copy( work_array , host_subarray );

    sec_copy_in += timer.seconds(); timer.reset();

    SortView< execution_space >( work_array , 0 , end - begin );

    sec_sort += timer.seconds(); timer.reset();

    Kokkos::deep_copy( host_subarray , work_array );

    sec_copy_out += timer.seconds(); timer.reset();

    for ( size_t i = begin + 1 ; i < end ; ++i ) {
      if ( host_array(i) < host_array(i-1) ) ++error_count ;
    }

    sec_error += timer.seconds(); timer.reset();
  }

  std::cout << "copy to   device " << sec_copy_in  << " seconds" << std::endl
            << "sort on   device " << sec_sort     << " seconds" << std::endl
            << "copy from device " << sec_copy_out << " seconds" << std::endl
            << "errors " << error_count << " took " << sec_error << " seconds" << std::endl
            ;
}
void run_allocateview_tests(int N, int R) {
  const int N1 = N;
  const int N2 = N*N;
  const int N3 = N2*N;
  const int N4 = N2*N2;
  const int N8 = N4*N4;

  double time1,time2,time3,time4,time5,time6,time7,time8,time_raw = 100000.0;
  {
    Kokkos::Timer timer;
    for(int r=0; r<R; r++) {
      Kokkos::View<double*,Layout> a("A1",N8);
    }
    time1 = timer.seconds()/R;
  }
  {
    Kokkos::Timer timer;
    for(int r=0; r<R; r++) {
      Kokkos::View<double**,Layout> a("A2",N4,N4);
    }
    time2 = timer.seconds()/R;
  }
  {
    Kokkos::Timer timer;
    for(int r=0; r<R; r++) {
      Kokkos::View<double***,Layout> a("A3",N3,N3,N2);
    }
    time3 = timer.seconds()/R;
  }
  {
    Kokkos::Timer timer;
    for(int r=0; r<R; r++) {
      Kokkos::View<double****,Layout> a("A4",N2,N2,N2,N2);
    }
    time4 = timer.seconds()/R;
  }
  {
    Kokkos::Timer timer;
    for(int r=0; r<R; r++) {
      Kokkos::View<double*****,Layout> a("A5",N2,N2,N1,N1,N2);
    }
    time5 = timer.seconds()/R;
  }
  {
    Kokkos::Timer timer;
    for(int r=0; r<R; r++) {
      Kokkos::View<double******,Layout> a("A6",N2,N1,N1,N1,N1,N2);
    }
    time6 = timer.seconds()/R;
  }
  {
    Kokkos::Timer timer;
    for(int r=0; r<R; r++) {
      Kokkos::View<double*******,Layout> a("A7",N2,N1,N1,N1,N1,N1,N1);
    }
    time7 = timer.seconds()/R;
  }
  {
    Kokkos::Timer timer;
    for(int r=0; r<R; r++) {
      Kokkos::View<double********,Layout> a("A8",N1,N1,N1,N1,N1,N1,N1,N1);
    }
    time8 = timer.seconds()/R;
  }
  #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
  {
    Kokkos::Timer timer;
    for(int r=0;r<R;r++) {
      double* a_ptr = (double*) Kokkos::kokkos_malloc("A", sizeof(double)*N8);
      Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
        a_ptr[i] = 0.0;
      });
      Kokkos::kokkos_free(a_ptr);
    }
    time_raw = timer.seconds()/R;
  }
  #endif
  double size = 1.0*N8*8/1024/1024;
  printf("   Raw:   %lf s   %lf MB   %lf GB/s\n",time_raw,size,size/1024/time_raw);
  printf("   Rank1: %lf s   %lf MB   %lf GB/s\n",time1,size,size/1024/time1);
  printf("   Rank2: %lf s   %lf MB   %lf GB/s\n",time2,size,size/1024/time2);
  printf("   Rank3: %lf s   %lf MB   %lf GB/s\n",time3,size,size/1024/time3);
  printf("   Rank4: %lf s   %lf MB   %lf GB/s\n",time4,size,size/1024/time4);
  printf("   Rank5: %lf s   %lf MB   %lf GB/s\n",time5,size,size/1024/time5);
  printf("   Rank6: %lf s   %lf MB   %lf GB/s\n",time6,size,size/1024/time6);
  printf("   Rank7: %lf s   %lf MB   %lf GB/s\n",time7,size,size/1024/time7);
  printf("   Rank8: %lf s   %lf MB   %lf GB/s\n",time8,size,size/1024/time8);
}
Esempio n. 10
0
/* Returns the value that follows option argv[i] and advances i onto it.
 * Prints a message and terminates the program if the option is the last
 * command-line argument, instead of reading past the argument list. */
static const char* next_option_value(int argc, char** argv, int& i) {
  if(i + 1 >= argc) {
    printf("Missing value for option %s\n", argv[i]);
    exit(1);
  }
  return argv[++i];
}

/* MD skeleton driver: parses the command line, initializes the execution
 * space(s), builds the atom system and its neighbor list, evaluates the force
 * once with energy/virial output and then times `iter` further force
 * evaluations.
 *
 * Options (each takes one value):
 *   -t/--num_threads, --teams, -d/--device, --delta, -i/--iter, -rho,
 *   -s/--size, -nx, -ny, -nz, -b/--neigh_bins */
int main(int argc, char** argv) {

  printf("Running MD Skeleton\n");
  /* Thread numbers for Host */

  int num_threads = 1;
  int teams = 1;
  int device = 0; // Default device for GPU runs

  /* avoid unused variable warnings */
  (void)num_threads;
  (void)teams;
  (void)device;

  /* Default value for number of force calculations */

  int iter = 100;

  /* Default value for system size (4*nx*ny*nz atoms)
   * nx, ny and nz are set to system_size if not specified on commandline */

  int system_size = 20;
  int nx = -1;
  int ny = -1;
  int nz = -1;

  int neighbor_size = 1; // Default bin size for neighbor list construction

  double rho = 0.8442; // Number density of the system
  double delta = 0; // Scaling factor for random offsets of atom positions


  /* read in command-line arguments (argv[0] is the program name) */

  for(int i = 1; i < argc; i++) {
    if((strcmp(argv[i], "-t") == 0) || (strcmp(argv[i], "--num_threads") == 0)) {
      num_threads = atoi(next_option_value(argc, argv, i));
      continue;
    }

    if((strcmp(argv[i], "--teams") == 0)) {
      teams = atoi(next_option_value(argc, argv, i));
      continue;
    }

    if((strcmp(argv[i], "-d") == 0) || (strcmp(argv[i], "--device") == 0))  {
      device = atoi(next_option_value(argc, argv, i));
      continue;
    }

    if((strcmp(argv[i], "--delta") == 0)) {
      delta = atof(next_option_value(argc, argv, i));
      continue;
    }

    if((strcmp(argv[i], "-i") == 0) || (strcmp(argv[i], "--iter") == 0))  {
      iter = atoi(next_option_value(argc, argv, i));
      continue;
    }

    if((strcmp(argv[i], "-rho") == 0)) {
      /* rho is a real-valued density: atof keeps the fractional part that
       * atoi would truncate (e.g. "0.8442" must not become 0). */
      rho = atof(next_option_value(argc, argv, i));
      continue;
    }

    if((strcmp(argv[i], "-s") == 0) || (strcmp(argv[i], "--size") == 0)) {
      system_size = atoi(next_option_value(argc, argv, i));
      continue;
    }

    if((strcmp(argv[i], "-nx") == 0)) {
      nx = atoi(next_option_value(argc, argv, i));
      continue;
    }

    if((strcmp(argv[i], "-ny") == 0)) {
      ny = atoi(next_option_value(argc, argv, i));
      continue;
    }

    if((strcmp(argv[i], "-nz") == 0)) {
      nz = atoi(next_option_value(argc, argv, i));
      continue;
    }

    if((strcmp(argv[i], "-b") == 0) || (strcmp(argv[i], "--neigh_bins") == 0))  {
      neighbor_size = atoi(next_option_value(argc, argv, i));
      continue;
    }
  }

  /* Any dimension not given explicitly falls back to system_size. */
  if( nx < 0 ) nx = system_size;
  if( ny < 0 ) ny = system_size;
  if( nz < 0 ) nz = system_size;

  printf("-> Init Device\n");

#if defined( KOKKOS_HAVE_CUDA )
  Kokkos::HostSpace::execution_space::initialize(teams*num_threads);
  Kokkos::Cuda::SelectDevice select_device(device);
  Kokkos::Cuda::initialize(select_device);
#elif defined( KOKKOS_HAVE_OPENMP )
  Kokkos::OpenMP::initialize(teams*num_threads);
#elif defined( KOKKOS_HAVE_PTHREAD )
  Kokkos::Threads::initialize(teams*num_threads);
#endif

  System system;
  system.neigh_cut = 2.8;
  system.force_cut = 2.5;
  system.force_cutsq = system.force_cut*system.force_cut;
  system.delta = delta;

  printf("-> Build system\n");
  create_system(system,nx,ny,nz,rho);

  printf("-> Created %i atoms and %i ghost atoms\n",system.nlocal,system.nghost);

  /* Number of neighbor bins per dimension: box length / bin size, plus one. */
  system.nbinx = system.box.xprd/neighbor_size+1;
  system.nbiny = system.box.yprd/neighbor_size+1;
  system.nbinz = system.box.zprd/neighbor_size+1;


  printf("-> Building Neighborlist\n");

  neigh_setup(system);
  neigh_build(system);

  /* One force evaluation with the second argument set to 1; its result is
   * printed as energy (x) and virial (y). */
  double2 ev = force(system,1);

  printf("-> Calculate Energy: %f Virial: %f\n",ev.x,ev.y);

  printf("-> Running %i force calculations\n",iter);

  Kokkos::Timer timer;

  for(int i=0;i<iter;i++) {
    force(system,0);
  }


  double time = timer.seconds();
  printf("Time: %e s for %i iterations with %i atoms\n",time,iter,system.nlocal);

  execution_space::finalize();
}