int main(int narg, char* args[]) {
  Kokkos::initialize(narg,args);
  
  int chunk_size = 1024;
  int nchunks = 100000; //1024*1024;
  Kokkos::DualView<int*> data("data",nchunks*chunk_size+1);

  srand(1231093);

  for(int i = 0; i < (int) data.dimension_0(); i++) {
    data.h_view(i) = rand()%TEAM_SIZE;
  }
  data.modify<Host>();
  data.sync<Device>();

  Kokkos::DualView<int**> histogram("histogram",TEAM_SIZE,TEAM_SIZE);


  Kokkos::Timer timer;
  // threads/team is automatically limited to maximum supported by the device.
  Kokkos::parallel_for( team_policy( nchunks , TEAM_SIZE )
                      , find_2_tuples(chunk_size,data,histogram) );
  Kokkos::fence();
  double time = timer.seconds();

  histogram.sync<Host>();

  printf("Time: %f \n\n",time);
  int sum = 0;
  for(int k=0; k<TEAM_SIZE; k++) {
    for(int l=0; l<TEAM_SIZE; l++) {
      printf("%i ",histogram.h_view(k,l));
      sum += histogram.h_view(k,l);
    }
    printf("\n");
  }
  printf("Result: %i %i\n",sum,chunk_size*nchunks);
  Kokkos::finalize();
}
Пример #2
0
int main(int narg, char* args[]) {
  Kokkos::initialize(narg,args);
  
  int chunk_size = 1024;
  int nchunks = 100000; //1024*1024;
  Kokkos::DualView<int*> data("data",nchunks*chunk_size+1);

  srand(1231093);

  for(int i = 0; i < data.dimension_0(); i++) {
    data.h_view(i) = rand()%TS;
  }
  data.modify<Host>();
  data.sync<Device>();

  Kokkos::DualView<int**> histogram("histogram",TS,TS);


  Kokkos::Impl::Timer timer;
  Kokkos::parallel_for(
      Kokkos::ParallelWorkRequest(nchunks,TS<Device::team_max()?TS:Device::team_max()),
      find_2_tuples(chunk_size,data,histogram));
  Kokkos::fence();
  double time = timer.seconds();

  histogram.sync<Host>();

  printf("Time: %lf \n\n",time);
  int sum = 0;
  for(int k=0; k<TS; k++) {
    for(int l=0; l<TS; l++) {
      printf("%i ",histogram.h_view(k,l));
      sum += histogram.h_view(k,l);
    }
    printf("\n");
  }
  printf("Result: %i %i\n",sum,chunk_size*nchunks);
  Kokkos::finalize();
}