Esempio n. 1
0
// Try to satisfy a request for (key, type) with a handle that already
// exists instead of creating a new one.
//
// returns 0 if reuse is disallowed (RES_NO_CACHE is set in flags) or if
// h_find does not yield a positive handle. otherwise returns the handle
// the caller should use, after incrementing the reference count of its
// HDATA entry. a failure of h_data_tag_type is handed to
// RETURN_STATUS_IF_ERR (not expected to happen once h_find has succeeded).
static Handle reuse_existing_handle(uintptr_t key, H_Type type, size_t flags)
{
	// caller explicitly asked not to reuse cached objects.
	if(flags & RES_NO_CACHE)
		return 0;

	// is there already an object with this key and type?
	// a non-positive result means there is nothing to reuse.
	Handle existing = h_find(type, key);
	if(existing <= 0)
		return 0;

	// h_find succeeded, so this lookup should not fail.
	HDATA* entry;
	RETURN_STATUS_IF_ERR(h_data_tag_type(existing, type, entry));

	entry->refs += 1;

	// reference count was already non-zero before us => the handle is
	// currently in use and can be shared as-is.
	if(entry->refs != 1)
		return existing;

	// count went from 0 to 1: a closed but still cached handle is being
	// reactivated. issue a new tag so that stale copies of the previous
	// handle can no longer reach the resource.
	// (h_free doesn't have to reset the tag: until we get here, any use
	// is rejected by the refs > 0 check in h_user_data.)
	const Tag fresh_tag = gen_tag();
	existing = handle(h_idx(existing), fresh_tag);	// can't fail
	entry->h = existing;
	return existing;
}
Esempio n. 2
0
int main(int narg, char* arg[]) {
  Kokkos::initialize(narg,arg);

  int size = 1000000;

  // Create DualViews. This will allocate on both the device and its
  // host_mirror_device.
  idx_type idx("Idx",size,64);
  view_type dest("Dest",size);
  view_type src("Src",size);

  srand(134231);

  // Get a reference to the host view of idx directly (equivalent to
  // idx.view<idx_type::host_mirror_device_type>() )
  idx_type::t_host h_idx = idx.h_view;
  for (int i = 0; i < size; ++i) {
    for (view_type::size_type j=0; j < h_idx.dimension_1 (); ++j) {
      h_idx(i,j) = (size + i + (rand () % 500 - 250)) % size;
    }
  }

  // Mark idx as modified on the host_mirror_device_type so that a
  // sync to the device will actually move data.  The sync happens in
  // the functor's constructor.
  idx.modify<idx_type::host_mirror_device_type>();

  // Run on the device.  This will cause a sync of idx to the device,
  // since it was marked as modified on the host.
  Kokkos::Impl::Timer timer;
  Kokkos::parallel_for(size,localsum<view_type::device_type>(idx,dest,src));
  Kokkos::fence();
  double sec1_dev = timer.seconds();

  timer.reset();
  Kokkos::parallel_for(size,localsum<view_type::device_type>(idx,dest,src));
  Kokkos::fence();
  double sec2_dev = timer.seconds();

  // Run on the host (could be the same as device).  This will cause a
  // sync back to the host of dest.  Note that if the Device is CUDA,
  // the data layout will not be optimal on host, so performance is
  // lower than what it would be for a pure host compilation.
  timer.reset();
  Kokkos::parallel_for(size,localsum<view_type::host_mirror_device_type>(idx,dest,src));
  Kokkos::fence();
  double sec1_host = timer.seconds();

  timer.reset();
  Kokkos::parallel_for(size,localsum<view_type::host_mirror_device_type>(idx,dest,src));
  Kokkos::fence();
  double sec2_host = timer.seconds();

  printf("Device Time with Sync: %lf without Sync: %lf \n",sec1_dev,sec2_dev);
  printf("Host   Time with Sync: %lf without Sync: %lf \n",sec1_host,sec2_host);

  Kokkos::finalize();
}
Esempio n. 3
0
// look up the HDATA entry belonging to handle <h>, storing it in <hd>.
// the tag is deliberately ignored: only the index field of the handle is
// extracted and validated. h_force_close relies on this, since it has to
// work no matter what the tag is.
//
// returns INFO::OK on success; ERR::H_IDX_UNUSED (with a warning) if the
// entry lies beyond the end of the pool as checked below; a failure of
// h_data_from_idx is handed to RETURN_STATUS_IF_ERR.
static inline Status h_data_no_tag(const Handle h, HDATA*& hd)
{
	const ssize_t entry_idx = (ssize_t)h_idx(h);
	RETURN_STATUS_IF_ERR(h_data_from_idx(entry_idx, hd));

	// h_data_from_idx can only guarantee the index is below the maximum
	// allowable one, so we additionally have to make sure the entry
	// isn't past the current end of the pool.
	// NOTE(review): an entry starting exactly at base+pos passes this
	// check; presumably pos marks the end of the used region - confirm
	// whether the comparison is meant to be inclusive.
	if(uintptr_t(hpool.da.base)+hpool.da.pos < uintptr_t(hd))
	{
		WARN_RETURN(ERR::H_IDX_UNUSED);
	}

	return INFO::OK;
}
Esempio n. 4
0
static void key_add(uintptr_t key, Handle h)
{
	Key2Idx* key2idx = key2idx_wrapper.get();
	if(!key2idx)
		return;

	const ssize_t idx = h_idx(h);
	// note: MSDN documentation of stdext::hash_multimap is incorrect;
	// there is no overload of insert() that returns pair<iterator, bool>.
	(void)key2idx->insert(std::make_pair(key, idx));

	key2idx_wrapper.lock();
}
Esempio n. 5
0
int main(int narg, char* arg[]) {
  Kokkos::initialize (narg, arg);

  int size = 1000000;

  idx_type idx("Idx",size,64);
  idx_type_host h_idx = Kokkos::create_mirror_view (idx);

  view_type dest ("Dest", size);
  view_type src ("Src", size);

  srand(134231);

  for (int i = 0; i < size; i++) {
    for (view_type::size_type j = 0; j < h_idx.dimension_1 (); ++j) {
      h_idx(i,j) = (size + i + (rand () % 500 - 250)) % size;
    }
  }

  // Deep copy the initial data to the device
  Kokkos::deep_copy(idx,h_idx);
  // Run the first kernel to warmup caches
  Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src));
  Kokkos::fence();

  // Run the localsum functor using the RandomAccess trait. On CPUs there should
  // not be any different in performance to not using the RandomAccess trait.
  // On GPUs where can be a dramatic difference
  Kokkos::Impl::Timer time1;
  Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src));
  Kokkos::fence();
  double sec1 = time1.seconds();

  Kokkos::Impl::Timer time2;
  Kokkos::parallel_for(size,localsum<view_type,view_type>(idx,dest,src));
  Kokkos::fence();
  double sec2 = time2.seconds();

  printf("Time with Trait RandomAccess: %f with Plain: %f \n",sec1,sec2);

  Kokkos::finalize();
}