// Attempt to reactivate a cached handle for the given (key, type) pair.
// Returns the (possibly re-tagged) handle, or 0 if reuse was disabled
// via RES_NO_CACHE or no matching handle exists.
static Handle reuse_existing_handle(uintptr_t key, H_Type type, size_t flags)
{
	// caller requested a fresh resource - never reuse a cached one.
	if(flags & RES_NO_CACHE)
		return 0;

	// no object of the given key and type has been created yet.
	Handle cached = h_find(type, key);
	if(cached <= 0)
		return 0;

	HDATA* hd;
	RETURN_STATUS_IF_ERR(h_data_tag_type(cached, type, hd));	// h_find means this won't fail

	++hd->refs;

	// we are reactivating a closed but cached handle.
	// a new tag must be generated so that copies of the previous
	// handle can no longer access the resource.
	// (h_free need not reset the tag: any use before this point
	// already fails the refs > 0 check in h_user_data.)
	if(hd->refs == 1)
	{
		const Tag freshTag = gen_tag();
		cached = handle(h_idx(cached), freshTag);	// can't fail
		hd->h = cached;
	}

	return cached;
}
int main(int narg, char* arg[]) { Kokkos::initialize(narg,arg); int size = 1000000; // Create DualViews. This will allocate on both the device and its // host_mirror_device. idx_type idx("Idx",size,64); view_type dest("Dest",size); view_type src("Src",size); srand(134231); // Get a reference to the host view of idx directly (equivalent to // idx.view<idx_type::host_mirror_device_type>() ) idx_type::t_host h_idx = idx.h_view; for (int i = 0; i < size; ++i) { for (view_type::size_type j=0; j < h_idx.dimension_1 (); ++j) { h_idx(i,j) = (size + i + (rand () % 500 - 250)) % size; } } // Mark idx as modified on the host_mirror_device_type so that a // sync to the device will actually move data. The sync happens in // the functor's constructor. idx.modify<idx_type::host_mirror_device_type>(); // Run on the device. This will cause a sync of idx to the device, // since it was marked as modified on the host. Kokkos::Impl::Timer timer; Kokkos::parallel_for(size,localsum<view_type::device_type>(idx,dest,src)); Kokkos::fence(); double sec1_dev = timer.seconds(); timer.reset(); Kokkos::parallel_for(size,localsum<view_type::device_type>(idx,dest,src)); Kokkos::fence(); double sec2_dev = timer.seconds(); // Run on the host (could be the same as device). This will cause a // sync back to the host of dest. Note that if the Device is CUDA, // the data layout will not be optimal on host, so performance is // lower than what it would be for a pure host compilation. timer.reset(); Kokkos::parallel_for(size,localsum<view_type::host_mirror_device_type>(idx,dest,src)); Kokkos::fence(); double sec1_host = timer.seconds(); timer.reset(); Kokkos::parallel_for(size,localsum<view_type::host_mirror_device_type>(idx,dest,src)); Kokkos::fence(); double sec2_host = timer.seconds(); printf("Device Time with Sync: %lf without Sync: %lf \n",sec1_dev,sec2_dev); printf("Host Time with Sync: %lf without Sync: %lf \n",sec1_host,sec2_host); Kokkos::finalize(); }
// get HDATA for the given handle. // only uses (and checks) the index field. // used by h_force_close (which must work regardless of tag). static inline Status h_data_no_tag(const Handle h, HDATA*& hd) { ssize_t idx = (ssize_t)h_idx(h); RETURN_STATUS_IF_ERR(h_data_from_idx(idx, hd)); // need to verify it's in range - h_data_from_idx can only verify that // it's < maximum allowable index. if(uintptr_t(hd) > uintptr_t(hpool.da.base)+hpool.da.pos) WARN_RETURN(ERR::H_IDX_UNUSED); return INFO::OK; }
// record the association key -> pool index of <h> in the key2idx
// multimap, so the handle can later be located by key (e.g. by h_find).
// best-effort: silently does nothing if the map is unavailable.
static void key_add(uintptr_t key, Handle h)
{
	Key2Idx* key2idx = key2idx_wrapper.get();
	// map unavailable (presumably not yet initialized or already shut
	// down - TODO confirm); lookups for this key will simply miss.
	if(!key2idx)
		return;
	const ssize_t idx = h_idx(h);
	// note: MSDN documentation of stdext::hash_multimap is incorrect;
	// there is no overload of insert() that returns pair<iterator, bool>.
	(void)key2idx->insert(std::make_pair(key, idx));
	// NOTE(review): there is no matching unlock - get()/lock() look like
	// an overrun-protection pair (get() exposes the object, lock()
	// re-protects it) rather than mutex acquire; verify the wrapper's
	// semantics before treating this as a locking bug.
	key2idx_wrapper.lock();
}
int main(int narg, char* arg[]) { Kokkos::initialize (narg, arg); int size = 1000000; idx_type idx("Idx",size,64); idx_type_host h_idx = Kokkos::create_mirror_view (idx); view_type dest ("Dest", size); view_type src ("Src", size); srand(134231); for (int i = 0; i < size; i++) { for (view_type::size_type j = 0; j < h_idx.dimension_1 (); ++j) { h_idx(i,j) = (size + i + (rand () % 500 - 250)) % size; } } // Deep copy the initial data to the device Kokkos::deep_copy(idx,h_idx); // Run the first kernel to warmup caches Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src)); Kokkos::fence(); // Run the localsum functor using the RandomAccess trait. On CPUs there should // not be any different in performance to not using the RandomAccess trait. // On GPUs where can be a dramatic difference Kokkos::Impl::Timer time1; Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src)); Kokkos::fence(); double sec1 = time1.seconds(); Kokkos::Impl::Timer time2; Kokkos::parallel_for(size,localsum<view_type,view_type>(idx,dest,src)); Kokkos::fence(); double sec2 = time2.seconds(); printf("Time with Trait RandomAccess: %f with Plain: %f \n",sec1,sec2); Kokkos::finalize(); }