std::pair<CudaEvent, view_type> copy(const ArrayView< value_type, HostCoordinator< value_type, PinnedAllocator< value_type, alignment>>> &from, view_type &to) { assert(from.size()==to.size()); #ifdef VERBOSE using oType = ArrayView< value_type, HostCoordinator< value_type, PinnedAllocator< value_type, alignment>>>; std::cout << util::pretty_printer<DeviceCoordinator>::print(*this) << "::" << util::blue("copy") << "(asynchronous, " << from.size() << ")" << "\n " << util::type_printer<oType>::print() << " @ " << from.data() << util::yellow(" -> ") << util::type_printer<view_type>::print() << " @ " << to.data() << std::endl; #endif auto status = cudaMemcpy( reinterpret_cast<void*>(to.begin()), reinterpret_cast<const void*>(from.begin()), from.size()*sizeof(value_type), cudaMemcpyHostToDevice ); if(status != cudaSuccess) { std::cerr << util::red("error") << " bad CUDA memcopy, unable to copy " << sizeof(T)*from.size() << " bytes from host to device"; exit(-1); } CudaEvent event; return std::make_pair(event, to); }
// Release the device memory backing `rng`, then reset the view to the
// empty state so it no longer refers to the freed storage.
void free(view_type& rng) {
    Allocator alloc;
    if (rng.data()) {
        alloc.deallocate(rng.data(), rng.size());
    }
#ifdef VERBOSE
    std::cerr << util::type_printer<DeviceCoordinator>::print()
              << "::free()" << std::endl;
#endif
    impl::reset(rng);
}
// construct as a copy of another range Array(view_type const& other) : base(coordinator_type().allocate(other.size())) { #ifdef VERBOSE std::cerr << util::green("Array(other&)") + " other = " << util::pretty_printer<view_type>::print(other) << std::endl; #endif coordinator_.copy(static_cast<base const&>(other), *this); }
// Add one input image to the merge group and build its image pyramid.
//
// Throws std::invalid_argument when the image dimensions differ from the
// rest of the sequence, or when the group already holds groupSize_ images.
void MergeGroup::addImage(view_type const& image)
{
    const bool wrongSize =
        image.width() != width_ || image.height() != height_;
    if (wrongSize) {
        throw std::invalid_argument(
            "Dimensions of image passed in differ to others in the sequence.");
    }
    if (pyramids_.size() == groupSize_) {
        throw std::invalid_argument(
            "Group already contains enough images to fuse. Cannot add another.");
    }

    // TODO: do quality mask here, then create pyramid from Pending image

    // Slot this image occupies within the group.
    const size_t slot = pyramids_.size();

    // Carve this image's per-level subviews out of the shared arena.
    std::vector<view_type> levels;
    for (auto& arenaView : fuseViews_) {
        levels.push_back(arenaView[slot]);
    }

    // Copy the input pixels into the base level of the pyramid.
    std::copy(image.begin(), image.end(), levels.front().begin());

    std::cout << "========================\n"
                 "Creating Pyramid.\n"
                 "========================" << std::endl;

    ImagePyramid pyramid(
        context_, std::move(levels),
        [=](Pending2DImage const& im) { return createPyramidLevel(im, program_); });
    pyramids_.push_back(std::move(pyramid));
}
// Team-parallel functor: each league rank handles one element, and the
// element's UQ sample dimension is partitioned across the team's threads.
KOKKOS_INLINE_FUNCTION
void operator() (device_type device) const
{
    // Partition the sample dimension across this team's threads.
    typename local_view_type::Partition part( device.team_rank() ,
                                              device.team_size() );

    // Per-thread views of x and y restricted to this thread's partition.
    const local_view_type local_x( dev_x , part );
    const local_view_type local_y( dev_y , part );

    const int element = device.league_rank();

    // Apply evaluation function to this thread's fix-sized UQ sample set.
    simple_function< value_type >( local_x(element) , local_y(element) );

    // Print x and y
    if (print) {
        device.team_barrier();
        // A single thread in the whole launch reports the configuration.
        if ( ! device.league_rank() && ! device.team_rank() ) {
            printf("view_kernel league(%d:%d) team_size(%d) dim(%d) size(%d)\n",
                   device.league_rank(), device.league_size(),device.team_size(),
                   int(dev_x.dimension_1()), int(local_x(element).size()) );
        }
        // Thread 0 of each team dumps the element's full sample vectors.
        if ( ! device.team_rank() ) {
            printf("x(%i) = { ",element);
            for (int sample=0; sample< int(dev_x.dimension_1()); ++sample) {
                printf("%g ", dev_x(element,sample));
            }
            printf("}\n\n");
            printf("y(%i) = { ",element);
            for (int sample=0; sample< int(dev_y.dimension_1()); ++sample) {
                printf("%g ", dev_y(element,sample));
            }
            printf("}\n\n");
        }
        device.team_barrier();
    }
}
// Team-parallel functor: one element per league rank; each thread views a
// strided slice of the element's samples through a storage_type wrapper
// (stride = team size), then evaluates simple_function on the ensemble.
KOKKOS_INLINE_FUNCTION
void operator() (device_type device) const
{
    int element = device.league_rank();
    int num_threads = device.team_size();
    int thread = device.team_rank();
    int num_samples = dev_x.dimension_1();
    // assumes num_samples is divisible by num_threads — TODO confirm at call site
    int num_samples_per_thread = num_samples / num_threads;

    // Initialize x
    // Wrap this thread's strided slice of the sample data in place
    // (pointer, length, stride) — no copy is made here.
    storage_type x_s(&dev_x(element,thread), num_samples_per_thread, num_threads);
    storage_type y_s(&dev_y(element,thread), num_samples_per_thread, num_threads);
    array_vector_type x(x_s), y(y_s);
    simple_function<scalar_vector_type>(x,y);

    // Print x and y
    // Threads take turns printing, separated by barriers, so output
    // from different threads is not interleaved.
    if (print) {
        for (int tidx = 0; tidx<num_threads; tidx++) {
            if (thread == tidx) {
                printf("x(%i) = [ ",tidx);
                for (int sample=0; sample<num_samples_per_thread; sample++)
                    printf("%g ", x.coeff(sample));
                printf("]\n\n");
            }
            device.team_barrier();
        }
        for (int tidx = 0; tidx<num_threads; tidx++) {
            if (thread == tidx) {
                printf("y(%i) = [ ",tidx);
                for (int sample=0; sample<num_samples_per_thread; sample++)
                    printf("%g ", y.coeff(sample));
                printf("]\n\n");
            }
            device.team_barrier();
        }
    }
}
// copy memory from one gpu range to another void copy(const view_type &from, view_type &to) { assert(from.size()==to.size()); assert(!from.overlaps(to)); auto status = cudaMemcpy( reinterpret_cast<void*>(to.begin()), reinterpret_cast<const void*>(from.begin()), from.size()*sizeof(value_type), cudaMemcpyDeviceToDevice ); if(status != cudaSuccess) { std::cerr << util::red("error") << " bad CUDA memcopy, unable to copy " << sizeof(T)*from.size() << " bytes from device to device"; exit(-1); } }
// Team-parallel functor: each league rank processes one element, with the
// team's threads dividing the sample range in a strided fashion
// (thread t handles samples t, t+T, t+2T, ... for team size T).
KOKKOS_INLINE_FUNCTION
void operator() (device_type device) const
{
    const int row    = device.league_rank();
    const int stride = device.team_size();
    const int first  = device.team_rank();
    const int total  = dev_x.dimension_1();

    for (int s = first; s < total; s += stride) {
        // Load this thread's sample, evaluate, and store the result.
        scalar_type in = dev_x(row, s);
        scalar_type out;
        simple_function<scalar_type>(in, out);
        dev_y(row, s) = out;
    }
}
// fill memory void set(view_type &rng, value_type value) { gpu::fill<value_type>(rng.data(), value, rng.size()); }
// Demo: build a 100x100x100 RGBA float image, fill it with a coordinate
// gradient, then take nested sub-views and print the innermost one,
// forwards and backwards.
int main()
{
    typedef float channel_type;

    // 3-D interleaved RGBA image format over float channels.
    typedef mizuiro::image::format<
        mizuiro::image::dimension< 3 >,
        mizuiro::image::interleaved<
            mizuiro::color::homogenous<
                channel_type,
                mizuiro::color::layout::rgba
            >
        >
    > format;

    typedef mizuiro::image::store<
        format,
        mizuiro::access::raw
    > store;

    store img( store::dim_type( 100, 100, 100 ) );

    typedef store::view_type view_type;
    typedef view_type::bound_type bound_type;

    // TODO: create an algorithm for this!
    {
        view_type const view( img.view() );

        typedef view_type::dim_type dim_type;
        typedef dim_type::size_type size_type;

        dim_type const dim( img.view().dim() );

        // Fill each voxel with its own coordinates as colour channels.
        for(size_type x = 0; x < dim[0]; ++x)
            for(size_type y = 0; y < dim[1]; ++y)
                for(size_type z = 0; z < dim[2]; ++z)
                    view[ dim_type( x, y, z ) ] =
                        mizuiro::color::object< format::color_format >(
                            (mizuiro::color::init::red = static_cast<channel_type>(x))
                            (mizuiro::color::init::green = static_cast<channel_type>(y))
                            (mizuiro::color::init::blue = static_cast<channel_type>(z))
                            (mizuiro::color::init::alpha = static_cast<channel_type>(255))
                        );
    }

    std::cout << '\n';

    // First-level sub-view: box from (1,1,1) to (3,4,3).
    view_type const sub_view(
        mizuiro::image::sub_view(
            img.view(),
            bound_type(
                bound_type::dim_type( 1, 1, 1 ),
                bound_type::dim_type( 3, 4, 3 )
            )
        )
    );

    std::cout << "sub image (with pitch " << sub_view.pitch() << ")\n";

    // Second-level sub-view taken out of the first one.
    view_type const sub_sub_view(
        mizuiro::image::sub_view(
            sub_view,
            bound_type(
                bound_type::dim_type( 1, 1, 1 ),
                bound_type::dim_type( 2, 3, 2 )
            )
        )
    );

    std::cout << "sub sub image (with pitch " << sub_sub_view.pitch() << ")\n";

    mizuiro::image::algorithm::print( std::cout, sub_sub_view );

    std::cout << '\n';

    // Walk the innermost view backwards via std::reverse_iterator.
    {
        typedef std::reverse_iterator< view_type::iterator > reverse_iterator;

        for( reverse_iterator it( sub_sub_view.end() );
             it != reverse_iterator(sub_sub_view.begin());
             ++it )
            std::cout << *it << ' ';
    }

    std::cout << '\n';
}
// // This function inserts "Clones" into the // the view. // // We need to pass the first argument // as a non-const reference to be able to store // 'T*' instead of 'const T*' objects. Alternatively, // we might change the declaration of the 'view_type' // to // typedef boost::ptr_vector<const photon,boost::view_clone_manager> // view_type; ^^^^^^ // void insert( vector_type& from, view_type& to ) { to.insert( to.end(), from.begin(), from.end() ); }
// Construct from a view; caches the view and its leading-dimension
// extent as the value count.
explicit type( const view_type & v )
    : m_view( v ), value_count( v.dimension_0() ) {}
// Number of values in a view: its leading-dimension extent.
inline static unsigned value_count( const view_type & v )
{ return v.dimension_0(); }
// Team-parallel functor: one element per league rank; each thread owns a
// strided slice of the samples. Depending on `reset` and the storage
// type's capabilities, the vectors either alias device memory directly
// (shallowReset) or are filled/written back with explicit strided copies.
KOKKOS_INLINE_FUNCTION
void operator() (device_type device) const
{
    int element = device.league_rank();
    int num_threads = device.team_size();
    int thread = device.team_rank();
    int num_samples = dev_x.dimension_1();
    // assumes num_samples is divisible by num_threads — TODO confirm at call site
    int num_samples_per_thread = num_samples / num_threads;

    // multi-point expansions
    array_vector_type x(num_samples_per_thread, 0.0),
                      y(num_samples_per_thread, 0.0);

    // Initialize x
    if (reset && storage_type::supports_reset) {
        // Re-point the vectors' storage at the device data in place:
        // (pointer, length, stride, owned=false) — no copy is made.
        storage_type& x_s = x.storage();
        storage_type& y_s = y.storage();
        x_s.shallowReset(&dev_x(element,thread), num_samples_per_thread,
                         num_threads, false);
        y_s.shallowReset(&dev_y(element,thread), num_samples_per_thread,
                         num_threads, false);
    }
    else {
        // Strided gather of this thread's samples into local storage.
        for (int sample=0; sample<num_samples_per_thread; ++sample)
            x.fastAccessCoeff(sample) = dev_x(element,thread+sample*num_threads);
    }
    simple_function<scalar_vector_type>(x,y);

    // Print x and y
    // Threads take turns printing, separated by barriers, so output
    // from different threads is not interleaved.
    if (print) {
        for (int tidx = 0; tidx<num_threads; tidx++) {
            if (thread == tidx) {
                printf("x(%i) = [ ",tidx);
                for (int sample=0; sample<num_samples_per_thread; sample++)
                    printf("%g ", x.coeff(sample));
                printf("]\n\n");
            }
            device.team_barrier();
        }
        for (int tidx = 0; tidx<num_threads; tidx++) {
            if (thread == tidx) {
                printf("y(%i) = [ ",tidx);
                for (int sample=0; sample<num_samples_per_thread; sample++)
                    printf("%g ", y.coeff(sample));
                printf("]\n\n");
            }
            device.team_barrier();
        }
    }

    // Return result
    // Only needed when y was NOT shallow-reset onto device memory above;
    // in the reset case simple_function already wrote through the alias.
    if (!(reset && storage_type::supports_reset)) {
        for (int sample=0; sample<num_samples_per_thread; ++sample)
            dev_y(element,thread+sample*num_threads) = y.fastAccessCoeff(sample);
    }
}