// Registers a host callback on this stream via cudaStreamAddCallback
// (flags must be 0 per the CUDA runtime API). In debug builds, first
// verifies that the current device reports compute capability >= 1.1.
void Stream::addCallback( cudaStreamCallback_t callback, void * userData )
{
#if !defined(NDEBUG)
  // Debug-only sanity check: require the current device's compute
  // capability to be at least 1.1.
  int dev;
  CUDA_VERIFY( cudaGetDevice( &dev ) );
  int ccMajor;
  int ccMinor;
  CUDA_VERIFY( cudaDeviceGetAttribute( &ccMajor, cudaDevAttrComputeCapabilityMajor, dev ) );
  CUDA_VERIFY( cudaDeviceGetAttribute( &ccMinor, cudaDevAttrComputeCapabilityMinor, dev ) );
  DP_ASSERT( ( ccMajor > 1 ) || ( ( ccMajor == 1 ) && ( ccMinor >= 1 ) ) );
#endif
  // Flags argument is reserved and must be 0.
  CUDA_VERIFY( cudaStreamAddCallback( m_stream, callback, userData, 0 ) );
}
/**
 * Returns the compute capability of the current CUDA device encoded as a
 * single integer: 10 * major + minor (e.g. 70 for compute capability 7.0).
 *
 * Returns 0 when built without CUDA support (OSKAR_HAVE_CUDA undefined),
 * or when any of the underlying CUDA runtime queries fails.
 */
int oskar_device_compute_capability(void)
{
    int version = 0;
#ifdef OSKAR_HAVE_CUDA
    int major = 0, minor = 0, device = 0;
    /* Check each runtime call: if cudaGetDevice fails we must not query
     * attributes of a device index we never actually obtained, and a failed
     * attribute query must not be reported as capability 0.x; on any
     * failure, leave version at 0. */
    if (cudaGetDevice(&device) == cudaSuccess &&
            cudaDeviceGetAttribute(&major,
                    cudaDevAttrComputeCapabilityMajor, device) == cudaSuccess &&
            cudaDeviceGetAttribute(&minor,
                    cudaDevAttrComputeCapabilityMinor, device) == cudaSuccess)
    {
        version = 10 * major + minor;
    }
#endif
    return version;
}
// Returns the largest grid size (block count) at which the bulk_then_execute
// kernel instantiation can be resident simultaneously on `device`: maximum
// active blocks per multiprocessor (occupancy query) times the device's
// multiprocessor count. Only the *types* of f, predecessor, and the factory
// arguments are used here — they select the kernel instantiation via
// detail::bulk_then_execute_kernel; their values are never read.
__host__ __device__ int max_grid_size_of_bulk_then_execute_concurrent_kernel(const agency::cuda::device_id& device, const Function& f, Shape block_dim, const agency::cuda::async_future<T>& predecessor, const ResultFactory& result_factory, const OuterFactory& outer_factory, const InnerFactory& inner_factory)
{
  // Dimensionality of Shape (a compile-time constant); selects the kernel
  // specialization below.
  const size_t block_dimension = agency::detail::shape_size<Shape>::value;
  constexpr auto kernel = detail::bulk_then_execute_kernel<block_dimension,Function,T,ResultFactory,OuterFactory,InnerFactory>::value;

  // Maximum number of blocks of this kernel that can be resident on one SM
  // for the given block size.
  // NOTE(review): the CUDA runtime's cudaOccupancyMaxActiveBlocksPerMultiprocessor
  // takes (numBlocks, func, blockSize, dynamicSMemSize) — its 4th parameter is
  // the dynamic shared-memory size in bytes, not a device, and the query runs
  // on the *current* device. Passing device.native_handle() here looks like it
  // is being interpreted as a shared-memory byte count unless a project-local
  // device-aware overload is in scope — confirm intent.
  int max_active_blocks_per_multiprocessor = 0;
  detail::throw_on_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks_per_multiprocessor, kernel, agency::detail::shape_cast<int>(block_dim), device.native_handle()), "cuda::detail::max_grid_size_of_bulk_then_execute_concurrent_kernel(): CUDA error after cudaOccupancyMaxActiveBlocksPerMultiprocessor()");

  // Number of multiprocessors on the target device.
  int num_multiprocessors = 0;
  detail::throw_on_error(cudaDeviceGetAttribute(&num_multiprocessors, cudaDevAttrMultiProcessorCount, device.native_handle()), "cuda::detail::max_grid_size_of_bulk_then_execute_concurrent_kernel(): CUDA error after cudaDeviceGetAttribute()");

  return max_active_blocks_per_multiprocessor * num_multiprocessors;
}