/** @brief Tells whether the profile is likely to run slowly on the given device.
 *
 * On GPU devices the profile is flagged as slow when the total local work size
 * (local_size_1_ * local_size_2_) is not a multiple of the assumed warp size.
 * Independently of the device type, the profile-specific is_slow_impl() is
 * consulted whenever the warp check did not already flag the profile.
 *
 * @param dev  the given device
 * @return true if the warp check or is_slow_impl() reports the profile as slow
 */
bool is_slow(viennacl::ocl::device const & dev) const
{
  bool partial_warp = false;

  if (dev.type() == CL_DEVICE_TYPE_GPU)
  {
    // Vendor id 4098 (0x1002) gets a warp size of 64, everything else 32.
    // NOTE(review): 0x1002 is presumably AMD's vendor id -- confirm.
    std::size_t const warp_size = (dev.vendor_id() == 4098) ? 64 : 32;
    partial_warp = ((local_size_1_ * local_size_2_) % warp_size) != 0;
  }

  return partial_warp || is_slow_impl(dev);
}
/** @brief Tells whether the profile leads to undefined behavior on the given device.
 *
 * The profile is reported as invalid when any of the following holds:
 *  - the total local work size (local_size_1_ * local_size_2_) exceeds the
 *    device's maximum work-group size,
 *  - local_size_1_ or local_size_2_ exceeds the per-dimension work-item limit
 *    reported by the device for dimension 0 or 1 respectively,
 *  - lmem_used(scalartype_size) exceeds the local memory available on the device,
 *  - the profile-specific invalid_impl() reports the profile as invalid.
 *
 * @param dev              the given device
 * @param scalartype_size  size of the scalar type, forwarded to lmem_used() and invalid_impl()
 * @return true if the profile must not be used on @p dev
 */
bool is_invalid(viennacl::ocl::device const & dev, size_t scalartype_size) const
{
  // Query device information
  size_t lmem_available = static_cast<size_t>(dev.local_mem_size());
  size_t max_workgroup_size = dev.max_work_group_size();
  std::vector<size_t> max_work_item_sizes = dev.max_work_item_sizes();

  // Uses too many resources?
  bool invalid_work_group_sizes = local_size_1_*local_size_2_ > max_workgroup_size;

  // Only compare against per-dimension limits the device actually reported;
  // indexing an entry that is not there would read out of bounds.
  if (!max_work_item_sizes.empty())
    invalid_work_group_sizes = invalid_work_group_sizes || local_size_1_ > max_work_item_sizes[0];
  if (max_work_item_sizes.size() > 1)
    invalid_work_group_sizes = invalid_work_group_sizes || local_size_2_ > max_work_item_sizes[1];

  return  invalid_work_group_sizes
       || lmem_used(scalartype_size) > lmem_available
       || invalid_impl(dev, scalartype_size);
}
/** @brief Convenience function for setting devices for a context */ inline void setup_context(long i, viennacl::ocl::device const & device) { std::vector<cl_device_id> device_id_array(1); device_id_array[0] = device.id(); viennacl::ocl::backend<>::setup_context(i, device_id_array); }
/** @brief Generic resource check of the profile against a device.
 *
 * Compares the local work size returned by local_work_size() with the device's
 * maximum work-group size and per-dimension work-item limits, and the supplied
 * local memory usage with the device's local memory size.
 *
 * @param dev        the given device
 * @param lmem_used  amount of local memory used by the profile, compared against CL_DEVICE_LOCAL_MEM_SIZE
 * @return true if any of the limits is exceeded
 */
bool invalid_base(viennacl::ocl::device const & dev, size_t lmem_used) const
{
  // Profile information
  std::pair<size_t, size_t> const local_sizes = local_work_size();

  // Device information
  size_t const local_mem_limit = viennacl::ocl::info<CL_DEVICE_LOCAL_MEM_SIZE>(dev.id());
  size_t const group_size_limit = viennacl::ocl::info<CL_DEVICE_MAX_WORK_GROUP_SIZE>(dev.id());
  std::vector<size_t> const per_dim_limits = viennacl::ocl::info<CL_DEVICE_MAX_WORK_ITEM_SIZES>(dev.id());

  // Total work-group size uses too many resources
  if (local_sizes.first * local_sizes.second > group_size_limit)
    return true;

  // First dimension exceeds its per-dimension limit
  if (local_sizes.first > per_dim_limits[0])
    return true;

  // Second dimension is only checked when the device reports a limit for it
  if (per_dim_limits.size() > 1 && local_sizes.second > per_dim_limits[1])
    return true;

  return lmem_used > local_mem_limit;
}