/* Compute the 2D global work size for the split kernel on this OpenCL device.
 *
 * CPU-type OpenCL devices get a fixed small size; GPU-type devices size the
 * work grid from the maximum single buffer allocation the device (and the
 * optional debug memory limit) allows.
 *
 * kg/data: kernel globals and data buffers used to estimate per-element cost.
 * task: unused. Returns the (width, height) global size. */
virtual int2 split_kernel_global_size(device_memory &kg,
                                      device_memory &data,
                                      DeviceTask * /*task*/)
{
  cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
  /* Use small global size on CPU devices as it seems to be much faster. */
  if (type == CL_DEVICE_TYPE_CPU) {
    VLOG(1) << "Global size: (64, 64).";
    return make_int2(64, 64);
  }

  /* Initialize and check the query result: previously a failed
   * clGetDeviceInfo() left max_buffer_size uninitialized. Fall back to the
   * small global size when the maximum allocation size cannot be queried. */
  cl_ulong max_buffer_size = 0;
  if (clGetDeviceInfo(device->cdDevice,
                      CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                      sizeof(cl_ulong),
                      &max_buffer_size,
                      NULL) != CL_SUCCESS ||
      max_buffer_size == 0)
  {
    VLOG(1) << "Failed to query maximum allocation size, using global size (64, 64).";
    return make_int2(64, 64);
  }

  if (DebugFlags().opencl.mem_limit) {
    /* Clamp the remaining budget at zero: if mem_used already exceeds the
     * debug limit, the unsigned subtraction would underflow to a huge value
     * and effectively disable the limit. */
    const size_t mem_used = device->stats.mem_used;
    const cl_ulong remaining = (DebugFlags().opencl.mem_limit > mem_used) ?
                                   cl_ulong(DebugFlags().opencl.mem_limit - mem_used) :
                                   0;
    max_buffer_size = min(max_buffer_size, remaining);
  }

  VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size)
          << " bytes. (" << string_human_readable_size(max_buffer_size) << ").";

  /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */
  max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024);

  size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
  /* Width is rounded down to a multiple of 64 (but at least 64) for good
   * work-group alignment; height keeps the plain square-root estimate. */
  int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64),
                               (int)sqrt(num_elements));
  VLOG(1) << "Global size: " << global_size << ".";
  return global_size;
}
/* Select the best kernel entry point for the CPU we are running on.
 *
 * Each argument is the same kernel compiled for a specific instruction set.
 * The widest variant that is compiled in (WITH_CYCLES_OPTIMIZED_KERNEL_*),
 * supported by the CPU at runtime (system_cpu_support_*) and permitted by
 * the debug flags wins; otherwise the generic default kernel is used.
 *
 * NOTE: the #ifdef sections splice together into a single if/else-if
 * cascade, so they must stay ordered from widest (AVX2) down to narrowest
 * (SSE2) instruction set. */
KernelFunctions(
    F kernel_default, F kernel_sse2, F kernel_sse3, F kernel_sse41, F kernel_avx, F kernel_avx2)
{
  const char *architecture_name = "default";
  kernel = kernel_default;

  /* Silence potential warnings about unused variables
   * when compiling without some architectures. */
  (void)kernel_sse2;
  (void)kernel_sse3;
  (void)kernel_sse41;
  (void)kernel_avx;
  (void)kernel_avx2;

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
  if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
    architecture_name = "AVX2";
    kernel = kernel_avx2;
  }
  else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
      if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
    architecture_name = "AVX";
    kernel = kernel_avx;
  }
  else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
      if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
    architecture_name = "SSE4.1";
    kernel = kernel_sse41;
  }
  else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
      if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
    architecture_name = "SSE3";
    kernel = kernel_sse3;
  }
  else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
      if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
    architecture_name = "SSE2";
    kernel = kernel_sse2;
  }
#endif

  /* Log the chosen architecture only when it differs from the last one
   * logged, so constructing many KernelFunctions does not spam the log. */
  if (strcmp(architecture_name, logged_architecture) != 0) {
    VLOG(1) << "Will be using " << architecture_name << " kernels.";
    logged_architecture = architecture_name;
  }
}
/* Report which BVH layouts this CPU device can traverse.
 *
 * BVH2 is always available. Wider layouts are advertised only when the
 * matching SIMD level is both present on the CPU at runtime and enabled via
 * the debug flags; Embree support is a compile-time option. */
virtual BVHLayoutMask get_bvh_layout_mask() const
{
  BVHLayoutMask supported_layouts = BVH_LAYOUT_BVH2;

  const bool allow_bvh4 = DebugFlags().cpu.has_sse2() && system_cpu_support_sse2();
  const bool allow_bvh8 = DebugFlags().cpu.has_avx2() && system_cpu_support_avx2();

  if (allow_bvh4) {
    supported_layouts |= BVH_LAYOUT_BVH4;
  }
  if (allow_bvh8) {
    supported_layouts |= BVH_LAYOUT_BVH8;
  }
#ifdef WITH_EMBREE
  supported_layouts |= BVH_LAYOUT_EMBREE;
#endif /* WITH_EMBREE */

  return supported_layouts;
}
/* Construct the CPU device.
 *
 * The member-initializer list registers every mega-kernel entry point: each
 * REGISTER_KERNEL(name) initializes the name##_kernel member with all
 * compiled architecture variants (KERNEL_FUNCTIONS expands to the full
 * default/SSE/AVX argument list); the best variant is picked at runtime.
 * The body then configures threading/debug options and registers the split
 * kernels, which are looked up by name at dispatch time. */
CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
    : Device(info_, stats_, profiler_, background_),
      texture_info(this, "__texture_info", MEM_TEXTURE),
#define REGISTER_KERNEL(name) name##_kernel(KERNEL_FUNCTIONS(name))
      REGISTER_KERNEL(path_trace),
      REGISTER_KERNEL(convert_to_half_float),
      REGISTER_KERNEL(convert_to_byte),
      REGISTER_KERNEL(shader),
      REGISTER_KERNEL(filter_divide_shadow),
      REGISTER_KERNEL(filter_get_feature),
      REGISTER_KERNEL(filter_write_feature),
      REGISTER_KERNEL(filter_detect_outliers),
      REGISTER_KERNEL(filter_combine_halves),
      REGISTER_KERNEL(filter_nlm_calc_difference),
      REGISTER_KERNEL(filter_nlm_blur),
      REGISTER_KERNEL(filter_nlm_calc_weight),
      REGISTER_KERNEL(filter_nlm_update_output),
      REGISTER_KERNEL(filter_nlm_normalize),
      REGISTER_KERNEL(filter_construct_transform),
      REGISTER_KERNEL(filter_nlm_construct_gramian),
      REGISTER_KERNEL(filter_finalize),
      REGISTER_KERNEL(data_init)
#undef REGISTER_KERNEL
{
  /* cpu_threads == 0 means "auto": take the scheduler's thread count. */
  if (info.cpu_threads == 0) {
    info.cpu_threads = TaskScheduler::num_threads();
  }

#ifdef WITH_OSL
  kernel_globals.osl = &osl_globals;
#endif

  /* The split kernel is opt-in via debug flags on the CPU device. */
  use_split_kernel = DebugFlags().cpu.split_kernel;
  if (use_split_kernel) {
    VLOG(1) << "Will be using split kernel.";
  }
  need_texture_info = false;

/* Split kernels are stored in a name-indexed map; each entry again holds
 * all architecture variants via KERNEL_FUNCTIONS. */
#define REGISTER_SPLIT_KERNEL(name) \
  split_kernels[#name] = KernelFunctions<void (*)(KernelGlobals *, KernelData *)>( \
      KERNEL_FUNCTIONS(name))
  REGISTER_SPLIT_KERNEL(path_init);
  REGISTER_SPLIT_KERNEL(scene_intersect);
  REGISTER_SPLIT_KERNEL(lamp_emission);
  REGISTER_SPLIT_KERNEL(do_volume);
  REGISTER_SPLIT_KERNEL(queue_enqueue);
  REGISTER_SPLIT_KERNEL(indirect_background);
  REGISTER_SPLIT_KERNEL(shader_setup);
  REGISTER_SPLIT_KERNEL(shader_sort);
  REGISTER_SPLIT_KERNEL(shader_eval);
  REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
  REGISTER_SPLIT_KERNEL(subsurface_scatter);
  REGISTER_SPLIT_KERNEL(direct_lighting);
  REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
  REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
  REGISTER_SPLIT_KERNEL(enqueue_inactive);
  REGISTER_SPLIT_KERNEL(next_iteration_setup);
  REGISTER_SPLIT_KERNEL(indirect_subsurface);
  REGISTER_SPLIT_KERNEL(buffer_update);
#undef REGISTER_SPLIT_KERNEL
#undef KERNEL_FUNCTIONS
}