CPUDevice(DeviceInfo& info, Stats &stats, bool background) : Device(info, stats, background) { #ifdef WITH_OSL kernel_globals.osl = &osl_globals; #endif /* do now to avoid thread issues */ system_cpu_support_sse2(); system_cpu_support_sse3(); system_cpu_support_sse41(); system_cpu_support_avx(); system_cpu_support_avx2(); #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 if(system_cpu_support_avx2()) { VLOG(1) << "Will be using AVX2 kernels."; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { VLOG(1) << "Will be using AVX kernels."; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { VLOG(1) << "Will be using SSE4.1 kernels."; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 if(system_cpu_support_sse3()) { VLOG(1) << "Will be using SSE3kernels."; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 if(system_cpu_support_sse2()) { VLOG(1) << "Will be using SSE2 kernels."; } else #endif { VLOG(1) << "Will be using regular kernels."; } }
/* Select the most specialized kernel entry point that both the runtime
 * debug flags allow and the running CPU supports; fall back to the plain
 * (non-vectorized) kernel otherwise.
 *
 * Note: each optional #ifdef branch ends in a dangling `else` that binds to
 * the next enabled branch, forming one if/else cascade ordered from widest
 * ISA (AVX2) down to SSE2. The last (SSE2) branch needs no `else` because
 * `kernel` was already initialized to the default above. */
KernelFunctions(F kernel_default,
                F kernel_sse2,
                F kernel_sse3,
                F kernel_sse41,
                F kernel_avx,
                F kernel_avx2)
{
  const char *architecture_name = "default";
  kernel = kernel_default;

  /* Silence potential warnings about unused variables
   * when compiling without some architectures. */
  (void)kernel_sse2;
  (void)kernel_sse3;
  (void)kernel_sse41;
  (void)kernel_avx;
  (void)kernel_avx2;

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
  if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
    architecture_name = "AVX2";
    kernel = kernel_avx2;
  }
  else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
  if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
    architecture_name = "AVX";
    kernel = kernel_avx;
  }
  else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
  if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
    architecture_name = "SSE4.1";
    kernel = kernel_sse41;
  }
  else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
  if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
    architecture_name = "SSE3";
    kernel = kernel_sse3;
  }
  else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
  if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
    architecture_name = "SSE2";
    kernel = kernel_sse2;
  }
#endif

  /* Log only when the selected architecture differs from the last one
   * logged, so repeated instantiations do not spam the log. */
  if (strcmp(architecture_name, logged_architecture) != 0) {
    VLOG(1) << "Will be using " << architecture_name << " kernels.";
    logged_architecture = architecture_name;
  }
}
/* Build a space-separated string of the SIMD feature sets supported by the
 * running CPU (e.g. "SSE2 SSE3 SSE41 AVX AVX2"); trailing space trimmed.
 *
 * Fix: previously `capabilities[capabilities.size() - 1]` was evaluated even
 * when the string was empty (no SIMD support at all), indexing the string at
 * size_t(-1) — undefined behavior. Guard with an empty() check. */
string device_cpu_capabilities(void)
{
	string capabilities = "";
	capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
	capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
	capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
	capabilities += system_cpu_support_avx() ? "AVX " : "";
	capabilities += system_cpu_support_avx2() ? "AVX2" : "";
	/* Trim the trailing space left by the last "... " entry, if any. */
	if(!capabilities.empty() && capabilities[capabilities.size() - 1] == ' ')
		capabilities.resize(capabilities.size() - 1);
	return capabilities;
}
/* Evaluate the shader task on this worker thread: runs the selected shader
 * kernel over [shader_x, shader_x + shader_w) for each sample, updating
 * progress and honoring cancellation between samples.
 *
 * A per-thread copy of kernel_globals is made so thread-local state (and
 * OSL per-thread data, when enabled) does not race with other threads. */
void thread_shader(DeviceTask& task)
{
  KernelGlobals kg = kernel_globals;

#ifdef WITH_OSL
  OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif

  /* Resolved below to the most specialized compiled variant the running
   * CPU supports; each #ifdef branch's trailing `else` binds to the next
   * enabled branch, forming one AVX2 -> ... -> SSE2 -> default cascade. */
  void(*shader_kernel)(KernelGlobals*, uint4*, float4*, int, int, int, int);

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
  if(system_cpu_support_avx2())
    shader_kernel = kernel_cpu_avx2_shader;
  else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
  if(system_cpu_support_avx())
    shader_kernel = kernel_cpu_avx_shader;
  else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
  if(system_cpu_support_sse41())
    shader_kernel = kernel_cpu_sse41_shader;
  else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
  if(system_cpu_support_sse3())
    shader_kernel = kernel_cpu_sse3_shader;
  else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
  if(system_cpu_support_sse2())
    shader_kernel = kernel_cpu_sse2_shader;
  else
#endif
    shader_kernel = kernel_cpu_shader;

  for(int sample = 0; sample < task.num_samples; sample++) {
    for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
      shader_kernel(&kg,
                    (uint4*)task.shader_input,
                    (float4*)task.shader_output,
                    task.shader_eval_type,
                    x,
                    task.offset,
                    sample);

    /* Check cancellation once per sample, not per pixel. */
    if(task.get_cancel() || task_pool.canceled())
      break;

    task.update_progress(NULL);
  }

#ifdef WITH_OSL
  OSLShader::thread_free(&kg);
#endif
}
/* Constructor: point the kernel globals at this device's OSL state (when
 * compiled with OSL support) and warm up the CPU feature queries while we
 * are still single-threaded, so later reads from worker threads are safe. */
CPUDevice(DeviceInfo& info, Stats &stats, bool background)
: Device(info, stats, background)
{
#ifdef WITH_OSL
	kernel_globals.osl = &osl_globals;
#endif

	/* do now to avoid thread issues */
	(void)system_cpu_support_sse2();
	(void)system_cpu_support_sse3();
	(void)system_cpu_support_sse41();
	(void)system_cpu_support_avx();
	(void)system_cpu_support_avx2();
}
/* Report which BVH layouts this CPU backend can traverse. BVH2 is always
 * available; wider layouts are added only when both the debug flags and the
 * CPU's instruction set allow them, and Embree when compiled in. */
virtual BVHLayoutMask get_bvh_layout_mask() const
{
  /* Baseline: the two-wide BVH works on every CPU. */
  BVHLayoutMask supported_layouts = BVH_LAYOUT_BVH2;

  const bool use_bvh4 = DebugFlags().cpu.has_sse2() && system_cpu_support_sse2();
  if (use_bvh4) {
    supported_layouts |= BVH_LAYOUT_BVH4;
  }

  const bool use_bvh8 = DebugFlags().cpu.has_avx2() && system_cpu_support_avx2();
  if (use_bvh8) {
    supported_layouts |= BVH_LAYOUT_BVH8;
  }

#ifdef WITH_EMBREE
  supported_layouts |= BVH_LAYOUT_EMBREE;
#endif /* WITH_EMBREE */

  return supported_layouts;
}
/* Convert the float render buffer of the task's rectangle to the display
 * format: half-float when task.rgba_half is set, byte RGBA otherwise.
 * Pixels are scaled by 1/(sample + 1) to average accumulated samples.
 *
 * Each of the two paths resolves a function pointer through the same
 * #ifdef cascade (dangling `else` binds to the next enabled branch,
 * AVX2 -> AVX -> SSE4.1 -> SSE3 -> SSE2 -> plain fallback). */
void thread_film_convert(DeviceTask& task)
{
  float sample_scale = 1.0f/(task.sample + 1);

  if(task.rgba_half) {
    /* Half-float output path. */
    void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
    if(system_cpu_support_avx2()) {
      convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
    }
    else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
    if(system_cpu_support_avx()) {
      convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
    }
    else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
    if(system_cpu_support_sse41()) {
      convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
    }
    else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
    if(system_cpu_support_sse3()) {
      convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
    }
    else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
    if(system_cpu_support_sse2()) {
      convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
    }
    else
#endif
    {
      convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
    }

    for(int y = task.y; y < task.y + task.h; y++)
      for(int x = task.x; x < task.x + task.w; x++)
        convert_to_half_float_kernel(&kernel_globals,
                                     (uchar4*)task.rgba_half,
                                     (float*)task.buffer,
                                     sample_scale,
                                     x, y,
                                     task.offset,
                                     task.stride);
  }
  else {
    /* Byte RGBA output path. */
    void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
    if(system_cpu_support_avx2()) {
      convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
    }
    else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
    if(system_cpu_support_avx()) {
      convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
    }
    else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
    if(system_cpu_support_sse41()) {
      convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
    }
    else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
    if(system_cpu_support_sse3()) {
      convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
    }
    else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
    if(system_cpu_support_sse2()) {
      convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
    }
    else
#endif
    {
      convert_to_byte_kernel = kernel_cpu_convert_to_byte;
    }

    for(int y = task.y; y < task.y + task.h; y++)
      for(int x = task.x; x < task.x + task.w; x++)
        convert_to_byte_kernel(&kernel_globals,
                               (uchar4*)task.rgba_byte,
                               (float*)task.buffer,
                               sample_scale,
                               x, y,
                               task.offset,
                               task.stride);
  }
}
/* Path-trace worker loop: repeatedly acquires render tiles from the task,
 * renders each sample of each tile pixel-by-pixel, reports progress, and
 * releases the tile. Honors cancellation unless the task must drain its
 * finish queue (need_finish_queue).
 *
 * A per-thread copy of kernel_globals is made so thread-local state (and
 * OSL per-thread data, when enabled) does not race with other threads. */
void thread_path_trace(DeviceTask& task)
{
  if(task_pool.canceled()) {
    if(task.need_finish_queue == false)
      return;
  }

  KernelGlobals kg = kernel_globals;

#ifdef WITH_OSL
  OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif

  RenderTile tile;

  /* Resolved below through the usual #ifdef cascade (dangling `else` binds
   * to the next enabled branch) to the widest ISA the CPU supports. */
  void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
  if(system_cpu_support_avx2()) {
    path_trace_kernel = kernel_cpu_avx2_path_trace;
  }
  else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
  if(system_cpu_support_avx()) {
    path_trace_kernel = kernel_cpu_avx_path_trace;
  }
  else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
  if(system_cpu_support_sse41()) {
    path_trace_kernel = kernel_cpu_sse41_path_trace;
  }
  else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
  if(system_cpu_support_sse3()) {
    path_trace_kernel = kernel_cpu_sse3_path_trace;
  }
  else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
  if(system_cpu_support_sse2()) {
    path_trace_kernel = kernel_cpu_sse2_path_trace;
  }
  else
#endif
  {
    path_trace_kernel = kernel_cpu_path_trace;
  }

  while(task.acquire_tile(this, tile)) {
    float *render_buffer = (float*)tile.buffer;
    uint *rng_state = (uint*)tile.rng_state;
    int start_sample = tile.start_sample;
    int end_sample = tile.start_sample + tile.num_samples;

    for(int sample = start_sample; sample < end_sample; sample++) {
      /* Check cancellation once per sample, not per pixel. */
      if(task.get_cancel() || task_pool.canceled()) {
        if(task.need_finish_queue == false)
          break;
      }

      for(int y = tile.y; y < tile.y + tile.h; y++) {
        for(int x = tile.x; x < tile.x + tile.w; x++) {
          path_trace_kernel(&kg, render_buffer, rng_state,
                            sample, x, y, tile.offset, tile.stride);
        }
      }

      /* tile.sample counts completed samples, hence sample + 1. */
      tile.sample = sample + 1;

      task.update_progress(&tile);
    }

    task.release_tile(tile);

    if(task_pool.canceled()) {
      if(task.need_finish_queue == false)
        break;
    }
  }

#ifdef WITH_OSL
  OSLShader::thread_free(&kg);
#endif
}