void path_trace(DeviceTask *task, SplitRenderTile& rtile, int2 max_render_feasible_tile_size) { /* cast arguments to cl types */ cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); cl_mem d_buffer = CL_MEM_PTR(rtile.buffer); cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state); cl_int d_x = rtile.x; cl_int d_y = rtile.y; cl_int d_w = rtile.w; cl_int d_h = rtile.h; cl_int d_offset = rtile.offset; cl_int d_stride = rtile.stride; /* Make sure that set render feasible tile size is a multiple of local * work size dimensions. */ assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0); assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0); size_t global_size[2]; size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X, SPLIT_KERNEL_LOCAL_SIZE_Y}; /* Set the range of samples to be processed for every ray in * path-regeneration logic. */ cl_int start_sample = rtile.start_sample; cl_int end_sample = rtile.start_sample + rtile.num_samples; cl_int num_samples = rtile.num_samples; #ifdef __WORK_STEALING__ global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0]; global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1]; unsigned int num_parallel_samples = 1; #else global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1]; unsigned int num_threads = max_render_feasible_tile_size.x * max_render_feasible_tile_size.y; unsigned int num_tile_columns_possible = num_threads / global_size[1]; /* Estimate number of parallel samples that can be * processed in parallel. */ unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w, rtile.num_samples); /* Wavefront size in AMD is 64. * TODO(sergey): What about other platforms? */ if(num_parallel_samples >= 64) { /* TODO(sergey): Could use generic round-up here. */ num_parallel_samples = (num_parallel_samples / 64) * 64; } assert(num_parallel_samples != 0); global_size[0] = d_w * num_parallel_samples; #endif /* __WORK_STEALING__ */ assert(global_size[0] * global_size[1] <= max_render_feasible_tile_size.x * max_render_feasible_tile_size.y); /* Allocate all required global memory once. */ if(first_tile) { size_t num_global_elements = max_render_feasible_tile_size.x * max_render_feasible_tile_size.y; /* TODO(sergey): This will actually over-allocate if * particular kernel does not support multiclosure. */ size_t shaderdata_size = get_shader_data_size(current_max_closure); #ifdef __WORK_STEALING__ /* Calculate max groups */ size_t max_global_size[2]; size_t tile_x = max_render_feasible_tile_size.x; size_t tile_y = max_render_feasible_tile_size.y; max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0]; max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1]; max_work_groups = (max_global_size[0] * max_global_size[1]) / (local_size[0] * local_size[1]); /* Allocate work_pool_wgs memory. */ work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int)); #endif /* __WORK_STEALING__ */ /* Allocate queue_index memory only once. */ Queue_index = mem_alloc(NUM_QUEUES * sizeof(int)); use_queues_flag = mem_alloc(sizeof(char)); kgbuffer = mem_alloc(get_KernelGlobals_size()); /* Create global buffers for ShaderData. */ sd = mem_alloc(num_global_elements * shaderdata_size); sd_DL_shadow = mem_alloc(num_global_elements * 2 * shaderdata_size); /* Creation of global memory buffers which are shared among * the kernels. */ rng_coop = mem_alloc(num_global_elements * sizeof(RNG)); throughput_coop = mem_alloc(num_global_elements * sizeof(float3)); L_transparent_coop = mem_alloc(num_global_elements * sizeof(float)); PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance)); Ray_coop = mem_alloc(num_global_elements * sizeof(Ray)); PathState_coop = mem_alloc(num_global_elements * sizeof(PathState)); Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection)); AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3)); AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3)); AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray)); BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval)); ISLamp_coop = mem_alloc(num_global_elements * sizeof(int)); LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray)); Intersection_coop_shadow = mem_alloc(2 * num_global_elements * sizeof(Intersection)); #ifdef WITH_CYCLES_DEBUG debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData)); #endif ray_state = mem_alloc(num_global_elements * sizeof(char)); hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char)); assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory"); Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int))); work_array = mem_alloc(num_global_elements * sizeof(unsigned int)); per_sample_output_buffers = mem_alloc(num_global_elements * per_thread_output_buffer_size); } cl_int dQueue_size = global_size[0] * global_size[1]; cl_uint start_arg_index = kernel_set_args(program_data_init(), 0, kgbuffer, sd_DL_shadow, d_data, per_sample_output_buffers, d_rng_state, rng_coop, throughput_coop, L_transparent_coop, PathRadiance_coop, Ray_coop, PathState_coop, Intersection_coop_shadow, ray_state); /* TODO(sergey): Avoid map lookup here. */ #define KERNEL_TEX(type, ttype, name) \ set_kernel_arg_mem(program_data_init(), &start_arg_index, #name); #include "kernel_textures.h" #undef KERNEL_TEX start_arg_index += kernel_set_args(program_data_init(), start_arg_index, start_sample, d_x, d_y, d_w, d_h, d_offset, d_stride, rtile.rng_state_offset_x, rtile.rng_state_offset_y, rtile.buffer_rng_state_stride, Queue_data, Queue_index, dQueue_size, use_queues_flag, work_array, #ifdef __WORK_STEALING__ work_pool_wgs, num_samples, #endif #ifdef WITH_CYCLES_DEBUG debugdata_coop, #endif num_parallel_samples); kernel_set_args(program_scene_intersect(), 0, kgbuffer, d_data, rng_coop, Ray_coop, PathState_coop, Intersection_coop, ray_state, d_w, d_h, Queue_data, Queue_index, dQueue_size, use_queues_flag, #ifdef WITH_CYCLES_DEBUG debugdata_coop, #endif num_parallel_samples); kernel_set_args(program_lamp_emission(), 0, kgbuffer, d_data, throughput_coop, PathRadiance_coop, Ray_coop, PathState_coop, Intersection_coop, ray_state, d_w, d_h, Queue_data, Queue_index, dQueue_size, use_queues_flag, num_parallel_samples); kernel_set_args(program_queue_enqueue(), 0, Queue_data, Queue_index, ray_state, dQueue_size); kernel_set_args(program_background_buffer_update(), 0, kgbuffer, d_data, per_sample_output_buffers, d_rng_state, rng_coop, throughput_coop, PathRadiance_coop, Ray_coop, PathState_coop, L_transparent_coop, ray_state, d_w, d_h, d_x, d_y, d_stride, rtile.rng_state_offset_x, rtile.rng_state_offset_y, rtile.buffer_rng_state_stride, work_array, Queue_data, Queue_index, dQueue_size, end_sample, start_sample, #ifdef __WORK_STEALING__ work_pool_wgs, num_samples, #endif #ifdef WITH_CYCLES_DEBUG debugdata_coop, #endif num_parallel_samples); kernel_set_args(program_shader_eval(), 0, kgbuffer, d_data, sd, rng_coop, Ray_coop, PathState_coop, Intersection_coop, ray_state, Queue_data, Queue_index, dQueue_size); kernel_set_args(program_holdout_emission_blurring_pathtermination_ao(), 0, kgbuffer, d_data, sd, per_sample_output_buffers, rng_coop, throughput_coop, L_transparent_coop, PathRadiance_coop, PathState_coop, Intersection_coop, AOAlpha_coop, AOBSDF_coop, AOLightRay_coop, d_w, d_h, d_x, d_y, d_stride, ray_state, work_array, Queue_data, Queue_index, dQueue_size, #ifdef __WORK_STEALING__ start_sample, #endif num_parallel_samples); kernel_set_args(program_direct_lighting(), 0, kgbuffer, d_data, sd, rng_coop, PathState_coop, ISLamp_coop, LightRay_coop, BSDFEval_coop, ray_state, Queue_data, Queue_index, dQueue_size); kernel_set_args(program_shadow_blocked(), 0, kgbuffer, d_data, PathState_coop, LightRay_coop, AOLightRay_coop, ray_state, Queue_data, Queue_index, dQueue_size); kernel_set_args(program_next_iteration_setup(), 0, kgbuffer, d_data, sd, rng_coop, throughput_coop, PathRadiance_coop, Ray_coop, PathState_coop, LightRay_coop, ISLamp_coop, BSDFEval_coop, AOLightRay_coop, AOBSDF_coop, AOAlpha_coop, ray_state, Queue_data, Queue_index, dQueue_size, use_queues_flag); kernel_set_args(program_sum_all_radiance(), 0, d_data, d_buffer, per_sample_output_buffers, num_parallel_samples, d_w, d_h, d_stride, rtile.buffer_offset_x, rtile.buffer_offset_y, rtile.buffer_rng_state_stride, start_sample); /* Macro for Enqueuing split kernels. */ #define GLUE(a, b) a ## b #define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \ { \ ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \ GLUE(program_, \ kernelName)(), \ 2, \ NULL, \ globalSize, \ localSize, \ 0, \ NULL, \ NULL); \ opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); \ if(ciErr != CL_SUCCESS) { \ string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", \ clewErrorString(ciErr)); \ opencl_error(message); \ return; \ } \ } (void) 0 /* Enqueue ckPathTraceKernel_data_init kernel. */ ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size); bool activeRaysAvailable = true; /* Record number of time host intervention has been made */ unsigned int numHostIntervention = 0; unsigned int numNextPathIterTimes = PathIteration_times; bool canceled = false; while(activeRaysAvailable) { /* Twice the global work size of other kernels for * ckPathTraceKernel_shadow_blocked_direct_lighting. */ size_t global_size_shadow_blocked[2]; global_size_shadow_blocked[0] = global_size[0] * 2; global_size_shadow_blocked[1] = global_size[1]; /* Do path-iteration in host [Enqueue Path-iteration kernels. */ for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) { ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size); ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size); ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size); ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size); if(task->get_cancel()) { canceled = true; break; } } /* Read ray-state into Host memory to decide if we should exit * path-iteration in host. */ ciErr = clEnqueueReadBuffer(cqCommandQueue, ray_state, CL_TRUE, 0, global_size[0] * global_size[1] * sizeof(char), hostRayStateArray, 0, NULL, NULL); assert(ciErr == CL_SUCCESS); activeRaysAvailable = false; for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) { if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) { /* Not all rays are RAY_INACTIVE. */ activeRaysAvailable = true; break; } } if(activeRaysAvailable) { numHostIntervention++; PathIteration_times = PATH_ITER_INC_FACTOR; /* Host intervention done before all rays become RAY_INACTIVE; * Set do more initial iterations for the next tile. */ numNextPathIterTimes += PATH_ITER_INC_FACTOR; } if(task->get_cancel()) { canceled = true; break; } } /* Execute SumALLRadiance kernel to accumulate radiance calculated in * per_sample_output_buffers into RenderTile's output buffer. */ if(!canceled) { size_t sum_all_radiance_local_size[2] = {16, 16}; size_t sum_all_radiance_global_size[2]; sum_all_radiance_global_size[0] = (((d_w - 1) / sum_all_radiance_local_size[0]) + 1) * sum_all_radiance_local_size[0]; sum_all_radiance_global_size[1] = (((d_h - 1) / sum_all_radiance_local_size[1]) + 1) * sum_all_radiance_local_size[1]; ENQUEUE_SPLIT_KERNEL(sum_all_radiance, sum_all_radiance_global_size, sum_all_radiance_local_size); } #undef ENQUEUE_SPLIT_KERNEL #undef GLUE if(numHostIntervention == 0) { /* This means that we are executing kernel more than required * Must avoid this for the next sample/tile. */ PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ? PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR; } else { /* Number of path-iterations done for this tile is set as * Initial path-iteration times for the next tile */ PathIteration_times = numNextPathIterTimes; } first_tile = false; }
bool DeviceSplitKernel::path_trace(DeviceTask *task, RenderTile& tile, device_memory& kgbuffer, device_memory& kernel_data) { if(device->have_error()) { return false; } /* Get local size */ size_t local_size[2]; { int2 lsize = split_kernel_local_size(); local_size[0] = lsize[0]; local_size[1] = lsize[1]; } /* Number of elements in the global state buffer */ int num_global_elements = global_size[0] * global_size[1]; /* Allocate all required global memory once. */ if(first_tile) { first_tile = false; /* Set gloabl size */ { int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task); /* Make sure that set work size is a multiple of local * work size dimensions. */ global_size[0] = round_up(gsize[0], local_size[0]); global_size[1] = round_up(gsize[1], local_size[1]); } num_global_elements = global_size[0] * global_size[1]; assert(num_global_elements % WORK_POOL_SIZE == 0); /* Calculate max groups */ /* Denotes the maximum work groups possible w.r.t. current requested tile size. */ unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU : WORK_POOL_SIZE_GPU; unsigned int max_work_groups = num_global_elements / work_pool_size + 1; /* Allocate work_pool_wgs memory. */ work_pool_wgs.alloc_to_device(max_work_groups); queue_index.alloc_to_device(NUM_QUEUES); use_queues_flag.alloc_to_device(1); split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements)); ray_state.alloc(num_global_elements); } #define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \ if(device->have_error()) { \ return false; \ } \ if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \ return false; \ } tile.sample = tile.start_sample; /* for exponential increase between tile updates */ int time_multiplier = 1; while(tile.sample < tile.start_sample + tile.num_samples) { /* to keep track of how long it takes to run a number of samples */ double start_time = time_dt(); /* initial guess to start rolling average */ const int initial_num_samples = 1; /* approx number of samples per second */ int samples_per_second = (avg_time_per_sample > 0.0) ? int(double(time_multiplier) / avg_time_per_sample) + 1 : initial_num_samples; RenderTile subtile = tile; subtile.start_sample = tile.sample; subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample); if(device->have_error()) { return false; } /* reset state memory here as global size for data_init * kernel might not be large enough to do in kernel */ work_pool_wgs.zero_to_device(); split_data.zero_to_device(); ray_state.zero_to_device(); if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), subtile, num_global_elements, kgbuffer, kernel_data, split_data, ray_state, queue_index, use_queues_flag, work_pool_wgs)) { return false; } ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size); bool activeRaysAvailable = true; double cancel_time = DBL_MAX; while(activeRaysAvailable) { /* Do path-iteration in host [Enqueue Path-iteration kernels. */ for(int PathIter = 0; PathIter < 16; PathIter++) { ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size); ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size); ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size); ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size); ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size); ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size); ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size); ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size); ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size); ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size); ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size); ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size); if(task->get_cancel() && cancel_time == DBL_MAX) { /* Wait up to twice as many seconds for current samples to finish * to avoid artifacts in render result from ending too soon. */ cancel_time = time_dt() + 2.0 * time_multiplier; } if(time_dt() > cancel_time) { return true; } } /* Decide if we should exit path-iteration in host. */ ray_state.copy_from_device(0, global_size[0] * global_size[1], 1); activeRaysAvailable = false; for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) { if(!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) { if(IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) { /* Something went wrong, abort to avoid looping endlessly. */ device->set_error("Split kernel error: invalid ray state"); return false; } /* Not all rays are RAY_INACTIVE. */ activeRaysAvailable = true; break; } } if(time_dt() > cancel_time) { return true; } } double time_per_sample = ((time_dt()-start_time) / subtile.num_samples); if(avg_time_per_sample == 0.0) { /* start rolling average */ avg_time_per_sample = time_per_sample; } else { avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample; } #undef ENQUEUE_SPLIT_KERNEL tile.sample += subtile.num_samples; task->update_progress(&tile, tile.w*tile.h*subtile.num_samples); time_multiplier = min(time_multiplier << 1, 10); if(task->get_cancel()) { return true; } } return true; }