void path_trace(RenderTile& rtile, int sample)
{
	/* Cast arguments to cl types. */
	cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
	cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
	cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
	cl_int d_x = rtile.x;
	cl_int d_y = rtile.y;
	cl_int d_w = rtile.w;
	cl_int d_h = rtile.h;
	cl_int d_offset = rtile.offset;
	cl_int d_stride = rtile.stride;

	/* Sample arguments. */
	cl_int d_sample = sample;

	cl_kernel ckPathTraceKernel = path_trace_program(ustring("path_trace"));

	cl_uint start_arg_index = kernel_set_args(ckPathTraceKernel,
	                                          0,
	                                          d_data,
	                                          d_buffer,
	                                          d_rng_state);

#define KERNEL_TEX(type, ttype, name) \
	set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
#include "kernel_textures.h"
#undef KERNEL_TEX

	start_arg_index += kernel_set_args(ckPathTraceKernel,
	                                   start_arg_index,
	                                   d_sample,
	                                   d_x, d_y, d_w, d_h,
	                                   d_offset,
	                                   d_stride);

	enqueue_kernel(ckPathTraceKernel, d_w, d_h);
}
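/* The KERNEL_TEX block above is an X-macro: kernel_textures.h lists every
 * global texture as KERNEL_TEX(type, ttype, name), so temporarily redefining
 * KERNEL_TEX before including it stamps out one set_kernel_arg_mem() call per
 * texture. For an illustrative (hypothetical) entry
 *
 *   KERNEL_TEX(float4, texture_float4, __tri_verts)
 *
 * the preprocessor emits
 *
 *   set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, "__tri_verts");
 *
 * which binds the buffer registered under that name as the next kernel
 * argument and advances start_arg_index. */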
void path_trace(RenderTile& rtile, int sample)
{
	scoped_timer timer(&rtile.buffers->render_time);

	/* Cast arguments to cl types. */
	cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
	cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
	cl_int d_x = rtile.x;
	cl_int d_y = rtile.y;
	cl_int d_w = rtile.w;
	cl_int d_h = rtile.h;
	cl_int d_offset = rtile.offset;
	cl_int d_stride = rtile.stride;

	/* Sample arguments. */
	cl_int d_sample = sample;

	cl_kernel ckPathTraceKernel = path_trace_program(ustring("path_trace"));

	cl_uint start_arg_index = kernel_set_args(ckPathTraceKernel,
	                                          0,
	                                          d_data,
	                                          d_buffer);

	set_kernel_arg_buffers(ckPathTraceKernel, &start_arg_index);

	start_arg_index += kernel_set_args(ckPathTraceKernel,
	                                   start_arg_index,
	                                   d_sample,
	                                   d_x, d_y, d_w, d_h,
	                                   d_offset,
	                                   d_stride);

	enqueue_kernel(ckPathTraceKernel, d_w, d_h);
}
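/* This variant folds the per-texture argument binding into a single
 * set_kernel_arg_buffers() call. Its real definition lives elsewhere in the
 * device code; a minimal sketch of what such a helper is assumed to do,
 * reusing the same X-macro trick as the earlier variant, would be:
 *
 *   void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
 *   {
 *   #define KERNEL_TEX(type, ttype, name) \
 *       set_kernel_arg_mem(kernel, narg, #name);
 *   #include "kernel_textures.h"
 *   #undef KERNEL_TEX
 *   }
 */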
void path_trace(DeviceTask *task,
                SplitRenderTile& rtile,
                int2 max_render_feasible_tile_size)
{
	/* Cast arguments to cl types. */
	cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
	cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
	cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
	cl_int d_x = rtile.x;
	cl_int d_y = rtile.y;
	cl_int d_w = rtile.w;
	cl_int d_h = rtile.h;
	cl_int d_offset = rtile.offset;
	cl_int d_stride = rtile.stride;

	/* Make sure the maximum render-feasible tile size is a multiple of the
	 * local work size dimensions.
	 */
	assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
	assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);

	size_t global_size[2];
	size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
	                        SPLIT_KERNEL_LOCAL_SIZE_Y};

	/* Set the range of samples to be processed for every ray in the
	 * path-regeneration logic.
	 */
	cl_int start_sample = rtile.start_sample;
	cl_int end_sample = rtile.start_sample + rtile.num_samples;
	cl_int num_samples = rtile.num_samples;

#ifdef __WORK_STEALING__
	global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
	global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
	unsigned int num_parallel_samples = 1;
#else
	global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
	unsigned int num_threads = max_render_feasible_tile_size.x *
	                           max_render_feasible_tile_size.y;
	unsigned int num_tile_columns_possible = num_threads / global_size[1];
	/* Estimate the number of samples that can be processed in parallel. */
	unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
	                                        rtile.num_samples);
	/* Wavefront size in AMD is 64.
	 * TODO(sergey): What about other platforms?
	 */
	if(num_parallel_samples >= 64) {
		/* TODO(sergey): Could use generic round-up here. */
		num_parallel_samples = (num_parallel_samples / 64) * 64;
	}
	assert(num_parallel_samples != 0);

	global_size[0] = d_w * num_parallel_samples;
#endif  /* __WORK_STEALING__ */

	assert(global_size[0] * global_size[1] <=
	       max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);

	/* Allocate all required global memory once. */
	if(first_tile) {
		size_t num_global_elements = max_render_feasible_tile_size.x *
		                             max_render_feasible_tile_size.y;
		/* TODO(sergey): This will actually over-allocate if
		 * particular kernel does not support multiclosure.
		 */
		size_t shaderdata_size = get_shader_data_size(current_max_closure);

#ifdef __WORK_STEALING__
		/* Calculate max groups. */
		size_t max_global_size[2];
		size_t tile_x = max_render_feasible_tile_size.x;
		size_t tile_y = max_render_feasible_tile_size.y;
		max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
		max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
		max_work_groups = (max_global_size[0] * max_global_size[1]) /
		                  (local_size[0] * local_size[1]);
		/* Allocate work_pool_wgs memory. */
		work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
#endif  /* __WORK_STEALING__ */

		/* Allocate queue_index memory only once. */
		Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
		use_queues_flag = mem_alloc(sizeof(char));
		kgbuffer = mem_alloc(get_KernelGlobals_size());

		/* Create global buffers for ShaderData. */
		sd = mem_alloc(num_global_elements * shaderdata_size);
		sd_DL_shadow = mem_alloc(num_global_elements * 2 * shaderdata_size);
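		/* The 2x factor on sd_DL_shadow (and on Intersection_coop_shadow
		 * below) mirrors the doubled global work size used further down for
		 * the shadow_blocked kernel, which handles the direct-lighting and
		 * AO shadow rays of each path as two separate work items. */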
		/* Create global memory buffers which are shared among the kernels. */
		rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
		throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
		L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
		PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
		Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
		PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
		Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
		AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
		AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
		AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
		BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
		ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
		LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
		Intersection_coop_shadow = mem_alloc(2 * num_global_elements * sizeof(Intersection));

#ifdef WITH_CYCLES_DEBUG
		debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
#endif

		ray_state = mem_alloc(num_global_elements * sizeof(char));

		hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
		assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");

		Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int) + sizeof(int)));
		work_array = mem_alloc(num_global_elements * sizeof(unsigned int));

		per_sample_output_buffers = mem_alloc(num_global_elements *
		                                      per_thread_output_buffer_size);
	}

	cl_int dQueue_size = global_size[0] * global_size[1];

	cl_uint start_arg_index =
		kernel_set_args(program_data_init(),
		                0,
		                kgbuffer,
		                sd_DL_shadow,
		                d_data,
		                per_sample_output_buffers,
		                d_rng_state,
		                rng_coop,
		                throughput_coop,
		                L_transparent_coop,
		                PathRadiance_coop,
		                Ray_coop,
		                PathState_coop,
		                Intersection_coop_shadow,
		                ray_state);

	/* TODO(sergey): Avoid map lookup here. */
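	/* Each set_kernel_arg_mem() expansion below resolves the texture name to
	 * its cl_mem handle through the device's memory map, which is the lookup
	 * the TODO above refers to. */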
#define KERNEL_TEX(type, ttype, name) \
	set_kernel_arg_mem(program_data_init(), &start_arg_index, #name);
#include "kernel_textures.h"
#undef KERNEL_TEX

	start_arg_index +=
		kernel_set_args(program_data_init(),
		                start_arg_index,
		                start_sample,
		                d_x, d_y, d_w, d_h,
		                d_offset, d_stride,
		                rtile.rng_state_offset_x,
		                rtile.rng_state_offset_y,
		                rtile.buffer_rng_state_stride,
		                Queue_data, Queue_index, dQueue_size,
		                use_queues_flag,
		                work_array,
#ifdef __WORK_STEALING__
		                work_pool_wgs, num_samples,
#endif
#ifdef WITH_CYCLES_DEBUG
		                debugdata_coop,
#endif
		                num_parallel_samples);

	kernel_set_args(program_scene_intersect(), 0,
	                kgbuffer, d_data,
	                rng_coop, Ray_coop, PathState_coop, Intersection_coop,
	                ray_state, d_w, d_h,
	                Queue_data, Queue_index, dQueue_size, use_queues_flag,
#ifdef WITH_CYCLES_DEBUG
	                debugdata_coop,
#endif
	                num_parallel_samples);

	kernel_set_args(program_lamp_emission(), 0,
	                kgbuffer, d_data,
	                throughput_coop, PathRadiance_coop, Ray_coop, PathState_coop,
	                Intersection_coop, ray_state, d_w, d_h,
	                Queue_data, Queue_index, dQueue_size, use_queues_flag,
	                num_parallel_samples);

	kernel_set_args(program_queue_enqueue(), 0,
	                Queue_data, Queue_index, ray_state, dQueue_size);

	kernel_set_args(program_background_buffer_update(), 0,
	                kgbuffer, d_data,
	                per_sample_output_buffers, d_rng_state,
	                rng_coop, throughput_coop, PathRadiance_coop, Ray_coop,
	                PathState_coop, L_transparent_coop, ray_state,
	                d_w, d_h, d_x, d_y, d_stride,
	                rtile.rng_state_offset_x,
	                rtile.rng_state_offset_y,
	                rtile.buffer_rng_state_stride,
	                work_array,
	                Queue_data, Queue_index, dQueue_size,
	                end_sample, start_sample,
#ifdef __WORK_STEALING__
	                work_pool_wgs, num_samples,
#endif
#ifdef WITH_CYCLES_DEBUG
	                debugdata_coop,
#endif
	                num_parallel_samples);

	kernel_set_args(program_shader_eval(), 0,
	                kgbuffer, d_data, sd,
	                rng_coop, Ray_coop, PathState_coop, Intersection_coop,
	                ray_state,
	                Queue_data, Queue_index, dQueue_size);

	kernel_set_args(program_holdout_emission_blurring_pathtermination_ao(), 0,
	                kgbuffer, d_data, sd,
	                per_sample_output_buffers,
	                rng_coop, throughput_coop, L_transparent_coop,
	                PathRadiance_coop, PathState_coop, Intersection_coop,
	                AOAlpha_coop, AOBSDF_coop, AOLightRay_coop,
	                d_w, d_h, d_x, d_y, d_stride,
	                ray_state, work_array,
	                Queue_data, Queue_index, dQueue_size,
#ifdef __WORK_STEALING__
	                start_sample,
#endif
	                num_parallel_samples);

	kernel_set_args(program_direct_lighting(), 0,
	                kgbuffer, d_data, sd,
	                rng_coop, PathState_coop,
	                ISLamp_coop, LightRay_coop, BSDFEval_coop, ray_state,
	                Queue_data, Queue_index, dQueue_size);

	kernel_set_args(program_shadow_blocked(), 0,
	                kgbuffer, d_data,
	                PathState_coop, LightRay_coop, AOLightRay_coop, ray_state,
	                Queue_data, Queue_index, dQueue_size);

	kernel_set_args(program_next_iteration_setup(), 0,
	                kgbuffer, d_data, sd,
	                rng_coop, throughput_coop, PathRadiance_coop, Ray_coop,
	                PathState_coop, LightRay_coop, ISLamp_coop, BSDFEval_coop,
	                AOLightRay_coop, AOBSDF_coop, AOAlpha_coop, ray_state,
	                Queue_data, Queue_index, dQueue_size,
	                use_queues_flag);

	kernel_set_args(program_sum_all_radiance(), 0,
	                d_data, d_buffer,
	                per_sample_output_buffers,
	                num_parallel_samples,
	                d_w, d_h, d_stride,
	                rtile.buffer_offset_x,
	                rtile.buffer_offset_y,
	                rtile.buffer_rng_state_stride,
	                start_sample);

	/* Macro for enqueuing split kernels. */
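	/* GLUE(program_, kernelName)() in the macro below is plain token pasting:
	 * for example, ENQUEUE_SPLIT_KERNEL(scene_intersect, ...) resolves to a
	 * call through program_scene_intersect(). The trailing (void) 0 forces
	 * callers to end each invocation with a semicolon. */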
#define GLUE(a, b) a ## b
#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
	{ \
		ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \
		                               GLUE(program_, kernelName)(), \
		                               2, \
		                               NULL, \
		                               globalSize, \
		                               localSize, \
		                               0, \
		                               NULL, \
		                               NULL); \
		opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); \
		if(ciErr != CL_SUCCESS) { \
			string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", \
			                               clewErrorString(ciErr)); \
			opencl_error(message); \
			return; \
		} \
	} (void) 0

	/* Enqueue ckPathTraceKernel_data_init kernel. */
	ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size);
	bool activeRaysAvailable = true;

	/* Record the number of times host intervention has been made. */
	unsigned int numHostIntervention = 0;
	unsigned int numNextPathIterTimes = PathIteration_times;
	bool canceled = false;
	while(activeRaysAvailable) {
		/* Twice the global work size of other kernels for
		 * ckPathTraceKernel_shadow_blocked_direct_lighting.
		 */
		size_t global_size_shadow_blocked[2];
		global_size_shadow_blocked[0] = global_size[0] * 2;
		global_size_shadow_blocked[1] = global_size[1];

		/* Do path-iteration on the host [enqueue path-iteration kernels]. */
		for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) {
			ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
			ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
			ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
			ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
			ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
			ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao,
			                     global_size, local_size);
			ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
			ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked,
			                     local_size);
			ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);

			if(task->get_cancel()) {
				canceled = true;
				break;
			}
		}

		/* Read the ray state back into host memory to decide whether to exit
		 * path-iteration on the host.
		 */
		ciErr = clEnqueueReadBuffer(cqCommandQueue,
		                            ray_state,
		                            CL_TRUE,
		                            0,
		                            global_size[0] * global_size[1] * sizeof(char),
		                            hostRayStateArray,
		                            0,
		                            NULL,
		                            NULL);
		assert(ciErr == CL_SUCCESS);

		activeRaysAvailable = false;

		for(int rayStateIter = 0;
		    rayStateIter < global_size[0] * global_size[1];
		    ++rayStateIter)
		{
			if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) {
				/* Not all rays are RAY_INACTIVE. */
				activeRaysAvailable = true;
				break;
			}
		}

		if(activeRaysAvailable) {
			numHostIntervention++;
			PathIteration_times = PATH_ITER_INC_FACTOR;
			/* Host intervention was needed before all rays became
			 * RAY_INACTIVE, so do more initial iterations for the next tile.
			 */
			numNextPathIterTimes += PATH_ITER_INC_FACTOR;
		}

		if(task->get_cancel()) {
			canceled = true;
			break;
		}
	}

	/* Execute the sum_all_radiance kernel to accumulate the radiance
	 * calculated in per_sample_output_buffers into the RenderTile's output
	 * buffer.
	 */
	if(!canceled) {
		size_t sum_all_radiance_local_size[2] = {16, 16};
		size_t sum_all_radiance_global_size[2];
		sum_all_radiance_global_size[0] =
			(((d_w - 1) / sum_all_radiance_local_size[0]) + 1) *
			sum_all_radiance_local_size[0];
		sum_all_radiance_global_size[1] =
			(((d_h - 1) / sum_all_radiance_local_size[1]) + 1) *
			sum_all_radiance_local_size[1];
		ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
		                     sum_all_radiance_global_size,
		                     sum_all_radiance_local_size);
	}

#undef ENQUEUE_SPLIT_KERNEL
#undef GLUE
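	/* Adapt how many path iterations are enqueued per host round trip. As an
	 * illustration, assuming PATH_ITER_INC_FACTOR were 8: a tile that needed
	 * two host interventions raises numNextPathIterTimes by 16, so the next
	 * tile runs that many iterations before its first ray-state readback,
	 * while a tile that needed none backs off by 8. */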
	if(numHostIntervention == 0) {
		/* This means we have been executing the kernels more often than
		 * required; avoid this for the next sample/tile.
		 */
		PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0)
		                          ? PATH_ITER_INC_FACTOR
		                          : numNextPathIterTimes - PATH_ITER_INC_FACTOR;
	}
	else {
		/* The number of path iterations done for this tile is used as the
		 * initial path-iteration count for the next tile.
		 */
		PathIteration_times = numNextPathIterTimes;
	}

	first_tile = false;
}
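/* The round-up logic recurring above (both the wavefront clamp with its
 * "could use generic round-up here" TODO and the ((n - 1) / size + 1) * size
 * global-size computations) could be factored into helpers along these lines;
 * round_down/round_up are hypothetical names, not existing utilities here:
 *
 *   static size_t round_down(size_t n, size_t multiple)
 *   {
 *       return (n / multiple) * multiple;  // round_down(100, 64) == 64
 *   }
 *
 *   static size_t round_up(size_t n, size_t multiple)
 *   {
 *       return ((n + multiple - 1) / multiple) * multiple;  // round_up(100, 64) == 128
 *   }
 */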