Example #1
	void path_trace(RenderTile& rtile, int sample)
	{
		/* Cast arguments to cl types. */
		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
		cl_int d_x = rtile.x;
		cl_int d_y = rtile.y;
		cl_int d_w = rtile.w;
		cl_int d_h = rtile.h;
		cl_int d_offset = rtile.offset;
		cl_int d_stride = rtile.stride;

		/* Sample arguments. */
		cl_int d_sample = sample;

		cl_kernel ckPathTraceKernel = path_trace_program(ustring("path_trace"));

		cl_uint start_arg_index =
			kernel_set_args(ckPathTraceKernel,
			                0,
			                d_data,
			                d_buffer,
			                d_rng_state);

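/* Bind each texture listed in kernel_textures.h as a kernel argument. */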
#define KERNEL_TEX(type, ttype, name) \
		set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
#include "kernel_textures.h"
#undef KERNEL_TEX

		start_arg_index += kernel_set_args(ckPathTraceKernel,
		                                   start_arg_index,
		                                   d_sample,
		                                   d_x,
		                                   d_y,
		                                   d_w,
		                                   d_h,
		                                   d_offset,
		                                   d_stride);

		enqueue_kernel(ckPathTraceKernel, d_w, d_h);
	}
Example #2
	void path_trace(RenderTile& rtile, int sample)
	{
		scoped_timer timer(&rtile.buffers->render_time);

		/* Cast arguments to cl types. */
		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
		cl_int d_x = rtile.x;
		cl_int d_y = rtile.y;
		cl_int d_w = rtile.w;
		cl_int d_h = rtile.h;
		cl_int d_offset = rtile.offset;
		cl_int d_stride = rtile.stride;

		/* Sample arguments. */
		cl_int d_sample = sample;

		cl_kernel ckPathTraceKernel = path_trace_program(ustring("path_trace"));

		cl_uint start_arg_index =
			kernel_set_args(ckPathTraceKernel,
			                0,
			                d_data,
			                d_buffer);

		set_kernel_arg_buffers(ckPathTraceKernel, &start_arg_index);

		start_arg_index += kernel_set_args(ckPathTraceKernel,
		                                   start_arg_index,
		                                   d_sample,
		                                   d_x,
		                                   d_y,
		                                   d_w,
		                                   d_h,
		                                   d_offset,
		                                   d_stride);

		enqueue_kernel(ckPathTraceKernel, d_w, d_h);
	}
Example #3
	void path_trace(DeviceTask *task,
	                SplitRenderTile& rtile,
	                int2 max_render_feasible_tile_size)
	{
		/* cast arguments to cl types */
		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
		cl_int d_x = rtile.x;
		cl_int d_y = rtile.y;
		cl_int d_w = rtile.w;
		cl_int d_h = rtile.h;
		cl_int d_offset = rtile.offset;
		cl_int d_stride = rtile.stride;

		/* Make sure the configured render-feasible tile size is a multiple of
		 * the local work size dimensions.
		 */
		assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
		assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);

		size_t global_size[2];
		size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
		                        SPLIT_KERNEL_LOCAL_SIZE_Y};

		/* Set the range of samples to be processed for every ray in
		 * path-regeneration logic.
		 */
		cl_int start_sample = rtile.start_sample;
		cl_int end_sample = rtile.start_sample + rtile.num_samples;
		cl_int num_samples = rtile.num_samples;

#ifdef __WORK_STEALING__
		global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
		unsigned int num_parallel_samples = 1;
#else
		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
		unsigned int num_threads = max_render_feasible_tile_size.x *
		                           max_render_feasible_tile_size.y;
		unsigned int num_tile_columns_possible = num_threads / global_size[1];
		/* Estimate the number of samples that can be processed in parallel. */
		unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
		                                        rtile.num_samples);
		/* Wavefront size in AMD is 64.
		 * TODO(sergey): What about other platforms?
		 */
		if(num_parallel_samples >= 64) {
			/* TODO(sergey): Could use generic round-up here. */
			num_parallel_samples = (num_parallel_samples / 64) * 64;
		}
		assert(num_parallel_samples != 0);

		global_size[0] = d_w * num_parallel_samples;
#endif  /* __WORK_STEALING__ */

		assert(global_size[0] * global_size[1] <=
		       max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);

		/* Allocate all required global memory once. */
		if(first_tile) {
			size_t num_global_elements = max_render_feasible_tile_size.x *
			                             max_render_feasible_tile_size.y;
			/* TODO(sergey): This will actually over-allocate if a
			 * particular kernel does not support multiclosure.
			 */
			size_t shaderdata_size = get_shader_data_size(current_max_closure);

#ifdef __WORK_STEALING__
			/* Calculate the maximum number of work groups. */
			size_t max_global_size[2];
			size_t tile_x = max_render_feasible_tile_size.x;
			size_t tile_y = max_render_feasible_tile_size.y;
			max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
			max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
			max_work_groups = (max_global_size[0] * max_global_size[1]) /
			                  (local_size[0] * local_size[1]);
			/* Allocate work_pool_wgs memory. */
			work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
#endif  /* __WORK_STEALING__ */

			/* Allocate queue_index memory only once. */
			Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
			use_queues_flag = mem_alloc(sizeof(char));
			kgbuffer = mem_alloc(get_KernelGlobals_size());

			/* Create global buffers for ShaderData. */
			sd = mem_alloc(num_global_elements * shaderdata_size);
			sd_DL_shadow = mem_alloc(num_global_elements * 2 * shaderdata_size);

			/* Create global memory buffers that are shared among the kernels. */
			rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
			throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
			L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
			PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
			Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
			PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
			Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
			AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
			AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
			AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
			BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
			ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
			LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
			Intersection_coop_shadow = mem_alloc(2 * num_global_elements * sizeof(Intersection));

#ifdef WITH_CYCLES_DEBUG
			debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
#endif

			ray_state = mem_alloc(num_global_elements * sizeof(char));

			hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
			assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");

			Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int) + sizeof(int)));
			work_array = mem_alloc(num_global_elements * sizeof(unsigned int));
			per_sample_output_buffers = mem_alloc(num_global_elements *
			                                      per_thread_output_buffer_size);
		}

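		/* Queue size equals the total number of work items in the launch. */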
		cl_int dQueue_size = global_size[0] * global_size[1];

		cl_uint start_arg_index =
			kernel_set_args(program_data_init(),
			                0,
			                kgbuffer,
			                sd_DL_shadow,
			                d_data,
			                per_sample_output_buffers,
			                d_rng_state,
			                rng_coop,
			                throughput_coop,
			                L_transparent_coop,
			                PathRadiance_coop,
			                Ray_coop,
			                PathState_coop,
			                Intersection_coop_shadow,
			                ray_state);

/* TODO(sergey): Avoid map lookup here. */
#define KERNEL_TEX(type, ttype, name) \
	set_kernel_arg_mem(program_data_init(), &start_arg_index, #name);
#include "kernel_textures.h"
#undef KERNEL_TEX

		start_arg_index +=
			kernel_set_args(program_data_init(),
			                start_arg_index,
			                start_sample,
			                d_x,
			                d_y,
			                d_w,
			                d_h,
			                d_offset,
			                d_stride,
			                rtile.rng_state_offset_x,
			                rtile.rng_state_offset_y,
			                rtile.buffer_rng_state_stride,
			                Queue_data,
			                Queue_index,
			                dQueue_size,
			                use_queues_flag,
			                work_array,
#ifdef __WORK_STEALING__
			                work_pool_wgs,
			                num_samples,
#endif
#ifdef WITH_CYCLES_DEBUG
			                debugdata_coop,
#endif
			                num_parallel_samples);

		kernel_set_args(program_scene_intersect(),
		                0,
		                kgbuffer,
		                d_data,
		                rng_coop,
		                Ray_coop,
		                PathState_coop,
		                Intersection_coop,
		                ray_state,
		                d_w,
		                d_h,
		                Queue_data,
		                Queue_index,
		                dQueue_size,
		                use_queues_flag,
#ifdef WITH_CYCLES_DEBUG
		                debugdata_coop,
#endif
		                num_parallel_samples);

		kernel_set_args(program_lamp_emission(),
		                0,
		                kgbuffer,
		                d_data,
		                throughput_coop,
		                PathRadiance_coop,
		                Ray_coop,
		                PathState_coop,
		                Intersection_coop,
		                ray_state,
		                d_w,
		                d_h,
		                Queue_data,
		                Queue_index,
		                dQueue_size,
		                use_queues_flag,
		                num_parallel_samples);

		kernel_set_args(program_queue_enqueue(),
		                0,
		                Queue_data,
		                Queue_index,
		                ray_state,
		                dQueue_size);

		kernel_set_args(program_background_buffer_update(),
		                 0,
		                 kgbuffer,
		                 d_data,
		                 per_sample_output_buffers,
		                 d_rng_state,
		                 rng_coop,
		                 throughput_coop,
		                 PathRadiance_coop,
		                 Ray_coop,
		                 PathState_coop,
		                 L_transparent_coop,
		                 ray_state,
		                 d_w,
		                 d_h,
		                 d_x,
		                 d_y,
		                 d_stride,
		                 rtile.rng_state_offset_x,
		                 rtile.rng_state_offset_y,
		                 rtile.buffer_rng_state_stride,
		                 work_array,
		                 Queue_data,
		                 Queue_index,
		                 dQueue_size,
		                 end_sample,
		                 start_sample,
#ifdef __WORK_STEALING__
		                 work_pool_wgs,
		                 num_samples,
#endif
#ifdef WITH_CYCLES_DEBUG
		                 debugdata_coop,
#endif
		                 num_parallel_samples);

		kernel_set_args(program_shader_eval(),
		                0,
		                kgbuffer,
		                d_data,
		                sd,
		                rng_coop,
		                Ray_coop,
		                PathState_coop,
		                Intersection_coop,
		                ray_state,
		                Queue_data,
		                Queue_index,
		                dQueue_size);

		kernel_set_args(program_holdout_emission_blurring_pathtermination_ao(),
		                0,
		                kgbuffer,
		                d_data,
		                sd,
		                per_sample_output_buffers,
		                rng_coop,
		                throughput_coop,
		                L_transparent_coop,
		                PathRadiance_coop,
		                PathState_coop,
		                Intersection_coop,
		                AOAlpha_coop,
		                AOBSDF_coop,
		                AOLightRay_coop,
		                d_w,
		                d_h,
		                d_x,
		                d_y,
		                d_stride,
		                ray_state,
		                work_array,
		                Queue_data,
		                Queue_index,
		                dQueue_size,
#ifdef __WORK_STEALING__
		                start_sample,
#endif
		                num_parallel_samples);

		kernel_set_args(program_direct_lighting(),
		                0,
		                kgbuffer,
		                d_data,
		                sd,
		                rng_coop,
		                PathState_coop,
		                ISLamp_coop,
		                LightRay_coop,
		                BSDFEval_coop,
		                ray_state,
		                Queue_data,
		                Queue_index,
		                dQueue_size);

		kernel_set_args(program_shadow_blocked(),
		                0,
		                kgbuffer,
		                d_data,
		                PathState_coop,
		                LightRay_coop,
		                AOLightRay_coop,
		                ray_state,
		                Queue_data,
		                Queue_index,
		                dQueue_size);

		kernel_set_args(program_next_iteration_setup(),
		                0,
		                kgbuffer,
		                d_data,
		                sd,
		                rng_coop,
		                throughput_coop,
		                PathRadiance_coop,
		                Ray_coop,
		                PathState_coop,
		                LightRay_coop,
		                ISLamp_coop,
		                BSDFEval_coop,
		                AOLightRay_coop,
		                AOBSDF_coop,
		                AOAlpha_coop,
		                ray_state,
		                Queue_data,
		                Queue_index,
		                dQueue_size,
		                use_queues_flag);

		kernel_set_args(program_sum_all_radiance(),
		                0,
		                d_data,
		                d_buffer,
		                per_sample_output_buffers,
		                num_parallel_samples,
		                d_w,
		                d_h,
		                d_stride,
		                rtile.buffer_offset_x,
		                rtile.buffer_offset_y,
		                rtile.buffer_rng_state_stride,
		                start_sample);

		/* Macro for enqueuing split kernels. */
#define GLUE(a, b) a ## b
#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
		{ \
			ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \
			                               GLUE(program_, \
			                                    kernelName)(), \
			                               2, \
			                               NULL, \
			                               globalSize, \
			                               localSize, \
			                               0, \
			                               NULL, \
			                               NULL); \
			opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); \
			if(ciErr != CL_SUCCESS) { \
				string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", \
				                               clewErrorString(ciErr)); \
				opencl_error(message); \
				return; \
			} \
		} (void) 0

		/* Enqueue ckPathTraceKernel_data_init kernel. */
		ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size);
		bool activeRaysAvailable = true;

		/* Record the number of times host intervention has been made. */
		unsigned int numHostIntervention = 0;
		unsigned int numNextPathIterTimes = PathIteration_times;
		bool canceled = false;
		while(activeRaysAvailable) {
			/* Twice the global work size of other kernels for
			 * ckPathTraceKernel_shadow_blocked_direct_lighting. */
			size_t global_size_shadow_blocked[2];
			global_size_shadow_blocked[0] = global_size[0] * 2;
			global_size_shadow_blocked[1] = global_size[1];

			/* Do path iteration on the host: enqueue the path-iteration kernels. */
			for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) {
				ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
				ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
				ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
				ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
				ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
				ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
				ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size);
				ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);

				if(task->get_cancel()) {
					canceled = true;
					break;
				}
			}

			/* Read the ray state back into host memory to decide whether to exit
			 * the path-iteration loop on the host.
			 */
			ciErr = clEnqueueReadBuffer(cqCommandQueue,
			                            ray_state,
			                            CL_TRUE,
			                            0,
			                            global_size[0] * global_size[1] * sizeof(char),
			                            hostRayStateArray,
			                            0,
			                            NULL,
			                            NULL);
			assert(ciErr == CL_SUCCESS);

			activeRaysAvailable = false;

			for(int rayStateIter = 0;
			    rayStateIter < global_size[0] * global_size[1];
			    ++rayStateIter)
			{
				if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) {
					/* Not all rays are RAY_INACTIVE. */
					activeRaysAvailable = true;
					break;
				}
			}

			if(activeRaysAvailable) {
				numHostIntervention++;
				PathIteration_times = PATH_ITER_INC_FACTOR;
				/* Host intervention was done before all rays became RAY_INACTIVE,
				 * so do more initial iterations for the next tile.
				 */
				numNextPathIterTimes += PATH_ITER_INC_FACTOR;
			}

			if(task->get_cancel()) {
				canceled = true;
				break;
			}
		}

		/* Execute the sum_all_radiance kernel to accumulate the radiance computed
		 * in per_sample_output_buffers into the RenderTile's output buffer.
		 */
		if(!canceled) {
			size_t sum_all_radiance_local_size[2] = {16, 16};
			size_t sum_all_radiance_global_size[2];
			sum_all_radiance_global_size[0] =
				(((d_w - 1) / sum_all_radiance_local_size[0]) + 1) *
				sum_all_radiance_local_size[0];
			sum_all_radiance_global_size[1] =
				(((d_h - 1) / sum_all_radiance_local_size[1]) + 1) *
				sum_all_radiance_local_size[1];
			ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
			                     sum_all_radiance_global_size,
			                     sum_all_radiance_local_size);
		}

#undef ENQUEUE_SPLIT_KERNEL
#undef GLUE

		if(numHostIntervention == 0) {
			/* This means we are executing the kernel more times than required;
			 * avoid this for the next sample/tile.
			 */
			PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ?
			                      PATH_ITER_INC_FACTOR :
			                      numNextPathIterTimes - PATH_ITER_INC_FACTOR;
		}
		else {
			/* The number of path iterations done for this tile is used as the
			 * initial path-iteration count for the next tile.
			 */
			PathIteration_times = numNextPathIterTimes;
		}

		first_tile = false;
	}