void tonemap(DeviceTask& task)
{
    /* Cast arguments to cl types. */
    cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
    cl_mem d_rgba = CL_MEM_PTR(task.rgba);
    cl_mem d_buffer = CL_MEM_PTR(task.buffer);
    cl_int d_x = task.x;
    cl_int d_y = task.y;
    cl_int d_w = task.w;
    cl_int d_h = task.h;
    cl_int d_sample = task.sample;
    cl_int d_resolution = task.resolution;
    cl_int d_offset = task.offset;
    cl_int d_stride = task.stride;

    /* Set kernel arguments. */
    int narg = 0;
    ciErr = 0;

    ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_data), (void*)&d_data);
    ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_rgba), (void*)&d_rgba);
    ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_buffer), (void*)&d_buffer);

#define KERNEL_TEX(type, ttype, name) \
    ciErr |= set_kernel_arg_mem(ckFilmConvertKernel, &narg, #name);
#include "kernel_textures.h"

    ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample), (void*)&d_sample);
    ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_resolution), (void*)&d_resolution);
    ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_x), (void*)&d_x);
    ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_y), (void*)&d_y);
    ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_w), (void*)&d_w);
    ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_h), (void*)&d_h);
    ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_offset), (void*)&d_offset);
    ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_stride), (void*)&d_stride);

    opencl_assert(ciErr);

    /* Pick a square workgroup whose area fits the kernel's limit. */
    size_t workgroup_size;
    clGetKernelWorkGroupInfo(ckFilmConvertKernel, cdDevice,
                             CL_KERNEL_WORK_GROUP_SIZE,
                             sizeof(size_t), &workgroup_size, NULL);

    workgroup_size = (size_t)max(sqrt((double)workgroup_size), 1.0);

    size_t local_size[2] = {workgroup_size, workgroup_size};
    size_t global_size[2] = {global_size_round_up(local_size[0], d_w),
                             global_size_round_up(local_size[1], d_h)};

    /* Run kernel. */
    ciErr = clEnqueueNDRangeKernel(cqCommandQueue, ckFilmConvertKernel,
                                   2, NULL, global_size, local_size,
                                   0, NULL, NULL);
    opencl_assert(ciErr);
    opencl_assert(clFinish(cqCommandQueue));
}
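Both this function and path_trace() below pad the launch with global_size_round_up(), since the NDRange global size must be a whole multiple of the workgroup size; the surplus threads are presumably discarded by a bounds check inside the kernel. A minimal sketch of such a round-up helper:

static size_t global_size_round_up(int group_size, int global_size)
{
    /* Round global_size up to the next multiple of group_size. */
    int r = global_size % group_size;
    return global_size + ((r == 0) ? 0 : group_size - r);
}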
void path_trace(RenderTile& rtile, int sample)
{
    /* Cast arguments to cl types. */
    cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
    cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
    cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
    cl_int d_x = rtile.x;
    cl_int d_y = rtile.y;
    cl_int d_w = rtile.w;
    cl_int d_h = rtile.h;
    cl_int d_sample = sample;
    cl_int d_offset = rtile.offset;
    cl_int d_stride = rtile.stride;

    /* Set kernel arguments. */
    int narg = 0;
    ciErr = 0;

    ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_data), (void*)&d_data);
    ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_buffer), (void*)&d_buffer);
    ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_rng_state), (void*)&d_rng_state);

#define KERNEL_TEX(type, ttype, name) \
    ciErr |= set_kernel_arg_mem(ckPathTraceKernel, &narg, #name);
#include "kernel_textures.h"

    ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_sample), (void*)&d_sample);
    ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_x), (void*)&d_x);
    ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_y), (void*)&d_y);
    ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_w), (void*)&d_w);
    ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_h), (void*)&d_h);
    ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_offset), (void*)&d_offset);
    ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_stride), (void*)&d_stride);

    opencl_assert(ciErr);

    /* Pick a square workgroup whose area fits the kernel's limit. */
    size_t workgroup_size;
    clGetKernelWorkGroupInfo(ckPathTraceKernel, cdDevice,
                             CL_KERNEL_WORK_GROUP_SIZE,
                             sizeof(size_t), &workgroup_size, NULL);

    workgroup_size = (size_t)max(sqrt((double)workgroup_size), 1.0);

    size_t local_size[2] = {workgroup_size, workgroup_size};
    size_t global_size[2] = {global_size_round_up(local_size[0], d_w),
                             global_size_round_up(local_size[1], d_h)};

    /* Run kernel. */
    ciErr = clEnqueueNDRangeKernel(cqCommandQueue, ckPathTraceKernel,
                                   2, NULL, global_size, local_size,
                                   0, NULL, NULL);
    opencl_assert(ciErr);
    opencl_assert(clFinish(cqCommandQueue));
}
void mem_copy_to(device_memory& mem)
{
    /* This is blocking. */
    size_t size = mem.memory_size();

    ciErr = clEnqueueWriteBuffer(cqCommandQueue,
                                 CL_MEM_PTR(mem.device_pointer),
                                 CL_TRUE, 0, size,
                                 (void*)mem.data_pointer,
                                 0, NULL, NULL);
    opencl_assert(ciErr);
}
void mem_free(device_memory& mem)
{
    if(mem.device_pointer) {
        ciErr = clReleaseMemObject(CL_MEM_PTR(mem.device_pointer));
        mem.device_pointer = 0;
        opencl_assert(ciErr);
    }
}
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
{
    /* Blocking read of rows [y, y + h) back into the matching offset on the
     * host; host and device buffers share the same layout, so the same byte
     * offset applies to both. */
    size_t offset = elem*y*w;
    size_t size = elem*w*h;

    ciErr = clEnqueueReadBuffer(cqCommandQueue,
                                CL_MEM_PTR(mem.device_pointer),
                                CL_TRUE, offset, size,
                                (uchar*)mem.data_pointer + offset,
                                0, NULL, NULL);
    opencl_assert(ciErr);
}
void path_trace(RenderTile& rtile, int sample)
{
    /* Cast arguments to cl types. */
    cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
    cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
    cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
    cl_int d_x = rtile.x;
    cl_int d_y = rtile.y;
    cl_int d_w = rtile.w;
    cl_int d_h = rtile.h;
    cl_int d_offset = rtile.offset;
    cl_int d_stride = rtile.stride;

    /* Sample arguments. */
    cl_int d_sample = sample;

    cl_kernel ckPathTraceKernel = path_trace_program(ustring("path_trace"));

    cl_uint start_arg_index = kernel_set_args(ckPathTraceKernel,
                                              0,
                                              d_data,
                                              d_buffer,
                                              d_rng_state);

#define KERNEL_TEX(type, ttype, name) \
    set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
#include "kernel_textures.h"
#undef KERNEL_TEX

    start_arg_index += kernel_set_args(ckPathTraceKernel,
                                       start_arg_index,
                                       d_sample, d_x, d_y, d_w, d_h,
                                       d_offset, d_stride);

    enqueue_kernel(ckPathTraceKernel, d_w, d_h);
}
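Here kernel_set_args() replaces the long runs of clSetKernelArg() calls from the earlier listings: it binds each value at consecutive argument indices starting from the given one and returns how many arguments it set, so callers can chain from start_arg_index. A minimal C++11 sketch of such a helper (the shipped code may implement the same idea differently, e.g. with a fixed list of defaulted wrapper arguments):

/* Base case: no arguments left to set. */
cl_uint kernel_set_args(cl_kernel kernel, cl_uint start_arg_index)
{
    return 0;
}

template<typename T, typename... Rest>
cl_uint kernel_set_args(cl_kernel kernel,
                        cl_uint start_arg_index,
                        const T& first,
                        const Rest&... rest)
{
    /* Bind the first value at the current index, then recurse for the rest,
     * counting how many arguments were set in total. */
    opencl_assert(clSetKernelArg(kernel, start_arg_index, sizeof(first), &first));
    return 1 + kernel_set_args(kernel, start_arg_index + 1, rest...);
}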
cl_int set_kernel_arg_mem(cl_kernel kernel, int *narg, const char *name)
{
    cl_mem ptr;
    cl_int err = 0;

    if(mem_map.find(name) != mem_map.end()) {
        device_memory *mem = mem_map[name];
        ptr = CL_MEM_PTR(mem->device_pointer);
    }
    else {
        /* Work around NULL not working, even though the spec says otherwise. */
        ptr = CL_MEM_PTR(null_mem);
    }

    err |= clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr);
    opencl_assert(err);

    return err;
}
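This pairs with the KERNEL_TEX trick above: kernel_textures.h is just a list of KERNEL_TEX(type, ttype, name) entries, one per global texture buffer, so defining the macro to call set_kernel_arg_mem() and then including the header binds every texture by its stringized name. Illustratively, with two hypothetical entries (shown only to make the expansion concrete):

/* Hypothetical entries in kernel_textures.h: */
KERNEL_TEX(float4, texture_float4, __tri_verts)
KERNEL_TEX(uint, texture_uint, __object_flag)

/* ...which the include above expands into: */
set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, "__tri_verts");
set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, "__object_flag");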
void path_trace(RenderTile& rtile, int sample)
{
    scoped_timer timer(&rtile.buffers->render_time);

    /* Cast arguments to cl types. */
    cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
    cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
    cl_int d_x = rtile.x;
    cl_int d_y = rtile.y;
    cl_int d_w = rtile.w;
    cl_int d_h = rtile.h;
    cl_int d_offset = rtile.offset;
    cl_int d_stride = rtile.stride;

    /* Sample arguments. */
    cl_int d_sample = sample;

    cl_kernel ckPathTraceKernel = path_trace_program(ustring("path_trace"));

    cl_uint start_arg_index = kernel_set_args(ckPathTraceKernel,
                                              0,
                                              d_data,
                                              d_buffer);

    set_kernel_arg_buffers(ckPathTraceKernel, &start_arg_index);

    start_arg_index += kernel_set_args(ckPathTraceKernel,
                                       start_arg_index,
                                       d_sample, d_x, d_y, d_w, d_h,
                                       d_offset, d_stride);

    enqueue_kernel(ckPathTraceKernel, d_w, d_h);
}
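enqueue_kernel() likewise factors out the workgroup sizing and launch that tonemap() above still does inline. A sketch of what it plausibly contains, following that inline logic (the second-dimension special case is an assumption about quirky implementations):

void enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
{
    size_t workgroup_size, max_work_items[3];

    clGetKernelWorkGroupInfo(kernel, cdDevice,
                             CL_KERNEL_WORK_GROUP_SIZE,
                             sizeof(size_t), &workgroup_size, NULL);
    clGetDeviceInfo(cdDevice,
                    CL_DEVICE_MAX_WORK_ITEM_SIZES,
                    sizeof(size_t) * 3, max_work_items, NULL);

    /* Try to divide the workgroup evenly over 2 dimensions. */
    size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), (size_t)1);
    size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};

    /* Some implementations have a max size of 1 on the second dimension. */
    if(max_work_items[1] == 1) {
        local_size[0] = workgroup_size;
        local_size[1] = 1;
    }

    size_t global_size[2] = {global_size_round_up(local_size[0], w),
                             global_size_round_up(local_size[1], h)};

    /* Run kernel. */
    ciErr = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL,
                                   global_size, local_size, 0, NULL, NULL);
    opencl_assert(ciErr);
}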
/* Split kernel utility functions. */
size_t get_tex_size(const char *tex_name)
{
    cl_mem ptr;
    size_t ret_size = 0;

    MemMap::iterator i = mem_map.find(tex_name);
    if(i != mem_map.end()) {
        ptr = CL_MEM_PTR(i->second);
        ciErr = clGetMemObjectInfo(ptr,
                                   CL_MEM_SIZE,
                                   sizeof(ret_size),
                                   &ret_size,
                                   NULL);
        assert(ciErr == CL_SUCCESS);
    }
    return ret_size;
}
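This helper is presumably combined with the same KERNEL_TEX trick to total the device memory occupied by the scene's global buffers, which feeds the feasibility check used later in thread_run(). A sketch of that pattern:

size_t total_scene_size = 0;
/* Sum the device-side size of every global texture buffer. */
#define KERNEL_TEX(type, ttype, name) \
    total_scene_size += get_tex_size(#name);
#include "kernel_textures.h"
#undef KERNEL_TEX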
foreach(Allocation* allocation, allocations) {
    if(allocation->needs_copy_to_device) {
        /* Copy from host to device. */
        opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue,
                                                          CL_MEM_PTR(buffer->device_pointer),
                                                          CL_FALSE,
                                                          offset,
                                                          allocation->mem->memory_size(),
                                                          (void*)allocation->mem->data_pointer,
                                                          0, NULL, NULL));

        allocation->needs_copy_to_device = false;
    }

    offset += allocation->size;
}
~OpenCLDevice()
{
    if(null_mem)
        clReleaseMemObject(CL_MEM_PTR(null_mem));

    map<string, device_vector<uchar>*>::iterator mt;
    for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
        mem_free(*(mt->second));
        delete mt->second;
    }

    if(ckPathTraceKernel)
        clReleaseKernel(ckPathTraceKernel);
    if(ckFilmConvertKernel)
        clReleaseKernel(ckFilmConvertKernel);
    if(cpProgram)
        clReleaseProgram(cpProgram);
    if(cqCommandQueue)
        clReleaseCommandQueue(cqCommandQueue);
    if(cxContext)
        clReleaseContext(cxContext);
}
void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDeviceBase *device)
{
    bool need_realloc = false;

    /* Calculate total size and remove any freed allocations. */
    size_t total_size = 0;

    for(int i = allocations.size()-1; i >= 0; i--) {
        Allocation* allocation = allocations[i];

        /* Remove allocations that have been freed. */
        if(!allocation->mem || allocation->mem->memory_size() == 0) {
            allocation->device_buffer = NULL;
            allocation->size = 0;

            allocations.erase(allocations.begin()+i);

            need_realloc = true;
            continue;
        }

        /* Get actual size for allocation. */
        size_t alloc_size = align_up(allocation->mem->memory_size(), 16);

        if(allocation->size != alloc_size) {
            /* Allocation is either new or resized. */
            allocation->size = alloc_size;
            allocation->needs_copy_to_device = true;

            need_realloc = true;
        }

        total_size += alloc_size;
    }

    if(need_realloc) {
        cl_ulong max_buffer_size;
        clGetDeviceInfo(device->cdDevice,
                        CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                        sizeof(cl_ulong),
                        &max_buffer_size,
                        NULL);

        if(total_size > max_buffer_size) {
            device->set_error("Scene too complex to fit in available memory.");
            return;
        }

        device_memory *new_buffer = new device_memory;
        new_buffer->resize(total_size);
        device->mem_alloc(string_printf("buffer_%p", this).data(),
                          *new_buffer,
                          MEM_READ_ONLY);

        size_t offset = 0;

        foreach(Allocation* allocation, allocations) {
            if(allocation->needs_copy_to_device) {
                /* Copy from host to device. */
                opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue,
                                                                  CL_MEM_PTR(new_buffer->device_pointer),
                                                                  CL_FALSE,
                                                                  offset,
                                                                  allocation->mem->memory_size(),
                                                                  (void*)allocation->mem->data_pointer,
                                                                  0, NULL, NULL));

                allocation->needs_copy_to_device = false;
            }
            else {
                /* Fast copy from memory already on device. */
                opencl_device_assert(device, clEnqueueCopyBuffer(device->cqCommandQueue,
                                                                 CL_MEM_PTR(buffer->device_pointer),
                                                                 CL_MEM_PTR(new_buffer->device_pointer),
                                                                 allocation->desc.offset,
                                                                 offset,
                                                                 allocation->mem->memory_size(),
                                                                 0, NULL, NULL));
            }

            allocation->desc.offset = offset;
            offset += allocation->size;
        }

        device->mem_free(*buffer);
        delete buffer;

        buffer = new_buffer;
    }
    else {
        /* No reallocation needed; the remainder of the function (elided here)
         * just writes any dirty allocations in place, as in the foreach
         * snippet shown earlier. */
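align_up() above rounds every allocation to a 16-byte boundary so the offsets handed to kernels stay aligned. Assuming the alignment is a power of two, the usual bit-twiddling sketch:

static size_t align_up(size_t offset, size_t alignment)
{
    /* Round offset up to the next multiple of alignment.
     * Only valid when alignment is a power of two. */
    return (offset + alignment - 1) & ~(alignment - 1);
}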
void path_trace(DeviceTask *task,
                SplitRenderTile& rtile,
                int2 max_render_feasible_tile_size)
{
    /* Cast arguments to cl types. */
    cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
    cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
    cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
    cl_int d_x = rtile.x;
    cl_int d_y = rtile.y;
    cl_int d_w = rtile.w;
    cl_int d_h = rtile.h;
    cl_int d_offset = rtile.offset;
    cl_int d_stride = rtile.stride;

    /* Make sure the render-feasible tile size is a multiple of the local
     * work size dimensions. */
    assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
    assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);

    size_t global_size[2];
    size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
                            SPLIT_KERNEL_LOCAL_SIZE_Y};

    /* Set the range of samples to be processed for every ray in
     * path-regeneration logic. */
    cl_int start_sample = rtile.start_sample;
    cl_int end_sample = rtile.start_sample + rtile.num_samples;
    cl_int num_samples = rtile.num_samples;

#ifdef __WORK_STEALING__
    global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
    global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
    unsigned int num_parallel_samples = 1;
#else
    global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
    unsigned int num_threads = max_render_feasible_tile_size.x *
                               max_render_feasible_tile_size.y;
    unsigned int num_tile_columns_possible = num_threads / global_size[1];
    /* Estimate number of samples that can be processed in parallel. */
    unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
                                            rtile.num_samples);
    /* Wavefront size in AMD is 64.
     * TODO(sergey): What about other platforms? */
    if(num_parallel_samples >= 64) {
        /* TODO(sergey): Could use generic round-up here. */
        num_parallel_samples = (num_parallel_samples / 64) * 64;
    }
    assert(num_parallel_samples != 0);

    global_size[0] = d_w * num_parallel_samples;
#endif  /* __WORK_STEALING__ */

    assert(global_size[0] * global_size[1] <=
           max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);

    /* Allocate all required global memory once. */
    if(first_tile) {
        size_t num_global_elements = max_render_feasible_tile_size.x *
                                     max_render_feasible_tile_size.y;
        /* TODO(sergey): This will actually over-allocate if
         * particular kernel does not support multiclosure. */
        size_t shaderdata_size = get_shader_data_size(current_max_closure);

#ifdef __WORK_STEALING__
        /* Calculate max groups. */
        size_t max_global_size[2];
        size_t tile_x = max_render_feasible_tile_size.x;
        size_t tile_y = max_render_feasible_tile_size.y;
        max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
        max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
        max_work_groups = (max_global_size[0] * max_global_size[1]) /
                          (local_size[0] * local_size[1]);
        /* Allocate work_pool_wgs memory. */
        work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
#endif  /* __WORK_STEALING__ */

        /* Allocate queue_index memory only once. */
        Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
        use_queues_flag = mem_alloc(sizeof(char));
        kgbuffer = mem_alloc(get_KernelGlobals_size());

        /* Create global buffers for ShaderData. */
        sd = mem_alloc(num_global_elements * shaderdata_size);
        sd_DL_shadow = mem_alloc(num_global_elements * 2 * shaderdata_size);

        /* Creation of global memory buffers which are shared among
         * the kernels. */
        rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
        throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
        L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
        PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
        Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
        PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
        Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
        AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
        AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
        AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
        BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
        ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
        LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
        Intersection_coop_shadow = mem_alloc(2 * num_global_elements * sizeof(Intersection));

#ifdef WITH_CYCLES_DEBUG
        debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
#endif

        ray_state = mem_alloc(num_global_elements * sizeof(char));

        hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
        assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");

        Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int) + sizeof(int)));
        work_array = mem_alloc(num_global_elements * sizeof(unsigned int));

        per_sample_output_buffers = mem_alloc(num_global_elements *
                                              per_thread_output_buffer_size);
    }

    cl_int dQueue_size = global_size[0] * global_size[1];

    cl_uint start_arg_index =
        kernel_set_args(program_data_init(), 0,
                        kgbuffer, sd_DL_shadow, d_data,
                        per_sample_output_buffers,
                        d_rng_state, rng_coop,
                        throughput_coop, L_transparent_coop,
                        PathRadiance_coop, Ray_coop, PathState_coop,
                        Intersection_coop_shadow, ray_state);

    /* TODO(sergey): Avoid map lookup here. */
#define KERNEL_TEX(type, ttype, name) \
    set_kernel_arg_mem(program_data_init(), &start_arg_index, #name);
#include "kernel_textures.h"
#undef KERNEL_TEX

    start_arg_index +=
        kernel_set_args(program_data_init(),
                        start_arg_index,
                        start_sample, d_x, d_y, d_w, d_h, d_offset, d_stride,
                        rtile.rng_state_offset_x,
                        rtile.rng_state_offset_y,
                        rtile.buffer_rng_state_stride,
                        Queue_data, Queue_index, dQueue_size,
                        use_queues_flag, work_array,
#ifdef __WORK_STEALING__
                        work_pool_wgs, num_samples,
#endif
#ifdef WITH_CYCLES_DEBUG
                        debugdata_coop,
#endif
                        num_parallel_samples);

    kernel_set_args(program_scene_intersect(), 0,
                    kgbuffer, d_data, rng_coop, Ray_coop, PathState_coop,
                    Intersection_coop, ray_state, d_w, d_h,
                    Queue_data, Queue_index, dQueue_size, use_queues_flag,
#ifdef WITH_CYCLES_DEBUG
                    debugdata_coop,
#endif
                    num_parallel_samples);

    kernel_set_args(program_lamp_emission(), 0,
                    kgbuffer, d_data, throughput_coop, PathRadiance_coop,
                    Ray_coop, PathState_coop, Intersection_coop, ray_state,
                    d_w, d_h, Queue_data, Queue_index, dQueue_size,
                    use_queues_flag, num_parallel_samples);

    kernel_set_args(program_queue_enqueue(), 0,
                    Queue_data, Queue_index, ray_state, dQueue_size);

    kernel_set_args(program_background_buffer_update(), 0,
                    kgbuffer, d_data, per_sample_output_buffers, d_rng_state,
                    rng_coop, throughput_coop, PathRadiance_coop, Ray_coop,
                    PathState_coop, L_transparent_coop, ray_state,
                    d_w, d_h, d_x, d_y, d_stride,
                    rtile.rng_state_offset_x,
                    rtile.rng_state_offset_y,
                    rtile.buffer_rng_state_stride,
                    work_array, Queue_data, Queue_index, dQueue_size,
                    end_sample, start_sample,
#ifdef __WORK_STEALING__
                    work_pool_wgs, num_samples,
#endif
#ifdef WITH_CYCLES_DEBUG
                    debugdata_coop,
#endif
                    num_parallel_samples);

    kernel_set_args(program_shader_eval(), 0,
                    kgbuffer, d_data, sd, rng_coop, Ray_coop, PathState_coop,
                    Intersection_coop, ray_state,
                    Queue_data, Queue_index, dQueue_size);

    kernel_set_args(program_holdout_emission_blurring_pathtermination_ao(), 0,
                    kgbuffer, d_data, sd, per_sample_output_buffers, rng_coop,
                    throughput_coop, L_transparent_coop, PathRadiance_coop,
                    PathState_coop, Intersection_coop,
                    AOAlpha_coop, AOBSDF_coop, AOLightRay_coop,
                    d_w, d_h, d_x, d_y, d_stride,
                    ray_state, work_array,
                    Queue_data, Queue_index, dQueue_size,
#ifdef __WORK_STEALING__
                    start_sample,
#endif
                    num_parallel_samples);

    kernel_set_args(program_direct_lighting(), 0,
                    kgbuffer, d_data, sd, rng_coop, PathState_coop,
                    ISLamp_coop, LightRay_coop, BSDFEval_coop, ray_state,
                    Queue_data, Queue_index, dQueue_size);

    kernel_set_args(program_shadow_blocked(), 0,
                    kgbuffer, d_data, PathState_coop, LightRay_coop,
                    AOLightRay_coop, ray_state,
                    Queue_data, Queue_index, dQueue_size);

    kernel_set_args(program_next_iteration_setup(), 0,
                    kgbuffer, d_data, sd, rng_coop, throughput_coop,
                    PathRadiance_coop, Ray_coop, PathState_coop,
                    LightRay_coop, ISLamp_coop, BSDFEval_coop,
                    AOLightRay_coop, AOBSDF_coop, AOAlpha_coop,
                    ray_state, Queue_data, Queue_index, dQueue_size,
                    use_queues_flag);

    kernel_set_args(program_sum_all_radiance(), 0,
                    d_data, d_buffer, per_sample_output_buffers,
                    num_parallel_samples, d_w, d_h, d_stride,
                    rtile.buffer_offset_x,
                    rtile.buffer_offset_y,
                    rtile.buffer_rng_state_stride,
                    start_sample);

    /* Macro for enqueuing split kernels. */
#define GLUE(a, b) a ## b
#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
    { \
        ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \
                                       GLUE(program_, kernelName)(), \
                                       2, \
                                       NULL, \
                                       globalSize, \
                                       localSize, \
                                       0, \
                                       NULL, \
                                       NULL); \
        opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); \
        if(ciErr != CL_SUCCESS) { \
            string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", \
                                           clewErrorString(ciErr)); \
            opencl_error(message); \
            return; \
        } \
    } (void) 0

    /* Enqueue ckPathTraceKernel_data_init kernel. */
    ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size);
    bool activeRaysAvailable = true;

    /* Record the number of times host intervention has been made. */
    unsigned int numHostIntervention = 0;
    unsigned int numNextPathIterTimes = PathIteration_times;
    bool canceled = false;
    while(activeRaysAvailable) {
        /* Twice the global work size of other kernels for
         * ckPathTraceKernel_shadow_blocked_direct_lighting. */
        size_t global_size_shadow_blocked[2];
        global_size_shadow_blocked[0] = global_size[0] * 2;
        global_size_shadow_blocked[1] = global_size[1];

        /* Do path iteration on the host [enqueue path-iteration kernels]. */
        for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) {
            ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
            ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
            ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
            ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
            ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
            ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao,
                                 global_size, local_size);
            ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
            ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size);
            ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);

            if(task->get_cancel()) {
                canceled = true;
                break;
            }
        }

        /* Read ray state back into host memory to decide if we should exit
         * path iteration on the host. */
        ciErr = clEnqueueReadBuffer(cqCommandQueue,
                                    ray_state,
                                    CL_TRUE,
                                    0,
                                    global_size[0] * global_size[1] * sizeof(char),
                                    hostRayStateArray,
                                    0,
                                    NULL,
                                    NULL);
        assert(ciErr == CL_SUCCESS);

        activeRaysAvailable = false;

        for(int rayStateIter = 0;
            rayStateIter < global_size[0] * global_size[1];
            ++rayStateIter)
        {
            if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) {
                /* Not all rays are RAY_INACTIVE. */
                activeRaysAvailable = true;
                break;
            }
        }

        if(activeRaysAvailable) {
            numHostIntervention++;
            PathIteration_times = PATH_ITER_INC_FACTOR;
            /* Host intervention done before all rays became RAY_INACTIVE;
             * do more initial iterations for the next tile. */
            numNextPathIterTimes += PATH_ITER_INC_FACTOR;
        }

        if(task->get_cancel()) {
            canceled = true;
            break;
        }
    }

    /* Execute SumALLRadiance kernel to accumulate radiance calculated in
     * per_sample_output_buffers into RenderTile's output buffer. */
    if(!canceled) {
        size_t sum_all_radiance_local_size[2] = {16, 16};
        size_t sum_all_radiance_global_size[2];
        sum_all_radiance_global_size[0] =
            (((d_w - 1) / sum_all_radiance_local_size[0]) + 1) *
            sum_all_radiance_local_size[0];
        sum_all_radiance_global_size[1] =
            (((d_h - 1) / sum_all_radiance_local_size[1]) + 1) *
            sum_all_radiance_local_size[1];
        ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
                             sum_all_radiance_global_size,
                             sum_all_radiance_local_size);
    }

#undef ENQUEUE_SPLIT_KERNEL
#undef GLUE

    if(numHostIntervention == 0) {
        /* This means we are executing the kernel more times than required;
         * avoid this for the next sample/tile. */
        PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ?
                              PATH_ITER_INC_FACTOR :
                              numNextPathIterTimes - PATH_ITER_INC_FACTOR;
    }
    else {
        /* The number of path iterations done for this tile becomes the
         * initial path-iteration count for the next tile. */
        PathIteration_times = numNextPathIterTimes;
    }

    first_tile = false;
}
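To make the adaptive batching concrete, assume PATH_ITER_INC_FACTOR is 8 (the actual constant is not shown in this listing) and PathIteration_times starts at 8. If a tile needs two host interventions before all rays go RAY_INACTIVE, numNextPathIterTimes grows 8 → 16 → 24, so the next tile runs 24 path iterations before its first ray-state readback. If a tile then completes with no interventions, the count backs off by one increment to 16, clamped so it never drops below PATH_ITER_INC_FACTOR. The point is to amortize the blocking clEnqueueReadBuffer() of ray_state over as many enqueued iterations as the workload actually needs.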
void thread_run(DeviceTask *task)
{
    if(task->type == DeviceTask::FILM_CONVERT) {
        film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
    }
    else if(task->type == DeviceTask::SHADER) {
        shader(*task);
    }
    else if(task->type == DeviceTask::PATH_TRACE) {
        RenderTile tile;
        bool initialize_data_and_check_render_feasibility = false;
        bool need_to_split_tiles_further = false;
        int2 max_render_feasible_tile_size;
        size_t feasible_global_work_size;
        const int2 tile_size = task->requested_tile_size;

        /* Keep rendering tiles until done. */
        while(task->acquire_tile(this, tile)) {
            if(!initialize_data_and_check_render_feasibility) {
                /* Initialize data. */
                /* Calculate per_thread_output_buffer_size. */
                size_t output_buffer_size = 0;
                ciErr = clGetMemObjectInfo((cl_mem)tile.buffer,
                                           CL_MEM_SIZE,
                                           sizeof(output_buffer_size),
                                           &output_buffer_size,
                                           NULL);
                assert(ciErr == CL_SUCCESS && "Can't get tile.buffer mem object info");
                /* This value is different when running on AMD and NV. */
                if(background) {
                    /* In offline render the number of buffer elements
                     * associated with tile.buffer is the current tile size. */
                    per_thread_output_buffer_size =
                        output_buffer_size / (tile.w * tile.h);
                }
                else {
                    /* In interactive rendering, unlike offline render, the
                     * number of buffer elements associated with tile.buffer
                     * is the entire viewport size. */
                    per_thread_output_buffer_size =
                        output_buffer_size / (tile.buffers->params.width *
                                              tile.buffers->params.height);
                }

                /* Check render feasibility. */
                feasible_global_work_size = get_feasible_global_work_size(
                    tile_size,
                    CL_MEM_PTR(const_mem_map["__data"]->device_pointer));
                max_render_feasible_tile_size =
                    get_max_render_feasible_tile_size(feasible_global_work_size);
                need_to_split_tiles_further =
                    need_to_split_tile(tile_size.x,
                                       tile_size.y,
                                       max_render_feasible_tile_size);

                initialize_data_and_check_render_feasibility = true;
            }

            if(need_to_split_tiles_further) {
                int2 split_tile_size =
                    get_split_tile_size(tile, max_render_feasible_tile_size);
                vector<SplitRenderTile> to_path_trace_render_tiles =
                    split_tiles(tile, split_tile_size);

                /* Print message to console. */
                if(background && (to_path_trace_render_tiles.size() > 1)) {
                    fprintf(stderr, "Message: Tiles need to be split "
                            "further inside path trace (due to insufficient "
                            "device global memory for the split kernel to "
                            "function).\n"
                            "The current tile of dimensions %dx%d is split "
                            "into tiles of dimension %dx%d for render.\n",
                            tile.w, tile.h,
                            split_tile_size.x, split_tile_size.y);
                }

                /* Process all split tiles. */
                for(int tile_iter = 0;
                    tile_iter < to_path_trace_render_tiles.size();
                    ++tile_iter)
                {
                    path_trace(task,
                               to_path_trace_render_tiles[tile_iter],
                               max_render_feasible_tile_size);
                }
            }
            else {
                /* No splitting required; process the entire tile at once. */
                /* Render-feasible tile size is the user-set tile size itself. */
                max_render_feasible_tile_size.x =
                    (((tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
                    SPLIT_KERNEL_LOCAL_SIZE_X;
                max_render_feasible_tile_size.y =
                    (((tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
                    SPLIT_KERNEL_LOCAL_SIZE_Y;
                /* buffer_rng_state_stride is the tile stride itself. */
                SplitRenderTile split_tile(tile);
                split_tile.buffer_rng_state_stride = tile.stride;
                path_trace(task, split_tile, max_render_feasible_tile_size);
            }

            tile.sample = tile.start_sample + tile.num_samples;

            /* Complete kernel execution before releasing the tile. */
            /* This helps in multi-device render: the device that reaches the
             * critical-section function release_tile waits (stalling other
             * devices from entering release_tile) for all kernels to complete.
             * If device1 (a slow-render device) reached release_tile first
             * while still busy, it would stall device2 (a fast-render device)
             * from proceeding to render the next tile. */
            clFinish(cqCommandQueue);

            task->release_tile(tile);
        }
    }
}
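need_to_split_tile() presumably just compares the requested tile, rounded up to whole workgroups, against the maximum feasible tile computed from available device memory. A sketch under that assumption:

bool need_to_split_tile(unsigned int d_w,
                        unsigned int d_h,
                        int2 max_render_feasible_tile_size)
{
    /* Round the requested tile up to whole workgroups, mirroring how the
     * split kernel computes its global size. */
    size_t global_size_estimate[2];
    global_size_estimate[0] =
        (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * SPLIT_KERNEL_LOCAL_SIZE_X;
    global_size_estimate[1] =
        (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * SPLIT_KERNEL_LOCAL_SIZE_Y;

    /* Split when either padded dimension exceeds what fits in memory. */
    return global_size_estimate[0] > (size_t)max_render_feasible_tile_size.x ||
           global_size_estimate[1] > (size_t)max_render_feasible_tile_size.y;
}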