void GPUSort::Sort(CLBuffer &data) { CLEventList wait; CLEvent ev; local_sort_kernel_->SetIntArg(0, length_); local_sort_kernel_->SetBufferArg(1, &data); local_sort_kernel_->SetLocalBufferArg(2, local_sort_kernel_->work_group_size() * sizeof(cl_int)); local_sort_kernel_->Run1D(length_, NULL, &ev); CLBuffer *in = &data; CLBuffer *out = temp_buffer_.get(); for (int len = local_sort_kernel_->work_group_size(); len < length_; len *= 2) { wait.Clear(); wait.Add(ev); merge_sort_pass_kernel_->SetIntArg(0, length_); merge_sort_pass_kernel_->SetIntArg(1, len); merge_sort_pass_kernel_->SetBufferArg(2, in); merge_sort_pass_kernel_->SetBufferArg(3, out); merge_sort_pass_kernel_->Run1D(length_, &wait, &ev); swap(in, out); } if (in != &data) { wait.Clear(); wait.Add(ev); data.CopyFrom(*in, &wait, &ev); } ev.WaitFor(); }
void GPURayTracer::RenderFrame(const Camera &camera) { PROFILE_TIMER_START("frame"); frame_stopwatch_.Restart(); CLEvent ev; PROFILE_TIMER_START("GPU"); PROFILE_TIMER_START("GPU init tracing"); /*__kernel void InitTracingFrame(__global TracingState *tracing_states, float16 view_proj_inv, float lod_voxel_size)*/ init_frame_kernel_->SetBufferArg(0, tracing_states_.get()); init_frame_kernel_->SetFloat16Arg(1, camera.ViewProjectionMatrix().Inverse()); init_frame_kernel_->SetFloatArg(2, lod_voxel_size_); init_frame_kernel_->Run2D(frame_width_, frame_height_, NULL, &ev); ev.WaitFor(); PROFILE_TIMER_COMMIT("GPU init tracing"); PROFILE_TIMER_STOP(); cache_manager_->StartRequestSession(); int iterations; for (iterations = 0;; ++iterations) { PROFILE_TIMER_START("GPU"); PROFILE_TIMER_START("GPU tracing passes"); /*__kernel void RaytracingPass(__global int *faults_and_hits, // out __global TracingState *tracing_states, // in-out __global uint *node_links, __global uint *far_pointers, int root_node_index)*/ raytracing_pass_kernel_->SetBufferArg(0, faults_and_hits_.get()); raytracing_pass_kernel_->SetBufferArg(1, tracing_states_.get()); raytracing_pass_kernel_->SetBufferArg(2, cache_manager_->data()->ChannelByIndex(0)->cl_buffer()); raytracing_pass_kernel_->SetBufferArg(3, cache_manager_->data()->far_pointers_buffer()); raytracing_pass_kernel_->SetIntArg(4, cache_manager_->root_node_index()); const CLBuffer *b = NULL; if (cache_manager_->data()->ChannelByName("normals")) b = cache_manager_->data()->ChannelByName("normals")->cl_buffer(); raytracing_pass_kernel_->SetBufferArg(5, b); b = NULL; if (cache_manager_->data()->ChannelByName("colors")) b = cache_manager_->data()->ChannelByName("colors")->cl_buffer(); raytracing_pass_kernel_->SetBufferArg(6, b); raytracing_pass_kernel_->Run2D(frame_width_, frame_height_, NULL, &ev); ev.WaitFor(); PROFILE_TIMER_STOP(); PROFILE_TIMER_STOP(); // always give it at least one iteration of loading if (iterations > 0 && frame_stopwatch_.TimeSinceRestart() > frame_time_limit_) break; feedback_extractor_->Process(*faults_and_hits_, cache_manager_->data()->nodes_in_block()); cache_manager_->StartRequestTransaction(); PROFILE_TIMER_START("CPU mark as used"); const vector<int> &hits = feedback_extractor_->hit_blocks(); for (int i = 0; i < (int)hits.size(); ++i) { cache_manager_->MarkBlockAsUsed(hits[i]); } const vector<int> &dup_hits = feedback_extractor_->duplicate_hit_blocks(); for (int i = 0; i < (int)dup_hits.size(); ++i) { cache_manager_->MarkParentAsUsed(dup_hits[i]); } PROFILE_TIMER_STOP(); PROFILE_VALUE_COMMIT_SET("hits per pass", hits.size()); PROFILE_VALUE_COMMIT_SET("hits dup per pass", dup_hits.size()); PROFILE_TIMER_START("CPU request blocks"); const vector<int> &faults = feedback_extractor_->fault_blocks(); for (int i = 0; i < (int)faults.size(); ++i) { if (cache_manager_->TransactionFilledCache()) break; cache_manager_->RequestBlock(faults[i]); } PROFILE_TIMER_STOP(); PROFILE_VALUE_ADD("faults", faults.size()); cache_manager_->EndRequestTransaction(); if (faults.empty() || cache_manager_->SessionFilledCache()) break; } PROFILE_TIMER_COMMIT("CPU request blocks"); PROFILE_TIMER_COMMIT("CPU mark as used"); PROFILE_TIMER_COMMIT("GPU tracing passes"); PROFILE_VALUE_COMMIT_SET("tracing iterations", iterations); PROFILE_VALUE_COMMIT_SET("cache too small", cache_manager_->SessionFilledCache() ? 1 : 0); PROFILE_VALUE_COMMIT("faults"); PROFILE_VALUE_COMMIT("uploaded blocks"); PROFILE_VALUE_COMMIT("updated far pointers"); PROFILE_VALUE_COMMIT("updated pointers"); cache_manager_->EndRequestSession(); PROFILE_TIMER_START("GPU"); PROFILE_TIMER_START("GPU finish tracing"); /*__kernel void FinishTracingFrame(__global uchar4 *result_colors, __global TracingState *tracing_states, __constant float4 background_color)*/ finish_frame_kernel_->SetBufferArg(0, out_image_.get()); finish_frame_kernel_->SetBufferArg(1, tracing_states_.get()); finish_frame_kernel_->SetFloat4Arg(2, fvec3(0, 216/255., 1), 1); const CLBuffer *b = NULL; if (cache_manager_->data()->ChannelByName("normals")) b = cache_manager_->data()->ChannelByName("normals")->cl_buffer(); finish_frame_kernel_->SetBufferArg(3, b); b = NULL; if (cache_manager_->data()->ChannelByName("colors")) b = cache_manager_->data()->ChannelByName("colors")->cl_buffer(); finish_frame_kernel_->SetBufferArg(4, b); finish_frame_kernel_->Run2D(frame_width_, frame_height_, NULL, &ev); ev.WaitFor(); PROFILE_TIMER_COMMIT("GPU finish tracing"); PROFILE_TIMER_STOP(); PROFILE_TIMER_COMMIT("frame"); // this part kinda breaks encapsulation // the excuse for this is that it's just profiling code, not code that "matters" :) PROFILE_TIMER_COMMIT("GPU sort"); PROFILE_TIMER_COMMIT("GPU scan"); PROFILE_TIMER_COMMIT("GPU/BUS compr/size"); PROFILE_TIMER_COMMIT("BUS read faults"); PROFILE_TIMER_COMMIT("BUS read hits"); PROFILE_TIMER_COMMIT("HDD? load block"); PROFILE_TIMER_COMMIT("CPU process links"); PROFILE_TIMER_COMMIT("BUS upload block"); PROFILE_TIMER_COMMIT("BUS upload far ptr"); PROFILE_TIMER_COMMIT("BUS upload ptr"); PROFILE_TIMER_COMMIT("BUS unload pointer"); PROFILE_TIMER_COMMIT("GPU"); PROFILE_TIMER_COMMIT("BUS"); PROFILE_TIMER_COMMIT("HDD"); }