// Runs the GPU sort pipeline over `data`; the final result ends up in `data`.
//
// Stages, each chained on the event produced by the previous one:
//   1. A single launch of local_sort_kernel_ over length_ work items on
//      `data`, with a local scratch buffer of work_group_size() cl_int slots.
//      (Judging by the kernel name, this presumably sorts each work-group-sized
//      chunk independently -- confirm against the kernel source.)
//   2. Repeated launches of merge_sort_pass_kernel_. The run length starts at
//      the local kernel's work-group size and doubles after every pass until
//      it is no longer smaller than length_. Passes ping-pong between `data`
//      and temp_buffer_.
//   3. If the last pass wrote into temp_buffer_, its contents are copied back
//      into `data`.
// The function waits on the last event before returning.
//
// data: buffer handed to the kernels and overwritten with the result.
//
// NOTE(review): if work_group_size() returned 0 while length_ > 0, the merge
// loop would never terminate (0 * 2 stays 0). Also, doubling the run length
// can overflow int when length_ exceeds half of INT_MAX. Neither case is
// guarded here -- confirm callers cannot hit them.
void GPUSort::Sort(CLBuffer &data) {
  CLEventList pending;
  CLEvent last_event;

  // Stage 1: local sort directly on the caller's buffer. No wait list is
  // passed (NULL); the completion event is captured for chaining.
  local_sort_kernel_->SetIntArg(0, length_);
  local_sort_kernel_->SetBufferArg(1, &data);
  local_sort_kernel_->SetLocalBufferArg(
      2, local_sort_kernel_->work_group_size() * sizeof(cl_int));
  local_sort_kernel_->Run1D(length_, NULL, &last_event);

  // Stage 2: merge passes. `src` always points at the buffer holding the
  // most recent output; `dst` is where the next pass will write.
  CLBuffer *src = &data;
  CLBuffer *dst = temp_buffer_.get();

  int run_length = local_sort_kernel_->work_group_size();
  while (run_length < length_) {
    // Each pass depends only on the immediately preceding launch.
    pending.Clear();
    pending.Add(last_event);

    merge_sort_pass_kernel_->SetIntArg(0, length_);
    merge_sort_pass_kernel_->SetIntArg(1, run_length);
    merge_sort_pass_kernel_->SetBufferArg(2, src);
    merge_sort_pass_kernel_->SetBufferArg(3, dst);
    merge_sort_pass_kernel_->Run1D(length_, &pending, &last_event);

    // The buffer just written becomes the input of the next pass.
    swap(src, dst);
    run_length *= 2;
  }

  // Stage 3: after an odd number of merge passes the latest output is in the
  // temporary buffer, so move it back into the caller's buffer.
  if (src != &data) {
    pending.Clear();
    pending.Add(last_event);
    data.CopyFrom(*src, &pending, &last_event);
  }

  // Wait on the last operation issued above (kernel launch or copy).
  last_event.WaitFor();
}