/* Allocate device storage for a generic buffer on the CPU device.
 *
 * Textures must go through tex_alloc() instead; MEM_DEVICE_ONLY buffers get
 * their own aligned host allocation, everything else aliases the existing
 * host pointer (CPU "device" memory is just host memory). */
void mem_alloc(device_memory &mem)
{
  if (mem.type == MEM_TEXTURE) {
    assert(!"mem_alloc not supported for textures.");
    return;
  }

  if (mem.name) {
    VLOG(1) << "Buffer allocate: " << mem.name << ", "
            << string_human_readable_number(mem.memory_size()) << " bytes. ("
            << string_human_readable_size(mem.memory_size()) << ")";
  }

  if (mem.type == MEM_DEVICE_ONLY) {
    /* Device-only buffers have no host side storage yet, so create an
     * aligned allocation that the kernels can treat as device memory. */
    assert(!mem.host_pointer);
    const size_t buffer_alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
    void *buffer = util_aligned_malloc(mem.memory_size(), buffer_alignment);
    mem.device_pointer = (device_ptr)buffer;
  }
  else {
    /* On the CPU device the host pointer doubles as the device pointer. */
    mem.device_pointer = (device_ptr)mem.host_pointer;
  }

  mem.device_size = mem.memory_size();
  stats.mem_alloc(mem.device_size);
}
/* Compute the 2D global work size for the split kernel.
 *
 * CPU OpenCL devices get a fixed small size; GPU devices are sized from the
 * maximum single-allocation size reported by the driver, optionally reduced
 * by the user-requested debug memory limit. */
virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
{
	cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
	/* Use small global size on CPU devices as it seems to be much faster. */
	if(type == CL_DEVICE_TYPE_CPU) {
		VLOG(1) << "Global size: (64, 64).";
		return make_int2(64, 64);
	}

	cl_ulong max_buffer_size;
	clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);

	if(DebugFlags().opencl.mem_limit) {
		const cl_ulong limit = (cl_ulong)DebugFlags().opencl.mem_limit;
		const cl_ulong used = (cl_ulong)device->stats.mem_used;
		/* Clamp at zero: the original `mem_limit - mem_used` underflows to a
		 * huge unsigned value when usage already exceeds the limit, which
		 * silently disables the limit. */
		const cl_ulong available = (limit > used) ? limit - used : 0;
		max_buffer_size = min(max_buffer_size, available);
	}

	VLOG(1) << "Maximum device allocation size: "
	        << string_human_readable_number(max_buffer_size) << " bytes. ("
	        << string_human_readable_size(max_buffer_size) << ").";

	/* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */
	max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l*1024*1024*1024);
	size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
	/* Width is rounded down to a multiple of 64 but kept at least 64 so the
	 * resulting size never degenerates to zero. */
	int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64),
	                             (int)sqrt(num_elements));
	VLOG(1) << "Global size: " << global_size << ".";
	return global_size;
}
/* How many split-kernel state elements fit in a buffer of the given size.
 *
 * The per-element cost is estimated by sizing the state buffer for 1024
 * elements and dividing back down. */
size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg,
                                                           device_memory& data,
                                                           uint64_t max_buffer_size)
{
	uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
	VLOG(1) << "Split state element size: "
	        << string_human_readable_number(size_per_element) << " bytes. ("
	        << string_human_readable_size(size_per_element) << ").";
	/* Guard against division by zero: a state buffer smaller than 1024 bytes
	 * would round the per-element size down to 0. */
	if(size_per_element == 0) {
		size_per_element = 1;
	}
	return max_buffer_size / size_per_element;
}
/* Build a human readable, multi-line report of all named size entries.
 * NOTE(review): the function continues beyond this view — the closing
 * `return result;` / brace is not visible here. */
string NamedSizeStats::full_report(int indent_level)
{
	const string indent(indent_level * kIndentNumSpaces, ' ');
	/* Per-entry lines are indented one extra level relative to the total. */
	const string double_indent = indent + indent;
	string result = "";
	result += string_printf("%sTotal memory: %s (%s)\n",
	                        indent.c_str(),
	                        string_human_readable_size(total_size).c_str(),
	                        string_human_readable_number(total_size).c_str());
	/* Sort entries (presumably largest-first per the comparator declared
	 * elsewhere — confirm against namedSizeEntryComparator). */
	sort(entries.begin(), entries.end(), namedSizeEntryComparator);
	foreach(const NamedSizeEntry& entry, entries) {
		result += string_printf(
		        "%s%-32s %s (%s)\n",
		        double_indent.c_str(),
		        entry.name.c_str(),
		        string_human_readable_size(entry.size).c_str(),
		        string_human_readable_number(entry.size).c_str());
	}
/* Pretty-print the full set of debug flags (CPU, CUDA and OpenCL sections)
 * for logging purposes. */
std::ostream& operator <<(std::ostream &os, DebugFlagsConstRef debug_flags)
{
	os << "CPU flags:\n"
	   << "  AVX2       : " << string_from_bool(debug_flags.cpu.avx2) << "\n"
	   << "  AVX        : " << string_from_bool(debug_flags.cpu.avx) << "\n"
	   << "  SSE4.1     : " << string_from_bool(debug_flags.cpu.sse41) << "\n"
	   << "  SSE3       : " << string_from_bool(debug_flags.cpu.sse3) << "\n"
	   << "  SSE2       : " << string_from_bool(debug_flags.cpu.sse2) << "\n"
	   << "  BVH layout : " << bvh_layout_name(debug_flags.cpu.bvh_layout) << "\n"
	   << "  Split      : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n";
	os << "CUDA flags:\n"
	   << "  Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n";

	/* Initialize both strings: the switches below have no default case, so an
	 * out-of-range enum value would otherwise leave the pointers
	 * uninitialized and reading them would be undefined behavior. */
	const char *opencl_device_type = "UNKNOWN", *opencl_kernel_type = "UNKNOWN";
	switch(debug_flags.opencl.device_type) {
		case DebugFlags::OpenCL::DEVICE_NONE:
			opencl_device_type = "NONE";
			break;
		case DebugFlags::OpenCL::DEVICE_ALL:
			opencl_device_type = "ALL";
			break;
		case DebugFlags::OpenCL::DEVICE_DEFAULT:
			opencl_device_type = "DEFAULT";
			break;
		case DebugFlags::OpenCL::DEVICE_CPU:
			opencl_device_type = "CPU";
			break;
		case DebugFlags::OpenCL::DEVICE_GPU:
			opencl_device_type = "GPU";
			break;
		case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
			opencl_device_type = "ACCELERATOR";
			break;
	}
	switch(debug_flags.opencl.kernel_type) {
		case DebugFlags::OpenCL::KERNEL_DEFAULT:
			opencl_kernel_type = "DEFAULT";
			break;
		case DebugFlags::OpenCL::KERNEL_MEGA:
			opencl_kernel_type = "MEGA";
			break;
		case DebugFlags::OpenCL::KERNEL_SPLIT:
			opencl_kernel_type = "SPLIT";
			break;
	}
	os << "OpenCL flags:\n"
	   << "  Device type    : " << opencl_device_type << "\n"
	   << "  Kernel type    : " << opencl_kernel_type << "\n"
	   << "  Debug          : " << string_from_bool(debug_flags.opencl.debug) << "\n"
	   << "  Single program : " << string_from_bool(debug_flags.opencl.single_program) << "\n"
	   << "  Memory limit   : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n";
	return os;
}
/* Allocate a buffer on the remote device.
 *
 * Assigns a fresh local handle as the device pointer and forwards the
 * request over the RPC socket under the RPC lock. */
void mem_alloc(device_memory& mem)
{
	if(mem.name) {
		const size_t bytes = mem.memory_size();
		VLOG(1) << "Buffer allocate: " << mem.name << ", "
		        << string_human_readable_number(bytes) << " bytes. ("
		        << string_human_readable_size(bytes) << ")";
	}

	/* Serialize RPC traffic; the counter doubles as a unique handle. */
	thread_scoped_lock lock(rpc_lock);
	mem.device_pointer = ++mem_counter;

	RPCSend snd(socket, &error_func, "mem_alloc");
	snd.add(mem);
	snd.write();
}
/* Register a texture with the CPU device.
 *
 * Data textures (no interpolation) are copied straight into the kernel
 * globals; image textures are recorded in the flat texture_info table at the
 * slot encoded in the texture's name. */
void tex_alloc(device_memory &mem)
{
	VLOG(1) << "Texture allocate: " << mem.name << ", "
	        << string_human_readable_number(mem.memory_size()) << " bytes. ("
	        << string_human_readable_size(mem.memory_size()) << ")";

	if (mem.interpolation == INTERPOLATION_NONE) {
		/* Data texture. */
		kernel_tex_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
	}
	else {
		/* Image Texture. */
		int flat_slot = 0;
		if (string_startswith(mem.name, "__tex_image")) {
			/* Slot index is the numeric suffix after the last underscore,
			 * e.g. "__tex_image_..._005" -> 5. */
			int pos = string(mem.name).rfind("_");
			flat_slot = atoi(mem.name + pos + 1);
		}
		else {
			/* Image textures are expected to always carry the
			 * "__tex_image" prefix. */
			assert(0);
		}

		if (flat_slot >= texture_info.size()) {
			/* Allocate some slots in advance, to reduce amount
			 * of re-allocations. */
			texture_info.resize(flat_slot + 128);
		}

		/* Fill in the descriptor the kernel uses to sample this texture. */
		TextureInfo &info = texture_info[flat_slot];
		info.data = (uint64_t)mem.host_pointer;
		info.cl_buffer = 0;
		info.interpolation = mem.interpolation;
		info.extension = mem.extension;
		info.width = mem.data_width;
		info.height = mem.data_height;
		info.depth = mem.data_depth;

		/* Flag the table for re-upload before the next render. */
		need_texture_info = true;
	}

	/* On the CPU device the host pointer doubles as the device pointer. */
	mem.device_pointer = (device_ptr)mem.host_pointer;
	mem.device_size = mem.memory_size();
	stats.mem_alloc(mem.device_size);
}
/* Compute the 2D global work size for the split kernel.
 *
 * CPU OpenCL devices get a fixed small size; GPU devices are sized from the
 * maximum single-allocation size reported by the driver. */
virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
{
	cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
	/* Use small global size on CPU devices as it seems to be much faster. */
	if(type == CL_DEVICE_TYPE_CPU) {
		VLOG(1) << "Global size: (64, 64).";
		return make_int2(64, 64);
	}

	cl_ulong max_buffer_size;
	clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
	VLOG(1) << "Maximum device allocation size: "
	        << string_human_readable_number(max_buffer_size) << " bytes. ("
	        << string_human_readable_size(max_buffer_size) << ").";

	size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2);
	/* Keep the width at least 64: for small element counts round_down() would
	 * otherwise return 0 and produce an empty global size. This matches the
	 * sibling implementation that applies the same floor. */
	int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64),
	                             (int)sqrt(num_elements));
	VLOG(1) << "Global size: " << global_size << ".";
	return global_size;
}
void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, ExtensionType extension) { VLOG(1) << "Texture allocate: " << name << ", " << string_human_readable_number(mem.memory_size()) << " bytes. (" << string_human_readable_size(mem.memory_size()) << ")"; thread_scoped_lock lock(rpc_lock); mem.device_pointer = ++mem_counter; RPCSend snd(socket, &error_func, "tex_alloc"); string name_string(name); snd.add(name_string); snd.add(mem); snd.add(interpolation); snd.add(extension); snd.write(); snd.write_buffer((void*)mem.data_pointer, mem.memory_size()); }