Пример #1
0
  /* Set up device-side storage for a non-texture buffer.
   *
   * MEM_DEVICE_ONLY buffers get a freshly allocated aligned block, every other
   * type reuses the host pointer as the device pointer. The resulting size is
   * stored in mem.device_size and reported to stats. Texture buffers trigger an
   * assert and are otherwise left untouched. */
  void mem_alloc(device_memory &mem)
  {
    if (mem.type == MEM_TEXTURE) {
      assert(!"mem_alloc not supported for textures.");
      return;
    }

    /* Only named buffers are logged. */
    if (mem.name) {
      VLOG(1) << "Buffer allocate: " << mem.name << ", "
              << string_human_readable_number(mem.memory_size()) << " bytes. ("
              << string_human_readable_size(mem.memory_size()) << ")";
    }

    if (mem.type != MEM_DEVICE_ONLY) {
      /* The host allocation doubles as the device allocation. */
      mem.device_pointer = (device_ptr)mem.host_pointer;
    }
    else {
      /* Device-only memory is expected to have no host allocation. */
      assert(!mem.host_pointer);
      size_t required_alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
      void *buffer = util_aligned_malloc(mem.memory_size(), required_alignment);
      mem.device_pointer = (device_ptr)buffer;
    }

    mem.device_size = mem.memory_size();
    stats.mem_alloc(mem.device_size);
  }
Пример #2
0
	/* Choose the 2D global work size for the split kernel.
	 *
	 * CPU devices always get a fixed (64, 64) size. For other devices the size
	 * is derived from how many split state elements fit into half of the
	 * largest allowed device allocation, capped at 2 GB and optionally reduced
	 * by the debug memory limit. */
	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
	{
		/* Use small global size on CPU devices as it seems to be much faster. */
		if(OpenCLInfo::get_device_type(device->cdDevice) == CL_DEVICE_TYPE_CPU) {
			VLOG(1) << "Global size: (64, 64).";
			return make_int2(64, 64);
		}

		cl_ulong max_buffer_size;
		clGetDeviceInfo(device->cdDevice,
		                CL_DEVICE_MAX_MEM_ALLOC_SIZE,
		                sizeof(cl_ulong),
		                &max_buffer_size,
		                NULL);

		/* A non-zero debug memory limit restricts the budget to what is left
		 * of the limit after subtracting the memory already in use. */
		if(DebugFlags().opencl.mem_limit) {
			const cl_ulong remaining = cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used);
			max_buffer_size = min(max_buffer_size, remaining);
		}

		VLOG(1) << "Maximum device allocation size: "
		        << string_human_readable_number(max_buffer_size) << " bytes. ("
		        << string_human_readable_size(max_buffer_size) << ").";

		/* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */
		const cl_ulong two_gb = (cl_ulong)2l*1024*1024*1024;
		max_buffer_size = min(max_buffer_size / 2, two_gb);

		/* Width is rounded down to a multiple of 64 but never below 64;
		 * height is the plain integer square root of the element count. */
		const size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
		const int side = (int)sqrt(num_elements);
		int2 global_size = make_int2(max(round_down(side, 64), 64), side);
		VLOG(1) << "Global size: " << global_size << ".";
		return global_size;
	}
Пример #3
0
/* Return how many split state elements fit into max_buffer_size bytes.
 *
 * The per-element size is obtained by querying the state buffer size for a
 * batch of 1024 elements and dividing by 1024 (integer division). */
size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size)
{
	const uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;

	VLOG(1) << "Split state element size: "
	        << string_human_readable_number(size_per_element) << " bytes. ("
	        << string_human_readable_size(size_per_element) << ").";

	const uint64_t max_elements = max_buffer_size / size_per_element;
	return max_elements;
}
Пример #4
0
/* Build a textual report of the total size followed by one line per entry.
 *
 * indent_level is multiplied by kIndentNumSpaces to get the number of spaces
 * used as indentation; entry lines use twice that indentation.
 *
 * NOTE(review): the remainder of this function (including its return) is not
 * visible in this excerpt. */
string NamedSizeStats::full_report(int indent_level)
{
	const string indent(indent_level * kIndentNumSpaces, ' ');
	const string double_indent = indent + indent;
	string result = "";
	/* Header line: total size, human readable size first, then the number. */
	result += string_printf("%sTotal memory: %s (%s)\n",
	                        indent.c_str(),
	                        string_human_readable_size(total_size).c_str(),
	                        string_human_readable_number(total_size).c_str());
	/* Sorts the stored entries in place; the resulting order is defined by
	 * namedSizeEntryComparator. */
	sort(entries.begin(), entries.end(), namedSizeEntryComparator);
	foreach(const NamedSizeEntry& entry, entries) {
		/* Entry name is left-justified and padded to 32 characters. */
		result += string_printf(
		        "%s%-32s %s (%s)\n",
		        double_indent.c_str(),
		        entry.name.c_str(),
		        string_human_readable_size(entry.size).c_str(),
		        string_human_readable_number(entry.size).c_str());
	}
Пример #5
0
/* Write a human readable dump of the CPU, CUDA and OpenCL debug flags to os
 * and return the stream.
 *
 * The OpenCL device and kernel type names fall back to "UNKNOWN" when the
 * stored enum value matches none of the handled cases, so an indeterminate
 * pointer is never streamed. */
std::ostream& operator <<(std::ostream &os,
                          DebugFlagsConstRef debug_flags)
{
	os << "CPU flags:\n"
	   << "  AVX2       : " << string_from_bool(debug_flags.cpu.avx2) << "\n"
	   << "  AVX        : " << string_from_bool(debug_flags.cpu.avx) << "\n"
	   << "  SSE4.1     : " << string_from_bool(debug_flags.cpu.sse41) << "\n"
	   << "  SSE3       : " << string_from_bool(debug_flags.cpu.sse3) << "\n"
	   << "  SSE2       : " << string_from_bool(debug_flags.cpu.sse2) << "\n"
	   << "  BVH layout : " << bvh_layout_name(debug_flags.cpu.bvh_layout) << "\n"
	   << "  Split      : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n";

	os << "CUDA flags:\n"
	   << " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n";

	/* Initialized up front instead of via a `default:` label, so the compiler
	 * can still warn about enumerators missing from the switches below. */
	const char *opencl_device_type = "UNKNOWN";
	const char *opencl_kernel_type = "UNKNOWN";
	switch(debug_flags.opencl.device_type) {
		case DebugFlags::OpenCL::DEVICE_NONE:
			opencl_device_type = "NONE";
			break;
		case DebugFlags::OpenCL::DEVICE_ALL:
			opencl_device_type = "ALL";
			break;
		case DebugFlags::OpenCL::DEVICE_DEFAULT:
			opencl_device_type = "DEFAULT";
			break;
		case DebugFlags::OpenCL::DEVICE_CPU:
			opencl_device_type = "CPU";
			break;
		case DebugFlags::OpenCL::DEVICE_GPU:
			opencl_device_type = "GPU";
			break;
		case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
			opencl_device_type = "ACCELERATOR";
			break;
	}
	switch(debug_flags.opencl.kernel_type) {
		case DebugFlags::OpenCL::KERNEL_DEFAULT:
			opencl_kernel_type = "DEFAULT";
			break;
		case DebugFlags::OpenCL::KERNEL_MEGA:
			opencl_kernel_type = "MEGA";
			break;
		case DebugFlags::OpenCL::KERNEL_SPLIT:
			opencl_kernel_type = "SPLIT";
			break;
	}
	os << "OpenCL flags:\n"
	   << "  Device type    : " << opencl_device_type << "\n"
	   << "  Kernel type    : " << opencl_kernel_type << "\n"
	   << "  Debug          : " << string_from_bool(debug_flags.opencl.debug) << "\n"
	   << "  Single program : " << string_from_bool(debug_flags.opencl.single_program) << "\n"
	   << "  Memory limit   : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n";
	return os;
}
Пример #6
0
	/* Assign the next counter value as the buffer's device pointer and send
	 * a "mem_alloc" request describing the buffer over the socket.
	 *
	 * The counter update and the request are done while holding rpc_lock. */
	void mem_alloc(device_memory& mem)
	{
		/* Only named buffers are logged. */
		if(mem.name != NULL) {
			VLOG(1) << "Buffer allocate: " << mem.name << ", "
			        << string_human_readable_number(mem.memory_size()) << " bytes. ("
			        << string_human_readable_size(mem.memory_size()) << ")";
		}

		thread_scoped_lock rpc_guard(rpc_lock);

		/* The device pointer is the freshly incremented counter value. */
		++mem_counter;
		mem.device_pointer = mem_counter;

		RPCSend request(socket, &error_func, "mem_alloc");
		request.add(mem);
		request.write();
	}
Пример #7
0
  /* Register a texture with the CPU device.
   *
   * Textures with INTERPOLATION_NONE are data textures and are copied into the
   * kernel globals by name. All other textures are image textures: their slot
   * index is parsed from the digits after the last '_' of a name starting with
   * "__tex_image", and the matching texture_info entry is filled in.
   *
   * In both cases the host pointer is used as the device pointer and the size
   * is recorded in mem.device_size and reported to stats. */
  void tex_alloc(device_memory &mem)
  {
    VLOG(1) << "Texture allocate: " << mem.name << ", "
            << string_human_readable_number(mem.memory_size()) << " bytes. ("
            << string_human_readable_size(mem.memory_size()) << ")";

    if (mem.interpolation == INTERPOLATION_NONE) {
      /* Data texture. */
      kernel_tex_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
    }
    else {
      /* Image Texture. */
      int flat_slot = 0;
      if (string_startswith(mem.name, "__tex_image")) {
        /* Keep the position as size_t: rfind() returns an unsigned index. */
        const size_t pos = string(mem.name).rfind("_");
        flat_slot = atoi(mem.name + pos + 1);
      }
      else {
        assert(0);
      }

      if (flat_slot < 0) {
        /* A negative slot would index out of bounds; use the same fallback
         * as for an unrecognized name. */
        assert(0);
        flat_slot = 0;
      }

      /* Non-negative from here on, so the unsigned comparison is exact. */
      const size_t slot = (size_t)flat_slot;
      if (slot >= texture_info.size()) {
        /* Allocate some slots in advance, to reduce amount
         * of re-allocations. */
        texture_info.resize(slot + 128);
      }

      TextureInfo &info = texture_info[slot];
      info.data = (uint64_t)mem.host_pointer;
      info.cl_buffer = 0;
      info.interpolation = mem.interpolation;
      info.extension = mem.extension;
      info.width = mem.data_width;
      info.height = mem.data_height;
      info.depth = mem.data_depth;

      need_texture_info = true;
    }

    mem.device_pointer = (device_ptr)mem.host_pointer;
    mem.device_size = mem.memory_size();
    stats.mem_alloc(mem.device_size);
  }
Пример #8
0
	/* Choose the 2D global work size for the split kernel.
	 *
	 * CPU devices always get a fixed (64, 64) size. For other devices the size
	 * is derived from how many split state elements fit into half of the
	 * largest allowed device allocation. The width is a multiple of 64 and is
	 * never smaller than 64, so a small element count cannot produce a
	 * zero-width global size. */
	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
	{
		cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
		/* Use small global size on CPU devices as it seems to be much faster. */
		if(type == CL_DEVICE_TYPE_CPU) {
			VLOG(1) << "Global size: (64, 64).";
			return make_int2(64, 64);
		}

		cl_ulong max_buffer_size;
		clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
		VLOG(1) << "Maximum device allocation size: "
		        << string_human_readable_number(max_buffer_size) << " bytes. ("
		        << string_human_readable_size(max_buffer_size) << ").";

		size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2);
		const int side = (int)sqrt(num_elements);

		/* round_down() gives 0 when side < 64; clamp to one full multiple. */
		int width = round_down(side, 64);
		if(width < 64) {
			width = 64;
		}

		int2 global_size = make_int2(width, side);
		VLOG(1) << "Global size: " << global_size << ".";
		return global_size;
	}
Пример #9
0
	void tex_alloc(const char *name,
	               device_memory& mem,
	               InterpolationType interpolation,
	               ExtensionType extension)
	{
		VLOG(1) << "Texture allocate: " << name << ", "
		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
		        << string_human_readable_size(mem.memory_size()) << ")";

		thread_scoped_lock lock(rpc_lock);

		mem.device_pointer = ++mem_counter;

		RPCSend snd(socket, &error_func, "tex_alloc");

		string name_string(name);

		snd.add(name_string);
		snd.add(mem);
		snd.add(interpolation);
		snd.add(extension);
		snd.write();
		snd.write_buffer((void*)mem.data_pointer, mem.memory_size());
	}