int qemuSetupGlobalCpuCgroup(virDomainObjPtr vm) { qemuDomainObjPrivatePtr priv = vm->privateData; unsigned long long period = vm->def->cputune.global_period; long long quota = vm->def->cputune.global_quota; char *mem_mask = NULL; virDomainNumatuneMemMode mem_mode; if ((period || quota) && !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("cgroup cpu is required for scheduler tuning")); return -1; } /* * If CPU cgroup controller is not initialized here, then we need * neither period nor quota settings. And if CPUSET controller is * not initialized either, then there's nothing to do anyway. */ if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) && !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) return 0; if (virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 && mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT && virDomainNumatuneMaybeFormatNodeset(vm->def->numa, priv->autoNodeset, &mem_mask, -1) < 0) goto cleanup; if (period || quota) { if (qemuSetupCgroupVcpuBW(priv->cgroup, period, quota) < 0) goto cleanup; } VIR_FREE(mem_mask); return 0; cleanup: VIR_FREE(mem_mask); return -1; }
static int CVE_2013_6456_libvirt1_1_0_lxcDomainDetachDeviceHostdevMiscLive(virDomainObjPtr vm, virDomainDeviceDefPtr dev) { virLXCDomainObjPrivatePtr priv = vm->privateData; virDomainHostdevDefPtr def = NULL; int i, ret = -1; char *dst = NULL; if (!priv->initpid) { virReportError(VIR_ERR_OPERATION_INVALID, "%s", _("Cannot attach disk until init PID is known")); goto cleanup; } if ((i = virDomainHostdevFind(vm->def, dev->data.hostdev, &def)) < 0) { virReportError(VIR_ERR_OPERATION_FAILED, _("hostdev %s not found"), dev->data.hostdev->source.caps.u.misc.chardev); goto cleanup; } if (virAsprintf(&dst, "/proc/%llu/root/%s", (unsigned long long)priv->initpid, def->source.caps.u.misc.chardev) < 0) { virReportOOMError(); goto cleanup; } if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) { virReportError(VIR_ERR_OPERATION_INVALID, "%s", _("devices cgroup isn't mounted")); goto cleanup; } VIR_DEBUG("Unlinking %s", dst); if (unlink(dst) < 0 && errno != ENOENT) { virDomainAuditHostdev(vm, def, "detach", false); virReportSystemError(errno, _("Unable to remove device %s"), dst); goto cleanup; } virDomainAuditHostdev(vm, def, "detach", true); if (virCgroupDenyDevicePath(priv->cgroup, def->source.caps.u.misc.chardev, VIR_CGROUP_DEVICE_RWM) != 0) VIR_WARN("cannot deny device %s for domain %s", def->source.caps.u.misc.chardev, vm->def->name); virDomainHostdevRemove(vm->def, i); virDomainHostdevDefFree(def); ret = 0; cleanup: VIR_FREE(dst); return ret; }
int qemuTeardownImageCgroup(virDomainObjPtr vm, virStorageSourcePtr src) { qemuDomainObjPrivatePtr priv = vm->privateData; int perms = VIR_CGROUP_DEVICE_RWM; size_t i; int ret; if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) return 0; if (!src->path || !virStorageSourceIsLocalStorage(src)) { VIR_DEBUG("Not updating cgroups for disk path '%s', type: %s", NULLSTR(src->path), virStorageTypeToString(src->type)); return 0; } if (virFileExists(DEVICE_MAPPER_CONTROL_PATH)) { for (i = 0; i < vm->def->ndisks; i++) { virStorageSourcePtr diskSrc = vm->def->disks[i]->src; if (src == diskSrc) continue; if (virStoragePRDefIsManaged(diskSrc->pr)) break; } if (i == vm->def->ndisks) { VIR_DEBUG("Disabling device mapper control"); ret = virCgroupDenyDevicePath(priv->cgroup, DEVICE_MAPPER_CONTROL_PATH, perms, true); virDomainAuditCgroupPath(vm, priv->cgroup, "deny", DEVICE_MAPPER_CONTROL_PATH, virCgroupGetDevicePermsString(perms), ret); if (ret < 0) return ret; } } VIR_DEBUG("Deny path %s", src->path); ret = virCgroupDenyDevicePath(priv->cgroup, src->path, perms, true); virDomainAuditCgroupPath(vm, priv->cgroup, "deny", src->path, virCgroupGetDevicePermsString(perms), ret); /* If you're looking for a counter part to * qemuSetupImagePathCgroup you're at the right place. * However, we can't just blindly deny all the device mapper * targets of src->path because they might still be used by * another disk in domain. Just like we are not removing * disks from namespace. */ return ret; }
static int qemuSetupImagePathCgroup(virDomainObjPtr vm, const char *path, bool readonly) { qemuDomainObjPrivatePtr priv = vm->privateData; int perms = VIR_CGROUP_DEVICE_READ; char **targetPaths = NULL; size_t i; int rv; int ret = -1; if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) return 0; if (!readonly) perms |= VIR_CGROUP_DEVICE_WRITE; VIR_DEBUG("Allow path %s, perms: %s", path, virCgroupGetDevicePermsString(perms)); rv = virCgroupAllowDevicePath(priv->cgroup, path, perms, true); virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path, virCgroupGetDevicePermsString(perms), rv); if (rv < 0) goto cleanup; if (rv > 0) { /* @path is neither character device nor block device. */ ret = 0; goto cleanup; } if (virDevMapperGetTargets(path, &targetPaths) < 0 && errno != ENOSYS && errno != EBADF) { virReportSystemError(errno, _("Unable to get devmapper targets for %s"), path); goto cleanup; } for (i = 0; targetPaths && targetPaths[i]; i++) { rv = virCgroupAllowDevicePath(priv->cgroup, targetPaths[i], perms, false); virDomainAuditCgroupPath(vm, priv->cgroup, "allow", targetPaths[i], virCgroupGetDevicePermsString(perms), rv); if (rv < 0) goto cleanup; } ret = 0; cleanup: virStringListFree(targetPaths); return ret; }
int qemuTeardownHostdevCgroup(virDomainObjPtr vm, virDomainHostdevDefPtr dev) { int ret = -1; qemuDomainObjPrivatePtr priv = vm->privateData; virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci; virPCIDevicePtr pci = NULL; char *path = NULL; /* currently this only does something for PCI devices using vfio * for device assignment, but it is called for *all* hostdev * devices. */ if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) return 0; if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) { switch (dev->source.subsys.type) { case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI: if (pcisrc->backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) { int rv; pci = virPCIDeviceNew(pcisrc->addr.domain, pcisrc->addr.bus, pcisrc->addr.slot, pcisrc->addr.function); if (!pci) goto cleanup; if (!(path = virPCIDeviceGetIOMMUGroupDev(pci))) goto cleanup; VIR_DEBUG("Cgroup deny %s for PCI device assignment", path); rv = virCgroupDenyDevicePath(priv->cgroup, path, VIR_CGROUP_DEVICE_RWM); virDomainAuditCgroupPath(vm, priv->cgroup, "deny", path, "rwm", rv == 0); if (rv < 0) goto cleanup; } break; case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB: /* nothing to tear down for USB */ break; default: break; } } ret = 0; cleanup: virPCIDeviceFree(pci); VIR_FREE(path); return ret; }
int qemuSetupDiskCgroup(virDomainObjPtr vm, virDomainDiskDefPtr disk) { qemuDomainObjPrivatePtr priv = vm->privateData; if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) return 0; return virDomainDiskDefForeachPath(disk, true, qemuSetupDiskPathAllow, vm); }
static int qemuSetupBlkioCgroup(virDomainObjPtr vm) { qemuDomainObjPrivatePtr priv = vm->privateData; size_t i; if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_BLKIO)) { if (vm->def->blkio.weight || vm->def->blkio.ndevices) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("Block I/O tuning is not available on this host")); return -1; } else { return 0; } } if (vm->def->blkio.weight != 0 && virCgroupSetBlkioWeight(priv->cgroup, vm->def->blkio.weight) < 0) return -1; if (vm->def->blkio.ndevices) { for (i = 0; i < vm->def->blkio.ndevices; i++) { virBlkioDevicePtr dev = &vm->def->blkio.devices[i]; if (dev->weight && (virCgroupSetBlkioDeviceWeight(priv->cgroup, dev->path, dev->weight) < 0)) return -1; if (dev->riops && (virCgroupSetBlkioDeviceReadIops(priv->cgroup, dev->path, dev->riops) < 0)) return -1; if (dev->wiops && (virCgroupSetBlkioDeviceWriteIops(priv->cgroup, dev->path, dev->wiops) < 0)) return -1; if (dev->rbps && (virCgroupSetBlkioDeviceReadBps(priv->cgroup, dev->path, dev->rbps) < 0)) return -1; if (dev->wbps && (virCgroupSetBlkioDeviceWriteBps(priv->cgroup, dev->path, dev->wbps) < 0)) return -1; } } return 0; }
static int qemuSetImageCgroupInternal(virDomainObjPtr vm, virStorageSourcePtr src, bool deny, bool forceReadonly) { qemuDomainObjPrivatePtr priv = vm->privateData; int perms = VIR_CGROUP_DEVICE_READ; int ret; if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) return 0; if (!src->path || !virStorageSourceIsLocalStorage(src)) { VIR_DEBUG("Not updating cgroups for disk path '%s', type: %s", NULLSTR(src->path), virStorageTypeToString(src->type)); return 0; } if (deny) { perms |= VIR_CGROUP_DEVICE_WRITE | VIR_CGROUP_DEVICE_MKNOD; VIR_DEBUG("Deny path %s", src->path); ret = virCgroupDenyDevicePath(priv->cgroup, src->path, perms); } else { if (!src->readonly && !forceReadonly) perms |= VIR_CGROUP_DEVICE_WRITE; VIR_DEBUG("Allow path %s, perms: %s", src->path, virCgroupGetDevicePermsString(perms)); ret = virCgroupAllowDevicePath(priv->cgroup, src->path, perms); } virDomainAuditCgroupPath(vm, priv->cgroup, deny ? "deny" : "allow", src->path, virCgroupGetDevicePermsString(perms), ret == 0); /* Get this for root squash NFS */ if (ret < 0 && virLastErrorIsSystemErrno(EACCES)) { VIR_DEBUG("Ignoring EACCES for %s", src->path); virResetLastError(); ret = 0; } return ret; }
static int qemuSetupCpusetCgroup(virDomainObjPtr vm) { qemuDomainObjPrivatePtr priv = vm->privateData; if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) return 0; if (virCgroupSetCpusetMemoryMigrate(priv->cgroup, true) < 0) return -1; return 0; }
static int qemuSetupSEVCgroup(virDomainObjPtr vm) { qemuDomainObjPrivatePtr priv = vm->privateData; int ret; if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) return 0; ret = virCgroupAllowDevicePath(priv->cgroup, "/dev/sev", VIR_CGROUP_DEVICE_RW, false); virDomainAuditCgroupPath(vm, priv->cgroup, "allow", "/dev/sev", "rw", ret); return ret; }
static int qemuSetupGraphicsCgroup(virDomainObjPtr vm, virDomainGraphicsDefPtr gfx) { qemuDomainObjPrivatePtr priv = vm->privateData; const char *rendernode = virDomainGraphicsGetRenderNode(gfx); int ret; if (!rendernode || !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) return 0; ret = virCgroupAllowDevicePath(priv->cgroup, rendernode, VIR_CGROUP_DEVICE_RW, false); virDomainAuditCgroupPath(vm, priv->cgroup, "allow", rendernode, "rw", ret); return ret; }
static int qemuSetupCpuCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm) { qemuDomainObjPrivatePtr priv = vm->privateData; virObjectEventPtr event = NULL; virTypedParameterPtr eventParams = NULL; int eventNparams = 0; int eventMaxparams = 0; if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { if (vm->def->cputune.sharesSpecified) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("CPU tuning is not available on this host")); return -1; } else { return 0; } } if (vm->def->cputune.sharesSpecified) { unsigned long long val; if (virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0) return -1; if (virCgroupGetCpuShares(priv->cgroup, &val) < 0) return -1; if (vm->def->cputune.shares != val) { vm->def->cputune.shares = val; if (virTypedParamsAddULLong(&eventParams, &eventNparams, &eventMaxparams, VIR_DOMAIN_TUNABLE_CPU_CPU_SHARES, val) < 0) return -1; event = virDomainEventTunableNewFromObj(vm, eventParams, eventNparams); } if (event) qemuDomainEventQueue(driver, event); } return 0; }
int qemuTeardownMemoryDevicesCgroup(virDomainObjPtr vm, virDomainMemoryDefPtr mem) { qemuDomainObjPrivatePtr priv = vm->privateData; int rv; if (mem->model != VIR_DOMAIN_MEMORY_MODEL_NVDIMM) return 0; if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) return 0; rv = virCgroupDenyDevicePath(priv->cgroup, mem->nvdimmPath, VIR_CGROUP_DEVICE_RWM, false); virDomainAuditCgroupPath(vm, priv->cgroup, "deny", mem->nvdimmPath, "rwm", rv); return rv; }
int qemuTeardownHostdevCgroup(virDomainObjPtr vm, virDomainHostdevDefPtr dev) { qemuDomainObjPrivatePtr priv = vm->privateData; char **path = NULL; size_t i, npaths = 0; int rv, ret = -1; /* currently this only does something for PCI devices using vfio * for device assignment, but it is called for *all* hostdev * devices. */ if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) return 0; if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS && dev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI && dev->source.subsys.u.pci.backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO && qemuDomainGetHostdevPath(vm->def, dev, true, &npaths, &path, NULL) < 0) goto cleanup; for (i = 0; i < npaths; i++) { VIR_DEBUG("Cgroup deny %s", path[i]); rv = virCgroupDenyDevicePath(priv->cgroup, path[i], VIR_CGROUP_DEVICE_RWM, false); virDomainAuditCgroupPath(vm, priv->cgroup, "deny", path[i], "rwm", rv); if (rv < 0) goto cleanup; } ret = 0; cleanup: for (i = 0; i < npaths; i++) VIR_FREE(path[i]); VIR_FREE(path); return ret; }
static int qemuSetupCpuCgroup(virDomainObjPtr vm) { qemuDomainObjPrivatePtr priv = vm->privateData; if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { if (vm->def->cputune.shares) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("CPU tuning is not available on this host")); return -1; } else { return 0; } } if (vm->def->cputune.shares && virCgroupSetCpuShares(priv->cgroup, vm->def->cputune.shares) < 0) return -1; return 0; }
int qemuSetupMemoryDevicesCgroup(virDomainObjPtr vm, virDomainMemoryDefPtr mem) { qemuDomainObjPrivatePtr priv = vm->privateData; int rv; if (mem->model != VIR_DOMAIN_MEMORY_MODEL_NVDIMM) return 0; if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) return 0; VIR_DEBUG("Setting devices Cgroup for NVDIMM device: %s", mem->nvdimmPath); rv = virCgroupAllowDevicePath(priv->cgroup, mem->nvdimmPath, VIR_CGROUP_DEVICE_RW, false); virDomainAuditCgroupPath(vm, priv->cgroup, "allow", mem->nvdimmPath, "rw", rv); return rv; }
int qemuTeardownInputCgroup(virDomainObjPtr vm, virDomainInputDefPtr dev) { qemuDomainObjPrivatePtr priv = vm->privateData; int ret = 0; if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) return 0; switch (dev->type) { case VIR_DOMAIN_INPUT_TYPE_PASSTHROUGH: VIR_DEBUG("Process path '%s' for input device", dev->source.evdev); ret = virCgroupDenyDevicePath(priv->cgroup, dev->source.evdev, VIR_CGROUP_DEVICE_RWM, false); virDomainAuditCgroupPath(vm, priv->cgroup, "deny", dev->source.evdev, "rwm", ret); break; } return ret; }
/** * qemuCgroupEmulatorAllNodesAllow: * @cgroup: domain cgroup pointer * @retData: filled with structure used to roll back the operation * * Allows all NUMA nodes for the qemu emulator thread temporarily. This is * necessary when hotplugging cpus since it requires memory allocated in the * DMA region. Afterwards the operation can be reverted by * qemuCgroupEmulatorAllNodesRestore. * * Returns 0 on success -1 on error */ int qemuCgroupEmulatorAllNodesAllow(virCgroupPtr cgroup, qemuCgroupEmulatorAllNodesDataPtr *retData) { qemuCgroupEmulatorAllNodesDataPtr data = NULL; char *all_nodes_str = NULL; virBitmapPtr all_nodes = NULL; int ret = -1; if (!virNumaIsAvailable() || !virCgroupHasController(cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) return 0; if (!(all_nodes = virNumaGetHostMemoryNodeset())) goto cleanup; if (!(all_nodes_str = virBitmapFormat(all_nodes))) goto cleanup; if (VIR_ALLOC(data) < 0) goto cleanup; if (virCgroupNewThread(cgroup, VIR_CGROUP_THREAD_EMULATOR, 0, false, &data->emulatorCgroup) < 0) goto cleanup; if (virCgroupGetCpusetMems(data->emulatorCgroup, &data->emulatorMemMask) < 0 || virCgroupSetCpusetMems(data->emulatorCgroup, all_nodes_str) < 0) goto cleanup; VIR_STEAL_PTR(*retData, data); ret = 0; cleanup: VIR_FREE(all_nodes_str); virBitmapFree(all_nodes); qemuCgroupEmulatorAllNodesDataFree(data); return ret; }
static int qemuTeardownChrSourceCgroup(virDomainObjPtr vm, virDomainChrSourceDefPtr source) { qemuDomainObjPrivatePtr priv = vm->privateData; int ret; if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) return 0; if (source->type != VIR_DOMAIN_CHR_TYPE_DEV) return 0; VIR_DEBUG("Process path '%s' for device", source->data.file.path); ret = virCgroupDenyDevicePath(priv->cgroup, source->data.file.path, VIR_CGROUP_DEVICE_RW, false); virDomainAuditCgroupPath(vm, priv->cgroup, "deny", source->data.file.path, "rw", ret); return ret; }
int qemuSetupCgroupForEmulator(virQEMUDriverPtr driver, virDomainObjPtr vm, virBitmapPtr nodemask) { virBitmapPtr cpumask = NULL; virBitmapPtr cpumap = NULL; virCgroupPtr cgroup_emulator = NULL; virDomainDefPtr def = vm->def; qemuDomainObjPrivatePtr priv = vm->privateData; unsigned long long period = vm->def->cputune.emulator_period; long long quota = vm->def->cputune.emulator_quota; if ((period || quota) && !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("cgroup cpu is required for scheduler tuning")); return -1; } if (priv->cgroup == NULL) return 0; /* Not supported, so claim success */ if (virCgroupNewEmulator(priv->cgroup, true, &cgroup_emulator) < 0) goto cleanup; if (virCgroupMoveTask(priv->cgroup, cgroup_emulator) < 0) goto cleanup; if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) { if (!(cpumap = qemuPrepareCpumap(driver, nodemask))) goto cleanup; cpumask = cpumap; } else if (def->cputune.emulatorpin) { cpumask = def->cputune.emulatorpin->cpumask; } else if (def->cpumask) { cpumask = def->cpumask; } if (cpumask) { if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET) && qemuSetupCgroupEmulatorPin(cgroup_emulator, cpumask) < 0) goto cleanup; } if (period || quota) { if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) && qemuSetupCgroupVcpuBW(cgroup_emulator, period, quota) < 0) goto cleanup; } virCgroupFree(&cgroup_emulator); virBitmapFree(cpumap); return 0; cleanup: virBitmapFree(cpumap); if (cgroup_emulator) { virCgroupRemove(cgroup_emulator); virCgroupFree(&cgroup_emulator); } return -1; }
/** * virLXCProcessStart: * @conn: pointer to connection * @driver: pointer to driver structure * @vm: pointer to virtual machine structure * @autoDestroy: mark the domain for auto destruction * @reason: reason for switching vm to running state * * Starts a vm * * Returns 0 on success or -1 in case of error */ int virLXCProcessStart(virConnectPtr conn, virLXCDriverPtr driver, virDomainObjPtr vm, unsigned int nfiles, int *files, bool autoDestroy, virDomainRunningReason reason) { int rc = -1, r; size_t nttyFDs = 0; int *ttyFDs = NULL; size_t i; char *logfile = NULL; int logfd = -1; size_t nveths = 0; char **veths = NULL; int handshakefds[2] = { -1, -1 }; off_t pos = -1; char ebuf[1024]; char *timestamp; virCommandPtr cmd = NULL; virLXCDomainObjPrivatePtr priv = vm->privateData; virCapsPtr caps = NULL; virErrorPtr err = NULL; virLXCDriverConfigPtr cfg = virLXCDriverGetConfig(driver); virCgroupPtr selfcgroup; int status; if (virCgroupNewSelf(&selfcgroup) < 0) return -1; if (!virCgroupHasController(selfcgroup, VIR_CGROUP_CONTROLLER_CPUACCT)) { virCgroupFree(&selfcgroup); virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Unable to find 'cpuacct' cgroups controller mount")); return -1; } if (!virCgroupHasController(selfcgroup, VIR_CGROUP_CONTROLLER_DEVICES)) { virCgroupFree(&selfcgroup); virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Unable to find 'devices' cgroups controller mount")); return -1; } if (!virCgroupHasController(selfcgroup, VIR_CGROUP_CONTROLLER_MEMORY)) { virCgroupFree(&selfcgroup); virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Unable to find 'memory' cgroups controller mount")); return -1; } virCgroupFree(&selfcgroup); if (virFileMakePath(cfg->logDir) < 0) { virReportSystemError(errno, _("Cannot create log directory '%s'"), cfg->logDir); return -1; } if (!vm->def->resource) { virDomainResourceDefPtr res; if (VIR_ALLOC(res) < 0) goto cleanup; if (VIR_STRDUP(res->partition, "/machine") < 0) { VIR_FREE(res); goto cleanup; } vm->def->resource = res; } if (virAsprintf(&logfile, "%s/%s.log", cfg->logDir, vm->def->name) < 0) return -1; if (!(caps = virLXCDriverGetCapabilities(driver, false))) goto cleanup; /* Do this up front, so any part of the startup process can add * runtime state to vm->def that won't be persisted. This let's us * report implicit runtime defaults in the XML, like vnc listen/socket */ VIR_DEBUG("Setting current domain def as transient"); if (virDomainObjSetDefTransient(caps, driver->xmlopt, vm, true) < 0) goto cleanup; /* Run an early hook to set-up missing devices */ if (virHookPresent(VIR_HOOK_DRIVER_LXC)) { char *xml = virDomainDefFormat(vm->def, 0); int hookret; hookret = virHookCall(VIR_HOOK_DRIVER_LXC, vm->def->name, VIR_HOOK_LXC_OP_PREPARE, VIR_HOOK_SUBOP_BEGIN, NULL, xml, NULL); VIR_FREE(xml); /* * If the script raised an error abort the launch */ if (hookret < 0) goto cleanup; } if (virLXCProcessEnsureRootFS(vm) < 0) goto cleanup; /* Must be run before security labelling */ VIR_DEBUG("Preparing host devices"); if (virLXCPrepareHostDevices(driver, vm->def) < 0) goto cleanup; /* Here we open all the PTYs we need on the host OS side. * The LXC controller will open the guest OS side PTYs * and forward I/O between them. */ nttyFDs = vm->def->nconsoles; if (VIR_ALLOC_N(ttyFDs, nttyFDs) < 0) goto cleanup; for (i = 0; i < vm->def->nconsoles; i++) ttyFDs[i] = -1; /* If you are using a SecurityDriver with dynamic labelling, then generate a security label for isolation */ VIR_DEBUG("Generating domain security label (if required)"); if (vm->def->nseclabels && vm->def->seclabels[0]->type == VIR_DOMAIN_SECLABEL_DEFAULT) vm->def->seclabels[0]->type = VIR_DOMAIN_SECLABEL_NONE; if (virSecurityManagerGenLabel(driver->securityManager, vm->def) < 0) { virDomainAuditSecurityLabel(vm, false); goto cleanup; } virDomainAuditSecurityLabel(vm, true); VIR_DEBUG("Setting domain security labels"); if (virSecurityManagerSetAllLabel(driver->securityManager, vm->def, NULL) < 0) goto cleanup; for (i = 0; i < vm->def->nconsoles; i++) { char *ttyPath; if (vm->def->consoles[i]->source.type != VIR_DOMAIN_CHR_TYPE_PTY) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("Only PTY console types are supported")); goto cleanup; } if (virFileOpenTty(&ttyFDs[i], &ttyPath, 1) < 0) { virReportSystemError(errno, "%s", _("Failed to allocate tty")); goto cleanup; } VIR_FREE(vm->def->consoles[i]->source.data.file.path); vm->def->consoles[i]->source.data.file.path = ttyPath; VIR_FREE(vm->def->consoles[i]->info.alias); if (virAsprintf(&vm->def->consoles[i]->info.alias, "console%zu", i) < 0) goto cleanup; } if (virLXCProcessSetupInterfaces(conn, vm->def, &nveths, &veths) < 0) goto cleanup; /* Save the configuration for the controller */ if (virDomainSaveConfig(cfg->stateDir, vm->def) < 0) goto cleanup; if ((logfd = open(logfile, O_WRONLY | O_APPEND | O_CREAT, S_IRUSR|S_IWUSR)) < 0) { virReportSystemError(errno, _("Failed to open '%s'"), logfile); goto cleanup; } if (pipe(handshakefds) < 0) { virReportSystemError(errno, "%s", _("Unable to create pipe")); goto cleanup; } if (!(cmd = virLXCProcessBuildControllerCmd(driver, vm, nveths, veths, ttyFDs, nttyFDs, files, nfiles, handshakefds[1]))) goto cleanup; virCommandSetOutputFD(cmd, &logfd); virCommandSetErrorFD(cmd, &logfd); /* now that we know it is about to start call the hook if present */ if (virHookPresent(VIR_HOOK_DRIVER_LXC)) { char *xml = virDomainDefFormat(vm->def, 0); int hookret; hookret = virHookCall(VIR_HOOK_DRIVER_LXC, vm->def->name, VIR_HOOK_LXC_OP_START, VIR_HOOK_SUBOP_BEGIN, NULL, xml, NULL); VIR_FREE(xml); /* * If the script raised an error abort the launch */ if (hookret < 0) goto cleanup; } /* Log timestamp */ if ((timestamp = virTimeStringNow()) == NULL) goto cleanup; if (safewrite(logfd, timestamp, strlen(timestamp)) < 0 || safewrite(logfd, START_POSTFIX, strlen(START_POSTFIX)) < 0) { VIR_WARN("Unable to write timestamp to logfile: %s", virStrerror(errno, ebuf, sizeof(ebuf))); } VIR_FREE(timestamp); /* Log generated command line */ virCommandWriteArgLog(cmd, logfd); if ((pos = lseek(logfd, 0, SEEK_END)) < 0) VIR_WARN("Unable to seek to end of logfile: %s", virStrerror(errno, ebuf, sizeof(ebuf))); virCommandRawStatus(cmd); if (virCommandRun(cmd, &status) < 0) goto cleanup; if (status != 0) { if (virLXCProcessReadLogOutput(vm, logfile, pos, ebuf, sizeof(ebuf)) <= 0) { if (WIFEXITED(status)) snprintf(ebuf, sizeof(ebuf), _("unexpected exit status %d"), WEXITSTATUS(status)); else snprintf(ebuf, sizeof(ebuf), "%s", _("terminated abnormally")); } virReportError(VIR_ERR_INTERNAL_ERROR, _("guest failed to start: %s"), ebuf); goto cleanup; } if (VIR_CLOSE(handshakefds[1]) < 0) { virReportSystemError(errno, "%s", _("could not close handshake fd")); goto cleanup; } /* Connect to the controller as a client *first* because * this will block until the child has written their * pid file out to disk & created their cgroup */ if (!(priv->monitor = virLXCProcessConnectMonitor(driver, vm))) { /* Intentionally overwrite the real monitor error message, * since a better one is almost always found in the logs */ if (virLXCProcessReadLogOutput(vm, logfile, pos, ebuf, sizeof(ebuf)) > 0) { virResetLastError(); virReportError(VIR_ERR_INTERNAL_ERROR, _("guest failed to start: %s"), ebuf); } goto cleanup; } /* And get its pid */ if ((r = virPidFileRead(cfg->stateDir, vm->def->name, &vm->pid)) < 0) { if (virLXCProcessReadLogOutput(vm, logfile, pos, ebuf, sizeof(ebuf)) > 0) virReportError(VIR_ERR_INTERNAL_ERROR, _("guest failed to start: %s"), ebuf); else virReportSystemError(-r, _("Failed to read pid file %s/%s.pid"), cfg->stateDir, vm->def->name); goto cleanup; } if (virCgroupNewDetectMachine(vm->def->name, "lxc", vm->pid, vm->def->resource ? vm->def->resource->partition : NULL, -1, &priv->cgroup) < 0) goto error; if (!priv->cgroup) { virReportError(VIR_ERR_INTERNAL_ERROR, _("No valid cgroup for machine %s"), vm->def->name); goto error; } priv->stopReason = VIR_DOMAIN_EVENT_STOPPED_FAILED; priv->wantReboot = false; vm->def->id = vm->pid; virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, reason); priv->doneStopEvent = false; if (virAtomicIntInc(&driver->nactive) == 1 && driver->inhibitCallback) driver->inhibitCallback(true, driver->inhibitOpaque); if (lxcContainerWaitForContinue(handshakefds[0]) < 0) { char out[1024]; if (!(virLXCProcessReadLogOutput(vm, logfile, pos, out, 1024) < 0)) { virReportError(VIR_ERR_INTERNAL_ERROR, _("guest failed to start: %s"), out); } goto error; } if (autoDestroy && virCloseCallbacksSet(driver->closeCallbacks, vm, conn, lxcProcessAutoDestroy) < 0) goto error; if (virDomainObjSetDefTransient(caps, driver->xmlopt, vm, false) < 0) goto error; /* Write domain status to disk. * * XXX: Earlier we wrote the plain "live" domain XML to this * location for the benefit of libvirt_lxc. We're now overwriting * it with the live status XML instead. This is a (currently * harmless) inconsistency we should fix one day */ if (virDomainSaveStatus(driver->xmlopt, cfg->stateDir, vm) < 0) goto error; /* finally we can call the 'started' hook script if any */ if (virHookPresent(VIR_HOOK_DRIVER_LXC)) { char *xml = virDomainDefFormat(vm->def, 0); int hookret; hookret = virHookCall(VIR_HOOK_DRIVER_LXC, vm->def->name, VIR_HOOK_LXC_OP_STARTED, VIR_HOOK_SUBOP_BEGIN, NULL, xml, NULL); VIR_FREE(xml); /* * If the script raised an error abort the launch */ if (hookret < 0) goto error; } rc = 0; cleanup: if (rc != 0 && !err) err = virSaveLastError(); virCommandFree(cmd); if (VIR_CLOSE(logfd) < 0) { virReportSystemError(errno, "%s", _("could not close logfile")); rc = -1; } for (i = 0; i < nveths; i++) { if (rc != 0 && veths[i]) ignore_value(virNetDevVethDelete(veths[i])); VIR_FREE(veths[i]); } if (rc != 0) { if (vm->newDef) { virDomainDefFree(vm->newDef); vm->newDef = NULL; } if (priv->monitor) { virObjectUnref(priv->monitor); priv->monitor = NULL; } virDomainConfVMNWFilterTeardown(vm); virSecurityManagerRestoreAllLabel(driver->securityManager, vm->def, false); virSecurityManagerReleaseLabel(driver->securityManager, vm->def); /* Clear out dynamically assigned labels */ if (vm->def->nseclabels && vm->def->seclabels[0]->type == VIR_DOMAIN_SECLABEL_DYNAMIC) { VIR_FREE(vm->def->seclabels[0]->model); VIR_FREE(vm->def->seclabels[0]->label); VIR_FREE(vm->def->seclabels[0]->imagelabel); } } for (i = 0; i < nttyFDs; i++) VIR_FORCE_CLOSE(ttyFDs[i]); VIR_FREE(ttyFDs); VIR_FORCE_CLOSE(handshakefds[0]); VIR_FORCE_CLOSE(handshakefds[1]); VIR_FREE(logfile); virObjectUnref(cfg); virObjectUnref(caps); if (err) { virSetError(err); virFreeError(err); } return rc; error: err = virSaveLastError(); virLXCProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED); goto cleanup; }
int qemuSetupCgroupForVcpu(virDomainObjPtr vm) { virCgroupPtr cgroup_vcpu = NULL; qemuDomainObjPrivatePtr priv = vm->privateData; virDomainDefPtr def = vm->def; size_t i, j; unsigned long long period = vm->def->cputune.period; long long quota = vm->def->cputune.quota; char *mem_mask = NULL; virDomainNumatuneMemMode mem_mode; if ((period || quota) && !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("cgroup cpu is required for scheduler tuning")); return -1; } /* * If CPU cgroup controller is not initialized here, then we need * neither period nor quota settings. And if CPUSET controller is * not initialized either, then there's nothing to do anyway. */ if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) && !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) return 0; /* We are trying to setup cgroups for CPU pinning, which can also be done * with virProcessSetAffinity, thus the lack of cgroups is not fatal here. */ if (priv->cgroup == NULL) return 0; if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { /* If we don't know VCPU<->PID mapping or all vcpu runs in the same * thread, we cannot control each vcpu. */ return 0; } if (virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 && mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT && virDomainNumatuneMaybeFormatNodeset(vm->def->numa, priv->autoNodeset, &mem_mask, -1) < 0) goto cleanup; for (i = 0; i < priv->nvcpupids; i++) { virCgroupFree(&cgroup_vcpu); if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_VCPU, i, true, &cgroup_vcpu) < 0) goto cleanup; /* move the thread for vcpu to sub dir */ if (virCgroupAddTask(cgroup_vcpu, priv->vcpupids[i]) < 0) goto cleanup; if (period || quota) { if (qemuSetupCgroupVcpuBW(cgroup_vcpu, period, quota) < 0) goto cleanup; } /* Set vcpupin in cgroup if vcpupin xml is provided */ if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) { virBitmapPtr cpumap = NULL; if (mem_mask && virCgroupSetCpusetMems(cgroup_vcpu, mem_mask) < 0) goto cleanup; /* try to use the default cpu maps */ if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) cpumap = priv->autoCpuset; else cpumap = vm->def->cpumask; /* lookup a more specific pinning info */ for (j = 0; j < def->cputune.nvcpupin; j++) { if (def->cputune.vcpupin[j]->id == i) { cpumap = def->cputune.vcpupin[j]->cpumask; break; } } if (!cpumap) continue; if (qemuSetupCgroupCpusetCpus(cgroup_vcpu, cpumap) < 0) goto cleanup; } } virCgroupFree(&cgroup_vcpu); VIR_FREE(mem_mask); return 0; cleanup: if (cgroup_vcpu) { virCgroupRemove(cgroup_vcpu); virCgroupFree(&cgroup_vcpu); } VIR_FREE(mem_mask); return -1; }
int qemuSetupHostdevCGroup(virDomainObjPtr vm, virDomainHostdevDefPtr dev) { int ret = -1; qemuDomainObjPrivatePtr priv = vm->privateData; virPCIDevicePtr pci = NULL; virUSBDevicePtr usb = NULL; virSCSIDevicePtr scsi = NULL; char *path = NULL; /* currently this only does something for PCI devices using vfio * for device assignment, but it is called for *all* hostdev * devices. */ if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) return 0; if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) { switch (dev->source.subsys.type) { case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI: if (dev->source.subsys.u.pci.backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) { int rv; pci = virPCIDeviceNew(dev->source.subsys.u.pci.addr.domain, dev->source.subsys.u.pci.addr.bus, dev->source.subsys.u.pci.addr.slot, dev->source.subsys.u.pci.addr.function); if (!pci) goto cleanup; if (!(path = virPCIDeviceGetIOMMUGroupDev(pci))) goto cleanup; VIR_DEBUG("Cgroup allow %s for PCI device assignment", path); rv = virCgroupAllowDevicePath(priv->cgroup, path, VIR_CGROUP_DEVICE_RW); virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path, "rw", rv == 0); if (rv < 0) goto cleanup; } break; case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB: /* NB: hostdev->missing wasn't previously checked in the * case of hotplug, only when starting a domain. Now it is * always checked, and the cgroup setup skipped if true. */ if (dev->missing) break; if ((usb = virUSBDeviceNew(dev->source.subsys.u.usb.bus, dev->source.subsys.u.usb.device, NULL)) == NULL) { goto cleanup; } /* oddly, qemuSetupHostUSBDeviceCgroup doesn't ever * reference the usb object we just created */ if (virUSBDeviceFileIterate(usb, qemuSetupHostUSBDeviceCgroup, vm) < 0) { goto cleanup; } break; case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_SCSI: if ((scsi = virSCSIDeviceNew(NULL, dev->source.subsys.u.scsi.adapter, dev->source.subsys.u.scsi.bus, dev->source.subsys.u.scsi.target, dev->source.subsys.u.scsi.unit, dev->readonly, dev->shareable)) == NULL) goto cleanup; if (virSCSIDeviceFileIterate(scsi, qemuSetupHostSCSIDeviceCgroup, vm) < 0) goto cleanup; default: break; } } ret = 0; cleanup: virPCIDeviceFree(pci); virUSBDeviceFree(usb); virSCSIDeviceFree(scsi); VIR_FREE(path); return ret; }
static int qemuSetupCpusetCgroup(virDomainObjPtr vm, virBitmapPtr nodemask, virCapsPtr caps) { qemuDomainObjPrivatePtr priv = vm->privateData; char *mem_mask = NULL; char *cpu_mask = NULL; int ret = -1; if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) return 0; if ((vm->def->numatune.memory.nodemask || (vm->def->numatune.memory.placement_mode == VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO)) && vm->def->numatune.memory.mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) { if (vm->def->numatune.memory.placement_mode == VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO) mem_mask = virBitmapFormat(nodemask); else mem_mask = virBitmapFormat(vm->def->numatune.memory.nodemask); if (!mem_mask) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("failed to convert memory nodemask")); goto cleanup; } if (virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0) goto cleanup; } if (vm->def->cpumask || (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO)) { if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) { virBitmapPtr cpumap; if (!(cpumap = virCapabilitiesGetCpusForNodemask(caps, nodemask))) goto cleanup; cpu_mask = virBitmapFormat(cpumap); virBitmapFree(cpumap); } else { cpu_mask = virBitmapFormat(vm->def->cpumask); } if (!cpu_mask) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("failed to convert cpu mask")); goto cleanup; } if (virCgroupSetCpusetCpus(priv->cgroup, cpu_mask) < 0) goto cleanup; } ret = 0; cleanup: VIR_FREE(mem_mask); VIR_FREE(cpu_mask); return ret; }
int qemuSetupCgroupForVcpu(virDomainObjPtr vm) { virCgroupPtr cgroup_vcpu = NULL; qemuDomainObjPrivatePtr priv = vm->privateData; virDomainDefPtr def = vm->def; size_t i, j; unsigned long long period = vm->def->cputune.period; long long quota = vm->def->cputune.quota; if ((period || quota) && !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("cgroup cpu is required for scheduler tuning")); return -1; } /* We are trying to setup cgroups for CPU pinning, which can also be done * with virProcessInfoSetAffinity, thus the lack of cgroups is not fatal * here. */ if (priv->cgroup == NULL) return 0; if (priv->nvcpupids == 0 || priv->vcpupids[0] == vm->pid) { /* If we don't know VCPU<->PID mapping or all vcpu runs in the same * thread, we cannot control each vcpu. */ VIR_WARN("Unable to get vcpus' pids."); return 0; } for (i = 0; i < priv->nvcpupids; i++) { if (virCgroupNewVcpu(priv->cgroup, i, true, &cgroup_vcpu) < 0) goto cleanup; /* move the thread for vcpu to sub dir */ if (virCgroupAddTask(cgroup_vcpu, priv->vcpupids[i]) < 0) goto cleanup; if (period || quota) { if (qemuSetupCgroupVcpuBW(cgroup_vcpu, period, quota) < 0) goto cleanup; } /* Set vcpupin in cgroup if vcpupin xml is provided */ if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) { /* find the right CPU to pin, otherwise * qemuSetupCgroupVcpuPin will fail. */ for (j = 0; j < def->cputune.nvcpupin; j++) { if (def->cputune.vcpupin[j]->vcpuid != i) continue; if (qemuSetupCgroupVcpuPin(cgroup_vcpu, def->cputune.vcpupin, def->cputune.nvcpupin, i) < 0) goto cleanup; break; } } virCgroupFree(&cgroup_vcpu); } return 0; cleanup: if (cgroup_vcpu) { virCgroupRemove(cgroup_vcpu); virCgroupFree(&cgroup_vcpu); } return -1; }
static int qemuSetupDevicesCgroup(virQEMUDriverPtr driver, virDomainObjPtr vm) { qemuDomainObjPrivatePtr priv = vm->privateData; virQEMUDriverConfigPtr cfg = NULL; const char *const *deviceACL = NULL; int rv = -1; int ret = -1; size_t i; if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) return 0; rv = virCgroupDenyAllDevices(priv->cgroup); virDomainAuditCgroup(vm, priv->cgroup, "deny", "all", rv == 0); if (rv < 0) { if (virLastErrorIsSystemErrno(EPERM)) { virResetLastError(); VIR_WARN("Group devices ACL is not accessible, disabling whitelisting"); return 0; } goto cleanup; } for (i = 0; i < vm->def->ndisks; i++) { if (qemuSetupDiskCgroup(vm, vm->def->disks[i]) < 0) goto cleanup; } rv = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_PTY_MAJOR, VIR_CGROUP_DEVICE_RW); virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_PTY_MAJOR, "pty", "rw", rv == 0); if (rv < 0) goto cleanup; cfg = virQEMUDriverGetConfig(driver); deviceACL = cfg->cgroupDeviceACL ? (const char *const *)cfg->cgroupDeviceACL : defaultDeviceACL; if (vm->def->nsounds && ((!vm->def->ngraphics && cfg->nogfxAllowHostAudio) || (vm->def->graphics && ((vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_VNC && cfg->vncAllowHostAudio) || (vm->def->graphics[0]->type == VIR_DOMAIN_GRAPHICS_TYPE_SDL))))) { rv = virCgroupAllowDeviceMajor(priv->cgroup, 'c', DEVICE_SND_MAJOR, VIR_CGROUP_DEVICE_RW); virDomainAuditCgroupMajor(vm, priv->cgroup, "allow", DEVICE_SND_MAJOR, "sound", "rw", rv == 0); if (rv < 0) goto cleanup; } for (i = 0; deviceACL[i] != NULL; i++) { if (!virFileExists(deviceACL[i])) { VIR_DEBUG("Ignoring non-existent device %s", deviceACL[i]); continue; } rv = virCgroupAllowDevicePath(priv->cgroup, deviceACL[i], VIR_CGROUP_DEVICE_RW); virDomainAuditCgroupPath(vm, priv->cgroup, "allow", deviceACL[i], "rw", rv == 0); if (rv < 0 && !virLastErrorIsSystemErrno(ENOENT)) goto cleanup; } if (virDomainChrDefForeach(vm->def, true, qemuSetupChardevCgroup, vm) < 0) goto cleanup; if (vm->def->tpm && (qemuSetupTPMCgroup(vm->def, vm->def->tpm, vm) < 0)) goto cleanup; for (i = 0; i < vm->def->nhostdevs; i++) { if (qemuSetupHostdevCGroup(vm, vm->def->hostdevs[i]) < 0) goto cleanup; } for (i = 0; i < vm->def->nrngs; i++) { if (vm->def->rngs[i]->backend == VIR_DOMAIN_RNG_BACKEND_RANDOM) { VIR_DEBUG("Setting Cgroup ACL for RNG device"); rv = virCgroupAllowDevicePath(priv->cgroup, vm->def->rngs[i]->source.file, VIR_CGROUP_DEVICE_RW); virDomainAuditCgroupPath(vm, priv->cgroup, "allow", vm->def->rngs[i]->source.file, "rw", rv == 0); if (rv < 0 && !virLastErrorIsSystemErrno(ENOENT)) goto cleanup; } } ret = 0; cleanup: virObjectUnref(cfg); return ret; }
int qemuSetupHostdevCGroup(virDomainObjPtr vm, virDomainHostdevDefPtr dev) { int ret = -1; qemuDomainObjPrivatePtr priv = vm->privateData; virDomainHostdevSubsysUSBPtr usbsrc = &dev->source.subsys.u.usb; virDomainHostdevSubsysPCIPtr pcisrc = &dev->source.subsys.u.pci; virDomainHostdevSubsysSCSIPtr scsisrc = &dev->source.subsys.u.scsi; virPCIDevicePtr pci = NULL; virUSBDevicePtr usb = NULL; virSCSIDevicePtr scsi = NULL; char *path = NULL; /* currently this only does something for PCI devices using vfio * for device assignment, but it is called for *all* hostdev * devices. */ if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_DEVICES)) return 0; if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS) { switch (dev->source.subsys.type) { case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI: if (pcisrc->backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) { int rv; pci = virPCIDeviceNew(pcisrc->addr.domain, pcisrc->addr.bus, pcisrc->addr.slot, pcisrc->addr.function); if (!pci) goto cleanup; if (!(path = virPCIDeviceGetIOMMUGroupDev(pci))) goto cleanup; VIR_DEBUG("Cgroup allow %s for PCI device assignment", path); rv = virCgroupAllowDevicePath(priv->cgroup, path, VIR_CGROUP_DEVICE_RW); virDomainAuditCgroupPath(vm, priv->cgroup, "allow", path, "rw", rv == 0); if (rv < 0) goto cleanup; } break; case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB: /* NB: hostdev->missing wasn't previously checked in the * case of hotplug, only when starting a domain. Now it is * always checked, and the cgroup setup skipped if true. */ if (dev->missing) break; if ((usb = virUSBDeviceNew(usbsrc->bus, usbsrc->device, NULL)) == NULL) { goto cleanup; } /* oddly, qemuSetupHostUSBDeviceCgroup doesn't ever * reference the usb object we just created */ if (virUSBDeviceFileIterate(usb, qemuSetupHostUSBDeviceCgroup, vm) < 0) { goto cleanup; } break; case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_SCSI: { if (scsisrc->protocol == VIR_DOMAIN_HOSTDEV_SCSI_PROTOCOL_TYPE_ISCSI) { virDomainHostdevSubsysSCSIiSCSIPtr iscsisrc = &scsisrc->u.iscsi; /* Follow qemuSetupDiskCgroup() and qemuSetImageCgroupInternal() * which does nothing for non local storage */ VIR_DEBUG("Not updating cgroups for hostdev iSCSI path '%s'", iscsisrc->path); } else { virDomainHostdevSubsysSCSIHostPtr scsihostsrc = &scsisrc->u.host; if ((scsi = virSCSIDeviceNew(NULL, scsihostsrc->adapter, scsihostsrc->bus, scsihostsrc->target, scsihostsrc->unit, dev->readonly, dev->shareable)) == NULL) goto cleanup; if (virSCSIDeviceFileIterate(scsi, qemuSetupHostSCSIDeviceCgroup, vm) < 0) goto cleanup; } break; } default: break; } } ret = 0; cleanup: virPCIDeviceFree(pci); virUSBDeviceFree(usb); virSCSIDeviceFree(scsi); VIR_FREE(path); return ret; }
int qemuSetupCgroupForIOThreads(virDomainObjPtr vm) { virCgroupPtr cgroup_iothread = NULL; qemuDomainObjPrivatePtr priv = vm->privateData; virDomainDefPtr def = vm->def; size_t i; unsigned long long period = vm->def->cputune.period; long long quota = vm->def->cputune.quota; char *mem_mask = NULL; virDomainNumatuneMemMode mem_mode; if ((period || quota) && !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("cgroup cpu is required for scheduler tuning")); return -1; } /* * If CPU cgroup controller is not initialized here, then we need * neither period nor quota settings. And if CPUSET controller is * not initialized either, then there's nothing to do anyway. */ if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) && !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) return 0; /* We are trying to setup cgroups for CPU pinning, which can also be done * with virProcessSetAffinity, thus the lack of cgroups is not fatal here. */ if (priv->cgroup == NULL) return 0; if (virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 && mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT && virDomainNumatuneMaybeFormatNodeset(vm->def->numa, priv->autoNodeset, &mem_mask, -1) < 0) goto cleanup; for (i = 0; i < def->niothreadids; i++) { /* IOThreads are numbered 1..n, although the array is 0..n-1, * so we will account for that here */ if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_IOTHREAD, def->iothreadids[i]->iothread_id, true, &cgroup_iothread) < 0) goto cleanup; /* move the thread for iothread to sub dir */ if (virCgroupAddTask(cgroup_iothread, def->iothreadids[i]->thread_id) < 0) goto cleanup; if (period || quota) { if (qemuSetupCgroupVcpuBW(cgroup_iothread, period, quota) < 0) goto cleanup; } /* Set iothreadpin in cgroup if iothreadpin xml is provided */ if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) { virBitmapPtr cpumask = NULL; if (mem_mask && virCgroupSetCpusetMems(cgroup_iothread, mem_mask) < 0) goto cleanup; if (def->iothreadids[i]->cpumask) cpumask = def->iothreadids[i]->cpumask; else if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) cpumask = priv->autoCpuset; else cpumask = def->cpumask; if (cpumask && qemuSetupCgroupCpusetCpus(cgroup_iothread, cpumask) < 0) goto cleanup; } virCgroupFree(&cgroup_iothread); } VIR_FREE(mem_mask); return 0; cleanup: if (cgroup_iothread) { virCgroupRemove(cgroup_iothread); virCgroupFree(&cgroup_iothread); } VIR_FREE(mem_mask); return -1; }
int qemuSetupCgroupForEmulator(virDomainObjPtr vm) { virBitmapPtr cpumask = NULL; virCgroupPtr cgroup_emulator = NULL; virDomainDefPtr def = vm->def; qemuDomainObjPrivatePtr priv = vm->privateData; unsigned long long period = vm->def->cputune.emulator_period; long long quota = vm->def->cputune.emulator_quota; if ((period || quota) && !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU)) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("cgroup cpu is required for scheduler tuning")); return -1; } /* * If CPU cgroup controller is not initialized here, then we need * neither period nor quota settings. And if CPUSET controller is * not initialized either, then there's nothing to do anyway. */ if (!virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) && !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) return 0; if (priv->cgroup == NULL) return 0; /* Not supported, so claim success */ if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0, true, &cgroup_emulator) < 0) goto cleanup; if (virCgroupMoveTask(priv->cgroup, cgroup_emulator) < 0) goto cleanup; if (def->cputune.emulatorpin) cpumask = def->cputune.emulatorpin; else if (def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) cpumask = priv->autoCpuset; else if (def->cpumask) cpumask = def->cpumask; if (cpumask) { if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET) && qemuSetupCgroupCpusetCpus(cgroup_emulator, cpumask) < 0) goto cleanup; } if (period || quota) { if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPU) && qemuSetupCgroupVcpuBW(cgroup_emulator, period, quota) < 0) goto cleanup; } virCgroupFree(&cgroup_emulator); return 0; cleanup: if (cgroup_emulator) { virCgroupRemove(cgroup_emulator); virCgroupFree(&cgroup_emulator); } return -1; }
static void qemuRestoreCgroupState(virDomainObjPtr vm) { char *mem_mask = NULL; char *nodeset = NULL; int empty = -1; qemuDomainObjPrivatePtr priv = vm->privateData; size_t i = 0; virBitmapPtr all_nodes; virCgroupPtr cgroup_temp = NULL; if (!virNumaIsAvailable() || !virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) return; if (!(all_nodes = virNumaGetHostMemoryNodeset())) goto error; if (!(mem_mask = virBitmapFormat(all_nodes))) goto error; if ((empty = virCgroupHasEmptyTasks(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) <= 0) goto error; if (virCgroupSetCpusetMems(priv->cgroup, mem_mask) < 0) goto error; for (i = 0; i < virDomainDefGetVcpusMax(vm->def); i++) { virDomainVcpuDefPtr vcpu = virDomainDefGetVcpu(vm->def, i); if (!vcpu->online) continue; if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_VCPU, i, false, &cgroup_temp) < 0 || virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 || virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 || virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0) goto cleanup; VIR_FREE(nodeset); virCgroupFree(&cgroup_temp); } for (i = 0; i < vm->def->niothreadids; i++) { if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_IOTHREAD, vm->def->iothreadids[i]->iothread_id, false, &cgroup_temp) < 0 || virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 || virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 || virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0) goto cleanup; VIR_FREE(nodeset); virCgroupFree(&cgroup_temp); } if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0, false, &cgroup_temp) < 0 || virCgroupSetCpusetMemoryMigrate(cgroup_temp, true) < 0 || virCgroupGetCpusetMems(cgroup_temp, &nodeset) < 0 || virCgroupSetCpusetMems(cgroup_temp, nodeset) < 0) goto cleanup; cleanup: VIR_FREE(mem_mask); VIR_FREE(nodeset); virBitmapFree(all_nodes); virCgroupFree(&cgroup_temp); return; error: virResetLastError(); VIR_DEBUG("Couldn't restore cgroups to meaningful state"); goto cleanup; }