/*
 * Event callback for the monitor client socket.
 *
 * @watch:  unused
 * @fd:     descriptor to read from
 * @events: VIR_EVENT_HANDLE_* bits that fired
 * @opaque: the struct lxcMonitor owning the client watch
 *
 * On hangup/error the client watch is removed. Otherwise up to
 * sizeof(buffer) bytes are read and discarded; a zero-length read
 * closes the client fd and removes its watch, and a read failure
 * other than EINTR/EAGAIN sets the global 'quit' flag.
 */
static void lxcClientIO(int watch ATTRIBUTE_UNUSED, int fd, int events, void *opaque)
{
    struct lxcMonitor *mon = opaque;
    char scratch[1024];
    ssize_t nread;

    if (events & (VIR_EVENT_HANDLE_HANGUP | VIR_EVENT_HANDLE_ERROR)) {
        virEventRemoveHandle(mon->clientWatch);
        mon->clientWatch = -1;
        return;
    }

    /* Retry for as long as the read is merely interrupted */
    do {
        nread = read(fd, scratch, sizeof(scratch));
    } while (nread == -1 && errno == EINTR);

    if (nread == -1) {
        /* Nothing available right now; wait for the next event */
        if (errno == EAGAIN)
            return;

        lxcError(VIR_ERR_INTERNAL_ERROR, "%s",
                 _("Unable to read from monitor client"));
        virMutexLock(&lock);
        quit = true;
        virMutexUnlock(&lock);
        return;
    }

    if (nread == 0) {
        /* End of file: drop the client fd and its watch */
        VIR_DEBUG("Client %d gone", fd);
        VIR_FORCE_CLOSE(mon->clientFd);
        virEventRemoveHandle(mon->clientWatch);
        mon->clientWatch = -1;
    }
}
/*
 * This is running as the 'init' process inside the container.
 * It removes some capabilities that could be dangerous to the
 * host system, since they are not currently "containerized".
 *
 * Returns 0 on success (or when libcap-ng support is not compiled
 * in, after logging a warning), -1 if updating or applying the
 * capability set fails.
 */
static int lxcContainerDropCapabilities(void)
{
#if HAVE_CAPNG
    int rv;

    capng_get_caps_process();

    rv = capng_updatev(CAPNG_DROP,
                       CAPNG_EFFECTIVE | CAPNG_PERMITTED | CAPNG_INHERITABLE | CAPNG_BOUNDING_SET,
                       CAP_SYS_BOOT,      /* No use of reboot */
                       CAP_SYS_MODULE,    /* No kernel module loading */
                       CAP_SYS_TIME,      /* No changing the clock */
                       CAP_AUDIT_CONTROL, /* No messing with auditing status */
                       CAP_MAC_ADMIN,     /* No messing with LSM config */
                       -1                 /* sentinel */);
    if (rv < 0) {
        lxcError(VIR_ERR_INTERNAL_ERROR,
                 _("Failed to remove capabilities: %d"), rv);
        return -1;
    }

    rv = capng_apply(CAPNG_SELECT_BOTH);
    if (rv < 0) {
        lxcError(VIR_ERR_INTERNAL_ERROR,
                 _("Failed to apply capabilities: %d"), rv);
        return -1;
    }

    /* We do not need to call capng_lock() in this case. The bounding
     * set restriction will prevent them reacquiring sys_boot/module/time,
     * etc which is all that matters for the container. Once inside the
     * container it is fine for SECURE_NOROOT / SECURE_NO_SETUID_FIXUP to
     * be unmasked - they can never escape the bounding set. */
#else
    VIR_WARN0("libcap-ng support not compiled in, unable to clear capabilities");
#endif
    return 0;
}
/*
 * Clear the capability sets selected by CAPNG_SELECT_BOTH and apply
 * the result to the current process.
 *
 * Returns 0 on success (or when libcap-ng support is not compiled
 * in, after logging a warning), -1 if applying the cleared set fails.
 */
static int lxcControllerClearCapabilities(void)
{
#if HAVE_CAPNG
    int rv;

    capng_clear(CAPNG_SELECT_BOTH);

    rv = capng_apply(CAPNG_SELECT_BOTH);
    if (rv < 0) {
        lxcError(VIR_ERR_INTERNAL_ERROR,
                 _("failed to apply capabilities: %d"), rv);
        return -1;
    }
#else
    VIR_WARN0("libcap-ng support not compiled in, unable to clear capabilities");
#endif
    return 0;
}
/*
 * Create a listening UNIX stream socket bound to @sockpath.
 *
 * Any existing filesystem entry at @sockpath is unlinked before
 * binding.
 *
 * Returns the listening fd on success, or -1 after reporting an
 * error if creating, binding or listening on the socket fails or
 * if @sockpath does not fit into sockaddr_un.sun_path.
 */
static int lxcMonitorServer(const char *sockpath)
{
    int sock;
    struct sockaddr_un sa;

    sock = socket(PF_UNIX, SOCK_STREAM, 0);
    if (sock < 0) {
        virReportSystemError(errno,
                             _("failed to create server socket '%s'"),
                             sockpath);
        goto error;
    }

    /* Remove a stale socket left at the same path */
    unlink(sockpath);

    memset(&sa, 0, sizeof(sa));
    sa.sun_family = AF_UNIX;
    if (virStrcpyStatic(sa.sun_path, sockpath) == NULL) {
        lxcError(VIR_ERR_INTERNAL_ERROR,
                 _("Socket path %s too long for destination"), sockpath);
        goto error;
    }

    if (bind(sock, (struct sockaddr *) &sa, sizeof(sa)) < 0) {
        virReportSystemError(errno,
                             _("failed to bind server socket '%s'"),
                             sockpath);
        goto error;
    }

    /* backlog of 30 pending connections */
    if (listen(sock, 30) < 0) {
        virReportSystemError(errno,
                             _("failed to listen server socket %s"),
                             sockpath);
        goto error;
    }

    return sock;

error:
    VIR_FORCE_CLOSE(sock);
    return -1;
}
/*
 * Event callback for the monitor server socket.
 *
 * @watch:  unused
 * @fd:     listening socket to accept from
 * @events: unused
 * @opaque: the struct lxcMonitor tracking the current client
 *
 * Accepts a new connection, closes the previously recorded client fd,
 * removes its watch, and registers lxcClientIO on the new client.
 * Any failure other than an ignorable accept errno sets the global
 * 'quit' flag.
 */
static void lxcServerAccept(int watch ATTRIBUTE_UNUSED, int fd, int events ATTRIBUTE_UNUSED, void *opaque)
{
    struct lxcMonitor *mon = opaque;
    int newClient;

    newClient = accept(fd, NULL, NULL);
    if (newClient < 0) {
        /* First reflex may be simply to declare accept failure to be a
           fatal error.  However, accept may fail when a client quits
           between the preceding poll and here.  That case is not fatal,
           but rather to be expected, if not common, so ignore it.  */
        if (ignorable_accept_errno(errno))
            return;

        virReportSystemError(errno, "%s",
                             _("Unable to accept monitor client"));
        goto fatal;
    }

    VIR_DEBUG("New client %d (old %d)\n", newClient, mon->clientFd);

    /* The new connection replaces whatever client was tracked before */
    VIR_FORCE_CLOSE(mon->clientFd);
    virEventRemoveHandle(mon->clientWatch);
    mon->clientFd = newClient;

    mon->clientWatch = virEventAddHandle(mon->clientFd,
                                         VIR_EVENT_HANDLE_READABLE,
                                         lxcClientIO,
                                         mon,
                                         NULL);
    if (mon->clientWatch < 0) {
        lxcError(VIR_ERR_INTERNAL_ERROR, "%s",
                 _("Unable to watch client socket"));
        goto fatal;
    }

    return;

fatal:
    /* Request termination of the main loop */
    virMutexLock(&lock);
    quit = true;
    virMutexUnlock(&lock);
}
/**
 * lxcControllerMain
 * @monitor: server socket fd to accept client requests
 * @client: initial client which is the libvirtd daemon
 * @appPty: open fd for application facing Pty
 * @contPty: open fd for container facing Pty
 * @container: pid passed to lxcPidGone() to decide when to stop looping
 *
 * Forwards traffic between fds. Data read from appPty will be written to
 * contPty, and data read from contPty will be written to appPty.
 * This process loops forever.
 * This uses epoll in edge triggered mode to avoid a hard loop on POLLHUP
 * events when the user disconnects the virsh console via ctrl-]
 *
 * appPty, contPty and the epoll fd are closed before returning.
 *
 * Returns 0 on success or -1 in case of error
 *
 * NOTE(review): the loop contains no 'break', so the 'rc = 0' after it is
 * never reached; the value returned is whatever 'rc' last held, i.e. the
 * initial -1 or the most recent lxcFdForward() result. Confirm that callers
 * only rely on this matching the contract above.
 */
static int lxcControllerMain(int monitor, int client, int appPty, int contPty, pid_t container)
{
    int rc = -1;
    int epollFd;
    struct epoll_event epollEvent;
    int numEvents;
    int numActive = 0;          /* how many entries of fdArray are marked active */
    lxcTtyForwardFd_t fdArray[2];
    int timeout = -1;
    int curFdOff = 0;           /* index into fdArray of the fd to read from */
    int writeFdOff = 0;         /* index into fdArray of the fd to write to */

    /* slot 0 is the application side, slot 1 the container side */
    fdArray[0].fd = appPty;
    fdArray[0].active = 0;
    fdArray[1].fd = contPty;
    fdArray[1].active = 0;

    VIR_DEBUG("monitor=%d client=%d appPty=%d contPty=%d", monitor, client, appPty, contPty);

    /* create the epoll file descriptor */
    epollFd = epoll_create(2);
    if (0 > epollFd) {
        virReportSystemError(errno, "%s", _("epoll_create(2) failed"));
        goto cleanup;
    }

    /* add the file descriptors to the epoll fd */
    memset(&epollEvent, 0x00, sizeof(epollEvent));
    epollEvent.events = EPOLLIN|EPOLLET; /* edge triggered */
    epollEvent.data.fd = appPty;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, appPty, &epollEvent)) {
        virReportSystemError(errno, "%s", _("epoll_ctl(appPty) failed"));
        goto cleanup;
    }
    epollEvent.data.fd = contPty;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, contPty, &epollEvent)) {
        virReportSystemError(errno, "%s", _("epoll_ctl(contPty) failed"));
        goto cleanup;
    }

    /* the monitor socket is registered without EPOLLET */
    epollEvent.events = EPOLLIN;
    epollEvent.data.fd = monitor;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, monitor, &epollEvent)) {
        virReportSystemError(errno, "%s", _("epoll_ctl(monitor) failed"));
        goto cleanup;
    }

    /* the client is only watched for hangup */
    epollEvent.events = EPOLLHUP;
    epollEvent.data.fd = client;
    if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, client, &epollEvent)) {
        virReportSystemError(errno, "%s", _("epoll_ctl(client) failed"));
        goto cleanup;
    }

    while (1) {
        /* if active fd's, return if no events, else wait forever */
        timeout = (numActive > 0) ? 0 : -1;
        /* at most one event is fetched per iteration */
        numEvents = epoll_wait(epollFd, &epollEvent, 1, timeout);
        if (numEvents > 0) {
            if (epollEvent.data.fd == monitor) {
                int fd = accept(monitor, NULL, 0);
                if (fd < 0) {
                    /* First reflex may be simply to declare accept failure
                       to be a fatal error.  However, accept may fail when
                       a client quits between the above epoll_wait and here.
                       That case is not fatal, but rather to be expected,
                       if not common, so ignore it.  */
                    if (ignorable_epoll_accept_errno(errno))
                        continue;
                    virReportSystemError(errno, "%s", _("accept(monitor,...) failed"));
                    goto cleanup;
                }
                if (client != -1) { /* Already connected, so kick new one out */
                    VIR_FORCE_CLOSE(fd);
                    continue;
                }
                /* no client currently tracked: adopt the new connection */
                client = fd;
                epollEvent.events = EPOLLHUP;
                epollEvent.data.fd = client;
                if (0 > epoll_ctl(epollFd, EPOLL_CTL_ADD, client, &epollEvent)) {
                    virReportSystemError(errno, "%s", _("epoll_ctl(client) failed"));
                    goto cleanup;
                }
            } else if (client != -1 && epollEvent.data.fd == client) {
                /* event on the client fd (registered for EPOLLHUP only):
                   unregister and close it.
                   NOTE(review): the 'client != -1' tests rely on
                   VIR_FORCE_CLOSE resetting its argument to -1 - confirm */
                if (0 > epoll_ctl(epollFd, EPOLL_CTL_DEL, client, &epollEvent)) {
                    virReportSystemError(errno, "%s", _("epoll_ctl(client) failed"));
                    goto cleanup;
                }
                VIR_FORCE_CLOSE(client);
            } else {
                /* event on one of the two ptys */
                if (epollEvent.events & EPOLLIN) {
                    /* readable: mark the source slot active */
                    curFdOff = epollEvent.data.fd == appPty ? 0 : 1;
                    if (!fdArray[curFdOff].active) {
                        fdArray[curFdOff].active = 1;
                        ++numActive;
                    }
                } else if (epollEvent.events & EPOLLHUP) {
                    /* hangup: stop if lxcPidGone() reports the container
                       gone, otherwise just mark this slot inactive */
                    if (lxcPidGone(container))
                        goto cleanup;
                    curFdOff = epollEvent.data.fd == appPty ? 0 : 1;
                    if (fdArray[curFdOff].active) {
                        fdArray[curFdOff].active = 0;
                        --numActive;
                    }
                    continue;
                } else {
                    lxcError(VIR_ERR_INTERNAL_ERROR, _("error event %d"), epollEvent.events);
                    goto cleanup;
                }
            }
        } else if (0 == numEvents) {
            /* epoll_wait returned without events (zero timeout case) */
            if (2 == numActive) {
                /* both fds active, toggle between the two */
                curFdOff ^= 1;
            } else {
                /* only one active, if current is active, use it, else it */
                /* must be the other one (ie. curFd just went inactive) */
                curFdOff = fdArray[curFdOff].active ? curFdOff : curFdOff ^ 1;
            }
        } else {
            if (EINTR == errno) {
                continue;
            }
            /* error */
            virReportSystemError(errno, "%s", _("epoll_wait() failed"));
            goto cleanup;
        }

        if (0 < numActive) {
            /* forward from the current slot to the opposite one */
            writeFdOff = curFdOff ^ 1;
            rc = lxcFdForward(fdArray[curFdOff].fd, fdArray[writeFdOff].fd);
            if (EAGAIN == rc) {
                /* this fd no longer has data, set it as inactive */
                --numActive;
                fdArray[curFdOff].active = 0;
            } else if (-1 == rc) {
                /* forwarding failed: stop only if the container is gone */
                if (lxcPidGone(container))
                    goto cleanup;
                continue;
            }
        }
    }

    rc = 0;

cleanup:
    VIR_FORCE_CLOSE(appPty);
    VIR_FORCE_CLOSE(contPty);
    VIR_FORCE_CLOSE(epollFd);
    return rc;
}
/** * lxcContainerChild: * @data: pointer to container arguments * * This function is run in the process clone()'d in lxcStartContainer. * Perform a number of container setup tasks: * Setup container file system * mount container /proca * Then exec's the container init * * Returns 0 on success or -1 in case of error */ static int lxcContainerChild( void *data ) { lxc_child_argv_t *argv = data; virDomainDefPtr vmDef = argv->config; int ttyfd; char *ttyPath; virDomainFSDefPtr root; if (NULL == vmDef) { lxcError(VIR_ERR_INTERNAL_ERROR, "%s", _("lxcChild() passed invalid vm definition")); return -1; } root = virDomainGetRootFilesystem(vmDef); if (root) { if (virAsprintf(&ttyPath, "%s%s", root->src, argv->ttyPath) < 0) { virReportOOMError(); return -1; } } else { if (!(ttyPath = strdup(argv->ttyPath))) { virReportOOMError(); return -1; } } ttyfd = open(ttyPath, O_RDWR|O_NOCTTY); if (ttyfd < 0) { virReportSystemError(errno, _("Failed to open tty %s"), ttyPath); VIR_FREE(ttyPath); return -1; } VIR_FREE(ttyPath); if (lxcContainerSetStdio(argv->monitor, ttyfd) < 0) { close(ttyfd); return -1; } close(ttyfd); if (lxcContainerSetupMounts(vmDef, root) < 0) return -1; /* Wait for interface devices to show up */ if (lxcContainerWaitForContinue(argv->monitor) < 0) return -1; /* rename and enable interfaces */ if (lxcContainerRenameAndEnableInterfaces(argv->nveths, argv->veths) < 0) return -1; /* drop a set of root capabilities */ if (lxcContainerDropCapabilities() < 0) return -1; /* this function will only return if an error occured */ return lxcContainerExecInit(vmDef); }
/** * lxcControllerMain * @serverFd: server socket fd to accept client requests * @clientFd: initial client which is the libvirtd daemon * @hostFd: open fd for application facing Pty * @contFd: open fd for container facing Pty * * Processes I/O on consoles and the monitor * * Returns 0 on success or -1 in case of error */ static int lxcControllerMain(int serverFd, int clientFd, int *hostFds, int *contFds, size_t nFds, pid_t container) { struct lxcConsole *consoles; struct lxcMonitor monitor = { .serverFd = serverFd, .clientFd = clientFd, }; virErrorPtr err; int rc = -1; size_t i; if (virMutexInit(&lock) < 0) goto cleanup2; if (pipe2(sigpipe, O_CLOEXEC|O_NONBLOCK) < 0) { virReportSystemError(errno, "%s", _("Cannot create signal pipe")); goto cleanup; } if (virEventAddHandle(sigpipe[0], VIR_EVENT_HANDLE_READABLE, lxcSignalChildIO, &container, NULL) < 0) { lxcError(VIR_ERR_INTERNAL_ERROR, "%s", _("Unable to watch signal pipe")); goto cleanup; } if (signal(SIGCHLD, lxcSignalChildHandler) == SIG_ERR) { virReportSystemError(errno, "%s", _("Cannot install signal handler")); goto cleanup; } VIR_DEBUG("serverFd=%d clientFd=%d", serverFd, clientFd); virResetLastError(); if ((monitor.serverWatch = virEventAddHandle(monitor.serverFd, VIR_EVENT_HANDLE_READABLE, lxcServerAccept, &monitor, NULL)) < 0) { lxcError(VIR_ERR_INTERNAL_ERROR, "%s", _("Unable to watch monitor socket")); goto cleanup; } if (monitor.clientFd != -1 && (monitor.clientWatch = virEventAddHandle(monitor.clientFd, VIR_EVENT_HANDLE_READABLE, lxcClientIO, &monitor, NULL)) < 0) { lxcError(VIR_ERR_INTERNAL_ERROR, "%s", _("Unable to watch client socket")); goto cleanup; } if (VIR_ALLOC_N(consoles, nFds) < 0) { virReportOOMError(); goto cleanup; } for (i = 0 ; i < nFds ; i++) { consoles[i].hostFd = hostFds[i]; consoles[i].contFd = contFds[i]; if ((consoles[i].hostWatch = virEventAddHandle(consoles[i].hostFd, VIR_EVENT_HANDLE_READABLE, lxcConsoleIO, &consoles[i], NULL)) < 0) { lxcError(VIR_ERR_INTERNAL_ERROR, "%s", 
_("Unable to watch host console PTY")); goto cleanup; } if ((consoles[i].contWatch = virEventAddHandle(consoles[i].contFd, VIR_EVENT_HANDLE_READABLE, lxcConsoleIO, &consoles[i], NULL)) < 0) { lxcError(VIR_ERR_INTERNAL_ERROR, "%s", _("Unable to watch host console PTY")); goto cleanup; } } virMutexLock(&lock); while (!quit) { virMutexUnlock(&lock); if (virEventRunDefaultImpl() < 0) goto cleanup; virMutexLock(&lock); } virMutexUnlock(&lock); err = virGetLastError(); if (!err || err->code == VIR_ERR_OK) rc = 0; cleanup: virMutexDestroy(&lock); signal(SIGCHLD, SIG_DFL); cleanup2: VIR_FORCE_CLOSE(monitor.serverFd); VIR_FORCE_CLOSE(monitor.clientFd); VIR_FREE(consoles); return rc; }
static int lxcGetLoopFD(char **dev_name) { int fd = -1; DIR *dh = NULL; struct dirent *de; char *looppath; struct loop_info64 lo; VIR_DEBUG("Looking for loop devices in /dev"); if (!(dh = opendir("/dev"))) { virReportSystemError(errno, "%s", _("Unable to read /dev")); goto cleanup; } while ((de = readdir(dh)) != NULL) { if (!STRPREFIX(de->d_name, "loop")) continue; if (virAsprintf(&looppath, "/dev/%s", de->d_name) < 0) { virReportOOMError(); goto cleanup; } VIR_DEBUG("Checking up on device %s", looppath); if ((fd = open(looppath, O_RDWR)) < 0) { virReportSystemError(errno, _("Unable to open %s"), looppath); goto cleanup; } if (ioctl(fd, LOOP_GET_STATUS64, &lo) < 0) { /* Got a free device, return the fd */ if (errno == ENXIO) goto cleanup; VIR_FORCE_CLOSE(fd); virReportSystemError(errno, _("Unable to get loop status on %s"), looppath); goto cleanup; } /* Oh well, try the next device */ VIR_FORCE_CLOSE(fd); VIR_FREE(looppath); } lxcError(VIR_ERR_INTERNAL_ERROR, "%s", _("Unable to find a free loop device in /dev")); cleanup: if (fd != -1) { VIR_DEBUG("Got free loop device %s %d", looppath, fd); *dev_name = looppath; } else { VIR_DEBUG("No free loop devices available"); VIR_FREE(looppath); } if (dh) closedir(dh); return fd; }
/*
 * Set up and start a container, then run the controller main loop.
 *
 * @def:         domain definition of the container
 * @nveths:      number of entries in @veths
 * @veths:       veth interface names handed to the container
 * @monitor:     monitor server socket fd
 * @client:      initial monitor client fd
 * @ttyFDs:      array of @nttyFDs host side tty fds
 * @nttyFDs:     number of ttys; must be exactly 1 when @def has no
 *               root filesystem
 * @handshakefd: fd on which a continue message is sent to the parent
 *               once the container is set up; closed by this function
 *
 * Returns the result of lxcControllerMain, or -1 if any setup step fails.
 */
static int lxcControllerRun(virDomainDefPtr def, unsigned int nveths, char **veths, int monitor, int client, int *ttyFDs, size_t nttyFDs, int handshakefd)
{
    int rc = -1;
    int control[2] = { -1, -1};
    int containerhandshake[2] = { -1, -1 };
    int *containerTtyFDs = NULL;
    char **containerTtyPaths = NULL;
    pid_t container = -1;
    virDomainFSDefPtr root;
    char *devpts = NULL;
    char *devptmx = NULL;
    size_t nloopDevs = 0;
    int *loopDevs = NULL;
    size_t i;

    if (VIR_ALLOC_N(containerTtyFDs, nttyFDs) < 0) {
        virReportOOMError();
        goto cleanup;
    }
    /* Mark every slot as "not open" so that an early failure does not
     * make the cleanup path close whatever fd value the fresh
     * allocation happens to hold (fd 0 if the memory is zero-filled) */
    for (i = 0 ; i < nttyFDs ; i++)
        containerTtyFDs[i] = -1;

    if (VIR_ALLOC_N(containerTtyPaths, nttyFDs) < 0) {
        virReportOOMError();
        goto cleanup;
    }

    if (socketpair(PF_UNIX, SOCK_STREAM, 0, control) < 0) {
        virReportSystemError(errno, "%s",
                             _("sockpair failed"));
        goto cleanup;
    }

    if (socketpair(PF_UNIX, SOCK_STREAM, 0, containerhandshake) < 0) {
        virReportSystemError(errno, "%s",
                             _("socketpair failed"));
        goto cleanup;
    }

    if (lxcSetupLoopDevices(def, &nloopDevs, &loopDevs) < 0)
        goto cleanup;

    root = virDomainGetRootFilesystem(def);

    if (lxcSetContainerResources(def) < 0)
        goto cleanup;

    /*
     * If doing a chroot style setup, we need to prepare
     * a private /dev/pts for the child now, which they
     * will later move into position.
     *
     * This is complex because 'virsh console' needs to
     * use /dev/pts from the host OS, and the guest OS
     * needs to use /dev/pts from the guest.
     *
     * This means that we (libvirt_lxc) need to see and
     * use both /dev/pts instances. We're running in the
     * host OS context though and don't want to expose
     * the guest OS /dev/pts there.
     *
     * Thus we call unshare(CLONE_NS) so that we can see
     * the guest's new /dev/pts, without it becoming
     * visible to the host OS. We also put the root FS
     * into slave mode, just in case it was currently
     * marked as shared
     */
    if (root) {
        VIR_DEBUG("Setting up private /dev/pts");

        if (!virFileExists(root->src)) {
            virReportSystemError(errno,
                                 _("root source %s does not exist"),
                                 root->src);
            goto cleanup;
        }

        if (unshare(CLONE_NEWNS) < 0) {
            virReportSystemError(errno, "%s",
                                 _("Cannot unshare mount namespace"));
            goto cleanup;
        }

        if (mount("", "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
            virReportSystemError(errno, "%s",
                                 _("Failed to switch root mount into slave mode"));
            goto cleanup;
        }

        if (virAsprintf(&devpts, "%s/dev/pts", root->src) < 0 ||
            virAsprintf(&devptmx, "%s/dev/pts/ptmx", root->src) < 0) {
            virReportOOMError();
            goto cleanup;
        }

        if (virFileMakePath(devpts) < 0) {
            virReportSystemError(errno,
                                 _("Failed to make path %s"),
                                 devpts);
            goto cleanup;
        }

        /* XXX should we support gid=X for X!=5 for distros which use
         * a different gid for tty?  */
        VIR_DEBUG("Mounting 'devpts' on %s", devpts);
        if (mount("devpts", devpts, "devpts", 0,
                  "newinstance,ptmxmode=0666,mode=0620,gid=5") < 0) {
            virReportSystemError(errno,
                                 _("Failed to mount devpts on %s"),
                                 devpts);
            goto cleanup;
        }

        if (access(devptmx, R_OK) < 0) {
            VIR_WARN("Kernel does not support private devpts, using shared devpts");
            /* A NULL devptmx selects the shared /dev/ptmx path below */
            VIR_FREE(devptmx);
        }
    } else {
        /* Without a root filesystem exactly one tty is supported.
         * (nttyFDs is a size_t, so the previous comparison against -1
         * rejected every real count, including 1.) */
        if (nttyFDs != 1) {
            lxcError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                     _("Expected exactly one TTY fd"));
            goto cleanup;
        }
    }

    for (i = 0 ; i < nttyFDs ; i++) {
        if (devptmx) {
            VIR_DEBUG("Opening tty on private %s", devptmx);
            if (lxcCreateTty(devptmx,
                             &containerTtyFDs[i],
                             &containerTtyPaths[i]) < 0) {
                virReportSystemError(errno, "%s",
                                     _("Failed to allocate tty"));
                goto cleanup;
            }
        } else {
            VIR_DEBUG("Opening tty on shared /dev/ptmx");
            if (virFileOpenTty(&containerTtyFDs[i],
                               &containerTtyPaths[i],
                               0) < 0) {
                virReportSystemError(errno, "%s",
                                     _("Failed to allocate tty"));
                goto cleanup;
            }
        }
    }

    if (lxcSetPersonality(def) < 0)
        goto cleanup;

    if ((container = lxcContainerStart(def,
                                       nveths,
                                       veths,
                                       control[1],
                                       containerhandshake[1],
                                       containerTtyPaths,
                                       nttyFDs)) < 0)
        goto cleanup;
    /* The container side ends of the socket pairs are no longer needed here */
    VIR_FORCE_CLOSE(control[1]);
    VIR_FORCE_CLOSE(containerhandshake[1]);

    if (lxcControllerMoveInterfaces(nveths, veths, container) < 0)
        goto cleanup;

    if (lxcContainerSendContinue(control[0]) < 0) {
        virReportSystemError(errno, "%s",
                             _("Unable to send container continue message"));
        goto cleanup;
    }

    if (lxcContainerWaitForContinue(containerhandshake[0]) < 0) {
        virReportSystemError(errno, "%s",
                             _("error receiving signal from container"));
        goto cleanup;
    }

    /* Now the container is fully setup... */

    /* ...we can close the loop devices... */
    for (i = 0 ; i < nloopDevs ; i++)
        VIR_FORCE_CLOSE(loopDevs[i]);

    /* ...and reduce our privileges */
    if (lxcControllerClearCapabilities() < 0)
        goto cleanup;

    if (lxcContainerSendContinue(handshakefd) < 0) {
        virReportSystemError(errno, "%s",
                             _("error sending continue signal to parent"));
        goto cleanup;
    }
    VIR_FORCE_CLOSE(handshakefd);

    if (virSetBlocking(monitor, false) < 0 ||
        virSetBlocking(client, false) < 0) {
        virReportSystemError(errno, "%s",
                             _("Unable to set file descriptor non blocking"));
        goto cleanup;
    }
    for (i = 0 ; i < nttyFDs ; i++) {
        if (virSetBlocking(ttyFDs[i], false) < 0 ||
            virSetBlocking(containerTtyFDs[i], false) < 0) {
            virReportSystemError(errno, "%s",
                                 _("Unable to set file descriptor non blocking"));
            goto cleanup;
        }
    }

    rc = lxcControllerMain(monitor, client, ttyFDs, containerTtyFDs, nttyFDs, container);
    monitor = client = -1;

cleanup:
    VIR_FREE(devptmx);
    VIR_FREE(devpts);
    VIR_FORCE_CLOSE(control[0]);
    VIR_FORCE_CLOSE(control[1]);
    VIR_FORCE_CLOSE(handshakefd);
    VIR_FORCE_CLOSE(containerhandshake[0]);
    VIR_FORCE_CLOSE(containerhandshake[1]);

    /* Either array may still be NULL if its allocation failed */
    if (containerTtyPaths) {
        for (i = 0 ; i < nttyFDs ; i++)
            VIR_FREE(containerTtyPaths[i]);
    }
    VIR_FREE(containerTtyPaths);
    if (containerTtyFDs) {
        for (i = 0 ; i < nttyFDs ; i++)
            VIR_FORCE_CLOSE(containerTtyFDs[i]);
    }
    VIR_FREE(containerTtyFDs);

    for (i = 0 ; i < nloopDevs ; i++)
        VIR_FORCE_CLOSE(loopDevs[i]);
    VIR_FREE(loopDevs);

    virPidAbort(container);

    return rc;
}