static bool setup_limits(struct lxc_handler *h, bool do_devices) { struct lxc_list *iterator; struct lxc_cgroup *cg; bool ret = false; struct lxc_list *cgroup_settings = &h->conf->cgroup; char *cgroup_path = h->cgroup_info->data; if (lxc_list_empty(cgroup_settings)) return true; lxc_list_for_each(iterator, cgroup_settings) { char controller[100], *p; cg = iterator->elem; if (do_devices != !strncmp("devices", cg->subsystem, 7)) continue; if (strlen(cg->subsystem) > 100) // i smell a rat goto out; strcpy(controller, cg->subsystem); p = strchr(controller, '.'); if (p) *p = '\0'; if (cgm_do_set(controller, cg->subsystem, cgroup_path , cg->value) < 0) { ERROR("Error setting %s to %s for %s\n", cg->subsystem, cg->value, h->name); goto out; } DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value); }
static bool chown_cgroup(const char *controller, const char *cgroup_path, struct lxc_conf *conf) { struct chown_data data; if (lxc_list_empty(&conf->id_map)) /* If there's no mapping then we don't need to chown */ return true; data.controller = controller; data.cgroup_path = cgroup_path; data.origuid = geteuid(); if (userns_exec_1(conf, chown_cgroup_wrapper, &data) < 0) { ERROR("Error requesting cgroup chown in new namespace"); return false; } /* now chmod 775 the directory else the container cannot create cgroups */ if (!lxc_cgmanager_chmod(controller, cgroup_path, "", 0775)) return false; if (!lxc_cgmanager_chmod(controller, cgroup_path, "tasks", 0775)) return false; if (!lxc_cgmanager_chmod(controller, cgroup_path, "cgroup.procs", 0775)) return false; return true; }
static bool cgm_setup_limits(void *hdata, struct lxc_list *cgroup_settings, bool do_devices) { struct cgm_data *d = hdata; struct lxc_list *iterator, *sorted_cgroup_settings, *next; struct lxc_cgroup *cg; bool ret = false; if (lxc_list_empty(cgroup_settings)) return true; if (!d || !d->cgroup_path) return false; if (!cgm_dbus_connect()) { ERROR("Error connecting to cgroup manager"); return false; } sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings); if (!sorted_cgroup_settings) { return false; } lxc_list_for_each(iterator, sorted_cgroup_settings) { char controller[100], *p; cg = iterator->elem; if (do_devices != !strncmp("devices", cg->subsystem, 7)) continue; if (strlen(cg->subsystem) > 100) // i smell a rat goto out; strcpy(controller, cg->subsystem); p = strchr(controller, '.'); if (p) *p = '\0'; if (cgmanager_set_value_sync(NULL, cgroup_manager, controller, d->cgroup_path, cg->subsystem, cg->value) != 0) { NihError *nerr; nerr = nih_error_get(); if (do_devices) { WARN("call to cgmanager_set_value_sync failed: %s", nerr->message); nih_free(nerr); WARN("Error setting cgroup %s:%s limit type %s", controller, d->cgroup_path, cg->subsystem); continue; } ERROR("call to cgmanager_set_value_sync failed: %s", nerr->message); nih_free(nerr); ERROR("Error setting cgroup %s:%s limit type %s", controller, d->cgroup_path, cg->subsystem); goto out; } DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value); }
static int must_drop_cap_sys_boot(struct lxc_conf *conf) { FILE *f; int ret, cmd, v, flags; long stack_size = 4096; void *stack = alloca(stack_size); int status; pid_t pid; f = fopen("/proc/sys/kernel/ctrl-alt-del", "r"); if (!f) { DEBUG("failed to open /proc/sys/kernel/ctrl-alt-del"); return 1; } ret = fscanf(f, "%d", &v); fclose(f); if (ret != 1) { DEBUG("Failed to read /proc/sys/kernel/ctrl-alt-del"); return 1; } cmd = v ? LINUX_REBOOT_CMD_CAD_ON : LINUX_REBOOT_CMD_CAD_OFF; flags = CLONE_NEWPID | SIGCHLD; if (!lxc_list_empty(&conf->id_map)) flags |= CLONE_NEWUSER; #ifdef __ia64__ pid = __clone2(container_reboot_supported, stack, stack_size, flags, &cmd); #else stack += stack_size; pid = clone(container_reboot_supported, stack, flags, &cmd); #endif if (pid < 0) { if (flags & CLONE_NEWUSER) ERROR("failed to clone (%#x): %s (includes CLONE_NEWUSER)", flags, strerror(errno)); else ERROR("failed to clone (%#x): %s", flags, strerror(errno)); return -1; } if (wait(&status) < 0) { SYSERROR("unexpected wait error: %m"); return -1; } if (WEXITSTATUS(status) != 1) return 1; return 0; }
/*
 * network_netdev: fetch the network device a configuration option applies
 * to, i.e. the last element of the network list.
 *
 * @key, @value: the option being processed; used only in error messages
 * @network:     list of network devices
 *
 * Returns the last element of @network, or NULL (after logging an error)
 * when the list is empty or its last element is NULL.
 */
static struct lxc_netdev *network_netdev(const char *key, const char *value,
					 struct lxc_list *network)
{
	struct lxc_netdev *last;

	if (lxc_list_empty(network)) {
		ERROR("network is not created for '%s' = '%s' option", key, value);
		return NULL;
	}

	last = lxc_list_last_elem(network);
	if (last)
		return last;

	ERROR("no network device defined for '%s' = '%s' option", key, value);
	return NULL;
}
/* Internal helper. Must be called with the cgmanager dbus socket open */ static bool chown_cgroup(const char *cgroup_path, struct lxc_conf *conf) { struct chown_data data; char **slist = subsystems; int i; if (lxc_list_empty(&conf->id_map)) /* If there's no mapping then we don't need to chown */ return true; data.cgroup_path = cgroup_path; data.origuid = geteuid(); /* Unpriv users can't chown it themselves, so chown from * a child namespace mapping both our own and the target uid */ if (userns_exec_1(conf, chown_cgroup_wrapper, &data) < 0) { ERROR("Error requesting cgroup chown in new namespace"); return false; } /* * Now chmod 775 the directory else the container cannot create cgroups. * This can't be done in the child namespace because it only group-owns * the cgroup */ if (cgm_supports_multiple_controllers) slist = subsystems_inone; for (i = 0; slist[i]; i++) { if (!lxc_cgmanager_chmod(slist[i], cgroup_path, "", 0775)) return false; if (!lxc_cgmanager_chmod(slist[i], cgroup_path, "tasks", 0775)) return false; if (!lxc_cgmanager_chmod(slist[i], cgroup_path, "cgroup.procs", 0775)) return false; } return true; }
/*
 * lxc_attach: start a process inside the namespaces of a running container.
 *
 * @name, @lxcpath:   identify the container
 * @exec_function:    function the attached process runs (passed on to
 *                    attach_child_main through the clone payload)
 * @exec_payload:     opaque argument for @exec_function
 * @options:          attach options; NULL selects
 *                    attach_static_default_options. options->namespaces may
 *                    be updated in place.
 * @attached_process: on success receives the pid of the attached process
 *
 * The calling ("initial") process forks an intermediate process. The
 * intermediate process enters the container's namespaces, clones the real
 * attached process with CLONE_PARENT, reports its pid over a socketpair and
 * exits. The initial process meanwhile moves the intermediate pid into the
 * container's cgroup (if requested), applies resource limits, and walks
 * through the numbered handshake described further down.
 *
 * Returns 0 on success and -1 on failure in the initial process. The
 * intermediate process never returns from this function; it leaves through
 * rexit().
 */
int lxc_attach(const char* name, const char* lxcpath, lxc_attach_exec_t exec_function, void* exec_payload, lxc_attach_options_t* options, pid_t* attached_process)
{
	int ret, status;
	pid_t init_pid, pid, attached_pid, expected;
	struct lxc_proc_context_info *init_ctx;
	char* cwd;
	char* new_cwd;
	int ipc_sockets[2];
	signed long personality;

	if (!options)
		options = &attach_static_default_options;

	init_pid = lxc_cmd_get_init_pid(name, lxcpath);
	if (init_pid < 0) {
		ERROR("Failed to get init pid.");
		return -1;
	}

	init_ctx = lxc_proc_get_context_info(init_pid);
	if (!init_ctx) {
		ERROR("Failed to get context of init process: %ld.", (long)init_pid);
		return -1;
	}

	personality = get_personality(name, lxcpath);
	/* NOTE(review): this tests the value already stored in init_ctx, not
	 * the one get_personality() just returned, which then overwrites it
	 * below. Left unchanged because get_personality()'s failure contract
	 * is not visible here -- confirm which value was meant to be checked.
	 */
	if (init_ctx->personality < 0) {
		ERROR("Failed to get personality of the container.");
		lxc_proc_put_context_info(init_ctx);
		return -1;
	}
	init_ctx->personality = personality;

	init_ctx->container = lxc_container_new(name, lxcpath);
	if (!init_ctx->container) {
		/* Release the context like every other error path does. */
		lxc_proc_put_context_info(init_ctx);
		return -1;
	}

	if (!fetch_seccomp(init_ctx->container, options))
		WARN("Failed to get seccomp policy.");

	if (!no_new_privs(init_ctx->container, options))
		WARN("Could not determine whether PR_SET_NO_NEW_PRIVS is set.");

	cwd = getcwd(NULL, 0);

	/* Determine which namespaces the container was created with
	 * by asking lxc-start, if necessary.
	 */
	if (options->namespaces == -1) {
		options->namespaces = lxc_cmd_get_clone_flags(name, lxcpath);
		/* call failed */
		if (options->namespaces == -1) {
			ERROR("Failed to automatically determine the "
			      "namespaces which the container uses.");
			free(cwd);
			lxc_proc_put_context_info(init_ctx);
			return -1;
		}
	}

	/* Create a socket pair for IPC communication; set SOCK_CLOEXEC in order
	 * to make sure we don't irritate other threads that want to fork+exec
	 * away
	 *
	 * IMPORTANT: if the initial process is multithreaded and another call
	 * just fork()s away without exec'ing directly after, the socket fd will
	 * exist in the forked process from the other thread and any close() in
	 * our own child process will not really cause the socket to close
	 * properly, potentially causing the parent to hang.
	 *
	 * For this reason, while IPC is still active, we have to use shutdown()
	 * if the child exits prematurely in order to signal that the socket is
	 * closed and cannot assume that the child exiting will automatically do
	 * that.
	 *
	 * IPC mechanism: (X is receiver)
	 *   initial process        intermediate          attached
	 *        X           <---  send pid of
	 *                          attached proc,
	 *                          then exit
	 *    send 0 ------------------------------------>    X
	 *                                              [do initialization]
	 *        X  <------------------------------------  send 1
	 *   [add to cgroup, ...]
	 *    send 2 ------------------------------------>    X
	 *                                              [set LXC_ATTACH_NO_NEW_PRIVS]
	 *        X  <------------------------------------  send 3
	 *   [open LSM label fd]
	 *    send 4 ------------------------------------>    X
	 *                                              [set LSM label]
	 *   close socket                                 close socket
	 *                                                run program
	 */
	ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
	if (ret < 0) {
		SYSERROR("Could not set up required IPC mechanism for attaching.");
		free(cwd);
		lxc_proc_put_context_info(init_ctx);
		return -1;
	}

	/* Create intermediate subprocess, three reasons:
	 *       1. Runs all pthread_atfork handlers and the child will no
	 *          longer be threaded (we can't properly setns() in a threaded
	 *          process).
	 *       2. We can't setns() in the child itself, since we want to make
	 *          sure we are properly attached to the pidns.
	 *       3. Also, the initial thread has to put the attached process
	 *          into the cgroup, which we can only do if we didn't already
	 *          setns() (otherwise, user namespaces will hate us).
	 */
	pid = fork();
	if (pid < 0) {
		SYSERROR("Failed to create first subprocess.");
		/* Nobody else holds the socket pair yet; don't leak it. */
		close(ipc_sockets[0]);
		close(ipc_sockets[1]);
		free(cwd);
		lxc_proc_put_context_info(init_ctx);
		return -1;
	}

	if (pid) {
		int procfd = -1;
		pid_t to_cleanup_pid = pid;

		/* Initial thread, we close the socket that is for the
		 * subprocesses.
		 */
		close(ipc_sockets[1]);
		free(cwd);

		/* Attach to cgroup, if requested. */
		if (options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) {
			if (!cgroup_attach(name, lxcpath, pid))
				goto on_error;
		}

		/* Setup resource limits */
		if (!lxc_list_empty(&init_ctx->container->lxc_conf->limits) &&
		    setup_resource_limits(&init_ctx->container->lxc_conf->limits, pid)) {
			goto on_error;
		}

		/* Open /proc before setns() to the containers namespace so we
		 * don't rely on any information from inside the container.
		 */
		procfd = open("/proc", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
		if (procfd < 0) {
			SYSERROR("Unable to open /proc.");
			goto on_error;
		}

		/* Let the child process know to go ahead. */
		status = 0;
		ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
		if (ret <= 0) {
			ERROR("Intended to send sequence number 0: %s.", strerror(errno));
			goto on_error;
		}

		/* Get pid of attached process from intermediate process. */
		ret = lxc_read_nointr_expect(ipc_sockets[0], &attached_pid, sizeof(attached_pid), NULL);
		if (ret <= 0) {
			if (ret != 0)
				ERROR("Expected to receive pid: %s.", strerror(errno));
			goto on_error;
		}

		/* Ignore SIGINT (CTRL-C) and SIGQUIT (CTRL-\) - issue #313. */
		if (options->stdin_fd == 0) {
			signal(SIGINT, SIG_IGN);
			signal(SIGQUIT, SIG_IGN);
		}

		/* Reap intermediate process. */
		ret = wait_for_pid(pid);
		if (ret < 0)
			goto on_error;

		/* We will always have to reap the attached process now. */
		to_cleanup_pid = attached_pid;

		/* Tell attached process it may start initializing. */
		status = 0;
		ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
		if (ret <= 0) {
			ERROR("Intended to send sequence number 0: %s.", strerror(errno));
			goto on_error;
		}

		/* Wait for the attached process to finish initializing. */
		expected = 1;
		ret = lxc_read_nointr_expect(ipc_sockets[0], &status, sizeof(status), &expected);
		if (ret <= 0) {
			if (ret != 0)
				ERROR("Expected to receive sequence number 1: %s.", strerror(errno));
			goto on_error;
		}

		/* Tell attached process we're done. */
		status = 2;
		ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
		if (ret <= 0) {
			ERROR("Intended to send sequence number 2: %s.", strerror(errno));
			goto on_error;
		}

		/* Wait for the (grand)child to tell us that it's ready to set
		 * up its LSM labels.
		 */
		expected = 3;
		ret = lxc_read_nointr_expect(ipc_sockets[0], &status, sizeof(status), &expected);
		if (ret <= 0) {
			ERROR("Expected to receive sequence number 3: %s.", strerror(errno));
			goto on_error;
		}

		/* Open LSM fd and send it to child. */
		if ((options->namespaces & CLONE_NEWNS) &&
		    (options->attach_flags & LXC_ATTACH_LSM) &&
		    init_ctx->lsm_label) {
			int on_exec, saved_errno;
			int labelfd = -1;

			on_exec = options->attach_flags & LXC_ATTACH_LSM_EXEC ? 1 : 0;
			/* Open fd for the LSM security module. */
			labelfd = lsm_openat(procfd, attached_pid, on_exec);
			if (labelfd < 0)
				goto on_error;

			/* Send child fd of the LSM security module to write to. */
			ret = lxc_abstract_unix_send_fds(ipc_sockets[0], &labelfd, 1, NULL, 0);
			/* close() may clobber errno, so keep it for the message */
			saved_errno = errno;
			close(labelfd);
			if (ret <= 0) {
				ERROR("Intended to send file descriptor %d: %s.", labelfd, strerror(saved_errno));
				goto on_error;
			}
		}

		if (procfd >= 0)
			close(procfd);

		/* Now shut down communication with child, we're done. */
		shutdown(ipc_sockets[0], SHUT_RDWR);
		close(ipc_sockets[0]);
		lxc_proc_put_context_info(init_ctx);

		/* We're done, the child process should now execute whatever it
		 * is that the user requested. The parent can now track it with
		 * waitpid() or similar.
		 */
		*attached_process = attached_pid;
		return 0;

	on_error:
		/* First shut down the socket, then wait for the pid, otherwise
		 * the pid we're waiting for may never exit.
		 */
		if (procfd >= 0)
			close(procfd);
		shutdown(ipc_sockets[0], SHUT_RDWR);
		close(ipc_sockets[0]);
		if (to_cleanup_pid)
			(void) wait_for_pid(to_cleanup_pid);
		lxc_proc_put_context_info(init_ctx);
		return -1;
	}

	/* First subprocess begins here, we close the socket that is for the
	 * initial thread.
	 */
	close(ipc_sockets[0]);

	/* Wait for the parent to have setup cgroups. */
	expected = 0;
	status = -1;
	ret = lxc_read_nointr_expect(ipc_sockets[1], &status, sizeof(status), &expected);
	if (ret <= 0) {
		ERROR("Expected to receive sequence number 0: %s.", strerror(errno));
		shutdown(ipc_sockets[1], SHUT_RDWR);
		rexit(-1);
	}

	if ((options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) && cgns_supported())
		options->namespaces |= CLONE_NEWCGROUP;

	/* Attach now, create another subprocess later, since pid namespaces
	 * only really affect the children of the current process.
	 */
	ret = lxc_attach_to_ns(init_pid, options->namespaces);
	if (ret < 0) {
		ERROR("Failed to enter namespaces.");
		shutdown(ipc_sockets[1], SHUT_RDWR);
		rexit(-1);
	}

	/* Attach succeeded, try to cwd. */
	if (options->initial_cwd)
		new_cwd = options->initial_cwd;
	else
		new_cwd = cwd;
	ret = chdir(new_cwd);
	if (ret < 0)
		WARN("Could not change directory to \"%s\".", new_cwd);
	free(cwd);

	/* Now create the real child process. */
	{
		struct attach_clone_payload payload = {
			.ipc_socket = ipc_sockets[1],
			.options = options,
			.init_ctx = init_ctx,
			.exec_function = exec_function,
			.exec_payload = exec_payload,
		};
		/* We use clone_parent here to make this subprocess a direct
		 * child of the initial process. Then this intermediate process
		 * can exit and the parent can directly track the attached
		 * process.
		 */
		pid = lxc_clone(attach_child_main, &payload, CLONE_PARENT);
	}

	/* Shouldn't happen, clone() should always return positive pid. */
	if (pid <= 0) {
		SYSERROR("Failed to create subprocess.");
		shutdown(ipc_sockets[1], SHUT_RDWR);
		rexit(-1);
	}

	/* Tell grandparent the pid of the pid of the newly created child. */
	ret = lxc_write_nointr(ipc_sockets[1], &pid, sizeof(pid));
	if (ret != sizeof(pid)) {
		/* If this really happens here, this is very unfortunate, since
		 * the parent will not know the pid of the attached process and
		 * will not be able to wait for it (and we won't either due to
		 * CLONE_PARENT) so the parent won't be able to reap it and the
		 * attached process will remain a zombie.
		 */
		ERROR("Intended to send pid %d: %s.", pid, strerror(errno));
		shutdown(ipc_sockets[1], SHUT_RDWR);
		rexit(-1);
	}

	/* The rest is in the hands of the initial and the attached process. */
	rexit(0);
}
int main(int argc, char *argv[]) { int c; unsigned long flags = CLONE_NEWUSER | CLONE_NEWNS; char ttyname0[256], ttyname1[256], ttyname2[256]; int status; int ret; int pid; char *default_args[] = {"/bin/sh", NULL}; char buf[1]; int pipe1[2], // child tells parent it has unshared pipe2[2]; // parent tells child it is mapped and may proceed memset(ttyname0, '\0', sizeof(ttyname0)); memset(ttyname1, '\0', sizeof(ttyname1)); memset(ttyname2, '\0', sizeof(ttyname2)); if (isatty(0)) { ret = readlink("/proc/self/fd/0", ttyname0, sizeof(ttyname0)); if (ret < 0) { perror("unable to open stdin."); exit(1); } ret = readlink("/proc/self/fd/1", ttyname1, sizeof(ttyname1)); if (ret < 0) { printf("Warning: unable to open stdout, continuing."); memset(ttyname1, '\0', sizeof(ttyname1)); } ret = readlink("/proc/self/fd/2", ttyname2, sizeof(ttyname2)); if (ret < 0) { printf("Warning: unable to open stderr, continuing."); memset(ttyname2, '\0', sizeof(ttyname2)); } } lxc_list_init(&active_map); while ((c = getopt(argc, argv, "m:h")) != EOF) { switch (c) { case 'm': if (parse_map(optarg)) usage(argv[0]); break; case 'h': default: usage(argv[0]); } }; if (lxc_list_empty(&active_map)) { if (find_default_map()) { fprintf(stderr, "You have no allocated subuids or subgids\n"); exit(1); } } argv = &argv[optind]; argc = argc - optind; if (argc < 1) { argv = default_args; argc = 1; } if (pipe(pipe1) < 0 || pipe(pipe2) < 0) { perror("pipe"); exit(1); } if ((pid = fork()) == 0) { // Child. 
close(pipe1[0]); close(pipe2[1]); opentty(ttyname0, 0); opentty(ttyname1, 1); opentty(ttyname2, 2); ret = unshare(flags); if (ret < 0) { perror("unshare"); return 1; } buf[0] = '1'; if (write(pipe1[1], buf, 1) < 1) { perror("write pipe"); exit(1); } if (read(pipe2[0], buf, 1) < 1) { perror("read pipe"); exit(1); } if (buf[0] != '1') { fprintf(stderr, "parent had an error, child exiting\n"); exit(1); } close(pipe1[1]); close(pipe2[0]); return do_child((void*)argv); } close(pipe1[1]); close(pipe2[0]); if (read(pipe1[0], buf, 1) < 1) { perror("read pipe"); exit(1); } buf[0] = '1'; if (lxc_map_ids(&active_map, pid)) { fprintf(stderr, "error mapping child\n"); ret = 0; } if (write(pipe2[1], buf, 1) < 0) { perror("write to pipe"); exit(1); } if ((ret = waitpid(pid, &status, __WALL)) < 0) { printf("waitpid() returns %d, errno %d\n", ret, errno); exit(1); } exit(WEXITSTATUS(status)); }
int main(int argc, char *argv[]) { int c, pid, ret, status; char buf[1]; int pipe_fds1[2], /* child tells parent it has unshared */ pipe_fds2[2]; /* parent tells child it is mapped and may proceed */ unsigned long flags = CLONE_NEWUSER | CLONE_NEWNS; char ttyname0[256] = {0}, ttyname1[256] = {0}, ttyname2[256] = {0}; char *default_args[] = {"/bin/sh", NULL}; lxc_log_fd = STDERR_FILENO; if (isatty(STDIN_FILENO)) { ret = readlink("/proc/self/fd/0", ttyname0, sizeof(ttyname0)); if (ret < 0) { CMD_SYSERROR("Failed to open stdin"); _exit(EXIT_FAILURE); } ret = readlink("/proc/self/fd/1", ttyname1, sizeof(ttyname1)); if (ret < 0) { CMD_SYSINFO("Failed to open stdout. Continuing"); ttyname1[0] = '\0'; } ret = readlink("/proc/self/fd/2", ttyname2, sizeof(ttyname2)); if (ret < 0) { CMD_SYSINFO("Failed to open stderr. Continuing"); ttyname2[0] = '\0'; } } lxc_list_init(&active_map); while ((c = getopt(argc, argv, "m:h")) != EOF) { switch (c) { case 'm': ret = parse_map(optarg); if (ret < 0) { usage(argv[0]); _exit(EXIT_FAILURE); } break; case 'h': usage(argv[0]); _exit(EXIT_SUCCESS); default: usage(argv[0]); _exit(EXIT_FAILURE); } }; if (lxc_list_empty(&active_map)) { ret = find_default_map(); if (ret < 0) { fprintf(stderr, "Failed to find subuid or subgid allocation\n"); _exit(EXIT_FAILURE); } } argv = &argv[optind]; argc = argc - optind; if (argc < 1) argv = default_args; ret = pipe2(pipe_fds1, O_CLOEXEC); if (ret < 0) { CMD_SYSERROR("Failed to open new pipe"); _exit(EXIT_FAILURE); } ret = pipe2(pipe_fds2, O_CLOEXEC); if (ret < 0) { CMD_SYSERROR("Failed to open new pipe"); close(pipe_fds1[0]); close(pipe_fds1[1]); _exit(EXIT_FAILURE); } pid = fork(); if (pid < 0) { close(pipe_fds1[0]); close(pipe_fds1[1]); close(pipe_fds2[0]); close(pipe_fds2[1]); _exit(EXIT_FAILURE); } if (pid == 0) { close(pipe_fds1[0]); close(pipe_fds2[1]); opentty(ttyname0, STDIN_FILENO); opentty(ttyname1, STDOUT_FILENO); opentty(ttyname2, STDERR_FILENO); ret = unshare(flags); if (ret < 0) { 
CMD_SYSERROR("Failed to unshare mount and user namespace"); close(pipe_fds1[1]); close(pipe_fds2[0]); _exit(EXIT_FAILURE); } buf[0] = '1'; ret = lxc_write_nointr(pipe_fds1[1], buf, 1); if (ret != 1) { CMD_SYSERROR("Failed to write to pipe file descriptor %d", pipe_fds1[1]); close(pipe_fds1[1]); close(pipe_fds2[0]); _exit(EXIT_FAILURE); } ret = lxc_read_nointr(pipe_fds2[0], buf, 1); if (ret != 1) { CMD_SYSERROR("Failed to read from pipe file descriptor %d", pipe_fds2[0]); close(pipe_fds1[1]); close(pipe_fds2[0]); _exit(EXIT_FAILURE); } close(pipe_fds1[1]); close(pipe_fds2[0]); if (buf[0] != '1') { fprintf(stderr, "Received unexpected value from parent process\n"); _exit(EXIT_FAILURE); } ret = do_child((void *)argv); if (ret < 0) _exit(EXIT_FAILURE); _exit(EXIT_SUCCESS); } close(pipe_fds1[1]); close(pipe_fds2[0]); ret = lxc_read_nointr(pipe_fds1[0], buf, 1); if (ret <= 0) CMD_SYSERROR("Failed to read from pipe file descriptor %d", pipe_fds1[0]); buf[0] = '1'; ret = lxc_map_ids(&active_map, pid); if (ret < 0) fprintf(stderr, "Failed to write id mapping for child process\n"); ret = lxc_write_nointr(pipe_fds2[1], buf, 1); if (ret < 0) { CMD_SYSERROR("Failed to write to pipe file descriptor %d", pipe_fds2[1]); _exit(EXIT_FAILURE); } ret = waitpid(pid, &status, __WALL); if (ret < 0) { CMD_SYSERROR("Failed to wait on child process"); _exit(EXIT_FAILURE); } _exit(WEXITSTATUS(status)); }