static int check_prctl(void) { unsigned long user_auxv = 0; unsigned int *tid_addr; int ret; ret = sys_prctl(PR_GET_TID_ADDRESS, (unsigned long)&tid_addr, 0, 0, 0); if (ret) { pr_msg("prctl: PR_GET_TID_ADDRESS is not supported\n"); return -1; } ret = sys_prctl(PR_SET_MM, PR_SET_MM_BRK, sys_brk(0), 0, 0); if (ret) { if (ret == -EPERM) pr_msg("prctl: One needs CAP_SYS_RESOURCE capability to perform testing\n"); else pr_msg("prctl: PR_SET_MM is not supported\n"); return -1; } ret = sys_prctl(PR_SET_MM, PR_SET_MM_EXE_FILE, -1, 0, 0); if (ret != -EBADF) { pr_msg("prctl: PR_SET_MM_EXE_FILE is not supported (%d)\n", ret); return -1; } ret = sys_prctl(PR_SET_MM, PR_SET_MM_AUXV, (long)&user_auxv, sizeof(user_auxv), 0); if (ret) { pr_msg("prctl: PR_SET_MM_AUXV is not supported\n"); return -1; } return 0; }
static int check_prctl(void) { unsigned long user_auxv = 0; unsigned int *tid_addr; unsigned int size = 0; int ret; ret = sys_prctl(PR_GET_TID_ADDRESS, (unsigned long)&tid_addr, 0, 0, 0); if (ret) { pr_msg("prctl: PR_GET_TID_ADDRESS is not supported"); return -1; } /* * Either new or old interface must be supported in the kernel. */ ret = sys_prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0); if (ret) { if (!opts.check_ms_kernel) { pr_msg("prctl: PR_SET_MM_MAP is not supported, which " "is required for restoring user namespaces\n"); return -1; } else pr_warn("Skipping unssuported PR_SET_MM_MAP\n"); ret = sys_prctl(PR_SET_MM, PR_SET_MM_BRK, sys_brk(0), 0, 0); if (ret) { if (ret == -EPERM) pr_msg("prctl: One needs CAP_SYS_RESOURCE capability to perform testing\n"); else pr_msg("prctl: PR_SET_MM is not supported\n"); return -1; } ret = sys_prctl(PR_SET_MM, PR_SET_MM_EXE_FILE, -1, 0, 0); if (ret != -EBADF) { pr_msg("prctl: PR_SET_MM_EXE_FILE is not supported (%d)\n", ret); return -1; } ret = sys_prctl(PR_SET_MM, PR_SET_MM_AUXV, (long)&user_auxv, sizeof(user_auxv), 0); if (ret) { pr_msg("prctl: PR_SET_MM_AUXV is not supported\n"); return -1; } } return 0; }
/*
 * Re-arm the thread's parent-death signal from the restore args;
 * nothing to do when none was recorded.
 */
static inline int restore_pdeath_sig(struct thread_restore_args *ta)
{
	if (!ta->pdeath_sig)
		return 0;

	return sys_prctl(PR_SET_PDEATHSIG, ta->pdeath_sig, 0, 0, 0);
}
/*
 * Collect the task's securebits and supplementary group list into the
 * parasite args block.  Returns 0 on success, -1 on failure.
 */
static int dump_creds(struct parasite_dump_creds *args)
{
	int ret;

	args->secbits = sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0);

	/* First call with a zero-sized buffer just reports the count. */
	ret = sys_getgroups(0, NULL);
	if (ret < 0)
		goto grps_err;

	args->ngroups = ret;
	if (args->ngroups >= PARASITE_MAX_GROUPS) {
		pr_err("Too many groups in task %d\n", (int)args->ngroups);
		return -1;
	}

	/* Second call fetches the actual group ids. */
	ret = sys_getgroups(args->ngroups, args->groups);
	if (ret < 0)
		goto grps_err;

	/* A count mismatch means the set changed between the two calls. */
	if (ret != args->ngroups) {
		pr_err("Groups changed on the fly %d -> %d\n", args->ngroups, ret);
		return -1;
	}

	return 0;

grps_err:
	pr_err("Error calling getgroups (%d)\n", ret);
	return -1;
}
/*
 * 32-bit compat entry point for prctl().
 *
 * Note: `option` must be received as an unsigned int and then cast back
 * to a signed int so that the proper conversion (sign extension) between
 * the 32-bit and 64-bit register representations of a signed int is
 * performed; the remaining arguments are simply widened to unsigned long.
 */
asmlinkage long compat_sys_prctl(u32 option, u32 arg2, u32 arg3, u32 arg4, u32 arg5)
{
	return sys_prctl((int)option,
			 (unsigned long)arg2,
			 (unsigned long)arg3,
			 (unsigned long)arg4,
			 (unsigned long)arg5);
}
/*
 * Record the calling thread's id and its clear_child_tid address
 * into the parasite args block.  Returns 0 or a negative errno.
 */
static int dump_tid_info(struct parasite_dump_tid_info *args)
{
	int err;

	err = sys_prctl(PR_GET_TID_ADDRESS, (unsigned long)&args->tid_addr, 0, 0, 0);
	if (err)
		return err;

	args->tid = sys_gettid();

	return 0;
}
/*
 * Fill the misc dump block: securebits, current brk, the signal mask
 * saved earlier in old_blocked, and the pid/sid/pgid triple.
 * Always succeeds.
 */
static int dump_misc(struct parasite_dump_misc *args)
{
	/* Memory / credential state. */
	args->brk = sys_brk(0);
	args->secbits = sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0);
	args->blocked = old_blocked;

	/* Process identifiers. */
	args->pid = sys_getpid();
	args->pgid = sys_getpgid();
	args->sid = sys_getsid();

	return 0;
}
/*
 * Bring the task's dumpable flag back to the value saved in the image.
 * Only 0 and 1 can be set directly through prctl(); any other value
 * (induced by fs.suid_dumpable) is accepted as-is when it already
 * matches, otherwise we fall back to 0 as the secure choice.
 */
static int restore_dumpable_flag(MmEntry *mme)
{
	int now;
	int err;

	if (!mme->has_dumpable) {
		pr_warn("Dumpable flag not present in criu dump.\n");
		return 0;
	}

	if (mme->dumpable == 0 || mme->dumpable == 1) {
		err = sys_prctl(PR_SET_DUMPABLE, mme->dumpable, 0, 0, 0);
		if (err) {
			pr_err("Unable to set PR_SET_DUMPABLE: %d\n", err);
			return -1;
		}
		return 0;
	}

	/*
	 * A value other than 0/1 cannot be pushed back via prctl.  If the
	 * kernel already reports the same value (likely when fs.suid_dumpable
	 * matched between dump and restore) there is nothing to do; otherwise
	 * disable dumping entirely.
	 */
	now = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0);
	if (mme->dumpable == now)
		return 0;

	pr_warn("Dumpable flag [%d] does not match current [%d]. "
		"Will fallback to setting it to 0 to disable it.\n",
		mme->dumpable, now);

	err = sys_prctl(PR_SET_DUMPABLE, 0, 0, 0, 0);
	if (err) {
		pr_err("Unable to set PR_SET_DUMPABLE: %d\n", err);
		return -1;
	}

	return 0;
}
static int setup_seccomp_filter(void) { struct sock_filter filter[] = { BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)), /* Allow all syscalls except ptrace */ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_ptrace, 0, 1), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), }; struct sock_fprog bpf_prog = { .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), .filter = filter, }; if (sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, (long) &bpf_prog, 0, 0) < 0) return -1; return 0; } static int check_ptrace_dump_seccomp_filters(void) { pid_t pid; int ret = 0, len; if (opts.check_ms_kernel) { pr_warn("Skipping PTRACE_SECCOMP_GET_FILTER check"); return 0; } pid = fork_and_ptrace_attach(setup_seccomp_filter); if (pid < 0) return -1; len = ptrace(PTRACE_SECCOMP_GET_FILTER, pid, 0, NULL); if (len < 0) { ret = -1; pr_perror("Dumping seccomp filters not supported"); } kill(pid, SIGKILL); return ret; }
/*
 * Check the kernel prerequisites for user-namespace C/R: the userns
 * proc file must exist and the PR_SET_MM_MAP prctl API must be present.
 *
 * Fix: PR_SET_MM_MAP_SIZE stores the size through an unsigned int
 * pointer, so declare `size` as unsigned int (as the check_prctl probe
 * does) rather than unsigned long, which would only be partially
 * written.
 */
static int check_userns(void)
{
	int ret;
	unsigned int size = 0;

	ret = access("/proc/self/ns/user", F_OK);
	if (ret) {
		pr_perror("No userns proc file");
		return -1;
	}

	ret = sys_prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0);
	if (ret) {
		/* sys_prctl returns a negative errno directly. */
		errno = -ret;
		pr_perror("No new prctl API");
		return -1;
	}

	return 0;
}
/*
 * Restore per-task extra attributes from the checkpoint stream:
 * personality, clear_child_tid pointer and the task name (comm).
 * Returns 0 on success, -EIO on a short read, or the prctl error.
 */
int ckpt_restore_ext(ckpt_desc_t desc)
{
	ckpt_ext_t ext;
	mm_segment_t old_fs;
	int err;

	log_restore_ext("restoring extra attributes ...");

	if (ckpt_read(desc, &ext, sizeof(ckpt_ext_t)) != sizeof(ckpt_ext_t)) {
		log_err("failed to read extra attributes");
		return -EIO;
	}

	set_personality(ext.personality);
	current->clear_child_tid = ext.clear_child_tid;

	/* sys_prctl(PR_SET_NAME) expects a user pointer; temporarily widen
	 * the address limit so a kernel buffer is accepted. */
	old_fs = get_fs();
	set_fs(KERNEL_DS);
	err = sys_prctl(PR_SET_NAME, (unsigned long)ext.comm, 0, 0, 0);
	set_fs(old_fs);

	log_restore_pos(desc);
	return err;
}
/*
 * Dump the calling thread's state (tid, tid address, blocked signal
 * mask, TLS) into the parasite args block.  Fails when no saved state
 * exists for this thread or its signal mask was never captured.
 */
static int dump_thread(struct parasite_dump_thread *args)
{
	struct tid_state_s *state;
	pid_t self;
	int err;

	self = sys_gettid();

	state = find_thread_state(self);
	if (!state)
		return -ENOENT;
	if (!state->use_sig_blocked)
		return -EINVAL;

	err = sys_prctl(PR_GET_TID_ADDRESS, (unsigned long)&args->tid_addr, 0, 0, 0);
	if (err)
		return err;

	args->tid = self;
	args->blocked = state->sig_blocked;
	args->tls = arch_get_tls();

	return 0;
}
/* This function gets the list of all linux threads of the current process
 * passes them to the 'callback' along with the 'parameter' pointer; at the
 * call back call time all the threads are paused via
 * PTRACE_ATTACH.
 * The callback is executed from a separate thread which shares only the
 * address space, the filesystem, and the filehandles with the caller. Most
 * notably, it does not share the same pid and ppid; and if it terminates,
 * the rest of the application is still there. 'callback' is supposed to do
 * or arrange for ResumeAllProcessThreads. This happens automatically, if
 * the thread raises a synchronous signal (e.g. SIGSEGV); asynchronous
 * signals are blocked. If the 'callback' decides to unblock them, it must
 * ensure that they cannot terminate the application, or that
 * ResumeAllProcessThreads will get called.
 * It is an error for the 'callback' to make any library calls that could
 * acquire locks. Most notably, this means that most system calls have to
 * avoid going through libc. Also, this means that it is not legal to call
 * exit() or abort().
 * We return -1 on error and the return value of 'callback' on success.
 */
int ListAllProcessThreads(void *parameter,
                          ListAllProcessThreadsCallBack callback, ...) {
  char altstack_mem[ALT_STACKSIZE];
  struct ListerParams args;
  pid_t clone_pid;
  int dumpable = 1, sig;
  struct kernel_sigset_t sig_blocked, sig_old;

  va_start(args.ap, callback);

  /* If we are short on virtual memory, initializing the alternate stack
   * might trigger a SIGSEGV. Let's do this early, before it could get us
   * into more trouble (i.e. before signal handlers try to use the alternate
   * stack, and before we attach to other threads). */
  memset(altstack_mem, 0, sizeof(altstack_mem));

  /* Some of our cleanup functions could conceivable use more stack space.
   * Try to touch the stack right now. This could be defeated by the compiler
   * being too smart for it's own good, so try really hard. */
  DirtyStack(32768);

  /* Make this process "dumpable". This is necessary in order to ptrace()
   * after having called setuid(). */
  dumpable = sys_prctl(PR_GET_DUMPABLE, 0);
  if (!dumpable) sys_prctl(PR_SET_DUMPABLE, 1);

  /* Fill in argument block for dumper thread */
  args.result = -1;
  args.err = 0;
  args.altstack_mem = altstack_mem;
  args.parameter = parameter;
  args.callback = callback;

  /* Before cloning the thread lister, block all asynchronous signals, as we */
  /* are not prepared to handle them. Synchronous signals stay deliverable   */
  /* so a fault in the lister still terminates it.                           */
  sys_sigfillset(&sig_blocked);
  for (sig = 0; sig < sizeof(sync_signals)/sizeof(*sync_signals); sig++) {
    sys_sigdelset(&sig_blocked, sync_signals[sig]);
  }
  if (sys_sigprocmask(SIG_BLOCK, &sig_blocked, &sig_old)) {
    args.err = errno;
    args.result = -1;
    goto failed;
  }

  /* scope */ {
    /* After cloning, both the parent and the child share the same instance
     * of errno. We must make sure that at least one of these processes
     * (in our case, the parent) uses modified syscall macros that update
     * a local copy of errno, instead. */
#ifdef __cplusplus
#define sys0_sigprocmask sys.sigprocmask
#define sys0_waitpid sys.waitpid
    SysCalls sys;
#else
    int my_errno;
#define SYS_ERRNO my_errno
#define SYS_INLINE inline
#define SYS_PREFIX 0
#undef SYS_LINUX_SYSCALL_SUPPORT_H
#include "linux_syscall_support.h"
#endif
    int clone_errno;

    /* Spawn the lister; capture errno immediately, before the mask restore
     * below can clobber it. */
    clone_pid = local_clone((int (*)(void *))ListerThread, &args);
    clone_errno = errno;

    sys_sigprocmask(SIG_SETMASK, &sig_old, &sig_old);

    if (clone_pid >= 0) {
      int status, rc;
      /* __WALL: reap the clone regardless of its termination signal. */
      while ((rc = sys0_waitpid(clone_pid, &status, __WALL)) < 0 &&
             ERRNO == EINTR) {
        /* Keep waiting */
      }
      if (rc < 0) {
        args.err = ERRNO;
        args.result = -1;
      } else if (WIFEXITED(status)) {
        /* Exit codes 2 and 3 are set by ListerThread to report
         * specific failure modes; anything else is unexpected. */
        switch (WEXITSTATUS(status)) {
          case 0: break;             /* Normal process termination */
          case 2: args.err = EFAULT; /* Some fault (e.g. SIGSEGV) detected */
                  args.result = -1;
                  break;
          case 3: args.err = EPERM;  /* Process is already being traced */
                  args.result = -1;
                  break;
          default:args.err = ECHILD; /* Child died unexpectedly */
                  args.result = -1;
                  break;
        }
      } else if (!WIFEXITED(status)) {
        args.err = EFAULT;           /* Terminated due to an unhandled signal*/
        args.result = -1;
      }
    } else {
      args.result = -1;
      args.err = clone_errno;
    }
  }

  /* Restore the "dumpable" state of the process */
failed:
  if (!dumpable) sys_prctl(PR_SET_DUMPABLE, dumpable);

  va_end(args.ap);
  errno = args.err;
  return args.result;
}
/*
 * Restore the task's credentials from the image: uids/gids, securebits,
 * the capability bounding set and the capability sets themselves.
 * Must be called while the task still holds root privileges (CAP_SETUID
 * and CAP_SETPCAP).  Returns 0 on success, -1 on any failure.
 *
 * NOTE(review): the ordering below is deliberate — securebits are pinned
 * before any xid change so capabilities are not dropped in between.
 */
static int restore_creds(CredsEntry *ce)
{
	int b, i, ret;
	struct cap_header hdr;
	struct cap_data data[_LINUX_CAPABILITY_U32S_3];

	/*
	 * We're still root here and thus can do it without failures.
	 */

	/*
	 * First -- set the SECURE_NO_SETUID_FIXUP bit not to
	 * lose caps bits when changing xids.
	 */
	ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0);
	if (ret) {
		pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret);
		return -1;
	}

	/*
	 * Second -- restore xids. Since we still have the CAP_SETUID
	 * capability nothing should fail. But call the setfsXid last
	 * to override the setresXid settings.
	 */
	ret = sys_setresuid(ce->uid, ce->euid, ce->suid);
	if (ret) {
		pr_err("Unable to set real, effective and saved user ID: %d\n", ret);
		return -1;
	}

	/* setfsuid() has no error return; read the value back to verify. */
	sys_setfsuid(ce->fsuid);
	if (sys_setfsuid(-1) != ce->fsuid) {
		pr_err("Unable to set fsuid\n");
		return -1;
	}

	ret = sys_setresgid(ce->gid, ce->egid, ce->sgid);
	if (ret) {
		pr_err("Unable to set real, effective and saved group ID: %d\n", ret);
		return -1;
	}

	/* Same read-back trick as for fsuid above. */
	sys_setfsgid(ce->fsgid);
	if (sys_setfsgid(-1) != ce->fsgid) {
		pr_err("Unable to set fsgid\n");
		return -1;
	}

	/*
	 * Third -- restore securebits. We don't need them in any
	 * special state any longer.
	 */
	ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0);
	if (ret) {
		pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret);
		return -1;
	}

	/*
	 * Fourth -- trim bset. This can only be done while
	 * having the CAP_SETPCAP capablity.
	 */
	for (b = 0; b < CR_CAP_SIZE; b++) {
		for (i = 0; i < 32; i++) {
			/* Bits beyond the kernel's last capability are meaningless. */
			if (b * 32 + i > cap_last_cap)
				break;
			if (ce->cap_bnd[b] & (1 << i))
				/* already set */
				continue;
			ret = sys_prctl(PR_CAPBSET_DROP, i + b * 32, 0, 0, 0);
			if (ret) {
				pr_err("Unable to drop capability %d: %d\n", i + b * 32, ret);
				return -1;
			}
		}
	}

	/*
	 * Fifth -- restore caps. Nothing but cap bits are changed
	 * at this stage, so just do it.
	 */
	hdr.version = _LINUX_CAPABILITY_VERSION_3;
	hdr.pid = 0;

	BUILD_BUG_ON(_LINUX_CAPABILITY_U32S_3 != CR_CAP_SIZE);

	for (i = 0; i < CR_CAP_SIZE; i++) {
		data[i].eff = ce->cap_eff[i];
		data[i].prm = ce->cap_prm[i];
		data[i].inh = ce->cap_inh[i];
	}

	ret = sys_capset(&hdr, data);
	if (ret) {
		pr_err("Unable to restore capabilities: %d\n", ret);
		return -1;
	}

	return 0;
}