/**
 * Rewrite the current syscall of @tracee according to @modif when
 * needs_kompat() reports that @config requires it for
 * @modif->expected_release.  Returns true if the syscall was
 * rewritten, false if it was left untouched.
 */
static bool modify_syscall(Tracee *tracee, const Config *config, const Modif *modif)
{
	size_t shift_index;
	word_t native_sysnum;

	assert(config != NULL);

	if (!needs_kompat(config, modif->expected_release))
		return false;

	/* Leave the syscall alone when the replacement one is not
	 * available for the ABI of this tracee.  */
	native_sysnum = detranslate_sysnum(get_abi(tracee), modif->new_sysarg_num);
	if (native_sysnum == SYSCALL_AVOIDER)
		return false;

	set_sysnum(tracee, modif->new_sysarg_num);

	/* Move every described run of arguments by its offset.  */
	for (shift_index = 0; shift_index < MAX_ARG_SHIFT; shift_index++) {
		const Reg first_reg = modif->shifts[shift_index].sysarg;
		const size_t count = modif->shifts[shift_index].nb_args;
		const int delta = modif->shifts[shift_index].offset;
		size_t k = 0;

		while (k < count) {
			const word_t value = peek_reg(tracee, CURRENT, first_reg + k);

			poke_reg(tracee, first_reg + k + delta, value);
			k++;
		}
	}

	return true;
}
/**
 * Enter stage of a wait syscall made by @ptracer.  When @ptracer has
 * ptracees and the requested pid is either -1 or one of them, the
 * syscall is replaced with PR_void and flagged as WAITS_IN_PROOT so
 * that it gets handled at the exit stage; otherwise it is left to the
 * kernel (WAITS_IN_KERNEL).  As written, this function always
 * returns 0.
 */
int translate_wait_enter(Tracee *ptracer)
{
	Tracee *ptracee;
	pid_t wanted_pid;

	/* Default: the kernel handles this wait.  */
	PTRACER.waits_in = WAITS_IN_KERNEL;

	/* Nothing to emulate for a tracee that traces nobody.  */
	if (PTRACER.nb_ptracees == 0)
		return 0;

	/* Nothing to emulate either when a specific pid is requested
	 * and it is not a ptracee of this ptracer.  */
	wanted_pid = (pid_t) peek_reg(ptracer, ORIGINAL, SYSARG_1);
	if (wanted_pid != -1) {
		ptracee = get_tracee(ptracer, wanted_pid, false);
		if (ptracee == NULL)
			return 0;
		if (PTRACEE.ptracer != ptracer)
			return 0;
	}

	/* Cancel the syscall now; the exit stage does the real work.  */
	set_sysnum(ptracer, PR_void);
	PTRACER.waits_in = WAITS_IN_PROOT;

	return 0;
}
/**
 * Rewrite the current syscall of @tracee according to @modif when
 * needs_kompat() reports that @config requires it for
 * @modif->expected_release.  Returns true if the syscall was
 * rewritten, false if it was left untouched.
 */
static bool modify_syscall(Tracee *tracee, const Config *config, const Modif *modif)
{
	size_t shift_index;

	assert(config != NULL);

	if (!needs_kompat(config, modif->expected_release))
		return false;

	set_sysnum(tracee, modif->new_sysarg_num);

	/* Move every described run of arguments by its offset.  */
	for (shift_index = 0; shift_index < MAX_ARG_SHIFT; shift_index++) {
		const Reg first_reg = modif->shifts[shift_index].sysarg;
		const size_t count = modif->shifts[shift_index].nb_args;
		const int delta = modif->shifts[shift_index].offset;
		size_t k = 0;

		while (k < count) {
			const word_t value = peek_reg(tracee, CURRENT, first_reg + k);

			poke_reg(tracee, first_reg + k + delta, value);
			k++;
		}
	}

	return true;
}
/**
 * Translate the current syscall of @tracee, at either its enter or
 * its exit stage (as told by IS_IN_SYSENTER).  Registers are fetched
 * first and pushed back once the stage was processed; nothing is
 * done if they can't be fetched.
 */
void translate_syscall(Tracee *tracee)
{
	const bool entering = IS_IN_SYSENTER(tracee);
	int result;

	assert(tracee->exe != NULL);

	result = fetch_regs(tracee);
	if (result < 0)
		return;

	if (!entering) {
		/* Exit stage: original register values are restored
		 * at the end of this stage, unless told otherwise.  */
		tracee->restore_original_regs = true;

		print_current_regs(tracee, 5, "sysexit start");

		/* Syscalls chained by PRoot are only reported to the
		 * extensions; those requested by the tracee are
		 * translated.  */
		if (tracee->chain.syscalls != NULL)
			(void) notify_extensions(tracee, SYSCALL_CHAINED_EXIT, 0, 0);
		else
			translate_syscall_exit(tracee);

		/* Reset the tracee's status.  */
		tracee->status = 0;

		/* Insert the next chained syscall, if any.  */
		if (tracee->chain.syscalls != NULL)
			chain_next_syscall(tracee);
	}
	else {
		/* Enter stage: original register values are never
		 * restored at the end of this stage.  */
		tracee->restore_original_regs = false;

		print_current_regs(tracee, 3, "sysenter start");

		/* Syscalls chained by PRoot are only reported to the
		 * extensions; those requested by the tracee are
		 * translated.  */
		if (tracee->chain.syscalls != NULL) {
			result = notify_extensions(tracee, SYSCALL_CHAINED_ENTER, 0, 0);
			tracee->restart_how = PTRACE_SYSCALL;
		}
		else {
			save_current_regs(tracee, ORIGINAL);
			result = translate_syscall_enter(tracee);
			save_current_regs(tracee, MODIFIED);
		}

		/* Remember the outcome for the exit stage; on error,
		 * void the syscall and expose the error as its
		 * result.  */
		if (result >= 0) {
			tracee->status = 1;
		}
		else {
			set_sysnum(tracee, PR_void);
			poke_reg(tracee, SYSARG_RESULT, (word_t) result);
			tracee->status = result;
		}

		/* The exit stage won't be hit when restarting with
		 * PTRACE_CONT (i.e. when seccomp is enabled and
		 * there's nothing else to do), so the stack pointer
		 * has to be restored right now.  */
		if (tracee->restart_how == PTRACE_CONT) {
			tracee->status = 0;
			poke_reg(tracee, STACK_POINTER,
				peek_reg(tracee, ORIGINAL, STACK_POINTER));
		}
	}

	(void) push_regs(tracee);

	if (!entering)
		print_current_regs(tracee, 4, "sysexit end");
	else
		print_current_regs(tracee, 5, "sysenter end");
}
/**
 * Exit stage of execve(2) for @tracee: start the loading of the new
 * program.  No error is returned: either it is too late to do
 * anything useful (the calling process is already replaced), or the
 * error reported by the kernel (negative syscall result) is
 * propagated as-is.
 */
void translate_execve_exit(Tracee *tracee)
{
	word_t kernel_result;
	int status;

	if (IS_NOTIFICATION_PTRACED_LOAD_DONE(tracee)) {
		/* Show the ptracer a plain successful execve, not an
		 * unexpected syscall or returned value.  */
		poke_reg(tracee, SYSARG_RESULT, 0);
		set_sysnum(tracee, PR_execve);

		/* According to most ABIs, every register has an
		 * undefined value at program startup, except the
		 * stack pointer, the instruction pointer, the
		 * rtld_fini pointer, and the state flags.  */
		poke_reg(tracee, STACK_POINTER, peek_reg(tracee, ORIGINAL, SYSARG_2));
		poke_reg(tracee, INSTR_POINTER, peek_reg(tracee, ORIGINAL, SYSARG_3));
		poke_reg(tracee, RTLD_FINI, 0);
		poke_reg(tracee, STATE_FLAGS, 0);

		/* Make the current values the ones that get
		 * restored.  */
		save_current_regs(tracee, ORIGINAL);
		tracee->_regs_were_changed = true;

		/* Required to make GDB work correctly under PRoot,
		 * however it deserves to be used unconditionally.  */
		(void) bind_proc_pid_auxv(tracee);

		/* From man 2 ptrace: if the PTRACE_O_TRACEEXEC option
		 * is *not* in effect for the execing tracee, the
		 * kernel delivers an extra SIGTRAP to the tracee
		 * after execve(2) *returns*.  This is an ordinary
		 * signal (similar to one which can be generated by
		 * "kill -TRAP"), not a special kind of ptrace-stop.
		 * Employing PTRACE_GETSIGINFO for this signal returns
		 * si_code set to 0 (SI_USER).  This signal may be
		 * blocked by signal mask, and thus may be delivered
		 * (much) later.
		 *
		 * This signal was delayed up to now since the program
		 * was not fully loaded yet; GDB would get "invalid
		 * address" errors otherwise.  */
		if (!(tracee->as_ptracee.options & PTRACE_O_TRACEEXEC))
			kill(tracee->pid, SIGTRAP);

		return;
	}

	/* The kernel rejected this execve: nothing to commit.  */
	kernel_result = peek_reg(tracee, CURRENT, SYSARG_RESULT);
	if ((int) kernel_result < 0)
		return;

	/* Execve happened; commit the new "/proc/self/exe".  */
	if (tracee->new_exe != NULL) {
		(void) talloc_unlink(tracee, tracee->exe);
		tracee->exe = talloc_reference(tracee, tracee->new_exe);
		talloc_set_name_const(tracee->exe, "$exe");
	}

	/* New processes have no heap.  */
	bzero(tracee->heap, sizeof(Heap));

	/* Transfer the load script to the loader.  */
	status = transfer_load_script(tracee);
	if (status >= 0)
		return;

	note(tracee, ERROR, INTERNAL, "can't transfer load script: %s", strerror(-status));
}
/**
 * Translate the input arguments of the current @tracee's syscall in
 * the @tracee->pid process area.  This function returns -errno if an
 * error occurred from the tracee's point-of-view (EFAULT for
 * instance), otherwise 0.
 *
 * Extensions are notified twice: with SYSCALL_ENTER_START before any
 * translation, and with SYSCALL_ENTER_END (along with the resulting
 * status) afterwards.  A negative status returned by the latter
 * notification overrides the status of the translation.  If the
 * former notification returns a positive value, the translation and
 * the latter notification are both skipped and 0 is returned.
 */
int translate_syscall_enter(Tracee *tracee)
{
	int flags;
	int dirfd;
	int olddirfd;
	int newdirfd;

	int status;
	int status2;

	char path[PATH_MAX];
	char oldpath[PATH_MAX];
	char newpath[PATH_MAX];

	word_t syscall_number;
	bool special = false;

	status = notify_extensions(tracee, SYSCALL_ENTER_START, 0, 0);
	if (status < 0)
		goto end;
	if (status > 0)
		return 0;

	/* Translate input arguments. */
	syscall_number = get_sysnum(tracee, ORIGINAL);
	switch (syscall_number) {
	default:
		/* Nothing to do. */
		status = 0;
		break;

	case PR_execve:
		status = translate_execve_enter(tracee);
		break;

	case PR_ptrace:
		status = translate_ptrace_enter(tracee);
		break;

	case PR_wait4:
	case PR_waitpid:
		status = translate_wait_enter(tracee);
		break;

	case PR_brk:
		translate_brk_enter(tracee);
		status = 0;
		break;

	case PR_getcwd:
		/* Canceled here; nothing else is done at this stage. */
		set_sysnum(tracee, PR_void);
		status = 0;
		break;

	case PR_fchdir:
	case PR_chdir: {
		struct stat statl;
		char *tmp;

		/* The ending "." ensures an error will be reported if
		 * path does not exist or if it is not a directory.  */
		if (syscall_number == PR_chdir) {
			status = get_sysarg_path(tracee, path, SYSARG_1);
			if (status < 0)
				break;

			status = join_paths(2, oldpath, path, ".");
			if (status < 0)
				break;

			dirfd = AT_FDCWD;
		}
		else {
			strcpy(oldpath, ".");
			dirfd = peek_reg(tracee, CURRENT, SYSARG_1);
		}

		status = translate_path(tracee, path, dirfd, oldpath, true);
		if (status < 0)
			break;

		/* lstat(2) returns -1 on failure, which would be
		 * reported as EPERM to the tracee whatever the actual
		 * cause is: report the actual errno instead.  */
		status = lstat(path, &statl);
		if (status < 0) {
			status = -errno;
			break;
		}

		/* Check this directory is accessible.  Leave through
		 * the "end" label as any other failure does, so that
		 * extensions are notified with SYSCALL_ENTER_END.  */
		if ((statl.st_mode & S_IXUSR) == 0) {
			status = -EACCES;
			break;
		}

		/* Sadly this method doesn't detranslate statefully,
		 * this means that there's an ambiguity when several
		 * bindings are from the same host path:
		 *
		 *    $ proot -m /tmp:/a -m /tmp:/b fchdir_getcwd /a
		 *    /b
		 *
		 *    $ proot -m /tmp:/b -m /tmp:/a fchdir_getcwd /a
		 *    /a
		 *
		 * A solution would be to follow each file descriptor
		 * just like it is done for cwd.
		 */

		status = detranslate_path(tracee, path, NULL);
		if (status < 0)
			break;

		/* Remove the trailing "/" or "/.".  */
		chop_finality(path);

		/* Allocate the new cwd before releasing the former
		 * one, so the latter is kept on allocation failure.  */
		tmp = talloc_strdup(tracee->fs, path);
		if (tmp == NULL) {
			status = -ENOMEM;
			break;
		}
		TALLOC_FREE(tracee->fs->cwd);

		tracee->fs->cwd = tmp;
		talloc_set_name_const(tracee->fs->cwd, "$cwd");

		/* The new cwd is recorded above; the syscall itself
		 * is canceled.  */
		set_sysnum(tracee, PR_void);
		status = 0;
		break;
	}

	case PR_bind:
	case PR_connect: {
		word_t address;
		word_t size;

		address = peek_reg(tracee, CURRENT, SYSARG_2);
		size    = peek_reg(tracee, CURRENT, SYSARG_3);

		/* A non-positive status means either an error or that
		 * there is nothing to update in the registers.  */
		status = translate_socketcall_enter(tracee, &address, size);
		if (status <= 0)
			break;

		poke_reg(tracee, SYSARG_2, address);
		poke_reg(tracee, SYSARG_3, sizeof(struct sockaddr_un));

		status = 0;
		break;
	}

/* Address of the n-th (1-based) word of the socketcall(2) argument
 * block; relies on the local variable "args_addr".  */
#define SYSARG_ADDR(n) (args_addr + ((n) - 1) * sizeof_word(tracee))

/* Read a word from the tracee's memory; on error (errno != 0), put
 * forced_errno -- or -errno if forced_errno is 0 -- in status and
 * break out of the enclosing switch.  */
#define PEEK_WORD(addr, forced_errno)			\
	peek_word(tracee, addr);			\
	if (errno != 0) {				\
		status = forced_errno ?: -errno;	\
		break;					\
	}

/* Write a word into the tracee's memory; on error (errno != 0), put
 * -errno in status and break out of the enclosing switch.  */
#define POKE_WORD(addr, value)			\
	poke_word(tracee, addr, value);		\
	if (errno != 0) {			\
		status = -errno;		\
		break;				\
	}

	case PR_accept:
	case PR_accept4:
		/* Nothing special to do if no sockaddr was specified.  */
		if (peek_reg(tracee, ORIGINAL, SYSARG_2) == 0) {
			status = 0;
			break;
		}
		special = true;
		/* Fall through.  */
	case PR_getsockname:
	case PR_getpeername:{
		int size;

		/* Remember: PEEK_WORD puts -errno in status and breaks if an
		 * error occurred.  */
		size = (int) PEEK_WORD(peek_reg(tracee, ORIGINAL, SYSARG_3), special ? -EINVAL : 0);

		/* The "size" argument is both used as an input parameter
		 * (max. size) and as an output parameter (actual size).  The
		 * exit stage needs to know the max. size to not overwrite
		 * anything, that's why it is copied in the 6th argument
		 * (unused) before the kernel updates it.  */
		poke_reg(tracee, SYSARG_6, size);

		status = 0;
		break;
	}

	case PR_socketcall: {
		word_t args_addr;
		word_t sock_addr_saved;
		word_t sock_addr;
		word_t size_addr;
		word_t size;

		args_addr = peek_reg(tracee, CURRENT, SYSARG_2);

		switch (peek_reg(tracee, CURRENT, SYSARG_1)) {
		case SYS_BIND:
		case SYS_CONNECT:
			/* Handle these cases below.  */
			status = 1;
			break;

		case SYS_ACCEPT:
		case SYS_ACCEPT4:
			/* Nothing special to do if no sockaddr was specified.  */
			sock_addr = PEEK_WORD(SYSARG_ADDR(2), 0);
			if (sock_addr == 0) {
				status = 0;
				break;
			}
			special = true;
			/* Fall through.  */
		case SYS_GETSOCKNAME:
		case SYS_GETPEERNAME:
			/* Remember: PEEK_WORD puts -errno in status and breaks
			 * if an error occurred.  */
			size_addr = PEEK_WORD(SYSARG_ADDR(3), 0);
			size = (int) PEEK_WORD(size_addr, special ? -EINVAL : 0);

			/* See case PR_accept for explanation.  */
			poke_reg(tracee, SYSARG_6, size);
			status = 0;
			break;

		default:
			status = 0;
			break;
		}

		/* An error occurred or there's nothing else to do.  */
		if (status <= 0)
			break;

		/* Remember: PEEK_WORD puts -errno in status and breaks if an
		 * error occurred.  */
		sock_addr = PEEK_WORD(SYSARG_ADDR(2), 0);
		size      = PEEK_WORD(SYSARG_ADDR(3), 0);

		sock_addr_saved = sock_addr;
		status = translate_socketcall_enter(tracee, &sock_addr, size);
		if (status <= 0)
			break;

		/* These parameters are used/restored at the exit stage.  */
		poke_reg(tracee, SYSARG_5, sock_addr_saved);
		poke_reg(tracee, SYSARG_6, size);

		/* Remember: POKE_WORD puts -errno in status and breaks if an
		 * error occurred.  */
		POKE_WORD(SYSARG_ADDR(2), sock_addr);
		POKE_WORD(SYSARG_ADDR(3), sizeof(struct sockaddr_un));

		status = 0;
		break;
	}

#undef SYSARG_ADDR
#undef PEEK_WORD
#undef POKE_WORD

	/* Syscalls with a path in the 1st argument, translated in
	 * REGULAR mode.  */
	case PR_access:
	case PR_acct:
	case PR_chmod:
	case PR_chown:
	case PR_chown32:
	case PR_chroot:
	case PR_getxattr:
	case PR_listxattr:
	case PR_mknod:
	case PR_oldstat:
	case PR_creat:
	case PR_removexattr:
	case PR_setxattr:
	case PR_stat:
	case PR_stat64:
	case PR_statfs:
	case PR_statfs64:
	case PR_swapoff:
	case PR_swapon:
	case PR_truncate:
	case PR_truncate64:
	case PR_umount:
	case PR_umount2:
	case PR_uselib:
	case PR_utime:
	case PR_utimes:
		status = translate_sysarg(tracee, SYSARG_1, REGULAR);
		break;

	case PR_open:
		/* SYMLINK mode is used with O_NOFOLLOW and with
		 * O_CREAT|O_EXCL, REGULAR mode otherwise.  */
		flags = peek_reg(tracee, CURRENT, SYSARG_2);

		if (   ((flags & O_NOFOLLOW) != 0)
		    || ((flags & O_EXCL) != 0 && (flags & O_CREAT) != 0))
			status = translate_sysarg(tracee, SYSARG_1, SYMLINK);
		else
			status = translate_sysarg(tracee, SYSARG_1, REGULAR);
		break;

	case PR_fchownat:
	case PR_fstatat64:
	case PR_newfstatat:
	case PR_utimensat:
	case PR_name_to_handle_at:
		dirfd = peek_reg(tracee, CURRENT, SYSARG_1);

		status = get_sysarg_path(tracee, path, SYSARG_2);
		if (status < 0)
			break;

		/* The flags are read from the 5th argument for
		 * fchownat and name_to_handle_at, from the 4th one
		 * for the others.  */
		flags = (   syscall_number == PR_fchownat
			 || syscall_number == PR_name_to_handle_at)
			? peek_reg(tracee, CURRENT, SYSARG_5)
			: peek_reg(tracee, CURRENT, SYSARG_4);

		if ((flags & AT_SYMLINK_NOFOLLOW) != 0)
			status = translate_path2(tracee, dirfd, path, SYSARG_2, SYMLINK);
		else
			status = translate_path2(tracee, dirfd, path, SYSARG_2, REGULAR);
		break;

	case PR_fchmodat:
	case PR_faccessat:
	case PR_futimesat:
	case PR_mknodat:
		dirfd = peek_reg(tracee, CURRENT, SYSARG_1);

		status = get_sysarg_path(tracee, path, SYSARG_2);
		if (status < 0)
			break;

		status = translate_path2(tracee, dirfd, path, SYSARG_2, REGULAR);
		break;

	case PR_inotify_add_watch:
		flags = peek_reg(tracee, CURRENT, SYSARG_3);

		if ((flags & IN_DONT_FOLLOW) != 0)
			status = translate_sysarg(tracee, SYSARG_2, SYMLINK);
		else
			status = translate_sysarg(tracee, SYSARG_2, REGULAR);
		break;

	/* Syscalls with a path in the 1st argument, translated in
	 * SYMLINK mode.  */
	case PR_readlink:
	case PR_lchown:
	case PR_lchown32:
	case PR_lgetxattr:
	case PR_llistxattr:
	case PR_lremovexattr:
	case PR_lsetxattr:
	case PR_lstat:
	case PR_lstat64:
	case PR_oldlstat:
	case PR_unlink:
	case PR_rmdir:
	case PR_mkdir:
		status = translate_sysarg(tracee, SYSARG_1, SYMLINK);
		break;

	case PR_pivot_root:
		status = translate_sysarg(tracee, SYSARG_1, REGULAR);
		if (status < 0)
			break;

		status = translate_sysarg(tracee, SYSARG_2, REGULAR);
		break;

	case PR_linkat:
		olddirfd = peek_reg(tracee, CURRENT, SYSARG_1);
		newdirfd = peek_reg(tracee, CURRENT, SYSARG_3);
		flags    = peek_reg(tracee, CURRENT, SYSARG_5);

		status = get_sysarg_path(tracee, oldpath, SYSARG_2);
		if (status < 0)
			break;

		status = get_sysarg_path(tracee, newpath, SYSARG_4);
		if (status < 0)
			break;

		/* Only the old path depends on AT_SYMLINK_FOLLOW; the
		 * new path is always translated in SYMLINK mode.  */
		if ((flags & AT_SYMLINK_FOLLOW) != 0)
			status = translate_path2(tracee, olddirfd, oldpath, SYSARG_2, REGULAR);
		else
			status = translate_path2(tracee, olddirfd, oldpath, SYSARG_2, SYMLINK);
		if (status < 0)
			break;

		status = translate_path2(tracee, newdirfd, newpath, SYSARG_4, SYMLINK);
		break;

	case PR_mount:
		status = get_sysarg_path(tracee, path, SYSARG_1);
		if (status < 0)
			break;

		/* The 1st argument is translated only if it starts
		 * with '/' or '.'.  The following check covers only
		 * 90% of the cases.  */
		if (path[0] == '/' || path[0] == '.') {
			status = translate_path2(tracee, AT_FDCWD, path, SYSARG_1, REGULAR);
			if (status < 0)
				break;
		}

		status = translate_sysarg(tracee, SYSARG_2, REGULAR);
		break;

	case PR_openat:
		dirfd = peek_reg(tracee, CURRENT, SYSARG_1);
		flags = peek_reg(tracee, CURRENT, SYSARG_3);

		status = get_sysarg_path(tracee, path, SYSARG_2);
		if (status < 0)
			break;

		/* Same rule as for PR_open.  */
		if (   ((flags & O_NOFOLLOW) != 0)
		    || ((flags & O_EXCL) != 0 && (flags & O_CREAT) != 0))
			status = translate_path2(tracee, dirfd, path, SYSARG_2, SYMLINK);
		else
			status = translate_path2(tracee, dirfd, path, SYSARG_2, REGULAR);
		break;

	case PR_readlinkat:
	case PR_unlinkat:
	case PR_mkdirat:
		dirfd = peek_reg(tracee, CURRENT, SYSARG_1);

		status = get_sysarg_path(tracee, path, SYSARG_2);
		if (status < 0)
			break;

		status = translate_path2(tracee, dirfd, path, SYSARG_2, SYMLINK);
		break;

	case PR_link:
	case PR_rename:
		status = translate_sysarg(tracee, SYSARG_1, SYMLINK);
		if (status < 0)
			break;

		status = translate_sysarg(tracee, SYSARG_2, SYMLINK);
		break;

	case PR_renameat:
		olddirfd = peek_reg(tracee, CURRENT, SYSARG_1);
		newdirfd = peek_reg(tracee, CURRENT, SYSARG_3);

		status = get_sysarg_path(tracee, oldpath, SYSARG_2);
		if (status < 0)
			break;

		status = get_sysarg_path(tracee, newpath, SYSARG_4);
		if (status < 0)
			break;

		status = translate_path2(tracee, olddirfd, oldpath, SYSARG_2, SYMLINK);
		if (status < 0)
			break;

		status = translate_path2(tracee, newdirfd, newpath, SYSARG_4, SYMLINK);
		break;

	case PR_symlink:
		/* Only the 2nd argument is translated.  */
		status = translate_sysarg(tracee, SYSARG_2, SYMLINK);
		break;

	case PR_symlinkat:
		newdirfd = peek_reg(tracee, CURRENT, SYSARG_2);

		status = get_sysarg_path(tracee, newpath, SYSARG_3);
		if (status < 0)
			break;

		status = translate_path2(tracee, newdirfd, newpath, SYSARG_3, SYMLINK);
		break;
	}

end:
	/* Extensions get the final status, and may override it with
	 * an error of their own.  */
	status2 = notify_extensions(tracee, SYSCALL_ENTER_END, status, 0);
	if (status2 < 0)
		status = status2;

	return status;
}
/**
 * Enter stage of brk(2) for @tracee: the heap is emulated within an
 * anonymous private mapping.  Depending on the requested address and
 * on the recorded heap state, the syscall is turned into mmap/mmap2
 * (no heap yet), canceled with PR_void (negative size, or both sizes
 * within the preallocated space), or turned into mremap (actual
 * resizing).  This function always returns 0.
 */
word_t translate_brk_enter(Tracee *tracee)
{
	word_t requested_brk;
	size_t current_size;
	size_t wanted_size;

	/* Lazily compute the heap offset: the page size reported by
	 * sysconf(), or 0x1000 if that value is not usable.  */
	if (heap_offset == 0) {
		heap_offset = sysconf(_SC_PAGE_SIZE);
		if ((int) heap_offset <= 0)
			heap_offset = 0x1000;
	}

	/* Non-fixed mmap pages might be placed right after the
	 * emulated heap on some architectures.  A solution is to
	 * preallocate some space to ensure a minimal heap size.  */
	if (tracee->heap->prealloc_size == 0)
		tracee->heap->prealloc_size = MAX(PREALLOCATED_HEAP_SIZE, heap_offset);

	requested_brk = peek_reg(tracee, CURRENT, SYSARG_1);
	DEBUG_BRK("brk(0x%lx)\n", requested_brk);

	/* No heap yet: allocate a new mapping for the emulated one.  */
	if (tracee->heap->base == 0) {
		Sysnum sysnum;

		if (requested_brk != 0)
			notice(tracee, WARNING, INTERNAL,
				"process %d is doing suspicious brk()", tracee->pid);

		/* I don't understand yet why mmap(2) fails (EFAULT)
		 * on architectures that also have mmap2(2).  Maybe
		 * this former implies MAP_FIXED in such cases.  */
		if (detranslate_sysnum(get_abi(tracee), PR_mmap2) != SYSCALL_AVOIDER)
			sysnum = PR_mmap2;
		else
			sysnum = PR_mmap;

		set_sysnum(tracee, sysnum);
		poke_reg(tracee, SYSARG_1 /* address */, 0);
		poke_reg(tracee, SYSARG_2 /* length  */, heap_offset + tracee->heap->prealloc_size);
		poke_reg(tracee, SYSARG_3 /* prot    */, PROT_READ | PROT_WRITE);
		poke_reg(tracee, SYSARG_4 /* flags   */, MAP_PRIVATE | MAP_ANONYMOUS);
		poke_reg(tracee, SYSARG_5 /* fd      */, -1);
		poke_reg(tracee, SYSARG_6 /* offset  */, 0);

		return 0;
	}

	/* The size of the heap can't be negative.  */
	if (requested_brk < tracee->heap->base) {
		set_sysnum(tracee, PR_void);
		return 0;
	}

	wanted_size = requested_brk - tracee->heap->base;
	current_size = tracee->heap->size;

	/* Clear the released memory in preallocated space, so it will
	 * be in the expected state next time it will be
	 * reallocated.  */
	if (   wanted_size < current_size
	    && wanted_size < tracee->heap->prealloc_size) {
		(void) clear_mem(tracee,
				tracee->heap->base + wanted_size,
				MIN(current_size, tracee->heap->prealloc_size) - wanted_size);
	}

	/* No need to use mremap when both old size and new size are
	 * in the preallocated space.  */
	if (   wanted_size <= tracee->heap->prealloc_size
	    && current_size <= tracee->heap->prealloc_size) {
		tracee->heap->size = wanted_size;
		set_sysnum(tracee, PR_void);
		return 0;
	}

	/* Ensure the preallocated space will never be released.  */
	wanted_size = MAX(wanted_size, tracee->heap->prealloc_size);
	current_size = MAX(current_size, tracee->heap->prealloc_size);

	/* Actually resizing.  */
	set_sysnum(tracee, PR_mremap);
	poke_reg(tracee, SYSARG_1 /* old_address */, tracee->heap->base - heap_offset);
	poke_reg(tracee, SYSARG_2 /* old_size    */, current_size + heap_offset);
	poke_reg(tracee, SYSARG_3 /* new_size    */, wanted_size + heap_offset);
	poke_reg(tracee, SYSARG_4 /* flags       */, 0);
	poke_reg(tracee, SYSARG_5 /* new_address */, 0);

	return 0;
}