/**
 * Allocate @size bytes in the @tracee's memory space.  The memory is
 * taken from the tracee's stack by moving its cached stack pointer
 * downward by @size bytes (a negative @size moves it back upward).
 * This function returns the new value of the stack pointer, i.e. the
 * address of the allocated memory in the @tracee's memory space,
 * otherwise 0 if an error occurred.
 */
word_t alloc_mem(Tracee *tracee, ssize_t size)
{
	word_t stack_pointer;

	/* Get the current value of the stack pointer from the tracee's
	 * USER area.  */
	stack_pointer = peek_reg(tracee, CURRENT, STACK_POINTER);

	/* Some ABIs specify an amount of bytes after the stack
	 * pointer that shall not be used by anything but the compiler
	 * (for optimization purpose).  It is skipped only while the
	 * stack pointer still has its original value, that is, on the
	 * first allocation.  */
	if (stack_pointer == peek_reg(tracee, ORIGINAL, STACK_POINTER))
		size += RED_ZONE_SIZE;

	/* Sanity check.  The comparison is done on unsigned values;
	 * the cast makes this conversion explicit instead of relying
	 * on the implicit signed-to-unsigned promotion.  */
	if (   (size > 0 && stack_pointer <= (word_t) size)
	    || (size < 0 && stack_pointer >= ULONG_MAX + size)) {
		notice(tracee, WARNING, INTERNAL, "integer under/overflow detected in %s", __FUNCTION__);
		return 0;
	}

	/* Remember the stack grows downward.  */
	stack_pointer -= size;

	/* Set the new value of the stack pointer in the tracee's USER
	 * area.  */
	poke_reg(tracee, STACK_POINTER, stack_pointer);

	return stack_pointer;
}
/**
 * Exit-stage counterpart of the brk(2) emulation: depending on which
 * syscall was actually performed (as recorded in the MODIFIED
 * registers), update @tracee's heap description and store the
 * emulated brk(2) result in its SYSARG_RESULT register.
 *
 * NOTE(review): heap_offset is defined outside this function; it is
 * presumably the distance between the start of the mapping and the
 * heap base -- confirm against its definition.
 */
void translate_brk_exit(Tracee *tracee)
{
	word_t performed_sysnum;
	word_t kernel_result;
	int error_code;
	int kernel_failed;

	assert(heap_offset > 0);

	performed_sysnum = get_sysnum(tracee, MODIFIED);
	kernel_result    = peek_reg(tracee, CURRENT, SYSARG_RESULT);
	error_code       = (int) kernel_result;

	/* On error, mmap(2) and mremap(2) return -errno (the last 4k
	 * is reserved for this), whereas brk(2) returns the previous
	 * value.  */
	kernel_failed = (error_code < 0 && error_code > -4096);

	switch (performed_sysnum) {
	case PR_mmap:
	case PR_mmap2:
		if (kernel_failed) {
			poke_reg(tracee, SYSARG_RESULT, 0);
		}
		else {
			/* A fresh, empty heap starts right after the offset.  */
			tracee->heap->base = kernel_result + heap_offset;
			tracee->heap->size = 0;
			poke_reg(tracee, SYSARG_RESULT,
				tracee->heap->base + tracee->heap->size);
		}
		break;

	case PR_mremap:
		/* The heap size is updated only if the kernel succeeded
		 * and the mapping was not moved; otherwise the previous
		 * break is reported below.  */
		if (!kernel_failed && tracee->heap->base == kernel_result + heap_offset)
			tracee->heap->size = peek_reg(tracee, MODIFIED, SYSARG_3) - heap_offset;
		/* Fall through.  */

	case PR_void:
		poke_reg(tracee, SYSARG_RESULT, tracee->heap->base + tracee->heap->size);
		break;

	default:
		assert(0);
	}

	DEBUG_BRK("brk() = 0x%lx\n", peek_reg(tracee, CURRENT, SYSARG_RESULT));
}
/**
 * Exit-stage emulation of the wait* syscall issued by @ptracer when
 * it is handled by the ptrace emulation.  This function returns
 * -errno if an error occurred, the pid of the reported ptracee, or 0
 * when there is nothing to report yet (WNOHANG, or the ptracer was
 * put in the waiting state).
 */
int translate_wait_exit(Tracee *ptracer)
{
	Tracee *ptracee;
	word_t wait_options;
	pid_t wanted_pid;
	pid_t reported_pid;
	int status;

	assert(PTRACER.waits_in == WAITS_IN_PROOT);
	PTRACER.waits_in = DOESNT_WAIT;

	wanted_pid   = (pid_t) peek_reg(ptracer, ORIGINAL, SYSARG_1);
	wait_options = peek_reg(ptracer, ORIGINAL, SYSARG_3);

	/* Is there such a stopped ptracee with an event not yet
	 * passed to its ptracer?  */
	ptracee = get_stopped_ptracee(ptracer, wanted_pid, true, wait_options);
	if (ptracee != NULL) {
		status = update_wait_status(ptracer, ptracee);
		if (status < 0)
			return status;

		/* Read the pid before the ptracee is possibly freed.  */
		reported_pid = ptracee->pid;

		/* Zombies can rest in peace once the ptracer is
		 * notified.  */
		if (PTRACEE.is_zombie)
			TALLOC_FREE(ptracee);

		return reported_pid;
	}

	/* Nothing to report: is there any ptracee left at all?  */
	if (PTRACER.nb_ptracees == 0)
		return -ECHILD;

	/* Non blocking wait(2): "if WNOHANG was specified and one or
	 * more child(ren) specified by pid exist, but have not yet
	 * changed state, then 0 is returned.  On error, -1 is
	 * returned." -- man 2 waitpid  */
	if ((wait_options & WNOHANG) != 0) {
		if (has_ptracees(ptracer, wanted_pid, wait_options))
			return 0;
		return -ECHILD;
	}

	/* Otherwise put this ptracer in the "waiting for ptracee"
	 * state, it will be woken up in handle_ptracee_event()
	 * later.  */
	PTRACER.wait_pid     = wanted_pid;
	PTRACER.wait_options = wait_options;

	return 0;
}
/**
 * Replace the current syscall of @tracee with the one described by
 * @modif, and move its arguments accordingly, when needs_kompat()
 * requires it for the given @config and the replacement syscall
 * exists for the tracee's ABI.  This function returns whether the
 * syscall was modified or not.
 */
static bool modify_syscall(Tracee *tracee, const Config *config, const Modif *modif)
{
	size_t shift_index;
	word_t native_sysnum;

	assert(config != NULL);

	if (!needs_kompat(config, modif->expected_release))
		return false;

	/* Check if this syscall is supported on this architecture.  */
	native_sysnum = detranslate_sysnum(get_abi(tracee), modif->new_sysarg_num);
	if (native_sysnum == SYSCALL_AVOIDER)
		return false;

	set_sysnum(tracee, modif->new_sysarg_num);

	/* Shift syscall arguments: each entry moves a run of
	 * consecutive argument registers by a given offset.  */
	for (shift_index = 0; shift_index < MAX_ARG_SHIFT; shift_index++) {
		const Reg first_arg  = modif->shifts[shift_index].sysarg;
		const size_t count   = modif->shifts[shift_index].nb_args;
		const int delta      = modif->shifts[shift_index].offset;
		size_t k = 0;

		while (k < count) {
			poke_reg(tracee, first_arg + k + delta,
				peek_reg(tracee, CURRENT, first_arg + k));
			k++;
		}
	}

	return true;
}
/**
 * Enter-stage handling of the wait syscall made by @ptracer: if the
 * requested pid is one of its ptracees (or -1), the syscall is turned
 * into a "void" syscall so that it gets emulated by PRoot at the exit
 * stage; otherwise it is left to the kernel.  This function always
 * returns 0.
 */
int translate_wait_enter(Tracee *ptracer)
{
	Tracee *ptracee;
	pid_t wanted_pid;

	/* Until proven otherwise, the kernel handles this wait.  */
	PTRACER.waits_in = WAITS_IN_KERNEL;

	/* Don't emulate the ptrace mechanism if it's not a ptracer.  */
	if (PTRACER.nb_ptracees == 0)
		return 0;

	/* Don't emulate the ptrace mechanism if the requested pid is
	 * not a ptracee of this very ptracer.  */
	wanted_pid = (pid_t) peek_reg(ptracer, ORIGINAL, SYSARG_1);
	if (wanted_pid != -1) {
		ptracee = get_tracee(ptracer, wanted_pid, false);
		if (ptracee == NULL)
			return 0;
		if (PTRACEE.ptracer != ptracer)
			return 0;
	}

	/* This syscall is canceled at the enter stage in order to be
	 * handled at the exit stage.  */
	set_sysnum(ptracer, PR_void);
	PTRACER.waits_in = WAITS_IN_PROOT;

	return 0;
}
/**
 * Replace the current syscall of @tracee with the one described by
 * @modif, and move its arguments accordingly, when needs_kompat()
 * requires it for the given @config.  This function returns whether
 * the syscall was modified or not.
 */
static bool modify_syscall(Tracee *tracee, const Config *config, const Modif *modif)
{
	size_t entry;

	assert(config != NULL);

	if (!needs_kompat(config, modif->expected_release))
		return false;

	set_sysnum(tracee, modif->new_sysarg_num);

	/* Shift syscall arguments, one table entry at a time.  */
	for (entry = 0; entry < MAX_ARG_SHIFT; entry++) {
		const Reg base_reg     = modif->shifts[entry].sysarg;
		const size_t run_len   = modif->shifts[entry].nb_args;
		const int displacement = modif->shifts[entry].offset;
		size_t pos;

		for (pos = 0; pos != run_len; pos++) {
			const word_t moved = peek_reg(tracee, CURRENT, base_reg + pos);

			poke_reg(tracee, base_reg + pos + displacement, moved);
		}
	}

	return true;
}
/**
 * Report the pending event of @ptracee to @ptracer's wait(2): the
 * event value is written at the address given as second argument of
 * the wait syscall (if not NULL) and the event is marked as no longer
 * pending.  This function returns -errno if the status could not be
 * written, otherwise @ptracee's pid.
 */
static int update_wait_status(Tracee *ptracer, Tracee *ptracee)
{
	const int wait_event = PTRACEE.event4.ptracer.value;
	const word_t status_addr = peek_reg(ptracer, ORIGINAL, SYSARG_2);
	int has_terminated;

	if (status_addr != 0) {
		poke_int32(ptracer, status_addr, wait_event);
		if (errno != 0)
			return -errno;
	}

	PTRACEE.event4.ptracer.pending = false;

	/* Under PRoot, the kernel will report its termination once
	 * again to its parent since "ptracer != parent" from kernel's
	 * point-of-view.  PRoot has to mask this second notification
	 * not to make the parent/ptracer confused.  */
	has_terminated = (WIFEXITED(wait_event) || WIFSIGNALED(wait_event));
	if (has_terminated && is_direct_ptracee(ptracer, ptracee->pid))
		set_exited_direct_ptracee(ptracer, ptracee->pid);

	return ptracee->pid;
}
/**
 * Set the *cached* value of the given @tracee's @reg to @value.  The
 * registers are flagged as changed only if the new value actually
 * differs from the cached one.
 */
void poke_reg(Tracee *tracee, Reg reg, word_t value)
{
	const word_t cached_value = peek_reg(tracee, CURRENT, reg);

	if (cached_value != value) {
		REG(tracee, CURRENT, reg) = value;
		tracee->_regs_were_changed = true;
	}
}
/**
 * Clear the bits of @discarded_flags in the given @tracee's @sysarg
 * register whenever needs_kompat() reports that @config requires
 * compatibility handling for @expected_release; otherwise the
 * register is left untouched.
 */
static void discard_fd_flags(Tracee *tracee, const Config *config,
			int discarded_flags, int expected_release, Reg sysarg)
{
	if (needs_kompat(config, expected_release)) {
		const word_t current_flags = peek_reg(tracee, CURRENT, sysarg);

		poke_reg(tracee, sysarg, current_flags & ~discarded_flags);
	}
}
/**
 * Update pid & wait status of @ptracer's wait(2) for the given
 * @ptracee.  This function returns -errno if an error occurred, 0 if
 * the wait syscall will be restarted (ie. the event is discarded),
 * otherwise @ptracee's pid.  Note that @ptracee may be freed by this
 * function when it is a zombie.
 */
static int update_wait_status(Tracee *ptracer, Tracee *ptracee)
{
	const int wait_event = PTRACEE.event4.ptracer.value;
	const int has_terminated = (WIFEXITED(wait_event) || WIFSIGNALED(wait_event));
	word_t status_addr;
	int reported_pid;

	/* Special case: the Linux kernel reports the terminating
	 * event issued by a process to both its parent and its
	 * tracer, except when they are the same.  In this case the
	 * Linux kernel reports the terminating event only once to the
	 * tracing parent ...  */
	if (has_terminated && PTRACEE.ptracer == ptracee->parent) {
		/* ... So hide this terminating event (toward its
		 * tracer, ie. PRoot) and make the second one appear
		 * (towards its parent, ie. the ptracer).  This will
		 * ensure its exit status is collected from a kernel
		 * point-of-view (ie. it doesn't stay a zombie
		 * forever).  */
		restart_original_syscall(ptracer);

		/* Detach this ptracee from its ptracer, PRoot doesn't
		 * have anything else to emulate.  */
		detach_from_ptracer(ptracee);

		/* Zombies can rest in peace once the ptracer is
		 * notified.  */
		if (PTRACEE.is_zombie)
			TALLOC_FREE(ptracee);

		return 0;
	}

	/* Write the event at the address passed to wait(2), if any.  */
	status_addr = peek_reg(ptracer, ORIGINAL, SYSARG_2);
	if (status_addr != 0) {
		poke_int32(ptracer, status_addr, wait_event);
		if (errno != 0)
			return -errno;
	}

	PTRACEE.event4.ptracer.pending = false;

	/* Be careful; ptracee might get freed before its pid is
	 * returned.  */
	reported_pid = ptracee->pid;

	/* Zombies can rest in peace once the ptracer is notified.  */
	if (PTRACEE.is_zombie) {
		detach_from_ptracer(ptracee);
		TALLOC_FREE(ptracee);
	}

	return reported_pid;
}
/**
 * Print the syscall name, its six arguments, its result, the stack
 * pointer and the ABI found in the current registers of @tracee, but
 * only if the tracee's verbose level is at least @verbose_level.
 * Note: @message is mixed to the output.
 */
void print_current_regs(Tracee *tracee, int verbose_level, const char *message)
{
	const Reg sysarg_regs[6] = {
		SYSARG_1, SYSARG_2, SYSARG_3, SYSARG_4, SYSARG_5, SYSARG_6
	};
	word_t sysargs[6];
	word_t result;
	word_t stack_pointer;
	unsigned int i;

	if (verbose_level > tracee->verbose)
		return;

	for (i = 0; i < 6; i++)
		sysargs[i] = peek_reg(tracee, CURRENT, sysarg_regs[i]);

	result        = peek_reg(tracee, CURRENT, SYSARG_RESULT);
	stack_pointer = peek_reg(tracee, CURRENT, STACK_POINTER);

	note(tracee, INFO, INTERNAL,
		"pid %d: %s: %s(0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%lx) = 0x%lx [0x%lx, %d]",
		tracee->pid, message,
		stringify_sysnum(get_sysnum(tracee, CURRENT)),
		sysargs[0], sysargs[1], sysargs[2],
		sysargs[3], sysargs[4], sysargs[5],
		result, stack_pointer, get_abi(tracee));
}
/**
 * Allocate @size bytes in the @tracee's memory space by moving its
 * cached stack pointer downward (a negative @size moves it back
 * upward).  This function returns the new stack pointer, i.e. the
 * address of the allocated memory in the @tracee's memory space,
 * otherwise 0 if an error occurred.
 */
word_t alloc_mem(Tracee *tracee, ssize_t size)
{
	word_t current_sp;
	word_t original_sp;
	int would_underflow;
	int would_overflow;

	/* This function should be called in sysenter only since the
	 * stack pointer is systematically restored at the end of
	 * sysexit (except for execve, but in this case the stack
	 * pointer should be handled with care since it is used by the
	 * process to retrieve argc, argv, envp, and auxv).  */
	assert(IS_IN_SYSENTER(tracee));

	/* Both values come from the tracee's cached USER area.  */
	current_sp  = peek_reg(tracee, CURRENT, STACK_POINTER);
	original_sp = peek_reg(tracee, ORIGINAL, STACK_POINTER);

	/* Some ABIs specify an amount of bytes after the stack
	 * pointer that shall not be used by anything but the compiler
	 * (for optimization purpose); skip it as long as the stack
	 * pointer was not moved yet.  */
	if (current_sp == original_sp)
		size += RED_ZONE_SIZE;

	/* Sanity check.  */
	would_underflow = (size > 0 && current_sp <= (word_t) size);
	would_overflow  = (size < 0 && current_sp >= ULONG_MAX + size);
	if (would_underflow || would_overflow) {
		note(tracee, WARNING, INTERNAL, "integer under/overflow detected in %s", __FUNCTION__);
		return 0;
	}

	/* Remember the stack grows downward.  */
	current_sp -= size;

	/* Publish the new stack pointer in the tracee's USER area.  */
	poke_reg(tracee, STACK_POINTER, current_sp);

	return current_sp;
}
/**
 * Translate the syscall @tracee is currently entering or exiting.
 * The stage is deduced from @tracee->status (0 means "enter").  The
 * registers are fetched first -- nothing is done if that fails --
 * and pushed back once the translation is done.
 */
void translate_syscall(Tracee *tracee)
{
	const bool is_enter_stage = (tracee->status == 0);

	assert(tracee->exe != NULL);

	if (fetch_regs(tracee) < 0)
		return;

	if (!is_enter_stage) {
		/* By default, restore original register values at the
		 * end of this stage.  */
		tracee->restore_original_regs = true;

		print_current_regs(tracee, 5, "sysexit start");
		translate_syscall_exit(tracee);
		print_current_regs(tracee, 4, "sysexit end");
	}
	else {
		/* Never restore original register values at the end
		 * of this stage.  */
		tracee->restore_original_regs = false;

		print_current_regs(tracee, 3, "sysenter start");
		save_current_regs(tracee, ORIGINAL);

		translate_syscall_enter(tracee);

		print_current_regs(tracee, 5, "sysenter end");
		save_current_regs(tracee, MODIFIED);

		/* Restore tracee's stack pointer now if it won't hit
		 * the sysexit stage (i.e. when seccomp is enabled and
		 * there's nothing else to do).  */
		if (tracee->restart_how == PTRACE_CONT) {
			const word_t original_sp = peek_reg(tracee, ORIGINAL, STACK_POINTER);

			tracee->status = 0;
			poke_reg(tracee, STACK_POINTER, original_sp);
		}
	}

	(void) push_regs(tracee);
}
/**
 * Copy in @path the C string (PATH_MAX bytes max.) located in the
 * @tracee's memory at the address held by the @reg argument of the
 * current syscall.  A NULL address yields an empty @path.  This
 * function returns -errno if an error occurred, otherwise the size in
 * bytes put into @path.
 */
int get_sysarg_path(const Tracee *tracee, char path[PATH_MAX], Reg reg)
{
	const word_t address = peek_reg(tracee, CURRENT, reg);
	int length;

	/* Check if the parameter is not NULL.  Technically we should
	 * not return an -EFAULT for this special value since it is
	 * allowed for some syscall, utimensat(2) for instance.  */
	if (address == 0) {
		path[0] = '\0';
		return 0;
	}

	/* Get the path from the tracee's memory space, and terminate
	 * it only if the read succeeded.  */
	length = read_path(tracee, path, address);
	if (length >= 0)
		path[length] = '\0';

	return length;
}
/** Return the value of the ORIG_RAX register, converted to int. */
int pt_debugger_x64::syscall()
{
	return static_cast<int>(peek_reg(ORIG_RAX));
}
/**
 * Exit-stage handler for a chained syscall of @tracee: when the
 * current syscall is getsockname (either directly or through
 * socketcall), pass its socket descriptor, socket address and result
 * to add_changed_port_as_entry() and return what it returns.  This
 * function returns 0 for any other syscall.
 *
 * Note: on the socketcall path, a failure to read the arguments from
 * the tracee's memory is not reported; 0 is returned in that case.
 */
static int handle_syschained_exit(Tracee *tracee, Config *config)
{
	const int sysnum = get_sysnum(tracee, CURRENT);
	word_t sockfd;
	word_t sock_addr;
	int result;

	if (sysnum == PR_getsockname) {
		/* Arguments are directly available in the registers.  */
		sockfd    = peek_reg(tracee, CURRENT, SYSARG_1);
		sock_addr = peek_reg(tracee, CURRENT, SYSARG_2);
		result    = peek_reg(tracee, CURRENT, SYSARG_RESULT);

		return add_changed_port_as_entry(tracee, config, sockfd, sock_addr, result);
	}

	if (sysnum == PR_socketcall) {
		/* Arguments are stored in an array of words in the
		 * tracee's memory, pointed to by the second register.  */
		const word_t call      = peek_reg(tracee, CURRENT, SYSARG_1);
		const word_t args_addr = peek_reg(tracee, CURRENT, SYSARG_2);

		if (call != SYS_GETSOCKNAME)
			return 0;

		sockfd = peek_word(tracee, args_addr + (1 - 1) * sizeof_word(tracee));
		if (errno != 0)
			return 0;

		sock_addr = peek_word(tracee, args_addr + (2 - 1) * sizeof_word(tracee));
		if (errno != 0)
			return 0;

		result = peek_reg(tracee, CURRENT, SYSARG_RESULT);

		return add_changed_port_as_entry(tracee, config, sockfd, sock_addr, result);
	}

	return 0;
}
/**
 * Translate the syscall @tracee is currently entering or exiting.
 * Syscalls chained by PRoot are not translated: extensions are
 * notified instead.  The registers are fetched first -- nothing is
 * done if that fails -- and pushed back at the end of the stage.
 */
void translate_syscall(Tracee *tracee)
{
	const bool is_enter_stage = IS_IN_SYSENTER(tracee);
	int status;

	assert(tracee->exe != NULL);

	status = fetch_regs(tracee);
	if (status < 0)
		return;

	if (!is_enter_stage) {
		/* By default, restore original register values at the
		 * end of this stage.  */
		tracee->restore_original_regs = true;

		print_current_regs(tracee, 5, "sysexit start");

		/* Translate the syscall only if it was actually
		 * requested by the tracee, it is not a syscall
		 * chained by PRoot.  */
		if (tracee->chain.syscalls != NULL)
			(void) notify_extensions(tracee, SYSCALL_CHAINED_EXIT, 0, 0);
		else
			translate_syscall_exit(tracee);

		/* Reset the tracee's status.  */
		tracee->status = 0;

		/* Insert the next chained syscall, if any.  The chain
		 * is tested again since it may have been modified by
		 * the calls above.  */
		if (tracee->chain.syscalls != NULL)
			chain_next_syscall(tracee);

		(void) push_regs(tracee);
		print_current_regs(tracee, 4, "sysexit end");
		return;
	}

	/* Never restore original register values at the end of the
	 * enter stage.  */
	tracee->restore_original_regs = false;

	print_current_regs(tracee, 3, "sysenter start");

	/* Translate the syscall only if it was actually requested by
	 * the tracee, it is not a syscall chained by PRoot.  */
	if (tracee->chain.syscalls != NULL) {
		status = notify_extensions(tracee, SYSCALL_CHAINED_ENTER, 0, 0);
		tracee->restart_how = PTRACE_SYSCALL;
	}
	else {
		save_current_regs(tracee, ORIGINAL);
		status = translate_syscall_enter(tracee);
		save_current_regs(tracee, MODIFIED);
	}

	/* Remember the tracee status for the "exit" stage and avoid
	 * the actual syscall if an error was reported by the
	 * translation/extension.  */
	if (status >= 0) {
		tracee->status = 1;
	}
	else {
		set_sysnum(tracee, PR_void);
		poke_reg(tracee, SYSARG_RESULT, (word_t) status);
		tracee->status = status;
	}

	/* Restore tracee's stack pointer now if it won't hit the
	 * sysexit stage (i.e. when seccomp is enabled and there's
	 * nothing else to do).  */
	if (tracee->restart_how == PTRACE_CONT) {
		const word_t original_sp = peek_reg(tracee, ORIGINAL, STACK_POINTER);

		tracee->status = 0;
		poke_reg(tracee, STACK_POINTER, original_sp);
	}

	(void) push_regs(tracee);
	print_current_regs(tracee, 5, "sysenter end" );
}
/** * Force permissions of @path to "rwx" during the path translation of * current @tracee's syscall, in order to simulate CAP_DAC_OVERRIDE. * The original permissions are restored through talloc destructors. * See canonicalize() for the meaning of @is_final. */ static void handle_host_path(const Tracee *tracee, const char *path, bool is_final) { ModifiedNode *node; struct stat perms; mode_t new_mode; int status; /* Get the meta-data */ status = stat(path, &perms); if (status < 0) return; /* Copy the current permissions */ new_mode = perms.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO); /* Add read and write permissions to everything. */ new_mode |= (S_IRUSR | S_IWUSR); /* Always add 'x' bit to directories */ if (S_ISDIR(perms.st_mode)) new_mode |= S_IXUSR; /* Patch the permissions only if needed. */ if (new_mode == (perms.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO))) return; node = talloc_zero(tracee->ctx, ModifiedNode); if (node == NULL) return; if (!is_final) { /* Restore the previous mode of any non final components. */ node->mode = perms.st_mode; } else { switch (get_sysnum(tracee, ORIGINAL)) { /* For chmod syscalls: restore the new mode of the final component. */ case PR_chmod: node->mode = peek_reg(tracee, ORIGINAL, SYSARG_2); break; case PR_fchmodat: node->mode = peek_reg(tracee, ORIGINAL, SYSARG_3); break; /* For stat syscalls: don't touch the mode of the final component. */ case PR_fstatat64: case PR_lstat: case PR_lstat64: case PR_newfstatat: case PR_oldlstat: case PR_oldstat: case PR_stat: case PR_stat64: case PR_statfs: case PR_statfs64: return; /* Otherwise: restore the previous mode of the final component. */ default: node->mode = perms.st_mode; break; } } node->path = talloc_strdup(node, path); if (node->path == NULL) { /* Keep only consistent nodes. */ TALLOC_FREE(node); return; } /* The mode restoration works because Talloc destructors are * called in reverse order. */ talloc_set_destructor(node, restore_mode); (void) chmod(path, new_mode); return; }
/**
 * Enter-stage handler for the socket syscalls of @tracee:
 *
 * - bind/connect (directly or through socketcall): the socket address
 *   is passed to translate_port(), then the syscall is updated to use
 *   the possibly modified address, with an address length forced to
 *   sizeof(struct sockaddr_un).  On the socketcall path, the original
 *   address and length are saved in SYSARG_5 and SYSARG_6 to be
 *   used/restored at the exit stage.
 *
 * - listen (directly or through socketcall): when both
 *   @config->netcoop_mode and @config->need_to_check_new_port are
 *   set, a getsockname syscall is prepared with
 *   prepare_getsockname_chained_syscall().
 *
 * This function returns -errno if an error occurred (including a
 * failure to read/write the socketcall arguments in the tracee's
 * memory), otherwise 0 or the value returned by
 * prepare_getsockname_chained_syscall().
 */
static int handle_sysenter_end(Tracee *tracee, Config *config)
{
	int status;
	int sysnum;

	sysnum = get_sysnum(tracee, CURRENT);

	switch (sysnum) {
	case PR_socketcall: {
		/* With socketcall, the actual arguments are stored in
		 * an array of words in the tracee's memory.  */
		word_t call      = peek_reg(tracee, CURRENT, SYSARG_1);
		word_t args_addr = peek_reg(tracee, CURRENT, SYSARG_2);

#define SYSARG_ADDR(n) (args_addr + ((n) - 1) * sizeof_word(tracee))

		switch (call) {
		case SYS_BIND:
		case SYS_CONNECT: {
			word_t sockfd;
			word_t sock_addr;
			word_t sock_addr_saved;
			word_t size;

			sockfd = peek_word(tracee, SYSARG_ADDR(1));
			if (errno != 0)
				return -errno;

			sock_addr = peek_word(tracee, SYSARG_ADDR(2));
			if (errno != 0)
				return -errno;

			size = peek_word(tracee, SYSARG_ADDR(3));
			if (errno != 0)
				return -errno;

			sock_addr_saved = sock_addr;
			status = translate_port(tracee, config, sockfd, &sock_addr, size,
						call == SYS_BIND);
			if (status < 0)
				return status;

			/* These parameters are used/restored at the exit stage.  */
			poke_reg(tracee, SYSARG_5, sock_addr_saved);
			poke_reg(tracee, SYSARG_6, size);

			poke_word(tracee, SYSARG_ADDR(2), sock_addr);
			if (errno != 0)
				return -errno;

			poke_word(tracee, SYSARG_ADDR(3), sizeof(struct sockaddr_un));
			if (errno != 0)
				return -errno;

			return 0;
		}

		case SYS_LISTEN: {
			word_t sockfd;

			if (!config->netcoop_mode || !config->need_to_check_new_port)
				return 0;

			/* we retrieve this one from the listen() system call */
			sockfd = peek_word(tracee, SYSARG_ADDR(1));
			if (errno != 0)
				return -errno;

			return prepare_getsockname_chained_syscall(tracee, config, sockfd, true);
		}

		default:
			return 0;
		}

#undef SYSARG_ADDR
	}

	case PR_connect:
	case PR_bind: {
		int size;
		int is_bind_syscall;
		word_t sockfd, sock_addr;

		/* Get the socket descriptor, the address of the socket
		 * address and the size of that structure; they are at
		 * the same position for both syscalls.  */
		sockfd    = peek_reg(tracee, CURRENT, SYSARG_1);
		sock_addr = peek_reg(tracee, CURRENT, SYSARG_2);
		size      = (int) peek_reg(tracee, CURRENT, SYSARG_3);

		is_bind_syscall = (sysnum == PR_bind);

		status = translate_port(tracee, config, sockfd, &sock_addr, size, is_bind_syscall);
		if (status < 0)
			return status;

		/* then we modify the syscall argument so that it uses the modified socket address */
		poke_reg(tracee, SYSARG_2, sock_addr);
		poke_reg(tracee, SYSARG_3, sizeof(struct sockaddr_un));

		return 0;
	}

	case PR_listen: {
		word_t sockfd;

		if (!config->netcoop_mode || !config->need_to_check_new_port)
			return 0;

		/* we retrieve this one from the listen() system call */
		sockfd = peek_reg(tracee, CURRENT, SYSARG_1);

		return prepare_getsockname_chained_syscall(tracee, config, sockfd, false);
	}

	default:
		return 0;
	}
}
/**
 * Force current @tracee's syscall to behave as if executed by "root":
 * permission errors of privileged syscalls are turned into success,
 * reported user/group identifiers are forced to 0, and files owned by
 * the current user appear as owned by 0.  This function returns
 * -errno if an error occurred, otherwise 0.
 */
static int handle_sysexit_end(Tracee *tracee)
{
	const word_t sysnum = get_sysnum(tracee, ORIGINAL);

	switch (sysnum) {
	case PR_chroot: {
		char new_root[PATH_MAX];
		word_t path_addr;
		int status;

		/* Override only permission errors.  */
		if ((int) peek_reg(tracee, CURRENT, SYSARG_RESULT) != -EPERM)
			return 0;

		path_addr = peek_reg(tracee, MODIFIED, SYSARG_1);
		status = read_path(tracee, new_root, path_addr);
		if (status < 0)
			return status;

		/* Only "new rootfs == current rootfs" is supported
		 * yet: force success in this case only.  */
		if (compare_paths(get_root(tracee), new_root) == PATHS_ARE_EQUAL)
			poke_reg(tracee, SYSARG_RESULT, 0);

		return 0;
	}

	case PR_setresuid:
	case PR_setresgid:
	case PR_setresuid32:
	case PR_setresgid32:
	case PR_mknod:
	case PR_capset:
	case PR_setxattr:
	case PR_chmod:
	case PR_chown:
	case PR_fchmod:
	case PR_fchown:
	case PR_lchown:
	case PR_chown32:
	case PR_fchown32:
	case PR_lchown32:
	case PR_fchmodat:
	case PR_fchownat:
		/* Override only permission errors, by forcing success.  */
		if ((int) peek_reg(tracee, CURRENT, SYSARG_RESULT) == -EPERM)
			poke_reg(tracee, SYSARG_RESULT, 0);
		return 0;

	case PR_getresuid:
	case PR_getresuid32:
	case PR_getresgid:
	case PR_getresgid32: {
		const Reg id_args[3] = { SYSARG_1, SYSARG_2, SYSARG_3 };
		unsigned int i;

		/* Write 0 at each of the three addresses passed by
		 * the tracee, in order.  */
		for (i = 0; i < 3; i++) {
			poke_mem(tracee, peek_reg(tracee, ORIGINAL, id_args[i]), 0);
			if (errno != 0)
				return -EFAULT;
		}

		/* Force success.  */
		poke_reg(tracee, SYSARG_RESULT, 0);
		return 0;
	}

	case PR_fstatat64:
	case PR_newfstatat:
	case PR_stat64:
	case PR_lstat64:
	case PR_fstat64:
	case PR_stat:
	case PR_lstat:
	case PR_fstat: {
		word_t stat_addr;
		word_t uid_addr, gid_addr;
		word_t uid, gid;
		Reg stat_arg;

		/* Override only if it succeed.  */
		if (peek_reg(tracee, CURRENT, SYSARG_RESULT) != 0)
			return 0;

		/* Get the address of the 'stat' structure.  */
		stat_arg = (sysnum == PR_fstatat64 || sysnum == PR_newfstatat)
			? SYSARG_3
			: SYSARG_2;
		stat_addr = peek_reg(tracee, ORIGINAL, stat_arg);

		uid_addr = stat_addr + offsetof_stat_uid(tracee);
		gid_addr = stat_addr + offsetof_stat_gid(tracee);

		/* Get the uid & gid values from the 'stat' structure.  */
		uid = peek_mem(tracee, uid_addr);
		if (errno != 0)
			uid = 0; /* Not fatal.  */

		gid = peek_mem(tracee, gid_addr);
		if (errno != 0)
			gid = 0; /* Not fatal.  */

		/* These values are 32-bit width, even on 64-bit architecture.  */
		uid &= 0xFFFFFFFF;
		gid &= 0xFFFFFFFF;

		/* Override only if the file is owned by the current user.
		 * Errors are not fatal here.  */
		if (uid == getuid())
			poke_mem(tracee, uid_addr, 0);

		if (gid == getgid())
			poke_mem(tracee, gid_addr, 0);

		return 0;
	}

	case PR_getuid:
	case PR_getgid:
	case PR_getegid:
	case PR_geteuid:
	case PR_getuid32:
	case PR_getgid32:
	case PR_geteuid32:
	case PR_getegid32:
	case PR_setuid:
	case PR_setgid:
	case PR_setfsuid:
	case PR_setfsgid:
	case PR_setuid32:
	case PR_setgid32:
	case PR_setfsuid32:
	case PR_setfsgid32:
		/* Force success.  */
		poke_reg(tracee, SYSARG_RESULT, 0);
		return 0;

	default:
		return 0;
	}
}
/** * Replace current @tracee's syscall with an older and compatible one * whenever it's required, i.e. when the syscall is supported by the * kernel as specified by @config->virtual_release but it isn't * supported by the actual kernel. */ static int handle_sysenter_end(Tracee *tracee, Config *config) { /* Note: syscalls like "openat" can be replaced by "open" since PRoot * has canonicalized "fd + path" into "path". */ switch (get_sysnum(tracee, ORIGINAL)) { case PR_accept4: { Modif modif = { .expected_release = KERNEL_VERSION(2,6,28), .new_sysarg_num = PR_accept, .shifts = NONE }; modify_syscall(tracee, config, &modif); return 0; } case PR_dup3: { Modif modif = { .expected_release = KERNEL_VERSION(2,6,27), .new_sysarg_num = PR_dup2, .shifts = NONE }; /* "If oldfd equals newfd, then dup3() fails with the * error EINVAL" -- man dup3 */ if (peek_reg(tracee, CURRENT, SYSARG_1) == peek_reg(tracee, CURRENT, SYSARG_2)) return -EINVAL; modify_syscall(tracee, config, &modif); return 0; } case PR_epoll_create1: { bool modified; Modif modif = { .expected_release = KERNEL_VERSION(2,6,27), .new_sysarg_num = PR_epoll_create, .shifts = NONE }; /* "the size argument is ignored, but must be greater * than zero" -- man epoll_create */ modified = modify_syscall(tracee, config, &modif); if (modified) poke_reg(tracee, SYSARG_1, 1); return 0; } case PR_epoll_pwait: { Modif modif = { .expected_release = KERNEL_VERSION(2,6,19), .new_sysarg_num = PR_epoll_wait, .shifts = NONE }; modify_syscall(tracee, config, &modif); return 0; } case PR_eventfd2: { bool modified; word_t flags; Modif modif = { .expected_release = KERNEL_VERSION(2,6,27), .new_sysarg_num = PR_eventfd, .shifts = NONE }; modified = modify_syscall(tracee, config, &modif); if (modified) { /* EFD_SEMAPHORE can't be emulated with eventfd. 
*/ flags = peek_reg(tracee, CURRENT, SYSARG_2); if ((flags & EFD_SEMAPHORE) != 0) return -EINVAL; } return 0; } case PR_faccessat: { Modif modif = { .expected_release = KERNEL_VERSION(2,6,16), .new_sysarg_num = PR_access, .shifts = { [0] = { .sysarg = SYSARG_2, .nb_args = 2, .offset = -1 } } }; modify_syscall(tracee, config, &modif); return 0; } case PR_fchmodat: { Modif modif = { .expected_release = KERNEL_VERSION(2,6,16), .new_sysarg_num = PR_chmod, .shifts = { [0] = { .sysarg = SYSARG_2, .nb_args = 2, .offset = -1 } } }; modify_syscall(tracee, config, &modif); return 0; } case PR_fchownat: { word_t flags; Modif modif = { .expected_release = KERNEL_VERSION(2,6,16), .shifts = { [0] = { .sysarg = SYSARG_2, .nb_args = 3, .offset = -1 } } }; flags = peek_reg(tracee, CURRENT, SYSARG_5); modif.new_sysarg_num = ((flags & AT_SYMLINK_NOFOLLOW) != 0 ? PR_lchown : PR_chown); modify_syscall(tracee, config, &modif); return 0; } case PR_fcntl: { word_t command; if (!needs_kompat(config, KERNEL_VERSION(2,6,24))) return 0; command = peek_reg(tracee, ORIGINAL, SYSARG_2); if (command == F_DUPFD_CLOEXEC) poke_reg(tracee, SYSARG_2, F_DUPFD); return 0; } case PR_newfstatat: case PR_fstatat64: { word_t flags; Modif modif = { .expected_release = KERNEL_VERSION(2,6,16), .shifts = { [0] = { .sysarg = SYSARG_2, .nb_args = 2, .offset = -1 } } }; flags = peek_reg(tracee, CURRENT, SYSARG_4); if ((flags & ~AT_SYMLINK_NOFOLLOW) != 0) return -EINVAL; /* Exposed by LTP. */ #if defined(ARCH_X86_64) if ((flags & AT_SYMLINK_NOFOLLOW) != 0) modif.new_sysarg_num = (get_abi(tracee) != ABI_2 ? PR_lstat : PR_lstat64); else modif.new_sysarg_num = (get_abi(tracee) != ABI_2 ? 
PR_stat : PR_stat64); #else if ((flags & AT_SYMLINK_NOFOLLOW) != 0) modif.new_sysarg_num = PR_lstat64; else modif.new_sysarg_num = PR_stat64; #endif modify_syscall(tracee, config, &modif); return 0; } case PR_futex: { word_t operation; static bool warned = false; if (!needs_kompat(config, KERNEL_VERSION(2,6,22)) || config->actual_release == 0) return 0; operation = peek_reg(tracee, CURRENT, SYSARG_2); if ((operation & FUTEX_PRIVATE_FLAG) == 0) return 0; if (!warned) { warned = true; note(tracee, WARNING, USER, "kompat: this kernel doesn't support private futexes " "and PRoot can't emulate them. Expect some troubles..."); } poke_reg(tracee, SYSARG_2, operation & ~FUTEX_PRIVATE_FLAG); return 0; } case PR_futimesat: { Modif modif = { .expected_release = KERNEL_VERSION(2,6,16), .new_sysarg_num = PR_utimes, .shifts = { [0] = { .sysarg = SYSARG_2, .nb_args = 2, .offset = -1 } } }; modify_syscall(tracee, config, &modif); return 0; } case PR_inotify_init1: { Modif modif = { .expected_release = KERNEL_VERSION(2,6,27), .new_sysarg_num = PR_inotify_init, .shifts = NONE }; modify_syscall(tracee, config, &modif); return 0; } case PR_linkat: { word_t flags; Modif modif = { .expected_release = KERNEL_VERSION(2,6,16), .new_sysarg_num = PR_link, .shifts = { [0] = { .sysarg = SYSARG_2, .nb_args = 1, .offset = -1 }, [1] = { .sysarg = SYSARG_4, .nb_args = 1, .offset = -2 } } }; flags = peek_reg(tracee, CURRENT, SYSARG_5); if ((flags & ~AT_SYMLINK_FOLLOW) != 0) return -EINVAL; /* Exposed by LTP. */ modify_syscall(tracee, config, &modif); return 0; } case PR_mkdirat: { Modif modif = { .expected_release = KERNEL_VERSION(2,6,16), .new_sysarg_num = PR_mkdir, .shifts = { [0] = { .sysarg = SYSARG_2, .nb_args = 2, .offset = -1 } } }; modify_syscall(tracee, config, &modif); return 0; }
/**
 * Return the neutral value of the @tracee's syscall number, that is,
 * the value of its SYSARG_NUM register in the requested @version,
 * translated with translate_sysnum() according to the tracee's ABI.
 */
Sysnum get_sysnum(const Tracee *tracee, RegVersion version)
{
	const word_t raw_sysnum = peek_reg(tracee, version, SYSARG_NUM);

	return translate_sysnum(get_abi(tracee), raw_sysnum);
}
/**
 * Make the new child of @parent inherit from it.  Depending on
 * @clone_flags, some information is copied or shared.
 *
 * This function returns 0 on success, otherwise a negative value:
 * -ENOMEM if an allocation failed, or the raw (negative) ptrace(2)
 * status if the pid of the new child can't be retrieved.
 */
int new_child(Tracee *parent, word_t clone_flags)
{
	unsigned long pid;
	Tracee *child;
	int status;

	/* If the tracee calls clone(2) with the CLONE_VFORK flag,
	 * PTRACE_EVENT_VFORK will be delivered instead [...];
	 * otherwise if the tracee calls clone(2) with the exit signal
	 * set to SIGCHLD, PTRACE_EVENT_FORK will be delivered [...]
	 *
	 * -- ptrace(2) man-page
	 *
	 * That means we have to check if it's actually a clone(2) in
	 * order to get the right flags.  */
	status = fetch_regs(parent);
	if (status >= 0 && get_sysnum(parent, CURRENT) == PR_clone)
		clone_flags = peek_reg(parent, CURRENT, SYSARG_1);

	/* Get the pid of the parent's new child.  */
	status = ptrace(PTRACE_GETEVENTMSG, parent->pid, NULL, &pid);
	if (status < 0) {
		notice(parent, WARNING, SYSTEM, "ptrace(GETEVENTMSG)");
		return status;
	}

	child = get_tracee(parent, (pid_t) pid, true);
	if (child == NULL) {
		notice(parent, WARNING, SYSTEM, "running out of memory");
		return -ENOMEM;
	}

	/* Sanity checks: the child must not have inherited anything
	 * yet.  */
	assert(child != NULL
	    && child->exe == NULL
	    && child->cmdline == NULL
	    && child->fs->cwd == NULL
	    && child->fs->bindings.pending == NULL
	    && child->fs->bindings.guest == NULL
	    && child->fs->bindings.host == NULL
	    && child->qemu == NULL
	    && child->glue == NULL);

	child->verbose = parent->verbose;
	child->seccomp = parent->seccomp;
	child->sysexit_pending = parent->sysexit_pending;

	/* If CLONE_VM is set, the calling process and the child
	 * process run in the same memory space [...] any memory
	 * mapping or unmapping performed with mmap(2) or munmap(2) by
	 * the child or calling process also affects the other
	 * process.
	 *
	 * If CLONE_VM is not set, the child process runs in a
	 * separate copy of the memory space of the calling process at
	 * the time of clone().  Memory writes or file
	 * mappings/unmappings performed by one of the processes do
	 * not affect the other, as with fork(2).
	 *
	 * -- clone(2) man-page
	 */
	TALLOC_FREE(child->heap);
	child->heap = ((clone_flags & CLONE_VM) != 0)
		? talloc_reference(child, parent->heap)
		: talloc_memdup(child, parent->heap, sizeof(Heap));
	if (child->heap == NULL)
		return -ENOMEM;

	/* If CLONE_FS is set, the parent and the child process share
	 * the same file system information.  This includes the root
	 * of the file system, the current working directory, and the
	 * umask.  Any call to chroot(2), chdir(2), or umask(2)
	 * performed by the parent process or the child process also
	 * affects the other process.
	 *
	 * If CLONE_FS is not set, the child process works on a copy
	 * of the file system information of the parent process at the
	 * time of the clone() call.  Calls to chroot(2), chdir(2),
	 * umask(2) performed later by one of the processes do not
	 * affect the other process.
	 *
	 * -- clone(2) man-page
	 */
	TALLOC_FREE(child->fs);
	if ((clone_flags & CLONE_FS) != 0) {
		/* File-system name-space is shared.  */
		child->fs = talloc_reference(child, parent->fs);

		/* Report a failed reference the same way as for the
		 * heap and for the copied name-space, instead of
		 * leaving the child without any file-system
		 * name-space.  */
		if (child->fs == NULL)
			return -ENOMEM;
	}
	else {
		/* File-system name-space is copied.  */
		child->fs = talloc_zero(child, FileSystemNameSpace);
		if (child->fs == NULL)
			return -ENOMEM;

		child->fs->cwd = talloc_strdup(child->fs, parent->fs->cwd);
		if (child->fs->cwd == NULL)
			return -ENOMEM;
		talloc_set_name_const(child->fs->cwd, "$cwd");

		/* Bindings are shared across file-system name-spaces since a
		 * "mount --bind" made by a process affects all other processes
		 * under Linux.  Actually they are copied when a sub
		 * reconfiguration occurred (nested proot or chroot(2)).  */
		child->fs->bindings.guest = talloc_reference(child->fs, parent->fs->bindings.guest);
		child->fs->bindings.host  = talloc_reference(child->fs, parent->fs->bindings.host);
	}

	/* The path to the executable and the command-line are unshared only
	 * once the child process does a call to execve(2).  */
	child->exe = talloc_reference(child, parent->exe);
	child->cmdline = talloc_reference(child, parent->cmdline);

	child->qemu_pie_workaround = parent->qemu_pie_workaround;
	child->qemu = talloc_reference(child, parent->qemu);
	child->glue = talloc_reference(child, parent->glue);

	child->host_ldso_paths  = talloc_reference(child, parent->host_ldso_paths);
	child->guest_ldso_paths = talloc_reference(child, parent->guest_ldso_paths);

	inherit_extensions(child, parent, false);

	/* Restart the child tracee if it was already alive but
	 * stopped until that moment.  */
	if (child->sigstop == SIGSTOP_PENDING) {
		child->sigstop = SIGSTOP_ALLOWED;

		/* A child that can't be restarted is released, but this
		 * is not reported as an error to the caller.  */
		status = ptrace(PTRACE_SYSCALL, child->pid, NULL, 0);
		if (status < 0)
			TALLOC_FREE(child);
	}

	return 0;
}
/**
 * Translate the output arguments of the current @tracee's syscall in
 * the @tracee->pid process area.  This function sets the result of
 * this syscall to @tracee->status if an error occurred previously
 * during the translation, that is, if @tracee->status is less than 0.
 *
 * Extensions are notified twice: with SYSCALL_EXIT_START before any
 * translation (a negative status becomes the syscall result, a
 * positive one ends the function immediately), and with
 * SYSCALL_EXIT_END once the translation is done (a negative status
 * overrides the syscall result).
 */
void translate_syscall_exit(Tracee *tracee)
{
	word_t syscall_number;
	word_t syscall_result;
	int status;

	status = notify_extensions(tracee, SYSCALL_EXIT_START, 0, 0);
	if (status < 0) {
		poke_reg(tracee, SYSARG_RESULT, (word_t) status);
		goto end;
	}
	if (status > 0)
		return;

	/* Set the tracee's errno if an error occurred previously during
	 * the translation.  */
	if (tracee->status < 0) {
		poke_reg(tracee, SYSARG_RESULT, (word_t) tracee->status);
		goto end;
	}

	/* Translate output arguments:
	 * - break: update the syscall result register with "status"
	 * - goto end: nothing else to do.
	 */
	syscall_number = get_sysnum(tracee, ORIGINAL);
	syscall_result = peek_reg(tracee, CURRENT, SYSARG_RESULT);
	switch (syscall_number) {
	case PR_brk:
		translate_brk_exit(tracee);
		goto end;

	case PR_getcwd: {
		char path[PATH_MAX];
		size_t new_size;
		size_t size;
		word_t output;

		size = (size_t) peek_reg(tracee, ORIGINAL, SYSARG_2);
		if (size == 0) {
			status = -EINVAL;
			break;
		}

		/* Ensure cwd still exists.  */
		status = translate_path(tracee, path, AT_FDCWD, ".", false);
		if (status < 0)
			break;

		new_size = strlen(tracee->fs->cwd) + 1;
		if (size < new_size) {
			status = -ERANGE;
			break;
		}

		/* Overwrite the path.  */
		output = peek_reg(tracee, ORIGINAL, SYSARG_1);
		status = write_data(tracee, output, tracee->fs->cwd, new_size);
		if (status < 0)
			break;

		/* The value of "status" is used to update the returned value
		 * in translate_syscall_exit().  */
		status = new_size;
		break;
	}

	case PR_accept:
	case PR_accept4:
		/* Nothing special to do if no sockaddr was specified.  */
		if (peek_reg(tracee, ORIGINAL, SYSARG_2) == 0)
			goto end;
		/* Fall through.  */
	case PR_getsockname:
	case PR_getpeername: {
		word_t sock_addr;
		word_t size_addr;
		word_t max_size;

		/* Error reported by the kernel.  */
		if ((int) syscall_result < 0)
			goto end;

		sock_addr = peek_reg(tracee, ORIGINAL, SYSARG_2);
		size_addr = peek_reg(tracee, MODIFIED, SYSARG_3);
		max_size  = peek_reg(tracee, MODIFIED, SYSARG_6);

		status = translate_socketcall_exit(tracee, sock_addr, size_addr, max_size);
		if (status < 0)
			break;

		/* Don't overwrite the syscall result.  */
		goto end;
	}

/* Note: PEEK_WORD and POKE_WORD expand to several statements and
 * "break" out of the innermost enclosing switch on error, that's why
 * they can't be wrapped into a do { } while (0).  */
#define SYSARG_ADDR(n) (args_addr + ((n) - 1) * sizeof_word(tracee))

#define POKE_WORD(addr, value)			\
	poke_word(tracee, addr, value);		\
	if (errno != 0) {			\
		status = -errno;		\
		break;				\
	}

#define PEEK_WORD(addr)				\
	peek_word(tracee, addr);		\
	if (errno != 0) {			\
		status = -errno;		\
		break;				\
	}

	case PR_socketcall: {
		word_t args_addr;
		word_t sock_addr;
		word_t size_addr;
		word_t max_size;

		args_addr = peek_reg(tracee, ORIGINAL, SYSARG_2);

		switch (peek_reg(tracee, ORIGINAL, SYSARG_1)) {
		case SYS_ACCEPT:
		case SYS_ACCEPT4:
			/* Nothing special to do if no sockaddr was specified.  */
			sock_addr = PEEK_WORD(SYSARG_ADDR(2));
			if (sock_addr == 0)
				goto end;
			/* Fall through.  */
		case SYS_GETSOCKNAME:
		case SYS_GETPEERNAME:
			/* Handle these cases below.  */
			status = 1;
			break;

		case SYS_BIND:
		case SYS_CONNECT:
			/* Restore the initial parameters: this memory was
			 * overwritten at the enter stage.  Remember: POKE_WORD
			 * puts -errno in status and breaks if an error
			 * occurred.  */
			POKE_WORD(SYSARG_ADDR(2), peek_reg(tracee, MODIFIED, SYSARG_5));
			POKE_WORD(SYSARG_ADDR(3), peek_reg(tracee, MODIFIED, SYSARG_6));

			status = 0;
			break;

		default:
			status = 0;
			break;
		}

		/* Error reported by the kernel or there's nothing else to do.  */
		if ((int) syscall_result < 0 || status == 0)
			goto end;

		/* An error occurred in SYS_BIND or SYS_CONNECT.  */
		if (status < 0)
			break;

		/* Remember: PEEK_WORD puts -errno in status and breaks if an
		 * error occurred.  */
		sock_addr = PEEK_WORD(SYSARG_ADDR(2));
		size_addr = PEEK_WORD(SYSARG_ADDR(3));
		max_size  = peek_reg(tracee, MODIFIED, SYSARG_6);

		status = translate_socketcall_exit(tracee, sock_addr, size_addr, max_size);
		if (status < 0)
			break;

		/* Don't overwrite the syscall result.  */
		goto end;
	}

#undef SYSARG_ADDR
#undef PEEK_WORD
#undef POKE_WORD

	case PR_fchdir:
	case PR_chdir:
		/* These syscalls are fully emulated, see enter.c for details
		 * (like errors).  */
		status = 0;
		break;

	case PR_rename:
	case PR_renameat: {
		char old_path[PATH_MAX];
		char new_path[PATH_MAX];
		ssize_t old_length;
		ssize_t new_length;
		Comparison comparison;
		Reg old_reg;
		Reg new_reg;
		char *tmp;

		/* Error reported by the kernel.  */
		if ((int) syscall_result < 0)
			goto end;

		if (syscall_number == PR_rename) {
			old_reg = SYSARG_1;
			new_reg = SYSARG_2;
		}
		else {
			old_reg = SYSARG_2;
			new_reg = SYSARG_4;
		}

		/* Get the old path, then convert it to the same
		 * "point-of-view" as tracee->fs->cwd (guest).  */
		status = read_path(tracee, old_path, peek_reg(tracee, MODIFIED, old_reg));
		if (status < 0)
			break;

		status = detranslate_path(tracee, old_path, NULL);
		if (status < 0)
			break;
		old_length = (status > 0 ? status - 1 : (ssize_t) strlen(old_path));

		/* Nothing special to do if the moved path is not the
		 * current working directory.  */
		comparison = compare_paths(old_path, tracee->fs->cwd);
		if (comparison != PATH1_IS_PREFIX && comparison != PATHS_ARE_EQUAL) {
			status = 0;
			break;
		}

		/* Get the new path, then convert it to the same
		 * "point-of-view" as tracee->fs->cwd (guest).  */
		status = read_path(tracee, new_path, peek_reg(tracee, MODIFIED, new_reg));
		if (status < 0)
			break;

		status = detranslate_path(tracee, new_path, NULL);
		if (status < 0)
			break;
		new_length = (status > 0 ? status - 1 : (ssize_t) strlen(new_path));

		/* Sanity check: cwd has to fit into old_path.  */
		if (strlen(tracee->fs->cwd) >= PATH_MAX) {
			status = 0;
			break;
		}
		strcpy(old_path, tracee->fs->cwd);

		/* Update the virtual current working directory.  */
		substitute_path_prefix(old_path, old_length, new_path, new_length);

		tmp = talloc_strdup(tracee->fs, old_path);
		if (tmp == NULL) {
			status = -ENOMEM;
			break;
		}

		TALLOC_FREE(tracee->fs->cwd);
		tracee->fs->cwd = tmp;

		status = 0;
		break;
	}

	case PR_readlink:
	case PR_readlinkat: {
		char referee[PATH_MAX];
		char referer[PATH_MAX];
		size_t old_size;
		size_t new_size;
		size_t max_size;
		word_t input;
		word_t output;

		/* Error reported by the kernel.  */
		if ((int) syscall_result < 0)
			goto end;

		old_size = syscall_result;

		if (syscall_number == PR_readlink) {
			output   = peek_reg(tracee, ORIGINAL, SYSARG_2);
			max_size = peek_reg(tracee, ORIGINAL, SYSARG_3);
			input    = peek_reg(tracee, MODIFIED, SYSARG_1);
		}
		else {
			output   = peek_reg(tracee, ORIGINAL, SYSARG_3);
			max_size = peek_reg(tracee, ORIGINAL, SYSARG_4);
			input    = peek_reg(tracee, MODIFIED, SYSARG_2);
		}

		if (max_size > PATH_MAX)
			max_size = PATH_MAX;

		if (max_size == 0) {
			status = -EINVAL;
			break;
		}

		/* "referee" holds PATH_MAX bytes, one of them being
		 * needed for the terminating byte appended below:
		 * never copy more than what it can store.  */
		if (old_size >= PATH_MAX) {
			status = -ENAMETOOLONG;
			break;
		}

		/* The kernel does NOT put the NULL terminating byte for
		 * readlink(2).  */
		status = read_data(tracee, referee, output, old_size);
		if (status < 0)
			break;
		referee[old_size] = '\0';

		/* Not optimal but safe (path is fully translated).  */
		status = read_path(tracee, referer, input);
		if (status < 0)
			break;

		if (status >= PATH_MAX) {
			status = -ENAMETOOLONG;
			break;
		}

		status = detranslate_path(tracee, referee, referer);
		if (status < 0)
			break;

		/* The original path doesn't require any transformation, i.e.
		 * it is a symmetric binding.  */
		if (status == 0)
			goto end;

		/* Overwrite the path.  Note: the output buffer might be
		 * initialized with zeros but it was updated with the kernel
		 * result, and then with the detranslated result.  The latter
		 * might be shorter than the former, so it's safer to add a
		 * NULL terminating byte when possible.  This problem was
		 * exposed by IDA Demo 6.3.  */
		if ((size_t) status < max_size) {
			new_size = status - 1;
			status = write_data(tracee, output, referee, status);
		}
		else {
			new_size = max_size;
			status = write_data(tracee, output, referee, max_size);
		}
		if (status < 0)
			break;

		/* The value of "status" is used to update the returned value
		 * in translate_syscall_exit().  */
		status = new_size;
		break;
	}

#if defined(ARCH_X86_64)
	case PR_uname: {
		struct utsname utsname;
		word_t address;
		size_t size;

		if (get_abi(tracee) != ABI_2)
			goto end;

		/* Error reported by the kernel.  */
		if ((int) syscall_result < 0)
			goto end;

		address = peek_reg(tracee, ORIGINAL, SYSARG_1);

		status = read_data(tracee, &utsname, address, sizeof(utsname));
		if (status < 0)
			break;

		/* Some 32-bit programs like package managers can be
		 * confused when the kernel reports "x86_64".  */
		size = sizeof(utsname.machine);
		strncpy(utsname.machine, "i686", size);
		utsname.machine[size - 1] = '\0';

		status = write_data(tracee, address, &utsname, sizeof(utsname));
		if (status < 0)
			break;

		status = 0;
		break;
	}
#endif

	case PR_execve:
		translate_execve_exit(tracee);
		goto end;

	case PR_ptrace:
		status = translate_ptrace_exit(tracee);
		break;

	case PR_wait4:
	case PR_waitpid:
		if (tracee->as_ptracer.waits_in != WAITS_IN_PROOT)
			goto end;

		status = translate_wait_exit(tracee);
		break;

	case PR_setrlimit:
	case PR_prlimit64:
		/* Error reported by the kernel.  */
		if ((int) syscall_result < 0)
			goto end;

		status = translate_setrlimit_exit(tracee, syscall_number == PR_prlimit64);
		if (status < 0)
			break;

		/* Don't overwrite the syscall result.  */
		goto end;

	default:
		goto end;
	}

	poke_reg(tracee, SYSARG_RESULT, (word_t) status);

end:
	status = notify_extensions(tracee, SYSCALL_EXIT_END, status, 0);
	if (status < 0)
		poke_reg(tracee, SYSARG_RESULT, (word_t) status);
}
/**
 * Handle the exit stage of execve(2) for @tracee, that is, start the
 * loading of the new program.  No error is returned: either it is too
 * late to do anything useful (the calling process is already
 * replaced), or the error reported by the kernel (negative syscall
 * result) is propagated as-is.
 */
void translate_execve_exit(Tracee *tracee)
{
	word_t value;
	int error;

	if (IS_NOTIFICATION_PTRACED_LOAD_DONE(tracee)) {
		/* Don't confuse the ptracer with an unexpected
		 * syscall number or returned value.  */
		poke_reg(tracee, SYSARG_RESULT, 0);
		set_sysnum(tracee, PR_execve);

		/* According to most ABIs, all registers have
		 * undefined values at program startup except:
		 *
		 * - the stack pointer
		 * - the instruction pointer
		 * - the rtld_fini pointer
		 * - the state flags
		 *
		 * The first two are taken from the 2nd and 3rd
		 * original syscall arguments, the last two are
		 * cleared.
		 */
		value = peek_reg(tracee, ORIGINAL, SYSARG_2);
		poke_reg(tracee, STACK_POINTER, value);

		value = peek_reg(tracee, ORIGINAL, SYSARG_3);
		poke_reg(tracee, INSTR_POINTER, value);

		poke_reg(tracee, RTLD_FINI, 0);
		poke_reg(tracee, STATE_FLAGS, 0);

		/* Record the current register values as the ORIGINAL
		 * ones and flag the registers as changed.  */
		save_current_regs(tracee, ORIGINAL);
		tracee->_regs_were_changed = true;

		/* Required to make GDB work correctly under PRoot;
		 * its result is deliberately ignored.  */
		(void) bind_proc_pid_auxv(tracee);

		/* If the PTRACE_O_TRACEEXEC option is *not* in effect
		 * for the execing tracee, the kernel delivers an
		 * extra SIGTRAP to the tracee after execve(2)
		 * *returns*.  This is an ordinary signal (similar to
		 * one which can be generated by "kill -TRAP"), not a
		 * special kind of ptrace-stop.  Employing
		 * PTRACE_GETSIGINFO for this signal returns si_code
		 * set to 0 (SI_USER).  This signal may be blocked by
		 * signal mask, and thus may be delivered (much)
		 * later. -- man 2 ptrace
		 *
		 * This signal was delayed until now since the program
		 * was not fully loaded yet; GDB would get "invalid
		 * address" errors otherwise.  */
		if (!(tracee->as_ptracee.options & PTRACE_O_TRACEEXEC))
			kill(tracee->pid, SIGTRAP);

		return;
	}

	/* Nothing to load if the kernel reported an error.  */
	value = peek_reg(tracee, CURRENT, SYSARG_RESULT);
	if ((int) value < 0)
		return;

	/* Execve happened; commit the new "/proc/self/exe".  */
	if (tracee->new_exe != NULL) {
		(void) talloc_unlink(tracee, tracee->exe);
		tracee->exe = talloc_reference(tracee, tracee->new_exe);
		talloc_set_name_const(tracee->exe, "$exe");
	}

	/* New processes have no heap.  */
	bzero(tracee->heap, sizeof(Heap));

	/* Hand the load script over to the loader; a failure can only
	 * be reported at this point.  */
	error = transfer_load_script(tracee);
	if (error < 0)
		note(tracee, ERROR, INTERNAL, "can't transfer load script: %s",
			strerror(-error));
}
/**
 * Convert @tracee->load_info into a load script, then transfer the
 * latter into @tracee's memory, right below its current stack
 * pointer.  The stack pointer and the first user argument of @tracee
 * are updated to point to the load script.
 *
 * This function returns -ENOMEM if the temporary buffer can't be
 * allocated, the (negative) status of write_data() if the transfer
 * failed, otherwise 0.
 */
static int transfer_load_script(Tracee *tracee)
{
	const word_t stack_pointer = peek_reg(tracee, CURRENT, STACK_POINTER);
	static word_t page_size = 0;
	static word_t page_mask = 0;

	word_t entry_point;

	size_t script_size;
	size_t strings_size;
	size_t string1_size;
	size_t string2_size;
	size_t string3_size;
	size_t padding_size;

	word_t string1_address;
	word_t string2_address;
	word_t string3_address;

	void *buffer;
	size_t buffer_size;

	bool needs_executable_stack;
	LoadStatement *statement;
	void *cursor;
	int status;

	/* The page size is queried only once, with a fallback to 4 kB
	 * if it is not available.  */
	if (page_size == 0) {
		page_size = sysconf(_SC_PAGE_SIZE);
		if ((int) page_size <= 0)
			page_size = 0x1000;
		page_mask = ~(page_size - 1);
	}

	needs_executable_stack = (tracee->load_info->needs_executable_stack
				|| (   tracee->load_info->interp != NULL
				    && tracee->load_info->interp->needs_executable_stack));

	/* Strings addresses are required to generate the load script,
	 * for "open" actions.  Since I want to generate it in one
	 * pass, these strings will be put right below the current
	 * stack pointer -- the only known addresses so far -- in the
	 * "strings area".  */
	string1_size = strlen(tracee->load_info->user_path) + 1;

	string2_size = (tracee->load_info->interp == NULL ? 0
			: strlen(tracee->load_info->interp->user_path) + 1);

	string3_size = (tracee->load_info->raw_path == tracee->load_info->user_path ? 0
			: strlen(tracee->load_info->raw_path) + 1);

	/* A padding will be appended at the end of the load script
	 * (a.k.a "strings area") to ensure this latter is aligned to
	 * a word boundary, for sake of performance.  */
	padding_size = (stack_pointer - string1_size - string2_size - string3_size)
			% sizeof_word(tracee);

	strings_size = string1_size + string2_size + string3_size + padding_size;
	string1_address = stack_pointer - strings_size;
	string2_address = stack_pointer - strings_size + string1_size;
	string3_address = (string3_size == 0
			? string1_address
			: stack_pointer - strings_size + string1_size + string2_size);

	/* Compute the size of the load script.  */
	script_size =
		LOAD_STATEMENT_SIZE(*statement, open)
		+ (LOAD_STATEMENT_SIZE(*statement, mmap)
			* talloc_array_length(tracee->load_info->mappings))
		+ (tracee->load_info->interp == NULL ? 0
			: LOAD_STATEMENT_SIZE(*statement, open)
			+ (LOAD_STATEMENT_SIZE(*statement, mmap)
				* talloc_array_length(tracee->load_info->interp->mappings)))
		+ (needs_executable_stack ? LOAD_STATEMENT_SIZE(*statement, make_stack_exec) : 0)
		+ LOAD_STATEMENT_SIZE(*statement, start);

	/* Allocate enough room for both the load script and the
	 * strings area.  */
	buffer_size = script_size + strings_size;
	buffer = talloc_zero_size(tracee->ctx, buffer_size);
	if (buffer == NULL)
		return -ENOMEM;

	/* Note: "cursor" is moved forward through a byte pointer since
	 * arithmetic on "void *" is a GNU extension, not ISO C.  */
	cursor = buffer;

	/* Load script statement: open.  */
	statement = cursor;
	statement->action = LOAD_ACTION_OPEN;
	statement->open.string_address = string1_address;

	cursor = (uint8_t *) cursor + LOAD_STATEMENT_SIZE(*statement, open);

	/* Load script statements: mmap.  */
	cursor = transcript_mappings(cursor, tracee->load_info->mappings);

	if (tracee->load_info->interp != NULL) {
		/* Load script statement: open.  */
		statement = cursor;
		statement->action = LOAD_ACTION_OPEN_NEXT;
		statement->open.string_address = string2_address;

		cursor = (uint8_t *) cursor + LOAD_STATEMENT_SIZE(*statement, open);

		/* Load script statements: mmap.  */
		cursor = transcript_mappings(cursor, tracee->load_info->interp->mappings);

		/* The interpreter, if any, is the one that gets started.  */
		entry_point = ELF_FIELD(tracee->load_info->interp->elf_header, entry);
	}
	else
		entry_point = ELF_FIELD(tracee->load_info->elf_header, entry);

	if (needs_executable_stack) {
		/* Load script statement: stack_exec.  */
		statement = cursor;
		statement->action = LOAD_ACTION_MAKE_STACK_EXEC;
		statement->make_stack_exec.start = stack_pointer & page_mask;

		cursor = (uint8_t *) cursor + LOAD_STATEMENT_SIZE(*statement, make_stack_exec);
	}

	/* Load script statement: start.  */
	statement = cursor;

	/* Start of the program slightly differs when ptraced.  */
	if (tracee->as_ptracee.ptracer != NULL)
		statement->action = LOAD_ACTION_START_TRACED;
	else
		statement->action = LOAD_ACTION_START;

	statement->start.stack_pointer = stack_pointer;
	statement->start.entry_point   = entry_point;
	statement->start.at_phent = ELF_FIELD(tracee->load_info->elf_header, phentsize);
	statement->start.at_phnum = ELF_FIELD(tracee->load_info->elf_header, phnum);
	statement->start.at_entry = ELF_FIELD(tracee->load_info->elf_header, entry);
	statement->start.at_phdr  = ELF_FIELD(tracee->load_info->elf_header, phoff)
				  + tracee->load_info->mappings[0].addr;
	statement->start.at_execfn = string3_address;

	cursor = (uint8_t *) cursor + LOAD_STATEMENT_SIZE(*statement, start);

	/* Sanity check.  */
	assert((uintptr_t) cursor - (uintptr_t) buffer == script_size);

	/* Convert the load script to the expected format: each 64-bit
	 * word is narrowed, in place, to a 32-bit one.  */
	if (is_32on64_mode(tracee)) {
		size_t i;
		for (i = 0; (uint8_t *) buffer + i * sizeof(uint64_t) < (uint8_t *) cursor; i++)
			((uint32_t *) buffer)[i] = ((uint64_t *) buffer)[i];
	}

	/* Concatenate the load script and the strings.  */
	memcpy(cursor, tracee->load_info->user_path, string1_size);
	cursor = (uint8_t *) cursor + string1_size;

	if (string2_size != 0) {
		memcpy(cursor, tracee->load_info->interp->user_path, string2_size);
		cursor = (uint8_t *) cursor + string2_size;
	}

	if (string3_size != 0) {
		memcpy(cursor, tracee->load_info->raw_path, string3_size);
		cursor = (uint8_t *) cursor + string3_size;
	}

	/* Sanity check.  */
	cursor = (uint8_t *) cursor + padding_size;
	assert((uintptr_t) cursor - (uintptr_t) buffer == buffer_size);

	/* Allocate enough room in tracee's memory for the load
	 * script, and make the first user argument point to this
	 * location.  Note that it is safe to update the stack pointer
	 * manually since we are in execve sysexit.  However it should
	 * be done before transferring data since the kernel might not
	 * allow page faults below the stack pointer.  */
	poke_reg(tracee, STACK_POINTER, stack_pointer - buffer_size);
	poke_reg(tracee, USERARG_1, stack_pointer - buffer_size);

	/* Copy everything in the tracee's memory at once.  */
	status = write_data(tracee, stack_pointer - buffer_size, buffer, buffer_size);
	if (status < 0)
		return status;

	/* Tracee's stack content is now as follows:
	 *
	 *   +------------+ <- initial stack pointer (higher address)
	 *   |  padding   |
	 *   +------------+
	 *   |  string3   |
	 *   +------------+
	 *   |  string2   |
	 *   +------------+
	 *   |  string1   |
	 *   +------------+
	 *   |   start    |
	 *   +------------+
	 *   | mmap anon  |
	 *   +------------+
	 *   | mmap file  |
	 *   +------------+
	 *   | open next  |
	 *   +------------+
	 *   | mmap anon. |
	 *   +------------+
	 *   | mmap file  |
	 *   +------------+
	 *   |   open     |
	 *   +------------+ <- stack pointer, userarg1 (word aligned)
	 */

	/* Remember we are in the sysexit stage, so be sure the
	 * current register values will be used as-is at the end.  */
	save_current_regs(tracee, ORIGINAL);
	tracee->_regs_were_changed = true;

	return 0;
}
/**
 * Translate the input arguments of the current @tracee's syscall in the
 * @tracee->pid process area.  This function returns -errno if an error
 * occurred from the tracee's point-of-view (EFAULT for instance),
 * otherwise 0.
 *
 * Extensions are notified twice: with SYSCALL_ENTER_START before any
 * translation (a negative status skips the translation, a positive
 * one makes this function return 0 immediately), and with
 * SYSCALL_ENTER_END once the translation is done (a negative status
 * overrides the returned value).
 */
int translate_syscall_enter(Tracee *tracee)
{
	int flags;
	int dirfd;
	int olddirfd;
	int newdirfd;

	int status;
	int status2;

	char path[PATH_MAX];
	char oldpath[PATH_MAX];
	char newpath[PATH_MAX];

	word_t syscall_number;
	bool special = false;

	status = notify_extensions(tracee, SYSCALL_ENTER_START, 0, 0);
	if (status < 0)
		goto end;
	if (status > 0)
		return 0;

	/* Translate input arguments.  */
	syscall_number = get_sysnum(tracee, ORIGINAL);
	switch (syscall_number) {
	default:
		/* Nothing to do.  */
		status = 0;
		break;

	case PR_execve:
		status = translate_execve_enter(tracee);
		break;

	case PR_ptrace:
		status = translate_ptrace_enter(tracee);
		break;

	case PR_wait4:
	case PR_waitpid:
		status = translate_wait_enter(tracee);
		break;

	case PR_brk:
		translate_brk_enter(tracee);
		status = 0;
		break;

	case PR_getcwd:
		/* Replaced with PR_void here; the exit stage writes the
		 * virtual cwd itself.  */
		set_sysnum(tracee, PR_void);
		status = 0;
		break;

	case PR_fchdir:
	case PR_chdir: {
		struct stat statl;
		char *tmp;

		/* The ending "." ensures an error will be reported if
		 * path does not exist or if it is not a directory.  */
		if (syscall_number == PR_chdir) {
			status = get_sysarg_path(tracee, path, SYSARG_1);
			if (status < 0)
				break;

			status = join_paths(2, oldpath, path, ".");
			if (status < 0)
				break;

			dirfd = AT_FDCWD;
		}
		else {
			strcpy(oldpath, ".");
			dirfd = peek_reg(tracee, CURRENT, SYSARG_1);
		}

		status = translate_path(tracee, path, dirfd, oldpath, true);
		if (status < 0)
			break;

		/* lstat(2) returns -1 and reports the reason through
		 * errno: convert it, otherwise the tracee would always
		 * see -1 (that is, -EPERM).  */
		if (lstat(path, &statl) < 0) {
			status = -errno;
			break;
		}

		/* Check this directory is accessible.  Note: this error
		 * goes through the common exit path, like any other, so
		 * that extensions are notified of it too.  */
		if ((statl.st_mode & S_IXUSR) == 0) {
			status = -EACCES;
			break;
		}

		/* Sadly this method doesn't detranslate statefully,
		 * this means that there's an ambiguity when several
		 * bindings are from the same host path:
		 *
		 *    $ proot -m /tmp:/a -m /tmp:/b fchdir_getcwd /a
		 *    /b
		 *
		 *    $ proot -m /tmp:/b -m /tmp:/a fchdir_getcwd /a
		 *    /a
		 *
		 * A solution would be to follow each file descriptor
		 * just like it is done for cwd.
		 */

		status = detranslate_path(tracee, path, NULL);
		if (status < 0)
			break;

		/* Remove the trailing "/" or "/.".  */
		chop_finality(path);

		tmp = talloc_strdup(tracee->fs, path);
		if (tmp == NULL) {
			status = -ENOMEM;
			break;
		}
		TALLOC_FREE(tracee->fs->cwd);

		tracee->fs->cwd = tmp;
		talloc_set_name_const(tracee->fs->cwd, "$cwd");

		/* The virtual cwd is now updated; the syscall itself is
		 * replaced with PR_void.  */
		set_sysnum(tracee, PR_void);
		status = 0;
		break;
	}

	case PR_bind:
	case PR_connect: {
		word_t address;
		word_t size;

		address = peek_reg(tracee, CURRENT, SYSARG_2);
		size    = peek_reg(tracee, CURRENT, SYSARG_3);

		status = translate_socketcall_enter(tracee, &address, size);
		if (status <= 0)
			break;

		poke_reg(tracee, SYSARG_2, address);
		poke_reg(tracee, SYSARG_3, sizeof(struct sockaddr_un));

		status = 0;
		break;
	}

/* Note: PEEK_WORD and POKE_WORD expand to several statements and
 * "break" out of the innermost enclosing switch on error, that's why
 * they can't be wrapped into a do { } while (0).  */
#define SYSARG_ADDR(n) (args_addr + ((n) - 1) * sizeof_word(tracee))

#define PEEK_WORD(addr, forced_errno)			\
	peek_word(tracee, addr);			\
	if (errno != 0) {				\
		status = (forced_errno) ?: -errno;	\
		break;					\
	}

#define POKE_WORD(addr, value)				\
	poke_word(tracee, addr, value);			\
	if (errno != 0) {				\
		status = -errno;			\
		break;					\
	}

	case PR_accept:
	case PR_accept4:
		/* Nothing special to do if no sockaddr was specified.  */
		if (peek_reg(tracee, ORIGINAL, SYSARG_2) == 0) {
			status = 0;
			break;
		}
		special = true;
		/* Fall through.  */
	case PR_getsockname:
	case PR_getpeername: {
		int size;

		/* Remember: PEEK_WORD puts -errno in status and breaks if an
		 * error occurred.  */
		size = (int) PEEK_WORD(peek_reg(tracee, ORIGINAL, SYSARG_3), special ? -EINVAL : 0);

		/* The "size" argument is both used as an input parameter
		 * (max. size) and as an output parameter (actual size).  The
		 * exit stage needs to know the max. size to not overwrite
		 * anything, that's why it is copied in the 6th argument
		 * (unused) before the kernel updates it.  */
		poke_reg(tracee, SYSARG_6, size);

		status = 0;
		break;
	}

	case PR_socketcall: {
		word_t args_addr;
		word_t sock_addr_saved;
		word_t sock_addr;
		word_t size_addr;
		word_t size;

		args_addr = peek_reg(tracee, CURRENT, SYSARG_2);

		switch (peek_reg(tracee, CURRENT, SYSARG_1)) {
		case SYS_BIND:
		case SYS_CONNECT:
			/* Handle these cases below.  */
			status = 1;
			break;

		case SYS_ACCEPT:
		case SYS_ACCEPT4:
			/* Nothing special to do if no sockaddr was specified.  */
			sock_addr = PEEK_WORD(SYSARG_ADDR(2), 0);
			if (sock_addr == 0) {
				status = 0;
				break;
			}
			special = true;
			/* Fall through.  */
		case SYS_GETSOCKNAME:
		case SYS_GETPEERNAME:
			/* Remember: PEEK_WORD puts -errno in status and breaks
			 * if an error occurred.  */
			size_addr = PEEK_WORD(SYSARG_ADDR(3), 0);
			size = (int) PEEK_WORD(size_addr, special ? -EINVAL : 0);

			/* See case PR_accept for explanation.  */
			poke_reg(tracee, SYSARG_6, size);
			status = 0;
			break;

		default:
			status = 0;
			break;
		}

		/* An error occurred or there's nothing else to do.  */
		if (status <= 0)
			break;

		/* Remember: PEEK_WORD puts -errno in status and breaks if an
		 * error occurred.  */
		sock_addr = PEEK_WORD(SYSARG_ADDR(2), 0);
		size      = PEEK_WORD(SYSARG_ADDR(3), 0);

		sock_addr_saved = sock_addr;
		status = translate_socketcall_enter(tracee, &sock_addr, size);
		if (status <= 0)
			break;

		/* These parameters are used/restored at the exit stage.  */
		poke_reg(tracee, SYSARG_5, sock_addr_saved);
		poke_reg(tracee, SYSARG_6, size);

		/* Remember: POKE_WORD puts -errno in status and breaks if an
		 * error occurred.  */
		POKE_WORD(SYSARG_ADDR(2), sock_addr);
		POKE_WORD(SYSARG_ADDR(3), sizeof(struct sockaddr_un));

		status = 0;
		break;
	}

#undef SYSARG_ADDR
#undef PEEK_WORD
#undef POKE_WORD

	/* Path in the 1st argument, symlinks are dereferenced.  */
	case PR_access:
	case PR_acct:
	case PR_chmod:
	case PR_chown:
	case PR_chown32:
	case PR_chroot:
	case PR_getxattr:
	case PR_listxattr:
	case PR_mknod:
	case PR_oldstat:
	case PR_creat:
	case PR_removexattr:
	case PR_setxattr:
	case PR_stat:
	case PR_stat64:
	case PR_statfs:
	case PR_statfs64:
	case PR_swapoff:
	case PR_swapon:
	case PR_truncate:
	case PR_truncate64:
	case PR_umount:
	case PR_umount2:
	case PR_uselib:
	case PR_utime:
	case PR_utimes:
		status = translate_sysarg(tracee, SYSARG_1, REGULAR);
		break;

	case PR_open:
		flags = peek_reg(tracee, CURRENT, SYSARG_2);

		if (   ((flags & O_NOFOLLOW) != 0)
		    || ((flags & O_EXCL) != 0 && (flags & O_CREAT) != 0))
			status = translate_sysarg(tracee, SYSARG_1, SYMLINK);
		else
			status = translate_sysarg(tracee, SYSARG_1, REGULAR);
		break;

	case PR_fchownat:
	case PR_fstatat64:
	case PR_newfstatat:
	case PR_utimensat:
	case PR_name_to_handle_at:
		dirfd = peek_reg(tracee, CURRENT, SYSARG_1);

		status = get_sysarg_path(tracee, path, SYSARG_2);
		if (status < 0)
			break;

		/* The flags are in the 5th argument for fchownat and
		 * name_to_handle_at, in the 4th one for the others.  */
		flags = (   syscall_number == PR_fchownat
			 || syscall_number == PR_name_to_handle_at)
			? peek_reg(tracee, CURRENT, SYSARG_5)
			: peek_reg(tracee, CURRENT, SYSARG_4);

		if ((flags & AT_SYMLINK_NOFOLLOW) != 0)
			status = translate_path2(tracee, dirfd, path, SYSARG_2, SYMLINK);
		else
			status = translate_path2(tracee, dirfd, path, SYSARG_2, REGULAR);
		break;

	case PR_fchmodat:
	case PR_faccessat:
	case PR_futimesat:
	case PR_mknodat:
		dirfd = peek_reg(tracee, CURRENT, SYSARG_1);

		status = get_sysarg_path(tracee, path, SYSARG_2);
		if (status < 0)
			break;

		status = translate_path2(tracee, dirfd, path, SYSARG_2, REGULAR);
		break;

	case PR_inotify_add_watch:
		flags = peek_reg(tracee, CURRENT, SYSARG_3);

		if ((flags & IN_DONT_FOLLOW) != 0)
			status = translate_sysarg(tracee, SYSARG_2, SYMLINK);
		else
			status = translate_sysarg(tracee, SYSARG_2, REGULAR);
		break;

	/* Path in the 1st argument, the final symlink is not
	 * dereferenced.  */
	case PR_readlink:
	case PR_lchown:
	case PR_lchown32:
	case PR_lgetxattr:
	case PR_llistxattr:
	case PR_lremovexattr:
	case PR_lsetxattr:
	case PR_lstat:
	case PR_lstat64:
	case PR_oldlstat:
	case PR_unlink:
	case PR_rmdir:
	case PR_mkdir:
		status = translate_sysarg(tracee, SYSARG_1, SYMLINK);
		break;

	case PR_pivot_root:
		status = translate_sysarg(tracee, SYSARG_1, REGULAR);
		if (status < 0)
			break;

		status = translate_sysarg(tracee, SYSARG_2, REGULAR);
		break;

	case PR_linkat:
		olddirfd = peek_reg(tracee, CURRENT, SYSARG_1);
		newdirfd = peek_reg(tracee, CURRENT, SYSARG_3);
		flags    = peek_reg(tracee, CURRENT, SYSARG_5);

		status = get_sysarg_path(tracee, oldpath, SYSARG_2);
		if (status < 0)
			break;

		status = get_sysarg_path(tracee, newpath, SYSARG_4);
		if (status < 0)
			break;

		if ((flags & AT_SYMLINK_FOLLOW) != 0)
			status = translate_path2(tracee, olddirfd, oldpath, SYSARG_2, REGULAR);
		else
			status = translate_path2(tracee, olddirfd, oldpath, SYSARG_2, SYMLINK);
		if (status < 0)
			break;

		status = translate_path2(tracee, newdirfd, newpath, SYSARG_4, SYMLINK);
		break;

	case PR_mount:
		status = get_sysarg_path(tracee, path, SYSARG_1);
		if (status < 0)
			break;

		/* The following check covers only 90% of the cases.  */
		if (path[0] == '/' || path[0] == '.') {
			status = translate_path2(tracee, AT_FDCWD, path, SYSARG_1, REGULAR);
			if (status < 0)
				break;
		}

		status = translate_sysarg(tracee, SYSARG_2, REGULAR);
		break;

	case PR_openat:
		dirfd = peek_reg(tracee, CURRENT, SYSARG_1);
		flags = peek_reg(tracee, CURRENT, SYSARG_3);

		status = get_sysarg_path(tracee, path, SYSARG_2);
		if (status < 0)
			break;

		if (   ((flags & O_NOFOLLOW) != 0)
		    || ((flags & O_EXCL) != 0 && (flags & O_CREAT) != 0))
			status = translate_path2(tracee, dirfd, path, SYSARG_2, SYMLINK);
		else
			status = translate_path2(tracee, dirfd, path, SYSARG_2, REGULAR);
		break;

	case PR_readlinkat:
	case PR_unlinkat:
	case PR_mkdirat:
		dirfd = peek_reg(tracee, CURRENT, SYSARG_1);

		status = get_sysarg_path(tracee, path, SYSARG_2);
		if (status < 0)
			break;

		status = translate_path2(tracee, dirfd, path, SYSARG_2, SYMLINK);
		break;

	case PR_link:
	case PR_rename:
		status = translate_sysarg(tracee, SYSARG_1, SYMLINK);
		if (status < 0)
			break;

		status = translate_sysarg(tracee, SYSARG_2, SYMLINK);
		break;

	case PR_renameat:
		olddirfd = peek_reg(tracee, CURRENT, SYSARG_1);
		newdirfd = peek_reg(tracee, CURRENT, SYSARG_3);

		status = get_sysarg_path(tracee, oldpath, SYSARG_2);
		if (status < 0)
			break;

		status = get_sysarg_path(tracee, newpath, SYSARG_4);
		if (status < 0)
			break;

		status = translate_path2(tracee, olddirfd, oldpath, SYSARG_2, SYMLINK);
		if (status < 0)
			break;

		status = translate_path2(tracee, newdirfd, newpath, SYSARG_4, SYMLINK);
		break;

	case PR_symlink:
		status = translate_sysarg(tracee, SYSARG_2, SYMLINK);
		break;

	case PR_symlinkat:
		newdirfd = peek_reg(tracee, CURRENT, SYSARG_2);

		status = get_sysarg_path(tracee, newpath, SYSARG_3);
		if (status < 0)
			break;

		status = translate_path2(tracee, newdirfd, newpath, SYSARG_3, SYMLINK);
		break;
	}

end:
	/* Extensions have the last word on the returned status.  */
	status2 = notify_extensions(tracee, SYSCALL_ENTER_END, status, 0);
	if (status2 < 0)
		status = status2;

	return status;
}
// Return the value of register ARM_r7, narrowed to int, as the syscall number.
int pt_debugger_arm::syscall() {
    int number = (int) peek_reg(ARM_r7);
    return number;
}
// Return the value of register ARM_r0 as the result.
long pt_debugger_arm::result() {
    long value = peek_reg(ARM_r0);
    return value;
}
// Return the value of register RAX as the result.
long pt_debugger_x64::result() {
    long value = peek_reg(RAX);
    return value;
}