__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
	struct thread_info *ti;

	enter_from_user_mode();
	local_irq_enable();
	ti = current_thread_info();
	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
		nr = syscall_trace_enter(regs);

	/*
	 * NB: Native and x32 syscalls are dispatched from the same
	 * table.  The only functional difference is the x32 bit in
	 * regs->orig_ax, which changes the behavior of some syscalls.
	 */
	nr &= __SYSCALL_MASK;
	if (likely(nr < NR_syscalls)) {
		nr = array_index_nospec(nr, NR_syscalls);
		regs->ax = sys_call_table[nr](regs);
	}

	syscall_return_slowpath(regs);
}
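/*
 * Illustration only, not part of the kernel source above: a minimal
 * sketch of the user-side convention that lands in do_syscall_64().
 * The SYSCALL instruction takes the syscall number in rax and the
 * arguments in rdi, rsi, rdx, r10, r8 and r9; the CPU clobbers rcx
 * and r11 (saved RIP and RFLAGS), and the result comes back in rax.
 * The helper name raw_write64() is made up for this example.
 */
static long raw_write64(int fd, const void *buf, unsigned long len)
{
	long ret;

	asm volatile ("syscall"
		      : "=a" (ret)
		      : "a" (1 /* __NR_write on x86_64 */),
			"D" (fd), "S" (buf), "d" (len)
		      : "rcx", "r11", "memory");
	return ret;
}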
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible long do_fast_syscall_32(struct pt_regs *regs)
{
	/*
	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
	 * convention.  Adjust regs so it looks like we entered using int80.
	 */
	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
		vdso_image_32.sym_int80_landing_pad;

	/*
	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
	 * Fix it up.
	 */
	regs->ip = landing_pad;

	enter_from_user_mode();

	local_irq_enable();

	/* Fetch EBP from where the vDSO stashed it. */
	if (
#ifdef CONFIG_X86_64
		/*
		 * Micro-optimization: the pointer we're following is explicitly
		 * 32 bits, so it can't be out of range.
		 */
		__get_user(*(u32 *)&regs->bp,
			   (u32 __user __force *)(unsigned long)(u32)regs->sp)
#else
		get_user(*(u32 *)&regs->bp,
			 (u32 __user __force *)(unsigned long)(u32)regs->sp)
#endif
		) {

		/* User code screwed up. */
		local_irq_disable();
		regs->ax = -EFAULT;
		prepare_exit_to_usermode(regs);
		return 0;	/* Keep it simple: use IRET. */
	}

	/* Now this is just like a normal syscall. */
	do_syscall_32_irqs_on(regs);

#ifdef CONFIG_X86_64
	/*
	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
	 * bother with SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 */
	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
	/*
	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 *
	 * We don't allow syscalls at all from VM86 mode, but we still
	 * need to check VM, because we might be returning from sys_vm86.
	 */
	return static_cpu_has(X86_FEATURE_SEP) &&
		regs->cs == __USER_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF |
				X86_EFLAGS_VM)) == 0;
#endif
}
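/*
 * Illustration only, not part of the kernel source above: 32-bit user
 * code normally reaches do_fast_syscall_32() by calling the vDSO's
 * __kernel_vsyscall entry, advertised through the AT_SYSINFO auxv
 * entry, rather than issuing int $0x80 itself.  The register
 * convention is the same as for int $0x80, and the vDSO stashes EBP
 * on the user stack, which is why the code above reloads it from
 * regs->sp.  A sketch, assuming an i386 build (gcc -m32) with a vDSO;
 * the helper name vdso_getpid() is made up for this example.
 */
#include <elf.h>
#include <sys/auxv.h>

static long vdso_getpid(void)
{
	void *entry = (void *)getauxval(AT_SYSINFO);
	long ret;

	asm volatile ("call *%[entry]"
		      : "=a" (ret)
		      : [entry] "r" (entry),
			"a" (20 /* __NR_getpid on i386 */)
		      : "memory");
	return ret;
}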
/* Handles int $0x80 */
__visible void do_int80_syscall_32(struct pt_regs *regs)
{
	enter_from_user_mode();
	local_irq_enable();
	do_syscall_32_irqs_on(regs);
}
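/*
 * Illustration only, not part of the kernel source above: the legacy
 * entry that do_int80_syscall_32() services.  The i386 ABI passes the
 * syscall number in eax and the arguments in ebx, ecx, edx, esi, edi
 * and ebp; the return value comes back in eax.  A sketch for a 32-bit
 * build (gcc -m32); the helper name int80_write() is made up here.
 */
static long int80_write(int fd, const void *buf, unsigned long len)
{
	long ret;

	asm volatile ("int $0x80"
		      : "=a" (ret)
		      : "a" (4 /* __NR_write on i386 */),
			"b" (fd), "c" (buf), "d" (len)
		      : "memory");
	return ret;
}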
/*
 * We can return 0 to resume the syscall or anything else to go to phase
 * 2.  If we resume the syscall, we need to put something appropriate in
 * regs->orig_ax.
 *
 * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
 * are fully functional.
 *
 * For phase 2's benefit, our return value is:
 * 0:			resume the syscall
 * 1:			go to phase 2; no seccomp phase 2 needed
 * anything else:	go to phase 2; pass return value to seccomp
 */
unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
{
	struct thread_info *ti = pt_regs_to_thread_info(regs);
	unsigned long ret = 0;
	u32 work;

	if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
		BUG_ON(regs != task_pt_regs(current));

	work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;

#ifdef CONFIG_CONTEXT_TRACKING
	/*
	 * If TIF_NOHZ is set, we are required to call user_exit() before
	 * doing anything that could touch RCU.
	 */
	if (work & _TIF_NOHZ) {
		enter_from_user_mode();
		work &= ~_TIF_NOHZ;
	}
#endif

#ifdef CONFIG_SECCOMP
	/*
	 * Do seccomp first -- it should minimize exposure of other
	 * code, and keeping seccomp fast is probably more valuable
	 * than the rest of this.
	 */
	if (work & _TIF_SECCOMP) {
		struct seccomp_data sd;

		sd.arch = arch;
		sd.nr = regs->orig_ax;
		sd.instruction_pointer = regs->ip;
#ifdef CONFIG_X86_64
		if (arch == AUDIT_ARCH_X86_64) {
			sd.args[0] = regs->di;
			sd.args[1] = regs->si;
			sd.args[2] = regs->dx;
			sd.args[3] = regs->r10;
			sd.args[4] = regs->r8;
			sd.args[5] = regs->r9;
		} else
#endif
		{
			sd.args[0] = regs->bx;
			sd.args[1] = regs->cx;
			sd.args[2] = regs->dx;
			sd.args[3] = regs->si;
			sd.args[4] = regs->di;
			sd.args[5] = regs->bp;
		}

		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);

		ret = seccomp_phase1(&sd);
		if (ret == SECCOMP_PHASE1_SKIP) {
			regs->orig_ax = -1;
			ret = 0;
		} else if (ret != SECCOMP_PHASE1_OK) {
			return ret;  /* Go directly to phase 2 */
		}

		work &= ~_TIF_SECCOMP;
	}
#endif

	/* Do our best to finish without phase 2. */
	if (work == 0)
		return ret;  /* seccomp and/or nohz only (ret == 0 here) */

#ifdef CONFIG_AUDITSYSCALL
	if (work == _TIF_SYSCALL_AUDIT) {
		/*
		 * If there is no more work to be done except auditing,
		 * then audit in phase 1.  Phase 2 always audits, so, if
		 * we audit here, then we can't go on to phase 2.
		 */
		do_audit_syscall_entry(regs, arch);
		return 0;
	}
#endif

	return 1;  /* Something is enabled that we can't handle in phase 1 */
}
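/*
 * Illustration only, not part of the kernel source above: _TIF_SECCOMP
 * gets set by installing a seccomp-BPF filter from userspace, after
 * which the phase-1 code above builds a struct seccomp_data (nr, arch,
 * instruction_pointer, args[]) for every syscall.  A minimal sketch
 * using the prctl(2) interface; the filter below simply allows every
 * syscall, and install_allow_all_filter() is a made-up helper name.
 */
#include <sys/prctl.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

static int install_allow_all_filter(void)
{
	struct sock_filter insns[] = {
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len	= sizeof(insns) / sizeof(insns[0]),
		.filter	= insns,
	};

	/* Required so an unprivileged task may install a filter. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return -1;
	return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
}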