/**
 * @brief Find a free IDT slot and setup the interrupt handler.
 */
static int
vmbus_vector_alloc(void)
{
	int vector;
	uintptr_t func;
	struct gate_descriptor *ip;

	/*
	 * Search backwards from the highest IDT vector available for use
	 * as vmbus channel callback vector. We install 'hv_vmbus_callback'
	 * handler at that vector and use it to interrupt vcpus.
	 */
	vector = APIC_SPURIOUS_INT;
	while (--vector >= APIC_IPI_INTS) {
		ip = &idt[vector];
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func == (uintptr_t)&IDTVEC(rsvd)) {
#ifdef __i386__
			setidt(vector, IDTVEC(hv_vmbus_callback),
			    SDT_SYS386IGT, SEL_KPL,
			    GSEL(GCODE_SEL, SEL_KPL));
#else
			setidt(vector, IDTVEC(hv_vmbus_callback),
			    SDT_SYSIGT, SEL_KPL, 0);
#endif
			return (vector);
		}
	}
	return (0);
}
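/*
 * Illustrative sketch (not kernel code): how a 32-bit handler address is
 * split across the gd_looffset/gd_hioffset fields of a gate descriptor and
 * reconstructed with the same shift-or expression used in the IDT scan
 * above. The struct and the handler value are simplified stand-ins,
 * assuming a 64-bit host; the real gate descriptor carries more fields.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct fake_gate {
	uint32_t gd_looffset : 16;	/* low 16 bits of handler address */
	uint32_t gd_hioffset : 16;	/* high 16 bits of handler address */
};

int
main(void)
{
	uintptr_t handler = 0xc0123456u;	/* arbitrary example address */
	struct fake_gate g = {
		.gd_looffset = handler & 0xffff,
		.gd_hioffset = handler >> 16,
	};
	/* Same reconstruction expression as vmbus_vector_alloc() above. */
	uintptr_t func = ((long)g.gd_hioffset << 16 | g.gd_looffset);

	assert(func == handler);
	printf("reconstructed %#lx\n", (unsigned long)func);
	return (0);
}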
/*
 * Map the local APIC and setup necessary interrupt vectors.
 */
void
lapic_init(vm_paddr_t addr)
{
	u_int regs[4];
	int i, arat;

	/* Map the local APIC and setup the spurious interrupt handler. */
	KASSERT(trunc_page(addr) == addr,
	    ("local APIC not aligned on a page boundary"));
	lapic_paddr = addr;
	lapic = pmap_mapdev(addr, sizeof(lapic_t));
	setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_APIC, SEL_KPL,
	    GSEL_APIC);

	/* Perform basic initialization of the BSP's local APIC. */
	lapic_enable();

	/* Set BSP's per-CPU local APIC ID. */
	PCPU_SET(apic_id, lapic_id());

	/* Local APIC timer interrupt. */
	setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC);

	/* Local APIC error interrupt. */
	setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC);

	/* XXX: Thermal interrupt */

	/* Local APIC CMCI. */
	setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC);

	if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) {
		arat = 0;
		/* Intel CPUID 0x06 EAX[2] set if APIC timer runs in C3. */
		if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_high >= 6) {
			do_cpuid(0x06, regs);
			if ((regs[0] & CPUTPM1_ARAT) != 0)
				arat = 1;
		}
		bzero(&lapic_et, sizeof(lapic_et));
		lapic_et.et_name = "LAPIC";
		lapic_et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT |
		    ET_FLAGS_PERCPU;
		lapic_et.et_quality = 600;
		if (!arat) {
			lapic_et.et_flags |= ET_FLAGS_C3STOP;
			lapic_et.et_quality -= 200;
		}
		/* We don't know the frequency yet, so start with a guess. */
		lapic_et.et_frequency = 0;
		lapic_et.et_min_period.sec = 0;
		lapic_et.et_min_period.frac = 0x00001000LL << 32;
		lapic_et.et_max_period.sec = 1;
		lapic_et.et_max_period.frac = 0;
		lapic_et.et_start = lapic_et_start;
		lapic_et.et_stop = lapic_et_stop;
		lapic_et.et_priv = NULL;
		et_register(&lapic_et);
	}
}
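/*
 * Illustrative sketch (not kernel code): the et_min_period/et_max_period
 * values above are bintime-style, with 'frac' counting units of 2^-64
 * seconds. This converts the minimum period used above, 0x00001000LL << 32
 * (i.e. 2^44 units), into nanoseconds: 2^44 / 2^64 = 2^-20 s, about 954 ns.
 */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t frac = 0x00001000ULL << 32;	/* 2^44 units of 2^-64 s */
	long double sec = ldexpl((long double)frac, -64);

	printf("min period ~ %.1Lf ns\n", sec * 1e9L);	/* ~953.7 ns */
	return (0);
}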
int
vmm_ipi_alloc(void)
{
	int idx;
	uintptr_t func;
	struct gate_descriptor *ip;

	/*
	 * Search backwards from the highest IDT vector available for use
	 * as our IPI vector. We install the 'justreturn' handler at that
	 * vector and use it to interrupt the vcpus.
	 *
	 * We do this because the IPI_AST is heavyweight and saves all
	 * registers in the trapframe. This is overkill for our use case
	 * which is simply to EOI the interrupt and return.
	 */
	idx = APIC_SPURIOUS_INT;
	while (--idx >= APIC_IPI_INTS) {
		ip = &idt[idx];
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func == (uintptr_t)&IDTVEC(rsvd)) {
			setidt(idx, IDTVEC(justreturn), SDT_SYSIGT,
			    SEL_KPL, 0);
			return (idx);
		}
	}
	return (0);
}
void
cpu_set_tss_gates(struct cpu_info *ci)
{
	struct segment_descriptor sd;

	ci->ci_doubleflt_stack = (char *)uvm_km_alloc(kernel_map, USPACE);
	cpu_init_tss(&ci->ci_doubleflt_tss, ci->ci_doubleflt_stack,
	    IDTVEC(tss_trap08));
	setsegment(&sd, &ci->ci_doubleflt_tss, sizeof(struct i386tss) - 1,
	    SDT_SYS386TSS, SEL_KPL, 0, 0);
	ci->ci_gdt[GTRAPTSS_SEL].sd = sd;
	setgate(&idt[8], NULL, 0, SDT_SYSTASKGT, SEL_KPL,
	    GSEL(GTRAPTSS_SEL, SEL_KPL));

#if defined(DDB) && defined(MULTIPROCESSOR)
	/*
	 * Set up separate handler for the DDB IPI, so that it doesn't
	 * stomp on a possibly corrupted stack.
	 *
	 * XXX overwriting the gate set in db_machine_init.
	 * Should rearrange the code so that it's set only once.
	 */
	ci->ci_ddbipi_stack = (char *)uvm_km_alloc(kernel_map, USPACE);
	cpu_init_tss(&ci->ci_ddbipi_tss, ci->ci_ddbipi_stack,
	    Xintrddbipi);
	setsegment(&sd, &ci->ci_ddbipi_tss, sizeof(struct i386tss) - 1,
	    SDT_SYS386TSS, SEL_KPL, 0, 0);
	ci->ci_gdt[GIPITSS_SEL].sd = sd;
	setgate(&idt[ddb_vec], NULL, 0, SDT_SYSTASKGT, SEL_KPL,
	    GSEL(GIPITSS_SEL, SEL_KPL));
#endif
}
void
vmm_ipi_free(int ipinum)
{
	uintptr_t func;
	struct gate_descriptor *ip;

	KASSERT(ipinum >= APIC_IPI_INTS && ipinum < APIC_SPURIOUS_INT,
	    ("invalid ipi %d", ipinum));

	ip = &idt[ipinum];
	func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
	KASSERT(func == (uintptr_t)&IDTVEC(justreturn),
	    ("invalid ipi %d", ipinum));

	setidt(ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
}
/**
 * @brief Restore the IDT slot to rsvd.
 */
static void
vmbus_vector_free(int vector)
{
	uintptr_t func;
	struct gate_descriptor *ip;

	if (vector == 0)
		return;

	KASSERT(vector >= APIC_IPI_INTS && vector < APIC_SPURIOUS_INT,
	    ("invalid vector %d", vector));

	ip = &idt[vector];
	func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
	KASSERT(func == (uintptr_t)&IDTVEC(hv_vmbus_callback),
	    ("invalid vector %d", vector));

	setidt(vector, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
}
/*
 * Fill in default interrupt table (in case of spurious interrupt
 * during configuration of kernel), setup interrupt control unit.
 */
void
isa_defaultirq(void)
{
	int i;

	/* icu vectors */
	for (i = 0; i < ICU_LEN; i++)
		setgate(&idt[ICU_OFFSET + i], IDTVEC(intr)[i], 0,
		    SDT_SYS386IGT, SEL_KPL, GICODE_SEL);

	/* initialize 8259's */
	outb(IO_ICU1, 0x11);		/* reset; program device, four bytes */
	outb(IO_ICU1+1, ICU_OFFSET);	/* starting at this vector index */
	outb(IO_ICU1+1, 1 << IRQ_SLAVE); /* slave on line 2 */
#ifdef AUTO_EOI_1
	outb(IO_ICU1+1, 2 | 1);		/* auto EOI, 8086 mode */
#else
	outb(IO_ICU1+1, 1);		/* 8086 mode */
#endif
	outb(IO_ICU1+1, 0xff);		/* leave interrupts masked */
	outb(IO_ICU1, 0x68);		/* special mask mode (if available) */
	outb(IO_ICU1, 0x0a);		/* Read IRR by default. */
#ifdef REORDER_IRQ
	outb(IO_ICU1, 0xc0 | (3 - 1));	/* pri order 3-7, 0-2 (com2 first) */
#endif

	outb(IO_ICU2, 0x11);		/* reset; program device, four bytes */
	outb(IO_ICU2+1, ICU_OFFSET+8);	/* starting at this vector index */
	outb(IO_ICU2+1, IRQ_SLAVE);
#ifdef AUTO_EOI_2
	outb(IO_ICU2+1, 2 | 1);		/* auto EOI, 8086 mode */
#else
	outb(IO_ICU2+1, 1);		/* 8086 mode */
#endif
	outb(IO_ICU2+1, 0xff);		/* leave interrupts masked */
	outb(IO_ICU2, 0x68);		/* special mask mode (if available) */
	outb(IO_ICU2, 0x0a);		/* Read IRR by default. */
}
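/*
 * Illustrative sketch (not kernel code): the ICW2 writes above give the
 * master PIC vector base ICU_OFFSET and the slave ICU_OFFSET + 8, so ISA
 * IRQ n lands on IDT vector ICU_OFFSET + n. ICU_OFFSET is taken as 32 here
 * purely for illustration, since vectors 0-31 are reserved for exceptions.
 */
#include <stdio.h>

#define ICU_OFFSET	32	/* assumed base vector for this sketch */

int
main(void)
{
	for (int irq = 0; irq < 16; irq++)
		printf("IRQ %2d -> IDT vector %d (%s PIC)\n", irq,
		    ICU_OFFSET + irq, irq < 8 ? "master" : "slave");
	return (0);
}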
void
kern_trap(struct trapframe *frame)
{
	struct globaldata *gd = mycpu;
	struct thread *td = gd->gd_curthread;
	struct lwp *lp;
	struct proc *p;
	int i = 0, ucode = 0, type, code;
	int have_mplock = 0;
#ifdef INVARIANTS
	int crit_count = td->td_critcount;
	lwkt_tokref_t curstop = td->td_toks_stop;
#endif
	vm_offset_t eva;

	lp = td->td_lwp;
	p = td->td_proc;

	if (frame->tf_trapno == T_PAGEFLT)
		eva = frame->tf_err;
	else
		eva = 0;

#ifdef DDB
	if (db_active) {
		++gd->gd_trap_nesting_level;
		MAKEMPSAFE(have_mplock);
		trap_fatal(frame, FALSE, eva);
		--gd->gd_trap_nesting_level;
		goto out2;
	}
#endif

	type = frame->tf_trapno;
	code = frame->tf_err;

#if 0
kernel_trap:
#endif
	/* kernel trap */

	switch (type) {
	case T_PAGEFLT:			/* page fault */
		MAKEMPSAFE(have_mplock);
		trap_pfault(frame, FALSE, eva);
		goto out2;

	case T_DNA:
#if NNPX > 0
		/*
		 * The kernel may be using npx for copying or other
		 * purposes.
		 */
		panic("kernel NPX should not happen");
		if (npxdna(frame))
			goto out2;
#endif
		break;

	case T_PROTFLT:		/* general protection fault */
	case T_SEGNPFLT:	/* segment not present fault */
		/*
		 * Invalid segment selectors and out of bounds
		 * %eip's and %esp's can be set up in user mode.
		 * This causes a fault in kernel mode when the
		 * kernel tries to return to user mode. We want
		 * to get this fault so that we can fix the
		 * problem here and not have to check all the
		 * selectors and pointers when the user changes
		 * them.
		 */
		if (mycpu->gd_intr_nesting_level == 0) {
			if (td->td_pcb->pcb_onfault) {
				frame->tf_eip =
				    (register_t)td->td_pcb->pcb_onfault;
				goto out2;
			}
		}
		break;

	case T_TSSFLT:
		/*
		 * PSL_NT can be set in user mode and isn't cleared
		 * automatically when the kernel is entered. This
		 * causes a TSS fault when the kernel attempts to
		 * `iret' because the TSS link is uninitialized. We
		 * want to get this fault so that we can fix the
		 * problem here and not every time the kernel is
		 * entered.
		 */
		if (frame->tf_eflags & PSL_NT) {
			frame->tf_eflags &= ~PSL_NT;
			goto out2;
		}
		break;

	case T_TRCTRAP:		/* trace trap */
#if 0
		if (frame->tf_eip == (int)IDTVEC(syscall)) {
			/*
			 * We've just entered system mode via the
			 * syscall lcall. Continue single stepping
			 * silently until the syscall handler has
			 * saved the flags.
			 */
			goto out2;
		}
		if (frame->tf_eip == (int)IDTVEC(syscall) + 1) {
			/*
			 * The syscall handler has now saved the
			 * flags. Stop single stepping it.
			 */
			frame->tf_eflags &= ~PSL_T;
			goto out2;
		}
#endif
#if 0
		/*
		 * Ignore debug register trace traps due to
		 * accesses in the user's address space, which
		 * can happen under several conditions such as
		 * if a user sets a watchpoint on a buffer and
		 * then passes that buffer to a system call.
		 * We still want to get TRCTRAPS for addresses
		 * in kernel space because that is useful when
		 * debugging the kernel.
		 */
		if (user_dbreg_trap()) {
			/*
			 * Reset breakpoint bits because the
			 * processor doesn't
			 */
			load_dr6(rdr6() & 0xfffffff0);
			goto out2;
		}
#endif
		/*
		 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
		 */
	case T_BPTFLT:
		/*
		 * If DDB is enabled, let it handle the debugger trap.
		 * Otherwise, debugger traps "can't happen".
		 */
#ifdef DDB
		MAKEMPSAFE(have_mplock);
		if (kdb_trap(type, 0, frame))
			goto out2;
#endif
		break;

	case T_DIVIDE:
		MAKEMPSAFE(have_mplock);
		trap_fatal(frame, FALSE, eva);
		goto out2;

	case T_NMI:
		MAKEMPSAFE(have_mplock);
		trap_fatal(frame, FALSE, eva);
		goto out2;

	case T_SYSCALL80:
		/*
		 * Ignore this trap generated from a spurious SIGTRAP.
		 *
		 * Single stepping into/out of syscalls leads to spurious
		 * SIGTRAPs, so ignore them.
		 *
		 * Haiku (c) 2007 Simon 'corecode' Schubert
		 */
		goto out2;
	}

	/*
	 * Translate fault for emulators (e.g. Linux)
	 */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	MAKEMPSAFE(have_mplock);
	trapsignal(lp, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s", trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", (u_long)eva);
		uprintf("\n");
	}
#endif

out2:
	;
	if (have_mplock)
		rel_mplock();
#ifdef INVARIANTS
	KASSERT(crit_count == td->td_critcount,
	    ("trap: critical section count mismatch! %d/%d",
	    crit_count, td->td_pri));
	KASSERT(curstop == td->td_toks_stop,
	    ("trap: extra tokens held after trap! %zd/%zd",
	    curstop - &td->td_toks_base,
	    td->td_toks_stop - &td->td_toks_base));
#endif
}
	IDTVEC(ioapic_intr179), IDTVEC(ioapic_intr180), IDTVEC(ioapic_intr181),
	IDTVEC(ioapic_intr182), IDTVEC(ioapic_intr183), IDTVEC(ioapic_intr184),
	IDTVEC(ioapic_intr185), IDTVEC(ioapic_intr186), IDTVEC(ioapic_intr187),
	IDTVEC(ioapic_intr188), IDTVEC(ioapic_intr189), IDTVEC(ioapic_intr190),
	IDTVEC(ioapic_intr191);

static inthand_t *ioapic_intr[IOAPIC_HWI_VECTORS] = {
	&IDTVEC(ioapic_intr0), &IDTVEC(ioapic_intr1),
	&IDTVEC(ioapic_intr2), &IDTVEC(ioapic_intr3),
	&IDTVEC(ioapic_intr4), &IDTVEC(ioapic_intr5),
	&IDTVEC(ioapic_intr6), &IDTVEC(ioapic_intr7),
	&IDTVEC(ioapic_intr8), &IDTVEC(ioapic_intr9),
	&IDTVEC(ioapic_intr10), &IDTVEC(ioapic_intr11),
	&IDTVEC(ioapic_intr12), &IDTVEC(ioapic_intr13),
	&IDTVEC(ioapic_intr14), &IDTVEC(ioapic_intr15),
void
trap(struct trapframe *frame)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	int i = 0, ucode = 0, code;
	u_int type;
	register_t addr = 0;
	vm_offset_t eva;
	ksiginfo_t ksi;
#ifdef POWERFAIL_NMI
	static int lastalert = 0;
#endif

	PCPU_INC(cnt.v_trap);
	type = frame->tf_trapno;

#ifdef SMP
	/* Handler for NMI IPIs used for stopping CPUs. */
	if (type == T_NMI) {
		if (ipi_nmi_handler() == 0)
			goto out;
	}
#endif /* SMP */

#ifdef KDB
	if (kdb_active) {
		kdb_reenter();
		goto out;
	}
#endif

	if (type == T_RESERVED) {
		trap_fatal(frame, 0);
		goto out;
	}

#ifdef HWPMC_HOOKS
	/*
	 * CPU PMCs interrupt using an NMI so we check for that first.
	 * If the HWPMC module is active, 'pmc_hook' will point to
	 * the function to be called. A return value of '1' from the
	 * hook means that the NMI was handled by it and that we can
	 * return immediately.
	 */
	if (type == T_NMI && pmc_intr &&
	    (*pmc_intr)(PCPU_GET(cpuid), frame))
		goto out;
#endif

	if (type == T_MCHK) {
		if (!mca_intr())
			trap_fatal(frame, 0);
		goto out;
	}

#ifdef KDTRACE_HOOKS
	/*
	 * A trap can occur while DTrace executes a probe. Before
	 * executing the probe, DTrace blocks re-scheduling and sets
	 * a flag in its per-cpu flags to indicate that it doesn't
	 * want to fault. On returning from the probe, the no-fault
	 * flag is cleared and finally re-scheduling is enabled.
	 *
	 * If the DTrace kernel module has registered a trap handler,
	 * call it and if it returns non-zero, assume that it has
	 * handled the trap and modified the trap frame so that this
	 * function can return normally.
	 */
	if ((type == T_PROTFLT || type == T_PAGEFLT) &&
	    dtrace_trap_func != NULL)
		if ((*dtrace_trap_func)(frame, type))
			goto out;
	if (type == T_DTRACE_PROBE || type == T_DTRACE_RET ||
	    type == T_BPTFLT) {
		struct reg regs;

		fill_frame_regs(frame, &regs);
		if (type == T_DTRACE_PROBE &&
		    dtrace_fasttrap_probe_ptr != NULL &&
		    dtrace_fasttrap_probe_ptr(&regs) == 0)
			goto out;
		if (type == T_BPTFLT &&
		    dtrace_pid_probe_ptr != NULL &&
		    dtrace_pid_probe_ptr(&regs) == 0)
			goto out;
		if (type == T_DTRACE_RET &&
		    dtrace_return_probe_ptr != NULL &&
		    dtrace_return_probe_ptr(&regs) == 0)
			goto out;
	}
#endif

	if ((frame->tf_eflags & PSL_I) == 0) {
		/*
		 * Buggy application or kernel code has disabled
		 * interrupts and then trapped. Enabling interrupts
		 * now is wrong, but it is better than running with
		 * interrupts disabled until they are accidentally
		 * enabled later.
		 */
		if (ISPL(frame->tf_cs) == SEL_UPL ||
		    (frame->tf_eflags & PSL_VM))
			uprintf(
			    "pid %ld (%s): trap %d with interrupts disabled\n",
			    (long)curproc->p_pid, curthread->td_name, type);
		else if (type != T_BPTFLT && type != T_TRCTRAP &&
		    frame->tf_eip != (int)cpu_switch_load_gs) {
			/*
			 * XXX not quite right, since this may be for a
			 * multiple fault in user mode.
			 */
			printf("kernel trap %d with interrupts disabled\n",
			    type);

			/*
			 * Page faults need interrupts disabled until later,
			 * and we shouldn't enable interrupts while holding
			 * a spin lock or if servicing an NMI.
			 */
			if (type != T_NMI && type != T_PAGEFLT &&
			    td->td_md.md_spinlock_count == 0)
				enable_intr();
		}
	}
	eva = 0;
	code = frame->tf_err;
	if (type == T_PAGEFLT) {
		/*
		 * For some Cyrix CPUs, %cr2 is clobbered by
		 * interrupts. This problem is worked around by using
		 * an interrupt gate for the pagefault handler. We
		 * are finally ready to read %cr2 and then must
		 * reenable interrupts.
		 *
		 * If we get a page fault while in a critical section, then
		 * it is most likely a fatal kernel page fault. The kernel
		 * is already going to panic trying to get a sleep lock to
		 * do the VM lookup, so just consider it a fatal trap so the
		 * kernel can print out a useful trap message and even get
		 * to the debugger.
		 *
		 * If we get a page fault while holding a non-sleepable
		 * lock, then it is most likely a fatal kernel page fault.
		 * If WITNESS is enabled, then it's going to whine about
		 * bogus LORs with various VM locks, so just skip to the
		 * fatal trap handling directly.
		 */
		eva = rcr2();
		if (td->td_critnest != 0 ||
		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
		    "Kernel page fault") != 0)
			trap_fatal(frame, eva);
		else
			enable_intr();
	}

	if ((ISPL(frame->tf_cs) == SEL_UPL) ||
	    ((frame->tf_eflags & PSL_VM) &&
	    !(PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL))) {
		/* user trap */

		td->td_pticks = 0;
		td->td_frame = frame;
		addr = frame->tf_eip;
		if (td->td_ucred != p->p_ucred)
			cred_update_thread(td);

		switch (type) {
		case T_PRIVINFLT:	/* privileged instruction fault */
			i = SIGILL;
			ucode = ILL_PRVOPC;
			break;

		case T_BPTFLT:		/* bpt instruction fault */
		case T_TRCTRAP:		/* trace trap */
			enable_intr();
			frame->tf_eflags &= ~PSL_T;
			i = SIGTRAP;
			ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
#ifdef DEV_NPX
			ucode = npxtrap();
			if (ucode == -1)
				goto userout;
#else
			ucode = 0;
#endif
			i = SIGFPE;
			break;

		/*
		 * The following two traps can happen in
		 * vm86 mode, and, if so, we want to handle
		 * them specially.
		 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame->tf_eflags & PSL_VM) {
				i = vm86_emulate((struct vm86frame *)frame);
				if (i == 0)
					goto user;
				break;
			}
			i = SIGBUS;
			ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
			break;
		case T_SEGNPFLT:	/* segment not present fault */
			i = SIGBUS;
			ucode = BUS_ADRERR;
			break;
		case T_TSSFLT:		/* invalid TSS fault */
			i = SIGBUS;
			ucode = BUS_OBJERR;
			break;
		case T_DOUBLEFLT:	/* double fault */
		default:
			i = SIGBUS;
			ucode = BUS_OBJERR;
			break;

		case T_PAGEFLT:		/* page fault */
			i = trap_pfault(frame, TRUE, eva);
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
			if (i == -2) {
				/*
				 * The f00f hack workaround has triggered, so
				 * treat the fault as an illegal instruction
				 * (T_PRIVINFLT) instead of a page fault.
				 */
				type = frame->tf_trapno = T_PRIVINFLT;

				/* Proceed as in that case. */
				ucode = ILL_PRVOPC;
				i = SIGILL;
				break;
			}
#endif
			if (i == -1)
				goto userout;
			if (i == 0)
				goto user;

			if (i == SIGSEGV)
				ucode = SEGV_MAPERR;
			else {
				if (prot_fault_translation == 0) {
					/*
					 * Autodetect. This check also
					 * covers the images without the
					 * ABI-tag ELF note.
					 */
					if (SV_CURPROC_ABI() ==
					    SV_ABI_FREEBSD &&
					    p->p_osrel >= P_OSREL_SIGSEGV) {
						i = SIGSEGV;
						ucode = SEGV_ACCERR;
					} else {
						i = SIGBUS;
						ucode = BUS_PAGE_FAULT;
					}
				} else if (prot_fault_translation == 1) {
					/* Always compat mode. */
					i = SIGBUS;
					ucode = BUS_PAGE_FAULT;
				} else {
					/* Always SIGSEGV mode. */
					i = SIGSEGV;
					ucode = SEGV_ACCERR;
				}
			}
			addr = eva;
			break;

		case T_DIVIDE:		/* integer divide fault */
			ucode = FPE_INTDIV;
			i = SIGFPE;
			break;

#ifdef DEV_ISA
		case T_NMI:
#ifdef POWERFAIL_NMI
#ifndef TIMER_FREQ
#define TIMER_FREQ	1193182
#endif
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(880, hz);
				lastalert = time_second;
			}
			goto userout;
#else /* !POWERFAIL_NMI */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef KDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (kdb_on_nmi) {
					printf("NMI ... going to debugger\n");
					kdb_trap(type, 0, frame);
				}
#endif /* KDB */
				goto userout;
			} else if (panic_on_nmi)
				panic("NMI indicates hardware failure");
			break;
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */

		case T_OFLOW:		/* integer overflow fault */
			ucode = FPE_INTOVF;
			i = SIGFPE;
			break;

		case T_BOUND:		/* bounds check fault */
			ucode = FPE_FLTSUB;
			i = SIGFPE;
			break;

		case T_DNA:
#ifdef DEV_NPX
			KASSERT(PCB_USER_FPU(td->td_pcb),
			    ("kernel FPU ctx has leaked"));
			/* transparent fault (due to context switch "late") */
			if (npxdna())
				goto userout;
#endif
			uprintf("pid %d killed due to lack of floating point\n",
			    p->p_pid);
			i = SIGKILL;
			ucode = 0;
			break;

		case T_FPOPFLT:		/* FPU operand fetch fault */
			ucode = ILL_COPROC;
			i = SIGILL;
			break;

		case T_XMMFLT:		/* SIMD floating-point exception */
			ucode = 0;	/* XXX */
			i = SIGFPE;
			break;
		}
	} else {
		/* kernel trap */

		KASSERT(cold || td->td_ucred != NULL,
		    ("kernel trap doesn't have ucred"));
		switch (type) {
		case T_PAGEFLT:		/* page fault */
			(void) trap_pfault(frame, FALSE, eva);
			goto out;

		case T_DNA:
#ifdef DEV_NPX
			KASSERT(!PCB_USER_FPU(td->td_pcb),
			    ("Unregistered use of FPU in kernel"));
			if (npxdna())
				goto out;
#endif
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
		case T_XMMFLT:		/* SIMD floating-point exception */
		case T_FPOPFLT:		/* FPU operand fetch fault */
			/*
			 * XXXKIB for now disable any FPU traps in kernel
			 * handler registration seems to be overkill
			 */
			trap_fatal(frame, 0);
			goto out;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame->tf_eflags & PSL_VM) {
				i = vm86_emulate((struct vm86frame *)frame);
				if (i != 0)
					/*
					 * returns to original process
					 */
					vm86_trap((struct vm86frame *)frame);
				goto out;
			}
			if (type == T_STKFLT)
				break;

			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
			if (PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL)
				break;

			/*
			 * Invalid %fs's and %gs's can be created using
			 * procfs or PT_SETREGS or by invalidating the
			 * underlying LDT entry. This causes a fault
			 * in kernel mode when the kernel attempts to
			 * switch contexts. Lose the bad context
			 * (XXX) so that we can continue, and generate
			 * a signal.
			 */
			if (frame->tf_eip == (int)cpu_switch_load_gs) {
				PCPU_GET(curpcb)->pcb_gs = 0;
#if 0
				PROC_LOCK(p);
				kern_psignal(p, SIGBUS);
				PROC_UNLOCK(p);
#endif
				goto out;
			}

			if (td->td_intr_nesting_level != 0)
				break;

			/*
			 * Invalid segment selectors and out of bounds
			 * %eip's and %esp's can be set up in user mode.
			 * This causes a fault in kernel mode when the
			 * kernel tries to return to user mode. We want
			 * to get this fault so that we can fix the
			 * problem here and not have to check all the
			 * selectors and pointers when the user changes
			 * them.
			 */
			if (frame->tf_eip == (int)doreti_iret) {
				frame->tf_eip = (int)doreti_iret_fault;
				goto out;
			}
			if (frame->tf_eip == (int)doreti_popl_ds) {
				frame->tf_eip = (int)doreti_popl_ds_fault;
				goto out;
			}
			if (frame->tf_eip == (int)doreti_popl_es) {
				frame->tf_eip = (int)doreti_popl_es_fault;
				goto out;
			}
			if (frame->tf_eip == (int)doreti_popl_fs) {
				frame->tf_eip = (int)doreti_popl_fs_fault;
				goto out;
			}
			if (PCPU_GET(curpcb)->pcb_onfault != NULL) {
				frame->tf_eip =
				    (int)PCPU_GET(curpcb)->pcb_onfault;
				goto out;
			}
			break;

		case T_TSSFLT:
			/*
			 * PSL_NT can be set in user mode and isn't cleared
			 * automatically when the kernel is entered. This
			 * causes a TSS fault when the kernel attempts to
			 * `iret' because the TSS link is uninitialized. We
			 * want to get this fault so that we can fix the
			 * problem here and not every time the kernel is
			 * entered.
			 */
			if (frame->tf_eflags & PSL_NT) {
				frame->tf_eflags &= ~PSL_NT;
				goto out;
			}
			break;

		case T_TRCTRAP:		/* trace trap */
			if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) {
				/*
				 * We've just entered system mode via the
				 * syscall lcall. Continue single stepping
				 * silently until the syscall handler has
				 * saved the flags.
				 */
				goto out;
			}
			if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
				/*
				 * The syscall handler has now saved the
				 * flags. Stop single stepping it.
				 */
				frame->tf_eflags &= ~PSL_T;
				goto out;
			}
			/*
			 * Ignore debug register trace traps due to
			 * accesses in the user's address space, which
			 * can happen under several conditions such as
			 * if a user sets a watchpoint on a buffer and
			 * then passes that buffer to a system call.
			 * We still want to get TRCTRAPS for addresses
			 * in kernel space because that is useful when
			 * debugging the kernel.
			 */
			if (user_dbreg_trap() &&
			    !(PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL)) {
				/*
				 * Reset breakpoint bits because the
				 * processor doesn't
				 */
				load_dr6(rdr6() & 0xfffffff0);
				goto out;
			}
			/*
			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
			 */
		case T_BPTFLT:
			/*
			 * If KDB is enabled, let it handle the debugger trap.
			 * Otherwise, debugger traps "can't happen".
			 */
#ifdef KDB
			if (kdb_trap(type, 0, frame))
				goto out;
#endif
			break;

#ifdef DEV_ISA
		case T_NMI:
#ifdef POWERFAIL_NMI
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(880, hz);
				lastalert = time_second;
			}
			goto out;
#else /* !POWERFAIL_NMI */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef KDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (kdb_on_nmi) {
					printf("NMI ... going to debugger\n");
					kdb_trap(type, 0, frame);
				}
#endif /* KDB */
				goto out;
			} else if (panic_on_nmi == 0)
				goto out;
			/* FALLTHROUGH */
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */
		}

		trap_fatal(frame, eva);
		goto out;
	}

	/* Translate fault for emulators (e.g. Linux) */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	ksiginfo_init_trap(&ksi);
	ksi.ksi_signo = i;
	ksi.ksi_code = ucode;
	ksi.ksi_addr = (void *)addr;
	ksi.ksi_trapno = type;
	trapsignal(td, &ksi);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s", trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", (u_long)eva);
		uprintf("\n");
	}
#endif

user:
	userret(td, frame);
	mtx_assert(&Giant, MA_NOTOWNED);
	KASSERT(PCB_USER_FPU(td->td_pcb),
	    ("Return from trap with kernel FPU ctx leaked"));
userout:
out:
	return;
}
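/*
 * Illustrative sketch (not kernel code): the prot_fault_translation policy
 * in the page-fault case above, re-expressed standalone. Mode 0 autodetects
 * from the process ABI and osrel, mode 1 always delivers the historical
 * SIGBUS, anything else always delivers POSIX SIGSEGV. The osrel cutoff
 * below is a stand-in for the kernel's P_OSREL_SIGSEGV.
 */
#include <signal.h>
#include <stdio.h>

#define OSREL_SIGSEGV_CUTOFF	700004	/* assumed cutoff for this sketch */

static int
prot_fault_signal(int mode, int is_freebsd_abi, int osrel)
{
	switch (mode) {
	case 0:		/* autodetect from ABI note */
		return ((is_freebsd_abi && osrel >= OSREL_SIGSEGV_CUTOFF) ?
		    SIGSEGV : SIGBUS);
	case 1:		/* always compat mode */
		return (SIGBUS);
	default:	/* always SIGSEGV mode */
		return (SIGSEGV);
	}
}

int
main(void)
{
	printf("old binary, autodetect: signal %d\n",
	    prot_fault_signal(0, 1, 0));
	printf("new binary, autodetect: signal %d\n",
	    prot_fault_signal(0, 1, OSREL_SIGSEGV_CUTOFF));
	return (0);
}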
void
init_x86_64(paddr_t first_avail)
{
	extern void consinit(void);
	extern struct extent *iomem_ex;
	struct region_descriptor region;
	struct mem_segment_descriptor *ldt_segp;
	int x, first16q, ist;
	u_int64_t seg_start, seg_end;
	u_int64_t seg_start1, seg_end1;

	cpu_init_msrs(&cpu_info_primary);

	proc0.p_addr = proc0paddr;
	cpu_info_primary.ci_curpcb = &proc0.p_addr->u_pcb;

	x86_bus_space_init();

	consinit();	/* XXX SHOULD NOT BE DONE HERE */

	/*
	 * Initialize PAGE_SIZE-dependent variables.
	 */
	uvm_setpagesize();

#if 0
	uvmexp.ncolors = 2;
#endif

	/*
	 * Boot arguments are in a single page specified by /boot.
	 *
	 * We require the "new" vector form, as well as memory ranges
	 * to be given in bytes rather than KB.
	 *
	 * locore copies the data into bootinfo[] for us.
	 */
	if ((bootapiver & (BAPIV_VECTOR | BAPIV_BMEMMAP)) ==
	    (BAPIV_VECTOR | BAPIV_BMEMMAP)) {
		if (bootinfo_size >= sizeof(bootinfo))
			panic("boot args too big");

		getbootinfo(bootinfo, bootinfo_size);
	} else
		panic("invalid /boot");

	avail_start = PAGE_SIZE; /* BIOS leaves data in low memory */
				 /* and VM system doesn't work with phys 0 */
#ifdef MULTIPROCESSOR
	if (avail_start < MP_TRAMPOLINE + PAGE_SIZE)
		avail_start = MP_TRAMPOLINE + PAGE_SIZE;
#endif

	/*
	 * Call pmap initialization to make new kernel address space.
	 * We must do this before loading pages into the VM system.
	 */
	pmap_bootstrap(VM_MIN_KERNEL_ADDRESS,
	    IOM_END + trunc_page(KBTOB(biosextmem)));

	if (avail_start != PAGE_SIZE)
		pmap_prealloc_lowmem_ptps();

	if (mem_cluster_cnt == 0) {
		/*
		 * Allocate the physical addresses used by RAM from the iomem
		 * extent map. This is done before the addresses are
		 * page rounded just to make sure we get them all.
		 */
		if (extent_alloc_region(iomem_ex, 0, KBTOB(biosbasemem),
		    EX_NOWAIT)) {
			/* XXX What should we do? */
			printf("WARNING: CAN'T ALLOCATE BASE MEMORY FROM "
			    "IOMEM EXTENT MAP!\n");
		}

		mem_clusters[0].start = 0;
		mem_clusters[0].size = trunc_page(KBTOB(biosbasemem));
		physmem += atop(mem_clusters[0].size);

		if (extent_alloc_region(iomem_ex, IOM_END, KBTOB(biosextmem),
		    EX_NOWAIT)) {
			/* XXX What should we do? */
			printf("WARNING: CAN'T ALLOCATE EXTENDED MEMORY FROM "
			    "IOMEM EXTENT MAP!\n");
		}

#if 0
#if NISADMA > 0
		/*
		 * Some motherboards/BIOSes remap the 384K of RAM that would
		 * normally be covered by the ISA hole to the end of memory
		 * so that it can be used. However, on a 16M system, this
		 * would cause bounce buffers to be allocated and used.
		 * This is not desirable behaviour, as more than 384K of
		 * bounce buffers might be allocated. As a work-around,
		 * we round memory down to the nearest 1M boundary if
		 * we're using any isadma devices and the remapped memory
		 * is what puts us over 16M.
		 */
		if (biosextmem > (15*1024) && biosextmem < (16*1024)) {
			char pbuf[9];

			format_bytes(pbuf, sizeof(pbuf),
			    biosextmem - (15*1024));
			printf("Warning: ignoring %s of remapped memory\n",
			    pbuf);
			biosextmem = (15*1024);
		}
#endif
#endif
		mem_clusters[1].start = IOM_END;
		mem_clusters[1].size = trunc_page(KBTOB(biosextmem));
		physmem += atop(mem_clusters[1].size);

		mem_cluster_cnt = 2;

		avail_end = IOM_END + trunc_page(KBTOB(biosextmem));
	}

	/*
	 * If we have 16M of RAM or less, just put it all on
	 * the default free list. Otherwise, put the first
	 * 16M of RAM on a lower priority free list (so that
	 * all of the ISA DMA'able memory won't be eaten up
	 * first-off).
	 */
	if (avail_end <= (16 * 1024 * 1024))
		first16q = VM_FREELIST_DEFAULT;
	else
		first16q = VM_FREELIST_FIRST16;

	/* Make sure the end of the space used by the kernel is rounded. */
	first_avail = round_page(first_avail);
	kern_end = KERNBASE + first_avail;

	/*
	 * Now, load the memory clusters (which have already been
	 * rounded and truncated) into the VM system.
	 *
	 * NOTE: WE ASSUME THAT MEMORY STARTS AT 0 AND THAT THE KERNEL
	 * IS LOADED AT IOM_END (1M).
	 */
	for (x = 0; x < mem_cluster_cnt; x++) {
		seg_start = mem_clusters[x].start;
		seg_end = mem_clusters[x].start + mem_clusters[x].size;
		seg_start1 = 0;
		seg_end1 = 0;

		if (seg_start > 0xffffffffULL) {
			printf("skipping %lld bytes of memory above 4GB\n",
			    seg_end - seg_start);
			continue;
		}
		if (seg_end > 0x100000000ULL) {
			printf("skipping %lld bytes of memory above 4GB\n",
			    seg_end - 0x100000000ULL);
			seg_end = 0x100000000ULL;
		}

		/*
		 * Skip memory before our available starting point.
		 */
		if (seg_end <= avail_start)
			continue;

		if (avail_start >= seg_start && avail_start < seg_end) {
			if (seg_start != 0)
				panic("init_x86_64: memory doesn't start at 0");
			seg_start = avail_start;
			if (seg_start == seg_end)
				continue;
		}

		/*
		 * If this segment contains the kernel, split it
		 * in two, around the kernel.
		 */
		if (seg_start <= IOM_END && first_avail <= seg_end) {
			seg_start1 = first_avail;
			seg_end1 = seg_end;
			seg_end = IOM_END;
		}

		/* First hunk */
		if (seg_start != seg_end) {
			if (seg_start <= (16 * 1024 * 1024) &&
			    first16q != VM_FREELIST_DEFAULT) {
				u_int64_t tmp;

				if (seg_end > (16 * 1024 * 1024))
					tmp = (16 * 1024 * 1024);
				else
					tmp = seg_end;
#if DEBUG_MEMLOAD
				printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n",
				    (unsigned long long)seg_start,
				    (unsigned long long)tmp,
				    atop(seg_start), atop(tmp));
#endif
				uvm_page_physload(atop(seg_start),
				    atop(tmp), atop(seg_start),
				    atop(tmp), first16q);
				seg_start = tmp;
			}

			if (seg_start != seg_end) {
#if DEBUG_MEMLOAD
				printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n",
				    (unsigned long long)seg_start,
				    (unsigned long long)seg_end,
				    atop(seg_start), atop(seg_end));
#endif
				uvm_page_physload(atop(seg_start),
				    atop(seg_end), atop(seg_start),
				    atop(seg_end), VM_FREELIST_DEFAULT);
			}
		}

		/* Second hunk */
		if (seg_start1 != seg_end1) {
			if (seg_start1 <= (16 * 1024 * 1024) &&
			    first16q != VM_FREELIST_DEFAULT) {
				u_int64_t tmp;

				if (seg_end1 > (16 * 1024 * 1024))
					tmp = (16 * 1024 * 1024);
				else
					tmp = seg_end1;
#if DEBUG_MEMLOAD
				printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n",
				    (unsigned long long)seg_start1,
				    (unsigned long long)tmp,
				    atop(seg_start1), atop(tmp));
#endif
				uvm_page_physload(atop(seg_start1),
				    atop(tmp), atop(seg_start1),
				    atop(tmp), first16q);
				seg_start1 = tmp;
			}

			if (seg_start1 != seg_end1) {
#if DEBUG_MEMLOAD
				printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n",
				    (unsigned long long)seg_start1,
				    (unsigned long long)seg_end1,
				    atop(seg_start1), atop(seg_end1));
#endif
				uvm_page_physload(atop(seg_start1),
				    atop(seg_end1), atop(seg_start1),
				    atop(seg_end1), VM_FREELIST_DEFAULT);
			}
		}
	}

	/*
	 * Steal memory for the message buffer (at end of core).
	 */
	{
		struct vm_physseg *vps = NULL;
		psize_t sz = round_page(MSGBUFSIZE);
		psize_t reqsz = sz;

		for (x = 0; x < vm_nphysseg; x++) {
			vps = &vm_physmem[x];
			if (ptoa(vps->avail_end) == avail_end)
				break;
		}
		if (x == vm_nphysseg)
			panic("init_x86_64: can't find end of memory");

		/* Shrink so it'll fit in the last segment. */
		if ((vps->avail_end - vps->avail_start) < atop(sz))
			sz = ptoa(vps->avail_end - vps->avail_start);

		vps->avail_end -= atop(sz);
		vps->end -= atop(sz);
		msgbuf_paddr = ptoa(vps->avail_end);

		/* Remove the last segment if it now has no pages. */
		if (vps->start == vps->end) {
			for (vm_nphysseg--; x < vm_nphysseg; x++)
				vm_physmem[x] = vm_physmem[x + 1];
		}

		/* Now find where the new avail_end is. */
		for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
			if (vm_physmem[x].avail_end > avail_end)
				avail_end = vm_physmem[x].avail_end;
		avail_end = ptoa(avail_end);

		/* Warn if the message buffer had to be shrunk. */
		if (sz != reqsz)
			printf("WARNING: %ld bytes not available for msgbuf "
			    "in last cluster (%ld used)\n", reqsz, sz);
	}

	/*
	 * XXXfvdl todo: acpi wakeup code.
	 */

	pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);

	pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE);
	pmap_kenter_pa(idt_vaddr + PAGE_SIZE, idt_paddr + PAGE_SIZE,
	    VM_PROT_READ|VM_PROT_WRITE);
	pmap_kenter_pa(lo32_vaddr, lo32_paddr, VM_PROT_READ|VM_PROT_WRITE);

	idt = (struct gate_descriptor *)idt_vaddr;
	gdtstore = (char *)(idt + NIDT);
	ldtstore = gdtstore + DYNSEL_START;

	/* make gdt gates and memory segments */
	set_mem_segment(GDT_ADDR_MEM(gdtstore, GCODE_SEL), 0, 0xfffff,
	    SDT_MEMERA, SEL_KPL, 1, 0, 1);
	set_mem_segment(GDT_ADDR_MEM(gdtstore, GDATA_SEL), 0, 0xfffff,
	    SDT_MEMRWA, SEL_KPL, 1, 0, 1);
	set_sys_segment(GDT_ADDR_SYS(gdtstore, GLDT_SEL), ldtstore,
	    LDT_SIZE - 1, SDT_SYSLDT, SEL_KPL, 0);
	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE_SEL), 0,
	    atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1);
	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA_SEL), 0,
	    atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1);

	/* make ldt gates and memory segments */
	setgate((struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL),
	    &IDTVEC(oosyscall), 0, SDT_SYS386CGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	*(struct mem_segment_descriptor *)(ldtstore + LUCODE_SEL) =
	    *GDT_ADDR_MEM(gdtstore, GUCODE_SEL);
	*(struct mem_segment_descriptor *)(ldtstore + LUDATA_SEL) =
	    *GDT_ADDR_MEM(gdtstore, GUDATA_SEL);

	/*
	 * 32 bit GDT entries.
	 */
	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE32_SEL), 0,
	    atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0);
	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA32_SEL), 0,
	    atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);

	/*
	 * 32 bit LDT entries.
	 */
	ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUCODE32_SEL);
	set_mem_segment(ldt_segp, 0, atop(VM_MAXUSER_ADDRESS32) - 1,
	    SDT_MEMERA, SEL_UPL, 1, 1, 0);
	ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUDATA32_SEL);
	set_mem_segment(ldt_segp, 0, atop(VM_MAXUSER_ADDRESS32) - 1,
	    SDT_MEMRWA, SEL_UPL, 1, 1, 0);

	/*
	 * Other entries.
	 */
	memcpy((struct gate_descriptor *)(ldtstore + LSOL26CALLS_SEL),
	    (struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL),
	    sizeof (struct gate_descriptor));
	memcpy((struct gate_descriptor *)(ldtstore + LBSDICALLS_SEL),
	    (struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL),
	    sizeof (struct gate_descriptor));

	/* exceptions */
	for (x = 0; x < 32; x++) {
		ist = (x == 8) ? 1 : 0;
		setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT,
		    (x == 3 || x == 4) ? SEL_UPL : SEL_KPL,
		    GSEL(GCODE_SEL, SEL_KPL));
		idt_allocmap[x] = 1;
	}

	/* new-style interrupt gate for syscalls */
	setgate(&idt[128], &IDTVEC(osyscall), 0, SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	idt_allocmap[128] = 1;

	setregion(&region, gdtstore, DYNSEL_START - 1);
	lgdt(&region);

	cpu_init_idt();

#ifdef DDB
	db_machine_init();
	ddb_init();
	if (boothowto & RB_KDB)
		Debugger();
#endif
#ifdef KGDB
	kgdb_port_init();
	if (boothowto & RB_KDB) {
		kgdb_debug_init = 1;
		kgdb_connect(1);
	}
#endif

	intr_default_setup();

	softintr_init();
	splraise(IPL_IPI);
	enable_intr();

	/* Make sure maxproc is sane */
	if (maxproc > cpu_maxproc())
		maxproc = cpu_maxproc();
}
static void
ia32_syscall_disable(void *dummy)
{

	setidt(IDT_SYSCALL, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
}
} static lapics[MAX_APIC_ID + 1];

/* Global defaults for local APIC LVT entries. */
static struct lvt lvts[LVT_MAX + 1] = {
	{ 1, 1, 1, 1, APIC_LVT_DM_EXTINT, 0 },	/* LINT0: masked ExtINT */
	{ 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 },	/* LINT1: NMI */
	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_TIMER_INT },	/* Timer */
	{ 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT },	/* Error */
	{ 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 },	/* PMC */
	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT },	/* Thermal */
	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT },	/* CMCI */
};

static inthand_t *ioint_handlers[] = {
	NULL,			/* 0 - 31 */
	IDTVEC(apic_isr1),	/* 32 - 63 */
	IDTVEC(apic_isr2),	/* 64 - 95 */
	IDTVEC(apic_isr3),	/* 96 - 127 */
	IDTVEC(apic_isr4),	/* 128 - 159 */
	IDTVEC(apic_isr5),	/* 160 - 191 */
	IDTVEC(apic_isr6),	/* 192 - 223 */
	IDTVEC(apic_isr7),	/* 224 - 255 */
};

static u_int32_t lapic_timer_divisors[] = {
	APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16,
	APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128
};

extern inthand_t IDTVEC(rsvd);
#ifndef SALBOOT
extern struct kern_devconf kdc_cpu0;

struct kern_devconf kdc_isa0 = {
	0, 0, 0,		/* filled in by dev_attach */
	"isa", 0, { MDDT_BUS, 0 },
	0, 0, 0, BUS_EXTERNALLEN,
	&kdc_cpu0,		/* parent is the CPU */
	0,			/* no parentdata */
	DC_BUSY,		/* busses are always busy */
	"ISA bus",
	DC_CLS_BUS		/* class */
};

static inthand_t *fastintr[ICU_LEN] = {
	&IDTVEC(fastintr0), &IDTVEC(fastintr1),
	&IDTVEC(fastintr2), &IDTVEC(fastintr3),
	&IDTVEC(fastintr4), &IDTVEC(fastintr5),
	&IDTVEC(fastintr6), &IDTVEC(fastintr7),
	&IDTVEC(fastintr8), &IDTVEC(fastintr9),
	&IDTVEC(fastintr10), &IDTVEC(fastintr11),
	&IDTVEC(fastintr12), &IDTVEC(fastintr13),
	&IDTVEC(fastintr14), &IDTVEC(fastintr15)
};

static inthand_t *slowintr[ICU_LEN] = {
	&IDTVEC(intr0), &IDTVEC(intr1), &IDTVEC(intr2), &IDTVEC(intr3),
	&IDTVEC(intr4), &IDTVEC(intr5), &IDTVEC(intr6), &IDTVEC(intr7),
	&IDTVEC(intr8), &IDTVEC(intr9), &IDTVEC(intr10), &IDTVEC(intr11),
	&IDTVEC(intr12), &IDTVEC(intr13), &IDTVEC(intr14), &IDTVEC(intr15)
};
static void
ia32_syscall_enable(void *dummy)
{

	setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0);
}
/*
 * AP cpu's call this to sync up protected mode.
 *
 * WARNING! %gs is not set up on entry. This routine sets up %gs.
 */
void
init_secondary(void)
{
	int gsel_tss;
	int x, myid = bootAP;
	u_int64_t msr, cr0;
	struct mdglobaldata *md;
	struct privatespace *ps;

	ps = &CPU_prvspace[myid];

	gdt_segs[GPROC0_SEL].ssd_base = (long)&ps->mdglobaldata.gd_common_tss;
	ps->mdglobaldata.mi.gd_prvspace = ps;

	/* We fill the 32-bit segment descriptors */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x]);
	}
	/* And now a 64-bit one */
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[myid * NGDT + GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)&gdt[myid * NGDT];
	lgdt(&r_gdt);			/* does magic intra-segment return */

	/* lgdt() destroys the GSBASE value, so we load GSBASE after lgdt() */
	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)ps);
	wrmsr(MSR_KGSBASE, 0);	/* XXX User value while we're in the kernel */

	lidt(&r_idt_arr[mdcpu->mi.gd_cpuid]);

#if 0
	lldt(_default_ldt);
	mdcpu->gd_currentldt = _default_ldt;
#endif

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gdt[myid * NGDT + GPROC0_SEL].sd_type = SDT_SYSTSS;

	md = mdcpu;	/* loaded through %gs:0 (mdglobaldata.mi.gd_prvspace) */
	md->gd_common_tss.tss_rsp0 = 0;	/* not used until after switch */
#if 0 /* JG XXX */
	md->gd_common_tss.tss_ioopt = (sizeof md->gd_common_tss) << 16;
#endif
	md->gd_tss_gdt = &gdt[myid * NGDT + GPROC0_SEL];
	md->gd_common_tssd = *md->gd_tss_gdt;

	/* double fault stack */
	md->gd_common_tss.tss_ist1 = (long)&md->mi.gd_prvspace->idlestack[
	    sizeof(md->mi.gd_prvspace->idlestack)];

	ltr(gsel_tss);

	/*
	 * Set to a known state:
	 *	Set by mpboot.s: CR0_PG, CR0_PE
	 *	Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	cr0 = rcr0();
	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
	load_cr0(cr0);

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL);

	pmap_set_opt();		/* PSE/4MB pages, etc */
	pmap_init_pat();	/* Page Attribute Table */

	/* set up CPU registers and state */
	cpu_setregs();

	/* set up SSE/NX registers */
	initializecpu(myid);

	/* set up FPU state on the AP */
	npxinit(__INITIAL_FPUCW__);

	/* disable the APIC, just to be SURE */
	lapic->svr &= ~APIC_SVR_ENABLE;
}
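/*
 * Illustrative sketch (not kernel code) of the MSR_STAR packing above:
 * bits 47:32 hold the kernel CS selector used by SYSCALL and bits 63:48
 * the base selector used by SYSRET. The selector values below are made-up
 * stand-ins for GSEL(GCODE_SEL, SEL_KPL) and GSEL(GUCODE32_SEL, SEL_UPL).
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint16_t kernel_cs = 0x08;	/* assumed kernel code selector */
	uint16_t sysret_base = 0x1b;	/* assumed 32-bit user base selector */
	uint64_t star = ((uint64_t)kernel_cs << 32) |
	    ((uint64_t)sysret_base << 48);

	printf("STAR = %#018llx\n", (unsigned long long)star);
	printf("SYSCALL CS = %#x, SYSRET base = %#x\n",
	    (unsigned)((star >> 32) & 0xffff),
	    (unsigned)((star >> 48) & 0xffff));
	return (0);
}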
void
trap(struct trapframe *frame)
{
	struct globaldata *gd = mycpu;
	struct thread *td = gd->gd_curthread;
	struct lwp *lp = td->td_lwp;
	struct proc *p;
	int sticks = 0;
	int i = 0, ucode = 0, type, code;
	int have_mplock = 0;
#ifdef INVARIANTS
	int crit_count = td->td_critcount;
	lwkt_tokref_t curstop = td->td_toks_stop;
#endif
	vm_offset_t eva;

	p = td->td_proc;
#ifdef DDB
	/*
	 * We need to allow T_DNA faults when the debugger is active since
	 * some dumping paths do large bcopy() which use the floating
	 * point registers for faster copying.
	 */
	if (db_active && frame->tf_trapno != T_DNA) {
		eva = (frame->tf_trapno == T_PAGEFLT ? rcr2() : 0);
		++gd->gd_trap_nesting_level;
		MAKEMPSAFE(have_mplock);
		trap_fatal(frame, eva);
		--gd->gd_trap_nesting_level;
		goto out2;
	}
#endif

	eva = 0;

	++gd->gd_trap_nesting_level;
	if (frame->tf_trapno == T_PAGEFLT) {
		/*
		 * For some Cyrix CPUs, %cr2 is clobbered by interrupts.
		 * This problem is worked around by using an interrupt
		 * gate for the pagefault handler. We are finally ready
		 * to read %cr2 and then must reenable interrupts.
		 *
		 * XXX this should be in the switch statement, but the
		 * NO_F00F_HACK and VM86 goto and ifdefs obfuscate the
		 * flow of control too much for this to be obviously
		 * correct.
		 */
		eva = rcr2();
		cpu_enable_intr();
	}
	--gd->gd_trap_nesting_level;

	if (!(frame->tf_eflags & PSL_I)) {
		/*
		 * Buggy application or kernel code has disabled interrupts
		 * and then trapped. Enabling interrupts now is wrong, but
		 * it is better than running with interrupts disabled until
		 * they are accidentally enabled later.
		 */
		type = frame->tf_trapno;
		if (ISPL(frame->tf_cs) == SEL_UPL ||
		    (frame->tf_eflags & PSL_VM)) {
			MAKEMPSAFE(have_mplock);
			kprintf(
			    "pid %ld (%s): trap %d with interrupts disabled\n",
			    (long)curproc->p_pid, curproc->p_comm, type);
		} else if (type != T_BPTFLT && type != T_TRCTRAP) {
			/*
			 * XXX not quite right, since this may be for a
			 * multiple fault in user mode.
			 */
			MAKEMPSAFE(have_mplock);
			kprintf("kernel trap %d with interrupts disabled\n",
			    type);
		}
		cpu_enable_intr();
	}

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
restart:
#endif
	type = frame->tf_trapno;
	code = frame->tf_err;

	if (in_vm86call) {
		if (frame->tf_eflags & PSL_VM &&
		    (type == T_PROTFLT || type == T_STKFLT)) {
			KKASSERT(get_mplock_count(curthread) > 0);
			i = vm86_emulate((struct vm86frame *)frame);
			KKASSERT(get_mplock_count(curthread) > 0);
			if (i != 0) {
				/*
				 * returns to original process
				 */
				vm86_trap((struct vm86frame *)frame,
				    have_mplock);
				KKASSERT(0); /* NOT REACHED */
			}
			goto out2;
		}
		switch (type) {
		/*
		 * these traps want either a process context, or
		 * assume a normal userspace trap.
		 */
		case T_PROTFLT:
		case T_SEGNPFLT:
			trap_fatal(frame, eva);
			goto out2;
		case T_TRCTRAP:
			type = T_BPTFLT;	/* kernel breakpoint */
			/* FALL THROUGH */
		}
		goto kernel_trap;	/* normal kernel trap handling */
	}

	if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
		/* user trap */

		KTR_LOG(kernentry_trap, p->p_pid, lp->lwp_tid,
		    frame->tf_trapno, eva);

		userenter(td, p);

		sticks = (int)td->td_sticks;
		lp->lwp_md.md_regs = frame;

		switch (type) {
		case T_PRIVINFLT:	/* privileged instruction fault */
			i = SIGILL;
			ucode = ILL_PRVOPC;
			break;

		case T_BPTFLT:		/* bpt instruction fault */
		case T_TRCTRAP:		/* trace trap */
			frame->tf_eflags &= ~PSL_T;
			i = SIGTRAP;
			ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
			ucode = code;
			i = SIGFPE;
			break;

		case T_ASTFLT:		/* Allow process switch */
			mycpu->gd_cnt.v_soft++;
			if (mycpu->gd_reqflags & RQF_AST_OWEUPC) {
				atomic_clear_int(&mycpu->gd_reqflags,
				    RQF_AST_OWEUPC);
				addupc_task(p, p->p_prof.pr_addr,
				    p->p_prof.pr_ticks);
			}
			goto out;

		/*
		 * The following two traps can happen in
		 * vm86 mode, and, if so, we want to handle
		 * them specially.
		 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame->tf_eflags & PSL_VM) {
				i = vm86_emulate((struct vm86frame *)frame);
				if (i == 0)
					goto out;
				break;
			}
			i = SIGBUS;
			ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
			break;
		case T_SEGNPFLT:	/* segment not present fault */
			i = SIGBUS;
			ucode = BUS_ADRERR;
			break;
		case T_TSSFLT:		/* invalid TSS fault */
		case T_DOUBLEFLT:	/* double fault */
		default:
			i = SIGBUS;
			ucode = BUS_OBJERR;
			break;

		case T_PAGEFLT:		/* page fault */
			i = trap_pfault(frame, TRUE, eva);
			if (i == -1)
				goto out;
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
			if (i == -2)
				goto restart;
#endif
			if (i == 0)
				goto out;

			if (i == SIGSEGV)
				ucode = SEGV_MAPERR;
			else {
				i = SIGSEGV;
				ucode = SEGV_ACCERR;
			}
			break;

		case T_DIVIDE:		/* integer divide fault */
			ucode = FPE_INTDIV;
			i = SIGFPE;
			break;

#if NISA > 0
		case T_NMI:
			MAKEMPSAFE(have_mplock);
#ifdef POWERFAIL_NMI
			goto handle_powerfail;
#else /* !POWERFAIL_NMI */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					kprintf("NMI ... going to debugger\n");
					kdb_trap(type, 0, frame);
				}
#endif /* DDB */
				goto out2;
			} else if (panic_on_nmi)
				panic("NMI indicates hardware failure");
			break;
#endif /* POWERFAIL_NMI */
#endif /* NISA > 0 */

		case T_OFLOW:		/* integer overflow fault */
			ucode = FPE_INTOVF;
			i = SIGFPE;
			break;

		case T_BOUND:		/* bounds check fault */
			ucode = FPE_FLTSUB;
			i = SIGFPE;
			break;

		case T_DNA:
			/*
			 * Virtual kernel intercept - pass the DNA exception
			 * to the virtual kernel if it asked to handle it.
			 * This occurs when the virtual kernel is holding
			 * onto the FP context for a different emulated
			 * process than the one currently running.
			 *
			 * We must still call npxdna() since we may have
			 * saved FP state that the virtual kernel needs
			 * to hand over to a different emulated process.
			 */
			if (lp->lwp_vkernel && lp->lwp_vkernel->ve &&
			    (td->td_pcb->pcb_flags & FP_VIRTFP)
			) {
				npxdna();
				break;
			}

#if NNPX > 0
			/*
			 * The kernel may have switched out the FP unit's
			 * state, causing the user process to take a fault
			 * when it tries to use the FP unit. Restore the
			 * state here
			 */
			if (npxdna())
				goto out;
#endif
			if (!pmath_emulate) {
				i = SIGFPE;
				ucode = FPE_FPU_NP_TRAP;
				break;
			}
			i = (*pmath_emulate)(frame);
			if (i == 0) {
				if (!(frame->tf_eflags & PSL_T))
					goto out2;
				frame->tf_eflags &= ~PSL_T;
				i = SIGTRAP;
			}
			/* else ucode = emulator_only_knows() XXX */
			break;

		case T_FPOPFLT:		/* FPU operand fetch fault */
			ucode = ILL_COPROC;
			i = SIGILL;
			break;

		case T_XMMFLT:		/* SIMD floating-point exception */
			ucode = 0; /* XXX */
			i = SIGFPE;
			break;
		}
	} else {
kernel_trap:
		/* kernel trap */

		switch (type) {
		case T_PAGEFLT:			/* page fault */
			trap_pfault(frame, FALSE, eva);
			goto out2;

		case T_DNA:
#if NNPX > 0
			/*
			 * The kernel may be using npx for copying or other
			 * purposes.
			 */
			if (npxdna())
				goto out2;
#endif
			break;

		case T_PROTFLT:		/* general protection fault */
		case T_SEGNPFLT:	/* segment not present fault */
			/*
			 * Invalid segment selectors and out of bounds
			 * %eip's and %esp's can be set up in user mode.
			 * This causes a fault in kernel mode when the
			 * kernel tries to return to user mode. We want
			 * to get this fault so that we can fix the
			 * problem here and not have to check all the
			 * selectors and pointers when the user changes
			 * them.
			 */
#define MAYBE_DORETI_FAULT(where, whereto)				\
	do {								\
		if (frame->tf_eip == (int)where) {			\
			frame->tf_eip = (int)whereto;			\
			goto out2;					\
		}							\
	} while (0)
			if (mycpu->gd_intr_nesting_level == 0) {
				/*
				 * Invalid %fs's and %gs's can be created using
				 * procfs or PT_SETREGS or by invalidating the
				 * underlying LDT entry. This causes a fault
				 * in kernel mode when the kernel attempts to
				 * switch contexts. Lose the bad context
				 * (XXX) so that we can continue, and generate
				 * a signal.
				 */
				MAYBE_DORETI_FAULT(doreti_iret,
				    doreti_iret_fault);
				MAYBE_DORETI_FAULT(doreti_popl_ds,
				    doreti_popl_ds_fault);
				MAYBE_DORETI_FAULT(doreti_popl_es,
				    doreti_popl_es_fault);
				MAYBE_DORETI_FAULT(doreti_popl_fs,
				    doreti_popl_fs_fault);
				MAYBE_DORETI_FAULT(doreti_popl_gs,
				    doreti_popl_gs_fault);

				/*
				 * NOTE: cpu doesn't push esp on kernel trap
				 */
				if (td->td_pcb->pcb_onfault &&
				    td->td_pcb->pcb_onfault_sp ==
				    (int)&frame->tf_esp) {
					frame->tf_eip =
					    (register_t)td->td_pcb->pcb_onfault;
					goto out2;
				}
			}
			break;

		case T_TSSFLT:
			/*
			 * PSL_NT can be set in user mode and isn't cleared
			 * automatically when the kernel is entered. This
			 * causes a TSS fault when the kernel attempts to
			 * `iret' because the TSS link is uninitialized. We
			 * want to get this fault so that we can fix the
			 * problem here and not every time the kernel is
			 * entered.
			 */
			if (frame->tf_eflags & PSL_NT) {
				frame->tf_eflags &= ~PSL_NT;
				goto out2;
			}
			break;

		case T_TRCTRAP:		/* trace trap */
			if (frame->tf_eip == (int)IDTVEC(syscall)) {
				/*
				 * We've just entered system mode via the
				 * syscall lcall. Continue single stepping
				 * silently until the syscall handler has
				 * saved the flags.
				 */
				goto out2;
			}
			if (frame->tf_eip == (int)IDTVEC(syscall) + 1) {
				/*
				 * The syscall handler has now saved the
				 * flags. Stop single stepping it.
				 */
				frame->tf_eflags &= ~PSL_T;
				goto out2;
			}
			/*
			 * Ignore debug register trace traps due to
			 * accesses in the user's address space, which
			 * can happen under several conditions such as
			 * if a user sets a watchpoint on a buffer and
			 * then passes that buffer to a system call.
			 * We still want to get TRCTRAPS for addresses
			 * in kernel space because that is useful when
			 * debugging the kernel.
			 */
			if (user_dbreg_trap()) {
				/*
				 * Reset breakpoint bits because the
				 * processor doesn't
				 */
				load_dr6(rdr6() & 0xfffffff0);
				goto out2;
			}
			/*
			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
			 */
		case T_BPTFLT:
			/*
			 * If DDB is enabled, let it handle the debugger trap.
			 * Otherwise, debugger traps "can't happen".
			 */
			ucode = TRAP_BRKPT;
#ifdef DDB
			MAKEMPSAFE(have_mplock);
			if (kdb_trap(type, 0, frame))
				goto out2;
#endif
			break;

#if NISA > 0
		case T_NMI:
			MAKEMPSAFE(have_mplock);
#ifdef POWERFAIL_NMI
#ifndef TIMER_FREQ
#define TIMER_FREQ	1193182
#endif
handle_powerfail:
		{
			static unsigned lastalert = 0;

			if (time_uptime - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(TIMER_FREQ/880, hz);
				lastalert = time_uptime;
			}
			/* YYY mp count */
			goto out2;
		}
#else /* !POWERFAIL_NMI */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					kprintf("NMI ... going to debugger\n");
					kdb_trap(type, 0, frame);
				}
#endif /* DDB */
				goto out2;
			} else if (panic_on_nmi == 0)
				goto out2;
			/* FALL THROUGH */
#endif /* POWERFAIL_NMI */
#endif /* NISA > 0 */
		}

		MAKEMPSAFE(have_mplock);
		trap_fatal(frame, eva);
		goto out2;
	}

	/*
	 * Virtual kernel intercept - if the fault is directly related to a
	 * VM context managed by a virtual kernel then let the virtual kernel
	 * handle it.
	 */
	if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
		vkernel_trap(lp, frame);
		goto out;
	}

	/* Translate fault for emulators (e.g. Linux) */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	MAKEMPSAFE(have_mplock);
	trapsignal(lp, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s", trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", (u_long)eva);
		uprintf("\n");
	}
#endif

out:
	userret(lp, frame, sticks);
	userexit(lp);
out2:	;
	if (have_mplock)
		rel_mplock();
	if (p != NULL && lp != NULL)
		KTR_LOG(kernentry_trap_ret, p->p_pid, lp->lwp_tid);
#ifdef INVARIANTS
	KASSERT(crit_count == td->td_critcount,
	    ("trap: critical section count mismatch! %d/%d",
	    crit_count, td->td_pri));
	KASSERT(curstop == td->td_toks_stop,
	    ("trap: extra tokens held after trap! %zd/%zd",
	    curstop - &td->td_toks_base,
	    td->td_toks_stop - &td->td_toks_base));
#endif
}
/*
 * Exception, fault, and trap interface to the kernel.
 * This common code is called from assembly language IDT gate entry
 * routines that prepare a suitable stack frame, and restore this
 * frame after the exception has been processed.
 *
 * This function is also called from doreti in an interlock to handle ASTs.
 * For example: hardwareint->INTROUTINE->(set ast)->doreti->trap
 *
 * NOTE! We have to retrieve the fault address prior to potentially
 *	 blocking, including blocking on any token.
 *
 * NOTE! NMI and kernel DBG traps remain on their respective pcpu IST
 *	 stacks if taken from a kernel RPL. trap() cannot block in this
 *	 situation. DDB entry or a direct report-and-return is ok.
 *
 * XXX gd_trap_nesting_level currently prevents lwkt_switch() from panicing
 * if an attempt is made to switch from a fast interrupt or IPI.
 */
void
trap(struct trapframe *frame)
{
	static struct krate sscpubugrate = { 1 };
	struct globaldata *gd = mycpu;
	struct thread *td = gd->gd_curthread;
	struct lwp *lp = td->td_lwp;
	struct proc *p;
	int sticks = 0;
	int i = 0, ucode = 0, type, code;
#ifdef INVARIANTS
	int crit_count = td->td_critcount;
	lwkt_tokref_t curstop = td->td_toks_stop;
#endif
	vm_offset_t eva;

	p = td->td_proc;
	clear_quickret();

#ifdef DDB
	/*
	 * We need to allow T_DNA faults when the debugger is active since
	 * some dumping paths do large bcopy() which use the floating
	 * point registers for faster copying.
	 */
	if (db_active && frame->tf_trapno != T_DNA) {
		eva = (frame->tf_trapno == T_PAGEFLT ? frame->tf_addr : 0);
		++gd->gd_trap_nesting_level;
		trap_fatal(frame, eva);
		--gd->gd_trap_nesting_level;
		goto out2;
	}
#endif

	eva = 0;

	if ((frame->tf_rflags & PSL_I) == 0) {
		/*
		 * Buggy application or kernel code has disabled interrupts
		 * and then trapped. Enabling interrupts now is wrong, but
		 * it is better than running with interrupts disabled until
		 * they are accidentally enabled later.
		 */
		type = frame->tf_trapno;
		if (ISPL(frame->tf_cs) == SEL_UPL) {
			/* JG curproc can be NULL */
			kprintf(
			    "pid %ld (%s): trap %d with interrupts disabled\n",
			    (long)curproc->p_pid, curproc->p_comm, type);
		} else if ((type == T_STKFLT || type == T_PROTFLT ||
			    type == T_SEGNPFLT) &&
			   frame->tf_rip == (long)doreti_iret) {
			/*
			 * iretq fault from kernel mode during return to
			 * userland.
			 *
			 * This situation is expected, don't complain.
			 */
		} else if (type != T_NMI && type != T_BPTFLT &&
			   type != T_TRCTRAP) {
			/*
			 * XXX not quite right, since this may be for a
			 * multiple fault in user mode.
			 */
			kprintf("kernel trap %d (%s @ 0x%016jx) with "
			    "interrupts disabled\n",
			    type, td->td_comm, frame->tf_rip);
		}
		cpu_enable_intr();
	}

	type = frame->tf_trapno;
	code = frame->tf_err;

	if (ISPL(frame->tf_cs) == SEL_UPL) {
		/* user trap */

		KTR_LOG(kernentry_trap, p->p_pid, lp->lwp_tid,
		    frame->tf_trapno, eva);

		userenter(td, p);

		sticks = (int)td->td_sticks;
		KASSERT(lp->lwp_md.md_regs == frame,
		    ("Frame mismatch %p %p", lp->lwp_md.md_regs, frame));

		switch (type) {
		case T_PRIVINFLT:	/* privileged instruction fault */
			i = SIGILL;
			ucode = ILL_PRVOPC;
			break;

		case T_BPTFLT:		/* bpt instruction fault */
		case T_TRCTRAP:		/* trace trap */
			frame->tf_rflags &= ~PSL_T;
			i = SIGTRAP;
			ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
			ucode = code;
			i = SIGFPE;
			break;

		case T_ASTFLT:		/* Allow process switch */
			mycpu->gd_cnt.v_soft++;
			if (mycpu->gd_reqflags & RQF_AST_OWEUPC) {
				atomic_clear_int(&mycpu->gd_reqflags,
				    RQF_AST_OWEUPC);
				addupc_task(p, p->p_prof.pr_addr,
				    p->p_prof.pr_ticks);
			}
			goto out;

		case T_PROTFLT:		/* general protection fault */
			i = SIGBUS;
			ucode = BUS_OBJERR;
			break;
		case T_STKFLT:		/* stack fault */
		case T_SEGNPFLT:	/* segment not present fault */
			i = SIGBUS;
			ucode = BUS_ADRERR;
			break;
		case T_TSSFLT:		/* invalid TSS fault */
		case T_DOUBLEFLT:	/* double fault */
		default:
			i = SIGBUS;
			ucode = BUS_OBJERR;
			break;

		case T_PAGEFLT:		/* page fault */
			i = trap_pfault(frame, TRUE);
#ifdef DDB
			if (frame->tf_rip == 0) {
				/* used for kernel debugging only */
				while (freeze_on_seg_fault)
					tsleep(p, 0, "freeze", hz * 20);
			}
#endif
			if (i == -1 || i == 0)
				goto out;

			if (i == SIGSEGV) {
				ucode = SEGV_MAPERR;
			} else {
				i = SIGSEGV;
				ucode = SEGV_ACCERR;
			}
			break;

		case T_DIVIDE:		/* integer divide fault */
			ucode = FPE_INTDIV;
			i = SIGFPE;
			break;

#if NISA > 0
		case T_NMI:
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					kprintf("NMI ... going to debugger\n");
					kdb_trap(type, 0, frame);
				}
#endif /* DDB */
				goto out2;
			} else if (panic_on_nmi)
				panic("NMI indicates hardware failure");
			break;
#endif /* NISA > 0 */

		case T_OFLOW:		/* integer overflow fault */
			ucode = FPE_INTOVF;
			i = SIGFPE;
			break;

		case T_BOUND:		/* bounds check fault */
			ucode = FPE_FLTSUB;
			i = SIGFPE;
			break;

		case T_DNA:
			/*
			 * Virtual kernel intercept - pass the DNA exception
			 * to the virtual kernel if it asked to handle it.
			 * This occurs when the virtual kernel is holding
			 * onto the FP context for a different emulated
			 * process than the one currently running.
			 *
			 * We must still call npxdna() since we may have
			 * saved FP state that the virtual kernel needs
			 * to hand over to a different emulated process.
			 */
			if (lp->lwp_vkernel && lp->lwp_vkernel->ve &&
			    (td->td_pcb->pcb_flags & FP_VIRTFP)
			) {
				npxdna();
				break;
			}

			/*
			 * The kernel may have switched out the FP unit's
			 * state, causing the user process to take a fault
			 * when it tries to use the FP unit. Restore the
			 * state here
			 */
			if (npxdna()) {
				gd->gd_cnt.v_trap++;
				goto out;
			}
			i = SIGFPE;
			ucode = FPE_FPU_NP_TRAP;
			break;

		case T_FPOPFLT:		/* FPU operand fetch fault */
			ucode = ILL_COPROC;
			i = SIGILL;
			break;

		case T_XMMFLT:		/* SIMD floating-point exception */
			ucode = 0; /* XXX */
			i = SIGFPE;
			break;
		}
	} else {
		/* kernel trap */

		switch (type) {
		case T_PAGEFLT:		/* page fault */
			trap_pfault(frame, FALSE);
			goto out2;

		case T_DNA:
			/*
			 * The kernel is apparently using fpu for copying.
			 * XXX this should be fatal unless the kernel has
			 * registered such use.
			 */
			if (npxdna()) {
				gd->gd_cnt.v_trap++;
				goto out2;
			}
			break;

		case T_STKFLT:		/* stack fault */
		case T_PROTFLT:		/* general protection fault */
		case T_SEGNPFLT:	/* segment not present fault */
			/*
			 * Invalid segment selectors and out of bounds
			 * %rip's and %rsp's can be set up in user mode.
			 * This causes a fault in kernel mode when the
			 * kernel tries to return to user mode. We want
			 * to get this fault so that we can fix the
			 * problem here and not have to check all the
			 * selectors and pointers when the user changes
			 * them.
			 */
			if (mycpu->gd_intr_nesting_level == 0) {
				/*
				 * NOTE: in 64-bit mode traps push rsp/ss
				 * even if no ring change occurs.
				 */
				if (td->td_pcb->pcb_onfault &&
				    td->td_pcb->pcb_onfault_sp ==
				    frame->tf_rsp) {
					frame->tf_rip = (register_t)
					    td->td_pcb->pcb_onfault;
					goto out2;
				}

				/*
				 * If the iretq in doreti faults during
				 * return to user, it will be special-cased
				 * in IDTVEC(prot) to get here. We want
				 * to 'return' to doreti_iret_fault in
				 * ipl.s in approximately the same state we
				 * were in at the iretq.
				 */
				if (frame->tf_rip == (long)doreti_iret) {
					frame->tf_rip = (long)doreti_iret_fault;
					goto out2;
				}
			}
			break;

		case T_TSSFLT:
			/*
			 * PSL_NT can be set in user mode and isn't cleared
			 * automatically when the kernel is entered. This
			 * causes a TSS fault when the kernel attempts to
			 * `iret' because the TSS link is uninitialized. We
			 * want to get this fault so that we can fix the
			 * problem here and not every time the kernel is
			 * entered.
			 */
			if (frame->tf_rflags & PSL_NT) {
				frame->tf_rflags &= ~PSL_NT;
#if 0
				/* do we need this? */
				if (frame->tf_rip == (long)doreti_iret)
					frame->tf_rip = (long)doreti_iret_fault;
#endif
				goto out2;
			}
			break;

		case T_TRCTRAP:		/* trace trap */
			/*
			 * Detect historical CPU artifact on syscall or int $3
			 * entry (if not shortcutted in exception.s via
			 * DIRECT_DISALLOW_SS_CPUBUG).
			 */
			gd->gd_cnt.v_trap++;
			if (frame->tf_rip == (register_t)IDTVEC(fast_syscall)) {
				krateprintf(&sscpubugrate,
				    "Caught #DB at syscall cpu artifact\n");
				goto out2;
			}
			if (frame->tf_rip == (register_t)IDTVEC(bpt)) {
				krateprintf(&sscpubugrate,
				    "Caught #DB at int $N cpu artifact\n");
				goto out2;
			}

			/*
			 * Ignore debug register trace traps due to
			 * accesses in the user's address space, which
			 * can happen under several conditions such as
			 * if a user sets a watchpoint on a buffer and
			 * then passes that buffer to a system call.
			 * We still want to get TRCTRAPS for addresses
			 * in kernel space because that is useful when
			 * debugging the kernel.
			 */
			if (user_dbreg_trap()) {
				/*
				 * Reset breakpoint bits because the
				 * processor doesn't
				 */
				load_dr6(rdr6() & ~0xf);
				goto out2;
			}
			/*
			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
			 */
		case T_BPTFLT:
			/*
			 * If DDB is enabled, let it handle the debugger trap.
			 * Otherwise, debugger traps "can't happen".
			 */
			ucode = TRAP_BRKPT;
#ifdef DDB
			if (kdb_trap(type, 0, frame))
				goto out2;
#endif
			break;

#if NISA > 0
		case T_NMI:
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					kprintf("NMI ... going to debugger\n");
					kdb_trap(type, 0, frame);
				}
#endif /* DDB */
				goto out2;
			} else if (panic_on_nmi == 0)
				goto out2;
			/* FALL THROUGH */
#endif /* NISA > 0 */
		}
		trap_fatal(frame, 0);
		goto out2;
	}

	/*
	 * Fault from user mode, virtual kernel intercept.
	 *
	 * If the fault is directly related to a VM context managed by a
	 * virtual kernel then let the virtual kernel handle it.
	 */
	if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
		vkernel_trap(lp, frame);
		goto out;
	}

	/* Translate fault for emulators (e.g. Linux) */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	gd->gd_cnt.v_trap++;
	trapsignal(lp, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s", trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", frame->tf_addr);
		uprintf("\n");
	}
#endif

out:
	userret(lp, frame, sticks);
	userexit(lp);
out2:	;
	if (p != NULL && lp != NULL)
		KTR_LOG(kernentry_trap_ret, p->p_pid, lp->lwp_tid);
#ifdef INVARIANTS
	KASSERT(crit_count == td->td_critcount,
	    ("trap: critical section count mismatch! %d/%d",
	    crit_count, td->td_pri));
	KASSERT(curstop == td->td_toks_stop,
	    ("trap: extra tokens held after trap! %ld/%ld",
	    curstop - &td->td_toks_base,
	    td->td_toks_stop - &td->td_toks_base));
#endif
}
/*
 * Map the local APIC and setup necessary interrupt vectors.
 */
static void
native_lapic_init(vm_paddr_t addr)
{
	uint32_t ver;
	u_int regs[4];
	int i, arat;

	/*
	 * Enable x2APIC mode if possible.  Map the local APIC
	 * registers page.
	 *
	 * Keep the LAPIC registers page mapped uncached for x2APIC
	 * mode too, to have the direct map page attribute set to
	 * uncached.  This is needed to work around CPU errata present
	 * on all Intel processors.
	 */
	KASSERT(trunc_page(addr) == addr,
	    ("local APIC not aligned on a page boundary"));
	lapic_paddr = addr;
	lapic_map = pmap_mapdev(addr, PAGE_SIZE);
	if (x2apic_mode) {
		native_lapic_enable_x2apic();
		lapic_map = NULL;
	}

	/* Setup the spurious interrupt handler. */
	setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_APIC, SEL_KPL,
	    GSEL_APIC);

	/* Perform basic initialization of the BSP's local APIC. */
	lapic_enable();

	/* Set BSP's per-CPU local APIC ID. */
	PCPU_SET(apic_id, lapic_id());

	/* Local APIC timer interrupt. */
	setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC);

	/* Local APIC error interrupt. */
	setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC);

	/* XXX: Thermal interrupt */

	/* Local APIC CMCI. */
	setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC);

	if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) {
		arat = 0;
		/* Intel CPUID 0x06 EAX[2] set if APIC timer runs in C3. */
		if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_high >= 6) {
			do_cpuid(0x06, regs);
			if ((regs[0] & CPUTPM1_ARAT) != 0)
				arat = 1;
		}
		bzero(&lapic_et, sizeof(lapic_et));
		lapic_et.et_name = "LAPIC";
		lapic_et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT |
		    ET_FLAGS_PERCPU;
		lapic_et.et_quality = 600;
		if (!arat) {
			lapic_et.et_flags |= ET_FLAGS_C3STOP;
			lapic_et.et_quality -= 200;
		}
		lapic_et.et_frequency = 0;
		/* We don't know the frequency yet, so estimate the range. */
		lapic_et.et_min_period = 0x00001000LL;
		lapic_et.et_max_period = SBT_1S;
		lapic_et.et_start = lapic_et_start;
		lapic_et.et_stop = lapic_et_stop;
		lapic_et.et_priv = NULL;
		et_register(&lapic_et);
	}

	/*
	 * Set lapic_eoi_suppression after lapic_enable(), to not
	 * enable suppression in the hardware prematurely.  Note that
	 * we enable suppression by default even when the system has
	 * only one IO-APIC, since EOI is otherwise broadcast to all
	 * APIC agents, including CPUs.
	 */
	ver = lapic_read32(LAPIC_VERSION);
	if ((ver & APIC_VER_EOI_SUPPRESSION) != 0) {
		lapic_eoi_suppression = 1;
		TUNABLE_INT_FETCH("hw.lapic_eoi_suppression",
		    &lapic_eoi_suppression);
	}
}
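/*
 * Illustration: the ARAT probe in native_lapic_init() reads CPUID leaf
 * 0x06 and tests EAX bit 2 (CPUTPM1_ARAT); when set, the LAPIC timer
 * keeps ticking in deep C-states, so ET_FLAGS_C3STOP can be omitted.
 * A minimal stand-alone sketch of the same probe, assuming a GCC/Clang
 * toolchain that provides <cpuid.h> (the helper name is hypothetical,
 * not part of the driver above):
 */
#include <cpuid.h>
#include <stdbool.h>

static bool
apic_timer_always_running(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 0x06: thermal and power management feature flags. */
	if (__get_cpuid(0x06, &eax, &ebx, &ecx, &edx) == 0)
		return false;		/* leaf not supported */
	return (eax & 0x04) != 0;	/* EAX[2] == ARAT */
}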
/*
 * trap(frame): exception, fault, and trap interface to BSD kernel.
 *
 * This common code is called from assembly language IDT gate entry routines
 * that prepare a suitable stack frame, and restore this frame after the
 * exception has been processed.  Note that the effect is as if the arguments
 * were passed call by reference.
 */
void
trap(struct trapframe *frame)
{
	struct lwp *l = curlwp;
	struct proc *p;
	struct pcb *pcb;
	extern char fusubail[], kcopy_fault[], return_address_fault[],
	    IDTVEC(osyscall)[];
	struct trapframe *vframe;
	ksiginfo_t ksi;
	void *onfault;
	int type, error;
	uint32_t cr2;
	bool pfail;

	if (__predict_true(l != NULL)) {
		pcb = lwp_getpcb(l);
		p = l->l_proc;
	} else {
		/*
		 * This can happen, e.g. on breakpoints early during boot.
		 */
		pcb = NULL;
		p = NULL;
	}

	type = frame->tf_trapno;

#ifdef DEBUG
	if (trapdebug) {
		trap_print(frame, l);
	}
#endif
	if (type != T_NMI && !KERNELMODE(frame->tf_cs, frame->tf_eflags)) {
		type |= T_USER;
		l->l_md.md_regs = frame;
		pcb->pcb_cr2 = 0;
		LWP_CACHE_CREDS(l, p);
	}

#ifdef KDTRACE_HOOKS
	/*
	 * A trap can occur while DTrace executes a probe.  Before
	 * executing the probe, DTrace blocks re-scheduling and sets
	 * a flag in its per-cpu flags to indicate that it doesn't
	 * want to fault.  On returning from the probe, the no-fault
	 * flag is cleared and finally re-scheduling is enabled.
	 *
	 * If the DTrace kernel module has registered a trap handler,
	 * call it and if it returns non-zero, assume that it has
	 * handled the trap and modified the trap frame so that this
	 * function can return normally.
	 */
	if ((type == T_PROTFLT || type == T_PAGEFLT) &&
	    dtrace_trap_func != NULL) {
		if ((*dtrace_trap_func)(frame, type)) {
			return;
		}
	}
#endif

	switch (type) {

	case T_ASTFLT:
		/*FALLTHROUGH*/

	default:
	we_re_toast:
		if (type == T_TRCTRAP)
			check_dr0();
		else
			trap_print(frame, l);

		if (kdb_trap(type, 0, frame))
			return;
		if (kgdb_trap(type, frame))
			return;
		/*
		 * If this is a breakpoint, don't panic if we're not connected.
		 */
		if (type == T_BPTFLT && kgdb_disconnected()) {
			printf("kgdb: ignored %s\n", trap_type[type]);
			return;
		}
		panic("trap");
		/*NOTREACHED*/

	case T_PROTFLT:
	case T_SEGNPFLT:
	case T_ALIGNFLT:
	case T_TSSFLT:
		if (p == NULL)
			goto we_re_toast;
		/* Check for copyin/copyout fault. */
		onfault = onfault_handler(pcb, frame);
		if (onfault != NULL) {
copyefault:
			error = EFAULT;
copyfault:
			frame->tf_eip = (uintptr_t)onfault;
			frame->tf_eax = error;
			return;
		}

		/*
		 * Check for failure during return to user mode.
		 * This can happen loading invalid values into the segment
		 * registers, or during the 'iret' itself.
		 *
		 * We do this by looking at the instruction we faulted on.
		 * The specific instructions we recognize only happen when
		 * returning from a trap, syscall, or interrupt.
		 */

kernelfault:
		KSI_INIT_TRAP(&ksi);
		ksi.ksi_signo = SIGSEGV;
		ksi.ksi_code = SEGV_ACCERR;
		ksi.ksi_trap = type;

		switch (*(u_char *)frame->tf_eip) {
		case 0xcf:	/* iret */
			/*
			 * The 'iret' instruction faulted, so we have the
			 * 'user' registers saved after the kernel %eip:%cs:%fl
			 * of the 'iret' and below that the user %eip:%cs:%fl
			 * the 'iret' was processing.
			 * We must delete the 3 words of kernel return address
			 * from the stack to generate a normal stack frame
			 * (e.g. for sending a SIGSEGV).
			 */
			vframe = (void *)((int *)frame + 3);
			if (KERNELMODE(vframe->tf_cs, vframe->tf_eflags))
				goto we_re_toast;
			memmove(vframe, frame,
			    offsetof(struct trapframe, tf_eip));
			/* Set the faulting address to the user %eip */
			ksi.ksi_addr = (void *)vframe->tf_eip;
			break;
		case 0x8e:
			switch (*(uint32_t *)frame->tf_eip) {
			case 0x8e242c8e:	/* mov (%esp,%gs), then */
			case 0x0424648e:	/* mov 0x4(%esp),%fs */
			case 0x0824448e:	/* mov 0x8(%esp),%es */
			case 0x0c245c8e:	/* mov 0xc(%esp),%ds */
				break;
			default:
				goto we_re_toast;
			}
			/*
			 * We faulted loading one of the user segment
			 * registers.  The stack frame containing the user
			 * registers is still valid and is just below the
			 * %eip:%cs:%fl of the kernel fault frame.
			 */
			vframe = (void *)(&frame->tf_eflags + 1);
			if (KERNELMODE(vframe->tf_cs, vframe->tf_eflags))
				goto we_re_toast;
			/* There is no valid address for the fault */
			break;
		default:
			goto we_re_toast;
		}

		/*
		 * We might have faulted trying to execute the
		 * trampoline for a local (nested) signal handler.
		 * Only generate SIGSEGV if the user %cs isn't changed.
		 * (This is only strictly necessary in the 'iret' case.)
		 */
		if (!pmap_exec_fixup(&p->p_vmspace->vm_map, vframe, pcb)) {
			/* Save outer frame for any signal return */
			l->l_md.md_regs = vframe;
			(*p->p_emul->e_trapsignal)(l, &ksi);
		}
		/* Return to user by reloading the user frame */
		trap_return_fault_return(vframe);
		/* NOTREACHED */

	case T_PROTFLT|T_USER:		/* protection fault */
	case T_TSSFLT|T_USER:
	case T_SEGNPFLT|T_USER:
	case T_STKFLT|T_USER:
	case T_ALIGNFLT|T_USER:
		KSI_INIT_TRAP(&ksi);

		ksi.ksi_addr = (void *)rcr2();
		switch (type) {
		case T_SEGNPFLT|T_USER:
		case T_STKFLT|T_USER:
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_ADRERR;
			break;
		case T_TSSFLT|T_USER:
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_OBJERR;
			break;
		case T_ALIGNFLT|T_USER:
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_ADRALN;
			break;
		case T_PROTFLT|T_USER:
#ifdef VM86
			if (frame->tf_eflags & PSL_VM) {
				vm86_gpfault(l, type & ~T_USER);
				goto out;
			}
#endif
			/*
			 * If pmap_exec_fixup does something,
			 * let's retry the trap.
			 */
			if (pmap_exec_fixup(&p->p_vmspace->vm_map, frame,
			    pcb)) {
				goto out;
			}
			ksi.ksi_signo = SIGSEGV;
			ksi.ksi_code = SEGV_ACCERR;
			break;
		default:
			KASSERT(0);
			break;
		}
		goto trapsignal;

	case T_PRIVINFLT|T_USER:	/* privileged instruction fault */
	case T_FPOPFLT|T_USER:		/* coprocessor operand fault */
		KSI_INIT_TRAP(&ksi);
		ksi.ksi_signo = SIGILL;
		ksi.ksi_addr = (void *)frame->tf_eip;
		switch (type) {
		case T_PRIVINFLT|T_USER:
			ksi.ksi_code = ILL_PRVOPC;
			break;
		case T_FPOPFLT|T_USER:
			ksi.ksi_code = ILL_COPROC;
			break;
		default:
			ksi.ksi_code = 0;
			break;
		}
		goto trapsignal;

	case T_ASTFLT|T_USER:
		/* Allow process switch. */
		//curcpu()->ci_data.cpu_nast++;
		if (l->l_pflag & LP_OWEUPC) {
			l->l_pflag &= ~LP_OWEUPC;
			ADDUPROF(l);
		}
		/* Allow a forced task switch. */
		if (curcpu()->ci_want_resched) {
			preempt();
		}
		goto out;

	case T_BOUND|T_USER:
	case T_OFLOW|T_USER:
	case T_DIVIDE|T_USER:
		KSI_INIT_TRAP(&ksi);
		ksi.ksi_signo = SIGFPE;
		ksi.ksi_addr = (void *)frame->tf_eip;
		switch (type) {
		case T_BOUND|T_USER:
			ksi.ksi_code = FPE_FLTSUB;
			break;
		case T_OFLOW|T_USER:
			ksi.ksi_code = FPE_INTOVF;
			break;
		case T_DIVIDE|T_USER:
			ksi.ksi_code = FPE_INTDIV;
			break;
		default:
			ksi.ksi_code = 0;
			break;
		}
		goto trapsignal;

	case T_PAGEFLT:
		/* Allow page faults in kernel mode. */
		if (__predict_false(l == NULL))
			goto we_re_toast;

		/*
		 * fusubail is used by [fs]uswintr() to prevent page faulting
		 * from inside the profiling interrupt.
		 */
		onfault = pcb->pcb_onfault;
		if (onfault == fusubail || onfault == return_address_fault) {
			goto copyefault;
		}
		if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) {
			goto we_re_toast;
		}

		cr2 = rcr2();
		goto faultcommon;

	case T_PAGEFLT|T_USER: {	/* page fault */
		register vaddr_t va;
		register struct vmspace *vm;
		register struct vm_map *map;
		vm_prot_t ftype;
		extern struct vm_map *kernel_map;

		cr2 = rcr2();
faultcommon:
		vm = p->p_vmspace;
		if (__predict_false(vm == NULL)) {
			goto we_re_toast;
		}
		pcb->pcb_cr2 = cr2;
		va = trunc_page((vaddr_t)cr2);
		/*
		 * It is only a kernel address space fault iff:
		 *	1. (type & T_USER) == 0 and
		 *	2. pcb_onfault not set or
		 *	3. pcb_onfault set but supervisor space fault
		 * The last can occur during an exec() copyin where the
		 * argument space is lazy-allocated.
		 */
		if (type == T_PAGEFLT && va >= KERNBASE)
			map = kernel_map;
		else
			map = &vm->vm_map;
		if (frame->tf_err & PGEX_W)
			ftype = VM_PROT_WRITE;
		else if (frame->tf_err & PGEX_X)
			ftype = VM_PROT_EXECUTE;
		else
			ftype = VM_PROT_READ;

#ifdef DIAGNOSTIC
		if (map == kernel_map && va == 0) {
			printf("trap: bad kernel access at %lx\n", va);
			goto we_re_toast;
		}
#endif
		/* Fault the original page in. */
		onfault = pcb->pcb_onfault;
		pcb->pcb_onfault = NULL;
		error = uvm_fault(map, va, ftype);
		pcb->pcb_onfault = onfault;
		if (error == 0) {
			if (map != kernel_map && (void *)va >= vm->vm_maxsaddr)
				uvm_grow(p, va);

			pfail = false;
			while (type == T_PAGEFLT) {
				/*
				 * We need to switch pmap now if we're in
				 * the middle of copyin/out.
				 *
				 * But we don't need to do so for kcopy as
				 * it never touches userspace.
				 */
				kpreempt_disable();
				if (curcpu()->ci_want_pmapload) {
					onfault = onfault_handler(pcb, frame);
					if (onfault != kcopy_fault) {
						pmap_load();
					}
				}
				/*
				 * We need to keep the pmap loaded and
				 * so avoid being preempted until back
				 * into the copy functions.  Disable
				 * interrupts at the hardware level before
				 * re-enabling preemption.  Interrupts
				 * will be re-enabled by 'iret' when
				 * returning back out of the trap stub.
				 * They'll only be re-enabled when the
				 * program counter is once again in
				 * the copy functions, and so visible
				 * to cpu_kpreempt_exit().
				 */
#ifndef XEN
				x86_disable_intr();
#endif
				l->l_nopreempt--;
				if (l->l_nopreempt > 0 || !l->l_dopreempt ||
				    pfail) {
					return;
				}
#ifndef XEN
				x86_enable_intr();
#endif
				/*
				 * If preemption fails for some reason,
				 * don't retry it.  The conditions won't
				 * change under our nose.
				 */
				pfail = kpreempt(0);
			}
			goto out;
		}

		if (type == T_PAGEFLT) {
			onfault = onfault_handler(pcb, frame);
			if (onfault != NULL)
				goto copyfault;
			printf("uvm_fault(%p, %#lx, %d) -> %#x\n",
			    map, va, ftype, error);
			goto kernelfault;
		}

		KSI_INIT_TRAP(&ksi);
		ksi.ksi_trap = type & ~T_USER;
		ksi.ksi_addr = (void *)cr2;
		switch (error) {
		case EINVAL:
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_ADRERR;
			break;
		case EACCES:
			ksi.ksi_signo = SIGSEGV;
			ksi.ksi_code = SEGV_ACCERR;
			error = EFAULT;
			break;
		case ENOMEM:
			ksi.ksi_signo = SIGKILL;
			printf("UVM: pid %d.%d (%s), uid %d killed: "
			    "out of swap\n", p->p_pid, l->l_lid, p->p_comm,
			    l->l_cred ? kauth_cred_geteuid(l->l_cred) : -1);
			break;
		default:
			ksi.ksi_signo = SIGSEGV;
			ksi.ksi_code = SEGV_MAPERR;
			break;
		}

#ifdef TRAP_SIGDEBUG
		printf("pid %d.%d (%s): signal %d at eip %x addr %lx "
		    "error %d\n", p->p_pid, l->l_lid, p->p_comm, ksi.ksi_signo,
		    frame->tf_eip, va, error);
#endif
		(*p->p_emul->e_trapsignal)(l, &ksi);
		break;
	}

	case T_TRCTRAP:
		/* Check whether they single-stepped into a lcall. */
		if (frame->tf_eip == (int)IDTVEC(osyscall))
			return;
		if (frame->tf_eip == (int)IDTVEC(osyscall) + 1) {
			frame->tf_eflags &= ~PSL_T;
			return;
		}
		goto we_re_toast;

	case T_BPTFLT|T_USER:		/* bpt instruction fault */
	case T_TRCTRAP|T_USER:		/* trace trap */
		/*
		 * Don't go single-stepping into a RAS.
		 */
		if (p->p_raslist == NULL ||
		    (ras_lookup(p, (void *)frame->tf_eip) == (void *)-1)) {
			KSI_INIT_TRAP(&ksi);
			ksi.ksi_signo = SIGTRAP;
			ksi.ksi_trap = type & ~T_USER;
			if (type == (T_BPTFLT|T_USER))
				ksi.ksi_code = TRAP_BRKPT;
			else
				ksi.ksi_code = TRAP_TRACE;
			ksi.ksi_addr = (void *)frame->tf_eip;
			(*p->p_emul->e_trapsignal)(l, &ksi);
		}
		break;

	case T_NMI:
		if (nmi_dispatch(frame))
			return;
		/* NMI can be hooked up to a pushbutton for debugging */
		if (kgdb_trap(type, frame))
			return;
		if (kdb_trap(type, 0, frame))
			return;
		/* machine/parity/power fail/"kitchen sink" faults */
#if NMCA > 0
		mca_nmi();
#endif
		x86_nmi();
	}

	if ((type & T_USER) == 0)
		return;
out:
	userret(l);
	return;
trapsignal:
	ksi.ksi_trap = type & ~T_USER;
	(*p->p_emul->e_trapsignal)(l, &ksi);
	userret(l);
}
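/*
 * Illustration: the page-fault path above derives the access type from
 * the hardware error code pushed by the CPU: bit 1 (PGEX_W) set means a
 * write access, bit 4 (PGEX_X) an instruction fetch, anything else a
 * read.  Below is a stand-alone sketch of that decode; the helper and
 * the locally defined constants are illustrative (the PGEX_* values
 * follow the x86 architecture, the VM_PROT_* values the BSD convention).
 */
#define PGEX_W		0x02	/* fault was caused by a write */
#define PGEX_X		0x10	/* fault was an instruction fetch */

#define VM_PROT_READ	0x01
#define VM_PROT_WRITE	0x02
#define VM_PROT_EXECUTE	0x04

static unsigned int
pgex_to_prot(unsigned int err)
{
	if (err & PGEX_W)
		return VM_PROT_WRITE;
	if (err & PGEX_X)
		return VM_PROT_EXECUTE;
	return VM_PROT_READ;
}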