/*
 * Report critical errors.  ip may be NULL.
 */
void
hammer_critical_error(hammer_mount_t hmp, hammer_inode_t ip,
		      int error, const char *msg)
{
	hmp->flags |= HAMMER_MOUNT_CRITICAL_ERROR;

	krateprintf(&hmp->krate,
		    "HAMMER(%s): Critical error inode=%jd error=%d %s\n",
		    hmp->mp->mnt_stat.f_mntfromname,
		    (intmax_t)(ip ? ip->obj_id : -1),
		    error, msg);

	if (hmp->ronly == 0) {
		hmp->ronly = 2;		/* special errored read-only mode */
		hmp->mp->mnt_flag |= MNT_RDONLY;
		RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
			hammer_adjust_volume_mode, NULL);
		kprintf("HAMMER(%s): Forcing read-only mode\n",
			hmp->mp->mnt_stat.f_mntfromname);
	}
	hmp->error = error;
	if (hammer_debug_critical)
		Debugger("Entering debugger");
}
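For context, a minimal sketch of how a caller might report a failure through hammer_critical_error(); the helper name and the error/message values are hypothetical, and only the hammer_critical_error() signature is taken from the code above.

/*
 * Illustrative only: a hypothetical caller reporting an I/O error against
 * an inode.  Everything except hammer_critical_error() is an assumption.
 */
static void
example_hammer_io_failed(hammer_mount_t hmp, hammer_inode_t ip, int error)
{
	if (error) {
		hammer_critical_error(hmp, ip, error,
				      "while flushing inode meta-data");
	}
}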
/*
 * Exception, fault, and trap interface to the kernel.
 * This common code is called from assembly language IDT gate entry
 * routines that prepare a suitable stack frame, and restore this
 * frame after the exception has been processed.
 *
 * This function is also called from doreti in an interlock to handle ASTs.
 * For example:  hardwareint->INTROUTINE->(set ast)->doreti->trap
 *
 * NOTE!  We have to retrieve the fault address prior to potentially
 *	  blocking, including blocking on any token.
 *
 * NOTE!  NMI and kernel DBG traps remain on their respective pcpu IST
 *	  stacks if taken from a kernel RPL.  trap() cannot block in this
 *	  situation.  DDB entry or a direct report-and-return is ok.
 *
 * XXX gd_trap_nesting_level currently prevents lwkt_switch() from panicking
 * if an attempt is made to switch from a fast interrupt or IPI.
 */
void
trap(struct trapframe *frame)
{
	static struct krate sscpubugrate = { 1 };
	struct globaldata *gd = mycpu;
	struct thread *td = gd->gd_curthread;
	struct lwp *lp = td->td_lwp;
	struct proc *p;
	int sticks = 0;
	int i = 0, ucode = 0, type, code;
#ifdef INVARIANTS
	int crit_count = td->td_critcount;
	lwkt_tokref_t curstop = td->td_toks_stop;
#endif
	vm_offset_t eva;

	p = td->td_proc;
	clear_quickret();

#ifdef DDB
	/*
	 * We need to allow T_DNA faults when the debugger is active since
	 * some dumping paths do large bcopy() which use the floating
	 * point registers for faster copying.
	 */
	if (db_active && frame->tf_trapno != T_DNA) {
		eva = (frame->tf_trapno == T_PAGEFLT ? frame->tf_addr : 0);
		++gd->gd_trap_nesting_level;
		trap_fatal(frame, eva);
		--gd->gd_trap_nesting_level;
		goto out2;
	}
#endif

	eva = 0;

	if ((frame->tf_rflags & PSL_I) == 0) {
		/*
		 * Buggy application or kernel code has disabled interrupts
		 * and then trapped.  Enabling interrupts now is wrong, but
		 * it is better than running with interrupts disabled until
		 * they are accidentally enabled later.
		 */
		type = frame->tf_trapno;
		if (ISPL(frame->tf_cs) == SEL_UPL) {
			/* JG curproc can be NULL */
			kprintf(
			    "pid %ld (%s): trap %d with interrupts disabled\n",
			    (long)curproc->p_pid, curproc->p_comm, type);
		} else if ((type == T_STKFLT || type == T_PROTFLT ||
			    type == T_SEGNPFLT) &&
			   frame->tf_rip == (long)doreti_iret) {
			/*
			 * iretq fault from kernel mode during return to
			 * userland.
			 *
			 * This situation is expected, don't complain.
			 */
		} else if (type != T_NMI && type != T_BPTFLT &&
			   type != T_TRCTRAP) {
			/*
			 * XXX not quite right, since this may be for a
			 * multiple fault in user mode.
			 */
			kprintf("kernel trap %d (%s @ 0x%016jx) with "
				"interrupts disabled\n",
				type, td->td_comm, frame->tf_rip);
		}
		cpu_enable_intr();
	}

	type = frame->tf_trapno;
	code = frame->tf_err;

	if (ISPL(frame->tf_cs) == SEL_UPL) {
		/* user trap */

		KTR_LOG(kernentry_trap, p->p_pid, lp->lwp_tid,
			frame->tf_trapno, eva);

		userenter(td, p);

		sticks = (int)td->td_sticks;
		KASSERT(lp->lwp_md.md_regs == frame,
			("Frame mismatch %p %p", lp->lwp_md.md_regs, frame));

		switch (type) {
		case T_PRIVINFLT:	/* privileged instruction fault */
			i = SIGILL;
			ucode = ILL_PRVOPC;
			break;

		case T_BPTFLT:		/* bpt instruction fault */
		case T_TRCTRAP:		/* trace trap */
			frame->tf_rflags &= ~PSL_T;
			i = SIGTRAP;
			ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
			ucode = code;
			i = SIGFPE;
			break;

		case T_ASTFLT:		/* Allow process switch */
			mycpu->gd_cnt.v_soft++;
			if (mycpu->gd_reqflags & RQF_AST_OWEUPC) {
				atomic_clear_int(&mycpu->gd_reqflags,
						 RQF_AST_OWEUPC);
				addupc_task(p, p->p_prof.pr_addr,
					    p->p_prof.pr_ticks);
			}
			goto out;

		case T_PROTFLT:		/* general protection fault */
			i = SIGBUS;
			ucode = BUS_OBJERR;
			break;
		case T_STKFLT:		/* stack fault */
		case T_SEGNPFLT:	/* segment not present fault */
			i = SIGBUS;
			ucode = BUS_ADRERR;
			break;
		case T_TSSFLT:		/* invalid TSS fault */
		case T_DOUBLEFLT:	/* double fault */
		default:
			i = SIGBUS;
			ucode = BUS_OBJERR;
			break;

		case T_PAGEFLT:		/* page fault */
			i = trap_pfault(frame, TRUE);
#ifdef DDB
			if (frame->tf_rip == 0) {
				/* used for kernel debugging only */
				while (freeze_on_seg_fault)
					tsleep(p, 0, "freeze", hz * 20);
			}
#endif
			if (i == -1 || i == 0)
				goto out;
			if (i == SIGSEGV) {
				ucode = SEGV_MAPERR;
			} else {
				i = SIGSEGV;
				ucode = SEGV_ACCERR;
			}
			break;

		case T_DIVIDE:		/* integer divide fault */
			ucode = FPE_INTDIV;
			i = SIGFPE;
			break;

#if NISA > 0
		case T_NMI:
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					kprintf("NMI ... going to debugger\n");
					kdb_trap(type, 0, frame);
				}
#endif /* DDB */
				goto out2;
			} else if (panic_on_nmi)
				panic("NMI indicates hardware failure");
			break;
#endif /* NISA > 0 */

		case T_OFLOW:		/* integer overflow fault */
			ucode = FPE_INTOVF;
			i = SIGFPE;
			break;

		case T_BOUND:		/* bounds check fault */
			ucode = FPE_FLTSUB;
			i = SIGFPE;
			break;

		case T_DNA:
			/*
			 * Virtual kernel intercept - pass the DNA exception
			 * to the virtual kernel if it asked to handle it.
			 * This occurs when the virtual kernel is holding
			 * onto the FP context for a different emulated
			 * process than the one currently running.
			 *
			 * We must still call npxdna() since we may have
			 * saved FP state that the virtual kernel needs
			 * to hand over to a different emulated process.
			 */
			if (lp->lwp_vkernel && lp->lwp_vkernel->ve &&
			    (td->td_pcb->pcb_flags & FP_VIRTFP)) {
				npxdna();
				break;
			}

			/*
			 * The kernel may have switched out the FP unit's
			 * state, causing the user process to take a fault
			 * when it tries to use the FP unit.  Restore the
			 * state here.
			 */
			if (npxdna()) {
				gd->gd_cnt.v_trap++;
				goto out;
			}
			i = SIGFPE;
			ucode = FPE_FPU_NP_TRAP;
			break;

		case T_FPOPFLT:		/* FPU operand fetch fault */
			ucode = ILL_COPROC;
			i = SIGILL;
			break;

		case T_XMMFLT:		/* SIMD floating-point exception */
			ucode = 0; /* XXX */
			i = SIGFPE;
			break;
		}
	} else {
		/* kernel trap */

		switch (type) {
		case T_PAGEFLT:		/* page fault */
			trap_pfault(frame, FALSE);
			goto out2;

		case T_DNA:
			/*
			 * The kernel is apparently using fpu for copying.
			 * XXX this should be fatal unless the kernel has
			 * registered such use.
			 */
			if (npxdna()) {
				gd->gd_cnt.v_trap++;
				goto out2;
			}
			break;

		case T_STKFLT:		/* stack fault */
		case T_PROTFLT:		/* general protection fault */
		case T_SEGNPFLT:	/* segment not present fault */
			/*
			 * Invalid segment selectors and out of bounds
			 * %rip's and %rsp's can be set up in user mode.
			 * This causes a fault in kernel mode when the
			 * kernel tries to return to user mode.  We want
			 * to get this fault so that we can fix the
			 * problem here and not have to check all the
			 * selectors and pointers when the user changes
			 * them.
			 */
			if (mycpu->gd_intr_nesting_level == 0) {
				/*
				 * NOTE: in 64-bit mode traps push rsp/ss
				 *	 even if no ring change occurs.
				 */
				if (td->td_pcb->pcb_onfault &&
				    td->td_pcb->pcb_onfault_sp ==
				    frame->tf_rsp) {
					frame->tf_rip = (register_t)
						td->td_pcb->pcb_onfault;
					goto out2;
				}

				/*
				 * If the iretq in doreti faults during
				 * return to user, it will be special-cased
				 * in IDTVEC(prot) to get here.  We want
				 * to 'return' to doreti_iret_fault in
				 * ipl.s in approximately the same state we
				 * were in at the iretq.
				 */
				if (frame->tf_rip == (long)doreti_iret) {
					frame->tf_rip =
						(long)doreti_iret_fault;
					goto out2;
				}
			}
			break;

		case T_TSSFLT:
			/*
			 * PSL_NT can be set in user mode and isn't cleared
			 * automatically when the kernel is entered.  This
			 * causes a TSS fault when the kernel attempts to
			 * `iret' because the TSS link is uninitialized.  We
			 * want to get this fault so that we can fix the
			 * problem here and not every time the kernel is
			 * entered.
			 */
			if (frame->tf_rflags & PSL_NT) {
				frame->tf_rflags &= ~PSL_NT;
#if 0
				/* do we need this? */
				if (frame->tf_rip == (long)doreti_iret)
					frame->tf_rip =
						(long)doreti_iret_fault;
#endif
				goto out2;
			}
			break;

		case T_TRCTRAP:		/* trace trap */
			/*
			 * Detect historical CPU artifact on syscall or int $3
			 * entry (if not shortcutted in exception.s via
			 * DIRECT_DISALLOW_SS_CPUBUG).
			 */
			gd->gd_cnt.v_trap++;
			if (frame->tf_rip ==
			    (register_t)IDTVEC(fast_syscall)) {
				krateprintf(&sscpubugrate,
					"Caught #DB at syscall "
					"cpu artifact\n");
				goto out2;
			}
			if (frame->tf_rip == (register_t)IDTVEC(bpt)) {
				krateprintf(&sscpubugrate,
					"Caught #DB at int $N cpu artifact\n");
				goto out2;
			}

			/*
			 * Ignore debug register trace traps due to
			 * accesses in the user's address space, which
			 * can happen under several conditions such as
			 * if a user sets a watchpoint on a buffer and
			 * then passes that buffer to a system call.
			 * We still want to get TRCTRAPS for addresses
			 * in kernel space because that is useful when
			 * debugging the kernel.
			 */
			if (user_dbreg_trap()) {
				/*
				 * Reset breakpoint bits because the
				 * processor doesn't.
				 */
				load_dr6(rdr6() & ~0xf);
				goto out2;
			}
			/*
			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
			 */
		case T_BPTFLT:
			/*
			 * If DDB is enabled, let it handle the debugger trap.
			 * Otherwise, debugger traps "can't happen".
			 */
			ucode = TRAP_BRKPT;
#ifdef DDB
			if (kdb_trap(type, 0, frame))
				goto out2;
#endif
			break;

#if NISA > 0
		case T_NMI:
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					kprintf("NMI ... going to debugger\n");
					kdb_trap(type, 0, frame);
				}
#endif /* DDB */
				goto out2;
			} else if (panic_on_nmi == 0)
				goto out2;
			/* FALL THROUGH */
#endif /* NISA > 0 */
		}
		trap_fatal(frame, 0);
		goto out2;
	}

	/*
	 * Fault from user mode, virtual kernel intercept.
	 *
	 * If the fault is directly related to a VM context managed by a
	 * virtual kernel then let the virtual kernel handle it.
	 */
	if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
		vkernel_trap(lp, frame);
		goto out;
	}

	/* Translate fault for emulators (e.g. Linux) */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	gd->gd_cnt.v_trap++;
	trapsignal(lp, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s",
			trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", frame->tf_addr);
		uprintf("\n");
	}
#endif

out:
	userret(lp, frame, sticks);
	userexit(lp);
out2:	;
	if (p != NULL && lp != NULL)
		KTR_LOG(kernentry_trap_ret, p->p_pid, lp->lwp_tid);
#ifdef INVARIANTS
	KASSERT(crit_count == td->td_critcount,
		("trap: critical section count mismatch! %d/%d",
		crit_count, td->td_pri));
	KASSERT(curstop == td->td_toks_stop,
		("trap: extra tokens held after trap! %ld/%ld",
		curstop - &td->td_toks_base,
		td->td_toks_stop - &td->td_toks_base));
#endif
}
/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_PUTPAGES.
 *
 * This is typically called indirectly via the pageout daemon and
 * clustering has already typically occurred, so in general we ask the
 * underlying filesystem to write the data out asynchronously rather
 * than delayed.
 */
int
vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *m, int bytecount,
			     int flags, int *rtvals)
{
	int i;
	int maxsize, ncount, count;
	vm_ooffset_t poffset;
	struct uio auio;
	struct iovec aiov;
	int error;
	int ioflags;

	count = bytecount / PAGE_SIZE;

	for (i = 0; i < count; i++)
		rtvals[i] = VM_PAGER_AGAIN;

	if ((int)m[0]->pindex < 0) {
		kprintf("vnode_pager_putpages: "
			"attempt to write meta-data!!! -- 0x%lx(%x)\n",
			(long)m[0]->pindex, m[0]->dirty);
		rtvals[0] = VM_PAGER_BAD;
		return VM_PAGER_BAD;
	}

	maxsize = count * PAGE_SIZE;
	ncount = count;

	poffset = IDX_TO_OFF(m[0]->pindex);

	/*
	 * If the page-aligned write is larger than the actual file we
	 * have to invalidate pages occurring beyond the file EOF.
	 *
	 * If the file EOF resides in the middle of a page we still clear
	 * all of that page's dirty bits later on.  If we didn't it would
	 * endlessly re-write.
	 *
	 * We do not under any circumstances truncate the valid bits, as
	 * this will screw up bogus page replacement.
	 *
	 * The caller has already read-protected the pages.  The VFS must
	 * use the buffer cache to wrap the pages.  The pages might not
	 * be immediately flushed by the buffer cache but once under its
	 * control the pages themselves can wind up being marked clean
	 * and their covering buffer cache buffer can be marked dirty.
	 */
	if (poffset + maxsize > vp->v_filesize) {
		if (poffset < vp->v_filesize) {
			maxsize = vp->v_filesize - poffset;
			ncount = btoc(maxsize);
		} else {
			maxsize = 0;
			ncount = 0;
		}
		if (ncount < count) {
			for (i = ncount; i < count; i++) {
				rtvals[i] = VM_PAGER_BAD;
			}
		}
	}

	/*
	 * pageouts are already clustered, use IO_ASYNC to force a bawrite()
	 * rather than a bdwrite() to prevent paging I/O from saturating
	 * the buffer cache.  Dummy-up the sequential heuristic to cause
	 * large ranges to cluster.  If neither IO_SYNC nor IO_ASYNC is set,
	 * the system decides how to cluster.
	 */
	ioflags = IO_VMIO;
	if (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL))
		ioflags |= IO_SYNC;
	else if ((flags & VM_PAGER_CLUSTER_OK) == 0)
		ioflags |= IO_ASYNC;
	ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL : 0;
	ioflags |= IO_SEQMAX << IO_SEQSHIFT;

	aiov.iov_base = (caddr_t)0;
	aiov.iov_len = maxsize;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = poffset;
	auio.uio_segflg = UIO_NOCOPY;
	auio.uio_rw = UIO_WRITE;
	auio.uio_resid = maxsize;
	auio.uio_td = NULL;
	error = VOP_WRITE(vp, &auio, ioflags, proc0.p_ucred);
	mycpu->gd_cnt.v_vnodeout++;
	mycpu->gd_cnt.v_vnodepgsout += ncount;

	if (error) {
		krateprintf(&vbadrate,
			    "vnode_pager_putpages: I/O error %d\n", error);
	}
	if (auio.uio_resid) {
		krateprintf(&vresrate,
			    "vnode_pager_putpages: residual I/O %zd at %lu\n",
			    auio.uio_resid, (u_long)m[0]->pindex);
	}
	if (error == 0) {
		for (i = 0; i < ncount; i++) {
			rtvals[i] = VM_PAGER_OK;
			vm_page_undirty(m[i]);
		}
	}
	return rtvals[0];
}
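To show how this entry point is typically consumed (a sketch under assumptions, not actual DragonFly code): a local-media filesystem with no special putpages handling can simply delegate to vnode_pager_generic_putpages(). The wrapper name and parameter plumbing below are hypothetical; only the generic function's signature comes from the code above.

/*
 * Illustrative only: hypothetical filesystem putpages routine that falls
 * back to the generic implementation.
 */
static int
examplefs_putpages(struct vnode *vp, vm_page_t *m, int bytecount,
		   int flags, int *rtvals)
{
	/* No filesystem-specific handling; let the generic code do it */
	return (vnode_pager_generic_putpages(vp, m, bytecount, flags,
					     rtvals));
}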