/*
 * Report critical errors.  ip may be NULL.
 */
void
hammer_critical_error(hammer_mount_t hmp, hammer_inode_t ip,
		      int error, const char *msg)
{
	hmp->flags |= HAMMER_MOUNT_CRITICAL_ERROR;

	krateprintf(&hmp->krate,
		    "HAMMER(%s): Critical error inode=%jd error=%d %s\n",
		    hmp->mp->mnt_stat.f_mntfromname,
		    (intmax_t)(ip ? ip->obj_id : -1),
		    error, msg);

	if (hmp->ronly == 0) {
		hmp->ronly = 2;		/* special errored read-only mode */
		hmp->mp->mnt_flag |= MNT_RDONLY;
		RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
			hammer_adjust_volume_mode, NULL);
		kprintf("HAMMER(%s): Forcing read-only mode\n",
			hmp->mp->mnt_stat.f_mntfromname);
	}
	hmp->error = error;
	if (hammer_debug_critical)
		Debugger("Entering debugger");
}
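
/*
 * Hypothetical caller sketch (illustration only, not part of the original
 * source): an I/O completion path might report a failed media write like
 * this, passing a NULL ip when no inode is associated with the error:
 *
 *	if (io_error) {
 *		hammer_critical_error(hmp, NULL, io_error,
 *				      "Media write failure");
 *	}
 */
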
/*
 * Exception, fault, and trap interface to the kernel.
 * This common code is called from assembly language IDT gate entry
 * routines that prepare a suitable stack frame, and restore this
 * frame after the exception has been processed.
 *
 * This function is also called from doreti in an interlock to handle ASTs.
 * For example:  hardwareint->INTROUTINE->(set ast)->doreti->trap
 *
 * NOTE!  We have to retrieve the fault address prior to potentially
 *	  blocking, including blocking on any token.
 *
 * NOTE!  NMI and kernel DBG traps remain on their respective pcpu IST
 *	  stacks if taken from a kernel RPL. trap() cannot block in this
 *	  situation.  DDB entry or a direct report-and-return is ok.
 *
 * XXX gd_trap_nesting_level currently prevents lwkt_switch() from panicking
 * if an attempt is made to switch from a fast interrupt or IPI.
 */
void
trap(struct trapframe *frame)
{
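	/*
	 * krate initialized with freq 1: rate-limits the cpu-bug
	 * messages below to roughly one per second (assumed semantics
	 * of struct krate).
	 */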
	static struct krate sscpubugrate = { 1 };
	struct globaldata *gd = mycpu;
	struct thread *td = gd->gd_curthread;
	struct lwp *lp = td->td_lwp;
	struct proc *p;
	int sticks = 0;
	int i = 0, ucode = 0, type, code;
#ifdef INVARIANTS
	int crit_count = td->td_critcount;
	lwkt_tokref_t curstop = td->td_toks_stop;
#endif
	vm_offset_t eva;

	p = td->td_proc;
	clear_quickret();

#ifdef DDB
	/*
	 * We need to allow T_DNA faults when the debugger is active since
	 * some dumping paths perform large bcopy() operations which use
	 * the floating point registers for faster copying.
	 */
	if (db_active && frame->tf_trapno != T_DNA) {
		eva = (frame->tf_trapno == T_PAGEFLT ? frame->tf_addr : 0);
		++gd->gd_trap_nesting_level;
		trap_fatal(frame, eva);
		--gd->gd_trap_nesting_level;
		goto out2;
	}
#endif

	eva = 0;

	if ((frame->tf_rflags & PSL_I) == 0) {
		/*
		 * Buggy application or kernel code has disabled interrupts
		 * and then trapped.  Enabling interrupts now is wrong, but
		 * it is better than running with interrupts disabled until
		 * they are accidentally enabled later.
		 */

		type = frame->tf_trapno;
		if (ISPL(frame->tf_cs) == SEL_UPL) {
			/* JG curproc can be NULL */
			kprintf(
			    "pid %ld (%s): trap %d with interrupts disabled\n",
			    (long)curproc->p_pid, curproc->p_comm, type);
		} else if ((type == T_STKFLT || type == T_PROTFLT ||
			    type == T_SEGNPFLT) &&
			   frame->tf_rip == (long)doreti_iret) {
			/*
			 * iretq fault from kernel mode during return to
			 * userland.
			 *
			 * This situation is expected, don't complain.
			 */
		} else if (type != T_NMI && type != T_BPTFLT &&
			   type != T_TRCTRAP) {
			/*
			 * XXX not quite right, since this may be for a
			 * multiple fault in user mode.
			 */
			kprintf("kernel trap %d (%s @ 0x%016jx) with "
				"interrupts disabled\n",
				type,
				td->td_comm,
				frame->tf_rip);
		}
		cpu_enable_intr();
	}
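
	/*
	 * For reference (typical x86 definitions, not from this file):
	 *
	 *	#define ISPL(s)		((s) & 3)	(requestor privilege level)
	 *	#define SEL_UPL		3		(user ring)
	 *
	 * so the ISPL(frame->tf_cs) == SEL_UPL tests distinguish traps
	 * taken from user mode from traps taken in kernel mode.
	 */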

	type = frame->tf_trapno;
	code = frame->tf_err;

	if (ISPL(frame->tf_cs) == SEL_UPL) {
		/* user trap */

		KTR_LOG(kernentry_trap, p->p_pid, lp->lwp_tid,
			frame->tf_trapno, eva);

		userenter(td, p);

		sticks = (int)td->td_sticks;
		KASSERT(lp->lwp_md.md_regs == frame,
			("Frame mismatch %p %p", lp->lwp_md.md_regs, frame));

		switch (type) {
		case T_PRIVINFLT:	/* privileged instruction fault */
			i = SIGILL;
			ucode = ILL_PRVOPC;
			break;

		case T_BPTFLT:		/* bpt instruction fault */
		case T_TRCTRAP:		/* trace trap */
			frame->tf_rflags &= ~PSL_T;
			i = SIGTRAP;
			ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
			ucode = code;
			i = SIGFPE;
			break;

		case T_ASTFLT:		/* Allow process switch */
			mycpu->gd_cnt.v_soft++;
			if (mycpu->gd_reqflags & RQF_AST_OWEUPC) {
				atomic_clear_int(&mycpu->gd_reqflags,
						 RQF_AST_OWEUPC);
				addupc_task(p, p->p_prof.pr_addr,
					    p->p_prof.pr_ticks);
			}
			goto out;

		case T_PROTFLT:		/* general protection fault */
			i = SIGBUS;
			ucode = BUS_OBJERR;
			break;
		case T_STKFLT:		/* stack fault */
		case T_SEGNPFLT:	/* segment not present fault */
			i = SIGBUS;
			ucode = BUS_ADRERR;
			break;
		case T_TSSFLT:		/* invalid TSS fault */
		case T_DOUBLEFLT:	/* double fault */
		default:
			i = SIGBUS;
			ucode = BUS_OBJERR;
			break;

		case T_PAGEFLT:		/* page fault */
			i = trap_pfault(frame, TRUE);
#ifdef DDB
			if (frame->tf_rip == 0) {
				/* used for kernel debugging only */
				while (freeze_on_seg_fault)
					tsleep(p, 0, "freeze", hz * 20);
			}
#endif
			if (i == -1 || i == 0)
				goto out;
			if (i == SIGSEGV) {
				ucode = SEGV_MAPERR;
			} else {
				i = SIGSEGV;
				ucode = SEGV_ACCERR;
			}
			break;

		case T_DIVIDE:		/* integer divide fault */
			ucode = FPE_INTDIV;
			i = SIGFPE;
			break;

#if NISA > 0
		case T_NMI:
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					kprintf ("NMI ... going to debugger\n");
					kdb_trap(type, 0, frame);
				}
#endif /* DDB */
				goto out2;
			} else if (panic_on_nmi)
				panic("NMI indicates hardware failure");
			break;
#endif /* NISA > 0 */

		case T_OFLOW:		/* integer overflow fault */
			ucode = FPE_INTOVF;
			i = SIGFPE;
			break;

		case T_BOUND:		/* bounds check fault */
			ucode = FPE_FLTSUB;
			i = SIGFPE;
			break;

		case T_DNA:
			/*
			 * Virtual kernel intercept - pass the DNA exception
			 * to the virtual kernel if it asked to handle it.
			 * This occurs when the virtual kernel is holding
			 * onto the FP context for a different emulated
			 * process than the one currently running.
			 *
			 * We must still call npxdna() since we may have
			 * saved FP state that the virtual kernel needs
			 * to hand over to a different emulated process.
			 */
			if (lp->lwp_vkernel && lp->lwp_vkernel->ve &&
			    (td->td_pcb->pcb_flags & FP_VIRTFP)
			) {
				npxdna();
				break;
			}

			/*
			 * The kernel may have switched out the FP unit's
			 * state, causing the user process to take a fault
			 * when it tries to use the FP unit.  Restore the
			 * state here.
			 */
			if (npxdna()) {
				gd->gd_cnt.v_trap++;
				goto out;
			}
			i = SIGFPE;
			ucode = FPE_FPU_NP_TRAP;
			break;

		case T_FPOPFLT:		/* FPU operand fetch fault */
			ucode = ILL_COPROC;
			i = SIGILL;
			break;

		case T_XMMFLT:		/* SIMD floating-point exception */
			ucode = 0; /* XXX */
			i = SIGFPE;
			break;
		}
	} else {
		/* kernel trap */

		switch (type) {
		case T_PAGEFLT:			/* page fault */
			trap_pfault(frame, FALSE);
			goto out2;

		case T_DNA:
			/*
			 * The kernel is apparently using the FPU for copying.
			 * XXX this should be fatal unless the kernel has
			 * registered such use.
			 */
			if (npxdna()) {
				gd->gd_cnt.v_trap++;
				goto out2;
			}
			break;

		case T_STKFLT:		/* stack fault */
		case T_PROTFLT:		/* general protection fault */
		case T_SEGNPFLT:	/* segment not present fault */
			/*
			 * Invalid segment selectors and out of bounds
			 * %rip's and %rsp's can be set up in user mode.
			 * This causes a fault in kernel mode when the
			 * kernel tries to return to user mode.  We want
			 * to get this fault so that we can fix the
			 * problem here and not have to check all the
			 * selectors and pointers when the user changes
			 * them.
			 */
			if (mycpu->gd_intr_nesting_level == 0) {
				/*
				 * NOTE: in 64-bit mode traps push rsp/ss
				 *	 even if no ring change occurs.
				 */
				if (td->td_pcb->pcb_onfault &&
				    td->td_pcb->pcb_onfault_sp ==
				    frame->tf_rsp) {
					frame->tf_rip = (register_t)
						td->td_pcb->pcb_onfault;
					goto out2;
				}
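
				/*
				 * Conceptual sketch (assumption, not code
				 * from this file): user-access helpers such
				 * as copyin()/copyout() arm this recovery
				 * point roughly as follows:
				 *
				 *	pcb->pcb_onfault = handler;
				 *	pcb->pcb_onfault_sp = %rsp;
				 *	... touch user memory ...
				 *	pcb->pcb_onfault = NULL;
				 *
				 * so a fault taken while touching user
				 * memory resumes at the handler instead of
				 * panicking the kernel.
				 */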

				/*
				 * If the iretq in doreti faults during
				 * return to user, it will be special-cased
				 * in IDTVEC(prot) to get here.  We want
				 * to 'return' to doreti_iret_fault in
				 * ipl.s in approximately the same state we
				 * were in at the iretq.
				 */
				if (frame->tf_rip == (long)doreti_iret) {
					frame->tf_rip = (long)doreti_iret_fault;
					goto out2;
				}
			}
			break;

		case T_TSSFLT:
			/*
			 * PSL_NT can be set in user mode and isn't cleared
			 * automatically when the kernel is entered.  This
			 * causes a TSS fault when the kernel attempts to
			 * `iret' because the TSS link is uninitialized.  We
			 * want to get this fault so that we can fix the
			 * problem here and not every time the kernel is
			 * entered.
			 */
			if (frame->tf_rflags & PSL_NT) {
				frame->tf_rflags &= ~PSL_NT;
#if 0
				/* do we need this? */
				if (frame->tf_rip == (long)doreti_iret)
					frame->tf_rip = (long)doreti_iret_fault;
#endif
				goto out2;
			}
			break;

		case T_TRCTRAP:	 /* trace trap */
			/*
			 * Detect historical CPU artifact on syscall or int $3
			 * entry (if not shortcutted in exception.s via
			 * DIRECT_DISALLOW_SS_CPUBUG).
			 */
			gd->gd_cnt.v_trap++;
			if (frame->tf_rip == (register_t)IDTVEC(fast_syscall)) {
				krateprintf(&sscpubugrate,
					"Caught #DB at syscall cpu artifact\n");
				goto out2;
			}
			if (frame->tf_rip == (register_t)IDTVEC(bpt)) {
				krateprintf(&sscpubugrate,
					"Caught #DB at int $N cpu artifact\n");
				goto out2;
			}

			/*
			 * Ignore debug register trace traps due to
			 * accesses in the user's address space, which
			 * can happen under several conditions such as
			 * if a user sets a watchpoint on a buffer and
			 * then passes that buffer to a system call.
			 * We still want to get TRCTRAPS for addresses
			 * in kernel space because that is useful when
			 * debugging the kernel.
			 */
			if (user_dbreg_trap()) {
				/*
				 * Reset breakpoint bits because the
				 * processor doesn't clear them itself.
				 */
				load_dr6(rdr6() & ~0xf);
				goto out2;
			}
			/*
			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
			 */
		case T_BPTFLT:
			/*
			 * If DDB is enabled, let it handle the debugger trap.
			 * Otherwise, debugger traps "can't happen".
			 */
			ucode = TRAP_BRKPT;
#ifdef DDB
			if (kdb_trap(type, 0, frame))
				goto out2;
#endif
			break;

#if NISA > 0
		case T_NMI:
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					kprintf ("NMI ... going to debugger\n");
					kdb_trap(type, 0, frame);
				}
#endif /* DDB */
				goto out2;
			} else if (panic_on_nmi == 0)
				goto out2;
			/* FALL THROUGH */
#endif /* NISA > 0 */
		}
		trap_fatal(frame, 0);
		goto out2;
	}

	/*
	 * Fault from user mode, virtual kernel intercept.
	 *
	 * If the fault is directly related to a VM context managed by a
	 * virtual kernel then let the virtual kernel handle it.
	 */
	if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
		vkernel_trap(lp, frame);
		goto out;
	}

	/* Translate fault for emulators (e.g. Linux) */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	gd->gd_cnt.v_trap++;
	trapsignal(lp, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s",
			trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", frame->tf_addr);
		uprintf("\n");
	}
#endif

out:
	userret(lp, frame, sticks);
	userexit(lp);
out2:	;
	if (p != NULL && lp != NULL)
		KTR_LOG(kernentry_trap_ret, p->p_pid, lp->lwp_tid);
#ifdef INVARIANTS
	KASSERT(crit_count == td->td_critcount,
		("trap: critical section count mismatch! %d/%d",
		crit_count, td->td_critcount));
	KASSERT(curstop == td->td_toks_stop,
		("trap: extra tokens held after trap! %ld/%ld",
		curstop - &td->td_toks_base,
		td->td_toks_stop - &td->td_toks_base));
#endif
}
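
/*
 * Userland illustration (added example, not part of the kernel source):
 * the T_DIVIDE case above results in SIGFPE with si_code FPE_INTDIV being
 * delivered to the faulting process.  A minimal program observing that
 * contract:
 */
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void
fpe_handler(int sig, siginfo_t *si, void *ctx)
{
	(void)ctx;
	/* FPE_INTDIV is the ucode trap() installed for T_DIVIDE */
	printf("caught signal %d, si_code=%d (FPE_INTDIV=%d)\n",
	       sig, si->si_code, FPE_INTDIV);
	_exit(0);
}

int
main(void)
{
	struct sigaction sa;
	volatile int zero = 0;

	sa.sa_sigaction = fpe_handler;
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGFPE, &sa, NULL);

	return (1 / zero);	/* integer divide fault -> T_DIVIDE -> SIGFPE */
}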
/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_PUTPAGES.
 *
 * This is typically called indirectly via the pageout daemon and
 * clustering has typically already occurred, so in general we ask the
 * underlying filesystem to write the data out asynchronously rather
 * than delayed.
 */
int
vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *m, int bytecount,
			     int flags, int *rtvals)
{
	int i;
	int maxsize, ncount, count;
	vm_ooffset_t poffset;
	struct uio auio;
	struct iovec aiov;
	int error;
	int ioflags;

	count = bytecount / PAGE_SIZE;

	for (i = 0; i < count; i++)
		rtvals[i] = VM_PAGER_AGAIN;

	if ((int) m[0]->pindex < 0) {
		kprintf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%x)\n",
			(long)m[0]->pindex, m[0]->dirty);
		rtvals[0] = VM_PAGER_BAD;
		return VM_PAGER_BAD;
	}

	maxsize = count * PAGE_SIZE;
	ncount = count;

	poffset = IDX_TO_OFF(m[0]->pindex);

	/*
	 * If the page-aligned write is larger than the actual file we
	 * have to invalidate pages occurring beyond the file EOF.
	 *
	 * If the file EOF resides in the middle of a page we still clear
	 * all of that page's dirty bits later on.  If we didn't, the page
	 * would endlessly be re-written.
	 *
	 * We do not under any circumstances truncate the valid bits, as
	 * this will screw up bogus page replacement.
	 *
	 * The caller has already read-protected the pages.  The VFS must
	 * use the buffer cache to wrap the pages.  The pages might not
	 * be immediately flushed by the buffer cache but once under its
	 * control the pages themselves can wind up being marked clean
	 * and their covering buffer cache buffer can be marked dirty.
	 */
	if (poffset + maxsize > vp->v_filesize) {
		if (poffset < vp->v_filesize) {
			maxsize = vp->v_filesize - poffset;
			ncount = btoc(maxsize);
		} else {
			maxsize = 0;
			ncount = 0;
		}
		if (ncount < count) {
			for (i = ncount; i < count; i++) {
				rtvals[i] = VM_PAGER_BAD;
			}
		}
	}
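
	/*
	 * Worked example (illustrative numbers): with 4K pages, a 10000
	 * byte file and a two-page write at poffset 8192 (maxsize 8192),
	 * the clamp above yields maxsize = 10000 - 8192 = 1808 and
	 * ncount = btoc(1808) = 1, so the second page's rtvals entry is
	 * marked VM_PAGER_BAD.
	 */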

	/*
	 * pageouts are already clustered, use IO_ASYNC to force a bawrite()
	 * rather than a bdwrite() to prevent paging I/O from saturating
	 * the buffer cache.  Dummy-up the sequential heuristic to cause
	 * large ranges to cluster.  If neither IO_SYNC nor IO_ASYNC is set,
	 * the system decides how to cluster.
	 */
	ioflags = IO_VMIO;
	if (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL))
		ioflags |= IO_SYNC;
	else if ((flags & VM_PAGER_CLUSTER_OK) == 0)
		ioflags |= IO_ASYNC;
	ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0;
	ioflags |= IO_SEQMAX << IO_SEQSHIFT;
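	/*
	 * E.g. with flags == 0 the result is IO_VMIO | IO_ASYNC |
	 * (IO_SEQMAX << IO_SEQSHIFT): an asynchronous write with the
	 * sequential heuristic pinned to its maximum.
	 */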

	aiov.iov_base = (caddr_t) 0;
	aiov.iov_len = maxsize;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = poffset;
	auio.uio_segflg = UIO_NOCOPY;
	auio.uio_rw = UIO_WRITE;
	auio.uio_resid = maxsize;
	auio.uio_td = NULL;
	error = VOP_WRITE(vp, &auio, ioflags, proc0.p_ucred);
	mycpu->gd_cnt.v_vnodeout++;
	mycpu->gd_cnt.v_vnodepgsout += ncount;

	if (error) {
		krateprintf(&vbadrate,
			    "vnode_pager_putpages: I/O error %d\n", error);
	}
	if (auio.uio_resid) {
		krateprintf(&vresrate,
			    "vnode_pager_putpages: residual I/O %zd at %lu\n",
			    auio.uio_resid, (u_long)m[0]->pindex);
	}
	if (error == 0) {
		for (i = 0; i < ncount; i++) {
			rtvals[i] = VM_PAGER_OK;
			vm_page_undirty(m[i]);
		}
	}
	return rtvals[0];
}
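
/*
 * Delegation sketch (hypothetical filesystem; the vop_putpages_args field
 * names are assumptions): a local media FS that does not implement its own
 * VOP_PUTPAGES can forward to the generic routine, e.g.
 *
 *	static int
 *	examplefs_putpages(struct vop_putpages_args *ap)
 *	{
 *		return (vnode_pager_generic_putpages(ap->a_vp, ap->a_m,
 *			    ap->a_count, ap->a_sync, ap->a_rtvals));
 *	}
 */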