示例#1
0
/* We walk down the guest page tables to get a guest-physical address */
unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
{
	pgd_t gpgd;
	pte_t gpte;
#ifdef CONFIG_X86_PAE
	pmd_t gpmd;
#endif

	/* Still not set up?  Just map 1:1. */
	if (unlikely(cpu->linear_pages))
		return vaddr;

	/* First step: get the top-level Guest page table entry. */
	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
	/* Toplevel not present?  We can't map it in. */
	if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) {
		kill_guest(cpu, "Bad address %#lx", vaddr);
		return -1UL;
	}

#ifdef CONFIG_X86_PAE
	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
		kill_guest(cpu, "Bad address %#lx", vaddr);
	gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
#else
	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
#endif
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		kill_guest(cpu, "Bad address %#lx", vaddr);

	return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
}
static int emulate_insn(struct lg_cpu *cpu)
{
	u8 insn;
	unsigned int insnlen = 0, in = 0, small_operand = 0;
	unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);

	if ((cpu->regs->cs & 3) != GUEST_PL)
		return 0;

	
	insn = lgread(cpu, physaddr, u8);

	if (insn == 0xfa) {
		
		cpu->regs->eip++;
		return 1;
	}

	if (insn == 0x66) {
		small_operand = 1;
		
		insnlen = 1;
		insn = lgread(cpu, physaddr + insnlen, u8);
	}

	switch (insn & 0xFE) {
	case 0xE4: 
		insnlen += 2;
		in = 1;
		break;
	case 0xEC: 
		insnlen += 1;
		in = 1;
		break;
	case 0xE6: 
		insnlen += 2;
		break;
	case 0xEE: 
		insnlen += 1;
		break;
	default:
		
		return 0;
	}

	if (in) {
		
		if (insn & 0x1) {
			if (small_operand)
				cpu->regs->eax |= 0xFFFF;
			else
				cpu->regs->eax = 0xFFFFFFFF;
		} else
			cpu->regs->eax |= 0xFF;
	}
	
	cpu->regs->eip += insnlen;
	
	return 1;
}
示例#3
0
/* We walk down the guest page tables to get a guest-physical address */
bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr)
{
	pgd_t gpgd;
	pte_t gpte;
#ifdef CONFIG_X86_PAE
	pmd_t gpmd;
#endif

	/* Still not set up?  Just map 1:1. */
	if (unlikely(cpu->linear_pages)) {
		*paddr = vaddr;
		return true;
	}

	/* First step: get the top-level Guest page table entry. */
	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
	/* Toplevel not present?  We can't map it in. */
	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
		goto fail;

#ifdef CONFIG_X86_PAE
	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
		goto fail;
	gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
#else
	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
#endif
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		goto fail;

	*paddr = pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
	return true;

fail:
	*paddr = -1UL;
	return false;
}
示例#4
0
/* This restores the eflags word which was pushed on the stack by a trap */
static void restore_eflags(struct lg_cpu *cpu)
{
	/* This is the physical address of the stack. */
	unsigned long stack_pa = guest_pa(cpu, cpu->regs->esp);

	/*
	 * Stack looks like this:
	 * Address	Contents
	 * esp		EIP
	 * esp + 4	CS
	 * esp + 8	EFLAGS
	 */
	cpu->regs->eflags = lgread(cpu, stack_pa + 8, u32);
	cpu->regs->eflags &=
		~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT);
}
示例#5
0
/*H:330
 * (i) Looking up a page table entry when the Guest faults.
 *
 * We saw this call in run_guest(): when we see a page fault in the Guest, we
 * come here.  That's because we only set up the shadow page tables lazily as
 * they're needed, so we get page faults all the time and quietly fix them up
 * and return to the Guest without it knowing.
 *
 * If we fixed up the fault (ie. we mapped the address), this routine returns
 * true.  Otherwise, it was a real fault and we need to tell the Guest.
 */
bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
{
	pgd_t gpgd;
	pgd_t *spgd;
	unsigned long gpte_ptr;
	pte_t gpte;
	pte_t *spte;

	/* Mid level for PAE. */
#ifdef CONFIG_X86_PAE
	pmd_t *spmd;
	pmd_t gpmd;
#endif

	/* First step: get the top-level Guest page table entry. */
	if (unlikely(cpu->linear_pages)) {
		/* Faking up a linear mapping. */
		gpgd = __pgd(CHECK_GPGD_MASK);
	} else {
		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
		/* Toplevel not present?  We can't map it in. */
		if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
			return false;
	}

	/* Now look at the matching shadow entry. */
	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
		/* No shadow entry: allocate a new shadow PTE page. */
		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
		/*
		 * This is not really the Guest's fault, but killing it is
		 * simple for this corner case.
		 */
		if (!ptepage) {
			kill_guest(cpu, "out of memory allocating pte page");
			return false;
		}
		/* We check that the Guest pgd is OK. */
		check_gpgd(cpu, gpgd);
		/*
		 * And we copy the flags to the shadow PGD entry.  The page
		 * number in the shadow PGD is the page we just allocated.
		 */
		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
	}

#ifdef CONFIG_X86_PAE
	if (unlikely(cpu->linear_pages)) {
		/* Faking up a linear mapping. */
		gpmd = __pmd(_PAGE_TABLE);
	} else {
		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
		/* Middle level not present?  We can't map it in. */
		if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
			return false;
	}

	/* Now look at the matching shadow entry. */
	spmd = spmd_addr(cpu, *spgd, vaddr);

	if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
		/* No shadow entry: allocate a new shadow PTE page. */
		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);

		/*
		 * This is not really the Guest's fault, but killing it is
		 * simple for this corner case.
		 */
		if (!ptepage) {
			kill_guest(cpu, "out of memory allocating pte page");
			return false;
		}

		/* We check that the Guest pmd is OK. */
		check_gpmd(cpu, gpmd);

		/*
		 * And we copy the flags to the shadow PMD entry.  The page
		 * number in the shadow PMD is the page we just allocated.
		 */
		set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
	}

	/*
	 * OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later.
	 */
	gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
#else
	/*
	 * OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later.
	 */
	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
#endif

	if (unlikely(cpu->linear_pages)) {
		/* Linear?  Make up a PTE which points to same page. */
		gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
	} else {
		/* Read the actual PTE value. */
		gpte = lgread(cpu, gpte_ptr, pte_t);
	}

	/* If this page isn't in the Guest page tables, we can't page it in. */
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		return false;

	/*
	 * Check they're not trying to write to a page the Guest wants
	 * read-only (bit 2 of errcode == write).
	 */
	if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
		return false;

	/* User access to a kernel-only page? (bit 3 == user access) */
	if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
		return false;

	/*
	 * Check that the Guest PTE flags are OK, and the page number is below
	 * the pfn_limit (ie. not mapping the Launcher binary).
	 */
	check_gpte(cpu, gpte);

	/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
	gpte = pte_mkyoung(gpte);
	if (errcode & 2)
		gpte = pte_mkdirty(gpte);

	/* Get the pointer to the shadow PTE entry we're going to set. */
	spte = spte_addr(cpu, *spgd, vaddr);

	/*
	 * If there was a valid shadow PTE entry here before, we release it.
	 * This can happen with a write to a previously read-only entry.
	 */
	release_pte(*spte);

	/*
	 * If this is a write, we insist that the Guest page is writable (the
	 * final arg to gpte_to_spte()).
	 */
	if (pte_dirty(gpte))
		*spte = gpte_to_spte(cpu, gpte, 1);
	else
		/*
		 * If this is a read, don't set the "writable" bit in the page
		 * table entry, even if the Guest says it's writable.  That way
		 * we will come back here when a write does actually occur, so
		 * we can update the Guest's _PAGE_DIRTY flag.
		 */
		set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));

	/*
	 * Finally, we write the Guest PTE entry back: we've set the
	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
	 */
	if (likely(!cpu->linear_pages))
		lgwrite(cpu, gpte_ptr, pte_t, gpte);

	/*
	 * The fault is fixed, the page table is populated, the mapping
	 * manipulated, the result returned and the code complete.  A small
	 * delay and a trace of alliteration are the only indications the Guest
	 * has that a page fault occurred at all.
	 */
	return true;
}
示例#6
0
/*H:330
 * (i) Looking up a page table entry when the Guest faults.
 *
 * We saw this call in run_guest(): when we see a page fault in the Guest, we
 * come here.  That's because we only set up the shadow page tables lazily as
 * they're needed, so we get page faults all the time and quietly fix them up
 * and return to the Guest without it knowing.
 *
 * If we fixed up the fault (ie. we mapped the address), this routine returns
 * true.  Otherwise, it was a real fault and we need to tell the Guest.
 *
 * There's a corner case: they're trying to access memory between
 * pfn_limit and device_limit, which is I/O memory.  In this case, we
 * return false and set @iomem to the physical address, so the the
 * Launcher can handle the instruction manually.
 */
bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode,
		 unsigned long *iomem)
{
	unsigned long gpte_ptr;
	pte_t gpte;
	pte_t *spte;
	pmd_t gpmd;
	pgd_t gpgd;

	*iomem = 0;

	/* We never demand page the Switcher, so trying is a mistake. */
	if (vaddr >= switcher_addr)
		return false;

	/* First step: get the top-level Guest page table entry. */
	if (unlikely(cpu->linear_pages)) {
		/* Faking up a linear mapping. */
		gpgd = __pgd(CHECK_GPGD_MASK);
	} else {
		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
		/* Toplevel not present?  We can't map it in. */
		if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
			return false;

		/* 
		 * This kills the Guest if it has weird flags or tries to
		 * refer to a "physical" address outside the bounds.
		 */
		if (!check_gpgd(cpu, gpgd))
			return false;
	}

	/* This "mid-level" entry is only used for non-linear, PAE mode. */
	gpmd = __pmd(_PAGE_TABLE);

#ifdef CONFIG_X86_PAE
	if (likely(!cpu->linear_pages)) {
		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
		/* Middle level not present?  We can't map it in. */
		if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
			return false;

		/* 
		 * This kills the Guest if it has weird flags or tries to
		 * refer to a "physical" address outside the bounds.
		 */
		if (!check_gpmd(cpu, gpmd))
			return false;
	}

	/*
	 * OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later.
	 */
	gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
#else
	/*
	 * OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later.
	 */
	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
#endif

	if (unlikely(cpu->linear_pages)) {
		/* Linear?  Make up a PTE which points to same page. */
		gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
	} else {
		/* Read the actual PTE value. */
		gpte = lgread(cpu, gpte_ptr, pte_t);
	}

	/* If this page isn't in the Guest page tables, we can't page it in. */
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		return false;

	/*
	 * Check they're not trying to write to a page the Guest wants
	 * read-only (bit 2 of errcode == write).
	 */
	if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
		return false;

	/* User access to a kernel-only page? (bit 3 == user access) */
	if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
		return false;

	/* If they're accessing io memory, we expect a fault. */
	if (gpte_in_iomem(cpu, gpte)) {
		*iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
		return false;
	}

	/*
	 * Check that the Guest PTE flags are OK, and the page number is below
	 * the pfn_limit (ie. not mapping the Launcher binary).
	 */
	if (!check_gpte(cpu, gpte))
		return false;

	/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
	gpte = pte_mkyoung(gpte);
	if (errcode & 2)
		gpte = pte_mkdirty(gpte);

	/* Get the pointer to the shadow PTE entry we're going to set. */
	spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
	if (!spte)
		return false;

	/*
	 * If there was a valid shadow PTE entry here before, we release it.
	 * This can happen with a write to a previously read-only entry.
	 */
	release_pte(*spte);

	/*
	 * If this is a write, we insist that the Guest page is writable (the
	 * final arg to gpte_to_spte()).
	 */
	if (pte_dirty(gpte))
		*spte = gpte_to_spte(cpu, gpte, 1);
	else
		/*
		 * If this is a read, don't set the "writable" bit in the page
		 * table entry, even if the Guest says it's writable.  That way
		 * we will come back here when a write does actually occur, so
		 * we can update the Guest's _PAGE_DIRTY flag.
		 */
		set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));

	/*
	 * Finally, we write the Guest PTE entry back: we've set the
	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
	 */
	if (likely(!cpu->linear_pages))
		lgwrite(cpu, gpte_ptr, pte_t, gpte);

	/*
	 * The fault is fixed, the page table is populated, the mapping
	 * manipulated, the result returned and the code complete.  A small
	 * delay and a trace of alliteration are the only indications the Guest
	 * has that a page fault occurred at all.
	 */
	return true;
}
示例#7
0
/*H:130 Now we've examined the hypercall code; our Guest can make requests.
 * Our Guest is usually so well behaved; it never tries to do things it isn't
 * allowed to, and uses hypercalls instead.  Unfortunately, Linux's paravirtual
 * infrastructure isn't quite complete, because it doesn't contain replacements
 * for the Intel I/O instructions.  As a result, the Guest sometimes fumbles
 * across one during the boot process as it probes for various things which are
 * usually attached to a PC.
 *
 * When the Guest uses one of these instructions, we get a trap (General
 * Protection Fault) and come here.  We see if it's one of those troublesome
 * instructions and skip over it.  We return true if we did. */
static int emulate_insn(struct lg_cpu *cpu)
{
	u8 insn;
	unsigned int insnlen = 0, in = 0, shift = 0;
	/* The eip contains the *virtual* address of the Guest's instruction:
	 * guest_pa just subtracts the Guest's page_offset. */
	unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);

	/* This must be the Guest kernel trying to do something, not userspace!
	 * The bottom two bits of the CS segment register are the privilege
	 * level. */
	if ((cpu->regs->cs & 3) != GUEST_PL)
		return 0;

	/* Decoding x86 instructions is icky. */
	insn = lgread(cpu, physaddr, u8);

	/* 0x66 is an "operand prefix".  It means it's using the upper 16 bits
	   of the eax register. */
	if (insn == 0x66) {
		shift = 16;
		/* The instruction is 1 byte so far, read the next byte. */
		insnlen = 1;
		insn = lgread(cpu, physaddr + insnlen, u8);
	}

	/* We can ignore the lower bit for the moment and decode the 4 opcodes
	 * we need to emulate. */
	switch (insn & 0xFE) {
	case 0xE4: /* in     <next byte>,%al */
		insnlen += 2;
		in = 1;
		break;
	case 0xEC: /* in     (%dx),%al */
		insnlen += 1;
		in = 1;
		break;
	case 0xE6: /* out    %al,<next byte> */
		insnlen += 2;
		break;
	case 0xEE: /* out    %al,(%dx) */
		insnlen += 1;
		break;
	default:
		/* OK, we don't know what this is, can't emulate. */
		return 0;
	}

	/* If it was an "IN" instruction, they expect the result to be read
	 * into %eax, so we change %eax.  We always return all-ones, which
	 * traditionally means "there's nothing there". */
	if (in) {
		/* Lower bit tells is whether it's a 16 or 32 bit access */
		if (insn & 0x1)
			cpu->regs->eax = 0xFFFFFFFF;
		else
			cpu->regs->eax |= (0xFFFF << shift);
	}
	/* Finally, we've "done" the instruction, so move past it. */
	cpu->regs->eip += insnlen;
	/* Success! */
	return 1;
}
示例#8
0
/*H:130
 * Now we've examined the hypercall code; our Guest can make requests.
 * Our Guest is usually so well behaved; it never tries to do things it isn't
 * allowed to, and uses hypercalls instead.  Unfortunately, Linux's paravirtual
 * infrastructure isn't quite complete, because it doesn't contain replacements
 * for the Intel I/O instructions.  As a result, the Guest sometimes fumbles
 * across one during the boot process as it probes for various things which are
 * usually attached to a PC.
 *
 * When the Guest uses one of these instructions, we get a trap (General
 * Protection Fault) and come here.  We see if it's one of those troublesome
 * instructions and skip over it.  We return true if we did.
 */
static int emulate_insn(struct lg_cpu *cpu)
{
	u8 insn;
	unsigned int insnlen = 0, in = 0, small_operand = 0;
	/*
	 * The eip contains the *virtual* address of the Guest's instruction:
	 * walk the Guest's page tables to find the "physical" address.
	 */
	unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);

	/*
	 * This must be the Guest kernel trying to do something, not userspace!
	 * The bottom two bits of the CS segment register are the privilege
	 * level.
	 */
	if ((cpu->regs->cs & 3) != GUEST_PL)
		return 0;

	/* Decoding x86 instructions is icky. */
	insn = lgread(cpu, physaddr, u8);

	/*
	 * Around 2.6.33, the kernel started using an emulation for the
	 * cmpxchg8b instruction in early boot on many configurations.  This
	 * code isn't paravirtualized, and it tries to disable interrupts.
	 * Ignore it, which will Mostly Work.
	 */
	if (insn == 0xfa) {
		/* "cli", or Clear Interrupt Enable instruction.  Skip it. */
		cpu->regs->eip++;
		return 1;
	}

	/*
	 * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.
	 */
	if (insn == 0x66) {
		small_operand = 1;
		/* The instruction is 1 byte so far, read the next byte. */
		insnlen = 1;
		insn = lgread(cpu, physaddr + insnlen, u8);
	}

	/*
	 * We can ignore the lower bit for the moment and decode the 4 opcodes
	 * we need to emulate.
	 */
	switch (insn & 0xFE) {
	case 0xE4: /* in     <next byte>,%al */
		insnlen += 2;
		in = 1;
		break;
	case 0xEC: /* in     (%dx),%al */
		insnlen += 1;
		in = 1;
		break;
	case 0xE6: /* out    %al,<next byte> */
		insnlen += 2;
		break;
	case 0xEE: /* out    %al,(%dx) */
		insnlen += 1;
		break;
	default:
		/* OK, we don't know what this is, can't emulate. */
		return 0;
	}

	/*
	 * If it was an "IN" instruction, they expect the result to be read
	 * into %eax, so we change %eax.  We always return all-ones, which
	 * traditionally means "there's nothing there".
	 */
	if (in) {
		/* Lower bit tells means it's a 32/16 bit access */
		if (insn & 0x1) {
			if (small_operand)
				cpu->regs->eax |= 0xFFFF;
			else
				cpu->regs->eax = 0xFFFFFFFF;
		} else
			cpu->regs->eax |= 0xFF;
	}
	/* Finally, we've "done" the instruction, so move past it. */
	cpu->regs->eip += insnlen;
	/* Success! */
	return 1;
}