/* We walk down the guest page tables to get a guest-physical address */
unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
{
	pgd_t gpgd;
	pte_t gpte;
#ifdef CONFIG_X86_PAE
	pmd_t gpmd;
#endif

	/* Still not set up?  Just map 1:1. */
	if (unlikely(cpu->linear_pages))
		return vaddr;

	/* First step: get the top-level Guest page table entry. */
	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
	/* Toplevel not present?  We can't map it in. */
	if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) {
		kill_guest(cpu, "Bad address %#lx", vaddr);
		return -1UL;
	}

#ifdef CONFIG_X86_PAE
	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
		kill_guest(cpu, "Bad address %#lx", vaddr);
	gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
#else
	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
#endif
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		kill_guest(cpu, "Bad address %#lx", vaddr);

	return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
}
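/*
 * Illustrative sketch, not lguest code: how a 32-bit, non-PAE virtual
 * address splits into the indices that helpers like gpgd_addr() and
 * gpte_addr() compute.  The 10/10/12 split is the standard two-level
 * x86 layout; the helper names below are made up for the example.
 */
static inline unsigned long example_pgd_index(unsigned long vaddr)
{
	return vaddr >> 22;		/* top 10 bits: entry in the PGD */
}

static inline unsigned long example_pte_index(unsigned long vaddr)
{
	return (vaddr >> 12) & 0x3FF;	/* next 10 bits: entry in the PTE page */
}
/*
 * The low 12 bits (vaddr & ~PAGE_MASK) are the offset within the page,
 * which is why guest_pa() ORs them back into the result at the end.
 */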
/* We walk down the guest page tables to get a guest-physical address */
bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr)
{
	pgd_t gpgd;
	pte_t gpte;
#ifdef CONFIG_X86_PAE
	pmd_t gpmd;
#endif

	/* Still not set up?  Just map 1:1. */
	if (unlikely(cpu->linear_pages)) {
		*paddr = vaddr;
		return true;
	}

	/* First step: get the top-level Guest page table entry. */
	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
	/* Toplevel not present?  We can't map it in. */
	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
		goto fail;

#ifdef CONFIG_X86_PAE
	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
		goto fail;
	gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
#else
	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
#endif
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		goto fail;

	*paddr = pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
	return true;

fail:
	*paddr = -1UL;
	return false;
}
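/*
 * Sketch of a convenience wrapper a caller might layer on top of
 * __guest_pa() (hypothetical, shown only to illustrate the calling
 * convention): translate, or kill the Guest on a bad address, which
 * is exactly what the older guest_pa() above does internally.
 */
static inline unsigned long example_guest_pa(struct lg_cpu *cpu,
					     unsigned long vaddr)
{
	unsigned long paddr;

	if (!__guest_pa(cpu, vaddr, &paddr))
		kill_guest(cpu, "Bad address %#lx", vaddr);
	return paddr;
}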
/* This restores the eflags word which was pushed on the stack by a trap */
static void restore_eflags(struct lg_cpu *cpu)
{
	/* This is the physical address of the stack. */
	unsigned long stack_pa = guest_pa(cpu, cpu->regs->esp);

	/*
	 * Stack looks like this:
	 * Address	Contents
	 *  esp		EIP
	 *  esp + 4	CS
	 *  esp + 8	EFLAGS
	 */
	cpu->regs->eflags = lgread(cpu, stack_pa + 8, u32);
	cpu->regs->eflags &=
		~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT);
}
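/*
 * Illustrative only: the trap frame restore_eflags() walks, laid out
 * as a struct (made-up name).  EIP, CS and EFLAGS are what the CPU
 * pushes for a 32-bit trap that doesn't change privilege level, which
 * is why there are no SS/ESP words here.
 */
struct example_trap_frame {
	u32 eip;	/* at stack_pa + 0 */
	u32 cs;		/* at stack_pa + 4 */
	u32 eflags;	/* at stack_pa + 8 */
};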
/*H:330
 * (i) Looking up a page table entry when the Guest faults.
 *
 * We saw this call in run_guest(): when we see a page fault in the Guest, we
 * come here. That's because we only set up the shadow page tables lazily as
 * they're needed, so we get page faults all the time and quietly fix them up
 * and return to the Guest without it knowing.
 *
 * If we fixed up the fault (ie. we mapped the address), this routine returns
 * true. Otherwise, it was a real fault and we need to tell the Guest.
 */
bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
{
	pgd_t gpgd;
	pgd_t *spgd;
	unsigned long gpte_ptr;
	pte_t gpte;
	pte_t *spte;

	/* Mid level for PAE. */
#ifdef CONFIG_X86_PAE
	pmd_t *spmd;
	pmd_t gpmd;
#endif

	/* First step: get the top-level Guest page table entry. */
	if (unlikely(cpu->linear_pages)) {
		/* Faking up a linear mapping. */
		gpgd = __pgd(CHECK_GPGD_MASK);
	} else {
		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
		/* Toplevel not present?  We can't map it in. */
		if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
			return false;
	}

	/* Now look at the matching shadow entry. */
	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
		/* No shadow entry: allocate a new shadow PTE page. */
		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
		/*
		 * This is not really the Guest's fault, but killing it is
		 * simple for this corner case.
		 */
		if (!ptepage) {
			kill_guest(cpu, "out of memory allocating pte page");
			return false;
		}
		/* We check that the Guest pgd is OK. */
		check_gpgd(cpu, gpgd);
		/*
		 * And we copy the flags to the shadow PGD entry.  The page
		 * number in the shadow PGD is the page we just allocated.
		 */
		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
	}

#ifdef CONFIG_X86_PAE
	if (unlikely(cpu->linear_pages)) {
		/* Faking up a linear mapping. */
		gpmd = __pmd(_PAGE_TABLE);
	} else {
		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
		/* Middle level not present?  We can't map it in. */
		if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
			return false;
	}

	/* Now look at the matching shadow entry. */
	spmd = spmd_addr(cpu, *spgd, vaddr);
	if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
		/* No shadow entry: allocate a new shadow PTE page. */
		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
		/*
		 * This is not really the Guest's fault, but killing it is
		 * simple for this corner case.
		 */
		if (!ptepage) {
			kill_guest(cpu, "out of memory allocating pte page");
			return false;
		}
		/* We check that the Guest pmd is OK. */
		check_gpmd(cpu, gpmd);
		/*
		 * And we copy the flags to the shadow PMD entry.  The page
		 * number in the shadow PMD is the page we just allocated.
		 */
		set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
	}

	/*
	 * OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later.
	 */
	gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
#else
	/*
	 * OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later.
	 */
	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
#endif

	if (unlikely(cpu->linear_pages)) {
		/* Linear?  Make up a PTE which points to same page. */
		gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
	} else {
		/* Read the actual PTE value. */
		gpte = lgread(cpu, gpte_ptr, pte_t);
	}

	/* If this page isn't in the Guest page tables, we can't page it in. */
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		return false;

	/*
	 * Check they're not trying to write to a page the Guest wants
	 * read-only (bit 2 of errcode == write).
	 */
	if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
		return false;

	/* User access to a kernel-only page? (bit 3 == user access) */
	if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
		return false;

	/*
	 * Check that the Guest PTE flags are OK, and the page number is below
	 * the pfn_limit (ie. not mapping the Launcher binary).
	 */
	check_gpte(cpu, gpte);

	/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
	gpte = pte_mkyoung(gpte);
	if (errcode & 2)
		gpte = pte_mkdirty(gpte);

	/* Get the pointer to the shadow PTE entry we're going to set. */
	spte = spte_addr(cpu, *spgd, vaddr);

	/*
	 * If there was a valid shadow PTE entry here before, we release it.
	 * This can happen with a write to a previously read-only entry.
	 */
	release_pte(*spte);

	/*
	 * If this is a write, we insist that the Guest page is writable (the
	 * final arg to gpte_to_spte()).
	 */
	if (pte_dirty(gpte))
		*spte = gpte_to_spte(cpu, gpte, 1);
	else
		/*
		 * If this is a read, don't set the "writable" bit in the page
		 * table entry, even if the Guest says it's writable.  That way
		 * we will come back here when a write does actually occur, so
		 * we can update the Guest's _PAGE_DIRTY flag.
		 */
		set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));

	/*
	 * Finally, we write the Guest PTE entry back: we've set the
	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
	 */
	if (likely(!cpu->linear_pages))
		lgwrite(cpu, gpte_ptr, pte_t, gpte);

	/*
	 * The fault is fixed, the page table is populated, the mapping
	 * manipulated, the result returned and the code complete.  A small
	 * delay and a trace of alliteration are the only indications the Guest
	 * has that a page fault occurred at all.
	 */
	return true;
}
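/*
 * For reference, the x86 page-fault error code bits decoded above
 * (illustrative macro names; these are not lguest definitions):
 */
#define EXAMPLE_PF_PRESENT	0x1	/* fault on a present page (protection) */
#define EXAMPLE_PF_WRITE	0x2	/* the faulting access was a write */
#define EXAMPLE_PF_USER		0x4	/* the access came from user mode */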
/*H:330
 * (i) Looking up a page table entry when the Guest faults.
 *
 * We saw this call in run_guest(): when we see a page fault in the Guest, we
 * come here. That's because we only set up the shadow page tables lazily as
 * they're needed, so we get page faults all the time and quietly fix them up
 * and return to the Guest without it knowing.
 *
 * If we fixed up the fault (ie. we mapped the address), this routine returns
 * true. Otherwise, it was a real fault and we need to tell the Guest.
 *
 * There's a corner case: they're trying to access memory between pfn_limit
 * and device_limit, which is I/O memory.  In this case, we return false and
 * set @iomem to the physical address, so the Launcher can handle the
 * instruction manually.
 */
bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode,
		 unsigned long *iomem)
{
	unsigned long gpte_ptr;
	pte_t gpte;
	pte_t *spte;
	pmd_t gpmd;
	pgd_t gpgd;

	*iomem = 0;

	/* We never demand page the Switcher, so trying is a mistake. */
	if (vaddr >= switcher_addr)
		return false;

	/* First step: get the top-level Guest page table entry. */
	if (unlikely(cpu->linear_pages)) {
		/* Faking up a linear mapping. */
		gpgd = __pgd(CHECK_GPGD_MASK);
	} else {
		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
		/* Toplevel not present?  We can't map it in. */
		if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
			return false;

		/*
		 * This kills the Guest if it has weird flags or tries to
		 * refer to a "physical" address outside the bounds.
		 */
		if (!check_gpgd(cpu, gpgd))
			return false;
	}

	/* This "mid-level" entry is only used for non-linear, PAE mode. */
	gpmd = __pmd(_PAGE_TABLE);

#ifdef CONFIG_X86_PAE
	if (likely(!cpu->linear_pages)) {
		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
		/* Middle level not present?  We can't map it in. */
		if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
			return false;

		/*
		 * This kills the Guest if it has weird flags or tries to
		 * refer to a "physical" address outside the bounds.
		 */
		if (!check_gpmd(cpu, gpmd))
			return false;
	}

	/*
	 * OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later.
	 */
	gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
#else
	/*
	 * OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later.
	 */
	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
#endif

	if (unlikely(cpu->linear_pages)) {
		/* Linear?  Make up a PTE which points to same page. */
		gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
	} else {
		/* Read the actual PTE value. */
		gpte = lgread(cpu, gpte_ptr, pte_t);
	}

	/* If this page isn't in the Guest page tables, we can't page it in. */
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		return false;

	/*
	 * Check they're not trying to write to a page the Guest wants
	 * read-only (bit 2 of errcode == write).
	 */
	if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
		return false;

	/* User access to a kernel-only page? (bit 3 == user access) */
	if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
		return false;

	/* If they're accessing io memory, we expect a fault. */
	if (gpte_in_iomem(cpu, gpte)) {
		*iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
		return false;
	}

	/*
	 * Check that the Guest PTE flags are OK, and the page number is below
	 * the pfn_limit (ie. not mapping the Launcher binary).
	 */
	if (!check_gpte(cpu, gpte))
		return false;

	/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
	gpte = pte_mkyoung(gpte);
	if (errcode & 2)
		gpte = pte_mkdirty(gpte);

	/* Get the pointer to the shadow PTE entry we're going to set. */
	spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
	if (!spte)
		return false;

	/*
	 * If there was a valid shadow PTE entry here before, we release it.
	 * This can happen with a write to a previously read-only entry.
	 */
	release_pte(*spte);

	/*
	 * If this is a write, we insist that the Guest page is writable (the
	 * final arg to gpte_to_spte()).
	 */
	if (pte_dirty(gpte))
		*spte = gpte_to_spte(cpu, gpte, 1);
	else
		/*
		 * If this is a read, don't set the "writable" bit in the page
		 * table entry, even if the Guest says it's writable.  That way
		 * we will come back here when a write does actually occur, so
		 * we can update the Guest's _PAGE_DIRTY flag.
		 */
		set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));

	/*
	 * Finally, we write the Guest PTE entry back: we've set the
	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
	 */
	if (likely(!cpu->linear_pages))
		lgwrite(cpu, gpte_ptr, pte_t, gpte);

	/*
	 * The fault is fixed, the page table is populated, the mapping
	 * manipulated, the result returned and the code complete.  A small
	 * delay and a trace of alliteration are the only indications the Guest
	 * has that a page fault occurred at all.
	 */
	return true;
}
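/*
 * Sketch of a caller consuming the new @iomem out-parameter
 * (hypothetical; run_guest()'s real page-fault path lives elsewhere):
 * a failed demand_page() with a non-zero iomem address is an MMIO
 * access for the Launcher, not a fault to reflect into the Guest.
 */
static void example_handle_pagefault(struct lg_cpu *cpu,
				     unsigned long vaddr, int errcode)
{
	unsigned long iomem;

	if (demand_page(cpu, vaddr, errcode, &iomem))
		return;		/* Shadow mapping fixed up; resume the Guest. */

	if (iomem) {
		/* I/O memory: hand the physical address to the Launcher. */
	} else {
		/* Genuine fault: deliver it to the Guest's handler. */
	}
}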
/*H:130 Now we've examined the hypercall code; our Guest can make requests.
 * Our Guest is usually so well behaved; it never tries to do things it isn't
 * allowed to, and uses hypercalls instead.  Unfortunately, Linux's paravirtual
 * infrastructure isn't quite complete, because it doesn't contain replacements
 * for the Intel I/O instructions.  As a result, the Guest sometimes fumbles
 * across one during the boot process as it probes for various things which are
 * usually attached to a PC.
 *
 * When the Guest uses one of these instructions, we get a trap (General
 * Protection Fault) and come here.  We see if it's one of those troublesome
 * instructions and skip over it.  We return true if we did.
 */
static int emulate_insn(struct lg_cpu *cpu)
{
	u8 insn;
	unsigned int insnlen = 0, in = 0, shift = 0;
	/*
	 * The eip contains the *virtual* address of the Guest's instruction:
	 * guest_pa just subtracts the Guest's page_offset.
	 */
	unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);

	/*
	 * This must be the Guest kernel trying to do something, not userspace!
	 * The bottom two bits of the CS segment register are the privilege
	 * level.
	 */
	if ((cpu->regs->cs & 3) != GUEST_PL)
		return 0;

	/* Decoding x86 instructions is icky. */
	insn = lgread(cpu, physaddr, u8);

	/*
	 * 0x66 is an "operand prefix".  It means it's using the upper 16 bits
	 * of the eax register.
	 */
	if (insn == 0x66) {
		shift = 16;
		/* The instruction is 1 byte so far, read the next byte. */
		insnlen = 1;
		insn = lgread(cpu, physaddr + insnlen, u8);
	}

	/*
	 * We can ignore the lower bit for the moment and decode the 4 opcodes
	 * we need to emulate.
	 */
	switch (insn & 0xFE) {
	case 0xE4: /* in     <next byte>,%al */
		insnlen += 2;
		in = 1;
		break;
	case 0xEC: /* in     (%dx),%al */
		insnlen += 1;
		in = 1;
		break;
	case 0xE6: /* out    %al,<next byte> */
		insnlen += 2;
		break;
	case 0xEE: /* out    %al,(%dx) */
		insnlen += 1;
		break;
	default:
		/* OK, we don't know what this is, can't emulate. */
		return 0;
	}

	/*
	 * If it was an "IN" instruction, they expect the result to be read
	 * into %eax, so we change %eax.  We always return all-ones, which
	 * traditionally means "there's nothing there".
	 */
	if (in) {
		/* Lower bit tells us whether it's a 16 or 32 bit access */
		if (insn & 0x1)
			cpu->regs->eax = 0xFFFFFFFF;
		else
			cpu->regs->eax |= (0xFFFF << shift);
	}

	/* Finally, we've "done" the instruction, so move past it. */
	cpu->regs->eip += insnlen;

	/* Success! */
	return 1;
}
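/*
 * Worked example for the version above (nothing to run, just values):
 * with no 0x66 prefix, shift is 0, so a byte-wide "in (%dx),%al"
 * (opcode 0xEC, low bit clear) does eax |= 0xFFFF << 0 and fills the
 * low 16 bits rather than just %al.  The variant that follows tracks
 * the operand size more precisely.
 */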
/*H:130
 * Now we've examined the hypercall code; our Guest can make requests.
 * Our Guest is usually so well behaved; it never tries to do things it isn't
 * allowed to, and uses hypercalls instead.  Unfortunately, Linux's paravirtual
 * infrastructure isn't quite complete, because it doesn't contain replacements
 * for the Intel I/O instructions.  As a result, the Guest sometimes fumbles
 * across one during the boot process as it probes for various things which are
 * usually attached to a PC.
 *
 * When the Guest uses one of these instructions, we get a trap (General
 * Protection Fault) and come here.  We see if it's one of those troublesome
 * instructions and skip over it.  We return true if we did.
 */
static int emulate_insn(struct lg_cpu *cpu)
{
	u8 insn;
	unsigned int insnlen = 0, in = 0, small_operand = 0;
	/*
	 * The eip contains the *virtual* address of the Guest's instruction:
	 * walk the Guest's page tables to find the "physical" address.
	 */
	unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);

	/*
	 * This must be the Guest kernel trying to do something, not userspace!
	 * The bottom two bits of the CS segment register are the privilege
	 * level.
	 */
	if ((cpu->regs->cs & 3) != GUEST_PL)
		return 0;

	/* Decoding x86 instructions is icky. */
	insn = lgread(cpu, physaddr, u8);

	/*
	 * Around 2.6.33, the kernel started using an emulation for the
	 * cmpxchg8b instruction in early boot on many configurations.  This
	 * code isn't paravirtualized, and it tries to disable interrupts.
	 * Ignore it, which will Mostly Work.
	 */
	if (insn == 0xfa) {
		/* "cli", or Clear Interrupt Enable instruction.  Skip it. */
		cpu->regs->eip++;
		return 1;
	}

	/*
	 * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.
	 */
	if (insn == 0x66) {
		small_operand = 1;
		/* The instruction is 1 byte so far, read the next byte. */
		insnlen = 1;
		insn = lgread(cpu, physaddr + insnlen, u8);
	}

	/*
	 * We can ignore the lower bit for the moment and decode the 4 opcodes
	 * we need to emulate.
	 */
	switch (insn & 0xFE) {
	case 0xE4: /* in     <next byte>,%al */
		insnlen += 2;
		in = 1;
		break;
	case 0xEC: /* in     (%dx),%al */
		insnlen += 1;
		in = 1;
		break;
	case 0xE6: /* out    %al,<next byte> */
		insnlen += 2;
		break;
	case 0xEE: /* out    %al,(%dx) */
		insnlen += 1;
		break;
	default:
		/* OK, we don't know what this is, can't emulate. */
		return 0;
	}

	/*
	 * If it was an "IN" instruction, they expect the result to be read
	 * into %eax, so we change %eax.  We always return all-ones, which
	 * traditionally means "there's nothing there".
	 */
	if (in) {
		/* Lower bit tells us whether it's a 32 or 16 bit access */
		if (insn & 0x1) {
			if (small_operand)
				cpu->regs->eax |= 0xFFFF;
			else
				cpu->regs->eax = 0xFFFFFFFF;
		} else
			cpu->regs->eax |= 0xFF;
	}

	/* Finally, we've "done" the instruction, so move past it. */
	cpu->regs->eip += insnlen;

	/* Success! */
	return 1;
}
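/*
 * Worked examples of the decoding above (byte values only, nothing to
 * run): what each handled encoding looks like, how long it is, and
 * what the emulation leaves in %eax for the IN forms.
 *
 *   E4 71   in $0x71,%al    ->  eax |= 0xFF        (insnlen 2)
 *   EC      in (%dx),%al    ->  eax |= 0xFF        (insnlen 1)
 *   ED      in (%dx),%eax   ->  eax  = 0xFFFFFFFF  (insnlen 1)
 *   66 ED   in (%dx),%ax    ->  eax |= 0xFFFF      (insnlen 2)
 *   E6 80   out %al,$0x80   ->  eax untouched      (insnlen 2)
 */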