/*H:130 Now we've examined the hypercall code; our Guest can make requests. * Our Guest is usually so well behaved; it never tries to do things it isn't * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual * infrastructure isn't quite complete, because it doesn't contain replacements * for the Intel I/O instructions. As a result, the Guest sometimes fumbles * across one during the boot process as it probes for various things which are * usually attached to a PC. * * When the Guest uses one of these instructions, we get a trap (General * Protection Fault) and come here. We see if it's one of those troublesome * instructions and skip over it. We return true if we did. */ static int emulate_insn(struct lg_cpu *cpu) { u8 insn; unsigned int insnlen = 0, in = 0, shift = 0; /* The eip contains the *virtual* address of the Guest's instruction: * guest_pa just subtracts the Guest's page_offset. */ unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); /* This must be the Guest kernel trying to do something, not userspace! * The bottom two bits of the CS segment register are the privilege * level. */ if ((cpu->regs->cs & 3) != GUEST_PL) return 0; /* Decoding x86 instructions is icky. */ insn = lgread(cpu, physaddr, u8); /* 0x66 is an "operand prefix". It means it's using the upper 16 bits of the eax register. */ if (insn == 0x66) { shift = 16; /* The instruction is 1 byte so far, read the next byte. */ insnlen = 1; insn = lgread(cpu, physaddr + insnlen, u8); } /* We can ignore the lower bit for the moment and decode the 4 opcodes * we need to emulate. */ switch (insn & 0xFE) { case 0xE4: /* in <next byte>,%al */ insnlen += 2; in = 1; break; case 0xEC: /* in (%dx),%al */ insnlen += 1; in = 1; break; case 0xE6: /* out %al,<next byte> */ insnlen += 2; break; case 0xEE: /* out %al,(%dx) */ insnlen += 1; break; default: /* OK, we don't know what this is, can't emulate. 
*/ return 0; } /* If it was an "IN" instruction, they expect the result to be read * into %eax, so we change %eax. We always return all-ones, which * traditionally means "there's nothing there". */ if (in) { /* Lower bit tells is whether it's a 16 or 32 bit access */ if (insn & 0x1) cpu->regs->eax = 0xFFFFFFFF; else cpu->regs->eax |= (0xFFFF << shift); } /* Finally, we've "done" the instruction, so move past it. */ cpu->regs->eip += insnlen; /* Success! */ return 1; }
/*H:210
 * set_guest_interrupt() is where an interrupt or trap is really handed to
 * the Guest.  Traps and interrupts are delivered the same way, with one
 * difference: some traps carry an "error code" which is pushed onto the stack
 * too, and the caller says so through "has_err".
 *
 * "lo" and "hi" are the two halves of the Interrupt Descriptor Table entry
 * for this interrupt or trap; they are passed to idt_address() and idt_type().
 *
 * The stack is laid out just as the CPU would for a real interrupt, so the
 * Guest sees nothing unusual (and the standard "iret" instruction undoes it).
 */
static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi,
				bool has_err)
{
	unsigned long stack_va, stack_pa, stack_pa_start;
	u32 new_ss, saved_eflags, guest_if;
	/*
	 * Two cases: the Guest was already in its kernel, or (the more
	 * involved one) it was in userspace.  The privilege level in the
	 * bottom bits of the stack segment tells us which.
	 */
	bool from_user = (cpu->regs->ss & 0x3) != GUEST_PL;

	if (from_user) {
		/*
		 * Switch to the kernel stack the Guest gave us via the
		 * SET_STACK hypercall: both its virtual address and segment.
		 */
		stack_va = cpu->esp1;
		new_ss = cpu->ss1;
	} else {
		/* No switch: carry on using the current Guest stack. */
		stack_va = cpu->regs->esp;
		new_ss = cpu->regs->ss;
	}

	/* Remember where we started so we can tell how much was pushed. */
	stack_pa_start = guest_pa(cpu, stack_va);
	stack_pa = stack_pa_start;

	if (from_user) {
		/*
		 * The old stack segment and pointer go onto the new stack
		 * first: on "iret" the CPU sees the drop in privilege level
		 * and expects to find them there.
		 */
		push_guest_stack(cpu, &stack_pa, cpu->regs->ss);
		push_guest_stack(cpu, &stack_pa, cpu->regs->esp);
	}

	/*
	 * The Guest is never allowed to really disable interrupts, so the
	 * "Interrupt Flag" bit in the real eflags is always set.  If we can
	 * read the Guest's "irq_enabled" field and it has that bit clear,
	 * clear it in the eflags word we are about to push as well.
	 */
	saved_eflags = cpu->regs->eflags;
	if (get_user(guest_if, &cpu->lg->lguest_data->irq_enabled) == 0) {
		if (!(guest_if & X86_EFLAGS_IF))
			saved_eflags &= ~X86_EFLAGS_IF;
	}

	/*
	 * Every interrupt pushes three things: the old "eflags" word, the old
	 * code segment and the old instruction pointer, in that order.
	 */
	push_guest_stack(cpu, &stack_pa, saved_eflags);
	push_guest_stack(cpu, &stack_pa, cpu->regs->cs);
	push_guest_stack(cpu, &stack_pa, cpu->regs->eip);

	/* Traps which supply an error code get that pushed last. */
	if (has_err)
		push_guest_stack(cpu, &stack_pa, cpu->regs->errcode);

	/*
	 * All the old state is saved: now point the Guest at the new stack,
	 * code segment and handler address.  The virtual stack pointer moves
	 * by exactly as much as the physical one did.
	 */
	cpu->regs->ss = new_ss;
	cpu->regs->esp = stack_va + (stack_pa - stack_pa_start);
	cpu->regs->cs = (__KERNEL_CS|GUEST_PL);
	cpu->regs->eip = idt_address(lo, hi);

	/*
	 * Type 0xE is an "interrupt gate", whose handler expects interrupts
	 * to be disabled on entry: clear the Guest's irq_enabled field, and
	 * kill the Guest if we cannot write it.
	 */
	if (idt_type(lo, hi) == 0xE &&
	    put_user(0, &cpu->lg->lguest_data->irq_enabled))
		kill_guest(cpu, "Disabling interrupts");
}
/*H:130 * Now we've examined the hypercall code; our Guest can make requests. * Our Guest is usually so well behaved; it never tries to do things it isn't * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual * infrastructure isn't quite complete, because it doesn't contain replacements * for the Intel I/O instructions. As a result, the Guest sometimes fumbles * across one during the boot process as it probes for various things which are * usually attached to a PC. * * When the Guest uses one of these instructions, we get a trap (General * Protection Fault) and come here. We see if it's one of those troublesome * instructions and skip over it. We return true if we did. */ static int emulate_insn(struct lg_cpu *cpu) { u8 insn; unsigned int insnlen = 0, in = 0, small_operand = 0; /* * The eip contains the *virtual* address of the Guest's instruction: * walk the Guest's page tables to find the "physical" address. */ unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); /* * This must be the Guest kernel trying to do something, not userspace! * The bottom two bits of the CS segment register are the privilege * level. */ if ((cpu->regs->cs & 3) != GUEST_PL) return 0; /* Decoding x86 instructions is icky. */ insn = lgread(cpu, physaddr, u8); /* * Around 2.6.33, the kernel started using an emulation for the * cmpxchg8b instruction in early boot on many configurations. This * code isn't paravirtualized, and it tries to disable interrupts. * Ignore it, which will Mostly Work. */ if (insn == 0xfa) { /* "cli", or Clear Interrupt Enable instruction. Skip it. */ cpu->regs->eip++; return 1; } /* * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out. */ if (insn == 0x66) { small_operand = 1; /* The instruction is 1 byte so far, read the next byte. */ insnlen = 1; insn = lgread(cpu, physaddr + insnlen, u8); } /* * We can ignore the lower bit for the moment and decode the 4 opcodes * we need to emulate. 
*/ switch (insn & 0xFE) { case 0xE4: /* in <next byte>,%al */ insnlen += 2; in = 1; break; case 0xEC: /* in (%dx),%al */ insnlen += 1; in = 1; break; case 0xE6: /* out %al,<next byte> */ insnlen += 2; break; case 0xEE: /* out %al,(%dx) */ insnlen += 1; break; default: /* OK, we don't know what this is, can't emulate. */ return 0; } /* * If it was an "IN" instruction, they expect the result to be read * into %eax, so we change %eax. We always return all-ones, which * traditionally means "there's nothing there". */ if (in) { /* Lower bit tells means it's a 32/16 bit access */ if (insn & 0x1) { if (small_operand) cpu->regs->eax |= 0xFFFF; else cpu->regs->eax = 0xFFFFFFFF; } else cpu->regs->eax |= 0xFF; } /* Finally, we've "done" the instruction, so move past it. */ cpu->regs->eip += insnlen; /* Success! */ return 1; }