/* Is this a vmm specific thing? or generic? * * what do we do when we want to kill the vm? what are our other options? */ bool handle_vmexit(struct guest_thread *gth) { struct vm_trapframe *vm_tf = gth_to_vmtf(gth); switch (vm_tf->tf_exit_reason) { case EXIT_REASON_EPT_VIOLATION: return handle_ept_fault(gth); case EXIT_REASON_VMCALL: return handle_vmcall(gth); case EXIT_REASON_IO_INSTRUCTION: return handle_io(gth); case EXIT_REASON_MSR_WRITE: case EXIT_REASON_MSR_READ: return handle_msr(gth); case EXIT_REASON_APIC_ACCESS: return handle_apic_access(gth); case EXIT_REASON_HLT: return handle_halt(gth); case EXIT_REASON_MWAIT_INSTRUCTION: return handle_mwait(gth); case EXIT_REASON_EXTERNAL_INTERRUPT: case EXIT_REASON_APIC_WRITE: /* TODO: just ignore these? */ return TRUE; default: fprintf(stderr, "Don't know how to handle exit %d\n", vm_tf->tf_exit_reason); fprintf(stderr, "RIP %p, shutdown 0x%x\n", vm_tf->tf_rip, vm_tf->tf_exit_reason); return FALSE; } }
/* Services a HLT exit by parking the guest core until an IRQ arrives.
 *
 * It's possible the guest disabled IRQs and halted, perhaps waiting on an
 * NMI or something.  If we need to support that, we can change this. */
static bool handle_halt(struct guest_thread *gth)
{
	struct vm_trapframe *tf = gth_to_vmtf(gth);

	sleep_til_irq(gth);
	/* Advance past the one-byte HLT instruction so we don't re-trap. */
	tf->tf_rip += 1;
	return TRUE;
}
/* Services an MWAIT exit by sleeping until an IRQ arrives.
 *
 * TODO: we need to handle the actual monitor part of mwait.  This just
 * implements the power management / halting.  Likewise, it's possible IRQs
 * are disabled (as with halt). */
static bool handle_mwait(struct guest_thread *gth)
{
	struct vm_trapframe *tf = gth_to_vmtf(gth);

	sleep_til_irq(gth);
	/* Advance past the three-byte MWAIT instruction. */
	tf->tf_rip += 3;
	return TRUE;
}
/* Handles a guest VMCALL used as a byte-at-a-time console: the character to
 * print is passed in RDI.  A '%' is echoed after each newline (a visual
 * marker distinguishing guest output). */
static bool handle_vmcall(struct guest_thread *gth)
{
	struct vm_trapframe *tf = gth_to_vmtf(gth);
	uint8_t ch = tf->tf_rdi;

	printf("%c", ch);
	if (ch == '\n')
		printf("%c", '%');
	/* Advance past the three-byte VMCALL instruction. */
	tf->tf_rip += 3;
	return TRUE;
}
/* Handles an APIC-access exit: decode the faulting instruction, emulate the
 * access, and step RIP past it.
 *
 * Returns TRUE if the access was emulated, FALSE if either decode or the
 * emulation failed (in which case the exit is unhandled).
 *
 * Fix: the calls to decode() and the argument list had '&reg' mis-encoded as
 * the '®' character ("®x", "®p"), which does not compile; restored to
 * &regx / &regp. */
static bool handle_apic_access(struct guest_thread *gth)
{
	uint64_t gpa, *regp;
	uint8_t regx;
	int store, size;
	int advance;
	struct vm_trapframe *vm_tf = gth_to_vmtf(gth);

	/* decode fills in the guest-physical address, the register involved,
	 * whether this is a store, the access size, and the instruction
	 * length (advance). */
	if (decode(gth, &gpa, &regx, &regp, &store, &size, &advance))
		return FALSE;
	if (__apic_access(gth, gpa, regx, regp, store))
		return FALSE;
	vm_tf->tf_rip += advance;
	return TRUE;
}
static bool handle_ept_fault(struct guest_thread *gth) { struct vm_trapframe *vm_tf = gth_to_vmtf(gth); struct virtual_machine *vm = gth_to_vm(gth); uint64_t gpa, *regp; uint8_t regx; int store, size; int advance; if (decode(gth, &gpa, ®x, ®p, &store, &size, &advance)) return FALSE; assert(size >= 0); /* TODO use helpers for some of these addr checks. the fee/fec ones might * be wrong too. */ for (int i = 0; i < VIRTIO_MMIO_MAX_NUM_DEV; i++) { if (vm->virtio_mmio_devices[i] == NULL) continue; if (PG_ADDR(gpa) != vm->virtio_mmio_devices[i]->addr) continue; /* TODO: can the guest cause us to spawn off infinite threads? */ if (store) virtio_mmio_wr(vm, vm->virtio_mmio_devices[i], gpa, size, (uint32_t *)regp); else *regp = virtio_mmio_rd(vm, vm->virtio_mmio_devices[i], gpa, size); vm_tf->tf_rip += advance; return TRUE; } if (PG_ADDR(gpa) == 0xfec00000) { do_ioapic(gth, gpa, regx, regp, store); } else if (PG_ADDR(gpa) == 0) { memmove(regp, &vm->low4k[gpa], size); } else { fprintf(stderr, "EPT violation: can't handle %p\n", gpa); fprintf(stderr, "RIP %p, exit reason 0x%x\n", vm_tf->tf_rip, vm_tf->tf_exit_reason); fprintf(stderr, "Returning 0xffffffff\n"); showstatus(stderr, gth); /* Just fill the whole register for now. */ *regp = (uint64_t) -1; return FALSE; } vm_tf->tf_rip += advance; return TRUE; }
static bool handle_msr(struct guest_thread *gth) { struct vm_trapframe *vm_tf = gth_to_vmtf(gth); /* TODO: consider pushing the gth into msrio */ if (msrio(gth, gth_to_gpci(gth), vm_tf->tf_exit_reason)) { /* Use event injection through vmctl to send a general protection fault * vmctl.interrupt gets written to the VM-Entry Interruption-Information * Field by vmx */ vm_tf->tf_trap_inject = VM_TRAP_VALID | VM_TRAP_ERROR_CODE | VM_TRAP_HARDWARE | HW_TRAP_GP_FAULT; } else { vm_tf->tf_rip += 2; } return TRUE; }
/* Convert a kernel guest virtual address to physical address. * Assumes that the guest VA is in the high negative address space. * TODO: Takes the vm_thread argument so that we can walk the page tables * instead of just coercing the pointer. Therefore, this is not in vmm.h * since it may get complex. */ int gvatogpa(struct guest_thread *vm_thread, uint64_t va, uint64_t *pa) { assert(vm_thread != NULL); struct vm_trapframe *vm_tf = gth_to_vmtf(vm_thread); uint64_t *ptptr = (uint64_t *)vm_tf->tf_cr3; uint64_t entry; for (int shift = PML4_SHIFT; shift >= PML1_SHIFT; shift -= BITS_PER_PML) { entry = ptptr[PMLx(va, shift)]; if (!PAGE_PRESENT(entry)) return -1; if ((entry & PTE_PS) != 0) { uint64_t bitmask = ((1 << shift) - 1); *pa = (((uint64_t)va & bitmask) | (entry & ~bitmask)); return 0; } ptptr = (uint64_t *)PG_ADDR(entry); } *pa = ((uint64_t)va & 0xfff) | (uint64_t)ptptr; return 0; }
/* Entry point: parse options, map guest RAM, load the kernel image, build an
 * identity page table for the guest, and start the first guest thread.
 * Never returns in the normal case (uthread_sleep_forever). */
int main(int argc, char **argv)
{
	int vmmflags = VMM_VMCALL_PRINTF;
	uint64_t entry = 0;	/* guest RIP, from the loaded kernel image */
	int ret;
	struct vm_trapframe *vm_tf;
	int c;
	int option_index;
	static struct option long_options[] = {
		{"debug", no_argument, 0, 'd'},
		{"vmmflags", required_argument, 0, 'v'},
		{"memsize", required_argument, 0, 'm'},
		{"memstart", required_argument, 0, 'M'},
		{"stack", required_argument, 0, 'S'},
		{"cmdline_extra", required_argument, 0, 'c'},
		{"greedy", no_argument, 0, 'g'},
		{"scp", no_argument, 0, 's'},
		{"help", no_argument, 0, 'h'},
		{0, 0, 0, 0}
	};

	fprintf(stderr, "%p %p %p %p\n", PGSIZE, PGSHIFT, PML1_SHIFT,
	        PML1_PTE_REACH);
	/* Guest RAM starts at MinMemory; refuse to run if our own image
	 * already extends into that region. */
	if ((uintptr_t)__procinfo.program_end >= MinMemory) {
		fprintf(stderr,
		        "Panic: vmrunkernel binary extends into guest memory\n");
		exit(1);
	}
	while ((c = getopt_long(argc, argv, "dv:m:M:S:gsh", long_options,
	                        &option_index)) != -1) {
		switch (c) {
		case 'd':
			debug++;
			break;
		case 'v':
			vmmflags = strtoull(optarg, 0, 0);
			break;
		case 'm':
			memsize = strtoull(optarg, 0, 0);
			break;
		case 'M':
			memstart = strtoull(optarg, 0, 0);
			break;
		case 'S':
			stack = strtoull(optarg, 0, 0);
			break;
		case 'g':	/* greedy */
			parlib_never_yield = TRUE;
			break;
		case 's':	/* scp */
			parlib_wants_to_be_mcp = FALSE;
			break;
		case 'h':
		default:
			// Sadly, the getopt_long struct does
			// not have a pointer to help text.
			for (int i = 0;
			     i < sizeof(long_options)/sizeof(long_options[0]) - 1;
			     i++) {
				struct option *l = &long_options[i];

				fprintf(stderr, "%s or %c%s\n", l->name,
				        l->val, l->has_arg ? " <arg>" : "");
			}
			exit(0);
		}
	}
	argc -= optind;
	argv += optind;
	if (argc < 1) {
		fprintf(stderr, "Usage: %s vmimage [-n (no vmcall printf)]\n",
		        argv[0]);
		exit(1);
	}
	/* Guest RAM must fit below the heap break, or our own allocations
	 * would collide with it. */
	if ((uintptr_t)(memstart + memsize) >= (uintptr_t)BRK_START) {
		fprintf(stderr,
		        "memstart 0x%lx memsize 0x%lx -> 0x%lx is too large; overlaps BRK_START at %p\n",
		        memstart, memsize, memstart + memsize, BRK_START);
		exit(1);
	}
	/* NOTE(review): no MAP_PRIVATE/MAP_SHARED here; presumably Akaros's
	 * mmap accepts anonymous mappings without them — confirm. */
	ram = mmap((void *)memstart, memsize,
	           PROT_READ | PROT_WRITE | PROT_EXEC,
	           MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
	if (ram != (void *)memstart) {
		fprintf(stderr, "Could not mmap 0x%lx bytes at 0x%lx\n",
		        memsize, memstart);
		exit(1);
	}
	entry = load_kernel(argv[0]);
	if (entry == 0) {
		fprintf(stderr, "Unable to load kernel %s\n", argv[0]);
		exit(1);
	}
	/* Single guest core for now. */
	vm->nr_gpcs = 1;
	vm->gpcis = &gpci;
	ret = vmm_init(vm, vmmflags);
	if (ret) {
		fprintf(stderr, "vmm_init failed: %r\n");
		exit(1);
	}
	/* Allocate 3 pages for page table pages: a page of 512 GiB
	 * PTEs with only one entry filled to point to a page of 1 GiB
	 * PTEs; a page of 1 GiB PTEs with only one entry filled to
	 * point to a page of 2 MiB PTEs; and a page of 2 MiB PTEs,
	 * all of which may be filled. For now, we don't handle
	 * starting addresses not aligned on 512 GiB boundaries or
	 * sizes > GiB */
	ret = posix_memalign((void **)&p512, PGSIZE, 3 * PGSIZE);
	if (ret) {
		perror("ptp alloc");
		exit(1);
	}
	/* Set up a 1:1 ("identity") page mapping from guest virtual
	 * to guest physical using the (host virtual)
	 * `kerneladdress`. This mapping may be used for only a short
	 * time, until the guest sets up its own page tables. Be aware
	 * that the values stored in the table are physical addresses.
	 * This is subtle and mistakes are easily disguised due to the
	 * identity mapping, so take care when manipulating these
	 * mappings. */
	p1 = &p512[NPTENTRIES];
	p2m = &p512[2 * NPTENTRIES];
	fprintf(stderr, "Map %p for %zu bytes\n", memstart, memsize);
	/* TODO: fix this nested loop so it's correct for more than
	 * one GiB.
	 */
	for (uintptr_t p4 = memstart; p4 < memstart + memsize;
	     p4 += PML4_PTE_REACH) {
		p512[PML4(p4)] = (uint64_t)p1 | PTE_KERN_RW;
		for (uintptr_t p3 = p4; p3 < memstart + memsize;
		     p3 += PML3_PTE_REACH) {
			p1[PML3(p3)] = (uint64_t)p2m | PTE_KERN_RW;
			for (uintptr_t p2 = p3; p2 < memstart + memsize;
			     p2 += PML2_PTE_REACH) {
				/* 2 MiB identity leaf entries. */
				p2m[PML2(p2)] =
				    (uint64_t)(p2) | PTE_KERN_RW | PTE_PS;
			}
		}
	}
	fprintf(stderr, "p512 %p p512[0] is 0x%lx p1 %p p1[0] is 0x%x\n", p512,
	        p512[0], p1, p1[0]);
	/* Hand the guest its initial register state: our page table as CR3,
	 * the kernel entry point as RIP, and the configured stack. */
	vm_tf = gth_to_vmtf(vm->gths[0]);
	vm_tf->tf_cr3 = (uint64_t) p512;
	vm_tf->tf_rip = entry;
	vm_tf->tf_rsp = stack;
	vm_tf->tf_rsi = (uint64_t) 0;
	start_guest_thread(vm->gths[0]);
	uthread_sleep_forever();
	return 0;
}
/* Reports whether the guest has a pending Requesting Virtual Interrupt:
 * RVI occupies the low byte of the guest interrupt status field. */
static bool rvi_is_set(struct guest_thread *gth)
{
	return (gth_to_vmtf(gth)->tf_guest_intr_status & 0xff) != 0;
}
/* Get the RIP as a physical address. */ int rippa(struct guest_thread *vm_thread, uint64_t *pa) { assert(vm_thread != NULL); return gvatogpa(vm_thread, gth_to_vmtf(vm_thread)->tf_rip, pa); }