/**
 * insn_init() - initialize struct insn
 * @insn:	&struct insn to be initialized
 * @kaddr:	address (in kernel memory) of instruction (or copy thereof)
 * @x86_64:	!0 for 64-bit kernel or 64-bit app
 */
void insn_init(struct insn *insn, const void *kaddr, int x86_64)
{
	memset(insn, 0, sizeof(*insn));
	insn->kaddr = ktla_ktva(kaddr);
	insn->next_byte = ktla_ktva(kaddr);
	insn->x86_64 = x86_64 ? 1 : 0;
	insn->opnd_bytes = 4;
	if (x86_64)
		insn->addr_bytes = 8;
	else
		insn->addr_bytes = 4;
}
unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
			      const char *start, const char *end)
{
	unsigned insn_len = end - start;

	if (insn_len > len || start == NULL)
		insn_len = len;
	else
		memcpy(insnbuf, ktla_ktva(start), insn_len);

	return insn_len;
}
static void run_plant_and_detach_test(int is_early)
{
	char before[BREAK_INSTR_SIZE];
	char after[BREAK_INSTR_SIZE];

	probe_kernel_read(before, ktla_ktva((char *)kgdbts_break_test),
			  BREAK_INSTR_SIZE);
	init_simple_test();
	ts.tst = plant_and_detach_test;
	ts.name = "plant_and_detach_test";
	/* Activate test with initial breakpoint */
	if (!is_early)
		kgdb_breakpoint();
	probe_kernel_read(after, ktla_ktva((char *)kgdbts_break_test),
			  BREAK_INSTR_SIZE);

	if (memcmp(before, after, BREAK_INSTR_SIZE)) {
		printk(KERN_CRIT "kgdbts: ERROR kgdb corrupted memory\n");
		panic("kgdb memory corruption");
	}

	/* complete the detach test */
	if (!is_early)
		kgdbts_break_test();
}
static nokprobe_inline void
__synthesize_relative_insn(void *from, void *to, u8 op)
{
	struct __arch_relative_insn {
		u8 op;
		s32 raddr;
	} __packed *insn;

	insn = (struct __arch_relative_insn *)ktla_ktva(from);

	pax_open_kernel();
	insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
	insn->op = op;
	pax_close_kernel();
}
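For context, a hedged sketch of the thin wrappers the mainline x86 kprobes code builds on top of __synthesize_relative_insn(); there they are named synthesize_reljump()/synthesize_relcall(), with RELATIVEJUMP_OPCODE (0xe9) and RELATIVECALL_OPCODE (0xe8) from <asm/kprobes.h>. The "+ 5" in the displacement calculation above is the length of such a rel32 jmp/call.

/*
 * Sketch of the usual callers: emit a 5-byte rel32 jmp/call at 'from'
 * that targets 'to'.  Opcode macros are the kernel's; the wrappers here
 * mirror the mainline helpers rather than the patch itself.
 */
void synthesize_reljump(void *from, void *to)
{
	__synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);	/* 0xe9 */
}

void synthesize_relcall(void *from, void *to)
{
	__synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);	/* 0xe8 */
}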
/**
 * imr_self_test
 *
 * Verify IMR self_test with some simple tests to verify overlap,
 * zero sized allocations and 1 KiB sized areas.
 *
 */
static void __init imr_self_test(void)
{
	phys_addr_t base = virt_to_phys((void *)ktla_ktva((unsigned long)_text));
	size_t size = virt_to_phys(&__end_rodata) - base;
	const char *fmt_over = "overlapped IMR @ (0x%08lx - 0x%08lx)\n";
	int ret;

	/* Test zero zero. */
	ret = imr_add_range(0, 0, 0, 0, false);
	imr_self_test_result(ret < 0, "zero sized IMR\n");

	/* Test exact overlap. */
	ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false);
	imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));

	/* Test overlap with base inside of existing. */
	base += size - IMR_ALIGN;
	ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false);
	imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));

	/* Test overlap with end inside of existing. */
	base -= size + IMR_ALIGN * 2;
	ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false);
	imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));

	/* Test that a 1 KiB IMR @ zero with read/write all will bomb out. */
	ret = imr_add_range(0, IMR_ALIGN, IMR_READ_ACCESS_ALL,
			    IMR_WRITE_ACCESS_ALL, false);
	imr_self_test_result(ret < 0, "1KiB IMR @ 0x00000000 - access-all\n");

	/* Test that a 1 KiB IMR @ zero with CPU only will work. */
	ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU, false);
	imr_self_test_result(ret >= 0, "1KiB IMR @ 0x00000000 - cpu-access\n");
	if (ret >= 0) {
		ret = imr_remove_range(0, IMR_ALIGN);
		imr_self_test_result(ret == 0, "teardown - cpu-access\n");
	}

	/* Test 2 KiB works. */
	size = IMR_ALIGN * 2;
	ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL,
			    IMR_WRITE_ACCESS_ALL, false);
	imr_self_test_result(ret >= 0, "2KiB IMR @ 0x00000000\n");
	if (ret >= 0) {
		ret = imr_remove_range(0, size);
		imr_self_test_result(ret == 0, "teardown 2KiB\n");
	}
}
static unsigned long
__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
{
	struct kprobe *kp;
	unsigned long faddr;

	kp = get_kprobe((void *)addr);
	faddr = ftrace_location(addr);
	/*
	 * Addresses inside the ftrace location are refused by
	 * arch_check_ftrace_location(). Something went terribly wrong
	 * if such an address is checked here.
	 */
	if (WARN_ON(faddr && faddr != addr))
		return 0UL;
	/*
	 * Use the current code if it is not modified by Kprobe
	 * and it cannot be modified by ftrace.
	 */
	if (!kp && !faddr)
		return addr;

	/*
	 * Basically, kp->ainsn.insn has an original instruction.
	 * However, a RIP-relative instruction cannot be single-stepped
	 * at a different place, so __copy_instruction() tweaks the
	 * displacement of that instruction. In that case, we can't
	 * recover the original instruction from kp->ainsn.insn.
	 *
	 * On the other hand, in the case of a normal kprobe, kp->opcode
	 * has a copy of the first byte of the probed instruction, which
	 * is overwritten by int3. Since the instruction at kp->addr is
	 * not modified by kprobes except for the first byte, we can
	 * recover the original instruction from it and kp->opcode.
	 *
	 * In case of Kprobes using ftrace, we do not have a copy of
	 * the original instruction. In fact, the ftrace location might
	 * be modified at any time and even could be in an inconsistent
	 * state. Fortunately, we know that the original code is the
	 * ideal 5-byte long NOP.
	 */
	memcpy(buf, (void *)ktla_ktva(addr),
	       MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
	if (faddr)
		memcpy(buf, ideal_nops[NOP_ATOMIC5], 5);
	else
		buf[0] = kp->opcode;
	return ktva_ktla((unsigned long)buf);
}
int apply_relocate(Elf32_Shdr *sechdrs,
		   const char *strtab,
		   unsigned int symindex,
		   unsigned int relsec,
		   struct module *me)
{
	unsigned int i;
	Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
	Elf32_Sym *sym;
	uint32_t *plocation, location;

	DEBUGP("Applying relocate section %u to %u\n",
	       relsec, sechdrs[relsec].sh_info);
	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
		/* This is where to make the change */
		plocation = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
			+ rel[i].r_offset;
		location = (uint32_t)plocation;
		if (sechdrs[sechdrs[relsec].sh_info].sh_flags & SHF_EXECINSTR)
			plocation = ktla_ktva((void *)plocation);
		/* This is the symbol it is referring to.  Note that all
		   undefined symbols have been resolved. */
		sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
			+ ELF32_R_SYM(rel[i].r_info);

		switch (ELF32_R_TYPE(rel[i].r_info)) {
		case R_386_32:
			/* We add the value into the location given */
			pax_open_kernel();
			*plocation += sym->st_value;
			pax_close_kernel();
			break;
		case R_386_PC32:
			/* Add the value, subtract its position */
			pax_open_kernel();
			*plocation += sym->st_value - location;
			pax_close_kernel();
			break;
		default:
			printk(KERN_ERR "module %s: Unknown relocation: %u\n",
			       me->name, ELF32_R_TYPE(rel[i].r_info));
			return -ENOEXEC;
		}
	}
	return 0;
}
/**
 * insn_init() - initialize struct insn
 * @insn:	&struct insn to be initialized
 * @kaddr:	address (in kernel memory) of instruction (or copy thereof)
 * @buf_len:	length of the instruction buffer at @kaddr
 * @x86_64:	!0 for 64-bit kernel or 64-bit app
 */
void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64)
{
	/*
	 * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid
	 * even if the input buffer is long enough to hold them.
	 */
	if (buf_len > MAX_INSN_SIZE)
		buf_len = MAX_INSN_SIZE;

	memset(insn, 0, sizeof(*insn));
	insn->kaddr = (void *)ktla_ktva((unsigned long)kaddr);
	insn->end_kaddr = insn->kaddr + buf_len;
	insn->next_byte = insn->kaddr;
	insn->x86_64 = x86_64 ? 1 : 0;
	insn->opnd_bytes = 4;
	if (x86_64)
		insn->addr_bytes = 8;
	else
		insn->addr_bytes = 4;
}
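A minimal usage sketch (not part of the code above): the buffer-bounded insn_init() is normally paired with the decoder's insn_get_length()/insn_complete() helpers; the wrapper name decode_insn_length() here is hypothetical.

/*
 * Hedged sketch: decode one instruction from a bounded local copy.
 * decode_insn_length() is a hypothetical helper; insn_get_length() and
 * insn_complete() are the kernel's x86 instruction-decoder primitives.
 */
static int decode_insn_length(const u8 *buf, int buf_len)
{
	struct insn insn;

	insn_init(&insn, buf, buf_len, IS_ENABLED(CONFIG_X86_64));
	insn_get_length(&insn);		/* parses prefixes/opcode/operands */

	return insn_complete(&insn) ? insn.length : 0;
}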
void __init setup_real_mode(void)
{
	u16 real_mode_seg;
	const u32 *rel;
	u32 count;
	unsigned char *base;
	unsigned long phys_base;
	struct trampoline_header *trampoline_header;
	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
#ifdef CONFIG_X86_64
	u64 *trampoline_pgd;
	u64 efer;
#endif

	base = (unsigned char *)real_mode_header;

	memcpy(base, real_mode_blob, size);

	phys_base = __pa(base);
	real_mode_seg = phys_base >> 4;

	rel = (u32 *) real_mode_relocs;

	/* 16-bit segment relocations. */
	count = *rel++;
	while (count--) {
		u16 *seg = (u16 *) (base + *rel++);
		*seg = real_mode_seg;
	}

	/* 32-bit linear relocations. */
	count = *rel++;
	while (count--) {
		u32 *ptr = (u32 *) (base + *rel++);
		*ptr += phys_base;
	}

	/* Must be performed *after* relocation. */
	trampoline_header = (struct trampoline_header *)
		__va(real_mode_header->trampoline_header);

#ifdef CONFIG_X86_32
	trampoline_header->start = __pa_symbol(ktla_ktva(startup_32_smp));

#ifdef CONFIG_PAX_KERNEXEC
	trampoline_header->start -= LOAD_PHYSICAL_ADDR;
#endif

	trampoline_header->boot_cs = __BOOT_CS;
	trampoline_header->gdt_limit = __BOOT_DS + 7;
	trampoline_header->gdt_base = __pa_symbol(boot_gdt);
#else
	/*
	 * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR
	 * so we need to mask it out.
	 */
	rdmsrl(MSR_EFER, efer);
	trampoline_header->efer = efer & ~EFER_LMA;

	trampoline_header->start = (u64) secondary_startup_64;
	trampoline_cr4_features = &trampoline_header->cr4;
	*trampoline_cr4_features = __read_cr4();

	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
	trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd & ~_PAGE_NX;
	trampoline_pgd[511] = init_level4_pgt[511].pgd;
#endif
}
/* Offset from where switcher.S was compiled to where we've copied it */
static unsigned long switcher_offset(void)
{
	return switcher_addr - (unsigned long)ktla_ktva(start_switcher_text);
}
/*H:020
 * Now the Switcher is mapped and everything else is ready, we need to do
 * some more i386-specific initialization.
 */
void __init lguest_arch_host_init(void)
{
	int i;

	/*
	 * Most of the x86/switcher_32.S doesn't care that it's been moved; on
	 * Intel, jumps are relative, and it doesn't access any references to
	 * external code or data.
	 *
	 * The only exception is the interrupt handlers in switcher.S: their
	 * addresses are placed in a table (default_idt_entries), so we need to
	 * update the table with the new addresses.  switcher_offset() is a
	 * convenience function which returns the distance between the
	 * compiled-in switcher code and the high-mapped copy we just made.
	 */
	for (i = 0; i < IDT_ENTRIES; i++)
		default_idt_entries[i] = ktla_ktva(default_idt_entries[i]) + switcher_offset();

	/*
	 * Set up the Switcher's per-cpu areas.
	 *
	 * Each CPU gets two pages of its own within the high-mapped region
	 * (aka. "struct lguest_pages").  Much of this can be initialized now,
	 * but some depends on what Guest we are running (which is set up in
	 * copy_in_guest_info()).
	 */
	for_each_possible_cpu(i) {
		/* lguest_pages() returns this CPU's two pages. */
		struct lguest_pages *pages = lguest_pages(i);

		/* This is a convenience pointer to make the code neater. */
		struct lguest_ro_state *state = &pages->state;

		/*
		 * The Global Descriptor Table: the Host has a different one
		 * for each CPU.  We keep a descriptor for the GDT which says
		 * where it is and how big it is (the size is actually the last
		 * byte, not the size, hence the "-1").
		 */
		state->host_gdt_desc.size = GDT_SIZE-1;
		state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);

		/*
		 * All CPUs on the Host use the same Interrupt Descriptor
		 * Table, so we just use store_idt(), which gets this CPU's IDT
		 * descriptor.
		 */
		store_idt(&state->host_idt_desc);

		/*
		 * The descriptors for the Guest's GDT and IDT can be filled
		 * out now, too.  We copy the GDT & IDT into ->guest_gdt and
		 * ->guest_idt before actually running the Guest.
		 */
		state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
		state->guest_idt_desc.address = (long)&state->guest_idt;
		state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
		state->guest_gdt_desc.address = (long)&state->guest_gdt;

		/*
		 * We know where we want the stack to be when the Guest enters
		 * the Switcher: in pages->regs.  The stack grows down, so we
		 * start it at the end of that structure.
		 */
		state->guest_tss.sp0 = (long)(&pages->regs + 1);

		/*
		 * And this is the GDT entry to use for the stack: we keep a
		 * couple of special LGUEST entries.
		 */
		state->guest_tss.ss0 = LGUEST_DS;

		/*
		 * x86 can have a fine-grained bitmap which indicates what I/O
		 * ports the process can use.  We set it to the end of our
		 * structure, meaning "none".
		 */
		state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);

		/*
		 * Some GDT entries are the same across all Guests, so we can
		 * set them up now.
		 */
		setup_default_gdt_entries(state);
		/* Most IDT entries are the same for all Guests, too. */
		setup_default_idt_entries(state, default_idt_entries);

		/*
		 * The Host needs to be able to use the LGUEST segments on this
		 * CPU, too, so put them in the Host GDT.
		 */
		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
	}

	/*
	 * In the Switcher, we want the %cs segment register to use the
	 * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
	 * it will be undisturbed when we switch.  To change %cs and jump we
	 * need this structure to feed to Intel's "lcall" instruction.
	 */
	lguest_entry.offset = (long)ktla_ktva(switch_to_guest) + switcher_offset();
	lguest_entry.segment = LGUEST_CS;

	/*
	 * Finally, we need to turn off "Page Global Enable".  PGE is an
	 * optimization where page table entries are specially marked to show
	 * they never change.  The Host kernel marks all the kernel pages this
	 * way because it's always present, even when userspace is running.
	 *
	 * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
	 * switch to the Guest kernel.  If you don't disable this on all CPUs,
	 * you'll get really weird bugs that you'll chase for two days.
	 *
	 * I used to turn PGE off every time we switched to the Guest and back
	 * on when we return, but that slowed the Switcher down noticeably.
	 */

	/*
	 * We don't need the complexity of CPUs coming and going while we're
	 * doing this.
	 */
	get_online_cpus();
	if (cpu_has_pge) { /* We have a broader idea of "global". */
		/* Remember that this was originally set (for cleanup). */
		cpu_had_pge = 1;
		/*
		 * adjust_pge is a helper function which sets or unsets the PGE
		 * bit on its CPU, depending on the argument (0 == unset).
		 */
		on_each_cpu(adjust_pge, (void *)0, 1);
		/* Turn off the feature in the global feature set. */
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
	}
	put_online_cpus();
}
/*H:010
 * We need to set up the Switcher at a high virtual address.  Remember the
 * Switcher is a few hundred bytes of assembler code which actually changes the
 * CPU to run the Guest, and then changes back to the Host when a trap or
 * interrupt happens.
 *
 * The Switcher code must be at the same virtual address in the Guest as the
 * Host since it will be running as the switchover occurs.
 *
 * Trying to map memory at a particular address is an unusual thing to do, so
 * it's not a simple one-liner.
 */
static __init int map_switcher(void)
{
	int i, err;
	struct page **pagep;

	/*
	 * Map the Switcher in to high memory.
	 *
	 * It turns out that if we choose the address 0xFFC00000 (4MB under the
	 * top virtual address), it makes setting up the page tables really
	 * easy.
	 */

	/*
	 * We allocate an array of struct page pointers.  map_vm_area() wants
	 * this, rather than just an array of pages.
	 */
	switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
				GFP_KERNEL);
	if (!switcher_page) {
		err = -ENOMEM;
		goto out;
	}

	/*
	 * Now we actually allocate the pages.  The Guest will see these pages,
	 * so we make sure they're zeroed.
	 */
	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
		switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
		if (!switcher_page[i]) {
			err = -ENOMEM;
			goto free_some_pages;
		}
	}

	/*
	 * First we check that the Switcher won't overlap the fixmap area at
	 * the top of memory.  It's currently nowhere near, but it could have
	 * very strange effects if it ever happened.
	 */
	if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){
		err = -ENOMEM;
		printk("lguest: mapping switcher would thwack fixmap\n");
		goto free_pages;
	}

	/*
	 * Now we reserve the "virtual memory area" we want: 0xFFC00000
	 * (SWITCHER_ADDR).  We might not get it in theory, but in practice
	 * it's worked so far.  The end address needs +1 because __get_vm_area
	 * allocates an extra guard page, so we need space for that.
	 */
#if defined(CONFIG_MODULES) && defined(CONFIG_X86_32) && defined(CONFIG_PAX_KERNEXEC)
	switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
				     VM_ALLOC | VM_KERNEXEC, SWITCHER_ADDR,
				     SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
#else
	switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
				     VM_ALLOC, SWITCHER_ADDR,
				     SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
#endif

	if (!switcher_vma) {
		err = -ENOMEM;
		printk("lguest: could not map switcher pages high\n");
		goto free_pages;
	}

	/*
	 * This code actually sets up the pages we've allocated to appear at
	 * SWITCHER_ADDR.  map_vm_area() takes the vma we allocated above, the
	 * kind of pages we're mapping (kernel pages), and a pointer to our
	 * array of struct pages.  It increments that pointer, but we don't
	 * care.
	 */
	pagep = switcher_page;
	err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
	if (err) {
		printk("lguest: map_vm_area failed: %i\n", err);
		goto free_vma;
	}

	/*
	 * Now the Switcher is mapped at the right address, we can't fail!
	 * Copy in the compiled-in Switcher code (from <arch>_switcher.S).
	 */
	memcpy(switcher_vma->addr, ktla_ktva(start_switcher_text),
	       end_switcher_text - start_switcher_text);

	printk(KERN_INFO "lguest: mapped switcher at %p\n",
	       switcher_vma->addr);
	/* And we succeeded... */
	return 0;

free_vma:
	vunmap(switcher_vma->addr);
free_pages:
	i = TOTAL_SWITCHER_PAGES;
free_some_pages:
	for (--i; i >= 0; i--)
		__free_pages(switcher_page[i], 0);
	kfree(switcher_page);
out:
	return err;
}
/* Offset from where switcher.S was compiled to where we've copied it */
static unsigned long switcher_offset(void)
{
	return SWITCHER_ADDR - (unsigned long)ktla_ktva(start_switcher_text);
}