static void build_page_tables(pgentry_t *top_table, unsigned long *pt_pfn,
                              void *start_va, void *end_va)
{
    //
    // *pt_pfn  - unused pages already mapped by domain builder (512K+);
    //            carve new page tables/directories from *pt_pfn as needed
    // start_va - end_va - range of frames to map
    //
    struct mmu_update mmu_updates[L1_PAGETABLE_ENTRIES];
    int count = 0;

    while (start_va + PAGE_SIZE <= end_va) {
        pgentry_t *tab = top_table;
        unsigned int offset;
        pgentry_t pte;

#if defined(__x86_64__)
        offset = l4_table_offset((unsigned long)start_va);
        pte = tab[offset];
        if ((pte & _PAGE_PRESENT) == 0)
            pte = new_pt_page(pt_pfn, tab, offset, L3_FRAME);
        tab = pte_to_virt(pte);
#endif

        offset = l3_table_offset((unsigned long)start_va);
        pte = tab[offset];
        if ((pte & _PAGE_PRESENT) == 0)
            pte = new_pt_page(pt_pfn, tab, offset, L2_FRAME);
        tab = pte_to_virt(pte);

        offset = l2_table_offset((unsigned long)start_va);
        pte = tab[offset];
        if ((pte & _PAGE_PRESENT) == 0)
            pte = new_pt_page(pt_pfn, tab, offset, L1_FRAME);
        tab = pte_to_virt(pte);

        offset = l1_table_offset((unsigned long)start_va);
        pte = tab[offset];
        if ((pte & _PAGE_PRESENT) == 0) {
            unsigned long pt_mfn = virt_to_mfn(tab);

            mmu_updates[count].ptr =
                ((pgentry_t)pt_mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset;
            mmu_updates[count].val =
                (pgentry_t)virt_to_mfn(start_va) << PAGE_SHIFT | L1_PROT;
            count++;
        }

        start_va += PAGE_SIZE;

        /* Flush a full batch, or whatever remains on the final page. */
        if (count == L1_PAGETABLE_ENTRIES || start_va + PAGE_SIZE > end_va) {
            int rc = HYPERVISOR_mmu_update(mmu_updates, count, 0, DOMID_SELF);

            if (rc < 0)
                fatal_error("build_page_tables: mmu_update failed: %d", rc);
            count = 0;
        }
    }
}
/*
 * Checks if a pagetable frame is needed at 'level' to map a given
 * address. Note, this function is specific to the initial page table
 * building.
 */
static int need_pt_frame(unsigned long va, int level)
{
    unsigned long hyp_virt_start = HYPERVISOR_VIRT_START;
    unsigned long hyp_virt_end = HYPERVISOR_VIRT_END;

    /* In general frames will _not_ be needed if they were already
       allocated to map the hypervisor into our VA space */
    if ( level == L3_FRAME )
    {
        if ( l4_table_offset(va) >= l4_table_offset(hyp_virt_start) &&
             l4_table_offset(va) <= l4_table_offset(hyp_virt_end) )
            return 0;
        return 1;
    }
    else if ( level == L2_FRAME )
    {
        if ( l4_table_offset(va) >= l4_table_offset(hyp_virt_start) &&
             l4_table_offset(va) <= l4_table_offset(hyp_virt_end) )
            if ( l3_table_offset(va) >= l3_table_offset(hyp_virt_start) &&
                 l3_table_offset(va) <= l3_table_offset(hyp_virt_end) )
                return 0;
        return 1;
    }
    else /* Always need l1 frames */
        if ( level == L1_FRAME )
            return 1;

    printk("ERROR: Unknown frame level %d, hypervisor %lx,%lx\n",
           level, hyp_virt_start, hyp_virt_end);
    return -1;
}
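/*
 * A minimal usage sketch (not taken from any particular tree): before
 * building the initial tables, count how many fresh page-table frames a
 * mapping of [start, end) would consume, skipping the levels that the
 * hypervisor's own mappings already provide. The helper name
 * count_pt_frames() and the walk granularity are illustrative assumptions.
 */
static unsigned long count_pt_frames(unsigned long start, unsigned long end)
{
    unsigned long va, frames = 0;

#if defined(__x86_64__)
    /* One L3 table per L4 slot; each L4 slot spans 1UL << L4_PAGETABLE_SHIFT. */
    for ( va = start & ~((1UL << L4_PAGETABLE_SHIFT) - 1); va < end;
          va += 1UL << L4_PAGETABLE_SHIFT )
        frames += need_pt_frame(va, L3_FRAME);
#endif
    /* One L2 table per L3 slot (1 GiB of VA on x86-64). */
    for ( va = start & ~((1UL << L3_PAGETABLE_SHIFT) - 1); va < end;
          va += 1UL << L3_PAGETABLE_SHIFT )
        frames += need_pt_frame(va, L2_FRAME);

    /* One L1 table per L2 slot (2 MiB of VA). */
    for ( va = start & ~((1UL << L2_PAGETABLE_SHIFT) - 1); va < end;
          va += 1UL << L2_PAGETABLE_SHIFT )
        frames += need_pt_frame(va, L1_FRAME);

    return frames;
}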
void page_walk(unsigned long virt_address)
{
    pgentry_t *tab = (pgentry_t *)start_info.pt_base, page;
    unsigned long addr = virt_address;

    printk("Pagetable walk from virt %lx, base %lx:\n",
           virt_address, start_info.pt_base);

    page = tab[l4_table_offset(addr)];
    tab = pte_to_virt(page);
    printk(" L4 = %"PRIpte" (%p)  [offset = %lx]\n",
           page, tab, l4_table_offset(addr));

    page = tab[l3_table_offset(addr)];
    tab = pte_to_virt(page);
    printk("  L3 = %"PRIpte" (%p)  [offset = %lx]\n",
           page, tab, l3_table_offset(addr));

    page = tab[l2_table_offset(addr)];
    tab = pte_to_virt(page);
    printk("   L2 = %"PRIpte" (%p)  [offset = %lx]\n",
           page, tab, l2_table_offset(addr));

    page = tab[l1_table_offset(addr)];
    printk("    L1 = %"PRIpte" [offset = %lx]\n",
           page, l1_table_offset(addr));
}
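/*
 * For reference, the offsets printed above come straight from the x86-64
 * address layout: a 12-bit page offset topped by four 9-bit table indices.
 * The shifts below restate the usual macro definitions purely as a worked
 * example; the real l4/l3/l2/l1_table_offset macros live in the arch headers.
 *
 *   virtual address 0x0000000080201123
 *     L4 index    = (va >> 39) & 0x1ff = 0
 *     L3 index    = (va >> 30) & 0x1ff = 2
 *     L2 index    = (va >> 21) & 0x1ff = 1
 *     L1 index    = (va >> 12) & 0x1ff = 1
 *     page offset = va & 0xfff         = 0x123
 */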
/* Returns: mfn for the given (pv guest) vaddr */
static unsigned long
dbg_pv_va2mfn(dbgva_t vaddr, struct domain *dp, uint64_t pgd3val)
{
    l3_pgentry_t l3e, *l3t;
    l2_pgentry_t l2e, *l2t;
    l1_pgentry_t l1e, *l1t;
    unsigned long cr3 = (pgd3val ? pgd3val : dp->vcpu[0]->arch.cr3);
    unsigned long mfn = cr3 >> PAGE_SHIFT;

    DBGP2("vaddr:%lx domid:%d cr3:%lx pgd3:%lx\n", vaddr, dp->domain_id,
          cr3, pgd3val);

    if ( pgd3val == 0 )
    {
        l3t  = map_domain_page(mfn);
        /* PAE: the PDPT sits at a 32-byte-aligned offset within the cr3 frame */
        l3t += (cr3 & 0xFE0UL) >> 3;
        l3e  = l3t[l3_table_offset(vaddr)];
        mfn  = l3e_get_pfn(l3e);
        unmap_domain_page(l3t);
        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
            return INVALID_MFN;
    }

    /* ... the l2 and l1 lookups follow, mirroring the 64-bit walker below ... */
static int handle_cow(unsigned long addr)
{
    pgentry_t *tab = (pgentry_t *)start_info.pt_base, page;
    unsigned long new_page;
    int rc;

    page = tab[l4_table_offset(addr)];
    if (!(page & _PAGE_PRESENT))
        return 0;
    tab = pte_to_virt(page);

    page = tab[l3_table_offset(addr)];
    if (!(page & _PAGE_PRESENT))
        return 0;
    tab = pte_to_virt(page);

    page = tab[l2_table_offset(addr)];
    if (!(page & _PAGE_PRESENT))
        return 0;
    tab = pte_to_virt(page);

    page = tab[l1_table_offset(addr)];
    if (!(page & _PAGE_PRESENT))
        return 0;

    /* Only support CoW for the zero page. */
    if (PHYS_PFN(page) != mfn_zero)
        return 0;

    new_page = alloc_pages(0);
    memset((void *)new_page, 0, PAGE_SIZE);

    rc = HYPERVISOR_update_va_mapping(addr & PAGE_MASK,
                                      __pte(virt_to_mach(new_page) | L1_PROT),
                                      UVMF_INVLPG);
    if (!rc)
        return 1;

    printk("Map zero page to %lx failed: %d.\n", addr, rc);
    return 0;
}
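/*
 * A hedged sketch of how a PV page-fault path might hand write faults to
 * handle_cow() before treating the fault as fatal. The handler name, its
 * arguments, and the fatal path are illustrative assumptions, not the exact
 * trap code of any particular tree.
 */
static void pv_page_fault(unsigned long fault_addr, unsigned long error_code)
{
    /* Bit 1 of the x86 page-fault error code is set for write accesses. */
    if ((error_code & 0x2) && handle_cow(fault_addr))
        return;                      /* zero page replaced; retry the access */

    printk("Unhandled page fault at %lx (error code %lx)\n",
           fault_addr, error_code);
    page_walk(fault_addr);           /* dump the translation for debugging */
    do_exit();
}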
/*
 * Make pt_pfn a new 'level' page table frame and hook it into the page
 * table at offset in previous level MFN (prev_l_mfn). pt_pfn is a guest
 * PFN.
 */
static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn,
                         unsigned long offset, unsigned long level)
{
    pgentry_t *tab = (pgentry_t *)start_info.pt_base;
    unsigned long pt_page = (unsigned long)pfn_to_virt(*pt_pfn);
    pgentry_t prot_e, prot_t;
    mmu_update_t mmu_updates[1];
    int rc;

    prot_e = prot_t = 0;
    DEBUG("Allocating new L%lu pt frame for pfn=%lx, "
          "prev_l_mfn=%lx, offset=%lx",
          level, *pt_pfn, prev_l_mfn, offset);

    /* We need to clear the page, otherwise we might fail to map it
       as a page table page */
    memset((void *)pt_page, 0, PAGE_SIZE);

    switch ( level )
    {
    case L1_FRAME:
        prot_e = L1_PROT;
        prot_t = L2_PROT;
        break;
    case L2_FRAME:
        prot_e = L2_PROT;
        prot_t = L3_PROT;
        break;
#if defined(__x86_64__)
    case L3_FRAME:
        prot_e = L3_PROT;
        prot_t = L4_PROT;
        break;
#endif
    default:
        printk("new_pt_frame() called with invalid level number %lu\n", level);
        do_exit();
        break;
    }

    /* Make PFN a page table page */
#if defined(__x86_64__)
    tab = pte_to_virt(tab[l4_table_offset(pt_page)]);
#endif
    tab = pte_to_virt(tab[l3_table_offset(pt_page)]);
    mmu_updates[0].ptr = (tab[l2_table_offset(pt_page)] & PAGE_MASK) +
                         sizeof(pgentry_t) * l1_table_offset(pt_page);
    mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT |
                         (prot_e & ~_PAGE_RW);

    if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 )
    {
        printk("ERROR: PTE for new page table page could not be updated\n");
        printk("       mmu_update failed with rc=%d\n", rc);
        do_exit();
    }

    /* Hook the new page table page into the hierarchy */
    mmu_updates[0].ptr = ((pgentry_t)prev_l_mfn << PAGE_SHIFT) +
                         sizeof(pgentry_t) * offset;
    mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT | prot_t;

    if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 )
    {
        printk("ERROR: mmu_update failed with rc=%d\n", rc);
        do_exit();
    }

    *pt_pfn += 1;
}
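/*
 * An illustrative helper (hypothetical, not from Mini-OS) that makes the
 * mmu_update request format used above explicit: .ptr holds the machine
 * address of the page-table slot being rewritten, with its low two bits
 * selecting the command (MMU_NORMAL_PT_UPDATE == 0, so a plain machine
 * address works); .val holds the new entry value.
 */
static inline mmu_update_t make_pte_update(unsigned long table_mfn,
                                           unsigned long slot,
                                           pgentry_t new_entry)
{
    mmu_update_t u;

    u.ptr = ((uint64_t)table_mfn << PAGE_SHIFT) + sizeof(pgentry_t) * slot;
    u.val = new_entry;
    return u;
}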
static mfn_t
p2m_gfn_to_mfn(struct p2m_domain *p2m, unsigned long gfn,
               p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
               unsigned int *page_order)
{
    mfn_t mfn;
    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
    l2_pgentry_t *l2e;
    l1_pgentry_t *l1e;
    unsigned long l1e_flags;
    p2m_type_t l1t;

    ASSERT(paging_mode_translate(p2m->domain));

    /* XXX This is for compatibility with the old model, where anything not
     * XXX marked as RAM was considered to be emulated MMIO space.
     * XXX Once we start explicitly registering MMIO regions in the p2m
     * XXX we will return p2m_invalid for unmapped gfns */
    *t = p2m_mmio_dm;
    /* Not implemented except with EPT */
    *a = p2m_access_rwx;

    if ( gfn > p2m->max_mapped_pfn )
        /* This pfn is higher than the highest the p2m map currently holds */
        return _mfn(INVALID_MFN);

    /* Use the fast path with the linear mapping if we can */
    if ( p2m == p2m_get_hostp2m(current->domain) )
        return p2m_gfn_to_mfn_current(p2m, gfn, t, a, q, page_order);

    mfn = pagetable_get_mfn(p2m_get_pagetable(p2m));

#if CONFIG_PAGING_LEVELS >= 4
    {
        l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn));
        l4e += l4_table_offset(addr);
        if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
        {
            unmap_domain_page(l4e);
            return _mfn(INVALID_MFN);
        }
        mfn = _mfn(l4e_get_pfn(*l4e));
        unmap_domain_page(l4e);
    }
#endif
    {
        l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
#if CONFIG_PAGING_LEVELS == 3
        /* On PAE hosts the p2m has eight l3 entries, not four (see
         * shadow_set_p2m_entry()) so we can't use l3_table_offset.
         * Instead, just count the number of l3es from zero.  It's safe
         * to do this because we already checked that the gfn is within
         * the bounds of the p2m. */
        l3e += (addr >> L3_PAGETABLE_SHIFT);
#else
        l3e += l3_table_offset(addr);
#endif
pod_retry_l3:
        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
        {
            if ( p2m_flags_to_type(l3e_get_flags(*l3e)) == p2m_populate_on_demand )
            {
                if ( q & P2M_ALLOC )
                {
                    if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_1G, q) )
                        goto pod_retry_l3;
                    gdprintk(XENLOG_ERR, "%s: Allocate 1GB failed!\n", __func__);
                }
                else
                    *t = p2m_populate_on_demand;
            }
            unmap_domain_page(l3e);
            return _mfn(INVALID_MFN);
        }
        else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
        {
            mfn = _mfn(l3e_get_pfn(*l3e) +
                       l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
                       l1_table_offset(addr));
            *t = p2m_flags_to_type(l3e_get_flags(*l3e));
            unmap_domain_page(l3e);

            ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
            if ( page_order )
                *page_order = PAGE_ORDER_1G;
            return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
        }

        mfn = _mfn(l3e_get_pfn(*l3e));
        unmap_domain_page(l3e);
    }

    l2e = map_domain_page(mfn_x(mfn));
    l2e += l2_table_offset(addr);

pod_retry_l2:
    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
    {
        /* PoD: Try to populate a 2-meg chunk */
        if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand )
        {
            if ( q & P2M_ALLOC )
            {
                if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_2M, q) )
                    goto pod_retry_l2;
            }
            else
                *t = p2m_populate_on_demand;
        }
        unmap_domain_page(l2e);
        return _mfn(INVALID_MFN);
    }
    else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
    {
        mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
        *t = p2m_flags_to_type(l2e_get_flags(*l2e));
        unmap_domain_page(l2e);

        ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
        if ( page_order )
            *page_order = PAGE_ORDER_2M;
        return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
    }

    mfn = _mfn(l2e_get_pfn(*l2e));
    unmap_domain_page(l2e);

    l1e = map_domain_page(mfn_x(mfn));
    l1e += l1_table_offset(addr);

pod_retry_l1:
    l1e_flags = l1e_get_flags(*l1e);
    l1t = p2m_flags_to_type(l1e_flags);
    if ( ((l1e_flags & _PAGE_PRESENT) == 0) && (!p2m_is_paging(l1t)) )
    {
        /* PoD: Try to populate */
        if ( l1t == p2m_populate_on_demand )
        {
            if ( q & P2M_ALLOC )
            {
                if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_4K, q) )
                    goto pod_retry_l1;
            }
            else
                *t = p2m_populate_on_demand;
        }
        unmap_domain_page(l1e);
        return _mfn(INVALID_MFN);
    }
    mfn = _mfn(l1e_get_pfn(*l1e));
    *t = l1t;
    unmap_domain_page(l1e);

    ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t) || p2m_is_paging(*t));
    if ( page_order )
        *page_order = PAGE_ORDER_4K;
    return (p2m_is_valid(*t) || p2m_is_grant(*t)) ? mfn : _mfn(INVALID_MFN);
}
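/*
 * The superpage arithmetic used by the p2m walkers here, restated as
 * illustrative helpers (the names are hypothetical): within a 1 GiB (L3)
 * mapping the target 4K frame is the base frame plus the L2 index scaled by
 * L1_PAGETABLE_ENTRIES (512 on x86-64) plus the L1 index; within a 2 MiB
 * (L2) mapping it is the base frame plus the L1 index alone.
 */
static inline unsigned long frame_in_1g(unsigned long base_pfn, paddr_t addr)
{
    return base_pfn + l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
           l1_table_offset(addr);
}

static inline unsigned long frame_in_2m(unsigned long base_pfn, paddr_t addr)
{
    return base_pfn + l1_table_offset(addr);
}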
static mfn_t
p2m_pt_get_entry(struct p2m_domain *p2m, unsigned long gfn,
                 p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
                 unsigned int *page_order)
{
    mfn_t mfn;
    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
    l2_pgentry_t *l2e;
    l1_pgentry_t *l1e;
    unsigned long l1e_flags;
    p2m_type_t l1t;

    ASSERT(paging_mode_translate(p2m->domain));

    /* XXX This is for compatibility with the old model, where anything not
     * XXX marked as RAM was considered to be emulated MMIO space.
     * XXX Once we start explicitly registering MMIO regions in the p2m
     * XXX we will return p2m_invalid for unmapped gfns */
    *t = p2m_mmio_dm;
    /* Not implemented except with EPT */
    *a = p2m_access_rwx;

    if ( gfn > p2m->max_mapped_pfn )
        /* This pfn is higher than the highest the p2m map currently holds */
        return _mfn(INVALID_MFN);

    mfn = pagetable_get_mfn(p2m_get_pagetable(p2m));

    {
        l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn));
        l4e += l4_table_offset(addr);
        if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
        {
            unmap_domain_page(l4e);
            return _mfn(INVALID_MFN);
        }
        mfn = _mfn(l4e_get_pfn(*l4e));
        unmap_domain_page(l4e);
    }
    {
        l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
        l3e += l3_table_offset(addr);
pod_retry_l3:
        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
        {
            if ( p2m_flags_to_type(l3e_get_flags(*l3e)) == p2m_populate_on_demand )
            {
                if ( q & P2M_ALLOC )
                {
                    if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_1G, q) )
                        goto pod_retry_l3;
                    gdprintk(XENLOG_ERR, "%s: Allocate 1GB failed!\n", __func__);
                }
                else
                    *t = p2m_populate_on_demand;
            }
            unmap_domain_page(l3e);
            return _mfn(INVALID_MFN);
        }
        else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
        {
            mfn = _mfn(l3e_get_pfn(*l3e) +
                       l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
                       l1_table_offset(addr));
            *t = p2m_flags_to_type(l3e_get_flags(*l3e));
            unmap_domain_page(l3e);

            ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
            if ( page_order )
                *page_order = PAGE_ORDER_1G;
            return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
        }

        mfn = _mfn(l3e_get_pfn(*l3e));
        unmap_domain_page(l3e);
    }

    l2e = map_domain_page(mfn_x(mfn));
    l2e += l2_table_offset(addr);

pod_retry_l2:
    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
    {
        /* PoD: Try to populate a 2-meg chunk */
        if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand )
        {
            if ( q & P2M_ALLOC )
            {
                if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_2M, q) )
                    goto pod_retry_l2;
            }
            else
                *t = p2m_populate_on_demand;
        }
        unmap_domain_page(l2e);
        return _mfn(INVALID_MFN);
    }
    else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
    {
        mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
        *t = p2m_flags_to_type(l2e_get_flags(*l2e));
        unmap_domain_page(l2e);

        ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
        if ( page_order )
            *page_order = PAGE_ORDER_2M;
        return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
    }

    mfn = _mfn(l2e_get_pfn(*l2e));
    unmap_domain_page(l2e);

    l1e = map_domain_page(mfn_x(mfn));
    l1e += l1_table_offset(addr);

pod_retry_l1:
    l1e_flags = l1e_get_flags(*l1e);
    l1t = p2m_flags_to_type(l1e_flags);
    if ( ((l1e_flags & _PAGE_PRESENT) == 0) && (!p2m_is_paging(l1t)) )
    {
        /* PoD: Try to populate */
        if ( l1t == p2m_populate_on_demand )
        {
            if ( q & P2M_ALLOC )
            {
                if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_4K, q) )
                    goto pod_retry_l1;
            }
            else
                *t = p2m_populate_on_demand;
        }
        unmap_domain_page(l1e);
        return _mfn(INVALID_MFN);
    }
    mfn = _mfn(l1e_get_pfn(*l1e));
    *t = l1t;
    unmap_domain_page(l1e);

    ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t) || p2m_is_paging(*t));
    if ( page_order )
        *page_order = PAGE_ORDER_4K;
    return (p2m_is_valid(*t) || p2m_is_grant(*t)) ? mfn : _mfn(INVALID_MFN);
}
/*
 * pgd3val: this is the value of init_mm.pgd[3] in a PV guest. It is optional.
 *          This is to assist debugging of modules in the guest. The kernel
 *          address space seems to always be mapped, but modules are not
 *          necessarily mapped in any arbitrary guest cr3 that we pick if
 *          pgd3val is 0. Modules should always be addressable if we use cr3
 *          from init_mm. Since pgd3val is already a pgd value, cr3->pgd[3],
 *          we just need to do 2 level lookups.
 *
 * NOTE: 4 level paging works for 32 PAE guests also because cpu runs in IA32-e
 *       mode.
 *
 * Returns: mfn for the given (pv guest) vaddr
 */
static mfn_t
dbg_pv_va2mfn(dbgva_t vaddr, struct domain *dp, uint64_t pgd3val)
{
    l4_pgentry_t l4e, *l4t;
    l3_pgentry_t l3e, *l3t;
    l2_pgentry_t l2e, *l2t;
    l1_pgentry_t l1e, *l1t;
    unsigned long cr3 = (pgd3val ? pgd3val : dp->vcpu[0]->arch.cr3);
    mfn_t mfn = _mfn(cr3 >> PAGE_SHIFT);

    DBGP2("vaddr:%lx domid:%d cr3:%lx pgd3:%lx\n", vaddr, dp->domain_id,
          cr3, pgd3val);

    if ( pgd3val == 0 )
    {
        l4t = map_domain_page(mfn);
        l4e = l4t[l4_table_offset(vaddr)];
        unmap_domain_page(l4t);
        mfn = _mfn(l4e_get_pfn(l4e));
        DBGP2("l4t:%p l4to:%lx l4e:%lx mfn:%#"PRI_mfn"\n", l4t,
              l4_table_offset(vaddr), l4e, mfn_x(mfn));
        if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
        {
            DBGP1("l4 PAGE not present. vaddr:%lx cr3:%lx\n", vaddr, cr3);
            return INVALID_MFN;
        }

        l3t = map_domain_page(mfn);
        l3e = l3t[l3_table_offset(vaddr)];
        unmap_domain_page(l3t);
        mfn = _mfn(l3e_get_pfn(l3e));
        DBGP2("l3t:%p l3to:%lx l3e:%lx mfn:%#"PRI_mfn"\n", l3t,
              l3_table_offset(vaddr), l3e, mfn_x(mfn));
        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
             (l3e_get_flags(l3e) & _PAGE_PSE) )
        {
            DBGP1("l3 PAGE not present. vaddr:%lx cr3:%lx\n", vaddr, cr3);
            return INVALID_MFN;
        }
    }

    l2t = map_domain_page(mfn);
    l2e = l2t[l2_table_offset(vaddr)];
    unmap_domain_page(l2t);
    mfn = _mfn(l2e_get_pfn(l2e));
    DBGP2("l2t:%p l2to:%lx l2e:%lx mfn:%#"PRI_mfn"\n", l2t,
          l2_table_offset(vaddr), l2e, mfn_x(mfn));
    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
         (l2e_get_flags(l2e) & _PAGE_PSE) )
    {
        DBGP1("l2 PAGE not present. vaddr:%lx cr3:%lx\n", vaddr, cr3);
        return INVALID_MFN;
    }

    l1t = map_domain_page(mfn);
    l1e = l1t[l1_table_offset(vaddr)];
    unmap_domain_page(l1t);
    mfn = _mfn(l1e_get_pfn(l1e));
    DBGP2("l1t:%p l1to:%lx l1e:%lx mfn:%#"PRI_mfn"\n", l1t,
          l1_table_offset(vaddr), l1e, mfn_x(mfn));

    return mfn_valid(mfn_x(mfn)) ? mfn : INVALID_MFN;
}
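/*
 * A hedged sketch of a caller: read guest memory through dbg_pv_va2mfn(),
 * one page at a time, by mapping each translated MFN and copying from the
 * page offset of the virtual address. This mirrors the shape of the Xen
 * debugger's guest-memory accessor, but the function name and error handling
 * here are illustrative assumptions, not the exact in-tree code.
 */
static int dbg_read_guest(uint8_t *buf, dbgva_t vaddr, unsigned int len,
                          struct domain *dp, uint64_t pgd3val)
{
    while ( len > 0 )
    {
        unsigned int pgoff = vaddr & ~PAGE_MASK;
        unsigned int chunk = PAGE_SIZE - pgoff;
        mfn_t mfn = dbg_pv_va2mfn(vaddr, dp, pgd3val);
        uint8_t *va;

        if ( chunk > len )
            chunk = len;
        if ( mfn_eq(mfn, INVALID_MFN) )
            return -EFAULT;

        va = map_domain_page(mfn);
        memcpy(buf, va + pgoff, chunk);
        unmap_domain_page(va);

        buf   += chunk;
        vaddr += chunk;
        len   -= chunk;
    }

    return 0;
}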