/*
 * Currently all CPUs are rendezvoused at the MCE softirq handler, so there
 * is no need to consider the paging p2m type.
 * Currently only HVM guests using EPT paging mode are supported.
 * XXX the following situations are not yet handled:
 *     PoD, foreign mapped, granted, shared
 */
int unmmap_broken_page(struct domain *d, mfn_t mfn, unsigned long gfn)
{
    mfn_t r_mfn;
    p2m_type_t pt;
    int rc;

    /* Always trust dom0's MCE handler will prevent future access */
    if ( d == dom0 )
        return 0;

    if ( !mfn_valid(mfn_x(mfn)) )
        return -EINVAL;

    if ( !has_hvm_container_domain(d) || !paging_mode_hap(d) )
        return -ENOSYS;

    rc = -1;
    r_mfn = get_gfn_query(d, gfn, &pt);
    if ( p2m_to_mask(pt) & P2M_UNMAP_TYPES )
    {
        ASSERT(mfn_x(r_mfn) == mfn_x(mfn));
        p2m_change_type(d, gfn, pt, p2m_ram_broken);
        rc = 0;
    }
    put_gfn(d, gfn);

    return rc;
}
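/*
 * For reference: P2M_UNMAP_TYPES used above is the mask of p2m types that
 * may safely be flipped to p2m_ram_broken. A plausible definition (an
 * illustrative assumption here, not taken from this file) covers the plain
 * RAM types, composed with p2m_to_mask():
 */
#define P2M_UNMAP_TYPES (p2m_to_mask(p2m_ram_rw) | \
                         p2m_to_mask(p2m_ram_logdirty) | \
                         p2m_to_mask(p2m_ram_ro))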
static int __init pvh_setup_p2m(struct domain *d)
{
    struct vcpu *v = d->vcpu[0];
    unsigned long nr_pages;
    unsigned int i;
    int rc;
    bool preempted;
#define MB1_PAGES PFN_DOWN(MB(1))

    nr_pages = dom0_compute_nr_pages(d, NULL, 0);

    pvh_setup_e820(d, nr_pages);
    do {
        preempted = false;
        paging_set_allocation(d, dom0_paging_pages(d, nr_pages),
                              &preempted);
        process_pending_softirqs();
    } while ( preempted );

    /*
     * Memory below 1MB is identity mapped.
     * NB: this only makes sense when booted from legacy BIOS.
     */
    rc = modify_identity_mmio(d, 0, MB1_PAGES, true);
    if ( rc )
    {
        printk("Failed to identity map low 1MB: %d\n", rc);
        return rc;
    }

    /* Populate memory map. */
    for ( i = 0; i < d->arch.nr_e820; i++ )
    {
        unsigned long addr, size;

        if ( d->arch.e820[i].type != E820_RAM )
            continue;

        addr = PFN_DOWN(d->arch.e820[i].addr);
        size = PFN_DOWN(d->arch.e820[i].size);

        if ( addr >= MB1_PAGES )
            rc = pvh_populate_memory_range(d, addr, size);
        else
        {
            ASSERT(addr + size < MB1_PAGES);
            pvh_steal_low_ram(d, addr, size);
        }

        if ( rc )
            return rc;
    }

    if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) )
    {
        /*
         * Since Dom0 cannot be migrated, we will only setup the
         * unrestricted guest helpers if they are needed by the current
         * hardware we are running on.
         */
        rc = pvh_setup_vmx_realmode_helpers(d);
        if ( rc )
            return rc;
    }

    return 0;
#undef MB1_PAGES
}
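/*
 * Illustrative call site (a sketch; the builder name and surrounding steps
 * are assumptions, not part of this file): pvh_setup_p2m() is an early step
 * of PVH Dom0 construction, and any failure is fatal to boot.
 */
static int __init example_construct_dom0_pvh(struct domain *d)
{
    int rc = pvh_setup_p2m(d);

    if ( rc )
    {
        printk("Failed to setup Dom0 physical memory map: %d\n", rc);
        return rc;
    }

    /* ... load the kernel image, set up vCPU context, etc. ... */
    return 0;
}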
void arch_get_domain_info(const struct domain *d,
                          struct xen_domctl_getdomaininfo *info)
{
    if ( paging_mode_hap(d) )
        info->flags |= XEN_DOMINF_hap;
}
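/*
 * Illustrative consumer (a sketch; the helper below is hypothetical): once
 * XEN_DOMCTL_getdomaininfo has filled the struct, the flag set above tells
 * the caller whether the domain runs on hardware-assisted paging.
 */
static bool domain_uses_hap(const struct xen_domctl_getdomaininfo *info)
{
    return info->flags & XEN_DOMINF_hap;
}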
/* This function can directly access fields which are covered by clean bits. */
static int construct_vmcb(struct vcpu *v)
{
    struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
    struct vmcb_struct *vmcb = arch_svm->vmcb;

    vmcb->_general1_intercepts =
        GENERAL1_INTERCEPT_INTR        | GENERAL1_INTERCEPT_NMI         |
        GENERAL1_INTERCEPT_SMI         | GENERAL1_INTERCEPT_INIT        |
        GENERAL1_INTERCEPT_CPUID       | GENERAL1_INTERCEPT_INVD        |
        GENERAL1_INTERCEPT_HLT         | GENERAL1_INTERCEPT_INVLPG      |
        GENERAL1_INTERCEPT_INVLPGA     | GENERAL1_INTERCEPT_IOIO_PROT   |
        GENERAL1_INTERCEPT_MSR_PROT    | GENERAL1_INTERCEPT_SHUTDOWN_EVT|
        GENERAL1_INTERCEPT_TASK_SWITCH;
    vmcb->_general2_intercepts =
        GENERAL2_INTERCEPT_VMRUN   | GENERAL2_INTERCEPT_VMMCALL |
        GENERAL2_INTERCEPT_VMLOAD  | GENERAL2_INTERCEPT_VMSAVE  |
        GENERAL2_INTERCEPT_STGI    | GENERAL2_INTERCEPT_CLGI    |
        GENERAL2_INTERCEPT_SKINIT  | GENERAL2_INTERCEPT_MWAIT   |
        GENERAL2_INTERCEPT_WBINVD  | GENERAL2_INTERCEPT_MONITOR |
        GENERAL2_INTERCEPT_XSETBV;

    /* Intercept all debug-register writes. */
    vmcb->_dr_intercepts = ~0u;

    /* Intercept all control-register accesses except for CR2 and CR8. */
    vmcb->_cr_intercepts = ~(CR_INTERCEPT_CR2_READ | CR_INTERCEPT_CR2_WRITE |
                             CR_INTERCEPT_CR8_READ | CR_INTERCEPT_CR8_WRITE);

    /* I/O and MSR permission bitmaps. */
    arch_svm->msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0);
    if ( arch_svm->msrpm == NULL )
        return -ENOMEM;
    memset(arch_svm->msrpm, 0xff, MSRPM_SIZE);

    svm_disable_intercept_for_msr(v, MSR_FS_BASE);
    svm_disable_intercept_for_msr(v, MSR_GS_BASE);
    svm_disable_intercept_for_msr(v, MSR_SHADOW_GS_BASE);
    svm_disable_intercept_for_msr(v, MSR_CSTAR);
    svm_disable_intercept_for_msr(v, MSR_LSTAR);
    svm_disable_intercept_for_msr(v, MSR_STAR);
    svm_disable_intercept_for_msr(v, MSR_SYSCALL_MASK);

    /*
     * The LWP_CBADDR MSR is saved and restored by FPU code, so SVM doesn't
     * need to intercept it.
     */
    if ( cpu_has_lwp )
        svm_disable_intercept_for_msr(v, MSR_AMD64_LWP_CBADDR);

    vmcb->_msrpm_base_pa = (u64)virt_to_maddr(arch_svm->msrpm);
    vmcb->_iopm_base_pa = (u64)virt_to_maddr(hvm_io_bitmap);

    /* Virtualise EFLAGS.IF and LAPIC TPR (CR8). */
    vmcb->_vintr.fields.intr_masking = 1;

    /* Initialise event injection to no-op. */
    vmcb->eventinj.bytes = 0;

    /* TSC. */
    vmcb->_tsc_offset = 0;

    /* Don't need to intercept RDTSC if CPU supports TSC rate scaling. */
    if ( v->domain->arch.vtsc && !cpu_has_tsc_ratio )
    {
        vmcb->_general1_intercepts |= GENERAL1_INTERCEPT_RDTSC;
        vmcb->_general2_intercepts |= GENERAL2_INTERCEPT_RDTSCP;
    }

    /* Guest EFER. */
    v->arch.hvm_vcpu.guest_efer = 0;
    hvm_update_guest_efer(v);

    /* Guest segment limits. */
    vmcb->cs.limit = ~0u;
    vmcb->es.limit = ~0u;
    vmcb->ss.limit = ~0u;
    vmcb->ds.limit = ~0u;
    vmcb->fs.limit = ~0u;
    vmcb->gs.limit = ~0u;

    /* Guest segment bases. */
    vmcb->cs.base = 0;
    vmcb->es.base = 0;
    vmcb->ss.base = 0;
    vmcb->ds.base = 0;
    vmcb->fs.base = 0;
    vmcb->gs.base = 0;

    /* Guest segment AR bytes. */
    vmcb->es.attr.bytes = 0xc93; /* read/write, accessed */
    vmcb->ss.attr.bytes = 0xc93;
    vmcb->ds.attr.bytes = 0xc93;
    vmcb->fs.attr.bytes = 0xc93;
    vmcb->gs.attr.bytes = 0xc93;
    vmcb->cs.attr.bytes = 0xc9b; /* exec/read, accessed */

    /* Guest IDT. */
    vmcb->idtr.base = 0;
    vmcb->idtr.limit = 0;

    /* Guest GDT. */
    vmcb->gdtr.base = 0;
    vmcb->gdtr.limit = 0;

    /* Guest LDT. */
    vmcb->ldtr.sel = 0;
    vmcb->ldtr.base = 0;
    vmcb->ldtr.limit = 0;
    vmcb->ldtr.attr.bytes = 0;

    /* Guest TSS. */
    vmcb->tr.attr.bytes = 0x08b; /* 32-bit TSS (busy) */
    vmcb->tr.base = 0;
    vmcb->tr.limit = 0xff;

    v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET;
    hvm_update_guest_cr(v, 0);

    v->arch.hvm_vcpu.guest_cr[4] = 0;
    hvm_update_guest_cr(v, 4);

    paging_update_paging_modes(v);

    vmcb->_exception_intercepts =
        HVM_TRAP_MASK | (1U << TRAP_no_device);

    if ( paging_mode_hap(v->domain) )
    {
        vmcb->_np_enable = 1; /* enable nested paging */
        vmcb->_g_pat = MSR_IA32_CR_PAT_RESET; /* guest PAT */
        vmcb->_h_cr3 = pagetable_get_paddr(
            p2m_get_pagetable(p2m_get_hostp2m(v->domain)));

        /* No point in intercepting CR3 reads/writes. */
        vmcb->_cr_intercepts &=
            ~(CR_INTERCEPT_CR3_READ | CR_INTERCEPT_CR3_WRITE);

        /*
         * No point in intercepting INVLPG if we don't have shadow pagetables
         * that need to be fixed up.
         */
        vmcb->_general1_intercepts &= ~GENERAL1_INTERCEPT_INVLPG;

        /* PAT is under complete control of SVM when using nested paging. */
        svm_disable_intercept_for_msr(v, MSR_IA32_CR_PAT);
    }
    else
    {
        vmcb->_exception_intercepts |= (1U << TRAP_page_fault);
    }

    if ( cpu_has_pause_filter )
    {
        vmcb->_pause_filter_count = SVM_PAUSEFILTER_INIT;
        vmcb->_general1_intercepts |= GENERAL1_INTERCEPT_PAUSE;
    }

    vmcb->cleanbits.bytes = 0;

    return 0;
}
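/*
 * For reference: svm_disable_intercept_for_msr() clears bits in the MSR
 * permission bitmap that construct_vmcb() initialises to all-ones. The
 * sketch below illustrates the standard AMD SVM MSRPM layout (2 bits per
 * MSR, read then write, in three 8K-MSR ranges); it is an illustration,
 * not the in-tree lookup helper.
 */
static inline int msrpm_bit_offset(uint32_t msr)
{
    if ( msr <= 0x1fff )
        return msr * 2;                         /* range 0 at byte 0 */
    if ( msr >= 0xc0000000 && msr <= 0xc0001fff )
        return 0x4000 + (msr - 0xc0000000) * 2; /* range 1 at byte 2KiB */
    if ( msr >= 0xc0010000 && msr <= 0xc0011fff )
        return 0x8000 + (msr - 0xc0010000) * 2; /* range 2 at byte 4KiB */
    return -1; /* MSR not covered by the bitmap: always intercepted */
}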
unsigned long __init dom0_compute_nr_pages(
    struct domain *d, struct elf_dom_parms *parms,
    unsigned long initrd_len)
{
    nodeid_t node;
    unsigned long avail = 0, nr_pages, min_pages, max_pages;
    bool_t need_paging;

    for_each_node_mask ( node, dom0_nodes )
        avail += avail_domheap_pages_region(node, 0, 0) +
                 initial_images_nrpages(node);

    /* Reserve memory for further dom0 vcpu-struct allocations... */
    avail -= (d->max_vcpus - 1UL)
             << get_order_from_bytes(sizeof(struct vcpu));
    /* ...and compat_l4's, if needed. */
    if ( is_pv_32bit_domain(d) )
        avail -= d->max_vcpus - 1;

    /* Reserve memory for iommu_dom0_init() (rough estimate). */
    if ( iommu_enabled )
    {
        unsigned int s;

        for ( s = 9; s < BITS_PER_LONG; s += 9 )
            avail -= max_pdx >> s;
    }

    need_paging = is_hvm_domain(d) &&
        (!iommu_hap_pt_share || !paging_mode_hap(d));
    for ( ; ; need_paging = 0 )
    {
        nr_pages = dom0_nrpages;
        min_pages = dom0_min_nrpages;
        max_pages = dom0_max_nrpages;

        /*
         * If allocation isn't specified, reserve 1/16th of available memory
         * for things like DMA buffers. This reservation is clamped to a
         * maximum of 128MB.
         */
        if ( nr_pages == 0 )
            nr_pages = -min(avail / 16, 128UL << (20 - PAGE_SHIFT));

        /* Negative specification means "all memory - specified amount". */
        if ( (long)nr_pages  < 0 ) nr_pages  += avail;
        if ( (long)min_pages < 0 ) min_pages += avail;
        if ( (long)max_pages < 0 ) max_pages += avail;

        /* Clamp according to min/max limits and available memory. */
        nr_pages = max(nr_pages, min_pages);
        nr_pages = min(nr_pages, max_pages);
        nr_pages = min(nr_pages, avail);

        if ( !need_paging )
            break;

        /* Reserve memory for shadow or HAP. */
        avail -= dom0_paging_pages(d, nr_pages);
    }

    if ( is_pv_domain(d) &&
         (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
         ((dom0_min_nrpages <= 0) || (nr_pages > min_pages)) )
    {
        /*
         * Legacy Linux kernels (i.e. such without a XEN_ELFNOTE_INIT_P2M
         * note) require that there is enough virtual space beyond the initial
         * allocation to set up their initial page tables. This space is
         * roughly the same size as the p2m table, so make sure the initial
         * allocation doesn't consume more than about half the space that's
         * available between parms->virt_base and the address space end.
         */
        unsigned long vstart, vend, end;
        size_t sizeof_long = is_pv_32bit_domain(d) ? sizeof(int)
                                                   : sizeof(long);

        vstart = parms->virt_base;
        vend = round_pgup(parms->virt_kend);
        if ( !parms->unmapped_initrd )
            vend += round_pgup(initrd_len);
        end = vend + nr_pages * sizeof_long;

        if ( end > vstart )
            end += end - vstart;
        if ( end <= vstart ||
             (sizeof_long < sizeof(end) && end > (1UL << (8 * sizeof_long))) )
        {
            end = sizeof_long >= sizeof(end) ? 0 : 1UL << (8 * sizeof_long);
            nr_pages = (end - vend) / (2 * sizeof_long);
            if ( dom0_min_nrpages > 0 && nr_pages < min_pages )
                nr_pages = min_pages;
            printk("Dom0 memory clipped to %lu pages\n", nr_pages);
        }
    }

    d->max_pages = min_t(unsigned long, max_pages, UINT_MAX);

    return nr_pages;
}
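/*
 * Worked example of the sizing rule above as a standalone sketch (PAGE_SHIFT
 * and the 4GiB figure are illustrative assumptions): with no explicit
 * dom0_mem= setting, nr_pages starts as the negative of the reservation, and
 * the "negative means avail minus amount" rule then applies.
 */
#include <stdio.h>

#define PAGE_SHIFT 12
#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
    unsigned long avail = 4UL << (30 - PAGE_SHIFT); /* 4GiB worth of pages */
    unsigned long nr_pages = 0;                     /* no dom0_mem= given */

    /* Reserve 1/16th of memory, clamped to 128MB, expressed negatively. */
    if ( nr_pages == 0 )
        nr_pages = -MIN(avail / 16, 128UL << (20 - PAGE_SHIFT));

    /* Negative specification means "all memory - specified amount". */
    if ( (long)nr_pages < 0 )
        nr_pages += avail;

    /* 4GiB avail -> reserve min(256MiB, 128MiB) = 128MiB -> 3968MiB left. */
    printf("dom0 pages: %lu (%lu MiB)\n",
           nr_pages, nr_pages >> (20 - PAGE_SHIFT));
    return 0;
}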