/* * balloon_replace_pages() * Try to replace nextents blocks of 2^order pages. addr_bits specifies * how many bits of address the pages must be within (i.e. 16 would mean * that the pages cannot have an address > 64k). The constraints are on * what the hypervisor gives us -- we are free to give any pages in * exchange. The array pp is the pages we are giving away. The caller * provides storage space for mfns, which hold the new physical pages. */ long balloon_replace_pages(uint_t nextents, page_t **pp, uint_t addr_bits, uint_t order, mfn_t *mfns) { xen_memory_reservation_t memres; long fallback_cnt; long cnt; uint_t i, j, page_cnt, extlen; long e; int locked; /* * we shouldn't be allocating constrained pages on a guest. It doesn't * make any sense. They won't be constrained after a migration. */ ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); extlen = 1 << order; page_cnt = nextents * extlen; /* Give back the current pages to the hypervisor */ for (i = 0; i < page_cnt; i++) { cnt = balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum); if (cnt != 1) { cmn_err(CE_PANIC, "balloon: unable to give a page back " "to the hypervisor.\n"); } } /* * try to allocate the new pages using addr_bits and order. If we can't * get all of the pages, try to get the remaining pages with no * constraints and, if that was successful, return the number of * constrained pages we did allocate. */ bzero(&memres, sizeof (memres)); /*LINTED: constant in conditional context*/ set_xen_guest_handle(memres.extent_start, mfns); memres.domid = DOMID_SELF; memres.nr_extents = nextents; memres.mem_flags = XENMEMF_address_bits(addr_bits); memres.extent_order = order; cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres); /* assign the new MFNs to the current PFNs */ locked = balloon_lock_contig_pfnlist(cnt * extlen); for (i = 0; i < cnt; i++) { for (j = 0; j < extlen; j++) { reassign_pfn(pp[i * extlen + j]->p_pagenum, mfns[i] + j); } } if (locked) unlock_contig_pfnlist(); if (cnt != nextents) { if (cnt < 0) { cnt = 0; } /* * We couldn't get enough memory to satisfy our requirements. * The above loop will assign the parts of the request that * were successful (this part may be 0). We need to fill * in the rest. The bzero below clears out extent_order and * address_bits, so we'll take anything from the hypervisor * to replace the pages we gave away. */ fallback_cnt = page_cnt - cnt * extlen; bzero(&memres, sizeof (memres)); /*LINTED: constant in conditional context*/ set_xen_guest_handle(memres.extent_start, mfns); memres.domid = DOMID_SELF; memres.nr_extents = fallback_cnt; e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres); if (e != fallback_cnt) { cmn_err(CE_PANIC, "balloon: unable to recover from " "failed increase_reservation.\n"); } locked = balloon_lock_contig_pfnlist(fallback_cnt); for (i = 0; i < fallback_cnt; i++) { uint_t offset = page_cnt - fallback_cnt; /* * We already used pp[0...(cnt * extlen)] before, * so start at the next entry in the pp array. */ reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]); } if (locked) unlock_contig_pfnlist(); } /* * balloon_free_pages increments our counter. Decrement it here. */ atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt); /* * return the number of extents we were able to replace. If we got * this far, we know all the pp's are valid. */ return (cnt); }
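/*
 * Illustrative sketch, not part of the original source: the core exchange
 * that balloon_replace_pages() performs, reduced to a single 4K page. One
 * frame is handed back with XENMEM_decrease_reservation (the original code
 * does this through balloon_free_pages()) and a replacement constrained by
 * XENMEMF_address_bits() is requested with XENMEM_increase_reservation.
 * The helper name and the minimal error handling are assumptions.
 */
static int
xchg_one_page_below(mfn_t old_mfn, uint_t addr_bits, mfn_t *new_mfn)
{
	xen_memory_reservation_t memres;
	long rc;

	/* Hand the current machine frame back to the hypervisor. */
	bzero(&memres, sizeof (memres));
	set_xen_guest_handle(memres.extent_start, &old_mfn);
	memres.domid = DOMID_SELF;
	memres.nr_extents = 1;
	rc = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memres);
	if (rc != 1)
		return (-1);

	/* Ask for one replacement frame that fits within addr_bits. */
	bzero(&memres, sizeof (memres));
	set_xen_guest_handle(memres.extent_start, new_mfn);
	memres.domid = DOMID_SELF;
	memres.nr_extents = 1;
	memres.mem_flags = XENMEMF_address_bits(addr_bits);
	rc = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
	return (rc == 1 ? 0 : -1);
}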
/* map fgmfn of domid to lpfn in the current domain */ static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn, unsigned int domid) { int rc; struct xen_add_to_physmap_range xatp = { .domid = DOMID_SELF, .foreign_domid = domid, .size = 1, .space = XENMAPSPACE_gmfn_foreign, }; xen_ulong_t idx = fgmfn; xen_pfn_t gpfn = lpfn; int err = 0; set_xen_guest_handle(xatp.idxs, &idx); set_xen_guest_handle(xatp.gpfns, &gpfn); set_xen_guest_handle(xatp.errs, &err); rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); if (rc || err) { pr_warn("Failed to map pfn to mfn rc:%d:%d pfn:%lx mfn:%lx\n", rc, err, lpfn, fgmfn); return 1; } return 0; } struct remap_data { xen_pfn_t fgmfn; /* foreign domain's gmfn */ pgprot_t prot; domid_t domid; struct vm_area_struct *vma; int index; struct page **pages; struct xen_remap_mfn_info *info; }; static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, void *data) { struct remap_data *info = data; struct page *page = info->pages[info->index++]; unsigned long pfn = page_to_pfn(page); pte_t pte = pfn_pte(pfn, info->prot); if (map_foreign_page(pfn, info->fgmfn, info->domid)) return -EFAULT; set_pte_at(info->vma->vm_mm, addr, ptep, pte); return 0; } int xen_remap_domain_mfn_range(struct vm_area_struct *vma, unsigned long addr, xen_pfn_t mfn, int nr, pgprot_t prot, unsigned domid, struct page **pages) { int err; struct remap_data data; /* TBD: Batching, current sole caller only does page at a time */ if (nr > 1) return -EINVAL; data.fgmfn = mfn; data.prot = prot; data.domid = domid; data.vma = vma; data.index = 0; data.pages = pages; err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, remap_pte_fn, &data); return err; }
/** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ char * __init xen_memory_setup(void) { static struct e820entry map[E820MAX] __initdata; unsigned long max_pfn = xen_start_info->nr_pages; unsigned long long mem_end; int rc; struct xen_memory_map memmap; unsigned long extra_pages = 0; unsigned long extra_limit; int i; int op; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); mem_end = PFN_PHYS(max_pfn); memmap.nr_entries = E820MAX; set_xen_guest_handle(memmap.buffer, map); op = xen_initial_domain() ? XENMEM_machine_memory_map : XENMEM_memory_map; rc = HYPERVISOR_memory_op(op, &memmap); if (rc == -ENOSYS) { BUG_ON(xen_initial_domain()); memmap.nr_entries = 1; map[0].addr = 0ULL; map[0].size = mem_end; /* 8MB slack (to balance backend allocations). */ map[0].size += 8ULL << 20; map[0].type = E820_RAM; rc = 0; } BUG_ON(rc); e820.nr_map = 0; xen_extra_mem_start = mem_end; for (i = 0; i < memmap.nr_entries; i++) { unsigned long long end = map[i].addr + map[i].size; if (map[i].type == E820_RAM) { if (map[i].addr < mem_end && end > mem_end) { /* Truncate region to max_mem. */ u64 delta = end - mem_end; map[i].size -= delta; extra_pages += PFN_DOWN(delta); end = mem_end; } } if (end > xen_extra_mem_start) xen_extra_mem_start = end; /* If region is non-RAM or below mem_end, add what remains */ if ((map[i].type != E820_RAM || map[i].addr < mem_end) && map[i].size > 0) e820_add_region(map[i].addr, map[i].size, map[i].type); } /* * In domU, the ISA region is normal, usable memory, but we * reserve ISA memory anyway because too many things poke * about in there. * * In Dom0, the host E820 information can leave gaps in the * ISA range, which would cause us to release those pages. To * avoid this, we unconditionally reserve them here. */ e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); /* * Reserve Xen bits: * - mfn_list * - xen_start_info * See comment above "struct start_info" in <xen/interface/xen.h> */ memblock_x86_reserve_range(__pa(xen_start_info->mfn_list), __pa(xen_start_info->pt_base), "XEN START INFO"); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO * factor the base size. On non-highmem systems, the base * size is the full initial memory allocation; on highmem it * is limited to the max size of lowmem, so that it doesn't * get completely filled. * * In principle there could be a problem in lowmem systems if * the initial memory is also very large with respect to * lowmem, but we won't try to deal with that here. */ extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), max_pfn + extra_pages); if (extra_limit >= max_pfn) extra_pages = extra_limit - max_pfn; else extra_pages = 0; if (!xen_initial_domain()) xen_add_extra_mem(extra_pages); return "Xen"; }
static int dealloc_pte_fn( pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) { unsigned long mfn = pte_mfn(*pte); int ret; struct xen_memory_reservation reservation = { .nr_extents = 1, .extent_order = 0, .domid = DOMID_SELF }; set_xen_guest_handle(reservation.extent_start, &mfn); set_pte_at(&init_mm, addr, pte, __pte_ma(0)); set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY); ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); BUG_ON(ret != 1); return 0; } #endif struct page **alloc_empty_pages_and_pagevec(int nr_pages) { unsigned long vaddr, flags; struct page *page, **pagevec; int i, ret; pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL); if (pagevec == NULL) return NULL; for (i = 0; i < nr_pages; i++) { page = pagevec[i] = alloc_page(GFP_KERNEL); if (page == NULL) goto err; vaddr = (unsigned long)page_address(page); scrub_pages(vaddr, 1); balloon_lock(flags); if (xen_feature(XENFEAT_auto_translated_physmap)) { unsigned long gmfn = page_to_pfn(page); struct xen_memory_reservation reservation = { .nr_extents = 1, .extent_order = 0, .domid = DOMID_SELF }; set_xen_guest_handle(reservation.extent_start, &gmfn); ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); if (ret == 1) ret = 0; /* success */ } else { #ifdef CONFIG_XEN ret = apply_to_page_range(&init_mm, vaddr, PAGE_SIZE, dealloc_pte_fn, NULL); #else /* Cannot handle non-auto translate mode. */ ret = 1; #endif } if (ret != 0) { balloon_unlock(flags); __free_page(page); goto err; } totalram_pages = --current_pages; balloon_unlock(flags); } out: schedule_work(&balloon_worker); #ifdef CONFIG_XEN flush_tlb_all(); #endif return pagevec; err: balloon_lock(flags); while (--i >= 0) balloon_append(pagevec[i]); balloon_unlock(flags); kfree(pagevec); pagevec = NULL; goto out; }
static unsigned long __init xen_release_chunk(phys_addr_t start_addr, phys_addr_t end_addr) { struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, .domid = DOMID_SELF }; unsigned long start, end; unsigned long len = 0; unsigned long pfn; int ret; start = PFN_UP(start_addr); end = PFN_DOWN(end_addr); if (end <= start) return 0; printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ", start, end); for(pfn = start; pfn < end; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); /* Make sure pfn exists to start with */ if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) continue; set_xen_guest_handle(reservation.extent_start, &mfn); reservation.nr_extents = 1; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", start, end, ret); if (ret == 1) { set_phys_to_machine(pfn, INVALID_P2M_ENTRY); len++; } } printk(KERN_CONT "%ld pages freed\n", len); return len; } static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, const struct e820map *e820) { phys_addr_t max_addr = PFN_PHYS(max_pfn); phys_addr_t last_end = 0; unsigned long released = 0; int i; for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { phys_addr_t end = e820->map[i].addr; end = min(max_addr, end); released += xen_release_chunk(last_end, end); last_end = e820->map[i].addr + e820->map[i].size; } if (last_end < max_addr) released += xen_release_chunk(last_end, max_addr); printk(KERN_INFO "released %ld pages of unused memory\n", released); return released; }
static unsigned long __init xen_release_chunk(unsigned long start, unsigned long end) { struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, .domid = DOMID_SELF }; unsigned long len = 0; unsigned long pfn; int ret; for(pfn = start; pfn < end; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); /* Make sure pfn exists to start with */ if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) continue; set_xen_guest_handle(reservation.extent_start, &mfn); reservation.nr_extents = 1; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); if (ret == 1) { __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); len++; } } printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n", start, end, len); return len; } static unsigned long __init xen_set_identity_and_release( const struct e820entry *list, size_t map_size, unsigned long nr_pages) { phys_addr_t start = 0; unsigned long released = 0; unsigned long identity = 0; const struct e820entry *entry; int i; /* * Combine non-RAM regions and gaps until a RAM region (or the * end of the map) is reached, then set the 1:1 map and * release the pages (if available) in those non-RAM regions. * * The combined non-RAM regions are rounded to a whole number * of pages so any partial pages are accessible via the 1:1 * mapping. This is needed for some BIOSes that put (for * example) the DMI tables in a reserved region that begins on * a non-page boundary. */ for (i = 0, entry = list; i < map_size; i++, entry++) { phys_addr_t end = entry->addr + entry->size; if (entry->type == E820_RAM || i == map_size - 1) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); if (entry->type == E820_RAM) end_pfn = PFN_UP(entry->addr); if (start_pfn < end_pfn) { if (start_pfn < nr_pages) released += xen_release_chunk( start_pfn, min(end_pfn, nr_pages)); identity += set_phys_range_identity( start_pfn, end_pfn); } start = end; } } printk(KERN_INFO "Released %lu pages of unused memory\n", released); printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity); return released; }
void __init setup_arch(char **cmdline_p) { unsigned long kernel_end; #if defined(CONFIG_XEN_PRIVILEGED_GUEST) struct e820entry *machine_e820; struct xen_memory_map memmap; #endif #ifdef CONFIG_XEN /* Register a call for panic conditions. */ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); ROOT_DEV = MKDEV(RAMDISK_MAJOR,0); kernel_end = 0; /* dummy */ screen_info = SCREEN_INFO; if (xen_start_info->flags & SIF_INITDOMAIN) { /* This is drawn from a dump from vgacon:startup in * standard Linux. */ screen_info.orig_video_mode = 3; screen_info.orig_video_isVGA = 1; screen_info.orig_video_lines = 25; screen_info.orig_video_cols = 80; screen_info.orig_video_ega_bx = 3; screen_info.orig_video_points = 16; } else screen_info.orig_video_isVGA = 0; edid_info = EDID_INFO; saved_video_mode = SAVED_VIDEO_MODE; bootloader_type = LOADER_TYPE; #ifdef CONFIG_BLK_DEV_RAM rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); #endif setup_xen_features(); HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); ARCH_SETUP #else ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); screen_info = SCREEN_INFO; edid_info = EDID_INFO; saved_video_mode = SAVED_VIDEO_MODE; bootloader_type = LOADER_TYPE; #ifdef CONFIG_BLK_DEV_RAM rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); #endif #endif /* !CONFIG_XEN */ setup_memory_region(); copy_edd(); if (!MOUNT_ROOT_RDONLY) root_mountflags &= ~MS_RDONLY; init_mm.start_code = (unsigned long) &_text; init_mm.end_code = (unsigned long) &_etext; init_mm.end_data = (unsigned long) &_edata; init_mm.brk = (unsigned long) &_end; #ifndef CONFIG_XEN code_resource.start = virt_to_phys(&_text); code_resource.end = virt_to_phys(&_etext)-1; data_resource.start = virt_to_phys(&_etext); data_resource.end = virt_to_phys(&_edata)-1; #endif parse_cmdline_early(cmdline_p); early_identify_cpu(&boot_cpu_data); /* * partially used pages are not usable - thus * we are rounding upwards: */ end_pfn = e820_end_of_ram(); num_physpages = end_pfn; /* for pfn_valid */ check_efer(); #ifndef CONFIG_XEN discover_ebda(); #endif init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. */ acpi_numa_init(); #endif #ifdef CONFIG_NUMA numa_initmem_init(0, end_pfn); #else contig_initmem_init(0, end_pfn); #endif /* Reserve direct mapping */ reserve_bootmem_generic(table_start << PAGE_SHIFT, (table_end - table_start) << PAGE_SHIFT); /* reserve kernel */ kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE); reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY); #ifdef CONFIG_XEN /* reserve physmap, start info and initial page tables */ reserve_bootmem(kernel_end, (table_start<<PAGE_SHIFT)-kernel_end); #else /* * reserve physical page 0 - it's a special BIOS page on many boxes, * enabling clean reboots, SMP operation, laptop functions. */ reserve_bootmem_generic(0, PAGE_SIZE); /* reserve ebda region */ if (ebda_addr) reserve_bootmem_generic(ebda_addr, ebda_size); #endif #ifdef CONFIG_SMP /* * But first pinch a few for the stack/trampoline stuff * FIXME: Don't need the extra page at 4K, but need to fix * trampoline before removing it. 
(see the GDT stuff) */ reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE); /* Reserve SMP trampoline */ reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE); #endif #ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. */ acpi_reserve_bootmem(); #endif #ifdef CONFIG_XEN #ifdef CONFIG_BLK_DEV_INITRD if (xen_start_info->mod_start) { if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { /*reserve_bootmem_generic(INITRD_START, INITRD_SIZE);*/ initrd_start = INITRD_START + PAGE_OFFSET; initrd_end = initrd_start+INITRD_SIZE; initrd_below_start_ok = 1; } else { printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", (unsigned long)(INITRD_START + INITRD_SIZE), (unsigned long)(end_pfn << PAGE_SHIFT)); initrd_start = 0; } } #endif #else /* CONFIG_XEN */ #ifdef CONFIG_BLK_DEV_INITRD if (LOADER_TYPE && INITRD_START) { if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { reserve_bootmem_generic(INITRD_START, INITRD_SIZE); initrd_start = INITRD_START ? INITRD_START + PAGE_OFFSET : 0; initrd_end = initrd_start+INITRD_SIZE; } else { printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", (unsigned long)(INITRD_START + INITRD_SIZE), (unsigned long)(end_pfn << PAGE_SHIFT)); initrd_start = 0; } } #endif #endif /* !CONFIG_XEN */ #ifdef CONFIG_KEXEC if (crashk_res.start != crashk_res.end) { reserve_bootmem(crashk_res.start, crashk_res.end - crashk_res.start + 1); } #endif paging_init(); #ifdef CONFIG_X86_LOCAL_APIC /* * Find and reserve possible boot-time SMP configuration: */ find_smp_config(); #endif #ifdef CONFIG_XEN { int i, j, k, fpp; unsigned long va; /* 'Initial mapping' of initrd must be destroyed. */ for (va = xen_start_info->mod_start; va < (xen_start_info->mod_start+xen_start_info->mod_len); va += PAGE_SIZE) { HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0); } if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* Make sure we have a large enough P->M table. */ phys_to_machine_mapping = alloc_bootmem( end_pfn * sizeof(unsigned long)); memset(phys_to_machine_mapping, ~0, end_pfn * sizeof(unsigned long)); memcpy(phys_to_machine_mapping, (unsigned long *)xen_start_info->mfn_list, xen_start_info->nr_pages * sizeof(unsigned long)); free_bootmem( __pa(xen_start_info->mfn_list), PFN_PHYS(PFN_UP(xen_start_info->nr_pages * sizeof(unsigned long)))); /* Destroyed 'initial mapping' of old p2m table. */ for (va = xen_start_info->mfn_list; va < (xen_start_info->mfn_list + (xen_start_info->nr_pages*sizeof(unsigned long))); va += PAGE_SIZE) { HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0); } /* * Initialise the list of the frames that specify the * list of frames that make up the p2m table. Used by * save/restore. */ pfn_to_mfn_frame_list_list = alloc_bootmem(PAGE_SIZE); HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(pfn_to_mfn_frame_list_list); fpp = PAGE_SIZE/sizeof(unsigned long); for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) { if ((j % fpp) == 0) { k++; BUG_ON(k>=fpp); pfn_to_mfn_frame_list[k] = alloc_bootmem(PAGE_SIZE); pfn_to_mfn_frame_list_list[k] = virt_to_mfn(pfn_to_mfn_frame_list[k]); j=0; } pfn_to_mfn_frame_list[k][j] = virt_to_mfn(&phys_to_machine_mapping[i]); } HYPERVISOR_shared_info->arch.max_pfn = end_pfn; } } if (xen_start_info->flags & SIF_INITDOMAIN) dmi_scan_machine(); if ( ! 
(xen_start_info->flags & SIF_INITDOMAIN)) { acpi_disabled = 1; #ifdef CONFIG_ACPI acpi_ht = 0; #endif } #endif #ifndef CONFIG_XEN check_ioapic(); #endif zap_low_mappings(0); /* * set this early, so we dont allocate cpu0 * if MADT list doesnt list BSP first * mpparse.c/MP_processor_info() allocates logical cpu numbers. */ cpu_set(0, cpu_present_map); #ifdef CONFIG_ACPI /* * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). * Call this early for SRAT node setup. */ acpi_boot_table_init(); /* * Read APIC and some other early information from ACPI tables. */ acpi_boot_init(); #endif init_cpu_to_node(); #ifdef CONFIG_X86_LOCAL_APIC /* * get boot-time SMP configuration: */ if (smp_found_config) get_smp_config(); #ifndef CONFIG_XEN init_apic_mappings(); #endif #endif #if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU) prefill_possible_map(); #endif /* * Request address space for all standard RAM and ROM resources * and also for regions reported as reserved by the e820. */ #if defined(CONFIG_XEN_PRIVILEGED_GUEST) probe_roms(); if (xen_start_info->flags & SIF_INITDOMAIN) { machine_e820 = alloc_bootmem_low_pages(PAGE_SIZE); memmap.nr_entries = E820MAX; set_xen_guest_handle(memmap.buffer, machine_e820); BUG_ON(HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)); e820_reserve_resources(machine_e820, memmap.nr_entries); } #elif !defined(CONFIG_XEN) probe_roms(); e820_reserve_resources(e820.map, e820.nr_map); #endif request_resource(&iomem_resource, &video_ram_resource); { unsigned i; /* request I/O space for devices used on all i[345]86 PCs */ for (i = 0; i < STANDARD_IO_RESOURCES; i++) request_resource(&ioport_resource, &standard_io_resources[i]); } #if defined(CONFIG_XEN_PRIVILEGED_GUEST) if (xen_start_info->flags & SIF_INITDOMAIN) { e820_setup_gap(machine_e820, memmap.nr_entries); free_bootmem(__pa(machine_e820), PAGE_SIZE); } #elif !defined(CONFIG_XEN) e820_setup_gap(e820.map, e820.nr_map); #endif #ifdef CONFIG_GART_IOMMU iommu_hole_init(); #endif #ifdef CONFIG_XEN { struct physdev_set_iopl set_iopl; set_iopl.iopl = 1; HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); if (xen_start_info->flags & SIF_INITDOMAIN) { if (!(xen_start_info->flags & SIF_PRIVILEGED)) panic("Xen granted us console access " "but not privileged status"); #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) conswitchp = &vga_con; #elif defined(CONFIG_DUMMY_CONSOLE) conswitchp = &dummy_con; #endif #endif } else { extern int console_use_vt; console_use_vt = 0; } } #else /* CONFIG_XEN */ #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) conswitchp = &vga_con; #elif defined(CONFIG_DUMMY_CONSOLE) conswitchp = &dummy_con; #endif #endif #endif /* !CONFIG_XEN */ }
static void init_mem_alloc(void) { int local; /* variables needed to find start region */ paddr_t scratch_start; xen_memory_map_t map; DBG_MSG("Entered init_mem_alloc()\n"); /* * Free memory follows the stack. There's at least 512KB of scratch * space, rounded up to at least 2Mb alignment. That should be enough * for the page tables we'll need to build. The nucleus memory is * allocated last and will be outside the addressable range. We'll * switch to new page tables before we unpack the kernel. */ scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE); DBG(scratch_start); scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG); DBG(scratch_end); /* * For paranoia, leave some space between hypervisor data and ours. * Use 500 instead of 512. */ next_avail_addr = scratch_end - 500 * 1024; DBG(next_avail_addr); /* * The domain builder gives us at most 1 module */ DBG(xen_info->mod_len); if (xen_info->mod_len > 0) { DBG(xen_info->mod_start); modules[0].bm_addr = xen_info->mod_start; modules[0].bm_size = xen_info->mod_len; bi->bi_module_cnt = 1; bi->bi_modules = (native_ptr_t)modules; } else { bi->bi_module_cnt = 0; bi->bi_modules = NULL; } DBG(bi->bi_module_cnt); DBG(bi->bi_modules); DBG(xen_info->mfn_list); DBG(xen_info->nr_pages); max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT; DBG(max_mem); /* * Using pseudo-physical addresses, so only 1 memlist element */ memlists[0].addr = 0; DBG(memlists[0].addr); memlists[0].size = max_mem; DBG(memlists[0].size); memlists_used = 1; DBG(memlists_used); /* * finish building physinstall list */ sort_physinstall(); /* * build bios reserved memlists */ build_rsvdmemlists(); if (DOMAIN_IS_INITDOMAIN(xen_info)) { /* * build PCI Memory list */ map.nr_entries = MAXMAPS; /*LINTED: constant in conditional context*/ set_xen_guest_handle(map.buffer, map_buffer); if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0) dboot_panic("getting XENMEM_machine_memory_map failed"); build_pcimemlists(map_buffer, map.nr_entries); } }
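/*
 * Illustrative sketch, not part of the original source: the
 * XENMEM_machine_memory_map query issued above, pulled out as a helper.
 * The hypervisor fills map_buffer and rewrites nr_entries with the number
 * of E820 entries it actually returned; MAXMAPS bounds the buffer. The
 * helper name is an assumption.
 */
static long
get_machine_e820(void)
{
	xen_memory_map_t map;

	map.nr_entries = MAXMAPS;
	/*LINTED: constant in conditional context*/
	set_xen_guest_handle(map.buffer, map_buffer);
	if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
		return (-1);
	return ((long)map.nr_entries);
}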
static BOOLEAN BalloonReleasePfnArray( IN ULONG Requested, OUT PULONG pReleased ) { xen_memory_reservation_t reservation; LARGE_INTEGER Start; LARGE_INTEGER End; ULONGLONG TimeDelta; BOOLEAN Slow; ULONG Index; ULONG Registered; ULONG Released; XM_ASSERT(Requested <= BALLOON_PFN_ARRAY_SIZE); KeQuerySystemTime(&Start); Released = 0; if (Requested == 0) goto done; for (Index = 0; Index < Requested; Index++) { if (Balloon.PfnArray[Index] == 0) { TraceError(("%s: PFN[%d] == 0\n", __FUNCTION__, Index)); XM_BUG(); } } Registered = RangeSetAddItems(&(Balloon.PfnsBalloonedOut), &(Balloon.PfnArray[0]), Requested); if (Registered < Requested) { TraceError(("%s: failed to register %d page(s)\n", __FUNCTION__, Requested - Registered)); if (Registered == 0) goto done; } SET_XEN_GUEST_HANDLE(reservation.extent_start, Balloon.PfnArray); reservation.extent_order = 0; reservation.mem_flags = 0; // unused reservation.domid = DOMID_SELF; reservation.nr_extents = Registered; Released = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); if (Released < Registered) { TraceWarning(("%s: partial release (%d < %d)\n", __FUNCTION__, Released, Registered)); // This should not fail as we're removing ranges we just added RangeSetRemoveItems(&(Balloon.PfnsBalloonedOut), &(Balloon.PfnArray[Released]), Registered - Released); } else if (Released > Registered) { XM_BUG(); } RtlZeroMemory(Balloon.PfnArray, Released * sizeof (PFN_NUMBER)); done: RangeSetDropRseCache(&(Balloon.PfnsBalloonedOut)); TraceVerbose(("%s: %d page(s)\n", __FUNCTION__, Released)); KeQuerySystemTime(&End); TimeDelta = (End.QuadPart - Start.QuadPart) / 10000ull; Slow = FALSE; if (TimeDelta != 0) { ULONGLONG Rate; Rate = (ULONGLONG)(Released * 1000) / TimeDelta; if (Rate < MIN_PAGES_PER_S) { TraceWarning(("%s: ran for more than %dms\n", __FUNCTION__, TimeDelta)); Slow = TRUE; } } *pReleased = Released; return Slow; }
static int increase_reservation(unsigned long nr_pages) { unsigned long pfn, i, flags; struct page *page; long rc; struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, .domid = DOMID_SELF }; if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); spin_lock_irqsave(&balloon_lock, flags); page = balloon_first_page(); for (i = 0; i < nr_pages; i++) { BUG_ON(page == NULL); frame_list[i] = page_to_pfn(page); page = balloon_next_page(page); } set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); if (rc < 0) goto out; for (i = 0; i < rc; i++) { page = balloon_retrieve(); BUG_ON(page == NULL); pfn = page_to_pfn(page); BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && phys_to_machine_mapping_valid(pfn)); set_phys_to_machine(pfn, frame_list[i]); /* Link back into the page tables if not highmem. */ #ifdef CONFIG_PVM if (!xen_hvm_domain() && pfn < max_low_pfn) { int ret; ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), mfn_pte(frame_list[i], PAGE_KERNEL), 0); BUG_ON(ret); } #endif /* Relinquish the page back to the allocator. */ ClearPageReserved(page); init_page_count(page); __free_page(page); } balloon_stats.current_pages += rc; if (old_totalram_pages + rc < totalram_pages) { printk(KERN_INFO "old_totalram=%luKB, totalram_pages=%luKB\n", old_totalram_pages*4, totalram_pages*4); balloon_stats.current_pages = totalram_pages + totalram_bias; printk(KERN_INFO "when ballooning, the mem online! totalram=%luKB, current=%luKB\n", totalram_pages*4, balloon_stats.current_pages*4); } old_totalram_pages = totalram_pages; out: spin_unlock_irqrestore(&balloon_lock, flags); return rc < 0 ? rc : rc != nr_pages; } static int decrease_reservation(unsigned long nr_pages) { unsigned long pfn, i, flags; struct page *page; int need_sleep = 0; int ret; struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, .domid = DOMID_SELF }; if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); for (i = 0; i < nr_pages; i++) { if ((page = alloc_page(GFP_BALLOON)) == NULL) { nr_pages = i; need_sleep = 1; break; } pfn = page_to_pfn(page); frame_list[i] = pfn_to_mfn(pfn); scrub_page(page); if (!xen_hvm_domain() && !PageHighMem(page)) { ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), __pte_ma(0), 0); BUG_ON(ret); } } /* Ensure that ballooned highmem pages don't have kmaps. */ #ifdef CONFIG_PVM kmap_flush_unused(); flush_tlb_all(); #endif spin_lock_irqsave(&balloon_lock, flags); /* No more mappings: invalidate P2M and add to balloon. */ for (i = 0; i < nr_pages; i++) { pfn = mfn_to_pfn(frame_list[i]); set_phys_to_machine(pfn, INVALID_P2M_ENTRY); balloon_append(pfn_to_page(pfn)); } set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); BUG_ON(ret != nr_pages); balloon_stats.current_pages -= nr_pages; if(old_totalram_pages < totalram_pages + nr_pages) { printk(KERN_INFO "old_totalram=%luKB, totalram_pages=%luKB\n", old_totalram_pages*4, totalram_pages*4); balloon_stats.current_pages = totalram_pages + totalram_bias; printk(KERN_INFO "when ballooning, the mem online! 
totalram=%luKB, current=%luKB\n", totalram_pages*4, balloon_stats.current_pages*4); } old_totalram_pages = totalram_pages; spin_unlock_irqrestore(&balloon_lock, flags); return need_sleep; } /* * We avoid multiple worker processes conflicting via the balloon mutex. * We may of course race updates of the target counts (which are protected * by the balloon lock), or with changes to the Xen hard limit, but we will * recover from these in time. */ static void balloon_process(struct work_struct *work) { int need_sleep = 0; long credit; long total_increase = 0; char buffer[16]; mutex_lock(&balloon_mutex); printk(KERN_INFO "totalram_pages=%luKB, current_pages=%luKB,totalram_bias=%luKB\n", totalram_pages*4, balloon_stats.current_pages*4, totalram_bias*4); if (totalram_pages > old_totalram_pages) { //TODO:Just know that totalram_pages will increase. total_increase = (totalram_pages - old_totalram_pages) % GB2PAGE; if (totalram_bias > total_increase ) { totalram_bias = totalram_bias - total_increase; } balloon_stats.current_pages = totalram_pages + totalram_bias; old_totalram_pages = totalram_pages; } printk(KERN_INFO "totalram_pages=%luKB, current_pages=%luKB, totalram_bias=%luKB,total_increase=%ld\n", totalram_pages*4, balloon_stats.current_pages*4, totalram_bias*4, total_increase*4); xenbus_write(XBT_NIL, "control/uvp", "Balloon_flag", "1"); do { credit = current_target() - balloon_stats.current_pages; if (credit > 0) need_sleep = (increase_reservation(credit) != 0); if (credit < 0) need_sleep = (decrease_reservation(-credit) != 0); #ifndef CONFIG_PREEMPT if (need_resched()) schedule(); #endif } while ((credit != 0) && !need_sleep); /* Schedule more work if there is some still to be done. */ if (current_target() != balloon_stats.current_pages) { mod_timer(&balloon_timer, jiffies + HZ); sprintf(buffer,"%lu",balloon_stats.current_pages<<(PAGE_SHIFT-10)); xenbus_write(XBT_NIL, "memory", "target", buffer); } xenbus_write(XBT_NIL, "control/uvp", "Balloon_flag", "0"); mutex_unlock(&balloon_mutex); }
void __init xen_start_kernel(void) { unsigned int i; struct xen_machphys_mapping mapping; unsigned long machine_to_phys_nr_ents; #ifdef CONFIG_X86_32 struct xen_platform_parameters pp; extern pte_t swapper_pg_fixmap[PTRS_PER_PTE]; unsigned long addr; #endif xen_setup_features(); if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { machine_to_phys_mapping = (unsigned long *)mapping.v_start; machine_to_phys_nr_ents = mapping.max_mfn + 1; } else machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents ) machine_to_phys_order++; if (!xen_feature(XENFEAT_auto_translated_physmap)) phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables)); reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE), __pa(xen_start_info->pt_base) + (xen_start_info->nr_pt_frames << PAGE_SHIFT), "Xen provided"); #ifdef CONFIG_X86_32 WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments)); init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base; if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) { hypervisor_virt_start = pp.virt_start; reserve_top_address(0UL - pp.virt_start); } BUG_ON(pte_index(hypervisor_virt_start)); /* Do an early initialization of the fixmap area */ make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables); addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE); set_pmd(pmd_offset(pud_offset(swapper_pg_dir + pgd_index(addr), addr), addr), __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE)); #else check_efer(); xen_init_pt(); #endif #define __FIXADDR_TOP (-PAGE_SIZE) #define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) #define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \ != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE))) FIX_BUG_ON(SHARED_INFO); FIX_BUG_ON(ISAMAP_BEGIN); FIX_BUG_ON(ISAMAP_END); #undef pmd_index #undef __FIXADDR_TOP /* Switch to the real shared_info page, and clear the dummy page. */ set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); memset(empty_zero_page, 0, sizeof(empty_zero_page)); setup_vcpu_info(0); /* Set up mapping of lowest 1MB of physical memory. */ for (i = 0; i < NR_FIX_ISAMAPS; i++) if (is_initial_xendomain()) set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE); else __set_fixmap(FIX_ISAMAP_BEGIN - i, virt_to_machine(empty_zero_page), PAGE_KERNEL_RO); }
static int gnttab_map(unsigned int start_idx, unsigned int end_idx) { struct gnttab_setup_table setup; unsigned long start_gpfn = 0; xen_pfn_t *frames; unsigned int nr_gframes = end_idx + 1; int rc; if (xen_hvm_domain() || xen_feature(XENFEAT_auto_translated_physmap)) { struct xen_add_to_physmap xatp; unsigned int i = end_idx; rc = 0; if (xen_hvm_domain()) start_gpfn = xen_hvm_resume_frames >> PAGE_SHIFT; /* * Loop backwards, so that the first hypercall has the largest * index, ensuring that the table will grow only once. */ do { xatp.domid = DOMID_SELF; xatp.idx = i; xatp.space = XENMAPSPACE_grant_table; if (xen_hvm_domain()) xatp.gpfn = start_gpfn + i; else xatp.gpfn = pvh_get_grant_pfn(i); rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); if (rc != 0) { printk(KERN_WARNING "grant table add_to_physmap failed, err=%d\n", rc); break; } } while (i-- > start_idx); return rc; } /* No need for kzalloc as it is initialized in following hypercall * GNTTABOP_setup_table. */ frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC); if (!frames) return -ENOMEM; setup.dom = DOMID_SELF; setup.nr_frames = nr_gframes; set_xen_guest_handle(setup.frame_list, frames); rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1); if (rc == -ENOSYS) { kfree(frames); return -ENOSYS; } BUG_ON(rc || setup.status); rc = gnttab_interface->map_frames(frames, nr_gframes); kfree(frames); return rc; }
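/*
 * Illustrative sketch, not part of the original source: the single
 * add_to_physmap step that the backwards loop in gnttab_map() issues for
 * each grant-table frame, shown in isolation. The helper name is an
 * assumption; choosing gpfn (xen_hvm_resume_frames vs. pvh_get_grant_pfn())
 * is left to the caller.
 */
static int map_one_grant_frame(unsigned int idx, unsigned long gpfn)
{
	struct xen_add_to_physmap xatp = {
		.domid = DOMID_SELF,
		.idx   = idx,
		.space = XENMAPSPACE_grant_table,
		.gpfn  = gpfn,
	};

	/* 0 on success, negative Xen error code otherwise. */
	return HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);
}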
/* * Top level routine to direct suspend/resume of a domain. */ void xen_suspend_domain(void) { extern void rtcsync(void); extern void ec_resume(void); extern kmutex_t ec_lock; struct xen_add_to_physmap xatp; ulong_t flags; int err; cmn_err(CE_NOTE, "Domain suspending for save/migrate"); SUSPEND_DEBUG("xen_suspend_domain\n"); /* * We only want to suspend the PV devices, since the emulated devices * are suspended by saving the emulated device state. The PV devices * are all children of the xpvd nexus device. So we search the * device tree for the xpvd node to use as the root of the tree to * be suspended. */ if (xpvd_dip == NULL) ddi_walk_devs(ddi_root_node(), check_xpvd, NULL); /* * suspend interrupts and devices */ if (xpvd_dip != NULL) (void) xen_suspend_devices(ddi_get_child(xpvd_dip)); else cmn_err(CE_WARN, "No PV devices found to suspend"); SUSPEND_DEBUG("xenbus_suspend\n"); xenbus_suspend(); mutex_enter(&cpu_lock); /* * Suspend on vcpu 0 */ thread_affinity_set(curthread, 0); kpreempt_disable(); if (ncpus > 1) pause_cpus(NULL, NULL); /* * We can grab the ec_lock as it's a spinlock with a high SPL. Hence * any holder would have dropped it to get through pause_cpus(). */ mutex_enter(&ec_lock); /* * From here on in, we can't take locks. */ flags = intr_clear(); SUSPEND_DEBUG("HYPERVISOR_suspend\n"); /* * At this point we suspend and sometime later resume. Note that this * call may return with an indication of a cancelled suspend; for now, * no matter what the return is, we do a full resume of all suspended * drivers, etc. */ (void) HYPERVISOR_shutdown(SHUTDOWN_suspend); /* * Point HYPERVISOR_shared_info to the proper place. */ xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; xatp.gpfn = xen_shared_info_frame; if ((err = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) != 0) panic("Could not set shared_info page. error: %d", err); SUSPEND_DEBUG("gnttab_resume\n"); gnttab_resume(); SUSPEND_DEBUG("ec_resume\n"); ec_resume(); intr_restore(flags); if (ncpus > 1) start_cpus(); mutex_exit(&ec_lock); mutex_exit(&cpu_lock); /* * Now we can take locks again. */ rtcsync(); SUSPEND_DEBUG("xenbus_resume\n"); xenbus_resume(); SUSPEND_DEBUG("xen_resume_devices\n"); if (xpvd_dip != NULL) (void) xen_resume_devices(ddi_get_child(xpvd_dip), 0); thread_affinity_clear(curthread); kpreempt_enable(); SUSPEND_DEBUG("finished xen_suspend_domain\n"); cmn_err(CE_NOTE, "domain restore/migrate completed"); }
void xen_hvm_init(void) { struct cpuid_regs cp; uint32_t xen_signature[4], base; char *xen_str; struct xen_add_to_physmap xatp; xen_capabilities_info_t caps; pfn_t pfn; uint64_t msrval, val; extern int apix_enable; if (xen_hvm_inited != 0) return; xen_hvm_inited = 1; /* * Xen's pseudo-cpuid function returns a string representing * the Xen signature in %ebx, %ecx, and %edx. * Loop over the base values, since it may be different if * the hypervisor has hyper-v emulation switched on. * * %eax contains the maximum supported cpuid function. */ for (base = 0x40000000; base < 0x40010000; base += 0x100) { cp.cp_eax = base; (void) __cpuid_insn(&cp); xen_signature[0] = cp.cp_ebx; xen_signature[1] = cp.cp_ecx; xen_signature[2] = cp.cp_edx; xen_signature[3] = 0; xen_str = (char *)xen_signature; if (strcmp("XenVMMXenVMM", xen_str) == 0 && cp.cp_eax >= (base + 2)) break; } if (base >= 0x40010000) return; /* * cpuid function at base + 1 returns the Xen version in %eax. The * top 16 bits are the major version, the bottom 16 are the minor * version. */ cp.cp_eax = base + 1; (void) __cpuid_insn(&cp); xen_major = cp.cp_eax >> 16; xen_minor = cp.cp_eax & 0xffff; /* * Below version 3.1 we can't do anything special as a HVM domain; * the PV drivers don't work, many hypercalls are not available, * etc. */ if (xen_major < 3 || (xen_major == 3 && xen_minor < 1)) return; /* * cpuid function at base + 2 returns information about the * hypercall page. %eax nominally contains the number of pages * with hypercall code, but according to the Xen guys, "I'll * guarantee that remains one forever more, so you can just * allocate a single page and get quite upset if you ever see CPUID * return more than one page." %ebx contains an MSR we use to ask * Xen to remap each page at a specific pfn. */ cp.cp_eax = base + 2; (void) __cpuid_insn(&cp); /* * Let Xen know where we want the hypercall page mapped. We * already have a page allocated in the .text section to simplify * the wrapper code. */ pfn = va_to_pfn(&hypercall_page); msrval = mmu_ptob(pfn); wrmsr(cp.cp_ebx, msrval); /* Fill in the xen_info data */ xen_info = &__xen_info; (void) sprintf(xen_info->magic, "xen-%d.%d", xen_major, xen_minor); if (hvm_get_param(HVM_PARAM_STORE_PFN, &val) < 0) return; /* * The first hypercall worked, so mark hypercalls as working. */ xen_hvm_features |= XEN_HVM_HYPERCALLS; xen_info->store_mfn = (mfn_t)val; if (hvm_get_param(HVM_PARAM_STORE_EVTCHN, &val) < 0) return; xen_info->store_evtchn = (mfn_t)val; /* Figure out whether the hypervisor is 32-bit or 64-bit. */ if ((HYPERVISOR_xen_version(XENVER_capabilities, &caps) == 0)) { ((char *)(caps))[sizeof (caps) - 1] = '\0'; if (strstr(caps, "x86_64") != NULL) xen_bits = 64; else if (strstr(caps, "x86_32") != NULL) xen_bits = 32; } if (xen_bits < 0) return; #ifdef __amd64 ASSERT(xen_bits == 64); #endif /* * Allocate space for the shared_info page and tell Xen where it * is. */ xen_shared_info_frame = va_to_pfn(&hypercall_shared_info_page); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; xatp.gpfn = xen_shared_info_frame; if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp) != 0) return; HYPERVISOR_shared_info = (void *)&hypercall_shared_info_page; /* * A working HVM tlb flush hypercall was introduced in Xen 3.3. */ if (xen_major > 3 || (xen_major == 3 && xen_minor >= 3)) xen_hvm_features |= XEN_HVM_TLBFLUSH; /* FIXME Disable apix for the time being */ apix_enable = 0; }
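/*
 * Illustrative sketch, not part of the original source: one plausible shape
 * for the hvm_get_param() helper used above, reading an HVM parameter such
 * as HVM_PARAM_STORE_PFN through the HVMOP_get_param hypercall. The real
 * helper in the original code may differ in name and signature.
 */
static int
hvm_get_param_sketch(int index, uint64_t *valp)
{
	struct xen_hvm_param p;

	p.domid = DOMID_SELF;
	p.index = index;
	if (HYPERVISOR_hvm_op(HVMOP_get_param, &p) != 0)
		return (-1);
	*valp = p.value;
	return (0);
}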
static unsigned long __init xen_do_chunk(unsigned long start, unsigned long end, bool release) { struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, .domid = DOMID_SELF }; unsigned long len = 0; unsigned long pfn; int ret; for (pfn = start; pfn < end; pfn++) { unsigned long frame; unsigned long mfn = pfn_to_mfn(pfn); if (release) { /* Make sure pfn exists to start with */ if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) continue; frame = mfn; } else { if (mfn != INVALID_P2M_ENTRY) continue; frame = pfn; } set_xen_guest_handle(reservation.extent_start, &frame); reservation.nr_extents = 1; ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap, &reservation); WARN(ret != 1, "Failed to %s pfn %lx err=%d\n", release ? "release" : "populate", pfn, ret); if (ret == 1) { if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) { if (release) break; set_xen_guest_handle(reservation.extent_start, &frame); reservation.nr_extents = 1; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); break; } len++; } else break; } if (len) printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n", release ? "Freeing" : "Populating", start, end, len, release ? "freed" : "added"); return len; } static unsigned long __init xen_release_chunk(unsigned long start, unsigned long end) { return xen_do_chunk(start, end, true); } static unsigned long __init xen_populate_chunk( const struct e820entry *list, size_t map_size, unsigned long max_pfn, unsigned long *last_pfn, unsigned long credits_left) { const struct e820entry *entry; unsigned int i; unsigned long done = 0; unsigned long dest_pfn; for (i = 0, entry = list; i < map_size; i++, entry++) { unsigned long credits = credits_left; unsigned long s_pfn; unsigned long e_pfn; unsigned long pfns; long capacity; if (credits <= 0) break; if (entry->type != E820_RAM) continue; e_pfn = PFN_UP(entry->addr + entry->size); /* We only care about E820 after the xen_start_info->nr_pages */ if (e_pfn <= max_pfn) continue; s_pfn = PFN_DOWN(entry->addr); /* If the E820 falls within the nr_pages, we want to start * at the nr_pages PFN. * If that would mean going past the E820 entry, skip it */ if (s_pfn <= max_pfn) { capacity = e_pfn - max_pfn; dest_pfn = max_pfn; } else { /* last_pfn MUST be within E820_RAM regions */ if (*last_pfn && e_pfn >= *last_pfn) s_pfn = *last_pfn; capacity = e_pfn - s_pfn; dest_pfn = s_pfn; } /* If we had filled this E820_RAM entry, go to the next one. */ if (capacity <= 0) continue; if (credits > capacity) credits = capacity; pfns = xen_do_chunk(dest_pfn, dest_pfn + credits, false); done += pfns; credits_left -= pfns; *last_pfn = (dest_pfn + pfns); } return done; } static void __init xen_set_identity_and_release_chunk( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, unsigned long *released, unsigned long *identity) { unsigned long pfn; /* * If the PFNs are currently mapped, the VA mapping also needs * to be updated to be 1:1. */ for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) (void)HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), mfn_pte(pfn, PAGE_KERNEL_IO), 0); if (start_pfn < nr_pages) *released += xen_release_chunk( start_pfn, min(end_pfn, nr_pages)); *identity += set_phys_range_identity(start_pfn, end_pfn); }
/** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ char * __init xen_memory_setup(void) { unsigned long max_pfn, pfn_s, n_pfns; phys_addr_t mem_end, addr, size, chunk_size; u32 type; int rc; struct xen_memory_map memmap; unsigned long max_pages; unsigned long extra_pages = 0; int i; int op; xen_parse_512gb(); max_pfn = xen_get_pages_limit(); max_pfn = min(max_pfn, xen_start_info->nr_pages); mem_end = PFN_PHYS(max_pfn); memmap.nr_entries = E820MAX; set_xen_guest_handle(memmap.buffer, xen_e820_map); op = xen_initial_domain() ? XENMEM_machine_memory_map : XENMEM_memory_map; rc = HYPERVISOR_memory_op(op, &memmap); if (rc == -ENOSYS) { BUG_ON(xen_initial_domain()); memmap.nr_entries = 1; xen_e820_map[0].addr = 0ULL; xen_e820_map[0].size = mem_end; /* 8MB slack (to balance backend allocations). */ xen_e820_map[0].size += 8ULL << 20; xen_e820_map[0].type = E820_RAM; rc = 0; } BUG_ON(rc); BUG_ON(memmap.nr_entries == 0); xen_e820_map_entries = memmap.nr_entries; /* * Xen won't allow a 1:1 mapping to be created to UNUSABLE * regions, so if we're using the machine memory map leave the * region as RAM as it is in the pseudo-physical map. * * UNUSABLE regions in domUs are not handled and will need * a patch in the future. */ if (xen_initial_domain()) xen_ignore_unusable(); /* Make sure the Xen-supplied memory map is well-ordered. */ sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map), &xen_e820_map_entries); max_pages = xen_get_max_pages(); /* How many extra pages do we need due to remapping? */ max_pages += xen_count_remap_pages(max_pfn); if (max_pages > max_pfn) extra_pages += max_pages - max_pfn; /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO * factor the base size. On non-highmem systems, the base * size is the full initial memory allocation; on highmem it * is limited to the max size of lowmem, so that it doesn't * get completely filled. * * Make sure we have no memory above max_pages, as this area * isn't handled by the p2m management. * * In principle there could be a problem in lowmem systems if * the initial memory is also very large with respect to * lowmem, but we won't try to deal with that here. */ extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), extra_pages, max_pages - max_pfn); i = 0; addr = xen_e820_map[0].addr; size = xen_e820_map[0].size; while (i < xen_e820_map_entries) { bool discard = false; chunk_size = size; type = xen_e820_map[i].type; if (type == E820_RAM) { if (addr < mem_end) { chunk_size = min(size, mem_end - addr); } else if (extra_pages) { chunk_size = min(size, PFN_PHYS(extra_pages)); pfn_s = PFN_UP(addr); n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s; extra_pages -= n_pfns; xen_add_extra_mem(pfn_s, n_pfns); xen_max_p2m_pfn = pfn_s + n_pfns; } else discard = true; } if (!discard) xen_align_and_add_e820_region(addr, chunk_size, type); addr += chunk_size; size -= chunk_size; if (size == 0) { i++; if (i < xen_e820_map_entries) { addr = xen_e820_map[i].addr; size = xen_e820_map[i].size; } } } /* * Set the rest as identity mapped, in case PCI BARs are * located here. */ set_phys_range_identity(addr / PAGE_SIZE, ~0ul); /* * In domU, the ISA region is normal, usable memory, but we * reserve ISA memory anyway because too many things poke * about in there. */ e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); /* * Check whether the kernel itself conflicts with the target E820 map. 
* Failing now is better than running into weird problems later due * to relocating (and even reusing) pages with kernel text or data. */ if (xen_is_e820_reserved(__pa_symbol(_text), __pa_symbol(__bss_stop) - __pa_symbol(_text))) { xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); BUG(); } /* * Check for a conflict of the hypervisor supplied page tables with * the target E820 map. */ xen_pt_check_e820(); xen_reserve_xen_mfnlist(); /* Check for a conflict of the initrd with the target E820 map. */ if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image, boot_params.hdr.ramdisk_size)) { phys_addr_t new_area, start, size; new_area = xen_find_free_area(boot_params.hdr.ramdisk_size); if (!new_area) { xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n"); BUG(); } start = boot_params.hdr.ramdisk_image; size = boot_params.hdr.ramdisk_size; xen_phys_memcpy(new_area, start, size); pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n", start, start + size, new_area, new_area + size); memblock_free(start, size); boot_params.hdr.ramdisk_image = new_area; boot_params.ext_ramdisk_image = new_area >> 32; }
static unsigned long __init xen_do_chunk(unsigned long start, unsigned long end, bool release) { struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, .domid = DOMID_SELF }; unsigned long len = 0; int xlated_phys = xen_feature(XENFEAT_auto_translated_physmap); unsigned long pfn; int ret; for (pfn = start; pfn < end; pfn++) { unsigned long frame; unsigned long mfn = pfn_to_mfn(pfn); if (release) { /* Make sure pfn exists to start with */ if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) continue; frame = mfn; } else { if (!xlated_phys && mfn != INVALID_P2M_ENTRY) continue; frame = pfn; } set_xen_guest_handle(reservation.extent_start, &frame); reservation.nr_extents = 1; ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap, &reservation); WARN(ret != 1, "Failed to %s pfn %lx err=%d\n", release ? "release" : "populate", pfn, ret); if (ret == 1) { if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) { if (release) break; set_xen_guest_handle(reservation.extent_start, &frame); reservation.nr_extents = 1; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); break; } len++; } else break; } if (len) printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n", release ? "Freeing" : "Populating", start, end, len, release ? "freed" : "added"); return len; } static unsigned long __init xen_release_chunk(unsigned long start, unsigned long end) { /* * Xen already ballooned out the E820 non RAM regions for us * and set them up properly in EPT. */ if (xen_feature(XENFEAT_auto_translated_physmap)) return end - start; return xen_do_chunk(start, end, true); } static unsigned long __init xen_populate_chunk( const struct e820entry *list, size_t map_size, unsigned long max_pfn, unsigned long *last_pfn, unsigned long credits_left) { const struct e820entry *entry; unsigned int i; unsigned long done = 0; unsigned long dest_pfn; for (i = 0, entry = list; i < map_size; i++, entry++) { unsigned long s_pfn; unsigned long e_pfn; unsigned long pfns; long capacity; if (credits_left <= 0) break; if (entry->type != E820_RAM) continue; e_pfn = PFN_DOWN(entry->addr + entry->size); /* We only care about E820 after the xen_start_info->nr_pages */ if (e_pfn <= max_pfn) continue; s_pfn = PFN_UP(entry->addr); /* If the E820 falls within the nr_pages, we want to start * at the nr_pages PFN. * If that would mean going past the E820 entry, skip it */ if (s_pfn <= max_pfn) { capacity = e_pfn - max_pfn; dest_pfn = max_pfn; } else { capacity = e_pfn - s_pfn; dest_pfn = s_pfn; } if (credits_left < capacity) capacity = credits_left; pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false); done += pfns; *last_pfn = (dest_pfn + pfns); if (pfns < capacity) break; credits_left -= pfns; } return done; } static void __init xen_set_identity_and_release_chunk( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, unsigned long *released, unsigned long *identity) { unsigned long pfn; /* * If the PFNs are currently mapped, clear the mappings * (except for the ISA region which must be 1:1 mapped) to * release the refcounts (in Xen) on the original frames. */ /* * PVH E820 matches the hypervisor's P2M which means we need to * account for the proper values of *release and *identity. 
*/ for (pfn = start_pfn; !xen_feature(XENFEAT_auto_translated_physmap) && pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { pte_t pte = __pte_ma(0); if (pfn < PFN_UP(ISA_END_ADDRESS)) pte = mfn_pte(pfn, PAGE_KERNEL_IO); (void)HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0); } if (start_pfn < nr_pages) *released += xen_release_chunk( start_pfn, min(end_pfn, nr_pages)); *identity += set_phys_range_identity(start_pfn, end_pfn); }
static int privcmd_HYPERVISOR_memory_op(int cmd, void *arg) { int error = 0; import_export_t op_ie, sub_ie, gpfn_ie, mfn_ie; union { domid_t domid; struct xen_memory_reservation resv; struct xen_machphys_mfn_list xmml; struct xen_add_to_physmap xatp; struct xen_memory_map mm; struct xen_foreign_memory_map fmm; } op_arg; op_ie = sub_ie = gpfn_ie = mfn_ie = null_ie; switch (cmd) { case XENMEM_increase_reservation: case XENMEM_decrease_reservation: case XENMEM_populate_physmap: { ulong_t *taddr; if (import_buffer(&op_ie, arg, &op_arg, sizeof (op_arg.resv), IE_IMPEXP) != 0) return (-X_EFAULT); error = import_handle(&sub_ie, &op_arg.resv.extent_start, (op_arg.resv.nr_extents * sizeof (ulong_t)), IE_IMPEXP); if (error == -X_EFAULT) /*LINTED: constant in conditional context*/ get_xen_guest_handle(taddr, op_arg.resv.extent_start); else taddr = sub_ie.ie_kaddr; switch (cmd) { case XENMEM_increase_reservation: DTRACE_XPV4(increase__reservation__start, domid_t, op_arg.resv.domid, ulong_t, op_arg.resv.nr_extents, uint_t, op_arg.resv.extent_order, ulong_t *, taddr); break; case XENMEM_decrease_reservation: DTRACE_XPV4(decrease__reservation__start, domid_t, op_arg.resv.domid, ulong_t, op_arg.resv.nr_extents, uint_t, op_arg.resv.extent_order, ulong_t *, taddr); break; case XENMEM_populate_physmap: DTRACE_XPV3(populate__physmap__start, domid_t, op_arg.resv.domid, ulong_t, op_arg.resv.nr_extents, ulong_t *, taddr); break; } break; } case XENMEM_maximum_ram_page: break; case XENMEM_current_reservation: case XENMEM_maximum_reservation: case XENMEM_maximum_gpfn: if (import_buffer(&op_ie, arg, &op_arg, sizeof (op_arg.domid), IE_IMPEXP) != 0) return (-X_EFAULT); break; case XENMEM_machphys_mfn_list: { if (import_buffer(&op_ie, arg, &op_arg, sizeof (op_arg.xmml), IE_IMPEXP) != 0) return (-X_EFAULT); error = import_handle(&sub_ie, &op_arg.xmml.extent_start, (op_arg.xmml.max_extents * sizeof (ulong_t)), IE_IMPEXP); break; } case XENMEM_add_to_physmap: if (import_buffer(&op_ie, arg, &op_arg, sizeof (op_arg.xatp), IE_IMPEXP) != 0) return (-X_EFAULT); DTRACE_XPV4(add__to__physmap__start, domid_t, op_arg.xatp.domid, uint_t, op_arg.xatp.space, ulong_t, op_arg.xatp.idx, ulong_t, op_arg.xatp.gpfn); break; case XENMEM_memory_map: case XENMEM_machine_memory_map: { if (import_buffer(&op_ie, arg, &op_arg, sizeof (op_arg.mm), IE_EXPORT) != 0) return (-X_EFAULT); /* * XXPV: ugh. e820entry is packed, but not in the kernel, since * we remove all attributes; seems like this is a nice way to * break mysteriously. */ error = import_handle(&sub_ie, &op_arg.mm.buffer, (op_arg.mm.nr_entries * 20), IE_IMPEXP); break; } case XENMEM_set_memory_map: { struct xen_memory_map *taddr; if (import_buffer(&op_ie, arg, &op_arg, sizeof (op_arg.fmm), IE_IMPORT) != 0) return (-X_EFAULT); /* * As above. */ error = import_handle(&sub_ie, &op_arg.fmm.map.buffer, (op_arg.fmm.map.nr_entries * 20), IE_IMPEXP); if (error == -X_EFAULT) /*LINTED: constant in conditional context*/ get_xen_guest_handle(taddr, op_arg.fmm.map.buffer); else taddr = sub_ie.ie_kaddr; DTRACE_XPV3(set__memory__map__start, domid_t, op_arg.fmm.domid, int, op_arg.fmm.map.nr_entries, struct xen_memory_map *, taddr); break; } default: #ifdef DEBUG printf("unrecognized HYPERVISOR_memory_op %d\n", cmd); #endif return (-X_EINVAL); } if (error == 0) error = HYPERVISOR_memory_op(cmd, (arg == NULL) ? 
NULL: &op_arg); export_buffer(&op_ie, &error); export_buffer(&sub_ie, &error); export_buffer(&gpfn_ie, &error); export_buffer(&mfn_ie, &error); switch (cmd) { case XENMEM_increase_reservation: DTRACE_XPV1(increase__reservation__end, int, error); break; case XENMEM_decrease_reservation: DTRACE_XPV1(decrease__reservation__end, int, error); break; case XENMEM_populate_physmap: DTRACE_XPV1(populate__physmap__end, int, error); break; case XENMEM_add_to_physmap: DTRACE_XPV1(add__to__physmap__end, int, error); break; case XENMEM_set_memory_map: DTRACE_XPV1(set__memory__map__end, int, error); break; } return (error); }
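The pattern in the privcmd proxy above is: copy in the fixed-size op argument, then copy in the variable-length extent array its guest handle points at, forward the call, and copy results back out. The sketch below models only that double-import step in user space; resv_arg, import_extents, and do_memory_op are hypothetical stand-ins for the import_buffer/import_handle helpers and the real hypercall.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified model of a reservation argument: a count plus a pointer that,
 * in the real interface, is a guest handle needing its own import. */
struct resv_arg {
    unsigned long nr_extents;
    unsigned long *extent_start;
};

/* Deep-copy the extent array into a "kernel" buffer before forwarding. */
static unsigned long *import_extents(const struct resv_arg *user_arg)
{
    unsigned long *kbuf = malloc(user_arg->nr_extents * sizeof(*kbuf));

    if (kbuf != NULL)
        memcpy(kbuf, user_arg->extent_start,
               user_arg->nr_extents * sizeof(*kbuf));
    return kbuf;
}

/* Stand-in for HYPERVISOR_memory_op: just report what was imported. */
static long do_memory_op(unsigned long nr_extents, const unsigned long *extents)
{
    printf("first extent %#lx of %lu\n", extents[0], nr_extents);
    return (long)nr_extents;
}

int main(void)
{
    unsigned long frames[4] = { 0x1000, 0x1001, 0x1002, 0x1003 };
    struct resv_arg arg = { 4, frames };
    unsigned long *kcopy = import_extents(&arg);
    long rc;

    if (kcopy == NULL)
        return 1;
    rc = do_memory_op(arg.nr_extents, kcopy);
    free(kcopy);
    return rc == 4 ? 0 : 1;
}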
/** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ char * __init xen_memory_setup(void) { static struct e820entry map[E820MAX] __initdata; unsigned long max_pfn = xen_start_info->nr_pages; unsigned long long mem_end; int rc; struct xen_memory_map memmap; unsigned long max_pages; unsigned long last_pfn = 0; unsigned long extra_pages = 0; unsigned long populated; int i; int op; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); mem_end = PFN_PHYS(max_pfn); memmap.nr_entries = E820MAX; set_xen_guest_handle(memmap.buffer, map); op = xen_initial_domain() ? XENMEM_machine_memory_map : XENMEM_memory_map; rc = HYPERVISOR_memory_op(op, &memmap); if (rc == -ENOSYS) { BUG_ON(xen_initial_domain()); memmap.nr_entries = 1; map[0].addr = 0ULL; map[0].size = mem_end; /* 8MB slack (to balance backend allocations). */ map[0].size += 8ULL << 20; map[0].type = E820_RAM; rc = 0; } BUG_ON(rc); /* * Xen won't allow a 1:1 mapping to be created to UNUSABLE * regions, so if we're using the machine memory map leave the * region as RAM as it is in the pseudo-physical map. * * UNUSABLE regions in domUs are not handled and will need * a patch in the future. */ if (xen_initial_domain()) xen_ignore_unusable(map, memmap.nr_entries); /* Make sure the Xen-supplied memory map is well-ordered. */ sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); max_pages = xen_get_max_pages(); if (max_pages > max_pfn) extra_pages += max_pages - max_pfn; /* * Set P2M for all non-RAM pages and E820 gaps to be identity * type PFNs. Any RAM pages that would be made inaccesible by * this are first released. */ xen_released_pages = xen_set_identity_and_release( map, memmap.nr_entries, max_pfn); /* * Populate back the non-RAM pages and E820 gaps that had been * released. */ populated = xen_populate_chunk(map, memmap.nr_entries, max_pfn, &last_pfn, xen_released_pages); xen_released_pages -= populated; extra_pages += xen_released_pages; if (last_pfn > max_pfn) { max_pfn = min(MAX_DOMAIN_PAGES, last_pfn); mem_end = PFN_PHYS(max_pfn); } /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO * factor the base size. On non-highmem systems, the base * size is the full initial memory allocation; on highmem it * is limited to the max size of lowmem, so that it doesn't * get completely filled. * * In principle there could be a problem in lowmem systems if * the initial memory is also very large with respect to * lowmem, but we won't try to deal with that here. */ extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), extra_pages); i = 0; while (i < memmap.nr_entries) { u64 addr = map[i].addr; u64 size = map[i].size; u32 type = map[i].type; if (type == E820_RAM) { if (addr < mem_end) { size = min(size, mem_end - addr); } else if (extra_pages) { size = min(size, (u64)extra_pages * PAGE_SIZE); extra_pages -= size / PAGE_SIZE; xen_add_extra_mem(addr, size); } else type = E820_UNUSABLE; } xen_align_and_add_e820_region(addr, size, type); map[i].addr += size; map[i].size -= size; if (map[i].size == 0) i++; } /* * In domU, the ISA region is normal, usable memory, but we * reserve ISA memory anyway because too many things poke * about in there. */ e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); /* * Reserve Xen bits: * - mfn_list * - xen_start_info * See comment above "struct start_info" in <xen/interface/xen.h> * We tried to make the the memblock_reserve more selective so * that it would be clear what region is reserved. 
Sadly we ran * into the problem wherein on a 64-bit hypervisor with a 32-bit * initial domain, the pt_base has the cr3 value which is not * necessarily where the pagetable starts! As Jan put it: " * Actually, the adjustment turns out to be correct: The page * tables for a 32-on-64 dom0 get allocated in the order "first L1", * "first L2", "first L3", so the offset to the page table base is * indeed 2. When reading xen/include/public/xen.h's comment * very strictly, this is not a violation (since there nothing is said * that the first thing in the page table space is pointed to by * pt_base; I admit that this seems to be implied though, namely * do I think that it is implied that the page table space is the * range [pt_base, pt_base + nr_pt_frames), whereas that * range here indeed is [pt_base - 2, pt_base - 2 + nr_pt_frames), * which - without a priori knowledge - the kernel would have * difficulty to figure out)." - so let's just fall back to the * easy way and reserve the whole region. */ memblock_reserve(__pa(xen_start_info->mfn_list), xen_start_info->pt_base - xen_start_info->mfn_list); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); return "Xen"; }
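The while-loop inside xen_memory_setup() is the part that is easy to lose in the surrounding setup: RAM below mem_end is kept, RAM above it is fed to the extra-memory pool until extra_pages runs out, and whatever remains is downgraded to unusable. Here is a minimal user-space sketch of just that loop; the region struct, the MY_E820_* codes, and the example sizes are stand-ins, and nothing is actually added to an E820.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL
#define MY_E820_RAM 1            /* local stand-ins for the E820 type codes */
#define MY_E820_UNUSABLE 5

struct region { uint64_t addr, size; uint32_t type; };

/* Mirrors the clipping loop above: each entry is consumed piecewise until
 * its size reaches zero, possibly changing type along the way. */
static void clip_regions(struct region *map, int nr, uint64_t mem_end,
                         uint64_t extra_pages)
{
    int i = 0;

    while (i < nr) {
        uint64_t addr = map[i].addr, size = map[i].size;
        uint32_t type = map[i].type;

        if (type == MY_E820_RAM) {
            if (addr < mem_end) {
                if (size > mem_end - addr)
                    size = mem_end - addr;
            } else if (extra_pages) {
                if (size > extra_pages * PAGE_SIZE)
                    size = extra_pages * PAGE_SIZE;
                extra_pages -= size / PAGE_SIZE;
                printf("extra mem: %#llx +%#llx\n",
                       (unsigned long long)addr, (unsigned long long)size);
            } else {
                type = MY_E820_UNUSABLE;
            }
        }
        printf("add region %#llx +%#llx type %u\n",
               (unsigned long long)addr, (unsigned long long)size, type);

        map[i].addr += size;
        map[i].size -= size;
        if (map[i].size == 0)
            i++;
    }
}

int main(void)
{
    struct region map[] = {
        { 0, 0x20000000ULL, MY_E820_RAM },      /* 512MB of host RAM */
    };

    /* domain limited to 256MB, with a 16-page extra-memory allowance */
    clip_regions(map, 1, 0x10000000ULL, 16);
    return 0;
}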
/** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ char * __init xen_memory_setup(void) { static struct e820entry map[E820MAX] __initdata; static struct e820entry map_raw[E820MAX] __initdata; unsigned long max_pfn = xen_start_info->nr_pages; unsigned long long mem_end; int rc; struct xen_memory_map memmap; unsigned long extra_pages = 0; unsigned long extra_limit; unsigned long identity_pages = 0; int i; int op; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); mem_end = PFN_PHYS(max_pfn); memmap.nr_entries = E820MAX; set_xen_guest_handle(memmap.buffer, map); op = xen_initial_domain() ? XENMEM_machine_memory_map : XENMEM_memory_map; rc = HYPERVISOR_memory_op(op, &memmap); if (rc == -ENOSYS) { BUG_ON(xen_initial_domain()); memmap.nr_entries = 1; map[0].addr = 0ULL; map[0].size = mem_end; /* 8MB slack (to balance backend allocations). */ map[0].size += 8ULL << 20; map[0].type = E820_RAM; rc = 0; } BUG_ON(rc); memcpy(map_raw, map, sizeof(map)); e820.nr_map = 0; xen_extra_mem_start = mem_end; for (i = 0; i < memmap.nr_entries; i++) { unsigned long long end; /* Guard against non-page aligned E820 entries. */ if (map[i].type == E820_RAM) map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE; end = map[i].addr + map[i].size; if (map[i].type == E820_RAM && end > mem_end) { /* RAM off the end - may be partially included */ u64 delta = min(map[i].size, end - mem_end); map[i].size -= delta; end -= delta; extra_pages += PFN_DOWN(delta); /* * Set RAM below 4GB that is not for us to be unusable. * This prevents "System RAM" address space from being * used as potential resource for I/O address (happens * when 'allocate_resource' is called). */ if (delta && (xen_initial_domain() && end < 0x100000000ULL)) e820_add_region(end, delta, E820_UNUSABLE); } if (map[i].size > 0 && end > xen_extra_mem_start) xen_extra_mem_start = end; /* Add region if any remains */ if (map[i].size > 0) e820_add_region(map[i].addr, map[i].size, map[i].type); } /* Align the balloon area so that max_low_pfn does not get set * to be at the _end_ of the PCI gap at the far end (fee01000). * Note that xen_extra_mem_start gets set in the loop above to be * past the last E820 region. */ if (xen_initial_domain() && (xen_extra_mem_start < (1ULL<<32))) xen_extra_mem_start = (1ULL<<32); /* * In domU, the ISA region is normal, usable memory, but we * reserve ISA memory anyway because too many things poke * about in there. * * In Dom0, the host E820 information can leave gaps in the * ISA range, which would cause us to release those pages. To * avoid this, we unconditionally reserve them here. */ e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); /* * Reserve Xen bits: * - mfn_list * - xen_start_info * See comment above "struct start_info" in <xen/interface/xen.h> */ memblock_x86_reserve_range(__pa(xen_start_info->mfn_list), __pa(xen_start_info->pt_base), "XEN START INFO"); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); extra_limit = xen_get_max_pages(); if (max_pfn + extra_pages > extra_limit) { if (extra_limit > max_pfn) extra_pages = extra_limit - max_pfn; else extra_pages = 0; } extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO * factor the base size. On non-highmem systems, the base * size is the full initial memory allocation; on highmem it * is limited to the max size of lowmem, so that it doesn't * get completely filled. 
* * In principle there could be a problem in lowmem systems if * the initial memory is also very large with respect to * lowmem, but we won't try to deal with that here. */ extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), max_pfn + extra_pages); if (extra_limit >= max_pfn) extra_pages = extra_limit - max_pfn; else extra_pages = 0; xen_add_extra_mem(extra_pages); /* * Set P2M for all non-RAM pages and E820 gaps to be identity * type PFNs. We supply it with the non-sanitized version * of the E820. */ identity_pages = xen_set_identity(map_raw, memmap.nr_entries); printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages); return "Xen"; }
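This variant clamps the extra (ballooned-out) area with extra_limit before calling xen_add_extra_mem(). The arithmetic is compact enough to show on its own; the sketch below assumes EXTRA_MEM_RATIO is 10, as in the kernel code these snippets come from, and lowmem_max_pfn stands in for PFN_DOWN(MAXMEM).

#include <stdio.h>

#define EXTRA_MEM_RATIO 10UL     /* assumed value, matching the kernel code */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

/* The clamp above: the extra area stays a bounded multiple of the memory the
 * domain actually starts with, and never exceeds what was requested. */
static unsigned long clamp_extra(unsigned long max_pfn,
                                 unsigned long lowmem_max_pfn,
                                 unsigned long extra_pages)
{
    unsigned long limit =
        min_ul(EXTRA_MEM_RATIO * min_ul(max_pfn, lowmem_max_pfn),
               max_pfn + extra_pages);

    return limit >= max_pfn ? limit - max_pfn : 0;
}

int main(void)
{
    /* a 0x8000-page (128MB) guest asking for far more extra space than
     * the ratio allows gets clamped to 10x its base allocation */
    printf("extra = %lu pages\n", clamp_extra(0x8000, 0x8000, 0x200000));
    return 0;
}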
static int increase_reservation(unsigned long nr_pages) { unsigned long pfn, i, flags; struct page *page; long rc; struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, .domid = DOMID_SELF }; if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); balloon_lock(flags); page = balloon_first_page(); for (i = 0; i < nr_pages; i++) { BUG_ON(page == NULL); frame_list[i] = page_to_pfn(page);; page = balloon_next_page(page); } set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; rc = HYPERVISOR_memory_op( XENMEM_populate_physmap, &reservation); if (rc < nr_pages) { int ret; /* We hit the Xen hard limit: reprobe. */ set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = rc; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); BUG_ON(ret != rc); hard_limit = current_pages + rc - driver_pages; goto out; } for (i = 0; i < nr_pages; i++) { page = balloon_retrieve(); BUG_ON(page == NULL); pfn = page_to_pfn(page); BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && phys_to_machine_mapping_valid(pfn)); /* Update P->M and M->P tables. */ set_phys_to_machine(pfn, frame_list[i]); #ifdef CONFIG_XEN xen_machphys_update(frame_list[i], pfn); /* Link back into the page tables if not highmem. */ if (pfn < max_low_pfn) { int ret; ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), pfn_pte_ma(frame_list[i], PAGE_KERNEL), 0); BUG_ON(ret); } #endif /* Relinquish the page back to the allocator. */ ClearPageReserved(page); set_page_count(page, 1); __free_page(page); } current_pages += nr_pages; totalram_pages = current_pages; out: balloon_unlock(flags); return 0; } static int decrease_reservation(unsigned long nr_pages) { unsigned long pfn, i, flags; struct page *page; void *v; int need_sleep = 0; int ret; struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, .domid = DOMID_SELF }; if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); for (i = 0; i < nr_pages; i++) { if ((page = alloc_page(GFP_BALLOON)) == NULL) { nr_pages = i; need_sleep = 1; break; } pfn = page_to_pfn(page); frame_list[i] = pfn_to_mfn(pfn); if (!PageHighMem(page)) { v = phys_to_virt(pfn << PAGE_SHIFT); scrub_pages(v, 1); #ifdef CONFIG_XEN ret = HYPERVISOR_update_va_mapping( (unsigned long)v, __pte_ma(0), 0); BUG_ON(ret); #endif } #ifdef CONFIG_XEN_SCRUB_PAGES else { v = kmap(page); scrub_pages(v, 1); kunmap(page); } #endif } #ifdef CONFIG_XEN /* Ensure that ballooned highmem pages don't have kmaps. */ kmap_flush_unused(); flush_tlb_all(); #endif balloon_lock(flags); /* No more mappings: invalidate P2M and add to balloon. */ for (i = 0; i < nr_pages; i++) { pfn = mfn_to_pfn(frame_list[i]); set_phys_to_machine(pfn, INVALID_P2M_ENTRY); balloon_append(pfn_to_page(pfn)); } set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); BUG_ON(ret != nr_pages); current_pages -= nr_pages; totalram_pages = current_pages; balloon_unlock(flags); return need_sleep; } /* * We avoid multiple worker processes conflicting via the balloon mutex. * We may of course race updates of the target counts (which are protected * by the balloon lock), or with changes to the Xen hard limit, but we will * recover from these in time. 
*/ static void balloon_process(void *unused) { int need_sleep = 0; long credit; down(&balloon_mutex); do { credit = current_target() - current_pages; if (credit > 0) need_sleep = (increase_reservation(credit) != 0); if (credit < 0) need_sleep = (decrease_reservation(-credit) != 0); #ifndef CONFIG_PREEMPT if (need_resched()) schedule(); #endif } while ((credit != 0) && !need_sleep); /* Schedule more work if there is some still to be done. */ if (current_target() != current_pages) mod_timer(&balloon_timer, jiffies + HZ); up(&balloon_mutex); }
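The worker above converges on the target in bounded steps: each pass computes the signed credit between target and current pages and grows or shrinks the reservation by at most one frame_list worth of pages. The sketch below is a user-space model of that loop with stub grow/shrink functions standing in for increase_reservation()/decrease_reservation(); the CHUNK size and page counts are made up.

#include <stdio.h>

#define CHUNK 64L                /* stands in for ARRAY_SIZE(frame_list) */

static long current_pages = 1024;
static long target_pages  = 1300;

/* Models increase_reservation(): move at most CHUNK pages, return nonzero
 * only if the caller should back off and retry later. */
static int grow(long credit)
{
    current_pages += credit > CHUNK ? CHUNK : credit;
    return 0;
}

/* Models decrease_reservation(). */
static int shrink(long credit)
{
    current_pages -= credit > CHUNK ? CHUNK : credit;
    return 0;
}

int main(void)
{
    int need_sleep = 0;
    long credit;

    do {
        credit = target_pages - current_pages;
        if (credit > 0)
            need_sleep = (grow(credit) != 0);
        if (credit < 0)
            need_sleep = (shrink(-credit) != 0);
        printf("current=%ld (credit was %ld)\n", current_pages, credit);
    } while (credit != 0 && !need_sleep);

    return 0;
}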
static int xen_map_device_mmio(const struct resource *resources, unsigned int count) { unsigned int i, j, nr; int rc = 0; const struct resource *r; xen_pfn_t *gpfns; xen_ulong_t *idxs; int *errs; for (i = 0; i < count; i++) { struct xen_add_to_physmap_range xatp = { .domid = DOMID_SELF, .space = XENMAPSPACE_dev_mmio }; r = &resources[i]; nr = DIV_ROUND_UP(resource_size(r), XEN_PAGE_SIZE); if ((resource_type(r) != IORESOURCE_MEM) || (nr == 0)) continue; gpfns = kzalloc(sizeof(xen_pfn_t) * nr, GFP_KERNEL); idxs = kzalloc(sizeof(xen_ulong_t) * nr, GFP_KERNEL); errs = kzalloc(sizeof(int) * nr, GFP_KERNEL); if (!gpfns || !idxs || !errs) { kfree(gpfns); kfree(idxs); kfree(errs); rc = -ENOMEM; goto unmap; } for (j = 0; j < nr; j++) { /* * The regions are always mapped 1:1 to DOM0 and this is * fine because the memory map for DOM0 is the same as * the host (except for the RAM). */ gpfns[j] = XEN_PFN_DOWN(r->start) + j; idxs[j] = XEN_PFN_DOWN(r->start) + j; } xatp.size = nr; set_xen_guest_handle(xatp.gpfns, gpfns); set_xen_guest_handle(xatp.idxs, idxs); set_xen_guest_handle(xatp.errs, errs); rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); kfree(gpfns); kfree(idxs); kfree(errs); if (rc) goto unmap; } return rc; unmap: xen_unmap_device_mmio(resources, i); return rc; }
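Per resource, the code above rounds the MMIO window up to whole Xen pages and fills the gpfn and idx arrays with the same frame numbers, which is what makes the mapping 1:1 for dom0. A minimal sketch of that array construction follows; map_mmio_range is a hypothetical stand-in for the XENMEM_add_to_physmap_range call, and the window address in main() is an example.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define XEN_PAGE_SHIFT 12
#define XEN_PAGE_SIZE  (1UL << XEN_PAGE_SHIFT)

/* Split an MMIO window into Xen-page frames and build the 1:1 gpfn/idx
 * arrays; no hypercall is issued here. */
static int map_mmio_range(uint64_t start, uint64_t size)
{
    unsigned int nr = (unsigned int)((size + XEN_PAGE_SIZE - 1) / XEN_PAGE_SIZE);
    uint64_t *gpfns, *idxs;
    unsigned int j;

    if (nr == 0)
        return 0;

    gpfns = calloc(nr, sizeof(*gpfns));
    idxs  = calloc(nr, sizeof(*idxs));
    if (gpfns == NULL || idxs == NULL) {
        free(gpfns);
        free(idxs);
        return -1;
    }

    for (j = 0; j < nr; j++) {
        gpfns[j] = (start >> XEN_PAGE_SHIFT) + j;   /* where it appears */
        idxs[j]  = (start >> XEN_PAGE_SHIFT) + j;   /* what gets mapped */
    }

    printf("would map %u frame(s) starting at gpfn %#llx\n",
           nr, (unsigned long long)gpfns[0]);

    free(gpfns);
    free(idxs);
    return 0;
}

int main(void)
{
    /* e.g. a 16KB MMIO window at 0x1c090000 */
    return map_mmio_range(0x1c090000ULL, 0x4000ULL) ? 1 : 0;
}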
static int __init xen_guest_init(void) { struct xen_add_to_physmap xatp; static struct shared_info *shared_info_page = 0; struct device_node *node; int len; const char *s = NULL; const char *version = NULL; const char *xen_prefix = "xen,xen-"; struct resource res; node = of_find_compatible_node(NULL, NULL, "xen,xen"); if (!node) { pr_debug("No Xen support\n"); return 0; } s = of_get_property(node, "compatible", &len); if (strlen(xen_prefix) + 3 < len && !strncmp(xen_prefix, s, strlen(xen_prefix))) version = s + strlen(xen_prefix); if (version == NULL) { pr_debug("Xen version not found\n"); return 0; } if (of_address_to_resource(node, GRANT_TABLE_PHYSADDR, &res)) return 0; xen_hvm_resume_frames = res.start >> PAGE_SHIFT; xen_events_irq = irq_of_parse_and_map(node, 0); pr_info("Xen %s support found, events_irq=%d gnttab_frame_pfn=%lx\n", version, xen_events_irq, xen_hvm_resume_frames); xen_domain_type = XEN_HVM_DOMAIN; xen_setup_features(); if (xen_feature(XENFEAT_dom0)) xen_start_info->flags |= SIF_INITDOMAIN|SIF_PRIVILEGED; else xen_start_info->flags &= ~(SIF_INITDOMAIN|SIF_PRIVILEGED); if (!shared_info_page) shared_info_page = (struct shared_info *) get_zeroed_page(GFP_KERNEL); if (!shared_info_page) { pr_err("not enough memory\n"); return -ENOMEM; } xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info * page, we use it in the event channel upcall and in some pvclock * related functions. * The shared info contains exactly 1 CPU (the boot CPU). The guest * is required to use VCPUOP_register_vcpu_info to place vcpu info * for secondary CPUs as they are brought up. * For uniformity we use VCPUOP_register_vcpu_info even on cpu0. */ xen_vcpu_info = __alloc_percpu(sizeof(struct vcpu_info), sizeof(struct vcpu_info)); if (xen_vcpu_info == NULL) return -ENOMEM; gnttab_init(); if (!xen_initial_domain()) xenbus_probe(NULL); return 0; }
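The shared_info mapping above boils down to one xen_add_to_physmap request: the guest picks a frame of its own memory and asks Xen to place the shared_info page there. The sketch below uses a simplified local copy of that struct and a mocked hypercall; the page address in main() is hypothetical, while the DOMID_SELF and XENMAPSPACE_shared_info values shown are the usual ABI constants.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define MY_DOMID_SELF 0x7FF0
#define MY_XENMAPSPACE_shared_info 0

/* Simplified stand-in for struct xen_add_to_physmap. */
struct my_add_to_physmap {
    uint16_t domid;
    unsigned int space;
    uint64_t idx;
    uint64_t gpfn;
};

/* Stand-in for HYPERVISOR_memory_op(XENMEM_add_to_physmap, ...). */
static int mock_add_to_physmap(const struct my_add_to_physmap *xatp)
{
    printf("map space %u idx %llu at gpfn %#llx for domid %#x\n",
           xatp->space, (unsigned long long)xatp->idx,
           (unsigned long long)xatp->gpfn, (unsigned)xatp->domid);
    return 0;
}

int main(void)
{
    uint64_t shared_page_phys = 0x40001000ULL;   /* hypothetical page */
    struct my_add_to_physmap xatp = {
        .domid = MY_DOMID_SELF,
        .space = MY_XENMAPSPACE_shared_info,
        .idx   = 0,
        .gpfn  = shared_page_phys >> PAGE_SHIFT,
    };

    return mock_add_to_physmap(&xatp) ? 1 : 0;
}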
/* * balloon_free_pages() * free page_cnt pages, using any combination of mfns, pfns, and kva as long * as they refer to the same mapping. If an array of mfns is passed in, we * assume they were already cleared. Otherwise, we need to zero the pages * before giving them back to the hypervisor. kva space is not free'd up in * case the caller wants to re-use it. */ long balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns) { xen_memory_reservation_t memdec; mfn_t mfn; pfn_t pfn; uint_t i; long e; #if DEBUG /* make sure kva is page aligned and maps to first pfn */ if (kva != NULL) { ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0); if (pfns != NULL) { ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]); } } #endif /* if we have a kva, we can clean all pages with just one bzero */ if ((kva != NULL) && balloon_zero_memory) { bzero(kva, (page_cnt * PAGESIZE)); } /* if we were given a kva and/or a pfn */ if ((kva != NULL) || (pfns != NULL)) { /* * All the current callers only pass 1 page when using kva or * pfns, and use mfns when passing multiple pages. If that * assumption is changed, the following code will need some * work. The following ASSERT() guarantees we're respecting * the io locking quota. */ ASSERT(page_cnt < bln_contig_list_quota); /* go through all the pages */ for (i = 0; i < page_cnt; i++) { /* get the next pfn */ if (pfns == NULL) { pfn = hat_getpfnum(kas.a_hat, (kva + (PAGESIZE * i))); } else { pfn = pfns[i]; } /* * if we didn't already zero this page, do it now. we * need to do this *before* we give back the MFN */ if ((kva == NULL) && (balloon_zero_memory)) { pfnzero(pfn, 0, PAGESIZE); } /* * unmap the pfn. We don't free up the kva vmem space * so the caller can re-use it. The page must be * unmapped before it is given back to the hypervisor. */ if (kva != NULL) { hat_unload(kas.a_hat, (kva + (PAGESIZE * i)), PAGESIZE, HAT_UNLOAD_UNMAP); } /* grab the mfn before the pfn is marked as invalid */ mfn = pfn_to_mfn(pfn); /* mark the pfn as invalid */ reassign_pfn(pfn, MFN_INVALID); /* * if we weren't given an array of MFNs, we need to * free them up one at a time. Otherwise, we'll wait * until later and do it in one hypercall */ if (mfns == NULL) { bzero(&memdec, sizeof (memdec)); /*LINTED: constant in conditional context*/ set_xen_guest_handle(memdec.extent_start, &mfn); memdec.domid = DOMID_SELF; memdec.nr_extents = 1; e = HYPERVISOR_memory_op( XENMEM_decrease_reservation, &memdec); if (e != 1) { cmn_err(CE_PANIC, "balloon: unable to " "give a page back to the " "hypervisor.\n"); } } } } /* * if we were passed in MFNs, we haven't free'd them up yet. We can * do it with one call. */ if (mfns != NULL) { bzero(&memdec, sizeof (memdec)); /*LINTED: constant in conditional context*/ set_xen_guest_handle(memdec.extent_start, mfns); memdec.domid = DOMID_SELF; memdec.nr_extents = page_cnt; e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec); if (e != page_cnt) { cmn_err(CE_PANIC, "balloon: unable to give pages back " "to the hypervisor.\n"); } } atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt); return (page_cnt); }
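When an MFN array is supplied, the function above defers the give-back and issues a single batched XENMEM_decrease_reservation covering every page, then insists the return value equals the extent count. The sketch below models that final step; my_reservation and mock_decrease_reservation are stand-ins for xen_memory_reservation_t and the real hypercall, and the MFN values are examples.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Simplified stand-in for the reservation argument. */
struct my_reservation {
    uint64_t *extent_start;
    unsigned long nr_extents;
    unsigned int extent_order;
    uint16_t domid;
};

/* Stand-in for the hypercall: it returns how many extents were released. */
static long mock_decrease_reservation(const struct my_reservation *r)
{
    return (long)r->nr_extents;
}

int main(void)
{
    uint64_t mfns[8] = { 0x2000, 0x2001, 0x2002, 0x2003,
                         0x9100, 0x9101, 0x9102, 0x9103 };
    struct my_reservation memdec;
    long done;

    memset(&memdec, 0, sizeof(memdec));
    memdec.extent_start = mfns;
    memdec.nr_extents = 8;
    memdec.domid = 0x7FF0;       /* DOMID_SELF */

    done = mock_decrease_reservation(&memdec);
    if (done != (long)memdec.nr_extents) {
        fprintf(stderr, "only %ld of %lu pages given back\n",
                done, memdec.nr_extents);
        return 1;
    }
    printf("released %ld page(s)\n", done);
    return 0;
}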
/** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ char * __init xen_memory_setup(void) { static struct e820entry map[E820MAX] __initdata; unsigned long max_pfn = xen_start_info->nr_pages; unsigned long long mem_end; int rc; struct xen_memory_map memmap; unsigned long max_pages; unsigned long extra_pages = 0; int i; int op; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); mem_end = PFN_PHYS(max_pfn); memmap.nr_entries = E820MAX; set_xen_guest_handle(memmap.buffer, map); op = xen_initial_domain() ? XENMEM_machine_memory_map : XENMEM_memory_map; rc = HYPERVISOR_memory_op(op, &memmap); if (rc == -ENOSYS) { BUG_ON(xen_initial_domain()); memmap.nr_entries = 1; map[0].addr = 0ULL; map[0].size = mem_end; /* 8MB slack (to balance backend allocations). */ map[0].size += 8ULL << 20; map[0].type = E820_RAM; rc = 0; } BUG_ON(rc); /* Make sure the Xen-supplied memory map is well-ordered. */ sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); max_pages = xen_get_max_pages(); if (max_pages > max_pfn) extra_pages += max_pages - max_pfn; /* * Set P2M for all non-RAM pages and E820 gaps to be identity * type PFNs. Any RAM pages that would be made inaccesible by * this are first released. */ xen_released_pages = xen_set_identity_and_release( map, memmap.nr_entries, max_pfn); extra_pages += xen_released_pages; /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO * factor the base size. On non-highmem systems, the base * size is the full initial memory allocation; on highmem it * is limited to the max size of lowmem, so that it doesn't * get completely filled. * * In principle there could be a problem in lowmem systems if * the initial memory is also very large with respect to * lowmem, but we won't try to deal with that here. */ extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), extra_pages); i = 0; while (i < memmap.nr_entries) { u64 addr = map[i].addr; u64 size = map[i].size; u32 type = map[i].type; if (type == E820_RAM) { if (addr < mem_end) { size = min(size, mem_end - addr); } else if (extra_pages) { size = min(size, (u64)extra_pages * PAGE_SIZE); extra_pages -= size / PAGE_SIZE; xen_add_extra_mem(addr, size); } else type = E820_UNUSABLE; } xen_align_and_add_e820_region(addr, size, type); map[i].addr += size; map[i].size -= size; if (map[i].size == 0) i++; } /* * In domU, the ISA region is normal, usable memory, but we * reserve ISA memory anyway because too many things poke * about in there. */ e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); /* * Reserve Xen bits: * - mfn_list * - xen_start_info * See comment above "struct start_info" in <xen/interface/xen.h> */ memblock_reserve(__pa(xen_start_info->mfn_list), xen_start_info->pt_base - xen_start_info->mfn_list); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); return "Xen"; }
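Each xen_memory_setup() variant above begins by asking the hypervisor for a memory map and, when the hypercall is unavailable (-ENOSYS on a domU under an older hypervisor), synthesizing a single RAM entry covering the initial allocation plus the same 8MB slack. The sketch below isolates that fallback with a mocked hypercall; the region struct, MY_* constants, and domain size are stand-ins.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define MY_E820_RAM 1
#define MY_ENOSYS 38             /* the usual Linux ENOSYS value */

struct region { uint64_t addr, size; uint32_t type; };

/* Stand-in for the XENMEM_memory_map hypercall: pretend it is unsupported. */
static int mock_memory_map(struct region *map, unsigned int *nr)
{
    (void)map;
    (void)nr;
    return -MY_ENOSYS;
}

int main(void)
{
    struct region map[8];
    unsigned int nr = 8;
    uint64_t nr_pages = 0x20000;                 /* hypothetical 512MB domU */
    int rc = mock_memory_map(map, &nr);

    if (rc == -MY_ENOSYS) {
        /* Synthesize one RAM entry covering the initial allocation, plus
         * the 8MB slack the kernel code adds to balance backend allocations. */
        nr = 1;
        map[0].addr = 0;
        map[0].size = (nr_pages << PAGE_SHIFT) + (8ULL << 20);
        map[0].type = MY_E820_RAM;
        rc = 0;
    }

    printf("entries=%u size=%#llx rc=%d\n",
           nr, (unsigned long long)map[0].size, rc);
    return rc;
}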
static int privcmd_ioctl(struct cdev *dev, unsigned long cmd, caddr_t arg, int mode, struct thread *td) { int error, i; switch (cmd) { case IOCTL_PRIVCMD_HYPERCALL: { struct ioctl_privcmd_hypercall *hcall; hcall = (struct ioctl_privcmd_hypercall *)arg; #ifdef __amd64__ /* * The hypervisor page table walker will refuse to access * user-space pages if SMAP is enabled, so temporary disable it * while performing the hypercall. */ if (cpu_stdext_feature & CPUID_STDEXT_SMAP) stac(); #endif error = privcmd_hypercall(hcall->op, hcall->arg[0], hcall->arg[1], hcall->arg[2], hcall->arg[3], hcall->arg[4]); #ifdef __amd64__ if (cpu_stdext_feature & CPUID_STDEXT_SMAP) clac(); #endif if (error >= 0) { hcall->retval = error; error = 0; } else { error = xen_translate_error(error); hcall->retval = 0; } break; } case IOCTL_PRIVCMD_MMAPBATCH: { struct ioctl_privcmd_mmapbatch *mmap; vm_map_t map; vm_map_entry_t entry; vm_object_t mem; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; struct xen_add_to_physmap_range add; xen_ulong_t *idxs; xen_pfn_t *gpfns; int *errs, index; struct privcmd_map *umap; uint16_t num; mmap = (struct ioctl_privcmd_mmapbatch *)arg; if ((mmap->num == 0) || ((mmap->addr & PAGE_MASK) != 0)) { error = EINVAL; break; } map = &td->td_proc->p_vmspace->vm_map; error = vm_map_lookup(&map, mmap->addr, VM_PROT_NONE, &entry, &mem, &pindex, &prot, &wired); if (error != KERN_SUCCESS) { error = EINVAL; break; } if ((entry->start != mmap->addr) || (entry->end != mmap->addr + (mmap->num * PAGE_SIZE))) { vm_map_lookup_done(map, entry); error = EINVAL; break; } vm_map_lookup_done(map, entry); if ((mem->type != OBJT_MGTDEVICE) || (mem->un_pager.devp.ops != &privcmd_pg_ops)) { error = EINVAL; break; } umap = mem->handle; add.domid = DOMID_SELF; add.space = XENMAPSPACE_gmfn_foreign; add.foreign_domid = mmap->dom; /* * The 'size' field in the xen_add_to_physmap_range only * allows for UINT16_MAX mappings in a single hypercall. */ num = MIN(mmap->num, UINT16_MAX); idxs = malloc(sizeof(*idxs) * num, M_PRIVCMD, M_WAITOK); gpfns = malloc(sizeof(*gpfns) * num, M_PRIVCMD, M_WAITOK); errs = malloc(sizeof(*errs) * num, M_PRIVCMD, M_WAITOK); set_xen_guest_handle(add.idxs, idxs); set_xen_guest_handle(add.gpfns, gpfns); set_xen_guest_handle(add.errs, errs); /* Allocate a bitset to store broken page mappings. */ umap->err = BITSET_ALLOC(mmap->num, M_PRIVCMD, M_WAITOK | M_ZERO); for (index = 0; index < mmap->num; index += num) { num = MIN(mmap->num - index, UINT16_MAX); add.size = num; error = copyin(&mmap->arr[index], idxs, sizeof(idxs[0]) * num); if (error != 0) goto mmap_out; for (i = 0; i < num; i++) gpfns[i] = atop(umap->phys_base_addr + (i + index) * PAGE_SIZE); bzero(errs, sizeof(*errs) * num); error = HYPERVISOR_memory_op( XENMEM_add_to_physmap_range, &add); if (error != 0) { error = xen_translate_error(error); goto mmap_out; } for (i = 0; i < num; i++) { if (errs[i] != 0) { errs[i] = xen_translate_error(errs[i]); /* Mark the page as invalid. */ BIT_SET(mmap->num, index + i, umap->err); } } error = copyout(errs, &mmap->err[index], sizeof(errs[0]) * num); if (error != 0) goto mmap_out; } umap->mapped = true; mmap_out: free(idxs, M_PRIVCMD); free(gpfns, M_PRIVCMD); free(errs, M_PRIVCMD); if (!umap->mapped) free(umap->err, M_PRIVCMD); break; } default: error = ENOSYS; break; } return (error); }
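Because the size field of xen_add_to_physmap_range is 16 bits wide, the MMAPBATCH path above walks a large request in chunks of at most UINT16_MAX mappings, advancing the index by the chunk size each pass. The sketch below shows just that chunking loop; map_chunk is a stand-in for the per-chunk copyin/hypercall/copyout work, and the request size is an example.

#include <stdio.h>
#include <stdint.h>

/* Stand-in for one XENMEM_add_to_physmap_range batch. */
static int map_chunk(unsigned int index, uint16_t num)
{
    printf("chunk at %u: %u mapping(s)\n", index, num);
    return 0;
}

int main(void)
{
    unsigned int total = 150000;   /* hypothetical request above UINT16_MAX */
    unsigned int index;
    uint16_t num;

    for (index = 0; index < total; index += num) {
        unsigned int left = total - index;

        num = left > UINT16_MAX ? UINT16_MAX : (uint16_t)left;
        if (map_chunk(index, num) != 0)
            return 1;
    }
    return 0;
}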