/*ARGSUSED*/
void
hat_kpm_addmem_mseg_update(struct memseg *msp, pgcnt_t nkpmpgs,
	offset_t kpm_pages_off)
{
	pfn_t start, limit;

	/*
	 * kphysm_add_memory_dynamic() does not set nkpmpgs
	 * when page_t memory is externally allocated.  That
	 * code must properly calculate nkpmpgs in all cases
	 * if nkpmpgs needs to be used at some point.
	 */

	/*
	 * The meta (page_t) pages for dynamically added memory are allocated
	 * either from the incoming memory itself or from existing memory.
	 * In the former case the base of the incoming pages will be different
	 * than the base of the dynamic segment so call memseg_get_start() to
	 * get the actual base of the incoming memory for each case.
	 */
	start = memseg_get_start(msp);
	limit = msp->pages_end;

	/* Establish a locked kpm mapping over the incoming pfn range. */
	hat_devload(kas.a_hat, kpm_vbase + mmu_ptob(start),
	    mmu_ptob(limit - start), start, PROT_READ | PROT_WRITE,
	    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
}
/* * Return the virtual address of the mapping area */ caddr_t i_cpr_map_setup(void) { /* * Allocate a virtual memory range spanned by an hmeblk. * This would be 8 hments or 64k bytes. Starting VA * must be 64k (8-page) aligned. */ cpr_vaddr = vmem_xalloc(heap_arena, mmu_ptob(NHMENTS), mmu_ptob(NHMENTS), 0, 0, NULL, NULL, VM_NOSLEEP); return (cpr_vaddr); }
/* * save/restore prom pages or free related allocs */ int i_cpr_prom_pages(int action) { int error; if (action == CPR_PROM_SAVE) { if (ppage_buf == NULL) { ASSERT(pphys_list == NULL); if (error = i_cpr_find_ppages()) return (error); i_cpr_save_ppages(); } } else if (action == CPR_PROM_RESTORE) { i_cpr_restore_ppages(); } else if (action == CPR_PROM_FREE) { if (pphys_list) { ASSERT(pphys_list_size); kmem_free(pphys_list, pphys_list_size); pphys_list = NULL; pphys_list_size = 0; } if (ppage_buf) { ASSERT(ppage_count); kmem_free(ppage_buf, mmu_ptob(ppage_count)); CPR_DEBUG(CPR_DEBUG1, "freed %ld prom pages\n", ppage_count); ppage_buf = NULL; ppage_count = 0; } } return (0); }
/*
 * Count the pages occupied by the CPR storage areas (descriptor array
 * and compressed-data buffer), applying bitfunc via cpr_count_pages().
 */
pgcnt_t
i_cpr_count_storage_pages(int mapflag, bitfunc_t bitfunc)
{
	pgcnt_t total = 0;

	/* Descriptor area, if allocated. */
	if (i_cpr_storage_desc_base != NULL) {
		total += cpr_count_pages((caddr_t)i_cpr_storage_desc_base,
		    (size_t)mmu_ptob(i_cpr_storage_desc_pgcnt),
		    mapflag, bitfunc, DBG_SHOWRANGE);
	}

	/* Compressed-data area, if allocated. */
	if (i_cpr_storage_data_base != NULL) {
		total += cpr_count_pages(i_cpr_storage_data_base,
		    (size_t)mmu_ptob(i_cpr_storage_data_sz),
		    mapflag, bitfunc, DBG_SHOWRANGE);
	}

	return (total);
}
/*
 * Translate a machine address to a physical address; returns all-ones
 * on a failed mfn-to-pfn lookup.
 */
static paddr_t
mdb_ma_to_pa(uint64_t ma)
{
	pfn_t pfn;

	pfn = mdb_mfn_to_pfn(mmu_btop(ma));
	if (pfn == -(pfn_t)1)
		return (-(paddr_t)1);

	/* Recombine the frame's physical base with the page offset. */
	return (mmu_ptob((paddr_t)pfn) | (ma & (MMU_PAGESIZE - 1)));
}
/*ARGSUSED*/ void hat_kpm_delmem_mseg_update(struct memseg *msp, struct memseg **mspp) { pfn_t base, end; /* * The meta (page_t) pages for dynamically added memory are allocated * either from the incoming memory itself or from existing memory. * In the former case the base of the incoming pages will be different * than the base of the dynamic segment so call memseg_get_start() to * get the actual base of the incoming memory for each case. */ base = memseg_get_start(msp); end = msp->pages_end; hat_unload(kas.a_hat, kpm_vbase + mmu_ptob(base), mmu_ptob(end - base), HAT_UNLOAD | HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP); }
/*
 * Derived from cpr_write_statefile().
 * Allocate (or reallocate after exhausting the supply) descriptors for each
 * chunk of contiguous sensitive kpages.
 *
 * basepp/pgsp/endpp describe the descriptor array (base, size in pages,
 * one-past-end).  Any prior array is freed before the new allocation, so
 * on ENOMEM *basepp is NULL and the old contents are gone.
 * Returns 0 on success, ENOMEM on allocation failure.
 */
static int
i_cpr_storage_desc_alloc(csd_t **basepp, pgcnt_t *pgsp, csd_t **endpp,
    int retry)
{
	pgcnt_t npages;
	int chunks;
	csd_t *descp, *end;
	size_t len;
	char *str = "i_cpr_storage_desc_alloc:";

	/*
	 * On initial allocation, add some extra to cover overhead caused
	 * by the allocation for the storage area later.
	 */
	if (retry == 0) {
		chunks = cpr_contig_pages(NULL, STORAGE_DESC_ALLOC) +
		    EXTRA_DESCS;
		npages = mmu_btopr(sizeof (**basepp) * (pgcnt_t)chunks);
		CPR_DEBUG(CPR_DEBUG7, "%s chunks %d, ", str, chunks);
	} else {
		CPR_DEBUG(CPR_DEBUG7, "%s retry %d: ", str, retry);
		/* Grow the previous allocation by one page per retry. */
		npages = *pgsp + 1;
	}

	/* Free old descriptors, if any */
	if (*basepp)
		kmem_free((caddr_t)*basepp, mmu_ptob(*pgsp));

	descp = *basepp = kmem_alloc(mmu_ptob(npages), KM_NOSLEEP);
	if (descp == NULL) {
		CPR_DEBUG(CPR_DEBUG7, "%s no space for descriptors!\n", str);
		return (ENOMEM);
	}

	*pgsp = npages;
	len = mmu_ptob(npages);
	/* End pointer is derived from however many whole csd_t's fit. */
	end = *endpp = descp + (len / (sizeof (**basepp)));
	CPR_DEBUG(CPR_DEBUG7, "npages 0x%lx, len 0x%lx, items 0x%lx\n\t*basepp "
	    "%p, *endpp %p\n", npages, len, (len / (sizeof (**basepp))),
	    (void *)*basepp, (void *)*endpp);
	i_cpr_storage_desc_init(descp, npages, end);
	return (0);
}
/*
 * Replacement for devmap_devmem_setup() which will map a machine address
 * instead of a register set/offset.
 *
 * Fills in the devmap handle (pfn, length, protections, HAT caching
 * attribute) directly and marks setup as complete.
 */
void
gfxp_map_devmem(devmap_cookie_t dhc, gfx_maddr_t maddr, size_t length,
    ddi_device_acc_attr_t *attrp)
{
	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
	pfn_t pfn;

#ifdef __xpv
	/* Under the Xen initial domain, translate the machine frame first. */
	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
	pfn = xen_assign_pfn(mmu_btop(maddr));
#else
	pfn = mmu_btop(maddr);
#endif
	dhp->dh_pfn = pfn;
	/* Round the mapping length up to a whole number of pages. */
	dhp->dh_len = mmu_ptob(mmu_btopr(length));
	dhp->dh_roff = 0;

#ifndef DEVMAP_DEVMEM_COOKIE
#define	DEVMAP_DEVMEM_COOKIE	((ddi_umem_cookie_t)0x1)	/* XXPV */
#endif /* DEVMAP_DEVMEM_COOKIE */
	dhp->dh_cookie = DEVMAP_DEVMEM_COOKIE;
	/*LINTED: E_EXPR_NULL_EFFECT*/
	dhp->dh_flags |= DEVMAP_DEFAULTS;
	dhp->dh_maxprot = PROT_ALL & dhp->dh_orig_maxprot;

	/* no callbacks needed */
	bzero(&dhp->dh_callbackops, sizeof (struct devmap_callback_ctl));

	/* Translate the DDI data-order attribute into a HAT attribute. */
	switch (attrp->devacc_attr_dataorder) {
	case DDI_UNORDERED_OK_ACC:
		dhp->dh_hat_attr = HAT_UNORDERED_OK;
		break;
	case DDI_MERGING_OK_ACC:
		dhp->dh_hat_attr = HAT_MERGING_OK;
		break;
	case DDI_LOADCACHING_OK_ACC:
		dhp->dh_hat_attr = HAT_LOADCACHING_OK;
		break;
	case DDI_STORECACHING_OK_ACC:
		dhp->dh_hat_attr = HAT_STORECACHING_OK;
		break;
	case DDI_STRICTORDER_ACC:
	default:
		dhp->dh_hat_attr = HAT_STRICTORDER;
	}

	/* don't use large pages */
	dhp->dh_mmulevel = 0;
	dhp->dh_flags &= ~DEVMAP_FLAG_LARGE;

	dhp->dh_flags |= DEVMAP_SETUP_DONE;
}
/*
 * Release both CPR storage areas (descriptor array and compressed-data
 * buffer) and reset their bookkeeping globals.
 */
void
i_cpr_storage_free(void)
{
	/* Release the descriptor array. */
	if (i_cpr_storage_desc_base != NULL) {
		kmem_free(i_cpr_storage_desc_base,
		    mmu_ptob(i_cpr_storage_desc_pgcnt));
		i_cpr_storage_desc_base = NULL;
		i_cpr_storage_desc_pgcnt = 0;
	}

	/* Release the compressed-data area. */
	if (i_cpr_storage_data_base != NULL) {
		kmem_free(i_cpr_storage_data_base,
		    mmu_ptob(i_cpr_storage_data_sz));
		i_cpr_storage_data_base = NULL;
		i_cpr_storage_data_sz = 0;
	}
}
/*
 * Reset the descriptor array to a known-empty state.
 */
static void
i_cpr_storage_desc_init(csd_t *descp, pgcnt_t npages, csd_t *end)
{
	/* Zero the whole descriptor region. */
	bzero(descp, mmu_ptob(npages));

#ifdef	DEBUG
	/*
	 * Poison each entry with an impossible pfn; an ASSERT elsewhere
	 * tests for this sentinel.
	 */
	while (descp < end)
		(descp++)->csd_dirty_spfn = (uint_t)-1;
#endif
}
/*
 * Take a retired page off the retired-pages vnode and clear the toxic flags.
 * If "free" is nonzero, lock it and put it back on the freelist. If "free"
 * is zero, the caller already holds SE_EXCL lock so we simply unretire it
 * and don't do anything else with it.
 *
 * Any unretire messages are printed from this routine.
 *
 * Returns 0 if page pp was unretired; else an error code.
 */
int
page_unretire_pp(page_t *pp, int free)
{
	/*
	 * To be retired, a page has to be hashed onto the retired_pages vnode
	 * and have PR_RETIRED set in p_toxic.
	 */
	if (free == 0 || page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) {
		ASSERT(PAGE_EXCL(pp));
		PR_DEBUG(prd_ulocked);
		if (!PP_RETIRED(pp)) {
			/* Locked but not actually retired: bail out. */
			PR_DEBUG(prd_unotretired);
			page_unlock(pp);
			return (page_retire_done(pp, PRD_UNR_NOT));
		}

		PR_MESSAGE(CE_NOTE, 1, "unretiring retired"
		    " page 0x%08x.%08x", mmu_ptob((uint64_t)pp->p_pagenum));
		/* Decrement the kstat matching why the page was retired. */
		if (pp->p_toxic & PR_FMA) {
			PR_DECR_KSTAT(pr_fma);
		} else if (pp->p_toxic & PR_UE) {
			PR_DECR_KSTAT(pr_ue);
		} else {
			PR_DECR_KSTAT(pr_mce);
		}
		page_clrtoxic(pp, PR_ALLFLAGS);

		if (free) {
			/* Destroy releases the page back to the freelist. */
			PR_DEBUG(prd_udestroy);
			page_destroy(pp, 0);
		} else {
			/* Caller keeps the page; just unhash it. */
			PR_DEBUG(prd_uhashout);
			page_hashout(pp, NULL);
		}

		/* The page is usable again; return it to availrmem. */
		mutex_enter(&freemem_lock);
		availrmem++;
		mutex_exit(&freemem_lock);

		PR_DEBUG(prd_uunretired);
		PR_DECR_KSTAT(pr_retired);
		PR_INCR_KSTAT(pr_unretired);
		return (page_retire_done(pp, PRD_UNR_SUCCESS));
	}
	PR_DEBUG(prd_unotlocked);
	return (page_retire_done(pp, PRD_UNR_CANTLOCK));
}
/*
 * Estimate how much memory we will need to save
 * the sensitive pages with compression.
 *
 * Sets *alloc_pages to the estimated page count and returns the buffer
 * address, or NULL on allocation failure.
 */
static caddr_t
i_cpr_storage_data_alloc(pgcnt_t pages, pgcnt_t *alloc_pages, int retry_cnt)
{
	pgcnt_t pcnt, prior_pcnt;
	caddr_t buf;
	char *str = "i_cpr_storage_data_alloc:";

	if (retry_cnt == 0) {
		/*
		 * common compression ratio is about 3:1
		 * initial storage allocation is estimated at 40%
		 * to cover the majority of cases
		 */
		pcnt = INITIAL_ALLOC_PCNT;
		*alloc_pages = (pages * pcnt) / INTEGRAL;
		CPR_DEBUG(CPR_DEBUG7, "%s sensitive pages: %ld\n", str, pages);
		CPR_DEBUG(CPR_DEBUG7,
		    "%s initial est pages: %ld, alloc %ld%%\n",
		    str, *alloc_pages, pcnt);
	} else {
		/*
		 * calculate the prior compression percentage (x100)
		 * from the last attempt to save sensitive pages
		 */
		ASSERT(sensitive_pages_saved != 0);
		prior_pcnt = (mmu_btopr(sensitive_size_saved) * INTEGRAL) /
		    sensitive_pages_saved;
		CPR_DEBUG(CPR_DEBUG7, "%s last ratio %ld%%\n",
		    str, prior_pcnt);

		/*
		 * new estimated storage size is based on
		 * the larger ratio + 5% for each retry:
		 * pages * (last + [5%, 10%])
		 */
		pcnt = MAX(prior_pcnt, INITIAL_ALLOC_PCNT) +
		    (retry_cnt * 5);
		*alloc_pages = (pages * pcnt) / INTEGRAL;
		CPR_DEBUG(CPR_DEBUG7, "%s Retry est pages: %ld, alloc %ld%%\n",
		    str, *alloc_pages, pcnt);
	}

	buf = kmem_alloc(mmu_ptob(*alloc_pages), KM_NOSLEEP);
	CPR_DEBUG(CPR_DEBUG7, "%s alloc %ld pages\n", str, *alloc_pages);
	return (buf);
}
/*
 * From a machine address, find the corresponding pseudo-physical address.
 * Pseudo-physical address are contiguous and run from mfn_base in each VM.
 * Machine addresses are the real underlying hardware addresses.
 * These are needed for page table entries. Note that this routine is
 * poorly protected. A bad value of "ma" will cause a page fault.
 */
paddr_t
ma_to_pa(maddr_t ma)
{
	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
	paddr_t pa;

	/* A frame beyond our page count means the ma isn't ours. */
	if (pfn >= xen_info->nr_pages)
		return (-(paddr_t)1);

	pa = mfn_base + mmu_ptob((paddr_t)pfn) + (ma & MMU_PAGEOFFSET);
#ifdef DEBUG
	/* Round-trip check: the reverse translation should agree. */
	if (ma != pa_to_ma(pa))
		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
#endif
	return (pa);
}
/*
 * Fetch the page table entry for addr from the given htable, either from
 * the hat's copied PTE array or from the pagetable page itself.
 */
static x86pte_t
get_pte(hat_t *hat, htable_t *htable, uintptr_t addr)
{
	size_t off = va2entry(htable, addr) << mmu.pte_size_shift;
	x86pte_t pte;

	/* Copied (per-hat) page tables are read directly from the hat. */
	if (htable->ht_flags & HTABLE_COPIED)
		return (*(x86pte_t *)((uintptr_t)hat->hat_copied_ptes + off));

	/* Otherwise read the entry from the pagetable's physical page. */
	if (mdb_pread(&pte, mmu.pte_size,
	    mmu_ptob((paddr_t)htable->ht_pfn) + off) == mmu.pte_size)
		return (pte);

	return (0);
}
/*
 * Release the grant references mapped for a completed user request and
 * return the request's ring id to the free pool.  Safe to call with no
 * address space attached (returns immediately).
 */
static void
xpvtap_user_request_unmap(xpvtap_state_t *state, uint_t uid)
{
	blkif_request_t *req;
	struct seg *seg;
	struct as *as;
	caddr_t uaddr;
	int e;

	as = state->bt_map.um_as;
	if (as == NULL) {
		return;
	}

	/* get a copy of the original request */
	req = &state->bt_map.um_outstanding_reqs[uid];

	/* unmap the grefs for this request */
	if ((req->operation != BLKIF_OP_WRITE_BARRIER) &&
	    (req->operation != BLKIF_OP_FLUSH_DISKCACHE) &&
	    (req->nr_segments != 0)) {
		uaddr = XPVTAP_GREF_REQADDR(state->bt_map.um_guest_pages, uid);
		AS_LOCK_ENTER(as, RW_READER);
		seg = as_findseg(as, state->bt_map.um_guest_pages, 0);
		/*
		 * Bail out (still freeing the ring id) if the segment is
		 * gone or the request's pages fall outside it.
		 */
		if ((seg == NULL) || ((uaddr + mmu_ptob(req->nr_segments)) >
		    (seg->s_base + seg->s_size))) {
			AS_LOCK_EXIT(as);
			xpvtap_rs_free(state->bt_map.um_rs, uid);
			return;
		}

		e = segmf_release_grefs(seg, uaddr, req->nr_segments);
		if (e != 0) {
			cmn_err(CE_WARN, "unable to release grefs");
		}

		AS_LOCK_EXIT(as);
	}

	/* free up the user ring id */
	xpvtap_rs_free(state->bt_map.um_rs, uid);
}
/*
 * Map a physical address range into kernel virtual space and return the
 * VA corresponding to addr, or 0 on failure or zero length.
 */
caddr_t
psm_map_phys_new(paddr_t addr, size_t len, int prot)
{
	uint_t off;
	paddr_t pbase;
	pgcnt_t pgcnt;
	caddr_t va;

	if (len == 0)
		return (0);

	/* Split the address into a page-aligned base plus page offset. */
	off = addr & MMU_PAGEOFFSET;
	pbase = addr - off;
	pgcnt = mmu_btopr(len + off);

	va = device_arena_alloc(ptob(pgcnt), VM_NOSLEEP);
	if (va == NULL)
		return (0);

	/* Install a locked device mapping over the whole range. */
	hat_devload(kas.a_hat, va, mmu_ptob(pgcnt), mmu_btop(pbase),
	    prot, HAT_LOAD_LOCK);

	return (va + off);
}
/*
 * Prints any page retire messages to the user, and decides what
 * error code is appropriate for the condition reported.
 *
 * Looks up "code" in the page_retire_ops table; on DEBUG kernels an
 * unknown code panics (note: prop is dereferenced unconditionally below,
 * so a code missing from the table would be fatal on non-DEBUG too).
 */
static int
page_retire_done(page_t *pp, int code)
{
	page_retire_op_t *prop;
	uint64_t pa = 0;
	int i;

	if (pp != NULL) {
		pa = mmu_ptob((uint64_t)pp->p_pagenum);
	}

	prop = NULL;
	for (i = 0; page_retire_ops[i].pr_key != PRD_INVALID_KEY; i++) {
		if (page_retire_ops[i].pr_key == code) {
			prop = &page_retire_ops[i];
			break;
		}
	}

#ifdef	DEBUG
	/* If the loop ran off the end, i indexes the sentinel entry. */
	if (page_retire_ops[i].pr_key == PRD_INVALID_KEY) {
		cmn_err(CE_PANIC, "page_retire_done: Invalid opcode %d", code);
	}
#endif

	ASSERT(prop->pr_key == code);

	prop->pr_count++;

	PR_MESSAGE(CE_NOTE, prop->pr_msglvl, prop->pr_message, pa);
	if (pp != NULL) {
		/* Remember that a message has been printed for this page. */
		page_settoxic(pp, PR_MSG);
	}

	return (prop->pr_retval);
}
/*
 * Fetch the page table entry for addr from the given htable, reading
 * either the hat's VLP shadow (virtually) or the pagetable page
 * (physically), and widening a 32-bit PTE when necessary.
 */
static x86pte_t
get_pte(hat_t *hat, htable_t *htable, uintptr_t addr)
{
	x86pte_t buf;
	x86pte32_t *pte32 = (x86pte32_t *)&buf;
	size_t off = va2entry(htable, addr) << mmu.pte_size_shift;
	size_t nread;

	if (htable->ht_flags & HTABLE_VLP) {
		/* VLP tables are shadowed in the hat; read virtually. */
		nread = mdb_vread(&buf, mmu.pte_size,
		    (uintptr_t)hat->hat_vlp_ptes + off);
	} else {
		/* Normal tables: read the entry from physical memory. */
		nread = mdb_pread(&buf, mmu.pte_size,
		    mmu_ptob((paddr_t)htable->ht_pfn) + off);
	}

	if (nread != mmu.pte_size)
		return (0);

	/* Widen a 32-bit entry when the pagetable format is non-PAE. */
	return (mmu.pte_size == sizeof (x86pte_t) ? buf : *pte32);
}
/*
 * We're done using the mapping area; release virtual space
 * (the NHMENTS-page range obtained by i_cpr_map_setup()).
 */
void
i_cpr_map_destroy(void)
{
	vmem_free(heap_arena, cpr_vaddr, mmu_ptob(NHMENTS));
	cpr_vaddr = NULL;
}
/*
 * Create multiboot info structure (mbi) base on the saved mbi.
 * Recalculate values of the pointer type fields in the data
 * structure based on the new starting physical address of the
 * data structure.
 *
 * The mbi, module record, module name, memory map, drives info and
 * command line are packed sequentially into one contiguous below-1G
 * allocation; "offs" tracks the running offset from its start.
 * Returns 0 on success, -1 on allocation failure.
 */
static int
fastboot_build_mbi(char *mdep, fastboot_info_t *nk)
{
	mb_module_t	*mbp;
	multiboot_info_t	*mbi;	/* pointer to multiboot structure */
	uintptr_t	start_addr_va;	/* starting VA of mbi */
	uintptr_t	start_addr_pa;	/* starting PA of mbi */
	size_t		offs = 0;	/* offset from the starting address */
	size_t		arglen;		/* length of the command line arg */
	size_t		size;	/* size of the memory reserved for mbi */
	size_t		mdnsz;	/* length of the boot archive name */

	/*
	 * If mdep is not NULL or empty, use the length of mdep + 1
	 * (for NULL terminating) as the length of the new command
	 * line; else use the saved command line length as the
	 * length for the new command line.
	 */
	if (mdep != NULL && strlen(mdep) != 0) {
		arglen = strlen(mdep) + 1;
	} else {
		arglen = saved_cmdline_len;
	}

	/*
	 * Allocate memory for the new multiboot info structure (mbi).
	 * If we have reserved memory for mbi but it's not enough,
	 * free it and reallocate.
	 */
	size = PAGESIZE + P2ROUNDUP(arglen, PAGESIZE);
	if (nk->fi_mbi_size && nk->fi_mbi_size < size) {
		contig_free((void *)nk->fi_new_mbi_va, nk->fi_mbi_size);
		nk->fi_mbi_size = 0;
	}

	if (nk->fi_mbi_size == 0) {
		if ((nk->fi_new_mbi_va =
		    (uintptr_t)contig_alloc(size, &fastboot_below_1G_dma_attr,
		    PAGESIZE, 0)) == NULL) {
			cmn_err(CE_NOTE, fastboot_enomem_msg,
			    (uint64_t)size, "1G");
			return (-1);
		}
		/*
		 * fi_mbi_size must be set after the allocation succeeds
		 * as it's used to determine how much memory to free.
		 */
		nk->fi_mbi_size = size;
	}

	/*
	 * Initialize memory
	 */
	bzero((void *)nk->fi_new_mbi_va, nk->fi_mbi_size);

	/*
	 * Get PA for the new mbi
	 */
	start_addr_va = nk->fi_new_mbi_va;
	start_addr_pa = mmu_ptob((uint64_t)hat_getpfnum(kas.a_hat,
	    (caddr_t)start_addr_va));
	nk->fi_new_mbi_pa = (paddr_t)start_addr_pa;

	/*
	 * Populate the rest of the fields in the data structure
	 */

	/*
	 * Copy from the saved mbi to preserve all non-pointer type fields.
	 */
	mbi = (multiboot_info_t *)start_addr_va;
	bcopy(&saved_mbi, mbi, sizeof (*mbi));

	/*
	 * Recalculate mods_addr.  Set mod_start and mod_end based on
	 * the physical address of the new boot archive.  Set mod_name
	 * to the name of the new boot archive.
	 */
	offs += sizeof (multiboot_info_t);
	mbi->mods_addr = start_addr_pa + offs;
	mbp = (mb_module_t *)(start_addr_va + offs);
	mbp->mod_start = nk->fi_files[FASTBOOT_BOOTARCHIVE].fb_dest_pa;
	mbp->mod_end = nk->fi_files[FASTBOOT_BOOTARCHIVE].fb_next_pa;

	offs += sizeof (mb_module_t);
	mdnsz = strlen(fastboot_filename[FASTBOOT_NAME_BOOTARCHIVE]) + 1;
	bcopy(fastboot_filename[FASTBOOT_NAME_BOOTARCHIVE],
	    (void *)(start_addr_va + offs), mdnsz);
	mbp->mod_name = start_addr_pa + offs;
	mbp->reserved = 0;

	/*
	 * Make sure the offset is 16-byte aligned to avoid unaligned access.
	 */
	offs += mdnsz;
	offs = P2ROUNDUP_TYPED(offs, 16, size_t);

	/*
	 * Recalculate mmap_addr
	 */
	mbi->mmap_addr = start_addr_pa + offs;
	bcopy((void *)(uintptr_t)saved_mmap, (void *)(start_addr_va + offs),
	    saved_mbi.mmap_length);
	offs += saved_mbi.mmap_length;

	/*
	 * Recalculate drives_addr
	 */
	mbi->drives_addr = start_addr_pa + offs;
	bcopy((void *)(uintptr_t)saved_drives, (void *)(start_addr_va + offs),
	    saved_mbi.drives_length);
	offs += saved_mbi.drives_length;

	/*
	 * Recalculate the address of cmdline.  Set cmdline to contain the
	 * new boot argument.
	 */
	mbi->cmdline = start_addr_pa + offs;

	if (mdep != NULL && strlen(mdep) != 0) {
		bcopy(mdep, (void *)(start_addr_va + offs), arglen);
	} else {
		bcopy((void *)saved_cmdline, (void *)(start_addr_va + offs),
		    arglen);
	}

	/* clear fields and flags that are not copied */
	bzero(&mbi->config_table,
	    sizeof (*mbi) - offsetof(multiboot_info_t, config_table));
	mbi->flags &= ~(MB_INFO_CONFIG_TABLE | MB_INFO_BOOT_LOADER_NAME |
	    MB_INFO_APM_TABLE | MB_INFO_VIDEO_INFO);

	return (0);
}
/*
 * This function performs the following tasks:
 * - Read the sizes of the new kernel and boot archive.
 * - Allocate memory for the new kernel and boot archive.
 * - Allocate memory for page tables necessary for mapping the memory
 *   allocated for the files.
 * - Read the new kernel and boot archive into memory.
 * - Map in the fast reboot switcher.
 * - Load the fast reboot switcher to FASTBOOT_SWTCH_PA.
 * - Build the new multiboot_info structure
 * - Build page tables for the low 1G of physical memory.
 * - Mark the data structure as valid if all steps have succeeded.
 */
void
fastboot_load_kernel(char *mdep)
{
	void		*buf = NULL;
	int		i;
	fastboot_file_t	*fb;
	uint32_t	dboot_start_offset;
	char		kern_bootpath[OBP_MAXPATHLEN];
	extern uintptr_t postbootkernelbase;
	uintptr_t	saved_kernelbase;
	int		bootpath_len = 0;
	int		is_failsafe = 0;
	int		is_retry = 0;
	uint64_t	end_addr;

	if (!fastreboot_capable)
		return;

	if (newkernel.fi_valid)
		fastboot_free_newkernel(&newkernel);

	/*
	 * NOTE(review): postbootkernelbase is cleared for the duration of
	 * loading and restored on every exit path (success and err_out).
	 */
	saved_kernelbase = postbootkernelbase;

	postbootkernelbase = 0;

	/*
	 * Initialize various HAT related fields in the data structure
	 */
	fastboot_init_fields(&newkernel);

	bzero(kern_bootpath, OBP_MAXPATHLEN);

	/*
	 * Process the boot argument
	 */
	bzero(fastboot_args, OBP_MAXPATHLEN);
	fastboot_parse_mdep(mdep, kern_bootpath, &bootpath_len, fastboot_args);

	/*
	 * Make sure we get the null character
	 */
	bcopy(kern_bootpath, fastboot_filename[FASTBOOT_NAME_UNIX],
	    bootpath_len);
	bcopy(kern_bootfile,
	    &fastboot_filename[FASTBOOT_NAME_UNIX][bootpath_len],
	    strlen(kern_bootfile) + 1);

	bcopy(kern_bootpath, fastboot_filename[FASTBOOT_NAME_BOOTARCHIVE],
	    bootpath_len);

	/* A failsafe unix selects the matching failsafe boot archive below. */
	if (bcmp(kern_bootfile, FAILSAFE_BOOTFILE32,
	    (sizeof (FAILSAFE_BOOTFILE32) - 1)) == 0 ||
	    bcmp(kern_bootfile, FAILSAFE_BOOTFILE64,
	    (sizeof (FAILSAFE_BOOTFILE64) - 1)) == 0) {
		is_failsafe = 1;
	}

load_kernel_retry:
	/*
	 * Read in unix and boot_archive
	 */
	end_addr = DBOOT_ENTRY_ADDRESS;
	for (i = 0; i < FASTBOOT_MAX_FILES_MAP; i++) {
		struct _buf	*file;
		uintptr_t	va;
		uint64_t	fsize;
		size_t		fsize_roundup, pt_size;
		int		page_index;
		uintptr_t	offset;
		ddi_dma_attr_t dma_attr = fastboot_dma_attr;

		dprintf("fastboot_filename[%d] = %s\n",
		    i, fastboot_filename[i]);

		if ((file = kobj_open_file(fastboot_filename[i])) ==
		    (struct _buf *)-1) {
			cmn_err(CE_NOTE, "!Fastboot: Couldn't open %s",
			    fastboot_filename[i]);
			goto err_out;
		}

		if (kobj_get_filesize(file, &fsize) != 0) {
			cmn_err(CE_NOTE,
			    "!Fastboot: Couldn't get filesize for %s",
			    fastboot_filename[i]);
			goto err_out;
		}

		fsize_roundup = P2ROUNDUP_TYPED(fsize, PAGESIZE, size_t);

		/*
		 * Where the files end in physical memory after being
		 * relocated by the fast boot switcher.
		 */
		end_addr += fsize_roundup;
		if (end_addr > fastboot_below_1G_dma_attr.dma_attr_addr_hi) {
			cmn_err(CE_NOTE, "!Fastboot: boot archive is too big");
			goto err_out;
		}

		/*
		 * Adjust dma_attr_addr_lo so that the new kernel and boot
		 * archive will not be overridden during relocation.
		 */
		if (end_addr > fastboot_dma_attr.dma_attr_addr_lo ||
		    end_addr > fastboot_below_1G_dma_attr.dma_attr_addr_lo) {

			if (is_retry) {
				/*
				 * If we have already tried and didn't succeed,
				 * just give up.
				 */
				cmn_err(CE_NOTE,
				    "!Fastboot: boot archive is too big");
				goto err_out;
			} else {
				/* Set the flag so we don't keep retrying */
				is_retry++;

				/* Adjust dma_attr_addr_lo */
				fastboot_dma_attr.dma_attr_addr_lo = end_addr;
				fastboot_below_1G_dma_attr.dma_attr_addr_lo =
				    end_addr;

				/*
				 * Free the memory we have already allocated
				 * whose physical addresses might not fit
				 * the new lo and hi constraints.
				 */
				fastboot_free_mem(&newkernel, end_addr);
				goto load_kernel_retry;
			}
		}

		/* Allow a scattered allocation when contig isn't required. */
		if (!fastboot_contig)
			dma_attr.dma_attr_sgllen = (fsize / PAGESIZE) +
			    (((fsize % PAGESIZE) == 0) ? 0 : 1);

		if ((buf = contig_alloc(fsize, &dma_attr, PAGESIZE, 0))
		    == NULL) {
			cmn_err(CE_NOTE, fastboot_enomem_msg, fsize, "64G");
			goto err_out;
		}

		va = P2ROUNDUP_TYPED((uintptr_t)buf, PAGESIZE, uintptr_t);

		if (kobj_read_file(file, (char *)va, fsize, 0) < 0) {
			cmn_err(CE_NOTE, "!Fastboot: Couldn't read %s",
			    fastboot_filename[i]);
			goto err_out;
		}

		fb = &newkernel.fi_files[i];
		fb->fb_va = va;
		fb->fb_size = fsize;
		fb->fb_sectcnt = 0;

		pt_size = FASTBOOT_PTE_LIST_SIZE(fsize_roundup);

		/*
		 * If we have reserved memory but it not enough, free it.
		 */
		if (fb->fb_pte_list_size && fb->fb_pte_list_size < pt_size) {
			contig_free((void *)fb->fb_pte_list_va,
			    fb->fb_pte_list_size);
			fb->fb_pte_list_size = 0;
		}

		if (fb->fb_pte_list_size == 0) {
			if ((fb->fb_pte_list_va =
			    (x86pte_t *)contig_alloc(pt_size,
			    &fastboot_below_1G_dma_attr, PAGESIZE, 0))
			    == NULL) {
				cmn_err(CE_NOTE, fastboot_enomem_msg,
				    (uint64_t)pt_size, "1G");
				goto err_out;
			}
			/*
			 * fb_pte_list_size must be set after the allocation
			 * succeeds as it's used to determine how much memory
			 * to free.
			 */
			fb->fb_pte_list_size = pt_size;
		}

		bzero((void *)(fb->fb_pte_list_va), fb->fb_pte_list_size);

		fb->fb_pte_list_pa = mmu_ptob((uint64_t)hat_getpfnum(kas.a_hat,
		    (caddr_t)fb->fb_pte_list_va));

		/* Record the PA of each page of the file in the PTE list. */
		for (page_index = 0, offset = 0; offset < fb->fb_size;
		    offset += PAGESIZE) {
			uint64_t paddr;

			paddr = mmu_ptob((uint64_t)hat_getpfnum(kas.a_hat,
			    (caddr_t)fb->fb_va + offset));

			ASSERT(paddr >= fastboot_dma_attr.dma_attr_addr_lo);

			/*
			 * Include the pte_bits so we don't have to make
			 * it in assembly.
			 */
			fb->fb_pte_list_va[page_index++] =
			    (x86pte_t)(paddr | pte_bits);
		}

		fb->fb_pte_list_va[page_index] = FASTBOOT_TERMINATE;

		if (i == FASTBOOT_UNIX) {
			Ehdr	*ehdr = (Ehdr *)va;
			int	j;

			/*
			 * Sanity checks:
			 */
			for (j = 0; j < SELFMAG; j++) {
				if (ehdr->e_ident[j] != ELFMAG[j]) {
					cmn_err(CE_NOTE, "!Fastboot: Bad ELF "
					    "signature");
					goto err_out;
				}
			}

			if (ehdr->e_ident[EI_CLASS] == ELFCLASS32 &&
			    ehdr->e_ident[EI_DATA] == ELFDATA2LSB &&
			    ehdr->e_machine == EM_386) {

				fb->fb_sectcnt = sizeof (fb->fb_sections) /
				    sizeof (fb->fb_sections[0]);

				if (fastboot_elf32_find_loadables((void *)va,
				    fsize, &fb->fb_sections[0],
				    &fb->fb_sectcnt, &dboot_start_offset) < 0) {
					cmn_err(CE_NOTE, "!Fastboot: ELF32 "
					    "program section failure");
					goto err_out;
				}

				if (fb->fb_sectcnt == 0) {
					cmn_err(CE_NOTE, "!Fastboot: No ELF32 "
					    "program sections found");
					goto err_out;
				}

				if (is_failsafe) {
					/* Failsafe boot_archive */
					bcopy(BOOTARCHIVE32_FAILSAFE,
					    &fastboot_filename
					    [FASTBOOT_NAME_BOOTARCHIVE]
					    [bootpath_len],
					    sizeof (BOOTARCHIVE32_FAILSAFE));
				} else {
					bcopy(BOOTARCHIVE32,
					    &fastboot_filename
					    [FASTBOOT_NAME_BOOTARCHIVE]
					    [bootpath_len],
					    sizeof (BOOTARCHIVE32));
				}

			} else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64 &&
			    ehdr->e_ident[EI_DATA] == ELFDATA2LSB &&
			    ehdr->e_machine == EM_AMD64) {

				if (fastboot_elf64_find_dboot_load_offset(
				    (void *)va, fsize, &dboot_start_offset) !=
				    0) {
					cmn_err(CE_NOTE, "!Fastboot: Couldn't "
					    "find ELF64 dboot entry offset");
					goto err_out;
				}

				if (!is_x86_feature(x86_featureset,
				    X86FSET_64) ||
				    !is_x86_feature(x86_featureset,
				    X86FSET_PAE)) {
					cmn_err(CE_NOTE, "Fastboot: Cannot "
					    "reboot to %s: "
					    "not a 64-bit capable system",
					    kern_bootfile);
					goto err_out;
				}

				if (is_failsafe) {
					/* Failsafe boot_archive */
					bcopy(BOOTARCHIVE64_FAILSAFE,
					    &fastboot_filename
					    [FASTBOOT_NAME_BOOTARCHIVE]
					    [bootpath_len],
					    sizeof (BOOTARCHIVE64_FAILSAFE));
				} else {
					bcopy(BOOTARCHIVE64,
					    &fastboot_filename
					    [FASTBOOT_NAME_BOOTARCHIVE]
					    [bootpath_len],
					    sizeof (BOOTARCHIVE64));
				}
			} else {
				cmn_err(CE_NOTE, "!Fastboot: Unknown ELF type");
				goto err_out;
			}

			fb->fb_dest_pa = DBOOT_ENTRY_ADDRESS -
			    dboot_start_offset;

			fb->fb_next_pa = DBOOT_ENTRY_ADDRESS + fsize_roundup;
		} else {
			/* Non-unix files are placed after the previous one. */
			fb->fb_dest_pa = newkernel.fi_files[i - 1].fb_next_pa;
			fb->fb_next_pa = fb->fb_dest_pa + fsize_roundup;
		}

		kobj_close_file(file);

	}

	/*
	 * Add the function that will switch us to 32-bit protected mode
	 */
	fb = &newkernel.fi_files[FASTBOOT_SWTCH];
	fb->fb_va = fb->fb_dest_pa = FASTBOOT_SWTCH_PA;
	fb->fb_size = MMU_PAGESIZE;

	hat_devload(kas.a_hat, (caddr_t)fb->fb_va,
	    MMU_PAGESIZE, mmu_btop(fb->fb_dest_pa),
	    PROT_READ | PROT_WRITE | PROT_EXEC,
	    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);

	/*
	 * Build the new multiboot_info structure
	 */
	if (fastboot_build_mbi(fastboot_args, &newkernel) != 0) {
		goto err_out;
	}

	/*
	 * Build page table for low 1G physical memory. Use big pages.
	 * Allocate 4 (5 for amd64) pages for the page tables.
	 *    1 page for PML4 (amd64)
	 *    1 page for Page-Directory-Pointer Table
	 *    2 pages for Page Directory
	 *    1 page for Page Table.
	 * The page table entry will be rewritten to map the physical
	 * address as we do the copying.
	 */
	if (newkernel.fi_has_pae) {
#ifdef	__amd64
		size_t size = MMU_PAGESIZE * 5;
#else
		size_t size = MMU_PAGESIZE * 4;
#endif	/* __amd64 */

		if (newkernel.fi_pagetable_size && newkernel.fi_pagetable_size
		    < size) {
			contig_free((void *)newkernel.fi_pagetable_va,
			    newkernel.fi_pagetable_size);
			newkernel.fi_pagetable_size = 0;
		}

		if (newkernel.fi_pagetable_size == 0) {
			if ((newkernel.fi_pagetable_va = (uintptr_t)
			    contig_alloc(size, &fastboot_below_1G_dma_attr,
			    MMU_PAGESIZE, 0)) == NULL) {
				cmn_err(CE_NOTE, fastboot_enomem_msg,
				    (uint64_t)size, "1G");
				goto err_out;
			}
			/*
			 * fi_pagetable_size must be set after the allocation
			 * succeeds as it's used to determine how much memory
			 * to free.
			 */
			newkernel.fi_pagetable_size = size;
		}

		bzero((void *)(newkernel.fi_pagetable_va), size);

		newkernel.fi_pagetable_pa =
		    mmu_ptob((uint64_t)hat_getpfnum(kas.a_hat,
		    (caddr_t)newkernel.fi_pagetable_va));

		newkernel.fi_last_table_pa = newkernel.fi_pagetable_pa +
		    size - MMU_PAGESIZE;

		newkernel.fi_next_table_va = newkernel.fi_pagetable_va +
		    MMU_PAGESIZE;
		newkernel.fi_next_table_pa = newkernel.fi_pagetable_pa +
		    MMU_PAGESIZE;

		fastboot_build_pagetables(&newkernel);
	}

	/* Generate MD5 checksums */
	fastboot_cksum_generate(&newkernel);

	/* Mark it as valid */
	newkernel.fi_valid = 1;
	newkernel.fi_magic = FASTBOOT_MAGIC;

	postbootkernelbase = saved_kernelbase;
	return;

err_out:
	postbootkernelbase = saved_kernelbase;
	newkernel.fi_valid = 0;
	fastboot_free_newkernel(&newkernel);
}
/*
 * Handle a privcmd mmap request: copy in the command and each mapping
 * entry from userland, then attach the foreign machine-frame ranges to
 * the caller's address-space segments via segmf_add_mfns().
 * Returns 0 or an errno; stops at the first failing entry.
 */
/*ARGSUSED2*/
int
do_privcmd_mmap(void *uarg, int mode, cred_t *cr)
{
	privcmd_mmap_t __mmapcmd, *mmc = &__mmapcmd;
	privcmd_mmap_entry_t *umme;
	struct as *as = curproc->p_as;
	struct seg *seg;
	int i, error = 0;

	if (ddi_copyin(uarg, mmc, sizeof (*mmc), mode))
		return (EFAULT);

	DTRACE_XPV3(mmap__start, domid_t, mmc->dom, int, mmc->num,
	    privcmd_mmap_entry_t *, mmc->entry);

	/* Mapping our own domain's frames this way is not supported. */
	if (mmc->dom == DOMID_SELF) {
		error = ENOTSUP;	/* Too paranoid? */
		goto done;
	}

	for (umme = mmc->entry, i = 0; i < mmc->num; i++, umme++) {
		privcmd_mmap_entry_t __mmapent, *mme = &__mmapent;
		caddr_t addr;

		if (ddi_copyin(umme, mme, sizeof (*mme), mode)) {
			error = EFAULT;
			break;
		}

		DTRACE_XPV3(mmap__entry, ulong_t, mme->va, ulong_t, mme->mfn,
		    ulong_t, mme->npages);

		if (mme->mfn == MFN_INVALID) {
			error = EINVAL;
			break;
		}

		addr = (caddr_t)mme->va;

		/*
		 * Find the segment we want to mess with, then add
		 * the mfn range to the segment.
		 */
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
		if ((seg = as_findseg(as, addr, 0)) == NULL ||
		    addr + mmu_ptob(mme->npages) > seg->s_base + seg->s_size)
			error = EINVAL;
		else
			error = segmf_add_mfns(seg, addr, mme->mfn,
			    mme->npages, mmc->dom);
		AS_LOCK_EXIT(as, &as->a_lock);
		if (error != 0)
			break;
	}
done:
	DTRACE_XPV1(mmap__end, int, error);
	return (error);
}
/*
 * This routine allocates space to save the sensitive kernel pages,
 * i.e. kernel data nucleus, kvalloc and kvseg segments.
 * It's assumed that those segments are the only areas that can be
 * contaminated by memory allocations during statefile dumping.
 * The space allocated here contains:
 *	A list of descriptors describing the saved sensitive pages.
 *	The storage area for saving the compressed sensitive kernel pages.
 * Since storage pages are allocated from segkmem, they need to be
 * excluded when saving.
 *
 * Returns 0 on success, ENOMEM when allocation or compression space is
 * exhausted after MAX_STORAGE_ALLOC_RETRY attempts.  Within the retry
 * loop, error == ENOMEM means the data area was too small and error ==
 * -1 means too few descriptors; each triggers the matching realloc.
 */
int
i_cpr_save_sensitive_kpages(void)
{
	static const char pages_fmt[] = "\n%s %s allocs\n"
	    " spages %ld, vpages %ld, diff %ld\n";
	int retry_cnt;
	int error = 0;
	pgcnt_t pages, spages, vpages;
	caddr_t	addr;
	char *str;

	/*
	 * Tag sensitive kpages. Allocate space for storage descriptors
	 * and storage data area based on the resulting bitmaps.
	 * Note: The storage space will be part of the sensitive
	 * segment, so we need to tag kpages here before the storage
	 * is actually allocated just so their space won't be accounted
	 * for. They will not be part of the statefile although those
	 * pages will be claimed by cprboot.
	 */
	cpr_clear_bitmaps();
	spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_setbit);
	vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);
	pages = spages - vpages;

	str = "i_cpr_save_sensitive_kpages:";
	CPR_DEBUG(CPR_DEBUG7, pages_fmt, "before", str, spages, vpages, pages);

	/*
	 * Allocate space to save the clean sensitive kpages
	 */
	for (retry_cnt = 0; retry_cnt < MAX_STORAGE_ALLOC_RETRY; retry_cnt++) {
		/*
		 * Alloc on first pass or realloc if we are retrying because
		 * of insufficient storage for sensitive pages
		 */
		if (retry_cnt == 0 || error == ENOMEM) {
			if (i_cpr_storage_data_base) {
				kmem_free(i_cpr_storage_data_base,
				    mmu_ptob(i_cpr_storage_data_sz));
				i_cpr_storage_data_base = NULL;
				i_cpr_storage_data_sz = 0;
			}
			addr = i_cpr_storage_data_alloc(pages,
			    &i_cpr_storage_data_sz, retry_cnt);
			if (addr == NULL) {
				CPR_DEBUG(CPR_DEBUG7,
				    "\n%s can't allocate data storage space!\n",
				    str);
				return (ENOMEM);
			}
			i_cpr_storage_data_base = addr;
			i_cpr_storage_data_end = addr +
			    mmu_ptob(i_cpr_storage_data_sz);
		}

		/*
		 * Allocate on first pass, only realloc if retry is because of
		 * insufficient descriptors, but reset contents on each pass
		 * (desc_alloc resets contents as well)
		 */
		if (retry_cnt == 0 || error == -1) {
			error = i_cpr_storage_desc_alloc(
			    &i_cpr_storage_desc_base,
			    &i_cpr_storage_desc_pgcnt,
			    &i_cpr_storage_desc_end, retry_cnt);
			if (error != 0)
				return (error);
		} else {
			i_cpr_storage_desc_init(i_cpr_storage_desc_base,
			    i_cpr_storage_desc_pgcnt, i_cpr_storage_desc_end);
		}

		/*
		 * We are ready to save the sensitive kpages to storage.
		 * We cannot trust what's tagged in the bitmaps anymore
		 * after storage allocations.  Clear up the bitmaps and
		 * retag the sensitive kpages again.  The storage pages
		 * should be untagged.
		 */
		cpr_clear_bitmaps();
		spages =
		    i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_setbit);
		vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);

		CPR_DEBUG(CPR_DEBUG7, pages_fmt, "after ", str,
		    spages, vpages, spages - vpages);

		/*
		 * Returns 0 on success, -1 if too few descriptors, and
		 * ENOMEM if not enough space to save sensitive pages
		 */
		CPR_DEBUG(CPR_DEBUG1, "compressing pages to storage...\n");
		error = i_cpr_save_to_storage();
		if (error == 0) {
			/* Saving to storage succeeded */
			CPR_DEBUG(CPR_DEBUG1, "compressed %d pages\n",
			    sensitive_pages_saved);
			break;
		} else if (error == -1)
			CPR_DEBUG(CPR_DEBUG1, "%s too few descriptors\n", str);
	}
	if (error == -1)
		error = ENOMEM;
	return (error);
}
/*
 * Fill in the remaining CPU context and initialize it.
 *
 * The caller has already zeroed *vgc and set the initial instruction and
 * stack pointers in vgc->user_regs; this routine fills in the segment
 * selectors, the per-vcpu trap table (from the cpu's IDT), the GDT frame,
 * control registers and hypervisor callback entry points, then hands the
 * completed context to the hypervisor.
 *
 * Returns the result of xen_vcpu_initialize() (0 on success).
 */
static int
mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
{
	uint_t vec, iopl;

	vgc->flags = VGCF_IN_KERNEL;

	/*
	 * fpu_ctx we leave as zero; on first fault we'll store
	 * sse_initial into it anyway.
	 */

#if defined(__amd64)
	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
#else
	vgc->user_regs.cs = KCS_SEL;
#endif
	vgc->user_regs.ds = KDS_SEL;
	vgc->user_regs.es = KDS_SEL;
	vgc->user_regs.ss = KDS_SEL;
	vgc->kernel_ss = KDS_SEL;

	/*
	 * Allow I/O privilege level for Dom0 kernel.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info))
		iopl = (PS_IOPL & 0x1000); /* ring 1 */
	else
		iopl = 0;

#if defined(__amd64)
	vgc->user_regs.fs = 0;
	vgc->user_regs.gs = 0;
	vgc->user_regs.rflags = F_OFF | iopl;
#elif defined(__i386)
	vgc->user_regs.fs = KFS_SEL;
	vgc->user_regs.gs = KGS_SEL;
	vgc->user_regs.eflags = F_OFF | iopl;
	vgc->event_callback_cs = vgc->user_regs.cs;
	vgc->failsafe_callback_cs = vgc->user_regs.cs;
#endif

	/*
	 * Initialize the trap_info_t from the IDT
	 */
#if !defined(__lint)
	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
#endif
	for (vec = 0; vec < NIDT; vec++) {
		trap_info_t *ti = &vgc->trap_ctxt[vec];

		/*
		 * A zero return means the IDT entry converted cleanly;
		 * fill in the fields the hypervisor wants for it.
		 * Non-convertible entries are left zeroed in trap_ctxt.
		 */
		if (xen_idt_to_trap_info(vec,
		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
			ti->cs = KCS_SEL;
			ti->vector = vec;
		}
	}

	/*
	 * No LDT
	 */

	/*
	 * (We assert in various places that the GDT is (a) aligned on a
	 * page boundary and (b) one page long, so this really should fit..)
	 */
#ifdef CRASH_XEN
	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#else
	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#endif
	vgc->gdt_ents = NGDT;

	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());

	/*
	 * %cr3 must be expressed in machine (not pseudo-physical) terms;
	 * PAE i386 additionally needs the special xen_pfn_to_cr3 encoding.
	 */
#if defined(__i386)
	if (mmu.pae_hat)
		vgc->ctrlreg[3] =
		    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
	else
#endif
		vgc->ctrlreg[3] =
		    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));

	vgc->ctrlreg[4] = getcr4();

	vgc->event_callback_eip = (uintptr_t)xen_callback;
	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
	vgc->flags |= VGCF_failsafe_disables_events;

#if defined(__amd64)
	/*
	 * XXPV should this be moved to init_cpu_syscall?
	 */
	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
	vgc->flags |= VGCF_syscall_disables_events;

	/* %gs must be clear (asserted above) for gs_base_kernel to apply */
	ASSERT(vgc->user_regs.gs == 0);
	vgc->gs_base_kernel = (uintptr_t)cp;
#endif

	return (xen_vcpu_initialize(cp->cpu_id, vgc));
}
/*
 * Create a guest virtual cpu context so that the virtual cpu
 * springs into life in the domain just about to call mp_startup()
 *
 * Virtual CPUs must be initialized once in the lifetime of the domain;
 * after that subsequent attempts to start them will fail with X_EEXIST.
 *
 * Thus 'alloc' -really- creates and initializes the virtual
 * CPU context just once. Once the initialisation succeeds, we never
 * free it, nor the regular cpu_t to which it refers.
 *
 * Returns cp on success, NULL on failure (after mach_cpucontext_free
 * has torn down whatever was set up).
 */
void *
mach_cpucontext_alloc(struct cpu *cp)
{
	kthread_t *tp = cp->cpu_thread;
	vcpu_guest_context_t vgc;

	/*
	 * err starts non-zero so that any early 'goto done' takes the
	 * failure path below.
	 */
	int err = 1;

	/*
	 * First, augment the incoming cpu structure
	 * - vcpu pointer reference
	 * - pending event storage area
	 * - physical address of GDT
	 */
	cp->cpu_m.mcpu_vcpu_info =
	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
	    sizeof (struct xen_evt_data), KM_SLEEP);
	cp->cpu_m.mcpu_gdtpa =
	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));

	/* the hypervisor requires the GDT to be read-only to the guest */
	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
		goto done;

	/*
	 * Now set up the vcpu context so that we can start this vcpu
	 * in the kernel at tp->t_pc (mp_startup).  Note that the
	 * thread will thread_exit() shortly after performing the
	 * initialization; in particular, we will *never* take a
	 * privilege transition on this thread.
	 */

	bzero(&vgc, sizeof (vgc));

#ifdef __amd64
	vgc.user_regs.rip = tp->t_pc;
	vgc.user_regs.rsp = tp->t_sp;
	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
#else
	vgc.user_regs.eip = tp->t_pc;
	vgc.user_regs.esp = tp->t_sp;
	vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
#endif
	/*
	 * XXPV	Fix resume, if Russ didn't already fix it.
	 *
	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
	 * that only lwps take traps that switch to the kernel stack;
	 * part of creating an lwp adjusts the stack by subtracting
	 * sizeof (struct regs) off t_stk.
	 *
	 * The more interesting question is, why do we do all the work
	 * of a fully fledged lwp for a plain thread?  In particular
	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
	 * or futz with the LDT.  This should probably all be done with an
	 * lwp context operator to keep pure thread context switch fast.
	 */
	vgc.kernel_sp = (ulong_t)tp->t_stk;

	err = mp_set_cpu_context(&vgc, cp);

done:
	/*
	 * On any failure, undo the partial setup above; the caller only
	 * sees NULL.
	 */
	if (err) {
		mach_cpucontext_free(cp, NULL, err);
		return (NULL);
	}
	return (cp);
}
/*
 * Detect whether we are running as a Xen HVM guest and, if so, wire up
 * the hypercall page, the xenstore parameters and the shared_info page.
 *
 * This is idempotent: the xen_hvm_inited latch makes repeat calls no-ops.
 * Each early 'return' leaves us with whatever subset of XEN_HVM_*
 * features was established up to that point.
 */
void
xen_hvm_init(void)
{
	struct cpuid_regs cp;
	uint32_t xen_signature[4], base;
	char *xen_str;
	struct xen_add_to_physmap xatp;
	xen_capabilities_info_t caps;
	pfn_t pfn;
	uint64_t msrval, val;
	extern int apix_enable;

	if (xen_hvm_inited != 0)
		return;

	xen_hvm_inited = 1;

	/*
	 * Xen's pseudo-cpuid function returns a string representing
	 * the Xen signature in %ebx, %ecx, and %edx.
	 * Loop over the base values, since it may be different if
	 * the hypervisor has hyper-v emulation switched on.
	 *
	 * %eax contains the maximum supported cpuid function.
	 */
	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
		cp.cp_eax = base;
		(void) __cpuid_insn(&cp);
		/* ebx/ecx/edx spell "XenVMMXenVMM"; [3] = 0 terminates it */
		xen_signature[0] = cp.cp_ebx;
		xen_signature[1] = cp.cp_ecx;
		xen_signature[2] = cp.cp_edx;
		xen_signature[3] = 0;
		xen_str = (char *)xen_signature;
		if (strcmp("XenVMMXenVMM", xen_str) == 0 &&
		    cp.cp_eax >= (base + 2))
			break;
	}
	if (base >= 0x40010000)
		return;	/* no Xen signature found: not an HVM guest */

	/*
	 * cpuid function at base + 1 returns the Xen version in %eax.  The
	 * top 16 bits are the major version, the bottom 16 are the minor
	 * version.
	 */
	cp.cp_eax = base + 1;
	(void) __cpuid_insn(&cp);
	xen_major = cp.cp_eax >> 16;
	xen_minor = cp.cp_eax & 0xffff;

	/*
	 * Below version 3.1 we can't do anything special as a HVM domain;
	 * the PV drivers don't work, many hypercalls are not available,
	 * etc.
	 */
	if (xen_major < 3 || (xen_major == 3 && xen_minor < 1))
		return;

	/*
	 * cpuid function at base + 2 returns information about the
	 * hypercall page.  %eax nominally contains the number of pages
	 * with hypercall code, but according to the Xen guys, "I'll
	 * guarantee that remains one forever more, so you can just
	 * allocate a single page and get quite upset if you ever see CPUID
	 * return more than one page."  %ebx contains an MSR we use to ask
	 * Xen to remap each page at a specific pfn.
	 */
	cp.cp_eax = base + 2;
	(void) __cpuid_insn(&cp);

	/*
	 * Let Xen know where we want the hypercall page mapped.  We
	 * already have a page allocated in the .text section to simplify
	 * the wrapper code.
	 */
	pfn = va_to_pfn(&hypercall_page);
	msrval = mmu_ptob(pfn);
	wrmsr(cp.cp_ebx, msrval);

	/* Fill in the xen_info data */
	xen_info = &__xen_info;
	(void) sprintf(xen_info->magic, "xen-%d.%d", xen_major, xen_minor);

	/* first real hypercall; failure means hypercalls don't work at all */
	if (hvm_get_param(HVM_PARAM_STORE_PFN, &val) < 0)
		return;

	/*
	 * The first hypercall worked, so mark hypercalls as working.
	 */
	xen_hvm_features |= XEN_HVM_HYPERCALLS;

	xen_info->store_mfn = (mfn_t)val;
	if (hvm_get_param(HVM_PARAM_STORE_EVTCHN, &val) < 0)
		return;
	xen_info->store_evtchn = (mfn_t)val;

	/* Figure out whether the hypervisor is 32-bit or 64-bit. */
	if ((HYPERVISOR_xen_version(XENVER_capabilities, &caps) == 0)) {
		/* force NUL termination before the strstr() probes */
		((char *)(caps))[sizeof (caps) - 1] = '\0';
		if (strstr(caps, "x86_64") != NULL)
			xen_bits = 64;
		else if (strstr(caps, "x86_32") != NULL)
			xen_bits = 32;
	}

	if (xen_bits < 0)
		return;	/* capabilities query failed or was unrecognized */
#ifdef __amd64
	ASSERT(xen_bits == 64);
#endif

	/*
	 * Allocate space for the shared_info page and tell Xen where it
	 * is.
	 */
	xen_shared_info_frame = va_to_pfn(&hypercall_shared_info_page);
	xatp.domid = DOMID_SELF;
	xatp.idx = 0;
	xatp.space = XENMAPSPACE_shared_info;
	xatp.gpfn = xen_shared_info_frame;
	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp) != 0)
		return;

	HYPERVISOR_shared_info = (void *)&hypercall_shared_info_page;

	/*
	 * A working HVM tlb flush hypercall was introduced in Xen 3.3.
	 */
	if (xen_major > 3 || (xen_major == 3 && xen_minor >= 3))
		xen_hvm_features |= XEN_HVM_TLBFLUSH;

	/* FIXME Disable apix for the time being */
	apix_enable = 0;
}
/*
 * Resolve a fault on one page of a segmf segment whose slot maps a
 * foreign domain's machine frame.  A placeholder mapping is loaded via
 * the HAT and then rewritten, through the hypervisor, to point at the
 * foreign MFN.  Returns 0 on success or FC_MAKE_ERR(EFAULT) if the
 * hypercall is refused (after unwinding the placeholder mapping and any
 * softlock accounting).
 */
/*ARGSUSED*/
static int
segmf_faultpage(struct hat *hat, struct seg *seg, caddr_t addr,
    enum fault_type type, uint_t prot)
{
	struct segmf_data *data = seg->s_data;
	segmf_map_t *slot;
	uint_t pgno;
	uint_t load_flags;
	mfn_t foreign_mfn;
	x86pte_t new_pte;

	pgno = seg_page(seg, addr);
	slot = &data->map[pgno];
	ASSERT(slot->t_type == SEGMF_MAP_MFN);
	foreign_mfn = slot->u.m.m_mfn;

	/* softlock faults pin the translation and are counted for unlock */
	load_flags = HAT_LOAD_NOCONSIST;
	if (type == F_SOFTLOCK) {
		mutex_enter(&freemem_lock);
		data->softlockcnt++;
		mutex_exit(&freemem_lock);
		load_flags |= HAT_LOAD_LOCK;
	} else {
		load_flags |= HAT_LOAD;
	}

	if (segmf_faultpage_debug > 0) {
		uprintf("segmf_faultpage: addr %p domid %x mfn %lx prot %x\n",
		    (void *)addr, data->domid, foreign_mfn, prot);
		segmf_faultpage_debug--;
	}

	/*
	 * Ask the HAT to load a throwaway mapping to page zero, then
	 * overwrite it with our foreign domain mapping. It gets removed
	 * later via hat_unload()
	 */
	hat_devload(hat, addr, MMU_PAGESIZE, (pfn_t)0,
	    PROT_READ | HAT_UNORDERED_OK, load_flags);

	new_pte = mmu_ptob((x86pte_t)foreign_mfn) |
	    PT_VALID | PT_USER | PT_FOREIGN;
	if (prot & PROT_WRITE)
		new_pte |= PT_WRITABLE;

	if (HYPERVISOR_update_va_mapping_otherdomain((uintptr_t)addr,
	    new_pte, UVMF_INVLPG | UVMF_ALL, data->domid) == 0)
		return (0);

	/*
	 * The hypercall was refused: tear down the placeholder mapping
	 * and, for softlock faults, undo the accounting taken above.
	 */
	{
		uint_t unload_flags = HAT_UNLOAD_UNMAP;

		if (type == F_SOFTLOCK) {
			unload_flags |= HAT_UNLOAD_UNLOCK;
			mutex_enter(&freemem_lock);
			data->softlockcnt--;
			mutex_exit(&freemem_lock);
		}
		hat_unload(hat, addr, MMU_PAGESIZE, unload_flags);
	}
	return (FC_MAKE_ERR(EFAULT));
}
/*
 * Report all hat's that either use PFN as a page table or that map the page.
 *
 * Walks the hat list (khat first), reading each hat, its htable hash
 * buckets and each htable through the debugger's address space.  For a
 * matching htable pfn a "Pagetable for ..." line is printed; otherwise
 * every valid PTE in the table is decoded and compared against the pfn.
 *
 * Returns DCMD_OK, or DCMD_ERR on any failed debugger read.
 *
 * Fix: removed the unused 'done:' label that trailed the function (no
 * goto ever targeted it).
 */
static int
do_report_maps(pfn_t pfn)
{
	struct hat *hatp;
	struct hat hat;
	htable_t *ht;
	htable_t htable;
	uintptr_t base;
	int h;
	int level;
	int entry;
	x86pte_t pte;
	x86pte_t buf;
	x86pte32_t *pte32 = (x86pte32_t *)&buf;
	physaddr_t paddr;
	size_t len;

	/*
	 * The hats are kept in a list with khat at the head.
	 */
	for (hatp = khat; hatp != NULL; hatp = hat.hat_next) {
		/*
		 * read the hat and its hash table
		 */
		if (mdb_vread(&hat, sizeof (hat), (uintptr_t)hatp) == -1) {
			mdb_warn("Couldn't read struct hat\n");
			return (DCMD_ERR);
		}

		/*
		 * read the htable hashtable
		 */
		paddr = 0;
		for (h = 0; h < hat.hat_num_hash; ++h) {
			if (mdb_vread(&ht, sizeof (htable_t *),
			    (uintptr_t)(hat.hat_ht_hash + h)) == -1) {
				mdb_warn("Couldn't read htable\n");
				return (DCMD_ERR);
			}

			/* follow the hash chain for this bucket */
			for (; ht != NULL; ht = htable.ht_next) {
				if (mdb_vread(&htable, sizeof (htable_t),
				    (uintptr_t)ht) == -1) {
					mdb_warn("Couldn't read htable\n");
					return (DCMD_ERR);
				}

				/*
				 * only report kernel addresses once
				 */
				if (hatp != khat &&
				    htable.ht_vaddr >= kernelbase)
					continue;

				/*
				 * Is the PFN a pagetable itself?
				 */
				if (htable.ht_pfn == pfn) {
					mdb_printf("Pagetable for "
					    "hat=%p htable=%p\n", hatp, ht);
					continue;
				}

				/*
				 * otherwise, examine page mappings
				 */
				level = htable.ht_level;
				if (level > mmu.max_page_level)
					continue;
				paddr = mmu_ptob((physaddr_t)htable.ht_pfn);
				for (entry = 0;
				    entry < HTABLE_NUM_PTES(&htable);
				    ++entry) {
					base = htable.ht_vaddr + entry *
					    mmu.level_size[level];

					/*
					 * only report kernel addresses once
					 */
					if (hatp != khat &&
					    base >= kernelbase)
						continue;

					len = mdb_pread(&buf, mmu.pte_size,
					    paddr + entry * mmu.pte_size);
					if (len != mmu.pte_size)
						return (DCMD_ERR);

					if (mmu.pte_size ==
					    sizeof (x86pte_t))
						pte = buf;
					else
						pte = *pte32;

					if ((pte & PT_VALID) == 0)
						continue;
					/*
					 * mask down to the physical address;
					 * large pages carry extra flag bits
					 */
					if (level == 0 ||
					    !(pte & PT_PAGESIZE))
						pte &= PT_PADDR;
					else
						pte &= PT_PADDR_LGPG;
					if (mmu_btop(mdb_ma_to_pa(pte)) != pfn)
						continue;
					mdb_printf("hat=%p maps addr=%p\n",
					    hatp, (caddr_t)base);
				}
			}
		}
	}

	return (DCMD_OK);
}
void vpm_init() { long npages; struct vpmap *vpm; struct vpmfree *vpmflp; int i, ndx; extern void prefetch_smap_w(void *); if (!vpm_cache_enable) { return; } /* * Set the size of the cache. */ vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100); if (vpm_cache_size < VPMAP_MINCACHE) { vpm_cache_size = VPMAP_MINCACHE; } /* * Number of freelists. */ if (vpm_nfreelist == 0) { vpm_nfreelist = max_ncpus; } else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) { cmn_err(CE_WARN, "vpmap create : number of freelist " "vpm_nfreelist %d using %d", vpm_nfreelist, max_ncpus); vpm_nfreelist = 2 * max_ncpus; } /* * Round it up to the next power of 2 */ if (vpm_nfreelist & (vpm_nfreelist - 1)) { vpm_nfreelist = 1 << (highbit(vpm_nfreelist)); } vpmd_freemsk = vpm_nfreelist - 1; /* * Use a per cpu rotor index to spread the allocations evenly * across the available vpm freelists. */ vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP); ndx = 0; for (i = 0; i < max_ncpus; i++) { vpmd_cpu[i].vfree_ndx = ndx; ndx = (ndx + 1) & vpmd_freemsk; } /* * Allocate and initialize the freelist. */ vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree), KM_SLEEP); for (i = 0; i < vpm_nfreelist; i++) { vpmflp = &vpmd_free[i]; /* * Set up initial queue pointers. They will get flipped * back and forth. */ vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ]; vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ]; } npages = mmu_btop(vpm_cache_size); /* * Allocate and initialize the vpmap structs. */ vpmd_vpmap = kmem_zalloc(sizeof (struct vpmap) * npages, KM_SLEEP); for (vpm = vpmd_vpmap; vpm <= &vpmd_vpmap[npages - 1]; vpm++) { struct vpmfree *vpmflp; union vpm_freeq *releq; struct vpmap *vpmapf; /* * Use prefetch as we have to walk thru a large number of * these data structures. We just use the smap's prefetch * routine as it does the same. This should work fine * for x64(this needs to be modifed when enabled on sparc). 
*/ prefetch_smap_w((void *)vpm); vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm); vpmflp = VPMAP2VMF(vpm); releq = vpmflp->vpm_releq; vpmapf = releq->vpmq_free; if (vpmapf == NULL) { releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm; } else { vpm->vpm_next = vpmapf; vpm->vpm_prev = vpmapf->vpm_prev; vpmapf->vpm_prev = vpm; vpm->vpm_prev->vpm_next = vpm; releq->vpmq_free = vpm->vpm_next; } /* * Indicate that the vpmap is on the releq at start */ vpm->vpm_ndxflg = VPMRELEQ; } }
/*
 * Dump the contents of the page table whose physical frame is 'pfn'.
 *
 * First searches every hat's htable hash chains for an htable owning
 * this pfn so that the table's level and base virtual address are known;
 * if no owner is found, the table is decoded assuming level 0 at vaddr 0.
 * Each non-zero PTE is then printed via do_pte_dcmd().
 *
 * Returns DCMD_OK, or DCMD_ERR on a failed debugger read.
 *
 * Fix: previously, if the search loops completed without a match (e.g.
 * an empty hat list), the code fell through to the 'htable.ht_pfn == pfn'
 * test on an uninitialized 'htable' (undefined behavior) and could print
 * a stale 'ht'.  An explicit 'found' flag now records whether the goto
 * was taken.  The unused trailing 'done:' label and the dead
 * 'paddr = 0' store were also removed.
 */
static int
do_ptable_dcmd(pfn_t pfn)
{
	struct hat *hatp;
	struct hat hat;
	htable_t *ht;
	htable_t htable;
	uintptr_t base;
	int h;
	int level;
	int entry;
	uintptr_t pagesize;
	x86pte_t pte;
	x86pte_t buf;
	x86pte32_t *pte32 = (x86pte32_t *)&buf;
	physaddr_t paddr;
	size_t len;
	int found = 0;	/* set once an htable owning pfn is located */

	/*
	 * The hats are kept in a list with khat at the head.
	 */
	for (hatp = khat; hatp != NULL; hatp = hat.hat_next) {
		/*
		 * read the hat and its hash table
		 */
		if (mdb_vread(&hat, sizeof (hat), (uintptr_t)hatp) == -1) {
			mdb_warn("Couldn't read struct hat\n");
			return (DCMD_ERR);
		}

		/*
		 * read the htable hashtable
		 */
		for (h = 0; h < hat.hat_num_hash; ++h) {
			if (mdb_vread(&ht, sizeof (htable_t *),
			    (uintptr_t)(hat.hat_ht_hash + h)) == -1) {
				mdb_warn("Couldn't read htable\n");
				return (DCMD_ERR);
			}

			for (; ht != NULL; ht = htable.ht_next) {
				if (mdb_vread(&htable, sizeof (htable_t),
				    (uintptr_t)ht) == -1) {
					mdb_warn("Couldn't read htable\n");
					return (DCMD_ERR);
				}

				/*
				 * Is this the PFN for this htable
				 */
				if (htable.ht_pfn == pfn) {
					found = 1;
					goto found_it;
				}
			}
		}
	}

found_it:
	if (found) {
		mdb_printf("htable=%p\n", ht);
		level = htable.ht_level;
		base = htable.ht_vaddr;
		pagesize = mmu.level_size[level];
	} else {
		mdb_printf("Unknown pagetable - assuming level/addr 0");
		level = 0;	/* assume level == 0 for PFN */
		base = 0;
		pagesize = MMU_PAGESIZE;
	}

	paddr = mmu_ptob((physaddr_t)pfn);
	for (entry = 0; entry < mmu.ptes_per_table; ++entry) {
		len = mdb_pread(&buf, mmu.pte_size,
		    paddr + entry * mmu.pte_size);
		if (len != mmu.pte_size)
			return (DCMD_ERR);

		if (mmu.pte_size == sizeof (x86pte_t))
			pte = buf;
		else
			pte = *pte32;

		if (pte == 0)
			continue;

		mdb_printf("[%3d] va=%p ", entry, base + entry * pagesize);
		do_pte_dcmd(level, pte);
	}

	return (DCMD_OK);
}