/*
 * The list of mfn pages is out of date.  Recompute it.
 */
static void
rebuild_mfn_list(void)
{
	int i = 0;
	size_t sz;
	size_t off;
	pfn_t pfn;

	SUSPEND_DEBUG("rebuild_mfn_list\n");

	sz = ((mfn_count * sizeof (mfn_t)) + MMU_PAGEOFFSET) & MMU_PAGEMASK;

	for (off = 0; off < sz; off += MMU_PAGESIZE) {
		size_t j = mmu_btop(off);

		if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) {
			pfn = hat_getpfnum(kas.a_hat,
			    (caddr_t)&mfn_list_pages[j]);
			mfn_list_pages_page[i++] = pfn_to_mfn(pfn);
		}

		pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list + off);
		mfn_list_pages[j] = pfn_to_mfn(pfn);
	}

	pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list_pages_page);
	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
	    pfn_to_mfn(pfn);
}
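/*
 * Illustrative sketch (not part of the original source): the core
 * VA -> PFN -> MFN lookup that rebuild_mfn_list() repeats for every page
 * of the mfn_list.  Assumes an x86 paravirtualized Xen domain where
 * hat_getpfnum() and pfn_to_mfn() are available and "va" is a kernel
 * virtual address mapped in kas.
 */
static mfn_t
example_va_to_mfn(caddr_t va)
{
	pfn_t pfn = hat_getpfnum(kas.a_hat, va);

	ASSERT(pfn != PFN_INVALID);
	return (pfn_to_mfn(pfn));
}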
/**
 * Allocates physical memory which satisfies the given constraints.
 *
 * @param   uPhysHi     The upper physical address limit (inclusive).
 * @param   puPhys      Where to store the physical address of the allocated
 *                      memory. Optional, can be NULL.
 * @param   cb          Size of allocation.
 * @param   uAlignment  Alignment.
 * @param   fContig     Whether the memory must be physically contiguous or
 *                      not.
 *
 * @returns Virtual address of allocated memory block or NULL if allocation
 *          failed.
 */
DECLHIDDEN(void *) rtR0SolMemAlloc(uint64_t uPhysHi, uint64_t *puPhys, size_t cb, uint64_t uAlignment,
                                   bool fContig)
{
    if ((cb & PAGEOFFSET) != 0)
        return NULL;

    size_t cPages = (cb + PAGESIZE - 1) >> PAGESHIFT;
    if (!cPages)
        return NULL;

    ddi_dma_attr_t DmaAttr = s_rtR0SolDmaAttr;
    DmaAttr.dma_attr_addr_hi = uPhysHi;
    DmaAttr.dma_attr_align   = uAlignment;
    if (!fContig)
        DmaAttr.dma_attr_sgllen = cPages > INT_MAX ? INT_MAX - 1 : cPages;
    else
        AssertRelease(DmaAttr.dma_attr_sgllen == 1);

    void *pvMem = contig_alloc(cb, &DmaAttr, PAGESIZE, 1 /* can sleep */);
    if (!pvMem)
    {
        LogRel(("rtR0SolMemAlloc failed. cb=%u Align=%u fContig=%d\n", (unsigned)cb, (unsigned)uAlignment, fContig));
        return NULL;
    }

    pfn_t PageFrameNum = hat_getpfnum(kas.a_hat, (caddr_t)pvMem);
    AssertRelease(PageFrameNum != PFN_INVALID);

    if (puPhys)
        *puPhys = (uint64_t)PageFrameNum << PAGESHIFT;

    return pvMem;
}
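/*
 * Illustrative sketch (not part of the original source): how a caller might
 * request one page of physically contiguous memory below 4 GiB.  The release
 * routine is assumed to exist elsewhere in the port and is named
 * rtR0SolMemFree() here purely for illustration.
 */
static int example_alloc_low_page(void)
{
    uint64_t uPhys = 0;
    void    *pv    = rtR0SolMemAlloc(UINT64_C(0xffffffff) /* uPhysHi: below 4 GiB */,
                                     &uPhys, PAGESIZE, PAGESIZE, true /* fContig */);
    if (!pv)
        return VERR_NO_CONT_MEMORY;

    /* ... hand pv / uPhys to a device for DMA ... */

    rtR0SolMemFree(pv, PAGESIZE);   /* hypothetical matching free routine */
    return VINF_SUCCESS;
}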
/*
 * Set up our xenstore page and event channel.  Domain 0 needs to allocate a
 * page and event channel; other domains use what we are told.
 */
void
xb_init(void)
{
	int err;

	if (DOMAIN_IS_INITDOMAIN(xen_info)) {

		if (xb_addr != NULL)
			return;

		xb_addr = ddi_umem_alloc(PAGESIZE, DDI_UMEM_SLEEP,
		    &xb_cookie);
		xen_info->store_mfn =
		    pfn_to_mfn(hat_getpfnum(kas.a_hat, xb_addr));

		err = xen_alloc_unbound_evtchn(0,
		    (int *)&xen_info->store_evtchn);
		ASSERT(err == 0);
	} else {
		/*
		 * This is harmless on first boot, but needed for resume and
		 * migrate.  We use kbm_map_ma() as a shortcut instead of
		 * directly using HYPERVISOR_update_va_mapping().
		 */
		ASSERT(xb_addr != NULL);
		kbm_map_ma(mfn_to_ma(xen_info->store_mfn),
		    (uintptr_t)xb_addr, 0);
	}

	ASSERT(xen_info->store_evtchn);
}
static void
segkmem_xdump_range(void *arg, void *start, size_t size)
{
	struct as *as = arg;
	caddr_t addr = start;
	caddr_t addr_end = addr + size;

	while (addr < addr_end) {
		pfn_t pfn = hat_getpfnum(kas.a_hat, addr);
		if (pfn != PFN_INVALID && pfn <= physmax && pf_is_memory(pfn))
			dump_addpage(as, addr, pfn);
		addr += PAGESIZE;
		dump_timeleft = dump_timeout;
	}
}
/**
 * Returns the physical address for a virtual address.
 *
 * @param pv        The virtual address.
 *
 * @returns The physical address corresponding to @a pv.
 */
static uint64_t rtR0MemObjSolVirtToPhys(void *pv)
{
    struct hat *pHat         = NULL;
    pfn_t       PageFrameNum = 0;
    uintptr_t   uVirtAddr    = (uintptr_t)pv;

    if (SOL_IS_KRNL_ADDR(pv))
        pHat = kas.a_hat;
    else
    {
        proc_t *pProcess = (proc_t *)RTR0ProcHandleSelf();
        AssertRelease(pProcess);
        pHat = pProcess->p_as->a_hat;
    }

    PageFrameNum = hat_getpfnum(pHat, (caddr_t)(uVirtAddr & PAGEMASK));
    AssertReleaseMsg(PageFrameNum != PFN_INVALID, ("rtR0MemObjSolVirtToPhys failed. pv=%p\n", pv));
    return (((uint64_t)PageFrameNum << PAGE_SHIFT) | (uVirtAddr & PAGE_OFFSET_MASK));
}
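/*
 * Illustrative sketch (not part of the original source): the PFN/offset
 * composition above means an unaligned pointer resolves to the physical
 * base of its page plus the in-page offset.  The stack buffer below is
 * purely for illustration.
 */
static void example_virt_to_phys(void)
{
    uint8_t  abBuf[64];
    uint64_t uPhys = rtR0MemObjSolVirtToPhys(&abBuf[13]);

    /* uPhys == (physical base of abBuf[13]'s page) + (address & PAGE_OFFSET_MASK) */
    (void)uPhys;
}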
/*
 * Map address "addr" in address space "as" into a kernel virtual address.
 * The memory is guaranteed to be resident and locked down.
 */
static caddr_t
mapin(struct as *as, caddr_t addr, int writing)
{
	page_t *pp;
	caddr_t kaddr;
	pfn_t pfnum;

	/*
	 * NB: Because of past mistakes, we have bits being returned
	 * by getpfnum that are actually the page type bits of the pte.
	 * When the object we are trying to map is a memory page with
	 * a page structure everything is ok and we can use the optimal
	 * method, ppmapin.  Otherwise, we have to do something special.
	 */
	pfnum = hat_getpfnum(as->a_hat, addr);
	if (pf_is_memory(pfnum)) {
		pp = page_numtopp_nolock(pfnum);
		if (pp != NULL) {
			ASSERT(PAGE_LOCKED(pp));
			kaddr = ppmapin(pp, writing ?
			    (PROT_READ | PROT_WRITE) : PROT_READ,
			    (caddr_t)-1);
			return (kaddr + ((uintptr_t)addr & PAGEOFFSET));
		}
	}

	/*
	 * Oh well, we didn't have a page struct for the object we were
	 * trying to map in; ppmapin doesn't handle devices, but allocating a
	 * heap address allows ppmapout to free virtual space when done.
	 */
	kaddr = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
	hat_devload(kas.a_hat, kaddr, PAGESIZE, pfnum,
	    writing ? (PROT_READ | PROT_WRITE) : PROT_READ, HAT_LOAD_LOCK);
	return (kaddr + ((uintptr_t)addr & PAGEOFFSET));
}
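/*
 * Illustrative sketch (not part of the original source): a plausible
 * teardown matching mapin() above.  The real driver has its own unmap
 * routine; this sketch only shows the asymmetry the comment describes,
 * using the documented ppmapout()/hat_unload()/vmem_free() interfaces.
 * "kaddr" is the page-aligned address returned by mapin() (offset
 * stripped); "have_page" records whether the ppmapin() path was taken.
 */
static void
example_mapout(caddr_t kaddr, int have_page)
{
	if (have_page) {
		/* memory page: undo ppmapin() */
		ppmapout(kaddr);
	} else {
		/* device page: undo hat_devload() and release the VA */
		hat_unload(kas.a_hat, kaddr, PAGESIZE, HAT_UNLOAD_UNLOCK);
		vmem_free(heap_arena, kaddr, PAGESIZE);
	}
}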
int
xen_ldt_setprot(user_desc_t *ldt, size_t lsize, uint_t prot)
{
	int err;
	caddr_t lva = (caddr_t)ldt;
#if defined(__amd64)
	int pt_bits = PT_VALID;
	pgcnt_t npgs;
	if (prot & PROT_WRITE)
		pt_bits |= PT_WRITABLE;
#endif	/* __amd64 */

	if ((err = as_setprot(&kas, (caddr_t)ldt, lsize, prot)) != 0)
		goto done;

#if defined(__amd64)

	ASSERT(IS_P2ALIGNED(lsize, PAGESIZE));
	npgs = mmu_btop(lsize);
	while (npgs--) {
		if ((err = xen_kpm_page(hat_getpfnum(kas.a_hat, lva),
		    pt_bits)) != 0)
			break;
		lva += PAGESIZE;
	}

#endif	/* __amd64 */

done:
	if (err) {
		cmn_err(CE_WARN, "xen_ldt_setprot(%p, %s) failed: error %d",
		    (void *)lva,
		    (prot & PROT_WRITE) ? "writable" : "read-only", err);
	}

	return (err);
}
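/*
 * Illustrative sketch (not part of the original source): the typical call
 * pattern around xen_ldt_setprot() on a paravirtualized domain.  An LDT
 * handed to the hypervisor must first be made read-only in the kernel's
 * view; before its pages are reused it is made writable again.  "ldt" and
 * "lsize" are assumed to be a page-aligned allocation.
 */
static int
example_ldt_lifecycle(user_desc_t *ldt, size_t lsize)
{
	int err;

	if ((err = xen_ldt_setprot(ldt, lsize, PROT_READ)) != 0)
		return (err);

	/* ... the hypervisor now uses the LDT ... */

	return (xen_ldt_setprot(ldt, lsize, PROT_READ | PROT_WRITE));
}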
/* * Top level routine to direct suspend/resume of a domain. */ void xen_suspend_domain(void) { extern void rtcsync(void); extern hrtime_t hres_last_tick; mfn_t start_info_mfn; ulong_t flags; pfn_t pfn; int i; /* * Check that we are happy to suspend on this hypervisor. */ if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) { cpr_err(CE_WARN, "Cannot suspend on this hypervisor " "version: v%lu.%lu%s, need at least version v3.0.4 or " "-xvm based hypervisor", XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor), XENVER_CURRENT(xv_ver)); return; } /* * XXPV - Are we definitely OK to suspend by the time we've connected * the handler? */ cpr_err(CE_NOTE, "Domain suspending for save/migrate"); SUSPEND_DEBUG("xen_suspend_domain\n"); /* * suspend interrupts and devices * XXPV - we use suspend/resume for both save/restore domains (like sun * cpr) and for migration. Would be nice to know the difference if * possible. For save/restore where down time may be a long time, we * may want to do more of the things that cpr does. (i.e. notify user * processes, shrink memory footprint for faster restore, etc.) */ xen_suspend_devices(); SUSPEND_DEBUG("xenbus_suspend\n"); xenbus_suspend(); pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info); start_info_mfn = pfn_to_mfn(pfn); /* * XXPV: cpu hotplug can hold this under a xenbus watch. Are we safe * wrt xenbus being suspended here? */ mutex_enter(&cpu_lock); /* * Suspend must be done on vcpu 0, as no context for other CPUs is * saved. * * XXPV - add to taskq API ? */ thread_affinity_set(curthread, 0); kpreempt_disable(); SUSPEND_DEBUG("xen_start_migrate\n"); xen_start_migrate(); if (ncpus > 1) suspend_cpus(); /* * We can grab the ec_lock as it's a spinlock with a high SPL. Hence * any holder would have dropped it to get through suspend_cpus(). */ mutex_enter(&ec_lock); /* * From here on in, we can't take locks. */ SUSPEND_DEBUG("ec_suspend\n"); ec_suspend(); SUSPEND_DEBUG("gnttab_suspend\n"); gnttab_suspend(); flags = intr_clear(); xpv_time_suspend(); /* * Currently, the hypervisor incorrectly fails to bring back * powered-down VCPUs. Thus we need to record any powered-down VCPUs * to prevent any attempts to operate on them. But we have to do this * *after* the very first time we do ec_suspend(). */ for (i = 1; i < ncpus; i++) { if (cpu[i] == NULL) continue; if (cpu_get_state(cpu[i]) == P_POWEROFF) CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i); } /* * The dom0 save/migrate code doesn't automatically translate * these into PFNs, but expects them to be, so we do it here. * We don't use mfn_to_pfn() because so many OS services have * been disabled at this point. */ xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn]; xen_info->console.domU.mfn = mfn_to_pfn_mapping[xen_info->console.domU.mfn]; if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) { prom_printf("xen_suspend_domain(): " "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n"); (void) HYPERVISOR_shutdown(SHUTDOWN_crash); } if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info, 0, UVMF_INVLPG)) { prom_printf("xen_suspend_domain(): " "HYPERVISOR_update_va_mapping() failed\n"); (void) HYPERVISOR_shutdown(SHUTDOWN_crash); } SUSPEND_DEBUG("HYPERVISOR_suspend\n"); /* * At this point we suspend and sometime later resume. */ if (HYPERVISOR_suspend(start_info_mfn)) { prom_printf("xen_suspend_domain(): " "HYPERVISOR_suspend() failed\n"); (void) HYPERVISOR_shutdown(SHUTDOWN_crash); } /* * Point HYPERVISOR_shared_info to its new value. 
*/ if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info, xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE, UVMF_INVLPG)) (void) HYPERVISOR_shutdown(SHUTDOWN_crash); if (xen_info->nr_pages != mfn_count) { prom_printf("xen_suspend_domain(): number of pages" " changed, was 0x%lx, now 0x%lx\n", mfn_count, xen_info->nr_pages); (void) HYPERVISOR_shutdown(SHUTDOWN_crash); } xpv_time_resume(); cached_max_mfn = 0; SUSPEND_DEBUG("gnttab_resume\n"); gnttab_resume(); /* XXPV: add a note that this must be lockless. */ SUSPEND_DEBUG("ec_resume\n"); ec_resume(); intr_restore(flags); if (ncpus > 1) resume_cpus(); mutex_exit(&ec_lock); xen_end_migrate(); mutex_exit(&cpu_lock); /* * Now we can take locks again. */ /* * Force the tick value used for tv_nsec in hres_tick() to be up to * date. rtcsync() will reset the hrestime value appropriately. */ hres_last_tick = xpv_gethrtime(); /* * XXPV: we need to have resumed the CPUs since this takes locks, but * can remote CPUs see bad state? Presumably yes. Should probably nest * taking of todlock inside of cpu_lock, or vice versa, then provide an * unlocked version. Probably need to call clkinitf to reset cpu freq * and re-calibrate if we migrated to a different speed cpu. Also need * to make a (re)init_cpu_info call to update processor info structs * and device tree info. That remains to be written at the moment. */ rtcsync(); rebuild_mfn_list(); SUSPEND_DEBUG("xenbus_resume\n"); xenbus_resume(); SUSPEND_DEBUG("xenbus_resume_devices\n"); xen_resume_devices(); thread_affinity_clear(curthread); kpreempt_enable(); SUSPEND_DEBUG("finished xen_suspend_domain\n"); /* * We have restarted our suspended domain, update the hypervisor * details. NB: This must be done at the end of this function, * since we need the domain to be completely resumed before * these functions will work correctly. */ xen_set_version(XENVER_CURRENT_IDX); /* * We can check and report a warning, but we don't stop the * process. */ if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s " "but need at least version v3.0.4", XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor), XENVER_CURRENT(xv_ver)); cmn_err(CE_NOTE, "domain restore/migrate completed"); }
/* * This function performs the following tasks: * - Read the sizes of the new kernel and boot archive. * - Allocate memory for the new kernel and boot archive. * - Allocate memory for page tables necessary for mapping the memory * allocated for the files. * - Read the new kernel and boot archive into memory. * - Map in the fast reboot switcher. * - Load the fast reboot switcher to FASTBOOT_SWTCH_PA. * - Build the new multiboot_info structure * - Build page tables for the low 1G of physical memory. * - Mark the data structure as valid if all steps have succeeded. */ void fastboot_load_kernel(char *mdep) { void *buf = NULL; int i; fastboot_file_t *fb; uint32_t dboot_start_offset; char kern_bootpath[OBP_MAXPATHLEN]; extern uintptr_t postbootkernelbase; uintptr_t saved_kernelbase; int bootpath_len = 0; int is_failsafe = 0; int is_retry = 0; uint64_t end_addr; if (!fastreboot_capable) return; if (newkernel.fi_valid) fastboot_free_newkernel(&newkernel); saved_kernelbase = postbootkernelbase; postbootkernelbase = 0; /* * Initialize various HAT related fields in the data structure */ fastboot_init_fields(&newkernel); bzero(kern_bootpath, OBP_MAXPATHLEN); /* * Process the boot argument */ bzero(fastboot_args, OBP_MAXPATHLEN); fastboot_parse_mdep(mdep, kern_bootpath, &bootpath_len, fastboot_args); /* * Make sure we get the null character */ bcopy(kern_bootpath, fastboot_filename[FASTBOOT_NAME_UNIX], bootpath_len); bcopy(kern_bootfile, &fastboot_filename[FASTBOOT_NAME_UNIX][bootpath_len], strlen(kern_bootfile) + 1); bcopy(kern_bootpath, fastboot_filename[FASTBOOT_NAME_BOOTARCHIVE], bootpath_len); if (bcmp(kern_bootfile, FAILSAFE_BOOTFILE32, (sizeof (FAILSAFE_BOOTFILE32) - 1)) == 0 || bcmp(kern_bootfile, FAILSAFE_BOOTFILE64, (sizeof (FAILSAFE_BOOTFILE64) - 1)) == 0) { is_failsafe = 1; } load_kernel_retry: /* * Read in unix and boot_archive */ end_addr = DBOOT_ENTRY_ADDRESS; for (i = 0; i < FASTBOOT_MAX_FILES_MAP; i++) { struct _buf *file; uintptr_t va; uint64_t fsize; size_t fsize_roundup, pt_size; int page_index; uintptr_t offset; ddi_dma_attr_t dma_attr = fastboot_dma_attr; dprintf("fastboot_filename[%d] = %s\n", i, fastboot_filename[i]); if ((file = kobj_open_file(fastboot_filename[i])) == (struct _buf *)-1) { cmn_err(CE_NOTE, "!Fastboot: Couldn't open %s", fastboot_filename[i]); goto err_out; } if (kobj_get_filesize(file, &fsize) != 0) { cmn_err(CE_NOTE, "!Fastboot: Couldn't get filesize for %s", fastboot_filename[i]); goto err_out; } fsize_roundup = P2ROUNDUP_TYPED(fsize, PAGESIZE, size_t); /* * Where the files end in physical memory after being * relocated by the fast boot switcher. */ end_addr += fsize_roundup; if (end_addr > fastboot_below_1G_dma_attr.dma_attr_addr_hi) { cmn_err(CE_NOTE, "!Fastboot: boot archive is too big"); goto err_out; } /* * Adjust dma_attr_addr_lo so that the new kernel and boot * archive will not be overridden during relocation. */ if (end_addr > fastboot_dma_attr.dma_attr_addr_lo || end_addr > fastboot_below_1G_dma_attr.dma_attr_addr_lo) { if (is_retry) { /* * If we have already tried and didn't succeed, * just give up. */ cmn_err(CE_NOTE, "!Fastboot: boot archive is too big"); goto err_out; } else { /* Set the flag so we don't keep retrying */ is_retry++; /* Adjust dma_attr_addr_lo */ fastboot_dma_attr.dma_attr_addr_lo = end_addr; fastboot_below_1G_dma_attr.dma_attr_addr_lo = end_addr; /* * Free the memory we have already allocated * whose physical addresses might not fit * the new lo and hi constraints. 
				 */
				fastboot_free_mem(&newkernel, end_addr);
				goto load_kernel_retry;
			}
		}

		if (!fastboot_contig)
			dma_attr.dma_attr_sgllen = (fsize / PAGESIZE) +
			    (((fsize % PAGESIZE) == 0) ? 0 : 1);

		if ((buf = contig_alloc(fsize, &dma_attr, PAGESIZE, 0))
		    == NULL) {
			cmn_err(CE_NOTE, fastboot_enomem_msg, fsize, "64G");
			goto err_out;
		}

		va = P2ROUNDUP_TYPED((uintptr_t)buf, PAGESIZE, uintptr_t);

		if (kobj_read_file(file, (char *)va, fsize, 0) < 0) {
			cmn_err(CE_NOTE, "!Fastboot: Couldn't read %s",
			    fastboot_filename[i]);
			goto err_out;
		}

		fb = &newkernel.fi_files[i];
		fb->fb_va = va;
		fb->fb_size = fsize;
		fb->fb_sectcnt = 0;

		pt_size = FASTBOOT_PTE_LIST_SIZE(fsize_roundup);

		/*
		 * If we have reserved memory but it is not enough, free it.
		 */
		if (fb->fb_pte_list_size && fb->fb_pte_list_size < pt_size) {
			contig_free((void *)fb->fb_pte_list_va,
			    fb->fb_pte_list_size);
			fb->fb_pte_list_size = 0;
		}

		if (fb->fb_pte_list_size == 0) {
			if ((fb->fb_pte_list_va = (x86pte_t *)contig_alloc(
			    pt_size, &fastboot_below_1G_dma_attr,
			    PAGESIZE, 0)) == NULL) {
				cmn_err(CE_NOTE, fastboot_enomem_msg,
				    (uint64_t)pt_size, "1G");
				goto err_out;
			}
			/*
			 * fb_pte_list_size must be set after the allocation
			 * succeeds as it's used to determine how much memory
			 * to free.
			 */
			fb->fb_pte_list_size = pt_size;
		}

		bzero((void *)(fb->fb_pte_list_va), fb->fb_pte_list_size);

		fb->fb_pte_list_pa = mmu_ptob((uint64_t)hat_getpfnum(kas.a_hat,
		    (caddr_t)fb->fb_pte_list_va));

		for (page_index = 0, offset = 0; offset < fb->fb_size;
		    offset += PAGESIZE) {
			uint64_t paddr;

			paddr = mmu_ptob((uint64_t)hat_getpfnum(kas.a_hat,
			    (caddr_t)fb->fb_va + offset));

			ASSERT(paddr >= fastboot_dma_attr.dma_attr_addr_lo);

			/*
			 * Include the pte_bits so we don't have to make
			 * it in assembly.
			 */
			fb->fb_pte_list_va[page_index++] = (x86pte_t)
			    (paddr | pte_bits);
		}

		fb->fb_pte_list_va[page_index] = FASTBOOT_TERMINATE;

		if (i == FASTBOOT_UNIX) {
			Ehdr	*ehdr = (Ehdr *)va;
			int	j;

			/*
			 * Sanity checks:
			 */
			for (j = 0; j < SELFMAG; j++) {
				if (ehdr->e_ident[j] != ELFMAG[j]) {
					cmn_err(CE_NOTE, "!Fastboot: Bad ELF "
					    "signature");
					goto err_out;
				}
			}

			if (ehdr->e_ident[EI_CLASS] == ELFCLASS32 &&
			    ehdr->e_ident[EI_DATA] == ELFDATA2LSB &&
			    ehdr->e_machine == EM_386) {

				fb->fb_sectcnt = sizeof (fb->fb_sections) /
				    sizeof (fb->fb_sections[0]);

				if (fastboot_elf32_find_loadables((void *)va,
				    fsize, &fb->fb_sections[0],
				    &fb->fb_sectcnt, &dboot_start_offset) < 0) {
					cmn_err(CE_NOTE, "!Fastboot: ELF32 "
					    "program section failure");
					goto err_out;
				}

				if (fb->fb_sectcnt == 0) {
					cmn_err(CE_NOTE, "!Fastboot: No ELF32 "
					    "program sections found");
					goto err_out;
				}

				if (is_failsafe) {
					/* Failsafe boot_archive */
					bcopy(BOOTARCHIVE32_FAILSAFE,
					    &fastboot_filename
					    [FASTBOOT_NAME_BOOTARCHIVE]
					    [bootpath_len],
					    sizeof (BOOTARCHIVE32_FAILSAFE));
				} else {
					bcopy(BOOTARCHIVE32,
					    &fastboot_filename
					    [FASTBOOT_NAME_BOOTARCHIVE]
					    [bootpath_len],
					    sizeof (BOOTARCHIVE32));
				}

			} else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64 &&
			    ehdr->e_ident[EI_DATA] == ELFDATA2LSB &&
			    ehdr->e_machine == EM_AMD64) {

				if (fastboot_elf64_find_dboot_load_offset(
				    (void *)va, fsize,
				    &dboot_start_offset) != 0) {
					cmn_err(CE_NOTE, "!Fastboot: Couldn't "
					    "find ELF64 dboot entry offset");
					goto err_out;
				}

				if (!is_x86_feature(x86_featureset,
				    X86FSET_64) ||
				    !is_x86_feature(x86_featureset,
				    X86FSET_PAE)) {
					cmn_err(CE_NOTE, "Fastboot: Cannot "
					    "reboot to %s: "
					    "not a 64-bit capable system",
					    kern_bootfile);
					goto err_out;
				}

				if (is_failsafe) {
					/* Failsafe boot_archive */
					bcopy(BOOTARCHIVE64_FAILSAFE,
					    &fastboot_filename
					    [FASTBOOT_NAME_BOOTARCHIVE]
					    [bootpath_len],
					    sizeof (BOOTARCHIVE64_FAILSAFE));
				} else {
bcopy(BOOTARCHIVE64, &fastboot_filename [FASTBOOT_NAME_BOOTARCHIVE] [bootpath_len], sizeof (BOOTARCHIVE64)); } } else { cmn_err(CE_NOTE, "!Fastboot: Unknown ELF type"); goto err_out; } fb->fb_dest_pa = DBOOT_ENTRY_ADDRESS - dboot_start_offset; fb->fb_next_pa = DBOOT_ENTRY_ADDRESS + fsize_roundup; } else { fb->fb_dest_pa = newkernel.fi_files[i - 1].fb_next_pa; fb->fb_next_pa = fb->fb_dest_pa + fsize_roundup; } kobj_close_file(file); } /* * Add the function that will switch us to 32-bit protected mode */ fb = &newkernel.fi_files[FASTBOOT_SWTCH]; fb->fb_va = fb->fb_dest_pa = FASTBOOT_SWTCH_PA; fb->fb_size = MMU_PAGESIZE; hat_devload(kas.a_hat, (caddr_t)fb->fb_va, MMU_PAGESIZE, mmu_btop(fb->fb_dest_pa), PROT_READ | PROT_WRITE | PROT_EXEC, HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK); /* * Build the new multiboot_info structure */ if (fastboot_build_mbi(fastboot_args, &newkernel) != 0) { goto err_out; } /* * Build page table for low 1G physical memory. Use big pages. * Allocate 4 (5 for amd64) pages for the page tables. * 1 page for PML4 (amd64) * 1 page for Page-Directory-Pointer Table * 2 pages for Page Directory * 1 page for Page Table. * The page table entry will be rewritten to map the physical * address as we do the copying. */ if (newkernel.fi_has_pae) { #ifdef __amd64 size_t size = MMU_PAGESIZE * 5; #else size_t size = MMU_PAGESIZE * 4; #endif /* __amd64 */ if (newkernel.fi_pagetable_size && newkernel.fi_pagetable_size < size) { contig_free((void *)newkernel.fi_pagetable_va, newkernel.fi_pagetable_size); newkernel.fi_pagetable_size = 0; } if (newkernel.fi_pagetable_size == 0) { if ((newkernel.fi_pagetable_va = (uintptr_t) contig_alloc(size, &fastboot_below_1G_dma_attr, MMU_PAGESIZE, 0)) == NULL) { cmn_err(CE_NOTE, fastboot_enomem_msg, (uint64_t)size, "1G"); goto err_out; } /* * fi_pagetable_size must be set after the allocation * succeeds as it's used to determine how much memory to * free. */ newkernel.fi_pagetable_size = size; } bzero((void *)(newkernel.fi_pagetable_va), size); newkernel.fi_pagetable_pa = mmu_ptob((uint64_t)hat_getpfnum(kas.a_hat, (caddr_t)newkernel.fi_pagetable_va)); newkernel.fi_last_table_pa = newkernel.fi_pagetable_pa + size - MMU_PAGESIZE; newkernel.fi_next_table_va = newkernel.fi_pagetable_va + MMU_PAGESIZE; newkernel.fi_next_table_pa = newkernel.fi_pagetable_pa + MMU_PAGESIZE; fastboot_build_pagetables(&newkernel); } /* Generate MD5 checksums */ fastboot_cksum_generate(&newkernel); /* Mark it as valid */ newkernel.fi_valid = 1; newkernel.fi_magic = FASTBOOT_MAGIC; postbootkernelbase = saved_kernelbase; return; err_out: postbootkernelbase = saved_kernelbase; newkernel.fi_valid = 0; fastboot_free_newkernel(&newkernel); }
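/*
 * Illustrative sketch (not part of the original source): the per-page
 * physical-address walk that fastboot_load_kernel() uses to fill each
 * fb_pte_list_va[].  Given any kernel buffer, each page's PA is simply
 * mmu_ptob() of the PFN returned by hat_getpfnum().  "pte_bits" stands in
 * for whatever PTE attribute bits the caller wants merged in, and "list"
 * must have room for one entry per page plus the terminator.
 */
static void
example_fill_pa_list(caddr_t buf, size_t len, x86pte_t *list,
    x86pte_t pte_bits)
{
	size_t offset;
	int idx = 0;

	for (offset = 0; offset < len; offset += PAGESIZE) {
		uint64_t paddr = mmu_ptob((uint64_t)hat_getpfnum(kas.a_hat,
		    buf + offset));

		list[idx++] = (x86pte_t)(paddr | pte_bits);
	}
	list[idx] = FASTBOOT_TERMINATE;	/* terminator expected by the switcher */
}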
/*
 * Create multiboot info structure (mbi) based on the saved mbi.
 * Recalculate values of the pointer type fields in the data
 * structure based on the new starting physical address of the
 * data structure.
 */
static int
fastboot_build_mbi(char *mdep, fastboot_info_t *nk)
{
	mb_module_t	*mbp;
	multiboot_info_t	*mbi;	/* pointer to multiboot structure */
	uintptr_t	start_addr_va;	/* starting VA of mbi */
	uintptr_t	start_addr_pa;	/* starting PA of mbi */
	size_t		offs = 0;	/* offset from the starting address */
	size_t		arglen;		/* length of the command line arg */
	size_t		size;	/* size of the memory reserved for mbi */
	size_t		mdnsz;	/* length of the boot archive name */

	/*
	 * If mdep is not NULL or empty, use the length of mdep + 1
	 * (for NULL terminating) as the length of the new command
	 * line; else use the saved command line length as the
	 * length for the new command line.
	 */
	if (mdep != NULL && strlen(mdep) != 0) {
		arglen = strlen(mdep) + 1;
	} else {
		arglen = saved_cmdline_len;
	}

	/*
	 * Allocate memory for the new multiboot info structure (mbi).
	 * If we have reserved memory for mbi but it's not enough,
	 * free it and reallocate.
	 */
	size = PAGESIZE + P2ROUNDUP(arglen, PAGESIZE);
	if (nk->fi_mbi_size && nk->fi_mbi_size < size) {
		contig_free((void *)nk->fi_new_mbi_va, nk->fi_mbi_size);
		nk->fi_mbi_size = 0;
	}

	if (nk->fi_mbi_size == 0) {
		if ((nk->fi_new_mbi_va = (uintptr_t)contig_alloc(size,
		    &fastboot_below_1G_dma_attr, PAGESIZE, 0)) == NULL) {
			cmn_err(CE_NOTE, fastboot_enomem_msg,
			    (uint64_t)size, "1G");
			return (-1);
		}

		/*
		 * fi_mbi_size must be set after the allocation succeeds
		 * as it's used to determine how much memory to free.
		 */
		nk->fi_mbi_size = size;
	}

	/*
	 * Initialize memory
	 */
	bzero((void *)nk->fi_new_mbi_va, nk->fi_mbi_size);

	/*
	 * Get PA for the new mbi
	 */
	start_addr_va = nk->fi_new_mbi_va;
	start_addr_pa = mmu_ptob((uint64_t)hat_getpfnum(kas.a_hat,
	    (caddr_t)start_addr_va));
	nk->fi_new_mbi_pa = (paddr_t)start_addr_pa;

	/*
	 * Populate the rest of the fields in the data structure
	 */

	/*
	 * Copy from the saved mbi to preserve all non-pointer type fields.
	 */
	mbi = (multiboot_info_t *)start_addr_va;
	bcopy(&saved_mbi, mbi, sizeof (*mbi));

	/*
	 * Recalculate mods_addr.  Set mod_start and mod_end based on
	 * the physical address of the new boot archive.  Set mod_name
	 * to the name of the new boot archive.
	 */
	offs += sizeof (multiboot_info_t);
	mbi->mods_addr = start_addr_pa + offs;
	mbp = (mb_module_t *)(start_addr_va + offs);
	mbp->mod_start = nk->fi_files[FASTBOOT_BOOTARCHIVE].fb_dest_pa;
	mbp->mod_end = nk->fi_files[FASTBOOT_BOOTARCHIVE].fb_next_pa;

	offs += sizeof (mb_module_t);
	mdnsz = strlen(fastboot_filename[FASTBOOT_NAME_BOOTARCHIVE]) + 1;
	bcopy(fastboot_filename[FASTBOOT_NAME_BOOTARCHIVE],
	    (void *)(start_addr_va + offs), mdnsz);
	mbp->mod_name = start_addr_pa + offs;
	mbp->reserved = 0;

	/*
	 * Make sure the offset is 16-byte aligned to avoid unaligned access.
	 */
	offs += mdnsz;
	offs = P2ROUNDUP_TYPED(offs, 16, size_t);

	/*
	 * Recalculate mmap_addr
	 */
	mbi->mmap_addr = start_addr_pa + offs;
	bcopy((void *)(uintptr_t)saved_mmap, (void *)(start_addr_va + offs),
	    saved_mbi.mmap_length);
	offs += saved_mbi.mmap_length;

	/*
	 * Recalculate drives_addr
	 */
	mbi->drives_addr = start_addr_pa + offs;
	bcopy((void *)(uintptr_t)saved_drives, (void *)(start_addr_va + offs),
	    saved_mbi.drives_length);
	offs += saved_mbi.drives_length;

	/*
	 * Recalculate the address of cmdline.  Set cmdline to contain the
	 * new boot argument.
	 */
	mbi->cmdline = start_addr_pa + offs;
	if (mdep != NULL && strlen(mdep) != 0) {
		bcopy(mdep, (void *)(start_addr_va + offs), arglen);
	} else {
		bcopy((void *)saved_cmdline, (void *)(start_addr_va + offs),
		    arglen);
	}

	/* clear fields and flags that are not copied */
	bzero(&mbi->config_table,
	    sizeof (*mbi) - offsetof(multiboot_info_t, config_table));
	mbi->flags &= ~(MB_INFO_CONFIG_TABLE | MB_INFO_BOOT_LOADER_NAME |
	    MB_INFO_APM_TABLE | MB_INFO_VIDEO_INFO);

	return (0);
}
/* * Create a guest virtual cpu context so that the virtual cpu * springs into life in the domain just about to call mp_startup() * * Virtual CPUs must be initialized once in the lifetime of the domain; * after that subsequent attempts to start them will fail with X_EEXIST. * * Thus 'alloc' -really- creates and initializes the virtual * CPU context just once. Once the initialisation succeeds, we never * free it, nor the regular cpu_t to which it refers. */ void * mach_cpucontext_alloc(struct cpu *cp) { kthread_t *tp = cp->cpu_thread; vcpu_guest_context_t vgc; int err = 1; /* * First, augment the incoming cpu structure * - vcpu pointer reference * - pending event storage area * - physical address of GDT */ cp->cpu_m.mcpu_vcpu_info = &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id]; cp->cpu_m.mcpu_evt_pend = kmem_zalloc( sizeof (struct xen_evt_data), KM_SLEEP); cp->cpu_m.mcpu_gdtpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt)); if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0) goto done; /* * Now set up the vcpu context so that we can start this vcpu * in the kernel at tp->t_pc (mp_startup). Note that the * thread will thread_exit() shortly after performing the * initialization; in particular, we will *never* take a * privilege transition on this thread. */ bzero(&vgc, sizeof (vgc)); #ifdef __amd64 vgc.user_regs.rip = tp->t_pc; vgc.user_regs.rsp = tp->t_sp; vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t); #else vgc.user_regs.eip = tp->t_pc; vgc.user_regs.esp = tp->t_sp; vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t); #endif /* * XXPV Fix resume, if Russ didn't already fix it. * * Note that resume unconditionally puts t->t_stk + sizeof (regs) * into kernel_sp via HYPERVISOR_stack_switch. This anticipates * that only lwps take traps that switch to the kernel stack; * part of creating an lwp adjusts the stack by subtracting * sizeof (struct regs) off t_stk. * * The more interesting question is, why do we do all the work * of a fully fledged lwp for a plain thread? In particular * we don't have to call HYPERVISOR_stack_switch for lwp-less threads * or futz with the LDT. This should probably all be done with * an lwp context operator to keep pure thread context switch fast. */ vgc.kernel_sp = (ulong_t)tp->t_stk; err = mp_set_cpu_context(&vgc, cp); done: if (err) { mach_cpucontext_free(cp, NULL, err); return (NULL); } return (cp); }
/* * balloon_free_pages() * free page_cnt pages, using any combination of mfns, pfns, and kva as long * as they refer to the same mapping. If an array of mfns is passed in, we * assume they were already cleared. Otherwise, we need to zero the pages * before giving them back to the hypervisor. kva space is not free'd up in * case the caller wants to re-use it. */ long balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns) { xen_memory_reservation_t memdec; mfn_t mfn; pfn_t pfn; uint_t i; long e; #if DEBUG /* make sure kva is page aligned and maps to first pfn */ if (kva != NULL) { ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0); if (pfns != NULL) { ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]); } } #endif /* if we have a kva, we can clean all pages with just one bzero */ if ((kva != NULL) && balloon_zero_memory) { bzero(kva, (page_cnt * PAGESIZE)); } /* if we were given a kva and/or a pfn */ if ((kva != NULL) || (pfns != NULL)) { /* * All the current callers only pass 1 page when using kva or * pfns, and use mfns when passing multiple pages. If that * assumption is changed, the following code will need some * work. The following ASSERT() guarantees we're respecting * the io locking quota. */ ASSERT(page_cnt < bln_contig_list_quota); /* go through all the pages */ for (i = 0; i < page_cnt; i++) { /* get the next pfn */ if (pfns == NULL) { pfn = hat_getpfnum(kas.a_hat, (kva + (PAGESIZE * i))); } else { pfn = pfns[i]; } /* * if we didn't already zero this page, do it now. we * need to do this *before* we give back the MFN */ if ((kva == NULL) && (balloon_zero_memory)) { pfnzero(pfn, 0, PAGESIZE); } /* * unmap the pfn. We don't free up the kva vmem space * so the caller can re-use it. The page must be * unmapped before it is given back to the hypervisor. */ if (kva != NULL) { hat_unload(kas.a_hat, (kva + (PAGESIZE * i)), PAGESIZE, HAT_UNLOAD_UNMAP); } /* grab the mfn before the pfn is marked as invalid */ mfn = pfn_to_mfn(pfn); /* mark the pfn as invalid */ reassign_pfn(pfn, MFN_INVALID); /* * if we weren't given an array of MFNs, we need to * free them up one at a time. Otherwise, we'll wait * until later and do it in one hypercall */ if (mfns == NULL) { bzero(&memdec, sizeof (memdec)); /*LINTED: constant in conditional context*/ set_xen_guest_handle(memdec.extent_start, &mfn); memdec.domid = DOMID_SELF; memdec.nr_extents = 1; e = HYPERVISOR_memory_op( XENMEM_decrease_reservation, &memdec); if (e != 1) { cmn_err(CE_PANIC, "balloon: unable to " "give a page back to the " "hypervisor.\n"); } } } } /* * if we were passed in MFNs, we haven't free'd them up yet. We can * do it with one call. */ if (mfns != NULL) { bzero(&memdec, sizeof (memdec)); /*LINTED: constant in conditional context*/ set_xen_guest_handle(memdec.extent_start, mfns); memdec.domid = DOMID_SELF; memdec.nr_extents = page_cnt; e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec); if (e != page_cnt) { cmn_err(CE_PANIC, "balloon: unable to give pages back " "to the hypervisor.\n"); } } atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt); return (page_cnt); }
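/*
 * Illustrative sketch (not part of the original source): the simplest
 * balloon_free_pages() call pattern used by single-page callers -- hand
 * back one page by its kernel mapping and let the driver zero it, unmap
 * it, and issue the XENMEM_decrease_reservation hypercall on its behalf.
 */
static void
example_give_back_one_page(caddr_t kva)
{
	long cnt = balloon_free_pages(1, NULL, kva, NULL);

	ASSERT(cnt == 1);
}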
void sc_create(pci_t *pci_p) { dev_info_t *dip = pci_p->pci_dip; sc_t *sc_p; uint64_t paddr; #ifdef lint dip = dip; #endif if (!pci_stream_buf_exists) return; /* * Allocate streaming cache state structure and link it to * the pci state structure. */ sc_p = (sc_t *)kmem_zalloc(sizeof (sc_t), KM_SLEEP); pci_p->pci_sc_p = sc_p; sc_p->sc_pci_p = pci_p; pci_sc_setup(sc_p); sc_p->sc_sync_reg_pa = va_to_pa((char *)sc_p->sc_sync_reg); DEBUG3(DBG_ATTACH, dip, "sc_create: ctrl=%x, invl=%x, sync=%x\n", sc_p->sc_ctrl_reg, sc_p->sc_invl_reg, sc_p->sc_sync_reg); DEBUG2(DBG_ATTACH, dip, "sc_create: ctx_invl=%x ctx_match=%x\n", sc_p->sc_ctx_invl_reg, sc_p->sc_ctx_match_reg); DEBUG3(DBG_ATTACH, dip, "sc_create: data_diag=%x, tag_diag=%x, ltag_diag=%x\n", sc_p->sc_data_diag_acc, sc_p->sc_tag_diag_acc, sc_p->sc_ltag_diag_acc); /* * Allocate the flush/sync buffer. Make sure it's properly * aligned. */ sc_p->sc_sync_flag_base = vmem_xalloc(static_alloc_arena, PCI_SYNC_FLAG_SIZE, PCI_SYNC_FLAG_SIZE, 0, 0, NULL, NULL, VM_SLEEP); sc_p->sc_sync_flag_vaddr = (uint64_t *)sc_p->sc_sync_flag_base; paddr = (uint64_t)hat_getpfnum(kas.a_hat, (caddr_t)sc_p->sc_sync_flag_vaddr); paddr <<= MMU_PAGESHIFT; paddr += (uint64_t) ((uintptr_t)sc_p->sc_sync_flag_vaddr & ~MMU_PAGEMASK); sc_p->sc_sync_flag_pa = paddr; DEBUG2(DBG_ATTACH, dip, "sc_create: sync buffer - vaddr=%x paddr=%x\n", sc_p->sc_sync_flag_vaddr, sc_p->sc_sync_flag_pa); /* * Create a mutex to go along with it. While the mutex is held, * all interrupts should be blocked. This will prevent driver * interrupt routines from attempting to acquire the mutex while * held by a lower priority interrupt routine. Note also that * we now block cross calls as well, to prevent issues with * relocation. */ mutex_init(&sc_p->sc_sync_mutex, NULL, MUTEX_DRIVER, (void *)ipltospl(XCALL_PIL)); sc_configure(sc_p); }
/* * Private ioctl for libkvm to support kvm_physaddr(). * Given an address space and a VA, compute the PA. */ static int mmioctl_vtop(intptr_t data) { #ifdef _SYSCALL32 mem_vtop32_t vtop32; #endif mem_vtop_t mem_vtop; proc_t *p; pfn_t pfn = (pfn_t)PFN_INVALID; pid_t pid = 0; struct as *as; struct seg *seg; if (get_udatamodel() == DATAMODEL_NATIVE) { if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t))) return (EFAULT); } #ifdef _SYSCALL32 else { if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t))) return (EFAULT); mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as; mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va; if (mem_vtop.m_as != NULL) return (EINVAL); } #endif if (mem_vtop.m_as == &kas) { pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va); } else { if (mem_vtop.m_as == NULL) { /* * Assume the calling process's address space if the * caller didn't specify one. */ p = curthread->t_procp; if (p == NULL) return (EIO); mem_vtop.m_as = p->p_as; } mutex_enter(&pidlock); for (p = practive; p != NULL; p = p->p_next) { if (p->p_as == mem_vtop.m_as) { pid = p->p_pid; break; } } mutex_exit(&pidlock); if (p == NULL) return (EIO); p = sprlock(pid); if (p == NULL) return (EIO); as = p->p_as; if (as == mem_vtop.m_as) { mutex_exit(&p->p_lock); AS_LOCK_ENTER(as, &as->a_lock, RW_READER); for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) if ((uintptr_t)mem_vtop.m_va - (uintptr_t)seg->s_base < seg->s_size) break; if (seg != NULL) pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va); AS_LOCK_EXIT(as, &as->a_lock); mutex_enter(&p->p_lock); } sprunlock(p); } mem_vtop.m_pfn = pfn; if (pfn == PFN_INVALID) return (EIO); if (get_udatamodel() == DATAMODEL_NATIVE) { if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t))) return (EFAULT); } #ifdef _SYSCALL32 else { vtop32.m_pfn = mem_vtop.m_pfn; if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t))) return (EFAULT); } #endif return (0); }
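/*
 * Illustrative sketch (not part of the original source): how a user-level
 * consumer such as libkvm's kvm_physaddr() might drive this ioctl.  The
 * MEM_VTOP command and mem_vtop_t layout come from <sys/mem.h>; the choice
 * of /dev/mem as the node to issue it on is an assumption made for this
 * example only.
 */
#if 0	/* user-level example, not kernel code */
#include <sys/mem.h>
#include <fcntl.h>
#include <unistd.h>
#include <stropts.h>

static int
example_vtop(struct as *as, void *va, pfn_t *pfnp)
{
	mem_vtop_t vtop;
	int fd = open("/dev/mem", O_RDONLY);

	if (fd < 0)
		return (-1);
	vtop.m_as = as;
	vtop.m_va = va;
	if (ioctl(fd, MEM_VTOP, &vtop) != 0) {
		(void) close(fd);
		return (-1);
	}
	*pfnp = vtop.m_pfn;
	(void) close(fd);
	return (0);
}
#endif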
/*ARGSUSED3*/ static int mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred) { pfn_t v; struct iovec *iov; int error = 0; size_t c; ssize_t oresid = uio->uio_resid; minor_t minor = getminor(dev); while (uio->uio_resid > 0 && error == 0) { iov = uio->uio_iov; if (iov->iov_len == 0) { uio->uio_iov++; uio->uio_iovcnt--; if (uio->uio_iovcnt < 0) panic("mmrw"); continue; } switch (minor) { case M_MEM: memlist_read_lock(); if (!address_in_memlist(phys_install, (uint64_t)uio->uio_loffset, 1)) { memlist_read_unlock(); error = EFAULT; break; } memlist_read_unlock(); v = BTOP((u_offset_t)uio->uio_loffset); error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET, 0, NULL); break; case M_KMEM: case M_ALLKMEM: { page_t **ppp = NULL; caddr_t vaddr = (caddr_t)uio->uio_offset; int try_lock = NEED_LOCK_KVADDR(vaddr); int locked = 0; if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP) break; /* * If vaddr does not map a valid page, as_pagelock() * will return failure. Hence we can't check the * return value and return EFAULT here as we'd like. * seg_kp and seg_kpm do not properly support * as_pagelock() for this context so we avoid it * using the try_lock set check above. Some day when * the kernel page locking gets redesigned all this * muck can be cleaned up. */ if (try_lock) locked = (as_pagelock(&kas, &ppp, vaddr, PAGESIZE, S_WRITE) == 0); v = hat_getpfnum(kas.a_hat, (caddr_t)(uintptr_t)uio->uio_loffset); if (v == PFN_INVALID) { if (locked) as_pageunlock(&kas, ppp, vaddr, PAGESIZE, S_WRITE); error = EFAULT; break; } error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET, minor == M_ALLKMEM || mm_kmem_io_access, (locked && ppp) ? *ppp : NULL); if (locked) as_pageunlock(&kas, ppp, vaddr, PAGESIZE, S_WRITE); } break; case M_ZERO: if (rw == UIO_READ) { label_t ljb; if (on_fault(&ljb)) { no_fault(); error = EFAULT; break; } uzero(iov->iov_base, iov->iov_len); no_fault(); uio->uio_resid -= iov->iov_len; uio->uio_loffset += iov->iov_len; break; } /* else it's a write, fall through to NULL case */ /*FALLTHROUGH*/ case M_NULL: if (rw == UIO_READ) return (0); c = iov->iov_len; iov->iov_base += c; iov->iov_len -= c; uio->uio_loffset += c; uio->uio_resid -= c; break; } } return (uio->uio_resid == oresid ? error : 0); }
/* * Initialize IOAT relative resources. */ static int fipe_ioat_init(void) { char *buf; size_t size; bzero(&fipe_ioat_ctrl, sizeof (fipe_ioat_ctrl)); mutex_init(&fipe_ioat_ctrl.ioat_lock, NULL, MUTEX_DRIVER, NULL); /* * Allocate memory for IOAT memory copy operation. * The allocated memory should be page aligned to achieve better power * savings. * Don't use ddi_dma_mem_alloc here to keep thing simple. This also * makes quiesce easier. */ size = PAGESIZE; buf = kmem_zalloc(size, KM_SLEEP); if ((intptr_t)buf & PAGEOFFSET) { kmem_free(buf, PAGESIZE); size <<= 1; buf = kmem_zalloc(size, KM_SLEEP); } fipe_ioat_ctrl.ioat_buf_size = size; fipe_ioat_ctrl.ioat_buf_start = buf; buf = (char *)P2ROUNDUP((intptr_t)buf, PAGESIZE); fipe_ioat_ctrl.ioat_buf_virtaddr = buf; fipe_ioat_ctrl.ioat_buf_physaddr = hat_getpfnum(kas.a_hat, buf); fipe_ioat_ctrl.ioat_buf_physaddr <<= PAGESHIFT; #ifdef FIPE_IOAT_BUILTIN { uint64_t bufpa; /* IOAT descriptor data structure copied from ioat.h. */ struct fipe_ioat_cmd_desc { uint32_t dd_size; uint32_t dd_ctrl; uint64_t dd_src_paddr; uint64_t dd_dest_paddr; uint64_t dd_next_desc; uint64_t dd_res4; uint64_t dd_res5; uint64_t dd_res6; uint64_t dd_res7; } *desc; /* * Build two IOAT command descriptors and chain them into ring. * Control flags as below: * 0x2: disable source snoop * 0x4: disable destination snoop * 0x0 << 24: memory copy operation * The layout for command descriptors and memory buffers are * organized for power saving effect, please don't change it. */ buf = fipe_ioat_ctrl.ioat_buf_virtaddr; bufpa = fipe_ioat_ctrl.ioat_buf_physaddr; fipe_ioat_ctrl.ioat_cmd_physaddr = bufpa; /* First command descriptor. */ desc = (struct fipe_ioat_cmd_desc *)(buf); desc->dd_size = 128; desc->dd_ctrl = 0x6; desc->dd_src_paddr = bufpa + 2048; desc->dd_dest_paddr = bufpa + 3072; /* Point to second descriptor. */ desc->dd_next_desc = bufpa + 64; /* Second command descriptor. */ desc = (struct fipe_ioat_cmd_desc *)(buf + 64); desc->dd_size = 128; desc->dd_ctrl = 0x6; desc->dd_src_paddr = bufpa + 2048; desc->dd_dest_paddr = bufpa + 3072; /* Point to first descriptor. */ desc->dd_next_desc = bufpa; } #endif /* FIPE_IOAT_BUILTIN */ return (0); }
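/*
 * Illustrative sketch (not part of the original source): the release path
 * implied by the alignment trick above.  Because fipe_ioat_init() may have
 * doubled the allocation to obtain a page-aligned buffer, the free must use
 * the original start address and recorded size, not the aligned pointer.
 * The function name is hypothetical.
 */
static void
example_fipe_ioat_buf_free(void)
{
	if (fipe_ioat_ctrl.ioat_buf_start != NULL) {
		kmem_free(fipe_ioat_ctrl.ioat_buf_start,
		    fipe_ioat_ctrl.ioat_buf_size);
		fipe_ioat_ctrl.ioat_buf_start = NULL;
		fipe_ioat_ctrl.ioat_buf_size = 0;
	}
}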