static void
less_pages(uint64_t base, uint64_t len)
{
	uint64_t pa, end = base + len;
	extern int kcage_on;

	for (pa = base; pa < end; pa += PAGESIZE) {
		pfn_t pfnum;
		page_t *pp;

		pfnum = (pfn_t)(pa >> PAGESHIFT);
		if ((pp = page_numtopp_nolock(pfnum)) == NULL)
			cmn_err(CE_PANIC, "missing pfnum %lx", pfnum);

		/*
		 * must break up any large pages that may have
		 * constituent pages being utilized for
		 * prom_alloc()'s. page_reclaim() can't handle
		 * large pages.
		 */
		if (pp->p_szc != 0)
			page_boot_demote(pp);

		if (!PAGE_LOCKED(pp) && pp->p_lckcnt == 0) {
			/*
			 * Ahhh yes, a prom page,
			 * suck it off the freelist,
			 * lock it, and hashin on prom_pages vp.
			 */
			if (page_trylock(pp, SE_EXCL) == 0)
				cmn_err(CE_PANIC, "prom page locked");

			(void) page_reclaim(pp, NULL);
			/*
			 * vnode offsets on the prom_ppages vnode
			 * are page numbers (gack) for >32 bit
			 * physical memory machines.
			 */
			(void) page_hashin(pp, &promvp,
			    (offset_t)pfnum, NULL);

			if (kcage_on) {
				ASSERT(pp->p_szc == 0);
				if (PP_ISNORELOC(pp) == 0) {
					PP_SETNORELOC(pp);
					PLCNT_XFER_NORELOC(pp);
				}
			}
			(void) page_pp_lock(pp, 0, 1);
		}
	}
}

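/*
 * Illustrative caller (a sketch, not part of the original source): a
 * PROM-retained physical range, assumed to be page aligned, is handed to
 * less_pages() so each of its pages is demoted if necessary, pulled off
 * the freelist, hashed onto promvp, and locked down.  The range arguments
 * here are hypothetical.
 */
static void
example_claim_prom_range(uint64_t prom_base, uint64_t prom_len)
{
	ASSERT((prom_base & PAGEOFFSET) == 0);
	ASSERT((prom_len & PAGEOFFSET) == 0);

	less_pages(prom_base, prom_len);
}
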
/*ARGSUSED*/
caddr_t
hat_kpm_mapin(struct page *pp, struct kpme *kpme)
{
	caddr_t vaddr;

	if (kpm_enable == 0) {
		cmn_err(CE_WARN, "hat_kpm_mapin: kpm_enable not set");
		return ((caddr_t)NULL);
	}

	if (pp == NULL || PAGE_LOCKED(pp) == 0) {
		cmn_err(CE_WARN, "hat_kpm_mapin: pp zero or not locked");
		return ((caddr_t)NULL);
	}

	vaddr = hat_kpm_page2va(pp, 1);

	return (vaddr);
}

/*ARGSUSED*/
void
hat_kpm_mapout(struct page *pp, struct kpme *kpme, caddr_t vaddr)
{
#ifdef DEBUG
	if (kpm_enable == 0) {
		cmn_err(CE_WARN, "hat_kpm_mapout: kpm_enable not set");
		return;
	}

	if (IS_KPM_ADDR(vaddr) == 0) {
		cmn_err(CE_WARN, "hat_kpm_mapout: no kpm address");
		return;
	}

	if (pp == NULL || PAGE_LOCKED(pp) == 0) {
		cmn_err(CE_WARN, "hat_kpm_mapout: page zero or not locked");
		return;
	}
#endif
}

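/*
 * Minimal usage sketch (hypothetical, not part of the original source): a
 * caller holding a locked page obtains a kernel virtual address through
 * the kpm segment, operates on the page contents, and then drops the
 * mapping.  A NULL kpme is passed for a short-lived mapping that needs no
 * kpm mapping-entry tracking; the page must remain locked for the lifetime
 * of the mapping, and bzero() simply stands in for the caller's real work.
 */
static void
example_kpm_access(page_t *pp)
{
	caddr_t vaddr;

	ASSERT(PAGE_LOCKED(pp));

	vaddr = hat_kpm_mapin(pp, NULL);
	if (vaddr == NULL)
		return;		/* kpm disabled or bad page */

	bzero(vaddr, PAGESIZE);

	hat_kpm_mapout(pp, NULL, vaddr);
}
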
/*
 * Map address "addr" in address space "as" into a kernel virtual address.
 * The memory is guaranteed to be resident and locked down.
 */
static caddr_t
mapin(struct as *as, caddr_t addr, int writing)
{
	page_t *pp;
	caddr_t kaddr;
	pfn_t pfnum;

	/*
	 * NB: Because of past mistakes, we have bits being returned
	 * by getpfnum that are actually the page type bits of the pte.
	 * When the object we are trying to map is a memory page with
	 * a page structure everything is ok and we can use the optimal
	 * method, ppmapin. Otherwise, we have to do something special.
	 */
	pfnum = hat_getpfnum(as->a_hat, addr);
	if (pf_is_memory(pfnum)) {
		pp = page_numtopp_nolock(pfnum);
		if (pp != NULL) {
			ASSERT(PAGE_LOCKED(pp));
			kaddr = ppmapin(pp, writing ?
			    (PROT_READ | PROT_WRITE) : PROT_READ, (caddr_t)-1);
			return (kaddr + ((uintptr_t)addr & PAGEOFFSET));
		}
	}

	/*
	 * Oh well, we didn't have a page struct for the object we were
	 * trying to map in; ppmapin doesn't handle devices, but allocating a
	 * heap address allows ppmapout to free the virtual space when done.
	 */
	kaddr = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

	hat_devload(kas.a_hat, kaddr, PAGESIZE, pfnum,
	    writing ? (PROT_READ | PROT_WRITE) : PROT_READ, HAT_LOAD_LOCK);

	return (kaddr + ((uintptr_t)addr & PAGEOFFSET));
}

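/*
 * Sketch of the complementary teardown (illustrative only; the real driver
 * has its own mapout(), which is not shown here): because mapin() returns
 * either a ppmapin() address or a heap_arena address, ppmapout() on the
 * page-aligned kernel address can release either flavor, unloading the
 * translation and, for the heap case, freeing the virtual space, as the
 * comment above notes.
 */
static void
example_mapout(caddr_t kaddr)
{
	ppmapout((caddr_t)((uintptr_t)kaddr & (uintptr_t)PAGEMASK));
}
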
void
pvn_write_done(page_t *plist, int flags)
{
	int dfree = 0;
	int pgrec = 0;
	int pgout = 0;
	int pgpgout = 0;
	int anonpgout = 0;
	int anonfree = 0;
	int fspgout = 0;
	int fsfree = 0;
	int execpgout = 0;
	int execfree = 0;
	page_t *pp;
	struct cpu *cpup;
	struct vnode *vp = NULL;	/* for probe */
	uint_t ppattr;
	kmutex_t *vphm = NULL;

	ASSERT((flags & B_READ) == 0);

	/*
	 * If we are about to start paging anyway, start freeing pages.
	 */
	if (write_free && freemem < lotsfree + pages_before_pager &&
	    (flags & B_ERROR) == 0) {
		flags |= B_FREE;
	}

	/*
	 * Handle each page involved in the i/o operation.
	 */
	while (plist != NULL) {
		pp = plist;
		ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
		page_sub(&plist, pp);

		/* Kernel probe support */
		if (vp == NULL)
			vp = pp->p_vnode;

		if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
			/*
			 * Move page to the top of the v_page list.
			 * Skip pages modified during IO.
			 */
			vphm = page_vnode_mutex(vp);
			mutex_enter(vphm);
			if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
				page_vpsub(&vp->v_pages, pp);
				page_vpadd(&vp->v_pages, pp);
			}
			mutex_exit(vphm);
		}

		if (flags & B_ERROR) {
			/*
			 * Write operation failed.  We don't want
			 * to destroy (or free) the page unless B_FORCE
			 * is set.  We set the mod bit again and release
			 * all locks on the page so that it will get written
			 * back again later when things are hopefully better.
			 * If B_INVAL and B_FORCE are set we really have
			 * to destroy the page.
			 */
			if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
				page_io_unlock(pp);
				/*LINTED: constant in conditional context*/
				VN_DISPOSE(pp, B_INVAL, 0, kcred);
			} else {
				hat_setmod_only(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
		} else if (flags & B_INVAL) {
			/*
			 * XXX - Failed writes with B_INVAL set are
			 * not handled appropriately.
			 */
			page_io_unlock(pp);
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if (flags & B_FREE || !hat_page_is_mapped(pp)) {
			/*
			 * Update statistics for pages being paged out
			 */
			if (pp->p_vnode) {
				if (IS_SWAPFSVP(pp->p_vnode)) {
					anonpgout++;
				} else {
					if (pp->p_vnode->v_flag & VVMEXEC) {
						execpgout++;
					} else {
						fspgout++;
					}
				}
			}
			page_io_unlock(pp);
			pgout = 1;
			pgpgout++;
			TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
			    "page_ws_out:pp %p", pp);

			/*
			 * The page_struct_lock need not be acquired to
			 * examine "p_lckcnt" and "p_cowcnt" since we'll
			 * have an "exclusive" lock if the upgrade succeeds.
			 */
			if (page_tryupgrade(pp) &&
			    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
				/*
				 * Check if someone has reclaimed the
				 * page.  If ref and mod are not set, no
				 * one is using it so we can free it.
				 * The rest of the system is careful
				 * to use the NOSYNC flag to unload
				 * translations set up for i/o w/o
				 * affecting ref and mod bits.
				 *
				 * Obtain a copy of the real hardware
				 * mod bit using hat_pagesync(pp,
				 * HAT_SYNC_DONTZERO) to avoid having to
				 * flush the cache.
				 */
				ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
				    HAT_SYNC_STOPON_MOD);
			ck_refmod:
				if (!(ppattr & (P_REF | P_MOD))) {
					if (hat_page_is_mapped(pp)) {
						/*
						 * Doesn't look like the page
						 * was modified so now we
						 * really have to unload the
						 * translations.  Meanwhile
						 * another CPU could've
						 * modified it so we have to
						 * check again.  We don't loop
						 * forever here because now
						 * the translations are gone
						 * and no one can get a new one
						 * since we have the "exclusive"
						 * lock on the page.
						 */
						(void) hat_pageunload(pp,
						    HAT_FORCE_PGUNLOAD);
						ppattr = hat_page_getattr(pp,
						    P_REF | P_MOD);
						goto ck_refmod;
					}
					/*
					 * Update statistics for pages being
					 * freed
					 */
					if (pp->p_vnode) {
						if (IS_SWAPFSVP(pp->p_vnode)) {
							anonfree++;
						} else {
							if (pp->p_vnode->v_flag
							    & VVMEXEC) {
								execfree++;
							} else {
								fsfree++;
							}
						}
					}
					/*LINTED: constant in conditional ctx*/
					VN_DISPOSE(pp, B_FREE,
					    (flags & B_DONTNEED), kcred);
					dfree++;
				} else {
					page_unlock(pp);
					pgrec++;
					TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
					    "page_ws_free:pp %p", pp);
				}
			} else {
				/*
				 * Page is either `locked' in memory
				 * or was reclaimed and now has a
				 * "shared" lock, so release it.
				 */
				page_unlock(pp);
			}
		} else {
			/*
			 * Neither B_FREE nor B_INVAL nor B_ERROR.
			 * Just release locks.
			 */
			page_io_unlock(pp);
			page_unlock(pp);
		}
	}

	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get cpup now that CPU cannot change */
	CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
	CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
	CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
	CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
	CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
	CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
	CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
	CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
	CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
	CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
	CPU_STATS_EXIT_K();

	/* Kernel probe */
	TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
	    tnf_opaque,	vnode,			vp,
	    tnf_ulong,	pages_pageout,		pgpgout,
	    tnf_ulong,	pages_freed,		dfree,
	    tnf_ulong,	pages_reclaimed,	pgrec);
}

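/*
 * Minimal usage sketch (a hypothetical caller, not part of the original
 * source): a file system's asynchronous write-completion handler typically
 * hands the I/O-locked page list back to pvn_write_done(), adding B_ERROR
 * when the transfer failed so the pages are re-dirtied and unlocked rather
 * than freed or invalidated.
 */
static void
example_async_write_done(struct buf *bp, page_t *plist, int flags)
{
	if (bp->b_flags & B_ERROR)
		flags |= B_ERROR;

	pvn_write_done(plist, flags | B_WRITE);
}
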
/*
 * Scan page_t's and issue I/O's for modified pages.
 *
 * Also coalesces consecutive small sized free pages into the next larger
 * pagesize.  This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 */
static void
fsflush_do_pages()
{
	vnode_t		*vp;
	ulong_t		pcount;
	hrtime_t	timer = gethrtime();
	ulong_t		releases = 0;
	ulong_t		nexamined = 0;
	ulong_t		nlocked = 0;
	ulong_t		nmodified = 0;
	ulong_t		ncoalesce = 0;
	ulong_t		cnt;
	int		mod;
	int		fspage = 1;
	u_offset_t	offset;
	uint_t		szc;

	page_t		*coal_page = NULL;  /* 1st page in group to coalesce */
	uint_t		coal_szc = 0;	    /* size code, coal_page->p_szc */
	uint_t		coal_cnt = 0;	    /* count of pages seen */

	static ulong_t	nscan = 0;
	static pgcnt_t	last_total_pages = 0;
	static page_t	*pp = NULL;

	/*
	 * Check to see if total_pages has changed.
	 */
	if (total_pages != last_total_pages) {
		last_total_pages = total_pages;
		nscan = (last_total_pages * (tune.t_fsflushr)) / v.v_autoup;
	}

	if (pp == NULL)
		pp = memsegs->pages;

	pcount = 0;
	while (pcount < nscan) {

		/*
		 * move to the next page, skipping over large pages
		 * and issuing prefetches.
		 */
		if (pp->p_szc && fspage == 0) {
			pfn_t pfn;

			pfn = page_pptonum(pp);
			cnt = page_get_pagecnt(pp->p_szc);
			cnt -= pfn & (cnt - 1);
		} else
			cnt = 1;

		pp = page_nextn(pp, cnt);
		prefetch_page_r((void *)pp);
		ASSERT(pp != NULL);
		pcount += cnt;

		/*
		 * Do a bunch of dirty tests (i.e. no locking) to determine
		 * if we can quickly skip this page.  These tests are repeated
		 * after acquiring the page lock.
		 */
		++nexamined;
		if (PP_ISSWAP(pp)) {
			fspage = 0;
			coal_page = NULL;
			continue;
		}

		/*
		 * skip free pages too, but try coalescing them into larger
		 * pagesizes
		 */
		if (PP_ISFREE(pp)) {
			/*
			 * skip pages with a file system identity or that
			 * are already maximum size
			 */
			fspage = 0;
			szc = pp->p_szc;
			if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
				coal_page = NULL;
				continue;
			}

			/*
			 * If not in a coalescing candidate page or the size
			 * codes are different, start a new candidate.
			 */
			if (coal_page == NULL || coal_szc != szc) {

				/*
				 * page must be properly aligned
				 */
				if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
					coal_page = NULL;
					continue;
				}
				coal_page = pp;
				coal_szc = szc;
				coal_cnt = 1;
				continue;
			}

			/*
			 * acceptable to add this to existing candidate page
			 */
			++coal_cnt;
			if (coal_cnt < fsf_pgcnt[coal_szc])
				continue;

			/*
			 * We've got enough pages to coalesce, so do it.
			 * After promoting, we clear coal_page, so it will
			 * take another pass to promote this to an even
			 * larger page.
			 */
			++ncoalesce;
			(void) page_promote_size(coal_page, coal_szc);
			coal_page = NULL;
			continue;
		} else {
			coal_page = NULL;
		}

		if (PP_ISKAS(pp) ||
		    PAGE_LOCKED(pp) ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0) {
			fspage = 0;
			continue;
		}

		/*
		 * Reject pages that can't be "exclusively" locked.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;
		++nlocked;

		/*
		 * After locking the page, redo the above checks.
		 * Since we locked the page, leave out the PAGE_LOCKED() test.
		 */
		vp = pp->p_vnode;
		if (PP_ISSWAP(pp) ||
		    PP_ISFREE(pp) ||
		    vp == NULL ||
		    PP_ISKAS(pp) ||
		    (vp->v_flag & VISSWAP) != 0) {
			page_unlock(pp);
			fspage = 0;
			continue;
		}

		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
			page_unlock(pp);
			continue;
		}

		fspage = 1;
		ASSERT(vp->v_type != VCHR);

		/*
		 * Check the modified bit.  Leave the bit alone in hardware;
		 * it will be cleared if we do the putpage.
		 */
		if (IS_VMODSORT(vp))
			mod = hat_ismod(pp);
		else
			mod = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;

		if (mod) {
			++nmodified;
			offset = pp->p_offset;

			/*
			 * Hold the vnode before releasing the page lock
			 * to prevent it from being freed and re-used by
			 * some other thread.
			 */
			VN_HOLD(vp);

			page_unlock(pp);

			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
			    kcred, NULL);

			VN_RELE(vp);
		} else {

			/*
			 * Catch any pages which should be on the cache list,
			 * but aren't yet.
			 */
			if (hat_page_is_mapped(pp) == 0) {
				++releases;
				(void) page_release(pp, 1);
			} else {
				page_unlock(pp);
			}
		}
	}

	/*
	 * maintain statistics
	 * reset every million wakeups, just to avoid overflow
	 */
	if (++fsf_cycles == 1000000) {
		fsf_cycles = 0;
		fsf_total.fsf_scan = 0;
		fsf_total.fsf_examined = 0;
		fsf_total.fsf_locked = 0;
		fsf_total.fsf_modified = 0;
		fsf_total.fsf_coalesce = 0;
		fsf_total.fsf_time = 0;
		fsf_total.fsf_releases = 0;
	} else {
		fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
		fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
		fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
		fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
		fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
		fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
		fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
	}
}

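/*
 * Worked example of the pacing computation near the top of
 * fsflush_do_pages() (illustrative numbers only): nscan is
 * total_pages * tune.t_fsflushr / v.v_autoup, so with 1,048,576 pages of
 * physical memory, a one-second fsflush period, and an autoup of 30
 * seconds, each wakeup examines roughly 34,952 page_t's and the whole of
 * memory is swept about every 30 seconds.
 */
static ulong_t
example_pages_per_wakeup(pgcnt_t npages, int fsflushr, int autoup)
{
	return ((ulong_t)((npages * fsflushr) / autoup));
}
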
/*
 * Find, take and return a mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before the hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Return NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote(),
 * i.e. non-swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 * since the page is locked and not free.
 *
 * A hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 * of the root, this routine relies on the fact that hat_page_demote() changes
 * the root last.
 *
 * If NULL is returned pp's p_szc is guaranteed to be 0.  If non-NULL is
 * returned pp's p_szc may be any value.
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
	kmutex_t	*mtx;
	page_t		*rootpp;
	uint_t		szc;
	uint_t		rszc;
	uint_t		pszc = pp->p_szc;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
	ASSERT(!PP_ISKAS(pp));

again:
	if (pszc == 0) {
		VM_STAT_ADD(pszclck_stat[0]);
		return (NULL);
	}

	/* The lock lives in the root page */
	rootpp = PP_GROUPLEADER(pp, pszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);

	/*
	 * Since p_szc can only decrease if pp == rootpp, rootpp will always
	 * be the same, i.e. we have the right root regardless of
	 * rootpp->p_szc.
	 * If the location of pp's root didn't change after we took
	 * the lock, we have the right root.  Return the mutex hashed off it.
	 */
	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
		VM_STAT_ADD(pszclck_stat[1]);
		return (mtx);
	}

	/*
	 * The root location changed because the page got demoted.
	 * Locate the new root.
	 */
	if (rszc < pszc) {
		szc = pp->p_szc;
		ASSERT(szc < pszc);
		mutex_exit(mtx);
		pszc = szc;
		VM_STAT_ADD(pszclck_stat[2]);
		goto again;
	}

	VM_STAT_ADD(pszclck_stat[3]);
	/*
	 * The current hat_page_demote() is not done yet.
	 * Wait for it to finish.
	 */
	mutex_exit(mtx);
	rootpp = PP_GROUPLEADER(rootpp, rszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);
	mutex_exit(mtx);
	ASSERT(rootpp->p_szc < rszc);
	goto again;
}

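/*
 * Usage sketch (illustrative, not part of the original source): a caller
 * that must keep hat_page_demote() from changing p_szc underneath it takes
 * the hashed mutex, does its size-code-dependent work, and drops the lock.
 * A NULL return means p_szc is 0, so no lock is needed.
 */
static void
example_szc_stable_operation(page_t *pp)
{
	kmutex_t *mtx;

	ASSERT(PAGE_LOCKED(pp));

	mtx = page_szc_lock(pp);

	/* ... examine or update state that depends on pp->p_szc ... */

	if (mtx != NULL)
		mutex_exit(mtx);
}
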
void
plat_release_page(page_t *pp)
{
	ASSERT((pp != NULL) && PAGE_LOCKED(pp));
	page_unlock(pp);
}