/*
 * Page walker callback: classify one page and bump the matching counter
 * in `stats'.
 *
 * A page with no vnode, or one hashed on stats->ms_unused_vp, is skipped
 * without being counted.  Any other page increments exactly one category
 * counter and ms_total.  The vnode is fetched with vn_get() only after
 * the kas, ZFS-data and free checks have all failed to classify the page.
 * The `page' argument is not used.
 *
 * Returns WALK_NEXT to keep walking, or WALK_ERR when vn_get() returns
 * non-zero.
 */
/* ARGSUSED */
static int
memstat_callback(page_t *page, page_t *pp, memstat_t *stats)
{
	struct vnode *vnp = &stats->ms_vn;

	if (pp->p_vnode == NULL || pp->p_vnode == stats->ms_unused_vp)
		return (WALK_NEXT);

	if (MS_PP_ISKAS(pp, stats)) {
		stats->ms_kmem++;
	} else if (MS_PP_ISZFS_DATA(pp, stats)) {
		stats->ms_zfs_data++;
	} else if (PP_ISFREE(pp)) {
		stats->ms_cachelist++;
	} else {
		/* The remaining categories need the vnode contents. */
		if (vn_get(stats->ms_vn_htable, vnp, (uintptr_t)pp->p_vnode))
			return (WALK_ERR);

		if (IS_SWAPFSVP(vnp))
			stats->ms_anon++;
		else if ((vnp->v_flag & VVMEXEC) != 0)
			stats->ms_exec++;
		else
			stats->ms_vnode++;
	}

	stats->ms_total++;

	return (WALK_NEXT);
}
/*
 * Try to retire a page when we stumble onto it in the page lock routines.
 *
 * The caller holds `pp' exclusively.  When retirement is disabled
 * (pr_enable is zero) the page is simply unlocked.  A small page
 * (p_szc == 0) is handed to page_retire_pp().  A large page is only
 * demoted here and then unlocked; per the original design note, any bad
 * constituent pages get retired recursively once the page is broken up.
 *
 * Note that a free large page which page_try_demote_free_pages() demotes
 * successfully still goes on to the page_try_demote_pages() call.
 */
void
page_tryretire(page_t *pp)
{
	ASSERT(PAGE_EXCL(pp));

	if (!pr_enable) {
		page_unlock(pp);
		return;
	}

	/* Small page with errors: try to retire it directly. */
	if (!(pp->p_szc > 0)) {
		(void) page_retire_pp(pp);
		return;
	}

	/* Big page: try to break it up. */
	if (PP_ISFREE(pp) && !page_try_demote_free_pages(pp)) {
		page_unlock(pp);
		PR_DEBUG(prd_nofreedemote);
		return;
	}

	if (!page_try_demote_pages(pp)) {
		page_unlock(pp);
		PR_DEBUG(prd_nodemote);
		return;
	}

	PR_DEBUG(prd_demoted);
	page_unlock(pp);
}
/*
 * Give kernel-vnode identity to the pages backing the virtual range
 * [addr, addr + size).
 *
 * btop(size) pages are reserved up front with page_resv(); failure to
 * reserve panics.  The range is then walked one PAGESIZE step at a time.
 * Addresses for which va_to_pfn() returns PFN_INVALID are skipped.  For
 * every other address the backing page is:
 *
 *   - demoted with page_boot_demote() if it is part of a large page,
 *   - locked SE_EXCL via page_numtopp(),
 *   - marked non-relocatable when the cage is on and it is not already,
 *   - hashed onto kvp at an offset equal to the virtual address,
 *   - given a p_lckcnt of 1,
 *   - left share-locked on x86 (page_downgrade()) and unlocked elsewhere.
 *
 * Panics if a pfn has no page_t, or if the page cannot be locked or turns
 * out to be free.
 */
void
boot_mapin(caddr_t addr, size_t size)
{
	caddr_t	eaddr;
	page_t	*pp;
	pfn_t	pfnum;

	if (page_resv(btop(size), KM_NOSLEEP) == 0)
		panic("boot_mapin: page_resv failed");

	for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
		pfnum = va_to_pfn(addr);
		if (pfnum == PFN_INVALID)
			continue;
		if ((pp = page_numtopp_nolock(pfnum)) == NULL)
			panic("boot_mapin(): No pp for pfnum = %lx", pfnum);

		/*
		 * must break up any large pages that may have constituent
		 * pages being utilized for BOP_ALLOC()'s before calling
		 * page_numtopp().  The locking code (ie. page_reclaim())
		 * can't handle them
		 */
		if (pp->p_szc != 0)
			page_boot_demote(pp);

		pp = page_numtopp(pfnum, SE_EXCL);
		if (pp == NULL || PP_ISFREE(pp))
			panic("boot_mapin: pp is NULL or free");

		/*
		 * If the cage is on but doesn't yet contain this page,
		 * mark it as non-relocatable.
		 */
		if (kcage_on && !PP_ISNORELOC(pp)) {
			PP_SETNORELOC(pp);
			PLCNT_XFER_NORELOC(pp);
		}

		(void) page_hashin(pp, &kvp, (u_offset_t)(uintptr_t)addr, NULL);
		pp->p_lckcnt = 1;
#if defined(__x86)
		page_downgrade(pp);
#else
		page_unlock(pp);
#endif
	}
}
/*
 * Mark a page as deleted by storing SE_DELETED in its p_selock.
 *
 * The caller must hold the page exclusively, and the page must have no
 * identity (p_vnode is NULL, p_offset is -1) and must not be free.  The
 * update is made under the page's SE mutex, and any threads waiting on
 * the page's condition variable are woken.
 */
void
page_lock_delete(page_t *pp)
{
	kmutex_t *se_mtx;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_vnode == NULL);
	ASSERT(pp->p_offset == (u_offset_t)-1);
	ASSERT(!PP_ISFREE(pp));

	se_mtx = PAGE_SE_MUTEX(pp);

	mutex_enter(se_mtx);
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_DELETED;
	if (CV_HAS_WAITERS(&pp->p_cv)) {
		cv_broadcast(&pp->p_cv);
	}
	mutex_exit(se_mtx);
}
/*
 * Counterpart of page_destroy() for retirement: rather than freeing the
 * page, hash it onto the retired_pages vnode and mark it PR_RETIRED.
 *
 * The page must be exclusively locked, not free, a small page, unmapped
 * and without identity.  Its properties are cleared and its contents
 * scrubbed before it is hashed in.  One of the pr_fma / pr_ue / pr_mce
 * kstats is bumped (checked in that order) in addition to pr_retired.
 * availrmem is decremented here, and the page is unlocked on return.
 */
static void
page_retire_destroy(page_t *pp)
{
	u_offset_t hash_off;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_szc == 0);
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(!pp->p_vnode);

	page_clr_all_props(pp);
	pagescrub(pp, 0, MMU_PAGESIZE);

	pp->p_prev = NULL;
	pp->p_next = NULL;

	/* The page_t address itself serves as the offset on retired_pages. */
	hash_off = (u_offset_t)((uintptr_t)pp);
	if (!page_hashin(pp, retired_pages, hash_off, NULL)) {
		cmn_err(CE_PANIC, "retired page %p hashin failed", (void *)pp);
	}

	page_settoxic(pp, PR_RETIRED);
	page_clrtoxic(pp, PR_BUSY);
	page_retire_dequeue(pp);
	PR_INCR_KSTAT(pr_retired);

	/* Attribute the retirement to a single cause: FMA, then UE, then MCE. */
	if (pp->p_toxic & PR_FMA) {
		PR_INCR_KSTAT(pr_fma);
	} else if (pp->p_toxic & PR_UE) {
		PR_INCR_KSTAT(pr_ue);
	} else {
		PR_INCR_KSTAT(pr_mce);
	}

	mutex_enter(&freemem_lock);
	availrmem--;
	mutex_exit(&freemem_lock);

	page_unlock(pp);
}
/* * Page retire self-test. For now, it always returns 0. */ int page_retire_test(void) { page_t *first, *pp, *cpp, *cpp2, *lpp; /* * Tests the corner case where a large page can't be retired * because one of the constituent pages is locked. We mark * one page to be retired and try to retire it, and mark the * other page to be retired but don't try to retire it, so * that page_unlock() in the failure path will recurse and try * to retire THAT page. This is the worst possible situation * we can get ourselves into. */ memsegs_lock(0); pp = first = page_first(); do { if (pp->p_szc && PP_PAGEROOT(pp) == pp) { cpp = pp + 1; lpp = PP_ISFREE(pp)? pp : pp + 2; cpp2 = pp + 3; if (!page_trylock(lpp, pp == lpp? SE_EXCL : SE_SHARED)) continue; if (!page_trylock(cpp, SE_EXCL)) { page_unlock(lpp); continue; } page_settoxic(cpp, PR_FMA | PR_BUSY); page_settoxic(cpp2, PR_FMA); page_tryretire(cpp); /* will fail */ page_unlock(lpp); (void) page_retire(cpp->p_pagenum, PR_FMA); (void) page_retire(cpp2->p_pagenum, PR_FMA); } } while ((pp = page_next(pp)) != first); memsegs_unlock(0); return (0); }
/*
 * page_retire_pp() decides what to do with a failing page.
 *
 * Free pages are the easy case: they are clean, so once reclaimed they
 * retire nicely.  Otherwise we work up by order of difficulty.  If the
 * page has data we first try to relocate its contents to a replacement
 * page.  Whether or not that works, we then check that the page is
 * clean, is not a kernel page and is not locked or COW; if so (the usual
 * case) we unload it, hash it out and retire it.  A page that fails any
 * of those checks is left in service and will be tried again later.
 *
 * We always return with the page unlocked since we are called from
 * page_unlock().  The return value is whatever page_retire_done()
 * returns for the outcome.
 *
 * We don't call panic or do anything fancy down in here (other than for
 * impossible toxic flags).  Our boss the DE gets paid handsomely to
 * figure out what to do when errors occur; we just do what he tells us.
 */
static int
page_retire_pp(page_t *pp)
{
	int	reasons;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_iolock_state == 0);
	ASSERT(pp->p_szc == 0);

	PR_DEBUG(prd_top);
	PR_TYPES(pp);

	reasons = pp->p_toxic;
	ASSERT(reasons & PR_REASONS);

	/*
	 * FMA/MCE requests (but never UEs) are refused once the retire
	 * limit has been reached; the request is cleared off the page.
	 */
	if ((reasons & (PR_FMA | PR_MCE)) && !(reasons & PR_UE) &&
	    page_retire_limit()) {
		page_clrtoxic(pp, PR_FMA | PR_MCE | PR_MSG | PR_BUSY);
		page_retire_dequeue(pp);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_LIMIT));
	}

	if (PP_ISFREE(pp)) {
		int skip_reclaim = MTBF(recl_calls, recl_mtbf) == 0;

		PR_DEBUG(prd_free);

		if (skip_reclaim) {
			/* Injected failure: we still hold the page lock. */
			PR_DEBUG(prd_noreclaim);
			goto failed;
		}
		if (!page_reclaim(pp, NULL)) {
			/*
			 * page_reclaim() returns with `pp' unlocked when
			 * it fails, so there is nothing to unlock here.
			 */
			PR_DEBUG(prd_noreclaim);
			PR_INCR_KSTAT(pr_failed);
			return (page_retire_done(pp, PRD_FAILED));
		}
	}
	ASSERT(!PP_ISFREE(pp));

	if ((reasons & PR_UE) == 0 && pp->p_vnode && !PP_ISNORELOCKERNEL(pp) &&
	    MTBF(reloc_calls, reloc_mtbf)) {
		page_t *replacement;
		spgcnt_t npgs;

		/*
		 * If we can relocate the page, great!  The replacement
		 * will go on without us, and everything is fine.
		 * Regardless of whether the relocation succeeds, we are
		 * still going to take `pp' around back and shoot it.
		 */
		replacement = NULL;
		if (page_relocate(&pp, &replacement, 0, 0, &npgs, NULL) == 0) {
			PR_DEBUG(prd_reloc);
			page_unlock(replacement);
			ASSERT(hat_page_getattr(pp, P_MOD) == 0);
		} else {
			PR_DEBUG(prd_relocfail);
		}
	}

	if (hat_ismod(pp)) {
		PR_DEBUG(prd_mod);
		goto failed;
	}

	if (PP_ISKVP(pp)) {
		PR_DEBUG(prd_kern);
		PR_INCR_KSTAT(pr_failed_kernel);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_FAILED));
	}

	if (pp->p_lckcnt || pp->p_cowcnt) {
		PR_DEBUG(prd_locked);
		goto failed;
	}

	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	ASSERT(!hat_page_is_mapped(pp));

	/*
	 * If the page is modified, and was not relocated; we can't
	 * retire it without dropping data on the floor.  We have to
	 * recheck after unloading since the dirty bit could have been
	 * set since we last checked.
	 */
	if (hat_ismod(pp)) {
		PR_DEBUG(prd_mod_late);
		goto failed;
	}

	if (pp->p_vnode) {
		PR_DEBUG(prd_hashout);
		page_hashout(pp, NULL);
	}
	ASSERT(!pp->p_vnode);

	/*
	 * The problem page is locked, demoted, unmapped, not free,
	 * hashed out, and not COW or mlocked (whew!).
	 *
	 * Now we select our ammunition, take it around back, and shoot it.
	 */
	if (reasons & PR_UE) {
		if (page_retire_transient_ue(pp)) {
			PR_DEBUG(prd_uescrubbed);
			return (page_retire_done(pp, PRD_UE_SCRUBBED));
		}
		PR_DEBUG(prd_uenotscrubbed);
	} else if (reasons & PR_FMA) {
		PR_DEBUG(prd_fma);
	} else if (reasons & PR_MCE) {
		PR_DEBUG(prd_mce);
	} else {
		panic("page_retire_pp: bad toxic flags %d", reasons);
		/*NOTREACHED*/
	}

	page_retire_destroy(pp);
	return (page_retire_done(pp, PRD_SUCCESS));

failed:
	/* Common exit for failures that still hold the page lock. */
	PR_INCR_KSTAT(pr_failed);
	page_unlock(pp);
	return (page_retire_done(pp, PRD_FAILED));
}
/*
 * Process a vnode's page list for all pages whose offset is >= off.
 * Pages are to either be free'd, invalidated, or written back to disk.
 *
 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 * is specified, otherwise they are "shared" locked.
 *
 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 *
 * Special marker page_t's are inserted in the list in order
 * to keep track of where we are in the list when locks are dropped.
 *
 * Note the list is circular and insertions can happen only at the
 * head and tail of the list. The algorithm ensures visiting all pages
 * on the list in the following way:
 *
 *    Drop two marker pages at the end of the list.
 *
 *    Move one marker page backwards towards the start of the list until
 *    it is at the list head, processing the pages passed along the way.
 *
 *    Due to race conditions when the vphm mutex is dropped, additional pages
 *    can be added to either end of the list, so we'll continue to move
 *    the marker and process pages until it is up against the end marker.
 *
 * There is one special exit condition. If we are processing a VMODSORT
 * vnode and only writing back modified pages, we can stop as soon as
 * we run into an unmodified page.  This makes fsync(3) operations fast.
 *
 * Parameters:
 *	vp	 - vnode whose v_pages list is walked; must not be VCHR.
 *	off	 - pages with p_offset below this value are skipped.
 *	putapage - called for every page that pvn_getdirty() reports must
 *		   be written back.
 *	flags	 - B_* flags listed above; handed on to pvn_getdirty() and
 *		   putapage.
 *	cred	 - handed on to putapage.
 *
 * Returns 0 when the vnode has no pages or every putapage call returned
 * 0, EAGAIN when B_ASYNC is set and another thread already holds VVMLOCK,
 * and otherwise the first non-zero value returned by putapage.
 */
int
pvn_vplist_dirty(
	vnode_t		*vp,
	u_offset_t	off,
	int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
			size_t *, int, cred_t *),
	int		flags,
	cred_t		*cred)
{
	page_t		*pp;
	page_t		*mark;		/* marker page that moves toward head */
	page_t		*end;		/* marker page at end of list */
	int		err = 0;	/* first putapage failure, if any */
	int		error;
	kmutex_t	*vphm;
	se_t		se;
	page_t		**where_to_move; /* list slot just before `pp' */

	ASSERT(vp->v_type != VCHR);

	/* Unlocked fast path; rechecked below under v_lock and vphm. */
	if (vp->v_pages == NULL)
		return (0);

	/*
	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
	 *
	 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
	 * from getting blocked while flushing pages to a dead NFS server.
	 */
	mutex_enter(&vp->v_lock);
	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
		mutex_exit(&vp->v_lock);
		return (EAGAIN);
	}

	while (vp->v_flag & VVMLOCK)
		cv_wait(&vp->v_cv, &vp->v_lock);

	if (vp->v_pages == NULL) {
		mutex_exit(&vp->v_lock);
		return (0);
	}

	vp->v_flag |= VVMLOCK;
	mutex_exit(&vp->v_lock);

	/*
	 * Set up the marker pages used to walk the list.  The markers get
	 * offsets of -2 (end) and -1 (mark).
	 */
	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
	end->p_vnode = vp;
	end->p_offset = (u_offset_t)-2;
	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
	mark->p_vnode = vp;
	mark->p_offset = (u_offset_t)-1;

	/*
	 * Grab the lock protecting the vnode's page list
	 * note that this lock is dropped at times in the loop.
	 */
	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);
	if (vp->v_pages == NULL)
		goto leave;

	/*
	 * insert the markers and loop through the list of pages
	 */
	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
	page_vpadd(&mark->p_vpnext, end);
	for (;;) {

		/*
		 * If only doing an async write back, then we can
		 * stop as soon as we get to start of the list.
		 * Note this tests for flags being exactly B_ASYNC.
		 */
		if (flags == B_ASYNC && vp->v_pages == mark)
			break;

		/*
		 * otherwise stop when we've gone through all the pages
		 */
		if (mark->p_vpprev == end)
			break;

		/*
		 * `pp' is the page just before the marker.  Remember the
		 * list slot in front of it so the marker can later be
		 * re-inserted there, i.e. moved past `pp'.
		 */
		pp = mark->p_vpprev;
		if (vp->v_pages == pp)
			where_to_move = &vp->v_pages;
		else
			where_to_move = &pp->p_vpprev->p_vpnext;

		ASSERT(pp->p_vnode == vp);

		/*
		 * If just flushing dirty pages to disk and this vnode
		 * is using a sorted list of pages, we can stop processing
		 * as soon as we find an unmodified page. Since all the
		 * modified pages are visited first.
		 */
		if (IS_VMODSORT(vp) &&
		    !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
			if (!hat_ismod(pp) && !page_io_locked(pp)) {
#ifdef DEBUG
				/*
				 * For debug kernels examine what should be
				 * all the remaining clean pages, asserting
				 * that they are not modified.
				 */
				page_t	*chk = pp;
				int	attr;

				page_vpsub(&vp->v_pages, mark);
				page_vpadd(where_to_move, mark);
				do {
					chk = chk->p_vpprev;
					ASSERT(chk != end);
					if (chk == mark)
						continue;
					attr = hat_page_getattr(chk, P_MOD |
					    P_REF);
					if ((attr & P_MOD) == 0)
						continue;
					panic("v_pages list not all clean: "
					    "page_t*=%p vnode=%p off=%lx "
					    "attr=0x%x last clean page_t*=%p\n",
					    (void *)chk, (void *)chk->p_vnode,
					    (long)chk->p_offset, attr,
					    (void *)pp);
				} while (chk != vp->v_pages);
#endif
				break;
			} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
				/*
				 * The page is clean but io-locked (the
				 * branch above did not take it), so wait
				 * until the IO is done and look again.
				 * Block only for sync IO since we don't want
				 * to block async IO.
				 */
				mutex_exit(vphm);
				page_io_wait(pp);
				mutex_enter(vphm);
				continue;
			}
		}

		/*
		 * Skip this page if the offset is out of the desired range.
		 * Just move the marker and continue.
		 */
		if (pp->p_offset < off) {
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
			continue;
		}

		/*
		 * If we are supposed to invalidate or free this
		 * page, then we need an exclusive lock.
		 */
		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		/*
		 * We must acquire the page lock for all synchronous
		 * operations (invalidate, free and write).
		 */
		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
			/*
			 * If the page_lock() drops the mutex
			 * we must retry the loop.  The marker has not been
			 * moved yet, so the retry looks at the same slot.
			 */
			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
				continue;

			/*
			 * It's ok to move the marker page now.
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
		} else {

			/*
			 * update the marker page for all remaining cases
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);

			/*
			 * For write backs, If we can't lock the page, it's
			 * invalid or in the process of being destroyed.  Skip
			 * it, assuming someone else is writing it.
			 */
			if (!page_trylock(pp, se))
				continue;
		}

		ASSERT(pp->p_vnode == vp);

		/*
		 * Successfully locked the page, now figure out what to
		 * do with it. Free pages are easily dealt with, invalidate
		 * if desired or just go on to the next page.
		 */
		if (PP_ISFREE(pp)) {
			if ((flags & B_INVAL) == 0) {
				page_unlock(pp);
				continue;
			}

			/*
			 * Invalidate (destroy) the page.  The list mutex is
			 * dropped around the call.
			 */
			mutex_exit(vphm);
			page_destroy_free(pp);
			mutex_enter(vphm);
			continue;
		}

		/*
		 * pvn_getdirty() figures out what to do with a dirty page.
		 * If the page is dirty, the putapage() routine will write it
		 * and will kluster any other adjacent dirty pages it can.
		 *
		 * pvn_getdirty() and `(*putapage)' unlock the page.
		 */
		mutex_exit(vphm);
		if (pvn_getdirty(pp, flags)) {
			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
			/* Only the first failure is reported to the caller. */
			if (!err)
				err = error;
		}
		mutex_enter(vphm);
	}
	page_vpsub(&vp->v_pages, mark);
	page_vpsub(&vp->v_pages, end);

leave:
	/*
	 * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds
	 */
	mutex_exit(vphm);
	kmem_cache_free(marker_cache, mark);
	kmem_cache_free(marker_cache, end);
	mutex_enter(&vp->v_lock);
	vp->v_flag &= ~VVMLOCK;
	cv_broadcast(&vp->v_cv);
	mutex_exit(&vp->v_lock);
	return (err);
}
/*
 * Decide whether a locked page has to be written back.
 *
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
 * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a
 * kluster operation and is only to be considered if it doesn't involve
 * any waiting here.  B_TRUNC indicates that the file is being truncated
 * and so no i/o needs to be done.  B_FORCE indicates that the page must
 * be destroyed, so don't try writing it out.
 *
 * The caller must ensure that the page is locked (exclusively when
 * B_INVAL or B_FREE is set, shared otherwise).  Returns 1 if the page
 * should be written back (the "iolock" is held in this case), or 0 if
 * the page has been dealt with or has been unlocked.
 */
int
pvn_getdirty(page_t *pp, int flags)
{
	int	inval_or_free = flags & (B_INVAL | B_FREE);
	int	nowait;

	ASSERT(inval_or_free ? PAGE_EXCL(pp) : PAGE_SHARED(pp));
	ASSERT(PP_ISFREE(pp) == 0);

	/*
	 * If trying to invalidate or free a logically `locked' page,
	 * forget it.  Don't need page_struct_lock to check p_lckcnt and
	 * p_cowcnt as the page is exclusively locked.
	 */
	if (inval_or_free && !(flags & (B_TRUNC|B_FORCE)) &&
	    (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
		page_unlock(pp);
		return (0);
	}

	/*
	 * Now acquire the i/o lock so we can add the page to the dirty
	 * list (if necessary).  We avoid blocking on the i/o lock in the
	 * following cases:
	 *
	 *	B_DELWRI is set, which implies that this request is due
	 *	to a klustering operation.
	 *
	 *	This is an async (B_ASYNC) operation and we are not doing
	 *	invalidation (B_INVAL) [the current i/o or fsflush will
	 *	ensure that the page is written out].
	 */
	nowait = (flags & B_DELWRI) ||
	    ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC);
	if (!nowait) {
		page_io_lock(pp);
	} else if (!page_io_trylock(pp)) {
		page_unlock(pp);
		return (0);
	}

	/*
	 * If we want to free or invalidate the page then we need to
	 * unload it so that anyone who wants it will have to take a
	 * minor fault to get it.  Otherwise, we're just writing the page
	 * back, so we need to sync up the hardware and software mod bit
	 * to detect any future modifications.  We clear the software mod
	 * bit when we put the page on the dirty list.
	 */
	if (inval_or_free)
		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	else
		(void) hat_pagesync(pp, HAT_SYNC_ZERORM);

	if (hat_ismod(pp) && !(flags & B_TRUNC)) {
		/*
		 * Page is dirty, get it ready for the write back
		 * and add page to the dirty list.
		 */
		hat_clrrefmod(pp);

		/*
		 * If we're going to free the page when we're done then
		 * we can let others try to use it starting now.  We'll
		 * detect the fact that they used it when the i/o is done
		 * and avoid freeing the page.
		 */
		if (flags & B_FREE)
			page_downgrade(pp);

		TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);

		return (1);
	}

	/*
	 * Clean (or being truncated): no need to add it to the list
	 * after all.  Dispose of the page as the flags direct.
	 */
	page_io_unlock(pp);
	if (flags & B_INVAL) {
		/*LINTED: constant in conditional context*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
	} else if (flags & B_FREE) {
		/*LINTED: constant in conditional context*/
		VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
	} else {
		/*
		 * This is the advisory path for callers of VOP_PUTPAGE()
		 * who prefer freeing the page _only_ if no one else is
		 * accessing it, e.g. segmap_release().
		 *
		 * The hat_ismod() check above is useless here because:
		 * (1) we may not be holding SE_EXCL lock;
		 * (2) we've not unloaded _all_ translations
		 *
		 * Let page_release() do the heavy-lifting.
		 */
		(void) page_release(pp, 1);
	}
	return (0);
}
/*
 * Scan page_t's and issue I/O's for modified pages.
 *
 * Also coalesces consecutive small sized free pages into the next larger
 * pagesize. This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 *
 * Each call examines roughly `nscan' pages, where nscan is recomputed as
 * total_pages * tune.t_fsflushr / v.v_autoup whenever total_pages changes.
 * The scan position (`pp') is static, so each call resumes where the
 * previous one stopped.  Per-call counters are published in fsf_recent
 * and accumulated in fsf_total at the end.
 */
static void
fsflush_do_pages()
{
	vnode_t		*vp;
	ulong_t		pcount;		/* pages stepped over in this call */
	hrtime_t	timer = gethrtime();
	ulong_t		releases = 0;
	ulong_t		nexamined = 0;
	ulong_t		nlocked = 0;
	ulong_t		nmodified = 0;
	ulong_t		ncoalesce = 0;
	ulong_t		cnt;		/* how far to advance pp this step */
	int		mod;
	int		fspage = 1;	/* 0 after a page we skipped/rejected */
	u_offset_t	offset;
	uint_t		szc;

	page_t		*coal_page = NULL;  /* 1st page in group to coalesce */
	uint_t		coal_szc = 0;	    /* size code, coal_page->p_szc */
	uint_t		coal_cnt = 0;	    /* count of pages seen */

	/* State that persists from one call to the next. */
	static ulong_t	nscan = 0;
	static pgcnt_t	last_total_pages = 0;
	static page_t	*pp = NULL;

	/*
	 * Check to see if total_pages has changed.
	 */
	if (total_pages != last_total_pages) {
		last_total_pages = total_pages;
		nscan = (last_total_pages * (tune.t_fsflushr))/v.v_autoup;
	}

	/* First call: start from the first memseg's page array. */
	if (pp == NULL)
		pp = memsegs->pages;

	pcount = 0;
	while (pcount < nscan) {

		/*
		 * move to the next page, skipping over large pages
		 * and issuing prefetches.
		 *
		 * When the previous page was part of a large page and was
		 * not a file system page, step by the number of base pages
		 * left in that large page (assumes the page count is a
		 * power of two, as the mask arithmetic implies).
		 */
		if (pp->p_szc && fspage == 0) {
			pfn_t pfn;

			pfn = page_pptonum(pp);
			cnt = page_get_pagecnt(pp->p_szc);
			cnt -= pfn & (cnt - 1);
		} else
			cnt = 1;

		pp = page_nextn(pp, cnt);
		prefetch_page_r((void *)pp);
		ASSERT(pp != NULL);
		pcount += cnt;

		/*
		 * Do a bunch of dirty tests (ie. no locking) to determine
		 * if we can quickly skip this page. These tests are repeated
		 * after acquiring the page lock.
		 */
		++nexamined;
		if (PP_ISSWAP(pp)) {
			fspage = 0;
			coal_page = NULL;
			continue;
		}

		/*
		 * skip free pages too, but try coalescing them into larger
		 * pagesizes
		 */
		if (PP_ISFREE(pp)) {
			/*
			 * skip pages with a file system identity or that
			 * are already maximum size
			 */
			fspage = 0;
			szc = pp->p_szc;
			if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
				coal_page = NULL;
				continue;
			}

			/*
			 * If not in a coalescing candidate page or the size
			 * codes are different, start a new candidate.
			 */
			if (coal_page == NULL || coal_szc != szc) {

				/*
				 * page must be properly aligned
				 */
				if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
					coal_page = NULL;
					continue;
				}
				coal_page = pp;
				coal_szc = szc;
				coal_cnt = 1;
				continue;
			}

			/*
			 * acceptable to add this to existing candidate page
			 */
			++coal_cnt;
			if (coal_cnt < fsf_pgcnt[coal_szc])
				continue;

			/*
			 * We've got enough pages to coalesce, so do it.
			 * After promoting, we clear coal_page, so it will
			 * take another pass to promote this to an even
			 * larger page.
			 */
			++ncoalesce;
			(void) page_promote_size(coal_page, coal_szc);
			coal_page = NULL;
			continue;
		} else {
			/* A non-free page ends any candidate run. */
			coal_page = NULL;
		}

		if (PP_ISKAS(pp) ||
		    PAGE_LOCKED(pp) ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0) {
			fspage = 0;
			continue;
		}

		/*
		 * Reject pages that can't be "exclusively" locked.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;
		++nlocked;

		/*
		 * After locking the page, redo the above checks.
		 * Since we locked the page, leave out the PAGE_LOCKED() test.
		 */
		vp = pp->p_vnode;
		if (PP_ISSWAP(pp) ||
		    PP_ISFREE(pp) ||
		    vp == NULL ||
		    PP_ISKAS(pp) ||
		    (vp->v_flag & VISSWAP) != 0) {
			page_unlock(pp);
			fspage = 0;
			continue;
		}
		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
			page_unlock(pp);
			continue;
		}

		fspage = 1;
		ASSERT(vp->v_type != VCHR);

		/*
		 * Check the modified bit. Leaving the bit alone in hardware.
		 * It will be cleared if we do the putpage.
		 */
		if (IS_VMODSORT(vp))
			mod = hat_ismod(pp);
		else
			mod = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;

		if (mod) {
			++nmodified;
			/* Capture the offset while the page is still locked. */
			offset = pp->p_offset;

			/*
			 * Hold the vnode before releasing the page lock
			 * to prevent it from being freed and re-used by
			 * some other thread.
			 */
			VN_HOLD(vp);
			page_unlock(pp);

			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
			    kcred, NULL);

			VN_RELE(vp);
		} else {

			/*
			 * Catch any pages which should be on the cache list,
			 * but aren't yet.
			 */
			if (hat_page_is_mapped(pp) == 0) {
				++releases;
				(void) page_release(pp, 1);
			} else {
				page_unlock(pp);
			}
		}
	}

	/*
	 * maintain statistics
	 * reset every million wakeups, just to avoid overflow
	 *
	 * Note that on the reset pass fsf_recent is left untouched and
	 * this call's counters are not added to fsf_total.
	 */
	if (++fsf_cycles == 1000000) {
		fsf_cycles = 0;
		fsf_total.fsf_scan = 0;
		fsf_total.fsf_examined = 0;
		fsf_total.fsf_locked = 0;
		fsf_total.fsf_modified = 0;
		fsf_total.fsf_coalesce = 0;
		fsf_total.fsf_time = 0;
		fsf_total.fsf_releases = 0;
	} else {
		fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
		fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
		fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
		fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
		fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
		fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
		fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
	}
}
/*
 * Find, take and return a mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Return NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote()
 * i.e. non swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 * since the page is locked and not free.
 *
 * Hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 * of the root this routine relies on the fact that hat_page_demote() changes
 * root last.
 *
 * If NULL is returned pp's p_szc is guaranteed to be 0.  If non NULL is
 * returned pp's p_szc may be any value, and the returned mutex is held;
 * the caller is responsible for dropping it.
 *
 * `pp' must be non-NULL, locked, not free, hashed on a non-swapfs vnode
 * and not a kas page (all asserted on DEBUG kernels).
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
	kmutex_t	*mtx;
	page_t		*rootpp;
	uint_t		szc;
	uint_t		rszc;
	uint_t		pszc;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
	ASSERT(!PP_ISKAS(pp));

	/*
	 * Sample p_szc only after the argument checks above, so that
	 * ASSERT(pp != NULL) fires before `pp' is first dereferenced.
	 */
	pszc = pp->p_szc;

again:
	if (pszc == 0) {
		VM_STAT_ADD(pszclck_stat[0]);
		return (NULL);
	}

	/* The lock lives in the root page */
	rootpp = PP_GROUPLEADER(pp, pszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);

	/*
	 * since p_szc can only decrease if pp == rootpp
	 * rootpp will be always the same i.e we have the right root
	 * regardless of rootpp->p_szc.
	 * If location of pp's root didn't change after we took
	 * the lock we have the right root. return mutex hashed off it.
	 */
	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
		VM_STAT_ADD(pszclck_stat[1]);
		return (mtx);
	}

	/*
	 * root location changed because page got demoted.
	 * locate the new root.
	 */
	if (rszc < pszc) {
		szc = pp->p_szc;
		ASSERT(szc < pszc);
		mutex_exit(mtx);
		pszc = szc;
		VM_STAT_ADD(pszclck_stat[2]);
		goto again;
	}

	VM_STAT_ADD(pszclck_stat[3]);
	/*
	 * current hat_page_demote not done yet.
	 * wait for it to finish: take and immediately drop the mutex of
	 * the larger group's root, then retry from the top.
	 */
	mutex_exit(mtx);
	rootpp = PP_GROUPLEADER(rootpp, rszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);
	mutex_exit(mtx);
	ASSERT(rootpp->p_szc < rszc);
	goto again;
}
/*
 * Non-blocking attempt to lock a page; returns 1 if the lock was
 * obtained and 0 otherwise.  Read the comments inside of page_lock_es()
 * carefully.
 *
 * A shared request on a free page is treated as an exclusive request.
 *
 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 * This is used by threads subject to reader-starvation (eg. memory delete).
 *
 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 * it is expected that it will retry at a later time.  Threads that will
 * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 * the bit is cleared.)
 */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t cur;

	mutex_enter(pse);

	cur = pp->p_selock;

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	/* Retired pages are off limits unless the caller asks for them. */
	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && cur == 0) {
		se = SE_EXCL;
	}

	if (se == SE_SHARED) {
		if (!PP_ISFREE(pp)) {
			int granted = 0;

			/*
			 * Readers are admitted only when no writer holds
			 * the page and no exclusive request is pending.
			 */
			if (cur >= 0 && (cur & SE_EWANTED) == 0) {
				pp->p_selock = cur + SE_READER;
				granted = 1;
			}
			mutex_exit(pse);
			return (granted);
		}
		/*
		 * The page is free, so we really want SE_EXCL (below)
		 */
		VM_STAT_ADD(page_try_reclaim_upgrade);
	}

	/*
	 * The caller wants a writer lock.  We try for it only if
	 * SE_EWANTED is not set, or if the caller specified
	 * SE_EXCL_WANTED, and then only if no reader/writer lock is held.
	 */
	if ((!(cur & SE_EWANTED) || (es & SE_EXCL_WANTED)) &&
	    (cur & ~SE_EWANTED) == 0) {
		THREAD_KPRI_REQUEST();
		/* this clears out our setting of the SE_EWANTED bit */
		pp->p_selock = SE_WRITER;
		mutex_exit(pse);
		return (1);
	}

	if (es & SE_EXCL_WANTED) {
		/* page is locked, set the SE_EWANTED bit */
		pp->p_selock |= SE_EWANTED;
	}
	mutex_exit(pse);
	return (0);
}
/*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may avoid being starved by
 * shared-lock (reader) holders by setting the es parameter to
 * SE_EXCL_WANTED.  In this case, when an exclusive lock cannot be
 * acquired, p_selock's SE_EWANTED bit is set.  Shared-lock (reader)
 * requests are also denied if the page is slated for retirement.
 *
 * The se and es parameters determine if the lock should be granted
 * based on the following decision table:
 *
 * Lock wanted   es flags        p_selock/SE_EWANTED  Action
 * -----------   --------------  -------------------  ---------
 * SE_EXCL       any [1][2]      unlocked/any         grant lock, clear SE_EWANTED
 * SE_EXCL       SE_EXCL_WANTED  any lock/any         deny, set SE_EWANTED
 * SE_EXCL       none            any lock/any         deny
 * SE_SHARED     n/a [2]         shared/0             grant
 * SE_SHARED     n/a [2]         unlocked/0           grant
 * SE_SHARED     n/a             shared/1             deny
 * SE_SHARED     n/a             unlocked/1           deny
 * SE_SHARED     n/a             excl/any             deny
 *
 * Notes:
 * [1] The code grants an exclusive lock to the caller and clears the bit
 *   SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 *   bit's value.  This was deemed acceptable as we are not concerned about
 *   exclusive-lock starvation. If this ever becomes an issue, a priority or
 *   fifo mechanism should also be implemented. Meantime, the thread that
 *   set SE_EWANTED should be prepared to catch this condition and reset it
 *
 *   As coded below there is one exception: a caller that did NOT pass
 *   SE_EXCL_WANTED is denied whenever SE_EWANTED is set, even if the page
 *   is otherwise unlocked.
 *
 * [2] Retired pages may not be locked at any time, regardless of the
 *   disposition of se, unless the es parameter has SE_RETIRED flag set.
 *
 * Notes on values of "es":
 *
 *   es & 1: page_lookup_create will attempt page relocation
 *   es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
 *       memory thread); this prevents reader-starvation of waiting
 *       writer thread(s) by giving priority to writers over readers.
 *   es & SE_RETIRED: caller wants to lock pages even if they are
 *       retired.  Default is to deny the lock if the page is retired.
 *
 * Parameters:
 *   pp      - page to lock.
 *   se      - SE_SHARED or SE_EXCL.
 *   lock    - optional mutex held by the caller; it is dropped while this
 *             routine sleeps and re-taken before returning.
 *   reclaim - P_RECLAIM asks for a free page to be reclaimed from the
 *             free list once locked.
 *   es      - flags described above.
 *
 * Returns 1 with the page locked, or 0 without it.  A 0 return may come
 * after this routine has slept waiting for the page (and so after `lock'
 * was dropped); the lock is not retried here, the caller has to start over.
 *
 * And yes, we know, the semantics of this function are too complicated.
 * It's on the list to be cleaned up.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;	/* shared request turned into excl */
	int		reclaim_it;	/* page was free and must be reclaimed */

	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	/* SE_EXCL_WANTED is only meaningful together with SE_EXCL. */
	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	/* Retired pages are denied without waiting, see note [2]. */
	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	/*
	 * es == 1 with an entirely unlocked page: take the lock
	 * exclusively instead of shared.
	 */
	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		/*
		 * Shared request: granted only if no writer holds the
		 * page (p_selock >= 0) and no exclusive request is pending.
		 */
		retval = 0;
		if (pp->p_selock >= 0) {
			if ((pp->p_selock & SE_EWANTED) == 0) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		/* A deleted page will never be unlocked; don't wait for it. */
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured, since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation) we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (ie, it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE:page_reclaim() releases the page lock (p_selock)
		 * if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				/* Give back the shared lock the caller asked for. */
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}