/* * The page represented by pp is being given back to the hypervisor. * Add the page_t structure to our spare list. */ static void balloon_page_add(page_t *pp) { /* * We need to keep the page exclusively locked * to prevent swrand from grabbing it. */ ASSERT(PAGE_EXCL(pp)); ASSERT(MUTEX_HELD(&bln_mutex)); pp->p_prev = NULL; if (bln_spare_list_front == NULL) { bln_spare_list_front = bln_spare_list_back = pp; pp->p_next = NULL; } else if (pp->p_pagenum >= mfn_count) { /* * The pfn is invalid, so add at the end of list. Since these * adds should *only* be done by balloon_init_new_pages(), and * that does adds in order, the following ASSERT should * never trigger. */ ASSERT(pp->p_pagenum > bln_spare_list_back->p_pagenum); bln_spare_list_back->p_next = pp; pp->p_next = NULL; bln_spare_list_back = pp; } else { /* Add at beginning of list */ pp->p_next = bln_spare_list_front; bln_spare_list_front = pp; } }
/* * Try to retire a page when we stumble onto it in the page lock routines. */ void page_tryretire(page_t *pp) { ASSERT(PAGE_EXCL(pp)); if (!pr_enable) { page_unlock(pp); return; } /* * If the page is a big page, try to break it up. * * If there are other bad pages besides `pp', they will be * recursively retired for us thanks to a bit of magic. * If the page is a small page with errors, try to retire it. */ if (pp->p_szc > 0) { if (PP_ISFREE(pp) && !page_try_demote_free_pages(pp)) { page_unlock(pp); PR_DEBUG(prd_nofreedemote); return; } else if (!page_try_demote_pages(pp)) { page_unlock(pp); PR_DEBUG(prd_nodemote); return; } PR_DEBUG(prd_demoted); page_unlock(pp); } else { (void) page_retire_pp(pp); } }
/* * Take a retired page off the retired-pages vnode and clear the toxic flags. * If "free" is nonzero, lock it and put it back on the freelist. If "free" * is zero, the caller already holds SE_EXCL lock so we simply unretire it * and don't do anything else with it. * * Any unretire messages are printed from this routine. * * Returns 0 if page pp was unretired; else an error code. */ int page_unretire_pp(page_t *pp, int free) { /* * To be retired, a page has to be hashed onto the retired_pages vnode * and have PR_RETIRED set in p_toxic. */ if (free == 0 || page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) { ASSERT(PAGE_EXCL(pp)); PR_DEBUG(prd_ulocked); if (!PP_RETIRED(pp)) { PR_DEBUG(prd_unotretired); page_unlock(pp); return (page_retire_done(pp, PRD_UNR_NOT)); } PR_MESSAGE(CE_NOTE, 1, "unretiring retired" " page 0x%08x.%08x", mmu_ptob((uint64_t)pp->p_pagenum)); if (pp->p_toxic & PR_FMA) { PR_DECR_KSTAT(pr_fma); } else if (pp->p_toxic & PR_UE) { PR_DECR_KSTAT(pr_ue); } else { PR_DECR_KSTAT(pr_mce); } page_clrtoxic(pp, PR_ALLFLAGS); if (free) { PR_DEBUG(prd_udestroy); page_destroy(pp, 0); } else { PR_DEBUG(prd_uhashout); page_hashout(pp, NULL); } mutex_enter(&freemem_lock); availrmem++; mutex_exit(&freemem_lock); PR_DEBUG(prd_uunretired); PR_DECR_KSTAT(pr_retired); PR_INCR_KSTAT(pr_unretired); return (page_retire_done(pp, PRD_UNR_SUCCESS)); } PR_DEBUG(prd_unotlocked); return (page_retire_done(pp, PRD_UNR_CANTLOCK)); }
void page_lock_delete(page_t *pp) { kmutex_t *pse = PAGE_SE_MUTEX(pp); ASSERT(PAGE_EXCL(pp)); ASSERT(pp->p_vnode == NULL); ASSERT(pp->p_offset == (u_offset_t)-1); ASSERT(!PP_ISFREE(pp)); mutex_enter(pse); THREAD_KPRI_RELEASE(); pp->p_selock = SE_DELETED; if (CV_HAS_WAITERS(&pp->p_cv)) cv_broadcast(&pp->p_cv); mutex_exit(pse); }
/* * Downgrade the "exclusive" lock on the page to a "shared" lock * while holding the mutex protecting this page's p_selock field. */ void page_downgrade(page_t *pp) { kmutex_t *pse = PAGE_SE_MUTEX(pp); int excl_waiting; ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED); ASSERT(PAGE_EXCL(pp)); mutex_enter(pse); excl_waiting = pp->p_selock & SE_EWANTED; THREAD_KPRI_RELEASE(); pp->p_selock = SE_READER | excl_waiting; if (CV_HAS_WAITERS(&pp->p_cv)) cv_broadcast(&pp->p_cv); mutex_exit(pse); }
/* * Try to clear a page_t with a single UE. If the UE was transient, it is * returned to service, and we return 1. Otherwise we return 0 meaning * that further processing is required to retire the page. */ static int page_retire_transient_ue(page_t *pp) { ASSERT(PAGE_EXCL(pp)); ASSERT(!hat_page_is_mapped(pp)); /* * If this page is a repeat offender, retire him under the * "two strikes and you're out" rule. The caller is responsible * for scrubbing the page to try to clear the error. */ if (pp->p_toxic & PR_UE_SCRUBBED) { PR_INCR_KSTAT(pr_ue_persistent); return (0); } if (page_clear_transient_ue(pp)) { /* * We set the PR_SCRUBBED_UE bit; if we ever see this * page again, we will retire it, no questions asked. */ page_settoxic(pp, PR_UE_SCRUBBED); if (page_retire_first_ue) { PR_INCR_KSTAT(pr_ue_cleared_retire); return (0); } else { PR_INCR_KSTAT(pr_ue_cleared_free); page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG | PR_BUSY); page_retire_dequeue(pp); /* LINTED: CONSTCOND */ VN_DISPOSE(pp, B_FREE, 1, kcred); return (1); } } PR_INCR_KSTAT(pr_ue_persistent); return (0); }
/* * Act like page_destroy(), but instead of freeing the page, hash it onto * the retired_pages vnode, and mark it retired. * * For fun, we try to scrub the page until it's squeaky clean. * availrmem is adjusted here. */ static void page_retire_destroy(page_t *pp) { u_offset_t off = (u_offset_t)((uintptr_t)pp); ASSERT(PAGE_EXCL(pp)); ASSERT(!PP_ISFREE(pp)); ASSERT(pp->p_szc == 0); ASSERT(!hat_page_is_mapped(pp)); ASSERT(!pp->p_vnode); page_clr_all_props(pp); pagescrub(pp, 0, MMU_PAGESIZE); pp->p_next = NULL; pp->p_prev = NULL; if (page_hashin(pp, retired_pages, off, NULL) == 0) { cmn_err(CE_PANIC, "retired page %p hashin failed", (void *)pp); } page_settoxic(pp, PR_RETIRED); page_clrtoxic(pp, PR_BUSY); page_retire_dequeue(pp); PR_INCR_KSTAT(pr_retired); if (pp->p_toxic & PR_FMA) { PR_INCR_KSTAT(pr_fma); } else if (pp->p_toxic & PR_UE) { PR_INCR_KSTAT(pr_ue); } else { PR_INCR_KSTAT(pr_mce); } mutex_enter(&freemem_lock); availrmem--; mutex_exit(&freemem_lock); page_unlock(pp); }
/* * Return a page_t structure from our spare list, or NULL if none are available. */ static page_t * balloon_page_sub(void) { page_t *pp; ASSERT(MUTEX_HELD(&bln_mutex)); if (bln_spare_list_front == NULL) { return (NULL); } pp = bln_spare_list_front; ASSERT(PAGE_EXCL(pp)); ASSERT(pp->p_pagenum <= mfn_count); if (pp->p_pagenum == mfn_count) { return (NULL); } bln_spare_list_front = pp->p_next; if (bln_spare_list_front == NULL) bln_spare_list_back = NULL; pp->p_next = NULL; return (pp); }
static void segkmem_free_one_lp(caddr_t addr, size_t size) { page_t *pp, *rootpp = NULL; pgcnt_t pgs_left = btopr(size); ASSERT(size == segkmem_lpsize); hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK); for (; pgs_left > 0; addr += PAGESIZE, pgs_left--) { pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL); if (pp == NULL) panic("segkmem_free_one_lp: page not found"); ASSERT(PAGE_EXCL(pp)); pp->p_lckcnt = 0; if (rootpp == NULL) rootpp = pp; } ASSERT(rootpp != NULL); page_destroy_pages(rootpp); /* page_unresv() is done by the caller */ }
/* * Attempt to clear a UE from a page. * Returns 1 if the error has been successfully cleared. */ static int page_clear_transient_ue(page_t *pp) { caddr_t kaddr; uint8_t rb, wb; uint64_t pa; uint32_t pa_hi, pa_lo; on_trap_data_t otd; int errors = 0; int i; ASSERT(PAGE_EXCL(pp)); ASSERT(PP_PR_REQ(pp)); ASSERT(pp->p_szc == 0); ASSERT(!hat_page_is_mapped(pp)); /* * Clear the page and attempt to clear the UE. If we trap * on the next access to the page, we know the UE has recurred. */ pagescrub(pp, 0, PAGESIZE); /* * Map the page and write a bunch of bit patterns to compare * what we wrote with what we read back. This isn't a perfect * test but it should be good enough to catch most of the * recurring UEs. If this fails to catch a recurrent UE, we'll * retire the page the next time we see a UE on the page. */ kaddr = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)-1); pa = ptob((uint64_t)page_pptonum(pp)); pa_hi = (uint32_t)(pa >> 32); pa_lo = (uint32_t)pa; /* * Fill the page with each (0x00 - 0xFF] bit pattern, flushing * the cache in between reading and writing. We do this under * on_trap() protection to avoid recursion. */ if (on_trap(&otd, OT_DATA_EC)) { PR_MESSAGE(CE_WARN, 1, MSG_UE, pa); errors = 1; } else { for (wb = 0xff; wb > 0; wb--) { for (i = 0; i < PAGESIZE; i++) { kaddr[i] = wb; } sync_data_memory(kaddr, PAGESIZE); for (i = 0; i < PAGESIZE; i++) { rb = kaddr[i]; if (rb != wb) { /* * We had a mismatch without a trap. * Uh-oh. Something is really wrong * with this system. */ if (page_retire_messages) { cmn_err(CE_WARN, MSG_DM, pa_hi, pa_lo, rb, wb); } errors = 1; goto out; /* double break */ } } } } out: no_trap(); ppmapout(kaddr); return (errors ? 0 : 1); }
/* * Note that multiple bits may cleared in a single clrtoxic operation. * Must be called with the page exclusively locked to prevent races which * may attempt to retire a page without any toxic bits set. */ void page_clrtoxic(page_t *pp, uchar_t bits) { ASSERT(PAGE_EXCL(pp)); atomic_and_8(&pp->p_toxic, ~bits); }
/* * page_retire_pp() decides what to do with a failing page. * * When we get a free page (e.g. the scrubber or in the free path) life is * nice because the page is clean and marked free -- those always retire * nicely. From there we go by order of difficulty. If the page has data, * we attempt to relocate its contents to a suitable replacement page. If * that does not succeed, we look to see if it is clean. If after all of * this we have a clean, unmapped page (which we usually do!), we retire it. * If the page is not clean, we still process it regardless on a UE; for * CEs or FMA requests, we fail leaving the page in service. The page will * eventually be tried again later. We always return with the page unlocked * since we are called from page_unlock(). * * We don't call panic or do anything fancy down in here. Our boss the DE * gets paid handsomely to do his job of figuring out what to do when errors * occur. We just do what he tells us to do. */ static int page_retire_pp(page_t *pp) { int toxic; ASSERT(PAGE_EXCL(pp)); ASSERT(pp->p_iolock_state == 0); ASSERT(pp->p_szc == 0); PR_DEBUG(prd_top); PR_TYPES(pp); toxic = pp->p_toxic; ASSERT(toxic & PR_REASONS); if ((toxic & (PR_FMA | PR_MCE)) && !(toxic & PR_UE) && page_retire_limit()) { page_clrtoxic(pp, PR_FMA | PR_MCE | PR_MSG | PR_BUSY); page_retire_dequeue(pp); page_unlock(pp); return (page_retire_done(pp, PRD_LIMIT)); } if (PP_ISFREE(pp)) { int dbgnoreclaim = MTBF(recl_calls, recl_mtbf) == 0; PR_DEBUG(prd_free); if (dbgnoreclaim || !page_reclaim(pp, NULL)) { PR_DEBUG(prd_noreclaim); PR_INCR_KSTAT(pr_failed); /* * page_reclaim() returns with `pp' unlocked when * it fails. */ if (dbgnoreclaim) page_unlock(pp); return (page_retire_done(pp, PRD_FAILED)); } } ASSERT(!PP_ISFREE(pp)); if ((toxic & PR_UE) == 0 && pp->p_vnode && !PP_ISNORELOCKERNEL(pp) && MTBF(reloc_calls, reloc_mtbf)) { page_t *newpp; spgcnt_t count; /* * If we can relocate the page, great! newpp will go * on without us, and everything is fine. Regardless * of whether the relocation succeeds, we are still * going to take `pp' around back and shoot it. */ newpp = NULL; if (page_relocate(&pp, &newpp, 0, 0, &count, NULL) == 0) { PR_DEBUG(prd_reloc); page_unlock(newpp); ASSERT(hat_page_getattr(pp, P_MOD) == 0); } else { PR_DEBUG(prd_relocfail); } } if (hat_ismod(pp)) { PR_DEBUG(prd_mod); PR_INCR_KSTAT(pr_failed); page_unlock(pp); return (page_retire_done(pp, PRD_FAILED)); } if (PP_ISKVP(pp)) { PR_DEBUG(prd_kern); PR_INCR_KSTAT(pr_failed_kernel); page_unlock(pp); return (page_retire_done(pp, PRD_FAILED)); } if (pp->p_lckcnt || pp->p_cowcnt) { PR_DEBUG(prd_locked); PR_INCR_KSTAT(pr_failed); page_unlock(pp); return (page_retire_done(pp, PRD_FAILED)); } (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); ASSERT(!hat_page_is_mapped(pp)); /* * If the page is modified, and was not relocated; we can't * retire it without dropping data on the floor. We have to * recheck after unloading since the dirty bit could have been * set since we last checked. */ if (hat_ismod(pp)) { PR_DEBUG(prd_mod_late); PR_INCR_KSTAT(pr_failed); page_unlock(pp); return (page_retire_done(pp, PRD_FAILED)); } if (pp->p_vnode) { PR_DEBUG(prd_hashout); page_hashout(pp, NULL); } ASSERT(!pp->p_vnode); /* * The problem page is locked, demoted, unmapped, not free, * hashed out, and not COW or mlocked (whew!). * * Now we select our ammunition, take it around back, and shoot it. */ if (toxic & PR_UE) { if (page_retire_transient_ue(pp)) { PR_DEBUG(prd_uescrubbed); return (page_retire_done(pp, PRD_UE_SCRUBBED)); } else { PR_DEBUG(prd_uenotscrubbed); page_retire_destroy(pp); return (page_retire_done(pp, PRD_SUCCESS)); } } else if (toxic & PR_FMA) { PR_DEBUG(prd_fma); page_retire_destroy(pp); return (page_retire_done(pp, PRD_SUCCESS)); } else if (toxic & PR_MCE) { PR_DEBUG(prd_mce); page_retire_destroy(pp); return (page_retire_done(pp, PRD_SUCCESS)); } panic("page_retire_pp: bad toxic flags %d", toxic); /*NOTREACHED*/ }
/* * This function is called when we want to decrease the memory reservation * of our domain. Allocate the memory and make a hypervisor call to give * it back. */ static spgcnt_t balloon_dec_reservation(ulong_t debit) { int i, locked; long rv; ulong_t request; page_t *pp; bzero(mfn_frames, sizeof (mfn_frames)); bzero(pfn_frames, sizeof (pfn_frames)); if (debit > FRAME_ARRAY_SIZE) { debit = FRAME_ARRAY_SIZE; } request = debit; /* * Don't bother if there isn't a safe amount of kmem left. */ if (kmem_avail() < balloon_minkmem) { kmem_reap(); if (kmem_avail() < balloon_minkmem) return (0); } if (page_resv(request, KM_NOSLEEP) == 0) { return (0); } xen_block_migrate(); for (i = 0; i < debit; i++) { pp = page_get_high_mfn(new_high_mfn); new_high_mfn = 0; if (pp == NULL) { /* * Call kmem_reap(), then try once more, * but only if there is a safe amount of * kmem left. */ kmem_reap(); if (kmem_avail() < balloon_minkmem || (pp = page_get_high_mfn(0)) == NULL) { debit = i; break; } } ASSERT(PAGE_EXCL(pp)); ASSERT(!hat_page_is_mapped(pp)); balloon_page_add(pp); pfn_frames[i] = pp->p_pagenum; mfn_frames[i] = pfn_to_mfn(pp->p_pagenum); } if (debit == 0) { xen_allow_migrate(); page_unresv(request); return (0); } /* * We zero all the pages before we start reassigning them in order to * minimize the time spent holding the lock on the contig pfn list. */ if (balloon_zero_memory) { for (i = 0; i < debit; i++) { pfnzero(pfn_frames[i], 0, PAGESIZE); } } /* * Remove all mappings for the pfns from the system */ locked = balloon_lock_contig_pfnlist(debit); for (i = 0; i < debit; i++) { reassign_pfn(pfn_frames[i], MFN_INVALID); } if (locked) unlock_contig_pfnlist(); rv = balloon_free_pages(debit, mfn_frames, NULL, NULL); if (rv < 0) { cmn_err(CE_WARN, "Attempt to return pages to the hypervisor " "failed - up to %lu pages lost (error = %ld)", debit, rv); rv = 0; } else if (rv != debit) { panic("Unexpected return value (%ld) from decrease reservation " "hypervisor call", rv); } xen_allow_migrate(); if (debit != request) page_unresv(request - debit); return (rv); }
/* * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI, * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster * operation and is only to be considered if it doesn't involve any * waiting here. B_TRUNC indicates that the file is being truncated * and so no i/o needs to be done. B_FORCE indicates that the page * must be destroyed so don't try wrting it out. * * The caller must ensure that the page is locked. Returns 1, if * the page should be written back (the "iolock" is held in this * case), or 0 if the page has been dealt with or has been * unlocked. */ int pvn_getdirty(page_t *pp, int flags) { ASSERT((flags & (B_INVAL | B_FREE)) ? PAGE_EXCL(pp) : PAGE_SHARED(pp)); ASSERT(PP_ISFREE(pp) == 0); /* * If trying to invalidate or free a logically `locked' page, * forget it. Don't need page_struct_lock to check p_lckcnt and * p_cowcnt as the page is exclusively locked. */ if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) && (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) { page_unlock(pp); return (0); } /* * Now acquire the i/o lock so we can add it to the dirty * list (if necessary). We avoid blocking on the i/o lock * in the following cases: * * If B_DELWRI is set, which implies that this request is * due to a klustering operartion. * * If this is an async (B_ASYNC) operation and we are not doing * invalidation (B_INVAL) [The current i/o or fsflush will ensure * that the the page is written out]. */ if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) { if (!page_io_trylock(pp)) { page_unlock(pp); return (0); } } else { page_io_lock(pp); } /* * If we want to free or invalidate the page then * we need to unload it so that anyone who wants * it will have to take a minor fault to get it. * Otherwise, we're just writing the page back so we * need to sync up the hardwre and software mod bit to * detect any future modifications. We clear the * software mod bit when we put the page on the dirty * list. */ if (flags & (B_INVAL | B_FREE)) { (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); } else { (void) hat_pagesync(pp, HAT_SYNC_ZERORM); } if (!hat_ismod(pp) || (flags & B_TRUNC)) { /* * Don't need to add it to the * list after all. */ page_io_unlock(pp); if (flags & B_INVAL) { /*LINTED: constant in conditional context*/ VN_DISPOSE(pp, B_INVAL, 0, kcred); } else if (flags & B_FREE) { /*LINTED: constant in conditional context*/ VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred); } else { /* * This is advisory path for the callers * of VOP_PUTPAGE() who prefer freeing the * page _only_ if no one else is accessing it. * E.g. segmap_release() * * The above hat_ismod() check is useless because: * (1) we may not be holding SE_EXCL lock; * (2) we've not unloaded _all_ translations * * Let page_release() do the heavy-lifting. */ (void) page_release(pp, 1); } return (0); } /* * Page is dirty, get it ready for the write back * and add page to the dirty list. */ hat_clrrefmod(pp); /* * If we're going to free the page when we're done * then we can let others try to use it starting now. * We'll detect the fact that they used it when the * i/o is done and avoid freeing the page. */ if (flags & B_FREE) page_downgrade(pp); TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp); return (1); }
/* * Allocate pages to back the virtual address range [addr, addr + size). * If addr is NULL, allocate the virtual address space as well. */ void * segkmem_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag, uint_t attr, page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg) { page_t *ppl; caddr_t addr = inaddr; pgcnt_t npages = btopr(size); int allocflag; if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL) return (NULL); ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0); if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) { if (inaddr == NULL) vmem_free(vmp, addr, size); return (NULL); } ppl = page_create_func(addr, size, vmflag, pcarg); if (ppl == NULL) { if (inaddr == NULL) vmem_free(vmp, addr, size); page_unresv(npages); return (NULL); } /* * Under certain conditions, we need to let the HAT layer know * that it cannot safely allocate memory. Allocations from * the hat_memload vmem arena always need this, to prevent * infinite recursion. * * In addition, the x86 hat cannot safely do memory * allocations while in vmem_populate(), because there * is no simple bound on its usage. */ if (vmflag & VM_MEMLOAD) allocflag = HAT_NO_KALLOC; #if defined(__x86) else if (vmem_is_populator()) allocflag = HAT_NO_KALLOC; #endif else allocflag = 0; while (ppl != NULL) { page_t *pp = ppl; page_sub(&ppl, pp); ASSERT(page_iolock_assert(pp)); ASSERT(PAGE_EXCL(pp)); page_io_unlock(pp); hat_memload(kas.a_hat, (caddr_t)(uintptr_t)pp->p_offset, pp, (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr, HAT_LOAD_LOCK | allocflag); pp->p_lckcnt = 1; #if defined(__x86) page_downgrade(pp); #else if (vmflag & SEGKMEM_SHARELOCKED) page_downgrade(pp); else page_unlock(pp); #endif } return (addr); }
/* * With the addition of reader-writer lock semantics to page_lock_es, * callers wanting an exclusive (writer) lock may prevent shared-lock * (reader) starvation by setting the es parameter to SE_EXCL_WANTED. * In this case, when an exclusive lock cannot be acquired, p_selock's * SE_EWANTED bit is set. Shared-lock (reader) requests are also denied * if the page is slated for retirement. * * The se and es parameters determine if the lock should be granted * based on the following decision table: * * Lock wanted es flags p_selock/SE_EWANTED Action * ----------- -------------- ------------------- --------- * SE_EXCL any [1][2] unlocked/any grant lock, clear SE_EWANTED * SE_EXCL SE_EWANTED any lock/any deny, set SE_EWANTED * SE_EXCL none any lock/any deny * SE_SHARED n/a [2] shared/0 grant * SE_SHARED n/a [2] unlocked/0 grant * SE_SHARED n/a shared/1 deny * SE_SHARED n/a unlocked/1 deny * SE_SHARED n/a excl/any deny * * Notes: * [1] The code grants an exclusive lock to the caller and clears the bit * SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED * bit's value. This was deemed acceptable as we are not concerned about * exclusive-lock starvation. If this ever becomes an issue, a priority or * fifo mechanism should also be implemented. Meantime, the thread that * set SE_EWANTED should be prepared to catch this condition and reset it * * [2] Retired pages may not be locked at any time, regardless of the * dispostion of se, unless the es parameter has SE_RETIRED flag set. * * Notes on values of "es": * * es & 1: page_lookup_create will attempt page relocation * es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete * memory thread); this prevents reader-starvation of waiting * writer thread(s) by giving priority to writers over readers. * es & SE_RETIRED: caller wants to lock pages even if they are * retired. Default is to deny the lock if the page is retired. * * And yes, we know, the semantics of this function are too complicated. * It's on the list to be cleaned up. */ int page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es) { int retval; kmutex_t *pse = PAGE_SE_MUTEX(pp); int upgraded; int reclaim_it; ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1); VM_STAT_ADD(page_lock_count); upgraded = 0; reclaim_it = 0; mutex_enter(pse); ASSERT(((es & SE_EXCL_WANTED) == 0) || ((es & SE_EXCL_WANTED) && (se == SE_EXCL))); if (PP_RETIRED(pp) && !(es & SE_RETIRED)) { mutex_exit(pse); VM_STAT_ADD(page_lock_retired); return (0); } if (se == SE_SHARED && es == 1 && pp->p_selock == 0) { se = SE_EXCL; } if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) { reclaim_it = 1; if (se == SE_SHARED) { /* * This is an interesting situation. * * Remember that p_free can only change if * p_selock < 0. * p_free does not depend on our holding `pse'. * And, since we hold `pse', p_selock can not change. * So, if p_free changes on us, the page is already * exclusively held, and we would fail to get p_selock * regardless. * * We want to avoid getting the share * lock on a free page that needs to be reclaimed. * It is possible that some other thread has the share * lock and has left the free page on the cache list. * pvn_vplist_dirty() does this for brief periods. * If the se_share is currently SE_EXCL, we will fail * to acquire p_selock anyway. Blocking is the * right thing to do. * If we need to reclaim this page, we must get * exclusive access to it, force the upgrade now. * Again, we will fail to acquire p_selock if the * page is not free and block. */ upgraded = 1; se = SE_EXCL; VM_STAT_ADD(page_lock_upgrade); } } if (se == SE_EXCL) { if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) { /* * if the caller wants a writer lock (but did not * specify exclusive access), and there is a pending * writer that wants exclusive access, return failure */ retval = 0; } else if ((pp->p_selock & ~SE_EWANTED) == 0) { /* no reader/writer lock held */ THREAD_KPRI_REQUEST(); /* this clears our setting of the SE_EWANTED bit */ pp->p_selock = SE_WRITER; retval = 1; } else { /* page is locked */ if (es & SE_EXCL_WANTED) { /* set the SE_EWANTED bit */ pp->p_selock |= SE_EWANTED; } retval = 0; } } else { retval = 0; if (pp->p_selock >= 0) { if ((pp->p_selock & SE_EWANTED) == 0) { pp->p_selock += SE_READER; retval = 1; } } } if (retval == 0) { if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) { VM_STAT_ADD(page_lock_deleted); mutex_exit(pse); return (retval); } #ifdef VM_STATS VM_STAT_ADD(page_lock_miss); if (upgraded) { VM_STAT_ADD(page_lock_upgrade_failed); } #endif if (lock) { VM_STAT_ADD(page_lock_miss_lock); mutex_exit(lock); } /* * Now, wait for the page to be unlocked and * release the lock protecting p_cv and p_selock. */ cv_wait(&pp->p_cv, pse); mutex_exit(pse); /* * The page identity may have changed while we were * blocked. If we are willing to depend on "pp" * still pointing to a valid page structure (i.e., * assuming page structures are not dynamically allocated * or freed), we could try to lock the page if its * identity hasn't changed. * * This needs to be measured, since we come back from * cv_wait holding pse (the expensive part of this * operation) we might as well try the cheap part. * Though we would also have to confirm that dropping * `lock' did not cause any grief to the callers. */ if (lock) { mutex_enter(lock); } } else { /* * We have the page lock. * If we needed to reclaim the page, and the page * needed reclaiming (ie, it was free), then we * have the page exclusively locked. We may need * to downgrade the page. */ ASSERT((upgraded) ? ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1); mutex_exit(pse); /* * We now hold this page's lock, either shared or * exclusive. This will prevent its identity from changing. * The page, however, may or may not be free. If the caller * requested, and it is free, go reclaim it from the * free list. If the page can't be reclaimed, return failure * so that the caller can start all over again. * * NOTE:page_reclaim() releases the page lock (p_selock) * if it can't be reclaimed. */ if (reclaim_it) { if (!page_reclaim(pp, lock)) { VM_STAT_ADD(page_lock_bad_reclaim); retval = 0; } else { VM_STAT_ADD(page_lock_reclaim); if (upgraded) { page_downgrade(pp); } } } } return (retval); }