/*
 * page_retire_hunt() callback for the retire thread.
 */
static void
page_retire_thread_cb(page_t *pp)
{
	PR_DEBUG(prd_tctop);
	if (!PP_ISKVP(pp) && page_trylock(pp, SE_EXCL)) {
		PR_DEBUG(prd_tclocked);
		page_unlock(pp);
	}
}
int
plat_hold_page(pfn_t pfn, int lock, page_t **pp_ret)
{
	page_t *pp = page_numtopp_nolock(pfn);

	if (pp == NULL)
		return (PLAT_HOLD_FAIL);

#if !defined(__xpv)
	/*
	 * Pages are locked SE_SHARED because some hypervisors
	 * like xVM ESX reclaim Guest OS memory by locking
	 * it SE_EXCL so we want to leave these pages alone.
	 */
	if (lock == PLAT_HOLD_LOCK) {
		ASSERT(pp_ret != NULL);
		if (page_trylock(pp, SE_SHARED) == 0)
			return (PLAT_HOLD_FAIL);
	}
#else	/* __xpv */
	if (lock == PLAT_HOLD_LOCK) {
		ASSERT(pp_ret != NULL);
		if (page_trylock(pp, SE_EXCL) == 0)
			return (PLAT_HOLD_FAIL);
	}

	if (mfn_list[pfn] == MFN_INVALID) {
		/* We failed - release the lock if we grabbed it earlier */
		if (lock == PLAT_HOLD_LOCK) {
			page_unlock(pp);
		}
		return (PLAT_HOLD_FAIL);
	}
#endif	/* __xpv */

	if (lock == PLAT_HOLD_LOCK)
		*pp_ret = pp;

	return (PLAT_HOLD_OK);
}
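/*
 * Illustration only: a minimal userland analogue of the hold/fail-fast
 * pattern above, using a POSIX rwlock in place of the page selock.  The
 * caller asks to hold an object shared or exclusive and gets an immediate
 * failure instead of blocking.  All names here (hold_mode_t, obj_t,
 * obj_hold, obj_release) are hypothetical and only sketch the shape of the
 * plat_hold_page()/PLAT_HOLD_* contract; they are not kernel interfaces.
 */
#include <pthread.h>

typedef enum { HOLD_NOLOCK, HOLD_SHARED, HOLD_EXCL } hold_mode_t;

typedef struct obj {
	pthread_rwlock_t	o_lock;
	int			o_data;
} obj_t;

/* Returns 0 on success, -1 if the object could not be held immediately. */
static int
obj_hold(obj_t *op, hold_mode_t mode, obj_t **op_ret)
{
	int rc;

	if (mode == HOLD_NOLOCK)
		return (0);

	rc = (mode == HOLD_SHARED) ?
	    pthread_rwlock_tryrdlock(&op->o_lock) :
	    pthread_rwlock_trywrlock(&op->o_lock);
	if (rc != 0)
		return (-1);	/* owned by someone else; do not wait */

	*op_ret = op;		/* hand the held object back to the caller */
	return (0);
}

static void
obj_release(obj_t *op)
{
	(void) pthread_rwlock_unlock(&op->o_lock);
}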
static void
less_pages(uint64_t base, uint64_t len)
{
	uint64_t pa, end = base + len;
	extern int kcage_on;

	for (pa = base; pa < end; pa += PAGESIZE) {
		pfn_t pfnum;
		page_t *pp;

		pfnum = (pfn_t)(pa >> PAGESHIFT);
		if ((pp = page_numtopp_nolock(pfnum)) == NULL)
			cmn_err(CE_PANIC, "missing pfnum %lx", pfnum);

		/*
		 * must break up any large pages that may have
		 * constituent pages being utilized for
		 * prom_alloc()'s. page_reclaim() can't handle
		 * large pages.
		 */
		if (pp->p_szc != 0)
			page_boot_demote(pp);

		if (!PAGE_LOCKED(pp) && pp->p_lckcnt == 0) {
			/*
			 * Ahhh yes, a prom page,
			 * suck it off the freelist,
			 * lock it, and hashin on prom_pages vp.
			 */
			if (page_trylock(pp, SE_EXCL) == 0)
				cmn_err(CE_PANIC, "prom page locked");

			(void) page_reclaim(pp, NULL);
			/*
			 * vnode offsets on the prom_ppages vnode
			 * are page numbers (gack) for >32 bit
			 * physical memory machines.
			 */
			(void) page_hashin(pp, &promvp,
			    (offset_t)pfnum, NULL);

			if (kcage_on) {
				ASSERT(pp->p_szc == 0);
				if (PP_ISNORELOC(pp) == 0) {
					PP_SETNORELOC(pp);
					PLCNT_XFER_NORELOC(pp);
				}
			}
			(void) page_pp_lock(pp, 0, 1);
		}
	}
}
/*
 * Page retire self-test.  For now, it always returns 0.
 */
int
page_retire_test(void)
{
	page_t *first, *pp, *cpp, *cpp2, *lpp;

	/*
	 * Tests the corner case where a large page can't be retired
	 * because one of the constituent pages is locked.  We mark
	 * one page to be retired and try to retire it, and mark the
	 * other page to be retired but don't try to retire it, so
	 * that page_unlock() in the failure path will recurse and try
	 * to retire THAT page.  This is the worst possible situation
	 * we can get ourselves into.
	 */
	memsegs_lock(0);
	pp = first = page_first();
	do {
		if (pp->p_szc && PP_PAGEROOT(pp) == pp) {
			cpp = pp + 1;
			lpp = PP_ISFREE(pp) ? pp : pp + 2;
			cpp2 = pp + 3;
			if (!page_trylock(lpp, pp == lpp ? SE_EXCL : SE_SHARED))
				continue;
			if (!page_trylock(cpp, SE_EXCL)) {
				page_unlock(lpp);
				continue;
			}
			page_settoxic(cpp, PR_FMA | PR_BUSY);
			page_settoxic(cpp2, PR_FMA);
			page_tryretire(cpp);	/* will fail */
			page_unlock(lpp);
			(void) page_retire(cpp->p_pagenum, PR_FMA);
			(void) page_retire(cpp2->p_pagenum, PR_FMA);
		}
	} while ((pp = page_next(pp)) != first);
	memsegs_unlock(0);

	return (0);
}
/*
 * page_retire() - the front door for retiring a page.
 *
 * Ideally, page_retire() would instantly retire the requested page.
 * Unfortunately, some pages are locked or otherwise tied up and cannot be
 * retired right away.  To deal with that, bits are set in p_toxic of the
 * page_t.  An attempt is made to lock the page; if the attempt is successful,
 * we instantly unlock the page, counting on page_unlock() to notice p_toxic
 * is nonzero and to call back into page_retire_pp().  Success is determined
 * by looking to see whether the page has been retired once it has been
 * unlocked.
 *
 * Returns:
 *
 *   - 0 on success,
 *   - EINVAL when the PA is invalid,
 *   - EIO if the page is already retired or already pending retirement, or
 *   - EAGAIN if the page could not be _immediately_ retired but is pending.
 */
int
page_retire(uint64_t pa, uchar_t reason)
{
	page_t	*pp;

	ASSERT(reason & PR_REASONS);		/* there must be a reason */
	ASSERT(!(reason & ~PR_REASONS));	/* but no other bits */

	pp = page_numtopp_nolock(mmu_btop(pa));
	if (pp == NULL) {
		PR_MESSAGE(CE_WARN, 1, "Cannot schedule clearing of error on"
		    " page 0x%08x.%08x; page is not relocatable memory", pa);
		return (page_retire_done(pp, PRD_INVALID_PA));
	}
	if (PP_RETIRED(pp)) {
		PR_DEBUG(prd_dup1);
		return (page_retire_done(pp, PRD_DUPLICATE));
	}

	if ((reason & PR_UE) && !PP_TOXIC(pp)) {
		PR_MESSAGE(CE_NOTE, 1, "Scheduling clearing of error on"
		    " page 0x%08x.%08x", pa);
	} else if (PP_PR_REQ(pp)) {
		PR_DEBUG(prd_dup2);
		return (page_retire_done(pp, PRD_DUPLICATE));
	} else {
		PR_MESSAGE(CE_NOTE, 1, "Scheduling removal of"
		    " page 0x%08x.%08x", pa);
	}
	page_settoxic(pp, reason);
	page_retire_enqueue(pp);

	/*
	 * And now for some magic.
	 *
	 * We marked this page toxic up above.  All that is left to do is
	 * to try to lock the page and then unlock it.  The page lock routines
	 * will intercept the page and retire it if they can.  If the page
	 * cannot be locked, that's okay -- page_unlock() or the background
	 * thread will eventually get to it; until then the lock routines
	 * will deny further locks on it.
	 */
	if (MTBF(pr_calls, pr_mtbf) && page_trylock(pp, SE_EXCL)) {
		PR_DEBUG(prd_prlocked);
		page_unlock(pp);
	} else {
		PR_DEBUG(prd_prnotlocked);
	}

	if (PP_RETIRED(pp)) {
		PR_DEBUG(prd_prretired);
		return (0);
	} else {
		cv_signal(&pr_cv);
		PR_INCR_KSTAT(pr_failed);

		if (pp->p_toxic & PR_MSG) {
			return (page_retire_done(pp, PRD_FAILED));
		} else {
			return (page_retire_done(pp, PRD_PENDING));
		}
	}
}
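/*
 * Illustration only: a userland sketch of the lock/unlock trick described in
 * the comment above.  The pending work lives in a flag (the analogue of the
 * p_toxic bits) and the unlock path is the one place that notices the flag
 * and performs the work, so a failed trylock simply defers the work to
 * whoever unlocks next.  pthread_mutex_trylock stands in for
 * page_trylock(..., SE_EXCL); obj_t, obj_unlock, obj_retire and do_retire
 * are hypothetical names, not kernel interfaces, and the error handling is
 * deliberately minimal.
 */
#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

typedef struct obj {
	pthread_mutex_t	o_lock;
	atomic_bool	o_retire_pending;	/* analogue of p_toxic */
	atomic_bool	o_retired;
} obj_t;

static void
do_retire(obj_t *op)
{
	atomic_store(&op->o_retired, true);
	atomic_store(&op->o_retire_pending, false);
}

/* Every unlock funnels through here, so pending work cannot be missed. */
static void
obj_unlock(obj_t *op)
{
	if (atomic_load(&op->o_retire_pending))
		do_retire(op);
	(void) pthread_mutex_unlock(&op->o_lock);
}

/* Returns 0 if retired now, EAGAIN if a later unlock will finish the job. */
static int
obj_retire(obj_t *op)
{
	atomic_store(&op->o_retire_pending, true);	/* "mark it toxic" */
	if (pthread_mutex_trylock(&op->o_lock) == 0)
		obj_unlock(op);		/* the unlock path does the work */
	return (atomic_load(&op->o_retired) ? 0 : EAGAIN);
}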
void
plat_freelist_process(int mnode)
{
	page_t		*page, **freelist;
	page_t		*bdlist[STARFIRE_MAX_BOARDS];
	page_t		**sortlist[STARFIRE_MAX_BOARDS];
	uint32_t	idx, idy, size, color, max_color, lbn;
	uint32_t	bd_flags, bd_cnt, result, bds;
	kmutex_t	*pcm;
	int		mtype;

	/* for each memory type and page size */
	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		for (size = 0; size < mmu_page_sizes; size++) {

			/*
			 * Compute the maximum # of phys colors based on
			 * page size.
			 */
			max_color = page_get_pagecolors(size);

			/* for each color */
			for (color = 0; color < max_color; color++) {

				bd_cnt = 0;
				bd_flags = 0;
				for (idx = 0; idx < STARFIRE_MAX_BOARDS;
				    idx++) {
					bdlist[idx] = NULL;
					sortlist[idx] = NULL;
				}

				/* find freelist */
				freelist = &PAGE_FREELISTS(mnode, size,
				    color, mtype);

				if (*freelist == NULL)
					continue;

				/* acquire locks */
				pcm = PC_BIN_MUTEX(mnode, color, PG_FREE_LIST);
				mutex_enter(pcm);

				/*
				 * Read the freelist and sort its pages by
				 * logical board number, taking pages until
				 * the list is empty.
				 */
				while (*freelist) {
					page = *freelist;
					result = page_trylock(page, SE_EXCL);
					ASSERT(result);

					/* delete from freelist */
					if (size != 0) {
						page_vpsub(freelist, page);
					} else {
						mach_page_sub(freelist, page);
					}

					/* detect the lbn */
					lbn = PFN_2_LBN(page->p_pagenum);

					/* add to bdlist[lbn] */
					if (size != 0) {
						page_vpadd(&bdlist[lbn], page);
					} else {
						mach_page_add(&bdlist[lbn],
						    page);
					}

					/* if this lbn is new, count it */
					if ((bd_flags & (1 << lbn)) == 0) {
						bd_flags |= (1 << lbn);
						bd_cnt++;
					}
					page_unlock(page);
				}

				/*
				 * Make the sortlist so only bd_cnt choices
				 * show up.
				 */
				bds = 0;
				for (idx = 0; idx < STARFIRE_MAX_BOARDS;
				    idx++) {
					if (bdlist[idx])
						sortlist[bds++] = &bdlist[idx];
				}

				/*
				 * Set random start.
				 */
				(void) random_idx(-color);

				/*
				 * Now rebuild the freelist by shuffling
				 * pages from the per-board lists.
				 */
				while (bd_cnt) {

					/*
					 * get a "random" index between 0
					 * and bd_cnt
					 */
					ASSERT(bd_cnt &&
					    (bd_cnt < STARFIRE_MAX_BOARDS + 1));
					idx = random_idx(bd_cnt);

					page = *sortlist[idx];
					result = page_trylock(page, SE_EXCL);
					ASSERT(result);

					/*
					 * Delete from the sort list and
					 * append to the freelist.  Big pages
					 * use vp_add -- 8k pages don't.
					 */
					if (size != 0) {
						page_vpsub(sortlist[idx], page);
						page_vpadd(freelist, page);
					} else {
						mach_page_sub(sortlist[idx],
						    page);
						mach_page_add(freelist, page);
					}

					/* needed for indexing tmp lists */
					lbn = PFN_2_LBN(page->p_pagenum);

					/*
					 * Was this the last page on this
					 * board's list?  Note that idx is
					 * not the lbn: sortlist and bdlist
					 * use different indexes.
					 */
					if (*sortlist[idx] == NULL) {
						bd_flags &= ~(1 << lbn);
						--bd_cnt;

						/*
						 * Redo the sortlist so only
						 * bd_cnt choices show up.
						 */
						bds = 0;
						for (idy = 0;
						    idy < STARFIRE_MAX_BOARDS;
						    idy++) {
							if (bdlist[idy]) {
								sortlist[bds++] =
								    &bdlist[idy];
							}
						}
					}

					page_unlock(page);
				}
				mutex_exit(pcm);
			}
		}
	}
}
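/*
 * Illustration only: a compact userland sketch of the shuffle performed by
 * plat_freelist_process() above.  Entries of a singly linked free list are
 * bucketed by "board", then the list is rebuilt by repeatedly drawing from
 * a randomly chosen non-empty bucket so consecutive allocations spread
 * across boards.  node_t, shuffle_by_board and NBOARDS are hypothetical,
 * and rand() stands in for the platform's random_idx().
 */
#include <stdlib.h>

#define	NBOARDS	16

typedef struct node {
	struct node	*n_next;
	int		n_board;	/* analogue of PFN_2_LBN(p_pagenum) */
} node_t;

static node_t *
shuffle_by_board(node_t *freelist)
{
	node_t	*buckets[NBOARDS] = { NULL };
	int	nonempty[NBOARDS];
	node_t	*np, *newlist = NULL, **tail = &newlist;
	int	nbuckets = 0, i, b;

	/* Sort the entries into per-board buckets. */
	while ((np = freelist) != NULL) {
		freelist = np->n_next;
		if (buckets[np->n_board] == NULL)
			nonempty[nbuckets++] = np->n_board;
		np->n_next = buckets[np->n_board];
		buckets[np->n_board] = np;
	}

	/* Rebuild by drawing from a random board until all are drained. */
	while (nbuckets > 0) {
		i = rand() % nbuckets;
		b = nonempty[i];
		np = buckets[b];
		buckets[b] = np->n_next;
		if (buckets[b] == NULL)		/* board drained: drop it */
			nonempty[i] = nonempty[--nbuckets];
		np->n_next = NULL;
		*tail = np;
		tail = &np->n_next;
	}
	return (newlist);
}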
/*
 * Process a vnode's page list for all pages whose offset is >= off.
 * Pages are either freed, invalidated, or written back to disk.
 *
 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 * is specified; otherwise they are "shared" locked.
 *
 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 *
 * Special marker page_t's are inserted in the list in order
 * to keep track of where we are in the list when locks are dropped.
 *
 * Note the list is circular and insertions can happen only at the
 * head and tail of the list.  The algorithm ensures visiting all pages
 * on the list in the following way:
 *
 *    Drop two marker pages at the end of the list.
 *
 *    Move one marker page backwards towards the start of the list until
 *    it is at the list head, processing the pages passed along the way.
 *
 * Due to race conditions when the vphm mutex is dropped, additional pages
 * can be added to either end of the list, so we'll continue to move
 * the marker and process pages until it is up against the end marker.
 *
 * There is one special exit condition.  If we are processing a VMODSORT
 * vnode and only writing back modified pages, we can stop as soon as
 * we run into an unmodified page.  This makes fsync(3) operations fast.
 */
int
pvn_vplist_dirty(
	vnode_t		*vp,
	u_offset_t	off,
	int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
			size_t *, int, cred_t *),
	int		flags,
	cred_t		*cred)
{
	page_t		*pp;
	page_t		*mark;		/* marker page that moves toward head */
	page_t		*end;		/* marker page at end of list */
	int		err = 0;
	int		error;
	kmutex_t	*vphm;
	se_t		se;
	page_t		**where_to_move;

	ASSERT(vp->v_type != VCHR);

	if (vp->v_pages == NULL)
		return (0);

	/*
	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
	 *
	 * Don't block on VVMLOCK if B_ASYNC is set.  This prevents sync()
	 * from getting blocked while flushing pages to a dead NFS server.
	 */
	mutex_enter(&vp->v_lock);
	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
		mutex_exit(&vp->v_lock);
		return (EAGAIN);
	}

	while (vp->v_flag & VVMLOCK)
		cv_wait(&vp->v_cv, &vp->v_lock);

	if (vp->v_pages == NULL) {
		mutex_exit(&vp->v_lock);
		return (0);
	}

	vp->v_flag |= VVMLOCK;
	mutex_exit(&vp->v_lock);

	/*
	 * Set up the marker pages used to walk the list.
	 */
	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
	end->p_vnode = vp;
	end->p_offset = (u_offset_t)-2;
	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
	mark->p_vnode = vp;
	mark->p_offset = (u_offset_t)-1;

	/*
	 * Grab the lock protecting the vnode's page list;
	 * note that this lock is dropped at times in the loop.
	 */
	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);
	if (vp->v_pages == NULL)
		goto leave;

	/*
	 * Insert the markers and loop through the list of pages.
	 */
	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
	page_vpadd(&mark->p_vpnext, end);
	for (;;) {

		/*
		 * If only doing an async write back, then we can
		 * stop as soon as we get to start of the list.
		 */
		if (flags == B_ASYNC && vp->v_pages == mark)
			break;

		/*
		 * Otherwise stop when we've gone through all the pages.
		 */
		if (mark->p_vpprev == end)
			break;

		pp = mark->p_vpprev;
		if (vp->v_pages == pp)
			where_to_move = &vp->v_pages;
		else
			where_to_move = &pp->p_vpprev->p_vpnext;

		ASSERT(pp->p_vnode == vp);

		/*
		 * If just flushing dirty pages to disk and this vnode
		 * is using a sorted list of pages, we can stop processing
		 * as soon as we find an unmodified page, since all the
		 * modified pages are visited first.
		 */
		if (IS_VMODSORT(vp) &&
		    !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
			if (!hat_ismod(pp) && !page_io_locked(pp)) {
#ifdef DEBUG
				/*
				 * For debug kernels examine what should be
				 * all the remaining clean pages, asserting
				 * that they are not modified.
				 */
				page_t	*chk = pp;
				int	attr;

				page_vpsub(&vp->v_pages, mark);
				page_vpadd(where_to_move, mark);
				do {
					chk = chk->p_vpprev;
					ASSERT(chk != end);
					if (chk == mark)
						continue;
					attr = hat_page_getattr(chk,
					    P_MOD | P_REF);
					if ((attr & P_MOD) == 0)
						continue;
					panic("v_pages list not all clean: "
					    "page_t*=%p vnode=%p off=%lx "
					    "attr=0x%x last clean page_t*=%p\n",
					    (void *)chk, (void *)chk->p_vnode,
					    (long)chk->p_offset, attr,
					    (void *)pp);
				} while (chk != vp->v_pages);
#endif
				break;
			} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
				/*
				 * Couldn't get the io lock; wait until the
				 * IO is done.  Block only for sync IO since
				 * we don't want to block async IO.
				 */
				mutex_exit(vphm);
				page_io_wait(pp);
				mutex_enter(vphm);
				continue;
			}
		}

		/*
		 * Skip this page if the offset is out of the desired range.
		 * Just move the marker and continue.
		 */
		if (pp->p_offset < off) {
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
			continue;
		}

		/*
		 * If we are supposed to invalidate or free this
		 * page, then we need an exclusive lock.
		 */
		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		/*
		 * We must acquire the page lock for all synchronous
		 * operations (invalidate, free and write).
		 */
		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
			/*
			 * If the page_lock() drops the mutex
			 * we must retry the loop.
			 */
			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
				continue;

			/*
			 * It's ok to move the marker page now.
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
		} else {

			/*
			 * Update the marker page for all remaining cases.
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);

			/*
			 * For write backs, if we can't lock the page, it's
			 * invalid or in the process of being destroyed.  Skip
			 * it, assuming someone else is writing it.
			 */
			if (!page_trylock(pp, se))
				continue;
		}

		ASSERT(pp->p_vnode == vp);

		/*
		 * Successfully locked the page, now figure out what to
		 * do with it.  Free pages are easily dealt with; invalidate
		 * if desired or just go on to the next page.
		 */
		if (PP_ISFREE(pp)) {
			if ((flags & B_INVAL) == 0) {
				page_unlock(pp);
				continue;
			}

			/*
			 * Invalidate (destroy) the page.
			 */
			mutex_exit(vphm);
			page_destroy_free(pp);
			mutex_enter(vphm);
			continue;
		}

		/*
		 * pvn_getdirty() figures out what to do with a dirty page.
		 * If the page is dirty, the putapage() routine will write it
		 * and will kluster any other adjacent dirty pages it can.
		 *
		 * pvn_getdirty() and `(*putapage)' unlock the page.
		 */
		mutex_exit(vphm);
		if (pvn_getdirty(pp, flags)) {
			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
			if (!err)
				err = error;
		}
		mutex_enter(vphm);
	}
	page_vpsub(&vp->v_pages, mark);
	page_vpsub(&vp->v_pages, end);

leave:
	/*
	 * Release the v_pages mutex and VVMLOCK, and wake up blocked threads.
	 */
	mutex_exit(vphm);
	kmem_cache_free(marker_cache, mark);
	kmem_cache_free(marker_cache, end);

	mutex_enter(&vp->v_lock);
	vp->v_flag &= ~VVMLOCK;
	cv_broadcast(&vp->v_cv);
	mutex_exit(&vp->v_lock);
	return (err);
}
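/*
 * Illustration only: a minimal userland sketch of the marker technique used
 * by pvn_vplist_dirty() above.  A dummy node is inserted into a
 * mutex-protected circular doubly linked list so the walk can drop the lock
 * around blocking work on an element and then resume from the marker, even
 * if other threads added or removed neighbouring nodes in the meantime.
 * The list here has a permanent sentinel head, the walk runs forward rather
 * than backward, and node_t, walk_with_marker, etc. are hypothetical names.
 * The caller's work function must keep its node alive while the list lock
 * is dropped (the original relies on the page lock for this).
 */
#include <pthread.h>

typedef struct node {
	struct node	*n_next;
	struct node	*n_prev;
	int		n_is_marker;
	int		n_data;
} node_t;

/* Insert 'np' immediately after 'where' in the circular list. */
static void
node_insert_after(node_t *where, node_t *np)
{
	np->n_next = where->n_next;
	np->n_prev = where;
	where->n_next->n_prev = np;
	where->n_next = np;
}

static void
node_remove(node_t *np)
{
	np->n_prev->n_next = np->n_next;
	np->n_next->n_prev = np->n_prev;
}

/* Visit every ordinary node, calling 'work' with the list lock dropped. */
static void
walk_with_marker(node_t *head, pthread_mutex_t *lock, void (*work)(node_t *))
{
	node_t marker = { .n_is_marker = 1 };

	(void) pthread_mutex_lock(lock);
	node_insert_after(head, &marker);
	while (marker.n_next != head) {
		node_t *np = marker.n_next;

		/* Advance the marker past np; skip other walkers' markers. */
		node_remove(&marker);
		node_insert_after(np, &marker);
		if (np->n_is_marker)
			continue;

		(void) pthread_mutex_unlock(lock);
		work(np);		/* may block; the list may change */
		(void) pthread_mutex_lock(lock);
	}
	node_remove(&marker);
	(void) pthread_mutex_unlock(lock);
}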
/*
 * Scan page_t's and issue I/O's for modified pages.
 *
 * Also coalesces consecutive small sized free pages into the next larger
 * pagesize.  This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 */
static void
fsflush_do_pages()
{
	vnode_t		*vp;
	ulong_t		pcount;
	hrtime_t	timer = gethrtime();
	ulong_t		releases = 0;
	ulong_t		nexamined = 0;
	ulong_t		nlocked = 0;
	ulong_t		nmodified = 0;
	ulong_t		ncoalesce = 0;
	ulong_t		cnt;
	int		mod;
	int		fspage = 1;
	u_offset_t	offset;
	uint_t		szc;

	page_t		*coal_page = NULL;  /* 1st page in group to coalesce */
	uint_t		coal_szc = 0;	    /* size code, coal_page->p_szc */
	uint_t		coal_cnt = 0;	    /* count of pages seen */

	static ulong_t	nscan = 0;
	static pgcnt_t	last_total_pages = 0;
	static page_t	*pp = NULL;

	/*
	 * Check to see if total_pages has changed.
	 */
	if (total_pages != last_total_pages) {
		last_total_pages = total_pages;
		nscan = (last_total_pages * tune.t_fsflushr) / v.v_autoup;
	}

	if (pp == NULL)
		pp = memsegs->pages;

	pcount = 0;
	while (pcount < nscan) {

		/*
		 * Move to the next page, skipping over large pages
		 * and issuing prefetches.
		 */
		if (pp->p_szc && fspage == 0) {
			pfn_t pfn;

			pfn = page_pptonum(pp);
			cnt = page_get_pagecnt(pp->p_szc);
			cnt -= pfn & (cnt - 1);
		} else
			cnt = 1;

		pp = page_nextn(pp, cnt);
		prefetch_page_r((void *)pp);
		ASSERT(pp != NULL);
		pcount += cnt;

		/*
		 * Do a bunch of dirty tests (i.e. no locking) to determine
		 * if we can quickly skip this page.  These tests are repeated
		 * after acquiring the page lock.
		 */
		++nexamined;
		if (PP_ISSWAP(pp)) {
			fspage = 0;
			coal_page = NULL;
			continue;
		}

		/*
		 * Skip free pages too, but try coalescing them into larger
		 * page sizes.
		 */
		if (PP_ISFREE(pp)) {
			/*
			 * Skip pages with a file system identity or that
			 * are already maximum size.
			 */
			fspage = 0;
			szc = pp->p_szc;
			if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
				coal_page = NULL;
				continue;
			}

			/*
			 * If not in a coalescing candidate page or the size
			 * codes are different, start a new candidate.
			 */
			if (coal_page == NULL || coal_szc != szc) {

				/*
				 * page must be properly aligned
				 */
				if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
					coal_page = NULL;
					continue;
				}
				coal_page = pp;
				coal_szc = szc;
				coal_cnt = 1;
				continue;
			}

			/*
			 * Acceptable to add this to the existing candidate
			 * page.
			 */
			++coal_cnt;
			if (coal_cnt < fsf_pgcnt[coal_szc])
				continue;

			/*
			 * We've got enough pages to coalesce, so do it.
			 * After promoting, we clear coal_page, so it will
			 * take another pass to promote this to an even
			 * larger page.
			 */
			++ncoalesce;
			(void) page_promote_size(coal_page, coal_szc);
			coal_page = NULL;
			continue;
		} else {
			coal_page = NULL;
		}

		if (PP_ISKAS(pp) ||
		    PAGE_LOCKED(pp) ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0) {
			fspage = 0;
			continue;
		}

		/*
		 * Reject pages that can't be "exclusively" locked.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;
		++nlocked;

		/*
		 * After locking the page, redo the above checks.
		 * Since we locked the page, leave out the PAGE_LOCKED() test.
		 */
		vp = pp->p_vnode;
		if (PP_ISSWAP(pp) ||
		    PP_ISFREE(pp) ||
		    vp == NULL ||
		    PP_ISKAS(pp) ||
		    (vp->v_flag & VISSWAP) != 0) {
			page_unlock(pp);
			fspage = 0;
			continue;
		}

		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
			page_unlock(pp);
			continue;
		}

		fspage = 1;
		ASSERT(vp->v_type != VCHR);

		/*
		 * Check the modified bit, leaving it alone in hardware;
		 * it will be cleared if we do the putpage.
		 */
		if (IS_VMODSORT(vp))
			mod = hat_ismod(pp);
		else
			mod = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;

		if (mod) {
			++nmodified;
			offset = pp->p_offset;

			/*
			 * Hold the vnode before releasing the page lock
			 * to prevent it from being freed and re-used by
			 * some other thread.
			 */
			VN_HOLD(vp);

			page_unlock(pp);

			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
			    kcred, NULL);

			VN_RELE(vp);
		} else {

			/*
			 * Catch any pages which should be on the cache list,
			 * but aren't yet.
			 */
			if (hat_page_is_mapped(pp) == 0) {
				++releases;
				(void) page_release(pp, 1);
			} else {
				page_unlock(pp);
			}
		}
	}

	/*
	 * Maintain statistics; reset every million wakeups, just to
	 * avoid overflow.
	 */
	if (++fsf_cycles == 1000000) {
		fsf_cycles = 0;
		fsf_total.fsf_scan = 0;
		fsf_total.fsf_examined = 0;
		fsf_total.fsf_locked = 0;
		fsf_total.fsf_modified = 0;
		fsf_total.fsf_coalesce = 0;
		fsf_total.fsf_time = 0;
		fsf_total.fsf_releases = 0;
	} else {
		fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
		fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
		fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
		fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
		fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
		fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
		fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
	}
}
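/*
 * Illustration only: a userland sketch of the coalescing test that
 * fsflush_do_pages() applies above.  A run of small free pages is only
 * promoted when it starts on a boundary aligned to the larger page size and
 * contains the full count of constituent pages; 'pgcnt' plays the role of
 * fsf_pgcnt[szc] (and must be a power of two so 'pgcnt - 1' can serve as
 * the alignment mask, like fsf_mask[szc]).  coalesce_free_runs() and
 * promote() are hypothetical stand-ins, the latter for page_promote_size().
 */
#include <stddef.h>

/* Scan is_free[0..npages) and call promote(start) for every eligible run. */
static void
coalesce_free_runs(const int *is_free, size_t npages, size_t pgcnt,
    void (*promote)(size_t))
{
	size_t	pfn, start = 0, count = 0;
	int	in_run = 0;

	for (pfn = 0; pfn < npages; pfn++) {
		if (!is_free[pfn]) {		/* run broken: start over */
			in_run = 0;
			count = 0;
			continue;
		}
		if (!in_run) {
			/* a candidate must start on a large-page boundary */
			if ((pfn & (pgcnt - 1)) != 0)
				continue;
			in_run = 1;
			start = pfn;
			count = 0;
		}
		if (++count == pgcnt) {		/* full, aligned run found */
			promote(start);
			in_run = 0;
			count = 0;
		}
	}
}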