/*
 * Try to clear a page_t with a single UE. If the UE was transient, it is
 * returned to service, and we return 1. Otherwise we return 0 meaning
 * that further processing is required to retire the page.
 */
static int
page_retire_transient_ue(page_t *pp)
{
    ASSERT(PAGE_EXCL(pp));
    ASSERT(!hat_page_is_mapped(pp));

    /*
     * If this page is a repeat offender, retire it under the
     * "two strikes and you're out" rule. The caller is responsible
     * for scrubbing the page to try to clear the error.
     */
    if (pp->p_toxic & PR_UE_SCRUBBED) {
        PR_INCR_KSTAT(pr_ue_persistent);
        return (0);
    }

    if (page_clear_transient_ue(pp)) {
        /*
         * We set the PR_UE_SCRUBBED bit; if we ever see this
         * page again, we will retire it, no questions asked.
         */
        page_settoxic(pp, PR_UE_SCRUBBED);

        if (page_retire_first_ue) {
            PR_INCR_KSTAT(pr_ue_cleared_retire);
            return (0);
        } else {
            PR_INCR_KSTAT(pr_ue_cleared_free);

            page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG | PR_BUSY);
            page_retire_dequeue(pp);

            /* LINTED: CONSTCOND */
            VN_DISPOSE(pp, B_FREE, 1, kcred);
            return (1);
        }
    }

    PR_INCR_KSTAT(pr_ue_persistent);
    return (0);
}
/*
 * Act like page_destroy(), but instead of freeing the page, hash it onto
 * the retired_pages vnode, and mark it retired.
 *
 * For fun, we try to scrub the page until it's squeaky clean.
 * availrmem is adjusted here.
 */
static void
page_retire_destroy(page_t *pp)
{
    u_offset_t off = (u_offset_t)((uintptr_t)pp);

    ASSERT(PAGE_EXCL(pp));
    ASSERT(!PP_ISFREE(pp));
    ASSERT(pp->p_szc == 0);
    ASSERT(!hat_page_is_mapped(pp));
    ASSERT(!pp->p_vnode);

    page_clr_all_props(pp);
    pagescrub(pp, 0, MMU_PAGESIZE);

    pp->p_next = NULL;
    pp->p_prev = NULL;
    if (page_hashin(pp, retired_pages, off, NULL) == 0) {
        cmn_err(CE_PANIC, "retired page %p hashin failed", (void *)pp);
    }

    page_settoxic(pp, PR_RETIRED);
    page_clrtoxic(pp, PR_BUSY);
    page_retire_dequeue(pp);
    PR_INCR_KSTAT(pr_retired);

    if (pp->p_toxic & PR_FMA) {
        PR_INCR_KSTAT(pr_fma);
    } else if (pp->p_toxic & PR_UE) {
        PR_INCR_KSTAT(pr_ue);
    } else {
        PR_INCR_KSTAT(pr_mce);
    }

    mutex_enter(&freemem_lock);
    availrmem--;
    mutex_exit(&freemem_lock);

    page_unlock(pp);
}
/*
 * Attempt to clear a UE from a page.
 * Returns 1 if the error has been successfully cleared.
 */
static int
page_clear_transient_ue(page_t *pp)
{
    caddr_t         kaddr;
    uint8_t         rb, wb;
    uint64_t        pa;
    uint32_t        pa_hi, pa_lo;
    on_trap_data_t  otd;
    int             errors = 0;
    int             i;

    ASSERT(PAGE_EXCL(pp));
    ASSERT(PP_PR_REQ(pp));
    ASSERT(pp->p_szc == 0);
    ASSERT(!hat_page_is_mapped(pp));

    /*
     * Clear the page and attempt to clear the UE. If we trap
     * on the next access to the page, we know the UE has recurred.
     */
    pagescrub(pp, 0, PAGESIZE);

    /*
     * Map the page and write a bunch of bit patterns to compare
     * what we wrote with what we read back. This isn't a perfect
     * test but it should be good enough to catch most of the
     * recurring UEs. If this fails to catch a recurrent UE, we'll
     * retire the page the next time we see a UE on the page.
     */
    kaddr = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)-1);

    pa = ptob((uint64_t)page_pptonum(pp));
    pa_hi = (uint32_t)(pa >> 32);
    pa_lo = (uint32_t)pa;

    /*
     * Fill the page with each bit pattern from 0xFF down to 0x01,
     * flushing the cache in between writing and reading. We do this
     * under on_trap() protection to avoid recursion.
     */
    if (on_trap(&otd, OT_DATA_EC)) {
        PR_MESSAGE(CE_WARN, 1, MSG_UE, pa);
        errors = 1;
    } else {
        for (wb = 0xff; wb > 0; wb--) {
            for (i = 0; i < PAGESIZE; i++) {
                kaddr[i] = wb;
            }

            sync_data_memory(kaddr, PAGESIZE);

            for (i = 0; i < PAGESIZE; i++) {
                rb = kaddr[i];
                if (rb != wb) {
                    /*
                     * We had a mismatch without a trap.
                     * Uh-oh. Something is really wrong
                     * with this system.
                     */
                    if (page_retire_messages) {
                        cmn_err(CE_WARN, MSG_DM,
                            pa_hi, pa_lo, rb, wb);
                    }
                    errors = 1;
                    goto out;   /* double break */
                }
            }
        }
    }
out:
    no_trap();
    ppmapout(kaddr);

    return (errors ? 0 : 1);
}
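/*
 * Illustrative, standalone user-space sketch (not part of the original
 * source): the heart of page_clear_transient_ue() above is a write/read-back
 * pattern test. The version below applies the same pattern loop to an
 * ordinary buffer. The buffer size and the check_buffer() name are
 * assumptions for illustration only; it has none of the on_trap()
 * protection or cache flushing that the kernel code needs.
 */
#if 0   /* illustrative sketch only */
#include <stdint.h>
#include <stdio.h>

#define BUF_SIZE    4096    /* stand-in for PAGESIZE */

/* Returns 1 if every pattern read back matches what was written, else 0. */
static int
check_buffer(volatile uint8_t *buf, size_t len)
{
    uint8_t wb;
    size_t i;

    for (wb = 0xff; wb > 0; wb--) {
        for (i = 0; i < len; i++)
            buf[i] = wb;            /* write the pattern */
        for (i = 0; i < len; i++) {
            if (buf[i] != wb)       /* read back and compare */
                return (0);
        }
    }
    return (1);
}

int
main(void)
{
    static uint8_t buf[BUF_SIZE];

    printf("buffer %s\n", check_buffer(buf, sizeof (buf)) ?
        "verified" : "mismatch");
    return (0);
}
#endif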
/*
 * page_retire_pp() decides what to do with a failing page.
 *
 * When we get a free page (e.g. the scrubber or in the free path) life is
 * nice because the page is clean and marked free -- those always retire
 * nicely. From there we go by order of difficulty. If the page has data,
 * we attempt to relocate its contents to a suitable replacement page. If
 * that does not succeed, we look to see if it is clean. If after all of
 * this we have a clean, unmapped page (which we usually do!), we retire it.
 * If the page is not clean, we still retire it if the error is a UE; for
 * CEs or FMA requests, we fail, leaving the page in service. The page will
 * eventually be tried again later. We always return with the page unlocked
 * since we are called from page_unlock().
 *
 * We don't call panic or do anything fancy down in here. Our boss the DE
 * gets paid handsomely to do his job of figuring out what to do when errors
 * occur. We just do what he tells us to do.
 */
static int
page_retire_pp(page_t *pp)
{
    int toxic;

    ASSERT(PAGE_EXCL(pp));
    ASSERT(pp->p_iolock_state == 0);
    ASSERT(pp->p_szc == 0);

    PR_DEBUG(prd_top);
    PR_TYPES(pp);

    toxic = pp->p_toxic;
    ASSERT(toxic & PR_REASONS);

    if ((toxic & (PR_FMA | PR_MCE)) && !(toxic & PR_UE) &&
        page_retire_limit()) {
        page_clrtoxic(pp, PR_FMA | PR_MCE | PR_MSG | PR_BUSY);
        page_retire_dequeue(pp);
        page_unlock(pp);
        return (page_retire_done(pp, PRD_LIMIT));
    }

    if (PP_ISFREE(pp)) {
        int dbgnoreclaim = MTBF(recl_calls, recl_mtbf) == 0;

        PR_DEBUG(prd_free);

        if (dbgnoreclaim || !page_reclaim(pp, NULL)) {
            PR_DEBUG(prd_noreclaim);
            PR_INCR_KSTAT(pr_failed);
            /*
             * page_reclaim() returns with `pp' unlocked when
             * it fails.
             */
            if (dbgnoreclaim)
                page_unlock(pp);
            return (page_retire_done(pp, PRD_FAILED));
        }
    }
    ASSERT(!PP_ISFREE(pp));

    if ((toxic & PR_UE) == 0 && pp->p_vnode && !PP_ISNORELOCKERNEL(pp) &&
        MTBF(reloc_calls, reloc_mtbf)) {
        page_t *newpp;
        spgcnt_t count;

        /*
         * If we can relocate the page, great! newpp will go
         * on without us, and everything is fine. Regardless
         * of whether the relocation succeeds, we are still
         * going to take `pp' around back and shoot it.
         */
        newpp = NULL;
        if (page_relocate(&pp, &newpp, 0, 0, &count, NULL) == 0) {
            PR_DEBUG(prd_reloc);
            page_unlock(newpp);
            ASSERT(hat_page_getattr(pp, P_MOD) == 0);
        } else {
            PR_DEBUG(prd_relocfail);
        }
    }

    if (hat_ismod(pp)) {
        PR_DEBUG(prd_mod);
        PR_INCR_KSTAT(pr_failed);
        page_unlock(pp);
        return (page_retire_done(pp, PRD_FAILED));
    }

    if (PP_ISKVP(pp)) {
        PR_DEBUG(prd_kern);
        PR_INCR_KSTAT(pr_failed_kernel);
        page_unlock(pp);
        return (page_retire_done(pp, PRD_FAILED));
    }

    if (pp->p_lckcnt || pp->p_cowcnt) {
        PR_DEBUG(prd_locked);
        PR_INCR_KSTAT(pr_failed);
        page_unlock(pp);
        return (page_retire_done(pp, PRD_FAILED));
    }

    (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
    ASSERT(!hat_page_is_mapped(pp));

    /*
     * If the page is modified and was not relocated, we can't
     * retire it without dropping data on the floor. We have to
     * recheck after unloading since the dirty bit could have been
     * set since we last checked.
     */
    if (hat_ismod(pp)) {
        PR_DEBUG(prd_mod_late);
        PR_INCR_KSTAT(pr_failed);
        page_unlock(pp);
        return (page_retire_done(pp, PRD_FAILED));
    }

    if (pp->p_vnode) {
        PR_DEBUG(prd_hashout);
        page_hashout(pp, NULL);
    }
    ASSERT(!pp->p_vnode);

    /*
     * The problem page is locked, demoted, unmapped, not free,
     * hashed out, and not COW or mlocked (whew!).
     *
     * Now we select our ammunition, take it around back, and shoot it.
     */
    if (toxic & PR_UE) {
        if (page_retire_transient_ue(pp)) {
            PR_DEBUG(prd_uescrubbed);
            return (page_retire_done(pp, PRD_UE_SCRUBBED));
        } else {
            PR_DEBUG(prd_uenotscrubbed);
            page_retire_destroy(pp);
            return (page_retire_done(pp, PRD_SUCCESS));
        }
    } else if (toxic & PR_FMA) {
        PR_DEBUG(prd_fma);
        page_retire_destroy(pp);
        return (page_retire_done(pp, PRD_SUCCESS));
    } else if (toxic & PR_MCE) {
        PR_DEBUG(prd_mce);
        page_retire_destroy(pp);
        return (page_retire_done(pp, PRD_SUCCESS));
    }
    panic("page_retire_pp: bad toxic flags %d", toxic);
    /*NOTREACHED*/
}
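/*
 * MTBF() is used above as a debug fault-injection hook (recl_calls/recl_mtbf,
 * reloc_calls/reloc_mtbf) but is not defined in this excerpt. Below is a
 * minimal sketch of how such a macro could look; this is an assumption for
 * illustration, not the actual illumos definition. The idea is that the
 * expression is normally true and fails roughly once every `mtbf' calls, so
 * the rarely-taken error paths get exercised during testing. Setting the
 * mtbf tunable to 0 disables injection entirely.
 */
#if 0   /* illustrative sketch only */
#define MTBF(calls, mtbf)   (((mtbf) == 0) || (++(calls) % (mtbf) != 0))
#endif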
/*
 * This function is called when we want to decrease the memory reservation
 * of our domain. Allocate the memory and make a hypervisor call to give
 * it back.
 */
static spgcnt_t
balloon_dec_reservation(ulong_t debit)
{
    int     i, locked;
    long    rv;
    ulong_t request;
    page_t  *pp;

    bzero(mfn_frames, sizeof (mfn_frames));
    bzero(pfn_frames, sizeof (pfn_frames));

    if (debit > FRAME_ARRAY_SIZE) {
        debit = FRAME_ARRAY_SIZE;
    }
    request = debit;

    /*
     * Don't bother if there isn't a safe amount of kmem left.
     */
    if (kmem_avail() < balloon_minkmem) {
        kmem_reap();
        if (kmem_avail() < balloon_minkmem)
            return (0);
    }

    if (page_resv(request, KM_NOSLEEP) == 0) {
        return (0);
    }
    xen_block_migrate();
    for (i = 0; i < debit; i++) {
        pp = page_get_high_mfn(new_high_mfn);
        new_high_mfn = 0;
        if (pp == NULL) {
            /*
             * Call kmem_reap(), then try once more,
             * but only if there is a safe amount of
             * kmem left.
             */
            kmem_reap();
            if (kmem_avail() < balloon_minkmem ||
                (pp = page_get_high_mfn(0)) == NULL) {
                debit = i;
                break;
            }
        }

        ASSERT(PAGE_EXCL(pp));
        ASSERT(!hat_page_is_mapped(pp));

        balloon_page_add(pp);
        pfn_frames[i] = pp->p_pagenum;
        mfn_frames[i] = pfn_to_mfn(pp->p_pagenum);
    }

    if (debit == 0) {
        xen_allow_migrate();
        page_unresv(request);
        return (0);
    }

    /*
     * We zero all the pages before we start reassigning them in order to
     * minimize the time spent holding the lock on the contig pfn list.
     */
    if (balloon_zero_memory) {
        for (i = 0; i < debit; i++) {
            pfnzero(pfn_frames[i], 0, PAGESIZE);
        }
    }

    /*
     * Remove all mappings for the pfns from the system
     */
    locked = balloon_lock_contig_pfnlist(debit);
    for (i = 0; i < debit; i++) {
        reassign_pfn(pfn_frames[i], MFN_INVALID);
    }
    if (locked)
        unlock_contig_pfnlist();

    rv = balloon_free_pages(debit, mfn_frames, NULL, NULL);

    if (rv < 0) {
        cmn_err(CE_WARN, "Attempt to return pages to the hypervisor "
            "failed - up to %lu pages lost (error = %ld)", debit, rv);
        rv = 0;
    } else if (rv != debit) {
        panic("Unexpected return value (%ld) from decrease reservation "
            "hypervisor call", rv);
    }

    xen_allow_migrate();
    if (debit != request)
        page_unresv(request - debit);
    return (rv);
}
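/*
 * Illustrative sketch (not part of the original source): because
 * balloon_dec_reservation() clamps each request to FRAME_ARRAY_SIZE and may
 * return fewer pages than asked for, a caller has to keep calling it until
 * the full decrease has been handed back or no progress is being made. The
 * balloon_shrink() name below is hypothetical; in the real driver this
 * looping is handled by the balloon worker thread.
 */
#if 0   /* illustrative sketch only */
static ulong_t
balloon_shrink(ulong_t target_pages)
{
    ulong_t done = 0;
    spgcnt_t got;

    while (done < target_pages) {
        got = balloon_dec_reservation(target_pages - done);
        if (got <= 0)
            break;              /* no memory, or the hypervisor refused */
        done += (ulong_t)got;
    }
    return (done);
}
#endif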
void
pvn_write_done(page_t *plist, int flags)
{
    int dfree = 0;
    int pgrec = 0;
    int pgout = 0;
    int pgpgout = 0;
    int anonpgout = 0;
    int anonfree = 0;
    int fspgout = 0;
    int fsfree = 0;
    int execpgout = 0;
    int execfree = 0;
    page_t *pp;
    struct cpu *cpup;
    struct vnode *vp = NULL;    /* for probe */
    uint_t ppattr;
    kmutex_t *vphm = NULL;

    ASSERT((flags & B_READ) == 0);

    /*
     * If we are about to start paging anyway, start freeing pages.
     */
    if (write_free && freemem < lotsfree + pages_before_pager &&
        (flags & B_ERROR) == 0) {
        flags |= B_FREE;
    }

    /*
     * Handle each page involved in the i/o operation.
     */
    while (plist != NULL) {
        pp = plist;
        ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
        page_sub(&plist, pp);

        /* Kernel probe support */
        if (vp == NULL)
            vp = pp->p_vnode;

        if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
            /*
             * Move page to the top of the v_page list.
             * Skip pages modified during IO.
             */
            vphm = page_vnode_mutex(vp);
            mutex_enter(vphm);
            if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
                page_vpsub(&vp->v_pages, pp);
                page_vpadd(&vp->v_pages, pp);
            }
            mutex_exit(vphm);
        }

        if (flags & B_ERROR) {
            /*
             * Write operation failed. We don't want
             * to destroy (or free) the page unless B_FORCE
             * is set. We set the mod bit again and release
             * all locks on the page so that it will get written
             * back again later when things are hopefully
             * better again.
             * If B_INVAL and B_FORCE are set we really have
             * to destroy the page.
             */
            if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
                page_io_unlock(pp);
                /*LINTED: constant in conditional context*/
                VN_DISPOSE(pp, B_INVAL, 0, kcred);
            } else {
                hat_setmod_only(pp);
                page_io_unlock(pp);
                page_unlock(pp);
            }
        } else if (flags & B_INVAL) {
            /*
             * XXX - Failed writes with B_INVAL set are
             * not handled appropriately.
             */
            page_io_unlock(pp);
            /*LINTED: constant in conditional context*/
            VN_DISPOSE(pp, B_INVAL, 0, kcred);
        } else if (flags & B_FREE || !hat_page_is_mapped(pp)) {
            /*
             * Update statistics for pages being paged out
             */
            if (pp->p_vnode) {
                if (IS_SWAPFSVP(pp->p_vnode)) {
                    anonpgout++;
                } else {
                    if (pp->p_vnode->v_flag & VVMEXEC) {
                        execpgout++;
                    } else {
                        fspgout++;
                    }
                }
            }
            page_io_unlock(pp);
            pgout = 1;
            pgpgout++;
            TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
                "page_ws_out:pp %p", pp);

            /*
             * The page_struct_lock need not be acquired to
             * examine "p_lckcnt" and "p_cowcnt" since we'll
             * have an "exclusive" lock if the upgrade succeeds.
             */
            if (page_tryupgrade(pp) &&
                pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
                /*
                 * Check if someone has reclaimed the
                 * page. If ref and mod are not set, no
                 * one is using it so we can free it.
                 * The rest of the system is careful
                 * to use the NOSYNC flag to unload
                 * translations set up for i/o w/o
                 * affecting ref and mod bits.
                 *
                 * Obtain a copy of the real hardware
                 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
                 * to avoid having to flush the cache.
                 */
                ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
                    HAT_SYNC_STOPON_MOD);
            ck_refmod:
                if (!(ppattr & (P_REF | P_MOD))) {
                    if (hat_page_is_mapped(pp)) {
                        /*
                         * Doesn't look like the page
                         * was modified so now we
                         * really have to unload the
                         * translations. Meanwhile
                         * another CPU could've
                         * modified it so we have to
                         * check again. We don't loop
                         * forever here because now
                         * the translations are gone
                         * and no one can get a new one
                         * since we have the "exclusive"
                         * lock on the page.
                         */
                        (void) hat_pageunload(pp,
                            HAT_FORCE_PGUNLOAD);
                        ppattr = hat_page_getattr(pp,
                            P_REF | P_MOD);
                        goto ck_refmod;
                    }
                    /*
                     * Update statistics for pages being
                     * freed
                     */
                    if (pp->p_vnode) {
                        if (IS_SWAPFSVP(pp->p_vnode)) {
                            anonfree++;
                        } else {
                            if (pp->p_vnode->v_flag & VVMEXEC) {
                                execfree++;
                            } else {
                                fsfree++;
                            }
                        }
                    }
                    /*LINTED: constant in conditional ctx*/
                    VN_DISPOSE(pp, B_FREE,
                        (flags & B_DONTNEED), kcred);
                    dfree++;
                } else {
                    page_unlock(pp);
                    pgrec++;
                    TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
                        "page_ws_free:pp %p", pp);
                }
            } else {
                /*
                 * Page is either `locked' in memory
                 * or was reclaimed and now has a
                 * "shared" lock, so release it.
                 */
                page_unlock(pp);
            }
        } else {
            /*
             * Neither B_FREE nor B_INVAL nor B_ERROR.
             * Just release locks.
             */
            page_io_unlock(pp);
            page_unlock(pp);
        }
    }

    CPU_STATS_ENTER_K();
    cpup = CPU;     /* get cpup now that CPU cannot change */
    CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
    CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
    CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
    CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
    CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
    CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
    CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
    CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
    CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
    CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
    CPU_STATS_EXIT_K();

    /* Kernel probe */
    TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
        tnf_opaque, vnode,              vp,
        tnf_ulong,  pages_pageout,      pgpgout,
        tnf_ulong,  pages_freed,        dfree,
        tnf_ulong,  pages_reclaimed,    pgrec);
}
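/*
 * Illustrative sketch (not part of the original source): the anon/exec/fs
 * classification in pvn_write_done() above appears twice, once for pageout
 * counts and once for free counts. The hypothetical helper below shows the
 * same decision factored into one place; IS_SWAPFSVP() and VVMEXEC are the
 * real tests used above, but pvn_count_page() itself is an assumption for
 * illustration, not existing code.
 */
#if 0   /* illustrative sketch only */
static void
pvn_count_page(page_t *pp, ulong_t *anonp, ulong_t *execp, ulong_t *fsp)
{
    if (pp->p_vnode == NULL)
        return;
    if (IS_SWAPFSVP(pp->p_vnode))
        (*anonp)++;                     /* anonymous (swapfs) page */
    else if (pp->p_vnode->v_flag & VVMEXEC)
        (*execp)++;                     /* executable mapping */
    else
        (*fsp)++;                       /* ordinary file system page */
}
#endif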
/*
 * Scan page_t's and issue I/O's for modified pages.
 *
 * Also coalesces consecutive small-sized free pages into the next larger
 * pagesize. This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 */
static void
fsflush_do_pages()
{
    vnode_t         *vp;
    ulong_t         pcount;
    hrtime_t        timer = gethrtime();
    ulong_t         releases = 0;
    ulong_t         nexamined = 0;
    ulong_t         nlocked = 0;
    ulong_t         nmodified = 0;
    ulong_t         ncoalesce = 0;
    ulong_t         cnt;
    int             mod;
    int             fspage = 1;
    u_offset_t      offset;
    uint_t          szc;

    page_t          *coal_page = NULL;  /* 1st page in group to coalesce */
    uint_t          coal_szc = 0;       /* size code, coal_page->p_szc */
    uint_t          coal_cnt = 0;       /* count of pages seen */

    static ulong_t  nscan = 0;
    static pgcnt_t  last_total_pages = 0;
    static page_t   *pp = NULL;

    /*
     * Check to see if total_pages has changed.
     */
    if (total_pages != last_total_pages) {
        last_total_pages = total_pages;
        nscan = (last_total_pages * (tune.t_fsflushr)) / v.v_autoup;
    }

    if (pp == NULL)
        pp = memsegs->pages;

    pcount = 0;
    while (pcount < nscan) {

        /*
         * move to the next page, skipping over large pages
         * and issuing prefetches.
         */
        if (pp->p_szc && fspage == 0) {
            pfn_t pfn;

            pfn = page_pptonum(pp);
            cnt = page_get_pagecnt(pp->p_szc);
            cnt -= pfn & (cnt - 1);
        } else
            cnt = 1;

        pp = page_nextn(pp, cnt);
        prefetch_page_r((void *)pp);
        ASSERT(pp != NULL);
        pcount += cnt;

        /*
         * Do a bunch of dirty tests (ie. no locking) to determine
         * if we can quickly skip this page. These tests are repeated
         * after acquiring the page lock.
         */
        ++nexamined;
        if (PP_ISSWAP(pp)) {
            fspage = 0;
            coal_page = NULL;
            continue;
        }

        /*
         * skip free pages too, but try coalescing them into larger
         * pagesizes
         */
        if (PP_ISFREE(pp)) {
            /*
             * skip pages with a file system identity or that
             * are already maximum size
             */
            fspage = 0;
            szc = pp->p_szc;
            if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
                coal_page = NULL;
                continue;
            }

            /*
             * If not in a coalescing candidate page or the size
             * codes are different, start a new candidate.
             */
            if (coal_page == NULL || coal_szc != szc) {

                /*
                 * page must be properly aligned
                 */
                if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
                    coal_page = NULL;
                    continue;
                }
                coal_page = pp;
                coal_szc = szc;
                coal_cnt = 1;
                continue;
            }

            /*
             * acceptable to add this to existing candidate page
             */
            ++coal_cnt;
            if (coal_cnt < fsf_pgcnt[coal_szc])
                continue;

            /*
             * We've got enough pages to coalesce, so do it.
             * After promoting, we clear coal_page, so it will
             * take another pass to promote this to an even
             * larger page.
             */
            ++ncoalesce;
            (void) page_promote_size(coal_page, coal_szc);
            coal_page = NULL;
            continue;
        } else {
            coal_page = NULL;
        }

        if (PP_ISKAS(pp) ||
            PAGE_LOCKED(pp) ||
            pp->p_lckcnt != 0 ||
            pp->p_cowcnt != 0) {
            fspage = 0;
            continue;
        }

        /*
         * Reject pages that can't be "exclusively" locked.
         */
        if (!page_trylock(pp, SE_EXCL))
            continue;
        ++nlocked;

        /*
         * After locking the page, redo the above checks.
         * Since we locked the page, leave out the PAGE_LOCKED() test.
         */
        vp = pp->p_vnode;
        if (PP_ISSWAP(pp) ||
            PP_ISFREE(pp) ||
            vp == NULL ||
            PP_ISKAS(pp) ||
            (vp->v_flag & VISSWAP) != 0) {
            page_unlock(pp);
            fspage = 0;
            continue;
        }

        if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
            page_unlock(pp);
            continue;
        }

        fspage = 1;
        ASSERT(vp->v_type != VCHR);

        /*
         * Check the modified bit, leaving the bit alone in hardware;
         * it will be cleared if we do the putpage.
         */
        if (IS_VMODSORT(vp))
            mod = hat_ismod(pp);
        else
            mod = hat_pagesync(pp,
                HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;

        if (mod) {
            ++nmodified;
            offset = pp->p_offset;

            /*
             * Hold the vnode before releasing the page lock
             * to prevent it from being freed and re-used by
             * some other thread.
             */
            VN_HOLD(vp);

            page_unlock(pp);

            (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
                kcred, NULL);

            VN_RELE(vp);
        } else {

            /*
             * Catch any pages which should be on the cache list,
             * but aren't yet.
             */
            if (hat_page_is_mapped(pp) == 0) {
                ++releases;
                (void) page_release(pp, 1);
            } else {
                page_unlock(pp);
            }
        }
    }

    /*
     * maintain statistics
     * reset every million wakeups, just to avoid overflow
     */
    if (++fsf_cycles == 1000000) {
        fsf_cycles = 0;
        fsf_total.fsf_scan = 0;
        fsf_total.fsf_examined = 0;
        fsf_total.fsf_locked = 0;
        fsf_total.fsf_modified = 0;
        fsf_total.fsf_coalesce = 0;
        fsf_total.fsf_time = 0;
        fsf_total.fsf_releases = 0;
    } else {
        fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
        fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
        fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
        fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
        fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
        fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
        fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
    }
}
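/*
 * Worked example (not part of the original source): nscan above is the
 * number of page_t's examined per fsflush wakeup,
 *
 *	nscan = total_pages * t_fsflushr / v_autoup
 *
 * With the purely illustrative values total_pages = 4194304 (16 GB of 4 KB
 * pages), t_fsflushr = 1 second and v_autoup = 30 seconds, each wakeup
 * scans 4194304 * 1 / 30, roughly 139810 pages, so the whole of memory is
 * covered about once every v_autoup seconds.
 */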