/*
 * Walk the vp->v_pages list, calling the callback function pointed to by
 * *page_check for every page.  If page_check returns non-zero, mark the
 * page as modified and, if VMODSORT is set, move it to the end of the
 * v_pages list.  Moving makes sense only if we have at least two pages -
 * this also avoids having v_pages temporarily being NULL after calling
 * page_vpsub() if there was just one page.
 */
void
pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
{
	page_t		*pp, *next, *end;
	kmutex_t	*vphm;
	int		shuffle;

	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);

	if (vp->v_pages == NULL) {
		mutex_exit(vphm);
		return;
	}

	end = vp->v_pages->p_vpprev;
	shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);

	pp = vp->v_pages;
	for (;;) {
		next = pp->p_vpnext;

		if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
			/*
			 * hat_setmod_only(), in contrast to hat_setmod(),
			 * does not shuffle the pages and does not grab
			 * page_vnode_mutex.  Exactly what we need.
			 */
			hat_setmod_only(pp);

			if (shuffle) {
				page_vpsub(&vp->v_pages, pp);
				ASSERT(vp->v_pages != NULL);
				page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
				    pp);
			}
		}

		/* Stop if we have just processed the last page. */
		if (pp == end)
			break;
		pp = next;
	}

	mutex_exit(vphm);
}
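/*
 * Hypothetical usage sketch (not part of the original source): a file
 * system that tracks per-page commit state in p_fsdata could use a
 * callback like the one below to re-dirty every page that still awaits
 * a commit, e.g. after a server-side commit fails.  The function name is
 * made up; C_NOCOMMIT is the per-page tag used by the NFS code in
 * nfs4_dross_pages() below.
 */
static int
example_needs_redirty(page_t *pp)
{
	/* Non-zero makes pvn_vplist_setdirty() mark the page modified. */
	return (pp->p_fsdata != C_NOCOMMIT);
}

/* A caller would then simply do: */
/*	pvn_vplist_setdirty(vp, example_needs_redirty);		*/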
/*
 * If the vnode has pages, run the list and check for any that are
 * still dangling. We call this routine before putting an rnode on
 * the free list.
 */
static int
nfs4_dross_pages(vnode_t *vp)
{
	page_t *pp;
	kmutex_t *vphm;

	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);
	if ((pp = vp->v_pages) != NULL) {
		do {
			if (pp->p_fsdata != C_NOCOMMIT) {
				mutex_exit(vphm);
				return (1);
			}
		} while ((pp = pp->p_vpnext) != vp->v_pages);
	}
	mutex_exit(vphm);

	return (0);
}
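/*
 * Hypothetical caller sketch (illustration only): before recycling an
 * rnode, a caller could use nfs4_dross_pages() to decide whether the
 * vnode still carries pages that need a commit.  The helper name is
 * made up; vn_has_cached_data() is the standard "vnode has pages" check.
 */
static int
example_rnode_can_be_freed(vnode_t *vp)
{
	/* A vnode with uncommitted ("dross") pages must not be recycled. */
	return (!vn_has_cached_data(vp) || nfs4_dross_pages(vp) == 0);
}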
/*
 * Process a vnode's page list for all pages whose offset is >= off.
 * Pages are to either be free'd, invalidated, or written back to disk.
 *
 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 * is specified, otherwise they are "shared" locked.
 *
 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 *
 * Special marker page_t's are inserted in the list in order
 * to keep track of where we are in the list when locks are dropped.
 *
 * Note the list is circular and insertions can happen only at the
 * head and tail of the list. The algorithm ensures visiting all pages
 * on the list in the following way:
 *
 *    Drop two marker pages at the end of the list.
 *
 *    Move one marker page backwards towards the start of the list until
 *    it is at the list head, processing the pages passed along the way.
 *
 * Due to race conditions when the vphm mutex is dropped, additional pages
 * can be added to either end of the list, so we'll continue to move
 * the marker and process pages until it is up against the end marker.
 *
 * There is one special exit condition. If we are processing a VMODSORT
 * vnode and only writing back modified pages, we can stop as soon as
 * we run into an unmodified page.  This makes fsync(3) operations fast.
 */
int
pvn_vplist_dirty(
	vnode_t		*vp,
	u_offset_t	off,
	int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
			size_t *, int, cred_t *),
	int		flags,
	cred_t		*cred)
{
	page_t		*pp;
	page_t		*mark;		/* marker page that moves toward head */
	page_t		*end;		/* marker page at end of list */
	int		err = 0;
	int		error;
	kmutex_t	*vphm;
	se_t		se;
	page_t		**where_to_move;

	ASSERT(vp->v_type != VCHR);

	if (vp->v_pages == NULL)
		return (0);

	/*
	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
	 *
	 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
	 * from getting blocked while flushing pages to a dead NFS server.
	 */
	mutex_enter(&vp->v_lock);
	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
		mutex_exit(&vp->v_lock);
		return (EAGAIN);
	}

	while (vp->v_flag & VVMLOCK)
		cv_wait(&vp->v_cv, &vp->v_lock);

	if (vp->v_pages == NULL) {
		mutex_exit(&vp->v_lock);
		return (0);
	}

	vp->v_flag |= VVMLOCK;
	mutex_exit(&vp->v_lock);

	/*
	 * Set up the marker pages used to walk the list
	 */
	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
	end->p_vnode = vp;
	end->p_offset = (u_offset_t)-2;
	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
	mark->p_vnode = vp;
	mark->p_offset = (u_offset_t)-1;

	/*
	 * Grab the lock protecting the vnode's page list;
	 * note that this lock is dropped at times in the loop.
	 */
	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);
	if (vp->v_pages == NULL)
		goto leave;

	/*
	 * insert the markers and loop through the list of pages
	 */
	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
	page_vpadd(&mark->p_vpnext, end);
	for (;;) {

		/*
		 * If only doing an async write back, then we can
		 * stop as soon as we get to start of the list.
		 */
		if (flags == B_ASYNC && vp->v_pages == mark)
			break;

		/*
		 * otherwise stop when we've gone through all the pages
		 */
		if (mark->p_vpprev == end)
			break;

		pp = mark->p_vpprev;
		if (vp->v_pages == pp)
			where_to_move = &vp->v_pages;
		else
			where_to_move = &pp->p_vpprev->p_vpnext;

		ASSERT(pp->p_vnode == vp);

		/*
		 * If just flushing dirty pages to disk and this vnode
		 * is using a sorted list of pages, we can stop processing
		 * as soon as we find an unmodified page, since all the
		 * modified pages are visited first.
		 */
		if (IS_VMODSORT(vp) &&
		    !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
			if (!hat_ismod(pp) && !page_io_locked(pp)) {
#ifdef	DEBUG
				/*
				 * For debug kernels examine what should be
				 * all the remaining clean pages, asserting
				 * that they are not modified.
				 */
				page_t	*chk = pp;
				int	attr;

				page_vpsub(&vp->v_pages, mark);
				page_vpadd(where_to_move, mark);
				do {
					chk = chk->p_vpprev;
					ASSERT(chk != end);
					if (chk == mark)
						continue;
					attr = hat_page_getattr(chk, P_MOD |
					    P_REF);
					if ((attr & P_MOD) == 0)
						continue;
					panic("v_pages list not all clean: "
					    "page_t*=%p vnode=%p off=%lx "
					    "attr=0x%x last clean page_t*=%p\n",
					    (void *)chk, (void *)chk->p_vnode,
					    (long)chk->p_offset, attr,
					    (void *)pp);
				} while (chk != vp->v_pages);
#endif
				break;
			} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
				/*
				 * Couldn't get io lock, wait until IO is done.
				 * Block only for sync IO since we don't want
				 * to block async IO.
				 */
				mutex_exit(vphm);
				page_io_wait(pp);
				mutex_enter(vphm);
				continue;
			}
		}

		/*
		 * Skip this page if the offset is out of the desired range.
		 * Just move the marker and continue.
		 */
		if (pp->p_offset < off) {
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
			continue;
		}

		/*
		 * If we are supposed to invalidate or free this
		 * page, then we need an exclusive lock.
		 */
		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		/*
		 * We must acquire the page lock for all synchronous
		 * operations (invalidate, free and write).
		 */
		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
			/*
			 * If the page_lock() drops the mutex
			 * we must retry the loop.
			 */
			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
				continue;

			/*
			 * It's ok to move the marker page now.
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
		} else {

			/*
			 * update the marker page for all remaining cases
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);

			/*
			 * For write backs, if we can't lock the page, it's
			 * invalid or in the process of being destroyed.  Skip
			 * it, assuming someone else is writing it.
			 */
			if (!page_trylock(pp, se))
				continue;
		}

		ASSERT(pp->p_vnode == vp);

		/*
		 * Successfully locked the page, now figure out what to
		 * do with it. Free pages are easily dealt with, invalidate
		 * if desired or just go on to the next page.
		 */
		if (PP_ISFREE(pp)) {
			if ((flags & B_INVAL) == 0) {
				page_unlock(pp);
				continue;
			}

			/*
			 * Invalidate (destroy) the page.
			 */
			mutex_exit(vphm);
			page_destroy_free(pp);
			mutex_enter(vphm);
			continue;
		}

		/*
		 * pvn_getdirty() figures out what to do with a dirty page.
		 * If the page is dirty, the putapage() routine will write it
		 * and will kluster any other adjacent dirty pages it can.
		 *
		 * pvn_getdirty() and `(*putapage)' unlock the page.
		 */
		mutex_exit(vphm);
		if (pvn_getdirty(pp, flags)) {
			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
			if (!err)
				err = error;
		}
		mutex_enter(vphm);
	}
	page_vpsub(&vp->v_pages, mark);
	page_vpsub(&vp->v_pages, end);

leave:
	/*
	 * Release the v_pages mutex and VVMLOCK, and wake up blocked threads.
	 */
	mutex_exit(vphm);
	kmem_cache_free(marker_cache, mark);
	kmem_cache_free(marker_cache, end);
	mutex_enter(&vp->v_lock);
	vp->v_flag &= ~VVMLOCK;
	cv_broadcast(&vp->v_cv);
	mutex_exit(&vp->v_lock);
	return (err);
}
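/*
 * Hypothetical putpage sketch (illustration only): a file system's
 * VOP_PUTPAGE entry point typically hands a whole-file request (len == 0)
 * to pvn_vplist_dirty(), supplying its own putapage routine to perform
 * the actual writes.  example_putpage and example_putapage are made-up
 * names; the putapage signature matches the one pvn_vplist_dirty() expects.
 */
static int example_putapage(vnode_t *, page_t *, u_offset_t *, size_t *,
    int, cred_t *);

static int
example_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr)
{
	if (len == 0) {
		/* Walk the whole v_pages list, writing back dirty pages. */
		return (pvn_vplist_dirty(vp, (u_offset_t)off, example_putapage,
		    flags, cr));
	}

	/* A ranged request would instead be handled page by page. */
	return (0);
}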
void
pvn_write_done(page_t *plist, int flags)
{
	int dfree = 0;
	int pgrec = 0;
	int pgout = 0;
	int pgpgout = 0;
	int anonpgout = 0;
	int anonfree = 0;
	int fspgout = 0;
	int fsfree = 0;
	int execpgout = 0;
	int execfree = 0;
	page_t *pp;
	struct cpu *cpup;
	struct vnode *vp = NULL;	/* for probe */
	uint_t ppattr;
	kmutex_t *vphm = NULL;

	ASSERT((flags & B_READ) == 0);

	/*
	 * If we are about to start paging anyway, start freeing pages.
	 */
	if (write_free && freemem < lotsfree + pages_before_pager &&
	    (flags & B_ERROR) == 0) {
		flags |= B_FREE;
	}

	/*
	 * Handle each page involved in the i/o operation.
	 */
	while (plist != NULL) {
		pp = plist;
		ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
		page_sub(&plist, pp);

		/* Kernel probe support */
		if (vp == NULL)
			vp = pp->p_vnode;

		if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
			/*
			 * Move page to the top of the v_page list.
			 * Skip pages modified during IO.
			 */
			vphm = page_vnode_mutex(vp);
			mutex_enter(vphm);
			if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
				page_vpsub(&vp->v_pages, pp);
				page_vpadd(&vp->v_pages, pp);
			}
			mutex_exit(vphm);
		}

		if (flags & B_ERROR) {
			/*
			 * Write operation failed.  We don't want
			 * to destroy (or free) the page unless B_FORCE
			 * is set. We set the mod bit again and release
			 * all locks on the page so that it will get written
			 * back again later when things are hopefully
			 * better again.
			 * If B_INVAL and B_FORCE is set we really have
			 * to destroy the page.
			 */
			if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
				page_io_unlock(pp);
				/*LINTED: constant in conditional context*/
				VN_DISPOSE(pp, B_INVAL, 0, kcred);
			} else {
				hat_setmod_only(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
		} else if (flags & B_INVAL) {
			/*
			 * XXX - Failed writes with B_INVAL set are
			 * not handled appropriately.
			 */
			page_io_unlock(pp);
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if (flags & B_FREE || !hat_page_is_mapped(pp)) {
			/*
			 * Update statistics for pages being paged out
			 */
			if (pp->p_vnode) {
				if (IS_SWAPFSVP(pp->p_vnode)) {
					anonpgout++;
				} else {
					if (pp->p_vnode->v_flag & VVMEXEC) {
						execpgout++;
					} else {
						fspgout++;
					}
				}
			}
			page_io_unlock(pp);
			pgout = 1;
			pgpgout++;
			TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
			    "page_ws_out:pp %p", pp);

			/*
			 * The page_struct_lock need not be acquired to
			 * examine "p_lckcnt" and "p_cowcnt" since we'll
			 * have an "exclusive" lock if the upgrade succeeds.
			 */
			if (page_tryupgrade(pp) && pp->p_lckcnt == 0 &&
			    pp->p_cowcnt == 0) {
				/*
				 * Check if someone has reclaimed the
				 * page.  If ref and mod are not set, no
				 * one is using it so we can free it.
				 * The rest of the system is careful
				 * to use the NOSYNC flag to unload
				 * translations set up for i/o w/o
				 * affecting ref and mod bits.
				 *
				 * Obtain a copy of the real hardware
				 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
				 * to avoid having to flush the cache.
				 */
				ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
				    HAT_SYNC_STOPON_MOD);
			ck_refmod:
				if (!(ppattr & (P_REF | P_MOD))) {
					if (hat_page_is_mapped(pp)) {
						/*
						 * Doesn't look like the page
						 * was modified so now we
						 * really have to unload the
						 * translations.  Meanwhile
						 * another CPU could've
						 * modified it so we have to
						 * check again.  We don't loop
						 * forever here because now
						 * the translations are gone
						 * and no one can get a new one
						 * since we have the "exclusive"
						 * lock on the page.
						 */
						(void) hat_pageunload(pp,
						    HAT_FORCE_PGUNLOAD);
						ppattr = hat_page_getattr(pp,
						    P_REF | P_MOD);
						goto ck_refmod;
					}
					/*
					 * Update statistics for pages being
					 * freed
					 */
					if (pp->p_vnode) {
						if (IS_SWAPFSVP(pp->p_vnode)) {
							anonfree++;
						} else {
							if (pp->p_vnode->v_flag
							    & VVMEXEC) {
								execfree++;
							} else {
								fsfree++;
							}
						}
					}
					/*LINTED: constant in conditional ctx*/
					VN_DISPOSE(pp, B_FREE,
					    (flags & B_DONTNEED), kcred);
					dfree++;
				} else {
					page_unlock(pp);
					pgrec++;
					TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
					    "page_ws_free:pp %p", pp);
				}
			} else {
				/*
				 * Page is either `locked' in memory
				 * or was reclaimed and now has a
				 * "shared" lock, so release it.
				 */
				page_unlock(pp);
			}
		} else {
			/*
			 * Neither B_FREE nor B_INVAL nor B_ERROR.
			 * Just release locks.
			 */
			page_io_unlock(pp);
			page_unlock(pp);
		}
	}

	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get cpup now that CPU cannot change */
	CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
	CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
	CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
	CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
	CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
	CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
	CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
	CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
	CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
	CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
	CPU_STATS_EXIT_K();

	/* Kernel probe */
	TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
	    tnf_opaque,	vnode,			vp,
	    tnf_ulong,	pages_pageout,		pgpgout,
	    tnf_ulong,	pages_freed,		dfree,
	    tnf_ulong,	pages_reclaimed,	pgrec);
}
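/*
 * Hypothetical completion sketch (illustration only): once a putapage
 * routine has issued the page-out I/O and the I/O has finished
 * (synchronously here for simplicity), it hands the still io-locked page
 * list back to pvn_write_done(), adding B_ERROR on failure so the pages
 * are re-dirtied instead of being freed.  The function name is made up.
 */
static void
example_putapage_done(page_t *plist, int flags, int err)
{
	if (err != 0)
		flags |= B_ERROR;

	/* Drops the io lock and the page lock on every page in plist. */
	pvn_write_done(plist, flags | B_WRITE);
}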