Example #1
/*
 * page_retire_pp() decides what to do with a failing page.
 *
 * When we get a free page (e.g. the scrubber or in the free path) life is
 * nice because the page is clean and marked free -- those always retire
 * nicely. From there we go by order of difficulty. If the page has data,
 * we attempt to relocate its contents to a suitable replacement page. If
 * that does not succeed, we look to see if it is clean. If after all of
 * this we have a clean, unmapped page (which we usually do!), we retire it.
 * If the page is not clean, we still process it regardless on a UE; for
 * CEs or FMA requests, we fail, leaving the page in service. The page will
 * eventually be tried again later. We always return with the page unlocked
 * since we are called from page_unlock().
 *
 * We don't call panic or do anything fancy down in here. Our boss the DE
 * gets paid handsomely to do his job of figuring out what to do when errors
 * occur. We just do what he tells us to do.
 */
static int
page_retire_pp(page_t *pp)
{
	int		toxic;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_iolock_state == 0);
	ASSERT(pp->p_szc == 0);

	PR_DEBUG(prd_top);
	PR_TYPES(pp);

	toxic = pp->p_toxic;
	ASSERT(toxic & PR_REASONS);

	if ((toxic & (PR_FMA | PR_MCE)) && !(toxic & PR_UE) &&
	    page_retire_limit()) {
		page_clrtoxic(pp, PR_FMA | PR_MCE | PR_MSG | PR_BUSY);
		page_retire_dequeue(pp);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_LIMIT));
	}

	if (PP_ISFREE(pp)) {
		int dbgnoreclaim = MTBF(recl_calls, recl_mtbf) == 0;

		PR_DEBUG(prd_free);

		if (dbgnoreclaim || !page_reclaim(pp, NULL)) {
			PR_DEBUG(prd_noreclaim);
			PR_INCR_KSTAT(pr_failed);
			/*
			 * page_reclaim() returns with `pp' unlocked when
			 * it fails.
			 */
			if (dbgnoreclaim)
				page_unlock(pp);
			return (page_retire_done(pp, PRD_FAILED));
		}
	}
	ASSERT(!PP_ISFREE(pp));

	if ((toxic & PR_UE) == 0 && pp->p_vnode && !PP_ISNORELOCKERNEL(pp) &&
	    MTBF(reloc_calls, reloc_mtbf)) {
		page_t *newpp;
		spgcnt_t count;

		/*
		 * If we can relocate the page, great! newpp will go
		 * on without us, and everything is fine.  Regardless
		 * of whether the relocation succeeds, we are still
		 * going to take `pp' around back and shoot it.
		 */
		newpp = NULL;
		if (page_relocate(&pp, &newpp, 0, 0, &count, NULL) == 0) {
			PR_DEBUG(prd_reloc);
			page_unlock(newpp);
			ASSERT(hat_page_getattr(pp, P_MOD) == 0);
		} else {
			PR_DEBUG(prd_relocfail);
		}
	}

	if (hat_ismod(pp)) {
		PR_DEBUG(prd_mod);
		PR_INCR_KSTAT(pr_failed);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_FAILED));
	}

	if (PP_ISKVP(pp)) {
		PR_DEBUG(prd_kern);
		PR_INCR_KSTAT(pr_failed_kernel);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_FAILED));
	}

	if (pp->p_lckcnt || pp->p_cowcnt) {
		PR_DEBUG(prd_locked);
		PR_INCR_KSTAT(pr_failed);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_FAILED));
	}

	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	ASSERT(!hat_page_is_mapped(pp));

	/*
	 * If the page is modified and was not relocated, we can't
	 * retire it without dropping data on the floor. We have to
	 * recheck after unloading since the dirty bit could have been
	 * set since we last checked.
	 */
	if (hat_ismod(pp)) {
		PR_DEBUG(prd_mod_late);
		PR_INCR_KSTAT(pr_failed);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_FAILED));
	}

	if (pp->p_vnode) {
		PR_DEBUG(prd_hashout);
		page_hashout(pp, NULL);
	}
	ASSERT(!pp->p_vnode);

	/*
	 * The problem page is locked, demoted, unmapped, not free,
	 * hashed out, and not COW or mlocked (whew!).
	 *
	 * Now we select our ammunition, take it around back, and shoot it.
	 */
	if (toxic & PR_UE) {
		if (page_retire_transient_ue(pp)) {
			PR_DEBUG(prd_uescrubbed);
			return (page_retire_done(pp, PRD_UE_SCRUBBED));
		} else {
			PR_DEBUG(prd_uenotscrubbed);
			page_retire_destroy(pp);
			return (page_retire_done(pp, PRD_SUCCESS));
		}
	} else if (toxic & PR_FMA) {
		PR_DEBUG(prd_fma);
		page_retire_destroy(pp);
		return (page_retire_done(pp, PRD_SUCCESS));
	} else if (toxic & PR_MCE) {
		PR_DEBUG(prd_mce);
		page_retire_destroy(pp);
		return (page_retire_done(pp, PRD_SUCCESS));
	}
	panic("page_retire_pp: bad toxic flags %d", toxic);
	/*NOTREACHED*/
}
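
The MTBF(recl_calls, recl_mtbf) and MTBF(reloc_calls, reloc_mtbf) calls above (note the dbgnoreclaim variable) serve as a fault-injection hook so that the rarely taken error paths get exercised during testing. Below is a minimal standalone sketch of such a counter-based throttle; the macro and variable names are invented for illustration, and the real kernel definition may differ.

/*
 * Hypothetical counter-based fault-injection throttle, loosely modeled
 * on the MTBF() usage above. Compiled with -DDEBUG it "fails" (returns
 * 0) once every (mask + 1) calls; otherwise it always succeeds and the
 * counter goes unused. dbg_mtbf and recl_calls_demo are invented names.
 */
#include <stdio.h>

#ifdef DEBUG
#define	dbg_mtbf(counter, mask)	((++(counter) & (mask)) != (mask))
#else
#define	dbg_mtbf(counter, mask)	(1)
#endif

static unsigned int recl_calls_demo;

int
main(void)
{
	int i;

	for (i = 0; i < 16; i++) {
		if (dbg_mtbf(recl_calls_demo, 0x7) == 0)
			printf("call %2d: injected reclaim failure\n", i);
		else
			printf("call %2d: normal path\n", i);
	}
	return (0);
}
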
Example #2
/*
 * Process a vnode's page list for all pages whose offset is >= off.
 * Pages are to either be free'd, invalidated, or written back to disk.
 *
 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 * is specified, otherwise they are "shared" locked.
 *
 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 *
 * Special marker page_t's are inserted in the list in order
 * to keep track of where we are in the list when locks are dropped.
 *
 * Note the list is circular and insertions can happen only at the
 * head and tail of the list. The algorithm ensures visiting all pages
 * on the list in the following way:
 *
 *    Drop two marker pages at the end of the list.
 *
 *    Move one marker page backwards towards the start of the list until
 *    it is at the list head, processing the pages passed along the way.
 *
 *    Due to race conditions when the vphm mutex is dropped, additional pages
 *    can be added to either end of the list, so we'll continue to move
 *    the marker and process pages until it is up against the end marker.
 *
 * There is one special exit condition. If we are processing a VMODSORT
 * vnode and only writing back modified pages, we can stop as soon as
 * we run into an unmodified page.  This makes fsync(3) operations fast.
 */
int
pvn_vplist_dirty(
    vnode_t		*vp,
    u_offset_t	off,
    int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
                        size_t *, int, cred_t *),
    int		flags,
    cred_t		*cred)
{
    page_t		*pp;
    page_t		*mark;		/* marker page that moves toward head */
    page_t		*end;		/* marker page at end of list */
    int		err = 0;
    int		error;
    kmutex_t	*vphm;
    se_t		se;
    page_t		**where_to_move;

    ASSERT(vp->v_type != VCHR);

    if (vp->v_pages == NULL)
        return (0);


    /*
     * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
     *
     * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
     * from getting blocked while flushing pages to a dead NFS server.
     */
    mutex_enter(&vp->v_lock);
    if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
        mutex_exit(&vp->v_lock);
        return (EAGAIN);
    }

    while (vp->v_flag & VVMLOCK)
        cv_wait(&vp->v_cv, &vp->v_lock);

    if (vp->v_pages == NULL) {
        mutex_exit(&vp->v_lock);
        return (0);
    }

    vp->v_flag |= VVMLOCK;
    mutex_exit(&vp->v_lock);


    /*
     * Set up the marker pages used to walk the list
     */
    end = kmem_cache_alloc(marker_cache, KM_SLEEP);
    end->p_vnode = vp;
    end->p_offset = (u_offset_t)-2;
    mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
    mark->p_vnode = vp;
    mark->p_offset = (u_offset_t)-1;

    /*
     * Grab the lock protecting the vnode's page list;
     * note that this lock is dropped at times in the loop.
     */
    vphm = page_vnode_mutex(vp);
    mutex_enter(vphm);
    if (vp->v_pages == NULL)
        goto leave;

    /*
     * insert the markers and loop through the list of pages
     */
    page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
    page_vpadd(&mark->p_vpnext, end);
    for (;;) {

        /*
         * If only doing an async write back, then we can
         * stop as soon as we get to the start of the list.
         */
        if (flags == B_ASYNC && vp->v_pages == mark)
            break;

        /*
         * otherwise stop when we've gone through all the pages
         */
        if (mark->p_vpprev == end)
            break;

        pp = mark->p_vpprev;
        if (vp->v_pages == pp)
            where_to_move = &vp->v_pages;
        else
            where_to_move = &pp->p_vpprev->p_vpnext;

        ASSERT(pp->p_vnode == vp);

        /*
         * If just flushing dirty pages to disk and this vnode
         * is using a sorted list of pages, we can stop processing
         * as soon as we find an unmodified page, since all the
         * modified pages are visited first.
         */
        if (IS_VMODSORT(vp) &&
                !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
            if (!hat_ismod(pp) && !page_io_locked(pp)) {
#ifdef  DEBUG
                /*
                 * For debug kernels examine what should be
                 * all the remaining clean pages, asserting
                 * that they are not modified.
                 */
                page_t	*chk = pp;
                int	attr;

                page_vpsub(&vp->v_pages, mark);
                page_vpadd(where_to_move, mark);
                do {
                    chk = chk->p_vpprev;
                    ASSERT(chk != end);
                    if (chk == mark)
                        continue;
                    attr = hat_page_getattr(chk, P_MOD |
                                            P_REF);
                    if ((attr & P_MOD) == 0)
                        continue;
                    panic("v_pages list not all clean: "
                          "page_t*=%p vnode=%p off=%lx "
                          "attr=0x%x last clean page_t*=%p\n",
                          (void *)chk, (void *)chk->p_vnode,
                          (long)chk->p_offset, attr,
                          (void *)pp);
                } while (chk != vp->v_pages);
#endif
                break;
            } else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
                /*
                 * Couldn't get io lock, wait until IO is done.
                 * Block only for sync IO since we don't want
                 * to block async IO.
                 */
                mutex_exit(vphm);
                page_io_wait(pp);
                mutex_enter(vphm);
                continue;
            }
        }

        /*
         * Skip this page if the offset is out of the desired range.
         * Just move the marker and continue.
         */
        if (pp->p_offset < off) {
            page_vpsub(&vp->v_pages, mark);
            page_vpadd(where_to_move, mark);
            continue;
        }

        /*
         * If we are supposed to invalidate or free this
         * page, then we need an exclusive lock.
         */
        se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

        /*
         * We must acquire the page lock for all synchronous
         * operations (invalidate, free and write).
         */
        if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
            /*
             * If the page_lock() drops the mutex
             * we must retry the loop.
             */
            if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
                continue;

            /*
             * It's ok to move the marker page now.
             */
            page_vpsub(&vp->v_pages, mark);
            page_vpadd(where_to_move, mark);
        } else {

            /*
             * update the marker page for all remaining cases
             */
            page_vpsub(&vp->v_pages, mark);
            page_vpadd(where_to_move, mark);

            /*
             * For write backs, if we can't lock the page, it's
             * invalid or in the process of being destroyed.  Skip
             * it, assuming someone else is writing it.
             */
            if (!page_trylock(pp, se))
                continue;
        }

        ASSERT(pp->p_vnode == vp);

        /*
         * Successfully locked the page, now figure out what to
         * do with it. Free pages are easily dealt with: invalidate
         * if desired, or just go on to the next page.
         */
        if (PP_ISFREE(pp)) {
            if ((flags & B_INVAL) == 0) {
                page_unlock(pp);
                continue;
            }

            /*
             * Invalidate (destroy) the page.
             */
            mutex_exit(vphm);
            page_destroy_free(pp);
            mutex_enter(vphm);
            continue;
        }

        /*
         * pvn_getdirty() figures out what to do with a dirty page.
         * If the page is dirty, the putapage() routine will write it
         * and will kluster any other adjacent dirty pages it can.
         *
         * pvn_getdirty() and `(*putapage)' unlock the page.
         */
        mutex_exit(vphm);
        if (pvn_getdirty(pp, flags)) {
            error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
            if (!err)
                err = error;
        }
        mutex_enter(vphm);
    }
    page_vpsub(&vp->v_pages, mark);
    page_vpsub(&vp->v_pages, end);

leave:
    /*
     * Release the v_pages mutex, clear VVMLOCK, and wake up blocked threads
     */
    mutex_exit(vphm);
    kmem_cache_free(marker_cache, mark);
    kmem_cache_free(marker_cache, end);
    mutex_enter(&vp->v_lock);
    vp->v_flag &= ~VVMLOCK;
    cv_broadcast(&vp->v_cv);
    mutex_exit(&vp->v_lock);
    return (err);
}
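
The block comment above describes the core trick: two marker entries are parked at the tail of the circular v_pages list, and one of them is walked backwards toward the head, so the current position survives dropping and retaking the list lock. Below is a minimal user-level sketch of that moving-marker walk on a plain circular doubly linked list; all types and helpers are invented for illustration, and the kernel's page_vpadd()/page_vpsub() and vphm locking are not reproduced.

/*
 * Moving-marker walk over a circular doubly linked list, sketched
 * after the algorithm described for pvn_vplist_dirty(). Nodes are
 * processed from the tail toward the head; after each node the marker
 * hops over it, so a lock could be dropped between iterations without
 * losing our place.
 */
#include <stdio.h>

typedef struct node {
	struct node	*next;
	struct node	*prev;
	int		value;		/* -1 marks a marker (for clarity) */
} node_t;

/* Link n into the circular list immediately before pos. */
static void
insert_before(node_t *pos, node_t *n)
{
	n->next = pos;
	n->prev = pos->prev;
	pos->prev->next = n;
	pos->prev = n;
}

/* Unlink n from the circular list (other members must remain). */
static void
remove_node(node_t *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	n->next = n->prev = NULL;
}

int
main(void)
{
	node_t	nodes[5], mark, end;
	int	i;

	/* Build a circular list n0 .. n4, with n0 acting as the head. */
	nodes[0].value = 0;
	nodes[0].next = nodes[0].prev = &nodes[0];
	for (i = 1; i < 5; i++) {
		nodes[i].value = i;
		insert_before(&nodes[0], &nodes[i]);	/* append at tail */
	}

	/* Park both markers at the tail; mark sits just before end. */
	mark.value = end.value = -1;
	insert_before(&nodes[0], &mark);
	insert_before(&nodes[0], &end);

	/* Walk mark backwards until only the end marker is behind it. */
	while (mark.prev != &end) {
		node_t *cur = mark.prev;

		printf("processing node %d\n", cur->value);

		/* Hop the marker over the node we just processed. */
		remove_node(&mark);
		insert_before(cur, &mark);
	}

	/* Pull the markers back out before the list is used again. */
	remove_node(&mark);
	remove_node(&end);
	return (0);
}
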
Example #3
void
pvn_write_done(page_t *plist, int flags)
{
    int dfree = 0;
    int pgrec = 0;
    int pgout = 0;
    int pgpgout = 0;
    int anonpgout = 0;
    int anonfree = 0;
    int fspgout = 0;
    int fsfree = 0;
    int execpgout = 0;
    int execfree = 0;
    page_t *pp;
    struct cpu *cpup;
    struct vnode *vp = NULL;	/* for probe */
    uint_t ppattr;
    kmutex_t *vphm = NULL;

    ASSERT((flags & B_READ) == 0);

    /*
     * If we are about to start paging anyway, start freeing pages.
     */
    if (write_free && freemem < lotsfree + pages_before_pager &&
            (flags & B_ERROR) == 0) {
        flags |= B_FREE;
    }

    /*
     * Handle each page involved in the i/o operation.
     */
    while (plist != NULL) {
        pp = plist;
        ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
        page_sub(&plist, pp);

        /* Kernel probe support */
        if (vp == NULL)
            vp = pp->p_vnode;

        if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
            /*
             * Move the page to the top of the v_pages list.
             * Skip pages modified during IO.
             */
            vphm = page_vnode_mutex(vp);
            mutex_enter(vphm);
            if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
                page_vpsub(&vp->v_pages, pp);
                page_vpadd(&vp->v_pages, pp);
            }
            mutex_exit(vphm);
        }

        if (flags & B_ERROR) {
            /*
             * Write operation failed.  We don't want
             * to destroy (or free) the page unless B_FORCE
             * is set. We set the mod bit again and release
             * all locks on the page so that it will get written
             * back again later when things are hopefully better.
             * If B_INVAL and B_FORCE are both set, we really have
             * to destroy the page.
             */
            if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
                page_io_unlock(pp);
                /*LINTED: constant in conditional context*/
                VN_DISPOSE(pp, B_INVAL, 0, kcred);
            } else {
                hat_setmod_only(pp);
                page_io_unlock(pp);
                page_unlock(pp);
            }
        } else if (flags & B_INVAL) {
            /*
             * XXX - Failed writes with B_INVAL set are
             * not handled appropriately.
             */
            page_io_unlock(pp);
            /*LINTED: constant in conditional context*/
            VN_DISPOSE(pp, B_INVAL, 0, kcred);
        } else if (flags & B_FREE || !hat_page_is_mapped(pp)) {
            /*
             * Update statistics for pages being paged out
             */
            if (pp->p_vnode) {
                if (IS_SWAPFSVP(pp->p_vnode)) {
                    anonpgout++;
                } else {
                    if (pp->p_vnode->v_flag & VVMEXEC) {
                        execpgout++;
                    } else {
                        fspgout++;
                    }
                }
            }
            page_io_unlock(pp);
            pgout = 1;
            pgpgout++;
            TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
                    "page_ws_out:pp %p", pp);

            /*
             * The page_struct_lock need not be acquired to
             * examine "p_lckcnt" and "p_cowcnt" since we'll
             * have an "exclusive" lock if the upgrade succeeds.
             */
            if (page_tryupgrade(pp) &&
                    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
                /*
                 * Check if someone has reclaimed the
                 * page.  If ref and mod are not set, no
                 * one is using it so we can free it.
                 * The rest of the system is careful
                 * to use the NOSYNC flag to unload
                 * translations set up for i/o w/o
                 * affecting ref and mod bits.
                 *
                 * Obtain a copy of the real hardware
                 * mod bit using hat_pagesync(pp, HAT_SYNC_DONTZERO)
                 * to avoid having to flush the cache.
                 */
                ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
                                      HAT_SYNC_STOPON_MOD);
ck_refmod:
                if (!(ppattr & (P_REF | P_MOD))) {
                    if (hat_page_is_mapped(pp)) {
                        /*
                         * Doesn't look like the page
                         * was modified so now we
                         * really have to unload the
                         * translations.  Meanwhile
                         * another CPU could've
                         * modified it so we have to
                         * check again.  We don't loop
                         * forever here because now
                         * the translations are gone
                         * and no one can get a new one
                         * since we have the "exclusive"
                         * lock on the page.
                         */
                        (void) hat_pageunload(pp,
                                              HAT_FORCE_PGUNLOAD);
                        ppattr = hat_page_getattr(pp,
                                                  P_REF | P_MOD);
                        goto ck_refmod;
                    }
                    /*
                     * Update statistics for pages being
                     * freed
                     */
                    if (pp->p_vnode) {
                        if (IS_SWAPFSVP(pp->p_vnode)) {
                            anonfree++;
                        } else {
                            if (pp->p_vnode->v_flag
                                    & VVMEXEC) {
                                execfree++;
                            } else {
                                fsfree++;
                            }
                        }
                    }
                    /*LINTED: constant in conditional ctx*/
                    VN_DISPOSE(pp, B_FREE,
                               (flags & B_DONTNEED), kcred);
                    dfree++;
                } else {
                    page_unlock(pp);
                    pgrec++;
                    TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
                            "page_ws_free:pp %p", pp);
                }
            } else {
                /*
                 * Page is either `locked' in memory
                 * or was reclaimed and now has a
                 * "shared" lock, so release it.
                 */
                page_unlock(pp);
            }
        } else {
            /*
             * Neither B_FREE nor B_INVAL nor B_ERROR.
             * Just release locks.
             */
            page_io_unlock(pp);
            page_unlock(pp);
        }
    }

    CPU_STATS_ENTER_K();
    cpup = CPU;		/* get cpup now that CPU cannot change */
    CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
    CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
    CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
    CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
    CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
    CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
    CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
    CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
    CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
    CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
    CPU_STATS_EXIT_K();

    /* Kernel probe */
    TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
                tnf_opaque,	vnode,			vp,
                tnf_ulong,	pages_pageout,		pgpgout,
                tnf_ulong,	pages_freed,		dfree,
                tnf_ulong,	pages_reclaimed,	pgrec);
}
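
The long if/else chain in pvn_write_done() boils down to a small decision table over the buffer flags. The following standalone sketch restates just that dispatch; the flag values, enum, and function names are invented, and the reference/modify rechecking done under the exclusive lock is deliberately left out.

/*
 * Condensed, user-level restatement of the flag dispatch in
 * pvn_write_done() above. Only the decision structure mirrors the
 * kernel code; everything else is illustrative.
 */
#include <stdio.h>

#define	DB_ERROR	0x01
#define	DB_INVAL	0x02
#define	DB_FREE		0x04
#define	DB_FORCE	0x08

typedef enum {
	WD_REDIRTY,	/* write failed: set mod bit again, keep the page */
	WD_DESTROY,	/* invalidate/destroy the page */
	WD_MAYBE_FREE,	/* paged out: free it if unreferenced and unmodified */
	WD_RELEASE	/* nothing special: just drop the locks */
} wd_action_t;

static wd_action_t
write_done_action(int flags, int still_mapped)
{
	if (flags & DB_ERROR) {
		/* Only B_INVAL|B_FORCE together destroy a failed write. */
		if ((flags & (DB_INVAL | DB_FORCE)) == (DB_INVAL | DB_FORCE))
			return (WD_DESTROY);
		return (WD_REDIRTY);
	}
	if (flags & DB_INVAL)
		return (WD_DESTROY);
	if ((flags & DB_FREE) || !still_mapped)
		return (WD_MAYBE_FREE);
	return (WD_RELEASE);
}

int
main(void)
{
	printf("%d\n", write_done_action(DB_ERROR, 1));	/* WD_REDIRTY */
	printf("%d\n", write_done_action(DB_ERROR | DB_INVAL | DB_FORCE, 1));
	printf("%d\n", write_done_action(DB_FREE, 1));		/* WD_MAYBE_FREE */
	printf("%d\n", write_done_action(0, 1));		/* WD_RELEASE */
	return (0);
}
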
Example #4
/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
 * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
 * operation and is only to be considered if it doesn't involve any
 * waiting here.  B_TRUNC indicates that the file is being truncated
 * and so no i/o needs to be done. B_FORCE indicates that the page
 * must be destroyed, so don't try writing it out.
 *
 * The caller must ensure that the page is locked.  Returns 1 if
 * the page should be written back (the "iolock" is held in this
 * case), or 0 if the page has been dealt with or has been
 * unlocked.
 */
int
pvn_getdirty(page_t *pp, int flags)
{
    ASSERT((flags & (B_INVAL | B_FREE)) ?
           PAGE_EXCL(pp) : PAGE_SHARED(pp));
    ASSERT(PP_ISFREE(pp) == 0);

    /*
     * If trying to invalidate or free a logically `locked' page,
     * forget it.  Don't need page_struct_lock to check p_lckcnt and
     * p_cowcnt as the page is exclusively locked.
     */
    if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
            (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
        page_unlock(pp);
        return (0);
    }

    /*
     * Now acquire the i/o lock so we can add it to the dirty
     * list (if necessary).  We avoid blocking on the i/o lock
     * in the following cases:
     *
     *	If B_DELWRI is set, which implies that this request is
     *	due to a klustering operation.
     *
     *	If this is an async (B_ASYNC) operation and we are not doing
     *	invalidation (B_INVAL) [The current i/o or fsflush will ensure
     *	that the page is written out].
     */
    if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
        if (!page_io_trylock(pp)) {
            page_unlock(pp);
            return (0);
        }
    } else {
        page_io_lock(pp);
    }

    /*
     * If we want to free or invalidate the page then
     * we need to unload it so that anyone who wants
     * it will have to take a minor fault to get it.
     * Otherwise, we're just writing the page back so we
     * need to sync up the hardware and software mod bit to
     * detect any future modifications.  We clear the
     * software mod bit when we put the page on the dirty
     * list.
     */
    if (flags & (B_INVAL | B_FREE)) {
        (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
    } else {
        (void) hat_pagesync(pp, HAT_SYNC_ZERORM);
    }

    if (!hat_ismod(pp) || (flags & B_TRUNC)) {
        /*
         * Don't need to add it to the
         * list after all.
         */
        page_io_unlock(pp);
        if (flags & B_INVAL) {
            /*LINTED: constant in conditional context*/
            VN_DISPOSE(pp, B_INVAL, 0, kcred);
        } else if (flags & B_FREE) {
            /*LINTED: constant in conditional context*/
            VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
        } else {
            /*
             * This is the advisory path for callers
             * of VOP_PUTPAGE() who prefer freeing the
             * page _only_ if no one else is accessing it.
             * E.g. segmap_release()
             *
             * The above hat_ismod() check is useless because:
             * (1) we may not be holding SE_EXCL lock;
             * (2) we've not unloaded _all_ translations
             *
             * Let page_release() do the heavy-lifting.
             */
            (void) page_release(pp, 1);
        }
        return (0);
    }

    /*
     * Page is dirty, get it ready for the write back
     * and add page to the dirty list.
     */
    hat_clrrefmod(pp);

    /*
     * If we're going to free the page when we're done
     * then we can let others try to use it starting now.
     * We'll detect the fact that they used it when the
     * i/o is done and avoid freeing the page.
     */
    if (flags & B_FREE)
        page_downgrade(pp);


    TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);

    return (1);
}
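
The trylock condition near the top of pvn_getdirty() is easy to misread: (flags & (B_INVAL | B_ASYNC)) == B_ASYNC is true only when B_ASYNC is set and B_INVAL is clear. The small standalone check below, with invented flag values standing in for the kernel's B_* constants, exercises the full lock-avoidance condition against a spelled-out equivalent.

/*
 * Verify that "(flags & (INVAL | ASYNC)) == ASYNC" means exactly
 * "ASYNC set and INVAL clear", and show the full trylock condition
 * used by pvn_getdirty(). Flag values here are made up.
 */
#include <assert.h>
#include <stdio.h>

#define	F_INVAL		0x1
#define	F_ASYNC		0x2
#define	F_DELWRI	0x4

int
main(void)
{
	int flags;

	for (flags = 0; flags < 8; flags++) {
		int trylock = (flags & F_DELWRI) ||
		    ((flags & (F_INVAL | F_ASYNC)) == F_ASYNC);
		int spelled_out = (flags & F_DELWRI) ||
		    ((flags & F_ASYNC) && !(flags & F_INVAL));

		assert(trylock == spelled_out);
		printf("flags=0x%x -> %s\n", flags,
		    trylock ? "page_io_trylock" : "page_io_lock (may block)");
	}
	return (0);
}
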
/*
 * Scan page_t's and issue I/O's for modified pages.
 *
 * Also coalesces consecutive small sized free pages into the next larger
 * pagesize. This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 */
static void
fsflush_do_pages()
{
	vnode_t		*vp;
	ulong_t		pcount;
	hrtime_t	timer = gethrtime();
	ulong_t		releases = 0;
	ulong_t		nexamined = 0;
	ulong_t		nlocked = 0;
	ulong_t		nmodified = 0;
	ulong_t		ncoalesce = 0;
	ulong_t		cnt;
	int		mod;
	int		fspage = 1;
	u_offset_t	offset;
	uint_t		szc;

	page_t		*coal_page = NULL;  /* 1st page in group to coalesce */
	uint_t		coal_szc = 0;	    /* size code, coal_page->p_szc */
	uint_t		coal_cnt = 0;	    /* count of pages seen */

	static ulong_t	nscan = 0;
	static pgcnt_t	last_total_pages = 0;
	static page_t	*pp = NULL;

	/*
	 * Check to see if total_pages has changed.
	 */
	if (total_pages != last_total_pages) {
		last_total_pages = total_pages;
		nscan = (last_total_pages * (tune.t_fsflushr))/v.v_autoup;
	}

	if (pp == NULL)
		pp = memsegs->pages;

	pcount = 0;
	while (pcount < nscan) {

		/*
		 * move to the next page, skipping over large pages
		 * and issuing prefetches.
		 */
		if (pp->p_szc && fspage == 0) {
			pfn_t pfn;

			pfn  = page_pptonum(pp);
			cnt = page_get_pagecnt(pp->p_szc);
			cnt -= pfn & (cnt - 1);
		} else
			cnt = 1;

		pp = page_nextn(pp, cnt);
		prefetch_page_r((void *)pp);
		ASSERT(pp != NULL);
		pcount += cnt;

		/*
		 * Do a bunch of dirty tests (i.e. no locking) to determine
		 * if we can quickly skip this page. These tests are repeated
		 * after acquiring the page lock.
		 */
		++nexamined;
		if (PP_ISSWAP(pp)) {
			fspage = 0;
			coal_page = NULL;
			continue;
		}

		/*
		 * skip free pages too, but try coalescing them into larger
		 * pagesizes
		 */
		if (PP_ISFREE(pp)) {
			/*
			 * skip pages with a file system identity or that
			 * are already maximum size
			 */
			fspage = 0;
			szc = pp->p_szc;
			if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
				coal_page = NULL;
				continue;
			}

			/*
			 * If not in a coalescing candidate page or the size
			 * codes are different, start a new candidate.
			 */
			if (coal_page == NULL || coal_szc != szc) {

				/*
				 * page must be properly aligned
				 */
				if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
					coal_page = NULL;
					continue;
				}
				coal_page = pp;
				coal_szc = szc;
				coal_cnt = 1;
				continue;
			}

			/*
			 * acceptable to add this to existing candidate page
			 */
			++coal_cnt;
			if (coal_cnt < fsf_pgcnt[coal_szc])
				continue;

			/*
			 * We've got enough pages to coalesce, so do it.
			 * After promoting, we clear coal_page, so it will
			 * take another pass to promote this to an even
			 * larger page.
			 */
			++ncoalesce;
			(void) page_promote_size(coal_page, coal_szc);
			coal_page = NULL;
			continue;
		} else {
			coal_page = NULL;
		}

		if (PP_ISKAS(pp) ||
		    PAGE_LOCKED(pp) ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0) {
			fspage = 0;
			continue;
		}


		/*
		 * Reject pages that can't be "exclusively" locked.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;
		++nlocked;


		/*
		 * After locking the page, redo the above checks.
		 * Since we locked the page, leave out the PAGE_LOCKED() test.
		 */
		vp = pp->p_vnode;
		if (PP_ISSWAP(pp) ||
		    PP_ISFREE(pp) ||
		    vp == NULL ||
		    PP_ISKAS(pp) ||
		    (vp->v_flag & VISSWAP) != 0) {
			page_unlock(pp);
			fspage = 0;
			continue;
		}
		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
			page_unlock(pp);
			continue;
		}

		fspage = 1;
		ASSERT(vp->v_type != VCHR);

		/*
		 * Check the modified bit, leaving the bit alone in hardware;
		 * it will be cleared if we do the putpage.
		 */
		if (IS_VMODSORT(vp))
			mod = hat_ismod(pp);
		else
			mod = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;

		if (mod) {
			++nmodified;
			offset = pp->p_offset;

			/*
			 * Hold the vnode before releasing the page lock
			 * to prevent it from being freed and re-used by
			 * some other thread.
			 */
			VN_HOLD(vp);

			page_unlock(pp);

			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
			    kcred, NULL);

			VN_RELE(vp);
		} else {

			/*
			 * Catch any pages which should be on the cache list,
			 * but aren't yet.
			 */
			if (hat_page_is_mapped(pp) == 0) {
				++releases;
				(void) page_release(pp, 1);
			} else {
				page_unlock(pp);
			}
		}
	}

	/*
	 * maintain statistics
	 * reset every million wakeups, just to avoid overflow
	 */
	if (++fsf_cycles == 1000000) {
		fsf_cycles = 0;
		fsf_total.fsf_scan = 0;
		fsf_total.fsf_examined = 0;
		fsf_total.fsf_locked = 0;
		fsf_total.fsf_modified = 0;
		fsf_total.fsf_coalesce = 0;
		fsf_total.fsf_time = 0;
		fsf_total.fsf_releases = 0;
	} else {
		fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
		fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
		fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
		fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
		fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
		fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
		fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
	}
}
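
fsflush_do_pages() leans on power-of-two mask arithmetic in two places: skipping to the next large-page boundary (cnt -= pfn & (cnt - 1)) and testing whether a free page is aligned to start a coalescing run. The short standalone example below walks through that arithmetic with made-up page-frame numbers.

/*
 * Demonstrate the mask arithmetic used above: for a large page of
 * `pagecnt' base pages (a power of two), pfn & (pagecnt - 1) is the
 * offset within that large page, pagecnt minus that offset is how many
 * base pages remain until the next large-page boundary, and an offset
 * of zero means pfn is properly aligned to start a coalescing run.
 * The values here are made up for illustration.
 */
#include <stdio.h>

int
main(void)
{
	unsigned long pagecnt = 8;	/* e.g. a size code of 8 base pages */
	unsigned long pfns[] = { 40, 41, 45, 47, 48 };
	int i;

	for (i = 0; i < 5; i++) {
		unsigned long pfn = pfns[i];
		unsigned long offset = pfn & (pagecnt - 1);
		unsigned long skip = pagecnt - offset;

		printf("pfn %lu: offset %lu within its large page, "
		    "skip %lu to next boundary, %saligned\n",
		    pfn, offset, skip, offset == 0 ? "" : "not ");
	}
	return (0);
}
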