Esempio n. 1
0
/*
 * Flag as modified every page on vp's v_pages list that the caller's
 * predicate accepts.
 *
 * The list is walked once, from the current head up to and including the
 * page that was the tail when the walk began, while holding the vnode's
 * page mutex. Pages whose p_hash equals PVN_VPLIST_HASH_TAG are skipped
 * without consulting (*page_check)(). For each remaining page on which
 * (*page_check)() returns non-zero, hat_setmod_only() is applied and, when
 * the vnode is VMODSORT, the page is relinked at the tail of v_pages.
 *
 * Relinking is only attempted when the list held at least two pages on
 * entry: with a single page there is nothing to reorder, and page_vpsub()
 * would otherwise leave v_pages temporarily NULL.
 */
void
pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
{
    kmutex_t	*list_lock = page_vnode_mutex(vp);
    page_t	*cur, *succ, *last;
    int	resort;

    mutex_enter(list_lock);

    if (vp->v_pages != NULL) {
        last = vp->v_pages->p_vpprev;
        resort = IS_VMODSORT(vp) && (vp->v_pages != last);
        succ = vp->v_pages;

        do {
            /*
             * Latch the successor before touching cur, since cur
             * may be relinked at the tail below.
             */
            cur = succ;
            succ = cur->p_vpnext;

            if (cur->p_hash == PVN_VPLIST_HASH_TAG || !page_check(cur))
                continue;

            /*
             * hat_setmod_only() is used rather than hat_setmod()
             * because it neither reorders the page list nor takes
             * the page_vnode_mutex, which is already held here.
             */
            hat_setmod_only(cur);
            if (!resort)
                continue;

            page_vpsub(&vp->v_pages, cur);
            ASSERT(vp->v_pages != NULL);
            page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, cur);
        } while (cur != last);	/* done once the original tail is handled */
    }

    mutex_exit(list_lock);
}
Esempio n. 2
0
/*
 * Store the address of every non-empty per-board list in bdlist[] into
 * the leading slots of sortlist[], in ascending board order. Returns the
 * number of slots written; slots beyond that count are left untouched.
 */
static uint32_t
plat_freelist_fill_sortlist(page_t **bdlist, page_t ***sortlist)
{
	uint32_t	bd, nused = 0;

	for (bd = 0; bd < STARFIRE_MAX_BOARDS; bd++) {
		if (bdlist[bd] != NULL)
			sortlist[nused++] = &bdlist[bd];
	}
	return (nused);
}

/*
 * Reorder one free list, selected by (mnode, size, color, mtype).
 *
 * Under the bin mutex every page is moved off the free list onto a
 * temporary list chosen by the page's logical board number. The free list
 * is then rebuilt one page at a time, each time taking the head page of a
 * board list selected with random_idx(). Pages of size code 0 are linked
 * with mach_page_add()/mach_page_sub(); larger pages use
 * page_vpadd()/page_vpsub().
 */
static void
plat_freelist_shuffle_bin(int mnode, int mtype, uint32_t size, uint32_t color)
{
	page_t		*bdlist[STARFIRE_MAX_BOARDS];	/* per-board pages */
	page_t		**sortlist[STARFIRE_MAX_BOARDS]; /* non-empty boards */
	page_t		**freelist, **src, *pp;
	kmutex_t	*pcm;
	uint32_t	slot, lbn, result;
	uint32_t	bd_cnt = 0;		/* boards still holding pages */
	uint32_t	board_mask = 0;		/* one bit per such board */

	for (slot = 0; slot < STARFIRE_MAX_BOARDS; slot++) {
		bdlist[slot] = NULL;
		sortlist[slot] = NULL;
	}

	freelist = &PAGE_FREELISTS(mnode, size, color, mtype);

	/* Nothing to do for an empty bin (checked before taking the lock). */
	if (*freelist == NULL)
		return;

	pcm = PC_BIN_MUTEX(mnode, color, PG_FREE_LIST);
	mutex_enter(pcm);

	/* Drain the free list, bucketing each page by logical board. */
	while ((pp = *freelist) != NULL) {
		result = page_trylock(pp, SE_EXCL);

		ASSERT(result);

		if (size == 0) {
			mach_page_sub(freelist, pp);
		} else {
			page_vpsub(freelist, pp);
		}

		lbn = PFN_2_LBN(pp->p_pagenum);

		if (size == 0) {
			mach_page_add(&bdlist[lbn], pp);
		} else {
			page_vpadd(&bdlist[lbn], pp);
		}

		/* First page seen for this board: count the board. */
		if ((board_mask & (1 << lbn)) == 0) {
			board_mask |= (1 << lbn);
			bd_cnt++;
		}
		page_unlock(pp);
	}

	/* Expose exactly the non-empty boards as candidates. */
	(void) plat_freelist_fill_sortlist(bdlist, sortlist);

	/* Negative argument: the original code labels this "Set random start". */
	(void) random_idx(-color);

	/* Refill the free list, one page from a chosen board at a time. */
	while (bd_cnt != 0) {

		ASSERT(bd_cnt &&
		    (bd_cnt < STARFIRE_MAX_BOARDS+1));

		slot = random_idx(bd_cnt);
		src = sortlist[slot];

		pp = *src;
		result = page_trylock(pp, SE_EXCL);

		ASSERT(result);

		if (size == 0) {
			mach_page_sub(src, pp);
			mach_page_add(freelist, pp);
		} else {
			page_vpsub(src, pp);
			page_vpadd(freelist, pp);
		}

		/*
		 * The sortlist slot is not the board number, so recover
		 * the board from the page itself.
		 */
		lbn = PFN_2_LBN(pp->p_pagenum);

		/* Board list exhausted: retire it and recompact candidates. */
		if (*src == NULL) {
			board_mask &= ~(1 << lbn);
			--bd_cnt;
			(void) plat_freelist_fill_sortlist(bdlist, sortlist);
		}
		page_unlock(pp);
	}

	mutex_exit(pcm);
}

/*
 * Walk every memory type, page size and page color of memory node `mnode'
 * and reorder the corresponding free list so that its pages are
 * interleaved across logical boards (see plat_freelist_shuffle_bin()).
 */
void
plat_freelist_process(int mnode)
{
	uint32_t	size, color, max_color;
	int		mtype;

	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		for (size = 0; size < mmu_page_sizes; size++) {
			/* The number of physical colors depends on page size. */
			max_color = page_get_pagecolors(size);

			for (color = 0; color < max_color; color++) {
				plat_freelist_shuffle_bin(mnode, mtype,
				    size, color);
			}
		}
	}
}
Esempio n. 3
0
/*
 * Process a vnode's page list for all pages whose offset is >= off.
 * Pages are to either be freed, invalidated, or written back to disk.
 *
 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 * is specified, otherwise they are "shared" locked.
 *
 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 *
 * Special marker page_t's are inserted in the list in order
 * to keep track of where we are in the list when locks are dropped.
 *
 * Note the list is circular and insertions can happen only at the
 * head and tail of the list. The algorithm ensures visiting all pages
 * on the list in the following way:
 *
 *    Drop two marker pages at the end of the list.
 *
 *    Move one marker page backwards towards the start of the list until
 *    it is at the list head, processing the pages passed along the way.
 *
 *    Due to race conditions when the vphm mutex is dropped, additional pages
 *    can be added to either end of the list, so we'll continue to move
 *    the marker and process pages until it is up against the end marker.
 *
 * There is one special exit condition. If we are processing a VMODSORT
 * vnode and only writing back modified pages, we can stop as soon as
 * we run into an unmodified page.  This makes fsync(3) operations fast.
 *
 * Arguments:
 *    vp       - vnode whose v_pages list is walked (must not be VCHR).
 *    off      - pages with p_offset below this value are skipped.
 *    putapage - called as (*putapage)(vp, pp, NULL, NULL, flags, cred) for
 *               each page for which pvn_getdirty() returns non-zero.
 *    flags    - B_* flags described above.
 *    cred     - credentials handed through to (*putapage)().
 *
 * Returns 0 if the vnode has no pages, EAGAIN if B_ASYNC is set and another
 * thread already holds VVMLOCK on the vnode, and otherwise the first
 * non-zero value returned by (*putapage)() (0 if there was none).
 */
int
pvn_vplist_dirty(
    vnode_t		*vp,
    u_offset_t	off,
    int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
                        size_t *, int, cred_t *),
    int		flags,
    cred_t		*cred)
{
    page_t		*pp;
    page_t		*mark;		/* marker page that moves toward head */
    page_t		*end;		/* marker page at end of list */
    int		err = 0;	/* first error seen from putapage */
    int		error;
    kmutex_t	*vphm;
    se_t		se;
    page_t		**where_to_move;	/* list link that points at pp */

    ASSERT(vp->v_type != VCHR);

    /* Unlocked fast path; re-checked under v_lock and vphm below. */
    if (vp->v_pages == NULL)
        return (0);


    /*
     * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
     *
     * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
     * from getting blocked while flushing pages to a dead NFS server.
     */
    mutex_enter(&vp->v_lock);
    if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
        mutex_exit(&vp->v_lock);
        return (EAGAIN);
    }

    while (vp->v_flag & VVMLOCK)
        cv_wait(&vp->v_cv, &vp->v_lock);

    if (vp->v_pages == NULL) {
        mutex_exit(&vp->v_lock);
        return (0);
    }

    vp->v_flag |= VVMLOCK;
    mutex_exit(&vp->v_lock);


    /*
     * Set up the marker pages used to walk the list.  The two markers
     * are given distinct offsets: -2 for the end marker, -1 for the
     * moving marker.
     */
    end = kmem_cache_alloc(marker_cache, KM_SLEEP);
    end->p_vnode = vp;
    end->p_offset = (u_offset_t)-2;
    mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
    mark->p_vnode = vp;
    mark->p_offset = (u_offset_t)-1;

    /*
     * Grab the lock protecting the vnode's page list;
     * note that this lock is dropped at times in the loop.
     */
    vphm = page_vnode_mutex(vp);
    mutex_enter(vphm);
    if (vp->v_pages == NULL)
        goto leave;

    /*
     * Insert the markers (mark at the tail, end right after it) and
     * loop through the list of pages.
     */
    page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
    page_vpadd(&mark->p_vpnext, end);
    for (;;) {

        /*
         * If only doing an async write back, then we can
         * stop as soon as we get to start of the list.
         * Note the exact comparison: this applies only when
         * B_ASYNC is the sole flag set.
         */
        if (flags == B_ASYNC && vp->v_pages == mark)
            break;

        /*
         * otherwise stop when we've gone through all the pages
         */
        if (mark->p_vpprev == end)
            break;

        /*
         * pp is the page just before the marker.  where_to_move is
         * the list link that currently points at pp (the list head
         * itself if pp is first); re-inserting the marker through it
         * steps the marker past pp.
         */
        pp = mark->p_vpprev;
        if (vp->v_pages == pp)
            where_to_move = &vp->v_pages;
        else
            where_to_move = &pp->p_vpprev->p_vpnext;

        ASSERT(pp->p_vnode == vp);

        /*
         * If just flushing dirty pages to disk and this vnode
         * is using a sorted list of pages, we can stop processing
         * as soon as we find an unmodified page, since all the
         * modified pages are visited first.
         */
        if (IS_VMODSORT(vp) &&
                !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
            if (!hat_ismod(pp) && !page_io_locked(pp)) {
#ifdef  DEBUG
                /*
                 * For debug kernels examine what should be
                 * all the remaining clean pages, asserting
                 * that they are not modified.
                 */
                page_t	*chk = pp;
                int	attr;

                page_vpsub(&vp->v_pages, mark);
                page_vpadd(where_to_move, mark);
                do {
                    chk = chk->p_vpprev;
                    ASSERT(chk != end);
                    if (chk == mark)
                        continue;
                    attr = hat_page_getattr(chk, P_MOD |
                                            P_REF);
                    if ((attr & P_MOD) == 0)
                        continue;
                    panic("v_pages list not all clean: "
                          "page_t*=%p vnode=%p off=%lx "
                          "attr=0x%x last clean page_t*=%p\n",
                          (void *)chk, (void *)chk->p_vnode,
                          (long)chk->p_offset, attr,
                          (void *)pp);
                } while (chk != vp->v_pages);
#endif
                break;
            } else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
                /*
                 * The page is unmodified but io-locked (otherwise
                 * the branch above would have been taken), so
                 * wait until the IO is done and look again.
                 * Block only for sync IO since we don't want
                 * to block async IO.
                 */
                mutex_exit(vphm);
                page_io_wait(pp);
                mutex_enter(vphm);
                continue;
            }
        }

        /*
         * Skip this page if the offset is out of the desired range.
         * Just move the marker and continue.
         */
        if (pp->p_offset < off) {
            page_vpsub(&vp->v_pages, mark);
            page_vpadd(where_to_move, mark);
            continue;
        }

        /*
         * If we are supposed to invalidate or free this
         * page, then we need an exclusive lock.
         */
        se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

        /*
         * We must acquire the page lock for all synchronous
         * operations (invalidate, free and write).
         */
        if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
            /*
             * If the page_lock() drops the mutex
             * we must retry the loop.  The marker has not been
             * moved yet, so the retry starts from the same spot.
             */
            if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
                continue;

            /*
             * It's ok to move the marker page now.
             */
            page_vpsub(&vp->v_pages, mark);
            page_vpadd(where_to_move, mark);
        } else {

            /*
             * Update the marker page for all remaining cases.
             * It is moved before the lock attempt so that a
             * failed trylock simply skips this page.
             */
            page_vpsub(&vp->v_pages, mark);
            page_vpadd(where_to_move, mark);

            /*
             * For write backs, if we can't lock the page, it's
             * invalid or in the process of being destroyed.  Skip
             * it, assuming someone else is writing it.
             */
            if (!page_trylock(pp, se))
                continue;
        }

        ASSERT(pp->p_vnode == vp);

        /*
         * Successfully locked the page, now figure out what to
         * do with it. Free pages are easily dealt with, invalidate
         * if desired or just go on to the next page.
         */
        if (PP_ISFREE(pp)) {
            if ((flags & B_INVAL) == 0) {
                page_unlock(pp);
                continue;
            }

            /*
             * Invalidate (destroy) the page.  The list mutex is
             * dropped around the call.
             */
            mutex_exit(vphm);
            page_destroy_free(pp);
            mutex_enter(vphm);
            continue;
        }

        /*
         * pvn_getdirty() figures out what to do with a dirty page.
         * If the page is dirty, the putapage() routine will write it
         * and will kluster any other adjacent dirty pages it can.
         *
         * pvn_getdirty() and `(*putapage)' unlock the page.
         *
         * Only the first non-zero putapage() result is remembered
         * in err; later errors do not overwrite it.
         */
        mutex_exit(vphm);
        if (pvn_getdirty(pp, flags)) {
            error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
            if (!err)
                err = error;
        }
        mutex_enter(vphm);
    }
    /* Walk finished: take both markers back off the list. */
    page_vpsub(&vp->v_pages, mark);
    page_vpsub(&vp->v_pages, end);

leave:
    /*
     * Release the v_pages mutex, free the markers, then clear VVMLOCK
     * and wake up any threads blocked waiting for it.
     */
    mutex_exit(vphm);
    kmem_cache_free(marker_cache, mark);
    kmem_cache_free(marker_cache, end);
    mutex_enter(&vp->v_lock);
    vp->v_flag &= ~VVMLOCK;
    cv_broadcast(&vp->v_cv);
    mutex_exit(&vp->v_lock);
    return (err);
}
Esempio n. 4
0
/*
 * Completion handling for a list of pages that were involved in a write.
 *
 * plist is a list of pages, each expected to be page-locked and io-locked
 * (asserted below); flags are the B_* flags of the operation and must not
 * include B_READ.  Every page is taken off plist and, depending on flags
 * and on the page's state, is either destroyed (B_INVAL), freed (B_FREE,
 * or the page is no longer mapped), or simply has its locks released.
 * When B_ERROR is set the page is re-marked modified and unlocked, unless
 * both B_INVAL and B_FORCE are set, in which case it is destroyed.
 *
 * If write_free is set and freemem is below lotsfree + pages_before_pager,
 * B_FREE is added to flags for error-free completions.
 *
 * Paging statistics gathered along the way are added to the current CPU's
 * vm statistics and reported through a TNF probe before returning.
 */
void
pvn_write_done(page_t *plist, int flags)
{
    /* Per-call statistics, folded into the CPU vm stats at the end. */
    int dfree = 0;
    int pgrec = 0;
    int pgout = 0;
    int pgpgout = 0;
    int anonpgout = 0;
    int anonfree = 0;
    int fspgout = 0;
    int fsfree = 0;
    int execpgout = 0;
    int execfree = 0;
    page_t *pp;
    struct cpu *cpup;
    struct vnode *vp = NULL;	/* for probe */
    uint_t ppattr;
    kmutex_t *vphm = NULL;

    ASSERT((flags & B_READ) == 0);

    /*
     * If we are about to start paging anyway, start freeing pages.
     */
    if (write_free && freemem < lotsfree + pages_before_pager &&
            (flags & B_ERROR) == 0) {
        flags |= B_FREE;
    }

    /*
     * Handle each page involved in the i/o operation.
     */
    while (plist != NULL) {
        pp = plist;
        ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
        page_sub(&plist, pp);

        /*
         * Kernel probe support.
         * NOTE(review): vp is latched from the first page only and is
         * also used for the VMODSORT handling of every later page;
         * this presumably relies on all pages in plist belonging to
         * the same vnode -- confirm against callers.
         */
        if (vp == NULL)
            vp = pp->p_vnode;

        if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
            /*
             * Move page to the top of the v_page list.
             * Skip pages modified during IO, and skip a page that
             * links to itself (presumably the only page on the
             * list, so there is nothing to reorder).
             */
            vphm = page_vnode_mutex(vp);
            mutex_enter(vphm);
            if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
                page_vpsub(&vp->v_pages, pp);
                page_vpadd(&vp->v_pages, pp);
            }
            mutex_exit(vphm);
        }

        if (flags & B_ERROR) {
            /*
             * Write operation failed.  We don't want
             * to destroy (or free) the page unless B_FORCE
             * is set. We set the mod bit again and release
             * all locks on the page so that it will get written
             * back again later when things are hopefully
             * better again.
             * If B_INVAL and B_FORCE is set we really have
             * to destroy the page.
             */
            if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
                page_io_unlock(pp);
                /*LINTED: constant in conditional context*/
                VN_DISPOSE(pp, B_INVAL, 0, kcred);
            } else {
                hat_setmod_only(pp);
                page_io_unlock(pp);
                page_unlock(pp);
            }
        } else if (flags & B_INVAL) {
            /*
             * XXX - Failed writes with B_INVAL set are
             * not handled appropriately.
             */
            page_io_unlock(pp);
            /*LINTED: constant in conditional context*/
            VN_DISPOSE(pp, B_INVAL, 0, kcred);
        } else if (flags & B_FREE ||!hat_page_is_mapped(pp)) {
            /*
             * Update statistics for pages being paged out
             */
            if (pp->p_vnode) {
                if (IS_SWAPFSVP(pp->p_vnode)) {
                    anonpgout++;
                } else {
                    if (pp->p_vnode->v_flag & VVMEXEC) {
                        execpgout++;
                    } else {
                        fspgout++;
                    }
                }
            }
            page_io_unlock(pp);
            /*
             * pgout is set to 1, not incremented, so it counts this
             * call; pgpgout counts individual pages.
             */
            pgout = 1;
            pgpgout++;
            TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
                    "page_ws_out:pp %p", pp);

            /*
             * The page_struct_lock need not be acquired to
             * examine "p_lckcnt" and "p_cowcnt" since we'll
             * have an "exclusive" lock if the upgrade succeeds.
             */
            if (page_tryupgrade(pp) &&
                    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
                /*
                 * Check if someone has reclaimed the
                 * page.  If ref and mod are not set, no
                 * one is using it so we can free it.
                 * The rest of the system is careful
                 * to use the NOSYNC flag to unload
                 * translations set up for i/o w/o
                 * affecting ref and mod bits.
                 *
                 * Obtain a copy of the real hardware
                 * mod bit using hat_pagesync() with
                 * HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD
                 * to avoid having to flush the cache.
                 */
                ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
                                      HAT_SYNC_STOPON_MOD);
ck_refmod:
                if (!(ppattr & (P_REF | P_MOD))) {
                    if (hat_page_is_mapped(pp)) {
                        /*
                         * Doesn't look like the page
                         * was modified so now we
                         * really have to unload the
                         * translations.  Meanwhile
                         * another CPU could've
                         * modified it so we have to
                         * check again.  We don't loop
                         * forever here because now
                         * the translations are gone
                         * and no one can get a new one
                         * since we have the "exclusive"
                         * lock on the page.
                         */
                        (void) hat_pageunload(pp,
                                              HAT_FORCE_PGUNLOAD);
                        ppattr = hat_page_getattr(pp,
                                                  P_REF | P_MOD);
                        goto ck_refmod;
                    }
                    /*
                     * Update statistics for pages being
                     * freed
                     */
                    if (pp->p_vnode) {
                        if (IS_SWAPFSVP(pp->p_vnode)) {
                            anonfree++;
                        } else {
                            if (pp->p_vnode->v_flag
                                    & VVMEXEC) {
                                execfree++;
                            } else {
                                fsfree++;
                            }
                        }
                    }
                    /*LINTED: constant in conditional ctx*/
                    VN_DISPOSE(pp, B_FREE,
                               (flags & B_DONTNEED), kcred);
                    dfree++;
                } else {
                    /*
                     * Ref or mod is set: the page is in use
                     * again, so keep it and count a reclaim.
                     */
                    page_unlock(pp);
                    pgrec++;
                    TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
                            "page_ws_free:pp %p", pp);
                }
            } else {
                /*
                 * Page is either `locked' in memory
                 * or was reclaimed and now has a
                 * "shared" lock, so release it.
                 */
                page_unlock(pp);
            }
        } else {
            /*
             * Neither B_FREE nor B_INVAL nor B_ERROR, and the
             * page is still mapped.  Just release locks.
             */
            page_io_unlock(pp);
            page_unlock(pp);
        }
    }

    /* Fold the counters gathered above into this CPU's vm statistics. */
    CPU_STATS_ENTER_K();
    cpup = CPU;		/* get cpup now that CPU cannot change */
    CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
    CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
    CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
    CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
    CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
    CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
    CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
    CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
    CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
    CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
    CPU_STATS_EXIT_K();

    /* Kernel probe */
    TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
                tnf_opaque,	vnode,			vp,
                tnf_ulong,	pages_pageout,		pgpgout,
                tnf_ulong,	pages_freed,		dfree,
                tnf_ulong,	pages_reclaimed,	pgrec);
}