/* Read system call. */ static int sys_read (int handle, void *udst_, unsigned size) { uint8_t *udst = udst_; struct file_descriptor *fd; int bytes_read = 0; /* Look up file descriptor. */ if (handle != STDIN_FILENO) fd = lookup_file_fd (handle); while (size > 0) { /* How much to read into this page? */ size_t page_left = PGSIZE - pg_ofs (udst); size_t read_amt = size < page_left ? size : page_left; off_t retval; /* Check that touching this page is okay. */ if (!page_lock (udst, true)) thread_exit (); /* Read from file into page. */ if (handle != STDIN_FILENO) { retval = file_read (fd->file, udst, read_amt); if (retval < 0) { if (bytes_read == 0) bytes_read = -1; break; } bytes_read += retval; } else { size_t i; for (i = 0; i < read_amt; i++) udst[i] = input_getc (); bytes_read = read_amt; } /* Release page. */ page_unlock (udst); /* If it was a short read we're done. */ if (retval != (off_t) read_amt) break; /* Advance. */ udst += retval; size -= retval; } return bytes_read; }
/*
 * Any changes to this routine must also be carried over to
 * devmap_free_pages() in the seg_dev driver. This is because
 * we currently don't have a special kernel segment for non-paged
 * kernel memory that is exported by drivers to user space.
 *
 * Frees the kernel pages backing [inaddr, inaddr + size) that are
 * hashed on vnode `vp'.  If `func' is non-NULL it is invoked on each
 * page instead of page_destroy(); in that case the page reservation
 * is NOT released here (the callback presumably owns it — note the
 * `func == NULL' guard around page_unresv() below).  Finally the
 * virtual range is returned to arena `vmp' when one is given.
 */
static void
segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp,
    void (*func)(page_t *))
{
	page_t *pp;
	caddr_t addr = inaddr;
	caddr_t eaddr;
	pgcnt_t npages = btopr(size);

	ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0);
	ASSERT(vp != NULL);

	/*
	 * Too early in boot for a real free (kvseg not set up yet):
	 * stash the range on a garbage-collection list, reusing the
	 * memory being freed as the list node itself.
	 */
	if (kvseg.s_base == NULL) {
		segkmem_gc_list_t *gc = inaddr;
		gc->gc_arena = vmp;
		gc->gc_size = size;
		gc->gc_next = segkmem_gc_list;
		segkmem_gc_list = gc;
		return;
	}

	/* Tear down (and unlock) the kernel HAT mappings first. */
	hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK);

	for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
#if defined(__x86)
		pp = page_find(vp, (u_offset_t)(uintptr_t)addr);
		if (pp == NULL)
			panic("segkmem_free: page not found");
		if (!page_tryupgrade(pp)) {
			/*
			 * Some other thread has a sharelock. Wait for
			 * it to drop the lock so we can free this page.
			 */
			page_unlock(pp);
			pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr,
			    SE_EXCL);
		}
#else
		pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr, SE_EXCL);
#endif
		if (pp == NULL)
			panic("segkmem_free: page not found");
		/* Clear p_lckcnt so page_destroy() doesn't update availrmem */
		pp->p_lckcnt = 0;
		if (func)
			func(pp);
		else
			page_destroy(pp, 0);
	}
	if (func == NULL)
		page_unresv(npages);

	if (vmp != NULL)
		vmem_free(vmp, inaddr, size);
}
/* Write system call. */ static int sys_write (int handle, void *usrc_, unsigned size) { uint8_t *usrc = usrc_; struct file_descriptor *fd = NULL; int bytes_written = 0; /* Lookup up file descriptor. */ if (handle != STDOUT_FILENO) fd = lookup_file_fd (handle); while (size > 0) { /* How much bytes to write to this page? */ size_t page_left = PGSIZE - pg_ofs (usrc); size_t write_amt = size < page_left ? size : page_left; off_t retval; /* Check that we can touch this user page. */ if (!page_lock (usrc, false)) thread_exit (); /* Do the write. */ if (handle == STDOUT_FILENO) { putbuf ((char *) usrc, write_amt); retval = write_amt; } else retval = file_write (fd->file, usrc, write_amt); /* Release user page. */ page_unlock (usrc); /* Handle return value. */ if (retval < 0) { if (bytes_written == 0) bytes_written = -1; break; } bytes_written += retval; /* If it was a short write we're done. */ if (retval != (off_t) write_amt) break; /* Advance. */ usrc += retval; size -= retval; } return bytes_written; }
/*
 * Generic entry point used to release the "shared/exclusive" lock
 * and the "p_iolock" on pages after i/o is complete.
 */
void
pvn_io_done(page_t *plist)
{
	/* Peel pages off the list one at a time, dropping both locks. */
	for (;;) {
		page_t *pp = plist;

		if (pp == NULL)
			break;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		page_unlock(pp);
	}
}
/* Creates a copy of user string US in kernel memory
   and returns it as a page that must be freed with
   palloc_free_page().
   Truncates the string at PGSIZE bytes in size.
   Call thread_exit() if any of the user accesses are invalid. */
static char *
copy_in_string (const char *us)
{
  char *ks;                     /* Kernel-side copy (one full page). */
  char *upage;                  /* Base of the user page being read. */
  size_t length;                /* Bytes copied into KS so far. */

  ks = palloc_get_page (0);
  if (ks == NULL)
    thread_exit ();

  length = 0;
  for (;;)
    {
      /* Lock the user page containing the current position of US. */
      upage = pg_round_down (us);
      if (!page_lock (upage, false))
        goto lock_error;

      /* Copy bytes until the terminator, the end of this user page,
         or the kernel page fills up — whichever comes first. */
      for (; us < upage + PGSIZE; us++)
        {
          ks[length++] = *us;
          if (*us == '\0')
            {
              /* Whole string (including NUL) is now in KS. */
              page_unlock (upage);
              return ks;
            }
          else if (length >= PGSIZE)
            /* String too long to fit in one kernel page. */
            goto too_long_error;
        }

      /* Ran off the end of this user page; loop to lock the next. */
      page_unlock (upage);
    }

  /* Error unwinding: release whatever we still hold, then die.
     Note the fall-through: too_long_error still holds UPAGE locked,
     lock_error does not. */
 too_long_error:
  page_unlock (upage);

 lock_error:
  palloc_free_page (ks);
  thread_exit ();
}
/*
 * Page retire self-test. For now, it always returns 0.
 */
int
page_retire_test(void)
{
	page_t *first, *pp, *cpp, *cpp2, *lpp;

	/*
	 * Tests the corner case where a large page can't be retired
	 * because one of the constituent pages is locked. We mark
	 * one page to be retired and try to retire it, and mark the
	 * other page to be retired but don't try to retire it, so
	 * that page_unlock() in the failure path will recurse and try
	 * to retire THAT page. This is the worst possible situation
	 * we can get ourselves into.
	 */
	memsegs_lock(0);
	pp = first = page_first();
	do {
		/* Only operate on the root page of each large page. */
		if (pp->p_szc && PP_PAGEROOT(pp) == pp) {
			/*
			 * Pick constituent pages relative to the root:
			 * cpp/cpp2 get marked toxic, lpp is the page we
			 * keep locked to force the retire failure.
			 */
			cpp = pp + 1;
			lpp = PP_ISFREE(pp)? pp : pp + 2;
			cpp2 = pp + 3;
			if (!page_trylock(lpp, pp == lpp? SE_EXCL : SE_SHARED))
				continue;
			if (!page_trylock(cpp, SE_EXCL)) {
				page_unlock(lpp);
				continue;
			}
			page_settoxic(cpp, PR_FMA | PR_BUSY);
			page_settoxic(cpp2, PR_FMA);
			page_tryretire(cpp);	/* will fail */
			page_unlock(lpp);
			/*
			 * NOTE(review): page_retire() appears to take a
			 * physical address elsewhere in this file, but is
			 * handed p_pagenum (a page frame number) here —
			 * confirm whether an mmu_ptob() conversion is
			 * missing.
			 */
			(void) page_retire(cpp->p_pagenum, PR_FMA);
			(void) page_retire(cpp2->p_pagenum, PR_FMA);
		}
	} while ((pp = page_next(pp)) != first);
	memsegs_unlock(0);

	return (0);
}
/*
 * Get a single page of file data for bootfs vnode `vp' at offset `off'.
 * If the page already exists we return it (locked) in pl[]; otherwise a
 * new page is created and filled by copying from the physical memory
 * backing the boot archive (bnp->bvn_addr).  When `pl' is NULL the
 * caller does not want the page list, so the page is simply unlocked.
 * Returns 0 on success or EIO if the copy from the backing frame fails.
 */
/*ARGSUSED*/
static int
bootfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
    cred_t *cr)
{
	bootfs_node_t *bnp = vp->v_data;
	page_t *pp, *fpp;
	pfn_t pfn;

	for (;;) {
		/* Easy case where the page exists */
		pp = page_lookup(vp, off, rw == S_CREATE ? SE_EXCL : SE_SHARED);
		if (pp != NULL) {
			if (pl != NULL) {
				pl[0] = pp;
				pl[1] = NULL;
			} else {
				page_unlock(pp);
			}
			return (0);
		}

		pp = page_create_va(vp, off, PAGESIZE, PG_EXCL | PG_WAIT,
		    seg, addr);

		/*
		 * If we didn't get the page, that means someone else beat us to
		 * creating this so we need to try again.
		 */
		if (pp != NULL)
			break;
	}

	/* Locate the physical frame backing this offset of the archive. */
	pfn = btop((bnp->bvn_addr + off) & PAGEMASK);
	fpp = page_numtopp_nolock(pfn);

	if (ppcopy(fpp, pp) == 0) {
		/* Copy failed: signal I/O error on the page and bail. */
		pvn_read_done(pp, B_ERROR);
		return (EIO);
	}

	if (pl != NULL) {
		pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw);
	} else {
		pvn_io_done(pp);
	}

	return (0);
}
/*
 * Take ownership of the physical pages already mapped at kernel
 * virtual range [addr, addr + size) (boot-time allocations): reserve
 * them, demote any large pages, hash each page onto &kvp at its
 * virtual address, and lock it down (p_lckcnt = 1).  On x86 the page
 * is left share-locked via page_downgrade(); elsewhere it is unlocked.
 */
void
boot_mapin(caddr_t addr, size_t size)
{
	caddr_t eaddr;
	page_t	*pp;
	pfn_t	 pfnum;

	/* Reserve availrmem for the whole range up front. */
	if (page_resv(btop(size), KM_NOSLEEP) == 0)
		panic("boot_mapin: page_resv failed");

	for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
		pfnum = va_to_pfn(addr);
		if (pfnum == PFN_INVALID)
			continue;
		if ((pp = page_numtopp_nolock(pfnum)) == NULL)
			panic("boot_mapin(): No pp for pfnum = %lx", pfnum);

		/*
		 * must break up any large pages that may have constituent
		 * pages being utilized for BOP_ALLOC()'s before calling
		 * page_numtopp().The locking code (ie. page_reclaim())
		 * can't handle them
		 */
		if (pp->p_szc != 0)
			page_boot_demote(pp);

		pp = page_numtopp(pfnum, SE_EXCL);
		if (pp == NULL || PP_ISFREE(pp))
			panic("boot_alloc: pp is NULL or free");

		/*
		 * If the cage is on but doesn't yet contain this page,
		 * mark it as non-relocatable.
		 */
		if (kcage_on && !PP_ISNORELOC(pp)) {
			PP_SETNORELOC(pp);
			PLCNT_XFER_NORELOC(pp);
		}

		(void) page_hashin(pp, &kvp, (u_offset_t)(uintptr_t)addr, NULL);
		pp->p_lckcnt = 1;
#if defined(__x86)
		page_downgrade(pp);
#else
		page_unlock(pp);
#endif
	}
}
/*
 * Take a retired page off the retired-pages vnode and clear the toxic flags.
 * If "free" is nonzero, lock it and put it back on the freelist. If "free"
 * is zero, the caller already holds SE_EXCL lock so we simply unretire it
 * and don't do anything else with it.
 *
 * Any unretire messages are printed from this routine.
 *
 * Returns 0 if page pp was unretired; else an error code.
 */
int
page_unretire_pp(page_t *pp, int free)
{
	/*
	 * To be retired, a page has to be hashed onto the retired_pages vnode
	 * and have PR_RETIRED set in p_toxic.
	 */
	/*
	 * Short-circuit: when free == 0 the caller holds SE_EXCL already,
	 * so we skip the reclaim-lock attempt entirely.
	 */
	if (free == 0 || page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) {
		ASSERT(PAGE_EXCL(pp));
		PR_DEBUG(prd_ulocked);
		if (!PP_RETIRED(pp)) {
			PR_DEBUG(prd_unotretired);
			page_unlock(pp);
			return (page_retire_done(pp, PRD_UNR_NOT));
		}

		PR_MESSAGE(CE_NOTE, 1, "unretiring retired"
		    " page 0x%08x.%08x", mmu_ptob((uint64_t)pp->p_pagenum));
		/* Decrement the kstat matching the reason it was retired. */
		if (pp->p_toxic & PR_FMA) {
			PR_DECR_KSTAT(pr_fma);
		} else if (pp->p_toxic & PR_UE) {
			PR_DECR_KSTAT(pr_ue);
		} else {
			PR_DECR_KSTAT(pr_mce);
		}
		page_clrtoxic(pp, PR_ALLFLAGS);

		if (free) {
			PR_DEBUG(prd_udestroy);
			page_destroy(pp, 0);
		} else {
			PR_DEBUG(prd_uhashout);
			page_hashout(pp, NULL);
		}

		/* The page is back in service: return its memory. */
		mutex_enter(&freemem_lock);
		availrmem++;
		mutex_exit(&freemem_lock);

		PR_DEBUG(prd_uunretired);
		PR_DECR_KSTAT(pr_retired);
		PR_INCR_KSTAT(pr_unretired);
		return (page_retire_done(pp, PRD_UNR_SUCCESS));
	}
	PR_DEBUG(prd_unotlocked);
	return (page_retire_done(pp, PRD_UNR_CANTLOCK));
}
/* Zero no-longer-used part of last page, when truncating a file.
 *
 * This function only exists for Solaris.  Other platforms do not
 * support it.
 *
 * Locking: the vcache entry lock is held.  It is released and
 * re-obtained.  The caller will raise activeV (to prevent pageins),
 * but this function must be called first, since it causes a pagein. */
void
osi_VM_PreTruncate(struct vcache *avc, int alen, afs_ucred_t *acred)
{
    int tail = alen & PAGEOFFSET;
    page_t *pg;

    /* Truncation point already page-aligned: nothing to zero. */
    if (tail == 0)
	return;

    /* Drop our locks around the page lookup, which may page in. */
    ReleaseWriteLock(&avc->lock);
    AFS_GUNLOCK();

    pg = page_lookup(AFSTOV(avc), alen - tail, SE_EXCL);
    if (pg != NULL) {
	/* Zero from the truncation point to the end of the page. */
	pagezero(pg, tail, PAGESIZE - tail);
	page_unlock(pg);
    }

    AFS_GLOCK();
    ObtainWriteLock(&avc->lock, 563);
}
/*
 * Hold the page backing frame number `pfn'.  When `lock' is
 * PLAT_HOLD_LOCK, the page is locked (SE_SHARED natively, SE_EXCL
 * under the Xen hypervisor) and returned through *pp_ret; otherwise
 * the page is only looked up.  Returns PLAT_HOLD_OK on success or
 * PLAT_HOLD_FAIL if the page does not exist, cannot be locked, or
 * (xpv only) has no valid machine frame.
 */
int
plat_hold_page(pfn_t pfn, int lock, page_t **pp_ret)
{
	page_t *pp = page_numtopp_nolock(pfn);

	if (pp == NULL)
		return (PLAT_HOLD_FAIL);

#if !defined(__xpv)
	/*
	 * Pages are locked SE_SHARED because some hypervisors
	 * like xVM ESX reclaim Guest OS memory by locking
	 * it SE_EXCL so we want to leave these pages alone.
	 */
	if (lock == PLAT_HOLD_LOCK) {
		ASSERT(pp_ret != NULL);
		if (page_trylock(pp, SE_SHARED) == 0)
			return (PLAT_HOLD_FAIL);
	}
#else	/* __xpv */
	if (lock == PLAT_HOLD_LOCK) {
		ASSERT(pp_ret != NULL);
		if (page_trylock(pp, SE_EXCL) == 0)
			return (PLAT_HOLD_FAIL);
	}

	if (mfn_list[pfn] == MFN_INVALID) {
		/* We failed - release the lock if we grabbed it earlier */
		if (lock == PLAT_HOLD_LOCK) {
			page_unlock(pp);
		}
		return (PLAT_HOLD_FAIL);
	}
#endif	/* __xpv */

	if (lock == PLAT_HOLD_LOCK)
		*pp_ret = pp;

	return (PLAT_HOLD_OK);
}
/* Copies SIZE bytes from kernel address SRC to user address UDST. Call thread_exit() if any of the user accesses are invalid. */ static void copy_out (void *udst_, const void *src_, size_t size) { uint8_t *udst = udst_; const uint8_t *src = src_; while (size > 0) { size_t chunk_size = PGSIZE - pg_ofs (udst); if (chunk_size > size) chunk_size = size; if (!page_lock (udst, false)) thread_exit (); memcpy (udst, src, chunk_size); page_unlock (udst); udst += chunk_size; src += chunk_size; size -= chunk_size; } }
/*
 * Act like page_destroy(), but instead of freeing the page, hash it onto
 * the retired_pages vnode, and mark it retired.
 *
 * For fun, we try to scrub the page until it's squeaky clean.
 * availrmem is adjusted here.
 */
static void
page_retire_destroy(page_t *pp)
{
	/*
	 * The page_t's own address is used as the (unique) offset when
	 * hashing onto the retired_pages vnode.
	 */
	u_offset_t off = (u_offset_t)((uintptr_t)pp);

	/* Caller must hand us an exclusively-locked, in-use, small,
	 * unmapped, vnode-less page. */
	ASSERT(PAGE_EXCL(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_szc == 0);
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(!pp->p_vnode);

	page_clr_all_props(pp);
	pagescrub(pp, 0, MMU_PAGESIZE);

	pp->p_next = NULL;
	pp->p_prev = NULL;
	if (page_hashin(pp, retired_pages, off, NULL) == 0) {
		cmn_err(CE_PANIC, "retired page %p hashin failed", (void *)pp);
	}

	page_settoxic(pp, PR_RETIRED);
	page_clrtoxic(pp, PR_BUSY);
	page_retire_dequeue(pp);
	PR_INCR_KSTAT(pr_retired);

	/* Attribute the retirement to the reason recorded in p_toxic. */
	if (pp->p_toxic & PR_FMA) {
		PR_INCR_KSTAT(pr_fma);
	} else if (pp->p_toxic & PR_UE) {
		PR_INCR_KSTAT(pr_ue);
	} else {
		PR_INCR_KSTAT(pr_mce);
	}

	/* The page is gone from service: account for it. */
	mutex_enter(&freemem_lock);
	availrmem--;
	mutex_exit(&freemem_lock);

	page_unlock(pp);
}
/* Copies SIZE bytes from user address USRC to kernel address DST.
   Calls thread_exit() if any of the user accesses are invalid. */
static void
copy_in (void *dst_, const void *usrc_, size_t size)
{
  uint8_t *kdst = dst_;
  const uint8_t *usrc = usrc_;

  while (size > 0)
    {
      /* Never read across a user page boundary in one memcpy. */
      size_t page_left = PGSIZE - pg_ofs (usrc);
      size_t n = page_left < size ? page_left : size;

      /* Verify the user page is readable before touching it. */
      if (!page_lock (usrc, false))
        thread_exit ();
      memcpy (kdst, usrc, n);
      page_unlock (usrc);

      kdst += n;
      usrc += n;
      size -= n;
    }
}
/*
 * Scan page_t's and issue I/O's for modified pages.
 *
 * Also coalesces consecutive small sized free pages into the next larger
 * pagesize. This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 *
 * Scan position (pp) and scan length (nscan) are static so the walk
 * resumes where the previous invocation left off.
 */
static void
fsflush_do_pages()
{
	vnode_t		*vp;
	ulong_t		pcount;
	hrtime_t	timer = gethrtime();
	ulong_t		releases = 0;
	ulong_t		nexamined = 0;
	ulong_t		nlocked = 0;
	ulong_t		nmodified = 0;
	ulong_t		ncoalesce = 0;
	ulong_t		cnt;
	int		mod;
	int		fspage = 1;
	u_offset_t	offset;
	uint_t		szc;

	page_t		*coal_page = NULL;  /* 1st page in group to coalesce */
	uint_t		coal_szc = 0;	    /* size code, coal_page->p_szc */
	uint_t		coal_cnt = 0;	    /* count of pages seen */

	static ulong_t	nscan = 0;
	static pgcnt_t	last_total_pages = 0;
	static page_t	*pp = NULL;

	/*
	 * Check to see if total_pages has changed.
	 */
	if (total_pages != last_total_pages) {
		last_total_pages = total_pages;
		nscan = (last_total_pages * (tune.t_fsflushr))/v.v_autoup;
	}

	if (pp == NULL)
		pp = memsegs->pages;

	pcount = 0;
	while (pcount < nscan) {

		/*
		 * move to the next page, skipping over large pages
		 * and issuing prefetches.
		 */
		if (pp->p_szc && fspage == 0) {
			pfn_t pfn;

			/* Step to the end of the current large page. */
			pfn = page_pptonum(pp);
			cnt = page_get_pagecnt(pp->p_szc);
			cnt -= pfn & (cnt - 1);
		} else
			cnt = 1;

		pp = page_nextn(pp, cnt);
		prefetch_page_r((void *)pp);
		ASSERT(pp != NULL);
		pcount += cnt;

		/*
		 * Do a bunch of dirty tests (ie. no locking) to determine
		 * if we can quickly skip this page. These tests are repeated
		 * after acquiring the page lock.
		 */
		++nexamined;
		if (PP_ISSWAP(pp)) {
			fspage = 0;
			coal_page = NULL;
			continue;
		}

		/*
		 * skip free pages too, but try coalescing them into larger
		 * pagesizes
		 */
		if (PP_ISFREE(pp)) {
			/*
			 * skip pages with a file system identity or that
			 * are already maximum size
			 */
			fspage = 0;
			szc = pp->p_szc;
			if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
				coal_page = NULL;
				continue;
			}

			/*
			 * If not in a coalescing candidate page or the size
			 * codes are different, start a new candidate.
			 */
			if (coal_page == NULL || coal_szc != szc) {

				/*
				 * page must be properly aligned
				 */
				if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
					coal_page = NULL;
					continue;
				}
				coal_page = pp;
				coal_szc = szc;
				coal_cnt = 1;
				continue;
			}

			/*
			 * acceptable to add this to existing candidate page
			 */
			++coal_cnt;
			if (coal_cnt < fsf_pgcnt[coal_szc])
				continue;

			/*
			 * We've got enough pages to coalesce, so do it.
			 * After promoting, we clear coal_page, so it will
			 * take another pass to promote this to an even
			 * larger page.
			 */
			++ncoalesce;
			(void) page_promote_size(coal_page, coal_szc);
			coal_page = NULL;
			continue;
		} else {
			coal_page = NULL;
		}

		if (PP_ISKAS(pp) ||
		    PAGE_LOCKED(pp) ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0) {
			fspage = 0;
			continue;
		}

		/*
		 * Reject pages that can't be "exclusively" locked.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;
		++nlocked;

		/*
		 * After locking the page, redo the above checks.
		 * Since we locked the page, leave out the PAGE_LOCKED() test.
		 */
		vp = pp->p_vnode;
		if (PP_ISSWAP(pp) ||
		    PP_ISFREE(pp) ||
		    vp == NULL ||
		    PP_ISKAS(pp) ||
		    (vp->v_flag & VISSWAP) != 0) {
			page_unlock(pp);
			fspage = 0;
			continue;
		}
		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
			page_unlock(pp);
			continue;
		}

		fspage = 1;
		ASSERT(vp->v_type != VCHR);

		/*
		 * Check the modified bit. Leaving the bit alone in hardware.
		 * It will be cleared if we do the putpage.
		 */
		if (IS_VMODSORT(vp))
			mod = hat_ismod(pp);
		else
			mod = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;

		if (mod) {
			++nmodified;
			offset = pp->p_offset;

			/*
			 * Hold the vnode before releasing the page lock
			 * to prevent it from being freed and re-used by
			 * some other thread.
			 */
			VN_HOLD(vp);

			page_unlock(pp);

			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
			    kcred, NULL);

			VN_RELE(vp);
		} else {

			/*
			 * Catch any pages which should be on the cache list,
			 * but aren't yet.
			 */
			if (hat_page_is_mapped(pp) == 0) {
				++releases;
				(void) page_release(pp, 1);
			} else {
				page_unlock(pp);
			}
		}
	}

	/*
	 * maintain statistics
	 * reset every million wakeups, just to avoid overflow
	 */
	if (++fsf_cycles == 1000000) {
		fsf_cycles = 0;
		fsf_total.fsf_scan = 0;
		fsf_total.fsf_examined = 0;
		fsf_total.fsf_locked = 0;
		fsf_total.fsf_modified = 0;
		fsf_total.fsf_coalesce = 0;
		fsf_total.fsf_time = 0;
		fsf_total.fsf_releases = 0;
	} else {
		fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
		fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
		fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
		fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
		fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
		fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
		fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
	}
}
/*
 * Allocate a large page to back the virtual address range
 * [addr, addr + size). If addr is NULL, allocate the virtual address
 * space as well.
 *
 * On failure every partially-acquired resource (page reservation, ppa
 * array, vmem range, created pages) is unwound via the goto labels at
 * the bottom, in reverse order of acquisition, and NULL is returned.
 */
static void *
segkmem_xalloc_lp(vmem_t *vmp, void *inaddr, size_t size, int vmflag,
    uint_t attr, page_t *(*page_create_func)(void *, size_t, int, void *),
    void *pcarg)
{
	caddr_t addr = inaddr, pa;
	size_t  lpsize = segkmem_lpsize;
	pgcnt_t npages = btopr(size);
	pgcnt_t nbpages = btop(lpsize);	   /* constituent pages per lp */
	pgcnt_t nlpages = size >> segkmem_lpshift;  /* large pages needed */
	size_t  ppasize = nbpages * sizeof (page_t *);
	page_t *pp, *rootpp, **ppa, *pplist = NULL;
	int i;

	vmflag |= VM_NOSLEEP;

	if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
		return (NULL);
	}

	/*
	 * allocate an array we need for hat_memload_array.
	 * we use a separate arena to avoid recursion.
	 * we will not need this array when hat_memload_array learns pp++
	 */
	if ((ppa = vmem_alloc(segkmem_ppa_arena, ppasize, vmflag)) == NULL) {
		goto fail_array_alloc;
	}

	if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL)
		goto fail_vmem_alloc;

	ASSERT(((uintptr_t)addr & (lpsize - 1)) == 0);

	/* create all the pages */
	for (pa = addr, i = 0; i < nlpages; i++, pa += lpsize) {
		if ((pp = page_create_func(pa, lpsize, vmflag, pcarg)) == NULL)
			goto fail_page_create;
		page_list_concat(&pplist, &pp);
	}

	/* at this point we have all the resource to complete the request */
	while ((rootpp = pplist) != NULL) {
		/* Collect one large page's constituents into ppa[]. */
		for (i = 0; i < nbpages; i++) {
			ASSERT(pplist != NULL);
			pp = pplist;
			page_sub(&pplist, pp);
			ASSERT(page_iolock_assert(pp));
			page_io_unlock(pp);
			ppa[i] = pp;
		}
		/*
		 * Load the locked entry. It's OK to preload the entry into the
		 * TSB since we now support large mappings in the kernel TSB.
		 */
		hat_memload_array(kas.a_hat,
		    (caddr_t)(uintptr_t)rootpp->p_offset, lpsize,
		    ppa, (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr,
		    HAT_LOAD_LOCK);

		/* Pin each constituent page and drop its exclusive lock. */
		for (--i; i >= 0; --i) {
			ppa[i]->p_lckcnt = 1;
			page_unlock(ppa[i]);
		}
	}

	vmem_free(segkmem_ppa_arena, ppa, ppasize);
	return (addr);

fail_page_create:
	/* Destroy every large page we managed to create. */
	while ((rootpp = pplist) != NULL) {
		for (i = 0, pp = pplist; i < nbpages; i++, pp = pplist) {
			ASSERT(pp != NULL);
			page_sub(&pplist, pp);
			ASSERT(page_iolock_assert(pp));
			page_io_unlock(pp);
		}
		page_destroy_pages(rootpp);
	}

	if (inaddr == NULL)
		vmem_free(vmp, addr, size);

fail_vmem_alloc:
	vmem_free(segkmem_ppa_arena, ppa, ppasize);

fail_array_alloc:
	page_unresv(npages);

	return (NULL);
}
/*
 * Fault handler for the kernel segment: F_SOFTLOCK locks down the
 * already-loaded translations for [addr, addr + size); F_SOFTUNLOCK
 * releases them.  Other fault types are not supported.  Faults on
 * segkp-managed pages are forwarded to segkp.
 */
/*ARGSUSED*/
static faultcode_t
segkmem_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t size,
    enum fault_type type, enum seg_rw rw)
{
	pgcnt_t npages;
	spgcnt_t pg;
	page_t *pp;
	struct vnode *vp = seg->s_data;

	ASSERT(RW_READ_HELD(&seg->s_as->a_lock));

	if (seg->s_as != &kas || size > seg->s_size ||
	    addr < seg->s_base || addr + size > seg->s_base + seg->s_size)
		panic("segkmem_fault: bad args");

	/*
	 * If it is one of segkp pages, call segkp_fault.
	 */
	if (segkp_bitmap && seg == &kvseg &&
	    BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
		return (SEGOP_FAULT(hat, segkp, addr, size, type, rw));

	if (rw != S_READ && rw != S_WRITE && rw != S_OTHER)
		return (FC_NOSUPPORT);

	npages = btopr(size);

	switch (type) {
	case F_SOFTLOCK:	/* lock down already-loaded translations */
		for (pg = 0; pg < npages; pg++) {
			pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr,
			    SE_SHARED);
			if (pp == NULL) {
				/*
				 * Hmm, no page. Does a kernel mapping
				 * exist for it?
				 */
				if (!hat_probe(kas.a_hat, addr)) {
					/*
					 * No mapping either: undo the shared
					 * locks taken so far and fail.
					 */
					addr -= PAGESIZE;
					while (--pg >= 0) {
						pp = page_find(vp,
						    (u_offset_t)(uintptr_t)addr);
						if (pp)
							page_unlock(pp);
						addr -= PAGESIZE;
					}
					return (FC_NOMAP);
				}
			}
			addr += PAGESIZE;
		}
		if (rw == S_OTHER)
			hat_reserve(seg->s_as, addr, size);
		return (0);
	case F_SOFTUNLOCK:
		while (npages--) {
			pp = page_find(vp, (u_offset_t)(uintptr_t)addr);
			if (pp)
				page_unlock(pp);
			addr += PAGESIZE;
		}
		return (0);
	default:
		return (FC_NOSUPPORT);
	}
	/*NOTREACHED*/
}
/*
 * page_retire() - the front door in to retire a page.
 *
 * Ideally, page_retire() would instantly retire the requested page.
 * Unfortunately, some pages are locked or otherwise tied up and cannot be
 * retired right away. To deal with that, bits are set in p_toxic of the
 * page_t. An attempt is made to lock the page; if the attempt is successful,
 * we instantly unlock the page counting on page_unlock() to notice p_toxic
 * is nonzero and to call back into page_retire_pp(). Success is determined
 * by looking to see whether the page has been retired once it has been
 * unlocked.
 *
 * Returns:
 *
 *   - 0 on success,
 *   - EINVAL when the PA is whacko,
 *   - EIO if the page is already retired or already pending retirement, or
 *   - EAGAIN if the page could not be _immediately_ retired but is pending.
 */
int
page_retire(uint64_t pa, uchar_t reason)
{
	page_t	*pp;

	ASSERT(reason & PR_REASONS);		/* there must be a reason */
	ASSERT(!(reason & ~PR_REASONS));	/* but no other bits */

	pp = page_numtopp_nolock(mmu_btop(pa));
	if (pp == NULL) {
		PR_MESSAGE(CE_WARN, 1, "Cannot schedule clearing of error on"
		    " page 0x%08x.%08x; page is not relocatable memory", pa);
		return (page_retire_done(pp, PRD_INVALID_PA));
	}
	if (PP_RETIRED(pp)) {
		PR_DEBUG(prd_dup1);
		return (page_retire_done(pp, PRD_DUPLICATE));
	}

	if ((reason & PR_UE) && !PP_TOXIC(pp)) {
		PR_MESSAGE(CE_NOTE, 1, "Scheduling clearing of error on"
		    " page 0x%08x.%08x", pa);
	} else if (PP_PR_REQ(pp)) {
		PR_DEBUG(prd_dup2);
		return (page_retire_done(pp, PRD_DUPLICATE));
	} else {
		PR_MESSAGE(CE_NOTE, 1, "Scheduling removal of"
		    " page 0x%08x.%08x", pa);
	}
	page_settoxic(pp, reason);
	page_retire_enqueue(pp);

	/*
	 * And now for some magic.
	 *
	 * We marked this page toxic up above. All there is left to do is
	 * to try to lock the page and then unlock it. The page lock routines
	 * will intercept the page and retire it if they can. If the page
	 * cannot be locked, 's okay -- page_unlock() will eventually get it,
	 * or the background thread, until then the lock routines will deny
	 * further locks on it.
	 */
	if (MTBF(pr_calls, pr_mtbf) && page_trylock(pp, SE_EXCL)) {
		PR_DEBUG(prd_prlocked);
		page_unlock(pp);
	} else {
		PR_DEBUG(prd_prnotlocked);
	}

	if (PP_RETIRED(pp)) {
		PR_DEBUG(prd_prretired);
		return (0);
	} else {
		/* Wake the background retire thread to keep trying. */
		cv_signal(&pr_cv);
		PR_INCR_KSTAT(pr_failed);

		if (pp->p_toxic & PR_MSG) {
			return (page_retire_done(pp, PRD_FAILED));
		} else {
			return (page_retire_done(pp, PRD_PENDING));
		}
	}
}
/*
 * page_retire_pp() decides what to do with a failing page.
 *
 * When we get a free page (e.g. the scrubber or in the free path) life is
 * nice because the page is clean and marked free -- those always retire
 * nicely. From there we go by order of difficulty. If the page has data,
 * we attempt to relocate its contents to a suitable replacement page. If
 * that does not succeed, we look to see if it is clean. If after all of
 * this we have a clean, unmapped page (which we usually do!), we retire it.
 * If the page is not clean, we still process it regardless on a UE; for
 * CEs or FMA requests, we fail leaving the page in service. The page will
 * eventually be tried again later. We always return with the page unlocked
 * since we are called from page_unlock().
 *
 * We don't call panic or do anything fancy down in here.  Our boss the DE
 * gets paid handsomely to do his job of figuring out what to do when errors
 * occur. We just do what he tells us to do.
 */
static int
page_retire_pp(page_t *pp)
{
	int		toxic;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_iolock_state == 0);
	ASSERT(pp->p_szc == 0);

	PR_DEBUG(prd_top);
	PR_TYPES(pp);

	toxic = pp->p_toxic;
	ASSERT(toxic & PR_REASONS);

	/* Non-UE requests are dropped once the retire limit is reached. */
	if ((toxic & (PR_FMA | PR_MCE)) && !(toxic & PR_UE) &&
	    page_retire_limit()) {
		page_clrtoxic(pp, PR_FMA | PR_MCE | PR_MSG | PR_BUSY);
		page_retire_dequeue(pp);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_LIMIT));
	}

	if (PP_ISFREE(pp)) {
		int dbgnoreclaim = MTBF(recl_calls, recl_mtbf) == 0;

		PR_DEBUG(prd_free);

		if (dbgnoreclaim || !page_reclaim(pp, NULL)) {
			PR_DEBUG(prd_noreclaim);
			PR_INCR_KSTAT(pr_failed);
			/*
			 * page_reclaim() returns with `pp' unlocked when
			 * it fails.
			 */
			if (dbgnoreclaim)
				page_unlock(pp);
			return (page_retire_done(pp, PRD_FAILED));
		}
	}
	ASSERT(!PP_ISFREE(pp));

	if ((toxic & PR_UE) == 0 && pp->p_vnode && !PP_ISNORELOCKERNEL(pp) &&
	    MTBF(reloc_calls, reloc_mtbf)) {
		page_t *newpp;
		spgcnt_t count;

		/*
		 * If we can relocate the page, great! newpp will go
		 * on without us, and everything is fine.  Regardless
		 * of whether the relocation succeeds, we are still
		 * going to take `pp' around back and shoot it.
		 */
		newpp = NULL;
		if (page_relocate(&pp, &newpp, 0, 0, &count, NULL) == 0) {
			PR_DEBUG(prd_reloc);
			page_unlock(newpp);
			ASSERT(hat_page_getattr(pp, P_MOD) == 0);
		} else {
			PR_DEBUG(prd_relocfail);
		}
	}

	if (hat_ismod(pp)) {
		PR_DEBUG(prd_mod);
		PR_INCR_KSTAT(pr_failed);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_FAILED));
	}

	if (PP_ISKVP(pp)) {
		PR_DEBUG(prd_kern);
		PR_INCR_KSTAT(pr_failed_kernel);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_FAILED));
	}

	if (pp->p_lckcnt || pp->p_cowcnt) {
		PR_DEBUG(prd_locked);
		PR_INCR_KSTAT(pr_failed);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_FAILED));
	}

	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	ASSERT(!hat_page_is_mapped(pp));

	/*
	 * If the page is modified, and was not relocated; we can't
	 * retire it without dropping data on the floor. We have to
	 * recheck after unloading since the dirty bit could have been
	 * set since we last checked.
	 */
	if (hat_ismod(pp)) {
		PR_DEBUG(prd_mod_late);
		PR_INCR_KSTAT(pr_failed);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_FAILED));
	}

	if (pp->p_vnode) {
		PR_DEBUG(prd_hashout);
		page_hashout(pp, NULL);
	}
	ASSERT(!pp->p_vnode);

	/*
	 * The problem page is locked, demoted, unmapped, not free,
	 * hashed out, and not COW or mlocked (whew!).
	 *
	 * Now we select our ammunition, take it around back, and shoot it.
	 */
	if (toxic & PR_UE) {
		if (page_retire_transient_ue(pp)) {
			PR_DEBUG(prd_uescrubbed);
			return (page_retire_done(pp, PRD_UE_SCRUBBED));
		} else {
			PR_DEBUG(prd_uenotscrubbed);
			page_retire_destroy(pp);
			return (page_retire_done(pp, PRD_SUCCESS));
		}
	} else if (toxic & PR_FMA) {
		PR_DEBUG(prd_fma);
		page_retire_destroy(pp);
		return (page_retire_done(pp, PRD_SUCCESS));
	} else if (toxic & PR_MCE) {
		PR_DEBUG(prd_mce);
		page_retire_destroy(pp);
		return (page_retire_done(pp, PRD_SUCCESS));
	}
	panic("page_retire_pp: bad toxic flags %d", toxic);
	/*NOTREACHED*/
}
/* Read system call. */ static int sys_read (int handle, void *udst_, unsigned size) { uint8_t *udst = udst_; struct file_descriptor *fd; int bytes_read = 0; fd = lookup_fd (handle); while (size > 0) { /* How much to read into this page? */ size_t page_left = PGSIZE - pg_ofs (udst); size_t read_amt = size < page_left ? size : page_left; off_t retval; /* Read from file into page. */ if (handle != STDIN_FILENO) { if (!page_lock (udst, true)) thread_exit (); lock_acquire (&fs_lock); retval = file_read (fd->file, udst, read_amt); lock_release (&fs_lock); page_unlock (udst); } else { size_t i; for (i = 0; i < read_amt; i++) { char c = input_getc (); if (!page_lock (udst, true)) thread_exit (); udst[i] = c; page_unlock (udst); } bytes_read = read_amt; } /* Check success. */ if (retval < 0) { if (bytes_read == 0) bytes_read = -1; break; } bytes_read += retval; if (retval != (off_t) read_amt) { /* Short read, so we're done. */ break; } /* Advance. */ udst += retval; size -= retval; } return bytes_read; }
/*
 * Allocate pages to back the virtual address range [addr, addr + size).
 * If addr is NULL, allocate the virtual address space as well.
 *
 * Each page is created via `page_create_func', loaded into the kernel
 * HAT with a locked translation, pinned (p_lckcnt = 1), and then either
 * downgraded to a shared lock or fully unlocked (see the per-platform
 * tail of the loop).  Returns the kernel virtual address, or NULL on
 * failure with every intermediate resource released.
 */
void *
segkmem_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag, uint_t attr,
    page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg)
{
	page_t *ppl;
	caddr_t addr = inaddr;
	pgcnt_t npages = btopr(size);
	int allocflag;

	if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL)
		return (NULL);

	ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0);

	if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
		if (inaddr == NULL)
			vmem_free(vmp, addr, size);
		return (NULL);
	}

	ppl = page_create_func(addr, size, vmflag, pcarg);
	if (ppl == NULL) {
		if (inaddr == NULL)
			vmem_free(vmp, addr, size);
		page_unresv(npages);
		return (NULL);
	}

	/*
	 * Under certain conditions, we need to let the HAT layer know
	 * that it cannot safely allocate memory. Allocations from
	 * the hat_memload vmem arena always need this, to prevent
	 * infinite recursion.
	 *
	 * In addition, the x86 hat cannot safely do memory
	 * allocations while in vmem_populate(), because there
	 * is no simple bound on its usage.
	 */
	if (vmflag & VM_MEMLOAD)
		allocflag = HAT_NO_KALLOC;
#if defined(__x86)
	else if (vmem_is_populator())
		allocflag = HAT_NO_KALLOC;
#endif
	else
		allocflag = 0;

	while (ppl != NULL) {
		page_t *pp = ppl;
		page_sub(&ppl, pp);
		ASSERT(page_iolock_assert(pp));
		ASSERT(PAGE_EXCL(pp));
		page_io_unlock(pp);
		hat_memload(kas.a_hat, (caddr_t)(uintptr_t)pp->p_offset, pp,
		    (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr,
		    HAT_LOAD_LOCK | allocflag);
		pp->p_lckcnt = 1;
#if defined(__x86)
		page_downgrade(pp);
#else
		if (vmflag & SEGKMEM_SHARELOCKED)
			page_downgrade(pp);
		else
			page_unlock(pp);
#endif
	}

	return (addr);
}
/*
 * For every (memory type, page size, color) freelist of MNODE: pull all
 * pages off the list, bucket them by logical board number (PFN_2_LBN),
 * then rebuild the freelist by repeatedly taking a page from a randomly
 * chosen board bucket.  The net effect is that consecutive freelist
 * allocations are spread across system boards rather than draining one
 * board at a time.  The per-bin freelist mutex is held across the whole
 * drain-and-rebuild of each list.
 */
void
plat_freelist_process(int mnode)
{
	page_t *page, **freelist;
	page_t *bdlist[STARFIRE_MAX_BOARDS];	/* per-board page lists */
	page_t **sortlist[STARFIRE_MAX_BOARDS];	/* non-empty board lists */
	uint32_t idx, idy, size, color, max_color, lbn;
	uint32_t bd_flags, bd_cnt, result, bds;
	kmutex_t *pcm;
	int mtype;

	/* for each page size */
	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		for (size = 0; size < mmu_page_sizes; size++) {

			/*
			 * Compute the maximum # of phys colors based on
			 * page size.
			 */
			max_color = page_get_pagecolors(size);

			/* for each color */
			for (color = 0; color < max_color; color++) {

				bd_cnt = 0;	/* # of boards seen */
				bd_flags = 0;	/* bitmask of boards seen */
				for (idx = 0; idx < STARFIRE_MAX_BOARDS;
				    idx++) {
					bdlist[idx] = NULL;
					sortlist[idx] = NULL;
				}

				/* find freelist */
				freelist = &PAGE_FREELISTS(mnode, size,
				    color, mtype);

				if (*freelist == NULL)
					continue;

				/* acquire locks */
				pcm = PC_BIN_MUTEX(mnode, color,
				    PG_FREE_LIST);
				mutex_enter(pcm);

				/*
				 * read freelist & sort pages by logical
				 * board number
				 */
				/* grab pages till last one. */
				while (*freelist) {
					page = *freelist;
					result = page_trylock(page, SE_EXCL);
					ASSERT(result);

					/* Delete from freelist */
					if (size != 0) {
						page_vpsub(freelist, page);
					} else {
						mach_page_sub(freelist, page);
					}

					/* detect the lbn */
					lbn = PFN_2_LBN(page->p_pagenum);

					/* add to bdlist[lbn] */
					if (size != 0) {
						page_vpadd(&bdlist[lbn],
						    page);
					} else {
						mach_page_add(&bdlist[lbn],
						    page);
					}

					/* if lbn new */
					if ((bd_flags & (1 << lbn)) == 0) {
						bd_flags |= (1 << lbn);
						bd_cnt++;
					}
					page_unlock(page);
				}

				/*
				 * Make the sortlist so
				 * bd_cnt choices show up
				 */
				bds = 0;
				for (idx = 0; idx < STARFIRE_MAX_BOARDS;
				    idx++) {
					if (bdlist[idx])
						sortlist[bds++] =
						    &bdlist[idx];
				}

				/*
				 * Set random start.
				 */
				(void) random_idx(-color);

				/*
				 * now rebuild the freelist by shuffling
				 * pages from bd lists
				 */
				while (bd_cnt) {

					/*
					 * get "random" index between 0 &
					 * bd_cnt
					 */
					ASSERT(bd_cnt &&
					    (bd_cnt < STARFIRE_MAX_BOARDS+1));

					idx = random_idx(bd_cnt);

					page = *sortlist[idx];
					result = page_trylock(page, SE_EXCL);
					ASSERT(result);

					/* Delete from sort_list */
					/* & Append to freelist */
					/* Big pages use vp_add - 8k don't */
					if (size != 0) {
						page_vpsub(sortlist[idx],
						    page);
						page_vpadd(freelist, page);
					} else {
						mach_page_sub(sortlist[idx],
						    page);
						mach_page_add(freelist, page);
					}

					/* needed for indexing tmp lists */
					lbn = PFN_2_LBN(page->p_pagenum);

					/*
					 * if this was the last page on this
					 * list?
					 */
					if (*sortlist[idx] == NULL) {

						/* have to find brd list */

						/* idx is lbn? -- No! */
						/* sortlist, brdlist */
						/* have diff indexes */
						bd_flags &= ~(1 << lbn);
						--bd_cnt;

						/*
						 * redo the sortlist so only
						 * bd_cnt choices show up
						 */
						bds = 0;
						for (idy = 0;
						    idy < STARFIRE_MAX_BOARDS;
						    idy++) {
							if (bdlist[idy]) {
								sortlist[bds++]
								/* CSTYLED */
								= &bdlist[idy];
							}
						}
					}
					page_unlock(page);
				}
				mutex_exit(pcm);
			}
		}
	}
}
/*
 * Process a vnode's page list for all pages whose offset is >= off.
 * Pages are to either be free'd, invalidated, or written back to disk.
 *
 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 * is specified, otherwise they are "shared" locked.
 *
 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 *
 * Special marker page_t's are inserted in the list in order
 * to keep track of where we are in the list when locks are dropped.
 *
 * Note the list is circular and insertions can happen only at the
 * head and tail of the list.  The algorithm ensures visiting all pages
 * on the list in the following way:
 *
 *    Drop two marker pages at the end of the list.
 *
 *    Move one marker page backwards towards the start of the list until
 *    it is at the list head, processing the pages passed along the way.
 *
 * Due to race conditions when the vphm mutex is dropped, additional pages
 * can be added to either end of the list, so we'll continue to move
 * the marker and process pages until it is up against the end marker.
 *
 * There is one special exit condition.  If we are processing a VMODSORT
 * vnode and only writing back modified pages, we can stop as soon as
 * we run into an unmodified page.  This makes fsync(3) operations fast.
 *
 * Returns 0 on success, EAGAIN if B_ASYNC and the vnode is already
 * VVMLOCK'd, otherwise the first error returned by *putapage.
 */
int
pvn_vplist_dirty(
	vnode_t		*vp,
	u_offset_t	off,
	int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
			size_t *, int, cred_t *),
	int		flags,
	cred_t		*cred)
{
	page_t		*pp;
	page_t		*mark;		/* marker page that moves toward head */
	page_t		*end;		/* marker page at end of list */
	int		err = 0;
	int		error;
	kmutex_t	*vphm;
	se_t		se;
	page_t		**where_to_move;

	ASSERT(vp->v_type != VCHR);

	if (vp->v_pages == NULL)
		return (0);

	/*
	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
	 *
	 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
	 * from getting blocked while flushing pages to a dead NFS server.
	 */
	mutex_enter(&vp->v_lock);
	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
		mutex_exit(&vp->v_lock);
		return (EAGAIN);
	}

	while (vp->v_flag & VVMLOCK)
		cv_wait(&vp->v_cv, &vp->v_lock);

	if (vp->v_pages == NULL) {
		mutex_exit(&vp->v_lock);
		return (0);
	}

	vp->v_flag |= VVMLOCK;
	mutex_exit(&vp->v_lock);

	/*
	 * Set up the marker pages used to walk the list
	 */
	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
	end->p_vnode = vp;
	end->p_offset = (u_offset_t)-2;
	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
	mark->p_vnode = vp;
	mark->p_offset = (u_offset_t)-1;

	/*
	 * Grab the lock protecting the vnode's page list
	 * note that this lock is dropped at times in the loop.
	 */
	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);
	if (vp->v_pages == NULL)
		goto leave;

	/*
	 * insert the markers and loop through the list of pages
	 */
	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
	page_vpadd(&mark->p_vpnext, end);
	for (;;) {

		/*
		 * If only doing an async write back, then we can
		 * stop as soon as we get to start of the list.
		 */
		if (flags == B_ASYNC && vp->v_pages == mark)
			break;

		/*
		 * otherwise stop when we've gone through all the pages
		 */
		if (mark->p_vpprev == end)
			break;

		pp = mark->p_vpprev;
		/* where_to_move re-links MARK just behind PP later on */
		if (vp->v_pages == pp)
			where_to_move = &vp->v_pages;
		else
			where_to_move = &pp->p_vpprev->p_vpnext;

		ASSERT(pp->p_vnode == vp);

		/*
		 * If just flushing dirty pages to disk and this vnode
		 * is using a sorted list of pages, we can stop processing
		 * as soon as we find an unmodified page. Since all the
		 * modified pages are visited first.
		 */
		if (IS_VMODSORT(vp) &&
		    !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
			if (!hat_ismod(pp) && !page_io_locked(pp)) {
#ifdef  DEBUG
				/*
				 * For debug kernels examine what should be
				 * all the remaining clean pages, asserting
				 * that they are not modified.
				 */
				page_t	*chk = pp;
				int	attr;

				page_vpsub(&vp->v_pages, mark);
				page_vpadd(where_to_move, mark);
				do {
					chk = chk->p_vpprev;
					ASSERT(chk != end);
					if (chk == mark)
						continue;
					attr = hat_page_getattr(chk,
					    P_MOD | P_REF);
					if ((attr & P_MOD) == 0)
						continue;
					panic("v_pages list not all clean: "
					    "page_t*=%p vnode=%p off=%lx "
					    "attr=0x%x last clean page_t*=%p\n",
					    (void *)chk, (void *)chk->p_vnode,
					    (long)chk->p_offset, attr,
					    (void *)pp);
				} while (chk != vp->v_pages);
#endif
				break;
			} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
				/*
				 * The page is clean but io-locked (the
				 * first test above failed), so IO is in
				 * flight.  Block only for sync IO since
				 * we don't want to block async IO.
				 */
				mutex_exit(vphm);
				page_io_wait(pp);
				mutex_enter(vphm);
				continue;
			}
		}

		/*
		 * Skip this page if the offset is out of the desired range.
		 * Just move the marker and continue.
		 */
		if (pp->p_offset < off) {
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
			continue;
		}

		/*
		 * If we are supposed to invalidate or free this
		 * page, then we need an exclusive lock.
		 */
		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		/*
		 * We must acquire the page lock for all synchronous
		 * operations (invalidate, free and write).
		 */
		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
			/*
			 * If the page_lock() drops the mutex
			 * we must retry the loop.
			 */
			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
				continue;

			/*
			 * It's ok to move the marker page now.
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
		} else {

			/*
			 * update the marker page for all remaining cases
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);

			/*
			 * For write backs, If we can't lock the page, it's
			 * invalid or in the process of being destroyed.  Skip
			 * it, assuming someone else is writing it.
			 */
			if (!page_trylock(pp, se))
				continue;
		}

		ASSERT(pp->p_vnode == vp);

		/*
		 * Successfully locked the page, now figure out what to
		 * do with it. Free pages are easily dealt with, invalidate
		 * if desired or just go on to the next page.
		 */
		if (PP_ISFREE(pp)) {
			if ((flags & B_INVAL) == 0) {
				page_unlock(pp);
				continue;
			}

			/*
			 * Invalidate (destroy) the page.
			 */
			mutex_exit(vphm);
			page_destroy_free(pp);
			mutex_enter(vphm);
			continue;
		}

		/*
		 * pvn_getdirty() figures out what do do with a dirty page.
		 * If the page is dirty, the putapage() routine will write it
		 * and will kluster any other adjacent dirty pages it can.
		 *
		 * pvn_getdirty() and `(*putapage)' unlock the page.
		 */
		mutex_exit(vphm);
		if (pvn_getdirty(pp, flags)) {
			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
			if (!err)
				err = error;
		}
		mutex_enter(vphm);
	}
	page_vpsub(&vp->v_pages, mark);
	page_vpsub(&vp->v_pages, end);

leave:
	/*
	 * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds
	 */
	mutex_exit(vphm);
	kmem_cache_free(marker_cache, mark);
	kmem_cache_free(marker_cache, end);
	mutex_enter(&vp->v_lock);
	vp->v_flag &= ~VVMLOCK;
	cv_broadcast(&vp->v_cv);
	mutex_exit(&vp->v_lock);
	return (err);
}
/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
 * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster
 * operation and is only to be considered if it doesn't involve any
 * waiting here. B_TRUNC indicates that the file is being truncated
 * and so no i/o needs to be done. B_FORCE indicates that the page
 * must be destroyed so don't try writing it out.
 *
 * The caller must ensure that the page is locked.  Returns 1, if
 * the page should be written back (the "iolock" is held in this
 * case), or 0 if the page has been dealt with or has been
 * unlocked.
 */
int
pvn_getdirty(page_t *pp, int flags)
{
	ASSERT((flags & (B_INVAL | B_FREE)) ?
	    PAGE_EXCL(pp) : PAGE_SHARED(pp));
	ASSERT(PP_ISFREE(pp) == 0);

	/*
	 * If trying to invalidate or free a logically `locked' page,
	 * forget it.  Don't need page_struct_lock to check p_lckcnt and
	 * p_cowcnt as the page is exclusively locked.
	 */
	if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
	    (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
		page_unlock(pp);
		return (0);
	}

	/*
	 * Now acquire the i/o lock so we can add it to the dirty
	 * list (if necessary).  We avoid blocking on the i/o lock
	 * in the following cases:
	 *
	 *	If B_DELWRI is set, which implies that this request is
	 *	due to a klustering operation.
	 *
	 *	If this is an async (B_ASYNC) operation and we are not doing
	 *	invalidation (B_INVAL) [The current i/o or fsflush will ensure
	 *	that the page is written out].
	 */
	if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
		if (!page_io_trylock(pp)) {
			/* i/o lock contended: skip this page entirely. */
			page_unlock(pp);
			return (0);
		}
	} else {
		page_io_lock(pp);
	}

	/*
	 * If we want to free or invalidate the page then
	 * we need to unload it so that anyone who wants
	 * it will have to take a minor fault to get it.
	 * Otherwise, we're just writing the page back so we
	 * need to sync up the hardware and software mod bit to
	 * detect any future modifications.  We clear the
	 * software mod bit when we put the page on the dirty
	 * list.
	 */
	if (flags & (B_INVAL | B_FREE)) {
		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	} else {
		(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
	}

	if (!hat_ismod(pp) || (flags & B_TRUNC)) {
		/*
		 * Don't need to add it to the
		 * list after all.
		 */
		page_io_unlock(pp);
		if (flags & B_INVAL) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if (flags & B_FREE) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
		} else {
			/*
			 * This is advisory path for the callers
			 * of VOP_PUTPAGE() who prefer freeing the
			 * page _only_ if no one else is accessing it.
			 * E.g. segmap_release()
			 *
			 * The above hat_ismod() check is useless because:
			 * (1) we may not be holding SE_EXCL lock;
			 * (2) we've not unloaded _all_ translations
			 *
			 * Let page_release() do the heavy-lifting.
			 */
			(void) page_release(pp, 1);
		}
		return (0);
	}

	/*
	 * Page is dirty, get it ready for the write back
	 * and add page to the dirty list.
	 */
	hat_clrrefmod(pp);

	/*
	 * If we're going to free the page when we're done
	 * then we can let others try to use it starting now.
	 * We'll detect the fact that they used it when the
	 * i/o is done and avoid freeing the page.
	 */
	if (flags & B_FREE)
		page_downgrade(pp);

	TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);

	return (1);
}
/*
 * Called when a write i/o completes for a list of pages (PLIST).  Each
 * page is both page-locked and io-locked on entry; this routine drops
 * the io lock and disposes of each page according to FLAGS (B_ERROR,
 * B_INVAL, B_FREE, B_FORCE, B_DONTNEED), updating per-CPU paging
 * statistics and firing a TNF pageout probe at the end.
 */
void
pvn_write_done(page_t *plist, int flags)
{
	int dfree = 0;
	int pgrec = 0;
	int pgout = 0;
	int pgpgout = 0;
	int anonpgout = 0;
	int anonfree = 0;
	int fspgout = 0;
	int fsfree = 0;
	int execpgout = 0;
	int execfree = 0;
	page_t *pp;
	struct cpu *cpup;
	struct vnode *vp = NULL;	/* for probe */
	uint_t ppattr;
	kmutex_t *vphm = NULL;

	ASSERT((flags & B_READ) == 0);

	/*
	 * If we are about to start paging anyway, start freeing pages.
	 */
	if (write_free && freemem < lotsfree + pages_before_pager &&
	    (flags & B_ERROR) == 0) {
		flags |= B_FREE;
	}

	/*
	 * Handle each page involved in the i/o operation.
	 */
	while (plist != NULL) {
		pp = plist;
		ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
		page_sub(&plist, pp);

		/* Kernel probe support */
		if (vp == NULL)
			vp = pp->p_vnode;

		if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
			/*
			 * Move page to the top of the v_page list.
			 * Skip pages modified during IO.
			 */
			vphm = page_vnode_mutex(vp);
			mutex_enter(vphm);
			if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
				page_vpsub(&vp->v_pages, pp);
				page_vpadd(&vp->v_pages, pp);
			}
			mutex_exit(vphm);
		}

		if (flags & B_ERROR) {
			/*
			 * Write operation failed.  We don't want
			 * to destroy (or free) the page unless B_FORCE
			 * is set. We set the mod bit again and release
			 * all locks on the page so that it will get written
			 * back again later when things are hopefully
			 * better again.
			 * If B_INVAL and B_FORCE is set we really have
			 * to destroy the page.
			 */
			if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
				page_io_unlock(pp);
				/*LINTED: constant in conditional context*/
				VN_DISPOSE(pp, B_INVAL, 0, kcred);
			} else {
				hat_setmod_only(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
		} else if (flags & B_INVAL) {
			/*
			 * XXX - Failed writes with B_INVAL set are
			 * not handled appropriately.
			 */
			page_io_unlock(pp);
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if (flags & B_FREE || !hat_page_is_mapped(pp)) {
			/*
			 * Update statistics for pages being paged out
			 */
			if (pp->p_vnode) {
				if (IS_SWAPFSVP(pp->p_vnode)) {
					anonpgout++;
				} else {
					if (pp->p_vnode->v_flag & VVMEXEC) {
						execpgout++;
					} else {
						fspgout++;
					}
				}
			}
			page_io_unlock(pp);
			pgout = 1;
			pgpgout++;
			TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
			    "page_ws_out:pp %p", pp);

			/*
			 * The page_struct_lock need not be acquired to
			 * examine "p_lckcnt" and "p_cowcnt" since we'll
			 * have an "exclusive" lock if the upgrade succeeds.
			 */
			if (page_tryupgrade(pp) && pp->p_lckcnt == 0 &&
			    pp->p_cowcnt == 0) {
				/*
				 * Check if someone has reclaimed the
				 * page.  If ref and mod are not set, no
				 * one is using it so we can free it.
				 * The rest of the system is careful
				 * to use the NOSYNC flag to unload
				 * translations set up for i/o w/o
				 * affecting ref and mod bits.
				 *
				 * Obtain a copy of the real hardware
				 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
				 * to avoid having to flush the cache.
				 */
				ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
				    HAT_SYNC_STOPON_MOD);
			ck_refmod:
				if (!(ppattr & (P_REF | P_MOD))) {
					if (hat_page_is_mapped(pp)) {
						/*
						 * Doesn't look like the page
						 * was modified so now we
						 * really have to unload the
						 * translations.  Meanwhile
						 * another CPU could've
						 * modified it so we have to
						 * check again.  We don't loop
						 * forever here because now
						 * the translations are gone
						 * and no one can get a new one
						 * since we have the "exclusive"
						 * lock on the page.
						 */
						(void) hat_pageunload(pp,
						    HAT_FORCE_PGUNLOAD);
						ppattr = hat_page_getattr(pp,
						    P_REF | P_MOD);
						goto ck_refmod;
					}
					/*
					 * Update statistics for pages being
					 * freed
					 */
					if (pp->p_vnode) {
						if (IS_SWAPFSVP(pp->p_vnode)) {
							anonfree++;
						} else {
							if (pp->p_vnode->v_flag
							    & VVMEXEC) {
								execfree++;
							} else {
								fsfree++;
							}
						}
					}
					/*LINTED: constant in conditional ctx*/
					VN_DISPOSE(pp, B_FREE,
					    (flags & B_DONTNEED), kcred);
					dfree++;
				} else {
					page_unlock(pp);
					pgrec++;
					TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
					    "page_ws_free:pp %p", pp);
				}
			} else {
				/*
				 * Page is either `locked' in memory
				 * or was reclaimed and now has a
				 * "shared" lock, so release it.
				 */
				page_unlock(pp);
			}
		} else {
			/*
			 * Neither B_FREE nor B_INVAL nor B_ERROR.
			 * Just release locks.
			 */
			page_io_unlock(pp);
			page_unlock(pp);
		}
	}

	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get cpup now that CPU cannot change */
	CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
	CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
	CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
	CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
	CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
	CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
	CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
	CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
	CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
	CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
	CPU_STATS_EXIT_K();

	/* Kernel probe */
	TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
	    tnf_opaque, vnode, vp,
	    tnf_ulong, pages_pageout, pgpgout,
	    tnf_ulong, pages_freed, dfree,
	    tnf_ulong, pages_reclaimed, pgrec);
}
/*
 * Allocate, map and carve up the per-queue tx header-copy cache.
 *
 * Reserves and allocates enough physical pages to hold one
 * VMXNET3_HDR_COPY_SIZE slot per command-ring descriptor, maps them
 * into a contiguous kernel VA window, and records each slot's VA/PA
 * pair in cache->nodes.
 *
 * Returns DDI_SUCCESS, or DDI_FAILURE with all partially acquired
 * resources released.
 */
int
vmxnet3s_txcache_init(vmxnet3s_softc_t *dp, vmxnet3s_txq_t *txq)
{
	int		i;
	int		ndescrs;
	int		node;
	page_t		*page;
	struct seg	kseg;
	vmxnet3s_txcache_t *cache = &dp->txcache;
	dev_info_t	*dip = dp->dip;

	cache->num_pages = ((txq->cmdring.size * VMXNET3_HDR_COPY_SIZE) +
	    (PAGESIZE - 1)) / PAGESIZE;

	/* Allocate pages */
	if (!page_resv(cache->num_pages, KM_SLEEP)) {
		dev_err(dip, CE_WARN, "failed to reserve %d pages",
		    cache->num_pages);
		goto out;
	}

	if (!page_create_wait(cache->num_pages, 0)) {
		dev_err(dip, CE_WARN, "failed to create %d pages",
		    cache->num_pages);
		goto unresv_pages;
	}

	cache->pages = kmem_zalloc(cache->num_pages * sizeof (page_t *),
	    KM_SLEEP);
	cache->page_maps = kmem_zalloc(cache->num_pages * sizeof (page_t *),
	    KM_SLEEP);

	kseg.s_as = &kas;
	for (i = 0; i < cache->num_pages; i++) {
		/* Prefer the freelist; fall back to the cachelist. */
		page = page_get_freelist(&kvp, 0, &kseg,
		    (caddr_t)(i * PAGESIZE), PAGESIZE, 0, NULL);
		if (page == NULL) {
			page = page_get_cachelist(&kvp, 0, &kseg,
			    (caddr_t)(i * PAGESIZE), 0, NULL);
			if (page == NULL)
				goto free_pages;
			if (!PP_ISAGED(page))
				page_hashout(page, NULL);
		}
		PP_CLRFREE(page);
		PP_CLRAGED(page);
		cache->pages[i] = page;
	}

	/* Demote each page's exclusive lock to a shared lock. */
	for (i = 0; i < cache->num_pages; i++)
		page_downgrade(cache->pages[i]);

	/* Allocate virtual address range for mapping pages */
	cache->window = vmem_alloc(heap_arena, ptob(cache->num_pages),
	    VM_SLEEP);
	ASSERT(cache->window);

	cache->num_nodes = txq->cmdring.size;

	/* Map pages */
	for (i = 0; i < cache->num_pages; i++) {
		cache->page_maps[i] = cache->window + ptob(i);
		hat_devload(kas.a_hat, cache->page_maps[i], ptob(1),
		    cache->pages[i]->p_pagenum,
		    PROT_READ | PROT_WRITE | HAT_STRICTORDER,
		    HAT_LOAD_LOCK);
	}

	/* Now setup cache items */
	cache->nodes = kmem_zalloc(txq->cmdring.size *
	    sizeof (vmxnet3s_txcache_node_t), KM_SLEEP);

	ndescrs = txq->cmdring.size;
	node = 0;
	for (i = 0; i < cache->num_pages; i++) {
		caddr_t		va;
		int		j;
		int		lim;
		uint64_t	pa;

		lim = (ndescrs <= VMXNET3_TX_CACHE_ITEMS_PER_PAGE) ? ndescrs :
		    VMXNET3_TX_CACHE_ITEMS_PER_PAGE;
		va = cache->page_maps[i];
		pa = cache->pages[i]->p_pagenum << PAGESHIFT;

		for (j = 0; j < lim; j++) {
			cache->nodes[node].pa = pa;
			cache->nodes[node].va = va;

			pa += VMXNET3_HDR_COPY_SIZE;
			va += VMXNET3_HDR_COPY_SIZE;
			node++;
		}
		ndescrs -= lim;
	}
	return (DDI_SUCCESS);

free_pages:
	/* Pages [0, i) were allocated; return the rest of the wait count. */
	page_create_putback(cache->num_pages - i);
	while (--i >= 0) {
		if (!page_tryupgrade(cache->pages[i])) {
			page_unlock(cache->pages[i]);
			while (!page_lock(cache->pages[i], SE_EXCL, NULL,
			    P_RECLAIM))
				;
		}
		page_free(cache->pages[i], 0);
	}
	/*
	 * BUGFIX: both arrays were allocated with
	 * num_pages * sizeof (page_t *), but the old code freed
	 * cache->pages with num_pages * PAGESIZE (a kmem size mismatch)
	 * and leaked cache->page_maps entirely.
	 */
	kmem_free(cache->pages, cache->num_pages * sizeof (page_t *));
	cache->pages = NULL;
	kmem_free(cache->page_maps, cache->num_pages * sizeof (page_t *));
	cache->page_maps = NULL;

unresv_pages:
	page_unresv(cache->num_pages);

out:
	cache->num_pages = cache->num_nodes = 0;
	return (DDI_FAILURE);
}
/*
 * Release a page previously handed out by the platform layer.
 * PP must be non-NULL and locked (asserted); the only action taken
 * here is dropping that page lock.
 */
void
plat_release_page(page_t *pp)
{
	ASSERT((pp != NULL) && PAGE_LOCKED(pp));
	page_unlock(pp);
}