Example #1
0
/* Read system call. */
static int
sys_read (int handle, void *udst_, unsigned size) 
{
  uint8_t *udst = udst_;
  struct file_descriptor *fd;
  int bytes_read = 0;

  /* Look up file descriptor. */
  if (handle != STDIN_FILENO)
    fd = lookup_file_fd (handle);

  while (size > 0) 
    {
      /* How much to read into this page? */
      size_t page_left = PGSIZE - pg_ofs (udst);
      size_t read_amt = size < page_left ? size : page_left;
      off_t retval;

      /* Check that touching this page is okay. */
      if (!page_lock (udst, true)) 
        thread_exit ();

      /* Read from file into page. */
      if (handle != STDIN_FILENO) 
        {
          retval = file_read (fd->file, udst, read_amt);
          if (retval < 0)
            {
              if (bytes_read == 0)
                bytes_read = -1; 
              break;
            }
          bytes_read += retval; 
        }
      else 
        {
          size_t i;
          
          for (i = 0; i < read_amt; i++) 
            udst[i] = input_getc ();
          bytes_read = read_amt;
        }

      /* Release page. */
      page_unlock (udst);

      /* If it was a short read we're done. */
      if (retval != (off_t) read_amt)
        break;

      /* Advance. */
      udst += retval;
      size -= retval;
    }
   
  return bytes_read;
}
Example #2
0
/*
 * Any changes to this routine must also be carried over to
 * devmap_free_pages() in the seg_dev driver. This is because
 * we currently don't have a special kernel segment for non-paged
 * kernel memory that is exported by drivers to user space.
 */
static void
segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp,
    void (*func)(page_t *))
{
	page_t *pp;
	caddr_t addr = inaddr;
	caddr_t eaddr;
	pgcnt_t npages = btopr(size);

	ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0);
	ASSERT(vp != NULL);

	if (kvseg.s_base == NULL) {
		segkmem_gc_list_t *gc = inaddr;
		gc->gc_arena = vmp;
		gc->gc_size = size;
		gc->gc_next = segkmem_gc_list;
		segkmem_gc_list = gc;
		return;
	}

	hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK);

	for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
#if defined(__x86)
		pp = page_find(vp, (u_offset_t)(uintptr_t)addr);
		if (pp == NULL)
			panic("segkmem_free: page not found");
		if (!page_tryupgrade(pp)) {
			/*
			 * Some other thread has a sharelock. Wait for
			 * it to drop the lock so we can free this page.
			 */
			page_unlock(pp);
			pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr,
			    SE_EXCL);
		}
#else
		pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr, SE_EXCL);
#endif
		if (pp == NULL)
			panic("segkmem_free: page not found");
		/* Clear p_lckcnt so page_destroy() doesn't update availrmem */
		pp->p_lckcnt = 0;
		if (func)
			func(pp);
		else
			page_destroy(pp, 0);
	}
	if (func == NULL)
		page_unresv(npages);

	if (vmp != NULL)
		vmem_free(vmp, inaddr, size);

}
Example #3
0
/* Write system call. */
static int
sys_write (int handle, void *usrc_, unsigned size) 
{
  uint8_t *usrc = usrc_;
  struct file_descriptor *fd = NULL;
  int bytes_written = 0;

  /* Lookup up file descriptor. */
  if (handle != STDOUT_FILENO)
    fd = lookup_file_fd (handle);

  while (size > 0) 
    {
      /* How much bytes to write to this page? */
      size_t page_left = PGSIZE - pg_ofs (usrc);
      size_t write_amt = size < page_left ? size : page_left;
      off_t retval;

      /* Check that we can touch this user page. */
      if (!page_lock (usrc, false)) 
        thread_exit ();

      /* Do the write. */
      if (handle == STDOUT_FILENO)
        {
          putbuf ((char *) usrc, write_amt);
          retval = write_amt;
        }
      else
        retval = file_write (fd->file, usrc, write_amt);

      /* Release user page. */
      page_unlock (usrc);

      /* Handle return value. */
      if (retval < 0) 
        {
          if (bytes_written == 0)
            bytes_written = -1;
          break;
        }
      bytes_written += retval;

      /* If it was a short write we're done. */
      if (retval != (off_t) write_amt)
        break;

      /* Advance. */
      usrc += retval;
      size -= retval;
    }
 
  return bytes_written;
}
Example #4
0
/*
 * Generic entry point used to release the "shared/exclusive" lock
 * and the "p_iolock" on pages after i/o is complete.
 */
void
pvn_io_done(page_t *plist)
{
    page_t *pp;

    while (plist != NULL) {
        pp = plist;
        page_sub(&plist, pp);
        page_io_unlock(pp);
        page_unlock(pp);
    }
}
Example #5
0
/* Creates a copy of user string US in kernel memory
   and returns it as a page that must be freed with
   palloc_free_page().
   Truncates the string at PGSIZE bytes in size.
   Call thread_exit() if any of the user accesses are invalid. */
static char *
copy_in_string (const char *us) 
{
  char *ks;
  char *upage;
  size_t length;
 
  ks = palloc_get_page (0);
  if (ks == NULL) 
    thread_exit ();

  length = 0;
  for (;;) 
    {
      upage = pg_round_down (us);
      if (!page_lock (upage, false))
        goto lock_error;

      for (; us < upage + PGSIZE; us++) 
        {
          ks[length++] = *us;
          if (*us == '\0') 
            {
              page_unlock (upage);
              return ks; 
            }
          else if (length >= PGSIZE) 
            goto too_long_error;
        }

      page_unlock (upage);
    }

 too_long_error:
  page_unlock (upage);
 lock_error:
  palloc_free_page (ks);
  thread_exit ();
}
Example #6
0
/*
 * Page retire self-test. For now, it always returns 0.
 */
int
page_retire_test(void)
{
	page_t *first, *pp, *cpp, *cpp2, *lpp;

	/*
	 * Tests the corner case where a large page can't be retired
	 * because one of the constituent pages is locked. We mark
	 * one page to be retired and try to retire it, and mark the
	 * other page to be retired but don't try to retire it, so
	 * that page_unlock() in the failure path will recurse and try
	 * to retire THAT page. This is the worst possible situation
	 * we can get ourselves into.
	 */
	memsegs_lock(0);
	pp = first = page_first();
	do {
		if (pp->p_szc && PP_PAGEROOT(pp) == pp) {
			cpp = pp + 1;
			lpp = PP_ISFREE(pp)? pp : pp + 2;
			cpp2 = pp + 3;
			if (!page_trylock(lpp, pp == lpp? SE_EXCL : SE_SHARED))
				continue;
			if (!page_trylock(cpp, SE_EXCL)) {
				page_unlock(lpp);
				continue;
			}
			page_settoxic(cpp, PR_FMA | PR_BUSY);
			page_settoxic(cpp2, PR_FMA);
			page_tryretire(cpp);	/* will fail */
			page_unlock(lpp);
			(void) page_retire(cpp->p_pagenum, PR_FMA);
			(void) page_retire(cpp2->p_pagenum, PR_FMA);
		}
	} while ((pp = page_next(pp)) != first);
	memsegs_unlock(0);

	return (0);
}
Example #7
0
/*ARGSUSED*/
static int
bootfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
    cred_t *cr)
{
	bootfs_node_t *bnp = vp->v_data;
	page_t *pp, *fpp;
	pfn_t pfn;

	for (;;) {
		/* Easy case where the page exists */
		pp = page_lookup(vp, off, rw == S_CREATE ? SE_EXCL : SE_SHARED);
		if (pp != NULL) {
			if (pl != NULL) {
				pl[0] = pp;
				pl[1] = NULL;
			} else {
				page_unlock(pp);
			}
			return (0);
		}

		pp = page_create_va(vp, off, PAGESIZE, PG_EXCL | PG_WAIT, seg,
		    addr);

		/*
		 * If we didn't get the page, that means someone else beat us to
		 * creating this so we need to try again.
		 */
		if (pp != NULL)
			break;
	}

	pfn = btop((bnp->bvn_addr + off) & PAGEMASK);
	fpp = page_numtopp_nolock(pfn);

	if (ppcopy(fpp, pp) == 0) {
		pvn_read_done(pp, B_ERROR);
		return (EIO);
	}

	if (pl != NULL) {
		pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw);
	} else {
		pvn_io_done(pp);
	}

	return (0);
}
Example #8
0
void
boot_mapin(caddr_t addr, size_t size)
{
	caddr_t	 eaddr;
	page_t	*pp;
	pfn_t	 pfnum;

	if (page_resv(btop(size), KM_NOSLEEP) == 0)
		panic("boot_mapin: page_resv failed");

	for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
		pfnum = va_to_pfn(addr);
		if (pfnum == PFN_INVALID)
			continue;
		if ((pp = page_numtopp_nolock(pfnum)) == NULL)
			panic("boot_mapin(): No pp for pfnum = %lx", pfnum);

		/*
		 * must break up any large pages that may have constituent
		 * pages being utilized for BOP_ALLOC()'s before calling
		 * page_numtopp().The locking code (ie. page_reclaim())
		 * can't handle them
		 */
		if (pp->p_szc != 0)
			page_boot_demote(pp);

		pp = page_numtopp(pfnum, SE_EXCL);
		if (pp == NULL || PP_ISFREE(pp))
			panic("boot_alloc: pp is NULL or free");

		/*
		 * If the cage is on but doesn't yet contain this page,
		 * mark it as non-relocatable.
		 */
		if (kcage_on && !PP_ISNORELOC(pp)) {
			PP_SETNORELOC(pp);
			PLCNT_XFER_NORELOC(pp);
		}

		(void) page_hashin(pp, &kvp, (u_offset_t)(uintptr_t)addr, NULL);
		pp->p_lckcnt = 1;
#if defined(__x86)
		page_downgrade(pp);
#else
		page_unlock(pp);
#endif
	}
}
Example #9
0
/*
 * Take a retired page off the retired-pages vnode and clear the toxic flags.
 * If "free" is nonzero, lock it and put it back on the freelist. If "free"
 * is zero, the caller already holds SE_EXCL lock so we simply unretire it
 * and don't do anything else with it.
 *
 * Any unretire messages are printed from this routine.
 *
 * Returns 0 if page pp was unretired; else an error code.
 */
int
page_unretire_pp(page_t *pp, int free)
{
	/*
	 * To be retired, a page has to be hashed onto the retired_pages vnode
	 * and have PR_RETIRED set in p_toxic.
	 */
	if (free == 0 || page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) {
		ASSERT(PAGE_EXCL(pp));
		PR_DEBUG(prd_ulocked);
		if (!PP_RETIRED(pp)) {
			PR_DEBUG(prd_unotretired);
			page_unlock(pp);
			return (page_retire_done(pp, PRD_UNR_NOT));
		}

		PR_MESSAGE(CE_NOTE, 1, "unretiring retired"
		    " page 0x%08x.%08x", mmu_ptob((uint64_t)pp->p_pagenum));
		if (pp->p_toxic & PR_FMA) {
			PR_DECR_KSTAT(pr_fma);
		} else if (pp->p_toxic & PR_UE) {
			PR_DECR_KSTAT(pr_ue);
		} else {
			PR_DECR_KSTAT(pr_mce);
		}
		page_clrtoxic(pp, PR_ALLFLAGS);

		if (free) {
			PR_DEBUG(prd_udestroy);
			page_destroy(pp, 0);
		} else {
			PR_DEBUG(prd_uhashout);
			page_hashout(pp, NULL);
		}

		mutex_enter(&freemem_lock);
		availrmem++;
		mutex_exit(&freemem_lock);

		PR_DEBUG(prd_uunretired);
		PR_DECR_KSTAT(pr_retired);
		PR_INCR_KSTAT(pr_unretired);
		return (page_retire_done(pp, PRD_UNR_SUCCESS));
	}
	PR_DEBUG(prd_unotlocked);
	return (page_retire_done(pp, PRD_UNR_CANTLOCK));
}
Example #10
0
File: osi_vm.c Project: hwr/openafs
/* Zero no-longer-used part of last page, when truncating a file
 *
 * This function only exists for Solaris.  Other platforms do not support it.
 *
 * Locking:  the vcache entry lock is held.  It is released and re-obtained.
 * The caller will raise activeV (to prevent pageins), but this function must
 * be called first, since it causes a pagein.
 */
void
osi_VM_PreTruncate(struct vcache *avc, int alen, afs_ucred_t *acred)
{
    page_t *pp;
    int pageOffset = (alen & PAGEOFFSET);

    if (pageOffset == 0) {
	return;
    }

    ReleaseWriteLock(&avc->lock);
    AFS_GUNLOCK();
    pp = page_lookup(AFSTOV(avc), alen - pageOffset, SE_EXCL);
    if (pp) {
	pagezero(pp, pageOffset, PAGESIZE - pageOffset);
	page_unlock(pp);
    }
    AFS_GLOCK();
    ObtainWriteLock(&avc->lock, 563);
}
int
plat_hold_page(pfn_t pfn, int lock, page_t **pp_ret)
{
	page_t *pp = page_numtopp_nolock(pfn);

	if (pp == NULL)
		return (PLAT_HOLD_FAIL);

#if !defined(__xpv)
	/*
	 * Pages are locked SE_SHARED because some hypervisors
	 * like xVM ESX reclaim Guest OS memory by locking
	 * it SE_EXCL so we want to leave these pages alone.
	 */
	if (lock == PLAT_HOLD_LOCK) {
		ASSERT(pp_ret != NULL);
		if (page_trylock(pp, SE_SHARED) == 0)
			return (PLAT_HOLD_FAIL);
	}
#else	/* __xpv */
	if (lock == PLAT_HOLD_LOCK) {
		ASSERT(pp_ret != NULL);
		if (page_trylock(pp, SE_EXCL) == 0)
			return (PLAT_HOLD_FAIL);
	}

	if (mfn_list[pfn] == MFN_INVALID) {
		/* We failed - release the lock if we grabbed it earlier */
		if (lock == PLAT_HOLD_LOCK) {
			page_unlock(pp);
		}
		return (PLAT_HOLD_FAIL);
	}
#endif	/* __xpv */

	if (lock == PLAT_HOLD_LOCK)
		*pp_ret = pp;

	return (PLAT_HOLD_OK);
}
Example #12
0
/* Copies SIZE bytes from kernel address SRC to user address
   UDST.
   Call thread_exit() if any of the user accesses are invalid. */
static void
copy_out (void *udst_, const void *src_, size_t size) 
{
  uint8_t *udst = udst_;
  const uint8_t *src = src_;

  while (size > 0) 
    {
      size_t chunk_size = PGSIZE - pg_ofs (udst);
      if (chunk_size > size)
        chunk_size = size;
      
      if (!page_lock (udst, false))
        thread_exit ();
      memcpy (udst, src, chunk_size);
      page_unlock (udst);

      udst += chunk_size;
      src += chunk_size;
      size -= chunk_size;
    }
}
Example #13
0
/*
 * Act like page_destroy(), but instead of freeing the page, hash it onto
 * the retired_pages vnode, and mark it retired.
 *
 * For fun, we try to scrub the page until it's squeaky clean.
 * availrmem is adjusted here.
 */
static void
page_retire_destroy(page_t *pp)
{
	u_offset_t off = (u_offset_t)((uintptr_t)pp);

	ASSERT(PAGE_EXCL(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_szc == 0);
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(!pp->p_vnode);

	page_clr_all_props(pp);
	pagescrub(pp, 0, MMU_PAGESIZE);

	pp->p_next = NULL;
	pp->p_prev = NULL;
	if (page_hashin(pp, retired_pages, off, NULL) == 0) {
		cmn_err(CE_PANIC, "retired page %p hashin failed", (void *)pp);
	}

	page_settoxic(pp, PR_RETIRED);
	page_clrtoxic(pp, PR_BUSY);
	page_retire_dequeue(pp);
	PR_INCR_KSTAT(pr_retired);

	if (pp->p_toxic & PR_FMA) {
		PR_INCR_KSTAT(pr_fma);
	} else if (pp->p_toxic & PR_UE) {
		PR_INCR_KSTAT(pr_ue);
	} else {
		PR_INCR_KSTAT(pr_mce);
	}

	mutex_enter(&freemem_lock);
	availrmem--;
	mutex_exit(&freemem_lock);

	page_unlock(pp);
}
Example #14
0
/* Copies SIZE bytes from user address USRC to kernel address
   DST.
   Call thread_exit() if any of the user accesses are invalid. */
static void
copy_in (void *dst_, const void *usrc_, size_t size) 
{
  uint8_t *dst = dst_;
  const uint8_t *usrc = usrc_;

  while (size > 0) 
    {
      size_t chunk_size = PGSIZE - pg_ofs (usrc);
      if (chunk_size > size)
        chunk_size = size;
      
      if (!page_lock (usrc, false))
        thread_exit ();
      memcpy (dst, usrc, chunk_size);
      page_unlock (usrc);

      dst += chunk_size;
      usrc += chunk_size;
      size -= chunk_size;
    }
}
Example #15
0
/*
 * Scan page_t's and issue I/O's for modified pages.
 *
 * Also coalesces consecutive small sized free pages into the next larger
 * pagesize. This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 */
static void
fsflush_do_pages()
{
	vnode_t		*vp;
	ulong_t		pcount;
	hrtime_t	timer = gethrtime();
	ulong_t		releases = 0;
	ulong_t		nexamined = 0;
	ulong_t		nlocked = 0;
	ulong_t		nmodified = 0;
	ulong_t		ncoalesce = 0;
	ulong_t		cnt;
	int		mod;
	int		fspage = 1;
	u_offset_t	offset;
	uint_t		szc;

	page_t		*coal_page = NULL;  /* 1st page in group to coalesce */
	uint_t		coal_szc = 0;	    /* size code, coal_page->p_szc */
	uint_t		coal_cnt = 0;	    /* count of pages seen */

	static ulong_t	nscan = 0;
	static pgcnt_t	last_total_pages = 0;
	static page_t	*pp = NULL;

	/*
	 * Check to see if total_pages has changed.
	 */
	if (total_pages != last_total_pages) {
		last_total_pages = total_pages;
		nscan = (last_total_pages * (tune.t_fsflushr))/v.v_autoup;
	}

	if (pp == NULL)
		pp = memsegs->pages;

	pcount = 0;
	while (pcount < nscan) {

		/*
		 * move to the next page, skipping over large pages
		 * and issuing prefetches.
		 */
		if (pp->p_szc && fspage == 0) {
			pfn_t pfn;

			pfn  = page_pptonum(pp);
			cnt = page_get_pagecnt(pp->p_szc);
			cnt -= pfn & (cnt - 1);
		} else
			cnt = 1;

		pp = page_nextn(pp, cnt);
		prefetch_page_r((void *)pp);
		ASSERT(pp != NULL);
		pcount += cnt;

		/*
		 * Do a bunch of dirty tests (ie. no locking) to determine
		 * if we can quickly skip this page. These tests are repeated
		 * after acquiring the page lock.
		 */
		++nexamined;
		if (PP_ISSWAP(pp)) {
			fspage = 0;
			coal_page = NULL;
			continue;
		}

		/*
		 * skip free pages too, but try coalescing them into larger
		 * pagesizes
		 */
		if (PP_ISFREE(pp)) {
			/*
			 * skip pages with a file system identity or that
			 * are already maximum size
			 */
			fspage = 0;
			szc = pp->p_szc;
			if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
				coal_page = NULL;
				continue;
			}

			/*
			 * If not in a coalescing candidate page or the size
			 * codes are different, start a new candidate.
			 */
			if (coal_page == NULL || coal_szc != szc) {

				/*
				 * page must be properly aligned
				 */
				if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
					coal_page = NULL;
					continue;
				}
				coal_page = pp;
				coal_szc = szc;
				coal_cnt = 1;
				continue;
			}

			/*
			 * acceptable to add this to existing candidate page
			 */
			++coal_cnt;
			if (coal_cnt < fsf_pgcnt[coal_szc])
				continue;

			/*
			 * We've got enough pages to coalesce, so do it.
			 * After promoting, we clear coal_page, so it will
			 * take another pass to promote this to an even
			 * larger page.
			 */
			++ncoalesce;
			(void) page_promote_size(coal_page, coal_szc);
			coal_page = NULL;
			continue;
		} else {
			coal_page = NULL;
		}

		if (PP_ISKAS(pp) ||
		    PAGE_LOCKED(pp) ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0) {
			fspage = 0;
			continue;
		}


		/*
		 * Reject pages that can't be "exclusively" locked.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;
		++nlocked;


		/*
		 * After locking the page, redo the above checks.
		 * Since we locked the page, leave out the PAGE_LOCKED() test.
		 */
		vp = pp->p_vnode;
		if (PP_ISSWAP(pp) ||
		    PP_ISFREE(pp) ||
		    vp == NULL ||
		    PP_ISKAS(pp) ||
		    (vp->v_flag & VISSWAP) != 0) {
			page_unlock(pp);
			fspage = 0;
			continue;
		}
		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
			page_unlock(pp);
			continue;
		}

		fspage = 1;
		ASSERT(vp->v_type != VCHR);

		/*
		 * Check the modified bit. Leaving the bit alone in hardware.
		 * It will be cleared if we do the putpage.
		 */
		if (IS_VMODSORT(vp))
			mod = hat_ismod(pp);
		else
			mod = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;

		if (mod) {
			++nmodified;
			offset = pp->p_offset;

			/*
			 * Hold the vnode before releasing the page lock
			 * to prevent it from being freed and re-used by
			 * some other thread.
			 */
			VN_HOLD(vp);

			page_unlock(pp);

			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
			    kcred, NULL);

			VN_RELE(vp);
		} else {

			/*
			 * Catch any pages which should be on the cache list,
			 * but aren't yet.
			 */
			if (hat_page_is_mapped(pp) == 0) {
				++releases;
				(void) page_release(pp, 1);
			} else {
				page_unlock(pp);
			}
		}
	}

	/*
	 * maintain statistics
	 * reset every million wakeups, just to avoid overflow
	 */
	if (++fsf_cycles == 1000000) {
		fsf_cycles = 0;
		fsf_total.fsf_scan = 0;
		fsf_total.fsf_examined = 0;
		fsf_total.fsf_locked = 0;
		fsf_total.fsf_modified = 0;
		fsf_total.fsf_coalesce = 0;
		fsf_total.fsf_time = 0;
		fsf_total.fsf_releases = 0;
	} else {
		fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
		fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
		fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
		fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
		fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
		fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
		fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
	}
}
Example #16
0
/*
 * Allocate a large page to back the virtual address range
 * [addr, addr + size).  If addr is NULL, allocate the virtual address
 * space as well.
 */
static void *
segkmem_xalloc_lp(vmem_t *vmp, void *inaddr, size_t size, int vmflag,
    uint_t attr, page_t *(*page_create_func)(void *, size_t, int, void *),
    void *pcarg)
{
	caddr_t addr = inaddr, pa;
	size_t  lpsize = segkmem_lpsize;
	pgcnt_t npages = btopr(size);
	pgcnt_t nbpages = btop(lpsize);
	pgcnt_t nlpages = size >> segkmem_lpshift;
	size_t  ppasize = nbpages * sizeof (page_t *);
	page_t *pp, *rootpp, **ppa, *pplist = NULL;
	int i;

	vmflag |= VM_NOSLEEP;

	if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
		return (NULL);
	}

	/*
	 * allocate an array we need for hat_memload_array.
	 * we use a separate arena to avoid recursion.
	 * we will not need this array when hat_memload_array learns pp++
	 */
	if ((ppa = vmem_alloc(segkmem_ppa_arena, ppasize, vmflag)) == NULL) {
		goto fail_array_alloc;
	}

	if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL)
		goto fail_vmem_alloc;

	ASSERT(((uintptr_t)addr & (lpsize - 1)) == 0);

	/* create all the pages */
	for (pa = addr, i = 0; i < nlpages; i++, pa += lpsize) {
		if ((pp = page_create_func(pa, lpsize, vmflag, pcarg)) == NULL)
			goto fail_page_create;
		page_list_concat(&pplist, &pp);
	}

	/* at this point we have all the resource to complete the request */
	while ((rootpp = pplist) != NULL) {
		for (i = 0; i < nbpages; i++) {
			ASSERT(pplist != NULL);
			pp = pplist;
			page_sub(&pplist, pp);
			ASSERT(page_iolock_assert(pp));
			page_io_unlock(pp);
			ppa[i] = pp;
		}
		/*
		 * Load the locked entry. It's OK to preload the entry into the
		 * TSB since we now support large mappings in the kernel TSB.
		 */
		hat_memload_array(kas.a_hat,
		    (caddr_t)(uintptr_t)rootpp->p_offset, lpsize,
		    ppa, (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr,
		    HAT_LOAD_LOCK);

		for (--i; i >= 0; --i) {
			ppa[i]->p_lckcnt = 1;
			page_unlock(ppa[i]);
		}
	}

	vmem_free(segkmem_ppa_arena, ppa, ppasize);
	return (addr);

fail_page_create:
	while ((rootpp = pplist) != NULL) {
		for (i = 0, pp = pplist; i < nbpages; i++, pp = pplist) {
			ASSERT(pp != NULL);
			page_sub(&pplist, pp);
			ASSERT(page_iolock_assert(pp));
			page_io_unlock(pp);
		}
		page_destroy_pages(rootpp);
	}

	if (inaddr == NULL)
		vmem_free(vmp, addr, size);

fail_vmem_alloc:
	vmem_free(segkmem_ppa_arena, ppa, ppasize);

fail_array_alloc:
	page_unresv(npages);

	return (NULL);
}
Example #17
0
/*ARGSUSED*/
static faultcode_t
segkmem_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t size,
	enum fault_type type, enum seg_rw rw)
{
	pgcnt_t npages;
	spgcnt_t pg;
	page_t *pp;
	struct vnode *vp = seg->s_data;

	ASSERT(RW_READ_HELD(&seg->s_as->a_lock));

	if (seg->s_as != &kas || size > seg->s_size ||
	    addr < seg->s_base || addr + size > seg->s_base + seg->s_size)
		panic("segkmem_fault: bad args");

	/*
	 * If it is one of segkp pages, call segkp_fault.
	 */
	if (segkp_bitmap && seg == &kvseg &&
	    BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
		return (SEGOP_FAULT(hat, segkp, addr, size, type, rw));

	if (rw != S_READ && rw != S_WRITE && rw != S_OTHER)
		return (FC_NOSUPPORT);

	npages = btopr(size);

	switch (type) {
	case F_SOFTLOCK:	/* lock down already-loaded translations */
		for (pg = 0; pg < npages; pg++) {
			pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr,
			    SE_SHARED);
			if (pp == NULL) {
				/*
				 * Hmm, no page. Does a kernel mapping
				 * exist for it?
				 */
				if (!hat_probe(kas.a_hat, addr)) {
					addr -= PAGESIZE;
					while (--pg >= 0) {
						pp = page_find(vp, (u_offset_t)
						    (uintptr_t)addr);
						if (pp)
							page_unlock(pp);
						addr -= PAGESIZE;
					}
					return (FC_NOMAP);
				}
			}
			addr += PAGESIZE;
		}
		if (rw == S_OTHER)
			hat_reserve(seg->s_as, addr, size);
		return (0);
	case F_SOFTUNLOCK:
		while (npages--) {
			pp = page_find(vp, (u_offset_t)(uintptr_t)addr);
			if (pp)
				page_unlock(pp);
			addr += PAGESIZE;
		}
		return (0);
	default:
		return (FC_NOSUPPORT);
	}
	/*NOTREACHED*/
}
Example #18
0
/*
 * page_retire() - the front door in to retire a page.
 *
 * Ideally, page_retire() would instantly retire the requested page.
 * Unfortunately, some pages are locked or otherwise tied up and cannot be
 * retired right away. To deal with that, bits are set in p_toxic of the
 * page_t. An attempt is made to lock the page; if the attempt is successful,
 * we instantly unlock the page counting on page_unlock() to notice p_toxic
 * is nonzero and to call back into page_retire_pp(). Success is determined
 * by looking to see whether the page has been retired once it has been
 * unlocked.
 *
 * Returns:
 *
 *   - 0 on success,
 *   - EINVAL when the PA is whacko,
 *   - EIO if the page is already retired or already pending retirement, or
 *   - EAGAIN if the page could not be _immediately_ retired but is pending.
 */
int
page_retire(uint64_t pa, uchar_t reason)
{
	page_t	*pp;

	ASSERT(reason & PR_REASONS);		/* there must be a reason */
	ASSERT(!(reason & ~PR_REASONS));	/* but no other bits */

	pp = page_numtopp_nolock(mmu_btop(pa));
	if (pp == NULL) {
		PR_MESSAGE(CE_WARN, 1, "Cannot schedule clearing of error on"
		    " page 0x%08x.%08x; page is not relocatable memory", pa);
		return (page_retire_done(pp, PRD_INVALID_PA));
	}
	if (PP_RETIRED(pp)) {
		PR_DEBUG(prd_dup1);
		return (page_retire_done(pp, PRD_DUPLICATE));
	}

	if ((reason & PR_UE) && !PP_TOXIC(pp)) {
		PR_MESSAGE(CE_NOTE, 1, "Scheduling clearing of error on"
		    " page 0x%08x.%08x", pa);
	} else if (PP_PR_REQ(pp)) {
		PR_DEBUG(prd_dup2);
		return (page_retire_done(pp, PRD_DUPLICATE));
	} else {
		PR_MESSAGE(CE_NOTE, 1, "Scheduling removal of"
		    " page 0x%08x.%08x", pa);
	}
	page_settoxic(pp, reason);
	page_retire_enqueue(pp);

	/*
	 * And now for some magic.
	 *
	 * We marked this page toxic up above.  All there is left to do is
	 * to try to lock the page and then unlock it.  The page lock routines
	 * will intercept the page and retire it if they can.  If the page
	 * cannot be locked, 's okay -- page_unlock() will eventually get it,
	 * or the background thread, until then the lock routines will deny
	 * further locks on it.
	 */
	if (MTBF(pr_calls, pr_mtbf) && page_trylock(pp, SE_EXCL)) {
		PR_DEBUG(prd_prlocked);
		page_unlock(pp);
	} else {
		PR_DEBUG(prd_prnotlocked);
	}

	if (PP_RETIRED(pp)) {
		PR_DEBUG(prd_prretired);
		return (0);
	} else {
		cv_signal(&pr_cv);
		PR_INCR_KSTAT(pr_failed);

		if (pp->p_toxic & PR_MSG) {
			return (page_retire_done(pp, PRD_FAILED));
		} else {
			return (page_retire_done(pp, PRD_PENDING));
		}
	}
}
Example #19
0
/*
 * page_retire_pp() decides what to do with a failing page.
 *
 * When we get a free page (e.g. the scrubber or in the free path) life is
 * nice because the page is clean and marked free -- those always retire
 * nicely. From there we go by order of difficulty. If the page has data,
 * we attempt to relocate its contents to a suitable replacement page. If
 * that does not succeed, we look to see if it is clean. If after all of
 * this we have a clean, unmapped page (which we usually do!), we retire it.
 * If the page is not clean, we still process it regardless on a UE; for
 * CEs or FMA requests, we fail leaving the page in service. The page will
 * eventually be tried again later. We always return with the page unlocked
 * since we are called from page_unlock().
 *
 * We don't call panic or do anything fancy down in here. Our boss the DE
 * gets paid handsomely to do his job of figuring out what to do when errors
 * occur. We just do what he tells us to do.
 */
static int
page_retire_pp(page_t *pp)
{
	int		toxic;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_iolock_state == 0);
	ASSERT(pp->p_szc == 0);

	PR_DEBUG(prd_top);
	PR_TYPES(pp);

	toxic = pp->p_toxic;
	ASSERT(toxic & PR_REASONS);

	if ((toxic & (PR_FMA | PR_MCE)) && !(toxic & PR_UE) &&
	    page_retire_limit()) {
		page_clrtoxic(pp, PR_FMA | PR_MCE | PR_MSG | PR_BUSY);
		page_retire_dequeue(pp);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_LIMIT));
	}

	if (PP_ISFREE(pp)) {
		int dbgnoreclaim = MTBF(recl_calls, recl_mtbf) == 0;

		PR_DEBUG(prd_free);

		if (dbgnoreclaim || !page_reclaim(pp, NULL)) {
			PR_DEBUG(prd_noreclaim);
			PR_INCR_KSTAT(pr_failed);
			/*
			 * page_reclaim() returns with `pp' unlocked when
			 * it fails.
			 */
			if (dbgnoreclaim)
				page_unlock(pp);
			return (page_retire_done(pp, PRD_FAILED));
		}
	}
	ASSERT(!PP_ISFREE(pp));

	if ((toxic & PR_UE) == 0 && pp->p_vnode && !PP_ISNORELOCKERNEL(pp) &&
	    MTBF(reloc_calls, reloc_mtbf)) {
		page_t *newpp;
		spgcnt_t count;

		/*
		 * If we can relocate the page, great! newpp will go
		 * on without us, and everything is fine.  Regardless
		 * of whether the relocation succeeds, we are still
		 * going to take `pp' around back and shoot it.
		 */
		newpp = NULL;
		if (page_relocate(&pp, &newpp, 0, 0, &count, NULL) == 0) {
			PR_DEBUG(prd_reloc);
			page_unlock(newpp);
			ASSERT(hat_page_getattr(pp, P_MOD) == 0);
		} else {
			PR_DEBUG(prd_relocfail);
		}
	}

	if (hat_ismod(pp)) {
		PR_DEBUG(prd_mod);
		PR_INCR_KSTAT(pr_failed);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_FAILED));
	}

	if (PP_ISKVP(pp)) {
		PR_DEBUG(prd_kern);
		PR_INCR_KSTAT(pr_failed_kernel);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_FAILED));
	}

	if (pp->p_lckcnt || pp->p_cowcnt) {
		PR_DEBUG(prd_locked);
		PR_INCR_KSTAT(pr_failed);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_FAILED));
	}

	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	ASSERT(!hat_page_is_mapped(pp));

	/*
	 * If the page is modified, and was not relocated; we can't
	 * retire it without dropping data on the floor. We have to
	 * recheck after unloading since the dirty bit could have been
	 * set since we last checked.
	 */
	if (hat_ismod(pp)) {
		PR_DEBUG(prd_mod_late);
		PR_INCR_KSTAT(pr_failed);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_FAILED));
	}

	if (pp->p_vnode) {
		PR_DEBUG(prd_hashout);
		page_hashout(pp, NULL);
	}
	ASSERT(!pp->p_vnode);

	/*
	 * The problem page is locked, demoted, unmapped, not free,
	 * hashed out, and not COW or mlocked (whew!).
	 *
	 * Now we select our ammunition, take it around back, and shoot it.
	 */
	if (toxic & PR_UE) {
		if (page_retire_transient_ue(pp)) {
			PR_DEBUG(prd_uescrubbed);
			return (page_retire_done(pp, PRD_UE_SCRUBBED));
		} else {
			PR_DEBUG(prd_uenotscrubbed);
			page_retire_destroy(pp);
			return (page_retire_done(pp, PRD_SUCCESS));
		}
	} else if (toxic & PR_FMA) {
		PR_DEBUG(prd_fma);
		page_retire_destroy(pp);
		return (page_retire_done(pp, PRD_SUCCESS));
	} else if (toxic & PR_MCE) {
		PR_DEBUG(prd_mce);
		page_retire_destroy(pp);
		return (page_retire_done(pp, PRD_SUCCESS));
	}
	panic("page_retire_pp: bad toxic flags %d", toxic);
	/*NOTREACHED*/
}
Example #20
0
/* Read system call. */
static int
sys_read (int handle, void *udst_, unsigned size) 
{
  uint8_t *udst = udst_;
  struct file_descriptor *fd;
  int bytes_read = 0;

  fd = lookup_fd (handle);
  while (size > 0) 
    {
      /* How much to read into this page? */
      size_t page_left = PGSIZE - pg_ofs (udst);
      size_t read_amt = size < page_left ? size : page_left;
      off_t retval;

      /* Read from file into page. */
      if (handle != STDIN_FILENO) 
        {
          if (!page_lock (udst, true)) 
            thread_exit (); 
          lock_acquire (&fs_lock);
          retval = file_read (fd->file, udst, read_amt);
          lock_release (&fs_lock);
          page_unlock (udst);
        }
      else 
        {
          size_t i;
          
          for (i = 0; i < read_amt; i++) 
            {
              char c = input_getc ();
              if (!page_lock (udst, true)) 
                thread_exit ();
              udst[i] = c;
              page_unlock (udst);
            }
          bytes_read = read_amt;
        }
      
      /* Check success. */
      if (retval < 0)
        {
          if (bytes_read == 0)
            bytes_read = -1; 
          break;
        }
      bytes_read += retval; 
      if (retval != (off_t) read_amt) 
        {
          /* Short read, so we're done. */
          break; 
        }

      /* Advance. */
      udst += retval;
      size -= retval;
    }
   
  return bytes_read;
}
Example #21
0
/*
 * Allocate pages to back the virtual address range [addr, addr + size).
 * If addr is NULL, allocate the virtual address space as well.
 */
void *
segkmem_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag, uint_t attr,
	page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg)
{
	page_t *ppl;
	caddr_t addr = inaddr;
	pgcnt_t npages = btopr(size);
	int allocflag;

	if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL)
		return (NULL);

	ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0);

	if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
		if (inaddr == NULL)
			vmem_free(vmp, addr, size);
		return (NULL);
	}

	ppl = page_create_func(addr, size, vmflag, pcarg);
	if (ppl == NULL) {
		if (inaddr == NULL)
			vmem_free(vmp, addr, size);
		page_unresv(npages);
		return (NULL);
	}

	/*
	 * Under certain conditions, we need to let the HAT layer know
	 * that it cannot safely allocate memory.  Allocations from
	 * the hat_memload vmem arena always need this, to prevent
	 * infinite recursion.
	 *
	 * In addition, the x86 hat cannot safely do memory
	 * allocations while in vmem_populate(), because there
	 * is no simple bound on its usage.
	 */
	if (vmflag & VM_MEMLOAD)
		allocflag = HAT_NO_KALLOC;
#if defined(__x86)
	else if (vmem_is_populator())
		allocflag = HAT_NO_KALLOC;
#endif
	else
		allocflag = 0;

	while (ppl != NULL) {
		page_t *pp = ppl;
		page_sub(&ppl, pp);
		ASSERT(page_iolock_assert(pp));
		ASSERT(PAGE_EXCL(pp));
		page_io_unlock(pp);
		hat_memload(kas.a_hat, (caddr_t)(uintptr_t)pp->p_offset, pp,
		    (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr,
		    HAT_LOAD_LOCK | allocflag);
		pp->p_lckcnt = 1;
#if defined(__x86)
		page_downgrade(pp);
#else
		if (vmflag & SEGKMEM_SHARELOCKED)
			page_downgrade(pp);
		else
			page_unlock(pp);
#endif
	}

	return (addr);
}
Example #22
0
void
plat_freelist_process(int mnode)
{
	page_t		*page, **freelist;
	page_t		*bdlist[STARFIRE_MAX_BOARDS];
	page_t		 **sortlist[STARFIRE_MAX_BOARDS];
	uint32_t	idx, idy, size, color, max_color, lbn;
	uint32_t	bd_flags, bd_cnt, result, bds;
	kmutex_t	*pcm;
	int 		mtype;

	/* for each page size */
	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		for (size = 0; size < mmu_page_sizes; size++) {

			/*
			 * Compute the maximum # of phys colors based on
			 * page size.
			 */
			max_color = page_get_pagecolors(size);

			/* for each color */
			for (color = 0; color < max_color; color++) {

				bd_cnt = 0;
				bd_flags = 0;
				for (idx = 0; idx < STARFIRE_MAX_BOARDS;
				    idx++) {
					bdlist[idx] = NULL;
					sortlist[idx] = NULL;
				}

				/* find freelist */
				freelist = &PAGE_FREELISTS(mnode, size,
				    color, mtype);

				if (*freelist == NULL)
					continue;

				/* acquire locks */
				pcm = PC_BIN_MUTEX(mnode, color, PG_FREE_LIST);
				mutex_enter(pcm);

				/*
				 * read freelist & sort pages by logical
				 * board number
				 */
				/* grab pages till last one. */
				while (*freelist) {
					page = *freelist;
					result = page_trylock(page, SE_EXCL);

					ASSERT(result);

					/* Delete from freelist */
					if (size != 0) {
						page_vpsub(freelist, page);
					} else {
						mach_page_sub(freelist, page);
					}

					/* detect the lbn */
					lbn = PFN_2_LBN(page->p_pagenum);

					/* add to bdlist[lbn] */
					if (size != 0) {
						page_vpadd(&bdlist[lbn], page);
					} else {
						mach_page_add(&bdlist[lbn],
						    page);
					}

					/* if lbn new */
					if ((bd_flags & (1 << lbn)) == 0) {
						bd_flags |= (1 << lbn);
						bd_cnt++;
					}
					page_unlock(page);
				}

				/*
				 * Make the sortlist so
				 * bd_cnt choices show up
				 */
				bds = 0;
				for (idx = 0; idx < STARFIRE_MAX_BOARDS;
				    idx++) {
					if (bdlist[idx])
						sortlist[bds++] = &bdlist[idx];
				}

				/*
				 * Set random start.
				 */
				(void) random_idx(-color);

				/*
				 * now rebuild the freelist by shuffling
				 * pages from bd lists
				 */
				while (bd_cnt) {

					/*
					 * get "random" index between 0 &
					 * bd_cnt
					 */

					ASSERT(bd_cnt &&
					    (bd_cnt < STARFIRE_MAX_BOARDS+1));

					idx = random_idx(bd_cnt);

					page = *sortlist[idx];
					result = page_trylock(page, SE_EXCL);

					ASSERT(result);

					/* Delete from sort_list */
					/*  & Append to freelist */
					/* Big pages use vp_add - 8k don't */
					if (size != 0) {
						page_vpsub(sortlist[idx], page);
						page_vpadd(freelist, page);
					} else {
						mach_page_sub(sortlist[idx],
						    page);
						mach_page_add(freelist, page);
					}

					/* needed for indexing tmp lists */
					lbn = PFN_2_LBN(page->p_pagenum);

					/*
					 * if this was the last page on this
					 * list?
					 */
					if (*sortlist[idx] == NULL) {

						/* have to find brd list */

						/* idx is lbn? -- No! */
						/* sortlist, brdlist */
						/*  have diff indexs */
						bd_flags &= ~(1 << lbn);
						--bd_cnt;

						/*
						 * redo the sortlist so only
						 * bd_cnt choices show up
						 */
						bds = 0;
						for (idy = 0;
						    idy < STARFIRE_MAX_BOARDS;
						    idy++) {
							if (bdlist[idy]) {
								sortlist[bds++]
								/* CSTYLED */
								= &bdlist[idy];
							}
						}
					}
					page_unlock(page);
				}
				mutex_exit(pcm);
			}
		}
	}
}
Example #23
0
/*
 * Process a vnode's page list for all pages whose offset is >= off.
 * Pages are to either be free'd, invalidated, or written back to disk.
 *
 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 * is specified, otherwise they are "shared" locked.
 *
 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 *
 * Special marker page_t's are inserted in the list in order
 * to keep track of where we are in the list when locks are dropped.
 *
 * Note the list is circular and insertions can happen only at the
 * head and tail of the list. The algorithm ensures visiting all pages
 * on the list in the following way:
 *
 *    Drop two marker pages at the end of the list.
 *
 *    Move one marker page backwards towards the start of the list until
 *    it is at the list head, processing the pages passed along the way.
 *
 *    Due to race conditions when the vphm mutex is dropped, additional pages
 *    can be added to either end of the list, so we'll continue to move
 *    the marker and process pages until it is up against the end marker.
 *
 * There is one special exit condition. If we are processing a VMODSORT
 * vnode and only writing back modified pages, we can stop as soon as
 * we run into an unmodified page.  This makes fsync(3) operations fast.
 */
int
pvn_vplist_dirty(
    vnode_t		*vp,
    u_offset_t	off,
    int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
                        size_t *, int, cred_t *),
    int		flags,
    cred_t		*cred)
{
    page_t		*pp;
    page_t		*mark;		/* marker page that moves toward head */
    page_t		*end;		/* marker page at end of list */
    int		err = 0;
    int		error;
    kmutex_t	*vphm;
    se_t		se;
    page_t		**where_to_move;

    ASSERT(vp->v_type != VCHR);

    if (vp->v_pages == NULL)
        return (0);


    /*
     * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
     *
     * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
     * from getting blocked while flushing pages to a dead NFS server.
     */
    mutex_enter(&vp->v_lock);
    if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
        mutex_exit(&vp->v_lock);
        return (EAGAIN);
    }

    while (vp->v_flag & VVMLOCK)
        cv_wait(&vp->v_cv, &vp->v_lock);

    if (vp->v_pages == NULL) {
        mutex_exit(&vp->v_lock);
        return (0);
    }

    vp->v_flag |= VVMLOCK;
    mutex_exit(&vp->v_lock);


    /*
     * Set up the marker pages used to walk the list
     */
    end = kmem_cache_alloc(marker_cache, KM_SLEEP);
    end->p_vnode = vp;
    end->p_offset = (u_offset_t)-2;
    mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
    mark->p_vnode = vp;
    mark->p_offset = (u_offset_t)-1;

    /*
     * Grab the lock protecting the vnode's page list
     * note that this lock is dropped at times in the loop.
     */
    vphm = page_vnode_mutex(vp);
    mutex_enter(vphm);
    if (vp->v_pages == NULL)
        goto leave;

    /*
     * insert the markers and loop through the list of pages
     */
    page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
    page_vpadd(&mark->p_vpnext, end);
    for (;;) {

        /*
         * If only doing an async write back, then we can
         * stop as soon as we get to start of the list.
         */
        if (flags == B_ASYNC && vp->v_pages == mark)
            break;

        /*
         * otherwise stop when we've gone through all the pages
         */
        if (mark->p_vpprev == end)
            break;

        pp = mark->p_vpprev;
        if (vp->v_pages == pp)
            where_to_move = &vp->v_pages;
        else
            where_to_move = &pp->p_vpprev->p_vpnext;

        ASSERT(pp->p_vnode == vp);

        /*
         * If just flushing dirty pages to disk and this vnode
         * is using a sorted list of pages, we can stop processing
         * as soon as we find an unmodified page. Since all the
         * modified pages are visited first.
         */
        if (IS_VMODSORT(vp) &&
                !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
            if (!hat_ismod(pp) && !page_io_locked(pp)) {
#ifdef  DEBUG
                /*
                 * For debug kernels examine what should be
                 * all the remaining clean pages, asserting
                 * that they are not modified.
                 */
                page_t	*chk = pp;
                int	attr;

                page_vpsub(&vp->v_pages, mark);
                page_vpadd(where_to_move, mark);
                do {
                    chk = chk->p_vpprev;
                    ASSERT(chk != end);
                    if (chk == mark)
                        continue;
                    attr = hat_page_getattr(chk, P_MOD |
                                            P_REF);
                    if ((attr & P_MOD) == 0)
                        continue;
                    panic("v_pages list not all clean: "
                          "page_t*=%p vnode=%p off=%lx "
                          "attr=0x%x last clean page_t*=%p\n",
                          (void *)chk, (void *)chk->p_vnode,
                          (long)chk->p_offset, attr,
                          (void *)pp);
                } while (chk != vp->v_pages);
#endif
                break;
            } else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
                /*
                 * Couldn't get io lock, wait until IO is done.
                 * Block only for sync IO since we don't want
                 * to block async IO.
                 */
                mutex_exit(vphm);
                page_io_wait(pp);
                mutex_enter(vphm);
                continue;
            }
        }

        /*
         * Skip this page if the offset is out of the desired range.
         * Just move the marker and continue.
         */
        if (pp->p_offset < off) {
            page_vpsub(&vp->v_pages, mark);
            page_vpadd(where_to_move, mark);
            continue;
        }

        /*
         * If we are supposed to invalidate or free this
         * page, then we need an exclusive lock.
         */
        se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

        /*
         * We must acquire the page lock for all synchronous
         * operations (invalidate, free and write).
         */
        if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
            /*
             * If the page_lock() drops the mutex
             * we must retry the loop.
             */
            if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
                continue;

            /*
             * It's ok to move the marker page now.
             */
            page_vpsub(&vp->v_pages, mark);
            page_vpadd(where_to_move, mark);
        } else {

            /*
             * update the marker page for all remaining cases
             */
            page_vpsub(&vp->v_pages, mark);
            page_vpadd(where_to_move, mark);

            /*
             * For write backs, If we can't lock the page, it's
             * invalid or in the process of being destroyed.  Skip
             * it, assuming someone else is writing it.
             */
            if (!page_trylock(pp, se))
                continue;
        }

        ASSERT(pp->p_vnode == vp);

        /*
         * Successfully locked the page, now figure out what to
         * do with it. Free pages are easily dealt with, invalidate
         * if desired or just go on to the next page.
         */
        if (PP_ISFREE(pp)) {
            if ((flags & B_INVAL) == 0) {
                page_unlock(pp);
                continue;
            }

            /*
             * Invalidate (destroy) the page.
             */
            mutex_exit(vphm);
            page_destroy_free(pp);
            mutex_enter(vphm);
            continue;
        }

        /*
         * pvn_getdirty() figures out what do do with a dirty page.
         * If the page is dirty, the putapage() routine will write it
         * and will kluster any other adjacent dirty pages it can.
         *
         * pvn_getdirty() and `(*putapage)' unlock the page.
         */
        mutex_exit(vphm);
        if (pvn_getdirty(pp, flags)) {
            error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
            if (!err)
                err = error;
        }
        mutex_enter(vphm);
    }
    page_vpsub(&vp->v_pages, mark);
    page_vpsub(&vp->v_pages, end);

leave:
    /*
     * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds
     */
    mutex_exit(vphm);
    kmem_cache_free(marker_cache, mark);
    kmem_cache_free(marker_cache, end);
    mutex_enter(&vp->v_lock);
    vp->v_flag &= ~VVMLOCK;
    cv_broadcast(&vp->v_cv);
    mutex_exit(&vp->v_lock);
    return (err);
}
Example #24
0
/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
 * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
 * operation and is only to be considered if it doesn't involve any
 * waiting here.  B_TRUNC indicates that the file is being truncated
 * and so no i/o needs to be done. B_FORCE indicates that the page
 * must be destroyed so don't try wrting it out.
 *
 * The caller must ensure that the page is locked.  Returns 1, if
 * the page should be written back (the "iolock" is held in this
 * case), or 0 if the page has been dealt with or has been
 * unlocked.
 */
int
pvn_getdirty(page_t *pp, int flags)
{
    ASSERT((flags & (B_INVAL | B_FREE)) ?
           PAGE_EXCL(pp) : PAGE_SHARED(pp));
    ASSERT(PP_ISFREE(pp) == 0);

    /*
     * If trying to invalidate or free a logically `locked' page,
     * forget it.  Don't need page_struct_lock to check p_lckcnt and
     * p_cowcnt as the page is exclusively locked.
     */
    if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
            (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
        page_unlock(pp);
        return (0);
    }

    /*
     * Now acquire the i/o lock so we can add it to the dirty
     * list (if necessary).  We avoid blocking on the i/o lock
     * in the following cases:
     *
     *	If B_DELWRI is set, which implies that this request is
     *	due to a klustering operartion.
     *
     *	If this is an async (B_ASYNC) operation and we are not doing
     *	invalidation (B_INVAL) [The current i/o or fsflush will ensure
     *	that the the page is written out].
     */
    if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
        if (!page_io_trylock(pp)) {
            page_unlock(pp);
            return (0);
        }
    } else {
        page_io_lock(pp);
    }

    /*
     * If we want to free or invalidate the page then
     * we need to unload it so that anyone who wants
     * it will have to take a minor fault to get it.
     * Otherwise, we're just writing the page back so we
     * need to sync up the hardwre and software mod bit to
     * detect any future modifications.  We clear the
     * software mod bit when we put the page on the dirty
     * list.
     */
    if (flags & (B_INVAL | B_FREE)) {
        (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
    } else {
        (void) hat_pagesync(pp, HAT_SYNC_ZERORM);
    }

    if (!hat_ismod(pp) || (flags & B_TRUNC)) {
        /*
         * Don't need to add it to the
         * list after all.
         */
        page_io_unlock(pp);
        if (flags & B_INVAL) {
            /*LINTED: constant in conditional context*/
            VN_DISPOSE(pp, B_INVAL, 0, kcred);
        } else if (flags & B_FREE) {
            /*LINTED: constant in conditional context*/
            VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
        } else {
            /*
             * This is advisory path for the callers
             * of VOP_PUTPAGE() who prefer freeing the
             * page _only_ if no one else is accessing it.
             * E.g. segmap_release()
             *
             * The above hat_ismod() check is useless because:
             * (1) we may not be holding SE_EXCL lock;
             * (2) we've not unloaded _all_ translations
             *
             * Let page_release() do the heavy-lifting.
             */
            (void) page_release(pp, 1);
        }
        return (0);
    }

    /*
     * Page is dirty, get it ready for the write back
     * and add page to the dirty list.
     */
    hat_clrrefmod(pp);

    /*
     * If we're going to free the page when we're done
     * then we can let others try to use it starting now.
     * We'll detect the fact that they used it when the
     * i/o is done and avoid freeing the page.
     */
    if (flags & B_FREE)
        page_downgrade(pp);


    TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);

    return (1);
}
Example #25
0
void
pvn_write_done(page_t *plist, int flags)
{
    int dfree = 0;
    int pgrec = 0;
    int pgout = 0;
    int pgpgout = 0;
    int anonpgout = 0;
    int anonfree = 0;
    int fspgout = 0;
    int fsfree = 0;
    int execpgout = 0;
    int execfree = 0;
    page_t *pp;
    struct cpu *cpup;
    struct vnode *vp = NULL;	/* for probe */
    uint_t ppattr;
    kmutex_t *vphm = NULL;

    ASSERT((flags & B_READ) == 0);

    /*
     * If we are about to start paging anyway, start freeing pages.
     */
    if (write_free && freemem < lotsfree + pages_before_pager &&
            (flags & B_ERROR) == 0) {
        flags |= B_FREE;
    }

    /*
     * Handle each page involved in the i/o operation.
     */
    while (plist != NULL) {
        pp = plist;
        ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
        page_sub(&plist, pp);

        /* Kernel probe support */
        if (vp == NULL)
            vp = pp->p_vnode;

        if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
            /*
             * Move page to the top of the v_page list.
             * Skip pages modified during IO.
             */
            vphm = page_vnode_mutex(vp);
            mutex_enter(vphm);
            if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
                page_vpsub(&vp->v_pages, pp);
                page_vpadd(&vp->v_pages, pp);
            }
            mutex_exit(vphm);
        }

        if (flags & B_ERROR) {
            /*
             * Write operation failed.  We don't want
             * to destroy (or free) the page unless B_FORCE
             * is set. We set the mod bit again and release
             * all locks on the page so that it will get written
             * back again later when things are hopefully
             * better again.
             * If B_INVAL and B_FORCE is set we really have
             * to destroy the page.
             */
            if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
                page_io_unlock(pp);
                /*LINTED: constant in conditional context*/
                VN_DISPOSE(pp, B_INVAL, 0, kcred);
            } else {
                hat_setmod_only(pp);
                page_io_unlock(pp);
                page_unlock(pp);
            }
        } else if (flags & B_INVAL) {
            /*
             * XXX - Failed writes with B_INVAL set are
             * not handled appropriately.
             */
            page_io_unlock(pp);
            /*LINTED: constant in conditional context*/
            VN_DISPOSE(pp, B_INVAL, 0, kcred);
        } else if (flags & B_FREE ||!hat_page_is_mapped(pp)) {
            /*
             * Update statistics for pages being paged out
             */
            if (pp->p_vnode) {
                if (IS_SWAPFSVP(pp->p_vnode)) {
                    anonpgout++;
                } else {
                    if (pp->p_vnode->v_flag & VVMEXEC) {
                        execpgout++;
                    } else {
                        fspgout++;
                    }
                }
            }
            page_io_unlock(pp);
            pgout = 1;
            pgpgout++;
            TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
                    "page_ws_out:pp %p", pp);

            /*
             * The page_struct_lock need not be acquired to
             * examine "p_lckcnt" and "p_cowcnt" since we'll
             * have an "exclusive" lock if the upgrade succeeds.
             */
            if (page_tryupgrade(pp) &&
                    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
                /*
                 * Check if someone has reclaimed the
                 * page.  If ref and mod are not set, no
                 * one is using it so we can free it.
                 * The rest of the system is careful
                 * to use the NOSYNC flag to unload
                 * translations set up for i/o w/o
                 * affecting ref and mod bits.
                 *
                 * Obtain a copy of the real hardware
                 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
                 * to avoid having to flush the cache.
                 */
                ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
                                      HAT_SYNC_STOPON_MOD);
ck_refmod:
                if (!(ppattr & (P_REF | P_MOD))) {
                    if (hat_page_is_mapped(pp)) {
                        /*
                         * Doesn't look like the page
                         * was modified so now we
                         * really have to unload the
                         * translations.  Meanwhile
                         * another CPU could've
                         * modified it so we have to
                         * check again.  We don't loop
                         * forever here because now
                         * the translations are gone
                         * and no one can get a new one
                         * since we have the "exclusive"
                         * lock on the page.
                         */
                        (void) hat_pageunload(pp,
                                              HAT_FORCE_PGUNLOAD);
                        ppattr = hat_page_getattr(pp,
                                                  P_REF | P_MOD);
                        goto ck_refmod;
                    }
                    /*
                     * Update statistics for pages being
                     * freed
                     */
                    if (pp->p_vnode) {
                        if (IS_SWAPFSVP(pp->p_vnode)) {
                            anonfree++;
                        } else {
                            if (pp->p_vnode->v_flag
                                    & VVMEXEC) {
                                execfree++;
                            } else {
                                fsfree++;
                            }
                        }
                    }
                    /*LINTED: constant in conditional ctx*/
                    VN_DISPOSE(pp, B_FREE,
                               (flags & B_DONTNEED), kcred);
                    dfree++;
                } else {
                    page_unlock(pp);
                    pgrec++;
                    TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
                            "page_ws_free:pp %p", pp);
                }
            } else {
                /*
                 * Page is either `locked' in memory
                 * or was reclaimed and now has a
                 * "shared" lock, so release it.
                 */
                page_unlock(pp);
            }
        } else {
            /*
             * Neither B_FREE nor B_INVAL nor B_ERROR.
             * Just release locks.
             */
            page_io_unlock(pp);
            page_unlock(pp);
        }
    }

    CPU_STATS_ENTER_K();
    cpup = CPU;		/* get cpup now that CPU cannot change */
    CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
    CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
    CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
    CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
    CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
    CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
    CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
    CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
    CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
    CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
    CPU_STATS_EXIT_K();

    /* Kernel probe */
    TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
                tnf_opaque,	vnode,			vp,
                tnf_ulong,	pages_pageout,		pgpgout,
                tnf_ulong,	pages_freed,		dfree,
                tnf_ulong,	pages_reclaimed,	pgrec);
}
Example #26
0
int
vmxnet3s_txcache_init(vmxnet3s_softc_t *dp, vmxnet3s_txq_t *txq)
{
	int		i;
	int		ndescrs;
	int		node;
	page_t		*page;
	struct seg	kseg;
	vmxnet3s_txcache_t *cache = &dp->txcache;
	dev_info_t	*dip = dp->dip;

	cache->num_pages = ((txq->cmdring.size * VMXNET3_HDR_COPY_SIZE) +
	    (PAGESIZE - 1)) / PAGESIZE;

	/* Allocate pages */
	if (!page_resv(cache->num_pages, KM_SLEEP)) {
		dev_err(dip, CE_WARN, "failed to reserve %d pages",
		    cache->num_pages);
		goto out;
	}

	if (!page_create_wait(cache->num_pages, 0)) {
		dev_err(dip, CE_WARN, "failed to create %d pages",
		    cache->num_pages);
		goto unresv_pages;
	}

	cache->pages = kmem_zalloc(cache->num_pages * sizeof (page_t *),
	    KM_SLEEP);

	cache->page_maps = kmem_zalloc(cache->num_pages * sizeof (page_t *),
	    KM_SLEEP);

	kseg.s_as = &kas;
	for (i = 0; i < cache->num_pages; i++) {
		page = page_get_freelist(&kvp, 0, &kseg, (caddr_t)(i*PAGESIZE),
		    PAGESIZE, 0, NULL);
		if (page == NULL) {
			page = page_get_cachelist(&kvp, 0, &kseg,
			    (caddr_t)(i * PAGESIZE), 0, NULL);
			if (page == NULL)
				goto free_pages;
			if (!PP_ISAGED(page))
				page_hashout(page, NULL);
		}
		PP_CLRFREE(page);
		PP_CLRAGED(page);
		cache->pages[i] = page;
	}

	for (i = 0; i < cache->num_pages; i++)
		page_downgrade(cache->pages[i]);

	/* Allocate virtual address range for mapping pages */
	cache->window = vmem_alloc(heap_arena, ptob(cache->num_pages),
	    VM_SLEEP);
	ASSERT(cache->window);

	cache->num_nodes = txq->cmdring.size;

	/* Map pages */
	for (i = 0; i < cache->num_pages; i++) {
		cache->page_maps[i] = cache->window + ptob(i);
		hat_devload(kas.a_hat, cache->page_maps[i], ptob(1),
		    cache->pages[i]->p_pagenum,
		    PROT_READ | PROT_WRITE | HAT_STRICTORDER,
		    HAT_LOAD_LOCK);
	}

	/* Now setup cache items */
	cache->nodes = kmem_zalloc(txq->cmdring.size *
	    sizeof (vmxnet3s_txcache_node_t), KM_SLEEP);

	ndescrs = txq->cmdring.size;
	node = 0;
	for (i = 0; i < cache->num_pages; i++) {
		caddr_t		va;
		int		j;
		int		lim;
		uint64_t	pa;

		lim = (ndescrs <= VMXNET3_TX_CACHE_ITEMS_PER_PAGE) ? ndescrs :
		    VMXNET3_TX_CACHE_ITEMS_PER_PAGE;
		va = cache->page_maps[i];
		pa = cache->pages[i]->p_pagenum << PAGESHIFT;

		for (j = 0; j < lim; j++) {
			cache->nodes[node].pa = pa;
			cache->nodes[node].va = va;

			pa += VMXNET3_HDR_COPY_SIZE;
			va += VMXNET3_HDR_COPY_SIZE;
			node++;
		}
		ndescrs -= lim;
	}
	return (DDI_SUCCESS);

free_pages:
	page_create_putback(cache->num_pages - i);
	while (--i >= 0) {
		if (!page_tryupgrade(cache->pages[i])) {
			page_unlock(cache->pages[i]);
			while (!page_lock(cache->pages[i], SE_EXCL, NULL,
			    P_RECLAIM))
				;
		}
		page_free(cache->pages[i], 0);
	}
	kmem_free(cache->pages, cache->num_pages * PAGESIZE);
unresv_pages:
	page_unresv(cache->num_pages);
out:
	cache->num_pages = cache->num_nodes = 0;

	return (DDI_FAILURE);
}
void
plat_release_page(page_t *pp)
{
	ASSERT((pp != NULL) && PAGE_LOCKED(pp));
	page_unlock(pp);
}