/*ARGSUSED*/
void
pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
	u_offset_t off, size_t io_len, enum seg_rw rw)
{
	ssize_t sz;
	page_t *ppcur, **ppp;

	/*
	 * Set up to load plsz worth
	 * starting at the needed page.
	 */
	while (pp != NULL && pp->p_offset != off) {
		/*
		 * Remove page from the i/o list,
		 * release the i/o and the page lock.
		 */
		ppcur = pp;
		page_sub(&pp, ppcur);
		page_io_unlock(ppcur);
		(void) page_release(ppcur, 1);
	}

	if (pp == NULL) {
		pl[0] = NULL;
		return;
	}

	sz = plsz;

	/*
	 * Initialize the page list array.
	 */
	ppp = pl;
	do {
		ppcur = pp;
		*ppp++ = ppcur;
		page_sub(&pp, ppcur);
		page_io_unlock(ppcur);
		if (rw != S_CREATE)
			page_downgrade(ppcur);
		sz -= PAGESIZE;
	} while (sz > 0 && pp != NULL);
	*ppp = NULL;		/* terminate list */

	/*
	 * Now free the remaining pages that weren't
	 * loaded in the page list.
	 */
	while (pp != NULL) {
		ppcur = pp;
		page_sub(&pp, ppcur);
		page_io_unlock(ppcur);
		(void) page_release(ppcur, 1);
	}
}
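/*
 * Usage sketch (an illustration, not from the source): a file system
 * getpage routine that has read a kluster of i/o-locked pages would call
 * pvn_plist_init() to populate the caller's pl[] array.  The
 * fs_getapage_done() wrapper below is hypothetical; only pvn_plist_init()
 * is real.
 */
static void
fs_getapage_done(page_t *pp, page_t *pl[], size_t plsz,
	u_offset_t off, size_t io_len, enum seg_rw rw)
{
	/*
	 * pp heads the i/o list; pages before `off' are trimmed and
	 * released, up to plsz bytes worth are loaded into pl[], and
	 * the remainder is released.
	 */
	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
	ASSERT(pl[0] == NULL || pl[0]->p_offset == off);
}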
/*
 * NB: Don't check recsize or reclen.
 */
int
ipc_port_send_data(const struct ipc_header *ipch, const void *p, size_t len)
{
	struct vm_page *page;
	struct task *task;
	vaddr_t vaddr;
	int error;

	task = current_task();
	ASSERT(task != NULL, "Must have a running task.");

	if (p == NULL) {
		ASSERT(len == 0, "Cannot send no data with a set data length.");
		if (len != 0)
			return (ERROR_INVALID);
		error = ipc_port_send_page(ipch, NULL);
		if (error != 0)
			return (error);
		return (0);
	}
	ASSERT(len != 0, "Cannot send data without data length.");
	ASSERT(len <= PAGE_SIZE, "Cannot send more than a page.");

	error = page_alloc(PAGE_FLAG_DEFAULT, &page);
	if (error != 0)
		return (error);

	error = page_map_direct(&kernel_vm, page, &vaddr);
	if (error != 0) {
		page_release(page);
		return (error);
	}

	memcpy((void *)vaddr, p, len);

	/*
	 * Clear any trailing data so we don't leak kernel information.
	 */
	if (len != PAGE_SIZE)
		memset((void *)(vaddr + len), 0, PAGE_SIZE - len);

	error = page_unmap_direct(&kernel_vm, page, vaddr);
	if (error != 0)
		panic("%s: page_unmap_direct failed: %m", __func__, error);

	error = ipc_port_send_page(ipch, page);
	if (error != 0) {
		page_release(page);
		return (error);
	}
	return (0);
}
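/*
 * Usage sketch (an illustration, not from the source): sending a small
 * buffer on a port.  The header fields mirror those used elsewhere in
 * this code (ipchdr_src, ipchdr_dst, ipchdr_right); the example_send()
 * wrapper itself is hypothetical.
 */
static int
example_send(ipc_port_t src, ipc_port_t dst, const void *buf, size_t len)
{
	struct ipc_header ipch;

	/* Zero any remaining fields (e.g. the recsize/reclen noted above). */
	memset(&ipch, 0, sizeof ipch);
	ipch.ipchdr_src = src;
	ipch.ipchdr_dst = dst;
	ipch.ipchdr_right = IPC_PORT_RIGHT_NONE;

	/* ipc_port_send_data() copies at most one page. */
	if (len > PAGE_SIZE)
		return (ERROR_INVALID);
	return (ipc_port_send_data(&ipch, buf, len));
}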
int
ipc_port_send(const struct ipc_header *ipch, void *vpage)
{
	struct vm_page *page;
	struct task *task;
	struct vm *vm;
	int error;

	task = current_task();
	ASSERT(task != NULL, "Must have a running task.");
	ASSERT(ipch != NULL, "Must have a header.");

	/*
	 * Extract the vm_page for this page.
	 */
	if (vpage == NULL) {
		page = NULL;
	} else {
		if ((task->t_flags & TASK_KERNEL) == 0)
			vm = task->t_vm;
		else
			vm = &kernel_vm;
		error = page_extract(vm, (vaddr_t)vpage, &page);
		if (error != 0)
			return (error);

		if (vm == &kernel_vm) {
			error = page_unmap_direct(vm, page, (vaddr_t)vpage);
			if (error != 0)
				panic("%s: could not unmap direct page: %m",
				    __func__, error);
		} else {
			error = page_unmap(vm, (vaddr_t)vpage, page);
			if (error != 0)
				panic("%s: could not unmap source page: %m",
				    __func__, error);
			error = vm_free_address(vm, (vaddr_t)vpage);
			if (error != 0)
				panic("%s: could not free source page address: %m",
				    __func__, error);
		}
	}

	error = ipc_port_send_page(ipch, page);
	if (error != 0) {
		if (page != NULL)
			page_release(page);
		return (error);
	}
	return (0);
}
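/*
 * Ownership note with a usage sketch (an illustration, not from the
 * source): on entry vpage must be mapped in the sender; ipc_port_send()
 * unmaps it (and frees the user address) before queueing, and releases
 * the page on failure, so the caller must not touch vpage afterwards
 * either way.  example_send_page() is hypothetical.
 */
static int
example_send_page(const struct ipc_header *ipch, void *vpage)
{
	int error;

	error = ipc_port_send(ipch, vpage);
	if (error != 0) {
		/* The page was already released; no cleanup to do here. */
		return (error);
	}
	return (0);
}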
/*
 * Entry point to be used by file system getpage subr's and
 * other such routines which either want to unlock pages (B_ASYNC
 * request) or destroy a list of pages if an error occurred.
 */
void
pvn_read_done(page_t *plist, int flags)
{
	page_t *pp;

	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		if (flags & B_ERROR) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else {
			(void) page_release(pp, 0);
		}
	}
}
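/*
 * Usage sketch (an illustration, not from the source): an async read's
 * completion path would unwind its page list in one call.  The
 * fs_async_read_done() wrapper and its io_err argument are hypothetical.
 */
static void
fs_async_read_done(page_t *plist, int io_err)
{
	/* On error the pages are destroyed; otherwise they are released. */
	pvn_read_done(plist, io_err ? B_ERROR : 0);
}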
/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
 * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
 * operation and is only to be considered if it doesn't involve any
 * waiting here.  B_TRUNC indicates that the file is being truncated
 * and so no i/o needs to be done.  B_FORCE indicates that the page
 * must be destroyed so don't try to write it out.
 *
 * The caller must ensure that the page is locked.  Returns 1 if
 * the page should be written back (the "iolock" is held in this
 * case), or 0 if the page has been dealt with or has been
 * unlocked.
 */
int
pvn_getdirty(page_t *pp, int flags)
{
	ASSERT((flags & (B_INVAL | B_FREE)) ?
	    PAGE_EXCL(pp) : PAGE_SHARED(pp));
	ASSERT(PP_ISFREE(pp) == 0);

	/*
	 * If trying to invalidate or free a logically `locked' page,
	 * forget it.  Don't need page_struct_lock to check p_lckcnt and
	 * p_cowcnt as the page is exclusively locked.
	 */
	if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
	    (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
		page_unlock(pp);
		return (0);
	}

	/*
	 * Now acquire the i/o lock so we can add it to the dirty
	 * list (if necessary).  We avoid blocking on the i/o lock
	 * in the following cases:
	 *
	 *	If B_DELWRI is set, which implies that this request is
	 *	due to a klustering operation.
	 *
	 *	If this is an async (B_ASYNC) operation and we are not doing
	 *	invalidation (B_INVAL) [The current i/o or fsflush will ensure
	 *	that the page is written out].
	 */
	if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
		if (!page_io_trylock(pp)) {
			page_unlock(pp);
			return (0);
		}
	} else {
		page_io_lock(pp);
	}

	/*
	 * If we want to free or invalidate the page then
	 * we need to unload it so that anyone who wants
	 * it will have to take a minor fault to get it.
	 * Otherwise, we're just writing the page back so we
	 * need to sync up the hardware and software mod bit to
	 * detect any future modifications.  We clear the
	 * software mod bit when we put the page on the dirty
	 * list.
	 */
	if (flags & (B_INVAL | B_FREE)) {
		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	} else {
		(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
	}

	if (!hat_ismod(pp) || (flags & B_TRUNC)) {
		/*
		 * Don't need to add it to the
		 * list after all.
		 */
		page_io_unlock(pp);
		if (flags & B_INVAL) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if (flags & B_FREE) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
		} else {
			/*
			 * This is the advisory path for the callers
			 * of VOP_PUTPAGE() who prefer freeing the
			 * page _only_ if no one else is accessing it.
			 * E.g. segmap_release()
			 *
			 * The above hat_ismod() check is useless because:
			 * (1) we may not be holding SE_EXCL lock;
			 * (2) we've not unloaded _all_ translations
			 *
			 * Let page_release() do the heavy-lifting.
			 */
			(void) page_release(pp, 1);
		}
		return (0);
	}

	/*
	 * Page is dirty, get it ready for the write back
	 * and add page to the dirty list.
	 */
	hat_clrrefmod(pp);

	/*
	 * If we're going to free the page when we're done
	 * then we can let others try to use it starting now.
	 * We'll detect the fact that they used it when the
	 * i/o is done and avoid freeing the page.
	 */
	if (flags & B_FREE)
		page_downgrade(pp);

	TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);

	return (1);
}
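/*
 * Usage sketch (an illustration, not from the source): a putapage-style
 * routine calls pvn_getdirty() on a locked page and only proceeds with
 * the write if it returns 1, at which point the i/o lock is held and the
 * page is on the dirty list.  fs_putapage() is hypothetical.
 */
static void
fs_putapage(page_t *pp, int flags)
{
	if (pvn_getdirty(pp, flags) == 0)
		return;		/* page was unlocked or disposed of */

	/*
	 * The page is ready for write back: kluster with neighboring
	 * dirty pages and issue the i/o here.
	 */
}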
/*
 * Handles common work of the VOP_GETPAGE routines by iterating page by page
 * calling the getpage helper for each.
 */
int
pvn_getpages(
	int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
		size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
	struct vnode *vp, u_offset_t off, size_t len, uint_t *protp,
	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
	enum seg_rw rw, struct cred *cred)
{
	page_t **ppp;
	u_offset_t o, eoff;
	size_t sz, xlen;
	int err = 0;

	/* ensure that we have enough space */
	ASSERT(pl == NULL || plsz >= len);

	/*
	 * Loop one page at a time and let the getapage function fill
	 * in the next page in the array.  We only allow one page to be
	 * returned at a time (except for the last page) so that we
	 * don't have any problems with duplicates and other such
	 * painful problems.  This is a very simple minded algorithm,
	 * but it does the job correctly.  We hope that the cost of a
	 * getapage call for a resident page that we might have been
	 * able to get from an earlier call doesn't cost too much.
	 */
	ppp = pl;
	sz = (pl != NULL) ? PAGESIZE : 0;
	eoff = off + len;
	xlen = len;
	for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
	    xlen -= PAGESIZE) {
		if (o + PAGESIZE >= eoff && pl != NULL) {
			/*
			 * Last time through - allow all of
			 * what's left of the pl[] array to be used.
			 */
			sz = plsz - (o - off);
		}
		err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
		    rw, cred);
		if (err) {
			/*
			 * Release any pages we already got.
			 */
			if (o > off && pl != NULL) {
				for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
					(void) page_release(*ppp, 1);
			}
			break;
		}
		if (pl != NULL)
			ppp++;
	}
	return (err);
}
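/*
 * Usage sketch (an illustration, not from the source): a VOP_GETPAGE
 * implementation can hand multi-page requests to pvn_getpages(), passing
 * its single-page worker.  fs_getpage() and fs_getapage() below are
 * hypothetical.
 */
static int fs_getapage(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
    size_t, struct seg *, caddr_t, enum seg_rw, cred_t *);

static int
fs_getpage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
	enum seg_rw rw, cred_t *cr)
{
	if (len <= PAGESIZE)
		return (fs_getapage(vp, off, len, protp, pl, plsz,
		    seg, addr, rw, cr));
	return (pvn_getpages(fs_getapage, vp, off, len, protp, pl, plsz,
	    seg, addr, rw, cr));
}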
/*
 * Scan page_t's and issue I/O's for modified pages.
 *
 * Also coalesces consecutive small sized free pages into the next larger
 * pagesize.  This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 */
static void
fsflush_do_pages()
{
	vnode_t		*vp;
	ulong_t		pcount;
	hrtime_t	timer = gethrtime();
	ulong_t		releases = 0;
	ulong_t		nexamined = 0;
	ulong_t		nlocked = 0;
	ulong_t		nmodified = 0;
	ulong_t		ncoalesce = 0;
	ulong_t		cnt;
	int		mod;
	int		fspage = 1;
	u_offset_t	offset;
	uint_t		szc;

	page_t		*coal_page = NULL;  /* 1st page in group to coalesce */
	uint_t		coal_szc = 0;	    /* size code, coal_page->p_szc */
	uint_t		coal_cnt = 0;	    /* count of pages seen */

	static ulong_t	nscan = 0;
	static pgcnt_t	last_total_pages = 0;
	static page_t	*pp = NULL;

	/*
	 * Check to see if total_pages has changed.
	 */
	if (total_pages != last_total_pages) {
		last_total_pages = total_pages;
		nscan = (last_total_pages * (tune.t_fsflushr)) / v.v_autoup;
	}

	if (pp == NULL)
		pp = memsegs->pages;

	pcount = 0;
	while (pcount < nscan) {

		/*
		 * move to the next page, skipping over large pages
		 * and issuing prefetches.
		 */
		if (pp->p_szc && fspage == 0) {
			pfn_t pfn;

			pfn = page_pptonum(pp);
			cnt = page_get_pagecnt(pp->p_szc);
			cnt -= pfn & (cnt - 1);
		} else
			cnt = 1;

		pp = page_nextn(pp, cnt);
		prefetch_page_r((void *)pp);
		ASSERT(pp != NULL);
		pcount += cnt;

		/*
		 * Do a bunch of dirty tests (i.e. no locking) to determine
		 * if we can quickly skip this page.  These tests are repeated
		 * after acquiring the page lock.
		 */
		++nexamined;
		if (PP_ISSWAP(pp)) {
			fspage = 0;
			coal_page = NULL;
			continue;
		}

		/*
		 * skip free pages too, but try coalescing them into larger
		 * pagesizes
		 */
		if (PP_ISFREE(pp)) {
			/*
			 * skip pages with a file system identity or that
			 * are already maximum size
			 */
			fspage = 0;
			szc = pp->p_szc;
			if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
				coal_page = NULL;
				continue;
			}

			/*
			 * If not in a coalescing candidate page or the size
			 * codes are different, start a new candidate.
			 */
			if (coal_page == NULL || coal_szc != szc) {

				/*
				 * page must be properly aligned
				 */
				if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
					coal_page = NULL;
					continue;
				}
				coal_page = pp;
				coal_szc = szc;
				coal_cnt = 1;
				continue;
			}

			/*
			 * acceptable to add this to existing candidate page
			 */
			++coal_cnt;
			if (coal_cnt < fsf_pgcnt[coal_szc])
				continue;

			/*
			 * We've got enough pages to coalesce, so do it.
			 * After promoting, we clear coal_page, so it will
			 * take another pass to promote this to an even
			 * larger page.
			 */
			++ncoalesce;
			(void) page_promote_size(coal_page, coal_szc);
			coal_page = NULL;
			continue;
		} else {
			coal_page = NULL;
		}

		if (PP_ISKAS(pp) ||
		    PAGE_LOCKED(pp) ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0) {
			fspage = 0;
			continue;
		}

		/*
		 * Reject pages that can't be "exclusively" locked.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;
		++nlocked;

		/*
		 * After locking the page, redo the above checks.
		 * Since we locked the page, leave out the PAGE_LOCKED() test.
		 */
		vp = pp->p_vnode;
		if (PP_ISSWAP(pp) ||
		    PP_ISFREE(pp) ||
		    vp == NULL ||
		    PP_ISKAS(pp) ||
		    (vp->v_flag & VISSWAP) != 0) {
			page_unlock(pp);
			fspage = 0;
			continue;
		}

		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
			page_unlock(pp);
			continue;
		}

		fspage = 1;
		ASSERT(vp->v_type != VCHR);

		/*
		 * Check the modified bit.  Leave the bit alone in hardware;
		 * it will be cleared if we do the putpage.
		 */
		if (IS_VMODSORT(vp))
			mod = hat_ismod(pp);
		else
			mod = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;

		if (mod) {
			++nmodified;
			offset = pp->p_offset;

			/*
			 * Hold the vnode before releasing the page lock
			 * to prevent it from being freed and re-used by
			 * some other thread.
			 */
			VN_HOLD(vp);
			page_unlock(pp);

			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
			    kcred, NULL);
			VN_RELE(vp);
		} else {

			/*
			 * Catch any pages which should be on the cache list,
			 * but aren't yet.
			 */
			if (hat_page_is_mapped(pp) == 0) {
				++releases;
				(void) page_release(pp, 1);
			} else {
				page_unlock(pp);
			}
		}
	}

	/*
	 * maintain statistics
	 * reset every million wakeups, just to avoid overflow
	 */
	if (++fsf_cycles == 1000000) {
		fsf_cycles = 0;
		fsf_total.fsf_scan = 0;
		fsf_total.fsf_examined = 0;
		fsf_total.fsf_locked = 0;
		fsf_total.fsf_modified = 0;
		fsf_total.fsf_coalesce = 0;
		fsf_total.fsf_time = 0;
		fsf_total.fsf_releases = 0;
	} else {
		fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
		fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
		fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
		fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
		fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
		fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
		fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
	}
}
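/*
 * Scan-rate arithmetic (an illustration, not from the source): nscan is
 * sized so that all of memory is examined every v.v_autoup seconds.  For
 * example, with total_pages = 1048576, tune.t_fsflushr = 1 and
 * v.v_autoup = 30, each wakeup scans 1048576 * 1 / 30 ~= 34952 pages.
 * A hypothetical driver loop (fsflush_loop() is not from the source;
 * delay() and hz are assumed to be the usual kernel tick primitives):
 */
static void
fsflush_loop(void)
{
	for (;;) {
		delay(tune.t_fsflushr * hz);	/* ticks between scans */
		fsflush_do_pages();
	}
}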
/*
 * XXX
 * receive could take a task-local port number like an fd and speed lookup
 * and minimize locking.
 */
int
ipc_port_receive(ipc_port_t port, struct ipc_header *ipch, void **vpagep)
{
	struct ipc_message *ipcmsg;
	struct ipc_port *ipcp;
	struct task *task;
	vaddr_t vaddr;
	int error, error2;

	task = current_task();
	ASSERT(task != NULL, "Must have a running task.");
	ASSERT(ipch != NULL, "Must be able to copy out header.");

	IPC_PORTS_LOCK();
	ipcp = ipc_port_lookup(port);
	if (ipcp == NULL) {
		IPC_PORTS_UNLOCK();
		return (ERROR_NOT_FOUND);
	}
	IPC_PORTS_UNLOCK();

	if (!ipc_port_right_check(ipcp, task, IPC_PORT_RIGHT_RECEIVE)) {
		IPC_PORT_UNLOCK(ipcp);
		return (ERROR_NO_RIGHT);
	}

	if (TAILQ_EMPTY(&ipcp->ipcp_msgs)) {
		IPC_PORT_UNLOCK(ipcp);
		return (ERROR_AGAIN);
	}

	ipcmsg = TAILQ_FIRST(&ipcp->ipcp_msgs);
	ASSERT(ipcmsg != NULL, "Queue must not change out from under us.");
	ASSERT(ipcmsg->ipcmsg_header.ipchdr_dst == ipcp->ipcp_port,
	    "Destination must be this port.");
	TAILQ_REMOVE(&ipcp->ipcp_msgs, ipcmsg, ipcmsg_link);
	IPC_PORT_UNLOCK(ipcp);

	/*
	 * Insert any passed rights.
	 */
	if (ipcmsg->ipcmsg_header.ipchdr_right != IPC_PORT_RIGHT_NONE) {
		ipcp = ipc_port_lookup(ipcmsg->ipcmsg_header.ipchdr_src);
		if (ipcp == NULL)
			panic("%s: port disappeared.", __func__);
		error = ipc_port_right_insert(ipcp, task,
		    ipcmsg->ipcmsg_header.ipchdr_right);
		if (error != 0)
			panic("%s: granting rights failed: %m",
			    __func__, error);
		IPC_PORT_UNLOCK(ipcp);
	}

	if (ipcmsg->ipcmsg_page == NULL) {
		if (vpagep != NULL)
			*vpagep = NULL;
	} else {
		if (vpagep == NULL) {
			/*
			 * A task may refuse a page flip for any number of
			 * reasons.
			 */
			page_release(ipcmsg->ipcmsg_page);
		} else {
			/*
			 * Map this page into the receiving task.
			 */
			if ((task->t_flags & TASK_KERNEL) == 0) {
				/*
				 * User task.
				 */
				error = vm_alloc_address(task->t_vm, &vaddr,
				    1, false);
				if (error != 0) {
					page_release(ipcmsg->ipcmsg_page);
					free(ipcmsg);
					return (error);
				}
				error = page_map(task->t_vm, vaddr,
				    ipcmsg->ipcmsg_page);
				if (error != 0) {
					error2 = vm_free_address(task->t_vm,
					    vaddr);
					if (error2 != 0)
						panic("%s: vm_free_address failed: %m",
						    __func__, error2);
					page_release(ipcmsg->ipcmsg_page);
					free(ipcmsg);
					return (error);
				}
			} else {
				/*
				 * Kernel task.
				 */
				error = page_map_direct(&kernel_vm,
				    ipcmsg->ipcmsg_page, &vaddr);
				if (error != 0) {
					page_release(ipcmsg->ipcmsg_page);
					free(ipcmsg);
					return (error);
				}
			}
			*vpagep = (void *)vaddr;
		}
	}

	*ipch = ipcmsg->ipcmsg_header;
	free(ipcmsg);
	return (0);
}
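/*
 * Usage sketch (an illustration, not from the source): a server polling
 * a port on which it holds the receive right.  ERROR_AGAIN means the
 * message queue was empty; handle_message() is hypothetical.
 */
static void
example_receive_loop(ipc_port_t port)
{
	struct ipc_header ipch;
	void *vpage;
	int error;

	for (;;) {
		error = ipc_port_receive(port, &ipch, &vpage);
		if (error == ERROR_AGAIN)
			continue;	/* nothing queued yet */
		if (error != 0)
			break;
		/*
		 * handle_message(&ipch, vpage);
		 * If a page was flipped in, vpage is now mapped in this
		 * task and the task owns it.
		 */
	}
}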