void
vmxnet3s_txcache_release(vmxnet3s_softc_t *dp)
{
	int i;
	int rc;
	vmxnet3s_txcache_t *cache = &dp->txcache;

	/* Unmap pages */
	hat_unload(kas.a_hat, cache->window, ptob(cache->num_pages),
	    HAT_UNLOAD_UNLOCK);
	vmem_free(heap_arena, cache->window, ptob(cache->num_pages));

	/* Free pages */
	for (i = 0; i < cache->num_pages; i++) {
		rc = page_tryupgrade(cache->pages[i]);
		if (!rc) {
			page_unlock(cache->pages[i]);
			while (!page_lock(cache->pages[i], SE_EXCL, NULL,
			    P_RECLAIM))
				;
		}
		page_free(cache->pages[i], 0);
	}
	page_unresv(cache->num_pages);
	kmem_free(cache->pages, cache->num_pages * sizeof (page_t *));
	kmem_free(cache->page_maps, cache->num_pages * sizeof (page_t *));
	kmem_free(cache->nodes,
	    cache->num_nodes * sizeof (vmxnet3s_txcache_node_t));
}
/* Read system call. */
static int
sys_read (int handle, void *udst_, unsigned size)
{
  uint8_t *udst = udst_;
  struct file_descriptor *fd = NULL;
  int bytes_read = 0;

  /* Look up file descriptor. */
  if (handle != STDIN_FILENO)
    fd = lookup_file_fd (handle);

  while (size > 0)
    {
      /* How much to read into this page? */
      size_t page_left = PGSIZE - pg_ofs (udst);
      size_t read_amt = size < page_left ? size : page_left;
      off_t retval;

      /* Check that touching this page is okay. */
      if (!page_lock (udst, true))
        thread_exit ();

      /* Read from file into page. */
      if (handle != STDIN_FILENO)
        {
          retval = file_read (fd->file, udst, read_amt);
          if (retval < 0)
            {
              if (bytes_read == 0)
                bytes_read = -1;
              /* Don't leave the page locked on the error path. */
              page_unlock (udst);
              break;
            }
          bytes_read += retval;
        }
      else
        {
          size_t i;

          for (i = 0; i < read_amt; i++)
            udst[i] = input_getc ();
          /* Console input always supplies READ_AMT bytes; record that
             so the bookkeeping below can advance. */
          retval = read_amt;
          bytes_read += retval;
        }

      /* Release page. */
      page_unlock (udst);

      /* If it was a short read we're done. */
      if (retval != (off_t) read_amt)
        break;

      /* Advance. */
      udst += retval;
      size -= retval;
    }

  return bytes_read;
}
/* Write system call. */
static int
sys_write (int handle, void *usrc_, unsigned size)
{
  uint8_t *usrc = usrc_;
  struct file_descriptor *fd = NULL;
  int bytes_written = 0;

  /* Look up file descriptor. */
  if (handle != STDOUT_FILENO)
    fd = lookup_file_fd (handle);

  while (size > 0)
    {
      /* How many bytes to write to this page? */
      size_t page_left = PGSIZE - pg_ofs (usrc);
      size_t write_amt = size < page_left ? size : page_left;
      off_t retval;

      /* Check that we can touch this user page. */
      if (!page_lock (usrc, false))
        thread_exit ();

      /* Do the write. */
      if (handle == STDOUT_FILENO)
        {
          putbuf ((char *) usrc, write_amt);
          retval = write_amt;
        }
      else
        retval = file_write (fd->file, usrc, write_amt);

      /* Release user page. */
      page_unlock (usrc);

      /* Handle return value. */
      if (retval < 0)
        {
          if (bytes_written == 0)
            bytes_written = -1;
          break;
        }
      bytes_written += retval;

      /* If it was a short write we're done. */
      if (retval != (off_t) write_amt)
        break;

      /* Advance. */
      usrc += retval;
      size -= retval;
    }

  return bytes_written;
}
/* Creates a copy of user string US in kernel memory and returns it
   as a page that must be freed with palloc_free_page().  Calls
   thread_exit() if the string is longer than PGSIZE bytes or if any
   of the user accesses are invalid. */
static char *
copy_in_string (const char *us)
{
  char *ks;
  char *upage;
  size_t length;

  ks = palloc_get_page (0);
  if (ks == NULL)
    thread_exit ();

  length = 0;
  for (;;)
    {
      upage = pg_round_down (us);
      if (!page_lock (upage, false))
        goto lock_error;

      for (; us < upage + PGSIZE; us++)
        {
          ks[length++] = *us;
          if (*us == '\0')
            {
              page_unlock (upage);
              return ks;
            }
          else if (length >= PGSIZE)
            goto too_long_error;
        }

      page_unlock (upage);
    }

 too_long_error:
  page_unlock (upage);
 lock_error:
  palloc_free_page (ks);
  thread_exit ();
}
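/* Illustrative, self-contained sketch (not part of Pintos): the same
   page-bounded string copy as copy_in_string(), written for ordinary
   process memory so it can be compiled and run on its own.
   DEMO_PGSIZE, demo_pg_round_down(), and demo_copy_string() are names
   invented for this example, and there is no page_lock()/page_unlock()
   because plain process memory needs no pinning.  Returns the copied
   length including the terminator, or 0 if SRC has no terminator
   within CAP bytes. */
#include <stddef.h>
#include <stdint.h>

#define DEMO_PGSIZE 4096
#define demo_pg_round_down(p) \
        ((const char *) ((uintptr_t) (p) & ~(uintptr_t) (DEMO_PGSIZE - 1)))

static size_t
demo_copy_string (char *dst, const char *src, size_t cap)
{
  size_t length = 0;

  for (;;)
    {
      /* Only walk to the end of the page SRC currently points into;
         the kernel version locks exactly this page while it scans. */
      const char *page_end = demo_pg_round_down (src) + DEMO_PGSIZE;

      for (; src < page_end; src++)
        {
          dst[length++] = *src;
          if (*src == '\0')
            return length;
          else if (length >= cap)
            return 0;           /* too long; caller would bail out */
        }
      /* Crossed a page boundary: the kernel version unlocks the old
         page here and locks the next one at the top of the loop. */
    }
}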
/* Copies SIZE bytes from user address USRC to kernel address DST.
   Calls thread_exit() if any of the user accesses are invalid. */
static void
copy_in (void *dst_, const void *usrc_, size_t size)
{
  uint8_t *dst = dst_;
  const uint8_t *usrc = usrc_;

  while (size > 0)
    {
      size_t chunk_size = PGSIZE - pg_ofs (usrc);
      if (chunk_size > size)
        chunk_size = size;

      if (!page_lock (usrc, false))
        thread_exit ();
      memcpy (dst, usrc, chunk_size);
      page_unlock (usrc);

      dst += chunk_size;
      usrc += chunk_size;
      size -= chunk_size;
    }
}
/* Copies SIZE bytes from kernel address SRC to user address UDST.
   Calls thread_exit() if any of the user accesses are invalid. */
static void
copy_out (void *udst_, const void *src_, size_t size)
{
  uint8_t *udst = udst_;
  const uint8_t *src = src_;

  while (size > 0)
    {
      size_t chunk_size = PGSIZE - pg_ofs (udst);
      if (chunk_size > size)
        chunk_size = size;

      if (!page_lock (udst, false))
        thread_exit ();
      memcpy (udst, src, chunk_size);
      page_unlock (udst);

      udst += chunk_size;
      src += chunk_size;
      size -= chunk_size;
    }
}
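/* Illustrative, self-contained userland sketch (not from Pintos): a
   worked example of the chunking arithmetic shared by copy_in(),
   copy_out(), sys_read(), and sys_write().  For a copy that starts 16
   bytes before a page boundary with size 0x30, the loop below prints a
   0x10-byte chunk followed by a 0x20-byte chunk, showing why no single
   access ever spans a page boundary.  DEMO_PGSIZE and demo_pg_ofs()
   are stand-ins for Pintos's PGSIZE and pg_ofs(). */
#include <stdint.h>
#include <stdio.h>

#define DEMO_PGSIZE 4096
#define demo_pg_ofs(va) ((uintptr_t) (va) & (DEMO_PGSIZE - 1))

int
main (void)
{
  uintptr_t usrc = 0x8048000 + DEMO_PGSIZE - 0x10;  /* 16 bytes before a boundary. */
  size_t size = 0x30;

  while (size > 0)
    {
      size_t chunk_size = DEMO_PGSIZE - demo_pg_ofs (usrc);
      if (chunk_size > size)
        chunk_size = size;

      /* In the kernel, page_lock()/memcpy()/page_unlock() go here. */
      printf ("copy %#zx bytes at %#lx\n", chunk_size, (unsigned long) usrc);

      usrc += chunk_size;
      size -= chunk_size;
    }
  return 0;
}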
/* Read system call. */
static int
sys_read (int handle, void *udst_, unsigned size)
{
  uint8_t *udst = udst_;
  struct file_descriptor *fd = NULL;
  int bytes_read = 0;

  /* Look up file descriptor (console reads have none). */
  if (handle != STDIN_FILENO)
    fd = lookup_fd (handle);

  while (size > 0)
    {
      /* How much to read into this page? */
      size_t page_left = PGSIZE - pg_ofs (udst);
      size_t read_amt = size < page_left ? size : page_left;
      off_t retval;

      /* Read from file into page. */
      if (handle != STDIN_FILENO)
        {
          if (!page_lock (udst, true))
            thread_exit ();
          lock_acquire (&fs_lock);
          retval = file_read (fd->file, udst, read_amt);
          lock_release (&fs_lock);
          page_unlock (udst);
        }
      else
        {
          size_t i;

          for (i = 0; i < read_amt; i++)
            {
              char c = input_getc ();
              if (!page_lock (udst, true))
                thread_exit ();
              udst[i] = c;
              page_unlock (udst);
            }
          /* Console input always supplies READ_AMT bytes; report a
             full read so the common bookkeeping below works. */
          retval = read_amt;
        }

      /* Check success. */
      if (retval < 0)
        {
          if (bytes_read == 0)
            bytes_read = -1;
          break;
        }
      bytes_read += retval;
      if (retval != (off_t) read_amt)
        {
          /* Short read, so we're done. */
          break;
        }

      /* Advance. */
      udst += retval;
      size -= retval;
    }

  return bytes_read;
}
/*
 * We want to add memory, but have no spare page_t structures.  Use some of
 * our new memory for the page_t structures.
 *
 * Somewhat similar to kphysm_add_memory_dynamic(), but simpler.
 */
static int
balloon_init_new_pages(mfn_t framelist[], pgcnt_t count)
{
	pgcnt_t		metapgs, totalpgs, num_pages;
	paddr_t		metasz;
	pfn_t		meta_start;
	page_t		*page_array;
	caddr_t		va;
	int		i, rv, locked;
	mem_structs_t	*mem;
	struct memseg	*segp;

	/* Calculate the number of pages we're going to add */
	totalpgs = bln_stats.bln_new_target - bln_stats.bln_current_pages;

	/*
	 * The following calculates the number of "meta" pages -- the pages
	 * that will be required to hold page_t structures for all new pages.
	 * Proof of this calculation is left up to the reader.
	 */
	metapgs = totalpgs - (((uint64_t)(totalpgs) << PAGESHIFT) /
	    (PAGESIZE + sizeof (page_t)));

	/*
	 * Given the number of page_t structures we need, is there also
	 * room in our meta pages for a memseg and memlist struct?
	 * If not, we'll need one more meta page.
	 */
	if ((metapgs << PAGESHIFT) < (totalpgs * sizeof (page_t) +
	    MEM_STRUCT_SIZE))
		metapgs++;

	/*
	 * metapgs is calculated from totalpgs, which may be much larger than
	 * count.  If we don't have enough pages, all of the pages in this
	 * batch will be made meta pages, and a future trip through
	 * balloon_inc_reservation() will add the rest of the meta pages.
	 */
	if (metapgs > count)
		metapgs = count;

	/*
	 * Figure out the number of page_t structures that can fit in metapgs
	 *
	 * This will cause us to initialize more page_t structures than we
	 * need - these may be used in future memory increases.
	 */
	metasz = pfn_to_pa(metapgs);
	num_pages = (metasz - MEM_STRUCT_SIZE) / sizeof (page_t);

	DTRACE_PROBE3(balloon__alloc__stats, pgcnt_t, totalpgs, pgcnt_t,
	    num_pages, pgcnt_t, metapgs);

	/*
	 * We only increment mfn_count by count, not num_pages, to keep the
	 * space of all valid pfns contiguous.  This means we create page_t
	 * structures with invalid pagenums -- we deal with this situation
	 * in balloon_page_sub.
	 */
	mfn_count += count;

	/*
	 * Get a VA for the pages that will hold page_t and other structures.
	 * The memseg and memlist structures will go at the beginning, with
	 * the page_t structures following.
	 */
	va = (caddr_t)vmem_alloc(heap_arena, metasz, VM_SLEEP);
	/* LINTED: improper alignment */
	mem = (mem_structs_t *)va;
	page_array = mem->pages;

	meta_start = bln_stats.bln_max_pages;

	/*
	 * Set the mfn to pfn mapping for the meta pages.
	 */
	locked = balloon_lock_contig_pfnlist(metapgs);
	for (i = 0; i < metapgs; i++) {
		reassign_pfn(bln_stats.bln_max_pages + i, framelist[i]);
	}
	if (locked)
		unlock_contig_pfnlist();

	/*
	 * For our meta pages, map them in and zero the page.
	 * This will be the first time touching the new pages.
	 */
	hat_devload(kas.a_hat, va, metasz, bln_stats.bln_max_pages,
	    PROT_READ | PROT_WRITE,
	    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
	bzero(va, metasz);

	/*
	 * Initialize the page array for the new pages.
	 */
	for (i = 0; i < metapgs; i++) {
		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
		page_array[i].p_offset = (u_offset_t)-1;
		page_iolock_init(&page_array[i]);
		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
		ASSERT(rv == 1);
	}

	/*
	 * For the rest of the pages, initialize the page_t struct and
	 * add them to the free list
	 */
	for (i = metapgs; i < num_pages; i++) {
		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
		page_array[i].p_offset = (u_offset_t)-1;
		page_iolock_init(&page_array[i]);
		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
		ASSERT(rv == 1);
		balloon_page_add(&page_array[i]);
	}

	/*
	 * Remember where I said that we don't call this function?  The
	 * missing code right here is why.  We need to set up kpm mappings
	 * for any new pages coming in.  However, if someone starts up a
	 * domain with small memory, then greatly increases it, we could get
	 * in some horrible deadlock situations as we steal page tables for
	 * kpm use, and userland applications take them right back before we
	 * can use them to set up our new memory.  Once a way around that is
	 * found, and a few other changes are made, we'll be able to enable
	 * this code.
	 */

	/*
	 * Update kernel structures, part 1: memsegs list
	 */
	mem->memseg.pages_base = meta_start;
	mem->memseg.pages_end = bln_stats.bln_max_pages - 1;
	mem->memseg.pages = &page_array[0];
	mem->memseg.epages = &page_array[num_pages - 1];
	mem->memseg.next = NULL;
	memsegs_lock(1);
	for (segp = memsegs; segp->next != NULL; segp = segp->next)
		;
	segp->next = &mem->memseg;
	memsegs_unlock(1);

	/*
	 * Update kernel structures, part 2: mem_node array
	 */
	mem_node_add_slice(meta_start, bln_stats.bln_max_pages);

	/*
	 * Update kernel structures, part 3: phys_install array
	 * (*sigh* how many of these things do we need?)
	 */
	memlist_write_lock();
	memlist_add(pfn_to_pa(meta_start), num_pages, &mem->memlist,
	    &phys_install);
	memlist_write_unlock();

	build_pfn_hash();

	return (metapgs);
}
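/*
 * Illustrative, self-contained userland sketch (not part of the balloon
 * driver): the "proof left up to the reader" for the metapgs formula in
 * balloon_init_new_pages().  The expression computes the smallest metapgs
 * such that metapgs pages of metadata can hold page_t structures for the
 * other (totalpgs - metapgs) pages, i.e. metapgs * PAGESIZE >=
 * (totalpgs - metapgs) * sizeof (page_t).  Solving for metapgs gives
 * totalpgs * sizeof (page_t) / (PAGESIZE + sizeof (page_t)), which equals
 * totalpgs - totalpgs * PAGESIZE / (PAGESIZE + sizeof (page_t)), the form
 * used in the driver.  DEMO_PAGESIZE and DEMO_PAGE_T_SIZE are stand-in
 * values invented for this example, not the real kernel constants.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGESIZE		4096
#define DEMO_PAGE_T_SIZE	120	/* stand-in for sizeof (page_t) */

int
main(void)
{
	uint64_t totalpgs = 262144;	/* adding 1 GB of 4K pages */
	uint64_t metapgs;

	/* Same arithmetic as the driver, with stand-in constants. */
	metapgs = totalpgs - (totalpgs * DEMO_PAGESIZE) /
	    (DEMO_PAGESIZE + DEMO_PAGE_T_SIZE);

	/* Enough room for page_t structures covering the data pages ... */
	assert(metapgs * DEMO_PAGESIZE >=
	    (totalpgs - metapgs) * DEMO_PAGE_T_SIZE);
	/* ... and one page fewer would not be enough. */
	assert((metapgs - 1) * DEMO_PAGESIZE <
	    (totalpgs - (metapgs - 1)) * DEMO_PAGE_T_SIZE);

	(void) printf("%llu of %llu new pages become meta pages\n",
	    (unsigned long long)metapgs, (unsigned long long)totalpgs);
	return (0);
}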
/*
 * Process a vnode's page list for all pages whose offset is >= off.
 * Pages are to either be free'd, invalidated, or written back to disk.
 *
 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 * is specified, otherwise they are "shared" locked.
 *
 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 *
 * Special marker page_t's are inserted in the list in order
 * to keep track of where we are in the list when locks are dropped.
 *
 * Note the list is circular and insertions can happen only at the
 * head and tail of the list.  The algorithm ensures visiting all pages
 * on the list in the following way:
 *
 *    Drop two marker pages at the end of the list.
 *
 *    Move one marker page backwards towards the start of the list until
 *    it is at the list head, processing the pages passed along the way.
 *
 * Due to race conditions when the vphm mutex is dropped, additional pages
 * can be added to either end of the list, so we'll continue to move
 * the marker and process pages until it is up against the end marker.
 *
 * There is one special exit condition.  If we are processing a VMODSORT
 * vnode and only writing back modified pages, we can stop as soon as
 * we run into an unmodified page.  This makes fsync(3) operations fast.
 */
int
pvn_vplist_dirty(
	vnode_t		*vp,
	u_offset_t	off,
	int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
			size_t *, int, cred_t *),
	int		flags,
	cred_t		*cred)
{
	page_t		*pp;
	page_t		*mark;		/* marker page that moves toward head */
	page_t		*end;		/* marker page at end of list */
	int		err = 0;
	int		error;
	kmutex_t	*vphm;
	se_t		se;
	page_t		**where_to_move;

	ASSERT(vp->v_type != VCHR);

	if (vp->v_pages == NULL)
		return (0);

	/*
	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
	 *
	 * Don't block on VVMLOCK if B_ASYNC is set.  This prevents sync()
	 * from getting blocked while flushing pages to a dead NFS server.
	 */
	mutex_enter(&vp->v_lock);
	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
		mutex_exit(&vp->v_lock);
		return (EAGAIN);
	}

	while (vp->v_flag & VVMLOCK)
		cv_wait(&vp->v_cv, &vp->v_lock);

	if (vp->v_pages == NULL) {
		mutex_exit(&vp->v_lock);
		return (0);
	}

	vp->v_flag |= VVMLOCK;
	mutex_exit(&vp->v_lock);

	/*
	 * Set up the marker pages used to walk the list
	 */
	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
	end->p_vnode = vp;
	end->p_offset = (u_offset_t)-2;
	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
	mark->p_vnode = vp;
	mark->p_offset = (u_offset_t)-1;

	/*
	 * Grab the lock protecting the vnode's page list
	 * note that this lock is dropped at times in the loop.
	 */
	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);
	if (vp->v_pages == NULL)
		goto leave;

	/*
	 * insert the markers and loop through the list of pages
	 */
	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
	page_vpadd(&mark->p_vpnext, end);
	for (;;) {

		/*
		 * If only doing an async write back, then we can
		 * stop as soon as we get to start of the list.
		 */
		if (flags == B_ASYNC && vp->v_pages == mark)
			break;

		/*
		 * otherwise stop when we've gone through all the pages
		 */
		if (mark->p_vpprev == end)
			break;

		pp = mark->p_vpprev;
		if (vp->v_pages == pp)
			where_to_move = &vp->v_pages;
		else
			where_to_move = &pp->p_vpprev->p_vpnext;

		ASSERT(pp->p_vnode == vp);

		/*
		 * If just flushing dirty pages to disk and this vnode
		 * is using a sorted list of pages, we can stop processing
		 * as soon as we find an unmodified page, since all the
		 * modified pages are visited first.
		 */
		if (IS_VMODSORT(vp) &&
		    !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
			if (!hat_ismod(pp) && !page_io_locked(pp)) {
#ifdef DEBUG
				/*
				 * For debug kernels examine what should be
				 * all the remaining clean pages, asserting
				 * that they are not modified.
				 */
				page_t	*chk = pp;
				int	attr;

				page_vpsub(&vp->v_pages, mark);
				page_vpadd(where_to_move, mark);
				do {
					chk = chk->p_vpprev;
					ASSERT(chk != end);
					if (chk == mark)
						continue;
					attr = hat_page_getattr(chk,
					    P_MOD | P_REF);
					if ((attr & P_MOD) == 0)
						continue;
					panic("v_pages list not all clean: "
					    "page_t*=%p vnode=%p off=%lx "
					    "attr=0x%x last clean page_t*=%p\n",
					    (void *)chk, (void *)chk->p_vnode,
					    (long)chk->p_offset, attr,
					    (void *)pp);
				} while (chk != vp->v_pages);
#endif
				break;
			} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
				/*
				 * Couldn't get io lock, wait until IO is done.
				 * Block only for sync IO since we don't want
				 * to block async IO.
				 */
				mutex_exit(vphm);
				page_io_wait(pp);
				mutex_enter(vphm);
				continue;
			}
		}

		/*
		 * Skip this page if the offset is out of the desired range.
		 * Just move the marker and continue.
		 */
		if (pp->p_offset < off) {
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
			continue;
		}

		/*
		 * If we are supposed to invalidate or free this
		 * page, then we need an exclusive lock.
		 */
		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		/*
		 * We must acquire the page lock for all synchronous
		 * operations (invalidate, free and write).
		 */
		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
			/*
			 * If the page_lock() drops the mutex
			 * we must retry the loop.
			 */
			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
				continue;

			/*
			 * It's ok to move the marker page now.
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
		} else {

			/*
			 * update the marker page for all remaining cases
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);

			/*
			 * For write backs, if we can't lock the page, it's
			 * invalid or in the process of being destroyed.  Skip
			 * it, assuming someone else is writing it.
			 */
			if (!page_trylock(pp, se))
				continue;
		}

		ASSERT(pp->p_vnode == vp);

		/*
		 * Successfully locked the page, now figure out what to
		 * do with it.  Free pages are easily dealt with, invalidate
		 * if desired or just go on to the next page.
		 */
		if (PP_ISFREE(pp)) {
			if ((flags & B_INVAL) == 0) {
				page_unlock(pp);
				continue;
			}

			/*
			 * Invalidate (destroy) the page.
			 */
			mutex_exit(vphm);
			page_destroy_free(pp);
			mutex_enter(vphm);
			continue;
		}

		/*
		 * pvn_getdirty() figures out what to do with a dirty page.
		 * If the page is dirty, the putapage() routine will write it
		 * and will kluster any other adjacent dirty pages it can.
		 *
		 * pvn_getdirty() and `(*putapage)' unlock the page.
		 */
		mutex_exit(vphm);
		if (pvn_getdirty(pp, flags)) {
			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
			if (!err)
				err = error;
		}
		mutex_enter(vphm);
	}
	page_vpsub(&vp->v_pages, mark);
	page_vpsub(&vp->v_pages, end);

leave:
	/*
	 * Release the v_pages mutex, clear VVMLOCK, and wake up blocked
	 * threads.
	 */
	mutex_exit(vphm);
	kmem_cache_free(marker_cache, mark);
	kmem_cache_free(marker_cache, end);
	mutex_enter(&vp->v_lock);
	vp->v_flag &= ~VVMLOCK;
	cv_broadcast(&vp->v_cv);
	mutex_exit(&vp->v_lock);
	return (err);
}
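/*
 * Illustrative, self-contained userland sketch (not kernel code): the
 * marker walk described in the pvn_vplist_dirty() comment, on a plain
 * circular doubly-linked list.  A "mark" node is dropped at the tail and
 * walked backwards toward the head, visiting the node in front of it and
 * then hopping over that node, until only the "end" marker is behind it.
 * Because the marker keeps our place, data nodes could be added or removed
 * while the walk is paused, which is exactly why pvn_vplist_dirty() can
 * drop vphm inside its loop.  All names here (struct ring_node,
 * ring_insert_before(), ring_remove()) are invented for this example.
 */
#include <stdio.h>

struct ring_node {
	struct ring_node *next;
	struct ring_node *prev;
	int		value;		/* -1 marks a marker node */
};

/* Insert N immediately before POS in the ring. */
static void
ring_insert_before(struct ring_node *pos, struct ring_node *n)
{
	n->prev = pos->prev;
	n->next = pos;
	pos->prev->next = n;
	pos->prev = n;
}

/* Unlink N from the ring. */
static void
ring_remove(struct ring_node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

int
main(void)
{
	struct ring_node nodes[5], mark, end, *head = &nodes[0];
	int i;

	/* Build a circular list 0..4; HEAD plays the role of vp->v_pages. */
	for (i = 0; i < 5; i++) {
		nodes[i].value = i;
		nodes[i].next = &nodes[(i + 1) % 5];
		nodes[i].prev = &nodes[(i + 4) % 5];
	}
	mark.value = end.value = -1;

	/* Drop the two markers at the tail, END last. */
	ring_insert_before(head, &mark);	/* tail == head->prev */
	ring_insert_before(head, &end);

	/* Walk MARK backwards, visiting whatever sits in front of it. */
	while (mark.prev != &end) {
		struct ring_node *pp = mark.prev;

		(void) printf("visiting %d\n", pp->value);

		/* Hop the marker over PP so PP is now behind it. */
		ring_remove(&mark);
		ring_insert_before(pp, &mark);
	}
	return (0);
}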
int
vmxnet3s_txcache_init(vmxnet3s_softc_t *dp, vmxnet3s_txq_t *txq)
{
	int		i;
	int		ndescrs;
	int		node;
	page_t		*page;
	struct seg	kseg;
	vmxnet3s_txcache_t *cache = &dp->txcache;
	dev_info_t	*dip = dp->dip;

	cache->num_pages = ((txq->cmdring.size * VMXNET3_HDR_COPY_SIZE) +
	    (PAGESIZE - 1)) / PAGESIZE;

	/* Allocate pages */
	if (!page_resv(cache->num_pages, KM_SLEEP)) {
		dev_err(dip, CE_WARN, "failed to reserve %d pages",
		    cache->num_pages);
		goto out;
	}

	if (!page_create_wait(cache->num_pages, 0)) {
		dev_err(dip, CE_WARN, "failed to create %d pages",
		    cache->num_pages);
		goto unresv_pages;
	}

	cache->pages = kmem_zalloc(cache->num_pages * sizeof (page_t *),
	    KM_SLEEP);
	cache->page_maps = kmem_zalloc(cache->num_pages * sizeof (page_t *),
	    KM_SLEEP);

	kseg.s_as = &kas;
	for (i = 0; i < cache->num_pages; i++) {
		page = page_get_freelist(&kvp, 0, &kseg,
		    (caddr_t)(i * PAGESIZE), PAGESIZE, 0, NULL);
		if (page == NULL) {
			page = page_get_cachelist(&kvp, 0, &kseg,
			    (caddr_t)(i * PAGESIZE), 0, NULL);
			if (page == NULL)
				goto free_pages;
			if (!PP_ISAGED(page))
				page_hashout(page, NULL);
		}
		PP_CLRFREE(page);
		PP_CLRAGED(page);
		cache->pages[i] = page;
	}

	for (i = 0; i < cache->num_pages; i++)
		page_downgrade(cache->pages[i]);

	/* Allocate virtual address range for mapping pages */
	cache->window = vmem_alloc(heap_arena, ptob(cache->num_pages),
	    VM_SLEEP);
	ASSERT(cache->window);

	cache->num_nodes = txq->cmdring.size;

	/* Map pages */
	for (i = 0; i < cache->num_pages; i++) {
		cache->page_maps[i] = cache->window + ptob(i);
		hat_devload(kas.a_hat, cache->page_maps[i], ptob(1),
		    cache->pages[i]->p_pagenum,
		    PROT_READ | PROT_WRITE | HAT_STRICTORDER,
		    HAT_LOAD_LOCK);
	}

	/* Now setup cache items */
	cache->nodes = kmem_zalloc(txq->cmdring.size *
	    sizeof (vmxnet3s_txcache_node_t), KM_SLEEP);

	ndescrs = txq->cmdring.size;
	node = 0;
	for (i = 0; i < cache->num_pages; i++) {
		caddr_t		va;
		int		j;
		int		lim;
		uint64_t	pa;

		lim = (ndescrs <= VMXNET3_TX_CACHE_ITEMS_PER_PAGE) ? ndescrs :
		    VMXNET3_TX_CACHE_ITEMS_PER_PAGE;
		va = cache->page_maps[i];
		pa = cache->pages[i]->p_pagenum << PAGESHIFT;

		for (j = 0; j < lim; j++) {
			cache->nodes[node].pa = pa;
			cache->nodes[node].va = va;

			pa += VMXNET3_HDR_COPY_SIZE;
			va += VMXNET3_HDR_COPY_SIZE;
			node++;
		}
		ndescrs -= lim;
	}
	return (DDI_SUCCESS);

free_pages:
	page_create_putback(cache->num_pages - i);
	while (--i >= 0) {
		if (!page_tryupgrade(cache->pages[i])) {
			page_unlock(cache->pages[i]);
			while (!page_lock(cache->pages[i], SE_EXCL, NULL,
			    P_RECLAIM))
				;
		}
		page_free(cache->pages[i], 0);
	}
	kmem_free(cache->pages, cache->num_pages * sizeof (page_t *));
	kmem_free(cache->page_maps, cache->num_pages * sizeof (page_t *));
unresv_pages:
	page_unresv(cache->num_pages);
out:
	cache->num_pages = cache->num_nodes = 0;

	return (DDI_FAILURE);
}
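/*
 * Illustrative, self-contained userland sketch (not driver code): the
 * header-cache layout arithmetic from vmxnet3s_txcache_init().  With
 * stand-in values for PAGESIZE, VMXNET3_HDR_COPY_SIZE, and the per-page
 * item count, it carves cmdring.size slots out of the pages the same way
 * the nested loops above do, and checks that every slot fits inside a
 * single page and that every descriptor gets a slot.  All DEMO_* names
 * and the example ring size are invented for this sketch.
 */
#include <assert.h>
#include <stdio.h>

#define DEMO_PAGESIZE		4096
#define DEMO_HDR_COPY_SIZE	128
#define DEMO_ITEMS_PER_PAGE	(DEMO_PAGESIZE / DEMO_HDR_COPY_SIZE)

int
main(void)
{
	int ring_size = 100;			/* txq->cmdring.size */
	int num_pages = (ring_size * DEMO_HDR_COPY_SIZE + DEMO_PAGESIZE - 1) /
	    DEMO_PAGESIZE;
	int ndescrs = ring_size;
	int node = 0;
	int i, j;

	for (i = 0; i < num_pages; i++) {
		int lim = (ndescrs <= DEMO_ITEMS_PER_PAGE) ? ndescrs :
		    DEMO_ITEMS_PER_PAGE;
		unsigned long off = 0;		/* offset within page i */

		for (j = 0; j < lim; j++) {
			/* A slot must never straddle a page boundary. */
			assert(off + DEMO_HDR_COPY_SIZE <= DEMO_PAGESIZE);
			off += DEMO_HDR_COPY_SIZE;
			node++;
		}
		ndescrs -= lim;
	}

	/* Every descriptor got a header slot, and none were left over. */
	assert(node == ring_size && ndescrs == 0);
	(void) printf("%d descriptors packed into %d pages\n", node,
	    num_pages);
	return (0);
}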