/*
 * Update cache contents upon write completion.
 */
void
vdev_cache_write(zio_t *zio)
{
    vdev_cache_t *vc = &zio->io_vd->vdev_cache;
    vdev_cache_entry_t *ve, ve_search;
    uint64_t io_start = zio->io_offset;
    uint64_t io_end = io_start + zio->io_size;
    uint64_t min_offset = P2ALIGN(io_start, VCBS);
    uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
    avl_index_t where;

    ASSERT(zio->io_type == ZIO_TYPE_WRITE);

    mutex_enter(&vc->vc_lock);

    ve_search.ve_offset = min_offset;
    ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);

    if (ve == NULL)
        ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);

    while (ve != NULL && ve->ve_offset < max_offset) {
        uint64_t start = MAX(ve->ve_offset, io_start);
        uint64_t end = MIN(ve->ve_offset + VCBS, io_end);

        if (ve->ve_fill_io != NULL) {
            ve->ve_missed_update = 1;
        } else {
            bcopy((char *)zio->io_data + start - io_start,
                ve->ve_data + start - ve->ve_offset,
                end - start);
        }
        ve = AVL_NEXT(&vc->vc_offset_tree, ve);
    }
    mutex_exit(&vc->vc_lock);
}
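/*
 * A minimal, self-contained sketch of the power-of-two alignment macros
 * used throughout these snippets (P2ALIGN, P2PHASE, P2ROUNDUP). These
 * follow the shape of the illumos <sys/sysmacros.h> definitions, but the
 * demo below is an illustration, not the authoritative header; the VCBS
 * value is hypothetical. 'align' must be a power of two in every case.
 */
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#define P2ALIGN(x, align)   ((x) & -(align))          /* round down */
#define P2PHASE(x, align)   ((x) & ((align) - 1))     /* offset within chunk */
#define P2ROUNDUP(x, align) (-(-(x) & -(align)))      /* round up */

int
main(void)
{
    uint64_t vcbs = 1ULL << 16;     /* hypothetical 64K cache block */
    uint64_t io_start = 70000;
    uint64_t io_end = 140000;

    /* A write like the one above touches cache blocks [65536, 196608). */
    assert(P2ALIGN(io_start, vcbs) == 65536);
    assert(P2ROUNDUP(io_end, vcbs) == 196608);
    assert(P2PHASE(io_start, vcbs) == 70000 - 65536);

    (void) printf("range [%llu, %llu)\n",
        (unsigned long long)P2ALIGN(io_start, vcbs),
        (unsigned long long)P2ROUNDUP(io_end, vcbs));
    return (0);
}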
static caddr_t
pci_cfgacc_map(paddr_t phys_addr)
{
#ifdef __xpv
    phys_addr = pfn_to_pa(xen_assign_pfn(mmu_btop(phys_addr))) |
        (phys_addr & MMU_PAGEOFFSET);
#endif
    if (khat_running) {
        pfn_t pfn = mmu_btop(phys_addr);

        /*
         * pci_cfgacc_virt_base may hold address left from early
         * boot, which points to low mem. Realloc virtual address
         * in kernel space since it's already late in boot now.
         * Note: no need to unmap first, clear_boot_mappings() will
         * do that for us.
         */
        if (pci_cfgacc_virt_base < (caddr_t)kernelbase)
            pci_cfgacc_virt_base = vmem_alloc(heap_arena,
                MMU_PAGESIZE, VM_SLEEP);

        hat_devload(kas.a_hat, pci_cfgacc_virt_base,
            MMU_PAGESIZE, pfn, PROT_READ | PROT_WRITE |
            HAT_STRICTORDER, HAT_LOAD_LOCK);
    } else {
        paddr_t pa_base = P2ALIGN(phys_addr, MMU_PAGESIZE);

        if (pci_cfgacc_virt_base == NULL)
            pci_cfgacc_virt_base =
                (caddr_t)alloc_vaddr(MMU_PAGESIZE, MMU_PAGESIZE);

        kbm_map((uintptr_t)pci_cfgacc_virt_base, pa_base, 0, 0);
    }

    return (pci_cfgacc_virt_base + (phys_addr & MMU_PAGEOFFSET));
}
/*
 * Allocate an entry in the cache. At this point we don't have the data,
 * we're just creating a placeholder so that multiple threads don't all
 * go off and read the same blocks.
 */
static vdev_cache_entry_t *
vdev_cache_allocate(zio_t *zio)
{
    vdev_cache_t *vc = &zio->io_vd->vdev_cache;
    uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
    vdev_cache_entry_t *ve;

    ASSERT(MUTEX_HELD(&vc->vc_lock));

    if (zfs_vdev_cache_size == 0)
        return (NULL);

    /*
     * If adding a new entry would exceed the cache size,
     * evict the oldest entry (LRU).
     */
    if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
        zfs_vdev_cache_size) {
        ve = avl_first(&vc->vc_lastused_tree);
        if (ve->ve_fill_io != NULL)
            return (NULL);
        ASSERT(ve->ve_hits != 0);
        vdev_cache_evict(vc, ve);
    }

    ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
    ve->ve_offset = offset;
    ve->ve_lastused = lbolt;
    ve->ve_data = zio_buf_alloc(VCBS);

    avl_add(&vc->vc_offset_tree, ve);
    avl_add(&vc->vc_lastused_tree, ve);

    return (ve);
}
/*
 * Read data from the cache. Returns 0 on cache hit, errno on a miss.
 */
int
vdev_cache_read(zio_t *zio)
{
    vdev_cache_t *vc = &zio->io_vd->vdev_cache;
    vdev_cache_entry_t *ve, *ve_search;
    uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
    ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);)
/*ARGSUSED*/
int
plat_get_mem_unum(int synd_code, uint64_t flt_addr, int flt_bus_id,
    int flt_in_memory, ushort_t flt_status, char *buf, int buflen, int *lenp)
{
    if (flt_in_memory && (p2get_mem_unum != NULL))
        return (p2get_mem_unum(synd_code, P2ALIGN(flt_addr, 8),
            buf, buflen, lenp));
    else
        return (ENOTSUP);
}
/*
 * Check whether any portion of the [start, end] segment is within the
 * [start_addr, end_addr] range.
 *
 * Return values:
 *   0 - address is outside the range
 *   1 - address is within the range
 */
static int
address_in_range(uintptr_t start, uintptr_t end, size_t psz)
{
    int rc = 1;

    /*
     * Nothing to do if there is no address range specified with -A
     */
    if (start_addr != INVALID_ADDRESS || end_addr != INVALID_ADDRESS) {
        /* The segment end is below the range start */
        if ((start_addr != INVALID_ADDRESS) &&
            (end < P2ALIGN(start_addr, psz)))
            rc = 0;

        /* The segment start is above the range end */
        if ((end_addr != INVALID_ADDRESS) &&
            (start > P2ALIGN(end_addr + psz, psz)))
            rc = 0;
    }
    return (rc);
}
static int
zvol_discard(struct bio *bio)
{
    zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
    uint64_t start = BIO_BI_SECTOR(bio) << 9;
    uint64_t size = BIO_BI_SIZE(bio);
    uint64_t end = start + size;
    int error;
    rl_t *rl;
    dmu_tx_t *tx;

    ASSERT(zv && zv->zv_open_count > 0);

    if (end > zv->zv_volsize)
        return (SET_ERROR(EIO));

    /*
     * Align the request to volume block boundaries when REQ_SECURE is
     * available, but not requested. If we don't, then this will force
     * dnode_free_range() to zero out the unaligned parts, which is slow
     * (read-modify-write) and useless since we are not freeing any space
     * by doing so. Kernels that do not support REQ_SECURE (2.6.32 through
     * 2.6.35) will not receive this optimization.
     */
#ifdef REQ_SECURE
    if (!(bio->bi_rw & REQ_SECURE)) {
        start = P2ROUNDUP(start, zv->zv_volblocksize);
        end = P2ALIGN(end, zv->zv_volblocksize);
        size = end - start;
    }
#endif

    if (start >= end)
        return (0);

    rl = zfs_range_lock(&zv->zv_znode, start, size, RL_WRITER);
    tx = dmu_tx_create(zv->zv_objset);
    dmu_tx_mark_netfree(tx);
    error = dmu_tx_assign(tx, TXG_WAIT);
    if (error != 0) {
        dmu_tx_abort(tx);
    } else {
        zvol_log_truncate(zv, tx, start, size, B_TRUE);
        dmu_tx_commit(tx);
        error = dmu_free_long_range(zv->zv_objset,
            ZVOL_OBJ, start, size);
    }

    zfs_range_unlock(rl);

    return (error);
}
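/*
 * Worked illustration of the discard-trimming logic above: shrink a byte
 * range inward to volume-block boundaries so dnode_free_range() never has
 * to read-modify-write a partial block. A standalone sketch with a
 * hypothetical 8K volblocksize, not the driver code itself.
 */
#include <stdint.h>
#include <stdio.h>

#define P2ALIGN(x, align)   ((x) & -(align))
#define P2ROUNDUP(x, align) (-(-(x) & -(align)))

int
main(void)
{
    uint64_t volblocksize = 8192;
    uint64_t start = 4096, end = 20480;     /* discard bytes [4096, 20480) */

    start = P2ROUNDUP(start, volblocksize); /* -> 8192 */
    end = P2ALIGN(end, volblocksize);       /* -> 16384 */

    if (start >= end)
        (void) printf("no whole block to free\n");
    else
        (void) printf("free [%llu, %llu): %llu bytes\n",
            (unsigned long long)start, (unsigned long long)end,
            (unsigned long long)(end - start));
    return (0);
}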
static void
zvol_discard(void *arg)
{
    struct request *req = (struct request *)arg;
    struct request_queue *q = req->q;
    zvol_state_t *zv = q->queuedata;
    uint64_t start = blk_rq_pos(req) << 9;
    uint64_t end = start + blk_rq_bytes(req);
    int error;
    rl_t *rl;

    /*
     * Annotate this call path with a flag that indicates that it is
     * unsafe to use KM_SLEEP during memory allocations due to the
     * potential for a deadlock. KM_PUSHPAGE should be used instead.
     */
    ASSERT(!(current->flags & PF_NOFS));
    current->flags |= PF_NOFS;

    if (end > zv->zv_volsize) {
        blk_end_request(req, -EIO, blk_rq_bytes(req));
        goto out;
    }

    /*
     * Align the request to volume block boundaries. If we don't,
     * then this will force dnode_free_range() to zero out the
     * unaligned parts, which is slow (read-modify-write) and
     * useless since we are not freeing any space by doing so.
     */
    start = P2ROUNDUP(start, zv->zv_volblocksize);
    end = P2ALIGN(end, zv->zv_volblocksize);

    if (start >= end) {
        blk_end_request(req, 0, blk_rq_bytes(req));
        goto out;
    }

    rl = zfs_range_lock(&zv->zv_znode, start, end - start, RL_WRITER);

    error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start,
        end - start);

    /*
     * TODO: maybe we should add the operation to the log.
     */
    zfs_range_unlock(rl);

    blk_end_request(req, -error, blk_rq_bytes(req));
out:
    current->flags &= ~PF_NOFS;
}
static int
zvol_discard(struct bio *bio)
{
    zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
    uint64_t start = BIO_BI_SECTOR(bio) << 9;
    uint64_t size = BIO_BI_SIZE(bio);
    uint64_t end = start + size;
    int error;
    rl_t *rl;
    dmu_tx_t *tx;

    ASSERT(zv && zv->zv_open_count > 0);

    if (end > zv->zv_volsize)
        return (SET_ERROR(EIO));

    /*
     * Align the request to volume block boundaries when a secure erase is
     * not required. This will prevent dnode_free_range() from zeroing out
     * the unaligned parts which is slow (read-modify-write) and useless
     * since we are not freeing any space by doing so.
     */
    if (!bio_is_secure_erase(bio)) {
        start = P2ROUNDUP(start, zv->zv_volblocksize);
        end = P2ALIGN(end, zv->zv_volblocksize);
        size = end - start;
    }

    if (start >= end)
        return (0);

    rl = zfs_range_lock(&zv->zv_range_lock, start, size, RL_WRITER);

    tx = dmu_tx_create(zv->zv_objset);
    dmu_tx_mark_netfree(tx);
    error = dmu_tx_assign(tx, TXG_WAIT);
    if (error != 0) {
        dmu_tx_abort(tx);
    } else {
        zvol_log_truncate(zv, tx, start, size, B_TRUE);
        dmu_tx_commit(tx);
        error = dmu_free_long_range(zv->zv_objset,
            ZVOL_OBJ, start, size);
    }

    zfs_range_unlock(rl);

    return (error);
}
/*
 * Copy in a memory list from boot to kernel, with a filter function
 * to remove pages. The filter function can increase the address and/or
 * decrease the size to filter out pages. It will also align addresses and
 * sizes to PAGESIZE.
 */
void
copy_memlist_filter(
    struct memlist *src,
    struct memlist **dstp,
    void (*filter)(uint64_t *, uint64_t *))
{
    struct memlist *dst, *prev;
    uint64_t addr;
    uint64_t size;
    uint64_t eaddr;

    dst = *dstp;
    prev = dst;

    /*
     * Move through the memlist applying a filter against
     * each range of memory. Note that we may apply the
     * filter multiple times against each memlist entry.
     */
    for (; src; src = src->ml_next) {
        addr = P2ROUNDUP(src->ml_address, PAGESIZE);
        eaddr = P2ALIGN(src->ml_address + src->ml_size, PAGESIZE);
        while (addr < eaddr) {
            size = eaddr - addr;
            if (filter != NULL)
                filter(&addr, &size);
            if (size == 0)
                break;
            dst->ml_address = addr;
            dst->ml_size = size;
            dst->ml_next = 0;
            if (prev == dst) {
                dst->ml_prev = 0;
                dst++;
            } else {
                dst->ml_prev = prev;
                prev->ml_next = dst;
                dst++;
                prev++;
            }
            addr += size;
        }
    }
    *dstp = dst;
}
static void
zvol_discard(void *arg)
{
    struct request *req = (struct request *)arg;
    struct request_queue *q = req->q;
    zvol_state_t *zv = q->queuedata;
    fstrans_cookie_t cookie = spl_fstrans_mark();
    uint64_t start = blk_rq_pos(req) << 9;
    uint64_t end = start + blk_rq_bytes(req);
    int error;
    rl_t *rl;

    if (end > zv->zv_volsize) {
        error = EIO;
        goto out;
    }

    /*
     * Align the request to volume block boundaries. If we don't,
     * then this will force dnode_free_range() to zero out the
     * unaligned parts, which is slow (read-modify-write) and
     * useless since we are not freeing any space by doing so.
     */
    start = P2ROUNDUP(start, zv->zv_volblocksize);
    end = P2ALIGN(end, zv->zv_volblocksize);

    if (start >= end) {
        error = 0;
        goto out;
    }

    rl = zfs_range_lock(&zv->zv_znode, start, end - start, RL_WRITER);

    error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start,
        end - start);

    /*
     * TODO: maybe we should add the operation to the log.
     */
    zfs_range_unlock(rl);
out:
    blk_end_request(req, -error, blk_rq_bytes(req));
    spl_fstrans_unmark(cookie);
}
static int
zvol_discard(struct bio *bio)
{
    zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
    uint64_t start = BIO_BI_SECTOR(bio) << 9;
    uint64_t size = BIO_BI_SIZE(bio);
    uint64_t end = start + size;
    int error;
    rl_t *rl;

    if (end > zv->zv_volsize)
        return (SET_ERROR(EIO));

    /*
     * Align the request to volume block boundaries when REQ_SECURE is
     * available, but not requested. If we don't, then this will force
     * dnode_free_range() to zero out the unaligned parts, which is slow
     * (read-modify-write) and useless since we are not freeing any space
     * by doing so. Kernels that do not support REQ_SECURE (2.6.32 through
     * 2.6.35) will not receive this optimization.
     */
#ifdef REQ_SECURE
    if (!(bio->bi_rw & REQ_SECURE)) {
        start = P2ROUNDUP(start, zv->zv_volblocksize);
        end = P2ALIGN(end, zv->zv_volblocksize);
        size = end - start;
    }
#endif

    if (start >= end)
        return (0);

    rl = zfs_range_lock(&zv->zv_znode, start, size, RL_WRITER);

    error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size);

    /*
     * TODO: maybe we should add the operation to the log.
     */
    zfs_range_unlock(rl);

    return (error);
}
void
fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
    const fletcher_4_ops_t *ops;
    uint64_t p2size = P2ALIGN(size, 64);

    ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

    if (size == 0) {
        ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
    } else if (p2size == 0) {
        ops = &fletcher_4_scalar_ops;
        fletcher_4_byteswap_impl(ops, buf, size, zcp);
    } else {
        ops = fletcher_4_impl_get();
        fletcher_4_byteswap_impl(ops, buf, p2size, zcp);

        if (p2size < size)
            fletcher_4_incremental_byteswap((char *)buf + p2size,
                size - p2size, zcp);
    }
}
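/*
 * Illustration of the split in fletcher_4_byteswap() above: the bulk of
 * the buffer, P2ALIGN(size, 64), can go to a vectorized implementation
 * that consumes whole 64-byte chunks, while the sub-64-byte tail falls
 * back to the scalar path. Hypothetical size; sketch only.
 */
#include <stdint.h>
#include <stdio.h>

#define P2ALIGN(x, align)   ((x) & -(align))

int
main(void)
{
    uint64_t size = 100;                    /* 4-byte aligned, as asserted */
    uint64_t p2size = P2ALIGN(size, 64);    /* 64: vectorized portion */

    (void) printf("bulk %llu bytes (SIMD), tail %llu bytes (scalar)\n",
        (unsigned long long)p2size,
        (unsigned long long)(size - p2size));
    return (0);
}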
/*ARGSUSED*/
void
fletcher_4_byteswap(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
    const uint64_t p2size = P2ALIGN(size, 64);

    ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

    if (size == 0 || p2size == 0) {
        ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);

        if (size > 0)
            fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
                buf, size);
    } else {
        fletcher_4_byteswap_impl(buf, p2size, zcp);

        if (p2size < size)
            fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
                (char *)buf + p2size, size - p2size);
    }
}
/*
 * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
 * to take a held dnode rather than <os, object> -- the lookup is wasteful,
 * and can induce severe lock contention when writing to several files
 * whose dnodes are in the same block.
 */
static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
    int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
    dmu_buf_t **dbp;
    uint64_t blkid, nblks, i;
    uint32_t dbuf_flags;
    int err;
    zio_t *zio;

    ASSERT(length <= DMU_MAX_ACCESS);

    dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
    if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
        dbuf_flags |= DB_RF_NOPREFETCH;

    rw_enter(&dn->dn_struct_rwlock, RW_READER);
    if (dn->dn_datablkshift) {
        int blkshift = dn->dn_datablkshift;
        nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
            P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
    } else {
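/*
 * Worked example of the block-count computation above: the number of data
 * blocks an (offset, length) byte range touches is the distance between
 * the rounded-down start and the rounded-up end, in blocks. Hypothetical
 * 128K blocksize (blkshift 17); sketch only.
 */
#include <stdint.h>
#include <stdio.h>

#define P2ALIGN(x, align)   ((x) & -(align))
#define P2ROUNDUP(x, align) (-(-(x) & -(align)))

int
main(void)
{
    int blkshift = 17;      /* 128K blocks */
    uint64_t offset = 100000, length = 300000;
    uint64_t nblks;

    nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
        P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;

    /* [100000, 400000) spans blocks 0..3, so nblks is 4. */
    (void) printf("nblks = %llu\n", (unsigned long long)nblks);
    return (0);
}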
static int
kgrep_range_basic(uintptr_t base, uintptr_t lim, void *kg_arg)
{
    kgrep_data_t *kg = kg_arg;
    size_t pagesize = kg->kg_pagesize;
    uintptr_t pattern = kg->kg_pattern;
    uintptr_t *page = kg->kg_page;
    uintptr_t *page_end = &page[pagesize / sizeof (uintptr_t)];
    uintptr_t *pos;
    uintptr_t addr, offset;
    int seen = 0;

    /*
     * page-align everything, to simplify the loop
     */
    base = P2ALIGN(base, pagesize);
    lim = P2ROUNDUP(lim, pagesize);

    for (addr = base; addr < lim; addr += pagesize) {
        if (mdb_vread(page, pagesize, addr) == -1)
            continue;
        seen = 1;

        for (pos = page; pos < page_end; pos++) {
            if (*pos != pattern)
                continue;

            offset = (caddr_t)pos - (caddr_t)page;
            kgrep_cb(addr + offset, NULL, kg->kg_cbtype);
        }
    }

    if (seen)
        kg->kg_seen = 1;

    return (WALK_NEXT);
}
/*
 * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
 * to take a held dnode rather than <os, object> -- the lookup is wasteful,
 * and can induce severe lock contention when writing to several files
 * whose dnodes are in the same block.
 */
static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
    int read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
    dsl_pool_t *dp = NULL;
    dmu_buf_t **dbp;
    uint64_t blkid, nblks, i;
    uint32_t flags;
    int err;
    zio_t *zio;
    hrtime_t start;

    ASSERT(length <= DMU_MAX_ACCESS);

    flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
    if (length > zfetch_array_rd_sz)
        flags |= DB_RF_NOPREFETCH;

    rw_enter(&dn->dn_struct_rwlock, RW_READER);
    if (dn->dn_datablkshift) {
        int blkshift = dn->dn_datablkshift;
        nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
            P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
    } else {
/*
 * Allocate an entry in the cache. At this point we don't have the data,
 * we're just creating a placeholder so that multiple threads don't all
 * go off and read the same blocks.
 */
static vdev_cache_entry_t *
vdev_cache_allocate(zio_t *zio)
{
    vdev_cache_t *vc = &zio->io_vd->vdev_cache;
    uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
    vdev_cache_entry_t *ve;

    ASSERT(MUTEX_HELD(&vc->vc_lock));

    if (zfs_vdev_cache_size == 0)
        return (NULL);

    /*
     * If adding a new entry would exceed the cache size,
     * evict the oldest entry (LRU).
     */
    if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
        zfs_vdev_cache_size) {
        ve = avl_first(&vc->vc_lastused_tree);
        if (ve->ve_fill_io != NULL)
            return (NULL);
        ASSERT3U(ve->ve_hits, !=, 0);
        vdev_cache_evict(vc, ve);
    }
            blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
            minlvl = restarted ? 1 : 2;
            restarted = B_TRUE;
            error = dnode_next_offset(DMU_META_DNODE(os),
                DNODE_FIND_HOLE, &offset, minlvl, blkfill, 0);
            if (error == 0) {
                object = offset >> DNODE_SHIFT;
            }
        }
        /*
         * Note: if "restarted", we may find a L0 that
         * is not suitably aligned.
         */
        os->os_obj_next_chunk =
            P2ALIGN(object, dnodes_per_chunk) + dnodes_per_chunk;
        (void) atomic_swap_64(cpuobj, object);
        mutex_exit(&os->os_obj_lock);
    }

    /*
     * XXX We should check for an i/o error here and return
     * up to our caller. Actually we should pre-read it in
     * dmu_tx_assign(), but there is currently no mechanism
     * to do so.
     */
    (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
        dn_slots, FTAG, &dn);
    if (dn != NULL) {
        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
                bits >= 0; bits -= epbs)
                    txh->txh_fudge += 1ULL << max_ibs;
                goto out;
            }
            off += delta;
            if (len >= delta)
                len -= delta;
            delta = dn->dn_datablksz;
        }
    }

    /*
     * 'end' is the last thing we will access, not one past.
     * This way we won't overflow when accessing the last byte.
     */
    start = P2ALIGN(off, 1ULL << max_bs);
    end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
    txh->txh_space_towrite += end - start + 1;

    start >>= min_bs;
    end >>= min_bs;

    epbs = min_ibs - SPA_BLKPTRSHIFT;

    /*
     * The object contains at most 2^(64 - min_bs) blocks,
     * and each indirect level maps 2^epbs.
     */
    for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
        start >>= epbs;
        end >>= epbs;
/*
 * Find something to roll, then if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK then read the master
 * and overlay the deltas.
 * Returns:
 *	0 if successful
 *	1 on finding nothing to roll
 *	2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
    offset_t mof;
    buf_t *bp;
    rollbuf_t *rbp;
    mt_map_t *logmap = ul->un_logmap;
    daddr_t mblkno;
    int i;
    int error;
    int nbuf;

    /*
     * Make sure there is really something to roll
     */
    mof = 0;
    if (!logmap_next_roll(logmap, &mof)) {
        return (1);
    }

    /*
     * build some master blocks + deltas to roll forward
     */
    rw_enter(&logmap->mtm_rwlock, RW_READER);
    nbuf = 0;
    do {
        mof = mof & (offset_t)MAPBLOCKMASK;
        mblkno = lbtodb(mof);

        /*
         * Check for the case of a new delta to a set up buffer
         */
        for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
            if (P2ALIGN(rbp->rb_bh.b_blkno,
                MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
                TNF_PROBE_0(trans_roll_new_delta, "lufs",
                    /* CSTYLED */);
                trans_roll_new_delta++;
                /* Flush out the current set of buffers */
                goto flush_bufs;
            }
        }

        /*
         * Work out what to roll next. If it isn't cached then read
         * it asynchronously from the master.
         */
        bp = &rbp->rb_bh;
        bp->b_blkno = mblkno;
        bp->b_flags = B_READ;
        bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
        bp->b_bufsize = MAPBLOCKSIZE;
        if (top_read_roll(rbp, ul)) {
            /* logmap deltas were in use */
            if (nbuf == 0) {
                /*
                 * On first buffer wait for the logmap user
                 * to finish by grabbing the logmap lock
                 * exclusively rather than spinning
                 */
                rw_exit(&logmap->mtm_rwlock);
                lrr_wait++;
                rw_enter(&logmap->mtm_rwlock, RW_WRITER);
                rw_exit(&logmap->mtm_rwlock);
                return (1);
            }
            /* we have at least one buffer - flush it */
            goto flush_bufs;
        }
        if ((bp->b_flags & B_INVAL) == 0) {
            nbuf++;
        }
        mof += MAPBLOCKSIZE;
    } while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));
/*
 * Construct a stack for init containing the arguments to it, then
 * pass control to exec_common.
 */
int
exec_init(const char *initpath, const char *args)
{
    caddr32_t ucp;
    caddr32_t *uap;
    caddr32_t *argv;
    caddr32_t exec_fnamep;
    char *scratchargs;
    int i, sarg;
    size_t argvlen, alen;
    boolean_t in_arg;
    int argc = 0;
    int error = 0, count = 0;
    proc_t *p = ttoproc(curthread);
    klwp_t *lwp = ttolwp(curthread);
    int brand_action;

    if (args == NULL)
        args = "";

    alen = strlen(initpath) + 1 + strlen(args) + 1;
    scratchargs = kmem_alloc(alen, KM_SLEEP);
    (void) snprintf(scratchargs, alen, "%s %s", initpath, args);

    /*
     * We do a quick two state parse of the string to sort out how big
     * argc should be.
     */
    in_arg = B_FALSE;
    for (i = 0; i < strlen(scratchargs); i++) {
        if (scratchargs[i] == ' ' || scratchargs[i] == '\0') {
            if (in_arg) {
                in_arg = B_FALSE;
                argc++;
            }
        } else {
            in_arg = B_TRUE;
        }
    }
    argvlen = sizeof (caddr32_t) * (argc + 1);
    argv = kmem_zalloc(argvlen, KM_SLEEP);

    /*
     * We pull off a bit of a hack here. We work our way through the
     * args string, putting nulls at the ends of space delimited tokens
     * (boot args don't support quoting at this time). Then we just
     * copy the whole mess to userland in one go. In other words, we
     * transform this: "init -s -r\0" into this on the stack:
     *
     *	-0x00 \0
     *	-0x01 r
     *	-0x02 -          <--------.
     *	-0x03 \0                  |
     *	-0x04 s                   |
     *	-0x05 -          <------. |
     *	-0x06 \0                | |
     *	-0x07 t                 | |
     *	-0x08 i                 | |
     *	-0x09 n                 | |
     *	-0x0a i          <---.  | |
     *	-0x10 NULL           |  | |   (argv[3])
     *	-0x14 ---------------|--|-'   (argv[2])
     *	-0x18 ---------------|--'     (argv[1])
     *	-0x1c ---------------'        (argv[0])
     *
     * Since we know the value of ucp at the beginning of this process,
     * we can trivially compute the argv[] array which we also need to
     * place in userland: argv[i] = ucp - sarg(i), where ucp is the
     * stack ptr, and sarg is the string index of the start of the
     * argument.
     */
    ucp = (caddr32_t)(uintptr_t)p->p_usrstack;

    argc = 0;
    in_arg = B_FALSE;
    sarg = 0;

    for (i = 0; i < alen; i++) {
        if (scratchargs[i] == ' ' || scratchargs[i] == '\0') {
            if (in_arg == B_TRUE) {
                in_arg = B_FALSE;
                scratchargs[i] = '\0';
                argv[argc++] = ucp - (alen - sarg);
            }
        } else if (in_arg == B_FALSE) {
            in_arg = B_TRUE;
            sarg = i;
        }
    }
    ucp -= alen;
    error |= copyout(scratchargs, (caddr_t)(uintptr_t)ucp, alen);

    uap = (caddr32_t *)P2ALIGN((uintptr_t)ucp, sizeof (caddr32_t));
    uap--;			/* advance to be below the word we're in */
    uap -= (argc + 1);	/* advance argc words down, plus one for NULL */
    error |= copyout(argv, uap, argvlen);

    if (error != 0) {
        zcmn_err(p->p_zone->zone_id, CE_WARN,
            "Could not construct stack for init.\n");
        kmem_free(argv, argvlen);
        kmem_free(scratchargs, alen);
        return (EFAULT);
    }

    exec_fnamep = argv[0];
    kmem_free(argv, argvlen);
    kmem_free(scratchargs, alen);

    /*
     * Point at the arguments.
     */
    lwp->lwp_ap = lwp->lwp_arg;
    lwp->lwp_arg[0] = (uintptr_t)exec_fnamep;
    lwp->lwp_arg[1] = (uintptr_t)uap;
    lwp->lwp_arg[2] = NULL;
    curthread->t_post_sys = 1;
    curthread->t_sysnum = SYS_execve;

    /*
     * If we are executing init from zsched, we may have inherited its
     * parent process's signal mask. Clear it now so that we behave in
     * the same way as when started from the global zone.
     */
    sigemptyset(&curthread->t_hold);

    brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE;
again:
    error = exec_common((const char *)(uintptr_t)exec_fnamep,
        (const char **)(uintptr_t)uap, NULL, brand_action);

    /*
     * Normally we would just set lwp_argsaved and t_post_sys and
     * let post_syscall reset lwp_ap for us. Unfortunately,
     * exec_init isn't always called from a system call. Instead
     * of making a mess of trap_cleanup, we just reset the args
     * pointer here.
     */
    reset_syscall_args();

    switch (error) {
    case 0:
        return (0);
    case ENOENT:
        zcmn_err(p->p_zone->zone_id, CE_WARN,
            "exec(%s) failed (file not found).\n", initpath);
        return (ENOENT);
    case EAGAIN:
    case EINTR:
        ++count;
        if (count < 5) {
            zcmn_err(p->p_zone->zone_id, CE_WARN,
                "exec(%s) failed with errno %d. Retrying...\n",
                initpath, error);
            goto again;
        }
    }

    zcmn_err(p->p_zone->zone_id, CE_WARN,
        "exec(%s) failed with errno %d.", initpath, error);
    return (error);
}
/*
 * This is the prefetch entry point. It calls all of the other dmu_zfetch
 * routines to create, delete, find, or operate upon prefetch streams.
 */
void
dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
{
    zstream_t zst;
    zstream_t *newstream;
    int fetched;
    int inserted;
    unsigned int blkshft;
    uint64_t blksz;

    if (zfs_prefetch_disable)
        return;

    /* files that aren't ln2 blocksz are only one block -- nothing to do */
    if (!zf->zf_dnode->dn_datablkshift)
        return;

    /* convert offset and size, into blockid and nblocks */
    blkshft = zf->zf_dnode->dn_datablkshift;
    blksz = (1 << blkshft);

    bzero(&zst, sizeof (zstream_t));
    zst.zst_offset = offset >> blkshft;
    zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
        P2ALIGN(offset, blksz)) >> blkshft;

    fetched = dmu_zfetch_find(zf, &zst, prefetched);
    if (!fetched) {
        fetched = dmu_zfetch_colinear(zf, &zst);
    }

    if (!fetched) {
        newstream = dmu_zfetch_stream_reclaim(zf);

        /*
         * we still couldn't find a stream, drop the lock, and allocate
         * one if possible. Otherwise, give up and go home.
         */
        if (newstream == NULL) {
            uint64_t maxblocks;
            uint32_t max_streams;
            uint32_t cur_streams;

            cur_streams = zf->zf_stream_cnt;
            maxblocks = zf->zf_dnode->dn_maxblkid;

            max_streams = MIN(zfetch_max_streams,
                (maxblocks / zfetch_block_cap));
            if (max_streams == 0) {
                max_streams++;
            }

            if (cur_streams >= max_streams) {
                return;
            }

            newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
        }

        newstream->zst_offset = zst.zst_offset;
        newstream->zst_len = zst.zst_len;
        newstream->zst_stride = zst.zst_len;
        newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
        newstream->zst_cap = zst.zst_len;
        newstream->zst_direction = ZFETCH_FORWARD;
        newstream->zst_last = lbolt;

        mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);

        rw_enter(&zf->zf_rwlock, RW_WRITER);
        inserted = dmu_zfetch_stream_insert(zf, newstream);
        rw_exit(&zf->zf_rwlock);

        if (!inserted) {
            mutex_destroy(&newstream->zst_lock);
            kmem_free(newstream, sizeof (zstream_t));
        }
    }
}
static inline slice_t *
slice_small_get_slice_from_row(void *buf, small_allocatable_row_t **row)
{
    (*row) = (small_allocatable_row_t *)buf;
    return ((slice_t *)P2ALIGN((uint64_t)buf, (uint64_t)PAGE_SIZE));
}
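/*
 * Illustration of the pointer-masking trick above: if a slice header is
 * always placed at the start of a page-aligned region, any object inside
 * that page can recover its owning header by masking off the low bits of
 * its own address. The types, MY_PAGE_SIZE, and 4K page are hypothetical;
 * sketch only.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MY_PAGE_SIZE        4096ULL
#define P2ALIGN(x, align)   ((x) & -(align))

typedef struct slice {
    uint64_t s_magic;       /* header lives at the page base */
} slice_t;

int
main(void)
{
    /* Carve out a page-aligned region, as a slab allocator would. */
    void *page;
    if (posix_memalign(&page, MY_PAGE_SIZE, MY_PAGE_SIZE) != 0)
        return (1);
    ((slice_t *)page)->s_magic = 0xdecafbad;

    /* An object somewhere inside the same page... */
    char *obj = (char *)page + 1000;

    /* ...finds its header by rounding its own address down. */
    slice_t *hdr = (slice_t *)P2ALIGN((uintptr_t)obj, MY_PAGE_SIZE);
    (void) printf("magic %llx\n", (unsigned long long)hdr->s_magic);

    free(page);
    return (0);
}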
/*
 * Read data from the cache. Returns 0 on cache hit, errno on a miss.
 */
int
vdev_cache_read(zio_t *zio)
{
    vdev_cache_t *vc = &zio->io_vd->vdev_cache;
    vdev_cache_entry_t *ve, ve_search;
    uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
    uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
    zio_t *fio;

    ASSERT(zio->io_type == ZIO_TYPE_READ);

    if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
        return (EINVAL);

    if (zio->io_size > zfs_vdev_cache_max)
        return (EOVERFLOW);

    /*
     * If the I/O straddles two or more cache blocks, don't cache it.
     */
    if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
        return (EXDEV);

    ASSERT(cache_phase + zio->io_size <= VCBS);

    mutex_enter(&vc->vc_lock);

    ve_search.ve_offset = cache_offset;
    ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);

    if (ve != NULL) {
        if (ve->ve_missed_update) {
            mutex_exit(&vc->vc_lock);
            return (ESTALE);
        }

        if ((fio = ve->ve_fill_io) != NULL) {
            zio_vdev_io_bypass(zio);
            zio_add_child(zio, fio);
            mutex_exit(&vc->vc_lock);
            VDCSTAT_BUMP(vdc_stat_delegations);
            return (0);
        }

        vdev_cache_hit(vc, ve, zio);
        zio_vdev_io_bypass(zio);

        mutex_exit(&vc->vc_lock);
        VDCSTAT_BUMP(vdc_stat_hits);
        return (0);
    }

    ve = vdev_cache_allocate(zio);
    if (ve == NULL) {
        mutex_exit(&vc->vc_lock);
        return (ENOMEM);
    }

    fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
        ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
        ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);

    ve->ve_fill_io = fio;
    zio_vdev_io_bypass(zio);
    zio_add_child(zio, fio);

    mutex_exit(&vc->vc_lock);
    zio_nowait(fio);
    VDCSTAT_BUMP(vdc_stat_misses);

    return (0);
}
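/*
 * Illustration of the straddle check above: P2BOUNDARY(off, len, align)
 * is true when [off, off + len) crosses an align-sized boundary, i.e.
 * the first and last byte fall in different chunks. The macro below
 * follows the shape of the usual illumos definition; the demo values
 * and the 64K cache block are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define P2BOUNDARY(off, len, align) \
    (((off) ^ ((off) + (len) - 1)) > (align) - 1)

int
main(void)
{
    uint64_t vcbs = 1ULL << 16;     /* hypothetical 64K cache block */

    /* [4096, 8192) stays inside block 0: cacheable, prints 0. */
    (void) printf("%d\n", (int)P2BOUNDARY(4096ULL, 4096ULL, vcbs));
    /* [61440, 69632) spans blocks 0 and 1: rejected (EXDEV), prints 1. */
    (void) printf("%d\n", (int)P2BOUNDARY(61440ULL, 8192ULL, vcbs));
    return (0);
}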
/*
 * This routine assumes that the stack grows downward.
 * Returns 0 on success, errno on failure.
 */
int
grow_internal(caddr_t sp, uint_t growszc)
{
    struct proc *p = curproc;
    size_t newsize;
    size_t oldsize;
    int error;
    size_t pgsz;
    uint_t szc;
    struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

    ASSERT(sp < p->p_usrstack);
    sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);

    /*
     * grow to growszc alignment but use current p->p_stkpageszc for
     * the segvn_crargs szc passed to segvn_create. For memcntl to
     * increase the szc, this allows the new extension segment to be
     * concatenated successfully with the existing stack segment.
     */
    if ((szc = growszc) != 0) {
        pgsz = page_get_pagesize(szc);
        ASSERT(pgsz > PAGESIZE);
        newsize = p->p_usrstack -
            (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
        if (newsize > (size_t)p->p_stk_ctl) {
            szc = 0;
            pgsz = PAGESIZE;
            newsize = p->p_usrstack - sp;
        }
    } else {
        pgsz = PAGESIZE;
        newsize = p->p_usrstack - sp;
    }

    if (newsize > (size_t)p->p_stk_ctl) {
        (void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls,
            p, RCA_UNSAFE_ALL);
        return (ENOMEM);
    }

    oldsize = p->p_stksize;
    ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);

    if (newsize <= oldsize) {	/* prevent the stack from shrinking */
        return (0);
    }

    if (!(p->p_stkprot & PROT_EXEC)) {
        crargs.prot &= ~PROT_EXEC;
    }
    /*
     * extend stack with the proposed new growszc, which is different
     * than p_stkpageszc only on a memcntl to increase the stack pagesize.
     * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
     * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
     * if not aligned to szc's pgsz.
     */
    if (szc > 0) {
        caddr_t oldsp = p->p_usrstack - oldsize;
        caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
            pgsz);

        if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
            crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
                AS_MAP_NO_LPOOB;
        } else if (oldsp == austk) {
            crargs.szc = szc;
        } else {
            crargs.szc = AS_MAP_STACK;
        }
    } else {
        crargs.szc = AS_MAP_NO_LPOOB;
    }
    crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;

    if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
        segvn_create, &crargs)) != 0) {
        if (error == EAGAIN) {
            cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
                "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
        }
        return (error);
    }
    p->p_stksize = newsize;
    return (0);
}
/*
 * Read data from the cache. Returns 0 on cache hit, errno on a miss.
 */
int
vdev_cache_read(zio_t *zio)
{
    vdev_cache_t *vc = &zio->io_vd->vdev_cache;
    vdev_cache_entry_t *ve, ve_search;
    uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
    uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
    zio_t *fio;

    ASSERT(zio->io_type == ZIO_TYPE_READ);

    if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
        return (EINVAL);

    if (zio->io_size > zfs_vdev_cache_max)
        return (EOVERFLOW);

    /*
     * If the I/O straddles two or more cache blocks, don't cache it.
     */
    if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS))
        return (EXDEV);

    ASSERT(cache_phase + zio->io_size <= VCBS);

    mutex_enter(&vc->vc_lock);

    ve_search.ve_offset = cache_offset;
    ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);

    if (ve != NULL) {
        if (ve->ve_missed_update) {
            mutex_exit(&vc->vc_lock);
            return (ESTALE);
        }

        if ((fio = ve->ve_fill_io) != NULL) {
            zio->io_delegate_next = fio->io_delegate_list;
            fio->io_delegate_list = zio;
            zio_vdev_io_bypass(zio);
            mutex_exit(&vc->vc_lock);
            return (0);
        }

        vdev_cache_hit(vc, ve, zio);
        zio_vdev_io_bypass(zio);

        mutex_exit(&vc->vc_lock);
        zio_next_stage(zio);
        return (0);
    }

    if (!(zio->io_flags & ZIO_FLAG_METADATA)) {
        mutex_exit(&vc->vc_lock);
        return (EINVAL);
    }

    ve = vdev_cache_allocate(zio);
    if (ve == NULL) {
        mutex_exit(&vc->vc_lock);
        return (ENOMEM);
    }

    fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
        ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
        ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
        ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK,
        vdev_cache_fill, ve);

    ve->ve_fill_io = fio;
    fio->io_delegate_list = zio;
    zio_vdev_io_bypass(zio);

    mutex_exit(&vc->vc_lock);
    zio_nowait(fio);

    return (0);
}
/*
 * Returns an intersection of the [start, end] interval and the range
 * specified by the -A flag [start_addr, end_addr]. Unspecified parts of
 * the address range have the value INVALID_ADDRESS.
 *
 * The start_addr address is rounded down to the beginning of page and
 * end_addr is rounded up to the end of page.
 *
 * Returns the size of the resulting interval or zero if the interval is
 * empty or invalid.
 */
static size_t
adjust_addr_range(uintptr_t start, uintptr_t end, size_t psz,
    uintptr_t *new_start, uintptr_t *new_end)
{
    uintptr_t from;		/* start_addr rounded down */
    uintptr_t to;		/* end_addr rounded up */

    /*
     * Round down the lower address of the range to the beginning of page.
     */
    if (start_addr == INVALID_ADDRESS) {
        /*
         * No start_addr specified by -A, the lower part of the
         * interval does not change.
         */
        *new_start = start;
    } else {
        from = P2ALIGN(start_addr, psz);
        /*
         * If end address is outside the range, return an empty
         * interval
         */
        if (end < from) {
            *new_start = *new_end = 0;
            return (0);
        }
        /*
         * The adjusted start address is the maximum of requested
         * start and the aligned start_addr of the -A range.
         */
        *new_start = start < from ? from : start;
    }

    /*
     * Round up the higher address of the range to the end of page.
     */
    if (end_addr == INVALID_ADDRESS) {
        /*
         * No end_addr specified by -A, the upper part of the interval
         * does not change.
         */
        *new_end = end;
    } else {
        /*
         * If only one address is specified and it is the beginning of
         * a segment, get information about the whole segment. This
         * function is called once per segment and the 'end' argument
         * is always the end of a segment, so just use the 'end' value.
         */
        to = (end_addr == start_addr && start == start_addr) ?
            end : P2ALIGN(end_addr + psz, psz);
        /*
         * If start address is outside the range, return an empty
         * interval
         */
        if (start > to) {
            *new_start = *new_end = 0;
            return (0);
        }
        /*
         * The adjusted end address is the minimum of requested end
         * and the aligned end_addr of the -A range.
         */
        *new_end = end > to ? to : end;
    }

    /*
     * Make sure that the resulting interval is legal.
     */
    if (*new_end < *new_start)
        *new_start = *new_end = 0;

    /* Return the size of the interval */
    return (*new_end - *new_start);
}
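/*
 * Worked example of the interval logic above: intersect a segment
 * [start, end] with a -A window whose ends are rounded outward to page
 * boundaries. Hypothetical 4K pages and addresses; sketch only.
 */
#include <stdint.h>
#include <stdio.h>

#define P2ALIGN(x, align)   ((x) & -(align))

int
main(void)
{
    uintptr_t psz = 4096;
    uintptr_t start_addr = 0x1100, end_addr = 0x2100;   /* -A window */
    uintptr_t seg_start = 0x0800, seg_end = 0x5000;     /* segment */

    uintptr_t from = P2ALIGN(start_addr, psz);          /* 0x1000 */
    uintptr_t to = P2ALIGN(end_addr + psz, psz);        /* 0x3000 */
    uintptr_t new_start = seg_start < from ? from : seg_start;
    uintptr_t new_end = seg_end > to ? to : seg_end;

    /* Prints [1000, 3000], 8192 bytes. */
    (void) printf("[%lx, %lx], %lu bytes\n",
        (unsigned long)new_start, (unsigned long)new_end,
        (unsigned long)(new_end - new_start));
    return (0);
}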
/*
 * Algorithm: call arch-specific map_pgsz to get best page size to use,
 * then call grow_internal().
 * Returns 0 on success.
 */
static int
grow_lpg(caddr_t sp)
{
    struct proc *p = curproc;
    size_t pgsz;
    size_t len, newsize;
    caddr_t addr, saddr;
    caddr_t growend;
    int oszc, szc;
    int err;

    newsize = p->p_usrstack - sp;

    oszc = p->p_stkpageszc;
    pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
    szc = page_szc(pgsz);

    /*
     * Covers two cases:
     * 1. page_szc() returns -1 for invalid page size, so we want to
     * ignore it in that case.
     * 2. By design we never decrease page size, as it is more stable.
     * This shouldn't happen as the stack never shrinks.
     */
    if (szc <= oszc) {
        err = grow_internal(sp, oszc);
        /* failed, fall back to base page size */
        if (err != 0 && oszc != 0) {
            err = grow_internal(sp, 0);
        }
        return (err);
    }

    /*
     * We've grown sufficiently to switch to a new page size.
     * So we are going to remap the whole segment with the new page size.
     */
    err = grow_internal(sp, szc);
    /* The grow with szc failed, so fall back to base page size. */
    if (err != 0) {
        if (szc != 0) {
            err = grow_internal(sp, 0);
        }
        return (err);
    }

    /*
     * Round up stack pointer to a large page boundary and remap
     * any pgsz pages in the segment already faulted in beyond that
     * point.
     */
    saddr = p->p_usrstack - p->p_stksize;
    addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
    growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
    len = growend - addr;
    /* Check that len is not negative. Update page size code for stack. */
    if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
        (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
        p->p_stkpageszc = szc;
    }

    ASSERT(err == 0);
    return (err);		/* should always be 0 */
}