uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { objset_impl_t *osi = os->os; uint64_t object; uint64_t L2_dnode_count = DNODES_PER_BLOCK << (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT); dnode_t *dn = NULL; int restarted = B_FALSE; mutex_enter(&osi->os_obj_lock); for (;;) { object = osi->os_obj_next; /* * Each time we polish off an L2 bp worth of dnodes * (2^13 objects), move to another L2 bp that's still * reasonably sparse (at most 1/4 full). Look from the * beginning once, but after that keep looking from here. * If we can't find one, just keep going from here. */ if (P2PHASE(object, L2_dnode_count) == 0) { uint64_t offset = restarted ? object << DNODE_SHIFT : 0; int error = dnode_next_offset(osi->os_meta_dnode, DNODE_FIND_HOLE, &offset, 2, DNODES_PER_BLOCK >> 2, 0); restarted = B_TRUE; if (error == 0) object = offset >> DNODE_SHIFT; } osi->os_obj_next = ++object; /* * XXX We should check for an i/o error here and return * up to our caller. Actually we should pre-read it in * dmu_tx_assign(), but there is currently no mechanism * to do so. */ (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn); if (dn) break; if (dmu_object_next(os, &object, B_TRUE, 0) == 0) osi->os_obj_next = object - 1; }
/*
 * Serve a read out of a cache entry whose data is already present.
 *
 * The caller must hold vc->vc_lock, and the entry must not have a fill
 * I/O outstanding.  The entry's last-used stamp is brought up to date,
 * its hit counter is incremented, and zio->io_size bytes are copied from
 * the entry's buffer (starting at the zio's offset within the cache
 * block) into zio->io_data.
 */
static void
vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
{
	ASSERT(MUTEX_HELD(&vc->vc_lock));
	ASSERT(ve->ve_fill_io == NULL);

	/*
	 * Take the entry out of the last-used tree while its stamp changes
	 * and put it back afterwards (presumably the tree is ordered by
	 * ve_lastused -- confirm against the tree's comparator).  Nothing
	 * to do when the stamp is already current.
	 */
	if (ve->ve_lastused != lbolt) {
		avl_remove(&vc->vc_lastused_tree, ve);
		ve->ve_lastused = lbolt;
		avl_add(&vc->vc_lastused_tree, ve);
	}

	ve->ve_hits++;

	/* P2PHASE() gives the zio's byte offset inside the VCBS-sized block. */
	bcopy(ve->ve_data + P2PHASE(zio->io_offset, VCBS), zio->io_data,
	    zio->io_size);
}
/*
 * Append a copy of *bp to the bplist and update the list's accounting.
 *
 * The stored copy has its fill count cleared and, unless the block
 * pointer is marked dedup, its checksum zeroed.  bpl_entries and
 * bpl_bytes are always updated; bpl_comp and bpl_uncomp only when
 * bpl_havecomp is set.
 *
 * bpl_lock is held for the whole operation and is released on every
 * return path, including the early failure of bplist_hold().
 *
 * Returns 0 on success, otherwise the error reported by bplist_hold()
 * or bplist_cache().
 */
int
bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx)
{
	uint64_t blk, off;
	blkptr_t *bparray;
	int err;

	ASSERT(!BP_IS_HOLE(bp));

	mutex_enter(&bpl->bpl_lock);

	err = bplist_hold(bpl);
	if (err)
		goto out;

	/* Split the entry index into (block number, slot within block). */
	blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
	off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);

	err = bplist_cache(bpl, blk);
	if (err)
		goto out;

	dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
	bparray = bpl->bpl_cached_dbuf->db_data;
	bparray[off] = *bp;

	/* We never need the fill count. */
	bparray[off].blk_fill = 0;

	/* The bplist will compress better if we can leave off the checksum */
	if (!BP_GET_DEDUP(&bparray[off]))
		bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));

	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
	bpl->bpl_phys->bpl_entries++;
	bpl->bpl_phys->bpl_bytes +=
	    bp_get_dsize_sync(dmu_objset_spa(bpl->bpl_mos), bp);
	if (bpl->bpl_havecomp) {
		bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
		bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
	}

	/* err is 0 here: both fallible calls above succeeded. */
out:
	mutex_exit(&bpl->bpl_lock);
	return (err);
}
static void umem_log_enter(const char *error_str, int serious) { #ifdef __native_client__ if ( s_zfslog_fd < 0 ){ s_zfslog_fd = open(ZVM_ZFS_LOG, O_WRONLY); } write(s_zfslog_fd, error_str, strlen(error_str)); #else int looped; char c; looped = 0; #ifdef ECELERITY mem_printf(serious ? DCRITICAL : DINFO, "umem: %s", error_str); #endif (void) mutex_lock(&umem_error_lock); while ((c = *error_str++) != '\0') { WRITE_AND_INC(umem_error_end, c); if (umem_error_end == umem_error_begin) looped = 1; } umem_error_buffer[umem_error_end] = 0; if (looped) { uint_t idx; umem_error_begin = P2PHASE(umem_error_end + 1, ERR_SIZE); idx = umem_error_begin; WRITE_AND_INC(idx, '.'); WRITE_AND_INC(idx, '.'); WRITE_AND_INC(idx, '.'); } (void) mutex_unlock(&umem_error_lock); #endif //__native_client__ }
/* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dnode_t *dn = txh->txh_dnode; uint64_t start, end, i; int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; int err = 0; if (len == 0) return; min_bs = SPA_MINBLOCKSHIFT; max_bs = SPA_MAXBLOCKSHIFT; min_ibs = DN_MIN_INDBLKSHIFT; max_ibs = DN_MAX_INDBLKSHIFT; if (dn) { uint64_t history[DN_MAX_LEVELS]; int nlvls = dn->dn_nlevels; int delta; /* * For i/o error checking, read the first and last level-0 * blocks (if they are not aligned), and all the level-1 blocks. */ if (dn->dn_maxblkid == 0) { delta = dn->dn_datablksz; start = (off < dn->dn_datablksz) ? 0 : 1; end = (off+len <= dn->dn_datablksz) ? 0 : 1; if (start == 0 && (off > 0 || len < dn->dn_datablksz)) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err) goto out; delta -= off; } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* first level-0 block */ start = off >> dn->dn_datablkshift; if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { err = dmu_tx_check_ioerr(zio, dn, 0, start); if (err) goto out; } /* last level-0 block */ end = (off+len-1) >> dn->dn_datablkshift; if (end != start && end <= dn->dn_maxblkid && P2PHASE(off+len, dn->dn_datablksz)) { err = dmu_tx_check_ioerr(zio, dn, 0, end); if (err) goto out; } /* level-1 blocks */ if (nlvls > 1) { int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (i = (start>>shft)+1; i < end>>shft; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err) goto out; } } err = zio_wait(zio); if (err) goto out; delta = P2NPHASE(off, dn->dn_datablksz); } min_ibs = max_ibs = dn->dn_indblkshift; if (dn->dn_maxblkid > 0) { /* * The blocksize can't change, * so we can make a more precise estimate. */ ASSERT(dn->dn_datablkshift != 0); min_bs = max_bs = dn->dn_datablkshift; } /* * If this write is not off the end of the file * we need to account for overwrites/unref. 
*/ if (start <= dn->dn_maxblkid) { for (int l = 0; l < DN_MAX_LEVELS; l++) history[l] = -1ULL; } while (start <= dn->dn_maxblkid) { dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); rw_exit(&dn->dn_struct_rwlock); if (err) { txh->txh_tx->tx_err = err; return; } dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE, history); dbuf_rele(db, FTAG); if (++start > end) { /* * Account for new indirects appearing * before this IO gets assigned into a txg. */ bits = 64 - min_bs; epbs = min_ibs - SPA_BLKPTRSHIFT; for (bits -= epbs * (nlvls - 1); bits >= 0; bits -= epbs) txh->txh_fudge += 1ULL << max_ibs; goto out; } off += delta; if (len >= delta) len -= delta; delta = dn->dn_datablksz; } }
/*
 * Create intent-log records for a write of `resid' bytes at offset `off'
 * of file `zp'.
 *
 * Nothing is logged while the log is replaying or once the znode is
 * unlinked.  Otherwise the write is cut into one itx per loop pass:
 *
 *   WR_INDIRECT  - chosen when the log bias is throughput, or when the
 *                  pool has no separate log devices and the write is at
 *                  least zfs_immediate_write_sz bytes.  Each itx stops at
 *                  the next `z_blksz' boundary.
 *   WR_COPIED    - chosen for FSYNC/FDSYNC writes.  The data is read back
 *                  with dmu_read() into the space following the record.
 *                  Falls back to WR_NEED_COPY when the remaining length
 *                  exceeds ZIL_MAX_COPIED_DATA or the read fails.
 *   WR_NEED_COPY - everything else.
 *
 * If the calling thread's zfs_fsyncer_key value is non-zero it is
 * decremented by one.  An itx is marked non-synchronous only when the
 * write is not FSYNC/FDSYNC, the znode's z_sync_cnt is zero, and that
 * per-thread value was zero on entry.
 */
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
    znode_t *zp, offset_t off, ssize_t resid, int ioflag)
{
	const int sync_io = (ioflag & (FSYNC | FDSYNC)) != 0;
	uint32_t blksz = zp->z_blksz;
	itx_wr_state_t base_state;
	uintptr_t fsyncers;

	if (zil_replaying(zilog, tx) || zp->z_unlinked)
		return;

	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT ||
	    (!spa_has_slogs(zilog->zl_spa) &&
	    resid >= zfs_immediate_write_sz)) {
		base_state = WR_INDIRECT;
	} else {
		base_state = sync_io ? WR_COPIED : WR_NEED_COPY;
	}

	fsyncers = (uintptr_t)tsd_get(zfs_fsyncer_key);
	if (fsyncers != 0)
		(void) tsd_set(zfs_fsyncer_key, (void *)(fsyncers - 1));

	while (resid != 0) {
		itx_wr_state_t state = base_state;
		ssize_t len = resid;
		size_t reclen;
		itx_t *itx;
		lr_write_t *lr;

		if (state == WR_INDIRECT) {
			/* Stop at the end of the current block. */
			len = MIN(blksz - P2PHASE(off, blksz), resid);
		} else if (state == WR_COPIED &&
		    resid > ZIL_MAX_COPIED_DATA) {
			state = WR_NEED_COPY;
		}

		/* Only WR_COPIED records carry the data inline. */
		reclen = sizeof (*lr);
		if (state == WR_COPIED)
			reclen += len;

		itx = zil_itx_create(txtype, reclen);
		lr = (lr_write_t *)&itx->itx_lr;

		if (state == WR_COPIED &&
		    dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, lr + 1,
		    DMU_READ_NO_PREFETCH) != 0) {
			/* Could not copy the data: log a bare record. */
			zil_itx_destroy(itx);
			itx = zil_itx_create(txtype, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			state = WR_NEED_COPY;
		}

		itx->itx_wr_state = state;
		lr->lr_foid = zp->z_id;
		lr->lr_offset = off;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = zp->z_zfsvfs;

		if (!sync_io && zp->z_sync_cnt == 0 && fsyncers == 0)
			itx->itx_sync = B_FALSE;

		zil_itx_assign(zilog, itx, tx);

		off += len;
		resid -= len;
	}
}
/* * Load the space map disk into the specified range tree. Segments of maptype * are added to the range tree, other segment types are removed. * * Note: space_map_load() will drop sm_lock across dmu_read() calls. * The caller must be OK with this. */ int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) { uint64_t *entry, *entry_map, *entry_map_end; uint64_t bufsize, size, offset, end, space; int error = 0; ASSERT(MUTEX_HELD(sm->sm_lock)); end = space_map_length(sm); space = space_map_allocated(sm); VERIFY0(range_tree_space(rt)); if (maptype == SM_FREE) { range_tree_add(rt, sm->sm_start, sm->sm_size); space = sm->sm_size - space; } bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); entry_map = zio_buf_alloc(bufsize); mutex_exit(sm->sm_lock); if (end > bufsize) { dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize, end - bufsize, ZIO_PRIORITY_SYNC_READ); } mutex_enter(sm->sm_lock); for (offset = 0; offset < end; offset += bufsize) { size = MIN(end - offset, bufsize); VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); VERIFY(size != 0); ASSERT3U(sm->sm_blksz, !=, 0); dprintf("object=%llu offset=%llx size=%llx\n", space_map_object(sm), offset, size); mutex_exit(sm->sm_lock); error = dmu_read(sm->sm_os, space_map_object(sm), offset, size, entry_map, DMU_READ_PREFETCH); mutex_enter(sm->sm_lock); if (error != 0) break; entry_map_end = entry_map + (size / sizeof (uint64_t)); for (entry = entry_map; entry < entry_map_end; entry++) { uint64_t e = *entry; uint64_t offset, size; if (SM_DEBUG_DECODE(e)) /* Skip debug entries */ continue; offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) + sm->sm_start; size = SM_RUN_DECODE(e) << sm->sm_shift; VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift)); VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift)); VERIFY3U(offset, >=, sm->sm_start); VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size); if (SM_TYPE_DECODE(e) == maptype) { VERIFY3U(range_tree_space(rt) + size, <=, sm->sm_size); range_tree_add(rt, offset, size); } else { 
range_tree_remove(rt, offset, size); } }
/*
 * Try to satisfy a read through the vdev cache.
 *
 * Returns 0 when the cache has taken over the zio: it was copied from a
 * filled entry (hit), attached as a parent of a fill I/O already in
 * flight (delegation), or attached to a newly issued fill I/O for its
 * VCBS-aligned block (miss).  Returns an errno when the cache declines:
 *
 *   EINVAL     the zio has ZIO_FLAG_DONT_CACHE set
 *   EOVERFLOW  the zio is larger than zfs_vdev_cache_max
 *   EXDEV      the zio spans more than one cache block
 *   ESTALE     the matching entry has ve_missed_update set
 *   ENOMEM     no cache entry could be allocated
 */
int
vdev_cache_read(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	uint64_t blk_start = P2ALIGN(zio->io_offset, VCBS);
	vdev_cache_entry_t key;
	vdev_cache_entry_t *ent;
	zio_t *fill;

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
		return (EINVAL);

	if (zio->io_size > zfs_vdev_cache_max)
		return (EOVERFLOW);

	/*
	 * If the I/O straddles two or more cache blocks, don't cache it.
	 */
	if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
		return (EXDEV);

	ASSERT(P2PHASE(zio->io_offset, VCBS) + zio->io_size <= VCBS);

	mutex_enter(&vc->vc_lock);

	key.ve_offset = blk_start;
	ent = avl_find(&vc->vc_offset_tree, &key, NULL);

	if (ent == NULL) {
		/* Miss: set up an entry and start filling the whole block. */
		ent = vdev_cache_allocate(zio);
		if (ent == NULL) {
			mutex_exit(&vc->vc_lock);
			return (ENOMEM);
		}

		fill = zio_vdev_delegated_io(zio->io_vd, blk_start,
		    ent->ve_data, VCBS, ZIO_TYPE_READ,
		    ZIO_PRIORITY_CACHE_FILL, ZIO_FLAG_DONT_CACHE,
		    vdev_cache_fill, ent);

		ent->ve_fill_io = fill;
		zio_vdev_io_bypass(zio);
		zio_add_child(zio, fill);

		mutex_exit(&vc->vc_lock);
		zio_nowait(fill);
		VDCSTAT_BUMP(vdc_stat_misses);

		return (0);
	}

	if (ent->ve_missed_update) {
		mutex_exit(&vc->vc_lock);
		return (ESTALE);
	}

	fill = ent->ve_fill_io;
	if (fill != NULL) {
		/* A fill is already under way: wait on it. */
		zio_vdev_io_bypass(zio);
		zio_add_child(zio, fill);
		mutex_exit(&vc->vc_lock);
		VDCSTAT_BUMP(vdc_stat_delegations);
		return (0);
	}

	/* Hit: the entry's data is valid, copy it out. */
	vdev_cache_hit(vc, ent, zio);
	zio_vdev_io_bypass(zio);

	mutex_exit(&vc->vc_lock);
	VDCSTAT_BUMP(vdc_stat_hits);
	return (0);
}
/*
 * Try to satisfy a read through the vdev cache.
 *
 * Returns 0 when the cache has taken over the zio: it was copied from a
 * filled entry and advanced with zio_next_stage() (hit), queued on the
 * delegate list of a fill I/O already in flight, or made the first
 * delegate of a newly issued fill I/O for its VCBS-aligned block.
 * Returns an errno when the cache declines:
 *
 *   EINVAL     the zio has ZIO_FLAG_DONT_CACHE set, or there is no
 *              matching entry and the zio is not ZIO_FLAG_METADATA
 *   EOVERFLOW  the zio is larger than zfs_vdev_cache_max
 *   EXDEV      the zio spans more than one cache block
 *   ESTALE     the matching entry has ve_missed_update set
 *   ENOMEM     no cache entry could be allocated
 */
int
vdev_cache_read(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	uint64_t blk_start = P2ALIGN(zio->io_offset, VCBS);
	vdev_cache_entry_t key;
	vdev_cache_entry_t *ent;
	zio_t *fill;

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
		return (EINVAL);

	if (zio->io_size > zfs_vdev_cache_max)
		return (EOVERFLOW);

	/*
	 * If the I/O straddles two or more cache blocks, don't cache it.
	 */
	if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS))
		return (EXDEV);

	ASSERT(P2PHASE(zio->io_offset, VCBS) + zio->io_size <= VCBS);

	mutex_enter(&vc->vc_lock);

	key.ve_offset = blk_start;
	ent = avl_find(&vc->vc_offset_tree, &key, NULL);

	if (ent == NULL) {
		/* Only metadata reads are allowed to start a new fill. */
		if (!(zio->io_flags & ZIO_FLAG_METADATA)) {
			mutex_exit(&vc->vc_lock);
			return (EINVAL);
		}

		ent = vdev_cache_allocate(zio);
		if (ent == NULL) {
			mutex_exit(&vc->vc_lock);
			return (ENOMEM);
		}

		fill = zio_vdev_child_io(zio, NULL, zio->io_vd, blk_start,
		    ent->ve_data, VCBS, ZIO_TYPE_READ,
		    ZIO_PRIORITY_CACHE_FILL,
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
		    ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK,
		    vdev_cache_fill, ent);

		ent->ve_fill_io = fill;
		fill->io_delegate_list = zio;
		zio_vdev_io_bypass(zio);

		mutex_exit(&vc->vc_lock);
		zio_nowait(fill);

		return (0);
	}

	if (ent->ve_missed_update) {
		mutex_exit(&vc->vc_lock);
		return (ESTALE);
	}

	fill = ent->ve_fill_io;
	if (fill != NULL) {
		/* Push this zio onto the in-flight fill's delegate list. */
		zio->io_delegate_next = fill->io_delegate_list;
		fill->io_delegate_list = zio;
		zio_vdev_io_bypass(zio);
		mutex_exit(&vc->vc_lock);
		return (0);
	}

	/* Hit: copy the data out, then move the zio along its pipeline. */
	vdev_cache_hit(vc, ent, zio);
	zio_vdev_io_bypass(zio);

	mutex_exit(&vc->vc_lock);
	zio_next_stage(zio);
	return (0);
}
/*
 * Log a zvol write of `len' bytes at volume offset `off'; `addr' points
 * at the data for offset `off'.
 *
 * Writes of at most zvol_immediate_write_sz bytes are logged as a single
 * immediate itx.  Larger writes are walked one data block at a time:
 * a piece no larger than zvol_immediate_write_sz gets an immediate itx,
 * any other piece gets a TX_WRITE record whose block pointer is filled in
 * by dmu_sync() on the containing block.
 *
 * `addr' is advanced in step with `off' so that every immediate itx is
 * built from the bytes that belong to its own offset.
 *
 * Returns 0 on success, or the error from dmu_object_info() or
 * dmu_sync().  On a dmu_sync() failure the itx being built is freed;
 * records already assigned for earlier pieces stay assigned.
 */
int
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len,
    char *addr)
{
	dmu_object_info_t doi;
	ssize_t nbytes;
	itx_t *itx;
	lr_write_t *lr;
	objset_t *os;
	dmu_buf_t *db;
	uint64_t txg;
	uint64_t boff;
	int error;
	uint32_t blocksize;

	/* handle common case */
	if (len <= zvol_immediate_write_sz) {
		itx = zvol_immediate_itx(off, len, addr);
		(void) zil_itx_assign(zv->zv_zilog, itx, tx);
		return (0);
	}

	txg = dmu_tx_get_txg(tx);
	os = zv->zv_objset;

	/*
	 * We need to dmu_sync() each block in the range.
	 * For this we need the blocksize.
	 */
	error = dmu_object_info(os, ZVOL_OBJ, &doi);
	if (error)
		return (error);
	blocksize = doi.doi_data_block_size;

	/*
	 * We need to immediate write or dmu_sync() each block in the range.
	 */
	while (len) {
		/* Never let one piece cross a data block boundary. */
		nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
		if (nbytes <= zvol_immediate_write_sz) {
			itx = zvol_immediate_itx(off, nbytes, addr);
		} else {
			boff = P2ALIGN_TYPED(off, blocksize, uint64_t);
			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			lr->lr_foid = ZVOL_OBJ;
			lr->lr_offset = off;
			lr->lr_length = nbytes;
			lr->lr_blkoff = off - boff;
			BP_ZERO(&lr->lr_blkptr);

			/* XXX - we should do these IOs in parallel */
			VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, boff,
			    FTAG, &db));
			ASSERT(boff == db->db_offset);
			error = dmu_sync(NULL, db, &lr->lr_blkptr,
			    txg, NULL, NULL);
			dmu_buf_rele(db, FTAG);
			if (error) {
				/*
				 * Free the header plus the log record, not
				 * just the header.
				 * NOTE(review): assumes zil_itx_create()
				 * stores the allocated record length in
				 * lrc_reclen -- confirm against zil.c.
				 */
				kmem_free(itx, offsetof(itx_t, itx_lr) +
				    itx->itx_lr.lrc_reclen);
				return (error);
			}
			itx->itx_wr_state = WR_COPIED;
		}
		(void) zil_itx_assign(zv->zv_zilog, itx, tx);
		len -= nbytes;
		off += nbytes;
		addr += nbytes;
	}
	return (0);
}
uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { uint64_t object; uint64_t L1_dnode_count = DNODES_PER_BLOCK << (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); dnode_t *dn = NULL; int dn_slots = dnodesize >> DNODE_SHIFT; boolean_t restarted = B_FALSE; uint64_t *cpuobj = NULL; int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; kpreempt_disable(); cpuobj = &os->os_obj_next_percpu[CPU_SEQID % os->os_obj_next_percpu_len]; kpreempt_enable(); if (dn_slots == 0) { dn_slots = DNODE_MIN_SLOTS; } else { ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); } /* * The "chunk" of dnodes that is assigned to a CPU-specific * allocator needs to be at least one block's worth, to avoid * lock contention on the dbuf. It can be at most one L1 block's * worth, so that the "rescan after polishing off a L1's worth" * logic below will be sure to kick in. */ if (dnodes_per_chunk < DNODES_PER_BLOCK) dnodes_per_chunk = DNODES_PER_BLOCK; if (dnodes_per_chunk > L1_dnode_count) dnodes_per_chunk = L1_dnode_count; object = *cpuobj; for (;;) { /* * If we finished a chunk of dnodes, get a new one from * the global allocator. */ if (P2PHASE(object, dnodes_per_chunk) == 0) { mutex_enter(&os->os_obj_lock); ASSERT0(P2PHASE(os->os_obj_next_chunk, dnodes_per_chunk)); object = os->os_obj_next_chunk; /* * Each time we polish off a L1 bp worth of dnodes * (2^12 objects), move to another L1 bp that's * still reasonably sparse (at most 1/4 full). Look * from the beginning at most once per txg. If we * still can't allocate from that L1 block, search * for an empty L0 block, which will quickly skip * to the end of the metadnode if no nearby L0 * blocks are empty. This fallback avoids a * pathology where full dnode blocks containing * large dnodes appear sparse because they have a * low blk_fill, leading to many failed allocation * attempts. 
In the long term a better mechanism to * search for sparse metadnode regions, such as * spacemaps, could be implemented. * * os_scan_dnodes is set during txg sync if enough * objects have been freed since the previous * rescan to justify backfilling again. * * Note that dmu_traverse depends on the behavior * that we use multiple blocks of the dnode object * before going back to reuse objects. Any change * to this algorithm should preserve that property * or find another solution to the issues described * in traverse_visitbp. */ if (P2PHASE(object, L1_dnode_count) == 0) { uint64_t offset; uint64_t blkfill; int minlvl; int error; if (os->os_rescan_dnodes) { offset = 0; os->os_rescan_dnodes = B_FALSE; } else { offset = object << DNODE_SHIFT; } blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2; minlvl = restarted ? 1 : 2; restarted = B_TRUE; error = dnode_next_offset(DMU_META_DNODE(os), DNODE_FIND_HOLE, &offset, minlvl, blkfill, 0); if (error == 0) { object = offset >> DNODE_SHIFT; } }
int zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp, uint64_t *destsizep, uint64_t *destbufsizep) { uint64_t *word, *word_end; uint64_t ciosize, gapsize, destbufsize; zio_compress_info_t *ci = &zio_compress_table[cpfunc]; char *dest; uint_t allzero; ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); /* * If the data is all zeroes, we don't even need to allocate * a block for it. We indicate this by setting *destsizep = 0. */ allzero = 1; word = src; word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize); while (word < word_end) { if (*word++ != 0) { allzero = 0; break; } } if (allzero) { *destp = NULL; *destsizep = 0; *destbufsizep = 0; return (1); } if (cpfunc == ZIO_COMPRESS_EMPTY) return (0); /* Compress at least 12.5% */ destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE); if (destbufsize == 0) return (0); dest = zio_buf_alloc(destbufsize); ciosize = ci->ci_compress(src, dest, (size_t)srcsize, (size_t)destbufsize, ci->ci_level); if (ciosize > destbufsize) { zio_buf_free(dest, destbufsize); return (0); } /* Cool. We compressed at least as much as we were hoping to. */ /* For security, make sure we don't write random heap crap to disk */ gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize; if (gapsize != 0) { bzero(dest + ciosize, gapsize); ciosize += gapsize; } ASSERT3U(ciosize, <=, destbufsize); ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0); *destp = dest; *destsizep = ciosize; *destbufsizep = destbufsize; return (1); }
/* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dnode_t *dn = txh->txh_dnode; uint64_t start, end, i; int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; int err = 0; if (len == 0) return; min_bs = SPA_MINBLOCKSHIFT; max_bs = SPA_MAXBLOCKSHIFT; min_ibs = DN_MIN_INDBLKSHIFT; max_ibs = DN_MAX_INDBLKSHIFT; /* * For i/o error checking, read the first and last level-0 * blocks (if they are not aligned), and all the level-1 blocks. */ if (dn) { if (dn->dn_maxblkid == 0) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err) goto out; } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* first level-0 block */ start = off >> dn->dn_datablkshift; if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { err = dmu_tx_check_ioerr(zio, dn, 0, start); if (err) goto out; } /* last level-0 block */ end = (off+len-1) >> dn->dn_datablkshift; if (end != start && P2PHASE(off+len, dn->dn_datablksz)) { err = dmu_tx_check_ioerr(zio, dn, 0, end); if (err) goto out; } /* level-1 blocks */ if (dn->dn_nlevels > 1) { start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (i = start+1; i < end; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err) goto out; } } err = zio_wait(zio); if (err) goto out; } }
uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { uint64_t object; uint64_t L1_dnode_count = DNODES_PER_BLOCK << (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); dnode_t *dn = NULL; int dn_slots = dnodesize >> DNODE_SHIFT; boolean_t restarted = B_FALSE; if (dn_slots == 0) { dn_slots = DNODE_MIN_SLOTS; } else { ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); } mutex_enter(&os->os_obj_lock); for (;;) { object = os->os_obj_next; /* * Each time we polish off a L1 bp worth of dnodes (2^12 * objects), move to another L1 bp that's still * reasonably sparse (at most 1/4 full). Look from the * beginning at most once per txg. If we still can't * allocate from that L1 block, search for an empty L0 * block, which will quickly skip to the end of the * metadnode if the no nearby L0 blocks are empty. This * fallback avoids a pathology where full dnode blocks * containing large dnodes appear sparse because they * have a low blk_fill, leading to many failed * allocation attempts. In the long term a better * mechanism to search for sparse metadnode regions, * such as spacemaps, could be implemented. * * os_scan_dnodes is set during txg sync if enough objects * have been freed since the previous rescan to justify * backfilling again. * * Note that dmu_traverse depends on the behavior that we use * multiple blocks of the dnode object before going back to * reuse objects. Any change to this algorithm should preserve * that property or find another solution to the issues * described in traverse_visitbp. */ if (P2PHASE(object, L1_dnode_count) == 0) { uint64_t offset; uint64_t blkfill; int minlvl; int error; if (os->os_rescan_dnodes) { offset = 0; os->os_rescan_dnodes = B_FALSE; } else { offset = object << DNODE_SHIFT; } blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2; minlvl = restarted ? 
1 : 2; restarted = B_TRUE; error = dnode_next_offset(DMU_META_DNODE(os), DNODE_FIND_HOLE, &offset, minlvl, blkfill, 0); if (error == 0) object = offset >> DNODE_SHIFT; } os->os_obj_next = object + dn_slots; /* * XXX We should check for an i/o error here and return * up to our caller. Actually we should pre-read it in * dmu_tx_assign(), but there is currently no mechanism * to do so. */ (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, FTAG, &dn); if (dn) break; if (dmu_object_next(os, &object, B_TRUE, 0) == 0) os->os_obj_next = object; else /* * Skip to next known valid starting point for a dnode. */ os->os_obj_next = P2ROUNDUP(object + 1, DNODES_PER_BLOCK); }
/*
 * Extend the current process's stack segment so that it covers `sp'.
 * The stack is assumed to grow downward.
 *
 * `growszc' is the page size code to align the growth to; 0 means base
 * pages.  Returns 0 on success (including when the stack already covers
 * the request -- it is never shrunk), ENOMEM when the new size would
 * exceed p_stk_ctl, or the error from as_map().
 */
int
grow_internal(caddr_t sp, uint_t growszc)
{
	struct proc *p = curproc;
	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
	uint_t szc = growszc;
	size_t pgsz = PAGESIZE;
	size_t newsize;
	size_t oldsize;
	int error;

	ASSERT(sp < p->p_usrstack);
	sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);

	/* Base-page sizing; replaced below if large-page alignment fits. */
	newsize = p->p_usrstack - sp;

	/*
	 * grow to growszc alignment but use current p->p_stkpageszc for
	 * the segvn_crargs szc passed to segvn_create. For memcntl to
	 * increase the szc, this allows the new extension segment to be
	 * concatenated successfully with the existing stack segment.
	 */
	if (szc != 0) {
		size_t lpgsz = page_get_pagesize(szc);
		size_t lnewsize;

		ASSERT(lpgsz > PAGESIZE);
		lnewsize = p->p_usrstack -
		    (caddr_t)P2ALIGN((uintptr_t)sp, lpgsz);

		if (lnewsize > (size_t)p->p_stk_ctl) {
			/* Too big when aligned: fall back to base pages. */
			szc = 0;
		} else {
			pgsz = lpgsz;
			newsize = lnewsize;
		}
	}

	if (newsize > (size_t)p->p_stk_ctl) {
		(void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	oldsize = p->p_stksize;
	ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);

	/* prevent the stack from shrinking */
	if (newsize <= oldsize)
		return (0);

	if (!(p->p_stkprot & PROT_EXEC))
		crargs.prot &= ~PROT_EXEC;

	/*
	 * extend stack with the proposed new growszc, which is different
	 * than p_stkpageszc only on a memcntl to increase the stack pagesize.
	 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
	 * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
	 * if not aligned to szc's pgsz.
	 */
	if (szc == 0) {
		crargs.szc = AS_MAP_NO_LPOOB;
	} else {
		caddr_t oldsp = p->p_usrstack - oldsize;
		caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
		    pgsz);

		if (!IS_P2ALIGNED(p->p_usrstack, pgsz) && oldsp >= austk) {
			/*
			 * Stack top is not pgsz-aligned and the old stack
			 * does not reach below the aligned-down top.
			 */
			if (oldsp == austk)
				crargs.szc = szc;
			else
				crargs.szc = AS_MAP_STACK;
		} else if (p->p_stkpageszc != 0) {
			crargs.szc = p->p_stkpageszc;
		} else {
			crargs.szc = AS_MAP_NO_LPOOB;
		}
	}
	crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;

	/* Map only the gap between the new and the old stack bottom. */
	error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
	    segvn_create, &crargs);
	if (error != 0) {
		if (error == EAGAIN) {
			cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
			    "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
		}
		return (error);
	}

	p->p_stksize = newsize;
	return (0);
}
/* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dnode_t *dn = txh->txh_dnode; int err = 0; if (len == 0) return; (void) refcount_add_many(&txh->txh_space_towrite, len, FTAG); if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) err = SET_ERROR(EFBIG); if (dn == NULL) return; /* * For i/o error checking, read the blocks that will be needed * to perform the write: the first and last level-0 blocks (if * they are not aligned, i.e. if they are partial-block writes), * and all the level-1 blocks. */ if (dn->dn_maxblkid == 0) { if (off < dn->dn_datablksz && (off > 0 || len < dn->dn_datablksz)) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err != 0) { txh->txh_tx->tx_err = err; } } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* first level-0 block */ uint64_t start = off >> dn->dn_datablkshift; if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { err = dmu_tx_check_ioerr(zio, dn, 0, start); if (err != 0) { txh->txh_tx->tx_err = err; } } /* last level-0 block */ uint64_t end = (off + len - 1) >> dn->dn_datablkshift; if (end != start && end <= dn->dn_maxblkid && P2PHASE(off + len, dn->dn_datablksz)) { err = dmu_tx_check_ioerr(zio, dn, 0, end); if (err != 0) { txh->txh_tx->tx_err = err; } } /* level-1 blocks */ if (dn->dn_nlevels > 1) { int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = (start >> shft) + 1; i < end >> shft; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err != 0) { txh->txh_tx->tx_err = err; } } } err = zio_wait(zio); if (err != 0) { txh->txh_tx->tx_err = err; } }