/*
 * Replay a TX_WRITE2 record.
 *
 * TX_WRITE2 records are only generated when dmu_sync() returns EALREADY,
 * meaning the pool block is already being synced.  Since full blocks are
 * always written out, all that is left to do here is to extend the end of
 * file if the write grew the file.
 *
 *	zfsvfs		file system the record is replayed against
 *	lr		the write record (byteswapped in place if requested)
 *	byteswap	B_TRUE if the record must be byteswapped before use
 *
 * Returns 0 on success, otherwise the error reported by zfs_zget() or
 * dmu_tx_assign().
 */
static int
zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
{
	znode_t	*zp;
	int error;
	uint64_t end;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
		return (error);

top:
	end = lr->lr_offset + lr->lr_length;
	if (end > zp->z_size) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			if (error == ERESTART) {
				/*
				 * Retry with the znode still held: the hold
				 * taken by zfs_zget() must survive the jump
				 * back to "top", which dereferences zp again.
				 */
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			dmu_tx_abort(tx);
			VN_RELE(ZTOV(zp));
			return (error);
		}

		/*
		 * Only grow the in-core size once the transaction has been
		 * assigned; doing it earlier would make a retry see
		 * end == z_size and skip the update entirely.
		 */
		zp->z_size = end;
		(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
		    (void *)&zp->z_size, sizeof (uint64_t), tx);

		/* Ensure the replayed seq is updated */
		(void) zil_replaying(zfsvfs->z_log, tx);

		dmu_tx_commit(tx);
	}

	VN_RELE(ZTOV(zp));

	return (error);
}
/*
 * Replay a TX_LINK record: create a directory entry in the directory
 * object lr_doid that refers to the existing object lr_link_obj.
 *
 *	arg1		the zfsvfs_t being replayed
 *	arg2		the lr_link_t record; the entry name is stored
 *			immediately after the fixed-size record
 *	byteswap	B_TRUE if the record must be byteswapped before use
 *
 * Returns 0 on success, otherwise the error from zfs_zget() or VOP_LINK().
 */
static int
zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
{
	zfsvfs_t *zfsvfs = arg1;
	lr_link_t *rec = arg2;
	char *entry_name = (char *)(rec + 1);
	znode_t *dir_zp, *target_zp;
	struct componentname cnp;
	int ci_flags = 0;
	int rc;

	if (byteswap)
		byteswap_uint64_array(rec, sizeof (*rec));

	rc = zfs_zget(zfsvfs, rec->lr_doid, &dir_zp);
	if (rc != 0)
		return (rc);

	rc = zfs_zget(zfsvfs, rec->lr_link_obj, &target_zp);
	if (rc != 0)
		goto rele_dir;

	/*
	 * The case-insensitive flag is derived from the record but is not
	 * handed to VOP_LINK(), which takes no such argument here.
	 */
	if (rec->lr_common.lrc_txtype & TX_CI)
		ci_flags |= FIGNORECASE;

	cnp.cn_nameptr = entry_name;
	cnp.cn_cred = kcred;
	cnp.cn_thread = curthread;
	cnp.cn_flags = SAVENAME;

	/* Lock the directory first, then the target; unlock in reverse. */
	vn_lock(ZTOV(dir_zp), LK_EXCLUSIVE | LK_RETRY);
	vn_lock(ZTOV(target_zp), LK_EXCLUSIVE | LK_RETRY);
	rc = VOP_LINK(ZTOV(dir_zp), ZTOV(target_zp), &cnp);
	VOP_UNLOCK(ZTOV(target_zp), 0);
	VOP_UNLOCK(ZTOV(dir_zp), 0);

	VN_RELE(ZTOV(target_zp));
rele_dir:
	VN_RELE(ZTOV(dir_zp));

	return (rc);
}
/*
 * Replay a TX_ACL_V0 record (old-style ACL set).
 *
 * On OSv the record type is rejected with EOPNOTSUPP.  Elsewhere the ACE
 * array that follows the fixed-size record is wrapped in a vsecattr_t; the
 * actual VOP_SETSECATTR() call is only compiled under TODO, otherwise the
 * function panics once the target znode has been looked up.
 */
static int
zfs_replay_acl_v0(zfsvfs_t *zfsvfs, void *data, boolean_t byteswap)
{
#ifndef __OSV__
	lr_acl_v0_t *rec = data;
	ace_t *entries = (ace_t *)(rec + 1);	/* ACEs follow the record */
	vsecattr_t secattr;
	znode_t *zp;
	int rc;

	if (byteswap) {
		byteswap_uint64_array(rec, sizeof (*rec));
		zfs_oldace_byteswap(entries, rec->lr_aclcnt);
	}

	rc = zfs_zget(zfsvfs, rec->lr_foid, &zp);
	if (rc != 0)
		return (rc);

	bzero(&secattr, sizeof (secattr));
	secattr.vsa_mask = VSA_ACE | VSA_ACECNT;
	secattr.vsa_aclcnt = rec->lr_aclcnt;
	secattr.vsa_aclentsz = sizeof (ace_t) * secattr.vsa_aclcnt;
	secattr.vsa_aclflags = 0;
	secattr.vsa_aclentp = entries;

#ifdef TODO
	rc = VOP_SETSECATTR(ZTOV(zp), &secattr, 0, kcred, NULL);
#else
	panic("%s:%u: unsupported condition", __func__, __LINE__);
#endif

	VN_RELE(ZTOV(zp));

	return (rc);
#else
	kprintf("TX_ACL_V0 not supported on OSv\n");
	return (EOPNOTSUPP);
#endif
}
/*
 * Replay file create with optional ACL, xvattr information as well
 * as option FUID information.
 *
 * Record layout after the fixed lr_acl_create_t, as consumed below:
 *	[lr_attr_t xvattr block]   only for the *_ACL_ATTR record types
 *	ACEs                       lr_acl_bytes long, ZIL_ACE_LENGTH() padded
 *	FUIDs / domains / name     parsed by zfs_replay_fuids(), which also
 *	                           reports where the name starts
 *
 * Returns 0 if the object already exists or was created, ENOTSUP for an
 * unexpected record type, otherwise the error from zfs_zget(),
 * dmu_object_info() or the create operation.  Unless TODO is defined the
 * create operations themselves are not compiled in and the function panics
 * when it reaches them.
 */
static int
zfs_replay_create_acl(zfsvfs_t *zfsvfs,
    lr_acl_create_t *lracl, boolean_t byteswap)
{
	char *name = NULL;		/* location determined later */
	lr_create_t *lr = (lr_create_t *)lracl;
	znode_t *dzp;
	vnode_t *vp = NULL;
	xvattr_t xva;
	int vflg = 0;
	vsecattr_t vsec = { 0 };
	lr_attr_t *lrattr;
	void *aclstart;
	void *fuidstart;
	size_t xvatlen = 0;
	uint64_t txtype;
	int error;

	if (byteswap)
		byteswap_uint64_array(lracl, sizeof (*lracl));

	/*
	 * Read the record type only after the fixed part has been swapped;
	 * before that the header is still in the writer's byte order and the
	 * type tests below would be made against a garbage value.
	 */
	txtype = (lr->lr_common.lrc_txtype & ~TX_CI);

	if (byteswap) {
		if (txtype == TX_CREATE_ACL_ATTR ||
		    txtype == TX_MKDIR_ACL_ATTR) {
			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
			zfs_replay_swap_attrs(lrattr);
			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
		}

		aclstart = (caddr_t)(lracl + 1) + xvatlen;
		zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE);
		/* swap fuids */
		if (lracl->lr_fuidcnt) {
			byteswap_uint64_array((caddr_t)aclstart +
			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes),
			    lracl->lr_fuidcnt * sizeof (uint64_t));
		}
	}

	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
		return (error);

	xva_init(&xva);
	zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);

	/*
	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
	 * eventually end up in zfs_mknode(), which assigns the object's
	 * creation time and generation number.  The generic VOP_CREATE()
	 * doesn't have either concept, so we smuggle the values inside
	 * the vattr's otherwise unused va_ctime and va_nblocks fields.
	 */
	ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
	xva.xva_vattr.va_nblocks = lr->lr_gen;

	/* Anything but ENOENT (including "already exists") ends the replay. */
	error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
	if (error != ENOENT)
		goto bail;

	if (lr->lr_common.lrc_txtype & TX_CI)
		vflg |= FIGNORECASE;
	switch (txtype) {
	case TX_CREATE_ACL:
		aclstart = (caddr_t)(lracl + 1);
		fuidstart = (caddr_t)aclstart +
		    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
		zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
		    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
		    lr->lr_uid, lr->lr_gid);
		/*FALLTHROUGH*/
	case TX_CREATE_ACL_ATTR:
		/* name is still NULL only when entered as the _ATTR type */
		if (name == NULL) {
			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
			xva.xva_vattr.va_mask |= AT_XVATTR;
			zfs_replay_xvattr(lrattr, &xva);
		}
		vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
		vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
		vsec.vsa_aclcnt = lracl->lr_aclcnt;
		vsec.vsa_aclentsz = lracl->lr_acl_bytes;
		vsec.vsa_aclflags = lracl->lr_acl_flags;
		if (zfsvfs->z_fuid_replay == NULL) {
			fuidstart = (caddr_t)(lracl + 1) + xvatlen +
			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
			zfsvfs->z_fuid_replay =
			    zfs_replay_fuids(fuidstart,
			    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
			    lr->lr_uid, lr->lr_gid);
		}

#ifdef TODO
		error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr,
		    0, 0, &vp, kcred, vflg, NULL, &vsec);
#else
		panic("%s:%u: unsupported condition", __func__, __LINE__);
#endif
		break;
	case TX_MKDIR_ACL:
		aclstart = (caddr_t)(lracl + 1);
		fuidstart = (caddr_t)aclstart +
		    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
		zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
		    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
		    lr->lr_uid, lr->lr_gid);
		/*FALLTHROUGH*/
	case TX_MKDIR_ACL_ATTR:
		/* name is still NULL only when entered as the _ATTR type */
		if (name == NULL) {
			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
			zfs_replay_xvattr(lrattr, &xva);
		}
		vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
		vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
		vsec.vsa_aclcnt = lracl->lr_aclcnt;
		vsec.vsa_aclentsz = lracl->lr_acl_bytes;
		vsec.vsa_aclflags = lracl->lr_acl_flags;
		if (zfsvfs->z_fuid_replay == NULL) {
			fuidstart = (caddr_t)(lracl + 1) + xvatlen +
			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
			zfsvfs->z_fuid_replay =
			    zfs_replay_fuids(fuidstart,
			    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
			    lr->lr_uid, lr->lr_gid);
		}
#ifdef TODO
		error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr,
		    &vp, kcred, NULL, vflg, &vsec);
#else
		panic("%s:%u: unsupported condition", __func__, __LINE__);
#endif
		break;
	default:
		error = ENOTSUP;
	}

bail:
	if (error == 0 && vp != NULL)
		VN_RELE(vp);

	VN_RELE(ZTOV(dzp));

	if (zfsvfs->z_fuid_replay)
		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
	zfsvfs->z_fuid_replay = NULL;

	return (error);
}
/*
 * Replay a TX_SETATTR record.
 *
 * On OSv the record type is rejected with EOPNOTSUPP.  Elsewhere the
 * attributes carried by the record (plus an optional xvattr block and FUID
 * domain data that follow it) are applied to object lr_foid through
 * VOP_SETATTR().
 *
 * Returns 0 on success, otherwise the error from zfs_zget() or
 * VOP_SETATTR().
 */
static int
zfs_replay_setattr(zfsvfs_t *zfsvfs, void *data, boolean_t byteswap)
{
#ifndef __OSV__
	lr_setattr_t *rec = data;
	xvattr_t xattrs;
	vattr_t *attrs = &xattrs.xva_vattr;
	znode_t *zp;
	vnode_t *vp;
	void *cursor;
	int rc;

	xva_init(&xattrs);

	if (byteswap) {
		byteswap_uint64_array(rec, sizeof (*rec));

		if (rec->lr_mask & AT_XVATTR) {
			if (zfsvfs->z_version >= ZPL_VERSION_INITIAL)
				zfs_replay_swap_attrs((lr_attr_t *)(rec + 1));
		}
	}

	rc = zfs_zget(zfsvfs, rec->lr_foid, &zp);
	if (rc != 0)
		return (rc);

	zfs_init_vattr(attrs, rec->lr_mask, rec->lr_mode,
	    rec->lr_uid, rec->lr_gid, 0, rec->lr_foid);

	attrs->va_size = rec->lr_size;
	ZFS_TIME_DECODE(&attrs->va_atime, rec->lr_atime);
	ZFS_TIME_DECODE(&attrs->va_mtime, rec->lr_mtime);

	/*
	 * Walk the variable-length data behind the record: first the
	 * optional xvattr block, then the FUID domain information.
	 */
	cursor = rec + 1;
	if (attrs->va_mask & AT_XVATTR) {
		lr_attr_t *packed = cursor;

		zfs_replay_xvattr(packed, &xattrs);
		cursor = (caddr_t)packed +
		    ZIL_XVAT_SIZE(packed->lr_attr_masksize);
	} else {
		attrs->va_mask &= ~AT_XVATTR;
	}

	zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(cursor, &cursor,
	    rec->lr_uid, rec->lr_gid);

	vp = ZTOV(zp);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	rc = VOP_SETATTR(vp, attrs, kcred);
	VOP_UNLOCK(vp, 0);

	zfs_fuid_info_free(zfsvfs->z_fuid_replay);
	zfsvfs->z_fuid_replay = NULL;
	VN_RELE(vp);

	return (rc);
#else
	kprintf("TX_SETATTR not supported on OSv\n");
	return (EOPNOTSUPP);
#endif
}
/*
 * Replay a TX_WRITE record.
 *
 * Unless TODO_OSV is defined this only logs the record type and returns
 * EOPNOTSUPP.  With TODO_OSV the data that follows the record is written
 * to object lr_foid through vn_rdwr().
 *
 * Returns 0 on success or when the target object no longer exists,
 * otherwise the error from zfs_zget() or vn_rdwr().
 */
static int
zfs_replay_write(zfsvfs_t *zfsvfs, void *_data, boolean_t byteswap)
{
#ifdef TODO_OSV
	lr_write_t *rec = _data;
	char *payload = (char *)(rec + 1);	/* data follows lr_write_t */
	znode_t *zp;
	ssize_t resid;
	uint64_t woff, wlen, data_end;
	int rc;

	if (byteswap)
		byteswap_uint64_array(rec, sizeof (*rec));

	rc = zfs_zget(zfsvfs, rec->lr_foid, &zp);
	if (rc != 0) {
		/*
		 * Writes can be logged out of order, so the file may
		 * already have been removed; such a write is dropped and
		 * reported as success.
		 */
		return (rc == ENOENT ? 0 : rc);
	}

	woff = rec->lr_offset;
	wlen = rec->lr_length;
	data_end = woff + wlen;		/* end of data for this write */

	/*
	 * A dmu_sync() write covers a whole block and may reach past the
	 * current end of file.  Replaying only the logged range is not
	 * enough, because a later TX_WRITE2 may extend the eof and expects
	 * the rest of the block to be present.  So the whole block is
	 * written and the eof is then reduced; that has to happen inside the
	 * single dmu transaction created by vn_rdwr -> zfs_write, which is
	 * why the new end of file travels in zfsvfs->z_replay_eof.
	 */
	zfsvfs->z_replay_eof = 0;	/* 0 means don't change end of file */

	/* A bare record (no trailing data length) is a dmu_sync() block. */
	if (rec->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t bsize = BP_GET_LSIZE(&rec->lr_blkptr);

		if (wlen < bsize) {
			woff -= woff % bsize;
			wlen = bsize;
		}
		if (zp->z_size < data_end)
			zfsvfs->z_replay_eof = data_end;
	}

	rc = vn_rdwr(UIO_WRITE, ZTOV(zp), payload, wlen, woff,
	    UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);

	VN_RELE(ZTOV(zp));
	zfsvfs->z_replay_eof = 0;	/* safety */

	return (rc);
#else
	kprintf("TX_WRITE\n");
	return (EOPNOTSUPP);
#endif
}
/*
 * Verify the checksum of the data attached to a zio.
 *
 * The checksum algorithm and the size to verify come from the block
 * pointer when there is one (gang blocks use the gang header checksum and
 * size), otherwise from the zio itself.  For checksums embedded in the data
 * (ci_eck) the stored checksum is temporarily replaced by the verifier
 * while the checksum is computed, then restored.
 *
 * On a completed computation, *info is filled with the expected and actual
 * checksums; it is left untouched on the early EINVAL/ECKSUM returns.
 *
 * Returns 0 if the checksum matches, EINVAL for an unknown or unimplemented
 * checksum type, ECKSUM on a mismatch or a malformed ZIL chain header, or
 * the error produced by zio_handle_fault_injection() when fault injection
 * is enabled.
 */
int
zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
{
	blkptr_t *bp = zio->io_bp;
	uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
	int byteswap;
	int error;
	uint64_t size = (bp == NULL ? zio->io_size :
	    (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
	uint64_t offset = zio->io_offset;
	void *data = zio->io_data;
	zio_checksum_info_t *ci;
	zio_cksum_t actual_cksum, expected_cksum, verifier;

	/* Validate the index before forming a pointer into the table. */
	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL)
		return (EINVAL);

	if (ci->ci_eck) {
		zio_eck_t *eck;

		if (checksum == ZIO_CHECKSUM_ZILOG2) {
			zil_chain_t *zilc = data;
			uint64_t nused;

			eck = &zilc->zc_eck;
			if (eck->zec_magic == ZEC_MAGIC)
				nused = zilc->zc_nused;
			else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC))
				nused = BSWAP_64(zilc->zc_nused);
			else
				return (ECKSUM);

			if (nused > size)
				return (ECKSUM);

			/* Only the used part of the log block is summed. */
			size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
		} else {
			/* The embedded checksum is the tail of the buffer. */
			eck = (zio_eck_t *)((char *)data + size) - 1;
		}

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier, offset);
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		/* Checksum the data with the verifier in place, then undo. */
		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		ASSERT(!BP_IS_GANG(bp));
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, &actual_cksum);
	}

	info->zbc_expected = expected_cksum;
	info->zbc_actual = actual_cksum;
	info->zbc_checksum_name = ci->ci_name;
	info->zbc_byteswapped = byteswap;
	info->zbc_injected = 0;
	info->zbc_has_cksum = 1;

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		return (ECKSUM);
	}

	if (zio_injection_enabled && !zio->io_error &&
	    (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) {

		info->zbc_injected = 1;
		return (error);
	}

	return (0);
}
/*
 * Replay a single intent log record through the replay vector registered
 * for its transaction type.
 *
 * Records are skipped when replay has already been abandoned, when they
 * belong to a txg older than claim_txg, or when their sequence number is
 * not newer than the one stored in the log header.  A record that cannot be
 * replayed is reported with cmn_err() and turns replay off for this log.
 */
static void
zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
{
	zil_replay_arg_t *replay = zra;
	const zil_header_t *hdr = zilog->zl_header;
	uint64_t len = lr->lrc_reclen;
	/* Strip case-insensitive bit, still present in log record */
	uint64_t type = lr->lrc_txtype & ~TX_CI;
	char *dsname;
	int attempt, err;

	/* giving up / already committed / already replayed */
	if (!zilog->zl_replay || lr->lrc_txg < claim_txg ||
	    lr->lrc_seq <= hdr->zh_replay_seq)
		return;

	if (type == 0 || type >= TX_MAX_TYPE) {
		err = EINVAL;
	} else {
		/*
		 * Work on a private copy so the replay vector is free to
		 * revise and extend the record.
		 */
		bcopy(lr, replay->zr_lrbuf, len);

		/*
		 * The log block holding this record may have been
		 * byteswapped so that common fields such as lrc_txtype are
		 * easy to examine.  The log mixes many data types, though,
		 * and only the replay vectors know how to swap their own
		 * records, so a swapped record is put back the way it was
		 * before the vector is invoked.
		 */
		if (replay->zr_byteswap)
			byteswap_uint64_array(replay->zr_lrbuf, len);

		/*
		 * Replaying the record and updating the log header sequence
		 * number must happen atomically; each replay function
		 * updates the sequence number at its end while in replay
		 * mode.
		 */
		attempt = 1;
		do {
			zilog->zl_replaying_seq = lr->lrc_seq;

			/* Only byteswap (if needed) on the 1st pass. */
			err = replay->zr_replay[type](replay->zr_arg,
			    replay->zr_lrbuf,
			    replay->zr_byteswap && attempt == 1);
			if (err == 0)
				return;

			/*
			 * The DMU's dnode layer doesn't see removes until
			 * the txg commits, so a subsequent claim can
			 * spuriously fail with EEXIST.  After any error,
			 * sync out pending removes and try once more.
			 */
			if (attempt == 1)
				txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
		} while (++attempt <= 2);
	}

	ASSERT(err);
	dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	dmu_objset_name(replay->zr_os, dsname);
	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
	    "dataset %s, seq 0x%llx, txtype %llu %s\n",
	    err, dsname, (u_longlong_t)lr->lrc_seq, (u_longlong_t)type,
	    (lr->lrc_txtype & TX_CI) ? "CI" : "");
	zilog->zl_replay = B_FALSE;
	kmem_free(dsname, MAXNAMELEN);
}
/*
 * Replay a create-class record: TX_CREATE, TX_CREATE_ATTR, TX_MKDIR,
 * TX_MKDIR_ATTR, TX_MKXATTR or TX_SYMLINK.
 *
 * Data following the fixed lr_create_t, as consumed below:
 *	*_ATTR types:	lr_attr_t xvattr block, FUID domain data, name
 *	TX_SYMLINK:	name, then the link target
 *	other types:	FUID domain data, then the name (see the note on
 *			TX_MKDIR below)
 *
 * Returns 0 if the object already exists or was created, ENOTSUP for an
 * unexpected record type, otherwise the error from zfs_zget(),
 * dmu_object_info() or the create operation.
 */
static int
zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
{
	char *name = NULL;		/* location determined later */
	char *link;			/* symlink content follows name */
	znode_t *dzp;
	struct inode *ip = NULL;
	xvattr_t xva;
	int vflg = 0;
	size_t lrsize = sizeof (lr_create_t);
	lr_attr_t *lrattr;
	void *start;
	size_t xvatlen;
	uint64_t txtype;
	uint64_t objid;
	uint64_t dnodesize;
	int error;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	/*
	 * Read the record type only after the fixed part has been swapped;
	 * before that the header is still in the writer's byte order.
	 */
	txtype = (lr->lr_common.lrc_txtype & ~TX_CI);

	if (byteswap &&
	    (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR))
		zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));

	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
		return (error);

	/* lr_foid packs both the object number and the dnode slot count. */
	objid = LR_FOID_GET_OBJ(lr->lr_foid);
	dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;

	xva_init(&xva);
	zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID,
	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);

	/*
	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
	 * eventually end up in zfs_mknode(), which assigns the object's
	 * creation time, generation number, and dnode size.  The generic
	 * zfs_create() has no concept of these attributes, so we smuggle
	 * the values inside the vattr's otherwise unused va_ctime,
	 * va_nblocks, and va_fsid fields.
	 */
	ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
	xva.xva_vattr.va_nblocks = lr->lr_gen;
	xva.xva_vattr.va_fsid = dnodesize;

	/* Anything but ENOENT (including "already exists") ends the replay. */
	error = dmu_object_info(zfsvfs->z_os, objid, NULL);
	if (error != ENOENT)
		goto out;

	if (lr->lr_common.lrc_txtype & TX_CI)
		vflg |= FIGNORECASE;

	/*
	 * Symlinks don't have fuid info, and CIFS never creates
	 * symlinks.
	 *
	 * The _ATTR versions will grab the fuid info in their subcases.
	 *
	 * The comparison uses the type with TX_CI stripped: testing the raw
	 * lrc_txtype would let case-insensitive SYMLINK/_ATTR records fall
	 * into this branch, parse the wrong bytes as FUID data and, for the
	 * _ATTR types, leak the result when the subcase overwrites it.
	 */
	if (txtype != TX_SYMLINK &&
	    txtype != TX_MKDIR_ATTR &&
	    txtype != TX_CREATE_ATTR) {
		start = (lr + 1);
		zfsvfs->z_fuid_replay =
		    zfs_replay_fuid_domain(start, &start,
		    lr->lr_uid, lr->lr_gid);
	}

	switch (txtype) {
	case TX_CREATE_ATTR:
		lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
		xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
		zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
		start = (caddr_t)(lr + 1) + xvatlen;
		zfsvfs->z_fuid_replay =
		    zfs_replay_fuid_domain(start, &start,
		    lr->lr_uid, lr->lr_gid);
		name = (char *)start;

		/*FALLTHROUGH*/
	case TX_CREATE:
		if (name == NULL)
			name = (char *)start;

		error = zfs_create(ZTOI(dzp), name, &xva.xva_vattr,
		    0, 0, &ip, kcred, vflg, NULL);
		break;
	case TX_MKDIR_ATTR:
		lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
		xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
		zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
		start = (caddr_t)(lr + 1) + xvatlen;
		zfsvfs->z_fuid_replay =
		    zfs_replay_fuid_domain(start, &start,
		    lr->lr_uid, lr->lr_gid);
		name = (char *)start;

		/*FALLTHROUGH*/
	case TX_MKDIR:
		/*
		 * NOTE(review): TX_CREATE takes the name from "start" (as
		 * advanced by zfs_replay_fuid_domain()), while this case
		 * takes it from lr + 1 -- confirm the asymmetry is intended.
		 */
		if (name == NULL)
			name = (char *)(lr + 1);

		error = zfs_mkdir(ZTOI(dzp), name, &xva.xva_vattr,
		    &ip, kcred, vflg, NULL);
		break;
	case TX_MKXATTR:
		error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &ip, kcred);
		break;
	case TX_SYMLINK:
		name = (char *)(lr + 1);
		link = name + strlen(name) + 1;
		error = zfs_symlink(ZTOI(dzp), name, &xva.xva_vattr,
		    link, &ip, kcred, vflg);
		break;
	default:
		error = SET_ERROR(ENOTSUP);
	}

out:
	if (error == 0 && ip != NULL)
		iput(ip);

	iput(ZTOI(dzp));

	if (zfsvfs->z_fuid_replay)
		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
	zfsvfs->z_fuid_replay = NULL;
	return (error);
}
/*
 * Replay file create with optional ACL, xvattr information as well
 * as option FUID information.
 *
 * Record layout after the fixed lr_acl_create_t, as consumed below:
 *	[lr_attr_t xvattr block]   only for the *_ACL_ATTR record types
 *	ACEs                       lr_acl_bytes long, ZIL_ACE_LENGTH() padded
 *	FUIDs / domains / name     parsed by zfs_replay_fuids(), which also
 *	                           reports where the name starts
 *
 * Returns 0 if the object already exists or was created, ENOTSUP for an
 * unexpected record type, otherwise the error from zfs_zget(),
 * dmu_object_info(), zfs_create() or zfs_mkdir().
 */
static int
zfs_replay_create_acl(zfsvfs_t *zfsvfs,
    lr_acl_create_t *lracl, boolean_t byteswap)
{
	char *name = NULL;		/* location determined later */
	lr_create_t *lr = (lr_create_t *)lracl;
	znode_t *dzp;
	struct inode *ip = NULL;
	xvattr_t xva;
	int vflg = 0;
	vsecattr_t vsec = { 0 };
	lr_attr_t *lrattr;
	void *aclstart;
	void *fuidstart;
	size_t xvatlen = 0;
	uint64_t txtype;
	uint64_t objid;
	uint64_t dnodesize;
	int error;

	if (byteswap)
		byteswap_uint64_array(lracl, sizeof (*lracl));

	/*
	 * Read the record type only after the fixed part has been swapped;
	 * before that the header is still in the writer's byte order and the
	 * type tests below would be made against a garbage value.
	 */
	txtype = (lr->lr_common.lrc_txtype & ~TX_CI);

	if (byteswap) {
		if (txtype == TX_CREATE_ACL_ATTR ||
		    txtype == TX_MKDIR_ACL_ATTR) {
			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
			zfs_replay_swap_attrs(lrattr);
			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
		}

		aclstart = (caddr_t)(lracl + 1) + xvatlen;
		zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE);
		/* swap fuids */
		if (lracl->lr_fuidcnt) {
			byteswap_uint64_array((caddr_t)aclstart +
			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes),
			    lracl->lr_fuidcnt * sizeof (uint64_t));
		}
	}

	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
		return (error);

	/* lr_foid packs both the object number and the dnode slot count. */
	objid = LR_FOID_GET_OBJ(lr->lr_foid);
	dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;

	xva_init(&xva);
	zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID,
	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);

	/*
	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
	 * eventually end up in zfs_mknode(), which assigns the object's
	 * creation time, generation number, and dnode size.  The generic
	 * zfs_create() has no concept of these attributes, so we smuggle
	 * the values inside the vattr's otherwise unused va_ctime,
	 * va_nblocks, and va_fsid fields.
	 */
	ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
	xva.xva_vattr.va_nblocks = lr->lr_gen;
	xva.xva_vattr.va_fsid = dnodesize;

	/*
	 * Probe with the decoded object number, not the packed lr_foid,
	 * whose upper bits carry the slot count.  Anything but ENOENT
	 * (including "already exists") ends the replay.
	 */
	error = dmu_object_info(zfsvfs->z_os, objid, NULL);
	if (error != ENOENT)
		goto bail;

	if (lr->lr_common.lrc_txtype & TX_CI)
		vflg |= FIGNORECASE;
	switch (txtype) {
	case TX_CREATE_ACL:
		aclstart = (caddr_t)(lracl + 1);
		fuidstart = (caddr_t)aclstart +
		    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
		zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
		    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
		    lr->lr_uid, lr->lr_gid);
		/*FALLTHROUGH*/
	case TX_CREATE_ACL_ATTR:
		/* name is still NULL only when entered as the _ATTR type */
		if (name == NULL) {
			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
			xva.xva_vattr.va_mask |= ATTR_XVATTR;
			zfs_replay_xvattr(lrattr, &xva);
		}
		vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
		vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
		vsec.vsa_aclcnt = lracl->lr_aclcnt;
		vsec.vsa_aclentsz = lracl->lr_acl_bytes;
		vsec.vsa_aclflags = lracl->lr_acl_flags;
		if (zfsvfs->z_fuid_replay == NULL) {
			fuidstart = (caddr_t)(lracl + 1) + xvatlen +
			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
			zfsvfs->z_fuid_replay =
			    zfs_replay_fuids(fuidstart,
			    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
			    lr->lr_uid, lr->lr_gid);
		}

		error = zfs_create(ZTOI(dzp), name, &xva.xva_vattr,
		    0, 0, &ip, kcred, vflg, &vsec);
		break;
	case TX_MKDIR_ACL:
		aclstart = (caddr_t)(lracl + 1);
		fuidstart = (caddr_t)aclstart +
		    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
		zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
		    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
		    lr->lr_uid, lr->lr_gid);
		/*FALLTHROUGH*/
	case TX_MKDIR_ACL_ATTR:
		/* name is still NULL only when entered as the _ATTR type */
		if (name == NULL) {
			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
			zfs_replay_xvattr(lrattr, &xva);
		}
		vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
		vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
		vsec.vsa_aclcnt = lracl->lr_aclcnt;
		vsec.vsa_aclentsz = lracl->lr_acl_bytes;
		vsec.vsa_aclflags = lracl->lr_acl_flags;
		if (zfsvfs->z_fuid_replay == NULL) {
			fuidstart = (caddr_t)(lracl + 1) + xvatlen +
			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
			zfsvfs->z_fuid_replay =
			    zfs_replay_fuids(fuidstart,
			    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
			    lr->lr_uid, lr->lr_gid);
		}
		error = zfs_mkdir(ZTOI(dzp), name, &xva.xva_vattr,
		    &ip, kcred, vflg, &vsec);
		break;
	default:
		error = SET_ERROR(ENOTSUP);
	}

bail:
	if (error == 0 && ip != NULL)
		iput(ip);

	iput(ZTOI(dzp));

	if (zfsvfs->z_fuid_replay)
		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
	zfsvfs->z_fuid_replay = NULL;

	return (error);
}
/*
 * Replay a TX_WRITE record by writing the data that follows the record to
 * object lr_foid through zpl_write_common().
 *
 *	arg1		the zfsvfs_t being replayed
 *	arg2		the lr_write_t record, followed by the data
 *	byteswap	B_TRUE if the record must be byteswapped before use
 *
 * Returns 0 on success or when the target object no longer exists, EIO on
 * a short write, otherwise the error from zfs_zget() or the write.
 */
static int
zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
{
	zfsvfs_t *zfsvfs = arg1;
	lr_write_t *rec = arg2;
	char *payload = (char *)(rec + 1);	/* data follows lr_write_t */
	znode_t *zp;
	uint64_t woff, wlen, data_end;
	int rc, nwritten;

	if (byteswap)
		byteswap_uint64_array(rec, sizeof (*rec));

	rc = zfs_zget(zfsvfs, rec->lr_foid, &zp);
	if (rc != 0) {
		/*
		 * Writes can be logged out of order, so the file may
		 * already have been removed; such a write is dropped and
		 * reported as success.
		 */
		return (rc == ENOENT ? 0 : rc);
	}

	woff = rec->lr_offset;
	wlen = rec->lr_length;
	data_end = woff + wlen;		/* end of data for this write */

	/*
	 * A dmu_sync() write covers a whole block and may reach past the
	 * current end of file.  Replaying only the logged range is not
	 * enough, because a later TX_WRITE2 may extend the eof and expects
	 * the rest of the block to be present.  So the whole block is
	 * written and the eof is then reduced; that has to happen inside the
	 * single dmu transaction created by the write path, which is why the
	 * new end of file travels in zfsvfs->z_replay_eof.
	 */
	zfsvfs->z_replay_eof = 0;	/* 0 means don't change end of file */

	/* A bare record (no trailing data length) is a dmu_sync() block. */
	if (rec->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t bsize = BP_GET_LSIZE(&rec->lr_blkptr);

		if (wlen < bsize) {
			woff -= woff % bsize;
			wlen = bsize;
		}
		if (zp->z_size < data_end)
			zfsvfs->z_replay_eof = data_end;
	}

	nwritten = zpl_write_common(ZTOI(zp), payload, wlen, &woff,
	    UIO_SYSSPACE, 0, kcred);
	if (nwritten < 0) {
		/* negative return carries the errno */
		rc = -nwritten;
	} else if (nwritten < wlen) {
		rc = SET_ERROR(EIO);	/* short write */
	}

	iput(ZTOI(zp));
	zfsvfs->z_replay_eof = 0;	/* safety */

	return (rc);
}
int zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, enum zio_checksum checksum, abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t actual_cksum, expected_cksum; zio_eck_t eck; int byteswap; if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) return (SET_ERROR(EINVAL)); zio_checksum_template_init(checksum, spa); if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_cksum_t verifier; size_t eck_offset; if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t zilc; uint64_t nused; abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t)); eck = zilc.zc_eck; eck_offset = offsetof(zil_chain_t, zc_eck) + offsetof(zio_eck_t, zec_cksum); if (eck.zec_magic == ZEC_MAGIC) { nused = zilc.zc_nused; } else if (eck.zec_magic == BSWAP_64(ZEC_MAGIC)) { nused = BSWAP_64(zilc.zc_nused); } else { return (SET_ERROR(ECKSUM)); } if (nused > size) { return (SET_ERROR(ECKSUM)); } size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); } else { eck_offset = size - sizeof (zio_eck_t); abd_copy_to_buf_off(&eck, abd, eck_offset, sizeof (zio_eck_t)); eck_offset += offsetof(zio_eck_t, zec_cksum); } if (checksum == ZIO_CHECKSUM_GANG_HEADER) zio_checksum_gang_verifier(&verifier, bp); else if (checksum == ZIO_CHECKSUM_LABEL) zio_checksum_label_verifier(&verifier, offset); else verifier = bp->blk_cksum; byteswap = (eck.zec_magic == BSWAP_64(ZEC_MAGIC)); if (byteswap) byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); expected_cksum = eck.zec_cksum; abd_copy_from_buf_off(abd, &verifier, eck_offset, sizeof (zio_cksum_t)); ci->ci_func[byteswap](abd, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); abd_copy_from_buf_off(abd, &expected_cksum, eck_offset, sizeof (zio_cksum_t)); if (byteswap) { byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); } } else { byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; ci->ci_func[byteswap](abd, size, spa->spa_cksum_tmpls[checksum], 
&actual_cksum); } /* * MAC checksums are a special case since half of this checksum will * actually be the encryption MAC. This will be verified by the * decryption process, so we just check the truncated checksum now. * Objset blocks use embedded MACs so we don't truncate the checksum * for them. */ if (bp != NULL && BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET) { if (!(ci->ci_flags & ZCHECKSUM_FLAG_DEDUP)) { actual_cksum.zc_word[0] ^= actual_cksum.zc_word[2]; actual_cksum.zc_word[1] ^= actual_cksum.zc_word[3]; } actual_cksum.zc_word[2] = 0; actual_cksum.zc_word[3] = 0; expected_cksum.zc_word[2] = 0; expected_cksum.zc_word[3] = 0; } if (info != NULL) { info->zbc_expected = expected_cksum; info->zbc_actual = actual_cksum; info->zbc_checksum_name = ci->ci_name; info->zbc_byteswapped = byteswap; info->zbc_injected = 0; info->zbc_has_cksum = 1; } if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) return (SET_ERROR(ECKSUM)); return (0); }
int zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; int byteswap; zio_cksum_t actual_cksum, expected_cksum; if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) return (SET_ERROR(EINVAL)); zio_checksum_template_init(checksum, spa); if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t *eck; zio_cksum_t verifier; size_t eck_offset; uint64_t data_size = size; void *data = abd_borrow_buf_copy(abd, data_size); if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = data; uint64_t nused; eck = &zilc->zc_eck; if (eck->zec_magic == ZEC_MAGIC) { nused = zilc->zc_nused; } else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) { nused = BSWAP_64(zilc->zc_nused); } else { abd_return_buf(abd, data, data_size); return (SET_ERROR(ECKSUM)); } if (nused > data_size) { abd_return_buf(abd, data, data_size); return (SET_ERROR(ECKSUM)); } size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); } else { eck = (zio_eck_t *)((char *)data + data_size) - 1; } if (checksum == ZIO_CHECKSUM_GANG_HEADER) zio_checksum_gang_verifier(&verifier, bp); else if (checksum == ZIO_CHECKSUM_LABEL) zio_checksum_label_verifier(&verifier, offset); else verifier = bp->blk_cksum; byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); if (byteswap) byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); eck_offset = (size_t)(&eck->zec_cksum) - (size_t)data; expected_cksum = eck->zec_cksum; eck->zec_cksum = verifier; abd_return_buf_copy(abd, data, data_size); ci->ci_func[byteswap](abd, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); abd_copy_from_buf_off(abd, &expected_cksum, eck_offset, sizeof (zio_cksum_t)); if (byteswap) { byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); } } else { byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; ci->ci_func[byteswap](abd, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); } 
if (info != NULL) { info->zbc_expected = expected_cksum; info->zbc_actual = actual_cksum; info->zbc_checksum_name = ci->ci_name; info->zbc_byteswapped = byteswap; info->zbc_injected = 0; info->zbc_has_cksum = 1; } if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) return (SET_ERROR(ECKSUM)); return (0); }
/*
 * Verify the checksum of the data attached to a zio.
 *
 * The checksum algorithm and the size to verify come from the block
 * pointer when there is one (gang blocks use the gang header checksum and
 * size), otherwise from the zio itself.  For checksums embedded in the data
 * (ci_eck) the stored checksum is temporarily replaced by the verifier
 * while the checksum is computed, then restored.  Truncated checksums
 * (ci_trunc) are compared on their first 160 bits only.
 *
 * On a completed computation, *info is filled with the expected and actual
 * checksums; it is left untouched on the early EINVAL/ECKSUM returns.
 *
 * Returns 0 if the checksum matches, EINVAL for an unknown or unimplemented
 * checksum type, ECKSUM on a mismatch or a malformed ZIL chain header, or
 * the error produced by zio_handle_fault_injection() when fault injection
 * is enabled.
 */
int
zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
{
	blkptr_t *bp = zio->io_bp;
	uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
	int byteswap;
	int error;
	uint64_t size = (bp == NULL ? zio->io_size :
	    (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
	uint64_t offset = zio->io_offset;
	void *data = zio->io_data;
	zio_checksum_info_t *ci;
	zio_cksum_t actual_cksum, expected_cksum, verifier;

	/* Validate the index before forming a pointer into the table. */
	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (SET_ERROR(EINVAL));
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL)
		return (SET_ERROR(EINVAL));

	if (ci->ci_eck) {
		zio_eck_t *eck;

		if (checksum == ZIO_CHECKSUM_ZILOG2) {
			zil_chain_t *zilc = data;
			uint64_t nused;

			eck = &zilc->zc_eck;
			if (eck->zec_magic == ZEC_MAGIC)
				nused = zilc->zc_nused;
			else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC))
				nused = BSWAP_64(zilc->zc_nused);
			else
				return (SET_ERROR(ECKSUM));

			if (nused > size)
				return (SET_ERROR(ECKSUM));

			/* Only the used part of the log block is summed. */
			size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
		} else {
			/* The embedded checksum is the tail of the buffer. */
			eck = (zio_eck_t *)((char *)data + size) - 1;
		}

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier, offset);
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		/* Checksum the data with the verifier in place, then undo. */
		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		ASSERT(!BP_IS_GANG(bp));
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, &actual_cksum);
	}

	info->zbc_expected = expected_cksum;
	info->zbc_actual = actual_cksum;
	info->zbc_checksum_name = ci->ci_name;
	info->zbc_byteswapped = byteswap;
	info->zbc_injected = 0;
	info->zbc_has_cksum = 1;

	/*
	 * Special case for truncated checksums with crypto MAC
	 * This may not be the best place to deal with this but it is here now.
	 *
	 * Words 0 and 1 and 32 bits of word 2 of the checksum are the
	 * first 160 bits of SHA256 hash.
	 * The rest of words 2 and all of word 3 are the crypto MAC so
	 * ignore those because we can't check them until we do the decryption
	 * later, nor could we do them if the key wasn't present
	 */
	if (ci->ci_trunc) {
		/* OR of the differences is zero only if all three match */
		if (!(0 == (
		    (actual_cksum.zc_word[0] - expected_cksum.zc_word[0]) |
		    (actual_cksum.zc_word[1] - expected_cksum.zc_word[1]) |
		    (BF64_GET(actual_cksum.zc_word[2], 0, 32) -
		    BF64_GET(expected_cksum.zc_word[2], 0, 32))))) {
			return (SET_ERROR(ECKSUM));
		}
	} else if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		return (SET_ERROR(ECKSUM));
	}

	if (zio_injection_enabled && !zio->io_error &&
	    (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) {

		info->zbc_injected = 1;
		return (error);
	}

	return (0);
}
/*
 * Replay one intent-log record through the replay vector registered for
 * its transaction type.
 *
 *	zilog		log being replayed
 *	lr		the record, with its common header readable in place
 *	zra		the zil_replay_arg_t describing this replay
 *	claim_txg	records with an older txg are skipped
 *
 * Records are skipped silently when replay has been abandoned, or when the
 * record is already committed or already replayed.  If the record type is
 * invalid, or the replay vector fails on both attempts, a warning is
 * logged and zl_replay is cleared.
 */
static void
zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
{
	zil_replay_arg_t *zr = zra;
	uint64_t rec_len = lr->lrc_reclen;
	/* The case-insensitive bit is still set in the record; mask it off. */
	uint64_t type = lr->lrc_txtype & ~TX_CI;
	char *dsname;
	int attempt;
	int err;

	/*
	 * In order: replay was given up on, the record is already
	 * committed, or the record has already been replayed.
	 */
	if (!zilog->zl_replay || lr->lrc_txg < claim_txg ||
	    lr->lrc_seq <= zilog->zl_header->zh_replay_seq)
		return;

	if (type == 0 || type >= TX_MAX_TYPE) {
		err = EINVAL;
		goto report;
	}

	/*
	 * Work on a private copy that can be revised and extended.
	 *
	 * The block holding this record may have been byteswapped so the
	 * common fields could be examined, but only the replay vectors know
	 * the layout of each record type.  So when that happened, swap the
	 * copy back before handing it to the replay vector.
	 */
	bcopy(lr, zr->zr_lrbuf, rec_len);
	if (zr->zr_byteswap)
		byteswap_uint64_array(zr->zr_lrbuf, rec_len);

	/*
	 * A TX_WRITE that is exactly the size of lr_write_t carries its
	 * data by block pointer; pull that data in right behind the copy.
	 */
	if (type == TX_WRITE && rec_len == sizeof (lr_write_t)) {
		lr_write_t *wr = (lr_write_t *)lr;
		blkptr_t *bp = &wr->lr_blkptr;
		char *dst = zr->zr_lrbuf + rec_len;
		uint64_t nbytes = wr->lr_length;

		if (!BP_IS_HOLE(bp)) {
			/*
			 * A later write may have overwritten this block, in
			 * which case bp may have been freed and reallocated
			 * and the read may fail with a checksum error.  That
			 * is deliberately ignored: the later write supplies
			 * the correct data.
			 */
			zbookmark_t zb;

			zb.zb_blkid = wr->lr_offset / BP_GET_LSIZE(bp);
			zb.zb_level = -1;
			zb.zb_object = wr->lr_foid;
			zb.zb_objset = dmu_objset_id(zilog->zl_os);

			(void) zio_wait(zio_read(NULL, zilog->zl_spa, bp,
			    dst, BP_GET_LSIZE(bp), NULL, NULL,
			    ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
			/* Slide the logged range down to the buffer start. */
			(void) memmove(dst, dst + wr->lr_blkoff, nbytes);
		} else {
			/* compressed to a hole */
			bzero(dst, nbytes);
		}
	}

	/*
	 * Replaying the record and advancing the log header's sequence
	 * number must happen atomically; each replay function updates the
	 * sequence number itself when in replay mode.
	 */
	for (attempt = 0; attempt < 2; attempt++) {
		zilog->zl_replaying_seq = lr->lrc_seq;

		/* The record is only byteswapped (if at all) the first time. */
		err = zr->zr_replay[type](zr->zr_arg, zr->zr_lrbuf,
		    zr->zr_byteswap && attempt == 0);
		if (err == 0)
			return;

		/*
		 * The DMU's dnode layer doesn't see removes until the txg
		 * commits, so a subsequent claim can spuriously fail with
		 * EEXIST.  On any error, sync out pending removes and retry
		 * once.
		 */
		if (attempt == 0)
			txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
	}

report:
	ASSERT(err);
	dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	dmu_objset_name(zr->zr_os, dsname);
	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
	    "dataset %s, seq 0x%llx, txtype %llu %s\n",
	    err, dsname, (u_longlong_t)lr->lrc_seq, (u_longlong_t)type,
	    (lr->lrc_txtype & TX_CI) ? "CI" : "");
	zilog->zl_replay = B_FALSE;
	kmem_free(dsname, MAXNAMELEN);
}
/*
 * Replay a create-class log record: TX_CREATE, TX_CREATE_ATTR, TX_MKDIR,
 * TX_MKDIR_ATTR, TX_MKXATTR or TX_SYMLINK.
 *
 *	zfsvfs		file system being replayed into
 *	lr		the record; trailing data (optional attributes, FUID
 *			info, name, symlink target) follows it in memory
 *	byteswap	B_TRUE if the record is still in foreign byte order
 *
 * Returns 0 on success, and also when the object named by lr_foid already
 * exists.  Returns ENOTSUP for an unrecognized type, otherwise the error
 * from zfs_zget(), dmu_object_info() or the VOP that was invoked.
 */
static int
zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
{
	char *name = NULL;		/* location determined later */
	char *link;			/* symlink content follows name */
	znode_t *dzp;
	vnode_t *vp = NULL;
	xvattr_t xva;
	int vflg = 0;
	size_t lrsize = sizeof (lr_create_t);
	lr_attr_t *lrattr;
	void *start;
	size_t xvatlen;
	uint64_t txtype;
	struct componentname cn;
	int error;

	/*
	 * The record header must be in native order before lrc_txtype can
	 * be trusted, so decode the type only after swapping the record.
	 */
	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));
	txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
	if (byteswap &&
	    (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR))
		zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));

	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
		return (error);

	xva_init(&xva);
	zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);

	/*
	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
	 * eventually end up in zfs_mknode(), which assigns the object's
	 * creation time and generation number.  The generic VOP_CREATE()
	 * doesn't have either concept, so we smuggle the values inside
	 * the vattr's otherwise unused va_ctime and va_nblocks fields.
	 */
	ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
	xva.xva_vattr.va_nblocks = lr->lr_gen;

	/* Only proceed if the object does not exist yet. */
	error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
	if (error != ENOENT)
		goto out;

	/* Currently unused: the VOP calls below have the flag commented out. */
	if (lr->lr_common.lrc_txtype & TX_CI)
		vflg |= FIGNORECASE;

	/*
	 * Symlinks don't have fuid info, and CIFS never creates
	 * symlinks.
	 *
	 * The _ATTR versions will grab the fuid info in their subcases.
	 *
	 * Compare the type with TX_CI stripped; otherwise a case-insensitive
	 * record would never match and would be parsed here as well.
	 */
	if (txtype != TX_SYMLINK &&
	    txtype != TX_MKDIR_ATTR &&
	    txtype != TX_CREATE_ATTR) {
		start = (lr + 1);
		zfsvfs->z_fuid_replay =
		    zfs_replay_fuid_domain(start, &start,
		    lr->lr_uid, lr->lr_gid);
	}

	cn.cn_cred = kcred;
	cn.cn_thread = curthread;
	cn.cn_flags = SAVENAME;

	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
	switch (txtype) {
	case TX_CREATE_ATTR:
		/* Layout: lr_create_t, xvattr, FUID info, name. */
		lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
		xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
		zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
		start = (caddr_t)(lr + 1) + xvatlen;
		zfsvfs->z_fuid_replay =
		    zfs_replay_fuid_domain(start, &start,
		    lr->lr_uid, lr->lr_gid);
		name = (char *)start;

		/*FALLTHROUGH*/
	case TX_CREATE:
		if (name == NULL)
			name = (char *)start;

		cn.cn_nameptr = name;
		error = VOP_CREATE(ZTOV(dzp), &vp, &cn,
		    &xva.xva_vattr /*,vflg*/);
		break;
	case TX_MKDIR_ATTR:
		/* Layout: lr_create_t, xvattr, FUID info, name. */
		lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
		xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
		zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
		start = (caddr_t)(lr + 1) + xvatlen;
		zfsvfs->z_fuid_replay =
		    zfs_replay_fuid_domain(start, &start,
		    lr->lr_uid, lr->lr_gid);
		name = (char *)start;

		/*FALLTHROUGH*/
	case TX_MKDIR:
		if (name == NULL)
			name = (char *)(lr + 1);

		cn.cn_nameptr = name;
		error = VOP_MKDIR(ZTOV(dzp), &vp, &cn,
		    &xva.xva_vattr /*,vflg*/);
		break;
	case TX_MKXATTR:
		error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred);
		break;
	case TX_SYMLINK:
		/* The link target is the string right after the name. */
		name = (char *)(lr + 1);
		link = name + strlen(name) + 1;
		cn.cn_nameptr = name;
		error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn,
		    &xva.xva_vattr, link /*,vflg*/);
		break;
	default:
		error = ENOTSUP;
	}
	VOP_UNLOCK(ZTOV(dzp), 0);

out:
	if (error == 0 && vp != NULL)
		VN_URELE(vp);

	VN_RELE(ZTOV(dzp));

	if (zfsvfs->z_fuid_replay)
		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
	zfsvfs->z_fuid_replay = NULL;
	return (error);
}
/*
 * Replay a rename log record.
 *
 * The source and target names are two consecutive NUL-terminated strings
 * stored right after the lr_rename_t.  Both directories are fetched with
 * zfs_zget(), the source entry and (if present) the target entry are
 * looked up, and the result is handed to VOP_RENAME().
 *
 * Returns the error from zfs_zget(), VOP_LOOKUP() or VOP_RENAME().
 */
static int
zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap)
{
	char *from_name = (char *)(lr + 1);
	char *to_name = from_name + strlen(from_name) + 1;
	struct componentname from_cn, to_cn;
	znode_t *from_dzp, *to_dzp;
	vnode_t *from_vp = NULL;
	vnode_t *to_vp = NULL;
	kthread_t *td = curthread;
	int vflg = 0;
	int err;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	err = zfs_zget(zfsvfs, lr->lr_sdoid, &from_dzp);
	if (err != 0)
		return (err);

	err = zfs_zget(zfsvfs, lr->lr_tdoid, &to_dzp);
	if (err != 0) {
		VN_RELE(ZTOV(from_dzp));
		return (err);
	}

	/* Currently unused: VOP_RENAME() below has the flag commented out. */
	if (lr->lr_common.lrc_txtype & TX_CI)
		vflg |= FIGNORECASE;

	/* Look up the entry being renamed; its directory is locked meanwhile. */
	from_cn.cn_thread = td;
	from_cn.cn_cred = kcred;
	from_cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
	from_cn.cn_flags = ISLASTCN | SAVENAME;
	from_cn.cn_nameiop = DELETE;
	from_cn.cn_namelen = strlen(from_name);
	from_cn.cn_nameptr = from_name;

	vn_lock(ZTOV(from_dzp), LK_EXCLUSIVE | LK_RETRY);
	err = VOP_LOOKUP(ZTOV(from_dzp), &from_vp, &from_cn);
	VOP_UNLOCK(ZTOV(from_dzp), 0);
	if (err != 0)
		goto release;
	VOP_UNLOCK(from_vp, 0);

	/* Look up the destination; its directory stays locked for the rename. */
	to_cn.cn_thread = td;
	to_cn.cn_cred = kcred;
	to_cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
	to_cn.cn_flags = ISLASTCN | SAVENAME;
	to_cn.cn_nameiop = RENAME;
	to_cn.cn_namelen = strlen(to_name);
	to_cn.cn_nameptr = to_name;

	vn_lock(ZTOV(to_dzp), LK_EXCLUSIVE | LK_RETRY);
	err = VOP_LOOKUP(ZTOV(to_dzp), &to_vp, &to_cn);
	if (err != 0 && err != EJUSTRETURN) {
		VOP_UNLOCK(ZTOV(to_dzp), 0);
		goto release;
	}
	/* EJUSTRETURN is not treated as a failure; rename with no target. */
	if (err == EJUSTRETURN)
		to_vp = NULL;

	/*
	 * No references are dropped here after the call.
	 * NOTE(review): presumably VOP_RENAME() consumes them -- confirm.
	 */
	return (VOP_RENAME(ZTOV(from_dzp), from_vp, &from_cn,
	    ZTOV(to_dzp), to_vp, &to_cn /*,vflg*/));

release:
	if (from_vp != NULL)
		vrele(from_vp);
	if (to_vp != NULL)
		vrele(to_vp);
	VN_RELE(ZTOV(to_dzp));
	VN_RELE(ZTOV(from_dzp));

	return (err);
}
/*
 * Replay a TX_WRITE log record.
 *
 *	arg1		the zfsvfs_t being replayed into
 *	arg2		the lr_write_t; the data to write follows it in memory
 *	byteswap	B_TRUE if the record is still in foreign byte order
 *
 * Returns 0 on success, and also when the file no longer exists (the
 * write is dropped); otherwise the error from zfs_zget() or from the
 * write itself.
 */
static int
zfs_replay_write(void *arg1, char *arg2, boolean_t byteswap)
{
	zfsvfs_t *zfsvfs = (zfsvfs_t *)arg1;
	lr_write_t *lr = (lr_write_t *)arg2;
	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
	znode_t *zp;
	int error;
#ifndef LINUX_PORT
	ssize_t resid;
	uint64_t eod, offset, length;
#else
	uio_t uio;
	int vflg = 0;
	struct iovec iov;
#endif

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
		/*
		 * As we can log writes out of order, it's possible the
		 * file has been removed. In this case just drop the write
		 * and return success.
		 */
		if (error == ENOENT)
			error = 0;
		return (error);
	}

#ifndef LINUX_PORT
	offset = lr->lr_offset;
	length = lr->lr_length;
	/* End of the logged data, taken before any block expansion below. */
	eod = offset + length;

	/*
	 * This may be a write from a dmu_sync() for a whole block,
	 * and may extend beyond the current end of the file.
	 * We can't just replay what was written for this TX_WRITE as
	 * a future TX_WRITE2 may extend the eof and the data for that
	 * write needs to be there. So we write the whole block and
	 * reduce the eof. This needs to be done within the single dmu
	 * transaction created within vn_rdwr -> zfs_write. So a possible
	 * new end of file is passed through in zfsvfs->z_replay_eof
	 */

	zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);

		if (length < blocksize) {
			offset -= offset % blocksize;
			length = blocksize;
		}
		if (zp->z_size < eod)
			zfsvfs->z_replay_eof = eod;
	}

	error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset,
	    UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
#else
	/* Describe the logged data as a single kernel-space iovec. */
	iov.iov_base = (void *) data;
	iov.iov_len = lr->lr_length;

	uio.uio_iov = &iov;
	uio.uio_resid = lr->lr_length;
	uio.uio_iovcnt = 1;
	uio.uio_loffset = (offset_t)lr->lr_offset;
	uio.uio_limit = MAXOFFSET_T;
	uio.uio_segflg = UIO_SYSSPACE;

	error = VOP_WRITE(ZTOV(zp), &uio, vflg, NULL , NULL);
#endif

	VN_RELE(ZTOV(zp));
	zfsvfs->z_replay_eof = 0;	/* safety */

	return (error);
}