void zfs_rmnode(znode_t *zp) { zfs_sb_t *zsb = ZTOZSB(zp); objset_t *os = zsb->z_os; znode_t *xzp = NULL; dmu_tx_t *tx; uint64_t acl_obj; uint64_t xattr_obj; uint64_t count; int error; ASSERT(zp->z_links == 0); ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0); /* * If this is an attribute directory, purge its contents. */ if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) { error = zap_count(os, zp->z_id, &count); if (error) { zfs_znode_dmu_fini(zp); return; } if (count > 0) { taskq_t *taskq; /* * There are still directory entries in this xattr * directory. Let zfs_unlinked_drain() deal with * them to avoid deadlocking this process in the * zfs_purgedir()->zfs_zget()->ilookup() callpath * on the xattr inode's I_FREEING bit. */ taskq = dsl_pool_iput_taskq(dmu_objset_pool(os)); taskq_dispatch(taskq, (task_func_t *) zfs_unlinked_drain, zsb, TQ_SLEEP); zfs_znode_dmu_fini(zp); return; } } /* * Free up all the data in the file. */ error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); if (error) { /* * Not enough space. Leave the file in the unlinked set. */ zfs_znode_dmu_fini(zp); return; } /* * If the file has extended attributes, we're going to unlink * the xattr dir. */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zsb), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zsb, xattr_obj, &xzp); ASSERT(error == 0); } acl_obj = zfs_external_acl(zp); /* * Set up the final transaction. */ tx = dmu_tx_create(os); dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL); if (xzp) { dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, TRUE, NULL); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } if (acl_obj) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { /* * Not enough space to delete the file. Leave it in the * unlinked set, leaking it until the fs is remounted (at * which point we'll call zfs_unlinked_drain() to process it). */ dmu_tx_abort(tx); zfs_znode_dmu_fini(zp); goto out; } if (xzp) { ASSERT(error == 0); mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ xzp->z_links = 0; /* no more links to it */ VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zsb), &xzp->z_links, sizeof (xzp->z_links), tx)); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); } /* Remove this znode from the unlinked set */ VERIFY3U(0, ==, zap_remove_int(zsb->z_os, zsb->z_unlinkedobj, zp->z_id, tx)); zfs_znode_delete(zp, tx); dmu_tx_commit(tx); out: if (xzp) iput(ZTOI(xzp)); }
/* * Link zp into dl. Can only fail if zp has been unlinked. */ int zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) { znode_t *dzp = dl->dl_dzp; zfs_sb_t *zsb = ZTOZSB(zp); uint64_t value; int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); sa_bulk_attr_t bulk[5]; uint64_t mtime[2], ctime[2]; int count = 0; int error; mutex_enter(&zp->z_lock); if (!(flag & ZRENAMING)) { if (zp->z_unlinked) { /* no new links to unlinked zp */ ASSERT(!(flag & (ZNEW | ZEXISTS))); mutex_exit(&zp->z_lock); return (ENOENT); } zp->z_links++; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zsb), NULL, &zp->z_links, sizeof (zp->z_links)); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zsb), NULL, &dzp->z_id, sizeof (dzp->z_id)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (!(flag & ZNEW)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE); } error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); mutex_exit(&zp->z_lock); mutex_enter(&dzp->z_lock); dzp->z_size++; dzp->z_links += zp_is_dir; count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zsb), NULL, &dzp->z_size, sizeof (dzp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zsb), NULL, &dzp->z_links, sizeof (dzp->z_links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); mutex_exit(&dzp->z_lock); value = zfs_dirent(zp, zp->z_mode); error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1, &value, tx); ASSERT(error == 0); return (0); }
void
zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
{
	dmu_buf_t *db = sa_get_db(hdl);
	znode_t *zp = sa_get_userdata(hdl);
	zfs_sb_t *zsb = ZTOZSB(zp);
	int count = 0;
	sa_bulk_attr_t *bulk, *sa_attrs;
	zfs_acl_locator_cb_t locate = { 0 };
	uint64_t uid, gid, mode, rdev, xattr, parent;
	uint64_t crtime[2], mtime[2], ctime[2];
	zfs_acl_phys_t znode_acl;
	char scanstamp[AV_SCANSTAMP_SZ];
	boolean_t drop_lock = B_FALSE;

	/*
	 * No upgrade if the ACL isn't cached, since we won't know which
	 * locks are held and reading the ACL would require special "locked"
	 * interfaces that would be messy.
	 */
	if (zp->z_acl_cached == NULL || S_ISLNK(ZTOI(zp)->i_mode))
		return;

	/*
	 * If the z_lock is held and we aren't the owner, then just return
	 * since we don't want to deadlock trying to update the status of
	 * z_is_sa.  This file can then be upgraded at a later time.
	 *
	 * Otherwise, we know we are doing the sa_update() that caused
	 * us to enter this function.
	 */
	if (mutex_owner(&zp->z_lock) != curthread) {
		if (mutex_tryenter(&zp->z_lock) == 0)
			return;
		else
			drop_lock = B_TRUE;
	}

	/* First do a bulk query of the attributes that aren't cached */
	bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * 20, KM_SLEEP);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zsb), NULL, &crtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zsb), NULL, &mode, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zsb), NULL, &parent, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zsb), NULL, &xattr, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zsb), NULL, &rdev, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zsb), NULL, &uid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zsb), NULL, &gid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zsb), NULL,
	    &znode_acl, 88);

	if (sa_bulk_lookup_locked(hdl, bulk, count) != 0) {
		kmem_free(bulk, sizeof (sa_bulk_attr_t) * 20);
		goto done;
	}

	/*
	 * While the order here doesn't matter, it's best to try and organize
	 * it in such a way as to pick up an already existing layout number.
	 */
	count = 0;
	sa_attrs = kmem_zalloc(sizeof (sa_bulk_attr_t) * 20, KM_SLEEP);
	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zsb), NULL, &mode, 8);
	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zsb), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zsb), NULL,
	    &zp->z_gen, 8);
	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zsb), NULL, &uid, 8);
	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zsb), NULL, &gid, 8);
	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zsb), NULL,
	    &parent, 8);
	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zsb), NULL,
	    &zp->z_pflags, 8);
	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zsb), NULL,
	    zp->z_atime, 16);
	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zsb), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zsb), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zsb), NULL,
	    &crtime, 16);
	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zsb), NULL,
	    &zp->z_links, 8);
	if (S_ISBLK(ZTOI(zp)->i_mode) || S_ISCHR(ZTOI(zp)->i_mode))
		SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zsb), NULL,
		    &rdev, 8);
	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zsb), NULL,
	    &zp->z_acl_cached->z_acl_count, 8);

	if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID)
		zfs_acl_xform(zp, zp->z_acl_cached, CRED());

	locate.cb_aclp = zp->z_acl_cached;
	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zsb),
	    zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes);

	if (xattr)
		SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zsb), NULL,
		    &xattr, 8);

	/* if scanstamp then add scanstamp */
	if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) {
		abd_copy_to_buf_off(scanstamp, db->db_data,
		    AV_SCANSTAMP_SZ, ZFS_OLD_ZNODE_PHYS_SIZE);
		SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zsb), NULL,
		    scanstamp, AV_SCANSTAMP_SZ);
		zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP;
	}

	VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0);
	VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs,
	    count, tx) == 0);
	if (znode_acl.z_acl_extern_obj)
		VERIFY(0 == dmu_object_free(zsb->z_os,
		    znode_acl.z_acl_extern_obj, tx));

	zp->z_is_sa = B_TRUE;
	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * 20);
	kmem_free(bulk, sizeof (sa_bulk_attr_t) * 20);
done:
	if (drop_lock)
		mutex_exit(&zp->z_lock);
}
/*
 * Lock a directory entry.  A dirlock on <dzp, name> protects that name
 * in dzp's directory zap object.  As long as you hold a dirlock, you can
 * assume two things: (1) dzp cannot be reaped, and (2) no other thread
 * can change the zap entry for (i.e. link or unlink) this name.
 *
 * Input arguments:
 *	dzp	- znode for directory
 *	name	- name of entry to lock
 *	flag	- ZNEW: if the entry already exists, fail with EEXIST.
 *		  ZEXISTS: if the entry does not exist, fail with ENOENT.
 *		  ZSHARED: allow concurrent access with other ZSHARED callers.
 *		  ZXATTR: we want dzp's xattr directory
 *		  ZCILOOK: On a mixed sensitivity file system,
 *			   this lookup should be case-insensitive.
 *		  ZCIEXACT: On a purely case-insensitive file system,
 *			    this lookup should be case-sensitive.
 *		  ZRENAMING: we are locking for renaming, force narrow locks
 *		  ZHAVELOCK: Don't grab the z_name_lock for this call. The
 *			     current thread already holds it.
 *
 * Output arguments:
 *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
 *	dlpp	- pointer to the dirlock for this entry (NULL on error)
 *	direntflags - (case-insensitive lookup only)
 *		flags if multiple case-sensitive matches exist in directory
 *	realpnp	- (case-insensitive lookup only)
 *		actual name matched within the directory
 *
 * Return value: 0 on success or errno on failure.
 *
 * NOTE: Always checks for, and rejects, '.' and '..'.
 * NOTE: For case-insensitive file systems we take wide locks (see below),
 *	 but return znode pointers to a single match.
 */
int
zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
    int flag, int *direntflags, pathname_t *realpnp)
{
	zfs_sb_t	*zsb = ZTOZSB(dzp);
	zfs_dirlock_t	*dl;
	boolean_t	update;
	boolean_t	exact;
	uint64_t	zoid;
#ifdef HAVE_DNLC
	vnode_t		*vp = NULL;
#endif /* HAVE_DNLC */
	int		error = 0;
	int		cmpflags;

	*zpp = NULL;
	*dlpp = NULL;

	/*
	 * Verify that we are not trying to lock '.', '..', or '.zfs'
	 */
	if ((name[0] == '.' &&
	    (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) ||
	    (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))
		return (EEXIST);

	/*
	 * Case sensitivity and normalization preferences are set when
	 * the file system is created.  These are stored in the
	 * zsb->z_case and zsb->z_norm fields.  These choices
	 * affect what vnodes can be cached in the DNLC, how we
	 * perform zap lookups, and the "width" of our dirlocks.
	 *
	 * A normal dirlock locks a single name.  Note that with
	 * normalization a name can be composed multiple ways, but
	 * when normalized, these names all compare equal.  A wide
	 * dirlock locks multiple names.  We need these when the file
	 * system is supporting mixed-mode access.  It is sometimes
	 * necessary to lock all case permutations of file name at
	 * once so that simultaneous case-insensitive/case-sensitive
	 * behaves as rationally as possible.
	 */

	/*
	 * Decide if exact matches should be requested when performing
	 * a zap lookup on file systems supporting case-insensitive
	 * access.
	 */
	exact =
	    ((zsb->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) ||
	    ((zsb->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK));

	/*
	 * Only look in or update the DNLC if we are looking for the
	 * name on a file system that does not require normalization
	 * or case folding.  We can also look there if we happen to be
	 * on a non-normalizing, mixed sensitivity file system IF we
	 * are looking for the exact name.
	 *
	 * Maybe can add TO-UPPERed version of name to dnlc in ci-only
	 * case for performance improvement?
	 */
	update = !zsb->z_norm ||
	    ((zsb->z_case == ZFS_CASE_MIXED) &&
	    !(zsb->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));

	/*
	 * ZRENAMING indicates we are in a situation where we should
	 * take narrow locks regardless of the file system's
	 * preferences for normalizing and case folding.  This will
	 * prevent us deadlocking trying to grab the same wide lock
	 * twice if the two names happen to be case-insensitive
	 * matches.
	 */
	if (flag & ZRENAMING)
		cmpflags = 0;
	else
		cmpflags = zsb->z_norm;

	/*
	 * Wait until there are no locks on this name.
	 *
	 * Don't grab the lock if it is already held.  However, we cannot
	 * have both ZSHARED and ZHAVELOCK together.
	 */
	ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
	if (!(flag & ZHAVELOCK))
		rw_enter(&dzp->z_name_lock, RW_READER);

	mutex_enter(&dzp->z_lock);
	for (;;) {
		if (dzp->z_unlinked) {
			mutex_exit(&dzp->z_lock);
			if (!(flag & ZHAVELOCK))
				rw_exit(&dzp->z_name_lock);
			return (ENOENT);
		}
		for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
			if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
			    U8_UNICODE_LATEST, &error) == 0) || error != 0)
				break;
		}
		if (error != 0) {
			mutex_exit(&dzp->z_lock);
			if (!(flag & ZHAVELOCK))
				rw_exit(&dzp->z_name_lock);
			return (ENOENT);
		}
		if (dl == NULL) {
			/*
			 * Allocate a new dirlock and add it to the list.
			 */
			dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
			cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
			dl->dl_name = name;
			dl->dl_sharecnt = 0;
			dl->dl_namelock = 0;
			dl->dl_namesize = 0;
			dl->dl_dzp = dzp;
			dl->dl_next = dzp->z_dirlocks;
			dzp->z_dirlocks = dl;
			break;
		}
		if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
			break;
		cv_wait(&dl->dl_cv, &dzp->z_lock);
	}

	/*
	 * If the z_name_lock was NOT held for this dirlock record it.
	 */
	if (flag & ZHAVELOCK)
		dl->dl_namelock = 1;

	if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
		/*
		 * We're the second shared reference to dl.  Make a copy of
		 * dl_name in case the first thread goes away before we do.
		 * Note that we initialize the new name before storing its
		 * pointer into dl_name, because the first thread may load
		 * dl->dl_name at any time.  He'll either see the old value,
		 * which is his, or the new shared copy; either is OK.
		 */
		dl->dl_namesize = strlen(dl->dl_name) + 1;
		name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
		bcopy(dl->dl_name, name, dl->dl_namesize);
		dl->dl_name = name;
	}

	mutex_exit(&dzp->z_lock);

	/*
	 * We have a dirlock on the name.  (Note that it is the dirlock,
	 * not the dzp's z_lock, that protects the name in the zap object.)
	 * See if there's an object by this name; if so, put a hold on it.
	 */
	if (flag & ZXATTR) {
		error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zsb), &zoid,
		    sizeof (zoid));
		if (error == 0)
			error = (zoid == 0 ? ENOENT : 0);
	} else {
#ifdef HAVE_DNLC
		if (update)
			vp = dnlc_lookup(ZTOI(dzp), name);
		if (vp == DNLC_NO_VNODE) {
			iput(vp);
			error = ENOENT;
		} else if (vp) {
			if (flag & ZNEW) {
				zfs_dirent_unlock(dl);
				iput(vp);
				return (EEXIST);
			}
			*dlpp = dl;
			*zpp = VTOZ(vp);
			return (0);
		} else {
			error = zfs_match_find(zsb, dzp, name, exact,
			    update, direntflags, realpnp, &zoid);
		}
#else
		error = zfs_match_find(zsb, dzp, name, exact,
		    update, direntflags, realpnp, &zoid);
#endif /* HAVE_DNLC */
	}
	if (error) {
		if (error != ENOENT || (flag & ZEXISTS)) {
			zfs_dirent_unlock(dl);
			return (error);
		}
	} else {
		if (flag & ZNEW) {
			zfs_dirent_unlock(dl);
			return (EEXIST);
		}
		error = zfs_zget(zsb, zoid, zpp);
		if (error) {
			zfs_dirent_unlock(dl);
			return (error);
		}
#ifdef HAVE_DNLC
		if (!(flag & ZXATTR) && update)
			dnlc_update(ZTOI(dzp), name, ZTOI(*zpp));
#endif /* HAVE_DNLC */
	}

	*dlpp = dl;

	return (0);
}
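/*
 * Illustrative sketch (not part of the original file): a minimal caller of
 * zfs_dirent_lock() following the protocol described in the comment above --
 * take the dirlock, use the held znode, then drop the dirlock.  It only uses
 * names that appear elsewhere in this file (zfs_dirent_unlock(), ZEXISTS,
 * ZSHARED, ZTOI(), iput()); the helper itself is hypothetical.
 */
static int
example_lookup_hold(znode_t *dzp, char *name, znode_t **zpp)
{
	zfs_dirlock_t *dl;
	int error;

	/* Fail with ENOENT if the name is absent; allow shared access. */
	error = zfs_dirent_lock(&dl, dzp, name, zpp, ZEXISTS | ZSHARED,
	    NULL, NULL);
	if (error)
		return (error);

	/*
	 * While the dirlock is held the entry cannot be linked or unlinked;
	 * *zpp carries the inode reference taken by zfs_zget().
	 */
	zfs_dirent_unlock(dl);
	return (0);		/* caller eventually does iput(ZTOI(*zpp)) */
}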
/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_XATTR	- new object is an attribute
 *		bonuslen - length of bonus buffer
 *		setaclp  - File/Dir initial ACL
 *		fuidp	 - Tracks fuid allocation.
 *
 *	OUT:	zpp	- allocated znode
 *
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
    uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
{
	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
	uint64_t	mode, size, links, parent, pflags;
	uint64_t	dzp_pflags = 0;
	uint64_t	rdev = 0;
	zfs_sb_t	*zsb = ZTOZSB(dzp);
	dmu_buf_t	*db;
	timestruc_t	now;
	uint64_t	gen, obj;
	int		err;
	int		bonuslen;
	sa_handle_t	*sa_hdl;
	dmu_object_type_t obj_type;
	sa_bulk_attr_t	*sa_attrs;
	int		cnt = 0;
	zfs_acl_locator_cb_t locate = { 0 };

	if (zsb->z_replay) {
		obj = vap->va_nodeid;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
	} else {
		obj = 0;
		gethrestime(&now);
		gen = dmu_tx_get_txg(tx);
	}

	obj_type = zsb->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
	bonuslen = (obj_type == DMU_OT_SA) ?
	    DN_MAX_BONUSLEN : ZFS_OLD_ZNODE_PHYS_SIZE;

	/*
	 * Create a new DMU object.
	 */
	/*
	 * There's currently no mechanism for pre-reading the blocks that will
	 * be needed to allocate a new object, so we accept the small chance
	 * that there will be an i/o error and we will fail one of the
	 * assertions below.
	 */
	if (S_ISDIR(vap->va_mode)) {
		if (zsb->z_replay) {
			err = zap_create_claim_norm(zsb->z_os, obj,
			    zsb->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, tx);
			ASSERT0(err);
		} else {
			obj = zap_create_norm(zsb->z_os,
			    zsb->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, tx);
		}
	} else {
		if (zsb->z_replay) {
			err = dmu_object_claim(zsb->z_os, obj,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, tx);
			ASSERT0(err);
		} else {
			obj = dmu_object_alloc(zsb->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, tx);
		}
	}

	ZFS_OBJ_HOLD_ENTER(zsb, obj);
	VERIFY(0 == sa_buf_hold(zsb->z_os, obj, NULL, &db));

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_id = obj;
	} else {
		dzp_pflags = dzp->z_pflags;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp_pflags & ZFS_XATTR) {
		flag |= IS_XATTR;
	}

	if (zsb->z_use_fuids)
		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
	else
		pflags = 0;

	if (S_ISDIR(vap->va_mode)) {
		size = 2;		/* contents ("." and "..") */
		links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
	} else {
		size = links = 0;
	}

	if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
		rdev = vap->va_rdev;

	parent = dzp->z_id;
	mode = acl_ids->z_mode;
	if (flag & IS_XATTR)
		pflags |= ZFS_XATTR;

	/*
	 * No execs denied will be determined when zfs_mode_compute() is
	 * called.
	 */
	pflags |= acl_ids->z_aclp->z_hints &
	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);

	ZFS_TIME_ENCODE(&now, crtime);
	ZFS_TIME_ENCODE(&now, ctime);

	if (vap->va_mask & ATTR_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, atime);
	} else {
		ZFS_TIME_ENCODE(&now, atime);
	}

	if (vap->va_mask & ATTR_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
	} else {
		ZFS_TIME_ENCODE(&now, mtime);
	}

	/* Now add in all of the "SA" attributes */
	VERIFY(0 == sa_handle_get_from_db(zsb->z_os, db, NULL, SA_HDL_SHARED,
	    &sa_hdl));

	/*
	 * Setup the array of attributes to be replaced/set on the new file
	 *
	 * order for DMU_OT_ZNODE is critical since it needs to be constructed
	 * in the old znode_phys_t format.  Don't change this ordering
	 */
	sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_PUSHPAGE);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zsb),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zsb),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zsb),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zsb),
		    NULL, &crtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zsb),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zsb),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zsb),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zsb),
		    NULL, &parent, 8);
	} else {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zsb),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zsb),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zsb),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zsb),
		    NULL, &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zsb),
		    NULL, &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zsb),
		    NULL, &parent, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zsb),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zsb),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zsb),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zsb),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zsb),
		    NULL, &crtime, 16);
	}

	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zsb), NULL, &links, 8);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zsb), NULL,
		    &empty_xattr, 8);
	}
	if (obj_type == DMU_OT_ZNODE ||
	    (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zsb),
		    NULL, &rdev, 8);
	}
	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zsb),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zsb), NULL,
		    &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zsb), NULL,
		    &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zsb), NULL, pad,
		    sizeof (uint64_t) * 4);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zsb), NULL,
		    &acl_phys, sizeof (zfs_acl_phys_t));
	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zsb), NULL,
		    &acl_ids->z_aclp->z_acl_count, 8);
		locate.cb_aclp = acl_ids->z_aclp;
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zsb),
		    zfs_acl_data_locator, &locate,
		    acl_ids->z_aclp->z_acl_bytes);
		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
		    acl_ids->z_fuid, acl_ids->z_fgid);
	}

	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);

	if (!(flag & IS_ROOT_NODE)) {
		*zpp = zfs_znode_alloc(zsb, db, 0, obj_type, obj, sa_hdl,
		    ZTOI(dzp));
		VERIFY(*zpp != NULL);
		VERIFY(dzp != NULL);
	} else {
		/*
		 * If we are creating the root node, the "parent" we
		 * passed in is the znode for the root.
		 */
		*zpp = dzp;

		(*zpp)->z_sa_hdl = sa_hdl;
	}

	(*zpp)->z_pflags = pflags;
	(*zpp)->z_mode = mode;

	if (obj_type == DMU_OT_ZNODE ||
	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
		err = zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx);
		ASSERT0(err);
	}
	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
	ZFS_OBJ_HOLD_EXIT(zsb, obj);
}
/*
 * Reopen zfs_sb_t and release VFS ops.
 */
int
zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
{
	int err, err2;

	ASSERT(RRW_WRITE_HELD(&zsb->z_teardown_lock));
	ASSERT(RW_WRITE_HELD(&zsb->z_teardown_inactive_lock));

	err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zsb, &zsb->z_os);
	if (err) {
		zsb->z_os = NULL;
	} else {
		znode_t *zp;
		uint64_t sa_obj = 0;

		err2 = zap_lookup(zsb->z_os, MASTER_NODE_OBJ,
		    ZFS_SA_ATTRS, 8, 1, &sa_obj);

		if ((err || err2) && zsb->z_version >= ZPL_VERSION_SA)
			goto bail;

		if ((err = sa_setup(zsb->z_os, sa_obj,
		    zfs_attr_table, ZPL_END, &zsb->z_attr_table)) != 0)
			goto bail;

		VERIFY(zfs_sb_setup(zsb, B_FALSE) == 0);
		zsb->z_rollback_time = jiffies;

		/*
		 * Attempt to re-establish all the active inodes with their
		 * dbufs.  If a zfs_rezget() fails, then we unhash the inode
		 * and mark it stale.  This prevents a collision if a new
		 * inode/object is created which must use the same inode
		 * number.  The stale inode will be released when the
		 * VFS prunes the dentry holding the remaining references
		 * on the stale inode.
		 */
		mutex_enter(&zsb->z_znodes_lock);
		for (zp = list_head(&zsb->z_all_znodes); zp;
		    zp = list_next(&zsb->z_all_znodes, zp)) {
			err2 = zfs_rezget(zp);
			if (err2) {
				remove_inode_hash(ZTOI(zp));
				zp->z_is_stale = B_TRUE;
			}
		}
		mutex_exit(&zsb->z_znodes_lock);
	}

bail:
	/* release the VFS ops */
	rw_exit(&zsb->z_teardown_inactive_lock);
	rrw_exit(&zsb->z_teardown_lock, FTAG);

	if (err) {
		/*
		 * Since we couldn't reopen zfs_sb_t or set up the SA
		 * framework, force unmount this file system.
		 */
		if (zsb->z_os)
			(void) zfs_umount(zsb->z_sb);
	}
	return (err);
}
int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) { zfs_sb_t *zsb = sb->s_fs_info; znode_t *zp; uint64_t object = 0; uint64_t fid_gen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; *ipp = NULL; ZFS_ENTER(zsb); if (fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; uint64_t setgen = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); ZFS_EXIT(zsb); err = zfsctl_lookup_objset(sb, objsetid, &zsb); if (err) return (SET_ERROR(EINVAL)); ZFS_ENTER(zsb); } if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { ZFS_EXIT(zsb); return (SET_ERROR(EINVAL)); } /* A zero fid_gen means we are in the .zfs control directories */ if (fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { *ipp = zsb->z_ctldir; ASSERT(*ipp != NULL); if (object == ZFSCTL_INO_SNAPDIR) { VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp, 0, kcred, NULL, NULL) == 0); } else { igrab(*ipp); } ZFS_EXIT(zsb); return (0); } gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask); if ((err = zfs_zget(zsb, object, &zp))) { ZFS_EXIT(zsb); return (err); } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zsb), &zp_gen, sizeof (uint64_t)); zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen, fid_gen); iput(ZTOI(zp)); ZFS_EXIT(zsb); return (SET_ERROR(EINVAL)); } *ipp = ZTOI(zp); if (*ipp) zfs_inode_update(ITOZ(*ipp)); ZFS_EXIT(zsb); return (0); }
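/*
 * Illustrative sketch (not from the original source): the byte-array packing
 * that the decode loops in zfs_vget() undo.  The object number and generation
 * are stored as little-endian byte arrays so the fid is endian-neutral; the
 * helper name below is hypothetical, the zfid_short_t fields are the ones
 * referenced above.
 */
static void
example_pack_short_fid(zfid_short_t *zfid, uint64_t object, uint64_t gen)
{
	int i;

	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
}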
/* * Free space in a file * * IN: zp - znode of file to free data in. * off - start of range * len - end of range (0 => EOF) * flag - current file open mode flags. * log - TRUE if this action should be logged * * RETURN: 0 on success, error code on failure */ int zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) { struct inode *ip = ZTOI(zp); dmu_tx_t *tx; zfs_sb_t *zsb = ZTOZSB(zp); zilog_t *zilog = zsb->z_log; uint64_t mode; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; int error; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zsb), &mode, sizeof (mode))) != 0) return (error); if (off > zp->z_size) { error = zfs_extend(zp, off+len); if (error == 0 && log) goto log; else return (error); } /* * Check for any locks in the region to be freed. */ if (ip->i_flock && mandatory_lock(ip)) { uint64_t length = (len ? len : zp->z_size - off); if (!lock_may_write(ip, off, length)) return (SET_ERROR(EAGAIN)); } if (len == 0) { error = zfs_trunc(zp, off); } else { if ((error = zfs_free_range(zp, off, len)) == 0 && off + len > zp->z_size) error = zfs_extend(zp, off+len); } if (error || !log) return (error); log: tx = dmu_tx_create(zsb->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); dmu_tx_commit(tx); zfs_inode_update(zp); return (0); }
/*
 * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and
 * TX_MKXATTR transactions.
 *
 * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
 * domain information appended prior to the name.  In this case the
 * uid/gid in the log record will be a log centric FUID.
 *
 * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that
 * may contain attributes, ACL and optional fuid information.
 *
 * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify
 * an ACL and normal users/groups in the ACEs.
 *
 * There may be optional xvattr attribute information similar
 * to zfs_log_setattr.
 *
 * Also, after the file name "domain" strings may be appended.
 */
void
zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
    znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp,
    zfs_fuid_info_t *fuidp, vattr_t *vap)
{
	itx_t *itx;
	lr_create_t *lr;
	lr_acl_create_t *lracl;
	size_t aclsize = 0;
	size_t xvatsize = 0;
	size_t txsize;
	xvattr_t *xvap = (xvattr_t *)vap;
	void *end;
	size_t lrsize;
	size_t namesize = strlen(name) + 1;
	size_t fuidsz = 0;

	if (zil_replaying(zilog, tx) || zfs_xattr_owner_unlinked(dzp))
		return;

	/*
	 * If we have FUIDs present then add in space for
	 * domains and ACE fuid's if any.
	 */
	if (fuidp) {
		fuidsz += fuidp->z_domain_str_sz;
		fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t);
	}

	if (vap->va_mask & ATTR_XVATTR)
		xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize);

	if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR ||
	    (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR ||
	    (int)txtype == TX_MKXATTR) {
		txsize = sizeof (*lr) + namesize + fuidsz + xvatsize;
		lrsize = sizeof (*lr);
	} else {
		txsize =
		    sizeof (lr_acl_create_t) + namesize + fuidsz +
		    ZIL_ACE_LENGTH(aclsize) + xvatsize;
		lrsize = sizeof (lr_acl_create_t);
	}

	itx = zil_itx_create(txtype, txsize);

	lr = (lr_create_t *)&itx->itx_lr;
	lr->lr_doid = dzp->z_id;
	lr->lr_foid = zp->z_id;
	/* Store dnode slot count in 8 bits above object id. */
	LR_FOID_SET_SLOTS(lr->lr_foid, zp->z_dnodesize >> DNODE_SHIFT);
	lr->lr_mode = zp->z_mode;
	if (!IS_EPHEMERAL(KUID_TO_SUID(ZTOI(zp)->i_uid))) {
		lr->lr_uid = (uint64_t)KUID_TO_SUID(ZTOI(zp)->i_uid);
	} else {
		lr->lr_uid = fuidp->z_fuid_owner;
	}
	if (!IS_EPHEMERAL(KGID_TO_SGID(ZTOI(zp)->i_gid))) {
		lr->lr_gid = (uint64_t)KGID_TO_SGID(ZTOI(zp)->i_gid);
	} else {
		lr->lr_gid = fuidp->z_fuid_group;
	}
	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &lr->lr_gen,
	    sizeof (uint64_t));
	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
	    lr->lr_crtime, sizeof (uint64_t) * 2);

	if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(ZTOZSB(zp)), &lr->lr_rdev,
	    sizeof (lr->lr_rdev)) != 0)
		lr->lr_rdev = 0;

	/*
	 * Fill in xvattr info if any
	 */
	if (vap->va_mask & ATTR_XVATTR) {
		zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap);
		end = (caddr_t)lr + lrsize + xvatsize;
	} else {
		end = (caddr_t)lr + lrsize;
	}

	/* Now fill in any ACL info */

	if (vsecp) {
		lracl = (lr_acl_create_t *)&itx->itx_lr;
		lracl->lr_aclcnt = vsecp->vsa_aclcnt;
		lracl->lr_acl_bytes = aclsize;
		lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
		lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
		if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS)
			lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
		else
			lracl->lr_acl_flags = 0;

		bcopy(vsecp->vsa_aclentp, end, aclsize);
		end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize);
	}

	/* drop in FUID info */
	if (fuidp) {
		end = zfs_log_fuid_ids(fuidp, end);
		end = zfs_log_fuid_domains(fuidp, end);
	}
	/*
	 * Now place file name in log record
	 */
	bcopy(name, end, namesize);

	zil_itx_assign(zilog, itx, tx);
}
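/*
 * Illustrative sketch (not part of the original file): the layout of the
 * variable portions of a TX_*_ACL[_ATTR] create record after the fixed
 * lr_acl_create_t header, mirroring how zfs_log_create() advances 'end':
 *
 *	[lr_acl_create_t][xvattr (optional)][ACEs][FUID ids][FUID domains][name]
 *
 * The helper name is hypothetical; the sizes correspond to the txsize
 * computation above.
 */
static char *
example_acl_create_name(lr_acl_create_t *lracl, size_t xvatsize,
    size_t aclsize, size_t fuidsz)
{
	char *p = (char *)(lracl + 1);		/* end of fixed header */

	p += xvatsize;				/* optional xvattr block */
	p += ZIL_ACE_LENGTH(aclsize);		/* ACEs, 8-byte aligned */
	p += fuidsz;				/* FUID ids + domain strings */
	return (p);				/* the file name starts here */
}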
int zfs_zget(zfs_sb_t *zsb, uint64_t obj_num, znode_t **zpp) { dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; int err; sa_handle_t *hdl; *zpp = NULL; again: ZFS_OBJ_HOLD_ENTER(zsb, obj_num); err = sa_buf_hold(zsb->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zsb, obj_num); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zsb, obj_num); return (SET_ERROR(EINVAL)); } hdl = dmu_buf_get_user(db); if (hdl != NULL) { zp = sa_get_userdata(hdl); /* * Since "SA" does immediate eviction we * should never find a sa handle that doesn't * know about the znode. */ ASSERT3P(zp, !=, NULL); mutex_enter(&zp->z_lock); ASSERT3U(zp->z_id, ==, obj_num); if (zp->z_unlinked) { err = SET_ERROR(ENOENT); } else { /* * If igrab() returns NULL the VFS has independently * determined the inode should be evicted and has * called iput_final() to start the eviction process. * The SA handle is still valid but because the VFS * requires that the eviction succeed we must drop * our locks and references to allow the eviction to * complete. The zfs_zget() may then be retried. * * This unlikely case could be optimized by registering * a sops->drop_inode() callback. The callback would * need to detect the active SA hold thereby informing * the VFS that this inode should not be evicted. */ if (igrab(ZTOI(zp)) == NULL) { mutex_exit(&zp->z_lock); sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zsb, obj_num); goto again; } *zpp = zp; err = 0; } mutex_exit(&zp->z_lock); sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zsb, obj_num); return (err); }
static int zfs_replay_write(zfs_sb_t *zsb, lr_write_t *lr, boolean_t byteswap) { char *data = (char *)(lr + 1); /* data follows lr_write_t */ znode_t *zp; int error; uint64_t eod, offset, length; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zsb, lr->lr_foid, &zp)) != 0) { /* * As we can log writes out of order, it's possible the * file has been removed. In this case just drop the write * and return success. */ if (error == ENOENT) error = 0; return (error); } offset = lr->lr_offset; length = lr->lr_length; eod = offset + length; /* end of data for this write */ /* * This may be a write from a dmu_sync() for a whole block, * and may extend beyond the current end of the file. * We can't just replay what was written for this TX_WRITE as * a future TX_WRITE2 may extend the eof and the data for that * write needs to be there. So we write the whole block and * reduce the eof. This needs to be done within the single dmu * transaction created within vn_rdwr -> zfs_write. So a possible * new end of file is passed through in zsb->z_replay_eof */ zsb->z_replay_eof = 0; /* 0 means don't change end of file */ /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } if (zp->z_size < eod) zsb->z_replay_eof = eod; } error = zpl_write_common(ZTOI(zp), data, length, offset, UIO_SYSSPACE, 0, kcred); if (error) { if (error < 0) error = -error; else error = EIO; /* Short write */ } iput(ZTOI(zp)); zsb->z_replay_eof = 0; /* safety */ return (error); }
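/*
 * Illustrative sketch (not from the original source): the block alignment
 * performed above when replaying a dmu_sync() record.  For example, with a
 * 128K block, offset = 130560 and length = 512 become offset = 0 and
 * length = 131072, so the whole block is rewritten and the eof is adjusted
 * separately via z_replay_eof.  The helper name is hypothetical.
 */
static void
example_align_replay_write(uint64_t *offset, uint64_t *length,
    uint64_t blocksize)
{
	if (*length < blocksize) {
		*offset -= *offset % blocksize;	/* round down to block start */
		*length = blocksize;		/* replay the full block */
	}
}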
static int zfs_replay_create(zfs_sb_t *zsb, lr_create_t *lr, boolean_t byteswap) { char *name = NULL; /* location determined later */ char *link; /* symlink content follows name */ znode_t *dzp; struct inode *ip = NULL; xvattr_t xva; int vflg = 0; size_t lrsize = sizeof (lr_create_t); lr_attr_t *lrattr; void *start; size_t xvatlen; uint64_t txtype; int error; txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); } if ((error = zfs_zget(zsb, lr->lr_doid, &dzp)) != 0) return (error); xva_init(&xva); zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); /* * All forms of zfs create (create, mkdir, mkxattrdir, symlink) * eventually end up in zfs_mknode(), which assigns the object's * creation time and generation number. The generic zfs_create() * doesn't have either concept, so we smuggle the values inside * the vattr's otherwise unused va_ctime and va_nblocks fields. */ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); xva.xva_vattr.va_nblocks = lr->lr_gen; error = dmu_object_info(zsb->z_os, lr->lr_foid, NULL); if (error != ENOENT) goto out; if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; /* * Symlinks don't have fuid info, and CIFS never creates * symlinks. * * The _ATTR versions will grab the fuid info in their subcases. */ if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK && (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR && (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) { start = (lr + 1); zsb->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); } switch (txtype) { case TX_CREATE_ATTR: lrattr = (lr_attr_t *)(caddr_t)(lr + 1); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); start = (caddr_t)(lr + 1) + xvatlen; zsb->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); name = (char *)start; /*FALLTHROUGH*/ case TX_CREATE: if (name == NULL) name = (char *)start; error = zfs_create(ZTOI(dzp), name, &xva.xva_vattr, 0, 0, &ip, kcred, vflg, NULL); break; case TX_MKDIR_ATTR: lrattr = (lr_attr_t *)(caddr_t)(lr + 1); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); start = (caddr_t)(lr + 1) + xvatlen; zsb->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); name = (char *)start; /*FALLTHROUGH*/ case TX_MKDIR: if (name == NULL) name = (char *)(lr + 1); error = zfs_mkdir(ZTOI(dzp), name, &xva.xva_vattr, &ip, kcred, vflg, NULL); break; case TX_MKXATTR: error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &ip, kcred); break; case TX_SYMLINK: name = (char *)(lr + 1); link = name + strlen(name) + 1; error = zfs_symlink(ZTOI(dzp), name, &xva.xva_vattr, link, &ip, kcred, vflg); break; default: error = ENOTSUP; } out: if (error == 0 && ip != NULL) iput(ip); iput(ZTOI(dzp)); if (zsb->z_fuid_replay) zfs_fuid_info_free(zsb->z_fuid_replay); zsb->z_fuid_replay = NULL; return (error); }
/*
 * Replay file create with optional ACL, xvattr information as well
 * as optional FUID information.
 */
static int
zfs_replay_create_acl(zfs_sb_t *zsb, lr_acl_create_t *lracl,
    boolean_t byteswap)
{
	char *name = NULL;		/* location determined later */
	lr_create_t *lr = (lr_create_t *)lracl;
	znode_t *dzp;
	struct inode *ip = NULL;
	xvattr_t xva;
	int vflg = 0;
	vsecattr_t vsec = { 0 };
	lr_attr_t *lrattr;
	void *aclstart;
	void *fuidstart;
	size_t xvatlen = 0;
	uint64_t txtype;
	int error;

	txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
	if (byteswap) {
		byteswap_uint64_array(lracl, sizeof (*lracl));
		if (txtype == TX_CREATE_ACL_ATTR ||
		    txtype == TX_MKDIR_ACL_ATTR) {
			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
			zfs_replay_swap_attrs(lrattr);
			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
		}

		aclstart = (caddr_t)(lracl + 1) + xvatlen;
		zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE);
		/* swap fuids */
		if (lracl->lr_fuidcnt) {
			byteswap_uint64_array((caddr_t)aclstart +
			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes),
			    lracl->lr_fuidcnt * sizeof (uint64_t));
		}
	}

	if ((error = zfs_zget(zsb, lr->lr_doid, &dzp)) != 0)
		return (error);

	xva_init(&xva);
	zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID,
	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);

	/*
	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
	 * eventually end up in zfs_mknode(), which assigns the object's
	 * creation time and generation number.  The generic zfs_create()
	 * doesn't have either concept, so we smuggle the values inside
	 * the vattr's otherwise unused va_ctime and va_nblocks fields.
	 */
	ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
	xva.xva_vattr.va_nblocks = lr->lr_gen;

	error = dmu_object_info(zsb->z_os, lr->lr_foid, NULL);
	if (error != ENOENT)
		goto bail;

	if (lr->lr_common.lrc_txtype & TX_CI)
		vflg |= FIGNORECASE;

	switch (txtype) {
	case TX_CREATE_ACL:
		aclstart = (caddr_t)(lracl + 1);
		fuidstart = (caddr_t)aclstart +
		    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
		zsb->z_fuid_replay = zfs_replay_fuids(fuidstart,
		    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
		    lr->lr_uid, lr->lr_gid);
		/*FALLTHROUGH*/
	case TX_CREATE_ACL_ATTR:
		if (name == NULL) {
			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
			xva.xva_vattr.va_mask |= ATTR_XVATTR;
			zfs_replay_xvattr(lrattr, &xva);
		}
		vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
		vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
		vsec.vsa_aclcnt = lracl->lr_aclcnt;
		vsec.vsa_aclentsz = lracl->lr_acl_bytes;
		vsec.vsa_aclflags = lracl->lr_acl_flags;
		if (zsb->z_fuid_replay == NULL) {
			fuidstart = (caddr_t)(lracl + 1) + xvatlen +
			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
			zsb->z_fuid_replay =
			    zfs_replay_fuids(fuidstart,
			    (void *)&name, lracl->lr_fuidcnt,
			    lracl->lr_domcnt, lr->lr_uid, lr->lr_gid);
		}

		error = zfs_create(ZTOI(dzp), name, &xva.xva_vattr,
		    0, 0, &ip, kcred, vflg, &vsec);
		break;
	case TX_MKDIR_ACL:
		aclstart = (caddr_t)(lracl + 1);
		fuidstart = (caddr_t)aclstart +
		    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
		zsb->z_fuid_replay = zfs_replay_fuids(fuidstart,
		    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
		    lr->lr_uid, lr->lr_gid);
		/*FALLTHROUGH*/
	case TX_MKDIR_ACL_ATTR:
		if (name == NULL) {
			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
			zfs_replay_xvattr(lrattr, &xva);
		}
		vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
		vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
		vsec.vsa_aclcnt = lracl->lr_aclcnt;
		vsec.vsa_aclentsz = lracl->lr_acl_bytes;
		vsec.vsa_aclflags = lracl->lr_acl_flags;
		if (zsb->z_fuid_replay == NULL) {
			fuidstart = (caddr_t)(lracl + 1) + xvatlen +
			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
			zsb->z_fuid_replay =
			    zfs_replay_fuids(fuidstart,
			    (void *)&name, lracl->lr_fuidcnt,
			    lracl->lr_domcnt, lr->lr_uid, lr->lr_gid);
		}
		error = zfs_mkdir(ZTOI(dzp), name, &xva.xva_vattr,
		    &ip, kcred, vflg, &vsec);
		break;
	default:
		error = ENOTSUP;
	}

bail:
	if (error == 0 && ip != NULL)
		iput(ip);

	iput(ZTOI(dzp));

	if (zsb->z_fuid_replay)
		zfs_fuid_info_free(zsb->z_fuid_replay);
	zsb->z_fuid_replay = NULL;

	return (error);
}
/* * Unlink zp from dl, and mark zp for deletion if this was the last link. * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. * If it's non-NULL, we use it to indicate whether the znode needs deletion, * and it's the caller's job to do it. */ int zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, boolean_t *unlinkedp) { znode_t *dzp = dl->dl_dzp; zfs_sb_t *zsb = ZTOZSB(dzp); int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); boolean_t unlinked = B_FALSE; sa_bulk_attr_t bulk[5]; uint64_t mtime[2], ctime[2]; int count = 0; int error; #ifdef HAVE_DNLC dnlc_remove(ZTOI(dzp), dl->dl_name); #endif /* HAVE_DNLC */ if (!(flag & ZRENAMING)) { mutex_enter(&zp->z_lock); if (zp_is_dir && !zfs_dirempty(zp)) { mutex_exit(&zp->z_lock); return (EEXIST); } /* * If we get here, we are going to try to remove the object. * First try removing the name from the directory; if that * fails, return the error. */ error = zfs_dropname(dl, zp, dzp, tx, flag); if (error != 0) { mutex_exit(&zp->z_lock); return (error); } if (zp->z_links <= zp_is_dir) { zfs_panic_recover("zfs: link count on %lu is %u, " "should be at least %u", zp->z_id, (int)zp->z_links, zp_is_dir + 1); zp->z_links = zp_is_dir + 1; } if (--zp->z_links == zp_is_dir) { zp->z_unlinked = B_TRUE; zp->z_links = 0; unlinked = B_TRUE; } else { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, &ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zsb), NULL, &zp->z_links, sizeof (zp->z_links)); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); count = 0; ASSERT(error == 0); mutex_exit(&zp->z_lock); } else { error = zfs_dropname(dl, zp, dzp, tx, flag); if (error != 0) return (error); } mutex_enter(&dzp->z_lock); dzp->z_size--; /* one dirent removed */ dzp->z_links -= zp_is_dir; /* ".." link from zp */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zsb), NULL, &dzp->z_links, sizeof (dzp->z_links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zsb), NULL, &dzp->z_size, sizeof (dzp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); mutex_exit(&dzp->z_lock); if (unlinkedp != NULL) *unlinkedp = unlinked; else if (unlinked) zfs_unlinked_add(zp, tx); return (0); }
/*
 * Reopen zfs_sb_t and release VFS ops.
 */
int
zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
{
	int err, err2;
	znode_t *zp;
	uint64_t sa_obj = 0;

	ASSERT(RRM_WRITE_HELD(&zsb->z_teardown_lock));
	ASSERT(RW_WRITE_HELD(&zsb->z_teardown_inactive_lock));

	/*
	 * We already own this, so just hold and rele it to update the
	 * objset_t, as the one we had before may have been evicted.
	 */
	VERIFY0(dmu_objset_hold(osname, zsb, &zsb->z_os));
	VERIFY3P(zsb->z_os->os_dsl_dataset->ds_owner, ==, zsb);
	VERIFY(dsl_dataset_long_held(zsb->z_os->os_dsl_dataset));
	dmu_objset_rele(zsb->z_os, zsb);

	/*
	 * Make sure version hasn't changed
	 */
	err = zfs_get_zplprop(zsb->z_os, ZFS_PROP_VERSION,
	    &zsb->z_version);
	if (err)
		goto bail;

	err = zap_lookup(zsb->z_os, MASTER_NODE_OBJ,
	    ZFS_SA_ATTRS, 8, 1, &sa_obj);
	if (err && zsb->z_version >= ZPL_VERSION_SA)
		goto bail;

	if ((err = sa_setup(zsb->z_os, sa_obj,
	    zfs_attr_table, ZPL_END, &zsb->z_attr_table)) != 0)
		goto bail;

	if (zsb->z_version >= ZPL_VERSION_SA)
		sa_register_update_callback(zsb->z_os, zfs_sa_upgrade);

	VERIFY(zfs_sb_setup(zsb, B_FALSE) == 0);

	zfs_set_fuid_feature(zsb);
	zsb->z_rollback_time = jiffies;

	/*
	 * Attempt to re-establish all the active inodes with their
	 * dbufs.  If a zfs_rezget() fails, then we unhash the inode
	 * and mark it stale.  This prevents a collision if a new
	 * inode/object is created which must use the same inode
	 * number.  The stale inode will be released when the
	 * VFS prunes the dentry holding the remaining references
	 * on the stale inode.
	 */
	mutex_enter(&zsb->z_znodes_lock);
	for (zp = list_head(&zsb->z_all_znodes); zp;
	    zp = list_next(&zsb->z_all_znodes, zp)) {
		err2 = zfs_rezget(zp);
		if (err2) {
			remove_inode_hash(ZTOI(zp));
			zp->z_is_stale = B_TRUE;
		}
	}
	mutex_exit(&zsb->z_znodes_lock);

bail:
	/* release the VFS ops */
	rw_exit(&zsb->z_teardown_inactive_lock);
	rrm_exit(&zsb->z_teardown_lock, FTAG);

	if (err) {
		/*
		 * Since we couldn't set up the SA framework, try to force
		 * unmount this file system.
		 */
		if (zsb->z_os)
			(void) zfs_umount(zsb->z_sb);
	}
	return (err);
}
int zfs_make_xattrdir(znode_t *zp, vattr_t *vap, struct inode **xipp, cred_t *cr) { zfs_sb_t *zsb = ZTOZSB(zp); znode_t *xzp; dmu_tx_t *tx; int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; #ifdef DEBUG uint64_t parent; #endif *xipp = NULL; if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))) return (error); if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, &acl_ids)) != 0) return (error); if (zfs_acl_ids_overquota(zsb, &acl_ids)) { zfs_acl_ids_free(&acl_ids); return (EDQUOT); } top: tx = dmu_tx_create(zsb->z_os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zsb->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zsb, tx); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); return (error); } zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zsb, tx); #ifdef DEBUG error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zsb), &parent, sizeof (parent)); ASSERT(error == 0 && parent == zp->z_id); #endif VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zsb), &xzp->z_id, sizeof (xzp->z_id), tx)); (void) zfs_log_create(zsb->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); *xipp = ZTOI(xzp); return (0); }
void zfs_rmnode(znode_t *zp) { zfsvfs_t *zfsvfs = ZTOZSB(zp); objset_t *os = zfsvfs->z_os; znode_t *xzp = NULL; dmu_tx_t *tx; uint64_t acl_obj; uint64_t xattr_obj; uint64_t links; int error; ASSERT(ZTOI(zp)->i_nlink == 0); ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0); /* * If this is an attribute directory, purge its contents. */ if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) { if (zfs_purgedir(zp) != 0) { /* * Not enough space to delete some xattrs. * Leave it in the unlinked set. */ zfs_znode_dmu_fini(zp); return; } } /* * Free up all the data in the file. We don't do this for directories * because we need truncate and remove to be in the same tx, like in * zfs_znode_delete(). Otherwise, if we crash here we'll end up with * an inconsistent truncated zap object in the delete queue. Note a * truncated file is harmless since it only contains user data. */ if (S_ISREG(ZTOI(zp)->i_mode)) { error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); if (error) { /* * Not enough space or we were interrupted by unmount. * Leave the file in the unlinked set. */ zfs_znode_dmu_fini(zp); return; } } /* * If the file has extended attributes, we're going to unlink * the xattr dir. */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT(error == 0); } acl_obj = zfs_external_acl(zp); /* * Set up the final transaction. */ tx = dmu_tx_create(os); dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); if (xzp) { dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } if (acl_obj) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { /* * Not enough space to delete the file. Leave it in the * unlinked set, leaking it until the fs is remounted (at * which point we'll call zfs_unlinked_drain() to process it). */ dmu_tx_abort(tx); zfs_znode_dmu_fini(zp); goto out; } if (xzp) { ASSERT(error == 0); mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ clear_nlink(ZTOI(xzp)); /* no more links to it */ links = 0; VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &links, sizeof (links), tx)); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); } /* Remove this znode from the unlinked set */ VERIFY3U(0, ==, zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); zfs_znode_delete(zp, tx); dmu_tx_commit(tx); out: if (xzp) zfs_iput_async(ZTOI(xzp)); }
void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
	struct super_block *sb;
	zfs_sb_t	*zsb;
	uint64_t	moid, obj, sa_obj, version;
	uint64_t	sense = ZFS_CASE_SENSITIVE;
	uint64_t	norm = 0;
	nvpair_t	*elem;
	int		error;
	int		i;
	znode_t		*rootzp = NULL;
	vattr_t		vattr;
	znode_t		*zp;
	zfs_acl_ids_t	acl_ids;

	/*
	 * First attempt to create master node.
	 */
	/*
	 * In an empty objset, there are no blocks to read and thus
	 * there can be no i/o errors (which we assert below).
	 */
	moid = MASTER_NODE_OBJ;
	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/*
	 * Set starting attributes.
	 */
	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
	elem = NULL;
	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
		/* For the moment we expect all zpl props to be uint64_ts */
		uint64_t val;
		char *name;

		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
		VERIFY(nvpair_value_uint64(elem, &val) == 0);
		name = nvpair_name(elem);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
			if (val < version)
				version = val;
		} else {
			error = zap_update(os, moid, name, 8, 1, &val, tx);
		}
		ASSERT(error == 0);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
			norm = val;
		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
			sense = val;
	}
	ASSERT(version != 0);
	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);

	/*
	 * Create zap object used for SA attribute registration
	 */
	if (version >= ZPL_VERSION_SA) {
		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
		    DMU_OT_NONE, 0, tx);
		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
		ASSERT(error == 0);
	} else {
		sa_obj = 0;
	}
	/*
	 * Create a delete queue.
	 */
	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
	ASSERT(error == 0);

	/*
	 * Create root znode.  Create minimal znode/inode/zsb/sb
	 * to allow zfs_mknode to work.
	 */
	vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	rootzp = kmem_cache_alloc(znode_cache, KM_PUSHPAGE);
	rootzp->z_moved = 0;
	rootzp->z_unlinked = 0;
	rootzp->z_atime_dirty = 0;
	rootzp->z_is_sa = USE_SA(version, os);

	zsb = kmem_zalloc(sizeof (zfs_sb_t), KM_PUSHPAGE | KM_NODEBUG);
	zsb->z_os = os;
	zsb->z_parent = zsb;
	zsb->z_version = version;
	zsb->z_use_fuids = USE_FUIDS(version, os);
	zsb->z_use_sa = USE_SA(version, os);
	zsb->z_norm = norm;

	sb = kmem_zalloc(sizeof (struct super_block), KM_PUSHPAGE);
	sb->s_fs_info = zsb;

	ZTOI(rootzp)->i_sb = sb;

	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
	    &zsb->z_attr_table);

	ASSERT(error == 0);

	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
		zsb->z_norm |= U8_TEXTPREP_TOUPPER;

	mutex_init(&zsb->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zsb->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_init(&zsb->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);

	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
	    cr, NULL, &acl_ids));
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
	ASSERT3P(zp, ==, rootzp);
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	zfs_acl_ids_free(&acl_ids);

	atomic_set(&ZTOI(rootzp)->i_count, 0);
	sa_handle_destroy(rootzp->z_sa_hdl);
	kmem_cache_free(znode_cache, rootzp);

	/*
	 * Create shares directory
	 */
	error = zfs_create_share_dir(zsb, tx);
	ASSERT(error == 0);

	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_destroy(&zsb->z_hold_mtx[i]);

	kmem_free(sb, sizeof (struct super_block));
	kmem_free(zsb, sizeof (zfs_sb_t));
}
/* * Link zp into dl. Can only fail if zp has been unlinked. */ int zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) { znode_t *dzp = dl->dl_dzp; zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t value; int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); sa_bulk_attr_t bulk[5]; uint64_t mtime[2], ctime[2]; uint64_t links; int count = 0; int error; mutex_enter(&zp->z_lock); if (!(flag & ZRENAMING)) { if (zp->z_unlinked) { /* no new links to unlinked zp */ ASSERT(!(flag & (ZNEW | ZEXISTS))); mutex_exit(&zp->z_lock); return (SET_ERROR(ENOENT)); } if (!(flag & ZNEW)) { /* * ZNEW nodes come from zfs_mknode() where the link * count has already been initialised */ inc_nlink(ZTOI(zp)); links = ZTOI(zp)->i_nlink; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, sizeof (links)); } } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &dzp->z_id, sizeof (dzp->z_id)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (!(flag & ZNEW)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime); } error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); mutex_exit(&zp->z_lock); mutex_enter(&dzp->z_lock); dzp->z_size++; if (zp_is_dir) inc_nlink(ZTOI(dzp)); links = ZTOI(dzp)->i_nlink; count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &dzp->z_size, sizeof (dzp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, sizeof (links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); mutex_exit(&dzp->z_lock); value = zfs_dirent(zp, zp->z_mode); error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1, &value, tx); ASSERT(error == 0); return (0); }
int zfs_zget(zfs_sb_t *zsb, uint64_t obj_num, znode_t **zpp) { dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; int err; sa_handle_t *hdl; struct inode *ip; *zpp = NULL; again: ip = ilookup(zsb->z_sb, obj_num); ZFS_OBJ_HOLD_ENTER(zsb, obj_num); err = sa_buf_hold(zsb->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zsb, obj_num); iput(ip); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zsb, obj_num); iput(ip); return (SET_ERROR(EINVAL)); } hdl = dmu_buf_get_user(db); if (hdl != NULL) { if (ip == NULL) { /* * ilookup returned NULL, which means * the znode is dying - but the SA handle isn't * quite dead yet, we need to drop any locks * we're holding, re-schedule the task and try again. */ sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zsb, obj_num); schedule(); goto again; } zp = sa_get_userdata(hdl); /* * Since "SA" does immediate eviction we * should never find a sa handle that doesn't * know about the znode. */ ASSERT3P(zp, !=, NULL); mutex_enter(&zp->z_lock); ASSERT3U(zp->z_id, ==, obj_num); if (zp->z_unlinked) { err = SET_ERROR(ENOENT); } else { igrab(ZTOI(zp)); *zpp = zp; err = 0; } sa_buf_rele(db, NULL); mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zsb, obj_num); iput(ip); return (err); } ASSERT3P(ip, ==, NULL); /* * Not found create new znode/vnode but only if file exists. * * There is a small window where zfs_vget() could * find this object while a file create is still in * progress. This is checked for in zfs_znode_alloc() * * if zfs_znode_alloc() fails it will drop the hold on the * bonus buffer. */ zp = zfs_znode_alloc(zsb, db, doi.doi_data_block_size, doi.doi_bonus_type, obj_num, NULL, NULL); if (zp == NULL) { err = SET_ERROR(ENOENT); } else { *zpp = zp; } ZFS_OBJ_HOLD_EXIT(zsb, obj_num); return (err); }
/*
 * Teardown the zfs_sb_t.
 *
 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
 * and 'z_teardown_inactive_lock' held.
 */
int
zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
{
	znode_t	*zp;

	rrw_enter(&zsb->z_teardown_lock, RW_WRITER, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's super block as the
		 * parent filesystem and all of its snapshots have their
		 * inode's super block set to the parent's filesystem's
		 * super block.  Note, 'z_parent' is self referential
		 * for non-snapshots.
		 */
		shrink_dcache_sb(zsb->z_parent->z_sb);
	}

	/*
	 * If someone has not already unmounted this file system,
	 * drain the iput_taskq to ensure all active references to the
	 * zfs_sb_t have been handled; only then can it be safely destroyed.
	 */
	if (zsb->z_os)
		taskq_wait(dsl_pool_iput_taskq(dmu_objset_pool(zsb->z_os)));

	/*
	 * Close the zil.  NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zsb->z_log) {
		zil_close(zsb->z_log);
		zsb->z_log = NULL;
	}

	rw_enter(&zsb->z_teardown_inactive_lock, RW_WRITER);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zsb->z_unmounted || zsb->z_os == NULL)) {
		rw_exit(&zsb->z_teardown_inactive_lock);
		rrw_exit(&zsb->z_teardown_lock, FTAG);
		return (EIO);
	}

	/*
	 * At this point there are no VFS ops active, and any new VFS ops
	 * will fail with EIO since we have z_teardown_lock for writer (only
	 * relevant for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zsb->z_znodes_lock);
	for (zp = list_head(&zsb->z_all_znodes); zp != NULL;
	    zp = list_next(&zsb->z_all_znodes, zp)) {
		if (zp->z_sa_hdl) {
			ASSERT(atomic_read(&ZTOI(zp)->i_count) > 0);
			zfs_znode_dmu_fini(zp);
		}
	}
	mutex_exit(&zsb->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new VFS ops
	 * unblock.  zfs_inactive will have the unmounted behavior, and all
	 * other VFS ops will fail with EIO.
	 */
	if (unmounting) {
		zsb->z_unmounted = B_TRUE;
		rrw_exit(&zsb->z_teardown_lock, FTAG);
		rw_exit(&zsb->z_teardown_inactive_lock);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zsb, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 */
	if (zsb->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zsb);

	/*
	 * Evict cached data
	 */
	if (dsl_dataset_is_dirty(dmu_objset_ds(zsb->z_os)) &&
	    !zfs_is_readonly(zsb))
		txg_wait_synced(dmu_objset_pool(zsb->z_os), 0);
	(void) dmu_objset_evict_dbufs(zsb->z_os);

	return (0);
}