예제 #1
0
파일: zfs_znode.c 프로젝트: haad/netbsd-zfs
/*
 * kmem cache constructor for a znode.
 *
 * Pairs the znode with a newly allocated vnode and initializes the list
 * linkage, the locks and the range-lock AVL tree embedded in the znode.
 *
 * Returns 0 on success, or -1 if vn_alloc() could not supply a vnode.
 */
/*ARGSUSED*/
static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
	znode_t *znode = buf;

	ASSERT(!POINTER_IS_VALID(znode->z_zfsvfs));

	/* Without a vnode the construction cannot proceed. */
	if ((znode->z_vnode = vn_alloc(kmflags)) == NULL)
		return (-1);
	ZTOV(znode)->v_data = znode;

	/* A fresh znode has no dbuf, directory locks or cached ACL. */
	znode->z_dbuf = NULL;
	znode->z_dirlocks = NULL;
	znode->z_acl_cached = NULL;

	list_link_init(&znode->z_link_node);

	/* Per-znode locks. */
	mutex_init(&znode->z_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&znode->z_parent_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&znode->z_name_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&znode->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);

	/* Range-lock state: its mutex plus the AVL tree of rl_t entries. */
	mutex_init(&znode->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&znode->z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));

	return (0);
}
예제 #2
0
/*
 * Attach the dbuf `db` to the znode `zp`.
 *
 * With z_lock held, records db in zp->z_dbuf and registers zp as the
 * dbuf's user through dmu_buf_set_user_ie().  Panics if the dbuf already
 * had a user.  Flags the vnode VROOT when zp is the file system's root
 * object, and finally calls vn_exists() on the vnode after dropping z_lock.
 *
 * The caller must hold the object mutex for zp->z_id (asserted).
 */
static void
zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
{
	znode_t *prev_user;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));

	mutex_enter(&zp->z_lock);

	ASSERT(zp->z_dbuf == NULL);
	zp->z_dbuf = db;

	/*
	 * No concurrent zget on this object is expected, so finding a user
	 * already attached to the dbuf is fatal.
	 */
	prev_user = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);
	if (prev_user != NULL) {
		panic("existing znode %p for dbuf %p", (void *)prev_user,
		    (void *)db);
	}

	/* The root znode's vnode carries VROOT. */
	if (zfsvfs->z_root == zp->z_id) {
		ZTOV(zp)->v_flag |= VROOT;
	}

	mutex_exit(&zp->z_lock);
	vn_exists(ZTOV(zp));
}
예제 #3
0
파일: primr.c 프로젝트: myYearbook/primr
/**
 * Retrieve the blocks of the given file currently in the Linux Page Cache.
 *
 * @param   str                 IN      The string array.
 * @param   max                 IN      The number of elements in the array.
 * @param   value               IN      The search value.
 * @returns index of the array where the value is located or -1 if not found.
 */
void
PrimeCache (void)
{
    sqlite3        *dbh = NULL;                           /* Database Handle */
    sqlite3_stmt   *sth = NULL;                   /* SQLite Statement Handle */
    int             rc = 0;                            /* SQLite Return Code */

    printf("\nBeginning Cache Priming\n");
    fflush(stdout);

    /* Enable the SQLite shared cache */
//    sqlite3_enable_shared_cache(1);

    /* Open the SQLite database */
    rc = sqlite3_open(dbFileName, &dbh);

    sqlite3_busy_handler(dbh, busyHandler, NULL);

    /* Prepare the SQL */
    rc = sqlite3_prepare(dbh, SQL_CACHED_FILES_QUERY, -1, &sth, NULL);

    /*
     * Iterate over the block ranges, reading them into cache.
     */
    while ((rc = sqlite3_step(sth)) != SQLITE_ERROR)
    {
        if (rc == SQLITE_ROW)
        {
            int fileId = sqlite3_column_int(sth, 0);

            if (POINTER_IS_VALID(thp))
                tpool_add_work(thp, PrimeCacheForFile, (void *)fileId);
            else
                PrimeCacheForFile(fileId);
        }
        else if (rc == SQLITE_DONE)
        {
            break;
        }
        else if (rc == SQLITE_BUSY)
        {
            continue;
        }
    }

    sqlite3_finalize(sth);
    sqlite3_close(dbh);

} /* PrimeCache() */
예제 #4
0
/*
 * kmem cache destructor for a znode.
 *
 * Tears down the locks and the range-lock AVL tree embedded in the znode.
 * The znode is expected to be detached already: no valid zfsvfs pointer,
 * no vnode, not on a list, no dbuf and no directory locks (all asserted).
 */
/*ARGSUSED*/
static void
zfs_znode_cache_destructor(void *buf, void *arg)
{
	znode_t *znode = buf;

	ASSERT(!POINTER_IS_VALID(znode->z_zfsvfs));
	ASSERT(ZTOV(znode) == NULL);
	vn_free(ZTOV(znode));
	ASSERT(!list_link_active(&znode->z_link_node));

	/* Per-znode locks. */
	mutex_destroy(&znode->z_lock);
	rw_destroy(&znode->z_map_lock);
	rw_destroy(&znode->z_parent_lock);
	rw_destroy(&znode->z_name_lock);
	mutex_destroy(&znode->z_acl_lock);

	/* Range-lock state: the tree first, then the mutex. */
	avl_destroy(&znode->z_range_avl);
	mutex_destroy(&znode->z_range_lock);

	ASSERT(znode->z_dbuf == NULL);
	ASSERT(znode->z_dirlocks == NULL);
}
예제 #5
0
/*
 * XXX: We cannot use this function as a cache constructor, because
 *      there is one global cache for all file systems and we need
 *      to pass vfsp here, which is not possible, because argument
 *      'cdrarg' is defined at kmem_cache_create() time.
 */
static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
	znode_t *zp = buf;
	vnode_t *vp;
	vfs_t *vfsp = arg;
	int error;

	POINTER_INVALIDATE(&zp->z_zfsvfs);
	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));

	if (vfsp != NULL) {
		error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp);
		if (error != 0 && (kmflags & KM_NOSLEEP))
			return (-1);
		ASSERT(error == 0);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
		zp->z_vnode = vp;
		vp->v_data = (caddr_t)zp;
		VN_LOCK_AREC(vp);
	} else {
		zp->z_vnode = NULL;
	}

	list_link_init(&zp->z_link_node);

	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);

	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&zp->z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));

	zp->z_dbuf = NULL;
	zp->z_dirlocks = NULL;
	return (0);
}
예제 #6
0
/*
 * Construct a new znode/vnode and initialize.
 *
 * This does not do a call to dmu_set_user() that is
 * up to the caller to do, in case you don't want to
 * return the znode
 *
 * zfsvfs is the file system the znode belongs to, db the dbuf whose
 * db_object becomes the znode's object id, and blksz the value stored in
 * z_blksz.  The znode is appended to zfsvfs->z_all_znodes, a hold is taken
 * on zfsvfs->z_vfs, and the znode is returned.
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
{
	znode_t	*zp;
	vnode_t *vp;

	/*
	 * The constructor is run by hand here, passing the parent file
	 * system's vfs; its return value is not checked.
	 */
	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0);

	ASSERT(zp->z_dirlocks == NULL);
	ASSERT(zp->z_dbuf == NULL);
	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));

	/*
	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
	 * the zfs_znode_move() callback.
	 */
	zp->z_phys = NULL;
	zp->z_unlinked = 0;
	zp->z_atime_dirty = 0;
	zp->z_mapcnt = 0;
	zp->z_last_itx = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;

	vp = ZTOV(zp);
#ifdef TODO
	vn_reinit(vp);
#endif

	/* Attaches db to zp; z_phys is read right after this call. */
	zfs_znode_dmu_init(zfsvfs, zp, db);

	zp->z_gen = zp->z_phys->zp_gen;

#if 0
	if (vp == NULL)
		return (zp);
#endif

	/* Derive the vnode type from the mode stored in the znode. */
	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
	switch (vp->v_type) {
	case VDIR:
		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
		break;
	case VFIFO:
		vp->v_op = &zfs_fifoops;
		break;
	}
	if (vp->v_type != VFIFO)
		VN_LOCK_ASHARE(vp);

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	membar_producer();
	/*
	 * Everything else must be valid before assigning z_zfsvfs makes the
	 * znode eligible for zfs_znode_move().
	 */
	zp->z_zfsvfs = zfsvfs;
	mutex_exit(&zfsvfs->z_znodes_lock);

	VFS_HOLD(zfsvfs->z_vfs);
	return (zp);
}
예제 #7
0
/*
 * kmem move callback: relocate the znode at `buf` into `newbuf`.
 *
 * Returns
 *   KMEM_CBRC_DONT_KNOW  if the znode's zfsvfs pointer is not valid,
 *                        zfs_enter() fails, or the pointer changed before
 *                        z_znodes_lock was acquired;
 *   KMEM_CBRC_LATER      if the object hold or the vnode lock cannot be
 *                        taken without blocking, or the vnode is
 *                        referenced by anything other than the DNLC;
 *   KMEM_CBRC_YES        once the znode has been moved and nzp has
 *                        replaced ozp on the list of znodes.
 *
 * Every early return releases whatever was acquired up to that point, in
 * reverse order of acquisition.
 */
/*ARGSUSED*/
static kmem_cbrc_t
zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
{
	znode_t *ozp = buf, *nzp = newbuf;
	zfsvfs_t *zfsvfs;
	vnode_t *vp;

	/*
	 * The znode is on the file system's list of known znodes if the vfs
	 * pointer is valid. We set the low bit of the vfs pointer when freeing
	 * the znode to invalidate it, and the memory patterns written by kmem
	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
	 * created znode sets the vfs pointer last of all to indicate that the
	 * znode is known and in a valid state to be moved by this function.
	 */
	zfsvfs = ozp->z_zfsvfs;
	if (!POINTER_IS_VALID(zfsvfs)) {
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * Ensure that the filesystem is not unmounted during the move.
	 */
	if (zfs_enter(zfsvfs) != 0) {		/* ZFS_ENTER */
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
		return (KMEM_CBRC_DONT_KNOW);
	}

	mutex_enter(&zfsvfs->z_znodes_lock);
	/*
	 * Recheck the vfs pointer in case the znode was removed just before
	 * acquiring the lock.
	 */
	if (zfsvfs != ozp->z_zfsvfs) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * At this point we know that as long as we hold z_znodes_lock, the
	 * znode cannot be freed and fields within the znode can be safely
	 * accessed. Now, prevent a race with zfs_zget().
	 */
	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
		return (KMEM_CBRC_LATER);
	}

	/* Take the vnode lock without blocking; give up for now if busy. */
	vp = ZTOV(ozp);
	if (mutex_tryenter(&vp->v_lock) == 0) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
		return (KMEM_CBRC_LATER);
	}

	/* Only move znodes that are referenced _only_ by the DNLC. */
	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
		mutex_exit(&vp->v_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * The znode is known and in a valid state to move. We're holding the
	 * locks needed to execute the critical section.
	 */
	zfs_znode_move_impl(ozp, nzp);
	mutex_exit(&vp->v_lock);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);

	/* Swap the list linkage while z_znodes_lock is still held. */
	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
	mutex_exit(&zfsvfs->z_znodes_lock);
	ZFS_EXIT(zfsvfs);

	return (KMEM_CBRC_YES);
}
예제 #8
0
파일: primr.c 프로젝트: myYearbook/primr
/**
 * Retrieve the blocks of the given file currently in the Linux Page Cache.
 *
 * @param   str                 IN      The string array.
 * @param   max                 IN      The number of elements in the array.
 * @param   value               IN      The search value.
 * @returns index of the array where the value is located or -1 if not found.
 */
void
BuildCacheProfile (void)
{
    PGconn         *pgh = NULL;                /* Postgres Connection Handle */
    PGresult       *pgrsh = NULL;
    sqlite3        *dbh = NULL;                    /* SQLite Database Handle */
    sqlite3_stmt   *sth = NULL;                   /* SQLite Statement Handle */
    int             i = 0;                               /* Generic Iterator */
    int             rc = 0;                            /* SQLite Return Code */
    int             dbOid = 0;                      /* Postgres Database OID */
    int             dbTablespaceOid = 0;  /* Postgres Default Tablespace OID */
    char           *baseDir = get_current_dir_name();

    /* Open the SQLite database */
    rc = sqlite3_open(dbFileName, &dbh);

    sqlite3_busy_handler(dbh, busyHandler, NULL);

    /* Connect to Postgres */
    pgh = PQsetdbLogin(NULL, NULL, NULL, NULL,
        pgConnectString == NULL ? "postgres"
                                : pgConnectString,
        NULL, pgPassword);
    if (PQstatus(pgh) == CONNECTION_BAD)
    {
        ERROR_PRINT("%s", "Connection Test Failed\n");
        ERROR_PRINT("SQLERRMC: %s\n", PQerrorMessage(pgh));
        PQfinish(pgh);
        exit(EXIT_FAILURE);
    }

    /* Retrieve the Postgres block size */
    pgrsh = PQexec(pgh, PG_BLCKSZ_QUERY);
    if (PQresultStatus(pgrsh) == PGRES_TUPLES_OK)
    {
        db_block_size = atoi(PQgetvalue(pgrsh, 0, 0));
        PQclear(pgrsh);

        /* Inform user of block sizes */
        printf("PG BLCKSZ........ %u\n", db_block_size);
        printf("OS Page Size..... %u\n", (uint32_t)getpagesize());
    }
    else
    {
        ERROR_PRINT("%s", "Could not retrieve database settings\n");
        ERROR_PRINT("SQLERRMC: %s\n", PQerrorMessage(pgh));
        PQclear(pgrsh);
        PQfinish(pgh);
    }

    /* Retrieve the Postgres block size */
    pgrsh = PQexec(pgh, PG_DEFAULT_TABLESPACE_QUERY);
    if (PQresultStatus(pgrsh) == PGRES_TUPLES_OK)
    {
        dbOid = atoi(PQgetvalue(pgrsh, 0, 0));
        dbTablespaceOid = atoi(PQgetvalue(pgrsh, 0, 1));
        PQclear(pgrsh);
    }
    else
    {
        ERROR_PRINT("%s", "Could not retrieve database settings\n");
        ERROR_PRINT("SQLERRMC: %s\n", PQerrorMessage(pgh));
        PQclear(pgrsh);
        PQfinish(pgh);
    }

    /* Retrieve the Postgres tablespace information */
    pgrsh = PQexec(pgh, PG_OBJECT_QUERY);
    if (PQresultStatus(pgrsh) == PGRES_TUPLES_OK)
    {
        int numRows = PQntuples(pgrsh);
        char filePath[1024];

        printf("\nBeginning cache profiling for %d database objects.\n",
            numRows);

        for (i = 0; i < numRows; i++)
        {
            struct dirent  *dp = NULL;            /* Directory Entry Pointer */
            DIR            *dfd = NULL;                  /* Directory Stream */
            char pattern1[64];
            char pattern2[64];

            char *name = PQgetvalue(pgrsh, i, 0);
            char *kind = PQgetvalue(pgrsh, i, 1);
            int tableSpace = atoi(PQgetvalue(pgrsh, i, 2));
            int fileNode = atoi(PQgetvalue(pgrsh, i, 3));

            snprintf(pattern1, sizeof(pattern1), "%d", fileNode);
            snprintf(pattern2, sizeof(pattern2), "%d.*", fileNode);

            if (tableSpace == 0 || tableSpace == dbTablespaceOid)
                snprintf(filePath, sizeof(filePath),
                         "%s/pg_tblspc/%d/%d", pgDataDir,
                         dbTablespaceOid, dbOid);

            dfd = opendir(filePath);
            if (dfd != NULL)
            {
                chdir(filePath);

                while ((dp = readdir(dfd)) != NULL)
                {
                    struct stat sb;

                    /* Attempt to parse the given PG log file */
                    if (stat(dp->d_name, &sb) == -1)
                        continue;

                    /* Make sure this is a normal file */
                    if (!S_ISREG(sb.st_mode))
                        continue;

                    /* Does this file match our pattern */
                    DEBUG_PRINT("Checking pattern [%s] and [%s] for file [%s]\n",
                        pattern1, pattern2, dp->d_name);
                    if (fnmatch(pattern1, dp->d_name, 0) == 0
                        || fnmatch(pattern2, dp->d_name, 0) == 0)
                    {
                        char fullFilePath[1024];

                        snprintf(fullFilePath, sizeof(fullFilePath),
                            "%s/%s", filePath, dp->d_name);
                        if (POINTER_IS_VALID(thp))
                            tpool_add_work(thp, ProfileCacheForFile,
                                (void *)strdup(fullFilePath));
                        else
                            ProfileCacheForFile(strdup(fullFilePath));
                    }
                }

                /* We're done, close the directory stream */
                closedir(dfd);
                chdir(baseDir);
            }

        }
        PQclear(pgrsh);
    }
    else
    {
        ERROR_PRINT("%s", "Could not retrieve database settings\n");
        ERROR_PRINT("SQLERRMC: %s\n", PQerrorMessage(pgh));
        PQclear(pgrsh);
        PQfinish(pgh);
    }

    /* We're done with the connection */
    PQfinish(pgh);

} /* BuildCacheProfile() */
예제 #9
0
파일: primr.c 프로젝트: myYearbook/primr
/**
 * Program entry point.
 *
 * Parses the command line and requires exactly one of profile (-p) or prime
 * (-w) mode.  Tests the Postgres connection, prompting for a password when
 * the server asks for one and none has been supplied.  Sizes the thread
 * pool from the number of online CPUs and the 1-minute load average, then
 * runs BuildCacheProfile() or PrimeCache().
 *
 * @param   argc                IN      The number of command-line arguments.
 * @param   argv                IN      The command-line argument vector.
 * @returns EXIT_SUCCESS on completion; error paths call exit(EXIT_FAILURE).
 */
int
main (int argc, char **argv)
{
    int             opt = 0;                            /* Option Identifier */
    int             optindex = 0;                            /* Option Index */
    bool            isProfiling = false;      /* Are we profiling the cache? */
    bool            isPriming = false;          /* Are we priming the cache? */
    long            numCPUs = 0;                /* The number of online CPUs */
    double          loadAverages[3] = { 0.00 };      /* System Load Averages */
    PGconn         *pgh = NULL;                /* Postgres Connection Handle */
    bool            isPWDRequired = false;     /* Is Postgres Password Reqd? */

    /*
     * NOTE(review): "postgres-only" maps to 'o', which the switch below does
     * not handle (it falls into the default case), and the short option 'a'
     * is accepted by the option string but not handled either.
     */
    struct option   long_options[] =                 /* Options for getopt() */
    {
        {"connect-string",  required_argument,  NULL, 'c'},
        {"profile",         no_argument,        NULL, 'p'},
        {"prime",           no_argument,        NULL, 'w'},
        {"data-dir",        required_argument,  NULL, 'D'},
        {"postgres-only",   no_argument,        NULL, 'o'},
        {"sqlite-db",       required_argument,  NULL, 's'},
        {"help",            no_argument,        NULL, 'h'},
        {"debug",           no_argument,        NULL, 'd'},
        {NULL, 0, NULL, 0}
    };

    /* Go for the glory! */
    fprintf(stderr, "\n%s: Release %s - %s\n", PACKAGE_NAME,
        PACKAGE_VERSION, APP_RELEASE);
    fprintf(stderr, "\n%s\n\n", APP_COPYRIGHT);
    fflush(stdout);

    /* Process command-line options */
    while ((opt = getopt_long(argc, argv, "c:s:D:awhdp",
        long_options, &optindex)) != -1)
    {
        switch (opt)
        {
            case 'h':
                usage();
                exit(EXIT_SUCCESS);
                break;
            case 'p':
                if (isPriming == false)
                    isProfiling = true;
                else
                {
                    fprintf(stderr,
                        "Profiling and warming are mutually exlusive!\n");
                    exit(EXIT_FAILURE);
                }
                break;
            case 'w':
                if (isProfiling == false)
                    isPriming = true;
                else
                {
                    fprintf(stderr,
                        "Profiling and warming are mutually exlusive!\n");
                    exit(EXIT_FAILURE);
                }
                break;
            case 'd':
                is_debug = true;
                break;
            case 's':
                dbFileName = xstrdup(optarg);
                break;
            case 'c':
                pgConnectString = xstrdup(optarg);
                break;
            case 'D':
                pgDataDir = optarg;
                break;
            default:
                usage();
                exit(EXIT_FAILURE);
        }
    }

    /* Make sure user requested profile OR prime */
    if (isProfiling == false && isPriming == false)
    {
        fprintf(stderr, "Expected either -p or -w\n");
        usage();
        exit(EXIT_FAILURE);
    }

    /* Reject surplus non-option arguments */
    if (optind < (argc - 1))
    {
        fprintf(stderr, "too many command-line arguments (first is \"%s\")\n",
                argv[optind + 1]);
        usage();
        exit(EXIT_FAILURE);
    }

    /* Perform a Postgres connection test & get password (if required) */
    do
    {
        isPWDRequired = false;
        pgh = PQsetdbLogin(NULL, NULL, NULL, NULL,
            pgConnectString == NULL ? "postgres"
                                    : pgConnectString,
            NULL, pgPassword);
        if (PQstatus(pgh) == CONNECTION_BAD)
        {
            if (PQconnectionNeedsPassword(pgh) && pgPassword == NULL)
            {
                /*
                 * The server wants a password and none was supplied: ask
                 * for one and try the connection again.
                 *
                 * NOTE(review): the arguments after the prompt string were
                 * reconstructed from a damaged line -- confirm them against
                 * simple_prompt()'s prototype.
                 */
                printf("\nTesting Postgres Connection\n");
                PQfinish(pgh);
                pgPassword = simple_prompt("Password: ", 100, false);
                isPWDRequired = true;
            }
            else
            {
                ERROR_PRINT("%s", "Connection Test Failed\n");
                ERROR_PRINT("SQLERRMC: %s\n", PQerrorMessage(pgh));
                PQfinish(pgh);
                exit(EXIT_FAILURE);
            }
        }
    } while (isPWDRequired);
    PQfinish(pgh);

    /* Get the number of available CPUs */
    numCPUs = sysconf(_SC_NPROCESSORS_ONLN);
    if (numCPUs < 1)
        numCPUs = 1;

    /*
     * Choose the number of CPUs to use in the thread pool based on load
     * average.  It only makes sense to do this if we have more than one CPU
     * to play with.
     */
    if ((numCPUs > 1)
        && (getloadavg(loadAverages, 3) == 3))
    {
        long    idleCPUs = 0;                     /* The number of idle CPUs */

        /* Show what we got */
        printf("load averages.... %3.2f %3.2f %3.2f\n",
            loadAverages[0], loadAverages[1], loadAverages[2]);

        /*
         * We're going to base the number of usable CPUs by subtracting
         * the sum of 1 (to account for OS and I/O overhead) plus the 1 minute
         * load average from the number of available CPUs.
         */
        idleCPUs = numCPUs - (1 + (int)(loadAverages[0] + 0.5));

        /* Assign # of available CPUs with some sanity checking */
        if (idleCPUs < numCPUs)
            numCPUs = idleCPUs;
        if (numCPUs < 1)
            numCPUs = 1;
    }

    /* Inform user of # of CPUs that will be used (numCPUs is a long) */
    printf("usable CPUs...... %ld\n", numCPUs);

    /* If we have more than one CPU, multi-thread our operations */
    if (numCPUs > 1)
    {
        /* Initialize the thread pool */
        thp = tpool_init(numCPUs, 1024, true);
    }

    if (isProfiling)
        BuildCacheProfile();
    else /* isPriming */
        PrimeCache();

    /* Tear the thread pool down if one was created */
    if (POINTER_IS_VALID(thp))
    {
        /* Destroy the thread pool */
        tpool_destroy(thp, 1);
    }

    /* Cleanup */
    free(dbFileName);

    return EXIT_SUCCESS;

} /* main() */
예제 #10
0
/*
 * kmem move callback: relocate the znode at `buf` into `newbuf`.
 *
 * Returns
 *   KMEM_CBRC_DONT_KNOW  if the znode's zfsvfs pointer is not valid, it
 *                        changed before zfsvfs_lock or z_znodes_lock was
 *                        acquired, or the file system is unmounted;
 *   KMEM_CBRC_LATER      if the object hold or the vnode lock cannot be
 *                        taken without blocking, or the vnode is
 *                        referenced by anything other than the DNLC;
 *   KMEM_CBRC_YES        once the znode has been moved and nzp has
 *                        replaced ozp on the list of znodes.
 *
 * zfsvfs_lock is held only until the teardown lock has been entered; every
 * early return releases whatever was acquired up to that point.
 */
/*ARGSUSED*/
static kmem_cbrc_t
zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
{
	znode_t *ozp = buf, *nzp = newbuf;
	zfsvfs_t *zfsvfs;
	vnode_t *vp;

	/*
	 * The znode is on the file system's list of known znodes if the vfs
	 * pointer is valid. We set the low bit of the vfs pointer when freeing
	 * the znode to invalidate it, and the memory patterns written by kmem
	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
	 * created znode sets the vfs pointer last of all to indicate that the
	 * znode is known and in a valid state to be moved by this function.
	 */
	zfsvfs = ozp->z_zfsvfs;
	if (!POINTER_IS_VALID(zfsvfs)) {
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * Close a small window in which it's possible that the filesystem could
	 * be unmounted and freed, and zfsvfs, though valid in the previous
	 * statement, could point to unrelated memory by the time we try to
	 * prevent the filesystem from being unmounted.
	 */
	rw_enter(&zfsvfs_lock, RW_WRITER);
	if (zfsvfs != ozp->z_zfsvfs) {
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * If the znode is still valid, then so is the file system. We know that
	 * no valid file system can be freed while we hold zfsvfs_lock, so we
	 * can safely ensure that the filesystem is not and will not be
	 * unmounted. The next statement is equivalent to ZFS_ENTER().
	 */
	rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
	if (zfsvfs->z_unmounted) {
		ZFS_EXIT(zfsvfs);
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
		return (KMEM_CBRC_DONT_KNOW);
	}
	/* The teardown lock is held from here on; zfsvfs_lock can go. */
	rw_exit(&zfsvfs_lock);

	mutex_enter(&zfsvfs->z_znodes_lock);
	/*
	 * Recheck the vfs pointer in case the znode was removed just before
	 * acquiring the lock.
	 */
	if (zfsvfs != ozp->z_zfsvfs) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * At this point we know that as long as we hold z_znodes_lock, the
	 * znode cannot be freed and fields within the znode can be safely
	 * accessed. Now, prevent a race with zfs_zget().
	 */
	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
		return (KMEM_CBRC_LATER);
	}

	/* Take the vnode lock without blocking; give up for now if busy. */
	vp = ZTOV(ozp);
	if (mutex_tryenter(&vp->v_lock) == 0) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
		return (KMEM_CBRC_LATER);
	}

	/* Only move znodes that are referenced _only_ by the DNLC. */
	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
		mutex_exit(&vp->v_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * The znode is known and in a valid state to move. We're holding the
	 * locks needed to execute the critical section.
	 */
	zfs_znode_move_impl(ozp, nzp);
	mutex_exit(&vp->v_lock);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);

	/* Swap the list linkage while z_znodes_lock is still held. */
	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
	mutex_exit(&zfsvfs->z_znodes_lock);
	ZFS_EXIT(zfsvfs);

	return (KMEM_CBRC_YES);
}