Example #1
/*
 * Execute a 32-bit system call on behalf of the current thread.
 */
void
dosyscall(void)
{
	/*
	 * Need space on the stack to store syscall arguments.
	 */
	long		syscall_args[MAXSYSARGS];
	struct sysent	*se;
	int64_t		ret;

	syscall_mstate(LMS_TRAP, LMS_SYSTEM);

	ASSERT(curproc->p_model == DATAMODEL_ILP32);

	CPU_STATS_ENTER_K();
	CPU_STATS_ADDQ(CPU, sys, syscall, 1);
	CPU_STATS_EXIT_K();

	se = syscall_entry(curthread, syscall_args);

	/*
	 * syscall_entry() copied all 8 arguments into syscall_args.
	 */
	ret = se->sy_callc(syscall_args[0], syscall_args[1], syscall_args[2],
	    syscall_args[3], syscall_args[4], syscall_args[5], syscall_args[6],
	    syscall_args[7]);

	syscall_exit(curthread, (int)ret & 0xffffffffu, (int)(ret >> 32));
	syscall_mstate(LMS_SYSTEM, LMS_TRAP);
}
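The detail worth isolating here is the CPU_STATS_ENTER_K()/CPU_STATS_EXIT_K() bracket around the counter update: it keeps the thread on one CPU while CPU_STATS_ADDQ() touches the per-CPU statistics. A minimal sketch of that idiom follows; the helper name count_one_syscall is hypothetical and not part of the source above.

/*
 * Minimal sketch (hypothetical helper): bump one per-CPU "sys"
 * counter without migrating off this CPU mid-update.
 */
static void
count_one_syscall(void)
{
	CPU_STATS_ENTER_K();
	CPU_STATS_ADDQ(CPU, sys, syscall, 1);
	CPU_STATS_EXIT_K();
}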
Example #2
/*
 * Common code for writing a buffer with various options.
 *
 * force_wait  - wait for write completion regardless of B_ASYNC flag
 * do_relse    - release the buffer when we are done
 * clear_flags - flags to clear from the buffer
 */
void
bwrite_common(void *arg, struct buf *bp, int force_wait,
				int do_relse, int clear_flags)
{
	register int do_wait;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	int flag;
	klwp_t *lwp = ttolwp(curthread);
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));
	flag = bp->b_flags;
	bp->b_flags &= ~clear_flags;
	if (lwp != NULL)
		lwp->lwp_ru.oublock++;
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get pointer AFTER preemption is disabled */
	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
	if (do_wait == 0)
		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
	CPU_STATS_EXIT_K();
	if (ufsvfsp == NULL) {
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (do_wait) {
		(void) biowait(bp);
		if (do_relse) {
			brelse(bp);
		}
	}
}
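Going by the parameter descriptions in the block comment, a caller that wants a synchronous write might look like the sketch below. The helper name and the specific argument values (1 for force_wait and do_relse, B_ASYNC as clear_flags) are illustrative assumptions, not taken from the source.

/*
 * Hypothetical caller sketch: always wait for completion, release
 * the buffer afterwards, and clear B_ASYNC before issuing the I/O.
 */
static void
write_buf_and_release(struct ufsvfs *ufsvfsp, struct buf *bp)
{
	bwrite_common(ufsvfsp, bp, /* force_wait */ 1,
	    /* do_relse */ 1, /* clear_flags */ B_ASYNC);
}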
Example #3
/*
 * Initiate cross call processing.
 */
static void
xc_common(
	xc_func_t func,
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	uint_t command)
{
	int c;
	struct cpu *cpup;
	xc_msg_t *msg;
	xc_data_t *data;
	int cnt;
	int save_spl;

	if (!xc_initialized) {
		if (BT_TEST(set, CPU->cpu_id) && (CPU->cpu_flags & CPU_READY) &&
		    func != NULL)
			(void) (*func)(arg1, arg2, arg3);
		return;
	}

	save_spl = splr(ipltospl(XC_HI_PIL));

	/*
	 * fill in cross call data
	 */
	data = &CPU->cpu_m.xc_data;
	data->xc_func = func;
	data->xc_a1 = arg1;
	data->xc_a2 = arg2;
	data->xc_a3 = arg3;

	/*
	 * Post messages to all CPUs involved that are CPU_READY
	 */
	CPU->cpu_m.xc_wait_cnt = 0;
	for (c = 0; c < max_ncpus; ++c) {
		if (!BT_TEST(set, c))
			continue;
		cpup = cpu[c];
		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY))
			continue;

		/*
		 * Fill out a new message.
		 */
		msg = xc_extract(&CPU->cpu_m.xc_free);
		if (msg == NULL)
			panic("Ran out of free xc_msg_t's");
		msg->xc_command = command;
		if (msg->xc_master != CPU->cpu_id)
			panic("msg %p has wrong xc_master", (void *)msg);
		msg->xc_slave = c;

		/*
		 * Increment my work count for all messages that I'll
		 * transition from DONE to FREE.
		 * Also remember how many XC_MSG_WAITINGs to look for.
		 */
		(void) xc_increment(&CPU->cpu_m);
		if (command == XC_MSG_SYNC)
			++CPU->cpu_m.xc_wait_cnt;

		/*
		 * Increment the target CPU work count then insert the message
		 * in the target msgbox. If I post the first bit of work
		 * for the target to do, send an IPI to the target CPU.
		 */
		cnt = xc_increment(&cpup->cpu_m);
		xc_insert(&cpup->cpu_m.xc_msgbox, msg);
		if (cpup != CPU) {
			if (cnt == 0) {
				CPU_STATS_ADDQ(CPU, sys, xcalls, 1);
				send_dirint(c, XC_HI_PIL);
				if (xc_collect_enable)
					++xc_total_cnt;
			} else if (xc_collect_enable) {
				++xc_multi_cnt;
			}
		}
	}

	/*
	 * Now drop into the message handler until all work is done
	 */
	(void) xc_serv(NULL, NULL);
	splx(save_spl);
}
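The set argument is a bitmap of CPU ids that xc_common() walks with BT_TEST(). A sketch of how a caller might build such a set of ready CPUs is shown below; the helper name is hypothetical, and it assumes the usual BT_SET()/BT_BITOUL() bitmap macros.

/*
 * Hypothetical helper: mark every CPU that is present and CPU_READY
 * in a bitmap sized for max_ncpus bits, in the form xc_common()
 * consumes with BT_TEST().
 */
static void
xc_fill_ready_set(ulong_t *set)
{
	int c;

	bzero(set, BT_BITOUL(max_ncpus) * sizeof (ulong_t));
	for (c = 0; c < max_ncpus; ++c) {
		if (cpu[c] != NULL && (cpu[c]->cpu_flags & CPU_READY))
			BT_SET(set, c);
	}
}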
Example #4
void
pvn_write_done(page_t *plist, int flags)
{
    int dfree = 0;
    int pgrec = 0;
    int pgout = 0;
    int pgpgout = 0;
    int anonpgout = 0;
    int anonfree = 0;
    int fspgout = 0;
    int fsfree = 0;
    int execpgout = 0;
    int execfree = 0;
    page_t *pp;
    struct cpu *cpup;
    struct vnode *vp = NULL;	/* for probe */
    uint_t ppattr;
    kmutex_t *vphm = NULL;

    ASSERT((flags & B_READ) == 0);

    /*
     * If we are about to start paging anyway, start freeing pages.
     */
    if (write_free && freemem < lotsfree + pages_before_pager &&
            (flags & B_ERROR) == 0) {
        flags |= B_FREE;
    }

    /*
     * Handle each page involved in the i/o operation.
     */
    while (plist != NULL) {
        pp = plist;
        ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
        page_sub(&plist, pp);

        /* Kernel probe support */
        if (vp == NULL)
            vp = pp->p_vnode;

        if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
            /*
             * Move page to the top of the v_page list.
             * Skip pages modified during IO.
             */
            vphm = page_vnode_mutex(vp);
            mutex_enter(vphm);
            if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
                page_vpsub(&vp->v_pages, pp);
                page_vpadd(&vp->v_pages, pp);
            }
            mutex_exit(vphm);
        }

        if (flags & B_ERROR) {
            /*
             * Write operation failed.  We don't want
             * to destroy (or free) the page unless B_FORCE
             * is set. We set the mod bit again and release
             * all locks on the page so that it will get written
             * back again later when things are hopefully
             * better again.
             * If B_INVAL and B_FORCE are both set, we really
             * have to destroy the page.
             */
            if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
                page_io_unlock(pp);
                /*LINTED: constant in conditional context*/
                VN_DISPOSE(pp, B_INVAL, 0, kcred);
            } else {
                hat_setmod_only(pp);
                page_io_unlock(pp);
                page_unlock(pp);
            }
        } else if (flags & B_INVAL) {
            /*
             * XXX - Failed writes with B_INVAL set are
             * not handled appropriately.
             */
            page_io_unlock(pp);
            /*LINTED: constant in conditional context*/
            VN_DISPOSE(pp, B_INVAL, 0, kcred);
        } else if (flags & B_FREE || !hat_page_is_mapped(pp)) {
            /*
             * Update statistics for pages being paged out
             */
            if (pp->p_vnode) {
                if (IS_SWAPFSVP(pp->p_vnode)) {
                    anonpgout++;
                } else {
                    if (pp->p_vnode->v_flag & VVMEXEC) {
                        execpgout++;
                    } else {
                        fspgout++;
                    }
                }
            }
            page_io_unlock(pp);
            pgout = 1;
            pgpgout++;
            TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
                    "page_ws_out:pp %p", pp);

            /*
             * The page_struct_lock need not be acquired to
             * examine "p_lckcnt" and "p_cowcnt" since we'll
             * have an "exclusive" lock if the upgrade succeeds.
             */
            if (page_tryupgrade(pp) &&
                    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
                /*
                 * Check if someone has reclaimed the
                 * page.  If ref and mod are not set, no
                 * one is using it so we can free it.
                 * The rest of the system is careful
                 * to use the NOSYNC flag to unload
                 * translations set up for i/o w/o
                 * affecting ref and mod bits.
                 *
                 * Obtain a copy of the real hardware
                 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
                 * to avoid having to flush the cache.
                 */
                ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
                                      HAT_SYNC_STOPON_MOD);
ck_refmod:
                if (!(ppattr & (P_REF | P_MOD))) {
                    if (hat_page_is_mapped(pp)) {
                        /*
                         * Doesn't look like the page
                         * was modified so now we
                         * really have to unload the
                         * translations.  Meanwhile
                         * another CPU could've
                         * modified it so we have to
                         * check again.  We don't loop
                         * forever here because now
                         * the translations are gone
                         * and no one can get a new one
                         * since we have the "exclusive"
                         * lock on the page.
                         */
                        (void) hat_pageunload(pp,
                                              HAT_FORCE_PGUNLOAD);
                        ppattr = hat_page_getattr(pp,
                                                  P_REF | P_MOD);
                        goto ck_refmod;
                    }
                    /*
                     * Update statistics for pages being
                     * freed
                     */
                    if (pp->p_vnode) {
                        if (IS_SWAPFSVP(pp->p_vnode)) {
                            anonfree++;
                        } else {
                            if (pp->p_vnode->v_flag
                                    & VVMEXEC) {
                                execfree++;
                            } else {
                                fsfree++;
                            }
                        }
                    }
                    /*LINTED: constant in conditional ctx*/
                    VN_DISPOSE(pp, B_FREE,
                               (flags & B_DONTNEED), kcred);
                    dfree++;
                } else {
                    page_unlock(pp);
                    pgrec++;
                    TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
                            "page_ws_free:pp %p", pp);
                }
            } else {
                /*
                 * Page is either `locked' in memory
                 * or was reclaimed and now has a
                 * "shared" lock, so release it.
                 */
                page_unlock(pp);
            }
        } else {
            /*
             * Neither B_FREE nor B_INVAL nor B_ERROR.
             * Just release locks.
             */
            page_io_unlock(pp);
            page_unlock(pp);
        }
    }

    CPU_STATS_ENTER_K();
    cpup = CPU;		/* get cpup now that CPU cannot change */
    CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
    CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
    CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
    CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
    CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
    CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
    CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
    CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
    CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
    CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
    CPU_STATS_EXIT_K();

    /* Kernel probe */
    TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
                tnf_opaque,	vnode,			vp,
                tnf_ulong,	pages_pageout,		pgpgout,
                tnf_ulong,	pages_freed,		dfree,
                tnf_ulong,	pages_reclaimed,	pgrec);
}
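The subtle part above is the ck_refmod loop: sync the ref/mod bits, and if the page still looks clean but remains mapped, unload the translations and check again. Restated as a standalone sketch using the same calls; the helper name is hypothetical.

/*
 * Hypothetical helper restating the ck_refmod logic: returns
 * non-zero if the page is unreferenced, unmodified and no longer
 * mapped, i.e. safe to free.
 */
static int
page_is_discardable(page_t *pp)
{
	uint_t ppattr;

	ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
	while (!(ppattr & (P_REF | P_MOD)) && hat_page_is_mapped(pp)) {
		/* Clean so far but still mapped: unload and recheck. */
		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
		ppattr = hat_page_getattr(pp, P_REF | P_MOD);
	}
	return (!(ppattr & (P_REF | P_MOD)));
}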
Example #5
/*
 * Starting at current directory, translate pathname pnp to end.
 * Leave pathname of final component in pnp, return the vnode
 * for the final component in *compvpp, and return the vnode
 * for the parent of the final component in dirvpp.
 *
 * This is the central routine in pathname translation and handles
 * multiple components in pathnames, separating them at /'s.  It also
 * implements mounted file systems and processes symbolic links.
 *
 * vp is the vnode where the directory search should start.
 *
 * Reference counts: vp must be held prior to calling this function.  rootvp
 * should only be held if rootvp != rootdir.
 */
int
lookuppnvp(
	struct pathname *pnp,		/* pathname to lookup */
	struct pathname *rpnp,		/* if non-NULL, return resolved path */
	int flags,			/* follow symlinks */
	vnode_t **dirvpp,		/* ptr for parent vnode */
	vnode_t **compvpp,		/* ptr for entry vnode */
	vnode_t *rootvp,		/* rootvp */
	vnode_t *vp,			/* directory to start search at */
	cred_t *cr)			/* user's credential */
{
	vnode_t *cvp;	/* current component vp */
	char component[MAXNAMELEN];	/* buffer for component (incl null) */
	int error;
	int nlink;
	int lookup_flags;
	struct pathname presrvd; /* case preserved name */
	struct pathname *pp = NULL;
	vnode_t *startvp;
	vnode_t *zonevp = curproc->p_zone->zone_rootvp;		/* zone root */
	int must_be_directory = 0;
	boolean_t retry_with_kcred;
	uint32_t auditing = AU_AUDITING();

	CPU_STATS_ADDQ(CPU, sys, namei, 1);
	nlink = 0;
	cvp = NULL;
	if (rpnp)
		rpnp->pn_pathlen = 0;

	lookup_flags = dirvpp ? LOOKUP_DIR : 0;
	if (flags & FIGNORECASE) {
		lookup_flags |= FIGNORECASE;
		pn_alloc(&presrvd);
		pp = &presrvd;
	}

	if (auditing)
		audit_anchorpath(pnp, vp == rootvp);

	/*
	 * Eliminate any trailing slashes in the pathname.
	 * If there are any, we must follow all symlinks.
	 * Also, we must guarantee that the last component is a directory.
	 */
	if (pn_fixslash(pnp)) {
		flags |= FOLLOW;
		must_be_directory = 1;
	}

	startvp = vp;
next:
	retry_with_kcred = B_FALSE;

	/*
	 * Make sure we have a directory.
	 */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto bad;
	}

	if (rpnp && VN_CMP(vp, rootvp))
		(void) pn_set(rpnp, "/");

	/*
	 * Process the next component of the pathname.
	 */
	if (error = pn_getcomponent(pnp, component)) {
		goto bad;
	}

	/*
	 * Handle "..": two special cases.
	 * 1. If we're at the root directory (e.g. after chroot or
	 *    zone_enter) then change ".." to "." so we can't get
	 *    out of this subtree.
	 * 2. If this vnode is the root of a mounted file system,
	 *    then replace it with the vnode that was mounted on
	 *    so that we take the ".." in the other file system.
	 */
	if (component[0] == '.' && component[1] == '.' && component[2] == 0) {
checkforroot:
		if (VN_CMP(vp, rootvp) || VN_CMP(vp, zonevp)) {
			component[1] = '\0';
		} else if (vp->v_flag & VROOT) {
			vfs_t *vfsp;
			cvp = vp;

			/*
			 * While we deal with the vfs pointer from the vnode
			 * the filesystem could have been forcefully unmounted
			 * and the vnode's v_vfsp could have been invalidated
			 * by VFS_UNMOUNT. Hence, we cache v_vfsp and use it
			 * with vfs_rlock_wait/vfs_unlock.
			 * It is safe to use the v_vfsp even it is freed by
			 * VFS_UNMOUNT because vfs_rlock_wait/vfs_unlock
			 * do not dereference v_vfsp. It is just used as a
			 * magic cookie.
			 * One more corner case here is the memory getting
			 * reused for another vfs structure. In this case
			 * lookuppnvp's vfs_rlock_wait will succeed, domount's
			 * vfs_lock will fail and domount will bail out with an
			 * error (EBUSY).
			 */
			vfsp = cvp->v_vfsp;

			/*
			 * This lock is used to synchronize
			 * mounts/unmounts and lookups.
			 * Threads doing mounts/unmounts hold the
			 * writers version vfs_lock_wait().
			 */

			vfs_rlock_wait(vfsp);

			/*
			 * If this vnode is on a file system that
			 * has been forcibly unmounted,
			 * we can't proceed. Cancel this operation
			 * and return EIO.
			 *
			 * vfs_vnodecovered is NULL if unmounted.
			 * Currently, nfs uses VFS_UNMOUNTED to
			 * check if it's a forced-umount. Keep the
			 * same checking here as well even though it
			 * may not be needed.
			 */
			if (((vp = cvp->v_vfsp->vfs_vnodecovered) == NULL) ||
			    (cvp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
				vfs_unlock(vfsp);
				VN_RELE(cvp);
				if (pp)
					pn_free(pp);
				return (EIO);
			}
			VN_HOLD(vp);
			vfs_unlock(vfsp);
			VN_RELE(cvp);
			cvp = NULL;
			/*
			 * Crossing a mount point.  For example, when we look
			 * up ".." from the root vnode of a file system
			 * mounted here, VOP_LOOKUP() (with the covered vnode)
			 * runs on the underlying file system's mount point
			 * vnode.  Set the retry_with_kcred flag since we may
			 * end up retrying VOP_LOOKUP() with kcred.
			 */
			retry_with_kcred = B_TRUE;
			goto checkforroot;
		}
	}

	/*
	 * LOOKUP_CHECKREAD is a private flag used by vnodetopath() to indicate
	 * that we need to have read permission on every directory in the entire
	 * path.  This is used to ensure that a forward-lookup of a cached value
	 * has the same effect as a reverse-lookup when the cached value cannot
	 * be found.
	 */
	if ((flags & LOOKUP_CHECKREAD) &&
	    (error = VOP_ACCESS(vp, VREAD, 0, cr, NULL)) != 0)
		goto bad;

	/*
	 * Perform a lookup in the current directory.
	 */
	error = VOP_LOOKUP(vp, component, &cvp, pnp, lookup_flags,
	    rootvp, cr, NULL, NULL, pp);

	/*
	 * Retry with kcred if we are crossing a mount point and the
	 * error is EACCES.
	 *
	 * If we are crossing a mount point while looking up "..",
	 * VOP_LOOKUP() may fail because the underlying file system's
	 * mount point has no execute permission.  In cases like these,
	 * we retry VOP_LOOKUP() with as much privilege as possible by
	 * passing kcred credentials.
	 *
	 * With stacked file systems, passing kcred still may or may not
	 * work.  For example: a UFS file system, with an NFS file system
	 * mounted on it, and another UFS file system mounted on a
	 * directory inside that NFS mount.
	 */
	if ((error == EACCES) && retry_with_kcred)
		error = VOP_LOOKUP(vp, component, &cvp, pnp, lookup_flags,
		    rootvp, zone_kcred(), NULL, NULL, pp);

	if (error) {
		cvp = NULL;
		/*
		 * On error, return hard error if
		 * (a) we're not at the end of the pathname yet, or
		 * (b) the caller didn't want the parent directory, or
		 * (c) we failed for some reason other than a missing entry.
		 */
		if (pn_pathleft(pnp) || dirvpp == NULL || error != ENOENT)
			goto bad;
		if (auditing) {	/* directory access */
			if (error = audit_savepath(pnp, vp, vp, error, cr))
				goto bad_noaudit;
		}

		pn_setlast(pnp);
		/*
		 * We inform the caller that the desired entry must be
		 * a directory by adding a '/' to the component name.
		 */
		if (must_be_directory && (error = pn_addslash(pnp)) != 0)
			goto bad;
		*dirvpp = vp;
		if (compvpp != NULL)
			*compvpp = NULL;
		if (rootvp != rootdir)
			VN_RELE(rootvp);
		if (pp)
			pn_free(pp);
		return (0);
	}

	/*
	 * Traverse mount points.
	 * XXX why don't we need to hold a read lock here (call vn_vfsrlock)?
	 * What prevents a concurrent update to v_vfsmountedhere?
	 * 	Possible answer: if mounting, we might not see the mount
	 *	if it is concurrently coming into existence, but that's
	 *	really not much different from the thread running a bit slower.
	 *	If unmounting, we may get into traverse() when we shouldn't,
	 *	but traverse() will catch this case for us.
	 *	(For this to work, fetching v_vfsmountedhere had better
	 *	be atomic!)
	 */
	if (vn_mountedvfs(cvp) != NULL) {
		if ((error = traverse(&cvp)) != 0)
			goto bad;
	}

	/*
	 * If we hit a symbolic link and there is more path to be
	 * translated or this operation does not wish to apply
	 * to a link, then place the contents of the link at the
	 * front of the remaining pathname.
	 */
	if (cvp->v_type == VLNK && ((flags & FOLLOW) || pn_pathleft(pnp))) {
		struct pathname linkpath;

		if (++nlink > MAXSYMLINKS) {
			error = ELOOP;
			goto bad;
		}
		pn_alloc(&linkpath);
		if (error = pn_getsymlink(cvp, &linkpath, cr)) {
			pn_free(&linkpath);
			goto bad;
		}

		if (auditing)
			audit_symlink(pnp, &linkpath);

		if (pn_pathleft(&linkpath) == 0)
			(void) pn_set(&linkpath, ".");
		error = pn_insert(pnp, &linkpath, strlen(component));
		pn_free(&linkpath);
		if (error)
			goto bad;
		VN_RELE(cvp);
		cvp = NULL;
		if (pnp->pn_pathlen == 0) {
			error = ENOENT;
			goto bad;
		}
		if (pnp->pn_path[0] == '/') {
			do {
				pnp->pn_path++;
				pnp->pn_pathlen--;
			} while (pnp->pn_path[0] == '/');
			VN_RELE(vp);
			vp = rootvp;
			VN_HOLD(vp);
		}
		if (auditing)
			audit_anchorpath(pnp, vp == rootvp);
		if (pn_fixslash(pnp)) {
			flags |= FOLLOW;
			must_be_directory = 1;
		}
		goto next;
	}

	/*
	 * If rpnp is non-NULL, remember the resolved path name therein.
	 * Do not include "." components.  Collapse occurrences of
	 * "previous/..", so long as "previous" is not itself "..".
	 * Exhausting rpnp results in error ENAMETOOLONG.
	 */
	if (rpnp && strcmp(component, ".") != 0) {
		size_t len;

		if (strcmp(component, "..") == 0 &&
		    rpnp->pn_pathlen != 0 &&
		    !((rpnp->pn_pathlen > 2 &&
		    strncmp(rpnp->pn_path+rpnp->pn_pathlen-3, "/..", 3) == 0) ||
		    (rpnp->pn_pathlen == 2 &&
		    strncmp(rpnp->pn_path, "..", 2) == 0))) {
			while (rpnp->pn_pathlen &&
			    rpnp->pn_path[rpnp->pn_pathlen-1] != '/')
				rpnp->pn_pathlen--;
			if (rpnp->pn_pathlen > 1)
				rpnp->pn_pathlen--;
			rpnp->pn_path[rpnp->pn_pathlen] = '\0';
		} else {
			if (rpnp->pn_pathlen != 0 &&
			    rpnp->pn_path[rpnp->pn_pathlen-1] != '/')
				rpnp->pn_path[rpnp->pn_pathlen++] = '/';
			if (flags & FIGNORECASE) {
				/*
				 * Return the case-preserved name
				 * within the resolved path.
				 */
				error = copystr(pp->pn_buf,
				    rpnp->pn_path + rpnp->pn_pathlen,
				    rpnp->pn_bufsize - rpnp->pn_pathlen, &len);
			} else {
				error = copystr(component,
				    rpnp->pn_path + rpnp->pn_pathlen,
				    rpnp->pn_bufsize - rpnp->pn_pathlen, &len);
			}
			if (error)	/* copystr() returns ENAMETOOLONG */
				goto bad;
			rpnp->pn_pathlen += (len - 1);
			ASSERT(rpnp->pn_bufsize > rpnp->pn_pathlen);
		}
	}

	/*
	 * If no more components, return last directory (if wanted) and
	 * last component (if wanted).
	 */
	if (pn_pathleft(pnp) == 0) {
		/*
		 * If there was a trailing slash in the pathname,
		 * make sure the last component is a directory.
		 */
		if (must_be_directory && cvp->v_type != VDIR) {
			error = ENOTDIR;
			goto bad;
		}
		if (dirvpp != NULL) {
			/*
			 * Check that we have the real parent and not
			 * an alias of the last component.
			 */
			if (vn_compare(vp, cvp)) {
				if (auditing)
					(void) audit_savepath(pnp, cvp, vp,
					    EINVAL, cr);
				pn_setlast(pnp);
				VN_RELE(vp);
				VN_RELE(cvp);
				if (rootvp != rootdir)
					VN_RELE(rootvp);
				if (pp)
					pn_free(pp);
				return (EINVAL);
			}
			*dirvpp = vp;
		} else
			VN_RELE(vp);
		if (auditing)
			(void) audit_savepath(pnp, cvp, vp, 0, cr);
		if (pnp->pn_path == pnp->pn_buf)
			(void) pn_set(pnp, ".");
		else
			pn_setlast(pnp);
		if (rpnp) {
			if (VN_CMP(cvp, rootvp))
				(void) pn_set(rpnp, "/");
			else if (rpnp->pn_pathlen == 0)
				(void) pn_set(rpnp, ".");
		}

		if (compvpp != NULL)
			*compvpp = cvp;
		else
			VN_RELE(cvp);
		if (rootvp != rootdir)
			VN_RELE(rootvp);
		if (pp)
			pn_free(pp);
		return (0);
	}

	/*
	 * Skip over slashes from end of last component.
	 */
	while (pnp->pn_path[0] == '/') {
		pnp->pn_path++;
		pnp->pn_pathlen--;
	}

	/*
	 * Searched through another level of directory:
	 * release previous directory handle and save new (result
	 * of lookup) as current directory.
	 */
	VN_RELE(vp);
	vp = cvp;
	cvp = NULL;
	goto next;

bad:
	if (auditing)	/* reached end of path */
		(void) audit_savepath(pnp, cvp, vp, error, cr);
bad_noaudit:
	/*
	 * Error.  Release vnodes and return.
	 */
	if (cvp)
		VN_RELE(cvp);
	/*
	 * If the error was ESTALE and the current directory to look in
	 * was the root for this lookup, the root for a mounted file
	 * system, or the starting directory for lookups, then
	 * return ENOENT instead of ESTALE.  In this case, no recovery
	 * is possible by the higher level.  If ESTALE was returned for
	 * some intermediate directory along the path, then recovery
	 * is potentially possible and retrying from the higher level
	 * will either correct the situation by purging stale cache
	 * entries or eventually get back to the point where no recovery
	 * is possible.
	 */
	if (error == ESTALE &&
	    (VN_CMP(vp, rootvp) || (vp->v_flag & VROOT) || vp == startvp))
		error = ENOENT;
	VN_RELE(vp);
	if (rootvp != rootdir)
		VN_RELE(rootvp);
	if (pp)
		pn_free(pp);
	return (error);
}
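A hypothetical caller that follows the reference-count contract from the block comment (vp must be held before the call; rootvp is only held when it differs from rootdir) might look like this. The helper name and the pn_get()/UIO_SYSSPACE setup are assumptions for illustration.

/*
 * Hypothetical caller sketch: resolve a kernel-space path starting
 * at rootdir and return the final component's vnode in *vpp.
 */
static int
resolve_path(char *path, vnode_t **vpp, cred_t *cr)
{
	struct pathname pn;
	int error;

	if ((error = pn_get(path, UIO_SYSSPACE, &pn)) != 0)
		return (error);

	VN_HOLD(rootdir);	/* held for the vp argument; consumed below */
	error = lookuppnvp(&pn, NULL, FOLLOW, NULL, vpp,
	    rootdir, rootdir, cr);
	pn_free(&pn);
	return (error);
}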
Example #6
/*
 * mutex_vector_enter() is called from the assembly mutex_enter() routine
 * if the lock is held or is not of type MUTEX_ADAPTIVE.
 */
void
mutex_vector_enter(mutex_impl_t *lp)
{
	kthread_id_t	owner;
	hrtime_t	sleep_time = 0;	/* how long we slept */
	uint_t		spin_count = 0;	/* how many times we spun */
	cpu_t 		*cpup, *last_cpu;
	extern cpu_t	*cpu_list;
	turnstile_t	*ts;
	volatile mutex_impl_t *vlp = (volatile mutex_impl_t *)lp;
	int		backoff;	/* current backoff */
	int		backctr;	/* ctr for backoff */
	int		sleep_count = 0;

	ASSERT_STACK_ALIGNED();

	if (MUTEX_TYPE_SPIN(lp)) {
		lock_set_spl(&lp->m_spin.m_spinlock, lp->m_spin.m_minspl,
		    &lp->m_spin.m_oldspl);
		return;
	}

	if (!MUTEX_TYPE_ADAPTIVE(lp)) {
		mutex_panic("mutex_enter: bad mutex", lp);
		return;
	}

	/*
	 * Adaptive mutexes must not be acquired from above LOCK_LEVEL.
	 * We can migrate after loading CPU but before checking CPU_ON_INTR,
	 * so we must verify by disabling preemption and loading CPU again.
	 */
	cpup = CPU;
	if (CPU_ON_INTR(cpup) && !panicstr) {
		kpreempt_disable();
		if (CPU_ON_INTR(CPU))
			mutex_panic("mutex_enter: adaptive at high PIL", lp);
		kpreempt_enable();
	}

	CPU_STATS_ADDQ(cpup, sys, mutex_adenters, 1);

	if (&plat_lock_delay) {
		backoff = 0;
	} else {
		backoff = BACKOFF_BASE;
	}

	for (;;) {
spin:
		spin_count++;
		/*
		 * Add an exponential backoff delay before trying again
		 * to touch the mutex data structure.
		 * The spin_count test and the call to nulldev() prevent
		 * the compiler optimizer from eliminating the delay loop.
		 */
		if (&plat_lock_delay) {
			plat_lock_delay(&backoff);
		} else {
			for (backctr = backoff; backctr; backctr--) {
				if (!spin_count) (void) nulldev();
			};    /* delay */
			backoff = backoff << 1;			/* double it */
			if (backoff > BACKOFF_CAP) {
				backoff = BACKOFF_CAP;
			}

			SMT_PAUSE();
		}

		if (panicstr)
			return;

		if ((owner = MUTEX_OWNER(vlp)) == NULL) {
			if (mutex_adaptive_tryenter(lp))
				break;
			continue;
		}

		if (owner == curthread)
			mutex_panic("recursive mutex_enter", lp);

		/*
		 * If lock is held but owner is not yet set, spin.
		 * (Only relevant for platforms that don't have cas.)
		 */
		if (owner == MUTEX_NO_OWNER)
			continue;

		/*
		 * When searching the other CPUs, start with the one where
		 * we last saw the owner thread.  If owner is running, spin.
		 *
		 * We must disable preemption at this point to guarantee
		 * that the list doesn't change while we traverse it
		 * without the cpu_lock mutex.  While preemption is
		 * disabled, we must revalidate our cached cpu pointer.
		 */
		kpreempt_disable();
		if (cpup->cpu_next == NULL)
			cpup = cpu_list;
		last_cpu = cpup;	/* mark end of search */
		do {
			if (cpup->cpu_thread == owner) {
				kpreempt_enable();
				goto spin;
			}
		} while ((cpup = cpup->cpu_next) != last_cpu);
		kpreempt_enable();

		/*
		 * The owner appears not to be running, so block.
		 * See the Big Theory Statement for memory ordering issues.
		 */
		ts = turnstile_lookup(lp);
		MUTEX_SET_WAITERS(lp);
		membar_enter();

		/*
		 * Recheck whether owner is running after waiters bit hits
		 * global visibility (above).  If owner is running, spin.
		 *
		 * Since we are at ipl DISP_LEVEL, kernel preemption is
		 * disabled; however, we still need to revalidate our cached
		 * cpu pointer to make sure the cpu hasn't been deleted.
		 */
		if (cpup->cpu_next == NULL)
			last_cpu = cpup = cpu_list;
		do {
			if (cpup->cpu_thread == owner) {
				turnstile_exit(lp);
				goto spin;
			}
		} while ((cpup = cpup->cpu_next) != last_cpu);
		membar_consumer();

		/*
		 * If owner and waiters bit are unchanged, block.
		 */
		if (MUTEX_OWNER(vlp) == owner && MUTEX_HAS_WAITERS(vlp)) {
			sleep_time -= gethrtime();
			(void) turnstile_block(ts, TS_WRITER_Q, lp,
			    &mutex_sobj_ops, NULL, NULL);
			sleep_time += gethrtime();
			sleep_count++;
		} else {
			turnstile_exit(lp);
		}
	}

	ASSERT(MUTEX_OWNER(lp) == curthread);

	if (sleep_time != 0) {
		/*
		 * Note, sleep time is the sum of all the sleeping we
		 * did.
		 */
		LOCKSTAT_RECORD(LS_MUTEX_ENTER_BLOCK, lp, sleep_time);
	}

	/*
	 * We do not count a sleep as a spin.
	 */
	if (spin_count > sleep_count)
		LOCKSTAT_RECORD(LS_MUTEX_ENTER_SPIN, lp,
		    spin_count - sleep_count);

	LOCKSTAT_RECORD0(LS_MUTEX_ENTER_ACQUIRE, lp);
}
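When no plat_lock_delay() hook is present, the spin loop's delay policy is plain capped exponential backoff. Isolated as a sketch with a hypothetical helper name:

/*
 * Hypothetical helper: the capped doubling used by the spin loop
 * above when plat_lock_delay() is not provided.
 */
static int
next_backoff(int backoff)
{
	backoff <<= 1;			/* double the delay */
	if (backoff > BACKOFF_CAP)
		backoff = BACKOFF_CAP;	/* never exceed the cap */
	return (backoff);
}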