Example #1
int zfs_vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, ssize_t len,
                offset_t offset, enum uio_seg seg, int ioflag, rlim64_t ulimit,
                cred_t *cr, ssize_t *residp)
{
    uio_t *auio;
    int spacetype;
    int error = 0;
    vfs_context_t vctx;

    spacetype = UIO_SEG_IS_USER_SPACE(seg) ? UIO_USERSPACE32 : UIO_SYSSPACE;

    vctx = vfs_context_create((vfs_context_t)0);
    auio = uio_create(1, 0, spacetype, rw);
    uio_reset(auio, offset, spacetype, rw);
    uio_addiov(auio, (uint64_t)(uintptr_t)base, len);

    if (rw == UIO_READ) {
        error = VNOP_READ(vp, auio, ioflag, vctx);
    } else {
        error = VNOP_WRITE(vp, auio, ioflag, vctx);
    }

    if (residp) {
        *residp = uio_resid(auio);
    } else {
        if (uio_resid(auio) && error == 0)
            error = EIO;
    }

    uio_free(auio);
    vfs_context_rele(vctx);

    return (error);
}
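
For reference, a minimal call-site sketch (not from the original source): reading into a wired kernel buffer through the wrapper above. The vnode is assumed to already carry an iocount, and kcred / RLIM64_INFINITY are the usual Solaris-compatibility spellings in this kind of port.

/* Hypothetical call site for zfs_vn_rdwr(); vp, kcred and the buffer
 * names are assumptions, not identifiers from the original source. */
static int
read_label_block(struct vnode *vp, void *kbuf, size_t nbytes, offset_t off)
{
    ssize_t resid = 0;
    int err;

    err = zfs_vn_rdwr(UIO_READ, vp, (caddr_t)kbuf, nbytes, off,
                      UIO_SYSSPACE, 0 /* ioflag */, RLIM64_INFINITY,
                      kcred, &resid);
    if (err == 0 && resid > 0)
        err = EIO;      /* treat a short read as an error */
    return (err);
}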
Example #2
/*
 * uio_isuserspace - return a non-zero value if the address space
 * flag is for a user address space (could be 32- or 64-bit).
 */
int uio_isuserspace( uio_t a_uio )
{
	if (a_uio == NULL) {
#if LP64_DEBUG
		panic("%s :%d - invalid uio_t\n", __FILE__, __LINE__); 
#endif /* LP64_DEBUG */
		return(0);
	}

	if (UIO_SEG_IS_USER_SPACE(a_uio->uio_segflg)) {
		return( 1 );
	}
	return( 0 );
}
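
A sketch of the typical reason to call uio_isuserspace(): choosing the right copy routine for a buffer address taken from the uio. The helper name and its arguments are hypothetical.

/* Hypothetical helper: copy nbytes from `base' into a kernel buffer,
 * using copyin() for user mappings and bcopy() for kernel ones. */
static int
fetch_from_uio_base(uio_t a_uio, user_addr_t base, void *kbuf, size_t nbytes)
{
	if (uio_isuserspace(a_uio))
		return (copyin(base, kbuf, nbytes));	/* user address space */
	bcopy((void *)(uintptr_t)base, kbuf, nbytes);	/* kernel address space */
	return (0);
}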
Example #3
/*
 * Our version of vn_rdwr; here "vp" is not actually a vnode, but a pointer
 * to the node allocated in getf(). We use the "fp" part of the node to
 * be able to issue I/O.
 * You must call getf() before calling spl_vn_rdwr().
 */
int spl_vn_rdwr(enum uio_rw rw,
                struct vnode *vp,
                caddr_t base,
                ssize_t len,
                offset_t offset,
                enum uio_seg seg,
                int ioflag,
                rlim64_t ulimit,    /* meaningful only if rw is UIO_WRITE */
                cred_t *cr,
                ssize_t *residp)
{
    struct spl_fileproc *sfp = (struct spl_fileproc*)vp;
    uio_t *auio;
    int spacetype;
    int error = 0;
    vfs_context_t vctx;

    spacetype = UIO_SEG_IS_USER_SPACE(seg) ? UIO_USERSPACE32 : UIO_SYSSPACE;

    vctx = vfs_context_create((vfs_context_t)0);
    auio = uio_create(1, 0, spacetype, rw);
    uio_reset(auio, offset, spacetype, rw);
    uio_addiov(auio, (uint64_t)(uintptr_t)base, len);

    if (rw == UIO_READ) {
        error = fo_read(sfp->f_fp, auio, ioflag, vctx);
    } else {
        error = fo_write(sfp->f_fp, auio, ioflag, vctx);
        sfp->f_writes = 1;
    }

    if (residp) {
        *residp = uio_resid(auio);
    } else {
        if (uio_resid(auio) && error == 0)
            error = EIO;
    }

    uio_free(auio);
    vfs_context_rele(vctx);

    return (error);
}
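
As the header comment requires, callers must bracket spl_vn_rdwr() with getf()/releasef(). A minimal sketch under that assumption; fd, kbuf and kcred are placeholders, and getf()'s exact return type follows this port's SPL shims.

/* Hypothetical caller honoring the getf()/releasef() contract. */
static int
write_to_fd(int fd, void *kbuf, ssize_t nbytes, offset_t off)
{
    struct vnode *vp = getf(fd);    /* really the node wrapping the fileproc */
    int error;

    if (vp == NULL)
        return (EBADF);
    error = spl_vn_rdwr(UIO_WRITE, vp, (caddr_t)kbuf, nbytes, off,
                        UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, NULL);
    releasef(fd);                   /* drop the reference taken by getf() */
    return (error);
}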
Example #4
/*
 * uio_iovsaddr - get the address of the iovec array for the given uio_t.
 * This returns the location of the iovecs within the uio.
 * NOTE - for compatibility mode we just return the current value of
 * uio_iovs, which advances as the I/O completes; it is NOT embedded
 * within the uio but is a separate array of one or more iovecs.
 */
__private_extern__ struct user_iovec * uio_iovsaddr( uio_t a_uio )
{
	struct user_iovec *		my_addr;
	
	if (a_uio == NULL) {
		return(NULL);
	}
	
	if (UIO_SEG_IS_USER_SPACE(a_uio->uio_segflg)) {
		/* we need this for compatibility mode. */
		my_addr = (struct user_iovec *) a_uio->uio_iovs.uiovp;
	}
	else {
#if DEBUG
		panic("uio_iovsaddr called for UIO_SYSSPACE request");
#endif
		my_addr = 0;
	}
	return(my_addr);
}
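
A sketch of consuming the returned iovec array; the field names follow XNU's struct user_iovec, and the uio is assumed to describe a user address space (otherwise uio_iovsaddr() returns NULL here).

/* Hypothetical helper: copy in the first iovec of a user-space uio. */
static int
copyin_first_iovec(uio_t a_uio, void *kbuf, size_t kbuflen)
{
	struct user_iovec *iovp = uio_iovsaddr(a_uio);

	if (iovp == NULL)
		return (EINVAL);	/* NULL for UIO_SYSSPACE requests */
	return (copyin(iovp->iov_base, kbuf,
	    MIN((size_t)iovp->iov_len, kbuflen)));
}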
Example #5
/*
 * Convert a pathname into a pointer to a locked inode.
 *
 * The FOLLOW flag is set when symbolic links are to be followed
 * when they occur at the end of the name translation process.
 * Symbolic links are always followed for all pathname
 * components other than the last.
 *
 * The segflg defines whether the name is to be copied from user
 * space or kernel space.
 *
 * Overall outline of namei:
 *
 *	copy in name
 *	get starting directory
 *	while (!done && !error) {
 *		call lookup to search path.
 *		if symbolic link, massage name in buffer and continue
 *	}
 *
 * Returns:	0			Success
 *		ENOENT			No such file or directory
 *		ELOOP			Too many levels of symbolic links
 *		ENAMETOOLONG		Filename too long
 *		copyinstr:EFAULT	Bad address
 *		copyinstr:ENAMETOOLONG	Filename too long
 *		lookup:EBADF		Bad file descriptor
 *		lookup:EROFS
 *		lookup:EACCES
 *		lookup:EPERM
 *		lookup:ERECYCLE	 vnode was recycled from underneath us in lookup.
 *						 This means we should re-drive lookup from this point.
 *		lookup: ???
 *		VNOP_READLINK:???
 */
int
namei(struct nameidata *ndp)
{
	struct filedesc *fdp;	/* pointer to file descriptor state */
	struct vnode *dp;	/* the directory we are searching */
	struct vnode *usedvp = ndp->ni_dvp;	/* store pointer to vp in case we must
						   loop due to heavy vnode pressure */
	u_long cnpflags = ndp->ni_cnd.cn_flags; /* store in case we have to restore after loop */
	int error;
	struct componentname *cnp = &ndp->ni_cnd;
	vfs_context_t ctx = cnp->cn_context;
	proc_t p = vfs_context_proc(ctx);
#if CONFIG_AUDIT
/* XXX ut should be from context */
	uthread_t ut = (struct uthread *)get_bsdthread_info(current_thread());
#endif

	fdp = p->p_fd;

#if DIAGNOSTIC
	if (!vfs_context_ucred(ctx) || !p)
		panic ("namei: bad cred/proc");
	if (cnp->cn_nameiop & (~OPMASK))
		panic ("namei: nameiop contaminated with flags");
	if (cnp->cn_flags & OPMASK)
		panic ("namei: flags contaminated with nameiops");
#endif

	/*
	 * A compound VNOP found something that needs further processing:
	 * either a trigger vnode, a covered directory, or a symlink.
	 */
	if (ndp->ni_flag & NAMEI_CONTLOOKUP) {
		int rdonly, vbusyflags, keep_going, wantparent;

		rdonly = cnp->cn_flags & RDONLY;
		vbusyflags = ((cnp->cn_flags & CN_NBMOUNTLOOK) != 0) ? LK_NOWAIT : 0;
		keep_going = 0;
		wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);

		ndp->ni_flag &= ~(NAMEI_CONTLOOKUP);

		error = lookup_handle_found_vnode(ndp, &ndp->ni_cnd, rdonly, vbusyflags, 
				&keep_going, ndp->ni_ncgeneration, wantparent, 0, ctx);
		if (error)
			goto out_drop;
		if (keep_going) {
			if ((cnp->cn_flags & ISSYMLINK) == 0) {
				panic("We need to keep going on a continued lookup, but for vp type %d (tag %d)\n", ndp->ni_vp->v_type, ndp->ni_vp->v_tag);
			}
			goto continue_symlink;
		}

		return 0;

	}

vnode_recycled:

	/*
	 * Get a buffer for the name to be translated, and copy the
	 * name into the buffer.
	 */
	if ((cnp->cn_flags & HASBUF) == 0) {
		cnp->cn_pnbuf = ndp->ni_pathbuf;
		cnp->cn_pnlen = PATHBUFLEN;
	}
#if LP64_DEBUG
	if ((UIO_SEG_IS_USER_SPACE(ndp->ni_segflg) == 0)
		&& (ndp->ni_segflg != UIO_SYSSPACE)
		&& (ndp->ni_segflg != UIO_SYSSPACE32)) {
		panic("%s :%d - invalid ni_segflg\n", __FILE__, __LINE__); 
	}
#endif /* LP64_DEBUG */

retry_copy:
	if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
			    cnp->cn_pnlen, (size_t *)&ndp->ni_pathlen);
	} else {
		error = copystr(CAST_DOWN(void *, ndp->ni_dirp), cnp->cn_pnbuf,
			    cnp->cn_pnlen, (size_t *)&ndp->ni_pathlen);
	}
	if (error == ENAMETOOLONG && !(cnp->cn_flags & HASBUF)) {
		MALLOC_ZONE(cnp->cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK);
		if (cnp->cn_pnbuf == NULL) {
			error = ENOMEM;
			goto error_out;
		}

		cnp->cn_flags |= HASBUF;
		cnp->cn_pnlen = MAXPATHLEN;
		
		goto retry_copy;
	}
	if (error)
	        goto error_out;

#if CONFIG_VOLFS
	/*
	 * Check for legacy volfs style pathnames.
	 *
	 * For compatibility reasons we currently allow these paths,
	 * but future versions of the OS may not support them.
	 */
	if (ndp->ni_pathlen >= VOLFS_MIN_PATH_LEN &&
	    cnp->cn_pnbuf[0] == '/' &&
	    cnp->cn_pnbuf[1] == '.' &&
	    cnp->cn_pnbuf[2] == 'v' &&
	    cnp->cn_pnbuf[3] == 'o' &&
	    cnp->cn_pnbuf[4] == 'l' &&
	    cnp->cn_pnbuf[5] == '/' ) {
		char * realpath;
		int realpath_err;
		/* Attempt to resolve a legacy volfs style pathname. */
		MALLOC_ZONE(realpath, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK);
		if (realpath) {
			/*
			 * We only error out on the ENAMETOOLONG cases where we know that
			 * vfs_getrealpath translation succeeded but the path could not fit into
			 * MAXPATHLEN characters.  In other failure cases, we may be dealing with
			 * a path that legitimately looks like /.vol/1234/567 and is not meant to
			 * be translated.
			 */
			if ((realpath_err = vfs_getrealpath(&cnp->cn_pnbuf[6], realpath, MAXPATHLEN, ctx))) {
				FREE_ZONE(realpath, MAXPATHLEN, M_NAMEI);
				if (realpath_err == ENOSPC || realpath_err == ENAMETOOLONG) {
					error = ENAMETOOLONG;
					goto error_out;
				}
			} else {
				if (cnp->cn_flags & HASBUF) {
					FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI);
				}
				cnp->cn_pnbuf = realpath;
				cnp->cn_pnlen = MAXPATHLEN;
				ndp->ni_pathlen = strlen(realpath) + 1;
				cnp->cn_flags |= HASBUF | CN_VOLFSPATH;
			}
		}
	}
#endif /* CONFIG_VOLFS */

#if CONFIG_AUDIT
	/* If we are auditing the kernel pathname, save the user pathname */
	if (cnp->cn_flags & AUDITVNPATH1)
		AUDIT_ARG(upath, ut->uu_cdir, cnp->cn_pnbuf, ARG_UPATH1); 
	if (cnp->cn_flags & AUDITVNPATH2)
		AUDIT_ARG(upath, ut->uu_cdir, cnp->cn_pnbuf, ARG_UPATH2); 
#endif /* CONFIG_AUDIT */

	/*
	 * Do not allow empty pathnames
	 */
	if (*cnp->cn_pnbuf == '\0') {
		error = ENOENT;
		goto error_out;
	}
	ndp->ni_loopcnt = 0;

	/*
	 * determine the starting point for the translation.
	 */
	if ((ndp->ni_rootdir = fdp->fd_rdir) == NULLVP) {
	        if ( !(fdp->fd_flags & FD_CHROOT))
		        ndp->ni_rootdir = rootvnode;
	}
	cnp->cn_nameptr = cnp->cn_pnbuf;

	ndp->ni_usedvp = NULLVP;

	if (*(cnp->cn_nameptr) == '/') {
	        while (*(cnp->cn_nameptr) == '/') {
		        cnp->cn_nameptr++;
			ndp->ni_pathlen--;
		}
		dp = ndp->ni_rootdir;
	} else if (cnp->cn_flags & USEDVP) {
	        dp = ndp->ni_dvp;
		ndp->ni_usedvp = dp;
	} else
	        dp = vfs_context_cwd(ctx);

	if (dp == NULLVP || (dp->v_lflag & VL_DEAD)) {
	        error = ENOENT;
		goto error_out;
	}
	ndp->ni_dvp = NULLVP;
	ndp->ni_vp  = NULLVP;

	for (;;) {
		ndp->ni_startdir = dp;

		if ( (error = lookup(ndp)) ) {
			goto error_out;
		}
		/*
		 * Check for symbolic link
		 */
		if ((cnp->cn_flags & ISSYMLINK) == 0) {
			return (0);
		}

continue_symlink:
		/* Gives us a new path to process, and a starting dir */
		error = lookup_handle_symlink(ndp, &dp, ctx);
		if (error != 0) {
			break;
		}
	}
	/*
	 * only come here if we fail to handle a SYMLINK...
	 * if either ni_dvp or ni_vp is non-NULL, then
	 * we need to drop the iocount that was picked
	 * up in the lookup routine
	 */
out_drop:
	if (ndp->ni_dvp)
	        vnode_put(ndp->ni_dvp);
	if (ndp->ni_vp)
	        vnode_put(ndp->ni_vp);
 error_out:
	if ( (cnp->cn_flags & HASBUF) ) {
		cnp->cn_flags &= ~HASBUF;
		FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI);
	}
	cnp->cn_pnbuf = NULL;
	ndp->ni_vp = NULLVP;
	ndp->ni_dvp = NULLVP;
	if (error == ERECYCLE) {
		/* vnode was recycled underneath us. re-drive lookup to start at
		   the beginning again, since recycling invalidated the last lookup */
		ndp->ni_cnd.cn_flags = cnpflags;
		ndp->ni_dvp = usedvp;
		goto vnode_recycled;
	}


	return (error);
}
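
A hedged sketch of a namei() caller. Production code normally goes through the NDINIT() macro, whose argument list varies across XNU releases, so the sketch fills in only the nameidata fields this function actually consumes; the wrapper itself is hypothetical.

/* Hypothetical wrapper: translate a kernel pathname to a vnode. */
static int
lookup_path(const char *path, vfs_context_t ctx, vnode_t *vpp)
{
	struct nameidata nd;
	int error;

	bzero(&nd, sizeof (nd));
	nd.ni_dirp = CAST_USER_ADDR_T(path);	/* pathname to translate */
	nd.ni_segflg = UIO_SYSSPACE;		/* it lives in kernel space */
	nd.ni_cnd.cn_nameiop = LOOKUP;
	nd.ni_cnd.cn_flags = FOLLOW;		/* chase a trailing symlink */
	nd.ni_cnd.cn_context = ctx;

	error = namei(&nd);
	if (error == 0) {
		*vpp = nd.ni_vp;	/* carries an iocount; vnode_put() later */
		nameidone(&nd);
	}
	return (error);
}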
Example #6
/*
 * Vnode op for write
 */
int
spec_write(struct vnop_write_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn;
	int bsize, blkmask, bscale;
	int io_sync;
	int devBlockSize = 0;
	int n, on;
	int error = 0;
	dev_t dev;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("spec_write mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_write proc");
#endif

	switch (vp->v_type) {

	case VCHR:
		error = (*cdevsw[major(vp->v_rdev)].d_write)
			(vp->v_rdev, uio, ap->a_ioflag);
		return (error);

	case VBLK:
		if (uio_resid(uio) == 0)
			return (0);
		if (uio->uio_offset < 0)
			return (EINVAL);

		io_sync = (ap->a_ioflag & IO_SYNC);

		dev = (vp->v_rdev);

		devBlockSize = vp->v_specsize;
		if (devBlockSize > PAGE_SIZE)
			return(EINVAL);

		bscale = PAGE_SIZE / devBlockSize;
		blkmask = bscale - 1;
		bsize = bscale * devBlockSize;

		do {
			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
			on = uio->uio_offset % bsize;

			n = min((unsigned)(bsize - on), uio_resid(uio));

			/*
			 * Use buf_getblk() as an optimization IFF:
			 *
			 * 1)	We are writing exactly a block on a block
			 *	aligned boundary, so no read-modify-write of
			 *	existing data is needed
			 * 2)	We know the size of the device from spec_open
			 * 3)	The write doesn't span the end of the device
			 *
			 * Otherwise, we fall back on buf_bread().
			 */
			if (n == bsize &&
			    vp->v_specdevsize != (u_int64_t)0 &&
			    (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
			    /* clip the write to what remains on the device */
			    n = (int)(vp->v_specdevsize - uio->uio_offset);
			}

			if (n == bsize)
			        bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
			else
			        error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);

			/* Translate downstream error for upstream, if needed */
			if (!error)
				error = (int)buf_error(bp);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			n = min(n, bsize - buf_resid(bp));

			error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			buf_markaged(bp);

			if (io_sync) 
			        error = buf_bwrite(bp);
			else {
			        if ((n + on) == bsize)
				        error = buf_bawrite(bp);
				else
				        error = buf_bdwrite(bp);
			}
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_write type");
	}
	/* NOTREACHED */

	return (0);
}
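
To make the VBLK arithmetic above concrete, here is a worked example with assumed values: PAGE_SIZE 4096, a 512-byte device block size, and a write starting at uio_offset 6144.

/*
 * bscale  = 4096 / 512        = 8	device blocks per buffer
 * blkmask = bscale - 1        = 7
 * bsize   = 8 * 512           = 4096	bytes per buffer
 * bn      = (6144 / 512) & ~7 = 8	buffer-aligned block number
 * on      = 6144 % 4096       = 2048	starting offset within the buffer
 * n       = min(4096 - 2048, resid)	at most 2048 bytes moved this pass
 */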
Example #7
/*
 * Vnode op for read
 */
int
spec_read(struct vnop_read_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn, nextbn;
	int bsize, bscale;	/* int so &bsize can be passed to buf_breadn() */
	int devBlockSize = 0;
	int n, on;
	int error = 0;
	dev_t dev;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("spec_read mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_read proc");
#endif
	if (uio_resid(uio) == 0)
		return (0);

	switch (vp->v_type) {

	case VCHR:
		error = (*cdevsw[major(vp->v_rdev)].d_read)
			(vp->v_rdev, uio, ap->a_ioflag);
		return (error);

	case VBLK:
		if (uio->uio_offset < 0)
			return (EINVAL);

		dev = vp->v_rdev;

		devBlockSize = vp->v_specsize;

		if (devBlockSize > PAGE_SIZE) 
			return (EINVAL);

		bscale = PAGE_SIZE / devBlockSize;
		bsize = bscale * devBlockSize;

		do {
			on = uio->uio_offset % bsize;

			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));
			
			if (vp->v_speclastr + bscale == bn) {
			        nextbn = bn + bscale;
				error = buf_breadn(vp, bn, bsize, &nextbn,
					       &bsize, 1, NOCRED, &bp);
			} else
			        error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);

			vnode_lock(vp);
			vp->v_speclastr = bn;
			vnode_unlock(vp);

			n = bsize - buf_resid(bp);
			if ((on > n) || error) {
			        if (!error)
				        error = EINVAL;
				buf_brelse(bp);
				return (error);
			}
			n = min((unsigned)(n - on), uio_resid(uio));

			error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
			if (n + on == bsize)
				buf_markaged(bp);
			buf_brelse(bp);
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_read type");
	}
	/* NOTREACHED */

	return (0);
}
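
The v_speclastr bookkeeping above detects strictly sequential reads; a worked example with bscale = 8 (assumed values).

/*
 * previous buffer began at block	v_speclastr = 8
 * this request maps to			bn = 16 = v_speclastr + bscale
 * so buf_breadn() reads block 16 and asynchronously prefetches
 * nextbn = 24; any non-sequential bn takes the plain buf_bread() path.
 */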
Example #8
int
physio( void (*f_strategy)(buf_t), 
	buf_t bp,
	dev_t dev,
	int flags,
	u_int (*f_minphys)(buf_t),
	struct uio *uio,
	int blocksize)
{
	struct proc *p = current_proc();
	int error, i, buf_allocated, todo, iosize;
	int orig_bflags = 0;
	int64_t done;

	error = 0;
	flags &= B_READ | B_WRITE;
	buf_allocated = 0;

	/*
	 * [check user read/write access to the data buffer]
	 *
	 * Check each iov one by one.  Note that we know if we're reading or
	 * writing, so we ignore the uio's rw parameter.  Also note that if
	 * we're doing a read, that's a *write* to user-space.
	 */
	for (i = 0; i < uio->uio_iovcnt; i++) {
		if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
			user_addr_t base;
			user_size_t len;

			if (uio_getiov(uio, i, &base, &len) ||
			    !useracc(base, len,
			        (flags == B_READ) ? B_WRITE : B_READ))
				return (EFAULT);
		}
	}
	/*
	 * Make sure we have a buffer, creating one if necessary.
	 */
	if (bp == NULL) {
		bp = buf_alloc((vnode_t)0);
		buf_allocated = 1;
	} else
	        orig_bflags = buf_flags(bp);
	/*
	 * at this point we should have a buffer
	 * that is marked BL_BUSY... we either 
	 * acquired it via buf_alloc, or it was
	 * passed into us... if it was passed
	 * in, it needs to already be owned by
	 * the caller (i.e. BL_BUSY is set)
	 */
	assert(bp->b_lflags & BL_BUSY);

	/*
	 * [set up the fixed part of the buffer for a transfer]
	 */
	bp->b_dev = dev;
	bp->b_proc = p;

	/*
	 * [mark the buffer busy for physical I/O]
	 * (i.e. set B_PHYS (because it's an I/O to user
	 * memory, and B_RAW, because B_RAW is to be
	 * "Set by physio for raw transfers.", in addition
	 * to the read/write flag.)
	 */
        buf_setflags(bp, B_PHYS | B_RAW);

	/*
	 * [while there is data to transfer and no I/O error]
	 * Note that I/O errors are handled with a 'goto' at the bottom
	 * of the 'while' loop.
	 */
	while (uio_resid(uio) > 0) {
			
			if ( (iosize = uio_curriovlen(uio)) > MAXPHYSIO_WIRED)
			        iosize = MAXPHYSIO_WIRED;
			/*
			 * make sure we're set to issue a fresh I/O
			 * in the right direction
			 */
			buf_reset(bp, flags);

			/* [set up the buffer for a maximum-sized transfer] */
 			buf_setblkno(bp, uio_offset(uio) / blocksize);
			buf_setcount(bp, iosize);
			buf_setdataptr(bp, (uintptr_t)CAST_DOWN(caddr_t, uio_curriovbase(uio)));
			
			/*
			 * [call f_minphys to bound the transfer size]
			 * and remember the amount of data to transfer,
			 * for later comparison.
			 */
			(*f_minphys)(bp);
			todo = buf_count(bp);

			/*
			 * [lock the part of the user address space involved
			 *    in the transfer]
			 */

			if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
				error = vslock(CAST_USER_ADDR_T(buf_dataptr(bp)),
					       (user_size_t)todo);
				if (error)
					goto done;
			}
			
			/* [call f_strategy to start the transfer] */
			(*f_strategy)(bp);


			/* [wait for the transfer to complete] */
			error = (int)buf_biowait(bp);

			/*
			 * [unlock the part of the address space previously
			 *    locked]
			 */
			if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
				vsunlock(CAST_USER_ADDR_T(buf_dataptr(bp)),
					 (user_size_t)todo,
					 (flags & B_READ));

			/*
			 * [deduct the transfer size from the total amount
			 *    of data to transfer]
			 */
			done = buf_count(bp) - buf_resid(bp);
			uio_update(uio, done);

			/*
			 * Now, check for an error.
			 * Also, handle weird end-of-disk semantics.
			 */
			if (error || done < todo)
				goto done;
	}

done:
	if (buf_allocated)
	        buf_free(bp);
	else
		buf_setflags(bp, orig_bflags);

	return (error);
}
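
A hedged sketch of a typical physio() caller: a character device's read entry point funnels the raw transfer through the block strategy routine. mydev_strategy, mydev_minphys and MYDEV_BSIZE are hypothetical driver names, not part of the source above.

/* Hypothetical d_read entry point built on physio(). */
int
mydev_read(dev_t dev, struct uio *uio, __unused int ioflag)
{
	/* bp == NULL makes physio() allocate and free its own buffer */
	return (physio(mydev_strategy, NULL, dev, B_READ,
	    mydev_minphys, uio, MYDEV_BSIZE));
}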
Example #9
/*
 * Convert a pathname into a pointer to a locked inode.
 *
 * The FOLLOW flag is set when symbolic links are to be followed
 * when they occur at the end of the name translation process.
 * Symbolic links are always followed for all pathname
 * components other than the last.
 *
 * The segflg defines whether the name is to be copied from user
 * space or kernel space.
 *
 * Overall outline of namei:
 *
 *	copy in name
 *	get starting directory
 *	while (!done && !error) {
 *		call lookup to search path.
 *		if symbolic link, massage name in buffer and continue
 *	}
 *
 * Returns:	0			Success
 *		ENOENT			No such file or directory
 *		ELOOP			Too many levels of symbolic links
 *		ENAMETOOLONG		Filename too long
 *		copyinstr:EFAULT	Bad address
 *		copyinstr:ENAMETOOLONG	Filename too long
 *		lookup:EBADF		Bad file descriptor
 *		lookup:EROFS
 *		lookup:EACCES
 *		lookup:EPERM
 *		lookup:ERECYCLE	 vnode was recycled from underneath us in lookup.
 *						 This means we should re-drive lookup from this point.
 *		lookup: ???
 *		VNOP_READLINK:???
 */
int
namei(struct nameidata *ndp)
{
	struct filedesc *fdp;	/* pointer to file descriptor state */
	char *cp;		/* pointer into pathname argument */
	struct vnode *dp;	/* the directory we are searching */
	struct vnode *usedvp = ndp->ni_dvp;	/* store pointer to vp in case we must
						   loop due to heavy vnode pressure */
	u_long cnpflags = ndp->ni_cnd.cn_flags; /* store in case we have to restore after loop */
	uio_t auio;
	int error;
	struct componentname *cnp = &ndp->ni_cnd;
	vfs_context_t ctx = cnp->cn_context;
	proc_t p = vfs_context_proc(ctx);
/* XXX ut should be from context */
	uthread_t ut = (struct uthread *)get_bsdthread_info(current_thread());
	char *tmppn;
	char uio_buf[ UIO_SIZEOF(1) ];

#if DIAGNOSTIC
	if (!vfs_context_ucred(ctx) || !p)
		panic ("namei: bad cred/proc");
	if (cnp->cn_nameiop & (~OPMASK))
		panic ("namei: nameiop contaminated with flags");
	if (cnp->cn_flags & OPMASK)
		panic ("namei: flags contaminated with nameiops");
#endif
	fdp = p->p_fd;

vnode_recycled:

	/*
	 * Get a buffer for the name to be translated, and copy the
	 * name into the buffer.
	 */
	if ((cnp->cn_flags & HASBUF) == 0) {
		cnp->cn_pnbuf = ndp->ni_pathbuf;
		cnp->cn_pnlen = PATHBUFLEN;
	}
#if LP64_DEBUG
	if (IS_VALID_UIO_SEGFLG(ndp->ni_segflg) == 0) {
		panic("%s :%d - invalid ni_segflg\n", __FILE__, __LINE__); 
	}
#endif /* LP64_DEBUG */

retry_copy:
	if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
			    cnp->cn_pnlen, (size_t *)&ndp->ni_pathlen);
	} else {
		error = copystr(CAST_DOWN(void *, ndp->ni_dirp), cnp->cn_pnbuf,
			    cnp->cn_pnlen, (size_t *)&ndp->ni_pathlen);
	}
	if (error == ENAMETOOLONG && !(cnp->cn_flags & HASBUF)) {
		MALLOC_ZONE(cnp->cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK);
		if (cnp->cn_pnbuf == NULL) {
			error = ENOMEM;
			goto error_out;
		}

		cnp->cn_flags |= HASBUF;
		cnp->cn_pnlen = MAXPATHLEN;
		
		goto retry_copy;
	}
	if (error)
	        goto error_out;

#if CONFIG_VOLFS
	/*
	 * Check for legacy volfs style pathnames.
	 *
	 * For compatibility reasons we currently allow these paths,
	 * but future versions of the OS may not support them.
	 */
	if (ndp->ni_pathlen >= VOLFS_MIN_PATH_LEN &&
	    cnp->cn_pnbuf[0] == '/' &&
	    cnp->cn_pnbuf[1] == '.' &&
	    cnp->cn_pnbuf[2] == 'v' &&
	    cnp->cn_pnbuf[3] == 'o' &&
	    cnp->cn_pnbuf[4] == 'l' &&
	    cnp->cn_pnbuf[5] == '/' ) {
		char * realpath;
		int realpath_err;
		/* Attempt to resolve a legacy volfs style pathname. */
		MALLOC_ZONE(realpath, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK);
		if (realpath) {
			if ((realpath_err = vfs_getrealpath(&cnp->cn_pnbuf[6], realpath, MAXPATHLEN, ctx))) {
				FREE_ZONE(realpath, MAXPATHLEN, M_NAMEI);
				if (realpath_err == ENOSPC) {
					error = ENAMETOOLONG;
					goto error_out;
				}
			} else {
				if (cnp->cn_flags & HASBUF) {
					FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI);
				}
				cnp->cn_pnbuf = realpath;
				cnp->cn_pnlen = MAXPATHLEN;
				ndp->ni_pathlen = strlen(realpath) + 1;
				cnp->cn_flags |= HASBUF | CN_VOLFSPATH;
			}
		}
	}
#endif /* CONFIG_VOLFS */

	/* If we are auditing the kernel pathname, save the user pathname */
	if (cnp->cn_flags & AUDITVNPATH1)
		AUDIT_ARG(upath, ut->uu_cdir, cnp->cn_pnbuf, ARG_UPATH1); 
	if (cnp->cn_flags & AUDITVNPATH2)
		AUDIT_ARG(upath, ut->uu_cdir, cnp->cn_pnbuf, ARG_UPATH2); 

	/*
	 * Do not allow empty pathnames
	 */
	if (*cnp->cn_pnbuf == '\0') {
		error = ENOENT;
		goto error_out;
	}
	ndp->ni_loopcnt = 0;

	/*
	 * determine the starting point for the translation.
	 */
	if ((ndp->ni_rootdir = fdp->fd_rdir) == NULLVP) {
	        if ( !(fdp->fd_flags & FD_CHROOT))
		        ndp->ni_rootdir = rootvnode;
	}
	cnp->cn_nameptr = cnp->cn_pnbuf;

	ndp->ni_usedvp = NULLVP;

	if (*(cnp->cn_nameptr) == '/') {
	        while (*(cnp->cn_nameptr) == '/') {
		        cnp->cn_nameptr++;
			ndp->ni_pathlen--;
		}
		dp = ndp->ni_rootdir;
	} else if (cnp->cn_flags & USEDVP) {
	        dp = ndp->ni_dvp;
		ndp->ni_usedvp = dp;
	} else
	        dp = vfs_context_cwd(ctx);

	if (dp == NULLVP || (dp->v_lflag & VL_DEAD)) {
	        error = ENOENT;
		goto error_out;
	}
	ndp->ni_dvp = NULLVP;
	ndp->ni_vp  = NULLVP;

	for (;;) {
	        int need_newpathbuf;
		int linklen;

		ndp->ni_startdir = dp;

		if ( (error = lookup(ndp)) ) {
			goto error_out;
		}
		/*
		 * Check for symbolic link
		 */
		if ((cnp->cn_flags & ISSYMLINK) == 0) {
			return (0);
		}
		if ((cnp->cn_flags & FSNODELOCKHELD)) {
		        cnp->cn_flags &= ~FSNODELOCKHELD;
			unlock_fsnode(ndp->ni_dvp, NULL);
		}	
		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
			error = ELOOP;
			break;
		}
#if CONFIG_MACF
		if ((error = mac_vnode_check_readlink(ctx, ndp->ni_vp)) != 0)
			break;
#endif /* MAC */
		if (ndp->ni_pathlen > 1 || !(cnp->cn_flags & HASBUF))
		        need_newpathbuf = 1;
		else
		        need_newpathbuf = 0;

		if (need_newpathbuf) {
			MALLOC_ZONE(cp, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
			if (cp == NULL) {
				error = ENOMEM;
				break;
			}
		} else {
			cp = cnp->cn_pnbuf;
		}
		auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));

		uio_addiov(auio, CAST_USER_ADDR_T(cp), MAXPATHLEN);

		error = VNOP_READLINK(ndp->ni_vp, auio, ctx);
		if (error) {
			if (need_newpathbuf)
				FREE_ZONE(cp, MAXPATHLEN, M_NAMEI);
			break;
		}
		// LP64todo - fix this
		linklen = MAXPATHLEN - uio_resid(auio);
		if (linklen + ndp->ni_pathlen > MAXPATHLEN) {
			if (need_newpathbuf)
				FREE_ZONE(cp, MAXPATHLEN, M_NAMEI);

			error = ENAMETOOLONG;
			break;
		}
		if (need_newpathbuf) {
			long len = cnp->cn_pnlen;

			tmppn = cnp->cn_pnbuf;
			bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
			cnp->cn_pnbuf = cp;
			cnp->cn_pnlen = MAXPATHLEN;

			if ( (cnp->cn_flags & HASBUF) )
			        FREE_ZONE(tmppn, len, M_NAMEI);
			else
			        cnp->cn_flags |= HASBUF;
		} else
			cnp->cn_pnbuf[linklen] = '\0';

		ndp->ni_pathlen += linklen;
		cnp->cn_nameptr = cnp->cn_pnbuf;

		/*
		 * starting point for 'relative'
		 * symbolic link path
		 */
		dp = ndp->ni_dvp;
	        /*
		 * get rid of references returned via 'lookup'
		 */
		vnode_put(ndp->ni_vp);
		vnode_put(ndp->ni_dvp);

		ndp->ni_vp = NULLVP;
		ndp->ni_dvp = NULLVP;

		/*
		 * Check if symbolic link restarts us at the root
		 */
		if (*(cnp->cn_nameptr) == '/') {
			while (*(cnp->cn_nameptr) == '/') {
				cnp->cn_nameptr++;
				ndp->ni_pathlen--;
			}
			if ((dp = ndp->ni_rootdir) == NULLVP) {
			        error = ENOENT;
				goto error_out;
			}
		}
	}
	/*
	 * only come here if we fail to handle a SYMLINK...
	 * if either ni_dvp or ni_vp is non-NULL, then
	 * we need to drop the iocount that was picked
	 * up in the lookup routine
	 */
	if (ndp->ni_dvp)
	        vnode_put(ndp->ni_dvp);
	if (ndp->ni_vp)
	        vnode_put(ndp->ni_vp);
 error_out:
	if ( (cnp->cn_flags & HASBUF) ) {
		cnp->cn_flags &= ~HASBUF;
		FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI);
	}
	cnp->cn_pnbuf = NULL;
	ndp->ni_vp = NULLVP;
	if (error == ERECYCLE) {
		/* vnode was recycled underneath us. re-drive lookup to start at
		   the beginning again, since recycling invalidated the last lookup */
		ndp->ni_cnd.cn_flags = cnpflags;
		ndp->ni_dvp = usedvp;
		goto vnode_recycled;
	}


	return (error);
}
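
Finally, a worked example of the symlink splice inside the loop above, for the path "/a/link/b" where "link" reads back as "/t" (so linklen = 2).

/*
 * cp           = "/t"		returned by VNOP_READLINK
 * ndp->ni_next = "/b"		unconsumed remainder of the original path
 * bcopy()      => cp = "/t/b"	remainder appended at cp + linklen
 *
 * ni_pathlen grows by linklen, cn_nameptr is reset to the start of the
 * new buffer, and because the new path begins with '/' the loop restarts
 * the translation at ni_rootdir.
 */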