/*
 * The close() system call uses its own audit call to capture the path/vnode
 * information because those pieces are not easily obtained within the system
 * call itself.
 */
void
audit_sysclose(struct thread *td, int fd)
{
    struct kaudit_record *ar;
    struct vnode *vp;
    struct file *fp;
    int vfslocked;

    KASSERT(td != NULL, ("audit_sysclose: td == NULL"));

    ar = currecord();
    if (ar == NULL)
        return;

    audit_arg_fd(fd);

    if (getvnode(td->td_proc->p_fd, fd, &fp) != 0)
        return;

    vp = fp->f_vnode;
    vfslocked = VFS_LOCK_GIANT(vp->v_mount);
    vn_lock(vp, LK_SHARED | LK_RETRY);
    audit_arg_vnode1(vp);
    VOP_UNLOCK(vp, 0);
    VFS_UNLOCK_GIANT(vfslocked);
    fdrop(fp, td);
}
int
kobj_read_file_vnode(struct _buf *file, char *buf, unsigned size, unsigned off)
{
    struct vnode *vp = file->ptr;
    struct thread *td = curthread;
    struct uio auio;
    struct iovec aiov;
    int error, vfslocked;

    bzero(&aiov, sizeof(aiov));
    bzero(&auio, sizeof(auio));

    aiov.iov_base = buf;
    aiov.iov_len = size;

    auio.uio_iov = &aiov;
    auio.uio_offset = (off_t)off;
    auio.uio_segflg = UIO_SYSSPACE;
    auio.uio_rw = UIO_READ;
    auio.uio_iovcnt = 1;
    auio.uio_resid = size;
    auio.uio_td = td;

    vfslocked = VFS_LOCK_GIANT(vp->v_mount);
    vn_lock(vp, LK_SHARED | LK_RETRY);
    error = VOP_READ(vp, &auio, IO_UNIT | IO_SYNC, td->td_ucred);
    VOP_UNLOCK(vp, 0);
    VFS_UNLOCK_GIANT(vfslocked);
    return (error != 0 ? -1 : size - auio.uio_resid);
}
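/*
 * Illustrative only: a hedged sketch of how a kobj consumer might pull a
 * whole file into memory through the Solaris-compat kobj_* wrappers that
 * dispatch to the vnode backend above.  kobj_slurp_file() is a hypothetical
 * helper, the (struct _buf *)-1 failure value and simplified error handling
 * are assumptions; this is not a definitive part of the KPI.
 */
static char *
kobj_slurp_file(const char *path, uint64_t *lenp)
{
    struct _buf *file;
    uint64_t size;
    char *buf;

    file = kobj_open_file(path);
    if (file == (struct _buf *)-1)
        return (NULL);
    if (kobj_get_filesize(file, &size) != 0) {
        kobj_close_file(file);
        return (NULL);
    }
    buf = kmem_alloc((size_t)size, KM_SLEEP);
    /* Read the whole file starting at offset 0. */
    if (kobj_read_file(file, buf, size, 0) != (int)size) {
        kmem_free(buf, (size_t)size);
        buf = NULL;
    } else
        *lenp = size;
    kobj_close_file(file);
    return (buf);
}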
static void *
kobj_open_file_vnode(const char *file)
{
    struct thread *td = curthread;
    struct filedesc *fd;
    struct nameidata nd;
    int error, flags, vfslocked;

    fd = td->td_proc->p_fd;
    FILEDESC_XLOCK(fd);
    if (fd->fd_rdir == NULL) {
        fd->fd_rdir = rootvnode;
        vref(fd->fd_rdir);
    }
    if (fd->fd_cdir == NULL) {
        fd->fd_cdir = rootvnode;
        vref(fd->fd_cdir);
    }
    FILEDESC_XUNLOCK(fd);

    flags = FREAD | O_NOFOLLOW;
    NDINIT(&nd, LOOKUP, MPSAFE, UIO_SYSSPACE, file, td);
    error = vn_open_cred(&nd, &flags, 0, 0, curthread->td_ucred, NULL);
    if (error != 0)
        return (NULL);
    vfslocked = NDHASGIANT(&nd);
    NDFREE(&nd, NDF_ONLY_PNBUF);
    /* We just unlock so we hold a reference. */
    VOP_UNLOCK(nd.ni_vp, 0);
    VFS_UNLOCK_GIANT(vfslocked);
    return (nd.ni_vp);
}
/*
 * Audit information about a file, either the file's vnode info, or its
 * socket address info.
 */
void
audit_arg_file(struct proc *p, struct file *fp)
{
    struct kaudit_record *ar;
    struct socket *so;
    struct inpcb *pcb;
    struct vnode *vp;
    int vfslocked;

    ar = currecord();
    if (ar == NULL)
        return;

    switch (fp->f_type) {
    case DTYPE_VNODE:
    case DTYPE_FIFO:
        /*
         * XXXAUDIT: Only possibly to record as first vnode?
         */
        vp = fp->f_vnode;
        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
        vn_lock(vp, LK_SHARED | LK_RETRY);
        audit_arg_vnode1(vp);
        VOP_UNLOCK(vp, 0);
        VFS_UNLOCK_GIANT(vfslocked);
        break;

    case DTYPE_SOCKET:
        so = (struct socket *)fp->f_data;
        if (INP_CHECK_SOCKAF(so, PF_INET)) {
            SOCK_LOCK(so);
            ar->k_ar.ar_arg_sockinfo.so_type = so->so_type;
            ar->k_ar.ar_arg_sockinfo.so_domain = INP_SOCKAF(so);
            ar->k_ar.ar_arg_sockinfo.so_protocol =
                so->so_proto->pr_protocol;
            SOCK_UNLOCK(so);
            pcb = (struct inpcb *)so->so_pcb;
            INP_RLOCK(pcb);
            ar->k_ar.ar_arg_sockinfo.so_raddr =
                pcb->inp_faddr.s_addr;
            ar->k_ar.ar_arg_sockinfo.so_laddr =
                pcb->inp_laddr.s_addr;
            ar->k_ar.ar_arg_sockinfo.so_rport = pcb->inp_fport;
            ar->k_ar.ar_arg_sockinfo.so_lport = pcb->inp_lport;
            INP_RUNLOCK(pcb);
            ARG_SET_VALID(ar, ARG_SOCKINFO);
        }
        break;

    default:
        /* XXXAUDIT: else? */
        break;
    }
}
int
alq_open_flags(struct alq **alqp, const char *file, struct ucred *cred, int cmode,
    int size, int flags)
{
    struct thread *td;
    struct nameidata nd;
    struct alq *alq;
    int oflags;
    int error;
    int vfslocked;

    KASSERT((size > 0), ("%s: size <= 0", __func__));

    *alqp = NULL;
    td = curthread;

    NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, file, td);
    oflags = FWRITE | O_NOFOLLOW | O_CREAT;

    error = vn_open_cred(&nd, &oflags, cmode, 0, cred, NULL);
    if (error)
        return (error);

    vfslocked = NDHASGIANT(&nd);
    NDFREE(&nd, NDF_ONLY_PNBUF);
    /* We just unlock so we hold a reference */
    VOP_UNLOCK(nd.ni_vp, 0);
    VFS_UNLOCK_GIANT(vfslocked);

    alq = bsd_malloc(sizeof(*alq), M_ALD, M_WAITOK|M_ZERO);
    alq->aq_vp = nd.ni_vp;
    alq->aq_cred = crhold(cred);

    mtx_init(&alq->aq_mtx, "ALD Queue", NULL, MTX_SPIN|MTX_QUIET);

    alq->aq_buflen = size;
    alq->aq_entmax = 0;
    alq->aq_entlen = 0;

    alq->aq_freebytes = alq->aq_buflen;
    alq->aq_entbuf = bsd_malloc(alq->aq_buflen, M_ALD, M_WAITOK|M_ZERO);
    alq->aq_writehead = alq->aq_writetail = 0;
    if (flags & ALQ_ORDERED)
        alq->aq_flags |= AQ_ORDERED;

    if ((error = ald_add(alq)) != 0) {
        alq_destroy(alq);
        return (error);
    }

    *alqp = alq;

    return (0);
}
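/*
 * Illustrative only: a hedged sketch of how an alq(9) consumer might use
 * alq_open_flags() above to append variable-length records to a log file.
 * alq_writen() and alq_close() are assumed to have their usual alq(9)
 * signatures; the path, buffer size, and permissions are arbitrary examples.
 */
static void
alq_usage_example(struct ucred *cred)
{
    struct alq *alq;
    const char msg[] = "hello from the async log queue\n";

    /* 4 KB staging buffer, variable-length records, no special flags. */
    if (alq_open_flags(&alq, "/var/log/example.alq", cred, 0600,
        4096, 0) != 0)
        return;

    /* Queue one record; the ALD worker thread flushes it to the vnode. */
    alq_writen(alq, __DECONST(void *, msg), sizeof(msg) - 1, ALQ_WAITOK);

    alq_close(alq);
}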
void
kobj_close_file(struct _buf *file)
{

    if (file->mounted) {
        struct vnode *vp = file->ptr;
        struct thread *td = curthread;
        int vfslocked;

        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
        vn_close(vp, FREAD, td->td_ucred, td);
        VFS_UNLOCK_GIANT(vfslocked);
    }
    kmem_free(file, sizeof(*file));
}
static int
kobj_get_filesize_vnode(struct _buf *file, uint64_t *size)
{
    struct vnode *vp = file->ptr;
    struct vattr va;
    int error, vfslocked;

    vfslocked = VFS_LOCK_GIANT(vp->v_mount);
    vn_lock(vp, LK_SHARED | LK_RETRY);
    error = VOP_GETATTR(vp, &va, curthread->td_ucred);
    VOP_UNLOCK(vp, 0);
    if (error == 0)
        *size = (uint64_t)va.va_size;
    VFS_UNLOCK_GIANT(vfslocked);
    return (error);
}
static int
vfs_mountroot_readconf(struct thread *td, struct sbuf *sb)
{
    static char buf[128];
    struct nameidata nd;
    off_t ofs;
    ssize_t resid;
    int error, flags, len, vfslocked;

    NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE,
        "/.mount.conf", td);
    flags = FREAD;
    error = vn_open(&nd, &flags, 0, NULL);
    if (error)
        return (error);

    vfslocked = NDHASGIANT(&nd);
    NDFREE(&nd, NDF_ONLY_PNBUF);
    ofs = 0;
    len = sizeof(buf) - 1;
    while (1) {
        error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs,
            UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
            &resid, td);
        if (error)
            break;
        if (resid == len)
            break;
        buf[len - resid] = 0;
        sbuf_printf(sb, "%s", buf);
        ofs += len - resid;
    }

    VOP_UNLOCK(nd.ni_vp, 0);
    vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
    VFS_UNLOCK_GIANT(vfslocked);
    return (error);
}
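/*
 * Illustrative only: a hedged sketch of the caller side, showing how the
 * sbuf filled by vfs_mountroot_readconf() might be consumed with standard
 * sbuf(9) calls.  The helper name and error handling are assumptions, not
 * part of the original code.
 */
static void
mountroot_conf_example(struct thread *td)
{
    struct sbuf *sb;

    sb = sbuf_new(NULL, NULL, 1024, SBUF_AUTOEXTEND);
    if (vfs_mountroot_readconf(td, sb) == 0) {
        sbuf_finish(sb);
        printf("mount.conf contents:\n%s", sbuf_data(sb));
    }
    sbuf_delete(sb);
}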
static void
unlock_and_deallocate(struct faultstate *fs)
{

    vm_object_pip_wakeup(fs->object);
    VM_OBJECT_UNLOCK(fs->object);
    if (fs->object != fs->first_object) {
        VM_OBJECT_LOCK(fs->first_object);
        vm_page_lock(fs->first_m);
        vm_page_free(fs->first_m);
        vm_page_unlock(fs->first_m);
        vm_object_pip_wakeup(fs->first_object);
        VM_OBJECT_UNLOCK(fs->first_object);
        fs->first_m = NULL;
    }
    vm_object_deallocate(fs->first_object);
    unlock_map(fs);
    if (fs->vp != NULL) {
        vput(fs->vp);
        fs->vp = NULL;
    }
    VFS_UNLOCK_GIANT(fs->vfslocked);
    fs->vfslocked = 0;
}
static void mac_proc_vm_revoke_recurse(struct thread *td, struct ucred *cred, struct vm_map *map) { vm_map_entry_t vme; int vfslocked, result; vm_prot_t revokeperms; vm_object_t backing_object, object; vm_ooffset_t offset; struct vnode *vp; struct mount *mp; if (!mac_mmap_revocation) return; vm_map_lock(map); for (vme = map->header.next; vme != &map->header; vme = vme->next) { if (vme->eflags & MAP_ENTRY_IS_SUB_MAP) { mac_proc_vm_revoke_recurse(td, cred, vme->object.sub_map); continue; } /* * Skip over entries that obviously are not shared. */ if (vme->eflags & (MAP_ENTRY_COW | MAP_ENTRY_NOSYNC) || !vme->max_protection) continue; /* * Drill down to the deepest backing object. */ offset = vme->offset; object = vme->object.vm_object; if (object == NULL) continue; VM_OBJECT_LOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_LOCK(backing_object); offset += object->backing_object_offset; VM_OBJECT_UNLOCK(object); object = backing_object; } VM_OBJECT_UNLOCK(object); /* * At the moment, vm_maps and objects aren't considered by * the MAC system, so only things with backing by a normal * object (read: vnodes) are checked. */ if (object->type != OBJT_VNODE) continue; vp = (struct vnode *)object->handle; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); result = vme->max_protection; mac_vnode_check_mmap_downgrade(cred, vp, &result); VOP_UNLOCK(vp, 0); /* * Find out what maximum protection we may be allowing now * but a policy needs to get removed. */ revokeperms = vme->max_protection & ~result; if (!revokeperms) { VFS_UNLOCK_GIANT(vfslocked); continue; } printf("pid %ld: revoking %s perms from %#lx:%ld " "(max %s/cur %s)\n", (long)td->td_proc->p_pid, prot2str(revokeperms), (u_long)vme->start, (long)(vme->end - vme->start), prot2str(vme->max_protection), prot2str(vme->protection)); /* * This is the really simple case: if a map has more * max_protection than is allowed, but it's not being * actually used (that is, the current protection is still * allowed), we can just wipe it out and do nothing more. */ if ((vme->protection & revokeperms) == 0) { vme->max_protection -= revokeperms; } else { if (revokeperms & VM_PROT_WRITE) { /* * In the more complicated case, flush out all * pending changes to the object then turn it * copy-on-write. */ vm_object_reference(object); (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VM_OBJECT_LOCK(object); vm_object_page_clean(object, offset, offset + vme->end - vme->start, OBJPC_SYNC); VM_OBJECT_UNLOCK(object); VOP_UNLOCK(vp, 0); vn_finished_write(mp); vm_object_deallocate(object); /* * Why bother if there's no read permissions * anymore? For the rest, we need to leave * the write permissions on for COW, or * remove them entirely if configured to. */ if (!mac_mmap_revocation_via_cow) { vme->max_protection &= ~VM_PROT_WRITE; vme->protection &= ~VM_PROT_WRITE; } if ((revokeperms & VM_PROT_READ) == 0) vme->eflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; } if (revokeperms & VM_PROT_EXECUTE) { vme->max_protection &= ~VM_PROT_EXECUTE; vme->protection &= ~VM_PROT_EXECUTE; } if (revokeperms & VM_PROT_READ) { vme->max_protection = 0; vme->protection = 0; } pmap_protect(map->pmap, vme->start, vme->end, vme->protection & ~revokeperms); vm_map_simplify_entry(map, vme); } VFS_UNLOCK_GIANT(vfslocked); } vm_map_unlock(map); }
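/*
 * Illustrative only: a worked example of the revocation arithmetic used in
 * mac_proc_vm_revoke_recurse() above.  The protections and the policy
 * result are arbitrary assumptions chosen to make the bit math concrete.
 */
static void
revokeperms_example(void)
{
    vm_prot_t max_protection, result, revokeperms;

    max_protection = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
    result = VM_PROT_READ;        /* what the MAC policy still allows */
    revokeperms = max_protection & ~result;
    /* revokeperms == VM_PROT_WRITE | VM_PROT_EXECUTE: strip write+exec. */
    KASSERT(revokeperms == (VM_PROT_WRITE | VM_PROT_EXECUTE),
        ("unexpected revokeperms"));
}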
/* * Exit: deallocate address space and other resources, change proc state to * zombie, and unlink proc from allproc and parent's lists. Save exit status * and rusage for wait(). Check for child processes and orphan them. */ void exit1(struct thread *td, int rv) { struct proc *p, *nq, *q; struct vnode *vtmp; struct vnode *ttyvp = NULL; #ifdef KTRACE struct vnode *tracevp; struct ucred *tracecred; #endif struct plimit *plim; int locked; mtx_assert(&Giant, MA_NOTOWNED); p = td->td_proc; /* * XXX in case we're rebooting we just let init die in order to * work around an unsolved stack overflow seen very late during * shutdown on sparc64 when the gmirror worker process exists. */ if (p == initproc && rebooting == 0) { printf("init died (signal %d, exit %d)\n", WTERMSIG(rv), WEXITSTATUS(rv)); panic("Going nowhere without my init!"); } /* * MUST abort all other threads before proceeding past here. */ PROC_LOCK(p); while (p->p_flag & P_HADTHREADS) { /* * First check if some other thread got here before us.. * if so, act apropriatly, (exit or suspend); */ thread_suspend_check(0); /* * Kill off the other threads. This requires * some co-operation from other parts of the kernel * so it may not be instantaneous. With this state set * any thread entering the kernel from userspace will * thread_exit() in trap(). Any thread attempting to * sleep will return immediately with EINTR or EWOULDBLOCK * which will hopefully force them to back out to userland * freeing resources as they go. Any thread attempting * to return to userland will thread_exit() from userret(). * thread_exit() will unsuspend us when the last of the * other threads exits. * If there is already a thread singler after resumption, * calling thread_single will fail; in that case, we just * re-check all suspension request, the thread should * either be suspended there or exit. */ if (! thread_single(SINGLE_EXIT)) break; /* * All other activity in this process is now stopped. * Threading support has been turned off. */ } KASSERT(p->p_numthreads == 1, ("exit1: proc %p exiting with %d threads", p, p->p_numthreads)); /* * Wakeup anyone in procfs' PIOCWAIT. They should have a hold * on our vmspace, so we should block below until they have * released their reference to us. Note that if they have * requested S_EXIT stops we will block here until they ack * via PIOCCONT. */ _STOPEVENT(p, S_EXIT, rv); /* * Note that we are exiting and do another wakeup of anyone in * PIOCWAIT in case they aren't listening for S_EXIT stops or * decided to wait again after we told them we are exiting. */ p->p_flag |= P_WEXIT; wakeup(&p->p_stype); /* * Wait for any processes that have a hold on our vmspace to * release their reference. */ while (p->p_lock > 0) msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0); PROC_UNLOCK(p); /* Drain the limit callout while we don't have the proc locked */ callout_drain(&p->p_limco); #ifdef AUDIT /* * The Sun BSM exit token contains two components: an exit status as * passed to exit(), and a return value to indicate what sort of exit * it was. The exit status is WEXITSTATUS(rv), but it's not clear * what the return value is. */ AUDIT_ARG_EXIT(WEXITSTATUS(rv), 0); AUDIT_SYSCALL_EXIT(0, td); #endif /* Are we a task leader? 
*/ if (p == p->p_leader) { mtx_lock(&ppeers_lock); q = p->p_peers; while (q != NULL) { PROC_LOCK(q); psignal(q, SIGKILL); PROC_UNLOCK(q); q = q->p_peers; } while (p->p_peers != NULL) msleep(p, &ppeers_lock, PWAIT, "exit1", 0); mtx_unlock(&ppeers_lock); } /* * Check if any loadable modules need anything done at process exit. * E.g. SYSV IPC stuff * XXX what if one of these generates an error? */ EVENTHANDLER_INVOKE(process_exit, p); /* * If parent is waiting for us to exit or exec, * P_PPWAIT is set; we will wakeup the parent below. */ PROC_LOCK(p); stopprofclock(p); p->p_flag &= ~(P_TRACED | P_PPWAIT); /* * Stop the real interval timer. If the handler is currently * executing, prevent it from rearming itself and let it finish. */ if (timevalisset(&p->p_realtimer.it_value) && callout_stop(&p->p_itcallout) == 0) { timevalclear(&p->p_realtimer.it_interval); msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0); KASSERT(!timevalisset(&p->p_realtimer.it_value), ("realtime timer is still armed")); } PROC_UNLOCK(p); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pid. */ funsetownlst(&p->p_sigiolst); /* * If this process has an nlminfo data area (for lockd), release it */ if (nlminfo_release_p != NULL && p->p_nlminfo != NULL) (*nlminfo_release_p)(p); /* * Close open files and release open-file table. * This may block! */ fdfree(td); /* * If this thread tickled GEOM, we need to wait for the giggling to * stop before we return to userland */ if (td->td_pflags & TDP_GEOM) g_waitidle(); /* * Remove ourself from our leader's peer list and wake our leader. */ mtx_lock(&ppeers_lock); if (p->p_leader->p_peers) { q = p->p_leader; while (q->p_peers != p) q = q->p_peers; q->p_peers = p->p_peers; wakeup(p->p_leader); } mtx_unlock(&ppeers_lock); vmspace_exit(td); sx_xlock(&proctree_lock); if (SESS_LEADER(p)) { struct session *sp = p->p_session; struct tty *tp; /* * s_ttyp is not zero'd; we use this to indicate that * the session once had a controlling terminal. (for * logging and informational purposes) */ SESS_LOCK(sp); ttyvp = sp->s_ttyvp; tp = sp->s_ttyp; sp->s_ttyvp = NULL; sp->s_leader = NULL; SESS_UNLOCK(sp); /* * Signal foreground pgrp and revoke access to * controlling terminal if it has not been revoked * already. * * Because the TTY may have been revoked in the mean * time and could already have a new session associated * with it, make sure we don't send a SIGHUP to a * foreground process group that does not belong to this * session. */ if (tp != NULL) { tty_lock(tp); if (tp->t_session == sp) tty_signal_pgrp(tp, SIGHUP); tty_unlock(tp); } if (ttyvp != NULL) { sx_xunlock(&proctree_lock); if (vn_lock(ttyvp, LK_EXCLUSIVE) == 0) { VOP_REVOKE(ttyvp, REVOKEALL); VOP_UNLOCK(ttyvp, 0); } sx_xlock(&proctree_lock); } } fixjobc(p, p->p_pgrp, 0); sx_xunlock(&proctree_lock); (void)acct_process(td); /* Release the TTY now we've unlocked everything. */ if (ttyvp != NULL) vrele(ttyvp); #ifdef KTRACE /* * Disable tracing, then drain any pending records and release * the trace file. 
*/ if (p->p_traceflag != 0) { PROC_LOCK(p); mtx_lock(&ktrace_mtx); p->p_traceflag = 0; mtx_unlock(&ktrace_mtx); PROC_UNLOCK(p); ktrprocexit(td); PROC_LOCK(p); mtx_lock(&ktrace_mtx); tracevp = p->p_tracevp; p->p_tracevp = NULL; tracecred = p->p_tracecred; p->p_tracecred = NULL; mtx_unlock(&ktrace_mtx); PROC_UNLOCK(p); if (tracevp != NULL) { locked = VFS_LOCK_GIANT(tracevp->v_mount); vrele(tracevp); VFS_UNLOCK_GIANT(locked); } if (tracecred != NULL) crfree(tracecred); } #endif /* * Release reference to text vnode */ if ((vtmp = p->p_textvp) != NULL) { p->p_textvp = NULL; locked = VFS_LOCK_GIANT(vtmp->v_mount); vrele(vtmp); VFS_UNLOCK_GIANT(locked); } /* * Release our limits structure. */ PROC_LOCK(p); plim = p->p_limit; p->p_limit = NULL; PROC_UNLOCK(p); lim_free(plim); /* * Remove proc from allproc queue and pidhash chain. * Place onto zombproc. Unlink from parent's child list. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); LIST_INSERT_HEAD(&zombproc, p, p_list); LIST_REMOVE(p, p_hash); sx_xunlock(&allproc_lock); /* * Call machine-dependent code to release any * machine-dependent resources other than the address space. * The address space is released by "vmspace_exitfree(p)" in * vm_waitproc(). */ cpu_exit(td); WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid); /* * Reparent all of our children to init. */ sx_xlock(&proctree_lock); q = LIST_FIRST(&p->p_children); if (q != NULL) /* only need this if any child is S_ZOMB */ wakeup(initproc); for (; q != NULL; q = nq) { nq = LIST_NEXT(q, p_sibling); PROC_LOCK(q); proc_reparent(q, initproc); q->p_sigparent = SIGCHLD; /* * Traced processes are killed * since their existence means someone is screwing up. */ if (q->p_flag & P_TRACED) { struct thread *temp; q->p_flag &= ~(P_TRACED | P_STOPPED_TRACE); FOREACH_THREAD_IN_PROC(q, temp) temp->td_dbgflags &= ~TDB_SUSPEND; psignal(q, SIGKILL); } PROC_UNLOCK(q); } /* Save exit status. */ PROC_LOCK(p); p->p_xstat = rv; p->p_xthread = td; /* Tell the prison that we are gone. */ prison_proc_free(p->p_ucred->cr_prison); #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exit if it * has declared an interest. */ if (dtrace_fasttrap_exit) dtrace_fasttrap_exit(p); #endif /* * Notify interested parties of our demise. */ KNOTE_LOCKED(&p->p_klist, NOTE_EXIT); #ifdef KDTRACE_HOOKS int reason = CLD_EXITED; if (WCOREDUMP(rv)) reason = CLD_DUMPED; else if (WIFSIGNALED(rv)) reason = CLD_KILLED; SDT_PROBE(proc, kernel, , exit, reason, 0, 0, 0, 0); #endif /* * Just delete all entries in the p_klist. At this point we won't * report any more events, and there are nasty race conditions that * can beat us if we don't. */ knlist_clear(&p->p_klist, 1); /* * Notify parent that we're gone. If parent has the PS_NOCLDWAIT * flag set, or if the handler is set to SIG_IGN, notify process * 1 instead (and hope it will handle this situation). */ PROC_LOCK(p->p_pptr); mtx_lock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) { struct proc *pp; mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); pp = p->p_pptr; PROC_UNLOCK(pp); proc_reparent(p, initproc); p->p_sigparent = SIGCHLD; PROC_LOCK(p->p_pptr); /* * Notify parent, so in case he was wait(2)ing or * executing waitpid(2) with our pid, he will * continue. 
*/ wakeup(pp); } else mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr == initproc) psignal(p->p_pptr, SIGCHLD); else if (p->p_sigparent != 0) { if (p->p_sigparent == SIGCHLD) childproc_exited(p); else /* LINUX thread */ psignal(p->p_pptr, p->p_sigparent); } sx_xunlock(&proctree_lock); /* * The state PRS_ZOMBIE prevents other proesses from sending * signal to the process, to avoid memory leak, we free memory * for signal queue at the time when the state is set. */ sigqueue_flush(&p->p_sigqueue); sigqueue_flush(&td->td_sigqueue); /* * We have to wait until after acquiring all locks before * changing p_state. We need to avoid all possible context * switches (including ones from blocking on a mutex) while * marked as a zombie. We also have to set the zombie state * before we release the parent process' proc lock to avoid * a lost wakeup. So, we first call wakeup, then we grab the * sched lock, update the state, and release the parent process' * proc lock. */ wakeup(p->p_pptr); cv_broadcast(&p->p_pwait); sched_exit(p->p_pptr, td); PROC_SLOCK(p); p->p_state = PRS_ZOMBIE; PROC_UNLOCK(p->p_pptr); /* * Hopefully no one will try to deliver a signal to the process this * late in the game. */ knlist_destroy(&p->p_klist); /* * Save our children's rusage information in our exit rusage. */ ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux); /* * Make sure the scheduler takes this thread out of its tables etc. * This will also release this thread's reference to the ucred. * Other thread parts to release include pcb bits and such. */ thread_exit(); }
/* ARGSUSED */ int auditctl(struct thread *td, struct auditctl_args *uap) { struct nameidata nd; struct ucred *cred; struct vnode *vp; int error = 0; int flags, vfslocked; if (jailed(td->td_ucred)) return (ENOSYS); error = priv_check(td, PRIV_AUDIT_CONTROL); if (error) return (error); vp = NULL; cred = NULL; /* * If a path is specified, open the replacement vnode, perform * validity checks, and grab another reference to the current * credential. * * On Darwin, a NULL path argument is also used to disable audit. */ if (uap->path == NULL) return (EINVAL); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); flags = AUDIT_OPEN_FLAGS; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; #ifdef MAC error = mac_system_check_auditctl(td->td_ucred, vp); VOP_UNLOCK(vp, 0); if (error) { vn_close(vp, AUDIT_CLOSE_FLAGS, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } #else VOP_UNLOCK(vp, 0); #endif NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VREG) { vn_close(vp, AUDIT_CLOSE_FLAGS, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); return (EINVAL); } VFS_UNLOCK_GIANT(vfslocked); cred = td->td_ucred; crhold(cred); /* * XXXAUDIT: Should audit_suspended actually be cleared by * audit_worker? */ audit_suspended = 0; audit_rotate_vnode(cred, vp); return (error); }
static int link_elf_ctf_get(linker_file_t lf, linker_ctf_t *lc) { #ifdef DDB_CTF Elf_Ehdr *hdr = NULL; Elf_Shdr *shdr = NULL; caddr_t ctftab = NULL; caddr_t raw = NULL; caddr_t shstrtab = NULL; elf_file_t ef = (elf_file_t) lf; int flags; int i; int nbytes; ssize_t resid; int vfslocked; size_t sz; struct nameidata nd; struct thread *td = curthread; uint8_t ctf_hdr[CTF_HDR_SIZE]; #endif int error = 0; if (lf == NULL || lc == NULL) return (EINVAL); /* Set the defaults for no CTF present. That's not a crime! */ bzero(lc, sizeof(*lc)); #ifdef DDB_CTF /* * First check if we've tried to load CTF data previously and the * CTF ELF section wasn't found. We flag that condition by setting * ctfcnt to -1. See below. */ if (ef->ctfcnt < 0) return (EFTYPE); /* Now check if we've already loaded the CTF data.. */ if (ef->ctfcnt > 0) { /* We only need to load once. */ lc->ctftab = ef->ctftab; lc->ctfcnt = ef->ctfcnt; lc->symtab = ef->ddbsymtab; lc->strtab = ef->ddbstrtab; lc->strcnt = ef->ddbstrcnt; lc->nsym = ef->ddbsymcnt; lc->ctfoffp = (uint32_t **) &ef->ctfoff; lc->typoffp = (uint32_t **) &ef->typoff; lc->typlenp = &ef->typlen; return (0); } /* * We need to try reading the CTF data. Flag no CTF data present * by default and if we actually succeed in reading it, we'll * update ctfcnt to the number of bytes read. */ ef->ctfcnt = -1; NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, lf->pathname, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); /* Allocate memory for the FLF header. */ if ((hdr = malloc(sizeof(*hdr), M_LINKER, M_WAITOK)) == NULL) { error = ENOMEM; goto out; } /* Read the ELF header. */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, hdr, sizeof(*hdr), 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td)) != 0) goto out; /* Sanity check. */ if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0 || hdr->e_shentsize != sizeof(Elf_Shdr)) { error = ENOEXEC; goto out; } /* Allocate memory for all the section headers */ if ((shdr = malloc(nbytes, M_LINKER, M_WAITOK)) == NULL) { error = ENOMEM; goto out; } /* Read all the section headers */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td)) != 0) goto out; /* * We need to search for the CTF section by name, so if the * section names aren't present, then we can't locate the * .SUNW_ctf section containing the CTF data. */ if (hdr->e_shstrndx == 0 || shdr[hdr->e_shstrndx].sh_type != SHT_STRTAB) { printf("%s(%d): module %s e_shstrndx is %d, sh_type is %d\n", __func__, __LINE__, lf->pathname, hdr->e_shstrndx, shdr[hdr->e_shstrndx].sh_type); error = EFTYPE; goto out; } /* Allocate memory to buffer the section header strings. */ if ((shstrtab = malloc(shdr[hdr->e_shstrndx].sh_size, M_LINKER, M_WAITOK)) == NULL) { error = ENOMEM; goto out; } /* Read the section header strings. */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, shstrtab, shdr[hdr->e_shstrndx].sh_size, shdr[hdr->e_shstrndx].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td)) != 0) goto out; /* Search for the section containing the CTF data. */ for (i = 0; i < hdr->e_shnum; i++) if (strcmp(".SUNW_ctf", shstrtab + shdr[i].sh_name) == 0) break; /* Check if the CTF section wasn't found. 
*/ if (i >= hdr->e_shnum) { printf("%s(%d): module %s has no .SUNW_ctf section\n", __func__, __LINE__, lf->pathname); error = EFTYPE; goto out; } /* Read the CTF header. */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, ctf_hdr, sizeof(ctf_hdr), shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td)) != 0) goto out; /* Check the CTF magic number. (XXX check for big endian!) */ if (ctf_hdr[0] != 0xf1 || ctf_hdr[1] != 0xcf) { printf("%s(%d): module %s has invalid format\n", __func__, __LINE__, lf->pathname); error = EFTYPE; goto out; } /* Check if version 2. */ if (ctf_hdr[2] != 2) { printf("%s(%d): module %s CTF format version is %d " "(2 expected)\n", __func__, __LINE__, lf->pathname, ctf_hdr[2]); error = EFTYPE; goto out; } /* Check if the data is compressed. */ if ((ctf_hdr[3] & 0x1) != 0) { uint32_t *u32 = (uint32_t *) ctf_hdr; /* * The last two fields in the CTF header are the offset * from the end of the header to the start of the string * data and the length of that string data. se this * information to determine the decompressed CTF data * buffer required. */ sz = u32[CTF_HDR_STRTAB_U32] + u32[CTF_HDR_STRLEN_U32] + sizeof(ctf_hdr); /* * Allocate memory for the compressed CTF data, including * the header (which isn't compressed). */ if ((raw = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK)) == NULL) { error = ENOMEM; goto out; } } else { /* * The CTF data is not compressed, so the ELF section * size is the same as the buffer size required. */ sz = shdr[i].sh_size; } /* * Allocate memory to buffer the CTF data in it's decompressed * form. */ if ((ctftab = malloc(sz, M_LINKER, M_WAITOK)) == NULL) { error = ENOMEM; goto out; } /* * Read the CTF data into the raw buffer if compressed, or * directly into the CTF buffer otherwise. */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, raw == NULL ? ctftab : raw, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td)) != 0) goto out; /* Check if decompression is required. */ if (raw != NULL) { z_stream zs; int ret; /* * The header isn't compressed, so copy that into the * CTF buffer first. */ bcopy(ctf_hdr, ctftab, sizeof(ctf_hdr)); /* Initialise the zlib structure. */ bzero(&zs, sizeof(zs)); zs.zalloc = z_alloc; zs.zfree = z_free; if (inflateInit(&zs) != Z_OK) { error = EIO; goto out; } zs.avail_in = shdr[i].sh_size - sizeof(ctf_hdr); zs.next_in = ((uint8_t *) raw) + sizeof(ctf_hdr); zs.avail_out = sz - sizeof(ctf_hdr); zs.next_out = ((uint8_t *) ctftab) + sizeof(ctf_hdr); if ((ret = inflate(&zs, Z_FINISH)) != Z_STREAM_END) { printf("%s(%d): zlib inflate returned %d\n", __func__, __LINE__, ret); error = EIO; goto out; } } /* Got the CTF data! */ ef->ctftab = ctftab; ef->ctfcnt = shdr[i].sh_size; /* We'll retain the memory allocated for the CTF data. */ ctftab = NULL; /* Let the caller use the CTF data read. */ lc->ctftab = ef->ctftab; lc->ctfcnt = ef->ctfcnt; lc->symtab = ef->ddbsymtab; lc->strtab = ef->ddbstrtab; lc->strcnt = ef->ddbstrcnt; lc->nsym = ef->ddbsymcnt; lc->ctfoffp = (uint32_t **) &ef->ctfoff; lc->typoffp = (uint32_t **) &ef->typoff; lc->typlenp = &ef->typlen; out: VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); if (hdr != NULL) free(hdr, M_LINKER); if (shdr != NULL) free(shdr, M_LINKER); if (shstrtab != NULL) free(shstrtab, M_LINKER); if (ctftab != NULL) free(ctftab, M_LINKER); if (raw != NULL) free(raw, M_LINKER); #else error = EOPNOTSUPP; #endif return (error); }
static int link_elf_load_file(linker_class_t cls, const char *filename, linker_file_t *result) { struct nameidata nd; struct thread *td = curthread; /* XXX */ Elf_Ehdr *hdr; Elf_Shdr *shdr; Elf_Sym *es; int nbytes, i, j; vm_offset_t mapbase; size_t mapsize; int error = 0; int resid, flags; elf_file_t ef; linker_file_t lf; int symtabindex; int symstrindex; int shstrindex; int nsym; int pb, rl, ra; int alignmask; int vfslocked; shdr = NULL; lf = NULL; mapsize = 0; hdr = NULL; NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, filename, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) return error; vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp->v_type != VREG) { error = ENOEXEC; goto out; } #ifdef MAC error = mac_kld_check_load(td->td_ucred, nd.ni_vp); if (error) { goto out; } #endif /* Read the elf header from the file. */ hdr = malloc(sizeof(*hdr), M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)hdr, sizeof(*hdr), 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = ENOEXEC; goto out; } if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { link_elf_error(filename, "Unsupported file layout"); error = ENOEXEC; goto out; } if (hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT) { link_elf_error(filename, "Unsupported file version"); error = ENOEXEC; goto out; } if (hdr->e_type != ET_REL) { error = ENOSYS; goto out; } if (hdr->e_machine != ELF_TARG_MACH) { link_elf_error(filename, "Unsupported machine"); error = ENOEXEC; goto out; } lf = linker_make_file(filename, &link_elf_class); if (!lf) { error = ENOMEM; goto out; } ef = (elf_file_t) lf; ef->nprogtab = 0; ef->e_shdr = 0; ef->nreltab = 0; ef->nrelatab = 0; /* Allocate and read in the section header */ nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0 || hdr->e_shentsize != sizeof(Elf_Shdr)) { error = ENOEXEC; goto out; } shdr = malloc(nbytes, M_LINKER, M_WAITOK); ef->e_shdr = shdr; error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid) { error = ENOEXEC; goto out; } /* Scan the section header for information and table sizing. 
*/ nsym = 0; symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: ef->nprogtab++; break; case SHT_SYMTAB: nsym++; symtabindex = i; symstrindex = shdr[i].sh_link; break; case SHT_REL: ef->nreltab++; break; case SHT_RELA: ef->nrelatab++; break; case SHT_STRTAB: break; } } if (ef->nprogtab == 0) { link_elf_error(filename, "file has no contents"); error = ENOEXEC; goto out; } if (nsym != 1) { /* Only allow one symbol table for now */ link_elf_error(filename, "file has no valid symbol table"); error = ENOEXEC; goto out; } if (symstrindex < 0 || symstrindex > hdr->e_shnum || shdr[symstrindex].sh_type != SHT_STRTAB) { link_elf_error(filename, "file has invalid symbol strings"); error = ENOEXEC; goto out; } /* Allocate space for tracking the load chunks */ if (ef->nprogtab != 0) ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nreltab != 0) ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nrelatab != 0) ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab), M_LINKER, M_WAITOK | M_ZERO); if (symtabindex == -1) panic("lost symbol table index"); /* Allocate space for and load the symbol table */ ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym); ef->ddbsymtab = malloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)ef->ddbsymtab, shdr[symtabindex].sh_size, shdr[symtabindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } if (symstrindex == -1) panic("lost symbol string index"); /* Allocate space for and load the symbol strings */ ef->ddbstrcnt = shdr[symstrindex].sh_size; ef->ddbstrtab = malloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, ef->ddbstrtab, shdr[symstrindex].sh_size, shdr[symstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } /* Do we have a string table for the section names? */ shstrindex = -1; if (hdr->e_shstrndx != 0 && shdr[hdr->e_shstrndx].sh_type == SHT_STRTAB) { shstrindex = hdr->e_shstrndx; ef->shstrcnt = shdr[shstrindex].sh_size; ef->shstrtab = malloc(shdr[shstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, ef->shstrtab, shdr[shstrindex].sh_size, shdr[shstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } } /* Size up code/data(progbits) and bss(nobits). */ alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: alignmask = shdr[i].sh_addralign - 1; mapsize += alignmask; mapsize &= ~alignmask; mapsize += shdr[i].sh_size; break; } } /* * We know how much space we need for the text/data/bss/etc. * This stuff needs to be in a single chunk so that profiling etc * can get the bounds and gdb can associate offsets with modules */ ef->object = vm_object_allocate(OBJT_DEFAULT, round_page(mapsize) >> PAGE_SHIFT); if (ef->object == NULL) { error = ENOMEM; goto out; } ef->address = (caddr_t) vm_map_min(kernel_map); /* * In order to satisfy amd64's architectural requirements on the * location of code and data in the kernel's address space, request a * mapping that is above the kernel. 
*/ mapbase = KERNBASE; error = vm_map_find(kernel_map, ef->object, 0, &mapbase, round_page(mapsize), TRUE, VM_PROT_ALL, VM_PROT_ALL, FALSE); if (error) { vm_object_deallocate(ef->object); ef->object = 0; goto out; } /* Wire the pages */ error = vm_map_wire(kernel_map, mapbase, mapbase + round_page(mapsize), VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { error = ENOMEM; goto out; } /* Inform the kld system about the situation */ lf->address = ef->address = (caddr_t)mapbase; lf->size = mapsize; /* * Now load code/data(progbits), zero bss(nobits), allocate space for * and load relocs */ pb = 0; rl = 0; ra = 0; alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: alignmask = shdr[i].sh_addralign - 1; mapbase += alignmask; mapbase &= ~alignmask; if (ef->shstrtab && shdr[i].sh_name != 0) ef->progtab[pb].name = ef->shstrtab + shdr[i].sh_name; else if (shdr[i].sh_type == SHT_PROGBITS) ef->progtab[pb].name = "<<PROGBITS>>"; else ef->progtab[pb].name = "<<NOBITS>>"; if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) ef->progtab[pb].addr = dpcpu_alloc(shdr[i].sh_size); #ifdef VIMAGE else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) ef->progtab[pb].addr = vnet_data_alloc(shdr[i].sh_size); #endif else ef->progtab[pb].addr = (void *)(uintptr_t)mapbase; if (ef->progtab[pb].addr == NULL) { error = ENOSPC; goto out; } ef->progtab[pb].size = shdr[i].sh_size; ef->progtab[pb].sec = i; if (shdr[i].sh_type == SHT_PROGBITS) { error = vn_rdwr(UIO_READ, nd.ni_vp, ef->progtab[pb].addr, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } /* Initialize the per-cpu or vnet area. */ if (ef->progtab[pb].addr != (void *)mapbase && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) dpcpu_copy(ef->progtab[pb].addr, shdr[i].sh_size); #ifdef VIMAGE else if (ef->progtab[pb].addr != (void *)mapbase && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) vnet_data_copy(ef->progtab[pb].addr, shdr[i].sh_size); #endif } else bzero(ef->progtab[pb].addr, shdr[i].sh_size); /* Update all symbol values with the offset. 
*/ for (j = 0; j < ef->ddbsymcnt; j++) { es = &ef->ddbsymtab[j]; if (es->st_shndx != i) continue; es->st_value += (Elf_Addr)ef->progtab[pb].addr; } mapbase += shdr[i].sh_size; pb++; break; case SHT_REL: ef->reltab[rl].rel = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel); ef->reltab[rl].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)ef->reltab[rl].rel, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } rl++; break; case SHT_RELA: ef->relatab[ra].rela = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->relatab[ra].nrela = shdr[i].sh_size / sizeof(Elf_Rela); ef->relatab[ra].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)ef->relatab[ra].rela, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } ra++; break; } } if (pb != ef->nprogtab) panic("lost progbits"); if (rl != ef->nreltab) panic("lost reltab"); if (ra != ef->nrelatab) panic("lost relatab"); if (mapbase != (vm_offset_t)ef->address + mapsize) panic("mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)\n", (u_long)mapbase, ef->address, (u_long)mapsize, (u_long)(vm_offset_t)ef->address + mapsize); /* Local intra-module relocations */ link_elf_reloc_local(lf); /* Pull in dependencies */ VOP_UNLOCK(nd.ni_vp, 0); error = linker_load_dependencies(lf); vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY); if (error) goto out; /* External relocations */ error = relocate_file(ef); if (error) goto out; /* Notify MD code that a module is being loaded. */ error = elf_cpu_load_file(lf); if (error) goto out; *result = lf; out: VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); if (error && lf) linker_file_unload(lf, LINKER_UNLOAD_FORCE); if (hdr) free(hdr, M_LINKER); return error; }
void zfs_rmnode(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; znode_t *xzp = NULL; dmu_tx_t *tx; uint64_t acl_obj; int error; int vfslocked; vfslocked = VFS_LOCK_GIANT(zfsvfs->z_vfs); ASSERT(zp->z_phys->zp_links == 0); /* * If this is a ZIL replay then leave the object in the unlinked set. * Otherwise we can get a deadlock, because the delete can be * quite large and span multiple tx's and txgs, but each replay * creates a tx to atomically run the replay function and mark the * replay record as complete. We deadlock trying to start a tx in * a new txg to further the deletion but can't because the replay * tx hasn't finished. * * We actually delete the object if we get a failure to create an * object in zil_replay_log_record(), or after calling zil_replay(). */ if (zfsvfs->z_assign >= TXG_INITIAL) { zfs_znode_dmu_fini(zp); zfs_znode_free(zp); return; } /* * If this is an attribute directory, purge its contents. */ if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) { if (zfs_purgedir(zp) != 0) { /* * Not enough space to delete some xattrs. * Leave it in the unlinked set. */ zfs_znode_dmu_fini(zp); zfs_znode_free(zp); VFS_UNLOCK_GIANT(vfslocked); return; } } /* * Free up all the data in the file. */ error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); if (error) { /* * Not enough space. Leave the file in the unlinked set. */ zfs_znode_dmu_fini(zp); zfs_znode_free(zp); return; } /* * If the file has extended attributes, we're going to unlink * the xattr dir. */ if (zp->z_phys->zp_xattr) { error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); ASSERT(error == 0); } acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; /* * Set up the final transaction. */ tx = dmu_tx_create(os); dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); if (xzp) { dmu_tx_hold_bonus(tx, xzp->z_id); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); } if (acl_obj) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { /* * Not enough space to delete the file. Leave it in the * unlinked set, leaking it until the fs is remounted (at * which point we'll call zfs_unlinked_drain() to process it). */ dmu_tx_abort(tx); zfs_znode_dmu_fini(zp); zfs_znode_free(zp); goto out; } if (xzp) { dmu_buf_will_dirty(xzp->z_dbuf, tx); mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ xzp->z_phys->zp_links = 0; /* no more links to it */ mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); } /* Remove this znode from the unlinked set */ VERIFY3U(0, ==, zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); zfs_znode_delete(zp, tx); dmu_tx_commit(tx); out: if (xzp) VN_RELE(ZTOV(xzp)); VFS_UNLOCK_GIANT(vfslocked); }
/* * Flush all pending data to disk. This operation will block. */ static int alq_doio(struct alq *alq) { struct thread *td; struct mount *mp; struct vnode *vp; struct uio auio; struct iovec aiov[2]; int totlen; int iov; int vfslocked; int wrapearly; KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__)); vp = alq->aq_vp; td = curthread; totlen = 0; iov = 1; wrapearly = alq->aq_wrapearly; bzero(&aiov, sizeof(aiov)); bzero(&auio, sizeof(auio)); /* Start the write from the location of our buffer tail pointer. */ aiov[0].iov_base = alq->aq_entbuf + alq->aq_writetail; if (alq->aq_writetail < alq->aq_writehead) { /* Buffer not wrapped. */ totlen = aiov[0].iov_len = alq->aq_writehead - alq->aq_writetail; } else if (alq->aq_writehead == 0) { /* Buffer not wrapped (special case to avoid an empty iov). */ totlen = aiov[0].iov_len = alq->aq_buflen - alq->aq_writetail - wrapearly; } else { /* * Buffer wrapped, requires 2 aiov entries: * - first is from writetail to end of buffer * - second is from start of buffer to writehead */ aiov[0].iov_len = alq->aq_buflen - alq->aq_writetail - wrapearly; iov++; aiov[1].iov_base = alq->aq_entbuf; aiov[1].iov_len = alq->aq_writehead; totlen = aiov[0].iov_len + aiov[1].iov_len; } alq->aq_flags |= AQ_FLUSHING; ALQ_UNLOCK(alq); auio.uio_iov = &aiov[0]; auio.uio_offset = 0; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_iovcnt = iov; auio.uio_resid = totlen; auio.uio_td = td; /* * Do all of the junk required to write now. */ vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * XXX: VOP_WRITE error checks are ignored. */ #ifdef MAC if (mac_vnode_check_write(alq->aq_cred, NOCRED, vp) == 0) #endif VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, alq->aq_cred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); ALQ_LOCK(alq); alq->aq_flags &= ~AQ_FLUSHING; /* Adjust writetail as required, taking into account wrapping. */ alq->aq_writetail = (alq->aq_writetail + totlen + wrapearly) % alq->aq_buflen; alq->aq_freebytes += totlen + wrapearly; /* * If we just flushed part of the buffer which wrapped, reset the * wrapearly indicator. */ if (wrapearly) alq->aq_wrapearly = 0; /* * If we just flushed the buffer completely, reset indexes to 0 to * minimise buffer wraps. * This is also required to ensure alq_getn() can't wedge itself. */ if (!HAS_PENDING_DATA(alq)) alq->aq_writehead = alq->aq_writetail = 0; KASSERT((alq->aq_writetail >= 0 && alq->aq_writetail < alq->aq_buflen), ("%s: aq_writetail < 0 || aq_writetail >= aq_buflen", __func__)); if (alq->aq_flags & AQ_WANTED) { alq->aq_flags &= ~AQ_WANTED; return (1); } return(0); }
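/*
 * Illustrative only: a worked example of the two-iovec split computed by
 * alq_doio() above when the ring buffer has wrapped.  The buffer size and
 * index values are arbitrary assumptions.
 */
static void
alq_wrap_example(void)
{
    char entbuf[1024];        /* stands in for aq_entbuf, aq_buflen == 1024 */
    int writetail = 900;      /* oldest unflushed byte */
    int writehead = 100;      /* next byte to be written */
    int wrapearly = 0;
    int totlen;
    struct iovec aiov[2];

    /* First chunk: from the tail to the end of the buffer. */
    aiov[0].iov_base = entbuf + writetail;
    aiov[0].iov_len = sizeof(entbuf) - writetail - wrapearly;    /* 124 */
    /* Second chunk: from the start of the buffer up to the head. */
    aiov[1].iov_base = entbuf;
    aiov[1].iov_len = writehead;                                 /* 100 */
    totlen = aiov[0].iov_len + aiov[1].iov_len;
    /* totlen == 224 bytes go out in a single VOP_WRITE() of two iovecs. */
    (void)totlen;
}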
/* * Clean up the unionfs node. */ void unionfs_noderem(struct vnode *vp, struct thread *td) { int vfslocked; int count; struct unionfs_node *unp, *unp_t1, *unp_t2; struct unionfs_node_hashhead *hd; struct unionfs_node_status *unsp, *unsp_tmp; struct vnode *lvp; struct vnode *uvp; struct vnode *dvp; /* * Use the interlock to protect the clearing of v_data to * prevent faults in unionfs_lock(). */ VI_LOCK(vp); unp = VTOUNIONFS(vp); lvp = unp->un_lowervp; uvp = unp->un_uppervp; dvp = unp->un_dvp; unp->un_lowervp = unp->un_uppervp = NULLVP; vp->v_vnlock = &(vp->v_lock); vp->v_data = NULL; lockmgr(vp->v_vnlock, LK_EXCLUSIVE | LK_INTERLOCK, VI_MTX(vp), td); if (lvp != NULLVP) VOP_UNLOCK(lvp, 0, td); if (uvp != NULLVP) VOP_UNLOCK(uvp, 0, td); vp->v_object = NULL; if (dvp != NULLVP && unp->un_hash.le_prev != NULL) unionfs_rem_cached_vnode(unp, dvp); if (lvp != NULLVP) { vfslocked = VFS_LOCK_GIANT(lvp->v_mount); vrele(lvp); VFS_UNLOCK_GIANT(vfslocked); } if (uvp != NULLVP) { vfslocked = VFS_LOCK_GIANT(uvp->v_mount); vrele(uvp); VFS_UNLOCK_GIANT(vfslocked); } if (dvp != NULLVP) { vfslocked = VFS_LOCK_GIANT(dvp->v_mount); vrele(dvp); VFS_UNLOCK_GIANT(vfslocked); unp->un_dvp = NULLVP; } if (unp->un_path != NULL) { free(unp->un_path, M_UNIONFSPATH); unp->un_path = NULL; } if (unp->un_hashtbl != NULL) { for (count = 0; count <= unp->un_hashmask; count++) { hd = unp->un_hashtbl + count; LIST_FOREACH_SAFE(unp_t1, hd, un_hash, unp_t2) { LIST_REMOVE(unp_t1, un_hash); unp_t1->un_hash.le_next = NULL; unp_t1->un_hash.le_prev = NULL; } } hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, unp->un_hashmask); }
/* * Q_QUOTAON - set up a quota file for a particular filesystem. */ int quotaon(struct thread *td, struct mount *mp, int type, void *fname) { struct ufsmount *ump; struct vnode *vp, **vpp; struct vnode *mvp; struct dquot *dq; int error, flags, vfslocked; struct nameidata nd; error = priv_check(td, PRIV_UFS_QUOTAON); if (error) return (error); if (mp->mnt_flag & MNT_RDONLY) return (EROFS); ump = VFSTOUFS(mp); dq = NODQUOT; NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_USERSPACE, fname, td); flags = FREAD | FWRITE; vfs_ref(mp); vfs_unbusy(mp); error = vn_open(&nd, &flags, 0, NULL); if (error != 0) { vfs_rel(mp); return (error); } vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; error = vfs_busy(mp, MBF_NOWAIT); vfs_rel(mp); if (error == 0) { if (vp->v_type != VREG) { error = EACCES; vfs_unbusy(mp); } } if (error != 0) { VOP_UNLOCK(vp, 0); (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } UFS_LOCK(ump); if ((ump->um_qflags[type] & (QTF_OPENING|QTF_CLOSING)) != 0) { UFS_UNLOCK(ump); VOP_UNLOCK(vp, 0); (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); vfs_unbusy(mp); return (EALREADY); } ump->um_qflags[type] |= QTF_OPENING|QTF_CLOSING; UFS_UNLOCK(ump); if ((error = dqopen(vp, ump, type)) != 0) { VOP_UNLOCK(vp, 0); UFS_LOCK(ump); ump->um_qflags[type] &= ~(QTF_OPENING|QTF_CLOSING); UFS_UNLOCK(ump); (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); vfs_unbusy(mp); return (error); } VOP_UNLOCK(vp, 0); MNT_ILOCK(mp); mp->mnt_flag |= MNT_QUOTA; MNT_IUNLOCK(mp); vpp = &ump->um_quotas[type]; if (*vpp != vp) quotaoff1(td, mp, type); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_SYSTEM; VOP_UNLOCK(vp, 0); *vpp = vp; VFS_UNLOCK_GIANT(vfslocked); /* * Save the credential of the process that turned on quotas. * Set up the time limits for this quota. */ ump->um_cred[type] = crhold(td->td_ucred); ump->um_btime[type] = MAX_DQ_TIME; ump->um_itime[type] = MAX_IQ_TIME; if (dqget(NULLVP, 0, ump, type, &dq) == 0) { if (dq->dq_btime > 0) ump->um_btime[type] = dq->dq_btime; if (dq->dq_itime > 0) ump->um_itime[type] = dq->dq_itime; dqrele(NULLVP, dq); } /* * Allow the getdq from getinoquota below to read the quota * from file. */ UFS_LOCK(ump); ump->um_qflags[type] &= ~QTF_CLOSING; UFS_UNLOCK(ump); /* * Search vnodes associated with this mount point, * adding references to quota file being opened. * NB: only need to add dquot's for inodes being modified. */ again: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto again; } if (vp->v_type == VNON || vp->v_writecount == 0) { VOP_UNLOCK(vp, 0); vrele(vp); continue; } error = getinoquota(VTOI(vp)); VOP_UNLOCK(vp, 0); vrele(vp); if (error) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); break; } } if (error) quotaoff_inchange(td, mp, type); UFS_LOCK(ump); ump->um_qflags[type] &= ~QTF_OPENING; KASSERT((ump->um_qflags[type] & QTF_CLOSING) == 0, ("quotaon: leaking flags")); UFS_UNLOCK(ump); vfs_unbusy(mp); return (error); }
/*
 * Main code to turn off disk quotas for a filesystem. Does not change
 * flags.
 */
static int
quotaoff1(struct thread *td, struct mount *mp, int type)
{
    struct vnode *vp;
    struct vnode *qvp, *mvp;
    struct ufsmount *ump;
    struct dquot *dq;
    struct inode *ip;
    struct ucred *cr;
    int vfslocked;
    int error;

    ump = VFSTOUFS(mp);

    UFS_LOCK(ump);
    KASSERT((ump->um_qflags[type] & QTF_CLOSING) != 0,
        ("quotaoff1: flags are invalid"));
    if ((qvp = ump->um_quotas[type]) == NULLVP) {
        UFS_UNLOCK(ump);
        return (0);
    }
    cr = ump->um_cred[type];
    UFS_UNLOCK(ump);

    /*
     * Search vnodes associated with this mount point,
     * deleting any references to quota file being closed.
     */
again:
    MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
        if (vp->v_type == VNON) {
            VI_UNLOCK(vp);
            continue;
        }
        if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
            MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
            goto again;
        }
        ip = VTOI(vp);
        dq = ip->i_dquot[type];
        ip->i_dquot[type] = NODQUOT;
        dqrele(vp, dq);
        VOP_UNLOCK(vp, 0);
        vrele(vp);
    }

    dqflush(qvp);
    /*
     * Clear um_quotas before closing the quota vnode to prevent
     * access to the closed vnode from dqget/dqsync
     */
    UFS_LOCK(ump);
    ump->um_quotas[type] = NULLVP;
    ump->um_cred[type] = NOCRED;
    UFS_UNLOCK(ump);

    vfslocked = VFS_LOCK_GIANT(qvp->v_mount);
    vn_lock(qvp, LK_EXCLUSIVE | LK_RETRY);
    qvp->v_vflag &= ~VV_SYSTEM;
    VOP_UNLOCK(qvp, 0);
    error = vn_close(qvp, FREAD|FWRITE, td->td_ucred, td);
    VFS_UNLOCK_GIANT(vfslocked);
    crfree(cr);

    return (error);
}
/* * Flush all pending data to disk. This operation will block. */ static int alq_doio(struct alq *alq) { struct thread *td; struct mount *mp; struct vnode *vp; struct uio auio; struct iovec aiov[2]; struct ale *ale; struct ale *alstart; int totlen; int iov; int vfslocked; vp = alq->aq_vp; td = curthread; totlen = 0; iov = 0; alstart = ale = alq->aq_entvalid; alq->aq_entvalid = NULL; bzero(&aiov, sizeof(aiov)); bzero(&auio, sizeof(auio)); do { if (aiov[iov].iov_base == NULL) aiov[iov].iov_base = ale->ae_data; aiov[iov].iov_len += alq->aq_entlen; totlen += alq->aq_entlen; /* Check to see if we're wrapping the buffer */ if (ale->ae_data + alq->aq_entlen != ale->ae_next->ae_data) iov++; ale->ae_flags &= ~AE_VALID; ale = ale->ae_next; } while (ale->ae_flags & AE_VALID); alq->aq_flags |= AQ_FLUSHING; ALQ_UNLOCK(alq); if (iov == 2 || aiov[iov].iov_base == NULL) iov--; auio.uio_iov = &aiov[0]; auio.uio_offset = 0; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_iovcnt = iov + 1; auio.uio_resid = totlen; auio.uio_td = td; /* * Do all of the junk required to write now. */ vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * XXX: VOP_WRITE error checks are ignored. */ #ifdef MAC if (mac_vnode_check_write(alq->aq_cred, NOCRED, vp) == 0) #endif VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, alq->aq_cred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); ALQ_LOCK(alq); alq->aq_flags &= ~AQ_FLUSHING; if (alq->aq_entfree == NULL) alq->aq_entfree = alstart; if (alq->aq_flags & AQ_WANTED) { alq->aq_flags &= ~AQ_WANTED; return (1); } return(0); }
/* * Obtain a dquot structure for the specified identifier and quota file * reading the information from the file if necessary. */ static int dqget(struct vnode *vp, u_long id, struct ufsmount *ump, int type, struct dquot **dqp) { uint8_t buf[sizeof(struct dqblk64)]; off_t base, recsize; struct dquot *dq, *dq1; struct dqhash *dqh; struct vnode *dqvp; struct iovec aiov; struct uio auio; int vfslocked, dqvplocked, error; #ifdef DEBUG_VFS_LOCKS if (vp != NULLVP) ASSERT_VOP_ELOCKED(vp, "dqget"); #endif if (vp != NULLVP && *dqp != NODQUOT) { return (0); } /* XXX: Disallow negative id values to prevent the * creation of 100GB+ quota data files. */ if ((int)id < 0) return (EINVAL); UFS_LOCK(ump); dqvp = ump->um_quotas[type]; if (dqvp == NULLVP || (ump->um_qflags[type] & QTF_CLOSING)) { *dqp = NODQUOT; UFS_UNLOCK(ump); return (EINVAL); } vref(dqvp); UFS_UNLOCK(ump); error = 0; dqvplocked = 0; /* * Check the cache first. */ dqh = DQHASH(dqvp, id); DQH_LOCK(); dq = dqhashfind(dqh, id, dqvp); if (dq != NULL) { DQH_UNLOCK(); hfound: DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "dqget"); DQI_UNLOCK(dq); if (dq->dq_ump == NULL) { dqrele(vp, dq); dq = NODQUOT; error = EIO; } *dqp = dq; vfslocked = VFS_LOCK_GIANT(dqvp->v_mount); if (dqvplocked) vput(dqvp); else vrele(dqvp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Quota vnode lock is before DQ_LOCK. Acquire dqvp lock there * since new dq will appear on the hash chain DQ_LOCKed. */ if (vp != dqvp) { DQH_UNLOCK(); vn_lock(dqvp, LK_SHARED | LK_RETRY); dqvplocked = 1; DQH_LOCK(); /* * Recheck the cache after sleep for quota vnode lock. */ dq = dqhashfind(dqh, id, dqvp); if (dq != NULL) { DQH_UNLOCK(); goto hfound; } } /* * Not in cache, allocate a new one or take it from the * free list. */ if (TAILQ_FIRST(&dqfreelist) == NODQUOT && numdquot < MAXQUOTAS * desiredvnodes) desireddquot += DQUOTINC; if (numdquot < desireddquot) { numdquot++; DQH_UNLOCK(); dq1 = malloc(sizeof *dq1, M_DQUOT, M_WAITOK | M_ZERO); mtx_init(&dq1->dq_lock, "dqlock", NULL, MTX_DEF); DQH_LOCK(); /* * Recheck the cache after sleep for memory. */ dq = dqhashfind(dqh, id, dqvp); if (dq != NULL) { numdquot--; DQH_UNLOCK(); mtx_destroy(&dq1->dq_lock); free(dq1, M_DQUOT); goto hfound; } dq = dq1; } else { if ((dq = TAILQ_FIRST(&dqfreelist)) == NULL) { DQH_UNLOCK(); tablefull("dquot"); *dqp = NODQUOT; vfslocked = VFS_LOCK_GIANT(dqvp->v_mount); if (dqvplocked) vput(dqvp); else vrele(dqvp); VFS_UNLOCK_GIANT(vfslocked); return (EUSERS); } if (dq->dq_cnt || (dq->dq_flags & DQ_MOD)) panic("dqget: free dquot isn't %p", dq); TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); if (dq->dq_ump != NULL) LIST_REMOVE(dq, dq_hash); } /* * Dq is put into hash already locked to prevent parallel * usage while it is being read from file. */ dq->dq_flags = DQ_LOCK; dq->dq_id = id; dq->dq_type = type; dq->dq_ump = ump; LIST_INSERT_HEAD(dqh, dq, dq_hash); DQREF(dq); DQH_UNLOCK(); /* * Read the requested quota record from the quota file, performing * any necessary conversions. 
*/ if (ump->um_qflags[type] & QTF_64BIT) { recsize = sizeof(struct dqblk64); base = sizeof(struct dqhdr64); } else { recsize = sizeof(struct dqblk32); base = 0; } auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = buf; aiov.iov_len = recsize; auio.uio_resid = recsize; auio.uio_offset = base + id * recsize; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = (struct thread *)0; vfslocked = VFS_LOCK_GIANT(dqvp->v_mount); error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]); if (auio.uio_resid == recsize && error == 0) { bzero(&dq->dq_dqb, sizeof(dq->dq_dqb)); } else { if (ump->um_qflags[type] & QTF_64BIT) dqb64_dq((struct dqblk64 *)buf, dq); else dqb32_dq((struct dqblk32 *)buf, dq); } if (dqvplocked) vput(dqvp); else vrele(dqvp); VFS_UNLOCK_GIANT(vfslocked); /* * I/O error in reading quota file, release * quota structure and reflect problem to caller. */ if (error) { DQH_LOCK(); dq->dq_ump = NULL; LIST_REMOVE(dq, dq_hash); DQH_UNLOCK(); DQI_LOCK(dq); if (dq->dq_flags & DQ_WANT) wakeup(dq); dq->dq_flags = 0; DQI_UNLOCK(dq); dqrele(vp, dq); *dqp = NODQUOT; return (error); } DQI_LOCK(dq); /* * Check for no limit to enforce. * Initialize time values if necessary. */ if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) dq->dq_flags |= DQ_FAKE; if (dq->dq_id != 0) { if (dq->dq_btime == 0) { dq->dq_btime = time_second + ump->um_btime[type]; if (dq->dq_bsoftlimit && dq->dq_curblocks >= dq->dq_bsoftlimit) dq->dq_flags |= DQ_MOD; } if (dq->dq_itime == 0) { dq->dq_itime = time_second + ump->um_itime[type]; if (dq->dq_isoftlimit && dq->dq_curinodes >= dq->dq_isoftlimit) dq->dq_flags |= DQ_MOD; } } DQI_WAKEUP(dq); DQI_UNLOCK(dq); *dqp = dq; return (0); }
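/*
 * Illustrative only: a worked example of the on-disk quota record
 * addressing used in dqget() above.  For the 64-bit format the file starts
 * with a struct dqhdr64 header followed by struct dqblk64 records; the old
 * 32-bit format has no header and uses struct dqblk32 records.  The helper
 * name is hypothetical.
 */
static off_t
dq_record_offset_example(u_long id, int is64)
{
    off_t base, recsize;

    if (is64) {
        recsize = sizeof(struct dqblk64);
        base = sizeof(struct dqhdr64);
    } else {
        recsize = sizeof(struct dqblk32);
        base = 0;
    }
    /*
     * E.g. 64-bit format, id 1000:
     * offset = sizeof(struct dqhdr64) + 1000 * sizeof(struct dqblk64).
     */
    return (base + (off_t)id * recsize);
}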
/* * The map entries can *almost* be read with programs like cat. However, * large maps need special programs to read. It is not easy to implement * a program that can sense the required size of the buffer, and then * subsequently do a read with the appropriate size. This operation cannot * be atomic. The best that we can do is to allow the program to do a read * with an arbitrarily large buffer, and return as much as we can. We can * return an error code if the buffer is too small (EFBIG), then the program * can try a bigger buffer. */ int procfs_doprocmap(PFS_FILL_ARGS) { struct vmspace *vm; vm_map_t map; vm_map_entry_t entry, tmp_entry; struct vnode *vp; char *fullpath, *freepath; struct uidinfo *uip; int error, vfslocked; unsigned int last_timestamp; #ifdef COMPAT_FREEBSD32 int wrap32 = 0; #endif PROC_LOCK(p); error = p_candebug(td, p); PROC_UNLOCK(p); if (error) return (error); if (uio->uio_rw != UIO_READ) return (EOPNOTSUPP); #ifdef COMPAT_FREEBSD32 if (curproc->p_sysent->sv_flags & SV_ILP32) { if (!(p->p_sysent->sv_flags & SV_ILP32)) return (EOPNOTSUPP); wrap32 = 1; } #endif vm = vmspace_acquire_ref(p); if (vm == NULL) return (ESRCH); map = &vm->vm_map; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { vm_object_t obj, tobj, lobj; int ref_count, shadow_count, flags; vm_offset_t e_start, e_end, addr; int resident, privateresident; char *type; vm_eflags_t e_eflags; vm_prot_t e_prot; if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; e_eflags = entry->eflags; e_prot = entry->protection; e_start = entry->start; e_end = entry->end; privateresident = 0; obj = entry->object.vm_object; if (obj != NULL) { VM_OBJECT_LOCK(obj); if (obj->shadow_count == 1) privateresident = obj->resident_page_count; } uip = (entry->uip) ? entry->uip : (obj ? obj->uip : NULL); resident = 0; addr = entry->start; while (addr < entry->end) { if (pmap_extract(map->pmap, addr)) resident++; addr += PAGE_SIZE; } for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) { if (tobj != obj) VM_OBJECT_LOCK(tobj); if (lobj != obj) VM_OBJECT_UNLOCK(lobj); lobj = tobj; } last_timestamp = map->timestamp; vm_map_unlock_read(map); freepath = NULL; fullpath = "-"; if (lobj) { switch (lobj->type) { default: case OBJT_DEFAULT: type = "default"; vp = NULL; break; case OBJT_VNODE: type = "vnode"; vp = lobj->handle; vref(vp); break; case OBJT_SWAP: type = "swap"; vp = NULL; break; case OBJT_SG: case OBJT_DEVICE: type = "device"; vp = NULL; break; } if (lobj != obj) VM_OBJECT_UNLOCK(lobj); flags = obj->flags; ref_count = obj->ref_count; shadow_count = obj->shadow_count; VM_OBJECT_UNLOCK(obj); if (vp != NULL) { vn_fullpath(td, vp, &fullpath, &freepath); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); } } else { type = "none"; flags = 0; ref_count = 0; shadow_count = 0; } /* * format: * start, end, resident, private resident, cow, access, type, * charged, charged uid. */ error = sbuf_printf(sb, "0x%lx 0x%lx %d %d %p %s%s%s %d %d 0x%x %s %s %s %s %s %d\n", (u_long)e_start, (u_long)e_end, resident, privateresident, #ifdef COMPAT_FREEBSD32 wrap32 ? NULL : obj, /* Hide 64 bit value */ #else obj, #endif (e_prot & VM_PROT_READ)?"r":"-", (e_prot & VM_PROT_WRITE)?"w":"-", (e_prot & VM_PROT_EXECUTE)?"x":"-", ref_count, shadow_count, flags, (e_eflags & MAP_ENTRY_COW)?"COW":"NCOW", (e_eflags & MAP_ENTRY_NEEDS_COPY)?"NC":"NNC", type, fullpath, uip ? "CH":"NCH", uip ? 
uip->ui_uid : -1); if (freepath != NULL) free(freepath, M_TEMP); vm_map_lock_read(map); if (error == -1) { error = 0; break; } if (last_timestamp != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. Specifically, * the entry may have been clipped, merged, or deleted. */ vm_map_lookup_entry(map, e_end - 1, &tmp_entry); entry = tmp_entry; } } vm_map_unlock_read(map); vmspace_free(vm); return (error); }
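/*
 * Illustrative userland sketch, not part of procfs: coping with the interface
 * described in the comment above procfs_doprocmap() by retrying the read with
 * a progressively larger buffer when the kernel reports EFBIG.  The path and
 * the initial buffer size are arbitrary examples.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

static char *
read_map_file(const char *path, ssize_t *lenp)
{
	size_t bufsz;
	ssize_t n;
	char *buf;
	int fd;

	if ((fd = open(path, O_RDONLY)) < 0)
		return (NULL);
	buf = NULL;
	for (bufsz = 64 * 1024; ; bufsz *= 2) {
		free(buf);
		if ((buf = malloc(bufsz)) == NULL)
			break;
		n = pread(fd, buf, bufsz, 0);
		if (n >= 0) {
			*lenp = n;
			close(fd);
			return (buf);
		}
		if (errno != EFBIG)	/* only EFBIG means "buffer too small" */
			break;
	}
	free(buf);
	close(fd);
	return (NULL);
}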
/* * Create the queue data structure, allocate the buffer, and open the file. */ int alq_open(struct alq **alqp, const char *file, struct ucred *cred, int cmode, int size, int count) { struct thread *td; struct nameidata nd; struct ale *ale; struct ale *alp; struct alq *alq; char *bufp; int flags; int error; int i, vfslocked; *alqp = NULL; td = curthread; NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, file, td); flags = FWRITE | O_NOFOLLOW | O_CREAT; error = vn_open_cred(&nd, &flags, cmode, 0, cred, NULL); if (error) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); /* We just unlock so we hold a reference */ VOP_UNLOCK(nd.ni_vp, 0); VFS_UNLOCK_GIANT(vfslocked); alq = malloc(sizeof(*alq), M_ALD, M_WAITOK|M_ZERO); alq->aq_entbuf = malloc(count * size, M_ALD, M_WAITOK|M_ZERO); alq->aq_first = malloc(sizeof(*ale) * count, M_ALD, M_WAITOK|M_ZERO); alq->aq_vp = nd.ni_vp; alq->aq_cred = crhold(cred); alq->aq_entmax = count; alq->aq_entlen = size; alq->aq_entfree = alq->aq_first; mtx_init(&alq->aq_mtx, "ALD Queue", NULL, MTX_SPIN|MTX_QUIET); bufp = alq->aq_entbuf; ale = alq->aq_first; alp = NULL; /* Match up entries with buffers */ for (i = 0; i < count; i++) { if (alp) alp->ae_next = ale; ale->ae_data = bufp; alp = ale; ale++; bufp += size; } alp->ae_next = alq->aq_first; if ((error = ald_add(alq)) != 0) return (error); *alqp = alq; return (0); }
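/*
 * Standalone sketch of the "match up entries with buffers" loop in alq_open()
 * above: a circular singly-linked list of fixed-size entries laid over one
 * contiguous buffer, so the queue can walk entries indefinitely without any
 * further allocation.  "struct entry" and link_entries() are placeholder
 * names, not kernel interfaces.
 */
#include <stdlib.h>

struct entry {
	struct entry	*e_next;
	char		*e_data;
};

static struct entry *
link_entries(char *buf, size_t size, int count)
{
	struct entry *first, *ale, *alp;
	int i;

	if (count <= 0 || (first = calloc(count, sizeof(*first))) == NULL)
		return (NULL);
	alp = NULL;
	ale = first;
	for (i = 0; i < count; i++) {
		if (alp != NULL)
			alp->e_next = ale;	/* chain previous entry to this one */
		ale->e_data = buf;		/* each entry owns "size" bytes of buf */
		alp = ale;
		ale++;
		buf += size;
	}
	alp->e_next = first;			/* close the ring */
	return (first);
}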
/* * Set up nameidata for a lookup() call and do it. * * If pubflag is set, this call is done for a lookup operation on the * public filehandle. In that case we allow crossing mountpoints and * absolute pathnames. However, the caller is expected to check that * the lookup result is within the public fs, and deny access if * it is not. * * nfs_namei() clears out the fields that namei() might otherwise leave * filled with garbage. * This is mainly ni_vp and ni_dvp when an error occurs, and ni_dvp when no * error occurs but the parent was not requested. * * dirp may be set whether an error is returned or not, and must be * released by the caller. */ int nfs_namei(struct nameidata *ndp, struct nfsrv_descript *nfsd, fhandle_t *fhp, int len, struct nfssvc_sock *slp, struct sockaddr *nam, struct mbuf **mdp, caddr_t *dposp, struct vnode **retdirp, int v3, struct vattr *retdirattrp, int *retdirattr_retp, int pubflag) { int i, rem; struct mbuf *md; char *fromcp, *tocp, *cp; struct iovec aiov; struct uio auio; struct vnode *dp; int error, rdonly, linklen; struct componentname *cnp = &ndp->ni_cnd; int lockleaf = (cnp->cn_flags & LOCKLEAF) != 0; int dvfslocked; int vfslocked; vfslocked = 0; dvfslocked = 0; *retdirp = NULL; cnp->cn_flags |= NOMACCHECK; cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); /* * Copy the name from the mbuf list to ndp->ni_pnbuf * and set the various ndp fields appropriately. */ fromcp = *dposp; tocp = cnp->cn_pnbuf; md = *mdp; rem = mtod(md, caddr_t) + md->m_len - fromcp; for (i = 0; i < len; i++) { while (rem == 0) { md = md->m_next; if (md == NULL) { error = EBADRPC; goto out; } fromcp = mtod(md, caddr_t); rem = md->m_len; } if (*fromcp == '\0' || (!pubflag && *fromcp == '/')) { error = EACCES; goto out; } *tocp++ = *fromcp++; rem--; } *tocp = '\0'; *mdp = md; *dposp = fromcp; len = nfsm_rndup(len)-len; if (len > 0) { if (rem >= len) *dposp += len; else if ((error = nfs_adv(mdp, dposp, len, rem)) != 0) goto out; } /* * Extract and set starting directory. */ error = nfsrv_fhtovp(fhp, FALSE, &dp, &dvfslocked, nfsd, slp, nam, &rdonly, pubflag); if (error) goto out; vfslocked = VFS_LOCK_GIANT(dp->v_mount); if (dp->v_type != VDIR) { vrele(dp); error = ENOTDIR; goto out; } if (rdonly) cnp->cn_flags |= RDONLY; /* * Set return directory. Reference to dp is implicitly transferred * to the returned pointer. */ *retdirp = dp; if (v3) { vn_lock(dp, LK_EXCLUSIVE | LK_RETRY); *retdirattr_retp = VOP_GETATTR(dp, retdirattrp, ndp->ni_cnd.cn_cred); VOP_UNLOCK(dp, 0); } if (pubflag) { /* * Oh joy. For WebNFS, handle those pesky '%' escapes, * and the 'native path' indicator. */ cp = uma_zalloc(namei_zone, M_WAITOK); fromcp = cnp->cn_pnbuf; tocp = cp; if ((unsigned char)*fromcp >= WEBNFS_SPECCHAR_START) { switch ((unsigned char)*fromcp) { case WEBNFS_NATIVE_CHAR: /* * 'Native' path for us is the same * as a path according to the NFS spec, * just skip the escape char. */ fromcp++; break; /* * More may be added in the future, range 0x80-0xff */ default: error = EIO; uma_zfree(namei_zone, cp); goto out; } } /* * Translate the '%' escapes, URL-style.
*/ while (*fromcp != '\0') { if (*fromcp == WEBNFS_ESC_CHAR) { if (fromcp[1] != '\0' && fromcp[2] != '\0') { fromcp++; *tocp++ = HEXSTRTOI(fromcp); fromcp += 2; continue; } else { error = ENOENT; uma_zfree(namei_zone, cp); goto out; } } else *tocp++ = *fromcp++; } *tocp = '\0'; uma_zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } ndp->ni_pathlen = (tocp - cnp->cn_pnbuf) + 1; ndp->ni_segflg = UIO_SYSSPACE; if (pubflag) { ndp->ni_rootdir = rootvnode; ndp->ni_loopcnt = 0; if (cnp->cn_pnbuf[0] == '/') { int tvfslocked; tvfslocked = VFS_LOCK_GIANT(rootvnode->v_mount); VFS_UNLOCK_GIANT(vfslocked); dp = rootvnode; vfslocked = tvfslocked; } } else { cnp->cn_flags |= NOCROSSMOUNT; } /* * Initialize for scan, set ni_startdir and bump ref on dp again * because lookup() will dereference ni_startdir. */ cnp->cn_thread = curthread; VREF(dp); ndp->ni_startdir = dp; if (!lockleaf) cnp->cn_flags |= LOCKLEAF; for (;;) { cnp->cn_nameptr = cnp->cn_pnbuf; /* * Call lookup() to do the real work. If an error occurs, * ndp->ni_vp and ni_dvp are left uninitialized or NULL and * we do not have to dereference anything before returning. * In either case ni_startdir will be dereferenced and NULLed * out. */ if (vfslocked) ndp->ni_cnd.cn_flags |= GIANTHELD; error = lookup(ndp); vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0; ndp->ni_cnd.cn_flags &= ~GIANTHELD; if (error) break; /* * Check for encountering a symbolic link. Trivial * termination occurs if no symlink encountered. * Note: zfree is safe because error is 0, so we will * not zfree it again when we break. */ if ((cnp->cn_flags & ISSYMLINK) == 0) { if (cnp->cn_flags & (SAVENAME | SAVESTART)) cnp->cn_flags |= HASBUF; else uma_zfree(namei_zone, cnp->cn_pnbuf); if (ndp->ni_vp && !lockleaf) VOP_UNLOCK(ndp->ni_vp, 0); break; } /* * Validate symlink */ if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) VOP_UNLOCK(ndp->ni_dvp, 0); if (!pubflag) { error = EINVAL; goto badlink2; } if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; goto badlink2; } if (ndp->ni_pathlen > 1) cp = uma_zalloc(namei_zone, M_WAITOK); else cp = cnp->cn_pnbuf; aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = NULL; auio.uio_resid = MAXPATHLEN; error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); if (error) { badlink1: if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); badlink2: vput(ndp->ni_vp); vrele(ndp->ni_dvp); break; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { error = ENOENT; goto badlink1; } if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { error = ENAMETOOLONG; goto badlink1; } /* * Adjust or replace path */ if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); uma_zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; /* * Cleanup refs for next loop and check if root directory * should replace current directory. Normally ni_dvp * becomes the new base directory and is cleaned up when * we loop. Explicitly null pointers after invalidation * to clarify operation. 
*/ vput(ndp->ni_vp); ndp->ni_vp = NULL; if (cnp->cn_pnbuf[0] == '/') { vrele(ndp->ni_dvp); ndp->ni_dvp = ndp->ni_rootdir; VREF(ndp->ni_dvp); } ndp->ni_startdir = ndp->ni_dvp; ndp->ni_dvp = NULL; } if (!lockleaf) cnp->cn_flags &= ~LOCKLEAF; if (cnp->cn_flags & GIANTHELD) { mtx_unlock(&Giant); cnp->cn_flags &= ~GIANTHELD; } /* * nfs_namei() guarantees that fields will not contain garbage * whether an error occurs or not. This allows the caller to track * cleanup state trivially. */ out: if (error) { uma_zfree(namei_zone, cnp->cn_pnbuf); ndp->ni_vp = NULL; ndp->ni_dvp = NULL; ndp->ni_startdir = NULL; cnp->cn_flags &= ~HASBUF; VFS_UNLOCK_GIANT(vfslocked); vfslocked = 0; } else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) { ndp->ni_dvp = NULL; } /* * This differs from normal namei() in that even on failure we may * return with Giant held due to the dirp return. Make sure we return * holding Giant at most once, however; the calling code only expects * to drop one acquire. */ if (vfslocked || dvfslocked) ndp->ni_cnd.cn_flags |= GIANTHELD; if (vfslocked && dvfslocked) VFS_UNLOCK_GIANT(vfslocked); return (error); }
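/*
 * Isolated sketch of the WebNFS '%' escape translation performed above.
 * hexval() and webnfs_unescape() are illustrative stand-ins for the kernel's
 * HEXSTRTOI() macro and the in-place loop in nfs_namei(); the sketch is
 * slightly stricter, rejecting non-hex digits, whereas the kernel path only
 * checks that two more characters exist and fails with ENOENT otherwise.
 */
static int
hexval(char c)
{
	if (c >= '0' && c <= '9')
		return (c - '0');
	if (c >= 'a' && c <= 'f')
		return (c - 'a' + 10);
	if (c >= 'A' && c <= 'F')
		return (c - 'A' + 10);
	return (-1);
}

static int
webnfs_unescape(const char *from, char *to)
{
	int hi, lo;

	while (*from != '\0') {
		if (*from == '%') {
			if ((hi = hexval(from[1])) < 0 ||
			    (lo = hexval(from[2])) < 0)
				return (-1);	/* truncated or malformed escape */
			*to++ = (char)((hi << 4) | lo);
			from += 3;
		} else
			*to++ = *from++;
	}
	*to = '\0';
	return (0);
}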
/* * Update the disk quota in the quota file. */ static int dqsync(struct vnode *vp, struct dquot *dq) { uint8_t buf[sizeof(struct dqblk64)]; off_t base, recsize; struct vnode *dqvp; struct iovec aiov; struct uio auio; int vfslocked, error; struct mount *mp; struct ufsmount *ump; #ifdef DEBUG_VFS_LOCKS if (vp != NULL) ASSERT_VOP_ELOCKED(vp, "dqsync"); #endif mp = NULL; error = 0; if (dq == NODQUOT) panic("dqsync: dquot"); if ((ump = dq->dq_ump) == NULL) return (0); UFS_LOCK(ump); if ((dqvp = ump->um_quotas[dq->dq_type]) == NULLVP) panic("dqsync: file"); vref(dqvp); UFS_UNLOCK(ump); vfslocked = VFS_LOCK_GIANT(dqvp->v_mount); DQI_LOCK(dq); if ((dq->dq_flags & DQ_MOD) == 0) { DQI_UNLOCK(dq); vrele(dqvp); VFS_UNLOCK_GIANT(vfslocked); return (0); } DQI_UNLOCK(dq); (void) vn_start_secondary_write(dqvp, &mp, V_WAIT); if (vp != dqvp) vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY); VFS_UNLOCK_GIANT(vfslocked); DQI_LOCK(dq); DQI_WAIT(dq, PINOD+2, "dqsync"); if ((dq->dq_flags & DQ_MOD) == 0) goto out; dq->dq_flags |= DQ_LOCK; DQI_UNLOCK(dq); /* * Write the quota record to the quota file, performing any * necessary conversions. See dqget() for additional details. */ if (ump->um_qflags[dq->dq_type] & QTF_64BIT) { dq_dqb64(dq, (struct dqblk64 *)buf); recsize = sizeof(struct dqblk64); base = sizeof(struct dqhdr64); } else { dq_dqb32(dq, (struct dqblk32 *)buf); recsize = sizeof(struct dqblk32); base = 0; } auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = buf; aiov.iov_len = recsize; auio.uio_resid = recsize; auio.uio_offset = base + dq->dq_id * recsize; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = (struct thread *)0; vfslocked = VFS_LOCK_GIANT(dqvp->v_mount); error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]); VFS_UNLOCK_GIANT(vfslocked); if (auio.uio_resid && error == 0) error = EIO; DQI_LOCK(dq); DQI_WAKEUP(dq); dq->dq_flags &= ~DQ_MOD; out: DQI_UNLOCK(dq); vfslocked = VFS_LOCK_GIANT(dqvp->v_mount); if (vp != dqvp) vput(dqvp); else vrele(dqvp); vn_finished_secondary_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); }
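/*
 * Hedged userland sketch (pthreads) of the DQ_MOD/DQ_LOCK protocol dqsync()
 * follows above: wait for any in-progress I/O on the record, exit cheaply if
 * it is clean, otherwise mark it busy, drop the mutex for the slow write, and
 * finally clear both flags and wake waiters.  All names here are placeholders;
 * the real kernel code uses the DQI_LOCK/DQI_WAIT/DQI_WAKEUP macros.
 */
#include <pthread.h>
#include <stdbool.h>

struct record {
	pthread_mutex_t	r_lock;
	pthread_cond_t	r_cv;
	bool		r_modified;	/* analogue of DQ_MOD */
	bool		r_busy;		/* analogue of DQ_LOCK */
};

static void
record_sync(struct record *r, void (*writefn)(struct record *))
{
	pthread_mutex_lock(&r->r_lock);
	while (r->r_busy)			/* analogue of DQI_WAIT() */
		pthread_cond_wait(&r->r_cv, &r->r_lock);
	if (!r->r_modified) {			/* clean: nothing to write */
		pthread_mutex_unlock(&r->r_lock);
		return;
	}
	r->r_busy = true;
	pthread_mutex_unlock(&r->r_lock);

	writefn(r);				/* slow write runs unlocked */

	pthread_mutex_lock(&r->r_lock);
	r->r_busy = false;
	r->r_modified = false;
	pthread_cond_broadcast(&r->r_cv);	/* analogue of DQI_WAKEUP() */
	pthread_mutex_unlock(&r->r_lock);
}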
/* This call, unlike osi_FlushText, is supposed to discard caches that may contain invalid information if a file is written remotely, but that may contain valid information that needs to be written back if the file is being written locally. It doesn't subsume osi_FlushText, since the latter function may be needed to flush caches that are invalidated by local writes. avc->pvnLock is already held, avc->lock is guaranteed not to be held (by us, of course). */ void osi_FlushPages(struct vcache *avc, afs_ucred_t *credp) { #ifdef AFS_FBSD70_ENV int vfslocked; #endif afs_hyper_t origDV; #if defined(AFS_CACHE_BYPASS) /* The optimization to check DV under read lock below is identical to a * change in the CITI cache bypass work. The problem CITI found in 1999 * was that this code and the background daemon doing prefetching competed * for the vcache entry shared lock. It's not clear to me from the * tech report, but it looks like CITI fixed the general prefetch code * path as a bonus when experimenting on prefetch for cache bypass, see * citi-tr-01-3. */ #endif ObtainReadLock(&avc->lock); /* If we've already purged this version, or if we're the ones * writing this version, don't flush it (could lose the * data we're writing). */ if ((hcmp((avc->f.m.DataVersion), (avc->mapDV)) <= 0) || ((avc->execsOrWriters > 0) && afs_DirtyPages(avc))) { ReleaseReadLock(&avc->lock); return; } ReleaseReadLock(&avc->lock); ObtainWriteLock(&avc->lock, 10); /* Check again */ if ((hcmp((avc->f.m.DataVersion), (avc->mapDV)) <= 0) || ((avc->execsOrWriters > 0) && afs_DirtyPages(avc))) { ReleaseWriteLock(&avc->lock); return; } if (hiszero(avc->mapDV)) { hset(avc->mapDV, avc->f.m.DataVersion); ReleaseWriteLock(&avc->lock); return; } AFS_STATCNT(osi_FlushPages); hset(origDV, avc->f.m.DataVersion); afs_Trace3(afs_iclSetp, CM_TRACE_FLUSHPAGES, ICL_TYPE_POINTER, avc, ICL_TYPE_INT32, origDV.low, ICL_TYPE_INT32, avc->f.m.Length); ReleaseWriteLock(&avc->lock); #ifdef AFS_FBSD70_ENV vfslocked = VFS_LOCK_GIANT(AFSTOV(avc)->v_mount); #endif #ifndef AFS_FBSD70_ENV AFS_GUNLOCK(); #endif osi_VM_FlushPages(avc, credp); #ifndef AFS_FBSD70_ENV AFS_GLOCK(); #endif #ifdef AFS_FBSD70_ENV VFS_UNLOCK_GIANT(vfslocked); #endif ObtainWriteLock(&avc->lock, 88); /* Do this last, and to the original version, since stores may occur * while executing the above PUTPAGE call. */ hset(avc->mapDV, origDV); ReleaseWriteLock(&avc->lock); }
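/*
 * Minimal sketch (pthreads) of the check-then-recheck pattern osi_FlushPages()
 * uses above: the cheap "already purged?" test is made under a read lock and
 * must be repeated under the write lock, and the version actually flushed is
 * remembered before the lock is dropped so that stores racing with the flush
 * are not mistakenly marked clean.  Types and names are placeholders.
 */
#include <pthread.h>

struct cached {
	pthread_rwlock_t c_lock;
	long		 c_dataversion;		/* analogue of f.m.DataVersion */
	long		 c_mappedversion;	/* analogue of mapDV */
};

static void
maybe_flush(struct cached *c, void (*flushfn)(struct cached *))
{
	long orig;

	pthread_rwlock_rdlock(&c->c_lock);
	if (c->c_mappedversion >= c->c_dataversion) {	/* already purged */
		pthread_rwlock_unlock(&c->c_lock);
		return;
	}
	pthread_rwlock_unlock(&c->c_lock);

	pthread_rwlock_wrlock(&c->c_lock);
	/* Recheck: another thread may have flushed while no lock was held. */
	if (c->c_mappedversion >= c->c_dataversion) {
		pthread_rwlock_unlock(&c->c_lock);
		return;
	}
	orig = c->c_dataversion;	/* remember the version being flushed */
	pthread_rwlock_unlock(&c->c_lock);

	flushfn(c);			/* flush without holding the lock */

	pthread_rwlock_wrlock(&c->c_lock);
	c->c_mappedversion = orig;	/* not c_dataversion: it may have advanced */
	pthread_rwlock_unlock(&c->c_lock);
}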