/*
 * Execute a 32-bit system call on behalf of the current thread.
 */
void
dosyscall(void)
{
	/*
	 * Need space on the stack to store syscall arguments.
	 */
	long		syscall_args[MAXSYSARGS];
	struct sysent	*se;
	int64_t		ret;

	syscall_mstate(LMS_TRAP, LMS_SYSTEM);

	ASSERT(curproc->p_model == DATAMODEL_ILP32);

	CPU_STATS_ENTER_K();
	CPU_STATS_ADDQ(CPU, sys, syscall, 1);
	CPU_STATS_EXIT_K();

	se = syscall_entry(curthread, syscall_args);

	/*
	 * syscall_entry() copied all 8 arguments into syscall_args.
	 */
	ret = se->sy_callc(syscall_args[0], syscall_args[1], syscall_args[2],
	    syscall_args[3], syscall_args[4], syscall_args[5],
	    syscall_args[6], syscall_args[7]);

	syscall_exit(curthread, (int)ret & 0xffffffffu, (int)(ret >> 32));
	syscall_mstate(LMS_SYSTEM, LMS_TRAP);
}
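A 32-bit caller receives the 64-bit result in two machine words, so dosyscall() splits ret into a low half and a high half before handing them to syscall_exit(). The stand-alone sketch below only illustrates that split; it is not part of the kernel source, and the example value is arbitrary.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	int64_t ret = 0x0000000700000009LL;		/* arbitrary 64-bit result */
	uint32_t low = (uint32_t)(ret & 0xffffffffu);	/* low 32 bits */
	uint32_t high = (uint32_t)((uint64_t)ret >> 32); /* high 32 bits */

	(void) printf("low = 0x%x, high = 0x%x\n", low, high);
	return (0);
}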
/*
 * Common code for writing a buffer with various options.
 *
 * force_wait  - wait for write completion regardless of B_ASYNC flag
 * do_relse    - release the buffer when we are done
 * clear_flags - flags to clear from the buffer
 */
void
bwrite_common(void *arg, struct buf *bp, int force_wait,
    int do_relse, int clear_flags)
{
	register int do_wait;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	int flag;
	klwp_t *lwp = ttolwp(curthread);
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));
	flag = bp->b_flags;
	bp->b_flags &= ~clear_flags;
	if (lwp != NULL)
		lwp->lwp_ru.oublock++;
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get pointer AFTER preemption is disabled */
	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
	if (do_wait == 0)
		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
	CPU_STATS_EXIT_K();
	if (ufsvfsp == NULL) {
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
		/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
		/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ub.ub_bwrites.value.ul++;	/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (do_wait) {
		(void) biowait(bp);
		if (do_relse) {
			brelse(bp);
		}
	}
}
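The synchronous/asynchronous decision above reduces to one predicate: wait unless the buffer is marked B_ASYNC, and always wait when force_wait is set. A minimal user-space sketch of that predicate follows; the flag value is a hypothetical stand-in, not the kernel's B_ASYNC definition.

#include <stdio.h>

#define	MY_B_ASYNC	0x000400	/* hypothetical stand-in for B_ASYNC */

/* wait if the write is not asynchronous, or if the caller insists */
static int
should_wait(int flag, int force_wait)
{
	return ((flag & MY_B_ASYNC) == 0 || force_wait);
}

int
main(void)
{
	(void) printf("%d %d %d\n",
	    should_wait(0, 0),			/* sync write: 1 */
	    should_wait(MY_B_ASYNC, 0),		/* async write: 0 */
	    should_wait(MY_B_ASYNC, 1));	/* forced wait: 1 */
	return (0);
}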
/*
 * Initiate cross call processing.
 */
static void
xc_common(
	xc_func_t func,
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	uint_t command)
{
	int c;
	struct cpu *cpup;
	xc_msg_t *msg;
	xc_data_t *data;
	int cnt;
	int save_spl;

	if (!xc_initialized) {
		if (BT_TEST(set, CPU->cpu_id) && (CPU->cpu_flags & CPU_READY) &&
		    func != NULL)
			(void) (*func)(arg1, arg2, arg3);
		return;
	}

	save_spl = splr(ipltospl(XC_HI_PIL));

	/*
	 * fill in cross call data
	 */
	data = &CPU->cpu_m.xc_data;
	data->xc_func = func;
	data->xc_a1 = arg1;
	data->xc_a2 = arg2;
	data->xc_a3 = arg3;

	/*
	 * Post messages to all CPUs involved that are CPU_READY
	 */
	CPU->cpu_m.xc_wait_cnt = 0;
	for (c = 0; c < max_ncpus; ++c) {
		if (!BT_TEST(set, c))
			continue;
		cpup = cpu[c];
		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY))
			continue;

		/*
		 * Fill out a new message.
		 */
		msg = xc_extract(&CPU->cpu_m.xc_free);
		if (msg == NULL)
			panic("Ran out of free xc_msg_t's");
		msg->xc_command = command;
		if (msg->xc_master != CPU->cpu_id)
			panic("msg %p has wrong xc_master", (void *)msg);
		msg->xc_slave = c;

		/*
		 * Increment my work count for all messages that I'll
		 * transition from DONE to FREE.
		 * Also remember how many XC_MSG_WAITINGs to look for
		 */
		(void) xc_increment(&CPU->cpu_m);
		if (command == XC_MSG_SYNC)
			++CPU->cpu_m.xc_wait_cnt;

		/*
		 * Increment the target CPU work count then insert the message
		 * in the target msgbox. If I post the first bit of work
		 * for the target to do, send an IPI to the target CPU.
		 */
		cnt = xc_increment(&cpup->cpu_m);
		xc_insert(&cpup->cpu_m.xc_msgbox, msg);
		if (cpup != CPU) {
			if (cnt == 0) {
				CPU_STATS_ADDQ(CPU, sys, xcalls, 1);
				send_dirint(c, XC_HI_PIL);
				if (xc_collect_enable)
					++xc_total_cnt;
			} else if (xc_collect_enable) {
				++xc_multi_cnt;
			}
		}
	}

	/*
	 * Now drop into the message handler until all work is done
	 */
	(void) xc_serv(NULL, NULL);
	splx(save_spl);
}
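xc_common() selects its target CPUs by testing bits in a ulong_t array via BT_TEST, one bit per CPU id. The self-contained sketch below shows the same word-and-bit arithmetic under that assumption; the helper name is hypothetical and is not the kernel's bitmap macro.

#include <stdio.h>
#include <limits.h>

#define	BITS_PER_WORD	(sizeof (unsigned long) * CHAR_BIT)

/* test whether bit `id` is set in the bit set `set` */
static int
bitset_test(const unsigned long *set, unsigned int id)
{
	return ((set[id / BITS_PER_WORD] >> (id % BITS_PER_WORD)) & 1UL);
}

int
main(void)
{
	unsigned long cpuset[2] = { 0 };

	cpuset[0] |= 1UL << 3;		/* mark CPU 3 as a target */

	(void) printf("cpu 3 targeted: %d\n", bitset_test(cpuset, 3));
	(void) printf("cpu 5 targeted: %d\n", bitset_test(cpuset, 5));
	return (0);
}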
void
pvn_write_done(page_t *plist, int flags)
{
	int dfree = 0;
	int pgrec = 0;
	int pgout = 0;
	int pgpgout = 0;
	int anonpgout = 0;
	int anonfree = 0;
	int fspgout = 0;
	int fsfree = 0;
	int execpgout = 0;
	int execfree = 0;
	page_t *pp;
	struct cpu *cpup;
	struct vnode *vp = NULL;	/* for probe */
	uint_t ppattr;
	kmutex_t *vphm = NULL;

	ASSERT((flags & B_READ) == 0);

	/*
	 * If we are about to start paging anyway, start freeing pages.
	 */
	if (write_free && freemem < lotsfree + pages_before_pager &&
	    (flags & B_ERROR) == 0) {
		flags |= B_FREE;
	}

	/*
	 * Handle each page involved in the i/o operation.
	 */
	while (plist != NULL) {
		pp = plist;
		ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
		page_sub(&plist, pp);

		/* Kernel probe support */
		if (vp == NULL)
			vp = pp->p_vnode;

		if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
			/*
			 * Move page to the top of the v_page list.
			 * Skip pages modified during IO.
			 */
			vphm = page_vnode_mutex(vp);
			mutex_enter(vphm);
			if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
				page_vpsub(&vp->v_pages, pp);
				page_vpadd(&vp->v_pages, pp);
			}
			mutex_exit(vphm);
		}

		if (flags & B_ERROR) {
			/*
			 * Write operation failed.  We don't want
			 * to destroy (or free) the page unless B_FORCE
			 * is set. We set the mod bit again and release
			 * all locks on the page so that it will get written
			 * back again later when things are hopefully
			 * better again.
			 * If B_INVAL and B_FORCE is set we really have
			 * to destroy the page.
			 */
			if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
				page_io_unlock(pp);
				/*LINTED: constant in conditional context*/
				VN_DISPOSE(pp, B_INVAL, 0, kcred);
			} else {
				hat_setmod_only(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
		} else if (flags & B_INVAL) {
			/*
			 * XXX - Failed writes with B_INVAL set are
			 * not handled appropriately.
			 */
			page_io_unlock(pp);
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if (flags & B_FREE || !hat_page_is_mapped(pp)) {
			/*
			 * Update statistics for pages being paged out
			 */
			if (pp->p_vnode) {
				if (IS_SWAPFSVP(pp->p_vnode)) {
					anonpgout++;
				} else {
					if (pp->p_vnode->v_flag & VVMEXEC) {
						execpgout++;
					} else {
						fspgout++;
					}
				}
			}
			page_io_unlock(pp);
			pgout = 1;
			pgpgout++;
			TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
			    "page_ws_out:pp %p", pp);

			/*
			 * The page_struct_lock need not be acquired to
			 * examine "p_lckcnt" and "p_cowcnt" since we'll
			 * have an "exclusive" lock if the upgrade succeeds.
			 */
			if (page_tryupgrade(pp) &&
			    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
				/*
				 * Check if someone has reclaimed the
				 * page.  If ref and mod are not set, no
				 * one is using it so we can free it.
				 * The rest of the system is careful
				 * to use the NOSYNC flag to unload
				 * translations set up for i/o w/o
				 * affecting ref and mod bits.
				 *
				 * Obtain a copy of the real hardware
				 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
				 * to avoid having to flush the cache.
				 */
				ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
				    HAT_SYNC_STOPON_MOD);
			ck_refmod:
				if (!(ppattr & (P_REF | P_MOD))) {
					if (hat_page_is_mapped(pp)) {
						/*
						 * Doesn't look like the page
						 * was modified so now we
						 * really have to unload the
						 * translations.  Meanwhile
						 * another CPU could've
						 * modified it so we have to
						 * check again.  We don't loop
						 * forever here because now
						 * the translations are gone
						 * and no one can get a new one
						 * since we have the "exclusive"
						 * lock on the page.
						 */
						(void) hat_pageunload(pp,
						    HAT_FORCE_PGUNLOAD);
						ppattr = hat_page_getattr(pp,
						    P_REF | P_MOD);
						goto ck_refmod;
					}
					/*
					 * Update statistics for pages being
					 * freed
					 */
					if (pp->p_vnode) {
						if (IS_SWAPFSVP(pp->p_vnode)) {
							anonfree++;
						} else {
							if (pp->p_vnode->v_flag
							    & VVMEXEC) {
								execfree++;
							} else {
								fsfree++;
							}
						}
					}
					/*LINTED: constant in conditional ctx*/
					VN_DISPOSE(pp, B_FREE,
					    (flags & B_DONTNEED), kcred);
					dfree++;
				} else {
					page_unlock(pp);
					pgrec++;
					TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
					    "page_ws_free:pp %p", pp);
				}
			} else {
				/*
				 * Page is either `locked' in memory
				 * or was reclaimed and now has a
				 * "shared" lock, so release it.
				 */
				page_unlock(pp);
			}
		} else {
			/*
			 * Neither B_FREE nor B_INVAL nor B_ERROR.
			 * Just release locks.
			 */
			page_io_unlock(pp);
			page_unlock(pp);
		}
	}

	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get cpup now that CPU cannot change */
	CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
	CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
	CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
	CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
	CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
	CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
	CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
	CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
	CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
	CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
	CPU_STATS_EXIT_K();

	/* Kernel probe */
	TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
	    tnf_opaque,	vnode,			vp,
	    tnf_ulong,	pages_pageout,		pgpgout,
	    tnf_ulong,	pages_freed,		dfree,
	    tnf_ulong,	pages_reclaimed,	pgrec);
}
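pvn_write_done() accumulates its counters in cheap locals while walking the page list and then publishes them once, inside a single CPU_STATS_ENTER_K()/CPU_STATS_EXIT_K() window, rather than touching per-CPU counters for every page. A hedged user-space sketch of that accumulate-then-publish pattern follows; the publish_stat() hook is a hypothetical stand-in for the per-CPU statistics update.

#include <stdio.h>

/* hypothetical stand-in for a per-CPU statistics update */
static void
publish_stat(const char *name, int value)
{
	(void) printf("%s += %d\n", name, value);
}

int
main(void)
{
	int dfree = 0, pgrec = 0, pgpgout = 0;
	int i;

	/* accumulate in locals while handling each "page" */
	for (i = 0; i < 8; i++) {
		pgpgout++;
		if (i % 3 == 0)
			dfree++;
		else
			pgrec++;
	}

	/* publish the totals once, in a single critical window */
	publish_stat("dfree", dfree);
	publish_stat("pgrec", pgrec);
	publish_stat("pgpgout", pgpgout);
	return (0);
}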
/*
 * Starting at current directory, translate pathname pnp to end.
 * Leave pathname of final component in pnp, return the vnode
 * for the final component in *compvpp, and return the vnode
 * for the parent of the final component in dirvpp.
 *
 * This is the central routine in pathname translation and handles
 * multiple components in pathnames, separating them at /'s.  It also
 * implements mounted file systems and processes symbolic links.
 *
 * vp is the vnode where the directory search should start.
 *
 * Reference counts: vp must be held prior to calling this function.  rootvp
 * should only be held if rootvp != rootdir.
 */
int
lookuppnvp(
	struct pathname *pnp,		/* pathname to lookup */
	struct pathname *rpnp,		/* if non-NULL, return resolved path */
	int flags,			/* follow symlinks */
	vnode_t **dirvpp,		/* ptr for parent vnode */
	vnode_t **compvpp,		/* ptr for entry vnode */
	vnode_t *rootvp,		/* rootvp */
	vnode_t *vp,			/* directory to start search at */
	cred_t *cr)			/* user's credential */
{
	vnode_t *cvp;			/* current component vp */
	char component[MAXNAMELEN];	/* buffer for component (incl null) */
	int error;
	int nlink;
	int lookup_flags;
	struct pathname presrvd;	/* case preserved name */
	struct pathname *pp = NULL;
	vnode_t *startvp;
	vnode_t *zonevp = curproc->p_zone->zone_rootvp;	/* zone root */
	int must_be_directory = 0;
	boolean_t retry_with_kcred;
	uint32_t auditing = AU_AUDITING();

	CPU_STATS_ADDQ(CPU, sys, namei, 1);
	nlink = 0;
	cvp = NULL;
	if (rpnp)
		rpnp->pn_pathlen = 0;

	lookup_flags = dirvpp ? LOOKUP_DIR : 0;
	if (flags & FIGNORECASE) {
		lookup_flags |= FIGNORECASE;
		pn_alloc(&presrvd);
		pp = &presrvd;
	}

	if (auditing)
		audit_anchorpath(pnp, vp == rootvp);

	/*
	 * Eliminate any trailing slashes in the pathname.
	 * If there are any, we must follow all symlinks.
	 * Also, we must guarantee that the last component is a directory.
	 */
	if (pn_fixslash(pnp)) {
		flags |= FOLLOW;
		must_be_directory = 1;
	}

	startvp = vp;
next:
	retry_with_kcred = B_FALSE;

	/*
	 * Make sure we have a directory.
	 */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto bad;
	}

	if (rpnp && VN_CMP(vp, rootvp))
		(void) pn_set(rpnp, "/");

	/*
	 * Process the next component of the pathname.
	 */
	if (error = pn_getcomponent(pnp, component)) {
		goto bad;
	}

	/*
	 * Handle "..": two special cases.
	 * 1. If we're at the root directory (e.g. after chroot or
	 *    zone_enter) then change ".." to "." so we can't get
	 *    out of this subtree.
	 * 2. If this vnode is the root of a mounted file system,
	 *    then replace it with the vnode that was mounted on
	 *    so that we take the ".." in the other file system.
	 */
	if (component[0] == '.' && component[1] == '.' && component[2] == 0) {
checkforroot:
		if (VN_CMP(vp, rootvp) || VN_CMP(vp, zonevp)) {
			component[1] = '\0';
		} else if (vp->v_flag & VROOT) {
			vfs_t *vfsp;
			cvp = vp;

			/*
			 * While we deal with the vfs pointer from the vnode
			 * the filesystem could have been forcefully unmounted
			 * and the vnode's v_vfsp could have been invalidated
			 * by VFS_UNMOUNT. Hence, we cache v_vfsp and use it
			 * with vfs_rlock_wait/vfs_unlock.
			 * It is safe to use the v_vfsp even if it is freed by
			 * VFS_UNMOUNT because vfs_rlock_wait/vfs_unlock
			 * do not dereference v_vfsp. It is just used as a
			 * magic cookie.
			 * One more corner case here is the memory getting
			 * reused for another vfs structure. In this case
			 * lookuppnvp's vfs_rlock_wait will succeed, domount's
			 * vfs_lock will fail and domount will bail out with an
			 * error (EBUSY).
			 */
			vfsp = cvp->v_vfsp;

			/*
			 * This lock is used to synchronize
			 * mounts/unmounts and lookups.
			 * Threads doing mounts/unmounts hold the
			 * writers version vfs_lock_wait().
			 */
			vfs_rlock_wait(vfsp);

			/*
			 * If this vnode is on a file system that
			 * has been forcibly unmounted,
			 * we can't proceed. Cancel this operation
			 * and return EIO.
			 *
			 * vfs_vnodecovered is NULL if unmounted.
			 * Currently, nfs uses VFS_UNMOUNTED to
			 * check if it's a forced-umount. Keep the
			 * same checking here as well even though it
			 * may not be needed.
			 */
			if (((vp = cvp->v_vfsp->vfs_vnodecovered) == NULL) ||
			    (cvp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
				vfs_unlock(vfsp);
				VN_RELE(cvp);
				if (pp)
					pn_free(pp);
				return (EIO);
			}

			VN_HOLD(vp);
			vfs_unlock(vfsp);
			VN_RELE(cvp);
			cvp = NULL;
			/*
			 * Crossing mount points. For eg: We are doing
			 * a lookup of ".." for file systems root vnode
			 * mounted here, and VOP_LOOKUP() (with covered vnode)
			 * will be on underlying file systems mount point
			 * vnode. Set retry_with_kcred flag as we might end
			 * up doing VOP_LOOKUP() with kcred if required.
			 */
			retry_with_kcred = B_TRUE;
			goto checkforroot;
		}
	}

	/*
	 * LOOKUP_CHECKREAD is a private flag used by vnodetopath() to indicate
	 * that we need to have read permission on every directory in the
	 * entire path.  This is used to ensure that a forward-lookup of a
	 * cached value has the same effect as a reverse-lookup when the
	 * cached value cannot be found.
	 */
	if ((flags & LOOKUP_CHECKREAD) &&
	    (error = VOP_ACCESS(vp, VREAD, 0, cr, NULL)) != 0)
		goto bad;

	/*
	 * Perform a lookup in the current directory.
	 */
	error = VOP_LOOKUP(vp, component, &cvp, pnp, lookup_flags,
	    rootvp, cr, NULL, NULL, pp);

	/*
	 * Retry with kcred - If crossing mount points & error is EACCES.
	 *
	 * If we are crossing mount points here and doing ".." lookup,
	 * VOP_LOOKUP() might fail if the underlying file system's
	 * mount point has no execute permission. In cases like these,
	 * we retry VOP_LOOKUP() by giving as much privilege as possible
	 * by passing kcred credentials.
	 *
	 * In case of hierarchical file systems, passing kcred still may
	 * or may not work.
	 * For eg: UFS FS --> Mount NFS FS --> Again mount UFS on some
	 * directory inside NFS FS.
	 */
	if ((error == EACCES) && retry_with_kcred)
		error = VOP_LOOKUP(vp, component, &cvp, pnp, lookup_flags,
		    rootvp, zone_kcred(), NULL, NULL, pp);

	if (error) {
		cvp = NULL;
		/*
		 * On error, return hard error if
		 * (a) we're not at the end of the pathname yet, or
		 * (b) the caller didn't want the parent directory, or
		 * (c) we failed for some reason other than a missing entry.
		 */
		if (pn_pathleft(pnp) || dirvpp == NULL || error != ENOENT)
			goto bad;
		if (auditing) {	/* directory access */
			if (error = audit_savepath(pnp, vp, vp, error, cr))
				goto bad_noaudit;
		}

		pn_setlast(pnp);
		/*
		 * We inform the caller that the desired entry must be
		 * a directory by adding a '/' to the component name.
		 */
		if (must_be_directory && (error = pn_addslash(pnp)) != 0)
			goto bad;
		*dirvpp = vp;
		if (compvpp != NULL)
			*compvpp = NULL;
		if (rootvp != rootdir)
			VN_RELE(rootvp);
		if (pp)
			pn_free(pp);
		return (0);
	}

	/*
	 * Traverse mount points.
	 * XXX why don't we need to hold a read lock here (call vn_vfsrlock)?
	 * What prevents a concurrent update to v_vfsmountedhere?
	 * Possible answer: if mounting, we might not see the mount
	 * if it is concurrently coming into existence, but that's
	 * really not much different from the thread running a bit slower.
	 * If unmounting, we may get into traverse() when we shouldn't,
	 * but traverse() will catch this case for us.
	 * (For this to work, fetching v_vfsmountedhere had better
	 * be atomic!)
	 */
	if (vn_mountedvfs(cvp) != NULL) {
		if ((error = traverse(&cvp)) != 0)
			goto bad;
	}

	/*
	 * If we hit a symbolic link and there is more path to be
	 * translated or this operation does not wish to apply
	 * to a link, then place the contents of the link at the
	 * front of the remaining pathname.
	 */
	if (cvp->v_type == VLNK && ((flags & FOLLOW) || pn_pathleft(pnp))) {
		struct pathname linkpath;

		if (++nlink > MAXSYMLINKS) {
			error = ELOOP;
			goto bad;
		}
		pn_alloc(&linkpath);
		if (error = pn_getsymlink(cvp, &linkpath, cr)) {
			pn_free(&linkpath);
			goto bad;
		}

		if (auditing)
			audit_symlink(pnp, &linkpath);

		if (pn_pathleft(&linkpath) == 0)
			(void) pn_set(&linkpath, ".");
		error = pn_insert(pnp, &linkpath, strlen(component));
		pn_free(&linkpath);
		if (error)
			goto bad;
		VN_RELE(cvp);
		cvp = NULL;
		if (pnp->pn_pathlen == 0) {
			error = ENOENT;
			goto bad;
		}
		if (pnp->pn_path[0] == '/') {
			do {
				pnp->pn_path++;
				pnp->pn_pathlen--;
			} while (pnp->pn_path[0] == '/');
			VN_RELE(vp);
			vp = rootvp;
			VN_HOLD(vp);
		}
		if (auditing)
			audit_anchorpath(pnp, vp == rootvp);
		if (pn_fixslash(pnp)) {
			flags |= FOLLOW;
			must_be_directory = 1;
		}
		goto next;
	}

	/*
	 * If rpnp is non-NULL, remember the resolved path name therein.
	 * Do not include "." components.  Collapse occurrences of
	 * "previous/..", so long as "previous" is not itself "..".
	 * Exhausting rpnp results in error ENAMETOOLONG.
	 */
	if (rpnp && strcmp(component, ".") != 0) {
		size_t len;

		if (strcmp(component, "..") == 0 &&
		    rpnp->pn_pathlen != 0 &&
		    !((rpnp->pn_pathlen > 2 &&
		    strncmp(rpnp->pn_path+rpnp->pn_pathlen-3, "/..", 3) == 0) ||
		    (rpnp->pn_pathlen == 2 &&
		    strncmp(rpnp->pn_path, "..", 2) == 0))) {
			while (rpnp->pn_pathlen &&
			    rpnp->pn_path[rpnp->pn_pathlen-1] != '/')
				rpnp->pn_pathlen--;
			if (rpnp->pn_pathlen > 1)
				rpnp->pn_pathlen--;
			rpnp->pn_path[rpnp->pn_pathlen] = '\0';
		} else {
			if (rpnp->pn_pathlen != 0 &&
			    rpnp->pn_path[rpnp->pn_pathlen-1] != '/')
				rpnp->pn_path[rpnp->pn_pathlen++] = '/';
			if (flags & FIGNORECASE) {
				/*
				 * Return the case-preserved name
				 * within the resolved path.
				 */
				error = copystr(pp->pn_buf,
				    rpnp->pn_path + rpnp->pn_pathlen,
				    rpnp->pn_bufsize - rpnp->pn_pathlen, &len);
			} else {
				error = copystr(component,
				    rpnp->pn_path + rpnp->pn_pathlen,
				    rpnp->pn_bufsize - rpnp->pn_pathlen, &len);
			}
			if (error)	/* copystr() returns ENAMETOOLONG */
				goto bad;
			rpnp->pn_pathlen += (len - 1);
			ASSERT(rpnp->pn_bufsize > rpnp->pn_pathlen);
		}
	}

	/*
	 * If no more components, return last directory (if wanted) and
	 * last component (if wanted).
	 */
	if (pn_pathleft(pnp) == 0) {
		/*
		 * If there was a trailing slash in the pathname,
		 * make sure the last component is a directory.
		 */
		if (must_be_directory && cvp->v_type != VDIR) {
			error = ENOTDIR;
			goto bad;
		}
		if (dirvpp != NULL) {
			/*
			 * Check that we have the real parent and not
			 * an alias of the last component.
			 */
			if (vn_compare(vp, cvp)) {
				if (auditing)
					(void) audit_savepath(pnp, cvp, vp,
					    EINVAL, cr);
				pn_setlast(pnp);
				VN_RELE(vp);
				VN_RELE(cvp);
				if (rootvp != rootdir)
					VN_RELE(rootvp);
				if (pp)
					pn_free(pp);
				return (EINVAL);
			}
			*dirvpp = vp;
		} else
			VN_RELE(vp);
		if (auditing)
			(void) audit_savepath(pnp, cvp, vp, 0, cr);
		if (pnp->pn_path == pnp->pn_buf)
			(void) pn_set(pnp, ".");
		else
			pn_setlast(pnp);
		if (rpnp) {
			if (VN_CMP(cvp, rootvp))
				(void) pn_set(rpnp, "/");
			else if (rpnp->pn_pathlen == 0)
				(void) pn_set(rpnp, ".");
		}

		if (compvpp != NULL)
			*compvpp = cvp;
		else
			VN_RELE(cvp);
		if (rootvp != rootdir)
			VN_RELE(rootvp);
		if (pp)
			pn_free(pp);
		return (0);
	}

	/*
	 * Skip over slashes from end of last component.
	 */
	while (pnp->pn_path[0] == '/') {
		pnp->pn_path++;
		pnp->pn_pathlen--;
	}

	/*
	 * Searched through another level of directory:
	 * release previous directory handle and save new (result
	 * of lookup) as current directory.
	 */
	VN_RELE(vp);
	vp = cvp;
	cvp = NULL;
	goto next;

bad:
	if (auditing)	/* reached end of path */
		(void) audit_savepath(pnp, cvp, vp, error, cr);
bad_noaudit:
	/*
	 * Error.  Release vnodes and return.
	 */
	if (cvp)
		VN_RELE(cvp);
	/*
	 * If the error was ESTALE and the current directory to look in
	 * was the root for this lookup, the root for a mounted file
	 * system, or the starting directory for lookups, then
	 * return ENOENT instead of ESTALE.  In this case, no recovery
	 * is possible by the higher level.  If ESTALE was returned for
	 * some intermediate directory along the path, then recovery
	 * is potentially possible and retrying from the higher level
	 * will either correct the situation by purging stale cache
	 * entries or eventually get back to the point where no recovery
	 * is possible.
	 */
	if (error == ESTALE &&
	    (VN_CMP(vp, rootvp) || (vp->v_flag & VROOT) || vp == startvp))
		error = ENOENT;
	VN_RELE(vp);
	if (rootvp != rootdir)
		VN_RELE(rootvp);
	if (pp)
		pn_free(pp);
	return (error);
}
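The rpnp bookkeeping in lookuppnvp() collapses "previous/.." by backing pn_pathlen up to the previous '/' and terminating the buffer there. The user-space sketch below mirrors that trimming loop on an ordinary C string; the function name is hypothetical and this is an illustration only, not the kernel's pathname code.

#include <stdio.h>
#include <string.h>

/* drop the last component of a resolved path, as the ".." case does */
static void
drop_last_component(char *path)
{
	size_t len = strlen(path);

	/* back up over the last component... */
	while (len > 0 && path[len - 1] != '/')
		len--;
	/* ...and over its slash, unless that would empty "/" itself */
	if (len > 1)
		len--;
	path[len] = '\0';
}

int
main(void)
{
	char resolved[64] = "/export/home/user";

	drop_last_component(resolved);		/* simulates a ".." lookup */
	(void) printf("%s\n", resolved);	/* prints "/export/home" */
	return (0);
}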
/*
 * mutex_vector_enter() is called from the assembly mutex_enter() routine
 * if the lock is held or is not of type MUTEX_ADAPTIVE.
 */
void
mutex_vector_enter(mutex_impl_t *lp)
{
	kthread_id_t	owner;
	hrtime_t	sleep_time = 0;	/* how long we slept */
	uint_t		spin_count = 0;	/* how many times we spun */
	cpu_t		*cpup, *last_cpu;
	extern cpu_t	*cpu_list;
	turnstile_t	*ts;
	volatile mutex_impl_t *vlp = (volatile mutex_impl_t *)lp;
	int		backoff;	/* current backoff */
	int		backctr;	/* ctr for backoff */
	int		sleep_count = 0;

	ASSERT_STACK_ALIGNED();

	if (MUTEX_TYPE_SPIN(lp)) {
		lock_set_spl(&lp->m_spin.m_spinlock, lp->m_spin.m_minspl,
		    &lp->m_spin.m_oldspl);
		return;
	}

	if (!MUTEX_TYPE_ADAPTIVE(lp)) {
		mutex_panic("mutex_enter: bad mutex", lp);
		return;
	}

	/*
	 * Adaptive mutexes must not be acquired from above LOCK_LEVEL.
	 * We can migrate after loading CPU but before checking CPU_ON_INTR,
	 * so we must verify by disabling preemption and loading CPU again.
	 */
	cpup = CPU;
	if (CPU_ON_INTR(cpup) && !panicstr) {
		kpreempt_disable();
		if (CPU_ON_INTR(CPU))
			mutex_panic("mutex_enter: adaptive at high PIL", lp);
		kpreempt_enable();
	}

	CPU_STATS_ADDQ(cpup, sys, mutex_adenters, 1);

	if (&plat_lock_delay) {
		backoff = 0;
	} else {
		backoff = BACKOFF_BASE;
	}

	for (;;) {
spin:
		spin_count++;
		/*
		 * Add an exponential backoff delay before trying again
		 * to touch the mutex data structure.
		 * the spin_count test and call to nulldev are to prevent
		 * the compiler optimizer from eliminating the delay loop.
		 */
		if (&plat_lock_delay) {
			plat_lock_delay(&backoff);
		} else {
			for (backctr = backoff; backctr; backctr--) {
				if (!spin_count) (void) nulldev();
			};	/* delay */
			backoff = backoff << 1;		/* double it */
			if (backoff > BACKOFF_CAP) {
				backoff = BACKOFF_CAP;
			}

			SMT_PAUSE();
		}

		if (panicstr)
			return;

		if ((owner = MUTEX_OWNER(vlp)) == NULL) {
			if (mutex_adaptive_tryenter(lp))
				break;
			continue;
		}

		if (owner == curthread)
			mutex_panic("recursive mutex_enter", lp);

		/*
		 * If lock is held but owner is not yet set, spin.
		 * (Only relevant for platforms that don't have cas.)
		 */
		if (owner == MUTEX_NO_OWNER)
			continue;

		/*
		 * When searching the other CPUs, start with the one where
		 * we last saw the owner thread.  If owner is running, spin.
		 *
		 * We must disable preemption at this point to guarantee
		 * that the list doesn't change while we traverse it
		 * without the cpu_lock mutex.  While preemption is
		 * disabled, we must revalidate our cached cpu pointer.
		 */
		kpreempt_disable();
		if (cpup->cpu_next == NULL)
			cpup = cpu_list;
		last_cpu = cpup;	/* mark end of search */
		do {
			if (cpup->cpu_thread == owner) {
				kpreempt_enable();
				goto spin;
			}
		} while ((cpup = cpup->cpu_next) != last_cpu);
		kpreempt_enable();

		/*
		 * The owner appears not to be running, so block.
		 * See the Big Theory Statement for memory ordering issues.
		 */
		ts = turnstile_lookup(lp);
		MUTEX_SET_WAITERS(lp);
		membar_enter();

		/*
		 * Recheck whether owner is running after waiters bit hits
		 * global visibility (above).  If owner is running, spin.
		 *
		 * Since we are at ipl DISP_LEVEL, kernel preemption is
		 * disabled, however we still need to revalidate our cached
		 * cpu pointer to make sure the cpu hasn't been deleted.
		 */
		if (cpup->cpu_next == NULL)
			last_cpu = cpup = cpu_list;
		do {
			if (cpup->cpu_thread == owner) {
				turnstile_exit(lp);
				goto spin;
			}
		} while ((cpup = cpup->cpu_next) != last_cpu);
		membar_consumer();

		/*
		 * If owner and waiters bit are unchanged, block.
		 */
		if (MUTEX_OWNER(vlp) == owner && MUTEX_HAS_WAITERS(vlp)) {
			sleep_time -= gethrtime();
			(void) turnstile_block(ts, TS_WRITER_Q, lp,
			    &mutex_sobj_ops, NULL, NULL);
			sleep_time += gethrtime();
			sleep_count++;
		} else {
			turnstile_exit(lp);
		}
	}

	ASSERT(MUTEX_OWNER(lp) == curthread);

	if (sleep_time != 0) {
		/*
		 * Note, sleep time is the sum of all the sleeping we
		 * did.
		 */
		LOCKSTAT_RECORD(LS_MUTEX_ENTER_BLOCK, lp, sleep_time);
	}

	/*
	 * We do not count a sleep as a spin.
	 */
	if (spin_count > sleep_count)
		LOCKSTAT_RECORD(LS_MUTEX_ENTER_SPIN, lp,
		    spin_count - sleep_count);

	LOCKSTAT_RECORD0(LS_MUTEX_ENTER_ACQUIRE, lp);
}
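On platforms without plat_lock_delay(), the spin path above doubles backoff on each retry and clamps it at BACKOFF_CAP. A minimal stand-alone sketch of that capped doubling follows; the BASE/CAP values here are made-up illustrations, not the kernel's tuning constants.

#include <stdio.h>

#define	MY_BACKOFF_BASE	50		/* hypothetical initial delay */
#define	MY_BACKOFF_CAP	1600		/* hypothetical maximum delay */

int
main(void)
{
	int backoff = MY_BACKOFF_BASE;
	int attempt;

	for (attempt = 1; attempt <= 8; attempt++) {
		(void) printf("attempt %d: spin for %d iterations\n",
		    attempt, backoff);
		backoff <<= 1;			/* double the delay */
		if (backoff > MY_BACKOFF_CAP)
			backoff = MY_BACKOFF_CAP;	/* clamp at the cap */
	}
	return (0);
}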