void
cv_signal(kcondvar_t *cvp)
{
    condvar_impl_t *cp = (condvar_impl_t *)cvp;

    /* make sure the cv_waiters field looks sane */
    ASSERT(cp->cv_waiters <= CV_MAX_WAITERS);

    if (cp->cv_waiters > 0) {
        sleepq_head_t *sqh = SQHASH(cp);
        disp_lock_enter(&sqh->sq_lock);
        ASSERT(CPU_ON_INTR(CPU) == 0);
        if (cp->cv_waiters & CV_WAITERS_MASK) {
            kthread_t *t;
            cp->cv_waiters--;
            t = sleepq_wakeone_chan(&sqh->sq_queue, cp);
            /*
             * If cv_waiters is non-zero (and less than
             * CV_MAX_WAITERS) there should be a thread
             * in the queue.
             */
            ASSERT(t != NULL);
        } else if (sleepq_wakeone_chan(&sqh->sq_queue, cp) == NULL) {
            cp->cv_waiters = 0;
        }
        disp_lock_exit(&sqh->sq_lock);
    }
}
void
cv_broadcast(kcondvar_t *cvp)
{
    condvar_impl_t *cp = (condvar_impl_t *)cvp;

    /* make sure the cv_waiters field looks sane */
    ASSERT(cp->cv_waiters <= CV_MAX_WAITERS);

    if (cp->cv_waiters > 0) {
        sleepq_head_t *sqh = SQHASH(cp);
        disp_lock_enter(&sqh->sq_lock);
        ASSERT(CPU_ON_INTR(CPU) == 0);
        sleepq_wakeall_chan(&sqh->sq_queue, cp);
        cp->cv_waiters = 0;
        disp_lock_exit(&sqh->sq_lock);
    }
}
/*
 * The cv_block() function blocks a thread on a condition variable
 * by putting it in a hashed sleep queue associated with the
 * synchronization object.
 *
 * Threads are taken off the hashed sleep queues via calls to
 * cv_signal(), cv_broadcast(), or cv_unsleep().
 */
static void
cv_block(condvar_impl_t *cvp)
{
    kthread_t *t = curthread;
    klwp_t *lwp = ttolwp(t);
    sleepq_head_t *sqh;

    ASSERT(THREAD_LOCK_HELD(t));
    ASSERT(t != CPU->cpu_idle_thread);
    ASSERT(CPU_ON_INTR(CPU) == 0);
    ASSERT(t->t_wchan0 == NULL && t->t_wchan == NULL);
    ASSERT(t->t_state == TS_ONPROC);

    t->t_schedflag &= ~TS_SIGNALLED;
    CL_SLEEP(t);        /* assign kernel priority */
    t->t_wchan = (caddr_t)cvp;
    t->t_sobj_ops = &cv_sobj_ops;
    DTRACE_SCHED(sleep);

    /*
     * The check for t_intr is to avoid doing the accounting
     * for an interrupt thread on the still-pinned lwp's statistics.
     */
    if (lwp != NULL && t->t_intr == NULL) {
        lwp->lwp_ru.nvcsw++;
        (void) new_mstate(t, LMS_SLEEP);
    }

    sqh = SQHASH(cvp);
    disp_lock_enter_high(&sqh->sq_lock);
    if (cvp->cv_waiters < CV_MAX_WAITERS)
        cvp->cv_waiters++;
    ASSERT(cvp->cv_waiters <= CV_MAX_WAITERS);
    THREAD_SLEEP(t, &sqh->sq_lock);
    sleepq_insert(&sqh->sq_queue, t);
    /*
     * THREAD_SLEEP() moves curthread->t_lockp to point to the
     * lock sqh->sq_lock.  This lock is later released by the caller
     * when it calls thread_unlock() on curthread.
     */
}
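/*
 * A minimal usage sketch (not part of the original source) of the
 * condition-variable protocol these routines implement: cv_wait() ends up
 * in cv_block() above, which queues the caller on the hashed sleep queue,
 * and cv_signal()/cv_broadcast() pull waiters back off that queue.  The
 * predicate is always re-checked under the mutex because wakeups can be
 * spurious.  "my_data_t", "my_consumer", and "my_producer" are hypothetical
 * names; only the standard cv/mutex calls are assumed.
 */
typedef struct my_data {
    kmutex_t    md_lock;
    kcondvar_t  md_cv;
    int         md_ready;
} my_data_t;

static void
my_consumer(my_data_t *mdp)
{
    mutex_enter(&mdp->md_lock);
    while (mdp->md_ready == 0)          /* re-check: wakeups may be spurious */
        cv_wait(&mdp->md_cv, &mdp->md_lock);
    mdp->md_ready--;
    mutex_exit(&mdp->md_lock);
}

static void
my_producer(my_data_t *mdp)
{
    mutex_enter(&mdp->md_lock);
    mdp->md_ready++;
    cv_signal(&mdp->md_cv);             /* or cv_broadcast() to wake all waiters */
    mutex_exit(&mdp->md_lock);
}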
/*ARGSUSED*/
int
dtrace_getstackdepth(int aframes)
{
#if 1
    TODO();
    return 0;
#else
    struct frame *fp = (struct frame *)dtrace_getfp();
    struct frame *nextfp, *minfp, *stacktop;
    int depth = 0;
    int is_intr = 0;
    int on_intr;
    uintptr_t pc;

    if ((on_intr = CPU_ON_INTR(CPU)) != 0)
        stacktop = (struct frame *)(CPU->cpu_intr_stack + SA(MINFRAME));
    else
        stacktop = (struct frame *)curthread->t_stk;
    minfp = fp;

    aframes++;

    for (;;) {
        depth++;

        if (is_intr) {
            struct regs *rp = (struct regs *)fp;
            nextfp = (struct frame *)rp->r_fp;
            pc = rp->r_pc;
        } else {
            nextfp = (struct frame *)fp->fr_savfp;
            pc = fp->fr_savpc;
        }

        if (nextfp <= minfp || nextfp >= stacktop) {
            if (on_intr) {
                /*
                 * Hop from interrupt stack to thread stack.
                 */
                stacktop = (struct frame *)curthread->t_stk;
                minfp = (struct frame *)curthread->t_stkbase;
                on_intr = 0;
                continue;
            }
            break;
        }

        is_intr = pc - (uintptr_t)_interrupt < _interrupt_size ||
            pc - (uintptr_t)_allsyscalls < _allsyscalls_size ||
            pc - (uintptr_t)_cmntrap < _cmntrap_size;
        fp = nextfp;
        minfp = fp;
    }

    if (depth <= aframes)
        return (0);

    return (depth - aframes);
#endif
}
void
new_cpu_mstate(int cmstate, hrtime_t curtime)
{
    cpu_t *cpu = CPU;
    uint16_t gen;

    ASSERT(cpu->cpu_mstate != CMS_DISABLED);
    ASSERT(cmstate < NCMSTATES);
    ASSERT(cmstate != CMS_DISABLED);

    /*
     * This function cannot be re-entrant on a given CPU.  As such,
     * we ASSERT and panic if we are called on behalf of an interrupt.
     * The one exception is for an interrupt which has previously
     * blocked.  Such an interrupt is being scheduled by the dispatcher
     * just like a normal thread, and as such cannot arrive here
     * in a re-entrant manner.
     */
    ASSERT(!CPU_ON_INTR(cpu) && curthread->t_intr == NULL);
    ASSERT(curthread->t_preempt > 0 || curthread == cpu->cpu_idle_thread);

    /*
     * LOCKING, or lack thereof:
     *
     * Updates to CPU mstate can only be made by the CPU
     * itself, and the above check to ignore interrupts
     * should prevent recursion into this function on a given
     * processor, i.e. no possible write contention.
     *
     * However, reads of CPU mstate can occur at any time
     * from any CPU.  Any locking added to this code path
     * would seriously impact syscall performance.  So,
     * instead we have a best-effort protection for readers.
     * The reader will want to account for any time between
     * cpu_mstate_start and the present time.  This requires
     * some guarantees that the reader is getting coherent
     * information.
     *
     * We use a generation counter, which is set to 0 before
     * we start making changes, and is set to a new value
     * after we're done.  Someone reading the CPU mstate
     * should check for the same non-zero value of this
     * counter both before and after reading all state.  The
     * important point is that the reader is not a
     * performance-critical path, but this function is.
     *
     * The ordering of writes is critical.  cpu_mstate_gen must
     * be visibly zero on all CPUs before we change cpu_mstate
     * and cpu_mstate_start.  Additionally, cpu_mstate_gen must
     * not be restored to oldgen+1 until after all of the other
     * writes have become visible.
     *
     * Normally one would use membar_producer() calls to accomplish
     * this.  Unfortunately this routine is extremely performance
     * critical (esp. in syscall_mstate below) and we cannot
     * afford the additional time, particularly on some x86
     * architectures with extremely slow sfence calls.  On a
     * CPU which guarantees write ordering (including sparc, x86,
     * and amd64) this is not a problem.  The compiler could still
     * reorder the writes, so we make the four cpu fields
     * volatile to prevent this.
     *
     * TSO warning: should we port to a non-TSO (or equivalent)
     * CPU, this will break.
     *
     * The reader still needs the membar_consumer() calls because,
     * although the volatiles prevent the compiler from reordering
     * loads, the CPU can still do so.
     */

    NEW_CPU_MSTATE(cmstate);
}
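/*
 * A minimal reader-side sketch (not part of the original source) of the
 * generation-counter protocol described in the comment above: sample
 * cpu_mstate_gen, read the state, then re-check the counter and retry if
 * it was zero or changed.  The function name "read_cpu_mstate" and the
 * exact set of fields copied are illustrative; the membar_consumer()
 * calls are the ones the comment says readers must supply.
 */
static void
read_cpu_mstate(cpu_t *cp, int *statep, hrtime_t *startp)
{
    uint16_t gen;

    do {
        gen = cp->cpu_mstate_gen;       /* zero while a writer is mid-update */
        membar_consumer();
        *statep = cp->cpu_mstate;
        *startp = cp->cpu_mstate_start;
        membar_consumer();
    } while (gen == 0 || gen != cp->cpu_mstate_gen);
}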
void
dtrace_getpcstack(pc_t *pcstack, int pcstack_limit, int aframes,
    uint32_t *intrpc)
{
    struct frame *fp = (struct frame *)__builtin_frame_address(0);
    struct frame *nextfp, *minfp, *stacktop;
    int depth = 0;
    int last = 0;
    uintptr_t pc;
    uintptr_t caller = CPU->cpu_dtrace_caller;
    int on_intr;

    if ((on_intr = CPU_ON_INTR(CPU)) != 0)
        stacktop = (struct frame *)dtrace_get_cpu_int_stack_top();
    else
        stacktop = (struct frame *)(dtrace_get_kernel_stack(current_thread()) +
            kernel_stack_size);

    minfp = fp;

    aframes++;

    if (intrpc != NULL && depth < pcstack_limit)
        pcstack[depth++] = (pc_t)intrpc;

    while (depth < pcstack_limit) {
        nextfp = *(struct frame **)fp;
#if defined(__x86_64__)
        pc = *(uintptr_t *)(((uintptr_t)fp) + RETURN_OFFSET64);
#else
        pc = *(uintptr_t *)(((uintptr_t)fp) + RETURN_OFFSET);
#endif

        if (nextfp <= minfp || nextfp >= stacktop) {
            if (on_intr) {
                /*
                 * Hop from interrupt stack to thread stack.
                 */
                vm_offset_t kstack_base =
                    dtrace_get_kernel_stack(current_thread());

                minfp = (struct frame *)kstack_base;
                stacktop = (struct frame *)(kstack_base + kernel_stack_size);

                on_intr = 0;
                continue;
            }
            /*
             * This is the last frame we can process; indicate
             * that we should return after processing this frame.
             */
            last = 1;
        }

        if (aframes > 0) {
            if (--aframes == 0 && caller != 0) {
                /*
                 * We've just run out of artificial frames,
                 * and we have a valid caller -- fill it in
                 * now.
                 */
                ASSERT(depth < pcstack_limit);
                pcstack[depth++] = (pc_t)caller;
                caller = 0;
            }
        } else {
            if (depth < pcstack_limit)
                pcstack[depth++] = (pc_t)pc;
        }

        if (last) {
            while (depth < pcstack_limit)
                pcstack[depth++] = 0;
            return;
        }

        fp = nextfp;
        minfp = fp;
    }
}
void
panicsys(const char *format, va_list alist, struct regs *rp, int on_panic_stack)
{
    int s = spl8();
    kthread_t *t = curthread;
    cpu_t *cp = CPU;

    caddr_t intr_stack = NULL;
    uint_t intr_actv;

    ushort_t schedflag = t->t_schedflag;
    cpu_t *bound_cpu = t->t_bound_cpu;
    char preempt = t->t_preempt;

    (void) setjmp(&t->t_pcb);
    t->t_flag |= T_PANIC;

    t->t_schedflag |= TS_DONT_SWAP;
    t->t_bound_cpu = cp;
    t->t_preempt++;

    /*
     * Switch lbolt to event driven mode.
     */
    lbolt_hybrid = lbolt_event_driven;

    panic_enter_hw(s);

    /*
     * If we're on the interrupt stack and an interrupt thread is available
     * in this CPU's pool, preserve the interrupt stack by detaching an
     * interrupt thread and making its stack the intr_stack.
     */
    if (CPU_ON_INTR(cp) && cp->cpu_intr_thread != NULL) {
        kthread_t *it = cp->cpu_intr_thread;

        intr_stack = cp->cpu_intr_stack;
        intr_actv = cp->cpu_intr_actv;

        cp->cpu_intr_stack = thread_stk_init(it->t_stk);
        cp->cpu_intr_thread = it->t_link;

        /*
         * Clear only the high level bits of cpu_intr_actv.
         * We want to indicate that high-level interrupts are
         * not active without destroying the low-level interrupt
         * information stored there.
         */
        cp->cpu_intr_actv &= ((1 << (LOCK_LEVEL + 1)) - 1);
    }

    /*
     * Record one-time panic information and quiesce the other CPUs.
     * Then print out the panic message and stack trace.
     */
    if (on_panic_stack) {
        panic_data_t *pdp = (panic_data_t *)panicbuf;

        pdp->pd_version = PANICBUFVERS;
        pdp->pd_msgoff = sizeof (panic_data_t) - sizeof (panic_nv_t);

        if (t->t_panic_trap != NULL)
            panic_savetrap(pdp, t->t_panic_trap);
        else
            panic_saveregs(pdp, rp);

        (void) vsnprintf(&panicbuf[pdp->pd_msgoff],
            PANICBUFSIZE - pdp->pd_msgoff, format, alist);

        /*
         * Call into the platform code to stop the other CPUs.
         * We currently have all interrupts blocked, and expect that
         * the platform code will lower ipl only as far as needed to
         * perform cross-calls, and will acquire as *few* locks as is
         * possible -- panicstr is not set so we can still deadlock.
         */
        panic_stopcpus(cp, t, s);

        panicstr = (char *)format;
        va_copy(panicargs, alist);
        panic_lbolt = LBOLT_NO_ACCOUNT;
        panic_lbolt64 = LBOLT_NO_ACCOUNT64;
        panic_hrestime = hrestime;
        panic_hrtime = gethrtime_waitfree();
        panic_thread = t;
        panic_regs = t->t_pcb;
        panic_reg = rp;
        panic_cpu = *cp;
        panic_ipl = spltoipl(s);
        panic_schedflag = schedflag;
        panic_bound_cpu = bound_cpu;
        panic_preempt = preempt;

        if (intr_stack != NULL) {
            panic_cpu.cpu_intr_stack = intr_stack;
            panic_cpu.cpu_intr_actv = intr_actv;
        }

        /*
         * Lower ipl to 10 to keep clock() from running, but allow
         * keyboard interrupts to enter the debugger.  These callbacks
         * are executed with panicstr set so they can bypass locks.
         */
        splx(ipltospl(CLOCK_LEVEL));
        panic_quiesce_hw(pdp);
        (void) FTRACE_STOP();
        (void) callb_execute_class(CB_CL_PANIC, NULL);

        if (log_intrq != NULL)
            log_flushq(log_intrq);

        /*
         * If log_consq has been initialized and syslogd has started,
         * print any messages in log_consq that haven't been consumed.
         */
        if (log_consq != NULL && log_consq != log_backlogq)
            log_printq(log_consq);

        fm_banner();

#if defined(__x86)
        /*
         * A hypervisor panic originates outside of Solaris, so we
         * don't want to prepend the panic message with misleading
         * pointers from within Solaris.
         */
        if (!IN_XPV_PANIC())
#endif
            printf("\n\rpanic[cpu%d]/thread=%p: ", cp->cpu_id, (void *)t);
        vprintf(format, alist);
        printf("\n\n");

        if (t->t_panic_trap != NULL) {
            panic_showtrap(t->t_panic_trap);
            printf("\n");
        }

        traceregs(rp);
        printf("\n");

        if (((boothowto & RB_DEBUG) || obpdebug) &&
            !nopanicdebug && !panic_forced) {
            if (dumpvp != NULL) {
                debug_enter("panic: entering debugger "
                    "(continue to save dump)");
            } else {
                debug_enter("panic: entering debugger "
                    "(no dump device, continue to reboot)");
            }
        }

    } else if (panic_dump != 0 || panic_sync != 0 || panicstr != NULL) {
        printf("\n\rpanic[cpu%d]/thread=%p: ", cp->cpu_id, (void *)t);
        vprintf(format, alist);
        printf("\n");
    } else
        goto spin;

    /*
     * Prior to performing sync or dump, we make sure that do_polled_io is
     * set, but we'll leave ipl at 10; deadman(), a CY_HIGH_LEVEL cyclic,
     * will re-enter panic if we are not making progress with sync or dump.
     */

    /*
     * Sync the filesystems.  Reset t_cred if not set because much of
     * the filesystem code depends on CRED() being valid.
     */
    if (!in_sync && panic_trigger(&panic_sync)) {
        if (t->t_cred == NULL)
            t->t_cred = kcred;
        splx(ipltospl(CLOCK_LEVEL));
        do_polled_io = 1;
        vfs_syncall();
    }

    /*
     * Take the crash dump.  If the dump trigger is already set, try to
     * enter the debugger again before rebooting the system.
     */
    if (panic_trigger(&panic_dump)) {
        panic_dump_hw(s);
        splx(ipltospl(CLOCK_LEVEL));
        errorq_panic();
        do_polled_io = 1;
        dumpsys();
    } else if (((boothowto & RB_DEBUG) || obpdebug) && !nopanicdebug) {
        debug_enter("panic: entering debugger (continue to reboot)");
    } else
        printf("dump aborted: please record the above information!\n");

    if (halt_on_panic)
        mdboot(A_REBOOT, AD_HALT, NULL, B_FALSE);
    else
        mdboot(A_REBOOT, panic_bootfcn, panic_bootstr, B_FALSE);
spin:
    /*
     * Restore ipl to at most CLOCK_LEVEL so we don't end up spinning
     * and unable to jump into the debugger.
     */
    splx(MIN(s, ipltospl(CLOCK_LEVEL)));
    for (;;)
        ;
}
static void
clock_tick_process(cpu_t *cp, clock_t mylbolt, int pending)
{
    kthread_t *t;
    kmutex_t *plockp;
    int notick, intr;
    klwp_id_t lwp;

    /*
     * The locking here is rather tricky.  thread_free_prevent()
     * prevents the thread returned from being freed while we
     * are looking at it.  We can then check if the thread
     * is exiting and get the appropriate p_lock if it
     * is not.  We have to be careful, though, because
     * the _process_ can still be freed while we've
     * prevented thread free.  To avoid touching the
     * proc structure we put a pointer to the p_lock in the
     * thread structure.  The p_lock is persistent so we
     * can acquire it even if the process is gone.  At that
     * point we can check (again) if the thread is exiting
     * and either drop the lock or do the tick processing.
     */
    t = cp->cpu_thread;     /* Current running thread */
    if (CPU == cp) {
        /*
         * 't' will be the tick processing thread on this
         * CPU.  Use the pinned thread (if any) on this CPU
         * as the target of the clock tick.
         */
        if (t->t_intr != NULL)
            t = t->t_intr;
    }

    /*
     * We use thread_free_prevent to keep the currently running
     * thread from being freed or recycled while we're
     * looking at it.
     */
    thread_free_prevent(t);
    /*
     * We cannot hold the cpu_lock to prevent the
     * cpu_active from changing in the clock interrupt.
     * As long as we don't block (or don't get pre-empted)
     * the cpu_list will not change (all threads are paused
     * before list modification).
     */
    if (CLOCK_TICK_CPU_OFFLINE(cp)) {
        thread_free_allow(t);
        return;
    }

    /*
     * Make sure the thread is still on the CPU.
     */
    if ((t != cp->cpu_thread) &&
        ((cp != CPU) || (t != cp->cpu_thread->t_intr))) {
        /*
         * We could not locate the thread.  Skip this CPU.  Race
         * conditions while performing these checks are benign.
         * These checks are not perfect and they don't need
         * to be.
         */
        thread_free_allow(t);
        return;
    }

    intr = t->t_flag & T_INTR_THREAD;
    lwp = ttolwp(t);
    if (lwp == NULL || (t->t_proc_flag & TP_LWPEXIT) || intr) {
        /*
         * Thread is exiting (or uninteresting) so don't
         * do tick processing.
         */
        thread_free_allow(t);
        return;
    }

    /*
     * OK, try to grab the process lock.  See
     * comments above for why we're not using
     * ttoproc(t)->p_lockp here.
     */
    plockp = t->t_plockp;
    mutex_enter(plockp);
    /* See above comment. */
    if (CLOCK_TICK_CPU_OFFLINE(cp)) {
        mutex_exit(plockp);
        thread_free_allow(t);
        return;
    }

    /*
     * The thread may have exited between when we
     * checked above, and when we got the p_lock.
     */
    if (t->t_proc_flag & TP_LWPEXIT) {
        mutex_exit(plockp);
        thread_free_allow(t);
        return;
    }

    /*
     * Either we have the p_lock for the thread's process,
     * or we don't care about the thread structure any more.
     * Either way we can allow thread free.
     */
    thread_free_allow(t);

    /*
     * If we haven't done tick processing for this
     * lwp, then do it now.  Since we don't hold the
     * lwp down on a CPU it can migrate and show up
     * more than once, hence the lbolt check.  mylbolt
     * is copied at the time of tick scheduling to prevent
     * lbolt mismatches.
     *
     * Also, make sure that it's okay to perform the
     * tick processing before calling clock_tick.
     * Setting notick to a TRUE value (i.e. not 0)
     * results in tick processing not being performed for
     * that thread.
     */
    notick = ((cp->cpu_flags & CPU_QUIESCED) || CPU_ON_INTR(cp) ||
        (cp->cpu_dispthread == cp->cpu_idle_thread));

    if ((!notick) && (t->t_lbolt < mylbolt)) {
        t->t_lbolt = mylbolt;
        clock_tick(t, pending);
    }

    mutex_exit(plockp);
}
/*
 * mutex_vector_enter() is called from the assembly mutex_enter() routine
 * if the lock is held or is not of type MUTEX_ADAPTIVE.
 */
void
mutex_vector_enter(mutex_impl_t *lp)
{
    kthread_id_t owner;
    hrtime_t sleep_time = 0;    /* how long we slept */
    uint_t spin_count = 0;      /* how many times we spun */
    cpu_t *cpup, *last_cpu;
    extern cpu_t *cpu_list;
    turnstile_t *ts;
    volatile mutex_impl_t *vlp = (volatile mutex_impl_t *)lp;
    int backoff;                /* current backoff */
    int backctr;                /* ctr for backoff */
    int sleep_count = 0;

    ASSERT_STACK_ALIGNED();

    if (MUTEX_TYPE_SPIN(lp)) {
        lock_set_spl(&lp->m_spin.m_spinlock, lp->m_spin.m_minspl,
            &lp->m_spin.m_oldspl);
        return;
    }

    if (!MUTEX_TYPE_ADAPTIVE(lp)) {
        mutex_panic("mutex_enter: bad mutex", lp);
        return;
    }

    /*
     * Adaptive mutexes must not be acquired from above LOCK_LEVEL.
     * We can migrate after loading CPU but before checking CPU_ON_INTR,
     * so we must verify by disabling preemption and loading CPU again.
     */
    cpup = CPU;
    if (CPU_ON_INTR(cpup) && !panicstr) {
        kpreempt_disable();
        if (CPU_ON_INTR(CPU))
            mutex_panic("mutex_enter: adaptive at high PIL", lp);
        kpreempt_enable();
    }

    CPU_STATS_ADDQ(cpup, sys, mutex_adenters, 1);

    if (&plat_lock_delay) {
        backoff = 0;
    } else {
        backoff = BACKOFF_BASE;
    }

    for (;;) {
spin:
        spin_count++;
        /*
         * Add an exponential backoff delay before trying again
         * to touch the mutex data structure.
         * the spin_count test and call to nulldev are to prevent
         * the compiler optimizer from eliminating the delay loop.
         */
        if (&plat_lock_delay) {
            plat_lock_delay(&backoff);
        } else {
            for (backctr = backoff; backctr; backctr--) {
                if (!spin_count) (void) nulldev();
            };  /* delay */
            backoff = backoff << 1;     /* double it */
            if (backoff > BACKOFF_CAP) {
                backoff = BACKOFF_CAP;
            }

            SMT_PAUSE();
        }

        if (panicstr)
            return;

        if ((owner = MUTEX_OWNER(vlp)) == NULL) {
            if (mutex_adaptive_tryenter(lp))
                break;
            continue;
        }

        if (owner == curthread)
            mutex_panic("recursive mutex_enter", lp);

        /*
         * If lock is held but owner is not yet set, spin.
         * (Only relevant for platforms that don't have cas.)
         */
        if (owner == MUTEX_NO_OWNER)
            continue;

        /*
         * When searching the other CPUs, start with the one where
         * we last saw the owner thread.  If owner is running, spin.
         *
         * We must disable preemption at this point to guarantee
         * that the list doesn't change while we traverse it
         * without the cpu_lock mutex.  While preemption is
         * disabled, we must revalidate our cached cpu pointer.
         */
        kpreempt_disable();
        if (cpup->cpu_next == NULL)
            cpup = cpu_list;
        last_cpu = cpup;    /* mark end of search */
        do {
            if (cpup->cpu_thread == owner) {
                kpreempt_enable();
                goto spin;
            }
        } while ((cpup = cpup->cpu_next) != last_cpu);
        kpreempt_enable();

        /*
         * The owner appears not to be running, so block.
         * See the Big Theory Statement for memory ordering issues.
         */
        ts = turnstile_lookup(lp);
        MUTEX_SET_WAITERS(lp);
        membar_enter();

        /*
         * Recheck whether owner is running after waiters bit hits
         * global visibility (above).  If owner is running, spin.
         *
         * Since we are at ipl DISP_LEVEL, kernel preemption is
         * disabled, however we still need to revalidate our cached
         * cpu pointer to make sure the cpu hasn't been deleted.
         */
        if (cpup->cpu_next == NULL)
            last_cpu = cpup = cpu_list;
        do {
            if (cpup->cpu_thread == owner) {
                turnstile_exit(lp);
                goto spin;
            }
        } while ((cpup = cpup->cpu_next) != last_cpu);
        membar_consumer();

        /*
         * If owner and waiters bit are unchanged, block.
         */
        if (MUTEX_OWNER(vlp) == owner && MUTEX_HAS_WAITERS(vlp)) {
            sleep_time -= gethrtime();
            (void) turnstile_block(ts, TS_WRITER_Q, lp,
                &mutex_sobj_ops, NULL, NULL);
            sleep_time += gethrtime();
            sleep_count++;
        } else {
            turnstile_exit(lp);
        }
    }

    ASSERT(MUTEX_OWNER(lp) == curthread);

    if (sleep_time != 0) {
        /*
         * Note, sleep time is the sum of all the sleeping we
         * did.
         */
        LOCKSTAT_RECORD(LS_MUTEX_ENTER_BLOCK, lp, sleep_time);
    }

    /*
     * We do not count a sleep as a spin.
     */
    if (spin_count > sleep_count)
        LOCKSTAT_RECORD(LS_MUTEX_ENTER_SPIN, lp,
            spin_count - sleep_count);

    LOCKSTAT_RECORD0(LS_MUTEX_ENTER_ACQUIRE, lp);
}
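/*
 * A minimal caller-side sketch (not part of the original source) of how the
 * spin/adaptive split above is usually kept straight at mutex_init() time,
 * per mutex_init(9F): a lock that an interrupt handler will take is
 * initialized with its interrupt priority (so it may be created as a spin
 * mutex and is legal at high PIL), while a NULL argument yields an ordinary
 * adaptive mutex, which mutex_vector_enter() refuses above LOCK_LEVEL.
 * "my_softstate_t" and "my_attach" are hypothetical names.
 */
typedef struct my_softstate {
    kmutex_t ms_hilock;     /* shared with the interrupt handler */
    kmutex_t ms_lock;       /* taken only from base level */
} my_softstate_t;

static void
my_attach(my_softstate_t *ssp, int intr_pri)
{
    /* Interrupt-level lock: pass the handler's priority to mutex_init(). */
    mutex_init(&ssp->ms_hilock, NULL, MUTEX_DRIVER, DDI_INTR_PRI(intr_pri));

    /* Ordinary lock: NULL argument gives an adaptive mutex. */
    mutex_init(&ssp->ms_lock, NULL, MUTEX_DRIVER, NULL);
}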
int
turnstile_block(turnstile_t *ts, int qnum, void *sobj, sobj_ops_t *sobj_ops,
    kmutex_t *mp, lwp_timer_t *lwptp)
{
    kthread_t *owner;
    kthread_t *t = curthread;
    proc_t *p = ttoproc(t);
    klwp_t *lwp = ttolwp(t);
    turnstile_chain_t *tc = &TURNSTILE_CHAIN(sobj);
    int error = 0;
    int loser = 0;

    ASSERT(DISP_LOCK_HELD(&tc->tc_lock));
    ASSERT(mp == NULL || IS_UPI(mp));
    ASSERT((SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) ^ (mp == NULL));

    thread_lock_high(t);

    if (ts == NULL) {
        /*
         * This is the first thread to block on this sobj.
         * Take its attached turnstile and add it to the hash chain.
         */
        ts = t->t_ts;
        ts->ts_sobj = sobj;
        ts->ts_next = tc->tc_first;
        tc->tc_first = ts;
        ASSERT(ts->ts_waiters == 0);
    } else {
        /*
         * Another thread has already donated its turnstile
         * to block on this sobj, so ours isn't needed.
         * Stash it on the active turnstile's freelist.
         */
        turnstile_t *myts = t->t_ts;

        myts->ts_free = ts->ts_free;
        ts->ts_free = myts;
        t->t_ts = ts;
        ASSERT(ts->ts_sobj == sobj);
        ASSERT(ts->ts_waiters > 0);
    }

    /*
     * Put the thread to sleep.
     */
    ASSERT(t != CPU->cpu_idle_thread);
    ASSERT(CPU_ON_INTR(CPU) == 0);
    ASSERT(t->t_wchan0 == NULL && t->t_wchan == NULL);
    ASSERT(t->t_state == TS_ONPROC);

    if (SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) {
        curthread->t_flag |= T_WAKEABLE;
    }
    CL_SLEEP(t);        /* assign kernel priority */
    THREAD_SLEEP(t, &tc->tc_lock);
    t->t_wchan = sobj;
    t->t_sobj_ops = sobj_ops;
    DTRACE_SCHED(sleep);

    if (lwp != NULL) {
        lwp->lwp_ru.nvcsw++;
        (void) new_mstate(t, LMS_SLEEP);
        if (SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) {
            lwp->lwp_asleep = 1;
            lwp->lwp_sysabort = 0;
            /*
             * make wchan0 non-zero to conform to the rule that
             * threads blocking for user-level objects have a
             * non-zero wchan0: this prevents spurious wake-ups
             * by, for example, /proc.
             */
            t->t_wchan0 = (caddr_t)1;
        }
    }
    ts->ts_waiters++;
    sleepq_insert(&ts->ts_sleepq[qnum], t);

    if (SOBJ_TYPE(sobj_ops) == SOBJ_MUTEX &&
        SOBJ_OWNER(sobj_ops, sobj) == NULL)
        panic("turnstile_block(%p): unowned mutex", (void *)ts);

    /*
     * Follow the blocking chain to its end, willing our priority to
     * everyone who's in our way.
     */
    while (t->t_sobj_ops != NULL &&
        (owner = SOBJ_OWNER(t->t_sobj_ops, t->t_wchan)) != NULL) {
        if (owner == curthread) {
            if (SOBJ_TYPE(sobj_ops) != SOBJ_USER_PI) {
                panic("Deadlock: cycle in blocking chain");
            }
            /*
             * If the cycle we've encountered ends in mp,
             * then we know it isn't a 'real' cycle because
             * we're going to drop mp before we go to sleep.
             * Moreover, since we've come full circle we know
             * that we must have willed priority to everyone
             * in our way.  Therefore, we can break out now.
             */
            if (t->t_wchan == (void *)mp)
                break;

            if (loser)
                lock_clear(&turnstile_loser_lock);
            /*
             * For SOBJ_USER_PI, a cycle is an application
             * deadlock which needs to be communicated
             * back to the application.
             */
            thread_unlock_nopreempt(t);
            mutex_exit(mp);
            setrun(curthread);
            swtch();    /* necessary to transition state */

            curthread->t_flag &= ~T_WAKEABLE;
            if (lwptp->lwpt_id != 0)
                (void) lwp_timer_dequeue(lwptp);
            setallwatch();
            lwp->lwp_asleep = 0;
            lwp->lwp_sysabort = 0;
            return (EDEADLK);
        }
        if (!turnstile_interlock(t->t_lockp, &owner->t_lockp)) {
            /*
             * If we failed to grab the owner's thread lock,
             * turnstile_interlock() will have dropped t's
             * thread lock, so at this point we don't even know
             * that 't' exists anymore.  The simplest solution
             * is to restart the entire priority inheritance dance
             * from the beginning of the blocking chain, since
             * we *do* know that 'curthread' still exists.
             * Application of priority inheritance is idempotent,
             * so it's OK that we're doing it more than once.
             * Note also that since we've dropped our thread lock,
             * we may already have been woken up; if so, our
             * t_sobj_ops will be NULL, the loop will terminate,
             * and the call to swtch() will be a no-op.  Phew.
             *
             * There is one further complication: if two (or more)
             * threads keep trying to grab the turnstile locks out
             * of order and keep losing the race to another thread,
             * these "dueling losers" can livelock the system.
             * Therefore, once we get into this rare situation,
             * we serialize all the losers.
             */
            if (loser == 0) {
                loser = 1;
                lock_set(&turnstile_loser_lock);
            }
            t = curthread;
            thread_lock_high(t);
            continue;
        }

        /*
         * We now have the owner's thread lock.  If we are traversing
         * from non-SOBJ_USER_PI ops to SOBJ_USER_PI ops, then we know
         * that we have caught the thread while in the TS_SLEEP state,
         * but holding mp.  We know that this situation is transient
         * (mp will be dropped before the holder actually sleeps on
         * the SOBJ_USER_PI sobj), so we will spin waiting for mp to
         * be dropped.  Then, as in the turnstile_interlock() failure
         * case, we will restart the priority inheritance dance.
         */
        if (SOBJ_TYPE(t->t_sobj_ops) != SOBJ_USER_PI &&
            owner->t_sobj_ops != NULL &&
            SOBJ_TYPE(owner->t_sobj_ops) == SOBJ_USER_PI) {
            kmutex_t *upi_lock = (kmutex_t *)t->t_wchan;

            ASSERT(IS_UPI(upi_lock));
            ASSERT(SOBJ_TYPE(t->t_sobj_ops) == SOBJ_MUTEX);

            if (t->t_lockp != owner->t_lockp)
                thread_unlock_high(owner);
            thread_unlock_high(t);
            if (loser)
                lock_clear(&turnstile_loser_lock);

            while (mutex_owner(upi_lock) == owner) {
                SMT_PAUSE();
                continue;
            }

            if (loser)
                lock_set(&turnstile_loser_lock);
            t = curthread;
            thread_lock_high(t);
            continue;
        }

        turnstile_pi_inherit(t->t_ts, owner, DISP_PRIO(t));
        if (t->t_lockp != owner->t_lockp)
            thread_unlock_high(t);
        t = owner;
    }

    if (loser)
        lock_clear(&turnstile_loser_lock);

    /*
     * Note: 't' and 'curthread' were synonymous before the loop above,
     * but now they may be different.  ('t' is now the last thread in
     * the blocking chain.)
     */
    if (SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) {
        ushort_t s = curthread->t_oldspl;
        int timedwait = 0;
        uint_t imm_timeout = 0;
        clock_t tim = -1;

        thread_unlock_high(t);
        if (lwptp->lwpt_id != 0) {
            /*
             * We enqueued a timeout.  If it has already fired,
             * lwptp->lwpt_imm_timeout has been set with cas,
             * so fetch it with cas.
             */
            timedwait = 1;
            imm_timeout = atomic_cas_uint(&lwptp->lwpt_imm_timeout, 0, 0);
        }
        mutex_exit(mp);
        splx(s);

        if (ISSIG(curthread, JUSTLOOKING) ||
            MUSTRETURN(p, curthread) || imm_timeout)
            setrun(curthread);
        swtch();
        curthread->t_flag &= ~T_WAKEABLE;
        if (timedwait)
            tim = lwp_timer_dequeue(lwptp);
        setallwatch();
        if (ISSIG(curthread, FORREAL) || lwp->lwp_sysabort ||
            MUSTRETURN(p, curthread))
            error = EINTR;
        else if (imm_timeout || (timedwait && tim == -1))
            error = ETIME;
        lwp->lwp_sysabort = 0;
        lwp->lwp_asleep = 0;
    } else {
        thread_unlock_nopreempt(t);
        swtch();
    }

    return (error);
}
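/*
 * A minimal sketch (not part of the original source) of how a sleeping-lock
 * acquire slow path drives turnstile_block(), mirroring the sequence
 * mutex_vector_enter() uses above: turnstile_lookup() hashes the sync object
 * and takes the chain lock, the waiters bit is set and the lock is re-tried,
 * and either turnstile_block() sleeps or turnstile_exit() backs out.  The
 * names "my_lock_t", "my_lock_try", "my_lock_set_waiters", and
 * "my_lock_sobj_ops" are hypothetical.
 */
static void
my_lock_acquire_slow(my_lock_t *lp)
{
    turnstile_t *ts;

    for (;;) {
        if (my_lock_try(lp))            /* fast path: got the lock */
            return;

        ts = turnstile_lookup(lp);      /* hashes lp, takes the chain lock */

        /* Re-check under the chain lock; back out if the state changed. */
        if (!my_lock_set_waiters(lp) || my_lock_try(lp)) {
            turnstile_exit(lp);
            continue;
        }

        /* Sleep until the owner wakes waiters on lp, then retry. */
        (void) turnstile_block(ts, TS_WRITER_Q, lp,
            &my_lock_sobj_ops, NULL, NULL);
    }
}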