static void
fipe_ioat_cancel(void)
{
	uint32_t status;
	uint8_t *addr = fipe_ioat_ctrl.ioat_reg_addr;
	ddi_acc_handle_t handle = fipe_ioat_ctrl.ioat_reg_handle;

	/*
	 * Reset channel. Sometimes reset is not reliable,
	 * so check completion or abort status after reset.
	 */
	/* LINTED: constant in conditional context */
	while (1) {
		/* Issue reset channel command. */
		ddi_put8(handle, (uint8_t *)(addr + FIPE_IOAT_CHAN_CMD), 0x20);

		/* Query command status. */
		status = ddi_get32(handle,
		    (uint32_t *)(addr + FIPE_IOAT_CHAN_STS_LO));
		if (status & 0x1) {
			/* Reset channel completed. */
			break;
		} else {
			SMT_PAUSE();
		}
	}

	/* Put channel into "not in use" state. */
	ddi_put16(handle, (uint16_t *)(addr + FIPE_IOAT_CHAN_CTRL), 0);
}
static void
fipe_disable(void)
{
	/*
	 * Try to acquire lock, which also implicitly has the same effect
	 * of calling membar_sync().
	 */
	while (mutex_tryenter(&fipe_gbl_ctrl.lock) == 0) {
		/*
		 * If power saving is inactive, just return and all dirty
		 * house-keeping work will be handled in fipe_enable().
		 */
		if (fipe_gbl_ctrl.pm_active == B_FALSE) {
			return;
		} else {
			(void) SMT_PAUSE();
		}
	}

	/* Disable power saving if it's active. */
	if (fipe_gbl_ctrl.pm_active) {
		/*
		 * Set pm_active to FALSE as soon as possible to prevent
		 * other CPUs from waiting on pm_active flag.
		 */
		fipe_gbl_ctrl.pm_active = B_FALSE;
		membar_producer();
		fipe_mc_restore();
		fipe_ioat_cancel();
	}

	mutex_exit(&fipe_gbl_ctrl.lock);
}
void
lock_set_spl_spin(lock_t *lp, int new_pil, ushort_t *old_pil_addr,
    int old_pil)
{
	int spin_count = 1;
	int backoff;	/* current backoff */
	int backctr;	/* ctr for backoff */

	if (panicstr)
		return;

	if (ncpus == 1)
		panic("lock_set_spl: %p lock held and only one CPU", lp);

	ASSERT(new_pil > LOCK_LEVEL);

	if (&plat_lock_delay) {
		backoff = 0;
	} else {
		backoff = BACKOFF_BASE;
	}
	do {
		splx(old_pil);
		while (LOCK_HELD(lp)) {
			if (panicstr) {
				*old_pil_addr = (ushort_t)splr(new_pil);
				return;
			}
			spin_count++;
			/*
			 * Add an exponential backoff delay before trying again
			 * to touch the mutex data structure.
			 * spin_count test and call to nulldev are to prevent
			 * compiler optimizer from eliminating the delay loop.
			 */
			if (&plat_lock_delay) {
				plat_lock_delay(&backoff);
			} else {
				for (backctr = backoff; backctr; backctr--) {
					if (!spin_count)
						(void) nulldev();
				}
				backoff = backoff << 1;	/* double it */
				if (backoff > BACKOFF_CAP) {
					backoff = BACKOFF_CAP;
				}
				SMT_PAUSE();
			}
		}
		old_pil = splr(new_pil);
	} while (!lock_spin_try(lp));

	*old_pil_addr = (ushort_t)old_pil;

	if (spin_count) {
		LOCKSTAT_RECORD(LS_LOCK_SET_SPL_SPIN, lp, spin_count);
	}

	LOCKSTAT_RECORD(LS_LOCK_SET_SPL_ACQUIRE, lp, spin_count);
}
/*ARGSUSED*/
static void
acpi_cpu_check_wakeup(void *arg)
{
	/*
	 * Toggle interrupt flag to detect pending interrupts.
	 * If interrupt happened, do_interrupt() will notify CPU idle
	 * notification framework so no need to call cpu_idle_exit() here.
	 */
	sti();
	SMT_PAUSE();
	cli();
}
/*
 * Spin until either start_cpus() wakes us up, or we get a request to
 * enter the safe phase (followed by a later start_cpus()).
 */
void
mach_cpu_pause(volatile char *safe)
{
	*safe = PAUSE_WAIT;
	membar_enter();

	while (*safe != PAUSE_IDLE) {
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
		SMT_PAUSE();
	}
}
/*
 * When we apply priority inheritance, we must grab the owner's thread lock
 * while already holding the waiter's thread lock.  If both thread locks are
 * turnstile locks, this can lead to deadlock: while we hold L1 and try to
 * grab L2, some unrelated thread may be applying priority inheritance to
 * some other blocking chain, holding L2 and trying to grab L1.  The most
 * obvious solution -- do a lock_try() for the owner lock -- isn't quite
 * sufficient because it can cause livelock: each thread may hold one lock,
 * try to grab the other, fail, bail out, and try again, looping forever.
 * To prevent livelock we must define a winner, i.e. define an arbitrary
 * lock ordering on the turnstile locks.  For simplicity we declare that
 * virtual address order defines lock order, i.e. if L1 < L2, then the
 * correct lock ordering is L1, L2.  Thus the thread that holds L1 and
 * wants L2 should spin until L2 is available, but the thread that holds
 * L2 and can't get L1 on the first try must drop L2 and return failure.
 * Moreover, the losing thread must not reacquire L2 until the winning
 * thread has had a chance to grab it; to ensure this, the losing thread
 * must grab L1 after dropping L2, thus spinning until the winner is done.
 * Complicating matters further, note that the owner's thread lock pointer
 * can change (i.e. be pointed at a different lock) while we're trying to
 * grab it.  If that happens, we must unwind our state and try again.
 *
 * On success, returns 1 with both locks held.
 * On failure, returns 0 with neither lock held.
 */
static int
turnstile_interlock(lock_t *wlp, lock_t *volatile *olpp)
{
	ASSERT(LOCK_HELD(wlp));

	for (;;) {
		volatile lock_t *olp = *olpp;

		/*
		 * If the locks are identical, there's nothing to do.
		 */
		if (olp == wlp)
			return (1);
		if (lock_try((lock_t *)olp)) {
			/*
			 * If 'olp' is still the right lock, return success.
			 * Otherwise, drop 'olp' and try the dance again.
			 */
			if (olp == *olpp)
				return (1);
			lock_clear((lock_t *)olp);
		} else {
			hrtime_t spin_time = 0;
			/*
			 * If we're grabbing the locks out of order, we lose.
			 * Drop the waiter's lock, and then grab and release
			 * the owner's lock to ensure that we won't retry
			 * until the winner is done (as described above).
			 */
			if (olp >= (lock_t *)turnstile_table && olp < wlp) {
				lock_clear(wlp);
				lock_set((lock_t *)olp);
				lock_clear((lock_t *)olp);
				return (0);
			}
			/*
			 * We're grabbing the locks in the right order,
			 * so spin until the owner's lock either becomes
			 * available or spontaneously changes.
			 */
			spin_time =
			    LOCKSTAT_START_TIME(LS_TURNSTILE_INTERLOCK_SPIN);
			while (olp == *olpp && LOCK_HELD(olp)) {
				if (panicstr)
					return (1);
				SMT_PAUSE();
			}
			LOCKSTAT_RECORD_TIME(LS_TURNSTILE_INTERLOCK_SPIN,
			    olp, spin_time);
		}
	}
}
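/*
 * Illustrative sketch only, not illumos code: the "lower address wins" rule
 * used by turnstile_interlock() above, restated over two plain C11
 * atomic_flag spin locks.  All names here (try_lock_ordered, spin_lock,
 * spin_unlock) are hypothetical.  The caller already holds 'held' and wants
 * 'wanted' as well; if the locks are being taken in address order we may
 * spin, otherwise we must back off so the winner can finish.
 */
#include <stdatomic.h>
#include <stdbool.h>

static void
spin_lock(atomic_flag *l)
{
	while (atomic_flag_test_and_set_explicit(l, memory_order_acquire))
		;	/* spin; a real lock would pause/back off here */
}

static void
spin_unlock(atomic_flag *l)
{
	atomic_flag_clear_explicit(l, memory_order_release);
}

/*
 * Returns true with both locks held, or false with neither lock held.
 */
static bool
try_lock_ordered(atomic_flag *held, atomic_flag *wanted)
{
	if (held < wanted) {
		/* In address order: we are the winner, so just spin. */
		spin_lock(wanted);
		return (true);
	}
	if (!atomic_flag_test_and_set_explicit(wanted, memory_order_acquire))
		return (true);	/* got it on the first try anyway */
	/*
	 * Out of order and contended: we lose.  Drop 'held', then grab and
	 * release 'wanted' so we cannot retry before the winner has had its
	 * chance (mirrors the lock_set()/lock_clear() pair above).
	 */
	spin_unlock(held);
	spin_lock(wanted);
	spin_unlock(wanted);
	return (false);
}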
/*
 * Simple C support for the cases where spin locks miss on the first try.
 */
void
lock_set_spin(lock_t *lp)
{
	int spin_count = 1;
	int backoff;	/* current backoff */
	int backctr;	/* ctr for backoff */

	if (panicstr)
		return;

	if (ncpus == 1)
		panic("lock_set: %p lock held and only one CPU", lp);

	if (&plat_lock_delay) {
		backoff = 0;
	} else {
		backoff = BACKOFF_BASE;
	}

	while (LOCK_HELD(lp) || !lock_spin_try(lp)) {
		if (panicstr)
			return;
		spin_count++;
		/*
		 * Add an exponential backoff delay before trying again
		 * to touch the mutex data structure.
		 * the spin_count test and call to nulldev are to prevent
		 * the compiler optimizer from eliminating the delay loop.
		 */
		if (&plat_lock_delay) {
			plat_lock_delay(&backoff);
		} else {
			/* delay */
			for (backctr = backoff; backctr; backctr--) {
				if (!spin_count)
					(void) nulldev();
			}

			backoff = backoff << 1;	/* double it */
			if (backoff > BACKOFF_CAP) {
				backoff = BACKOFF_CAP;
			}

			SMT_PAUSE();
		}
	}

	if (spin_count) {
		LOCKSTAT_RECORD(LS_LOCK_SET_SPIN, lp, spin_count);
	}

	LOCKSTAT_RECORD0(LS_LOCK_SET_ACQUIRE, lp);
}
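/*
 * Illustrative sketch only, not illumos code: the capped exponential backoff
 * used by lock_set_spin() and the other spin paths, isolated as a standalone
 * helper.  BACKOFF_BASE_SKETCH, BACKOFF_CAP_SKETCH and cpu_pause() are
 * hypothetical stand-ins for BACKOFF_BASE, BACKOFF_CAP and SMT_PAUSE().
 */
#include <stdatomic.h>

#define	BACKOFF_BASE_SKETCH	50
#define	BACKOFF_CAP_SKETCH	1600

static inline void
cpu_pause(void)
{
#if defined(__x86_64__) || defined(__i386__)
	__asm__ __volatile__("pause");
#endif
}

static void
spin_lock_backoff(atomic_flag *lp)
{
	int backoff = BACKOFF_BASE_SKETCH;
	int backctr;

	while (atomic_flag_test_and_set_explicit(lp, memory_order_acquire)) {
		/*
		 * Busy-wait for roughly 'backoff' iterations, then double the
		 * delay (up to the cap) so that contending CPUs spread out
		 * their retries instead of hammering the lock's cache line.
		 */
		for (backctr = backoff; backctr; backctr--)
			cpu_pause();
		backoff <<= 1;
		if (backoff > BACKOFF_CAP_SKETCH)
			backoff = BACKOFF_CAP_SKETCH;
	}
}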
void
mp_enter_barrier(void)
{
	hrtime_t last_poke_time = 0;
	int poke_allowed = 0;
	int done = 0;
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL);

	while (!done) {
		done = 1;
		poke_allowed = 0;

		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
			last_poke_time = xpv_gethrtime();
			poke_allowed = 1;
		}

		for (i = 0; i < NCPU; i++) {
			cpu_t *cp = cpu_get(i);

			if (cp == NULL || cp == CPU)
				continue;

			switch (cpu_phase[i]) {
			case CPU_PHASE_NONE:
				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
				poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_WAIT_SAFE:
				if (poke_allowed)
					poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_SAFE:
			case CPU_PHASE_POWERED_OFF:
				break;
			}
		}

		SMT_PAUSE();
	}
}
/*
 * Reach a point at which the CPU can be safely powered-off or
 * suspended.  Nothing can wake this CPU out of the loop.
 */
static void
enter_safe_phase(void)
{
	ulong_t flags = intr_clear();

	if (setjmp(&curthread->t_pcb) == 0) {
		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
			SMT_PAUSE();
	}

	ASSERT(!interrupts_enabled());

	intr_restore(flags);
}
void
mach_cpu_pause(volatile char *safe)
{
	/*
	 * This cpu is now safe.
	 */
	*safe = PAUSE_WAIT;
	membar_enter();	/* make sure stores are flushed */

	/*
	 * Now we wait.  When we are allowed to continue, safe
	 * will be set to PAUSE_IDLE.
	 */
	while (*safe != PAUSE_IDLE)
		SMT_PAUSE();
}
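/*
 * Illustrative sketch only, not illumos code: the two-sided handshake behind
 * mach_cpu_pause(), reduced to a single per-CPU flag using C11 atomics.
 * PAUSE_IDLE_SKETCH, PAUSE_WAIT_SKETCH, paused_cpu_loop() and resume_cpu()
 * are hypothetical names; the real code additionally routes through
 * pause_cpus()/start_cpus() and per-CPU pause threads.
 */
#include <stdatomic.h>

enum { PAUSE_IDLE_SKETCH = 0, PAUSE_WAIT_SKETCH = 1 };

/* Runs on the CPU being paused. */
static void
paused_cpu_loop(_Atomic int *safe)
{
	/* Announce that this CPU has reached its pause loop... */
	atomic_store_explicit(safe, PAUSE_WAIT_SKETCH, memory_order_release);

	/* ...then spin until the controlling CPU lets it go again. */
	while (atomic_load_explicit(safe, memory_order_acquire) !=
	    PAUSE_IDLE_SKETCH)
		;	/* a real implementation would pause here */
}

/*
 * Runs on the controlling CPU once it no longer needs the other CPU held;
 * the controller typically first waits to observe PAUSE_WAIT_SKETCH before
 * doing its critical work.
 */
static void
resume_cpu(_Atomic int *safe)
{
	atomic_store_explicit(safe, PAUSE_IDLE_SKETCH, memory_order_release);
}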
/*
 * Called by a CPU which has just been onlined.  It is expected that the CPU
 * performing the online operation will call tsc_sync_master().
 *
 * TSC sync is disabled in the context of virtualization.  See comments
 * above tsc_sync_master.
 */
void
tsc_sync_slave(void)
{
	ulong_t flags;
	hrtime_t s1;
	tsc_sync_t *tsc = tscp;
	int cnt;
	int hwtype;

	hwtype = get_hwenv();
	if (!tsc_master_slave_sync_needed || hwtype == HW_XEN_HVM ||
	    hwtype == HW_VMWARE)
		return;

	flags = clear_int_flag();

	for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
		/* Re-fill the cache line */
		s1 = tsc->master_tsc;
		membar_enter();
		tsc_sync_go = TSC_SYNC_GO;
		do {
			/*
			 * Do not put an SMT_PAUSE here.  For instance,
			 * if the master and slave are really the same
			 * hyper-threaded CPU, then you want the master
			 * to yield to the slave as quickly as possible here,
			 * but not the other way.
			 */
			s1 = tsc_read();
		} while (tsc->master_tsc == 0);
		tsc->slave_tsc = s1;
		membar_enter();
		tsc_sync_go = TSC_SYNC_DONE;

		while (tsc_sync_go != TSC_SYNC_STOP)
			SMT_PAUSE();
	}

	restore_int_flag(flags);
}
static void
acpi_cpu_mwait_ipi_check_wakeup(void *arg)
{
	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;

	ASSERT(arg != NULL);
	if (*mcpu_mwait != MWAIT_WAKEUP_IPI) {
		/*
		 * CPU has been awakened, notify CPU idle notification system.
		 */
		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
	} else {
		/*
		 * Toggle interrupt flag to detect pending interrupts.
		 * If interrupt happened, do_interrupt() will notify CPU idle
		 * notification framework so no need to call cpu_idle_exit()
		 * here.
		 */
		sti();
		SMT_PAUSE();
		cli();
	}
}
/*
 * mutex_vector_enter() is called from the assembly mutex_enter() routine
 * if the lock is held or is not of type MUTEX_ADAPTIVE.
 */
void
mutex_vector_enter(mutex_impl_t *lp)
{
	kthread_id_t owner;
	hrtime_t sleep_time = 0;	/* how long we slept */
	uint_t spin_count = 0;		/* how many times we spun */
	cpu_t *cpup, *last_cpu;
	extern cpu_t *cpu_list;
	turnstile_t *ts;
	volatile mutex_impl_t *vlp = (volatile mutex_impl_t *)lp;
	int backoff;	/* current backoff */
	int backctr;	/* ctr for backoff */
	int sleep_count = 0;

	ASSERT_STACK_ALIGNED();

	if (MUTEX_TYPE_SPIN(lp)) {
		lock_set_spl(&lp->m_spin.m_spinlock, lp->m_spin.m_minspl,
		    &lp->m_spin.m_oldspl);
		return;
	}

	if (!MUTEX_TYPE_ADAPTIVE(lp)) {
		mutex_panic("mutex_enter: bad mutex", lp);
		return;
	}

	/*
	 * Adaptive mutexes must not be acquired from above LOCK_LEVEL.
	 * We can migrate after loading CPU but before checking CPU_ON_INTR,
	 * so we must verify by disabling preemption and loading CPU again.
	 */
	cpup = CPU;
	if (CPU_ON_INTR(cpup) && !panicstr) {
		kpreempt_disable();
		if (CPU_ON_INTR(CPU))
			mutex_panic("mutex_enter: adaptive at high PIL", lp);
		kpreempt_enable();
	}

	CPU_STATS_ADDQ(cpup, sys, mutex_adenters, 1);

	if (&plat_lock_delay) {
		backoff = 0;
	} else {
		backoff = BACKOFF_BASE;
	}

	for (;;) {
spin:
		spin_count++;
		/*
		 * Add an exponential backoff delay before trying again
		 * to touch the mutex data structure.
		 * the spin_count test and call to nulldev are to prevent
		 * the compiler optimizer from eliminating the delay loop.
		 */
		if (&plat_lock_delay) {
			plat_lock_delay(&backoff);
		} else {
			/* delay */
			for (backctr = backoff; backctr; backctr--) {
				if (!spin_count)
					(void) nulldev();
			}

			backoff = backoff << 1;	/* double it */
			if (backoff > BACKOFF_CAP) {
				backoff = BACKOFF_CAP;
			}

			SMT_PAUSE();
		}

		if (panicstr)
			return;

		if ((owner = MUTEX_OWNER(vlp)) == NULL) {
			if (mutex_adaptive_tryenter(lp))
				break;
			continue;
		}

		if (owner == curthread)
			mutex_panic("recursive mutex_enter", lp);

		/*
		 * If lock is held but owner is not yet set, spin.
		 * (Only relevant for platforms that don't have cas.)
		 */
		if (owner == MUTEX_NO_OWNER)
			continue;

		/*
		 * When searching the other CPUs, start with the one where
		 * we last saw the owner thread.  If owner is running, spin.
		 *
		 * We must disable preemption at this point to guarantee
		 * that the list doesn't change while we traverse it
		 * without the cpu_lock mutex.  While preemption is
		 * disabled, we must revalidate our cached cpu pointer.
		 */
		kpreempt_disable();
		if (cpup->cpu_next == NULL)
			cpup = cpu_list;
		last_cpu = cpup;	/* mark end of search */
		do {
			if (cpup->cpu_thread == owner) {
				kpreempt_enable();
				goto spin;
			}
		} while ((cpup = cpup->cpu_next) != last_cpu);
		kpreempt_enable();

		/*
		 * The owner appears not to be running, so block.
		 * See the Big Theory Statement for memory ordering issues.
		 */
		ts = turnstile_lookup(lp);
		MUTEX_SET_WAITERS(lp);
		membar_enter();

		/*
		 * Recheck whether owner is running after waiters bit hits
		 * global visibility (above).  If owner is running, spin.
		 *
		 * Since we are at ipl DISP_LEVEL, kernel preemption is
		 * disabled, however we still need to revalidate our cached
		 * cpu pointer to make sure the cpu hasn't been deleted.
		 */
		if (cpup->cpu_next == NULL)
			last_cpu = cpup = cpu_list;
		do {
			if (cpup->cpu_thread == owner) {
				turnstile_exit(lp);
				goto spin;
			}
		} while ((cpup = cpup->cpu_next) != last_cpu);
		membar_consumer();

		/*
		 * If owner and waiters bit are unchanged, block.
		 */
		if (MUTEX_OWNER(vlp) == owner && MUTEX_HAS_WAITERS(vlp)) {
			sleep_time -= gethrtime();
			(void) turnstile_block(ts, TS_WRITER_Q, lp,
			    &mutex_sobj_ops, NULL, NULL);
			sleep_time += gethrtime();
			sleep_count++;
		} else {
			turnstile_exit(lp);
		}
	}

	ASSERT(MUTEX_OWNER(lp) == curthread);

	if (sleep_time != 0) {
		/*
		 * Note, sleep time is the sum of all the sleeping we
		 * did.
		 */
		LOCKSTAT_RECORD(LS_MUTEX_ENTER_BLOCK, lp, sleep_time);
	}

	/*
	 * We do not count a sleep as a spin.
	 */
	if (spin_count > sleep_count)
		LOCKSTAT_RECORD(LS_MUTEX_ENTER_SPIN, lp,
		    spin_count - sleep_count);

	LOCKSTAT_RECORD0(LS_MUTEX_ENTER_ACQUIRE, lp);
}
/*
 * Push out a priority cross call.
 */
static void
xc_priority_common(
	xc_func_t func,
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set)
{
	int i;
	int c;
	struct cpu *cpup;

	/*
	 * Wait briefly for any previous xc_priority to have finished.
	 */
	for (c = 0; c < max_ncpus; ++c) {
		cpup = cpu[c];
		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY))
			continue;

		/*
		 * The value of 40000 here is from old kernel code. It
		 * really should be changed to some time based value, since
		 * under a hypervisor, there's no guarantee a remote CPU
		 * is even scheduled.
		 */
		for (i = 0; BT_TEST(xc_priority_set, c) && i < 40000; ++i)
			SMT_PAUSE();

		/*
		 * Some CPU did not respond to a previous priority request.
		 * It's probably deadlocked with interrupts blocked or some
		 * such problem.  We'll just erase the previous request -
		 * which was most likely a kmdb_enter that has already
		 * expired - and plow ahead.
		 */
		if (BT_TEST(xc_priority_set, c)) {
			XC_BT_CLEAR(xc_priority_set, c);
			if (cpup->cpu_m.xc_work_cnt > 0)
				xc_decrement(&cpup->cpu_m);
		}
	}

	/*
	 * fill in cross call data
	 */
	xc_priority_data.xc_func = func;
	xc_priority_data.xc_a1 = arg1;
	xc_priority_data.xc_a2 = arg2;
	xc_priority_data.xc_a3 = arg3;

	/*
	 * Post messages to all CPUs involved that are CPU_READY
	 * We'll always IPI, plus bang on the xc_msgbox for i86_mwait()
	 */
	for (c = 0; c < max_ncpus; ++c) {
		if (!BT_TEST(set, c))
			continue;
		cpup = cpu[c];
		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY) ||
		    cpup == CPU)
			continue;
		(void) xc_increment(&cpup->cpu_m);
		XC_BT_SET(xc_priority_set, c);
		send_dirint(c, XC_HI_PIL);
		for (i = 0; i < 10; ++i) {
			(void) casptr(&cpup->cpu_m.xc_msgbox,
			    cpup->cpu_m.xc_msgbox, cpup->cpu_m.xc_msgbox);
		}
	}
}
/*ARGSUSED*/
uint_t
xc_serv(caddr_t arg1, caddr_t arg2)
{
	struct machcpu *mcpup = &(CPU->cpu_m);
	xc_msg_t *msg;
	xc_data_t *data;
	xc_msg_t *xc_waiters = NULL;
	uint32_t num_waiting = 0;
	xc_func_t func;
	xc_arg_t a1;
	xc_arg_t a2;
	xc_arg_t a3;
	uint_t rc = DDI_INTR_UNCLAIMED;

	while (mcpup->xc_work_cnt != 0) {
		rc = DDI_INTR_CLAIMED;

		/*
		 * We may have to wait for a message to arrive.
		 */
		for (msg = NULL; msg == NULL;
		    msg = xc_extract(&mcpup->xc_msgbox)) {

			/*
			 * Always check for and handle a priority message.
			 */
			if (BT_TEST(xc_priority_set, CPU->cpu_id)) {
				func = xc_priority_data.xc_func;
				a1 = xc_priority_data.xc_a1;
				a2 = xc_priority_data.xc_a2;
				a3 = xc_priority_data.xc_a3;
				XC_BT_CLEAR(xc_priority_set, CPU->cpu_id);
				xc_decrement(mcpup);
				func(a1, a2, a3);
				if (mcpup->xc_work_cnt == 0)
					return (rc);
			}

			/*
			 * wait for a message to arrive
			 */
			SMT_PAUSE();
		}

		/*
		 * process the message
		 */
		switch (msg->xc_command) {

		/*
		 * ASYNC gives back the message immediately, then we do the
		 * function and return with no more waiting.
		 */
		case XC_MSG_ASYNC:
			data = &cpu[msg->xc_master]->cpu_m.xc_data;
			func = data->xc_func;
			a1 = data->xc_a1;
			a2 = data->xc_a2;
			a3 = data->xc_a3;
			msg->xc_command = XC_MSG_DONE;
			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
			if (func != NULL)
				(void) (*func)(a1, a2, a3);
			xc_decrement(mcpup);
			break;

		/*
		 * SYNC messages do the call, then send it back to the master
		 * in WAITING mode
		 */
		case XC_MSG_SYNC:
			data = &cpu[msg->xc_master]->cpu_m.xc_data;
			if (data->xc_func != NULL)
				(void) (*data->xc_func)(data->xc_a1,
				    data->xc_a2, data->xc_a3);
			msg->xc_command = XC_MSG_WAITING;
			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
			break;

		/*
		 * WAITING messages are collected by the master until all
		 * have arrived.  Once all arrive, we release them back to
		 * the slaves.
		 */
		case XC_MSG_WAITING:
			xc_insert(&xc_waiters, msg);
			if (++num_waiting < mcpup->xc_wait_cnt)
				break;
			while ((msg = xc_extract(&xc_waiters)) != NULL) {
				msg->xc_command = XC_MSG_RELEASED;
				xc_insert(&cpu[msg->xc_slave]->cpu_m.xc_msgbox,
				    msg);
				--num_waiting;
			}
			if (num_waiting != 0)
				panic("wrong number waiting");
			mcpup->xc_wait_cnt = 0;
			break;

		/*
		 * CALL messages do the function and then, like RELEASE,
		 * send the message back to the master as DONE.
		 */
		case XC_MSG_CALL:
			data = &cpu[msg->xc_master]->cpu_m.xc_data;
			if (data->xc_func != NULL)
				(void) (*data->xc_func)(data->xc_a1,
				    data->xc_a2, data->xc_a3);
			/*FALLTHROUGH*/
		case XC_MSG_RELEASED:
			msg->xc_command = XC_MSG_DONE;
			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
			xc_decrement(mcpup);
			break;

		/*
		 * DONE means a slave has completely finished up.
		 * Once we collect all the DONE messages, we'll exit
		 * processing too.
		 */
		case XC_MSG_DONE:
			msg->xc_command = XC_MSG_FREE;
			xc_insert(&mcpup->xc_free, msg);
			xc_decrement(mcpup);
			break;

		case XC_MSG_FREE:
			panic("free message 0x%p in msgbox", (void *)msg);
			break;

		default:
			panic("bad message 0x%p in msgbox", (void *)msg);
			break;
		}
	}
	return (rc);
}
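/*
 * Illustrative sketch only: a minimal CAS-based message box of the general
 * kind xc_serv() uses via xc_insert()/xc_extract() (a Treiber-style linked
 * stack).  This is a guess at the technique, not the illumos implementation;
 * sketch_msg_t and the function names are hypothetical.  A real extract must
 * also cope with the ABA problem, e.g. by guaranteeing a single consumer per
 * message box.
 */
#include <stdatomic.h>
#include <stddef.h>

typedef struct sketch_msg {
	struct sketch_msg *next;
	int command;
} sketch_msg_t;

/* Push a message onto the box with a compare-and-swap loop. */
static void
sketch_insert(sketch_msg_t *_Atomic *box, sketch_msg_t *msg)
{
	sketch_msg_t *old;

	do {
		old = atomic_load(box);
		msg->next = old;
	} while (!atomic_compare_exchange_weak(box, &old, msg));
}

/* Pop one message, or return NULL if the box is empty. */
static sketch_msg_t *
sketch_extract(sketch_msg_t *_Atomic *box)
{
	sketch_msg_t *old;

	do {
		old = atomic_load(box);
		if (old == NULL)
			return (NULL);
	} while (!atomic_compare_exchange_weak(box, &old, old->next));
	return (old);
}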
/*
 * Called by the master in the TSC sync operation (usually the boot CPU).
 * If the slave is discovered to have a skew, gethrtimef will be changed to
 * point to tsc_gethrtime_delta().  Calculating skews is precise only when
 * the master and slave TSCs are read simultaneously; however, there is no
 * algorithm that can read both CPUs in perfect simultaneity.  The proposed
 * algorithm is an approximate method based on the behaviour of cache
 * management.  The slave CPU continuously reads TSC and then reads a global
 * variable which the master CPU updates.  The moment the master's update
 * reaches the slave's visibility (being forced by an mfence operation) we
 * use the TSC reading taken on the slave.  A corresponding TSC read will be
 * taken on the master as soon as possible after finishing the mfence
 * operation.  But the delay between causing the slave to notice the invalid
 * cache line and the completion of mfence is not repeatable.  This error is
 * heuristically assumed to be 1/4th of the total write time as measured by
 * the two TSC reads on the master sandwiching the mfence.  Furthermore, due
 * to the nature of bus arbitration, contention on the memory bus, etc., the
 * time taken for the write to reflect globally can vary a lot.  So instead
 * of taking a single reading, a set of readings is taken and the one with
 * the least write time is chosen to calculate the final skew.
 *
 * TSC sync is disabled in the context of virtualization because the CPUs
 * assigned to the guest are virtual CPUs, which means the real CPUs on which
 * the guest runs keep changing during the lifetime of the guest OS.  So we
 * would end up calculating TSC skews for a set of CPUs during boot whereas
 * the guest might migrate to a different set of physical CPUs at a later
 * point of time.
 */
void
tsc_sync_master(processorid_t slave)
{
	ulong_t flags, source, min_write_time = ~0UL;
	hrtime_t write_time, x, mtsc_after, tdelta;
	tsc_sync_t *tsc = tscp;
	int cnt;
	int hwtype;

	hwtype = get_hwenv();
	if (!tsc_master_slave_sync_needed || hwtype == HW_XEN_HVM ||
	    hwtype == HW_VMWARE)
		return;

	flags = clear_int_flag();
	source = CPU->cpu_id;

	for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
		while (tsc_sync_go != TSC_SYNC_GO)
			SMT_PAUSE();

		tsc->master_tsc = tsc_read();
		membar_enter();
		mtsc_after = tsc_read();
		while (tsc_sync_go != TSC_SYNC_DONE)
			SMT_PAUSE();
		write_time = mtsc_after - tsc->master_tsc;
		if (write_time <= min_write_time) {
			min_write_time = write_time;
			/*
			 * Apply heuristic adjustment only if the calculated
			 * delta is > 1/4th of the write time.
			 */
			x = tsc->slave_tsc - mtsc_after;
			if (x < 0)
				x = -x;
			if (x > (min_write_time/4))
				/*
				 * Subtract 1/4th of the measured write time
				 * from the master's TSC value, as an estimate
				 * of how late the mfence completion came
				 * after the slave noticed the cache line
				 * change.
				 */
				tdelta = tsc->slave_tsc -
				    (mtsc_after - (min_write_time/4));
			else
				tdelta = tsc->slave_tsc - mtsc_after;
			tsc_sync_tick_delta[slave] =
			    tsc_sync_tick_delta[source] - tdelta;
		}

		tsc->master_tsc = tsc->slave_tsc = write_time = 0;
		membar_enter();
		tsc_sync_go = TSC_SYNC_STOP;
	}
	if (tdelta < 0)
		tdelta = -tdelta;
	if (tdelta > largest_tsc_delta)
		largest_tsc_delta = tdelta;
	if (min_write_time < shortest_write_time)
		shortest_write_time = min_write_time;

	/*
	 * Enable delta variants of tsc functions if the largest of all chosen
	 * deltas is > smallest of the write time.
	 */
	if (largest_tsc_delta > shortest_write_time) {
		gethrtimef = tsc_gethrtime_delta;
		gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
	}
	restore_int_flag(flags);
}
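/*
 * Illustrative sketch only, not illumos code: the per-iteration skew estimate
 * made by tsc_sync_master() above, written out as a standalone helper.  The
 * parameter names are hypothetical; they correspond to the master's TSC read
 * before the mfence, the master's read just after it, and the slave's read
 * taken when it observed the store.
 */
#include <stdint.h>

static int64_t
tsc_skew_estimate(int64_t master_before, int64_t master_after,
    int64_t slave_at_observe)
{
	int64_t write_time = master_after - master_before;
	int64_t x = slave_at_observe - master_after;

	if (x < 0)
		x = -x;

	/*
	 * If the apparent delta exceeds 1/4 of the write time, assume the
	 * mfence completed about write_time/4 before the master's second
	 * read and credit the master that much; otherwise use the raw
	 * difference between the slave's reading and the master's.
	 */
	if (x > write_time / 4)
		return (slave_at_observe - (master_after - write_time / 4));
	return (slave_at_observe - master_after);
}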
int
turnstile_block(turnstile_t *ts, int qnum, void *sobj, sobj_ops_t *sobj_ops,
    kmutex_t *mp, lwp_timer_t *lwptp)
{
	kthread_t *owner;
	kthread_t *t = curthread;
	proc_t *p = ttoproc(t);
	klwp_t *lwp = ttolwp(t);
	turnstile_chain_t *tc = &TURNSTILE_CHAIN(sobj);
	int error = 0;
	int loser = 0;

	ASSERT(DISP_LOCK_HELD(&tc->tc_lock));
	ASSERT(mp == NULL || IS_UPI(mp));
	ASSERT((SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) ^ (mp == NULL));

	thread_lock_high(t);

	if (ts == NULL) {
		/*
		 * This is the first thread to block on this sobj.
		 * Take its attached turnstile and add it to the hash chain.
		 */
		ts = t->t_ts;
		ts->ts_sobj = sobj;
		ts->ts_next = tc->tc_first;
		tc->tc_first = ts;
		ASSERT(ts->ts_waiters == 0);
	} else {
		/*
		 * Another thread has already donated its turnstile
		 * to block on this sobj, so ours isn't needed.
		 * Stash it on the active turnstile's freelist.
		 */
		turnstile_t *myts = t->t_ts;
		myts->ts_free = ts->ts_free;
		ts->ts_free = myts;
		t->t_ts = ts;
		ASSERT(ts->ts_sobj == sobj);
		ASSERT(ts->ts_waiters > 0);
	}

	/*
	 * Put the thread to sleep.
	 */
	ASSERT(t != CPU->cpu_idle_thread);
	ASSERT(CPU_ON_INTR(CPU) == 0);
	ASSERT(t->t_wchan0 == NULL && t->t_wchan == NULL);
	ASSERT(t->t_state == TS_ONPROC);

	if (SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) {
		curthread->t_flag |= T_WAKEABLE;
	}
	CL_SLEEP(t);		/* assign kernel priority */
	THREAD_SLEEP(t, &tc->tc_lock);
	t->t_wchan = sobj;
	t->t_sobj_ops = sobj_ops;
	DTRACE_SCHED(sleep);

	if (lwp != NULL) {
		lwp->lwp_ru.nvcsw++;
		(void) new_mstate(t, LMS_SLEEP);
		if (SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) {
			lwp->lwp_asleep = 1;
			lwp->lwp_sysabort = 0;
			/*
			 * make wchan0 non-zero to conform to the rule that
			 * threads blocking for user-level objects have a
			 * non-zero wchan0: this prevents spurious wake-ups
			 * by, for example, /proc.
			 */
			t->t_wchan0 = (caddr_t)1;
		}
	}
	ts->ts_waiters++;
	sleepq_insert(&ts->ts_sleepq[qnum], t);

	if (SOBJ_TYPE(sobj_ops) == SOBJ_MUTEX &&
	    SOBJ_OWNER(sobj_ops, sobj) == NULL)
		panic("turnstile_block(%p): unowned mutex", (void *)ts);

	/*
	 * Follow the blocking chain to its end, willing our priority to
	 * everyone who's in our way.
	 */
	while (t->t_sobj_ops != NULL &&
	    (owner = SOBJ_OWNER(t->t_sobj_ops, t->t_wchan)) != NULL) {
		if (owner == curthread) {
			if (SOBJ_TYPE(sobj_ops) != SOBJ_USER_PI) {
				panic("Deadlock: cycle in blocking chain");
			}
			/*
			 * If the cycle we've encountered ends in mp,
			 * then we know it isn't a 'real' cycle because
			 * we're going to drop mp before we go to sleep.
			 * Moreover, since we've come full circle we know
			 * that we must have willed priority to everyone
			 * in our way.  Therefore, we can break out now.
			 */
			if (t->t_wchan == (void *)mp)
				break;

			if (loser)
				lock_clear(&turnstile_loser_lock);
			/*
			 * For SOBJ_USER_PI, a cycle is an application
			 * deadlock which needs to be communicated
			 * back to the application.
			 */
			thread_unlock_nopreempt(t);
			mutex_exit(mp);
			setrun(curthread);
			swtch(); /* necessary to transition state */
			curthread->t_flag &= ~T_WAKEABLE;
			if (lwptp->lwpt_id != 0)
				(void) lwp_timer_dequeue(lwptp);
			setallwatch();
			lwp->lwp_asleep = 0;
			lwp->lwp_sysabort = 0;
			return (EDEADLK);
		}
		if (!turnstile_interlock(t->t_lockp, &owner->t_lockp)) {
			/*
			 * If we failed to grab the owner's thread lock,
			 * turnstile_interlock() will have dropped t's
			 * thread lock, so at this point we don't even know
			 * that 't' exists anymore.  The simplest solution
			 * is to restart the entire priority inheritance dance
			 * from the beginning of the blocking chain, since
			 * we *do* know that 'curthread' still exists.
			 * Application of priority inheritance is idempotent,
			 * so it's OK that we're doing it more than once.
			 * Note also that since we've dropped our thread lock,
			 * we may already have been woken up; if so, our
			 * t_sobj_ops will be NULL, the loop will terminate,
			 * and the call to swtch() will be a no-op.  Phew.
			 *
			 * There is one further complication: if two (or more)
			 * threads keep trying to grab the turnstile locks out
			 * of order and keep losing the race to another thread,
			 * these "dueling losers" can livelock the system.
			 * Therefore, once we get into this rare situation,
			 * we serialize all the losers.
			 */
			if (loser == 0) {
				loser = 1;
				lock_set(&turnstile_loser_lock);
			}
			t = curthread;
			thread_lock_high(t);
			continue;
		}

		/*
		 * We now have the owner's thread lock.  If we are traversing
		 * from non-SOBJ_USER_PI ops to SOBJ_USER_PI ops, then we know
		 * that we have caught the thread while in the TS_SLEEP state,
		 * but holding mp.  We know that this situation is transient
		 * (mp will be dropped before the holder actually sleeps on
		 * the SOBJ_USER_PI sobj), so we will spin waiting for mp to
		 * be dropped.  Then, as in the turnstile_interlock() failure
		 * case, we will restart the priority inheritance dance.
		 */
		if (SOBJ_TYPE(t->t_sobj_ops) != SOBJ_USER_PI &&
		    owner->t_sobj_ops != NULL &&
		    SOBJ_TYPE(owner->t_sobj_ops) == SOBJ_USER_PI) {
			kmutex_t *upi_lock = (kmutex_t *)t->t_wchan;

			ASSERT(IS_UPI(upi_lock));
			ASSERT(SOBJ_TYPE(t->t_sobj_ops) == SOBJ_MUTEX);

			if (t->t_lockp != owner->t_lockp)
				thread_unlock_high(owner);
			thread_unlock_high(t);
			if (loser)
				lock_clear(&turnstile_loser_lock);

			while (mutex_owner(upi_lock) == owner) {
				SMT_PAUSE();
				continue;
			}

			if (loser)
				lock_set(&turnstile_loser_lock);
			t = curthread;
			thread_lock_high(t);
			continue;
		}

		turnstile_pi_inherit(t->t_ts, owner, DISP_PRIO(t));
		if (t->t_lockp != owner->t_lockp)
			thread_unlock_high(t);
		t = owner;
	}

	if (loser)
		lock_clear(&turnstile_loser_lock);

	/*
	 * Note: 't' and 'curthread' were synonymous before the loop above,
	 * but now they may be different.  ('t' is now the last thread in
	 * the blocking chain.)
	 */
	if (SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) {
		ushort_t s = curthread->t_oldspl;
		int timedwait = 0;
		uint_t imm_timeout = 0;
		clock_t tim = -1;

		thread_unlock_high(t);
		if (lwptp->lwpt_id != 0) {
			/*
			 * We enqueued a timeout.  If it has already fired,
			 * lwptp->lwpt_imm_timeout has been set with cas,
			 * so fetch it with cas.
			 */
			timedwait = 1;
			imm_timeout =
			    atomic_cas_uint(&lwptp->lwpt_imm_timeout, 0, 0);
		}
		mutex_exit(mp);
		splx(s);

		if (ISSIG(curthread, JUSTLOOKING) ||
		    MUSTRETURN(p, curthread) || imm_timeout)
			setrun(curthread);
		swtch();
		curthread->t_flag &= ~T_WAKEABLE;
		if (timedwait)
			tim = lwp_timer_dequeue(lwptp);
		setallwatch();
		if (ISSIG(curthread, FORREAL) || lwp->lwp_sysabort ||
		    MUSTRETURN(p, curthread))
			error = EINTR;
		else if (imm_timeout || (timedwait && tim == -1))
			error = ETIME;
		lwp->lwp_sysabort = 0;
		lwp->lwp_asleep = 0;
	} else {
		thread_unlock_nopreempt(t);
		swtch();
	}

	return (error);
}